Update Linux to v5.10.157
Sourced from [1]
[1] https://cdn.kernel.org/pub/linux/kernel/v5.x/linux-5.10.157.tar.xz
Signed-off-by: Olivier Deprez <olivier.deprez@arm.com>
Change-Id: I7b30d9e98d8c465d6b44de8e7433b4a40b3289ba
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index a13ef83..84f3a64 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -657,14 +657,10 @@
if (stat->st_result_mask & P9_STATS_NLINK)
set_nlink(inode, stat->st_nlink);
if (stat->st_result_mask & P9_STATS_MODE) {
- inode->i_mode = stat->st_mode;
- if ((S_ISBLK(inode->i_mode)) ||
- (S_ISCHR(inode->i_mode)))
- init_special_inode(inode, inode->i_mode,
- inode->i_rdev);
+ mode = stat->st_mode & S_IALLUGO;
+ mode |= inode->i_mode & ~S_IALLUGO;
+ inode->i_mode = mode;
}
- if (stat->st_result_mask & P9_STATS_RDEV)
- inode->i_rdev = new_decode_dev(stat->st_rdev);
if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) &&
stat->st_result_mask & P9_STATS_SIZE)
v9fs_i_size_write(inode, stat->st_size);
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 262c0ae..1597950 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -412,8 +412,11 @@
}
/* skip if starts before the current position */
- if (offset < curr)
+ if (offset < curr) {
+ if (next > curr)
+ ctx->pos = blkoff + next * sizeof(union afs_xdr_dirent);
continue;
+ }
/* found the next entry */
if (!dir_emit(ctx, dire->u.name, nlen,
diff --git a/fs/afs/flock.c b/fs/afs/flock.c
index cb3054c..466ad60 100644
--- a/fs/afs/flock.c
+++ b/fs/afs/flock.c
@@ -76,7 +76,7 @@
if (call->error == 0) {
spin_lock(&vnode->lock);
trace_afs_flock_ev(vnode, NULL, afs_flock_timestamp, 0);
- vnode->locked_at = call->reply_time;
+ vnode->locked_at = call->issue_time;
afs_schedule_lock_extension(vnode);
spin_unlock(&vnode->lock);
}
diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
index 1d95ed9..0048a32 100644
--- a/fs/afs/fsclient.c
+++ b/fs/afs/fsclient.c
@@ -130,7 +130,7 @@
static time64_t xdr_decode_expiry(struct afs_call *call, u32 expiry)
{
- return ktime_divns(call->reply_time, NSEC_PER_SEC) + expiry;
+ return ktime_divns(call->issue_time, NSEC_PER_SEC) + expiry;
}
static void xdr_decode_AFSCallBack(const __be32 **_bp,
diff --git a/fs/afs/inode.c b/fs/afs/inode.c
index f81a972..826fae2 100644
--- a/fs/afs/inode.c
+++ b/fs/afs/inode.c
@@ -729,10 +729,23 @@
{
struct inode *inode = d_inode(path->dentry);
struct afs_vnode *vnode = AFS_FS_I(inode);
- int seq = 0;
+ struct key *key;
+ int ret, seq = 0;
_enter("{ ino=%lu v=%u }", inode->i_ino, inode->i_generation);
+ if (vnode->volume &&
+ !(query_flags & AT_STATX_DONT_SYNC) &&
+ !test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) {
+ key = afs_request_key(vnode->volume->cell);
+ if (IS_ERR(key))
+ return PTR_ERR(key);
+ ret = afs_validate(vnode, key);
+ key_put(key);
+ if (ret < 0)
+ return ret;
+ }
+
do {
read_seqbegin_or_lock(&vnode->cb_lock, &seq);
generic_fillattr(inode, stat);
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index dc08a3d..637cbe5 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -135,7 +135,6 @@
bool need_attention; /* T if RxRPC poked us */
bool async; /* T if asynchronous */
bool upgrade; /* T to request service upgrade */
- bool have_reply_time; /* T if have got reply_time */
bool intr; /* T if interruptible */
bool unmarshalling_error; /* T if an unmarshalling error occurred */
u16 service_id; /* Actual service ID (after upgrade) */
@@ -149,7 +148,7 @@
} __attribute__((packed));
__be64 tmp64;
};
- ktime_t reply_time; /* Time of first reply packet */
+ ktime_t issue_time; /* Time of issue of operation */
};
struct afs_call_type {
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index 1d1a8de..f1dc216 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -69,6 +69,7 @@
/* Unified AFS error table */
case UAEPERM: return -EPERM;
case UAENOENT: return -ENOENT;
+ case UAEAGAIN: return -EAGAIN;
case UAEACCES: return -EACCES;
case UAEBUSY: return -EBUSY;
case UAEEXIST: return -EEXIST;
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index 8be709c..535d28b 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -429,6 +429,7 @@
if (call->max_lifespan)
rxrpc_kernel_set_max_life(call->net->socket, rxcall,
call->max_lifespan);
+ call->issue_time = ktime_get_real();
/* send the request */
iov[0].iov_base = call->request;
@@ -533,12 +534,6 @@
return;
}
- if (!call->have_reply_time &&
- rxrpc_kernel_get_reply_time(call->net->socket,
- call->rxcall,
- &call->reply_time))
- call->have_reply_time = true;
-
ret = call->type->deliver(call);
state = READ_ONCE(call->state);
if (ret == 0 && call->unmarshalling_error)
@@ -572,6 +567,8 @@
case -ENODATA:
case -EBADMSG:
case -EMSGSIZE:
+ case -ENOMEM:
+ case -EFAULT:
abort_code = RXGEN_CC_UNMARSHAL;
if (state != AFS_CALL_CL_AWAIT_REPLY)
abort_code = RXGEN_SS_UNMARSHAL;
@@ -579,7 +576,7 @@
abort_code, ret, "KUM");
goto local_abort;
default:
- abort_code = RX_USER_ABORT;
+ abort_code = RX_CALL_DEAD;
rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
abort_code, ret, "KER");
goto local_abort;
@@ -871,7 +868,7 @@
case -ENOMEM:
_debug("oom");
rxrpc_kernel_abort_call(net->socket, call->rxcall,
- RX_USER_ABORT, -ENOMEM, "KOO");
+ RXGEN_SS_MARSHAL, -ENOMEM, "KOO");
fallthrough;
default:
_leave(" [error]");
@@ -913,7 +910,7 @@
if (n == -ENOMEM) {
_debug("oom");
rxrpc_kernel_abort_call(net->socket, call->rxcall,
- RX_USER_ABORT, -ENOMEM, "KOO");
+ RXGEN_SS_MARSHAL, -ENOMEM, "KOO");
}
_leave(" [error]");
}
diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c
index bd787e7..5b2ef5f 100644
--- a/fs/afs/yfsclient.c
+++ b/fs/afs/yfsclient.c
@@ -239,8 +239,7 @@
struct afs_callback *cb = &scb->callback;
ktime_t cb_expiry;
- cb_expiry = call->reply_time;
- cb_expiry = ktime_add(cb_expiry, xdr_to_u64(x->expiration_time) * 100);
+ cb_expiry = ktime_add(call->issue_time, xdr_to_u64(x->expiration_time) * 100);
cb->expires_at = ktime_divns(cb_expiry, NSEC_PER_SEC);
scb->have_cb = true;
*_bp += xdr_size(x);
diff --git a/fs/attr.c b/fs/attr.c
index b4bbdbd..848ffe6 100644
--- a/fs/attr.c
+++ b/fs/attr.c
@@ -134,6 +134,8 @@
*/
int inode_newsize_ok(const struct inode *inode, loff_t offset)
{
+ if (offset < 0)
+ return -EINVAL;
if (inode->i_size < offset) {
unsigned long limit;
diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c
index 04c4aa7..ccc4c6d 100644
--- a/fs/binfmt_elf.c
+++ b/fs/binfmt_elf.c
@@ -170,8 +170,8 @@
static int
create_elf_tables(struct linux_binprm *bprm, const struct elfhdr *exec,
- unsigned long load_addr, unsigned long interp_load_addr,
- unsigned long e_entry)
+ unsigned long interp_load_addr,
+ unsigned long e_entry, unsigned long phdr_addr)
{
struct mm_struct *mm = current->mm;
unsigned long p = bprm->p;
@@ -256,7 +256,7 @@
NEW_AUX_ENT(AT_HWCAP, ELF_HWCAP);
NEW_AUX_ENT(AT_PAGESZ, ELF_EXEC_PAGESIZE);
NEW_AUX_ENT(AT_CLKTCK, CLOCKS_PER_SEC);
- NEW_AUX_ENT(AT_PHDR, load_addr + exec->e_phoff);
+ NEW_AUX_ENT(AT_PHDR, phdr_addr);
NEW_AUX_ENT(AT_PHENT, sizeof(struct elf_phdr));
NEW_AUX_ENT(AT_PHNUM, exec->e_phnum);
NEW_AUX_ENT(AT_BASE, interp_load_addr);
@@ -820,7 +820,7 @@
static int load_elf_binary(struct linux_binprm *bprm)
{
struct file *interpreter = NULL; /* to shut gcc up */
- unsigned long load_addr = 0, load_bias = 0;
+ unsigned long load_addr, load_bias = 0, phdr_addr = 0;
int load_addr_set = 0;
unsigned long error;
struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL;
@@ -907,7 +907,7 @@
interp_elf_ex = kmalloc(sizeof(*interp_elf_ex), GFP_KERNEL);
if (!interp_elf_ex) {
retval = -ENOMEM;
- goto out_free_ph;
+ goto out_free_file;
}
/* Get the exec headers */
@@ -1153,6 +1153,17 @@
reloc_func_desc = load_bias;
}
}
+
+ /*
+ * Figure out which segment in the file contains the Program
+ * Header table, and map to the associated memory address.
+ */
+ if (elf_ppnt->p_offset <= elf_ex->e_phoff &&
+ elf_ex->e_phoff < elf_ppnt->p_offset + elf_ppnt->p_filesz) {
+ phdr_addr = elf_ex->e_phoff - elf_ppnt->p_offset +
+ elf_ppnt->p_vaddr;
+ }
+
k = elf_ppnt->p_vaddr;
if ((elf_ppnt->p_flags & PF_X) && k < start_code)
start_code = k;
@@ -1188,6 +1199,7 @@
}
e_entry = elf_ex->e_entry + load_bias;
+ phdr_addr += load_bias;
elf_bss += load_bias;
elf_brk += load_bias;
start_code += load_bias;
@@ -1251,8 +1263,8 @@
goto out;
#endif /* ARCH_HAS_SETUP_ADDITIONAL_PAGES */
- retval = create_elf_tables(bprm, elf_ex,
- load_addr, interp_load_addr, e_entry);
+ retval = create_elf_tables(bprm, elf_ex, interp_load_addr,
+ e_entry, phdr_addr);
if (retval < 0)
goto out;
@@ -1316,6 +1328,7 @@
out_free_dentry:
kfree(interp_elf_ex);
kfree(interp_elf_phdata);
+out_free_file:
allow_write_access(interpreter);
if (interpreter)
fput(interpreter);
@@ -1601,17 +1614,16 @@
* long file_ofs
* followed by COUNT filenames in ASCII: "FILE1" NUL "FILE2" NUL...
*/
-static int fill_files_note(struct memelfnote *note)
+static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm)
{
- struct mm_struct *mm = current->mm;
- struct vm_area_struct *vma;
unsigned count, size, names_ofs, remaining, n;
user_long_t *data;
user_long_t *start_end_ofs;
char *name_base, *name_curpos;
+ int i;
/* *Estimated* file count and total data size needed */
- count = mm->map_count;
+ count = cprm->vma_count;
if (count > UINT_MAX / 64)
return -EINVAL;
size = count * 64;
@@ -1633,11 +1645,12 @@
name_base = name_curpos = ((char *)data) + names_ofs;
remaining = size - names_ofs;
count = 0;
- for (vma = mm->mmap; vma != NULL; vma = vma->vm_next) {
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *m = &cprm->vma_meta[i];
struct file *file;
const char *filename;
- file = vma->vm_file;
+ file = m->file;
if (!file)
continue;
filename = file_path(file, name_curpos, remaining);
@@ -1657,9 +1670,9 @@
memmove(name_curpos, filename, n);
name_curpos += n;
- *start_end_ofs++ = vma->vm_start;
- *start_end_ofs++ = vma->vm_end;
- *start_end_ofs++ = vma->vm_pgoff;
+ *start_end_ofs++ = m->start;
+ *start_end_ofs++ = m->end;
+ *start_end_ofs++ = m->pgoff;
count++;
}
@@ -1670,7 +1683,7 @@
* Count usually is less than mm->map_count,
* we need to move filenames down.
*/
- n = mm->map_count - count;
+ n = cprm->vma_count - count;
if (n != 0) {
unsigned shift_bytes = n * 3 * sizeof(data[0]);
memmove(name_base - shift_bytes, name_base,
@@ -1785,7 +1798,7 @@
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- const kernel_siginfo_t *siginfo, struct pt_regs *regs)
+ struct coredump_params *cprm)
{
struct task_struct *dump_task = current;
const struct user_regset_view *view = task_user_regset_view(dump_task);
@@ -1857,7 +1870,7 @@
* Now fill in each thread's information.
*/
for (t = info->thread; t != NULL; t = t->next)
- if (!fill_thread_core_info(t, view, siginfo->si_signo, &info->size))
+ if (!fill_thread_core_info(t, view, cprm->siginfo->si_signo, &info->size))
return 0;
/*
@@ -1866,13 +1879,13 @@
fill_psinfo(psinfo, dump_task->group_leader, dump_task->mm);
info->size += notesize(&info->psinfo);
- fill_siginfo_note(&info->signote, &info->csigdata, siginfo);
+ fill_siginfo_note(&info->signote, &info->csigdata, cprm->siginfo);
info->size += notesize(&info->signote);
fill_auxv_note(&info->auxv, current->mm);
info->size += notesize(&info->auxv);
- if (fill_files_note(&info->files) == 0)
+ if (fill_files_note(&info->files, cprm) == 0)
info->size += notesize(&info->files);
return 1;
@@ -2014,7 +2027,7 @@
static int fill_note_info(struct elfhdr *elf, int phdrs,
struct elf_note_info *info,
- const kernel_siginfo_t *siginfo, struct pt_regs *regs)
+ struct coredump_params *cprm)
{
struct core_thread *ct;
struct elf_thread_status *ets;
@@ -2035,13 +2048,13 @@
list_for_each_entry(ets, &info->thread_list, list) {
int sz;
- sz = elf_dump_thread_status(siginfo->si_signo, ets);
+ sz = elf_dump_thread_status(cprm->siginfo->si_signo, ets);
info->thread_status_size += sz;
}
/* now collect the dump for the current */
memset(info->prstatus, 0, sizeof(*info->prstatus));
- fill_prstatus(info->prstatus, current, siginfo->si_signo);
- elf_core_copy_regs(&info->prstatus->pr_reg, regs);
+ fill_prstatus(info->prstatus, current, cprm->siginfo->si_signo);
+ elf_core_copy_regs(&info->prstatus->pr_reg, cprm->regs);
/* Set up header */
fill_elf_header(elf, phdrs, ELF_ARCH, ELF_CORE_EFLAGS);
@@ -2057,18 +2070,18 @@
fill_note(info->notes + 1, "CORE", NT_PRPSINFO,
sizeof(*info->psinfo), info->psinfo);
- fill_siginfo_note(info->notes + 2, &info->csigdata, siginfo);
+ fill_siginfo_note(info->notes + 2, &info->csigdata, cprm->siginfo);
fill_auxv_note(info->notes + 3, current->mm);
info->numnote = 4;
- if (fill_files_note(info->notes + info->numnote) == 0) {
+ if (fill_files_note(info->notes + info->numnote, cprm) == 0) {
info->notes_files = info->notes + info->numnote;
info->numnote++;
}
/* Try to dump the FPU. */
- info->prstatus->pr_fpvalid = elf_core_copy_task_fpregs(current, regs,
- info->fpu);
+ info->prstatus->pr_fpvalid =
+ elf_core_copy_task_fpregs(current, cprm->regs, info->fpu);
if (info->prstatus->pr_fpvalid)
fill_note(info->notes + info->numnote++,
"CORE", NT_PRFPREG, sizeof(*info->fpu), info->fpu);
@@ -2154,8 +2167,7 @@
static int elf_core_dump(struct coredump_params *cprm)
{
int has_dumped = 0;
- int vma_count, segs, i;
- size_t vma_data_size;
+ int segs, i;
struct elfhdr elf;
loff_t offset = 0, dataoff;
struct elf_note_info info = { };
@@ -2163,16 +2175,12 @@
struct elf_shdr *shdr4extnum = NULL;
Elf_Half e_phnum;
elf_addr_t e_shoff;
- struct core_vma_metadata *vma_meta;
-
- if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
- return 0;
/*
* The number of segs are recored into ELF header as 16bit value.
* Please check DEFAULT_MAX_MAP_COUNT definition when you modify here.
*/
- segs = vma_count + elf_core_extra_phdrs();
+ segs = cprm->vma_count + elf_core_extra_phdrs();
/* for notes section */
segs++;
@@ -2186,7 +2194,7 @@
* Collect all the non-memory information about the process for the
* notes. This also sets up the file header.
*/
- if (!fill_note_info(&elf, e_phnum, &info, cprm->siginfo, cprm->regs))
+ if (!fill_note_info(&elf, e_phnum, &info, cprm))
goto end_coredump;
has_dumped = 1;
@@ -2210,7 +2218,7 @@
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
- offset += vma_data_size;
+ offset += cprm->vma_data_size;
offset += elf_core_extra_data_size();
e_shoff = offset;
@@ -2230,8 +2238,8 @@
goto end_coredump;
/* Write program headers for segments dump */
- for (i = 0; i < vma_count; i++) {
- struct core_vma_metadata *meta = vma_meta + i;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *meta = cprm->vma_meta + i;
struct elf_phdr phdr;
phdr.p_type = PT_LOAD;
@@ -2268,8 +2276,8 @@
if (!dump_skip(cprm, dataoff - cprm->pos))
goto end_coredump;
- for (i = 0; i < vma_count; i++) {
- struct core_vma_metadata *meta = vma_meta + i;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *meta = cprm->vma_meta + i;
if (!dump_user_range(cprm, meta->start, meta->dump_size))
goto end_coredump;
@@ -2287,7 +2295,6 @@
end_coredump:
free_note_info(&info);
kfree(shdr4extnum);
- kvfree(vma_meta);
kfree(phdr4note);
return has_dumped;
}
diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c
index be4062b..5764295 100644
--- a/fs/binfmt_elf_fdpic.c
+++ b/fs/binfmt_elf_fdpic.c
@@ -1479,7 +1479,7 @@
static int elf_fdpic_core_dump(struct coredump_params *cprm)
{
int has_dumped = 0;
- int vma_count, segs;
+ int segs;
int i;
struct elfhdr *elf = NULL;
loff_t offset = 0, dataoff;
@@ -1494,8 +1494,6 @@
elf_addr_t e_shoff;
struct core_thread *ct;
struct elf_thread_status *tmp;
- struct core_vma_metadata *vma_meta = NULL;
- size_t vma_data_size;
/* alloc memory for large data structures: too large to be on stack */
elf = kmalloc(sizeof(*elf), GFP_KERNEL);
@@ -1505,9 +1503,6 @@
if (!psinfo)
goto end_coredump;
- if (dump_vma_snapshot(cprm, &vma_count, &vma_meta, &vma_data_size))
- goto end_coredump;
-
for (ct = current->mm->core_state->dumper.next;
ct; ct = ct->next) {
tmp = elf_dump_thread_status(cprm->siginfo->si_signo,
@@ -1527,7 +1522,7 @@
tmp->next = thread_list;
thread_list = tmp;
- segs = vma_count + elf_core_extra_phdrs();
+ segs = cprm->vma_count + elf_core_extra_phdrs();
/* for notes section */
segs++;
@@ -1572,7 +1567,7 @@
/* Page-align dumped data */
dataoff = offset = roundup(offset, ELF_EXEC_PAGESIZE);
- offset += vma_data_size;
+ offset += cprm->vma_data_size;
offset += elf_core_extra_data_size();
e_shoff = offset;
@@ -1592,8 +1587,8 @@
goto end_coredump;
/* write program headers for segments dump */
- for (i = 0; i < vma_count; i++) {
- struct core_vma_metadata *meta = vma_meta + i;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *meta = cprm->vma_meta + i;
struct elf_phdr phdr;
size_t sz;
@@ -1643,7 +1638,7 @@
if (!dump_skip(cprm, dataoff - cprm->pos))
goto end_coredump;
- if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count))
+ if (!elf_fdpic_dump_segments(cprm, cprm->vma_meta, cprm->vma_count))
goto end_coredump;
if (!elf_core_write_extra_data(cprm))
@@ -1667,7 +1662,6 @@
thread_list = thread_list->next;
kfree(tmp);
}
- kvfree(vma_meta);
kfree(phdr4note);
kfree(elf);
kfree(psinfo);
diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c
index b9c658e..69f4db0 100644
--- a/fs/binfmt_flat.c
+++ b/fs/binfmt_flat.c
@@ -427,6 +427,30 @@
/****************************************************************************/
+static inline u32 __user *skip_got_header(u32 __user *rp)
+{
+ if (IS_ENABLED(CONFIG_RISCV)) {
+ /*
+ * RISC-V has a 16 byte GOT PLT header for elf64-riscv
+ * and 8 byte GOT PLT header for elf32-riscv.
+ * Skip the whole GOT PLT header, since it is reserved
+ * for the dynamic linker (ld.so).
+ */
+ u32 rp_val0, rp_val1;
+
+ if (get_user(rp_val0, rp))
+ return rp;
+ if (get_user(rp_val1, rp + 1))
+ return rp;
+
+ if (rp_val0 == 0xffffffff && rp_val1 == 0xffffffff)
+ rp += 4;
+ else if (rp_val0 == 0xffffffff)
+ rp += 2;
+ }
+ return rp;
+}
+
static int load_flat_file(struct linux_binprm *bprm,
struct lib_info *libinfo, int id, unsigned long *extra_stack)
{
@@ -774,7 +798,8 @@
* image.
*/
if (flags & FLAT_FLAG_GOTPIC) {
- for (rp = (u32 __user *)datapos; ; rp++) {
+ rp = skip_got_header((u32 __user *) datapos);
+ for (; ; rp++) {
u32 addr, rp_val;
if (get_user(rp_val, rp))
return -EFAULT;
diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c
index baff31a..6942707 100644
--- a/fs/btrfs/backref.c
+++ b/fs/btrfs/backref.c
@@ -137,6 +137,7 @@
u64 root_objectid;
u64 inum;
int share_count;
+ bool have_delayed_delete_refs;
};
static inline int extent_is_shared(struct share_check *sc)
@@ -287,8 +288,10 @@
struct prelim_ref *ref, *next_ref;
rbtree_postorder_for_each_entry_safe(ref, next_ref,
- &preftree->root.rb_root, rbnode)
+ &preftree->root.rb_root, rbnode) {
+ free_inode_elem_list(ref->inode_list);
free_pref(ref);
+ }
preftree->root = RB_ROOT_CACHED;
preftree->count = 0;
@@ -646,6 +649,18 @@
return (struct extent_inode_elem *)(uintptr_t)node->aux;
}
+static void free_leaf_list(struct ulist *ulist)
+{
+ struct ulist_node *node;
+ struct ulist_iterator uiter;
+
+ ULIST_ITER_INIT(&uiter);
+ while ((node = ulist_next(ulist, &uiter)))
+ free_inode_elem_list(unode_aux_to_inode_list(node));
+
+ ulist_free(ulist);
+}
+
/*
* We maintain three separate rbtrees: one for direct refs, one for
* indirect refs which have a key, and one for indirect refs which do not
@@ -760,7 +775,11 @@
cond_resched();
}
out:
- ulist_free(parents);
+ /*
+ * We may have inode lists attached to refs in the parents ulist, so we
+ * must free them before freeing the ulist and its refs.
+ */
+ free_leaf_list(parents);
return ret;
}
@@ -817,16 +836,11 @@
struct preftrees *preftrees, struct share_check *sc)
{
struct btrfs_delayed_ref_node *node;
- struct btrfs_delayed_extent_op *extent_op = head->extent_op;
struct btrfs_key key;
- struct btrfs_key tmp_op_key;
struct rb_node *n;
int count;
int ret = 0;
- if (extent_op && extent_op->update_key)
- btrfs_disk_key_to_cpu(&tmp_op_key, &extent_op->key);
-
spin_lock(&head->lock);
for (n = rb_first_cached(&head->ref_tree); n; n = rb_next(n)) {
node = rb_entry(n, struct btrfs_delayed_ref_node,
@@ -852,10 +866,16 @@
case BTRFS_TREE_BLOCK_REF_KEY: {
/* NORMAL INDIRECT METADATA backref */
struct btrfs_delayed_tree_ref *ref;
+ struct btrfs_key *key_ptr = NULL;
+
+ if (head->extent_op && head->extent_op->update_key) {
+ btrfs_disk_key_to_cpu(&key, &head->extent_op->key);
+ key_ptr = &key;
+ }
ref = btrfs_delayed_node_to_tree_ref(node);
ret = add_indirect_ref(fs_info, preftrees, ref->root,
- &tmp_op_key, ref->level + 1,
+ key_ptr, ref->level + 1,
node->bytenr, count, sc,
GFP_ATOMIC);
break;
@@ -881,13 +901,22 @@
key.offset = ref->offset;
/*
- * Found a inum that doesn't match our known inum, we
- * know it's shared.
+ * If we have a share check context and a reference for
+ * another inode, we can't exit immediately. This is
+ * because even if this is a BTRFS_ADD_DELAYED_REF
+ * reference we may find next a BTRFS_DROP_DELAYED_REF
+ * which cancels out this ADD reference.
+ *
+ * If this is a DROP reference and there was no previous
+ * ADD reference, then we need to signal that when we
+ * process references from the extent tree (through
+ * add_inline_refs() and add_keyed_refs()), we should
+ * not exit early if we find a reference for another
+ * inode, because one of the delayed DROP references
+ * may cancel that reference in the extent tree.
*/
- if (sc && sc->inum && ref->objectid != sc->inum) {
- ret = BACKREF_FOUND_SHARED;
- goto out;
- }
+ if (sc && count < 0)
+ sc->have_delayed_delete_refs = true;
ret = add_indirect_ref(fs_info, preftrees, ref->root,
&key, 0, node->bytenr, count, sc,
@@ -917,7 +946,7 @@
}
if (!ret)
ret = extent_is_shared(sc);
-out:
+
spin_unlock(&head->lock);
return ret;
}
@@ -1020,7 +1049,8 @@
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
- if (sc && sc->inum && key.objectid != sc->inum) {
+ if (sc && sc->inum && key.objectid != sc->inum &&
+ !sc->have_delayed_delete_refs) {
ret = BACKREF_FOUND_SHARED;
break;
}
@@ -1030,6 +1060,7 @@
ret = add_indirect_ref(fs_info, preftrees, root,
&key, 0, bytenr, count,
sc, GFP_NOFS);
+
break;
}
default:
@@ -1119,7 +1150,8 @@
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = btrfs_extent_data_ref_offset(leaf, dref);
- if (sc && sc->inum && key.objectid != sc->inum) {
+ if (sc && sc->inum && key.objectid != sc->inum &&
+ !sc->have_delayed_delete_refs) {
ret = BACKREF_FOUND_SHARED;
break;
}
@@ -1358,6 +1390,12 @@
if (ret < 0)
goto out;
ref->inode_list = eie;
+ /*
+ * We transferred the list ownership to the ref,
+ * so set to NULL to avoid a double free in case
+ * an error happens after this.
+ */
+ eie = NULL;
}
ret = ulist_add_merge_ptr(refs, ref->parent,
ref->inode_list,
@@ -1383,6 +1421,14 @@
eie->next = ref->inode_list;
}
eie = NULL;
+ /*
+ * We have transferred the inode list ownership from
+ * this ref to the ref we added to the 'refs' ulist.
+ * So set this ref's inode list to NULL to avoid
+ * use-after-free when our caller uses it or double
+ * frees in case an error happens before we return.
+ */
+ ref->inode_list = NULL;
}
cond_resched();
}
@@ -1399,24 +1445,6 @@
return ret;
}
-static void free_leaf_list(struct ulist *blocks)
-{
- struct ulist_node *node = NULL;
- struct extent_inode_elem *eie;
- struct ulist_iterator uiter;
-
- ULIST_ITER_INIT(&uiter);
- while ((node = ulist_next(blocks, &uiter))) {
- if (!node->aux)
- continue;
- eie = unode_aux_to_inode_list(node);
- free_inode_elem_list(eie);
- node->aux = 0;
- }
-
- ulist_free(blocks);
-}
-
/*
* Finds all leafs with a reference to the specified combination of bytenr and
* offset. key_list_head will point to a list of corresponding keys (caller must
@@ -1542,6 +1570,7 @@
.root_objectid = root->root_key.objectid,
.inum = inum,
.share_count = 0,
+ .have_delayed_delete_refs = false,
};
ulist_init(roots);
@@ -1576,6 +1605,7 @@
break;
bytenr = node->val;
shared.share_count = 0;
+ shared.have_delayed_delete_refs = false;
cond_resched();
}
diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c
index c99e293..889a598 100644
--- a/fs/btrfs/block-group.c
+++ b/fs/btrfs/block-group.c
@@ -2570,7 +2570,6 @@
struct btrfs_path *path = NULL;
LIST_HEAD(dirty);
struct list_head *io = &cur_trans->io_bgs;
- int num_started = 0;
int loops = 0;
spin_lock(&cur_trans->dirty_bgs_lock);
@@ -2636,7 +2635,6 @@
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
- num_started++;
should_put = 0;
/*
@@ -2737,7 +2735,6 @@
int should_put;
struct btrfs_path *path;
struct list_head *io = &cur_trans->io_bgs;
- int num_started = 0;
path = btrfs_alloc_path();
if (!path)
@@ -2795,7 +2792,6 @@
cache->io_ctl.inode = NULL;
ret = btrfs_write_out_cache(trans, cache, path);
if (ret == 0 && cache->io_ctl.inode) {
- num_started++;
should_put = 0;
list_add_tail(&cache->io_list, io);
} else {
@@ -3130,6 +3126,7 @@
* attempt.
*/
wait_for_alloc = true;
+ force = CHUNK_ALLOC_NO_FORCE;
spin_unlock(&space_info->lock);
mutex_lock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->chunk_mutex);
diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c
index d297804..be6935d 100644
--- a/fs/btrfs/dev-replace.c
+++ b/fs/btrfs/dev-replace.c
@@ -161,7 +161,7 @@
if (btrfs_find_device(fs_info->fs_devices,
BTRFS_DEV_REPLACE_DEVID, NULL, NULL, false)) {
btrfs_err(fs_info,
- "replace devid present without an active replace item");
+"replace without active item, run 'device scan --forget' on the target device");
ret = -EUCLEAN;
} else {
dev_replace->srcdev = NULL;
@@ -954,8 +954,7 @@
up_write(&dev_replace->rwsem);
/* Scrub for replace must not be running in suspended state */
- ret = btrfs_scrub_cancel(fs_info);
- ASSERT(ret != -ENOTCONN);
+ btrfs_scrub_cancel(fs_info);
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c
index a5bcad0..f2abd8b 100644
--- a/fs/btrfs/disk-io.c
+++ b/fs/btrfs/disk-io.c
@@ -1596,9 +1596,10 @@
ret = btrfs_insert_fs_root(fs_info, root);
if (ret) {
- btrfs_put_root(root);
- if (ret == -EEXIST)
+ if (ret == -EEXIST) {
+ btrfs_put_root(root);
goto again;
+ }
goto fail;
}
return root;
@@ -3060,7 +3061,7 @@
~BTRFS_FEATURE_INCOMPAT_SUPP;
if (features) {
btrfs_err(fs_info,
- "cannot mount because of unsupported optional features (%llx)",
+ "cannot mount because of unsupported optional features (0x%llx)",
features);
err = -EINVAL;
goto fail_alloc;
@@ -3098,11 +3099,25 @@
~BTRFS_FEATURE_COMPAT_RO_SUPP;
if (!sb_rdonly(sb) && features) {
btrfs_err(fs_info,
- "cannot mount read-write because of unsupported optional features (%llx)",
+ "cannot mount read-write because of unsupported optional features (0x%llx)",
features);
err = -EINVAL;
goto fail_alloc;
}
+ /*
+ * We have unsupported RO compat features, although RO mounted, we
+ * should not cause any metadata write, including log replay.
+ * Or we could screw up whatever the new feature requires.
+ */
+ if (unlikely(features && btrfs_super_log_root(disk_super) &&
+ !btrfs_test_opt(fs_info, NOLOGREPLAY))) {
+ btrfs_err(fs_info,
+"cannot replay dirty log with unsupported compat_ro features (0x%llx), try rescue=nologreplay",
+ features);
+ err = -EINVAL;
+ goto fail_alloc;
+ }
+
ret = btrfs_init_workqueues(fs_info, fs_devices);
if (ret) {
@@ -4090,6 +4105,31 @@
/* clear out the rbtree of defraggable inodes */
btrfs_cleanup_defrag_inodes(fs_info);
+ /*
+ * After we parked the cleaner kthread, ordered extents may have
+ * completed and created new delayed iputs. If one of the async reclaim
+ * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we
+ * can hang forever trying to stop it, because if a delayed iput is
+ * added after it ran btrfs_run_delayed_iputs() and before it called
+ * btrfs_wait_on_delayed_iputs(), it will hang forever since there is
+ * no one else to run iputs.
+ *
+ * So wait for all ongoing ordered extents to complete and then run
+ * delayed iputs. This works because once we reach this point no one
+ * can either create new ordered extents nor create delayed iputs
+ * through some other means.
+ *
+ * Also note that btrfs_wait_ordered_roots() is not safe here, because
+ * it waits for BTRFS_ORDERED_COMPLETE to be set on an ordered extent,
+ * but the delayed iput for the respective inode is made only when doing
+ * the final btrfs_put_ordered_extent() (which must happen at
+ * btrfs_finish_ordered_io() when we are unmounting).
+ */
+ btrfs_flush_workqueue(fs_info->endio_write_workers);
+ /* Ordered extents for free space inodes. */
+ btrfs_flush_workqueue(fs_info->endio_freespace_worker);
+ btrfs_run_delayed_iputs(fs_info);
+
cancel_work_sync(&fs_info->async_reclaim_work);
cancel_work_sync(&fs_info->async_data_reclaim_work);
diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c
index 1a8d419..bfa2bf4 100644
--- a/fs/btrfs/export.c
+++ b/fs/btrfs/export.c
@@ -58,7 +58,7 @@
}
struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
- u64 root_objectid, u32 generation,
+ u64 root_objectid, u64 generation,
int check_generation)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
diff --git a/fs/btrfs/export.h b/fs/btrfs/export.h
index f32f411..5afb7ca 100644
--- a/fs/btrfs/export.h
+++ b/fs/btrfs/export.h
@@ -19,7 +19,7 @@
} __attribute__ ((packed));
struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
- u64 root_objectid, u32 generation,
+ u64 root_objectid, u64 generation,
int check_generation);
struct dentry *btrfs_get_parent(struct dentry *child);
diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h
index f39d02e..16f44bc 100644
--- a/fs/btrfs/extent_io.h
+++ b/fs/btrfs/extent_io.h
@@ -121,7 +121,7 @@
*/
struct extent_changeset {
/* How many bytes are set/cleared in this operation */
- unsigned int bytes_changed;
+ u64 bytes_changed;
/* Changed ranges */
struct ulist range_changed;
diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c
index f59ec55..416a1b7 100644
--- a/fs/btrfs/file.c
+++ b/fs/btrfs/file.c
@@ -2833,8 +2833,9 @@
return ret;
}
-static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len)
+static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
{
+ struct inode *inode = file_inode(file);
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct extent_state *cached_state = NULL;
@@ -2866,6 +2867,10 @@
goto out_only_mutex;
}
+ ret = file_modified(file);
+ if (ret)
+ goto out_only_mutex;
+
lockstart = round_up(offset, btrfs_inode_sectorsize(BTRFS_I(inode)));
lockend = round_down(offset + len,
btrfs_inode_sectorsize(BTRFS_I(inode))) - 1;
@@ -3301,7 +3306,7 @@
return -EOPNOTSUPP;
if (mode & FALLOC_FL_PUNCH_HOLE)
- return btrfs_punch_hole(inode, offset, len);
+ return btrfs_punch_hole(file, offset, len);
/*
* Only trigger disk allocation, don't trigger qgroup reserve
@@ -3323,6 +3328,10 @@
goto out;
}
+ ret = file_modified(file);
+ if (ret)
+ goto out;
+
/*
* TODO: Move these two operations after we have checked
* accurate reserved space, or fallocate can still fail but
diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 1d9262a..779b774 100644
--- a/fs/btrfs/inode.c
+++ b/fs/btrfs/inode.c
@@ -995,7 +995,6 @@
int ret = 0;
if (btrfs_is_free_space_inode(inode)) {
- WARN_ON_ONCE(1);
ret = -EINVAL;
goto out_unlock;
}
@@ -4023,6 +4022,13 @@
dest->root_key.objectid);
return -EPERM;
}
+ if (atomic_read(&dest->nr_swapfiles)) {
+ spin_unlock(&dest->root_item_lock);
+ btrfs_warn(fs_info,
+ "attempt to delete subvolume %llu with active swapfile",
+ root->root_key.objectid);
+ return -EPERM;
+ }
root_flags = btrfs_root_flags(&dest->root_item);
btrfs_set_root_flags(&dest->root_item,
root_flags | BTRFS_ROOT_SUBVOL_DEAD);
@@ -7474,7 +7480,19 @@
if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
em->block_start == EXTENT_MAP_INLINE) {
free_extent_map(em);
- ret = -ENOTBLK;
+ /*
+ * If we are in a NOWAIT context, return -EAGAIN in order to
+ * fallback to buffered IO. This is not only because we can
+ * block with buffered IO (no support for NOWAIT semantics at
+ * the moment) but also to avoid returning short reads to user
+ * space - this happens if we were able to read some data from
+ * previous non-compressed extents and then when we fallback to
+ * buffered IO, at btrfs_file_read_iter() by calling
+ * filemap_read(), we fail to fault in pages for the read buffer,
+ * in which case filemap_read() returns a short read (the number
+ * of bytes previously read is > 0, so it does not return -EFAULT).
+ */
+ ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
goto unlock_err;
}
@@ -10215,8 +10233,23 @@
* set. We use this counter to prevent snapshots. We must increment it
* before walking the extents because we don't want a concurrent
* snapshot to run after we've already checked the extents.
+ *
+ * It is possible that subvolume is marked for deletion but still not
+ * removed yet. To prevent this race, we check the root status before
+ * activating the swapfile.
*/
+ spin_lock(&root->root_item_lock);
+ if (btrfs_root_dead(root)) {
+ spin_unlock(&root->root_item_lock);
+
+ btrfs_exclop_finish(fs_info);
+ btrfs_warn(fs_info,
+ "cannot activate swapfile because subvolume %llu is being deleted",
+ root->root_key.objectid);
+ return -EPERM;
+ }
atomic_inc(&root->nr_swapfiles);
+ spin_unlock(&root->root_item_lock);
isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize);
diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c
index b5e9bfe..d0c3165 100644
--- a/fs/btrfs/ioctl.c
+++ b/fs/btrfs/ioctl.c
@@ -2811,6 +2811,8 @@
}
}
+ btrfs_free_path(path);
+ path = NULL;
if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
ret = -EFAULT;
@@ -2903,6 +2905,8 @@
}
out:
+ btrfs_free_path(path);
+
if (!ret || ret == -EOVERFLOW) {
rootrefs->num_items = found;
/* update min_treeid for next search */
@@ -2914,7 +2918,6 @@
}
kfree(rootrefs);
- btrfs_free_path(path);
return ret;
}
@@ -3878,6 +3881,8 @@
ipath->fspath->val[i] = rel_ptr;
}
+ btrfs_free_path(path);
+ path = NULL;
ret = copy_to_user((void __user *)(unsigned long)ipa->fspath,
ipath->fspath, size);
if (ret) {
diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c
index a02e38f..36da775 100644
--- a/fs/btrfs/qgroup.c
+++ b/fs/btrfs/qgroup.c
@@ -1158,6 +1158,21 @@
fs_info->qgroup_rescan_running = true;
btrfs_queue_work(fs_info->qgroup_rescan_workers,
&fs_info->qgroup_rescan_work);
+ } else {
+ /*
+ * We have set both BTRFS_FS_QUOTA_ENABLED and
+ * BTRFS_QGROUP_STATUS_FLAG_ON, so we can only fail with
+ * -EINPROGRESS. That can happen because someone started the
+ * rescan worker by calling quota rescan ioctl before we
+ * attempted to initialize the rescan worker. Failure due to
+ * quotas disabled in the meanwhile is not possible, because
+ * we are holding a write lock on fs_info->subvol_sem, which
+ * is also acquired when disabling quotas.
+ * Ignore such error, and any other error would need to undo
+ * everything we did in the transaction we just committed.
+ */
+ ASSERT(ret == -EINPROGRESS);
+ ret = 0;
}
out_free_path:
diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c
index e65d0fa..9678d7f 100644
--- a/fs/btrfs/raid56.c
+++ b/fs/btrfs/raid56.c
@@ -332,6 +332,9 @@
{
bio_list_merge(&dest->bio_list, &victim->bio_list);
dest->bio_list_bytes += victim->bio_list_bytes;
+ /* Also inherit the bitmaps from @victim. */
+ bitmap_or(dest->dbitmap, victim->dbitmap, dest->dbitmap,
+ dest->stripe_npages);
dest->generic_bio_cnt += victim->generic_bio_cnt;
bio_list_init(&victim->bio_list);
}
@@ -874,6 +877,12 @@
if (rbio->generic_bio_cnt)
btrfs_bio_counter_sub(rbio->fs_info, rbio->generic_bio_cnt);
+ /*
+ * Clear the data bitmap, as the rbio may be cached for later usage.
+ * do this before before unlock_stripe() so there will be no new bio
+ * for this bio.
+ */
+ bitmap_clear(rbio->dbitmap, 0, rbio->stripe_npages);
/*
* At this moment, rbio->bio_list is empty, however since rbio does not
@@ -1207,6 +1216,9 @@
else
BUG();
+ /* We should have at least one data sector. */
+ ASSERT(bitmap_weight(rbio->dbitmap, rbio->stripe_npages));
+
/* at this point we either have a full stripe,
* or we've read the full stripe from the drive.
* recalculate the parity and write the new results.
@@ -1280,6 +1292,11 @@
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *page;
+
+ /* This vertical stripe has no data, skip it. */
+ if (!test_bit(pagenr, rbio->dbitmap))
+ continue;
+
if (stripe < rbio->nr_data) {
page = page_in_rbio(rbio, stripe, pagenr, 1);
if (!page)
@@ -1304,6 +1321,11 @@
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
struct page *page;
+
+ /* This vertical stripe has no data, skip it. */
+ if (!test_bit(pagenr, rbio->dbitmap))
+ continue;
+
if (stripe < rbio->nr_data) {
page = page_in_rbio(rbio, stripe, pagenr, 1);
if (!page)
@@ -1729,6 +1751,33 @@
run_plug(plug);
}
+/* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
+static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
+{
+ const struct btrfs_fs_info *fs_info = rbio->fs_info;
+ const u64 orig_logical = orig_bio->bi_iter.bi_sector << SECTOR_SHIFT;
+ const u64 full_stripe_start = rbio->bbio->raid_map[0];
+ const u32 orig_len = orig_bio->bi_iter.bi_size;
+ const u32 sectorsize = fs_info->sectorsize;
+ u64 cur_logical;
+
+ ASSERT(orig_logical >= full_stripe_start &&
+ orig_logical + orig_len <= full_stripe_start +
+ rbio->nr_data * rbio->stripe_len);
+
+ bio_list_add(&rbio->bio_list, orig_bio);
+ rbio->bio_list_bytes += orig_bio->bi_iter.bi_size;
+
+ /* Update the dbitmap. */
+ for (cur_logical = orig_logical; cur_logical < orig_logical + orig_len;
+ cur_logical += sectorsize) {
+ int bit = ((u32)(cur_logical - full_stripe_start) >>
+ PAGE_SHIFT) % rbio->stripe_npages;
+
+ set_bit(bit, rbio->dbitmap);
+ }
+}
+
/*
* our main entry point for writes from the rest of the FS.
*/
@@ -1745,9 +1794,8 @@
btrfs_put_bbio(bbio);
return PTR_ERR(rbio);
}
- bio_list_add(&rbio->bio_list, bio);
- rbio->bio_list_bytes = bio->bi_iter.bi_size;
rbio->operation = BTRFS_RBIO_WRITE;
+ rbio_add_bio(rbio, bio);
btrfs_bio_counter_inc_noblocked(fs_info);
rbio->generic_bio_cnt = 1;
@@ -2046,9 +2094,12 @@
atomic_set(&rbio->error, 0);
/*
- * read everything that hasn't failed. Thanks to the
- * stripe cache, it is possible that some or all of these
- * pages are going to be uptodate.
+ * Read everything that hasn't failed. However this time we will
+ * not trust any cached sector.
+ * As we may read out some stale data but higher layer is not reading
+ * that stale part.
+ *
+ * So here we always re-read everything in recovery path.
*/
for (stripe = 0; stripe < rbio->real_stripes; stripe++) {
if (rbio->faila == stripe || rbio->failb == stripe) {
@@ -2057,16 +2108,6 @@
}
for (pagenr = 0; pagenr < rbio->stripe_npages; pagenr++) {
- struct page *p;
-
- /*
- * the rmw code may have already read this
- * page in
- */
- p = rbio_stripe_page(rbio, stripe, pagenr);
- if (PageUptodate(p))
- continue;
-
ret = rbio_add_io_page(rbio, &bio_list,
rbio_stripe_page(rbio, stripe, pagenr),
stripe, pagenr, rbio->stripe_len);
@@ -2144,8 +2185,7 @@
}
rbio->operation = BTRFS_RBIO_READ_REBUILD;
- bio_list_add(&rbio->bio_list, bio);
- rbio->bio_list_bytes = bio->bi_iter.bi_size;
+ rbio_add_bio(rbio, bio);
rbio->faila = find_logical_bio_stripe(rbio, bio);
if (rbio->faila == -1) {
diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c
index 3a3102b..4b3ae0f 100644
--- a/fs/btrfs/reflink.c
+++ b/fs/btrfs/reflink.c
@@ -503,8 +503,11 @@
*/
ASSERT(key.offset == 0);
ASSERT(datal <= fs_info->sectorsize);
- if (key.offset != 0 || datal > fs_info->sectorsize)
- return -EUCLEAN;
+ if (WARN_ON(key.offset != 0) ||
+ WARN_ON(datal > fs_info->sectorsize)) {
+ ret = -EUCLEAN;
+ goto out;
+ }
ret = clone_copy_inline_extent(inode, path, &new_key,
drop_start, datal, size,
diff --git a/fs/btrfs/root-tree.c b/fs/btrfs/root-tree.c
index db37a37..e9e8ca4 100644
--- a/fs/btrfs/root-tree.c
+++ b/fs/btrfs/root-tree.c
@@ -336,9 +336,10 @@
key.offset = ref_id;
again:
ret = btrfs_search_slot(trans, tree_root, &key, path, -1, 1);
- if (ret < 0)
+ if (ret < 0) {
+ err = ret;
goto out;
- if (ret == 0) {
+ } else if (ret == 0) {
leaf = path->nodes[0];
ref = btrfs_item_ptr(leaf, path->slots[0],
struct btrfs_root_ref);
diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
index 0392c55..88b9a53 100644
--- a/fs/btrfs/scrub.c
+++ b/fs/btrfs/scrub.c
@@ -3811,6 +3811,7 @@
int ret;
struct btrfs_device *dev;
unsigned int nofs_flag;
+ bool need_commit = false;
if (btrfs_fs_closing(fs_info))
return -EAGAIN;
@@ -3924,6 +3925,12 @@
*/
nofs_flag = memalloc_nofs_save();
if (!is_dev_replace) {
+ u64 old_super_errors;
+
+ spin_lock(&sctx->stat_lock);
+ old_super_errors = sctx->stat.super_errors;
+ spin_unlock(&sctx->stat_lock);
+
btrfs_info(fs_info, "scrub: started on devid %llu", devid);
/*
* by holding device list mutex, we can
@@ -3932,6 +3939,16 @@
mutex_lock(&fs_info->fs_devices->device_list_mutex);
ret = scrub_supers(sctx, dev);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
+
+ spin_lock(&sctx->stat_lock);
+ /*
+ * Super block errors found, but we can not commit transaction
+ * at current context, since btrfs_commit_transaction() needs
+ * to pause the current running scrub (hold by ourselves).
+ */
+ if (sctx->stat.super_errors > old_super_errors && !sctx->readonly)
+ need_commit = true;
+ spin_unlock(&sctx->stat_lock);
}
if (!ret)
@@ -3958,6 +3975,25 @@
scrub_workers_put(fs_info);
scrub_put_ctx(sctx);
+ /*
+ * We found some super block errors before, now try to force a
+ * transaction commit, as scrub has finished.
+ */
+ if (need_commit) {
+ struct btrfs_trans_handle *trans;
+
+ trans = btrfs_start_transaction(fs_info->tree_root, 0);
+ if (IS_ERR(trans)) {
+ ret = PTR_ERR(trans);
+ btrfs_err(fs_info,
+ "scrub: failed to start transaction to fix super block errors: %d", ret);
+ return ret;
+ }
+ ret = btrfs_commit_transaction(trans);
+ if (ret < 0)
+ btrfs_err(fs_info,
+ "scrub: failed to commit transaction to fix super block errors: %d", ret);
+ }
return ret;
out:
scrub_workers_put(fs_info);
diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c
index 2663485..8bf8cdb 100644
--- a/fs/btrfs/super.c
+++ b/fs/btrfs/super.c
@@ -652,6 +652,8 @@
compress_force = false;
no_compress++;
} else {
+ btrfs_err(info, "unrecognized compression value %s",
+ args[0].from);
ret = -EINVAL;
goto out;
}
@@ -710,8 +712,11 @@
case Opt_thread_pool:
ret = match_int(&args[0], &intarg);
if (ret) {
+ btrfs_err(info, "unrecognized thread_pool value %s",
+ args[0].from);
goto out;
} else if (intarg == 0) {
+ btrfs_err(info, "invalid value 0 for thread_pool");
ret = -EINVAL;
goto out;
}
@@ -772,8 +777,11 @@
break;
case Opt_ratio:
ret = match_int(&args[0], &intarg);
- if (ret)
+ if (ret) {
+ btrfs_err(info, "unrecognized metadata_ratio value %s",
+ args[0].from);
goto out;
+ }
info->metadata_ratio = intarg;
btrfs_info(info, "metadata ratio %u",
info->metadata_ratio);
@@ -790,6 +798,8 @@
btrfs_set_and_info(info, DISCARD_ASYNC,
"turning on async discard");
} else {
+ btrfs_err(info, "unrecognized discard mode value %s",
+ args[0].from);
ret = -EINVAL;
goto out;
}
@@ -814,6 +824,8 @@
btrfs_set_and_info(info, FREE_SPACE_TREE,
"enabling free space tree");
} else {
+ btrfs_err(info, "unrecognized space_cache value %s",
+ args[0].from);
ret = -EINVAL;
goto out;
}
@@ -889,8 +901,12 @@
break;
case Opt_check_integrity_print_mask:
ret = match_int(&args[0], &intarg);
- if (ret)
+ if (ret) {
+ btrfs_err(info,
+ "unrecognized check_integrity_print_mask value %s",
+ args[0].from);
goto out;
+ }
info->check_integrity_print_mask = intarg;
btrfs_info(info, "check_integrity_print_mask 0x%x",
info->check_integrity_print_mask);
@@ -905,13 +921,15 @@
goto out;
#endif
case Opt_fatal_errors:
- if (strcmp(args[0].from, "panic") == 0)
+ if (strcmp(args[0].from, "panic") == 0) {
btrfs_set_opt(info->mount_opt,
PANIC_ON_FATAL_ERROR);
- else if (strcmp(args[0].from, "bug") == 0)
+ } else if (strcmp(args[0].from, "bug") == 0) {
btrfs_clear_opt(info->mount_opt,
PANIC_ON_FATAL_ERROR);
- else {
+ } else {
+ btrfs_err(info, "unrecognized fatal_errors value %s",
+ args[0].from);
ret = -EINVAL;
goto out;
}
@@ -919,8 +937,12 @@
case Opt_commit_interval:
intarg = 0;
ret = match_int(&args[0], &intarg);
- if (ret)
+ if (ret) {
+ btrfs_err(info, "unrecognized commit_interval value %s",
+ args[0].from);
+ ret = -EINVAL;
goto out;
+ }
if (intarg == 0) {
btrfs_info(info,
"using default commit interval %us",
@@ -934,8 +956,11 @@
break;
case Opt_rescue:
ret = parse_rescue_options(info, args[0].from);
- if (ret < 0)
+ if (ret < 0) {
+ btrfs_err(info, "unrecognized rescue value %s",
+ args[0].from);
goto out;
+ }
break;
#ifdef CONFIG_BTRFS_DEBUG
case Opt_fragment_all:
diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c
index 3bb6b68..ecf1902 100644
--- a/fs/btrfs/sysfs.c
+++ b/fs/btrfs/sysfs.c
@@ -1767,8 +1767,11 @@
#ifdef CONFIG_BTRFS_DEBUG
ret = sysfs_create_group(&btrfs_kset->kobj, &btrfs_debug_feature_attr_group);
- if (ret)
- goto out2;
+ if (ret) {
+ sysfs_unmerge_group(&btrfs_kset->kobj,
+ &btrfs_static_feature_attr_group);
+ goto out_remove_group;
+ }
#endif
return 0;
diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c
index 999c14e..0599566 100644
--- a/fs/btrfs/tests/btrfs-tests.c
+++ b/fs/btrfs/tests/btrfs-tests.c
@@ -192,7 +192,7 @@
void btrfs_free_dummy_root(struct btrfs_root *root)
{
- if (!root)
+ if (IS_ERR_OR_NULL(root))
return;
/* Will be freed by btrfs_free_fs_roots */
if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
diff --git a/fs/btrfs/tests/qgroup-tests.c b/fs/btrfs/tests/qgroup-tests.c
index ce1ca8e..289366c 100644
--- a/fs/btrfs/tests/qgroup-tests.c
+++ b/fs/btrfs/tests/qgroup-tests.c
@@ -230,21 +230,21 @@
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FS_TREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -256,31 +256,33 @@
return ret;
}
+ /* btrfs_qgroup_account_extent() always frees the ulists passed to it. */
+ old_roots = NULL;
+ new_roots = NULL;
+
if (btrfs_verify_qgroup_counts(fs_info, BTRFS_FS_TREE_OBJECTID,
nodesize, nodesize)) {
test_err("qgroup counts didn't match expected values");
return -EINVAL;
}
- old_roots = NULL;
- new_roots = NULL;
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = remove_extent_item(root, nodesize, nodesize);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return -EINVAL;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -331,21 +333,21 @@
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FS_TREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -366,21 +368,21 @@
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = add_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FIRST_FREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
@@ -407,21 +409,21 @@
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots,
false);
if (ret) {
- ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
ret = remove_extent_ref(root, nodesize, nodesize, 0,
BTRFS_FIRST_FREE_OBJECTID);
- if (ret)
+ if (ret) {
+ ulist_free(old_roots);
return ret;
+ }
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots,
false);
if (ret) {
ulist_free(old_roots);
- ulist_free(new_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c
index 62784b9..9a8dc16 100644
--- a/fs/btrfs/tree-log.c
+++ b/fs/btrfs/tree-log.c
@@ -1075,7 +1075,9 @@
extref = btrfs_lookup_inode_extref(NULL, root, path, name, namelen,
inode_objectid, parent_objectid, 0,
0);
- if (!IS_ERR_OR_NULL(extref)) {
+ if (IS_ERR(extref)) {
+ return PTR_ERR(extref);
+ } else if (extref) {
u32 item_size;
u32 cur_offset = 0;
unsigned long base;
@@ -5335,6 +5337,18 @@
}
/*
+ * For symlinks, we must always log their content, which is stored in an
+ * inline extent, otherwise we could end up with an empty symlink after
+ * log replay, which is invalid on linux (symlink(2) returns -ENOENT if
+ * one attempts to create an empty symlink).
+ * We don't need to worry about flushing delalloc, because when we create
+ * the inline extent when the symlink is created (we never have delalloc
+ * for symlinks).
+ */
+ if (S_ISLNK(inode->vfs_inode.i_mode))
+ inode_only = LOG_INODE_ALL;
+
+ /*
* a brute force approach to making sure we get the most uptodate
* copies of everything.
*/
@@ -5724,7 +5738,7 @@
}
ctx->log_new_dentries = false;
- if (type == BTRFS_FT_DIR || type == BTRFS_FT_SYMLINK)
+ if (type == BTRFS_FT_DIR)
log_mode = LOG_INODE_ALL;
ret = btrfs_log_inode(trans, root, BTRFS_I(di_inode),
log_mode, ctx);
diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c
index e462de9..d4d89e0 100644
--- a/fs/btrfs/volumes.c
+++ b/fs/btrfs/volumes.c
@@ -540,15 +540,47 @@
return ret;
}
-static bool device_path_matched(const char *path, struct btrfs_device *device)
+/*
+ * Check if the device in the path matches the device in the given struct device.
+ *
+ * Returns:
+ * true If it is the same device.
+ * false If it is not the same device or on error.
+ */
+static bool device_matched(const struct btrfs_device *device, const char *path)
{
- int found;
+ char *device_name;
+ struct block_device *bdev_old;
+ struct block_device *bdev_new;
+
+ /*
+ * If we are looking for a device with the matching dev_t, then skip
+ * device without a name (a missing device).
+ */
+ if (!device->name)
+ return false;
+
+ device_name = kzalloc(BTRFS_PATH_NAME_MAX, GFP_KERNEL);
+ if (!device_name)
+ return false;
rcu_read_lock();
- found = strcmp(rcu_str_deref(device->name), path);
+ scnprintf(device_name, BTRFS_PATH_NAME_MAX, "%s", rcu_str_deref(device->name));
rcu_read_unlock();
- return found == 0;
+ bdev_old = lookup_bdev(device_name);
+ kfree(device_name);
+ if (IS_ERR(bdev_old))
+ return false;
+
+ bdev_new = lookup_bdev(path);
+ if (IS_ERR(bdev_new))
+ return false;
+
+ if (bdev_old == bdev_new)
+ return true;
+
+ return false;
}
/*
@@ -581,9 +613,7 @@
&fs_devices->devices, dev_list) {
if (skip_device && skip_device == device)
continue;
- if (path && !device->name)
- continue;
- if (path && !device_path_matched(path, device))
+ if (path && !device_matched(device, path))
continue;
if (fs_devices->opened) {
/* for an already deleted device return 0 */
@@ -4220,10 +4250,12 @@
struct btrfs_fs_info *fs_info = data;
int ret = 0;
+ sb_start_write(fs_info->sb);
mutex_lock(&fs_info->balance_mutex);
if (fs_info->balance_ctl)
ret = btrfs_balance(fs_info, fs_info->balance_ctl, NULL);
mutex_unlock(&fs_info->balance_mutex);
+ sb_end_write(fs_info->sb);
return ret;
}
@@ -7189,12 +7221,12 @@
* do another round of validation checks.
*/
if (total_dev != fs_info->fs_devices->total_devices) {
- btrfs_err(fs_info,
- "super_num_devices %llu mismatch with num_devices %llu found here",
+ btrfs_warn(fs_info,
+"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
btrfs_super_num_devices(fs_info->super_copy),
total_dev);
- ret = -EINVAL;
- goto error;
+ fs_info->fs_devices->total_devices = total_dev;
+ btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
}
if (btrfs_super_total_bytes(fs_info->super_copy) <
fs_info->fs_devices->total_rw_bytes) {
diff --git a/fs/btrfs/xattr.c b/fs/btrfs/xattr.c
index f1a60bc..cd6049b 100644
--- a/fs/btrfs/xattr.c
+++ b/fs/btrfs/xattr.c
@@ -389,6 +389,9 @@
const char *name, const void *buffer,
size_t size, int flags)
{
+ if (btrfs_root_readonly(BTRFS_I(inode)->root))
+ return -EROFS;
+
name = xattr_full_name(handler, name);
return btrfs_setxattr_trans(inode, name, buffer, size, flags);
}
diff --git a/fs/buffer.c b/fs/buffer.c
index 23f6456..ee66aba 100644
--- a/fs/buffer.c
+++ b/fs/buffer.c
@@ -2350,7 +2350,7 @@
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
- void *fsdata;
+ void *fsdata = NULL;
int err;
err = inode_newsize_ok(inode, size);
@@ -2376,7 +2376,7 @@
struct inode *inode = mapping->host;
unsigned int blocksize = i_blocksize(inode);
struct page *page;
- void *fsdata;
+ void *fsdata = NULL;
pgoff_t index, curidx;
loff_t curpos;
unsigned zerofrom, offset, len;
diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c
index d3f6727..51562d3 100644
--- a/fs/ceph/caps.c
+++ b/fs/ceph/caps.c
@@ -2294,6 +2294,7 @@
*/
static int unsafe_request_wait(struct inode *inode)
{
+ struct ceph_mds_client *mdsc = ceph_sb_to_client(inode->i_sb)->mdsc;
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_mds_request *req1 = NULL, *req2 = NULL;
int ret, err = 0;
@@ -2313,6 +2314,76 @@
}
spin_unlock(&ci->i_unsafe_lock);
+ /*
+ * Trigger to flush the journal logs in all the relevant MDSes
+ * manually, or in the worst case we must wait at most 5 seconds
+ * to wait the journal logs to be flushed by the MDSes periodically.
+ */
+ if (req1 || req2) {
+ struct ceph_mds_request *req;
+ struct ceph_mds_session **sessions;
+ struct ceph_mds_session *s;
+ unsigned int max_sessions;
+ int i;
+
+ mutex_lock(&mdsc->mutex);
+ max_sessions = mdsc->max_sessions;
+
+ sessions = kcalloc(max_sessions, sizeof(s), GFP_KERNEL);
+ if (!sessions) {
+ mutex_unlock(&mdsc->mutex);
+ err = -ENOMEM;
+ goto out;
+ }
+
+ spin_lock(&ci->i_unsafe_lock);
+ if (req1) {
+ list_for_each_entry(req, &ci->i_unsafe_dirops,
+ r_unsafe_dir_item) {
+ s = req->r_session;
+ if (!s)
+ continue;
+ if (!sessions[s->s_mds]) {
+ s = ceph_get_mds_session(s);
+ sessions[s->s_mds] = s;
+ }
+ }
+ }
+ if (req2) {
+ list_for_each_entry(req, &ci->i_unsafe_iops,
+ r_unsafe_target_item) {
+ s = req->r_session;
+ if (!s)
+ continue;
+ if (!sessions[s->s_mds]) {
+ s = ceph_get_mds_session(s);
+ sessions[s->s_mds] = s;
+ }
+ }
+ }
+ spin_unlock(&ci->i_unsafe_lock);
+
+ /* the auth MDS */
+ spin_lock(&ci->i_ceph_lock);
+ if (ci->i_auth_cap) {
+ s = ci->i_auth_cap->session;
+ if (!sessions[s->s_mds])
+ sessions[s->s_mds] = ceph_get_mds_session(s);
+ }
+ spin_unlock(&ci->i_ceph_lock);
+ mutex_unlock(&mdsc->mutex);
+
+ /* send flush mdlog request to MDSes */
+ for (i = 0; i < max_sessions; i++) {
+ s = sessions[i];
+ if (s) {
+ send_flush_mdlog(s);
+ ceph_put_mds_session(s);
+ }
+ }
+ kfree(sessions);
+ }
+
dout("unsafe_request_wait %p wait on tid %llu %llu\n",
inode, req1 ? req1->r_tid : 0ULL, req2 ? req2->r_tid : 0ULL);
if (req1) {
@@ -2320,15 +2391,19 @@
ceph_timeout_jiffies(req1->r_timeout));
if (ret)
err = -EIO;
- ceph_mdsc_put_request(req1);
}
if (req2) {
ret = !wait_for_completion_timeout(&req2->r_safe_completion,
ceph_timeout_jiffies(req2->r_timeout));
if (ret)
err = -EIO;
- ceph_mdsc_put_request(req2);
}
+
+out:
+ if (req1)
+ ceph_mdsc_put_request(req1);
+ if (req2)
+ ceph_mdsc_put_request(req2);
return err;
}
@@ -3501,24 +3576,23 @@
fill_inline = true;
}
- if (ci->i_auth_cap == cap &&
- le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
- if (newcaps & ~extra_info->issued)
- wake = true;
+ if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT) {
+ if (ci->i_auth_cap == cap) {
+ if (newcaps & ~extra_info->issued)
+ wake = true;
- if (ci->i_requested_max_size > max_size ||
- !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) {
- /* re-request max_size if necessary */
- ci->i_requested_max_size = 0;
- wake = true;
+ if (ci->i_requested_max_size > max_size ||
+ !(le32_to_cpu(grant->wanted) & CEPH_CAP_ANY_FILE_WR)) {
+ /* re-request max_size if necessary */
+ ci->i_requested_max_size = 0;
+ wake = true;
+ }
+
+ ceph_kick_flushing_inode_caps(session, ci);
}
-
- ceph_kick_flushing_inode_caps(session, ci);
- spin_unlock(&ci->i_ceph_lock);
up_read(&session->s_mdsc->snap_rwsem);
- } else {
- spin_unlock(&ci->i_ceph_lock);
}
+ spin_unlock(&ci->i_ceph_lock);
if (fill_inline)
ceph_fill_inline_data(inode, NULL, extra_info->inline_data,
@@ -4311,33 +4385,9 @@
dout("flush_dirty_caps done\n");
}
-static void iterate_sessions(struct ceph_mds_client *mdsc,
- void (*cb)(struct ceph_mds_session *))
-{
- int mds;
-
- mutex_lock(&mdsc->mutex);
- for (mds = 0; mds < mdsc->max_sessions; ++mds) {
- struct ceph_mds_session *s;
-
- if (!mdsc->sessions[mds])
- continue;
-
- s = ceph_get_mds_session(mdsc->sessions[mds]);
- if (!s)
- continue;
-
- mutex_unlock(&mdsc->mutex);
- cb(s);
- ceph_put_mds_session(s);
- mutex_lock(&mdsc->mutex);
- }
- mutex_unlock(&mdsc->mutex);
-}
-
void ceph_flush_dirty_caps(struct ceph_mds_client *mdsc)
{
- iterate_sessions(mdsc, flush_dirty_session_caps);
+ ceph_mdsc_iterate_sessions(mdsc, flush_dirty_session_caps, true);
}
void __ceph_touch_fmode(struct ceph_inode_info *ci,
diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c
index f63c1a0..1fddb9c 100644
--- a/fs/ceph/dir.c
+++ b/fs/ceph/dir.c
@@ -478,8 +478,11 @@
2 : (fpos_off(rde->offset) + 1);
err = note_last_dentry(dfi, rde->name, rde->name_len,
next_offset);
- if (err)
+ if (err) {
+ ceph_mdsc_put_request(dfi->last_readdir);
+ dfi->last_readdir = NULL;
return err;
+ }
} else if (req->r_reply_info.dir_end) {
dfi->next_offset = 2;
/* keep last name */
@@ -520,6 +523,12 @@
if (!dir_emit(ctx, rde->name, rde->name_len,
ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
le32_to_cpu(rde->inode.in->mode) >> 12)) {
+ /*
+ * NOTE: Here no need to put the 'dfi->last_readdir',
+ * because when dir_emit stops us it's most likely
+ * doesn't have enough memory, etc. So for next readdir
+ * it will continue.
+ */
dout("filldir stopping us...\n");
return 0;
}
diff --git a/fs/ceph/file.c b/fs/ceph/file.c
index 4500508..943655e 100644
--- a/fs/ceph/file.c
+++ b/fs/ceph/file.c
@@ -592,9 +592,15 @@
iinfo.change_attr = 1;
ceph_encode_timespec64(&iinfo.btime, &now);
- iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
- iinfo.xattr_data = xattr_buf;
- memset(iinfo.xattr_data, 0, iinfo.xattr_len);
+ if (req->r_pagelist) {
+ iinfo.xattr_len = req->r_pagelist->length;
+ iinfo.xattr_data = req->r_pagelist->mapped_tail;
+ } else {
+ /* fake it */
+ iinfo.xattr_len = ARRAY_SIZE(xattr_buf);
+ iinfo.xattr_data = xattr_buf;
+ memset(iinfo.xattr_data, 0, iinfo.xattr_len);
+ }
in.ino = cpu_to_le64(vino.ino);
in.snapid = cpu_to_le64(CEPH_NOSNAP);
@@ -697,6 +703,12 @@
if (dentry->d_name.len > NAME_MAX)
return -ENAMETOOLONG;
+ /*
+ * Do not truncate the file, since atomic_open is called before the
+ * permission check. The caller will do the truncation afterward.
+ */
+ flags &= ~O_TRUNC;
+
if (flags & O_CREAT) {
if (ceph_quota_is_max_files_exceeded(dir))
return -EDQUOT;
@@ -706,6 +718,10 @@
err = ceph_security_init_secctx(dentry, mode, &as_ctx);
if (err < 0)
goto out_ctx;
+ /* Async create can't handle more than a page of xattrs */
+ if (as_ctx.pagelist &&
+ !list_is_singular(&as_ctx.pagelist->head))
+ try_async = false;
} else if (!d_in_lookup(dentry)) {
/* If it's not being looked up, it's negative */
return -ENOENT;
@@ -759,9 +775,7 @@
}
set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags);
- err = ceph_mdsc_do_request(mdsc,
- (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
- req);
+ err = ceph_mdsc_do_request(mdsc, (flags & O_CREAT) ? dir : NULL, req);
err = ceph_handle_snapdir(req, dentry, err);
if (err)
goto out_req;
diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c
index 981a915..fa51872 100644
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -809,6 +809,33 @@
}
}
+void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
+ void (*cb)(struct ceph_mds_session *),
+ bool check_state)
+{
+ int mds;
+
+ mutex_lock(&mdsc->mutex);
+ for (mds = 0; mds < mdsc->max_sessions; ++mds) {
+ struct ceph_mds_session *s;
+
+ s = __ceph_lookup_mds_session(mdsc, mds);
+ if (!s)
+ continue;
+
+ if (check_state && !check_session_state(s)) {
+ ceph_put_mds_session(s);
+ continue;
+ }
+
+ mutex_unlock(&mdsc->mutex);
+ cb(s);
+ ceph_put_mds_session(s);
+ mutex_lock(&mdsc->mutex);
+ }
+ mutex_unlock(&mdsc->mutex);
+}
+
void ceph_mdsc_release_request(struct kref *kref)
{
struct ceph_mds_request *req = container_of(kref,
@@ -1157,7 +1184,7 @@
/*
* session messages
*/
-static struct ceph_msg *create_session_msg(u32 op, u64 seq)
+struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq)
{
struct ceph_msg *msg;
struct ceph_mds_session_head *h;
@@ -1165,7 +1192,8 @@
msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h), GFP_NOFS,
false);
if (!msg) {
- pr_err("create_session_msg ENOMEM creating msg\n");
+ pr_err("ENOMEM creating session %s msg\n",
+ ceph_session_op_name(op));
return NULL;
}
h = msg->front.iov_base;
@@ -1184,14 +1212,17 @@
if (count > 0) {
size_t i;
size_t size = FEATURE_BYTES(count);
+ unsigned long bit;
if (WARN_ON_ONCE(*p + 4 + size > end))
return -ERANGE;
ceph_encode_32(p, size);
memset(*p, 0, size);
- for (i = 0; i < count; i++)
- ((unsigned char*)(*p))[i / 8] |= BIT(feature_bits[i] % 8);
+ for (i = 0; i < count; i++) {
+ bit = feature_bits[i];
+ ((unsigned char *)(*p))[bit / 8] |= BIT(bit % 8);
+ }
*p += size;
} else {
if (WARN_ON_ONCE(*p + 4 > end))
@@ -1296,7 +1327,7 @@
msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
GFP_NOFS, false);
if (!msg) {
- pr_err("create_session_msg ENOMEM creating msg\n");
+ pr_err("ENOMEM creating session open msg\n");
return ERR_PTR(-ENOMEM);
}
p = msg->front.iov_base;
@@ -1830,8 +1861,8 @@
dout("send_renew_caps to mds%d (%s)\n", session->s_mds,
ceph_mds_state_name(state));
- msg = create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
- ++session->s_renew_seq);
+ msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_RENEWCAPS,
+ ++session->s_renew_seq);
if (!msg)
return -ENOMEM;
ceph_con_send(&session->s_con, msg);
@@ -1845,7 +1876,7 @@
dout("send_flushmsg_ack to mds%d (%s)s seq %lld\n",
session->s_mds, ceph_session_state_name(session->s_state), seq);
- msg = create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
+ msg = ceph_create_session_msg(CEPH_SESSION_FLUSHMSG_ACK, seq);
if (!msg)
return -ENOMEM;
ceph_con_send(&session->s_con, msg);
@@ -1897,7 +1928,8 @@
dout("request_close_session mds%d state %s seq %lld\n",
session->s_mds, ceph_session_state_name(session->s_state),
session->s_seq);
- msg = create_session_msg(CEPH_SESSION_REQUEST_CLOSE, session->s_seq);
+ msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_CLOSE,
+ session->s_seq);
if (!msg)
return -ENOMEM;
ceph_con_send(&session->s_con, msg);
@@ -4372,24 +4404,12 @@
}
/*
- * lock unlock sessions, to wait ongoing session activities
+ * lock unlock the session, to wait ongoing session activities
*/
-static void lock_unlock_sessions(struct ceph_mds_client *mdsc)
+static void lock_unlock_session(struct ceph_mds_session *s)
{
- int i;
-
- mutex_lock(&mdsc->mutex);
- for (i = 0; i < mdsc->max_sessions; i++) {
- struct ceph_mds_session *s = __ceph_lookup_mds_session(mdsc, i);
- if (!s)
- continue;
- mutex_unlock(&mdsc->mutex);
- mutex_lock(&s->s_mutex);
- mutex_unlock(&s->s_mutex);
- ceph_put_mds_session(s);
- mutex_lock(&mdsc->mutex);
- }
- mutex_unlock(&mdsc->mutex);
+ mutex_lock(&s->s_mutex);
+ mutex_unlock(&s->s_mutex);
}
static void maybe_recover_session(struct ceph_mds_client *mdsc)
@@ -4644,6 +4664,30 @@
dout("wait_requests done\n");
}
+void send_flush_mdlog(struct ceph_mds_session *s)
+{
+ struct ceph_msg *msg;
+
+ /*
+ * Pre-luminous MDS crashes when it sees an unknown session request
+ */
+ if (!CEPH_HAVE_FEATURE(s->s_con.peer_features, SERVER_LUMINOUS))
+ return;
+
+ mutex_lock(&s->s_mutex);
+ dout("request mdlog flush to mds%d (%s)s seq %lld\n", s->s_mds,
+ ceph_session_state_name(s->s_state), s->s_seq);
+ msg = ceph_create_session_msg(CEPH_SESSION_REQUEST_FLUSH_MDLOG,
+ s->s_seq);
+ if (!msg) {
+ pr_err("failed to request mdlog flush to mds%d (%s) seq %lld\n",
+ s->s_mds, ceph_session_state_name(s->s_state), s->s_seq);
+ } else {
+ ceph_con_send(&s->s_con, msg);
+ }
+ mutex_unlock(&s->s_mutex);
+}
+
/*
* called before mount is ro, and before dentries are torn down.
* (hmm, does this still race with new lookups?)
@@ -4653,7 +4697,8 @@
dout("pre_umount\n");
mdsc->stopping = 1;
- lock_unlock_sessions(mdsc);
+ ceph_mdsc_iterate_sessions(mdsc, send_flush_mdlog, true);
+ ceph_mdsc_iterate_sessions(mdsc, lock_unlock_session, false);
ceph_flush_dirty_caps(mdsc);
wait_requests(mdsc);
diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h
index f5adbeb..a92e42e 100644
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -33,10 +33,6 @@
CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_METRIC_COLLECT,
};
-/*
- * This will always have the highest feature bit value
- * as the last element of the array.
- */
#define CEPHFS_FEATURES_CLIENT_SUPPORTED { \
0, 1, 2, 3, 4, 5, 6, 7, \
CEPHFS_FEATURE_MIMIC, \
@@ -45,8 +41,6 @@
CEPHFS_FEATURE_MULTI_RECONNECT, \
CEPHFS_FEATURE_DELEG_INO, \
CEPHFS_FEATURE_METRIC_COLLECT, \
- \
- CEPHFS_FEATURE_MAX, \
}
#define CEPHFS_FEATURES_CLIENT_REQUIRED {}
@@ -524,6 +518,11 @@
kref_put(&req->r_kref, ceph_mdsc_release_request);
}
+extern void send_flush_mdlog(struct ceph_mds_session *s);
+extern void ceph_mdsc_iterate_sessions(struct ceph_mds_client *mdsc,
+ void (*cb)(struct ceph_mds_session *),
+ bool check_state);
+extern struct ceph_msg *ceph_create_session_msg(u32 op, u64 seq);
extern void __ceph_queue_cap_release(struct ceph_mds_session *session,
struct ceph_cap *cap);
extern void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c
index 0369f67..734873b 100644
--- a/fs/ceph/snap.c
+++ b/fs/ceph/snap.c
@@ -697,9 +697,10 @@
struct ceph_mds_snap_realm *ri; /* encoded */
__le64 *snaps; /* encoded */
__le64 *prior_parent_snaps; /* encoded */
- struct ceph_snap_realm *realm = NULL;
+ struct ceph_snap_realm *realm;
struct ceph_snap_realm *first_realm = NULL;
- int invalidate = 0;
+ struct ceph_snap_realm *realm_to_rebuild = NULL;
+ int rebuild_snapcs;
int err = -ENOMEM;
LIST_HEAD(dirty_realms);
@@ -707,6 +708,8 @@
dout("update_snap_trace deletion=%d\n", deletion);
more:
+ realm = NULL;
+ rebuild_snapcs = 0;
ceph_decode_need(&p, e, sizeof(*ri), bad);
ri = p;
p += sizeof(*ri);
@@ -730,7 +733,7 @@
err = adjust_snap_realm_parent(mdsc, realm, le64_to_cpu(ri->parent));
if (err < 0)
goto fail;
- invalidate += err;
+ rebuild_snapcs += err;
if (le64_to_cpu(ri->seq) > realm->seq) {
dout("update_snap_trace updating %llx %p %lld -> %lld\n",
@@ -755,22 +758,30 @@
if (realm->seq > mdsc->last_snap_seq)
mdsc->last_snap_seq = realm->seq;
- invalidate = 1;
+ rebuild_snapcs = 1;
} else if (!realm->cached_context) {
dout("update_snap_trace %llx %p seq %lld new\n",
realm->ino, realm, realm->seq);
- invalidate = 1;
+ rebuild_snapcs = 1;
} else {
dout("update_snap_trace %llx %p seq %lld unchanged\n",
realm->ino, realm, realm->seq);
}
- dout("done with %llx %p, invalidated=%d, %p %p\n", realm->ino,
- realm, invalidate, p, e);
+ dout("done with %llx %p, rebuild_snapcs=%d, %p %p\n", realm->ino,
+ realm, rebuild_snapcs, p, e);
- /* invalidate when we reach the _end_ (root) of the trace */
- if (invalidate && p >= e)
- rebuild_snap_realms(realm, &dirty_realms);
+ /*
+ * this will always track the uppest parent realm from which
+ * we need to rebuild the snapshot contexts _downward_ in
+ * hierarchy.
+ */
+ if (rebuild_snapcs)
+ realm_to_rebuild = realm;
+
+ /* rebuild_snapcs when we reach the _end_ (root) of the trace */
+ if (realm_to_rebuild && p >= e)
+ rebuild_snap_realms(realm_to_rebuild, &dirty_realms);
if (!first_realm)
first_realm = realm;
diff --git a/fs/ceph/strings.c b/fs/ceph/strings.c
index 4a79f36..573bb95 100644
--- a/fs/ceph/strings.c
+++ b/fs/ceph/strings.c
@@ -46,6 +46,7 @@
case CEPH_SESSION_FLUSHMSG_ACK: return "flushmsg_ack";
case CEPH_SESSION_FORCE_RO: return "force_ro";
case CEPH_SESSION_REJECT: return "reject";
+ case CEPH_SESSION_REQUEST_FLUSH_MDLOG: return "flush_mdlog";
}
return "???";
}
diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c
index 197cb12..76322c0 100644
--- a/fs/ceph/xattr.c
+++ b/fs/ceph/xattr.c
@@ -317,6 +317,14 @@
}
#define XATTR_RSTAT_FIELD(_type, _name) \
XATTR_NAME_CEPH(_type, _name, VXATTR_FLAG_RSTAT)
+#define XATTR_RSTAT_FIELD_UPDATABLE(_type, _name) \
+ { \
+ .name = CEPH_XATTR_NAME(_type, _name), \
+ .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
+ .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
+ .exists_cb = NULL, \
+ .flags = VXATTR_FLAG_RSTAT, \
+ }
#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
{ \
.name = CEPH_XATTR_NAME2(_type, _name, _field), \
@@ -354,7 +362,7 @@
XATTR_RSTAT_FIELD(dir, rfiles),
XATTR_RSTAT_FIELD(dir, rsubdirs),
XATTR_RSTAT_FIELD(dir, rbytes),
- XATTR_RSTAT_FIELD(dir, rctime),
+ XATTR_RSTAT_FIELD_UPDATABLE(dir, rctime),
{
.name = "ceph.dir.pin",
.name_size = sizeof("ceph.dir.pin"),
diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c
index aa5a4d7..f442ef8 100644
--- a/fs/cifs/cifsfs.c
+++ b/fs/cifs/cifsfs.c
@@ -898,7 +898,7 @@
ssize_t rc;
struct inode *inode = file_inode(iocb->ki_filp);
- if (iocb->ki_filp->f_flags & O_DIRECT)
+ if (iocb->ki_flags & IOCB_DIRECT)
return cifs_user_readv(iocb, iter);
rc = cifs_revalidate_mapping(inode);
@@ -1033,7 +1033,7 @@
};
MODULE_ALIAS_FS("cifs");
-static struct file_system_type smb3_fs_type = {
+struct file_system_type smb3_fs_type = {
.owner = THIS_MODULE,
.name = "smb3",
.mount = smb3_do_mount,
@@ -1221,8 +1221,11 @@
ssize_t rc;
struct cifsFileInfo *cfile = dst_file->private_data;
- if (cfile->swapfile)
- return -EOPNOTSUPP;
+ if (cfile->swapfile) {
+ rc = -EOPNOTSUPP;
+ free_xid(xid);
+ return rc;
+ }
rc = cifs_file_copychunk_range(xid, src_file, off, dst_file, destoff,
len, flags);
diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h
index 905d038..e996f0b 100644
--- a/fs/cifs/cifsfs.h
+++ b/fs/cifs/cifsfs.h
@@ -51,7 +51,7 @@
return (unsigned long) dentry->d_fsdata;
}
-extern struct file_system_type cifs_fs_type;
+extern struct file_system_type cifs_fs_type, smb3_fs_type;
extern const struct address_space_operations cifs_addr_ops;
extern const struct address_space_operations cifs_addr_ops_smallbuf;
diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h
index 6599069..196285b 100644
--- a/fs/cifs/cifsglob.h
+++ b/fs/cifs/cifsglob.h
@@ -1982,11 +1982,13 @@
/* Operations for different SMB versions */
#define SMB1_VERSION_STRING "1.0"
+#define SMB20_VERSION_STRING "2.0"
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
extern struct smb_version_operations smb1_operations;
extern struct smb_version_values smb1_values;
-#define SMB20_VERSION_STRING "2.0"
extern struct smb_version_operations smb20_operations;
extern struct smb_version_values smb20_values;
+#endif /* CIFS_ALLOW_INSECURE_LEGACY */
#define SMB21_VERSION_STRING "2.1"
extern struct smb_version_operations smb21_operations;
extern struct smb_version_values smb21_values;
diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h
index 24c6f36..a6ca4ed 100644
--- a/fs/cifs/cifsproto.h
+++ b/fs/cifs/cifsproto.h
@@ -230,6 +230,8 @@
extern void dequeue_mid(struct mid_q_entry *mid, bool malformed);
extern int cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
unsigned int to_read);
+extern ssize_t cifs_discard_from_socket(struct TCP_Server_Info *server,
+ size_t to_read);
extern int cifs_read_page_from_socket(struct TCP_Server_Info *server,
struct page *page,
unsigned int page_offset,
diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c
index 0496934..c279527 100644
--- a/fs/cifs/cifssmb.c
+++ b/fs/cifs/cifssmb.c
@@ -1451,9 +1451,9 @@
while (remaining > 0) {
int length;
- length = cifs_read_from_socket(server, server->bigbuf,
- min_t(unsigned int, remaining,
- CIFSMaxBufSize + MAX_HEADER_SIZE(server)));
+ length = cifs_discard_from_socket(server,
+ min_t(size_t, remaining,
+ CIFSMaxBufSize + MAX_HEADER_SIZE(server)));
if (length < 0)
return length;
server->total_read += length;
diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c
index 7f5d173..d1c3086 100644
--- a/fs/cifs/connect.c
+++ b/fs/cifs/connect.c
@@ -695,9 +695,6 @@
int length = 0;
int total_read;
- smb_msg->msg_control = NULL;
- smb_msg->msg_controllen = 0;
-
for (total_read = 0; msg_data_left(smb_msg); total_read += length) {
try_to_freeze();
@@ -748,18 +745,33 @@
cifs_read_from_socket(struct TCP_Server_Info *server, char *buf,
unsigned int to_read)
{
- struct msghdr smb_msg;
+ struct msghdr smb_msg = {};
struct kvec iov = {.iov_base = buf, .iov_len = to_read};
iov_iter_kvec(&smb_msg.msg_iter, READ, &iov, 1, to_read);
return cifs_readv_from_socket(server, &smb_msg);
}
+ssize_t
+cifs_discard_from_socket(struct TCP_Server_Info *server, size_t to_read)
+{
+ struct msghdr smb_msg = {};
+
+ /*
+ * iov_iter_discard already sets smb_msg.type and count and iov_offset
+ * and cifs_readv_from_socket sets msg_control and msg_controllen
+ * so little to initialize in struct msghdr
+ */
+ iov_iter_discard(&smb_msg.msg_iter, READ, to_read);
+
+ return cifs_readv_from_socket(server, &smb_msg);
+}
+
int
cifs_read_page_from_socket(struct TCP_Server_Info *server, struct page *page,
unsigned int page_offset, unsigned int to_read)
{
- struct msghdr smb_msg;
+ struct msghdr smb_msg = {};
struct bio_vec bv = {
.bv_page = page, .bv_len = to_read, .bv_offset = page_offset};
iov_iter_bvec(&smb_msg.msg_iter, READ, &bv, 1, to_read);
diff --git a/fs/cifs/file.c b/fs/cifs/file.c
index 6c06870..144064d 100644
--- a/fs/cifs/file.c
+++ b/fs/cifs/file.c
@@ -1735,11 +1735,13 @@
struct cifsFileInfo *cfile;
__u32 type;
- rc = -EACCES;
xid = get_xid();
- if (!(fl->fl_flags & FL_FLOCK))
- return -ENOLCK;
+ if (!(fl->fl_flags & FL_FLOCK)) {
+ rc = -ENOLCK;
+ free_xid(xid);
+ return rc;
+ }
cfile = (struct cifsFileInfo *)file->private_data;
tcon = tlink_tcon(cfile->tlink);
@@ -1758,8 +1760,9 @@
* if no lock or unlock then nothing to do since we do not
* know what it is
*/
+ rc = -EOPNOTSUPP;
free_xid(xid);
- return -EOPNOTSUPP;
+ return rc;
}
rc = cifs_setlk(file, fl, type, wait_flag, posix_lck, lock, unlock,
@@ -3244,6 +3247,9 @@
ssize_t cifs_direct_writev(struct kiocb *iocb, struct iov_iter *from)
{
+ struct file *file = iocb->ki_filp;
+
+ cifs_revalidate_mapping(file->f_inode);
return __cifs_writev(iocb, from, true);
}
@@ -3933,6 +3939,15 @@
len = ctx->len;
}
+ if (direct) {
+ rc = filemap_write_and_wait_range(file->f_inode->i_mapping,
+ offset, offset + len - 1);
+ if (rc) {
+ kref_put(&ctx->refcount, cifs_aio_ctx_release);
+ return -EAGAIN;
+ }
+ }
+
/* grab a lock here due to read response handlers can access ctx */
mutex_lock(&ctx->aio_mutex);
diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c
index dcde44f..e45598b 100644
--- a/fs/cifs/ioctl.c
+++ b/fs/cifs/ioctl.c
@@ -193,7 +193,7 @@
rc = put_user(ExtAttrBits &
FS_FL_USER_VISIBLE,
(int __user *)arg);
- if (rc != EOPNOTSUPP)
+ if (rc != -EOPNOTSUPP)
break;
}
#endif /* CONFIG_CIFS_POSIX */
@@ -222,7 +222,7 @@
* pSMBFile->fid.netfid,
* extAttrBits,
* &ExtAttrMask);
- * if (rc != EOPNOTSUPP)
+ * if (rc != -EOPNOTSUPP)
* break;
*/
diff --git a/fs/cifs/link.c b/fs/cifs/link.c
index 94dab43..85d30fe 100644
--- a/fs/cifs/link.c
+++ b/fs/cifs/link.c
@@ -97,6 +97,9 @@
if (rc != 1)
return -EINVAL;
+ if (link_len > CIFS_MF_SYMLINK_LINK_MAXLEN)
+ return -EINVAL;
+
rc = symlink_hash(link_len, link_str, md5_hash);
if (rc) {
cifs_dbg(FYI, "%s: MD5 hash failure: %d\n", __func__, rc);
diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c
index 1c14cf0..9d74091 100644
--- a/fs/cifs/misc.c
+++ b/fs/cifs/misc.c
@@ -1053,18 +1053,23 @@
.data = data,
.sb = NULL,
};
+ struct file_system_type **fs_type = (struct file_system_type *[]) {
+ &cifs_fs_type, &smb3_fs_type, NULL,
+ };
- iterate_supers_type(&cifs_fs_type, f, &sd);
-
- if (!sd.sb)
- return ERR_PTR(-EINVAL);
- /*
- * Grab an active reference in order to prevent automounts (DFS links)
- * of expiring and then freeing up our cifs superblock pointer while
- * we're doing failover.
- */
- cifs_sb_active(sd.sb);
- return sd.sb;
+ for (; *fs_type; fs_type++) {
+ iterate_supers_type(*fs_type, f, &sd);
+ if (sd.sb) {
+ /*
+ * Grab an active reference in order to prevent automounts (DFS links)
+ * of expiring and then freeing up our cifs superblock pointer while
+ * we're doing failover.
+ */
+ cifs_sb_active(sd.sb);
+ return sd.sb;
+ }
+ }
+ return ERR_PTR(-EINVAL);
}
static void __cifs_put_super(struct super_block *sb)
diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c
index d58c5ff..cf6fd13 100644
--- a/fs/cifs/sess.c
+++ b/fs/cifs/sess.c
@@ -306,6 +306,7 @@
cifs_put_tcp_session(chan->server, 0);
unload_nls(vol.local_nls);
+ free_xid(xid);
return rc;
}
diff --git a/fs/cifs/smb2file.c b/fs/cifs/smb2file.c
index 2fa3ba3..001c26d 100644
--- a/fs/cifs/smb2file.c
+++ b/fs/cifs/smb2file.c
@@ -74,7 +74,6 @@
nr_ioctl_req.Reserved = 0;
rc = SMB2_ioctl(xid, oparms->tcon, fid->persistent_fid,
fid->volatile_fid, FSCTL_LMR_REQUEST_RESILIENCY,
- true /* is_fsctl */,
(char *)&nr_ioctl_req, sizeof(nr_ioctl_req),
CIFSMaxBufSize, NULL, NULL /* no return info */);
if (rc == -EOPNOTSUPP) {
diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c
index a718dc7..97cd4df 100644
--- a/fs/cifs/smb2inode.c
+++ b/fs/cifs/smb2inode.c
@@ -371,8 +371,6 @@
num_rqst++;
if (cfile) {
- cifsFileInfo_put(cfile);
- cfile = NULL;
rc = compound_send_recv(xid, ses, server,
flags, num_rqst - 2,
&rqst[1], &resp_buftype[1],
diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c
index fdb1d66..72368b6 100644
--- a/fs/cifs/smb2ops.c
+++ b/fs/cifs/smb2ops.c
@@ -587,7 +587,7 @@
struct cifs_ses *ses = tcon->ses;
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
- FSCTL_QUERY_NETWORK_INTERFACE_INFO, true /* is_fsctl */,
+ FSCTL_QUERY_NETWORK_INTERFACE_INFO,
NULL /* no data input */, 0 /* no data input */,
CIFSMaxBufSize, (char **)&out_buf, &ret_data_len);
if (rc == -EOPNOTSUPP) {
@@ -1000,9 +1000,7 @@
size_t name_len, value_len, user_name_len;
while (src_size > 0) {
- name = &src->ea_data[0];
name_len = (size_t)src->ea_name_length;
- value = &src->ea_data[src->ea_name_length + 1];
value_len = (size_t)le16_to_cpu(src->ea_value_length);
if (name_len == 0)
@@ -1014,6 +1012,9 @@
goto out;
}
+ name = &src->ea_data[0];
+ value = &src->ea_data[src->ea_name_length + 1];
+
if (ea_name) {
if (ea_name_len == name_len &&
memcmp(ea_name, name, name_len) == 0) {
@@ -1255,6 +1256,8 @@
COMPOUND_FID, current->tgid,
FILE_FULL_EA_INFORMATION,
SMB2_O_INFO_FILE, 0, data, size);
+ if (rc)
+ goto sea_exit;
smb2_set_next_command(tcon, &rqst[1]);
smb2_set_related(&rqst[1]);
@@ -1265,6 +1268,8 @@
rqst[2].rq_nvec = 1;
rc = SMB2_close_init(tcon, server,
&rqst[2], COMPOUND_FID, COMPOUND_FID, false);
+ if (rc)
+ goto sea_exit;
smb2_set_related(&rqst[2]);
rc = compound_send_recv(xid, ses, server,
@@ -1469,9 +1474,8 @@
struct resume_key_req *res_key;
rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
- FSCTL_SRV_REQUEST_RESUME_KEY, true /* is_fsctl */,
- NULL, 0 /* no input */, CIFSMaxBufSize,
- (char **)&res_key, &ret_data_len);
+ FSCTL_SRV_REQUEST_RESUME_KEY, NULL, 0 /* no input */,
+ CIFSMaxBufSize, (char **)&res_key, &ret_data_len);
if (rc) {
cifs_tcon_dbg(VFS, "refcpy ioctl error %d getting resume key\n", rc);
@@ -1526,6 +1530,7 @@
unsigned int size[2];
void *data[2];
int create_options = is_dir ? CREATE_NOT_FILE : CREATE_NOT_DIR;
+ void (*free_req1_func)(struct smb_rqst *r);
vars = kzalloc(sizeof(*vars), GFP_ATOMIC);
if (vars == NULL)
@@ -1535,27 +1540,29 @@
resp_buftype[0] = resp_buftype[1] = resp_buftype[2] = CIFS_NO_BUFFER;
- if (copy_from_user(&qi, arg, sizeof(struct smb_query_info)))
- goto e_fault;
-
+ if (copy_from_user(&qi, arg, sizeof(struct smb_query_info))) {
+ rc = -EFAULT;
+ goto free_vars;
+ }
if (qi.output_buffer_length > 1024) {
- kfree(vars);
- return -EINVAL;
+ rc = -EINVAL;
+ goto free_vars;
}
if (!ses || !server) {
- kfree(vars);
- return -EIO;
+ rc = -EIO;
+ goto free_vars;
}
if (smb3_encryption_required(tcon))
flags |= CIFS_TRANSFORM_REQ;
- buffer = memdup_user(arg + sizeof(struct smb_query_info),
- qi.output_buffer_length);
- if (IS_ERR(buffer)) {
- kfree(vars);
- return PTR_ERR(buffer);
+ if (qi.output_buffer_length) {
+ buffer = memdup_user(arg + sizeof(struct smb_query_info), qi.output_buffer_length);
+ if (IS_ERR(buffer)) {
+ rc = PTR_ERR(buffer);
+ goto free_vars;
+ }
}
/* Open */
@@ -1593,45 +1600,45 @@
rc = SMB2_open_init(tcon, server,
&rqst[0], &oplock, &oparms, path);
if (rc)
- goto iqinf_exit;
+ goto free_output_buffer;
smb2_set_next_command(tcon, &rqst[0]);
/* Query */
if (qi.flags & PASSTHRU_FSCTL) {
/* Can eventually relax perm check since server enforces too */
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_SYS_ADMIN)) {
rc = -EPERM;
- else {
- rqst[1].rq_iov = &vars->io_iov[0];
- rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE;
-
- rc = SMB2_ioctl_init(tcon, server,
- &rqst[1],
- COMPOUND_FID, COMPOUND_FID,
- qi.info_type, true, buffer,
- qi.output_buffer_length,
- CIFSMaxBufSize -
- MAX_SMB2_CREATE_RESPONSE_SIZE -
- MAX_SMB2_CLOSE_RESPONSE_SIZE);
+ goto free_open_req;
}
+ rqst[1].rq_iov = &vars->io_iov[0];
+ rqst[1].rq_nvec = SMB2_IOCTL_IOV_SIZE;
+
+ rc = SMB2_ioctl_init(tcon, server, &rqst[1], COMPOUND_FID, COMPOUND_FID,
+ qi.info_type, buffer, qi.output_buffer_length,
+ CIFSMaxBufSize - MAX_SMB2_CREATE_RESPONSE_SIZE -
+ MAX_SMB2_CLOSE_RESPONSE_SIZE);
+ free_req1_func = SMB2_ioctl_free;
} else if (qi.flags == PASSTHRU_SET_INFO) {
/* Can eventually relax perm check since server enforces too */
- if (!capable(CAP_SYS_ADMIN))
+ if (!capable(CAP_SYS_ADMIN)) {
rc = -EPERM;
- else {
- rqst[1].rq_iov = &vars->si_iov[0];
- rqst[1].rq_nvec = 1;
-
- size[0] = 8;
- data[0] = buffer;
-
- rc = SMB2_set_info_init(tcon, server,
- &rqst[1],
- COMPOUND_FID, COMPOUND_FID,
- current->tgid,
- FILE_END_OF_FILE_INFORMATION,
- SMB2_O_INFO_FILE, 0, data, size);
+ goto free_open_req;
}
+ if (qi.output_buffer_length < 8) {
+ rc = -EINVAL;
+ goto free_open_req;
+ }
+ rqst[1].rq_iov = &vars->si_iov[0];
+ rqst[1].rq_nvec = 1;
+
+ /* MS-FSCC 2.4.13 FileEndOfFileInformation */
+ size[0] = 8;
+ data[0] = buffer;
+
+ rc = SMB2_set_info_init(tcon, server, &rqst[1], COMPOUND_FID, COMPOUND_FID,
+ current->tgid, FILE_END_OF_FILE_INFORMATION,
+ SMB2_O_INFO_FILE, 0, data, size);
+ free_req1_func = SMB2_set_info_free;
} else if (qi.flags == PASSTHRU_QUERY_INFO) {
rqst[1].rq_iov = &vars->qi_iov[0];
rqst[1].rq_nvec = 1;
@@ -1642,6 +1649,7 @@
qi.info_type, qi.additional_information,
qi.input_buffer_length,
qi.output_buffer_length, buffer);
+ free_req1_func = SMB2_query_info_free;
} else { /* unknown flags */
cifs_tcon_dbg(VFS, "Invalid passthru query flags: 0x%x\n",
qi.flags);
@@ -1649,7 +1657,7 @@
}
if (rc)
- goto iqinf_exit;
+ goto free_open_req;
smb2_set_next_command(tcon, &rqst[1]);
smb2_set_related(&rqst[1]);
@@ -1660,14 +1668,14 @@
rc = SMB2_close_init(tcon, server,
&rqst[2], COMPOUND_FID, COMPOUND_FID, false);
if (rc)
- goto iqinf_exit;
+ goto free_req_1;
smb2_set_related(&rqst[2]);
rc = compound_send_recv(xid, ses, server,
flags, 3, rqst,
resp_buftype, rsp_iov);
if (rc)
- goto iqinf_exit;
+ goto out;
/* No need to bump num_remote_opens since handle immediately closed */
if (qi.flags & PASSTHRU_FSCTL) {
@@ -1677,18 +1685,22 @@
qi.input_buffer_length = le32_to_cpu(io_rsp->OutputCount);
if (qi.input_buffer_length > 0 &&
le32_to_cpu(io_rsp->OutputOffset) + qi.input_buffer_length
- > rsp_iov[1].iov_len)
- goto e_fault;
+ > rsp_iov[1].iov_len) {
+ rc = -EFAULT;
+ goto out;
+ }
if (copy_to_user(&pqi->input_buffer_length,
&qi.input_buffer_length,
- sizeof(qi.input_buffer_length)))
- goto e_fault;
+ sizeof(qi.input_buffer_length))) {
+ rc = -EFAULT;
+ goto out;
+ }
if (copy_to_user((void __user *)pqi + sizeof(struct smb_query_info),
(const void *)io_rsp + le32_to_cpu(io_rsp->OutputOffset),
qi.input_buffer_length))
- goto e_fault;
+ rc = -EFAULT;
} else {
pqi = (struct smb_query_info __user *)arg;
qi_rsp = (struct smb2_query_info_rsp *)rsp_iov[1].iov_base;
@@ -1696,28 +1708,30 @@
qi.input_buffer_length = le32_to_cpu(qi_rsp->OutputBufferLength);
if (copy_to_user(&pqi->input_buffer_length,
&qi.input_buffer_length,
- sizeof(qi.input_buffer_length)))
- goto e_fault;
+ sizeof(qi.input_buffer_length))) {
+ rc = -EFAULT;
+ goto out;
+ }
if (copy_to_user(pqi + 1, qi_rsp->Buffer,
qi.input_buffer_length))
- goto e_fault;
+ rc = -EFAULT;
}
- iqinf_exit:
- cifs_small_buf_release(rqst[0].rq_iov[0].iov_base);
- cifs_small_buf_release(rqst[1].rq_iov[0].iov_base);
- cifs_small_buf_release(rqst[2].rq_iov[0].iov_base);
+out:
free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base);
free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base);
free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base);
- kfree(vars);
+ SMB2_close_free(&rqst[2]);
+free_req_1:
+ free_req1_func(&rqst[1]);
+free_open_req:
+ SMB2_open_free(&rqst[0]);
+free_output_buffer:
kfree(buffer);
+free_vars:
+ kfree(vars);
return rc;
-
-e_fault:
- rc = -EFAULT;
- goto iqinf_exit;
}
static ssize_t
@@ -1734,9 +1748,17 @@
int chunks_copied = 0;
bool chunk_sizes_updated = false;
ssize_t bytes_written, total_bytes_written = 0;
+ struct inode *inode;
pcchunk = kmalloc(sizeof(struct copychunk_ioctl), GFP_KERNEL);
+ /*
+ * We need to flush all unwritten data before we can send the
+ * copychunk ioctl to the server.
+ */
+ inode = d_inode(trgtfile->dentry);
+ filemap_write_and_wait(inode->i_mapping);
+
if (pcchunk == NULL)
return -ENOMEM;
@@ -1768,9 +1790,8 @@
retbuf = NULL;
rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE,
- true /* is_fsctl */, (char *)pcchunk,
- sizeof(struct copychunk_ioctl), CIFSMaxBufSize,
- (char **)&retbuf, &ret_data_len);
+ (char *)pcchunk, sizeof(struct copychunk_ioctl),
+ CIFSMaxBufSize, (char **)&retbuf, &ret_data_len);
if (rc == 0) {
if (ret_data_len !=
sizeof(struct copychunk_ioctl_rsp)) {
@@ -1930,7 +1951,6 @@
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, FSCTL_SET_SPARSE,
- true /* is_fctl */,
&setsparse, 1, CIFSMaxBufSize, NULL, NULL);
if (rc) {
tcon->broken_sparse_sup = true;
@@ -2013,7 +2033,6 @@
rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid,
trgtfile->fid.volatile_fid,
FSCTL_DUPLICATE_EXTENTS_TO_FILE,
- true /* is_fsctl */,
(char *)&dup_ext_buf,
sizeof(struct duplicate_extents_to_file),
CIFSMaxBufSize, NULL,
@@ -2048,7 +2067,6 @@
return SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid,
FSCTL_SET_INTEGRITY_INFORMATION,
- true /* is_fsctl */,
(char *)&integr_info,
sizeof(struct fsctl_set_integrity_information_req),
CIFSMaxBufSize, NULL,
@@ -2101,7 +2119,6 @@
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid,
FSCTL_SRV_ENUMERATE_SNAPSHOTS,
- true /* is_fsctl */,
NULL, 0 /* no input data */, max_response_size,
(char **)&retbuf,
&ret_data_len);
@@ -2743,7 +2760,6 @@
do {
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
FSCTL_DFS_GET_REFERRALS,
- true /* is_fsctl */,
(char *)dfs_req, dfs_req_size, CIFSMaxBufSize,
(char **)&dfs_rsp, &dfs_rsp_size);
} while (rc == -EAGAIN);
@@ -2945,8 +2961,7 @@
rc = SMB2_ioctl_init(tcon, server,
&rqst[1], fid.persistent_fid,
- fid.volatile_fid, FSCTL_GET_REPARSE_POINT,
- true /* is_fctl */, NULL, 0,
+ fid.volatile_fid, FSCTL_GET_REPARSE_POINT, NULL, 0,
CIFSMaxBufSize -
MAX_SMB2_CREATE_RESPONSE_SIZE -
MAX_SMB2_CLOSE_RESPONSE_SIZE);
@@ -3126,8 +3141,7 @@
rc = SMB2_ioctl_init(tcon, server,
&rqst[1], COMPOUND_FID,
- COMPOUND_FID, FSCTL_GET_REPARSE_POINT,
- true /* is_fctl */, NULL, 0,
+ COMPOUND_FID, FSCTL_GET_REPARSE_POINT, NULL, 0,
CIFSMaxBufSize -
MAX_SMB2_CREATE_RESPONSE_SIZE -
MAX_SMB2_CLOSE_RESPONSE_SIZE);
@@ -3390,7 +3404,7 @@
fsctl_buf.BeyondFinalZero = cpu_to_le64(offset + len);
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
- cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA, true,
+ cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
(char *)&fsctl_buf,
sizeof(struct file_zero_data_information),
0, NULL, NULL);
@@ -3420,7 +3434,7 @@
static long smb3_punch_hole(struct file *file, struct cifs_tcon *tcon,
loff_t offset, loff_t len)
{
- struct inode *inode;
+ struct inode *inode = file_inode(file);
struct cifsFileInfo *cfile = file->private_data;
struct file_zero_data_information fsctl_buf;
long rc;
@@ -3429,14 +3443,12 @@
xid = get_xid();
- inode = d_inode(cfile->dentry);
-
+ inode_lock(inode);
/* Need to make file sparse, if not already, before freeing range. */
/* Consider adding equivalent for compressed since it could also work */
if (!smb2_set_sparse(xid, tcon, cfile, inode, set_sparse)) {
rc = -EOPNOTSUPP;
- free_xid(xid);
- return rc;
+ goto out;
}
/*
@@ -3452,9 +3464,11 @@
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid, FSCTL_SET_ZERO_DATA,
- true /* is_fctl */, (char *)&fsctl_buf,
+ (char *)&fsctl_buf,
sizeof(struct file_zero_data_information),
CIFSMaxBufSize, NULL, NULL);
+out:
+ inode_unlock(inode);
free_xid(xid);
return rc;
}
@@ -3511,7 +3525,7 @@
in_data.length = cpu_to_le64(len);
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid,
- FSCTL_QUERY_ALLOCATED_RANGES, true,
+ FSCTL_QUERY_ALLOCATED_RANGES,
(char *)&in_data, sizeof(in_data),
1024 * sizeof(struct file_allocated_range_buffer),
(char **)&out_data, &out_data_len);
@@ -3614,7 +3628,7 @@
if (rc)
goto out;
- if ((cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE) == 0)
+ if (cifsi->cifsAttrs & FILE_ATTRIBUTE_SPARSE_FILE)
smb2_set_sparse(xid, tcon, cfile, inode, false);
eof = cpu_to_le64(off + len);
@@ -3752,7 +3766,7 @@
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid,
- FSCTL_QUERY_ALLOCATED_RANGES, true,
+ FSCTL_QUERY_ALLOCATED_RANGES,
(char *)&in_data, sizeof(in_data),
sizeof(struct file_allocated_range_buffer),
(char **)&out_data, &out_data_len);
@@ -3812,7 +3826,7 @@
rc = SMB2_ioctl(xid, tcon, cfile->fid.persistent_fid,
cfile->fid.volatile_fid,
- FSCTL_QUERY_ALLOCATED_RANGES, true,
+ FSCTL_QUERY_ALLOCATED_RANGES,
(char *)&in_data, sizeof(in_data),
1024 * sizeof(struct file_allocated_range_buffer),
(char **)&out_data, &out_data_len);
@@ -4014,11 +4028,13 @@
}
}
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
static bool
smb2_is_read_op(__u32 oplock)
{
return oplock == SMB2_OPLOCK_LEVEL_II;
}
+#endif /* CIFS_ALLOW_INSECURE_LEGACY */
static bool
smb21_is_read_op(__u32 oplock)
@@ -5104,7 +5120,7 @@
return rc;
}
-
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
struct smb_version_operations smb20_operations = {
.compare_fids = smb2_compare_fids,
.setup_request = smb2_setup_request,
@@ -5202,6 +5218,7 @@
.llseek = smb3_llseek,
.is_status_io_timeout = smb2_is_status_io_timeout,
};
+#endif /* CIFS_ALLOW_INSECURE_LEGACY */
struct smb_version_operations smb21_operations = {
.compare_fids = smb2_compare_fids,
@@ -5530,6 +5547,7 @@
.is_status_io_timeout = smb2_is_status_io_timeout,
};
+#ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY
struct smb_version_values smb20_values = {
.version_string = SMB20_VERSION_STRING,
.protocol_id = SMB20_PROT_ID,
@@ -5550,6 +5568,7 @@
.signing_required = SMB2_NEGOTIATE_SIGNING_REQUIRED,
.create_lease_size = sizeof(struct create_lease),
};
+#endif /* ALLOW_INSECURE_LEGACY */
struct smb_version_values smb21_values = {
.version_string = SMB21_VERSION_STRING,
diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c
index 88554b6..0c4a247 100644
--- a/fs/cifs/smb2pdu.c
+++ b/fs/cifs/smb2pdu.c
@@ -281,6 +281,9 @@
ses->binding_chan = NULL;
mutex_unlock(&tcon->ses->session_mutex);
goto failed;
+ } else if (rc) {
+ mutex_unlock(&ses->session_mutex);
+ goto out;
}
}
/*
@@ -1072,13 +1075,13 @@
pneg_inbuf->Dialects[0] =
cpu_to_le16(server->vals->protocol_id);
pneg_inbuf->DialectCount = cpu_to_le16(1);
- /* structure is big enough for 3 dialects, sending only 1 */
+ /* structure is big enough for 4 dialects, sending only 1 */
inbuflen = sizeof(*pneg_inbuf) -
- sizeof(pneg_inbuf->Dialects[0]) * 2;
+ sizeof(pneg_inbuf->Dialects[0]) * 3;
}
rc = SMB2_ioctl(xid, tcon, NO_FILE_ID, NO_FILE_ID,
- FSCTL_VALIDATE_NEGOTIATE_INFO, true /* is_fsctl */,
+ FSCTL_VALIDATE_NEGOTIATE_INFO,
(char *)pneg_inbuf, inbuflen, CIFSMaxBufSize,
(char **)&pneg_rsp, &rsplen);
if (rc == -EOPNOTSUPP) {
@@ -2291,7 +2294,7 @@
unsigned int acelen, acl_size, ace_count;
unsigned int owner_offset = 0;
unsigned int group_offset = 0;
- struct smb3_acl acl;
+ struct smb3_acl acl = {};
*len = roundup(sizeof(struct crt_sd_ctxt) + (sizeof(struct cifs_ace) * 4), 8);
@@ -2364,6 +2367,7 @@
acl.AclRevision = ACL_REVISION; /* See 2.4.4.1 of MS-DTYP */
acl.AclSize = cpu_to_le16(acl_size);
acl.AceCount = cpu_to_le16(ace_count);
+ /* acl.Sbz1 and Sbz2 MBZ so are not set here, but initialized above */
memcpy(aclptr, &acl, sizeof(struct smb3_acl));
buf->ccontext.DataLength = cpu_to_le32(ptr - (__u8 *)&buf->sd);
@@ -2919,7 +2923,7 @@
SMB2_ioctl_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server,
struct smb_rqst *rqst,
u64 persistent_fid, u64 volatile_fid, u32 opcode,
- bool is_fsctl, char *in_data, u32 indatalen,
+ char *in_data, u32 indatalen,
__u32 max_response_size)
{
struct smb2_ioctl_req *req;
@@ -2994,10 +2998,8 @@
req->sync_hdr.CreditCharge =
cpu_to_le16(DIV_ROUND_UP(max(indatalen, max_response_size),
SMB2_MAX_BUFFER_SIZE));
- if (is_fsctl)
- req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
- else
- req->Flags = 0;
+ /* always an FSCTL (for now) */
+ req->Flags = cpu_to_le32(SMB2_0_IOCTL_IS_FSCTL);
/* validate negotiate request must be signed - see MS-SMB2 3.2.5.5 */
if (opcode == FSCTL_VALIDATE_NEGOTIATE_INFO)
@@ -3024,9 +3026,9 @@
*/
int
SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid,
- u64 volatile_fid, u32 opcode, bool is_fsctl,
- char *in_data, u32 indatalen, u32 max_out_data_len,
- char **out_data, u32 *plen /* returned data len */)
+ u64 volatile_fid, u32 opcode, char *in_data, u32 indatalen,
+ u32 max_out_data_len, char **out_data,
+ u32 *plen /* returned data len */)
{
struct smb_rqst rqst;
struct smb2_ioctl_rsp *rsp = NULL;
@@ -3068,7 +3070,7 @@
rc = SMB2_ioctl_init(tcon, server,
&rqst, persistent_fid, volatile_fid, opcode,
- is_fsctl, in_data, indatalen, max_out_data_len);
+ in_data, indatalen, max_out_data_len);
if (rc)
goto ioctl_exit;
@@ -3150,7 +3152,7 @@
cpu_to_le16(COMPRESSION_FORMAT_DEFAULT);
rc = SMB2_ioctl(xid, tcon, persistent_fid, volatile_fid,
- FSCTL_SET_COMPRESSION, true /* is_fsctl */,
+ FSCTL_SET_COMPRESSION,
(char *)&fsctl_input /* data input */,
2 /* in data len */, CIFSMaxBufSize /* max out data */,
&ret_data /* out data */, NULL);
diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h
index 4eb0ca8..ed2b4fb 100644
--- a/fs/cifs/smb2proto.h
+++ b/fs/cifs/smb2proto.h
@@ -155,13 +155,13 @@
extern void SMB2_open_free(struct smb_rqst *rqst);
extern int SMB2_ioctl(const unsigned int xid, struct cifs_tcon *tcon,
u64 persistent_fid, u64 volatile_fid, u32 opcode,
- bool is_fsctl, char *in_data, u32 indatalen, u32 maxoutlen,
+ char *in_data, u32 indatalen, u32 maxoutlen,
char **out_data, u32 *plen /* returned data len */);
extern int SMB2_ioctl_init(struct cifs_tcon *tcon,
struct TCP_Server_Info *server,
struct smb_rqst *rqst,
u64 persistent_fid, u64 volatile_fid, u32 opcode,
- bool is_fsctl, char *in_data, u32 indatalen,
+ char *in_data, u32 indatalen,
__u32 max_response_size);
extern void SMB2_ioctl_free(struct smb_rqst *rqst);
extern int SMB2_change_notify(const unsigned int xid, struct cifs_tcon *tcon,
diff --git a/fs/cifs/transport.c b/fs/cifs/transport.c
index 503a005..b137006 100644
--- a/fs/cifs/transport.c
+++ b/fs/cifs/transport.c
@@ -209,10 +209,6 @@
*sent = 0;
- smb_msg->msg_name = (struct sockaddr *) &server->dstaddr;
- smb_msg->msg_namelen = sizeof(struct sockaddr);
- smb_msg->msg_control = NULL;
- smb_msg->msg_controllen = 0;
if (server->noblocksnd)
smb_msg->msg_flags = MSG_DONTWAIT + MSG_NOSIGNAL;
else
@@ -324,7 +320,7 @@
sigset_t mask, oldmask;
size_t total_len = 0, sent, size;
struct socket *ssocket = server->ssocket;
- struct msghdr smb_msg;
+ struct msghdr smb_msg = {};
__be32 rfc1002_marker;
if (cifs_rdma_enabled(server)) {
diff --git a/fs/coredump.c b/fs/coredump.c
index c56a3bd..edbaf61 100644
--- a/fs/coredump.c
+++ b/fs/coredump.c
@@ -41,6 +41,7 @@
#include <linux/fs.h>
#include <linux/path.h>
#include <linux/timekeeping.h>
+#include <linux/elf.h>
#include <linux/uaccess.h>
#include <asm/mmu_context.h>
@@ -52,6 +53,9 @@
#include <trace/events/sched.h>
+static bool dump_vma_snapshot(struct coredump_params *cprm);
+static void free_vma_snapshot(struct coredump_params *cprm);
+
int core_uses_pid;
unsigned int core_pipe_limit;
char core_pattern[CORENAME_MAX_SIZE] = "core";
@@ -601,6 +605,7 @@
* by any locks.
*/
.mm_flags = mm->flags,
+ .vma_meta = NULL,
};
audit_core_dumps(siginfo->si_signo);
@@ -806,9 +811,13 @@
pr_info("Core dump to |%s disabled\n", cn.corename);
goto close_fail;
}
+ if (!dump_vma_snapshot(&cprm))
+ goto close_fail;
+
file_start_write(cprm.file);
core_dumped = binfmt->core_dump(&cprm);
file_end_write(cprm.file);
+ free_vma_snapshot(&cprm);
}
if (ispipe && core_pipe_limit)
wait_for_dump_helpers(cprm.file);
@@ -969,6 +978,8 @@
return false;
}
+#define DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER 1
+
/*
* Decide how much of @vma's contents should be included in a core dump.
*/
@@ -1028,9 +1039,20 @@
* dump the first page to aid in determining what was mapped here.
*/
if (FILTER(ELF_HEADERS) &&
- vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ) &&
- (READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0)
- return PAGE_SIZE;
+ vma->vm_pgoff == 0 && (vma->vm_flags & VM_READ)) {
+ if ((READ_ONCE(file_inode(vma->vm_file)->i_mode) & 0111) != 0)
+ return PAGE_SIZE;
+
+ /*
+ * ELF libraries aren't always executable.
+ * We'll want to check whether the mapping starts with the ELF
+ * magic, but not now - we're holding the mmap lock,
+ * so copy_from_user() doesn't work here.
+ * Use a placeholder instead, and fix it up later in
+ * dump_vma_snapshot().
+ */
+ return DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER;
+ }
#undef FILTER
@@ -1067,18 +1089,29 @@
return gate_vma;
}
+static void free_vma_snapshot(struct coredump_params *cprm)
+{
+ if (cprm->vma_meta) {
+ int i;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct file *file = cprm->vma_meta[i].file;
+ if (file)
+ fput(file);
+ }
+ kvfree(cprm->vma_meta);
+ cprm->vma_meta = NULL;
+ }
+}
+
/*
* Under the mmap_lock, take a snapshot of relevant information about the task's
* VMAs.
*/
-int dump_vma_snapshot(struct coredump_params *cprm, int *vma_count,
- struct core_vma_metadata **vma_meta,
- size_t *vma_data_size_ptr)
+static bool dump_vma_snapshot(struct coredump_params *cprm)
{
struct vm_area_struct *vma, *gate_vma;
struct mm_struct *mm = current->mm;
int i;
- size_t vma_data_size = 0;
/*
* Once the stack expansion code is fixed to not change VMA bounds
@@ -1086,36 +1119,51 @@
* mmap_lock in read mode.
*/
if (mmap_write_lock_killable(mm))
- return -EINTR;
+ return false;
+ cprm->vma_data_size = 0;
gate_vma = get_gate_vma(mm);
- *vma_count = mm->map_count + (gate_vma ? 1 : 0);
+ cprm->vma_count = mm->map_count + (gate_vma ? 1 : 0);
- *vma_meta = kvmalloc_array(*vma_count, sizeof(**vma_meta), GFP_KERNEL);
- if (!*vma_meta) {
+ cprm->vma_meta = kvmalloc_array(cprm->vma_count, sizeof(*cprm->vma_meta), GFP_KERNEL);
+ if (!cprm->vma_meta) {
mmap_write_unlock(mm);
- return -ENOMEM;
+ return false;
}
for (i = 0, vma = first_vma(current, gate_vma); vma != NULL;
vma = next_vma(vma, gate_vma), i++) {
- struct core_vma_metadata *m = (*vma_meta) + i;
+ struct core_vma_metadata *m = cprm->vma_meta + i;
m->start = vma->vm_start;
m->end = vma->vm_end;
m->flags = vma->vm_flags;
m->dump_size = vma_dump_size(vma, cprm->mm_flags);
+ m->pgoff = vma->vm_pgoff;
- vma_data_size += m->dump_size;
+ m->file = vma->vm_file;
+ if (m->file)
+ get_file(m->file);
}
mmap_write_unlock(mm);
- if (WARN_ON(i != *vma_count)) {
- kvfree(*vma_meta);
- return -EFAULT;
+ for (i = 0; i < cprm->vma_count; i++) {
+ struct core_vma_metadata *m = cprm->vma_meta + i;
+
+ if (m->dump_size == DUMP_SIZE_MAYBE_ELFHDR_PLACEHOLDER) {
+ char elfmag[SELFMAG];
+
+ if (copy_from_user(elfmag, (void __user *)m->start, SELFMAG) ||
+ memcmp(elfmag, ELFMAG, SELFMAG) != 0) {
+ m->dump_size = 0;
+ } else {
+ m->dump_size = PAGE_SIZE;
+ }
+ }
+
+ cprm->vma_data_size += m->dump_size;
}
- *vma_data_size_ptr = vma_data_size;
- return 0;
+ return true;
}
diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h
index 052ad40..b746d7d 100644
--- a/fs/crypto/fscrypt_private.h
+++ b/fs/crypto/fscrypt_private.h
@@ -220,7 +220,7 @@
* will be NULL if the master key was found in a process-subscribed
* keyring rather than in the filesystem-level keyring.
*/
- struct key *ci_master_key;
+ struct fscrypt_master_key *ci_master_key;
/*
* Link in list of inodes that were unlocked with the master key.
@@ -432,6 +432,40 @@
struct fscrypt_master_key {
/*
+ * Back-pointer to the super_block of the filesystem to which this
+ * master key has been added. Only valid if ->mk_active_refs > 0.
+ */
+ struct super_block *mk_sb;
+
+ /*
+ * Link in ->mk_sb->s_master_keys->key_hashtable.
+ * Only valid if ->mk_active_refs > 0.
+ */
+ struct hlist_node mk_node;
+
+ /* Semaphore that protects ->mk_secret and ->mk_users */
+ struct rw_semaphore mk_sem;
+
+ /*
+ * Active and structural reference counts. An active ref guarantees
+ * that the struct continues to exist, continues to be in the keyring
+ * ->mk_sb->s_master_keys, and that any embedded subkeys (e.g.
+ * ->mk_direct_keys) that have been prepared continue to exist.
+ * A structural ref only guarantees that the struct continues to exist.
+ *
+ * There is one active ref associated with ->mk_secret being present,
+ * and one active ref for each inode in ->mk_decrypted_inodes.
+ *
+ * There is one structural ref associated with the active refcount being
+ * nonzero. Finding a key in the keyring also takes a structural ref,
+ * which is then held temporarily while the key is operated on.
+ */
+ refcount_t mk_active_refs;
+ refcount_t mk_struct_refs;
+
+ struct rcu_head mk_rcu_head;
+
+ /*
* The secret key material. After FS_IOC_REMOVE_ENCRYPTION_KEY is
* executed, this is wiped and no new inodes can be unlocked with this
* key; however, there may still be inodes in ->mk_decrypted_inodes
@@ -439,16 +473,12 @@
* FS_IOC_REMOVE_ENCRYPTION_KEY can be retried, or
* FS_IOC_ADD_ENCRYPTION_KEY can add the secret again.
*
- * Locking: protected by key->sem (outer) and mk_secret_sem (inner).
- * The reason for two locks is that key->sem also protects modifying
- * mk_users, which ranks it above the semaphore for the keyring key
- * type, which is in turn above page faults (via keyring_read). But
- * sometimes filesystems call fscrypt_get_encryption_info() from within
- * a transaction, which ranks it below page faults. So we need a
- * separate lock which protects mk_secret but not also mk_users.
+ * While ->mk_secret is present, one ref in ->mk_active_refs is held.
+ *
+ * Locking: protected by ->mk_sem. The manipulation of ->mk_active_refs
+ * associated with this field is protected by ->mk_sem as well.
*/
struct fscrypt_master_key_secret mk_secret;
- struct rw_semaphore mk_secret_sem;
/*
* For v1 policy keys: an arbitrary key descriptor which was assigned by
@@ -467,23 +497,13 @@
*
* This is NULL for v1 policy keys; those can only be added by root.
*
- * Locking: in addition to this keyrings own semaphore, this is
- * protected by the master key's key->sem, so we can do atomic
- * search+insert. It can also be searched without taking any locks, but
- * in that case the returned key may have already been removed.
+ * Locking: protected by ->mk_sem. (We don't just rely on the keyrings
+ * subsystem semaphore ->mk_users->sem, as we need support for atomic
+ * search+insert along with proper synchronization with ->mk_secret.)
*/
struct key *mk_users;
/*
- * Length of ->mk_decrypted_inodes, plus one if mk_secret is present.
- * Once this goes to 0, the master key is removed from ->s_master_keys.
- * The 'struct fscrypt_master_key' will continue to live as long as the
- * 'struct key' whose payload it is, but we won't let this reference
- * count rise again.
- */
- refcount_t mk_refcount;
-
- /*
* List of inodes that were unlocked using this key. This allows the
* inodes to be evicted efficiently if the key is removed.
*/
@@ -508,11 +528,11 @@
is_master_key_secret_present(const struct fscrypt_master_key_secret *secret)
{
/*
- * The READ_ONCE() is only necessary for fscrypt_drop_inode() and
- * fscrypt_key_describe(). These run in atomic context, so they can't
- * take ->mk_secret_sem and thus 'secret' can change concurrently which
- * would be a data race. But they only need to know whether the secret
- * *was* present at the time of check, so READ_ONCE() suffices.
+ * The READ_ONCE() is only necessary for fscrypt_drop_inode().
+ * fscrypt_drop_inode() runs in atomic context, so it can't take the key
+ * semaphore and thus 'secret' can change concurrently which would be a
+ * data race. But fscrypt_drop_inode() only need to know whether the
+ * secret *was* present at the time of check, so READ_ONCE() suffices.
*/
return READ_ONCE(secret->size) != 0;
}
@@ -540,7 +560,11 @@
return 0;
}
-struct key *
+void fscrypt_put_master_key(struct fscrypt_master_key *mk);
+
+void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk);
+
+struct fscrypt_master_key *
fscrypt_find_master_key(struct super_block *sb,
const struct fscrypt_key_specifier *mk_spec);
diff --git a/fs/crypto/hooks.c b/fs/crypto/hooks.c
index 4180371..8268206 100644
--- a/fs/crypto/hooks.c
+++ b/fs/crypto/hooks.c
@@ -5,8 +5,6 @@
* Encryption hooks for higher-level filesystem operations.
*/
-#include <linux/key.h>
-
#include "fscrypt_private.h"
/**
@@ -154,13 +152,13 @@
ci = inode->i_crypt_info;
if (ci->ci_policy.version != FSCRYPT_POLICY_V2)
return -EINVAL;
- mk = ci->ci_master_key->payload.data[0];
- down_read(&mk->mk_secret_sem);
+ mk = ci->ci_master_key;
+ down_read(&mk->mk_sem);
if (is_master_key_secret_present(&mk->mk_secret))
err = fscrypt_derive_dirhash_key(ci, mk);
else
err = -ENOKEY;
- up_read(&mk->mk_secret_sem);
+ up_read(&mk->mk_sem);
return err;
}
return 0;
diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c
index d7ec52c..02f8bf8 100644
--- a/fs/crypto/keyring.c
+++ b/fs/crypto/keyring.c
@@ -18,6 +18,7 @@
* information about these ioctls.
*/
+#include <asm/unaligned.h>
#include <crypto/skcipher.h>
#include <linux/key-type.h>
#include <linux/random.h>
@@ -25,6 +26,18 @@
#include "fscrypt_private.h"
+/* The master encryption keys for a filesystem (->s_master_keys) */
+struct fscrypt_keyring {
+ /*
+ * Lock that protects ->key_hashtable. It does *not* protect the
+ * fscrypt_master_key structs themselves.
+ */
+ spinlock_t lock;
+
+ /* Hash table that maps fscrypt_key_specifier to fscrypt_master_key */
+ struct hlist_head key_hashtable[128];
+};
+
static void wipe_master_key_secret(struct fscrypt_master_key_secret *secret)
{
fscrypt_destroy_hkdf(&secret->hkdf);
@@ -38,20 +51,70 @@
memzero_explicit(src, sizeof(*src));
}
-static void free_master_key(struct fscrypt_master_key *mk)
+static void fscrypt_free_master_key(struct rcu_head *head)
{
+ struct fscrypt_master_key *mk =
+ container_of(head, struct fscrypt_master_key, mk_rcu_head);
+ /*
+ * The master key secret and any embedded subkeys should have already
+ * been wiped when the last active reference to the fscrypt_master_key
+ * struct was dropped; doing it here would be unnecessarily late.
+ * Nevertheless, use kfree_sensitive() in case anything was missed.
+ */
+ kfree_sensitive(mk);
+}
+
+void fscrypt_put_master_key(struct fscrypt_master_key *mk)
+{
+ if (!refcount_dec_and_test(&mk->mk_struct_refs))
+ return;
+ /*
+ * No structural references left, so free ->mk_users, and also free the
+ * fscrypt_master_key struct itself after an RCU grace period ensures
+ * that concurrent keyring lookups can no longer find it.
+ */
+ WARN_ON(refcount_read(&mk->mk_active_refs) != 0);
+ key_put(mk->mk_users);
+ mk->mk_users = NULL;
+ call_rcu(&mk->mk_rcu_head, fscrypt_free_master_key);
+}
+
+void fscrypt_put_master_key_activeref(struct fscrypt_master_key *mk)
+{
+ struct super_block *sb = mk->mk_sb;
+ struct fscrypt_keyring *keyring = sb->s_master_keys;
size_t i;
- wipe_master_key_secret(&mk->mk_secret);
+ if (!refcount_dec_and_test(&mk->mk_active_refs))
+ return;
+ /*
+ * No active references left, so complete the full removal of this
+ * fscrypt_master_key struct by removing it from the keyring and
+ * destroying any subkeys embedded in it.
+ */
+
+ spin_lock(&keyring->lock);
+ hlist_del_rcu(&mk->mk_node);
+ spin_unlock(&keyring->lock);
+
+ /*
+ * ->mk_active_refs == 0 implies that ->mk_secret is not present and
+ * that ->mk_decrypted_inodes is empty.
+ */
+ WARN_ON(is_master_key_secret_present(&mk->mk_secret));
+ WARN_ON(!list_empty(&mk->mk_decrypted_inodes));
for (i = 0; i <= FSCRYPT_MODE_MAX; i++) {
fscrypt_destroy_prepared_key(&mk->mk_direct_keys[i]);
fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_64_keys[i]);
fscrypt_destroy_prepared_key(&mk->mk_iv_ino_lblk_32_keys[i]);
}
+ memzero_explicit(&mk->mk_ino_hash_key,
+ sizeof(mk->mk_ino_hash_key));
+ mk->mk_ino_hash_key_initialized = false;
- key_put(mk->mk_users);
- kfree_sensitive(mk);
+ /* Drop the structural ref associated with the active refs. */
+ fscrypt_put_master_key(mk);
}
static inline bool valid_key_spec(const struct fscrypt_key_specifier *spec)
@@ -61,44 +124,6 @@
return master_key_spec_len(spec) != 0;
}
-static int fscrypt_key_instantiate(struct key *key,
- struct key_preparsed_payload *prep)
-{
- key->payload.data[0] = (struct fscrypt_master_key *)prep->data;
- return 0;
-}
-
-static void fscrypt_key_destroy(struct key *key)
-{
- free_master_key(key->payload.data[0]);
-}
-
-static void fscrypt_key_describe(const struct key *key, struct seq_file *m)
-{
- seq_puts(m, key->description);
-
- if (key_is_positive(key)) {
- const struct fscrypt_master_key *mk = key->payload.data[0];
-
- if (!is_master_key_secret_present(&mk->mk_secret))
- seq_puts(m, ": secret removed");
- }
-}
-
-/*
- * Type of key in ->s_master_keys. Each key of this type represents a master
- * key which has been added to the filesystem. Its payload is a
- * 'struct fscrypt_master_key'. The "." prefix in the key type name prevents
- * users from adding keys of this type via the keyrings syscalls rather than via
- * the intended method of FS_IOC_ADD_ENCRYPTION_KEY.
- */
-static struct key_type key_type_fscrypt = {
- .name = "._fscrypt",
- .instantiate = fscrypt_key_instantiate,
- .destroy = fscrypt_key_destroy,
- .describe = fscrypt_key_describe,
-};
-
static int fscrypt_user_key_instantiate(struct key *key,
struct key_preparsed_payload *prep)
{
@@ -131,32 +156,6 @@
.describe = fscrypt_user_key_describe,
};
-/* Search ->s_master_keys or ->mk_users */
-static struct key *search_fscrypt_keyring(struct key *keyring,
- struct key_type *type,
- const char *description)
-{
- /*
- * We need to mark the keyring reference as "possessed" so that we
- * acquire permission to search it, via the KEY_POS_SEARCH permission.
- */
- key_ref_t keyref = make_key_ref(keyring, true /* possessed */);
-
- keyref = keyring_search(keyref, type, description, false);
- if (IS_ERR(keyref)) {
- if (PTR_ERR(keyref) == -EAGAIN || /* not found */
- PTR_ERR(keyref) == -EKEYREVOKED) /* recently invalidated */
- keyref = ERR_PTR(-ENOKEY);
- return ERR_CAST(keyref);
- }
- return key_ref_to_ptr(keyref);
-}
-
-#define FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE \
- (CONST_STRLEN("fscrypt-") + sizeof_field(struct super_block, s_id))
-
-#define FSCRYPT_MK_DESCRIPTION_SIZE (2 * FSCRYPT_KEY_IDENTIFIER_SIZE + 1)
-
#define FSCRYPT_MK_USERS_DESCRIPTION_SIZE \
(CONST_STRLEN("fscrypt-") + 2 * FSCRYPT_KEY_IDENTIFIER_SIZE + \
CONST_STRLEN("-users") + 1)
@@ -164,21 +163,6 @@
#define FSCRYPT_MK_USER_DESCRIPTION_SIZE \
(2 * FSCRYPT_KEY_IDENTIFIER_SIZE + CONST_STRLEN(".uid.") + 10 + 1)
-static void format_fs_keyring_description(
- char description[FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE],
- const struct super_block *sb)
-{
- sprintf(description, "fscrypt-%s", sb->s_id);
-}
-
-static void format_mk_description(
- char description[FSCRYPT_MK_DESCRIPTION_SIZE],
- const struct fscrypt_key_specifier *mk_spec)
-{
- sprintf(description, "%*phN",
- master_key_spec_len(mk_spec), (u8 *)&mk_spec->u);
-}
-
static void format_mk_users_keyring_description(
char description[FSCRYPT_MK_USERS_DESCRIPTION_SIZE],
const u8 mk_identifier[FSCRYPT_KEY_IDENTIFIER_SIZE])
@@ -199,20 +183,15 @@
/* Create ->s_master_keys if needed. Synchronized by fscrypt_add_key_mutex. */
static int allocate_filesystem_keyring(struct super_block *sb)
{
- char description[FSCRYPT_FS_KEYRING_DESCRIPTION_SIZE];
- struct key *keyring;
+ struct fscrypt_keyring *keyring;
if (sb->s_master_keys)
return 0;
- format_fs_keyring_description(description, sb);
- keyring = keyring_alloc(description, GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
- current_cred(), KEY_POS_SEARCH |
- KEY_USR_SEARCH | KEY_USR_READ | KEY_USR_VIEW,
- KEY_ALLOC_NOT_IN_QUOTA, NULL, NULL);
- if (IS_ERR(keyring))
- return PTR_ERR(keyring);
-
+ keyring = kzalloc(sizeof(*keyring), GFP_KERNEL);
+ if (!keyring)
+ return -ENOMEM;
+ spin_lock_init(&keyring->lock);
/*
* Pairs with the smp_load_acquire() in fscrypt_find_master_key().
* I.e., here we publish ->s_master_keys with a RELEASE barrier so that
@@ -222,21 +201,80 @@
return 0;
}
-void fscrypt_sb_free(struct super_block *sb)
+/*
+ * Release all encryption keys that have been added to the filesystem, along
+ * with the keyring that contains them.
+ *
+ * This is called at unmount time. The filesystem's underlying block device(s)
+ * are still available at this time; this is important because after user file
+ * accesses have been allowed, this function may need to evict keys from the
+ * keyslots of an inline crypto engine, which requires the block device(s).
+ *
+ * This is also called when the super_block is being freed. This is needed to
+ * avoid a memory leak if mounting fails after the "test_dummy_encryption"
+ * option was processed, as in that case the unmount-time call isn't made.
+ */
+void fscrypt_destroy_keyring(struct super_block *sb)
{
- key_put(sb->s_master_keys);
+ struct fscrypt_keyring *keyring = sb->s_master_keys;
+ size_t i;
+
+ if (!keyring)
+ return;
+
+ for (i = 0; i < ARRAY_SIZE(keyring->key_hashtable); i++) {
+ struct hlist_head *bucket = &keyring->key_hashtable[i];
+ struct fscrypt_master_key *mk;
+ struct hlist_node *tmp;
+
+ hlist_for_each_entry_safe(mk, tmp, bucket, mk_node) {
+ /*
+ * Since all inodes were already evicted, every key
+ * remaining in the keyring should have an empty inode
+ * list, and should only still be in the keyring due to
+ * the single active ref associated with ->mk_secret.
+ * There should be no structural refs beyond the one
+ * associated with the active ref.
+ */
+ WARN_ON(refcount_read(&mk->mk_active_refs) != 1);
+ WARN_ON(refcount_read(&mk->mk_struct_refs) != 1);
+ WARN_ON(!is_master_key_secret_present(&mk->mk_secret));
+ wipe_master_key_secret(&mk->mk_secret);
+ fscrypt_put_master_key_activeref(mk);
+ }
+ }
+ kfree_sensitive(keyring);
sb->s_master_keys = NULL;
}
-/*
- * Find the specified master key in ->s_master_keys.
- * Returns ERR_PTR(-ENOKEY) if not found.
- */
-struct key *fscrypt_find_master_key(struct super_block *sb,
- const struct fscrypt_key_specifier *mk_spec)
+static struct hlist_head *
+fscrypt_mk_hash_bucket(struct fscrypt_keyring *keyring,
+ const struct fscrypt_key_specifier *mk_spec)
{
- struct key *keyring;
- char description[FSCRYPT_MK_DESCRIPTION_SIZE];
+ /*
+ * Since key specifiers should be "random" values, it is sufficient to
+ * use a trivial hash function that just takes the first several bits of
+ * the key specifier.
+ */
+ unsigned long i = get_unaligned((unsigned long *)&mk_spec->u);
+
+ return &keyring->key_hashtable[i % ARRAY_SIZE(keyring->key_hashtable)];
+}
+
+/*
+ * Find the specified master key struct in ->s_master_keys and take a structural
+ * ref to it. The structural ref guarantees that the key struct continues to
+ * exist, but it does *not* guarantee that ->s_master_keys continues to contain
+ * the key struct. The structural ref needs to be dropped by
+ * fscrypt_put_master_key(). Returns NULL if the key struct is not found.
+ */
+struct fscrypt_master_key *
+fscrypt_find_master_key(struct super_block *sb,
+ const struct fscrypt_key_specifier *mk_spec)
+{
+ struct fscrypt_keyring *keyring;
+ struct hlist_head *bucket;
+ struct fscrypt_master_key *mk;
/*
* Pairs with the smp_store_release() in allocate_filesystem_keyring().
@@ -246,10 +284,38 @@
*/
keyring = smp_load_acquire(&sb->s_master_keys);
if (keyring == NULL)
- return ERR_PTR(-ENOKEY); /* No keyring yet, so no keys yet. */
+ return NULL; /* No keyring yet, so no keys yet. */
- format_mk_description(description, mk_spec);
- return search_fscrypt_keyring(keyring, &key_type_fscrypt, description);
+ bucket = fscrypt_mk_hash_bucket(keyring, mk_spec);
+ rcu_read_lock();
+ switch (mk_spec->type) {
+ case FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR:
+ hlist_for_each_entry_rcu(mk, bucket, mk_node) {
+ if (mk->mk_spec.type ==
+ FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR &&
+ memcmp(mk->mk_spec.u.descriptor,
+ mk_spec->u.descriptor,
+ FSCRYPT_KEY_DESCRIPTOR_SIZE) == 0 &&
+ refcount_inc_not_zero(&mk->mk_struct_refs))
+ goto out;
+ }
+ break;
+ case FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER:
+ hlist_for_each_entry_rcu(mk, bucket, mk_node) {
+ if (mk->mk_spec.type ==
+ FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER &&
+ memcmp(mk->mk_spec.u.identifier,
+ mk_spec->u.identifier,
+ FSCRYPT_KEY_IDENTIFIER_SIZE) == 0 &&
+ refcount_inc_not_zero(&mk->mk_struct_refs))
+ goto out;
+ }
+ break;
+ }
+ mk = NULL;
+out:
+ rcu_read_unlock();
+ return mk;
}
static int allocate_master_key_users_keyring(struct fscrypt_master_key *mk)
@@ -277,17 +343,30 @@
static struct key *find_master_key_user(struct fscrypt_master_key *mk)
{
char description[FSCRYPT_MK_USER_DESCRIPTION_SIZE];
+ key_ref_t keyref;
format_mk_user_description(description, mk->mk_spec.u.identifier);
- return search_fscrypt_keyring(mk->mk_users, &key_type_fscrypt_user,
- description);
+
+ /*
+ * We need to mark the keyring reference as "possessed" so that we
+ * acquire permission to search it, via the KEY_POS_SEARCH permission.
+ */
+ keyref = keyring_search(make_key_ref(mk->mk_users, true /*possessed*/),
+ &key_type_fscrypt_user, description, false);
+ if (IS_ERR(keyref)) {
+ if (PTR_ERR(keyref) == -EAGAIN || /* not found */
+ PTR_ERR(keyref) == -EKEYREVOKED) /* recently invalidated */
+ keyref = ERR_PTR(-ENOKEY);
+ return ERR_CAST(keyref);
+ }
+ return key_ref_to_ptr(keyref);
}
/*
* Give the current user a "key" in ->mk_users. This charges the user's quota
* and marks the master key as added by the current user, so that it cannot be
- * removed by another user with the key. Either the master key's key->sem must
- * be held for write, or the master key must be still undergoing initialization.
+ * removed by another user with the key. Either ->mk_sem must be held for
+ * write, or the master key must be still undergoing initialization.
*/
static int add_master_key_user(struct fscrypt_master_key *mk)
{
@@ -309,7 +388,7 @@
/*
* Remove the current user's "key" from ->mk_users.
- * The master key's key->sem must be held for write.
+ * ->mk_sem must be held for write.
*
* Returns 0 if removed, -ENOKEY if not found, or another -errno code.
*/
@@ -327,64 +406,49 @@
}
/*
- * Allocate a new fscrypt_master_key which contains the given secret, set it as
- * the payload of a new 'struct key' of type fscrypt, and link the 'struct key'
- * into the given keyring. Synchronized by fscrypt_add_key_mutex.
+ * Allocate a new fscrypt_master_key, transfer the given secret over to it, and
+ * insert it into sb->s_master_keys.
*/
-static int add_new_master_key(struct fscrypt_master_key_secret *secret,
- const struct fscrypt_key_specifier *mk_spec,
- struct key *keyring)
+static int add_new_master_key(struct super_block *sb,
+ struct fscrypt_master_key_secret *secret,
+ const struct fscrypt_key_specifier *mk_spec)
{
+ struct fscrypt_keyring *keyring = sb->s_master_keys;
struct fscrypt_master_key *mk;
- char description[FSCRYPT_MK_DESCRIPTION_SIZE];
- struct key *key;
int err;
mk = kzalloc(sizeof(*mk), GFP_KERNEL);
if (!mk)
return -ENOMEM;
+ mk->mk_sb = sb;
+ init_rwsem(&mk->mk_sem);
+ refcount_set(&mk->mk_struct_refs, 1);
mk->mk_spec = *mk_spec;
- move_master_key_secret(&mk->mk_secret, secret);
- init_rwsem(&mk->mk_secret_sem);
-
- refcount_set(&mk->mk_refcount, 1); /* secret is present */
INIT_LIST_HEAD(&mk->mk_decrypted_inodes);
spin_lock_init(&mk->mk_decrypted_inodes_lock);
if (mk_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) {
err = allocate_master_key_users_keyring(mk);
if (err)
- goto out_free_mk;
+ goto out_put;
err = add_master_key_user(mk);
if (err)
- goto out_free_mk;
+ goto out_put;
}
- /*
- * Note that we don't charge this key to anyone's quota, since when
- * ->mk_users is in use those keys are charged instead, and otherwise
- * (when ->mk_users isn't in use) only root can add these keys.
- */
- format_mk_description(description, mk_spec);
- key = key_alloc(&key_type_fscrypt, description,
- GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(),
- KEY_POS_SEARCH | KEY_USR_SEARCH | KEY_USR_VIEW,
- KEY_ALLOC_NOT_IN_QUOTA, NULL);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- goto out_free_mk;
- }
- err = key_instantiate_and_link(key, mk, sizeof(*mk), keyring, NULL);
- key_put(key);
- if (err)
- goto out_free_mk;
+ move_master_key_secret(&mk->mk_secret, secret);
+ refcount_set(&mk->mk_active_refs, 1); /* ->mk_secret is present */
+ spin_lock(&keyring->lock);
+ hlist_add_head_rcu(&mk->mk_node,
+ fscrypt_mk_hash_bucket(keyring, mk_spec));
+ spin_unlock(&keyring->lock);
return 0;
-out_free_mk:
- free_master_key(mk);
+out_put:
+ fscrypt_put_master_key(mk);
return err;
}
@@ -393,45 +457,34 @@
static int add_existing_master_key(struct fscrypt_master_key *mk,
struct fscrypt_master_key_secret *secret)
{
- struct key *mk_user;
- bool rekey;
int err;
/*
* If the current user is already in ->mk_users, then there's nothing to
- * do. (Not applicable for v1 policy keys, which have NULL ->mk_users.)
+ * do. Otherwise, we need to add the user to ->mk_users. (Neither is
+ * applicable for v1 policy keys, which have NULL ->mk_users.)
*/
if (mk->mk_users) {
- mk_user = find_master_key_user(mk);
+ struct key *mk_user = find_master_key_user(mk);
+
if (mk_user != ERR_PTR(-ENOKEY)) {
if (IS_ERR(mk_user))
return PTR_ERR(mk_user);
key_put(mk_user);
return 0;
}
- }
-
- /* If we'll be re-adding ->mk_secret, try to take the reference. */
- rekey = !is_master_key_secret_present(&mk->mk_secret);
- if (rekey && !refcount_inc_not_zero(&mk->mk_refcount))
- return KEY_DEAD;
-
- /* Add the current user to ->mk_users, if applicable. */
- if (mk->mk_users) {
err = add_master_key_user(mk);
- if (err) {
- if (rekey && refcount_dec_and_test(&mk->mk_refcount))
- return KEY_DEAD;
+ if (err)
return err;
- }
}
/* Re-add the secret if needed. */
- if (rekey) {
- down_write(&mk->mk_secret_sem);
+ if (!is_master_key_secret_present(&mk->mk_secret)) {
+ if (!refcount_inc_not_zero(&mk->mk_active_refs))
+ return KEY_DEAD;
move_master_key_secret(&mk->mk_secret, secret);
- up_write(&mk->mk_secret_sem);
}
+
return 0;
}
@@ -440,38 +493,36 @@
const struct fscrypt_key_specifier *mk_spec)
{
static DEFINE_MUTEX(fscrypt_add_key_mutex);
- struct key *key;
+ struct fscrypt_master_key *mk;
int err;
mutex_lock(&fscrypt_add_key_mutex); /* serialize find + link */
-retry:
- key = fscrypt_find_master_key(sb, mk_spec);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
- if (err != -ENOKEY)
- goto out_unlock;
+
+ mk = fscrypt_find_master_key(sb, mk_spec);
+ if (!mk) {
/* Didn't find the key in ->s_master_keys. Add it. */
err = allocate_filesystem_keyring(sb);
- if (err)
- goto out_unlock;
- err = add_new_master_key(secret, mk_spec, sb->s_master_keys);
+ if (!err)
+ err = add_new_master_key(sb, secret, mk_spec);
} else {
/*
* Found the key in ->s_master_keys. Re-add the secret if
* needed, and add the user to ->mk_users if needed.
*/
- down_write(&key->sem);
- err = add_existing_master_key(key->payload.data[0], secret);
- up_write(&key->sem);
+ down_write(&mk->mk_sem);
+ err = add_existing_master_key(mk, secret);
+ up_write(&mk->mk_sem);
if (err == KEY_DEAD) {
- /* Key being removed or needs to be removed */
- key_invalidate(key);
- key_put(key);
- goto retry;
+ /*
+ * We found a key struct, but it's already been fully
+ * removed. Ignore the old struct and add a new one.
+ * fscrypt_add_key_mutex means we don't need to worry
+ * about concurrent adds.
+ */
+ err = add_new_master_key(sb, secret, mk_spec);
}
- key_put(key);
+ fscrypt_put_master_key(mk);
}
-out_unlock:
mutex_unlock(&fscrypt_add_key_mutex);
return err;
}
@@ -735,19 +786,19 @@
const u8 identifier[FSCRYPT_KEY_IDENTIFIER_SIZE])
{
struct fscrypt_key_specifier mk_spec;
- struct key *key, *mk_user;
struct fscrypt_master_key *mk;
+ struct key *mk_user;
int err;
mk_spec.type = FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER;
memcpy(mk_spec.u.identifier, identifier, FSCRYPT_KEY_IDENTIFIER_SIZE);
- key = fscrypt_find_master_key(sb, &mk_spec);
- if (IS_ERR(key)) {
- err = PTR_ERR(key);
+ mk = fscrypt_find_master_key(sb, &mk_spec);
+ if (!mk) {
+ err = -ENOKEY;
goto out;
}
- mk = key->payload.data[0];
+ down_read(&mk->mk_sem);
mk_user = find_master_key_user(mk);
if (IS_ERR(mk_user)) {
err = PTR_ERR(mk_user);
@@ -755,7 +806,8 @@
key_put(mk_user);
err = 0;
}
- key_put(key);
+ up_read(&mk->mk_sem);
+ fscrypt_put_master_key(mk);
out:
if (err == -ENOKEY && capable(CAP_FOWNER))
err = 0;
@@ -917,11 +969,10 @@
struct super_block *sb = file_inode(filp)->i_sb;
struct fscrypt_remove_key_arg __user *uarg = _uarg;
struct fscrypt_remove_key_arg arg;
- struct key *key;
struct fscrypt_master_key *mk;
u32 status_flags = 0;
int err;
- bool dead;
+ bool inodes_remain;
if (copy_from_user(&arg, uarg, sizeof(arg)))
return -EFAULT;
@@ -941,12 +992,10 @@
return -EACCES;
/* Find the key being removed. */
- key = fscrypt_find_master_key(sb, &arg.key_spec);
- if (IS_ERR(key))
- return PTR_ERR(key);
- mk = key->payload.data[0];
-
- down_write(&key->sem);
+ mk = fscrypt_find_master_key(sb, &arg.key_spec);
+ if (!mk)
+ return -ENOKEY;
+ down_write(&mk->mk_sem);
/* If relevant, remove current user's (or all users) claim to the key */
if (mk->mk_users && mk->mk_users->keys.nr_leaves_on_tree != 0) {
@@ -955,7 +1004,7 @@
else
err = remove_master_key_user(mk);
if (err) {
- up_write(&key->sem);
+ up_write(&mk->mk_sem);
goto out_put_key;
}
if (mk->mk_users->keys.nr_leaves_on_tree != 0) {
@@ -967,28 +1016,22 @@
status_flags |=
FSCRYPT_KEY_REMOVAL_STATUS_FLAG_OTHER_USERS;
err = 0;
- up_write(&key->sem);
+ up_write(&mk->mk_sem);
goto out_put_key;
}
}
/* No user claims remaining. Go ahead and wipe the secret. */
- dead = false;
+ err = -ENOKEY;
if (is_master_key_secret_present(&mk->mk_secret)) {
- down_write(&mk->mk_secret_sem);
wipe_master_key_secret(&mk->mk_secret);
- dead = refcount_dec_and_test(&mk->mk_refcount);
- up_write(&mk->mk_secret_sem);
- }
- up_write(&key->sem);
- if (dead) {
- /*
- * No inodes reference the key, and we wiped the secret, so the
- * key object is free to be removed from the keyring.
- */
- key_invalidate(key);
+ fscrypt_put_master_key_activeref(mk);
err = 0;
- } else {
+ }
+ inodes_remain = refcount_read(&mk->mk_active_refs) > 0;
+ up_write(&mk->mk_sem);
+
+ if (inodes_remain) {
/* Some inodes still reference this key; try to evict them. */
err = try_to_lock_encrypted_files(sb, mk);
if (err == -EBUSY) {
@@ -1004,7 +1047,7 @@
* has been fully removed including all files locked.
*/
out_put_key:
- key_put(key);
+ fscrypt_put_master_key(mk);
if (err == 0)
err = put_user(status_flags, &uarg->removal_status_flags);
return err;
@@ -1051,7 +1094,6 @@
{
struct super_block *sb = file_inode(filp)->i_sb;
struct fscrypt_get_key_status_arg arg;
- struct key *key;
struct fscrypt_master_key *mk;
int err;
@@ -1068,19 +1110,18 @@
arg.user_count = 0;
memset(arg.__out_reserved, 0, sizeof(arg.__out_reserved));
- key = fscrypt_find_master_key(sb, &arg.key_spec);
- if (IS_ERR(key)) {
- if (key != ERR_PTR(-ENOKEY))
- return PTR_ERR(key);
+ mk = fscrypt_find_master_key(sb, &arg.key_spec);
+ if (!mk) {
arg.status = FSCRYPT_KEY_STATUS_ABSENT;
err = 0;
goto out;
}
- mk = key->payload.data[0];
- down_read(&key->sem);
+ down_read(&mk->mk_sem);
if (!is_master_key_secret_present(&mk->mk_secret)) {
- arg.status = FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED;
+ arg.status = refcount_read(&mk->mk_active_refs) > 0 ?
+ FSCRYPT_KEY_STATUS_INCOMPLETELY_REMOVED :
+ FSCRYPT_KEY_STATUS_ABSENT /* raced with full removal */;
err = 0;
goto out_release_key;
}
@@ -1102,8 +1143,8 @@
}
err = 0;
out_release_key:
- up_read(&key->sem);
- key_put(key);
+ up_read(&mk->mk_sem);
+ fscrypt_put_master_key(mk);
out:
if (!err && copy_to_user(uarg, &arg, sizeof(arg)))
err = -EFAULT;
@@ -1115,13 +1156,9 @@
{
int err;
- err = register_key_type(&key_type_fscrypt);
- if (err)
- return err;
-
err = register_key_type(&key_type_fscrypt_user);
if (err)
- goto err_unregister_fscrypt;
+ return err;
err = register_key_type(&key_type_fscrypt_provisioning);
if (err)
@@ -1131,7 +1168,5 @@
err_unregister_fscrypt_user:
unregister_key_type(&key_type_fscrypt_user);
-err_unregister_fscrypt:
- unregister_key_type(&key_type_fscrypt);
return err;
}
diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c
index 73d96e3..7b14054 100644
--- a/fs/crypto/keysetup.c
+++ b/fs/crypto/keysetup.c
@@ -9,7 +9,6 @@
*/
#include <crypto/skcipher.h>
-#include <linux/key.h>
#include <linux/random.h>
#include "fscrypt_private.h"
@@ -151,6 +150,7 @@
{
crypto_free_skcipher(prep_key->tfm);
fscrypt_destroy_inline_crypt_key(prep_key);
+ memzero_explicit(prep_key, sizeof(*prep_key));
}
/* Given a per-file encryption key, set up the file's crypto transform object */
@@ -404,20 +404,18 @@
/*
* Find the master key, then set up the inode's actual encryption key.
*
- * If the master key is found in the filesystem-level keyring, then the
- * corresponding 'struct key' is returned in *master_key_ret with
- * ->mk_secret_sem read-locked. This is needed to ensure that only one task
- * links the fscrypt_info into ->mk_decrypted_inodes (as multiple tasks may race
- * to create an fscrypt_info for the same inode), and to synchronize the master
- * key being removed with a new inode starting to use it.
+ * If the master key is found in the filesystem-level keyring, then it is
+ * returned in *mk_ret with its semaphore read-locked. This is needed to ensure
+ * that only one task links the fscrypt_info into ->mk_decrypted_inodes (as
+ * multiple tasks may race to create an fscrypt_info for the same inode), and to
+ * synchronize the master key being removed with a new inode starting to use it.
*/
static int setup_file_encryption_key(struct fscrypt_info *ci,
bool need_dirhash_key,
- struct key **master_key_ret)
+ struct fscrypt_master_key **mk_ret)
{
- struct key *key;
- struct fscrypt_master_key *mk = NULL;
struct fscrypt_key_specifier mk_spec;
+ struct fscrypt_master_key *mk;
int err;
err = fscrypt_select_encryption_impl(ci);
@@ -442,11 +440,10 @@
return -EINVAL;
}
- key = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec);
- if (IS_ERR(key)) {
- if (key != ERR_PTR(-ENOKEY) ||
- ci->ci_policy.version != FSCRYPT_POLICY_V1)
- return PTR_ERR(key);
+ mk = fscrypt_find_master_key(ci->ci_inode->i_sb, &mk_spec);
+ if (!mk) {
+ if (ci->ci_policy.version != FSCRYPT_POLICY_V1)
+ return -ENOKEY;
/*
* As a legacy fallback for v1 policies, search for the key in
@@ -456,9 +453,7 @@
*/
return fscrypt_setup_v1_file_key_via_subscribed_keyrings(ci);
}
-
- mk = key->payload.data[0];
- down_read(&mk->mk_secret_sem);
+ down_read(&mk->mk_sem);
/* Has the secret been removed (via FS_IOC_REMOVE_ENCRYPTION_KEY)? */
if (!is_master_key_secret_present(&mk->mk_secret)) {
@@ -486,18 +481,18 @@
if (err)
goto out_release_key;
- *master_key_ret = key;
+ *mk_ret = mk;
return 0;
out_release_key:
- up_read(&mk->mk_secret_sem);
- key_put(key);
+ up_read(&mk->mk_sem);
+ fscrypt_put_master_key(mk);
return err;
}
static void put_crypt_info(struct fscrypt_info *ci)
{
- struct key *key;
+ struct fscrypt_master_key *mk;
if (!ci)
return;
@@ -507,24 +502,18 @@
else if (ci->ci_owns_key)
fscrypt_destroy_prepared_key(&ci->ci_enc_key);
- key = ci->ci_master_key;
- if (key) {
- struct fscrypt_master_key *mk = key->payload.data[0];
-
+ mk = ci->ci_master_key;
+ if (mk) {
/*
* Remove this inode from the list of inodes that were unlocked
- * with the master key.
- *
- * In addition, if we're removing the last inode from a key that
- * already had its secret removed, invalidate the key so that it
- * gets removed from ->s_master_keys.
+ * with the master key. In addition, if we're removing the last
+ * inode from a master key struct that already had its secret
+ * removed, then complete the full removal of the struct.
*/
spin_lock(&mk->mk_decrypted_inodes_lock);
list_del(&ci->ci_master_key_link);
spin_unlock(&mk->mk_decrypted_inodes_lock);
- if (refcount_dec_and_test(&mk->mk_refcount))
- key_invalidate(key);
- key_put(key);
+ fscrypt_put_master_key_activeref(mk);
}
memzero_explicit(ci, sizeof(*ci));
kmem_cache_free(fscrypt_info_cachep, ci);
@@ -538,7 +527,7 @@
{
struct fscrypt_info *crypt_info;
struct fscrypt_mode *mode;
- struct key *master_key = NULL;
+ struct fscrypt_master_key *mk = NULL;
int res;
res = fscrypt_initialize(inode->i_sb->s_cop->flags);
@@ -561,8 +550,7 @@
WARN_ON(mode->ivsize > FSCRYPT_MAX_IV_SIZE);
crypt_info->ci_mode = mode;
- res = setup_file_encryption_key(crypt_info, need_dirhash_key,
- &master_key);
+ res = setup_file_encryption_key(crypt_info, need_dirhash_key, &mk);
if (res)
goto out;
@@ -577,12 +565,9 @@
* We won the race and set ->i_crypt_info to our crypt_info.
* Now link it into the master key's inode list.
*/
- if (master_key) {
- struct fscrypt_master_key *mk =
- master_key->payload.data[0];
-
- refcount_inc(&mk->mk_refcount);
- crypt_info->ci_master_key = key_get(master_key);
+ if (mk) {
+ crypt_info->ci_master_key = mk;
+ refcount_inc(&mk->mk_active_refs);
spin_lock(&mk->mk_decrypted_inodes_lock);
list_add(&crypt_info->ci_master_key_link,
&mk->mk_decrypted_inodes);
@@ -592,11 +577,9 @@
}
res = 0;
out:
- if (master_key) {
- struct fscrypt_master_key *mk = master_key->payload.data[0];
-
- up_read(&mk->mk_secret_sem);
- key_put(master_key);
+ if (mk) {
+ up_read(&mk->mk_sem);
+ fscrypt_put_master_key(mk);
}
put_crypt_info(crypt_info);
return res;
@@ -747,7 +730,6 @@
int fscrypt_drop_inode(struct inode *inode)
{
const struct fscrypt_info *ci = fscrypt_get_info(inode);
- const struct fscrypt_master_key *mk;
/*
* If ci is NULL, then the inode doesn't have an encryption key set up
@@ -757,7 +739,6 @@
*/
if (!ci || !ci->ci_master_key)
return 0;
- mk = ci->ci_master_key->payload.data[0];
/*
* With proper, non-racy use of FS_IOC_REMOVE_ENCRYPTION_KEY, all inodes
@@ -769,13 +750,13 @@
return 0;
/*
- * Note: since we aren't holding ->mk_secret_sem, the result here can
+ * Note: since we aren't holding the key semaphore, the result here can
* immediately become outdated. But there's no correctness problem with
* unnecessarily evicting. Nor is there a correctness problem with not
* evicting while iput() is racing with the key being removed, since
* then the thread removing the key will either evict the inode itself
* or will correctly detect that it wasn't evicted due to the race.
*/
- return !is_master_key_secret_present(&mk->mk_secret);
+ return !is_master_key_secret_present(&ci->ci_master_key->mk_secret);
}
EXPORT_SYMBOL_GPL(fscrypt_drop_inode);
diff --git a/fs/crypto/policy.c b/fs/crypto/policy.c
index faa0f21..f68265c 100644
--- a/fs/crypto/policy.c
+++ b/fs/crypto/policy.c
@@ -686,12 +686,8 @@
* delayed key setup that requires the inode number.
*/
if (ci->ci_policy.version == FSCRYPT_POLICY_V2 &&
- (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32)) {
- const struct fscrypt_master_key *mk =
- ci->ci_master_key->payload.data[0];
-
- fscrypt_hash_inode_number(ci, mk);
- }
+ (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32))
+ fscrypt_hash_inode_number(ci, ci->ci_master_key);
return inode->i_sb->s_cop->set_context(inode, &ctx, ctxsize, fs_data);
}
diff --git a/fs/dax.c b/fs/dax.c
index d5d7b93..3e7e9a5 100644
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -846,7 +846,8 @@
if (!pmd_dirty(*pmdp) && !pmd_write(*pmdp))
goto unlock_pmd;
- flush_cache_page(vma, address, pfn);
+ flush_cache_range(vma, address,
+ address + HPAGE_PMD_SIZE);
pmd = pmdp_invalidate(vma, address, pmdp);
pmd = pmd_wrprotect(pmd);
pmd = pmd_mkclean(pmd);
diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c
index 848e0aa..f47f0a7 100644
--- a/fs/debugfs/inode.c
+++ b/fs/debugfs/inode.c
@@ -731,6 +731,28 @@
EXPORT_SYMBOL_GPL(debugfs_remove);
/**
+ * debugfs_lookup_and_remove - lookup a directory or file and recursively remove it
+ * @name: a pointer to a string containing the name of the item to look up.
+ * @parent: a pointer to the parent dentry of the item.
+ *
+ * This is the equlivant of doing something like
+ * debugfs_remove(debugfs_lookup(..)) but with the proper reference counting
+ * handled for the directory being looked up.
+ */
+void debugfs_lookup_and_remove(const char *name, struct dentry *parent)
+{
+ struct dentry *dentry;
+
+ dentry = debugfs_lookup(name, parent);
+ if (!dentry)
+ return;
+
+ debugfs_remove(dentry);
+ dput(dentry);
+}
+EXPORT_SYMBOL_GPL(debugfs_lookup_and_remove);
+
+/**
* debugfs_rename - rename a file/directory in the debugfs filesystem
* @old_dir: a pointer to the parent dentry for the renamed object. This
* should be a directory dentry.
diff --git a/fs/dlm/ast.c b/fs/dlm/ast.c
index 283c7b9..ca06069 100644
--- a/fs/dlm/ast.c
+++ b/fs/dlm/ast.c
@@ -198,13 +198,13 @@
if (!prev_seq) {
kref_get(&lkb->lkb_ref);
+ mutex_lock(&ls->ls_cb_mutex);
if (test_bit(LSFL_CB_DELAY, &ls->ls_flags)) {
- mutex_lock(&ls->ls_cb_mutex);
list_add(&lkb->lkb_cb_list, &ls->ls_cb_delay);
- mutex_unlock(&ls->ls_cb_mutex);
} else {
queue_work(ls->ls_callback_wq, &lkb->lkb_cb_work);
}
+ mutex_unlock(&ls->ls_cb_mutex);
}
out:
mutex_unlock(&lkb->lkb_cb_mutex);
@@ -284,7 +284,9 @@
void dlm_callback_suspend(struct dlm_ls *ls)
{
+ mutex_lock(&ls->ls_cb_mutex);
set_bit(LSFL_CB_DELAY, &ls->ls_flags);
+ mutex_unlock(&ls->ls_cb_mutex);
if (ls->ls_callback_wq)
flush_workqueue(ls->ls_callback_wq);
diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c
index 1e9d899..dde9afb 100644
--- a/fs/dlm/lock.c
+++ b/fs/dlm/lock.c
@@ -1551,6 +1551,7 @@
lkb->lkb_wait_type = 0;
lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
lkb->lkb_wait_count--;
+ unhold_lkb(lkb);
goto out_del;
}
@@ -1577,6 +1578,7 @@
log_error(ls, "remwait error %x reply %d wait_type %d overlap",
lkb->lkb_id, mstype, lkb->lkb_wait_type);
lkb->lkb_wait_count--;
+ unhold_lkb(lkb);
lkb->lkb_wait_type = 0;
}
@@ -2886,17 +2888,9 @@
static int validate_lock_args(struct dlm_ls *ls, struct dlm_lkb *lkb,
struct dlm_args *args)
{
- int rv = -EINVAL;
+ int rv = -EBUSY;
if (args->flags & DLM_LKF_CONVERT) {
- if (lkb->lkb_flags & DLM_IFL_MSTCPY)
- goto out;
-
- if (args->flags & DLM_LKF_QUECVT &&
- !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
- goto out;
-
- rv = -EBUSY;
if (lkb->lkb_status != DLM_LKSTS_GRANTED)
goto out;
@@ -2905,6 +2899,14 @@
if (is_overlap(lkb))
goto out;
+
+ rv = -EINVAL;
+ if (lkb->lkb_flags & DLM_IFL_MSTCPY)
+ goto out;
+
+ if (args->flags & DLM_LKF_QUECVT &&
+ !__quecvt_compat_matrix[lkb->lkb_grmode+1][args->mode+1])
+ goto out;
}
lkb->lkb_exflags = args->flags;
@@ -4065,13 +4067,14 @@
rv = _create_message(ls, sizeof(struct dlm_message) + len,
dir_nodeid, DLM_MSG_REMOVE, &ms, &mh);
if (rv)
- return;
+ goto out;
memcpy(ms->m_extra, name, len);
ms->m_hash = hash;
send_message(mh, ms);
+out:
spin_lock(&ls->ls_remove_spin);
ls->ls_remove_len = 0;
memset(ls->ls_remove_name, 0, DLM_RESNAME_MAXLEN);
@@ -5312,11 +5315,16 @@
lkb->lkb_flags &= ~DLM_IFL_OVERLAP_UNLOCK;
lkb->lkb_flags &= ~DLM_IFL_OVERLAP_CANCEL;
lkb->lkb_wait_type = 0;
- lkb->lkb_wait_count = 0;
+ /* drop all wait_count references we still
+ * hold a reference for this iteration.
+ */
+ while (lkb->lkb_wait_count) {
+ lkb->lkb_wait_count--;
+ unhold_lkb(lkb);
+ }
mutex_lock(&ls->ls_waiters_mutex);
list_del_init(&lkb->lkb_wait_reply);
mutex_unlock(&ls->ls_waiters_mutex);
- unhold_lkb(lkb); /* for waiters list */
if (oc || ou) {
/* do an unlock or cancel instead of resending */
diff --git a/fs/dlm/plock.c b/fs/dlm/plock.c
index c38b2b8..a10d2bc 100644
--- a/fs/dlm/plock.c
+++ b/fs/dlm/plock.c
@@ -23,11 +23,11 @@
struct list_head list;
int done;
struct dlm_plock_info info;
+ int (*callback)(struct file_lock *fl, int result);
};
struct plock_xop {
struct plock_op xop;
- int (*callback)(struct file_lock *fl, int result);
void *fl;
void *file;
struct file_lock flc;
@@ -129,19 +129,18 @@
/* fl_owner is lockd which doesn't distinguish
processes on the nfs client */
op->info.owner = (__u64) fl->fl_pid;
- xop->callback = fl->fl_lmops->lm_grant;
+ op->callback = fl->fl_lmops->lm_grant;
locks_init_lock(&xop->flc);
locks_copy_lock(&xop->flc, fl);
xop->fl = fl;
xop->file = file;
} else {
op->info.owner = (__u64)(long) fl->fl_owner;
- xop->callback = NULL;
}
send_op(op);
- if (xop->callback == NULL) {
+ if (!op->callback) {
rv = wait_event_interruptible(recv_wq, (op->done != 0));
if (rv == -ERESTARTSYS) {
log_debug(ls, "dlm_posix_lock: wait killed %llx",
@@ -203,7 +202,7 @@
file = xop->file;
flc = &xop->flc;
fl = xop->fl;
- notify = xop->callback;
+ notify = op->callback;
if (op->info.rv) {
notify(fl, op->info.rv);
@@ -436,10 +435,9 @@
if (op->info.fsid == info.fsid &&
op->info.number == info.number &&
op->info.owner == info.owner) {
- struct plock_xop *xop = (struct plock_xop *)op;
list_del_init(&op->list);
memcpy(&op->info, &info, sizeof(info));
- if (xop->callback)
+ if (op->callback)
do_callback = 1;
else
op->done = 1;
diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c
index 8a6260a..f921580 100644
--- a/fs/erofs/decompressor.c
+++ b/fs/erofs/decompressor.c
@@ -56,14 +56,18 @@
if (page) {
__clear_bit(j, bounced);
- if (kaddr) {
- if (kaddr + PAGE_SIZE == page_address(page))
+ if (!PageHighMem(page)) {
+ if (!i) {
+ kaddr = page_address(page);
+ continue;
+ }
+ if (kaddr &&
+ kaddr + PAGE_SIZE == page_address(page)) {
kaddr += PAGE_SIZE;
- else
- kaddr = NULL;
- } else if (!i) {
- kaddr = page_address(page);
+ continue;
+ }
}
+ kaddr = NULL;
continue;
}
kaddr = NULL;
diff --git a/fs/eventpoll.c b/fs/eventpoll.c
index 6094b2e..2f1f053 100644
--- a/fs/eventpoll.c
+++ b/fs/eventpoll.c
@@ -1804,6 +1804,21 @@
return timespec64_add_safe(now, ts);
}
+/*
+ * autoremove_wake_function, but remove even on failure to wake up, because we
+ * know that default_wake_function/ttwu will only fail if the thread is already
+ * woken, and in that case the ep_poll loop will remove the entry anyways, not
+ * try to reuse it.
+ */
+static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry,
+ unsigned int mode, int sync, void *key)
+{
+ int ret = default_wake_function(wq_entry, mode, sync, key);
+
+ list_del_init(&wq_entry->entry);
+ return ret;
+}
+
/**
* ep_poll - Retrieves ready events, and delivers them to the caller supplied
* event buffer.
@@ -1881,8 +1896,15 @@
* normal wakeup path no need to call __remove_wait_queue()
* explicitly, thus ep->lock is not taken, which halts the
* event delivery.
+ *
+ * In fact, we now use an even more aggressive function that
+ * unconditionally removes, because we don't reuse the wait
+ * entry between loop iterations. This lets us also avoid the
+ * performance issue if a process is killed, causing all of its
+ * threads to wake up without being removed normally.
*/
init_wait(&wait);
+ wait.func = ep_autoremove_wake_function;
write_lock_irq(&ep->lock);
/*
diff --git a/fs/exec.c b/fs/exec.c
index ca89e0e..983295c 100644
--- a/fs/exec.c
+++ b/fs/exec.c
@@ -494,8 +494,14 @@
* the stack. They aren't stored until much later when we can't
* signal to the parent that the child has run out of stack space.
* Instead, calculate it here so it's possible to fail gracefully.
+ *
+ * In the case of argc = 0, make sure there is space for adding a
+ * empty string (which will bump argc to 1), to ensure confused
+ * userspace programs don't start processing from argv[1], thinking
+ * argc can never be 0, to keep them from walking envp by accident.
+ * See do_execveat_common().
*/
- ptr_size = (bprm->argc + bprm->envc) * sizeof(void *);
+ ptr_size = (max(bprm->argc, 1) + bprm->envc) * sizeof(void *);
if (limit <= ptr_size)
return -E2BIG;
limit -= ptr_size;
@@ -1192,11 +1198,11 @@
return -ENOMEM;
refcount_set(&newsighand->count, 1);
- memcpy(newsighand->action, oldsighand->action,
- sizeof(newsighand->action));
write_lock_irq(&tasklist_lock);
spin_lock(&oldsighand->siglock);
+ memcpy(newsighand->action, oldsighand->action,
+ sizeof(newsighand->action));
rcu_assign_pointer(me->sighand, newsighand);
spin_unlock(&oldsighand->siglock);
write_unlock_irq(&tasklist_lock);
@@ -1280,7 +1286,10 @@
bprm->mm = NULL;
#ifdef CONFIG_POSIX_TIMERS
- exit_itimers(me->signal);
+ spin_lock_irq(&me->sighand->siglock);
+ posix_cpu_timers_exit(me);
+ spin_unlock_irq(&me->sighand->siglock);
+ exit_itimers(me);
flush_itimer_signals();
#endif
@@ -1886,6 +1895,9 @@
}
retval = count(argv, MAX_ARG_STRINGS);
+ if (retval == 0)
+ pr_warn_once("process '%s' launched '%s' with NULL argv: empty string added\n",
+ current->comm, bprm->filename);
if (retval < 0)
goto out_free;
bprm->argc = retval;
@@ -1912,6 +1924,19 @@
if (retval < 0)
goto out_free;
+ /*
+ * When argv is empty, add an empty string ("") as argv[0] to
+ * ensure confused userspace programs that start processing
+ * from argv[1] won't end up walking envp. See also
+ * bprm_stack_limits().
+ */
+ if (bprm->argc == 0) {
+ retval = copy_string_kernel("", bprm);
+ if (retval < 0)
+ goto out_free;
+ bprm->argc = 1;
+ }
+
retval = bprm_execve(bprm, fd, filename, flags);
out_free:
free_bprm(bprm);
@@ -1940,6 +1965,8 @@
}
retval = count_strings_kernel(argv);
+ if (WARN_ON_ONCE(retval == 0))
+ retval = -EINVAL;
if (retval < 0)
goto out_free;
bprm->argc = retval;
diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c
index 579c10f..258b6bb 100644
--- a/fs/exfat/balloc.c
+++ b/fs/exfat/balloc.c
@@ -148,7 +148,9 @@
struct super_block *sb = inode->i_sb;
struct exfat_sb_info *sbi = EXFAT_SB(sb);
- WARN_ON(clu < EXFAT_FIRST_CLUSTER);
+ if (!is_valid_cluster(sbi, clu))
+ return -EINVAL;
+
ent_idx = CLUSTER_TO_BITMAP_ENT(clu);
i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx);
b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx);
@@ -166,7 +168,9 @@
struct exfat_sb_info *sbi = EXFAT_SB(sb);
struct exfat_mount_options *opts = &sbi->options;
- WARN_ON(clu < EXFAT_FIRST_CLUSTER);
+ if (!is_valid_cluster(sbi, clu))
+ return;
+
ent_idx = CLUSTER_TO_BITMAP_ENT(clu);
i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx);
b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx);
diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h
index b8f0e82..0d139c7 100644
--- a/fs/exfat/exfat_fs.h
+++ b/fs/exfat/exfat_fs.h
@@ -380,6 +380,14 @@
EXFAT_RESERVED_CLUSTERS;
}
+static inline bool is_valid_cluster(struct exfat_sb_info *sbi,
+ unsigned int clus)
+{
+ if (clus < EXFAT_FIRST_CLUSTER || sbi->num_clusters <= clus)
+ return false;
+ return true;
+}
+
/* super.c */
int exfat_set_volume_dirty(struct super_block *sb);
int exfat_clear_volume_dirty(struct super_block *sb);
diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c
index c3c9afe..a1481e4 100644
--- a/fs/exfat/fatent.c
+++ b/fs/exfat/fatent.c
@@ -81,14 +81,6 @@
return 0;
}
-static inline bool is_valid_cluster(struct exfat_sb_info *sbi,
- unsigned int clus)
-{
- if (clus < EXFAT_FIRST_CLUSTER || sbi->num_clusters <= clus)
- return false;
- return true;
-}
-
int exfat_ent_get(struct super_block *sb, unsigned int loc,
unsigned int *content)
{
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 09f1fe6..9a6475b 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -756,8 +756,12 @@
res += 1LL << (bits-2);
res += 1LL << (2*(bits-2));
res += 1LL << (3*(bits-2));
+ /* Compute how many metadata blocks are needed */
+ meta_blocks = 1;
+ meta_blocks += 1 + ppb;
+ meta_blocks += 1 + ppb + ppb * ppb;
/* Does block tree limit file size? */
- if (res < upper_limit)
+ if (res + meta_blocks <= upper_limit)
goto check_lfs;
res = upper_limit;
@@ -1056,9 +1060,10 @@
sbi->s_frags_per_group);
goto failed_mount;
}
- if (sbi->s_inodes_per_group > sb->s_blocksize * 8) {
+ if (sbi->s_inodes_per_group < sbi->s_inodes_per_block ||
+ sbi->s_inodes_per_group > sb->s_blocksize * 8) {
ext2_msg(sb, KERN_ERR,
- "error: #inodes per group too big: %lu",
+ "error: invalid #inodes per group: %lu",
sbi->s_inodes_per_group);
goto failed_mount;
}
@@ -1068,6 +1073,13 @@
sbi->s_groups_count = ((le32_to_cpu(es->s_blocks_count) -
le32_to_cpu(es->s_first_data_block) - 1)
/ EXT2_BLOCKS_PER_GROUP(sb)) + 1;
+ if ((u64)sbi->s_groups_count * sbi->s_inodes_per_group !=
+ le32_to_cpu(es->s_inodes_count)) {
+ ext2_msg(sb, KERN_ERR, "error: invalid #inodes: %u vs computed %llu",
+ le32_to_cpu(es->s_inodes_count),
+ (u64)sbi->s_groups_count * sbi->s_inodes_per_group);
+ goto failed_mount;
+ }
db_count = (sbi->s_groups_count + EXT2_DESC_PER_BLOCK(sb) - 1) /
EXT2_DESC_PER_BLOCK(sb);
sbi->s_group_desc = kmalloc_array (db_count,
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 455eb34..4ad1c3c 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -1419,12 +1419,6 @@
#ifdef __KERNEL__
-#ifdef CONFIG_FS_ENCRYPTION
-#define DUMMY_ENCRYPTION_ENABLED(sbi) ((sbi)->s_dummy_enc_policy.policy != NULL)
-#else
-#define DUMMY_ENCRYPTION_ENABLED(sbi) (0)
-#endif
-
/* Number of quota types we support */
#define EXT4_MAXQUOTAS 3
@@ -2159,6 +2153,10 @@
* Structure of a directory entry
*/
#define EXT4_NAME_LEN 255
+/*
+ * Base length of the ext4 directory entry excluding the name length
+ */
+#define EXT4_BASE_DIR_LEN (sizeof(struct ext4_dir_entry_2) - EXT4_NAME_LEN)
struct ext4_dir_entry {
__le32 inode; /* Inode number */
@@ -2870,7 +2868,7 @@
extern int ext4_can_truncate(struct inode *inode);
extern int ext4_truncate(struct inode *);
extern int ext4_break_layouts(struct inode *);
-extern int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length);
+extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length);
extern void ext4_set_inode_flags(struct inode *, bool init);
extern int ext4_alloc_da_blocks(struct inode *inode);
extern void ext4_set_aops(struct inode *inode);
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index 0fda305..54750b7 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -371,7 +371,7 @@
{
unsigned short entries;
ext4_lblk_t lblock = 0;
- ext4_lblk_t prev = 0;
+ ext4_lblk_t cur = 0;
if (eh->eh_entries == 0)
return 1;
@@ -395,11 +395,11 @@
/* Check for overlapping extents */
lblock = le32_to_cpu(ext->ee_block);
- if ((lblock <= prev) && prev) {
+ if (lblock < cur) {
*pblk = ext4_ext_pblock(ext);
return 0;
}
- prev = lblock + ext4_ext_get_actual_len(ext) - 1;
+ cur = lblock + ext4_ext_get_actual_len(ext);
ext++;
entries--;
}
@@ -419,13 +419,13 @@
/* Check for overlapping index extents */
lblock = le32_to_cpu(ext_idx->ei_block);
- if ((lblock <= prev) && prev) {
+ if (lblock < cur) {
*pblk = ext4_idx_pblock(ext_idx);
return 0;
}
ext_idx++;
entries--;
- prev = lblock;
+ cur = lblock + 1;
}
}
return 1;
@@ -459,6 +459,10 @@
error_msg = "invalid eh_entries";
goto corrupted;
}
+ if (unlikely((eh->eh_entries == 0) && (depth > 0))) {
+ error_msg = "eh_entries is 0 but eh_depth is > 0";
+ goto corrupted;
+ }
if (!ext4_valid_extent_entries(inode, eh, lblk, &pblk, depth)) {
error_msg = "invalid extent entries";
goto corrupted;
@@ -4498,9 +4502,9 @@
return ret > 0 ? ret2 : ret;
}
-static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len);
+static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len);
-static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len);
+static int ext4_insert_range(struct file *file, loff_t offset, loff_t len);
static long ext4_zero_range(struct file *file, loff_t offset,
loff_t len, int mode)
@@ -4571,6 +4575,10 @@
/* Wait all existing dio workers, newcomers will block on i_mutex */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out_mutex;
+
/* Preallocate the range including the unaligned edges */
if (partial_begin || partial_end) {
ret = ext4_alloc_file_blocks(file,
@@ -4687,23 +4695,24 @@
return -EOPNOTSUPP;
ext4_fc_start_update(inode);
-
- if (mode & FALLOC_FL_PUNCH_HOLE) {
- ret = ext4_punch_hole(inode, offset, len);
- goto exit;
- }
-
+ inode_lock(inode);
ret = ext4_convert_inline_data(inode);
+ inode_unlock(inode);
if (ret)
goto exit;
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ ret = ext4_punch_hole(file, offset, len);
+ goto exit;
+ }
+
if (mode & FALLOC_FL_COLLAPSE_RANGE) {
- ret = ext4_collapse_range(inode, offset, len);
+ ret = ext4_collapse_range(file, offset, len);
goto exit;
}
if (mode & FALLOC_FL_INSERT_RANGE) {
- ret = ext4_insert_range(inode, offset, len);
+ ret = ext4_insert_range(file, offset, len);
goto exit;
}
@@ -4739,6 +4748,10 @@
/* Wait all existing dio workers, newcomers will block on i_mutex */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out;
+
ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags);
if (ret)
goto out;
@@ -5169,6 +5182,7 @@
* and it is decreased till we reach start.
*/
again:
+ ret = 0;
if (SHIFT == SHIFT_LEFT)
iterator = &start;
else
@@ -5212,14 +5226,21 @@
ext4_ext_get_actual_len(extent);
} else {
extent = EXT_FIRST_EXTENT(path[depth].p_hdr);
- if (le32_to_cpu(extent->ee_block) > 0)
+ if (le32_to_cpu(extent->ee_block) > start)
*iterator = le32_to_cpu(extent->ee_block) - 1;
- else
- /* Beginning is reached, end of the loop */
+ else if (le32_to_cpu(extent->ee_block) == start)
iterator = NULL;
- /* Update path extent in case we need to stop */
- while (le32_to_cpu(extent->ee_block) < start)
+ else {
+ extent = EXT_LAST_EXTENT(path[depth].p_hdr);
+ while (le32_to_cpu(extent->ee_block) >= start)
+ extent--;
+
+ if (extent == EXT_LAST_EXTENT(path[depth].p_hdr))
+ break;
+
extent++;
+ iterator = NULL;
+ }
path[depth].p_ext = extent;
}
ret = ext4_ext_shift_path_extents(path, shift, inode,
@@ -5241,8 +5262,9 @@
* This implements the fallocate's collapse range functionality for ext4
* Returns: 0 and non-zero on error.
*/
-static int ext4_collapse_range(struct inode *inode, loff_t offset, loff_t len)
+static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
{
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
ext4_lblk_t punch_start, punch_stop;
handle_t *handle;
@@ -5293,6 +5315,10 @@
/* Wait for existing dio to complete */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out_mutex;
+
/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
@@ -5387,8 +5413,9 @@
* by len bytes.
* Returns 0 on success, error otherwise.
*/
-static int ext4_insert_range(struct inode *inode, loff_t offset, loff_t len)
+static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
{
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
handle_t *handle;
struct ext4_ext_path *path;
@@ -5444,6 +5471,10 @@
/* Wait for existing dio to complete */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out_mutex;
+
/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index 501e607..41dcf21 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -766,22 +766,25 @@
tl.fc_tag = cpu_to_le16(EXT4_FC_TAG_INODE);
tl.fc_len = cpu_to_le16(inode_len + sizeof(fc_inode.fc_ino));
+ ret = -ECANCELED;
dst = ext4_fc_reserve_space(inode->i_sb,
sizeof(tl) + inode_len + sizeof(fc_inode.fc_ino), crc);
if (!dst)
- return -ECANCELED;
+ goto err;
if (!ext4_fc_memcpy(inode->i_sb, dst, &tl, sizeof(tl), crc))
- return -ECANCELED;
+ goto err;
dst += sizeof(tl);
if (!ext4_fc_memcpy(inode->i_sb, dst, &fc_inode, sizeof(fc_inode), crc))
- return -ECANCELED;
+ goto err;
dst += sizeof(fc_inode);
if (!ext4_fc_memcpy(inode->i_sb, dst, (u8 *)ext4_raw_inode(&iloc),
inode_len, crc))
- return -ECANCELED;
-
- return 0;
+ goto err;
+ ret = 0;
+err:
+ brelse(iloc.bh);
+ return ret;
}
/*
@@ -1388,13 +1391,15 @@
if (state->fc_modified_inodes[i] == ino)
return 0;
if (state->fc_modified_inodes_used == state->fc_modified_inodes_size) {
- state->fc_modified_inodes = krealloc(
- state->fc_modified_inodes,
+ int *fc_modified_inodes;
+
+ fc_modified_inodes = krealloc(state->fc_modified_inodes,
sizeof(int) * (state->fc_modified_inodes_size +
EXT4_FC_REPLAY_REALLOC_INCREMENT),
GFP_KERNEL);
- if (!state->fc_modified_inodes)
+ if (!fc_modified_inodes)
return -ENOMEM;
+ state->fc_modified_inodes = fc_modified_inodes;
state->fc_modified_inodes_size +=
EXT4_FC_REPLAY_REALLOC_INCREMENT;
}
@@ -1579,15 +1584,18 @@
if (replay && state->fc_regions_used != state->fc_regions_valid)
state->fc_regions_used = state->fc_regions_valid;
if (state->fc_regions_used == state->fc_regions_size) {
+ struct ext4_fc_alloc_region *fc_regions;
+
+ fc_regions = krealloc(state->fc_regions,
+ sizeof(struct ext4_fc_alloc_region) *
+ (state->fc_regions_size +
+ EXT4_FC_REPLAY_REALLOC_INCREMENT),
+ GFP_KERNEL);
+ if (!fc_regions)
+ return -ENOMEM;
state->fc_regions_size +=
EXT4_FC_REPLAY_REALLOC_INCREMENT;
- state->fc_regions = krealloc(
- state->fc_regions,
- state->fc_regions_size *
- sizeof(struct ext4_fc_alloc_region),
- GFP_KERNEL);
- if (!state->fc_regions)
- return -ENOMEM;
+ state->fc_regions = fc_regions;
}
region = &state->fc_regions[state->fc_regions_used++];
region->ino = ino;
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index 7b28d44..0f61e0a 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -529,6 +529,12 @@
ret = -EAGAIN;
goto out;
}
+ /*
+ * Make sure inline data cannot be created anymore since we are going
+ * to allocate blocks for DIO. We know the inode does not have any
+ * inline data now because ext4_dio_supported() checked for that.
+ */
+ ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
offset = iocb->ki_pos;
count = ret;
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index 875af32..c53c9b1 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -508,7 +508,7 @@
goto fallback;
}
- max_dirs = ndirs / ngroups + inodes_per_group / 16;
+ max_dirs = ndirs / ngroups + inodes_per_group*flex_size / 16;
min_inodes = avefreei - inodes_per_group*flex_size / 4;
if (min_inodes < 1)
min_inodes = 1;
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index ae1f0c5..88bd1d1 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -34,6 +34,9 @@
struct ext4_inode *raw_inode;
int free, min_offs;
+ if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
+ return 0;
+
min_offs = EXT4_SB(inode->i_sb)->s_inode_size -
EXT4_GOOD_OLD_INODE_SIZE -
EXT4_I(inode)->i_extra_isize -
@@ -1768,19 +1771,20 @@
void *inline_pos;
unsigned int offset;
struct ext4_dir_entry_2 *de;
- bool ret = true;
+ bool ret = false;
err = ext4_get_inode_loc(dir, &iloc);
if (err) {
EXT4_ERROR_INODE_ERR(dir, -err,
"error %d getting inode %lu block",
err, dir->i_ino);
- return true;
+ return false;
}
down_read(&EXT4_I(dir)->xattr_sem);
if (!ext4_has_inline_data(dir)) {
*has_inline_data = 0;
+ ret = true;
goto out;
}
@@ -1789,7 +1793,6 @@
ext4_warning(dir->i_sb,
"bad inline directory (dir #%lu) - no `..'",
dir->i_ino);
- ret = true;
goto out;
}
@@ -1808,16 +1811,15 @@
dir->i_ino, le32_to_cpu(de->inode),
le16_to_cpu(de->rec_len), de->name_len,
inline_size);
- ret = true;
goto out;
}
if (le32_to_cpu(de->inode)) {
- ret = false;
goto out;
}
offset += ext4_rec_len_from_disk(de->rec_len, inline_size);
}
+ ret = true;
out:
up_read(&EXT4_I(dir)->xattr_sem);
brelse(iloc.bh);
@@ -1975,6 +1977,18 @@
if (!ext4_has_inline_data(inode)) {
ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
return 0;
+ } else if (!ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
+ /*
+ * Inode has inline data but EXT4_STATE_MAY_INLINE_DATA is
+ * cleared. This means we are in the middle of moving of
+ * inline data to delay allocated block. Just force writeout
+ * here to finish conversion.
+ */
+ error = filemap_flush(inode->i_mapping);
+ if (error)
+ return error;
+ if (!ext4_has_inline_data(inode))
+ return 0;
}
needed_blocks = ext4_writepage_trans_blocks(inode);
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index d59474a..45f31dc 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -1175,6 +1175,13 @@
page = grab_cache_page_write_begin(mapping, index, flags);
if (!page)
return -ENOMEM;
+ /*
+ * The same as page allocation, we prealloc buffer heads before
+ * starting the handle.
+ */
+ if (!page_has_buffers(page))
+ create_empty_buffers(page, inode->i_sb->s_blocksize, 0);
+
unlock_page(page);
retry_journal:
@@ -1577,7 +1584,14 @@
ext4_lblk_t start, last;
start = index << (PAGE_SHIFT - inode->i_blkbits);
last = end << (PAGE_SHIFT - inode->i_blkbits);
+
+ /*
+ * avoid racing with extent status tree scans made by
+ * ext4_insert_delayed_block()
+ */
+ down_write(&EXT4_I(inode)->i_data_sem);
ext4_es_remove_extent(inode, start, last - start + 1);
+ up_write(&EXT4_I(inode)->i_data_sem);
}
pagevec_init(&pvec);
@@ -2023,6 +2037,15 @@
else
len = PAGE_SIZE;
+ /* Should never happen but for bugs in other kernel subsystems */
+ if (!page_has_buffers(page)) {
+ ext4_warning_inode(inode,
+ "page %lu does not have buffers attached", page->index);
+ ClearPageDirty(page);
+ unlock_page(page);
+ return 0;
+ }
+
page_bufs = page_buffers(page);
/*
* We cannot do block allocation or other extent handling in this
@@ -2626,6 +2649,22 @@
wait_on_page_writeback(page);
BUG_ON(PageWriteback(page));
+ /*
+ * Should never happen but for buggy code in
+ * other subsystems that call
+ * set_page_dirty() without properly warning
+ * the file system first. See [1] for more
+ * information.
+ *
+ * [1] https://lore.kernel.org/linux-mm/20180103100430.GE4911@quack2.suse.cz
+ */
+ if (!page_has_buffers(page)) {
+ ext4_warning_inode(mpd->inode, "page %lu does not have buffers attached", page->index);
+ ClearPageDirty(page);
+ unlock_page(page);
+ continue;
+ }
+
if (mpd->map.m_len == 0)
mpd->first_page = page->index;
mpd->next_page = page->index + 1;
@@ -3194,13 +3233,15 @@
{
struct inode *inode = mapping->host;
journal_t *journal;
+ sector_t ret = 0;
int err;
+ inode_lock_shared(inode);
/*
* We can get here for an inline file via the FIBMAP ioctl
*/
if (ext4_has_inline_data(inode))
- return 0;
+ goto out;
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY) &&
test_opt(inode->i_sb, DELALLOC)) {
@@ -3239,10 +3280,14 @@
jbd2_journal_unlock_updates(journal);
if (err)
- return 0;
+ goto out;
}
- return iomap_bmap(mapping, block, &ext4_iomap_ops);
+ ret = iomap_bmap(mapping, block, &ext4_iomap_ops);
+
+out:
+ inode_unlock_shared(inode);
+ return ret;
}
static int ext4_readpage(struct file *file, struct page *page)
@@ -4003,27 +4048,20 @@
* Returns: 0 on success or negative on failure
*/
-int ext4_punch_hole(struct inode *inode, loff_t offset, loff_t length)
+int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
{
+ struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
ext4_lblk_t first_block, stop_block;
struct address_space *mapping = inode->i_mapping;
- loff_t first_block_offset, last_block_offset;
+ loff_t first_block_offset, last_block_offset, max_length;
+ struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
handle_t *handle;
unsigned int credits;
int ret = 0, ret2 = 0;
trace_ext4_punch_hole(inode, offset, length, 0);
- ext4_clear_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA);
- if (ext4_has_inline_data(inode)) {
- down_write(&EXT4_I(inode)->i_mmap_sem);
- ret = ext4_convert_inline_data(inode);
- up_write(&EXT4_I(inode)->i_mmap_sem);
- if (ret)
- return ret;
- }
-
/*
* Write out all dirty pages to avoid race conditions
* Then release them.
@@ -4051,6 +4089,14 @@
offset;
}
+ /*
+ * For punch hole the length + offset needs to be within one block
+ * before last range. Adjust the length if it goes beyond that limit.
+ */
+ max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize;
+ if (offset + length > max_length)
+ length = max_length - offset;
+
if (offset & (sb->s_blocksize - 1) ||
(offset + length) & (sb->s_blocksize - 1)) {
/*
@@ -4066,6 +4112,10 @@
/* Wait all existing dio workers, newcomers will block on i_mutex */
inode_dio_wait(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out_mutex;
+
/*
* Prevent page faults from reinstantiating pages we have released from
* page cache.
@@ -4570,8 +4620,7 @@
__le32 *magic = (void *)raw_inode +
EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize;
- if (EXT4_GOOD_OLD_INODE_SIZE + ei->i_extra_isize + sizeof(__le32) <=
- EXT4_INODE_SIZE(inode->i_sb) &&
+ if (EXT4_INODE_HAS_XATTR_SPACE(inode) &&
*magic == cpu_to_le32(EXT4_XATTR_MAGIC)) {
ext4_set_inode_state(inode, EXT4_STATE_XATTR);
return ext4_find_inline_data_nolock(inode);
@@ -5405,6 +5454,7 @@
if (attr->ia_valid & ATTR_SIZE) {
handle_t *handle;
loff_t oldsize = inode->i_size;
+ loff_t old_disksize;
int shrink = (attr->ia_size < inode->i_size);
if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) {
@@ -5478,6 +5528,7 @@
inode->i_sb->s_blocksize_bits);
down_write(&EXT4_I(inode)->i_data_sem);
+ old_disksize = EXT4_I(inode)->i_disksize;
EXT4_I(inode)->i_disksize = attr->ia_size;
rc = ext4_mark_inode_dirty(handle, inode);
if (!error)
@@ -5489,6 +5540,8 @@
*/
if (!error)
i_size_write(inode, attr->ia_size);
+ else
+ EXT4_I(inode)->i_disksize = old_disksize;
up_write(&EXT4_I(inode)->i_data_sem);
ext4_journal_stop(handle);
if (error)
@@ -5723,7 +5776,12 @@
}
ext4_fc_track_inode(handle, inode);
- if (IS_I_VERSION(inode))
+ /*
+ * ea_inodes are using i_version for storing reference count, don't
+ * mess with it
+ */
+ if (IS_I_VERSION(inode) &&
+ !(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL))
inode_inc_iversion(inode);
/* the do_update_inode consumes one bh->b_count */
diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c
index 110c258..d5ca02a 100644
--- a/fs/ext4/mballoc.c
+++ b/fs/ext4/mballoc.c
@@ -3320,69 +3320,95 @@
struct ext4_sb_info *sbi = EXT4_SB(sb);
ext4_group_t group;
ext4_grpblk_t blkoff;
- int i, clen, err;
+ int i, err;
int already;
+ unsigned int clen, clen_changed, thisgrp_len;
- clen = EXT4_B2C(sbi, len);
+ while (len > 0) {
+ ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
- ext4_get_group_no_and_offset(sb, block, &group, &blkoff);
- bitmap_bh = ext4_read_block_bitmap(sb, group);
- if (IS_ERR(bitmap_bh)) {
- err = PTR_ERR(bitmap_bh);
- bitmap_bh = NULL;
- goto out_err;
+ /*
+ * Check to see if we are freeing blocks across a group
+ * boundary.
+ * In case of flex_bg, this can happen that (block, len) may
+ * span across more than one group. In that case we need to
+ * get the corresponding group metadata to work with.
+ * For this we have goto again loop.
+ */
+ thisgrp_len = min_t(unsigned int, (unsigned int)len,
+ EXT4_BLOCKS_PER_GROUP(sb) - EXT4_C2B(sbi, blkoff));
+ clen = EXT4_NUM_B2C(sbi, thisgrp_len);
+
+ bitmap_bh = ext4_read_block_bitmap(sb, group);
+ if (IS_ERR(bitmap_bh)) {
+ err = PTR_ERR(bitmap_bh);
+ bitmap_bh = NULL;
+ break;
+ }
+
+ err = -EIO;
+ gdp = ext4_get_group_desc(sb, group, &gdp_bh);
+ if (!gdp)
+ break;
+
+ ext4_lock_group(sb, group);
+ already = 0;
+ for (i = 0; i < clen; i++)
+ if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) ==
+ !state)
+ already++;
+
+ clen_changed = clen - already;
+ if (state)
+ ext4_set_bits(bitmap_bh->b_data, blkoff, clen);
+ else
+ mb_test_and_clear_bits(bitmap_bh->b_data, blkoff, clen);
+ if (ext4_has_group_desc_csum(sb) &&
+ (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
+ gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
+ ext4_free_group_clusters_set(sb, gdp,
+ ext4_free_clusters_after_init(sb, group, gdp));
+ }
+ if (state)
+ clen = ext4_free_group_clusters(sb, gdp) - clen_changed;
+ else
+ clen = ext4_free_group_clusters(sb, gdp) + clen_changed;
+
+ ext4_free_group_clusters_set(sb, gdp, clen);
+ ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
+ ext4_group_desc_csum_set(sb, group, gdp);
+
+ ext4_unlock_group(sb, group);
+
+ if (sbi->s_log_groups_per_flex) {
+ ext4_group_t flex_group = ext4_flex_group(sbi, group);
+ struct flex_groups *fg = sbi_array_rcu_deref(sbi,
+ s_flex_groups, flex_group);
+
+ if (state)
+ atomic64_sub(clen_changed, &fg->free_clusters);
+ else
+ atomic64_add(clen_changed, &fg->free_clusters);
+
+ }
+
+ err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
+ if (err)
+ break;
+ sync_dirty_buffer(bitmap_bh);
+ err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
+ sync_dirty_buffer(gdp_bh);
+ if (err)
+ break;
+
+ block += thisgrp_len;
+ len -= thisgrp_len;
+ brelse(bitmap_bh);
+ BUG_ON(len < 0);
}
- err = -EIO;
- gdp = ext4_get_group_desc(sb, group, &gdp_bh);
- if (!gdp)
- goto out_err;
-
- ext4_lock_group(sb, group);
- already = 0;
- for (i = 0; i < clen; i++)
- if (!mb_test_bit(blkoff + i, bitmap_bh->b_data) == !state)
- already++;
-
- if (state)
- ext4_set_bits(bitmap_bh->b_data, blkoff, clen);
- else
- mb_test_and_clear_bits(bitmap_bh->b_data, blkoff, clen);
- if (ext4_has_group_desc_csum(sb) &&
- (gdp->bg_flags & cpu_to_le16(EXT4_BG_BLOCK_UNINIT))) {
- gdp->bg_flags &= cpu_to_le16(~EXT4_BG_BLOCK_UNINIT);
- ext4_free_group_clusters_set(sb, gdp,
- ext4_free_clusters_after_init(sb,
- group, gdp));
- }
- if (state)
- clen = ext4_free_group_clusters(sb, gdp) - clen + already;
- else
- clen = ext4_free_group_clusters(sb, gdp) + clen - already;
-
- ext4_free_group_clusters_set(sb, gdp, clen);
- ext4_block_bitmap_csum_set(sb, group, gdp, bitmap_bh);
- ext4_group_desc_csum_set(sb, group, gdp);
-
- ext4_unlock_group(sb, group);
-
- if (sbi->s_log_groups_per_flex) {
- ext4_group_t flex_group = ext4_flex_group(sbi, group);
-
- atomic64_sub(len,
- &sbi_array_rcu_deref(sbi, s_flex_groups,
- flex_group)->free_clusters);
- }
-
- err = ext4_handle_dirty_metadata(NULL, NULL, bitmap_bh);
if (err)
- goto out_err;
- sync_dirty_buffer(bitmap_bh);
- err = ext4_handle_dirty_metadata(NULL, NULL, gdp_bh);
- sync_dirty_buffer(gdp_bh);
-
-out_err:
- brelse(bitmap_bh);
+ brelse(bitmap_bh);
}
/*
@@ -3494,6 +3520,15 @@
size = size >> bsbits;
start = start_off >> bsbits;
+ /*
+ * For tiny groups (smaller than 8MB) the chosen allocation
+ * alignment may be larger than group size. Make sure the
+ * alignment does not move allocation to a different group which
+ * makes mballoc fail assertions later.
+ */
+ start = max(start, rounddown(ac->ac_o_ex.fe_logical,
+ (ext4_lblk_t)EXT4_BLOCKS_PER_GROUP(ac->ac_sb)));
+
/* don't cover already allocated blocks in selected range */
if (ar->pleft && start <= ar->lleft) {
size -= ar->lleft + 1 - start;
@@ -4924,6 +4959,7 @@
ext4_fsblk_t block = 0;
unsigned int inquota = 0;
unsigned int reserv_clstrs = 0;
+ int retries = 0;
u64 seq;
might_sleep();
@@ -5026,7 +5062,8 @@
ar->len = ac->ac_b_ex.fe_len;
}
} else {
- if (ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
+ if (++retries < 3 &&
+ ext4_mb_discard_preallocations_should_retry(sb, ac, &seq))
goto repeat;
/*
* If block allocation fails then the pa allocated above
diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c
index 4991281..4bfe225 100644
--- a/fs/ext4/migrate.c
+++ b/fs/ext4/migrate.c
@@ -417,7 +417,7 @@
struct inode *tmp_inode = NULL;
struct migrate_struct lb;
unsigned long max_entries;
- __u32 goal;
+ __u32 goal, tmp_csum_seed;
uid_t owner[2];
/*
@@ -425,7 +425,8 @@
* already is extent-based, error out.
*/
if (!ext4_has_feature_extents(inode->i_sb) ||
- (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
+ ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) ||
+ ext4_has_inline_data(inode))
return -EINVAL;
if (S_ISLNK(inode->i_mode) && inode->i_blocks == 0)
@@ -465,6 +466,7 @@
* the migration.
*/
ei = EXT4_I(inode);
+ tmp_csum_seed = EXT4_I(tmp_inode)->i_csum_seed;
EXT4_I(tmp_inode)->i_csum_seed = ei->i_csum_seed;
i_size_write(tmp_inode, i_size_read(inode));
/*
@@ -575,6 +577,7 @@
* the inode is not visible to user space.
*/
tmp_inode->i_blocks = 0;
+ EXT4_I(tmp_inode)->i_csum_seed = tmp_csum_seed;
/* Reset the extent details */
ext4_ext_tree_init(handle, tmp_inode);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index f71de6c..b2e131d 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -54,6 +54,7 @@
struct inode *inode,
ext4_lblk_t *block)
{
+ struct ext4_map_blocks map;
struct buffer_head *bh;
int err;
@@ -63,6 +64,21 @@
return ERR_PTR(-ENOSPC);
*block = inode->i_size >> inode->i_sb->s_blocksize_bits;
+ map.m_lblk = *block;
+ map.m_len = 1;
+
+ /*
+ * We're appending new directory block. Make sure the block is not
+ * allocated yet, otherwise we will end up corrupting the
+ * directory.
+ */
+ err = ext4_map_blocks(NULL, inode, &map, 0);
+ if (err < 0)
+ return ERR_PTR(err);
+ if (err) {
+ EXT4_ERROR_INODE(inode, "Logical block already allocated");
+ return ERR_PTR(-EFSCORRUPTED);
+ }
bh = ext4_bread(handle, inode, *block, EXT4_GET_BLOCKS_CREATE);
if (IS_ERR(bh))
@@ -109,6 +125,13 @@
struct ext4_dir_entry *dirent;
int is_dx_block = 0;
+ if (block >= inode->i_size >> inode->i_blkbits) {
+ ext4_error_inode(inode, func, line, block,
+ "Attempting to read directory block (%u) that is past i_size (%llu)",
+ block, inode->i_size);
+ return ERR_PTR(-EFSCORRUPTED);
+ }
+
if (ext4_simulate_fail(inode->i_sb, EXT4_SIM_DIRBLOCK_EIO))
bh = ERR_PTR(-EIO);
else
@@ -280,9 +303,9 @@
struct dx_hash_info *hinfo,
struct dx_frame *frame);
static void dx_release(struct dx_frame *frames);
-static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
- unsigned blocksize, struct dx_hash_info *hinfo,
- struct dx_map_entry map[]);
+static int dx_make_map(struct inode *dir, struct buffer_head *bh,
+ struct dx_hash_info *hinfo,
+ struct dx_map_entry *map_tail);
static void dx_sort_map(struct dx_map_entry *map, unsigned count);
static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to,
struct dx_map_entry *offsets, int count, unsigned blocksize);
@@ -756,12 +779,14 @@
dx_probe(struct ext4_filename *fname, struct inode *dir,
struct dx_hash_info *hinfo, struct dx_frame *frame_in)
{
- unsigned count, indirect;
+ unsigned count, indirect, level, i;
struct dx_entry *at, *entries, *p, *q, *m;
struct dx_root *root;
struct dx_frame *frame = frame_in;
struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
u32 hash;
+ ext4_lblk_t block;
+ ext4_lblk_t blocks[EXT4_HTREE_LEVEL];
memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
frame->bh = ext4_read_dirblock(dir, 0, INDEX);
@@ -817,6 +842,8 @@
}
dxtrace(printk("Look up %x", hash));
+ level = 0;
+ blocks[0] = 0;
while (1) {
count = dx_get_count(entries);
if (!count || count > dx_get_limit(entries)) {
@@ -858,15 +885,27 @@
dx_get_block(at)));
frame->entries = entries;
frame->at = at;
- if (!indirect--)
+
+ block = dx_get_block(at);
+ for (i = 0; i <= level; i++) {
+ if (blocks[i] == block) {
+ ext4_warning_inode(dir,
+ "dx entry: tree cycle block %u points back to block %u",
+ blocks[level], block);
+ goto fail;
+ }
+ }
+ if (++level > indirect)
return frame;
+ blocks[level] = block;
frame++;
- frame->bh = ext4_read_dirblock(dir, dx_get_block(at), INDEX);
+ frame->bh = ext4_read_dirblock(dir, block, INDEX);
if (IS_ERR(frame->bh)) {
ret_err = (struct dx_frame *) frame->bh;
frame->bh = NULL;
goto fail;
}
+
entries = ((struct dx_node *) frame->bh->b_data)->entries;
if (dx_get_limit(entries) != dx_node_limit(dir)) {
@@ -1208,15 +1247,23 @@
* Create map of hash values, offsets, and sizes, stored at end of block.
* Returns number of entries mapped.
*/
-static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de,
- unsigned blocksize, struct dx_hash_info *hinfo,
+static int dx_make_map(struct inode *dir, struct buffer_head *bh,
+ struct dx_hash_info *hinfo,
struct dx_map_entry *map_tail)
{
int count = 0;
- char *base = (char *) de;
+ struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *)bh->b_data;
+ unsigned int buflen = bh->b_size;
+ char *base = bh->b_data;
struct dx_hash_info h = *hinfo;
- while ((char *) de < base + blocksize) {
+ if (ext4_has_metadata_csum(dir->i_sb))
+ buflen -= sizeof(struct ext4_dir_entry_tail);
+
+ while ((char *) de < base + buflen) {
+ if (ext4_check_dir_entry(dir, NULL, de, bh, base, buflen,
+ ((char *)de) - base))
+ return -EFSCORRUPTED;
if (de->name_len && de->inode) {
ext4fs_dirhash(dir, de->name, de->name_len, &h);
map_tail--;
@@ -1226,8 +1273,7 @@
count++;
cond_resched();
}
- /* XXX: do we need to check rec_len == 0 case? -Chris */
- de = ext4_next_entry(de, blocksize);
+ de = ext4_next_entry(de, dir->i_sb->s_blocksize);
}
return count;
}
@@ -1388,10 +1434,10 @@
de = (struct ext4_dir_entry_2 *)search_buf;
dlimit = search_buf + buf_size;
- while ((char *) de < dlimit) {
+ while ((char *) de < dlimit - EXT4_BASE_DIR_LEN) {
/* this code is executed quadratically often */
/* do minimal checking `by hand' */
- if ((char *) de + de->name_len <= dlimit &&
+ if (de->name + de->name_len <= dlimit &&
ext4_match(dir, fname, de)) {
/* found a match - just to be sure, do
* a full check */
@@ -1818,7 +1864,8 @@
struct dx_hash_info *hinfo)
{
unsigned blocksize = dir->i_sb->s_blocksize;
- unsigned count, continued;
+ unsigned continued;
+ int count;
struct buffer_head *bh2;
ext4_lblk_t newblock;
u32 hash2;
@@ -1853,8 +1900,11 @@
/* create map in the end of data2 block */
map = (struct dx_map_entry *) (data2 + blocksize);
- count = dx_make_map(dir, (struct ext4_dir_entry_2 *) data1,
- blocksize, hinfo, map);
+ count = dx_make_map(dir, *bh, hinfo, map);
+ if (count < 0) {
+ err = count;
+ goto journal_error;
+ }
map -= count;
dx_sort_map(map, count);
/* Ensure that neither split block is over half full */
@@ -2103,8 +2153,16 @@
memcpy(data2, de, len);
de = (struct ext4_dir_entry_2 *) data2;
top = data2 + len;
- while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top)
+ while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) {
+ if (ext4_check_dir_entry(dir, NULL, de, bh2, data2, len,
+ (data2 + (blocksize - csum_size) -
+ (char *) de))) {
+ brelse(bh2);
+ brelse(bh);
+ return -EFSCORRUPTED;
+ }
de = de2;
+ }
de->rec_len = ext4_rec_len_to_disk(data2 + (blocksize - csum_size) -
(char *) de, blocksize);
@@ -2868,14 +2926,14 @@
sb = inode->i_sb;
if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) {
EXT4_ERROR_INODE(inode, "invalid size");
- return true;
+ return false;
}
/* The first directory block must not be a hole,
* so treat it as DIRENT_HTREE
*/
bh = ext4_read_dirblock(inode, 0, DIRENT_HTREE);
if (IS_ERR(bh))
- return true;
+ return false;
de = (struct ext4_dir_entry_2 *) bh->b_data;
if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data, bh->b_size,
@@ -2883,7 +2941,7 @@
le32_to_cpu(de->inode) != inode->i_ino || strcmp(".", de->name)) {
ext4_warning_inode(inode, "directory missing '.'");
brelse(bh);
- return true;
+ return false;
}
offset = ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
de = ext4_next_entry(de, sb->s_blocksize);
@@ -2892,7 +2950,7 @@
le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
ext4_warning_inode(inode, "directory missing '..'");
brelse(bh);
- return true;
+ return false;
}
offset += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize);
while (offset < inode->i_size) {
@@ -2906,16 +2964,13 @@
continue;
}
if (IS_ERR(bh))
- return true;
+ return false;
}
de = (struct ext4_dir_entry_2 *) (bh->b_data +
(offset & (sb->s_blocksize - 1)));
if (ext4_check_dir_entry(inode, NULL, de, bh,
- bh->b_data, bh->b_size, offset)) {
- offset = (offset | (sb->s_blocksize - 1)) + 1;
- continue;
- }
- if (le32_to_cpu(de->inode)) {
+ bh->b_data, bh->b_size, offset) ||
+ le32_to_cpu(de->inode)) {
brelse(bh);
return false;
}
@@ -3500,6 +3555,9 @@
struct buffer_head *bh;
if (!ext4_has_inline_data(inode)) {
+ struct ext4_dir_entry_2 *de;
+ unsigned int offset;
+
/* The first directory block must not be a hole, so
* treat it as DIRENT_HTREE
*/
@@ -3508,9 +3566,30 @@
*retval = PTR_ERR(bh);
return NULL;
}
- *parent_de = ext4_next_entry(
- (struct ext4_dir_entry_2 *)bh->b_data,
- inode->i_sb->s_blocksize);
+
+ de = (struct ext4_dir_entry_2 *) bh->b_data;
+ if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data,
+ bh->b_size, 0) ||
+ le32_to_cpu(de->inode) != inode->i_ino ||
+ strcmp(".", de->name)) {
+ EXT4_ERROR_INODE(inode, "directory missing '.'");
+ brelse(bh);
+ *retval = -EFSCORRUPTED;
+ return NULL;
+ }
+ offset = ext4_rec_len_from_disk(de->rec_len,
+ inode->i_sb->s_blocksize);
+ de = ext4_next_entry(de, inode->i_sb->s_blocksize);
+ if (ext4_check_dir_entry(inode, NULL, de, bh, bh->b_data,
+ bh->b_size, offset) ||
+ le32_to_cpu(de->inode) == 0 || strcmp("..", de->name)) {
+ EXT4_ERROR_INODE(inode, "directory missing '..'");
+ brelse(bh);
+ *retval = -EFSCORRUPTED;
+ return NULL;
+ }
+ *parent_de = de;
+
return bh;
}
diff --git a/fs/ext4/page-io.c b/fs/ext4/page-io.c
index defd2e1..4569075 100644
--- a/fs/ext4/page-io.c
+++ b/fs/ext4/page-io.c
@@ -137,8 +137,10 @@
continue;
}
clear_buffer_async_write(bh);
- if (bio->bi_status)
+ if (bio->bi_status) {
+ set_buffer_write_io_error(bh);
buffer_io_error(bh);
+ }
} while ((bh = bh->b_this_page) != head);
spin_unlock_irqrestore(&head->b_uptodate_lock, flags);
if (!under_io) {
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index 6513079..c55ba03 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -53,6 +53,16 @@
return -EPERM;
/*
+ * If the reserved GDT blocks is non-zero, the resize_inode feature
+ * should always be set.
+ */
+ if (EXT4_SB(sb)->s_es->s_reserved_gdt_blocks &&
+ !ext4_has_feature_resize_inode(sb)) {
+ ext4_error(sb, "resize_inode disabled but reserved GDT blocks non-zero");
+ return -EFSCORRUPTED;
+ }
+
+ /*
* If we are not using the primary superblock/GDT copy don't resize,
* because the user tools have no way of handling this. Probably a
* bad time to do it anyways.
@@ -1451,6 +1461,7 @@
* Update the fs overhead information
*/
ext4_calculate_overhead(sb);
+ es->s_overhead_clusters = cpu_to_le32(sbi->s_overhead);
if (test_opt(sb, DEBUG))
printk(KERN_DEBUG "EXT4-fs: added group %u:"
@@ -1946,6 +1957,16 @@
}
brelse(bh);
+ /*
+ * For bigalloc, trim the requested size to the nearest cluster
+ * boundary to avoid creating an unusable filesystem. We do this
+ * silently, instead of returning an error, to avoid breaking
+ * callers that blindly resize the filesystem to the full size of
+ * the underlying block device.
+ */
+ if (ext4_has_feature_bigalloc(sb))
+ n_blocks_count &= ~((1 << EXT4_CLUSTER_BITS(sb)) - 1);
+
retry:
o_blocks_count = ext4_blocks_count(es);
@@ -2047,7 +2068,7 @@
goto out;
}
- if (ext4_blocks_count(es) == n_blocks_count)
+ if (ext4_blocks_count(es) == n_blocks_count && n_blocks_count_retry == 0)
goto out;
err = ext4_alloc_flex_bg_array(sb, n_group + 1);
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 9e210bc..9573d49 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -188,19 +188,12 @@
int ext4_read_bh_lock(struct buffer_head *bh, int op_flags, bool wait)
{
- if (trylock_buffer(bh)) {
- if (wait)
- return ext4_read_bh(bh, op_flags, NULL);
+ lock_buffer(bh);
+ if (!wait) {
ext4_read_bh_nowait(bh, op_flags, NULL);
return 0;
}
- if (wait) {
- wait_on_buffer(bh);
- if (buffer_uptodate(bh))
- return 0;
- return -EIO;
- }
- return 0;
+ return ext4_read_bh(bh, op_flags, NULL);
}
/*
@@ -247,7 +240,8 @@
struct buffer_head *bh = sb_getblk_gfp(sb, block, 0);
if (likely(bh)) {
- ext4_read_bh_lock(bh, REQ_RAHEAD, false);
+ if (trylock_buffer(bh))
+ ext4_read_bh_nowait(bh, REQ_RAHEAD, NULL);
brelse(bh);
}
}
@@ -1176,18 +1170,23 @@
int aborted = 0;
int i, err;
- ext4_unregister_li_request(sb);
- ext4_quota_off_umount(sb);
-
- destroy_workqueue(sbi->rsv_conversion_wq);
-
/*
* Unregister sysfs before destroying jbd2 journal.
* Since we could still access attr_journal_task attribute via sysfs
* path which could have sbi->s_journal->j_task as NULL
+ * Unregister sysfs before flush sbi->s_error_work.
+ * Since user may read /proc/fs/ext4/xx/mb_groups during umount, If
+ * read metadata verify failed then will queue error work.
+ * flush_stashed_error_work will call start_this_handle may trigger
+ * BUG_ON.
*/
ext4_unregister_sysfs(sb);
+ ext4_unregister_li_request(sb);
+ ext4_quota_off_umount(sb);
+
+ destroy_workqueue(sbi->rsv_conversion_wq);
+
if (sbi->s_journal) {
aborted = is_journal_aborted(sbi->s_journal);
err = jbd2_journal_destroy(sbi->s_journal);
@@ -1955,6 +1954,7 @@
MOPT_EXT4_ONLY | MOPT_CLEAR},
{Opt_warn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_SET},
{Opt_nowarn_on_error, EXT4_MOUNT_WARN_ON_ERROR, MOPT_CLEAR},
+ {Opt_commit, 0, MOPT_NO_EXT2},
{Opt_nojournal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
MOPT_EXT4_ONLY | MOPT_CLEAR},
{Opt_journal_checksum, EXT4_MOUNT_JOURNAL_CHECKSUM,
@@ -2078,6 +2078,12 @@
struct ext4_sb_info *sbi = EXT4_SB(sb);
int err;
+ if (!ext4_has_feature_encrypt(sb)) {
+ ext4_msg(sb, KERN_WARNING,
+ "test_dummy_encryption requires encrypt feature");
+ return -1;
+ }
+
/*
* This mount option is just for testing, and it's not worthwhile to
* implement the extra complexity (e.g. RCU protection) that would be
@@ -2105,11 +2111,13 @@
return -1;
}
ext4_msg(sb, KERN_WARNING, "Test dummy encryption mode enabled");
+ return 1;
#else
ext4_msg(sb, KERN_WARNING,
- "Test dummy encryption mount option ignored");
+ "test_dummy_encryption option not supported");
+ return -1;
+
#endif
- return 1;
}
static int handle_mount_opt(struct super_block *sb, char *opt, int token,
@@ -3536,6 +3544,7 @@
unsigned long next_wakeup, cur;
BUG_ON(NULL == eli);
+ set_freezable();
cont_thread:
while (true) {
@@ -3870,9 +3879,11 @@
ext4_fsblk_t first_block, last_block, b;
ext4_group_t i, ngroups = ext4_get_groups_count(sb);
int s, j, count = 0;
+ int has_super = ext4_bg_has_super(sb, grp);
if (!ext4_has_feature_bigalloc(sb))
- return (ext4_bg_has_super(sb, grp) + ext4_bg_num_gdb(sb, grp) +
+ return (has_super + ext4_bg_num_gdb(sb, grp) +
+ (has_super ? le16_to_cpu(sbi->s_es->s_reserved_gdt_blocks) : 0) +
sbi->s_itb_per_group + 2);
first_block = le32_to_cpu(sbi->s_es->s_first_data_block) +
@@ -4530,7 +4541,7 @@
sbi->s_inodes_per_block;
sbi->s_desc_per_block = blocksize / EXT4_DESC_SIZE(sb);
sbi->s_sbh = bh;
- sbi->s_mount_state = le16_to_cpu(es->s_state);
+ sbi->s_mount_state = le16_to_cpu(es->s_state) & ~EXT4_FC_REPLAY;
sbi->s_addr_per_block_bits = ilog2(EXT4_ADDR_PER_BLOCK(sb));
sbi->s_desc_per_block_bits = ilog2(EXT4_DESC_PER_BLOCK(sb));
@@ -4921,19 +4932,22 @@
goto failed_mount_wq;
}
- if (DUMMY_ENCRYPTION_ENABLED(sbi) && !sb_rdonly(sb) &&
- !ext4_has_feature_encrypt(sb)) {
- ext4_set_feature_encrypt(sb);
- ext4_commit_super(sb, 1);
- }
-
/*
* Get the # of file system overhead blocks from the
* superblock if present.
*/
- if (es->s_overhead_clusters)
- sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
- else {
+ sbi->s_overhead = le32_to_cpu(es->s_overhead_clusters);
+ /* ignore the precalculated value if it is ridiculous */
+ if (sbi->s_overhead > ext4_blocks_count(es))
+ sbi->s_overhead = 0;
+ /*
+ * If the bigalloc feature is not enabled recalculating the
+ * overhead doesn't take long, so we might as well just redo
+ * it to make sure we are using the correct value.
+ */
+ if (!ext4_has_feature_bigalloc(sb))
+ sbi->s_overhead = 0;
+ if (sbi->s_overhead == 0) {
err = ext4_calculate_overhead(sb);
if (err)
goto failed_mount_wq;
@@ -5981,7 +5995,8 @@
if (err)
goto restore_opts;
}
- sbi->s_mount_state = le16_to_cpu(es->s_state);
+ sbi->s_mount_state = (le16_to_cpu(es->s_state) &
+ ~EXT4_FC_REPLAY);
err = ext4_setup_super(sb, es, 0);
if (err)
@@ -6253,7 +6268,7 @@
handle_t *handle;
/* Data block + inode block */
- handle = ext4_journal_start(d_inode(sb->s_root), EXT4_HT_QUOTA, 2);
+ handle = ext4_journal_start_sb(sb, EXT4_HT_QUOTA, 2);
if (IS_ERR(handle))
return PTR_ERR(handle);
ret = dquot_commit_info(sb, type);
diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c
index 00e3cbd..35be8e7 100644
--- a/fs/ext4/verity.c
+++ b/fs/ext4/verity.c
@@ -370,13 +370,14 @@
pgoff_t index,
unsigned long num_ra_pages)
{
- DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index);
struct page *page;
index += ext4_verity_metadata_pos(inode) >> PAGE_SHIFT;
page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
if (!page || !PageUptodate(page)) {
+ DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index);
+
if (page)
put_page(page);
else if (num_ra_pages > 1)
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 5462f26..38531c5 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -2167,8 +2167,9 @@
struct ext4_inode *raw_inode;
int error;
- if (EXT4_I(inode)->i_extra_isize == 0)
+ if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
return 0;
+
raw_inode = ext4_raw_inode(&is->iloc);
header = IHDR(inode, raw_inode);
is->s.base = is->s.first = IFIRST(header);
@@ -2196,8 +2197,9 @@
struct ext4_xattr_search *s = &is->s;
int error;
- if (EXT4_I(inode)->i_extra_isize == 0)
+ if (!EXT4_INODE_HAS_XATTR_SPACE(inode))
return -ENOSPC;
+
error = ext4_xattr_set_entry(i, s, handle, inode, false /* is_block */);
if (error)
return error;
diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h
index 730b91f..87e5863 100644
--- a/fs/ext4/xattr.h
+++ b/fs/ext4/xattr.h
@@ -95,6 +95,19 @@
#define EXT4_ZERO_XATTR_VALUE ((void *)-1)
+/*
+ * If we want to add an xattr to the inode, we should make sure that
+ * i_extra_isize is not 0 and that the inode size is not less than
+ * EXT4_GOOD_OLD_INODE_SIZE + extra_isize + pad.
+ * EXT4_GOOD_OLD_INODE_SIZE extra_isize header entry pad data
+ * |--------------------------|------------|------|---------|---|-------|
+ */
+#define EXT4_INODE_HAS_XATTR_SPACE(inode) \
+ ((EXT4_I(inode)->i_extra_isize != 0) && \
+ (EXT4_GOOD_OLD_INODE_SIZE + EXT4_I(inode)->i_extra_isize + \
+ sizeof(struct ext4_xattr_ibody_header) + EXT4_XATTR_PAD <= \
+ EXT4_INODE_SIZE((inode)->i_sb)))
+
struct ext4_xattr_info {
const char *name;
const void *value;
diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c
index 9bcd77d..cd46a64 100644
--- a/fs/f2fs/checkpoint.c
+++ b/fs/f2fs/checkpoint.c
@@ -136,7 +136,7 @@
unsigned int segno, offset;
bool exist;
- if (type != DATA_GENERIC_ENHANCE && type != DATA_GENERIC_ENHANCE_READ)
+ if (type == DATA_GENERIC)
return true;
segno = GET_SEGNO(sbi, blkaddr);
@@ -144,11 +144,18 @@
se = get_seg_entry(sbi, segno);
exist = f2fs_test_bit(offset, se->cur_valid_map);
+ if (exist && type == DATA_GENERIC_ENHANCE_UPDATE) {
+ f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d",
+ blkaddr, exist);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ return exist;
+ }
+
if (!exist && type == DATA_GENERIC_ENHANCE) {
f2fs_err(sbi, "Inconsistent error blkaddr:%u, sit bitmap:%d",
blkaddr, exist);
set_sbi_flag(sbi, SBI_NEED_FSCK);
- WARN_ON(1);
+ dump_stack();
}
return exist;
}
@@ -181,12 +188,13 @@
case DATA_GENERIC:
case DATA_GENERIC_ENHANCE:
case DATA_GENERIC_ENHANCE_READ:
+ case DATA_GENERIC_ENHANCE_UPDATE:
if (unlikely(blkaddr >= MAX_BLKADDR(sbi) ||
blkaddr < MAIN_BLKADDR(sbi))) {
f2fs_warn(sbi, "access invalid blkaddr:%u",
blkaddr);
set_sbi_flag(sbi, SBI_NEED_FSCK);
- WARN_ON(1);
+ dump_stack();
return false;
} else {
return __is_bitmap_valid(sbi, blkaddr, type);
@@ -851,6 +859,7 @@
struct page *cp_page_1 = NULL, *cp_page_2 = NULL;
struct f2fs_checkpoint *cp_block = NULL;
unsigned long long cur_version = 0, pre_version = 0;
+ unsigned int cp_blocks;
int err;
err = get_checkpoint_version(sbi, cp_addr, &cp_block,
@@ -858,15 +867,16 @@
if (err)
return NULL;
- if (le32_to_cpu(cp_block->cp_pack_total_block_count) >
- sbi->blocks_per_seg) {
+ cp_blocks = le32_to_cpu(cp_block->cp_pack_total_block_count);
+
+ if (cp_blocks > sbi->blocks_per_seg || cp_blocks <= F2FS_CP_PACKS) {
f2fs_warn(sbi, "invalid cp_pack_total_block_count:%u",
le32_to_cpu(cp_block->cp_pack_total_block_count));
goto invalid_cp;
}
pre_version = *version;
- cp_addr += le32_to_cpu(cp_block->cp_pack_total_block_count) - 1;
+ cp_addr += cp_blocks - 1;
err = get_checkpoint_version(sbi, cp_addr, &cp_block,
&cp_page_2, version);
if (err)
@@ -1037,7 +1047,8 @@
spin_unlock(&sbi->inode_lock[type]);
}
-int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type)
+int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type,
+ bool from_cp)
{
struct list_head *head;
struct inode *inode;
@@ -1072,11 +1083,15 @@
if (inode) {
unsigned long cur_ino = inode->i_ino;
- F2FS_I(inode)->cp_task = current;
+ if (from_cp)
+ F2FS_I(inode)->cp_task = current;
+ F2FS_I(inode)->wb_task = current;
filemap_fdatawrite(inode->i_mapping);
- F2FS_I(inode)->cp_task = NULL;
+ F2FS_I(inode)->wb_task = NULL;
+ if (from_cp)
+ F2FS_I(inode)->cp_task = NULL;
iput(inode);
/* We need to give cpu to another writers. */
@@ -1205,7 +1220,7 @@
/* write all the dirty dentry pages */
if (get_pages(sbi, F2FS_DIRTY_DENTS)) {
f2fs_unlock_all(sbi);
- err = f2fs_sync_dirty_inodes(sbi, DIR_INODE);
+ err = f2fs_sync_dirty_inodes(sbi, DIR_INODE, true);
if (err)
return err;
cond_resched();
diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c
index ec542e8..1541da5 100644
--- a/fs/f2fs/compress.c
+++ b/fs/f2fs/compress.c
@@ -286,10 +286,9 @@
}
if (ret != PAGE_SIZE << dic->log_cluster_size) {
- printk_ratelimited("%sF2FS-fs (%s): lz4 invalid rlen:%zu, "
+ printk_ratelimited("%sF2FS-fs (%s): lz4 invalid ret:%d, "
"expected:%lu\n", KERN_ERR,
- F2FS_I_SB(dic->inode)->sb->s_id,
- dic->rlen,
+ F2FS_I_SB(dic->inode)->sb->s_id, ret,
PAGE_SIZE << dic->log_cluster_size);
return -EIO;
}
diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c
index 1b11a42..9270330 100644
--- a/fs/f2fs/data.c
+++ b/fs/f2fs/data.c
@@ -2912,7 +2912,7 @@
}
unlock_page(page);
if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) &&
- !F2FS_I(inode)->cp_task && allow_balance)
+ !F2FS_I(inode)->wb_task && allow_balance)
f2fs_balance_fs(sbi, need_balance_fs);
if (unlikely(f2fs_cp_error(sbi))) {
@@ -3210,7 +3210,7 @@
struct writeback_control *wbc)
{
/* to avoid deadlock in path of data flush */
- if (F2FS_I(inode)->cp_task)
+ if (F2FS_I(inode)->wb_task)
return false;
if (!S_ISREG(inode->i_mode))
@@ -3264,8 +3264,12 @@
/* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */
if (wbc->sync_mode == WB_SYNC_ALL)
atomic_inc(&sbi->wb_sync_req[DATA]);
- else if (atomic_read(&sbi->wb_sync_req[DATA]))
+ else if (atomic_read(&sbi->wb_sync_req[DATA])) {
+ /* to avoid potential deadlock */
+ if (current->plug)
+ blk_finish_plug(current->plug);
goto skip_write;
+ }
if (__should_serialize_io(inode, wbc)) {
mutex_lock(&sbi->writepages);
@@ -3457,6 +3461,9 @@
*fsdata = NULL;
+ if (len == PAGE_SIZE && !(f2fs_is_atomic_file(inode)))
+ goto repeat;
+
ret = f2fs_prepare_compress_overwrite(inode, pagep,
index, fsdata);
if (ret < 0) {
diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c
index 3ebf976..bd16c78 100644
--- a/fs/f2fs/extent_cache.c
+++ b/fs/f2fs/extent_cache.c
@@ -762,9 +762,8 @@
if (!f2fs_may_extent_tree(inode))
return;
- set_inode_flag(inode, FI_NO_EXTENT);
-
write_lock(&et->lock);
+ set_inode_flag(inode, FI_NO_EXTENT);
__free_extent_tree(sbi, et);
if (et->largest.len) {
et->largest.len = 0;
diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h
index 6c4bf22..c03fdda 100644
--- a/fs/f2fs/f2fs.h
+++ b/fs/f2fs/f2fs.h
@@ -235,6 +235,10 @@
* condition of read on truncated area
* by extent_cache
*/
+ DATA_GENERIC_ENHANCE_UPDATE, /*
+ * strong check on range and segment
+ * bitmap for update case
+ */
META_GENERIC,
};
@@ -697,6 +701,7 @@
unsigned int clevel; /* maximum level of given file name */
struct task_struct *task; /* lookup and create consistency */
struct task_struct *cp_task; /* separate cp/wb IO stats*/
+ struct task_struct *wb_task; /* indicate inode is in context of writeback */
nid_t i_xattr_nid; /* node id that contains xattrs */
loff_t last_disk_size; /* lastly written file size */
spinlock_t i_size_lock; /* protect last_disk_size */
@@ -1021,8 +1026,8 @@
*/
#define PAGE_TYPE_OF_BIO(type) ((type) > META ? META : (type))
enum page_type {
- DATA,
- NODE,
+ DATA = 0,
+ NODE = 1, /* should not change this */
META,
NR_PAGE_TYPE,
META_FLUSH,
@@ -2284,11 +2289,17 @@
{
spin_lock(&sbi->stat_lock);
- f2fs_bug_on(sbi, !sbi->total_valid_block_count);
- f2fs_bug_on(sbi, !sbi->total_valid_node_count);
+ if (unlikely(!sbi->total_valid_block_count ||
+ !sbi->total_valid_node_count)) {
+ f2fs_warn(sbi, "dec_valid_node_count: inconsistent block counts, total_valid_block:%u, total_valid_node:%u",
+ sbi->total_valid_block_count,
+ sbi->total_valid_node_count);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ } else {
+ sbi->total_valid_block_count--;
+ sbi->total_valid_node_count--;
+ }
- sbi->total_valid_node_count--;
- sbi->total_valid_block_count--;
if (sbi->reserved_blocks &&
sbi->current_reserved_blocks < sbi->reserved_blocks)
sbi->current_reserved_blocks++;
@@ -2416,24 +2427,31 @@
return entry;
}
-static inline bool is_idle(struct f2fs_sb_info *sbi, int type)
+static inline bool is_inflight_io(struct f2fs_sb_info *sbi, int type)
{
- if (sbi->gc_mode == GC_URGENT_HIGH)
- return true;
-
if (get_pages(sbi, F2FS_RD_DATA) || get_pages(sbi, F2FS_RD_NODE) ||
get_pages(sbi, F2FS_RD_META) || get_pages(sbi, F2FS_WB_DATA) ||
get_pages(sbi, F2FS_WB_CP_DATA) ||
get_pages(sbi, F2FS_DIO_READ) ||
get_pages(sbi, F2FS_DIO_WRITE))
- return false;
+ return true;
if (type != DISCARD_TIME && SM_I(sbi) && SM_I(sbi)->dcc_info &&
atomic_read(&SM_I(sbi)->dcc_info->queued_discard))
- return false;
+ return true;
if (SM_I(sbi) && SM_I(sbi)->fcc_info &&
atomic_read(&SM_I(sbi)->fcc_info->queued_flush))
+ return true;
+ return false;
+}
+
+static inline bool is_idle(struct f2fs_sb_info *sbi, int type)
+{
+ if (sbi->gc_mode == GC_URGENT_HIGH)
+ return true;
+
+ if (is_inflight_io(sbi, type))
return false;
if (sbi->gc_mode == GC_URGENT_LOW &&
@@ -3383,7 +3401,8 @@
int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi);
void f2fs_update_dirty_page(struct inode *inode, struct page *page);
void f2fs_remove_dirty_inode(struct inode *inode);
-int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type);
+int f2fs_sync_dirty_inodes(struct f2fs_sb_info *sbi, enum inode_type type,
+ bool from_cp);
void f2fs_wait_on_all_pages(struct f2fs_sb_info *sbi, int type);
int f2fs_write_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc);
void f2fs_init_ino_entry_info(struct f2fs_sb_info *sbi);
@@ -3729,6 +3748,7 @@
* inline.c
*/
bool f2fs_may_inline_data(struct inode *inode);
+bool f2fs_sanity_check_inline_data(struct inode *inode);
bool f2fs_may_inline_dentry(struct inode *inode);
void f2fs_do_read_inline_data(struct page *page, struct page *ipage);
void f2fs_truncate_inline_inode(struct inode *inode,
diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c
index 1fbaab1..d56fcac 100644
--- a/fs/f2fs/file.c
+++ b/fs/f2fs/file.c
@@ -1413,11 +1413,19 @@
ret = -ENOSPC;
break;
}
- if (dn->data_blkaddr != NEW_ADDR) {
- f2fs_invalidate_blocks(sbi, dn->data_blkaddr);
- dn->data_blkaddr = NEW_ADDR;
- f2fs_set_data_blkaddr(dn);
+
+ if (dn->data_blkaddr == NEW_ADDR)
+ continue;
+
+ if (!f2fs_is_valid_blkaddr(sbi, dn->data_blkaddr,
+ DATA_GENERIC_ENHANCE)) {
+ ret = -EFSCORRUPTED;
+ break;
}
+
+ f2fs_invalidate_blocks(sbi, dn->data_blkaddr);
+ dn->data_blkaddr = NEW_ADDR;
+ f2fs_set_data_blkaddr(dn);
}
f2fs_update_extent_cache_range(dn, start, 0, index - start);
@@ -1736,6 +1744,10 @@
inode_lock(inode);
+ ret = file_modified(file);
+ if (ret)
+ goto out;
+
if (mode & FALLOC_FL_PUNCH_HOLE) {
if (offset >= inode->i_size)
goto out;
@@ -1832,10 +1844,7 @@
if (masked_flags & F2FS_COMPR_FL) {
if (!f2fs_disable_compressed_file(inode))
return -EINVAL;
- }
- if (iflags & F2FS_NOCOMP_FL)
- return -EINVAL;
- if (iflags & F2FS_COMPR_FL) {
+ } else {
if (!f2fs_may_compress(inode))
return -EINVAL;
if (S_ISREG(inode->i_mode) && inode->i_size)
@@ -1844,10 +1853,6 @@
set_compress_context(inode);
}
}
- if ((iflags ^ masked_flags) & F2FS_NOCOMP_FL) {
- if (masked_flags & F2FS_COMPR_FL)
- return -EINVAL;
- }
fi->i_flags = iflags | (fi->i_flags & ~mask);
f2fs_bug_on(F2FS_I_SB(inode), (fi->i_flags & F2FS_COMPR_FL) &&
@@ -2035,7 +2040,10 @@
inode_lock(inode);
- f2fs_disable_compressed_file(inode);
+ if (!f2fs_disable_compressed_file(inode)) {
+ ret = -EINVAL;
+ goto out;
+ }
if (f2fs_is_atomic_file(inode)) {
if (is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST))
diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c
index 6b240b7..3baa62e 100644
--- a/fs/f2fs/gc.c
+++ b/fs/f2fs/gc.c
@@ -977,7 +977,7 @@
{
struct page *node_page;
nid_t nid;
- unsigned int ofs_in_node;
+ unsigned int ofs_in_node, max_addrs;
block_t source_blkaddr;
nid = le32_to_cpu(sum->nid);
@@ -998,8 +998,18 @@
set_sbi_flag(sbi, SBI_NEED_FSCK);
}
- if (f2fs_check_nid_range(sbi, dni->ino))
+ if (f2fs_check_nid_range(sbi, dni->ino)) {
+ f2fs_put_page(node_page, 1);
return false;
+ }
+
+ max_addrs = IS_INODE(node_page) ? DEF_ADDRS_PER_INODE :
+ DEF_ADDRS_PER_BLOCK;
+ if (ofs_in_node >= max_addrs) {
+ f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%u, nid:%u, max:%u",
+ ofs_in_node, dni->ino, dni->nid, max_addrs);
+ return false;
+ }
*nofs = ofs_of_node(node_page);
source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node);
@@ -1156,7 +1166,8 @@
}
if (f2fs_is_pinned_file(inode)) {
- f2fs_pin_file_control(inode, true);
+ if (gc_type == FG_GC)
+ f2fs_pin_file_control(inode, true);
err = -EAGAIN;
goto out;
}
@@ -1738,23 +1749,31 @@
if (sync)
goto stop;
- if (has_not_enough_free_secs(sbi, sec_freed, 0)) {
- if (skipped_round <= MAX_SKIP_GC_COUNT ||
- skipped_round * 2 < round) {
- segno = NULL_SEGNO;
- goto gc_more;
- }
+ if (!has_not_enough_free_secs(sbi, sec_freed, 0))
+ goto stop;
- if (first_skipped < last_skipped &&
- (last_skipped - first_skipped) >
- sbi->skipped_gc_rwsem) {
- f2fs_drop_inmem_pages_all(sbi, true);
- segno = NULL_SEGNO;
- goto gc_more;
- }
- if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED))
+ if (skipped_round <= MAX_SKIP_GC_COUNT || skipped_round * 2 < round) {
+
+ /* Write checkpoint to reclaim prefree segments */
+ if (free_sections(sbi) < NR_CURSEG_PERSIST_TYPE &&
+ prefree_segments(sbi) &&
+ !is_sbi_flag_set(sbi, SBI_CP_DISABLED)) {
ret = f2fs_write_checkpoint(sbi, &cpc);
+ if (ret)
+ goto stop;
+ }
+ segno = NULL_SEGNO;
+ goto gc_more;
}
+ if (first_skipped < last_skipped &&
+ (last_skipped - first_skipped) >
+ sbi->skipped_gc_rwsem) {
+ f2fs_drop_inmem_pages_all(sbi, true);
+ segno = NULL_SEGNO;
+ goto gc_more;
+ }
+ if (gc_type == FG_GC && !is_sbi_flag_set(sbi, SBI_CP_DISABLED))
+ ret = f2fs_write_checkpoint(sbi, &cpc);
stop:
SIT_I(sbi)->last_victim[ALLOC_NEXT] = 0;
SIT_I(sbi)->last_victim[FLUSH_DEVICE] = init_segno;
diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c
index 1d7dafd..f97c23e 100644
--- a/fs/f2fs/inline.c
+++ b/fs/f2fs/inline.c
@@ -14,21 +14,40 @@
#include "node.h"
#include <trace/events/f2fs.h>
-bool f2fs_may_inline_data(struct inode *inode)
+static bool support_inline_data(struct inode *inode)
{
if (f2fs_is_atomic_file(inode))
return false;
-
if (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))
return false;
-
if (i_size_read(inode) > MAX_INLINE_DATA(inode))
return false;
+ return true;
+}
- if (f2fs_post_read_required(inode))
+bool f2fs_may_inline_data(struct inode *inode)
+{
+ if (!support_inline_data(inode))
return false;
- return true;
+ return !f2fs_post_read_required(inode);
+}
+
+bool f2fs_sanity_check_inline_data(struct inode *inode)
+{
+ if (!f2fs_has_inline_data(inode))
+ return false;
+
+ if (!support_inline_data(inode))
+ return true;
+
+ /*
+ * used by sanity_check_inode(), when disk layout fields has not
+ * been synchronized to inmem fields.
+ */
+ return (S_ISREG(inode->i_mode) &&
+ (file_is_encrypt(inode) || file_is_verity(inode) ||
+ (F2FS_I(inode)->i_flags & F2FS_COMPR_FL)));
}
bool f2fs_may_inline_dentry(struct inode *inode)
diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c
index a35fcf4..8775255 100644
--- a/fs/f2fs/inode.c
+++ b/fs/f2fs/inode.c
@@ -272,8 +272,7 @@
}
}
- if (f2fs_has_inline_data(inode) &&
- (!S_ISREG(inode->i_mode) && !S_ISLNK(inode->i_mode))) {
+ if (f2fs_sanity_check_inline_data(inode)) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
f2fs_warn(sbi, "%s: inode (ino=%lx, mode=%u) should not have inline_data, run fsck to fix",
__func__, inode->i_ino, inode->i_mode);
@@ -757,8 +756,22 @@
f2fs_lock_op(sbi);
err = f2fs_remove_inode_page(inode);
f2fs_unlock_op(sbi);
- if (err == -ENOENT)
+ if (err == -ENOENT) {
err = 0;
+
+ /*
+ * in fuzzed image, another node may has the same
+ * block address as inode's, if it was truncated
+ * previously, truncation of inode node will fail.
+ */
+ if (is_inode_flag_set(inode, FI_DIRTY_INODE)) {
+ f2fs_warn(F2FS_I_SB(inode),
+ "f2fs_evict_inode: inconsistent node id, ino:%lu",
+ inode->i_ino);
+ f2fs_inode_synced(inode);
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ }
+ }
}
/* give more chances, if ENOMEM case */
@@ -848,6 +861,7 @@
err = f2fs_get_node_info(sbi, inode->i_ino, &ni);
if (err) {
set_sbi_flag(sbi, SBI_NEED_FSCK);
+ set_inode_flag(inode, FI_FREE_NID);
f2fs_warn(sbi, "May loss orphan inode, run fsck to fix.");
goto out;
}
diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c
index 6ae2bea..72b1096 100644
--- a/fs/f2fs/namei.c
+++ b/fs/f2fs/namei.c
@@ -91,8 +91,6 @@
if (test_opt(sbi, INLINE_XATTR))
set_inode_flag(inode, FI_INLINE_XATTR);
- if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
- set_inode_flag(inode, FI_INLINE_DATA);
if (f2fs_may_inline_dentry(inode))
set_inode_flag(inode, FI_INLINE_DENTRY);
@@ -109,10 +107,6 @@
f2fs_init_extent_tree(inode, NULL);
- stat_inc_inline_xattr(inode);
- stat_inc_inline_inode(inode);
- stat_inc_inline_dir(inode);
-
F2FS_I(inode)->i_flags =
f2fs_mask_flags(mode, F2FS_I(dir)->i_flags & F2FS_FL_INHERITED);
@@ -129,6 +123,14 @@
set_compress_context(inode);
}
+ /* Should enable inline_data after compression set */
+ if (test_opt(sbi, INLINE_DATA) && f2fs_may_inline_data(inode))
+ set_inode_flag(inode, FI_INLINE_DATA);
+
+ stat_inc_inline_xattr(inode);
+ stat_inc_inline_inode(inode);
+ stat_inc_inline_dir(inode);
+
f2fs_set_inode_flags(inode);
trace_f2fs_new_inode(inode, 0);
@@ -317,6 +319,9 @@
if (!is_extension_exist(name, ext[i], false))
continue;
+ /* Do not use inline_data with compression */
+ stat_dec_inline_inode(inode);
+ clear_inode_flag(inode, FI_INLINE_DATA);
set_compress_context(inode);
return;
}
diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c
index 7e62580..c63274d 100644
--- a/fs/f2fs/node.c
+++ b/fs/f2fs/node.c
@@ -1238,7 +1238,11 @@
dec_valid_node_count(sbi, dn->inode, !ofs);
goto fail;
}
- f2fs_bug_on(sbi, new_ni.blk_addr != NULL_ADDR);
+ if (unlikely(new_ni.blk_addr != NULL_ADDR)) {
+ err = -EFSCORRUPTED;
+ set_sbi_flag(sbi, SBI_NEED_FSCK);
+ goto fail;
+ }
#endif
new_ni.nid = dn->nid;
new_ni.ino = dn->inode->i_ino;
@@ -2055,8 +2059,12 @@
if (wbc->sync_mode == WB_SYNC_ALL)
atomic_inc(&sbi->wb_sync_req[NODE]);
- else if (atomic_read(&sbi->wb_sync_req[NODE]))
+ else if (atomic_read(&sbi->wb_sync_req[NODE])) {
+ /* to avoid potential deadlock */
+ if (current->plug)
+ blk_finish_plug(current->plug);
goto skip_write;
+ }
trace_f2fs_writepages(mapping->host, wbc, NODE);
diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c
index 72ce131..c3c527a 100644
--- a/fs/f2fs/recovery.c
+++ b/fs/f2fs/recovery.c
@@ -437,7 +437,7 @@
struct dnode_of_data tdn = *dn;
nid_t ino, nid;
struct inode *inode;
- unsigned int offset;
+ unsigned int offset, ofs_in_node, max_addrs;
block_t bidx;
int i;
@@ -463,15 +463,24 @@
got_it:
/* Use the locked dnode page and inode */
nid = le32_to_cpu(sum.nid);
+ ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+
+ max_addrs = ADDRS_PER_PAGE(dn->node_page, dn->inode);
+ if (ofs_in_node >= max_addrs) {
+ f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u",
+ ofs_in_node, dn->inode->i_ino, nid, max_addrs);
+ return -EFSCORRUPTED;
+ }
+
if (dn->inode->i_ino == nid) {
tdn.nid = nid;
if (!dn->inode_page_locked)
lock_page(dn->inode_page);
tdn.node_page = dn->inode_page;
- tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+ tdn.ofs_in_node = ofs_in_node;
goto truncate_out;
} else if (dn->nid == nid) {
- tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
+ tdn.ofs_in_node = ofs_in_node;
goto truncate_out;
}
@@ -661,6 +670,14 @@
goto err;
}
+ if (f2fs_is_valid_blkaddr(sbi, dest,
+ DATA_GENERIC_ENHANCE_UPDATE)) {
+ f2fs_err(sbi, "Inconsistent dest blkaddr:%u, ino:%lu, ofs:%u",
+ dest, inode->i_ino, dn.ofs_in_node);
+ err = -EFSCORRUPTED;
+ goto err;
+ }
+
/* write dummy data page */
f2fs_replace_block(sbi, &dn, src, dest,
ni.version, false, false);
diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c
index d04b449..68774d6 100644
--- a/fs/f2fs/segment.c
+++ b/fs/f2fs/segment.c
@@ -356,16 +356,19 @@
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
struct list_head *head = &fi->inmem_pages;
struct inmem_pages *cur = NULL;
+ struct inmem_pages *tmp;
f2fs_bug_on(sbi, !IS_ATOMIC_WRITTEN_PAGE(page));
mutex_lock(&fi->inmem_lock);
- list_for_each_entry(cur, head, list) {
- if (cur->page == page)
+ list_for_each_entry(tmp, head, list) {
+ if (tmp->page == page) {
+ cur = tmp;
break;
+ }
}
- f2fs_bug_on(sbi, list_empty(head) || cur->page != page);
+ f2fs_bug_on(sbi, !cur);
list_del(&cur->list);
mutex_unlock(&fi->inmem_lock);
@@ -533,31 +536,38 @@
else
f2fs_build_free_nids(sbi, false, false);
- if (!is_idle(sbi, REQ_TIME) &&
- (!excess_dirty_nats(sbi) && !excess_dirty_nodes(sbi)))
+ if (excess_dirty_nats(sbi) || excess_dirty_nodes(sbi) ||
+ excess_prefree_segs(sbi))
+ goto do_sync;
+
+ /* there is background inflight IO or foreground operation recently */
+ if (is_inflight_io(sbi, REQ_TIME) ||
+ (!f2fs_time_over(sbi, REQ_TIME) && rwsem_is_locked(&sbi->cp_rwsem)))
return;
+ /* exceed periodical checkpoint timeout threshold */
+ if (f2fs_time_over(sbi, CP_TIME))
+ goto do_sync;
+
/* checkpoint is the only way to shrink partial cached entries */
- if (!f2fs_available_free_memory(sbi, NAT_ENTRIES) ||
- !f2fs_available_free_memory(sbi, INO_ENTRIES) ||
- excess_prefree_segs(sbi) ||
- excess_dirty_nats(sbi) ||
- excess_dirty_nodes(sbi) ||
- f2fs_time_over(sbi, CP_TIME)) {
- if (test_opt(sbi, DATA_FLUSH) && from_bg) {
- struct blk_plug plug;
+ if (f2fs_available_free_memory(sbi, NAT_ENTRIES) &&
+ f2fs_available_free_memory(sbi, INO_ENTRIES))
+ return;
- mutex_lock(&sbi->flush_lock);
+do_sync:
+ if (test_opt(sbi, DATA_FLUSH) && from_bg) {
+ struct blk_plug plug;
- blk_start_plug(&plug);
- f2fs_sync_dirty_inodes(sbi, FILE_INODE);
- blk_finish_plug(&plug);
+ mutex_lock(&sbi->flush_lock);
- mutex_unlock(&sbi->flush_lock);
- }
- f2fs_sync_fs(sbi->sb, true);
- stat_inc_bg_cp_count(sbi->stat_info);
+ blk_start_plug(&plug);
+ f2fs_sync_dirty_inodes(sbi, FILE_INODE, false);
+ blk_finish_plug(&plug);
+
+ mutex_unlock(&sbi->flush_lock);
}
+ f2fs_sync_fs(sbi->sb, true);
+ stat_inc_bg_cp_count(sbi->stat_info);
}
static int __submit_flush_wait(struct f2fs_sb_info *sbi,
@@ -4420,7 +4430,7 @@
unsigned int i, start, end;
unsigned int readed, start_blk = 0;
int err = 0;
- block_t total_node_blocks = 0;
+ block_t sit_valid_blocks[2] = {0, 0};
do {
readed = f2fs_ra_meta_pages(sbi, start_blk, BIO_MAX_PAGES,
@@ -4445,8 +4455,14 @@
if (err)
return err;
seg_info_from_raw_sit(se, &sit);
- if (IS_NODESEG(se->type))
- total_node_blocks += se->valid_blocks;
+
+ if (se->type >= NR_PERSISTENT_LOG) {
+ f2fs_err(sbi, "Invalid segment type: %u, segno: %u",
+ se->type, start);
+ return -EFSCORRUPTED;
+ }
+
+ sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks;
/* build discard map only one time */
if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
@@ -4484,15 +4500,22 @@
sit = sit_in_journal(journal, i);
old_valid_blocks = se->valid_blocks;
- if (IS_NODESEG(se->type))
- total_node_blocks -= old_valid_blocks;
+
+ sit_valid_blocks[SE_PAGETYPE(se)] -= old_valid_blocks;
err = check_block_count(sbi, start, &sit);
if (err)
break;
seg_info_from_raw_sit(se, &sit);
- if (IS_NODESEG(se->type))
- total_node_blocks += se->valid_blocks;
+
+ if (se->type >= NR_PERSISTENT_LOG) {
+ f2fs_err(sbi, "Invalid segment type: %u, segno: %u",
+ se->type, start);
+ err = -EFSCORRUPTED;
+ break;
+ }
+
+ sit_valid_blocks[SE_PAGETYPE(se)] += se->valid_blocks;
if (is_set_ckpt_flags(sbi, CP_TRIMMED_FLAG)) {
memset(se->discard_map, 0xff, SIT_VBLOCK_MAP_SIZE);
@@ -4512,13 +4535,24 @@
}
up_read(&curseg->journal_rwsem);
- if (!err && total_node_blocks != valid_node_count(sbi)) {
+ if (err)
+ return err;
+
+ if (sit_valid_blocks[NODE] != valid_node_count(sbi)) {
f2fs_err(sbi, "SIT is corrupted node# %u vs %u",
- total_node_blocks, valid_node_count(sbi));
- err = -EFSCORRUPTED;
+ sit_valid_blocks[NODE], valid_node_count(sbi));
+ return -EFSCORRUPTED;
}
- return err;
+ if (sit_valid_blocks[DATA] + sit_valid_blocks[NODE] >
+ valid_user_blocks(sbi)) {
+ f2fs_err(sbi, "SIT is corrupted data# %u %u vs %u",
+ sit_valid_blocks[DATA], sit_valid_blocks[NODE],
+ valid_user_blocks(sbi));
+ return -EFSCORRUPTED;
+ }
+
+ return 0;
}
static void init_free_segmap(struct f2fs_sb_info *sbi)
@@ -4650,6 +4684,13 @@
sanity_check_seg_type(sbi, curseg->seg_type);
+ if (curseg->alloc_type != LFS && curseg->alloc_type != SSR) {
+ f2fs_err(sbi,
+ "Current segment has invalid alloc_type:%d",
+ curseg->alloc_type);
+ return -EFSCORRUPTED;
+ }
+
if (f2fs_test_bit(blkofs, se->cur_valid_map))
goto out;
diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h
index beef833..eafd89f 100644
--- a/fs/f2fs/segment.h
+++ b/fs/f2fs/segment.h
@@ -24,6 +24,7 @@
#define IS_DATASEG(t) ((t) <= CURSEG_COLD_DATA)
#define IS_NODESEG(t) ((t) >= CURSEG_HOT_NODE && (t) <= CURSEG_COLD_NODE)
+#define SE_PAGETYPE(se) ((IS_NODESEG((se)->type) ? NODE : DATA))
static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi,
unsigned short seg_type)
@@ -573,11 +574,10 @@
return GET_SEC_FROM_SEG(sbi, reserved_segments(sbi));
}
-static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi)
+static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi,
+ unsigned int node_blocks, unsigned int dent_blocks)
{
- unsigned int node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) +
- get_pages(sbi, F2FS_DIRTY_DENTS);
- unsigned int dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS);
+
unsigned int segno, left_blocks;
int i;
@@ -603,19 +603,28 @@
static inline bool has_not_enough_free_secs(struct f2fs_sb_info *sbi,
int freed, int needed)
{
- int node_secs = get_blocktype_secs(sbi, F2FS_DIRTY_NODES);
- int dent_secs = get_blocktype_secs(sbi, F2FS_DIRTY_DENTS);
- int imeta_secs = get_blocktype_secs(sbi, F2FS_DIRTY_IMETA);
+ unsigned int total_node_blocks = get_pages(sbi, F2FS_DIRTY_NODES) +
+ get_pages(sbi, F2FS_DIRTY_DENTS) +
+ get_pages(sbi, F2FS_DIRTY_IMETA);
+ unsigned int total_dent_blocks = get_pages(sbi, F2FS_DIRTY_DENTS);
+ unsigned int node_secs = total_node_blocks / BLKS_PER_SEC(sbi);
+ unsigned int dent_secs = total_dent_blocks / BLKS_PER_SEC(sbi);
+ unsigned int node_blocks = total_node_blocks % BLKS_PER_SEC(sbi);
+ unsigned int dent_blocks = total_dent_blocks % BLKS_PER_SEC(sbi);
+ unsigned int free, need_lower, need_upper;
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
return false;
- if (free_sections(sbi) + freed == reserved_sections(sbi) + needed &&
- has_curseg_enough_space(sbi))
+ free = free_sections(sbi) + freed;
+ need_lower = node_secs + dent_secs + reserved_sections(sbi) + needed;
+ need_upper = need_lower + (node_blocks ? 1 : 0) + (dent_blocks ? 1 : 0);
+
+ if (free > need_upper)
return false;
- return (free_sections(sbi) + freed) <=
- (node_secs + 2 * dent_secs + imeta_secs +
- reserved_sections(sbi) + needed);
+ else if (free <= need_lower)
+ return true;
+ return !has_curseg_enough_space(sbi, node_blocks, dent_blocks);
}
static inline bool f2fs_is_checkpoint_ready(struct f2fs_sb_info *sbi)
diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c
index af98abb..fba413c 100644
--- a/fs/f2fs/super.c
+++ b/fs/f2fs/super.c
@@ -267,10 +267,10 @@
static inline void limit_reserve_root(struct f2fs_sb_info *sbi)
{
- block_t limit = min((sbi->user_block_count << 1) / 1000,
+ block_t limit = min((sbi->user_block_count >> 3),
sbi->user_block_count - sbi->reserved_blocks);
- /* limit is 0.2% */
+ /* limit is 12.5% */
if (test_opt(sbi, RESERVE_ROOT) &&
F2FS_OPTION(sbi).root_reserved_blocks > limit) {
F2FS_OPTION(sbi).root_reserved_blocks = limit;
@@ -2278,7 +2278,7 @@
struct f2fs_sb_info *sbi = F2FS_SB(sb);
struct quota_info *dqopt = sb_dqopt(sb);
int cnt;
- int ret;
+ int ret = 0;
/*
* Now when everything is written we can discard the pagecache so
@@ -2289,10 +2289,11 @@
if (type != -1 && cnt != type)
continue;
- if (!sb_has_quota_active(sb, type))
- return 0;
+ if (!sb_has_quota_active(sb, cnt))
+ continue;
- inode_lock(dqopt->files[cnt]);
+ if (!f2fs_sb_has_quota_ino(sbi))
+ inode_lock(dqopt->files[cnt]);
/*
* do_quotactl
@@ -2311,7 +2312,8 @@
up_read(&sbi->quota_sem);
f2fs_unlock_op(sbi);
- inode_unlock(dqopt->files[cnt]);
+ if (!f2fs_sb_has_quota_ino(sbi))
+ inode_unlock(dqopt->files[cnt]);
if (ret)
break;
diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c
index 7ffd4bb..a7e7d68 100644
--- a/fs/f2fs/sysfs.c
+++ b/fs/f2fs/sysfs.c
@@ -386,7 +386,7 @@
} else if (t == GC_IDLE_AT) {
if (!sbi->am.atgc_enabled)
return -EINVAL;
- sbi->gc_mode = GC_AT;
+ sbi->gc_mode = GC_IDLE_AT;
} else {
sbi->gc_mode = GC_NORMAL;
}
diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c
index 15ba369..cff94d0 100644
--- a/fs/f2fs/verity.c
+++ b/fs/f2fs/verity.c
@@ -261,13 +261,14 @@
pgoff_t index,
unsigned long num_ra_pages)
{
- DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index);
struct page *page;
index += f2fs_verity_metadata_pos(inode) >> PAGE_SHIFT;
page = find_get_page_flags(inode->i_mapping, index, FGP_ACCESSED);
if (!page || !PageUptodate(page)) {
+ DEFINE_READAHEAD(ractl, NULL, inode->i_mapping, index);
+
if (page)
put_page(page);
else if (num_ra_pages > 1)
diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c
index f7e3304..3537350 100644
--- a/fs/fat/fatent.c
+++ b/fs/fat/fatent.c
@@ -93,7 +93,8 @@
err_brelse:
brelse(bhs[0]);
err:
- fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)", (llu)blocknr);
+ fat_msg_ratelimit(sb, KERN_ERR, "FAT read failed (blocknr %llu)",
+ (llu)blocknr);
return -EIO;
}
@@ -106,8 +107,8 @@
fatent->fat_inode = MSDOS_SB(sb)->fat_inode;
fatent->bhs[0] = sb_bread(sb, blocknr);
if (!fatent->bhs[0]) {
- fat_msg(sb, KERN_ERR, "FAT read failed (blocknr %llu)",
- (llu)blocknr);
+ fat_msg_ratelimit(sb, KERN_ERR, "FAT read failed (blocknr %llu)",
+ (llu)blocknr);
return -EIO;
}
fatent->nr_bhs = 1;
diff --git a/fs/fcntl.c b/fs/fcntl.c
index 71b4353..fcf34f8 100644
--- a/fs/fcntl.c
+++ b/fs/fcntl.c
@@ -148,12 +148,17 @@
pid_t f_getown(struct file *filp)
{
- pid_t pid;
- read_lock(&filp->f_owner.lock);
- pid = pid_vnr(filp->f_owner.pid);
- if (filp->f_owner.pid_type == PIDTYPE_PGID)
- pid = -pid;
- read_unlock(&filp->f_owner.lock);
+ pid_t pid = 0;
+
+ read_lock_irq(&filp->f_owner.lock);
+ rcu_read_lock();
+ if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type)) {
+ pid = pid_vnr(filp->f_owner.pid);
+ if (filp->f_owner.pid_type == PIDTYPE_PGID)
+ pid = -pid;
+ }
+ rcu_read_unlock();
+ read_unlock_irq(&filp->f_owner.lock);
return pid;
}
@@ -200,11 +205,14 @@
static int f_getown_ex(struct file *filp, unsigned long arg)
{
struct f_owner_ex __user *owner_p = (void __user *)arg;
- struct f_owner_ex owner;
+ struct f_owner_ex owner = {};
int ret = 0;
- read_lock(&filp->f_owner.lock);
- owner.pid = pid_vnr(filp->f_owner.pid);
+ read_lock_irq(&filp->f_owner.lock);
+ rcu_read_lock();
+ if (pid_task(filp->f_owner.pid, filp->f_owner.pid_type))
+ owner.pid = pid_vnr(filp->f_owner.pid);
+ rcu_read_unlock();
switch (filp->f_owner.pid_type) {
case PIDTYPE_PID:
owner.type = F_OWNER_TID;
@@ -223,7 +231,7 @@
ret = -EINVAL;
break;
}
- read_unlock(&filp->f_owner.lock);
+ read_unlock_irq(&filp->f_owner.lock);
if (!ret) {
ret = copy_to_user(owner_p, &owner, sizeof(owner));
@@ -241,10 +249,10 @@
uid_t src[2];
int err;
- read_lock(&filp->f_owner.lock);
+ read_lock_irq(&filp->f_owner.lock);
src[0] = from_kuid(user_ns, filp->f_owner.uid);
src[1] = from_kuid(user_ns, filp->f_owner.euid);
- read_unlock(&filp->f_owner.lock);
+ read_unlock_irq(&filp->f_owner.lock);
err = put_user(src[0], &dst[0]);
err |= put_user(src[1], &dst[1]);
diff --git a/fs/file.c b/fs/file.c
index 79a76d0..8431dfd 100644
--- a/fs/file.c
+++ b/fs/file.c
@@ -85,6 +85,21 @@
copy_fd_bitmaps(nfdt, ofdt, ofdt->max_fds);
}
+/*
+ * Note how the fdtable bitmap allocations very much have to be a multiple of
+ * BITS_PER_LONG. This is not only because we walk those things in chunks of
+ * 'unsigned long' in some places, but simply because that is how the Linux
+ * kernel bitmaps are defined to work: they are not "bits in an array of bytes",
+ * they are very much "bits in an array of unsigned long".
+ *
+ * The ALIGN(nr, BITS_PER_LONG) here is for clarity: since we just multiplied
+ * by that "1024/sizeof(ptr)" before, we already know there are sufficient
+ * clear low bits. Clang seems to realize that, gcc ends up being confused.
+ *
+ * On a 128-bit machine, the ALIGN() would actually matter. In the meantime,
+ * let's consider it documentation (and maybe a test-case for gcc to improve
+ * its code generation ;)
+ */
static struct fdtable * alloc_fdtable(unsigned int nr)
{
struct fdtable *fdt;
@@ -100,6 +115,7 @@
nr /= (1024 / sizeof(struct file *));
nr = roundup_pow_of_two(nr + 1);
nr *= (1024 / sizeof(struct file *));
+ nr = ALIGN(nr, BITS_PER_LONG);
/*
* Note that this can drive nr *below* what we had passed if sysctl_nr_open
* had been set lower between the check in expand_files() and here. Deal
@@ -267,6 +283,19 @@
return i;
}
+/*
+ * Note that a sane fdtable size always has to be a multiple of
+ * BITS_PER_LONG, since we have bitmaps that are sized by this.
+ *
+ * 'max_fds' will normally already be properly aligned, but it
+ * turns out that in the close_range() -> __close_range() ->
+ * unshare_fd() -> dup_fd() -> sane_fdtable_size() we can end
+ * up having a 'max_fds' value that isn't already aligned.
+ *
+ * Rather than make close_range() have to worry about this,
+ * just make that BITS_PER_LONG alignment be part of a sane
+ * fdtable size. Becuase that's really what it is.
+ */
static unsigned int sane_fdtable_size(struct fdtable *fdt, unsigned int max_fds)
{
unsigned int count;
@@ -274,7 +303,7 @@
count = count_open_files(fdt);
if (max_fds < NR_OPEN_DEFAULT)
max_fds = NR_OPEN_DEFAULT;
- return min(count, max_fds);
+ return ALIGN(min(count, max_fds), BITS_PER_LONG);
}
/*
diff --git a/fs/file_table.c b/fs/file_table.c
index 709ada3..7a3b4a7 100644
--- a/fs/file_table.c
+++ b/fs/file_table.c
@@ -376,6 +376,7 @@
}
EXPORT_SYMBOL(fput);
+EXPORT_SYMBOL(__fput_sync);
void __init files_init(void)
{
diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c
index a086919..46c15dd 100644
--- a/fs/fs-writeback.c
+++ b/fs/fs-writeback.c
@@ -1650,11 +1650,12 @@
};
unsigned long start_time = jiffies;
long write_chunk;
- long wrote = 0; /* count both pages and inodes */
+ long total_wrote = 0; /* count both pages and inodes */
while (!list_empty(&wb->b_io)) {
struct inode *inode = wb_inode(wb->b_io.prev);
struct bdi_writeback *tmp_wb;
+ long wrote;
if (inode->i_sb != sb) {
if (work->sb) {
@@ -1730,7 +1731,9 @@
wbc_detach_inode(&wbc);
work->nr_pages -= write_chunk - wbc.nr_to_write;
- wrote += write_chunk - wbc.nr_to_write;
+ wrote = write_chunk - wbc.nr_to_write - wbc.pages_skipped;
+ wrote = wrote < 0 ? 0 : wrote;
+ total_wrote += wrote;
if (need_resched()) {
/*
@@ -1752,7 +1755,7 @@
tmp_wb = inode_to_wb_and_lock_list(inode);
spin_lock(&inode->i_lock);
if (!(inode->i_state & I_DIRTY_ALL))
- wrote++;
+ total_wrote++;
requeue_inode(inode, tmp_wb, &wbc);
inode_sync_complete(inode);
spin_unlock(&inode->i_lock);
@@ -1766,14 +1769,14 @@
* bail out to wb_writeback() often enough to check
* background threshold and other termination conditions.
*/
- if (wrote) {
+ if (total_wrote) {
if (time_is_before_jiffies(start_time + HZ / 10UL))
break;
if (work->nr_pages <= 0)
break;
}
}
- return wrote;
+ return total_wrote;
}
static long __writeback_inodes_wb(struct bdi_writeback *wb,
diff --git a/fs/fuse/control.c b/fs/fuse/control.c
index cc7e94d..24b4d9d 100644
--- a/fs/fuse/control.c
+++ b/fs/fuse/control.c
@@ -275,7 +275,7 @@
struct dentry *parent;
char name[32];
- if (!fuse_control_sb)
+ if (!fuse_control_sb || fc->no_control)
return 0;
parent = fuse_control_sb->s_root;
@@ -313,7 +313,7 @@
{
int i;
- if (!fuse_control_sb)
+ if (!fuse_control_sb || fc->no_control)
return;
for (i = fc->ctl_ndents - 1; i >= 0; i--) {
diff --git a/fs/fuse/file.c b/fs/fuse/file.c
index d1bc96e..5043895 100644
--- a/fs/fuse/file.c
+++ b/fs/fuse/file.c
@@ -3275,10 +3275,9 @@
.mode = mode
};
int err;
- bool lock_inode = !(mode & FALLOC_FL_KEEP_SIZE) ||
- (mode & FALLOC_FL_PUNCH_HOLE);
-
- bool block_faults = FUSE_IS_DAX(inode) && lock_inode;
+ bool block_faults = FUSE_IS_DAX(inode) &&
+ (!(mode & FALLOC_FL_KEEP_SIZE) ||
+ (mode & FALLOC_FL_PUNCH_HOLE));
if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE))
return -EOPNOTSUPP;
@@ -3286,22 +3285,20 @@
if (fm->fc->no_fallocate)
return -EOPNOTSUPP;
- if (lock_inode) {
- inode_lock(inode);
- if (block_faults) {
- down_write(&fi->i_mmap_sem);
- err = fuse_dax_break_layouts(inode, 0, 0);
- if (err)
- goto out;
- }
+ inode_lock(inode);
+ if (block_faults) {
+ down_write(&fi->i_mmap_sem);
+ err = fuse_dax_break_layouts(inode, 0, 0);
+ if (err)
+ goto out;
+ }
- if (mode & FALLOC_FL_PUNCH_HOLE) {
- loff_t endbyte = offset + length - 1;
+ if (mode & FALLOC_FL_PUNCH_HOLE) {
+ loff_t endbyte = offset + length - 1;
- err = fuse_writeback_range(inode, offset, endbyte);
- if (err)
- goto out;
- }
+ err = fuse_writeback_range(inode, offset, endbyte);
+ if (err)
+ goto out;
}
if (!(mode & FALLOC_FL_KEEP_SIZE) &&
@@ -3311,6 +3308,10 @@
goto out;
}
+ err = file_modified(file);
+ if (err)
+ goto out;
+
if (!(mode & FALLOC_FL_KEEP_SIZE))
set_bit(FUSE_I_SIZE_UNSTABLE, &fi->state);
@@ -3347,8 +3348,7 @@
if (block_faults)
up_write(&fi->i_mmap_sem);
- if (lock_inode)
- inode_unlock(inode);
+ inode_unlock(inode);
fuse_flush_time_update(inode);
diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c
index 5e48467..2ede05d 100644
--- a/fs/fuse/inode.c
+++ b/fs/fuse/inode.c
@@ -182,6 +182,12 @@
inode->i_uid = make_kuid(fc->user_ns, attr->uid);
inode->i_gid = make_kgid(fc->user_ns, attr->gid);
inode->i_blocks = attr->blocks;
+
+ /* Sanitize nsecs */
+ attr->atimensec = min_t(u32, attr->atimensec, NSEC_PER_SEC - 1);
+ attr->mtimensec = min_t(u32, attr->mtimensec, NSEC_PER_SEC - 1);
+ attr->ctimensec = min_t(u32, attr->ctimensec, NSEC_PER_SEC - 1);
+
inode->i_atime.tv_sec = attr->atime;
inode->i_atime.tv_nsec = attr->atimensec;
/* mtime from server may be stale due to local buffered write */
diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c
index bc26783..d5294e6 100644
--- a/fs/fuse/readdir.c
+++ b/fs/fuse/readdir.c
@@ -77,8 +77,10 @@
goto unlock;
addr = kmap_atomic(page);
- if (!offset)
+ if (!offset) {
clear_page(addr);
+ SetPageUptodate(page);
+ }
memcpy(addr + offset, dirent, reclen);
kunmap_atomic(addr);
fi->rdc.size = (index << PAGE_SHIFT) + offset + reclen;
@@ -516,6 +518,12 @@
page = find_get_page_flags(file->f_mapping, index,
FGP_ACCESSED | FGP_LOCK);
+ /* Page gone missing, then re-added to cache, but not initialized? */
+ if (page && !PageUptodate(page)) {
+ unlock_page(page);
+ put_page(page);
+ page = NULL;
+ }
spin_lock(&fi->rdc.lock);
if (!page) {
/*
diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c
index b34c029..b4fde3a 100644
--- a/fs/gfs2/bmap.c
+++ b/fs/gfs2/bmap.c
@@ -1235,13 +1235,12 @@
if (length != written && (iomap->flags & IOMAP_F_NEW)) {
/* Deallocate blocks that were just allocated. */
- loff_t blockmask = i_blocksize(inode) - 1;
- loff_t end = (pos + length) & ~blockmask;
+ loff_t hstart = round_up(pos + written, i_blocksize(inode));
+ loff_t hend = iomap->offset + iomap->length;
- pos = (pos + written + blockmask) & ~blockmask;
- if (pos < end) {
- truncate_pagecache_range(inode, pos, end - 1);
- punch_hole(ip, pos, end - pos);
+ if (hstart < hend) {
+ truncate_pagecache_range(inode, hstart, hend - 1);
+ punch_hole(ip, hstart, hend - hstart);
}
}
@@ -2200,7 +2199,7 @@
ret = do_shrink(inode, newsize);
out:
- gfs2_rs_delete(ip, NULL);
+ gfs2_rs_delete(ip);
gfs2_qa_put(ip);
return ret;
}
diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c
index cfd9d03..55a8eb3 100644
--- a/fs/gfs2/file.c
+++ b/fs/gfs2/file.c
@@ -717,7 +717,8 @@
file->private_data = NULL;
if (file->f_mode & FMODE_WRITE) {
- gfs2_rs_delete(ip, &inode->i_writecount);
+ if (gfs2_rs_active(&ip->i_res))
+ gfs2_rs_delete(ip);
gfs2_qa_put(ip);
}
return 0;
@@ -857,14 +858,16 @@
return ret;
iocb->ki_flags &= ~IOCB_DIRECT;
}
+ pagefault_disable();
iocb->ki_flags |= IOCB_NOIO;
ret = generic_file_read_iter(iocb, to);
iocb->ki_flags &= ~IOCB_NOIO;
+ pagefault_enable();
if (ret >= 0) {
if (!iov_iter_count(to))
return ret;
written = ret;
- } else {
+ } else if (ret != -EFAULT) {
if (ret != -EAGAIN)
return ret;
if (iocb->ki_flags & IOCB_NOWAIT)
diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c
index 65ae4fc..74a6b08 100644
--- a/fs/gfs2/inode.c
+++ b/fs/gfs2/inode.c
@@ -811,7 +811,7 @@
if (free_vfs_inode) /* else evict will do the put for us */
gfs2_glock_put(ip->i_gl);
}
- gfs2_rs_delete(ip, NULL);
+ gfs2_rs_deltree(&ip->i_res);
gfs2_qa_put(ip);
fail_free_acls:
posix_acl_release(default_acl);
diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c
index b9ed6a6..648f733 100644
--- a/fs/gfs2/ops_fstype.c
+++ b/fs/gfs2/ops_fstype.c
@@ -182,7 +182,10 @@
pr_warn("Invalid superblock size\n");
return -EINVAL;
}
-
+ if (sb->sb_bsize_shift != ffs(sb->sb_bsize) - 1) {
+ pr_warn("Invalid block size shift\n");
+ return -EINVAL;
+ }
return 0;
}
@@ -381,8 +384,10 @@
if (!table[0])
table = sdp->sd_vfs->s_id;
- strlcpy(sdp->sd_proto_name, proto, GFS2_FSNAME_LEN);
- strlcpy(sdp->sd_table_name, table, GFS2_FSNAME_LEN);
+ BUILD_BUG_ON(GFS2_LOCKNAME_LEN > GFS2_FSNAME_LEN);
+
+ strscpy(sdp->sd_proto_name, proto, GFS2_LOCKNAME_LEN);
+ strscpy(sdp->sd_table_name, table, GFS2_LOCKNAME_LEN);
table = sdp->sd_table_name;
while ((table = strchr(table, '/')))
@@ -1414,13 +1419,13 @@
switch (o) {
case Opt_lockproto:
- strlcpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN);
+ strscpy(args->ar_lockproto, param->string, GFS2_LOCKNAME_LEN);
break;
case Opt_locktable:
- strlcpy(args->ar_locktable, param->string, GFS2_LOCKNAME_LEN);
+ strscpy(args->ar_locktable, param->string, GFS2_LOCKNAME_LEN);
break;
case Opt_hostdata:
- strlcpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN);
+ strscpy(args->ar_hostdata, param->string, GFS2_LOCKNAME_LEN);
break;
case Opt_spectator:
args->ar_spectator = 1;
diff --git a/fs/gfs2/quota.c b/fs/gfs2/quota.c
index 6e173ae..ad953ec 100644
--- a/fs/gfs2/quota.c
+++ b/fs/gfs2/quota.c
@@ -531,34 +531,42 @@
*/
int gfs2_qa_get(struct gfs2_inode *ip)
{
- int error = 0;
struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode);
+ struct inode *inode = &ip->i_inode;
if (sdp->sd_args.ar_quota == GFS2_QUOTA_OFF)
return 0;
- down_write(&ip->i_rw_mutex);
+ spin_lock(&inode->i_lock);
if (ip->i_qadata == NULL) {
- ip->i_qadata = kmem_cache_zalloc(gfs2_qadata_cachep, GFP_NOFS);
- if (!ip->i_qadata) {
- error = -ENOMEM;
- goto out;
- }
+ struct gfs2_qadata *tmp;
+
+ spin_unlock(&inode->i_lock);
+ tmp = kmem_cache_zalloc(gfs2_qadata_cachep, GFP_NOFS);
+ if (!tmp)
+ return -ENOMEM;
+
+ spin_lock(&inode->i_lock);
+ if (ip->i_qadata == NULL)
+ ip->i_qadata = tmp;
+ else
+ kmem_cache_free(gfs2_qadata_cachep, tmp);
}
ip->i_qadata->qa_ref++;
-out:
- up_write(&ip->i_rw_mutex);
- return error;
+ spin_unlock(&inode->i_lock);
+ return 0;
}
void gfs2_qa_put(struct gfs2_inode *ip)
{
- down_write(&ip->i_rw_mutex);
+ struct inode *inode = &ip->i_inode;
+
+ spin_lock(&inode->i_lock);
if (ip->i_qadata && --ip->i_qadata->qa_ref == 0) {
kmem_cache_free(gfs2_qadata_cachep, ip->i_qadata);
ip->i_qadata = NULL;
}
- up_write(&ip->i_rw_mutex);
+ spin_unlock(&inode->i_lock);
}
int gfs2_quota_hold(struct gfs2_inode *ip, kuid_t uid, kgid_t gid)
diff --git a/fs/gfs2/rgrp.c b/fs/gfs2/rgrp.c
index 5e8eef9..c5bde78 100644
--- a/fs/gfs2/rgrp.c
+++ b/fs/gfs2/rgrp.c
@@ -664,13 +664,14 @@
/**
* gfs2_rs_delete - delete a multi-block reservation
* @ip: The inode for this reservation
- * @wcount: The inode's write count, or NULL
*
*/
-void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount)
+void gfs2_rs_delete(struct gfs2_inode *ip)
{
+ struct inode *inode = &ip->i_inode;
+
down_write(&ip->i_rw_mutex);
- if ((wcount == NULL) || (atomic_read(wcount) <= 1))
+ if (atomic_read(&inode->i_writecount) <= 1)
gfs2_rs_deltree(&ip->i_res);
up_write(&ip->i_rw_mutex);
}
@@ -905,15 +906,15 @@
rgd->rd_bitbytes = be32_to_cpu(buf.ri_bitbytes);
spin_lock_init(&rgd->rd_rsspin);
- error = compute_bitstructs(rgd);
- if (error)
- goto fail;
-
error = gfs2_glock_get(sdp, rgd->rd_addr,
&gfs2_rgrp_glops, CREATE, &rgd->rd_gl);
if (error)
goto fail;
+ error = compute_bitstructs(rgd);
+ if (error)
+ goto fail_glock;
+
rgd->rd_rgl = (struct gfs2_rgrp_lvb *)rgd->rd_gl->gl_lksb.sb_lvbptr;
rgd->rd_flags &= ~(GFS2_RDF_UPTODATE | GFS2_RDF_PREFERRED);
if (rgd->rd_data > sdp->sd_max_rg_data)
@@ -927,6 +928,7 @@
}
error = 0; /* someone else read in the rgrp; free it and ignore it */
+fail_glock:
gfs2_glock_put(rgd->rd_gl);
fail:
@@ -1389,7 +1391,8 @@
start = r.start >> bs_shift;
end = start + (r.len >> bs_shift);
- minlen = max_t(u64, r.minlen,
+ minlen = max_t(u64, r.minlen, sdp->sd_sb.sb_bsize);
+ minlen = max_t(u64, minlen,
q->limits.discard_granularity) >> bs_shift;
if (end <= start || minlen > sdp->sd_max_rg_data)
diff --git a/fs/gfs2/rgrp.h b/fs/gfs2/rgrp.h
index 9a587ad..2d3c150 100644
--- a/fs/gfs2/rgrp.h
+++ b/fs/gfs2/rgrp.h
@@ -45,7 +45,7 @@
bool dinode, u64 *generation);
extern void gfs2_rs_deltree(struct gfs2_blkreserv *rs);
-extern void gfs2_rs_delete(struct gfs2_inode *ip, atomic_t *wcount);
+extern void gfs2_rs_delete(struct gfs2_inode *ip);
extern void __gfs2_free_blocks(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
u64 bstart, u32 blen, int meta);
extern void gfs2_free_meta(struct gfs2_inode *ip, struct gfs2_rgrpd *rgd,
diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c
index d2b7ecb..d14b98a 100644
--- a/fs/gfs2/super.c
+++ b/fs/gfs2/super.c
@@ -1434,7 +1434,7 @@
truncate_inode_pages_final(&inode->i_data);
if (ip->i_qadata)
gfs2_assert_warn(sdp, ip->i_qadata->qa_ref == 0);
- gfs2_rs_delete(ip, NULL);
+ gfs2_rs_deltree(&ip->i_res);
gfs2_ordered_del_inode(ip);
clear_inode(inode);
gfs2_dir_hash_inval(ip);
diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c
index 5fc9cca..a2f43f1 100644
--- a/fs/hugetlbfs/inode.c
+++ b/fs/hugetlbfs/inode.c
@@ -206,7 +206,7 @@
info.flags = 0;
info.length = len;
info.low_limit = current->mm->mmap_base;
- info.high_limit = TASK_SIZE;
+ info.high_limit = arch_get_mmap_end(addr);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
return vm_unmapped_area(&info);
@@ -222,7 +222,7 @@
info.flags = VM_UNMAPPED_AREA_TOPDOWN;
info.length = len;
info.low_limit = max(PAGE_SIZE, mmap_min_addr);
- info.high_limit = current->mm->mmap_base;
+ info.high_limit = arch_get_mmap_base(addr, current->mm->mmap_base);
info.align_mask = PAGE_MASK & ~huge_page_mask(h);
info.align_offset = 0;
addr = vm_unmapped_area(&info);
@@ -237,7 +237,7 @@
VM_BUG_ON(addr != -ENOMEM);
info.flags = 0;
info.low_limit = current->mm->mmap_base;
- info.high_limit = TASK_SIZE;
+ info.high_limit = arch_get_mmap_end(addr);
addr = vm_unmapped_area(&info);
}
@@ -251,6 +251,7 @@
struct mm_struct *mm = current->mm;
struct vm_area_struct *vma;
struct hstate *h = hstate_file(file);
+ const unsigned long mmap_end = arch_get_mmap_end(addr);
if (len & ~huge_page_mask(h))
return -EINVAL;
@@ -266,7 +267,7 @@
if (addr) {
addr = ALIGN(addr, huge_page_size(h));
vma = find_vma(mm, addr);
- if (TASK_SIZE - len >= addr &&
+ if (mmap_end - len >= addr &&
(!vma || addr + len <= vm_start_gap(vma)))
return addr;
}
diff --git a/fs/inode.c b/fs/inode.c
index 638d5d5..9f49e0b 100644
--- a/fs/inode.c
+++ b/fs/inode.c
@@ -168,8 +168,6 @@
inode->i_wb_frn_history = 0;
#endif
- if (security_inode_alloc(inode))
- goto out;
spin_lock_init(&inode->i_lock);
lockdep_set_class(&inode->i_lock, &sb->s_type->i_lock_key);
@@ -202,11 +200,12 @@
inode->i_fsnotify_mask = 0;
#endif
inode->i_flctx = NULL;
+
+ if (unlikely(security_inode_alloc(inode)))
+ return -ENOMEM;
this_cpu_inc(nr_inodes);
return 0;
-out:
- return -ENOMEM;
}
EXPORT_SYMBOL(inode_init_always);
diff --git a/fs/io_uring.c b/fs/io_uring.c
index fd188b9..d1cb1ad 100644
--- a/fs/io_uring.c
+++ b/fs/io_uring.c
@@ -696,6 +696,8 @@
*/
struct list_head inflight_entry;
+ struct list_head iopoll_entry;
+
struct percpu_ref *fixed_file_refs;
struct callback_head task_work;
/* for polled requests, i.e. IORING_OP_POLL_ADD and async armed poll */
@@ -773,7 +775,8 @@
.buffer_select = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
+ IO_WQ_WORK_FILES,
},
[IORING_OP_WRITEV] = {
.needs_file = 1,
@@ -783,7 +786,7 @@
.needs_async_data = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FSIZE,
+ IO_WQ_WORK_FSIZE | IO_WQ_WORK_FILES,
},
[IORING_OP_FSYNC] = {
.needs_file = 1,
@@ -794,7 +797,8 @@
.unbound_nonreg_file = 1,
.pollin = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM,
+ .work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_MM |
+ IO_WQ_WORK_FILES,
},
[IORING_OP_WRITE_FIXED] = {
.needs_file = 1,
@@ -803,7 +807,7 @@
.pollout = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_BLKCG | IO_WQ_WORK_FSIZE |
- IO_WQ_WORK_MM,
+ IO_WQ_WORK_MM | IO_WQ_WORK_FILES,
},
[IORING_OP_POLL_ADD] = {
.needs_file = 1,
@@ -857,7 +861,7 @@
.pollout = 1,
.needs_async_data = 1,
.async_size = sizeof(struct io_async_connect),
- .work_flags = IO_WQ_WORK_MM,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_FS,
},
[IORING_OP_FALLOCATE] = {
.needs_file = 1,
@@ -885,7 +889,8 @@
.pollin = 1,
.buffer_select = 1,
.async_size = sizeof(struct io_async_rw),
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
+ IO_WQ_WORK_FILES,
},
[IORING_OP_WRITE] = {
.needs_file = 1,
@@ -894,7 +899,7 @@
.pollout = 1,
.async_size = sizeof(struct io_async_rw),
.work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
- IO_WQ_WORK_FSIZE,
+ IO_WQ_WORK_FSIZE | IO_WQ_WORK_FILES,
},
[IORING_OP_FADVISE] = {
.needs_file = 1,
@@ -907,14 +912,16 @@
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollout = 1,
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
+ IO_WQ_WORK_FS,
},
[IORING_OP_RECV] = {
.needs_file = 1,
.unbound_nonreg_file = 1,
.pollin = 1,
.buffer_select = 1,
- .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG,
+ .work_flags = IO_WQ_WORK_MM | IO_WQ_WORK_BLKCG |
+ IO_WQ_WORK_FS,
},
[IORING_OP_OPENAT2] = {
.work_flags = IO_WQ_WORK_FILES | IO_WQ_WORK_FS |
@@ -1156,7 +1163,7 @@
*/
static inline void io_req_init_async(struct io_kiocb *req)
{
- struct io_uring_task *tctx = current->io_uring;
+ struct io_uring_task *tctx = req->task->io_uring;
if (req->flags & REQ_F_WORK_INITIALIZED)
return;
@@ -1318,7 +1325,7 @@
*/
static bool io_identity_cow(struct io_kiocb *req)
{
- struct io_uring_task *tctx = current->io_uring;
+ struct io_uring_task *tctx = req->task->io_uring;
const struct cred *creds = NULL;
struct io_identity *id;
@@ -1556,6 +1563,7 @@
static void io_flush_timeouts(struct io_ring_ctx *ctx)
{
+ struct io_kiocb *req, *tmp;
u32 seq;
if (list_empty(&ctx->timeout_list))
@@ -1563,10 +1571,8 @@
seq = ctx->cached_cq_tail - atomic_read(&ctx->cq_timeouts);
- do {
+ list_for_each_entry_safe(req, tmp, &ctx->timeout_list, timeout.list) {
u32 events_needed, events_got;
- struct io_kiocb *req = list_first_entry(&ctx->timeout_list,
- struct io_kiocb, timeout.list);
if (io_is_timeout_noseq(req))
break;
@@ -1583,9 +1589,8 @@
if (events_got < events_needed)
break;
- list_del_init(&req->timeout.list);
io_kill_timeout(req, 0);
- } while (!list_empty(&ctx->timeout_list));
+ }
ctx->cq_last_tm_flush = seq;
}
@@ -2347,8 +2352,8 @@
struct io_kiocb *req;
do {
- req = list_first_entry(again, struct io_kiocb, inflight_entry);
- list_del(&req->inflight_entry);
+ req = list_first_entry(again, struct io_kiocb, iopoll_entry);
+ list_del(&req->iopoll_entry);
__io_complete_rw(req, -EAGAIN, 0, NULL);
} while (!list_empty(again));
}
@@ -2370,14 +2375,14 @@
while (!list_empty(done)) {
int cflags = 0;
- req = list_first_entry(done, struct io_kiocb, inflight_entry);
+ req = list_first_entry(done, struct io_kiocb, iopoll_entry);
if (READ_ONCE(req->result) == -EAGAIN) {
req->result = 0;
req->iopoll_completed = 0;
- list_move_tail(&req->inflight_entry, &again);
+ list_move_tail(&req->iopoll_entry, &again);
continue;
}
- list_del(&req->inflight_entry);
+ list_del(&req->iopoll_entry);
if (req->flags & REQ_F_BUFFER_SELECTED)
cflags = io_put_rw_kbuf(req);
@@ -2413,7 +2418,7 @@
spin = !ctx->poll_multi_file && *nr_events < min;
ret = 0;
- list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, inflight_entry) {
+ list_for_each_entry_safe(req, tmp, &ctx->iopoll_list, iopoll_entry) {
struct kiocb *kiocb = &req->rw.kiocb;
/*
@@ -2422,7 +2427,7 @@
* and complete those lists first, if we have entries there.
*/
if (READ_ONCE(req->iopoll_completed)) {
- list_move_tail(&req->inflight_entry, &done);
+ list_move_tail(&req->iopoll_entry, &done);
continue;
}
if (!list_empty(&done))
@@ -2434,7 +2439,7 @@
/* iopoll may have completed current req */
if (READ_ONCE(req->iopoll_completed))
- list_move_tail(&req->inflight_entry, &done);
+ list_move_tail(&req->iopoll_entry, &done);
if (ret && spin)
spin = false;
@@ -2581,45 +2586,6 @@
#ifdef CONFIG_BLOCK
static bool io_resubmit_prep(struct io_kiocb *req, int error)
{
- struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
- ssize_t ret = -ECANCELED;
- struct iov_iter iter;
- int rw;
-
- if (error) {
- ret = error;
- goto end_req;
- }
-
- switch (req->opcode) {
- case IORING_OP_READV:
- case IORING_OP_READ_FIXED:
- case IORING_OP_READ:
- rw = READ;
- break;
- case IORING_OP_WRITEV:
- case IORING_OP_WRITE_FIXED:
- case IORING_OP_WRITE:
- rw = WRITE;
- break;
- default:
- printk_once(KERN_WARNING "io_uring: bad opcode in resubmit %d\n",
- req->opcode);
- goto end_req;
- }
-
- if (!req->async_data) {
- ret = io_import_iovec(rw, req, &iovec, &iter, false);
- if (ret < 0)
- goto end_req;
- ret = io_setup_async_rw(req, iovec, inline_vecs, &iter, false);
- if (!ret)
- return true;
- kfree(iovec);
- } else {
- return true;
- }
-end_req:
req_set_fail_links(req);
return false;
}
@@ -2706,7 +2672,7 @@
struct io_kiocb *list_req;
list_req = list_first_entry(&ctx->iopoll_list, struct io_kiocb,
- inflight_entry);
+ iopoll_entry);
if (list_req->file != req->file)
ctx->poll_multi_file = true;
}
@@ -2716,9 +2682,9 @@
* it to the front so we find it first.
*/
if (READ_ONCE(req->iopoll_completed))
- list_add(&req->inflight_entry, &ctx->iopoll_list);
+ list_add(&req->iopoll_entry, &ctx->iopoll_list);
else
- list_add_tail(&req->inflight_entry, &ctx->iopoll_list);
+ list_add_tail(&req->iopoll_entry, &ctx->iopoll_list);
if ((ctx->flags & IORING_SETUP_SQPOLL) &&
wq_has_sleeper(&ctx->sq_data->wait))
@@ -3220,13 +3186,15 @@
ret = nr;
break;
}
+ ret += nr;
if (!iov_iter_is_bvec(iter)) {
iov_iter_advance(iter, nr);
} else {
- req->rw.len -= nr;
req->rw.addr += nr;
+ req->rw.len -= nr;
+ if (!req->rw.len)
+ break;
}
- ret += nr;
if (nr != iovec.iov_len)
break;
}
@@ -3428,6 +3396,7 @@
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw.kiocb;
struct iov_iter __iter, *iter = &__iter;
+ struct iov_iter iter_cp;
struct io_async_rw *rw = req->async_data;
ssize_t io_size, ret, ret2;
bool no_async;
@@ -3438,6 +3407,7 @@
ret = io_import_iovec(READ, req, &iovec, iter, !force_nonblock);
if (ret < 0)
return ret;
+ iter_cp = *iter;
io_size = iov_iter_count(iter);
req->result = io_size;
ret = 0;
@@ -3473,7 +3443,7 @@
if (req->file->f_flags & O_NONBLOCK)
goto done;
/* some cases will consume bytes even on error returns */
- iov_iter_revert(iter, io_size - iov_iter_count(iter));
+ *iter = iter_cp;
ret = 0;
goto copy_iov;
} else if (ret < 0) {
@@ -3556,6 +3526,7 @@
struct iovec inline_vecs[UIO_FASTIOV], *iovec = inline_vecs;
struct kiocb *kiocb = &req->rw.kiocb;
struct iov_iter __iter, *iter = &__iter;
+ struct iov_iter iter_cp;
struct io_async_rw *rw = req->async_data;
ssize_t ret, ret2, io_size;
@@ -3565,6 +3536,7 @@
ret = io_import_iovec(WRITE, req, &iovec, iter, !force_nonblock);
if (ret < 0)
return ret;
+ iter_cp = *iter;
io_size = iov_iter_count(iter);
req->result = io_size;
@@ -3626,7 +3598,7 @@
} else {
copy_iov:
/* some cases will consume bytes even on error returns */
- iov_iter_revert(iter, io_size - iov_iter_count(iter));
+ *iter = iter_cp;
ret = io_setup_async_rw(req, iovec, inline_vecs, iter, false);
if (!ret)
return -EAGAIN;
@@ -4252,12 +4224,8 @@
struct io_statx *ctx = &req->statx;
int ret;
- if (force_nonblock) {
- /* only need file table for an actual valid fd */
- if (ctx->dfd == -1 || ctx->dfd == AT_FDCWD)
- req->flags |= REQ_F_NO_FILE_TABLE;
+ if (force_nonblock)
return -EAGAIN;
- }
ret = do_statx(ctx->dfd, ctx->filename, ctx->flags, ctx->mask,
ctx->buffer);
@@ -4398,6 +4366,8 @@
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
+ if (unlikely(sqe->addr2 || sqe->splice_fd_in || sqe->ioprio))
+ return -EINVAL;
sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
@@ -4633,6 +4603,8 @@
if (unlikely(req->ctx->flags & IORING_SETUP_IOPOLL))
return -EINVAL;
+ if (unlikely(sqe->addr2 || sqe->splice_fd_in || sqe->ioprio))
+ return -EINVAL;
sr->msg_flags = READ_ONCE(sqe->msg_flags);
sr->umsg = u64_to_user_ptr(READ_ONCE(sqe->addr));
@@ -5226,6 +5198,11 @@
struct io_ring_ctx *ctx = req->ctx;
bool cancel = false;
+ if (req->file->f_op->may_pollfree) {
+ spin_lock_irq(&ctx->completion_lock);
+ return -EOPNOTSUPP;
+ }
+
INIT_HLIST_NODE(&req->hash_node);
io_init_poll_iocb(poll, mask, wake_func);
poll->file = req->file;
@@ -5637,6 +5614,7 @@
else
data->mode = HRTIMER_MODE_REL;
+ INIT_LIST_HEAD(&req->timeout.list);
hrtimer_init(&data->timer, CLOCK_MONOTONIC, data->mode);
return 0;
}
@@ -6280,12 +6258,12 @@
if (!list_empty(&req->link_list)) {
prev = list_entry(req->link_list.prev, struct io_kiocb,
link_list);
- if (refcount_inc_not_zero(&prev->refs))
- list_del_init(&req->link_list);
- else
+ list_del_init(&req->link_list);
+ if (!refcount_inc_not_zero(&prev->refs))
prev = NULL;
}
+ list_del(&req->timeout.list);
spin_unlock_irqrestore(&ctx->completion_lock, flags);
if (prev) {
@@ -7323,6 +7301,7 @@
}
skb->sk = sk;
+ skb->scm_io_uring = 1;
nr_files = 0;
fpl->user = get_uid(ctx->user);
@@ -7344,10 +7323,15 @@
refcount_add(skb->truesize, &sk->sk_wmem_alloc);
skb_queue_head(&sk->sk_receive_queue, skb);
- for (i = 0; i < nr_files; i++)
- fput(fpl->fp[i]);
+ for (i = 0; i < nr; i++) {
+ struct file *file = io_file_from_index(ctx, i + offset);
+
+ if (file)
+ fput(file);
+ }
} else {
kfree_skb(skb);
+ free_uid(fpl->user);
kfree(fpl);
}
@@ -8453,8 +8437,6 @@
if (ctx->sqo_task) {
put_task_struct(ctx->sqo_task);
ctx->sqo_task = NULL;
- mmdrop(ctx->mm_account);
- ctx->mm_account = NULL;
}
#ifdef CONFIG_BLK_CGROUP
@@ -8473,6 +8455,11 @@
}
#endif
+ if (ctx->mm_account) {
+ mmdrop(ctx->mm_account);
+ ctx->mm_account = NULL;
+ }
+
io_mem_free(ctx->rings);
io_mem_free(ctx->sq_sqes);
@@ -9051,7 +9038,7 @@
if (unlikely(ctx->sqo_dead)) {
ret = -EOWNERDEAD;
- goto out;
+ break;
}
if (!io_sqring_full(ctx))
@@ -9061,7 +9048,6 @@
} while (!signal_pending(current));
finish_wait(&ctx->sqo_sq_wait, &wait);
-out:
return ret;
}
diff --git a/fs/ioctl.c b/fs/ioctl.c
index 4e6cc0a..7bcc600 100644
--- a/fs/ioctl.c
+++ b/fs/ioctl.c
@@ -170,7 +170,7 @@
if (*len == 0)
return -EINVAL;
- if (start > maxbytes)
+ if (start >= maxbytes)
return -EFBIG;
/*
diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c
index cd9f7ba..86297f5 100644
--- a/fs/iomap/buffered-io.c
+++ b/fs/iomap/buffered-io.c
@@ -528,7 +528,8 @@
* write started inside the existing inode size.
*/
if (pos + len > i_size)
- truncate_pagecache_range(inode, max(pos, i_size), pos + len);
+ truncate_pagecache_range(inode, max(pos, i_size),
+ pos + len - 1);
}
static int
@@ -1460,13 +1461,6 @@
goto redirty;
/*
- * Given that we do not allow direct reclaim to call us, we should
- * never be called in a recursive filesystem reclaim context.
- */
- if (WARN_ON_ONCE(current->flags & PF_MEMALLOC_NOFS))
- goto redirty;
-
- /*
* Is this page beyond the end of the file?
*
* The page index is less than the end_index, adjust the end_offset
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index b121d7d..fa24b40 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -501,7 +501,6 @@
}
spin_unlock(&commit_transaction->t_handle_lock);
commit_transaction->t_state = T_SWITCH;
- write_unlock(&journal->j_state_lock);
J_ASSERT (atomic_read(&commit_transaction->t_outstanding_credits) <=
journal->j_max_transaction_buffers);
@@ -521,6 +520,8 @@
* has reserved. This is consistent with the existing behaviour
* that multiple jbd2_journal_get_write_access() calls to the same
* buffer are perfectly permissible.
+ * We use journal->j_state_lock here to serialize processing of
+ * t_reserved_list with eviction of buffers from journal_unmap_buffer().
*/
while (commit_transaction->t_reserved_list) {
jh = commit_transaction->t_reserved_list;
@@ -540,6 +541,7 @@
jbd2_journal_refile_buffer(journal, jh);
}
+ write_unlock(&journal->j_state_lock);
/*
* Now try to drop any written-back buffers from the journal's
* checkpoint lists. We do this *before* commit because it potentially
@@ -562,13 +564,13 @@
*/
jbd2_journal_switch_revoke_table(journal);
+ write_lock(&journal->j_state_lock);
/*
* Reserved credits cannot be claimed anymore, free them
*/
atomic_sub(atomic_read(&journal->j_reserved_credits),
&commit_transaction->t_outstanding_credits);
- write_lock(&journal->j_state_lock);
trace_jbd2_commit_flushing(journal, commit_transaction);
stats.run.rs_flushing = jiffies;
stats.run.rs_locked = jbd2_time_diff(stats.run.rs_locked,
@@ -579,7 +581,7 @@
journal->j_running_transaction = NULL;
start_time = ktime_get();
commit_transaction->t_log_start = journal->j_head;
- wake_up(&journal->j_wait_transaction_locked);
+ wake_up_all(&journal->j_wait_transaction_locked);
write_unlock(&journal->j_state_lock);
jbd_debug(3, "JBD2: commit phase 2a\n");
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index b748329..6689d23 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -924,10 +924,16 @@
for (i = j_fc_off - 1; i >= j_fc_off - num_blks; i--) {
bh = journal->j_fc_wbuf[i];
wait_on_buffer(bh);
+ /*
+ * Update j_fc_off so jbd2_fc_release_bufs can release remain
+ * buffer head.
+ */
+ if (unlikely(!buffer_uptodate(bh))) {
+ journal->j_fc_off = i + 1;
+ return -EIO;
+ }
put_bh(bh);
journal->j_fc_wbuf[i] = NULL;
- if (unlikely(!buffer_uptodate(bh)))
- return -EIO;
}
return 0;
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index 1e07dfa..1ae1697 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -256,6 +256,7 @@
err = journal->j_fc_replay_callback(journal, bh, pass,
next_fc_block - journal->j_fc_first,
expected_commit_id);
+ brelse(bh);
next_fc_block++;
if (err < 0 || err == JBD2_FC_REPLAY_STOP)
break;
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index e8fc45f..8647221 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -173,7 +173,7 @@
int need_to_start;
tid_t tid = journal->j_running_transaction->t_tid;
- prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
+ prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait,
TASK_UNINTERRUPTIBLE);
need_to_start = !tid_geq(journal->j_commit_request, tid);
read_unlock(&journal->j_state_lock);
@@ -199,7 +199,7 @@
read_unlock(&journal->j_state_lock);
return;
}
- prepare_to_wait(&journal->j_wait_transaction_locked, &wait,
+ prepare_to_wait_exclusive(&journal->j_wait_transaction_locked, &wait,
TASK_UNINTERRUPTIBLE);
read_unlock(&journal->j_state_lock);
/*
@@ -894,7 +894,7 @@
write_lock(&journal->j_state_lock);
--journal->j_barrier_count;
write_unlock(&journal->j_state_lock);
- wake_up(&journal->j_wait_transaction_locked);
+ wake_up_all(&journal->j_wait_transaction_locked);
}
static void warn_dirty_buffer(struct buffer_head *bh)
@@ -1460,8 +1460,6 @@
struct journal_head *jh;
int ret = 0;
- if (is_handle_aborted(handle))
- return -EROFS;
if (!buffer_jbd(bh))
return -EUCLEAN;
@@ -1508,6 +1506,18 @@
journal = transaction->t_journal;
spin_lock(&jh->b_state_lock);
+ if (is_handle_aborted(handle)) {
+ /*
+ * Check journal aborting with @jh->b_state_lock locked,
+ * since 'jh->b_transaction' could be replaced with
+ * 'jh->b_next_transaction' during old transaction
+ * committing if journal aborted, which may fail
+ * assertion on 'jh->b_frozen_data == NULL'.
+ */
+ ret = -EROFS;
+ goto out_unlock_bh;
+ }
+
if (jh->b_modified == 0) {
/*
* This buffer's got modified and becoming part
diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c
index b288c8a..837cd55 100644
--- a/fs/jffs2/build.c
+++ b/fs/jffs2/build.c
@@ -415,13 +415,15 @@
jffs2_free_ino_caches(c);
jffs2_free_raw_node_refs(c);
ret = -EIO;
- goto out_free;
+ goto out_sum_exit;
}
jffs2_calc_trigger_levels(c);
return 0;
+ out_sum_exit:
+ jffs2_sum_exit(c);
out_free:
kvfree(c->blocks);
diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c
index 78858f6..db21098 100644
--- a/fs/jffs2/fs.c
+++ b/fs/jffs2/fs.c
@@ -602,8 +602,9 @@
jffs2_free_ino_caches(c);
jffs2_free_raw_node_refs(c);
kvfree(c->blocks);
- out_inohash:
jffs2_clear_xattr_subsystem(c);
+ jffs2_sum_exit(c);
+ out_inohash:
kfree(c->inocache_list);
out_wbuf:
jffs2_flash_cleanup(c);
diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c
index b676056..29671e3 100644
--- a/fs/jffs2/scan.c
+++ b/fs/jffs2/scan.c
@@ -136,7 +136,7 @@
if (!s) {
JFFS2_WARNING("Can't allocate memory for summary\n");
ret = -ENOMEM;
- goto out;
+ goto out_buf;
}
}
@@ -275,13 +275,15 @@
}
ret = 0;
out:
+ jffs2_sum_reset_collected(s);
+ kfree(s);
+ out_buf:
if (buf_size)
kfree(flashbuf);
#ifndef __ECOS
else
mtd_unpoint(c->mtd, 0, c->mtd->size);
#endif
- kfree(s);
return ret;
}
diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c
index b0eb9c8..980aa33 100644
--- a/fs/jfs/inode.c
+++ b/fs/jfs/inode.c
@@ -146,12 +146,13 @@
dquot_initialize(inode);
if (JFS_IP(inode)->fileset == FILESYSTEM_I) {
+ struct inode *ipimap = JFS_SBI(inode->i_sb)->ipimap;
truncate_inode_pages_final(&inode->i_data);
if (test_cflag(COMMIT_Freewmap, inode))
jfs_free_zero_link(inode);
- if (JFS_SBI(inode->i_sb)->ipimap)
+ if (ipimap && JFS_IP(ipimap)->i_imap)
diFree(inode);
/*
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index aedad59..0ce17ea 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -148,6 +148,7 @@
* 0 - success
* -ENOMEM - insufficient memory
* -EIO - i/o error
+ * -EINVAL - wrong bmap data
*/
int dbMount(struct inode *ipbmap)
{
@@ -179,6 +180,12 @@
bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree);
bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage);
bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag);
+ if (!bmp->db_numag) {
+ release_metapage(mp);
+ kfree(bmp);
+ return -EINVAL;
+ }
+
bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel);
bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag);
bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref);
@@ -378,7 +385,8 @@
}
/* write the last buffer. */
- write_metapage(mp);
+ if (mp)
+ write_metapage(mp);
IREAD_UNLOCK(ipbmap);
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index 9aec80b..8b3c86a 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -19,7 +19,15 @@
DEFINE_MUTEX(kernfs_mutex);
static DEFINE_SPINLOCK(kernfs_rename_lock); /* kn->parent and ->name */
-static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by rename_lock */
+/*
+ * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to
+ * call pr_cont() while holding rename_lock. Because sometimes pr_cont()
+ * will perform wakeups when releasing console_sem. Holding rename_lock
+ * will introduce deadlock if the scheduler reads the kernfs_name in the
+ * wakeup path.
+ */
+static DEFINE_SPINLOCK(kernfs_pr_cont_lock);
+static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by pr_cont_lock */
static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */
#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
@@ -230,12 +238,12 @@
{
unsigned long flags;
- spin_lock_irqsave(&kernfs_rename_lock, flags);
+ spin_lock_irqsave(&kernfs_pr_cont_lock, flags);
- kernfs_name_locked(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
+ kernfs_name(kn, kernfs_pr_cont_buf, sizeof(kernfs_pr_cont_buf));
pr_cont("%s", kernfs_pr_cont_buf);
- spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+ spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags);
}
/**
@@ -249,10 +257,10 @@
unsigned long flags;
int sz;
- spin_lock_irqsave(&kernfs_rename_lock, flags);
+ spin_lock_irqsave(&kernfs_pr_cont_lock, flags);
- sz = kernfs_path_from_node_locked(kn, NULL, kernfs_pr_cont_buf,
- sizeof(kernfs_pr_cont_buf));
+ sz = kernfs_path_from_node(kn, NULL, kernfs_pr_cont_buf,
+ sizeof(kernfs_pr_cont_buf));
if (sz < 0) {
pr_cont("(error)");
goto out;
@@ -266,7 +274,7 @@
pr_cont("%s", kernfs_pr_cont_buf);
out:
- spin_unlock_irqrestore(&kernfs_rename_lock, flags);
+ spin_unlock_irqrestore(&kernfs_pr_cont_lock, flags);
}
/**
@@ -864,13 +872,12 @@
lockdep_assert_held(&kernfs_mutex);
- /* grab kernfs_rename_lock to piggy back on kernfs_pr_cont_buf */
- spin_lock_irq(&kernfs_rename_lock);
+ spin_lock_irq(&kernfs_pr_cont_lock);
len = strlcpy(kernfs_pr_cont_buf, path, sizeof(kernfs_pr_cont_buf));
if (len >= sizeof(kernfs_pr_cont_buf)) {
- spin_unlock_irq(&kernfs_rename_lock);
+ spin_unlock_irq(&kernfs_pr_cont_lock);
return NULL;
}
@@ -882,7 +889,7 @@
parent = kernfs_find_ns(parent, name, ns);
}
- spin_unlock_irq(&kernfs_rename_lock);
+ spin_unlock_irq(&kernfs_pr_cont_lock);
return parent;
}
@@ -1512,8 +1519,11 @@
mutex_lock(&kernfs_mutex);
kn = kernfs_find_ns(parent, name, ns);
- if (kn)
+ if (kn) {
+ kernfs_get(kn);
__kernfs_remove(kn);
+ kernfs_put(kn);
+ }
mutex_unlock(&kernfs_mutex);
diff --git a/fs/minix/inode.c b/fs/minix/inode.c
index 34f5464..e938f5b 100644
--- a/fs/minix/inode.c
+++ b/fs/minix/inode.c
@@ -446,7 +446,8 @@
.writepage = minix_writepage,
.write_begin = minix_write_begin,
.write_end = generic_write_end,
- .bmap = minix_bmap
+ .bmap = minix_bmap,
+ .direct_IO = noop_direct_IO
};
static const struct inode_operations minix_symlink_inode_operations = {
diff --git a/fs/namei.c b/fs/namei.c
index 72f354b..4375565 100644
--- a/fs/namei.c
+++ b/fs/namei.c
@@ -1348,6 +1348,8 @@
* becoming unpinned.
*/
flags = dentry->d_flags;
+ if (read_seqretry(&mount_lock, nd->m_seq))
+ return false;
continue;
}
if (read_seqretry(&mount_lock, nd->m_seq))
@@ -3272,6 +3274,8 @@
child = d_alloc(dentry, &slash_name);
if (unlikely(!child))
goto out_err;
+ if (!IS_POSIXACL(dir))
+ mode &= ~current_umask();
error = dir->i_op->tmpfile(dir, child, mode);
if (error)
goto out_err;
@@ -4629,7 +4633,7 @@
{
struct address_space *mapping = inode->i_mapping;
struct page *page;
- void *fsdata;
+ void *fsdata = NULL;
int err;
unsigned int flags = 0;
if (nofs)
diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c
index b44219c..bfdd212 100644
--- a/fs/nfs/callback_proc.c
+++ b/fs/nfs/callback_proc.c
@@ -283,6 +283,7 @@
rv = NFS4_OK;
break;
case -ENOENT:
+ set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
/* Embrace your forgetfulness! */
rv = NFS4ERR_NOMATCHING_LAYOUT;
@@ -353,12 +354,11 @@
struct cb_process_state *cps)
{
struct cb_devicenotifyargs *args = argp;
+ const struct pnfs_layoutdriver_type *ld = NULL;
uint32_t i;
__be32 res = 0;
- struct nfs_client *clp = cps->clp;
- struct nfs_server *server = NULL;
- if (!clp) {
+ if (!cps->clp) {
res = cpu_to_be32(NFS4ERR_OP_NOT_IN_SESSION);
goto out;
}
@@ -366,23 +366,15 @@
for (i = 0; i < args->ndevs; i++) {
struct cb_devicenotifyitem *dev = &args->devs[i];
- if (!server ||
- server->pnfs_curr_ld->id != dev->cbd_layout_type) {
- rcu_read_lock();
- list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link)
- if (server->pnfs_curr_ld &&
- server->pnfs_curr_ld->id == dev->cbd_layout_type) {
- rcu_read_unlock();
- goto found;
- }
- rcu_read_unlock();
- continue;
+ if (!ld || ld->id != dev->cbd_layout_type) {
+ pnfs_put_layoutdriver(ld);
+ ld = pnfs_find_layoutdriver(dev->cbd_layout_type);
+ if (!ld)
+ continue;
}
-
- found:
- nfs4_delete_deviceid(server->pnfs_curr_ld, clp, &dev->cbd_dev_id);
+ nfs4_delete_deviceid(ld, cps->clp, &dev->cbd_dev_id);
}
-
+ pnfs_put_layoutdriver(ld);
out:
kfree(args->devs);
return res;
diff --git a/fs/nfs/callback_xdr.c b/fs/nfs/callback_xdr.c
index 1725079..ca8a4aa 100644
--- a/fs/nfs/callback_xdr.c
+++ b/fs/nfs/callback_xdr.c
@@ -272,10 +272,6 @@
n = ntohl(*p++);
if (n == 0)
goto out;
- if (n > ULONG_MAX / sizeof(*args->devs)) {
- status = htonl(NFS4ERR_BADXDR);
- goto out;
- }
args->devs = kmalloc_array(n, sizeof(*args->devs), GFP_KERNEL);
if (!args->devs) {
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index d6ac2c4..1eb6c7a 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -228,8 +228,7 @@
*
*/
void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred,
- fmode_t type,
- const nfs4_stateid *stateid,
+ fmode_t type, const nfs4_stateid *stateid,
unsigned long pagemod_limit)
{
struct nfs_delegation *delegation;
@@ -239,25 +238,24 @@
delegation = rcu_dereference(NFS_I(inode)->delegation);
if (delegation != NULL) {
spin_lock(&delegation->lock);
- if (nfs4_is_valid_delegation(delegation, 0)) {
- nfs4_stateid_copy(&delegation->stateid, stateid);
- delegation->type = type;
- delegation->pagemod_limit = pagemod_limit;
- oldcred = delegation->cred;
- delegation->cred = get_cred(cred);
- clear_bit(NFS_DELEGATION_NEED_RECLAIM,
- &delegation->flags);
- spin_unlock(&delegation->lock);
- rcu_read_unlock();
- put_cred(oldcred);
- trace_nfs4_reclaim_delegation(inode, type);
- return;
- }
- /* We appear to have raced with a delegation return. */
+ nfs4_stateid_copy(&delegation->stateid, stateid);
+ delegation->type = type;
+ delegation->pagemod_limit = pagemod_limit;
+ oldcred = delegation->cred;
+ delegation->cred = get_cred(cred);
+ clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
+ if (test_and_clear_bit(NFS_DELEGATION_REVOKED,
+ &delegation->flags))
+ atomic_long_inc(&nfs_active_delegations);
spin_unlock(&delegation->lock);
+ rcu_read_unlock();
+ put_cred(oldcred);
+ trace_nfs4_reclaim_delegation(inode, type);
+ } else {
+ rcu_read_unlock();
+ nfs_inode_set_delegation(inode, cred, type, stateid,
+ pagemod_limit);
}
- rcu_read_unlock();
- nfs_inode_set_delegation(inode, cred, type, stateid, pagemod_limit);
}
static int nfs_do_return_delegation(struct inode *inode, struct nfs_delegation *delegation, int issync)
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 2ad56ff..9f88ca7 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -1628,16 +1628,6 @@
};
EXPORT_SYMBOL_GPL(nfs4_dentry_operations);
-static fmode_t flags_to_mode(int flags)
-{
- fmode_t res = (__force fmode_t)flags & FMODE_EXEC;
- if ((flags & O_ACCMODE) != O_WRONLY)
- res |= FMODE_READ;
- if ((flags & O_ACCMODE) != O_RDONLY)
- res |= FMODE_WRITE;
- return res;
-}
-
static struct nfs_open_context *create_nfs_open_context(struct dentry *dentry, int open_flags, struct file *filp)
{
return alloc_nfs_open_context(dentry, flags_to_mode(open_flags), filp);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index 3c0335c..c220810 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -172,8 +172,8 @@
VM_BUG_ON(iov_iter_count(iter) != PAGE_SIZE);
if (iov_iter_rw(iter) == READ)
- return nfs_file_direct_read(iocb, iter);
- return nfs_file_direct_write(iocb, iter);
+ return nfs_file_direct_read(iocb, iter, true);
+ return nfs_file_direct_write(iocb, iter, true);
}
static void nfs_direct_release_pages(struct page **pages, unsigned int npages)
@@ -424,6 +424,7 @@
* nfs_file_direct_read - file direct read operation for NFS files
* @iocb: target I/O control block
* @iter: vector of user buffers into which to read data
+ * @swap: flag indicating this is swap IO, not O_DIRECT IO
*
* We use this function for direct reads instead of calling
* generic_file_aio_read() in order to avoid gfar's check to see if
@@ -439,7 +440,8 @@
* client must read the updated atime from the server back into its
* cache.
*/
-ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter)
+ssize_t nfs_file_direct_read(struct kiocb *iocb, struct iov_iter *iter,
+ bool swap)
{
struct file *file = iocb->ki_filp;
struct address_space *mapping = file->f_mapping;
@@ -481,12 +483,14 @@
if (iter_is_iovec(iter))
dreq->flags = NFS_ODIRECT_SHOULD_DIRTY;
- nfs_start_io_direct(inode);
+ if (!swap)
+ nfs_start_io_direct(inode);
NFS_I(inode)->read_io += count;
requested = nfs_direct_read_schedule_iovec(dreq, iter, iocb->ki_pos);
- nfs_end_io_direct(inode);
+ if (!swap)
+ nfs_end_io_direct(inode);
if (requested > 0) {
result = nfs_direct_wait(dreq);
@@ -789,7 +793,7 @@
*/
static ssize_t nfs_direct_write_schedule_iovec(struct nfs_direct_req *dreq,
struct iov_iter *iter,
- loff_t pos)
+ loff_t pos, int ioflags)
{
struct nfs_pageio_descriptor desc;
struct inode *inode = dreq->inode;
@@ -797,7 +801,7 @@
size_t requested_bytes = 0;
size_t wsize = max_t(size_t, NFS_SERVER(inode)->wsize, PAGE_SIZE);
- nfs_pageio_init_write(&desc, inode, FLUSH_COND_STABLE, false,
+ nfs_pageio_init_write(&desc, inode, ioflags, false,
&nfs_direct_write_completion_ops);
desc.pg_dreq = dreq;
get_dreq(dreq);
@@ -875,6 +879,7 @@
* nfs_file_direct_write - file direct write operation for NFS files
* @iocb: target I/O control block
* @iter: vector of user buffers from which to write data
+ * @swap: flag indicating this is swap IO, not O_DIRECT IO
*
* We use this function for direct writes instead of calling
* generic_file_aio_write() in order to avoid taking the inode
@@ -891,7 +896,8 @@
* Note that O_APPEND is not supported for NFS direct writes, as there
* is no atomic O_APPEND write facility in the NFS protocol.
*/
-ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter)
+ssize_t nfs_file_direct_write(struct kiocb *iocb, struct iov_iter *iter,
+ bool swap)
{
ssize_t result, requested;
size_t count;
@@ -905,7 +911,11 @@
dfprintk(FILE, "NFS: direct write(%pD2, %zd@%Ld)\n",
file, iov_iter_count(iter), (long long) iocb->ki_pos);
- result = generic_write_checks(iocb, iter);
+ if (swap)
+ /* bypass generic checks */
+ result = iov_iter_count(iter);
+ else
+ result = generic_write_checks(iocb, iter);
if (result <= 0)
return result;
count = result;
@@ -936,17 +946,23 @@
dreq->iocb = iocb;
pnfs_init_ds_commit_info_ops(&dreq->ds_cinfo, inode);
- nfs_start_io_direct(inode);
+ if (swap) {
+ requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
+ FLUSH_STABLE);
+ } else {
+ nfs_start_io_direct(inode);
- requested = nfs_direct_write_schedule_iovec(dreq, iter, pos);
+ requested = nfs_direct_write_schedule_iovec(dreq, iter, pos,
+ FLUSH_COND_STABLE);
- if (mapping->nrpages) {
- invalidate_inode_pages2_range(mapping,
- pos >> PAGE_SHIFT, end);
+ if (mapping->nrpages) {
+ invalidate_inode_pages2_range(mapping,
+ pos >> PAGE_SHIFT, end);
+ }
+
+ nfs_end_io_direct(inode);
}
- nfs_end_io_direct(inode);
-
if (requested > 0) {
result = nfs_direct_wait(dreq);
if (result > 0) {
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 63940a7..ad856b7 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -161,7 +161,7 @@
ssize_t result;
if (iocb->ki_flags & IOCB_DIRECT)
- return nfs_file_direct_read(iocb, to);
+ return nfs_file_direct_read(iocb, to, false);
dprintk("NFS: read(%pD2, %zu@%lu)\n",
iocb->ki_filp,
@@ -208,15 +208,16 @@
nfs_file_fsync_commit(struct file *file, int datasync)
{
struct inode *inode = file_inode(file);
- int ret;
+ int ret, ret2;
dprintk("NFS: fsync file(%pD2) datasync %d\n", file, datasync);
nfs_inc_stats(inode, NFSIOS_VFSFSYNC);
ret = nfs_commit_inode(inode, FLUSH_SYNC);
- if (ret < 0)
- return ret;
- return file_check_and_advance_wb_err(file);
+ ret2 = file_check_and_advance_wb_err(file);
+ if (ret2 < 0)
+ return ret2;
+ return ret;
}
int
@@ -389,11 +390,8 @@
return status;
NFS_I(mapping->host)->write_io += copied;
- if (nfs_ctx_key_to_expire(ctx, mapping->host)) {
- status = nfs_wb_all(mapping->host);
- if (status < 0)
- return status;
- }
+ if (nfs_ctx_key_to_expire(ctx, mapping->host))
+ nfs_wb_all(mapping->host);
return copied;
}
@@ -616,7 +614,7 @@
return result;
if (iocb->ki_flags & IOCB_DIRECT)
- return nfs_file_direct_write(iocb, from);
+ return nfs_file_direct_write(iocb, from, false);
dprintk("NFS: write(%pD2, %zu@%Ld)\n",
file, iov_iter_count(from), (long long) iocb->ki_pos);
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index 05b39e8..d60c086 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -476,7 +476,7 @@
if (result.negated)
ctx->flags &= ~NFS_MOUNT_SOFTREVAL;
else
- ctx->flags &= NFS_MOUNT_SOFTREVAL;
+ ctx->flags |= NFS_MOUNT_SOFTREVAL;
break;
case Opt_posix:
if (result.negated)
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index f27ecc2..1adece1 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -1139,7 +1139,6 @@
nfs_fscache_open_file(inode, filp);
return 0;
}
-EXPORT_SYMBOL_GPL(nfs_open);
/*
* This function is called whenever some part of NFS notices that
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index 98554dd..a7e0970 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -42,6 +42,16 @@
return true;
}
+static inline fmode_t flags_to_mode(int flags)
+{
+ fmode_t res = (__force fmode_t)flags & FMODE_EXEC;
+ if ((flags & O_ACCMODE) != O_WRONLY)
+ res |= FMODE_READ;
+ if ((flags & O_ACCMODE) != O_RDONLY)
+ res |= FMODE_WRITE;
+ return res;
+}
+
/*
* Note: RFC 1813 doesn't limit the number of auth flavors that
* a server can return, so make something up.
@@ -578,6 +588,13 @@
!nfs_write_verifier_cmp(&req->wb_verf, &verf->verifier);
}
+static inline gfp_t nfs_io_gfp_mask(void)
+{
+ if (current->flags & PF_WQ_WORKER)
+ return GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
+ return GFP_KERNEL;
+}
+
/* unlink.c */
extern struct rpc_task *
nfs_async_rename(struct inode *old_dir, struct inode *new_dir,
@@ -815,6 +832,7 @@
case 0:
case -ERESTARTSYS:
case -EINTR:
+ case -ENOMEM:
return false;
}
return nfs_error_is_fatal(err);
diff --git a/fs/nfs/nfs2xdr.c b/fs/nfs/nfs2xdr.c
index f6676af..5e6453e 100644
--- a/fs/nfs/nfs2xdr.c
+++ b/fs/nfs/nfs2xdr.c
@@ -948,7 +948,7 @@
error = decode_filename_inline(xdr, &entry->name, &entry->len);
if (unlikely(error))
- return error;
+ return -EAGAIN;
/*
* The type (size and byte order) of nfscookie isn't defined in
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index 5601e47..b49359a 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -108,7 +108,6 @@
if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
__set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
- __set_bit(NFS_CS_NOPING, &cl_init.init_flags);
__set_bit(NFS_CS_DS, &cl_init.init_flags);
/* Use the MDS nfs_client cl_ipaddr. */
diff --git a/fs/nfs/nfs3xdr.c b/fs/nfs/nfs3xdr.c
index dff6b52..b5a9379 100644
--- a/fs/nfs/nfs3xdr.c
+++ b/fs/nfs/nfs3xdr.c
@@ -1964,7 +1964,6 @@
bool plus)
{
struct user_namespace *userns = rpc_userns(entry->server->client);
- struct nfs_entry old = *entry;
__be32 *p;
int error;
u64 new_cookie;
@@ -1984,15 +1983,15 @@
error = decode_fileid3(xdr, &entry->ino);
if (unlikely(error))
- return error;
+ return -EAGAIN;
error = decode_inline_filename3(xdr, &entry->name, &entry->len);
if (unlikely(error))
- return error;
+ return -EAGAIN;
error = decode_cookie3(xdr, &new_cookie);
if (unlikely(error))
- return error;
+ return -EAGAIN;
entry->d_type = DT_UNKNOWN;
@@ -2000,7 +1999,7 @@
entry->fattr->valid = 0;
error = decode_post_op_attr(xdr, entry->fattr, userns);
if (unlikely(error))
- return error;
+ return -EAGAIN;
if (entry->fattr->valid & NFS_ATTR_FATTR_V3)
entry->d_type = nfs_umode_to_dtype(entry->fattr->mode);
@@ -2015,11 +2014,8 @@
return -EAGAIN;
if (*p != xdr_zero) {
error = decode_nfs_fh3(xdr, entry->fh);
- if (unlikely(error)) {
- if (error == -E2BIG)
- goto out_truncated;
- return error;
- }
+ if (unlikely(error))
+ return -EAGAIN;
} else
zero_nfs_fh3(entry->fh);
}
@@ -2028,11 +2024,6 @@
entry->cookie = new_cookie;
return 0;
-
-out_truncated:
- dprintk("NFS: directory entry contains invalid file handle\n");
- *entry = old;
- return -EAGAIN;
}
/*
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 2587b1b..dad32b1 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -567,8 +567,10 @@
ctx = get_nfs_open_context(nfs_file_open_context(src));
l_ctx = nfs_get_lock_context(ctx);
- if (IS_ERR(l_ctx))
- return PTR_ERR(l_ctx);
+ if (IS_ERR(l_ctx)) {
+ status = PTR_ERR(l_ctx);
+ goto out;
+ }
status = nfs4_set_rw_stateid(&args->cna_src_stateid, ctx, l_ctx,
FMODE_READ);
@@ -576,7 +578,7 @@
if (status) {
if (status == -EAGAIN)
status = -NFS4ERR_BAD_STATEID;
- return status;
+ goto out;
}
status = nfs4_call_sync(src_server->client, src_server, &msg,
@@ -584,6 +586,7 @@
if (status == -ENOTSUPP)
src_server->caps &= ~NFS_CAP_COPY_NOTIFY;
+out:
put_nfs_open_context(nfs_file_open_context(src));
return status;
}
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 0e6437b..252c99c 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -346,6 +346,7 @@
ret = nfs4_setup_slot_table(tbl, NFS4_MAX_SLOT_TABLE,
"NFSv4.0 transport Slot table");
if (ret) {
+ nfs4_shutdown_slot_table(tbl);
kfree(tbl);
return ret;
}
diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c
index a1e5c6b..70cd0d7 100644
--- a/fs/nfs/nfs4file.c
+++ b/fs/nfs/nfs4file.c
@@ -32,6 +32,7 @@
struct dentry *parent = NULL;
struct inode *dir;
unsigned openflags = filp->f_flags;
+ fmode_t f_mode;
struct iattr attr;
int err;
@@ -50,8 +51,9 @@
if (err)
return err;
+ f_mode = filp->f_mode;
if ((openflags & O_ACCMODE) == 3)
- return nfs_open(inode, filp);
+ f_mode |= flags_to_mode(openflags);
/* We can't create new files here */
openflags &= ~(O_CREAT|O_EXCL);
@@ -59,7 +61,7 @@
parent = dget_parent(dentry);
dir = d_inode(parent);
- ctx = alloc_nfs_open_context(file_dentry(filp), filp->f_mode, filp);
+ ctx = alloc_nfs_open_context(file_dentry(filp), f_mode, filp);
err = PTR_ERR(ctx);
if (IS_ERR(ctx))
goto out;
@@ -319,7 +321,7 @@
static struct file *__nfs42_ssc_open(struct vfsmount *ss_mnt,
struct nfs_fh *src_fh, nfs4_stateid *stateid)
{
- struct nfs_fattr fattr;
+ struct nfs_fattr *fattr = nfs_alloc_fattr();
struct file *filep, *res;
struct nfs_server *server;
struct inode *r_ino = NULL;
@@ -330,14 +332,20 @@
server = NFS_SERVER(ss_mnt->mnt_root->d_inode);
- nfs_fattr_init(&fattr);
+ if (!fattr)
+ return ERR_PTR(-ENOMEM);
- status = nfs4_proc_getattr(server, src_fh, &fattr, NULL, NULL);
+ status = nfs4_proc_getattr(server, src_fh, fattr, NULL, NULL);
if (status < 0) {
res = ERR_PTR(status);
goto out;
}
+ if (!S_ISREG(fattr->mode)) {
+ res = ERR_PTR(-EBADF);
+ goto out;
+ }
+
res = ERR_PTR(-ENOMEM);
len = strlen(SSC_READ_NAME_BODY) + 16;
read_name = kzalloc(len, GFP_NOFS);
@@ -345,7 +353,7 @@
goto out;
snprintf(read_name, len, SSC_READ_NAME_BODY, read_name_gen++);
- r_ino = nfs_fhget(ss_mnt->mnt_root->d_inode->i_sb, src_fh, &fattr,
+ r_ino = nfs_fhget(ss_mnt->mnt_root->d_inode->i_sb, src_fh, fattr,
NULL);
if (IS_ERR(r_ino)) {
res = ERR_CAST(r_ino);
@@ -356,6 +364,7 @@
r_ino->i_fop);
if (IS_ERR(filep)) {
res = ERR_CAST(filep);
+ iput(r_ino);
goto out_free_name;
}
filep->f_mode |= FMODE_READ;
@@ -390,6 +399,7 @@
out_free_name:
kfree(read_name);
out:
+ nfs_free_fattr(fattr);
return res;
out_stateowner:
nfs4_put_state_owner(sp);
diff --git a/fs/nfs/nfs4idmap.c b/fs/nfs/nfs4idmap.c
index f331866..ec6afd3 100644
--- a/fs/nfs/nfs4idmap.c
+++ b/fs/nfs/nfs4idmap.c
@@ -561,22 +561,20 @@
return true;
}
-static void
-nfs_idmap_complete_pipe_upcall_locked(struct idmap *idmap, int ret)
+static void nfs_idmap_complete_pipe_upcall(struct idmap_legacy_upcalldata *data,
+ int ret)
{
- struct key *authkey = idmap->idmap_upcall_data->authkey;
-
- kfree(idmap->idmap_upcall_data);
- idmap->idmap_upcall_data = NULL;
- complete_request_key(authkey, ret);
- key_put(authkey);
+ complete_request_key(data->authkey, ret);
+ key_put(data->authkey);
+ kfree(data);
}
-static void
-nfs_idmap_abort_pipe_upcall(struct idmap *idmap, int ret)
+static void nfs_idmap_abort_pipe_upcall(struct idmap *idmap,
+ struct idmap_legacy_upcalldata *data,
+ int ret)
{
- if (idmap->idmap_upcall_data != NULL)
- nfs_idmap_complete_pipe_upcall_locked(idmap, ret);
+ if (cmpxchg(&idmap->idmap_upcall_data, data, NULL) == data)
+ nfs_idmap_complete_pipe_upcall(data, ret);
}
static int nfs_idmap_legacy_upcall(struct key *authkey, void *aux)
@@ -613,7 +611,7 @@
ret = rpc_queue_upcall(idmap->idmap_pipe, msg);
if (ret < 0)
- nfs_idmap_abort_pipe_upcall(idmap, ret);
+ nfs_idmap_abort_pipe_upcall(idmap, data, ret);
return ret;
out2:
@@ -669,6 +667,7 @@
struct request_key_auth *rka;
struct rpc_inode *rpci = RPC_I(file_inode(filp));
struct idmap *idmap = (struct idmap *)rpci->private;
+ struct idmap_legacy_upcalldata *data;
struct key *authkey;
struct idmap_msg im;
size_t namelen_in;
@@ -678,10 +677,11 @@
* will have been woken up and someone else may now have used
* idmap_key_cons - so after this point we may no longer touch it.
*/
- if (idmap->idmap_upcall_data == NULL)
+ data = xchg(&idmap->idmap_upcall_data, NULL);
+ if (data == NULL)
goto out_noupcall;
- authkey = idmap->idmap_upcall_data->authkey;
+ authkey = data->authkey;
rka = get_request_key_auth(authkey);
if (mlen != sizeof(im)) {
@@ -703,18 +703,17 @@
if (namelen_in == 0 || namelen_in == IDMAP_NAMESZ) {
ret = -EINVAL;
goto out;
-}
+ }
- ret = nfs_idmap_read_and_verify_message(&im,
- &idmap->idmap_upcall_data->idmap_msg,
- rka->target_key, authkey);
+ ret = nfs_idmap_read_and_verify_message(&im, &data->idmap_msg,
+ rka->target_key, authkey);
if (ret >= 0) {
key_set_timeout(rka->target_key, nfs_idmap_cache_timeout);
ret = mlen;
}
out:
- nfs_idmap_complete_pipe_upcall_locked(idmap, ret);
+ nfs_idmap_complete_pipe_upcall(data, ret);
out_noupcall:
return ret;
}
@@ -728,7 +727,7 @@
struct idmap *idmap = data->idmap;
if (msg->errno)
- nfs_idmap_abort_pipe_upcall(idmap, msg->errno);
+ nfs_idmap_abort_pipe_upcall(idmap, data, msg->errno);
}
static void
@@ -736,8 +735,11 @@
{
struct rpc_inode *rpci = RPC_I(inode);
struct idmap *idmap = (struct idmap *)rpci->private;
+ struct idmap_legacy_upcalldata *data;
- nfs_idmap_abort_pipe_upcall(idmap, -EPIPE);
+ data = xchg(&idmap->idmap_upcall_data, NULL);
+ if (data)
+ nfs_idmap_complete_pipe_upcall(data, -EPIPE);
}
int nfs_map_name_to_uid(const struct nfs_server *server, const char *name, size_t namelen, kuid_t *uid)
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index d222a98..36af373 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -369,6 +369,14 @@
kunmap_atomic(start);
}
+static void nfs4_fattr_set_prechange(struct nfs_fattr *fattr, u64 version)
+{
+ if (!(fattr->valid & NFS_ATTR_FATTR_PRECHANGE)) {
+ fattr->pre_change_attr = version;
+ fattr->valid |= NFS_ATTR_FATTR_PRECHANGE;
+ }
+}
+
static void nfs4_test_and_free_stateid(struct nfs_server *server,
nfs4_stateid *stateid,
const struct cred *cred)
@@ -782,10 +790,9 @@
if ((s32)(seqnr - slot->seq_nr_highest_sent) > 0)
slot->seq_nr_highest_sent = seqnr;
}
-static void nfs4_slot_sequence_acked(struct nfs4_slot *slot,
- u32 seqnr)
+static void nfs4_slot_sequence_acked(struct nfs4_slot *slot, u32 seqnr)
{
- slot->seq_nr_highest_sent = seqnr;
+ nfs4_slot_sequence_record_sent(slot, seqnr);
slot->seq_nr_last_acked = seqnr;
}
@@ -852,7 +859,6 @@
__func__,
slot->slot_nr,
slot->seq_nr);
- nfs4_slot_sequence_acked(slot, slot->seq_nr);
goto out_retry;
case -NFS4ERR_RETRY_UNCACHED_REP:
case -NFS4ERR_SEQ_FALSE_RETRY:
@@ -3078,8 +3084,13 @@
}
out:
- if (!opendata->cancelled)
+ if (!opendata->cancelled) {
+ if (opendata->lgp) {
+ nfs4_lgopen_release(opendata->lgp);
+ opendata->lgp = NULL;
+ }
nfs4_sequence_free_slot(&opendata->o_res.seq_res);
+ }
return ret;
}
@@ -6464,7 +6475,9 @@
pnfs_roc_release(&data->lr.arg, &data->lr.res,
data->res.lr_ret);
if (inode) {
- nfs_post_op_update_inode_force_wcc(inode, &data->fattr);
+ nfs4_fattr_set_prechange(&data->fattr,
+ inode_peek_iversion_raw(inode));
+ nfs_refresh_inode(inode, &data->fattr);
nfs_iput_and_deactive(inode);
}
kfree(calldata);
@@ -7001,6 +7014,7 @@
{
struct nfs4_lockdata *data = calldata;
struct nfs4_lock_state *lsp = data->lsp;
+ struct nfs_server *server = NFS_SERVER(d_inode(data->ctx->dentry));
dprintk("%s: begin!\n", __func__);
@@ -7010,8 +7024,7 @@
data->rpc_status = task->tk_status;
switch (task->tk_status) {
case 0:
- renew_lease(NFS_SERVER(d_inode(data->ctx->dentry)),
- data->timestamp);
+ renew_lease(server, data->timestamp);
if (data->arg.new_lock && !data->cancelled) {
data->fl.fl_flags &= ~(FL_SLEEP | FL_ACCESS);
if (locks_lock_inode_wait(lsp->ls_state->inode, &data->fl) < 0)
@@ -7032,6 +7045,8 @@
if (!nfs4_stateid_match(&data->arg.open_stateid,
&lsp->ls_state->open_stateid))
goto out_restart;
+ else if (nfs4_async_handle_error(task, server, lsp->ls_state, NULL) == -EAGAIN)
+ goto out_restart;
} else if (!nfs4_stateid_match(&data->arg.lock_stateid,
&lsp->ls_stateid))
goto out_restart;
@@ -8205,6 +8220,7 @@
case -NFS4ERR_DEADSESSION:
nfs4_schedule_session_recovery(clp->cl_session,
task->tk_status);
+ return;
}
if (args->dir == NFS4_CDFC4_FORE_OR_BOTH &&
res->dir != NFS4_CDFS4_BOTH) {
@@ -9260,6 +9276,9 @@
rpc_delay(task, NFS4_POLL_RETRY_MAX);
fallthrough;
case -NFS4ERR_RETRY_UNCACHED_REP:
+ case -EACCES:
+ dprintk("%s: failed to reclaim complete error %d for server %s, retrying\n",
+ __func__, task->tk_status, clp->cl_hostname);
return -EAGAIN;
case -NFS4ERR_BADSESSION:
case -NFS4ERR_DEADSESSION:
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index cbeec29..a77a3d8 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -49,6 +49,7 @@
#include <linux/workqueue.h>
#include <linux/bitops.h>
#include <linux/jiffies.h>
+#include <linux/sched/mm.h>
#include <linux/sunrpc/clnt.h>
@@ -1776,6 +1777,7 @@
static void nfs4_state_start_reclaim_reboot(struct nfs_client *clp)
{
+ set_bit(NFS4CLNT_RECLAIM_REBOOT, &clp->cl_state);
/* Mark all delegations for reclaim */
nfs_delegation_mark_reclaim(clp);
nfs4_state_mark_reclaim_helper(clp, nfs4_state_mark_reclaim_reboot);
@@ -2557,9 +2559,17 @@
static void nfs4_state_manager(struct nfs_client *clp)
{
+ unsigned int memflags;
int status = 0;
const char *section = "", *section_sep = "";
+ /*
+ * State recovery can deadlock if the direct reclaim code tries
+ * start NFS writeback. So ensure memory allocations are all
+ * GFP_NOFS.
+ */
+ memflags = memalloc_nofs_save();
+
/* Ensure exclusive access to NFSv4 state */
do {
trace_nfs4_state_mgr(clp);
@@ -2633,6 +2643,7 @@
if (status < 0)
goto out_error;
nfs4_state_end_reclaim_reboot(clp);
+ continue;
}
/* Detect expired delegations... */
@@ -2654,6 +2665,7 @@
clear_bit(NFS4CLNT_RECLAIM_NOGRACE, &clp->cl_state);
}
+ memalloc_nofs_restore(memflags);
nfs4_end_drain_session(clp);
nfs4_clear_state_manager_bit(clp);
@@ -2671,6 +2683,7 @@
return;
if (test_and_set_bit(NFS4CLNT_MANAGER_RUNNING, &clp->cl_state) != 0)
return;
+ memflags = memalloc_nofs_save();
} while (refcount_read(&clp->cl_count) > 1 && !signalled());
goto out_drain;
@@ -2683,6 +2696,7 @@
clp->cl_hostname, -status);
ssleep(1);
out_drain:
+ memalloc_nofs_restore(memflags);
nfs4_end_drain_session(clp);
nfs4_clear_state_manager_bit(clp);
}
diff --git a/fs/nfs/pagelist.c b/fs/nfs/pagelist.c
index 98b9c1e..17fef6e 100644
--- a/fs/nfs/pagelist.c
+++ b/fs/nfs/pagelist.c
@@ -90,10 +90,10 @@
}
}
-static inline struct nfs_page *
-nfs_page_alloc(void)
+static inline struct nfs_page *nfs_page_alloc(void)
{
- struct nfs_page *p = kmem_cache_zalloc(nfs_page_cachep, GFP_KERNEL);
+ struct nfs_page *p =
+ kmem_cache_zalloc(nfs_page_cachep, nfs_io_gfp_mask());
if (p)
INIT_LIST_HEAD(&p->wb_list);
return p;
@@ -901,7 +901,7 @@
struct nfs_commit_info cinfo;
struct nfs_page_array *pg_array = &hdr->page_array;
unsigned int pagecount, pageused;
- gfp_t gfp_flags = GFP_KERNEL;
+ gfp_t gfp_flags = nfs_io_gfp_mask();
pagecount = nfs_page_array_len(mirror->pg_base, mirror->pg_count);
pg_array->npages = pagecount;
@@ -984,7 +984,7 @@
desc->pg_mirrors_dynamic = NULL;
if (mirror_count == 1)
return desc->pg_mirrors_static;
- ret = kmalloc_array(mirror_count, sizeof(*ret), GFP_KERNEL);
+ ret = kmalloc_array(mirror_count, sizeof(*ret), nfs_io_gfp_mask());
if (ret != NULL) {
for (i = 0; i < mirror_count; i++)
nfs_pageio_mirror_init(&ret[i], desc->pg_bsize);
diff --git a/fs/nfs/pnfs.c b/fs/nfs/pnfs.c
index 5370e08..2143672 100644
--- a/fs/nfs/pnfs.c
+++ b/fs/nfs/pnfs.c
@@ -92,6 +92,17 @@
return local;
}
+const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id)
+{
+ return find_pnfs_driver(id);
+}
+
+void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld)
+{
+ if (ld)
+ module_put(ld->owner);
+}
+
void
unset_pnfs_layoutdriver(struct nfs_server *nfss)
{
@@ -458,6 +469,7 @@
pnfs_clear_lseg_state(lseg, lseg_list);
pnfs_clear_layoutreturn_info(lo);
pnfs_free_returned_lsegs(lo, lseg_list, &range, 0);
+ set_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags);
if (test_bit(NFS_LAYOUT_RETURN, &lo->plh_flags) &&
!test_and_set_bit(NFS_LAYOUT_RETURN_LOCK, &lo->plh_flags))
pnfs_clear_layoutreturn_waitbit(lo);
@@ -1912,8 +1924,9 @@
static void nfs_layoutget_end(struct pnfs_layout_hdr *lo)
{
- if (atomic_dec_and_test(&lo->plh_outstanding))
- wake_up_var(&lo->plh_outstanding);
+ if (atomic_dec_and_test(&lo->plh_outstanding) &&
+ test_and_clear_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags))
+ wake_up_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN);
}
static bool pnfs_is_first_layoutget(struct pnfs_layout_hdr *lo)
@@ -1995,6 +2008,7 @@
lo = pnfs_find_alloc_layout(ino, ctx, gfp_flags);
if (lo == NULL) {
spin_unlock(&ino->i_lock);
+ lseg = ERR_PTR(-ENOMEM);
trace_pnfs_update_layout(ino, pos, count, iomode, lo, lseg,
PNFS_UPDATE_LAYOUT_NOMEM);
goto out;
@@ -2019,11 +2033,11 @@
* If the layout segment list is empty, but there are outstanding
* layoutget calls, then they might be subject to a layoutrecall.
*/
- if ((list_empty(&lo->plh_segs) || !pnfs_layout_is_valid(lo)) &&
+ if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) &&
atomic_read(&lo->plh_outstanding) != 0) {
spin_unlock(&ino->i_lock);
- lseg = ERR_PTR(wait_var_event_killable(&lo->plh_outstanding,
- !atomic_read(&lo->plh_outstanding)));
+ lseg = ERR_PTR(wait_on_bit(&lo->plh_flags, NFS_LAYOUT_DRAIN,
+ TASK_KILLABLE));
if (IS_ERR(lseg))
goto out_put_layout_hdr;
pnfs_put_layout_hdr(lo);
@@ -2123,6 +2137,7 @@
lgp = pnfs_alloc_init_layoutget_args(ino, ctx, &stateid, &arg, gfp_flags);
if (!lgp) {
+ lseg = ERR_PTR(-ENOMEM);
trace_pnfs_update_layout(ino, pos, count, iomode, lo, NULL,
PNFS_UPDATE_LAYOUT_NOMEM);
nfs_layoutget_end(lo);
@@ -2142,6 +2157,12 @@
case -ERECALLCONFLICT:
case -EAGAIN:
break;
+ case -ENODATA:
+ /* The server returned NFS4ERR_LAYOUTUNAVAILABLE */
+ pnfs_layout_set_fail_bit(
+ lo, pnfs_iomode_to_fail_bit(iomode));
+ lseg = NULL;
+ goto out_put_layout_hdr;
default:
if (!nfs_error_is_fatal(PTR_ERR(lseg))) {
pnfs_layout_clear_fail_bit(lo, pnfs_iomode_to_fail_bit(iomode));
@@ -2395,7 +2416,8 @@
goto out_forget;
}
- if (!pnfs_layout_is_valid(lo) && !pnfs_is_first_layoutget(lo))
+ if (test_bit(NFS_LAYOUT_DRAIN, &lo->plh_flags) &&
+ !pnfs_is_first_layoutget(lo))
goto out_forget;
if (nfs4_stateid_match_other(&lo->plh_stateid, &res->stateid)) {
diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h
index 0212fe3..a7cf84a 100644
--- a/fs/nfs/pnfs.h
+++ b/fs/nfs/pnfs.h
@@ -107,6 +107,7 @@
NFS_LAYOUT_FIRST_LAYOUTGET, /* Serialize first layoutget */
NFS_LAYOUT_INODE_FREEING, /* The inode is being freed */
NFS_LAYOUT_HASHED, /* The layout visible */
+ NFS_LAYOUT_DRAIN,
};
enum layoutdriver_policy_flags {
@@ -236,6 +237,8 @@
extern int pnfs_register_layoutdriver(struct pnfs_layoutdriver_type *);
extern void pnfs_unregister_layoutdriver(struct pnfs_layoutdriver_type *);
+extern const struct pnfs_layoutdriver_type *pnfs_find_layoutdriver(u32 id);
+extern void pnfs_put_layoutdriver(const struct pnfs_layoutdriver_type *ld);
/* nfs4proc.c */
extern size_t max_response_pages(struct nfs_server *server);
diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c
index 7b9d701..a2ad8bb 100644
--- a/fs/nfs/pnfs_nfs.c
+++ b/fs/nfs/pnfs_nfs.c
@@ -419,7 +419,7 @@
pnfs_bucket_fetch_commitdata(struct pnfs_commit_bucket *bucket,
struct nfs_commit_info *cinfo)
{
- struct nfs_commit_data *data = nfs_commitdata_alloc(false);
+ struct nfs_commit_data *data = nfs_commitdata_alloc();
if (!data)
return NULL;
@@ -515,7 +515,11 @@
unsigned int nreq = 0;
if (!list_empty(mds_pages)) {
- data = nfs_commitdata_alloc(true);
+ data = nfs_commitdata_alloc();
+ if (!data) {
+ nfs_retry_commit(mds_pages, NULL, cinfo, -1);
+ return -ENOMEM;
+ }
data->ds_commit_index = -1;
list_splice_init(mds_pages, &data->pages);
list_add_tail(&data->list, &list);
diff --git a/fs/nfs/super.c b/fs/nfs/super.c
index 4034102..b3fcc27 100644
--- a/fs/nfs/super.c
+++ b/fs/nfs/super.c
@@ -1029,22 +1029,31 @@
if (ctx && ctx->bsize)
sb->s_blocksize = nfs_block_size(ctx->bsize, &sb->s_blocksize_bits);
- if (server->nfs_client->rpc_ops->version != 2) {
- /* The VFS shouldn't apply the umask to mode bits. We will do
- * so ourselves when necessary.
+ switch (server->nfs_client->rpc_ops->version) {
+ case 2:
+ sb->s_time_gran = 1000;
+ sb->s_time_min = 0;
+ sb->s_time_max = U32_MAX;
+ break;
+ case 3:
+ /*
+ * The VFS shouldn't apply the umask to mode bits.
+ * We will do so ourselves when necessary.
*/
sb->s_flags |= SB_POSIXACL;
sb->s_time_gran = 1;
- sb->s_export_op = &nfs_export_ops;
- } else
- sb->s_time_gran = 1000;
-
- if (server->nfs_client->rpc_ops->version != 4) {
sb->s_time_min = 0;
sb->s_time_max = U32_MAX;
- } else {
+ sb->s_export_op = &nfs_export_ops;
+ break;
+ case 4:
+ sb->s_flags |= SB_POSIXACL;
+ sb->s_time_gran = 1;
sb->s_time_min = S64_MIN;
sb->s_time_max = S64_MAX;
+ if (server->caps & NFS_CAP_ATOMIC_OPEN_V1)
+ sb->s_export_op = &nfs_export_ops;
+ break;
}
sb->s_magic = NFS_SUPER_MAGIC;
diff --git a/fs/nfs/write.c b/fs/nfs/write.c
index bde4c36..dc08a0c 100644
--- a/fs/nfs/write.c
+++ b/fs/nfs/write.c
@@ -70,27 +70,17 @@
static struct kmem_cache *nfs_cdata_cachep;
static mempool_t *nfs_commit_mempool;
-struct nfs_commit_data *nfs_commitdata_alloc(bool never_fail)
+struct nfs_commit_data *nfs_commitdata_alloc(void)
{
struct nfs_commit_data *p;
- if (never_fail)
- p = mempool_alloc(nfs_commit_mempool, GFP_NOIO);
- else {
- /* It is OK to do some reclaim, not no safe to wait
- * for anything to be returned to the pool.
- * mempool_alloc() cannot handle that particular combination,
- * so we need two separate attempts.
- */
+ p = kmem_cache_zalloc(nfs_cdata_cachep, nfs_io_gfp_mask());
+ if (!p) {
p = mempool_alloc(nfs_commit_mempool, GFP_NOWAIT);
if (!p)
- p = kmem_cache_alloc(nfs_cdata_cachep, GFP_NOIO |
- __GFP_NOWARN | __GFP_NORETRY);
- if (!p)
return NULL;
+ memset(p, 0, sizeof(*p));
}
-
- memset(p, 0, sizeof(*p));
INIT_LIST_HEAD(&p->pages);
return p;
}
@@ -104,9 +94,15 @@
static struct nfs_pgio_header *nfs_writehdr_alloc(void)
{
- struct nfs_pgio_header *p = mempool_alloc(nfs_wdata_mempool, GFP_KERNEL);
+ struct nfs_pgio_header *p;
- memset(p, 0, sizeof(*p));
+ p = kmem_cache_zalloc(nfs_wdata_cachep, nfs_io_gfp_mask());
+ if (!p) {
+ p = mempool_alloc(nfs_wdata_mempool, GFP_NOWAIT);
+ if (!p)
+ return NULL;
+ memset(p, 0, sizeof(*p));
+ }
p->rw_mode = FMODE_WRITE;
return p;
}
@@ -314,7 +310,10 @@
struct address_space *mapping = page_file_mapping(page);
SetPageError(page);
- mapping_set_error(mapping, error);
+ filemap_set_wb_err(mapping, error);
+ if (mapping->host)
+ errseq_set(&mapping->host->i_sb->s_wb_err,
+ error == -ENOSPC ? -ENOSPC : -EIO);
nfs_set_pageerror(mapping);
}
@@ -676,11 +675,7 @@
err = nfs_do_writepage(page, wbc, &pgio);
pgio.pg_error = 0;
nfs_pageio_complete(&pgio);
- if (err < 0)
- return err;
- if (nfs_error_is_fatal(pgio.pg_error))
- return pgio.pg_error;
- return 0;
+ return err;
}
int nfs_writepage(struct page *page, struct writeback_control *wbc)
@@ -731,9 +726,6 @@
if (err < 0)
goto out_err;
- err = pgio.pg_error;
- if (nfs_error_is_fatal(err))
- goto out_err;
return 0;
out_err:
return err;
@@ -1412,7 +1404,7 @@
while (!list_empty(head)) {
req = nfs_list_entry(head->next);
nfs_list_remove_request(req);
- if (nfs_error_is_fatal(error))
+ if (nfs_error_is_fatal_on_server(error))
nfs_write_error(req, error);
else
nfs_redirty_request(req);
@@ -1797,7 +1789,11 @@
if (list_empty(head))
return 0;
- data = nfs_commitdata_alloc(true);
+ data = nfs_commitdata_alloc();
+ if (!data) {
+ nfs_retry_commit(head, NULL, cinfo, -1);
+ return -ENOMEM;
+ }
/* Set up the argument struct */
nfs_init_commit(data, head, NULL, cinfo);
diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c
index e5aad1c..e30e1dd 100644
--- a/fs/nfsd/filecache.c
+++ b/fs/nfsd/filecache.c
@@ -194,7 +194,6 @@
__set_bit(NFSD_FILE_BREAK_READ, &nf->nf_flags);
}
nf->nf_mark = NULL;
- init_rwsem(&nf->nf_rwsem);
trace_nfsd_file_alloc(nf);
}
return nf;
@@ -641,7 +640,7 @@
if (!nfsd_filecache_wq)
goto out;
- nfsd_file_hashtbl = kcalloc(NFSD_FILE_HASH_SIZE,
+ nfsd_file_hashtbl = kvcalloc(NFSD_FILE_HASH_SIZE,
sizeof(*nfsd_file_hashtbl), GFP_KERNEL);
if (!nfsd_file_hashtbl) {
pr_err("nfsd: unable to allocate nfsd_file_hashtbl\n");
@@ -708,7 +707,7 @@
nfsd_file_slab = NULL;
kmem_cache_destroy(nfsd_file_mark_slab);
nfsd_file_mark_slab = NULL;
- kfree(nfsd_file_hashtbl);
+ kvfree(nfsd_file_hashtbl);
nfsd_file_hashtbl = NULL;
destroy_workqueue(nfsd_filecache_wq);
nfsd_filecache_wq = NULL;
@@ -854,7 +853,7 @@
fsnotify_wait_marks_destroyed();
kmem_cache_destroy(nfsd_file_mark_slab);
nfsd_file_mark_slab = NULL;
- kfree(nfsd_file_hashtbl);
+ kvfree(nfsd_file_hashtbl);
nfsd_file_hashtbl = NULL;
destroy_workqueue(nfsd_filecache_wq);
nfsd_filecache_wq = NULL;
diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h
index 7872df5..435ceab 100644
--- a/fs/nfsd/filecache.h
+++ b/fs/nfsd/filecache.h
@@ -46,7 +46,6 @@
refcount_t nf_ref;
unsigned char nf_may;
struct nfsd_file_mark *nf_mark;
- struct rw_semaphore nf_rwsem;
};
int nfsd_file_cache_init(void);
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index 7850d14..735ee8a 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -1380,6 +1380,8 @@
static ssize_t _nfsd_copy_file_range(struct nfsd4_copy *copy)
{
+ struct file *dst = copy->nf_dst->nf_file;
+ struct file *src = copy->nf_src->nf_file;
ssize_t bytes_copied = 0;
size_t bytes_total = copy->cp_count;
u64 src_pos = copy->cp_src_pos;
@@ -1388,9 +1390,8 @@
do {
if (kthread_should_stop())
break;
- bytes_copied = nfsd_copy_file_range(copy->nf_src->nf_file,
- src_pos, copy->nf_dst->nf_file, dst_pos,
- bytes_total);
+ bytes_copied = nfsd_copy_file_range(src, src_pos, dst, dst_pos,
+ bytes_total);
if (bytes_copied <= 0)
break;
bytes_total -= bytes_copied;
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index f9b730c..83c4e68 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -815,8 +815,10 @@
princhash.data = memdup_user(
&ci->cc_princhash.cp_data,
princhashlen);
- if (IS_ERR_OR_NULL(princhash.data))
+ if (IS_ERR_OR_NULL(princhash.data)) {
+ kfree(name.data);
return -EFAULT;
+ }
princhash.len = princhashlen;
} else
princhash.len = 0;
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index d01d792..665d0ea 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -843,6 +843,7 @@
static void nfs4_free_deleg(struct nfs4_stid *stid)
{
+ WARN_ON(!list_empty(&stid->sc_cp_list));
kmem_cache_free(deleg_slab, stid);
atomic_long_dec(&num_delegations);
}
@@ -1358,6 +1359,7 @@
release_all_access(stp);
if (stp->st_stateowner)
nfs4_put_stateowner(stp->st_stateowner);
+ WARN_ON(!list_empty(&stid->sc_cp_list));
kmem_cache_free(stateid_slab, stid);
}
@@ -4607,6 +4609,14 @@
return ret;
}
+/**
+ * nfsd_breaker_owns_lease - Check if lease conflict was resolved
+ * @fl: Lock state to check
+ *
+ * Return values:
+ * %true: Lease conflict was resolved
+ * %false: Lease conflict was not resolved.
+ */
static bool nfsd_breaker_owns_lease(struct file_lock *fl)
{
struct nfs4_delegation *dl = fl->fl_owner;
@@ -4614,11 +4624,11 @@
struct nfs4_client *clp;
if (!i_am_nfsd())
- return NULL;
+ return false;
rqst = kthread_data(current);
/* Note rq_prog == NFS_ACL_PROGRAM is also possible: */
if (rqst->rq_prog != NFS_PROGRAM || rqst->rq_vers < 4)
- return NULL;
+ return false;
clp = *(rqst->rq_lease_breaker);
return dl->dl_stid.sc_client == clp;
}
@@ -6199,6 +6209,7 @@
struct nfs4_client *clp = s->st_stid.sc_client;
bool unhashed;
LIST_HEAD(reaplist);
+ struct nfs4_ol_stateid *stp;
spin_lock(&clp->cl_lock);
unhashed = unhash_open_stateid(s, &reaplist);
@@ -6207,6 +6218,8 @@
if (unhashed)
put_ol_stateid_locked(s, &reaplist);
spin_unlock(&clp->cl_lock);
+ list_for_each_entry(stp, &reaplist, st_locks)
+ nfs4_free_cpntf_statelist(clp->net, &stp->st_stid);
free_ol_stateid_reaplist(&reaplist);
} else {
spin_unlock(&clp->cl_lock);
@@ -7114,16 +7127,12 @@
if (sop->so_is_open_owner || !same_owner_str(sop, owner))
continue;
- /* see if there are still any locks associated with it */
- lo = lockowner(sop);
- list_for_each_entry(stp, &sop->so_stateids, st_perstateowner) {
- if (check_for_locks(stp->st_stid.sc_file, lo)) {
- status = nfserr_locks_held;
- spin_unlock(&clp->cl_lock);
- return status;
- }
+ if (atomic_read(&sop->so_count) != 1) {
+ spin_unlock(&clp->cl_lock);
+ return nfserr_locks_held;
}
+ lo = lockowner(sop);
nfs4_get_stateowner(sop);
break;
}
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index 46f825c..cc605ee 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -3871,7 +3871,7 @@
if (resp->xdr.buf->page_len &&
test_bit(RQ_SPLICE_OK, &resp->rqstp->rq_flags)) {
WARN_ON_ONCE(1);
- return nfserr_resource;
+ return nfserr_serverfault;
}
xdr_commit_encode(xdr);
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 9c9de2b..bbd01e8 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -223,7 +223,7 @@
unsigned long cnt = argp->len;
unsigned int nvecs;
- dprintk("nfsd: WRITE %s %d bytes at %d\n",
+ dprintk("nfsd: WRITE %s %u bytes at %d\n",
SVCFH_fmt(&argp->fh),
argp->len, argp->offset);
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 011cd57..a4ae1fc 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -535,10 +535,11 @@
{
struct file *src = nf_src->nf_file;
struct file *dst = nf_dst->nf_file;
+ errseq_t since;
loff_t cloned;
__be32 ret = 0;
- down_write(&nf_dst->nf_rwsem);
+ since = READ_ONCE(dst->f_wb_err);
cloned = vfs_clone_file_range(src, src_pos, dst, dst_pos, count, 0);
if (cloned < 0) {
ret = nfserrno(cloned);
@@ -553,6 +554,8 @@
int status = vfs_fsync_range(dst, dst_pos, dst_end, 0);
if (!status)
+ status = filemap_check_wb_err(dst->f_mapping, since);
+ if (!status)
status = commit_inode_metadata(file_inode(src));
if (status < 0) {
nfsd_reset_boot_verifier(net_generic(nf_dst->nf_net,
@@ -561,7 +564,6 @@
}
}
out_err:
- up_write(&nf_dst->nf_rwsem);
return ret;
}
@@ -980,6 +982,7 @@
struct file *file = nf->nf_file;
struct svc_export *exp;
struct iov_iter iter;
+ errseq_t since;
__be32 nfserr;
int host_err;
int use_wgather;
@@ -1009,21 +1012,22 @@
flags |= RWF_SYNC;
iov_iter_kvec(&iter, WRITE, vec, vlen, *cnt);
+ since = READ_ONCE(file->f_wb_err);
if (flags & RWF_SYNC) {
- down_write(&nf->nf_rwsem);
- host_err = vfs_iter_write(file, &iter, &pos, flags);
- if (host_err < 0)
- nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp),
- nfsd_net_id));
- up_write(&nf->nf_rwsem);
- } else {
- down_read(&nf->nf_rwsem);
if (verf)
nfsd_copy_boot_verifier(verf,
net_generic(SVC_NET(rqstp),
nfsd_net_id));
host_err = vfs_iter_write(file, &iter, &pos, flags);
- up_read(&nf->nf_rwsem);
+ if (host_err < 0)
+ nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp),
+ nfsd_net_id));
+ } else {
+ if (verf)
+ nfsd_copy_boot_verifier(verf,
+ net_generic(SVC_NET(rqstp),
+ nfsd_net_id));
+ host_err = vfs_iter_write(file, &iter, &pos, flags);
}
if (host_err < 0) {
nfsd_reset_boot_verifier(net_generic(SVC_NET(rqstp),
@@ -1033,6 +1037,9 @@
*cnt = host_err;
nfsdstats.io_write += *cnt;
fsnotify_modify(file);
+ host_err = filemap_check_wb_err(file->f_mapping, since);
+ if (host_err < 0)
+ goto out_nfserr;
if (stable && use_wgather) {
host_err = wait_for_concurrent_writes(file);
@@ -1113,19 +1120,6 @@
}
#ifdef CONFIG_NFSD_V3
-static int
-nfsd_filemap_write_and_wait_range(struct nfsd_file *nf, loff_t offset,
- loff_t end)
-{
- struct address_space *mapping = nf->nf_file->f_mapping;
- int ret = filemap_fdatawrite_range(mapping, offset, end);
-
- if (ret)
- return ret;
- filemap_fdatawait_range_keep_errors(mapping, offset, end);
- return 0;
-}
-
/*
* Commit all pending writes to stable storage.
*
@@ -1156,25 +1150,26 @@
if (err)
goto out;
if (EX_ISSYNC(fhp->fh_export)) {
- int err2 = nfsd_filemap_write_and_wait_range(nf, offset, end);
+ errseq_t since = READ_ONCE(nf->nf_file->f_wb_err);
+ int err2;
- down_write(&nf->nf_rwsem);
- if (!err2)
- err2 = vfs_fsync_range(nf->nf_file, offset, end, 0);
+ err2 = vfs_fsync_range(nf->nf_file, offset, end, 0);
switch (err2) {
case 0:
nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net,
nfsd_net_id));
+ err2 = filemap_check_wb_err(nf->nf_file->f_mapping,
+ since);
+ err = nfserrno(err2);
break;
case -EINVAL:
err = nfserr_notsupp;
break;
default:
- err = nfserrno(err2);
nfsd_reset_boot_verifier(net_generic(nf->nf_net,
nfsd_net_id));
+ err = nfserrno(err2);
}
- up_write(&nf->nf_rwsem);
} else
nfsd_copy_boot_verifier(verf, net_generic(nf->nf_net,
nfsd_net_id));
diff --git a/fs/nfsd/xdr.h b/fs/nfsd/xdr.h
index 0ff336b..b8cc6a4 100644
--- a/fs/nfsd/xdr.h
+++ b/fs/nfsd/xdr.h
@@ -33,7 +33,7 @@
struct nfsd_writeargs {
svc_fh fh;
__u32 offset;
- int len;
+ __u32 len;
struct kvec first;
};
diff --git a/fs/nilfs2/btnode.c b/fs/nilfs2/btnode.c
index 4391fd3..e00e184 100644
--- a/fs/nilfs2/btnode.c
+++ b/fs/nilfs2/btnode.c
@@ -20,6 +20,23 @@
#include "page.h"
#include "btnode.h"
+
+/**
+ * nilfs_init_btnc_inode - initialize B-tree node cache inode
+ * @btnc_inode: inode to be initialized
+ *
+ * nilfs_init_btnc_inode() sets up an inode for B-tree node cache.
+ */
+void nilfs_init_btnc_inode(struct inode *btnc_inode)
+{
+ struct nilfs_inode_info *ii = NILFS_I(btnc_inode);
+
+ btnc_inode->i_mode = S_IFREG;
+ ii->i_flags = 0;
+ memset(&ii->i_bmap_data, 0, sizeof(struct nilfs_bmap));
+ mapping_set_gfp_mask(btnc_inode->i_mapping, GFP_NOFS);
+}
+
void nilfs_btnode_cache_clear(struct address_space *btnc)
{
invalidate_mapping_pages(btnc, 0, -1);
@@ -29,7 +46,7 @@
struct buffer_head *
nilfs_btnode_create_block(struct address_space *btnc, __u64 blocknr)
{
- struct inode *inode = NILFS_BTNC_I(btnc);
+ struct inode *inode = btnc->host;
struct buffer_head *bh;
bh = nilfs_grab_buffer(inode, btnc, blocknr, BIT(BH_NILFS_Node));
@@ -57,7 +74,7 @@
struct buffer_head **pbh, sector_t *submit_ptr)
{
struct buffer_head *bh;
- struct inode *inode = NILFS_BTNC_I(btnc);
+ struct inode *inode = btnc->host;
struct page *page;
int err;
@@ -157,7 +174,7 @@
struct nilfs_btnode_chkey_ctxt *ctxt)
{
struct buffer_head *obh, *nbh;
- struct inode *inode = NILFS_BTNC_I(btnc);
+ struct inode *inode = btnc->host;
__u64 oldkey = ctxt->oldkey, newkey = ctxt->newkey;
int err;
diff --git a/fs/nilfs2/btnode.h b/fs/nilfs2/btnode.h
index 0f88dbc..05ab64d 100644
--- a/fs/nilfs2/btnode.h
+++ b/fs/nilfs2/btnode.h
@@ -30,6 +30,7 @@
struct buffer_head *newbh;
};
+void nilfs_init_btnc_inode(struct inode *btnc_inode);
void nilfs_btnode_cache_clear(struct address_space *);
struct buffer_head *nilfs_btnode_create_block(struct address_space *btnc,
__u64 blocknr);
diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c
index f42ab57..77efd69 100644
--- a/fs/nilfs2/btree.c
+++ b/fs/nilfs2/btree.c
@@ -58,7 +58,8 @@
static int nilfs_btree_get_new_block(const struct nilfs_bmap *btree,
__u64 ptr, struct buffer_head **bhp)
{
- struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache;
+ struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode;
+ struct address_space *btnc = btnc_inode->i_mapping;
struct buffer_head *bh;
bh = nilfs_btnode_create_block(btnc, ptr);
@@ -470,7 +471,8 @@
struct buffer_head **bhp,
const struct nilfs_btree_readahead_info *ra)
{
- struct address_space *btnc = &NILFS_BMAP_I(btree)->i_btnode_cache;
+ struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode;
+ struct address_space *btnc = btnc_inode->i_mapping;
struct buffer_head *bh, *ra_bh;
sector_t submit_ptr = 0;
int ret;
@@ -1742,6 +1744,10 @@
dat = nilfs_bmap_get_dat(btree);
}
+ ret = nilfs_attach_btree_node_cache(&NILFS_BMAP_I(btree)->vfs_inode);
+ if (ret < 0)
+ return ret;
+
ret = nilfs_bmap_prepare_alloc_ptr(btree, dreq, dat);
if (ret < 0)
return ret;
@@ -1914,7 +1920,7 @@
path[level].bp_ctxt.newkey = path[level].bp_newreq.bpr_ptr;
path[level].bp_ctxt.bh = path[level].bp_bh;
ret = nilfs_btnode_prepare_change_key(
- &NILFS_BMAP_I(btree)->i_btnode_cache,
+ NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping,
&path[level].bp_ctxt);
if (ret < 0) {
nilfs_dat_abort_update(dat,
@@ -1940,7 +1946,7 @@
if (buffer_nilfs_node(path[level].bp_bh)) {
nilfs_btnode_commit_change_key(
- &NILFS_BMAP_I(btree)->i_btnode_cache,
+ NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping,
&path[level].bp_ctxt);
path[level].bp_bh = path[level].bp_ctxt.bh;
}
@@ -1959,7 +1965,7 @@
&path[level].bp_newreq.bpr_req);
if (buffer_nilfs_node(path[level].bp_bh))
nilfs_btnode_abort_change_key(
- &NILFS_BMAP_I(btree)->i_btnode_cache,
+ NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping,
&path[level].bp_ctxt);
}
@@ -2135,7 +2141,8 @@
static void nilfs_btree_lookup_dirty_buffers(struct nilfs_bmap *btree,
struct list_head *listp)
{
- struct address_space *btcache = &NILFS_BMAP_I(btree)->i_btnode_cache;
+ struct inode *btnc_inode = NILFS_BMAP_I(btree)->i_assoc_inode;
+ struct address_space *btcache = btnc_inode->i_mapping;
struct list_head lists[NILFS_BTREE_LEVEL_MAX];
struct pagevec pvec;
struct buffer_head *bh, *head;
@@ -2189,12 +2196,12 @@
path[level].bp_ctxt.newkey = blocknr;
path[level].bp_ctxt.bh = *bh;
ret = nilfs_btnode_prepare_change_key(
- &NILFS_BMAP_I(btree)->i_btnode_cache,
+ NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping,
&path[level].bp_ctxt);
if (ret < 0)
return ret;
nilfs_btnode_commit_change_key(
- &NILFS_BMAP_I(btree)->i_btnode_cache,
+ NILFS_BMAP_I(btree)->i_assoc_inode->i_mapping,
&path[level].bp_ctxt);
*bh = path[level].bp_ctxt.bh;
}
@@ -2399,6 +2406,10 @@
if (nilfs_btree_root_broken(nilfs_btree_get_root(bmap), bmap->b_inode))
ret = -EIO;
+ else
+ ret = nilfs_attach_btree_node_cache(
+ &NILFS_BMAP_I(bmap)->vfs_inode);
+
return ret;
}
diff --git a/fs/nilfs2/dat.c b/fs/nilfs2/dat.c
index 8bccdf1..1a3d183 100644
--- a/fs/nilfs2/dat.c
+++ b/fs/nilfs2/dat.c
@@ -497,7 +497,9 @@
di = NILFS_DAT_I(dat);
lockdep_set_class(&di->mi.mi_sem, &dat_lock_key);
nilfs_palloc_setup_cache(dat, &di->palloc_cache);
- nilfs_mdt_setup_shadow_map(dat, &di->shadow);
+ err = nilfs_mdt_setup_shadow_map(dat, &di->shadow);
+ if (err)
+ goto failed;
err = nilfs_read_inode_common(dat, raw_inode);
if (err)
diff --git a/fs/nilfs2/gcinode.c b/fs/nilfs2/gcinode.c
index 4483204..aadea66 100644
--- a/fs/nilfs2/gcinode.c
+++ b/fs/nilfs2/gcinode.c
@@ -126,9 +126,10 @@
int nilfs_gccache_submit_read_node(struct inode *inode, sector_t pbn,
__u64 vbn, struct buffer_head **out_bh)
{
+ struct inode *btnc_inode = NILFS_I(inode)->i_assoc_inode;
int ret;
- ret = nilfs_btnode_submit_block(&NILFS_I(inode)->i_btnode_cache,
+ ret = nilfs_btnode_submit_block(btnc_inode->i_mapping,
vbn ? : pbn, pbn, REQ_OP_READ, 0,
out_bh, &pbn);
if (ret == -EEXIST) /* internal code (cache hit) */
@@ -170,7 +171,7 @@
ii->i_flags = 0;
nilfs_bmap_init_gc(ii->i_bmap);
- return 0;
+ return nilfs_attach_btree_node_cache(inode);
}
/**
@@ -185,7 +186,7 @@
ii = list_first_entry(head, struct nilfs_inode_info, i_dirty);
list_del_init(&ii->i_dirty);
truncate_inode_pages(&ii->vfs_inode.i_data, 0);
- nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+ nilfs_btnode_cache_clear(ii->i_assoc_inode->i_mapping);
iput(&ii->vfs_inode);
}
}
diff --git a/fs/nilfs2/inode.c b/fs/nilfs2/inode.c
index 745d371..fb594ed 100644
--- a/fs/nilfs2/inode.c
+++ b/fs/nilfs2/inode.c
@@ -29,12 +29,16 @@
* @cno: checkpoint number
* @root: pointer on NILFS root object (mounted checkpoint)
* @for_gc: inode for GC flag
+ * @for_btnc: inode for B-tree node cache flag
+ * @for_shadow: inode for shadowed page cache flag
*/
struct nilfs_iget_args {
u64 ino;
__u64 cno;
struct nilfs_root *root;
- int for_gc;
+ bool for_gc;
+ bool for_btnc;
+ bool for_shadow;
};
static int nilfs_iget_test(struct inode *inode, void *opaque);
@@ -314,7 +318,8 @@
unsigned long ino)
{
struct nilfs_iget_args args = {
- .ino = ino, .root = root, .cno = 0, .for_gc = 0
+ .ino = ino, .root = root, .cno = 0, .for_gc = false,
+ .for_btnc = false, .for_shadow = false
};
return insert_inode_locked4(inode, ino, nilfs_iget_test, &args);
@@ -327,6 +332,7 @@
struct inode *inode;
struct nilfs_inode_info *ii;
struct nilfs_root *root;
+ struct buffer_head *bh;
int err = -ENOMEM;
ino_t ino;
@@ -342,11 +348,25 @@
ii->i_state = BIT(NILFS_I_NEW);
ii->i_root = root;
- err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh);
+ err = nilfs_ifile_create_inode(root->ifile, &ino, &bh);
if (unlikely(err))
goto failed_ifile_create_inode;
/* reference count of i_bh inherits from nilfs_mdt_read_block() */
+ if (unlikely(ino < NILFS_USER_INO)) {
+ nilfs_warn(sb,
+ "inode bitmap is inconsistent for reserved inodes");
+ do {
+ brelse(bh);
+ err = nilfs_ifile_create_inode(root->ifile, &ino, &bh);
+ if (unlikely(err))
+ goto failed_ifile_create_inode;
+ } while (ino < NILFS_USER_INO);
+
+ nilfs_info(sb, "repaired inode bitmap for reserved inodes");
+ }
+ ii->i_bh = bh;
+
atomic64_inc(&root->inodes_count);
inode_init_owner(inode, dir, mode);
inode->i_ino = ino;
@@ -439,6 +459,8 @@
inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
+ if (nilfs_is_metadata_file_inode(inode) && !S_ISREG(inode->i_mode))
+ return -EIO; /* this inode is for metadata and corrupted */
if (inode->i_nlink == 0)
return -ESTALE; /* this inode is deleted */
@@ -527,6 +549,19 @@
return 0;
ii = NILFS_I(inode);
+ if (test_bit(NILFS_I_BTNC, &ii->i_state)) {
+ if (!args->for_btnc)
+ return 0;
+ } else if (args->for_btnc) {
+ return 0;
+ }
+ if (test_bit(NILFS_I_SHADOW, &ii->i_state)) {
+ if (!args->for_shadow)
+ return 0;
+ } else if (args->for_shadow) {
+ return 0;
+ }
+
if (!test_bit(NILFS_I_GCINODE, &ii->i_state))
return !args->for_gc;
@@ -538,15 +573,17 @@
struct nilfs_iget_args *args = opaque;
inode->i_ino = args->ino;
- if (args->for_gc) {
+ NILFS_I(inode)->i_cno = args->cno;
+ NILFS_I(inode)->i_root = args->root;
+ if (args->root && args->ino == NILFS_ROOT_INO)
+ nilfs_get_root(args->root);
+
+ if (args->for_gc)
NILFS_I(inode)->i_state = BIT(NILFS_I_GCINODE);
- NILFS_I(inode)->i_cno = args->cno;
- NILFS_I(inode)->i_root = NULL;
- } else {
- if (args->root && args->ino == NILFS_ROOT_INO)
- nilfs_get_root(args->root);
- NILFS_I(inode)->i_root = args->root;
- }
+ if (args->for_btnc)
+ NILFS_I(inode)->i_state |= BIT(NILFS_I_BTNC);
+ if (args->for_shadow)
+ NILFS_I(inode)->i_state |= BIT(NILFS_I_SHADOW);
return 0;
}
@@ -554,7 +591,8 @@
unsigned long ino)
{
struct nilfs_iget_args args = {
- .ino = ino, .root = root, .cno = 0, .for_gc = 0
+ .ino = ino, .root = root, .cno = 0, .for_gc = false,
+ .for_btnc = false, .for_shadow = false
};
return ilookup5(sb, ino, nilfs_iget_test, &args);
@@ -564,7 +602,8 @@
unsigned long ino)
{
struct nilfs_iget_args args = {
- .ino = ino, .root = root, .cno = 0, .for_gc = 0
+ .ino = ino, .root = root, .cno = 0, .for_gc = false,
+ .for_btnc = false, .for_shadow = false
};
return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
@@ -595,7 +634,8 @@
__u64 cno)
{
struct nilfs_iget_args args = {
- .ino = ino, .root = NULL, .cno = cno, .for_gc = 1
+ .ino = ino, .root = NULL, .cno = cno, .for_gc = true,
+ .for_btnc = false, .for_shadow = false
};
struct inode *inode;
int err;
@@ -615,6 +655,113 @@
return inode;
}
+/**
+ * nilfs_attach_btree_node_cache - attach a B-tree node cache to the inode
+ * @inode: inode object
+ *
+ * nilfs_attach_btree_node_cache() attaches a B-tree node cache to @inode,
+ * or does nothing if the inode already has it. This function allocates
+ * an additional inode to maintain page cache of B-tree nodes one-on-one.
+ *
+ * Return Value: On success, 0 is returned. On errors, one of the following
+ * negative error code is returned.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ */
+int nilfs_attach_btree_node_cache(struct inode *inode)
+{
+ struct nilfs_inode_info *ii = NILFS_I(inode);
+ struct inode *btnc_inode;
+ struct nilfs_iget_args args;
+
+ if (ii->i_assoc_inode)
+ return 0;
+
+ args.ino = inode->i_ino;
+ args.root = ii->i_root;
+ args.cno = ii->i_cno;
+ args.for_gc = test_bit(NILFS_I_GCINODE, &ii->i_state) != 0;
+ args.for_btnc = true;
+ args.for_shadow = test_bit(NILFS_I_SHADOW, &ii->i_state) != 0;
+
+ btnc_inode = iget5_locked(inode->i_sb, inode->i_ino, nilfs_iget_test,
+ nilfs_iget_set, &args);
+ if (unlikely(!btnc_inode))
+ return -ENOMEM;
+ if (btnc_inode->i_state & I_NEW) {
+ nilfs_init_btnc_inode(btnc_inode);
+ unlock_new_inode(btnc_inode);
+ }
+ NILFS_I(btnc_inode)->i_assoc_inode = inode;
+ NILFS_I(btnc_inode)->i_bmap = ii->i_bmap;
+ ii->i_assoc_inode = btnc_inode;
+
+ return 0;
+}
+
+/**
+ * nilfs_detach_btree_node_cache - detach the B-tree node cache from the inode
+ * @inode: inode object
+ *
+ * nilfs_detach_btree_node_cache() detaches the B-tree node cache and its
+ * holder inode bound to @inode, or does nothing if @inode doesn't have it.
+ */
+void nilfs_detach_btree_node_cache(struct inode *inode)
+{
+ struct nilfs_inode_info *ii = NILFS_I(inode);
+ struct inode *btnc_inode = ii->i_assoc_inode;
+
+ if (btnc_inode) {
+ NILFS_I(btnc_inode)->i_assoc_inode = NULL;
+ ii->i_assoc_inode = NULL;
+ iput(btnc_inode);
+ }
+}
+
+/**
+ * nilfs_iget_for_shadow - obtain inode for shadow mapping
+ * @inode: inode object that uses shadow mapping
+ *
+ * nilfs_iget_for_shadow() allocates a pair of inodes that holds page
+ * caches for shadow mapping. The page cache for data pages is set up
+ * in one inode and the one for b-tree node pages is set up in the
+ * other inode, which is attached to the former inode.
+ *
+ * Return Value: On success, a pointer to the inode for data pages is
+ * returned. On errors, one of the following negative error code is returned
+ * in a pointer type.
+ *
+ * %-ENOMEM - Insufficient memory available.
+ */
+struct inode *nilfs_iget_for_shadow(struct inode *inode)
+{
+ struct nilfs_iget_args args = {
+ .ino = inode->i_ino, .root = NULL, .cno = 0, .for_gc = false,
+ .for_btnc = false, .for_shadow = true
+ };
+ struct inode *s_inode;
+ int err;
+
+ s_inode = iget5_locked(inode->i_sb, inode->i_ino, nilfs_iget_test,
+ nilfs_iget_set, &args);
+ if (unlikely(!s_inode))
+ return ERR_PTR(-ENOMEM);
+ if (!(s_inode->i_state & I_NEW))
+ return inode;
+
+ NILFS_I(s_inode)->i_flags = 0;
+ memset(NILFS_I(s_inode)->i_bmap, 0, sizeof(struct nilfs_bmap));
+ mapping_set_gfp_mask(s_inode->i_mapping, GFP_NOFS);
+
+ err = nilfs_attach_btree_node_cache(s_inode);
+ if (unlikely(err)) {
+ iget_failed(s_inode);
+ return ERR_PTR(err);
+ }
+ unlock_new_inode(s_inode);
+ return s_inode;
+}
+
void nilfs_write_inode_common(struct inode *inode,
struct nilfs_inode *raw_inode, int has_bmap)
{
@@ -762,7 +909,8 @@
if (test_bit(NILFS_I_BMAP, &ii->i_state))
nilfs_bmap_clear(ii->i_bmap);
- nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+ if (!test_bit(NILFS_I_BTNC, &ii->i_state))
+ nilfs_detach_btree_node_cache(inode);
if (ii->i_root && inode->i_ino == NILFS_ROOT_INO)
nilfs_put_root(ii->i_root);
diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c
index c0361ce..e80ef2c 100644
--- a/fs/nilfs2/mdt.c
+++ b/fs/nilfs2/mdt.c
@@ -469,9 +469,18 @@
void nilfs_mdt_clear(struct inode *inode)
{
struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
+ struct nilfs_shadow_map *shadow = mdi->mi_shadow;
if (mdi->mi_palloc_cache)
nilfs_palloc_destroy_cache(inode);
+
+ if (shadow) {
+ struct inode *s_inode = shadow->inode;
+
+ shadow->inode = NULL;
+ iput(s_inode);
+ mdi->mi_shadow = NULL;
+ }
}
/**
@@ -505,12 +514,15 @@
struct nilfs_shadow_map *shadow)
{
struct nilfs_mdt_info *mi = NILFS_MDT(inode);
+ struct inode *s_inode;
INIT_LIST_HEAD(&shadow->frozen_buffers);
- address_space_init_once(&shadow->frozen_data);
- nilfs_mapping_init(&shadow->frozen_data, inode);
- address_space_init_once(&shadow->frozen_btnodes);
- nilfs_mapping_init(&shadow->frozen_btnodes, inode);
+
+ s_inode = nilfs_iget_for_shadow(inode);
+ if (IS_ERR(s_inode))
+ return PTR_ERR(s_inode);
+
+ shadow->inode = s_inode;
mi->mi_shadow = shadow;
return 0;
}
@@ -524,14 +536,15 @@
struct nilfs_mdt_info *mi = NILFS_MDT(inode);
struct nilfs_inode_info *ii = NILFS_I(inode);
struct nilfs_shadow_map *shadow = mi->mi_shadow;
+ struct inode *s_inode = shadow->inode;
int ret;
- ret = nilfs_copy_dirty_pages(&shadow->frozen_data, inode->i_mapping);
+ ret = nilfs_copy_dirty_pages(s_inode->i_mapping, inode->i_mapping);
if (ret)
goto out;
- ret = nilfs_copy_dirty_pages(&shadow->frozen_btnodes,
- &ii->i_btnode_cache);
+ ret = nilfs_copy_dirty_pages(NILFS_I(s_inode)->i_assoc_inode->i_mapping,
+ ii->i_assoc_inode->i_mapping);
if (ret)
goto out;
@@ -547,7 +560,7 @@
struct page *page;
int blkbits = inode->i_blkbits;
- page = grab_cache_page(&shadow->frozen_data, bh->b_page->index);
+ page = grab_cache_page(shadow->inode->i_mapping, bh->b_page->index);
if (!page)
return -ENOMEM;
@@ -579,7 +592,7 @@
struct page *page;
int n;
- page = find_lock_page(&shadow->frozen_data, bh->b_page->index);
+ page = find_lock_page(shadow->inode->i_mapping, bh->b_page->index);
if (page) {
if (page_has_buffers(page)) {
n = bh_offset(bh) >> inode->i_blkbits;
@@ -620,10 +633,11 @@
nilfs_palloc_clear_cache(inode);
nilfs_clear_dirty_pages(inode->i_mapping, true);
- nilfs_copy_back_pages(inode->i_mapping, &shadow->frozen_data);
+ nilfs_copy_back_pages(inode->i_mapping, shadow->inode->i_mapping);
- nilfs_clear_dirty_pages(&ii->i_btnode_cache, true);
- nilfs_copy_back_pages(&ii->i_btnode_cache, &shadow->frozen_btnodes);
+ nilfs_clear_dirty_pages(ii->i_assoc_inode->i_mapping, true);
+ nilfs_copy_back_pages(ii->i_assoc_inode->i_mapping,
+ NILFS_I(shadow->inode)->i_assoc_inode->i_mapping);
nilfs_bmap_restore(ii->i_bmap, &shadow->bmap_store);
@@ -638,10 +652,11 @@
{
struct nilfs_mdt_info *mi = NILFS_MDT(inode);
struct nilfs_shadow_map *shadow = mi->mi_shadow;
+ struct inode *shadow_btnc_inode = NILFS_I(shadow->inode)->i_assoc_inode;
down_write(&mi->mi_sem);
nilfs_release_frozen_buffers(shadow);
- truncate_inode_pages(&shadow->frozen_data, 0);
- truncate_inode_pages(&shadow->frozen_btnodes, 0);
+ truncate_inode_pages(shadow->inode->i_mapping, 0);
+ truncate_inode_pages(shadow_btnc_inode->i_mapping, 0);
up_write(&mi->mi_sem);
}
diff --git a/fs/nilfs2/mdt.h b/fs/nilfs2/mdt.h
index e77aea4..9d8ac0d 100644
--- a/fs/nilfs2/mdt.h
+++ b/fs/nilfs2/mdt.h
@@ -18,14 +18,12 @@
/**
* struct nilfs_shadow_map - shadow mapping of meta data file
* @bmap_store: shadow copy of bmap state
- * @frozen_data: shadowed dirty data pages
- * @frozen_btnodes: shadowed dirty b-tree nodes' pages
+ * @inode: holder of page caches used in shadow mapping
* @frozen_buffers: list of frozen buffers
*/
struct nilfs_shadow_map {
struct nilfs_bmap_store bmap_store;
- struct address_space frozen_data;
- struct address_space frozen_btnodes;
+ struct inode *inode;
struct list_head frozen_buffers;
};
diff --git a/fs/nilfs2/nilfs.h b/fs/nilfs2/nilfs.h
index f8450ee..ace27a8 100644
--- a/fs/nilfs2/nilfs.h
+++ b/fs/nilfs2/nilfs.h
@@ -28,7 +28,7 @@
* @i_xattr: <TODO>
* @i_dir_start_lookup: page index of last successful search
* @i_cno: checkpoint number for GC inode
- * @i_btnode_cache: cached pages of b-tree nodes
+ * @i_assoc_inode: associated inode (B-tree node cache holder or back pointer)
* @i_dirty: list for connecting dirty files
* @xattr_sem: semaphore for extended attributes processing
* @i_bh: buffer contains disk inode
@@ -43,7 +43,7 @@
__u64 i_xattr; /* sector_t ??? */
__u32 i_dir_start_lookup;
__u64 i_cno; /* check point number for GC inode */
- struct address_space i_btnode_cache;
+ struct inode *i_assoc_inode;
struct list_head i_dirty; /* List for connecting dirty files */
#ifdef CONFIG_NILFS_XATTR
@@ -75,13 +75,6 @@
return container_of(bmap, struct nilfs_inode_info, i_bmap_data);
}
-static inline struct inode *NILFS_BTNC_I(struct address_space *btnc)
-{
- struct nilfs_inode_info *ii =
- container_of(btnc, struct nilfs_inode_info, i_btnode_cache);
- return &ii->vfs_inode;
-}
-
/*
* Dynamic state flags of NILFS on-memory inode (i_state)
*/
@@ -98,6 +91,8 @@
NILFS_I_INODE_SYNC, /* dsync is not allowed for inode */
NILFS_I_BMAP, /* has bmap and btnode_cache */
NILFS_I_GCINODE, /* inode for GC, on memory only */
+ NILFS_I_BTNC, /* inode for btree node cache */
+ NILFS_I_SHADOW, /* inode for shadowed page cache */
};
/*
@@ -203,6 +198,9 @@
static inline int nilfs_init_acl(struct inode *inode, struct inode *dir)
{
+ if (S_ISLNK(inode->i_mode))
+ return 0;
+
inode->i_mode &= ~current_umask();
return 0;
}
@@ -264,6 +262,9 @@
unsigned long ino);
extern struct inode *nilfs_iget_for_gc(struct super_block *sb,
unsigned long ino, __u64 cno);
+int nilfs_attach_btree_node_cache(struct inode *inode);
+void nilfs_detach_btree_node_cache(struct inode *inode);
+struct inode *nilfs_iget_for_shadow(struct inode *inode);
extern void nilfs_update_inode(struct inode *, struct buffer_head *, int);
extern void nilfs_truncate(struct inode *);
extern void nilfs_evict_inode(struct inode *);
diff --git a/fs/nilfs2/page.c b/fs/nilfs2/page.c
index 171fb5c..d1a148f 100644
--- a/fs/nilfs2/page.c
+++ b/fs/nilfs2/page.c
@@ -448,10 +448,9 @@
/*
* NILFS2 needs clear_page_dirty() in the following two cases:
*
- * 1) For B-tree node pages and data pages of the dat/gcdat, NILFS2 clears
- * page dirty flags when it copies back pages from the shadow cache
- * (gcdat->{i_mapping,i_btnode_cache}) to its original cache
- * (dat->{i_mapping,i_btnode_cache}).
+ * 1) For B-tree node pages and data pages of DAT file, NILFS2 clears dirty
+ * flag of pages when it copies back pages from shadow cache to the
+ * original cache.
*
* 2) Some B-tree operations like insertion or deletion may dispose buffers
* in dirty state, and this needs to cancel the dirty state of their pages.
diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c
index e3726ac..5ee4973 100644
--- a/fs/nilfs2/segment.c
+++ b/fs/nilfs2/segment.c
@@ -322,7 +322,7 @@
struct the_nilfs *nilfs = sb->s_fs_info;
struct nilfs_sc_info *sci = nilfs->ns_writer;
- if (!sci || !sci->sc_flush_request)
+ if (sb_rdonly(sb) || unlikely(!sci) || !sci->sc_flush_request)
return;
set_bit(NILFS_SC_PRIOR_FLUSH, &sci->sc_flags);
@@ -738,15 +738,18 @@
struct list_head *listp)
{
struct nilfs_inode_info *ii = NILFS_I(inode);
- struct address_space *mapping = &ii->i_btnode_cache;
+ struct inode *btnc_inode = ii->i_assoc_inode;
struct pagevec pvec;
struct buffer_head *bh, *head;
unsigned int i;
pgoff_t index = 0;
+ if (!btnc_inode)
+ return;
+
pagevec_init(&pvec);
- while (pagevec_lookup_tag(&pvec, mapping, &index,
+ while (pagevec_lookup_tag(&pvec, btnc_inode->i_mapping, &index,
PAGECACHE_TAG_DIRTY)) {
for (i = 0; i < pagevec_count(&pvec); i++) {
bh = head = page_buffers(pvec.pages[i]);
@@ -877,9 +880,11 @@
nilfs_mdt_mark_dirty(nilfs->ns_cpfile);
nilfs_cpfile_put_checkpoint(
nilfs->ns_cpfile, nilfs->ns_cno, bh_cp);
- } else
- WARN_ON(err == -EINVAL || err == -ENOENT);
-
+ } else if (err == -EINVAL || err == -ENOENT) {
+ nilfs_error(sci->sc_super,
+ "checkpoint creation failed due to metadata corruption.");
+ err = -EIO;
+ }
return err;
}
@@ -893,7 +898,11 @@
err = nilfs_cpfile_get_checkpoint(nilfs->ns_cpfile, nilfs->ns_cno, 0,
&raw_cp, &bh_cp);
if (unlikely(err)) {
- WARN_ON(err == -EINVAL || err == -ENOENT);
+ if (err == -EINVAL || err == -ENOENT) {
+ nilfs_error(sci->sc_super,
+ "checkpoint finalization failed due to metadata corruption.");
+ err = -EIO;
+ }
goto failed_ibh;
}
raw_cp->cp_snapshot_list.ssl_next = 0;
@@ -2239,7 +2248,7 @@
struct nilfs_transaction_info *ti;
int err;
- if (!sci)
+ if (sb_rdonly(sb) || unlikely(!sci))
return -EROFS;
/* A call inside transactions causes a deadlock. */
@@ -2278,7 +2287,7 @@
struct nilfs_transaction_info ti;
int err = 0;
- if (!sci)
+ if (sb_rdonly(sb) || unlikely(!sci))
return -EROFS;
nilfs_transaction_lock(sb, &ti, 0);
@@ -2415,7 +2424,7 @@
continue;
list_del_init(&ii->i_dirty);
truncate_inode_pages(&ii->vfs_inode.i_data, 0);
- nilfs_btnode_cache_clear(&ii->i_btnode_cache);
+ nilfs_btnode_cache_clear(ii->i_assoc_inode->i_mapping);
iput(&ii->vfs_inode);
}
}
@@ -2774,11 +2783,12 @@
if (nilfs->ns_writer) {
/*
- * This happens if the filesystem was remounted
- * read/write after nilfs_error degenerated it into a
- * read-only mount.
+ * This happens if the filesystem is made read-only by
+ * __nilfs_error or nilfs_remount and then remounted
+ * read/write. In these cases, reuse the existing
+ * writer.
*/
- nilfs_detach_log_writer(sb);
+ return 0;
}
nilfs->ns_writer = nilfs_segctor_new(sb, root);
@@ -2788,10 +2798,9 @@
inode_attach_wb(nilfs->ns_bdev->bd_inode, NULL);
err = nilfs_segctor_start_thread(nilfs->ns_writer);
- if (err) {
- kfree(nilfs->ns_writer);
- nilfs->ns_writer = NULL;
- }
+ if (unlikely(err))
+ nilfs_detach_log_writer(sb);
+
return err;
}
diff --git a/fs/nilfs2/sufile.c b/fs/nilfs2/sufile.c
index 6372247..51f4cb0 100644
--- a/fs/nilfs2/sufile.c
+++ b/fs/nilfs2/sufile.c
@@ -495,14 +495,22 @@
int nilfs_sufile_mark_dirty(struct inode *sufile, __u64 segnum)
{
struct buffer_head *bh;
+ void *kaddr;
+ struct nilfs_segment_usage *su;
int ret;
+ down_write(&NILFS_MDT(sufile)->mi_sem);
ret = nilfs_sufile_get_segment_usage_block(sufile, segnum, 0, &bh);
if (!ret) {
mark_buffer_dirty(bh);
nilfs_mdt_mark_dirty(sufile);
+ kaddr = kmap_atomic(bh->b_page);
+ su = nilfs_sufile_block_get_segment_usage(sufile, segnum, bh, kaddr);
+ nilfs_segment_usage_set_dirty(su);
+ kunmap_atomic(kaddr);
brelse(bh);
}
+ up_write(&NILFS_MDT(sufile)->mi_sem);
return ret;
}
diff --git a/fs/nilfs2/super.c b/fs/nilfs2/super.c
index 4abd928..7a41c97 100644
--- a/fs/nilfs2/super.c
+++ b/fs/nilfs2/super.c
@@ -157,7 +157,8 @@
ii->i_bh = NULL;
ii->i_state = 0;
ii->i_cno = 0;
- nilfs_mapping_init(&ii->i_btnode_cache, &ii->vfs_inode);
+ ii->i_assoc_inode = NULL;
+ ii->i_bmap = &ii->i_bmap_data;
return &ii->vfs_inode;
}
@@ -1132,8 +1133,6 @@
if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb))
goto out;
if (*flags & SB_RDONLY) {
- /* Shutting down log writer */
- nilfs_detach_log_writer(sb);
sb->s_flags |= SB_RDONLY;
/*
@@ -1377,8 +1376,6 @@
#ifdef CONFIG_NILFS_XATTR
init_rwsem(&ii->xattr_sem);
#endif
- address_space_init_once(&ii->i_btnode_cache);
- ii->i_bmap = &ii->i_bmap_data;
inode_init_once(&ii->vfs_inode);
}
diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c
index c20ebec..ce103dd 100644
--- a/fs/nilfs2/the_nilfs.c
+++ b/fs/nilfs2/the_nilfs.c
@@ -690,9 +690,7 @@
{
unsigned long ncleansegs;
- down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
ncleansegs = nilfs_sufile_get_ncleansegs(nilfs->ns_sufile);
- up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
*nblocks = (sector_t)ncleansegs * nilfs->ns_blocks_per_segment;
return 0;
}
diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c
index f0d6b54..765b50a 100644
--- a/fs/notify/fdinfo.c
+++ b/fs/notify/fdinfo.c
@@ -83,16 +83,9 @@
inode_mark = container_of(mark, struct inotify_inode_mark, fsn_mark);
inode = igrab(fsnotify_conn_inode(mark->connector));
if (inode) {
- /*
- * IN_ALL_EVENTS represents all of the mask bits
- * that we expose to userspace. There is at
- * least one bit (FS_EVENT_ON_CHILD) which is
- * used only internally to the kernel.
- */
- u32 mask = mark->mask & IN_ALL_EVENTS;
- seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:%x ",
+ seq_printf(m, "inotify wd:%x ino:%lx sdev:%x mask:%x ignored_mask:0 ",
inode_mark->wd, inode->i_ino, inode->i_sb->s_dev,
- mask, mark->ignored_mask);
+ inotify_mark_user_mask(mark));
show_mark_fhandle(m, inode);
seq_putc(m, '\n');
iput(inode);
diff --git a/fs/notify/inotify/inotify.h b/fs/notify/inotify/inotify.h
index 2007e37..8f00151 100644
--- a/fs/notify/inotify/inotify.h
+++ b/fs/notify/inotify/inotify.h
@@ -22,6 +22,18 @@
return container_of(fse, struct inotify_event_info, fse);
}
+/*
+ * INOTIFY_USER_FLAGS represents all of the mask bits that we expose to
+ * userspace. There is at least one bit (FS_EVENT_ON_CHILD) which is
+ * used only internally to the kernel.
+ */
+#define INOTIFY_USER_MASK (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK)
+
+static inline __u32 inotify_mark_user_mask(struct fsnotify_mark *fsn_mark)
+{
+ return fsn_mark->mask & INOTIFY_USER_MASK;
+}
+
extern void inotify_ignored_and_remove_idr(struct fsnotify_mark *fsn_mark,
struct fsnotify_group *group);
extern int inotify_handle_inode_event(struct fsnotify_mark *inode_mark,
diff --git a/fs/notify/inotify/inotify_user.c b/fs/notify/inotify/inotify_user.c
index 5f6c6bf..32b6b97 100644
--- a/fs/notify/inotify/inotify_user.c
+++ b/fs/notify/inotify/inotify_user.c
@@ -88,7 +88,7 @@
mask |= FS_EVENT_ON_CHILD;
/* mask off the flags used to open the fd */
- mask |= (arg & (IN_ALL_EVENTS | IN_ONESHOT | IN_EXCL_UNLINK));
+ mask |= (arg & INOTIFY_USER_MASK);
return mask;
}
diff --git a/fs/notify/mark.c b/fs/notify/mark.c
index 8387937..5b44be5 100644
--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -430,7 +430,7 @@
void fsnotify_destroy_mark(struct fsnotify_mark *mark,
struct fsnotify_group *group)
{
- mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+ mutex_lock(&group->mark_mutex);
fsnotify_detach_mark(mark);
mutex_unlock(&group->mark_mutex);
fsnotify_free_mark(mark);
@@ -742,7 +742,7 @@
* move marks to free to to_free list in one go and then free marks in
* to_free list one by one.
*/
- mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+ mutex_lock(&group->mark_mutex);
list_for_each_entry_safe(mark, lmark, &group->marks_list, g_list) {
if ((1U << mark->connector->type) & type_mask)
list_move(&mark->g_list, &to_free);
@@ -751,7 +751,7 @@
clear:
while (1) {
- mutex_lock_nested(&group->mark_mutex, SINGLE_DEPTH_NESTING);
+ mutex_lock(&group->mark_mutex);
if (list_empty(head)) {
mutex_unlock(&group->mark_mutex);
break;
diff --git a/fs/ntfs/attrib.c b/fs/ntfs/attrib.c
index d563abc..c0881d3 100644
--- a/fs/ntfs/attrib.c
+++ b/fs/ntfs/attrib.c
@@ -592,15 +592,39 @@
a = (ATTR_RECORD*)((u8*)ctx->attr +
le32_to_cpu(ctx->attr->length));
for (;; a = (ATTR_RECORD*)((u8*)a + le32_to_cpu(a->length))) {
- if ((u8*)a < (u8*)ctx->mrec || (u8*)a > (u8*)ctx->mrec +
- le32_to_cpu(ctx->mrec->bytes_allocated))
+ u8 *mrec_end = (u8 *)ctx->mrec +
+ le32_to_cpu(ctx->mrec->bytes_allocated);
+ u8 *name_end;
+
+ /* check whether ATTR_RECORD wrap */
+ if ((u8 *)a < (u8 *)ctx->mrec)
break;
+
+ /* check whether Attribute Record Header is within bounds */
+ if ((u8 *)a > mrec_end ||
+ (u8 *)a + sizeof(ATTR_RECORD) > mrec_end)
+ break;
+
+ /* check whether ATTR_RECORD's name is within bounds */
+ name_end = (u8 *)a + le16_to_cpu(a->name_offset) +
+ a->name_length * sizeof(ntfschar);
+ if (name_end > mrec_end)
+ break;
+
ctx->attr = a;
if (unlikely(le32_to_cpu(a->type) > le32_to_cpu(type) ||
a->type == AT_END))
return -ENOENT;
if (unlikely(!a->length))
break;
+
+ /* check whether ATTR_RECORD's length wrap */
+ if ((u8 *)a + le32_to_cpu(a->length) < (u8 *)a)
+ break;
+ /* check whether ATTR_RECORD's length is within bounds */
+ if ((u8 *)a + le32_to_cpu(a->length) > mrec_end)
+ break;
+
if (a->type != type)
continue;
/*
diff --git a/fs/ntfs/inode.c b/fs/ntfs/inode.c
index ea18e4a..645c4b1 100644
--- a/fs/ntfs/inode.c
+++ b/fs/ntfs/inode.c
@@ -1829,6 +1829,13 @@
goto err_out;
}
+ /* Sanity check offset to the first attribute */
+ if (le16_to_cpu(m->attrs_offset) >= le32_to_cpu(m->bytes_allocated)) {
+ ntfs_error(sb, "Incorrect mft offset to the first attribute %u in superblock.",
+ le16_to_cpu(m->attrs_offset));
+ goto err_out;
+ }
+
/* Need this to sanity check attribute list references to $MFT. */
vi->i_generation = ni->seq_no = le16_to_cpu(m->sequence_number);
@@ -1881,6 +1888,10 @@
}
/* Now allocate memory for the attribute list. */
ni->attr_list_size = (u32)ntfs_attr_size(a);
+ if (!ni->attr_list_size) {
+ ntfs_error(sb, "Attr_list_size is zero");
+ goto put_err_out;
+ }
ni->attr_list = ntfs_malloc_nofs(ni->attr_list_size);
if (!ni->attr_list) {
ntfs_error(sb, "Not enough memory to allocate buffer "
diff --git a/fs/ntfs/super.c b/fs/ntfs/super.c
index 0d7e948..7f69422 100644
--- a/fs/ntfs/super.c
+++ b/fs/ntfs/super.c
@@ -2092,7 +2092,8 @@
// TODO: Initialize security.
/* Get the extended system files' directory inode. */
vol->extend_ino = ntfs_iget(sb, FILE_Extend);
- if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino)) {
+ if (IS_ERR(vol->extend_ino) || is_bad_inode(vol->extend_ino) ||
+ !S_ISDIR(vol->extend_ino->i_mode)) {
if (!IS_ERR(vol->extend_ino))
iput(vol->extend_ino);
ntfs_error(sb, "Failed to load $Extend.");
diff --git a/fs/ocfs2/dlmfs/userdlm.c b/fs/ocfs2/dlmfs/userdlm.c
index 339f098..8fa289d 100644
--- a/fs/ocfs2/dlmfs/userdlm.c
+++ b/fs/ocfs2/dlmfs/userdlm.c
@@ -435,6 +435,11 @@
}
spin_lock(&lockres->l_lock);
+ if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
+ spin_unlock(&lockres->l_lock);
+ status = -EAGAIN;
+ goto bail;
+ }
/* We only compare against the currently granted level
* here. If the lock is blocked waiting on a downconvert,
@@ -597,7 +602,7 @@
spin_lock(&lockres->l_lock);
if (lockres->l_flags & USER_LOCK_IN_TEARDOWN) {
spin_unlock(&lockres->l_lock);
- return 0;
+ goto bail;
}
lockres->l_flags |= USER_LOCK_IN_TEARDOWN;
@@ -611,12 +616,17 @@
}
if (lockres->l_ro_holders || lockres->l_ex_holders) {
+ lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN;
spin_unlock(&lockres->l_lock);
goto bail;
}
status = 0;
if (!(lockres->l_flags & USER_LOCK_ATTACHED)) {
+ /*
+ * lock is never requested, leave USER_LOCK_IN_TEARDOWN set
+ * to avoid new lock request coming in.
+ */
spin_unlock(&lockres->l_lock);
goto bail;
}
@@ -627,6 +637,10 @@
status = ocfs2_dlm_unlock(conn, &lockres->l_lksb, DLM_LKF_VALBLK);
if (status) {
+ spin_lock(&lockres->l_lock);
+ lockres->l_flags &= ~USER_LOCK_IN_TEARDOWN;
+ lockres->l_flags &= ~USER_LOCK_BUSY;
+ spin_unlock(&lockres->l_lock);
user_log_dlm_error("ocfs2_dlm_unlock", status, lockres);
goto bail;
}
diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c
index c46bf7f..856474b 100644
--- a/fs/ocfs2/namei.c
+++ b/fs/ocfs2/namei.c
@@ -231,6 +231,7 @@
handle_t *handle = NULL;
struct ocfs2_super *osb;
struct ocfs2_dinode *dirfe;
+ struct ocfs2_dinode *fe = NULL;
struct buffer_head *new_fe_bh = NULL;
struct inode *inode = NULL;
struct ocfs2_alloc_context *inode_ac = NULL;
@@ -381,6 +382,7 @@
goto leave;
}
+ fe = (struct ocfs2_dinode *) new_fe_bh->b_data;
if (S_ISDIR(mode)) {
status = ocfs2_fill_new_dir(osb, handle, dir, inode,
new_fe_bh, data_ac, meta_ac);
@@ -453,8 +455,11 @@
leave:
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
- if (handle)
+ if (handle) {
+ if (status < 0 && fe)
+ ocfs2_set_links_count(fe, 0);
ocfs2_commit_trans(osb, handle);
+ }
ocfs2_inode_unlock(dir, 1);
if (did_block_signals)
@@ -631,18 +636,9 @@
return status;
}
- status = __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
+ return __ocfs2_mknod_locked(dir, inode, dev, new_fe_bh,
parent_fe_bh, handle, inode_ac,
fe_blkno, suballoc_loc, suballoc_bit);
- if (status < 0) {
- u64 bg_blkno = ocfs2_which_suballoc_group(fe_blkno, suballoc_bit);
- int tmp = ocfs2_free_suballoc_bits(handle, inode_ac->ac_inode,
- inode_ac->ac_bh, suballoc_bit, bg_blkno, 1);
- if (tmp)
- mlog_errno(tmp);
- }
-
- return status;
}
static int ocfs2_mkdir(struct inode *dir,
@@ -2023,8 +2019,11 @@
ocfs2_clusters_to_bytes(osb->sb, 1));
if (status < 0 && did_quota_inode)
dquot_free_inode(inode);
- if (handle)
+ if (handle) {
+ if (status < 0 && fe)
+ ocfs2_set_links_count(fe, 0);
ocfs2_commit_trans(osb, handle);
+ }
ocfs2_inode_unlock(dir, 1);
if (did_block_signals)
diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h
index 7993d52..0a8cd8e 100644
--- a/fs/ocfs2/ocfs2.h
+++ b/fs/ocfs2/ocfs2.h
@@ -279,7 +279,6 @@
OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT = 1 << 15, /* Journal Async Commit */
OCFS2_MOUNT_ERRORS_CONT = 1 << 16, /* Return EIO to the calling process on error */
OCFS2_MOUNT_ERRORS_ROFS = 1 << 17, /* Change filesystem to read-only on error */
- OCFS2_MOUNT_NOCLUSTER = 1 << 18, /* No cluster aware filesystem mount */
};
#define OCFS2_OSB_SOFT_RO 0x0001
@@ -675,8 +674,7 @@
static inline int ocfs2_mount_local(struct ocfs2_super *osb)
{
- return ((osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT)
- || (osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER));
+ return (osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT);
}
static inline int ocfs2_uses_extended_slot_map(struct ocfs2_super *osb)
diff --git a/fs/ocfs2/slot_map.c b/fs/ocfs2/slot_map.c
index 4da0e4b..8caecee 100644
--- a/fs/ocfs2/slot_map.c
+++ b/fs/ocfs2/slot_map.c
@@ -254,16 +254,14 @@
int i, ret = -ENOSPC;
if ((preferred >= 0) && (preferred < si->si_num_slots)) {
- if (!si->si_slots[preferred].sl_valid ||
- !si->si_slots[preferred].sl_node_num) {
+ if (!si->si_slots[preferred].sl_valid) {
ret = preferred;
goto out;
}
}
for(i = 0; i < si->si_num_slots; i++) {
- if (!si->si_slots[i].sl_valid ||
- !si->si_slots[i].sl_node_num) {
+ if (!si->si_slots[i].sl_valid) {
ret = i;
break;
}
@@ -458,30 +456,24 @@
spin_lock(&osb->osb_lock);
ocfs2_update_slot_info(si);
- if (ocfs2_mount_local(osb))
- /* use slot 0 directly in local mode */
- slot = 0;
- else {
- /* search for ourselves first and take the slot if it already
- * exists. Perhaps we need to mark this in a variable for our
- * own journal recovery? Possibly not, though we certainly
- * need to warn to the user */
- slot = __ocfs2_node_num_to_slot(si, osb->node_num);
+ /* search for ourselves first and take the slot if it already
+ * exists. Perhaps we need to mark this in a variable for our
+ * own journal recovery? Possibly not, though we certainly
+ * need to warn to the user */
+ slot = __ocfs2_node_num_to_slot(si, osb->node_num);
+ if (slot < 0) {
+ /* if no slot yet, then just take 1st available
+ * one. */
+ slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
if (slot < 0) {
- /* if no slot yet, then just take 1st available
- * one. */
- slot = __ocfs2_find_empty_slot(si, osb->preferred_slot);
- if (slot < 0) {
- spin_unlock(&osb->osb_lock);
- mlog(ML_ERROR, "no free slots available!\n");
- status = -EINVAL;
- goto bail;
- }
- } else
- printk(KERN_INFO "ocfs2: Slot %d on device (%s) was "
- "already allocated to this node!\n",
- slot, osb->dev_str);
- }
+ spin_unlock(&osb->osb_lock);
+ mlog(ML_ERROR, "no free slots available!\n");
+ status = -EINVAL;
+ goto bail;
+ }
+ } else
+ printk(KERN_INFO "ocfs2: Slot %d on device (%s) was already "
+ "allocated to this node!\n", slot, osb->dev_str);
ocfs2_set_slot(si, slot, osb->node_num);
osb->slot_num = slot;
diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c
index 477ad05..c0e5f1b 100644
--- a/fs/ocfs2/super.c
+++ b/fs/ocfs2/super.c
@@ -175,7 +175,6 @@
Opt_dir_resv_level,
Opt_journal_async_commit,
Opt_err_cont,
- Opt_nocluster,
Opt_err,
};
@@ -209,7 +208,6 @@
{Opt_dir_resv_level, "dir_resv_level=%u"},
{Opt_journal_async_commit, "journal_async_commit"},
{Opt_err_cont, "errors=continue"},
- {Opt_nocluster, "nocluster"},
{Opt_err, NULL}
};
@@ -621,13 +619,6 @@
goto out;
}
- tmp = OCFS2_MOUNT_NOCLUSTER;
- if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
- ret = -EINVAL;
- mlog(ML_ERROR, "Cannot change nocluster option on remount\n");
- goto out;
- }
-
tmp = OCFS2_MOUNT_HB_LOCAL | OCFS2_MOUNT_HB_GLOBAL |
OCFS2_MOUNT_HB_NONE;
if ((osb->s_mount_opt & tmp) != (parsed_options.mount_opt & tmp)) {
@@ -868,7 +859,6 @@
}
if (ocfs2_userspace_stack(osb) &&
- !(osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) &&
strncmp(osb->osb_cluster_stack, mopt->cluster_stack,
OCFS2_STACK_LABEL_LEN)) {
mlog(ML_ERROR,
@@ -1149,11 +1139,6 @@
osb->s_mount_opt & OCFS2_MOUNT_DATA_WRITEBACK ? "writeback" :
"ordered");
- if ((osb->s_mount_opt & OCFS2_MOUNT_NOCLUSTER) &&
- !(osb->s_feature_incompat & OCFS2_FEATURE_INCOMPAT_LOCAL_MOUNT))
- printk(KERN_NOTICE "ocfs2: The shared device (%s) is mounted "
- "without cluster aware mode.\n", osb->dev_str);
-
atomic_set(&osb->vol_state, VOLUME_MOUNTED);
wake_up(&osb->osb_mount_event);
@@ -1460,9 +1445,6 @@
case Opt_journal_async_commit:
mopt->mount_opt |= OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT;
break;
- case Opt_nocluster:
- mopt->mount_opt |= OCFS2_MOUNT_NOCLUSTER;
- break;
default:
mlog(ML_ERROR,
"Unrecognized mount option \"%s\" "
@@ -1574,9 +1556,6 @@
if (opts & OCFS2_MOUNT_JOURNAL_ASYNC_COMMIT)
seq_printf(s, ",journal_async_commit");
- if (opts & OCFS2_MOUNT_NOCLUSTER)
- seq_printf(s, ",nocluster");
-
return 0;
}
diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c
index f469982..44118f0 100644
--- a/fs/overlayfs/export.c
+++ b/fs/overlayfs/export.c
@@ -257,7 +257,7 @@
return FILEID_INVALID;
dentry = d_find_any_alias(inode);
- if (WARN_ON(!dentry))
+ if (!dentry)
return FILEID_INVALID;
bytes = ovl_dentry_to_fid(dentry, fid, buflen);
diff --git a/fs/pipe.c b/fs/pipe.c
index 9f2ca1b..dbb090e 100644
--- a/fs/pipe.c
+++ b/fs/pipe.c
@@ -652,7 +652,7 @@
unsigned int head, tail;
/* Epoll has some historical nasty semantics, this enables them */
- pipe->poll_usage = 1;
+ WRITE_ONCE(pipe->poll_usage, true);
/*
* Reading pipe state only -- no need for acquiring the semaphore.
@@ -1244,30 +1244,33 @@
/*
* Resize the pipe ring to a number of slots.
+ *
+ * Note the pipe can be reduced in capacity, but only if the current
+ * occupancy doesn't exceed nr_slots; if it does, EBUSY will be
+ * returned instead.
*/
int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots)
{
struct pipe_buffer *bufs;
unsigned int head, tail, mask, n;
- /*
- * We can shrink the pipe, if arg is greater than the ring occupancy.
- * Since we don't expect a lot of shrink+grow operations, just free and
- * allocate again like we would do for growing. If the pipe currently
- * contains more buffers than arg, then return busy.
- */
- mask = pipe->ring_size - 1;
- head = pipe->head;
- tail = pipe->tail;
- n = pipe_occupancy(pipe->head, pipe->tail);
- if (nr_slots < n)
- return -EBUSY;
-
bufs = kcalloc(nr_slots, sizeof(*bufs),
GFP_KERNEL_ACCOUNT | __GFP_NOWARN);
if (unlikely(!bufs))
return -ENOMEM;
+ spin_lock_irq(&pipe->rd_wait.lock);
+ mask = pipe->ring_size - 1;
+ head = pipe->head;
+ tail = pipe->tail;
+
+ n = pipe_occupancy(head, tail);
+ if (nr_slots < n) {
+ spin_unlock_irq(&pipe->rd_wait.lock);
+ kfree(bufs);
+ return -EBUSY;
+ }
+
/*
* The pipe array wraps around, so just start the new one at zero
* and adjust the indices.
@@ -1299,6 +1302,8 @@
pipe->tail = tail;
pipe->head = head;
+ spin_unlock_irq(&pipe->rd_wait.lock);
+
/* This might have made more room for writers */
wake_up_interruptible(&pipe->wr_wait);
return 0;
diff --git a/fs/proc/bootconfig.c b/fs/proc/bootconfig.c
index ad31ec4..d82dae1 100644
--- a/fs/proc/bootconfig.c
+++ b/fs/proc/bootconfig.c
@@ -32,6 +32,8 @@
int ret = 0;
key = kzalloc(XBC_KEYLEN_MAX, GFP_KERNEL);
+ if (!key)
+ return -ENOMEM;
xbc_for_each_key_value(leaf, val) {
ret = xbc_node_compose_key(leaf, key, XBC_KEYLEN_MAX);
diff --git a/fs/proc/generic.c b/fs/proc/generic.c
index 09e4d8a..5898761 100644
--- a/fs/proc/generic.c
+++ b/fs/proc/generic.c
@@ -453,6 +453,9 @@
proc_set_user(ent, (*parent)->uid, (*parent)->gid);
ent->proc_dops = &proc_misc_dentry_ops;
+ /* Revalidate everything under /proc/${pid}/net */
+ if ((*parent)->proc_dops == &proc_net_dentry_ops)
+ pde_force_lookup(ent);
out:
return ent;
diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c
index e502414..4d2e64e 100644
--- a/fs/proc/kcore.c
+++ b/fs/proc/kcore.c
@@ -193,8 +193,6 @@
return 1;
p = pfn_to_page(pfn);
- if (!memmap_valid_within(pfn, p, page_zone(p)))
- return 1;
ent = kmalloc(sizeof(*ent), GFP_KERNEL);
if (!ent)
diff --git a/fs/proc/proc_net.c b/fs/proc/proc_net.c
index 1aa9236..707477e 100644
--- a/fs/proc/proc_net.c
+++ b/fs/proc/proc_net.c
@@ -362,6 +362,9 @@
proc_set_user(netd, uid, gid);
+ /* Seed dentry revalidation for /proc/${pid}/net */
+ pde_force_lookup(netd);
+
err = -EEXIST;
net_statd = proc_net_mkdir(net, "stat", netd);
if (!net_statd)
diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c
index ba98371..8b75a04 100644
--- a/fs/proc/task_mmu.c
+++ b/fs/proc/task_mmu.c
@@ -503,10 +503,12 @@
struct vm_area_struct *vma = walk->vma;
bool locked = !!(vma->vm_flags & VM_LOCKED);
struct page *page = NULL;
- bool migration = false;
+ bool migration = false, young = false, dirty = false;
if (pte_present(*pte)) {
page = vm_normal_page(vma, addr, *pte);
+ young = pte_young(*pte);
+ dirty = pte_dirty(*pte);
} else if (is_swap_pte(*pte)) {
swp_entry_t swpent = pte_to_swp_entry(*pte);
@@ -540,8 +542,7 @@
if (!page)
return;
- smaps_account(mss, page, false, pte_young(*pte), pte_dirty(*pte),
- locked, migration);
+ smaps_account(mss, page, false, young, dirty, locked, migration);
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -950,7 +951,7 @@
vma = vma->vm_next;
}
- show_vma_header_prefix(m, priv->mm->mmap->vm_start,
+ show_vma_header_prefix(m, priv->mm->mmap ? priv->mm->mmap->vm_start : 0,
last_vma_end, 0, 0, 0, 0);
seq_pad(m, ' ');
seq_puts(m, "[rollup]\n");
diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c
index b1ebf7b..ce03c3d 100644
--- a/fs/pstore/platform.c
+++ b/fs/pstore/platform.c
@@ -143,21 +143,22 @@
mod_timer(&pstore_timer, jiffies + msecs_to_jiffies(pstore_update_ms));
}
-/*
- * Should pstore_dump() wait for a concurrent pstore_dump()? If
- * not, the current pstore_dump() will report a failure to dump
- * and return.
- */
-static bool pstore_cannot_wait(enum kmsg_dump_reason reason)
+static bool pstore_cannot_block_path(enum kmsg_dump_reason reason)
{
- /* In NMI path, pstore shouldn't block regardless of reason. */
+ /*
+ * In case of NMI path, pstore shouldn't be blocked
+ * regardless of reason.
+ */
if (in_nmi())
return true;
switch (reason) {
/* In panic case, other cpus are stopped by smp_send_stop(). */
case KMSG_DUMP_PANIC:
- /* Emergency restart shouldn't be blocked. */
+ /*
+ * Emergency restart shouldn't be blocked by spinning on
+ * pstore_info::buf_lock.
+ */
case KMSG_DUMP_EMERG:
return true;
default:
@@ -388,21 +389,19 @@
unsigned long total = 0;
const char *why;
unsigned int part = 1;
+ unsigned long flags = 0;
int ret;
why = kmsg_dump_reason_str(reason);
- if (down_trylock(&psinfo->buf_lock)) {
- /* Failed to acquire lock: give up if we cannot wait. */
- if (pstore_cannot_wait(reason)) {
- pr_err("dump skipped in %s path: may corrupt error record\n",
- in_nmi() ? "NMI" : why);
+ if (pstore_cannot_block_path(reason)) {
+ if (!spin_trylock_irqsave(&psinfo->buf_lock, flags)) {
+ pr_err("dump skipped in %s path because of concurrent dump\n",
+ in_nmi() ? "NMI" : why);
return;
}
- if (down_interruptible(&psinfo->buf_lock)) {
- pr_err("could not grab semaphore?!\n");
- return;
- }
+ } else {
+ spin_lock_irqsave(&psinfo->buf_lock, flags);
}
oopscount++;
@@ -464,8 +463,7 @@
total += record.size;
part++;
}
-
- up(&psinfo->buf_lock);
+ spin_unlock_irqrestore(&psinfo->buf_lock, flags);
}
static struct kmsg_dumper pstore_dumper = {
@@ -591,7 +589,7 @@
psi->write_user = pstore_write_user_compat;
psinfo = psi;
mutex_init(&psinfo->read_mutex);
- sema_init(&psinfo->buf_lock, 1);
+ spin_lock_init(&psinfo->buf_lock);
if (psi->flags & PSTORE_FLAGS_DMESG)
allocate_buf_for_compression();
diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c
index 09fb845..65f123d 100644
--- a/fs/quota/dquot.c
+++ b/fs/quota/dquot.c
@@ -79,6 +79,7 @@
#include <linux/capability.h>
#include <linux/quotaops.h>
#include <linux/blkdev.h>
+#include <linux/sched/mm.h>
#include "../internal.h" /* ugh */
#include <linux/uaccess.h>
@@ -427,9 +428,11 @@
int dquot_acquire(struct dquot *dquot)
{
int ret = 0, ret2 = 0;
+ unsigned int memalloc;
struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
mutex_lock(&dquot->dq_lock);
+ memalloc = memalloc_nofs_save();
if (!test_bit(DQ_READ_B, &dquot->dq_flags)) {
ret = dqopt->ops[dquot->dq_id.type]->read_dqblk(dquot);
if (ret < 0)
@@ -460,6 +463,7 @@
smp_mb__before_atomic();
set_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out_iolock:
+ memalloc_nofs_restore(memalloc);
mutex_unlock(&dquot->dq_lock);
return ret;
}
@@ -471,9 +475,11 @@
int dquot_commit(struct dquot *dquot)
{
int ret = 0;
+ unsigned int memalloc;
struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
mutex_lock(&dquot->dq_lock);
+ memalloc = memalloc_nofs_save();
if (!clear_dquot_dirty(dquot))
goto out_lock;
/* Inactive dquot can be only if there was error during read/init
@@ -483,6 +489,7 @@
else
ret = -EIO;
out_lock:
+ memalloc_nofs_restore(memalloc);
mutex_unlock(&dquot->dq_lock);
return ret;
}
@@ -494,9 +501,11 @@
int dquot_release(struct dquot *dquot)
{
int ret = 0, ret2 = 0;
+ unsigned int memalloc;
struct quota_info *dqopt = sb_dqopt(dquot->dq_sb);
mutex_lock(&dquot->dq_lock);
+ memalloc = memalloc_nofs_save();
/* Check whether we are not racing with some other dqget() */
if (dquot_is_busy(dquot))
goto out_dqlock;
@@ -512,6 +521,7 @@
}
clear_bit(DQ_ACTIVE_B, &dquot->dq_flags);
out_dqlock:
+ memalloc_nofs_restore(memalloc);
mutex_unlock(&dquot->dq_lock);
return ret;
}
diff --git a/fs/quota/quota_tree.c b/fs/quota/quota_tree.c
index 1a188fb..07948f6 100644
--- a/fs/quota/quota_tree.c
+++ b/fs/quota/quota_tree.c
@@ -80,6 +80,35 @@
return ret;
}
+static inline int do_check_range(struct super_block *sb, const char *val_name,
+ uint val, uint min_val, uint max_val)
+{
+ if (val < min_val || val > max_val) {
+ quota_error(sb, "Getting %s %u out of range %u-%u",
+ val_name, val, min_val, max_val);
+ return -EUCLEAN;
+ }
+
+ return 0;
+}
+
+static int check_dquot_block_header(struct qtree_mem_dqinfo *info,
+ struct qt_disk_dqdbheader *dh)
+{
+ int err = 0;
+
+ err = do_check_range(info->dqi_sb, "dqdh_next_free",
+ le32_to_cpu(dh->dqdh_next_free), 0,
+ info->dqi_blocks - 1);
+ if (err)
+ return err;
+ err = do_check_range(info->dqi_sb, "dqdh_prev_free",
+ le32_to_cpu(dh->dqdh_prev_free), 0,
+ info->dqi_blocks - 1);
+
+ return err;
+}
+
/* Remove empty block from list and return it */
static int get_free_dqblk(struct qtree_mem_dqinfo *info)
{
@@ -94,6 +123,9 @@
ret = read_blk(info, blk, buf);
if (ret < 0)
goto out_buf;
+ ret = check_dquot_block_header(info, dh);
+ if (ret)
+ goto out_buf;
info->dqi_free_blk = le32_to_cpu(dh->dqdh_next_free);
}
else {
@@ -241,6 +273,9 @@
*err = read_blk(info, blk, buf);
if (*err < 0)
goto out_buf;
+ *err = check_dquot_block_header(info, dh);
+ if (*err)
+ goto out_buf;
} else {
blk = get_free_dqblk(info);
if ((int)blk < 0) {
@@ -433,6 +468,9 @@
goto out_buf;
}
dh = (struct qt_disk_dqdbheader *)buf;
+ ret = check_dquot_block_header(info, dh);
+ if (ret)
+ goto out_buf;
le16_add_cpu(&dh->dqdh_entries, -1);
if (!le16_to_cpu(dh->dqdh_entries)) { /* Block got free? */
ret = remove_free_dqentry(info, buf, blk);
diff --git a/fs/remap_range.c b/fs/remap_range.c
index e6099be..e8e00e2 100644
--- a/fs/remap_range.c
+++ b/fs/remap_range.c
@@ -71,7 +71,8 @@
* Otherwise, make sure the count is also block-aligned, having
* already confirmed the starting offsets' block alignment.
*/
- if (pos_in + count == size_in) {
+ if (pos_in + count == size_in &&
+ (!(remap_flags & REMAP_FILE_DEDUP) || pos_out + count == size_out)) {
bcount = ALIGN(size_in, bs) - pos_in;
} else {
if (!IS_ALIGNED(count, bs))
diff --git a/fs/signalfd.c b/fs/signalfd.c
index b94fb5f..41dc597 100644
--- a/fs/signalfd.c
+++ b/fs/signalfd.c
@@ -248,6 +248,7 @@
.poll = signalfd_poll,
.read = signalfd_read,
.llseek = noop_llseek,
+ .may_pollfree = true,
};
static int do_signalfd4(int ufd, sigset_t *mask, int flags)
diff --git a/fs/stat.c b/fs/stat.c
index 1196af4..04550c0 100644
--- a/fs/stat.c
+++ b/fs/stat.c
@@ -306,9 +306,6 @@
# define choose_32_64(a,b) b
#endif
-#define valid_dev(x) choose_32_64(old_valid_dev(x),true)
-#define encode_dev(x) choose_32_64(old_encode_dev,new_encode_dev)(x)
-
#ifndef INIT_STRUCT_STAT_PADDING
# define INIT_STRUCT_STAT_PADDING(st) memset(&st, 0, sizeof(st))
#endif
@@ -317,7 +314,9 @@
{
struct stat tmp;
- if (!valid_dev(stat->dev) || !valid_dev(stat->rdev))
+ if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev))
+ return -EOVERFLOW;
+ if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev))
return -EOVERFLOW;
#if BITS_PER_LONG == 32
if (stat->size > MAX_NON_LFS)
@@ -325,7 +324,7 @@
#endif
INIT_STRUCT_STAT_PADDING(tmp);
- tmp.st_dev = encode_dev(stat->dev);
+ tmp.st_dev = new_encode_dev(stat->dev);
tmp.st_ino = stat->ino;
if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
return -EOVERFLOW;
@@ -335,7 +334,7 @@
return -EOVERFLOW;
SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
- tmp.st_rdev = encode_dev(stat->rdev);
+ tmp.st_rdev = new_encode_dev(stat->rdev);
tmp.st_size = stat->size;
tmp.st_atime = stat->atime.tv_sec;
tmp.st_mtime = stat->mtime.tv_sec;
@@ -616,11 +615,13 @@
{
struct compat_stat tmp;
- if (!old_valid_dev(stat->dev) || !old_valid_dev(stat->rdev))
+ if (sizeof(tmp.st_dev) < 4 && !old_valid_dev(stat->dev))
+ return -EOVERFLOW;
+ if (sizeof(tmp.st_rdev) < 4 && !old_valid_dev(stat->rdev))
return -EOVERFLOW;
memset(&tmp, 0, sizeof(tmp));
- tmp.st_dev = old_encode_dev(stat->dev);
+ tmp.st_dev = new_encode_dev(stat->dev);
tmp.st_ino = stat->ino;
if (sizeof(tmp.st_ino) < sizeof(stat->ino) && tmp.st_ino != stat->ino)
return -EOVERFLOW;
@@ -630,7 +631,7 @@
return -EOVERFLOW;
SET_UID(tmp.st_uid, from_kuid_munged(current_user_ns(), stat->uid));
SET_GID(tmp.st_gid, from_kgid_munged(current_user_ns(), stat->gid));
- tmp.st_rdev = old_encode_dev(stat->rdev);
+ tmp.st_rdev = new_encode_dev(stat->rdev);
if ((u64) stat->size > MAX_NON_LFS)
return -EOVERFLOW;
tmp.st_size = stat->size;
diff --git a/fs/super.c b/fs/super.c
index bae3fe8..7629f9d 100644
--- a/fs/super.c
+++ b/fs/super.c
@@ -293,7 +293,7 @@
WARN_ON(s->s_inode_lru.node);
WARN_ON(!list_empty(&s->s_mounts));
security_sb_free(s);
- fscrypt_sb_free(s);
+ fscrypt_destroy_keyring(s);
put_user_ns(s->s_user_ns);
kfree(s->s_subtype);
call_rcu(&s->rcu, destroy_super_rcu);
@@ -454,6 +454,7 @@
evict_inodes(sb);
/* only nonzero refcount inodes can have marks */
fsnotify_sb_delete(sb);
+ fscrypt_destroy_keyring(sb);
if (sb->s_dio_done_wq) {
destroy_workqueue(sb->s_dio_done_wq);
diff --git a/fs/sync.c b/fs/sync.c
index 1373a61..79180e5 100644
--- a/fs/sync.c
+++ b/fs/sync.c
@@ -22,32 +22,13 @@
SYNC_FILE_RANGE_WAIT_AFTER)
/*
- * Do the filesystem syncing work. For simple filesystems
- * writeback_inodes_sb(sb) just dirties buffers with inodes so we have to
- * submit IO for these buffers via __sync_blockdev(). This also speeds up the
- * wait == 1 case since in that case write_inode() functions do
- * sync_dirty_buffer() and thus effectively write one block at a time.
- */
-static int __sync_filesystem(struct super_block *sb, int wait)
-{
- if (wait)
- sync_inodes_sb(sb);
- else
- writeback_inodes_sb(sb, WB_REASON_SYNC);
-
- if (sb->s_op->sync_fs)
- sb->s_op->sync_fs(sb, wait);
- return __sync_blockdev(sb->s_bdev, wait);
-}
-
-/*
* Write out and wait upon all dirty data associated with this
* superblock. Filesystem data as well as the underlying block
* device. Takes the superblock lock.
*/
int sync_filesystem(struct super_block *sb)
{
- int ret;
+ int ret = 0;
/*
* We need to be protected against the filesystem going from
@@ -61,10 +42,31 @@
if (sb_rdonly(sb))
return 0;
- ret = __sync_filesystem(sb, 0);
- if (ret < 0)
+ /*
+ * Do the filesystem syncing work. For simple filesystems
+ * writeback_inodes_sb(sb) just dirties buffers with inodes so we have
+ * to submit I/O for these buffers via __sync_blockdev(). This also
+ * speeds up the wait == 1 case since in that case write_inode()
+ * methods call sync_dirty_buffer() and thus effectively write one block
+ * at a time.
+ */
+ writeback_inodes_sb(sb, WB_REASON_SYNC);
+ if (sb->s_op->sync_fs) {
+ ret = sb->s_op->sync_fs(sb, 0);
+ if (ret)
+ return ret;
+ }
+ ret = __sync_blockdev(sb->s_bdev, 0);
+ if (ret)
return ret;
- return __sync_filesystem(sb, 1);
+
+ sync_inodes_sb(sb);
+ if (sb->s_op->sync_fs) {
+ ret = sb->s_op->sync_fs(sb, 1);
+ if (ret)
+ return ret;
+ }
+ return __sync_blockdev(sb->s_bdev, 1);
}
EXPORT_SYMBOL(sync_filesystem);
diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c
index 8b7315c..4b70571 100644
--- a/fs/tracefs/inode.c
+++ b/fs/tracefs/inode.c
@@ -139,6 +139,8 @@
kuid_t uid;
kgid_t gid;
umode_t mode;
+ /* Opt_* bitfield. */
+ unsigned int opts;
};
enum {
@@ -239,6 +241,7 @@
kgid_t gid;
char *p;
+ opts->opts = 0;
opts->mode = TRACEFS_DEFAULT_MODE;
while ((p = strsep(&data, ",")) != NULL) {
@@ -273,24 +276,36 @@
* but traditionally tracefs has ignored all mount options
*/
}
+
+ opts->opts |= BIT(token);
}
return 0;
}
-static int tracefs_apply_options(struct super_block *sb)
+static int tracefs_apply_options(struct super_block *sb, bool remount)
{
struct tracefs_fs_info *fsi = sb->s_fs_info;
struct inode *inode = sb->s_root->d_inode;
struct tracefs_mount_opts *opts = &fsi->mount_opts;
- inode->i_mode &= ~S_IALLUGO;
- inode->i_mode |= opts->mode;
+ /*
+ * On remount, only reset mode/uid/gid if they were provided as mount
+ * options.
+ */
- inode->i_uid = opts->uid;
+ if (!remount || opts->opts & BIT(Opt_mode)) {
+ inode->i_mode &= ~S_IALLUGO;
+ inode->i_mode |= opts->mode;
+ }
- /* Set all the group ids to the mount option */
- set_gid(sb->s_root, opts->gid);
+ if (!remount || opts->opts & BIT(Opt_uid))
+ inode->i_uid = opts->uid;
+
+ if (!remount || opts->opts & BIT(Opt_gid)) {
+ /* Set all the group ids to the mount option */
+ set_gid(sb->s_root, opts->gid);
+ }
return 0;
}
@@ -305,7 +320,7 @@
if (err)
goto fail;
- tracefs_apply_options(sb);
+ tracefs_apply_options(sb, true);
fail:
return err;
@@ -357,7 +372,7 @@
sb->s_op = &tracefs_super_operations;
- tracefs_apply_options(sb);
+ tracefs_apply_options(sb, false);
return 0;
diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c
index ad90a3a..9257ee8 100644
--- a/fs/ubifs/dir.c
+++ b/fs/ubifs/dir.c
@@ -353,15 +353,18 @@
{
struct inode *inode;
struct ubifs_info *c = dir->i_sb->s_fs_info;
- struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1};
+ struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+ .dirtied_ino = 1};
struct ubifs_budget_req ino_req = { .dirtied_ino = 1 };
struct ubifs_inode *ui, *dir_ui = ubifs_inode(dir);
int err, instantiated = 0;
struct fscrypt_name nm;
/*
- * Budget request settings: new dirty inode, new direntry,
- * budget for dirtied inode will be released via writeback.
+ * Budget request settings: new inode, new direntry, changing the
+ * parent directory inode.
+ * Allocate budget separately for new dirtied inode, the budget will
+ * be released via writeback.
*/
dbg_gen("dent '%pd', mode %#hx in dir ino %lu",
@@ -431,6 +434,8 @@
make_bad_inode(inode);
if (!instantiated)
iput(inode);
+ else if (whiteout)
+ iput(*whiteout);
out_budg:
ubifs_release_budget(c, &req);
if (!instantiated)
@@ -947,7 +952,8 @@
struct ubifs_inode *dir_ui = ubifs_inode(dir);
struct ubifs_info *c = dir->i_sb->s_fs_info;
int err, sz_change;
- struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1 };
+ struct ubifs_budget_req req = { .new_ino = 1, .new_dent = 1,
+ .dirtied_ino = 1};
struct fscrypt_name nm;
/*
@@ -1322,6 +1328,7 @@
if (flags & RENAME_WHITEOUT) {
union ubifs_dev_desc *dev = NULL;
+ struct ubifs_budget_req wht_req;
dev = kmalloc(sizeof(union ubifs_dev_desc), GFP_NOFS);
if (!dev) {
@@ -1343,6 +1350,23 @@
whiteout_ui->data = dev;
whiteout_ui->data_len = ubifs_encode_dev(dev, MKDEV(0, 0));
ubifs_assert(c, !whiteout_ui->dirty);
+
+ memset(&wht_req, 0, sizeof(struct ubifs_budget_req));
+ wht_req.dirtied_ino = 1;
+ wht_req.dirtied_ino_d = ALIGN(whiteout_ui->data_len, 8);
+ /*
+ * To avoid deadlock between space budget (holds ui_mutex and
+ * waits wb work) and writeback work(waits ui_mutex), do space
+ * budget before ubifs inodes locked.
+ */
+ err = ubifs_budget_space(c, &wht_req);
+ if (err) {
+ iput(whiteout);
+ goto out_release;
+ }
+
+ /* Add the old_dentry size to the old_dir size. */
+ old_sz -= CALC_DENT_SIZE(fname_len(&old_nm));
}
lock_4_inodes(old_dir, new_dir, new_inode, whiteout);
@@ -1417,18 +1441,6 @@
}
if (whiteout) {
- struct ubifs_budget_req wht_req = { .dirtied_ino = 1,
- .dirtied_ino_d = \
- ALIGN(ubifs_inode(whiteout)->data_len, 8) };
-
- err = ubifs_budget_space(c, &wht_req);
- if (err) {
- kfree(whiteout_ui->data);
- whiteout_ui->data_len = 0;
- iput(whiteout);
- goto out_release;
- }
-
inc_nlink(whiteout);
mark_inode_dirty(whiteout);
diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c
index f4826b6..354457e 100644
--- a/fs/ubifs/file.c
+++ b/fs/ubifs/file.c
@@ -570,7 +570,7 @@
}
if (!PagePrivate(page)) {
- SetPagePrivate(page);
+ attach_page_private(page, (void *)1);
atomic_long_inc(&c->dirty_pg_cnt);
__set_page_dirty_nobuffers(page);
}
@@ -947,7 +947,7 @@
release_existing_page_budget(c);
atomic_long_dec(&c->dirty_pg_cnt);
- ClearPagePrivate(page);
+ detach_page_private(page);
ClearPageChecked(page);
kunmap(page);
@@ -1303,7 +1303,7 @@
release_existing_page_budget(c);
atomic_long_dec(&c->dirty_pg_cnt);
- ClearPagePrivate(page);
+ detach_page_private(page);
ClearPageChecked(page);
}
@@ -1470,8 +1470,8 @@
return rc;
if (PagePrivate(page)) {
- ClearPagePrivate(page);
- SetPagePrivate(newpage);
+ detach_page_private(page);
+ attach_page_private(newpage, (void *)1);
}
if (mode != MIGRATE_SYNC_NO_COPY)
@@ -1495,7 +1495,7 @@
return 0;
ubifs_assert(c, PagePrivate(page));
ubifs_assert(c, 0);
- ClearPagePrivate(page);
+ detach_page_private(page);
ClearPageChecked(page);
return 1;
}
@@ -1566,7 +1566,7 @@
else {
if (!PageChecked(page))
ubifs_convert_page_budget(c);
- SetPagePrivate(page);
+ attach_page_private(page, (void *)1);
atomic_long_inc(&c->dirty_pg_cnt);
__set_page_dirty_nobuffers(page);
}
diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c
index eae9cf5..89b671a 100644
--- a/fs/ubifs/io.c
+++ b/fs/ubifs/io.c
@@ -846,16 +846,42 @@
*/
n = aligned_len >> c->max_write_shift;
if (n) {
- n <<= c->max_write_shift;
+ int m = n - 1;
+
dbg_io("write %d bytes to LEB %d:%d", n, wbuf->lnum,
wbuf->offs);
- err = ubifs_leb_write(c, wbuf->lnum, buf + written,
- wbuf->offs, n);
+
+ if (m) {
+ /* '(n-1)<<c->max_write_shift < len' is always true. */
+ m <<= c->max_write_shift;
+ err = ubifs_leb_write(c, wbuf->lnum, buf + written,
+ wbuf->offs, m);
+ if (err)
+ goto out;
+ wbuf->offs += m;
+ aligned_len -= m;
+ len -= m;
+ written += m;
+ }
+
+ /*
+ * The non-written len of buf may be less than 'n' because
+ * parameter 'len' is not 8 bytes aligned, so here we read
+ * min(len, n) bytes from buf.
+ */
+ n = 1 << c->max_write_shift;
+ memcpy(wbuf->buf, buf + written, min(len, n));
+ if (n > len) {
+ ubifs_assert(c, n - len < 8);
+ ubifs_pad(c, wbuf->buf + len, n - len);
+ }
+
+ err = ubifs_leb_write(c, wbuf->lnum, wbuf->buf, wbuf->offs, n);
if (err)
goto out;
wbuf->offs += n;
aligned_len -= n;
- len -= n;
+ len -= min(len, n);
written += n;
}
diff --git a/fs/ubifs/ioctl.c b/fs/ubifs/ioctl.c
index 4363d85..8db380a 100644
--- a/fs/ubifs/ioctl.c
+++ b/fs/ubifs/ioctl.c
@@ -107,7 +107,7 @@
struct ubifs_inode *ui = ubifs_inode(inode);
struct ubifs_info *c = inode->i_sb->s_fs_info;
struct ubifs_budget_req req = { .dirtied_ino = 1,
- .dirtied_ino_d = ui->data_len };
+ .dirtied_ino_d = ALIGN(ui->data_len, 8) };
err = ubifs_budget_space(c, &req);
if (err)
diff --git a/fs/udf/namei.c b/fs/udf/namei.c
index 9f3aced..aff5ca3 100644
--- a/fs/udf/namei.c
+++ b/fs/udf/namei.c
@@ -241,7 +241,7 @@
poffset - lfi);
else {
if (!copy_name) {
- copy_name = kmalloc(UDF_NAME_LEN,
+ copy_name = kmalloc(UDF_NAME_LEN_CS0,
GFP_NOFS);
if (!copy_name) {
fi = ERR_PTR(-ENOMEM);
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c
index aef0da5..a3074a9 100644
--- a/fs/userfaultfd.c
+++ b/fs/userfaultfd.c
@@ -974,7 +974,7 @@
int fd;
fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, new,
- O_RDWR | (new->flags & UFFD_SHARED_FCNTL_FLAGS));
+ O_RDONLY | (new->flags & UFFD_SHARED_FCNTL_FLAGS));
if (fd < 0)
return fd;
@@ -1987,7 +1987,7 @@
mmgrab(ctx->mm);
fd = anon_inode_getfd("[userfaultfd]", &userfaultfd_fops, ctx,
- O_RDWR | (flags & UFFD_SHARED_FCNTL_FLAGS));
+ O_RDONLY | (flags & UFFD_SHARED_FCNTL_FLAGS));
if (fd < 0) {
mmdrop(ctx->mm);
kmem_cache_free(userfaultfd_ctx_cachep, ctx);
diff --git a/fs/xfs/libxfs/xfs_attr.c b/fs/xfs/libxfs/xfs_attr.c
index 96ac7e5..fcca36b 100644
--- a/fs/xfs/libxfs/xfs_attr.c
+++ b/fs/xfs/libxfs/xfs_attr.c
@@ -876,21 +876,18 @@
state = xfs_da_state_alloc(args);
if (statep != NULL)
- *statep = NULL;
+ *statep = state;
/*
* Search to see if name exists, and get back a pointer to it.
*/
error = xfs_da3_node_lookup_int(state, &retval);
- if (error) {
- xfs_da_state_free(state);
- return error;
- }
+ if (error)
+ retval = error;
- if (statep != NULL)
- *statep = state;
- else
+ if (!statep)
xfs_da_state_free(state);
+
return retval;
}
diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c
index d9a6924..de9c27e 100644
--- a/fs/xfs/libxfs/xfs_bmap.c
+++ b/fs/xfs/libxfs/xfs_bmap.c
@@ -6229,6 +6229,11 @@
xfs_fsblock_t endfsb;
bool isrt;
+ if (irec->br_startblock + irec->br_blockcount <= irec->br_startblock)
+ return __this_address;
+ if (irec->br_startoff + irec->br_blockcount <= irec->br_startoff)
+ return __this_address;
+
isrt = XFS_IS_REALTIME_INODE(ip);
endfsb = irec->br_startblock + irec->br_blockcount - 1;
if (isrt && whichfork == XFS_DATA_FORK) {
diff --git a/fs/xfs/libxfs/xfs_btree.c b/fs/xfs/libxfs/xfs_btree.c
index 2d25bab..24c7d30 100644
--- a/fs/xfs/libxfs/xfs_btree.c
+++ b/fs/xfs/libxfs/xfs_btree.c
@@ -353,20 +353,17 @@
*/
void
xfs_btree_del_cursor(
- xfs_btree_cur_t *cur, /* btree cursor */
- int error) /* del because of error */
+ struct xfs_btree_cur *cur, /* btree cursor */
+ int error) /* del because of error */
{
- int i; /* btree level */
+ int i; /* btree level */
/*
- * Clear the buffer pointers, and release the buffers.
- * If we're doing this in the face of an error, we
- * need to make sure to inspect all of the entries
- * in the bc_bufs array for buffers to be unlocked.
- * This is because some of the btree code works from
- * level n down to 0, and if we get an error along
- * the way we won't have initialized all the entries
- * down to 0.
+ * Clear the buffer pointers and release the buffers. If we're doing
+ * this because of an error, inspect all of the entries in the bc_bufs
+ * array for buffers to be unlocked. This is because some of the btree
+ * code works from level n down to 0, and if we get an error along the
+ * way we won't have initialized all the entries down to 0.
*/
for (i = 0; i < cur->bc_nlevels; i++) {
if (cur->bc_bufs[i])
@@ -374,17 +371,17 @@
else if (!error)
break;
}
+
/*
- * Can't free a bmap cursor without having dealt with the
- * allocated indirect blocks' accounting.
+ * If we are doing a BMBT update, the number of unaccounted blocks
+ * allocated during this cursor life time should be zero. If it's not
+ * zero, then we should be shut down or on our way to shutdown due to
+ * cancelling a dirty transaction on error.
*/
- ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP ||
- cur->bc_ino.allocated == 0);
- /*
- * Free the cursor.
- */
+ ASSERT(cur->bc_btnum != XFS_BTNUM_BMAP || cur->bc_ino.allocated == 0 ||
+ XFS_FORCED_SHUTDOWN(cur->bc_mp) || error != 0);
if (unlikely(cur->bc_flags & XFS_BTREE_STAGING))
- kmem_free((void *)cur->bc_ops);
+ kmem_free(cur->bc_ops);
kmem_cache_free(xfs_btree_cur_zone, cur);
}
@@ -2814,7 +2811,7 @@
struct xfs_btree_split_args *args = container_of(work,
struct xfs_btree_split_args, work);
unsigned long pflags;
- unsigned long new_pflags = PF_MEMALLOC_NOFS;
+ unsigned long new_pflags = 0;
/*
* we are in a transaction context here, but may also be doing work
@@ -2826,12 +2823,20 @@
new_pflags |= PF_MEMALLOC | PF_SWAPWRITE | PF_KSWAPD;
current_set_flags_nested(&pflags, new_pflags);
+ xfs_trans_set_context(args->cur->bc_tp);
args->result = __xfs_btree_split(args->cur, args->level, args->ptrp,
args->key, args->curp, args->stat);
+
+ xfs_trans_clear_context(args->cur->bc_tp);
+ current_restore_flags_nested(&pflags, new_pflags);
+
+ /*
+ * Do not access args after complete() has run here. We don't own args
+ * and the owner may run and free args before we return here.
+ */
complete(args->done);
- current_restore_flags_nested(&pflags, new_pflags);
}
/*
diff --git a/fs/xfs/libxfs/xfs_dir2.h b/fs/xfs/libxfs/xfs_dir2.h
index e553786..d03e609 100644
--- a/fs/xfs/libxfs/xfs_dir2.h
+++ b/fs/xfs/libxfs/xfs_dir2.h
@@ -47,8 +47,6 @@
extern int xfs_dir_removename(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name, xfs_ino_t ino,
xfs_extlen_t tot);
-extern bool xfs_dir2_sf_replace_needblock(struct xfs_inode *dp,
- xfs_ino_t inum);
extern int xfs_dir_replace(struct xfs_trans *tp, struct xfs_inode *dp,
struct xfs_name *name, xfs_ino_t inum,
xfs_extlen_t tot);
diff --git a/fs/xfs/libxfs/xfs_dir2_sf.c b/fs/xfs/libxfs/xfs_dir2_sf.c
index 2463b5d..8c4f76b 100644
--- a/fs/xfs/libxfs/xfs_dir2_sf.c
+++ b/fs/xfs/libxfs/xfs_dir2_sf.c
@@ -1018,7 +1018,7 @@
/*
* Check whether the sf dir replace operation need more blocks.
*/
-bool
+static bool
xfs_dir2_sf_replace_needblock(
struct xfs_inode *dp,
xfs_ino_t inum)
diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c
index c667c63..fa8aefe 100644
--- a/fs/xfs/libxfs/xfs_inode_buf.c
+++ b/fs/xfs/libxfs/xfs_inode_buf.c
@@ -358,19 +358,36 @@
int whichfork)
{
uint32_t di_nextents = XFS_DFORK_NEXTENTS(dip, whichfork);
+ mode_t mode = be16_to_cpu(dip->di_mode);
+ uint32_t fork_size = XFS_DFORK_SIZE(dip, mp, whichfork);
+ uint32_t fork_format = XFS_DFORK_FORMAT(dip, whichfork);
- switch (XFS_DFORK_FORMAT(dip, whichfork)) {
- case XFS_DINODE_FMT_LOCAL:
- /*
- * no local regular files yet
- */
- if (whichfork == XFS_DATA_FORK) {
- if (S_ISREG(be16_to_cpu(dip->di_mode)))
- return __this_address;
- if (be64_to_cpu(dip->di_size) >
- XFS_DFORK_SIZE(dip, mp, whichfork))
+ /*
+ * For fork types that can contain local data, check that the fork
+ * format matches the size of local data contained within the fork.
+ *
+ * For all types, check that when the size says the should be in extent
+ * or btree format, the inode isn't claiming it is in local format.
+ */
+ if (whichfork == XFS_DATA_FORK) {
+ if (S_ISDIR(mode) || S_ISLNK(mode)) {
+ if (be64_to_cpu(dip->di_size) <= fork_size &&
+ fork_format != XFS_DINODE_FMT_LOCAL)
return __this_address;
}
+
+ if (be64_to_cpu(dip->di_size) > fork_size &&
+ fork_format == XFS_DINODE_FMT_LOCAL)
+ return __this_address;
+ }
+
+ switch (fork_format) {
+ case XFS_DINODE_FMT_LOCAL:
+ /*
+ * No local regular files yet.
+ */
+ if (S_ISREG(mode) && whichfork == XFS_DATA_FORK)
+ return __this_address;
if (di_nextents)
return __this_address;
break;
diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h
index 8bd00da..2f46ef3 100644
--- a/fs/xfs/libxfs/xfs_log_format.h
+++ b/fs/xfs/libxfs/xfs_log_format.h
@@ -414,7 +414,16 @@
/* start of the extended dinode, writable fields */
uint32_t di_crc; /* CRC of the inode */
uint64_t di_changecount; /* number of attribute changes */
- xfs_lsn_t di_lsn; /* flush sequence */
+
+ /*
+ * The LSN we write to this field during formatting is not a reflection
+ * of the current on-disk LSN. It should never be used for recovery
+ * sequencing, nor should it be recovered into the on-disk inode at all.
+ * See xlog_recover_inode_commit_pass2() and xfs_log_dinode_to_disk()
+ * for details.
+ */
+ xfs_lsn_t di_lsn;
+
uint64_t di_flags2; /* more random flags */
uint32_t di_cowextsize; /* basic cow extent size for file */
uint8_t di_pad2[12]; /* more padding for future expansion */
diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c
index 5aeafa5..66e8353 100644
--- a/fs/xfs/libxfs/xfs_sb.c
+++ b/fs/xfs/libxfs/xfs_sb.c
@@ -956,9 +956,19 @@
struct xfs_mount *mp = tp->t_mountp;
struct xfs_buf *bp = xfs_trans_getsb(tp);
- mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
- mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
- mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+ /*
+ * Lazy sb counters don't update the in-core superblock so do that now.
+ * If this is at unmount, the counters will be exactly correct, but at
+ * any other time they will only be ballpark correct because of
+ * reservations that have been taken out percpu counters. If we have an
+ * unclean shutdown, this will be corrected by log recovery rebuilding
+ * the counters from the AGF block counts.
+ */
+ if (xfs_sb_version_haslazysbcount(&mp->m_sb)) {
+ mp->m_sb.sb_icount = percpu_counter_sum(&mp->m_icount);
+ mp->m_sb.sb_ifree = percpu_counter_sum(&mp->m_ifree);
+ mp->m_sb.sb_fdblocks = percpu_counter_sum(&mp->m_fdblocks);
+ }
xfs_sb_to_disk(bp->b_addr, &mp->m_sb);
xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF);
diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h
index 397d947..1ce0617 100644
--- a/fs/xfs/libxfs/xfs_types.h
+++ b/fs/xfs/libxfs/xfs_types.h
@@ -21,6 +21,7 @@
typedef uint32_t xfs_rtword_t; /* word type for bitmap manipulations */
typedef int64_t xfs_lsn_t; /* log sequence number */
+typedef int64_t xfs_csn_t; /* CIL sequence number */
typedef uint32_t xfs_dablk_t; /* dir/attr block number (in file) */
typedef uint32_t xfs_dahash_t; /* dir/attr hash value */
diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c
index 4304c64..953de84 100644
--- a/fs/xfs/xfs_aops.c
+++ b/fs/xfs/xfs_aops.c
@@ -62,7 +62,7 @@
* We hand off the transaction to the completion thread now, so
* clear the flag here.
*/
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+ xfs_trans_clear_context(tp);
return 0;
}
@@ -125,7 +125,7 @@
* thus we need to mark ourselves as being in a transaction manually.
* Similarly for freeze protection.
*/
- current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+ xfs_trans_set_context(tp);
__sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS);
/* we abort the update if there was an IO error */
@@ -145,6 +145,7 @@
struct iomap_ioend *ioend)
{
struct xfs_inode *ip = XFS_I(ioend->io_inode);
+ struct xfs_mount *mp = ip->i_mount;
xfs_off_t offset = ioend->io_offset;
size_t size = ioend->io_size;
unsigned int nofs_flag;
@@ -160,18 +161,26 @@
/*
* Just clean up the in-memory strutures if the fs has been shut down.
*/
- if (XFS_FORCED_SHUTDOWN(ip->i_mount)) {
+ if (XFS_FORCED_SHUTDOWN(mp)) {
error = -EIO;
goto done;
}
/*
- * Clean up any COW blocks on an I/O error.
+ * Clean up all COW blocks and underlying data fork delalloc blocks on
+ * I/O error. The delalloc punch is required because this ioend was
+ * mapped to blocks in the COW fork and the associated pages are no
+ * longer dirty. If we don't remove delalloc blocks here, they become
+ * stale and can corrupt free space accounting on unmount.
*/
error = blk_status_to_errno(ioend->io_bio->bi_status);
if (unlikely(error)) {
- if (ioend->io_flags & IOMAP_F_SHARED)
+ if (ioend->io_flags & IOMAP_F_SHARED) {
xfs_reflink_cancel_cow_range(ip, offset, size, true);
+ xfs_bmap_punch_delalloc_range(ip,
+ XFS_B_TO_FSBT(mp, offset),
+ XFS_B_TO_FSB(mp, size));
+ }
goto done;
}
@@ -568,6 +577,12 @@
{
struct xfs_writepage_ctx wpc = { };
+ if (WARN_ON_ONCE(current->journal_info)) {
+ redirty_page_for_writepage(wbc, page);
+ unlock_page(page);
+ return 0;
+ }
+
return iomap_writepage(page, wbc, &wpc.ctx, &xfs_writeback_ops);
}
@@ -578,6 +593,13 @@
{
struct xfs_writepage_ctx wpc = { };
+ /*
+ * Writing back data in a transaction context can result in recursive
+ * transactions. This is bad, so issue a warning and get out of here.
+ */
+ if (WARN_ON_ONCE(current->journal_info))
+ return 0;
+
xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED);
return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops);
}
diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c
index 0356f2e..a3d5ecc 100644
--- a/fs/xfs/xfs_buf_item.c
+++ b/fs/xfs/xfs_buf_item.c
@@ -56,14 +56,12 @@
}
/*
- * This returns the number of log iovecs needed to log the
- * given buf log item.
+ * Return the number of log iovecs and space needed to log the given buf log
+ * item segment.
*
- * It calculates this as 1 iovec for the buf log format structure
- * and 1 for each stretch of non-contiguous chunks to be logged.
- * Contiguous chunks are logged in a single iovec.
- *
- * If the XFS_BLI_STALE flag has been set, then log nothing.
+ * It calculates this as 1 iovec for the buf log format structure and 1 for each
+ * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged
+ * in a single iovec.
*/
STATIC void
xfs_buf_item_size_segment(
@@ -119,11 +117,8 @@
}
/*
- * This returns the number of log iovecs needed to log the given buf log item.
- *
- * It calculates this as 1 iovec for the buf log format structure and 1 for each
- * stretch of non-contiguous chunks to be logged. Contiguous chunks are logged
- * in a single iovec.
+ * Return the number of log iovecs and space needed to log the given buf log
+ * item.
*
* Discontiguous buffers need a format structure per region that is being
* logged. This makes the changes in the buffer appear to log recovery as though
@@ -133,7 +128,11 @@
* what ends up on disk.
*
* If the XFS_BLI_STALE flag has been set, then log nothing but the buf log
- * format structures.
+ * format structures. If the item has previously been logged and has dirty
+ * regions, we do not relog them in stale buffers. This has the effect of
+ * reducing the size of the relogged item by the amount of dirty data tracked
+ * by the log item. This can result in the committing transaction reducing the
+ * amount of space being consumed by the CIL.
*/
STATIC void
xfs_buf_item_size(
@@ -147,9 +146,9 @@
ASSERT(atomic_read(&bip->bli_refcount) > 0);
if (bip->bli_flags & XFS_BLI_STALE) {
/*
- * The buffer is stale, so all we need to log
- * is the buf log format structure with the
- * cancel flag in it.
+ * The buffer is stale, so all we need to log is the buf log
+ * format structure with the cancel flag in it as we are never
+ * going to replay the changes tracked in the log item.
*/
trace_xfs_buf_item_size_stale(bip);
ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
@@ -164,9 +163,9 @@
if (bip->bli_flags & XFS_BLI_ORDERED) {
/*
- * The buffer has been logged just to order it.
- * It is not being included in the transaction
- * commit, so no vectors are used at all.
+ * The buffer has been logged just to order it. It is not being
+ * included in the transaction commit, so no vectors are used at
+ * all.
*/
trace_xfs_buf_item_size_ordered(bip);
*nvecs = XFS_LOG_VEC_ORDERED;
@@ -394,17 +393,8 @@
}
/*
- * This is called to unpin the buffer associated with the buf log
- * item which was previously pinned with a call to xfs_buf_item_pin().
- *
- * Also drop the reference to the buf item for the current transaction.
- * If the XFS_BLI_STALE flag is set and we are the last reference,
- * then free up the buf log item and unlock the buffer.
- *
- * If the remove flag is set we are called from uncommit in the
- * forced-shutdown path. If that is true and the reference count on
- * the log item is going to drop to zero we need to free the item's
- * descriptor in the transaction.
+ * This is called to unpin the buffer associated with the buf log item which
+ * was previously pinned with a call to xfs_buf_item_pin().
*/
STATIC void
xfs_buf_item_unpin(
@@ -421,38 +411,35 @@
trace_xfs_buf_item_unpin(bip);
+ /*
+ * Drop the bli ref associated with the pin and grab the hold required
+ * for the I/O simulation failure in the abort case. We have to do this
+ * before the pin count drops because the AIL doesn't acquire a bli
+ * reference. Therefore if the refcount drops to zero, the bli could
+ * still be AIL resident and the buffer submitted for I/O (and freed on
+ * completion) at any point before we return. This can be removed once
+ * the AIL properly holds a reference on the bli.
+ */
freed = atomic_dec_and_test(&bip->bli_refcount);
-
+ if (freed && !stale && remove)
+ xfs_buf_hold(bp);
if (atomic_dec_and_test(&bp->b_pin_count))
wake_up_all(&bp->b_waiters);
- if (freed && stale) {
+ /* nothing to do but drop the pin count if the bli is active */
+ if (!freed)
+ return;
+
+ if (stale) {
ASSERT(bip->bli_flags & XFS_BLI_STALE);
ASSERT(xfs_buf_islocked(bp));
ASSERT(bp->b_flags & XBF_STALE);
ASSERT(bip->__bli_format.blf_flags & XFS_BLF_CANCEL);
+ ASSERT(list_empty(&lip->li_trans));
+ ASSERT(!bp->b_transp);
trace_xfs_buf_item_unpin_stale(bip);
- if (remove) {
- /*
- * If we are in a transaction context, we have to
- * remove the log item from the transaction as we are
- * about to release our reference to the buffer. If we
- * don't, the unlock that occurs later in
- * xfs_trans_uncommit() will try to reference the
- * buffer which we no longer have a hold on.
- */
- if (!list_empty(&lip->li_trans))
- xfs_trans_del_item(lip);
-
- /*
- * Since the transaction no longer refers to the buffer,
- * the buffer should no longer refer to the transaction.
- */
- bp->b_transp = NULL;
- }
-
/*
* If we get called here because of an IO error, we may or may
* not have the item on the AIL. xfs_trans_ail_delete() will
@@ -469,13 +456,13 @@
ASSERT(bp->b_log_item == NULL);
}
xfs_buf_relse(bp);
- } else if (freed && remove) {
+ } else if (remove) {
/*
* The buffer must be locked and held by the caller to simulate
- * an async I/O failure.
+ * an async I/O failure. We acquired the hold for this case
+ * before the buffer was unpinned.
*/
xfs_buf_lock(bp);
- xfs_buf_hold(bp);
bp->b_flags |= XBF_ASYNC;
xfs_buf_ioend_fail(bp);
}
@@ -633,7 +620,7 @@
STATIC void
xfs_buf_item_committing(
struct xfs_log_item *lip,
- xfs_lsn_t commit_lsn)
+ xfs_csn_t seq)
{
return xfs_buf_item_release(lip);
}
diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c
index d44e8b4..b374c9c 100644
--- a/fs/xfs/xfs_buf_item_recover.c
+++ b/fs/xfs/xfs_buf_item_recover.c
@@ -796,6 +796,7 @@
switch (magicda) {
case XFS_DIR3_LEAF1_MAGIC:
case XFS_DIR3_LEAFN_MAGIC:
+ case XFS_ATTR3_LEAF_MAGIC:
case XFS_DA3_NODE_MAGIC:
lsn = be64_to_cpu(((struct xfs_da3_blkinfo *)blk)->lsn);
uuid = &((struct xfs_da3_blkinfo *)blk)->uuid;
@@ -805,7 +806,7 @@
}
if (lsn != (xfs_lsn_t)-1) {
- if (!uuid_equal(&mp->m_sb.sb_uuid, uuid))
+ if (!uuid_equal(&mp->m_sb.sb_meta_uuid, uuid))
goto recover_immediately;
return lsn;
}
diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c
index 1d95ed3..80c4579 100644
--- a/fs/xfs/xfs_dquot.c
+++ b/fs/xfs/xfs_dquot.c
@@ -500,6 +500,42 @@
return dqp;
}
+/* Check the ondisk dquot's id and type match what the incore dquot expects. */
+static bool
+xfs_dquot_check_type(
+ struct xfs_dquot *dqp,
+ struct xfs_disk_dquot *ddqp)
+{
+ uint8_t ddqp_type;
+ uint8_t dqp_type;
+
+ ddqp_type = ddqp->d_type & XFS_DQTYPE_REC_MASK;
+ dqp_type = xfs_dquot_type(dqp);
+
+ if (be32_to_cpu(ddqp->d_id) != dqp->q_id)
+ return false;
+
+ /*
+ * V5 filesystems always expect an exact type match. V4 filesystems
+ * expect an exact match for user dquots and for non-root group and
+ * project dquots.
+ */
+ if (xfs_sb_version_hascrc(&dqp->q_mount->m_sb) ||
+ dqp_type == XFS_DQTYPE_USER || dqp->q_id != 0)
+ return ddqp_type == dqp_type;
+
+ /*
+ * V4 filesystems support either group or project quotas, but not both
+ * at the same time. The non-user quota file can be switched between
+ * group and project quota uses depending on the mount options, which
+ * means that we can encounter the other type when we try to load quota
+ * defaults. Quotacheck will soon reset the the entire quota file
+ * (including the root dquot) anyway, but don't log scary corruption
+ * reports to dmesg.
+ */
+ return ddqp_type == XFS_DQTYPE_GROUP || ddqp_type == XFS_DQTYPE_PROJ;
+}
+
/* Copy the in-core quota fields in from the on-disk buffer. */
STATIC int
xfs_dquot_from_disk(
@@ -512,8 +548,7 @@
* Ensure that we got the type and ID we were looking for.
* Everything else was checked by the dquot buffer verifier.
*/
- if ((ddqp->d_type & XFS_DQTYPE_REC_MASK) != xfs_dquot_type(dqp) ||
- be32_to_cpu(ddqp->d_id) != dqp->q_id) {
+ if (!xfs_dquot_check_type(dqp, ddqp)) {
xfs_alert_tag(bp->b_mount, XFS_PTAG_VERIFIER_ERROR,
"Metadata corruption detected at %pS, quota %u",
__this_address, dqp->q_id);
diff --git a/fs/xfs/xfs_dquot_item.c b/fs/xfs/xfs_dquot_item.c
index 8c1fdf3..8ed47b7 100644
--- a/fs/xfs/xfs_dquot_item.c
+++ b/fs/xfs/xfs_dquot_item.c
@@ -188,7 +188,7 @@
STATIC void
xfs_qm_dquot_logitem_committing(
struct xfs_log_item *lip,
- xfs_lsn_t commit_lsn)
+ xfs_csn_t seq)
{
return xfs_qm_dquot_logitem_release(lip);
}
diff --git a/fs/xfs/xfs_error.c b/fs/xfs/xfs_error.c
index 7f6e208..f9e2f60 100644
--- a/fs/xfs/xfs_error.c
+++ b/fs/xfs/xfs_error.c
@@ -293,6 +293,8 @@
struct xfs_mount *mp,
unsigned int error_tag)
{
+ BUILD_BUG_ON(ARRAY_SIZE(xfs_errortag_random_default) != XFS_ERRTAG_MAX);
+
if (error_tag >= XFS_ERRTAG_MAX)
return -EINVAL;
diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c
index 5c03952..1147477 100644
--- a/fs/xfs/xfs_extfree_item.c
+++ b/fs/xfs/xfs_extfree_item.c
@@ -482,7 +482,7 @@
free->xefi_startblock,
free->xefi_blockcount,
&free->xefi_oinfo, free->xefi_skip_discard);
- kmem_free(free);
+ kmem_cache_free(xfs_bmap_free_item_zone, free);
return error;
}
@@ -502,7 +502,7 @@
struct xfs_extent_free_item *free;
free = container_of(item, struct xfs_extent_free_item, xefi_list);
- kmem_free(free);
+ kmem_cache_free(xfs_bmap_free_item_zone, free);
}
const struct xfs_defer_op_type xfs_extent_free_defer_type = {
@@ -564,7 +564,7 @@
extp->ext_len = free->xefi_blockcount;
efdp->efd_next_extent++;
- kmem_free(free);
+ kmem_cache_free(xfs_bmap_free_item_zone, free);
return error;
}
diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c
index 5b0f93f..4d6bf8d 100644
--- a/fs/xfs/xfs_file.c
+++ b/fs/xfs/xfs_file.c
@@ -118,6 +118,54 @@
return xfs_log_force_inode(ip);
}
+static xfs_csn_t
+xfs_fsync_seq(
+ struct xfs_inode *ip,
+ bool datasync)
+{
+ if (!xfs_ipincount(ip))
+ return 0;
+ if (datasync && !(ip->i_itemp->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
+ return 0;
+ return ip->i_itemp->ili_commit_seq;
+}
+
+/*
+ * All metadata updates are logged, which means that we just have to flush the
+ * log up to the latest LSN that touched the inode.
+ *
+ * If we have concurrent fsync/fdatasync() calls, we need them to all block on
+ * the log force before we clear the ili_fsync_fields field. This ensures that
+ * we don't get a racing sync operation that does not wait for the metadata to
+ * hit the journal before returning. If we race with clearing ili_fsync_fields,
+ * then all that will happen is the log force will do nothing as the lsn will
+ * already be on disk. We can't race with setting ili_fsync_fields because that
+ * is done under XFS_ILOCK_EXCL, and that can't happen because we hold the lock
+ * shared until after the ili_fsync_fields is cleared.
+ */
+static int
+xfs_fsync_flush_log(
+ struct xfs_inode *ip,
+ bool datasync,
+ int *log_flushed)
+{
+ int error = 0;
+ xfs_csn_t seq;
+
+ xfs_ilock(ip, XFS_ILOCK_SHARED);
+ seq = xfs_fsync_seq(ip, datasync);
+ if (seq) {
+ error = xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC,
+ log_flushed);
+
+ spin_lock(&ip->i_itemp->ili_lock);
+ ip->i_itemp->ili_fsync_fields = 0;
+ spin_unlock(&ip->i_itemp->ili_lock);
+ }
+ xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ return error;
+}
+
STATIC int
xfs_file_fsync(
struct file *file,
@@ -125,13 +173,10 @@
loff_t end,
int datasync)
{
- struct inode *inode = file->f_mapping->host;
- struct xfs_inode *ip = XFS_I(inode);
- struct xfs_inode_log_item *iip = ip->i_itemp;
+ struct xfs_inode *ip = XFS_I(file->f_mapping->host);
struct xfs_mount *mp = ip->i_mount;
int error = 0;
int log_flushed = 0;
- xfs_lsn_t lsn = 0;
trace_xfs_file_fsync(ip);
@@ -155,33 +200,7 @@
else if (mp->m_logdev_targp != mp->m_ddev_targp)
xfs_blkdev_issue_flush(mp->m_ddev_targp);
- /*
- * All metadata updates are logged, which means that we just have to
- * flush the log up to the latest LSN that touched the inode. If we have
- * concurrent fsync/fdatasync() calls, we need them to all block on the
- * log force before we clear the ili_fsync_fields field. This ensures
- * that we don't get a racing sync operation that does not wait for the
- * metadata to hit the journal before returning. If we race with
- * clearing the ili_fsync_fields, then all that will happen is the log
- * force will do nothing as the lsn will already be on disk. We can't
- * race with setting ili_fsync_fields because that is done under
- * XFS_ILOCK_EXCL, and that can't happen because we hold the lock shared
- * until after the ili_fsync_fields is cleared.
- */
- xfs_ilock(ip, XFS_ILOCK_SHARED);
- if (xfs_ipincount(ip)) {
- if (!datasync ||
- (iip->ili_fsync_fields & ~XFS_ILOG_TIMESTAMP))
- lsn = iip->ili_last_lsn;
- }
-
- if (lsn) {
- error = xfs_log_force_lsn(mp, lsn, XFS_LOG_SYNC, &log_flushed);
- spin_lock(&iip->ili_lock);
- iip->ili_fsync_fields = 0;
- spin_unlock(&iip->ili_lock);
- }
- xfs_iunlock(ip, XFS_ILOCK_SHARED);
+ error = xfs_fsync_flush_log(ip, datasync, &log_flushed);
/*
* If we only have a single device, and the log force about was
diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c
index db23e45..bc41ec0 100644
--- a/fs/xfs/xfs_filestream.c
+++ b/fs/xfs/xfs_filestream.c
@@ -128,11 +128,12 @@
if (!pag->pagf_init) {
err = xfs_alloc_pagf_init(mp, NULL, ag, trylock);
if (err) {
- xfs_perag_put(pag);
- if (err != -EAGAIN)
+ if (err != -EAGAIN) {
+ xfs_perag_put(pag);
return err;
+ }
/* Couldn't lock the AGF, skip this AG. */
- continue;
+ goto next_ag;
}
}
diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c
index ef1d5bb..775f833 100644
--- a/fs/xfs/xfs_fsops.c
+++ b/fs/xfs/xfs_fsops.c
@@ -376,46 +376,36 @@
* If the request is larger than the current reservation, reserve the
* blocks before we update the reserve counters. Sample m_fdblocks and
* perform a partial reservation if the request exceeds free space.
+ *
+ * The code below estimates how many blocks it can request from
+ * fdblocks to stash in the reserve pool. This is a classic TOCTOU
+ * race since fdblocks updates are not always coordinated via
+ * m_sb_lock. Set the reserve size even if there's not enough free
+ * space to fill it because mod_fdblocks will refill an undersized
+ * reserve when it can.
*/
- error = -ENOSPC;
- do {
- free = percpu_counter_sum(&mp->m_fdblocks) -
- mp->m_alloc_set_aside;
- if (free <= 0)
- break;
-
- delta = request - mp->m_resblks;
- lcounter = free - delta;
- if (lcounter < 0)
- /* We can't satisfy the request, just get what we can */
- fdblks_delta = free;
- else
- fdblks_delta = delta;
-
+ free = percpu_counter_sum(&mp->m_fdblocks) -
+ xfs_fdblocks_unavailable(mp);
+ delta = request - mp->m_resblks;
+ mp->m_resblks = request;
+ if (delta > 0 && free > 0) {
/*
* We'll either succeed in getting space from the free block
- * count or we'll get an ENOSPC. If we get a ENOSPC, it means
- * things changed while we were calculating fdblks_delta and so
- * we should try again to see if there is anything left to
- * reserve.
+ * count or we'll get an ENOSPC. Don't set the reserved flag
+ * here - we don't want to reserve the extra reserve blocks
+ * from the reserve.
*
- * Don't set the reserved flag here - we don't want to reserve
- * the extra reserve blocks from the reserve.....
+ * The desired reserve size can change after we drop the lock.
+ * Use mod_fdblocks to put the space into the reserve or into
+ * fdblocks as appropriate.
*/
+ fdblks_delta = min(free, delta);
spin_unlock(&mp->m_sb_lock);
error = xfs_mod_fdblocks(mp, -fdblks_delta, 0);
+ if (!error)
+ xfs_mod_fdblocks(mp, fdblks_delta, 0);
spin_lock(&mp->m_sb_lock);
- } while (error == -ENOSPC);
-
- /*
- * Update the reserve counters if blocks have been successfully
- * allocated.
- */
- if (!error && fdblks_delta) {
- mp->m_resblks += fdblks_delta;
- mp->m_resblks_avail += fdblks_delta;
}
-
out:
if (outval) {
outval->resblks = mp->m_resblks;
diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c
index deb9930..e69a08e 100644
--- a/fs/xfs/xfs_icache.c
+++ b/fs/xfs/xfs_icache.c
@@ -47,8 +47,9 @@
return NULL;
}
- /* VFS doesn't initialise i_mode! */
+ /* VFS doesn't initialise i_mode or i_state! */
VFS_I(ip)->i_mode = 0;
+ VFS_I(ip)->i_state = 0;
XFS_STATS_INC(mp, vn_active);
ASSERT(atomic_read(&ip->i_pincount) == 0);
diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c
index 2bfbcf2..1900883 100644
--- a/fs/xfs/xfs_inode.c
+++ b/fs/xfs/xfs_inode.c
@@ -802,6 +802,7 @@
xfs_buf_t **ialloc_context,
xfs_inode_t **ipp)
{
+ struct inode *dir = pip ? VFS_I(pip) : NULL;
struct xfs_mount *mp = tp->t_mountp;
xfs_ino_t ino;
xfs_inode_t *ip;
@@ -847,18 +848,17 @@
return error;
ASSERT(ip != NULL);
inode = VFS_I(ip);
- inode->i_mode = mode;
set_nlink(inode, nlink);
- inode->i_uid = current_fsuid();
inode->i_rdev = rdev;
ip->i_d.di_projid = prid;
- if (pip && XFS_INHERIT_GID(pip)) {
- inode->i_gid = VFS_I(pip)->i_gid;
- if ((VFS_I(pip)->i_mode & S_ISGID) && S_ISDIR(mode))
- inode->i_mode |= S_ISGID;
+ if (dir && !(dir->i_mode & S_ISGID) &&
+ (mp->m_flags & XFS_MOUNT_GRPID)) {
+ inode->i_uid = current_fsuid();
+ inode->i_gid = dir->i_gid;
+ inode->i_mode = mode;
} else {
- inode->i_gid = current_fsgid();
+ inode_init_owner(inode, dir, mode);
}
/*
@@ -2669,14 +2669,13 @@
}
/*
- * This is called to return an inode to the inode free list.
- * The inode should already be truncated to 0 length and have
- * no pages associated with it. This routine also assumes that
- * the inode is already a part of the transaction.
+ * This is called to return an inode to the inode free list. The inode should
+ * already be truncated to 0 length and have no pages associated with it. This
+ * routine also assumes that the inode is already a part of the transaction.
*
- * The on-disk copy of the inode will have been added to the list
- * of unlinked inodes in the AGI. We need to remove the inode from
- * that list atomically with respect to freeing it here.
+ * The on-disk copy of the inode will have been added to the list of unlinked
+ * inodes in the AGI. We need to remove the inode from that list atomically with
+ * respect to freeing it here.
*/
int
xfs_ifree(
@@ -2694,13 +2693,16 @@
ASSERT(ip->i_d.di_nblocks == 0);
/*
- * Pull the on-disk inode from the AGI unlinked list.
+ * Free the inode first so that we guarantee that the AGI lock is going
+ * to be taken before we remove the inode from the unlinked list. This
+ * makes the AGI lock -> unlinked list modification order the same as
+ * used in O_TMPFILE creation.
*/
- error = xfs_iunlink_remove(tp, ip);
+ error = xfs_difree(tp, ip->i_ino, &xic);
if (error)
return error;
- error = xfs_difree(tp, ip->i_ino, &xic);
+ error = xfs_iunlink_remove(tp, ip);
if (error)
return error;
@@ -2754,7 +2756,7 @@
trace_xfs_inode_unpin_nowait(ip, _RET_IP_);
/* Give the log a push to start the unpinning I/O */
- xfs_log_force_lsn(ip->i_mount, ip->i_itemp->ili_last_lsn, 0, NULL);
+ xfs_log_force_seq(ip->i_mount, ip->i_itemp->ili_commit_seq, 0, NULL);
}
@@ -3152,7 +3154,7 @@
struct xfs_trans *tp;
struct xfs_inode *wip = NULL; /* whiteout inode */
struct xfs_inode *inodes[__XFS_SORT_INODES];
- struct xfs_buf *agibp;
+ int i;
int num_inodes = __XFS_SORT_INODES;
bool new_parent = (src_dp != target_dp);
bool src_is_directory = S_ISDIR(VFS_I(src_ip)->i_mode);
@@ -3170,7 +3172,6 @@
* appropriately.
*/
if (flags & RENAME_WHITEOUT) {
- ASSERT(!(flags & (RENAME_NOREPLACE | RENAME_EXCHANGE)));
error = xfs_rename_alloc_whiteout(target_dp, &wip);
if (error)
return error;
@@ -3266,6 +3267,30 @@
}
/*
+ * Lock the AGI buffers we need to handle bumping the nlink of the
+ * whiteout inode off the unlinked list and to handle dropping the
+ * nlink of the target inode. Per locking order rules, do this in
+ * increasing AG order and before directory block allocation tries to
+ * grab AGFs because we grab AGIs before AGFs.
+ *
+ * The (vfs) caller must ensure that if src is a directory then
+ * target_ip is either null or an empty directory.
+ */
+ for (i = 0; i < num_inodes && inodes[i] != NULL; i++) {
+ if (inodes[i] == wip ||
+ (inodes[i] == target_ip &&
+ (VFS_I(target_ip)->i_nlink == 1 || src_is_directory))) {
+ struct xfs_buf *bp;
+ xfs_agnumber_t agno;
+
+ agno = XFS_INO_TO_AGNO(mp, inodes[i]->i_ino);
+ error = xfs_read_agi(mp, tp, agno, &bp);
+ if (error)
+ goto out_trans_cancel;
+ }
+ }
+
+ /*
* Directory entry creation below may acquire the AGF. Remove
* the whiteout from the unlinked list first to preserve correct
* AGI/AGF locking order. This dirties the transaction so failures
@@ -3317,22 +3342,6 @@
* In case there is already an entry with the same
* name at the destination directory, remove it first.
*/
-
- /*
- * Check whether the replace operation will need to allocate
- * blocks. This happens when the shortform directory lacks
- * space and we have to convert it to a block format directory.
- * When more blocks are necessary, we must lock the AGI first
- * to preserve locking order (AGI -> AGF).
- */
- if (xfs_dir2_sf_replace_needblock(target_dp, src_ip->i_ino)) {
- error = xfs_read_agi(mp, tp,
- XFS_INO_TO_AGNO(mp, target_ip->i_ino),
- &agibp);
- if (error)
- goto out_trans_cancel;
- }
-
error = xfs_dir_replace(tp, target_dp, target_name,
src_ip->i_ino, spaceres);
if (error)
@@ -3709,16 +3718,16 @@
xfs_log_force_inode(
struct xfs_inode *ip)
{
- xfs_lsn_t lsn = 0;
+ xfs_csn_t seq = 0;
xfs_ilock(ip, XFS_ILOCK_SHARED);
if (xfs_ipincount(ip))
- lsn = ip->i_itemp->ili_last_lsn;
+ seq = ip->i_itemp->ili_commit_seq;
xfs_iunlock(ip, XFS_ILOCK_SHARED);
- if (!lsn)
+ if (!seq)
return 0;
- return xfs_log_force_lsn(ip->i_mount, lsn, XFS_LOG_SYNC, NULL);
+ return xfs_log_force_seq(ip->i_mount, seq, XFS_LOG_SYNC, NULL);
}
/*
diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c
index 17e20a6..3aba455 100644
--- a/fs/xfs/xfs_inode_item.c
+++ b/fs/xfs/xfs_inode_item.c
@@ -28,6 +28,20 @@
return container_of(lip, struct xfs_inode_log_item, ili_item);
}
+/*
+ * The logged size of an inode fork is always the current size of the inode
+ * fork. This means that when an inode fork is relogged, the size of the logged
+ * region is determined by the current state, not the combination of the
+ * previously logged state + the current state. This is different relogging
+ * behaviour to most other log items which will retain the size of the
+ * previously logged changes when smaller regions are relogged.
+ *
+ * Hence operations that remove data from the inode fork (e.g. shortform
+ * dir/attr remove, extent form extent removal, etc), the size of the relogged
+ * inode gets -smaller- rather than stays the same size as the previously logged
+ * size and this can result in the committing transaction reducing the amount of
+ * space being consumed by the CIL.
+ */
STATIC void
xfs_inode_item_data_fork_size(
struct xfs_inode_log_item *iip,
@@ -603,9 +617,9 @@
STATIC void
xfs_inode_item_committing(
struct xfs_log_item *lip,
- xfs_lsn_t commit_lsn)
+ xfs_csn_t seq)
{
- INODE_ITEM(lip)->ili_last_lsn = commit_lsn;
+ INODE_ITEM(lip)->ili_commit_seq = seq;
return xfs_inode_item_release(lip);
}
diff --git a/fs/xfs/xfs_inode_item.h b/fs/xfs/xfs_inode_item.h
index 4b926e3..403b45a 100644
--- a/fs/xfs/xfs_inode_item.h
+++ b/fs/xfs/xfs_inode_item.h
@@ -33,7 +33,7 @@
unsigned int ili_fields; /* fields to be logged */
unsigned int ili_fsync_fields; /* logged since last fsync */
xfs_lsn_t ili_flush_lsn; /* lsn at last flush */
- xfs_lsn_t ili_last_lsn; /* lsn at last transaction */
+ xfs_csn_t ili_commit_seq; /* last transaction commit */
};
static inline int xfs_inode_clean(struct xfs_inode *ip)
diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c
index cb44f76..538724f 100644
--- a/fs/xfs/xfs_inode_item_recover.c
+++ b/fs/xfs/xfs_inode_item_recover.c
@@ -145,7 +145,8 @@
STATIC void
xfs_log_dinode_to_disk(
struct xfs_log_dinode *from,
- struct xfs_dinode *to)
+ struct xfs_dinode *to,
+ xfs_lsn_t lsn)
{
to->di_magic = cpu_to_be16(from->di_magic);
to->di_mode = cpu_to_be16(from->di_mode);
@@ -182,7 +183,7 @@
to->di_flags2 = cpu_to_be64(from->di_flags2);
to->di_cowextsize = cpu_to_be32(from->di_cowextsize);
to->di_ino = cpu_to_be64(from->di_ino);
- to->di_lsn = cpu_to_be64(from->di_lsn);
+ to->di_lsn = cpu_to_be64(lsn);
memcpy(to->di_pad2, from->di_pad2, sizeof(to->di_pad2));
uuid_copy(&to->di_uuid, &from->di_uuid);
to->di_flushiter = 0;
@@ -261,16 +262,25 @@
}
/*
- * If the inode has an LSN in it, recover the inode only if it's less
- * than the lsn of the transaction we are replaying. Note: we still
- * need to replay an owner change even though the inode is more recent
- * than the transaction as there is no guarantee that all the btree
- * blocks are more recent than this transaction, too.
+ * If the inode has an LSN in it, recover the inode only if the on-disk
+ * inode's LSN is older than the lsn of the transaction we are
+ * replaying. We can have multiple checkpoints with the same start LSN,
+ * so the current LSN being equal to the on-disk LSN doesn't necessarily
+ * mean that the on-disk inode is more recent than the change being
+ * replayed.
+ *
+ * We must check the current_lsn against the on-disk inode
+ * here because the we can't trust the log dinode to contain a valid LSN
+ * (see comment below before replaying the log dinode for details).
+ *
+ * Note: we still need to replay an owner change even though the inode
+ * is more recent than the transaction as there is no guarantee that all
+ * the btree blocks are more recent than this transaction, too.
*/
if (dip->di_version >= 3) {
xfs_lsn_t lsn = be64_to_cpu(dip->di_lsn);
- if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) >= 0) {
+ if (lsn && lsn != -1 && XFS_LSN_CMP(lsn, current_lsn) > 0) {
trace_xfs_log_recover_inode_skip(log, in_f);
error = 0;
goto out_owner_change;
@@ -368,8 +378,17 @@
goto out_release;
}
- /* recover the log dinode inode into the on disk inode */
- xfs_log_dinode_to_disk(ldip, dip);
+ /*
+ * Recover the log dinode inode into the on disk inode.
+ *
+ * The LSN in the log dinode is garbage - it can be zero or reflect
+ * stale in-memory runtime state that isn't coherent with the changes
+ * logged in this transaction or the changes written to the on-disk
+ * inode. Hence we write the current lSN into the inode because that
+ * matches what xfs_iflush() would write inode the inode when flushing
+ * the changes in this transaction.
+ */
+ xfs_log_dinode_to_disk(ldip, dip, current_lsn);
fields = in_f->ilf_fields;
if (fields & XFS_ILOG_DEV)
diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c
index 646735a..103fa83 100644
--- a/fs/xfs/xfs_ioctl.c
+++ b/fs/xfs/xfs_ioctl.c
@@ -371,7 +371,7 @@
xfs_ioc_attr_list(
struct xfs_inode *dp,
void __user *ubuf,
- int bufsize,
+ size_t bufsize,
int flags,
struct xfs_attrlist_cursor __user *ucursor)
{
@@ -1689,7 +1689,7 @@
if (bmx.bmv_count < 2)
return -EINVAL;
- if (bmx.bmv_count > ULONG_MAX / recsize)
+ if (bmx.bmv_count >= INT_MAX / recsize)
return -ENOMEM;
buf = kvzalloc(bmx.bmv_count * sizeof(*buf), GFP_KERNEL);
diff --git a/fs/xfs/xfs_ioctl.h b/fs/xfs/xfs_ioctl.h
index bab6a5a..416e20d 100644
--- a/fs/xfs/xfs_ioctl.h
+++ b/fs/xfs/xfs_ioctl.h
@@ -38,8 +38,9 @@
int xfs_ioc_attrmulti_one(struct file *parfilp, struct inode *inode,
uint32_t opcode, void __user *uname, void __user *value,
uint32_t *len, uint32_t flags);
-int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf, int bufsize,
- int flags, struct xfs_attrlist_cursor __user *ucursor);
+int xfs_ioc_attr_list(struct xfs_inode *dp, void __user *ubuf,
+ size_t bufsize, int flags,
+ struct xfs_attrlist_cursor __user *ucursor);
extern struct dentry *
xfs_handle_to_dentry(
diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c
index 7b9ff82..bd5a25f 100644
--- a/fs/xfs/xfs_iomap.c
+++ b/fs/xfs/xfs_iomap.c
@@ -870,6 +870,9 @@
int allocfork = XFS_DATA_FORK;
int error = 0;
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return -EIO;
+
/* we can't use delayed allocations when using extent size hints */
if (xfs_get_extsz_hint(ip))
return xfs_direct_write_iomap_begin(inode, offset, count,
@@ -1059,11 +1062,11 @@
error = xfs_bmbt_to_iomap(ip, srcmap, &imap, 0);
if (error)
return error;
- } else {
- xfs_trim_extent(&cmap, offset_fsb,
- imap.br_startoff - offset_fsb);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
}
- return xfs_bmbt_to_iomap(ip, iomap, &cmap, IOMAP_F_SHARED);
+
+ xfs_trim_extent(&cmap, offset_fsb, imap.br_startoff - offset_fsb);
+ return xfs_bmbt_to_iomap(ip, iomap, &cmap, 0);
out_unlock:
xfs_iunlock(ip, XFS_ILOCK_EXCL);
diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c
index b7f7b31..6a3026e 100644
--- a/fs/xfs/xfs_iops.c
+++ b/fs/xfs/xfs_iops.c
@@ -1328,7 +1328,7 @@
gfp_t gfp_mask;
inode->i_ino = ip->i_ino;
- inode->i_state = I_NEW;
+ inode->i_state |= I_NEW;
inode_sb_list_add(inode);
/* make the inode look hashed for the writeback code */
diff --git a/fs/xfs/xfs_iwalk.c b/fs/xfs/xfs_iwalk.c
index 2a45138..eae3aff 100644
--- a/fs/xfs/xfs_iwalk.c
+++ b/fs/xfs/xfs_iwalk.c
@@ -363,7 +363,7 @@
/* Delete cursor but remember the last record we cached... */
xfs_iwalk_del_inobt(tp, curpp, agi_bpp, 0);
irec = &iwag->recs[iwag->nr_recs - 1];
- ASSERT(next_agino == irec->ir_startino + XFS_INODES_PER_CHUNK);
+ ASSERT(next_agino >= irec->ir_startino + XFS_INODES_PER_CHUNK);
error = xfs_iwalk_ag_recs(iwag);
if (error)
diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c
index fa2d05e..22d7d74 100644
--- a/fs/xfs/xfs_log.c
+++ b/fs/xfs/xfs_log.c
@@ -347,6 +347,25 @@
tic->t_res_num++;
}
+bool
+xfs_log_writable(
+ struct xfs_mount *mp)
+{
+ /*
+ * Never write to the log on norecovery mounts, if the block device is
+ * read-only, or if the filesystem is shutdown. Read-only mounts still
+ * allow internal writes for log recovery and unmount purposes, so don't
+ * restrict that case here.
+ */
+ if (mp->m_flags & XFS_MOUNT_NORECOVERY)
+ return false;
+ if (xfs_readonly_buftarg(mp->m_log->l_targ))
+ return false;
+ if (XFS_FORCED_SHUTDOWN(mp))
+ return false;
+ return true;
+}
+
/*
* Replenish the byte reservation required by moving the grant write head.
*/
@@ -746,6 +765,9 @@
if (readonly)
mp->m_flags |= XFS_MOUNT_RDONLY;
+ /* Make sure the log is dead if we're returning failure. */
+ ASSERT(!error || (mp->m_log->l_flags & XLOG_IO_ERROR));
+
return error;
}
@@ -886,15 +908,8 @@
{
struct xlog *log = mp->m_log;
- /*
- * Don't write out unmount record on norecovery mounts or ro devices.
- * Or, if we are doing a forced umount (typically because of IO errors).
- */
- if (mp->m_flags & XFS_MOUNT_NORECOVERY ||
- xfs_readonly_buftarg(log->l_targ)) {
- ASSERT(mp->m_flags & XFS_MOUNT_RDONLY);
+ if (!xfs_log_writable(mp))
return;
- }
xfs_log_force(mp, XFS_LOG_SYNC);
@@ -3198,14 +3213,13 @@
}
static int
-__xfs_log_force_lsn(
- struct xfs_mount *mp,
+xlog_force_lsn(
+ struct xlog *log,
xfs_lsn_t lsn,
uint flags,
int *log_flushed,
bool already_slept)
{
- struct xlog *log = mp->m_log;
struct xlog_in_core *iclog;
spin_lock(&log->l_icloglock);
@@ -3238,8 +3252,6 @@
if (!already_slept &&
(iclog->ic_prev->ic_state == XLOG_STATE_WANT_SYNC ||
iclog->ic_prev->ic_state == XLOG_STATE_SYNCING)) {
- XFS_STATS_INC(mp, xs_log_force_sleep);
-
xlog_wait(&iclog->ic_prev->ic_write_wait,
&log->l_icloglock);
return -EAGAIN;
@@ -3277,25 +3289,29 @@
* to disk, that thread will wake up all threads waiting on the queue.
*/
int
-xfs_log_force_lsn(
+xfs_log_force_seq(
struct xfs_mount *mp,
- xfs_lsn_t lsn,
+ xfs_csn_t seq,
uint flags,
int *log_flushed)
{
+ struct xlog *log = mp->m_log;
+ xfs_lsn_t lsn;
int ret;
- ASSERT(lsn != 0);
+ ASSERT(seq != 0);
XFS_STATS_INC(mp, xs_log_force);
- trace_xfs_log_force(mp, lsn, _RET_IP_);
+ trace_xfs_log_force(mp, seq, _RET_IP_);
- lsn = xlog_cil_force_lsn(mp->m_log, lsn);
+ lsn = xlog_cil_force_seq(log, seq);
if (lsn == NULLCOMMITLSN)
return 0;
- ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, false);
- if (ret == -EAGAIN)
- ret = __xfs_log_force_lsn(mp, lsn, flags, log_flushed, true);
+ ret = xlog_force_lsn(log, lsn, flags, log_flushed, false);
+ if (ret == -EAGAIN) {
+ XFS_STATS_INC(mp, xs_log_force_sleep);
+ ret = xlog_force_lsn(log, lsn, flags, log_flushed, true);
+ }
return ret;
}
diff --git a/fs/xfs/xfs_log.h b/fs/xfs/xfs_log.h
index 58c3fcb..a1089f8 100644
--- a/fs/xfs/xfs_log.h
+++ b/fs/xfs/xfs_log.h
@@ -106,7 +106,7 @@
struct xfs_trans;
int xfs_log_force(struct xfs_mount *mp, uint flags);
-int xfs_log_force_lsn(struct xfs_mount *mp, xfs_lsn_t lsn, uint flags,
+int xfs_log_force_seq(struct xfs_mount *mp, xfs_csn_t seq, uint flags,
int *log_forced);
int xfs_log_mount(struct xfs_mount *mp,
struct xfs_buftarg *log_target,
@@ -127,12 +127,11 @@
int xfs_log_regrant(struct xfs_mount *mp, struct xlog_ticket *tic);
void xfs_log_unmount(struct xfs_mount *mp);
int xfs_log_force_umount(struct xfs_mount *mp, int logerror);
+bool xfs_log_writable(struct xfs_mount *mp);
struct xlog_ticket *xfs_log_ticket_get(struct xlog_ticket *ticket);
void xfs_log_ticket_put(struct xlog_ticket *ticket);
-void xfs_log_commit_cil(struct xfs_mount *mp, struct xfs_trans *tp,
- xfs_lsn_t *commit_lsn, bool regrant);
void xlog_cil_process_committed(struct list_head *list);
bool xfs_log_item_in_current_chkpt(struct xfs_log_item *lip);
diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c
index b0ef071..fbe160d 100644
--- a/fs/xfs/xfs_log_cil.c
+++ b/fs/xfs/xfs_log_cil.c
@@ -668,9 +668,14 @@
ASSERT(push_seq <= ctx->sequence);
/*
- * Wake up any background push waiters now this context is being pushed.
+ * As we are about to switch to a new, empty CIL context, we no longer
+ * need to throttle tasks on CIL space overruns. Wake any waiters that
+ * the hard push throttle may have caught so they can start committing
+ * to the new context. The ctx->xc_push_lock provides the serialisation
+ * necessary for safely using the lockless waitqueue_active() check in
+ * this context.
*/
- if (ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log))
+ if (waitqueue_active(&cil->xc_push_wait))
wake_up_all(&cil->xc_push_wait);
/*
@@ -772,7 +777,7 @@
* that higher sequences will wait for us to write out a commit record
* before they do.
*
- * xfs_log_force_lsn requires us to mirror the new sequence into the cil
+ * xfs_log_force_seq requires us to mirror the new sequence into the cil
* structure atomically with the addition of this sequence to the
* committing list. This also ensures that we can do unlocked checks
* against the current sequence in log forces without risking
@@ -907,7 +912,7 @@
ASSERT(!list_empty(&cil->xc_cil));
/*
- * don't do a background push if we haven't used up all the
+ * Don't do a background push if we haven't used up all the
* space available yet.
*/
if (cil->xc_ctx->space_used < XLOG_CIL_SPACE_LIMIT(log)) {
@@ -931,9 +936,16 @@
/*
* If we are well over the space limit, throttle the work that is being
- * done until the push work on this context has begun.
+ * done until the push work on this context has begun. Enforce the hard
+ * throttle on all transaction commits once it has been activated, even
+ * if the committing transactions have resulted in the space usage
+ * dipping back down under the hard limit.
+ *
+ * The ctx->xc_push_lock provides the serialisation necessary for safely
+ * using the lockless waitqueue_active() check in this context.
*/
- if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log)) {
+ if (cil->xc_ctx->space_used >= XLOG_CIL_BLOCKING_SPACE_LIMIT(log) ||
+ waitqueue_active(&cil->xc_push_wait)) {
trace_xfs_log_cil_wait(log, cil->xc_ctx->ticket);
ASSERT(cil->xc_ctx->space_used < log->l_logsize);
xlog_wait(&cil->xc_push_wait, &cil->xc_push_lock);
@@ -1008,16 +1020,14 @@
* allowed again.
*/
void
-xfs_log_commit_cil(
- struct xfs_mount *mp,
+xlog_cil_commit(
+ struct xlog *log,
struct xfs_trans *tp,
- xfs_lsn_t *commit_lsn,
+ xfs_csn_t *commit_seq,
bool regrant)
{
- struct xlog *log = mp->m_log;
struct xfs_cil *cil = log->l_cilp;
struct xfs_log_item *lip, *next;
- xfs_lsn_t xc_commit_lsn;
/*
* Do all necessary memory allocation before we lock the CIL.
@@ -1031,10 +1041,6 @@
xlog_cil_insert_items(log, tp);
- xc_commit_lsn = cil->xc_ctx->sequence;
- if (commit_lsn)
- *commit_lsn = xc_commit_lsn;
-
if (regrant && !XLOG_FORCED_SHUTDOWN(log))
xfs_log_ticket_regrant(log, tp->t_ticket);
else
@@ -1057,8 +1063,10 @@
list_for_each_entry_safe(lip, next, &tp->t_items, li_trans) {
xfs_trans_del_item(lip);
if (lip->li_ops->iop_committing)
- lip->li_ops->iop_committing(lip, xc_commit_lsn);
+ lip->li_ops->iop_committing(lip, cil->xc_ctx->sequence);
}
+ if (commit_seq)
+ *commit_seq = cil->xc_ctx->sequence;
/* xlog_cil_push_background() releases cil->xc_ctx_lock */
xlog_cil_push_background(log);
@@ -1075,9 +1083,9 @@
* iclog flush is necessary following this call.
*/
xfs_lsn_t
-xlog_cil_force_lsn(
+xlog_cil_force_seq(
struct xlog *log,
- xfs_lsn_t sequence)
+ xfs_csn_t sequence)
{
struct xfs_cil *cil = log->l_cilp;
struct xfs_cil_ctx *ctx;
@@ -1171,23 +1179,19 @@
*/
bool
xfs_log_item_in_current_chkpt(
- struct xfs_log_item *lip)
+ struct xfs_log_item *lip)
{
- struct xfs_cil_ctx *ctx;
+ struct xfs_cil *cil = lip->li_mountp->m_log->l_cilp;
if (list_empty(&lip->li_cil))
return false;
- ctx = lip->li_mountp->m_log->l_cilp->xc_ctx;
-
/*
* li_seq is written on the first commit of a log item to record the
* first checkpoint it is written to. Hence if it is different to the
* current sequence, we're in a new checkpoint.
*/
- if (XFS_LSN_CMP(lip->li_seq, ctx->sequence) != 0)
- return false;
- return true;
+ return lip->li_seq == READ_ONCE(cil->xc_current_sequence);
}
/*
diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h
index 1c6fdbf..42cd160 100644
--- a/fs/xfs/xfs_log_priv.h
+++ b/fs/xfs/xfs_log_priv.h
@@ -230,7 +230,7 @@
struct xfs_cil_ctx {
struct xfs_cil *cil;
- xfs_lsn_t sequence; /* chkpt sequence # */
+ xfs_csn_t sequence; /* chkpt sequence # */
xfs_lsn_t start_lsn; /* first LSN of chkpt commit */
xfs_lsn_t commit_lsn; /* chkpt commit record lsn */
struct xlog_ticket *ticket; /* chkpt ticket */
@@ -268,10 +268,10 @@
struct xfs_cil_ctx *xc_ctx;
spinlock_t xc_push_lock ____cacheline_aligned_in_smp;
- xfs_lsn_t xc_push_seq;
+ xfs_csn_t xc_push_seq;
struct list_head xc_committing;
wait_queue_head_t xc_commit_wait;
- xfs_lsn_t xc_current_sequence;
+ xfs_csn_t xc_current_sequence;
struct work_struct xc_push_work;
wait_queue_head_t xc_push_wait; /* background push throttle */
} ____cacheline_aligned_in_smp;
@@ -547,19 +547,18 @@
void xlog_cil_init_post_recovery(struct xlog *log);
void xlog_cil_destroy(struct xlog *log);
bool xlog_cil_empty(struct xlog *log);
+void xlog_cil_commit(struct xlog *log, struct xfs_trans *tp,
+ xfs_csn_t *commit_seq, bool regrant);
/*
* CIL force routines
*/
-xfs_lsn_t
-xlog_cil_force_lsn(
- struct xlog *log,
- xfs_lsn_t sequence);
+xfs_lsn_t xlog_cil_force_seq(struct xlog *log, xfs_csn_t sequence);
static inline void
xlog_cil_force(struct xlog *log)
{
- xlog_cil_force_lsn(log, log->l_cilp->xc_current_sequence);
+ xlog_cil_force_seq(log, log->l_cilp->xc_current_sequence);
}
/*
diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c
index 87886b7..e61f28c 100644
--- a/fs/xfs/xfs_log_recover.c
+++ b/fs/xfs/xfs_log_recover.c
@@ -2061,7 +2061,9 @@
old_ptr = item->ri_buf[item->ri_cnt-1].i_addr;
old_len = item->ri_buf[item->ri_cnt-1].i_len;
- ptr = krealloc(old_ptr, len + old_len, GFP_KERNEL | __GFP_NOFAIL);
+ ptr = kvrealloc(old_ptr, old_len, len + old_len, GFP_KERNEL);
+ if (!ptr)
+ return -ENOMEM;
memcpy(&ptr[old_len], dp, len);
item->ri_buf[item->ri_cnt-1].i_len += len;
item->ri_buf[item->ri_cnt-1].i_addr = ptr;
@@ -2457,8 +2459,10 @@
error = xfs_trans_alloc(mp, &resv, dfc->dfc_blkres,
dfc->dfc_rtxres, XFS_TRANS_RESERVE, &tp);
- if (error)
+ if (error) {
+ xfs_force_shutdown(mp, SHUTDOWN_LOG_IO_ERROR);
return error;
+ }
/*
* Transfer to this new transaction all the dfops we captured
@@ -3454,6 +3458,7 @@
* this) before we get around to xfs_log_mount_cancel.
*/
xlog_recover_cancel_intents(log);
+ xfs_force_shutdown(log->l_mp, SHUTDOWN_LOG_IO_ERROR);
xfs_alert(log->l_mp, "Failed to recover intents");
return error;
}
diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c
index 7110507..a2a5a0f 100644
--- a/fs/xfs/xfs_mount.c
+++ b/fs/xfs/xfs_mount.c
@@ -632,6 +632,47 @@
}
/*
+ * Flush and reclaim dirty inodes in preparation for unmount. Inodes and
+ * internal inode structures can be sitting in the CIL and AIL at this point,
+ * so we need to unpin them, write them back and/or reclaim them before unmount
+ * can proceed.
+ *
+ * An inode cluster that has been freed can have its buffer still pinned in
+ * memory because the transaction is still sitting in a iclog. The stale inodes
+ * on that buffer will be pinned to the buffer until the transaction hits the
+ * disk and the callbacks run. Pushing the AIL will skip the stale inodes and
+ * may never see the pinned buffer, so nothing will push out the iclog and
+ * unpin the buffer.
+ *
+ * Hence we need to force the log to unpin everything first. However, log
+ * forces don't wait for the discards they issue to complete, so we have to
+ * explicitly wait for them to complete here as well.
+ *
+ * Then we can tell the world we are unmounting so that error handling knows
+ * that the filesystem is going away and we should error out anything that we
+ * have been retrying in the background. This will prevent never-ending
+ * retries in AIL pushing from hanging the unmount.
+ *
+ * Finally, we can push the AIL to clean all the remaining dirty objects, then
+ * reclaim the remaining inodes that are still in memory at this point in time.
+ */
+static void
+xfs_unmount_flush_inodes(
+ struct xfs_mount *mp)
+{
+ xfs_log_force(mp, XFS_LOG_SYNC);
+ xfs_extent_busy_wait_all(mp);
+ flush_workqueue(xfs_discard_wq);
+
+ mp->m_flags |= XFS_MOUNT_UNMOUNTING;
+
+ xfs_ail_push_all_sync(mp->m_ail);
+ cancel_delayed_work_sync(&mp->m_reclaim_work);
+ xfs_reclaim_inodes(mp);
+ xfs_health_unmount(mp);
+}
+
+/*
* This function does the following on an initial mount of a file system:
* - reads the superblock from disk and init the mount struct
* - if we're a 32-bit kernel, do a size check on the superblock
@@ -927,9 +968,17 @@
/*
* Finish recovering the file system. This part needed to be delayed
* until after the root and real-time bitmap inodes were consistently
- * read in.
+ * read in. Temporarily create per-AG space reservations for metadata
+ * btree shape changes because space freeing transactions (for inode
+ * inactivation) require the per-AG reservation in lieu of reserving
+ * blocks.
*/
+ error = xfs_fs_reserve_ag_blocks(mp);
+ if (error && error == -ENOSPC)
+ xfs_warn(mp,
+ "ENOSPC reserving per-AG metadata pool, log recovery may fail.");
error = xfs_log_mount_finish(mp);
+ xfs_fs_unreserve_ag_blocks(mp);
if (error) {
xfs_warn(mp, "log mount finish failed");
goto out_rtunmount;
@@ -1005,7 +1054,7 @@
/* Clean out dquots that might be in memory after quotacheck. */
xfs_qm_unmount(mp);
/*
- * Cancel all delayed reclaim work and reclaim the inodes directly.
+ * Flush all inode reclamation work and flush the log.
* We have to do this /after/ rtunmount and qm_unmount because those
* two will have scheduled delayed reclaim for the rt/quota inodes.
*
@@ -1015,11 +1064,8 @@
* qm_unmount_quotas and therefore rely on qm_unmount to release the
* quota inodes.
*/
- cancel_delayed_work_sync(&mp->m_reclaim_work);
- xfs_reclaim_inodes(mp);
- xfs_health_unmount(mp);
+ xfs_unmount_flush_inodes(mp);
out_log_dealloc:
- mp->m_flags |= XFS_MOUNT_UNMOUNTING;
xfs_log_mount_cancel(mp);
out_fail_wait:
if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp)
@@ -1060,47 +1106,7 @@
xfs_rtunmount_inodes(mp);
xfs_irele(mp->m_rootip);
- /*
- * We can potentially deadlock here if we have an inode cluster
- * that has been freed has its buffer still pinned in memory because
- * the transaction is still sitting in a iclog. The stale inodes
- * on that buffer will be pinned to the buffer until the
- * transaction hits the disk and the callbacks run. Pushing the AIL will
- * skip the stale inodes and may never see the pinned buffer, so
- * nothing will push out the iclog and unpin the buffer. Hence we
- * need to force the log here to ensure all items are flushed into the
- * AIL before we go any further.
- */
- xfs_log_force(mp, XFS_LOG_SYNC);
-
- /*
- * Wait for all busy extents to be freed, including completion of
- * any discard operation.
- */
- xfs_extent_busy_wait_all(mp);
- flush_workqueue(xfs_discard_wq);
-
- /*
- * We now need to tell the world we are unmounting. This will allow
- * us to detect that the filesystem is going away and we should error
- * out anything that we have been retrying in the background. This will
- * prevent neverending retries in AIL pushing from hanging the unmount.
- */
- mp->m_flags |= XFS_MOUNT_UNMOUNTING;
-
- /*
- * Flush all pending changes from the AIL.
- */
- xfs_ail_push_all_sync(mp->m_ail);
-
- /*
- * Reclaim all inodes. At this point there should be no dirty inodes and
- * none should be pinned or locked. Stop background inode reclaim here
- * if it is still running.
- */
- cancel_delayed_work_sync(&mp->m_reclaim_work);
- xfs_reclaim_inodes(mp);
- xfs_health_unmount(mp);
+ xfs_unmount_flush_inodes(mp);
xfs_qm_unmount(mp);
@@ -1176,8 +1182,7 @@
int
xfs_log_sbcount(xfs_mount_t *mp)
{
- /* allow this to proceed during the freeze sequence... */
- if (!xfs_fs_writable(mp, SB_FREEZE_COMPLETE))
+ if (!xfs_log_writable(mp))
return 0;
/*
diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h
index dfa429b..3a6bc9d 100644
--- a/fs/xfs/xfs_mount.h
+++ b/fs/xfs/xfs_mount.h
@@ -406,6 +406,14 @@
xfs_agnumber_t *maxagi);
extern void xfs_unmountfs(xfs_mount_t *);
+/* Accessor added for 5.10.y backport */
+static inline uint64_t
+xfs_fdblocks_unavailable(
+ struct xfs_mount *mp)
+{
+ return mp->m_alloc_set_aside;
+}
+
extern int xfs_mod_fdblocks(struct xfs_mount *mp, int64_t delta,
bool reserved);
extern int xfs_mod_frextents(struct xfs_mount *mp, int64_t delta);
diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c
index b2a9abe..64e5da3 100644
--- a/fs/xfs/xfs_qm.c
+++ b/fs/xfs/xfs_qm.c
@@ -1786,6 +1786,29 @@
xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_ICOUNT, 1);
/*
+ * Back when we made quota reservations for the chown, we reserved the
+ * ondisk blocks + delalloc blocks with the new dquot. Now that we've
+ * switched the dquots, decrease the new dquot's block reservation
+ * (having already bumped up the real counter) so that we don't have
+ * any reservation to give back when we commit.
+ */
+ xfs_trans_mod_dquot(tp, newdq, XFS_TRANS_DQ_RES_BLKS,
+ -ip->i_delayed_blks);
+
+ /*
+ * Give the incore reservation for delalloc blocks back to the old
+ * dquot. We don't normally handle delalloc quota reservations
+ * transactionally, so just lock the dquot and subtract from the
+ * reservation. Dirty the transaction because it's too late to turn
+ * back now.
+ */
+ tp->t_flags |= XFS_TRANS_DIRTY;
+ xfs_dqlock(prevdq);
+ ASSERT(prevdq->q_blk.reserved >= ip->i_delayed_blks);
+ prevdq->q_blk.reserved -= ip->i_delayed_blks;
+ xfs_dqunlock(prevdq);
+
+ /*
* Take an extra reference, because the inode is going to keep
* this dquot pointer even after the trans_commit.
*/
@@ -1807,84 +1830,39 @@
uint flags)
{
struct xfs_mount *mp = ip->i_mount;
- uint64_t delblks;
unsigned int blkflags;
- struct xfs_dquot *udq_unres = NULL;
- struct xfs_dquot *gdq_unres = NULL;
- struct xfs_dquot *pdq_unres = NULL;
struct xfs_dquot *udq_delblks = NULL;
struct xfs_dquot *gdq_delblks = NULL;
struct xfs_dquot *pdq_delblks = NULL;
- int error;
-
ASSERT(xfs_isilocked(ip, XFS_ILOCK_EXCL|XFS_ILOCK_SHARED));
ASSERT(XFS_IS_QUOTA_RUNNING(mp));
- delblks = ip->i_delayed_blks;
blkflags = XFS_IS_REALTIME_INODE(ip) ?
XFS_QMOPT_RES_RTBLKS : XFS_QMOPT_RES_REGBLKS;
if (XFS_IS_UQUOTA_ON(mp) && udqp &&
- i_uid_read(VFS_I(ip)) != udqp->q_id) {
+ i_uid_read(VFS_I(ip)) != udqp->q_id)
udq_delblks = udqp;
- /*
- * If there are delayed allocation blocks, then we have to
- * unreserve those from the old dquot, and add them to the
- * new dquot.
- */
- if (delblks) {
- ASSERT(ip->i_udquot);
- udq_unres = ip->i_udquot;
- }
- }
+
if (XFS_IS_GQUOTA_ON(ip->i_mount) && gdqp &&
- i_gid_read(VFS_I(ip)) != gdqp->q_id) {
+ i_gid_read(VFS_I(ip)) != gdqp->q_id)
gdq_delblks = gdqp;
- if (delblks) {
- ASSERT(ip->i_gdquot);
- gdq_unres = ip->i_gdquot;
- }
- }
if (XFS_IS_PQUOTA_ON(ip->i_mount) && pdqp &&
- ip->i_d.di_projid != pdqp->q_id) {
+ ip->i_d.di_projid != pdqp->q_id)
pdq_delblks = pdqp;
- if (delblks) {
- ASSERT(ip->i_pdquot);
- pdq_unres = ip->i_pdquot;
- }
- }
-
- error = xfs_trans_reserve_quota_bydquots(tp, ip->i_mount,
- udq_delblks, gdq_delblks, pdq_delblks,
- ip->i_d.di_nblocks, 1, flags | blkflags);
- if (error)
- return error;
/*
- * Do the delayed blks reservations/unreservations now. Since, these
- * are done without the help of a transaction, if a reservation fails
- * its previous reservations won't be automatically undone by trans
- * code. So, we have to do it manually here.
+ * Reserve enough quota to handle blocks on disk and reserved for a
+ * delayed allocation. We'll actually transfer the delalloc
+ * reservation between dquots at chown time, even though that part is
+ * only semi-transactional.
*/
- if (delblks) {
- /*
- * Do the reservations first. Unreservation can't fail.
- */
- ASSERT(udq_delblks || gdq_delblks || pdq_delblks);
- ASSERT(udq_unres || gdq_unres || pdq_unres);
- error = xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
- udq_delblks, gdq_delblks, pdq_delblks,
- (xfs_qcnt_t)delblks, 0, flags | blkflags);
- if (error)
- return error;
- xfs_trans_reserve_quota_bydquots(NULL, ip->i_mount,
- udq_unres, gdq_unres, pdq_unres,
- -((xfs_qcnt_t)delblks), 0, blkflags);
- }
-
- return 0;
+ return xfs_trans_reserve_quota_bydquots(tp, ip->i_mount, udq_delblks,
+ gdq_delblks, pdq_delblks,
+ ip->i_d.di_nblocks + ip->i_delayed_blks,
+ 1, blkflags | flags);
}
int
diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c
index 6fa05fb..aa46b75 100644
--- a/fs/xfs/xfs_reflink.c
+++ b/fs/xfs/xfs_reflink.c
@@ -1503,7 +1503,8 @@
if (error)
goto out;
- error = filemap_write_and_wait_range(inode->i_mapping, offset, len);
+ error = filemap_write_and_wait_range(inode->i_mapping, offset,
+ offset + len - 1);
if (error)
goto out;
diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c
index e3e229e..434c87c 100644
--- a/fs/xfs/xfs_super.c
+++ b/fs/xfs/xfs_super.c
@@ -199,10 +199,12 @@
seq_printf(m, ",swidth=%d",
(int)XFS_FSB_TO_BB(mp, mp->m_swidth));
- if (mp->m_qflags & (XFS_UQUOTA_ACCT|XFS_UQUOTA_ENFD))
- seq_puts(m, ",usrquota");
- else if (mp->m_qflags & XFS_UQUOTA_ACCT)
- seq_puts(m, ",uqnoenforce");
+ if (mp->m_qflags & XFS_UQUOTA_ACCT) {
+ if (mp->m_qflags & XFS_UQUOTA_ENFD)
+ seq_puts(m, ",usrquota");
+ else
+ seq_puts(m, ",uqnoenforce");
+ }
if (mp->m_qflags & XFS_PQUOTA_ACCT) {
if (mp->m_qflags & XFS_PQUOTA_ENFD)
@@ -755,6 +757,7 @@
int wait)
{
struct xfs_mount *mp = XFS_M(sb);
+ int error;
/*
* Doing anything during the async pass would be counterproductive.
@@ -762,7 +765,10 @@
if (!wait)
return 0;
- xfs_log_force(mp, XFS_LOG_SYNC);
+ error = xfs_log_force(mp, XFS_LOG_SYNC);
+ if (error)
+ return error;
+
if (laptop_mode) {
/*
* The disk must be active because we're syncing.
@@ -1153,6 +1159,22 @@
return ret;
}
+static inline void
+xfs_fs_warn_deprecated(
+ struct fs_context *fc,
+ struct fs_parameter *param,
+ uint64_t flag,
+ bool value)
+{
+ /* Don't print the warning if reconfiguring and current mount point
+ * already had the flag set
+ */
+ if ((fc->purpose & FS_CONTEXT_FOR_RECONFIGURE) &&
+ !!(XFS_M(fc->root->d_sb)->m_flags & flag) == value)
+ return;
+ xfs_warn(fc->s_fs_info, "%s mount option is deprecated.", param->key);
+}
+
/*
* Set mount state from a mount option.
*
@@ -1163,7 +1185,7 @@
struct fs_context *fc,
struct fs_parameter *param)
{
- struct xfs_mount *mp = fc->s_fs_info;
+ struct xfs_mount *parsing_mp = fc->s_fs_info;
struct fs_parse_result result;
int size = 0;
int opt;
@@ -1174,142 +1196,142 @@
switch (opt) {
case Opt_logbufs:
- mp->m_logbufs = result.uint_32;
+ parsing_mp->m_logbufs = result.uint_32;
return 0;
case Opt_logbsize:
- if (suffix_kstrtoint(param->string, 10, &mp->m_logbsize))
+ if (suffix_kstrtoint(param->string, 10, &parsing_mp->m_logbsize))
return -EINVAL;
return 0;
case Opt_logdev:
- kfree(mp->m_logname);
- mp->m_logname = kstrdup(param->string, GFP_KERNEL);
- if (!mp->m_logname)
+ kfree(parsing_mp->m_logname);
+ parsing_mp->m_logname = kstrdup(param->string, GFP_KERNEL);
+ if (!parsing_mp->m_logname)
return -ENOMEM;
return 0;
case Opt_rtdev:
- kfree(mp->m_rtname);
- mp->m_rtname = kstrdup(param->string, GFP_KERNEL);
- if (!mp->m_rtname)
+ kfree(parsing_mp->m_rtname);
+ parsing_mp->m_rtname = kstrdup(param->string, GFP_KERNEL);
+ if (!parsing_mp->m_rtname)
return -ENOMEM;
return 0;
case Opt_allocsize:
if (suffix_kstrtoint(param->string, 10, &size))
return -EINVAL;
- mp->m_allocsize_log = ffs(size) - 1;
- mp->m_flags |= XFS_MOUNT_ALLOCSIZE;
+ parsing_mp->m_allocsize_log = ffs(size) - 1;
+ parsing_mp->m_flags |= XFS_MOUNT_ALLOCSIZE;
return 0;
case Opt_grpid:
case Opt_bsdgroups:
- mp->m_flags |= XFS_MOUNT_GRPID;
+ parsing_mp->m_flags |= XFS_MOUNT_GRPID;
return 0;
case Opt_nogrpid:
case Opt_sysvgroups:
- mp->m_flags &= ~XFS_MOUNT_GRPID;
+ parsing_mp->m_flags &= ~XFS_MOUNT_GRPID;
return 0;
case Opt_wsync:
- mp->m_flags |= XFS_MOUNT_WSYNC;
+ parsing_mp->m_flags |= XFS_MOUNT_WSYNC;
return 0;
case Opt_norecovery:
- mp->m_flags |= XFS_MOUNT_NORECOVERY;
+ parsing_mp->m_flags |= XFS_MOUNT_NORECOVERY;
return 0;
case Opt_noalign:
- mp->m_flags |= XFS_MOUNT_NOALIGN;
+ parsing_mp->m_flags |= XFS_MOUNT_NOALIGN;
return 0;
case Opt_swalloc:
- mp->m_flags |= XFS_MOUNT_SWALLOC;
+ parsing_mp->m_flags |= XFS_MOUNT_SWALLOC;
return 0;
case Opt_sunit:
- mp->m_dalign = result.uint_32;
+ parsing_mp->m_dalign = result.uint_32;
return 0;
case Opt_swidth:
- mp->m_swidth = result.uint_32;
+ parsing_mp->m_swidth = result.uint_32;
return 0;
case Opt_inode32:
- mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
+ parsing_mp->m_flags |= XFS_MOUNT_SMALL_INUMS;
return 0;
case Opt_inode64:
- mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
+ parsing_mp->m_flags &= ~XFS_MOUNT_SMALL_INUMS;
return 0;
case Opt_nouuid:
- mp->m_flags |= XFS_MOUNT_NOUUID;
+ parsing_mp->m_flags |= XFS_MOUNT_NOUUID;
return 0;
case Opt_largeio:
- mp->m_flags |= XFS_MOUNT_LARGEIO;
+ parsing_mp->m_flags |= XFS_MOUNT_LARGEIO;
return 0;
case Opt_nolargeio:
- mp->m_flags &= ~XFS_MOUNT_LARGEIO;
+ parsing_mp->m_flags &= ~XFS_MOUNT_LARGEIO;
return 0;
case Opt_filestreams:
- mp->m_flags |= XFS_MOUNT_FILESTREAMS;
+ parsing_mp->m_flags |= XFS_MOUNT_FILESTREAMS;
return 0;
case Opt_noquota:
- mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT;
- mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD;
- mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE;
+ parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ACCT;
+ parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ENFD;
+ parsing_mp->m_qflags &= ~XFS_ALL_QUOTA_ACTIVE;
return 0;
case Opt_quota:
case Opt_uquota:
case Opt_usrquota:
- mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
+ parsing_mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE |
XFS_UQUOTA_ENFD);
return 0;
case Opt_qnoenforce:
case Opt_uqnoenforce:
- mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
- mp->m_qflags &= ~XFS_UQUOTA_ENFD;
+ parsing_mp->m_qflags |= (XFS_UQUOTA_ACCT | XFS_UQUOTA_ACTIVE);
+ parsing_mp->m_qflags &= ~XFS_UQUOTA_ENFD;
return 0;
case Opt_pquota:
case Opt_prjquota:
- mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
+ parsing_mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE |
XFS_PQUOTA_ENFD);
return 0;
case Opt_pqnoenforce:
- mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
- mp->m_qflags &= ~XFS_PQUOTA_ENFD;
+ parsing_mp->m_qflags |= (XFS_PQUOTA_ACCT | XFS_PQUOTA_ACTIVE);
+ parsing_mp->m_qflags &= ~XFS_PQUOTA_ENFD;
return 0;
case Opt_gquota:
case Opt_grpquota:
- mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
+ parsing_mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE |
XFS_GQUOTA_ENFD);
return 0;
case Opt_gqnoenforce:
- mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
- mp->m_qflags &= ~XFS_GQUOTA_ENFD;
+ parsing_mp->m_qflags |= (XFS_GQUOTA_ACCT | XFS_GQUOTA_ACTIVE);
+ parsing_mp->m_qflags &= ~XFS_GQUOTA_ENFD;
return 0;
case Opt_discard:
- mp->m_flags |= XFS_MOUNT_DISCARD;
+ parsing_mp->m_flags |= XFS_MOUNT_DISCARD;
return 0;
case Opt_nodiscard:
- mp->m_flags &= ~XFS_MOUNT_DISCARD;
+ parsing_mp->m_flags &= ~XFS_MOUNT_DISCARD;
return 0;
#ifdef CONFIG_FS_DAX
case Opt_dax:
- xfs_mount_set_dax_mode(mp, XFS_DAX_ALWAYS);
+ xfs_mount_set_dax_mode(parsing_mp, XFS_DAX_ALWAYS);
return 0;
case Opt_dax_enum:
- xfs_mount_set_dax_mode(mp, result.uint_32);
+ xfs_mount_set_dax_mode(parsing_mp, result.uint_32);
return 0;
#endif
/* Following mount options will be removed in September 2025 */
case Opt_ikeep:
- xfs_warn(mp, "%s mount option is deprecated.", param->key);
- mp->m_flags |= XFS_MOUNT_IKEEP;
+ xfs_fs_warn_deprecated(fc, param, XFS_MOUNT_IKEEP, true);
+ parsing_mp->m_flags |= XFS_MOUNT_IKEEP;
return 0;
case Opt_noikeep:
- xfs_warn(mp, "%s mount option is deprecated.", param->key);
- mp->m_flags &= ~XFS_MOUNT_IKEEP;
+ xfs_fs_warn_deprecated(fc, param, XFS_MOUNT_IKEEP, false);
+ parsing_mp->m_flags &= ~XFS_MOUNT_IKEEP;
return 0;
case Opt_attr2:
- xfs_warn(mp, "%s mount option is deprecated.", param->key);
- mp->m_flags |= XFS_MOUNT_ATTR2;
+ xfs_fs_warn_deprecated(fc, param, XFS_MOUNT_ATTR2, true);
+ parsing_mp->m_flags |= XFS_MOUNT_ATTR2;
return 0;
case Opt_noattr2:
- xfs_warn(mp, "%s mount option is deprecated.", param->key);
- mp->m_flags &= ~XFS_MOUNT_ATTR2;
- mp->m_flags |= XFS_MOUNT_NOATTR2;
+ xfs_fs_warn_deprecated(fc, param, XFS_MOUNT_NOATTR2, true);
+ parsing_mp->m_flags &= ~XFS_MOUNT_ATTR2;
+ parsing_mp->m_flags |= XFS_MOUNT_NOATTR2;
return 0;
default:
- xfs_warn(mp, "unknown mount option [%s].", param->key);
+ xfs_warn(parsing_mp, "unknown mount option [%s].", param->key);
return -EINVAL;
}
@@ -1693,7 +1715,15 @@
xfs_remount_ro(
struct xfs_mount *mp)
{
- int error;
+ struct xfs_eofblocks eofb = {
+ .eof_flags = XFS_EOF_FLAGS_SYNC,
+ };
+ int error;
+
+ /* Flush all the dirty data to disk. */
+ error = sync_filesystem(mp->m_super);
+ if (error)
+ return error;
/*
* Cancel background eofb scanning so it cannot race with the final
@@ -1701,8 +1731,13 @@
*/
xfs_stop_block_reaping(mp);
- /* Get rid of any leftover CoW reservations... */
- error = xfs_icache_free_cowblocks(mp, NULL);
+ /*
+ * Clear out all remaining COW staging extents and speculative post-EOF
+ * preallocations so that we don't leave inodes requiring inactivation
+ * cleanups during reclaim on a read-only mount. We must process every
+ * cached inode, so this requires a synchronous cache scan.
+ */
+ error = xfs_icache_free_cowblocks(mp, &eofb);
if (error) {
xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE);
return error;
@@ -1760,8 +1795,6 @@
if (error)
return error;
- sync_filesystem(mp->m_super);
-
/* inode32 -> inode64 */
if ((mp->m_flags & XFS_MOUNT_SMALL_INUMS) &&
!(new_mp->m_flags & XFS_MOUNT_SMALL_INUMS)) {
@@ -1908,7 +1941,7 @@
if (!xfs_ifork_zone)
goto out_destroy_da_state_zone;
- xfs_trans_zone = kmem_cache_create("xf_trans",
+ xfs_trans_zone = kmem_cache_create("xfs_trans",
sizeof(struct xfs_trans),
0, 0, NULL);
if (!xfs_trans_zone)
diff --git a/fs/xfs/xfs_symlink.c b/fs/xfs/xfs_symlink.c
index 8e88a7c..8d3abf0 100644
--- a/fs/xfs/xfs_symlink.c
+++ b/fs/xfs/xfs_symlink.c
@@ -300,6 +300,7 @@
}
ASSERT(pathlen == 0);
}
+ i_size_write(VFS_I(ip), ip->i_d.di_size);
/*
* Create the directory entry for the symlink.
diff --git a/fs/xfs/xfs_trans.c b/fs/xfs/xfs_trans.c
index c94e71f..73a1de7 100644
--- a/fs/xfs/xfs_trans.c
+++ b/fs/xfs/xfs_trans.c
@@ -68,6 +68,7 @@
xfs_extent_busy_clear(tp->t_mountp, &tp->t_busy, false);
trace_xfs_trans_free(tp, _RET_IP_);
+ xfs_trans_clear_context(tp);
if (!(tp->t_flags & XFS_TRANS_NO_WRITECOUNT))
sb_end_intwrite(tp->t_mountp->m_super);
xfs_trans_free_dqinfo(tp);
@@ -119,7 +120,8 @@
ntp->t_rtx_res = tp->t_rtx_res - tp->t_rtx_res_used;
tp->t_rtx_res = tp->t_rtx_res_used;
- ntp->t_pflags = tp->t_pflags;
+
+ xfs_trans_switch_context(tp, ntp);
/* move deferred ops over to the new tp */
xfs_defer_move(ntp, tp);
@@ -153,9 +155,6 @@
int error = 0;
bool rsvd = (tp->t_flags & XFS_TRANS_RESERVE) != 0;
- /* Mark this thread as being in a transaction */
- current_set_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
-
/*
* Attempt to reserve the needed disk blocks by decrementing
* the number needed from the number available. This will
@@ -163,10 +162,8 @@
*/
if (blocks > 0) {
error = xfs_mod_fdblocks(mp, -((int64_t)blocks), rsvd);
- if (error != 0) {
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
+ if (error != 0)
return -ENOSPC;
- }
tp->t_blk_res += blocks;
}
@@ -240,9 +237,6 @@
xfs_mod_fdblocks(mp, (int64_t)blocks, rsvd);
tp->t_blk_res = 0;
}
-
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
-
return error;
}
@@ -266,6 +260,7 @@
tp = kmem_cache_zalloc(xfs_trans_zone, GFP_KERNEL | __GFP_NOFAIL);
if (!(flags & XFS_TRANS_NO_WRITECOUNT))
sb_start_intwrite(mp->m_super);
+ xfs_trans_set_context(tp);
/*
* Zero-reservation ("empty") transactions can't modify anything, so
@@ -620,6 +615,9 @@
/* apply remaining deltas */
spin_lock(&mp->m_sb_lock);
+ mp->m_sb.sb_fdblocks += tp->t_fdblocks_delta + tp->t_res_fdblocks_delta;
+ mp->m_sb.sb_icount += idelta;
+ mp->m_sb.sb_ifree += ifreedelta;
mp->m_sb.sb_frextents += rtxdelta;
mp->m_sb.sb_dblocks += tp->t_dblocks_delta;
mp->m_sb.sb_agcount += tp->t_agcount_delta;
@@ -834,7 +832,7 @@
bool regrant)
{
struct xfs_mount *mp = tp->t_mountp;
- xfs_lsn_t commit_lsn = -1;
+ xfs_csn_t commit_seq = 0;
int error = 0;
int sync = tp->t_flags & XFS_TRANS_SYNC;
@@ -876,9 +874,8 @@
xfs_trans_apply_sb_deltas(tp);
xfs_trans_apply_dquot_deltas(tp);
- xfs_log_commit_cil(mp, tp, &commit_lsn, regrant);
+ xlog_cil_commit(mp->m_log, tp, &commit_seq, regrant);
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
xfs_trans_free(tp);
/*
@@ -886,7 +883,7 @@
* log out now and wait for it.
*/
if (sync) {
- error = xfs_log_force_lsn(mp, commit_lsn, XFS_LOG_SYNC, NULL);
+ error = xfs_log_force_seq(mp, commit_seq, XFS_LOG_SYNC, NULL);
XFS_STATS_INC(mp, xs_trans_sync);
} else {
XFS_STATS_INC(mp, xs_trans_async);
@@ -910,7 +907,6 @@
xfs_log_ticket_ungrant(mp->m_log, tp->t_ticket);
tp->t_ticket = NULL;
}
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
xfs_trans_free_items(tp, !!error);
xfs_trans_free(tp);
@@ -970,9 +966,6 @@
tp->t_ticket = NULL;
}
- /* mark this thread as no longer being in a transaction */
- current_restore_flags_nested(&tp->t_pflags, PF_MEMALLOC_NOFS);
-
xfs_trans_free_items(tp, dirty);
xfs_trans_free(tp);
}
diff --git a/fs/xfs/xfs_trans.h b/fs/xfs/xfs_trans.h
index 0846589..9748555 100644
--- a/fs/xfs/xfs_trans.h
+++ b/fs/xfs/xfs_trans.h
@@ -43,7 +43,7 @@
struct list_head li_cil; /* CIL pointers */
struct xfs_log_vec *li_lv; /* active log vector */
struct xfs_log_vec *li_lv_shadow; /* standby vector */
- xfs_lsn_t li_seq; /* CIL commit seq */
+ xfs_csn_t li_seq; /* CIL commit seq */
};
/*
@@ -69,7 +69,7 @@
void (*iop_pin)(struct xfs_log_item *);
void (*iop_unpin)(struct xfs_log_item *, int remove);
uint (*iop_push)(struct xfs_log_item *, struct list_head *);
- void (*iop_committing)(struct xfs_log_item *, xfs_lsn_t commit_lsn);
+ void (*iop_committing)(struct xfs_log_item *lip, xfs_csn_t seq);
void (*iop_release)(struct xfs_log_item *);
xfs_lsn_t (*iop_committed)(struct xfs_log_item *, xfs_lsn_t);
int (*iop_recover)(struct xfs_log_item *lip,
@@ -268,4 +268,34 @@
return lip->li_ops->iop_relog(lip, tp);
}
+static inline void
+xfs_trans_set_context(
+ struct xfs_trans *tp)
+{
+ ASSERT(current->journal_info == NULL);
+ tp->t_pflags = memalloc_nofs_save();
+ current->journal_info = tp;
+}
+
+static inline void
+xfs_trans_clear_context(
+ struct xfs_trans *tp)
+{
+ if (current->journal_info == tp) {
+ memalloc_nofs_restore(tp->t_pflags);
+ current->journal_info = NULL;
+ }
+}
+
+static inline void
+xfs_trans_switch_context(
+ struct xfs_trans *old_tp,
+ struct xfs_trans *new_tp)
+{
+ ASSERT(current->journal_info == old_tp);
+ new_tp->t_pflags = old_tp->t_pflags;
+ old_tp->t_pflags = 0;
+ current->journal_info = new_tp;
+}
+
#endif /* __XFS_TRANS_H__ */
diff --git a/fs/xfs/xfs_trans_dquot.c b/fs/xfs/xfs_trans_dquot.c
index fe45b0c..288ea38 100644
--- a/fs/xfs/xfs_trans_dquot.c
+++ b/fs/xfs/xfs_trans_dquot.c
@@ -615,7 +615,6 @@
return QUOTA_NL_ISOFTLONGWARN;
}
- res->warnings++;
return QUOTA_NL_ISOFTWARN;
}
diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c
index e60759d..475d23a 100644
--- a/fs/zonefs/super.c
+++ b/fs/zonefs/super.c
@@ -32,6 +32,17 @@
lockdep_assert_held(&zi->i_truncate_mutex);
+ /*
+ * With ZNS drives, closing an explicitly open zone that has not been
+ * written will change the zone state to "closed", that is, the zone
+ * will remain active. Since this can then cause failure of explicit
+ * open operation on other zones if the drive active zone resources
+ * are exceeded, make sure that the zone does not remain active by
+ * resetting it.
+ */
+ if (op == REQ_OP_ZONE_CLOSE && !zi->i_wpoffset)
+ op = REQ_OP_ZONE_RESET;
+
ret = blkdev_zone_mgmt(inode->i_sb->s_bdev, op, zi->i_zsector,
zi->i_zone_size >> SECTOR_SHIFT, GFP_NOFS);
if (ret) {
@@ -57,15 +68,49 @@
zi->i_flags &= ~ZONEFS_ZONE_OPEN;
}
-static int zonefs_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
- unsigned int flags, struct iomap *iomap,
- struct iomap *srcmap)
+static int zonefs_read_iomap_begin(struct inode *inode, loff_t offset,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
struct super_block *sb = inode->i_sb;
loff_t isize;
- /* All I/Os should always be within the file maximum size */
+ /*
+ * All blocks are always mapped below EOF. If reading past EOF,
+ * act as if there is a hole up to the file maximum size.
+ */
+ mutex_lock(&zi->i_truncate_mutex);
+ iomap->bdev = inode->i_sb->s_bdev;
+ iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
+ isize = i_size_read(inode);
+ if (iomap->offset >= isize) {
+ iomap->type = IOMAP_HOLE;
+ iomap->addr = IOMAP_NULL_ADDR;
+ iomap->length = length;
+ } else {
+ iomap->type = IOMAP_MAPPED;
+ iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
+ iomap->length = isize - iomap->offset;
+ }
+ mutex_unlock(&zi->i_truncate_mutex);
+
+ return 0;
+}
+
+static const struct iomap_ops zonefs_read_iomap_ops = {
+ .iomap_begin = zonefs_read_iomap_begin,
+};
+
+static int zonefs_write_iomap_begin(struct inode *inode, loff_t offset,
+ loff_t length, unsigned int flags,
+ struct iomap *iomap, struct iomap *srcmap)
+{
+ struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ struct super_block *sb = inode->i_sb;
+ loff_t isize;
+
+ /* All write I/Os should always be within the file maximum size */
if (WARN_ON_ONCE(offset + length > zi->i_max_size))
return -EIO;
@@ -75,7 +120,7 @@
* operation.
*/
if (WARN_ON_ONCE(zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
- (flags & IOMAP_WRITE) && !(flags & IOMAP_DIRECT)))
+ !(flags & IOMAP_DIRECT)))
return -EIO;
/*
@@ -84,45 +129,42 @@
* write pointer) and unwriten beyond.
*/
mutex_lock(&zi->i_truncate_mutex);
- isize = i_size_read(inode);
- if (offset >= isize)
- iomap->type = IOMAP_UNWRITTEN;
- else
- iomap->type = IOMAP_MAPPED;
- if (flags & IOMAP_WRITE)
- length = zi->i_max_size - offset;
- else
- length = min(length, isize - offset);
- mutex_unlock(&zi->i_truncate_mutex);
-
- iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
- iomap->length = ALIGN(offset + length, sb->s_blocksize) - iomap->offset;
iomap->bdev = inode->i_sb->s_bdev;
+ iomap->offset = ALIGN_DOWN(offset, sb->s_blocksize);
iomap->addr = (zi->i_zsector << SECTOR_SHIFT) + iomap->offset;
+ isize = i_size_read(inode);
+ if (iomap->offset >= isize) {
+ iomap->type = IOMAP_UNWRITTEN;
+ iomap->length = zi->i_max_size - iomap->offset;
+ } else {
+ iomap->type = IOMAP_MAPPED;
+ iomap->length = isize - iomap->offset;
+ }
+ mutex_unlock(&zi->i_truncate_mutex);
return 0;
}
-static const struct iomap_ops zonefs_iomap_ops = {
- .iomap_begin = zonefs_iomap_begin,
+static const struct iomap_ops zonefs_write_iomap_ops = {
+ .iomap_begin = zonefs_write_iomap_begin,
};
static int zonefs_readpage(struct file *unused, struct page *page)
{
- return iomap_readpage(page, &zonefs_iomap_ops);
+ return iomap_readpage(page, &zonefs_read_iomap_ops);
}
static void zonefs_readahead(struct readahead_control *rac)
{
- iomap_readahead(rac, &zonefs_iomap_ops);
+ iomap_readahead(rac, &zonefs_read_iomap_ops);
}
/*
* Map blocks for page writeback. This is used only on conventional zone files,
* which implies that the page range can only be within the fixed inode size.
*/
-static int zonefs_map_blocks(struct iomap_writepage_ctx *wpc,
- struct inode *inode, loff_t offset)
+static int zonefs_write_map_blocks(struct iomap_writepage_ctx *wpc,
+ struct inode *inode, loff_t offset)
{
struct zonefs_inode_info *zi = ZONEFS_I(inode);
@@ -136,12 +178,12 @@
offset < wpc->iomap.offset + wpc->iomap.length)
return 0;
- return zonefs_iomap_begin(inode, offset, zi->i_max_size - offset,
- IOMAP_WRITE, &wpc->iomap, NULL);
+ return zonefs_write_iomap_begin(inode, offset, zi->i_max_size - offset,
+ IOMAP_WRITE, &wpc->iomap, NULL);
}
static const struct iomap_writeback_ops zonefs_writeback_ops = {
- .map_blocks = zonefs_map_blocks,
+ .map_blocks = zonefs_write_map_blocks,
};
static int zonefs_writepage(struct page *page, struct writeback_control *wbc)
@@ -171,7 +213,8 @@
return -EINVAL;
}
- return iomap_swapfile_activate(sis, swap_file, span, &zonefs_iomap_ops);
+ return iomap_swapfile_activate(sis, swap_file, span,
+ &zonefs_read_iomap_ops);
}
static const struct address_space_operations zonefs_file_aops = {
@@ -397,8 +440,7 @@
struct super_block *sb = inode->i_sb;
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
unsigned int noio_flag;
- unsigned int nr_zones =
- zi->i_zone_size >> (sbi->s_zone_sectors_shift + SECTOR_SHIFT);
+ unsigned int nr_zones = 1;
struct zonefs_ioerr_data err = {
.inode = inode,
.write = write,
@@ -406,6 +448,15 @@
int ret;
/*
+ * The only files that have more than one zone are conventional zone
+ * files with aggregated conventional zones, for which the inode zone
+ * size is always larger than the device zone size.
+ */
+ if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev))
+ nr_zones = zi->i_zone_size >>
+ (sbi->s_zone_sectors_shift + SECTOR_SHIFT);
+
+ /*
* Memory allocations in blkdev_report_zones() can trigger a memory
* reclaim which may in turn cause a recursion into zonefs as well as
* struct request allocations for the same device. The former case may
@@ -601,7 +652,7 @@
/* Serialize against truncates */
down_read(&zi->i_mmap_sem);
- ret = iomap_page_mkwrite(vmf, &zonefs_iomap_ops);
+ ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops);
up_read(&zi->i_mmap_sem);
sb_end_pagefault(inode->i_sb);
@@ -858,7 +909,7 @@
if (append)
ret = zonefs_file_dio_append(iocb, from);
else
- ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
+ ret = iomap_dio_rw(iocb, from, &zonefs_write_iomap_ops,
&zonefs_write_dio_ops, sync);
if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
(ret > 0 || ret == -EIOCBQUEUED)) {
@@ -900,7 +951,7 @@
if (ret <= 0)
goto inode_unlock;
- ret = iomap_file_buffered_write(iocb, from, &zonefs_iomap_ops);
+ ret = iomap_file_buffered_write(iocb, from, &zonefs_write_iomap_ops);
if (ret > 0)
iocb->ki_pos += ret;
else if (ret == -EIO)
@@ -993,7 +1044,7 @@
goto inode_unlock;
}
file_accessed(iocb->ki_filp);
- ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
+ ret = iomap_dio_rw(iocb, to, &zonefs_read_iomap_ops,
&zonefs_read_dio_ops, is_sync_kiocb(iocb));
} else {
ret = generic_file_read_iter(iocb, to);
@@ -1152,6 +1203,7 @@
mutex_init(&zi->i_truncate_mutex);
init_rwsem(&zi->i_mmap_sem);
zi->i_wr_refcnt = 0;
+ zi->i_flags = 0;
return &zi->i_vnode;
}
@@ -1306,12 +1358,13 @@
inc_nlink(parent);
}
-static void zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
- enum zonefs_ztype type)
+static int zonefs_init_file_inode(struct inode *inode, struct blk_zone *zone,
+ enum zonefs_ztype type)
{
struct super_block *sb = inode->i_sb;
struct zonefs_sb_info *sbi = ZONEFS_SB(sb);
struct zonefs_inode_info *zi = ZONEFS_I(inode);
+ int ret = 0;
inode->i_ino = zone->start >> sbi->s_zone_sectors_shift;
inode->i_mode = S_IFREG | sbi->s_perm;
@@ -1319,6 +1372,14 @@
zi->i_ztype = type;
zi->i_zsector = zone->start;
zi->i_zone_size = zone->len << SECTOR_SHIFT;
+ if (zi->i_zone_size > bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT &&
+ !(sbi->s_features & ZONEFS_F_AGGRCNV)) {
+ zonefs_err(sb,
+ "zone size %llu doesn't match device's zone sectors %llu\n",
+ zi->i_zone_size,
+ bdev_zone_sectors(sb->s_bdev) << SECTOR_SHIFT);
+ return -EINVAL;
+ }
zi->i_max_size = min_t(loff_t, MAX_LFS_FILESIZE,
zone->capacity << SECTOR_SHIFT);
@@ -1336,6 +1397,22 @@
sb->s_maxbytes = max(zi->i_max_size, sb->s_maxbytes);
sbi->s_blocks += zi->i_max_size >> sb->s_blocksize_bits;
sbi->s_used_blocks += zi->i_wpoffset >> sb->s_blocksize_bits;
+
+ /*
+ * For sequential zones, make sure that any open zone is closed first
+ * to ensure that the initial number of open zones is 0, in sync with
+ * the open zone accounting done when the mount option
+ * ZONEFS_MNTOPT_EXPLICIT_OPEN is used.
+ */
+ if (type == ZONEFS_ZTYPE_SEQ &&
+ (zone->cond == BLK_ZONE_COND_IMP_OPEN ||
+ zone->cond == BLK_ZONE_COND_EXP_OPEN)) {
+ mutex_lock(&zi->i_truncate_mutex);
+ ret = zonefs_zone_mgmt(inode, REQ_OP_ZONE_CLOSE);
+ mutex_unlock(&zi->i_truncate_mutex);
+ }
+
+ return ret;
}
static struct dentry *zonefs_create_inode(struct dentry *parent,
@@ -1345,20 +1422,27 @@
struct inode *dir = d_inode(parent);
struct dentry *dentry;
struct inode *inode;
+ int ret = -ENOMEM;
dentry = d_alloc_name(parent, name);
if (!dentry)
- return NULL;
+ return ERR_PTR(ret);
inode = new_inode(parent->d_sb);
if (!inode)
goto dput;
inode->i_ctime = inode->i_mtime = inode->i_atime = dir->i_ctime;
- if (zone)
- zonefs_init_file_inode(inode, zone, type);
- else
+ if (zone) {
+ ret = zonefs_init_file_inode(inode, zone, type);
+ if (ret) {
+ iput(inode);
+ goto dput;
+ }
+ } else {
zonefs_init_dir_inode(dir, inode, type);
+ }
+
d_add(dentry, inode);
dir->i_size++;
@@ -1367,7 +1451,7 @@
dput:
dput(dentry);
- return NULL;
+ return ERR_PTR(ret);
}
struct zonefs_zone_data {
@@ -1387,7 +1471,7 @@
struct blk_zone *zone, *next, *end;
const char *zgroup_name;
char *file_name;
- struct dentry *dir;
+ struct dentry *dir, *dent;
unsigned int n = 0;
int ret;
@@ -1405,8 +1489,8 @@
zgroup_name = "seq";
dir = zonefs_create_inode(sb->s_root, zgroup_name, NULL, type);
- if (!dir) {
- ret = -ENOMEM;
+ if (IS_ERR(dir)) {
+ ret = PTR_ERR(dir);
goto free;
}
@@ -1452,8 +1536,9 @@
* Use the file number within its group as file name.
*/
snprintf(file_name, ZONEFS_NAME_MAX - 1, "%u", n);
- if (!zonefs_create_inode(dir, file_name, zone, type)) {
- ret = -ENOMEM;
+ dent = zonefs_create_inode(dir, file_name, zone, type);
+ if (IS_ERR(dent)) {
+ ret = PTR_ERR(dent);
goto free;
}
@@ -1670,11 +1755,6 @@
sbi->s_mount_opts = ZONEFS_MNTOPT_ERRORS_RO;
sbi->s_max_open_zones = bdev_max_open_zones(sb->s_bdev);
atomic_set(&sbi->s_open_zones, 0);
- if (!sbi->s_max_open_zones &&
- sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
- zonefs_info(sb, "No open zones limit. Ignoring explicit_open mount option\n");
- sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
- }
ret = zonefs_read_super(sb);
if (ret)
@@ -1693,6 +1773,12 @@
zonefs_info(sb, "Mounting %u zones",
blkdev_nr_zones(sb->s_bdev->bd_disk));
+ if (!sbi->s_max_open_zones &&
+ sbi->s_mount_opts & ZONEFS_MNTOPT_EXPLICIT_OPEN) {
+ zonefs_info(sb, "No open zones limit. Ignoring explicit_open mount option\n");
+ sbi->s_mount_opts &= ~ZONEFS_MNTOPT_EXPLICIT_OPEN;
+ }
+
/* Create root directory inode */
ret = -ENOMEM;
inode = new_inode(sb);