diff options
Diffstat (limited to 'fs')
403 files changed, 5829 insertions, 3482 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 39def020a074..cdb99507ef33 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -583,7 +583,7 @@ static struct attribute *v9fs_attrs[] = { NULL, }; -static struct attribute_group v9fs_attr_group = { +static const struct attribute_group v9fs_attr_group = { .attrs = v9fs_attrs, }; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 649f04f112dc..59c32c9b799f 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -86,8 +86,8 @@ int v9fs_file_open(struct inode *inode, struct file *file) * to work. */ writeback_fid = v9fs_writeback_fid(file_dentry(file)); - if (IS_ERR(fid)) { - err = PTR_ERR(fid); + if (IS_ERR(writeback_fid)) { + err = PTR_ERR(writeback_fid); mutex_unlock(&v9inode->v_mutex); goto out_error; } diff --git a/fs/Kconfig b/fs/Kconfig index 97e7b77c9309..141a856c50e7 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -223,10 +223,13 @@ config TMPFS_INODE64 If unsure, say N. +config ARCH_SUPPORTS_HUGETLBFS + def_bool n + config HUGETLBFS bool "HugeTLB file system support" depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \ - SYS_SUPPORTS_HUGETLBFS || BROKEN + ARCH_SUPPORTS_HUGETLBFS || BROKEN help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read @@ -335,8 +338,8 @@ config NFS_COMMON default y config NFS_V4_2_SSC_HELPER - tristate - default y if NFS_V4=y || NFS_FS=y + bool + default y if NFS_V4_2 source "net/sunrpc/Kconfig" source "fs/ceph/Kconfig" diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index c6f1c8c1934e..06fb7a93a1bd 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -112,6 +112,9 @@ config BINFMT_FLAT_ARGVP_ENVP_ON_STACK config BINFMT_FLAT_OLD_ALWAYS_RAM bool +config BINFMT_FLAT_NO_DATA_START_OFFSET + bool + config BINFMT_FLAT_OLD bool "Enable support for very old legacy flat binaries" depends on BINFMT_FLAT diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index a4e9e6e07e93..d3c6bb22c5f4 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -322,6 +322,8 @@ static int afs_deliver_cb_callback(struct afs_call *call) return ret; call->unmarshall++; + fallthrough; + case 5: break; } @@ -418,6 +420,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) r->node[loop] = ntohl(b[loop + 5]); call->unmarshall++; + fallthrough; case 2: break; @@ -530,6 +533,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) r->node[loop] = ntohl(b[loop + 5]); call->unmarshall++; + fallthrough; case 2: break; @@ -663,6 +667,7 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call) afs_extract_to_tmp(call); call->unmarshall++; + fallthrough; case 3: break; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 117df15e5367..78719f2f567e 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -1419,6 +1419,7 @@ static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->file[0].update_ctime = true; op->dentry = dentry; op->create.mode = S_IFDIR | mode; @@ -1500,6 +1501,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->file[0].update_ctime = true; op->dentry = dentry; @@ -1636,6 +1638,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->file[0].update_ctime = true; /* Try to make sure we have a callback promise on the victim. */ @@ -1718,6 +1721,7 @@ static int afs_create(struct user_namespace *mnt_userns, struct inode *dir, afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->file[0].update_ctime = true; op->dentry = dentry; @@ -1792,6 +1796,7 @@ static int afs_link(struct dentry *from, struct inode *dir, afs_op_set_vnode(op, 0, dvnode); afs_op_set_vnode(op, 1, vnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->file[0].update_ctime = true; op->file[1].update_ctime = true; @@ -1914,7 +1919,9 @@ static void afs_rename_edit_dir(struct afs_operation *op) new_inode = d_inode(new_dentry); if (new_inode) { spin_lock(&new_inode->i_lock); - if (new_inode->i_nlink > 0) + if (S_ISDIR(new_inode->i_mode)) + clear_nlink(new_inode); + else if (new_inode->i_nlink > 0) drop_nlink(new_inode); spin_unlock(&new_inode->i_lock); } @@ -1987,6 +1994,8 @@ static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */ op->file[0].dv_delta = 1; op->file[1].dv_delta = 1; + op->file[0].modification = true; + op->file[1].modification = true; op->file[0].update_ctime = true; op->file[1].update_ctime = true; diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index 04f75a44f243..dae9a57d7ec0 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -73,6 +73,8 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode afs_op_set_vnode(op, 1, dvnode); op->file[0].dv_delta = 1; op->file[1].dv_delta = 1; + op->file[0].modification = true; + op->file[1].modification = true; op->file[0].update_ctime = true; op->file[1].update_ctime = true; @@ -201,6 +203,7 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode afs_op_set_vnode(op, 0, dvnode); afs_op_set_vnode(op, 1, vnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->file[0].update_ctime = true; op->file[1].op_unlinked = true; op->file[1].update_ctime = true; diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index 2cb0951acca6..d222dfbe976b 100644 --- a/fs/afs/fs_operation.c +++ b/fs/afs/fs_operation.c @@ -118,6 +118,8 @@ static void afs_prepare_vnode(struct afs_operation *op, struct afs_vnode_param * vp->cb_break_before = afs_calc_vnode_cb_break(vnode); if (vnode->lock_state != AFS_VNODE_LOCK_NONE) op->flags |= AFS_OPERATION_CUR_ONLY; + if (vp->modification) + set_bit(AFS_VNODE_MODIFYING, &vnode->flags); } if (vp->fid.vnode) @@ -225,6 +227,10 @@ int afs_put_operation(struct afs_operation *op) if (op->ops && op->ops->put) op->ops->put(op); + if (op->file[0].modification) + clear_bit(AFS_VNODE_MODIFYING, &op->file[0].vnode->flags); + if (op->file[1].modification && op->file[1].vnode != op->file[0].vnode) + clear_bit(AFS_VNODE_MODIFYING, &op->file[1].vnode->flags); if (op->file[0].put_vnode) iput(&op->file[0].vnode->vfs_inode); if (op->file[1].put_vnode) diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 2f695a260442..dd3f45d906d2 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -388,6 +388,7 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) req->file_size = vp->scb.status.size; call->unmarshall++; + fallthrough; case 5: break; @@ -1408,6 +1409,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) _debug("motd '%s'", p); call->unmarshall++; + fallthrough; case 8: break; @@ -1845,6 +1847,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) xdr_decode_AFSVolSync(&bp, &op->volsync); call->unmarshall++; + fallthrough; case 6: break; @@ -1979,6 +1982,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call) xdr_decode_AFSVolSync(&bp, &op->volsync); call->unmarshall++; + fallthrough; case 4: break; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 3a129b9fd9b8..80b6c8d967d5 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -294,8 +294,9 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v op->flags &= ~AFS_OPERATION_DIR_CONFLICT; } } else if (vp->scb.have_status) { - if (vp->dv_before + vp->dv_delta != vp->scb.status.data_version && - vp->speculative) + if (vp->speculative && + (test_bit(AFS_VNODE_MODIFYING, &vnode->flags) || + vp->dv_before != vnode->status.data_version)) /* Ignore the result of a speculative bulk status fetch * if it splits around a modification op, thereby * appearing to regress the data version. @@ -911,6 +912,7 @@ int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } op->ctime = attr->ia_ctime; op->file[0].update_ctime = 1; + op->file[0].modification = true; op->ops = &afs_setattr_operation; ret = afs_do_sync_operation(op); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 52157a05796a..5ed416f4ff33 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -645,6 +645,7 @@ struct afs_vnode { #define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */ #define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */ #define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */ +#define AFS_VNODE_MODIFYING 10 /* Set if we're performing a modification op */ struct list_head wb_keys; /* List of keys available for writeback */ struct list_head pending_locks; /* locks waiting to be granted */ @@ -762,6 +763,7 @@ struct afs_vnode_param { bool set_size:1; /* Must update i_size */ bool op_unlinked:1; /* True if file was unlinked by op */ bool speculative:1; /* T if speculative status fetch (no vnode lock) */ + bool modification:1; /* Set if the content gets modified */ }; /* diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index dc9327332f06..00fca3c66ba6 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -593,6 +593,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) if (ret < 0) return ret; call->unmarshall = 6; + fallthrough; case 6: break; diff --git a/fs/afs/write.c b/fs/afs/write.c index dc66ff15dd16..3edb6204b937 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -377,6 +377,7 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t afs_op_set_vnode(op, 0, vnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->store.write_iter = iter; op->store.pos = pos; op->store.size = size; @@ -323,16 +323,13 @@ static void aio_free_ring(struct kioctx *ctx) } } -static int aio_ring_mremap(struct vm_area_struct *vma, unsigned long flags) +static int aio_ring_mremap(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; struct kioctx_table *table; int i, res = -EINVAL; - if (flags & MREMAP_DONTUNMAP) - return -EINVAL; - spin_lock(&mm->ioctx_lock); rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 054f97b07754..918826eaceea 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -87,6 +87,7 @@ struct autofs_wait_queue { autofs_wqt_t wait_queue_token; /* We use the following to see what we are waiting for */ struct qstr name; + u32 offset; u32 dev; u64 ino; kuid_t uid; diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index a1c7701007e7..b3fefd6237c3 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -355,7 +355,7 @@ static struct dentry *should_expire(struct dentry *dentry, return NULL; } - if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { + if (d_is_symlink(dentry)) { pr_debug("checking symlink %p %pd\n", dentry, dentry); /* Forced expire, user space handles busy mounts */ diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c index 5ced859dac53..16b5fca0626e 100644 --- a/fs/autofs/waitq.c +++ b/fs/autofs/waitq.c @@ -30,7 +30,7 @@ void autofs_catatonic_mode(struct autofs_sb_info *sbi) while (wq) { nwq = wq->next; wq->status = -ENOENT; /* Magic is gone - report failure */ - kfree(wq->name.name); + kfree(wq->name.name - wq->offset); wq->name.name = NULL; wq->wait_ctr--; wake_up_interruptible(&wq->queue); @@ -175,51 +175,6 @@ static void autofs_notify_daemon(struct autofs_sb_info *sbi, fput(pipe); } -static int autofs_getpath(struct autofs_sb_info *sbi, - struct dentry *dentry, char *name) -{ - struct dentry *root = sbi->sb->s_root; - struct dentry *tmp; - char *buf; - char *p; - int len; - unsigned seq; - -rename_retry: - buf = name; - len = 0; - - seq = read_seqbegin(&rename_lock); - rcu_read_lock(); - spin_lock(&sbi->fs_lock); - for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) - len += tmp->d_name.len + 1; - - if (!len || --len > NAME_MAX) { - spin_unlock(&sbi->fs_lock); - rcu_read_unlock(); - if (read_seqretry(&rename_lock, seq)) - goto rename_retry; - return 0; - } - - *(buf + len) = '\0'; - p = buf + len - dentry->d_name.len; - strncpy(p, dentry->d_name.name, dentry->d_name.len); - - for (tmp = dentry->d_parent; tmp != root ; tmp = tmp->d_parent) { - *(--p) = '/'; - p -= tmp->d_name.len; - strncpy(p, tmp->d_name.name, tmp->d_name.len); - } - spin_unlock(&sbi->fs_lock); - rcu_read_unlock(); - if (read_seqretry(&rename_lock, seq)) - goto rename_retry; - - return len; -} - static struct autofs_wait_queue * autofs_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr) { @@ -352,6 +307,7 @@ int autofs_wait(struct autofs_sb_info *sbi, struct qstr qstr; char *name; int status, ret, type; + unsigned int offset = 0; pid_t pid; pid_t tgid; @@ -389,20 +345,23 @@ int autofs_wait(struct autofs_sb_info *sbi, return -ENOMEM; /* If this is a direct mount request create a dummy name */ - if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) + if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) { + qstr.name = name; qstr.len = sprintf(name, "%p", dentry); - else { - qstr.len = autofs_getpath(sbi, dentry, name); - if (!qstr.len) { + } else { + char *p = dentry_path_raw(dentry, name, NAME_MAX); + if (IS_ERR(p)) { kfree(name); return -ENOENT; } + qstr.name = ++p; // skip the leading slash + qstr.len = strlen(p); + offset = p - name; } - qstr.name = name; qstr.hash = full_name_hash(dentry, name, qstr.len); if (mutex_lock_interruptible(&sbi->wq_mutex)) { - kfree(qstr.name); + kfree(name); return -EINTR; } @@ -410,7 +369,7 @@ int autofs_wait(struct autofs_sb_info *sbi, if (ret <= 0) { if (ret != -EINTR) mutex_unlock(&sbi->wq_mutex); - kfree(qstr.name); + kfree(name); return ret; } @@ -418,7 +377,7 @@ int autofs_wait(struct autofs_sb_info *sbi, /* Create a new wait queue */ wq = kmalloc(sizeof(struct autofs_wait_queue), GFP_KERNEL); if (!wq) { - kfree(qstr.name); + kfree(name); mutex_unlock(&sbi->wq_mutex); return -ENOMEM; } @@ -430,6 +389,7 @@ int autofs_wait(struct autofs_sb_info *sbi, sbi->queues = wq; init_waitqueue_head(&wq->queue); memcpy(&wq->name, &qstr, sizeof(struct qstr)); + wq->offset = offset; wq->dev = autofs_get_dev(sbi); wq->ino = autofs_get_ino(sbi); wq->uid = current_uid(); @@ -469,7 +429,7 @@ int autofs_wait(struct autofs_sb_info *sbi, (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, notify); mutex_unlock(&sbi->wq_mutex); - kfree(qstr.name); + kfree(name); } /* @@ -540,7 +500,7 @@ int autofs_wait_release(struct autofs_sb_info *sbi, } *wql = wq->next; /* Unlink from chain */ - kfree(wq->name.name); + kfree(wq->name.name - wq->offset); wq->name.name = NULL; /* Do not wait on this queue */ wq->status = status; wake_up(&wq->queue); diff --git a/fs/befs/TODO b/fs/befs/TODO deleted file mode 100644 index 3250921aa2e6..000000000000 --- a/fs/befs/TODO +++ /dev/null @@ -1,14 +0,0 @@ -TODO -========== - -* Convert comments to the Kernel-Doc format. - -* Befs_fs.h has gotten big and messy. No reason not to break it up into - smaller peices. - -* See if Alexander Viro's option parser made it into the kernel tree. - Use that if we can. (include/linux/parser.h) - -* See if we really need separate types for on-disk and in-memory - representations of the superblock and inode. - diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index b9c658e0548e..a1072c6a2341 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -74,6 +74,12 @@ #define MAX_SHARED_LIBS (1) #endif +#ifdef CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET +#define DATA_START_OFFSET_WORDS (0) +#else +#define DATA_START_OFFSET_WORDS (MAX_SHARED_LIBS) +#endif + struct lib_info { struct { unsigned long start_code; /* Start of text segment */ @@ -576,7 +582,8 @@ static int load_flat_file(struct linux_binprm *bprm, goto err; } - len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); + len = data_len + extra + + DATA_START_OFFSET_WORDS * sizeof(unsigned long); len = PAGE_ALIGN(len); realdatastart = vm_mmap(NULL, 0, len, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); @@ -591,7 +598,7 @@ static int load_flat_file(struct linux_binprm *bprm, goto err; } datapos = ALIGN(realdatastart + - MAX_SHARED_LIBS * sizeof(unsigned long), + DATA_START_OFFSET_WORDS * sizeof(unsigned long), FLAT_DATA_ALIGN); pr_debug("Allocated data+bss+stack (%u bytes): %lx\n", @@ -622,7 +629,8 @@ static int load_flat_file(struct linux_binprm *bprm, memp_size = len; } else { - len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(u32); + len = text_len + data_len + extra + + DATA_START_OFFSET_WORDS * sizeof(u32); len = PAGE_ALIGN(len); textpos = vm_mmap(NULL, 0, len, PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); @@ -638,7 +646,7 @@ static int load_flat_file(struct linux_binprm *bprm, realdatastart = textpos + ntohl(hdr->data_start); datapos = ALIGN(realdatastart + - MAX_SHARED_LIBS * sizeof(u32), + DATA_START_OFFSET_WORDS * sizeof(u32), FLAT_DATA_ALIGN); reloc = (__be32 __user *) @@ -714,7 +722,7 @@ static int load_flat_file(struct linux_binprm *bprm, ret = result; pr_err("Unable to read code+data+bss, errno %d\n", ret); vm_munmap(textpos, text_len + data_len + extra + - MAX_SHARED_LIBS * sizeof(u32)); + DATA_START_OFFSET_WORDS * sizeof(u32)); goto err; } } diff --git a/fs/block_dev.c b/fs/block_dev.c index a5244e08b6c8..6cc4d4cfe0c2 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -79,7 +79,7 @@ static void kill_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; - if (mapping->nrpages == 0 && mapping->nrexceptional == 0) + if (mapping_empty(mapping)) return; invalidate_bh_lrus(); @@ -1244,6 +1244,9 @@ int bdev_disk_changed(struct block_device *bdev, bool invalidate) lockdep_assert_held(&bdev->bd_mutex); + if (!(disk->flags & GENHD_FL_UP)) + return -ENXIO; + rescan: if (bdev->bd_part_count) return -EBUSY; @@ -1298,6 +1301,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) struct gendisk *disk = bdev->bd_disk; int ret = 0; + if (!(disk->flags & GENHD_FL_UP)) + return -ENXIO; + if (!bdev->bd_openers) { if (!bdev_is_partition(bdev)) { ret = 0; @@ -1332,8 +1338,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) whole->bd_part_count++; mutex_unlock(&whole->bd_mutex); - if (!(disk->flags & GENHD_FL_UP) || - !bdev_nr_sectors(bdev)) { + if (!bdev_nr_sectors(bdev)) { __blkdev_put(whole, mode, 1); bdput(whole); return -ENXIO; @@ -1364,16 +1369,12 @@ struct block_device *blkdev_get_no_open(dev_t dev) struct block_device *bdev; struct gendisk *disk; - down_read(&bdev_lookup_sem); bdev = bdget(dev); if (!bdev) { - up_read(&bdev_lookup_sem); blk_request_module(dev); - down_read(&bdev_lookup_sem); - bdev = bdget(dev); if (!bdev) - goto unlock; + return NULL; } disk = bdev->bd_disk; @@ -1383,14 +1384,11 @@ struct block_device *blkdev_get_no_open(dev_t dev) goto put_disk; if (!try_module_get(bdev->bd_disk->fops->owner)) goto put_disk; - up_read(&bdev_lookup_sem); return bdev; put_disk: put_disk(disk); bdput: bdput(bdev); -unlock: - up_read(&bdev_lookup_sem); return NULL; } @@ -1677,6 +1675,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); struct blk_plug plug; + size_t shorted = 0; ssize_t ret; if (bdev_read_only(I_BDEV(bd_inode))) @@ -1694,12 +1693,17 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) return -EOPNOTSUPP; - iov_iter_truncate(from, size - iocb->ki_pos); + size -= iocb->ki_pos; + if (iov_iter_count(from) > size) { + shorted = iov_iter_count(from) - size; + iov_iter_truncate(from, size); + } blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); if (ret > 0) ret = generic_write_sync(iocb, ret); + iov_iter_reexpand(from, iov_iter_count(from) + shorted); blk_finish_plug(&plug); return ret; } @@ -1711,13 +1715,21 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); loff_t pos = iocb->ki_pos; + size_t shorted = 0; + ssize_t ret; if (pos >= size) return 0; size -= pos; - iov_iter_truncate(to, size); - return generic_file_read_iter(iocb, to); + if (iov_iter_count(to) > size) { + shorted = iov_iter_count(to) - size; + iov_iter_truncate(to, size); + } + + ret = generic_file_read_iter(iocb, to); + iov_iter_reexpand(to, iov_iter_count(to) + shorted); + return ret; } EXPORT_SYMBOL_GPL(blkdev_read_iter); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 17f93fd28f7e..1346d698463a 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -28,6 +28,7 @@ #include "compression.h" #include "extent_io.h" #include "extent_map.h" +#include "zoned.h" static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; @@ -349,6 +350,7 @@ static void end_compressed_bio_write(struct bio *bio) */ inode = cb->inode; cb->compressed_pages[0]->mapping = cb->inode->i_mapping; + btrfs_record_physical_zoned(inode, cb->start, bio); btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0], cb->start, cb->start + cb->len - 1, bio->bi_status == BLK_STS_OK); @@ -401,6 +403,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, u64 first_byte = disk_start; blk_status_t ret; int skip_sum = inode->flags & BTRFS_INODE_NODATASUM; + const bool use_append = btrfs_use_zone_append(inode, disk_start); + const unsigned int bio_op = use_append ? REQ_OP_ZONE_APPEND : REQ_OP_WRITE; WARN_ON(!PAGE_ALIGNED(start)); cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); @@ -418,10 +422,31 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, cb->nr_pages = nr_pages; bio = btrfs_bio_alloc(first_byte); - bio->bi_opf = REQ_OP_WRITE | write_flags; + bio->bi_opf = bio_op | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; + if (use_append) { + struct extent_map *em; + struct map_lookup *map; + struct block_device *bdev; + + em = btrfs_get_chunk_map(fs_info, disk_start, PAGE_SIZE); + if (IS_ERR(em)) { + kfree(cb); + bio_put(bio); + return BLK_STS_NOTSUPP; + } + + map = em->map_lookup; + /* We only support single profile for now */ + ASSERT(map->num_stripes == 1); + bdev = map->stripes[0].dev->bdev; + + bio_set_dev(bio, bdev); + free_extent_map(em); + } + if (blkcg_css) { bio->bi_opf |= REQ_CGROUP_PUNT; kthread_associate_blkcg(blkcg_css); @@ -432,6 +457,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, bytes_left = compressed_len; for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { int submit = 0; + int len = 0; page = compressed_pages[pg_index]; page->mapping = inode->vfs_inode.i_mapping; @@ -439,9 +465,20 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio, 0); + /* + * Page can only be added to bio if the current bio fits in + * stripe. + */ + if (!submit) { + if (pg_index == 0 && use_append) + len = bio_add_zone_append_page(bio, page, + PAGE_SIZE, 0); + else + len = bio_add_page(bio, page, PAGE_SIZE, 0); + } + page->mapping = NULL; - if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) < - PAGE_SIZE) { + if (submit || len < PAGE_SIZE) { /* * inc the count before we submit the bio so * we know the end IO handler won't happen before @@ -465,11 +502,15 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, } bio = btrfs_bio_alloc(first_byte); - bio->bi_opf = REQ_OP_WRITE | write_flags; + bio->bi_opf = bio_op | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; if (blkcg_css) bio->bi_opf |= REQ_CGROUP_PUNT; + /* + * Use bio_add_page() to ensure the bio has at least one + * page. + */ bio_add_page(bio, page, PAGE_SIZE, 0); } if (bytes_left < PAGE_SIZE) { @@ -591,16 +632,13 @@ static noinline int add_ra_bio_pages(struct inode *inode, free_extent_map(em); if (page->index == end_index) { - char *userpage; size_t zero_offset = offset_in_page(isize); if (zero_offset) { int zeros; zeros = PAGE_SIZE - zero_offset; - userpage = kmap_atomic(page); - memset(userpage + zero_offset, 0, zeros); + memzero_page(page, zero_offset, zeros); flush_dcache_page(page); - kunmap_atomic(userpage); } } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index f83fd3cbf243..9fb76829a281 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3127,7 +3127,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 new_size, u32 min_type); -int btrfs_start_delalloc_snapshot(struct btrfs_root *root); +int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, bool in_reclaim_context); int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 7a28314189b4..3d5c35e4cb76 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1340,12 +1340,16 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, stripe = bbio->stripes; for (i = 0; i < bbio->num_stripes; i++, stripe++) { u64 bytes; + struct btrfs_device *device = stripe->dev; - if (!stripe->dev->bdev) { + if (!device->bdev) { ASSERT(btrfs_test_opt(fs_info, DEGRADED)); continue; } + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) + continue; + ret = do_discard_extent(stripe, &bytes); if (!ret) { discarded_bytes += bytes; @@ -1864,7 +1868,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, trace_run_delayed_ref_head(fs_info, head, 0); btrfs_delayed_ref_unlock(head); btrfs_put_delayed_ref_head(head); - return 0; + return ret; } static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index f2d1bb234377..dee2dafbc872 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3421,15 +3421,12 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } if (page->index == last_byte >> PAGE_SHIFT) { - char *userpage; size_t zero_offset = offset_in_page(last_byte); if (zero_offset) { iosize = PAGE_SIZE - zero_offset; - userpage = kmap_atomic(page); - memset(userpage + zero_offset, 0, iosize); + memzero_page(page, zero_offset, iosize); flush_dcache_page(page); - kunmap_atomic(userpage); } } begin_page_read(fs_info, page); @@ -3438,14 +3435,11 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, u64 disk_bytenr; if (cur >= last_byte) { - char *userpage; struct extent_state *cached = NULL; iosize = PAGE_SIZE - pg_offset; - userpage = kmap_atomic(page); - memset(userpage + pg_offset, 0, iosize); + memzero_page(page, pg_offset, iosize); flush_dcache_page(page); - kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); unlock_extent_cached(tree, cur, @@ -3528,13 +3522,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { - char *userpage; struct extent_state *cached = NULL; - userpage = kmap_atomic(page); - memset(userpage + pg_offset, 0, iosize); + memzero_page(page, pg_offset, iosize); flush_dcache_page(page); - kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); @@ -3762,7 +3753,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, /* Note that em_end from extent_map_end() is exclusive */ iosize = min(em_end, end + 1) - cur; - if (btrfs_use_zone_append(inode, em)) + if (btrfs_use_zone_append(inode, em->block_start)) opf = REQ_OP_ZONE_APPEND; free_extent_map(em); @@ -3845,12 +3836,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, } if (page->index == end_index) { - char *userpage; - - userpage = kmap_atomic(page); - memset(userpage + pg_offset, 0, - PAGE_SIZE - pg_offset); - kunmap_atomic(userpage); + memzero_page(page, pg_offset, PAGE_SIZE - pg_offset); flush_dcache_page(page); } @@ -5210,7 +5196,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { int ret = 0; - u64 off = start; + u64 off; u64 max = start + len; u32 flags = 0; u32 found_type; @@ -5245,6 +5231,11 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, goto out_free_ulist; } + /* + * We can't initialize that to 'start' as this could miss extents due + * to extent item merging + */ + off = 0; start = round_down(start, btrfs_inode_sectorsize(inode)); len = round_up(max, btrfs_inode_sectorsize(inode)) - start; diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 294602f139ef..441cee7fbb62 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -788,7 +788,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, u64 end_byte = bytenr + len; u64 csum_end; struct extent_buffer *leaf; - int ret; + int ret = 0; const u32 csum_size = fs_info->csum_size; u32 blocksize_bits = fs_info->sectorsize_bits; @@ -806,6 +806,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { + ret = 0; if (path->slots[0] == 0) break; path->slots[0]--; @@ -862,7 +863,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, root, path, path->slots[0], del_nr); if (ret) - goto out; + break; if (key.offset == bytenr) break; } else if (key.offset < bytenr && csum_end > end_byte) { @@ -906,8 +907,9 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, ret = btrfs_split_item(trans, root, path, &key, offset); if (ret && ret != -EAGAIN) { btrfs_abort_transaction(trans, ret); - goto out; + break; } + ret = 0; key.offset = end_byte - 1; } else { @@ -917,12 +919,41 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, } btrfs_release_path(path); } - ret = 0; -out: btrfs_free_path(path); return ret; } +static int find_next_csum_offset(struct btrfs_root *root, + struct btrfs_path *path, + u64 *next_offset) +{ + const u32 nritems = btrfs_header_nritems(path->nodes[0]); + struct btrfs_key found_key; + int slot = path->slots[0] + 1; + int ret; + + if (nritems == 0 || slot >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + return ret; + } else if (ret > 0) { + *next_offset = (u64)-1; + return 0; + } + slot = path->slots[0]; + } + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); + + if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + found_key.type != BTRFS_EXTENT_CSUM_KEY) + *next_offset = (u64)-1; + else + *next_offset = found_key.offset; + + return 0; +} + int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums) @@ -938,7 +969,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, u64 total_bytes = 0; u64 csum_offset; u64 bytenr; - u32 nritems; u32 ins_size; int index = 0; int found_next; @@ -981,26 +1011,10 @@ again: goto insert; } } else { - int slot = path->slots[0] + 1; - /* we didn't find a csum item, insert one */ - nritems = btrfs_header_nritems(path->nodes[0]); - if (!nritems || (path->slots[0] >= nritems - 1)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - goto out; - } else if (ret > 0) { - found_next = 1; - goto insert; - } - slot = path->slots[0]; - } - btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); - if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || - found_key.type != BTRFS_EXTENT_CSUM_KEY) { - found_next = 1; - goto insert; - } - next_offset = found_key.offset; + /* We didn't find a csum item, insert one. */ + ret = find_next_csum_offset(root, path, &next_offset); + if (ret < 0) + goto out; found_next = 1; goto insert; } @@ -1056,8 +1070,48 @@ extend_csum: tmp = sums->len - total_bytes; tmp >>= fs_info->sectorsize_bits; WARN_ON(tmp < 1); + extend_nr = max_t(int, 1, tmp); + + /* + * A log tree can already have checksum items with a subset of + * the checksums we are trying to log. This can happen after + * doing a sequence of partial writes into prealloc extents and + * fsyncs in between, with a full fsync logging a larger subrange + * of an extent for which a previous fast fsync logged a smaller + * subrange. And this happens in particular due to merging file + * extent items when we complete an ordered extent for a range + * covered by a prealloc extent - this is done at + * btrfs_mark_extent_written(). + * + * So if we try to extend the previous checksum item, which has + * a range that ends at the start of the range we want to insert, + * make sure we don't extend beyond the start offset of the next + * checksum item. If we are at the last item in the leaf, then + * forget the optimization of extending and add a new checksum + * item - it is not worth the complexity of releasing the path, + * getting the first key for the next leaf, repeat the btree + * search, etc, because log trees are temporary anyway and it + * would only save a few bytes of leaf space. + */ + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + if (path->slots[0] + 1 >= + btrfs_header_nritems(path->nodes[0])) { + ret = find_next_csum_offset(root, path, &next_offset); + if (ret < 0) + goto out; + found_next = 1; + goto insert; + } + + ret = find_next_csum_offset(root, path, &next_offset); + if (ret < 0) + goto out; + + tmp = (next_offset - bytenr) >> fs_info->sectorsize_bits; + if (tmp <= INT_MAX) + extend_nr = min_t(int, extend_nr, tmp); + } - extend_nr = max_t(int, 1, (int)tmp); diff = (csum_offset + extend_nr) * csum_size; diff = min(diff, MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 864c08d08a35..3b10d98b4ebb 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2067,6 +2067,30 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) return ret; } +static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) +{ + struct btrfs_inode *inode = BTRFS_I(ctx->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + if (btrfs_inode_in_log(inode, fs_info->generation) && + list_empty(&ctx->ordered_extents)) + return true; + + /* + * If we are doing a fast fsync we can not bail out if the inode's + * last_trans is <= then the last committed transaction, because we only + * update the last_trans of the inode during ordered extent completion, + * and for a fast fsync we don't wait for that, we only wait for the + * writeback to complete. + */ + if (inode->last_trans <= fs_info->last_trans_committed && + (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) || + list_empty(&ctx->ordered_extents))) + return true; + + return false; +} + /* * fsync call for both files and directories. This logs the inode into * the tree log instead of forcing full commits whenever possible. @@ -2185,17 +2209,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); - /* - * If we are doing a fast fsync we can not bail out if the inode's - * last_trans is <= then the last committed transaction, because we only - * update the last_trans of the inode during ordered extent completion, - * and for a fast fsync we don't wait for that, we only wait for the - * writeback to complete. - */ smp_mb(); - if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) || - (BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed && - (full_sync || list_empty(&ctx.ordered_extents)))) { + if (skip_inode_logging(&ctx)) { /* * We've had everything committed since the last time we were * modified so clear this flag in case it was set for whatever diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index e54466fc101f..4806295116d8 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -3949,7 +3949,7 @@ static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info, { struct btrfs_block_group *block_group; struct rb_node *node; - int ret; + int ret = 0; btrfs_info(fs_info, "cleaning free space cache v1"); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index b21d491b3adc..46f392943f4d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -646,17 +646,12 @@ again: if (!ret) { unsigned long offset = offset_in_page(total_compressed); struct page *page = pages[nr_pages - 1]; - char *kaddr; /* zero the tail end of the last page, we might be * sending it down to disk */ - if (offset) { - kaddr = kmap_atomic(page); - memset(kaddr + offset, 0, - PAGE_SIZE - offset); - kunmap_atomic(kaddr); - } + if (offset) + memzero_page(page, offset, PAGE_SIZE - offset); will_compress = 1; } } @@ -3005,6 +3000,18 @@ out: if (ret || truncated) { u64 unwritten_start = start; + /* + * If we failed to finish this ordered extent for any reason we + * need to make sure BTRFS_ORDERED_IOERR is set on the ordered + * extent, and mark the inode with the error if it wasn't + * already set. Any error during writeback would have already + * set the mapping error, so we need to set it if we're the ones + * marking this ordered extent as failed. + */ + if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR, + &ordered_extent->flags)) + mapping_set_error(ordered_extent->inode->i_mapping, -EIO); + if (truncated) unwritten_start += logical_len; clear_extent_uptodate(io_tree, unwritten_start, end, NULL); @@ -3246,6 +3253,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) inode = list_first_entry(&fs_info->delayed_iputs, struct btrfs_inode, delayed_iput); run_delayed_iput_locked(fs_info, inode); + cond_resched_lock(&fs_info->delayed_iput_lock); } spin_unlock(&fs_info->delayed_iput_lock); } @@ -4833,7 +4841,6 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; - char *kaddr; bool only_release_metadata = false; u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; @@ -4925,15 +4932,13 @@ again: if (offset != blocksize) { if (!len) len = blocksize - offset; - kaddr = kmap(page); if (front) - memset(kaddr + (block_start - page_offset(page)), - 0, offset); + memzero_page(page, (block_start - page_offset(page)), + offset); else - memset(kaddr + (block_start - page_offset(page)) + offset, - 0, len); + memzero_page(page, (block_start - page_offset(page)) + offset, + len); flush_dcache_page(page); - kunmap(page); } ClearPageChecked(page); set_page_dirty(page); @@ -6832,11 +6837,9 @@ static noinline int uncompress_inline(struct btrfs_path *path, * cover that region here. */ - if (max_size + pg_offset < PAGE_SIZE) { - char *map = kmap(page); - memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - pg_offset); - kunmap(page); - } + if (max_size + pg_offset < PAGE_SIZE) + memzero_page(page, pg_offset + max_size, + PAGE_SIZE - max_size - pg_offset); kfree(tmp); return ret; } @@ -7795,7 +7798,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->bdev = fs_info->fs_devices->latest_bdev; iomap->length = len; - if (write && btrfs_use_zone_append(BTRFS_I(inode), em)) + if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start)) iomap->flags |= IOMAP_F_ZONE_APPEND; free_extent_map(em); @@ -8506,7 +8509,6 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; - char *kaddr; unsigned long zero_start; loff_t size; vm_fault_t ret; @@ -8620,10 +8622,8 @@ again: zero_start = PAGE_SIZE; if (zero_start != PAGE_SIZE) { - kaddr = kmap(page); - memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start); + memzero_page(page, zero_start, PAGE_SIZE - zero_start); flush_dcache_page(page); - kunmap(page); } ClearPageChecked(page); set_page_dirty(page); @@ -9088,6 +9088,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, int ret2; bool root_log_pinned = false; bool dest_log_pinned = false; + bool need_abort = false; /* we only allow rename subvolume link between subvolumes */ if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) @@ -9147,6 +9148,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, old_idx); if (ret) goto out_fail; + need_abort = true; } /* And now for the dest. */ @@ -9162,8 +9164,11 @@ static int btrfs_rename_exchange(struct inode *old_dir, new_ino, btrfs_ino(BTRFS_I(old_dir)), new_idx); - if (ret) + if (ret) { + if (need_abort) + btrfs_abort_transaction(trans, ret); goto out_fail; + } } /* Update inode version and ctime/mtime. */ @@ -9691,7 +9696,7 @@ out: return ret; } -int btrfs_start_delalloc_snapshot(struct btrfs_root *root) +int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context) { struct writeback_control wbc = { .nr_to_write = LONG_MAX, @@ -9704,7 +9709,7 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root) if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; - return start_delalloc_inodes(root, &wbc, true, false); + return start_delalloc_inodes(root, &wbc, true, in_reclaim_context); } int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index ee1dbabb5d3c..5dc2fd843ae3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -259,6 +259,8 @@ int btrfs_fileattr_set(struct user_namespace *mnt_userns, if (!fa->flags_valid) { /* 1 item for the inode */ trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); goto update_flags; } @@ -907,7 +909,7 @@ static noinline int btrfs_mksnapshot(const struct path *parent, */ btrfs_drew_read_lock(&root->snapshot_lock); - ret = btrfs_start_delalloc_snapshot(root); + ret = btrfs_start_delalloc_snapshot(root, false); if (ret) goto out; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 07b0b4218791..6c413bb451a3 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -984,7 +984,7 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, if (pre) ret = clone_ordered_extent(ordered, 0, pre); - if (post) + if (ret == 0 && post) ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes, post); diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 2319c923c9e6..3ded812f522c 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -3545,11 +3545,15 @@ static int try_flush_qgroup(struct btrfs_root *root) struct btrfs_trans_handle *trans; int ret; - /* Can't hold an open transaction or we run the risk of deadlocking */ - ASSERT(current->journal_info == NULL || - current->journal_info == BTRFS_SEND_TRANS_STUB); - if (WARN_ON(current->journal_info && - current->journal_info != BTRFS_SEND_TRANS_STUB)) + /* + * Can't hold an open transaction or we run the risk of deadlocking, + * and can't either be under the context of a send operation (where + * current->journal_info is set to BTRFS_SEND_TRANS_STUB), as that + * would result in a crash when starting a transaction and does not + * make sense either (send is a read-only operation). + */ + ASSERT(current->journal_info == NULL); + if (WARN_ON(current->journal_info)) return 0; /* @@ -3562,7 +3566,7 @@ static int try_flush_qgroup(struct btrfs_root *root) return 0; } - ret = btrfs_start_delalloc_snapshot(root); + ret = btrfs_start_delalloc_snapshot(root, true); if (ret < 0) goto out; btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index f4ec06b53aa0..9178da07cc9c 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -129,12 +129,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode, * So what's in the range [500, 4095] corresponds to zeroes. */ if (datal < block_size) { - char *map; - - map = kmap(page); - memset(map + datal, 0, block_size - datal); + memzero_page(page, datal, block_size - datal); flush_dcache_page(page); - kunmap(page); } SetPageUptodate(page); @@ -207,10 +203,7 @@ static int clone_copy_inline_extent(struct inode *dst, * inline extent's data to the page. */ ASSERT(key.offset > 0); - ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, - inline_data, size, datal, - comp_type); - goto out; + goto copy_to_page; } } else if (i_size_read(dst) <= datal) { struct btrfs_file_extent_item *ei; @@ -226,13 +219,10 @@ static int clone_copy_inline_extent(struct inode *dst, BTRFS_FILE_EXTENT_INLINE) goto copy_inline_extent; - ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, - inline_data, size, datal, comp_type); - goto out; + goto copy_to_page; } copy_inline_extent: - ret = 0; /* * We have no extent items, or we have an extent at offset 0 which may * or may not be inlined. All these cases are dealt the same way. @@ -244,11 +234,13 @@ copy_inline_extent: * clone. Deal with all these cases by copying the inline extent * data into the respective page at the destination inode. */ - ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, - inline_data, size, datal, comp_type); - goto out; + goto copy_to_page; } + /* + * Release path before starting a new transaction so we don't hold locks + * that would confuse lockdep. + */ btrfs_release_path(path); /* * If we end up here it means were copy the inline extent into a leaf @@ -305,6 +297,21 @@ out: *trans_out = trans; return ret; + +copy_to_page: + /* + * Release our path because we don't need it anymore and also because + * copy_inline_to_page() needs to reserve data and metadata, which may + * need to flush delalloc when we are low on available space and + * therefore cause a deadlock if writeback of an inline extent needs to + * write to the same leaf or an ordered extent completion needs to write + * to the same leaf. + */ + btrfs_release_path(path); + + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + inline_data, size, datal, comp_type); + goto out; } /** diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 55741adf9071..bd69db72acc5 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -7170,7 +7170,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx) int i; if (root) { - ret = btrfs_start_delalloc_snapshot(root); + ret = btrfs_start_delalloc_snapshot(root, false); if (ret) return ret; btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); @@ -7178,7 +7178,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx) for (i = 0; i < sctx->clone_roots_cnt; i++) { root = sctx->clone_roots[i].root; - ret = btrfs_start_delalloc_snapshot(root); + ret = btrfs_start_delalloc_snapshot(root, false); if (ret) return ret; btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index f67721d82e5d..362d14db1e38 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1574,7 +1574,9 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; - btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + if (ret) + goto out; } ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; @@ -1749,7 +1751,9 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (nlink != inode->i_nlink) { set_nlink(inode, nlink); - btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + if (ret) + goto out; } BTRFS_I(inode)->index_cnt = (u64)-1; @@ -1787,6 +1791,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, break; if (ret == 1) { + ret = 0; if (path->slots[0] == 0) break; path->slots[0]--; @@ -1799,17 +1804,19 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, ret = btrfs_del_item(trans, root, path); if (ret) - goto out; + break; btrfs_release_path(path); inode = read_one_inode(root, key.offset); - if (!inode) - return -EIO; + if (!inode) { + ret = -EIO; + break; + } ret = fixup_inode_link_count(trans, root, inode); iput(inode); if (ret) - goto out; + break; /* * fixup on a directory may create new entries, @@ -1818,8 +1825,6 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, */ key.offset = (u64)-1; } - ret = 0; -out: btrfs_release_path(path); return ret; } @@ -1858,8 +1863,6 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); } else if (ret == -EEXIST) { ret = 0; - } else { - BUG(); /* Logic Error */ } iput(inode); @@ -6061,7 +6064,8 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, * (since logging them is pointless, a link count of 0 means they * will never be accessible). */ - if (btrfs_inode_in_log(inode, trans->transid) || + if ((btrfs_inode_in_log(inode, trans->transid) && + list_empty(&ctx->ordered_extents)) || inode->vfs_inode.i_nlink == 0) { ret = BTRFS_NO_LOG_SYNC; goto end_no_trans; @@ -6462,6 +6466,24 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, (!old_dir || old_dir->logged_trans < trans->transid)) return; + /* + * If we are doing a rename (old_dir is not NULL) from a directory that + * was previously logged, make sure the next log attempt on the directory + * is not skipped and logs the inode again. This is because the log may + * not currently be authoritative for a range including the old + * BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY keys, so we want to make + * sure after a log replay we do not end up with both the new and old + * dentries around (in case the inode is a directory we would have a + * directory with two hard links and 2 inode references for different + * parents). The next log attempt of old_dir will happen at + * btrfs_log_all_parents(), called through btrfs_log_inode_parent() + * below, because we have previously set inode->last_unlink_trans to the + * current transaction ID, either here or at btrfs_record_unlink_dir() in + * case inode is a directory. + */ + if (old_dir) + old_dir->logged_trans = 0; + btrfs_init_log_ctx(&ctx, &inode->vfs_inode); ctx.logging_new_name = true; /* diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 9a1ead0c4a31..47d27059d064 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1459,7 +1459,7 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device, /* Given hole range was invalid (outside of device) */ if (ret == -ERANGE) { *hole_start += *hole_size; - *hole_size = false; + *hole_size = 0; return true; } diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index d524acf7b3e5..c3fa7d3fa770 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -375,7 +375,6 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in, unsigned long bytes_left; unsigned long total_out = 0; unsigned long pg_offset = 0; - char *kaddr; destlen = min_t(unsigned long, destlen, PAGE_SIZE); bytes_left = destlen; @@ -455,9 +454,7 @@ next: * end of the inline extent (destlen) to the end of the page */ if (pg_offset < destlen) { - kaddr = kmap_atomic(dest_page); - memset(kaddr + pg_offset, 0, destlen - pg_offset); - kunmap_atomic(kaddr); + memzero_page(dest_page, pg_offset, destlen - pg_offset); } return ret; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 70b23a0d03b1..1bb8ee97aae0 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -1126,6 +1126,11 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } + if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { + ret = -EIO; + goto out; + } + switch (zone.cond) { case BLK_ZONE_COND_OFFLINE: case BLK_ZONE_COND_READONLY: @@ -1273,7 +1278,7 @@ void btrfs_free_redirty_list(struct btrfs_transaction *trans) spin_unlock(&trans->releasing_ebs_lock); } -bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em) +bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_block_group *cache; @@ -1288,7 +1293,7 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em) if (!is_data_inode(&inode->vfs_inode)) return false; - cache = btrfs_lookup_block_group(fs_info, em->block_start); + cache = btrfs_lookup_block_group(fs_info, start); ASSERT(cache); if (!cache) return false; diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 5e41a74a9cb2..e55d32595c2c 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -53,7 +53,7 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb); void btrfs_free_redirty_list(struct btrfs_transaction *trans); -bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em); +bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start); void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, struct bio *bio); void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered); @@ -152,8 +152,7 @@ static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb) { } static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { } -static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, - struct extent_map *em) +static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) { return false; } diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 8e9626d63976..3e26b466476a 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -631,7 +631,6 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in, size_t ret2; unsigned long total_out = 0; unsigned long pg_offset = 0; - char *kaddr; stream = ZSTD_initDStream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); @@ -696,9 +695,7 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in, ret = 0; finish: if (pg_offset < destlen) { - kaddr = kmap_atomic(dest_page); - memset(kaddr + pg_offset, 0, destlen - pg_offset); - kunmap_atomic(kaddr); + memzero_page(dest_page, pg_offset, destlen - pg_offset); } return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index 0cb7ffd4977c..ea48c01fb76b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1020,11 +1020,7 @@ grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp) pgoff_t index; int sizebits; - sizebits = -1; - do { - sizebits++; - } while ((size << sizebits) < PAGE_SIZE); - + sizebits = PAGE_SHIFT - __ffs(size); index = block >> sizebits; /* @@ -1264,6 +1260,15 @@ static void bh_lru_install(struct buffer_head *bh) int i; check_irqs_on(); + /* + * the refcount of buffer_head in bh_lru prevents dropping the + * attached page(i.e., try_to_free_buffers) so it could cause + * failing page migration. + * Skip putting upcoming bh into bh_lru until migration is done. + */ + if (lru_cache_disabled()) + return; + bh_lru_lock(); b = this_cpu_ptr(&bh_lrus); @@ -1404,6 +1409,15 @@ __bread_gfp(struct block_device *bdev, sector_t block, } EXPORT_SYMBOL(__bread_gfp); +static void __invalidate_bh_lrus(struct bh_lru *b) +{ + int i; + + for (i = 0; i < BH_LRU_SIZE; i++) { + brelse(b->bhs[i]); + b->bhs[i] = NULL; + } +} /* * invalidate_bh_lrus() is called rarely - but not only at unmount. * This doesn't race because it runs in each cpu either in irq @@ -1412,16 +1426,12 @@ EXPORT_SYMBOL(__bread_gfp); static void invalidate_bh_lru(void *arg) { struct bh_lru *b = &get_cpu_var(bh_lrus); - int i; - for (i = 0; i < BH_LRU_SIZE; i++) { - brelse(b->bhs[i]); - b->bhs[i] = NULL; - } + __invalidate_bh_lrus(b); put_cpu_var(bh_lrus); } -static bool has_bh_in_lru(int cpu, void *dummy) +bool has_bh_in_lru(int cpu, void *dummy) { struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu); int i; @@ -1440,6 +1450,16 @@ void invalidate_bh_lrus(void) } EXPORT_SYMBOL_GPL(invalidate_bh_lrus); +void invalidate_bh_lrus_cpu(int cpu) +{ + struct bh_lru *b; + + bh_lru_lock(); + b = per_cpu_ptr(&bh_lrus, cpu); + __invalidate_bh_lrus(b); + bh_lru_unlock(); +} + void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset) { diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 471e40156065..94df854147d3 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -6,6 +6,7 @@ config CEPH_FS select LIBCRC32C select CRYPTO_AES select CRYPTO + select NETFS_SUPPORT default n help Choose Y or M here to include support for mounting the diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 26e66436f005..c1570fada3d8 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -12,6 +12,7 @@ #include <linux/signal.h> #include <linux/iversion.h> #include <linux/ktime.h> +#include <linux/netfs.h> #include "super.h" #include "mds_client.h" @@ -61,6 +62,9 @@ (CONGESTION_ON_THRESH(congestion_kb) - \ (CONGESTION_ON_THRESH(congestion_kb) >> 2)) +static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, + struct page *page, void **_fsdata); + static inline struct ceph_snap_context *page_snap_context(struct page *page) { if (PagePrivate(page)) @@ -124,8 +128,7 @@ static int ceph_set_page_dirty(struct page *page) * PagePrivate so that we get invalidatepage callback. */ BUG_ON(PagePrivate(page)); - page->private = (unsigned long)snapc; - SetPagePrivate(page); + attach_page_private(page, snapc); ret = __set_page_dirty_nobuffers(page); WARN_ON(!PageLocked(page)); @@ -144,19 +147,19 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, { struct inode *inode; struct ceph_inode_info *ci; - struct ceph_snap_context *snapc = page_snap_context(page); + struct ceph_snap_context *snapc; + + wait_on_page_fscache(page); inode = page->mapping->host; ci = ceph_inode(inode); - if (offset != 0 || length != PAGE_SIZE) { + if (offset != 0 || length != thp_size(page)) { dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n", inode, page, page->index, offset, length); return; } - ceph_invalidate_fscache_page(inode, page); - WARN_ON(!PageLocked(page)); if (!PagePrivate(page)) return; @@ -164,333 +167,222 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, dout("%p invalidatepage %p idx %lu full dirty page\n", inode, page, page->index); + snapc = detach_page_private(page); ceph_put_wrbuffer_cap_refs(ci, 1, snapc); ceph_put_snap_context(snapc); - page->private = 0; - ClearPagePrivate(page); } -static int ceph_releasepage(struct page *page, gfp_t g) +static int ceph_releasepage(struct page *page, gfp_t gfp) { dout("%p releasepage %p idx %lu (%sdirty)\n", page->mapping->host, page, page->index, PageDirty(page) ? "" : "not "); - /* Can we release the page from the cache? */ - if (!ceph_release_fscache_page(page, g)) - return 0; - + if (PageFsCache(page)) { + if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) + return 0; + wait_on_page_fscache(page); + } return !PagePrivate(page); } -/* read a single page, without unlocking it. */ -static int ceph_do_readpage(struct file *filp, struct page *page) +static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq) { - struct inode *inode = file_inode(filp); + struct inode *inode = rreq->mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_osd_client *osdc = &fsc->client->osdc; - struct ceph_osd_request *req; - struct ceph_vino vino = ceph_vino(inode); - int err = 0; - u64 off = page_offset(page); - u64 len = PAGE_SIZE; - - if (off >= i_size_read(inode)) { - zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); - return 0; - } - - if (ci->i_inline_version != CEPH_INLINE_NONE) { - /* - * Uptodate inline data should have been added - * into page cache while getting Fcr caps. - */ - if (off == 0) - return -EINVAL; - zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); - return 0; - } - - err = ceph_readpage_from_fscache(inode, page); - if (err == 0) - return -EINPROGRESS; - - dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n", - vino.ino, vino.snap, filp, off, len, page, page->index); - req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, 0, 1, - CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, - ci->i_truncate_seq, ci->i_truncate_size, - false); - if (IS_ERR(req)) - return PTR_ERR(req); + struct ceph_file_layout *lo = &ci->i_layout; + u32 blockoff; + u64 blockno; - osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false); + /* Expand the start downward */ + blockno = div_u64_rem(rreq->start, lo->stripe_unit, &blockoff); + rreq->start = blockno * lo->stripe_unit; + rreq->len += blockoff; - err = ceph_osdc_start_request(osdc, req, false); - if (!err) - err = ceph_osdc_wait_request(osdc, req); - - ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, - req->r_end_latency, err); - - ceph_osdc_put_request(req); - dout("readpage result %d\n", err); - - if (err == -ENOENT) - err = 0; - if (err < 0) { - ceph_fscache_readpage_cancel(inode, page); - if (err == -EBLOCKLISTED) - fsc->blocklisted = true; - goto out; - } - if (err < PAGE_SIZE) - /* zero fill remainder of page */ - zero_user_segment(page, err, PAGE_SIZE); - else - flush_dcache_page(page); - - SetPageUptodate(page); - ceph_readpage_to_fscache(inode, page); - -out: - return err < 0 ? err : 0; + /* Now, round up the length to the next block */ + rreq->len = roundup(rreq->len, lo->stripe_unit); } -static int ceph_readpage(struct file *filp, struct page *page) +static bool ceph_netfs_clamp_length(struct netfs_read_subrequest *subreq) { - int r = ceph_do_readpage(filp, page); - if (r != -EINPROGRESS) - unlock_page(page); - else - r = 0; - return r; + struct inode *inode = subreq->rreq->mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_inode_info *ci = ceph_inode(inode); + u64 objno, objoff; + u32 xlen; + + /* Truncate the extent at the end of the current block */ + ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len, + &objno, &objoff, &xlen); + subreq->len = min(xlen, fsc->mount_options->rsize); + return true; } -/* - * Finish an async read(ahead) op. - */ -static void finish_read(struct ceph_osd_request *req) +static void finish_netfs_read(struct ceph_osd_request *req) { - struct inode *inode = req->r_inode; - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_osd_data *osd_data; - int rc = req->r_result <= 0 ? req->r_result : 0; - int bytes = req->r_result >= 0 ? req->r_result : 0; + struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode); + struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); + struct netfs_read_subrequest *subreq = req->r_priv; int num_pages; - int i; + int err = req->r_result; - dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); - if (rc == -EBLOCKLISTED) - ceph_inode_to_client(inode)->blocklisted = true; + ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, err); - /* unlock all pages, zeroing any data we didn't read */ - osd_data = osd_req_op_extent_osd_data(req, 0); - BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); - num_pages = calc_pages_for((u64)osd_data->alignment, - (u64)osd_data->length); - for (i = 0; i < num_pages; i++) { - struct page *page = osd_data->pages[i]; - - if (rc < 0 && rc != -ENOENT) { - ceph_fscache_readpage_cancel(inode, page); - goto unlock; - } - if (bytes < (int)PAGE_SIZE) { - /* zero (remainder of) page */ - int s = bytes < 0 ? 0 : bytes; - zero_user_segment(page, s, PAGE_SIZE); - } - dout("finish_read %p uptodate %p idx %lu\n", inode, page, - page->index); - flush_dcache_page(page); - SetPageUptodate(page); - ceph_readpage_to_fscache(inode, page); -unlock: - unlock_page(page); - put_page(page); - bytes -= PAGE_SIZE; - } + dout("%s: result %d subreq->len=%zu i_size=%lld\n", __func__, req->r_result, + subreq->len, i_size_read(req->r_inode)); - ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, - req->r_end_latency, rc); + /* no object means success but no data */ + if (err == -ENOENT) + err = 0; + else if (err == -EBLOCKLISTED) + fsc->blocklisted = true; + + if (err >= 0 && err < subreq->len) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + + netfs_subreq_terminated(subreq, err, true); - kfree(osd_data->pages); + num_pages = calc_pages_for(osd_data->alignment, osd_data->length); + ceph_put_page_vector(osd_data->pages, num_pages, false); + iput(req->r_inode); } -/* - * start an async read(ahead) operation. return nr_pages we submitted - * a read for on success, or negative error code. - */ -static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, - struct list_head *page_list, int max) +static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq) { - struct ceph_osd_client *osdc = - &ceph_inode_to_client(inode)->client->osdc; + struct netfs_read_request *rreq = subreq->rreq; + struct inode *inode = rreq->mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); - struct page *page = lru_to_page(page_list); - struct ceph_vino vino; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_request *req; - u64 off; - u64 len; - int i; + struct ceph_vino vino = ceph_vino(inode); + struct iov_iter iter; struct page **pages; - pgoff_t next_index; - int nr_pages = 0; - int got = 0; - int ret = 0; - - if (!rw_ctx) { - /* caller of readpages does not hold buffer and read caps - * (fadvise, madvise and readahead cases) */ - int want = CEPH_CAP_FILE_CACHE; - ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, - true, &got); - if (ret < 0) { - dout("start_read %p, error getting cap\n", inode); - } else if (!(got & want)) { - dout("start_read %p, no cache cap\n", inode); - ret = 0; - } - if (ret <= 0) { - if (got) - ceph_put_cap_refs(ci, got); - while (!list_empty(page_list)) { - page = lru_to_page(page_list); - list_del(&page->lru); - put_page(page); - } - return ret; - } - } - - off = (u64) page_offset(page); + size_t page_off; + int err = 0; + u64 len = subreq->len; - /* count pages */ - next_index = page->index; - list_for_each_entry_reverse(page, page_list, lru) { - if (page->index != next_index) - break; - nr_pages++; - next_index++; - if (max && nr_pages == max) - break; - } - len = nr_pages << PAGE_SHIFT; - dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, - off, len); - vino = ceph_vino(inode); - req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, - 0, 1, CEPH_OSD_OP_READ, - CEPH_OSD_FLAG_READ, NULL, - ci->i_truncate_seq, ci->i_truncate_size, - false); + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len, + 0, 1, CEPH_OSD_OP_READ, + CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica, + NULL, ci->i_truncate_seq, ci->i_truncate_size, false); if (IS_ERR(req)) { - ret = PTR_ERR(req); + err = PTR_ERR(req); + req = NULL; goto out; } - /* build page vector */ - nr_pages = calc_pages_for(0, len); - pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL); - if (!pages) { - ret = -ENOMEM; - goto out_put; - } - for (i = 0; i < nr_pages; ++i) { - page = list_entry(page_list->prev, struct page, lru); - BUG_ON(PageLocked(page)); - list_del(&page->lru); - - dout("start_read %p adding %p idx %lu\n", inode, page, - page->index); - if (add_to_page_cache_lru(page, &inode->i_data, page->index, - GFP_KERNEL)) { - ceph_fscache_uncache_page(inode, page); - put_page(page); - dout("start_read %p add_to_page_cache failed %p\n", - inode, page); - nr_pages = i; - if (nr_pages > 0) { - len = nr_pages << PAGE_SHIFT; - osd_req_op_extent_update(req, 0, len); - break; - } - goto out_pages; - } - pages[i] = page; + dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len); + iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len); + err = iov_iter_get_pages_alloc(&iter, &pages, len, &page_off); + if (err < 0) { + dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err); + goto out; } + + /* should always give us a page-aligned read */ + WARN_ON_ONCE(page_off); + len = err; + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); - req->r_callback = finish_read; + req->r_callback = finish_netfs_read; + req->r_priv = subreq; req->r_inode = inode; + ihold(inode); - dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); - ret = ceph_osdc_start_request(osdc, req, false); - if (ret < 0) - goto out_pages; + err = ceph_osdc_start_request(req->r_osdc, req, false); + if (err) + iput(inode); +out: ceph_osdc_put_request(req); + if (err) + netfs_subreq_terminated(subreq, err, false); + dout("%s: result %d\n", __func__, err); +} - /* After adding locked pages to page cache, the inode holds cache cap. - * So we can drop our cap refs. */ - if (got) - ceph_put_cap_refs(ci, got); +static void ceph_init_rreq(struct netfs_read_request *rreq, struct file *file) +{ +} - return nr_pages; +static void ceph_readahead_cleanup(struct address_space *mapping, void *priv) +{ + struct inode *inode = mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + int got = (uintptr_t)priv; -out_pages: - for (i = 0; i < nr_pages; ++i) { - ceph_fscache_readpage_cancel(inode, pages[i]); - unlock_page(pages[i]); - } - ceph_put_page_vector(pages, nr_pages, false); -out_put: - ceph_osdc_put_request(req); -out: if (got) ceph_put_cap_refs(ci, got); - return ret; } +const struct netfs_read_request_ops ceph_netfs_read_ops = { + .init_rreq = ceph_init_rreq, + .is_cache_enabled = ceph_is_cache_enabled, + .begin_cache_operation = ceph_begin_cache_operation, + .issue_op = ceph_netfs_issue_op, + .expand_readahead = ceph_netfs_expand_readahead, + .clamp_length = ceph_netfs_clamp_length, + .check_write_begin = ceph_netfs_check_write_begin, + .cleanup = ceph_readahead_cleanup, +}; -/* - * Read multiple pages. Leave pages we don't read + unlock in page_list; - * the caller (VM) cleans them up. - */ -static int ceph_readpages(struct file *file, struct address_space *mapping, - struct list_head *page_list, unsigned nr_pages) +/* read a single page, without unlocking it. */ +static int ceph_readpage(struct file *file, struct page *page) { struct inode *inode = file_inode(file); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_file_info *fi = file->private_data; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_vino vino = ceph_vino(inode); + u64 off = page_offset(page); + u64 len = thp_size(page); + + if (ci->i_inline_version != CEPH_INLINE_NONE) { + /* + * Uptodate inline data should have been added + * into page cache while getting Fcr caps. + */ + if (off == 0) { + unlock_page(page); + return -EINVAL; + } + zero_user_segment(page, 0, thp_size(page)); + SetPageUptodate(page); + unlock_page(page); + return 0; + } + + dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n", + vino.ino, vino.snap, file, off, len, page, page->index); + + return netfs_readpage(file, page, &ceph_netfs_read_ops, NULL); +} + +static void ceph_readahead(struct readahead_control *ractl) +{ + struct inode *inode = file_inode(ractl->file); + struct ceph_file_info *fi = ractl->file->private_data; struct ceph_rw_context *rw_ctx; - int rc = 0; - int max = 0; + int got = 0; + int ret = 0; if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE) - return -EINVAL; + return; - rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list, - &nr_pages); + rw_ctx = ceph_find_rw_context(fi); + if (!rw_ctx) { + /* + * readahead callers do not necessarily hold Fcb caps + * (e.g. fadvise, madvise). + */ + int want = CEPH_CAP_FILE_CACHE; - if (rc == 0) - goto out; + ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got); + if (ret < 0) + dout("start_read %p, error getting cap\n", inode); + else if (!(got & want)) + dout("start_read %p, no cache cap\n", inode); - rw_ctx = ceph_find_rw_context(fi); - max = fsc->mount_options->rsize >> PAGE_SHIFT; - dout("readpages %p file %p ctx %p nr_pages %d max %d\n", - inode, file, rw_ctx, nr_pages, max); - while (!list_empty(page_list)) { - rc = start_read(inode, rw_ctx, page_list, max); - if (rc < 0) - goto out; + if (ret <= 0) + return; } -out: - ceph_fscache_readpages_cancel(inode, page_list); - - dout("readpages %p file %p ret %d\n", inode, file, rc); - return rc; + netfs_readahead(ractl, &ceph_netfs_read_ops, (void *)(uintptr_t)got); } struct ceph_writeback_ctl @@ -585,8 +477,8 @@ static u64 get_writepages_data_length(struct inode *inode, spin_unlock(&ci->i_ceph_lock); WARN_ON(!found); } - if (end > page_offset(page) + PAGE_SIZE) - end = page_offset(page) + PAGE_SIZE; + if (end > page_offset(page) + thp_size(page)) + end = page_offset(page) + thp_size(page); return end > start ? end - start : 0; } @@ -604,7 +496,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) struct ceph_snap_context *snapc, *oldest; loff_t page_off = page_offset(page); int err; - loff_t len = PAGE_SIZE; + loff_t len = thp_size(page); struct ceph_writeback_ctl ceph_wbc; struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_osd_request *req; @@ -632,7 +524,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) /* is this a partial page at end of file? */ if (page_off >= ceph_wbc.i_size) { dout("%p page eof %llu\n", page, ceph_wbc.i_size); - page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); + page->mapping->a_ops->invalidatepage(page, 0, thp_size(page)); return 0; } @@ -658,7 +550,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) } /* it may be a short write due to an object boundary */ - WARN_ON_ONCE(len > PAGE_SIZE); + WARN_ON_ONCE(len > thp_size(page)); osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false); dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len); @@ -667,7 +559,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) if (!err) err = ceph_osdc_wait_request(osdc, req); - ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, err); ceph_osdc_put_request(req); @@ -695,8 +587,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) dout("writepage cleaned page %p\n", page); err = 0; /* vfs expects us to return 0 */ } - page->private = 0; - ClearPagePrivate(page); + oldest = detach_page_private(page); + WARN_ON_ONCE(oldest != snapc); end_page_writeback(page); ceph_put_wrbuffer_cap_refs(ci, 1, snapc); ceph_put_snap_context(snapc); /* page's reference */ @@ -755,7 +647,7 @@ static void writepages_finish(struct ceph_osd_request *req) ceph_clear_error_write(ci); } - ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, rc); /* @@ -788,11 +680,9 @@ static void writepages_finish(struct ceph_osd_request *req) clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); - ceph_put_snap_context(page_snap_context(page)); - page->private = 0; - ClearPagePrivate(page); - dout("unlocking %p\n", page); + ceph_put_snap_context(detach_page_private(page)); end_page_writeback(page); + dout("unlocking %p\n", page); if (remove_page) generic_error_remove_page(inode->i_mapping, @@ -949,7 +839,7 @@ get_more_pages: page_offset(page) >= i_size_read(inode)) && clear_page_dirty_for_io(page)) mapping->a_ops->invalidatepage(page, - 0, PAGE_SIZE); + 0, thp_size(page)); unlock_page(page); continue; } @@ -1038,7 +928,7 @@ get_more_pages: pages[locked_pages++] = page; pvec.pages[i] = NULL; - len += PAGE_SIZE; + len += thp_size(page); } /* did we get anything? */ @@ -1087,7 +977,7 @@ new_request: BUG_ON(IS_ERR(req)); } BUG_ON(len < page_offset(pages[locked_pages - 1]) + - PAGE_SIZE - offset); + thp_size(page) - offset); req->r_callback = writepages_finish; req->r_inode = inode; @@ -1117,7 +1007,7 @@ new_request: } set_page_writeback(pages[i]); - len += PAGE_SIZE; + len += thp_size(page); } if (ceph_wbc.size_stable) { @@ -1126,7 +1016,7 @@ new_request: /* writepages_finish() clears writeback pages * according to the data length, so make sure * data length covers all locked pages */ - u64 min_len = len + 1 - PAGE_SIZE; + u64 min_len = len + 1 - thp_size(page); len = get_writepages_data_length(inode, pages[i - 1], offset); len = max(len, min_len); @@ -1302,6 +1192,31 @@ ceph_find_incompatible(struct page *page) return NULL; } +static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, + struct page *page, void **_fsdata) +{ + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc; + + snapc = ceph_find_incompatible(page); + if (snapc) { + int r; + + unlock_page(page); + put_page(page); + if (IS_ERR(snapc)) + return PTR_ERR(snapc); + + ceph_queue_writeback(inode); + r = wait_event_killable(ci->i_cap_wq, + context_is_writeable_or_written(inode, snapc)); + ceph_put_snap_context(snapc); + return r == 0 ? -EAGAIN : r; + } + return 0; +} + /* * We are only allowed to write into/dirty the page if the page is * clean, or already dirty within the same snap context. @@ -1312,75 +1227,47 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_snap_context *snapc; struct page *page = NULL; pgoff_t index = pos >> PAGE_SHIFT; - int pos_in_page = pos & ~PAGE_MASK; - int r = 0; + int r; - dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len); - - for (;;) { + /* + * Uninlining should have already been done and everything updated, EXCEPT + * for inline_version sent to the MDS. + */ + if (ci->i_inline_version != CEPH_INLINE_NONE) { page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) { - r = -ENOMEM; - break; - } - - snapc = ceph_find_incompatible(page); - if (snapc) { - if (IS_ERR(snapc)) { - r = PTR_ERR(snapc); - break; - } - unlock_page(page); - put_page(page); - page = NULL; - ceph_queue_writeback(inode); - r = wait_event_killable(ci->i_cap_wq, - context_is_writeable_or_written(inode, snapc)); - ceph_put_snap_context(snapc); - if (r != 0) - break; - continue; - } - - if (PageUptodate(page)) { - dout(" page %p already uptodate\n", page); - break; - } + if (!page) + return -ENOMEM; /* - * In some cases we don't need to read at all: - * - full page write - * - write that lies completely beyond EOF - * - write that covers the the page from start to EOF or beyond it + * The inline_version on a new inode is set to 1. If that's the + * case, then the page is brand new and isn't yet Uptodate. */ - if ((pos_in_page == 0 && len == PAGE_SIZE) || - (pos >= i_size_read(inode)) || - (pos_in_page == 0 && (pos + len) >= i_size_read(inode))) { - zero_user_segments(page, 0, pos_in_page, - pos_in_page + len, PAGE_SIZE); - break; + r = 0; + if (index == 0 && ci->i_inline_version != 1) { + if (!PageUptodate(page)) { + WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n", + ci->i_inline_version); + r = -EINVAL; + } + goto out; } - - /* - * We need to read it. If we get back -EINPROGRESS, then the page was - * handed off to fscache and it will be unlocked when the read completes. - * Refind the page in that case so we can reacquire the page lock. Otherwise - * we got a hard error or the read was completed synchronously. - */ - r = ceph_do_readpage(file, page); - if (r != -EINPROGRESS) - break; + zero_user_segment(page, 0, thp_size(page)); + SetPageUptodate(page); + goto out; } + r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &page, NULL, + &ceph_netfs_read_ops, NULL); +out: + if (r == 0) + wait_on_page_fscache(page); if (r < 0) { - if (page) { - unlock_page(page); + if (page) put_page(page); - } } else { + WARN_ON_ONCE(!PageLocked(page)); *pagep = page; } return r; @@ -1438,7 +1325,7 @@ static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter) const struct address_space_operations ceph_aops = { .readpage = ceph_readpage, - .readpages = ceph_readpages, + .readahead = ceph_readahead, .writepage = ceph_writepage, .writepages = ceph_writepages_start, .write_begin = ceph_write_begin, @@ -1470,7 +1357,6 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) struct inode *inode = file_inode(vma->vm_file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; - struct page *pinned_page = NULL; loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT; int want, got, err; sigset_t oldset; @@ -1478,21 +1364,20 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) ceph_block_sigs(&oldset); - dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", - inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE); + dout("filemap_fault %p %llx.%llx %llu trying to get caps\n", + inode, ceph_vinop(inode), off); if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; got = 0; - err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, - &got, &pinned_page); + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, &got); if (err < 0) goto out_restore; - dout("filemap_fault %p %llu~%zd got cap refs on %s\n", - inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got)); + dout("filemap_fault %p %llu got cap refs on %s\n", + inode, off, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || ci->i_inline_version == CEPH_INLINE_NONE) { @@ -1500,14 +1385,11 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) ceph_add_rw_context(fi, &rw_ctx); ret = filemap_fault(vmf); ceph_del_rw_context(fi, &rw_ctx); - dout("filemap_fault %p %llu~%zd drop cap refs %s ret %x\n", - inode, off, (size_t)PAGE_SIZE, - ceph_cap_string(got), ret); + dout("filemap_fault %p %llu drop cap refs %s ret %x\n", + inode, off, ceph_cap_string(got), ret); } else err = -EAGAIN; - if (pinned_page) - put_page(pinned_page); ceph_put_cap_refs(ci, got); if (err != -EAGAIN) @@ -1542,8 +1424,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) vmf->page = page; ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; out_inline: - dout("filemap_fault %p %llu~%zd read inline data ret %x\n", - inode, off, (size_t)PAGE_SIZE, ret); + dout("filemap_fault %p %llu read inline data ret %x\n", + inode, off, ret); } out_restore: ceph_restore_sigs(&oldset); @@ -1553,9 +1435,6 @@ out_restore: return ret; } -/* - * Reuse write_begin here for simplicity. - */ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -1591,10 +1470,10 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) goto out_free; } - if (off + PAGE_SIZE <= size) - len = PAGE_SIZE; + if (off + thp_size(page) <= size) + len = thp_size(page); else - len = size & ~PAGE_MASK; + len = offset_in_thp(page, size); dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n", inode, ceph_vinop(inode), off, len, size); @@ -1604,8 +1483,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) want = CEPH_CAP_FILE_BUFFER; got = 0; - err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, - &got, NULL); + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, &got); if (err < 0) goto out_free; @@ -1832,7 +1710,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); - ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, err); out_put: @@ -2057,6 +1935,10 @@ int ceph_pool_perm_check(struct inode *inode, int need) s64 pool; int ret, flags; + /* Only need to do this for regular files */ + if (!S_ISREG(inode->i_mode)) + return 0; + if (ci->i_vino.snap != CEPH_NOSNAP) { /* * Pool permission check needs to write to the first object. diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 2f5cb6bc78e1..9cfadbb86568 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -173,7 +173,6 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) ci->fscache = NULL; - fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode); fscache_relinquish_cookie(cookie, &ci->i_vino, false); } @@ -194,7 +193,6 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) dout("fscache_file_set_cookie %p %p disabling cache\n", inode, filp); fscache_disable_cookie(ci->fscache, &ci->i_vino, false); - fscache_uncache_all_inode_pages(ci->fscache, inode); } else { fscache_enable_cookie(ci->fscache, &ci->i_vino, i_size_read(inode), ceph_fscache_can_enable, inode); @@ -205,108 +203,6 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) } } -static void ceph_readpage_from_fscache_complete(struct page *page, void *data, int error) -{ - if (!error) - SetPageUptodate(page); - - unlock_page(page); -} - -static inline bool cache_valid(struct ceph_inode_info *ci) -{ - return ci->i_fscache_gen == ci->i_rdcache_gen; -} - - -/* Atempt to read from the fscache, - * - * This function is called from the readpage_nounlock context. DO NOT attempt to - * unlock the page here (or in the callback). - */ -int ceph_readpage_from_fscache(struct inode *inode, struct page *page) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - int ret; - - if (!cache_valid(ci)) - return -ENOBUFS; - - ret = fscache_read_or_alloc_page(ci->fscache, page, - ceph_readpage_from_fscache_complete, NULL, - GFP_KERNEL); - - switch (ret) { - case 0: /* Page found */ - dout("page read submitted\n"); - return 0; - case -ENOBUFS: /* Pages were not found, and can't be */ - case -ENODATA: /* Pages were not found */ - dout("page/inode not in cache\n"); - return ret; - default: - dout("%s: unknown error ret = %i\n", __func__, ret); - return ret; - } -} - -int ceph_readpages_from_fscache(struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - int ret; - - if (!cache_valid(ci)) - return -ENOBUFS; - - ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages, - ceph_readpage_from_fscache_complete, - NULL, mapping_gfp_mask(mapping)); - - switch (ret) { - case 0: /* All pages found */ - dout("all-page read submitted\n"); - return 0; - case -ENOBUFS: /* Some pages were not found, and can't be */ - case -ENODATA: /* some pages were not found */ - dout("page/inode not in cache\n"); - return ret; - default: - dout("%s: unknown error ret = %i\n", __func__, ret); - return ret; - } -} - -void ceph_readpage_to_fscache(struct inode *inode, struct page *page) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - int ret; - - if (!PageFsCache(page)) - return; - - if (!cache_valid(ci)) - return; - - ret = fscache_write_page(ci->fscache, page, i_size_read(inode), - GFP_KERNEL); - if (ret) - fscache_uncache_page(ci->fscache, page); -} - -void ceph_invalidate_fscache_page(struct inode* inode, struct page *page) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - - if (!PageFsCache(page)) - return; - - fscache_wait_on_page_write(ci->fscache, page); - fscache_uncache_page(ci->fscache, page); -} - void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) { if (fscache_cookie_valid(fsc->fscache)) { @@ -329,24 +225,3 @@ void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) } fsc->fscache = NULL; } - -/* - * caller should hold CEPH_CAP_FILE_{RD,CACHE} - */ -void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci) -{ - if (cache_valid(ci)) - return; - - /* resue i_truncate_mutex. There should be no pending - * truncate while the caller holds CEPH_CAP_FILE_RD */ - mutex_lock(&ci->i_truncate_mutex); - if (!cache_valid(ci)) { - if (fscache_check_consistency(ci->fscache, &ci->i_vino)) - fscache_invalidate(ci->fscache); - spin_lock(&ci->i_ceph_lock); - ci->i_fscache_gen = ci->i_rdcache_gen; - spin_unlock(&ci->i_ceph_lock); - } - mutex_unlock(&ci->i_truncate_mutex); -} diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index 89dbdd1eb14a..1409d6149281 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -9,6 +9,8 @@ #ifndef _CEPH_CACHE_H #define _CEPH_CACHE_H +#include <linux/netfs.h> + #ifdef CONFIG_CEPH_FSCACHE extern struct fscache_netfs ceph_cache_netfs; @@ -29,54 +31,37 @@ int ceph_readpages_from_fscache(struct inode *inode, struct address_space *mapping, struct list_head *pages, unsigned *nr_pages); -void ceph_readpage_to_fscache(struct inode *inode, struct page *page); -void ceph_invalidate_fscache_page(struct inode* inode, struct page *page); static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) { ci->fscache = NULL; - ci->i_fscache_gen = 0; } -static inline void ceph_fscache_invalidate(struct inode *inode) +static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci) { - fscache_invalidate(ceph_inode(inode)->fscache); + return ci->fscache; } -static inline void ceph_fscache_uncache_page(struct inode *inode, - struct page *page) +static inline void ceph_fscache_invalidate(struct inode *inode) { - struct ceph_inode_info *ci = ceph_inode(inode); - return fscache_uncache_page(ci->fscache, page); + fscache_invalidate(ceph_inode(inode)->fscache); } -static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) +static inline bool ceph_is_cache_enabled(struct inode *inode) { - struct inode* inode = page->mapping->host; - struct ceph_inode_info *ci = ceph_inode(inode); - return fscache_maybe_release_page(ci->fscache, page, gfp); -} + struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(inode)); -static inline void ceph_fscache_readpage_cancel(struct inode *inode, - struct page *page) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - if (fscache_cookie_valid(ci->fscache) && PageFsCache(page)) - __fscache_uncache_page(ci->fscache, page); + if (!cookie) + return false; + return fscache_cookie_enabled(cookie); } -static inline void ceph_fscache_readpages_cancel(struct inode *inode, - struct list_head *pages) +static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq) { - struct ceph_inode_info *ci = ceph_inode(inode); - return fscache_readpages_cancel(ci->fscache, pages); -} + struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode)); -static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci) -{ - ci->i_fscache_gen = ci->i_rdcache_gen - 1; + return fscache_begin_read_operation(rreq, cookie); } - #else static inline int ceph_fscache_register(void) @@ -102,6 +87,11 @@ static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) { } +static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci) +{ + return NULL; +} + static inline void ceph_fscache_register_inode_cookie(struct inode *inode) { } @@ -115,62 +105,19 @@ static inline void ceph_fscache_file_set_cookie(struct inode *inode, { } -static inline void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci) -{ -} - -static inline void ceph_fscache_uncache_page(struct inode *inode, - struct page *pages) -{ -} - -static inline int ceph_readpage_from_fscache(struct inode* inode, - struct page *page) -{ - return -ENOBUFS; -} - -static inline int ceph_readpages_from_fscache(struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) -{ - return -ENOBUFS; -} - -static inline void ceph_readpage_to_fscache(struct inode *inode, - struct page *page) -{ -} - static inline void ceph_fscache_invalidate(struct inode *inode) { } -static inline void ceph_invalidate_fscache_page(struct inode *inode, - struct page *page) +static inline bool ceph_is_cache_enabled(struct inode *inode) { + return false; } -static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) -{ - return 1; -} - -static inline void ceph_fscache_readpage_cancel(struct inode *inode, - struct page *page) -{ -} - -static inline void ceph_fscache_readpages_cancel(struct inode *inode, - struct list_head *pages) -{ -} - -static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci) +static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq) { + return -ENOBUFS; } - #endif -#endif +#endif /* _CEPH_CACHE_H */ diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 3c03fa37cac4..a5e93b185515 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1390,7 +1390,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, arg->flush_tid = flush_tid; arg->oldest_flush_tid = oldest_flush_tid; - arg->size = inode->i_size; + arg->size = i_size_read(inode); ci->i_reported_size = arg->size; arg->max_size = ci->i_wanted_max_size; if (cap == ci->i_auth_cap) { @@ -1867,6 +1867,7 @@ static int try_nonblocking_invalidate(struct inode *inode) u32 invalidating_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); + ceph_fscache_invalidate(inode); invalidate_mapping_pages(&inode->i_data, 0, -1); spin_lock(&ci->i_ceph_lock); @@ -1884,7 +1885,7 @@ static int try_nonblocking_invalidate(struct inode *inode) bool __ceph_should_report_size(struct ceph_inode_info *ci) { - loff_t size = ci->vfs_inode.i_size; + loff_t size = i_size_read(&ci->vfs_inode); /* mds will adjust max size according to the reported size */ if (ci->i_flushing_caps & CEPH_CAP_FILE_WR) return false; @@ -2730,10 +2731,6 @@ again: *got = need | want; else *got = need; - if (S_ISREG(inode->i_mode) && - (need & CEPH_CAP_FILE_RD) && - !(*got & CEPH_CAP_FILE_CACHE)) - ceph_disable_fscache_readpage(ci); ceph_take_cap_refs(ci, *got, true); ret = 1; } @@ -2858,8 +2855,7 @@ int ceph_try_get_caps(struct inode *inode, int need, int want, * due to a small max_size, make sure we check_max_size (and possibly * ask the mds) so we don't get hung up indefinitely. */ -int ceph_get_caps(struct file *filp, int need, int want, - loff_t endoff, int *got, struct page **pinned_page) +int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got) { struct ceph_file_info *fi = filp->private_data; struct inode *inode = file_inode(filp); @@ -2957,11 +2953,11 @@ int ceph_get_caps(struct file *filp, int need, int want, struct page *page = find_get_page(inode->i_mapping, 0); if (page) { - if (PageUptodate(page)) { - *pinned_page = page; - break; - } + bool uptodate = PageUptodate(page); + put_page(page); + if (uptodate) + break; } /* * drop cap refs first because getattr while @@ -2983,11 +2979,6 @@ int ceph_get_caps(struct file *filp, int need, int want, } break; } - - if (S_ISREG(ci->vfs_inode.i_mode) && - (_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE)) - ceph_fscache_revalidate_cookie(ci); - *got = _got; return 0; } @@ -3308,7 +3299,7 @@ static void handle_cap_grant(struct inode *inode, dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", inode, cap, session->s_mds, seq, ceph_cap_string(newcaps)); dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, - inode->i_size); + i_size_read(inode)); /* diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 66989c880adb..425f3356332a 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -162,34 +162,34 @@ static int metric_show(struct seq_file *s, void *p) seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n"); seq_printf(s, "-----------------------------------------------------------------------------------\n"); - spin_lock(&m->read_latency_lock); + spin_lock(&m->read_metric_lock); total = m->total_reads; sum = m->read_latency_sum; avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; min = m->read_latency_min; max = m->read_latency_max; sq = m->read_latency_sq_sum; - spin_unlock(&m->read_latency_lock); + spin_unlock(&m->read_metric_lock); CEPH_METRIC_SHOW("read", total, avg, min, max, sq); - spin_lock(&m->write_latency_lock); + spin_lock(&m->write_metric_lock); total = m->total_writes; sum = m->write_latency_sum; avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; min = m->write_latency_min; max = m->write_latency_max; sq = m->write_latency_sq_sum; - spin_unlock(&m->write_latency_lock); + spin_unlock(&m->write_metric_lock); CEPH_METRIC_SHOW("write", total, avg, min, max, sq); - spin_lock(&m->metadata_latency_lock); + spin_lock(&m->metadata_metric_lock); total = m->total_metadatas; sum = m->metadata_latency_sum; avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; min = m->metadata_latency_min; max = m->metadata_latency_max; sq = m->metadata_latency_sq_sum; - spin_unlock(&m->metadata_latency_lock); + spin_unlock(&m->metadata_metric_lock); CEPH_METRIC_SHOW("metadata", total, avg, min, max, sq); seq_printf(s, "\n"); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index f7a790ed62c4..5624fae7a603 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -631,10 +631,12 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) switch (whence) { case SEEK_CUR: offset += file->f_pos; + break; case SEEK_SET: break; case SEEK_END: retval = -EOPNOTSUPP; + goto out; default: goto out; } @@ -665,8 +667,8 @@ out: /* * Handle lookups for the hidden .snap directory. */ -int ceph_handle_snapdir(struct ceph_mds_request *req, - struct dentry *dentry, int err) +struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req, + struct dentry *dentry, int err) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */ @@ -674,18 +676,17 @@ int ceph_handle_snapdir(struct ceph_mds_request *req, /* .snap dir? */ if (err == -ENOENT && ceph_snap(parent) == CEPH_NOSNAP && - strcmp(dentry->d_name.name, - fsc->mount_options->snapdir_name) == 0) { + strcmp(dentry->d_name.name, fsc->mount_options->snapdir_name) == 0) { + struct dentry *res; struct inode *inode = ceph_get_snapdir(parent); - if (IS_ERR(inode)) - return PTR_ERR(inode); - dout("ENOENT on snapdir %p '%pd', linking to snapdir %p\n", - dentry, dentry, inode); - BUG_ON(!d_unhashed(dentry)); - d_add(dentry, inode); - err = 0; + + res = d_splice_alias(inode, dentry); + dout("ENOENT on snapdir %p '%pd', linking to snapdir %p. Spliced dentry %p\n", + dentry, dentry, inode, res); + if (res) + dentry = res; } - return err; + return dentry; } /* @@ -741,6 +742,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; + struct dentry *res; int op; int mask; int err; @@ -791,7 +793,13 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, req->r_parent = dir; set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); err = ceph_mdsc_do_request(mdsc, NULL, req); - err = ceph_handle_snapdir(req, dentry, err); + res = ceph_handle_snapdir(req, dentry, err); + if (IS_ERR(res)) { + err = PTR_ERR(res); + } else { + dentry = res; + err = 0; + } dentry = ceph_finish_lookup(req, dentry, err); ceph_mdsc_put_request(req); /* will dput(dentry) */ dout("lookup result=%p\n", dentry); diff --git a/fs/ceph/export.c b/fs/ceph/export.c index f22156ee7306..65540a4429b2 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -129,6 +129,10 @@ static struct inode *__lookup_inode(struct super_block *sb, u64 ino) vino.ino = ino; vino.snap = CEPH_NOSNAP; + + if (ceph_vino_is_reserved(vino)) + return ERR_PTR(-ESTALE); + inode = ceph_find_inode(sb, vino); if (!inode) { struct ceph_mds_request *req; @@ -178,8 +182,10 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) return ERR_CAST(inode); /* We need LINK caps to reliably check i_nlink */ err = ceph_do_getattr(inode, CEPH_CAP_LINK_SHARED, false); - if (err) + if (err) { + iput(inode); return ERR_PTR(err); + } /* -ESTALE if inode as been unlinked and no file is open */ if ((inode->i_nlink == 0) && (atomic_read(&inode->i_count) == 1)) { iput(inode); @@ -212,6 +218,10 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb, vino.ino = sfh->ino; vino.snap = sfh->snapid; } + + if (ceph_vino_is_reserved(vino)) + return ERR_PTR(-ESTALE); + inode = ceph_find_inode(sb, vino); if (inode) return d_obtain_alias(inode); diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 209535d5b8d3..77fc037d5beb 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -739,9 +739,12 @@ retry: err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, req); - err = ceph_handle_snapdir(req, dentry, err); - if (err) + dentry = ceph_handle_snapdir(req, dentry, err); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); goto out_req; + } + err = 0; if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); @@ -892,7 +895,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, if (!ret) ret = ceph_osdc_wait_request(osdc, req); - ceph_update_read_latency(&fsc->mdsc->metric, + ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, ret); @@ -1034,16 +1037,6 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) dout("ceph_aio_complete_req %p rc %d bytes %u\n", inode, rc, osd_data->bvec_pos.iter.bi_size); - /* r_start_latency == 0 means the request was not submitted */ - if (req->r_start_latency) { - if (aio_req->write) - ceph_update_write_latency(metric, req->r_start_latency, - req->r_end_latency, rc); - else - ceph_update_read_latency(metric, req->r_start_latency, - req->r_end_latency, rc); - } - if (rc == -EOLDSNAPC) { struct ceph_aio_work *aio_work; BUG_ON(!aio_req->write); @@ -1086,6 +1079,16 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) } } + /* r_start_latency == 0 means the request was not submitted */ + if (req->r_start_latency) { + if (aio_req->write) + ceph_update_write_metrics(metric, req->r_start_latency, + req->r_end_latency, rc); + else + ceph_update_read_metrics(metric, req->r_start_latency, + req->r_end_latency, rc); + } + put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs, aio_req->should_dirty); ceph_osdc_put_request(req); @@ -1290,10 +1293,10 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, ret = ceph_osdc_wait_request(&fsc->client->osdc, req); if (write) - ceph_update_write_latency(metric, req->r_start_latency, + ceph_update_write_metrics(metric, req->r_start_latency, req->r_end_latency, ret); else - ceph_update_read_latency(metric, req->r_start_latency, + ceph_update_read_metrics(metric, req->r_start_latency, req->r_end_latency, ret); size = i_size_read(inode); @@ -1467,7 +1470,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); - ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, ret); out: ceph_osdc_put_request(req); @@ -1510,7 +1513,6 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) size_t len = iov_iter_count(to); struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); - struct page *pinned_page = NULL; bool direct_lock = iocb->ki_flags & IOCB_DIRECT; ssize_t ret; int want, got = 0; @@ -1529,8 +1531,7 @@ again: want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; - ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, - &got, &pinned_page); + ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got); if (ret < 0) { if (iocb->ki_flags & IOCB_DIRECT) ceph_end_io_direct(inode); @@ -1571,10 +1572,6 @@ again: dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); - if (pinned_page) { - put_page(pinned_page); - pinned_page = NULL; - } ceph_put_cap_refs(ci, got); if (direct_lock) @@ -1753,8 +1750,7 @@ retry_snap: else want = CEPH_CAP_FILE_BUFFER; got = 0; - err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, - &got, NULL); + err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got); if (err < 0) goto out; @@ -2083,7 +2079,7 @@ static long ceph_fallocate(struct file *file, int mode, else want = CEPH_CAP_FILE_BUFFER; - ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); + ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got); if (ret < 0) goto unlock; @@ -2121,7 +2117,7 @@ static int get_rd_wr_caps(struct file *src_filp, int *src_got, retry_caps: ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, - dst_endoff, dst_got, NULL); + dst_endoff, dst_got); if (ret < 0) return ret; @@ -2143,7 +2139,7 @@ retry_caps: return ret; } ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD, - CEPH_CAP_FILE_SHARED, -1, src_got, NULL); + CEPH_CAP_FILE_SHARED, -1, src_got); if (ret < 0) return ret; /*... drop src_ci caps too, and retry */ diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 689e3ffd29d7..e1c63adb196d 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -56,6 +56,9 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) { struct inode *inode; + if (ceph_vino_is_reserved(vino)) + return ERR_PTR(-EREMOTEIO); + inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare, ceph_set_ino_cb, &vino); if (!inode) @@ -99,14 +102,15 @@ struct inode *ceph_get_snapdir(struct inode *parent) inode->i_mtime = parent->i_mtime; inode->i_ctime = parent->i_ctime; inode->i_atime = parent->i_atime; - inode->i_op = &ceph_snapdir_iops; - inode->i_fop = &ceph_snapdir_fops; - ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ ci->i_rbytes = 0; ci->i_btime = ceph_inode(parent)->i_btime; - if (inode->i_state & I_NEW) + if (inode->i_state & I_NEW) { + inode->i_op = &ceph_snapdir_iops; + inode->i_fop = &ceph_snapdir_fops; + ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ unlock_new_inode(inode); + } return inode; } @@ -628,10 +632,11 @@ int ceph_fill_file_size(struct inode *inode, int issued, { struct ceph_inode_info *ci = ceph_inode(inode); int queue_trunc = 0; + loff_t isize = i_size_read(inode); if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || - (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) { - dout("size %lld -> %llu\n", inode->i_size, size); + (truncate_seq == ci->i_truncate_seq && size > isize)) { + dout("size %lld -> %llu\n", isize, size); if (size > 0 && S_ISDIR(inode->i_mode)) { pr_err("fill_file_size non-zero size for directory\n"); size = 0; @@ -925,6 +930,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, ci->i_rfiles = le64_to_cpu(info->rfiles); ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); ci->i_dir_pin = iinfo->dir_pin; + ci->i_rsnaps = iinfo->rsnaps; ceph_decode_timespec64(&ci->i_rctime, &info->rctime); } } @@ -1818,7 +1824,7 @@ bool ceph_inode_set_size(struct inode *inode, loff_t size) bool ret; spin_lock(&ci->i_ceph_lock); - dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); + dout("set_size %p %llu -> %llu\n", inode, i_size_read(inode), size); i_size_write(inode, size); inode->i_blocks = calc_inode_blocks(size); @@ -1894,6 +1900,7 @@ static void ceph_do_invalidate_pages(struct inode *inode) orig_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); + ceph_fscache_invalidate(inode); if (invalidate_inode_pages2(inode->i_mapping) < 0) { pr_err("invalidate_pages %p fails\n", inode); } @@ -2124,20 +2131,19 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) } } if (ia_valid & ATTR_SIZE) { - dout("setattr %p size %lld -> %lld\n", inode, - inode->i_size, attr->ia_size); - if ((issued & CEPH_CAP_FILE_EXCL) && - attr->ia_size > inode->i_size) { + loff_t isize = i_size_read(inode); + + dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size); + if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size > isize) { i_size_write(inode, attr->ia_size); inode->i_blocks = calc_inode_blocks(attr->ia_size); ci->i_reported_size = attr->ia_size; dirtied |= CEPH_CAP_FILE_EXCL; ia_valid |= ATTR_MTIME; } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || - attr->ia_size != inode->i_size) { + attr->ia_size != isize) { req->r_args.setattr.size = cpu_to_le64(attr->ia_size); - req->r_args.setattr.old_size = - cpu_to_le64(inode->i_size); + req->r_args.setattr.old_size = cpu_to_le64(isize); mask |= CEPH_SETATTR_SIZE; release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; @@ -2247,7 +2253,7 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, return err; if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size > max(inode->i_size, fsc->max_file_size)) + attr->ia_size > max(i_size_read(inode), fsc->max_file_size)) return -EFBIG; if ((attr->ia_valid & ATTR_SIZE) && diff --git a/fs/ceph/io.c b/fs/ceph/io.c index 97602ea92ff4..c456509b31c3 100644 --- a/fs/ceph/io.c +++ b/fs/ceph/io.c @@ -118,7 +118,7 @@ static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode) } /** - * ceph_end_io_direct - declare the file is being used for direct i/o + * ceph_start_io_direct - declare the file is being used for direct i/o * @inode: file inode * * Declare that a direct I/O operation is about to start, and ensure diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d87bd852ed96..e5af591d3bd4 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -176,6 +176,13 @@ static int parse_reply_info_in(void **p, void *end, memset(&info->snap_btime, 0, sizeof(info->snap_btime)); } + /* snapshot count, remains zero for v<=3 */ + if (struct_v >= 4) { + ceph_decode_64_safe(p, end, info->rsnaps, bad); + } else { + info->rsnaps = 0; + } + *p = end; } else { if (features & CEPH_FEATURE_MDS_INLINE_DATA) { @@ -214,7 +221,7 @@ static int parse_reply_info_in(void **p, void *end, } info->dir_pin = -ENODATA; - /* info->snap_btime remains zero */ + /* info->snap_btime and info->rsnaps remain zero */ } return 0; bad: @@ -433,6 +440,13 @@ static int ceph_parse_deleg_inos(void **p, void *end, ceph_decode_64_safe(p, end, start, bad); ceph_decode_64_safe(p, end, len, bad); + + /* Don't accept a delegation of system inodes */ + if (start < CEPH_INO_SYSTEM_BASE) { + pr_warn_ratelimited("ceph: ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n", + start, len); + continue; + } while (len--) { int err = xa_insert(&s->s_delegated_inos, ino = start++, DELEGATED_INO_AVAILABLE, @@ -3306,7 +3320,7 @@ out_err: /* kick calling process */ complete_request(mdsc, req); - ceph_update_metadata_latency(&mdsc->metric, req->r_start_latency, + ceph_update_metadata_metrics(&mdsc->metric, req->r_start_latency, req->r_end_latency, err); out: ceph_mdsc_put_request(req); @@ -3780,7 +3794,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, rec.v1.cap_id = cpu_to_le64(cap->cap_id); rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); rec.v1.issued = cpu_to_le32(cap->issued); - rec.v1.size = cpu_to_le64(inode->i_size); + rec.v1.size = cpu_to_le64(i_size_read(inode)); ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime); ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime); rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index eaa7c5422116..15c11a0f2caf 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -88,6 +88,7 @@ struct ceph_mds_reply_info_in { s32 dir_pin; struct ceph_timespec btime; struct ceph_timespec snap_btime; + u64 rsnaps; u64 change_attr; }; diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 5ec94bd4c1de..28b6b42ad677 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -17,6 +17,9 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, struct ceph_metric_write_latency *write; struct ceph_metric_metadata_latency *meta; struct ceph_metric_dlease *dlease; + struct ceph_opened_files *files; + struct ceph_pinned_icaps *icaps; + struct ceph_opened_inodes *inodes; struct ceph_client_metric *m = &mdsc->metric; u64 nr_caps = atomic64_read(&m->total_caps); struct ceph_msg *msg; @@ -26,7 +29,8 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, s32 len; len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write) - + sizeof(*meta) + sizeof(*dlease); + + sizeof(*meta) + sizeof(*dlease) + sizeof(*files) + + sizeof(*icaps) + sizeof(*inodes); msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true); if (!msg) { @@ -95,6 +99,38 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, dlease->total = cpu_to_le64(atomic64_read(&m->total_dentries)); items++; + sum = percpu_counter_sum(&m->total_inodes); + + /* encode the opened files metric */ + files = (struct ceph_opened_files *)(dlease + 1); + files->type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_FILES); + files->ver = 1; + files->compat = 1; + files->data_len = cpu_to_le32(sizeof(*files) - 10); + files->opened_files = cpu_to_le64(atomic64_read(&m->opened_files)); + files->total = cpu_to_le64(sum); + items++; + + /* encode the pinned icaps metric */ + icaps = (struct ceph_pinned_icaps *)(files + 1); + icaps->type = cpu_to_le32(CLIENT_METRIC_TYPE_PINNED_ICAPS); + icaps->ver = 1; + icaps->compat = 1; + icaps->data_len = cpu_to_le32(sizeof(*icaps) - 10); + icaps->pinned_icaps = cpu_to_le64(nr_caps); + icaps->total = cpu_to_le64(sum); + items++; + + /* encode the opened inodes metric */ + inodes = (struct ceph_opened_inodes *)(icaps + 1); + inodes->type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_INODES); + inodes->ver = 1; + inodes->compat = 1; + inodes->data_len = cpu_to_le32(sizeof(*inodes) - 10); + inodes->opened_inodes = cpu_to_le64(percpu_counter_sum(&m->opened_inodes)); + inodes->total = cpu_to_le64(sum); + items++; + put_unaligned_le32(items, &head->num); msg->front.iov_len = len; msg->hdr.version = cpu_to_le16(1); @@ -183,21 +219,21 @@ int ceph_metric_init(struct ceph_client_metric *m) if (ret) goto err_i_caps_mis; - spin_lock_init(&m->read_latency_lock); + spin_lock_init(&m->read_metric_lock); m->read_latency_sq_sum = 0; m->read_latency_min = KTIME_MAX; m->read_latency_max = 0; m->total_reads = 0; m->read_latency_sum = 0; - spin_lock_init(&m->write_latency_lock); + spin_lock_init(&m->write_metric_lock); m->write_latency_sq_sum = 0; m->write_latency_min = KTIME_MAX; m->write_latency_max = 0; m->total_writes = 0; m->write_latency_sum = 0; - spin_lock_init(&m->metadata_latency_lock); + spin_lock_init(&m->metadata_metric_lock); m->metadata_latency_sq_sum = 0; m->metadata_latency_min = KTIME_MAX; m->metadata_latency_max = 0; @@ -274,7 +310,7 @@ static inline void __update_latency(ktime_t *totalp, ktime_t *lsump, *sq_sump += sq; } -void ceph_update_read_latency(struct ceph_client_metric *m, +void ceph_update_read_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, int rc) { @@ -283,14 +319,14 @@ void ceph_update_read_latency(struct ceph_client_metric *m, if (unlikely(rc < 0 && rc != -ENOENT && rc != -ETIMEDOUT)) return; - spin_lock(&m->read_latency_lock); + spin_lock(&m->read_metric_lock); __update_latency(&m->total_reads, &m->read_latency_sum, &m->read_latency_min, &m->read_latency_max, &m->read_latency_sq_sum, lat); - spin_unlock(&m->read_latency_lock); + spin_unlock(&m->read_metric_lock); } -void ceph_update_write_latency(struct ceph_client_metric *m, +void ceph_update_write_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, int rc) { @@ -299,14 +335,14 @@ void ceph_update_write_latency(struct ceph_client_metric *m, if (unlikely(rc && rc != -ETIMEDOUT)) return; - spin_lock(&m->write_latency_lock); + spin_lock(&m->write_metric_lock); __update_latency(&m->total_writes, &m->write_latency_sum, &m->write_latency_min, &m->write_latency_max, &m->write_latency_sq_sum, lat); - spin_unlock(&m->write_latency_lock); + spin_unlock(&m->write_metric_lock); } -void ceph_update_metadata_latency(struct ceph_client_metric *m, +void ceph_update_metadata_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, int rc) { @@ -315,9 +351,9 @@ void ceph_update_metadata_latency(struct ceph_client_metric *m, if (unlikely(rc && rc != -ENOENT)) return; - spin_lock(&m->metadata_latency_lock); + spin_lock(&m->metadata_metric_lock); __update_latency(&m->total_metadatas, &m->metadata_latency_sum, &m->metadata_latency_min, &m->metadata_latency_max, &m->metadata_latency_sq_sum, lat); - spin_unlock(&m->metadata_latency_lock); + spin_unlock(&m->metadata_metric_lock); } diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index af6038ff39d4..e984eb2bb14b 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -14,8 +14,11 @@ enum ceph_metric_type { CLIENT_METRIC_TYPE_WRITE_LATENCY, CLIENT_METRIC_TYPE_METADATA_LATENCY, CLIENT_METRIC_TYPE_DENTRY_LEASE, + CLIENT_METRIC_TYPE_OPENED_FILES, + CLIENT_METRIC_TYPE_PINNED_ICAPS, + CLIENT_METRIC_TYPE_OPENED_INODES, - CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE, + CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_OPENED_INODES, }; /* @@ -28,6 +31,9 @@ enum ceph_metric_type { CLIENT_METRIC_TYPE_WRITE_LATENCY, \ CLIENT_METRIC_TYPE_METADATA_LATENCY, \ CLIENT_METRIC_TYPE_DENTRY_LEASE, \ + CLIENT_METRIC_TYPE_OPENED_FILES, \ + CLIENT_METRIC_TYPE_PINNED_ICAPS, \ + CLIENT_METRIC_TYPE_OPENED_INODES, \ \ CLIENT_METRIC_TYPE_MAX, \ } @@ -94,6 +100,42 @@ struct ceph_metric_dlease { __le64 total; } __packed; +/* metric opened files header */ +struct ceph_opened_files { + __le32 type; /* ceph metric type */ + + __u8 ver; + __u8 compat; + + __le32 data_len; /* length of sizeof(opened_files + total) */ + __le64 opened_files; + __le64 total; +} __packed; + +/* metric pinned i_caps header */ +struct ceph_pinned_icaps { + __le32 type; /* ceph metric type */ + + __u8 ver; + __u8 compat; + + __le32 data_len; /* length of sizeof(pinned_icaps + total) */ + __le64 pinned_icaps; + __le64 total; +} __packed; + +/* metric opened inodes header */ +struct ceph_opened_inodes { + __le32 type; /* ceph metric type */ + + __u8 ver; + __u8 compat; + + __le32 data_len; /* length of sizeof(opened_inodes + total) */ + __le64 opened_inodes; + __le64 total; +} __packed; + struct ceph_metric_head { __le32 num; /* the number of metrics that will be sent */ } __packed; @@ -108,21 +150,21 @@ struct ceph_client_metric { struct percpu_counter i_caps_hit; struct percpu_counter i_caps_mis; - spinlock_t read_latency_lock; + spinlock_t read_metric_lock; u64 total_reads; ktime_t read_latency_sum; ktime_t read_latency_sq_sum; ktime_t read_latency_min; ktime_t read_latency_max; - spinlock_t write_latency_lock; + spinlock_t write_metric_lock; u64 total_writes; ktime_t write_latency_sum; ktime_t write_latency_sq_sum; ktime_t write_latency_min; ktime_t write_latency_max; - spinlock_t metadata_latency_lock; + spinlock_t metadata_metric_lock; u64 total_metadatas; ktime_t metadata_latency_sum; ktime_t metadata_latency_sq_sum; @@ -162,13 +204,13 @@ static inline void ceph_update_cap_mis(struct ceph_client_metric *m) percpu_counter_inc(&m->i_caps_mis); } -extern void ceph_update_read_latency(struct ceph_client_metric *m, +extern void ceph_update_read_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, int rc); -extern void ceph_update_write_latency(struct ceph_client_metric *m, +extern void ceph_update_write_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, int rc); -extern void ceph_update_metadata_latency(struct ceph_client_metric *m, +extern void ceph_update_metadata_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, int rc); #endif /* _FS_CEPH_MDS_METRIC_H */ diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 0728b01d4d43..4ce18055d931 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -605,7 +605,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); BUG_ON(capsnap->writing); - capsnap->size = inode->i_size; + capsnap->size = i_size_read(inode); capsnap->mtime = inode->i_mtime; capsnap->atime = inode->i_atime; capsnap->ctime = inode->i_ctime; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index c48bb30c8d70..db80d89556b1 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -21,6 +21,7 @@ #include <linux/ceph/libceph.h> #ifdef CONFIG_CEPH_FSCACHE +#define FSCACHE_USE_NEW_IO_API #include <linux/fscache.h> #endif @@ -333,7 +334,7 @@ struct ceph_inode_info { /* for dirs */ struct timespec64 i_rctime; - u64 i_rbytes, i_rfiles, i_rsubdirs; + u64 i_rbytes, i_rfiles, i_rsubdirs, i_rsnaps; u64 i_files, i_subdirs; /* quotas */ @@ -427,7 +428,6 @@ struct ceph_inode_info { #ifdef CONFIG_CEPH_FSCACHE struct fscache_cookie *fscache; - u32 i_fscache_gen; #endif errseq_t i_meta_err; @@ -529,10 +529,34 @@ static inline int ceph_ino_compare(struct inode *inode, void *data) ci->i_vino.snap == pvino->snap; } +/* + * The MDS reserves a set of inodes for its own usage. These should never + * be accessible by clients, and so the MDS has no reason to ever hand these + * out. The range is CEPH_MDS_INO_MDSDIR_OFFSET..CEPH_INO_SYSTEM_BASE. + * + * These come from src/mds/mdstypes.h in the ceph sources. + */ +#define CEPH_MAX_MDS 0x100 +#define CEPH_NUM_STRAY 10 +#define CEPH_MDS_INO_MDSDIR_OFFSET (1 * CEPH_MAX_MDS) +#define CEPH_INO_SYSTEM_BASE ((6*CEPH_MAX_MDS) + (CEPH_MAX_MDS * CEPH_NUM_STRAY)) + +static inline bool ceph_vino_is_reserved(const struct ceph_vino vino) +{ + if (vino.ino < CEPH_INO_SYSTEM_BASE && + vino.ino >= CEPH_MDS_INO_MDSDIR_OFFSET) { + WARN_RATELIMIT(1, "Attempt to access reserved inode number 0x%llx", vino.ino); + return true; + } + return false; +} static inline struct inode *ceph_find_inode(struct super_block *sb, struct ceph_vino vino) { + if (ceph_vino_is_reserved(vino)) + return NULL; + /* * NB: The hashval will be run through the fs/inode.c hash function * anyway, so there is no need to squash the inode number down to @@ -1156,7 +1180,7 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn, int mds, int drop, int unless); extern int ceph_get_caps(struct file *filp, int need, int want, - loff_t endoff, int *got, struct page **pinned_page); + loff_t endoff, int *got); extern int ceph_try_get_caps(struct inode *inode, int need, int want, bool nonblock, int *got); @@ -1193,7 +1217,7 @@ extern const struct dentry_operations ceph_dentry_ops; extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order); extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); -extern int ceph_handle_snapdir(struct ceph_mds_request *req, +extern struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req, struct dentry *dentry, int err); extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 02f59bcb4f27..1242db8d3444 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -233,6 +233,12 @@ static ssize_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val, return ceph_fmt_xattr(val, size, "%lld", ci->i_rsubdirs); } +static ssize_t ceph_vxattrcb_dir_rsnaps(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return ceph_fmt_xattr(val, size, "%lld", ci->i_rsnaps); +} + static ssize_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val, size_t size) { @@ -384,6 +390,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { XATTR_RSTAT_FIELD(dir, rentries), XATTR_RSTAT_FIELD(dir, rfiles), XATTR_RSTAT_FIELD(dir, rsubdirs), + XATTR_RSTAT_FIELD(dir, rsnaps), XATTR_RSTAT_FIELD(dir, rbytes), XATTR_RSTAT_FIELD(dir, rctime), { diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index 2a5325a7ae49..9c45b3a82ad9 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -55,6 +55,7 @@ #define CIFS_MOUNT_MODE_FROM_SID 0x10000000 /* retrieve mode from special ACE */ #define CIFS_MOUNT_RO_CACHE 0x20000000 /* assumes share will not change */ #define CIFS_MOUNT_RW_CACHE 0x40000000 /* assumes only client accessing */ +#define CIFS_MOUNT_SHUTDOWN 0x80000000 struct cifs_sb_info { struct rb_root tlink_tree; diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index 153d5c842a9b..37fc7d6ac457 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -57,6 +57,12 @@ struct smb_query_info { /* char buffer[]; */ } __packed; +/* + * Dumping the commonly used 16 byte (e.g. CCM and GCM128) keys still supported + * for backlevel compatibility, but is not sufficient for dumping the less + * frequently used GCM256 (32 byte) keys (see the newer "CIFS_DUMP_FULL_KEY" + * ioctl for dumping decryption info for GCM256 mounts) + */ struct smb3_key_debug_info { __u64 Suid; __u16 cipher_type; @@ -65,6 +71,31 @@ struct smb3_key_debug_info { __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE]; } __packed; +/* + * Dump variable-sized keys + */ +struct smb3_full_key_debug_info { + /* INPUT: size of userspace buffer */ + __u32 in_size; + + /* + * INPUT: 0 for current user, otherwise session to dump + * OUTPUT: session id that was dumped + */ + __u64 session_id; + __u16 cipher_type; + __u8 session_key_length; + __u8 server_in_key_length; + __u8 server_out_key_length; + __u8 data[]; + /* + * return this struct with the keys appended at the end: + * __u8 session_key[session_key_length]; + * __u8 server_in_key[server_in_key_length]; + * __u8 server_out_key[server_out_key_length]; + */ +} __packed; + struct smb3_notify { __u32 completion_filter; bool watch_tree; @@ -78,3 +109,20 @@ struct smb3_notify { #define CIFS_QUERY_INFO _IOWR(CIFS_IOCTL_MAGIC, 7, struct smb_query_info) #define CIFS_DUMP_KEY _IOWR(CIFS_IOCTL_MAGIC, 8, struct smb3_key_debug_info) #define CIFS_IOC_NOTIFY _IOW(CIFS_IOCTL_MAGIC, 9, struct smb3_notify) +#define CIFS_DUMP_FULL_KEY _IOWR(CIFS_IOCTL_MAGIC, 10, struct smb3_full_key_debug_info) +#define CIFS_IOC_SHUTDOWN _IOR ('X', 125, __u32) + +/* + * Flags for going down operation + */ +#define CIFS_GOING_FLAGS_DEFAULT 0x0 /* going down */ +#define CIFS_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ +#define CIFS_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ + +static inline bool cifs_forced_shutdown(struct cifs_sb_info *sbi) +{ + if (CIFS_MOUNT_SHUTDOWN & sbi->mnt_cifs_flags) + return true; + else + return false; +} diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 5f2c139143a7..2ffcb29d5c8f 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -75,7 +75,7 @@ bool enable_oplocks = true; bool linuxExtEnabled = true; bool lookupCacheEnabled = true; bool disable_legacy_dialects; /* false by default */ -bool enable_gcm_256; /* false by default, change when more servers support it */ +bool enable_gcm_256 = true; bool require_gcm_256; /* false by default */ unsigned int global_secflags = CIFSSEC_DEF; /* unsigned int ntlmv2_support = 0; */ @@ -133,6 +133,7 @@ struct workqueue_struct *cifsiod_wq; struct workqueue_struct *decrypt_wq; struct workqueue_struct *fileinfo_put_wq; struct workqueue_struct *cifsoplockd_wq; +struct workqueue_struct *deferredclose_wq; __u32 cifs_lock_secret; /* @@ -390,6 +391,8 @@ cifs_alloc_inode(struct super_block *sb) /* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME; */ INIT_LIST_HEAD(&cifs_inode->openFileList); INIT_LIST_HEAD(&cifs_inode->llist); + INIT_LIST_HEAD(&cifs_inode->deferred_closes); + spin_lock_init(&cifs_inode->deferred_lock); return &cifs_inode->vfs_inode; } @@ -860,13 +863,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type, goto out; } - /* cifs_setup_volume_info->smb3_parse_devname() redups UNC & prepath */ - kfree(cifs_sb->ctx->UNC); - cifs_sb->ctx->UNC = NULL; - kfree(cifs_sb->ctx->prepath); - cifs_sb->ctx->prepath = NULL; - - rc = cifs_setup_volume_info(cifs_sb->ctx, NULL, old_ctx->UNC); + rc = cifs_setup_volume_info(cifs_sb->ctx, NULL, NULL); if (rc) { root = ERR_PTR(rc); goto out; @@ -1637,9 +1634,16 @@ init_cifs(void) goto out_destroy_fileinfo_put_wq; } + deferredclose_wq = alloc_workqueue("deferredclose", + WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + if (!deferredclose_wq) { + rc = -ENOMEM; + goto out_destroy_cifsoplockd_wq; + } + rc = cifs_fscache_register(); if (rc) - goto out_destroy_cifsoplockd_wq; + goto out_destroy_deferredclose_wq; rc = cifs_init_inodecache(); if (rc) @@ -1707,6 +1711,8 @@ out_destroy_inodecache: cifs_destroy_inodecache(); out_unreg_fscache: cifs_fscache_unregister(); +out_destroy_deferredclose_wq: + destroy_workqueue(deferredclose_wq); out_destroy_cifsoplockd_wq: destroy_workqueue(cifsoplockd_wq); out_destroy_fileinfo_put_wq: @@ -1741,6 +1747,7 @@ exit_cifs(void) cifs_destroy_mids(); cifs_destroy_inodecache(); cifs_fscache_unregister(); + destroy_workqueue(deferredclose_wq); destroy_workqueue(cifsoplockd_wq); destroy_workqueue(decrypt_wq); destroy_workqueue(fileinfo_put_wq); diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index b23a0ee8c6f8..8488d7024462 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -1154,6 +1154,14 @@ struct cifs_pending_open { __u32 oplock; }; +struct cifs_deferred_close { + struct list_head dlist; + struct tcon_link *tlink; + __u16 netfid; + __u64 persistent_fid; + __u64 volatile_fid; +}; + /* * This info hangs off the cifsFileInfo structure, pointed to by llist. * This is used to track byte stream locks on the file @@ -1248,6 +1256,8 @@ struct cifsFileInfo { struct cifs_search_info srch_inf; struct work_struct oplock_break; /* work for oplock breaks */ struct work_struct put; /* work for the final part of _put */ + struct delayed_work deferred; + bool deferred_close_scheduled; /* Flag to indicate close is scheduled */ }; struct cifs_io_parms { @@ -1392,6 +1402,7 @@ struct cifsInodeInfo { #define CIFS_INO_DELETE_PENDING (3) /* delete pending on server */ #define CIFS_INO_INVALID_MAPPING (4) /* pagecache is invalid */ #define CIFS_INO_LOCK (5) /* lock bit for synchronization */ +#define CIFS_INO_MODIFIED_ATTR (6) /* Indicate change in mtime/ctime */ unsigned long flags; spinlock_t writers_lock; unsigned int writers; /* Number of writers on this inode */ @@ -1404,6 +1415,9 @@ struct cifsInodeInfo { struct fscache_cookie *fscache; #endif struct inode vfs_inode; + struct list_head deferred_closes; /* list of deferred closes */ + spinlock_t deferred_lock; /* protection on deferred list */ + bool lease_granted; /* Flag to indicate whether lease or oplock is granted. */ }; static inline struct cifsInodeInfo * @@ -1871,11 +1885,14 @@ extern bool disable_legacy_dialects; /* forbid vers=1.0 and vers=2.0 mounts */ void cifs_oplock_break(struct work_struct *work); void cifs_queue_oplock_break(struct cifsFileInfo *cfile); +void smb2_deferred_work_close(struct work_struct *work); +extern const struct slow_work_ops cifs_oplock_break_ops; extern struct workqueue_struct *cifsiod_wq; extern struct workqueue_struct *decrypt_wq; extern struct workqueue_struct *fileinfo_put_wq; extern struct workqueue_struct *cifsoplockd_wq; +extern struct workqueue_struct *deferredclose_wq; extern __u32 cifs_lock_secret; extern mempool_t *cifs_mid_poolp; diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index b53a87db282f..554d64fe171e 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -148,7 +148,8 @@ #define SMB3_SIGN_KEY_SIZE (16) /* - * Size of the smb3 encryption/decryption keys + * Size of the smb3 encryption/decryption key storage. + * This size is big enough to store any cipher key types. */ #define SMB3_ENC_DEC_KEY_SIZE (32) diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index a79d50001fbf..d30cba44ba29 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -267,6 +267,19 @@ extern void cifs_add_pending_open_locked(struct cifs_fid *fid, struct tcon_link *tlink, struct cifs_pending_open *open); extern void cifs_del_pending_open(struct cifs_pending_open *open); + +extern bool cifs_is_deferred_close(struct cifsFileInfo *cfile, + struct cifs_deferred_close **dclose); + +extern void cifs_add_deferred_close(struct cifsFileInfo *cfile, + struct cifs_deferred_close *dclose); + +extern void cifs_del_deferred_close(struct cifsFileInfo *cfile); + +extern void cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode); + +extern void cifs_close_all_deferred_files(struct cifs_tcon *cifs_tcon); + extern struct TCP_Server_Info *cifs_get_tcp_session(struct smb3_fs_context *ctx); extern void cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect); diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 121d8b4535b0..495c395f9def 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -392,16 +392,6 @@ cifs_echo_request(struct work_struct *work) int rc; struct TCP_Server_Info *server = container_of(work, struct TCP_Server_Info, echo.work); - unsigned long echo_interval; - - /* - * If we need to renegotiate, set echo interval to zero to - * immediately call echo service where we can renegotiate. - */ - if (server->tcpStatus == CifsNeedNegotiate) - echo_interval = 0; - else - echo_interval = server->echo_interval; /* * We cannot send an echo if it is disabled. @@ -412,7 +402,7 @@ cifs_echo_request(struct work_struct *work) server->tcpStatus == CifsExiting || server->tcpStatus == CifsNew || (server->ops->can_echo && !server->ops->can_echo(server)) || - time_before(jiffies, server->lstrp + echo_interval - HZ)) + time_before(jiffies, server->lstrp + server->echo_interval - HZ)) goto requeue_echo; rc = server->ops->echo ? server->ops->echo(server) : -ENOSYS; @@ -476,6 +466,7 @@ server_unresponsive(struct TCP_Server_Info *server) */ if ((server->tcpStatus == CifsGood || server->tcpStatus == CifsNeedNegotiate) && + (!server->ops->can_echo || server->ops->can_echo(server)) && time_after(jiffies, server->lstrp + 3 * server->echo_interval)) { cifs_server_dbg(VFS, "has not responded in %lu seconds. Reconnecting...\n", (3 * server->echo_interval) / HZ); @@ -3158,17 +3149,29 @@ out: int cifs_setup_volume_info(struct smb3_fs_context *ctx, const char *mntopts, const char *devname) { - int rc = 0; + int rc; - smb3_parse_devname(devname, ctx); + if (devname) { + cifs_dbg(FYI, "%s: devname=%s\n", __func__, devname); + rc = smb3_parse_devname(devname, ctx); + if (rc) { + cifs_dbg(VFS, "%s: failed to parse %s: %d\n", __func__, devname, rc); + return rc; + } + } if (mntopts) { char *ip; - cifs_dbg(FYI, "%s: mntopts=%s\n", __func__, mntopts); rc = smb3_parse_opt(mntopts, "ip", &ip); - if (!rc && !cifs_convert_address((struct sockaddr *)&ctx->dstaddr, ip, - strlen(ip))) { + if (rc) { + cifs_dbg(VFS, "%s: failed to parse ip options: %d\n", __func__, rc); + return rc; + } + + rc = cifs_convert_address((struct sockaddr *)&ctx->dstaddr, ip, strlen(ip)); + kfree(ip); + if (!rc) { cifs_dbg(VFS, "%s: failed to convert ip address\n", __func__); return -EINVAL; } @@ -3188,7 +3191,7 @@ cifs_setup_volume_info(struct smb3_fs_context *ctx, const char *mntopts, const c return -EINVAL; } - return rc; + return 0; } static int diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index c85aff838305..6bcd3e8f7cda 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -34,6 +34,7 @@ #include "cifs_fs_sb.h" #include "cifs_unicode.h" #include "fs_context.h" +#include "cifs_ioctl.h" static void renew_parental_timestamps(struct dentry *direntry) @@ -430,6 +431,9 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, __u32 oplock; struct cifsFileInfo *file_info; + if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) + return -EIO; + /* * Posix open is only called (at lookup time) for file create now. For * opens (rather than creates), because we do not know if it is a file @@ -546,6 +550,9 @@ int cifs_create(struct user_namespace *mnt_userns, struct inode *inode, cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %pd and dentry = 0x%p\n", inode, direntry, direntry); + if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) + return -EIO; + tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb)); rc = PTR_ERR(tlink); if (IS_ERR(tlink)) @@ -583,6 +590,9 @@ int cifs_mknod(struct user_namespace *mnt_userns, struct inode *inode, return -EINVAL; cifs_sb = CIFS_SB(inode->i_sb); + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 639c59596d4f..379a427f3c2f 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -45,6 +45,7 @@ #include "fscache.h" #include "smbdirect.h" #include "fs_context.h" +#include "cifs_ioctl.h" static inline int cifs_convert_flags(unsigned int flags) { @@ -322,9 +323,11 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, cfile->dentry = dget(dentry); cfile->f_flags = file->f_flags; cfile->invalidHandle = false; + cfile->deferred_close_scheduled = false; cfile->tlink = cifs_get_tlink(tlink); INIT_WORK(&cfile->oplock_break, cifs_oplock_break); INIT_WORK(&cfile->put, cifsFileInfo_put_work); + INIT_DELAYED_WORK(&cfile->deferred, smb2_deferred_work_close); mutex_init(&cfile->fh_mutex); spin_lock_init(&cfile->file_info_lock); @@ -539,6 +542,11 @@ int cifs_open(struct inode *inode, struct file *file) xid = get_xid(); cifs_sb = CIFS_SB(inode->i_sb); + if (unlikely(cifs_forced_shutdown(cifs_sb))) { + free_xid(xid); + return -EIO; + } + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { free_xid(xid); @@ -565,6 +573,20 @@ int cifs_open(struct inode *inode, struct file *file) file->f_op = &cifs_file_direct_ops; } + /* Get the cached handle as SMB2 close is deferred */ + rc = cifs_get_readable_path(tcon, full_path, &cfile); + if (rc == 0) { + if (file->f_flags == cfile->f_flags) { + file->private_data = cfile; + spin_lock(&CIFS_I(inode)->deferred_lock); + cifs_del_deferred_close(cfile); + spin_unlock(&CIFS_I(inode)->deferred_lock); + goto out; + } else { + _cifsFileInfo_put(cfile, true, false); + } + } + if (server->oplocks) oplock = REQ_OPLOCK; else @@ -846,11 +868,59 @@ reopen_error_exit: return rc; } +void smb2_deferred_work_close(struct work_struct *work) +{ + struct cifsFileInfo *cfile = container_of(work, + struct cifsFileInfo, deferred.work); + + spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); + cifs_del_deferred_close(cfile); + cfile->deferred_close_scheduled = false; + spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); + _cifsFileInfo_put(cfile, true, false); +} + int cifs_close(struct inode *inode, struct file *file) { + struct cifsFileInfo *cfile; + struct cifsInodeInfo *cinode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_deferred_close *dclose; + if (file->private_data != NULL) { - _cifsFileInfo_put(file->private_data, true, false); + cfile = file->private_data; file->private_data = NULL; + dclose = kmalloc(sizeof(struct cifs_deferred_close), GFP_KERNEL); + if ((cinode->oplock == CIFS_CACHE_RHW_FLG) && + cinode->lease_granted && + dclose) { + if (test_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) + inode->i_ctime = inode->i_mtime = current_time(inode); + spin_lock(&cinode->deferred_lock); + cifs_add_deferred_close(cfile, dclose); + if (cfile->deferred_close_scheduled && + delayed_work_pending(&cfile->deferred)) { + /* + * If there is no pending work, mod_delayed_work queues new work. + * So, Increase the ref count to avoid use-after-free. + */ + if (!mod_delayed_work(deferredclose_wq, + &cfile->deferred, cifs_sb->ctx->acregmax)) + cifsFileInfo_get(cfile); + } else { + /* Deferred close for files */ + queue_delayed_work(deferredclose_wq, + &cfile->deferred, cifs_sb->ctx->acregmax); + cfile->deferred_close_scheduled = true; + spin_unlock(&cinode->deferred_lock); + return 0; + } + spin_unlock(&cinode->deferred_lock); + _cifsFileInfo_put(cfile, true, false); + } else { + _cifsFileInfo_put(cfile, true, false); + kfree(dclose); + } } /* return code from the ->release op is always ignored */ @@ -1920,8 +1990,10 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, if (total_written > 0) { spin_lock(&d_inode(dentry)->i_lock); - if (*offset > d_inode(dentry)->i_size) + if (*offset > d_inode(dentry)->i_size) { i_size_write(d_inode(dentry), *offset); + d_inode(dentry)->i_blocks = (512 - 1 + *offset) >> 9; + } spin_unlock(&d_inode(dentry)->i_lock); } mark_inode_dirty_sync(d_inode(dentry)); @@ -1947,7 +2019,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, if (fsuid_only && !uid_eq(open_file->uid, current_fsuid())) continue; if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) { - if (!open_file->invalidHandle) { + if ((!open_file->invalidHandle)) { /* found a good file */ /* lock it so it will not be closed on us */ cifsFileInfo_get(open_file); @@ -2476,6 +2548,8 @@ retry: if (cfile) cifsFileInfo_put(cfile); free_xid(xid); + /* Indication to update ctime and mtime as close is deferred */ + set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags); return rc; } @@ -2577,13 +2651,17 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, if (rc > 0) { spin_lock(&inode->i_lock); - if (pos > inode->i_size) + if (pos > inode->i_size) { i_size_write(inode, pos); + inode->i_blocks = (512 - 1 + pos) >> 9; + } spin_unlock(&inode->i_lock); } unlock_page(page); put_page(page); + /* Indication to update ctime and mtime as close is deferred */ + set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags); return rc; } @@ -4744,6 +4822,8 @@ void cifs_oplock_break(struct work_struct *work) struct TCP_Server_Info *server = tcon->ses->server; int rc = 0; bool purge_cache = false; + bool is_deferred = false; + struct cifs_deferred_close *dclose; wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, TASK_UNINTERRUPTIBLE); @@ -4790,6 +4870,24 @@ oplock_break_ack: cinode); cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } + /* + * When oplock break is received and there are no active + * file handles but cached, then schedule deferred close immediately. + * So, new open will not use cached handle. + */ + spin_lock(&CIFS_I(inode)->deferred_lock); + is_deferred = cifs_is_deferred_close(cfile, &dclose); + if (is_deferred && + cfile->deferred_close_scheduled && + delayed_work_pending(&cfile->deferred)) { + /* + * If there is no pending work, mod_delayed_work queues new work. + * So, Increase the ref count to avoid use-after-free. + */ + if (!mod_delayed_work(deferredclose_wq, &cfile->deferred, 0)) + cifsFileInfo_get(cfile); + } + spin_unlock(&CIFS_I(inode)->deferred_lock); _cifsFileInfo_put(cfile, false /* do not wait for ourself */, false); cifs_done_oplock_break(cinode); } diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 3e0d016849e3..92d4ab029c91 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -476,6 +476,7 @@ smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx) /* move "pos" up to delimiter or NULL */ pos += len; + kfree(ctx->UNC); ctx->UNC = kstrndup(devname, pos - devname, GFP_KERNEL); if (!ctx->UNC) return -ENOMEM; @@ -486,6 +487,9 @@ smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx) if (*pos == '/' || *pos == '\\') pos++; + kfree(ctx->prepath); + ctx->prepath = NULL; + /* If pos is NULL then no prepath */ if (!*pos) return 0; @@ -1017,6 +1021,9 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, goto cifs_parse_mount_err; } ctx->max_channels = result.uint_32; + /* If more than one channel requested ... they want multichan */ + if (result.uint_32 > 1) + ctx->multichannel = true; break; case Opt_handletimeout: ctx->handle_timeout = result.uint_32; @@ -1138,7 +1145,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, /* if iocharset not set then load_nls_default * is used by caller */ - cifs_dbg(FYI, "iocharset set to %s\n", ctx->iocharset); + cifs_dbg(FYI, "iocharset set to %s\n", ctx->iocharset); break; case Opt_netbiosname: memset(ctx->source_rfc1001_name, 0x20, @@ -1642,6 +1649,7 @@ void smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb) cifs_dbg(VFS, "mount options mfsymlinks and sfu both enabled\n"); } } + cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SHUTDOWN; return; } diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index 002d864b8f7b..1dfa57982522 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -26,7 +26,6 @@ #include <linux/sched/signal.h> #include <linux/wait_bit.h> #include <linux/fiemap.h> - #include <asm/div64.h> #include "cifsfs.h" #include "cifspdu.h" @@ -38,7 +37,7 @@ #include "cifs_unicode.h" #include "fscache.h" #include "fs_context.h" - +#include "cifs_ioctl.h" static void cifs_set_ops(struct inode *inode) { @@ -1610,6 +1609,9 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) cifs_dbg(FYI, "cifs_unlink, dir=0x%p, dentry=0x%p\n", dir, dentry); + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); @@ -1632,6 +1634,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) goto unlink_out; } + cifs_close_all_deferred_files(tcon); if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { rc = CIFSPOSIXDelFile(xid, tcon, full_path, @@ -1872,6 +1875,8 @@ int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode, mode, inode); cifs_sb = CIFS_SB(inode->i_sb); + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); @@ -1954,6 +1959,11 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) } cifs_sb = CIFS_SB(inode->i_sb); + if (unlikely(cifs_forced_shutdown(cifs_sb))) { + rc = -EIO; + goto rmdir_exit; + } + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { rc = PTR_ERR(tlink); @@ -2088,6 +2098,9 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir, return -EINVAL; cifs_sb = CIFS_SB(source_dir->i_sb); + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); @@ -2109,6 +2122,7 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir, goto cifs_rename_exit; } + cifs_close_all_deferred_files(tcon); rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, to_name); @@ -2404,6 +2418,9 @@ int cifs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct inode *inode = d_inode(dentry); int rc; + if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) + return -EIO; + /* * We need to be sure that all dirty pages are written and the server * has actual ctime, mtime and file length. @@ -2476,6 +2493,9 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start, struct cifsFileInfo *cfile; int rc; + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; + /* * We need to be sure that all dirty pages are written as they * might fill holes on the server. @@ -2962,6 +2982,9 @@ cifs_setattr(struct user_namespace *mnt_userns, struct dentry *direntry, struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb); int rc, retries = 0; + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; + do { if (pTcon->unix_ext) rc = cifs_setattr_unix(direntry, attrs); diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index 08d99fec593e..d67d281ab863 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -33,6 +33,7 @@ #include "cifsfs.h" #include "cifs_ioctl.h" #include "smb2proto.h" +#include "smb2glob.h" #include <linux/btrfs.h> static long cifs_ioctl_query_info(unsigned int xid, struct file *filep, @@ -164,6 +165,164 @@ static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon, return rc; } +static int cifs_shutdown(struct super_block *sb, unsigned long arg) +{ + struct cifs_sb_info *sbi = CIFS_SB(sb); + __u32 flags; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(flags, (__u32 __user *)arg)) + return -EFAULT; + + if (flags > CIFS_GOING_FLAGS_NOLOGFLUSH) + return -EINVAL; + + if (cifs_forced_shutdown(sbi)) + return 0; + + cifs_dbg(VFS, "shut down requested (%d)", flags); +/* trace_cifs_shutdown(sb, flags);*/ + + /* + * see: + * https://man7.org/linux/man-pages/man2/ioctl_xfs_goingdown.2.html + * for more information and description of original intent of the flags + */ + switch (flags) { + /* + * We could add support later for default flag which requires: + * "Flush all dirty data and metadata to disk" + * would need to call syncfs or equivalent to flush page cache for + * the mount and then issue fsync to server (if nostrictsync not set) + */ + case CIFS_GOING_FLAGS_DEFAULT: + cifs_dbg(FYI, "shutdown with default flag not supported\n"); + return -EINVAL; + /* + * FLAGS_LOGFLUSH is easy since it asks to write out metadata (not + * data) but metadata writes are not cached on the client, so can treat + * it similarly to NOLOGFLUSH + */ + case CIFS_GOING_FLAGS_LOGFLUSH: + case CIFS_GOING_FLAGS_NOLOGFLUSH: + sbi->mnt_cifs_flags |= CIFS_MOUNT_SHUTDOWN; + return 0; + default: + return -EINVAL; + } + return 0; +} + +static int cifs_dump_full_key(struct cifs_tcon *tcon, struct smb3_full_key_debug_info __user *in) +{ + struct smb3_full_key_debug_info out; + struct cifs_ses *ses; + int rc = 0; + bool found = false; + u8 __user *end; + + if (!smb3_encryption_required(tcon)) { + rc = -EOPNOTSUPP; + goto out; + } + + /* copy user input into our output buffer */ + if (copy_from_user(&out, in, sizeof(out))) { + rc = -EINVAL; + goto out; + } + + if (!out.session_id) { + /* if ses id is 0, use current user session */ + ses = tcon->ses; + } else { + /* otherwise if a session id is given, look for it in all our sessions */ + struct cifs_ses *ses_it = NULL; + struct TCP_Server_Info *server_it = NULL; + + spin_lock(&cifs_tcp_ses_lock); + list_for_each_entry(server_it, &cifs_tcp_ses_list, tcp_ses_list) { + list_for_each_entry(ses_it, &server_it->smb_ses_list, smb_ses_list) { + if (ses_it->Suid == out.session_id) { + ses = ses_it; + /* + * since we are using the session outside the crit + * section, we need to make sure it won't be released + * so increment its refcount + */ + ses->ses_count++; + found = true; + goto search_end; + } + } + } +search_end: + spin_unlock(&cifs_tcp_ses_lock); + if (!found) { + rc = -ENOENT; + goto out; + } + } + + switch (ses->server->cipher_type) { + case SMB2_ENCRYPTION_AES128_CCM: + case SMB2_ENCRYPTION_AES128_GCM: + out.session_key_length = CIFS_SESS_KEY_SIZE; + out.server_in_key_length = out.server_out_key_length = SMB3_GCM128_CRYPTKEY_SIZE; + break; + case SMB2_ENCRYPTION_AES256_CCM: + case SMB2_ENCRYPTION_AES256_GCM: + out.session_key_length = CIFS_SESS_KEY_SIZE; + out.server_in_key_length = out.server_out_key_length = SMB3_GCM256_CRYPTKEY_SIZE; + break; + default: + rc = -EOPNOTSUPP; + goto out; + } + + /* check if user buffer is big enough to store all the keys */ + if (out.in_size < sizeof(out) + out.session_key_length + out.server_in_key_length + + out.server_out_key_length) { + rc = -ENOBUFS; + goto out; + } + + out.session_id = ses->Suid; + out.cipher_type = le16_to_cpu(ses->server->cipher_type); + + /* overwrite user input with our output */ + if (copy_to_user(in, &out, sizeof(out))) { + rc = -EINVAL; + goto out; + } + + /* append all the keys at the end of the user buffer */ + end = in->data; + if (copy_to_user(end, ses->auth_key.response, out.session_key_length)) { + rc = -EINVAL; + goto out; + } + end += out.session_key_length; + + if (copy_to_user(end, ses->smb3encryptionkey, out.server_in_key_length)) { + rc = -EINVAL; + goto out; + } + end += out.server_in_key_length; + + if (copy_to_user(end, ses->smb3decryptionkey, out.server_out_key_length)) { + rc = -EINVAL; + goto out; + } + +out: + if (found) + cifs_put_smb_ses(ses); + return rc; +} + long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) { struct inode *inode = file_inode(filep); @@ -277,6 +436,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) rc = -EOPNOTSUPP; break; case CIFS_DUMP_KEY: + /* + * Dump encryption keys. This is an old ioctl that only + * handles AES-128-{CCM,GCM}. + */ if (pSMBFile == NULL) break; if (!capable(CAP_SYS_ADMIN)) { @@ -304,6 +467,19 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) else rc = 0; break; + case CIFS_DUMP_FULL_KEY: + /* + * Dump encryption keys (handles any key sizes) + */ + if (pSMBFile == NULL) + break; + if (!capable(CAP_SYS_ADMIN)) { + rc = -EACCES; + break; + } + tcon = tlink_tcon(pSMBFile->tlink); + rc = cifs_dump_full_key(tcon, (void __user *)arg); + break; case CIFS_IOC_NOTIFY: if (!S_ISDIR(inode->i_mode)) { /* Notify can only be done on directories */ @@ -325,6 +501,9 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) rc = -EOPNOTSUPP; cifs_put_tlink(tlink); break; + case CIFS_IOC_SHUTDOWN: + rc = cifs_shutdown(inode->i_sb, arg); + break; default: cifs_dbg(FYI, "unsupported ioctl\n"); break; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 616e1bc0cc0a..970fcf2adb08 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -30,6 +30,7 @@ #include "cifs_fs_sb.h" #include "cifs_unicode.h" #include "smb2proto.h" +#include "cifs_ioctl.h" /* * M-F Symlink Functions - Begin @@ -518,6 +519,9 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, struct TCP_Server_Info *server; struct cifsInodeInfo *cifsInode; + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); @@ -679,9 +683,16 @@ cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, struct tcon_link *tlink; struct cifs_tcon *pTcon; const char *full_path; - void *page = alloc_dentry_path(); + void *page; struct inode *newinode = NULL; + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; + + page = alloc_dentry_path(); + if (!page) + return -ENOMEM; + xid = get_xid(); tlink = cifs_sb_tlink(cifs_sb); diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index c15a90e422be..7207a63819cb 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -672,6 +672,100 @@ cifs_add_pending_open(struct cifs_fid *fid, struct tcon_link *tlink, spin_unlock(&tlink_tcon(open->tlink)->open_file_lock); } +/* + * Critical section which runs after acquiring deferred_lock. + * As there is no reference count on cifs_deferred_close, pdclose + * should not be used outside deferred_lock. + */ +bool +cifs_is_deferred_close(struct cifsFileInfo *cfile, struct cifs_deferred_close **pdclose) +{ + struct cifs_deferred_close *dclose; + + list_for_each_entry(dclose, &CIFS_I(d_inode(cfile->dentry))->deferred_closes, dlist) { + if ((dclose->netfid == cfile->fid.netfid) && + (dclose->persistent_fid == cfile->fid.persistent_fid) && + (dclose->volatile_fid == cfile->fid.volatile_fid)) { + *pdclose = dclose; + return true; + } + } + return false; +} + +/* + * Critical section which runs after acquiring deferred_lock. + */ +void +cifs_add_deferred_close(struct cifsFileInfo *cfile, struct cifs_deferred_close *dclose) +{ + bool is_deferred = false; + struct cifs_deferred_close *pdclose; + + is_deferred = cifs_is_deferred_close(cfile, &pdclose); + if (is_deferred) { + kfree(dclose); + return; + } + + dclose->tlink = cfile->tlink; + dclose->netfid = cfile->fid.netfid; + dclose->persistent_fid = cfile->fid.persistent_fid; + dclose->volatile_fid = cfile->fid.volatile_fid; + list_add_tail(&dclose->dlist, &CIFS_I(d_inode(cfile->dentry))->deferred_closes); +} + +/* + * Critical section which runs after acquiring deferred_lock. + */ +void +cifs_del_deferred_close(struct cifsFileInfo *cfile) +{ + bool is_deferred = false; + struct cifs_deferred_close *dclose; + + is_deferred = cifs_is_deferred_close(cfile, &dclose); + if (!is_deferred) + return; + list_del(&dclose->dlist); + kfree(dclose); +} + +void +cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode) +{ + struct cifsFileInfo *cfile = NULL; + struct cifs_deferred_close *dclose; + + list_for_each_entry(cfile, &cifs_inode->openFileList, flist) { + spin_lock(&cifs_inode->deferred_lock); + if (cifs_is_deferred_close(cfile, &dclose)) + mod_delayed_work(deferredclose_wq, &cfile->deferred, 0); + spin_unlock(&cifs_inode->deferred_lock); + } +} + +void +cifs_close_all_deferred_files(struct cifs_tcon *tcon) +{ + struct cifsFileInfo *cfile; + struct list_head *tmp; + + spin_lock(&tcon->open_file_lock); + list_for_each(tmp, &tcon->openFileList) { + cfile = list_entry(tmp, struct cifsFileInfo, tlist); + if (delayed_work_pending(&cfile->deferred)) { + /* + * If there is no pending work, mod_delayed_work queues new work. + * So, Increase the ref count to avoid use-after-free. + */ + if (!mod_delayed_work(deferredclose_wq, &cfile->deferred, 0)) + cifsFileInfo_get(cfile); + } + } + spin_unlock(&tcon->open_file_lock); +} + /* parses DFS refferal V3 structure * caller is responsible for freeing target_nodes * returns: diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 63d517b9f2ff..a92a1fb7cb52 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -97,6 +97,12 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) return 0; } + if (!(ses->server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { + cifs_dbg(VFS, "server %s does not support multichannel\n", ses->server->hostname); + ses->chan_max = 1; + return 0; + } + /* * Make a copy of the iface list at the time and use that * instead so as to not hold the iface spinlock for opening diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index dd0eb665b680..21ef51d338e0 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -1861,6 +1861,8 @@ smb2_copychunk_range(const unsigned int xid, cpu_to_le32(min_t(u32, len, tcon->max_bytes_chunk)); /* Request server copy to target from src identified by key */ + kfree(retbuf); + retbuf = NULL; rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE, true /* is_fsctl */, (char *)pcchunk, @@ -3981,6 +3983,7 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, unsigned int epoch, bool *purge_cache) { oplock &= 0xFF; + cinode->lease_granted = false; if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) return; if (oplock == SMB2_OPLOCK_LEVEL_BATCH) { @@ -4007,6 +4010,7 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, unsigned int new_oplock = 0; oplock &= 0xFF; + cinode->lease_granted = true; if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) return; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index e36c2a867783..c205f93e0a10 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -841,6 +841,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) req->SecurityMode = 0; req->Capabilities = cpu_to_le32(server->vals->req_capabilities); + if (ses->chan_max > 1) + req->Capabilities |= cpu_to_le32(SMB2_GLOBAL_CAP_MULTI_CHANNEL); /* ClientGUID must be zero for SMB2.02 dialect */ if (server->vals->protocol_id == SMB20_PROT_ID) @@ -956,6 +958,13 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) /* Internal types */ server->capabilities |= SMB2_NT_FIND | SMB2_LARGE_FILES; + /* + * SMB3.0 supports only 1 cipher and doesn't have a encryption neg context + * Set the cipher type manually. + */ + if (server->dialect == SMB30_PROT_ID && (server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)) + server->cipher_type = SMB2_ENCRYPTION_AES128_CCM; + security_blob = smb2_get_data_area_len(&blob_offset, &blob_length, (struct smb2_sync_hdr *)rsp); /* @@ -1032,6 +1041,9 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) pneg_inbuf->Capabilities = cpu_to_le32(server->vals->req_capabilities); + if (tcon->ses->chan_max > 1) + pneg_inbuf->Capabilities |= cpu_to_le32(SMB2_GLOBAL_CAP_MULTI_CHANNEL); + memcpy(pneg_inbuf->Guid, server->client_guid, SMB2_CLIENT_GUID_SIZE); @@ -3895,10 +3907,10 @@ smb2_new_read_req(void **buf, unsigned int *total_len, * Related requests use info from previous read request * in chain. */ - shdr->SessionId = 0xFFFFFFFF; + shdr->SessionId = 0xFFFFFFFFFFFFFFFF; shdr->TreeId = 0xFFFFFFFF; - req->PersistentFileId = 0xFFFFFFFF; - req->VolatileFileId = 0xFFFFFFFF; + req->PersistentFileId = 0xFFFFFFFFFFFFFFFF; + req->VolatileFileId = 0xFFFFFFFFFFFFFFFF; } } if (remaining_bytes > io_parms->length) diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index d6df908dccad..dafcb6ab050d 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -12,6 +12,11 @@ #include <linux/tracepoint.h> +/* + * Please use this 3-part article as a reference for writing new tracepoints: + * https://lwn.net/Articles/379903/ + */ + /* For logging errors in read or write */ DECLARE_EVENT_CLASS(smb3_rw_err_class, TP_PROTO(unsigned int xid, @@ -529,16 +534,16 @@ DECLARE_EVENT_CLASS(smb3_exit_err_class, TP_ARGS(xid, func_name, rc), TP_STRUCT__entry( __field(unsigned int, xid) - __field(const char *, func_name) + __string(func_name, func_name) __field(int, rc) ), TP_fast_assign( __entry->xid = xid; - __entry->func_name = func_name; + __assign_str(func_name, func_name); __entry->rc = rc; ), TP_printk("\t%s: xid=%u rc=%d", - __entry->func_name, __entry->xid, __entry->rc) + __get_str(func_name), __entry->xid, __entry->rc) ) #define DEFINE_SMB3_EXIT_ERR_EVENT(name) \ @@ -583,14 +588,14 @@ DECLARE_EVENT_CLASS(smb3_enter_exit_class, TP_ARGS(xid, func_name), TP_STRUCT__entry( __field(unsigned int, xid) - __field(const char *, func_name) + __string(func_name, func_name) ), TP_fast_assign( __entry->xid = xid; - __entry->func_name = func_name; + __assign_str(func_name, func_name); ), TP_printk("\t%s: xid=%u", - __entry->func_name, __entry->xid) + __get_str(func_name), __entry->xid) ) #define DEFINE_SMB3_ENTER_EXIT_EVENT(name) \ @@ -857,16 +862,16 @@ DECLARE_EVENT_CLASS(smb3_reconnect_class, TP_STRUCT__entry( __field(__u64, currmid) __field(__u64, conn_id) - __field(char *, hostname) + __string(hostname, hostname) ), TP_fast_assign( __entry->currmid = currmid; __entry->conn_id = conn_id; - __entry->hostname = hostname; + __assign_str(hostname, hostname); ), TP_printk("conn_id=0x%llx server=%s current_mid=%llu", __entry->conn_id, - __entry->hostname, + __get_str(hostname), __entry->currmid) ) @@ -891,7 +896,7 @@ DECLARE_EVENT_CLASS(smb3_credit_class, TP_STRUCT__entry( __field(__u64, currmid) __field(__u64, conn_id) - __field(char *, hostname) + __string(hostname, hostname) __field(int, credits) __field(int, credits_to_add) __field(int, in_flight) @@ -899,7 +904,7 @@ DECLARE_EVENT_CLASS(smb3_credit_class, TP_fast_assign( __entry->currmid = currmid; __entry->conn_id = conn_id; - __entry->hostname = hostname; + __assign_str(hostname, hostname); __entry->credits = credits; __entry->credits_to_add = credits_to_add; __entry->in_flight = in_flight; @@ -907,7 +912,7 @@ DECLARE_EVENT_CLASS(smb3_credit_class, TP_printk("conn_id=0x%llx server=%s current_mid=%llu " "credits=%d credit_change=%d in_flight=%d", __entry->conn_id, - __entry->hostname, + __get_str(hostname), __entry->currmid, __entry->credits, __entry->credits_to_add, diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index e351b945135b..aa3e8ca0457c 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -30,6 +30,7 @@ #include "cifs_debug.h" #include "cifs_fs_sb.h" #include "cifs_unicode.h" +#include "cifs_ioctl.h" #define MAX_EA_VALUE_SIZE CIFSMaxBufSize #define CIFS_XATTR_CIFS_ACL "system.cifs_acl" /* DACL only */ @@ -421,6 +422,9 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) const char *full_path; void *page; + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) return -EOPNOTSUPP; diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index 9a3aed249692..c0395363eab9 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset:8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * configfs_internal.h - Internal stuff for configfs * * Based on sysfs: diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index b6098e02e20b..ac5e0c0e9181 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dir.c - Operations for configfs directories. * * Based on sysfs: diff --git a/fs/configfs/file.c b/fs/configfs/file.c index da8351d1e455..e26060dae70a 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * file.c - operations for regular (text) files. * * Based on sysfs: diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 42c348bb2903..eb5ec3e46283 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * inode.c - basic inode and dentry operations. * * Based on sysfs: diff --git a/fs/configfs/item.c b/fs/configfs/item.c index 704a4356f137..254170a82aa3 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * item.c - library routines for handling generic config items * * Based on kobject: diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 0c6e8cf61953..c2d820063ec4 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * mount.c - operations for initializing and mounting configfs. * * Based on sysfs: diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 77c854364e60..0623c3edcfb9 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * symlink.c - operations for configfs symlinks. * * Based on sysfs: diff --git a/fs/d_path.c b/fs/d_path.c index a69e2cd36e6e..270d62133996 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -326,9 +326,9 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen) /* * Write full pathname from the root of the filesystem into the buffer. */ -static char *__dentry_path(struct dentry *d, char *buf, int buflen) +static char *__dentry_path(const struct dentry *d, char *buf, int buflen) { - struct dentry *dentry; + const struct dentry *dentry; char *end, *retval; int len, seq = 0; int error = 0; @@ -347,7 +347,7 @@ restart: *retval = '/'; read_seqbegin_or_lock(&rename_lock, &seq); while (!IS_ROOT(dentry)) { - struct dentry *parent = dentry->d_parent; + const struct dentry *parent = dentry->d_parent; prefetch(parent); error = prepend_name(&end, &len, &dentry->d_name); @@ -371,13 +371,13 @@ Elong: return ERR_PTR(-ENAMETOOLONG); } -char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen) +char *dentry_path_raw(const struct dentry *dentry, char *buf, int buflen) { return __dentry_path(dentry, buf, buflen); } EXPORT_SYMBOL(dentry_path_raw); -char *dentry_path(struct dentry *dentry, char *buf, int buflen) +char *dentry_path(const struct dentry *dentry, char *buf, int buflen) { char *p = NULL; char *retval; @@ -144,6 +144,16 @@ struct wait_exceptional_entry_queue { struct exceptional_entry_key key; }; +/** + * enum dax_wake_mode: waitqueue wakeup behaviour + * @WAKE_ALL: wake all waiters in the waitqueue + * @WAKE_NEXT: wake only the first waiter in the waitqueue + */ +enum dax_wake_mode { + WAKE_ALL, + WAKE_NEXT, +}; + static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas, void *entry, struct exceptional_entry_key *key) { @@ -182,7 +192,8 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, * The important information it's conveying is whether the entry at * this index used to be a PMD entry. */ -static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all) +static void dax_wake_entry(struct xa_state *xas, void *entry, + enum dax_wake_mode mode) { struct exceptional_entry_key key; wait_queue_head_t *wq; @@ -196,7 +207,7 @@ static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all) * must be in the waitqueue and the following check will see them. */ if (waitqueue_active(wq)) - __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); + __wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key); } /* @@ -264,11 +275,11 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry) finish_wait(wq, &ewait.wait); } -static void put_unlocked_entry(struct xa_state *xas, void *entry) +static void put_unlocked_entry(struct xa_state *xas, void *entry, + enum dax_wake_mode mode) { - /* If we were the only waiter woken, wake the next one */ if (entry && !dax_is_conflict(entry)) - dax_wake_entry(xas, entry, false); + dax_wake_entry(xas, entry, mode); } /* @@ -286,7 +297,7 @@ static void dax_unlock_entry(struct xa_state *xas, void *entry) old = xas_store(xas, entry); xas_unlock_irq(xas); BUG_ON(!dax_is_locked(old)); - dax_wake_entry(xas, entry, false); + dax_wake_entry(xas, entry, WAKE_NEXT); } /* @@ -524,8 +535,8 @@ retry: dax_disassociate_entry(entry, mapping, false); xas_store(xas, NULL); /* undo the PMD join */ - dax_wake_entry(xas, entry, true); - mapping->nrexceptional--; + dax_wake_entry(xas, entry, WAKE_ALL); + mapping->nrpages -= PG_PMD_NR; entry = NULL; xas_set(xas, index); } @@ -541,7 +552,7 @@ retry: dax_lock_entry(xas, entry); if (xas_error(xas)) goto out_unlock; - mapping->nrexceptional++; + mapping->nrpages += 1UL << order; } out_unlock: @@ -622,7 +633,7 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping, entry = get_unlocked_entry(&xas, 0); if (entry) page = dax_busy_page(entry); - put_unlocked_entry(&xas, entry); + put_unlocked_entry(&xas, entry, WAKE_NEXT); if (page) break; if (++scanned % XA_CHECK_SCHED) @@ -661,10 +672,10 @@ static int __dax_invalidate_entry(struct address_space *mapping, goto out; dax_disassociate_entry(entry, mapping, trunc); xas_store(&xas, NULL); - mapping->nrexceptional--; + mapping->nrpages -= 1UL << dax_entry_order(entry); ret = 1; out: - put_unlocked_entry(&xas, entry); + put_unlocked_entry(&xas, entry, WAKE_ALL); xas_unlock_irq(&xas); return ret; } @@ -937,13 +948,13 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, xas_lock_irq(xas); xas_store(xas, entry); xas_clear_mark(xas, PAGECACHE_TAG_DIRTY); - dax_wake_entry(xas, entry, false); + dax_wake_entry(xas, entry, WAKE_NEXT); trace_dax_writeback_one(mapping->host, index, count); return ret; put_unlocked: - put_unlocked_entry(xas, entry); + put_unlocked_entry(xas, entry, WAKE_NEXT); return ret; } @@ -965,7 +976,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) return -EIO; - if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) + if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL) return 0; trace_dax_writeback_range(inode, xas.xa_index, end_index); @@ -1684,7 +1695,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) /* Did we race with someone splitting entry or so? */ if (!entry || dax_is_conflict(entry) || (order == 0 && !dax_is_pte_entry(entry))) { - put_unlocked_entry(&xas, entry); + put_unlocked_entry(&xas, entry, WAKE_NEXT); xas_unlock_irq(&xas); trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, VM_FAULT_NOPAGE); diff --git a/fs/dcache.c b/fs/dcache.c index 7d24ff7eb206..cf871a81f4fd 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -84,6 +84,8 @@ const struct qstr empty_name = QSTR_INIT("", 0); EXPORT_SYMBOL(empty_name); const struct qstr slash_name = QSTR_INIT("/", 1); EXPORT_SYMBOL(slash_name); +const struct qstr dotdot_name = QSTR_INIT("..", 2); +EXPORT_SYMBOL(dotdot_name); /* * This is the single most critical data structure when it comes diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 1d252164d97b..8129a430d789 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -45,10 +45,13 @@ static unsigned int debugfs_allow __ro_after_init = DEFAULT_DEBUGFS_ALLOW_BITS; static int debugfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *ia) { - int ret = security_locked_down(LOCKDOWN_DEBUGFS); + int ret; - if (ret && (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))) - return ret; + if (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) { + ret = security_locked_down(LOCKDOWN_DEBUGFS); + if (ret) + return ret; + } return simple_setattr(&init_user_ns, dentry, ia); } diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 49c5f9407098..88d95d96e36c 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -125,7 +125,7 @@ static ssize_t cluster_cluster_name_store(struct config_item *item, CONFIGFS_ATTR(cluster_, cluster_name); static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, - int *info_field, bool (*check_cb)(unsigned int x), + int *info_field, int (*check_cb)(unsigned int x), const char *buf, size_t len) { unsigned int x; @@ -137,8 +137,11 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, if (rc) return rc; - if (check_cb && check_cb(x)) - return -EINVAL; + if (check_cb) { + rc = check_cb(x); + if (rc) + return rc; + } *cl_field = x; *info_field = x; @@ -161,17 +164,53 @@ static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \ } \ CONFIGFS_ATTR(cluster_, name); -static bool dlm_check_zero(unsigned int x) +static int dlm_check_protocol_and_dlm_running(unsigned int x) +{ + switch (x) { + case 0: + /* TCP */ + break; + case 1: + /* SCTP */ + break; + default: + return -EINVAL; + } + + if (dlm_allow_conn) + return -EBUSY; + + return 0; +} + +static int dlm_check_zero_and_dlm_running(unsigned int x) +{ + if (!x) + return -EINVAL; + + if (dlm_allow_conn) + return -EBUSY; + + return 0; +} + +static int dlm_check_zero(unsigned int x) { - return !x; + if (!x) + return -EINVAL; + + return 0; } -static bool dlm_check_buffer_size(unsigned int x) +static int dlm_check_buffer_size(unsigned int x) { - return (x < DEFAULT_BUFFER_SIZE); + if (x < DEFAULT_BUFFER_SIZE) + return -EINVAL; + + return 0; } -CLUSTER_ATTR(tcp_port, dlm_check_zero); +CLUSTER_ATTR(tcp_port, dlm_check_zero_and_dlm_running); CLUSTER_ATTR(buffer_size, dlm_check_buffer_size); CLUSTER_ATTR(rsbtbl_size, dlm_check_zero); CLUSTER_ATTR(recover_timer, dlm_check_zero); @@ -179,7 +218,7 @@ CLUSTER_ATTR(toss_secs, dlm_check_zero); CLUSTER_ATTR(scan_secs, dlm_check_zero); CLUSTER_ATTR(log_debug, NULL); CLUSTER_ATTR(log_info, NULL); -CLUSTER_ATTR(protocol, NULL); +CLUSTER_ATTR(protocol, dlm_check_protocol_and_dlm_running); CLUSTER_ATTR(mark, NULL); CLUSTER_ATTR(timewarn_cs, dlm_check_zero); CLUSTER_ATTR(waitwarn_us, NULL); @@ -688,6 +727,7 @@ static ssize_t comm_mark_show(struct config_item *item, char *buf) static ssize_t comm_mark_store(struct config_item *item, const char *buf, size_t len) { + struct dlm_comm *comm; unsigned int mark; int rc; @@ -695,7 +735,15 @@ static ssize_t comm_mark_store(struct config_item *item, const char *buf, if (rc) return rc; - config_item_to_comm(item)->mark = mark; + if (mark == 0) + mark = dlm_config.ci_mark; + + comm = config_item_to_comm(item); + rc = dlm_lowcomms_nodes_set_mark(comm->nodeid, mark); + if (rc) + return rc; + + comm->mark = mark; return len; } @@ -870,24 +918,6 @@ int dlm_comm_seq(int nodeid, uint32_t *seq) return 0; } -void dlm_comm_mark(int nodeid, unsigned int *mark) -{ - struct dlm_comm *cm; - - cm = get_comm(nodeid); - if (!cm) { - *mark = dlm_config.ci_mark; - return; - } - - if (cm->mark) - *mark = cm->mark; - else - *mark = dlm_config.ci_mark; - - put_comm(cm); -} - int dlm_our_nodeid(void) { return local_comm ? local_comm->nodeid : 0; diff --git a/fs/dlm/config.h b/fs/dlm/config.h index c210250a2581..d2cd4bd20313 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -48,7 +48,6 @@ void dlm_config_exit(void); int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, int *count_out); int dlm_comm_seq(int nodeid, uint32_t *seq); -void dlm_comm_mark(int nodeid, unsigned int *mark); int dlm_our_nodeid(void); int dlm_our_addr(struct sockaddr_storage *addr, int num); diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index d6bbccb0ed15..d5bd990bcab8 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -542,6 +542,7 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) if (bucket >= ls->ls_rsbtbl_size) { kfree(ri); + ++*pos; return NULL; } tree = toss ? &ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 002123efc6b0..b93df39d0915 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -3541,8 +3541,6 @@ static int _create_message(struct dlm_ls *ls, int mb_len, if (!mh) return -ENOBUFS; - memset(mb, 0, mb_len); - ms = (struct dlm_message *) mb; ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 561dcad08ad6..c14cf2b7faab 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -404,12 +404,6 @@ static int threads_start(void) return error; } -static void threads_stop(void) -{ - dlm_scand_stop(); - dlm_lowcomms_stop(); -} - static int new_lockspace(const char *name, const char *cluster, uint32_t flags, int lvblen, const struct dlm_lockspace_ops *ops, void *ops_arg, @@ -702,8 +696,11 @@ int dlm_new_lockspace(const char *name, const char *cluster, ls_count++; if (error > 0) error = 0; - if (!ls_count) - threads_stop(); + if (!ls_count) { + dlm_scand_stop(); + dlm_lowcomms_shutdown(); + dlm_lowcomms_stop(); + } out: mutex_unlock(&ls_lock); return error; @@ -788,6 +785,11 @@ static int release_lockspace(struct dlm_ls *ls, int force) dlm_recoverd_stop(ls); + if (ls_count == 1) { + dlm_scand_stop(); + dlm_lowcomms_shutdown(); + } + dlm_callback_stop(ls); remove_lockspace(ls); @@ -880,7 +882,7 @@ int dlm_release_lockspace(void *lockspace, int force) if (!error) ls_count--; if (!ls_count) - threads_stop(); + dlm_lowcomms_stop(); mutex_unlock(&ls_lock); return error; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 372c34ff8594..166e36fcf3e4 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -102,6 +102,9 @@ struct listen_connection { struct work_struct rwork; }; +#define DLM_WQ_REMAIN_BYTES(e) (PAGE_SIZE - e->end) +#define DLM_WQ_LENGTH_BYTES(e) (e->end - e->offset) + /* An entry waiting to be sent */ struct writequeue_entry { struct list_head list; @@ -116,6 +119,7 @@ struct writequeue_entry { struct dlm_node_addr { struct list_head list; int nodeid; + int mark; int addr_count; int curr_addr_index; struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; @@ -134,7 +138,7 @@ static DEFINE_SPINLOCK(dlm_node_addrs_spin); static struct listen_connection listen_con; static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT]; static int dlm_local_count; -static int dlm_allow_conn; +int dlm_allow_conn; /* Work queues */ static struct workqueue_struct *recv_workqueue; @@ -303,7 +307,8 @@ static int addr_compare(const struct sockaddr_storage *x, } static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, - struct sockaddr *sa_out, bool try_new_addr) + struct sockaddr *sa_out, bool try_new_addr, + unsigned int *mark) { struct sockaddr_storage sas; struct dlm_node_addr *na; @@ -331,6 +336,8 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, if (!na->addr_count) return -ENOENT; + *mark = na->mark; + if (sas_out) memcpy(sas_out, &sas, sizeof(struct sockaddr_storage)); @@ -350,7 +357,8 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, return 0; } -static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid) +static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid, + unsigned int *mark) { struct dlm_node_addr *na; int rv = -EEXIST; @@ -364,6 +372,7 @@ static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid) for (addr_i = 0; addr_i < na->addr_count; addr_i++) { if (addr_compare(na->addr[addr_i], addr)) { *nodeid = na->nodeid; + *mark = na->mark; rv = 0; goto unlock; } @@ -412,6 +421,7 @@ int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) new_node->nodeid = nodeid; new_node->addr[0] = new_addr; new_node->addr_count = 1; + new_node->mark = dlm_config.ci_mark; list_add(&new_node->list, &dlm_node_addrs); spin_unlock(&dlm_node_addrs_spin); return 0; @@ -519,6 +529,23 @@ int dlm_lowcomms_connect_node(int nodeid) return 0; } +int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark) +{ + struct dlm_node_addr *na; + + spin_lock(&dlm_node_addrs_spin); + na = find_node_addr(nodeid); + if (!na) { + spin_unlock(&dlm_node_addrs_spin); + return -ENOENT; + } + + na->mark = mark; + spin_unlock(&dlm_node_addrs_spin); + + return 0; +} + static void lowcomms_error_report(struct sock *sk) { struct connection *con; @@ -685,10 +712,7 @@ static void shutdown_connection(struct connection *con) { int ret; - if (cancel_work_sync(&con->swork)) { - log_print("canceled swork for node %d", con->nodeid); - clear_bit(CF_WRITE_PENDING, &con->flags); - } + flush_work(&con->swork); mutex_lock(&con->sock_mutex); /* nothing to shutdown */ @@ -867,7 +891,7 @@ static int accept_from_sock(struct listen_connection *con) /* Get the new node's NODEID */ make_sockaddr(&peeraddr, 0, &len); - if (addr_to_nodeid(&peeraddr, &nodeid)) { + if (addr_to_nodeid(&peeraddr, &nodeid, &mark)) { unsigned char *b=(unsigned char *)&peeraddr; log_print("connect from non cluster node"); print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, @@ -876,9 +900,6 @@ static int accept_from_sock(struct listen_connection *con) return -1; } - dlm_comm_mark(nodeid, &mark); - sock_set_mark(newsock->sk, mark); - log_print("got connection from %d", nodeid); /* Check to see if we already have a connection to this node. This @@ -892,6 +913,8 @@ static int accept_from_sock(struct listen_connection *con) goto accept_err; } + sock_set_mark(newsock->sk, mark); + mutex_lock(&newcon->sock_mutex); if (newcon->sock) { struct connection *othercon = newcon->othercon; @@ -908,16 +931,18 @@ static int accept_from_sock(struct listen_connection *con) result = dlm_con_init(othercon, nodeid); if (result < 0) { kfree(othercon); + mutex_unlock(&newcon->sock_mutex); goto accept_err; } + lockdep_set_subclass(&othercon->sock_mutex, 1); newcon->othercon = othercon; } else { /* close other sock con if we have something new */ close_connection(othercon, false, true, false); } - mutex_lock_nested(&othercon->sock_mutex, 1); + mutex_lock(&othercon->sock_mutex); add_sock(newsock, othercon); addcon = othercon; mutex_unlock(&othercon->sock_mutex); @@ -930,6 +955,7 @@ static int accept_from_sock(struct listen_connection *con) addcon = newcon; } + set_bit(CF_CONNECTED, &addcon->flags); mutex_unlock(&newcon->sock_mutex); /* @@ -1015,8 +1041,6 @@ static void sctp_connect_to_sock(struct connection *con) struct socket *sock; unsigned int mark; - dlm_comm_mark(con->nodeid, &mark); - mutex_lock(&con->sock_mutex); /* Some odd races can cause double-connects, ignore them */ @@ -1029,7 +1053,7 @@ static void sctp_connect_to_sock(struct connection *con) } memset(&daddr, 0, sizeof(daddr)); - result = nodeid_to_addr(con->nodeid, &daddr, NULL, true); + result = nodeid_to_addr(con->nodeid, &daddr, NULL, true, &mark); if (result < 0) { log_print("no address for nodeid %d", con->nodeid); goto out; @@ -1104,13 +1128,11 @@ out: static void tcp_connect_to_sock(struct connection *con) { struct sockaddr_storage saddr, src_addr; + unsigned int mark; int addr_len; struct socket *sock = NULL; - unsigned int mark; int result; - dlm_comm_mark(con->nodeid, &mark); - mutex_lock(&con->sock_mutex); if (con->retries++ > MAX_CONNECT_RETRIES) goto out; @@ -1125,15 +1147,15 @@ static void tcp_connect_to_sock(struct connection *con) if (result < 0) goto out_err; - sock_set_mark(sock->sk, mark); - memset(&saddr, 0, sizeof(saddr)); - result = nodeid_to_addr(con->nodeid, &saddr, NULL, false); + result = nodeid_to_addr(con->nodeid, &saddr, NULL, false, &mark); if (result < 0) { log_print("no address for nodeid %d", con->nodeid); goto out_err; } + sock_set_mark(sock->sk, mark); + add_sock(sock, con); /* Bind to our cluster-known address connecting to avoid @@ -1330,70 +1352,72 @@ static struct writequeue_entry *new_writequeue_entry(struct connection *con, { struct writequeue_entry *entry; - entry = kmalloc(sizeof(struct writequeue_entry), allocation); + entry = kzalloc(sizeof(*entry), allocation); if (!entry) return NULL; - entry->page = alloc_page(allocation); + entry->page = alloc_page(allocation | __GFP_ZERO); if (!entry->page) { kfree(entry); return NULL; } - entry->offset = 0; - entry->len = 0; - entry->end = 0; - entry->users = 0; entry->con = con; + entry->users = 1; return entry; } -void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) +static struct writequeue_entry *new_wq_entry(struct connection *con, int len, + gfp_t allocation, char **ppc) { - struct connection *con; struct writequeue_entry *e; - int offset = 0; - if (len > LOWCOMMS_MAX_TX_BUFFER_LEN) { - BUILD_BUG_ON(PAGE_SIZE < LOWCOMMS_MAX_TX_BUFFER_LEN); - log_print("failed to allocate a buffer of size %d", len); - return NULL; + spin_lock(&con->writequeue_lock); + if (!list_empty(&con->writequeue)) { + e = list_last_entry(&con->writequeue, struct writequeue_entry, list); + if (DLM_WQ_REMAIN_BYTES(e) >= len) { + *ppc = page_address(e->page) + e->end; + e->end += len; + e->users++; + spin_unlock(&con->writequeue_lock); + + return e; + } } + spin_unlock(&con->writequeue_lock); - con = nodeid2con(nodeid, allocation); - if (!con) + e = new_writequeue_entry(con, allocation); + if (!e) return NULL; + *ppc = page_address(e->page); + e->end += len; + spin_lock(&con->writequeue_lock); - e = list_entry(con->writequeue.prev, struct writequeue_entry, list); - if ((&e->list == &con->writequeue) || - (PAGE_SIZE - e->end < len)) { - e = NULL; - } else { - offset = e->end; - e->end += len; - e->users++; - } + list_add_tail(&e->list, &con->writequeue); spin_unlock(&con->writequeue_lock); - if (e) { - got_one: - *ppc = page_address(e->page) + offset; - return e; - } + return e; +}; - e = new_writequeue_entry(con, allocation); - if (e) { - spin_lock(&con->writequeue_lock); - offset = e->end; - e->end += len; - e->users++; - list_add_tail(&e->list, &con->writequeue); - spin_unlock(&con->writequeue_lock); - goto got_one; +void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) +{ + struct connection *con; + + if (len > DEFAULT_BUFFER_SIZE || + len < sizeof(struct dlm_header)) { + BUILD_BUG_ON(PAGE_SIZE < DEFAULT_BUFFER_SIZE); + log_print("failed to allocate a buffer of size %d", len); + WARN_ON(1); + return NULL; } - return NULL; + + con = nodeid2con(nodeid, allocation); + if (!con) + return NULL; + + return new_wq_entry(con, len, allocation, ppc); } void dlm_lowcomms_commit_buffer(void *mh) @@ -1406,7 +1430,8 @@ void dlm_lowcomms_commit_buffer(void *mh) users = --e->users; if (users) goto out; - e->len = e->end - e->offset; + + e->len = DLM_WQ_LENGTH_BYTES(e); spin_unlock(&con->writequeue_lock); queue_work(send_workqueue, &con->swork); @@ -1432,11 +1457,10 @@ static void send_to_sock(struct connection *con) spin_lock(&con->writequeue_lock); for (;;) { - e = list_entry(con->writequeue.next, struct writequeue_entry, - list); - if ((struct list_head *) e == &con->writequeue) + if (list_empty(&con->writequeue)) break; + e = list_first_entry(&con->writequeue, struct writequeue_entry, list); len = e->len; offset = e->offset; BUG_ON(len == 0 && e->users == 0); @@ -1589,6 +1613,29 @@ static int work_start(void) return 0; } +static void shutdown_conn(struct connection *con) +{ + if (con->shutdown_action) + con->shutdown_action(con); +} + +void dlm_lowcomms_shutdown(void) +{ + /* Set all the flags to prevent any + * socket activity. + */ + dlm_allow_conn = 0; + + if (recv_workqueue) + flush_workqueue(recv_workqueue); + if (send_workqueue) + flush_workqueue(send_workqueue); + + dlm_close_sock(&listen_con.sock); + + foreach_conn(shutdown_conn); +} + static void _stop_conn(struct connection *con, bool and_other) { mutex_lock(&con->sock_mutex); @@ -1610,12 +1657,6 @@ static void stop_conn(struct connection *con) _stop_conn(con, true); } -static void shutdown_conn(struct connection *con) -{ - if (con->shutdown_action) - con->shutdown_action(con); -} - static void connection_release(struct rcu_head *rcu) { struct connection *con = container_of(rcu, struct connection, rcu); @@ -1672,19 +1713,6 @@ static void work_flush(void) void dlm_lowcomms_stop(void) { - /* Set all the flags to prevent any - socket activity. - */ - dlm_allow_conn = 0; - - if (recv_workqueue) - flush_workqueue(recv_workqueue); - if (send_workqueue) - flush_workqueue(send_workqueue); - - dlm_close_sock(&listen_con.sock); - - foreach_conn(shutdown_conn); work_flush(); foreach_conn(free_conn); work_stop(); diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index 0918f9376489..48bbc4e18761 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -14,13 +14,18 @@ #define LOWCOMMS_MAX_TX_BUFFER_LEN 4096 +/* switch to check if dlm is running */ +extern int dlm_allow_conn; + int dlm_lowcomms_start(void); +void dlm_lowcomms_shutdown(void); void dlm_lowcomms_stop(void); void dlm_lowcomms_exit(void); int dlm_lowcomms_close(int nodeid); void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc); void dlm_lowcomms_commit_buffer(void *mh); int dlm_lowcomms_connect_node(int nodeid); +int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark); int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len); #endif /* __LOWCOMMS_DOT_H__ */ diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index fde3a6afe4be..1c6654a21ec4 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -22,8 +22,6 @@ * into packets and sends them to the comms layer. */ -#include <asm/unaligned.h> - #include "dlm_internal.h" #include "lowcomms.h" #include "config.h" @@ -45,13 +43,22 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) while (len >= sizeof(struct dlm_header)) { hd = (struct dlm_header *)ptr; - /* no message should be more than this otherwise we - * cannot deliver this message to upper layers + /* no message should be more than DEFAULT_BUFFER_SIZE or + * less than dlm_header size. + * + * Some messages does not have a 8 byte length boundary yet + * which can occur in a unaligned memory access of some dlm + * messages. However this problem need to be fixed at the + * sending side, for now it seems nobody run into architecture + * related issues yet but it slows down some processing. + * Fixing this issue should be scheduled in future by doing + * the next major version bump. */ - msglen = get_unaligned_le16(&hd->h_length); - if (msglen > DEFAULT_BUFFER_SIZE) { - log_print("received invalid length header: %u, will abort message parsing", - msglen); + msglen = le16_to_cpu(hd->h_length); + if (msglen > DEFAULT_BUFFER_SIZE || + msglen < sizeof(struct dlm_header)) { + log_print("received invalid length header: %u from node %d, will abort message parsing", + msglen, nodeid); return -EBADMSG; } @@ -84,15 +91,7 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) goto skip; } - /* for aligned memory access, we just copy current message - * to begin of the buffer which contains already parsed buffer - * data and should provide align access for upper layers - * because the start address of the buffer has a aligned - * address. This memmove can be removed when the upperlayer - * is capable of unaligned memory access. - */ - memmove(buf, ptr, msglen); - dlm_receive_buffer((union dlm_packet *)buf, nodeid); + dlm_receive_buffer((union dlm_packet *)ptr, nodeid); skip: ret += msglen; diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 73ddee5159d7..f5b1bd65728d 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -41,7 +41,6 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, to_nodeid, type, len); return -ENOBUFS; } - memset(mb, 0, mb_len); rc = (struct dlm_rcom *) mb; @@ -462,7 +461,6 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_NOFS, &mb); if (!mh) return -ENOBUFS; - memset(mb, 0, mb_len); rc = (struct dlm_rcom *) mb; diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 943e523f4c9d..e3f5d7f3c8a0 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 1997-2004 Erez Zadok @@ -296,10 +296,6 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, struct extent_crypt_result ecr; int rc = 0; - if (!crypt_stat || !crypt_stat->tfm - || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) - return -EINVAL; - if (unlikely(ecryptfs_verbosity > 0)) { ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n", crypt_stat->key_size); @@ -350,7 +346,7 @@ out: return rc; } -/** +/* * lower_offset_for_page * * Convert an eCryptfs page index into a lower byte offset @@ -535,7 +531,7 @@ int ecryptfs_decrypt_page(struct page *page) rc = crypt_extent(crypt_stat, page, page, extent_offset, DECRYPT); if (rc) { - printk(KERN_ERR "%s: Error encrypting extent; " + printk(KERN_ERR "%s: Error decrypting extent; " "rc = [%d]\n", __func__, rc); goto out; } @@ -627,9 +623,8 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat) } } -/** +/* * ecryptfs_compute_root_iv - * @crypt_stats * * On error, sets the root IV to all 0's. */ @@ -1370,7 +1365,7 @@ int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry, return rc; } -/** +/* * ecryptfs_read_metadata * * Common entry point for reading file metadata. From here, we could @@ -1448,7 +1443,7 @@ out: return rc; } -/** +/* * ecryptfs_encrypt_filename - encrypt filename * * CBC-encrypts the filename. We do not want to encrypt the same @@ -1590,11 +1585,10 @@ out: struct kmem_cache *ecryptfs_key_tfm_cache; static struct list_head key_tfm_list; -struct mutex key_tfm_list_mutex; +DEFINE_MUTEX(key_tfm_list_mutex); int __init ecryptfs_init_crypto(void) { - mutex_init(&key_tfm_list_mutex); INIT_LIST_HEAD(&key_tfm_list); return 0; } @@ -1877,10 +1871,11 @@ out: /** * ecryptfs_encrypt_and_encode_filename - converts a plaintext file name to cipher text - * @crypt_stat: The crypt_stat struct associated with the file anem to encode + * @encoded_name: The encrypted name + * @encoded_name_size: Length of the encrypted name + * @mount_crypt_stat: The crypt_stat struct associated with the file name to encode * @name: The plaintext name - * @length: The length of the plaintext - * @encoded_name: The encypted name + * @name_size: The length of the plaintext name * * Encrypts and encodes a filename into something that constitutes a * valid filename for a filesystem, with printable characters. @@ -1992,7 +1987,7 @@ static bool is_dot_dotdot(const char *name, size_t name_size) * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext * @plaintext_name: The plaintext name * @plaintext_name_size: The plaintext name size - * @ecryptfs_dir_dentry: eCryptfs directory dentry + * @sb: Ecryptfs's super_block * @name: The filename in cipher text * @name_size: The cipher text name size * diff --git a/fs/ecryptfs/debug.c b/fs/ecryptfs/debug.c index 1f65e99f9a41..cf6d0e8e25a1 100644 --- a/fs/ecryptfs/debug.c +++ b/fs/ecryptfs/debug.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer * Functions only useful for debugging. * @@ -9,7 +9,7 @@ #include "ecryptfs_kernel.h" -/** +/* * ecryptfs_dump_auth_tok - debug function to print auth toks * * This function will print the contents of an ecryptfs authentication diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 44606f079efb..acaa0825e9bb 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 1997-2003 Erez Zadok diff --git a/fs/ecryptfs/ecryptfs_kernel.h b/fs/ecryptfs/ecryptfs_kernel.h index e6ac78c62ca4..5f2b49e13731 100644 --- a/fs/ecryptfs/ecryptfs_kernel.h +++ b/fs/ecryptfs/ecryptfs_kernel.h @@ -262,10 +262,7 @@ struct ecryptfs_inode_info { * vfsmount too. */ struct ecryptfs_dentry_info { struct path lower_path; - union { - struct ecryptfs_crypt_stat *crypt_stat; - struct rcu_head rcu; - }; + struct rcu_head rcu; }; /** @@ -496,12 +493,6 @@ ecryptfs_set_superblock_lower(struct super_block *sb, ((struct ecryptfs_sb_info *)sb->s_fs_info)->wsi_sb = lower_sb; } -static inline struct ecryptfs_dentry_info * -ecryptfs_dentry_to_private(struct dentry *dentry) -{ - return (struct ecryptfs_dentry_info *)dentry->d_fsdata; -} - static inline void ecryptfs_set_dentry_private(struct dentry *dentry, struct ecryptfs_dentry_info *dentry_info) @@ -515,12 +506,6 @@ ecryptfs_dentry_to_lower(struct dentry *dentry) return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.dentry; } -static inline struct vfsmount * -ecryptfs_dentry_to_lower_mnt(struct dentry *dentry) -{ - return ((struct ecryptfs_dentry_info *)dentry->d_fsdata)->lower_path.mnt; -} - static inline struct path * ecryptfs_dentry_to_lower_path(struct dentry *dentry) { @@ -528,7 +513,7 @@ ecryptfs_dentry_to_lower_path(struct dentry *dentry) } #define ecryptfs_printk(type, fmt, arg...) \ - __ecryptfs_printk(type "%s: " fmt, __func__, ## arg); + __ecryptfs_printk(type "%s: " fmt, __func__, ## arg) __printf(1, 2) void __ecryptfs_printk(const char *fmt, ...); diff --git a/fs/ecryptfs/file.c b/fs/ecryptfs/file.c index 5fb45d865ce5..18d5b91cb573 100644 --- a/fs/ecryptfs/file.c +++ b/fs/ecryptfs/file.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 1997-2004 Erez Zadok @@ -19,7 +19,7 @@ #include <linux/fs_stack.h> #include "ecryptfs_kernel.h" -/** +/* * ecryptfs_read_update_atime * * generic_file_read updates the atime of upper layer inode. But, it diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index 7169ea873347..16d50dface59 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 1997-2004 Erez Zadok @@ -22,19 +22,18 @@ #include <asm/unaligned.h> #include "ecryptfs_kernel.h" -static struct dentry *lock_parent(struct dentry *dentry) +static int lock_parent(struct dentry *dentry, + struct dentry **lower_dentry, + struct inode **lower_dir) { - struct dentry *dir; + struct dentry *lower_dir_dentry; - dir = dget_parent(dentry); - inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); - return dir; -} + lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); + *lower_dir = d_inode(lower_dir_dentry); + *lower_dentry = ecryptfs_dentry_to_lower(dentry); -static void unlock_dir(struct dentry *dir) -{ - inode_unlock(d_inode(dir)); - dput(dir); + inode_lock_nested(*lower_dir, I_MUTEX_PARENT); + return (*lower_dentry)->d_parent == lower_dir_dentry ? 0 : -EINVAL; } static int ecryptfs_inode_test(struct inode *inode, void *lower_inode) @@ -128,32 +127,29 @@ static int ecryptfs_interpose(struct dentry *lower_dentry, static int ecryptfs_do_unlink(struct inode *dir, struct dentry *dentry, struct inode *inode) { - struct dentry *lower_dentry = ecryptfs_dentry_to_lower(dentry); - struct dentry *lower_dir_dentry; - struct inode *lower_dir_inode; + struct dentry *lower_dentry; + struct inode *lower_dir; int rc; - lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); - lower_dir_inode = d_inode(lower_dir_dentry); - inode_lock_nested(lower_dir_inode, I_MUTEX_PARENT); + rc = lock_parent(dentry, &lower_dentry, &lower_dir); dget(lower_dentry); // don't even try to make the lower negative - if (lower_dentry->d_parent != lower_dir_dentry) - rc = -EINVAL; - else if (d_unhashed(lower_dentry)) - rc = -EINVAL; - else - rc = vfs_unlink(&init_user_ns, lower_dir_inode, lower_dentry, - NULL); + if (!rc) { + if (d_unhashed(lower_dentry)) + rc = -EINVAL; + else + rc = vfs_unlink(&init_user_ns, lower_dir, lower_dentry, + NULL); + } if (rc) { printk(KERN_ERR "Error in vfs_unlink; rc = [%d]\n", rc); goto out_unlock; } - fsstack_copy_attr_times(dir, lower_dir_inode); + fsstack_copy_attr_times(dir, lower_dir); set_nlink(inode, ecryptfs_inode_to_lower(inode)->i_nlink); inode->i_ctime = dir->i_ctime; out_unlock: dput(lower_dentry); - inode_unlock(lower_dir_inode); + inode_unlock(lower_dir); if (!rc) d_drop(dentry); return rc; @@ -177,13 +173,13 @@ ecryptfs_do_create(struct inode *directory_inode, { int rc; struct dentry *lower_dentry; - struct dentry *lower_dir_dentry; + struct inode *lower_dir; struct inode *inode; - lower_dentry = ecryptfs_dentry_to_lower(ecryptfs_dentry); - lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_create(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry, - mode, true); + rc = lock_parent(ecryptfs_dentry, &lower_dentry, &lower_dir); + if (!rc) + rc = vfs_create(&init_user_ns, lower_dir, + lower_dentry, mode, true); if (rc) { printk(KERN_ERR "%s: Failure to create dentry in lower fs; " "rc = [%d]\n", __func__, rc); @@ -193,18 +189,17 @@ ecryptfs_do_create(struct inode *directory_inode, inode = __ecryptfs_get_inode(d_inode(lower_dentry), directory_inode->i_sb); if (IS_ERR(inode)) { - vfs_unlink(&init_user_ns, d_inode(lower_dir_dentry), - lower_dentry, NULL); + vfs_unlink(&init_user_ns, lower_dir, lower_dentry, NULL); goto out_lock; } - fsstack_copy_attr_times(directory_inode, d_inode(lower_dir_dentry)); - fsstack_copy_inode_size(directory_inode, d_inode(lower_dir_dentry)); + fsstack_copy_attr_times(directory_inode, lower_dir); + fsstack_copy_inode_size(directory_inode, lower_dir); out_lock: - unlock_dir(lower_dir_dentry); + inode_unlock(lower_dir); return inode; } -/** +/* * ecryptfs_initialize_file * * Cause the file to be changed from a basic empty file to an ecryptfs @@ -247,10 +242,8 @@ out: return rc; } -/** +/* * ecryptfs_create - * @dir: The inode of the directory in which to create the file. - * @dentry: The eCryptfs dentry * @mode: The mode of the new file. * * Creates a new file. @@ -318,7 +311,7 @@ static int ecryptfs_i_size_read(struct dentry *dentry, struct inode *inode) return 0; } -/** +/* * ecryptfs_lookup_interpose - Dentry interposition for a lookup */ static struct dentry *ecryptfs_lookup_interpose(struct dentry *dentry, @@ -431,32 +424,28 @@ static int ecryptfs_link(struct dentry *old_dentry, struct inode *dir, { struct dentry *lower_old_dentry; struct dentry *lower_new_dentry; - struct dentry *lower_dir_dentry; + struct inode *lower_dir; u64 file_size_save; int rc; file_size_save = i_size_read(d_inode(old_dentry)); lower_old_dentry = ecryptfs_dentry_to_lower(old_dentry); - lower_new_dentry = ecryptfs_dentry_to_lower(new_dentry); - dget(lower_old_dentry); - dget(lower_new_dentry); - lower_dir_dentry = lock_parent(lower_new_dentry); - rc = vfs_link(lower_old_dentry, &init_user_ns, - d_inode(lower_dir_dentry), lower_new_dentry, NULL); + rc = lock_parent(new_dentry, &lower_new_dentry, &lower_dir); + if (!rc) + rc = vfs_link(lower_old_dentry, &init_user_ns, lower_dir, + lower_new_dentry, NULL); if (rc || d_really_is_negative(lower_new_dentry)) goto out_lock; rc = ecryptfs_interpose(lower_new_dentry, new_dentry, dir->i_sb); if (rc) goto out_lock; - fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); - fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); + fsstack_copy_attr_times(dir, lower_dir); + fsstack_copy_inode_size(dir, lower_dir); set_nlink(d_inode(old_dentry), ecryptfs_inode_to_lower(d_inode(old_dentry))->i_nlink); i_size_write(d_inode(new_dentry), file_size_save); out_lock: - unlock_dir(lower_dir_dentry); - dput(lower_new_dentry); - dput(lower_old_dentry); + inode_unlock(lower_dir); return rc; } @@ -471,14 +460,14 @@ static int ecryptfs_symlink(struct user_namespace *mnt_userns, { int rc; struct dentry *lower_dentry; - struct dentry *lower_dir_dentry; + struct inode *lower_dir; char *encoded_symname; size_t encoded_symlen; struct ecryptfs_mount_crypt_stat *mount_crypt_stat = NULL; - lower_dentry = ecryptfs_dentry_to_lower(dentry); - dget(lower_dentry); - lower_dir_dentry = lock_parent(lower_dentry); + rc = lock_parent(dentry, &lower_dentry, &lower_dir); + if (rc) + goto out_lock; mount_crypt_stat = &ecryptfs_superblock_to_private( dir->i_sb)->mount_crypt_stat; rc = ecryptfs_encrypt_and_encode_filename(&encoded_symname, @@ -487,7 +476,7 @@ static int ecryptfs_symlink(struct user_namespace *mnt_userns, strlen(symname)); if (rc) goto out_lock; - rc = vfs_symlink(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry, + rc = vfs_symlink(&init_user_ns, lower_dir, lower_dentry, encoded_symname); kfree(encoded_symname); if (rc || d_really_is_negative(lower_dentry)) @@ -495,11 +484,10 @@ static int ecryptfs_symlink(struct user_namespace *mnt_userns, rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); if (rc) goto out_lock; - fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); - fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); + fsstack_copy_attr_times(dir, lower_dir); + fsstack_copy_inode_size(dir, lower_dir); out_lock: - unlock_dir(lower_dir_dentry); - dput(lower_dentry); + inode_unlock(lower_dir); if (d_really_is_negative(dentry)) d_drop(dentry); return rc; @@ -510,22 +498,22 @@ static int ecryptfs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, { int rc; struct dentry *lower_dentry; - struct dentry *lower_dir_dentry; + struct inode *lower_dir; - lower_dentry = ecryptfs_dentry_to_lower(dentry); - lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_mkdir(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry, - mode); + rc = lock_parent(dentry, &lower_dentry, &lower_dir); + if (!rc) + rc = vfs_mkdir(&init_user_ns, lower_dir, + lower_dentry, mode); if (rc || d_really_is_negative(lower_dentry)) goto out; rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); if (rc) goto out; - fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); - fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); - set_nlink(dir, d_inode(lower_dir_dentry)->i_nlink); + fsstack_copy_attr_times(dir, lower_dir); + fsstack_copy_inode_size(dir, lower_dir); + set_nlink(dir, lower_dir->i_nlink); out: - unlock_dir(lower_dir_dentry); + inode_unlock(lower_dir); if (d_really_is_negative(dentry)) d_drop(dentry); return rc; @@ -534,29 +522,24 @@ out: static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) { struct dentry *lower_dentry; - struct dentry *lower_dir_dentry; - struct inode *lower_dir_inode; + struct inode *lower_dir; int rc; - lower_dentry = ecryptfs_dentry_to_lower(dentry); - lower_dir_dentry = ecryptfs_dentry_to_lower(dentry->d_parent); - lower_dir_inode = d_inode(lower_dir_dentry); - - inode_lock_nested(lower_dir_inode, I_MUTEX_PARENT); + rc = lock_parent(dentry, &lower_dentry, &lower_dir); dget(lower_dentry); // don't even try to make the lower negative - if (lower_dentry->d_parent != lower_dir_dentry) - rc = -EINVAL; - else if (d_unhashed(lower_dentry)) - rc = -EINVAL; - else - rc = vfs_rmdir(&init_user_ns, lower_dir_inode, lower_dentry); + if (!rc) { + if (d_unhashed(lower_dentry)) + rc = -EINVAL; + else + rc = vfs_rmdir(&init_user_ns, lower_dir, lower_dentry); + } if (!rc) { clear_nlink(d_inode(dentry)); - fsstack_copy_attr_times(dir, lower_dir_inode); - set_nlink(dir, lower_dir_inode->i_nlink); + fsstack_copy_attr_times(dir, lower_dir); + set_nlink(dir, lower_dir->i_nlink); } dput(lower_dentry); - inode_unlock(lower_dir_inode); + inode_unlock(lower_dir); if (!rc) d_drop(dentry); return rc; @@ -568,21 +551,21 @@ ecryptfs_mknod(struct user_namespace *mnt_userns, struct inode *dir, { int rc; struct dentry *lower_dentry; - struct dentry *lower_dir_dentry; + struct inode *lower_dir; - lower_dentry = ecryptfs_dentry_to_lower(dentry); - lower_dir_dentry = lock_parent(lower_dentry); - rc = vfs_mknod(&init_user_ns, d_inode(lower_dir_dentry), lower_dentry, - mode, dev); + rc = lock_parent(dentry, &lower_dentry, &lower_dir); + if (!rc) + rc = vfs_mknod(&init_user_ns, lower_dir, + lower_dentry, mode, dev); if (rc || d_really_is_negative(lower_dentry)) goto out; rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); if (rc) goto out; - fsstack_copy_attr_times(dir, d_inode(lower_dir_dentry)); - fsstack_copy_inode_size(dir, d_inode(lower_dir_dentry)); + fsstack_copy_attr_times(dir, lower_dir); + fsstack_copy_inode_size(dir, lower_dir); out: - unlock_dir(lower_dir_dentry); + inode_unlock(lower_dir); if (d_really_is_negative(dentry)) d_drop(dentry); return rc; @@ -888,6 +871,7 @@ ecryptfs_permission(struct user_namespace *mnt_userns, struct inode *inode, /** * ecryptfs_setattr + * @mnt_userns: user namespace of the target mount * @dentry: dentry handle to the inode to modify * @ia: Structure with flags of what to change and values * diff --git a/fs/ecryptfs/keystore.c b/fs/ecryptfs/keystore.c index f6a17d259db7..3fe41964c0d8 100644 --- a/fs/ecryptfs/keystore.c +++ b/fs/ecryptfs/keystore.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer * In-kernel key management code. Includes functions to parse and * write authentication token-related packets with the underlying @@ -21,7 +21,7 @@ #include <linux/slab.h> #include "ecryptfs_kernel.h" -/** +/* * request_key returned an error instead of a valid key address; * determine the type of error, make appropriate log entries, and * return an error code. @@ -536,8 +536,9 @@ out: /** * ecryptfs_find_auth_tok_for_sig + * @auth_tok_key: key containing the authentication token * @auth_tok: Set to the matching auth_tok; NULL if not found - * @crypt_stat: inode crypt_stat crypto context + * @mount_crypt_stat: inode crypt_stat crypto context * @sig: Sig of auth_tok to find * * For now, this function simply looks at the registered auth_tok's @@ -576,7 +577,7 @@ ecryptfs_find_auth_tok_for_sig( return rc; } -/** +/* * write_tag_70_packet can gobble a lot of stack space. We stuff most * of the function's parameters in a kmalloc'd struct to help reduce * eCryptfs' overall stack usage. @@ -604,7 +605,7 @@ struct ecryptfs_write_tag_70_packet_silly_stack { struct shash_desc *hash_desc; }; -/** +/* * write_tag_70_packet - Write encrypted filename (EFN) packet against FNEK * @filename: NULL-terminated filename string * @@ -873,7 +874,7 @@ struct ecryptfs_parse_tag_70_packet_silly_stack { }; /** - * parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet + * ecryptfs_parse_tag_70_packet - Parse and process FNEK-encrypted passphrase packet * @filename: This function kmalloc's the memory for the filename * @filename_size: This function sets this to the amount of memory * kmalloc'd for the filename @@ -1172,7 +1173,7 @@ decrypt_pki_encrypted_session_key(struct ecryptfs_auth_tok *auth_tok, rc = ecryptfs_cipher_code_to_string(crypt_stat->cipher, cipher_code); if (rc) { ecryptfs_printk(KERN_ERR, "Cipher code [%d] is invalid\n", - cipher_code) + cipher_code); goto out; } crypt_stat->flags |= ECRYPTFS_KEY_VALID; diff --git a/fs/ecryptfs/kthread.c b/fs/ecryptfs/kthread.c index a7c903cb01a0..ae4cb4e2e134 100644 --- a/fs/ecryptfs/kthread.c +++ b/fs/ecryptfs/kthread.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 2008 International Business Machines Corp. @@ -108,6 +108,7 @@ void ecryptfs_destroy_kthread(void) * @lower_file: Result of dentry_open by root on lower dentry * @lower_dentry: Lower dentry for file to open * @lower_mnt: Lower vfsmount for file to open + * @cred: credential to use for this call * * This function gets a r/w file opened against the lower dentry. * diff --git a/fs/ecryptfs/main.c b/fs/ecryptfs/main.c index cdf40a54a35d..d66bbd2df191 100644 --- a/fs/ecryptfs/main.c +++ b/fs/ecryptfs/main.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 1997-2003 Erez Zadok @@ -24,7 +24,7 @@ #include <linux/magic.h> #include "ecryptfs_kernel.h" -/** +/* * Module parameter that defines the ecryptfs_verbosity level. */ int ecryptfs_verbosity = 0; @@ -34,7 +34,7 @@ MODULE_PARM_DESC(ecryptfs_verbosity, "Initial verbosity level (0 or 1; defaults to " "0, which is Quiet)"); -/** +/* * Module parameter that defines the number of message buffer elements */ unsigned int ecryptfs_message_buf_len = ECRYPTFS_DEFAULT_MSG_CTX_ELEMS; @@ -43,7 +43,7 @@ module_param(ecryptfs_message_buf_len, uint, 0); MODULE_PARM_DESC(ecryptfs_message_buf_len, "Number of message buffer elements"); -/** +/* * Module parameter that defines the maximum guaranteed amount of time to wait * for a response from ecryptfsd. The actual sleep time will be, more than * likely, a small amount greater than this specified value, but only less if @@ -57,7 +57,7 @@ MODULE_PARM_DESC(ecryptfs_message_wait_timeout, "sleep while waiting for a message response from " "userspace"); -/** +/* * Module parameter that is an estimate of the maximum number of users * that will be concurrently using eCryptfs. Set this to the right * value to balance performance and memory use. @@ -80,7 +80,7 @@ void __ecryptfs_printk(const char *fmt, ...) va_end(args); } -/** +/* * ecryptfs_init_lower_file * @ecryptfs_dentry: Fully initialized eCryptfs dentry object, with * the lower dentry and the lower mount set @@ -221,7 +221,7 @@ static void ecryptfs_init_mount_crypt_stat( /** * ecryptfs_parse_options - * @sb: The ecryptfs super block + * @sbi: The ecryptfs super block * @options: The options passed to the kernel * @check_ruid: set to 1 if device uid should be checked against the ruid * @@ -466,10 +466,10 @@ out: struct kmem_cache *ecryptfs_sb_info_cache; static struct file_system_type ecryptfs_fs_type; -/** - * ecryptfs_get_sb - * @fs_type - * @flags +/* + * ecryptfs_mount + * @fs_type: The filesystem type that the superblock should belong to + * @flags: The flags associated with the mount * @dev_name: The path to mount over * @raw_data: The options passed into the kernel */ @@ -492,6 +492,12 @@ static struct dentry *ecryptfs_mount(struct file_system_type *fs_type, int flags goto out; } + if (!dev_name) { + rc = -EINVAL; + err = "Device name cannot be null"; + goto out; + } + rc = ecryptfs_parse_options(sbi, raw_data, &check_ruid); if (rc) { err = "Error parsing options"; @@ -635,7 +641,7 @@ static struct file_system_type ecryptfs_fs_type = { }; MODULE_ALIAS_FS("ecryptfs"); -/** +/* * inode_info_init_once * * Initializes the ecryptfs_inode_info_cache when it is created diff --git a/fs/ecryptfs/messaging.c b/fs/ecryptfs/messaging.c index c0dfd9647627..6318f3500e5c 100644 --- a/fs/ecryptfs/messaging.c +++ b/fs/ecryptfs/messaging.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/** +/* * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 2004-2008 International Business Machines Corp. @@ -14,10 +14,10 @@ static LIST_HEAD(ecryptfs_msg_ctx_free_list); static LIST_HEAD(ecryptfs_msg_ctx_alloc_list); -static struct mutex ecryptfs_msg_ctx_lists_mux; +static DEFINE_MUTEX(ecryptfs_msg_ctx_lists_mux); static struct hlist_head *ecryptfs_daemon_hash; -struct mutex ecryptfs_daemon_hash_mux; +DEFINE_MUTEX(ecryptfs_daemon_hash_mux); static int ecryptfs_hash_bits; #define ecryptfs_current_euid_hash(uid) \ hash_long((unsigned long)from_kuid(&init_user_ns, current_euid()), ecryptfs_hash_bits) @@ -147,7 +147,7 @@ out: return rc; } -/** +/* * ecryptfs_exorcise_daemon - Destroy the daemon struct * * Must be called ceremoniously while in possession of @@ -181,7 +181,8 @@ out: } /** - * ecryptfs_process_reponse + * ecryptfs_process_response + * @daemon: eCryptfs daemon object * @msg: The ecryptfs message received; the caller should sanity check * msg->data_len and free the memory * @seq: The sequence number of the message; must match the sequence @@ -250,6 +251,7 @@ out: * ecryptfs_send_message_locked * @data: The data to send * @data_len: The length of data + * @msg_type: Type of message * @msg_ctx: The message context allocated for the send * * Must be called with ecryptfs_daemon_hash_mux held. @@ -359,7 +361,6 @@ int __init ecryptfs_init_messaging(void) "too large, defaulting to [%d] users\n", __func__, ecryptfs_number_of_users); } - mutex_init(&ecryptfs_daemon_hash_mux); mutex_lock(&ecryptfs_daemon_hash_mux); ecryptfs_hash_bits = 1; while (ecryptfs_number_of_users >> ecryptfs_hash_bits) @@ -383,7 +384,6 @@ int __init ecryptfs_init_messaging(void) rc = -ENOMEM; goto out; } - mutex_init(&ecryptfs_msg_ctx_lists_mux); mutex_lock(&ecryptfs_msg_ctx_lists_mux); ecryptfs_msg_counter = 0; for (i = 0; i < ecryptfs_message_buf_len; i++) { diff --git a/fs/ecryptfs/miscdev.c b/fs/ecryptfs/miscdev.c index 742ece22c1d4..4e62c3cef70f 100644 --- a/fs/ecryptfs/miscdev.c +++ b/fs/ecryptfs/miscdev.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-only -/** +/* * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 2008 International Business Machines Corp. @@ -312,6 +312,7 @@ out_unlock_daemon: /** * ecryptfs_miscdev_response - miscdevess response to message previously sent to daemon + * @daemon: eCryptfs daemon object * @data: Bytes comprising struct ecryptfs_message * @data_size: sizeof(struct ecryptfs_message) + data len * @seq: Sequence number for miscdev response packet diff --git a/fs/ecryptfs/mmap.c b/fs/ecryptfs/mmap.c index 2f333a40ff4d..392e721b50a3 100644 --- a/fs/ecryptfs/mmap.c +++ b/fs/ecryptfs/mmap.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer * This is where eCryptfs coordinates the symmetric encryption and * decryption of the file data as it passes between the lower @@ -22,7 +22,7 @@ #include <asm/unaligned.h> #include "ecryptfs_kernel.h" -/** +/* * ecryptfs_get_locked_page * * Get one page from cache or lower f/s, return error otherwise. @@ -41,6 +41,7 @@ struct page *ecryptfs_get_locked_page(struct inode *inode, loff_t index) /** * ecryptfs_writepage * @page: Page that is locked before this call is made + * @wbc: Write-back control structure * * Returns zero on success; non-zero otherwise * @@ -78,7 +79,7 @@ static void strip_xattr_flag(char *page_virt, } } -/** +/* * Header Extent: * Octets 0-7: Unencrypted file size (big-endian) * Octets 8-15: eCryptfs special marker @@ -229,7 +230,7 @@ out: return rc; } -/** +/* * Called with lower inode mutex held. */ static int fill_zeros_to_end_of_page(struct page *page, unsigned int to) @@ -368,7 +369,7 @@ out: return rc; } -/** +/* * ecryptfs_write_inode_size_to_header * * Writes the lower file size to the first 8 bytes of the header. diff --git a/fs/ecryptfs/read_write.c b/fs/ecryptfs/read_write.c index 0438997ac9d8..60bdcaddcbe5 100644 --- a/fs/ecryptfs/read_write.c +++ b/fs/ecryptfs/read_write.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 2007 International Business Machines Corp. @@ -230,6 +230,8 @@ int ecryptfs_read_lower(char *data, loff_t offset, size_t size, * ecryptfs_read_lower_page_segment * @page_for_ecryptfs: The page into which data for eCryptfs will be * written + * @page_index: Page index in @page_for_ecryptfs from which to start + * writing * @offset_in_page: Offset in @page_for_ecryptfs from which to start * writing * @size: The number of bytes to write into @page_for_ecryptfs diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 6b1853f1c06a..39116af0390f 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 1997-2003 Erez Zadok @@ -81,7 +81,7 @@ static void ecryptfs_destroy_inode(struct inode *inode) /** * ecryptfs_statfs - * @sb: The ecryptfs super block + * @dentry: The ecryptfs dentry * @buf: The struct kstatfs to fill in with stats * * Get the filesystem statistics. Currently, we let this pass right through @@ -108,7 +108,7 @@ static int ecryptfs_statfs(struct dentry *dentry, struct kstatfs *buf) /** * ecryptfs_evict_inode - * @inode - The ecryptfs inode + * @inode: The ecryptfs inode * * Called by iput() when the inode reference count reached zero * and the inode is not hashed anywhere. Used to clear anything @@ -123,7 +123,7 @@ static void ecryptfs_evict_inode(struct inode *inode) iput(ecryptfs_inode_to_lower(inode)); } -/** +/* * ecryptfs_show_options * * Prints the mount options for a given superblock. diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index e62d813756f2..efaf32596b97 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -450,14 +450,31 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, lcn = m->lcn + 1; if (m->compressedlcs) goto out; - if (lcn == initial_lcn) - goto err_bonus_cblkcnt; err = z_erofs_load_cluster_from_disk(m, lcn); if (err) return err; + /* + * If the 1st NONHEAD lcluster has already been handled initially w/o + * valid compressedlcs, which means at least it mustn't be CBLKCNT, or + * an internal implemenatation error is detected. + * + * The following code can also handle it properly anyway, but let's + * BUG_ON in the debugging mode only for developers to notice that. + */ + DBG_BUGON(lcn == initial_lcn && + m->type == Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD); + switch (m->type) { + case Z_EROFS_VLE_CLUSTER_TYPE_PLAIN: + case Z_EROFS_VLE_CLUSTER_TYPE_HEAD: + /* + * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type + * rather than CBLKCNT, it's a 1 lcluster-sized pcluster. + */ + m->compressedlcs = 1; + break; case Z_EROFS_VLE_CLUSTER_TYPE_NONHEAD: if (m->delta[0] != 1) goto err_bonus_cblkcnt; diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 73138ea68342..1e596e1d0bba 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -657,6 +657,12 @@ static void ep_done_scan(struct eventpoll *ep, */ list_splice(txlist, &ep->rdllist); __pm_relax(ep->ws); + + if (!list_empty(&ep->rdllist)) { + if (waitqueue_active(&ep->wq)) + wake_up(&ep->wq); + } + write_unlock_irq(&ep->lock); } diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index c6b8bba73031..1f69b81655b6 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -81,11 +81,10 @@ static struct dentry *ext2_lookup(struct inode * dir, struct dentry *dentry, uns struct dentry *ext2_get_parent(struct dentry *child) { - struct qstr dotdot = QSTR_INIT("..", 2); ino_t ino; int res; - res = ext2_inode_by_name(d_inode(child), &dotdot, &ino); + res = ext2_inode_by_name(d_inode(child), &dotdot_name, &ino); if (res) return ERR_PTR(res); diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 74a5172c2d83..9dc6e74b265c 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -239,7 +239,7 @@ unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_group_t block_group, struct ext4_group_desc *gdp) { - return num_clusters_in_group(sb, block_group) - + return num_clusters_in_group(sb, block_group) - ext4_num_overhead_clusters(sb, block_group, gdp); } diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 5ed870614c8d..ffb295aa891c 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -55,6 +55,18 @@ static int is_dx_dir(struct inode *inode) return 0; } +static bool is_fake_dir_entry(struct ext4_dir_entry_2 *de) +{ + /* Check if . or .. , or skip if namelen is 0 */ + if ((de->name_len > 0) && (de->name_len <= 2) && (de->name[0] == '.') && + (de->name[1] == '.' || de->name[1] == '\0')) + return true; + /* Check if this is a csum entry */ + if (de->file_type == EXT4_FT_DIR_CSUM) + return true; + return false; +} + /* * Return 0 if the directory entry is OK, and 1 if there is a problem * @@ -73,16 +85,20 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, const int rlen = ext4_rec_len_from_disk(de->rec_len, dir->i_sb->s_blocksize); const int next_offset = ((char *) de - buf) + rlen; + bool fake = is_fake_dir_entry(de); + bool has_csum = ext4_has_metadata_csum(dir->i_sb); - if (unlikely(rlen < EXT4_DIR_REC_LEN(1))) + if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir))) error_msg = "rec_len is smaller than minimal"; else if (unlikely(rlen % 4 != 0)) error_msg = "rec_len % 4 != 0"; - else if (unlikely(rlen < EXT4_DIR_REC_LEN(de->name_len))) + else if (unlikely(rlen < ext4_dir_rec_len(de->name_len, + fake ? NULL : dir))) error_msg = "rec_len is too small for name_len"; else if (unlikely(next_offset > size)) error_msg = "directory entry overrun"; - else if (unlikely(next_offset > size - EXT4_DIR_REC_LEN(1) && + else if (unlikely(next_offset > size - ext4_dir_rec_len(1, + has_csum ? NULL : dir) && next_offset != size)) error_msg = "directory entry too close to block end"; else if (unlikely(le32_to_cpu(de->inode) > @@ -94,15 +110,15 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, if (filp) ext4_error_file(filp, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u, " - "inode=%u, rec_len=%d, name_len=%d, size=%d", + "inode=%u, rec_len=%d, size=%d fake=%d", error_msg, offset, le32_to_cpu(de->inode), - rlen, de->name_len, size); + rlen, size, fake); else ext4_error_inode(dir, function, line, bh->b_blocknr, "bad entry in directory: %s - offset=%u, " - "inode=%u, rec_len=%d, name_len=%d, size=%d", + "inode=%u, rec_len=%d, size=%d fake=%d", error_msg, offset, le32_to_cpu(de->inode), - rlen, de->name_len, size); + rlen, size, fake); return 1; } @@ -124,9 +140,9 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) if (is_dx_dir(inode)) { err = ext4_dx_readdir(file, ctx); - if (err != ERR_BAD_DX_DIR) { + if (err != ERR_BAD_DX_DIR) return err; - } + /* Can we just clear INDEX flag to ignore htree information? */ if (!ext4_has_metadata_csum(sb)) { /* @@ -224,7 +240,8 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) * failure will be detected in the * dirent test below. */ if (ext4_rec_len_from_disk(de->rec_len, - sb->s_blocksize) < EXT4_DIR_REC_LEN(1)) + sb->s_blocksize) < ext4_dir_rec_len(1, + inode)) break; i += ext4_rec_len_from_disk(de->rec_len, sb->s_blocksize); @@ -265,7 +282,9 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) /* Directory is encrypted */ err = fscrypt_fname_disk_to_usr(inode, - 0, 0, &de_name, &fstr); + EXT4_DIRENT_HASH(de), + EXT4_DIRENT_MINOR_HASH(de), + &de_name, &fstr); de_name = fstr; fstr.len = save_len; if (err) diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 18f021c988a1..37002663d521 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -162,7 +162,12 @@ enum SHIFT_DIRECTION { #define EXT4_MB_USE_RESERVED 0x2000 /* Do strict check for free blocks while retrying block allocation */ #define EXT4_MB_STRICT_CHECK 0x4000 - +/* Large fragment size list lookup succeeded at least once for cr = 0 */ +#define EXT4_MB_CR0_OPTIMIZED 0x8000 +/* Avg fragment size rb tree lookup succeeded at least once for cr = 1 */ +#define EXT4_MB_CR1_OPTIMIZED 0x00010000 +/* Perform linear traversal for one group */ +#define EXT4_MB_SEARCH_NEXT_LINEAR 0x00020000 struct ext4_allocation_request { /* target inode for block we're allocating */ struct inode *inode; @@ -1213,7 +1218,7 @@ struct ext4_inode_info { #define EXT4_MOUNT_JOURNAL_CHECKSUM 0x800000 /* Journal checksums */ #define EXT4_MOUNT_JOURNAL_ASYNC_COMMIT 0x1000000 /* Journal Async Commit */ #define EXT4_MOUNT_WARN_ON_ERROR 0x2000000 /* Trigger WARN_ON on error */ -#define EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS 0x4000000 +#define EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS 0x4000000 #define EXT4_MOUNT_DELALLOC 0x8000000 /* Delalloc support */ #define EXT4_MOUNT_DATA_ERR_ABORT 0x10000000 /* Abort on file data write */ #define EXT4_MOUNT_BLOCK_VALIDITY 0x20000000 /* Block validity checking */ @@ -1238,7 +1243,9 @@ struct ext4_inode_info { #define EXT4_MOUNT2_JOURNAL_FAST_COMMIT 0x00000010 /* Journal fast commit */ #define EXT4_MOUNT2_DAX_NEVER 0x00000020 /* Do not allow Direct Access */ #define EXT4_MOUNT2_DAX_INODE 0x00000040 /* For printing options only */ - +#define EXT4_MOUNT2_MB_OPTIMIZE_SCAN 0x00000080 /* Optimize group + * scanning in mballoc + */ #define clear_opt(sb, opt) EXT4_SB(sb)->s_mount_opt &= \ ~EXT4_MOUNT_##opt @@ -1519,9 +1526,14 @@ struct ext4_sb_info { unsigned int s_mb_free_pending; struct list_head s_freed_data_list; /* List of blocks to be freed after commit completed */ + struct rb_root s_mb_avg_fragment_size_root; + rwlock_t s_mb_rb_lock; + struct list_head *s_mb_largest_free_orders; + rwlock_t *s_mb_largest_free_orders_locks; /* tunables */ unsigned long s_stripe; + unsigned int s_mb_max_linear_groups; unsigned int s_mb_stream_request; unsigned int s_mb_max_to_scan; unsigned int s_mb_min_to_scan; @@ -1541,12 +1553,17 @@ struct ext4_sb_info { atomic_t s_bal_success; /* we found long enough chunks */ atomic_t s_bal_allocated; /* in blocks */ atomic_t s_bal_ex_scanned; /* total extents scanned */ + atomic_t s_bal_groups_scanned; /* number of groups scanned */ atomic_t s_bal_goals; /* goal hits */ atomic_t s_bal_breaks; /* too long searches */ atomic_t s_bal_2orders; /* 2^order hits */ - spinlock_t s_bal_lock; - unsigned long s_mb_buddies_generated; - unsigned long long s_mb_generation_time; + atomic_t s_bal_cr0_bad_suggestions; + atomic_t s_bal_cr1_bad_suggestions; + atomic64_t s_bal_cX_groups_considered[4]; + atomic64_t s_bal_cX_hits[4]; + atomic64_t s_bal_cX_failed[4]; /* cX loop didn't find blocks */ + atomic_t s_mb_buddies_generated; /* number of buddies generated */ + atomic64_t s_mb_generation_time; atomic_t s_mb_lost_chunks; atomic_t s_mb_preallocated; atomic_t s_mb_discarded; @@ -2187,6 +2204,17 @@ struct ext4_dir_entry { char name[EXT4_NAME_LEN]; /* File name */ }; + +/* + * Encrypted Casefolded entries require saving the hash on disk. This structure + * followed ext4_dir_entry_2's name[name_len] at the next 4 byte aligned + * boundary. + */ +struct ext4_dir_entry_hash { + __le32 hash; + __le32 minor_hash; +}; + /* * The new version of the directory entry. Since EXT4 structures are * stored in intel byte order, and the name_len field could never be @@ -2202,6 +2230,22 @@ struct ext4_dir_entry_2 { }; /* + * Access the hashes at the end of ext4_dir_entry_2 + */ +#define EXT4_DIRENT_HASHES(entry) \ + ((struct ext4_dir_entry_hash *) \ + (((void *)(entry)) + \ + ((8 + (entry)->name_len + EXT4_DIR_ROUND) & ~EXT4_DIR_ROUND))) +#define EXT4_DIRENT_HASH(entry) le32_to_cpu(EXT4_DIRENT_HASHES(de)->hash) +#define EXT4_DIRENT_MINOR_HASH(entry) \ + le32_to_cpu(EXT4_DIRENT_HASHES(de)->minor_hash) + +static inline bool ext4_hash_in_dirent(const struct inode *inode) +{ + return IS_CASEFOLDED(inode) && IS_ENCRYPTED(inode); +} + +/* * This is a bogus directory entry at the end of each leaf block that * records checksums. */ @@ -2242,11 +2286,25 @@ struct ext4_dir_entry_tail { */ #define EXT4_DIR_PAD 4 #define EXT4_DIR_ROUND (EXT4_DIR_PAD - 1) -#define EXT4_DIR_REC_LEN(name_len) (((name_len) + 8 + EXT4_DIR_ROUND) & \ - ~EXT4_DIR_ROUND) #define EXT4_MAX_REC_LEN ((1<<16)-1) /* + * The rec_len is dependent on the type of directory. Directories that are + * casefolded and encrypted need to store the hash as well, so we add room for + * ext4_extended_dir_entry_2. For all entries related to '.' or '..' you should + * pass NULL for dir, as those entries do not use the extra fields. + */ +static inline unsigned int ext4_dir_rec_len(__u8 name_len, + const struct inode *dir) +{ + int rec_len = (name_len + 8 + EXT4_DIR_ROUND); + + if (dir && ext4_hash_in_dirent(dir)) + rec_len += sizeof(struct ext4_dir_entry_hash); + return (rec_len & ~EXT4_DIR_ROUND); +} + +/* * If we ever get support for fs block sizes > page_size, we'll need * to remove the #if statements in the next two functions... */ @@ -2302,6 +2360,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) #define DX_HASH_LEGACY_UNSIGNED 3 #define DX_HASH_HALF_MD4_UNSIGNED 4 #define DX_HASH_TEA_UNSIGNED 5 +#define DX_HASH_SIPHASH 6 static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, const void *address, unsigned int length) @@ -2356,6 +2415,7 @@ struct ext4_filename { }; #define fname_name(p) ((p)->disk_name.name) +#define fname_usr_name(p) ((p)->usr_fname->name) #define fname_len(p) ((p)->disk_name.len) /* @@ -2586,9 +2646,9 @@ extern unsigned ext4_free_clusters_after_init(struct super_block *sb, ext4_fsblk_t ext4_inode_to_goal_block(struct inode *); #ifdef CONFIG_UNICODE -extern void ext4_fname_setup_ci_filename(struct inode *dir, +extern int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, - struct fscrypt_str *fname); + struct ext4_filename *fname); #endif #ifdef CONFIG_FS_ENCRYPTION @@ -2619,9 +2679,9 @@ static inline int ext4_fname_setup_filename(struct inode *dir, ext4_fname_from_fscrypt_name(fname, &name); #ifdef CONFIG_UNICODE - ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name); + err = ext4_fname_setup_ci_filename(dir, iname, fname); #endif - return 0; + return err; } static inline int ext4_fname_prepare_lookup(struct inode *dir, @@ -2638,9 +2698,9 @@ static inline int ext4_fname_prepare_lookup(struct inode *dir, ext4_fname_from_fscrypt_name(fname, &name); #ifdef CONFIG_UNICODE - ext4_fname_setup_ci_filename(dir, &dentry->d_name, &fname->cf_name); + err = ext4_fname_setup_ci_filename(dir, &dentry->d_name, fname); #endif - return 0; + return err; } static inline void ext4_fname_free_filename(struct ext4_filename *fname) @@ -2665,15 +2725,16 @@ static inline int ext4_fname_setup_filename(struct inode *dir, int lookup, struct ext4_filename *fname) { + int err = 0; fname->usr_fname = iname; fname->disk_name.name = (unsigned char *) iname->name; fname->disk_name.len = iname->len; #ifdef CONFIG_UNICODE - ext4_fname_setup_ci_filename(dir, iname, &fname->cf_name); + err = ext4_fname_setup_ci_filename(dir, iname, fname); #endif - return 0; + return err; } static inline int ext4_fname_prepare_lookup(struct inode *dir, @@ -2698,9 +2759,9 @@ extern int __ext4_check_dir_entry(const char *, unsigned int, struct inode *, struct ext4_dir_entry_2 *, struct buffer_head *, char *, int, unsigned int); -#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ +#define ext4_check_dir_entry(dir, filp, de, bh, buf, size, offset) \ unlikely(__ext4_check_dir_entry(__func__, __LINE__, (dir), (filp), \ - (de), (bh), (buf), (size), (offset))) + (de), (bh), (buf), (size), (offset))) extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, __u32 minor_hash, struct ext4_dir_entry_2 *dirent, @@ -2711,7 +2772,7 @@ extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de); -void ext4_insert_dentry(struct inode *inode, +void ext4_insert_dentry(struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 *de, int buf_size, struct ext4_filename *fname); @@ -2802,8 +2863,10 @@ int __init ext4_fc_init_dentry_cache(void); /* mballoc.c */ extern const struct seq_operations ext4_mb_seq_groups_ops; +extern const struct seq_operations ext4_mb_seq_structs_summary_ops; extern long ext4_mb_stats; extern long ext4_mb_max_to_scan; +extern int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset); extern int ext4_mb_init(struct super_block *); extern int ext4_mb_release(struct super_block *); extern ext4_fsblk_t ext4_mb_new_blocks(handle_t *, @@ -3306,11 +3369,14 @@ struct ext4_group_info { ext4_grpblk_t bb_free; /* total free blocks */ ext4_grpblk_t bb_fragments; /* nr of freespace fragments */ ext4_grpblk_t bb_largest_free_order;/* order of largest frag in BG */ + ext4_group_t bb_group; /* Group number */ struct list_head bb_prealloc_list; #ifdef DOUBLE_CHECK void *bb_bitmap; #endif struct rw_semaphore alloc_sem; + struct rb_node bb_avg_fragment_size_rb; + struct list_head bb_largest_free_order_node; ext4_grpblk_t bb_counters[]; /* Nr of free power-of-two-block * regions, index is order. * bb_counters[3] = 5 means @@ -3513,9 +3579,6 @@ extern void ext4_initialize_dirent_tail(struct buffer_head *bh, unsigned int blocksize); extern int ext4_handle_dirty_dirblock(handle_t *handle, struct inode *inode, struct buffer_head *bh); -extern int ext4_ci_compare(const struct inode *parent, - const struct qstr *fname, - const struct qstr *entry, bool quick); extern int __ext4_unlink(handle_t *handle, struct inode *dir, const struct qstr *d_name, struct inode *inode); extern int __ext4_link(struct inode *dir, struct inode *inode, diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index 7541d0b5d706..f98ca4f37ef6 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -66,7 +66,7 @@ * Fast Commit Ineligibility * ------------------------- * Not all operations are supported by fast commits today (e.g extended - * attributes). Fast commit ineligiblity is marked by calling one of the + * attributes). Fast commit ineligibility is marked by calling one of the * two following functions: * * - ext4_fc_mark_ineligible(): This makes next fast commit operation to fall @@ -1088,8 +1088,10 @@ static int ext4_fc_perform_commit(journal_t *journal) head.fc_tid = cpu_to_le32( sbi->s_journal->j_running_transaction->t_tid); if (!ext4_fc_add_tlv(sb, EXT4_FC_TAG_HEAD, sizeof(head), - (u8 *)&head, &crc)) + (u8 *)&head, &crc)) { + ret = -ENOSPC; goto out; + } } spin_lock(&sbi->s_fc_lock); @@ -1734,7 +1736,7 @@ static int ext4_fc_replay_add_range(struct super_block *sb, } /* Range is mapped and needs a state change */ - jbd_debug(1, "Converting from %d to %d %lld", + jbd_debug(1, "Converting from %ld to %d %lld", map.m_flags & EXT4_MAP_UNWRITTEN, ext4_ext_is_unwritten(ex), map.m_pblk); ret = ext4_ext_replay_update_ex(inode, cur, map.m_len, diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 5332dd3ea7e2..816dedcbd541 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -371,15 +371,32 @@ truncate: static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, int error, unsigned int flags) { - loff_t offset = iocb->ki_pos; + loff_t pos = iocb->ki_pos; struct inode *inode = file_inode(iocb->ki_filp); if (error) return error; - if (size && flags & IOMAP_DIO_UNWRITTEN) - return ext4_convert_unwritten_extents(NULL, inode, - offset, size); + if (size && flags & IOMAP_DIO_UNWRITTEN) { + error = ext4_convert_unwritten_extents(NULL, inode, pos, size); + if (error < 0) + return error; + } + /* + * If we are extending the file, we have to update i_size here before + * page cache gets invalidated in iomap_dio_rw(). Otherwise racing + * buffered reads could zero out too much from page cache pages. Update + * of on-disk size will happen later in ext4_dio_write_iter() where + * we have enough information to also perform orphan list handling etc. + * Note that we perform all extending writes synchronously under + * i_rwsem held exclusively so i_size update is safe here in that case. + * If the write was not extending, we cannot see pos > i_size here + * because operations reducing i_size like truncate wait for all + * outstanding DIO before updating i_size. + */ + pos += size; + if (pos > i_size_read(inode)) + i_size_write(inode, pos); return 0; } diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index a92eb79de0cc..f34f4176c1e7 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -197,7 +197,7 @@ static void str2hashbuf_unsigned(const char *msg, int len, __u32 *buf, int num) * represented, and whether or not the returned hash is 32 bits or 64 * bits. 32 bit hashes will return 0 for the minor hash. */ -static int __ext4fs_dirhash(const char *name, int len, +static int __ext4fs_dirhash(const struct inode *dir, const char *name, int len, struct dx_hash_info *hinfo) { __u32 hash; @@ -259,6 +259,22 @@ static int __ext4fs_dirhash(const char *name, int len, hash = buf[0]; minor_hash = buf[1]; break; + case DX_HASH_SIPHASH: + { + struct qstr qname = QSTR_INIT(name, len); + __u64 combined_hash; + + if (fscrypt_has_encryption_key(dir)) { + combined_hash = fscrypt_fname_siphash(dir, &qname); + } else { + ext4_warning_inode(dir, "Siphash requires key"); + return -1; + } + + hash = (__u32)(combined_hash >> 32); + minor_hash = (__u32)combined_hash; + break; + } default: hinfo->hash = 0; return -1; @@ -280,7 +296,8 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len, unsigned char *buff; struct qstr qstr = {.name = name, .len = len }; - if (len && IS_CASEFOLDED(dir) && um) { + if (len && IS_CASEFOLDED(dir) && um && + (!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) { buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); if (!buff) return -ENOMEM; @@ -291,12 +308,12 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len, goto opaque_seq; } - r = __ext4fs_dirhash(buff, dlen, hinfo); + r = __ext4fs_dirhash(dir, buff, dlen, hinfo); kfree(buff); return r; } opaque_seq: #endif - return __ext4fs_dirhash(name, len, hinfo); + return __ext4fs_dirhash(dir, name, len, hinfo); } diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 755a68bb7e22..81a17a3cd80e 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1292,7 +1292,8 @@ got: ei->i_extra_isize = sbi->s_want_extra_isize; ei->i_inline_off = 0; - if (ext4_has_feature_inline_data(sb)) + if (ext4_has_feature_inline_data(sb) && + (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode))) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = inode; err = dquot_alloc_inode(inode); @@ -1513,6 +1514,7 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, handle_t *handle; ext4_fsblk_t blk; int num, ret = 0, used_blks = 0; + unsigned long used_inos = 0; /* This should not happen, but just to be sure check this */ if (sb_rdonly(sb)) { @@ -1543,22 +1545,37 @@ int ext4_init_inode_table(struct super_block *sb, ext4_group_t group, * used inodes so we need to skip blocks with used inodes in * inode table. */ - if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) - used_blks = DIV_ROUND_UP((EXT4_INODES_PER_GROUP(sb) - - ext4_itable_unused_count(sb, gdp)), - sbi->s_inodes_per_block); - - if ((used_blks < 0) || (used_blks > sbi->s_itb_per_group) || - ((group == 0) && ((EXT4_INODES_PER_GROUP(sb) - - ext4_itable_unused_count(sb, gdp)) < - EXT4_FIRST_INO(sb)))) { - ext4_error(sb, "Something is wrong with group %u: " - "used itable blocks: %d; " - "itable unused count: %u", - group, used_blks, - ext4_itable_unused_count(sb, gdp)); - ret = 1; - goto err_out; + if (!(gdp->bg_flags & cpu_to_le16(EXT4_BG_INODE_UNINIT))) { + used_inos = EXT4_INODES_PER_GROUP(sb) - + ext4_itable_unused_count(sb, gdp); + used_blks = DIV_ROUND_UP(used_inos, sbi->s_inodes_per_block); + + /* Bogus inode unused count? */ + if (used_blks < 0 || used_blks > sbi->s_itb_per_group) { + ext4_error(sb, "Something is wrong with group %u: " + "used itable blocks: %d; " + "itable unused count: %u", + group, used_blks, + ext4_itable_unused_count(sb, gdp)); + ret = 1; + goto err_out; + } + + used_inos += group * EXT4_INODES_PER_GROUP(sb); + /* + * Are there some uninitialized inodes in the inode table + * before the first normal inode? + */ + if ((used_blks != sbi->s_itb_per_group) && + (used_inos < EXT4_FIRST_INO(sb))) { + ext4_error(sb, "Something is wrong with group %u: " + "itable unused count: %u; " + "itables initialized count: %ld", + group, ext4_itable_unused_count(sb, gdp), + used_inos); + ret = 1; + goto err_out; + } } blk = ext4_inode_table(sb, gdp) + used_blks; diff --git a/fs/ext4/indirect.c b/fs/ext4/indirect.c index 1223a18c3ff9..a7bc6ad656a9 100644 --- a/fs/ext4/indirect.c +++ b/fs/ext4/indirect.c @@ -705,7 +705,7 @@ static int ext4_ind_trunc_restart_fn(handle_t *handle, struct inode *inode, /* * Truncate transactions can be complex and absolutely huge. So we need to - * be able to restart the transaction at a conventient checkpoint to make + * be able to restart the transaction at a convenient checkpoint to make * sure we don't overflow the journal. * * Try to extend this transaction for the purposes of truncation. If diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index b41512d1badc..3cf01629010d 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -795,7 +795,7 @@ ext4_journalled_write_inline_data(struct inode *inode, * clear the inode state safely. * 2. The inode has inline data, then we need to read the data, make it * update and dirty so that ext4_da_writepages can handle it. We don't - * need to start the journal since the file's metatdata isn't changed now. + * need to start the journal since the file's metadata isn't changed now. */ static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, struct inode *inode, @@ -1031,7 +1031,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, err = ext4_journal_get_write_access(handle, iloc->bh); if (err) return err; - ext4_insert_dentry(inode, de, inline_size, fname); + ext4_insert_dentry(dir, inode, de, inline_size, fname); ext4_show_inline_dir(dir, iloc->bh, inline_start, inline_size); @@ -1100,7 +1100,7 @@ static int ext4_update_inline_dir(handle_t *handle, struct inode *dir, int old_size = EXT4_I(dir)->i_inline_size - EXT4_MIN_INLINE_DATA_SIZE; int new_size = get_max_inline_xattr_value_size(dir, iloc); - if (new_size - old_size <= EXT4_DIR_REC_LEN(1)) + if (new_size - old_size <= ext4_dir_rec_len(1, NULL)) return -ENOSPC; ret = ext4_update_inline_data(handle, dir, @@ -1380,8 +1380,8 @@ int ext4_inlinedir_to_tree(struct file *dir_file, fake.name_len = 1; strcpy(fake.name, "."); fake.rec_len = ext4_rec_len_to_disk( - EXT4_DIR_REC_LEN(fake.name_len), - inline_size); + ext4_dir_rec_len(fake.name_len, NULL), + inline_size); ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); de = &fake; pos = EXT4_INLINE_DOTDOT_OFFSET; @@ -1390,8 +1390,8 @@ int ext4_inlinedir_to_tree(struct file *dir_file, fake.name_len = 2; strcpy(fake.name, ".."); fake.rec_len = ext4_rec_len_to_disk( - EXT4_DIR_REC_LEN(fake.name_len), - inline_size); + ext4_dir_rec_len(fake.name_len, NULL), + inline_size); ext4_set_de_type(inode->i_sb, &fake, S_IFDIR); de = &fake; pos = EXT4_INLINE_DOTDOT_SIZE; @@ -1406,7 +1406,12 @@ int ext4_inlinedir_to_tree(struct file *dir_file, } } - ext4fs_dirhash(dir, de->name, de->name_len, hinfo); + if (ext4_hash_in_dirent(dir)) { + hinfo->hash = EXT4_DIRENT_HASH(de); + hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de); + } else { + ext4fs_dirhash(dir, de->name, de->name_len, hinfo); + } if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && (hinfo->minor_hash < start_minor_hash))) @@ -1488,8 +1493,8 @@ int ext4_read_inline_dir(struct file *file, * So we will use extra_offset and extra_size to indicate them * during the inline dir iteration. */ - dotdot_offset = EXT4_DIR_REC_LEN(1); - dotdot_size = dotdot_offset + EXT4_DIR_REC_LEN(2); + dotdot_offset = ext4_dir_rec_len(1, NULL); + dotdot_size = dotdot_offset + ext4_dir_rec_len(2, NULL); extra_offset = dotdot_size - EXT4_INLINE_DOTDOT_SIZE; extra_size = extra_offset + inline_size; @@ -1524,7 +1529,7 @@ int ext4_read_inline_dir(struct file *file, * failure will be detected in the * dirent test below. */ if (ext4_rec_len_from_disk(de->rec_len, extra_size) - < EXT4_DIR_REC_LEN(1)) + < ext4_dir_rec_len(1, NULL)) break; i += ext4_rec_len_from_disk(de->rec_len, extra_size); diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 0948a43f1b3d..fe6045a46599 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -1066,8 +1066,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, block_end = block_start + blocksize; if (block_end <= from || block_start >= to) { if (PageUptodate(page)) { - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); + set_buffer_uptodate(bh); } continue; } @@ -1092,8 +1091,7 @@ static int ext4_block_write_begin(struct page *page, loff_t pos, unsigned len, } } if (PageUptodate(page)) { - if (!buffer_uptodate(bh)) - set_buffer_uptodate(bh); + set_buffer_uptodate(bh); continue; } if (!buffer_uptodate(bh) && !buffer_delay(bh) && @@ -3824,7 +3822,7 @@ unlock: * starting from file offset 'from'. The range to be zero'd must * be contained with in one block. If the specified range exceeds * the end of the block it will be shortened to end of the block - * that cooresponds to 'from' + * that corresponds to 'from' */ static int ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index e9b0a1fa2ba8..31627f7dc5cd 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -316,6 +316,12 @@ static void ext4_dax_dontcache(struct inode *inode, unsigned int flags) static bool dax_compatible(struct inode *inode, unsigned int oldflags, unsigned int flags) { + /* Allow the DAX flag to be changed on inline directories */ + if (S_ISDIR(inode->i_mode)) { + flags &= ~EXT4_INLINE_DATA_FL; + oldflags &= ~EXT4_INLINE_DATA_FL; + } + if (flags & EXT4_DAX_FL) { if ((oldflags & EXT4_DAX_MUT_EXCL) || ext4_test_inode_state(inode, diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index a02fadf4fc84..3239e6669e84 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -127,11 +127,50 @@ * smallest multiple of the stripe value (sbi->s_stripe) which is * greater than the default mb_group_prealloc. * + * If "mb_optimize_scan" mount option is set, we maintain in memory group info + * structures in two data structures: + * + * 1) Array of largest free order lists (sbi->s_mb_largest_free_orders) + * + * Locking: sbi->s_mb_largest_free_orders_locks(array of rw locks) + * + * This is an array of lists where the index in the array represents the + * largest free order in the buddy bitmap of the participating group infos of + * that list. So, there are exactly MB_NUM_ORDERS(sb) (which means total + * number of buddy bitmap orders possible) number of lists. Group-infos are + * placed in appropriate lists. + * + * 2) Average fragment size rb tree (sbi->s_mb_avg_fragment_size_root) + * + * Locking: sbi->s_mb_rb_lock (rwlock) + * + * This is a red black tree consisting of group infos and the tree is sorted + * by average fragment sizes (which is calculated as ext4_group_info->bb_free + * / ext4_group_info->bb_fragments). + * + * When "mb_optimize_scan" mount option is set, mballoc consults the above data + * structures to decide the order in which groups are to be traversed for + * fulfilling an allocation request. + * + * At CR = 0, we look for groups which have the largest_free_order >= the order + * of the request. We directly look at the largest free order list in the data + * structure (1) above where largest_free_order = order of the request. If that + * list is empty, we look at remaining list in the increasing order of + * largest_free_order. This allows us to perform CR = 0 lookup in O(1) time. + * + * At CR = 1, we only consider groups where average fragment size > request + * size. So, we lookup a group which has average fragment size just above or + * equal to request size using our rb tree (data structure 2) in O(log N) time. + * + * If "mb_optimize_scan" mount option is not set, mballoc traverses groups in + * linear order which requires O(N) search time for each CR 0 and CR 1 phase. + * * The regular allocator (using the buddy cache) supports a few tunables. * * /sys/fs/ext4/<partition>/mb_min_to_scan * /sys/fs/ext4/<partition>/mb_max_to_scan * /sys/fs/ext4/<partition>/mb_order2_req + * /sys/fs/ext4/<partition>/mb_linear_limit * * The regular allocator uses buddy scan only if the request len is power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The @@ -149,6 +188,16 @@ * can be used for allocation. ext4_mb_good_group explains how the groups are * checked. * + * When "mb_optimize_scan" is turned on, as mentioned above, the groups may not + * get traversed linearly. That may result in subsequent allocations being not + * close to each other. And so, the underlying device may get filled up in a + * non-linear fashion. While that may not matter on non-rotational devices, for + * rotational devices that may result in higher seek times. "mb_linear_limit" + * tells mballoc how many groups mballoc should search linearly before + * performing consulting above data structures for more efficient lookups. For + * non rotational devices, this value defaults to 0 and for rotational devices + * this is set to MB_DEFAULT_LINEAR_LIMIT. + * * Both the prealloc space are getting populated as above. So for the first * request we will hit the buddy cache which will result in this prealloc * space getting filled. The prealloc space is then later used for the @@ -299,6 +348,8 @@ * - bitlock on a group (group) * - object (inode/locality) (object) * - per-pa lock (pa) + * - cr0 lists lock (cr0) + * - cr1 tree lock (cr1) * * Paths: * - new pa @@ -328,6 +379,9 @@ * group * object * + * - allocation path (ext4_mb_regular_allocator) + * group + * cr0/cr1 */ static struct kmem_cache *ext4_pspace_cachep; static struct kmem_cache *ext4_ac_cachep; @@ -351,6 +405,9 @@ static void ext4_mb_generate_from_freelist(struct super_block *sb, void *bitmap, ext4_group_t group); static void ext4_mb_new_preallocation(struct ext4_allocation_context *ac); +static bool ext4_mb_good_group(struct ext4_allocation_context *ac, + ext4_group_t group, int cr); + /* * The algorithm using this percpu seq counter goes below: * 1. We sample the percpu discard_pa_seq counter before trying for block @@ -744,6 +801,269 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, } } +static void ext4_mb_rb_insert(struct rb_root *root, struct rb_node *new, + int (*cmp)(struct rb_node *, struct rb_node *)) +{ + struct rb_node **iter = &root->rb_node, *parent = NULL; + + while (*iter) { + parent = *iter; + if (cmp(new, *iter) > 0) + iter = &((*iter)->rb_left); + else + iter = &((*iter)->rb_right); + } + + rb_link_node(new, parent, iter); + rb_insert_color(new, root); +} + +static int +ext4_mb_avg_fragment_size_cmp(struct rb_node *rb1, struct rb_node *rb2) +{ + struct ext4_group_info *grp1 = rb_entry(rb1, + struct ext4_group_info, + bb_avg_fragment_size_rb); + struct ext4_group_info *grp2 = rb_entry(rb2, + struct ext4_group_info, + bb_avg_fragment_size_rb); + int num_frags_1, num_frags_2; + + num_frags_1 = grp1->bb_fragments ? + grp1->bb_free / grp1->bb_fragments : 0; + num_frags_2 = grp2->bb_fragments ? + grp2->bb_free / grp2->bb_fragments : 0; + + return (num_frags_2 - num_frags_1); +} + +/* + * Reinsert grpinfo into the avg_fragment_size tree with new average + * fragment size. + */ +static void +mb_update_avg_fragment_size(struct super_block *sb, struct ext4_group_info *grp) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (!test_opt2(sb, MB_OPTIMIZE_SCAN) || grp->bb_free == 0) + return; + + write_lock(&sbi->s_mb_rb_lock); + if (!RB_EMPTY_NODE(&grp->bb_avg_fragment_size_rb)) { + rb_erase(&grp->bb_avg_fragment_size_rb, + &sbi->s_mb_avg_fragment_size_root); + RB_CLEAR_NODE(&grp->bb_avg_fragment_size_rb); + } + + ext4_mb_rb_insert(&sbi->s_mb_avg_fragment_size_root, + &grp->bb_avg_fragment_size_rb, + ext4_mb_avg_fragment_size_cmp); + write_unlock(&sbi->s_mb_rb_lock); +} + +/* + * Choose next group by traversing largest_free_order lists. Updates *new_cr if + * cr level needs an update. + */ +static void ext4_mb_choose_next_group_cr0(struct ext4_allocation_context *ac, + int *new_cr, ext4_group_t *group, ext4_group_t ngroups) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + struct ext4_group_info *iter, *grp; + int i; + + if (ac->ac_status == AC_STATUS_FOUND) + return; + + if (unlikely(sbi->s_mb_stats && ac->ac_flags & EXT4_MB_CR0_OPTIMIZED)) + atomic_inc(&sbi->s_bal_cr0_bad_suggestions); + + grp = NULL; + for (i = ac->ac_2order; i < MB_NUM_ORDERS(ac->ac_sb); i++) { + if (list_empty(&sbi->s_mb_largest_free_orders[i])) + continue; + read_lock(&sbi->s_mb_largest_free_orders_locks[i]); + if (list_empty(&sbi->s_mb_largest_free_orders[i])) { + read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); + continue; + } + grp = NULL; + list_for_each_entry(iter, &sbi->s_mb_largest_free_orders[i], + bb_largest_free_order_node) { + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[0]); + if (likely(ext4_mb_good_group(ac, iter->bb_group, 0))) { + grp = iter; + break; + } + } + read_unlock(&sbi->s_mb_largest_free_orders_locks[i]); + if (grp) + break; + } + + if (!grp) { + /* Increment cr and search again */ + *new_cr = 1; + } else { + *group = grp->bb_group; + ac->ac_last_optimal_group = *group; + ac->ac_flags |= EXT4_MB_CR0_OPTIMIZED; + } +} + +/* + * Choose next group by traversing average fragment size tree. Updates *new_cr + * if cr lvel needs an update. Sets EXT4_MB_SEARCH_NEXT_LINEAR to indicate that + * the linear search should continue for one iteration since there's lock + * contention on the rb tree lock. + */ +static void ext4_mb_choose_next_group_cr1(struct ext4_allocation_context *ac, + int *new_cr, ext4_group_t *group, ext4_group_t ngroups) +{ + struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); + int avg_fragment_size, best_so_far; + struct rb_node *node, *found; + struct ext4_group_info *grp; + + /* + * If there is contention on the lock, instead of waiting for the lock + * to become available, just continue searching lineraly. We'll resume + * our rb tree search later starting at ac->ac_last_optimal_group. + */ + if (!read_trylock(&sbi->s_mb_rb_lock)) { + ac->ac_flags |= EXT4_MB_SEARCH_NEXT_LINEAR; + return; + } + + if (unlikely(ac->ac_flags & EXT4_MB_CR1_OPTIMIZED)) { + if (sbi->s_mb_stats) + atomic_inc(&sbi->s_bal_cr1_bad_suggestions); + /* We have found something at CR 1 in the past */ + grp = ext4_get_group_info(ac->ac_sb, ac->ac_last_optimal_group); + for (found = rb_next(&grp->bb_avg_fragment_size_rb); found != NULL; + found = rb_next(found)) { + grp = rb_entry(found, struct ext4_group_info, + bb_avg_fragment_size_rb); + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[1]); + if (likely(ext4_mb_good_group(ac, grp->bb_group, 1))) + break; + } + goto done; + } + + node = sbi->s_mb_avg_fragment_size_root.rb_node; + best_so_far = 0; + found = NULL; + + while (node) { + grp = rb_entry(node, struct ext4_group_info, + bb_avg_fragment_size_rb); + avg_fragment_size = 0; + if (ext4_mb_good_group(ac, grp->bb_group, 1)) { + avg_fragment_size = grp->bb_fragments ? + grp->bb_free / grp->bb_fragments : 0; + if (!best_so_far || avg_fragment_size < best_so_far) { + best_so_far = avg_fragment_size; + found = node; + } + } + if (avg_fragment_size > ac->ac_g_ex.fe_len) + node = node->rb_right; + else + node = node->rb_left; + } + +done: + if (found) { + grp = rb_entry(found, struct ext4_group_info, + bb_avg_fragment_size_rb); + *group = grp->bb_group; + ac->ac_flags |= EXT4_MB_CR1_OPTIMIZED; + } else { + *new_cr = 2; + } + + read_unlock(&sbi->s_mb_rb_lock); + ac->ac_last_optimal_group = *group; +} + +static inline int should_optimize_scan(struct ext4_allocation_context *ac) +{ + if (unlikely(!test_opt2(ac->ac_sb, MB_OPTIMIZE_SCAN))) + return 0; + if (ac->ac_criteria >= 2) + return 0; + if (ext4_test_inode_flag(ac->ac_inode, EXT4_INODE_EXTENTS)) + return 0; + return 1; +} + +/* + * Return next linear group for allocation. If linear traversal should not be + * performed, this function just returns the same group + */ +static int +next_linear_group(struct ext4_allocation_context *ac, int group, int ngroups) +{ + if (!should_optimize_scan(ac)) + goto inc_and_return; + + if (ac->ac_groups_linear_remaining) { + ac->ac_groups_linear_remaining--; + goto inc_and_return; + } + + if (ac->ac_flags & EXT4_MB_SEARCH_NEXT_LINEAR) { + ac->ac_flags &= ~EXT4_MB_SEARCH_NEXT_LINEAR; + goto inc_and_return; + } + + return group; +inc_and_return: + /* + * Artificially restricted ngroups for non-extent + * files makes group > ngroups possible on first loop. + */ + return group + 1 >= ngroups ? 0 : group + 1; +} + +/* + * ext4_mb_choose_next_group: choose next group for allocation. + * + * @ac Allocation Context + * @new_cr This is an output parameter. If the there is no good group + * available at current CR level, this field is updated to indicate + * the new cr level that should be used. + * @group This is an input / output parameter. As an input it indicates the + * next group that the allocator intends to use for allocation. As + * output, this field indicates the next group that should be used as + * determined by the optimization functions. + * @ngroups Total number of groups + */ +static void ext4_mb_choose_next_group(struct ext4_allocation_context *ac, + int *new_cr, ext4_group_t *group, ext4_group_t ngroups) +{ + *new_cr = ac->ac_criteria; + + if (!should_optimize_scan(ac) || ac->ac_groups_linear_remaining) + return; + + if (*new_cr == 0) { + ext4_mb_choose_next_group_cr0(ac, new_cr, group, ngroups); + } else if (*new_cr == 1) { + ext4_mb_choose_next_group_cr1(ac, new_cr, group, ngroups); + } else { + /* + * TODO: For CR=2, we can arrange groups in an rb tree sorted by + * bb_free. But until that happens, we should never come here. + */ + WARN_ON(1); + } +} + /* * Cache the order of the largest free extent we have available in this block * group. @@ -751,18 +1071,33 @@ static void ext4_mb_mark_free_simple(struct super_block *sb, static void mb_set_largest_free_order(struct super_block *sb, struct ext4_group_info *grp) { + struct ext4_sb_info *sbi = EXT4_SB(sb); int i; - int bits; + if (test_opt2(sb, MB_OPTIMIZE_SCAN) && grp->bb_largest_free_order >= 0) { + write_lock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + list_del_init(&grp->bb_largest_free_order_node); + write_unlock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + } grp->bb_largest_free_order = -1; /* uninit */ - bits = sb->s_blocksize_bits + 1; - for (i = bits; i >= 0; i--) { + for (i = MB_NUM_ORDERS(sb) - 1; i >= 0; i--) { if (grp->bb_counters[i] > 0) { grp->bb_largest_free_order = i; break; } } + if (test_opt2(sb, MB_OPTIMIZE_SCAN) && + grp->bb_largest_free_order >= 0 && grp->bb_free) { + write_lock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + list_add_tail(&grp->bb_largest_free_order_node, + &sbi->s_mb_largest_free_orders[grp->bb_largest_free_order]); + write_unlock(&sbi->s_mb_largest_free_orders_locks[ + grp->bb_largest_free_order]); + } } static noinline_for_stack @@ -816,10 +1151,9 @@ void ext4_mb_generate_buddy(struct super_block *sb, clear_bit(EXT4_GROUP_INFO_NEED_INIT_BIT, &(grp->bb_state)); period = get_cycles() - period; - spin_lock(&sbi->s_bal_lock); - sbi->s_mb_buddies_generated++; - sbi->s_mb_generation_time += period; - spin_unlock(&sbi->s_bal_lock); + atomic_inc(&sbi->s_mb_buddies_generated); + atomic64_add(period, &sbi->s_mb_generation_time); + mb_update_avg_fragment_size(sb, grp); } /* The buddy information is attached the buddy cache inode @@ -959,7 +1293,7 @@ static int ext4_mb_init_cache(struct page *page, char *incore, gfp_t gfp) grinfo->bb_fragments = 0; memset(grinfo->bb_counters, 0, sizeof(*grinfo->bb_counters) * - (sb->s_blocksize_bits+2)); + (MB_NUM_ORDERS(sb))); /* * incore got set to the group block bitmap below */ @@ -1519,6 +1853,7 @@ static void mb_free_blocks(struct inode *inode, struct ext4_buddy *e4b, done: mb_set_largest_free_order(sb, e4b->bd_info); + mb_update_avg_fragment_size(sb, e4b->bd_info); mb_check_buddy(e4b); } @@ -1655,6 +1990,7 @@ static int mb_mark_used(struct ext4_buddy *e4b, struct ext4_free_extent *ex) } mb_set_largest_free_order(e4b->bd_sb, e4b->bd_info); + mb_update_avg_fragment_size(e4b->bd_sb, e4b->bd_info); ext4_set_bits(e4b->bd_bitmap, ex->fe_start, len0); mb_check_buddy(e4b); @@ -1930,7 +2266,7 @@ void ext4_mb_simple_scan_group(struct ext4_allocation_context *ac, int max; BUG_ON(ac->ac_2order <= 0); - for (i = ac->ac_2order; i <= sb->s_blocksize_bits + 1; i++) { + for (i = ac->ac_2order; i < MB_NUM_ORDERS(sb); i++) { if (grp->bb_counters[i] == 0) continue; @@ -2109,7 +2445,7 @@ static bool ext4_mb_good_group(struct ext4_allocation_context *ac, if (free < ac->ac_g_ex.fe_len) return false; - if (ac->ac_2order > ac->ac_sb->s_blocksize_bits+1) + if (ac->ac_2order >= MB_NUM_ORDERS(ac->ac_sb)) return true; if (grp->bb_largest_free_order < ac->ac_2order) @@ -2148,6 +2484,8 @@ static int ext4_mb_good_group_nolock(struct ext4_allocation_context *ac, ext4_grpblk_t free; int ret = 0; + if (sbi->s_mb_stats) + atomic64_inc(&sbi->s_bal_cX_groups_considered[ac->ac_criteria]); if (should_lock) ext4_lock_group(sb, group); free = grp->bb_free; @@ -2315,13 +2653,13 @@ ext4_mb_regular_allocator(struct ext4_allocation_context *ac) * We also support searching for power-of-two requests only for * requests upto maximum buddy size we have constructed. */ - if (i >= sbi->s_mb_order2_reqs && i <= sb->s_blocksize_bits + 2) { + if (i >= sbi->s_mb_order2_reqs && i <= MB_NUM_ORDERS(sb)) { /* * This should tell if fe_len is exactly power of 2 */ if ((ac->ac_g_ex.fe_len & (~(1 << (i - 1)))) == 0) ac->ac_2order = array_index_nospec(i - 1, - sb->s_blocksize_bits + 2); + MB_NUM_ORDERS(sb)); } /* if stream allocation is enabled, use global goal */ @@ -2347,17 +2685,21 @@ repeat: * from the goal value specified */ group = ac->ac_g_ex.fe_group; + ac->ac_last_optimal_group = group; + ac->ac_groups_linear_remaining = sbi->s_mb_max_linear_groups; prefetch_grp = group; - for (i = 0; i < ngroups; group++, i++) { - int ret = 0; + for (i = 0; i < ngroups; group = next_linear_group(ac, group, ngroups), + i++) { + int ret = 0, new_cr; + cond_resched(); - /* - * Artificially restricted ngroups for non-extent - * files makes group > ngroups possible on first loop. - */ - if (group >= ngroups) - group = 0; + + ext4_mb_choose_next_group(ac, &new_cr, &group, ngroups); + if (new_cr != cr) { + cr = new_cr; + goto repeat; + } /* * Batch reads of the block allocation bitmaps @@ -2422,6 +2764,9 @@ repeat: if (ac->ac_status != AC_STATUS_CONTINUE) break; } + /* Processed all groups and haven't found blocks */ + if (sbi->s_mb_stats && i == ngroups) + atomic64_inc(&sbi->s_bal_cX_failed[cr]); } if (ac->ac_b_ex.fe_len > 0 && ac->ac_status != AC_STATUS_FOUND && @@ -2451,6 +2796,9 @@ repeat: goto repeat; } } + + if (sbi->s_mb_stats && ac->ac_status == AC_STATUS_FOUND) + atomic64_inc(&sbi->s_bal_cX_hits[ac->ac_criteria]); out: if (!err && ac->ac_status != AC_STATUS_FOUND && first_err) err = first_err; @@ -2550,6 +2898,157 @@ const struct seq_operations ext4_mb_seq_groups_ops = { .show = ext4_mb_seq_groups_show, }; +int ext4_seq_mb_stats_show(struct seq_file *seq, void *offset) +{ + struct super_block *sb = (struct super_block *)seq->private; + struct ext4_sb_info *sbi = EXT4_SB(sb); + + seq_puts(seq, "mballoc:\n"); + if (!sbi->s_mb_stats) { + seq_puts(seq, "\tmb stats collection turned off.\n"); + seq_puts(seq, "\tTo enable, please write \"1\" to sysfs file mb_stats.\n"); + return 0; + } + seq_printf(seq, "\treqs: %u\n", atomic_read(&sbi->s_bal_reqs)); + seq_printf(seq, "\tsuccess: %u\n", atomic_read(&sbi->s_bal_success)); + + seq_printf(seq, "\tgroups_scanned: %u\n", atomic_read(&sbi->s_bal_groups_scanned)); + + seq_puts(seq, "\tcr0_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[0])); + seq_printf(seq, "\t\tgroups_considered: %llu\n", + atomic64_read(&sbi->s_bal_cX_groups_considered[0])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[0])); + seq_printf(seq, "\t\tbad_suggestions: %u\n", + atomic_read(&sbi->s_bal_cr0_bad_suggestions)); + + seq_puts(seq, "\tcr1_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[1])); + seq_printf(seq, "\t\tgroups_considered: %llu\n", + atomic64_read(&sbi->s_bal_cX_groups_considered[1])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[1])); + seq_printf(seq, "\t\tbad_suggestions: %u\n", + atomic_read(&sbi->s_bal_cr1_bad_suggestions)); + + seq_puts(seq, "\tcr2_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[2])); + seq_printf(seq, "\t\tgroups_considered: %llu\n", + atomic64_read(&sbi->s_bal_cX_groups_considered[2])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[2])); + + seq_puts(seq, "\tcr3_stats:\n"); + seq_printf(seq, "\t\thits: %llu\n", atomic64_read(&sbi->s_bal_cX_hits[3])); + seq_printf(seq, "\t\tgroups_considered: %llu\n", + atomic64_read(&sbi->s_bal_cX_groups_considered[3])); + seq_printf(seq, "\t\tuseless_loops: %llu\n", + atomic64_read(&sbi->s_bal_cX_failed[3])); + seq_printf(seq, "\textents_scanned: %u\n", atomic_read(&sbi->s_bal_ex_scanned)); + seq_printf(seq, "\t\tgoal_hits: %u\n", atomic_read(&sbi->s_bal_goals)); + seq_printf(seq, "\t\t2^n_hits: %u\n", atomic_read(&sbi->s_bal_2orders)); + seq_printf(seq, "\t\tbreaks: %u\n", atomic_read(&sbi->s_bal_breaks)); + seq_printf(seq, "\t\tlost: %u\n", atomic_read(&sbi->s_mb_lost_chunks)); + + seq_printf(seq, "\tbuddies_generated: %u/%u\n", + atomic_read(&sbi->s_mb_buddies_generated), + ext4_get_groups_count(sb)); + seq_printf(seq, "\tbuddies_time_used: %llu\n", + atomic64_read(&sbi->s_mb_generation_time)); + seq_printf(seq, "\tpreallocated: %u\n", + atomic_read(&sbi->s_mb_preallocated)); + seq_printf(seq, "\tdiscarded: %u\n", + atomic_read(&sbi->s_mb_discarded)); + return 0; +} + +static void *ext4_mb_seq_structs_summary_start(struct seq_file *seq, loff_t *pos) +{ + struct super_block *sb = PDE_DATA(file_inode(seq->file)); + unsigned long position; + + read_lock(&EXT4_SB(sb)->s_mb_rb_lock); + + if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1) + return NULL; + position = *pos + 1; + return (void *) ((unsigned long) position); +} + +static void *ext4_mb_seq_structs_summary_next(struct seq_file *seq, void *v, loff_t *pos) +{ + struct super_block *sb = PDE_DATA(file_inode(seq->file)); + unsigned long position; + + ++*pos; + if (*pos < 0 || *pos >= MB_NUM_ORDERS(sb) + 1) + return NULL; + position = *pos + 1; + return (void *) ((unsigned long) position); +} + +static int ext4_mb_seq_structs_summary_show(struct seq_file *seq, void *v) +{ + struct super_block *sb = PDE_DATA(file_inode(seq->file)); + struct ext4_sb_info *sbi = EXT4_SB(sb); + unsigned long position = ((unsigned long) v); + struct ext4_group_info *grp; + struct rb_node *n; + unsigned int count, min, max; + + position--; + if (position >= MB_NUM_ORDERS(sb)) { + seq_puts(seq, "fragment_size_tree:\n"); + n = rb_first(&sbi->s_mb_avg_fragment_size_root); + if (!n) { + seq_puts(seq, "\ttree_min: 0\n\ttree_max: 0\n\ttree_nodes: 0\n"); + return 0; + } + grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); + min = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0; + count = 1; + while (rb_next(n)) { + count++; + n = rb_next(n); + } + grp = rb_entry(n, struct ext4_group_info, bb_avg_fragment_size_rb); + max = grp->bb_fragments ? grp->bb_free / grp->bb_fragments : 0; + + seq_printf(seq, "\ttree_min: %u\n\ttree_max: %u\n\ttree_nodes: %u\n", + min, max, count); + return 0; + } + + if (position == 0) { + seq_printf(seq, "optimize_scan: %d\n", + test_opt2(sb, MB_OPTIMIZE_SCAN) ? 1 : 0); + seq_puts(seq, "max_free_order_lists:\n"); + } + count = 0; + list_for_each_entry(grp, &sbi->s_mb_largest_free_orders[position], + bb_largest_free_order_node) + count++; + seq_printf(seq, "\tlist_order_%u_groups: %u\n", + (unsigned int)position, count); + + return 0; +} + +static void ext4_mb_seq_structs_summary_stop(struct seq_file *seq, void *v) +{ + struct super_block *sb = PDE_DATA(file_inode(seq->file)); + + read_unlock(&EXT4_SB(sb)->s_mb_rb_lock); +} + +const struct seq_operations ext4_mb_seq_structs_summary_ops = { + .start = ext4_mb_seq_structs_summary_start, + .next = ext4_mb_seq_structs_summary_next, + .stop = ext4_mb_seq_structs_summary_stop, + .show = ext4_mb_seq_structs_summary_show, +}; + static struct kmem_cache *get_groupinfo_cache(int blocksize_bits) { int cache_index = blocksize_bits - EXT4_MIN_BLOCK_LOG_SIZE; @@ -2590,7 +3089,7 @@ int ext4_mb_alloc_groupinfo(struct super_block *sb, ext4_group_t ngroups) sbi->s_group_info_size = size / sizeof(*sbi->s_group_info); if (old_groupinfo) ext4_kvfree_array_rcu(old_groupinfo); - ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", + ext4_debug("allocated s_groupinfo array for %d meta_bg's\n", sbi->s_group_info_size); return 0; } @@ -2652,7 +3151,10 @@ int ext4_mb_add_groupinfo(struct super_block *sb, ext4_group_t group, INIT_LIST_HEAD(&meta_group_info[i]->bb_prealloc_list); init_rwsem(&meta_group_info[i]->alloc_sem); meta_group_info[i]->bb_free_root = RB_ROOT; + INIT_LIST_HEAD(&meta_group_info[i]->bb_largest_free_order_node); + RB_CLEAR_NODE(&meta_group_info[i]->bb_avg_fragment_size_rb); meta_group_info[i]->bb_largest_free_order = -1; /* uninit */ + meta_group_info[i]->bb_group = group; mb_group_bb_bitmap_alloc(sb, meta_group_info[i], group); return 0; @@ -2813,7 +3315,7 @@ int ext4_mb_init(struct super_block *sb) unsigned max; int ret; - i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_offsets); + i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_offsets); sbi->s_mb_offsets = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_offsets == NULL) { @@ -2821,7 +3323,7 @@ int ext4_mb_init(struct super_block *sb) goto out; } - i = (sb->s_blocksize_bits + 2) * sizeof(*sbi->s_mb_maxs); + i = MB_NUM_ORDERS(sb) * sizeof(*sbi->s_mb_maxs); sbi->s_mb_maxs = kmalloc(i, GFP_KERNEL); if (sbi->s_mb_maxs == NULL) { ret = -ENOMEM; @@ -2847,10 +3349,30 @@ int ext4_mb_init(struct super_block *sb) offset_incr = offset_incr >> 1; max = max >> 1; i++; - } while (i <= sb->s_blocksize_bits + 1); + } while (i < MB_NUM_ORDERS(sb)); + + sbi->s_mb_avg_fragment_size_root = RB_ROOT; + sbi->s_mb_largest_free_orders = + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(struct list_head), + GFP_KERNEL); + if (!sbi->s_mb_largest_free_orders) { + ret = -ENOMEM; + goto out; + } + sbi->s_mb_largest_free_orders_locks = + kmalloc_array(MB_NUM_ORDERS(sb), sizeof(rwlock_t), + GFP_KERNEL); + if (!sbi->s_mb_largest_free_orders_locks) { + ret = -ENOMEM; + goto out; + } + for (i = 0; i < MB_NUM_ORDERS(sb); i++) { + INIT_LIST_HEAD(&sbi->s_mb_largest_free_orders[i]); + rwlock_init(&sbi->s_mb_largest_free_orders_locks[i]); + } + rwlock_init(&sbi->s_mb_rb_lock); spin_lock_init(&sbi->s_md_lock); - spin_lock_init(&sbi->s_bal_lock); sbi->s_mb_free_pending = 0; INIT_LIST_HEAD(&sbi->s_freed_data_list); @@ -2901,6 +3423,10 @@ int ext4_mb_init(struct super_block *sb) spin_lock_init(&lg->lg_prealloc_lock); } + if (blk_queue_nonrot(bdev_get_queue(sb->s_bdev))) + sbi->s_mb_max_linear_groups = 0; + else + sbi->s_mb_max_linear_groups = MB_DEFAULT_LINEAR_LIMIT; /* init file for buddy data */ ret = ext4_mb_init_backend(sb); if (ret != 0) @@ -2912,6 +3438,8 @@ out_free_locality_groups: free_percpu(sbi->s_locality_groups); sbi->s_locality_groups = NULL; out: + kfree(sbi->s_mb_largest_free_orders); + kfree(sbi->s_mb_largest_free_orders_locks); kfree(sbi->s_mb_offsets); sbi->s_mb_offsets = NULL; kfree(sbi->s_mb_maxs); @@ -2968,6 +3496,8 @@ int ext4_mb_release(struct super_block *sb) kvfree(group_info); rcu_read_unlock(); } + kfree(sbi->s_mb_largest_free_orders); + kfree(sbi->s_mb_largest_free_orders_locks); kfree(sbi->s_mb_offsets); kfree(sbi->s_mb_maxs); iput(sbi->s_buddy_cache); @@ -2978,17 +3508,18 @@ int ext4_mb_release(struct super_block *sb) atomic_read(&sbi->s_bal_reqs), atomic_read(&sbi->s_bal_success)); ext4_msg(sb, KERN_INFO, - "mballoc: %u extents scanned, %u goal hits, " + "mballoc: %u extents scanned, %u groups scanned, %u goal hits, " "%u 2^N hits, %u breaks, %u lost", atomic_read(&sbi->s_bal_ex_scanned), + atomic_read(&sbi->s_bal_groups_scanned), atomic_read(&sbi->s_bal_goals), atomic_read(&sbi->s_bal_2orders), atomic_read(&sbi->s_bal_breaks), atomic_read(&sbi->s_mb_lost_chunks)); ext4_msg(sb, KERN_INFO, - "mballoc: %lu generated and it took %Lu", - sbi->s_mb_buddies_generated, - sbi->s_mb_generation_time); + "mballoc: %u generated and it took %llu", + atomic_read(&sbi->s_mb_buddies_generated), + atomic64_read(&sbi->s_mb_generation_time)); ext4_msg(sb, KERN_INFO, "mballoc: %u preallocated, %u discarded", atomic_read(&sbi->s_mb_preallocated), @@ -3583,12 +4114,13 @@ static void ext4_mb_collect_stats(struct ext4_allocation_context *ac) { struct ext4_sb_info *sbi = EXT4_SB(ac->ac_sb); - if (sbi->s_mb_stats && ac->ac_g_ex.fe_len > 1) { + if (sbi->s_mb_stats && ac->ac_g_ex.fe_len >= 1) { atomic_inc(&sbi->s_bal_reqs); atomic_add(ac->ac_b_ex.fe_len, &sbi->s_bal_allocated); if (ac->ac_b_ex.fe_len >= ac->ac_o_ex.fe_len) atomic_inc(&sbi->s_bal_success); atomic_add(ac->ac_found, &sbi->s_bal_ex_scanned); + atomic_add(ac->ac_groups_scanned, &sbi->s_bal_groups_scanned); if (ac->ac_g_ex.fe_start == ac->ac_b_ex.fe_start && ac->ac_g_ex.fe_group == ac->ac_b_ex.fe_group) atomic_inc(&sbi->s_bal_goals); diff --git a/fs/ext4/mballoc.h b/fs/ext4/mballoc.h index e75b4749aa1c..39da92ceabf8 100644 --- a/fs/ext4/mballoc.h +++ b/fs/ext4/mballoc.h @@ -59,7 +59,7 @@ * by the stream allocator, which purpose is to pack requests * as close each to other as possible to produce smooth I/O traffic * We use locality group prealloc space for stream request. - * We can tune the same via /proc/fs/ext4/<parition>/stream_req + * We can tune the same via /proc/fs/ext4/<partition>/stream_req */ #define MB_DEFAULT_STREAM_THRESHOLD 16 /* 64K */ @@ -78,6 +78,23 @@ */ #define MB_DEFAULT_MAX_INODE_PREALLOC 512 +/* + * Number of groups to search linearly before performing group scanning + * optimization. + */ +#define MB_DEFAULT_LINEAR_LIMIT 4 + +/* + * Minimum number of groups that should be present in the file system to perform + * group scanning optimizations. + */ +#define MB_DEFAULT_LINEAR_SCAN_THRESHOLD 16 + +/* + * Number of valid buddy orders + */ +#define MB_NUM_ORDERS(sb) ((sb)->s_blocksize_bits + 2) + struct ext4_free_data { /* this links the free block information from sb_info */ struct list_head efd_list; @@ -161,11 +178,14 @@ struct ext4_allocation_context { /* copy of the best found extent taken before preallocation efforts */ struct ext4_free_extent ac_f_ex; + ext4_group_t ac_last_optimal_group; + __u32 ac_groups_considered; + __u32 ac_flags; /* allocation hints */ __u16 ac_groups_scanned; + __u16 ac_groups_linear_remaining; __u16 ac_found; __u16 ac_tail; __u16 ac_buddy; - __u16 ac_flags; /* allocation hints */ __u8 ac_status; __u8 ac_criteria; __u8 ac_2order; /* if request is to allocate 2^N blocks and diff --git a/fs/ext4/migrate.c b/fs/ext4/migrate.c index c5e3fc998211..7e0b4f81c6c0 100644 --- a/fs/ext4/migrate.c +++ b/fs/ext4/migrate.c @@ -32,7 +32,7 @@ static int finish_range(handle_t *handle, struct inode *inode, newext.ee_block = cpu_to_le32(lb->first_block); newext.ee_len = cpu_to_le16(lb->last_block - lb->first_block + 1); ext4_ext_store_pblock(&newext, lb->first_pblock); - /* Locking only for convinience since we are operating on temp inode */ + /* Locking only for convenience since we are operating on temp inode */ down_write(&EXT4_I(inode)->i_data_sem); path = ext4_find_extent(inode, lb->first_block, NULL, 0); if (IS_ERR(path)) { @@ -43,8 +43,8 @@ static int finish_range(handle_t *handle, struct inode *inode, /* * Calculate the credit needed to inserting this extent - * Since we are doing this in loop we may accumalate extra - * credit. But below we try to not accumalate too much + * Since we are doing this in loop we may accumulate extra + * credit. But below we try to not accumulate too much * of them by restarting the journal. */ needed = ext4_ext_calc_credits_for_single_extent(inode, diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 795c3ff2907c..68fbeedd627b 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -56,7 +56,7 @@ static int write_mmp_block(struct super_block *sb, struct buffer_head *bh) wait_on_buffer(bh); sb_end_write(sb); if (unlikely(!buffer_uptodate(bh))) - return 1; + return -EIO; return 0; } diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index a37a19fabee4..afb9d05a99ba 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -280,9 +280,11 @@ static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, unsigned blocksize, struct dx_hash_info *hinfo, struct dx_map_entry map[]); static void dx_sort_map(struct dx_map_entry *map, unsigned count); -static struct ext4_dir_entry_2 *dx_move_dirents(char *from, char *to, - struct dx_map_entry *offsets, int count, unsigned blocksize); -static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize); +static struct ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from, + char *to, struct dx_map_entry *offsets, + int count, unsigned int blocksize); +static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, + unsigned int blocksize); static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block); static int ext4_htree_next_block(struct inode *dir, __u32 hash, @@ -574,8 +576,9 @@ static inline void dx_set_limit(struct dx_entry *entries, unsigned value) static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) { - unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(1) - - EXT4_DIR_REC_LEN(2) - infosize; + unsigned int entry_space = dir->i_sb->s_blocksize - + ext4_dir_rec_len(1, NULL) - + ext4_dir_rec_len(2, NULL) - infosize; if (ext4_has_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); @@ -584,7 +587,8 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) static inline unsigned dx_node_limit(struct inode *dir) { - unsigned entry_space = dir->i_sb->s_blocksize - EXT4_DIR_REC_LEN(0); + unsigned int entry_space = dir->i_sb->s_blocksize - + ext4_dir_rec_len(0, dir); if (ext4_has_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); @@ -673,7 +677,10 @@ static struct stats dx_show_leaf(struct inode *dir, name = fname_crypto_str.name; len = fname_crypto_str.len; } - ext4fs_dirhash(dir, de->name, + if (IS_CASEFOLDED(dir)) + h.hash = EXT4_DIRENT_HASH(de); + else + ext4fs_dirhash(dir, de->name, de->name_len, &h); printk("%*.s:(E)%x.%u ", len, name, h.hash, (unsigned) ((char *) de @@ -689,7 +696,7 @@ static struct stats dx_show_leaf(struct inode *dir, (unsigned) ((char *) de - base)); #endif } - space += EXT4_DIR_REC_LEN(de->name_len); + space += ext4_dir_rec_len(de->name_len, dir); names++; } de = ext4_next_entry(de, size); @@ -784,18 +791,34 @@ dx_probe(struct ext4_filename *fname, struct inode *dir, root = (struct dx_root *) frame->bh->b_data; if (root->info.hash_version != DX_HASH_TEA && root->info.hash_version != DX_HASH_HALF_MD4 && - root->info.hash_version != DX_HASH_LEGACY) { + root->info.hash_version != DX_HASH_LEGACY && + root->info.hash_version != DX_HASH_SIPHASH) { ext4_warning_inode(dir, "Unrecognised inode hash code %u", root->info.hash_version); goto fail; } + if (ext4_hash_in_dirent(dir)) { + if (root->info.hash_version != DX_HASH_SIPHASH) { + ext4_warning_inode(dir, + "Hash in dirent, but hash is not SIPHASH"); + goto fail; + } + } else { + if (root->info.hash_version == DX_HASH_SIPHASH) { + ext4_warning_inode(dir, + "Hash code is SIPHASH, but hash not in dirent"); + goto fail; + } + } if (fname) hinfo = &fname->hinfo; hinfo->hash_version = root->info.hash_version; if (hinfo->hash_version <= DX_HASH_TEA) hinfo->hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; hinfo->seed = EXT4_SB(dir->i_sb)->s_hash_seed; - if (fname && fname_name(fname)) + /* hash is already computed for encrypted casefolded directory */ + if (fname && fname_name(fname) && + !(IS_ENCRYPTED(dir) && IS_CASEFOLDED(dir))) ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), hinfo); hash = hinfo->hash; @@ -956,7 +979,7 @@ static int ext4_htree_next_block(struct inode *dir, __u32 hash, * If the hash is 1, then continue only if the next page has a * continuation hash of any value. This is used for readdir * handling. Otherwise, check to see if the hash matches the - * desired contiuation hash. If it doesn't, return since + * desired continuation hash. If it doesn't, return since * there's no point to read in the successive index pages. */ bhash = dx_get_hash(p->at); @@ -997,6 +1020,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, struct ext4_dir_entry_2 *de, *top; int err = 0, count = 0; struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str; + int csum = ext4_has_metadata_csum(dir->i_sb); dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); @@ -1005,9 +1029,11 @@ static int htree_dirblock_to_tree(struct file *dir_file, return PTR_ERR(bh); de = (struct ext4_dir_entry_2 *) bh->b_data; + /* csum entries are not larger in the casefolded encrypted case */ top = (struct ext4_dir_entry_2 *) ((char *) de + dir->i_sb->s_blocksize - - EXT4_DIR_REC_LEN(0)); + ext4_dir_rec_len(0, + csum ? NULL : dir)); /* Check if the directory is encrypted */ if (IS_ENCRYPTED(dir)) { err = fscrypt_prepare_readdir(dir); @@ -1031,7 +1057,17 @@ static int htree_dirblock_to_tree(struct file *dir_file, /* silently ignore the rest of the block */ break; } - ext4fs_dirhash(dir, de->name, de->name_len, hinfo); + if (ext4_hash_in_dirent(dir)) { + if (de->name_len && de->inode) { + hinfo->hash = EXT4_DIRENT_HASH(de); + hinfo->minor_hash = EXT4_DIRENT_MINOR_HASH(de); + } else { + hinfo->hash = 0; + hinfo->minor_hash = 0; + } + } else { + ext4fs_dirhash(dir, de->name, de->name_len, hinfo); + } if ((hinfo->hash < start_hash) || ((hinfo->hash == start_hash) && (hinfo->minor_hash < start_minor_hash))) @@ -1100,7 +1136,11 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash, start_hash, start_minor_hash)); dir = file_inode(dir_file); if (!(ext4_test_inode_flag(dir, EXT4_INODE_INDEX))) { - hinfo.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; + if (ext4_hash_in_dirent(dir)) + hinfo.hash_version = DX_HASH_SIPHASH; + else + hinfo.hash_version = + EXT4_SB(dir->i_sb)->s_def_hash_version; if (hinfo.hash_version <= DX_HASH_TEA) hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; @@ -1218,7 +1258,10 @@ static int dx_make_map(struct inode *dir, struct ext4_dir_entry_2 *de, while ((char *) de < base + blocksize) { if (de->name_len && de->inode) { - ext4fs_dirhash(dir, de->name, de->name_len, &h); + if (ext4_hash_in_dirent(dir)) + h.hash = EXT4_DIRENT_HASH(de); + else + ext4fs_dirhash(dir, de->name, de->name_len, &h); map_tail--; map_tail->hash = h.hash; map_tail->offs = ((char *) de - base)>>2; @@ -1282,47 +1325,65 @@ static void dx_insert_block(struct dx_frame *frame, u32 hash, ext4_lblk_t block) * Returns: 0 if the directory entry matches, more than 0 if it * doesn't match or less than zero on error. */ -int ext4_ci_compare(const struct inode *parent, const struct qstr *name, - const struct qstr *entry, bool quick) +static int ext4_ci_compare(const struct inode *parent, const struct qstr *name, + u8 *de_name, size_t de_name_len, bool quick) { const struct super_block *sb = parent->i_sb; const struct unicode_map *um = sb->s_encoding; + struct fscrypt_str decrypted_name = FSTR_INIT(NULL, de_name_len); + struct qstr entry = QSTR_INIT(de_name, de_name_len); int ret; + if (IS_ENCRYPTED(parent)) { + const struct fscrypt_str encrypted_name = + FSTR_INIT(de_name, de_name_len); + + decrypted_name.name = kmalloc(de_name_len, GFP_KERNEL); + if (!decrypted_name.name) + return -ENOMEM; + ret = fscrypt_fname_disk_to_usr(parent, 0, 0, &encrypted_name, + &decrypted_name); + if (ret < 0) + goto out; + entry.name = decrypted_name.name; + entry.len = decrypted_name.len; + } + if (quick) - ret = utf8_strncasecmp_folded(um, name, entry); + ret = utf8_strncasecmp_folded(um, name, &entry); else - ret = utf8_strncasecmp(um, name, entry); - + ret = utf8_strncasecmp(um, name, &entry); if (ret < 0) { /* Handle invalid character sequence as either an error * or as an opaque byte sequence. */ if (sb_has_strict_encoding(sb)) - return -EINVAL; - - if (name->len != entry->len) - return 1; - - return !!memcmp(name->name, entry->name, name->len); + ret = -EINVAL; + else if (name->len != entry.len) + ret = 1; + else + ret = !!memcmp(name->name, entry.name, entry.len); } - +out: + kfree(decrypted_name.name); return ret; } -void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, - struct fscrypt_str *cf_name) +int ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, + struct ext4_filename *name) { + struct fscrypt_str *cf_name = &name->cf_name; + struct dx_hash_info *hinfo = &name->hinfo; int len; if (!IS_CASEFOLDED(dir) || !dir->i_sb->s_encoding) { cf_name->name = NULL; - return; + return 0; } cf_name->name = kmalloc(EXT4_NAME_LEN, GFP_NOFS); if (!cf_name->name) - return; + return -ENOMEM; len = utf8_casefold(dir->i_sb->s_encoding, iname, cf_name->name, @@ -1330,10 +1391,18 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, if (len <= 0) { kfree(cf_name->name); cf_name->name = NULL; - return; } cf_name->len = (unsigned) len; + if (!IS_ENCRYPTED(dir)) + return 0; + hinfo->hash_version = DX_HASH_SIPHASH; + hinfo->seed = NULL; + if (cf_name->name) + ext4fs_dirhash(dir, cf_name->name, cf_name->len, hinfo); + else + ext4fs_dirhash(dir, iname->name, iname->len, hinfo); + return 0; } #endif @@ -1342,14 +1411,11 @@ void ext4_fname_setup_ci_filename(struct inode *dir, const struct qstr *iname, * * Return: %true if the directory entry matches, otherwise %false. */ -static inline bool ext4_match(const struct inode *parent, +static bool ext4_match(struct inode *parent, const struct ext4_filename *fname, - const struct ext4_dir_entry_2 *de) + struct ext4_dir_entry_2 *de) { struct fscrypt_name f; -#ifdef CONFIG_UNICODE - const struct qstr entry = {.name = de->name, .len = de->name_len}; -#endif if (!de->inode) return false; @@ -1365,10 +1431,19 @@ static inline bool ext4_match(const struct inode *parent, if (fname->cf_name.name) { struct qstr cf = {.name = fname->cf_name.name, .len = fname->cf_name.len}; - return !ext4_ci_compare(parent, &cf, &entry, true); + if (IS_ENCRYPTED(parent)) { + if (fname->hinfo.hash != EXT4_DIRENT_HASH(de) || + fname->hinfo.minor_hash != + EXT4_DIRENT_MINOR_HASH(de)) { + + return 0; + } + } + return !ext4_ci_compare(parent, &cf, de->name, + de->name_len, true); } - return !ext4_ci_compare(parent, fname->usr_fname, &entry, - false); + return !ext4_ci_compare(parent, fname->usr_fname, de->name, + de->name_len, false); } #endif @@ -1739,11 +1814,10 @@ static struct dentry *ext4_lookup(struct inode *dir, struct dentry *dentry, unsi struct dentry *ext4_get_parent(struct dentry *child) { __u32 ino; - static const struct qstr dotdot = QSTR_INIT("..", 2); struct ext4_dir_entry_2 * de; struct buffer_head *bh; - bh = ext4_find_entry(d_inode(child), &dotdot, &de, NULL); + bh = ext4_find_entry(d_inode(child), &dotdot_name, &de, NULL); if (IS_ERR(bh)) return ERR_CAST(bh); if (!bh) @@ -1765,7 +1839,8 @@ struct dentry *ext4_get_parent(struct dentry *child) * Returns pointer to last entry moved. */ static struct ext4_dir_entry_2 * -dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, +dx_move_dirents(struct inode *dir, char *from, char *to, + struct dx_map_entry *map, int count, unsigned blocksize) { unsigned rec_len = 0; @@ -1773,11 +1848,19 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, while (count--) { struct ext4_dir_entry_2 *de = (struct ext4_dir_entry_2 *) (from + (map->offs<<2)); - rec_len = EXT4_DIR_REC_LEN(de->name_len); + rec_len = ext4_dir_rec_len(de->name_len, dir); + memcpy (to, de, rec_len); ((struct ext4_dir_entry_2 *) to)->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); + + /* wipe dir_entry excluding the rec_len field */ de->inode = 0; + memset(&de->name_len, 0, ext4_rec_len_from_disk(de->rec_len, + blocksize) - + offsetof(struct ext4_dir_entry_2, + name_len)); + map++; to += rec_len; } @@ -1788,7 +1871,8 @@ dx_move_dirents(char *from, char *to, struct dx_map_entry *map, int count, * Compact each dir entry in the range to the minimal rec_len. * Returns pointer to last entry in range. */ -static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) +static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, + unsigned int blocksize) { struct ext4_dir_entry_2 *next, *to, *prev, *de = (struct ext4_dir_entry_2 *) base; unsigned rec_len = 0; @@ -1797,7 +1881,7 @@ static struct ext4_dir_entry_2* dx_pack_dirents(char *base, unsigned blocksize) while ((char*)de < base + blocksize) { next = ext4_next_entry(de, blocksize); if (de->inode && de->name_len) { - rec_len = EXT4_DIR_REC_LEN(de->name_len); + rec_len = ext4_dir_rec_len(de->name_len, dir); if (de > to) memmove(to, de, rec_len); to->rec_len = ext4_rec_len_to_disk(rec_len, blocksize); @@ -1887,9 +1971,9 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, hash2, split, count-split)); /* Fancy dance to stay within two buffers */ - de2 = dx_move_dirents(data1, data2, map + split, count - split, + de2 = dx_move_dirents(dir, data1, data2, map + split, count - split, blocksize); - de = dx_pack_dirents(data1, blocksize); + de = dx_pack_dirents(dir, data1, blocksize); de->rec_len = ext4_rec_len_to_disk(data1 + (blocksize - csum_size) - (char *) de, blocksize); @@ -1937,7 +2021,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, struct ext4_dir_entry_2 **dest_de) { struct ext4_dir_entry_2 *de; - unsigned short reclen = EXT4_DIR_REC_LEN(fname_len(fname)); + unsigned short reclen = ext4_dir_rec_len(fname_len(fname), dir); int nlen, rlen; unsigned int offset = 0; char *top; @@ -1950,7 +2034,7 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, return -EFSCORRUPTED; if (ext4_match(dir, fname, de)) return -EEXIST; - nlen = EXT4_DIR_REC_LEN(de->name_len); + nlen = ext4_dir_rec_len(de->name_len, dir); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); if ((de->inode ? rlen - nlen : rlen) >= reclen) break; @@ -1964,7 +2048,8 @@ int ext4_find_dest_de(struct inode *dir, struct inode *inode, return 0; } -void ext4_insert_dentry(struct inode *inode, +void ext4_insert_dentry(struct inode *dir, + struct inode *inode, struct ext4_dir_entry_2 *de, int buf_size, struct ext4_filename *fname) @@ -1972,7 +2057,7 @@ void ext4_insert_dentry(struct inode *inode, int nlen, rlen; - nlen = EXT4_DIR_REC_LEN(de->name_len); + nlen = ext4_dir_rec_len(de->name_len, dir); rlen = ext4_rec_len_from_disk(de->rec_len, buf_size); if (de->inode) { struct ext4_dir_entry_2 *de1 = @@ -1986,6 +2071,13 @@ void ext4_insert_dentry(struct inode *inode, ext4_set_de_type(inode->i_sb, de, inode->i_mode); de->name_len = fname_len(fname); memcpy(de->name, fname_name(fname), fname_len(fname)); + if (ext4_hash_in_dirent(dir)) { + struct dx_hash_info *hinfo = &fname->hinfo; + + EXT4_DIRENT_HASHES(de)->hash = cpu_to_le32(hinfo->hash); + EXT4_DIRENT_HASHES(de)->minor_hash = + cpu_to_le32(hinfo->minor_hash); + } } /* @@ -2022,7 +2114,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, } /* By now the buffer is marked for journaling */ - ext4_insert_dentry(inode, de, blocksize, fname); + ext4_insert_dentry(dir, inode, de, blocksize, fname); /* * XXX shouldn't update any times until successful @@ -2102,6 +2194,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, data2 = bh2->b_data; memcpy(data2, de, len); + memset(de, 0, len); /* wipe old data */ de = (struct ext4_dir_entry_2 *) data2; top = data2 + len; while ((char *)(de2 = ext4_next_entry(de, blocksize)) < top) @@ -2114,11 +2207,16 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, /* Initialize the root; the dot dirents already exist */ de = (struct ext4_dir_entry_2 *) (&root->dotdot); - de->rec_len = ext4_rec_len_to_disk(blocksize - EXT4_DIR_REC_LEN(2), - blocksize); + de->rec_len = ext4_rec_len_to_disk( + blocksize - ext4_dir_rec_len(2, NULL), blocksize); memset (&root->info, 0, sizeof(root->info)); root->info.info_length = sizeof(root->info); - root->info.hash_version = EXT4_SB(dir->i_sb)->s_def_hash_version; + if (ext4_hash_in_dirent(dir)) + root->info.hash_version = DX_HASH_SIPHASH; + else + root->info.hash_version = + EXT4_SB(dir->i_sb)->s_def_hash_version; + entries = root->entries; dx_set_block(entries, 1); dx_set_count(entries, 1); @@ -2129,7 +2227,11 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, if (fname->hinfo.hash_version <= DX_HASH_TEA) fname->hinfo.hash_version += EXT4_SB(dir->i_sb)->s_hash_unsigned; fname->hinfo.seed = EXT4_SB(dir->i_sb)->s_hash_seed; - ext4fs_dirhash(dir, fname_name(fname), fname_len(fname), &fname->hinfo); + + /* casefolded encrypted hashes are computed on fname setup */ + if (!ext4_hash_in_dirent(dir)) + ext4fs_dirhash(dir, fname_name(fname), + fname_len(fname), &fname->hinfo); memset(frames, 0, sizeof(frames)); frame = frames; @@ -2139,10 +2241,10 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, retval = ext4_handle_dirty_dx_node(handle, dir, frame->bh); if (retval) - goto out_frames; + goto out_frames; retval = ext4_handle_dirty_dirblock(handle, dir, bh2); if (retval) - goto out_frames; + goto out_frames; de = do_split(handle,dir, &bh2, frame, &fname->hinfo); if (IS_ERR(de)) { @@ -2482,15 +2584,27 @@ int ext4_generic_delete_entry(struct inode *dir, entry_buf, buf_size, i)) return -EFSCORRUPTED; if (de == de_del) { - if (pde) + if (pde) { pde->rec_len = ext4_rec_len_to_disk( ext4_rec_len_from_disk(pde->rec_len, blocksize) + ext4_rec_len_from_disk(de->rec_len, blocksize), blocksize); - else + + /* wipe entire dir_entry */ + memset(de, 0, ext4_rec_len_from_disk(de->rec_len, + blocksize)); + } else { + /* wipe dir_entry excluding the rec_len field */ de->inode = 0; + memset(&de->name_len, 0, + ext4_rec_len_from_disk(de->rec_len, + blocksize) - + offsetof(struct ext4_dir_entry_2, + name_len)); + } + inode_inc_iversion(dir); return 0; } @@ -2722,7 +2836,7 @@ struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, { de->inode = cpu_to_le32(inode->i_ino); de->name_len = 1; - de->rec_len = ext4_rec_len_to_disk(EXT4_DIR_REC_LEN(de->name_len), + de->rec_len = ext4_rec_len_to_disk(ext4_dir_rec_len(de->name_len, NULL), blocksize); strcpy(de->name, "."); ext4_set_de_type(inode->i_sb, de, S_IFDIR); @@ -2732,11 +2846,12 @@ struct ext4_dir_entry_2 *ext4_init_dot_dotdot(struct inode *inode, de->name_len = 2; if (!dotdot_real_len) de->rec_len = ext4_rec_len_to_disk(blocksize - - (csum_size + EXT4_DIR_REC_LEN(1)), + (csum_size + ext4_dir_rec_len(1, NULL)), blocksize); else de->rec_len = ext4_rec_len_to_disk( - EXT4_DIR_REC_LEN(de->name_len), blocksize); + ext4_dir_rec_len(de->name_len, NULL), + blocksize); strcpy(de->name, ".."); ext4_set_de_type(inode->i_sb, de, S_IFDIR); @@ -2869,7 +2984,8 @@ bool ext4_empty_dir(struct inode *inode) } sb = inode->i_sb; - if (inode->i_size < EXT4_DIR_REC_LEN(1) + EXT4_DIR_REC_LEN(2)) { + if (inode->i_size < ext4_dir_rec_len(1, NULL) + + ext4_dir_rec_len(2, NULL)) { EXT4_ERROR_INODE(inode, "invalid size"); return true; } @@ -3372,7 +3488,7 @@ static int ext4_symlink(struct user_namespace *mnt_userns, struct inode *dir, * for transaction commit if we are running out of space * and thus we deadlock. So we have to stop transaction now * and restart it when symlink contents is written. - * + * * To keep fs consistent in case of crash, we have to put inode * to orphan list in the mean time. */ diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 3868377dec2d..7dc94f3e18e6 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -667,9 +667,6 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, ext4_commit_super(sb); } - if (sb_rdonly(sb) || continue_fs) - return; - /* * We force ERRORS_RO behavior when system is rebooting. Otherwise we * could panic during 'reboot -f' as the underlying device got already @@ -679,6 +676,10 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, panic("EXT4-fs (device %s): panic forced after error\n", sb->s_id); } + + if (sb_rdonly(sb) || continue_fs) + return; + ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* * Make sure updated value of ->s_mount_flags will be visible before @@ -1688,7 +1689,7 @@ enum { Opt_dioread_nolock, Opt_dioread_lock, Opt_discard, Opt_nodiscard, Opt_init_itable, Opt_noinit_itable, Opt_max_dir_size_kb, Opt_nojournal_checksum, Opt_nombcache, - Opt_prefetch_block_bitmaps, + Opt_no_prefetch_block_bitmaps, Opt_mb_optimize_scan, #ifdef CONFIG_EXT4_DEBUG Opt_fc_debug_max_replay, Opt_fc_debug_force #endif @@ -1788,7 +1789,9 @@ static const match_table_t tokens = { {Opt_inlinecrypt, "inlinecrypt"}, {Opt_nombcache, "nombcache"}, {Opt_nombcache, "no_mbcache"}, /* for backward compatibility */ - {Opt_prefetch_block_bitmaps, "prefetch_block_bitmaps"}, + {Opt_removed, "prefetch_block_bitmaps"}, + {Opt_no_prefetch_block_bitmaps, "no_prefetch_block_bitmaps"}, + {Opt_mb_optimize_scan, "mb_optimize_scan=%d"}, {Opt_removed, "check=none"}, /* mount option from ext2/3 */ {Opt_removed, "nocheck"}, /* mount option from ext2/3 */ {Opt_removed, "reservation"}, /* mount option from ext2/3 */ @@ -1821,6 +1824,8 @@ static ext4_fsblk_t get_sb_block(void **data) } #define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) +#define DEFAULT_MB_OPTIMIZE_SCAN (-1) + static const char deprecated_msg[] = "Mount option \"%s\" will be removed by %s\n" "Contact linux-ext4@vger.kernel.org if you think we should keep it.\n"; @@ -2007,8 +2012,9 @@ static const struct mount_opts { {Opt_max_dir_size_kb, 0, MOPT_GTE0}, {Opt_test_dummy_encryption, 0, MOPT_STRING}, {Opt_nombcache, EXT4_MOUNT_NO_MBCACHE, MOPT_SET}, - {Opt_prefetch_block_bitmaps, EXT4_MOUNT_PREFETCH_BLOCK_BITMAPS, + {Opt_no_prefetch_block_bitmaps, EXT4_MOUNT_NO_PREFETCH_BLOCK_BITMAPS, MOPT_SET}, + {Opt_mb_optimize_scan, EXT4_MOUNT2_MB_OPTIMIZE_SCAN, MOPT_GTE0}, #ifdef CONFIG_EXT4_DEBUG {Opt_fc_debug_force, EXT4_MOUNT2_JOURNAL_FAST_COMMIT, MOPT_SET | MOPT_2 | MOPT_EXT4_ONLY}, @@ -2090,9 +2096,15 @@ static int ext4_set_test_dummy_encryption(struct super_block *sb, return 1; } +struct ext4_parsed_options { + unsigned long journal_devnum; + unsigned int journal_ioprio; + int mb_optimize_scan; +}; + static int handle_mount_opt(struct super_block *sb, char *opt, int token, - substring_t *args, unsigned long *journal_devnum, - unsigned int *journal_ioprio, int is_remount) + substring_t *args, struct ext4_parsed_options *parsed_opts, + int is_remount) { struct ext4_sb_info *sbi = EXT4_SB(sb); const struct mount_opts *m; @@ -2249,7 +2261,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, "Cannot specify journal on remount"); return -1; } - *journal_devnum = arg; + parsed_opts->journal_devnum = arg; } else if (token == Opt_journal_path) { char *journal_path; struct inode *journal_inode; @@ -2285,7 +2297,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, return -1; } - *journal_devnum = new_encode_dev(journal_inode->i_rdev); + parsed_opts->journal_devnum = new_encode_dev(journal_inode->i_rdev); path_put(&path); kfree(journal_path); } else if (token == Opt_journal_ioprio) { @@ -2294,7 +2306,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, " (must be 0-7)"); return -1; } - *journal_ioprio = + parsed_opts->journal_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, arg); } else if (token == Opt_test_dummy_encryption) { return ext4_set_test_dummy_encryption(sb, opt, &args[0], @@ -2384,6 +2396,13 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, sbi->s_mount_opt |= m->mount_opt; } else if (token == Opt_data_err_ignore) { sbi->s_mount_opt &= ~m->mount_opt; + } else if (token == Opt_mb_optimize_scan) { + if (arg != 0 && arg != 1) { + ext4_msg(sb, KERN_WARNING, + "mb_optimize_scan should be set to 0 or 1."); + return -1; + } + parsed_opts->mb_optimize_scan = arg; } else { if (!args->from) arg = 1; @@ -2411,8 +2430,7 @@ static int handle_mount_opt(struct super_block *sb, char *opt, int token, } static int parse_options(char *options, struct super_block *sb, - unsigned long *journal_devnum, - unsigned int *journal_ioprio, + struct ext4_parsed_options *ret_opts, int is_remount) { struct ext4_sb_info __maybe_unused *sbi = EXT4_SB(sb); @@ -2432,8 +2450,8 @@ static int parse_options(char *options, struct super_block *sb, */ args[0].to = args[0].from = NULL; token = match_token(p, tokens, args); - if (handle_mount_opt(sb, p, token, args, journal_devnum, - journal_ioprio, is_remount) < 0) + if (handle_mount_opt(sb, p, token, args, ret_opts, + is_remount) < 0) return 0; } #ifdef CONFIG_QUOTA @@ -3023,9 +3041,6 @@ static void ext4_orphan_cleanup(struct super_block *sb, sb->s_flags &= ~SB_RDONLY; } #ifdef CONFIG_QUOTA - /* Needed for iput() to work correctly and not trash data */ - sb->s_flags |= SB_ACTIVE; - /* * Turn on quotas which were not enabled for read-only mounts if * filesystem has quota feature, so that they are updated correctly. @@ -3691,11 +3706,11 @@ static struct ext4_li_request *ext4_li_request_new(struct super_block *sb, elr->lr_super = sb; elr->lr_first_not_zeroed = start; - if (test_opt(sb, PREFETCH_BLOCK_BITMAPS)) - elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; - else { + if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) { elr->lr_mode = EXT4_LI_MODE_ITABLE; elr->lr_next_group = start; + } else { + elr->lr_mode = EXT4_LI_MODE_PREFETCH_BBITMAP; } /* @@ -3726,7 +3741,7 @@ int ext4_register_li_request(struct super_block *sb, goto out; } - if (!test_opt(sb, PREFETCH_BLOCK_BITMAPS) && + if (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && (first_not_zeroed == ngroups || sb_rdonly(sb) || !test_opt(sb, INIT_INODE_TABLE))) goto out; @@ -4015,7 +4030,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) ext4_fsblk_t sb_block = get_sb_block(&data); ext4_fsblk_t logical_sb_block; unsigned long offset = 0; - unsigned long journal_devnum = 0; unsigned long def_mount_opts; struct inode *root; const char *descr; @@ -4026,8 +4040,13 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) int needs_recovery, has_huge_files; __u64 blocks_count; int err = 0; - unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; ext4_group_t first_not_zeroed; + struct ext4_parsed_options parsed_opts; + + /* Set defaults for the variables that will be set during parsing */ + parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + parsed_opts.journal_devnum = 0; + parsed_opts.mb_optimize_scan = DEFAULT_MB_OPTIMIZE_SCAN; if ((data && !orig_data) || !sbi) goto out_free_base; @@ -4273,8 +4292,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) GFP_KERNEL); if (!s_mount_opts) goto failed_mount; - if (!parse_options(s_mount_opts, sb, &journal_devnum, - &journal_ioprio, 0)) { + if (!parse_options(s_mount_opts, sb, &parsed_opts, 0)) { ext4_msg(sb, KERN_WARNING, "failed to parse options in superblock: %s", s_mount_opts); @@ -4282,8 +4300,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) kfree(s_mount_opts); } sbi->s_def_mount_opt = sbi->s_mount_opt; - if (!parse_options((char *) data, sb, &journal_devnum, - &journal_ioprio, 0)) + if (!parse_options((char *) data, sb, &parsed_opts, 0)) goto failed_mount; #ifdef CONFIG_UNICODE @@ -4292,12 +4309,6 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) struct unicode_map *encoding; __u16 encoding_flags; - if (ext4_has_feature_encrypt(sb)) { - ext4_msg(sb, KERN_ERR, - "Can't mount with encoding and encryption"); - goto failed_mount; - } - if (ext4_sb_read_encoding(es, &encoding_info, &encoding_flags)) { ext4_msg(sb, KERN_ERR, @@ -4774,7 +4785,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) * root first: it may be modified in the journal! */ if (!test_opt(sb, NOLOAD) && ext4_has_feature_journal(sb)) { - err = ext4_load_journal(sb, es, journal_devnum); + err = ext4_load_journal(sb, es, parsed_opts.journal_devnum); if (err) goto failed_mount3a; } else if (test_opt(sb, NOLOAD) && !sb_rdonly(sb) && @@ -4874,7 +4885,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent) goto failed_mount_wq; } - set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + set_task_ioprio(sbi->s_journal->j_task, parsed_opts.journal_ioprio); sbi->s_journal->j_submit_inode_data_buffers = ext4_journal_submit_inode_data_buffers; @@ -4980,6 +4991,19 @@ no_journal: ext4_fc_replay_cleanup(sb); ext4_ext_init(sb); + + /* + * Enable optimize_scan if number of groups is > threshold. This can be + * turned off by passing "mb_optimize_scan=0". This can also be + * turned on forcefully by passing "mb_optimize_scan=1". + */ + if (parsed_opts.mb_optimize_scan == 1) + set_opt2(sb, MB_OPTIMIZE_SCAN); + else if (parsed_opts.mb_optimize_scan == 0) + clear_opt2(sb, MB_OPTIMIZE_SCAN); + else if (sbi->s_groups_count >= MB_DEFAULT_LINEAR_SCAN_THRESHOLD) + set_opt2(sb, MB_OPTIMIZE_SCAN); + err = ext4_mb_init(sb); if (err) { ext4_msg(sb, KERN_ERR, "failed to initialize mballoc (%d)", @@ -4996,7 +5020,7 @@ no_journal: ext4_journal_commit_callback; block = ext4_count_free_clusters(sb); - ext4_free_blocks_count_set(sbi->s_es, + ext4_free_blocks_count_set(sbi->s_es, EXT4_C2B(sbi, block)); err = percpu_counter_init(&sbi->s_freeclusters_counter, block, GFP_KERNEL); @@ -5561,8 +5585,10 @@ static int ext4_commit_super(struct super_block *sb) struct buffer_head *sbh = EXT4_SB(sb)->s_sbh; int error = 0; - if (!sbh || block_device_ejected(sb)) - return error; + if (!sbh) + return -EINVAL; + if (block_device_ejected(sb)) + return -ENODEV; ext4_update_super(sb); @@ -5813,13 +5839,16 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) struct ext4_mount_options old_opts; int enable_quota = 0; ext4_group_t g; - unsigned int journal_ioprio = DEFAULT_JOURNAL_IOPRIO; int err = 0; #ifdef CONFIG_QUOTA int i, j; char *to_free[EXT4_MAXQUOTAS]; #endif char *orig_data = kstrdup(data, GFP_KERNEL); + struct ext4_parsed_options parsed_opts; + + parsed_opts.journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + parsed_opts.journal_devnum = 0; if (data && !orig_data) return -ENOMEM; @@ -5850,7 +5879,8 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) old_opts.s_qf_names[i] = NULL; #endif if (sbi->s_journal && sbi->s_journal->j_task->io_context) - journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; + parsed_opts.journal_ioprio = + sbi->s_journal->j_task->io_context->ioprio; /* * Some options can be enabled by ext4 and/or by VFS mount flag @@ -5860,7 +5890,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) vfs_flags = SB_LAZYTIME | SB_I_VERSION; sb->s_flags = (sb->s_flags & ~vfs_flags) | (*flags & vfs_flags); - if (!parse_options(data, sb, NULL, &journal_ioprio, 1)) { + if (!parse_options(data, sb, &parsed_opts, 1)) { err = -EINVAL; goto restore_opts; } @@ -5910,7 +5940,7 @@ static int ext4_remount(struct super_block *sb, int *flags, char *data) if (sbi->s_journal) { ext4_init_journal_params(sb, sbi->s_journal); - set_task_ioprio(sbi->s_journal->j_task, journal_ioprio); + set_task_ioprio(sbi->s_journal->j_task, parsed_opts.journal_ioprio); } /* Flush outstanding errors before changing fs state */ diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index a3d08276d441..6f825dedc3d4 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -215,6 +215,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs); EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request); EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc); EXT4_RW_ATTR_SBI_UI(mb_max_inode_prealloc, s_mb_max_inode_prealloc); +EXT4_RW_ATTR_SBI_UI(mb_max_linear_groups, s_mb_max_linear_groups); EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb); EXT4_ATTR(trigger_fs_error, 0200, trigger_test_error); EXT4_RW_ATTR_SBI_UI(err_ratelimit_interval_ms, s_err_ratelimit_state.interval); @@ -263,6 +264,7 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(mb_stream_req), ATTR_LIST(mb_group_prealloc), ATTR_LIST(mb_max_inode_prealloc), + ATTR_LIST(mb_max_linear_groups), ATTR_LIST(max_writeback_mb_bump), ATTR_LIST(extent_max_zeroout_kb), ATTR_LIST(trigger_fs_error), @@ -313,6 +315,7 @@ EXT4_ATTR_FEATURE(verity); #endif EXT4_ATTR_FEATURE(metadata_csum_seed); EXT4_ATTR_FEATURE(fast_commit); +EXT4_ATTR_FEATURE(encrypted_casefold); static struct attribute *ext4_feat_attrs[] = { ATTR_LIST(lazy_itable_init), @@ -330,6 +333,7 @@ static struct attribute *ext4_feat_attrs[] = { #endif ATTR_LIST(metadata_csum_seed), ATTR_LIST(fast_commit), + ATTR_LIST(encrypted_casefold), NULL, }; ATTRIBUTE_GROUPS(ext4_feat); @@ -528,6 +532,10 @@ int ext4_register_sysfs(struct super_block *sb) ext4_fc_info_show, sb); proc_create_seq_data("mb_groups", S_IRUGO, sbi->s_proc, &ext4_mb_seq_groups_ops, sb); + proc_create_single_data("mb_stats", 0444, sbi->s_proc, + ext4_seq_mb_stats_show, sb); + proc_create_seq_data("mb_structs_summary", 0444, sbi->s_proc, + &ext4_mb_seq_structs_summary_ops, sb); } return 0; } diff --git a/fs/ext4/verity.c b/fs/ext4/verity.c index 07438f46b558..eacbd489e3bf 100644 --- a/fs/ext4/verity.c +++ b/fs/ext4/verity.c @@ -45,16 +45,13 @@ static int pagecache_read(struct inode *inode, void *buf, size_t count, size_t n = min_t(size_t, count, PAGE_SIZE - offset_in_page(pos)); struct page *page; - void *addr; page = read_mapping_page(inode->i_mapping, pos >> PAGE_SHIFT, NULL); if (IS_ERR(page)) return PTR_ERR(page); - addr = kmap_atomic(page); - memcpy(buf, addr + offset_in_page(pos), n); - kunmap_atomic(addr); + memcpy_from_page(buf, page, offset_in_page(pos), n); put_page(page); @@ -80,7 +77,6 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, PAGE_SIZE - offset_in_page(pos)); struct page *page; void *fsdata; - void *addr; int res; res = pagecache_write_begin(NULL, inode->i_mapping, pos, n, 0, @@ -88,9 +84,7 @@ static int pagecache_write(struct inode *inode, const void *buf, size_t count, if (res) return res; - addr = kmap_atomic(page); - memcpy(addr + offset_in_page(pos), buf, n); - kunmap_atomic(addr); + memcpy_to_page(page, offset_in_page(pos), buf, n); res = pagecache_write_end(NULL, inode->i_mapping, pos, n, n, page, fsdata); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 6c1018223c54..10ba4b24a0aa 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -1617,7 +1617,7 @@ static int ext4_xattr_set_entry(struct ext4_xattr_info *i, * If storing the value in an external inode is an option, * reserve space for xattr entries/names in the external * attribute block so that a long value does not occupy the - * whole space and prevent futher entries being added. + * whole space and prevent further entries being added. */ if (ext4_has_feature_ea_inode(inode->i_sb) && new_size && is_block && diff --git a/fs/f2fs/Kconfig b/fs/f2fs/Kconfig index 62e638a49bbf..7669de7b49ce 100644 --- a/fs/f2fs/Kconfig +++ b/fs/f2fs/Kconfig @@ -7,6 +7,13 @@ config F2FS_FS select CRYPTO_CRC32 select F2FS_FS_XATTR if FS_ENCRYPTION select FS_ENCRYPTION_ALGS if FS_ENCRYPTION + select LZ4_COMPRESS if F2FS_FS_LZ4 + select LZ4_DECOMPRESS if F2FS_FS_LZ4 + select LZ4HC_COMPRESS if F2FS_FS_LZ4HC + select LZO_COMPRESS if F2FS_FS_LZO + select LZO_DECOMPRESS if F2FS_FS_LZO + select ZSTD_COMPRESS if F2FS_FS_ZSTD + select ZSTD_DECOMPRESS if F2FS_FS_ZSTD help F2FS is based on Log-structured File System (LFS), which supports versatile "flash-friendly" features. The design has been focused on @@ -94,8 +101,6 @@ config F2FS_FS_COMPRESSION config F2FS_FS_LZO bool "LZO compression support" depends on F2FS_FS_COMPRESSION - select LZO_COMPRESS - select LZO_DECOMPRESS default y help Support LZO compress algorithm, if unsure, say Y. @@ -103,8 +108,6 @@ config F2FS_FS_LZO config F2FS_FS_LZ4 bool "LZ4 compression support" depends on F2FS_FS_COMPRESSION - select LZ4_COMPRESS - select LZ4_DECOMPRESS default y help Support LZ4 compress algorithm, if unsure, say Y. @@ -113,7 +116,6 @@ config F2FS_FS_LZ4HC bool "LZ4HC compression support" depends on F2FS_FS_COMPRESSION depends on F2FS_FS_LZ4 - select LZ4HC_COMPRESS default y help Support LZ4HC compress algorithm, LZ4HC has compatible on-disk @@ -122,8 +124,6 @@ config F2FS_FS_LZ4HC config F2FS_FS_ZSTD bool "ZSTD compression support" depends on F2FS_FS_COMPRESSION - select ZSTD_COMPRESS - select ZSTD_DECOMPRESS default y help Support ZSTD compress algorithm, if unsure, say Y. @@ -132,8 +132,6 @@ config F2FS_FS_LZORLE bool "LZO-RLE compression support" depends on F2FS_FS_COMPRESSION depends on F2FS_FS_LZO - select LZO_COMPRESS - select LZO_DECOMPRESS default y help Support LZO-RLE compress algorithm, if unsure, say Y. diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 965037a9c205..239ad9453b99 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -29,6 +29,7 @@ static inline size_t f2fs_acl_size(int count) static inline int f2fs_acl_count(size_t size) { ssize_t s; + size -= sizeof(struct f2fs_acl_header); s = size - 4 * sizeof(struct f2fs_acl_entry_short); if (s < 0) { diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index be5415a0dbbc..f795049e63d5 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -719,6 +719,7 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) orphan_blk = (struct f2fs_orphan_block *)page_address(page); for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) { nid_t ino = le32_to_cpu(orphan_blk->ino[j]); + err = recover_orphan_inode(sbi, ino); if (err) { f2fs_put_page(page, 1); @@ -1456,7 +1457,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) orphan_blocks); if (__remain_node_summaries(cpc->reason)) - ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS+ + ckpt->cp_pack_total_block_count = cpu_to_le32(F2FS_CP_PACKS + cp_payload_blks + data_sum_blocks + orphan_blocks + NR_CURSEG_NODE_TYPE); else @@ -1818,7 +1819,11 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) llist_add(&req.llnode, &cprc->issue_list); atomic_inc(&cprc->queued_ckpt); - /* update issue_list before we wake up issue_checkpoint thread */ + /* + * update issue_list before we wake up issue_checkpoint thread, + * this smp_mb() pairs with another barrier in ___wait_event(), + * see more details in comments of waitqueue_active(). + */ smp_mb(); if (waitqueue_active(&cprc->ckpt_wait_queue)) diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 77fa342de38f..925a5ca3744a 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -76,12 +76,6 @@ bool f2fs_is_compressed_page(struct page *page) return false; if (IS_ATOMIC_WRITTEN_PAGE(page) || IS_DUMMY_WRITTEN_PAGE(page)) return false; - /* - * page->private may be set with pid. - * pid_max is enough to check if it is traced. - */ - if (IS_IO_TRACED_PAGE(page)) - return false; f2fs_bug_on(F2FS_M_SB(page->mapping), *((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC); @@ -123,19 +117,6 @@ static void f2fs_unlock_rpages(struct compress_ctx *cc, int len) f2fs_drop_rpages(cc, len, true); } -static void f2fs_put_rpages_mapping(struct address_space *mapping, - pgoff_t start, int len) -{ - int i; - - for (i = 0; i < len; i++) { - struct page *page = find_get_page(mapping, start + i); - - put_page(page); - put_page(page); - } -} - static void f2fs_put_rpages_wbc(struct compress_ctx *cc, struct writeback_control *wbc, bool redirty, int unlock) { @@ -164,13 +145,14 @@ int f2fs_init_compress_ctx(struct compress_ctx *cc) return cc->rpages ? 0 : -ENOMEM; } -void f2fs_destroy_compress_ctx(struct compress_ctx *cc) +void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse) { page_array_free(cc->inode, cc->rpages, cc->cluster_size); cc->rpages = NULL; cc->nr_rpages = 0; cc->nr_cpages = 0; - cc->cluster_idx = NULL_CLUSTER; + if (!reuse) + cc->cluster_idx = NULL_CLUSTER; } void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct page *page) @@ -896,7 +878,6 @@ bool f2fs_cluster_can_merge_page(struct compress_ctx *cc, pgoff_t index) static bool __cluster_may_compress(struct compress_ctx *cc) { - struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); loff_t i_size = i_size_read(cc->inode); unsigned nr_pages = DIV_ROUND_UP(i_size, PAGE_SIZE); int i; @@ -904,12 +885,7 @@ static bool __cluster_may_compress(struct compress_ctx *cc) for (i = 0; i < cc->cluster_size; i++) { struct page *page = cc->rpages[i]; - f2fs_bug_on(sbi, !page); - - if (unlikely(f2fs_cp_error(sbi))) - return false; - if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) - return false; + f2fs_bug_on(F2FS_I_SB(cc->inode), !page); /* beyond EOF */ if (page->index >= nr_pages) @@ -1048,7 +1024,7 @@ retry: } if (PageUptodate(page)) - unlock_page(page); + f2fs_put_page(page, 1); else f2fs_compress_ctx_add_page(cc, page); } @@ -1058,33 +1034,35 @@ retry: ret = f2fs_read_multi_pages(cc, &bio, cc->cluster_size, &last_block_in_bio, false, true); - f2fs_destroy_compress_ctx(cc); + f2fs_put_rpages(cc); + f2fs_destroy_compress_ctx(cc, true); if (ret) - goto release_pages; + goto out; if (bio) f2fs_submit_bio(sbi, bio, DATA); ret = f2fs_init_compress_ctx(cc); if (ret) - goto release_pages; + goto out; } for (i = 0; i < cc->cluster_size; i++) { f2fs_bug_on(sbi, cc->rpages[i]); page = find_lock_page(mapping, start_idx + i); - f2fs_bug_on(sbi, !page); + if (!page) { + /* page can be truncated */ + goto release_and_retry; + } f2fs_wait_on_page_writeback(page, DATA, true, true); - f2fs_compress_ctx_add_page(cc, page); - f2fs_put_page(page, 0); if (!PageUptodate(page)) { +release_and_retry: + f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i + 1); - f2fs_put_rpages_mapping(mapping, start_idx, - cc->cluster_size); - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, true); goto retry; } } @@ -1115,10 +1093,10 @@ retry: } unlock_pages: + f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i); -release_pages: - f2fs_put_rpages_mapping(mapping, start_idx, i); - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, true); +out: return ret; } @@ -1153,7 +1131,7 @@ bool f2fs_compress_write_end(struct inode *inode, void *fsdata, set_cluster_dirty(&cc); f2fs_put_rpages_wbc(&cc, NULL, false, 1); - f2fs_destroy_compress_ctx(&cc); + f2fs_destroy_compress_ctx(&cc, false); return first_index; } @@ -1353,6 +1331,7 @@ unlock_continue: if (fio.compr_blocks) f2fs_i_compr_blocks_update(inode, fio.compr_blocks - 1, false); f2fs_i_compr_blocks_update(inode, cc->nr_cpages, true); + add_compr_block_stat(inode, cc->nr_cpages); set_inode_flag(cc->inode, FI_APPEND_WRITE); if (cc->cluster_idx == 0) @@ -1372,7 +1351,7 @@ unlock_continue: f2fs_put_rpages(cc); page_array_free(cc->inode, cc->cpages, cc->nr_cpages); cc->cpages = NULL; - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, false); return 0; out_destroy_crypt: @@ -1383,7 +1362,8 @@ out_destroy_crypt: for (i = 0; i < cc->nr_cpages; i++) { if (!cc->cpages[i]) continue; - f2fs_put_page(cc->cpages[i], 1); + f2fs_compress_free_page(cc->cpages[i]); + cc->cpages[i] = NULL; } out_put_cic: kmem_cache_free(cic_entry_slab, cic); @@ -1533,7 +1513,7 @@ write: err = f2fs_write_raw_pages(cc, submitted, wbc, io_type); f2fs_put_rpages_wbc(cc, wbc, false, 0); destroy_out: - f2fs_destroy_compress_ctx(cc); + f2fs_destroy_compress_ctx(cc, false); return err; } diff --git a/fs/f2fs/compress.h b/fs/f2fs/compress.h deleted file mode 100644 index e69de29bb2d1..000000000000 --- a/fs/f2fs/compress.h +++ /dev/null diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 4e5257c763d0..009a09fb9d88 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -1086,6 +1086,7 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) for (; count > 0; dn->ofs_in_node++) { block_t blkaddr = f2fs_data_blkaddr(dn); + if (blkaddr == NULL_ADDR) { dn->data_blkaddr = NEW_ADDR; __set_data_blkaddr(dn); @@ -1722,7 +1723,7 @@ static int get_data_block_dio_write(struct inode *inode, sector_t iblock, return __get_data_block(inode, iblock, bh_result, create, F2FS_GET_BLOCK_DIO, NULL, f2fs_rw_hint_to_seg_type(inode->i_write_hint), - IS_SWAPFILE(inode) ? false : true); + true); } static int get_data_block_dio(struct inode *inode, sector_t iblock, @@ -1837,6 +1838,7 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, int ret = 0; bool compr_cluster = false; unsigned int cluster_size = F2FS_I(inode)->i_cluster_size; + loff_t maxbytes; if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { ret = f2fs_precache_extents(inode); @@ -1850,6 +1852,15 @@ int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, inode_lock(inode); + maxbytes = max_file_blocks(inode) << F2FS_BLKSIZE_BITS; + if (start > maxbytes) { + ret = -EFBIG; + goto out; + } + + if (len > maxbytes || (maxbytes - len) < start) + len = maxbytes - start; + if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { ret = f2fs_xattr_fiemap(inode, fieinfo); goto out; @@ -2276,7 +2287,7 @@ static int f2fs_mpage_readpages(struct inode *inode, max_nr_pages, &last_block_in_bio, rac != NULL, false); - f2fs_destroy_compress_ctx(&cc); + f2fs_destroy_compress_ctx(&cc, false); if (ret) goto set_error_page; } @@ -2321,7 +2332,7 @@ next_page: max_nr_pages, &last_block_in_bio, rac != NULL, false); - f2fs_destroy_compress_ctx(&cc); + f2fs_destroy_compress_ctx(&cc, false); } } #endif @@ -3022,7 +3033,7 @@ next: } } if (f2fs_compressed_file(inode)) - f2fs_destroy_compress_ctx(&cc); + f2fs_destroy_compress_ctx(&cc, false); #endif if (retry) { index = 0; @@ -3755,6 +3766,7 @@ int f2fs_migrate_page(struct address_space *mapping, if (atomic_written) { struct inmem_pages *cur; + list_for_each_entry(cur, &fi->inmem_pages, list) if (cur->page == page) { cur->page = newpage; @@ -3780,11 +3792,72 @@ int f2fs_migrate_page(struct address_space *mapping, #endif #ifdef CONFIG_SWAP +static int f2fs_is_file_aligned(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + block_t main_blkaddr = SM_I(sbi)->main_blkaddr; + block_t cur_lblock; + block_t last_lblock; + block_t pblock; + unsigned long nr_pblocks; + unsigned int blocks_per_sec = BLKS_PER_SEC(sbi); + unsigned int not_aligned = 0; + int ret = 0; + + cur_lblock = 0; + last_lblock = bytes_to_blks(inode, i_size_read(inode)); + + while (cur_lblock < last_lblock) { + struct f2fs_map_blocks map; + + memset(&map, 0, sizeof(map)); + map.m_lblk = cur_lblock; + map.m_len = last_lblock - cur_lblock; + map.m_next_pgofs = NULL; + map.m_next_extent = NULL; + map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = false; + + ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); + if (ret) + goto out; + + /* hole */ + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + f2fs_err(sbi, "Swapfile has holes\n"); + ret = -ENOENT; + goto out; + } + + pblock = map.m_pblk; + nr_pblocks = map.m_len; + + if ((pblock - main_blkaddr) & (blocks_per_sec - 1) || + nr_pblocks & (blocks_per_sec - 1)) { + if (f2fs_is_pinned_file(inode)) { + f2fs_err(sbi, "Swapfile does not align to section"); + ret = -EINVAL; + goto out; + } + not_aligned++; + } + + cur_lblock += nr_pblocks; + } + if (not_aligned) + f2fs_warn(sbi, "Swapfile (%u) is not align to section: \n" + "\t1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate()", + not_aligned); +out: + return ret; +} + static int check_swap_activate_fast(struct swap_info_struct *sis, struct file *swap_file, sector_t *span) { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); sector_t cur_lblock; sector_t last_lblock; sector_t pblock; @@ -3792,8 +3865,9 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, sector_t highest_pblock = 0; int nr_extents = 0; unsigned long nr_pblocks; - u64 len; - int ret; + unsigned int blocks_per_sec = BLKS_PER_SEC(sbi); + unsigned int not_aligned = 0; + int ret = 0; /* * Map all the blocks into the extent list. This code doesn't try @@ -3801,31 +3875,44 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, */ cur_lblock = 0; last_lblock = bytes_to_blks(inode, i_size_read(inode)); - len = i_size_read(inode); - while (cur_lblock <= last_lblock && cur_lblock < sis->max) { + while (cur_lblock < last_lblock && cur_lblock < sis->max) { struct f2fs_map_blocks map; - pgoff_t next_pgofs; cond_resched(); memset(&map, 0, sizeof(map)); map.m_lblk = cur_lblock; - map.m_len = bytes_to_blks(inode, len) - cur_lblock; - map.m_next_pgofs = &next_pgofs; + map.m_len = last_lblock - cur_lblock; + map.m_next_pgofs = NULL; + map.m_next_extent = NULL; map.m_seg_type = NO_CHECK_TYPE; + map.m_may_create = false; ret = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_FIEMAP); if (ret) - goto err_out; + goto out; /* hole */ - if (!(map.m_flags & F2FS_MAP_FLAGS)) - goto err_out; + if (!(map.m_flags & F2FS_MAP_FLAGS)) { + f2fs_err(sbi, "Swapfile has holes\n"); + ret = -EINVAL; + goto out; + } pblock = map.m_pblk; nr_pblocks = map.m_len; + if ((pblock - SM_I(sbi)->main_blkaddr) & (blocks_per_sec - 1) || + nr_pblocks & (blocks_per_sec - 1)) { + if (f2fs_is_pinned_file(inode)) { + f2fs_err(sbi, "Swapfile does not align to section"); + ret = -EINVAL; + goto out; + } + not_aligned++; + } + if (cur_lblock + nr_pblocks >= sis->max) nr_pblocks = sis->max - cur_lblock; @@ -3852,11 +3939,13 @@ static int check_swap_activate_fast(struct swap_info_struct *sis, sis->max = cur_lblock; sis->pages = cur_lblock - 1; sis->highest_bit = cur_lblock - 1; + + if (not_aligned) + f2fs_warn(sbi, "Swapfile (%u) is not align to section: \n" + "\t1) creat(), 2) ioctl(F2FS_IOC_SET_PIN_FILE), 3) fallocate()", + not_aligned); out: return ret; -err_out: - pr_err("swapon: swapfile has holes\n"); - return -EINVAL; } /* Copied from generic_swapfile_activate() to check any holes */ @@ -3865,6 +3954,7 @@ static int check_swap_activate(struct swap_info_struct *sis, { struct address_space *mapping = swap_file->f_mapping; struct inode *inode = mapping->host; + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned blocks_per_page; unsigned long page_no; sector_t probe_block; @@ -3872,11 +3962,15 @@ static int check_swap_activate(struct swap_info_struct *sis, sector_t lowest_block = -1; sector_t highest_block = 0; int nr_extents = 0; - int ret; + int ret = 0; if (PAGE_SIZE == F2FS_BLKSIZE) return check_swap_activate_fast(sis, swap_file, span); + ret = f2fs_is_file_aligned(inode); + if (ret) + goto out; + blocks_per_page = bytes_to_blks(inode, PAGE_SIZE); /* @@ -3891,13 +3985,14 @@ static int check_swap_activate(struct swap_info_struct *sis, unsigned block_in_page; sector_t first_block; sector_t block = 0; - int err = 0; cond_resched(); block = probe_block; - err = bmap(inode, &block); - if (err || !block) + ret = bmap(inode, &block); + if (ret) + goto out; + if (!block) goto bad_bmap; first_block = block; @@ -3913,9 +4008,10 @@ static int check_swap_activate(struct swap_info_struct *sis, block_in_page++) { block = probe_block + block_in_page; - err = bmap(inode, &block); - - if (err || !block) + ret = bmap(inode, &block); + if (ret) + goto out; + if (!block) goto bad_bmap; if (block != first_block + block_in_page) { @@ -3955,7 +4051,7 @@ reprobe: out: return ret; bad_bmap: - pr_err("swapon: swapfile has holes\n"); + f2fs_err(sbi, "Swapfile has holes\n"); return -EINVAL; } diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 91855d5721cd..c03949a7ccff 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -173,6 +173,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->util_invalid = 50 - si->util_free - si->util_valid; for (i = CURSEG_HOT_DATA; i < NO_CHECK_TYPE; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); + si->curseg[i] = curseg->segno; si->cursec[i] = GET_SEC_FROM_SEG(sbi, curseg->segno); si->curzone[i] = GET_ZONE_FROM_SEC(sbi, si->cursec[i]); @@ -300,10 +301,12 @@ get_cache: si->page_mem = 0; if (sbi->node_inode) { unsigned npages = NODE_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } if (sbi->meta_inode) { unsigned npages = META_MAPPING(sbi)->nrpages; + si->page_mem += (unsigned long long)npages << PAGE_SHIFT; } } diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index e6270a867be1..dc7ce79672b8 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -449,9 +449,7 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) { - struct qstr dotdot = QSTR_INIT("..", 2); - - return f2fs_find_entry(dir, &dotdot, p); + return f2fs_find_entry(dir, &dotdot_name, p); } ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, @@ -473,6 +471,7 @@ void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, struct page *page, struct inode *inode) { enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA; + lock_page(page); f2fs_wait_on_page_writeback(page, type, true, true); de->ino = cpu_to_le32(inode->i_ino); diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 11a20dc505aa..c83d90125ebd 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -97,6 +97,7 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_NORECOVERY 0x04000000 #define F2FS_MOUNT_ATGC 0x08000000 #define F2FS_MOUNT_MERGE_CHECKPOINT 0x10000000 +#define F2FS_MOUNT_GC_MERGE 0x20000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -637,21 +638,26 @@ enum { #define FADVISE_MODIFIABLE_BITS (FADVISE_COLD_BIT | FADVISE_HOT_BIT) #define file_is_cold(inode) is_file(inode, FADVISE_COLD_BIT) -#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) #define file_set_cold(inode) set_file(inode, FADVISE_COLD_BIT) -#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) #define file_clear_cold(inode) clear_file(inode, FADVISE_COLD_BIT) + +#define file_wrong_pino(inode) is_file(inode, FADVISE_LOST_PINO_BIT) +#define file_lost_pino(inode) set_file(inode, FADVISE_LOST_PINO_BIT) #define file_got_pino(inode) clear_file(inode, FADVISE_LOST_PINO_BIT) + #define file_is_encrypt(inode) is_file(inode, FADVISE_ENCRYPT_BIT) #define file_set_encrypt(inode) set_file(inode, FADVISE_ENCRYPT_BIT) -#define file_clear_encrypt(inode) clear_file(inode, FADVISE_ENCRYPT_BIT) + #define file_enc_name(inode) is_file(inode, FADVISE_ENC_NAME_BIT) #define file_set_enc_name(inode) set_file(inode, FADVISE_ENC_NAME_BIT) + #define file_keep_isize(inode) is_file(inode, FADVISE_KEEP_SIZE_BIT) #define file_set_keep_isize(inode) set_file(inode, FADVISE_KEEP_SIZE_BIT) + #define file_is_hot(inode) is_file(inode, FADVISE_HOT_BIT) #define file_set_hot(inode) set_file(inode, FADVISE_HOT_BIT) #define file_clear_hot(inode) clear_file(inode, FADVISE_HOT_BIT) + #define file_is_verity(inode) is_file(inode, FADVISE_VERITY_BIT) #define file_set_verity(inode) set_file(inode, FADVISE_VERITY_BIT) @@ -860,7 +866,7 @@ struct f2fs_nm_info { /* NAT cache management */ struct radix_tree_root nat_root;/* root of the nat entry cache */ struct radix_tree_root nat_set_root;/* root of the nat set cache */ - struct rw_semaphore nat_tree_lock; /* protect nat_tree_lock */ + struct rw_semaphore nat_tree_lock; /* protect nat entry tree */ struct list_head nat_entries; /* cached nat entry list (clean) */ spinlock_t nat_list_lock; /* protect clean nat entry list */ unsigned int nat_cnt[MAX_NAT_STATE]; /* the # of cached nat entries */ @@ -1297,14 +1303,6 @@ enum { #define IS_DUMMY_WRITTEN_PAGE(page) \ (page_private(page) == DUMMY_WRITTEN_PAGE) -#ifdef CONFIG_F2FS_IO_TRACE -#define IS_IO_TRACED_PAGE(page) \ - (page_private(page) > 0 && \ - page_private(page) < (unsigned long)PID_MAX_LIMIT) -#else -#define IS_IO_TRACED_PAGE(page) (0) -#endif - /* For compression */ enum compress_algorithm_type { COMPRESS_LZO, @@ -1623,6 +1621,11 @@ struct f2fs_sb_info { #ifdef CONFIG_F2FS_FS_COMPRESSION struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ + + /* For runtime compression statistics */ + u64 compr_written_block; + u64 compr_saved_block; + u32 compr_new_inode; #endif }; @@ -2215,6 +2218,7 @@ static inline block_t __cp_payload(struct f2fs_sb_info *sbi) static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) { struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); + void *tmp_ptr = &ckpt->sit_nat_version_bitmap; int offset; if (is_set_ckpt_flags(sbi, CP_LARGE_NAT_BITMAP_FLAG)) { @@ -2224,7 +2228,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) * if large_nat_bitmap feature is enabled, leave checksum * protection for all nat/sit bitmaps. */ - return &ckpt->sit_nat_version_bitmap + offset + sizeof(__le32); + return tmp_ptr + offset + sizeof(__le32); } if (__cp_payload(sbi) > 0) { @@ -2235,7 +2239,7 @@ static inline void *__bitmap_ptr(struct f2fs_sb_info *sbi, int flag) } else { offset = (flag == NAT_BITMAP) ? le32_to_cpu(ckpt->sit_ver_bitmap_bytesize) : 0; - return &ckpt->sit_nat_version_bitmap + offset; + return tmp_ptr + offset; } } @@ -3302,7 +3306,6 @@ void f2fs_hash_filename(const struct inode *dir, struct f2fs_filename *fname); /* * node.c */ -struct dnode_of_data; struct node_info; int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); @@ -3379,6 +3382,7 @@ block_t f2fs_get_unusable_blocks(struct f2fs_sb_info *sbi); int f2fs_disable_cp_again(struct f2fs_sb_info *sbi, block_t unusable); void f2fs_release_discard_addrs(struct f2fs_sb_info *sbi); int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra); +bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno); void f2fs_init_inmem_curseg(struct f2fs_sb_info *sbi); void f2fs_save_inmem_curseg(struct f2fs_sb_info *sbi); void f2fs_restore_inmem_curseg(struct f2fs_sb_info *sbi); @@ -3386,7 +3390,7 @@ void f2fs_get_new_segment(struct f2fs_sb_info *sbi, unsigned int *newseg, bool new_sec, int dir); void f2fs_allocate_segment_for_resize(struct f2fs_sb_info *sbi, int type, unsigned int start, unsigned int end); -void f2fs_allocate_new_segment(struct f2fs_sb_info *sbi, int type); +void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force); void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); bool f2fs_exist_trim_candidates(struct f2fs_sb_info *sbi, @@ -3550,7 +3554,7 @@ void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); int f2fs_start_gc_thread(struct f2fs_sb_info *sbi); void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi); block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode); -int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, +int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, bool background, bool force, unsigned int segno); void f2fs_build_gc_manager(struct f2fs_sb_info *sbi); int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count); @@ -3952,12 +3956,24 @@ struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed); void f2fs_put_page_dic(struct page *page); int f2fs_init_compress_ctx(struct compress_ctx *cc); -void f2fs_destroy_compress_ctx(struct compress_ctx *cc); +void f2fs_destroy_compress_ctx(struct compress_ctx *cc, bool reuse); void f2fs_init_compress_info(struct f2fs_sb_info *sbi); int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi); void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi); int __init f2fs_init_compress_cache(void); void f2fs_destroy_compress_cache(void); +#define inc_compr_inode_stat(inode) \ + do { \ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); \ + sbi->compr_new_inode++; \ + } while (0) +#define add_compr_block_stat(inode, blocks) \ + do { \ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); \ + int diff = F2FS_I(inode)->i_cluster_size - blocks; \ + sbi->compr_written_block += blocks; \ + sbi->compr_saved_block += diff; \ + } while (0) #else static inline bool f2fs_is_compressed_page(struct page *page) { return false; } static inline bool f2fs_is_compress_backend_ready(struct inode *inode) @@ -3986,6 +4002,7 @@ static inline int f2fs_init_page_array_cache(struct f2fs_sb_info *sbi) { return static inline void f2fs_destroy_page_array_cache(struct f2fs_sb_info *sbi) { } static inline int __init f2fs_init_compress_cache(void) { return 0; } static inline void f2fs_destroy_compress_cache(void) { } +#define inc_compr_inode_stat(inode) do { } while (0) #endif static inline void set_compress_context(struct inode *inode) @@ -4009,6 +4026,7 @@ static inline void set_compress_context(struct inode *inode) F2FS_I(inode)->i_flags |= F2FS_COMPR_FL; set_inode_flag(inode, FI_COMPRESSED_FILE); stat_inc_compr_inode(inode); + inc_compr_inode_stat(inode); f2fs_mark_inode_dirty_sync(inode, true); } @@ -4179,8 +4197,7 @@ static inline bool f2fs_force_buffered_io(struct inode *inode, if (F2FS_IO_ALIGNED(sbi)) return true; } - if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED) && - !IS_SWAPFILE(inode)) + if (is_sbi_flag_set(F2FS_I_SB(inode), SBI_CP_DISABLED)) return true; return false; diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index 8a56acbcee4c..ceb575f99048 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -1622,9 +1622,10 @@ static int expand_inode_data(struct inode *inode, loff_t offset, struct f2fs_map_blocks map = { .m_next_pgofs = NULL, .m_next_extent = NULL, .m_seg_type = NO_CHECK_TYPE, .m_may_create = true }; - pgoff_t pg_end; + pgoff_t pg_start, pg_end; loff_t new_size = i_size_read(inode); loff_t off_end; + block_t expanded = 0; int err; err = inode_newsize_ok(inode, (len + offset)); @@ -1637,11 +1638,12 @@ static int expand_inode_data(struct inode *inode, loff_t offset, f2fs_balance_fs(sbi, true); + pg_start = ((unsigned long long)offset) >> PAGE_SHIFT; pg_end = ((unsigned long long)offset + len) >> PAGE_SHIFT; off_end = (offset + len) & (PAGE_SIZE - 1); - map.m_lblk = ((unsigned long long)offset) >> PAGE_SHIFT; - map.m_len = pg_end - map.m_lblk; + map.m_lblk = pg_start; + map.m_len = pg_end - pg_start; if (off_end) map.m_len++; @@ -1649,19 +1651,15 @@ static int expand_inode_data(struct inode *inode, loff_t offset, return 0; if (f2fs_is_pinned_file(inode)) { - block_t len = (map.m_len >> sbi->log_blocks_per_seg) << - sbi->log_blocks_per_seg; - block_t done = 0; + block_t sec_blks = BLKS_PER_SEC(sbi); + block_t sec_len = roundup(map.m_len, sec_blks); - if (map.m_len % sbi->blocks_per_seg) - len += sbi->blocks_per_seg; - - map.m_len = sbi->blocks_per_seg; + map.m_len = sec_blks; next_alloc: if (has_not_enough_free_secs(sbi, 0, GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { down_write(&sbi->gc_lock); - err = f2fs_gc(sbi, true, false, NULL_SEGNO); + err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); if (err && err != -ENODATA && err != -EAGAIN) goto out_err; } @@ -1669,7 +1667,7 @@ next_alloc: down_write(&sbi->pin_sem); f2fs_lock_op(sbi); - f2fs_allocate_new_segment(sbi, CURSEG_COLD_DATA_PINNED); + f2fs_allocate_new_section(sbi, CURSEG_COLD_DATA_PINNED, false); f2fs_unlock_op(sbi); map.m_seg_type = CURSEG_COLD_DATA_PINNED; @@ -1677,24 +1675,25 @@ next_alloc: up_write(&sbi->pin_sem); - done += map.m_len; - len -= map.m_len; + expanded += map.m_len; + sec_len -= map.m_len; map.m_lblk += map.m_len; - if (!err && len) + if (!err && sec_len) goto next_alloc; - map.m_len = done; + map.m_len = expanded; } else { err = f2fs_map_blocks(inode, &map, 1, F2FS_GET_BLOCK_PRE_AIO); + expanded = map.m_len; } out_err: if (err) { pgoff_t last_off; - if (!map.m_len) + if (!expanded) return err; - last_off = map.m_lblk + map.m_len - 1; + last_off = pg_start + expanded - 1; /* update new size to the failed position */ new_size = (last_off == pg_end) ? offset + len : @@ -1818,7 +1817,8 @@ static int f2fs_setflags_common(struct inode *inode, u32 iflags, u32 mask) struct f2fs_inode_info *fi = F2FS_I(inode); u32 masked_flags = fi->i_flags & mask; - f2fs_bug_on(F2FS_I_SB(inode), (iflags & ~mask)); + /* mask can be shrunk by flags_valid selector */ + iflags &= mask; /* Is it quota file? Do not allow user to mess with it */ if (IS_NOQUOTA(inode)) @@ -2434,7 +2434,7 @@ static int f2fs_ioc_gc(struct file *filp, unsigned long arg) down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, sync, true, NULL_SEGNO); + ret = f2fs_gc(sbi, sync, true, false, NULL_SEGNO); out: mnt_drop_write_file(filp); return ret; @@ -2470,7 +2470,8 @@ do_more: down_write(&sbi->gc_lock); } - ret = f2fs_gc(sbi, range->sync, true, GET_SEGNO(sbi, range->start)); + ret = f2fs_gc(sbi, range->sync, true, false, + GET_SEGNO(sbi, range->start)); if (ret) { if (ret == -EBUSY) ret = -EAGAIN; @@ -2527,7 +2528,7 @@ static int f2fs_defragment_range(struct f2fs_sb_info *sbi, { struct inode *inode = file_inode(filp); struct f2fs_map_blocks map = { .m_next_extent = NULL, - .m_seg_type = NO_CHECK_TYPE , + .m_seg_type = NO_CHECK_TYPE, .m_may_create = false }; struct extent_info ei = {0, 0, 0}; pgoff_t pg_start, pg_end, next_pgofs; @@ -2923,7 +2924,7 @@ static int f2fs_ioc_flush_device(struct file *filp, unsigned long arg) sm->last_victim[GC_CB] = end_segno + 1; sm->last_victim[GC_GREEDY] = end_segno + 1; sm->last_victim[ALLOC_NEXT] = end_segno + 1; - ret = f2fs_gc(sbi, true, true, start_segno); + ret = f2fs_gc(sbi, true, true, true, start_segno); if (ret == -EAGAIN) ret = 0; else if (ret < 0) @@ -4311,8 +4312,13 @@ write: clear_inode_flag(inode, FI_NO_PREALLOC); /* if we couldn't write data, we should deallocate blocks. */ - if (preallocated && i_size_read(inode) < target_size) + if (preallocated && i_size_read(inode) < target_size) { + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + down_write(&F2FS_I(inode)->i_mmap_sem); f2fs_truncate(inode); + up_write(&F2FS_I(inode)->i_mmap_sem); + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + } if (ret > 0) f2fs_update_iostat(F2FS_I_SB(inode), APP_WRITE_IO, ret); diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index 39330ad3c44e..8d1f17ab94d8 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -31,19 +31,24 @@ static int gc_thread_func(void *data) struct f2fs_sb_info *sbi = data; struct f2fs_gc_kthread *gc_th = sbi->gc_thread; wait_queue_head_t *wq = &sbi->gc_thread->gc_wait_queue_head; + wait_queue_head_t *fggc_wq = &sbi->gc_thread->fggc_wq; unsigned int wait_ms; wait_ms = gc_th->min_sleep_time; set_freezable(); do { - bool sync_mode; + bool sync_mode, foreground = false; wait_event_interruptible_timeout(*wq, kthread_should_stop() || freezing(current) || + waitqueue_active(fggc_wq) || gc_th->gc_wake, msecs_to_jiffies(wait_ms)); + if (test_opt(sbi, GC_MERGE) && waitqueue_active(fggc_wq)) + foreground = true; + /* give it a try one time */ if (gc_th->gc_wake) gc_th->gc_wake = 0; @@ -90,7 +95,10 @@ static int gc_thread_func(void *data) goto do_gc; } - if (!down_write_trylock(&sbi->gc_lock)) { + if (foreground) { + down_write(&sbi->gc_lock); + goto do_gc; + } else if (!down_write_trylock(&sbi->gc_lock)) { stat_other_skip_bggc_count(sbi); goto next; } @@ -107,14 +115,22 @@ static int gc_thread_func(void *data) else increase_sleep_time(gc_th, &wait_ms); do_gc: - stat_inc_bggc_count(sbi->stat_info); + if (!foreground) + stat_inc_bggc_count(sbi->stat_info); sync_mode = F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_SYNC; + /* foreground GC was been triggered via f2fs_balance_fs() */ + if (foreground) + sync_mode = false; + /* if return value is not zero, no victim was selected */ - if (f2fs_gc(sbi, sync_mode, true, NULL_SEGNO)) + if (f2fs_gc(sbi, sync_mode, !foreground, false, NULL_SEGNO)) wait_ms = gc_th->no_gc_sleep_time; + if (foreground) + wake_up_all(&gc_th->fggc_wq); + trace_f2fs_background_gc(sbi->sb, wait_ms, prefree_segments(sbi), free_segments(sbi)); @@ -144,10 +160,11 @@ int f2fs_start_gc_thread(struct f2fs_sb_info *sbi) gc_th->max_sleep_time = DEF_GC_THREAD_MAX_SLEEP_TIME; gc_th->no_gc_sleep_time = DEF_GC_THREAD_NOGC_SLEEP_TIME; - gc_th->gc_wake= 0; + gc_th->gc_wake = 0; sbi->gc_thread = gc_th; init_waitqueue_head(&sbi->gc_thread->gc_wait_queue_head); + init_waitqueue_head(&sbi->gc_thread->fggc_wq); sbi->gc_thread->f2fs_gc_task = kthread_run(gc_thread_func, sbi, "f2fs_gc-%u:%u", MAJOR(dev), MINOR(dev)); if (IS_ERR(gc_th->f2fs_gc_task)) { @@ -162,9 +179,11 @@ out: void f2fs_stop_gc_thread(struct f2fs_sb_info *sbi) { struct f2fs_gc_kthread *gc_th = sbi->gc_thread; + if (!gc_th) return; kthread_stop(gc_th->f2fs_gc_task); + wake_up_all(&gc_th->fggc_wq); kfree(gc_th); sbi->gc_thread = NULL; } @@ -392,10 +411,6 @@ static void add_victim_entry(struct f2fs_sb_info *sbi, if (p->gc_mode == GC_AT && get_valid_blocks(sbi, segno, true) == 0) return; - - if (p->alloc_mode == AT_SSR && - get_seg_entry(sbi, segno)->ckpt_valid_blocks == 0) - return; } for (i = 0; i < sbi->segs_per_sec; i++) @@ -728,11 +743,27 @@ retry: if (sec_usage_check(sbi, secno)) goto next; + /* Don't touch checkpointed data */ - if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) && - get_ckpt_valid_blocks(sbi, segno) && - p.alloc_mode == LFS)) - goto next; + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (p.alloc_mode == LFS) { + /* + * LFS is set to find source section during GC. + * The victim should have no checkpointed data. + */ + if (get_ckpt_valid_blocks(sbi, segno, true)) + goto next; + } else { + /* + * SSR | AT_SSR are set to find target segment + * for writes which can be full by checkpointed + * and newly written blocks. + */ + if (!f2fs_segment_has_free_slot(sbi, segno)) + goto next; + } + } + if (gc_type == BG_GC && test_bit(secno, dirty_i->victim_secmap)) goto next; @@ -828,6 +859,7 @@ static void add_gc_inode(struct gc_inode_list *gc_list, struct inode *inode) static void put_gc_inode(struct gc_inode_list *gc_list) { struct inode_entry *ie, *next_ie; + list_for_each_entry_safe(ie, next_ie, &gc_list->ilist, list) { radix_tree_delete(&gc_list->iroot, ie->inode->i_ino); iput(ie->inode); @@ -952,9 +984,11 @@ block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode) bidx = node_ofs - 1; } else if (node_ofs <= indirect_blks) { int dec = (node_ofs - 4) / (NIDS_PER_BLOCK + 1); + bidx = node_ofs - 2 - dec; } else { int dec = (node_ofs - indirect_blks - 3) / (NIDS_PER_BLOCK + 1); + bidx = node_ofs - 5 - dec; } return bidx * ADDRS_PER_BLOCK(inode) + ADDRS_PER_INODE(inode); @@ -1120,7 +1154,8 @@ static int move_data_block(struct inode *inode, block_t bidx, block_t newaddr; int err = 0; bool lfs_mode = f2fs_lfs_mode(fio.sbi); - int type = fio.sbi->am.atgc_enabled ? + int type = fio.sbi->am.atgc_enabled && (gc_type == BG_GC) && + (fio.sbi->gc_mode != GC_URGENT_HIGH) ? CURSEG_ALL_DATA_ATGC : CURSEG_COLD_DATA; /* do not read out */ @@ -1354,7 +1389,8 @@ out: * the victim data block is ignored. */ static int gc_data_segment(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, - struct gc_inode_list *gc_list, unsigned int segno, int gc_type) + struct gc_inode_list *gc_list, unsigned int segno, int gc_type, + bool force_migrate) { struct super_block *sb = sbi->sb; struct f2fs_summary *entry; @@ -1383,8 +1419,8 @@ next_step: * race condition along with SSR block allocation. */ if ((gc_type == BG_GC && has_not_enough_free_secs(sbi, 0, 0)) || - get_valid_blocks(sbi, segno, true) == - BLKS_PER_SEC(sbi)) + (!force_migrate && get_valid_blocks(sbi, segno, true) == + BLKS_PER_SEC(sbi))) return submitted; if (check_valid_map(sbi, segno, off) == 0) @@ -1519,7 +1555,8 @@ static int __get_victim(struct f2fs_sb_info *sbi, unsigned int *victim, static int do_garbage_collect(struct f2fs_sb_info *sbi, unsigned int start_segno, - struct gc_inode_list *gc_list, int gc_type) + struct gc_inode_list *gc_list, int gc_type, + bool force_migrate) { struct page *sum_page; struct f2fs_summary_block *sum; @@ -1606,7 +1643,8 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, gc_type); else submitted += gc_data_segment(sbi, sum->entries, gc_list, - segno, gc_type); + segno, gc_type, + force_migrate); stat_inc_seg_count(sbi, type, gc_type); migrated++; @@ -1634,7 +1672,7 @@ skip: } int f2fs_gc(struct f2fs_sb_info *sbi, bool sync, - bool background, unsigned int segno) + bool background, bool force, unsigned int segno) { int gc_type = sync ? FG_GC : BG_GC; int sec_freed = 0, seg_freed = 0, total_freed = 0; @@ -1696,7 +1734,7 @@ gc_more: if (ret) goto stop; - seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type); + seg_freed = do_garbage_collect(sbi, segno, &gc_list, gc_type, force); if (gc_type == FG_GC && seg_freed == f2fs_usable_segs_in_sec(sbi, segno)) sec_freed++; @@ -1835,7 +1873,7 @@ static int free_segment_range(struct f2fs_sb_info *sbi, .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; - do_garbage_collect(sbi, segno, &gc_list, FG_GC); + do_garbage_collect(sbi, segno, &gc_list, FG_GC, true); put_gc_inode(&gc_list); if (!gc_only && get_valid_blocks(sbi, segno, true)) { @@ -1974,7 +2012,20 @@ int f2fs_resize_fs(struct f2fs_sb_info *sbi, __u64 block_count) /* stop CP to protect MAIN_SEC in free_segment_range */ f2fs_lock_op(sbi); + + spin_lock(&sbi->stat_lock); + if (shrunk_blocks + valid_user_blocks(sbi) + + sbi->current_reserved_blocks + sbi->unusable_block_count + + F2FS_OPTION(sbi).root_reserved_blocks > sbi->user_block_count) + err = -ENOSPC; + spin_unlock(&sbi->stat_lock); + + if (err) + goto out_unlock; + err = free_segment_range(sbi, secs, true); + +out_unlock: f2fs_unlock_op(sbi); up_write(&sbi->gc_lock); if (err) diff --git a/fs/f2fs/gc.h b/fs/f2fs/gc.h index 0c8dae12dc51..3fe145e8e594 100644 --- a/fs/f2fs/gc.h +++ b/fs/f2fs/gc.h @@ -42,6 +42,12 @@ struct f2fs_gc_kthread { /* for changing gc mode */ unsigned int gc_wake; + + /* for GC_MERGE mount option */ + wait_queue_head_t fggc_wq; /* + * caller of f2fs_balance_fs() + * will wait on this wait queue. + */ }; struct gc_inode_list { diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 993caefcd2bb..92652ca7a7c8 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -219,7 +219,8 @@ out: f2fs_put_page(page, 1); - f2fs_balance_fs(sbi, dn.node_changed); + if (!err) + f2fs_balance_fs(sbi, dn.node_changed); return err; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 349d9cb933ee..b401f08569f7 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -666,6 +666,7 @@ retry: node_page = f2fs_get_node_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { int err = PTR_ERR(node_page); + if (err == -ENOMEM) { cond_resched(); goto retry; @@ -698,7 +699,7 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) /* * We need to balance fs here to prevent from producing dirty node pages - * during the urgent cleaning time when runing out of free sections. + * during the urgent cleaning time when running out of free sections. */ f2fs_update_inode_page(inode); if (wbc && wbc->nr_to_write) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 14bf4f65bcb3..a9cd9cf97229 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -416,9 +416,9 @@ out: struct dentry *f2fs_get_parent(struct dentry *child) { - struct qstr dotdot = QSTR_INIT("..", 2); struct page *page; - unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot, &page); + unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot_name, &page); + if (!ino) { if (IS_ERR(page)) return ERR_CAST(page); @@ -628,6 +628,7 @@ static const char *f2fs_get_link(struct dentry *dentry, struct delayed_call *done) { const char *link = page_get_link(dentry, inode, done); + if (!IS_ERR(link) && !*link) { /* this is broken symlink case */ do_delayed_call(done); @@ -766,6 +767,7 @@ out_fail: static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) { struct inode *inode = d_inode(dentry); + if (f2fs_empty_dir(inode)) return f2fs_unlink(dir, dentry); return -ENOTEMPTY; diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 4b0e2e3c2c88..e67ce5f13b98 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -43,11 +43,15 @@ int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid) bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) { struct f2fs_nm_info *nm_i = NM_I(sbi); + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; struct sysinfo val; unsigned long avail_ram; unsigned long mem_size = 0; bool res = false; + if (!nm_i) + return true; + si_meminfo(&val); /* only uses low memory */ @@ -89,6 +93,10 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) /* it allows 20% / total_ram for inmemory pages */ mem_size = get_pages(sbi, F2FS_INMEM_PAGES); res = mem_size < (val.totalram / 5); + } else if (type == DISCARD_CACHE) { + mem_size = (atomic_read(&dcc->discard_cmd_cnt) * + sizeof(struct discard_cmd)) >> PAGE_SHIFT; + res = mem_size < (avail_ram * nm_i->ram_thresh / 100); } else { if (!sbi->sb->s_bdi->wb.dirty_exceeded) return true; @@ -462,6 +470,7 @@ static void set_node_addr(struct f2fs_sb_info *sbi, struct node_info *ni, /* increment version no as node is removed */ if (nat_get_blkaddr(e) != NEW_ADDR && new_blkaddr == NULL_ADDR) { unsigned char version = nat_get_version(e); + nat_set_version(e, inc_node_version(version)); } @@ -1383,7 +1392,7 @@ repeat: goto out_err; } page_hit: - if(unlikely(nid != nid_of_node(page))) { + if (unlikely(nid != nid_of_node(page))) { f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", nid, nid_of_node(page), ino_of_node(page), ofs_of_node(page), cpver_of_node(page), @@ -1775,7 +1784,7 @@ continue_unlock: out: if (nwritten) f2fs_submit_merged_write_cond(sbi, NULL, NULL, ino, NODE); - return ret ? -EIO: 0; + return ret ? -EIO : 0; } static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data) @@ -2117,8 +2126,8 @@ static int __insert_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i) { struct f2fs_nm_info *nm_i = NM_I(sbi); - int err = radix_tree_insert(&nm_i->free_nid_root, i->nid, i); + if (err) return err; @@ -2785,6 +2794,9 @@ static void remove_nats_in_journal(struct f2fs_sb_info *sbi) struct f2fs_nat_entry raw_ne; nid_t nid = le32_to_cpu(nid_in_journal(journal, i)); + if (f2fs_check_nid_range(sbi, nid)) + continue; + raw_ne = nat_in_journal(journal, i); ne = __lookup_nat_cache(nm_i, nid); @@ -2980,6 +2992,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) while ((found = __gang_lookup_nat_set(nm_i, set_idx, SETVEC_SIZE, setvec))) { unsigned idx; + set_idx = setvec[found - 1]->set + 1; for (idx = 0; idx < found; idx++) __adjust_nat_entry_set(setvec[idx], &sets, diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index f84541b57acb..7a45c0f10629 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -147,6 +147,7 @@ enum mem_type { INO_ENTRIES, /* indicates inode entries */ EXTENT_CACHE, /* indicates extent cache */ INMEM_PAGES, /* indicates inmemory pages */ + DISCARD_CACHE, /* indicates memory of cached discard cmds */ BASE_CHECK, /* check kernel status */ }; diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index da75d5d52f0a..422146c6d866 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -458,6 +458,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, /* Get the previous summary */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { struct curseg_info *curseg = CURSEG_I(sbi, i); + if (curseg->segno == segno) { sum = curseg->sum_blk->entries[blkoff]; goto got_it; @@ -875,5 +876,5 @@ out: #endif sbi->sb->s_flags = s_flags; /* Restore SB_RDONLY status */ - return ret ? ret: err; + return ret ? ret : err; } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c2866561263e..51dc79fad4fe 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -186,7 +186,10 @@ void f2fs_register_inmem_page(struct inode *inode, struct page *page) { struct inmem_pages *new; - f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE); + if (PagePrivate(page)) + set_page_private(page, (unsigned long)ATOMIC_WRITTEN_PAGE); + else + f2fs_set_page_private(page, ATOMIC_WRITTEN_PAGE); new = f2fs_kmem_cache_alloc(inmem_entry_slab, GFP_NOFS); @@ -324,23 +327,27 @@ void f2fs_drop_inmem_pages(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - while (!list_empty(&fi->inmem_pages)) { + do { mutex_lock(&fi->inmem_lock); + if (list_empty(&fi->inmem_pages)) { + fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; + + spin_lock(&sbi->inode_lock[ATOMIC_FILE]); + if (!list_empty(&fi->inmem_ilist)) + list_del_init(&fi->inmem_ilist); + if (f2fs_is_atomic_file(inode)) { + clear_inode_flag(inode, FI_ATOMIC_FILE); + sbi->atomic_files--; + } + spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + + mutex_unlock(&fi->inmem_lock); + break; + } __revoke_inmem_pages(inode, &fi->inmem_pages, true, false, true); mutex_unlock(&fi->inmem_lock); - } - - fi->i_gc_failures[GC_FAILURE_ATOMIC] = 0; - - spin_lock(&sbi->inode_lock[ATOMIC_FILE]); - if (!list_empty(&fi->inmem_ilist)) - list_del_init(&fi->inmem_ilist); - if (f2fs_is_atomic_file(inode)) { - clear_inode_flag(inode, FI_ATOMIC_FILE); - sbi->atomic_files--; - } - spin_unlock(&sbi->inode_lock[ATOMIC_FILE]); + } while (1); } void f2fs_drop_inmem_page(struct inode *inode, struct page *page) @@ -503,8 +510,19 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) * dir/node pages without enough free segments. */ if (has_not_enough_free_secs(sbi, 0, 0)) { - down_write(&sbi->gc_lock); - f2fs_gc(sbi, false, false, NULL_SEGNO); + if (test_opt(sbi, GC_MERGE) && sbi->gc_thread && + sbi->gc_thread->f2fs_gc_task) { + DEFINE_WAIT(wait); + + prepare_to_wait(&sbi->gc_thread->fggc_wq, &wait, + TASK_UNINTERRUPTIBLE); + wake_up(&sbi->gc_thread->gc_wait_queue_head); + io_schedule(); + finish_wait(&sbi->gc_thread->fggc_wq, &wait); + } else { + down_write(&sbi->gc_lock); + f2fs_gc(sbi, false, false, false, NULL_SEGNO); + } } } @@ -653,7 +671,11 @@ int f2fs_issue_flush(struct f2fs_sb_info *sbi, nid_t ino) llist_add(&cmd.llnode, &fcc->issue_list); - /* update issue_list before we wake up issue_flush thread */ + /* + * update issue_list before we wake up issue_flush thread, this + * smp_mb() pairs with another barrier in ___wait_event(), see + * more details in comments of waitqueue_active(). + */ smp_mb(); if (waitqueue_active(&fcc->flush_wait_queue)) @@ -861,7 +883,7 @@ static void locate_dirty_segment(struct f2fs_sb_info *sbi, unsigned int segno) mutex_lock(&dirty_i->seglist_lock); valid_blocks = get_valid_blocks(sbi, segno, false); - ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno); + ckpt_valid_blocks = get_ckpt_valid_blocks(sbi, segno, false); if (valid_blocks == 0 && (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) || ckpt_valid_blocks == usable_blocks)) { @@ -946,7 +968,7 @@ static unsigned int get_free_segment(struct f2fs_sb_info *sbi) for_each_set_bit(segno, dirty_i->dirty_segmap[DIRTY], MAIN_SEGS(sbi)) { if (get_valid_blocks(sbi, segno, false)) continue; - if (get_ckpt_valid_blocks(sbi, segno)) + if (get_ckpt_valid_blocks(sbi, segno, false)) continue; mutex_unlock(&dirty_i->seglist_lock); return segno; @@ -1095,6 +1117,8 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, struct discard_policy *dpolicy, int discard_type, unsigned int granularity) { + struct discard_cmd_control *dcc = SM_I(sbi)->dcc_info; + /* common policy */ dpolicy->type = discard_type; dpolicy->sync = true; @@ -1114,7 +1138,9 @@ static void __init_discard_policy(struct f2fs_sb_info *sbi, dpolicy->ordered = true; if (utilization(sbi) > DEF_DISCARD_URGENT_UTIL) { dpolicy->granularity = 1; - dpolicy->max_interval = DEF_MIN_DISCARD_ISSUE_TIME; + if (atomic_read(&dcc->discard_cmd_cnt)) + dpolicy->max_interval = + DEF_MIN_DISCARD_ISSUE_TIME; } } else if (discard_type == DPOLICY_FORCE) { dpolicy->min_interval = DEF_MIN_DISCARD_ISSUE_TIME; @@ -1730,8 +1756,15 @@ static int issue_discard_thread(void *data) set_freezable(); do { - __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, - dcc->discard_granularity); + if (sbi->gc_mode == GC_URGENT_HIGH || + !f2fs_available_free_memory(sbi, DISCARD_CACHE)) + __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); + else + __init_discard_policy(sbi, &dpolicy, DPOLICY_BG, + dcc->discard_granularity); + + if (!atomic_read(&dcc->discard_cmd_cnt)) + wait_ms = dpolicy.max_interval; wait_event_interruptible_timeout(*q, kthread_should_stop() || freezing(current) || @@ -1755,9 +1788,8 @@ static int issue_discard_thread(void *data) wait_ms = dpolicy.max_interval; continue; } - - if (sbi->gc_mode == GC_URGENT_HIGH) - __init_discard_policy(sbi, &dpolicy, DPOLICY_FORCE, 1); + if (!atomic_read(&dcc->discard_cmd_cnt)) + continue; sb_start_intwrite(sbi->sb); @@ -1765,7 +1797,7 @@ static int issue_discard_thread(void *data) if (issued > 0) { __wait_all_discard_cmd(sbi, &dpolicy); wait_ms = dpolicy.min_interval; - } else if (issued == -1){ + } else if (issued == -1) { wait_ms = f2fs_time_to_wait(sbi, DISCARD_TIME); if (!wait_ms) wait_ms = dpolicy.mid_interval; @@ -2142,6 +2174,7 @@ static void __set_sit_entry_type(struct f2fs_sb_info *sbi, int type, unsigned int segno, int modified) { struct seg_entry *se = get_seg_entry(sbi, segno); + se->type = type; if (modified) __mark_sit_entry_dirty(sbi, segno); @@ -2333,6 +2366,7 @@ static void __add_sum_entry(struct f2fs_sb_info *sbi, int type, { struct curseg_info *curseg = CURSEG_I(sbi, type); void *addr = curseg->sum_blk; + addr += curseg->next_blkoff * sizeof(struct f2fs_summary); memcpy(addr, sum, sizeof(struct f2fs_summary)); } @@ -2604,22 +2638,20 @@ static void new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) curseg->alloc_type = LFS; } -static void __next_free_blkoff(struct f2fs_sb_info *sbi, - struct curseg_info *seg, block_t start) +static int __next_free_blkoff(struct f2fs_sb_info *sbi, + int segno, block_t start) { - struct seg_entry *se = get_seg_entry(sbi, seg->segno); + struct seg_entry *se = get_seg_entry(sbi, segno); int entries = SIT_VBLOCK_MAP_SIZE / sizeof(unsigned long); unsigned long *target_map = SIT_I(sbi)->tmp_map; unsigned long *ckpt_map = (unsigned long *)se->ckpt_valid_map; unsigned long *cur_map = (unsigned long *)se->cur_valid_map; - int i, pos; + int i; for (i = 0; i < entries; i++) target_map[i] = ckpt_map[i] | cur_map[i]; - pos = __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start); - - seg->next_blkoff = pos; + return __find_rev_next_zero_bit(target_map, sbi->blocks_per_seg, start); } /* @@ -2631,11 +2663,18 @@ static void __refresh_next_blkoff(struct f2fs_sb_info *sbi, struct curseg_info *seg) { if (seg->alloc_type == SSR) - __next_free_blkoff(sbi, seg, seg->next_blkoff + 1); + seg->next_blkoff = + __next_free_blkoff(sbi, seg->segno, + seg->next_blkoff + 1); else seg->next_blkoff++; } +bool f2fs_segment_has_free_slot(struct f2fs_sb_info *sbi, int segno) +{ + return __next_free_blkoff(sbi, segno, 0) < sbi->blocks_per_seg; +} + /* * This function always allocates a used segment(from dirty seglist) by SSR * manner, so it should recover the existing segment information of valid blocks @@ -2661,7 +2700,7 @@ static void change_curseg(struct f2fs_sb_info *sbi, int type, bool flush) reset_curseg(sbi, type, 1); curseg->alloc_type = SSR; - __next_free_blkoff(sbi, curseg, 0); + curseg->next_blkoff = __next_free_blkoff(sbi, curseg->segno, 0); sum_page = f2fs_get_sum_page(sbi, new_segno); if (IS_ERR(sum_page)) { @@ -2893,7 +2932,8 @@ unlock: up_read(&SM_I(sbi)->curseg_lock); } -static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type) +static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type, + bool new_sec, bool force) { struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int old_segno; @@ -2901,32 +2941,43 @@ static void __allocate_new_segment(struct f2fs_sb_info *sbi, int type) if (!curseg->inited) goto alloc; - if (!curseg->next_blkoff && - !get_valid_blocks(sbi, curseg->segno, false) && - !get_ckpt_valid_blocks(sbi, curseg->segno)) - return; + if (force || curseg->next_blkoff || + get_valid_blocks(sbi, curseg->segno, new_sec)) + goto alloc; + if (!get_ckpt_valid_blocks(sbi, curseg->segno, new_sec)) + return; alloc: old_segno = curseg->segno; SIT_I(sbi)->s_ops->allocate_segment(sbi, type, true); locate_dirty_segment(sbi, old_segno); } -void f2fs_allocate_new_segment(struct f2fs_sb_info *sbi, int type) +static void __allocate_new_section(struct f2fs_sb_info *sbi, + int type, bool force) +{ + __allocate_new_segment(sbi, type, true, force); +} + +void f2fs_allocate_new_section(struct f2fs_sb_info *sbi, int type, bool force) { + down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); - __allocate_new_segment(sbi, type); + __allocate_new_section(sbi, type, force); up_write(&SIT_I(sbi)->sentry_lock); + up_read(&SM_I(sbi)->curseg_lock); } void f2fs_allocate_new_segments(struct f2fs_sb_info *sbi) { int i; + down_read(&SM_I(sbi)->curseg_lock); down_write(&SIT_I(sbi)->sentry_lock); for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) - __allocate_new_segment(sbi, i); + __allocate_new_segment(sbi, i, false, false); up_write(&SIT_I(sbi)->sentry_lock); + up_read(&SM_I(sbi)->curseg_lock); } static const struct segment_allocation default_salloc_ops = { @@ -3239,7 +3290,9 @@ static int __get_segment_type_6(struct f2fs_io_info *fio) struct inode *inode = fio->page->mapping->host; if (is_cold_data(fio->page)) { - if (fio->sbi->am.atgc_enabled) + if (fio->sbi->am.atgc_enabled && + (fio->io_type == FS_DATA_IO) && + (fio->sbi->gc_mode != GC_URGENT_HIGH)) return CURSEG_ALL_DATA_ATGC; else return CURSEG_COLD_DATA; @@ -3365,12 +3418,12 @@ void f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, f2fs_inode_chksum_set(sbi, page); } - if (F2FS_IO_ALIGNED(sbi)) - fio->retry = false; - if (fio) { struct f2fs_bio_info *io; + if (F2FS_IO_ALIGNED(sbi)) + fio->retry = false; + INIT_LIST_HEAD(&fio->list); fio->in_list = true; io = sbi->write_io[fio->type] + fio->temp; @@ -3499,7 +3552,13 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn(sbi, "%s: incorrect segment(%u) type, run fsck to fix.", __func__, segno); - return -EFSCORRUPTED; + err = -EFSCORRUPTED; + goto drop_bio; + } + + if (is_sbi_flag_set(sbi, SBI_NEED_FSCK) || f2fs_cp_error(sbi)) { + err = -EIO; + goto drop_bio; } stat_inc_inplace_blocks(fio->sbi); @@ -3514,6 +3573,15 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) } return err; +drop_bio: + if (fio->bio && *(fio->bio)) { + struct bio *bio = *(fio->bio); + + bio->bi_status = BLK_STS_IOERR; + bio_endio(bio); + *(fio->bio) = NULL; + } + return err; } static inline int __f2fs_get_curseg(struct f2fs_sb_info *sbi, @@ -3539,6 +3607,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, struct seg_entry *se; int type; unsigned short old_blkoff; + unsigned char old_alloc_type; segno = GET_SEGNO(sbi, new_blkaddr); se = get_seg_entry(sbi, segno); @@ -3572,6 +3641,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, old_cursegno = curseg->segno; old_blkoff = curseg->next_blkoff; + old_alloc_type = curseg->alloc_type; /* change the current segment */ if (segno != curseg->segno) { @@ -3606,6 +3676,7 @@ void f2fs_do_replace_block(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, change_curseg(sbi, type, true); } curseg->next_blkoff = old_blkoff; + curseg->alloc_type = old_alloc_type; } up_write(&sit_i->sentry_lock); @@ -3717,6 +3788,7 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) for (j = 0; j < blk_off; j++) { struct f2fs_summary *s; + s = (struct f2fs_summary *)(kaddr + offset); seg_i->sum_blk->entries[j] = *s; offset += SUMMARY_SIZE; @@ -3779,6 +3851,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) if (__exist_node_summaries(sbi)) { struct f2fs_summary *ns = &sum->entries[0]; int i; + for (i = 0; i < sbi->blocks_per_seg; i++, ns++) { ns->version = 0; ns->ofs_in_node = 0; @@ -3880,6 +3953,7 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) /* Step 3: write summary entries */ for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { unsigned short blkoff; + seg_i = CURSEG_I(sbi, i); if (sbi->ckpt->alloc_type[i] == SSR) blkoff = sbi->blocks_per_seg; @@ -3916,6 +3990,7 @@ static void write_normal_summaries(struct f2fs_sb_info *sbi, block_t blkaddr, int type) { int i, end; + if (IS_DATASEG(type)) end = type + NR_CURSEG_DATA_TYPE; else @@ -4499,6 +4574,7 @@ static void init_free_segmap(struct f2fs_sb_info *sbi) /* set use the current segments */ for (type = CURSEG_HOT_DATA; type <= CURSEG_COLD_NODE; type++) { struct curseg_info *curseg_t = CURSEG_I(sbi, type); + __set_test_and_inuse(sbi, curseg_t->segno); } } @@ -4731,7 +4807,8 @@ static struct f2fs_dev_info *get_target_zoned_dev(struct f2fs_sb_info *sbi, } static int report_one_zone_cb(struct blk_zone *zone, unsigned int idx, - void *data) { + void *data) +{ memcpy(data, zone, sizeof(struct blk_zone)); return 0; } @@ -4783,7 +4860,8 @@ static int fix_curseg_write_pointer(struct f2fs_sb_info *sbi, int type) f2fs_notice(sbi, "Assign new section to curseg[%d]: " "curseg[0x%x,0x%x]", type, cs->segno, cs->next_blkoff); - allocate_segment_by_default(sbi, type, true); + + f2fs_allocate_new_section(sbi, type, true); /* check consistency of the zone curseg pointed to */ if (check_zone_write_pointer(sbi, zbd, &zone)) @@ -4847,8 +4925,10 @@ struct check_zone_write_pointer_args { }; static int check_zone_write_pointer_cb(struct blk_zone *zone, unsigned int idx, - void *data) { + void *data) +{ struct check_zone_write_pointer_args *args; + args = (struct check_zone_write_pointer_args *)data; return check_zone_write_pointer(args->sbi, args->fdev, zone); @@ -5127,6 +5207,7 @@ static void discard_dirty_segmap(struct f2fs_sb_info *sbi, static void destroy_victim_secmap(struct f2fs_sb_info *sbi) { struct dirty_seglist_info *dirty_i = DIRTY_I(sbi); + kvfree(dirty_i->victim_secmap); } @@ -5171,6 +5252,7 @@ static void destroy_curseg(struct f2fs_sb_info *sbi) static void destroy_free_segmap(struct f2fs_sb_info *sbi) { struct free_segmap_info *free_i = SM_I(sbi)->free_info; + if (!free_i) return; SM_I(sbi)->free_info = NULL; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index e9a7a637d688..050230c70a53 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -172,12 +172,10 @@ enum { /* * BG_GC means the background cleaning job. * FG_GC means the on-demand cleaning job. - * FORCE_FG_GC means on-demand cleaning job in background. */ enum { BG_GC = 0, FG_GC, - FORCE_FG_GC, }; /* for a function parameter to select a victim segment */ @@ -361,8 +359,20 @@ static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, } static inline unsigned int get_ckpt_valid_blocks(struct f2fs_sb_info *sbi, - unsigned int segno) + unsigned int segno, bool use_section) { + if (use_section && __is_large_section(sbi)) { + unsigned int start_segno = START_SEGNO(segno); + unsigned int blocks = 0; + int i; + + for (i = 0; i < sbi->segs_per_sec; i++, start_segno++) { + struct seg_entry *se = get_seg_entry(sbi, start_segno); + + blocks += se->ckpt_valid_blocks; + } + return blocks; + } return get_seg_entry(sbi, segno)->ckpt_valid_blocks; } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 82592b19b4e0..7d325bfaf65a 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -151,6 +151,8 @@ enum { Opt_compress_chksum, Opt_compress_mode, Opt_atgc, + Opt_gc_merge, + Opt_nogc_merge, Opt_err, }; @@ -223,6 +225,8 @@ static match_table_t f2fs_tokens = { {Opt_compress_chksum, "compress_chksum"}, {Opt_compress_mode, "compress_mode=%s"}, {Opt_atgc, "atgc"}, + {Opt_gc_merge, "gc_merge"}, + {Opt_nogc_merge, "nogc_merge"}, {Opt_err, NULL}, }; @@ -555,6 +559,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) while ((p = strsep(&options, ",")) != NULL) { int token; + if (!*p) continue; /* @@ -1073,6 +1078,12 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) case Opt_atgc: set_opt(sbi, ATGC); break; + case Opt_gc_merge: + set_opt(sbi, GC_MERGE); + break; + case Opt_nogc_merge: + clear_opt(sbi, GC_MERGE); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); @@ -1616,6 +1627,7 @@ static inline void f2fs_show_quota_options(struct seq_file *seq, #endif } +#ifdef CONFIG_F2FS_FS_COMPRESSION static inline void f2fs_show_compress_options(struct seq_file *seq, struct super_block *sb) { @@ -1661,6 +1673,7 @@ static inline void f2fs_show_compress_options(struct seq_file *seq, else if (F2FS_OPTION(sbi).compress_mode == COMPR_MODE_USER) seq_printf(seq, ",compress_mode=%s", "user"); } +#endif static int f2fs_show_options(struct seq_file *seq, struct dentry *root) { @@ -1673,6 +1686,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF) seq_printf(seq, ",background_gc=%s", "off"); + if (test_opt(sbi, GC_MERGE)) + seq_puts(seq, ",gc_merge"); + if (test_opt(sbi, DISABLE_ROLL_FORWARD)) seq_puts(seq, ",disable_roll_forward"); if (test_opt(sbi, NORECOVERY)) @@ -1824,6 +1840,7 @@ static void default_options(struct f2fs_sb_info *sbi) set_opt(sbi, EXTENT_CACHE); set_opt(sbi, NOHEAP); clear_opt(sbi, DISABLE_CHECKPOINT); + set_opt(sbi, MERGE_CHECKPOINT); F2FS_OPTION(sbi).unusable_cap = 0; sbi->sb->s_flags |= SB_LAZYTIME; set_opt(sbi, FLUSH_MERGE); @@ -1865,7 +1882,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) while (!f2fs_time_over(sbi, DISABLE_TIME)) { down_write(&sbi->gc_lock); - err = f2fs_gc(sbi, true, false, NULL_SEGNO); + err = f2fs_gc(sbi, true, false, false, NULL_SEGNO); if (err == -ENODATA) { err = 0; break; @@ -1876,7 +1893,7 @@ static int f2fs_disable_checkpoint(struct f2fs_sb_info *sbi) ret = sync_filesystem(sbi->sb); if (ret || err) { - err = ret ? ret: err; + err = ret ? ret : err; goto restore_flag; } @@ -1925,8 +1942,9 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) struct f2fs_mount_info org_mount_opt; unsigned long old_sb_flags; int err; - bool need_restart_gc = false; - bool need_stop_gc = false; + bool need_restart_gc = false, need_stop_gc = false; + bool need_restart_ckpt = false, need_stop_ckpt = false; + bool need_restart_flush = false, need_stop_flush = false; bool no_extent_cache = !test_opt(sbi, EXTENT_CACHE); bool disable_checkpoint = test_opt(sbi, DISABLE_CHECKPOINT); bool no_io_align = !F2FS_IO_ALIGNED(sbi); @@ -2035,7 +2053,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) * option. Also sync the filesystem. */ if ((*flags & SB_RDONLY) || - F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF) { + (F2FS_OPTION(sbi).bggc_mode == BGGC_MODE_OFF && + !test_opt(sbi, GC_MERGE))) { if (sbi->gc_thread) { f2fs_stop_gc_thread(sbi); need_restart_gc = true; @@ -2057,18 +2076,11 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) clear_sbi_flag(sbi, SBI_IS_CLOSE); } - if (checkpoint_changed) { - if (test_opt(sbi, DISABLE_CHECKPOINT)) { - err = f2fs_disable_checkpoint(sbi); - if (err) - goto restore_gc; - } else { - f2fs_enable_checkpoint(sbi); - } - } - - if (!test_opt(sbi, DISABLE_CHECKPOINT) && - test_opt(sbi, MERGE_CHECKPOINT)) { + if ((*flags & SB_RDONLY) || test_opt(sbi, DISABLE_CHECKPOINT) || + !test_opt(sbi, MERGE_CHECKPOINT)) { + f2fs_stop_ckpt_thread(sbi); + need_restart_ckpt = true; + } else { err = f2fs_start_ckpt_thread(sbi); if (err) { f2fs_err(sbi, @@ -2076,8 +2088,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) err); goto restore_gc; } - } else { - f2fs_stop_ckpt_thread(sbi); + need_stop_ckpt = true; } /* @@ -2087,11 +2098,24 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) if ((*flags & SB_RDONLY) || !test_opt(sbi, FLUSH_MERGE)) { clear_opt(sbi, FLUSH_MERGE); f2fs_destroy_flush_cmd_control(sbi, false); + need_restart_flush = true; } else { err = f2fs_create_flush_cmd_control(sbi); if (err) - goto restore_gc; + goto restore_ckpt; + need_stop_flush = true; } + + if (checkpoint_changed) { + if (test_opt(sbi, DISABLE_CHECKPOINT)) { + err = f2fs_disable_checkpoint(sbi); + if (err) + goto restore_flush; + } else { + f2fs_enable_checkpoint(sbi); + } + } + skip: #ifdef CONFIG_QUOTA /* Release old quota file names */ @@ -2106,6 +2130,21 @@ skip: adjust_unusable_cap_perc(sbi); *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); return 0; +restore_flush: + if (need_restart_flush) { + if (f2fs_create_flush_cmd_control(sbi)) + f2fs_warn(sbi, "background flush thread has stopped"); + } else if (need_stop_flush) { + clear_opt(sbi, FLUSH_MERGE); + f2fs_destroy_flush_cmd_control(sbi, false); + } +restore_ckpt: + if (need_restart_ckpt) { + if (f2fs_start_ckpt_thread(sbi)) + f2fs_warn(sbi, "background ckpt thread has stopped"); + } else if (need_stop_ckpt) { + f2fs_stop_ckpt_thread(sbi); + } restore_gc: if (need_restart_gc) { if (f2fs_start_gc_thread(sbi)) @@ -3719,7 +3758,7 @@ try_onemore: sbi->iostat_period_ms = DEFAULT_IOSTAT_PERIOD_MS; for (i = 0; i < NR_PAGE_TYPE; i++) { - int n = (i == META) ? 1: NR_TEMP_TYPE; + int n = (i == META) ? 1 : NR_TEMP_TYPE; int j; sbi->write_io[i] = @@ -3833,7 +3872,7 @@ try_onemore: /* setup checkpoint request control and start checkpoint issue thread */ f2fs_init_ckpt_req_control(sbi); - if (!test_opt(sbi, DISABLE_CHECKPOINT) && + if (!f2fs_readonly(sb) && !test_opt(sbi, DISABLE_CHECKPOINT) && test_opt(sbi, MERGE_CHECKPOINT)) { err = f2fs_start_ckpt_thread(sbi); if (err) { @@ -3929,10 +3968,18 @@ try_onemore: * previous checkpoint was not done by clean system shutdown. */ if (f2fs_hw_is_readonly(sbi)) { - if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) - f2fs_err(sbi, "Need to recover fsync data, but write access unavailable"); - else - f2fs_info(sbi, "write access unavailable, skipping recovery"); + if (!is_set_ckpt_flags(sbi, CP_UMOUNT_FLAG)) { + err = f2fs_recover_fsync_data(sbi, true); + if (err > 0) { + err = -EROFS; + f2fs_err(sbi, "Need to recover fsync data, but " + "write access unavailable, please try " + "mount w/ disable_roll_forward or norecovery"); + } + if (err < 0) + goto free_meta; + } + f2fs_info(sbi, "write access unavailable, skipping recovery"); goto reset_checkpoint; } @@ -3989,7 +4036,8 @@ reset_checkpoint: * If filesystem is not mounted as read-only then * do start the gc_thread. */ - if (F2FS_OPTION(sbi).bggc_mode != BGGC_MODE_OFF && !f2fs_readonly(sb)) { + if ((F2FS_OPTION(sbi).bggc_mode != BGGC_MODE_OFF || + test_opt(sbi, GC_MERGE)) && !f2fs_readonly(sb)) { /* After POR, we can run background GC thread.*/ err = f2fs_start_gc_thread(sbi); if (err) diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index e38a7f6921dd..39b522ec73e7 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -12,6 +12,7 @@ #include <linux/seq_file.h> #include <linux/unicode.h> #include <linux/ioprio.h> +#include <linux/sysfs.h> #include "f2fs.h" #include "segment.h" @@ -91,6 +92,13 @@ static ssize_t free_segments_show(struct f2fs_attr *a, (unsigned long long)(free_segments(sbi))); } +static ssize_t ovp_segments_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sprintf(buf, "%llu\n", + (unsigned long long)(overprovision_segments(sbi))); +} + static ssize_t lifetime_write_kbytes_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -282,6 +290,17 @@ static ssize_t f2fs_sbi_show(struct f2fs_attr *a, return len; } +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (!strcmp(a->attr.name, "compr_written_block")) + return sysfs_emit(buf, "%llu\n", sbi->compr_written_block); + + if (!strcmp(a->attr.name, "compr_saved_block")) + return sysfs_emit(buf, "%llu\n", sbi->compr_saved_block); + + if (!strcmp(a->attr.name, "compr_new_inode")) + return sysfs_emit(buf, "%u\n", sbi->compr_new_inode); +#endif + ui = (unsigned int *)(ptr + a->offset); return sprintf(buf, "%u\n", *ui); @@ -458,6 +477,24 @@ out: return count; } +#ifdef CONFIG_F2FS_FS_COMPRESSION + if (!strcmp(a->attr.name, "compr_written_block") || + !strcmp(a->attr.name, "compr_saved_block")) { + if (t != 0) + return -EINVAL; + sbi->compr_written_block = 0; + sbi->compr_saved_block = 0; + return count; + } + + if (!strcmp(a->attr.name, "compr_new_inode")) { + if (t != 0) + return -EINVAL; + sbi->compr_new_inode = 0; + return count; + } +#endif + *ui = (unsigned int)t; return count; @@ -629,6 +666,7 @@ F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, node_io_flag, node_io_flag); F2FS_RW_ATTR(CPRC_INFO, ckpt_req_control, ckpt_thread_ioprio, ckpt_thread_ioprio); F2FS_GENERAL_RO_ATTR(dirty_segments); F2FS_GENERAL_RO_ATTR(free_segments); +F2FS_GENERAL_RO_ATTR(ovp_segments); F2FS_GENERAL_RO_ATTR(lifetime_write_kbytes); F2FS_GENERAL_RO_ATTR(features); F2FS_GENERAL_RO_ATTR(current_reserved_blocks); @@ -668,6 +706,9 @@ F2FS_FEATURE_RO_ATTR(sb_checksum, FEAT_SB_CHECKSUM); F2FS_FEATURE_RO_ATTR(casefold, FEAT_CASEFOLD); #ifdef CONFIG_F2FS_FS_COMPRESSION F2FS_FEATURE_RO_ATTR(compression, FEAT_COMPRESSION); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_written_block, compr_written_block); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_saved_block, compr_saved_block); +F2FS_RW_ATTR(F2FS_SBI, f2fs_sb_info, compr_new_inode, compr_new_inode); #endif #define ATTR_LIST(name) (&f2fs_attr_##name.attr) @@ -715,6 +756,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(ckpt_thread_ioprio), ATTR_LIST(dirty_segments), ATTR_LIST(free_segments), + ATTR_LIST(ovp_segments), ATTR_LIST(unusable), ATTR_LIST(lifetime_write_kbytes), ATTR_LIST(features), @@ -731,6 +773,11 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(moved_blocks_background), ATTR_LIST(avg_vblocks), #endif +#ifdef CONFIG_F2FS_FS_COMPRESSION + ATTR_LIST(compr_written_block), + ATTR_LIST(compr_saved_block), + ATTR_LIST(compr_new_inode), +#endif NULL, }; ATTRIBUTE_GROUPS(f2fs); diff --git a/fs/f2fs/verity.c b/fs/f2fs/verity.c index a7beff28a3c5..03549b5ba204 100644 --- a/fs/f2fs/verity.c +++ b/fs/f2fs/verity.c @@ -152,40 +152,73 @@ static int f2fs_end_enable_verity(struct file *filp, const void *desc, size_t desc_size, u64 merkle_tree_size) { struct inode *inode = file_inode(filp); + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); u64 desc_pos = f2fs_verity_metadata_pos(inode) + merkle_tree_size; struct fsverity_descriptor_location dloc = { .version = cpu_to_le32(F2FS_VERIFY_VER), .size = cpu_to_le32(desc_size), .pos = cpu_to_le64(desc_pos), }; - int err = 0; + int err = 0, err2 = 0; - if (desc != NULL) { - /* Succeeded; write the verity descriptor. */ - err = pagecache_write(inode, desc, desc_size, desc_pos); + /* + * If an error already occurred (which fs/verity/ signals by passing + * desc == NULL), then only clean-up is needed. + */ + if (desc == NULL) + goto cleanup; - /* Write all pages before clearing FI_VERITY_IN_PROGRESS. */ - if (!err) - err = filemap_write_and_wait(inode->i_mapping); - } + /* Append the verity descriptor. */ + err = pagecache_write(inode, desc, desc_size, desc_pos); + if (err) + goto cleanup; + + /* + * Write all pages (both data and verity metadata). Note that this must + * happen before clearing FI_VERITY_IN_PROGRESS; otherwise pages beyond + * i_size won't be written properly. For crash consistency, this also + * must happen before the verity inode flag gets persisted. + */ + err = filemap_write_and_wait(inode->i_mapping); + if (err) + goto cleanup; + + /* Set the verity xattr. */ + err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY, + F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), + NULL, XATTR_CREATE); + if (err) + goto cleanup; - /* If we failed, truncate anything we wrote past i_size. */ - if (desc == NULL || err) - f2fs_truncate(inode); + /* Finally, set the verity inode flag. */ + file_set_verity(inode); + f2fs_set_inode_flags(inode); + f2fs_mark_inode_dirty_sync(inode, true); clear_inode_flag(inode, FI_VERITY_IN_PROGRESS); + return 0; - if (desc != NULL && !err) { - err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_VERITY, - F2FS_XATTR_NAME_VERITY, &dloc, sizeof(dloc), - NULL, XATTR_CREATE); - if (!err) { - file_set_verity(inode); - f2fs_set_inode_flags(inode); - f2fs_mark_inode_dirty_sync(inode, true); - } +cleanup: + /* + * Verity failed to be enabled, so clean up by truncating any verity + * metadata that was written beyond i_size (both from cache and from + * disk) and clearing FI_VERITY_IN_PROGRESS. + * + * Taking i_gc_rwsem[WRITE] is needed to stop f2fs garbage collection + * from re-instantiating cached pages we are truncating (since unlike + * normal file accesses, garbage collection isn't limited by i_size). + */ + down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + truncate_inode_pages(inode->i_mapping, inode->i_size); + err2 = f2fs_truncate(inode); + if (err2) { + f2fs_err(sbi, "Truncating verity metadata failed (errno=%d)", + err2); + set_sbi_flag(sbi, SBI_NEED_FSCK); } - return err; + up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); + clear_inode_flag(inode, FI_VERITY_IN_PROGRESS); + return err ?: err2; } static int f2fs_get_verity_descriptor(struct inode *inode, void *buf, diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 490f843ec3bf..c8f34decbf8e 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -488,6 +488,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, f2fs_wait_on_page_writeback(xpage, NODE, true, true); } else { struct dnode_of_data dn; + set_new_dnode(&dn, inode, NULL, NULL, new_nid); xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET); if (IS_ERR(xpage)) { diff --git a/fs/fat/fatent.c b/fs/fat/fatent.c index f7e3304b7802..860e884e56e8 100644 --- a/fs/fat/fatent.c +++ b/fs/fat/fatent.c @@ -771,7 +771,7 @@ int fat_trim_fs(struct inode *inode, struct fstrim_range *range) /* * FAT data is organized as clusters, trim at the granulary of cluster. * - * fstrim_range is in byte, convert vaules to cluster index. + * fstrim_range is in byte, convert values to cluster index. * Treat sectors before data region as all used, not to trim them. */ ent_start = max_t(u64, range->start>>sbi->cluster_bits, FAT_START_ENT); diff --git a/fs/file.c b/fs/file.c index f633348029a5..86dc9956af32 100644 --- a/fs/file.c +++ b/fs/file.c @@ -1081,8 +1081,6 @@ out_unlock: /** * __receive_fd() - Install received file into file descriptor table - * - * @fd: fd to install into (if negative, a new fd will be allocated) * @file: struct file that was received from another process * @ufd: __user pointer to write new fd number to * @o_flags: the O_* flags to apply to the new fd entry @@ -1096,7 +1094,7 @@ out_unlock: * * Returns newly install fd or -ve on error. */ -int __receive_fd(int fd, struct file *file, int __user *ufd, unsigned int o_flags) +int __receive_fd(struct file *file, int __user *ufd, unsigned int o_flags) { int new_fd; int error; @@ -1105,32 +1103,33 @@ int __receive_fd(int fd, struct file *file, int __user *ufd, unsigned int o_flag if (error) return error; - if (fd < 0) { - new_fd = get_unused_fd_flags(o_flags); - if (new_fd < 0) - return new_fd; - } else { - new_fd = fd; - } + new_fd = get_unused_fd_flags(o_flags); + if (new_fd < 0) + return new_fd; if (ufd) { error = put_user(new_fd, ufd); if (error) { - if (fd < 0) - put_unused_fd(new_fd); + put_unused_fd(new_fd); return error; } } - if (fd < 0) { - fd_install(new_fd, get_file(file)); - } else { - error = replace_fd(new_fd, file, o_flags); - if (error) - return error; - } + fd_install(new_fd, get_file(file)); + __receive_sock(file); + return new_fd; +} - /* Bump the sock usage counts, if any. */ +int receive_fd_replace(int new_fd, struct file *file, unsigned int o_flags) +{ + int error; + + error = security_file_receive(file); + if (error) + return error; + error = replace_fd(new_fd, file, o_flags); + if (error) + return error; __receive_sock(file); return new_fd; } diff --git a/fs/fs_parser.c b/fs/fs_parser.c index 68b0148f4bb8..980d44fd3a36 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -310,7 +310,6 @@ EXPORT_SYMBOL(fs_param_is_path); #ifdef CONFIG_VALIDATE_FS_PARSER /** * validate_constant_table - Validate a constant table - * @name: Name to use in reporting * @tbl: The constant table to validate. * @tbl_size: The size of the table. * @low: The lowest permissible value. @@ -360,6 +359,7 @@ bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size, /** * fs_validate_description - Validate a parameter description + * @name: The parameter name to search for. * @desc: The parameter description to validate. */ bool fs_validate_description(const char *name, diff --git a/fs/fuse/acl.c b/fs/fuse/acl.c index e9c0f916349d..52b165319be1 100644 --- a/fs/fuse/acl.c +++ b/fs/fuse/acl.c @@ -71,6 +71,7 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, return -EINVAL; if (acl) { + unsigned int extra_flags = 0; /* * Fuse userspace is responsible for updating access * permissions in the inode, if needed. fuse_setxattr @@ -94,7 +95,11 @@ int fuse_set_acl(struct user_namespace *mnt_userns, struct inode *inode, return ret; } - ret = fuse_setxattr(inode, name, value, size, 0); + if (!in_group_p(i_gid_into_mnt(&init_user_ns, inode)) && + !capable_wrt_inode_uidgid(&init_user_ns, inode, CAP_FSETID)) + extra_flags |= FUSE_SETXATTR_ACL_KILL_SGID; + + ret = fuse_setxattr(inode, name, value, size, 0, extra_flags); kfree(value); } else { ret = fuse_removexattr(inode, name); diff --git a/fs/fuse/cuse.c b/fs/fuse/cuse.c index 45082269e698..c7d882a9fe33 100644 --- a/fs/fuse/cuse.c +++ b/fs/fuse/cuse.c @@ -511,20 +511,18 @@ static int cuse_channel_open(struct inode *inode, struct file *file) fuse_conn_init(&cc->fc, &cc->fm, file->f_cred->user_ns, &fuse_dev_fiq_ops, NULL); + cc->fc.release = cuse_fc_release; fud = fuse_dev_alloc_install(&cc->fc); - if (!fud) { - kfree(cc); + fuse_conn_put(&cc->fc); + if (!fud) return -ENOMEM; - } INIT_LIST_HEAD(&cc->list); - cc->fc.release = cuse_fc_release; cc->fc.initialized = 1; rc = cuse_send_init(cc); if (rc) { fuse_dev_free(fud); - fuse_conn_put(&cc->fc); return rc; } file->private_data = fud; @@ -561,8 +559,6 @@ static int cuse_channel_release(struct inode *inode, struct file *file) unregister_chrdev_region(cc->cdev->dev, 1); cdev_del(cc->cdev); } - /* Base reference is now owned by "fud" */ - fuse_conn_put(&cc->fc); rc = fuse_dev_release(inode, file); /* puts the base reference */ @@ -627,6 +623,8 @@ static int __init cuse_init(void) cuse_channel_fops.owner = THIS_MODULE; cuse_channel_fops.open = cuse_channel_open; cuse_channel_fops.release = cuse_channel_release; + /* CUSE is not prepared for FUSE_DEV_IOC_CLONE */ + cuse_channel_fops.unlocked_ioctl = NULL; cuse_class = class_create(THIS_MODULE, "cuse"); if (IS_ERR(cuse_class)) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index c0fee830a34e..a5ceccc5ef00 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -2233,11 +2233,8 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, int oldfd; struct fuse_dev *fud = NULL; - if (_IOC_TYPE(cmd) != FUSE_DEV_IOC_MAGIC) - return -ENOTTY; - - switch (_IOC_NR(cmd)) { - case _IOC_NR(FUSE_DEV_IOC_CLONE): + switch (cmd) { + case FUSE_DEV_IOC_CLONE: res = -EFAULT; if (!get_user(oldfd, (__u32 __user *)arg)) { struct file *old = fget(oldfd); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index e8aa5337eb29..09ef2a4d25ed 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -802,21 +802,12 @@ static void fuse_short_read(struct inode *inode, u64 attr_ver, size_t num_read, { struct fuse_conn *fc = get_fuse_conn(inode); - if (fc->writeback_cache) { - /* - * A hole in a file. Some data after the hole are in page cache, - * but have not reached the client fs yet. So, the hole is not - * present there. - */ - int i; - int start_idx = num_read >> PAGE_SHIFT; - size_t off = num_read & (PAGE_SIZE - 1); - - for (i = start_idx; i < ap->num_pages; i++) { - zero_user_segment(ap->pages[i], off, PAGE_SIZE); - off = 0; - } - } else { + /* + * If writeback_cache is enabled, a short read means there's a hole in + * the file. Some data after the hole is in page cache, but has not + * reached the client fs yet. So the hole is not present there. + */ + if (!fc->writeback_cache) { loff_t pos = page_offset(ap->pages[0]) + num_read; fuse_read_update_size(inode, pos, attr_ver); } @@ -1103,6 +1094,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; unsigned int offset, i; + bool short_write; int err; for (i = 0; i < ap->num_pages; i++) @@ -1117,32 +1109,38 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, if (!err && ia->write.out.size > count) err = -EIO; + short_write = ia->write.out.size < count; offset = ap->descs[0].offset; count = ia->write.out.size; for (i = 0; i < ap->num_pages; i++) { struct page *page = ap->pages[i]; - if (!err && !offset && count >= PAGE_SIZE) - SetPageUptodate(page); - - if (count > PAGE_SIZE - offset) - count -= PAGE_SIZE - offset; - else - count = 0; - offset = 0; - - unlock_page(page); + if (err) { + ClearPageUptodate(page); + } else { + if (count >= PAGE_SIZE - offset) + count -= PAGE_SIZE - offset; + else { + if (short_write) + ClearPageUptodate(page); + count = 0; + } + offset = 0; + } + if (ia->write.page_locked && (i == ap->num_pages - 1)) + unlock_page(page); put_page(page); } return err; } -static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap, +static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, struct address_space *mapping, struct iov_iter *ii, loff_t pos, unsigned int max_pages) { + struct fuse_args_pages *ap = &ia->ap; struct fuse_conn *fc = get_fuse_conn(mapping->host); unsigned offset = pos & (PAGE_SIZE - 1); size_t count = 0; @@ -1195,6 +1193,16 @@ static ssize_t fuse_fill_write_pages(struct fuse_args_pages *ap, if (offset == PAGE_SIZE) offset = 0; + /* If we copied full page, mark it uptodate */ + if (tmp == PAGE_SIZE) + SetPageUptodate(page); + + if (PageUptodate(page)) { + unlock_page(page); + } else { + ia->write.page_locked = true; + break; + } if (!fc->big_writes) break; } while (iov_iter_count(ii) && count < fc->max_write && @@ -1238,7 +1246,7 @@ static ssize_t fuse_perform_write(struct kiocb *iocb, break; } - count = fuse_fill_write_pages(ap, mapping, ii, pos, nr_pages); + count = fuse_fill_write_pages(&ia, mapping, ii, pos, nr_pages); if (count <= 0) { err = count; } else { @@ -1753,8 +1761,17 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, container_of(args, typeof(*wpa), ia.ap.args); struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); mapping_set_error(inode->i_mapping, error); + /* + * A writeback finished and this might have updated mtime/ctime on + * server making local mtime/ctime stale. Hence invalidate attrs. + * Do this only if writeback_cache is not enabled. If writeback_cache + * is enabled, we trust local ctime/mtime. + */ + if (!fc->writeback_cache) + fuse_invalidate_attr(inode); spin_lock(&fi->lock); rb_erase(&wpa->writepages_entry, &fi->writepages); while (wpa->next) { diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index ca868b71eb97..7e463e220053 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -552,9 +552,12 @@ struct fuse_conn { /** Maximum write size */ unsigned max_write; - /** Maxmum number of pages that can be used in a single request */ + /** Maximum number of pages that can be used in a single request */ unsigned int max_pages; + /** Constrain ->max_pages to this value during feature negotiation */ + unsigned int max_pages_limit; + /** Input queue */ struct fuse_iqueue iq; @@ -668,6 +671,9 @@ struct fuse_conn { /** Is setxattr not implemented by fs? */ unsigned no_setxattr:1; + /** Does file server support extended setxattr */ + unsigned setxattr_ext:1; + /** Is getxattr not implemented by fs? */ unsigned no_getxattr:1; @@ -713,7 +719,7 @@ struct fuse_conn { /** Use enhanced/automatic page cache invalidation. */ unsigned auto_inval_data:1; - /** Filesystem is fully reponsible for page cache invalidation. */ + /** Filesystem is fully responsible for page cache invalidation. */ unsigned explicit_inval_data:1; /** Does the filesystem support readdirplus? */ @@ -934,6 +940,7 @@ struct fuse_io_args { struct { struct fuse_write_in in; struct fuse_write_out out; + bool page_locked; } write; }; struct fuse_args_pages ap; @@ -1193,7 +1200,7 @@ void fuse_unlock_inode(struct inode *inode, bool locked); bool fuse_lock_inode(struct inode *inode); int fuse_setxattr(struct inode *inode, const char *name, const void *value, - size_t size, int flags); + size_t size, int flags, unsigned int extra_flags); ssize_t fuse_getxattr(struct inode *inode, const char *name, void *value, size_t size); ssize_t fuse_listxattr(struct dentry *entry, char *list, size_t size); diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index b4b956da3851..393e36b74dc4 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -712,6 +712,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->pid_ns = get_pid_ns(task_active_pid_ns(current)); fc->user_ns = get_user_ns(user_ns); fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; + fc->max_pages_limit = FUSE_MAX_MAX_PAGES; INIT_LIST_HEAD(&fc->mounts); list_add(&fm->fc_entry, &fc->mounts); @@ -872,14 +873,13 @@ static struct dentry *fuse_get_parent(struct dentry *child) struct inode *inode; struct dentry *parent; struct fuse_entry_out outarg; - const struct qstr name = QSTR_INIT("..", 2); int err; if (!fc->export_support) return ERR_PTR(-ESTALE); err = fuse_lookup_name(child_inode->i_sb, get_node_id(child_inode), - &name, &outarg, &inode); + &dotdot_name, &outarg, &inode); if (err) { if (err == -ENOENT) return ERR_PTR(-ESTALE); @@ -1040,7 +1040,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->abort_err = 1; if (arg->flags & FUSE_MAX_PAGES) { fc->max_pages = - min_t(unsigned int, FUSE_MAX_MAX_PAGES, + min_t(unsigned int, fc->max_pages_limit, max_t(unsigned int, arg->max_pages, 1)); } if (IS_ENABLED(CONFIG_FUSE_DAX) && @@ -1052,6 +1052,8 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->handle_killpriv_v2 = 1; fm->sb->s_flags |= SB_NOSEC; } + if (arg->flags & FUSE_SETXATTR_EXT) + fc->setxattr_ext = 1; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; @@ -1095,7 +1097,7 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_PARALLEL_DIROPS | FUSE_HANDLE_KILLPRIV | FUSE_POSIX_ACL | FUSE_ABORT_ERROR | FUSE_MAX_PAGES | FUSE_CACHE_SYMLINKS | FUSE_NO_OPENDIR_SUPPORT | FUSE_EXPLICIT_INVAL_DATA | - FUSE_HANDLE_KILLPRIV_V2; + FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) ia->in.flags |= FUSE_MAP_ALIGNMENT; diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 4ee6f734ba83..bcb8a02e2d8b 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -18,6 +18,12 @@ #include <linux/uio.h> #include "fuse_i.h" +/* Used to help calculate the FUSE connection's max_pages limit for a request's + * size. Parts of the struct fuse_req are sliced into scattergather lists in + * addition to the pages used, so this can help account for that overhead. + */ +#define FUSE_HEADER_OVERHEAD 4 + /* List of virtio-fs device instances and a lock for the list. Also provides * mutual exclusion in device removal and mounting path */ @@ -127,11 +133,6 @@ static inline struct virtio_fs_vq *vq_to_fsvq(struct virtqueue *vq) return &fs->vqs[vq->index]; } -static inline struct fuse_pqueue *vq_to_fpq(struct virtqueue *vq) -{ - return &vq_to_fsvq(vq)->fud->pq; -} - /* Should be called with fsvq->lock held. */ static inline void inc_in_flight_req(struct virtio_fs_vq *fsvq) { @@ -896,6 +897,7 @@ static int virtio_fs_probe(struct virtio_device *vdev) out_vqs: vdev->config->reset(vdev); virtio_fs_cleanup_vqs(vdev, fs); + kfree(fs->vqs); out: vdev->priv = NULL; @@ -1413,9 +1415,10 @@ static int virtio_fs_get_tree(struct fs_context *fsc) { struct virtio_fs *fs; struct super_block *sb; - struct fuse_conn *fc; + struct fuse_conn *fc = NULL; struct fuse_mount *fm; - int err; + unsigned int virtqueue_size; + int err = -EIO; /* This gets a reference on virtio_fs object. This ptr gets installed * in fc->iq->priv. Once fuse_conn is going away, it calls ->put() @@ -1427,6 +1430,10 @@ static int virtio_fs_get_tree(struct fs_context *fsc) return -EINVAL; } + virtqueue_size = virtqueue_get_vring_size(fs->vqs[VQ_REQUEST].vq); + if (WARN_ON(virtqueue_size <= FUSE_HEADER_OVERHEAD)) + goto out_err; + err = -ENOMEM; fc = kzalloc(sizeof(struct fuse_conn), GFP_KERNEL); if (!fc) @@ -1436,12 +1443,15 @@ static int virtio_fs_get_tree(struct fs_context *fsc) if (!fm) goto out_err; - fuse_conn_init(fc, fm, get_user_ns(current_user_ns()), - &virtio_fs_fiq_ops, fs); + fuse_conn_init(fc, fm, fsc->user_ns, &virtio_fs_fiq_ops, fs); fc->release = fuse_free_conn; fc->delete_stale = true; fc->auto_submounts = true; + /* Tell FUSE to split requests that exceed the virtqueue's size */ + fc->max_pages_limit = min_t(unsigned int, fc->max_pages_limit, + virtqueue_size - FUSE_HEADER_OVERHEAD); + fsc->s_fs_info = fm; sb = sget_fc(fsc, virtio_fs_test_super, set_anon_super_fc); if (fsc->s_fs_info) { diff --git a/fs/fuse/xattr.c b/fs/fuse/xattr.c index 1a7d7ace54e1..61dfaf7b7d20 100644 --- a/fs/fuse/xattr.c +++ b/fs/fuse/xattr.c @@ -12,7 +12,7 @@ #include <linux/posix_acl_xattr.h> int fuse_setxattr(struct inode *inode, const char *name, const void *value, - size_t size, int flags) + size_t size, int flags, unsigned int extra_flags) { struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); @@ -25,10 +25,13 @@ int fuse_setxattr(struct inode *inode, const char *name, const void *value, memset(&inarg, 0, sizeof(inarg)); inarg.size = size; inarg.flags = flags; + inarg.setxattr_flags = extra_flags; + args.opcode = FUSE_SETXATTR; args.nodeid = get_node_id(inode); args.in_numargs = 3; - args.in_args[0].size = sizeof(inarg); + args.in_args[0].size = fm->fc->setxattr_ext ? + sizeof(inarg) : FUSE_COMPAT_SETXATTR_IN_SIZE; args.in_args[0].value = &inarg; args.in_args[1].size = strlen(name) + 1; args.in_args[1].value = name; @@ -199,7 +202,7 @@ static int fuse_xattr_set(const struct xattr_handler *handler, if (!value) return fuse_removexattr(inode, name); - return fuse_setxattr(inode, name, value, size, flags); + return fuse_setxattr(inode, name, value, size, flags, 0); } static bool no_xattr_list(struct dentry *dentry) diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index a0b542d84cd9..493a83e3f590 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -911,8 +911,11 @@ static ssize_t gfs2_file_write_iter(struct kiocb *iocb, struct iov_iter *from) current->backing_dev_info = inode_to_bdi(inode); buffered = iomap_file_buffered_write(iocb, from, &gfs2_iomap_ops); current->backing_dev_info = NULL; - if (unlikely(buffered <= 0)) + if (unlikely(buffered <= 0)) { + if (!ret) + ret = buffered; goto out_unlock; + } /* * We need to ensure that the page cache pages are written to diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 84c38103aa06..d9cb261f55b0 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -273,8 +273,7 @@ static void __gfs2_glock_put(struct gfs2_glock *gl) if (mapping) { truncate_inode_pages_final(mapping); if (!gfs2_withdrawn(sdp)) - GLOCK_BUG_ON(gl, mapping->nrpages || - mapping->nrexceptional); + GLOCK_BUG_ON(gl, !mapping_empty(mapping)); } trace_gfs2_glock_put(gl); sdp->sd_lockstruct.ls_ops->lm_put_lock(gl); @@ -583,6 +582,16 @@ out_locked: spin_unlock(&gl->gl_lockref.lock); } +static bool is_system_glock(struct gfs2_glock *gl) +{ + struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; + struct gfs2_inode *m_ip = GFS2_I(sdp->sd_statfs_inode); + + if (gl == m_ip->i_gl) + return true; + return false; +} + /** * do_xmote - Calls the DLM to change the state of a lock * @gl: The lock state @@ -672,17 +681,25 @@ skip_inval: * to see sd_log_error and withdraw, and in the meantime, requeue the * work for later. * + * We make a special exception for some system glocks, such as the + * system statfs inode glock, which needs to be granted before the + * gfs2_quotad daemon can exit, and that exit needs to finish before + * we can unmount the withdrawn file system. + * * However, if we're just unlocking the lock (say, for unmount, when * gfs2_gl_hash_clear calls clear_glock) and recovery is complete * then it's okay to tell dlm to unlock it. */ if (unlikely(sdp->sd_log_error && !gfs2_withdrawn(sdp))) gfs2_withdraw_delayed(sdp); - if (glock_blocked_by_withdraw(gl)) { - if (target != LM_ST_UNLOCKED || - test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags)) { + if (glock_blocked_by_withdraw(gl) && + (target != LM_ST_UNLOCKED || + test_bit(SDF_WITHDRAW_RECOVERY, &sdp->sd_flags))) { + if (!is_system_glock(gl)) { gfs2_glock_queue_work(gl, GL_GLOCK_DFT_HOLD); goto out; + } else { + clear_bit(GLF_INVALIDATE_IN_PROGRESS, &gl->gl_flags); } } @@ -1467,9 +1484,11 @@ void gfs2_glock_dq(struct gfs2_holder *gh) glock_blocked_by_withdraw(gl) && gh->gh_gl != sdp->sd_jinode_gl) { sdp->sd_glock_dqs_held++; + spin_unlock(&gl->gl_lockref.lock); might_sleep(); wait_on_bit(&sdp->sd_flags, SDF_WITHDRAW_RECOVERY, TASK_UNINTERRUPTIBLE); + spin_lock(&gl->gl_lockref.lock); } if (gh->gh_flags & GL_NOCACHE) handle_callback(gl, LM_ST_UNLOCKED, 0, false); @@ -1776,6 +1795,7 @@ __acquires(&lru_lock) while(!list_empty(list)) { gl = list_first_entry(list, struct gfs2_glock, gl_lru); list_del_init(&gl->gl_lru); + clear_bit(GLF_LRU, &gl->gl_flags); if (!spin_trylock(&gl->gl_lockref.lock)) { add_back_to_lru: list_add(&gl->gl_lru, &lru_list); @@ -1821,7 +1841,6 @@ static long gfs2_scan_glock_lru(int nr) if (!test_bit(GLF_LOCK, &gl->gl_flags)) { list_move(&gl->gl_lru, &dispose); atomic_dec(&lru_count); - clear_bit(GLF_LRU, &gl->gl_flags); freed++; continue; } diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index 454095e9fedf..54d3fbeb3002 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -396,7 +396,7 @@ static int gfs2_dinode_in(struct gfs2_inode *ip, const void *buf) struct timespec64 atime; u16 height, depth; umode_t mode = be32_to_cpu(str->di_mode); - bool is_new = ip->i_inode.i_flags & I_NEW; + bool is_new = ip->i_inode.i_state & I_NEW; if (unlikely(ip->i_no_addr != be64_to_cpu(str->di_num.no_addr))) goto corrupt; diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index 97d54e581a7b..42c15cfc0821 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -926,10 +926,10 @@ static void log_write_header(struct gfs2_sbd *sdp, u32 flags) } /** - * ail_drain - drain the ail lists after a withdraw + * gfs2_ail_drain - drain the ail lists after a withdraw * @sdp: Pointer to GFS2 superblock */ -static void ail_drain(struct gfs2_sbd *sdp) +void gfs2_ail_drain(struct gfs2_sbd *sdp) { struct gfs2_trans *tr; @@ -956,6 +956,7 @@ static void ail_drain(struct gfs2_sbd *sdp) list_del(&tr->tr_list); gfs2_trans_free(sdp, tr); } + gfs2_drain_revokes(sdp); spin_unlock(&sdp->sd_ail_lock); } @@ -1162,7 +1163,6 @@ out_withdraw: if (tr && list_empty(&tr->tr_list)) list_add(&tr->tr_list, &sdp->sd_ail1_list); spin_unlock(&sdp->sd_ail_lock); - ail_drain(sdp); /* frees all transactions */ tr = NULL; goto out_end; } diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index eea58015710e..fc905c2af53c 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -93,5 +93,6 @@ extern int gfs2_logd(void *data); extern void gfs2_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); extern void gfs2_glock_remove_revoke(struct gfs2_glock *gl); extern void gfs2_flush_revokes(struct gfs2_sbd *sdp); +extern void gfs2_ail_drain(struct gfs2_sbd *sdp); #endif /* __LOG_DOT_H__ */ diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 221e7118cc3b..8ee05d25dfa6 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -885,7 +885,7 @@ static void revoke_lo_before_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) gfs2_log_write_page(sdp, page); } -static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +void gfs2_drain_revokes(struct gfs2_sbd *sdp) { struct list_head *head = &sdp->sd_log_revokes; struct gfs2_bufdata *bd; @@ -900,6 +900,11 @@ static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) } } +static void revoke_lo_after_commit(struct gfs2_sbd *sdp, struct gfs2_trans *tr) +{ + gfs2_drain_revokes(sdp); +} + static void revoke_lo_before_scan(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, int pass) { diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 31b6dd0d2e5d..f707601597dc 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -20,6 +20,7 @@ extern void gfs2_log_submit_bio(struct bio **biop, int opf); extern void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); extern int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, bool keep_cache); +extern void gfs2_drain_revokes(struct gfs2_sbd *sdp); static inline unsigned int buf_limit(struct gfs2_sbd *sdp) { return sdp->sd_ldptrs; diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 3e08027a6c81..f4325b44956d 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -131,6 +131,7 @@ static void signal_our_withdraw(struct gfs2_sbd *sdp) if (test_bit(SDF_NORECOVERY, &sdp->sd_flags) || !sdp->sd_jdesc) return; + gfs2_ail_drain(sdp); /* frees all transactions */ inode = sdp->sd_jdesc->jd_inode; ip = GFS2_I(inode); i_gl = ip->i_gl; diff --git a/fs/hfsplus/extents.c b/fs/hfsplus/extents.c index a930ddd15681..7054a542689f 100644 --- a/fs/hfsplus/extents.c +++ b/fs/hfsplus/extents.c @@ -598,13 +598,15 @@ void hfsplus_file_truncate(struct inode *inode) res = __hfsplus_ext_cache_extent(&fd, inode, alloc_cnt); if (res) break; - hfs_brec_remove(&fd); - mutex_unlock(&fd.tree->tree_lock); start = hip->cached_start; + if (blk_cnt <= start) + hfs_brec_remove(&fd); + mutex_unlock(&fd.tree->tree_lock); hfsplus_free_extents(sb, hip->cached_extents, alloc_cnt - start, alloc_cnt - blk_cnt); hfsplus_dump_extent(hip->cached_extents); + mutex_lock(&fd.tree->tree_lock); if (blk_cnt > start) { hip->extent_state |= HFSPLUS_EXT_DIRTY; break; @@ -612,7 +614,6 @@ void hfsplus_file_truncate(struct inode *inode) alloc_cnt = start; hip->cached_start = hip->cached_blocks = 0; hip->extent_state &= ~(HFSPLUS_EXT_DIRTY | HFSPLUS_EXT_NEW); - mutex_lock(&fd.tree->tree_lock); } hfs_find_exit(&fd); diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index 7b5e984ff02a..7d0c3dbb2898 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -316,7 +316,7 @@ retry: if (mode & FMODE_WRITE) r = w = 1; - name = dentry_name(d_real(file->f_path.dentry, file->f_inode)); + name = dentry_name(file_dentry(file)); if (name == NULL) return -ENOMEM; diff --git a/fs/hpfs/hpfs.h b/fs/hpfs/hpfs.h index 302f45101a96..d92c4af3e1b4 100644 --- a/fs/hpfs/hpfs.h +++ b/fs/hpfs/hpfs.h @@ -356,7 +356,8 @@ struct hpfs_dirent { u8 no_of_acls; /* number of ACL's (low 3 bits) */ u8 ix; /* code page index (of filename), see struct code_page_data */ - u8 namelen, name[1]; /* file name */ + u8 namelen; /* file name length */ + u8 name[]; /* file name */ /* dnode_secno down; btree down pointer, if present, follows name on next word boundary, or maybe it precedes next dirent, which is on a word boundary. */ diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 701c82c36138..55efd3dd04f6 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -131,6 +131,7 @@ static void huge_pagevec_release(struct pagevec *pvec) static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) { struct inode *inode = file_inode(file); + struct hugetlbfs_inode_info *info = HUGETLBFS_I(inode); loff_t len, vma_len; int ret; struct hstate *h = hstate_file(file); @@ -146,6 +147,10 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) vma->vm_flags |= VM_HUGETLB | VM_DONTEXPAND; vma->vm_ops = &hugetlb_vm_ops; + ret = seal_check_future_write(info->seals, vma); + if (ret) + return ret; + /* * page based offset in vm_pgoff could be sufficiently large to * overflow a loff_t when converted to byte offset. This can @@ -463,14 +468,11 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, struct address_space *mapping = &inode->i_data; const pgoff_t start = lstart >> huge_page_shift(h); const pgoff_t end = lend >> huge_page_shift(h); - struct vm_area_struct pseudo_vma; struct pagevec pvec; pgoff_t next, index; int i, freed = 0; bool truncate_op = (lend == LLONG_MAX); - vma_init(&pseudo_vma, current->mm); - pseudo_vma.vm_flags = (VM_HUGETLB | VM_MAYSHARE | VM_SHARED); pagevec_init(&pvec); next = start; while (next < end) { @@ -482,10 +484,9 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, for (i = 0; i < pagevec_count(&pvec); ++i) { struct page *page = pvec.pages[i]; - u32 hash; + u32 hash = 0; index = page->index; - hash = hugetlb_fault_mutex_hash(mapping, index); if (!truncate_op) { /* * Only need to hold the fault mutex in the @@ -493,6 +494,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, * page faults. Races are not possible in the * case of truncation. */ + hash = hugetlb_fault_mutex_hash(mapping, index); mutex_lock(&hugetlb_fault_mutex_table[hash]); } @@ -527,7 +529,7 @@ static void remove_inode_hugepages(struct inode *inode, loff_t lstart, * the subpool and global reserve usage count can need * to be adjusted. */ - VM_BUG_ON(PagePrivate(page)); + VM_BUG_ON(HPageRestoreReserve(page)); remove_huge_page(page); freed++; if (!truncate_op) { @@ -1435,7 +1437,7 @@ static int get_hstate_idx(int page_size_log) if (!h) return -1; - return h - hstates; + return hstate_index(h); } /* diff --git a/fs/inode.c b/fs/inode.c index 9e192bea0630..c93500d84264 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -529,7 +529,14 @@ void clear_inode(struct inode *inode) */ xa_lock_irq(&inode->i_data.i_pages); BUG_ON(inode->i_data.nrpages); - BUG_ON(inode->i_data.nrexceptional); + /* + * Almost always, mapping_empty(&inode->i_data) here; but there are + * two known and long-standing ways in which nodes may get left behind + * (when deep radix-tree node allocation failed partway; or when THP + * collapse_file() failed). Until those two known cases are cleaned up, + * or a cleanup function is called here, do not BUG_ON(!mapping_empty), + * nor even WARN_ON(!mapping_empty). + */ xa_unlock_irq(&inode->i_data.i_pages); BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 0129e6bab985..9023717c5188 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -394,7 +394,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) { struct inode *inode = rac->mapping->host; loff_t pos = readahead_pos(rac); - loff_t length = readahead_length(rac); + size_t length = readahead_length(rac); struct iomap_readpage_ctx ctx = { .rac = rac, }; @@ -402,7 +402,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) trace_iomap_readahead(inode, readahead_count(rac)); while (length > 0) { - loff_t ret = iomap_apply(inode, pos, length, 0, ops, + ssize_t ret = iomap_apply(inode, pos, length, 0, ops, &ctx, iomap_readahead_actor); if (ret <= 0) { WARN_ON_ONCE(ret == 0); @@ -1134,9 +1134,7 @@ iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) } void -iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends, - void (*merge_private)(struct iomap_ioend *ioend, - struct iomap_ioend *next)) +iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends) { struct iomap_ioend *next; @@ -1148,8 +1146,6 @@ iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends, break; list_move_tail(&next->io_list, &ioend->io_list); ioend->io_size += next->io_size; - if (next->io_private && merge_private) - merge_private(ioend, next); } } EXPORT_SYMBOL_GPL(iomap_ioend_try_merge); @@ -1236,7 +1232,6 @@ iomap_alloc_ioend(struct inode *inode, struct iomap_writepage_ctx *wpc, ioend->io_inode = inode; ioend->io_size = 0; ioend->io_offset = offset; - ioend->io_private = NULL; ioend->io_bio = bio; return ioend; } diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index bdd0d89bbf0a..9398b8c31323 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -487,12 +487,28 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (pos >= dio->i_size) goto out_free_dio; + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_needs_writeback(mapping, pos, end)) { + ret = -EAGAIN; + goto out_free_dio; + } + iomap_flags |= IOMAP_NOWAIT; + } + if (iter_is_iovec(iter)) dio->flags |= IOMAP_DIO_DIRTY; } else { iomap_flags |= IOMAP_WRITE; dio->flags |= IOMAP_DIO_WRITE; + if (iocb->ki_flags & IOCB_NOWAIT) { + if (filemap_range_has_page(mapping, pos, end)) { + ret = -EAGAIN; + goto out_free_dio; + } + iomap_flags |= IOMAP_NOWAIT; + } + /* for data sync or sync, we need sync completion processing */ if (iocb->ki_flags & IOCB_DSYNC) dio->flags |= IOMAP_DIO_NEED_SYNC; @@ -507,14 +523,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, dio->flags |= IOMAP_DIO_WRITE_FUA; } - if (iocb->ki_flags & IOCB_NOWAIT) { - if (filemap_range_has_page(mapping, pos, end)) { - ret = -EAGAIN; - goto out_free_dio; - } - iomap_flags |= IOMAP_NOWAIT; - } - if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) { ret = -EAGAIN; if (pos >= dio->i_size || pos + count > dio->i_size) diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index 94ef92fe806c..4880146babaf 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -767,6 +767,7 @@ repeat: rs.cont_extent = isonum_733(rr->u.CE.extent); rs.cont_offset = isonum_733(rr->u.CE.offset); rs.cont_size = isonum_733(rr->u.CE.size); + break; default: break; } diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 69f18fe20923..d47a0d96bf30 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -245,15 +245,14 @@ static int fc_do_one_pass(journal_t *journal, return 0; while (next_fc_block <= journal->j_fc_last) { - jbd_debug(3, "Fast commit replay: next block %ld", + jbd_debug(3, "Fast commit replay: next block %ld\n", next_fc_block); err = jread(&bh, journal, next_fc_block); if (err) { - jbd_debug(3, "Fast commit replay: read error"); + jbd_debug(3, "Fast commit replay: read error\n"); break; } - jbd_debug(3, "Processing fast commit blk with seq %d"); err = journal->j_fc_replay_callback(journal, bh, pass, next_fc_block - journal->j_fc_first, expected_commit_id); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 9396666b7314..e8fc45fd751f 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -349,7 +349,12 @@ static int start_this_handle(journal_t *journal, handle_t *handle, } alloc_transaction: - if (!journal->j_running_transaction) { + /* + * This check is racy but it is just an optimization of allocating new + * transaction early if there are high chances we'll need it. If we + * guess wrong, we'll retry or free unused transaction. + */ + if (!data_race(journal->j_running_transaction)) { /* * If __GFP_FS is not present, then we may be being called from * inside the fs writeback layer, so we MUST NOT fail. @@ -1474,8 +1479,8 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) * crucial to catch bugs so let's do a reliable check until the * lockless handling is fully proven. */ - if (jh->b_transaction != transaction && - jh->b_next_transaction != transaction) { + if (data_race(jh->b_transaction != transaction && + jh->b_next_transaction != transaction)) { spin_lock(&jh->b_state_lock); J_ASSERT_JH(jh, jh->b_transaction == transaction || jh->b_next_transaction == transaction); @@ -1483,8 +1488,8 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) } if (jh->b_modified == 1) { /* If it's in our transaction it must be in BJ_Metadata list. */ - if (jh->b_transaction == transaction && - jh->b_jlist != BJ_Metadata) { + if (data_race(jh->b_transaction == transaction && + jh->b_jlist != BJ_Metadata)) { spin_lock(&jh->b_state_lock); if (jh->b_transaction == transaction && jh->b_jlist != BJ_Metadata) diff --git a/fs/jffs2/TODO b/fs/jffs2/TODO deleted file mode 100644 index ca28964abd4b..000000000000 --- a/fs/jffs2/TODO +++ /dev/null @@ -1,37 +0,0 @@ - - - support asynchronous operation -- add a per-fs 'reserved_space' count, - let each outstanding write reserve the _maximum_ amount of physical - space it could take. Let GC flush the outstanding writes because the - reservations will necessarily be pessimistic. With this we could even - do shared writable mmap, if we can have a fs hook for do_wp_page() to - make the reservation. - - disable compression in commit_write()? - - fine-tune the allocation / GC thresholds - - chattr support - turning on/off and tuning compression per-inode - - checkpointing (do we need this? scan is quite fast) - - make the scan code populate real inodes so read_inode just after - mount doesn't have to read the flash twice for large files. - Make this a per-inode option, changeable with chattr, so you can - decide which inodes should be in-core immediately after mount. - - test, test, test - - - NAND flash support: - - almost done :) - - use bad block check instead of the hardwired byte check - - - Optimisations: - - Split writes so they go to two separate blocks rather than just c->nextblock. - By writing _new_ nodes to one block, and garbage-collected REF_PRISTINE - nodes to a different one, we can separate clean nodes from those which - are likely to become dirty, and end up with blocks which are each far - closer to 100% or 0% clean, hence speeding up later GC progress dramatically. - - Stop keeping name in-core with struct jffs2_full_dirent. If we keep the hash in - the full dirent, we only need to go to the flash in lookup() when we think we've - got a match, and in readdir(). - - Doubly-linked next_in_ino list to allow us to free obsoleted raw_node_refs immediately? - - Remove size from jffs2_raw_node_frag. - -dedekind: -1. __jffs2_flush_wbuf() has a strange 'pad' parameter. Eliminate. -2. get_sb()->build_fs()->scan() path... Why get_sb() removes scan()'s crap in - case of failure? scan() does not clean everything. Fix. diff --git a/fs/jffs2/file.c b/fs/jffs2/file.c index f8fb89b10227..4fc8cd698d1a 100644 --- a/fs/jffs2/file.c +++ b/fs/jffs2/file.c @@ -57,6 +57,7 @@ const struct file_operations jffs2_file_operations = .mmap = generic_file_readonly_mmap, .fsync = jffs2_fsync, .splice_read = generic_file_splice_read, + .splice_write = iter_file_splice_write, }; /* jffs2_file_inode_operations */ diff --git a/fs/jffs2/scan.c b/fs/jffs2/scan.c index db72a9d2d0af..b676056826be 100644 --- a/fs/jffs2/scan.c +++ b/fs/jffs2/scan.c @@ -1079,7 +1079,7 @@ static int jffs2_scan_dirent_node(struct jffs2_sb_info *c, struct jffs2_eraseblo memcpy(&fd->name, rd->name, checkedlen); fd->name[checkedlen] = 0; - crc = crc32(0, fd->name, rd->nsize); + crc = crc32(0, fd->name, checkedlen); if (crc != je32_to_cpu(rd->name_crc)) { pr_notice("%s(): Name CRC failed on node at 0x%08x: Read 0x%08x, calculated 0x%08x\n", __func__, ofs, je32_to_cpu(rd->name_crc), crc); diff --git a/fs/jffs2/summary.h b/fs/jffs2/summary.h index e4131cb1f1d4..36d9a1280770 100644 --- a/fs/jffs2/summary.h +++ b/fs/jffs2/summary.h @@ -194,18 +194,18 @@ int jffs2_sum_scan_sumnode(struct jffs2_sb_info *c, struct jffs2_eraseblock *jeb #define jffs2_sum_active() (0) #define jffs2_sum_init(a) (0) -#define jffs2_sum_exit(a) +#define jffs2_sum_exit(a) do { } while (0) #define jffs2_sum_disable_collecting(a) #define jffs2_sum_is_disabled(a) (0) -#define jffs2_sum_reset_collected(a) +#define jffs2_sum_reset_collected(a) do { } while (0) #define jffs2_sum_add_kvec(a,b,c,d) (0) -#define jffs2_sum_move_collected(a,b) +#define jffs2_sum_move_collected(a,b) do { } while (0) #define jffs2_sum_write_sumnode(a) (0) -#define jffs2_sum_add_padding_mem(a,b) -#define jffs2_sum_add_inode_mem(a,b,c) -#define jffs2_sum_add_dirent_mem(a,b,c) -#define jffs2_sum_add_xattr_mem(a,b,c) -#define jffs2_sum_add_xref_mem(a,b,c) +#define jffs2_sum_add_padding_mem(a,b) do { } while (0) +#define jffs2_sum_add_inode_mem(a,b,c) do { } while (0) +#define jffs2_sum_add_dirent_mem(a,b,c) do { } while (0) +#define jffs2_sum_add_xattr_mem(a,b,c) do { } while (0) +#define jffs2_sum_add_xref_mem(a,b,c) do { } while (0) #define jffs2_sum_scan_sumnode(a,b,c,d,e) (0) #endif /* CONFIG_JFFS2_SUMMARY */ diff --git a/fs/locks.c b/fs/locks.c index 5c42363aa811..74b2a1dfe8d8 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1808,6 +1808,9 @@ check_conflicting_open(struct file *filp, const long arg, int flags) if (flags & FL_LAYOUT) return 0; + if (flags & FL_DELEG) + /* We leave these checks to the caller */ + return 0; if (arg == F_RDLCK) return inode_is_open_for_write(inode) ? -EAGAIN : 0; diff --git a/fs/namespace.c b/fs/namespace.c index f63337828e1c..c3f1a78ba369 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -3855,8 +3855,12 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) if (!(m->mnt_sb->s_type->fs_flags & FS_ALLOW_IDMAP)) return -EINVAL; + /* Don't yet support filesystem mountable in user namespaces. */ + if (m->mnt_sb->s_user_ns != &init_user_ns) + return -EINVAL; + /* We're not controlling the superblock. */ - if (!ns_capable(m->mnt_sb->s_user_ns, CAP_SYS_ADMIN)) + if (!capable(CAP_SYS_ADMIN)) return -EPERM; /* Mount has already been visible in the filesystem hierarchy. */ diff --git a/fs/netfs/Kconfig b/fs/netfs/Kconfig index 578112713703..b4db21022cb4 100644 --- a/fs/netfs/Kconfig +++ b/fs/netfs/Kconfig @@ -1,7 +1,7 @@ # SPDX-License-Identifier: GPL-2.0-only config NETFS_SUPPORT - tristate "Support for network filesystem high-level I/O" + tristate help This option enables support for network filesystems, including helpers for high-level buffered I/O, abstracting out read diff --git a/fs/netfs/read_helper.c b/fs/netfs/read_helper.c index 193841d03de0..725614625ed4 100644 --- a/fs/netfs/read_helper.c +++ b/fs/netfs/read_helper.c @@ -1068,7 +1068,7 @@ int netfs_write_begin(struct file *file, struct address_space *mapping, DEFINE_READAHEAD(ractl, file, NULL, mapping, index); retry: - page = grab_cache_page_write_begin(mapping, index, 0); + page = grab_cache_page_write_begin(mapping, index, flags); if (!page) return -ENOMEM; diff --git a/fs/nfs/callback_proc.c b/fs/nfs/callback_proc.c index f7786e00a6a7..ed9d580826f5 100644 --- a/fs/nfs/callback_proc.c +++ b/fs/nfs/callback_proc.c @@ -137,12 +137,12 @@ static struct inode *nfs_layout_find_inode_by_stateid(struct nfs_client *clp, list_for_each_entry_rcu(lo, &server->layouts, plh_layouts) { if (!pnfs_layout_is_valid(lo)) continue; - if (stateid != NULL && - !nfs4_stateid_match_other(stateid, &lo->plh_stateid)) + if (!nfs4_stateid_match_other(stateid, &lo->plh_stateid)) continue; - if (!nfs_sb_active(server->super)) - continue; - inode = igrab(lo->plh_inode); + if (nfs_sb_active(server->super)) + inode = igrab(lo->plh_inode); + else + inode = ERR_PTR(-EAGAIN); rcu_read_unlock(); if (inode) return inode; @@ -176,9 +176,10 @@ static struct inode *nfs_layout_find_inode_by_fh(struct nfs_client *clp, continue; if (nfsi->layout != lo) continue; - if (!nfs_sb_active(server->super)) - continue; - inode = igrab(lo->plh_inode); + if (nfs_sb_active(server->super)) + inode = igrab(lo->plh_inode); + else + inode = ERR_PTR(-EAGAIN); rcu_read_unlock(); if (inode) return inode; diff --git a/fs/nfs/client.c b/fs/nfs/client.c index ff5c4d0d6d13..cfeaadf56bf0 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -476,7 +476,6 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, to->to_maxval = to->to_initval; to->to_exponential = 0; break; -#ifndef CONFIG_NFS_DISABLE_UDP_SUPPORT case XPRT_TRANSPORT_UDP: if (retrans == NFS_UNSPEC_RETRANS) to->to_retries = NFS_DEF_UDP_RETRANS; @@ -487,7 +486,6 @@ void nfs_init_timeout_values(struct rpc_timeout *to, int proto, to->to_maxval = NFS_MAX_UDP_TIMEOUT; to->to_exponential = 1; break; -#endif default: BUG(); } @@ -698,9 +696,18 @@ static int nfs_init_server(struct nfs_server *server, /* Initialise the client representation from the mount data */ server->flags = ctx->flags; server->options = ctx->options; - server->caps |= NFS_CAP_HARDLINKS|NFS_CAP_SYMLINKS|NFS_CAP_FILEID| - NFS_CAP_MODE|NFS_CAP_NLINK|NFS_CAP_OWNER|NFS_CAP_OWNER_GROUP| - NFS_CAP_ATIME|NFS_CAP_CTIME|NFS_CAP_MTIME; + server->caps |= NFS_CAP_HARDLINKS | NFS_CAP_SYMLINKS; + + switch (clp->rpc_ops->version) { + case 2: + server->fattr_valid = NFS_ATTR_FATTR_V2; + break; + case 3: + server->fattr_valid = NFS_ATTR_FATTR_V3; + break; + default: + server->fattr_valid = NFS_ATTR_FATTR_V4; + } if (ctx->rsize) server->rsize = nfs_block_size(ctx->rsize, NULL); @@ -794,6 +801,7 @@ static void nfs_server_set_fsinfo(struct nfs_server *server, server->maxfilesize = fsinfo->maxfilesize; server->time_delta = fsinfo->time_delta; + server->change_attr_type = fsinfo->change_attr_type; server->clone_blksize = fsinfo->clone_blksize; /* We're airborne Set socket buffersize */ @@ -935,6 +943,8 @@ struct nfs_server *nfs_alloc_server(void) return NULL; } + server->change_attr_type = NFS4_CHANGE_TYPE_IS_UNDEFINED; + ida_init(&server->openowner_id); ida_init(&server->lockowner_id); pnfs_init_server(server); diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 04bf8066980c..e6ec6f09ac6e 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -114,7 +114,7 @@ nfs4_do_check_delegation(struct inode *inode, fmode_t flags, bool mark) return ret; } /** - * nfs_have_delegation - check if inode has a delegation, mark it + * nfs4_have_delegation - check if inode has a delegation, mark it * NFS_DELEGATION_REFERENCED if there is one. * @inode: inode to check * @flags: delegation types to check for @@ -481,6 +481,22 @@ int nfs_inode_set_delegation(struct inode *inode, const struct cred *cred, if (freeme == NULL) goto out; add_new: + /* + * If we didn't revalidate the change attribute before setting + * the delegation, then pre-emptively ask for a full attribute + * cache revalidation. + */ + spin_lock(&inode->i_lock); + if (NFS_I(inode)->cache_validity & NFS_INO_INVALID_CHANGE) + nfs_set_cache_invalid(inode, + NFS_INO_INVALID_ATIME | NFS_INO_INVALID_CTIME | + NFS_INO_INVALID_MTIME | NFS_INO_INVALID_SIZE | + NFS_INO_INVALID_BLOCKS | NFS_INO_INVALID_NLINK | + NFS_INO_INVALID_OTHER | NFS_INO_INVALID_DATA | + NFS_INO_INVALID_ACCESS | NFS_INO_INVALID_ACL | + NFS_INO_INVALID_XATTR); + spin_unlock(&inode->i_lock); + list_add_tail_rcu(&delegation->super_list, &server->delegations); rcu_assign_pointer(nfsi->delegation, delegation); delegation = NULL; @@ -488,11 +504,6 @@ add_new: atomic_long_inc(&nfs_active_delegations); trace_nfs4_set_delegation(inode, type); - - spin_lock(&inode->i_lock); - if (NFS_I(inode)->cache_validity & (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_ATIME)) - NFS_I(inode)->cache_validity |= NFS_INO_REVAL_FORCED; - spin_unlock(&inode->i_lock); out: spin_unlock(&clp->cl_lock); if (delegation != NULL) @@ -674,7 +685,7 @@ void nfs_inode_evict_delegation(struct inode *inode) } /** - * nfs_inode_return_delegation - synchronously return a delegation + * nfs4_inode_return_delegation - synchronously return a delegation * @inode: inode to process * * This routine will always flush any dirty data to disk on the @@ -697,7 +708,7 @@ int nfs4_inode_return_delegation(struct inode *inode) } /** - * nfs_inode_return_delegation_on_close - asynchronously return a delegation + * nfs4_inode_return_delegation_on_close - asynchronously return a delegation * @inode: inode to process * * This routine is called on file close in order to determine if the @@ -811,7 +822,7 @@ void nfs_expire_all_delegations(struct nfs_client *clp) } /** - * nfs_super_return_all_delegations - return delegations for one superblock + * nfs_server_return_all_delegations - return delegations for one superblock * @server: pointer to nfs_server to process * */ diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h index 9b00a0b7f832..c19b4fd20781 100644 --- a/fs/nfs/delegation.h +++ b/fs/nfs/delegation.h @@ -84,8 +84,7 @@ int nfs4_inode_make_writeable(struct inode *inode); static inline int nfs_have_delegated_attributes(struct inode *inode) { - return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ) && - !(NFS_I(inode)->cache_validity & NFS_INO_REVAL_FORCED); + return NFS_PROTO(inode)->have_delegation(inode, FMODE_READ); } #endif diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c index fc4f490f2d78..1a6d2867fba4 100644 --- a/fs/nfs/dir.c +++ b/fs/nfs/dir.c @@ -866,6 +866,8 @@ static int nfs_readdir_xdr_to_array(struct nfs_readdir_descriptor *desc, break; } + verf_arg = verf_res; + status = nfs_readdir_page_filler(desc, entry, pages, pglen, arrays, narrays); } while (!status && nfs_readdir_page_needs_filling(page)); @@ -927,7 +929,12 @@ static int find_and_lock_cache_page(struct nfs_readdir_descriptor *desc) } return res; } - memcpy(nfsi->cookieverf, verf, sizeof(nfsi->cookieverf)); + /* + * Set the cookie verifier if the page cache was empty + */ + if (desc->page_index == 0) + memcpy(nfsi->cookieverf, verf, + sizeof(nfsi->cookieverf)); } res = nfs_readdir_search_array(desc); if (res == 0) { @@ -974,10 +981,10 @@ static int readdir_search_pagecache(struct nfs_readdir_descriptor *desc) /* * Once we've found the start of the dirent within a page: fill 'er up... */ -static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) +static void nfs_do_filldir(struct nfs_readdir_descriptor *desc, + const __be32 *verf) { struct file *file = desc->file; - struct nfs_inode *nfsi = NFS_I(file_inode(file)); struct nfs_cache_array *array; unsigned int i = 0; @@ -991,7 +998,7 @@ static void nfs_do_filldir(struct nfs_readdir_descriptor *desc) desc->eof = true; break; } - memcpy(desc->verf, nfsi->cookieverf, sizeof(desc->verf)); + memcpy(desc->verf, verf, sizeof(desc->verf)); if (i < (array->size-1)) desc->dir_cookie = array->array[i+1].cookie; else @@ -1048,7 +1055,7 @@ static int uncached_readdir(struct nfs_readdir_descriptor *desc) for (i = 0; !desc->eof && i < sz && arrays[i]; i++) { desc->page = arrays[i]; - nfs_do_filldir(desc); + nfs_do_filldir(desc, verf); } desc->page = NULL; @@ -1069,6 +1076,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) { struct dentry *dentry = file_dentry(file); struct inode *inode = d_inode(dentry); + struct nfs_inode *nfsi = NFS_I(inode); struct nfs_open_dir_context *dir_ctx = file->private_data; struct nfs_readdir_descriptor *desc; int res; @@ -1122,7 +1130,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) break; } if (res == -ETOOSMALL && desc->plus) { - clear_bit(NFS_INO_ADVISE_RDPLUS, &NFS_I(inode)->flags); + clear_bit(NFS_INO_ADVISE_RDPLUS, &nfsi->flags); nfs_zap_caches(inode); desc->page_index = 0; desc->plus = false; @@ -1132,7 +1140,7 @@ static int nfs_readdir(struct file *file, struct dir_context *ctx) if (res < 0) break; - nfs_do_filldir(desc); + nfs_do_filldir(desc, nfsi->cookieverf); nfs_readdir_page_unlock_and_put_cached(desc); } while (!desc->eof); @@ -1703,7 +1711,7 @@ static void nfs_drop_nlink(struct inode *inode) NFS_I(inode)->attr_gencount = nfs_inc_attr_generation_counter(); nfs_set_cache_invalid( inode, NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_CTIME | - NFS_INO_INVALID_OTHER | NFS_INO_REVAL_FORCED); + NFS_INO_INVALID_NLINK); spin_unlock(&inode->i_lock); } @@ -2940,7 +2948,7 @@ static int nfs_execute_ok(struct inode *inode, int mask) if (S_ISDIR(inode->i_mode)) return 0; - if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_OTHER)) { + if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_MODE)) { if (mask & MAY_NOT_BLOCK) return -ECHILD; ret = __nfs_revalidate_inode(server, inode); @@ -2998,16 +3006,10 @@ out_notsup: if (mask & MAY_NOT_BLOCK) return -ECHILD; - res = nfs_revalidate_inode(NFS_SERVER(inode), inode); + res = nfs_revalidate_inode(inode, NFS_INO_INVALID_MODE | + NFS_INO_INVALID_OTHER); if (res == 0) res = generic_permission(&init_user_ns, inode, mask); goto out; } EXPORT_SYMBOL_GPL(nfs_permission); - -/* - * Local variables: - * version-control: t - * kept-new-versions: 5 - * End: - */ diff --git a/fs/nfs/export.c b/fs/nfs/export.c index f2b34cfe286c..37a1a88df771 100644 --- a/fs/nfs/export.c +++ b/fs/nfs/export.c @@ -169,19 +169,8 @@ out: static u64 nfs_fetch_iversion(struct inode *inode) { - struct nfs_server *server = NFS_SERVER(inode); - - /* Is this the right call?: */ - nfs_revalidate_inode(server, inode); - /* - * Also, note we're ignoring any returned error. That seems to be - * the practice for cache consistency information elsewhere in - * the server, but I'm not sure why. - */ - if (server->nfs_client->rpc_ops->version >= 4) - return inode_peek_iversion_raw(inode); - else - return time_to_chattr(&inode->i_ctime); + nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); + return inode_peek_iversion_raw(inode); } const struct export_operations nfs_export_ops = { diff --git a/fs/nfs/file.c b/fs/nfs/file.c index 16ad5050e046..1fef107961bc 100644 --- a/fs/nfs/file.c +++ b/fs/nfs/file.c @@ -105,7 +105,7 @@ static int nfs_revalidate_file_size(struct inode *inode, struct file *filp) if (filp->f_flags & O_DIRECT) goto force_reval; - if (nfs_check_cache_invalid(inode, NFS_INO_REVAL_PAGECACHE)) + if (nfs_check_cache_invalid(inode, NFS_INO_INVALID_SIZE)) goto force_reval; return 0; force_reval: diff --git a/fs/nfs/filelayout/filelayout.c b/fs/nfs/filelayout/filelayout.c index d158a500c25c..d2103852475f 100644 --- a/fs/nfs/filelayout/filelayout.c +++ b/fs/nfs/filelayout/filelayout.c @@ -718,7 +718,7 @@ filelayout_decode_layout(struct pnfs_layout_hdr *flo, if (unlikely(!p)) goto out_err; fl->fh_array[i]->size = be32_to_cpup(p++); - if (sizeof(struct nfs_fh) < fl->fh_array[i]->size) { + if (fl->fh_array[i]->size > NFS_MAXFHSIZE) { printk(KERN_ERR "NFS: Too big fh %d received %d\n", i, fl->fh_array[i]->size); goto out_err; diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index 872112bffcab..d383de00d486 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -106,7 +106,7 @@ static int decode_nfs_fh(struct xdr_stream *xdr, struct nfs_fh *fh) if (unlikely(!p)) return -ENOBUFS; fh->size = be32_to_cpup(p++); - if (fh->size > sizeof(struct nfs_fh)) { + if (fh->size > NFS_MAXFHSIZE) { printk(KERN_ERR "NFS flexfiles: Too big fh received %d\n", fh->size); return -EOVERFLOW; diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c index a06d213d7689..d95c9a39bc70 100644 --- a/fs/nfs/fs_context.c +++ b/fs/nfs/fs_context.c @@ -283,20 +283,40 @@ static int nfs_verify_server_address(struct sockaddr *addr) return 0; } +#ifdef CONFIG_NFS_DISABLE_UDP_SUPPORT +static bool nfs_server_transport_udp_invalid(const struct nfs_fs_context *ctx) +{ + return true; +} +#else +static bool nfs_server_transport_udp_invalid(const struct nfs_fs_context *ctx) +{ + if (ctx->version == 4) + return true; + return false; +} +#endif + /* * Sanity check the NFS transport protocol. - * */ -static void nfs_validate_transport_protocol(struct nfs_fs_context *ctx) +static int nfs_validate_transport_protocol(struct fs_context *fc, + struct nfs_fs_context *ctx) { switch (ctx->nfs_server.protocol) { case XPRT_TRANSPORT_UDP: + if (nfs_server_transport_udp_invalid(ctx)) + goto out_invalid_transport_udp; + break; case XPRT_TRANSPORT_TCP: case XPRT_TRANSPORT_RDMA: break; default: ctx->nfs_server.protocol = XPRT_TRANSPORT_TCP; } + return 0; +out_invalid_transport_udp: + return nfs_invalf(fc, "NFS: Unsupported transport protocol udp"); } /* @@ -305,8 +325,6 @@ static void nfs_validate_transport_protocol(struct nfs_fs_context *ctx) */ static void nfs_set_mount_transport_protocol(struct nfs_fs_context *ctx) { - nfs_validate_transport_protocol(ctx); - if (ctx->mount_server.protocol == XPRT_TRANSPORT_UDP || ctx->mount_server.protocol == XPRT_TRANSPORT_TCP) return; @@ -932,6 +950,7 @@ static int nfs23_parse_monolithic(struct fs_context *fc, struct nfs_fh *mntfh = ctx->mntfh; struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address; int extra_flags = NFS_MOUNT_LEGACY_INTERFACE; + int ret; if (data == NULL) goto out_no_data; @@ -977,6 +996,15 @@ static int nfs23_parse_monolithic(struct fs_context *fc, sizeof(mntfh->data) - mntfh->size); /* + * for proto == XPRT_TRANSPORT_UDP, which is what uses + * to_exponential, implying shift: limit the shift value + * to BITS_PER_LONG (majortimeo is unsigned long) + */ + if (!(data->flags & NFS_MOUNT_TCP)) /* this will be UDP */ + if (data->retrans >= 64) /* shift value is too large */ + goto out_invalid_data; + + /* * Translate to nfs_fs_context, which nfs_fill_super * can deal with. */ @@ -1048,6 +1076,10 @@ static int nfs23_parse_monolithic(struct fs_context *fc, goto generic; } + ret = nfs_validate_transport_protocol(fc, ctx); + if (ret) + return ret; + ctx->skip_reconfig_option_check = true; return 0; @@ -1076,6 +1108,9 @@ out_no_address: out_invalid_fh: return nfs_invalf(fc, "NFS: invalid root filehandle"); + +out_invalid_data: + return nfs_invalf(fc, "NFS: invalid binary mount data"); } #if IS_ENABLED(CONFIG_NFS_V4) @@ -1146,6 +1181,7 @@ static int nfs4_parse_monolithic(struct fs_context *fc, { struct nfs_fs_context *ctx = nfs_fc2context(fc); struct sockaddr *sap = (struct sockaddr *)&ctx->nfs_server.address; + int ret; char *c; if (!data) { @@ -1218,9 +1254,9 @@ static int nfs4_parse_monolithic(struct fs_context *fc, ctx->acdirmin = data->acdirmin; ctx->acdirmax = data->acdirmax; ctx->nfs_server.protocol = data->proto; - nfs_validate_transport_protocol(ctx); - if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP) - goto out_invalid_transport_udp; + ret = nfs_validate_transport_protocol(fc, ctx); + if (ret) + return ret; done: ctx->skip_reconfig_option_check = true; return 0; @@ -1231,9 +1267,6 @@ out_inval_auth: out_no_address: return nfs_invalf(fc, "NFS4: mount program didn't pass remote address"); - -out_invalid_transport_udp: - return nfs_invalf(fc, "NFS: Unsupported transport protocol udp"); } #endif @@ -1298,6 +1331,10 @@ static int nfs_fs_context_validate(struct fs_context *fc) if (!nfs_verify_server_address(sap)) goto out_no_address; + ret = nfs_validate_transport_protocol(fc, ctx); + if (ret) + return ret; + if (ctx->version == 4) { if (IS_ENABLED(CONFIG_NFS_V4)) { if (ctx->nfs_server.protocol == XPRT_TRANSPORT_RDMA) @@ -1306,9 +1343,6 @@ static int nfs_fs_context_validate(struct fs_context *fc) port = NFS_PORT; max_namelen = NFS4_MAXNAMLEN; max_pathlen = NFS4_MAXPATHLEN; - nfs_validate_transport_protocol(ctx); - if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP) - goto out_invalid_transport_udp; ctx->flags &= ~(NFS_MOUNT_NONLM | NFS_MOUNT_NOACL | NFS_MOUNT_VER3 | NFS_MOUNT_LOCAL_FLOCK | NFS_MOUNT_LOCAL_FCNTL); @@ -1317,10 +1351,6 @@ static int nfs_fs_context_validate(struct fs_context *fc) } } else { nfs_set_mount_transport_protocol(ctx); -#ifdef CONFIG_NFS_DISABLE_UDP_SUPPORT - if (ctx->nfs_server.protocol == XPRT_TRANSPORT_UDP) - goto out_invalid_transport_udp; -#endif if (ctx->nfs_server.protocol == XPRT_TRANSPORT_RDMA) port = NFS_RDMA_PORT; } @@ -1354,8 +1384,6 @@ out_no_device_name: out_v4_not_compiled: nfs_errorf(fc, "NFS: NFSv4 is not compiled into kernel"); return -EPROTONOSUPPORT; -out_invalid_transport_udp: - return nfs_invalf(fc, "NFS: Unsupported transport protocol udp"); out_no_address: return nfs_invalf(fc, "NFS: mount program didn't pass remote address"); out_mountproto_mismatch: diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 5a8854de0c19..529c4099f482 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -164,34 +164,19 @@ static int nfs_attribute_timeout(struct inode *inode) return !time_in_range_open(jiffies, nfsi->read_cache_jiffies, nfsi->read_cache_jiffies + nfsi->attrtimeo); } -static bool nfs_check_cache_invalid_delegated(struct inode *inode, unsigned long flags) +static bool nfs_check_cache_flags_invalid(struct inode *inode, + unsigned long flags) { unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); - /* Special case for the pagecache or access cache */ - if (flags == NFS_INO_REVAL_PAGECACHE && - !(cache_validity & NFS_INO_REVAL_FORCED)) - return false; return (cache_validity & flags) != 0; } -static bool nfs_check_cache_invalid_not_delegated(struct inode *inode, unsigned long flags) -{ - unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); - - if ((cache_validity & flags) != 0) - return true; - if (nfs_attribute_timeout(inode)) - return true; - return false; -} - bool nfs_check_cache_invalid(struct inode *inode, unsigned long flags) { - if (NFS_PROTO(inode)->have_delegation(inode, FMODE_READ)) - return nfs_check_cache_invalid_delegated(inode, flags); - - return nfs_check_cache_invalid_not_delegated(inode, flags); + if (nfs_check_cache_flags_invalid(inode, flags)) + return true; + return nfs_attribute_cache_expired(inode); } EXPORT_SYMBOL_GPL(nfs_check_cache_invalid); @@ -214,20 +199,21 @@ void nfs_set_cache_invalid(struct inode *inode, unsigned long flags) if (have_delegation) { if (!(flags & NFS_INO_REVAL_FORCED)) - flags &= ~NFS_INO_INVALID_OTHER; - flags &= ~(NFS_INO_INVALID_CHANGE - | NFS_INO_INVALID_SIZE - | NFS_INO_REVAL_PAGECACHE - | NFS_INO_INVALID_XATTR); - } + flags &= ~(NFS_INO_INVALID_MODE | + NFS_INO_INVALID_OTHER | + NFS_INO_INVALID_XATTR); + flags &= ~(NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE); + } else if (flags & NFS_INO_REVAL_PAGECACHE) + flags |= NFS_INO_INVALID_CHANGE | NFS_INO_INVALID_SIZE; if (!nfs_has_xattr_cache(nfsi)) flags &= ~NFS_INO_INVALID_XATTR; + if (flags & NFS_INO_INVALID_DATA) + nfs_fscache_invalidate(inode); if (inode->i_mapping->nrpages == 0) flags &= ~(NFS_INO_INVALID_DATA|NFS_INO_DATA_INVAL_DEFER); + flags &= ~(NFS_INO_REVAL_PAGECACHE | NFS_INO_REVAL_FORCED); nfsi->cache_validity |= flags; - if (flags & NFS_INO_INVALID_DATA) - nfs_fscache_invalidate(inode); } EXPORT_SYMBOL_GPL(nfs_set_cache_invalid); @@ -452,6 +438,7 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st .fattr = fattr }; struct inode *inode = ERR_PTR(-ENOENT); + u64 fattr_supported = NFS_SB(sb)->fattr_valid; unsigned long hash; nfs_attr_check_mountpoint(sb, fattr); @@ -484,8 +471,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st inode->i_mode = fattr->mode; nfsi->cache_validity = 0; if ((fattr->valid & NFS_ATTR_FATTR_MODE) == 0 - && nfs_server_capable(inode, NFS_CAP_MODE)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); + && (fattr_supported & NFS_ATTR_FATTR_MODE)) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MODE); /* Why so? Because we want revalidate for devices/FIFOs, and * that's precisely what we have in nfs_file_inode_operations. */ @@ -530,15 +517,15 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st nfsi->attr_gencount = fattr->gencount; if (fattr->valid & NFS_ATTR_FATTR_ATIME) inode->i_atime = fattr->atime; - else if (nfs_server_capable(inode, NFS_CAP_ATIME)) + else if (fattr_supported & NFS_ATTR_FATTR_ATIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_ATIME); if (fattr->valid & NFS_ATTR_FATTR_MTIME) inode->i_mtime = fattr->mtime; - else if (nfs_server_capable(inode, NFS_CAP_MTIME)) + else if (fattr_supported & NFS_ATTR_FATTR_MTIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); if (fattr->valid & NFS_ATTR_FATTR_CTIME) inode->i_ctime = fattr->ctime; - else if (nfs_server_capable(inode, NFS_CAP_CTIME)) + else if (fattr_supported & NFS_ATTR_FATTR_CTIME) nfs_set_cache_invalid(inode, NFS_INO_INVALID_CTIME); if (fattr->valid & NFS_ATTR_FATTR_CHANGE) inode_set_iversion_raw(inode, fattr->change_attr); @@ -550,29 +537,31 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr, st nfs_set_cache_invalid(inode, NFS_INO_INVALID_SIZE); if (fattr->valid & NFS_ATTR_FATTR_NLINK) set_nlink(inode, fattr->nlink); - else if (nfs_server_capable(inode, NFS_CAP_NLINK)) - nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); + else if (fattr_supported & NFS_ATTR_FATTR_NLINK) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK); if (fattr->valid & NFS_ATTR_FATTR_OWNER) inode->i_uid = fattr->uid; - else if (nfs_server_capable(inode, NFS_CAP_OWNER)) + else if (fattr_supported & NFS_ATTR_FATTR_OWNER) nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); if (fattr->valid & NFS_ATTR_FATTR_GROUP) inode->i_gid = fattr->gid; - else if (nfs_server_capable(inode, NFS_CAP_OWNER_GROUP)) + else if (fattr_supported & NFS_ATTR_FATTR_GROUP) nfs_set_cache_invalid(inode, NFS_INO_INVALID_OTHER); if (nfs_server_capable(inode, NFS_CAP_XATTR)) nfs_set_cache_invalid(inode, NFS_INO_INVALID_XATTR); if (fattr->valid & NFS_ATTR_FATTR_BLOCKS_USED) inode->i_blocks = fattr->du.nfs2.blocks; + else if (fattr_supported & NFS_ATTR_FATTR_BLOCKS_USED && + fattr->size != 0) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS); if (fattr->valid & NFS_ATTR_FATTR_SPACE_USED) { /* * report the blocks in 512byte units */ inode->i_blocks = nfs_calc_block_size(fattr->du.nfs3.used); - } - - if (nfsi->cache_validity != 0) - nfsi->cache_validity |= NFS_INO_REVAL_FORCED; + } else if (fattr_supported & NFS_ATTR_FATTR_SPACE_USED && + fattr->size != 0) + nfs_set_cache_invalid(inode, NFS_INO_INVALID_BLOCKS); nfs_setsecurity(inode, fattr, label); @@ -634,8 +623,7 @@ nfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } /* Optimization: if the end result is no change, don't RPC */ - attr->ia_valid &= NFS_VALID_ATTRS; - if ((attr->ia_valid & ~(ATTR_FILE|ATTR_OPEN)) == 0) + if (((attr->ia_valid & NFS_VALID_ATTRS) & ~(ATTR_FILE|ATTR_OPEN)) == 0) return 0; trace_nfs_setattr_enter(inode); @@ -710,12 +698,20 @@ void nfs_setattr_update_inode(struct inode *inode, struct iattr *attr, spin_lock(&inode->i_lock); NFS_I(inode)->attr_gencount = fattr->gencount; if ((attr->ia_valid & ATTR_SIZE) != 0) { - nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME); + nfs_set_cache_invalid(inode, NFS_INO_INVALID_MTIME | + NFS_INO_INVALID_BLOCKS); nfs_inc_stats(inode, NFSIOS_SETATTRTRUNC); nfs_vmtruncate(inode, attr->ia_size); } if ((attr->ia_valid & (ATTR_MODE|ATTR_UID|ATTR_GID)) != 0) { NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_CTIME; + if ((attr->ia_valid & ATTR_KILL_SUID) != 0 && + inode->i_mode & S_ISUID) + inode->i_mode &= ~S_ISUID; + if ((attr->ia_valid & ATTR_KILL_SGID) != 0 && + (inode->i_mode & (S_ISGID | S_IXGRP)) == + (S_ISGID | S_IXGRP)) + inode->i_mode &= ~S_ISGID; if ((attr->ia_valid & ATTR_MODE) != 0) { int mode = attr->ia_mode & S_IALLUGO; mode |= inode->i_mode & ~S_IALLUGO; @@ -793,14 +789,28 @@ static void nfs_readdirplus_parent_cache_hit(struct dentry *dentry) dput(parent); } -static bool nfs_need_revalidate_inode(struct inode *inode) +static u32 nfs_get_valid_attrmask(struct inode *inode) { - if (NFS_I(inode)->cache_validity & - (NFS_INO_INVALID_ATTR|NFS_INO_INVALID_LABEL)) - return true; - if (nfs_attribute_cache_expired(inode)) - return true; - return false; + unsigned long cache_validity = READ_ONCE(NFS_I(inode)->cache_validity); + u32 reply_mask = STATX_INO | STATX_TYPE; + + if (!(cache_validity & NFS_INO_INVALID_ATIME)) + reply_mask |= STATX_ATIME; + if (!(cache_validity & NFS_INO_INVALID_CTIME)) + reply_mask |= STATX_CTIME; + if (!(cache_validity & NFS_INO_INVALID_MTIME)) + reply_mask |= STATX_MTIME; + if (!(cache_validity & NFS_INO_INVALID_SIZE)) + reply_mask |= STATX_SIZE; + if (!(cache_validity & NFS_INO_INVALID_NLINK)) + reply_mask |= STAT |