diff options
Diffstat (limited to 'fs')
640 files changed, 17696 insertions, 11854 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 39def020a074..cdb99507ef33 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -583,7 +583,7 @@ static struct attribute *v9fs_attrs[] = { NULL, }; -static struct attribute_group v9fs_attr_group = { +static const struct attribute_group v9fs_attr_group = { .attrs = v9fs_attrs, }; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 649f04f112dc..59c32c9b799f 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -86,8 +86,8 @@ int v9fs_file_open(struct inode *inode, struct file *file) * to work. */ writeback_fid = v9fs_writeback_fid(file_dentry(file)); - if (IS_ERR(fid)) { - err = PTR_ERR(fid); + if (IS_ERR(writeback_fid)) { + err = PTR_ERR(writeback_fid); mutex_unlock(&v9inode->v_mutex); goto out_error; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 8d97f0b45e9c..795706520b5e 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -399,7 +399,7 @@ static int v9fs_test_inode(struct inode *inode, void *data) umode = p9mode2unixmode(v9ses, st, &rdev); /* don't match inode of different type */ - if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) + if (inode_wrong_type(inode, umode)) return 0; /* compare qid details */ @@ -1390,7 +1390,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) * Don't update inode if the file type is different */ umode = p9mode2unixmode(v9ses, st, &rdev); - if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) + if (inode_wrong_type(inode, umode)) goto out; /* diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 1dc7af046615..e1c0240b51c0 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -59,7 +59,7 @@ static int v9fs_test_inode_dotl(struct inode *inode, void *data) struct p9_stat_dotl *st = (struct p9_stat_dotl *)data; /* don't match inode of different type */ - if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) + if (inode_wrong_type(inode, st->st_mode)) return 0; if (inode->i_generation != st->st_gen) @@ -663,14 +663,10 @@ v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, if (stat->st_result_mask & P9_STATS_NLINK) set_nlink(inode, stat->st_nlink); if (stat->st_result_mask & P9_STATS_MODE) { - inode->i_mode = stat->st_mode; - if ((S_ISBLK(inode->i_mode)) || - (S_ISCHR(inode->i_mode))) - init_special_inode(inode, inode->i_mode, - inode->i_rdev); + mode = stat->st_mode & S_IALLUGO; + mode |= inode->i_mode & ~S_IALLUGO; + inode->i_mode = mode; } - if (stat->st_result_mask & P9_STATS_RDEV) - inode->i_rdev = new_decode_dev(stat->st_rdev); if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) && stat->st_result_mask & P9_STATS_SIZE) v9fs_i_size_write(inode, stat->st_size); @@ -959,7 +955,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) /* * Don't update inode if the file type is different */ - if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) + if (inode_wrong_type(inode, st->st_mode)) goto out; /* diff --git a/fs/Kconfig b/fs/Kconfig index a55bda4233bb..141a856c50e7 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -125,6 +125,7 @@ source "fs/overlayfs/Kconfig" menu "Caches" +source "fs/netfs/Kconfig" source "fs/fscache/Kconfig" source "fs/cachefiles/Kconfig" @@ -222,10 +223,13 @@ config TMPFS_INODE64 If unsure, say N. +config ARCH_SUPPORTS_HUGETLBFS + def_bool n + config HUGETLBFS bool "HugeTLB file system support" depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \ - SYS_SUPPORTS_HUGETLBFS || BROKEN + ARCH_SUPPORTS_HUGETLBFS || BROKEN help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. For architectures that support it, say Y here and read @@ -334,8 +338,8 @@ config NFS_COMMON default y config NFS_V4_2_SSC_HELPER - tristate - default y if NFS_V4=y || NFS_FS=y + bool + default y if NFS_V4_2 source "net/sunrpc/Kconfig" source "fs/ceph/Kconfig" diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index c6f1c8c1934e..06fb7a93a1bd 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -112,6 +112,9 @@ config BINFMT_FLAT_ARGVP_ENVP_ON_STACK config BINFMT_FLAT_OLD_ALWAYS_RAM bool +config BINFMT_FLAT_NO_DATA_START_OFFSET + bool + config BINFMT_FLAT_OLD bool "Enable support for very old legacy flat binaries" depends on BINFMT_FLAT diff --git a/fs/Makefile b/fs/Makefile index 3215fe205256..9c708e1fbe8f 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -67,6 +67,7 @@ obj-y += devpts/ obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line +obj-$(CONFIG_NETFS_SUPPORT) += netfs/ obj-$(CONFIG_FSCACHE) += fscache/ obj-$(CONFIG_REISERFS_FS) += reiserfs/ obj-$(CONFIG_EXT4_FS) += ext4/ diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig index 1ad211d72b3b..fc8ba9142f2f 100644 --- a/fs/afs/Kconfig +++ b/fs/afs/Kconfig @@ -4,6 +4,7 @@ config AFS_FS depends on INET select AF_RXRPC select DNS_RESOLVER + select NETFS_SUPPORT help If you say Y here, you will get an experimental Andrew File System driver. It currently only supports unsecured read-only AFS access. diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index a4e9e6e07e93..d3c6bb22c5f4 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -322,6 +322,8 @@ static int afs_deliver_cb_callback(struct afs_call *call) return ret; call->unmarshall++; + fallthrough; + case 5: break; } @@ -418,6 +420,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) r->node[loop] = ntohl(b[loop + 5]); call->unmarshall++; + fallthrough; case 2: break; @@ -530,6 +533,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) r->node[loop] = ntohl(b[loop + 5]); call->unmarshall++; + fallthrough; case 2: break; @@ -663,6 +667,7 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call) afs_extract_to_tmp(call); call->unmarshall++; + fallthrough; case 3: break; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 17548c1faf02..78719f2f567e 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -103,6 +103,35 @@ struct afs_lookup_cookie { }; /* + * Drop the refs that we're holding on the pages we were reading into. We've + * got refs on the first nr_pages pages. + */ +static void afs_dir_read_cleanup(struct afs_read *req) +{ + struct address_space *mapping = req->vnode->vfs_inode.i_mapping; + struct page *page; + pgoff_t last = req->nr_pages - 1; + + XA_STATE(xas, &mapping->i_pages, 0); + + if (unlikely(!req->nr_pages)) + return; + + rcu_read_lock(); + xas_for_each(&xas, page, last) { + if (xas_retry(&xas, page)) + continue; + BUG_ON(xa_is_value(page)); + BUG_ON(PageCompound(page)); + ASSERTCMP(page->mapping, ==, mapping); + + put_page(page); + } + + rcu_read_unlock(); +} + +/* * check that a directory page is valid */ static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page, @@ -127,7 +156,7 @@ static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page, qty /= sizeof(union afs_xdr_dir_block); /* check them */ - dbuf = kmap(page); + dbuf = kmap_atomic(page); for (tmp = 0; tmp < qty; tmp++) { if (dbuf->blocks[tmp].hdr.magic != AFS_DIR_MAGIC) { printk("kAFS: %s(%lx): bad magic %d/%d is %04hx\n", @@ -146,7 +175,7 @@ static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page, ((u8 *)&dbuf->blocks[tmp])[AFS_DIR_BLOCK_SIZE - 1] = 0; } - kunmap(page); + kunmap_atomic(dbuf); checked: afs_stat_v(dvnode, n_read_dir); @@ -157,35 +186,74 @@ error: } /* - * Check the contents of a directory that we've just read. + * Dump the contents of a directory. */ -static bool afs_dir_check_pages(struct afs_vnode *dvnode, struct afs_read *req) +static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req) { struct afs_xdr_dir_page *dbuf; - unsigned int i, j, qty = PAGE_SIZE / sizeof(union afs_xdr_dir_block); + struct address_space *mapping = dvnode->vfs_inode.i_mapping; + struct page *page; + unsigned int i, qty = PAGE_SIZE / sizeof(union afs_xdr_dir_block); + pgoff_t last = req->nr_pages - 1; - for (i = 0; i < req->nr_pages; i++) - if (!afs_dir_check_page(dvnode, req->pages[i], req->actual_len)) - goto bad; - return true; + XA_STATE(xas, &mapping->i_pages, 0); -bad: - pr_warn("DIR %llx:%llx f=%llx l=%llx al=%llx r=%llx\n", + pr_warn("DIR %llx:%llx f=%llx l=%llx al=%llx\n", dvnode->fid.vid, dvnode->fid.vnode, - req->file_size, req->len, req->actual_len, req->remain); - pr_warn("DIR %llx %x %x %x\n", - req->pos, req->index, req->nr_pages, req->offset); + req->file_size, req->len, req->actual_len); + pr_warn("DIR %llx %x %zx %zx\n", + req->pos, req->nr_pages, + req->iter->iov_offset, iov_iter_count(req->iter)); - for (i = 0; i < req->nr_pages; i++) { - dbuf = kmap(req->pages[i]); - for (j = 0; j < qty; j++) { - union afs_xdr_dir_block *block = &dbuf->blocks[j]; + xas_for_each(&xas, page, last) { + if (xas_retry(&xas, page)) + continue; + + BUG_ON(PageCompound(page)); + BUG_ON(page->mapping != mapping); + + dbuf = kmap_atomic(page); + for (i = 0; i < qty; i++) { + union afs_xdr_dir_block *block = &dbuf->blocks[i]; - pr_warn("[%02x] %32phN\n", i * qty + j, block); + pr_warn("[%02lx] %32phN\n", page->index * qty + i, block); } - kunmap(req->pages[i]); + kunmap_atomic(dbuf); } - return false; +} + +/* + * Check all the pages in a directory. All the pages are held pinned. + */ +static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req) +{ + struct address_space *mapping = dvnode->vfs_inode.i_mapping; + struct page *page; + pgoff_t last = req->nr_pages - 1; + int ret = 0; + + XA_STATE(xas, &mapping->i_pages, 0); + + if (unlikely(!req->nr_pages)) + return 0; + + rcu_read_lock(); + xas_for_each(&xas, page, last) { + if (xas_retry(&xas, page)) + continue; + + BUG_ON(PageCompound(page)); + BUG_ON(page->mapping != mapping); + + if (!afs_dir_check_page(dvnode, page, req->file_size)) { + afs_dir_dump(dvnode, req); + ret = -EIO; + break; + } + } + + rcu_read_unlock(); + return ret; } /* @@ -214,57 +282,57 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) { struct afs_read *req; loff_t i_size; - int nr_pages, nr_inline, i, n; - int ret = -ENOMEM; + int nr_pages, i, n; + int ret; + + _enter(""); -retry: + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return ERR_PTR(-ENOMEM); + + refcount_set(&req->usage, 1); + req->vnode = dvnode; + req->key = key_get(key); + req->cleanup = afs_dir_read_cleanup; + +expand: i_size = i_size_read(&dvnode->vfs_inode); - if (i_size < 2048) - return ERR_PTR(afs_bad(dvnode, afs_file_error_dir_small)); + if (i_size < 2048) { + ret = afs_bad(dvnode, afs_file_error_dir_small); + goto error; + } if (i_size > 2048 * 1024) { trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big); - return ERR_PTR(-EFBIG); + ret = -EFBIG; + goto error; } _enter("%llu", i_size); - /* Get a request record to hold the page list. We want to hold it - * inline if we can, but we don't want to make an order 1 allocation. - */ nr_pages = (i_size + PAGE_SIZE - 1) / PAGE_SIZE; - nr_inline = nr_pages; - if (nr_inline > (PAGE_SIZE - sizeof(*req)) / sizeof(struct page *)) - nr_inline = 0; - req = kzalloc(struct_size(req, array, nr_inline), GFP_KERNEL); - if (!req) - return ERR_PTR(-ENOMEM); - - refcount_set(&req->usage, 1); - req->nr_pages = nr_pages; req->actual_len = i_size; /* May change */ req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */ req->data_version = dvnode->status.data_version; /* May change */ - if (nr_inline > 0) { - req->pages = req->array; - } else { - req->pages = kcalloc(nr_pages, sizeof(struct page *), - GFP_KERNEL); - if (!req->pages) - goto error; - } + iov_iter_xarray(&req->def_iter, READ, &dvnode->vfs_inode.i_mapping->i_pages, + 0, i_size); + req->iter = &req->def_iter; - /* Get a list of all the pages that hold or will hold the directory - * content. We need to fill in any gaps that we might find where the - * memory reclaimer has been at work. If there are any gaps, we will + /* Fill in any gaps that we might find where the memory reclaimer has + * been at work and pin all the pages. If there are any gaps, we will * need to reread the entire directory contents. */ - i = 0; - do { + i = req->nr_pages; + while (i < nr_pages) { + struct page *pages[8], *page; + n = find_get_pages_contig(dvnode->vfs_inode.i_mapping, i, - req->nr_pages - i, - req->pages + i); - _debug("find %u at %u/%u", n, i, req->nr_pages); + min_t(unsigned int, nr_pages - i, + ARRAY_SIZE(pages)), + pages); + _debug("find %u at %u/%u", n, i, nr_pages); + if (n == 0) { gfp_t gfp = dvnode->vfs_inode.i_mapping->gfp_mask; @@ -272,22 +340,24 @@ retry: afs_stat_v(dvnode, n_inval); ret = -ENOMEM; - req->pages[i] = __page_cache_alloc(gfp); - if (!req->pages[i]) + page = __page_cache_alloc(gfp); + if (!page) goto error; - ret = add_to_page_cache_lru(req->pages[i], + ret = add_to_page_cache_lru(page, dvnode->vfs_inode.i_mapping, i, gfp); if (ret < 0) goto error; - attach_page_private(req->pages[i], (void *)1); - unlock_page(req->pages[i]); + attach_page_private(page, (void *)1); + unlock_page(page); + req->nr_pages++; i++; } else { + req->nr_pages += n; i += n; } - } while (i < req->nr_pages); + } /* If we're going to reload, we need to lock all the pages to prevent * races. @@ -305,18 +375,23 @@ retry: if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { trace_afs_reload_dir(dvnode); - ret = afs_fetch_data(dvnode, key, req); + ret = afs_fetch_data(dvnode, req); if (ret < 0) goto error_unlock; task_io_account_read(PAGE_SIZE * req->nr_pages); - if (req->len < req->file_size) - goto content_has_grown; + if (req->len < req->file_size) { + /* The content has grown, so we need to expand the + * buffer. + */ + up_write(&dvnode->validate_lock); + goto expand; + } /* Validate the data we just read. */ - ret = -EIO; - if (!afs_dir_check_pages(dvnode, req)) + ret = afs_dir_check(dvnode, req); + if (ret < 0) goto error_unlock; // TODO: Trim excess pages @@ -334,11 +409,6 @@ error: afs_put_read(req); _leave(" = %d", ret); return ERR_PTR(ret); - -content_has_grown: - up_write(&dvnode->validate_lock); - afs_put_read(req); - goto retry; } /* @@ -448,6 +518,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, struct afs_read *req; struct page *page; unsigned blkoff, limit; + void __rcu **slot; int ret; _enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos); @@ -472,9 +543,15 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, blkoff = ctx->pos & ~(sizeof(union afs_xdr_dir_block) - 1); /* Fetch the appropriate page from the directory and re-add it - * to the LRU. + * to the LRU. We have all the pages pinned with an extra ref. */ - page = req->pages[blkoff / PAGE_SIZE]; + rcu_read_lock(); + page = NULL; + slot = radix_tree_lookup_slot(&dvnode->vfs_inode.i_mapping->i_pages, + blkoff / PAGE_SIZE); + if (slot) + page = radix_tree_deref_slot(slot); + rcu_read_unlock(); if (!page) { ret = afs_bad(dvnode, afs_file_error_dir_missing_page); break; @@ -1342,6 +1419,7 @@ static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->file[0].update_ctime = true; op->dentry = dentry; op->create.mode = S_IFDIR | mode; @@ -1423,6 +1501,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->file[0].update_ctime = true; op->dentry = dentry; @@ -1559,6 +1638,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->file[0].update_ctime = true; /* Try to make sure we have a callback promise on the victim. */ @@ -1641,6 +1721,7 @@ static int afs_create(struct user_namespace *mnt_userns, struct inode *dir, afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->file[0].update_ctime = true; op->dentry = dentry; @@ -1715,6 +1796,7 @@ static int afs_link(struct dentry *from, struct inode *dir, afs_op_set_vnode(op, 0, dvnode); afs_op_set_vnode(op, 1, vnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->file[0].update_ctime = true; op->file[1].update_ctime = true; @@ -1837,7 +1919,9 @@ static void afs_rename_edit_dir(struct afs_operation *op) new_inode = d_inode(new_dentry); if (new_inode) { spin_lock(&new_inode->i_lock); - if (new_inode->i_nlink > 0) + if (S_ISDIR(new_inode->i_mode)) + clear_nlink(new_inode); + else if (new_inode->i_nlink > 0) drop_nlink(new_inode); spin_unlock(&new_inode->i_lock); } @@ -1910,6 +1994,8 @@ static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */ op->file[0].dv_delta = 1; op->file[1].dv_delta = 1; + op->file[0].modification = true; + op->file[1].modification = true; op->file[0].update_ctime = true; op->file[1].update_ctime = true; @@ -2006,6 +2092,6 @@ static void afs_dir_invalidatepage(struct page *page, unsigned int offset, afs_stat_v(dvnode, n_inval); /* we clean up only if the entire page is being invalidated */ - if (offset == 0 && length == PAGE_SIZE) + if (offset == 0 && length == thp_size(page)) detach_page_private(page); } diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index 04f75a44f243..dae9a57d7ec0 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -73,6 +73,8 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode afs_op_set_vnode(op, 1, dvnode); op->file[0].dv_delta = 1; op->file[1].dv_delta = 1; + op->file[0].modification = true; + op->file[1].modification = true; op->file[0].update_ctime = true; op->file[1].update_ctime = true; @@ -201,6 +203,7 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode afs_op_set_vnode(op, 0, dvnode); afs_op_set_vnode(op, 1, vnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->file[0].update_ctime = true; op->file[1].op_unlinked = true; op->file[1].update_ctime = true; diff --git a/fs/afs/file.c b/fs/afs/file.c index 960b64268623..db035ae2a134 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -14,6 +14,7 @@ #include <linux/gfp.h> #include <linux/task_io_accounting_ops.h> #include <linux/mm.h> +#include <linux/netfs.h> #include "internal.h" static int afs_file_mmap(struct file *file, struct vm_area_struct *vma); @@ -22,8 +23,7 @@ static void afs_invalidatepage(struct page *page, unsigned int offset, unsigned int length); static int afs_releasepage(struct page *page, gfp_t gfp_flags); -static int afs_readpages(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages); +static void afs_readahead(struct readahead_control *ractl); const struct file_operations afs_file_operations = { .open = afs_open, @@ -47,7 +47,7 @@ const struct inode_operations afs_file_inode_operations = { const struct address_space_operations afs_fs_aops = { .readpage = afs_readpage, - .readpages = afs_readpages, + .readahead = afs_readahead, .set_page_dirty = afs_set_page_dirty, .launder_page = afs_launder_page, .releasepage = afs_releasepage, @@ -184,41 +184,50 @@ int afs_release(struct inode *inode, struct file *file) } /* + * Allocate a new read record. + */ +struct afs_read *afs_alloc_read(gfp_t gfp) +{ + struct afs_read *req; + + req = kzalloc(sizeof(struct afs_read), gfp); + if (req) + refcount_set(&req->usage, 1); + + return req; +} + +/* * Dispose of a ref to a read record. */ void afs_put_read(struct afs_read *req) { - int i; - if (refcount_dec_and_test(&req->usage)) { - if (req->pages) { - for (i = 0; i < req->nr_pages; i++) - if (req->pages[i]) - put_page(req->pages[i]); - if (req->pages != req->array) - kfree(req->pages); - } + if (req->cleanup) + req->cleanup(req); + key_put(req->key); kfree(req); } } -#ifdef CONFIG_AFS_FSCACHE -/* - * deal with notification that a page was read from the cache - */ -static void afs_file_readpage_read_complete(struct page *page, - void *data, - int error) +static void afs_fetch_data_notify(struct afs_operation *op) { - _enter("%p,%p,%d", page, data, error); - - /* if the read completes with an error, we just unlock the page and let - * the VM reissue the readpage */ - if (!error) - SetPageUptodate(page); - unlock_page(page); + struct afs_read *req = op->fetch.req; + struct netfs_read_subrequest *subreq = req->subreq; + int error = op->error; + + if (error == -ECONNABORTED) + error = afs_abort_to_error(op->ac.abort_code); + req->error = error; + + if (subreq) { + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + netfs_subreq_terminated(subreq, error ?: req->actual_len, false); + req->subreq = NULL; + } else if (req->done) { + req->done(req); + } } -#endif static void afs_fetch_data_success(struct afs_operation *op) { @@ -228,10 +237,12 @@ static void afs_fetch_data_success(struct afs_operation *op) afs_vnode_commit_status(op, &op->file[0]); afs_stat_v(vnode, n_fetches); atomic_long_add(op->fetch.req->actual_len, &op->net->n_fetch_bytes); + afs_fetch_data_notify(op); } static void afs_fetch_data_put(struct afs_operation *op) { + op->fetch.req->error = op->error; afs_put_read(op->fetch.req); } @@ -240,13 +251,14 @@ static const struct afs_operation_ops afs_fetch_data_operation = { .issue_yfs_rpc = yfs_fs_fetch_data, .success = afs_fetch_data_success, .aborted = afs_check_for_remote_deletion, + .failed = afs_fetch_data_notify, .put = afs_fetch_data_put, }; /* * Fetch file data from the volume. */ -int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *req) +int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req) { struct afs_operation *op; @@ -255,11 +267,14 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *re vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, - key_serial(key)); + key_serial(req->key)); - op = afs_alloc_operation(key, vnode->volume); - if (IS_ERR(op)) + op = afs_alloc_operation(req->key, vnode->volume); + if (IS_ERR(op)) { + if (req->subreq) + netfs_subreq_terminated(req->subreq, PTR_ERR(op), false); return PTR_ERR(op); + } afs_op_set_vnode(op, 0, vnode); @@ -268,336 +283,103 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *re return afs_do_sync_operation(op); } -/* - * read page from file, directory or symlink, given a key to use - */ -int afs_page_filler(void *data, struct page *page) +static void afs_req_issue_op(struct netfs_read_subrequest *subreq) { - struct inode *inode = page->mapping->host; - struct afs_vnode *vnode = AFS_FS_I(inode); - struct afs_read *req; - struct key *key = data; - int ret; - - _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index); + struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode); + struct afs_read *fsreq; - BUG_ON(!PageLocked(page)); + fsreq = afs_alloc_read(GFP_NOFS); + if (!fsreq) + return netfs_subreq_terminated(subreq, -ENOMEM, false); - ret = -ESTALE; - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) - goto error; + fsreq->subreq = subreq; + fsreq->pos = subreq->start + subreq->transferred; + fsreq->len = subreq->len - subreq->transferred; + fsreq->key = subreq->rreq->netfs_priv; + fsreq->vnode = vnode; + fsreq->iter = &fsreq->def_iter; - /* is it cached? */ -#ifdef CONFIG_AFS_FSCACHE - ret = fscache_read_or_alloc_page(vnode->cache, - page, - afs_file_readpage_read_complete, - NULL, - GFP_KERNEL); -#else - ret = -ENOBUFS; -#endif - switch (ret) { - /* read BIO submitted (page in cache) */ - case 0: - break; - - /* page not yet cached */ - case -ENODATA: - _debug("cache said ENODATA"); - goto go_on; - - /* page will not be cached */ - case -ENOBUFS: - _debug("cache said ENOBUFS"); - - fallthrough; - default: - go_on: - req = kzalloc(struct_size(req, array, 1), GFP_KERNEL); - if (!req) - goto enomem; - - /* We request a full page. If the page is a partial one at the - * end of the file, the server will return a short read and the - * unmarshalling code will clear the unfilled space. - */ - refcount_set(&req->usage, 1); - req->pos = (loff_t)page->index << PAGE_SHIFT; - req->len = PAGE_SIZE; - req->nr_pages = 1; - req->pages = req->array; - req->pages[0] = page; - get_page(page); - - /* read the contents of the file from the server into the - * page */ - ret = afs_fetch_data(vnode, key, req); - afs_put_read(req); - - if (ret < 0) { - if (ret == -ENOENT) { - _debug("got NOENT from server" - " - marking file deleted and stale"); - set_bit(AFS_VNODE_DELETED, &vnode->flags); - ret = -ESTALE; - } + iov_iter_xarray(&fsreq->def_iter, READ, + &fsreq->vnode->vfs_inode.i_mapping->i_pages, + fsreq->pos, fsreq->len); -#ifdef CONFIG_AFS_FSCACHE - fscache_uncache_page(vnode->cache, page); -#endif - BUG_ON(PageFsCache(page)); - - if (ret == -EINTR || - ret == -ENOMEM || - ret == -ERESTARTSYS || - ret == -EAGAIN) - goto error; - goto io_error; - } + afs_fetch_data(fsreq->vnode, fsreq); +} - SetPageUptodate(page); +static int afs_symlink_readpage(struct page *page) +{ + struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + struct afs_read *fsreq; + int ret; - /* send the page to the cache */ -#ifdef CONFIG_AFS_FSCACHE - if (PageFsCache(page) && - fscache_write_page(vnode->cache, page, vnode->status.size, - GFP_KERNEL) != 0) { - fscache_uncache_page(vnode->cache, page); - BUG_ON(PageFsCache(page)); - } -#endif - unlock_page(page); - } + fsreq = afs_alloc_read(GFP_NOFS); + if (!fsreq) + return -ENOMEM; - _leave(" = 0"); - return 0; + fsreq->pos = page->index * PAGE_SIZE; + fsreq->len = PAGE_SIZE; + fsreq->vnode = vnode; + fsreq->iter = &fsreq->def_iter; + iov_iter_xarray(&fsreq->def_iter, READ, &page->mapping->i_pages, + fsreq->pos, fsreq->len); -io_error: - SetPageError(page); - goto error; -enomem: - ret = -ENOMEM; -error: - unlock_page(page); - _leave(" = %d", ret); + ret = afs_fetch_data(fsreq->vnode, fsreq); + page_endio(page, false, ret); return ret; } -/* - * read page from file, directory or symlink, given a file to nominate the key - * to be used - */ -static int afs_readpage(struct file *file, struct page *page) +static void afs_init_rreq(struct netfs_read_request *rreq, struct file *file) { - struct key *key; - int ret; - - if (file) { - key = afs_file_key(file); - ASSERT(key != NULL); - ret = afs_page_filler(key, page); - } else { - struct inode *inode = page->mapping->host; - key = afs_request_key(AFS_FS_S(inode->i_sb)->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); - } else { - ret = afs_page_filler(key, page); - key_put(key); - } - } - return ret; + rreq->netfs_priv = key_get(afs_file_key(file)); } -/* - * Make pages available as they're filled. - */ -static void afs_readpages_page_done(struct afs_read *req) +static bool afs_is_cache_enabled(struct inode *inode) { -#ifdef CONFIG_AFS_FSCACHE - struct afs_vnode *vnode = req->vnode; -#endif - struct page *page = req->pages[req->index]; + struct fscache_cookie *cookie = afs_vnode_cache(AFS_FS_I(inode)); - req->pages[req->index] = NULL; - SetPageUptodate(page); - - /* send the page to the cache */ -#ifdef CONFIG_AFS_FSCACHE - if (PageFsCache(page) && - fscache_write_page(vnode->cache, page, vnode->status.size, - GFP_KERNEL) != 0) { - fscache_uncache_page(vnode->cache, page); - BUG_ON(PageFsCache(page)); - } -#endif - unlock_page(page); - put_page(page); + return fscache_cookie_enabled(cookie) && !hlist_empty(&cookie->backing_objects); } -/* - * Read a contiguous set of pages. - */ -static int afs_readpages_one(struct file *file, struct address_space *mapping, - struct list_head *pages) +static int afs_begin_cache_operation(struct netfs_read_request *rreq) { - struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct afs_read *req; - struct list_head *p; - struct page *first, *page; - struct key *key = afs_file_key(file); - pgoff_t index; - int ret, n, i; - - /* Count the number of contiguous pages at the front of the list. Note - * that the list goes prev-wards rather than next-wards. - */ - first = lru_to_page(pages); - index = first->index + 1; - n = 1; - for (p = first->lru.prev; p != pages; p = p->prev) { - page = list_entry(p, struct page, lru); - if (page->index != index) - break; - index++; - n++; - } - - req = kzalloc(struct_size(req, array, n), GFP_NOFS); - if (!req) - return -ENOMEM; - - refcount_set(&req->usage, 1); - req->vnode = vnode; - req->page_done = afs_readpages_page_done; - req->pos = first->index; - req->pos <<= PAGE_SHIFT; - req->pages = req->array; - - /* Transfer the pages to the request. We add them in until one fails - * to add to the LRU and then we stop (as that'll make a hole in the - * contiguous run. - * - * Note that it's possible for the file size to change whilst we're - * doing this, but we rely on the server returning less than we asked - * for if the file shrank. We also rely on this to deal with a partial - * page at the end of the file. - */ - do { - page = lru_to_page(pages); - list_del(&page->lru); - index = page->index; - if (add_to_page_cache_lru(page, mapping, index, - readahead_gfp_mask(mapping))) { -#ifdef CONFIG_AFS_FSCACHE - fscache_uncache_page(vnode->cache, page); -#endif - put_page(page); - break; - } - - req->pages[req->nr_pages++] = page; - req->len += PAGE_SIZE; - } while (req->nr_pages < n); + struct afs_vnode *vnode = AFS_FS_I(rreq->inode); - if (req->nr_pages == 0) { - kfree(req); - return 0; - } - - ret = afs_fetch_data(vnode, key, req); - if (ret < 0) - goto error; - - task_io_account_read(PAGE_SIZE * req->nr_pages); - afs_put_read(req); - return 0; - -error: - if (ret == -ENOENT) { - _debug("got NOENT from server" - " - marking file deleted and stale"); - set_bit(AFS_VNODE_DELETED, &vnode->flags); - ret = -ESTALE; - } - - for (i = 0; i < req->nr_pages; i++) { - page = req->pages[i]; - if (page) { -#ifdef CONFIG_AFS_FSCACHE - fscache_uncache_page(vnode->cache, page); -#endif - SetPageError(page); - unlock_page(page); - } - } - - afs_put_read(req); - return ret; + return fscache_begin_read_operation(rreq, afs_vnode_cache(vnode)); } -/* - * read a set of pages - */ -static int afs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len, + struct page *page, void **_fsdata) { - struct key *key = afs_file_key(file); - struct afs_vnode *vnode; - int ret = 0; - - _enter("{%d},{%lu},,%d", - key_serial(key), mapping->host->i_ino, nr_pages); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); - ASSERT(key != NULL); + return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? -ESTALE : 0; +} - vnode = AFS_FS_I(mapping->host); - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { - _leave(" = -ESTALE"); - return -ESTALE; - } +static void afs_priv_cleanup(struct address_space *mapping, void *netfs_priv) +{ + key_put(netfs_priv); +} - /* attempt to read as many of the pages as possible */ -#ifdef CONFIG_AFS_FSCACHE - ret = fscache_read_or_alloc_pages(vnode->cache, - mapping, - pages, - &nr_pages, - afs_file_readpage_read_complete, - NULL, - mapping_gfp_mask(mapping)); -#else - ret = -ENOBUFS; -#endif +const struct netfs_read_request_ops afs_req_ops = { + .init_rreq = afs_init_rreq, + .is_cache_enabled = afs_is_cache_enabled, + .begin_cache_operation = afs_begin_cache_operation, + .check_write_begin = afs_check_write_begin, + .issue_op = afs_req_issue_op, + .cleanup = afs_priv_cleanup, +}; - switch (ret) { - /* all pages are being read from the cache */ - case 0: - BUG_ON(!list_empty(pages)); - BUG_ON(nr_pages != 0); - _leave(" = 0 [reading all]"); - return 0; - - /* there were pages that couldn't be read from the cache */ - case -ENODATA: - case -ENOBUFS: - break; - - /* other error */ - default: - _leave(" = %d", ret); - return ret; - } +static int afs_readpage(struct file *file, struct page *page) +{ + if (!file) + return afs_symlink_readpage(page); - while (!list_empty(pages)) { - ret = afs_readpages_one(file, mapping, pages); - if (ret < 0) - break; - } + return netfs_readpage(file, page, &afs_req_ops, NULL); +} - _leave(" = %d [netting]", ret); - return ret; +static void afs_readahead(struct readahead_control *ractl) +{ + netfs_readahead(ractl, &afs_req_ops, NULL); } /* @@ -625,8 +407,8 @@ static void afs_invalidate_dirty(struct page *page, unsigned int offset, return; /* We may need to shorten the dirty region */ - f = afs_page_dirty_from(priv); - t = afs_page_dirty_to(priv); + f = afs_page_dirty_from(page, priv); + t = afs_page_dirty_to(page, priv); if (t <= offset || f >= end) return; /* Doesn't overlap */ @@ -644,17 +426,17 @@ static void afs_invalidate_dirty(struct page *page, unsigned int offset, if (f == t) goto undirty; - priv = afs_page_dirty(f, t); + priv = afs_page_dirty(page, f, t); set_page_private(page, priv); - trace_afs_page_dirty(vnode, tracepoint_string("trunc"), page->index, priv); + trace_afs_page_dirty(vnode, tracepoint_string("trunc"), page); return; undirty: - trace_afs_page_dirty(vnode, tracepoint_string("undirty"), page->index, priv); + trace_afs_page_dirty(vnode, tracepoint_string("undirty"), page); clear_page_dirty_for_io(page); full_invalidate: - priv = (unsigned long)detach_page_private(page); - trace_afs_page_dirty(vnode, tracepoint_string("inval"), page->index, priv); + trace_afs_page_dirty(vnode, tracepoint_string("inval"), page); + detach_page_private(page); } /* @@ -669,20 +451,10 @@ static void afs_invalidatepage(struct page *page, unsigned int offset, BUG_ON(!PageLocked(page)); -#ifdef CONFIG_AFS_FSCACHE - /* we clean up only if the entire page is being invalidated */ - if (offset == 0 && length == PAGE_SIZE) { - if (PageFsCache(page)) { - struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); - fscache_wait_on_page_write(vnode->cache, page); - fscache_uncache_page(vnode->cache, page); - } - } -#endif - if (PagePrivate(page)) afs_invalidate_dirty(page, offset, length); + wait_on_page_fscache(page); _leave(""); } @@ -693,7 +465,6 @@ static void afs_invalidatepage(struct page *page, unsigned int offset, static int afs_releasepage(struct page *page, gfp_t gfp_flags) { struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); - unsigned long priv; _enter("{{%llx:%llu}[%lu],%lx},%x", vnode->fid.vid, vnode->fid.vnode, page->index, page->flags, @@ -702,16 +473,16 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags) /* deny if page is being written to the cache and the caller hasn't * elected to wait */ #ifdef CONFIG_AFS_FSCACHE - if (!fscache_maybe_release_page(vnode->cache, page, gfp_flags)) { - _leave(" = F [cache busy]"); - return 0; + if (PageFsCache(page)) { + if (!(gfp_flags & __GFP_DIRECT_RECLAIM) || !(gfp_flags & __GFP_FS)) + return false; + wait_on_page_fscache(page); } #endif if (PagePrivate(page)) { - priv = (unsigned long)detach_page_private(page); - trace_afs_page_dirty(vnode, tracepoint_string("rel"), - page->index, priv); + trace_afs_page_dirty(vnode, tracepoint_string("rel"), page); + detach_page_private(page); } /* indicate that the page can be released */ diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index 71c58723763d..d222dfbe976b 100644 --- a/fs/afs/fs_operation.c +++ b/fs/afs/fs_operation.c @@ -118,6 +118,8 @@ static void afs_prepare_vnode(struct afs_operation *op, struct afs_vnode_param * vp->cb_break_before = afs_calc_vnode_cb_break(vnode); if (vnode->lock_state != AFS_VNODE_LOCK_NONE) op->flags |= AFS_OPERATION_CUR_ONLY; + if (vp->modification) + set_bit(AFS_VNODE_MODIFYING, &vnode->flags); } if (vp->fid.vnode) @@ -198,8 +200,10 @@ void afs_wait_for_operation(struct afs_operation *op) case -ECONNABORTED: if (op->ops->aborted) op->ops->aborted(op); - break; + fallthrough; default: + if (op->ops->failed) + op->ops->failed(op); break; } @@ -223,6 +227,10 @@ int afs_put_operation(struct afs_operation *op) if (op->ops && op->ops->put) op->ops->put(op); + if (op->file[0].modification) + clear_bit(AFS_VNODE_MODIFYING, &op->file[0].vnode->flags); + if (op->file[1].modification && op->file[1].vnode != op->file[0].vnode) + clear_bit(AFS_VNODE_MODIFYING, &op->file[1].vnode->flags); if (op->file[0].put_vnode) iput(&op->file[0].vnode->vfs_inode); if (op->file[1].put_vnode) diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 1d95ed9dd86e..dd3f45d906d2 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -10,6 +10,7 @@ #include <linux/sched.h> #include <linux/circ_buf.h> #include <linux/iversion.h> +#include <linux/netfs.h> #include "internal.h" #include "afs_fs.h" #include "xdr_fs.h" @@ -302,17 +303,15 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) struct afs_vnode_param *vp = &op->file[0]; struct afs_read *req = op->fetch.req; const __be32 *bp; - unsigned int size; int ret; - _enter("{%u,%zu/%llu}", - call->unmarshall, iov_iter_count(call->iter), req->actual_len); + _enter("{%u,%zu,%zu/%llu}", + call->unmarshall, call->iov_len, iov_iter_count(call->iter), + req->actual_len); switch (call->unmarshall) { case 0: req->actual_len = 0; - req->index = 0; - req->offset = req->pos & (PAGE_SIZE - 1); call->unmarshall++; if (call->operation_ID == FSFETCHDATA64) { afs_extract_to_tmp64(call); @@ -322,7 +321,10 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) } fallthrough; - /* extract the returned data length */ + /* Extract the returned data length into + * ->actual_len. This may indicate more or less data than was + * requested will be returned. + */ case 1: _debug("extract data length"); ret = afs_extract_data(call, true); @@ -331,44 +333,25 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) req->actual_len = be64_to_cpu(call->tmp64); _debug("DATA length: %llu", req->actual_len); - req->remain = min(req->len, req->actual_len); - if (req->remain == 0) + + if (req->actual_len == 0) goto no_more_data; + call->iter = req->iter; + call->iov_len = min(req->actual_len, req->len); call->unmarshall++; - - begin_page: - ASSERTCMP(req->index, <, req->nr_pages); - if (req->remain > PAGE_SIZE - req->offset) - size = PAGE_SIZE - req->offset; - else - size = req->remain; - call->bvec[0].bv_len = size; - call->bvec[0].bv_offset = req->offset; - call->bvec[0].bv_page = req->pages[req->index]; - iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size); - ASSERTCMP(size, <=, PAGE_SIZE); fallthrough; /* extract the returned data */ case 2: _debug("extract data %zu/%llu", - iov_iter_count(call->iter), req->remain); + iov_iter_count(call->iter), req->actual_len); ret = afs_extract_data(call, true); if (ret < 0) return ret; - req->remain -= call->bvec[0].bv_len; - req->offset += call->bvec[0].bv_len; - ASSERTCMP(req->offset, <=, PAGE_SIZE); - if (req->offset == PAGE_SIZE) { - req->offset = 0; - req->index++; - if (req->remain > 0) - goto begin_page; - } - ASSERTCMP(req->remain, ==, 0); + call->iter = &call->def_iter; if (req->actual_len <= req->len) goto no_more_data; @@ -405,22 +388,12 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) req->file_size = vp->scb.status.size; call->unmarshall++; + fallthrough; case 5: break; } - for (; req->index < req->nr_pages; req->index++) { - if (req->offset < PAGE_SIZE) - zero_user_segment(req->pages[req->index], - req->offset, PAGE_SIZE); - req->offset = 0; - } - - if (req->page_done) - for (req->index = 0; req->index < req->nr_pages; req->index++) - req->page_done(req); - _leave(" = 0 [done]"); return 0; } @@ -494,6 +467,8 @@ void afs_fs_fetch_data(struct afs_operation *op) if (!call) return afs_op_nomem(op); + req->call_debug_id = call->debug_id; + /* marshall the parameters */ bp = call->request; bp[0] = htonl(FSFETCHDATA); @@ -1079,8 +1054,7 @@ static const struct afs_call_type afs_RXFSStoreData64 = { /* * store a set of pages to a very large file */ -static void afs_fs_store_data64(struct afs_operation *op, - loff_t pos, loff_t size, loff_t i_size) +static void afs_fs_store_data64(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; @@ -1095,7 +1069,7 @@ static void afs_fs_store_data64(struct afs_operation *op, if (!call) return afs_op_nomem(op); - call->send_pages = true; + call->write_iter = op->store.write_iter; /* marshall the parameters */ bp = call->request; @@ -1111,47 +1085,38 @@ static void afs_fs_store_data64(struct afs_operation *op, *bp++ = 0; /* unix mode */ *bp++ = 0; /* segment size */ - *bp++ = htonl(upper_32_bits(pos)); - *bp++ = htonl(lower_32_bits(pos)); - *bp++ = htonl(upper_32_bits(size)); - *bp++ = htonl(lower_32_bits(size)); - *bp++ = htonl(upper_32_bits(i_size)); - *bp++ = htonl(lower_32_bits(i_size)); + *bp++ = htonl(upper_32_bits(op->store.pos)); + *bp++ = htonl(lower_32_bits(op->store.pos)); + *bp++ = htonl(upper_32_bits(op->store.size)); + *bp++ = htonl(lower_32_bits(op->store.size)); + *bp++ = htonl(upper_32_bits(op->store.i_size)); + *bp++ = htonl(lower_32_bits(op->store.i_size)); trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } /* - * store a set of pages + * Write data to a file on the server. */ void afs_fs_store_data(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - loff_t size, pos, i_size; __be32 *bp; _enter(",%x,{%llx:%llu},,", key_serial(op->key), vp->fid.vid, vp->fid.vnode); - size = (loff_t)op->store.last_to - (loff_t)op->store.first_offset; - if (op->store.first != op->store.last) - size += (loff_t)(op->store.last - op->store.first) << PAGE_SHIFT; - pos = (loff_t)op->store.first << PAGE_SHIFT; - pos += op->store.first_offset; - - i_size = i_size_read(&vp->vnode->vfs_inode); - if (pos + size > i_size) - i_size = size + pos; - _debug("size %llx, at %llx, i_size %llx", - (unsigned long long) size, (unsigned long long) pos, - (unsigned long long) i_size); + (unsigned long long)op->store.size, + (unsigned long long)op->store.pos, + (unsigned long long)op->store.i_size); - if (upper_32_bits(pos) || upper_32_bits(i_size) || upper_32_bits(size) || - upper_32_bits(pos + size)) - return afs_fs_store_data64(op, pos, size, i_size); + if (upper_32_bits(op->store.pos) || + upper_32_bits(op->store.size) || + upper_32_bits(op->store.i_size)) + return afs_fs_store_data64(op); call = afs_alloc_flat_call(op->net, &afs_RXFSStoreData, (4 + 6 + 3) * 4, @@ -1159,7 +1124,7 @@ void afs_fs_store_data(struct afs_operation *op) if (!call) return afs_op_nomem(op); - call->send_pages = true; + call->write_iter = op->store.write_iter; /* marshall the parameters */ bp = call->request; @@ -1175,9 +1140,9 @@ void afs_fs_store_data(struct afs_operation *op) *bp++ = 0; /* unix mode */ *bp++ = 0; /* segment size */ - *bp++ = htonl(lower_32_bits(pos)); - *bp++ = htonl(lower_32_bits(size)); - *bp++ = htonl(lower_32_bits(i_size)); + *bp++ = htonl(lower_32_bits(op->store.pos)); + *bp++ = htonl(lower_32_bits(op->store.size)); + *bp++ = htonl(lower_32_bits(op->store.i_size)); trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); @@ -1444,6 +1409,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) _debug("motd '%s'", p); call->unmarshall++; + fallthrough; case 8: break; @@ -1881,6 +1847,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) xdr_decode_AFSVolSync(&bp, &op->volsync); call->unmarshall++; + fallthrough; case 6: break; @@ -2015,6 +1982,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call) xdr_decode_AFSVolSync(&bp, &op->volsync); call->unmarshall++; + fallthrough; case 4: break; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 12be88716e4c..80b6c8d967d5 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -102,13 +102,13 @@ static int afs_inode_init_from_status(struct afs_operation *op, switch (status->type) { case AFS_FTYPE_FILE: - inode->i_mode = S_IFREG | status->mode; + inode->i_mode = S_IFREG | (status->mode & S_IALLUGO); inode->i_op = &afs_file_inode_operations; inode->i_fop = &afs_file_operations; inode->i_mapping->a_ops = &afs_fs_aops; break; case AFS_FTYPE_DIR: - inode->i_mode = S_IFDIR | status->mode; + inode->i_mode = S_IFDIR | (status->mode & S_IALLUGO); inode->i_op = &afs_dir_inode_operations; inode->i_fop = &afs_dir_file_operations; inode->i_mapping->a_ops = &afs_dir_aops; @@ -198,7 +198,7 @@ static void afs_apply_status(struct afs_operation *op, if (status->mode != vnode->status.mode) { mode = inode->i_mode; mode &= ~S_IALLUGO; - mode |= status->mode; + mode |= status->mode & S_IALLUGO; WRITE_ONCE(inode->i_mode, mode); } @@ -214,11 +214,12 @@ static void afs_apply_status(struct afs_operation *op, if (vp->dv_before + vp->dv_delta != status->data_version) { if (test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) - pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s\n", + pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s (op=%x)\n", vnode->fid.vid, vnode->fid.vnode, (unsigned long long)vp->dv_before + vp->dv_delta, (unsigned long long)status->data_version, - op->type ? op->type->name : "???"); + op->type ? op->type->name : "???", + op->debug_id); vnode->invalid_before = status->data_version; if (vnode->status.type == AFS_FTYPE_DIR) { @@ -293,8 +294,9 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v op->flags &= ~AFS_OPERATION_DIR_CONFLICT; } } else if (vp->scb.have_status) { - if (vp->dv_before + vp->dv_delta != vp->scb.status.data_version && - vp->speculative) + if (vp->speculative && + (test_bit(AFS_VNODE_MODIFYING, &vnode->flags) || + vp->dv_before != vnode->status.data_version)) /* Ignore the result of a speculative bulk status fetch * if it splits around a modification op, thereby * appearing to regress the data version. @@ -427,7 +429,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) } __packed key; struct afs_vnode_cache_aux aux; - if (vnode->status.type == AFS_FTYPE_DIR) { + if (vnode->status.type != AFS_FTYPE_FILE) { vnode->cache = NULL; return; } @@ -910,6 +912,7 @@ int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } op->ctime = attr->ia_ctime; op->file[0].update_ctime = 1; + op->file[0].modification = true; op->ops = &afs_setattr_operation; ret = afs_do_sync_operation(op); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 1627b1872812..5ed416f4ff33 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -14,6 +14,7 @@ #include <linux/key.h> #include <linux/workqueue.h> #include <linux/sched.h> +#define FSCACHE_USE_NEW_IO_API #include <linux/fscache.h> #include <linux/backing-dev.h> #include <linux/uuid.h> @@ -31,6 +32,7 @@ struct pagevec; struct afs_call; +struct afs_vnode; /* * Partial file-locking emulation mode. (The problem being that AFS3 only @@ -104,7 +106,9 @@ struct afs_call { struct afs_server *server; /* The fileserver record if fs op (pins ref) */ struct afs_vlserver *vlserver; /* The vlserver record if vl op */ void *request; /* request data (first part) */ + size_t iov_len; /* Size of *iter to be used */ struct iov_iter def_iter; /* Default buffer/data iterator */ + struct iov_iter *write_iter; /* Iterator defining write to be made */ struct iov_iter *iter; /* Iterator currently in use */ union { /* Convenience for ->def_iter */ struct kvec kvec[1]; @@ -131,7 +135,6 @@ struct afs_call { unsigned char unmarshall; /* unmarshalling phase */ unsigned char addr_ix; /* Address in ->alist */ bool drop_ref; /* T if need to drop ref for incoming call */ - bool send_pages; /* T if data from mapping should be sent */ bool need_attention; /* T if RxRPC poked us */ bool async; /* T if asynchronous */ bool upgrade; /* T to request service upgrade */ @@ -202,17 +205,19 @@ struct afs_read { loff_t pos; /* Where to start reading */ loff_t len; /* How much we're asking for */ loff_t actual_len; /* How much we're actually getting */ - loff_t remain; /* Amount remaining */ loff_t file_size; /* File size returned by server */ + struct key *key; /* The key to use to reissue the read */ + struct afs_vnode *vnode; /* The file being read into. */ + struct netfs_read_subrequest *subreq; /* Fscache helper read request this belongs to */ afs_dataversion_t data_version; /* Version number returned by server */ refcount_t usage; - unsigned int index; /* Which page we're reading into */ + unsigned int call_debug_id; unsigned int nr_pages; - unsigned int offset; /* offset into current page */ - struct afs_vnode *vnode; - void (*page_done)(struct afs_read *); - struct page **pages; - struct page *array[]; + int error; + void (*done)(struct afs_read *); + void (*cleanup)(struct afs_read *); + struct iov_iter *iter; /* Iterator representing the buffer */ + struct iov_iter def_iter; /* Default iterator */ }; /* @@ -640,6 +645,7 @@ struct afs_vnode { #define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */ #define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */ #define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */ +#define AFS_VNODE_MODIFYING 10 /* Set if we're performing a modification op */ struct list_head wb_keys; /* List of keys available for writeback */ struct list_head pending_locks; /* locks waiting to be granted */ @@ -739,6 +745,7 @@ struct afs_operation_ops { void (*issue_yfs_rpc)(struct afs_operation *op); void (*success)(struct afs_operation *op); void (*aborted)(struct afs_operation *op); + void (*failed)(struct afs_operation *op); void (*edit_dir)(struct afs_operation *op); void (*put)(struct afs_operation *op); }; @@ -756,6 +763,7 @@ struct afs_vnode_param { bool set_size:1; /* Must update i_size */ bool op_unlinked:1; /* True if file was unlinked by op */ bool speculative:1; /* T if speculative status fetch (no vnode lock) */ + bool modification:1; /* Set if the content gets modified */ }; /* @@ -808,12 +816,11 @@ struct afs_operation { afs_lock_type_t type; } lock; struct { - struct address_space *mapping; /* Pages being written from */ - pgoff_t first; /* first page in mapping to deal with */ - pgoff_t last; /* last page in mapping to deal with */ - unsigned first_offset; /* offset into mapping[first] */ - unsigned last_to; /* amount of mapping[last] */ - bool laundering; /* Laundering page, PG_writeback not set */ + struct iov_iter *write_iter; + loff_t pos; + loff_t size; + loff_t i_size; + bool laundering; /* Laundering page, PG_writeback not set */ } store; struct { struct iattr *attr; @@ -875,31 +882,31 @@ struct afs_vnode_cache_aux { #define __AFS_PAGE_PRIV_MMAPPED 0x8000UL #endif -static inline unsigned int afs_page_dirty_resolution(void) +static inline unsigned int afs_page_dirty_resolution(struct page *page) { - int shift = PAGE_SHIFT - (__AFS_PAGE_PRIV_SHIFT - 1); + int shift = thp_order(page) + PAGE_SHIFT - (__AFS_PAGE_PRIV_SHIFT - 1); return (shift > 0) ? shift : 0; } -static inline size_t afs_page_dirty_from(unsigned long priv) +static inline size_t afs_page_dirty_from(struct page *page, unsigned long priv) { unsigned long x = priv & __AFS_PAGE_PRIV_MASK; /* The lower bound is inclusive */ - return x << afs_page_dirty_resolution(); + return x << afs_page_dirty_resolution(page); } -static inline size_t afs_page_dirty_to(unsigned long priv) +static inline size_t afs_page_dirty_to(struct page *page, unsigned long priv) { unsigned long x = (priv >> __AFS_PAGE_PRIV_SHIFT) & __AFS_PAGE_PRIV_MASK; /* The upper bound is immediately beyond the region */ - return (x + 1) << afs_page_dirty_resolution(); + return (x + 1) << afs_page_dirty_resolution(page); } -static inline unsigned long afs_page_dirty(size_t from, size_t to) +static inline unsigned long afs_page_dirty(struct page *page, size_t from, size_t to) { - unsigned int res = afs_page_dirty_resolution(); + unsigned int res = afs_page_dirty_resolution(page); from >>= res; to = (to - 1) >> res; return (to << __AFS_PAGE_PRIV_SHIFT) | from; @@ -1040,13 +1047,14 @@ extern void afs_dynroot_depopulate(struct super_block *); extern const struct address_space_operations afs_fs_aops; extern const struct inode_operations afs_file_inode_operations; extern const struct file_operations afs_file_operations; +extern const struct netfs_read_request_ops afs_req_ops; extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *); extern void afs_put_wb_key(struct afs_wb_key *); extern int afs_open(struct inode *, struct file *); extern int afs_release(struct inode *, struct file *); -extern int afs_fetch_data(struct afs_vnode *, struct key *, struct afs_read *); -extern int afs_page_filler(void *, struct page *); +extern int afs_fetch_data(struct afs_vnode *, struct afs_read *); +extern struct afs_read *afs_alloc_read(gfp_t); extern void afs_put_read(struct afs_read *); static inline struct afs_read *afs_get_read(struct afs_read *req) @@ -1270,6 +1278,7 @@ static inline void afs_make_op_call(struct afs_operation *op, struct afs_call *c static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t size) { + call->iov_len = size; call->kvec[0].iov_base = buf; call->kvec[0].iov_len = size; iov_iter_kvec(&call->def_iter, READ, call->kvec, 1, size); @@ -1277,21 +1286,25 @@ static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t si static inline void afs_extract_to_tmp(struct afs_call *call) { + call->iov_len = sizeof(call->tmp); afs_extract_begin(call, &call->tmp, sizeof(call->tmp)); } static inline void afs_extract_to_tmp64(struct afs_call *call) { + call->iov_len = sizeof(call->tmp64); afs_extract_begin(call, &call->tmp64, sizeof(call->tmp64)); } static inline void afs_extract_discard(struct afs_call *call, size_t size) { + call->iov_len = size; iov_iter_discard(&call->def_iter, READ, size); } static inline void afs_extract_to_buf(struct afs_call *call, size_t size) { + call->iov_len = size; afs_extract_begin(call, call->buffer, size); } diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 8be709cb8542..23a1a92d64bb 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -271,40 +271,6 @@ void afs_flat_call_destructor(struct afs_call *call) call->buffer = NULL; } -#define AFS_BVEC_MAX 8 - -/* - * Load the given bvec with the next few pages. - */ -static void afs_load_bvec(struct afs_call *call, struct msghdr *msg, - struct bio_vec *bv, pgoff_t first, pgoff_t last, - unsigned offset) -{ - struct afs_operation *op = call->op; - struct page *pages[AFS_BVEC_MAX]; - unsigned int nr, n, i, to, bytes = 0; - - nr = min_t(pgoff_t, last - first + 1, AFS_BVEC_MAX); - n = find_get_pages_contig(op->store.mapping, first, nr, pages); - ASSERTCMP(n, ==, nr); - - msg->msg_flags |= MSG_MORE; - for (i = 0; i < nr; i++) { - to = PAGE_SIZE; - if (first + i >= last) { - to = op->store.last_to; - msg->msg_flags &= ~MSG_MORE; - } - bv[i].bv_page = pages[i]; - bv[i].bv_len = to - offset; - bv[i].bv_offset = offset; - bytes += to - offset; - offset = 0; - } - - iov_iter_bvec(&msg->msg_iter, WRITE, bv, nr, bytes); -} - /* * Advance the AFS call state when the RxRPC call ends the transmit phase. */ @@ -318,42 +284,6 @@ static void afs_notify_end_request_tx(struct sock *sock, } /* - * attach the data from a bunch of pages on an inode to a call - */ -static int afs_send_pages(struct afs_call *call, struct msghdr *msg) -{ - struct afs_operation *op = call->op; - struct bio_vec bv[AFS_BVEC_MAX]; - unsigned int bytes, nr, loop, offset; - pgoff_t first = op->store.first, last = op->store.last; - int ret; - - offset = op->store.first_offset; - op->store.first_offset = 0; - - do { - afs_load_bvec(call, msg, bv, first, last, offset); - trace_afs_send_pages(call, msg, first, last, offset); - - offset = 0; - bytes = msg->msg_iter.count; - nr = msg->msg_iter.nr_segs; - - ret = rxrpc_kernel_send_data(op->net->socket, call->rxcall, msg, - bytes, afs_notify_end_request_tx); - for (loop = 0; loop < nr; loop++) - put_page(bv[loop].bv_page); - if (ret < 0) - break; - - first += nr; - } while (first <= last); - - trace_afs_sent_pages(call, op->store.first, last, first, ret); - return ret; -} - -/* * Initiate a call and synchronously queue up the parameters for dispatch. Any * error is stored into the call struct, which the caller must check for. */ @@ -363,6 +293,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) struct rxrpc_call *rxcall; struct msghdr msg; struct kvec iov[1]; + size_t len; s64 tx_total_len; int ret; @@ -383,21 +314,8 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) * after the initial fixed part. */ tx_total_len = call->request_size; - if (call->send_pages) { - struct afs_operation *op = call->op; - - if (op->store.last == op->store.first) { - tx_total_len += op->store.last_to - op->store.first_offset; - } else { - /* It looks mathematically like you should be able to - * combine the following lines with the ones above, but - * unsigned arithmetic is fun when it wraps... - */ - tx_total_len += PAGE_SIZE - op->store.first_offset; - tx_total_len += op->store.last_to; - tx_total_len += (op->store.last - op->store.first - 1) * PAGE_SIZE; - } - } + if (call->write_iter) + tx_total_len += iov_iter_count(call->write_iter); /* If the call is going to be asynchronous, we need an extra ref for * the call to hold itself so the caller need not hang on to its ref. @@ -439,7 +357,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, call->request_size); msg.msg_control = NULL; msg.msg_controllen = 0; - msg.msg_flags = MSG_WAITALL | (call->send_pages ? MSG_MORE : 0); + msg.msg_flags = MSG_WAITALL | (call->write_iter ? MSG_MORE : 0); ret = rxrpc_kernel_send_data(call->net->socket, rxcall, &msg, call->request_size, @@ -447,8 +365,18 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) if (ret < 0) goto error_do_abort; - if (call->send_pages) { - ret = afs_send_pages(call, &msg); + if (call->write_iter) { + msg.msg_iter = *call->write_iter; + msg.msg_flags &= ~MSG_MORE; + trace_afs_send_data(call, &msg); + + ret = rxrpc_kernel_send_data(call->net->socket, + call->rxcall, &msg, + iov_iter_count(&msg.msg_iter), + afs_notify_end_request_tx); + *call->write_iter = msg.msg_iter; + + trace_afs_sent_data(call, &msg, ret); if (ret < 0) goto error_do_abort; } @@ -466,9 +394,10 @@ error_do_abort: rxrpc_kernel_abort_call(call->net->socket, rxcall, RX_USER_ABORT, ret, "KSD"); } else { + len = 0; iov_iter_kvec(&msg.msg_iter, READ, NULL, 0, 0); rxrpc_kernel_recv_data(call->net->socket, rxcall, - &msg.msg_iter, false, + &msg.msg_iter, &len, false, &call->abort_code, &call->service_id); ac->abort_code = call->abort_code; ac->responded = true; @@ -499,11 +428,45 @@ error_kill_call: } /* + * Log remote abort codes that indicate that we have a protocol disagreement + * with the server. + */ +static void afs_log_error(struct afs_call *call, s32 remote_abort) +{ + static int max = 0; + const char *msg; + int m; + + switch (remote_abort) { + case RX_EOF: msg = "unexpected EOF"; break; + case RXGEN_CC_MARSHAL: msg = "client marshalling"; break; + case RXGEN_CC_UNMARSHAL: msg = "client unmarshalling"; break; + case RXGEN_SS_MARSHAL: msg = "server marshalling"; break; + case RXGEN_SS_UNMARSHAL: msg = "server unmarshalling"; break; + case RXGEN_DECODE: msg = "opcode decode"; break; + case RXGEN_SS_XDRFREE: msg = "server XDR cleanup"; break; + case RXGEN_CC_XDRFREE: msg = "client XDR cleanup"; break; + case -32: msg = "insufficient data"; break; + default: + return; + } + + m = max; + if (m < 3) { + max = m + 1; + pr_notice("kAFS: Peer reported %s failure on %s [%pISp]\n", + msg, call->type->name, + &call->alist->addrs[call->addr_ix].transport); + } +} + +/* * deliver messages to a call */ static void afs_deliver_to_call(struct afs_call *call) { enum afs_call_state state; + size_t len; u32 abort_code, remote_abort = 0; int ret; @@ -516,10 +479,11 @@ static void afs_deliver_to_call(struct afs_call *call) state == AFS_CALL_SV_AWAIT_ACK ) { if (state == AFS_CALL_SV_AWAIT_ACK) { + len = 0; iov_iter_kvec(&call->def_iter, READ, NULL, 0, 0); ret = rxrpc_kernel_recv_data(call->net->socket, call->rxcall, &call->def_iter, - false, &remote_abort, + &len, false, &remote_abort, &call->service_id); trace_afs_receive_data(call, &call->def_iter, false, ret); @@ -559,6 +523,7 @@ static void afs_deliver_to_call(struct afs_call *call) goto out; case -ECONNABORTED: ASSERTCMP(state, ==, AFS_CALL_COMPLETE); + afs_log_error(call, call->abort_code); goto done; case -ENOTSUPP: abort_code = RXGEN_OPCODE; @@ -929,10 +894,11 @@ int afs_extract_data(struct afs_call *call, bool want_more) u32 remote_abort = 0; int ret; - _enter("{%s,%zu},%d", call->type->name, iov_iter_count(iter), want_more); + _enter("{%s,%zu,%zu},%d", + call->type->name, call->iov_len, iov_iter_count(iter), want_more); ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, iter, - want_more, &remote_abort, + &call->iov_len, want_more, &remote_abort, &call->service_id); if (ret == 0 || ret == -EAGAIN) return ret; diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index dc9327332f06..00fca3c66ba6 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -593,6 +593,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) if (ret < 0) return ret; call->unmarshall = 6; + fallthrough; case 6: break; diff --git a/fs/afs/write.c b/fs/afs/write.c index eb737ed63afb..a523bb86915d 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -11,6 +11,8 @@ #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/pagevec.h> +#include <linux/netfs.h> +#include <linux/fscache.h> #include "internal.h" /* @@ -23,55 +25,6 @@ int afs_set_page_dirty(struct page *page) } /* - * partly or wholly fill a page that's under preparation for writing - */ -static int afs_fill_page(struct afs_vnode *vnode, struct key *key, - loff_t pos, unsigned int len, struct page *page) -{ - struct afs_read *req; - size_t p; - void *data; - int ret; - - _enter(",,%llu", (unsigned long long)pos); - - if (pos >= vnode->vfs_inode.i_size) { - p = pos & ~PAGE_MASK; - ASSERTCMP(p + len, <=, PAGE_SIZE); - data = kmap(page); - memset(data + p, 0, len); - kunmap(page); - return 0; - } - - req = kzalloc(struct_size(req, array, 1), GFP_KERNEL); - if (!req) - return -ENOMEM; - - refcount_set(&req->usage, 1); - req->pos = pos; - req->len = len; - req->nr_pages = 1; - req->pages = req->array; - req->pages[0] = page; - get_page(page); - - ret = afs_fetch_data(vnode, key, req); - afs_put_read(req); - if (ret < 0) { - if (ret == -ENOENT) { - _debug("got NOENT from server" - " - marking file deleted and stale"); - set_bit(AFS_VNODE_DELETED, &vnode->flags); - ret = -ESTALE; - } - } - - _leave(" = %d", ret); - return ret; -} - -/* * prepare to perform part of a write to a page */ int afs_write_begin(struct file *file, struct address_space *mapping, @@ -80,47 +33,40 @@ int afs_write_begin(struct file *file, struct address_space *mapping, { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); struct page *page; - struct key *key = afs_file_key(file); unsigned long priv; - unsigned f, from = pos & (PAGE_SIZE - 1); - unsigned t, to = from + len; - pgoff_t index = pos >> PAGE_SHIFT; + unsigned f, from; + unsigned t, to; + pgoff_t index; int ret; - _enter("{%llx:%llu},{%lx},%u,%u", - vnode->fid.vid, vnode->fid.vnode, index, from, to); + _enter("{%llx:%llu},%llx,%x", + vnode->fid.vid, vnode->fid.vnode, pos, len); - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; + /* Prefetch area to be written into the cache if we're caching this + * file. We need to do this before we get a lock on the page in case + * there's more than one writer competing for the same cache block. + */ + ret = netfs_write_begin(file, mapping, pos, len, flags, &page, fsdata, + &afs_req_ops, NULL); + if (ret < 0) + return ret; - if (!PageUptodate(page) && len != PAGE_SIZE) { - ret = afs_fill_page(vnode, key, pos & PAGE_MASK, PAGE_SIZE, page); - if (ret < 0) { - unlock_page(page); - put_page(page); - _leave(" = %d [prep]", ret); - return ret; - } - SetPageUptodate(page); - } + index = page->index; + from = pos - index * PAGE_SIZE; + to = from + len; try_again: /* See if this page is already partially written in a way that we can * merge the new write with. */ - t = f = 0; if (PagePrivate(page)) { priv = page_private(page); - f = afs_page_dirty_from(priv); - t = afs_page_dirty_to(priv); + f = afs_page_dirty_from(page, priv); + t = afs_page_dirty_to(page, priv); ASSERTCMP(f, <=, t); - } - if (f != t) { if (PageWriteback(page)) { - trace_afs_page_dirty(vnode, tracepoint_string("alrdy"), - page->index, priv); + trace_afs_page_dirty(vnode, tracepoint_string("alrdy"), page); goto flush_conflicting_write; } /* If the file is being filled locally, allow inter-write @@ -164,12 +110,10 @@ int afs_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); - struct key *key = afs_file_key(file); unsigned long priv; - unsigned int f, from = pos & (PAGE_SIZE - 1); + unsigned int f, from = pos & (thp_size(page) - 1); unsigned int t, to = from + copied; loff_t i_size, maybe_i_size; - int ret = 0; _enter("{%llx:%llu},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index); @@ -188,88 +132,75 @@ int afs_write_end(struct file *file, struct address_space *mapping, write_sequnlock(&vnode->cb_lock); } - if (!PageUptodate(page)) { - if (copied < len) { - /* Try and load any missing data from the server. The - * unmarshalling routine will take care of clearing any - * bits that are beyond the EOF. - */ - ret = afs_fill_page(vnode, key, pos + copied, - len - copied, page); - if (ret < 0) - goto out; - } - SetPageUptodate(page); - } + ASSERT(PageUptodate(page)); if (PagePrivate(page)) { priv = page_private(page); - f = afs_page_dirty_from(priv); - t = afs_page_dirty_to(priv); + f = afs_page_dirty_from(page, priv); + t = afs_page_dirty_to(page, priv); if (from < f) f = from; if (to > t) t = to; - priv = afs_page_dirty(f, t); + priv = afs_page_dirty(page, f, t); set_page_private(page, priv); - trace_afs_page_dirty(vnode, tracepoint_string("dirty+"), - page->index, priv); + trace_afs_page_dirty(vnode, tracepoint_string("dirty+"), page); } else { - priv = afs_page_dirty(from, to); + priv = afs_page_dirty(page, from, to); attach_page_private(page, (void *)priv); - trace_afs_page_dirty(vnode, tracepoint_string("dirty"), - page->index, priv); + trace_afs_page_dirty(vnode, tracepoint_string("dirty"), page); } - set_page_dirty(page); - if (PageDirty(page)) - _debug("dirtied"); - ret = copied; + if (set_page_dirty(page)) + _debug("dirtied %lx", page->index); out: unlock_page(page); put_page(page); - return ret; + return copied; } /* * kill all the pages in the given range */ static void afs_kill_pages(struct address_space *mapping, - pgoff_t first, pgoff_t last) + loff_t start, loff_t len) { struct afs_vnode *vnode = AFS_FS_I(mapping->host); struct pagevec pv; - unsigned count, loop; + unsigned int loop, psize; - _enter("{%llx:%llu},%lx-%lx", - vnode->fid.vid, vnode->fid.vnode, first, last); + _enter("{%llx:%llu},%llx @%llx", + vnode->fid.vid, vnode->fid.vnode, len, start); pagevec_init(&pv); do { - _debug("kill %lx-%lx", first, last); + _debug("kill %llx @%llx", len, start); - count = last - first + 1; - if (count > PAGEVEC_SIZE) - count = PAGEVEC_SIZE; - pv.nr = find_get_pages_contig(mapping, first, count, pv.pages); - ASSERTCMP(pv.nr, ==, count); + pv.nr = find_get_pages_contig(mapping, start / PAGE_SIZE, + PAGEVEC_SIZE, pv.pages); + if (pv.nr == 0) + break; - for (loop = 0; loop < count; loop++) { + for (loop = 0; loop < pv.nr; loop++) { struct page *page = pv.pages[loop]; + + if (page->index * PAGE_SIZE >= start + len) + break; + + psize = thp_size(page); + start += psize; + len -= psize; ClearPageUptodate(page); - SetPageError(page); end_page_writeback(page); - if (page->index >= first) - first = page->index + 1; lock_page(page); generic_error_remove_page(mapping, page); unlock_page(page); } __pagevec_release(&pv); - } while (first <= last); + } while (len > 0); _leave(""); } @@ -279,37 +210,40 @@ static void afs_kill_pages(struct address_space *mapping, */ static void afs_redirty_pages(struct writeback_control *wbc, struct address_space *mapping, - pgoff_t first, pgoff_t last) + loff_t start, loff_t len) { struct afs_vnode *vnode = AFS_FS_I(mapping->host); struct pagevec pv; - unsigned count, loop; + unsigned int loop, psize; - _enter("{%llx:%llu},%lx-%lx", - vnode->fid.vid, vnode->fid.vnode, first, last); + _enter("{%llx:%llu},%llx @%llx", + vnode->fid.vid, vnode->fid.vnode, len, start); pagevec_init(&pv); do { - _debug("redirty %lx-%lx", first, last); + _debug("redirty %llx @%llx", len, start); - count = last - first + 1; - if (count > PAGEVEC_SIZE) - count = PAGEVEC_SIZE; - pv.nr = find_get_pages_contig(mapping, first, count, pv.pages); - ASSERTCMP(pv.nr, ==, count); + pv.nr = find_get_pages_contig(mapping, start / PAGE_SIZE, + PAGEVEC_SIZE, pv.pages); + if (pv.nr == 0) + break; - for (loop = 0; loop < count; loop++) { + for (loop = 0; loop < pv.nr; loop++) { struct page *page = pv.pages[loop]; + if (page->index * PAGE_SIZE >= start + len) + break; + + psize = thp_size(page); + start += psize; + len -= psize; redirty_page_for_writepage(wbc, page); end_page_writeback(page); - if (page->index >= first) - first = page->index + 1; } __pagevec_release(&pv); - } while (first <= last); + } while (len > 0); _leave(""); } @@ -317,37 +251,32 @@ static void afs_redirty_pages(struct writeback_control *wbc, /* * completion of write to server */ -static void afs_pages_written_back(struct afs_vnode *vnode, - pgoff_t first, pgoff_t last) +static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len) { - struct pagevec pv; - unsigned long priv; - unsigned count, loop; + struct address_space *mapping = vnode->vfs_inode.i_mapping; + struct page *page; + pgoff_t end; - _enter("{%llx:%llu},{%lx-%lx}", - vnode->fid.vid, vnode->fid.vnode, first, last); + XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE); - pagevec_init(&pv); + _enter("{%llx:%llu},{%x @%llx}", + vnode->fid.vid, vnode->fid.vnode, len, start); - do { - _debug("done %lx-%lx", first, last); - - count = last - first + 1; - if (count > PAGEVEC_SIZE) - count = PAGEVEC_SIZE; - pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping, - first, count, pv.pages); - ASSERTCMP(pv.nr, ==, count); - - for (loop = 0; loop < count; loop++) { - priv = (unsigned long)detach_page_private(pv.pages[loop]); - trace_afs_page_dirty(vnode, tracepoint_string("clear"), - pv.pages[loop]->index, priv); - end_page_writeback(pv.pages[loop]); + rcu_read_lock(); + + end = (start + len - 1) / PAGE_SIZE; + xas_for_each(&xas, page, end) { + if (!PageWriteback(page)) { + kdebug("bad %x @%llx page %lx %lx", len, start, page->index, end); + ASSERT(PageWriteback(page)); } - first += count; - __pagevec_release(&pv); - } while (first <= last); + + trace_afs_page_dirty(vnode, tracepoint_string("clear"), page); + detach_page_private(page); + page_endio(page, true, 0); + } + + rcu_read_unlock(); afs_prune_wb_keys(vnode); _leave(""); @@ -402,11 +331,9 @@ static void afs_store_data_success(struct afs_operation *op) afs_vnode_commit_status(op, &op->file[0]); if (op->error == 0) { if (!op->store.laundering) - afs_pages_written_back(vnode, op->store.first, op->store.last); + afs_pages_written_back(vnode, op->store.pos, op->store.size); afs_stat_v(vnode, n_stores); - atomic_long_add((op->store.last * PAGE_SIZE + op->store.last_to) - - (op->store.first * PAGE_SIZE + op->store.first_offset), - &afs_v2net(vnode)->n_store_bytes); + atomic_long_add(op->store.size, &afs_v2net(vnode)->n_store_bytes); } } @@ -419,21 +346,20 @@ static const struct afs_operation_ops afs_store_data_operation = { /* * write to a file */ -static int afs_store_data(struct address_space *mapping, - pgoff_t first, pgoff_t last, - unsigned offset, unsigned to, bool laundering) +static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t pos, + bool laundering) { - struct afs_vnode *vnode = AFS_FS_I(mapping->host); struct afs_operation *op; struct afs_wb_key *wbk = NULL; - int ret; + loff_t size = iov_iter_count(iter), i_size; + int ret = -ENOKEY; - _enter("%s{%llx:%llu.%u},%lx,%lx,%x,%x", + _enter("%s{%llx:%llu.%u},%llx,%llx", vnode->volume->name, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, - first, last, offset, to); + size, pos); ret = afs_get_writeback_key(vnode, &wbk); if (ret) { @@ -447,13 +373,15 @@ static int afs_store_data(struct address_space *mapping, return -ENOMEM; } + i_size = i_size_read(&vnode->vfs_inode); + afs_op_set_vnode(op, 0, vnode); op->file[0].dv_delta = 1; - op->store.mapping = mapping; - op->store.first = first; - op->store.last = last; - op->store.first_offset = offset; - op->store.last_to = to; + op->file[0].modification = true; + op->store.write_iter = iter; + op->store.pos = pos; + op->store.size = size; + op->store.i_size = max(pos + size, i_size); op->store.laundering = laundering; op->mtime = vnode->vfs_inode.i_mtime; op->flags |= AFS_OPERATION_UNINTR; @@ -487,73 +415,58 @@ try_next_key: } /* - * Synchronously write back the locked page and any subsequent non-locked dirty - * pages. + * Extend the region to be written back to include subsequent contiguously + * dirty pages if possible, but don't sleep while doing so. + * + * If this page holds new content, then we can include filler zeros in the + * writeback. */ -static int afs_write_back_from_locked_page(struct address_space *mapping, - struct writeback_control *wbc, - struct page *primary_page, - pgoff_t final_page) +static void afs_extend_writeback(struct address_space *mapping, + struct afs_vnode *vnode, + long *_count, + loff_t start, + loff_t max_len, + bool new_content, + unsigned int *_len) { - struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct page *pages[8], *page; - unsigned long count, priv; - unsigned n, offset, to, f, t; - pgoff_t start, first, last; - loff_t i_size, end; - int loop, ret; - - _enter(",%lx", primary_page->index); + struct pagevec pvec; + struct page *page; + unsigned long priv; + unsigned int psize, filler = 0; + unsigned int f, t; + loff_t len = *_len; + pgoff_t index = (start + len) / PAGE_SIZE; + bool stop = true; + unsigned int i; - count = 1; - if (test_set_page_writeback(primary_page)) - BUG(); + XA_STATE(xas, &mapping->i_pages, index); + pagevec_init(&pvec); - /* Find all consecutive lockable dirty pages that have contiguous - * written regions, stopping when we find a page that is not - * immediately lockable, is not dirty or is missing, or we reach the - * end of the range. - */ - start = primary_page->index; - priv = page_private(primary_page); - offset = afs_page_dirty_from(priv); - to = afs_page_dirty_to(priv); - trace_afs_page_dirty(vnode, tracepoint_string("store"), - primary_page->index, priv); - - WARN_ON(offset == to); - if (offset == to) - trace_afs_page_dirty(vnode, tracepoint_string("WARN"), - primary_page->index, priv); - - if (start >= final_page || - (to < PAGE_SIZE && !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags))) - goto no_more; - - start++; do { - _debug("more %lx [%lx]", start, count); - n = final_page - start + 1; - if (n > ARRAY_SIZE(pages)) - n = ARRAY_SIZE(pages); - n = find_get_pages_contig(mapping, start, ARRAY_SIZE(pages), pages); - _debug("fgpc %u", n); - if (n == 0) - goto no_more; - if (pages[0]->index != start) { - do { - put_page(pages[--n]); - } while (n > 0); - goto no_more; - } + /* Firstly, we gather up a batch of contiguous dirty pages + * under the RCU read lock - but we can't clear the dirty flags + * there if any of those pages are mapped. + */ + rcu_read_lock(); - for (loop = 0; loop < n; loop++) { - page = pages[loop]; - if (to != PAGE_SIZE && - !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)) + xas_for_each(&xas, page, ULONG_MAX) { + stop = true; + if (xas_retry(&xas, page)) + continue; + if (xa_is_value(page)) break; - if (page->index > final_page) + if (page->index != index) break; + + if (!page_cache_get_speculative(page)) { + xas_reset(&xas); + continue; + } + + /* Has the page moved or been split? */ + if (unlikely(page != xas_reload(&xas))) + break; + if (!trylock_page(page)) break; if (!PageDirty(page) || PageWriteback(page)) { @@ -561,57 +474,134 @@ static int afs_write_back_from_locked_page(struct address_space *mapping, break; } + psize = thp_size(page); priv = page_private(page); - f = afs_page_dirty_from(priv); - t = afs_page_dirty_to(priv); - if (f != 0 && - !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)) { + f = afs_page_dirty_from(page, priv); + t = afs_page_dirty_to(page, priv); + if (f != 0 && !new_content) { unlock_page(page); break; } - to = t; - trace_afs_page_dirty(vnode, tracepoint_string("store+"), - page->index, priv); + len += filler + t; + filler = psize - t; + if (len >= max_len || *_count <= 0) + stop = true; + else if (t == psize || new_content) + stop = false; + + index += thp_nr_pages(page); + if (!pagevec_add(&pvec, page)) + break; + if (stop) + break; + } + + if (!stop) + xas_pause(&xas); + rcu_read_unlock(); + + /* Now, if we obtained any pages, we can shift them to being + * writable and mark them for caching. + */ + if (!pagevec_count(&pvec)) + break; + + for (i = 0; i < pagevec_count(&pvec); i++) { + page = pvec.pages[i]; + trace_afs_page_dirty(vnode, tracepoint_string("store+"), page); if (!clear_page_dirty_for_io(page)) BUG(); if (test_set_page_writeback(page)) BUG(); + + *_count -= thp_nr_pages(page); unlock_page(page); - put_page(page); - } - count += loop; - if (loop < n) { - for (; loop < n; loop++) - put_page(pages[loop]); - goto no_more; } - start += loop; - } while (start <= final_page && count < 65536); + pagevec_release(&pvec); + cond_resched(); + } while (!stop); + + *_len = len; +} + +/* + * Synchronously write back the locked page and any subsequent non-locked dirty + * pages. + */ +static ssize_t afs_write_back_from_locked_page(struct address_space *mapping, + struct writeback_control *wbc, + struct page *page, + loff_t start, loff_t end) +{ + struct afs_vnode *vnode = AFS_FS_I(mapping->host); + struct iov_iter iter; + unsigned long priv; + unsigned int offset, to, len, max_len; + loff_t i_size = i_size_read(&vnode->vfs_inode); + bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); + long count = wbc->nr_to_write; + int ret; + + _enter(",%lx,%llx-%llx", page->index, start, end); + + if (test_set_page_writeback(page)) + BUG(); + + count -= thp_nr_pages(page); + + /* Find all consecutive lockable dirty pages that have contiguous + * written regions, stopping when we find a page that is not + * immediately lockable, is not dirty or is missing, or we reach the + * end of the range. + */ + priv = page_private(page); + offset = afs_page_dirty_from(page, priv); + to = afs_page_dirty_to(page, priv); + trace_afs_page_dirty(vnode, tracepoint_string("store"), page); + + len = to - offset; + start += offset; + if (start < i_size) { + /* Trim the write to the EOF; the extra data is ignored. Also + * put an upper limit on the size of a single storedata op. + */ + max_len = 65536 * 4096; + max_len = min_t(unsigned long long, max_len, end - start + 1); + max_len = min_t(unsigned long long, max_len, i_size - start); + + if (len < max_len && + (to == thp_size(page) || new_content)) + afs_extend_writeback(mapping, vnode, &count, + start, max_len, new_content, &len); + len = min_t(loff_t, len, max_len); + } -no_more: /* We now have a contiguous set of dirty pages, each with writeback * set; the first page is still locked at this point, but all the rest * have been unlocked. */ - unlock_page(primary_page); + unlock_page(page); - first = primary_page->index; - last = first + count - 1; + if (start < i_size) { + _debug("write back %x @%llx [%llx]", len, start, i_size); - end = (loff_t)last * PAGE_SIZE + to; - i_size = i_size_read(&vnode->vfs_inode); + iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len); + ret = afs_store_data(vnode, &iter, start, false); + } else { + _debug("write discard %x @%llx [%llx]", len, start, i_size); - _debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to); - if (end > i_size) - to = i_size & ~PAGE_MASK; + /* The dirty region was entirely beyond the EOF. */ + afs_pages_written_back(vnode, start, len); + ret = 0; + } - ret = afs_store_data(mapping, first, last, offset, to, false); switch (ret) { case 0: - ret = count; + wbc->nr_to_write = count; + ret = len; break; default: @@ -623,13 +613,13 @@ no_more: case -EKEYEXPIRED: case -EKEYREJECTED: case -EKEYREVOKED: - afs_redirty_pages(wbc, mapping, first, last); + afs_redirty_pages(wbc, mapping, start, len); mapping_set_error(mapping, ret); break; case -EDQUOT: case -ENOSPC: - afs_redirty_pages(wbc, mapping, first, last); + afs_redirty_pages(wbc, mapping, start, len); mapping_set_error(mapping, -ENOSPC); break; @@ -641,7 +631,7 @@ no_more: case -ENOMEDIUM: case -ENXIO: trace_afs_file_error(vnode, ret, afs_file_error_writeback_fail); - afs_kill_pages(mapping, first, last); + afs_kill_pages(mapping, start, len); mapping_set_error(mapping, ret); break; } @@ -656,19 +646,19 @@ no_more: */ int afs_writepage(struct page *page, struct writeback_control *wbc) { - int ret; + ssize_t ret; + loff_t start; _enter("{%lx},", page->index); + start = page->index * PAGE_SIZE; ret = afs_write_back_from_locked_page(page->mapping, wbc, page, - wbc->range_end >> PAGE_SHIFT); + start, LLONG_MAX - start); if (ret < 0) { - _leave(" = %d", ret); - return 0; + _leave(" = %zd", ret); + return ret; } - wbc->nr_to_write -= ret; - _leave(" = 0"); return 0; } @@ -678,35 +668,46 @@ int afs_writepage(struct page *page, struct writeback_control *wbc) */ static int afs_writepages_region(struct address_space *mapping, struct writeback_control *wbc, - pgoff_t index, pgoff_t end, pgoff_t *_next) + loff_t start, loff_t end, loff_t *_next) { struct page *page; - int ret, n; + ssize_t ret; + int n; - _enter(",,%lx,%lx,", index, end); + _enter("%llx,%llx,", start, end); do { - n = find_get_pages_range_tag(mapping, &index, end, - PAGECACHE_TAG_DIRTY, 1, &page); + pgoff_t index = start / PAGE_SIZE; + + n = find_get_pages_range_tag(mapping, &index, end / PAGE_SIZE, + PAGECACHE_TAG_DIRTY, 1, &page); if (!n) break; + start = (loff_t)page->index * PAGE_SIZE; /* May regress with THPs */ + _debug("wback %lx", page->index); - /* - * at this point we hold neither the i_pages lock nor the + /* At this point we hold neither the i_pages lock nor the * page lock: the page may be truncated or invalidated * (changing page->mapping to NULL), or even swizzled * back from swapper_space to tmpfs file mapping */ - ret = lock_page_killable(page); - if (ret < 0) { - put_page(page); - _leave(" = %d", ret); - return ret; + if (wbc->sync_mode != WB_SYNC_NONE) { + ret = lock_page_killable(page); + if (ret < 0) { + put_page(page); + return ret; + } + } else { + if (!trylock_page(page)) { + put_page(page); + return 0; + } } if (page->mapping != mapping || !PageDirty(page)) { + start += thp_size(page); unlock_page(page); put_page(page); continue; @@ -722,20 +723,20 @@ static int afs_writepages_region(struct address_space *mapping, if (!clear_page_dirty_for_io(page)) BUG(); - ret = afs_write_back_from_locked_page(mapping, wbc, page, end); + ret = afs_write_back_from_locked_page(mapping, wbc, page, start, end); put_page(page); if (ret < 0) { - _leave(" = %d", ret); + _leave(" = %zd", ret); return ret; } - wbc->nr_to_write -= ret; + start += ret; cond_resched(); - } while (index < end && wbc->nr_to_write > 0); + } while (wbc->nr_to_write > 0); - *_next = index; - _leave(" = 0 [%lx]", *_next); + *_next = start; + _leave(" = 0 [%llx]", *_next); return 0; } @@ -746,7 +747,7 @@ int afs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct afs_vnode *vnode = AFS_FS_I(mapping->host); - pgoff_t start, end, next; + loff_t start, next; int ret; _enter(""); @@ -761,22 +762,19 @@ int afs_writepages(struct address_space *mapping, return 0; if (wbc->range_cyclic) { - start = mapping->writeback_index; - end = -1; - ret = afs_writepages_region(mapping, wbc, start, end, &next); + start = mapping->writeback_index * PAGE_SIZE; + ret = afs_writepages_region(mapping, wbc, start, LLONG_MAX, &next); if (start > 0 && wbc->nr_to_write > 0 && ret == 0) ret = afs_writepages_region(mapping, wbc, 0, start, &next); - mapping->writeback_index = next; + mapping->writeback_index = next / PAGE_SIZE; } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { - end = (pgoff_t)(LLONG_MAX >> PAGE_SHIFT); - ret = afs_writepages_region(mapping, wbc, 0, end, &next); + ret = afs_writepages_region(mapping, wbc, 0, LLONG_MAX, &next); if (wbc->nr_to_write > 0) mapping->writeback_index = next; } else { - start = wbc->range_start >> PAGE_SHIFT; - end = wbc->range_end >> PAGE_SHIFT; - ret = afs_writepages_region(mapping, wbc, start, end, &next); + ret = afs_writepages_region(mapping, wbc, + wbc->range_start, wbc->range_end, &next); } up_read(&vnode->validate_lock); @@ -834,13 +832,13 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) { + struct page *page = thp_head(vmf->page); struct file *file = vmf->vma->vm_file; struct inode *inode = file_inode(file); struct afs_vnode *vnode = AFS_FS_I(inode); unsigned long priv; - _enter("{{%llx:%llu}},{%lx}", - vnode->fid.vid, vnode->fid.vnode, vmf->page->index); + _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index); sb_start_pagefault(inode->i_sb); @@ -848,29 +846,35 @@ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) * be modified. We then assume the entire page will need writing back. */ #ifdef CONFIG_AFS_FSCACHE - fscache_wait_on_page_write(vnode->cache, vmf->page); + if (PageFsCache(page) && + wait_on_page_fscache_killable(page) < 0) + return VM_FAULT_RETRY; #endif - if (wait_on_page_writeback_killable(vmf->page)) + if (wait_on_page_writeback_killable(page)) return VM_FAULT_RETRY; - if (lock_page_killable(vmf->page) < 0) + if (lock_page_killable(page) < 0) return VM_FAULT_RETRY; /* We mustn't change page->private until writeback is complete as that * details the portion of the page we need to write back and we might * need to redirty the page if there's a problem. */ - wait_on_page_writeback(vmf->page); + if (wait_on_page_writeback_killable(page) < 0) { + unlock_page(page); + return VM_FAULT_RETRY; + } - priv = afs_page_dirty(0, PAGE_SIZE); + priv = afs_page_dirty(page, 0, thp_size(page)); priv = afs_page_dirty_mmapped(priv); - trace_afs_page_dirty(vnode, tracepoint_string("mkwrite"), - vmf->page->index, priv); - if (PagePrivate(vmf->page)) - set_page_private(vmf->page, priv); - else - attach_page_private(vmf->page, (void *)priv); + if (PagePrivate(page)) { + set_page_private(page, priv); + trace_afs_page_dirty(vnode, tracepoint_string("mkwrite+"), page); + } else { + attach_page_private(page, (void *)priv); + trace_afs_page_dirty(vnode, tracepoint_string("mkwrite"), page); + } file_update_time(file); sb_end_pagefault(inode->i_sb); @@ -912,6 +916,8 @@ int afs_launder_page(struct page *page) { struct address_space *mapping = page->mapping; struct afs_vnode *vnode = AFS_FS_I(mapping->host); + struct iov_iter iter; + struct bio_vec bv[1]; unsigned long priv; unsigned int f, t; int ret = 0; @@ -921,26 +927,24 @@ int afs_launder_page(struct page *page) priv = page_private(page); if (clear_page_dirty_for_io(page)) { f = 0; - t = PAGE_SIZE; + t = thp_size(page); if (PagePrivate(page)) { - f = afs_page_dirty_from(priv); - t = afs_page_dirty_to(priv); + f = afs_page_dirty_from(page, priv); + t = afs_page_dirty_to(page, priv); } - trace_afs_page_dirty(vnode, tracepoint_string("launder"), - page->index, priv); - ret = afs_store_data(mapping, page->index, page->index, t, f, true); - } - - priv = (unsigned long)detach_page_private(page); - trace_afs_page_dirty(vnode, tracepoint_string("laundered"), - page->index, priv); + bv[0].bv_page = page; + bv[0].bv_offset = f; + bv[0].bv_len = t - f; + iov_iter_bvec(&iter, WRITE, bv, 1, bv[0].bv_len); -#ifdef CONFIG_AFS_FSCACHE - if (PageFsCache(page)) { - fscache_wait_on_page_write(vnode->cache, page); - fscache_uncache_page(vnode->cache, page); + trace_afs_page_dirty(vnode, tracepoint_string("launder"), page); + ret = afs_store_data(vnode, &iter, (loff_t)page->index * PAGE_SIZE, + true); } -#endif + + trace_afs_page_dirty(vnode, tracepoint_string("laundered"), page); + detach_page_private(page); + wait_on_page_fscache(page); return ret; } diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index bd787e71a657..2b35cba8ad62 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -360,22 +360,23 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) struct afs_vnode_param *vp = &op->file[0]; struct afs_read *req = op->fetch.req; const __be32 *bp; - unsigned int size; int ret; - _enter("{%u,%zu/%llu}", - call->unmarshall, iov_iter_count(call->iter), req->actual_len); + _enter("{%u,%zu, %zu/%llu}", + call->unmarshall, call->iov_len, iov_iter_count(call->iter), + req->actual_len); switch (call->unmarshall) { case 0: req->actual_len = 0; - req->index = 0; - req->offset = req->pos & (PAGE_SIZE - 1); afs_extract_to_tmp64(call); call->unmarshall++; fallthrough; - /* extract the returned data length */ + /* Extract the returned data length into ->actual_len. This + * may indicate more or less data than was requested will be + * returned. + */ case 1: _debug("extract data length"); ret = afs_extract_data(call, true); @@ -384,44 +385,25 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) req->actual_len = be64_to_cpu(call->tmp64); _debug("DATA length: %llu", req->actual_len); - req->remain = min(req->len, req->actual_len); - if (req->remain == 0) + + if (req->actual_len == 0) goto no_more_data; + call->iter = req->iter; + call->iov_len = min(req->actual_len, req->len); call->unmarshall++; - - begin_page: - ASSERTCMP(req->index, <, req->nr_pages); - if (req->remain > PAGE_SIZE - req->offset) - size = PAGE_SIZE - req->offset; - else - size = req->remain; - call->bvec[0].bv_len = size; - call->bvec[0].bv_offset = req->offset; - call->bvec[0].bv_page = req->pages[req->index]; - iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size); - ASSERTCMP(size, <=, PAGE_SIZE); fallthrough; /* extract the returned data */ case 2: _debug("extract data %zu/%llu", - iov_iter_count(call->iter), req->remain); + iov_iter_count(call->iter), req->actual_len); ret = afs_extract_data(call, true); if (ret < 0) return ret; - req->remain -= call->bvec[0].bv_len; - req->offset += call->bvec[0].bv_len; - ASSERTCMP(req->offset, <=, PAGE_SIZE); - if (req->offset == PAGE_SIZE) { - req->offset = 0; - req->index++; - if (req->remain > 0) - goto begin_page; - } - ASSERTCMP(req->remain, ==, 0); + call->iter = &call->def_iter; if (req->actual_len <= req->len) goto no_more_data; @@ -467,17 +449,6 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) break; } - for (; req->index < req->nr_pages; req->index++) { - if (req->offset < PAGE_SIZE) - zero_user_segment(req->pages[req->index], - req->offset, PAGE_SIZE); - req->offset = 0; - } - - if (req->page_done) - for (req->index = 0; req->index < req->nr_pages; req->index++) - req->page_done(req); - _leave(" = 0 [done]"); return 0; } @@ -516,6 +487,8 @@ void yfs_fs_fetch_data(struct afs_operation *op) if (!call) return afs_op_nomem(op); + req->call_debug_id = call->debug_id; + /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSFETCHDATA64); @@ -1102,25 +1075,15 @@ void yfs_fs_store_data(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - loff_t size, pos, i_size; __be32 *bp; _enter(",%x,{%llx:%llu},,", key_serial(op->key), vp->fid.vid, vp->fid.vnode); - size = (loff_t)op->store.last_to - (loff_t)op->store.first_offset; - if (op->store.first != op->store.last) - size += (loff_t)(op->store.last - op->store.first) << PAGE_SHIFT; - pos = (loff_t)op->store.first << PAGE_SHIFT; - pos += op->store.first_offset; - - i_size = i_size_read(&vp->vnode->vfs_inode); - if (pos + size > i_size) - i_size = size + pos; - _debug("size %llx, at %llx, i_size %llx", - (unsigned long long)size, (unsigned long long)pos, - (unsigned long long)i_size); + (unsigned long long)op->store.size, + (unsigned long long)op->store.pos, + (unsigned long long)op->store.i_size); call = afs_alloc_flat_call(op->net, &yfs_RXYFSStoreData64, sizeof(__be32) + @@ -1133,8 +1096,7 @@ void yfs_fs_store_data(struct afs_operation *op) if (!call) return afs_op_nomem(op); - call->key = op->key; - call->send_pages = true; + call->write_iter = op->store.write_iter; /* marshall the parameters */ bp = call->request; @@ -1142,9 +1104,9 @@ void yfs_fs_store_data(struct afs_operation *op) bp = xdr_encode_u32(bp, 0); /* RPC flags */ bp = xdr_encode_YFSFid(bp, &vp->fid); bp = xdr_encode_YFSStoreStatus_mtime(bp, &op->mtime); - bp = xdr_encode_u64(bp, pos); - bp = xdr_encode_u64(bp, size); - bp = xdr_encode_u64(bp, i_size); + bp = xdr_encode_u64(bp, op->store.pos); + bp = xdr_encode_u64(bp, op->store.size); + bp = xdr_encode_u64(bp, op->store.i_size); yfs_check_req(call, bp); trace_afs_make_fs_call(call, &vp->fid); @@ -323,16 +323,13 @@ static void aio_free_ring(struct kioctx *ctx) } } -static int aio_ring_mremap(struct vm_area_struct *vma, unsigned long flags) +static int aio_ring_mremap(struct vm_area_struct *vma) { struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; struct kioctx_table *table; int i, res = -EINVAL; - if (flags & MREMAP_DONTUNMAP) - return -EINVAL; - spin_lock(&mm->ioctx_lock); rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 054f97b07754..918826eaceea 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -87,6 +87,7 @@ struct autofs_wait_queue { autofs_wqt_t wait_queue_token; /* We use the following to see what we are waiting for */ struct qstr name; + u32 offset; u32 dev; u64 ino; kuid_t uid; diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index a1c7701007e7..b3fefd6237c3 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -355,7 +355,7 @@ static struct dentry *should_expire(struct dentry *dentry, return NULL; } - if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { + if (d_is_symlink(dentry)) { pr_debug("checking symlink %p %pd\n", dentry, dentry); /* Forced expire, user space handles busy mounts */ diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c index 5ced859dac53..16b5fca0626e 100644 --- a/fs/autofs/waitq.c +++ b/fs/autofs/waitq.c @@ -30,7 +30,7 @@ void autofs_catatonic_mode(struct autofs_sb_info *sbi) while (wq) { nwq = wq->next; wq->status = -ENOENT; /* Magic is gone - report failure */ - kfree(wq->name.name); + kfree(wq->name.name - wq->offset); wq->name.name = NULL; wq->wait_ctr--; wake_up_interruptible(&wq->queue); @@ -175,51 +175,6 @@ static void autofs_notify_daemon(struct autofs_sb_info *sbi, fput(pipe); } -static int autofs_getpath(struct autofs_sb_info *sbi, - struct dentry *dentry, char *name) -{ - struct dentry *root = sbi->sb->s_root; - struct dentry *tmp; - char *buf; - char *p; - int len; - unsigned seq; - -rename_retry: - buf = name; - len = 0; - - seq = read_seqbegin(&rename_lock); - rcu_read_lock(); - spin_lock(&sbi->fs_lock); - for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) - len += tmp->d_name.len + 1; - - if (!len || --len > NAME_MAX) { - spin_unlock(&sbi->fs_lock); - rcu_read_unlock(); - if (read_seqretry(&rename_lock, seq)) - goto rename_retry; - return 0; - } - - *(buf + len) = '\0'; - p = buf + len - dentry->d_name.len; - strncpy(p, dentry->d_name.name, dentry->d_name.len); - - for (tmp = dentry->d_parent; tmp != root ; tmp = tmp->d_parent) { - *(--p) = '/'; - p -= tmp->d_name.len; - strncpy(p, tmp->d_name.name, tmp->d_name.len); - } - spin_unlock(&sbi->fs_lock); - rcu_read_unlock(); - if (read_seqretry(&rename_lock, seq)) - goto rename_retry; - - return len; -} - static struct autofs_wait_queue * autofs_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr) { @@ -352,6 +307,7 @@ int autofs_wait(struct autofs_sb_info *sbi, struct qstr qstr; char *name; int status, ret, type; + unsigned int offset = 0; pid_t pid; pid_t tgid; @@ -389,20 +345,23 @@ int autofs_wait(struct autofs_sb_info *sbi, return -ENOMEM; /* If this is a direct mount request create a dummy name */ - if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) + if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) { + qstr.name = name; qstr.len = sprintf(name, "%p", dentry); - else { - qstr.len = autofs_getpath(sbi, dentry, name); - if (!qstr.len) { + } else { + char *p = dentry_path_raw(dentry, name, NAME_MAX); + if (IS_ERR(p)) { kfree(name); return -ENOENT; } + qstr.name = ++p; // skip the leading slash + qstr.len = strlen(p); + offset = p - name; } - qstr.name = name; qstr.hash = full_name_hash(dentry, name, qstr.len); if (mutex_lock_interruptible(&sbi->wq_mutex)) { - kfree(qstr.name); + kfree(name); return -EINTR; } @@ -410,7 +369,7 @@ int autofs_wait(struct autofs_sb_info *sbi, if (ret <= 0) { if (ret != -EINTR) mutex_unlock(&sbi->wq_mutex); - kfree(qstr.name); + kfree(name); return ret; } @@ -418,7 +377,7 @@ int autofs_wait(struct autofs_sb_info *sbi, /* Create a new wait queue */ wq = kmalloc(sizeof(struct autofs_wait_queue), GFP_KERNEL); if (!wq) { - kfree(qstr.name); + kfree(name); mutex_unlock(&sbi->wq_mutex); return -ENOMEM; } @@ -430,6 +389,7 @@ int autofs_wait(struct autofs_sb_info *sbi, sbi->queues = wq; init_waitqueue_head(&wq->queue); memcpy(&wq->name, &qstr, sizeof(struct qstr)); + wq->offset = offset; wq->dev = autofs_get_dev(sbi); wq->ino = autofs_get_ino(sbi); wq->uid = current_uid(); @@ -469,7 +429,7 @@ int autofs_wait(struct autofs_sb_info *sbi, (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, notify); mutex_unlock(&sbi->wq_mutex); - kfree(qstr.name); + kfree(name); } /* @@ -540,7 +500,7 @@ int autofs_wait_release(struct autofs_sb_info *sbi, } *wql = wq->next; /* Unlink from chain */ - kfree(wq->name.name); + kfree(wq->name.name - wq->offset); wq->name.name = NULL; /* Do not wait on this queue */ wq->status = status; wake_up(&wq->queue); diff --git a/fs/befs/TODO b/fs/befs/TODO deleted file mode 100644 index 3250921aa2e6..000000000000 --- a/fs/befs/TODO +++ /dev/null @@ -1,14 +0,0 @@ -TODO -========== - -* Convert comments to the Kernel-Doc format. - -* Befs_fs.h has gotten big and messy. No reason not to break it up into - smaller peices. - -* See if Alexander Viro's option parser made it into the kernel tree. - Use that if we can. (include/linux/parser.h) - -* See if we really need separate types for on-disk and in-memory - representations of the superblock and inode. - diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index b12ba98ae9f5..187b3f2b9202 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2267,8 +2267,7 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; /* Align to page */ - if (!dump_skip(cprm, dataoff - cprm->pos)) - goto end_coredump; + dump_skip_to(cprm, dataoff); for (i = 0; i < vma_count; i++) { struct core_vma_metadata *meta = vma_meta + i; @@ -2276,7 +2275,6 @@ static int elf_core_dump(struct coredump_params *cprm) if (!dump_user_range(cprm, meta->start, meta->dump_size)) goto end_coredump; } - dump_truncate(cprm); if (!elf_core_write_extra_data(cprm)) goto end_coredump; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 3cfd6cd46f26..2c99b102c860 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1631,8 +1631,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) goto end_coredump; } - if (!dump_skip(cprm, dataoff - cprm->pos)) - goto end_coredump; + dump_skip_to(cprm, dataoff); if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count)) goto end_coredump; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index b9c658e0548e..a1072c6a2341 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -74,6 +74,12 @@ #define MAX_SHARED_LIBS (1) #endif +#ifdef CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET +#define DATA_START_OFFSET_WORDS (0) +#else +#define DATA_START_OFFSET_WORDS (MAX_SHARED_LIBS) +#endif + struct lib_info { struct { unsigned long start_code; /* Start of text segment */ @@ -576,7 +582,8 @@ static int load_flat_file(struct linux_binprm *bprm, goto err; } - len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); + len = data_len + extra + + DATA_START_OFFSET_WORDS * sizeof(unsigned long); len = PAGE_ALIGN(len); realdatastart = vm_mmap(NULL, 0, len, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); @@ -591,7 +598,7 @@ static int load_flat_file(struct linux_binprm *bprm, goto err; } datapos = ALIGN(realdatastart + - MAX_SHARED_LIBS * sizeof(unsigned long), + DATA_START_OFFSET_WORDS * sizeof(unsigned long), FLAT_DATA_ALIGN); pr_debug("Allocated data+bss+stack (%u bytes): %lx\n", @@ -622,7 +629,8 @@ static int load_flat_file(struct linux_binprm *bprm, memp_size = len; } else { - len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(u32); + len = text_len + data_len + extra + + DATA_START_OFFSET_WORDS * sizeof(u32); len = PAGE_ALIGN(len); textpos = vm_mmap(NULL, 0, len, PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); @@ -638,7 +646,7 @@ static int load_flat_file(struct linux_binprm *bprm, realdatastart = textpos + ntohl(hdr->data_start); datapos = ALIGN(realdatastart + - MAX_SHARED_LIBS * sizeof(u32), + DATA_START_OFFSET_WORDS * sizeof(u32), FLAT_DATA_ALIGN); reloc = (__be32 __user *) @@ -714,7 +722,7 @@ static int load_flat_file(struct linux_binprm *bprm, ret = result; pr_err("Unable to read code+data+bss, errno %d\n", ret); vm_munmap(textpos, text_len + data_len + extra + - MAX_SHARED_LIBS * sizeof(u32)); + DATA_START_OFFSET_WORDS * sizeof(u32)); goto err; } } diff --git a/fs/block_dev.c b/fs/block_dev.c index 09d6f7229db9..6cc4d4cfe0c2 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -79,7 +79,7 @@ static void kill_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; - if (mapping->nrpages == 0 && mapping->nrexceptional == 0) + if (mapping_empty(mapping)) return; invalidate_bh_lrus(); @@ -1240,14 +1240,19 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); int bdev_disk_changed(struct block_device *bdev, bool invalidate) { struct gendisk *disk = bdev->bd_disk; - int ret; + int ret = 0; lockdep_assert_held(&bdev->bd_mutex); + if (!(disk->flags & GENHD_FL_UP)) + return -ENXIO; + rescan: - ret = blk_drop_partitions(bdev); - if (ret) - return ret; + if (bdev->bd_part_count) + return -EBUSY; + sync_blockdev(bdev); + invalidate_bdev(bdev); + blk_drop_partitions(disk); clear_bit(GD_NEED_PART_SCAN, &disk->state); @@ -1263,9 +1268,6 @@ rescan: if (disk_part_scan_enabled(disk) || !(disk->flags & GENHD_FL_REMOVABLE)) set_capacity(disk, 0); - } else { - if (disk->fops->revalidate_disk) - disk->fops->revalidate_disk(disk); } if (get_capacity(disk)) { @@ -1299,6 +1301,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) struct gendisk *disk = bdev->bd_disk; int ret = 0; + if (!(disk->flags & GENHD_FL_UP)) + return -ENXIO; + if (!bdev->bd_openers) { if (!bdev_is_partition(bdev)) { ret = 0; @@ -1333,8 +1338,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) whole->bd_part_count++; mutex_unlock(&whole->bd_mutex); - if (!(disk->flags & GENHD_FL_UP) || - !bdev_nr_sectors(bdev)) { + if (!bdev_nr_sectors(bdev)) { __blkdev_put(whole, mode, 1); bdput(whole); return -ENXIO; @@ -1365,16 +1369,12 @@ struct block_device *blkdev_get_no_open(dev_t dev) struct block_device *bdev; struct gendisk *disk; - down_read(&bdev_lookup_sem); bdev = bdget(dev); if (!bdev) { - up_read(&bdev_lookup_sem); blk_request_module(dev); - down_read(&bdev_lookup_sem); - bdev = bdget(dev); if (!bdev) - goto unlock; + return NULL; } disk = bdev->bd_disk; @@ -1384,14 +1384,11 @@ struct block_device *blkdev_get_no_open(dev_t dev) goto put_disk; if (!try_module_get(bdev->bd_disk->fops->owner)) goto put_disk; - up_read(&bdev_lookup_sem); return bdev; put_disk: put_disk(disk); bdput: bdput(bdev); -unlock: - up_read(&bdev_lookup_sem); return NULL; } @@ -1437,10 +1434,6 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) if (ret) return ERR_PTR(ret); - /* - * If we lost a race with 'disk' being deleted, try again. See md.c. - */ -retry: bdev = blkdev_get_no_open(dev); if (!bdev) return ERR_PTR(-ENXIO); @@ -1487,8 +1480,6 @@ abort_claiming: disk_unblock_events(disk); put_blkdev: blkdev_put_no_open(bdev); - if (ret == -ERESTARTSYS) - goto retry; return ERR_PTR(ret); } EXPORT_SYMBOL(blkdev_get_by_dev); @@ -1684,6 +1675,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); struct blk_plug plug; + size_t shorted = 0; ssize_t ret; if (bdev_read_only(I_BDEV(bd_inode))) @@ -1701,12 +1693,17 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) return -EOPNOTSUPP; - iov_iter_truncate(from, size - iocb->ki_pos); + size -= iocb->ki_pos; + if (iov_iter_count(from) > size) { + shorted = iov_iter_count(from) - size; + iov_iter_truncate(from, size); + } blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); if (ret > 0) ret = generic_write_sync(iocb, ret); + iov_iter_reexpand(from, iov_iter_count(from) + shorted); blk_finish_plug(&plug); return ret; } @@ -1718,13 +1715,21 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); loff_t pos = iocb->ki_pos; + size_t shorted = 0; + ssize_t ret; if (pos >= size) return 0; size -= pos; - iov_iter_truncate(to, size); - return generic_file_read_iter(iocb, to); + if (iov_iter_count(to) > size) { + shorted = iov_iter_count(to) - size; + iov_iter_truncate(to, size); + } + + ret = generic_file_read_iter(iocb, to); + iov_iter_reexpand(to, iov_iter_count(to) + shorted); + return ret; } EXPORT_SYMBOL_GPL(blkdev_read_iter); diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 5cb4f3b88285..1346d698463a 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -632,16 +632,13 @@ static noinline int add_ra_bio_pages(struct inode *inode, free_extent_map(em); if (page->index == end_index) { - char *userpage; size_t zero_offset = offset_in_page(isize); if (zero_offset) { int zeros; zeros = PAGE_SIZE - zero_offset; - userpage = kmap_atomic(page); - memset(userpage + zero_offset, 0, zeros); + memzero_page(page, zero_offset, zeros); flush_dcache_page(page); - kunmap_atomic(userpage); } } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 0f5b0b12762b..9fb76829a281 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -3207,6 +3207,9 @@ void btrfs_update_inode_bytes(struct btrfs_inode *inode, /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int btrfs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); int __pure btrfs_is_empty_uuid(u8 *uuid); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index d9f20ca3ac7d..dee2dafbc872 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -3421,15 +3421,12 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } if (page->index == last_byte >> PAGE_SHIFT) { - char *userpage; size_t zero_offset = offset_in_page(last_byte); if (zero_offset) { iosize = PAGE_SIZE - zero_offset; - userpage = kmap_atomic(page); - memset(userpage + zero_offset, 0, iosize); + memzero_page(page, zero_offset, iosize); flush_dcache_page(page); - kunmap_atomic(userpage); } } begin_page_read(fs_info, page); @@ -3438,14 +3435,11 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, u64 disk_bytenr; if (cur >= last_byte) { - char *userpage; struct extent_state *cached = NULL; iosize = PAGE_SIZE - pg_offset; - userpage = kmap_atomic(page); - memset(userpage + pg_offset, 0, iosize); + memzero_page(page, pg_offset, iosize); flush_dcache_page(page); - kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); unlock_extent_cached(tree, cur, @@ -3528,13 +3522,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { - char *userpage; struct extent_state *cached = NULL; - userpage = kmap_atomic(page); - memset(userpage + pg_offset, 0, iosize); + memzero_page(page, pg_offset, iosize); flush_dcache_page(page); - kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); @@ -3845,12 +3836,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, } if (page->index == end_index) { - char *userpage; - - userpage = kmap_atomic(page); - memset(userpage + pg_offset, 0, - PAGE_SIZE - pg_offset); - kunmap_atomic(userpage); + memzero_page(page, pg_offset, PAGE_SIZE - pg_offset); flush_dcache_page(page); } diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index f5d32d85247a..46f392943f4d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -646,17 +646,12 @@ again: if (!ret) { unsigned long offset = offset_in_page(total_compressed); struct page *page = pages[nr_pages - 1]; - char *kaddr; /* zero the tail end of the last page, we might be * sending it down to disk */ - if (offset) { - kaddr = kmap_atomic(page); - memset(kaddr + offset, 0, - PAGE_SIZE - offset); - kunmap_atomic(kaddr); - } + if (offset) + memzero_page(page, offset, PAGE_SIZE - offset); will_compress = 1; } } @@ -4846,7 +4841,6 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; - char *kaddr; bool only_release_metadata = false; u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; @@ -4938,15 +4932,13 @@ again: if (offset != blocksize) { if (!len) len = blocksize - offset; - kaddr = kmap(page); if (front) - memset(kaddr + (block_start - page_offset(page)), - 0, offset); + memzero_page(page, (block_start - page_offset(page)), + offset); else - memset(kaddr + (block_start - page_offset(page)) + offset, - 0, len); + memzero_page(page, (block_start - page_offset(page)) + offset, + len); flush_dcache_page(page); - kunmap(page); } ClearPageChecked(page); set_page_dirty(page); @@ -6845,11 +6837,9 @@ static noinline int uncompress_inline(struct btrfs_path *path, * cover that region here. */ - if (max_size + pg_offset < PAGE_SIZE) { - char *map = kmap(page); - memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - pg_offset); - kunmap(page); - } + if (max_size + pg_offset < PAGE_SIZE) + memzero_page(page, pg_offset + max_size, + PAGE_SIZE - max_size - pg_offset); kfree(tmp); return ret; } @@ -8519,7 +8509,6 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; - char *kaddr; unsigned long zero_start; loff_t size; vm_fault_t ret; @@ -8633,10 +8622,8 @@ again: zero_start = PAGE_SIZE; if (zero_start != PAGE_SIZE) { - kaddr = kmap(page); - memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start); + memzero_page(page, zero_start, PAGE_SIZE - zero_start); flush_dcache_page(page); - kunmap(page); } ClearPageChecked(page); set_page_dirty(page); @@ -10638,6 +10625,8 @@ static const struct inode_operations btrfs_dir_inode_operations = { .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, .tmpfile = btrfs_tmpfile, + .fileattr_get = btrfs_fileattr_get, + .fileattr_set = btrfs_fileattr_set, }; static const struct file_operations btrfs_dir_file_operations = { @@ -10691,6 +10680,8 @@ static const struct inode_operations btrfs_file_inode_operations = { .get_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, + .fileattr_get = btrfs_fileattr_get, + .fileattr_set = btrfs_fileattr_set, }; static const struct inode_operations btrfs_special_inode_operations = { .getattr = btrfs_getattr, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 0ba0e4ddaf6b..5dc2fd843ae3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -26,6 +26,7 @@ #include <linux/btrfs.h> #include <linux/uaccess.h> #include <linux/iversion.h> +#include <linux/fileattr.h> #include "ctree.h" #include "disk-io.h" #include "export.h" @@ -153,16 +154,6 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode) new_fl); } -static int btrfs_ioctl_getflags(struct file *file, void __user *arg) -{ - struct btrfs_inode *binode = BTRFS_I(file_inode(file)); - unsigned int flags = btrfs_inode_flags_to_fsflags(binode->flags); - - if (copy_to_user(arg, &flags, sizeof(flags))) - return -EFAULT; - return 0; -} - /* * Check if @flags are a supported and valid set of FS_*_FL flags and that * the old and new flags are not conflicting @@ -201,9 +192,22 @@ static int check_fsflags_compatible(struct btrfs_fs_info *fs_info, return 0; } -static int btrfs_ioctl_setflags(struct file *file, void __user *arg) +/* + * Set flags/xflags from the internal inode flags. The remaining items of + * fsxattr are zeroed. + */ +int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) { - struct inode *inode = file_inode(file); + struct btrfs_inode *binode = BTRFS_I(d_inode(dentry)); + + fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode->flags)); + return 0; +} + +int btrfs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) +{ + struct inode *inode = d_inode(dentry); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_inode *binode = BTRFS_I(inode); struct btrfs_root *root = binode->root; @@ -213,34 +217,21 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) const char *comp = NULL; u32 binode_flags; - if (!inode_owner_or_capable(&init_user_ns, inode)) - return -EPERM; - if (btrfs_root_readonly(root)) return -EROFS; - if (copy_from_user(&fsflags, arg, sizeof(fsflags))) - return -EFAULT; - - ret = mnt_want_write_file(file); - if (ret) - return ret; + if (fileattr_has_fsx(fa)) + return -EOPNOTSUPP; - btrfs_inode_lock(inode, 0); - fsflags = btrfs_mask_fsflags_for_type(inode, fsflags); + fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags); old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags); - - ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags); - if (ret) - goto out_unlock; - ret = check_fsflags(old_fsflags, fsflags); if (ret) - goto out_unlock; + return ret; ret = check_fsflags_compatible(fs_info, fsflags); if (ret) - goto out_unlock; + return ret; binode_flags = binode->flags; if (fsflags & FS_SYNC_FL) @@ -263,6 +254,16 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) binode_flags |= BTRFS_INODE_NOATIME; else binode_flags &= ~BTRFS_INODE_NOATIME; + + /* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */ + if (!fa->flags_valid) { + /* 1 item for the inode */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + goto update_flags; + } + if (fsflags & FS_DIRSYNC_FL) binode_flags |= BTRFS_INODE_DIRSYNC; else @@ -303,10 +304,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) binode_flags |= BTRFS_INODE_NOCOMPRESS; } else if (fsflags & FS_COMPR_FL) { - if (IS_SWAPFILE(inode)) { - ret = -ETXTBSY; - goto out_unlock; - } + if (IS_SWAPFILE(inode)) + return -ETXTBSY; binode_flags |= BTRFS_INODE_COMPRESS; binode_flags &= ~BTRFS_INODE_NOCOMPRESS; @@ -323,10 +322,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) * 2 for properties */ trans = btrfs_start_transaction(root, 3); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out_unlock; - } + if (IS_ERR(trans)) + return PTR_ERR(trans); if (comp) { ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp, @@ -344,6 +341,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } } +update_flags: binode->flags = binode_flags; btrfs_sync_inode_flags_to_i_flags(inode); inode_inc_iversion(inode); @@ -352,44 +350,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) out_end_trans: btrfs_end_transaction(trans); - out_unlock: - btrfs_inode_unlock(inode, 0); - mnt_drop_write_file(file); return ret; } -/* - * Translate btrfs internal inode flags to xflags as expected by the - * FS_IOC_FSGETXATT ioctl. Filter only the supported ones, unknown flags are - * silently dropped. - */ -static unsigned int btrfs_inode_flags_to_xflags(unsigned int flags) -{ - unsigned int xflags = 0; - - if (flags & BTRFS_INODE_APPEND) - xflags |= FS_XFLAG_APPEND; - if (flags & BTRFS_INODE_IMMUTABLE) - xflags |= FS_XFLAG_IMMUTABLE; - if (flags & BTRFS_INODE_NOATIME) - xflags |= FS_XFLAG_NOATIME; - if (flags & BTRFS_INODE_NODUMP) - xflags |= FS_XFLAG_NODUMP; - if (flags & BTRFS_INODE_SYNC) - xflags |= FS_XFLAG_SYNC; - - return xflags; -} - -/* Check if @flags are a supported and valid set of FS_XFLAGS_* flags */ -static int check_xflags(unsigned int flags) -{ - if (flags & ~(FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE | FS_XFLAG_NOATIME | - FS_XFLAG_NODUMP | FS_XFLAG_SYNC)) - return -EOPNOTSUPP; - return 0; -} - bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type) { @@ -402,111 +365,6 @@ void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); } -/* - * Set the xflags from the internal inode flags. The remaining items of fsxattr - * are zeroed. - */ -static int btrfs_ioctl_fsgetxattr(struct file *file, void __user *arg) -{ - struct btrfs_inode *binode = BTRFS_I(file_inode(file)); - struct fsxattr fa; - - simple_fill_fsxattr(&fa, btrfs_inode_flags_to_xflags(binode->flags)); - if (copy_to_user(arg, &fa, sizeof(fa))) - return -EFAULT; - - return 0; -} - -static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg) -{ - struct inode *inode = file_inode(file); - struct btrfs_inode *binode = BTRFS_I(inode); - struct btrfs_root *root = binode->root; - struct btrfs_trans_handle *trans; - struct fsxattr fa, old_fa; - unsigned old_flags; - unsigned old_i_flags; - int ret = 0; - - if (!inode_owner_or_capable(&init_user_ns, inode)) - return -EPERM; - - if (btrfs_root_readonly(root)) - return -EROFS; - - if (copy_from_user(&fa, arg, sizeof(fa))) - return -EFAULT; - - ret = check_xflags(fa.fsx_xflags); - if (ret) - return ret; - - if (fa.fsx_extsize != 0 || fa.fsx_projid != 0 || fa.fsx_cowextsize != 0) - return -EOPNOTSUPP; - - ret = mnt_want_write_file(file); - if (ret) - return ret; - - btrfs_inode_lock(inode, 0); - - old_flags = binode->flags; - old_i_flags = inode->i_flags; - - simple_fill_fsxattr(&old_fa, - btrfs_inode_flags_to_xflags(binode->flags)); - ret = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa); - if (ret) - goto out_unlock; - - if (fa.fsx_xflags & FS_XFLAG_SYNC) - binode->flags |= BTRFS_INODE_SYNC; - else - binode->flags &= ~BTRFS_INODE_SYNC; - if (fa.fsx_xflags & FS_XFLAG_IMMUTABLE) - binode->flags |= BTRFS_INODE_IMMUTABLE; - else - binode->flags &= ~BTRFS_INODE_IMMUTABLE; - if (fa.fsx_xflags & FS_XFLAG_APPEND) - binode->flags |= BTRFS_INODE_APPEND; - else - binode->flags &= ~BTRFS_INODE_APPEND; - if (fa.fsx_xflags & FS_XFLAG_NODUMP) - binode->flags |= BTRFS_INODE_NODUMP; - else - binode->flags &= ~BTRFS_INODE_NODUMP; - if (fa.fsx_xflags & FS_XFLAG_NOATIME) - binode->flags |= BTRFS_INODE_NOATIME; - else - binode->flags &= ~BTRFS_INODE_NOATIME; - - /* 1 item for the inode */ - trans = btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out_unlock; - } - - btrfs_sync_inode_flags_to_i_flags(inode); - inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); - - btrfs_end_transaction(trans); - -out_unlock: - if (ret) { - binode->flags = old_flags; - inode->i_flags = old_i_flags; - } - - btrfs_inode_unlock(inode, 0); - mnt_drop_write_file(file); - - return ret; -} - static int btrfs_ioctl_getversion(struct file *file, int __user *arg) { struct inode *inode = file_inode(file); @@ -4932,10 +4790,6 @@ long btrfs_ioctl(struct file *file, unsigned int void __user *argp = (void __user *)arg; switch (cmd) { - case FS_IOC_GETFLAGS: - return btrfs_ioctl_getflags(file, argp); - case FS_IOC_SETFLAGS: - return btrfs_ioctl_setflags(file, argp); case FS_IOC_GETVERSION: return btrfs_ioctl_getversion(file, argp); case FS_IOC_GETFSLABEL: @@ -5061,10 +4915,6 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_get_features(fs_info, argp); case BTRFS_IOC_SET_FEATURES: return btrfs_ioctl_set_features(file, argp); - case FS_IOC_FSGETXATTR: - return btrfs_ioctl_fsgetxattr(file, argp); - case FS_IOC_FSSETXATTR: - return btrfs_ioctl_fssetxattr(file, argp); case BTRFS_IOC_GET_SUBVOL_INFO: return btrfs_ioctl_get_subvol_info(file, argp); case BTRFS_IOC_GET_SUBVOL_ROOTREF: @@ -5084,12 +4934,6 @@ long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) * handling is necessary. */ switch (cmd) { - case FS_IOC32_GETFLAGS: - cmd = FS_IOC_GETFLAGS; - break; - case FS_IOC32_SETFLAGS: - cmd = FS_IOC_SETFLAGS; - break; case FS_IOC32_GETVERSION: cmd = FS_IOC_GETVERSION; break; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index bb768c309bc3..244d499ebc72 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -1633,7 +1633,8 @@ struct btrfs_plug_cb { /* * rbios on the plug list are sorted for easier merging. */ -static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) +static int plug_cmp(void *priv, const struct list_head *a, + const struct list_head *b) { struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, plug_list); diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 58ddc7ed9e84..9178da07cc9c 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -129,12 +129,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode, * So what's in the range [500, 4095] corresponds to zeroes. */ if (datal < block_size) { - char *map; - - map = kmap(page); - memset(map + datal, 0, block_size - datal); + memzero_page(page, datal, block_size - datal); flush_dcache_page(page); - kunmap(page); } SetPageUptodate(page); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 324a9a078b6a..dbcf8bb2f3b9 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -4157,7 +4157,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, return ret; } -static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) +static int extent_cmp(void *priv, const struct list_head *a, + const struct list_head *b) { struct extent_map *em1, *em2; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bc53939fef48..47d27059d064 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1224,7 +1224,8 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, return 0; } -static int devid_cmp(void *priv, struct list_head *a, struct list_head *b) +static int devid_cmp(void *priv, const struct list_head *a, + const struct list_head *b) { struct btrfs_device *dev1, *dev2; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index d524acf7b3e5..c3fa7d3fa770 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -375,7 +375,6 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in, unsigned long bytes_left; unsigned long total_out = 0; unsigned long pg_offset = 0; - char *kaddr; destlen = min_t(unsigned long, destlen, PAGE_SIZE); bytes_left = destlen; @@ -455,9 +454,7 @@ next: * end of the inline extent (destlen) to the end of the page */ if (pg_offset < destlen) { - kaddr = kmap_atomic(dest_page); - memset(kaddr + pg_offset, 0, destlen - pg_offset); - kunmap_atomic(kaddr); + memzero_page(dest_page, pg_offset, destlen - pg_offset); } return ret; } diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 8e9626d63976..3e26b466476a 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -631,7 +631,6 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in, size_t ret2; unsigned long total_out = 0; unsigned long pg_offset = 0; - char *kaddr; stream = ZSTD_initDStream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); @@ -696,9 +695,7 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in, ret = 0; finish: if (pg_offset < destlen) { - kaddr = kmap_atomic(dest_page); - memset(kaddr + pg_offset, 0, destlen - pg_offset); - kunmap_atomic(kaddr); + memzero_page(dest_page, pg_offset, destlen - pg_offset); } return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index 0cb7ffd4977c..ea48c01fb76b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1020,11 +1020,7 @@ grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp) pgoff_t index; int sizebits; - sizebits = -1; - do { - sizebits++; - } while ((size << sizebits) < PAGE_SIZE); - + sizebits = PAGE_SHIFT - __ffs(size); index = block >> sizebits; /* @@ -1264,6 +1260,15 @@ static void bh_lru_install(struct buffer_head *bh) int i; check_irqs_on(); + /* + * the refcount of buffer_head in bh_lru prevents dropping the + * attached page(i.e., try_to_free_buffers) so it could cause + * failing page migration. + * Skip putting upcoming bh into bh_lru until migration is done. + */ + if (lru_cache_disabled()) + return; + bh_lru_lock(); b = this_cpu_ptr(&bh_lrus); @@ -1404,6 +1409,15 @@ __bread_gfp(struct block_device *bdev, sector_t block, } EXPORT_SYMBOL(__bread_gfp); +static void __invalidate_bh_lrus(struct bh_lru *b) +{ + int i; + + for (i = 0; i < BH_LRU_SIZE; i++) { + brelse(b->bhs[i]); + b->bhs[i] = NULL; + } +} /* * invalidate_bh_lrus() is called rarely - but not only at unmount. * This doesn't race because it runs in each cpu either in irq @@ -1412,16 +1426,12 @@ EXPORT_SYMBOL(__bread_gfp); static void invalidate_bh_lru(void *arg) { struct bh_lru *b = &get_cpu_var(bh_lrus); - int i; - for (i = 0; i < BH_LRU_SIZE; i++) { - brelse(b->bhs[i]); - b->bhs[i] = NULL; - } + __invalidate_bh_lrus(b); put_cpu_var(bh_lrus); } -static bool has_bh_in_lru(int cpu, void *dummy) +bool has_bh_in_lru(int cpu, void *dummy) { struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu); int i; @@ -1440,6 +1450,16 @@ void invalidate_bh_lrus(void) } EXPORT_SYMBOL_GPL(invalidate_bh_lrus); +void invalidate_bh_lrus_cpu(int cpu) +{ + struct bh_lru *b; + + bh_lru_lock(); + b = per_cpu_ptr(&bh_lrus, cpu); + __invalidate_bh_lrus(b); + bh_lru_unlock(); +} + void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset) { diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile index 891dedda5905..2227dc2d5498 100644 --- a/fs/cachefiles/Makefile +++ b/fs/cachefiles/Makefile @@ -7,6 +7,7 @@ cachefiles-y := \ bind.o \ daemon.o \ interface.o \ + io.o \ key.o \ main.o \ namei.o \ diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 5efa6a3702c0..da3948fdb615 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -319,8 +319,8 @@ static void cachefiles_drop_object(struct fscache_object *_object) /* * dispose of a reference to an object */ -static void cachefiles_put_object(struct fscache_object *_object, - enum fscache_obj_ref_trace why) +void cachefiles_put_object(struct fscache_object *_object, + enum fscache_obj_ref_trace why) { struct cachefiles_object *object; struct fscache_cache *cache; @@ -568,4 +568,5 @@ const struct fscache_cache_ops cachefiles_cache_ops = { .uncache_page = cachefiles_uncache_page, .dissociate_pages = cachefiles_dissociate_pages, .check_consistency = cachefiles_check_consistency, + .begin_read_operation = cachefiles_begin_read_operation, }; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index cf9bd6401c2d..4ed83aa5253b 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -150,6 +150,9 @@ extern int cachefiles_has_space(struct cachefiles_cache *cache, */ extern const struct fscache_cache_ops cachefiles_cache_ops; +void cachefiles_put_object(struct fscache_object *_object, + enum fscache_obj_ref_trace why); + /* * key.c */ @@ -218,6 +221,12 @@ extern int cachefiles_write_page(struct fscache_storage *, struct page *); extern void cachefiles_uncache_page(struct fscache_object *, struct page *); /* + * rdwr2.c + */ +extern int cachefiles_begin_read_operation(struct netfs_read_request *, + struct fscache_retrieval *); + +/* * security.c */ extern int cachefiles_get_security_ID(struct cachefiles_cache *cache); diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c new file mode 100644 index 000000000000..b13fb45fc3f3 --- /dev/null +++ b/fs/cachefiles/io.c @@ -0,0 +1,420 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* kiocb-using read/write + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/mount.h> +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/uio.h> +#include <linux/sched/mm.h> +#include <linux/netfs.h> +#include "internal.h" + +struct cachefiles_kiocb { + struct kiocb iocb; + refcount_t ki_refcnt; + loff_t start; + union { + size_t skipped; + size_t len; + }; + netfs_io_terminated_t term_func; + void *term_func_priv; + bool was_async; +}; + +static inline void cachefiles_put_kiocb(struct cachefiles_kiocb *ki) +{ + if (refcount_dec_and_test(&ki->ki_refcnt)) { + fput(ki->iocb.ki_filp); + kfree(ki); + } +} + +/* + * Handle completion of a read from the cache. + */ +static void cachefiles_read_complete(struct kiocb *iocb, long ret, long ret2) +{ + struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb); + + _enter("%ld,%ld", ret, ret2); + + if (ki->term_func) { + if (ret >= 0) + ret += ki->skipped; + ki->term_func(ki->term_func_priv, ret, ki->was_async); + } + + cachefiles_put_kiocb(ki); +} + +/* + * Initiate a read from the cache. + */ +static int cachefiles_read(struct netfs_cache_resources *cres, + loff_t start_pos, + struct iov_iter *iter, + bool seek_data, + netfs_io_terminated_t term_func, + void *term_func_priv) +{ + struct cachefiles_kiocb *ki; + struct file *file = cres->cache_priv2; + unsigned int old_nofs; + ssize_t ret = -ENOBUFS; + size_t len = iov_iter_count(iter), skipped = 0; + + _enter("%pD,%li,%llx,%zx/%llx", + file, file_inode(file)->i_ino, start_pos, len, + i_size_read(file->f_inode)); + + /* If the caller asked us to seek for data before doing the read, then + * we should do that now. If we find a gap, we fill it with zeros. + */ + if (seek_data) { + loff_t off = start_pos, off2; + + off2 = vfs_llseek(file, off, SEEK_DATA); + if (off2 < 0 && off2 >= (loff_t)-MAX_ERRNO && off2 != -ENXIO) { + skipped = 0; + ret = off2; + goto presubmission_error; + } + + if (off2 == -ENXIO || off2 >= start_pos + len) { + /* The region is beyond the EOF or there's no more data + * in the region, so clear the rest of the buffer and + * return success. + */ + iov_iter_zero(len, iter); + skipped = len; + ret = 0; + goto presubmission_error; + } + + skipped = off2 - off; + iov_iter_zero(skipped, iter); + } + + ret = -ENOBUFS; + ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL); + if (!ki) + goto presubmission_error; + + refcount_set(&ki->ki_refcnt, 2); + ki->iocb.ki_filp = file; + ki->iocb.ki_pos = start_pos + skipped; + ki->iocb.ki_flags = IOCB_DIRECT; + ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file)); + ki->iocb.ki_ioprio = get_current_ioprio(); + ki->skipped = skipped; + ki->term_func = term_func; + ki->term_func_priv = term_func_priv; + ki->was_async = true; + + if (ki->term_func) + ki->iocb.ki_complete = cachefiles_read_complete; + + get_file(ki->iocb.ki_filp); + + old_nofs = memalloc_nofs_save(); + ret = vfs_iocb_iter_read(file, &ki->iocb, iter); + memalloc_nofs_restore(old_nofs); + switch (ret) { + case -EIOCBQUEUED: + goto in_progress; + + case -ERESTARTSYS: + case -ERESTARTNOINTR: + case -ERESTARTNOHAND: + case -ERESTART_RESTARTBLOCK: + /* There's no easy way to restart the syscall since other AIO's + * may be already running. Just fail this IO with EINTR. + */ + ret = -EINTR; + fallthrough; + default: + ki->was_async = false; + cachefiles_read_complete(&ki->iocb, ret, 0); + if (ret > 0) + ret = 0; + break; + } + +in_progress: + cachefiles_put_kiocb(ki); + _leave(" = %zd", ret); + return ret; + +presubmission_error: + if (term_func) + term_func(term_func_priv, ret < 0 ? ret : skipped, false); + return ret; +} + +/* + * Handle completion of a write to the cache. + */ +static void cachefiles_write_complete(struct kiocb *iocb, long ret, long ret2) +{ + struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb); + struct inode *inode = file_inode(ki->iocb.ki_filp); + + _enter("%ld,%ld", ret, ret2); + + /* Tell lockdep we inherited freeze protection from submission thread */ + __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); + __sb_end_write(inode->i_sb, SB_FREEZE_WRITE); + + if (ki->term_func) + ki->term_func(ki->term_func_priv, ret, ki->was_async); + + cachefiles_put_kiocb(ki); +} + +/* + * Initiate a write to the cache. + */ +static int cachefiles_write(struct netfs_cache_resources *cres, + loff_t start_pos, + struct iov_iter *iter, + netfs_io_terminated_t term_func, + void *term_func_priv) +{ + struct cachefiles_kiocb *ki; + struct inode *inode; + struct file *file = cres->cache_priv2; + unsigned int old_nofs; + ssize_t ret = -ENOBUFS; + size_t len = iov_iter_count(iter); + + _enter("%pD,%li,%llx,%zx/%llx", + file, file_inode(file)->i_ino, start_pos, len, + i_size_read(file->f_inode)); + + ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL); + if (!ki) + goto presubmission_error; + + refcount_set(&ki->ki_refcnt, 2); + ki->iocb.ki_filp = file; + ki->iocb.ki_pos = start_pos; + ki->iocb.ki_flags = IOCB_DIRECT | IOCB_WRITE; + ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file)); + ki->iocb.ki_ioprio = get_current_ioprio(); + ki->start = start_pos; + ki->len = len; + ki->term_func = term_func; + ki->term_func_priv = term_func_priv; + ki->was_async = true; + + if (ki->term_func) + ki->iocb.ki_complete = cachefiles_write_complete; + + /* Open-code file_start_write here to grab freeze protection, which + * will be released by another thread in aio_complete_rw(). Fool + * lockdep by telling it the lock got released so that it doesn't + * complain about the held lock when we return to userspace. + */ + inode = file_inode(file); + __sb_start_write(inode->i_sb, SB_FREEZE_WRITE); + __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE); + + get_file(ki->iocb.ki_filp); + + old_nofs = memalloc_nofs_save(); + ret = vfs_iocb_iter_write(file, &ki->iocb, iter); + memalloc_nofs_restore(old_nofs); + switch (ret) { + case -EIOCBQUEUED: + goto in_progress; + + case -ERESTARTSYS: + case -ERESTARTNOINTR: + case -ERESTARTNOHAND: + case -ERESTART_RESTARTBLOCK: + /* There's no easy way to restart the syscall since other AIO's + * may be already running. Just fail this IO with EINTR. + */ + ret = -EINTR; + fallthrough; + default: + ki->was_async = false; + cachefiles_write_complete(&ki->iocb, ret, 0); + if (ret > 0) + ret = 0; + break; + } + +in_progress: + cachefiles_put_kiocb(ki); + _leave(" = %zd", ret); + return ret; + +presubmission_error: + if (term_func) + term_func(term_func_priv, -ENOMEM, false); + return -ENOMEM; +} + +/* + * Prepare a read operation, shortening it to a cached/uncached + * boundary as appropriate. + */ +static enum netfs_read_source cachefiles_prepare_read(struct netfs_read_subrequest *subreq, + loff_t i_size) +{ + struct fscache_retrieval *op = subreq->rreq->cache_resources.cache_priv; + struct cachefiles_object *object; + struct cachefiles_cache *cache; + const struct cred *saved_cred; + struct file *file = subreq->rreq->cache_resources.cache_priv2; + loff_t off, to; + + _enter("%zx @%llx/%llx", subreq->len, subreq->start, i_size); + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + if (!file) + goto cache_fail_nosec; + + if (subreq->start >= i_size) + return NETFS_FILL_WITH_ZEROES; + + cachefiles_begin_secure(cache, &saved_cred); + + off = vfs_llseek(file, subreq->start, SEEK_DATA); + if (off < 0 && off >= (loff_t)-MAX_ERRNO) { + if (off == (loff_t)-ENXIO) + goto download_and_store; + goto cache_fail; + } + + if (off >= subreq->start + subreq->len) + goto download_and_store; + + if (off > subreq->start) { + off = round_up(off, cache->bsize); + subreq->len = off - subreq->start; + goto download_and_store; + } + + to = vfs_llseek(file, subreq->start, SEEK_HOLE); + if (to < 0 && to >= (loff_t)-MAX_ERRNO) + goto cache_fail; + + if (to < subreq->start + subreq->len) { + if (subreq->start + subreq->len >= i_size) + to = round_up(to, cache->bsize); + else + to = round_down(to, cache->bsize); + subreq->len = to - subreq->start; + } + + cachefiles_end_secure(cache, saved_cred); + return NETFS_READ_FROM_CACHE; + +download_and_store: + if (cachefiles_has_space(cache, 0, (subreq->len + PAGE_SIZE - 1) / PAGE_SIZE) == 0) + __set_bit(NETFS_SREQ_WRITE_TO_CACHE, &subreq->flags); +cache_fail: + cachefiles_end_secure(cache, saved_cred); +cache_fail_nosec: + return NETFS_DOWNLOAD_FROM_SERVER; +} + +/* + * Prepare for a write to occur. + */ +static int cachefiles_prepare_write(struct netfs_cache_resources *cres, + loff_t *_start, size_t *_len, loff_t i_size) +{ + loff_t start = *_start; + size_t len = *_len, down; + + /* Round to DIO size */ + down = start - round_down(start, PAGE_SIZE); + *_start = start - down; + *_len = round_up(down + len, PAGE_SIZE); + return 0; +} + +/* + * Clean up an operation. + */ +static void cachefiles_end_operation(struct netfs_cache_resources *cres) +{ + struct fscache_retrieval *op = cres->cache_priv; + struct file *file = cres->cache_priv2; + + _enter(""); + + if (file) + fput(file); + if (op) { + fscache_op_complete(&op->op, false); + fscache_put_retrieval(op); + } + + _leave(""); +} + +static const struct netfs_cache_ops cachefiles_netfs_cache_ops = { + .end_operation = cachefiles_end_operation, + .read = cachefiles_read, + .write = cachefiles_write, + .prepare_read = cachefiles_prepare_read, + .prepare_write = cachefiles_prepare_write, +}; + +/* + * Open the cache file when beginning a cache operation. + */ +int cachefiles_begin_read_operation(struct netfs_read_request *rreq, + struct fscache_retrieval *op) +{ + struct cachefiles_object *object; + struct cachefiles_cache *cache; + struct path path; + struct file *file; + + _enter(""); + + object = container_of(op->op.object, + struct cachefiles_object, fscache); + cache = container_of(object->fscache.cache, + struct cachefiles_cache, cache); + + path.mnt = cache->mnt; + path.dentry = object->backer; + file = open_with_fake_path(&path, O_RDWR | O_LARGEFILE | O_DIRECT, + d_inode(object->backer), cache->cache_cred); + if (IS_ERR(file)) + return PTR_ERR(file); + if (!S_ISREG(file_inode(file)->i_mode)) + goto error_file; + if (unlikely(!file->f_op->read_iter) || + unlikely(!file->f_op->write_iter)) { + pr_notice("Cache does not support read_iter and write_iter\n"); + goto error_file; + } + + fscache_get_retrieval(op); + rreq->cache_resources.cache_priv = op; + rreq->cache_resources.cache_priv2 = file; + rreq->cache_resources.ops = &cachefiles_netfs_cache_ops; + rreq->cookie_debug_id = object->fscache.debug_id; + _leave(""); + return 0; + +error_file: + fput(file); + return -EIO; +} diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 471e40156065..94df854147d3 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -6,6 +6,7 @@ config CEPH_FS select LIBCRC32C select CRYPTO_AES select CRYPTO + select NETFS_SUPPORT default n help Choose Y or M here to include support for mounting the diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 26e66436f005..c1570fada3d8 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -12,6 +12,7 @@ #include <linux/signal.h> #include <linux/iversion.h> #include <linux/ktime.h> +#include <linux/netfs.h> #include "super.h" #include "mds_client.h" @@ -61,6 +62,9 @@ (CONGESTION_ON_THRESH(congestion_kb) - \ (CONGESTION_ON_THRESH(congestion_kb) >> 2)) +static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, + struct page *page, void **_fsdata); + static inline struct ceph_snap_context *page_snap_context(struct page *page) { if (PagePrivate(page)) @@ -124,8 +128,7 @@ static int ceph_set_page_dirty(struct page *page) * PagePrivate so that we get invalidatepage callback. */ BUG_ON(PagePrivate(page)); - page->private = (unsigned long)snapc; - SetPagePrivate(page); + attach_page_private(page, snapc); ret = __set_page_dirty_nobuffers(page); WARN_ON(!PageLocked(page)); @@ -144,19 +147,19 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, { struct inode *inode; struct ceph_inode_info *ci; - struct ceph_snap_context *snapc = page_snap_context(page); + struct ceph_snap_context *snapc; + + wait_on_page_fscache(page); inode = page->mapping->host; ci = ceph_inode(inode); - if (offset != 0 || length != PAGE_SIZE) { + if (offset != 0 || length != thp_size(page)) { dout("%p invalidatepage %p idx %lu partial dirty page %u~%u\n", inode, page, page->index, offset, length); return; } - ceph_invalidate_fscache_page(inode, page); - WARN_ON(!PageLocked(page)); if (!PagePrivate(page)) return; @@ -164,333 +167,222 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, dout("%p invalidatepage %p idx %lu full dirty page\n", inode, page, page->index); + snapc = detach_page_private(page); ceph_put_wrbuffer_cap_refs(ci, 1, snapc); ceph_put_snap_context(snapc); - page->private = 0; - ClearPagePrivate(page); } -static int ceph_releasepage(struct page *page, gfp_t g) +static int ceph_releasepage(struct page *page, gfp_t gfp) { dout("%p releasepage %p idx %lu (%sdirty)\n", page->mapping->host, page, page->index, PageDirty(page) ? "" : "not "); - /* Can we release the page from the cache? */ - if (!ceph_release_fscache_page(page, g)) - return 0; - + if (PageFsCache(page)) { + if (!(gfp & __GFP_DIRECT_RECLAIM) || !(gfp & __GFP_FS)) + return 0; + wait_on_page_fscache(page); + } return !PagePrivate(page); } -/* read a single page, without unlocking it. */ -static int ceph_do_readpage(struct file *filp, struct page *page) +static void ceph_netfs_expand_readahead(struct netfs_read_request *rreq) { - struct inode *inode = file_inode(filp); + struct inode *inode = rreq->mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_osd_client *osdc = &fsc->client->osdc; - struct ceph_osd_request *req; - struct ceph_vino vino = ceph_vino(inode); - int err = 0; - u64 off = page_offset(page); - u64 len = PAGE_SIZE; - - if (off >= i_size_read(inode)) { - zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); - return 0; - } - - if (ci->i_inline_version != CEPH_INLINE_NONE) { - /* - * Uptodate inline data should have been added - * into page cache while getting Fcr caps. - */ - if (off == 0) - return -EINVAL; - zero_user_segment(page, 0, PAGE_SIZE); - SetPageUptodate(page); - return 0; - } - - err = ceph_readpage_from_fscache(inode, page); - if (err == 0) - return -EINPROGRESS; - - dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n", - vino.ino, vino.snap, filp, off, len, page, page->index); - req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, 0, 1, - CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, NULL, - ci->i_truncate_seq, ci->i_truncate_size, - false); - if (IS_ERR(req)) - return PTR_ERR(req); + struct ceph_file_layout *lo = &ci->i_layout; + u32 blockoff; + u64 blockno; - osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false); + /* Expand the start downward */ + blockno = div_u64_rem(rreq->start, lo->stripe_unit, &blockoff); + rreq->start = blockno * lo->stripe_unit; + rreq->len += blockoff; - err = ceph_osdc_start_request(osdc, req, false); - if (!err) - err = ceph_osdc_wait_request(osdc, req); - - ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, - req->r_end_latency, err); - - ceph_osdc_put_request(req); - dout("readpage result %d\n", err); - - if (err == -ENOENT) - err = 0; - if (err < 0) { - ceph_fscache_readpage_cancel(inode, page); - if (err == -EBLOCKLISTED) - fsc->blocklisted = true; - goto out; - } - if (err < PAGE_SIZE) - /* zero fill remainder of page */ - zero_user_segment(page, err, PAGE_SIZE); - else - flush_dcache_page(page); - - SetPageUptodate(page); - ceph_readpage_to_fscache(inode, page); - -out: - return err < 0 ? err : 0; + /* Now, round up the length to the next block */ + rreq->len = roundup(rreq->len, lo->stripe_unit); } -static int ceph_readpage(struct file *filp, struct page *page) +static bool ceph_netfs_clamp_length(struct netfs_read_subrequest *subreq) { - int r = ceph_do_readpage(filp, page); - if (r != -EINPROGRESS) - unlock_page(page); - else - r = 0; - return r; + struct inode *inode = subreq->rreq->mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); + struct ceph_inode_info *ci = ceph_inode(inode); + u64 objno, objoff; + u32 xlen; + + /* Truncate the extent at the end of the current block */ + ceph_calc_file_object_mapping(&ci->i_layout, subreq->start, subreq->len, + &objno, &objoff, &xlen); + subreq->len = min(xlen, fsc->mount_options->rsize); + return true; } -/* - * Finish an async read(ahead) op. - */ -static void finish_read(struct ceph_osd_request *req) +static void finish_netfs_read(struct ceph_osd_request *req) { - struct inode *inode = req->r_inode; - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_osd_data *osd_data; - int rc = req->r_result <= 0 ? req->r_result : 0; - int bytes = req->r_result >= 0 ? req->r_result : 0; + struct ceph_fs_client *fsc = ceph_inode_to_client(req->r_inode); + struct ceph_osd_data *osd_data = osd_req_op_extent_osd_data(req, 0); + struct netfs_read_subrequest *subreq = req->r_priv; int num_pages; - int i; + int err = req->r_result; - dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); - if (rc == -EBLOCKLISTED) - ceph_inode_to_client(inode)->blocklisted = true; + ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency, + req->r_end_latency, err); - /* unlock all pages, zeroing any data we didn't read */ - osd_data = osd_req_op_extent_osd_data(req, 0); - BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES); - num_pages = calc_pages_for((u64)osd_data->alignment, - (u64)osd_data->length); - for (i = 0; i < num_pages; i++) { - struct page *page = osd_data->pages[i]; - - if (rc < 0 && rc != -ENOENT) { - ceph_fscache_readpage_cancel(inode, page); - goto unlock; - } - if (bytes < (int)PAGE_SIZE) { - /* zero (remainder of) page */ - int s = bytes < 0 ? 0 : bytes; - zero_user_segment(page, s, PAGE_SIZE); - } - dout("finish_read %p uptodate %p idx %lu\n", inode, page, - page->index); - flush_dcache_page(page); - SetPageUptodate(page); - ceph_readpage_to_fscache(inode, page); -unlock: - unlock_page(page); - put_page(page); - bytes -= PAGE_SIZE; - } + dout("%s: result %d subreq->len=%zu i_size=%lld\n", __func__, req->r_result, + subreq->len, i_size_read(req->r_inode)); - ceph_update_read_latency(&fsc->mdsc->metric, req->r_start_latency, - req->r_end_latency, rc); + /* no object means success but no data */ + if (err == -ENOENT) + err = 0; + else if (err == -EBLOCKLISTED) + fsc->blocklisted = true; + + if (err >= 0 && err < subreq->len) + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + + netfs_subreq_terminated(subreq, err, true); - kfree(osd_data->pages); + num_pages = calc_pages_for(osd_data->alignment, osd_data->length); + ceph_put_page_vector(osd_data->pages, num_pages, false); + iput(req->r_inode); } -/* - * start an async read(ahead) operation. return nr_pages we submitted - * a read for on success, or negative error code. - */ -static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx, - struct list_head *page_list, int max) +static void ceph_netfs_issue_op(struct netfs_read_subrequest *subreq) { - struct ceph_osd_client *osdc = - &ceph_inode_to_client(inode)->client->osdc; + struct netfs_read_request *rreq = subreq->rreq; + struct inode *inode = rreq->mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); - struct page *page = lru_to_page(page_list); - struct ceph_vino vino; + struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_osd_request *req; - u64 off; - u64 len; - int i; + struct ceph_vino vino = ceph_vino(inode); + struct iov_iter iter; struct page **pages; - pgoff_t next_index; - int nr_pages = 0; - int got = 0; - int ret = 0; - - if (!rw_ctx) { - /* caller of readpages does not hold buffer and read caps - * (fadvise, madvise and readahead cases) */ - int want = CEPH_CAP_FILE_CACHE; - ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, - true, &got); - if (ret < 0) { - dout("start_read %p, error getting cap\n", inode); - } else if (!(got & want)) { - dout("start_read %p, no cache cap\n", inode); - ret = 0; - } - if (ret <= 0) { - if (got) - ceph_put_cap_refs(ci, got); - while (!list_empty(page_list)) { - page = lru_to_page(page_list); - list_del(&page->lru); - put_page(page); - } - return ret; - } - } - - off = (u64) page_offset(page); + size_t page_off; + int err = 0; + u64 len = subreq->len; - /* count pages */ - next_index = page->index; - list_for_each_entry_reverse(page, page_list, lru) { - if (page->index != next_index) - break; - nr_pages++; - next_index++; - if (max && nr_pages == max) - break; - } - len = nr_pages << PAGE_SHIFT; - dout("start_read %p nr_pages %d is %lld~%lld\n", inode, nr_pages, - off, len); - vino = ceph_vino(inode); - req = ceph_osdc_new_request(osdc, &ci->i_layout, vino, off, &len, - 0, 1, CEPH_OSD_OP_READ, - CEPH_OSD_FLAG_READ, NULL, - ci->i_truncate_seq, ci->i_truncate_size, - false); + req = ceph_osdc_new_request(&fsc->client->osdc, &ci->i_layout, vino, subreq->start, &len, + 0, 1, CEPH_OSD_OP_READ, + CEPH_OSD_FLAG_READ | fsc->client->osdc.client->options->read_from_replica, + NULL, ci->i_truncate_seq, ci->i_truncate_size, false); if (IS_ERR(req)) { - ret = PTR_ERR(req); + err = PTR_ERR(req); + req = NULL; goto out; } - /* build page vector */ - nr_pages = calc_pages_for(0, len); - pages = kmalloc_array(nr_pages, sizeof(*pages), GFP_KERNEL); - if (!pages) { - ret = -ENOMEM; - goto out_put; - } - for (i = 0; i < nr_pages; ++i) { - page = list_entry(page_list->prev, struct page, lru); - BUG_ON(PageLocked(page)); - list_del(&page->lru); - - dout("start_read %p adding %p idx %lu\n", inode, page, - page->index); - if (add_to_page_cache_lru(page, &inode->i_data, page->index, - GFP_KERNEL)) { - ceph_fscache_uncache_page(inode, page); - put_page(page); - dout("start_read %p add_to_page_cache failed %p\n", - inode, page); - nr_pages = i; - if (nr_pages > 0) { - len = nr_pages << PAGE_SHIFT; - osd_req_op_extent_update(req, 0, len); - break; - } - goto out_pages; - } - pages[i] = page; + dout("%s: pos=%llu orig_len=%zu len=%llu\n", __func__, subreq->start, subreq->len, len); + iov_iter_xarray(&iter, READ, &rreq->mapping->i_pages, subreq->start, len); + err = iov_iter_get_pages_alloc(&iter, &pages, len, &page_off); + if (err < 0) { + dout("%s: iov_ter_get_pages_alloc returned %d\n", __func__, err); + goto out; } + + /* should always give us a page-aligned read */ + WARN_ON_ONCE(page_off); + len = err; + osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0, false, false); - req->r_callback = finish_read; + req->r_callback = finish_netfs_read; + req->r_priv = subreq; req->r_inode = inode; + ihold(inode); - dout("start_read %p starting %p %lld~%lld\n", inode, req, off, len); - ret = ceph_osdc_start_request(osdc, req, false); - if (ret < 0) - goto out_pages; + err = ceph_osdc_start_request(req->r_osdc, req, false); + if (err) + iput(inode); +out: ceph_osdc_put_request(req); + if (err) + netfs_subreq_terminated(subreq, err, false); + dout("%s: result %d\n", __func__, err); +} - /* After adding locked pages to page cache, the inode holds cache cap. - * So we can drop our cap refs. */ - if (got) - ceph_put_cap_refs(ci, got); +static void ceph_init_rreq(struct netfs_read_request *rreq, struct file *file) +{ +} - return nr_pages; +static void ceph_readahead_cleanup(struct address_space *mapping, void *priv) +{ + struct inode *inode = mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + int got = (uintptr_t)priv; -out_pages: - for (i = 0; i < nr_pages; ++i) { - ceph_fscache_readpage_cancel(inode, pages[i]); - unlock_page(pages[i]); - } - ceph_put_page_vector(pages, nr_pages, false); -out_put: - ceph_osdc_put_request(req); -out: if (got) ceph_put_cap_refs(ci, got); - return ret; } +const struct netfs_read_request_ops ceph_netfs_read_ops = { + .init_rreq = ceph_init_rreq, + .is_cache_enabled = ceph_is_cache_enabled, + .begin_cache_operation = ceph_begin_cache_operation, + .issue_op = ceph_netfs_issue_op, + .expand_readahead = ceph_netfs_expand_readahead, + .clamp_length = ceph_netfs_clamp_length, + .check_write_begin = ceph_netfs_check_write_begin, + .cleanup = ceph_readahead_cleanup, +}; -/* - * Read multiple pages. Leave pages we don't read + unlock in page_list; - * the caller (VM) cleans them up. - */ -static int ceph_readpages(struct file *file, struct address_space *mapping, - struct list_head *page_list, unsigned nr_pages) +/* read a single page, without unlocking it. */ +static int ceph_readpage(struct file *file, struct page *page) { struct inode *inode = file_inode(file); - struct ceph_fs_client *fsc = ceph_inode_to_client(inode); - struct ceph_file_info *fi = file->private_data; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_vino vino = ceph_vino(inode); + u64 off = page_offset(page); + u64 len = thp_size(page); + + if (ci->i_inline_version != CEPH_INLINE_NONE) { + /* + * Uptodate inline data should have been added + * into page cache while getting Fcr caps. + */ + if (off == 0) { + unlock_page(page); + return -EINVAL; + } + zero_user_segment(page, 0, thp_size(page)); + SetPageUptodate(page); + unlock_page(page); + return 0; + } + + dout("readpage ino %llx.%llx file %p off %llu len %llu page %p index %lu\n", + vino.ino, vino.snap, file, off, len, page, page->index); + + return netfs_readpage(file, page, &ceph_netfs_read_ops, NULL); +} + +static void ceph_readahead(struct readahead_control *ractl) +{ + struct inode *inode = file_inode(ractl->file); + struct ceph_file_info *fi = ractl->file->private_data; struct ceph_rw_context *rw_ctx; - int rc = 0; - int max = 0; + int got = 0; + int ret = 0; if (ceph_inode(inode)->i_inline_version != CEPH_INLINE_NONE) - return -EINVAL; + return; - rc = ceph_readpages_from_fscache(mapping->host, mapping, page_list, - &nr_pages); + rw_ctx = ceph_find_rw_context(fi); + if (!rw_ctx) { + /* + * readahead callers do not necessarily hold Fcb caps + * (e.g. fadvise, madvise). + */ + int want = CEPH_CAP_FILE_CACHE; - if (rc == 0) - goto out; + ret = ceph_try_get_caps(inode, CEPH_CAP_FILE_RD, want, true, &got); + if (ret < 0) + dout("start_read %p, error getting cap\n", inode); + else if (!(got & want)) + dout("start_read %p, no cache cap\n", inode); - rw_ctx = ceph_find_rw_context(fi); - max = fsc->mount_options->rsize >> PAGE_SHIFT; - dout("readpages %p file %p ctx %p nr_pages %d max %d\n", - inode, file, rw_ctx, nr_pages, max); - while (!list_empty(page_list)) { - rc = start_read(inode, rw_ctx, page_list, max); - if (rc < 0) - goto out; + if (ret <= 0) + return; } -out: - ceph_fscache_readpages_cancel(inode, page_list); - - dout("readpages %p file %p ret %d\n", inode, file, rc); - return rc; + netfs_readahead(ractl, &ceph_netfs_read_ops, (void *)(uintptr_t)got); } struct ceph_writeback_ctl @@ -585,8 +477,8 @@ static u64 get_writepages_data_length(struct inode *inode, spin_unlock(&ci->i_ceph_lock); WARN_ON(!found); } - if (end > page_offset(page) + PAGE_SIZE) - end = page_offset(page) + PAGE_SIZE; + if (end > page_offset(page) + thp_size(page)) + end = page_offset(page) + thp_size(page); return end > start ? end - start : 0; } @@ -604,7 +496,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) struct ceph_snap_context *snapc, *oldest; loff_t page_off = page_offset(page); int err; - loff_t len = PAGE_SIZE; + loff_t len = thp_size(page); struct ceph_writeback_ctl ceph_wbc; struct ceph_osd_client *osdc = &fsc->client->osdc; struct ceph_osd_request *req; @@ -632,7 +524,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) /* is this a partial page at end of file? */ if (page_off >= ceph_wbc.i_size) { dout("%p page eof %llu\n", page, ceph_wbc.i_size); - page->mapping->a_ops->invalidatepage(page, 0, PAGE_SIZE); + page->mapping->a_ops->invalidatepage(page, 0, thp_size(page)); return 0; } @@ -658,7 +550,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) } /* it may be a short write due to an object boundary */ - WARN_ON_ONCE(len > PAGE_SIZE); + WARN_ON_ONCE(len > thp_size(page)); osd_req_op_extent_osd_data_pages(req, 0, &page, len, 0, false, false); dout("writepage %llu~%llu (%llu bytes)\n", page_off, len, len); @@ -667,7 +559,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) if (!err) err = ceph_osdc_wait_request(osdc, req); - ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, err); ceph_osdc_put_request(req); @@ -695,8 +587,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) dout("writepage cleaned page %p\n", page); err = 0; /* vfs expects us to return 0 */ } - page->private = 0; - ClearPagePrivate(page); + oldest = detach_page_private(page); + WARN_ON_ONCE(oldest != snapc); end_page_writeback(page); ceph_put_wrbuffer_cap_refs(ci, 1, snapc); ceph_put_snap_context(snapc); /* page's reference */ @@ -755,7 +647,7 @@ static void writepages_finish(struct ceph_osd_request *req) ceph_clear_error_write(ci); } - ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, rc); /* @@ -788,11 +680,9 @@ static void writepages_finish(struct ceph_osd_request *req) clear_bdi_congested(inode_to_bdi(inode), BLK_RW_ASYNC); - ceph_put_snap_context(page_snap_context(page)); - page->private = 0; - ClearPagePrivate(page); - dout("unlocking %p\n", page); + ceph_put_snap_context(detach_page_private(page)); end_page_writeback(page); + dout("unlocking %p\n", page); if (remove_page) generic_error_remove_page(inode->i_mapping, @@ -949,7 +839,7 @@ get_more_pages: page_offset(page) >= i_size_read(inode)) && clear_page_dirty_for_io(page)) mapping->a_ops->invalidatepage(page, - 0, PAGE_SIZE); + 0, thp_size(page)); unlock_page(page); continue; } @@ -1038,7 +928,7 @@ get_more_pages: pages[locked_pages++] = page; pvec.pages[i] = NULL; - len += PAGE_SIZE; + len += thp_size(page); } /* did we get anything? */ @@ -1087,7 +977,7 @@ new_request: BUG_ON(IS_ERR(req)); } BUG_ON(len < page_offset(pages[locked_pages - 1]) + - PAGE_SIZE - offset); + thp_size(page) - offset); req->r_callback = writepages_finish; req->r_inode = inode; @@ -1117,7 +1007,7 @@ new_request: } set_page_writeback(pages[i]); - len += PAGE_SIZE; + len += thp_size(page); } if (ceph_wbc.size_stable) { @@ -1126,7 +1016,7 @@ new_request: /* writepages_finish() clears writeback pages * according to the data length, so make sure * data length covers all locked pages */ - u64 min_len = len + 1 - PAGE_SIZE; + u64 min_len = len + 1 - thp_size(page); len = get_writepages_data_length(inode, pages[i - 1], offset); len = max(len, min_len); @@ -1302,6 +1192,31 @@ ceph_find_incompatible(struct page *page) return NULL; } +static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned int len, + struct page *page, void **_fsdata) +{ + struct inode *inode = file_inode(file); + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_snap_context *snapc; + + snapc = ceph_find_incompatible(page); + if (snapc) { + int r; + + unlock_page(page); + put_page(page); + if (IS_ERR(snapc)) + return PTR_ERR(snapc); + + ceph_queue_writeback(inode); + r = wait_event_killable(ci->i_cap_wq, + context_is_writeable_or_written(inode, snapc)); + ceph_put_snap_context(snapc); + return r == 0 ? -EAGAIN : r; + } + return 0; +} + /* * We are only allowed to write into/dirty the page if the page is * clean, or already dirty within the same snap context. @@ -1312,75 +1227,47 @@ static int ceph_write_begin(struct file *file, struct address_space *mapping, { struct inode *inode = file_inode(file); struct ceph_inode_info *ci = ceph_inode(inode); - struct ceph_snap_context *snapc; struct page *page = NULL; pgoff_t index = pos >> PAGE_SHIFT; - int pos_in_page = pos & ~PAGE_MASK; - int r = 0; + int r; - dout("write_begin file %p inode %p page %p %d~%d\n", file, inode, page, (int)pos, (int)len); - - for (;;) { + /* + * Uninlining should have already been done and everything updated, EXCEPT + * for inline_version sent to the MDS. + */ + if (ci->i_inline_version != CEPH_INLINE_NONE) { page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) { - r = -ENOMEM; - break; - } - - snapc = ceph_find_incompatible(page); - if (snapc) { - if (IS_ERR(snapc)) { - r = PTR_ERR(snapc); - break; - } - unlock_page(page); - put_page(page); - page = NULL; - ceph_queue_writeback(inode); - r = wait_event_killable(ci->i_cap_wq, - context_is_writeable_or_written(inode, snapc)); - ceph_put_snap_context(snapc); - if (r != 0) - break; - continue; - } - - if (PageUptodate(page)) { - dout(" page %p already uptodate\n", page); - break; - } + if (!page) + return -ENOMEM; /* - * In some cases we don't need to read at all: - * - full page write - * - write that lies completely beyond EOF - * - write that covers the the page from start to EOF or beyond it + * The inline_version on a new inode is set to 1. If that's the + * case, then the page is brand new and isn't yet Uptodate. */ - if ((pos_in_page == 0 && len == PAGE_SIZE) || - (pos >= i_size_read(inode)) || - (pos_in_page == 0 && (pos + len) >= i_size_read(inode))) { - zero_user_segments(page, 0, pos_in_page, - pos_in_page + len, PAGE_SIZE); - break; + r = 0; + if (index == 0 && ci->i_inline_version != 1) { + if (!PageUptodate(page)) { + WARN_ONCE(1, "ceph: write_begin called on still-inlined inode (inline_version %llu)!\n", + ci->i_inline_version); + r = -EINVAL; + } + goto out; } - - /* - * We need to read it. If we get back -EINPROGRESS, then the page was - * handed off to fscache and it will be unlocked when the read completes. - * Refind the page in that case so we can reacquire the page lock. Otherwise - * we got a hard error or the read was completed synchronously. - */ - r = ceph_do_readpage(file, page); - if (r != -EINPROGRESS) - break; + zero_user_segment(page, 0, thp_size(page)); + SetPageUptodate(page); + goto out; } + r = netfs_write_begin(file, inode->i_mapping, pos, len, 0, &page, NULL, + &ceph_netfs_read_ops, NULL); +out: + if (r == 0) + wait_on_page_fscache(page); if (r < 0) { - if (page) { - unlock_page(page); + if (page) put_page(page); - } } else { + WARN_ON_ONCE(!PageLocked(page)); *pagep = page; } return r; @@ -1438,7 +1325,7 @@ static ssize_t ceph_direct_io(struct kiocb *iocb, struct iov_iter *iter) const struct address_space_operations ceph_aops = { .readpage = ceph_readpage, - .readpages = ceph_readpages, + .readahead = ceph_readahead, .writepage = ceph_writepage, .writepages = ceph_writepages_start, .write_begin = ceph_write_begin, @@ -1470,7 +1357,6 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) struct inode *inode = file_inode(vma->vm_file); struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; - struct page *pinned_page = NULL; loff_t off = (loff_t)vmf->pgoff << PAGE_SHIFT; int want, got, err; sigset_t oldset; @@ -1478,21 +1364,20 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) ceph_block_sigs(&oldset); - dout("filemap_fault %p %llx.%llx %llu~%zd trying to get caps\n", - inode, ceph_vinop(inode), off, (size_t)PAGE_SIZE); + dout("filemap_fault %p %llx.%llx %llu trying to get caps\n", + inode, ceph_vinop(inode), off); if (fi->fmode & CEPH_FILE_MODE_LAZY) want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; got = 0; - err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, - &got, &pinned_page); + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_RD, want, -1, &got); if (err < 0) goto out_restore; - dout("filemap_fault %p %llu~%zd got cap refs on %s\n", - inode, off, (size_t)PAGE_SIZE, ceph_cap_string(got)); + dout("filemap_fault %p %llu got cap refs on %s\n", + inode, off, ceph_cap_string(got)); if ((got & (CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO)) || ci->i_inline_version == CEPH_INLINE_NONE) { @@ -1500,14 +1385,11 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) ceph_add_rw_context(fi, &rw_ctx); ret = filemap_fault(vmf); ceph_del_rw_context(fi, &rw_ctx); - dout("filemap_fault %p %llu~%zd drop cap refs %s ret %x\n", - inode, off, (size_t)PAGE_SIZE, - ceph_cap_string(got), ret); + dout("filemap_fault %p %llu drop cap refs %s ret %x\n", + inode, off, ceph_cap_string(got), ret); } else err = -EAGAIN; - if (pinned_page) - put_page(pinned_page); ceph_put_cap_refs(ci, got); if (err != -EAGAIN) @@ -1542,8 +1424,8 @@ static vm_fault_t ceph_filemap_fault(struct vm_fault *vmf) vmf->page = page; ret = VM_FAULT_MAJOR | VM_FAULT_LOCKED; out_inline: - dout("filemap_fault %p %llu~%zd read inline data ret %x\n", - inode, off, (size_t)PAGE_SIZE, ret); + dout("filemap_fault %p %llu read inline data ret %x\n", + inode, off, ret); } out_restore: ceph_restore_sigs(&oldset); @@ -1553,9 +1435,6 @@ out_restore: return ret; } -/* - * Reuse write_begin here for simplicity. - */ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; @@ -1591,10 +1470,10 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) goto out_free; } - if (off + PAGE_SIZE <= size) - len = PAGE_SIZE; + if (off + thp_size(page) <= size) + len = thp_size(page); else - len = size & ~PAGE_MASK; + len = offset_in_thp(page, size); dout("page_mkwrite %p %llx.%llx %llu~%zd getting caps i_size %llu\n", inode, ceph_vinop(inode), off, len, size); @@ -1604,8 +1483,7 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) want = CEPH_CAP_FILE_BUFFER; got = 0; - err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, - &got, NULL); + err = ceph_get_caps(vma->vm_file, CEPH_CAP_FILE_WR, want, off + len, &got); if (err < 0) goto out_free; @@ -1832,7 +1710,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) if (!err) err = ceph_osdc_wait_request(&fsc->client->osdc, req); - ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, err); out_put: @@ -2057,6 +1935,10 @@ int ceph_pool_perm_check(struct inode *inode, int need) s64 pool; int ret, flags; + /* Only need to do this for regular files */ + if (!S_ISREG(inode->i_mode)) + return 0; + if (ci->i_vino.snap != CEPH_NOSNAP) { /* * Pool permission check needs to write to the first object. diff --git a/fs/ceph/cache.c b/fs/ceph/cache.c index 2f5cb6bc78e1..9cfadbb86568 100644 --- a/fs/ceph/cache.c +++ b/fs/ceph/cache.c @@ -173,7 +173,6 @@ void ceph_fscache_unregister_inode_cookie(struct ceph_inode_info* ci) ci->fscache = NULL; - fscache_uncache_all_inode_pages(cookie, &ci->vfs_inode); fscache_relinquish_cookie(cookie, &ci->i_vino, false); } @@ -194,7 +193,6 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) dout("fscache_file_set_cookie %p %p disabling cache\n", inode, filp); fscache_disable_cookie(ci->fscache, &ci->i_vino, false); - fscache_uncache_all_inode_pages(ci->fscache, inode); } else { fscache_enable_cookie(ci->fscache, &ci->i_vino, i_size_read(inode), ceph_fscache_can_enable, inode); @@ -205,108 +203,6 @@ void ceph_fscache_file_set_cookie(struct inode *inode, struct file *filp) } } -static void ceph_readpage_from_fscache_complete(struct page *page, void *data, int error) -{ - if (!error) - SetPageUptodate(page); - - unlock_page(page); -} - -static inline bool cache_valid(struct ceph_inode_info *ci) -{ - return ci->i_fscache_gen == ci->i_rdcache_gen; -} - - -/* Atempt to read from the fscache, - * - * This function is called from the readpage_nounlock context. DO NOT attempt to - * unlock the page here (or in the callback). - */ -int ceph_readpage_from_fscache(struct inode *inode, struct page *page) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - int ret; - - if (!cache_valid(ci)) - return -ENOBUFS; - - ret = fscache_read_or_alloc_page(ci->fscache, page, - ceph_readpage_from_fscache_complete, NULL, - GFP_KERNEL); - - switch (ret) { - case 0: /* Page found */ - dout("page read submitted\n"); - return 0; - case -ENOBUFS: /* Pages were not found, and can't be */ - case -ENODATA: /* Pages were not found */ - dout("page/inode not in cache\n"); - return ret; - default: - dout("%s: unknown error ret = %i\n", __func__, ret); - return ret; - } -} - -int ceph_readpages_from_fscache(struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - int ret; - - if (!cache_valid(ci)) - return -ENOBUFS; - - ret = fscache_read_or_alloc_pages(ci->fscache, mapping, pages, nr_pages, - ceph_readpage_from_fscache_complete, - NULL, mapping_gfp_mask(mapping)); - - switch (ret) { - case 0: /* All pages found */ - dout("all-page read submitted\n"); - return 0; - case -ENOBUFS: /* Some pages were not found, and can't be */ - case -ENODATA: /* some pages were not found */ - dout("page/inode not in cache\n"); - return ret; - default: - dout("%s: unknown error ret = %i\n", __func__, ret); - return ret; - } -} - -void ceph_readpage_to_fscache(struct inode *inode, struct page *page) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - int ret; - - if (!PageFsCache(page)) - return; - - if (!cache_valid(ci)) - return; - - ret = fscache_write_page(ci->fscache, page, i_size_read(inode), - GFP_KERNEL); - if (ret) - fscache_uncache_page(ci->fscache, page); -} - -void ceph_invalidate_fscache_page(struct inode* inode, struct page *page) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - - if (!PageFsCache(page)) - return; - - fscache_wait_on_page_write(ci->fscache, page); - fscache_uncache_page(ci->fscache, page); -} - void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) { if (fscache_cookie_valid(fsc->fscache)) { @@ -329,24 +225,3 @@ void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) } fsc->fscache = NULL; } - -/* - * caller should hold CEPH_CAP_FILE_{RD,CACHE} - */ -void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci) -{ - if (cache_valid(ci)) - return; - - /* resue i_truncate_mutex. There should be no pending - * truncate while the caller holds CEPH_CAP_FILE_RD */ - mutex_lock(&ci->i_truncate_mutex); - if (!cache_valid(ci)) { - if (fscache_check_consistency(ci->fscache, &ci->i_vino)) - fscache_invalidate(ci->fscache); - spin_lock(&ci->i_ceph_lock); - ci->i_fscache_gen = ci->i_rdcache_gen; - spin_unlock(&ci->i_ceph_lock); - } - mutex_unlock(&ci->i_truncate_mutex); -} diff --git a/fs/ceph/cache.h b/fs/ceph/cache.h index 89dbdd1eb14a..1409d6149281 100644 --- a/fs/ceph/cache.h +++ b/fs/ceph/cache.h @@ -9,6 +9,8 @@ #ifndef _CEPH_CACHE_H #define _CEPH_CACHE_H +#include <linux/netfs.h> + #ifdef CONFIG_CEPH_FSCACHE extern struct fscache_netfs ceph_cache_netfs; @@ -29,54 +31,37 @@ int ceph_readpages_from_fscache(struct inode *inode, struct address_space *mapping, struct list_head *pages, unsigned *nr_pages); -void ceph_readpage_to_fscache(struct inode *inode, struct page *page); -void ceph_invalidate_fscache_page(struct inode* inode, struct page *page); static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) { ci->fscache = NULL; - ci->i_fscache_gen = 0; } -static inline void ceph_fscache_invalidate(struct inode *inode) +static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci) { - fscache_invalidate(ceph_inode(inode)->fscache); + return ci->fscache; } -static inline void ceph_fscache_uncache_page(struct inode *inode, - struct page *page) +static inline void ceph_fscache_invalidate(struct inode *inode) { - struct ceph_inode_info *ci = ceph_inode(inode); - return fscache_uncache_page(ci->fscache, page); + fscache_invalidate(ceph_inode(inode)->fscache); } -static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) +static inline bool ceph_is_cache_enabled(struct inode *inode) { - struct inode* inode = page->mapping->host; - struct ceph_inode_info *ci = ceph_inode(inode); - return fscache_maybe_release_page(ci->fscache, page, gfp); -} + struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(inode)); -static inline void ceph_fscache_readpage_cancel(struct inode *inode, - struct page *page) -{ - struct ceph_inode_info *ci = ceph_inode(inode); - if (fscache_cookie_valid(ci->fscache) && PageFsCache(page)) - __fscache_uncache_page(ci->fscache, page); + if (!cookie) + return false; + return fscache_cookie_enabled(cookie); } -static inline void ceph_fscache_readpages_cancel(struct inode *inode, - struct list_head *pages) +static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq) { - struct ceph_inode_info *ci = ceph_inode(inode); - return fscache_readpages_cancel(ci->fscache, pages); -} + struct fscache_cookie *cookie = ceph_fscache_cookie(ceph_inode(rreq->inode)); -static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci) -{ - ci->i_fscache_gen = ci->i_rdcache_gen - 1; + return fscache_begin_read_operation(rreq, cookie); } - #else static inline int ceph_fscache_register(void) @@ -102,6 +87,11 @@ static inline void ceph_fscache_inode_init(struct ceph_inode_info *ci) { } +static inline struct fscache_cookie *ceph_fscache_cookie(struct ceph_inode_info *ci) +{ + return NULL; +} + static inline void ceph_fscache_register_inode_cookie(struct inode *inode) { } @@ -115,62 +105,19 @@ static inline void ceph_fscache_file_set_cookie(struct inode *inode, { } -static inline void ceph_fscache_revalidate_cookie(struct ceph_inode_info *ci) -{ -} - -static inline void ceph_fscache_uncache_page(struct inode *inode, - struct page *pages) -{ -} - -static inline int ceph_readpage_from_fscache(struct inode* inode, - struct page *page) -{ - return -ENOBUFS; -} - -static inline int ceph_readpages_from_fscache(struct inode *inode, - struct address_space *mapping, - struct list_head *pages, - unsigned *nr_pages) -{ - return -ENOBUFS; -} - -static inline void ceph_readpage_to_fscache(struct inode *inode, - struct page *page) -{ -} - static inline void ceph_fscache_invalidate(struct inode *inode) { } -static inline void ceph_invalidate_fscache_page(struct inode *inode, - struct page *page) +static inline bool ceph_is_cache_enabled(struct inode *inode) { + return false; } -static inline int ceph_release_fscache_page(struct page *page, gfp_t gfp) -{ - return 1; -} - -static inline void ceph_fscache_readpage_cancel(struct inode *inode, - struct page *page) -{ -} - -static inline void ceph_fscache_readpages_cancel(struct inode *inode, - struct list_head *pages) -{ -} - -static inline void ceph_disable_fscache_readpage(struct ceph_inode_info *ci) +static inline int ceph_begin_cache_operation(struct netfs_read_request *rreq) { + return -ENOBUFS; } - #endif -#endif +#endif /* _CEPH_CACHE_H */ diff --git a/fs/ceph/caps.c b/fs/ceph/caps.c index 570731c4d019..a5e93b185515 100644 --- a/fs/ceph/caps.c +++ b/fs/ceph/caps.c @@ -1390,7 +1390,7 @@ static void __prep_cap(struct cap_msg_args *arg, struct ceph_cap *cap, arg->flush_tid = flush_tid; arg->oldest_flush_tid = oldest_flush_tid; - arg->size = inode->i_size; + arg->size = i_size_read(inode); ci->i_reported_size = arg->size; arg->max_size = ci->i_wanted_max_size; if (cap == ci->i_auth_cap) { @@ -1867,6 +1867,7 @@ static int try_nonblocking_invalidate(struct inode *inode) u32 invalidating_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); + ceph_fscache_invalidate(inode); invalidate_mapping_pages(&inode->i_data, 0, -1); spin_lock(&ci->i_ceph_lock); @@ -1884,7 +1885,7 @@ static int try_nonblocking_invalidate(struct inode *inode) bool __ceph_should_report_size(struct ceph_inode_info *ci) { - loff_t size = ci->vfs_inode.i_size; + loff_t size = i_size_read(&ci->vfs_inode); /* mds will adjust max size according to the reported size */ if (ci->i_flushing_caps & CEPH_CAP_FILE_WR) return false; @@ -2730,10 +2731,6 @@ again: *got = need | want; else *got = need; - if (S_ISREG(inode->i_mode) && - (need & CEPH_CAP_FILE_RD) && - !(*got & CEPH_CAP_FILE_CACHE)) - ceph_disable_fscache_readpage(ci); ceph_take_cap_refs(ci, *got, true); ret = 1; } @@ -2858,8 +2855,7 @@ int ceph_try_get_caps(struct inode *inode, int need, int want, * due to a small max_size, make sure we check_max_size (and possibly * ask the mds) so we don't get hung up indefinitely. */ -int ceph_get_caps(struct file *filp, int need, int want, - loff_t endoff, int *got, struct page **pinned_page) +int ceph_get_caps(struct file *filp, int need, int want, loff_t endoff, int *got) { struct ceph_file_info *fi = filp->private_data; struct inode *inode = file_inode(filp); @@ -2957,11 +2953,11 @@ int ceph_get_caps(struct file *filp, int need, int want, struct page *page = find_get_page(inode->i_mapping, 0); if (page) { - if (PageUptodate(page)) { - *pinned_page = page; - break; - } + bool uptodate = PageUptodate(page); + put_page(page); + if (uptodate) + break; } /* * drop cap refs first because getattr while @@ -2983,11 +2979,6 @@ int ceph_get_caps(struct file *filp, int need, int want, } break; } - - if (S_ISREG(ci->vfs_inode.i_mode) && - (_got & CEPH_CAP_FILE_RD) && (_got & CEPH_CAP_FILE_CACHE)) - ceph_fscache_revalidate_cookie(ci); - *got = _got; return 0; } @@ -3308,7 +3299,7 @@ static void handle_cap_grant(struct inode *inode, dout("handle_cap_grant inode %p cap %p mds%d seq %d %s\n", inode, cap, session->s_mds, seq, ceph_cap_string(newcaps)); dout(" size %llu max_size %llu, i_size %llu\n", size, max_size, - inode->i_size); + i_size_read(inode)); /* @@ -3358,7 +3349,13 @@ static void handle_cap_grant(struct inode *inode, if ((newcaps & CEPH_CAP_AUTH_SHARED) && (extra_info->issued & CEPH_CAP_AUTH_EXCL) == 0) { - inode->i_mode = le32_to_cpu(grant->mode); + umode_t mode = le32_to_cpu(grant->mode); + + if (inode_wrong_type(inode, mode)) + pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n", + ceph_vinop(inode), inode->i_mode, mode); + else + inode->i_mode = mode; inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(grant->uid)); inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(grant->gid)); ci->i_btime = extra_info->btime; diff --git a/fs/ceph/debugfs.c b/fs/ceph/debugfs.c index 66989c880adb..425f3356332a 100644 --- a/fs/ceph/debugfs.c +++ b/fs/ceph/debugfs.c @@ -162,34 +162,34 @@ static int metric_show(struct seq_file *s, void *p) seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n"); seq_printf(s, "-----------------------------------------------------------------------------------\n"); - spin_lock(&m->read_latency_lock); + spin_lock(&m->read_metric_lock); total = m->total_reads; sum = m->read_latency_sum; avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; min = m->read_latency_min; max = m->read_latency_max; sq = m->read_latency_sq_sum; - spin_unlock(&m->read_latency_lock); + spin_unlock(&m->read_metric_lock); CEPH_METRIC_SHOW("read", total, avg, min, max, sq); - spin_lock(&m->write_latency_lock); + spin_lock(&m->write_metric_lock); total = m->total_writes; sum = m->write_latency_sum; avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; min = m->write_latency_min; max = m->write_latency_max; sq = m->write_latency_sq_sum; - spin_unlock(&m->write_latency_lock); + spin_unlock(&m->write_metric_lock); CEPH_METRIC_SHOW("write", total, avg, min, max, sq); - spin_lock(&m->metadata_latency_lock); + spin_lock(&m->metadata_metric_lock); total = m->total_metadatas; sum = m->metadata_latency_sum; avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0; min = m->metadata_latency_min; max = m->metadata_latency_max; sq = m->metadata_latency_sq_sum; - spin_unlock(&m->metadata_latency_lock); + spin_unlock(&m->metadata_metric_lock); CEPH_METRIC_SHOW("metadata", total, avg, min, max, sq); seq_printf(s, "\n"); diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 83d9358854fb..5624fae7a603 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -631,10 +631,12 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) switch (whence) { case SEEK_CUR: offset += file->f_pos; + break; case SEEK_SET: break; case SEEK_END: retval = -EOPNOTSUPP; + goto out; default: goto out; } @@ -665,8 +667,8 @@ out: /* * Handle lookups for the hidden .snap directory. */ -int ceph_handle_snapdir(struct ceph_mds_request *req, - struct dentry *dentry, int err) +struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req, + struct dentry *dentry, int err) { struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb); struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */ @@ -674,16 +676,17 @@ int ceph_handle_snapdir(struct ceph_mds_request *req, /* .snap dir? */ if (err == -ENOENT && ceph_snap(parent) == CEPH_NOSNAP && - strcmp(dentry->d_name.name, - fsc->mount_options->snapdir_name) == 0) { + strcmp(dentry->d_name.name, fsc->mount_options->snapdir_name) == 0) { + struct dentry *res; struct inode *inode = ceph_get_snapdir(parent); - dout("ENOENT on snapdir %p '%pd', linking to snapdir %p\n", - dentry, dentry, inode); - BUG_ON(!d_unhashed(dentry)); - d_add(dentry, inode); - err = 0; + + res = d_splice_alias(inode, dentry); + dout("ENOENT on snapdir %p '%pd', linking to snapdir %p. Spliced dentry %p\n", + dentry, dentry, inode, res); + if (res) + dentry = res; } - return err; + return dentry; } /* @@ -739,6 +742,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, struct ceph_fs_client *fsc = ceph_sb_to_client(dir->i_sb); struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_mds_request *req; + struct dentry *res; int op; int mask; int err; @@ -789,7 +793,13 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, req->r_parent = dir; set_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags); err = ceph_mdsc_do_request(mdsc, NULL, req); - err = ceph_handle_snapdir(req, dentry, err); + res = ceph_handle_snapdir(req, dentry, err); + if (IS_ERR(res)) { + err = PTR_ERR(res); + } else { + dentry = res; + err = 0; + } dentry = ceph_finish_lookup(req, dentry, err); ceph_mdsc_put_request(req); /* will dput(dentry) */ dout("lookup result=%p\n", dentry); diff --git a/fs/ceph/export.c b/fs/ceph/export.c index e088843a7734..65540a4429b2 100644 --- a/fs/ceph/export.c +++ b/fs/ceph/export.c @@ -129,6 +129,10 @@ static struct inode *__lookup_inode(struct super_block *sb, u64 ino) vino.ino = ino; vino.snap = CEPH_NOSNAP; + + if (ceph_vino_is_reserved(vino)) + return ERR_PTR(-ESTALE); + inode = ceph_find_inode(sb, vino); if (!inode) { struct ceph_mds_request *req; @@ -178,8 +182,10 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) return ERR_CAST(inode); /* We need LINK caps to reliably check i_nlink */ err = ceph_do_getattr(inode, CEPH_CAP_LINK_SHARED, false); - if (err) + if (err) { + iput(inode); return ERR_PTR(err); + } /* -ESTALE if inode as been unlinked and no file is open */ if ((inode->i_nlink == 0) && (atomic_read(&inode->i_count) == 1)) { iput(inode); @@ -212,6 +218,10 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb, vino.ino = sfh->ino; vino.snap = sfh->snapid; } + + if (ceph_vino_is_reserved(vino)) + return ERR_PTR(-ESTALE); + inode = ceph_find_inode(sb, vino); if (inode) return d_obtain_alias(inode); @@ -248,9 +258,10 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb, ihold(inode); } else { /* mds does not support lookup snapped inode */ - err = -EOPNOTSUPP; - inode = NULL; + inode = ERR_PTR(-EOPNOTSUPP); } + } else { + inode = ERR_PTR(-ESTALE); } ceph_mdsc_put_request(req); @@ -261,8 +272,8 @@ static struct dentry *__snapfh_to_dentry(struct super_block *sb, dout("snapfh_to_dentry %llx.%llx parent %llx hash %x err=%d", vino.ino, vino.snap, sfh->parent_ino, sfh->hash, err); } - if (!inode) - return ERR_PTR(-ESTALE); + if (IS_ERR(inode)) + return ERR_CAST(inode); /* see comments in ceph_get_parent() */ return unlinked ? d_obtain_root(inode) : d_obtain_alias(inode); } diff --git a/fs/ceph/file.c b/fs/ceph/file.c index 209535d5b8d3..77fc037d5beb 100644 --- a/fs/ceph/file.c +++ b/fs/ceph/file.c @@ -739,9 +739,12 @@ retry: err = ceph_mdsc_do_request(mdsc, (flags & (O_CREAT|O_TRUNC)) ? dir : NULL, req); - err = ceph_handle_snapdir(req, dentry, err); - if (err) + dentry = ceph_handle_snapdir(req, dentry, err); + if (IS_ERR(dentry)) { + err = PTR_ERR(dentry); goto out_req; + } + err = 0; if ((flags & O_CREAT) && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); @@ -892,7 +895,7 @@ static ssize_t ceph_sync_read(struct kiocb *iocb, struct iov_iter *to, if (!ret) ret = ceph_osdc_wait_request(osdc, req); - ceph_update_read_latency(&fsc->mdsc->metric, + ceph_update_read_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, ret); @@ -1034,16 +1037,6 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) dout("ceph_aio_complete_req %p rc %d bytes %u\n", inode, rc, osd_data->bvec_pos.iter.bi_size); - /* r_start_latency == 0 means the request was not submitted */ - if (req->r_start_latency) { - if (aio_req->write) - ceph_update_write_latency(metric, req->r_start_latency, - req->r_end_latency, rc); - else - ceph_update_read_latency(metric, req->r_start_latency, - req->r_end_latency, rc); - } - if (rc == -EOLDSNAPC) { struct ceph_aio_work *aio_work; BUG_ON(!aio_req->write); @@ -1086,6 +1079,16 @@ static void ceph_aio_complete_req(struct ceph_osd_request *req) } } + /* r_start_latency == 0 means the request was not submitted */ + if (req->r_start_latency) { + if (aio_req->write) + ceph_update_write_metrics(metric, req->r_start_latency, + req->r_end_latency, rc); + else + ceph_update_read_metrics(metric, req->r_start_latency, + req->r_end_latency, rc); + } + put_bvecs(osd_data->bvec_pos.bvecs, osd_data->num_bvecs, aio_req->should_dirty); ceph_osdc_put_request(req); @@ -1290,10 +1293,10 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, ret = ceph_osdc_wait_request(&fsc->client->osdc, req); if (write) - ceph_update_write_latency(metric, req->r_start_latency, + ceph_update_write_metrics(metric, req->r_start_latency, req->r_end_latency, ret); else - ceph_update_read_latency(metric, req->r_start_latency, + ceph_update_read_metrics(metric, req->r_start_latency, req->r_end_latency, ret); size = i_size_read(inode); @@ -1467,7 +1470,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, if (!ret) ret = ceph_osdc_wait_request(&fsc->client->osdc, req); - ceph_update_write_latency(&fsc->mdsc->metric, req->r_start_latency, + ceph_update_write_metrics(&fsc->mdsc->metric, req->r_start_latency, req->r_end_latency, ret); out: ceph_osdc_put_request(req); @@ -1510,7 +1513,6 @@ static ssize_t ceph_read_iter(struct kiocb *iocb, struct iov_iter *to) size_t len = iov_iter_count(to); struct inode *inode = file_inode(filp); struct ceph_inode_info *ci = ceph_inode(inode); - struct page *pinned_page = NULL; bool direct_lock = iocb->ki_flags & IOCB_DIRECT; ssize_t ret; int want, got = 0; @@ -1529,8 +1531,7 @@ again: want = CEPH_CAP_FILE_CACHE | CEPH_CAP_FILE_LAZYIO; else want = CEPH_CAP_FILE_CACHE; - ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, - &got, &pinned_page); + ret = ceph_get_caps(filp, CEPH_CAP_FILE_RD, want, -1, &got); if (ret < 0) { if (iocb->ki_flags & IOCB_DIRECT) ceph_end_io_direct(inode); @@ -1571,10 +1572,6 @@ again: dout("aio_read %p %llx.%llx dropping cap refs on %s = %d\n", inode, ceph_vinop(inode), ceph_cap_string(got), (int)ret); - if (pinned_page) { - put_page(pinned_page); - pinned_page = NULL; - } ceph_put_cap_refs(ci, got); if (direct_lock) @@ -1753,8 +1750,7 @@ retry_snap: else want = CEPH_CAP_FILE_BUFFER; got = 0; - err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, - &got, NULL); + err = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, pos + count, &got); if (err < 0) goto out; @@ -2083,7 +2079,7 @@ static long ceph_fallocate(struct file *file, int mode, else want = CEPH_CAP_FILE_BUFFER; - ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got, NULL); + ret = ceph_get_caps(file, CEPH_CAP_FILE_WR, want, endoff, &got); if (ret < 0) goto unlock; @@ -2121,7 +2117,7 @@ static int get_rd_wr_caps(struct file *src_filp, int *src_got, retry_caps: ret = ceph_get_caps(dst_filp, CEPH_CAP_FILE_WR, CEPH_CAP_FILE_BUFFER, - dst_endoff, dst_got, NULL); + dst_endoff, dst_got); if (ret < 0) return ret; @@ -2143,7 +2139,7 @@ retry_caps: return ret; } ret = ceph_get_caps(src_filp, CEPH_CAP_FILE_RD, - CEPH_CAP_FILE_SHARED, -1, src_got, NULL); + CEPH_CAP_FILE_SHARED, -1, src_got); if (ret < 0) return ret; /*... drop src_ci caps too, and retry */ diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 156f849f5385..e1c63adb196d 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -56,6 +56,9 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino) { struct inode *inode; + if (ceph_vino_is_reserved(vino)) + return ERR_PTR(-EREMOTEIO); + inode = iget5_locked(sb, (unsigned long)vino.ino, ceph_ino_compare, ceph_set_ino_cb, &vino); if (!inode) @@ -78,23 +81,36 @@ struct inode *ceph_get_snapdir(struct inode *parent) struct inode *inode = ceph_get_inode(parent->i_sb, vino); struct ceph_inode_info *ci = ceph_inode(inode); - BUG_ON(!S_ISDIR(parent->i_mode)); if (IS_ERR(inode)) return inode; + + if (!S_ISDIR(parent->i_mode)) { + pr_warn_once("bad snapdir parent type (mode=0%o)\n", + parent->i_mode); + return ERR_PTR(-ENOTDIR); + } + + if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) { + pr_warn_once("bad snapdir inode type (mode=0%o)\n", + inode->i_mode); + return ERR_PTR(-ENOTDIR); + } + inode->i_mode = parent->i_mode; inode->i_uid = parent->i_uid; inode->i_gid = parent->i_gid; inode->i_mtime = parent->i_mtime; inode->i_ctime = parent->i_ctime; inode->i_atime = parent->i_atime; - inode->i_op = &ceph_snapdir_iops; - inode->i_fop = &ceph_snapdir_fops; - ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ ci->i_rbytes = 0; ci->i_btime = ceph_inode(parent)->i_btime; - if (inode->i_state & I_NEW) + if (inode->i_state & I_NEW) { + inode->i_op = &ceph_snapdir_iops; + inode->i_fop = &ceph_snapdir_fops; + ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */ unlock_new_inode(inode); + } return inode; } @@ -616,10 +632,11 @@ int ceph_fill_file_size(struct inode *inode, int issued, { struct ceph_inode_info *ci = ceph_inode(inode); int queue_trunc = 0; + loff_t isize = i_size_read(inode); if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || - (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) { - dout("size %lld -> %llu\n", inode->i_size, size); + (truncate_seq == ci->i_truncate_seq && size > isize)) { + dout("size %lld -> %llu\n", isize, size); if (size > 0 && S_ISDIR(inode->i_mode)) { pr_err("fill_file_size non-zero size for directory\n"); size = 0; @@ -757,11 +774,32 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, bool queue_trunc = false; bool new_version = false; bool fill_inline = false; + umode_t mode = le32_to_cpu(info->mode); + dev_t rdev = le32_to_cpu(info->rdev); dout("%s %p ino %llx.%llx v %llu had %llu\n", __func__, inode, ceph_vinop(inode), le64_to_cpu(info->version), ci->i_version); + /* Once I_NEW is cleared, we can't change type or dev numbers */ + if (inode->i_state & I_NEW) { + inode->i_mode = mode; + } else { + if (inode_wrong_type(inode, mode)) { + pr_warn_once("inode type changed! (ino %llx.%llx is 0%o, mds says 0%o)\n", + ceph_vinop(inode), inode->i_mode, mode); + return -ESTALE; + } + + if ((S_ISCHR(mode) || S_ISBLK(mode)) && inode->i_rdev != rdev) { + pr_warn_once("dev inode rdev changed! (ino %llx.%llx is %u:%u, mds says %u:%u)\n", + ceph_vinop(inode), MAJOR(inode->i_rdev), + MINOR(inode->i_rdev), MAJOR(rdev), + MINOR(rdev)); + return -ESTALE; + } + } + info_caps = le32_to_cpu(info->cap.caps); /* prealloc new cap struct */ @@ -815,8 +853,6 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, issued |= __ceph_caps_dirty(ci); new_issued = ~issued & info_caps; - /* update inode */ - inode->i_rdev = le32_to_cpu(info->rdev); /* directories have fl_stripe_unit set to zero */ if (le32_to_cpu(info->layout.fl_stripe_unit)) inode->i_blkbits = @@ -828,7 +864,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, if ((new_version || (new_issued & CEPH_CAP_AUTH_SHARED)) && (issued & CEPH_CAP_AUTH_EXCL) == 0) { - inode->i_mode = le32_to_cpu(info->mode); + inode->i_mode = mode; inode->i_uid = make_kuid(&init_user_ns, le32_to_cpu(info->uid)); inode->i_gid = make_kgid(&init_user_ns, le32_to_cpu(info->gid)); dout("%p mode 0%o uid.gid %d.%d\n", inode, inode->i_mode, @@ -894,6 +930,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, ci->i_rfiles = le64_to_cpu(info->rfiles); ci->i_rsubdirs = le64_to_cpu(info->rsubdirs); ci->i_dir_pin = iinfo->dir_pin; + ci->i_rsnaps = iinfo->rsnaps; ceph_decode_timespec64(&ci->i_rctime, &info->rctime); } } @@ -926,7 +963,7 @@ int ceph_fill_inode(struct inode *inode, struct page *locked_page, case S_IFCHR: case S_IFSOCK: inode->i_blkbits = PAGE_SHIFT; - init_special_inode(inode, inode->i_mode, inode->i_rdev); + init_special_inode(inode, inode->i_mode, rdev); inode->i_op = &ceph_file_iops; break; case S_IFREG: @@ -1787,7 +1824,7 @@ bool ceph_inode_set_size(struct inode *inode, loff_t size) bool ret; spin_lock(&ci->i_ceph_lock); - dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); + dout("set_size %p %llu -> %llu\n", inode, i_size_read(inode), size); i_size_write(inode, size); inode->i_blocks = calc_inode_blocks(size); @@ -1863,6 +1900,7 @@ static void ceph_do_invalidate_pages(struct inode *inode) orig_gen = ci->i_rdcache_gen; spin_unlock(&ci->i_ceph_lock); + ceph_fscache_invalidate(inode); if (invalidate_inode_pages2(inode->i_mapping) < 0) { pr_err("invalidate_pages %p fails\n", inode); } @@ -2093,20 +2131,19 @@ int __ceph_setattr(struct inode *inode, struct iattr *attr) } } if (ia_valid & ATTR_SIZE) { - dout("setattr %p size %lld -> %lld\n", inode, - inode->i_size, attr->ia_size); - if ((issued & CEPH_CAP_FILE_EXCL) && - attr->ia_size > inode->i_size) { + loff_t isize = i_size_read(inode); + + dout("setattr %p size %lld -> %lld\n", inode, isize, attr->ia_size); + if ((issued & CEPH_CAP_FILE_EXCL) && attr->ia_size > isize) { i_size_write(inode, attr->ia_size); inode->i_blocks = calc_inode_blocks(attr->ia_size); ci->i_reported_size = attr->ia_size; dirtied |= CEPH_CAP_FILE_EXCL; ia_valid |= ATTR_MTIME; } else if ((issued & CEPH_CAP_FILE_SHARED) == 0 || - attr->ia_size != inode->i_size) { + attr->ia_size != isize) { req->r_args.setattr.size = cpu_to_le64(attr->ia_size); - req->r_args.setattr.old_size = - cpu_to_le64(inode->i_size); + req->r_args.setattr.old_size = cpu_to_le64(isize); mask |= CEPH_SETATTR_SIZE; release |= CEPH_CAP_FILE_SHARED | CEPH_CAP_FILE_EXCL | CEPH_CAP_FILE_RD | CEPH_CAP_FILE_WR; @@ -2216,7 +2253,7 @@ int ceph_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, return err; if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size > max(inode->i_size, fsc->max_file_size)) + attr->ia_size > max(i_size_read(inode), fsc->max_file_size)) return -EFBIG; if ((attr->ia_valid & ATTR_SIZE) && diff --git a/fs/ceph/io.c b/fs/ceph/io.c index 97602ea92ff4..c456509b31c3 100644 --- a/fs/ceph/io.c +++ b/fs/ceph/io.c @@ -118,7 +118,7 @@ static void ceph_block_buffered(struct ceph_inode_info *ci, struct inode *inode) } /** - * ceph_end_io_direct - declare the file is being used for direct i/o + * ceph_start_io_direct - declare the file is being used for direct i/o * @inode: file inode * * Declare that a direct I/O operation is about to start, and ensure diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index d87bd852ed96..e5af591d3bd4 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -176,6 +176,13 @@ static int parse_reply_info_in(void **p, void *end, memset(&info->snap_btime, 0, sizeof(info->snap_btime)); } + /* snapshot count, remains zero for v<=3 */ + if (struct_v >= 4) { + ceph_decode_64_safe(p, end, info->rsnaps, bad); + } else { + info->rsnaps = 0; + } + *p = end; } else { if (features & CEPH_FEATURE_MDS_INLINE_DATA) { @@ -214,7 +221,7 @@ static int parse_reply_info_in(void **p, void *end, } info->dir_pin = -ENODATA; - /* info->snap_btime remains zero */ + /* info->snap_btime and info->rsnaps remain zero */ } return 0; bad: @@ -433,6 +440,13 @@ static int ceph_parse_deleg_inos(void **p, void *end, ceph_decode_64_safe(p, end, start, bad); ceph_decode_64_safe(p, end, len, bad); + + /* Don't accept a delegation of system inodes */ + if (start < CEPH_INO_SYSTEM_BASE) { + pr_warn_ratelimited("ceph: ignoring reserved inode range delegation (start=0x%llx len=0x%llx)\n", + start, len); + continue; + } while (len--) { int err = xa_insert(&s->s_delegated_inos, ino = start++, DELEGATED_INO_AVAILABLE, @@ -3306,7 +3320,7 @@ out_err: /* kick calling process */ complete_request(mdsc, req); - ceph_update_metadata_latency(&mdsc->metric, req->r_start_latency, + ceph_update_metadata_metrics(&mdsc->metric, req->r_start_latency, req->r_end_latency, err); out: ceph_mdsc_put_request(req); @@ -3780,7 +3794,7 @@ static int reconnect_caps_cb(struct inode *inode, struct ceph_cap *cap, rec.v1.cap_id = cpu_to_le64(cap->cap_id); rec.v1.wanted = cpu_to_le32(__ceph_caps_wanted(ci)); rec.v1.issued = cpu_to_le32(cap->issued); - rec.v1.size = cpu_to_le64(inode->i_size); + rec.v1.size = cpu_to_le64(i_size_read(inode)); ceph_encode_timespec64(&rec.v1.mtime, &inode->i_mtime); ceph_encode_timespec64(&rec.v1.atime, &inode->i_atime); rec.v1.snaprealm = cpu_to_le64(ci->i_snap_realm->ino); diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index eaa7c5422116..15c11a0f2caf 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -88,6 +88,7 @@ struct ceph_mds_reply_info_in { s32 dir_pin; struct ceph_timespec btime; struct ceph_timespec snap_btime; + u64 rsnaps; u64 change_attr; }; diff --git a/fs/ceph/metric.c b/fs/ceph/metric.c index 5ec94bd4c1de..28b6b42ad677 100644 --- a/fs/ceph/metric.c +++ b/fs/ceph/metric.c @@ -17,6 +17,9 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, struct ceph_metric_write_latency *write; struct ceph_metric_metadata_latency *meta; struct ceph_metric_dlease *dlease; + struct ceph_opened_files *files; + struct ceph_pinned_icaps *icaps; + struct ceph_opened_inodes *inodes; struct ceph_client_metric *m = &mdsc->metric; u64 nr_caps = atomic64_read(&m->total_caps); struct ceph_msg *msg; @@ -26,7 +29,8 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, s32 len; len = sizeof(*head) + sizeof(*cap) + sizeof(*read) + sizeof(*write) - + sizeof(*meta) + sizeof(*dlease); + + sizeof(*meta) + sizeof(*dlease) + sizeof(*files) + + sizeof(*icaps) + sizeof(*inodes); msg = ceph_msg_new(CEPH_MSG_CLIENT_METRICS, len, GFP_NOFS, true); if (!msg) { @@ -95,6 +99,38 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc, dlease->total = cpu_to_le64(atomic64_read(&m->total_dentries)); items++; + sum = percpu_counter_sum(&m->total_inodes); + + /* encode the opened files metric */ + files = (struct ceph_opened_files *)(dlease + 1); + files->type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_FILES); + files->ver = 1; + files->compat = 1; + files->data_len = cpu_to_le32(sizeof(*files) - 10); + files->opened_files = cpu_to_le64(atomic64_read(&m->opened_files)); + files->total = cpu_to_le64(sum); + items++; + + /* encode the pinned icaps metric */ + icaps = (struct ceph_pinned_icaps *)(files + 1); + icaps->type = cpu_to_le32(CLIENT_METRIC_TYPE_PINNED_ICAPS); + icaps->ver = 1; + icaps->compat = 1; + icaps->data_len = cpu_to_le32(sizeof(*icaps) - 10); + icaps->pinned_icaps = cpu_to_le64(nr_caps); + icaps->total = cpu_to_le64(sum); + items++; + + /* encode the opened inodes metric */ + inodes = (struct ceph_opened_inodes *)(icaps + 1); + inodes->type = cpu_to_le32(CLIENT_METRIC_TYPE_OPENED_INODES); + inodes->ver = 1; + inodes->compat = 1; + inodes->data_len = cpu_to_le32(sizeof(*inodes) - 10); + inodes->opened_inodes = cpu_to_le64(percpu_counter_sum(&m->opened_inodes)); + inodes->total = cpu_to_le64(sum); + items++; + put_unaligned_le32(items, &head->num); msg->front.iov_len = len; msg->hdr.version = cpu_to_le16(1); @@ -183,21 +219,21 @@ int ceph_metric_init(struct ceph_client_metric *m) if (ret) goto err_i_caps_mis; - spin_lock_init(&m->read_latency_lock); + spin_lock_init(&m->read_metric_lock); m->read_latency_sq_sum = 0; m->read_latency_min = KTIME_MAX; m->read_latency_max = 0; m->total_reads = 0; m->read_latency_sum = 0; - spin_lock_init(&m->write_latency_lock); + spin_lock_init(&m->write_metric_lock); m->write_latency_sq_sum = 0; m->write_latency_min = KTIME_MAX; m->write_latency_max = 0; m->total_writes = 0; m->write_latency_sum = 0; - spin_lock_init(&m->metadata_latency_lock); + spin_lock_init(&m->metadata_metric_lock); m->metadata_latency_sq_sum = 0; m->metadata_latency_min = KTIME_MAX; m->metadata_latency_max = 0; @@ -274,7 +310,7 @@ static inline void __update_latency(ktime_t *totalp, ktime_t *lsump, *sq_sump += sq; } -void ceph_update_read_latency(struct ceph_client_metric *m, +void ceph_update_read_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, int rc) { @@ -283,14 +319,14 @@ void ceph_update_read_latency(struct ceph_client_metric *m, if (unlikely(rc < 0 && rc != -ENOENT && rc != -ETIMEDOUT)) return; - spin_lock(&m->read_latency_lock); + spin_lock(&m->read_metric_lock); __update_latency(&m->total_reads, &m->read_latency_sum, &m->read_latency_min, &m->read_latency_max, &m->read_latency_sq_sum, lat); - spin_unlock(&m->read_latency_lock); + spin_unlock(&m->read_metric_lock); } -void ceph_update_write_latency(struct ceph_client_metric *m, +void ceph_update_write_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, int rc) { @@ -299,14 +335,14 @@ void ceph_update_write_latency(struct ceph_client_metric *m, if (unlikely(rc && rc != -ETIMEDOUT)) return; - spin_lock(&m->write_latency_lock); + spin_lock(&m->write_metric_lock); __update_latency(&m->total_writes, &m->write_latency_sum, &m->write_latency_min, &m->write_latency_max, &m->write_latency_sq_sum, lat); - spin_unlock(&m->write_latency_lock); + spin_unlock(&m->write_metric_lock); } -void ceph_update_metadata_latency(struct ceph_client_metric *m, +void ceph_update_metadata_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, int rc) { @@ -315,9 +351,9 @@ void ceph_update_metadata_latency(struct ceph_client_metric *m, if (unlikely(rc && rc != -ENOENT)) return; - spin_lock(&m->metadata_latency_lock); + spin_lock(&m->metadata_metric_lock); __update_latency(&m->total_metadatas, &m->metadata_latency_sum, &m->metadata_latency_min, &m->metadata_latency_max, &m->metadata_latency_sq_sum, lat); - spin_unlock(&m->metadata_latency_lock); + spin_unlock(&m->metadata_metric_lock); } diff --git a/fs/ceph/metric.h b/fs/ceph/metric.h index af6038ff39d4..e984eb2bb14b 100644 --- a/fs/ceph/metric.h +++ b/fs/ceph/metric.h @@ -14,8 +14,11 @@ enum ceph_metric_type { CLIENT_METRIC_TYPE_WRITE_LATENCY, CLIENT_METRIC_TYPE_METADATA_LATENCY, CLIENT_METRIC_TYPE_DENTRY_LEASE, + CLIENT_METRIC_TYPE_OPENED_FILES, + CLIENT_METRIC_TYPE_PINNED_ICAPS, + CLIENT_METRIC_TYPE_OPENED_INODES, - CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_DENTRY_LEASE, + CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_OPENED_INODES, }; /* @@ -28,6 +31,9 @@ enum ceph_metric_type { CLIENT_METRIC_TYPE_WRITE_LATENCY, \ CLIENT_METRIC_TYPE_METADATA_LATENCY, \ CLIENT_METRIC_TYPE_DENTRY_LEASE, \ + CLIENT_METRIC_TYPE_OPENED_FILES, \ + CLIENT_METRIC_TYPE_PINNED_ICAPS, \ + CLIENT_METRIC_TYPE_OPENED_INODES, \ \ CLIENT_METRIC_TYPE_MAX, \ } @@ -94,6 +100,42 @@ struct ceph_metric_dlease { __le64 total; } __packed; +/* metric opened files header */ +struct ceph_opened_files { + __le32 type; /* ceph metric type */ + + __u8 ver; + __u8 compat; + + __le32 data_len; /* length of sizeof(opened_files + total) */ + __le64 opened_files; + __le64 total; +} __packed; + +/* metric pinned i_caps header */ +struct ceph_pinned_icaps { + __le32 type; /* ceph metric type */ + + __u8 ver; + __u8 compat; + + __le32 data_len; /* length of sizeof(pinned_icaps + total) */ + __le64 pinned_icaps; + __le64 total; +} __packed; + +/* metric opened inodes header */ +struct ceph_opened_inodes { + __le32 type; /* ceph metric type */ + + __u8 ver; + __u8 compat; + + __le32 data_len; /* length of sizeof(opened_inodes + total) */ + __le64 opened_inodes; + __le64 total; +} __packed; + struct ceph_metric_head { __le32 num; /* the number of metrics that will be sent */ } __packed; @@ -108,21 +150,21 @@ struct ceph_client_metric { struct percpu_counter i_caps_hit; struct percpu_counter i_caps_mis; - spinlock_t read_latency_lock; + spinlock_t read_metric_lock; u64 total_reads; ktime_t read_latency_sum; ktime_t read_latency_sq_sum; ktime_t read_latency_min; ktime_t read_latency_max; - spinlock_t write_latency_lock; + spinlock_t write_metric_lock; u64 total_writes; ktime_t write_latency_sum; ktime_t write_latency_sq_sum; ktime_t write_latency_min; ktime_t write_latency_max; - spinlock_t metadata_latency_lock; + spinlock_t metadata_metric_lock; u64 total_metadatas; ktime_t metadata_latency_sum; ktime_t metadata_latency_sq_sum; @@ -162,13 +204,13 @@ static inline void ceph_update_cap_mis(struct ceph_client_metric *m) percpu_counter_inc(&m->i_caps_mis); } -extern void ceph_update_read_latency(struct ceph_client_metric *m, +extern void ceph_update_read_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, int rc); -extern void ceph_update_write_latency(struct ceph_client_metric *m, +extern void ceph_update_write_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, int rc); -extern void ceph_update_metadata_latency(struct ceph_client_metric *m, +extern void ceph_update_metadata_metrics(struct ceph_client_metric *m, ktime_t r_start, ktime_t r_end, int rc); #endif /* _FS_CEPH_MDS_METRIC_H */ diff --git a/fs/ceph/snap.c b/fs/ceph/snap.c index 0728b01d4d43..4ce18055d931 100644 --- a/fs/ceph/snap.c +++ b/fs/ceph/snap.c @@ -605,7 +605,7 @@ int __ceph_finish_cap_snap(struct ceph_inode_info *ci, struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); BUG_ON(capsnap->writing); - capsnap->size = inode->i_size; + capsnap->size = i_size_read(inode); capsnap->mtime = inode->i_mtime; capsnap->atime = inode->i_atime; capsnap->ctime = inode->i_ctime; diff --git a/fs/ceph/super.h b/fs/ceph/super.h index c48bb30c8d70..db80d89556b1 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -21,6 +21,7 @@ #include <linux/ceph/libceph.h> #ifdef CONFIG_CEPH_FSCACHE +#define FSCACHE_USE_NEW_IO_API #include <linux/fscache.h> #endif @@ -333,7 +334,7 @@ struct ceph_inode_info { /* for dirs */ struct timespec64 i_rctime; - u64 i_rbytes, i_rfiles, i_rsubdirs; + u64 i_rbytes, i_rfiles, i_rsubdirs, i_rsnaps; u64 i_files, i_subdirs; /* quotas */ @@ -427,7 +428,6 @@ struct ceph_inode_info { #ifdef CONFIG_CEPH_FSCACHE struct fscache_cookie *fscache; - u32 i_fscache_gen; #endif errseq_t i_meta_err; @@ -529,10 +529,34 @@ static inline int ceph_ino_compare(struct inode *inode, void *data) ci->i_vino.snap == pvino->snap; } +/* + * The MDS reserves a set of inodes for its own usage. These should never + * be accessible by clients, and so the MDS has no reason to ever hand these + * out. The range is CEPH_MDS_INO_MDSDIR_OFFSET..CEPH_INO_SYSTEM_BASE. + * + * These come from src/mds/mdstypes.h in the ceph sources. + */ +#define CEPH_MAX_MDS 0x100 +#define CEPH_NUM_STRAY 10 +#define CEPH_MDS_INO_MDSDIR_OFFSET (1 * CEPH_MAX_MDS) +#define CEPH_INO_SYSTEM_BASE ((6*CEPH_MAX_MDS) + (CEPH_MAX_MDS * CEPH_NUM_STRAY)) + +static inline bool ceph_vino_is_reserved(const struct ceph_vino vino) +{ + if (vino.ino < CEPH_INO_SYSTEM_BASE && + vino.ino >= CEPH_MDS_INO_MDSDIR_OFFSET) { + WARN_RATELIMIT(1, "Attempt to access reserved inode number 0x%llx", vino.ino); + return true; + } + return false; +} static inline struct inode *ceph_find_inode(struct super_block *sb, struct ceph_vino vino) { + if (ceph_vino_is_reserved(vino)) + return NULL; + /* * NB: The hashval will be run through the fs/inode.c hash function * anyway, so there is no need to squash the inode number down to @@ -1156,7 +1180,7 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn, int mds, int drop, int unless); extern int ceph_get_caps(struct file *filp, int need, int want, - loff_t endoff, int *got, struct page **pinned_page); + loff_t endoff, int *got); extern int ceph_try_get_caps(struct inode *inode, int need, int want, bool nonblock, int *got); @@ -1193,7 +1217,7 @@ extern const struct dentry_operations ceph_dentry_ops; extern loff_t ceph_make_fpos(unsigned high, unsigned off, bool hash_order); extern int ceph_handle_notrace_create(struct inode *dir, struct dentry *dentry); -extern int ceph_handle_snapdir(struct ceph_mds_request *req, +extern struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req, struct dentry *dentry, int err); extern struct dentry *ceph_finish_lookup(struct ceph_mds_request *req, struct dentry *dentry, int err); diff --git a/fs/ceph/xattr.c b/fs/ceph/xattr.c index 02f59bcb4f27..1242db8d3444 100644 --- a/fs/ceph/xattr.c +++ b/fs/ceph/xattr.c @@ -233,6 +233,12 @@ static ssize_t ceph_vxattrcb_dir_rsubdirs(struct ceph_inode_info *ci, char *val, return ceph_fmt_xattr(val, size, "%lld", ci->i_rsubdirs); } +static ssize_t ceph_vxattrcb_dir_rsnaps(struct ceph_inode_info *ci, char *val, + size_t size) +{ + return ceph_fmt_xattr(val, size, "%lld", ci->i_rsnaps); +} + static ssize_t ceph_vxattrcb_dir_rbytes(struct ceph_inode_info *ci, char *val, size_t size) { @@ -384,6 +390,7 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { XATTR_RSTAT_FIELD(dir, rentries), XATTR_RSTAT_FIELD(dir, rfiles), XATTR_RSTAT_FIELD(dir, rsubdirs), + XATTR_RSTAT_FIELD(dir, rsnaps), XATTR_RSTAT_FIELD(dir, rbytes), XATTR_RSTAT_FIELD(dir, rctime), { diff --git a/fs/cifs/cifs_debug.c b/fs/cifs/cifs_debug.c index 88a7958170ee..68e8e5b27841 100644 --- a/fs/cifs/cifs_debug.c +++ b/fs/cifs/cifs_debug.c @@ -17,15 +17,14 @@ #include "cifsproto.h" #include "cifs_debug.h" #include "cifsfs.h" +#include "fs_context.h" #ifdef CONFIG_CIFS_DFS_UPCALL #include "dfs_cache.h" #endif #ifdef CONFIG_CIFS_SMB_DIRECT #include "smbdirect.h" #endif -#ifdef CONFIG_CIFS_SWN_UPCALL #include "cifs_swn.h" -#endif void cifs_dump_mem(char *label, void *data, int length) @@ -118,10 +117,8 @@ static void cifs_debug_tcon(struct seq_file *m, struct cifs_tcon *tcon) seq_printf(m, " POSIX Extensions"); if (tcon->ses->server->ops->dump_share_caps) tcon->ses->server->ops->dump_share_caps(m, tcon); -#ifdef CONFIG_CIFS_SWN_UPCALL if (tcon->use_witness) seq_puts(m, " Witness"); -#endif if (tcon->need_reconnect) seq_puts(m, "\tDISCONNECTED "); @@ -490,10 +487,8 @@ skip_rdma: spin_unlock(&cifs_tcp_ses_lock); seq_putc(m, '\n'); - -#ifdef CONFIG_CIFS_SWN_UPCALL cifs_swn_dump(m); -#endif + /* BB add code to dump additional info such as TCP session info now */ return 0; } @@ -702,6 +697,7 @@ static const struct proc_ops cifs_lookup_cache_proc_ops; static const struct proc_ops traceSMB_proc_ops; static const struct proc_ops cifs_security_flags_proc_ops; static const struct proc_ops cifs_linux_ext_proc_ops; +static const struct proc_ops cifs_mount_params_proc_ops; void cifs_proc_init(void) @@ -726,6 +722,8 @@ cifs_proc_init(void) proc_create("LookupCacheEnabled", 0644, proc_fs_cifs, &cifs_lookup_cache_proc_ops); + proc_create("mount_params", 0444, proc_fs_cifs, &cifs_mount_params_proc_ops); + #ifdef CONFIG_CIFS_DFS_UPCALL proc_create("dfscache", 0644, proc_fs_cifs, &dfscache_proc_ops); #endif @@ -764,6 +762,7 @@ cifs_proc_clean(void) remove_proc_entry("SecurityFlags", proc_fs_cifs); remove_proc_entry("LinuxExtensionsEnabled", proc_fs_cifs); remove_proc_entry("LookupCacheEnabled", proc_fs_cifs); + remove_proc_entry("mount_params", proc_fs_cifs); #ifdef CONFIG_CIFS_DFS_UPCALL remove_proc_entry("dfscache", proc_fs_cifs); @@ -1023,6 +1022,51 @@ static const struct proc_ops cifs_security_flags_proc_ops = { .proc_release = single_release, .proc_write = cifs_security_flags_proc_write, }; + +/* To make it easier to debug, can help to show mount params */ +static int cifs_mount_params_proc_show(struct seq_file *m, void *v) +{ + const struct fs_parameter_spec *p; + const char *type; + + for (p = smb3_fs_parameters; p->name; p++) { + /* cannot use switch with pointers... */ + if (!p->type) { + if (p->flags == fs_param_neg_with_no) + type = "noflag"; + else + type = "flag"; + } else if (p->type == fs_param_is_bool) + type = "bool"; + else if (p->type == fs_param_is_u32) + type = "u32"; + else if (p->type == fs_param_is_u64) + type = "u64"; + else if (p->type == fs_param_is_string) + type = "string"; + else + type = "unknown"; + + seq_printf(m, "%s:%s\n", p->name, type); + } + + return 0; +} + +static int cifs_mount_params_proc_open(struct inode *inode, struct file *file) +{ + return single_open(file, cifs_mount_params_proc_show, NULL); +} + +static const struct proc_ops cifs_mount_params_proc_ops = { + .proc_open = cifs_mount_params_proc_open, + .proc_read = seq_read, + .proc_lseek = seq_lseek, + .proc_release = single_release, + /* No need for write for now */ + /* .proc_write = cifs_mount_params_proc_write, */ +}; + #else inline void cifs_proc_init(void) { diff --git a/fs/cifs/cifs_dfs_ref.c b/fs/cifs/cifs_dfs_ref.c index 6b1ce4efb591..c87c37cf2914 100644 --- a/fs/cifs/cifs_dfs_ref.c +++ b/fs/cifs/cifs_dfs_ref.c @@ -270,7 +270,7 @@ static struct vfsmount *cifs_dfs_do_mount(struct dentry *mntpt, char *mountdata; char *devname; - devname = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL); + devname = kstrdup(fullpath, GFP_KERNEL); if (!devname) return ERR_PTR(-ENOMEM); @@ -302,6 +302,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) struct cifs_sb_info *cifs_sb; struct cifs_ses *ses; struct cifs_tcon *tcon; + void *page; char *full_path, *root_path; unsigned int xid; int rc; @@ -324,10 +325,13 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) goto cdda_exit; } + page = alloc_dentry_path(); /* always use tree name prefix */ - full_path = build_path_from_dentry_optional_prefix(mntpt, true); - if (full_path == NULL) - goto cdda_exit; + full_path = build_path_from_dentry_optional_prefix(mntpt, page, true); + if (IS_ERR(full_path)) { + mnt = ERR_CAST(full_path); + goto free_full_path; + } convert_delimiter(full_path, '\\'); @@ -385,7 +389,7 @@ static struct vfsmount *cifs_dfs_do_automount(struct dentry *mntpt) free_root_path: kfree(root_path); free_full_path: - kfree(full_path); + free_dentry_path(page); cdda_exit: cifs_dbg(FYI, "leaving %s\n" , __func__); return mnt; diff --git a/fs/cifs/cifs_fs_sb.h b/fs/cifs/cifs_fs_sb.h index aa77edc12212..9c45b3a82ad9 100644 --- a/fs/cifs/cifs_fs_sb.h +++ b/fs/cifs/cifs_fs_sb.h @@ -55,6 +55,7 @@ #define CIFS_MOUNT_MODE_FROM_SID 0x10000000 /* retrieve mode from special ACE */ #define CIFS_MOUNT_RO_CACHE 0x20000000 /* assumes share will not change */ #define CIFS_MOUNT_RW_CACHE 0x40000000 /* assumes only client accessing */ +#define CIFS_MOUNT_SHUTDOWN 0x80000000 struct cifs_sb_info { struct rb_root tlink_tree; @@ -81,5 +82,9 @@ struct cifs_sb_info { * (cifs_autodisable_serverino) in order to match new mounts. */ bool mnt_cifs_serverino_autodisabled; + /* + * Available once the mount has completed. + */ + struct dentry *root; }; #endif /* _CIFS_FS_SB_H */ diff --git a/fs/cifs/cifs_ioctl.h b/fs/cifs/cifs_ioctl.h index 153d5c842a9b..37fc7d6ac457 100644 --- a/fs/cifs/cifs_ioctl.h +++ b/fs/cifs/cifs_ioctl.h @@ -57,6 +57,12 @@ struct smb_query_info { /* char buffer[]; */ } __packed; +/* + * Dumping the commonly used 16 byte (e.g. CCM and GCM128) keys still supported + * for backlevel compatibility, but is not sufficient for dumping the less + * frequently used GCM256 (32 byte) keys (see the newer "CIFS_DUMP_FULL_KEY" + * ioctl for dumping decryption info for GCM256 mounts) + */ struct smb3_key_debug_info { __u64 Suid; __u16 cipher_type; @@ -65,6 +71,31 @@ struct smb3_key_debug_info { __u8 smb3decryptionkey[SMB3_SIGN_KEY_SIZE]; } __packed; +/* + * Dump variable-sized keys + */ +struct smb3_full_key_debug_info { + /* INPUT: size of userspace buffer */ + __u32 in_size; + + /* + * INPUT: 0 for current user, otherwise session to dump + * OUTPUT: session id that was dumped + */ + __u64 session_id; + __u16 cipher_type; + __u8 session_key_length; + __u8 server_in_key_length; + __u8 server_out_key_length; + __u8 data[]; + /* + * return this struct with the keys appended at the end: + * __u8 session_key[session_key_length]; + * __u8 server_in_key[server_in_key_length]; + * __u8 server_out_key[server_out_key_length]; + */ +} __packed; + struct smb3_notify { __u32 completion_filter; bool watch_tree; @@ -78,3 +109,20 @@ struct smb3_notify { #define CIFS_QUERY_INFO _IOWR(CIFS_IOCTL_MAGIC, 7, struct smb_query_info) #define CIFS_DUMP_KEY _IOWR(CIFS_IOCTL_MAGIC, 8, struct smb3_key_debug_info) #define CIFS_IOC_NOTIFY _IOW(CIFS_IOCTL_MAGIC, 9, struct smb3_notify) +#define CIFS_DUMP_FULL_KEY _IOWR(CIFS_IOCTL_MAGIC, 10, struct smb3_full_key_debug_info) +#define CIFS_IOC_SHUTDOWN _IOR ('X', 125, __u32) + +/* + * Flags for going down operation + */ +#define CIFS_GOING_FLAGS_DEFAULT 0x0 /* going down */ +#define CIFS_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ +#define CIFS_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ + +static inline bool cifs_forced_shutdown(struct cifs_sb_info *sbi) +{ + if (CIFS_MOUNT_SHUTDOWN & sbi->mnt_cifs_flags) + return true; + else + return false; +} diff --git a/fs/cifs/cifs_swn.h b/fs/cifs/cifs_swn.h index 236ecd4959d5..8a9d2a5c9077 100644 --- a/fs/cifs/cifs_swn.h +++ b/fs/cifs/cifs_swn.h @@ -7,11 +7,13 @@ #ifndef _CIFS_SWN_H #define _CIFS_SWN_H +#include "cifsglob.h" struct cifs_tcon; struct sk_buff; struct genl_info; +#ifdef CONFIG_CIFS_SWN_UPCALL extern int cifs_swn_register(struct cifs_tcon *tcon); extern int cifs_swn_unregister(struct cifs_tcon *tcon); @@ -22,4 +24,29 @@ extern void cifs_swn_dump(struct seq_file *m); extern void cifs_swn_check(void); +static inline bool cifs_swn_set_server_dstaddr(struct TCP_Server_Info *server) +{ + if (server->use_swn_dstaddr) { + server->dstaddr = server->swn_dstaddr; + return true; + } + return false; +} + +static inline void cifs_swn_reset_server_dstaddr(struct TCP_Server_Info *server) +{ + server->use_swn_dstaddr = false; +} + +#else + +static inline int cifs_swn_register(struct cifs_tcon *tcon) { return 0; } +static inline int cifs_swn_unregister(struct cifs_tcon *tcon) { return 0; } +static inline int cifs_swn_notify(struct sk_buff *s, struct genl_info *i) { return 0; } +static inline void cifs_swn_dump(struct seq_file *m) {} +static inline void cifs_swn_check(void) {} +static inline bool cifs_swn_set_server_dstaddr(struct TCP_Server_Info *server) { return false; } +static inline void cifs_swn_reset_server_dstaddr(struct TCP_Server_Info *server) {} + +#endif /* CONFIG_CIFS_SWN_UPCALL */ #endif /* _CIFS_SWN_H */ diff --git a/fs/cifs/cifsacl.c b/fs/cifs/cifsacl.c index d178cf85e926..784407f9280f 100644 --- a/fs/cifs/cifsacl.c +++ b/fs/cifs/cifsacl.c @@ -1094,11 +1094,9 @@ static int set_chmod_dacl(struct cifs_acl *pdacl, struct cifs_acl *pndacl, struct cifs_ace *pnntace = NULL; char *nacl_base = NULL; u32 num_aces = 0; - __u64 nmode; bool new_aces_set = false; /* Assuming that pndacl and pnmode are never NULL */ - nmode = *pnmode; nacl_base = (char *)pndacl; nsize = sizeof(struct cifs_acl); @@ -1651,7 +1649,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, * Add three ACEs for owner, group, everyone getting rid of other ACEs * as chmod disables ACEs and set the security descriptor. Allocate * memory for the smb header, set security descriptor request security - * descriptor parameters, and secuirty descriptor itself + * descriptor parameters, and security descriptor itself */ nsecdesclen = max_t(u32, nsecdesclen, DEFAULT_SEC_DESC_LEN); pnntsd = kmalloc(nsecdesclen, GFP_KERNEL); diff --git a/fs/cifs/cifsfs.c b/fs/cifs/cifsfs.c index 5ddd20b62484..2ffcb29d5c8f 100644 --- a/fs/cifs/cifsfs.c +++ b/fs/cifs/cifsfs.c @@ -75,7 +75,7 @@ bool enable_oplocks = true; bool linuxExtEnabled = true; bool lookupCacheEnabled = true; bool disable_legacy_dialects; /* false by default */ -bool enable_gcm_256; /* false by default, change when more servers support it */ +bool enable_gcm_256 = true; bool require_gcm_256; /* false by default */ unsigned int global_secflags = CIFSSEC_DEF; /* unsigned int ntlmv2_support = 0; */ @@ -133,6 +133,7 @@ struct workqueue_struct *cifsiod_wq; struct workqueue_struct *decrypt_wq; struct workqueue_struct *fileinfo_put_wq; struct workqueue_struct *cifsoplockd_wq; +struct workqueue_struct *deferredclose_wq; __u32 cifs_lock_secret; /* @@ -217,8 +218,11 @@ cifs_read_super(struct super_block *sb) rc = super_setup_bdi(sb); if (rc) goto out_no_root; - /* tune readahead according to rsize */ - sb->s_bdi->ra_pages = cifs_sb->ctx->rsize / PAGE_SIZE; + /* tune readahead according to rsize if readahead size not set on mount */ + if (cifs_sb->ctx->rasize) + sb->s_bdi->ra_pages = cifs_sb->ctx->rasize / PAGE_SIZE; + else + sb->s_bdi->ra_pages = cifs_sb->ctx->rsize / PAGE_SIZE; sb->s_blocksize = CIFS_MAX_MSGSIZE; sb->s_blocksize_bits = 14; /* default 2**14 = CIFS_MAX_MSGSIZE */ @@ -257,6 +261,29 @@ out_no_root: static void cifs_kill_sb(struct super_block *sb) { struct cifs_sb_info *cifs_sb = CIFS_SB(sb); + struct cifs_tcon *tcon; + struct cached_fid *cfid; + + /* + * We ned to release all dentries for the cached directories + * before we kill the sb. + */ + if (cifs_sb->root) { + dput(cifs_sb->root); + cifs_sb->root = NULL; + } + tcon = cifs_sb_master_tcon(cifs_sb); + if (tcon) { + cfid = &tcon->crfid; + mutex_lock(&cfid->fid_mutex); + if (cfid->dentry) { + + dput(cfid->dentry); + cfid->dentry = NULL; + } + mutex_unlock(&cfid->fid_mutex); + } + kill_anon_super(sb); cifs_umount(cifs_sb); } @@ -364,6 +391,8 @@ cifs_alloc_inode(struct super_block *sb) /* cifs_inode->vfs_inode.i_flags = S_NOATIME | S_NOCMTIME; */ INIT_LIST_HEAD(&cifs_inode->openFileList); INIT_LIST_HEAD(&cifs_inode->llist); + INIT_LIST_HEAD(&cifs_inode->deferred_closes); + spin_lock_init(&cifs_inode->deferred_lock); return &cifs_inode->vfs_inode; } @@ -626,6 +655,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",rsize=%u", cifs_sb->ctx->rsize); seq_printf(s, ",wsize=%u", cifs_sb->ctx->wsize); seq_printf(s, ",bsize=%u", cifs_sb->ctx->bsize); + if (cifs_sb->ctx->rasize) + seq_printf(s, ",rasize=%u", cifs_sb->ctx->rasize); if (tcon->ses->server->min_offload) seq_printf(s, ",esize=%u", tcon->ses->server->min_offload); seq_printf(s, ",echo_interval=%lu", @@ -656,10 +687,8 @@ cifs_show_options(struct seq_file *s, struct dentry *root) seq_printf(s, ",multichannel,max_channels=%zu", tcon->ses->chan_max); -#ifdef CONFIG_CIFS_SWN_UPCALL if (tcon->use_witness) seq_puts(s, ",witness"); -#endif return 0; } @@ -834,7 +863,7 @@ cifs_smb3_do_mount(struct file_system_type *fs_type, goto out; } - rc = cifs_setup_volume_info(cifs_sb->ctx, NULL, old_ctx->UNC); + rc = cifs_setup_volume_info(cifs_sb->ctx, NULL, NULL); if (rc) { root = ERR_PTR(rc); goto out; @@ -888,6 +917,9 @@ cifs_smb3_do_mount(struct file_system_type *fs_type, if (IS_ERR(root)) goto out_super; + if (cifs_sb) + cifs_sb->root = dget(root); + cifs_dbg(FYI, "dentry root is: %p\n", root); return root; @@ -1528,10 +1560,6 @@ init_cifs(void) int rc = 0; cifs_proc_init(); INIT_LIST_HEAD(&cifs_tcp_ses_list); -#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ - INIT_LIST_HEAD(&GlobalDnotifyReqList); - INIT_LIST_HEAD(&GlobalDnotifyRsp_Q); -#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ /* * Initialize Global counters */ @@ -1606,9 +1634,16 @@ init_cifs(void) goto out_destroy_fileinfo_put_wq; } + deferredclose_wq = alloc_workqueue("deferredclose", + WQ_FREEZABLE|WQ_MEM_RECLAIM, 0); + if (!deferredclose_wq) { + rc = -ENOMEM; + goto out_destroy_cifsoplockd_wq; + } + rc = cifs_fscache_register(); if (rc) - goto out_destroy_cifsoplockd_wq; + goto out_destroy_deferredclose_wq; rc = cifs_init_inodecache(); if (rc) @@ -1676,6 +1711,8 @@ out_destroy_inodecache: cifs_destroy_inodecache(); out_unreg_fscache: cifs_fscache_unregister(); +out_destroy_deferredclose_wq: + destroy_workqueue(deferredclose_wq); out_destroy_cifsoplockd_wq: destroy_workqueue(cifsoplockd_wq); out_destroy_fileinfo_put_wq: @@ -1710,6 +1747,7 @@ exit_cifs(void) cifs_destroy_mids(); cifs_destroy_inodecache(); cifs_fscache_unregister(); + destroy_workqueue(deferredclose_wq); destroy_workqueue(cifsoplockd_wq); destroy_workqueue(decrypt_wq); destroy_workqueue(fileinfo_put_wq); diff --git a/fs/cifs/cifsfs.h b/fs/cifs/cifsfs.h index 0d7ef150dbb2..6beddb108ba0 100644 --- a/fs/cifs/cifsfs.h +++ b/fs/cifs/cifsfs.h @@ -165,5 +165,5 @@ extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type, extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ -#define CIFS_VERSION "2.31" +#define CIFS_VERSION "2.32" #endif /* _CIFSFS_H */ diff --git a/fs/cifs/cifsglob.h b/fs/cifs/cifsglob.h index ec824ab8c5ca..8488d7024462 100644 --- a/fs/cifs/cifsglob.h +++ b/fs/cifs/cifsglob.h @@ -495,7 +495,7 @@ struct smb_version_operations { struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, - char *full_path, + const char *full_path, umode_t mode, dev_t device_number); /* version specific fiemap implementation */ @@ -988,10 +988,12 @@ struct cached_fid { bool is_valid:1; /* Do we have a useable root fid */ bool file_all_info_is_valid:1; bool has_lease:1; + unsigned long time; /* jiffies of when lease was taken */ struct kref refcount; struct cifs_fid *fid; struct mutex fid_mutex; struct cifs_tcon *tcon; + struct dentry *dentry; struct work_struct lease_break; struct smb2_file_all_info file_all_info; }; @@ -1070,6 +1072,7 @@ struct cifs_tcon { bool use_resilient:1; /* use resilient instead of durable handles */ bool use_persistent:1; /* use persistent instead of durable handles */ bool no_lease:1; /* Do not request leases on files or directories */ + bool use_witness:1; /* use witness protocol */ __le32 capabilities; __u32 share_flags; __u32 maximal_access; @@ -1094,9 +1097,6 @@ struct cifs_tcon { int remap:2; struct list_head ulist; /* cache update list */ #endif -#ifdef CONFIG_CIFS_SWN_UPCALL - bool use_witness:1; /* use witness protocol */ -#endif }; /* @@ -1154,6 +1154,14 @@ struct cifs_pending_open { __u32 oplock; }; +struct cifs_deferred_close { + struct list_head dlist; + struct tcon_link *tlink; + __u16 netfid; + __u64 persistent_fid; + __u64 volatile_fid; +}; + /* * This info hangs off the cifsFileInfo structure, pointed to by llist. * This is used to track byte stream locks on the file @@ -1248,6 +1256,8 @@ struct cifsFileInfo { struct cifs_search_info srch_inf; struct work_struct oplock_break; /* work for oplock breaks */ struct work_struct put; /* work for the final part of _put */ + struct delayed_work deferred; + bool deferred_close_scheduled; /* Flag to indicate close is scheduled */ }; struct cifs_io_parms { @@ -1316,8 +1326,6 @@ struct cifs_readdata { struct page **pages; }; -struct cifs_writedata; - /* asynchronous write support */ struct cifs_writedata { struct kref refcount; @@ -1394,6 +1402,7 @@ struct cifsInodeInfo { #define CIFS_INO_DELETE_PENDING (3) /* delete pending on server */ #define CIFS_INO_INVALID_MAPPING (4) /* pagecache is invalid */ #define CIFS_INO_LOCK (5) /* lock bit for synchronization */ +#define CIFS_INO_MODIFIED_ATTR (6) /* Indicate change in mtime/ctime */ unsigned long flags; spinlock_t writers_lock; unsigned int writers; /* Number of writers on this inode */ @@ -1406,6 +1415,9 @@ struct cifsInodeInfo { struct fscache_cookie *fscache; #endif struct inode vfs_inode; + struct list_head deferred_closes; /* list of deferred closes */ + spinlock_t deferred_lock; /* protection on deferred list */ + bool lease_granted; /* Flag to indicate whether lease or oplock is granted. */ }; static inline struct cifsInodeInfo * @@ -1796,9 +1808,8 @@ require use of the stronger protocol */ * * Semaphores * ---------- - * sesSem operations on smb session - * tconSem operations on tree connection - * fh_sem file handle reconnection operations + * cifsInodeInfo->lock_sem protects: + * the list of locks held by the inode * ****************************************************************************/ @@ -1829,13 +1840,6 @@ GLOBAL_EXTERN struct list_head cifs_tcp_ses_list; */ GLOBAL_EXTERN spinlock_t cifs_tcp_ses_lock; -#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ -/* Outstanding dir notify requests */ -GLOBAL_EXTERN struct list_head GlobalDnotifyReqList; -/* DirNotify response queue */ -GLOBAL_EXTERN struct list_head GlobalDnotifyRsp_Q; -#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ - /* * Global transaction id (XID) information */ @@ -1879,23 +1883,16 @@ extern unsigned int cifs_min_small; /* min size of small buf pool */ extern unsigned int cifs_max_pending; /* MAX requests at once to server*/ extern bool disable_legacy_dialects; /* forbid vers=1.0 and vers=2.0 mounts */ -GLOBAL_EXTERN struct rb_root uidtree; -GLOBAL_EXTERN struct rb_root gidtree; -GLOBAL_EXTERN spinlock_t siduidlock; -GLOBAL_EXTERN spinlock_t sidgidlock; -GLOBAL_EXTERN struct rb_root siduidtree; -GLOBAL_EXTERN struct rb_root sidgidtree; -GLOBAL_EXTERN spinlock_t uidsidlock; -GLOBAL_EXTERN spinlock_t gidsidlock; - void cifs_oplock_break(struct work_struct *work); void cifs_queue_oplock_break(struct cifsFileInfo *cfile); +void smb2_deferred_work_close(struct work_struct *work); extern const struct slow_work_ops cifs_oplock_break_ops; extern struct workqueue_struct *cifsiod_wq; extern struct workqueue_struct *decrypt_wq; extern struct workqueue_struct *fileinfo_put_wq; extern struct workqueue_struct *cifsoplockd_wq; +extern struct workqueue_struct *deferredclose_wq; extern __u32 cifs_lock_secret; extern mempool_t *cifs_mid_poolp; diff --git a/fs/cifs/cifspdu.h b/fs/cifs/cifspdu.h index 9adc74bd9f8f..554d64fe171e 100644 --- a/fs/cifs/cifspdu.h +++ b/fs/cifs/cifspdu.h @@ -148,7 +148,8 @@ #define SMB3_SIGN_KEY_SIZE (16) /* - * Size of the smb3 encryption/decryption keys + * Size of the smb3 encryption/decryption key storage. + * This size is big enough to store any cipher key types. */ #define SMB3_ENC_DEC_KEY_SIZE (32) @@ -1903,7 +1904,7 @@ typedef struct smb_com_transaction2_fnext_req { __le16 InformationLevel; __u32 ResumeKey; __le16 SearchFlags; - char ResumeFileName[1]; + char ResumeFileName[]; } __attribute__((packed)) TRANSACTION2_FNEXT_REQ; typedef struct smb_com_transaction2_fnext_rsp { diff --git a/fs/cifs/cifsproto.h b/fs/cifs/cifsproto.h index 75ce6f742b8d..d30cba44ba29 100644 --- a/fs/cifs/cifsproto.h +++ b/fs/cifs/cifsproto.h @@ -69,9 +69,20 @@ extern int init_cifs_idmap(void); extern void exit_cifs_idmap(void); extern int init_cifs_spnego(void); extern void exit_cifs_spnego(void); -extern char *build_path_from_dentry(struct dentry *); +extern const char *build_path_from_dentry(struct dentry *, void *); extern char *build_path_from_dentry_optional_prefix(struct dentry *direntry, - bool prefix); + void *page, bool prefix); +static inline void *alloc_dentry_path(void) +{ + return __getname(); +} + +static inline void free_dentry_path(void *page) +{ + if (page) + __putname(page); +} + extern char *cifs_build_path_to_root(struct smb3_fs_context *ctx, struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon, @@ -184,7 +195,7 @@ extern struct cifsFileInfo *cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, struct tcon_link *tlink, __u32 oplock); -extern int cifs_posix_open(char *full_path, struct inode **inode, +extern int cifs_posix_open(const char *full_path, struct inode **inode, struct super_block *sb, int mode, unsigned int f_flags, __u32 *oplock, __u16 *netfid, unsigned int xid); @@ -194,7 +205,7 @@ extern void cifs_unix_basic_to_fattr(struct cifs_fattr *fattr, struct cifs_sb_info *cifs_sb); extern void cifs_dir_info_to_fattr(struct cifs_fattr *, FILE_DIRECTORY_INFO *, struct cifs_sb_info *); -extern void cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr); +extern int cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr); extern struct inode *cifs_iget(struct super_block *sb, struct cifs_fattr *fattr); @@ -207,7 +218,7 @@ extern int cifs_get_inode_info_unix(struct inode **pinode, const unsigned char *search_path, struct super_block *sb, unsigned int xid); extern int cifs_set_file_info(struct inode *inode, struct iattr *attrs, - unsigned int xid, char *full_path, __u32 dosattr); + unsigned int xid, const char *full_path, __u32 dosattr); extern int cifs_rename_pending_delete(const char *full_path, struct dentry *dentry, const unsigned int xid); @@ -256,6 +267,19 @@ extern void cifs_add_pending_open_locked(struct cifs_fid *fid, struct tcon_link *tlink, struct cifs_pending_open *open); extern void cifs_del_pending_open(struct cifs_pending_open *open); + +extern bool cifs_is_deferred_close(struct cifsFileInfo *cfile, + struct cifs_deferred_close **dclose); + +extern void cifs_add_deferred_close(struct cifsFileInfo *cfile, + struct cifs_deferred_close *dclose); + +extern void cifs_del_deferred_close(struct cifsFileInfo *cfile); + +extern void cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode); + +extern void cifs_close_all_deferred_files(struct cifs_tcon *cifs_tcon); + extern struct TCP_Server_Info *cifs_get_tcp_session(struct smb3_fs_context *ctx); extern void cifs_put_tcp_session(struct TCP_Server_Info *server, int from_reconnect); @@ -358,11 +382,6 @@ extern int CIFSSMBSetFileDisposition(const unsigned int xid, struct cifs_tcon *tcon, bool delete_file, __u16 fid, __u32 pid_of_opener); -#if 0 -extern int CIFSSMBSetAttrLegacy(unsigned int xid, struct cifs_tcon *tcon, - char *fileName, __u16 dos_attributes, - const struct nls_table *nls_codepage); -#endif /* possibly unneeded function */ extern int CIFSSMBSetEOF(const unsigned int xid, struct cifs_tcon *tcon, const char *file_name, __u64 size, struct cifs_sb_info *cifs_sb, bool set_allocation); @@ -504,12 +523,6 @@ extern int generate_smb311signingkey(struct cifs_ses *); extern int calc_lanman_hash(const char *password, const char *cryptkey, bool encrypt, char *lnm_session_key); #endif /* CIFS_WEAK_PW_HASH */ -#ifdef CONFIG_CIFS_DNOTIFY_EXPERIMENTAL /* unused temporarily */ -extern int CIFSSMBNotify(const unsigned int xid, struct cifs_tcon *tcon, - const int notify_subdirs, const __u16 netfid, - __u32 filter, struct file *file, int multishot, - const struct nls_table *nls_codepage); -#endif /* was needed for dnotify, and will be needed for inotify when VFS fix */ extern int CIFSSMBCopy(unsigned int xid, struct cifs_tcon *source_tcon, const char *fromName, diff --git a/fs/cifs/cifssmb.c b/fs/cifs/cifssmb.c index c279527aae92..41f74163cc1c 100644 --- a/fs/cifs/cifssmb.c +++ b/fs/cifs/cifssmb.c @@ -114,7 +114,7 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) mutex_lock(&tcon->crfid.fid_mutex); tcon->crfid.is_valid = false; /* cached handle is not valid, so SMB2_CLOSE won't be sent below */ - close_shroot_lease_locked(&tcon->crfid); + close_cached_dir_lease_locked(&tcon->crfid); memset(tcon->crfid.fid, 0, sizeof(struct cifs_fid)); mutex_unlock(&tcon->crfid.fid_mutex); @@ -5917,56 +5917,6 @@ SetTimesRetry: return rc; } -/* Can not be used to set time stamps yet (due to old DOS time format) */ -/* Can be used to set attributes */ -#if 0 /* Possibly not needed - since it turns out that strangely NT4 has a bug - handling it anyway and NT4 was what we thought it would be needed for - Do not delete it until we prove whether needed for Win9x though */ -int -CIFSSMBSetAttrLegacy(unsigned int xid, struct cifs_tcon *tcon, char *fileName, - __u16 dos_attrs, const struct nls_table *nls_codepage) -{ - SETATTR_REQ *pSMB = NULL; - SETATTR_RSP *pSMBr = NULL; - int rc = 0; - int bytes_returned; - int name_len; - - cifs_dbg(FYI, "In SetAttrLegacy\n"); - -SetAttrLgcyRetry: - rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB, - (void **) &pSMBr); - if (rc) - return rc; - - if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { - name_len = - ConvertToUTF16((__le16 *) pSMB->fileName, fileName, - PATH_MAX, nls_codepage); - name_len++; /* trailing null */ - name_len *= 2; - } else { - name_len = copy_path_name(pSMB->fileName, fileName); - } - pSMB->attr = cpu_to_le16(dos_attrs); - pSMB->BufferFormat = 0x04; - inc_rfc1001_len(pSMB, name_len + 1); - pSMB->ByteCount = cpu_to_le16(name_len + 1); - rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, - (struct smb_hdr *) pSMBr, &bytes_returned, 0); - if (rc) - cifs_dbg(FYI, "Error in LegacySetAttr = %d\n", rc); - - cifs_buf_release(pSMB); - - if (rc == -EAGAIN) - goto SetAttrLgcyRetry; - - return rc; -} -#endif /* temporarily unneeded SetAttr legacy function */ - static void cifs_fill_unix_set_info(FILE_UNIX_BASIC_INFO *data_offset, const struct cifs_unix_set_info_args *args) diff --git a/fs/cifs/connect.c b/fs/cifs/connect.c index 24668eb006c6..495c395f9def 100644 --- a/fs/cifs/connect.c +++ b/fs/cifs/connect.c @@ -62,9 +62,7 @@ #include "dfs_cache.h" #endif #include "fs_context.h" -#ifdef CONFIG_CIFS_SWN_UPCALL #include "cifs_swn.h" -#endif extern mempool_t *cifs_req_poolp; extern bool disable_legacy_dialects; @@ -314,12 +312,8 @@ cifs_reconnect(struct TCP_Server_Info *server) mutex_lock(&server->srv_mutex); -#ifdef CONFIG_CIFS_SWN_UPCALL - if (server->use_swn_dstaddr) { - server->dstaddr = server->swn_dstaddr; - } else { -#endif + if (!cifs_swn_set_server_dstaddr(server)) { #ifdef CONFIG_CIFS_DFS_UPCALL if (cifs_sb && cifs_sb->origin_fullpath) /* @@ -344,9 +338,7 @@ cifs_reconnect(struct TCP_Server_Info *server) #endif -#ifdef CONFIG_CIFS_SWN_UPCALL } -#endif if (cifs_rdma_enabled(server)) rc = smbd_reconnect(server); @@ -363,9 +355,7 @@ cifs_reconnect(struct TCP_Server_Info *server) if (server->tcpStatus != CifsExiting) server->tcpStatus = CifsNeedNegotiate; spin_unlock(&GlobalMid_Lock); -#ifdef CONFIG_CIFS_SWN_UPCALL - server->use_swn_dstaddr = false; -#endif + cifs_swn_reset_server_dstaddr(server); mutex_unlock(&server->srv_mutex); } } while (server->tcpStatus == CifsNeedReconnect); @@ -402,16 +392,6 @@ cifs_echo_request(struct work_struct *work) int rc; struct TCP_Server_Info *server = container_of(work, struct TCP_Server_Info, echo.work); - unsigned long echo_interval; - - /* - * If we need to renegotiate, set echo interval to zero to - * immediately call echo service where we can renegotiate. - */ - if (server->tcpStatus == CifsNeedNegotiate) - echo_interval = 0; - else - echo_interval = server->echo_interval; /* * We cannot send an echo if it is disabled. @@ -422,7 +402,7 @@ cifs_echo_request(struct work_struct *work) server->tcpStatus == CifsExiting || server->tcpStatus == CifsNew || (server->ops->can_echo && !server->ops->can_echo(server)) || - time_before(jiffies, server->lstrp + echo_interval - HZ)) + time_before(jiffies, server->lstrp + server->echo_interval - HZ)) goto requeue_echo; rc = server->ops->echo ? server->ops->echo(server) : -ENOSYS; @@ -430,10 +410,8 @@ cifs_echo_request(struct work_struct *work) cifs_dbg(FYI, "Unable to send echo request to server: %s\n", server->hostname); -#ifdef CONFIG_CIFS_SWN_UPCALL /* Check witness registrations */ cifs_swn_check(); -#endif requeue_echo: queue_delayed_work(cifsiod_wq, &server->echo, server->echo_interval); @@ -488,6 +466,7 @@ server_unresponsive(struct TCP_Server_Info *server) */ if ((server->tcpStatus == CifsGood || server->tcpStatus == CifsNeedNegotiate) && + (!server->ops->can_echo || server->ops->can_echo(server)) && time_after(jiffies, server->lstrp + 3 * server->echo_interval)) { cifs_server_dbg(VFS, "has not responded in %lu seconds. Reconnecting...\n", (3 * server->echo_interval) / HZ); @@ -1790,9 +1769,7 @@ cifs_set_cifscreds(struct smb3_fs_context *ctx, struct cifs_ses *ses) * for the request. */ if (is_domain && ses->domainName) { - ctx->domainname = kstrndup(ses->domainName, - strlen(ses->domainName), - GFP_KERNEL); + ctx->domainname = kstrdup(ses->domainName, GFP_KERNEL); if (!ctx->domainname) { cifs_dbg(FYI, "Unable to allocate %zd bytes for domain\n", len); @@ -2009,7 +1986,6 @@ cifs_put_tcon(struct cifs_tcon *tcon) return; } -#ifdef CONFIG_CIFS_SWN_UPCALL if (tcon->use_witness) { int rc; @@ -2019,7 +1995,6 @@ cifs_put_tcon(struct cifs_tcon *tcon) __func__, rc); } } -#endif list_del_init(&tcon->tcon_list); spin_unlock(&cifs_tcp_ses_lock); @@ -2181,9 +2156,9 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) } tcon->use_resilient = true; } -#ifdef CONFIG_CIFS_SWN_UPCALL + tcon->use_witness = false; - if (ctx->witness) { + if (IS_ENABLED(CONFIG_CIFS_SWN_UPCALL) && ctx->witness) { if (ses->server->vals->protocol_id >= SMB30_PROT_ID) { if (tcon->capabilities & SMB2_SHARE_CAP_CLUSTER) { /* @@ -2209,7 +2184,6 @@ cifs_get_tcon(struct cifs_ses *ses, struct smb3_fs_context *ctx) goto out_fail; } } -#endif /* If the user really knows what they are doing they can override */ if (tcon->share_flags & SMB2_SHAREFLAG_NO_CACHING) { @@ -3175,17 +3149,29 @@ out: int cifs_setup_volume_info(struct smb3_fs_context *ctx, const char *mntopts, const char *devname) { - int rc = 0; + int rc; - smb3_parse_devname(devname, ctx); + if (devname) { + cifs_dbg(FYI, "%s: devname=%s\n", __func__, devname); + rc = smb3_parse_devname(devname, ctx); + if (rc) { + cifs_dbg(VFS, "%s: failed to parse %s: %d\n", __func__, devname, rc); + return rc; + } + } if (mntopts) { char *ip; - cifs_dbg(FYI, "%s: mntopts=%s\n", __func__, mntopts); rc = smb3_parse_opt(mntopts, "ip", &ip); - if (!rc && !cifs_convert_address((struct sockaddr *)&ctx->dstaddr, ip, - strlen(ip))) { + if (rc) { + cifs_dbg(VFS, "%s: failed to parse ip options: %d\n", __func__, rc); + return rc; + } + + rc = cifs_convert_address((struct sockaddr *)&ctx->dstaddr, ip, strlen(ip)); + kfree(ip); + if (!rc) { cifs_dbg(VFS, "%s: failed to convert ip address\n", __func__); return -EINVAL; } @@ -3205,7 +3191,7 @@ cifs_setup_volume_info(struct smb3_fs_context *ctx, const char *mntopts, const c return -EINVAL; } - return rc; + return 0; } static int @@ -3426,8 +3412,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) goto error; } /* Save mount options */ - mntdata = kstrndup(cifs_sb->ctx->mount_options, - strlen(cifs_sb->ctx->mount_options), GFP_KERNEL); + mntdata = kstrdup(cifs_sb->ctx->mount_options, GFP_KERNEL); if (!mntdata) { rc = -ENOMEM; goto error; @@ -3500,7 +3485,7 @@ int cifs_mount(struct cifs_sb_info *cifs_sb, struct smb3_fs_context *ctx) * links, the prefix path is included in both and may be changed during reconnect. See * cifs_tree_connect(). */ - cifs_sb->origin_fullpath = kstrndup(full_path, strlen(full_path), GFP_KERNEL); + cifs_sb->origin_fullpath = kstrdup(full_path, GFP_KERNEL); if (!cifs_sb->origin_fullpath) { rc = -ENOMEM; goto error; @@ -3877,9 +3862,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) ctx->sectype = master_tcon->ses->sectype; ctx->sign = master_tcon->ses->sign; ctx->seal = master_tcon->seal; -#ifdef CONFIG_CIFS_SWN_UPCALL ctx->witness = master_tcon->use_witness; -#endif rc = cifs_set_vol_auth(ctx, master_tcon->ses); if (rc) { diff --git a/fs/cifs/dfs_cache.c b/fs/cifs/dfs_cache.c index 098b4bc8da59..b1fa30fefe1f 100644 --- a/fs/cifs/dfs_cache.c +++ b/fs/cifs/dfs_cache.c @@ -81,23 +81,24 @@ static void refresh_cache_worker(struct work_struct *work); static DECLARE_DELAYED_WORK(refresh_task, refresh_cache_worker); -static int get_normalized_path(const char *path, char **npath) +static int get_normalized_path(const char *path, const char **npath) { if (!path || strlen(path) < 3 || (*path != '\\' && *path != '/')) return -EINVAL; if (*path == '\\') { - *npath = (char *)path; + *npath = path; } else { - *npath = kstrndup(path, strlen(path), GFP_KERNEL); - if (!*npath) + char *s = kstrdup(path, GFP_KERNEL); + if (!s) return -ENOMEM; - convert_delimiter(*npath, '\\'); + convert_delimiter(s, '\\'); + *npath = s; } return 0; } -static inline void free_normalized_path(const char *path, char *npath) +static inline void free_normalized_path(const char *path, const char *npath) { if (path != npath) kfree(npath); @@ -358,7 +359,7 @@ static struct cache_dfs_tgt *alloc_target(const char *name, int path_consumed) t = kmalloc(sizeof(*t), GFP_ATOMIC); if (!t) return ERR_PTR(-ENOMEM); - t->name = kstrndup(name, strlen(name), GFP_ATOMIC); + t->name = kstrdup(name, GFP_ATOMIC); if (!t->name) { kfree(t); return ERR_PTR(-ENOMEM); @@ -419,7 +420,7 @@ static struct cache_entry *alloc_cache_entry(const char *path, if (!ce) return ERR_PTR(-ENOMEM); - ce->path = kstrndup(path, strlen(path), GFP_KERNEL); + ce->path = kstrdup(path, GFP_KERNEL); if (!ce->path) { kmem_cache_free(cache_slab, ce); return ERR_PTR(-ENOMEM); @@ -531,7 +532,7 @@ static struct cache_entry *lookup_cache_entry(const char *path, unsigned int *ha char *s, *e; char sep; - npath = kstrndup(path, strlen(path), GFP_KERNEL); + npath = kstrdup(path, GFP_KERNEL); if (!npath) return ERR_PTR(-ENOMEM); @@ -641,7 +642,7 @@ static int __update_cache_entry(const char *path, if (ce->tgthint) { s = ce->tgthint->name; - th = kstrndup(s, strlen(s), GFP_ATOMIC); + th = kstrdup(s, GFP_ATOMIC); if (!th) return -ENOMEM; } @@ -786,11 +787,11 @@ static int setup_referral(const char *path, struct cache_entry *ce, memset(ref, 0, sizeof(*ref)); - ref->path_name = kstrndup(path, strlen(path), GFP_ATOMIC); + ref->path_name = kstrdup(path, GFP_ATOMIC); if (!ref->path_name) return -ENOMEM; - ref->node_name = kstrndup(target, strlen(target), GFP_ATOMIC); + ref->node_name = kstrdup(target, GFP_ATOMIC); if (!ref->node_name) { rc = -ENOMEM; goto err_free_path; @@ -828,7 +829,7 @@ static int get_targets(struct cache_entry *ce, struct dfs_cache_tgt_list *tl) goto err_free_it; } - it->it_name = kstrndup(t->name, strlen(t->name), GFP_ATOMIC); + it->it_name = kstrdup(t->name, GFP_ATOMIC); if (!it->it_name) { kfree(it); rc = -ENOMEM; @@ -882,7 +883,7 @@ int dfs_cache_find(const unsigned int xid, struct cifs_ses *ses, struct dfs_cache_tgt_list *tgt_list) { int rc; - char *npath; + const char *npath; struct cache_entry *ce; rc = get_normalized_path(path, &npath); @@ -936,7 +937,7 @@ int dfs_cache_noreq_find(const char *path, struct dfs_info3_param *ref, struct dfs_cache_tgt_list *tgt_list) { int rc; - char *npath; + const char *npath; struct cache_entry *ce; rc = get_normalized_path(path, &npath); @@ -991,7 +992,7 @@ int dfs_cache_update_tgthint(const unsigned int xid, struct cifs_ses *ses, const struct dfs_cache_tgt_iterator *it) { int rc; - char *npath; + const char *npath; struct cache_entry *ce; struct cache_dfs_tgt *t; @@ -1053,7 +1054,7 @@ int dfs_cache_noreq_update_tgthint(const char *path, const struct dfs_cache_tgt_iterator *it) { int rc; - char *npath; + const char *npath; struct cache_entry *ce; struct cache_dfs_tgt *t; @@ -1111,7 +1112,7 @@ int dfs_cache_get_tgt_referral(const char *path, struct dfs_info3_param *ref) { int rc; - char *npath; + const char *npath; struct cache_entry *ce; if (!it || !ref) @@ -1166,7 +1167,7 @@ int dfs_cache_add_vol(char *mntdata, struct smb3_fs_context *ctx, const char *fu if (!vi) return -ENOMEM; - vi->fullpath = kstrndup(fullpath, strlen(fullpath), GFP_KERNEL); + vi->fullpath = kstrdup(fullpath, GFP_KERNEL); if (!vi->fullpath) { rc = -ENOMEM; goto err_free_vi; @@ -1484,7 +1485,7 @@ static int refresh_tcon(struct vol_info *vi, struct cifs_tcon *tcon) { int rc = 0; unsigned int xid; - char *path, *npath; + const char *path, *npath; struct cache_entry *ce; struct cifs_ses *root_ses = NULL, *ses; struct dfs_info3_param *refs = NULL; diff --git a/fs/cifs/dir.c b/fs/cifs/dir.c index a3fb81e0ba17..6bcd3e8f7cda 100644 --- a/fs/cifs/dir.c +++ b/fs/cifs/dir.c @@ -34,6 +34,7 @@ #include "cifs_fs_sb.h" #include "cifs_unicode.h" #include "fs_context.h" +#include "cifs_ioctl.h" static void renew_parental_timestamps(struct dentry *direntry) @@ -78,31 +79,31 @@ cifs_build_path_to_root(struct smb3_fs_context *ctx, struct cifs_sb_info *cifs_s } /* Note: caller must free return buffer */ -char * -build_path_from_dentry(struct dentry *direntry) +const char * +build_path_from_dentry(struct dentry *direntry, void *page) { struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); bool prefix = tcon->Flags & SMB_SHARE_IS_IN_DFS; - return build_path_from_dentry_optional_prefix(direntry, + return build_path_from_dentry_optional_prefix(direntry, page, prefix); } char * -build_path_from_dentry_optional_prefix(struct dentry *direntry, bool prefix) +build_path_from_dentry_optional_prefix(struct dentry *direntry, void *page, + bool prefix) { - struct dentry *temp; - int namelen; int dfsplen; int pplen = 0; - char *full_path; - char dirsep; struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); - unsigned seq; + char dirsep = CIFS_DIR_SEP(cifs_sb); + char *s; + + if (unlikely(!page)) + return ERR_PTR(-ENOMEM); - dirsep = CIFS_DIR_SEP(cifs_sb); if (prefix) dfsplen = strnlen(tcon->treeName, MAX_TREE_SIZE + 1); else @@ -111,86 +112,39 @@ build_path_from_dentry_optional_prefix(struct dentry *direntry, bool prefix) if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_USE_PREFIX_PATH) pplen = cifs_sb->prepath ? strlen(cifs_sb->prepath) + 1 : 0; -cifs_bp_rename_retry: - namelen = dfsplen + pplen; - seq = read_seqbegin(&rename_lock); - rcu_read_lock(); - for (temp = direntry; !IS_ROOT(temp);) { - namelen += (1 + temp->d_name.len); - temp = temp->d_parent; - if (temp == NULL) { - cifs_dbg(VFS, "corrupt dentry\n"); - rcu_read_unlock(); - return NULL; - } - } - rcu_read_unlock(); - - full_path = kmalloc(namelen+1, GFP_ATOMIC); - if (full_path == NULL) - return full_path; - full_path[namelen] = 0; /* trailing null */ - rcu_read_lock(); - for (temp = direntry; !IS_ROOT(temp);) { - spin_lock(&temp->d_lock); - namelen -= 1 + temp->d_name.len; - if (namelen < 0) { - spin_unlock(&temp->d_lock); - break; - } else { - full_path[namelen] = dirsep; - strncpy(full_path + namelen + 1, temp->d_name.name, - temp->d_name.len); - cifs_dbg(FYI, "name: %s\n", full_path + namelen); - } - spin_unlock(&temp->d_lock); - temp = temp->d_parent; - if (temp == NULL) { - cifs_dbg(VFS, "corrupt dentry\n"); - rcu_read_unlock(); - kfree(full_path); - return NULL; - } - } - rcu_read_unlock(); - if (namelen != dfsplen + pplen || read_seqretry(&rename_lock, seq)) { - cifs_dbg(FYI, "did not end path lookup where expected. namelen=%ddfsplen=%d\n", - namelen, dfsplen); - /* presumably this is only possible if racing with a rename - of one of the parent directories (we can not lock the dentries - above us to prevent this, but retrying should be harmless) */ - kfree(full_path); - goto cifs_bp_rename_retry; - } - /* DIR_SEP already set for byte 0 / vs \ but not for - subsequent slashes in prepath which currently must - be entered the right way - not sure if there is an alternative - since the '\' is a valid posix character so we can not switch - those safely to '/' if any are found in the middle of the prepath */ - /* BB test paths to Windows with '/' in the midst of prepath */ - + s = dentry_path_raw(direntry, page, PAGE_SIZE); + if (IS_ERR(s)) + return s; + if (!s[1]) // for root we want "", not "/" + s++; + if (s < (char *)page + pplen + dfsplen) + return ERR_PTR(-ENAMETOOLONG); if (pplen) { - int i; - cifs_dbg(FYI, "using cifs_sb prepath <%s>\n", cifs_sb->prepath); - memcpy(full_path+dfsplen+1, cifs_sb->prepath, pplen-1); - full_path[dfsplen] = dirsep; - for (i = 0; i < pplen-1; i++) - if (full_path[dfsplen+1+i] == '/') - full_path[dfsplen+1+i] = CIFS_DIR_SEP(cifs_sb); + s -= pplen; + memcpy(s + 1, cifs_sb->prepath, pplen - 1); + *s = '/'; } + if (dirsep != '/') { + /* BB test paths to Windows with '/' in the midst of prepath */ + char *p; + for (p = s; *p; p++) + if (*p == '/') + *p = dirsep; + } if (dfsplen) { - strncpy(full_path, tcon->treeName, dfsplen); + s -= dfsplen; + memcpy(s, tcon->treeName, dfsplen); if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_POSIX_PATHS) { int i; for (i = 0; i < dfsplen; i++) { - if (full_path[i] == '\\') - full_path[i] = '/'; + if (s[i] == '\\') + s[i] = '/'; } } } - return full_path; + return s; } /* @@ -233,7 +187,8 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, int desired_access; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_tcon *tcon = tlink_tcon(tlink); - char *full_path = NULL; + const char *full_path; + void *page = alloc_dentry_path(); FILE_ALL_INFO *buf = NULL; struct inode *newinode = NULL; int disposition; @@ -244,9 +199,11 @@ cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned int xid, if (tcon->ses->server->oplocks) *oplock = REQ_OPLOCK; - full_path = build_path_from_dentry(direntry); - if (!full_path) - return -ENOMEM; + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { + free_dentry_path(page); + return PTR_ERR(full_path); + } if (tcon->unix_ext && cap_unix(tcon->ses) && !tcon->broken_posix_open && (CIFS_UNIX_POSIX_PATH_OPS_CAP & @@ -418,15 +375,16 @@ cifs_create_get_file_info: if (newinode) { if (server->ops->set_lease_key) server->ops->set_lease_key(newinode, fid); - if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) - newinode->i_mode = mode; - if ((*oplock & CIFS_CREATE_ACTION) && - (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID)) { - newinode->i_uid = current_fsuid(); - if (inode->i_mode & S_ISGID) - newinode->i_gid = inode->i_gid; - else - newinode->i_gid = current_fsgid(); + if ((*oplock & CIFS_CREATE_ACTION) && S_ISREG(newinode->i_mode)) { + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_DYNPERM) + newinode->i_mode = mode; + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_SET_UID) { + newinode->i_uid = current_fsuid(); + if (inode->i_mode & S_ISGID) + newinode->i_gid = inode->i_gid; + else + newinode->i_gid = current_fsgid(); + } } } } @@ -448,7 +406,7 @@ cifs_create_set_dentry: out: kfree(buf); - kfree(full_path); + free_dentry_path(page); return rc; out_err: @@ -473,6 +431,9 @@ cifs_atomic_open(struct inode *inode, struct dentry *direntry, __u32 oplock; struct cifsFileInfo *file_info; + if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) + return -EIO; + /* * Posix open is only called (at lookup time) for file create now. For * opens (rather than creates), because we do not know if it is a file @@ -589,6 +550,9 @@ int cifs_create(struct user_namespace *mnt_userns, struct inode *inode, cifs_dbg(FYI, "cifs_create parent inode = 0x%p name is: %pd and dentry = 0x%p\n", inode, direntry, direntry); + if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) + return -EIO; + tlink = cifs_sb_tlink(CIFS_SB(inode->i_sb)); rc = PTR_ERR(tlink); if (IS_ERR(tlink)) @@ -619,23 +583,27 @@ int cifs_mknod(struct user_namespace *mnt_userns, struct inode *inode, struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; struct cifs_tcon *tcon; - char *full_path = NULL; + const char *full_path; + void *page; if (!old_valid_dev(device_number)) return -EINVAL; cifs_sb = CIFS_SB(inode->i_sb); + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); + page = alloc_dentry_path(); tcon = tlink_tcon(tlink); - xid = get_xid(); - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto mknod_out; } @@ -644,7 +612,7 @@ int cifs_mknod(struct user_namespace *mnt_userns, struct inode *inode, device_number); mknod_out: - kfree(full_path); + free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); return rc; @@ -660,7 +628,8 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, struct tcon_link *tlink; struct cifs_tcon *pTcon; struct inode *newInode = NULL; - char *full_path = NULL; + const char *full_path; + void *page; xid = get_xid(); @@ -687,11 +656,13 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, /* can not grab the rename sem here since it would deadlock in the cases (beginning of sys_rename itself) in which we already have the sb rename sem */ - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { + page = alloc_dentry_path(); + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { cifs_put_tlink(tlink); free_xid(xid); - return ERR_PTR(-ENOMEM); + free_dentry_path(page); + return ERR_CAST(full_path); } if (d_really_is_positive(direntry)) { @@ -727,7 +698,7 @@ cifs_lookup(struct inode *parent_dir_inode, struct dentry *direntry, } newInode = ERR_PTR(rc); } - kfree(full_path); + free_dentry_path(page); cifs_put_tlink(tlink); free_xid(xid); return d_splice_alias(newInode, direntry); diff --git a/fs/cifs/file.c b/fs/cifs/file.c index 042e24aad410..379a427f3c2f 100644 --- a/fs/cifs/file.c +++ b/fs/cifs/file.c @@ -45,6 +45,7 @@ #include "fscache.h" #include "smbdirect.h" #include "fs_context.h" +#include "cifs_ioctl.h" static inline int cifs_convert_flags(unsigned int flags) { @@ -112,7 +113,7 @@ static inline int cifs_get_disposition(unsigned int flags) return FILE_OPEN; } -int cifs_posix_open(char *full_path, struct inode **pinode, +int cifs_posix_open(const char *full_path, struct inode **pinode, struct super_block *sb, int mode, unsigned int f_flags, __u32 *poplock, __u16 *pnetfid, unsigned int xid) { @@ -166,7 +167,7 @@ int cifs_posix_open(char *full_path, struct inode **pinode, } } else { cifs_revalidate_mapping(*pinode); - cifs_fattr_to_inode(*pinode, &fattr); + rc = cifs_fattr_to_inode(*pinode, &fattr); } posix_open_ret: @@ -175,7 +176,7 @@ posix_open_ret: } static int -cifs_nt_open(char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, +cifs_nt_open(const char *full_path, struct inode *inode, struct cifs_sb_info *cifs_sb, struct cifs_tcon *tcon, unsigned int f_flags, __u32 *oplock, struct cifs_fid *fid, unsigned int xid) { @@ -322,9 +323,11 @@ cifs_new_fileinfo(struct cifs_fid *fid, struct file *file, cfile->dentry = dget(dentry); cfile->f_flags = file->f_flags; cfile->invalidHandle = false; + cfile->deferred_close_scheduled = false; cfile->tlink = cifs_get_tlink(tlink); INIT_WORK(&cfile->oplock_break, cifs_oplock_break); INIT_WORK(&cfile->put, cifsFileInfo_put_work); + INIT_DELAYED_WORK(&cfile->deferred, smb2_deferred_work_close); mutex_init(&cfile->fh_mutex); spin_lock_init(&cfile->file_info_lock); @@ -530,7 +533,8 @@ int cifs_open(struct inode *inode, struct file *file) struct cifs_tcon *tcon; struct tcon_link *tlink; struct cifsFileInfo *cfile = NULL; - char *full_path = NULL; + void *page; + const char *full_path; bool posix_open_ok = false; struct cifs_fid fid; struct cifs_pending_open open; @@ -538,6 +542,11 @@ int cifs_open(struct inode *inode, struct file *file) xid = get_xid(); cifs_sb = CIFS_SB(inode->i_sb); + if (unlikely(cifs_forced_shutdown(cifs_sb))) { + free_xid(xid); + return -EIO; + } + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { free_xid(xid); @@ -546,9 +555,10 @@ int cifs_open(struct inode *inode, struct file *file) tcon = tlink_tcon(tlink); server = tcon->ses->server; - full_path = build_path_from_dentry(file_dentry(file)); - if (full_path == NULL) { - rc = -ENOMEM; + page = alloc_dentry_path(); + full_path = build_path_from_dentry(file_dentry(file), page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto out; } @@ -563,6 +573,20 @@ int cifs_open(struct inode *inode, struct file *file) file->f_op = &cifs_file_direct_ops; } + /* Get the cached handle as SMB2 close is deferred */ + rc = cifs_get_readable_path(tcon, full_path, &cfile); + if (rc == 0) { + if (file->f_flags == cfile->f_flags) { + file->private_data = cfile; + spin_lock(&CIFS_I(inode)->deferred_lock); + cifs_del_deferred_close(cfile); + spin_unlock(&CIFS_I(inode)->deferred_lock); + goto out; + } else { + _cifsFileInfo_put(cfile, true, false); + } + } + if (server->oplocks) oplock = REQ_OPLOCK; else @@ -640,7 +664,7 @@ int cifs_open(struct inode *inode, struct file *file) } out: - kfree(full_path); + free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); return rc; @@ -689,7 +713,8 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) struct TCP_Server_Info *server; struct cifsInodeInfo *cinode; struct inode *inode; - char *full_path = NULL; + void *page; + const char *full_path; int desired_access; int disposition = FILE_OPEN; int create_options = CREATE_NOT_DIR; @@ -699,9 +724,8 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) mutex_lock(&cfile->fh_mutex); if (!cfile->invalidHandle) { mutex_unlock(&cfile->fh_mutex); - rc = 0; free_xid(xid); - return rc; + return 0; } inode = d_inode(cfile->dentry); @@ -715,12 +739,13 @@ cifs_reopen_file(struct cifsFileInfo *cfile, bool can_flush) * called and if the server was down that means we end up here, and we * can never tell if the caller already has the rename_sem. */ - full_path = build_path_from_dentry(cfile->dentry); - if (full_path == NULL) { - rc = -ENOMEM; + page = alloc_dentry_path(); + full_path = build_path_from_dentry(cfile->dentry, page); + if (IS_ERR(full_path)) { mutex_unlock(&cfile->fh_mutex); + free_dentry_path(page); free_xid(xid); - return rc; + return PTR_ERR(full_path); } cifs_dbg(FYI, "inode = 0x%p file flags 0x%x for %s\n", @@ -838,16 +863,64 @@ reopen_success: cifs_relock_file(cfile); reopen_error_exit: - kfree(full_path); + free_dentry_path(page); free_xid(xid); return rc; } +void smb2_deferred_work_close(struct work_struct *work) +{ + struct cifsFileInfo *cfile = container_of(work, + struct cifsFileInfo, deferred.work); + + spin_lock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); + cifs_del_deferred_close(cfile); + cfile->deferred_close_scheduled = false; + spin_unlock(&CIFS_I(d_inode(cfile->dentry))->deferred_lock); + _cifsFileInfo_put(cfile, true, false); +} + int cifs_close(struct inode *inode, struct file *file) { + struct cifsFileInfo *cfile; + struct cifsInodeInfo *cinode = CIFS_I(inode); + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_deferred_close *dclose; + if (file->private_data != NULL) { - _cifsFileInfo_put(file->private_data, true, false); + cfile = file->private_data; file->private_data = NULL; + dclose = kmalloc(sizeof(struct cifs_deferred_close), GFP_KERNEL); + if ((cinode->oplock == CIFS_CACHE_RHW_FLG) && + cinode->lease_granted && + dclose) { + if (test_bit(CIFS_INO_MODIFIED_ATTR, &cinode->flags)) + inode->i_ctime = inode->i_mtime = current_time(inode); + spin_lock(&cinode->deferred_lock); + cifs_add_deferred_close(cfile, dclose); + if (cfile->deferred_close_scheduled && + delayed_work_pending(&cfile->deferred)) { + /* + * If there is no pending work, mod_delayed_work queues new work. + * So, Increase the ref count to avoid use-after-free. + */ + if (!mod_delayed_work(deferredclose_wq, + &cfile->deferred, cifs_sb->ctx->acregmax)) + cifsFileInfo_get(cfile); + } else { + /* Deferred close for files */ + queue_delayed_work(deferredclose_wq, + &cfile->deferred, cifs_sb->ctx->acregmax); + cfile->deferred_close_scheduled = true; + spin_unlock(&cinode->deferred_lock); + return 0; + } + spin_unlock(&cinode->deferred_lock); + _cifsFileInfo_put(cfile, true, false); + } else { + _cifsFileInfo_put(cfile, true, false); + kfree(dclose); + } } /* return code from the ->release op is always ignored */ @@ -1917,8 +1990,10 @@ cifs_write(struct cifsFileInfo *open_file, __u32 pid, const char *write_data, if (total_written > 0) { spin_lock(&d_inode(dentry)->i_lock); - if (*offset > d_inode(dentry)->i_size) + if (*offset > d_inode(dentry)->i_size) { i_size_write(d_inode(dentry), *offset); + d_inode(dentry)->i_blocks = (512 - 1 + *offset) >> 9; + } spin_unlock(&d_inode(dentry)->i_lock); } mark_inode_dirty_sync(d_inode(dentry)); @@ -1944,7 +2019,7 @@ struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, if (fsuid_only && !uid_eq(open_file->uid, current_fsuid())) continue; if (OPEN_FMODE(open_file->f_flags) & FMODE_READ) { - if (!open_file->invalidHandle) { + if ((!open_file->invalidHandle)) { /* found a good file */ /* lock it so it will not be closed on us */ cifsFileInfo_get(open_file); @@ -2069,34 +2144,31 @@ cifs_get_writable_path(struct cifs_tcon *tcon, const char *name, int flags, struct cifsFileInfo **ret_file) { - struct list_head *tmp; struct cifsFileInfo *cfile; - struct cifsInodeInfo *cinode; - char *full_path; + void *page = alloc_dentry_path(); *ret_file = NULL; spin_lock(&tcon->open_file_lock); - list_for_each(tmp, &tcon->openFileList) { - cfile = list_entry(tmp, struct cifsFileInfo, - tlist); - full_path = build_path_from_dentry(cfile->dentry); - if (full_path == NULL) { + list_for_each_entry(cfile, &tcon->openFileList, tlist) { + struct cifsInodeInfo *cinode; + const char *full_path = build_path_from_dentry(cfile->dentry, page); + if (IS_ERR(full_path)) { spin_unlock(&tcon->open_file_lock); - return -ENOMEM; + free_dentry_path(page); + return PTR_ERR(full_path); } - if (strcmp(full_path, name)) { - kfree(full_path); + if (strcmp(full_path, name)) continue; - } - kfree(full_path); cinode = CIFS_I(d_inode(cfile->dentry)); spin_unlock(&tcon->open_file_lock); + free_dentry_path(page); return cifs_get_writable_file(cinode, flags, ret_file); } spin_unlock(&tcon->open_file_lock); + free_dentry_path(page); return -ENOENT; } @@ -2104,35 +2176,32 @@ int cifs_get_readable_path(struct cifs_tcon *tcon, const char *name, struct cifsFileInfo **ret_file) { - struct list_head *tmp; struct cifsFileInfo *cfile; - struct cifsInodeInfo *cinode; - char *full_path; + void *page = alloc_dentry_path(); *ret_file = NULL; spin_lock(&tcon->open_file_lock); - list_for_each(tmp, &tcon->openFileList) { - cfile = list_entry(tmp, struct cifsFileInfo, - tlist); - full_path = build_path_from_dentry(cfile->dentry); - if (full_path == NULL) { + list_for_each_entry(cfile, &tcon->openFileList, tlist) { + struct cifsInodeInfo *cinode; + const char *full_path = build_path_from_dentry(cfile->dentry, page); + if (IS_ERR(full_path)) { spin_unlock(&tcon->open_file_lock); - return -ENOMEM; + free_dentry_path(page); + return PTR_ERR(full_path); } - if (strcmp(full_path, name)) { - kfree(full_path); + if (strcmp(full_path, name)) continue; - } - kfree(full_path); cinode = CIFS_I(d_inode(cfile->dentry)); spin_unlock(&tcon->open_file_lock); + free_dentry_path(page); *ret_file = find_readable_file(cinode, 0); return *ret_file ? 0 : -ENOENT; } spin_unlock(&tcon->open_file_lock); + free_dentry_path(page); return -ENOENT; } @@ -2479,6 +2548,8 @@ retry: if (cfile) cifsFileInfo_put(cfile); free_xid(xid); + /* Indication to update ctime and mtime as close is deferred */ + set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags); return rc; } @@ -2580,13 +2651,17 @@ static int cifs_write_end(struct file *file, struct address_space *mapping, if (rc > 0) { spin_lock(&inode->i_lock); - if (pos > inode->i_size) + if (pos > inode->i_size) { i_size_write(inode, pos); + inode->i_blocks = (512 - 1 + pos) >> 9; + } spin_unlock(&inode->i_lock); } unlock_page(page); put_page(page); + /* Indication to update ctime and mtime as close is deferred */ + set_bit(CIFS_INO_MODIFIED_ATTR, &CIFS_I(inode)->flags); return rc; } @@ -4747,6 +4822,8 @@ void cifs_oplock_break(struct work_struct *work) struct TCP_Server_Info *server = tcon->ses->server; int rc = 0; bool purge_cache = false; + bool is_deferred = false; + struct cifs_deferred_close *dclose; wait_on_bit(&cinode->flags, CIFS_INODE_PENDING_WRITERS, TASK_UNINTERRUPTIBLE); @@ -4793,6 +4870,24 @@ oplock_break_ack: cinode); cifs_dbg(FYI, "Oplock release rc = %d\n", rc); } + /* + * When oplock break is received and there are no active + * file handles but cached, then schedule deferred close immediately. + * So, new open will not use cached handle. + */ + spin_lock(&CIFS_I(inode)->deferred_lock); + is_deferred = cifs_is_deferred_close(cfile, &dclose); + if (is_deferred && + cfile->deferred_close_scheduled && + delayed_work_pending(&cfile->deferred)) { + /* + * If there is no pending work, mod_delayed_work queues new work. + * So, Increase the ref count to avoid use-after-free. + */ + if (!mod_delayed_work(deferredclose_wq, &cfile->deferred, 0)) + cifsFileInfo_get(cfile); + } + spin_unlock(&CIFS_I(inode)->deferred_lock); _cifsFileInfo_put(cfile, false /* do not wait for ourself */, false); cifs_done_oplock_break(cinode); } diff --git a/fs/cifs/fs_context.c b/fs/cifs/fs_context.c index 78889024a7ed..92d4ab029c91 100644 --- a/fs/cifs/fs_context.c +++ b/fs/cifs/fs_context.c @@ -137,6 +137,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_u32("min_enc_offload", Opt_min_enc_offload), fsparam_u32("esize", Opt_min_enc_offload), fsparam_u32("bsize", Opt_blocksize), + fsparam_u32("rasize", Opt_rasize), fsparam_u32("rsize", Opt_rsize), fsparam_u32("wsize", Opt_wsize), fsparam_u32("actimeo", Opt_actimeo), @@ -188,8 +189,8 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { {} }; -int -cifs_parse_security_flavors(char *value, struct smb3_fs_context *ctx) +static int +cifs_parse_security_flavors(struct fs_context *fc, char *value, struct smb3_fs_context *ctx) { substring_t args[MAX_OPT_ARGS]; @@ -203,7 +204,7 @@ cifs_parse_security_flavors(char *value, struct smb3_fs_context *ctx) switch (match_token(value, cifs_secflavor_tokens, args)) { case Opt_sec_krb5p: - cifs_dbg(VFS, "sec=krb5p is not supported!\n"); + cifs_errorf(fc, "sec=krb5p is not supported!\n"); return 1; case Opt_sec_krb5i: ctx->sign = true; @@ -238,7 +239,7 @@ cifs_parse_security_flavors(char *value, struct smb3_fs_context *ctx) ctx->nullauth = 1; break; default: - cifs_dbg(VFS, "bad security option: %s\n", value); + cifs_errorf(fc, "bad security option: %s\n", value); return 1; } @@ -254,8 +255,8 @@ static const match_table_t cifs_cacheflavor_tokens = { { Opt_cache_err, NULL } }; -int -cifs_parse_cache_flavor(char *value, struct smb3_fs_context *ctx) +static int +cifs_parse_cache_flavor(struct fs_context *fc, char *value, struct smb3_fs_context *ctx) { substring_t args[MAX_OPT_ARGS]; @@ -291,7 +292,7 @@ cifs_parse_cache_flavor(char *value, struct smb3_fs_context *ctx) ctx->cache_rw = true; break; default: - cifs_dbg(VFS, "bad cache= option: %s\n", value); + cifs_errorf(fc, "bad cache= option: %s\n", value); return 1; } return 0; @@ -339,7 +340,7 @@ smb3_fs_context_dup(struct smb3_fs_context *new_ctx, struct smb3_fs_context *ctx } static int -cifs_parse_smb_version(char *value, struct smb3_fs_context *ctx, bool is_smb3) +cifs_parse_smb_version(struct fs_context *fc, char *value, struct smb3_fs_context *ctx, bool is_smb3) { substring_t args[MAX_OPT_ARGS]; @@ -347,24 +348,24 @@ cifs_parse_smb_version(char *value, struct smb3_fs_context *ctx, bool is_smb3) #ifdef CONFIG_CIFS_ALLOW_INSECURE_LEGACY case Smb_1: if (disable_legacy_dialects) { - cifs_dbg(VFS, "mount with legacy dialect disabled\n"); + cifs_errorf(fc, "mount with legacy dialect disabled\n"); return 1; } if (is_smb3) { - cifs_dbg(VFS, "vers=1.0 (cifs) not permitted when mounting with smb3\n"); + cifs_errorf(fc, "vers=1.0 (cifs) not permitted when mounting with smb3\n"); return 1; } - cifs_dbg(VFS, "Use of the less secure dialect vers=1.0 is not recommended unless required for access to very old servers\n"); + cifs_errorf(fc, "Use of the less secure dialect vers=1.0 is not recommended unless required for access to very old servers\n"); ctx->ops = &smb1_operations; ctx->vals = &smb1_values; break; case Smb_20: if (disable_legacy_dialects) { - cifs_dbg(VFS, "mount with legacy dialect disabled\n"); + cifs_errorf(fc, "mount with legacy dialect disabled\n"); return 1; } if (is_smb3) { - cifs_dbg(VFS, "vers=2.0 not permitted when mounting with smb3\n"); + cifs_errorf(fc, "vers=2.0 not permitted when mounting with smb3\n"); return 1; } ctx->ops = &smb20_operations; @@ -372,10 +373,10 @@ cifs_parse_smb_version(char *value, struct smb3_fs_context *ctx, bool is_smb3) break; #else case Smb_1: - cifs_dbg(VFS, "vers=1.0 (cifs) mount not permitted when legacy dialects disabled\n"); + cifs_errorf(fc, "vers=1.0 (cifs) mount not permitted when legacy dialects disabled\n"); return 1; case Smb_20: - cifs_dbg(VFS, "vers=2.0 mount not permitted when legacy dialects disabled\n"); + cifs_errorf(fc, "vers=2.0 mount not permitted when legacy dialects disabled\n"); return 1; #endif /* CIFS_ALLOW_INSECURE_LEGACY */ case Smb_21: @@ -403,7 +404,7 @@ cifs_parse_smb_version(char *value, struct smb3_fs_context *ctx, bool is_smb3) ctx->vals = &smbdefault_values; break; default: - cifs_dbg(VFS, "Unknown vers= option specified: %s\n", value); + cifs_errorf(fc, "Unknown vers= option specified: %s\n", value); return 1; } return 0; @@ -430,7 +431,7 @@ int smb3_parse_opt(const char *options, const char *key, char **val) if (nval == p) continue; *nval++ = 0; - *val = kstrndup(nval, strlen(nval), GFP_KERNEL); + *val = kstrdup(nval, GFP_KERNEL); rc = !*val ? -ENOMEM : 0; goto out; } @@ -475,6 +476,7 @@ smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx) /* move "pos" up to delimiter or NULL */ pos += len; + kfree(ctx->UNC); ctx->UNC = kstrndup(devname, pos - devname, GFP_KERNEL); if (!ctx->UNC) return -ENOMEM; @@ -485,6 +487,9 @@ smb3_parse_devname(const char *devname, struct smb3_fs_context *ctx) if (*pos == '/' || *pos == '\\') pos++; + kfree(ctx->prepath); + ctx->prepath = NULL; + /* If pos is NULL then no prepath */ if (!*pos) return 0; @@ -588,14 +593,14 @@ static int smb3_fs_context_validate(struct fs_context *fc) struct smb3_fs_context *ctx = smb3_fc2context(fc); if (ctx->rdma && ctx->vals->protocol_id < SMB30_PROT_ID) { - cifs_dbg(VFS, "SMB Direct requires Version >=3.0\n"); + cifs_errorf(fc, "SMB Direct requires Version >=3.0\n"); return -EOPNOTSUPP; } #ifndef CONFIG_KEYS /* Muliuser mounts require CONFIG_KEYS support */ if (ctx->multiuser) { - cifs_dbg(VFS, "Multiuser mounts require kernels with CONFIG_KEYS enabled\n"); + cifs_errorf(fc, "Multiuser mounts require kernels with CONFIG_KEYS enabled\n"); return -1; } #endif @@ -605,13 +610,13 @@ static int smb3_fs_context_validate(struct fs_context *fc) if (!ctx->UNC) { - cifs_dbg(VFS, "CIFS mount error: No usable UNC path provided in device string!\n"); + cifs_errorf(fc, "CIFS mount error: No usable UNC path provided in device string!\n"); return -1; } /* make sure UNC has a share name */ if (strlen(ctx->UNC) < 3 || !strchr(ctx->UNC + 3, '\\')) { - cifs_dbg(VFS, "Malformed UNC. Unable to find share name.\n"); + cifs_errorf(fc, "Malformed UNC. Unable to find share name.\n"); return -ENOENT; } @@ -684,49 +689,50 @@ static void smb3_fs_context_free(struct fs_context *fc) * Compare the old and new proposed context during reconfigure * and check if the changes are compatible. */ -static int smb3_verify_reconfigure_ctx(struct smb3_fs_context *new_ctx, +static int smb3_verify_reconfigure_ctx(struct fs_context *fc, + struct smb3_fs_context *new_ctx, struct smb3_fs_context *old_ctx) { if (new_ctx->posix_paths != old_ctx->posix_paths) { - cifs_dbg(VFS, "can not change posixpaths during remount\n"); + cifs_errorf(fc, "can not change posixpaths during remount\n"); return -EINVAL; } if (new_ctx->sectype != old_ctx->sectype) { - cifs_dbg(VFS, "can not change sec during remount\n"); + cifs_errorf(fc, "can not change sec during remount\n"); return -EINVAL; } if (new_ctx->multiuser != old_ctx->multiuser) { - cifs_dbg(VFS, "can not change multiuser during remount\n"); + cifs_errorf(fc, "can not change multiuser during remount\n"); return -EINVAL; } if (new_ctx->UNC && (!old_ctx->UNC || strcmp(new_ctx->UNC, old_ctx->UNC))) { - cifs_dbg(VFS, "can not change UNC during remount\n"); + cifs_errorf(fc, "can not change UNC during remount\n"); return -EINVAL; } if (new_ctx->username && (!old_ctx->username || strcmp(new_ctx->username, old_ctx->username))) { - cifs_dbg(VFS, "can not change username during remount\n"); + cifs_errorf(fc, "can not change username during remount\n"); return -EINVAL; } if (new_ctx->password && (!old_ctx->password || strcmp(new_ctx->password, old_ctx->password))) { - cifs_dbg(VFS, "can not change password during remount\n"); + cifs_errorf(fc, "can not change password during remount\n"); return -EINVAL; } if (new_ctx->domainname && (!old_ctx->domainname || strcmp(new_ctx->domainname, old_ctx->domainname))) { - cifs_dbg(VFS, "can not change domainname during remount\n"); + cifs_errorf(fc, "can not change domainname during remount\n"); return -EINVAL; } if (new_ctx->nodename && (!old_ctx->nodename || strcmp(new_ctx->nodename, old_ctx->nodename))) { - cifs_dbg(VFS, "can not change nodename during remount\n"); + cifs_errorf(fc, "can not change nodename during remount\n"); return -EINVAL; } if (new_ctx->iocharset && (!old_ctx->iocharset || strcmp(new_ctx->iocharset, old_ctx->iocharset))) { - cifs_dbg(VFS, "can not change iocharset during remount\n"); + cifs_errorf(fc, "can not change iocharset during remount\n"); return -EINVAL; } @@ -747,7 +753,7 @@ static int smb3_reconfigure(struct fs_context *fc) struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb); int rc; - rc = smb3_verify_reconfigure_ctx(ctx, cifs_sb->ctx); + rc = smb3_verify_reconfigure_ctx(fc, ctx, cifs_sb->ctx); if (rc) return rc; @@ -933,13 +939,33 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, */ if ((result.uint_32 < CIFS_MAX_MSGSIZE) || (result.uint_32 > (4 * SMB3_DEFAULT_IOSIZE))) { - cifs_dbg(VFS, "%s: Invalid blocksize\n", + cifs_errorf(fc, "%s: Invalid blocksize\n", __func__); goto cifs_parse_mount_err; } ctx->bsize = result.uint_32; ctx->got_bsize = true; break; + case Opt_rasize: + /* + * readahead size realistically should never need to be + * less than 1M (CIFS_DEFAULT_IOSIZE) or greater than 32M + * (perhaps an exception should be considered in the + * for the case of a large number of channels + * when multichannel is negotiated) since that would lead + * to plenty of parallel I/O in flight to the server. + * Note that smaller read ahead sizes would + * hurt performance of common tools like cp and scp + * which often trigger sequential i/o with read ahead + */ + if ((result.uint_32 > (8 * SMB3_DEFAULT_IOSIZE)) || + (result.uint_32 < CIFS_DEFAULT_IOSIZE)) { + cifs_errorf(fc, "%s: Invalid rasize %d vs. %d\n", + __func__, result.uint_32, SMB3_DEFAULT_IOSIZE); + goto cifs_parse_mount_err; + } + ctx->rasize = result.uint_32; + break; case Opt_rsize: ctx->rsize = result.uint_32; ctx->got_rsize = true; @@ -951,25 +977,25 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, case Opt_acregmax: ctx->acregmax = HZ * result.uint_32; if (ctx->acregmax > CIFS_MAX_ACTIMEO) { - cifs_dbg(VFS, "acregmax too large\n"); + cifs_errorf(fc, "acregmax too large\n"); goto cifs_parse_mount_err; } break; case Opt_acdirmax: ctx->acdirmax = HZ * result.uint_32; if (ctx->acdirmax > CIFS_MAX_ACTIMEO) { - cifs_dbg(VFS, "acdirmax too large\n"); + cifs_errorf(fc, "acdirmax too large\n"); goto cifs_parse_mount_err; } break; case Opt_actimeo: if (HZ * result.uint_32 > CIFS_MAX_ACTIMEO) { - cifs_dbg(VFS, "timeout too large\n"); + cifs_errorf(fc, "timeout too large\n"); goto cifs_parse_mount_err; } if ((ctx->acdirmax != CIFS_DEF_ACTIMEO) || (ctx->acregmax != CIFS_DEF_ACTIMEO)) { - cifs_dbg(VFS, "actimeo ignored since acregmax or acdirmax specified\n"); + cifs_errorf(fc, "actimeo ignored since acregmax or acdirmax specified\n"); break; } ctx->acdirmax = ctx->acregmax = HZ * result.uint_32; @@ -982,7 +1008,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, break; case Opt_max_credits: if (result.uint_32 < 20 || result.uint_32 > 60000) { - cifs_dbg(VFS, "%s: Invalid max_credits value\n", + cifs_errorf(fc, "%s: Invalid max_credits value\n", __func__); goto cifs_parse_mount_err; } @@ -990,16 +1016,19 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, break; case Opt_max_channels: if (result.uint_32 < 1 || result.uint_32 > CIFS_MAX_CHANNELS) { - cifs_dbg(VFS, "%s: Invalid max_channels value, needs to be 1-%d\n", + cifs_errorf(fc, "%s: Invalid max_channels value, needs to be 1-%d\n", __func__, CIFS_MAX_CHANNELS); goto cifs_parse_mount_err; } ctx->max_channels = result.uint_32; + /* If more than one channel requested ... they want multichan */ + if (result.uint_32 > 1) + ctx->multichannel = true; break; case Opt_handletimeout: ctx->handle_timeout = result.uint_32; if (ctx->handle_timeout > SMB3_MAX_HANDLE_TIMEOUT) { - cifs_dbg(VFS, "Invalid handle cache timeout, longer than 16 minutes\n"); + cifs_errorf(fc, "Invalid handle cache timeout, longer than 16 minutes\n"); goto cifs_parse_mount_err; } break; @@ -1010,23 +1039,23 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, case 0: break; case -ENOMEM: - cifs_dbg(VFS, "Unable to allocate memory for devname\n"); + cifs_errorf(fc, "Unable to allocate memory for devname\n"); goto cifs_parse_mount_err; case -EINVAL: - cifs_dbg(VFS, "Malformed UNC in devname\n"); + cifs_errorf(fc, "Malformed UNC in devname\n"); goto cifs_parse_mount_err; default: - cifs_dbg(VFS, "Unknown error parsing devname\n"); + cifs_errorf(fc, "Unknown error parsing devname\n"); goto cifs_parse_mount_err; } ctx->source = kstrdup(param->string, GFP_KERNEL); if (ctx->source == NULL) { - cifs_dbg(VFS, "OOM when copying UNC string\n"); + cifs_errorf(fc, "OOM when copying UNC string\n"); goto cifs_parse_mount_err; } fc->source = kstrdup(param->string, GFP_KERNEL); if (fc->source == NULL) { - cifs_dbg(VFS, "OOM when copying UNC string\n"); + cifs_errorf(fc, "OOM when copying UNC string\n"); goto cifs_parse_mount_err; } break; @@ -1046,7 +1075,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, } ctx->username = kstrdup(param->string, GFP_KERNEL); if (ctx->username == NULL) { - cifs_dbg(VFS, "OOM when copying username string\n"); + cifs_errorf(fc, "OOM when copying username string\n"); goto cifs_parse_mount_err; } break; @@ -1058,7 +1087,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->password = kstrdup(param->string, GFP_KERNEL); if (ctx->password == NULL) { - cifs_dbg(VFS, "OOM when copying password string\n"); + cifs_errorf(fc, "OOM when copying password string\n"); goto cifs_parse_mount_err; } break; @@ -1085,7 +1114,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, kfree(ctx->domainname); ctx->domainname = kstrdup(param->string, GFP_KERNEL); if (ctx->domainname == NULL) { - cifs_dbg(VFS, "OOM when copying domainname string\n"); + cifs_errorf(fc, "OOM when copying domainname string\n"); goto cifs_parse_mount_err; } cifs_dbg(FYI, "Domain name set\n"); @@ -1109,14 +1138,14 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, kfree(ctx->iocharset); ctx->iocharset = kstrdup(param->string, GFP_KERNEL); if (ctx->iocharset == NULL) { - cifs_dbg(VFS, "OOM when copying iocharset string\n"); + cifs_errorf(fc, "OOM when copying iocharset string\n"); goto cifs_parse_mount_err; } } /* if iocharset not set then load_nls_default * is used by caller */ - cifs_dbg(FYI, "iocharset set to %s\n", ctx->iocharset); + cifs_dbg(FYI, "iocharset set to %s\n", ctx->iocharset); break; case Opt_netbiosname: memset(ctx->source_rfc1001_name, 0x20, @@ -1175,21 +1204,21 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, goto cifs_parse_mount_err; case Opt_vers: /* protocol version (dialect) */ - if (cifs_parse_smb_version(param->string, ctx, is_smb3) != 0) + if (cifs_parse_smb_version(fc, param->string, ctx, is_smb3) != 0) goto cifs_parse_mount_err; ctx->got_version = true; break; case Opt_sec: - if (cifs_parse_security_flavors(param->string, ctx) != 0) + if (cifs_parse_security_flavors(fc, param->string, ctx) != 0) goto cifs_parse_mount_err; break; case Opt_cache: - if (cifs_parse_cache_flavor(param->string, ctx) != 0) + if (cifs_parse_cache_flavor(fc, param->string, ctx) != 0) goto cifs_parse_mount_err; break; case Opt_witness: #ifndef CONFIG_CIFS_SWN_UPCALL - cifs_dbg(VFS, "Witness support needs CONFIG_CIFS_SWN_UPCALL config option\n"); + cifs_errorf(fc, "Witness support needs CONFIG_CIFS_SWN_UPCALL config option\n"); goto cifs_parse_mount_err; #endif ctx->witness = true; @@ -1290,7 +1319,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, break; case Opt_fsc: #ifndef CONFIG_CIFS_FSCACHE - cifs_dbg(VFS, "FS-Cache support needs CONFIG_CIFS_FSCACHE kernel config option set\n"); + cifs_errorf(fc, "FS-Cache support needs CONFIG_CIFS_FSCACHE kernel config option set\n"); goto cifs_parse_mount_err; #endif ctx->fsc = true; @@ -1311,15 +1340,13 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, if (result.negated) { ctx->nopersistent = true; if (ctx->persistent) { - cifs_dbg(VFS, - "persistenthandles mount options conflict\n"); + cifs_errorf(fc, "persistenthandles mount options conflict\n"); goto cifs_parse_mount_err; } } else { ctx->persistent = true; if ((ctx->nopersistent) || (ctx->resilient)) { - cifs_dbg(VFS, - "persistenthandles mount options conflict\n"); + cifs_errorf(fc, "persistenthandles mount options conflict\n"); goto cifs_parse_mount_err; } } @@ -1330,8 +1357,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, } else { ctx->resilient = true; if (ctx->persistent) { - cifs_dbg(VFS, - "persistenthandles mount options conflict\n"); + cifs_errorf(fc, "persistenthandles mount options conflict\n"); goto cifs_parse_mount_err; } } @@ -1379,7 +1405,9 @@ int smb3_init_fs_context(struct fs_context *fc) ctx->cred_uid = current_uid(); ctx->linux_uid = current_uid(); ctx->linux_gid = current_gid(); - ctx->bsize = 1024 * 1024; /* can improve cp performance significantly */ + /* By default 4MB read ahead size, 1MB block size */ + ctx->bsize = CIFS_DEFAULT_IOSIZE; /* can improve cp performance significantly */ + ctx->rasize = 0; /* 0 = use default (ie negotiated rsize) for read ahead pages */ /* * default to SFM style remapping of seven reserved characters @@ -1621,6 +1649,7 @@ void smb3_update_mnt_flags(struct cifs_sb_info *cifs_sb) cifs_dbg(VFS, "mount options mfsymlinks and sfu both enabled\n"); } } + cifs_sb->mnt_cifs_flags &= ~CIFS_MOUNT_SHUTDOWN; return; } diff --git a/fs/cifs/fs_context.h b/fs/cifs/fs_context.h index 87dd1f7168f2..2a71c8e411ac 100644 --- a/fs/cifs/fs_context.h +++ b/fs/cifs/fs_context.h @@ -13,7 +13,12 @@ #include <linux/parser.h> #include <linux/fs_parser.h> -#define cifs_invalf(fc, fmt, ...) invalf(fc, fmt, ## __VA_ARGS__) +/* Log errors in fs_context (new mount api) but also in dmesg (old style) */ +#define cifs_errorf(fc, fmt, ...) \ + do { \ + errorf(fc, fmt, ## __VA_ARGS__); \ + cifs_dbg(VFS, fmt, ## __VA_ARGS__); \ + } while (0) enum smb_version { Smb_1 = 1, @@ -115,6 +120,7 @@ enum cifs_param { Opt_dirmode, Opt_min_enc_offload, Opt_blocksize, + Opt_rasize, Opt_rsize, Opt_wsize, Opt_actimeo, @@ -230,6 +236,7 @@ struct smb3_fs_context { /* reuse existing guid for multichannel */ u8 client_guid[SMB2_CLIENT_GUID_SIZE]; unsigned int bsize; + unsigned int rasize; unsigned int rsize; unsigned int wsize; unsigned int min_offload; @@ -257,10 +264,6 @@ struct smb3_fs_context { extern const struct fs_parameter_spec smb3_fs_parameters[]; -extern int cifs_parse_cache_flavor(char *value, - struct smb3_fs_context *ctx); -extern int cifs_parse_security_flavors(char *value, - struct smb3_fs_context *ctx); extern int smb3_init_fs_context(struct fs_context *fc); extern void smb3_cleanup_fs_context_contents(struct smb3_fs_context *ctx); extern void smb3_cleanup_fs_context(struct smb3_fs_context *ctx); diff --git a/fs/cifs/inode.c b/fs/cifs/inode.c index f2df4422e54a..1dfa57982522 100644 --- a/fs/cifs/inode.c +++ b/fs/cifs/inode.c @@ -26,7 +26,6 @@ #include <linux/sched/signal.h> #include <linux/wait_bit.h> #include <linux/fiemap.h> - #include <asm/div64.h> #include "cifsfs.h" #include "cifspdu.h" @@ -38,7 +37,7 @@ #include "cifs_unicode.h" #include "fscache.h" #include "fs_context.h" - +#include "cifs_ioctl.h" static void cifs_set_ops(struct inode *inode) { @@ -157,12 +156,18 @@ cifs_nlink_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) } /* populate an inode with info from a cifs_fattr struct */ -void +int cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) { struct cifsInodeInfo *cifs_i = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + if (!(inode->i_state & I_NEW) && + unlikely(inode_wrong_type(inode, fattr->cf_mode))) { + CIFS_I(inode)->time = 0; /* force reval */ + return -ESTALE; + } + cifs_revalidate_cache(inode, fattr); spin_lock(&inode->i_lock); @@ -219,6 +224,7 @@ cifs_fattr_to_inode(struct inode *inode, struct cifs_fattr *fattr) inode->i_flags |= S_AUTOMOUNT; if (inode->i_state & I_NEW) cifs_set_ops(inode); + return 0; } void @@ -363,7 +369,7 @@ cifs_get_file_info_unix(struct file *filp) rc = 0; } - cifs_fattr_to_inode(inode, &fattr); + rc = cifs_fattr_to_inode(inode, &fattr); free_xid(xid); return rc; } @@ -426,14 +432,7 @@ int cifs_get_inode_info_unix(struct inode **pinode, } /* if filetype is different, return error */ - if (unlikely(((*pinode)->i_mode & S_IFMT) != - (fattr.cf_mode & S_IFMT))) { - CIFS_I(*pinode)->time = 0; /* force reval */ - rc = -ESTALE; - goto cgiiu_exit; - } - - cifs_fattr_to_inode(*pinode, &fattr); + rc = cifs_fattr_to_inode(*pinode, &fattr); } cgiiu_exit: @@ -783,7 +782,8 @@ cifs_get_file_info(struct file *filp) */ fattr.cf_uniqueid = CIFS_I(inode)->uniqueid; fattr.cf_flags |= CIFS_FATTR_NEED_REVAL; - cifs_fattr_to_inode(inode, &fattr); + /* if filetype is different, return error */ + rc = cifs_fattr_to_inode(inode, &fattr); cgfi_exit: free_xid(xid); return rc; @@ -1100,16 +1100,8 @@ handle_mnt_opt: rc = -ESTALE; goto out; } - /* if filetype is different, return error */ - if (unlikely(((*inode)->i_mode & S_IFMT) != - (fattr.cf_mode & S_IFMT))) { - CIFS_I(*inode)->time = 0; /* force reval */ - rc = -ESTALE; - goto out; - } - - cifs_fattr_to_inode(*inode, &fattr); + rc = cifs_fattr_to_inode(*inode, &fattr); } out: cifs_buf_release(smb1_backup_rsp_buf); @@ -1215,14 +1207,7 @@ smb311_posix_get_inode_info(struct inode **inode, } /* if filetype is different, return error */ - if (unlikely(((*inode)->i_mode & S_IFMT) != - (fattr.cf_mode & S_IFMT))) { - CIFS_I(*inode)->time = 0; /* force reval */ - rc = -ESTALE; - goto out; - } - - cifs_fattr_to_inode(*inode, &fattr); + rc = cifs_fattr_to_inode(*inode, &fattr); } out: cifs_put_tlink(tlink); @@ -1249,7 +1234,7 @@ cifs_find_inode(struct inode *inode, void *opaque) return 0; /* don't match inode of different type */ - if ((inode->i_mode & S_IFMT) != (fattr->cf_mode & S_IFMT)) + if (inode_wrong_type(inode, fattr->cf_mode)) return 0; /* if it's not a directory or has no dentries, then flag it */ @@ -1317,6 +1302,7 @@ retry_iget5_locked: } } + /* can't fail - see cifs_find_inode() */ cifs_fattr_to_inode(inode, fattr); if (sb->s_flags & SB_NOATIME) inode->i_flags |= S_NOATIME | S_NOCMTIME; @@ -1408,7 +1394,7 @@ out: int cifs_set_file_info(struct inode *inode, struct iattr *attrs, unsigned int xid, - char *full_path, __u32 dosattr) + const char *full_path, __u32 dosattr) { bool set_time = false; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -1609,7 +1595,8 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) { int rc = 0; unsigned int xid; - char *full_path = NULL; + const char *full_path; + void *page; struct inode *inode = d_inode(dentry); struct cifsInodeInfo *cifs_inode; struct super_block *sb = dir->i_sb; @@ -1622,6 +1609,9 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) cifs_dbg(FYI, "cifs_unlink, dir=0x%p, dentry=0x%p\n", dir, dentry); + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); @@ -1629,6 +1619,7 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) server = tcon->ses->server; xid = get_xid(); + page = alloc_dentry_path(); if (tcon->nodelete) { rc = -EACCES; @@ -1637,12 +1628,13 @@ int cifs_unlink(struct inode *dir, struct dentry *dentry) /* Unlink can be called from rename so we can not take the * sb->s_vfs_rename_mutex here */ - full_path = build_path_from_dentry(dentry); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(dentry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto unlink_out; } + cifs_close_all_deferred_files(tcon); if (cap_unix(tcon->ses) && (CIFS_UNIX_POSIX_PATH_OPS_CAP & le64_to_cpu(tcon->fsUnixInfo.Capability))) { rc = CIFSPOSIXDelFile(xid, tcon, full_path, @@ -1713,7 +1705,7 @@ out_reval: cifs_inode = CIFS_I(dir); CIFS_I(dir)->time = 0; /* force revalidate of dir as well */ unlink_out: - kfree(full_path); + free_dentry_path(page); kfree(attrs); free_xid(xid); cifs_put_tlink(tlink); @@ -1740,6 +1732,16 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, if (rc) return rc; + if (!S_ISDIR(inode->i_mode)) { + /* + * mkdir succeeded, but another client has managed to remove the + * sucker and replace it with non-directory. Return success, + * but don't leave the child in dcache. + */ + iput(inode); + d_drop(dentry); + return 0; + } /* * setting nlink not necessary except in cases where we failed to get it * from the server or was set bogus. Also, since this is a brand new @@ -1791,7 +1793,7 @@ cifs_mkdir_qinfo(struct inode *parent, struct dentry *dentry, umode_t mode, } } d_instantiate(dentry, inode); - return rc; + return 0; } static int @@ -1866,12 +1868,15 @@ int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode, struct tcon_link *tlink; struct cifs_tcon *tcon; struct TCP_Server_Info *server; - char *full_path; + const char *full_path; + void *page; cifs_dbg(FYI, "In cifs_mkdir, mode = %04ho inode = 0x%p\n", mode, inode); cifs_sb = CIFS_SB(inode->i_sb); + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); @@ -1879,9 +1884,10 @@ int cifs_mkdir(struct user_namespace *mnt_userns, struct inode *inode, xid = get_xid(); - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { - rc = -ENOMEM; + page = alloc_dentry_path(); + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto mkdir_out; } @@ -1924,7 +1930,7 @@ mkdir_out: * attributes are invalid now. */ CIFS_I(inode)->time = 0; - kfree(full_path); + free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); return rc; @@ -1938,20 +1944,26 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) struct tcon_link *tlink; struct cifs_tcon *tcon; struct TCP_Server_Info *server; - char *full_path = NULL; + const char *full_path; + void *page = alloc_dentry_path(); struct cifsInodeInfo *cifsInode; cifs_dbg(FYI, "cifs_rmdir, inode = 0x%p\n", inode); xid = get_xid(); - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto rmdir_exit; } cifs_sb = CIFS_SB(inode->i_sb); + if (unlikely(cifs_forced_shutdown(cifs_sb))) { + rc = -EIO; + goto rmdir_exit; + } + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) { rc = PTR_ERR(tlink); @@ -1997,7 +2009,7 @@ int cifs_rmdir(struct inode *inode, struct dentry *direntry) current_time(inode); rmdir_exit: - kfree(full_path); + free_dentry_path(page); free_xid(xid); return rc; } @@ -2072,8 +2084,8 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir, struct dentry *source_dentry, struct inode *target_dir, struct dentry *target_dentry, unsigned int flags) { - char *from_name = NULL; - char *to_name = NULL; + const char *from_name, *to_name; + void *page1, *page2; struct cifs_sb_info *cifs_sb; struct tcon_link *tlink; struct cifs_tcon *tcon; @@ -2086,29 +2098,31 @@ cifs_rename2(struct user_namespace *mnt_userns, struct inode *source_dir, return -EINVAL; cifs_sb = CIFS_SB(source_dir->i_sb); + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); tcon = tlink_tcon(tlink); + page1 = alloc_dentry_path(); + page2 = alloc_dentry_path(); xid = get_xid(); - /* - * we already have the rename sem so we do not need to - * grab it again here to protect the path integrity - */ - from_name = build_path_from_dentry(source_dentry); - if (from_name == NULL) { - rc = -ENOMEM; + from_name = build_path_from_dentry(source_dentry, page1); + if (IS_ERR(from_name)) { + rc = PTR_ERR(from_name); goto cifs_rename_exit; } - to_name = build_path_from_dentry(target_dentry); - if (to_name == NULL) { - rc = -ENOMEM; + to_name = build_path_from_dentry(target_dentry, page2); + if (IS_ERR(to_name)) { + rc = PTR_ERR(to_name); goto cifs_rename_exit; } + cifs_close_all_deferred_files(tcon); rc = cifs_do_rename(xid, source_dentry, from_name, target_dentry, to_name); @@ -2177,18 +2191,21 @@ unlink_target: cifs_rename_exit: kfree(info_buf_source); - kfree(from_name); - kfree(to_name); + free_dentry_path(page2); + free_dentry_path(page1); free_xid(xid); cifs_put_tlink(tlink); return rc; } static bool -cifs_inode_needs_reval(struct inode *inode) +cifs_dentry_needs_reval(struct dentry *dentry) { + struct inode *inode = d_inode(dentry); struct cifsInodeInfo *cifs_i = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); + struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); + struct cached_fid *cfid = NULL; if (cifs_i->time == 0) return true; @@ -2199,6 +2216,16 @@ cifs_inode_needs_reval(struct inode *inode) if (!lookupCacheEnabled) return true; + if (!open_cached_dir_by_dentry(tcon, dentry->d_parent, &cfid)) { + mutex_lock(&cfid->fid_mutex); + if (cfid->time && cifs_i->time > cfid->time) { + mutex_unlock(&cfid->fid_mutex); + close_cached_dir(cfid); + return false; + } + mutex_unlock(&cfid->fid_mutex); + close_cached_dir(cfid); + } /* * depending on inode type, check if attribute caching disabled for * files or directories @@ -2297,10 +2324,10 @@ cifs_zap_mapping(struct inode *inode) int cifs_revalidate_file_attr(struct file *filp) { int rc = 0; - struct inode *inode = file_inode(filp); + struct dentry *dentry = file_dentry(filp); struct cifsFileInfo *cfile = (struct cifsFileInfo *) filp->private_data; - if (!cifs_inode_needs_reval(inode)) + if (!cifs_dentry_needs_reval(dentry)) return rc; if (tlink_tcon(cfile->tlink)->unix_ext) @@ -2317,22 +2344,22 @@ int cifs_revalidate_dentry_attr(struct dentry *dentry) int rc = 0; struct inode *inode = d_inode(dentry); struct super_block *sb = dentry->d_sb; - char *full_path = NULL; + const char *full_path; + void *page; int count = 0; if (inode == NULL) return -ENOENT; - if (!cifs_inode_needs_reval(inode)) + if (!cifs_dentry_needs_reval(dentry)) return rc; xid = get_xid(); - /* can not safely grab the rename sem here if rename calls revalidate - since that would deadlock */ - full_path = build_path_from_dentry(dentry); - if (full_path == NULL) { - rc = -ENOMEM; + page = alloc_dentry_path(); + full_path = build_path_from_dentry(dentry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto out; } @@ -2351,7 +2378,7 @@ again: if (rc == -EAGAIN && count++ < 10) goto again; out: - kfree(full_path); + free_dentry_path(page); free_xid(xid); return rc; @@ -2391,6 +2418,9 @@ int cifs_getattr(struct user_namespace *mnt_userns, const struct path *path, struct inode *inode = d_inode(dentry); int rc; + if (unlikely(cifs_forced_shutdown(CIFS_SB(inode->i_sb)))) + return -EIO; + /* * We need to be sure that all dirty pages are written and the server * has actual ctime, mtime and file length. @@ -2463,6 +2493,9 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start, struct cifsFileInfo *cfile; int rc; + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; + /* * We need to be sure that all dirty pages are written as they * might fill holes on the server. @@ -2522,7 +2555,7 @@ void cifs_setsize(struct inode *inode, loff_t offset) static int cifs_set_file_size(struct inode *inode, struct iattr *attrs, - unsigned int xid, char *full_path) + unsigned int xid, const char *full_path) { int rc; struct cifsFileInfo *open_file; @@ -2613,7 +2646,8 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) { int rc; unsigned int xid; - char *full_path = NULL; + const char *full_path; + void *page = alloc_dentry_path(); struct inode *inode = d_inode(direntry); struct cifsInodeInfo *cifsInode = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); @@ -2634,9 +2668,9 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) if (rc < 0) goto out; - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto out; } @@ -2748,7 +2782,7 @@ cifs_setattr_unix(struct dentry *direntry, struct iattr *attrs) cifsInode->time = 0; out: kfree(args); - kfree(full_path); + free_dentry_path(page); free_xid(xid); return rc; } @@ -2764,7 +2798,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) struct cifsInodeInfo *cifsInode = CIFS_I(inode); struct cifsFileInfo *wfile; struct cifs_tcon *tcon; - char *full_path = NULL; + const char *full_path; + void *page = alloc_dentry_path(); int rc = -EACCES; __u32 dosattr = 0; __u64 mode = NO_CHANGE_64; @@ -2778,16 +2813,13 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) attrs->ia_valid |= ATTR_FORCE; rc = setattr_prepare(&init_user_ns, direntry, attrs); - if (rc < 0) { - free_xid(xid); - return rc; - } + if (rc < 0) + goto cifs_setattr_exit; - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { - rc = -ENOMEM; - free_xid(xid); - return rc; + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); + goto cifs_setattr_exit; } /* @@ -2937,8 +2969,8 @@ cifs_setattr_nounix(struct dentry *direntry, struct iattr *attrs) mark_inode_dirty(inode); cifs_setattr_exit: - kfree(full_path); free_xid(xid); + free_dentry_path(page); return rc; } @@ -2950,6 +2982,9 @@ cifs_setattr(struct user_namespace *mnt_userns, struct dentry *direntry, struct cifs_tcon *pTcon = cifs_sb_master_tcon(cifs_sb); int rc, retries = 0; + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; + do { if (pTcon->unix_ext) rc = cifs_setattr_unix(direntry, attrs); @@ -2961,12 +2996,3 @@ cifs_setattr(struct user_namespace *mnt_userns, struct dentry *direntry, /* BB: add cifs_setattr_legacy for really old servers */ return rc; } - -#if 0 -void cifs_delete_inode(struct inode *inode) -{ - cifs_dbg(FYI, "In cifs_delete_inode, inode = 0x%p\n", inode); - /* may have to add back in if and when safe distributed caching of - directories added e.g. via FindNotify */ -} -#endif diff --git a/fs/cifs/ioctl.c b/fs/cifs/ioctl.c index dcde44ff6cf9..d67d281ab863 100644 --- a/fs/cifs/ioctl.c +++ b/fs/cifs/ioctl.c @@ -33,6 +33,7 @@ #include "cifsfs.h" #include "cifs_ioctl.h" #include "smb2proto.h" +#include "smb2glob.h" #include <linux/btrfs.h> static long cifs_ioctl_query_info(unsigned int xid, struct file *filep, @@ -42,13 +43,16 @@ static long cifs_ioctl_query_info(unsigned int xid, struct file *filep, struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); struct dentry *dentry = filep->f_path.dentry; - unsigned char *path; + const unsigned char *path; + void *page = alloc_dentry_path(); __le16 *utf16_path = NULL, root_path; int rc = 0; - path = build_path_from_dentry(dentry); - if (path == NULL) - return -ENOMEM; + path = build_path_from_dentry(dentry, page); + if (IS_ERR(path)) { + free_dentry_path(page); + return PTR_ERR(path); + } cifs_dbg(FYI, "%s %s\n", __func__, path); @@ -73,7 +77,7 @@ static long cifs_ioctl_query_info(unsigned int xid, struct file *filep, ici_exit: if (utf16_path != &root_path) kfree(utf16_path); - kfree(path); + free_dentry_path(page); return rc; } @@ -161,6 +165,164 @@ static long smb_mnt_get_fsinfo(unsigned int xid, struct cifs_tcon *tcon, return rc; } +static int cifs_shutdown(struct super_block *sb, unsigned long arg) +{ + struct cifs_sb_info *sbi = CIFS_SB(sb); + __u32 flags; + + if (!capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (get_user(flags, (__u32 __user *)arg)) + return -EFAULT; + + if (flags > CIFS_GOING_FLAGS_NOLOGFLUSH) + return -EINVAL; + + if (cifs_forced_shutdown(sbi)) + return 0; + + cifs_dbg(VFS, "shut down requested (%d)", flags); +/* trace_cifs_shutdown(sb, flags);*/ + + /* + * see: + * https://man7.org/linux/man-pages/man2/ioctl_xfs_goingdown.2.html + * for more information and description of original intent of the flags + */ + switch (flags) { + /* + * We could add support later for default flag which requires: + * "Flush all dirty data and metadata to disk" + * would need to call syncfs or equivalent to flush page cache for + * the mount and then issue fsync to server (if nostrictsync not set) + */ + case CIFS_GOING_FLAGS_DEFAULT: + cifs_dbg(FYI, "shutdown with default flag not supported\n"); + return -EINVAL; + /* + * FLAGS_LOGFLUSH is easy since it asks to write out metadata (not + * data) but metadata writes are not cached on the client, so can treat + * it similarly to NOLOGFLUSH + */ + case CIFS_GOING_FLAGS_LOGFLUSH: + case CIFS_GOING_FLAGS_NOLOGFLUSH: + sbi->mnt_cifs_flags |= CIFS_MOUNT_SHUTDOWN; + return 0; + default: + return -EINVAL; + } + return 0; +} + +static int cifs_dump_full_key(struct cifs_tcon *tcon, struct smb3_full_key_debug_info __user *in) +{ + struct smb3_full_key_debug_info out; + struct cifs_ses *ses; + int rc = 0; + bool found = false; + u8 __user *end; + + if (!smb3_encryption_required(tcon)) { + rc = -EOPNOTSUPP; + goto out; + } + + /* copy user input into our output buffer */ + if (copy_from_user(&out, in, sizeof(out))) { + rc = -EINVAL; + goto out; + } + + if (!out.session_id) { + /* if ses id is 0, use current user session */ + ses = tcon->ses; + } else { + /* otherwise if a session id is given, look for it in all our sessions */ + struct cifs_ses *ses_it = NULL; + struct TCP_Server_Info *server_it = NULL; + + spin_lock(&cifs_tcp_ses_lock); + list_for_each_entry(server_it, &cifs_tcp_ses_list, tcp_ses_list) { + list_for_each_entry(ses_it, &server_it->smb_ses_list, smb_ses_list) { + if (ses_it->Suid == out.session_id) { + ses = ses_it; + /* + * since we are using the session outside the crit + * section, we need to make sure it won't be released + * so increment its refcount + */ + ses->ses_count++; + found = true; + goto search_end; + } + } + } +search_end: + spin_unlock(&cifs_tcp_ses_lock); + if (!found) { + rc = -ENOENT; + goto out; + } + } + + switch (ses->server->cipher_type) { + case SMB2_ENCRYPTION_AES128_CCM: + case SMB2_ENCRYPTION_AES128_GCM: + out.session_key_length = CIFS_SESS_KEY_SIZE; + out.server_in_key_length = out.server_out_key_length = SMB3_GCM128_CRYPTKEY_SIZE; + break; + case SMB2_ENCRYPTION_AES256_CCM: + case SMB2_ENCRYPTION_AES256_GCM: + out.session_key_length = CIFS_SESS_KEY_SIZE; + out.server_in_key_length = out.server_out_key_length = SMB3_GCM256_CRYPTKEY_SIZE; + break; + default: + rc = -EOPNOTSUPP; + goto out; + } + + /* check if user buffer is big enough to store all the keys */ + if (out.in_size < sizeof(out) + out.session_key_length + out.server_in_key_length + + out.server_out_key_length) { + rc = -ENOBUFS; + goto out; + } + + out.session_id = ses->Suid; + out.cipher_type = le16_to_cpu(ses->server->cipher_type); + + /* overwrite user input with our output */ + if (copy_to_user(in, &out, sizeof(out))) { + rc = -EINVAL; + goto out; + } + + /* append all the keys at the end of the user buffer */ + end = in->data; + if (copy_to_user(end, ses->auth_key.response, out.session_key_length)) { + rc = -EINVAL; + goto out; + } + end += out.session_key_length; + + if (copy_to_user(end, ses->smb3encryptionkey, out.server_in_key_length)) { + rc = -EINVAL; + goto out; + } + end += out.server_in_key_length; + + if (copy_to_user(end, ses->smb3decryptionkey, out.server_out_key_length)) { + rc = -EINVAL; + goto out; + } + +out: + if (found) + cifs_put_smb_ses(ses); + return rc; +} + long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) { struct inode *inode = file_inode(filep); @@ -274,6 +436,10 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) rc = -EOPNOTSUPP; break; case CIFS_DUMP_KEY: + /* + * Dump encryption keys. This is an old ioctl that only + * handles AES-128-{CCM,GCM}. + */ if (pSMBFile == NULL) break; if (!capable(CAP_SYS_ADMIN)) { @@ -301,6 +467,19 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) else rc = 0; break; + case CIFS_DUMP_FULL_KEY: + /* + * Dump encryption keys (handles any key sizes) + */ + if (pSMBFile == NULL) + break; + if (!capable(CAP_SYS_ADMIN)) { + rc = -EACCES; + break; + } + tcon = tlink_tcon(pSMBFile->tlink); + rc = cifs_dump_full_key(tcon, (void __user *)arg); + break; case CIFS_IOC_NOTIFY: if (!S_ISDIR(inode->i_mode)) { /* Notify can only be done on directories */ @@ -322,6 +501,9 @@ long cifs_ioctl(struct file *filep, unsigned int command, unsigned long arg) rc = -EOPNOTSUPP; cifs_put_tlink(tlink); break; + case CIFS_IOC_SHUTDOWN: + rc = cifs_shutdown(inode->i_sb, arg); + break; default: cifs_dbg(FYI, "unsupported ioctl\n"); break; diff --git a/fs/cifs/link.c b/fs/cifs/link.c index 7c5878a645d9..970fcf2adb08 100644 --- a/fs/cifs/link.c +++ b/fs/cifs/link.c @@ -30,6 +30,7 @@ #include "cifs_fs_sb.h" #include "cifs_unicode.h" #include "smb2proto.h" +#include "cifs_ioctl.h" /* * M-F Symlink Functions - Begin @@ -510,25 +511,34 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, { int rc = -EACCES; unsigned int xid; - char *from_name = NULL; - char *to_name = NULL; + const char *from_name, *to_name; + void *page1, *page2; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink; struct cifs_tcon *tcon; struct TCP_Server_Info *server; struct cifsInodeInfo *cifsInode; + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; + tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) return PTR_ERR(tlink); tcon = tlink_tcon(tlink); xid = get_xid(); + page1 = alloc_dentry_path(); + page2 = alloc_dentry_path(); - from_name = build_path_from_dentry(old_file); - to_name = build_path_from_dentry(direntry); - if ((from_name == NULL) || (to_name == NULL)) { - rc = -ENOMEM; + from_name = build_path_from_dentry(old_file, page1); + if (IS_ERR(from_name)) { + rc = PTR_ERR(from_name); + goto cifs_hl_exit; + } + to_name = build_path_from_dentry(direntry, page2); + if (IS_ERR(to_name)) { + rc = PTR_ERR(to_name); goto cifs_hl_exit; } @@ -587,8 +597,8 @@ cifs_hardlink(struct dentry *old_file, struct inode *inode, } cifs_hl_exit: - kfree(from_name); - kfree(to_name); + free_dentry_path(page1); + free_dentry_path(page2); free_xid(xid); cifs_put_tlink(tlink); return rc; @@ -600,7 +610,8 @@ cifs_get_link(struct dentry *direntry, struct inode *inode, { int rc = -ENOMEM; unsigned int xid; - char *full_path = NULL; + const char *full_path; + void *page; char *target_path = NULL; struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = NULL; @@ -620,11 +631,13 @@ cifs_get_link(struct dentry *direntry, struct inode *inode, tcon = tlink_tcon(tlink); server = tcon->ses->server; - full_path = build_path_from_dentry(direntry); - if (!full_path) { + page = alloc_dentry_path(); + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { free_xid(xid); cifs_put_tlink(tlink); - return ERR_PTR(-ENOMEM); + free_dentry_path(page); + return ERR_CAST(full_path); } cifs_dbg(FYI, "Full path: %s inode = 0x%p\n", full_path, inode); @@ -649,7 +662,7 @@ cifs_get_link(struct dentry *direntry, struct inode *inode, &target_path, reparse_point); } - kfree(full_path); + free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); if (rc != 0) { @@ -669,9 +682,17 @@ cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink; struct cifs_tcon *pTcon; - char *full_path = NULL; + const char *full_path; + void *page; struct inode *newinode = NULL; + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; + + page = alloc_dentry_path(); + if (!page) + return -ENOMEM; + xid = get_xid(); tlink = cifs_sb_tlink(cifs_sb); @@ -681,9 +702,9 @@ cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, } pTcon = tlink_tcon(tlink); - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto symlink_exit; } @@ -719,7 +740,7 @@ cifs_symlink(struct user_namespace *mnt_userns, struct inode *inode, } } symlink_exit: - kfree(full_path); + free_dentry_path(page); cifs_put_tlink(tlink); free_xid(xid); return rc; diff --git a/fs/cifs/misc.c b/fs/cifs/misc.c index 82e176720ca6..7207a63819cb 100644 --- a/fs/cifs/misc.c +++ b/fs/cifs/misc.c @@ -672,6 +672,100 @@ cifs_add_pending_open(struct cifs_fid *fid, struct tcon_link *tlink, spin_unlock(&tlink_tcon(open->tlink)->open_file_lock); } +/* + * Critical section which runs after acquiring deferred_lock. + * As there is no reference count on cifs_deferred_close, pdclose + * should not be used outside deferred_lock. + */ +bool +cifs_is_deferred_close(struct cifsFileInfo *cfile, struct cifs_deferred_close **pdclose) +{ + struct cifs_deferred_close *dclose; + + list_for_each_entry(dclose, &CIFS_I(d_inode(cfile->dentry))->deferred_closes, dlist) { + if ((dclose->netfid == cfile->fid.netfid) && + (dclose->persistent_fid == cfile->fid.persistent_fid) && + (dclose->volatile_fid == cfile->fid.volatile_fid)) { + *pdclose = dclose; + return true; + } + } + return false; +} + +/* + * Critical section which runs after acquiring deferred_lock. + */ +void +cifs_add_deferred_close(struct cifsFileInfo *cfile, struct cifs_deferred_close *dclose) +{ + bool is_deferred = false; + struct cifs_deferred_close *pdclose; + + is_deferred = cifs_is_deferred_close(cfile, &pdclose); + if (is_deferred) { + kfree(dclose); + return; + } + + dclose->tlink = cfile->tlink; + dclose->netfid = cfile->fid.netfid; + dclose->persistent_fid = cfile->fid.persistent_fid; + dclose->volatile_fid = cfile->fid.volatile_fid; + list_add_tail(&dclose->dlist, &CIFS_I(d_inode(cfile->dentry))->deferred_closes); +} + +/* + * Critical section which runs after acquiring deferred_lock. + */ +void +cifs_del_deferred_close(struct cifsFileInfo *cfile) +{ + bool is_deferred = false; + struct cifs_deferred_close *dclose; + + is_deferred = cifs_is_deferred_close(cfile, &dclose); + if (!is_deferred) + return; + list_del(&dclose->dlist); + kfree(dclose); +} + +void +cifs_close_deferred_file(struct cifsInodeInfo *cifs_inode) +{ + struct cifsFileInfo *cfile = NULL; + struct cifs_deferred_close *dclose; + + list_for_each_entry(cfile, &cifs_inode->openFileList, flist) { + spin_lock(&cifs_inode->deferred_lock); + if (cifs_is_deferred_close(cfile, &dclose)) + mod_delayed_work(deferredclose_wq, &cfile->deferred, 0); + spin_unlock(&cifs_inode->deferred_lock); + } +} + +void +cifs_close_all_deferred_files(struct cifs_tcon *tcon) +{ + struct cifsFileInfo *cfile; + struct list_head *tmp; + + spin_lock(&tcon->open_file_lock); + list_for_each(tmp, &tcon->openFileList) { + cfile = list_entry(tmp, struct cifsFileInfo, tlist); + if (delayed_work_pending(&cfile->deferred)) { + /* + * If there is no pending work, mod_delayed_work queues new work. + * So, Increase the ref count to avoid use-after-free. + */ + if (!mod_delayed_work(deferredclose_wq, &cfile->deferred, 0)) + cifsFileInfo_get(cfile); + } + } + spin_unlock(&tcon->open_file_lock); +} + /* parses DFS refferal V3 structure * caller is responsible for freeing target_nodes * returns: @@ -1180,7 +1274,7 @@ int update_super_prepath(struct cifs_tcon *tcon, char *prefix) kfree(cifs_sb->prepath); if (prefix && *prefix) { - cifs_sb->prepath = kstrndup(prefix, strlen(prefix), GFP_ATOMIC); + cifs_sb->prepath = kstrdup(prefix, GFP_ATOMIC); if (!cifs_sb->prepath) { rc = -ENOMEM; goto out; diff --git a/fs/cifs/readdir.c b/fs/cifs/readdir.c index 80bf4c6f4c7b..63bfc533c9fb 100644 --- a/fs/cifs/readdir.c +++ b/fs/cifs/readdir.c @@ -119,9 +119,7 @@ retry: /* update inode in place * if both i_ino and i_mode didn't change */ if (CIFS_I(inode)->uniqueid == fattr->cf_uniqueid && - (inode->i_mode & S_IFMT) == - (fattr->cf_mode & S_IFMT)) { - cifs_fattr_to_inode(inode, fattr); + cifs_fattr_to_inode(inode, fattr) == 0) { dput(dentry); return; } @@ -384,7 +382,7 @@ int get_symlink_reparse_path(char *full_path, struct cifs_sb_info *cifs_sb, static int initiate_cifs_search(const unsigned int xid, struct file *file, - char *full_path) + const char *full_path) { __u16 search_flags; int rc = 0; @@ -704,7 +702,7 @@ static int cifs_save_resume_key(const char *current_entry, */ static int find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, - struct file *file, char *full_path, + struct file *file, const char *full_path, char **current_entry, int *num_to_ret) { __u16 search_flags; @@ -942,13 +940,14 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) char *tmp_buf = NULL; char *end_of_smb; unsigned int max_len; - char *full_path = NULL; + const char *full_path; + void *page = alloc_dentry_path(); xid = get_xid(); - full_path = build_path_from_dentry(file_dentry(file)); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(file_dentry(file), page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto rddir2_exit; } @@ -1043,7 +1042,7 @@ int cifs_readdir(struct file *file, struct dir_context *ctx) kfree(tmp_buf); rddir2_exit: - kfree(full_path); + free_dentry_path(page); free_xid(xid); return rc; } diff --git a/fs/cifs/sess.c b/fs/cifs/sess.c index 63d517b9f2ff..a92a1fb7cb52 100644 --- a/fs/cifs/sess.c +++ b/fs/cifs/sess.c @@ -97,6 +97,12 @@ int cifs_try_adding_channels(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses) return 0; } + if (!(ses->server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { + cifs_dbg(VFS, "server %s does not support multichannel\n", ses->server->hostname); + ses->chan_max = 1; + return 0; + } + /* * Make a copy of the iface list at the time and use that * instead so as to not hold the iface spinlock for opening diff --git a/fs/cifs/smb1ops.c b/fs/cifs/smb1ops.c index e31b939e628c..3b83839fc2c2 100644 --- a/fs/cifs/smb1ops.c +++ b/fs/cifs/smb1ops.c @@ -926,9 +926,7 @@ cifs_unix_dfs_readlink(const unsigned int xid, struct cifs_tcon *tcon, 0); if (!rc) { - *symlinkinfo = kstrndup(referral.node_name, - strlen(referral.node_name), - GFP_KERNEL); + *symlinkinfo = kstrdup(referral.node_name, GFP_KERNEL); free_dfs_info_param(&referral); if (!*symlinkinfo) rc = -ENOMEM; @@ -1027,7 +1025,7 @@ cifs_can_echo(struct TCP_Server_Info *server) static int cifs_make_node(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, - char *full_path, umode_t mode, dev_t dev) + const char *full_path, umode_t mode, dev_t dev) { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct inode *newinode = NULL; diff --git a/fs/cifs/smb2inode.c b/fs/cifs/smb2inode.c index a718dc77e604..9a61209a283e 100644 --- a/fs/cifs/smb2inode.c +++ b/fs/cifs/smb2inode.c @@ -512,7 +512,6 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, int rc; struct smb2_file_all_info *smb2_data; __u32 create_options = 0; - bool no_cached_open = tcon->nohandlecache; struct cifsFileInfo *cfile; struct cached_fid *cfid = NULL; @@ -525,11 +524,8 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, return -ENOMEM; /* If it is a root and its handle is cached then use it */ - if (!strlen(full_path) && !no_cached_open) { - rc = open_shroot(xid, tcon, cifs_sb, &cfid); - if (rc) - goto out; - + rc = open_cached_dir(xid, tcon, full_path, cifs_sb, &cfid); + if (!rc) { if (tcon->crfid.file_all_info_is_valid) { move_smb2_info_to_cifs(data, &tcon->crfid.file_all_info); @@ -540,7 +536,7 @@ smb2_query_path_info(const unsigned int xid, struct cifs_tcon *tcon, if (!rc) move_smb2_info_to_cifs(data, smb2_data); } - close_shroot(cfid); + close_cached_dir(cfid); goto out; } diff --git a/fs/cifs/smb2misc.c b/fs/cifs/smb2misc.c index aac384f69f74..06d555d4da9a 100644 --- a/fs/cifs/smb2misc.c +++ b/fs/cifs/smb2misc.c @@ -667,6 +667,7 @@ smb2_is_valid_lease_break(char *buffer) !memcmp(rsp->LeaseKey, tcon->crfid.fid->lease_key, SMB2_LEASE_KEY_SIZE)) { + tcon->crfid.time = 0; INIT_WORK(&tcon->crfid.lease_break, smb2_cached_lease_break); queue_work(cifsiod_wq, diff --git a/fs/cifs/smb2ops.c b/fs/cifs/smb2ops.c index f703204fb185..21ef51d338e0 100644 --- a/fs/cifs/smb2ops.c +++ b/fs/cifs/smb2ops.c @@ -690,17 +690,21 @@ smb2_close_cached_fid(struct kref *ref) cfid->is_valid = false; cfid->file_all_info_is_valid = false; cfid->has_lease = false; + if (cfid->dentry) { + dput(cfid->dentry); + cfid->dentry = NULL; + } } } -void close_shroot(struct cached_fid *cfid) +void close_cached_dir(struct cached_fid *cfid) { mutex_lock(&cfid->fid_mutex); kref_put(&cfid->refcount, smb2_close_cached_fid); mutex_unlock(&cfid->fid_mutex); } -void close_shroot_lease_locked(struct cached_fid *cfid) +void close_cached_dir_lease_locked(struct cached_fid *cfid) { if (cfid->has_lease) { cfid->has_lease = false; @@ -708,10 +712,10 @@ void close_shroot_lease_locked(struct cached_fid *cfid) } } -void close_shroot_lease(struct cached_fid *cfid) +void close_cached_dir_lease(struct cached_fid *cfid) { mutex_lock(&cfid->fid_mutex); - close_shroot_lease_locked(cfid); + close_cached_dir_lease_locked(cfid); mutex_unlock(&cfid->fid_mutex); } @@ -721,13 +725,15 @@ smb2_cached_lease_break(struct work_struct *work) struct cached_fid *cfid = container_of(work, struct cached_fid, lease_break); - close_shroot_lease(cfid); + close_cached_dir_lease(cfid); } /* - * Open the directory at the root of a share + * Open the and cache a directory handle. + * Only supported for the root handle. */ -int open_shroot(unsigned int xid, struct cifs_tcon *tcon, +int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, + const char *path, struct cifs_sb_info *cifs_sb, struct cached_fid **cfid) { @@ -745,6 +751,18 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, __le16 utf16_path = 0; /* Null - since an open of top of share */ u8 oplock = SMB2_OPLOCK_LEVEL_II; struct cifs_fid *pfid; + struct dentry *dentry; + + if (tcon->nohandlecache) + return -ENOTSUPP; + + if (cifs_sb->root == NULL) + return -ENOENT; + + if (strlen(path)) + return -ENOENT; + + dentry = cifs_sb->root; mutex_lock(&tcon->crfid.fid_mutex); if (tcon->crfid.is_valid) { @@ -830,11 +848,9 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, }; /* - * caller expects this func to set pfid to a valid - * cached root, so we copy the existing one and get a - * reference. + * caller expects this func to set the fid in crfid to valid + * cached root, so increment the refcount. */ - memcpy(pfid, tcon->crfid.fid, sizeof(*pfid)); kref_get(&tcon->crfid.refcount); mutex_unlock(&tcon->crfid.fid_mutex); @@ -867,13 +883,18 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, oparms.fid->mid = le64_to_cpu(o_rsp->sync_hdr.MessageId); #endif /* CIFS_DEBUG2 */ - memcpy(tcon->crfid.fid, pfid, sizeof(struct cifs_fid)); tcon->crfid.tcon = tcon; tcon->crfid.is_valid = true; + tcon->crfid.dentry = dentry; + dget(dentry); kref_init(&tcon->crfid.refcount); /* BB TBD check to see if oplock level check can be removed below */ if (o_rsp->OplockLevel == SMB2_OPLOCK_LEVEL_LEASE) { + /* + * See commit 2f94a3125b87. Increment the refcount when we + * get a lease for root, release it if lease break occurs + */ kref_get(&tcon->crfid.refcount); tcon->crfid.has_lease = true; smb2_parse_contexts(server, o_rsp, @@ -892,6 +913,8 @@ int open_shroot(unsigned int xid, struct cifs_tcon *tcon, &rsp_iov[1], sizeof(struct smb2_file_all_info), (char *)&tcon->crfid.file_all_info)) tcon->crfid.file_all_info_is_valid = true; + tcon->crfid.time = jiffies; + oshr_exit: mutex_unlock(&tcon->crfid.fid_mutex); @@ -905,6 +928,22 @@ oshr_free: return rc; } +int open_cached_dir_by_dentry(struct cifs_tcon *tcon, + struct dentry *dentry, + struct cached_fid **cfid) +{ + mutex_lock(&tcon->crfid.fid_mutex); + if (tcon->crfid.dentry == dentry) { + cifs_dbg(FYI, "found a cached root file handle by dentry\n"); + *cfid = &tcon->crfid; + kref_get(&tcon->crfid.refcount); + mutex_unlock(&tcon->crfid.fid_mutex); + return 0; + } + mutex_unlock(&tcon->crfid.fid_mutex); + return -ENOENT; +} + static void smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb) @@ -914,7 +953,6 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, u8 oplock = SMB2_OPLOCK_LEVEL_NONE; struct cifs_open_parms oparms; struct cifs_fid fid; - bool no_cached_open = tcon->nohandlecache; struct cached_fid *cfid = NULL; oparms.tcon = tcon; @@ -924,14 +962,12 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, oparms.fid = &fid; oparms.reconnect = false; - if (no_cached_open) { + rc = open_cached_dir(xid, tcon, "", cifs_sb, &cfid); + if (rc == 0) + memcpy(&fid, cfid->fid, sizeof(struct cifs_fid)); + else rc = SMB2_open(xid, &oparms, &srch_path, &oplock, NULL, NULL, NULL, NULL); - } else { - rc = open_shroot(xid, tcon, cifs_sb, &cfid); - if (rc == 0) - memcpy(&fid, cfid->fid, sizeof(struct cifs_fid)); - } if (rc) return; @@ -945,10 +981,10 @@ smb3_qfs_tcon(const unsigned int xid, struct cifs_tcon *tcon, FS_VOLUME_INFORMATION); SMB2_QFS_attr(xid, tcon, fid.persistent_fid, fid.volatile_fid, FS_SECTOR_SIZE_INFORMATION); /* SMB3 specific */ - if (no_cached_open) + if (cfid == NULL) SMB2_close(xid, tcon, fid.persistent_fid, fid.volatile_fid); else - close_shroot(cfid); + close_cached_dir(cfid); } static void @@ -1531,7 +1567,10 @@ SMB2_request_res_key(const unsigned int xid, struct cifs_tcon *tcon, NULL, 0 /* no input */, CIFSMaxBufSize, (char **)&res_key, &ret_data_len); - if (rc) { + if (rc == -EOPNOTSUPP) { + pr_warn_once("Server share %s does not support copy range\n", tcon->treeName); + goto req_res_key_exit; + } else if (rc) { cifs_tcon_dbg(VFS, "refcpy ioctl error %d getting resume key\n", rc); goto req_res_key_exit; } @@ -1763,18 +1802,14 @@ smb2_ioctl_query_info(const unsigned int xid, } iqinf_exit: - kfree(vars); - kfree(buffer); - SMB2_open_free(&rqst[0]); - if (qi.flags & PASSTHRU_FSCTL) - SMB2_ioctl_free(&rqst[1]); - else - SMB2_query_info_free(&rqst[1]); - - SMB2_close_free(&rqst[2]); + cifs_small_buf_release(rqst[0].rq_iov[0].iov_base); + cifs_small_buf_release(rqst[1].rq_iov[0].iov_base); + cifs_small_buf_release(rqst[2].rq_iov[0].iov_base); free_rsp_buf(resp_buftype[0], rsp_iov[0].iov_base); free_rsp_buf(resp_buftype[1], rsp_iov[1].iov_base); free_rsp_buf(resp_buftype[2], rsp_iov[2].iov_base); + kfree(vars); + kfree(buffer); return rc; e_fault: @@ -1826,6 +1861,8 @@ smb2_copychunk_range(const unsigned int xid, cpu_to_le32(min_t(u32, len, tcon->max_bytes_chunk)); /* Request server copy to target from src identified by key */ + kfree(retbuf); + retbuf = NULL; rc = SMB2_ioctl(xid, tcon, trgtfile->fid.persistent_fid, trgtfile->fid.volatile_fid, FSCTL_SRV_COPYCHUNK_WRITE, true /* is_fsctl */, (char *)pcchunk, @@ -2217,22 +2254,23 @@ smb3_notify(const unsigned int xid, struct file *pfile, struct smb3_notify notify; struct dentry *dentry = pfile->f_path.dentry; struct inode *inode = file_inode(pfile); - struct cifs_sb_info *cifs_sb; + struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct cifs_open_parms oparms; struct cifs_fid fid; struct cifs_tcon *tcon; - unsigned char *path = NULL; + const unsigned char *path; + void *page = alloc_dentry_path(); __le16 *utf16_path = NULL; u8 oplock = SMB2_OPLOCK_LEVEL_NONE; int rc = 0; - path = build_path_from_dentry(dentry); - if (path == NULL) - return -ENOMEM; - - cifs_sb = CIFS_SB(inode->i_sb); + path = build_path_from_dentry(dentry, page); + if (IS_ERR(path)) { + rc = PTR_ERR(path); + goto notify_exit; + } - utf16_path = cifs_convert_path_to_utf16(path + 1, cifs_sb); + utf16_path = cifs_convert_path_to_utf16(path, cifs_sb); if (utf16_path == NULL) { rc = -ENOMEM; goto notify_exit; @@ -2264,7 +2302,7 @@ smb3_notify(const unsigned int xid, struct file *pfile, cifs_dbg(FYI, "change notify for path %s rc %d\n", path, rc); notify_exit: - kfree(path); + free_dentry_path(page); kfree(utf16_path); return rc; } @@ -3652,6 +3690,77 @@ out: return rc; } +static long smb3_collapse_range(struct file *file, struct cifs_tcon *tcon, + loff_t off, loff_t len) +{ + int rc; + unsigned int xid; + struct cifsFileInfo *cfile = file->private_data; + __le64 eof; + + xid = get_xid(); + + if (off >= i_size_read(file->f_inode) || + off + len >= i_size_read(file->f_inode)) { + rc = -EINVAL; + goto out; + } + + rc = smb2_copychunk_range(xid, cfile, cfile, off + len, + i_size_read(file->f_inode) - off - len, off); + if (rc < 0) + goto out; + + eof = cpu_to_le64(i_size_read(file->f_inode) - len); + rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, cfile->pid, &eof); + if (rc < 0) + goto out; + + rc = 0; + out: + free_xid(xid); + return rc; +} + +static long smb3_insert_range(struct file *file, struct cifs_tcon *tcon, + loff_t off, loff_t len) +{ + int rc; + unsigned int xid; + struct cifsFileInfo *cfile = file->private_data; + __le64 eof; + __u64 count; + + xid = get_xid(); + + if (off >= i_size_read(file->f_inode)) { + rc = -EINVAL; + goto out; + } + + count = i_size_read(file->f_inode) - off; + eof = cpu_to_le64(i_size_read(file->f_inode) + len); + + rc = SMB2_set_eof(xid, tcon, cfile->fid.persistent_fid, + cfile->fid.volatile_fid, cfile->pid, &eof); + if (rc < 0) + goto out; + + rc = smb2_copychunk_range(xid, cfile, cfile, off, count, off + len); + if (rc < 0) + goto out; + + rc = smb3_zero_range(file, tcon, off, len, 1); + if (rc < 0) + goto out; + + rc = 0; + out: + free_xid(xid); + return rc; +} + static loff_t smb3_llseek(struct file *file, struct cifs_tcon *tcon, loff_t offset, int whence) { struct cifsFileInfo *wrcfile, *cfile = file->private_data; @@ -3823,6 +3932,10 @@ static long smb3_fallocate(struct file *file, struct cifs_tcon *tcon, int mode, return smb3_zero_range(file, tcon, off, len, false); } else if (mode == FALLOC_FL_KEEP_SIZE) return smb3_simple_falloc(file, tcon, off, len, true); + else if (mode == FALLOC_FL_COLLAPSE_RANGE) + return smb3_collapse_range(file, tcon, off, len); + else if (mode == FALLOC_FL_INSERT_RANGE) + return smb3_insert_range(file, tcon, off, len); else if (mode == 0) return smb3_simple_falloc(file, tcon, off, len, false); @@ -3870,6 +3983,7 @@ smb2_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, unsigned int epoch, bool *purge_cache) { oplock &= 0xFF; + cinode->lease_granted = false; if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) return; if (oplock == SMB2_OPLOCK_LEVEL_BATCH) { @@ -3896,6 +4010,7 @@ smb21_set_oplock_level(struct cifsInodeInfo *cinode, __u32 oplock, unsigned int new_oplock = 0; oplock &= 0xFF; + cinode->lease_granted = true; if (oplock == SMB2_OPLOCK_LEVEL_NOCHANGE) return; @@ -4178,7 +4293,7 @@ smb2_get_enc_key(struct TCP_Server_Info *server, __u64 ses_id, int enc, u8 *key) } spin_unlock(&cifs_tcp_ses_lock); - return 1; + return -EAGAIN; } /* * Encrypt or decrypt @rqst message. @rqst[0] has the following format: @@ -4968,7 +5083,7 @@ smb2_next_header(char *buf) static int smb2_make_node(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, - char *full_path, umode_t mode, dev_t dev) + const char *full_path, umode_t mode, dev_t dev) { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); int rc = -EPERM; diff --git a/fs/cifs/smb2pdu.c b/fs/cifs/smb2pdu.c index 2199a9bfae8f..c205f93e0a10 100644 --- a/fs/cifs/smb2pdu.c +++ b/fs/cifs/smb2pdu.c @@ -841,6 +841,8 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) req->SecurityMode = 0; req->Capabilities = cpu_to_le32(server->vals->req_capabilities); + if (ses->chan_max > 1) + req->Capabilities |= cpu_to_le32(SMB2_GLOBAL_CAP_MULTI_CHANNEL); /* ClientGUID must be zero for SMB2.02 dialect */ if (server->vals->protocol_id == SMB20_PROT_ID) @@ -956,6 +958,13 @@ SMB2_negotiate(const unsigned int xid, struct cifs_ses *ses) /* Internal types */ server->capabilities |= SMB2_NT_FIND | SMB2_LARGE_FILES; + /* + * SMB3.0 supports only 1 cipher and doesn't have a encryption neg context + * Set the cipher type manually. + */ + if (server->dialect == SMB30_PROT_ID && (server->capabilities & SMB2_GLOBAL_CAP_ENCRYPTION)) + server->cipher_type = SMB2_ENCRYPTION_AES128_CCM; + security_blob = smb2_get_data_area_len(&blob_offset, &blob_length, (struct smb2_sync_hdr *)rsp); /* @@ -1032,6 +1041,9 @@ int smb3_validate_negotiate(const unsigned int xid, struct cifs_tcon *tcon) pneg_inbuf->Capabilities = cpu_to_le32(server->vals->req_capabilities); + if (tcon->ses->chan_max > 1) + pneg_inbuf->Capabilities |= cpu_to_le32(SMB2_GLOBAL_CAP_MULTI_CHANNEL); + memcpy(pneg_inbuf->Guid, server->client_guid, SMB2_CLIENT_GUID_SIZE); @@ -1857,7 +1869,7 @@ SMB2_tdis(const unsigned int xid, struct cifs_tcon *tcon) if ((tcon->need_reconnect) || (tcon->ses->need_reconnect)) return 0; - close_shroot_lease(&tcon->crfid); + close_cached_dir_lease(&tcon->crfid); rc = smb2_plain_req_init(SMB2_TREE_DISCONNECT, tcon, ses->server, (void **) &req, @@ -3895,10 +3907,10 @@ smb2_new_read_req(void **buf, unsigned int *total_len, * Related requests use info from previous read request * in chain. */ - shdr->SessionId = 0xFFFFFFFF; + shdr->SessionId = 0xFFFFFFFFFFFFFFFF; shdr->TreeId = 0xFFFFFFFF; - req->PersistentFileId = 0xFFFFFFFF; - req->VolatileFileId = 0xFFFFFFFF; + req->PersistentFileId = 0xFFFFFFFFFFFFFFFF; + req->VolatileFileId = 0xFFFFFFFFFFFFFFFF; } } if (remaining_bytes > io_parms->length) diff --git a/fs/cifs/smb2pdu.h b/fs/cifs/smb2pdu.h index a5a9e33c0d73..6442dc1c292b 100644 --- a/fs/cifs/smb2pdu.h +++ b/fs/cifs/smb2pdu.h @@ -144,7 +144,7 @@ struct smb2_transform_hdr { } __packed; /* See MS-SMB2 2.2.42 */ -struct smb2_compression_transform_hdr { +struct smb2_compression_transform_hdr_unchained { __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */ __le32 OriginalCompressedSegmentSize; __le16 CompressionAlgorithm; @@ -160,10 +160,17 @@ struct compression_payload_header { __le16 CompressionAlgorithm; __le16 Flags; __le32 Length; /* length of compressed playload including field below if present */ - /* __le32 OriginalPayloadSize; */ /* optional */ + /* __le32 OriginalPayloadSize; */ /* optional, present when LZNT1, LZ77, LZ77+Huffman */ } __packed; /* See MS-SMB2 2.2.42.2 */ +struct smb2_compression_transform_hdr_chained { + __le32 ProtocolId; /* 0xFC 'S' 'M' 'B' */ + __le32 OriginalCompressedSegmentSize; + /* struct compression_payload_header[] */ +} __packed; + +/* See MS-SMB2 2.2.42.2.2 */ struct compression_pattern_payload_v1 { __le16 Pattern; __le16 Reserved1; @@ -181,7 +188,11 @@ struct smb2_rdma_transform { __le32 Reserved2; } __packed; -struct smb2_rdma_encryption_transform { +/* TransformType */ +#define SMB2_RDMA_TRANSFORM_TYPE_ENCRYPTION 0x0001 +#define SMB2_RDMA_TRANSFORM_TYPE_SIGNING 0x0002 + +struct smb2_rdma_crypto_transform { __le16 TransformType; __le16 SignatureLength; __le16 NonceLength; @@ -409,13 +420,29 @@ struct smb2_netname_neg_context { } __packed; /* - * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6 + * For smb2_transport_capabilities context see MS-SMB2 2.2.3.1.5 * and 2.2.4.1.5 */ +/* Flags */ +#define SMB2_ACCEPT_TRANSFORM_LEVEL_SECURITY 0x00000001 + +struct smb2_transport_capabilities_context { + __le16 ContextType; /* 6 */ + __le16 DataLength; + __u32 Reserved; + __le32 Flags; +} __packed; + +/* + * For rdma transform capabilities context see MS-SMB2 2.2.3.1.6 + * and 2.2.4.1.6 + */ + /* RDMA Transform IDs */ #define SMB2_RDMA_TRANSFORM_NONE 0x0000 #define SMB2_RDMA_TRANSFORM_ENCRYPTION 0x0001 +#define SMB2_RDMA_TRANSFORM_SIGNING 0x0002 struct smb2_rdma_transform_capabilities_context { __le16 ContextType; /* 7 */ @@ -427,6 +454,11 @@ struct smb2_rdma_transform_capabilities_context { __le16 RDMATransformIds[]; } __packed; +/* + * For signing capabilities context see MS-SMB2 2.2.3.1.7 + * and 2.2.4.1.7 + */ + /* Signing algorithms */ #define SIGNING_ALG_HMAC_SHA256 0 #define SIGNING_ALG_AES_CMAC 1 @@ -634,7 +666,8 @@ struct smb2_tree_connect_rsp { #define SHI1005_FLAGS_ENABLE_HASH_V2 0x00004000 #define SHI1005_FLAGS_ENCRYPT_DATA 0x00008000 #define SMB2_SHAREFLAG_IDENTITY_REMOTING 0x00040000 /* 3.1.1 */ -#define SHI1005_FLAGS_ALL 0x0004FF33 +#define SMB2_SHAREFLAG_COMPRESS_DATA 0x00100000 /* 3.1.1 */ +#define SHI1005_FLAGS_ALL 0x0014FF33 /* Possible share capabilities */ #define SMB2_SHARE_CAP_DFS cpu_to_le32(0x00000008) /* all dialects */ @@ -1390,7 +1423,11 @@ struct smb2_lock_req { struct smb2_sync_hdr sync_hdr; __le16 StructureSize; /* Must be 48 */ __le16 LockCount; - __le32 Reserved; + /* + * The least significant four bits are the index, the other 28 bits are + * the lock sequence number (0 to 64). See MS-SMB2 2.2.26 + */ + __le32 LockSequenceNumber; __u64 PersistentFileId; /* opaque endianness */ __u64 VolatileFileId; /* opaque endianness */ /* Followed by at least one */ diff --git a/fs/cifs/smb2proto.h b/fs/cifs/smb2proto.h index a2eb34a8d9c9..a5f87b02cfaf 100644 --- a/fs/cifs/smb2proto.h +++ b/fs/cifs/smb2proto.h @@ -69,12 +69,16 @@ extern struct cifs_ses *smb2_find_smb_ses(struct TCP_Server_Info *server, extern int smb3_handle_read_data(struct TCP_Server_Info *server, struct mid_q_entry *mid); -extern int open_shroot(unsigned int xid, struct cifs_tcon *tcon, - struct cifs_sb_info *cifs_sb, - struct cached_fid **cfid); -extern void close_shroot(struct cached_fid *cfid); -extern void close_shroot_lease(struct cached_fid *cfid); -extern void close_shroot_lease_locked(struct cached_fid *cfid); +extern int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, + const char *path, + struct cifs_sb_info *cifs_sb, + struct cached_fid **cfid); +extern int open_cached_dir_by_dentry(struct cifs_tcon *tcon, + struct dentry *dentry, + struct cached_fid **cfid); +extern void close_cached_dir(struct cached_fid *cfid); +extern void close_cached_dir_lease(struct cached_fid *cfid); +extern void close_cached_dir_lease_locked(struct cached_fid *cfid); extern void move_smb2_info_to_cifs(FILE_ALL_INFO *dst, struct smb2_file_all_info *src); extern int smb2_query_reparse_tag(const unsigned int xid, struct cifs_tcon *tcon, diff --git a/fs/cifs/trace.h b/fs/cifs/trace.h index d6df908dccad..dafcb6ab050d 100644 --- a/fs/cifs/trace.h +++ b/fs/cifs/trace.h @@ -12,6 +12,11 @@ #include <linux/tracepoint.h> +/* + * Please use this 3-part article as a reference for writing new tracepoints: + * https://lwn.net/Articles/379903/ + */ + /* For logging errors in read or write */ DECLARE_EVENT_CLASS(smb3_rw_err_class, TP_PROTO(unsigned int xid, @@ -529,16 +534,16 @@ DECLARE_EVENT_CLASS(smb3_exit_err_class, TP_ARGS(xid, func_name, rc), TP_STRUCT__entry( __field(unsigned int, xid) - __field(const char *, func_name) + __string(func_name, func_name) __field(int, rc) ), TP_fast_assign( __entry->xid = xid; - __entry->func_name = func_name; + __assign_str(func_name, func_name); __entry->rc = rc; ), TP_printk("\t%s: xid=%u rc=%d", - __entry->func_name, __entry->xid, __entry->rc) + __get_str(func_name), __entry->xid, __entry->rc) ) #define DEFINE_SMB3_EXIT_ERR_EVENT(name) \ @@ -583,14 +588,14 @@ DECLARE_EVENT_CLASS(smb3_enter_exit_class, TP_ARGS(xid, func_name), TP_STRUCT__entry( __field(unsigned int, xid) - __field(const char *, func_name) + __string(func_name, func_name) ), TP_fast_assign( __entry->xid = xid; - __entry->func_name = func_name; + __assign_str(func_name, func_name); ), TP_printk("\t%s: xid=%u", - __entry->func_name, __entry->xid) + __get_str(func_name), __entry->xid) ) #define DEFINE_SMB3_ENTER_EXIT_EVENT(name) \ @@ -857,16 +862,16 @@ DECLARE_EVENT_CLASS(smb3_reconnect_class, TP_STRUCT__entry( __field(__u64, currmid) __field(__u64, conn_id) - __field(char *, hostname) + __string(hostname, hostname) ), TP_fast_assign( __entry->currmid = currmid; __entry->conn_id = conn_id; - __entry->hostname = hostname; + __assign_str(hostname, hostname); ), TP_printk("conn_id=0x%llx server=%s current_mid=%llu", __entry->conn_id, - __entry->hostname, + __get_str(hostname), __entry->currmid) ) @@ -891,7 +896,7 @@ DECLARE_EVENT_CLASS(smb3_credit_class, TP_STRUCT__entry( __field(__u64, currmid) __field(__u64, conn_id) - __field(char *, hostname) + __string(hostname, hostname) __field(int, credits) __field(int, credits_to_add) __field(int, in_flight) @@ -899,7 +904,7 @@ DECLARE_EVENT_CLASS(smb3_credit_class, TP_fast_assign( __entry->currmid = currmid; __entry->conn_id = conn_id; - __entry->hostname = hostname; + __assign_str(hostname, hostname); __entry->credits = credits; __entry->credits_to_add = credits_to_add; __entry->in_flight = in_flight; @@ -907,7 +912,7 @@ DECLARE_EVENT_CLASS(smb3_credit_class, TP_printk("conn_id=0x%llx server=%s current_mid=%llu " "credits=%d credit_change=%d in_flight=%d", __entry->conn_id, - __entry->hostname, + __get_str(hostname), __entry->currmid, __entry->credits, __entry->credits_to_add, diff --git a/fs/cifs/unc.c b/fs/cifs/unc.c index 394aa00cea40..f6fc5e343ea4 100644 --- a/fs/cifs/unc.c +++ b/fs/cifs/unc.c @@ -50,7 +50,6 @@ char *extract_sharename(const char *unc) { const char *src; char *delim, *dst; - int len; /* skip double chars at the beginning */ src = unc + 2; @@ -60,10 +59,9 @@ char *extract_sharename(const char *unc) if (!delim) return ERR_PTR(-EINVAL); delim++; - len = strlen(delim); /* caller has to free the memory */ - dst = kstrndup(delim, len, GFP_KERNEL); + dst = kstrdup(delim, GFP_KERNEL); if (!dst) return ERR_PTR(-ENOMEM); diff --git a/fs/cifs/xattr.c b/fs/cifs/xattr.c index 41a611e76bb7..aa3e8ca0457c 100644 --- a/fs/cifs/xattr.c +++ b/fs/cifs/xattr.c @@ -30,6 +30,7 @@ #include "cifs_debug.h" #include "cifs_fs_sb.h" #include "cifs_unicode.h" +#include "cifs_ioctl.h" #define MAX_EA_VALUE_SIZE CIFSMaxBufSize #define CIFS_XATTR_CIFS_ACL "system.cifs_acl" /* DACL only */ @@ -53,7 +54,7 @@ enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT, XATTR_CIFS_NTSD, XATTR_CIFS_NTSD_FULL }; static int cifs_attrib_set(unsigned int xid, struct cifs_tcon *pTcon, - struct inode *inode, char *full_path, + struct inode *inode, const char *full_path, const void *value, size_t size) { ssize_t rc = -EOPNOTSUPP; @@ -77,7 +78,7 @@ static int cifs_attrib_set(unsigned int xid, struct cifs_tcon *pTcon, } static int cifs_creation_time_set(unsigned int xid, struct cifs_tcon *pTcon, - struct inode *inode, char *full_path, + struct inode *inode, const char *full_path, const void *value, size_t size) { ssize_t rc = -EOPNOTSUPP; @@ -112,7 +113,8 @@ static int cifs_xattr_set(const struct xattr_handler *handler, struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct tcon_link *tlink; struct cifs_tcon *pTcon; - char *full_path; + const char *full_path; + void *page; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -120,10 +122,11 @@ static int cifs_xattr_set(const struct xattr_handler *handler, pTcon = tlink_tcon(tlink); xid = get_xid(); + page = alloc_dentry_path(); - full_path = build_path_from_dentry(dentry); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(dentry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto out; } /* return dos attributes as pseudo xattr */ @@ -235,7 +238,7 @@ static int cifs_xattr_set(const struct xattr_handler *handler, } out: - kfree(full_path); + free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); return rc; @@ -297,7 +300,8 @@ static int cifs_xattr_get(const struct xattr_handler *handler, struct cifs_sb_info *cifs_sb = CIFS_SB(sb); struct tcon_link *tlink; struct cifs_tcon *pTcon; - char *full_path; + const char *full_path; + void *page; tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) @@ -305,10 +309,11 @@ static int cifs_xattr_get(const struct xattr_handler *handler, pTcon = tlink_tcon(tlink); xid = get_xid(); + page = alloc_dentry_path(); - full_path = build_path_from_dentry(dentry); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(dentry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto out; } @@ -401,7 +406,7 @@ static int cifs_xattr_get(const struct xattr_handler *handler, rc = -EOPNOTSUPP; out: - kfree(full_path); + free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); return rc; @@ -414,7 +419,11 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) struct cifs_sb_info *cifs_sb = CIFS_SB(direntry->d_sb); struct tcon_link *tlink; struct cifs_tcon *pTcon; - char *full_path; + const char *full_path; + void *page; + + if (unlikely(cifs_forced_shutdown(cifs_sb))) + return -EIO; if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_XATTR) return -EOPNOTSUPP; @@ -425,10 +434,11 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) pTcon = tlink_tcon(tlink); xid = get_xid(); + page = alloc_dentry_path(); - full_path = build_path_from_dentry(direntry); - if (full_path == NULL) { - rc = -ENOMEM; + full_path = build_path_from_dentry(direntry, page); + if (IS_ERR(full_path)) { + rc = PTR_ERR(full_path); goto list_ea_exit; } /* return dos attributes as pseudo xattr */ @@ -442,7 +452,7 @@ ssize_t cifs_listxattr(struct dentry *direntry, char *data, size_t buf_size) rc = pTcon->ses->server->ops->query_all_EAs(xid, pTcon, full_path, NULL, data, buf_size, cifs_sb); list_ea_exit: - kfree(full_path); + free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); return rc; diff --git a/fs/coda/file.c b/fs/coda/file.c index 128d63df5bfb..ef5ca22bfb3e 100644 --- a/fs/coda/file.c +++ b/fs/coda/file.c @@ -175,10 +175,10 @@ coda_file_mmap(struct file *coda_file, struct vm_area_struct *vma) ret = call_mmap(vma->vm_file, vma); if (ret) { - /* if call_mmap fails, our caller will put coda_file so we - * should drop the reference to the host_file that we got. + /* if call_mmap fails, our caller will put host_file so we + * should drop the reference to the coda_file that we got. */ - fput(host_file); + fput(coda_file); kfree(cvm_ops); } else { /* here we add redirects for the open/close vm_operations */ diff --git a/fs/configfs/configfs_internal.h b/fs/configfs/configfs_internal.h index 9a3aed249692..c0395363eab9 100644 --- a/fs/configfs/configfs_internal.h +++ b/fs/configfs/configfs_internal.h @@ -1,7 +1,5 @@ /* SPDX-License-Identifier: GPL-2.0-or-later */ -/* -*- mode: c; c-basic-offset:8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * configfs_internal.h - Internal stuff for configfs * * Based on sysfs: diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index b6098e02e20b..ac5e0c0e9181 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * dir.c - Operations for configfs directories. * * Based on sysfs: diff --git a/fs/configfs/file.c b/fs/configfs/file.c index da8351d1e455..e26060dae70a 100644 --- a/fs/configfs/file.c +++ b/fs/configfs/file.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * file.c - operations for regular (text) files. * * Based on sysfs: diff --git a/fs/configfs/inode.c b/fs/configfs/inode.c index 42c348bb2903..eb5ec3e46283 100644 --- a/fs/configfs/inode.c +++ b/fs/configfs/inode.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * inode.c - basic inode and dentry operations. * * Based on sysfs: diff --git a/fs/configfs/item.c b/fs/configfs/item.c index 704a4356f137..254170a82aa3 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * item.c - library routines for handling generic config items * * Based on kobject: diff --git a/fs/configfs/mount.c b/fs/configfs/mount.c index 0c6e8cf61953..c2d820063ec4 100644 --- a/fs/configfs/mount.c +++ b/fs/configfs/mount.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * mount.c - operations for initializing and mounting configfs. * * Based on sysfs: diff --git a/fs/configfs/symlink.c b/fs/configfs/symlink.c index 77c854364e60..0623c3edcfb9 100644 --- a/fs/configfs/symlink.c +++ b/fs/configfs/symlink.c @@ -1,7 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/* -*- mode: c; c-basic-offset: 8; -*- - * vim: noexpandtab sw=8 ts=8 sts=0: - * +/* * symlink.c - operations for configfs symlinks. * * Based on sysfs: diff --git a/fs/coredump.c b/fs/coredump.c index 1c0fdc1aa70b..2868e3e171ae 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -809,6 +809,16 @@ void do_coredump(const kernel_siginfo_t *siginfo) } file_start_write(cprm.file); core_dumped = binfmt->core_dump(&cprm); + /* + * Ensures that file size is big enough to contain the current + * file postion. This prevents gdb from complaining about + * a truncated file if the last "write" to the file was + * dump_skip. + */ + if (cprm.to_skip) { + cprm.to_skip--; + dump_emit(&cprm, "", 1); + } file_end_write(cprm.file); } if (ispipe && core_pipe_limit) @@ -835,7 +845,7 @@ fail: * do on a core-file: use only these functions to write out all the * necessary info. */ -int dump_emit(struct coredump_params *cprm, const void *addr, int nr) +static int __dump_emit(struct coredump_params *cprm, const void *addr, int nr) { struct file *file = cprm->file; loff_t pos = file->f_pos; @@ -855,9 +865,8 @@ int dump_emit(struct coredump_params *cprm, const void *addr, int nr) return 1; } -EXPORT_SYMBOL(dump_emit); -int dump_skip(struct coredump_params *cprm, size_t nr) +static int __dump_skip(struct coredump_params *cprm, size_t nr) { static char zeroes[PAGE_SIZE]; struct file *file = cprm->file; @@ -869,13 +878,35 @@ int dump_skip(struct coredump_params *cprm, size_t nr) return 1; } else { while (nr > PAGE_SIZE) { - if (!dump_emit(cprm, zeroes, PAGE_SIZE)) + if (!__dump_emit(cprm, zeroes, PAGE_SIZE)) return 0; nr -= PAGE_SIZE; } - return dump_emit(cprm, zeroes, nr); + return __dump_emit(cprm, zeroes, nr); } } + +int dump_emit(struct coredump_params *cprm, const void *addr, int nr) +{ + if (cprm->to_skip) { + if (!__dump_skip(cprm, cprm->to_skip)) + return 0; + cprm->to_skip = 0; + } + return __dump_emit(cprm, addr, nr); +} +EXPORT_SYMBOL(dump_emit); + +void dump_skip_to(struct coredump_params *cprm, unsigned long pos) +{ + cprm->to_skip = pos - cprm->pos; +} +EXPORT_SYMBOL(dump_skip_to); + +void dump_skip(struct coredump_params *cprm, size_t nr) +{ + cprm->to_skip += nr; +} EXPORT_SYMBOL(dump_skip); #ifdef CONFIG_ELF_CORE @@ -902,11 +933,11 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, stop = !dump_emit(cprm, kaddr, PAGE_SIZE); kunmap_local(kaddr); put_page(page); + if (stop) + return 0; } else { - stop = !dump_skip(cprm, PAGE_SIZE); + dump_skip(cprm, PAGE_SIZE); } - if (stop) - return 0; } return 1; } @@ -914,33 +945,16 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start, int dump_align(struct coredump_params *cprm, int align) { - unsigned mod = cprm->pos & (align - 1); + unsigned mod = (cprm->pos + cprm->to_skip) & (align - 1); if (align & (align - 1)) return 0; - return mod ? dump_skip(cprm, align - mod) : 1; + if (mod) + cprm->to_skip += align - mod; + return 1; } EXPORT_SYMBOL(dump_align); /* - * Ensures that file size is big enough to contain the current file - * postion. This prevents gdb from complaining about a truncated file - * if the last "write" to the file was dump_skip. - */ -void dump_truncate(struct coredump_params *cprm) -{ - struct file *file = cprm->file; - loff_t offset; - - if (file->f_op->llseek && file->f_op->llseek != no_llseek) { - offset = file->f_op->llseek(file, 0, SEEK_CUR); - if (i_size_read(file->f_mapping->host) < offset) - do_truncate(file_mnt_user_ns(file), file->f_path.dentry, - offset, 0, file); - } -} -EXPORT_SYMBOL(dump_truncate); - -/* * The purpose of always_dump_vma() is to make sure that special kernel mappings * that are useful for post-mortem analysis are included in every core dump. * In that way we ensure that the core dump is fully interpretable later diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index a5f5c30368a2..2d0c8922f635 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -14,16 +14,30 @@ config FS_ENCRYPTION F2FS and UBIFS make use of this feature. # Filesystems supporting encryption must select this if FS_ENCRYPTION. This -# allows the algorithms to be built as modules when all the filesystems are. +# allows the algorithms to be built as modules when all the filesystems are, +# whereas selecting them from FS_ENCRYPTION would force them to be built-in. +# +# Note: this option only pulls in the algorithms that filesystem encryption +# needs "by default". If userspace will use "non-default" encryption modes such +# as Adiantum encryption, then those other modes need to be explicitly enabled +# in the crypto API; see Documentation/filesystems/fscrypt.rst for details. +# +# Also note that this option only pulls in the generic implementations of the +# algorithms, not any per-architecture optimized implementations. It is +# strongly recommended to enable optimized implementations too. It is safe to +# disable these generic implementations if corresponding optimized +# implementations will always be available too; for this reason, these are soft +# dependencies ('imply' rather than 'select'). Only disable these generic +# implementations if you're sure they will never be needed, though. config FS_ENCRYPTION_ALGS tristate - select CRYPTO_AES - select CRYPTO_CBC - select CRYPTO_CTS - select CRYPTO_ECB - select CRYPTO_HMAC - select CRYPTO_SHA512 - select CRYPTO_XTS + imply CRYPTO_AES + imply CRYPTO_CBC + imply CRYPTO_CTS + imply CRYPTO_ECB + imply CRYPTO_HMAC + imply CRYPTO_SHA512 + imply CRYPTO_XTS config FS_ENCRYPTION_INLINE_CRYPT bool "Enable fscrypt to use inline crypto" diff --git a/fs/d_path.c b/fs/d_path.c index a69e2cd36e6e..270d62133996 100644 --- a/fs/d_path.c +++ b/fs/d_path.c @@ -326,9 +326,9 @@ char *simple_dname(struct dentry *dentry, char *buffer, int buflen) /* * Write full pathname from the root of the filesystem into the buffer. */ -static char *__dentry_path(struct dentry *d, char *buf, int buflen) +static char *__dentry_path(const struct dentry *d, char *buf, int buflen) { - struct dentry *dentry; + const struct dentry *dentry; char *end, *retval; int len, seq = 0; int error = 0; @@ -347,7 +347,7 @@ restart: *retval = '/'; read_seqbegin_or_lock(&rename_lock, &seq); while (!IS_ROOT(dentry)) { - struct dentry *parent = dentry->d_parent; + const struct dentry *parent = dentry->d_parent; prefetch(parent); error = prepend_name(&end, &len, &dentry->d_name); @@ -371,13 +371,13 @@ Elong: return ERR_PTR(-ENAMETOOLONG); } -char *dentry_path_raw(struct dentry *dentry, char *buf, int buflen) +char *dentry_path_raw(const struct dentry *dentry, char *buf, int buflen) { return __dentry_path(dentry, buf, buflen); } EXPORT_SYMBOL(dentry_path_raw); -char *dentry_path(struct dentry *dentry, char *buf, int buflen) +char *dentry_path(const struct dentry *dentry, char *buf, int buflen) { char *p = NULL; char *retval; @@ -144,6 +144,16 @@ struct wait_exceptional_entry_queue { struct exceptional_entry_key key; }; +/** + * enum dax_wake_mode: waitqueue wakeup behaviour + * @WAKE_ALL: wake all waiters in the waitqueue + * @WAKE_NEXT: wake only the first waiter in the waitqueue + */ +enum dax_wake_mode { + WAKE_ALL, + WAKE_NEXT, +}; + static wait_queue_head_t *dax_entry_waitqueue(struct xa_state *xas, void *entry, struct exceptional_entry_key *key) { @@ -182,7 +192,8 @@ static int wake_exceptional_entry_func(wait_queue_entry_t *wait, * The important information it's conveying is whether the entry at * this index used to be a PMD entry. */ -static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all) +static void dax_wake_entry(struct xa_state *xas, void *entry, + enum dax_wake_mode mode) { struct exceptional_entry_key key; wait_queue_head_t *wq; @@ -196,7 +207,7 @@ static void dax_wake_entry(struct xa_state *xas, void *entry, bool wake_all) * must be in the waitqueue and the following check will see them. */ if (waitqueue_active(wq)) - __wake_up(wq, TASK_NORMAL, wake_all ? 0 : 1, &key); + __wake_up(wq, TASK_NORMAL, mode == WAKE_ALL ? 0 : 1, &key); } /* @@ -264,11 +275,11 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry) finish_wait(wq, &ewait.wait); } -static void put_unlocked_entry(struct xa_state *xas, void *entry) +static void put_unlocked_entry(struct xa_state *xas, void *entry, + enum dax_wake_mode mode) { - /* If we were the only waiter woken, wake the next one */ if (entry && !dax_is_conflict(entry)) - dax_wake_entry(xas, entry, false); + dax_wake_entry(xas, entry, mode); } /* @@ -286,7 +297,7 @@ static void dax_unlock_entry(struct xa_state *xas, void *entry) old = xas_store(xas, entry); xas_unlock_irq(xas); BUG_ON(!dax_is_locked(old)); - dax_wake_entry(xas, entry, false); + dax_wake_entry(xas, entry, WAKE_NEXT); } /* @@ -524,8 +535,8 @@ retry: dax_disassociate_entry(entry, mapping, false); xas_store(xas, NULL); /* undo the PMD join */ - dax_wake_entry(xas, entry, true); - mapping->nrexceptional--; + dax_wake_entry(xas, entry, WAKE_ALL); + mapping->nrpages -= PG_PMD_NR; entry = NULL; xas_set(xas, index); } @@ -541,7 +552,7 @@ retry: dax_lock_entry(xas, entry); if (xas_error(xas)) goto out_unlock; - mapping->nrexceptional++; + mapping->nrpages += 1UL << order; } out_unlock: @@ -622,7 +633,7 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping, entry = get_unlocked_entry(&xas, 0); if (entry) page = dax_busy_page(entry); - put_unlocked_entry(&xas, entry); + put_unlocked_entry(&xas, entry, WAKE_NEXT); if (page) break; if (++scanned % XA_CHECK_SCHED) @@ -661,10 +672,10 @@ static int __dax_invalidate_entry(struct address_space *mapping, goto out; dax_disassociate_entry(entry, mapping, trunc); xas_store(&xas, NULL); - mapping->nrexceptional--; + mapping->nrpages -= 1UL << dax_entry_order(entry); ret = 1; out: - put_unlocked_entry(&xas, entry); + put_unlocked_entry(&xas, entry, WAKE_ALL); xas_unlock_irq(&xas); return ret; } @@ -937,13 +948,13 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, xas_lock_irq(xas); xas_store(xas, entry); xas_clear_mark(xas, PAGECACHE_TAG_DIRTY); - dax_wake_entry(xas, entry, false); + dax_wake_entry(xas, entry, WAKE_NEXT); trace_dax_writeback_one(mapping->host, index, count); return ret; put_unlocked: - put_unlocked_entry(xas, entry); + put_unlocked_entry(xas, entry, WAKE_NEXT); return ret; } @@ -965,7 +976,7 @@ int dax_writeback_mapping_range(struct address_space *mapping, if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) return -EIO; - if (!mapping->nrexceptional || wbc->sync_mode != WB_SYNC_ALL) + if (mapping_empty(mapping) || wbc->sync_mode != WB_SYNC_ALL) return 0; trace_dax_writeback_range(inode, xas.xa_index, end_index); @@ -1684,7 +1695,7 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) /* Did we race with someone splitting entry or so? */ if (!entry || dax_is_conflict(entry) || (order == 0 && !dax_is_pte_entry(entry))) { - put_unlocked_entry(&xas, entry); + put_unlocked_entry(&xas, entry, WAKE_NEXT); xas_unlock_irq(&xas); trace_dax_insert_pfn_mkwrite_no_entry(mapping->host, vmf, VM_FAULT_NOPAGE); diff --git a/fs/dcache.c b/fs/dcache.c index 7d24ff7eb206..cf871a81f4fd 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -84,6 +84,8 @@ const struct qstr empty_name = QSTR_INIT("", 0); EXPORT_SYMBOL(empty_name); const struct qstr slash_name = QSTR_INIT("/", 1); EXPORT_SYMBOL(slash_name); +const struct qstr dotdot_name = QSTR_INIT("..", 2); +EXPORT_SYMBOL(dotdot_name); /* * This is the single most critical data structure when it comes diff --git a/fs/debugfs/file.c b/fs/debugfs/file.c index 686e0ad28788..e813acfaa6e8 100644 --- a/fs/debugfs/file.c +++ b/fs/debugfs/file.c @@ -773,7 +773,7 @@ EXPORT_SYMBOL_GPL(debugfs_create_atomic_t); ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { - char buf[3]; + char buf[2]; bool val; int r; struct dentry *dentry = F_DENTRY(file); @@ -789,7 +789,6 @@ ssize_t debugfs_read_file_bool(struct file *file, char __user *user_buf, else buf[0] = 'N'; buf[1] = '\n'; - buf[2] = 0x00; return simple_read_from_buffer(user_buf, count, ppos, buf, 2); } EXPORT_SYMBOL_GPL(debugfs_read_file_bool); @@ -865,6 +864,97 @@ struct dentry *debugfs_create_bool(const char *name, umode_t mode, } EXPORT_SYMBOL_GPL(debugfs_create_bool); +ssize_t debugfs_read_file_str(struct file *file, char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct dentry *dentry = F_DENTRY(file); + char *str, *copy = NULL; + int copy_len, len; + ssize_t ret; + + ret = debugfs_file_get(dentry); + if (unlikely(ret)) + return ret; + + str = *(char **)file->private_data; + len = strlen(str) + 1; + copy = kmalloc(len, GFP_KERNEL); + if (!copy) { + debugfs_file_put(dentry); + return -ENOMEM; + } + + copy_len = strscpy(copy, str, len); + debugfs_file_put(dentry); + if (copy_len < 0) { + kfree(copy); + return copy_len; + } + + copy[copy_len] = '\n'; + + ret = simple_read_from_buffer(user_buf, count, ppos, copy, copy_len); + kfree(copy); + + return ret; +} + +static ssize_t debugfs_write_file_str(struct file *file, const char __user *user_buf, + size_t count, loff_t *ppos) +{ + /* This is really only for read-only strings */ + return -EINVAL; +} + +static const struct file_operations fops_str = { + .read = debugfs_read_file_str, + .write = debugfs_write_file_str, + .open = simple_open, + .llseek = default_llseek, +}; + +static const struct file_operations fops_str_ro = { + .read = debugfs_read_file_str, + .open = simple_open, + .llseek = default_llseek, +}; + +static const struct file_operations fops_str_wo = { + .write = debugfs_write_file_str, + .open = simple_open, + .llseek = default_llseek, +}; + +/** + * debugfs_create_str - create a debugfs file that is used to read and write a string value + * @name: a pointer to a string containing the name of the file to create. + * @mode: the permission that the file should have + * @parent: a pointer to the parent dentry for this file. This should be a + * directory dentry if set. If this parameter is %NULL, then the + * file will be created in the root of the debugfs filesystem. + * @value: a pointer to the variable that the file should read to and write + * from. + * + * This function creates a file in debugfs with the given name that + * contains the value of the variable @value. If the @mode variable is so + * set, it can be read from, and written to. + * + * This function will return a pointer to a dentry if it succeeds. This + * pointer must be passed to the debugfs_remove() function when the file is + * to be removed (no automatic cleanup happens if your module is unloaded, + * you are responsible here.) If an error occurs, ERR_PTR(-ERROR) will be + * returned. + * + * If debugfs is not enabled in the kernel, the value ERR_PTR(-ENODEV) will + * be returned. + */ +void debugfs_create_str(const char *name, umode_t mode, + struct dentry *parent, char **value) +{ + debugfs_create_mode_unsafe(name, mode, parent, value, &fops_str, + &fops_str_ro, &fops_str_wo); +} + static ssize_t read_file_blob(struct file *file, char __user *user_buf, size_t count, loff_t *ppos) { diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 22e86ae4dd5a..8129a430d789 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -35,7 +35,7 @@ static struct vfsmount *debugfs_mount; static int debugfs_mount_count; static bool debugfs_registered; -static unsigned int debugfs_allow = DEFAULT_DEBUGFS_ALLOW_BITS; +static unsigned int debugfs_allow __ro_after_init = DEFAULT_DEBUGFS_ALLOW_BITS; /* * Don't allow access attributes to be changed whilst the kernel is locked down @@ -45,10 +45,13 @@ static unsigned int debugfs_allow = DEFAULT_DEBUGFS_ALLOW_BITS; static int debugfs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, struct iattr *ia) { - int ret = security_locked_down(LOCKDOWN_DEBUGFS); + int ret; - if (ret && (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID))) - return ret; + if (ia->ia_valid & (ATTR_MODE | ATTR_UID | ATTR_GID)) { + ret = security_locked_down(LOCKDOWN_DEBUGFS); + if (ret) + return ret; + } return simple_setattr(&init_user_ns, dentry, ia); } diff --git a/fs/dlm/config.c b/fs/dlm/config.c index 49c5f9407098..88d95d96e36c 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -125,7 +125,7 @@ static ssize_t cluster_cluster_name_store(struct config_item *item, CONFIGFS_ATTR(cluster_, cluster_name); static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, - int *info_field, bool (*check_cb)(unsigned int x), + int *info_field, int (*check_cb)(unsigned int x), const char *buf, size_t len) { unsigned int x; @@ -137,8 +137,11 @@ static ssize_t cluster_set(struct dlm_cluster *cl, unsigned int *cl_field, if (rc) return rc; - if (check_cb && check_cb(x)) - return -EINVAL; + if (check_cb) { + rc = check_cb(x); + if (rc) + return rc; + } *cl_field = x; *info_field = x; @@ -161,17 +164,53 @@ static ssize_t cluster_##name##_show(struct config_item *item, char *buf) \ } \ CONFIGFS_ATTR(cluster_, name); -static bool dlm_check_zero(unsigned int x) +static int dlm_check_protocol_and_dlm_running(unsigned int x) +{ + switch (x) { + case 0: + /* TCP */ + break; + case 1: + /* SCTP */ + break; + default: + return -EINVAL; + } + + if (dlm_allow_conn) + return -EBUSY; + + return 0; +} + +static int dlm_check_zero_and_dlm_running(unsigned int x) +{ + if (!x) + return -EINVAL; + + if (dlm_allow_conn) + return -EBUSY; + + return 0; +} + +static int dlm_check_zero(unsigned int x) { - return !x; + if (!x) + return -EINVAL; + + return 0; } -static bool dlm_check_buffer_size(unsigned int x) +static int dlm_check_buffer_size(unsigned int x) { - return (x < DEFAULT_BUFFER_SIZE); + if (x < DEFAULT_BUFFER_SIZE) + return -EINVAL; + + return 0; } -CLUSTER_ATTR(tcp_port, dlm_check_zero); +CLUSTER_ATTR(tcp_port, dlm_check_zero_and_dlm_running); CLUSTER_ATTR(buffer_size, dlm_check_buffer_size); CLUSTER_ATTR(rsbtbl_size, dlm_check_zero); CLUSTER_ATTR(recover_timer, dlm_check_zero); @@ -179,7 +218,7 @@ CLUSTER_ATTR(toss_secs, dlm_check_zero); CLUSTER_ATTR(scan_secs, dlm_check_zero); CLUSTER_ATTR(log_debug, NULL); CLUSTER_ATTR(log_info, NULL); -CLUSTER_ATTR(protocol, NULL); +CLUSTER_ATTR(protocol, dlm_check_protocol_and_dlm_running); CLUSTER_ATTR(mark, NULL); CLUSTER_ATTR(timewarn_cs, dlm_check_zero); CLUSTER_ATTR(waitwarn_us, NULL); @@ -688,6 +727,7 @@ static ssize_t comm_mark_show(struct config_item *item, char *buf) static ssize_t comm_mark_store(struct config_item *item, const char *buf, size_t len) { + struct dlm_comm *comm; unsigned int mark; int rc; @@ -695,7 +735,15 @@ static ssize_t comm_mark_store(struct config_item *item, const char *buf, if (rc) return rc; - config_item_to_comm(item)->mark = mark; + if (mark == 0) + mark = dlm_config.ci_mark; + + comm = config_item_to_comm(item); + rc = dlm_lowcomms_nodes_set_mark(comm->nodeid, mark); + if (rc) + return rc; + + comm->mark = mark; return len; } @@ -870,24 +918,6 @@ int dlm_comm_seq(int nodeid, uint32_t *seq) return 0; } -void dlm_comm_mark(int nodeid, unsigned int *mark) -{ - struct dlm_comm *cm; - - cm = get_comm(nodeid); - if (!cm) { - *mark = dlm_config.ci_mark; - return; - } - - if (cm->mark) - *mark = cm->mark; - else - *mark = dlm_config.ci_mark; - - put_comm(cm); -} - int dlm_our_nodeid(void) { return local_comm ? local_comm->nodeid : 0; diff --git a/fs/dlm/config.h b/fs/dlm/config.h index c210250a2581..d2cd4bd20313 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -48,7 +48,6 @@ void dlm_config_exit(void); int dlm_config_nodes(char *lsname, struct dlm_config_node **nodes_out, int *count_out); int dlm_comm_seq(int nodeid, uint32_t *seq); -void dlm_comm_mark(int nodeid, unsigned int *mark); int dlm_our_nodeid(void); int dlm_our_addr(struct sockaddr_storage *addr, int num); diff --git a/fs/dlm/debug_fs.c b/fs/dlm/debug_fs.c index d6bbccb0ed15..d5bd990bcab8 100644 --- a/fs/dlm/debug_fs.c +++ b/fs/dlm/debug_fs.c @@ -542,6 +542,7 @@ static void *table_seq_next(struct seq_file *seq, void *iter_ptr, loff_t *pos) if (bucket >= ls->ls_rsbtbl_size) { kfree(ri); + ++*pos; return NULL; } tree = toss ? &ls->ls_rsbtbl[bucket].toss : &ls->ls_rsbtbl[bucket].keep; diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index 002123efc6b0..b93df39d0915 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -3541,8 +3541,6 @@ static int _create_message(struct dlm_ls *ls, int mb_len, if (!mh) return -ENOBUFS; - memset(mb, 0, mb_len); - ms = (struct dlm_message *) mb; ms->m_header.h_version = (DLM_HEADER_MAJOR | DLM_HEADER_MINOR); diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 561dcad08ad6..c14cf2b7faab 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -404,12 +404,6 @@ static int threads_start(void) return error; } -static void threads_stop(void) -{ - dlm_scand_stop(); - dlm_lowcomms_stop(); -} - static int new_lockspace(const char *name, const char *cluster, uint32_t flags, int lvblen, const struct dlm_lockspace_ops *ops, void *ops_arg, @@ -702,8 +696,11 @@ int dlm_new_lockspace(const char *name, const char *cluster, ls_count++; if (error > 0) error = 0; - if (!ls_count) - threads_stop(); + if (!ls_count) { + dlm_scand_stop(); + dlm_lowcomms_shutdown(); + dlm_lowcomms_stop(); + } out: mutex_unlock(&ls_lock); return error; @@ -788,6 +785,11 @@ static int release_lockspace(struct dlm_ls *ls, int force) dlm_recoverd_stop(ls); + if (ls_count == 1) { + dlm_scand_stop(); + dlm_lowcomms_shutdown(); + } + dlm_callback_stop(ls); remove_lockspace(ls); @@ -880,7 +882,7 @@ int dlm_release_lockspace(void *lockspace, int force) if (!error) ls_count--; if (!ls_count) - threads_stop(); + dlm_lowcomms_stop(); mutex_unlock(&ls_lock); return error; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 372c34ff8594..166e36fcf3e4 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -102,6 +102,9 @@ struct listen_connection { struct work_struct rwork; }; +#define DLM_WQ_REMAIN_BYTES(e) (PAGE_SIZE - e->end) +#define DLM_WQ_LENGTH_BYTES(e) (e->end - e->offset) + /* An entry waiting to be sent */ struct writequeue_entry { struct list_head list; @@ -116,6 +119,7 @@ struct writequeue_entry { struct dlm_node_addr { struct list_head list; int nodeid; + int mark; int addr_count; int curr_addr_index; struct sockaddr_storage *addr[DLM_MAX_ADDR_COUNT]; @@ -134,7 +138,7 @@ static DEFINE_SPINLOCK(dlm_node_addrs_spin); static struct listen_connection listen_con; static struct sockaddr_storage *dlm_local_addr[DLM_MAX_ADDR_COUNT]; static int dlm_local_count; -static int dlm_allow_conn; +int dlm_allow_conn; /* Work queues */ static struct workqueue_struct *recv_workqueue; @@ -303,7 +307,8 @@ static int addr_compare(const struct sockaddr_storage *x, } static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, - struct sockaddr *sa_out, bool try_new_addr) + struct sockaddr *sa_out, bool try_new_addr, + unsigned int *mark) { struct sockaddr_storage sas; struct dlm_node_addr *na; @@ -331,6 +336,8 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, if (!na->addr_count) return -ENOENT; + *mark = na->mark; + if (sas_out) memcpy(sas_out, &sas, sizeof(struct sockaddr_storage)); @@ -350,7 +357,8 @@ static int nodeid_to_addr(int nodeid, struct sockaddr_storage *sas_out, return 0; } -static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid) +static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid, + unsigned int *mark) { struct dlm_node_addr *na; int rv = -EEXIST; @@ -364,6 +372,7 @@ static int addr_to_nodeid(struct sockaddr_storage *addr, int *nodeid) for (addr_i = 0; addr_i < na->addr_count; addr_i++) { if (addr_compare(na->addr[addr_i], addr)) { *nodeid = na->nodeid; + *mark = na->mark; rv = 0; goto unlock; } @@ -412,6 +421,7 @@ int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len) new_node->nodeid = nodeid; new_node->addr[0] = new_addr; new_node->addr_count = 1; + new_node->mark = dlm_config.ci_mark; list_add(&new_node->list, &dlm_node_addrs); spin_unlock(&dlm_node_addrs_spin); return 0; @@ -519,6 +529,23 @@ int dlm_lowcomms_connect_node(int nodeid) return 0; } +int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark) +{ + struct dlm_node_addr *na; + + spin_lock(&dlm_node_addrs_spin); + na = find_node_addr(nodeid); + if (!na) { + spin_unlock(&dlm_node_addrs_spin); + return -ENOENT; + } + + na->mark = mark; + spin_unlock(&dlm_node_addrs_spin); + + return 0; +} + static void lowcomms_error_report(struct sock *sk) { struct connection *con; @@ -685,10 +712,7 @@ static void shutdown_connection(struct connection *con) { int ret; - if (cancel_work_sync(&con->swork)) { - log_print("canceled swork for node %d", con->nodeid); - clear_bit(CF_WRITE_PENDING, &con->flags); - } + flush_work(&con->swork); mutex_lock(&con->sock_mutex); /* nothing to shutdown */ @@ -867,7 +891,7 @@ static int accept_from_sock(struct listen_connection *con) /* Get the new node's NODEID */ make_sockaddr(&peeraddr, 0, &len); - if (addr_to_nodeid(&peeraddr, &nodeid)) { + if (addr_to_nodeid(&peeraddr, &nodeid, &mark)) { unsigned char *b=(unsigned char *)&peeraddr; log_print("connect from non cluster node"); print_hex_dump_bytes("ss: ", DUMP_PREFIX_NONE, @@ -876,9 +900,6 @@ static int accept_from_sock(struct listen_connection *con) return -1; } - dlm_comm_mark(nodeid, &mark); - sock_set_mark(newsock->sk, mark); - log_print("got connection from %d", nodeid); /* Check to see if we already have a connection to this node. This @@ -892,6 +913,8 @@ static int accept_from_sock(struct listen_connection *con) goto accept_err; } + sock_set_mark(newsock->sk, mark); + mutex_lock(&newcon->sock_mutex); if (newcon->sock) { struct connection *othercon = newcon->othercon; @@ -908,16 +931,18 @@ static int accept_from_sock(struct listen_connection *con) result = dlm_con_init(othercon, nodeid); if (result < 0) { kfree(othercon); + mutex_unlock(&newcon->sock_mutex); goto accept_err; } + lockdep_set_subclass(&othercon->sock_mutex, 1); newcon->othercon = othercon; } else { /* close other sock con if we have something new */ close_connection(othercon, false, true, false); } - mutex_lock_nested(&othercon->sock_mutex, 1); + mutex_lock(&othercon->sock_mutex); add_sock(newsock, othercon); addcon = othercon; mutex_unlock(&othercon->sock_mutex); @@ -930,6 +955,7 @@ static int accept_from_sock(struct listen_connection *con) addcon = newcon; } + set_bit(CF_CONNECTED, &addcon->flags); mutex_unlock(&newcon->sock_mutex); /* @@ -1015,8 +1041,6 @@ static void sctp_connect_to_sock(struct connection *con) struct socket *sock; unsigned int mark; - dlm_comm_mark(con->nodeid, &mark); - mutex_lock(&con->sock_mutex); /* Some odd races can cause double-connects, ignore them */ @@ -1029,7 +1053,7 @@ static void sctp_connect_to_sock(struct connection *con) } memset(&daddr, 0, sizeof(daddr)); - result = nodeid_to_addr(con->nodeid, &daddr, NULL, true); + result = nodeid_to_addr(con->nodeid, &daddr, NULL, true, &mark); if (result < 0) { log_print("no address for nodeid %d", con->nodeid); goto out; @@ -1104,13 +1128,11 @@ out: static void tcp_connect_to_sock(struct connection *con) { struct sockaddr_storage saddr, src_addr; + unsigned int mark; int addr_len; struct socket *sock = NULL; - unsigned int mark; int result; - dlm_comm_mark(con->nodeid, &mark); - mutex_lock(&con->sock_mutex); if (con->retries++ > MAX_CONNECT_RETRIES) goto out; @@ -1125,15 +1147,15 @@ static void tcp_connect_to_sock(struct connection *con) if (result < 0) goto out_err; - sock_set_mark(sock->sk, mark); - memset(&saddr, 0, sizeof(saddr)); - result = nodeid_to_addr(con->nodeid, &saddr, NULL, false); + result = nodeid_to_addr(con->nodeid, &saddr, NULL, false, &mark); if (result < 0) { log_print("no address for nodeid %d", con->nodeid); goto out_err; } + sock_set_mark(sock->sk, mark); + add_sock(sock, con); /* Bind to our cluster-known address connecting to avoid @@ -1330,70 +1352,72 @@ static struct writequeue_entry *new_writequeue_entry(struct connection *con, { struct writequeue_entry *entry; - entry = kmalloc(sizeof(struct writequeue_entry), allocation); + entry = kzalloc(sizeof(*entry), allocation); if (!entry) return NULL; - entry->page = alloc_page(allocation); + entry->page = alloc_page(allocation | __GFP_ZERO); if (!entry->page) { kfree(entry); return NULL; } - entry->offset = 0; - entry->len = 0; - entry->end = 0; - entry->users = 0; entry->con = con; + entry->users = 1; return entry; } -void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) +static struct writequeue_entry *new_wq_entry(struct connection *con, int len, + gfp_t allocation, char **ppc) { - struct connection *con; struct writequeue_entry *e; - int offset = 0; - if (len > LOWCOMMS_MAX_TX_BUFFER_LEN) { - BUILD_BUG_ON(PAGE_SIZE < LOWCOMMS_MAX_TX_BUFFER_LEN); - log_print("failed to allocate a buffer of size %d", len); - return NULL; + spin_lock(&con->writequeue_lock); + if (!list_empty(&con->writequeue)) { + e = list_last_entry(&con->writequeue, struct writequeue_entry, list); + if (DLM_WQ_REMAIN_BYTES(e) >= len) { + *ppc = page_address(e->page) + e->end; + e->end += len; + e->users++; + spin_unlock(&con->writequeue_lock); + + return e; + } } + spin_unlock(&con->writequeue_lock); - con = nodeid2con(nodeid, allocation); - if (!con) + e = new_writequeue_entry(con, allocation); + if (!e) return NULL; + *ppc = page_address(e->page); + e->end += len; + spin_lock(&con->writequeue_lock); - e = list_entry(con->writequeue.prev, struct writequeue_entry, list); - if ((&e->list == &con->writequeue) || - (PAGE_SIZE - e->end < len)) { - e = NULL; - } else { - offset = e->end; - e->end += len; - e->users++; - } + list_add_tail(&e->list, &con->writequeue); spin_unlock(&con->writequeue_lock); - if (e) { - got_one: - *ppc = page_address(e->page) + offset; - return e; - } + return e; +}; - e = new_writequeue_entry(con, allocation); - if (e) { - spin_lock(&con->writequeue_lock); - offset = e->end; - e->end += len; - e->users++; - list_add_tail(&e->list, &con->writequeue); - spin_unlock(&con->writequeue_lock); - goto got_one; +void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc) +{ + struct connection *con; + + if (len > DEFAULT_BUFFER_SIZE || + len < sizeof(struct dlm_header)) { + BUILD_BUG_ON(PAGE_SIZE < DEFAULT_BUFFER_SIZE); + log_print("failed to allocate a buffer of size %d", len); + WARN_ON(1); + return NULL; } - return NULL; + + con = nodeid2con(nodeid, allocation); + if (!con) + return NULL; + + return new_wq_entry(con, len, allocation, ppc); } void dlm_lowcomms_commit_buffer(void *mh) @@ -1406,7 +1430,8 @@ void dlm_lowcomms_commit_buffer(void *mh) users = --e->users; if (users) goto out; - e->len = e->end - e->offset; + + e->len = DLM_WQ_LENGTH_BYTES(e); spin_unlock(&con->writequeue_lock); queue_work(send_workqueue, &con->swork); @@ -1432,11 +1457,10 @@ static void send_to_sock(struct connection *con) spin_lock(&con->writequeue_lock); for (;;) { - e = list_entry(con->writequeue.next, struct writequeue_entry, - list); - if ((struct list_head *) e == &con->writequeue) + if (list_empty(&con->writequeue)) break; + e = list_first_entry(&con->writequeue, struct writequeue_entry, list); len = e->len; offset = e->offset; BUG_ON(len == 0 && e->users == 0); @@ -1589,6 +1613,29 @@ static int work_start(void) return 0; } +static void shutdown_conn(struct connection *con) +{ + if (con->shutdown_action) + con->shutdown_action(con); +} + +void dlm_lowcomms_shutdown(void) +{ + /* Set all the flags to prevent any + * socket activity. + */ + dlm_allow_conn = 0; + + if (recv_workqueue) + flush_workqueue(recv_workqueue); + if (send_workqueue) + flush_workqueue(send_workqueue); + + dlm_close_sock(&listen_con.sock); + + foreach_conn(shutdown_conn); +} + static void _stop_conn(struct connection *con, bool and_other) { mutex_lock(&con->sock_mutex); @@ -1610,12 +1657,6 @@ static void stop_conn(struct connection *con) _stop_conn(con, true); } -static void shutdown_conn(struct connection *con) -{ - if (con->shutdown_action) - con->shutdown_action(con); -} - static void connection_release(struct rcu_head *rcu) { struct connection *con = container_of(rcu, struct connection, rcu); @@ -1672,19 +1713,6 @@ static void work_flush(void) void dlm_lowcomms_stop(void) { - /* Set all the flags to prevent any - socket activity. - */ - dlm_allow_conn = 0; - - if (recv_workqueue) - flush_workqueue(recv_workqueue); - if (send_workqueue) - flush_workqueue(send_workqueue); - - dlm_close_sock(&listen_con.sock); - - foreach_conn(shutdown_conn); work_flush(); foreach_conn(free_conn); work_stop(); diff --git a/fs/dlm/lowcomms.h b/fs/dlm/lowcomms.h index 0918f9376489..48bbc4e18761 100644 --- a/fs/dlm/lowcomms.h +++ b/fs/dlm/lowcomms.h @@ -14,13 +14,18 @@ #define LOWCOMMS_MAX_TX_BUFFER_LEN 4096 +/* switch to check if dlm is running */ +extern int dlm_allow_conn; + int dlm_lowcomms_start(void); +void dlm_lowcomms_shutdown(void); void dlm_lowcomms_stop(void); void dlm_lowcomms_exit(void); int dlm_lowcomms_close(int nodeid); void *dlm_lowcomms_get_buffer(int nodeid, int len, gfp_t allocation, char **ppc); void dlm_lowcomms_commit_buffer(void *mh); int dlm_lowcomms_connect_node(int nodeid); +int dlm_lowcomms_nodes_set_mark(int nodeid, unsigned int mark); int dlm_lowcomms_addr(int nodeid, struct sockaddr_storage *addr, int len); #endif /* __LOWCOMMS_DOT_H__ */ diff --git a/fs/dlm/midcomms.c b/fs/dlm/midcomms.c index fde3a6afe4be..1c6654a21ec4 100644 --- a/fs/dlm/midcomms.c +++ b/fs/dlm/midcomms.c @@ -22,8 +22,6 @@ * into packets and sends them to the comms layer. */ -#include <asm/unaligned.h> - #include "dlm_internal.h" #include "lowcomms.h" #include "config.h" @@ -45,13 +43,22 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) while (len >= sizeof(struct dlm_header)) { hd = (struct dlm_header *)ptr; - /* no message should be more than this otherwise we - * cannot deliver this message to upper layers + /* no message should be more than DEFAULT_BUFFER_SIZE or + * less than dlm_header size. + * + * Some messages does not have a 8 byte length boundary yet + * which can occur in a unaligned memory access of some dlm + * messages. However this problem need to be fixed at the + * sending side, for now it seems nobody run into architecture + * related issues yet but it slows down some processing. + * Fixing this issue should be scheduled in future by doing + * the next major version bump. */ - msglen = get_unaligned_le16(&hd->h_length); - if (msglen > DEFAULT_BUFFER_SIZE) { - log_print("received invalid length header: %u, will abort message parsing", - msglen); + msglen = le16_to_cpu(hd->h_length); + if (msglen > DEFAULT_BUFFER_SIZE || + msglen < sizeof(struct dlm_header)) { + log_print("received invalid length header: %u from node %d, will abort message parsing", + msglen, nodeid); return -EBADMSG; } @@ -84,15 +91,7 @@ int dlm_process_incoming_buffer(int nodeid, unsigned char *buf, int len) goto skip; } - /* for aligned memory access, we just copy current message - * to begin of the buffer which contains already parsed buffer - * data and should provide align access for upper layers - * because the start address of the buffer has a aligned - * address. This memmove can be removed when the upperlayer - * is capable of unaligned memory access. - */ - memmove(buf, ptr, msglen); - dlm_receive_buffer((union dlm_packet *)buf, nodeid); + dlm_receive_buffer((union dlm_packet *)ptr, nodeid); skip: ret += msglen; diff --git a/fs/dlm/rcom.c b/fs/dlm/rcom.c index 73ddee5159d7..f5b1bd65728d 100644 --- a/fs/dlm/rcom.c +++ b/fs/dlm/rcom.c @@ -41,7 +41,6 @@ static int create_rcom(struct dlm_ls *ls, int to_nodeid, int type, int len, to_nodeid, type, len); return -ENOBUFS; } - memset(mb, 0, mb_len); rc = (struct dlm_rcom *) mb; @@ -462,7 +461,6 @@ int dlm_send_ls_not_ready(int nodeid, struct dlm_rcom *rc_in) mh = dlm_lowcomms_get_buffer(nodeid, mb_len, GFP_NOFS, &mb); if (!mh) return -ENOBUFS; - memset(mb, 0, mb_len); rc = (struct dlm_rcom *) mb; diff --git a/fs/ecryptfs/crypto.c b/fs/ecryptfs/crypto.c index 943e523f4c9d..e3f5d7f3c8a0 100644 --- a/fs/ecryptfs/crypto.c +++ b/fs/ecryptfs/crypto.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer * * Copyright (C) 1997-2004 Erez Zadok @@ -296,10 +296,6 @@ static int crypt_scatterlist(struct ecryptfs_crypt_stat *crypt_stat, struct extent_crypt_result ecr; int rc = 0; - if (!crypt_stat || !crypt_stat->tfm - || !(crypt_stat->flags & ECRYPTFS_STRUCT_INITIALIZED)) - return -EINVAL; - if (unlikely(ecryptfs_verbosity > 0)) { ecryptfs_printk(KERN_DEBUG, "Key size [%zd]; key:\n", crypt_stat->key_size); @@ -350,7 +346,7 @@ out: return rc; } -/** +/* * lower_offset_for_page * * Convert an eCryptfs page index into a lower byte offset @@ -535,7 +531,7 @@ int ecryptfs_decrypt_page(struct page *page) rc = crypt_extent(crypt_stat, page, page, extent_offset, DECRYPT); if (rc) { - printk(KERN_ERR "%s: Error encrypting extent; " + printk(KERN_ERR "%s: Error decrypting extent; " "rc = [%d]\n", __func__, rc); goto out; } @@ -627,9 +623,8 @@ void ecryptfs_set_default_sizes(struct ecryptfs_crypt_stat *crypt_stat) } } -/** +/* * ecryptfs_compute_root_iv - * @crypt_stats * * On error, sets the root IV to all 0's. */ @@ -1370,7 +1365,7 @@ int ecryptfs_read_and_validate_xattr_region(struct dentry *dentry, return rc; } -/** +/* * ecryptfs_read_metadata * * Common entry point for reading file metadata. From here, we could @@ -1448,7 +1443,7 @@ out: return rc; } -/** +/* * ecryptfs_encrypt_filename - encrypt filename * * CBC-encrypts the filename. We do not want to encrypt the same @@ -1590,11 +1585,10 @@ out: struct kmem_cache *ecryptfs_key_tfm_cache; static struct list_head key_tfm_list; -struct mutex key_tfm_list_mutex; +DEFINE_MUTEX(key_tfm_list_mutex); int __init ecryptfs_init_crypto(void) { - mutex_init(&key_tfm_list_mutex); INIT_LIST_HEAD(&key_tfm_list); return 0; } @@ -1877,10 +1871,11 @@ out: /** * ecryptfs_encrypt_and_encode_filename - converts a plaintext file name to cipher text - * @crypt_stat: The crypt_stat struct associated with the file anem to encode + * @encoded_name: The encrypted name + * @encoded_name_size: Length of the encrypted name + * @mount_crypt_stat: The crypt_stat struct associated with the file name to encode * @name: The plaintext name - * @length: The length of the plaintext - * @encoded_name: The encypted name + * @name_size: The length of the plaintext name * * Encrypts and encodes a filename into something that constitutes a * valid filename for a filesystem, with printable characters. @@ -1992,7 +1987,7 @@ static bool is_dot_dotdot(const char *name, size_t name_size) * ecryptfs_decode_and_decrypt_filename - converts the encoded cipher text name to decoded plaintext * @plaintext_name: The plaintext name * @plaintext_name_size: The plaintext name size - * @ecryptfs_dir_dentry: eCryptfs directory dentry + * @sb: Ecryptfs's super_block * @name: The filename in cipher text * @name_size: The cipher text name size * diff --git a/fs/ecryptfs/debug.c b/fs/ecryptfs/debug.c index 1f65e99f9a41..cf6d0e8e25a1 100644 --- a/fs/ecryptfs/debug.c +++ b/fs/ecryptfs/debug.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer * Functions only useful for debugging. * @@ -9,7 +9,7 @@ #include "ecryptfs_kernel.h" -/** +/* * ecryptfs_dump_auth_tok - debug function to print auth toks * * This function will print the contents of an ecryptfs authentication diff --git a/fs/ecryptfs/dentry.c b/fs/ecryptfs/dentry.c index 44606f079efb..acaa0825e9bb 100644 --- a/fs/ecryptfs/dentry.c +++ b/fs/ecryptfs/dentry.c @@ -1,5 +1,5 @@ // SPDX-License-Identifier: GPL-2.0-or-later -/** +/* * eCryptfs: Linux filesystem encryption layer |