diff options
Diffstat (limited to 'fs')
671 files changed, 20003 insertions, 13293 deletions
diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index 39def020a074..cdb99507ef33 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -583,7 +583,7 @@ static struct attribute *v9fs_attrs[] = { NULL, }; -static struct attribute_group v9fs_attr_group = { +static const struct attribute_group v9fs_attr_group = { .attrs = v9fs_attrs, }; diff --git a/fs/9p/vfs_file.c b/fs/9p/vfs_file.c index 649f04f112dc..59c32c9b799f 100644 --- a/fs/9p/vfs_file.c +++ b/fs/9p/vfs_file.c @@ -86,8 +86,8 @@ int v9fs_file_open(struct inode *inode, struct file *file) * to work. */ writeback_fid = v9fs_writeback_fid(file_dentry(file)); - if (IS_ERR(fid)) { - err = PTR_ERR(fid); + if (IS_ERR(writeback_fid)) { + err = PTR_ERR(writeback_fid); mutex_unlock(&v9inode->v_mutex); goto out_error; } diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c index 8d97f0b45e9c..795706520b5e 100644 --- a/fs/9p/vfs_inode.c +++ b/fs/9p/vfs_inode.c @@ -399,7 +399,7 @@ static int v9fs_test_inode(struct inode *inode, void *data) umode = p9mode2unixmode(v9ses, st, &rdev); /* don't match inode of different type */ - if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) + if (inode_wrong_type(inode, umode)) return 0; /* compare qid details */ @@ -1390,7 +1390,7 @@ int v9fs_refresh_inode(struct p9_fid *fid, struct inode *inode) * Don't update inode if the file type is different */ umode = p9mode2unixmode(v9ses, st, &rdev); - if ((inode->i_mode & S_IFMT) != (umode & S_IFMT)) + if (inode_wrong_type(inode, umode)) goto out; /* diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c index 1dc7af046615..e1c0240b51c0 100644 --- a/fs/9p/vfs_inode_dotl.c +++ b/fs/9p/vfs_inode_dotl.c @@ -59,7 +59,7 @@ static int v9fs_test_inode_dotl(struct inode *inode, void *data) struct p9_stat_dotl *st = (struct p9_stat_dotl *)data; /* don't match inode of different type */ - if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) + if (inode_wrong_type(inode, st->st_mode)) return 0; if (inode->i_generation != st->st_gen) @@ -663,14 +663,10 @@ 
v9fs_stat2inode_dotl(struct p9_stat_dotl *stat, struct inode *inode, if (stat->st_result_mask & P9_STATS_NLINK) set_nlink(inode, stat->st_nlink); if (stat->st_result_mask & P9_STATS_MODE) { - inode->i_mode = stat->st_mode; - if ((S_ISBLK(inode->i_mode)) || - (S_ISCHR(inode->i_mode))) - init_special_inode(inode, inode->i_mode, - inode->i_rdev); + mode = stat->st_mode & S_IALLUGO; + mode |= inode->i_mode & ~S_IALLUGO; + inode->i_mode = mode; } - if (stat->st_result_mask & P9_STATS_RDEV) - inode->i_rdev = new_decode_dev(stat->st_rdev); if (!(flags & V9FS_STAT2INODE_KEEP_ISIZE) && stat->st_result_mask & P9_STATS_SIZE) v9fs_i_size_write(inode, stat->st_size); @@ -959,7 +955,7 @@ int v9fs_refresh_inode_dotl(struct p9_fid *fid, struct inode *inode) /* * Don't update inode if the file type is different */ - if ((inode->i_mode & S_IFMT) != (st->st_mode & S_IFMT)) + if (inode_wrong_type(inode, st->st_mode)) goto out; /* diff --git a/fs/Kconfig b/fs/Kconfig index a55bda4233bb..141a856c50e7 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -125,6 +125,7 @@ source "fs/overlayfs/Kconfig" menu "Caches" +source "fs/netfs/Kconfig" source "fs/fscache/Kconfig" source "fs/cachefiles/Kconfig" @@ -222,10 +223,13 @@ config TMPFS_INODE64 If unsure, say N. +config ARCH_SUPPORTS_HUGETLBFS + def_bool n + config HUGETLBFS bool "HugeTLB file system support" depends on X86 || IA64 || SPARC64 || (S390 && 64BIT) || \ - SYS_SUPPORTS_HUGETLBFS || BROKEN + ARCH_SUPPORTS_HUGETLBFS || BROKEN help hugetlbfs is a filesystem backing for HugeTLB pages, based on ramfs. 
For architectures that support it, say Y here and read @@ -334,8 +338,8 @@ config NFS_COMMON default y config NFS_V4_2_SSC_HELPER - tristate - default y if NFS_V4=y || NFS_FS=y + bool + default y if NFS_V4_2 source "net/sunrpc/Kconfig" source "fs/ceph/Kconfig" diff --git a/fs/Kconfig.binfmt b/fs/Kconfig.binfmt index c6f1c8c1934e..06fb7a93a1bd 100644 --- a/fs/Kconfig.binfmt +++ b/fs/Kconfig.binfmt @@ -112,6 +112,9 @@ config BINFMT_FLAT_ARGVP_ENVP_ON_STACK config BINFMT_FLAT_OLD_ALWAYS_RAM bool +config BINFMT_FLAT_NO_DATA_START_OFFSET + bool + config BINFMT_FLAT_OLD bool "Enable support for very old legacy flat binaries" depends on BINFMT_FLAT diff --git a/fs/Makefile b/fs/Makefile index 3215fe205256..9c708e1fbe8f 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -67,6 +67,7 @@ obj-y += devpts/ obj-$(CONFIG_DLM) += dlm/ # Do not add any filesystems before this line +obj-$(CONFIG_NETFS_SUPPORT) += netfs/ obj-$(CONFIG_FSCACHE) += fscache/ obj-$(CONFIG_REISERFS_FS) += reiserfs/ obj-$(CONFIG_EXT4_FS) += ext4/ diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig index 1ad211d72b3b..fc8ba9142f2f 100644 --- a/fs/afs/Kconfig +++ b/fs/afs/Kconfig @@ -4,6 +4,7 @@ config AFS_FS depends on INET select AF_RXRPC select DNS_RESOLVER + select NETFS_SUPPORT help If you say Y here, you will get an experimental Andrew File System driver. It currently only supports unsecured read-only AFS access. 
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c index a4e9e6e07e93..d3c6bb22c5f4 100644 --- a/fs/afs/cmservice.c +++ b/fs/afs/cmservice.c @@ -322,6 +322,8 @@ static int afs_deliver_cb_callback(struct afs_call *call) return ret; call->unmarshall++; + fallthrough; + case 5: break; } @@ -418,6 +420,7 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call) r->node[loop] = ntohl(b[loop + 5]); call->unmarshall++; + fallthrough; case 2: break; @@ -530,6 +533,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call) r->node[loop] = ntohl(b[loop + 5]); call->unmarshall++; + fallthrough; case 2: break; @@ -663,6 +667,7 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call) afs_extract_to_tmp(call); call->unmarshall++; + fallthrough; case 3: break; diff --git a/fs/afs/dir.c b/fs/afs/dir.c index 17548c1faf02..78719f2f567e 100644 --- a/fs/afs/dir.c +++ b/fs/afs/dir.c @@ -103,6 +103,35 @@ struct afs_lookup_cookie { }; /* + * Drop the refs that we're holding on the pages we were reading into. We've + * got refs on the first nr_pages pages. 
+ */ +static void afs_dir_read_cleanup(struct afs_read *req) +{ + struct address_space *mapping = req->vnode->vfs_inode.i_mapping; + struct page *page; + pgoff_t last = req->nr_pages - 1; + + XA_STATE(xas, &mapping->i_pages, 0); + + if (unlikely(!req->nr_pages)) + return; + + rcu_read_lock(); + xas_for_each(&xas, page, last) { + if (xas_retry(&xas, page)) + continue; + BUG_ON(xa_is_value(page)); + BUG_ON(PageCompound(page)); + ASSERTCMP(page->mapping, ==, mapping); + + put_page(page); + } + + rcu_read_unlock(); +} + +/* * check that a directory page is valid */ static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page, @@ -127,7 +156,7 @@ static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page, qty /= sizeof(union afs_xdr_dir_block); /* check them */ - dbuf = kmap(page); + dbuf = kmap_atomic(page); for (tmp = 0; tmp < qty; tmp++) { if (dbuf->blocks[tmp].hdr.magic != AFS_DIR_MAGIC) { printk("kAFS: %s(%lx): bad magic %d/%d is %04hx\n", @@ -146,7 +175,7 @@ static bool afs_dir_check_page(struct afs_vnode *dvnode, struct page *page, ((u8 *)&dbuf->blocks[tmp])[AFS_DIR_BLOCK_SIZE - 1] = 0; } - kunmap(page); + kunmap_atomic(dbuf); checked: afs_stat_v(dvnode, n_read_dir); @@ -157,35 +186,74 @@ error: } /* - * Check the contents of a directory that we've just read. + * Dump the contents of a directory. 
*/ -static bool afs_dir_check_pages(struct afs_vnode *dvnode, struct afs_read *req) +static void afs_dir_dump(struct afs_vnode *dvnode, struct afs_read *req) { struct afs_xdr_dir_page *dbuf; - unsigned int i, j, qty = PAGE_SIZE / sizeof(union afs_xdr_dir_block); + struct address_space *mapping = dvnode->vfs_inode.i_mapping; + struct page *page; + unsigned int i, qty = PAGE_SIZE / sizeof(union afs_xdr_dir_block); + pgoff_t last = req->nr_pages - 1; - for (i = 0; i < req->nr_pages; i++) - if (!afs_dir_check_page(dvnode, req->pages[i], req->actual_len)) - goto bad; - return true; + XA_STATE(xas, &mapping->i_pages, 0); -bad: - pr_warn("DIR %llx:%llx f=%llx l=%llx al=%llx r=%llx\n", + pr_warn("DIR %llx:%llx f=%llx l=%llx al=%llx\n", dvnode->fid.vid, dvnode->fid.vnode, - req->file_size, req->len, req->actual_len, req->remain); - pr_warn("DIR %llx %x %x %x\n", - req->pos, req->index, req->nr_pages, req->offset); + req->file_size, req->len, req->actual_len); + pr_warn("DIR %llx %x %zx %zx\n", + req->pos, req->nr_pages, + req->iter->iov_offset, iov_iter_count(req->iter)); - for (i = 0; i < req->nr_pages; i++) { - dbuf = kmap(req->pages[i]); - for (j = 0; j < qty; j++) { - union afs_xdr_dir_block *block = &dbuf->blocks[j]; + xas_for_each(&xas, page, last) { + if (xas_retry(&xas, page)) + continue; + + BUG_ON(PageCompound(page)); + BUG_ON(page->mapping != mapping); + + dbuf = kmap_atomic(page); + for (i = 0; i < qty; i++) { + union afs_xdr_dir_block *block = &dbuf->blocks[i]; - pr_warn("[%02x] %32phN\n", i * qty + j, block); + pr_warn("[%02lx] %32phN\n", page->index * qty + i, block); } - kunmap(req->pages[i]); + kunmap_atomic(dbuf); } - return false; +} + +/* + * Check all the pages in a directory. All the pages are held pinned. 
+ */ +static int afs_dir_check(struct afs_vnode *dvnode, struct afs_read *req) +{ + struct address_space *mapping = dvnode->vfs_inode.i_mapping; + struct page *page; + pgoff_t last = req->nr_pages - 1; + int ret = 0; + + XA_STATE(xas, &mapping->i_pages, 0); + + if (unlikely(!req->nr_pages)) + return 0; + + rcu_read_lock(); + xas_for_each(&xas, page, last) { + if (xas_retry(&xas, page)) + continue; + + BUG_ON(PageCompound(page)); + BUG_ON(page->mapping != mapping); + + if (!afs_dir_check_page(dvnode, page, req->file_size)) { + afs_dir_dump(dvnode, req); + ret = -EIO; + break; + } + } + + rcu_read_unlock(); + return ret; } /* @@ -214,57 +282,57 @@ static struct afs_read *afs_read_dir(struct afs_vnode *dvnode, struct key *key) { struct afs_read *req; loff_t i_size; - int nr_pages, nr_inline, i, n; - int ret = -ENOMEM; + int nr_pages, i, n; + int ret; + + _enter(""); -retry: + req = kzalloc(sizeof(*req), GFP_KERNEL); + if (!req) + return ERR_PTR(-ENOMEM); + + refcount_set(&req->usage, 1); + req->vnode = dvnode; + req->key = key_get(key); + req->cleanup = afs_dir_read_cleanup; + +expand: i_size = i_size_read(&dvnode->vfs_inode); - if (i_size < 2048) - return ERR_PTR(afs_bad(dvnode, afs_file_error_dir_small)); + if (i_size < 2048) { + ret = afs_bad(dvnode, afs_file_error_dir_small); + goto error; + } if (i_size > 2048 * 1024) { trace_afs_file_error(dvnode, -EFBIG, afs_file_error_dir_big); - return ERR_PTR(-EFBIG); + ret = -EFBIG; + goto error; } _enter("%llu", i_size); - /* Get a request record to hold the page list. We want to hold it - * inline if we can, but we don't want to make an order 1 allocation. 
- */ nr_pages = (i_size + PAGE_SIZE - 1) / PAGE_SIZE; - nr_inline = nr_pages; - if (nr_inline > (PAGE_SIZE - sizeof(*req)) / sizeof(struct page *)) - nr_inline = 0; - req = kzalloc(struct_size(req, array, nr_inline), GFP_KERNEL); - if (!req) - return ERR_PTR(-ENOMEM); - - refcount_set(&req->usage, 1); - req->nr_pages = nr_pages; req->actual_len = i_size; /* May change */ req->len = nr_pages * PAGE_SIZE; /* We can ask for more than there is */ req->data_version = dvnode->status.data_version; /* May change */ - if (nr_inline > 0) { - req->pages = req->array; - } else { - req->pages = kcalloc(nr_pages, sizeof(struct page *), - GFP_KERNEL); - if (!req->pages) - goto error; - } + iov_iter_xarray(&req->def_iter, READ, &dvnode->vfs_inode.i_mapping->i_pages, + 0, i_size); + req->iter = &req->def_iter; - /* Get a list of all the pages that hold or will hold the directory - * content. We need to fill in any gaps that we might find where the - * memory reclaimer has been at work. If there are any gaps, we will + /* Fill in any gaps that we might find where the memory reclaimer has + * been at work and pin all the pages. If there are any gaps, we will * need to reread the entire directory contents. 
*/ - i = 0; - do { + i = req->nr_pages; + while (i < nr_pages) { + struct page *pages[8], *page; + n = find_get_pages_contig(dvnode->vfs_inode.i_mapping, i, - req->nr_pages - i, - req->pages + i); - _debug("find %u at %u/%u", n, i, req->nr_pages); + min_t(unsigned int, nr_pages - i, + ARRAY_SIZE(pages)), + pages); + _debug("find %u at %u/%u", n, i, nr_pages); + if (n == 0) { gfp_t gfp = dvnode->vfs_inode.i_mapping->gfp_mask; @@ -272,22 +340,24 @@ retry: afs_stat_v(dvnode, n_inval); ret = -ENOMEM; - req->pages[i] = __page_cache_alloc(gfp); - if (!req->pages[i]) + page = __page_cache_alloc(gfp); + if (!page) goto error; - ret = add_to_page_cache_lru(req->pages[i], + ret = add_to_page_cache_lru(page, dvnode->vfs_inode.i_mapping, i, gfp); if (ret < 0) goto error; - attach_page_private(req->pages[i], (void *)1); - unlock_page(req->pages[i]); + attach_page_private(page, (void *)1); + unlock_page(page); + req->nr_pages++; i++; } else { + req->nr_pages += n; i += n; } - } while (i < req->nr_pages); + } /* If we're going to reload, we need to lock all the pages to prevent * races. @@ -305,18 +375,23 @@ retry: if (!test_bit(AFS_VNODE_DIR_VALID, &dvnode->flags)) { trace_afs_reload_dir(dvnode); - ret = afs_fetch_data(dvnode, key, req); + ret = afs_fetch_data(dvnode, req); if (ret < 0) goto error_unlock; task_io_account_read(PAGE_SIZE * req->nr_pages); - if (req->len < req->file_size) - goto content_has_grown; + if (req->len < req->file_size) { + /* The content has grown, so we need to expand the + * buffer. + */ + up_write(&dvnode->validate_lock); + goto expand; + } /* Validate the data we just read. 
*/ - ret = -EIO; - if (!afs_dir_check_pages(dvnode, req)) + ret = afs_dir_check(dvnode, req); + if (ret < 0) goto error_unlock; // TODO: Trim excess pages @@ -334,11 +409,6 @@ error: afs_put_read(req); _leave(" = %d", ret); return ERR_PTR(ret); - -content_has_grown: - up_write(&dvnode->validate_lock); - afs_put_read(req); - goto retry; } /* @@ -448,6 +518,7 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, struct afs_read *req; struct page *page; unsigned blkoff, limit; + void __rcu **slot; int ret; _enter("{%lu},%u,,", dir->i_ino, (unsigned)ctx->pos); @@ -472,9 +543,15 @@ static int afs_dir_iterate(struct inode *dir, struct dir_context *ctx, blkoff = ctx->pos & ~(sizeof(union afs_xdr_dir_block) - 1); /* Fetch the appropriate page from the directory and re-add it - * to the LRU. + * to the LRU. We have all the pages pinned with an extra ref. */ - page = req->pages[blkoff / PAGE_SIZE]; + rcu_read_lock(); + page = NULL; + slot = radix_tree_lookup_slot(&dvnode->vfs_inode.i_mapping->i_pages, + blkoff / PAGE_SIZE); + if (slot) + page = radix_tree_deref_slot(slot); + rcu_read_unlock(); if (!page) { ret = afs_bad(dvnode, afs_file_error_dir_missing_page); break; @@ -1342,6 +1419,7 @@ static int afs_mkdir(struct user_namespace *mnt_userns, struct inode *dir, afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->file[0].update_ctime = true; op->dentry = dentry; op->create.mode = S_IFDIR | mode; @@ -1423,6 +1501,7 @@ static int afs_rmdir(struct inode *dir, struct dentry *dentry) afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->file[0].update_ctime = true; op->dentry = dentry; @@ -1559,6 +1638,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->file[0].update_ctime = true; /* Try to make sure we have a callback promise on the victim. 
*/ @@ -1641,6 +1721,7 @@ static int afs_create(struct user_namespace *mnt_userns, struct inode *dir, afs_op_set_vnode(op, 0, dvnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->file[0].update_ctime = true; op->dentry = dentry; @@ -1715,6 +1796,7 @@ static int afs_link(struct dentry *from, struct inode *dir, afs_op_set_vnode(op, 0, dvnode); afs_op_set_vnode(op, 1, vnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->file[0].update_ctime = true; op->file[1].update_ctime = true; @@ -1837,7 +1919,9 @@ static void afs_rename_edit_dir(struct afs_operation *op) new_inode = d_inode(new_dentry); if (new_inode) { spin_lock(&new_inode->i_lock); - if (new_inode->i_nlink > 0) + if (S_ISDIR(new_inode->i_mode)) + clear_nlink(new_inode); + else if (new_inode->i_nlink > 0) drop_nlink(new_inode); spin_unlock(&new_inode->i_lock); } @@ -1910,6 +1994,8 @@ static int afs_rename(struct user_namespace *mnt_userns, struct inode *old_dir, afs_op_set_vnode(op, 1, new_dvnode); /* May be same as orig_dvnode */ op->file[0].dv_delta = 1; op->file[1].dv_delta = 1; + op->file[0].modification = true; + op->file[1].modification = true; op->file[0].update_ctime = true; op->file[1].update_ctime = true; @@ -2006,6 +2092,6 @@ static void afs_dir_invalidatepage(struct page *page, unsigned int offset, afs_stat_v(dvnode, n_inval); /* we clean up only if the entire page is being invalidated */ - if (offset == 0 && length == PAGE_SIZE) + if (offset == 0 && length == thp_size(page)) detach_page_private(page); } diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c index 04f75a44f243..dae9a57d7ec0 100644 --- a/fs/afs/dir_silly.c +++ b/fs/afs/dir_silly.c @@ -73,6 +73,8 @@ static int afs_do_silly_rename(struct afs_vnode *dvnode, struct afs_vnode *vnode afs_op_set_vnode(op, 1, dvnode); op->file[0].dv_delta = 1; op->file[1].dv_delta = 1; + op->file[0].modification = true; + op->file[1].modification = true; op->file[0].update_ctime = true; op->file[1].update_ctime = true; @@ 
-201,6 +203,7 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode afs_op_set_vnode(op, 0, dvnode); afs_op_set_vnode(op, 1, vnode); op->file[0].dv_delta = 1; + op->file[0].modification = true; op->file[0].update_ctime = true; op->file[1].op_unlinked = true; op->file[1].update_ctime = true; diff --git a/fs/afs/file.c b/fs/afs/file.c index 960b64268623..db035ae2a134 100644 --- a/fs/afs/file.c +++ b/fs/afs/file.c @@ -14,6 +14,7 @@ #include <linux/gfp.h> #include <linux/task_io_accounting_ops.h> #include <linux/mm.h> +#include <linux/netfs.h> #include "internal.h" static int afs_file_mmap(struct file *file, struct vm_area_struct *vma); @@ -22,8 +23,7 @@ static void afs_invalidatepage(struct page *page, unsigned int offset, unsigned int length); static int afs_releasepage(struct page *page, gfp_t gfp_flags); -static int afs_readpages(struct file *filp, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages); +static void afs_readahead(struct readahead_control *ractl); const struct file_operations afs_file_operations = { .open = afs_open, @@ -47,7 +47,7 @@ const struct inode_operations afs_file_inode_operations = { const struct address_space_operations afs_fs_aops = { .readpage = afs_readpage, - .readpages = afs_readpages, + .readahead = afs_readahead, .set_page_dirty = afs_set_page_dirty, .launder_page = afs_launder_page, .releasepage = afs_releasepage, @@ -184,41 +184,50 @@ int afs_release(struct inode *inode, struct file *file) } /* + * Allocate a new read record. + */ +struct afs_read *afs_alloc_read(gfp_t gfp) +{ + struct afs_read *req; + + req = kzalloc(sizeof(struct afs_read), gfp); + if (req) + refcount_set(&req->usage, 1); + + return req; +} + +/* * Dispose of a ref to a read record. 
*/ void afs_put_read(struct afs_read *req) { - int i; - if (refcount_dec_and_test(&req->usage)) { - if (req->pages) { - for (i = 0; i < req->nr_pages; i++) - if (req->pages[i]) - put_page(req->pages[i]); - if (req->pages != req->array) - kfree(req->pages); - } + if (req->cleanup) + req->cleanup(req); + key_put(req->key); kfree(req); } } -#ifdef CONFIG_AFS_FSCACHE -/* - * deal with notification that a page was read from the cache - */ -static void afs_file_readpage_read_complete(struct page *page, - void *data, - int error) +static void afs_fetch_data_notify(struct afs_operation *op) { - _enter("%p,%p,%d", page, data, error); - - /* if the read completes with an error, we just unlock the page and let - * the VM reissue the readpage */ - if (!error) - SetPageUptodate(page); - unlock_page(page); + struct afs_read *req = op->fetch.req; + struct netfs_read_subrequest *subreq = req->subreq; + int error = op->error; + + if (error == -ECONNABORTED) + error = afs_abort_to_error(op->ac.abort_code); + req->error = error; + + if (subreq) { + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + netfs_subreq_terminated(subreq, error ?: req->actual_len, false); + req->subreq = NULL; + } else if (req->done) { + req->done(req); + } } -#endif static void afs_fetch_data_success(struct afs_operation *op) { @@ -228,10 +237,12 @@ static void afs_fetch_data_success(struct afs_operation *op) afs_vnode_commit_status(op, &op->file[0]); afs_stat_v(vnode, n_fetches); atomic_long_add(op->fetch.req->actual_len, &op->net->n_fetch_bytes); + afs_fetch_data_notify(op); } static void afs_fetch_data_put(struct afs_operation *op) { + op->fetch.req->error = op->error; afs_put_read(op->fetch.req); } @@ -240,13 +251,14 @@ static const struct afs_operation_ops afs_fetch_data_operation = { .issue_yfs_rpc = yfs_fs_fetch_data, .success = afs_fetch_data_success, .aborted = afs_check_for_remote_deletion, + .failed = afs_fetch_data_notify, .put = afs_fetch_data_put, }; /* * Fetch file data from the volume. 
*/ -int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *req) +int afs_fetch_data(struct afs_vnode *vnode, struct afs_read *req) { struct afs_operation *op; @@ -255,11 +267,14 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *re vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, - key_serial(key)); + key_serial(req->key)); - op = afs_alloc_operation(key, vnode->volume); - if (IS_ERR(op)) + op = afs_alloc_operation(req->key, vnode->volume); + if (IS_ERR(op)) { + if (req->subreq) + netfs_subreq_terminated(req->subreq, PTR_ERR(op), false); return PTR_ERR(op); + } afs_op_set_vnode(op, 0, vnode); @@ -268,336 +283,103 @@ int afs_fetch_data(struct afs_vnode *vnode, struct key *key, struct afs_read *re return afs_do_sync_operation(op); } -/* - * read page from file, directory or symlink, given a key to use - */ -int afs_page_filler(void *data, struct page *page) +static void afs_req_issue_op(struct netfs_read_subrequest *subreq) { - struct inode *inode = page->mapping->host; - struct afs_vnode *vnode = AFS_FS_I(inode); - struct afs_read *req; - struct key *key = data; - int ret; - - _enter("{%x},{%lu},{%lu}", key_serial(key), inode->i_ino, page->index); + struct afs_vnode *vnode = AFS_FS_I(subreq->rreq->inode); + struct afs_read *fsreq; - BUG_ON(!PageLocked(page)); + fsreq = afs_alloc_read(GFP_NOFS); + if (!fsreq) + return netfs_subreq_terminated(subreq, -ENOMEM, false); - ret = -ESTALE; - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) - goto error; + fsreq->subreq = subreq; + fsreq->pos = subreq->start + subreq->transferred; + fsreq->len = subreq->len - subreq->transferred; + fsreq->key = subreq->rreq->netfs_priv; + fsreq->vnode = vnode; + fsreq->iter = &fsreq->def_iter; - /* is it cached? 
*/ -#ifdef CONFIG_AFS_FSCACHE - ret = fscache_read_or_alloc_page(vnode->cache, - page, - afs_file_readpage_read_complete, - NULL, - GFP_KERNEL); -#else - ret = -ENOBUFS; -#endif - switch (ret) { - /* read BIO submitted (page in cache) */ - case 0: - break; - - /* page not yet cached */ - case -ENODATA: - _debug("cache said ENODATA"); - goto go_on; - - /* page will not be cached */ - case -ENOBUFS: - _debug("cache said ENOBUFS"); - - fallthrough; - default: - go_on: - req = kzalloc(struct_size(req, array, 1), GFP_KERNEL); - if (!req) - goto enomem; - - /* We request a full page. If the page is a partial one at the - * end of the file, the server will return a short read and the - * unmarshalling code will clear the unfilled space. - */ - refcount_set(&req->usage, 1); - req->pos = (loff_t)page->index << PAGE_SHIFT; - req->len = PAGE_SIZE; - req->nr_pages = 1; - req->pages = req->array; - req->pages[0] = page; - get_page(page); - - /* read the contents of the file from the server into the - * page */ - ret = afs_fetch_data(vnode, key, req); - afs_put_read(req); - - if (ret < 0) { - if (ret == -ENOENT) { - _debug("got NOENT from server" - " - marking file deleted and stale"); - set_bit(AFS_VNODE_DELETED, &vnode->flags); - ret = -ESTALE; - } + iov_iter_xarray(&fsreq->def_iter, READ, + &fsreq->vnode->vfs_inode.i_mapping->i_pages, + fsreq->pos, fsreq->len); -#ifdef CONFIG_AFS_FSCACHE - fscache_uncache_page(vnode->cache, page); -#endif - BUG_ON(PageFsCache(page)); - - if (ret == -EINTR || - ret == -ENOMEM || - ret == -ERESTARTSYS || - ret == -EAGAIN) - goto error; - goto io_error; - } + afs_fetch_data(fsreq->vnode, fsreq); +} - SetPageUptodate(page); +static int afs_symlink_readpage(struct page *page) +{ + struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); + struct afs_read *fsreq; + int ret; - /* send the page to the cache */ -#ifdef CONFIG_AFS_FSCACHE - if (PageFsCache(page) && - fscache_write_page(vnode->cache, page, vnode->status.size, - GFP_KERNEL) != 0) { - 
fscache_uncache_page(vnode->cache, page); - BUG_ON(PageFsCache(page)); - } -#endif - unlock_page(page); - } + fsreq = afs_alloc_read(GFP_NOFS); + if (!fsreq) + return -ENOMEM; - _leave(" = 0"); - return 0; + fsreq->pos = page->index * PAGE_SIZE; + fsreq->len = PAGE_SIZE; + fsreq->vnode = vnode; + fsreq->iter = &fsreq->def_iter; + iov_iter_xarray(&fsreq->def_iter, READ, &page->mapping->i_pages, + fsreq->pos, fsreq->len); -io_error: - SetPageError(page); - goto error; -enomem: - ret = -ENOMEM; -error: - unlock_page(page); - _leave(" = %d", ret); + ret = afs_fetch_data(fsreq->vnode, fsreq); + page_endio(page, false, ret); return ret; } -/* - * read page from file, directory or symlink, given a file to nominate the key - * to be used - */ -static int afs_readpage(struct file *file, struct page *page) +static void afs_init_rreq(struct netfs_read_request *rreq, struct file *file) { - struct key *key; - int ret; - - if (file) { - key = afs_file_key(file); - ASSERT(key != NULL); - ret = afs_page_filler(key, page); - } else { - struct inode *inode = page->mapping->host; - key = afs_request_key(AFS_FS_S(inode->i_sb)->cell); - if (IS_ERR(key)) { - ret = PTR_ERR(key); - } else { - ret = afs_page_filler(key, page); - key_put(key); - } - } - return ret; + rreq->netfs_priv = key_get(afs_file_key(file)); } -/* - * Make pages available as they're filled. 
- */ -static void afs_readpages_page_done(struct afs_read *req) +static bool afs_is_cache_enabled(struct inode *inode) { -#ifdef CONFIG_AFS_FSCACHE - struct afs_vnode *vnode = req->vnode; -#endif - struct page *page = req->pages[req->index]; + struct fscache_cookie *cookie = afs_vnode_cache(AFS_FS_I(inode)); - req->pages[req->index] = NULL; - SetPageUptodate(page); - - /* send the page to the cache */ -#ifdef CONFIG_AFS_FSCACHE - if (PageFsCache(page) && - fscache_write_page(vnode->cache, page, vnode->status.size, - GFP_KERNEL) != 0) { - fscache_uncache_page(vnode->cache, page); - BUG_ON(PageFsCache(page)); - } -#endif - unlock_page(page); - put_page(page); + return fscache_cookie_enabled(cookie) && !hlist_empty(&cookie->backing_objects); } -/* - * Read a contiguous set of pages. - */ -static int afs_readpages_one(struct file *file, struct address_space *mapping, - struct list_head *pages) +static int afs_begin_cache_operation(struct netfs_read_request *rreq) { - struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct afs_read *req; - struct list_head *p; - struct page *first, *page; - struct key *key = afs_file_key(file); - pgoff_t index; - int ret, n, i; - - /* Count the number of contiguous pages at the front of the list. Note - * that the list goes prev-wards rather than next-wards. - */ - first = lru_to_page(pages); - index = first->index + 1; - n = 1; - for (p = first->lru.prev; p != pages; p = p->prev) { - page = list_entry(p, struct page, lru); - if (page->index != index) - break; - index++; - n++; - } - - req = kzalloc(struct_size(req, array, n), GFP_NOFS); - if (!req) - return -ENOMEM; - - refcount_set(&req->usage, 1); - req->vnode = vnode; - req->page_done = afs_readpages_page_done; - req->pos = first->index; - req->pos <<= PAGE_SHIFT; - req->pages = req->array; - - /* Transfer the pages to the request. We add them in until one fails - * to add to the LRU and then we stop (as that'll make a hole in the - * contiguous run. 
- * - * Note that it's possible for the file size to change whilst we're - * doing this, but we rely on the server returning less than we asked - * for if the file shrank. We also rely on this to deal with a partial - * page at the end of the file. - */ - do { - page = lru_to_page(pages); - list_del(&page->lru); - index = page->index; - if (add_to_page_cache_lru(page, mapping, index, - readahead_gfp_mask(mapping))) { -#ifdef CONFIG_AFS_FSCACHE - fscache_uncache_page(vnode->cache, page); -#endif - put_page(page); - break; - } - - req->pages[req->nr_pages++] = page; - req->len += PAGE_SIZE; - } while (req->nr_pages < n); + struct afs_vnode *vnode = AFS_FS_I(rreq->inode); - if (req->nr_pages == 0) { - kfree(req); - return 0; - } - - ret = afs_fetch_data(vnode, key, req); - if (ret < 0) - goto error; - - task_io_account_read(PAGE_SIZE * req->nr_pages); - afs_put_read(req); - return 0; - -error: - if (ret == -ENOENT) { - _debug("got NOENT from server" - " - marking file deleted and stale"); - set_bit(AFS_VNODE_DELETED, &vnode->flags); - ret = -ESTALE; - } - - for (i = 0; i < req->nr_pages; i++) { - page = req->pages[i]; - if (page) { -#ifdef CONFIG_AFS_FSCACHE - fscache_uncache_page(vnode->cache, page); -#endif - SetPageError(page); - unlock_page(page); - } - } - - afs_put_read(req); - return ret; + return fscache_begin_read_operation(rreq, afs_vnode_cache(vnode)); } -/* - * read a set of pages - */ -static int afs_readpages(struct file *file, struct address_space *mapping, - struct list_head *pages, unsigned nr_pages) +static int afs_check_write_begin(struct file *file, loff_t pos, unsigned len, + struct page *page, void **_fsdata) { - struct key *key = afs_file_key(file); - struct afs_vnode *vnode; - int ret = 0; - - _enter("{%d},{%lu},,%d", - key_serial(key), mapping->host->i_ino, nr_pages); + struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); - ASSERT(key != NULL); + return test_bit(AFS_VNODE_DELETED, &vnode->flags) ? 
-ESTALE : 0; +} - vnode = AFS_FS_I(mapping->host); - if (test_bit(AFS_VNODE_DELETED, &vnode->flags)) { - _leave(" = -ESTALE"); - return -ESTALE; - } +static void afs_priv_cleanup(struct address_space *mapping, void *netfs_priv) +{ + key_put(netfs_priv); +} - /* attempt to read as many of the pages as possible */ -#ifdef CONFIG_AFS_FSCACHE - ret = fscache_read_or_alloc_pages(vnode->cache, - mapping, - pages, - &nr_pages, - afs_file_readpage_read_complete, - NULL, - mapping_gfp_mask(mapping)); -#else - ret = -ENOBUFS; -#endif +const struct netfs_read_request_ops afs_req_ops = { + .init_rreq = afs_init_rreq, + .is_cache_enabled = afs_is_cache_enabled, + .begin_cache_operation = afs_begin_cache_operation, + .check_write_begin = afs_check_write_begin, + .issue_op = afs_req_issue_op, + .cleanup = afs_priv_cleanup, +}; - switch (ret) { - /* all pages are being read from the cache */ - case 0: - BUG_ON(!list_empty(pages)); - BUG_ON(nr_pages != 0); - _leave(" = 0 [reading all]"); - return 0; - - /* there were pages that couldn't be read from the cache */ - case -ENODATA: - case -ENOBUFS: - break; - - /* other error */ - default: - _leave(" = %d", ret); - return ret; - } +static int afs_readpage(struct file *file, struct page *page) +{ + if (!file) + return afs_symlink_readpage(page); - while (!list_empty(pages)) { - ret = afs_readpages_one(file, mapping, pages); - if (ret < 0) - break; - } + return netfs_readpage(file, page, &afs_req_ops, NULL); +} - _leave(" = %d [netting]", ret); - return ret; +static void afs_readahead(struct readahead_control *ractl) +{ + netfs_readahead(ractl, &afs_req_ops, NULL); } /* @@ -625,8 +407,8 @@ static void afs_invalidate_dirty(struct page *page, unsigned int offset, return; /* We may need to shorten the dirty region */ - f = afs_page_dirty_from(priv); - t = afs_page_dirty_to(priv); + f = afs_page_dirty_from(page, priv); + t = afs_page_dirty_to(page, priv); if (t <= offset || f >= end) return; /* Doesn't overlap */ @@ -644,17 +426,17 @@ 
static void afs_invalidate_dirty(struct page *page, unsigned int offset, if (f == t) goto undirty; - priv = afs_page_dirty(f, t); + priv = afs_page_dirty(page, f, t); set_page_private(page, priv); - trace_afs_page_dirty(vnode, tracepoint_string("trunc"), page->index, priv); + trace_afs_page_dirty(vnode, tracepoint_string("trunc"), page); return; undirty: - trace_afs_page_dirty(vnode, tracepoint_string("undirty"), page->index, priv); + trace_afs_page_dirty(vnode, tracepoint_string("undirty"), page); clear_page_dirty_for_io(page); full_invalidate: - priv = (unsigned long)detach_page_private(page); - trace_afs_page_dirty(vnode, tracepoint_string("inval"), page->index, priv); + trace_afs_page_dirty(vnode, tracepoint_string("inval"), page); + detach_page_private(page); } /* @@ -669,20 +451,10 @@ static void afs_invalidatepage(struct page *page, unsigned int offset, BUG_ON(!PageLocked(page)); -#ifdef CONFIG_AFS_FSCACHE - /* we clean up only if the entire page is being invalidated */ - if (offset == 0 && length == PAGE_SIZE) { - if (PageFsCache(page)) { - struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); - fscache_wait_on_page_write(vnode->cache, page); - fscache_uncache_page(vnode->cache, page); - } - } -#endif - if (PagePrivate(page)) afs_invalidate_dirty(page, offset, length); + wait_on_page_fscache(page); _leave(""); } @@ -693,7 +465,6 @@ static void afs_invalidatepage(struct page *page, unsigned int offset, static int afs_releasepage(struct page *page, gfp_t gfp_flags) { struct afs_vnode *vnode = AFS_FS_I(page->mapping->host); - unsigned long priv; _enter("{{%llx:%llu}[%lu],%lx},%x", vnode->fid.vid, vnode->fid.vnode, page->index, page->flags, @@ -702,16 +473,16 @@ static int afs_releasepage(struct page *page, gfp_t gfp_flags) /* deny if page is being written to the cache and the caller hasn't * elected to wait */ #ifdef CONFIG_AFS_FSCACHE - if (!fscache_maybe_release_page(vnode->cache, page, gfp_flags)) { - _leave(" = F [cache busy]"); - return 0; + if 
(PageFsCache(page)) { + if (!(gfp_flags & __GFP_DIRECT_RECLAIM) || !(gfp_flags & __GFP_FS)) + return false; + wait_on_page_fscache(page); } #endif if (PagePrivate(page)) { - priv = (unsigned long)detach_page_private(page); - trace_afs_page_dirty(vnode, tracepoint_string("rel"), - page->index, priv); + trace_afs_page_dirty(vnode, tracepoint_string("rel"), page); + detach_page_private(page); } /* indicate that the page can be released */ diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c index 71c58723763d..d222dfbe976b 100644 --- a/fs/afs/fs_operation.c +++ b/fs/afs/fs_operation.c @@ -118,6 +118,8 @@ static void afs_prepare_vnode(struct afs_operation *op, struct afs_vnode_param * vp->cb_break_before = afs_calc_vnode_cb_break(vnode); if (vnode->lock_state != AFS_VNODE_LOCK_NONE) op->flags |= AFS_OPERATION_CUR_ONLY; + if (vp->modification) + set_bit(AFS_VNODE_MODIFYING, &vnode->flags); } if (vp->fid.vnode) @@ -198,8 +200,10 @@ void afs_wait_for_operation(struct afs_operation *op) case -ECONNABORTED: if (op->ops->aborted) op->ops->aborted(op); - break; + fallthrough; default: + if (op->ops->failed) + op->ops->failed(op); break; } @@ -223,6 +227,10 @@ int afs_put_operation(struct afs_operation *op) if (op->ops && op->ops->put) op->ops->put(op); + if (op->file[0].modification) + clear_bit(AFS_VNODE_MODIFYING, &op->file[0].vnode->flags); + if (op->file[1].modification && op->file[1].vnode != op->file[0].vnode) + clear_bit(AFS_VNODE_MODIFYING, &op->file[1].vnode->flags); if (op->file[0].put_vnode) iput(&op->file[0].vnode->vfs_inode); if (op->file[1].put_vnode) diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 1d95ed9dd86e..dd3f45d906d2 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -10,6 +10,7 @@ #include <linux/sched.h> #include <linux/circ_buf.h> #include <linux/iversion.h> +#include <linux/netfs.h> #include "internal.h" #include "afs_fs.h" #include "xdr_fs.h" @@ -302,17 +303,15 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) 
struct afs_vnode_param *vp = &op->file[0]; struct afs_read *req = op->fetch.req; const __be32 *bp; - unsigned int size; int ret; - _enter("{%u,%zu/%llu}", - call->unmarshall, iov_iter_count(call->iter), req->actual_len); + _enter("{%u,%zu,%zu/%llu}", + call->unmarshall, call->iov_len, iov_iter_count(call->iter), + req->actual_len); switch (call->unmarshall) { case 0: req->actual_len = 0; - req->index = 0; - req->offset = req->pos & (PAGE_SIZE - 1); call->unmarshall++; if (call->operation_ID == FSFETCHDATA64) { afs_extract_to_tmp64(call); @@ -322,7 +321,10 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) } fallthrough; - /* extract the returned data length */ + /* Extract the returned data length into + * ->actual_len. This may indicate more or less data than was + * requested will be returned. + */ case 1: _debug("extract data length"); ret = afs_extract_data(call, true); @@ -331,44 +333,25 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) req->actual_len = be64_to_cpu(call->tmp64); _debug("DATA length: %llu", req->actual_len); - req->remain = min(req->len, req->actual_len); - if (req->remain == 0) + + if (req->actual_len == 0) goto no_more_data; + call->iter = req->iter; + call->iov_len = min(req->actual_len, req->len); call->unmarshall++; - - begin_page: - ASSERTCMP(req->index, <, req->nr_pages); - if (req->remain > PAGE_SIZE - req->offset) - size = PAGE_SIZE - req->offset; - else - size = req->remain; - call->bvec[0].bv_len = size; - call->bvec[0].bv_offset = req->offset; - call->bvec[0].bv_page = req->pages[req->index]; - iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size); - ASSERTCMP(size, <=, PAGE_SIZE); fallthrough; /* extract the returned data */ case 2: _debug("extract data %zu/%llu", - iov_iter_count(call->iter), req->remain); + iov_iter_count(call->iter), req->actual_len); ret = afs_extract_data(call, true); if (ret < 0) return ret; - req->remain -= call->bvec[0].bv_len; - req->offset += call->bvec[0].bv_len; - 
ASSERTCMP(req->offset, <=, PAGE_SIZE); - if (req->offset == PAGE_SIZE) { - req->offset = 0; - req->index++; - if (req->remain > 0) - goto begin_page; - } - ASSERTCMP(req->remain, ==, 0); + call->iter = &call->def_iter; if (req->actual_len <= req->len) goto no_more_data; @@ -405,22 +388,12 @@ static int afs_deliver_fs_fetch_data(struct afs_call *call) req->file_size = vp->scb.status.size; call->unmarshall++; + fallthrough; case 5: break; } - for (; req->index < req->nr_pages; req->index++) { - if (req->offset < PAGE_SIZE) - zero_user_segment(req->pages[req->index], - req->offset, PAGE_SIZE); - req->offset = 0; - } - - if (req->page_done) - for (req->index = 0; req->index < req->nr_pages; req->index++) - req->page_done(req); - _leave(" = 0 [done]"); return 0; } @@ -494,6 +467,8 @@ void afs_fs_fetch_data(struct afs_operation *op) if (!call) return afs_op_nomem(op); + req->call_debug_id = call->debug_id; + /* marshall the parameters */ bp = call->request; bp[0] = htonl(FSFETCHDATA); @@ -1079,8 +1054,7 @@ static const struct afs_call_type afs_RXFSStoreData64 = { /* * store a set of pages to a very large file */ -static void afs_fs_store_data64(struct afs_operation *op, - loff_t pos, loff_t size, loff_t i_size) +static void afs_fs_store_data64(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; @@ -1095,7 +1069,7 @@ static void afs_fs_store_data64(struct afs_operation *op, if (!call) return afs_op_nomem(op); - call->send_pages = true; + call->write_iter = op->store.write_iter; /* marshall the parameters */ bp = call->request; @@ -1111,47 +1085,38 @@ static void afs_fs_store_data64(struct afs_operation *op, *bp++ = 0; /* unix mode */ *bp++ = 0; /* segment size */ - *bp++ = htonl(upper_32_bits(pos)); - *bp++ = htonl(lower_32_bits(pos)); - *bp++ = htonl(upper_32_bits(size)); - *bp++ = htonl(lower_32_bits(size)); - *bp++ = htonl(upper_32_bits(i_size)); - *bp++ = htonl(lower_32_bits(i_size)); + *bp++ = 
htonl(upper_32_bits(op->store.pos)); + *bp++ = htonl(lower_32_bits(op->store.pos)); + *bp++ = htonl(upper_32_bits(op->store.size)); + *bp++ = htonl(lower_32_bits(op->store.size)); + *bp++ = htonl(upper_32_bits(op->store.i_size)); + *bp++ = htonl(lower_32_bits(op->store.i_size)); trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); } /* - * store a set of pages + * Write data to a file on the server. */ void afs_fs_store_data(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - loff_t size, pos, i_size; __be32 *bp; _enter(",%x,{%llx:%llu},,", key_serial(op->key), vp->fid.vid, vp->fid.vnode); - size = (loff_t)op->store.last_to - (loff_t)op->store.first_offset; - if (op->store.first != op->store.last) - size += (loff_t)(op->store.last - op->store.first) << PAGE_SHIFT; - pos = (loff_t)op->store.first << PAGE_SHIFT; - pos += op->store.first_offset; - - i_size = i_size_read(&vp->vnode->vfs_inode); - if (pos + size > i_size) - i_size = size + pos; - _debug("size %llx, at %llx, i_size %llx", - (unsigned long long) size, (unsigned long long) pos, - (unsigned long long) i_size); + (unsigned long long)op->store.size, + (unsigned long long)op->store.pos, + (unsigned long long)op->store.i_size); - if (upper_32_bits(pos) || upper_32_bits(i_size) || upper_32_bits(size) || - upper_32_bits(pos + size)) - return afs_fs_store_data64(op, pos, size, i_size); + if (upper_32_bits(op->store.pos) || + upper_32_bits(op->store.size) || + upper_32_bits(op->store.i_size)) + return afs_fs_store_data64(op); call = afs_alloc_flat_call(op->net, &afs_RXFSStoreData, (4 + 6 + 3) * 4, @@ -1159,7 +1124,7 @@ void afs_fs_store_data(struct afs_operation *op) if (!call) return afs_op_nomem(op); - call->send_pages = true; + call->write_iter = op->store.write_iter; /* marshall the parameters */ bp = call->request; @@ -1175,9 +1140,9 @@ void afs_fs_store_data(struct afs_operation *op) *bp++ = 0; /* unix mode */ *bp++ = 0; /* segment size */ - 
*bp++ = htonl(lower_32_bits(pos)); - *bp++ = htonl(lower_32_bits(size)); - *bp++ = htonl(lower_32_bits(i_size)); + *bp++ = htonl(lower_32_bits(op->store.pos)); + *bp++ = htonl(lower_32_bits(op->store.size)); + *bp++ = htonl(lower_32_bits(op->store.i_size)); trace_afs_make_fs_call(call, &vp->fid); afs_make_op_call(op, call, GFP_NOFS); @@ -1444,6 +1409,7 @@ static int afs_deliver_fs_get_volume_status(struct afs_call *call) _debug("motd '%s'", p); call->unmarshall++; + fallthrough; case 8: break; @@ -1881,6 +1847,7 @@ static int afs_deliver_fs_inline_bulk_status(struct afs_call *call) xdr_decode_AFSVolSync(&bp, &op->volsync); call->unmarshall++; + fallthrough; case 6: break; @@ -2015,6 +1982,7 @@ static int afs_deliver_fs_fetch_acl(struct afs_call *call) xdr_decode_AFSVolSync(&bp, &op->volsync); call->unmarshall++; + fallthrough; case 4: break; diff --git a/fs/afs/inode.c b/fs/afs/inode.c index 12be88716e4c..80b6c8d967d5 100644 --- a/fs/afs/inode.c +++ b/fs/afs/inode.c @@ -102,13 +102,13 @@ static int afs_inode_init_from_status(struct afs_operation *op, switch (status->type) { case AFS_FTYPE_FILE: - inode->i_mode = S_IFREG | status->mode; + inode->i_mode = S_IFREG | (status->mode & S_IALLUGO); inode->i_op = &afs_file_inode_operations; inode->i_fop = &afs_file_operations; inode->i_mapping->a_ops = &afs_fs_aops; break; case AFS_FTYPE_DIR: - inode->i_mode = S_IFDIR | status->mode; + inode->i_mode = S_IFDIR | (status->mode & S_IALLUGO); inode->i_op = &afs_dir_inode_operations; inode->i_fop = &afs_dir_file_operations; inode->i_mapping->a_ops = &afs_dir_aops; @@ -198,7 +198,7 @@ static void afs_apply_status(struct afs_operation *op, if (status->mode != vnode->status.mode) { mode = inode->i_mode; mode &= ~S_IALLUGO; - mode |= status->mode; + mode |= status->mode & S_IALLUGO; WRITE_ONCE(inode->i_mode, mode); } @@ -214,11 +214,12 @@ static void afs_apply_status(struct afs_operation *op, if (vp->dv_before + vp->dv_delta != status->data_version) { if 
(test_bit(AFS_VNODE_CB_PROMISED, &vnode->flags)) - pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s\n", + pr_warn("kAFS: vnode modified {%llx:%llu} %llx->%llx %s (op=%x)\n", vnode->fid.vid, vnode->fid.vnode, (unsigned long long)vp->dv_before + vp->dv_delta, (unsigned long long)status->data_version, - op->type ? op->type->name : "???"); + op->type ? op->type->name : "???", + op->debug_id); vnode->invalid_before = status->data_version; if (vnode->status.type == AFS_FTYPE_DIR) { @@ -293,8 +294,9 @@ void afs_vnode_commit_status(struct afs_operation *op, struct afs_vnode_param *v op->flags &= ~AFS_OPERATION_DIR_CONFLICT; } } else if (vp->scb.have_status) { - if (vp->dv_before + vp->dv_delta != vp->scb.status.data_version && - vp->speculative) + if (vp->speculative && + (test_bit(AFS_VNODE_MODIFYING, &vnode->flags) || + vp->dv_before != vnode->status.data_version)) /* Ignore the result of a speculative bulk status fetch * if it splits around a modification op, thereby * appearing to regress the data version. @@ -427,7 +429,7 @@ static void afs_get_inode_cache(struct afs_vnode *vnode) } __packed key; struct afs_vnode_cache_aux aux; - if (vnode->status.type == AFS_FTYPE_DIR) { + if (vnode->status.type != AFS_FTYPE_FILE) { vnode->cache = NULL; return; } @@ -910,6 +912,7 @@ int afs_setattr(struct user_namespace *mnt_userns, struct dentry *dentry, } op->ctime = attr->ia_ctime; op->file[0].update_ctime = 1; + op->file[0].modification = true; op->ops = &afs_setattr_operation; ret = afs_do_sync_operation(op); diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 1627b1872812..5ed416f4ff33 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -14,6 +14,7 @@ #include <linux/key.h> #include <linux/workqueue.h> #include <linux/sched.h> +#define FSCACHE_USE_NEW_IO_API #include <linux/fscache.h> #include <linux/backing-dev.h> #include <linux/uuid.h> @@ -31,6 +32,7 @@ struct pagevec; struct afs_call; +struct afs_vnode; /* * Partial file-locking emulation mode. 
(The problem being that AFS3 only @@ -104,7 +106,9 @@ struct afs_call { struct afs_server *server; /* The fileserver record if fs op (pins ref) */ struct afs_vlserver *vlserver; /* The vlserver record if vl op */ void *request; /* request data (first part) */ + size_t iov_len; /* Size of *iter to be used */ struct iov_iter def_iter; /* Default buffer/data iterator */ + struct iov_iter *write_iter; /* Iterator defining write to be made */ struct iov_iter *iter; /* Iterator currently in use */ union { /* Convenience for ->def_iter */ struct kvec kvec[1]; @@ -131,7 +135,6 @@ struct afs_call { unsigned char unmarshall; /* unmarshalling phase */ unsigned char addr_ix; /* Address in ->alist */ bool drop_ref; /* T if need to drop ref for incoming call */ - bool send_pages; /* T if data from mapping should be sent */ bool need_attention; /* T if RxRPC poked us */ bool async; /* T if asynchronous */ bool upgrade; /* T to request service upgrade */ @@ -202,17 +205,19 @@ struct afs_read { loff_t pos; /* Where to start reading */ loff_t len; /* How much we're asking for */ loff_t actual_len; /* How much we're actually getting */ - loff_t remain; /* Amount remaining */ loff_t file_size; /* File size returned by server */ + struct key *key; /* The key to use to reissue the read */ + struct afs_vnode *vnode; /* The file being read into. 
*/ + struct netfs_read_subrequest *subreq; /* Fscache helper read request this belongs to */ afs_dataversion_t data_version; /* Version number returned by server */ refcount_t usage; - unsigned int index; /* Which page we're reading into */ + unsigned int call_debug_id; unsigned int nr_pages; - unsigned int offset; /* offset into current page */ - struct afs_vnode *vnode; - void (*page_done)(struct afs_read *); - struct page **pages; - struct page *array[]; + int error; + void (*done)(struct afs_read *); + void (*cleanup)(struct afs_read *); + struct iov_iter *iter; /* Iterator representing the buffer */ + struct iov_iter def_iter; /* Default iterator */ }; /* @@ -640,6 +645,7 @@ struct afs_vnode { #define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */ #define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */ #define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */ +#define AFS_VNODE_MODIFYING 10 /* Set if we're performing a modification op */ struct list_head wb_keys; /* List of keys available for writeback */ struct list_head pending_locks; /* locks waiting to be granted */ @@ -739,6 +745,7 @@ struct afs_operation_ops { void (*issue_yfs_rpc)(struct afs_operation *op); void (*success)(struct afs_operation *op); void (*aborted)(struct afs_operation *op); + void (*failed)(struct afs_operation *op); void (*edit_dir)(struct afs_operation *op); void (*put)(struct afs_operation *op); }; @@ -756,6 +763,7 @@ struct afs_vnode_param { bool set_size:1; /* Must update i_size */ bool op_unlinked:1; /* True if file was unlinked by op */ bool speculative:1; /* T if speculative status fetch (no vnode lock) */ + bool modification:1; /* Set if the content gets modified */ }; /* @@ -808,12 +816,11 @@ struct afs_operation { afs_lock_type_t type; } lock; struct { - struct address_space *mapping; /* Pages being written from */ - pgoff_t first; /* first page in mapping to deal with */ - pgoff_t last; /* last page in mapping to 
deal with */ - unsigned first_offset; /* offset into mapping[first] */ - unsigned last_to; /* amount of mapping[last] */ - bool laundering; /* Laundering page, PG_writeback not set */ + struct iov_iter *write_iter; + loff_t pos; + loff_t size; + loff_t i_size; + bool laundering; /* Laundering page, PG_writeback not set */ } store; struct { struct iattr *attr; @@ -875,31 +882,31 @@ struct afs_vnode_cache_aux { #define __AFS_PAGE_PRIV_MMAPPED 0x8000UL #endif -static inline unsigned int afs_page_dirty_resolution(void) +static inline unsigned int afs_page_dirty_resolution(struct page *page) { - int shift = PAGE_SHIFT - (__AFS_PAGE_PRIV_SHIFT - 1); + int shift = thp_order(page) + PAGE_SHIFT - (__AFS_PAGE_PRIV_SHIFT - 1); return (shift > 0) ? shift : 0; } -static inline size_t afs_page_dirty_from(unsigned long priv) +static inline size_t afs_page_dirty_from(struct page *page, unsigned long priv) { unsigned long x = priv & __AFS_PAGE_PRIV_MASK; /* The lower bound is inclusive */ - return x << afs_page_dirty_resolution(); + return x << afs_page_dirty_resolution(page); } -static inline size_t afs_page_dirty_to(unsigned long priv) +static inline size_t afs_page_dirty_to(struct page *page, unsigned long priv) { unsigned long x = (priv >> __AFS_PAGE_PRIV_SHIFT) & __AFS_PAGE_PRIV_MASK; /* The upper bound is immediately beyond the region */ - return (x + 1) << afs_page_dirty_resolution(); + return (x + 1) << afs_page_dirty_resolution(page); } -static inline unsigned long afs_page_dirty(size_t from, size_t to) +static inline unsigned long afs_page_dirty(struct page *page, size_t from, size_t to) { - unsigned int res = afs_page_dirty_resolution(); + unsigned int res = afs_page_dirty_resolution(page); from >>= res; to = (to - 1) >> res; return (to << __AFS_PAGE_PRIV_SHIFT) | from; @@ -1040,13 +1047,14 @@ extern void afs_dynroot_depopulate(struct super_block *); extern const struct address_space_operations afs_fs_aops; extern const struct inode_operations afs_file_inode_operations; 
extern const struct file_operations afs_file_operations; +extern const struct netfs_read_request_ops afs_req_ops; extern int afs_cache_wb_key(struct afs_vnode *, struct afs_file *); extern void afs_put_wb_key(struct afs_wb_key *); extern int afs_open(struct inode *, struct file *); extern int afs_release(struct inode *, struct file *); -extern int afs_fetch_data(struct afs_vnode *, struct key *, struct afs_read *); -extern int afs_page_filler(void *, struct page *); +extern int afs_fetch_data(struct afs_vnode *, struct afs_read *); +extern struct afs_read *afs_alloc_read(gfp_t); extern void afs_put_read(struct afs_read *); static inline struct afs_read *afs_get_read(struct afs_read *req) @@ -1270,6 +1278,7 @@ static inline void afs_make_op_call(struct afs_operation *op, struct afs_call *c static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t size) { + call->iov_len = size; call->kvec[0].iov_base = buf; call->kvec[0].iov_len = size; iov_iter_kvec(&call->def_iter, READ, call->kvec, 1, size); @@ -1277,21 +1286,25 @@ static inline void afs_extract_begin(struct afs_call *call, void *buf, size_t si static inline void afs_extract_to_tmp(struct afs_call *call) { + call->iov_len = sizeof(call->tmp); afs_extract_begin(call, &call->tmp, sizeof(call->tmp)); } static inline void afs_extract_to_tmp64(struct afs_call *call) { + call->iov_len = sizeof(call->tmp64); afs_extract_begin(call, &call->tmp64, sizeof(call->tmp64)); } static inline void afs_extract_discard(struct afs_call *call, size_t size) { + call->iov_len = size; iov_iter_discard(&call->def_iter, READ, size); } static inline void afs_extract_to_buf(struct afs_call *call, size_t size) { + call->iov_len = size; afs_extract_begin(call, call->buffer, size); } diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 8be709cb8542..23a1a92d64bb 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -271,40 +271,6 @@ void afs_flat_call_destructor(struct afs_call *call) call->buffer = NULL; } -#define 
AFS_BVEC_MAX 8 - -/* - * Load the given bvec with the next few pages. - */ -static void afs_load_bvec(struct afs_call *call, struct msghdr *msg, - struct bio_vec *bv, pgoff_t first, pgoff_t last, - unsigned offset) -{ - struct afs_operation *op = call->op; - struct page *pages[AFS_BVEC_MAX]; - unsigned int nr, n, i, to, bytes = 0; - - nr = min_t(pgoff_t, last - first + 1, AFS_BVEC_MAX); - n = find_get_pages_contig(op->store.mapping, first, nr, pages); - ASSERTCMP(n, ==, nr); - - msg->msg_flags |= MSG_MORE; - for (i = 0; i < nr; i++) { - to = PAGE_SIZE; - if (first + i >= last) { - to = op->store.last_to; - msg->msg_flags &= ~MSG_MORE; - } - bv[i].bv_page = pages[i]; - bv[i].bv_len = to - offset; - bv[i].bv_offset = offset; - bytes += to - offset; - offset = 0; - } - - iov_iter_bvec(&msg->msg_iter, WRITE, bv, nr, bytes); -} - /* * Advance the AFS call state when the RxRPC call ends the transmit phase. */ @@ -318,42 +284,6 @@ static void afs_notify_end_request_tx(struct sock *sock, } /* - * attach the data from a bunch of pages on an inode to a call - */ -static int afs_send_pages(struct afs_call *call, struct msghdr *msg) -{ - struct afs_operation *op = call->op; - struct bio_vec bv[AFS_BVEC_MAX]; - unsigned int bytes, nr, loop, offset; - pgoff_t first = op->store.first, last = op->store.last; - int ret; - - offset = op->store.first_offset; - op->store.first_offset = 0; - - do { - afs_load_bvec(call, msg, bv, first, last, offset); - trace_afs_send_pages(call, msg, first, last, offset); - - offset = 0; - bytes = msg->msg_iter.count; - nr = msg->msg_iter.nr_segs; - - ret = rxrpc_kernel_send_data(op->net->socket, call->rxcall, msg, - bytes, afs_notify_end_request_tx); - for (loop = 0; loop < nr; loop++) - put_page(bv[loop].bv_page); - if (ret < 0) - break; - - first += nr; - } while (first <= last); - - trace_afs_sent_pages(call, op->store.first, last, first, ret); - return ret; -} - -/* * Initiate a call and synchronously queue up the parameters for dispatch. 
Any * error is stored into the call struct, which the caller must check for. */ @@ -363,6 +293,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) struct rxrpc_call *rxcall; struct msghdr msg; struct kvec iov[1]; + size_t len; s64 tx_total_len; int ret; @@ -383,21 +314,8 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) * after the initial fixed part. */ tx_total_len = call->request_size; - if (call->send_pages) { - struct afs_operation *op = call->op; - - if (op->store.last == op->store.first) { - tx_total_len += op->store.last_to - op->store.first_offset; - } else { - /* It looks mathematically like you should be able to - * combine the following lines with the ones above, but - * unsigned arithmetic is fun when it wraps... - */ - tx_total_len += PAGE_SIZE - op->store.first_offset; - tx_total_len += op->store.last_to; - tx_total_len += (op->store.last - op->store.first - 1) * PAGE_SIZE; - } - } + if (call->write_iter) + tx_total_len += iov_iter_count(call->write_iter); /* If the call is going to be asynchronous, we need an extra ref for * the call to hold itself so the caller need not hang on to its ref. @@ -439,7 +357,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) iov_iter_kvec(&msg.msg_iter, WRITE, iov, 1, call->request_size); msg.msg_control = NULL; msg.msg_controllen = 0; - msg.msg_flags = MSG_WAITALL | (call->send_pages ? MSG_MORE : 0); + msg.msg_flags = MSG_WAITALL | (call->write_iter ? 
MSG_MORE : 0); ret = rxrpc_kernel_send_data(call->net->socket, rxcall, &msg, call->request_size, @@ -447,8 +365,18 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) if (ret < 0) goto error_do_abort; - if (call->send_pages) { - ret = afs_send_pages(call, &msg); + if (call->write_iter) { + msg.msg_iter = *call->write_iter; + msg.msg_flags &= ~MSG_MORE; + trace_afs_send_data(call, &msg); + + ret = rxrpc_kernel_send_data(call->net->socket, + call->rxcall, &msg, + iov_iter_count(&msg.msg_iter), + afs_notify_end_request_tx); + *call->write_iter = msg.msg_iter; + + trace_afs_sent_data(call, &msg, ret); if (ret < 0) goto error_do_abort; } @@ -466,9 +394,10 @@ error_do_abort: rxrpc_kernel_abort_call(call->net->socket, rxcall, RX_USER_ABORT, ret, "KSD"); } else { + len = 0; iov_iter_kvec(&msg.msg_iter, READ, NULL, 0, 0); rxrpc_kernel_recv_data(call->net->socket, rxcall, - &msg.msg_iter, false, + &msg.msg_iter, &len, false, &call->abort_code, &call->service_id); ac->abort_code = call->abort_code; ac->responded = true; @@ -499,11 +428,45 @@ error_kill_call: } /* + * Log remote abort codes that indicate that we have a protocol disagreement + * with the server. 
+ */ +static void afs_log_error(struct afs_call *call, s32 remote_abort) +{ + static int max = 0; + const char *msg; + int m; + + switch (remote_abort) { + case RX_EOF: msg = "unexpected EOF"; break; + case RXGEN_CC_MARSHAL: msg = "client marshalling"; break; + case RXGEN_CC_UNMARSHAL: msg = "client unmarshalling"; break; + case RXGEN_SS_MARSHAL: msg = "server marshalling"; break; + case RXGEN_SS_UNMARSHAL: msg = "server unmarshalling"; break; + case RXGEN_DECODE: msg = "opcode decode"; break; + case RXGEN_SS_XDRFREE: msg = "server XDR cleanup"; break; + case RXGEN_CC_XDRFREE: msg = "client XDR cleanup"; break; + case -32: msg = "insufficient data"; break; + default: + return; + } + + m = max; + if (m < 3) { + max = m + 1; + pr_notice("kAFS: Peer reported %s failure on %s [%pISp]\n", + msg, call->type->name, + &call->alist->addrs[call->addr_ix].transport); + } +} + +/* * deliver messages to a call */ static void afs_deliver_to_call(struct afs_call *call) { enum afs_call_state state; + size_t len; u32 abort_code, remote_abort = 0; int ret; @@ -516,10 +479,11 @@ static void afs_deliver_to_call(struct afs_call *call) state == AFS_CALL_SV_AWAIT_ACK ) { if (state == AFS_CALL_SV_AWAIT_ACK) { + len = 0; iov_iter_kvec(&call->def_iter, READ, NULL, 0, 0); ret = rxrpc_kernel_recv_data(call->net->socket, call->rxcall, &call->def_iter, - false, &remote_abort, + &len, false, &remote_abort, &call->service_id); trace_afs_receive_data(call, &call->def_iter, false, ret); @@ -559,6 +523,7 @@ static void afs_deliver_to_call(struct afs_call *call) goto out; case -ECONNABORTED: ASSERTCMP(state, ==, AFS_CALL_COMPLETE); + afs_log_error(call, call->abort_code); goto done; case -ENOTSUPP: abort_code = RXGEN_OPCODE; @@ -929,10 +894,11 @@ int afs_extract_data(struct afs_call *call, bool want_more) u32 remote_abort = 0; int ret; - _enter("{%s,%zu},%d", call->type->name, iov_iter_count(iter), want_more); + _enter("{%s,%zu,%zu},%d", + call->type->name, call->iov_len, iov_iter_count(iter), 
want_more); ret = rxrpc_kernel_recv_data(net->socket, call->rxcall, iter, - want_more, &remote_abort, + &call->iov_len, want_more, &remote_abort, &call->service_id); if (ret == 0 || ret == -EAGAIN) return ret; diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c index dc9327332f06..00fca3c66ba6 100644 --- a/fs/afs/vlclient.c +++ b/fs/afs/vlclient.c @@ -593,6 +593,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) if (ret < 0) return ret; call->unmarshall = 6; + fallthrough; case 6: break; diff --git a/fs/afs/write.c b/fs/afs/write.c index c9195fc67fd8..3edb6204b937 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -11,6 +11,8 @@ #include <linux/pagemap.h> #include <linux/writeback.h> #include <linux/pagevec.h> +#include <linux/netfs.h> +#include <linux/fscache.h> #include "internal.h" /* @@ -23,55 +25,6 @@ int afs_set_page_dirty(struct page *page) } /* - * partly or wholly fill a page that's under preparation for writing - */ -static int afs_fill_page(struct afs_vnode *vnode, struct key *key, - loff_t pos, unsigned int len, struct page *page) -{ - struct afs_read *req; - size_t p; - void *data; - int ret; - - _enter(",,%llu", (unsigned long long)pos); - - if (pos >= vnode->vfs_inode.i_size) { - p = pos & ~PAGE_MASK; - ASSERTCMP(p + len, <=, PAGE_SIZE); - data = kmap(page); - memset(data + p, 0, len); - kunmap(page); - return 0; - } - - req = kzalloc(struct_size(req, array, 1), GFP_KERNEL); - if (!req) - return -ENOMEM; - - refcount_set(&req->usage, 1); - req->pos = pos; - req->len = len; - req->nr_pages = 1; - req->pages = req->array; - req->pages[0] = page; - get_page(page); - - ret = afs_fetch_data(vnode, key, req); - afs_put_read(req); - if (ret < 0) { - if (ret == -ENOENT) { - _debug("got NOENT from server" - " - marking file deleted and stale"); - set_bit(AFS_VNODE_DELETED, &vnode->flags); - ret = -ESTALE; - } - } - - _leave(" = %d", ret); - return ret; -} - -/* * prepare to perform part of a write to a page */ int afs_write_begin(struct 
file *file, struct address_space *mapping, @@ -80,47 +33,40 @@ int afs_write_begin(struct file *file, struct address_space *mapping, { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); struct page *page; - struct key *key = afs_file_key(file); unsigned long priv; - unsigned f, from = pos & (PAGE_SIZE - 1); - unsigned t, to = from + len; - pgoff_t index = pos >> PAGE_SHIFT; + unsigned f, from; + unsigned t, to; + pgoff_t index; int ret; - _enter("{%llx:%llu},{%lx},%u,%u", - vnode->fid.vid, vnode->fid.vnode, index, from, to); + _enter("{%llx:%llu},%llx,%x", + vnode->fid.vid, vnode->fid.vnode, pos, len); - page = grab_cache_page_write_begin(mapping, index, flags); - if (!page) - return -ENOMEM; + /* Prefetch area to be written into the cache if we're caching this + * file. We need to do this before we get a lock on the page in case + * there's more than one writer competing for the same cache block. + */ + ret = netfs_write_begin(file, mapping, pos, len, flags, &page, fsdata, + &afs_req_ops, NULL); + if (ret < 0) + return ret; - if (!PageUptodate(page) && len != PAGE_SIZE) { - ret = afs_fill_page(vnode, key, pos & PAGE_MASK, PAGE_SIZE, page); - if (ret < 0) { - unlock_page(page); - put_page(page); - _leave(" = %d [prep]", ret); - return ret; - } - SetPageUptodate(page); - } + index = page->index; + from = pos - index * PAGE_SIZE; + to = from + len; try_again: /* See if this page is already partially written in a way that we can * merge the new write with. 
*/ - t = f = 0; if (PagePrivate(page)) { priv = page_private(page); - f = afs_page_dirty_from(priv); - t = afs_page_dirty_to(priv); + f = afs_page_dirty_from(page, priv); + t = afs_page_dirty_to(page, priv); ASSERTCMP(f, <=, t); - } - if (f != t) { if (PageWriteback(page)) { - trace_afs_page_dirty(vnode, tracepoint_string("alrdy"), - page->index, priv); + trace_afs_page_dirty(vnode, tracepoint_string("alrdy"), page); goto flush_conflicting_write; } /* If the file is being filled locally, allow inter-write @@ -164,12 +110,10 @@ int afs_write_end(struct file *file, struct address_space *mapping, struct page *page, void *fsdata) { struct afs_vnode *vnode = AFS_FS_I(file_inode(file)); - struct key *key = afs_file_key(file); unsigned long priv; - unsigned int f, from = pos & (PAGE_SIZE - 1); + unsigned int f, from = pos & (thp_size(page) - 1); unsigned int t, to = from + copied; loff_t i_size, maybe_i_size; - int ret = 0; _enter("{%llx:%llu},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index); @@ -188,88 +132,75 @@ int afs_write_end(struct file *file, struct address_space *mapping, write_sequnlock(&vnode->cb_lock); } - if (!PageUptodate(page)) { - if (copied < len) { - /* Try and load any missing data from the server. The - * unmarshalling routine will take care of clearing any - * bits that are beyond the EOF. 
- */ - ret = afs_fill_page(vnode, key, pos + copied, - len - copied, page); - if (ret < 0) - goto out; - } - SetPageUptodate(page); - } + ASSERT(PageUptodate(page)); if (PagePrivate(page)) { priv = page_private(page); - f = afs_page_dirty_from(priv); - t = afs_page_dirty_to(priv); + f = afs_page_dirty_from(page, priv); + t = afs_page_dirty_to(page, priv); if (from < f) f = from; if (to > t) t = to; - priv = afs_page_dirty(f, t); + priv = afs_page_dirty(page, f, t); set_page_private(page, priv); - trace_afs_page_dirty(vnode, tracepoint_string("dirty+"), - page->index, priv); + trace_afs_page_dirty(vnode, tracepoint_string("dirty+"), page); } else { - priv = afs_page_dirty(from, to); + priv = afs_page_dirty(page, from, to); attach_page_private(page, (void *)priv); - trace_afs_page_dirty(vnode, tracepoint_string("dirty"), - page->index, priv); + trace_afs_page_dirty(vnode, tracepoint_string("dirty"), page); } - set_page_dirty(page); - if (PageDirty(page)) - _debug("dirtied"); - ret = copied; + if (set_page_dirty(page)) + _debug("dirtied %lx", page->index); out: unlock_page(page); put_page(page); - return ret; + return copied; } /* * kill all the pages in the given range */ static void afs_kill_pages(struct address_space *mapping, - pgoff_t first, pgoff_t last) + loff_t start, loff_t len) { struct afs_vnode *vnode = AFS_FS_I(mapping->host); struct pagevec pv; - unsigned count, loop; + unsigned int loop, psize; - _enter("{%llx:%llu},%lx-%lx", - vnode->fid.vid, vnode->fid.vnode, first, last); + _enter("{%llx:%llu},%llx @%llx", + vnode->fid.vid, vnode->fid.vnode, len, start); pagevec_init(&pv); do { - _debug("kill %lx-%lx", first, last); + _debug("kill %llx @%llx", len, start); - count = last - first + 1; - if (count > PAGEVEC_SIZE) - count = PAGEVEC_SIZE; - pv.nr = find_get_pages_contig(mapping, first, count, pv.pages); - ASSERTCMP(pv.nr, ==, count); + pv.nr = find_get_pages_contig(mapping, start / PAGE_SIZE, + PAGEVEC_SIZE, pv.pages); + if (pv.nr == 0) + break; - for 
(loop = 0; loop < count; loop++) { + for (loop = 0; loop < pv.nr; loop++) { struct page *page = pv.pages[loop]; + + if (page->index * PAGE_SIZE >= start + len) + break; + + psize = thp_size(page); + start += psize; + len -= psize; ClearPageUptodate(page); - SetPageError(page); end_page_writeback(page); - if (page->index >= first) - first = page->index + 1; lock_page(page); generic_error_remove_page(mapping, page); unlock_page(page); } __pagevec_release(&pv); - } while (first <= last); + } while (len > 0); _leave(""); } @@ -279,37 +210,40 @@ static void afs_kill_pages(struct address_space *mapping, */ static void afs_redirty_pages(struct writeback_control *wbc, struct address_space *mapping, - pgoff_t first, pgoff_t last) + loff_t start, loff_t len) { struct afs_vnode *vnode = AFS_FS_I(mapping->host); struct pagevec pv; - unsigned count, loop; + unsigned int loop, psize; - _enter("{%llx:%llu},%lx-%lx", - vnode->fid.vid, vnode->fid.vnode, first, last); + _enter("{%llx:%llu},%llx @%llx", + vnode->fid.vid, vnode->fid.vnode, len, start); pagevec_init(&pv); do { - _debug("redirty %lx-%lx", first, last); + _debug("redirty %llx @%llx", len, start); - count = last - first + 1; - if (count > PAGEVEC_SIZE) - count = PAGEVEC_SIZE; - pv.nr = find_get_pages_contig(mapping, first, count, pv.pages); - ASSERTCMP(pv.nr, ==, count); + pv.nr = find_get_pages_contig(mapping, start / PAGE_SIZE, + PAGEVEC_SIZE, pv.pages); + if (pv.nr == 0) + break; - for (loop = 0; loop < count; loop++) { + for (loop = 0; loop < pv.nr; loop++) { struct page *page = pv.pages[loop]; + if (page->index * PAGE_SIZE >= start + len) + break; + + psize = thp_size(page); + start += psize; + len -= psize; redirty_page_for_writepage(wbc, page); end_page_writeback(page); - if (page->index >= first) - first = page->index + 1; } __pagevec_release(&pv); - } while (first <= last); + } while (len > 0); _leave(""); } @@ -317,37 +251,32 @@ static void afs_redirty_pages(struct writeback_control *wbc, /* * completion of 
write to server */ -static void afs_pages_written_back(struct afs_vnode *vnode, - pgoff_t first, pgoff_t last) +static void afs_pages_written_back(struct afs_vnode *vnode, loff_t start, unsigned int len) { - struct pagevec pv; - unsigned long priv; - unsigned count, loop; + struct address_space *mapping = vnode->vfs_inode.i_mapping; + struct page *page; + pgoff_t end; - _enter("{%llx:%llu},{%lx-%lx}", - vnode->fid.vid, vnode->fid.vnode, first, last); + XA_STATE(xas, &mapping->i_pages, start / PAGE_SIZE); - pagevec_init(&pv); + _enter("{%llx:%llu},{%x @%llx}", + vnode->fid.vid, vnode->fid.vnode, len, start); - do { - _debug("done %lx-%lx", first, last); - - count = last - first + 1; - if (count > PAGEVEC_SIZE) - count = PAGEVEC_SIZE; - pv.nr = find_get_pages_contig(vnode->vfs_inode.i_mapping, - first, count, pv.pages); - ASSERTCMP(pv.nr, ==, count); - - for (loop = 0; loop < count; loop++) { - priv = (unsigned long)detach_page_private(pv.pages[loop]); - trace_afs_page_dirty(vnode, tracepoint_string("clear"), - pv.pages[loop]->index, priv); - end_page_writeback(pv.pages[loop]); + rcu_read_lock(); + + end = (start + len - 1) / PAGE_SIZE; + xas_for_each(&xas, page, end) { + if (!PageWriteback(page)) { + kdebug("bad %x @%llx page %lx %lx", len, start, page->index, end); + ASSERT(PageWriteback(page)); } - first += count; - __pagevec_release(&pv); - } while (first <= last); + + trace_afs_page_dirty(vnode, tracepoint_string("clear"), page); + detach_page_private(page); + page_endio(page, true, 0); + } + + rcu_read_unlock(); afs_prune_wb_keys(vnode); _leave(""); @@ -402,11 +331,9 @@ static void afs_store_data_success(struct afs_operation *op) afs_vnode_commit_status(op, &op->file[0]); if (op->error == 0) { if (!op->store.laundering) - afs_pages_written_back(vnode, op->store.first, op->store.last); + afs_pages_written_back(vnode, op->store.pos, op->store.size); afs_stat_v(vnode, n_stores); - atomic_long_add((op->store.last * PAGE_SIZE + op->store.last_to) - - 
(op->store.first * PAGE_SIZE + op->store.first_offset), - &afs_v2net(vnode)->n_store_bytes); + atomic_long_add(op->store.size, &afs_v2net(vnode)->n_store_bytes); } } @@ -419,21 +346,20 @@ static const struct afs_operation_ops afs_store_data_operation = { /* * write to a file */ -static int afs_store_data(struct address_space *mapping, - pgoff_t first, pgoff_t last, - unsigned offset, unsigned to, bool laundering) +static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t pos, + bool laundering) { - struct afs_vnode *vnode = AFS_FS_I(mapping->host); struct afs_operation *op; struct afs_wb_key *wbk = NULL; - int ret; + loff_t size = iov_iter_count(iter), i_size; + int ret = -ENOKEY; - _enter("%s{%llx:%llu.%u},%lx,%lx,%x,%x", + _enter("%s{%llx:%llu.%u},%llx,%llx", vnode->volume->name, vnode->fid.vid, vnode->fid.vnode, vnode->fid.unique, - first, last, offset, to); + size, pos); ret = afs_get_writeback_key(vnode, &wbk); if (ret) { @@ -447,13 +373,15 @@ static int afs_store_data(struct address_space *mapping, return -ENOMEM; } + i_size = i_size_read(&vnode->vfs_inode); + afs_op_set_vnode(op, 0, vnode); op->file[0].dv_delta = 1; - op->store.mapping = mapping; - op->store.first = first; - op->store.last = last; - op->store.first_offset = offset; - op->store.last_to = to; + op->file[0].modification = true; + op->store.write_iter = iter; + op->store.pos = pos; + op->store.size = size; + op->store.i_size = max(pos + size, i_size); op->store.laundering = laundering; op->mtime = vnode->vfs_inode.i_mtime; op->flags |= AFS_OPERATION_UNINTR; @@ -487,73 +415,58 @@ try_next_key: } /* - * Synchronously write back the locked page and any subsequent non-locked dirty - * pages. + * Extend the region to be written back to include subsequent contiguously + * dirty pages if possible, but don't sleep while doing so. + * + * If this page holds new content, then we can include filler zeros in the + * writeback. 
*/ -static int afs_write_back_from_locked_page(struct address_space *mapping, - struct writeback_control *wbc, - struct page *primary_page, - pgoff_t final_page) +static void afs_extend_writeback(struct address_space *mapping, + struct afs_vnode *vnode, + long *_count, + loff_t start, + loff_t max_len, + bool new_content, + unsigned int *_len) { - struct afs_vnode *vnode = AFS_FS_I(mapping->host); - struct page *pages[8], *page; - unsigned long count, priv; - unsigned n, offset, to, f, t; - pgoff_t start, first, last; - loff_t i_size, end; - int loop, ret; - - _enter(",%lx", primary_page->index); + struct pagevec pvec; + struct page *page; + unsigned long priv; + unsigned int psize, filler = 0; + unsigned int f, t; + loff_t len = *_len; + pgoff_t index = (start + len) / PAGE_SIZE; + bool stop = true; + unsigned int i; - count = 1; - if (test_set_page_writeback(primary_page)) - BUG(); + XA_STATE(xas, &mapping->i_pages, index); + pagevec_init(&pvec); - /* Find all consecutive lockable dirty pages that have contiguous - * written regions, stopping when we find a page that is not - * immediately lockable, is not dirty or is missing, or we reach the - * end of the range. 
- */ - start = primary_page->index; - priv = page_private(primary_page); - offset = afs_page_dirty_from(priv); - to = afs_page_dirty_to(priv); - trace_afs_page_dirty(vnode, tracepoint_string("store"), - primary_page->index, priv); - - WARN_ON(offset == to); - if (offset == to) - trace_afs_page_dirty(vnode, tracepoint_string("WARN"), - primary_page->index, priv); - - if (start >= final_page || - (to < PAGE_SIZE && !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags))) - goto no_more; - - start++; do { - _debug("more %lx [%lx]", start, count); - n = final_page - start + 1; - if (n > ARRAY_SIZE(pages)) - n = ARRAY_SIZE(pages); - n = find_get_pages_contig(mapping, start, ARRAY_SIZE(pages), pages); - _debug("fgpc %u", n); - if (n == 0) - goto no_more; - if (pages[0]->index != start) { - do { - put_page(pages[--n]); - } while (n > 0); - goto no_more; - } + /* Firstly, we gather up a batch of contiguous dirty pages + * under the RCU read lock - but we can't clear the dirty flags + * there if any of those pages are mapped. + */ + rcu_read_lock(); - for (loop = 0; loop < n; loop++) { - page = pages[loop]; - if (to != PAGE_SIZE && - !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)) + xas_for_each(&xas, page, ULONG_MAX) { + stop = true; + if (xas_retry(&xas, page)) + continue; + if (xa_is_value(page)) break; - if (page->index > final_page) + if (page->index != index) break; + + if (!page_cache_get_speculative(page)) { + xas_reset(&xas); + continue; + } + + /* Has the page moved or been split? 
*/ + if (unlikely(page != xas_reload(&xas))) + break; + if (!trylock_page(page)) break; if (!PageDirty(page) || PageWriteback(page)) { @@ -561,57 +474,134 @@ static int afs_write_back_from_locked_page(struct address_space *mapping, break; } + psize = thp_size(page); priv = page_private(page); - f = afs_page_dirty_from(priv); - t = afs_page_dirty_to(priv); - if (f != 0 && - !test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags)) { + f = afs_page_dirty_from(page, priv); + t = afs_page_dirty_to(page, priv); + if (f != 0 && !new_content) { unlock_page(page); break; } - to = t; - trace_afs_page_dirty(vnode, tracepoint_string("store+"), - page->index, priv); + len += filler + t; + filler = psize - t; + if (len >= max_len || *_count <= 0) + stop = true; + else if (t == psize || new_content) + stop = false; + + index += thp_nr_pages(page); + if (!pagevec_add(&pvec, page)) + break; + if (stop) + break; + } + + if (!stop) + xas_pause(&xas); + rcu_read_unlock(); + + /* Now, if we obtained any pages, we can shift them to being + * writable and mark them for caching. + */ + if (!pagevec_count(&pvec)) + break; + + for (i = 0; i < pagevec_count(&pvec); i++) { + page = pvec.pages[i]; + trace_afs_page_dirty(vnode, tracepoint_string("store+"), page); if (!clear_page_dirty_for_io(page)) BUG(); if (test_set_page_writeback(page)) BUG(); + + *_count -= thp_nr_pages(page); unlock_page(page); - put_page(page); - } - count += loop; - if (loop < n) { - for (; loop < n; loop++) - put_page(pages[loop]); - goto no_more; } - start += loop; - } while (start <= final_page && count < 65536); + pagevec_release(&pvec); + cond_resched(); + } while (!stop); + + *_len = len; +} + +/* + * Synchronously write back the locked page and any subsequent non-locked dirty + * pages. 
+ */ +static ssize_t afs_write_back_from_locked_page(struct address_space *mapping, + struct writeback_control *wbc, + struct page *page, + loff_t start, loff_t end) +{ + struct afs_vnode *vnode = AFS_FS_I(mapping->host); + struct iov_iter iter; + unsigned long priv; + unsigned int offset, to, len, max_len; + loff_t i_size = i_size_read(&vnode->vfs_inode); + bool new_content = test_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); + long count = wbc->nr_to_write; + int ret; + + _enter(",%lx,%llx-%llx", page->index, start, end); + + if (test_set_page_writeback(page)) + BUG(); + + count -= thp_nr_pages(page); + + /* Find all consecutive lockable dirty pages that have contiguous + * written regions, stopping when we find a page that is not + * immediately lockable, is not dirty or is missing, or we reach the + * end of the range. + */ + priv = page_private(page); + offset = afs_page_dirty_from(page, priv); + to = afs_page_dirty_to(page, priv); + trace_afs_page_dirty(vnode, tracepoint_string("store"), page); + + len = to - offset; + start += offset; + if (start < i_size) { + /* Trim the write to the EOF; the extra data is ignored. Also + * put an upper limit on the size of a single storedata op. + */ + max_len = 65536 * 4096; + max_len = min_t(unsigned long long, max_len, end - start + 1); + max_len = min_t(unsigned long long, max_len, i_size - start); + + if (len < max_len && + (to == thp_size(page) || new_content)) + afs_extend_writeback(mapping, vnode, &count, + start, max_len, new_content, &len); + len = min_t(loff_t, len, max_len); + } -no_more: /* We now have a contiguous set of dirty pages, each with writeback * set; the first page is still locked at this point, but all the rest * have been unlocked. 
*/ - unlock_page(primary_page); + unlock_page(page); - first = primary_page->index; - last = first + count - 1; + if (start < i_size) { + _debug("write back %x @%llx [%llx]", len, start, i_size); - end = (loff_t)last * PAGE_SIZE + to; - i_size = i_size_read(&vnode->vfs_inode); + iov_iter_xarray(&iter, WRITE, &mapping->i_pages, start, len); + ret = afs_store_data(vnode, &iter, start, false); + } else { + _debug("write discard %x @%llx [%llx]", len, start, i_size); - _debug("write back %lx[%u..] to %lx[..%u]", first, offset, last, to); - if (end > i_size) - to = i_size & ~PAGE_MASK; + /* The dirty region was entirely beyond the EOF. */ + afs_pages_written_back(vnode, start, len); + ret = 0; + } - ret = afs_store_data(mapping, first, last, offset, to, false); switch (ret) { case 0: - ret = count; + wbc->nr_to_write = count; + ret = len; break; default: @@ -623,13 +613,13 @@ no_more: case -EKEYEXPIRED: case -EKEYREJECTED: case -EKEYREVOKED: - afs_redirty_pages(wbc, mapping, first, last); + afs_redirty_pages(wbc, mapping, start, len); mapping_set_error(mapping, ret); break; case -EDQUOT: case -ENOSPC: - afs_redirty_pages(wbc, mapping, first, last); + afs_redirty_pages(wbc, mapping, start, len); mapping_set_error(mapping, -ENOSPC); break; @@ -641,7 +631,7 @@ no_more: case -ENOMEDIUM: case -ENXIO: trace_afs_file_error(vnode, ret, afs_file_error_writeback_fail); - afs_kill_pages(mapping, first, last); + afs_kill_pages(mapping, start, len); mapping_set_error(mapping, ret); break; } @@ -656,19 +646,19 @@ no_more: */ int afs_writepage(struct page *page, struct writeback_control *wbc) { - int ret; + ssize_t ret; + loff_t start; _enter("{%lx},", page->index); + start = page->index * PAGE_SIZE; ret = afs_write_back_from_locked_page(page->mapping, wbc, page, - wbc->range_end >> PAGE_SHIFT); + start, LLONG_MAX - start); if (ret < 0) { - _leave(" = %d", ret); - return 0; + _leave(" = %zd", ret); + return ret; } - wbc->nr_to_write -= ret; - _leave(" = 0"); return 0; } @@ -678,35 
+668,46 @@ int afs_writepage(struct page *page, struct writeback_control *wbc) */ static int afs_writepages_region(struct address_space *mapping, struct writeback_control *wbc, - pgoff_t index, pgoff_t end, pgoff_t *_next) + loff_t start, loff_t end, loff_t *_next) { struct page *page; - int ret, n; + ssize_t ret; + int n; - _enter(",,%lx,%lx,", index, end); + _enter("%llx,%llx,", start, end); do { - n = find_get_pages_range_tag(mapping, &index, end, - PAGECACHE_TAG_DIRTY, 1, &page); + pgoff_t index = start / PAGE_SIZE; + + n = find_get_pages_range_tag(mapping, &index, end / PAGE_SIZE, + PAGECACHE_TAG_DIRTY, 1, &page); if (!n) break; + start = (loff_t)page->index * PAGE_SIZE; /* May regress with THPs */ + _debug("wback %lx", page->index); - /* - * at this point we hold neither the i_pages lock nor the + /* At this point we hold neither the i_pages lock nor the * page lock: the page may be truncated or invalidated * (changing page->mapping to NULL), or even swizzled * back from swapper_space to tmpfs file mapping */ - ret = lock_page_killable(page); - if (ret < 0) { - put_page(page); - _leave(" = %d", ret); - return ret; + if (wbc->sync_mode != WB_SYNC_NONE) { + ret = lock_page_killable(page); + if (ret < 0) { + put_page(page); + return ret; + } + } else { + if (!trylock_page(page)) { + put_page(page); + return 0; + } } if (page->mapping != mapping || !PageDirty(page)) { + start += thp_size(page); unlock_page(page); put_page(page); continue; @@ -722,20 +723,20 @@ static int afs_writepages_region(struct address_space *mapping, if (!clear_page_dirty_for_io(page)) BUG(); - ret = afs_write_back_from_locked_page(mapping, wbc, page, end); + ret = afs_write_back_from_locked_page(mapping, wbc, page, start, end); put_page(page); if (ret < 0) { - _leave(" = %d", ret); + _leave(" = %zd", ret); return ret; } - wbc->nr_to_write -= ret; + start += ret * PAGE_SIZE; cond_resched(); - } while (index < end && wbc->nr_to_write > 0); + } while (wbc->nr_to_write > 0); - *_next = index; 
- _leave(" = 0 [%lx]", *_next); + *_next = start; + _leave(" = 0 [%llx]", *_next); return 0; } @@ -746,7 +747,7 @@ int afs_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct afs_vnode *vnode = AFS_FS_I(mapping->host); - pgoff_t start, end, next; + loff_t start, next; int ret; _enter(""); @@ -761,22 +762,19 @@ int afs_writepages(struct address_space *mapping, return 0; if (wbc->range_cyclic) { - start = mapping->writeback_index; - end = -1; - ret = afs_writepages_region(mapping, wbc, start, end, &next); + start = mapping->writeback_index * PAGE_SIZE; + ret = afs_writepages_region(mapping, wbc, start, LLONG_MAX, &next); if (start > 0 && wbc->nr_to_write > 0 && ret == 0) ret = afs_writepages_region(mapping, wbc, 0, start, &next); - mapping->writeback_index = next; + mapping->writeback_index = next / PAGE_SIZE; } else if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) { - end = (pgoff_t)(LLONG_MAX >> PAGE_SHIFT); - ret = afs_writepages_region(mapping, wbc, 0, end, &next); + ret = afs_writepages_region(mapping, wbc, 0, LLONG_MAX, &next); if (wbc->nr_to_write > 0) mapping->writeback_index = next; } else { - start = wbc->range_start >> PAGE_SHIFT; - end = wbc->range_end >> PAGE_SHIFT; - ret = afs_writepages_region(mapping, wbc, start, end, &next); + ret = afs_writepages_region(mapping, wbc, + wbc->range_start, wbc->range_end, &next); } up_read(&vnode->validate_lock); @@ -834,13 +832,13 @@ int afs_fsync(struct file *file, loff_t start, loff_t end, int datasync) */ vm_fault_t afs_page_mkwrite(struct vm_fault *vmf) { + struct page *page = thp_head(vmf->page); struct file *file = vmf->vma->vm_file; struct inode *inode = file_inode(file); struct afs_vnode *vnode = AFS_FS_I(inode); unsigned long priv; - _enter("{{%llx:%llu}},{%lx}", - vnode->fid.vid, vnode->fid.vnode, vmf->page->index); + _enter("{{%llx:%llu}},{%lx}", vnode->fid.vid, vnode->fid.vnode, page->index); sb_start_pagefault(inode->i_sb); @@ -848,30 +846,35 @@ vm_fault_t 
afs_page_mkwrite(struct vm_fault *vmf) * be modified. We then assume the entire page will need writing back. */ #ifdef CONFIG_AFS_FSCACHE - fscache_wait_on_page_write(vnode->cache, vmf->page); + if (PageFsCache(page) && + wait_on_page_fscache_killable(page) < 0) + return VM_FAULT_RETRY; #endif - if (PageWriteback(vmf->page) && - wait_on_page_bit_killable(vmf->page, PG_writeback) < 0) + if (wait_on_page_writeback_killable(page)) return VM_FAULT_RETRY; - if (lock_page_killable(vmf->page) < 0) + if (lock_page_killable(page) < 0) return VM_FAULT_RETRY; /* We mustn't change page->private until writeback is complete as that * details the portion of the page we need to write back and we might * need to redirty the page if there's a problem. */ - wait_on_page_writeback(vmf->page); + if (wait_on_page_writeback_killable(page) < 0) { + unlock_page(page); + return VM_FAULT_RETRY; + } - priv = afs_page_dirty(0, PAGE_SIZE); + priv = afs_page_dirty(page, 0, thp_size(page)); priv = afs_page_dirty_mmapped(priv); - trace_afs_page_dirty(vnode, tracepoint_string("mkwrite"), - vmf->page->index, priv); - if (PagePrivate(vmf->page)) - set_page_private(vmf->page, priv); - else - attach_page_private(vmf->page, (void *)priv); + if (PagePrivate(page)) { + set_page_private(page, priv); + trace_afs_page_dirty(vnode, tracepoint_string("mkwrite+"), page); + } else { + attach_page_private(page, (void *)priv); + trace_afs_page_dirty(vnode, tracepoint_string("mkwrite"), page); + } file_update_time(file); sb_end_pagefault(inode->i_sb); @@ -913,6 +916,8 @@ int afs_launder_page(struct page *page) { struct address_space *mapping = page->mapping; struct afs_vnode *vnode = AFS_FS_I(mapping->host); + struct iov_iter iter; + struct bio_vec bv[1]; unsigned long priv; unsigned int f, t; int ret = 0; @@ -922,26 +927,24 @@ int afs_launder_page(struct page *page) priv = page_private(page); if (clear_page_dirty_for_io(page)) { f = 0; - t = PAGE_SIZE; + t = thp_size(page); if (PagePrivate(page)) { - f = 
afs_page_dirty_from(priv); - t = afs_page_dirty_to(priv); + f = afs_page_dirty_from(page, priv); + t = afs_page_dirty_to(page, priv); } - trace_afs_page_dirty(vnode, tracepoint_string("launder"), - page->index, priv); - ret = afs_store_data(mapping, page->index, page->index, t, f, true); - } - - priv = (unsigned long)detach_page_private(page); - trace_afs_page_dirty(vnode, tracepoint_string("laundered"), - page->index, priv); + bv[0].bv_page = page; + bv[0].bv_offset = f; + bv[0].bv_len = t - f; + iov_iter_bvec(&iter, WRITE, bv, 1, bv[0].bv_len); -#ifdef CONFIG_AFS_FSCACHE - if (PageFsCache(page)) { - fscache_wait_on_page_write(vnode->cache, page); - fscache_uncache_page(vnode->cache, page); + trace_afs_page_dirty(vnode, tracepoint_string("launder"), page); + ret = afs_store_data(vnode, &iter, (loff_t)page->index * PAGE_SIZE, + true); } -#endif + + trace_afs_page_dirty(vnode, tracepoint_string("laundered"), page); + detach_page_private(page); + wait_on_page_fscache(page); return ret; } diff --git a/fs/afs/yfsclient.c b/fs/afs/yfsclient.c index bd787e71a657..2b35cba8ad62 100644 --- a/fs/afs/yfsclient.c +++ b/fs/afs/yfsclient.c @@ -360,22 +360,23 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) struct afs_vnode_param *vp = &op->file[0]; struct afs_read *req = op->fetch.req; const __be32 *bp; - unsigned int size; int ret; - _enter("{%u,%zu/%llu}", - call->unmarshall, iov_iter_count(call->iter), req->actual_len); + _enter("{%u,%zu, %zu/%llu}", + call->unmarshall, call->iov_len, iov_iter_count(call->iter), + req->actual_len); switch (call->unmarshall) { case 0: req->actual_len = 0; - req->index = 0; - req->offset = req->pos & (PAGE_SIZE - 1); afs_extract_to_tmp64(call); call->unmarshall++; fallthrough; - /* extract the returned data length */ + /* Extract the returned data length into ->actual_len. This + * may indicate more or less data than was requested will be + * returned. 
+ */ case 1: _debug("extract data length"); ret = afs_extract_data(call, true); @@ -384,44 +385,25 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) req->actual_len = be64_to_cpu(call->tmp64); _debug("DATA length: %llu", req->actual_len); - req->remain = min(req->len, req->actual_len); - if (req->remain == 0) + + if (req->actual_len == 0) goto no_more_data; + call->iter = req->iter; + call->iov_len = min(req->actual_len, req->len); call->unmarshall++; - - begin_page: - ASSERTCMP(req->index, <, req->nr_pages); - if (req->remain > PAGE_SIZE - req->offset) - size = PAGE_SIZE - req->offset; - else - size = req->remain; - call->bvec[0].bv_len = size; - call->bvec[0].bv_offset = req->offset; - call->bvec[0].bv_page = req->pages[req->index]; - iov_iter_bvec(&call->def_iter, READ, call->bvec, 1, size); - ASSERTCMP(size, <=, PAGE_SIZE); fallthrough; /* extract the returned data */ case 2: _debug("extract data %zu/%llu", - iov_iter_count(call->iter), req->remain); + iov_iter_count(call->iter), req->actual_len); ret = afs_extract_data(call, true); if (ret < 0) return ret; - req->remain -= call->bvec[0].bv_len; - req->offset += call->bvec[0].bv_len; - ASSERTCMP(req->offset, <=, PAGE_SIZE); - if (req->offset == PAGE_SIZE) { - req->offset = 0; - req->index++; - if (req->remain > 0) - goto begin_page; - } - ASSERTCMP(req->remain, ==, 0); + call->iter = &call->def_iter; if (req->actual_len <= req->len) goto no_more_data; @@ -467,17 +449,6 @@ static int yfs_deliver_fs_fetch_data64(struct afs_call *call) break; } - for (; req->index < req->nr_pages; req->index++) { - if (req->offset < PAGE_SIZE) - zero_user_segment(req->pages[req->index], - req->offset, PAGE_SIZE); - req->offset = 0; - } - - if (req->page_done) - for (req->index = 0; req->index < req->nr_pages; req->index++) - req->page_done(req); - _leave(" = 0 [done]"); return 0; } @@ -516,6 +487,8 @@ void yfs_fs_fetch_data(struct afs_operation *op) if (!call) return afs_op_nomem(op); + req->call_debug_id = 
call->debug_id; + /* marshall the parameters */ bp = call->request; bp = xdr_encode_u32(bp, YFSFETCHDATA64); @@ -1102,25 +1075,15 @@ void yfs_fs_store_data(struct afs_operation *op) { struct afs_vnode_param *vp = &op->file[0]; struct afs_call *call; - loff_t size, pos, i_size; __be32 *bp; _enter(",%x,{%llx:%llu},,", key_serial(op->key), vp->fid.vid, vp->fid.vnode); - size = (loff_t)op->store.last_to - (loff_t)op->store.first_offset; - if (op->store.first != op->store.last) - size += (loff_t)(op->store.last - op->store.first) << PAGE_SHIFT; - pos = (loff_t)op->store.first << PAGE_SHIFT; - pos += op->store.first_offset; - - i_size = i_size_read(&vp->vnode->vfs_inode); - if (pos + size > i_size) - i_size = size + pos; - _debug("size %llx, at %llx, i_size %llx", - (unsigned long long)size, (unsigned long long)pos, - (unsigned long long)i_size); + (unsigned long long)op->store.size, + (unsigned long long)op->store.pos, + (unsigned long long)op->store.i_size); call = afs_alloc_flat_call(op->net, &yfs_RXYFSStoreData64, sizeof(__be32) + @@ -1133,8 +1096,7 @@ void yfs_fs_store_data(struct afs_operation *op) if (!call) return afs_op_nomem(op); - call->key = op->key; - call->send_pages = true; + call->write_iter = op->store.write_iter; /* marshall the parameters */ bp = call->request; @@ -1142,9 +1104,9 @@ void yfs_fs_store_data(struct afs_operation *op) bp = xdr_encode_u32(bp, 0); /* RPC flags */ bp = xdr_encode_YFSFid(bp, &vp->fid); bp = xdr_encode_YFSStoreStatus_mtime(bp, &op->mtime); - bp = xdr_encode_u64(bp, pos); - bp = xdr_encode_u64(bp, size); - bp = xdr_encode_u64(bp, i_size); + bp = xdr_encode_u64(bp, op->store.pos); + bp = xdr_encode_u64(bp, op->store.size); + bp = xdr_encode_u64(bp, op->store.i_size); yfs_check_req(call, bp); trace_afs_make_fs_call(call, &vp->fid); @@ -323,16 +323,13 @@ static void aio_free_ring(struct kioctx *ctx) } } -static int aio_ring_mremap(struct vm_area_struct *vma, unsigned long flags) +static int aio_ring_mremap(struct vm_area_struct 
*vma) { struct file *file = vma->vm_file; struct mm_struct *mm = vma->vm_mm; struct kioctx_table *table; int i, res = -EINVAL; - if (flags & MREMAP_DONTUNMAP) - return -EINVAL; - spin_lock(&mm->ioctx_lock); rcu_read_lock(); table = rcu_dereference(mm->ioctx_table); diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 054f97b07754..918826eaceea 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -87,6 +87,7 @@ struct autofs_wait_queue { autofs_wqt_t wait_queue_token; /* We use the following to see what we are waiting for */ struct qstr name; + u32 offset; u32 dev; u64 ino; kuid_t uid; diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c index a1c7701007e7..b3fefd6237c3 100644 --- a/fs/autofs/expire.c +++ b/fs/autofs/expire.c @@ -355,7 +355,7 @@ static struct dentry *should_expire(struct dentry *dentry, return NULL; } - if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { + if (d_is_symlink(dentry)) { pr_debug("checking symlink %p %pd\n", dentry, dentry); /* Forced expire, user space handles busy mounts */ diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c index 5ced859dac53..16b5fca0626e 100644 --- a/fs/autofs/waitq.c +++ b/fs/autofs/waitq.c @@ -30,7 +30,7 @@ void autofs_catatonic_mode(struct autofs_sb_info *sbi) while (wq) { nwq = wq->next; wq->status = -ENOENT; /* Magic is gone - report failure */ - kfree(wq->name.name); + kfree(wq->name.name - wq->offset); wq->name.name = NULL; wq->wait_ctr--; wake_up_interruptible(&wq->queue); @@ -175,51 +175,6 @@ static void autofs_notify_daemon(struct autofs_sb_info *sbi, fput(pipe); } -static int autofs_getpath(struct autofs_sb_info *sbi, - struct dentry *dentry, char *name) -{ - struct dentry *root = sbi->sb->s_root; - struct dentry *tmp; - char *buf; - char *p; - int len; - unsigned seq; - -rename_retry: - buf = name; - len = 0; - - seq = read_seqbegin(&rename_lock); - rcu_read_lock(); - spin_lock(&sbi->fs_lock); - for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) - len += tmp->d_name.len + 
1; - - if (!len || --len > NAME_MAX) { - spin_unlock(&sbi->fs_lock); - rcu_read_unlock(); - if (read_seqretry(&rename_lock, seq)) - goto rename_retry; - return 0; - } - - *(buf + len) = '\0'; - p = buf + len - dentry->d_name.len; - strncpy(p, dentry->d_name.name, dentry->d_name.len); - - for (tmp = dentry->d_parent; tmp != root ; tmp = tmp->d_parent) { - *(--p) = '/'; - p -= tmp->d_name.len; - strncpy(p, tmp->d_name.name, tmp->d_name.len); - } - spin_unlock(&sbi->fs_lock); - rcu_read_unlock(); - if (read_seqretry(&rename_lock, seq)) - goto rename_retry; - - return len; -} - static struct autofs_wait_queue * autofs_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr) { @@ -352,6 +307,7 @@ int autofs_wait(struct autofs_sb_info *sbi, struct qstr qstr; char *name; int status, ret, type; + unsigned int offset = 0; pid_t pid; pid_t tgid; @@ -389,20 +345,23 @@ int autofs_wait(struct autofs_sb_info *sbi, return -ENOMEM; /* If this is a direct mount request create a dummy name */ - if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) + if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) { + qstr.name = name; qstr.len = sprintf(name, "%p", dentry); - else { - qstr.len = autofs_getpath(sbi, dentry, name); - if (!qstr.len) { + } else { + char *p = dentry_path_raw(dentry, name, NAME_MAX); + if (IS_ERR(p)) { kfree(name); return -ENOENT; } + qstr.name = ++p; // skip the leading slash + qstr.len = strlen(p); + offset = p - name; } - qstr.name = name; qstr.hash = full_name_hash(dentry, name, qstr.len); if (mutex_lock_interruptible(&sbi->wq_mutex)) { - kfree(qstr.name); + kfree(name); return -EINTR; } @@ -410,7 +369,7 @@ int autofs_wait(struct autofs_sb_info *sbi, if (ret <= 0) { if (ret != -EINTR) mutex_unlock(&sbi->wq_mutex); - kfree(qstr.name); + kfree(name); return ret; } @@ -418,7 +377,7 @@ int autofs_wait(struct autofs_sb_info *sbi, /* Create a new wait queue */ wq = kmalloc(sizeof(struct autofs_wait_queue), GFP_KERNEL); if (!wq) { - kfree(qstr.name); + 
kfree(name); mutex_unlock(&sbi->wq_mutex); return -ENOMEM; } @@ -430,6 +389,7 @@ int autofs_wait(struct autofs_sb_info *sbi, sbi->queues = wq; init_waitqueue_head(&wq->queue); memcpy(&wq->name, &qstr, sizeof(struct qstr)); + wq->offset = offset; wq->dev = autofs_get_dev(sbi); wq->ino = autofs_get_ino(sbi); wq->uid = current_uid(); @@ -469,7 +429,7 @@ int autofs_wait(struct autofs_sb_info *sbi, (unsigned long) wq->wait_queue_token, wq->name.len, wq->name.name, notify); mutex_unlock(&sbi->wq_mutex); - kfree(qstr.name); + kfree(name); } /* @@ -540,7 +500,7 @@ int autofs_wait_release(struct autofs_sb_info *sbi, } *wql = wq->next; /* Unlink from chain */ - kfree(wq->name.name); + kfree(wq->name.name - wq->offset); wq->name.name = NULL; /* Do not wait on this queue */ wq->status = status; wake_up(&wq->queue); diff --git a/fs/befs/TODO b/fs/befs/TODO deleted file mode 100644 index 3250921aa2e6..000000000000 --- a/fs/befs/TODO +++ /dev/null @@ -1,14 +0,0 @@ -TODO -========== - -* Convert comments to the Kernel-Doc format. - -* Befs_fs.h has gotten big and messy. No reason not to break it up into - smaller peices. - -* See if Alexander Viro's option parser made it into the kernel tree. - Use that if we can. (include/linux/parser.h) - -* See if we really need separate types for on-disk and in-memory - representations of the superblock and inode. 
- diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index b12ba98ae9f5..187b3f2b9202 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -2267,8 +2267,7 @@ static int elf_core_dump(struct coredump_params *cprm) goto end_coredump; /* Align to page */ - if (!dump_skip(cprm, dataoff - cprm->pos)) - goto end_coredump; + dump_skip_to(cprm, dataoff); for (i = 0; i < vma_count; i++) { struct core_vma_metadata *meta = vma_meta + i; @@ -2276,7 +2275,6 @@ static int elf_core_dump(struct coredump_params *cprm) if (!dump_user_range(cprm, meta->start, meta->dump_size)) goto end_coredump; } - dump_truncate(cprm); if (!elf_core_write_extra_data(cprm)) goto end_coredump; diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index 3cfd6cd46f26..2c99b102c860 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1631,8 +1631,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) goto end_coredump; } - if (!dump_skip(cprm, dataoff - cprm->pos)) - goto end_coredump; + dump_skip_to(cprm, dataoff); if (!elf_fdpic_dump_segments(cprm, vma_meta, vma_count)) goto end_coredump; diff --git a/fs/binfmt_flat.c b/fs/binfmt_flat.c index b9c658e0548e..a1072c6a2341 100644 --- a/fs/binfmt_flat.c +++ b/fs/binfmt_flat.c @@ -74,6 +74,12 @@ #define MAX_SHARED_LIBS (1) #endif +#ifdef CONFIG_BINFMT_FLAT_NO_DATA_START_OFFSET +#define DATA_START_OFFSET_WORDS (0) +#else +#define DATA_START_OFFSET_WORDS (MAX_SHARED_LIBS) +#endif + struct lib_info { struct { unsigned long start_code; /* Start of text segment */ @@ -576,7 +582,8 @@ static int load_flat_file(struct linux_binprm *bprm, goto err; } - len = data_len + extra + MAX_SHARED_LIBS * sizeof(unsigned long); + len = data_len + extra + + DATA_START_OFFSET_WORDS * sizeof(unsigned long); len = PAGE_ALIGN(len); realdatastart = vm_mmap(NULL, 0, len, PROT_READ|PROT_WRITE|PROT_EXEC, MAP_PRIVATE, 0); @@ -591,7 +598,7 @@ static int load_flat_file(struct linux_binprm *bprm, goto err; } datapos = ALIGN(realdatastart + - MAX_SHARED_LIBS * 
sizeof(unsigned long), + DATA_START_OFFSET_WORDS * sizeof(unsigned long), FLAT_DATA_ALIGN); pr_debug("Allocated data+bss+stack (%u bytes): %lx\n", @@ -622,7 +629,8 @@ static int load_flat_file(struct linux_binprm *bprm, memp_size = len; } else { - len = text_len + data_len + extra + MAX_SHARED_LIBS * sizeof(u32); + len = text_len + data_len + extra + + DATA_START_OFFSET_WORDS * sizeof(u32); len = PAGE_ALIGN(len); textpos = vm_mmap(NULL, 0, len, PROT_READ | PROT_EXEC | PROT_WRITE, MAP_PRIVATE, 0); @@ -638,7 +646,7 @@ static int load_flat_file(struct linux_binprm *bprm, realdatastart = textpos + ntohl(hdr->data_start); datapos = ALIGN(realdatastart + - MAX_SHARED_LIBS * sizeof(u32), + DATA_START_OFFSET_WORDS * sizeof(u32), FLAT_DATA_ALIGN); reloc = (__be32 __user *) @@ -714,7 +722,7 @@ static int load_flat_file(struct linux_binprm *bprm, ret = result; pr_err("Unable to read code+data+bss, errno %d\n", ret); vm_munmap(textpos, text_len + data_len + extra + - MAX_SHARED_LIBS * sizeof(u32)); + DATA_START_OFFSET_WORDS * sizeof(u32)); goto err; } } diff --git a/fs/block_dev.c b/fs/block_dev.c index 92ed7d5df677..6cc4d4cfe0c2 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -79,7 +79,7 @@ static void kill_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; - if (mapping->nrpages == 0 && mapping->nrexceptional == 0) + if (mapping_empty(mapping)) return; invalidate_bh_lrus(); @@ -275,6 +275,8 @@ __blkdev_direct_IO_simple(struct kiocb *iocb, struct iov_iter *iter, bio.bi_opf = dio_bio_write_op(iocb); task_io_account_write(ret); } + if (iocb->ki_flags & IOCB_NOWAIT) + bio.bi_opf |= REQ_NOWAIT; if (iocb->ki_flags & IOCB_HIPRI) bio_set_polled(&bio, iocb); @@ -428,6 +430,8 @@ static ssize_t __blkdev_direct_IO(struct kiocb *iocb, struct iov_iter *iter, bio->bi_opf = dio_bio_write_op(iocb); task_io_account_write(bio->bi_iter.bi_size); } + if (iocb->ki_flags & IOCB_NOWAIT) + bio->bi_opf |= REQ_NOWAIT; dio->size += bio->bi_iter.bi_size; pos 
+= bio->bi_iter.bi_size; @@ -1236,16 +1240,21 @@ static void __blkdev_put(struct block_device *bdev, fmode_t mode, int for_part); int bdev_disk_changed(struct block_device *bdev, bool invalidate) { struct gendisk *disk = bdev->bd_disk; - int ret; + int ret = 0; lockdep_assert_held(&bdev->bd_mutex); - clear_bit(GD_NEED_PART_SCAN, &bdev->bd_disk->state); + if (!(disk->flags & GENHD_FL_UP)) + return -ENXIO; rescan: - ret = blk_drop_partitions(bdev); - if (ret) - return ret; + if (bdev->bd_part_count) + return -EBUSY; + sync_blockdev(bdev); + invalidate_bdev(bdev); + blk_drop_partitions(disk); + + clear_bit(GD_NEED_PART_SCAN, &disk->state); /* * Historically we only set the capacity to zero for devices that @@ -1259,9 +1268,6 @@ rescan: if (disk_part_scan_enabled(disk) || !(disk->flags & GENHD_FL_REMOVABLE)) set_capacity(disk, 0); - } else { - if (disk->fops->revalidate_disk) - disk->fops->revalidate_disk(disk); } if (get_capacity(disk)) { @@ -1295,6 +1301,9 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) struct gendisk *disk = bdev->bd_disk; int ret = 0; + if (!(disk->flags & GENHD_FL_UP)) + return -ENXIO; + if (!bdev->bd_openers) { if (!bdev_is_partition(bdev)) { ret = 0; @@ -1329,8 +1338,7 @@ static int __blkdev_get(struct block_device *bdev, fmode_t mode) whole->bd_part_count++; mutex_unlock(&whole->bd_mutex); - if (!(disk->flags & GENHD_FL_UP) || - !bdev_nr_sectors(bdev)) { + if (!bdev_nr_sectors(bdev)) { __blkdev_put(whole, mode, 1); bdput(whole); return -ENXIO; @@ -1361,16 +1369,12 @@ struct block_device *blkdev_get_no_open(dev_t dev) struct block_device *bdev; struct gendisk *disk; - down_read(&bdev_lookup_sem); bdev = bdget(dev); if (!bdev) { - up_read(&bdev_lookup_sem); blk_request_module(dev); - down_read(&bdev_lookup_sem); - bdev = bdget(dev); if (!bdev) - goto unlock; + return NULL; } disk = bdev->bd_disk; @@ -1380,14 +1384,11 @@ struct block_device *blkdev_get_no_open(dev_t dev) goto put_disk; if 
(!try_module_get(bdev->bd_disk->fops->owner)) goto put_disk; - up_read(&bdev_lookup_sem); return bdev; put_disk: put_disk(disk); bdput: bdput(bdev); -unlock: - up_read(&bdev_lookup_sem); return NULL; } @@ -1433,10 +1434,6 @@ struct block_device *blkdev_get_by_dev(dev_t dev, fmode_t mode, void *holder) if (ret) return ERR_PTR(ret); - /* - * If we lost a race with 'disk' being deleted, try again. See md.c. - */ -retry: bdev = blkdev_get_no_open(dev); if (!bdev) return ERR_PTR(-ENXIO); @@ -1483,8 +1480,6 @@ abort_claiming: disk_unblock_events(disk); put_blkdev: blkdev_put_no_open(bdev); - if (ret == -ERESTARTSYS) - goto retry; return ERR_PTR(ret); } EXPORT_SYMBOL(blkdev_get_by_dev); @@ -1680,6 +1675,7 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); struct blk_plug plug; + size_t shorted = 0; ssize_t ret; if (bdev_read_only(I_BDEV(bd_inode))) @@ -1697,12 +1693,17 @@ ssize_t blkdev_write_iter(struct kiocb *iocb, struct iov_iter *from) if ((iocb->ki_flags & (IOCB_NOWAIT | IOCB_DIRECT)) == IOCB_NOWAIT) return -EOPNOTSUPP; - iov_iter_truncate(from, size - iocb->ki_pos); + size -= iocb->ki_pos; + if (iov_iter_count(from) > size) { + shorted = iov_iter_count(from) - size; + iov_iter_truncate(from, size); + } blk_start_plug(&plug); ret = __generic_file_write_iter(iocb, from); if (ret > 0) ret = generic_write_sync(iocb, ret); + iov_iter_reexpand(from, iov_iter_count(from) + shorted); blk_finish_plug(&plug); return ret; } @@ -1714,13 +1715,21 @@ ssize_t blkdev_read_iter(struct kiocb *iocb, struct iov_iter *to) struct inode *bd_inode = bdev_file_inode(file); loff_t size = i_size_read(bd_inode); loff_t pos = iocb->ki_pos; + size_t shorted = 0; + ssize_t ret; if (pos >= size) return 0; size -= pos; - iov_iter_truncate(to, size); - return generic_file_read_iter(iocb, to); + if (iov_iter_count(to) > size) { + shorted = iov_iter_count(to) - size; + iov_iter_truncate(to, size); 
+ } + + ret = generic_file_read_iter(iocb, to); + iov_iter_reexpand(to, iov_iter_count(to) + shorted); + return ret; } EXPORT_SYMBOL_GPL(blkdev_read_iter); diff --git a/fs/btrfs/Makefile b/fs/btrfs/Makefile index b634c42115ea..cec88a66bd6c 100644 --- a/fs/btrfs/Makefile +++ b/fs/btrfs/Makefile @@ -7,10 +7,12 @@ subdir-ccflags-y += -Wmissing-format-attribute subdir-ccflags-y += -Wmissing-prototypes subdir-ccflags-y += -Wold-style-definition subdir-ccflags-y += -Wmissing-include-dirs -subdir-ccflags-y += $(call cc-option, -Wunused-but-set-variable) -subdir-ccflags-y += $(call cc-option, -Wunused-const-variable) -subdir-ccflags-y += $(call cc-option, -Wpacked-not-aligned) -subdir-ccflags-y += $(call cc-option, -Wstringop-truncation) +condflags := \ + $(call cc-option, -Wunused-but-set-variable) \ + $(call cc-option, -Wunused-const-variable) \ + $(call cc-option, -Wpacked-not-aligned) \ + $(call cc-option, -Wstringop-truncation) +subdir-ccflags-y += $(condflags) # The following turn off the warnings enabled by -Wextra subdir-ccflags-y += -Wno-missing-field-initializers subdir-ccflags-y += -Wno-sign-compare @@ -28,7 +30,7 @@ btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \ reada.o backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \ uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \ block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \ - subpage.o + subpage.o tree-mod-log.o btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index f47c1528eb9a..117d423fdb93 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -14,6 +14,7 @@ #include "delayed-ref.h" #include "locking.h" #include "misc.h" +#include "tree-mod-log.h" /* Just an arbitrary number so we can be sure this happened */ #define BACKREF_FOUND_SHARED 6 @@ -452,7 +453,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path 
*path, if (path->slots[0] >= btrfs_header_nritems(eb) || is_shared_data_backref(preftrees, eb->start) || ref->root_id != btrfs_header_owner(eb)) { - if (time_seq == SEQ_LAST) + if (time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_leaf(root, path); else ret = btrfs_next_old_leaf(root, path, time_seq); @@ -476,7 +477,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, if (slot == 0 && (is_shared_data_backref(preftrees, eb->start) || ref->root_id != btrfs_header_owner(eb))) { - if (time_seq == SEQ_LAST) + if (time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_leaf(root, path); else ret = btrfs_next_old_leaf(root, path, time_seq); @@ -514,7 +515,7 @@ static int add_all_parents(struct btrfs_root *root, struct btrfs_path *path, eie = NULL; } next: - if (time_seq == SEQ_LAST) + if (time_seq == BTRFS_SEQ_LAST) ret = btrfs_next_item(root, path); else ret = btrfs_next_old_item(root, path, time_seq); @@ -574,7 +575,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, if (path->search_commit_root) root_level = btrfs_header_level(root->commit_root); - else if (time_seq == SEQ_LAST) + else if (time_seq == BTRFS_SEQ_LAST) root_level = btrfs_header_level(root->node); else root_level = btrfs_old_root_level(root, time_seq); @@ -605,7 +606,7 @@ static int resolve_indirect_ref(struct btrfs_fs_info *fs_info, search_key.offset >= LLONG_MAX) search_key.offset = 0; path->lowest_level = level; - if (time_seq == SEQ_LAST) + if (time_seq == BTRFS_SEQ_LAST) ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); else ret = btrfs_search_old_slot(root, &search_key, path, time_seq); @@ -1147,8 +1148,8 @@ static int add_keyed_refs(struct btrfs_fs_info *fs_info, * indirect refs to their parent bytenr. 
* When roots are found, they're added to the roots list * - * If time_seq is set to SEQ_LAST, it will not search delayed_refs, and behave - * much like trans == NULL case, the difference only lies in it will not + * If time_seq is set to BTRFS_SEQ_LAST, it will not search delayed_refs, and + * behave much like trans == NULL case, the difference only lies in it will not * commit root. * The special case is for qgroup to search roots in commit_transaction(). * @@ -1199,7 +1200,7 @@ static int find_parent_nodes(struct btrfs_trans_handle *trans, path->skip_locking = 1; } - if (time_seq == SEQ_LAST) + if (time_seq == BTRFS_SEQ_LAST) path->skip_locking = 1; /* @@ -1217,9 +1218,9 @@ again: #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS if (trans && likely(trans->type != __TRANS_DUMMY) && - time_seq != SEQ_LAST) { + time_seq != BTRFS_SEQ_LAST) { #else - if (trans && time_seq != SEQ_LAST) { + if (trans && time_seq != BTRFS_SEQ_LAST) { #endif /* * look if there are updates for this ref queued and lock the @@ -1527,7 +1528,7 @@ int btrfs_check_shared(struct btrfs_root *root, u64 inum, u64 bytenr, struct btrfs_trans_handle *trans; struct ulist_iterator uiter; struct ulist_node *node; - struct seq_list elem = SEQ_LIST_INIT(elem); + struct btrfs_seq_list elem = BTRFS_SEQ_LIST_INIT(elem); int ret = 0; struct share_check shared = { .root_objectid = root->root_key.objectid, @@ -1953,7 +1954,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, struct ulist *roots = NULL; struct ulist_node *ref_node = NULL; struct ulist_node *root_node = NULL; - struct seq_list tree_mod_seq_elem = SEQ_LIST_INIT(tree_mod_seq_elem); + struct btrfs_seq_list seq_elem = BTRFS_SEQ_LIST_INIT(seq_elem); struct ulist_iterator ref_uiter; struct ulist_iterator root_uiter; @@ -1971,12 +1972,12 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, } if (trans) - btrfs_get_tree_mod_seq(fs_info, &tree_mod_seq_elem); + btrfs_get_tree_mod_seq(fs_info, &seq_elem); else down_read(&fs_info->commit_root_sem); ret 
= btrfs_find_all_leafs(trans, fs_info, extent_item_objectid, - tree_mod_seq_elem.seq, &refs, + seq_elem.seq, &refs, &extent_item_pos, ignore_offset); if (ret) goto out; @@ -1984,7 +1985,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, ULIST_ITER_INIT(&ref_uiter); while (!ret && (ref_node = ulist_next(refs, &ref_uiter))) { ret = btrfs_find_all_roots_safe(trans, fs_info, ref_node->val, - tree_mod_seq_elem.seq, &roots, + seq_elem.seq, &roots, ignore_offset); if (ret) break; @@ -2007,7 +2008,7 @@ int iterate_extent_inodes(struct btrfs_fs_info *fs_info, free_leaf_list(refs); out: if (trans) { - btrfs_put_tree_mod_seq(fs_info, &tree_mod_seq_elem); + btrfs_put_tree_mod_seq(fs_info, &seq_elem); btrfs_end_transaction(trans); } else { up_read(&fs_info->commit_root_sem); diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index 744b99ddc28c..aa57bdc8fc89 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -1289,7 +1289,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * Long running balances can keep us blocked here for eternity, so * simply skip deletion if we're unable to get the mutex. 
*/ - if (!mutex_trylock(&fs_info->delete_unused_bgs_mutex)) + if (!mutex_trylock(&fs_info->reclaim_bgs_lock)) return; spin_lock(&fs_info->unused_bgs_lock); @@ -1462,12 +1462,12 @@ next: spin_lock(&fs_info->unused_bgs_lock); } spin_unlock(&fs_info->unused_bgs_lock); - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); return; flip_async: btrfs_end_transaction(trans); - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_put_block_group(block_group); btrfs_discard_punt_unused_bgs_list(fs_info); } @@ -1485,6 +1485,97 @@ void btrfs_mark_bg_unused(struct btrfs_block_group *bg) spin_unlock(&fs_info->unused_bgs_lock); } +void btrfs_reclaim_bgs_work(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info = + container_of(work, struct btrfs_fs_info, reclaim_bgs_work); + struct btrfs_block_group *bg; + struct btrfs_space_info *space_info; + int ret; + + if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags)) + return; + + if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) + return; + + mutex_lock(&fs_info->reclaim_bgs_lock); + spin_lock(&fs_info->unused_bgs_lock); + while (!list_empty(&fs_info->reclaim_bgs)) { + bg = list_first_entry(&fs_info->reclaim_bgs, + struct btrfs_block_group, + bg_list); + list_del_init(&bg->bg_list); + + space_info = bg->space_info; + spin_unlock(&fs_info->unused_bgs_lock); + + /* Don't race with allocators so take the groups_sem */ + down_write(&space_info->groups_sem); + + spin_lock(&bg->lock); + if (bg->reserved || bg->pinned || bg->ro) { + /* + * We want to bail if we made new allocations or have + * outstanding allocations in this block group. We do + * the ro check in case balance is currently acting on + * this block group. 
+ */ + spin_unlock(&bg->lock); + up_write(&space_info->groups_sem); + goto next; + } + spin_unlock(&bg->lock); + + /* Get out fast, in case we're unmounting the filesystem */ + if (btrfs_fs_closing(fs_info)) { + up_write(&space_info->groups_sem); + goto next; + } + + ret = inc_block_group_ro(bg, 0); + up_write(&space_info->groups_sem); + if (ret < 0) + goto next; + + btrfs_info(fs_info, "reclaiming chunk %llu with %llu%% used", + bg->start, div_u64(bg->used * 100, bg->length)); + trace_btrfs_reclaim_block_group(bg); + ret = btrfs_relocate_chunk(fs_info, bg->start); + if (ret) + btrfs_err(fs_info, "error relocating chunk %llu", + bg->start); + +next: + btrfs_put_block_group(bg); + spin_lock(&fs_info->unused_bgs_lock); + } + spin_unlock(&fs_info->unused_bgs_lock); + mutex_unlock(&fs_info->reclaim_bgs_lock); + btrfs_exclop_finish(fs_info); +} + +void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info) +{ + spin_lock(&fs_info->unused_bgs_lock); + if (!list_empty(&fs_info->reclaim_bgs)) + queue_work(system_unbound_wq, &fs_info->reclaim_bgs_work); + spin_unlock(&fs_info->unused_bgs_lock); +} + +void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + + spin_lock(&fs_info->unused_bgs_lock); + if (list_empty(&bg->bg_list)) { + btrfs_get_block_group(bg); + trace_btrfs_add_reclaim_block_group(bg); + list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs); + } + spin_unlock(&fs_info->unused_bgs_lock); +} + static int read_bg_from_eb(struct btrfs_fs_info *fs_info, struct btrfs_key *key, struct btrfs_path *path) { @@ -2267,29 +2358,33 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, struct btrfs_trans_handle *trans; u64 alloc_flags; int ret; + bool dirty_bg_running; -again: - trans = btrfs_join_transaction(fs_info->extent_root); - if (IS_ERR(trans)) - return PTR_ERR(trans); + do { + trans = btrfs_join_transaction(fs_info->extent_root); + if (IS_ERR(trans)) + return PTR_ERR(trans); - /* - * we're not allowed to set 
block groups readonly after the dirty - * block groups cache has started writing. If it already started, - * back off and let this transaction commit - */ - mutex_lock(&fs_info->ro_block_group_mutex); - if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { - u64 transid = trans->transid; + dirty_bg_running = false; - mutex_unlock(&fs_info->ro_block_group_mutex); - btrfs_end_transaction(trans); + /* + * We're not allowed to set block groups readonly after the dirty + * block group cache has started writing. If it already started, + * back off and let this transaction commit. + */ + mutex_lock(&fs_info->ro_block_group_mutex); + if (test_bit(BTRFS_TRANS_DIRTY_BG_RUN, &trans->transaction->flags)) { + u64 transid = trans->transid; - ret = btrfs_wait_for_commit(fs_info, transid); - if (ret) - return ret; - goto again; - } + mutex_unlock(&fs_info->ro_block_group_mutex); + btrfs_end_transaction(trans); + + ret = btrfs_wait_for_commit(fs_info, transid); + if (ret) + return ret; + dirty_bg_running = true; + } + } while (dirty_bg_running); if (do_chunk_alloc) { /* @@ -3269,6 +3364,7 @@ static u64 get_profile_num_devs(struct btrfs_fs_info *fs_info, u64 type) */ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) { + struct btrfs_transaction *cur_trans = trans->transaction; struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_space_info *info; u64 left; @@ -3283,6 +3379,7 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) lockdep_assert_held(&fs_info->chunk_mutex); info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_SYSTEM); +again: spin_lock(&info->lock); left = info->total_bytes - btrfs_space_info_used(info, true); spin_unlock(&info->lock); @@ -3301,6 +3398,58 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) if (left < thresh) { u64 flags = btrfs_system_alloc_profile(fs_info); + u64 reserved = atomic64_read(&cur_trans->chunk_bytes_reserved); + + /* + * If there's not available space for the 
chunk tree (system + * space) and there are other tasks that reserved space for + * creating a new system block group, wait for them to complete + * the creation of their system block group and release excess + * reserved space. We do this because: + * + * *) We can end up allocating more system chunks than necessary + * when there are multiple tasks that are concurrently + * allocating block groups, which can lead to exhaustion of + * the system array in the superblock; + * + * *) If we allocate extra and unnecessary system block groups, + * despite being empty for a long time, and possibly forever, + * they end not being added to the list of unused block groups + * because that typically happens only when deallocating the + * last extent from a block group - which never happens since + * we never allocate from them in the first place. The few + * exceptions are when mounting a filesystem or running scrub, + * which add unused block groups to the list of unused block + * groups, to be deleted by the cleaner kthread. + * And even when they are added to the list of unused block + * groups, it can take a long time until they get deleted, + * since the cleaner kthread might be sleeping or busy with + * other work (deleting subvolumes, running delayed iputs, + * defrag scheduling, etc); + * + * This is rare in practice, but can happen when too many tasks + * are allocating blocks groups in parallel (via fallocate()) + * and before the one that reserved space for a new system block + * group finishes the block group creation and releases the space + * reserved in excess (at btrfs_create_pending_block_groups()), + * other tasks end up here and see free system space temporarily + * not enough for updating the chunk tree. + * + * We unlock the chunk mutex before waiting for such tasks and + * lock it again after the wait, otherwise we would deadlock. + * It is safe to do so because allocating a system chunk is the + * first thing done while allocating a new block group. 
+ */ + if (reserved > trans->chunk_bytes_reserved) { + const u64 min_needed = reserved - thresh; + + mutex_unlock(&fs_info->chunk_mutex); + wait_event(cur_trans->chunk_reserve_wait, + atomic64_read(&cur_trans->chunk_bytes_reserved) <= + min_needed); + mutex_lock(&fs_info->chunk_mutex); + goto again; + } /* * Ignore failure to create system chunk. We might end up not @@ -3315,8 +3464,10 @@ void check_system_chunk(struct btrfs_trans_handle *trans, u64 type) ret = btrfs_block_rsv_add(fs_info->chunk_root, &fs_info->chunk_block_rsv, thresh, BTRFS_RESERVE_NO_FLUSH); - if (!ret) + if (!ret) { + atomic64_add(thresh, &cur_trans->chunk_bytes_reserved); trans->chunk_bytes_reserved += thresh; + } } } @@ -3386,6 +3537,16 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) } spin_unlock(&info->unused_bgs_lock); + spin_lock(&info->unused_bgs_lock); + while (!list_empty(&info->reclaim_bgs)) { + block_group = list_first_entry(&info->reclaim_bgs, + struct btrfs_block_group, + bg_list); + list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); + } + spin_unlock(&info->unused_bgs_lock); + spin_lock(&info->block_group_cache_lock); while ((n = rb_last(&info->block_group_cache_tree)) != NULL) { block_group = rb_entry(n, struct btrfs_block_group, diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 3ecc3372a5ce..7b927425dc71 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -264,6 +264,9 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans, u64 group_start, struct extent_map *em); void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info); void btrfs_mark_bg_unused(struct btrfs_block_group *bg); +void btrfs_reclaim_bgs_work(struct work_struct *work); +void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info); +void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg); int btrfs_read_block_groups(struct btrfs_fs_info *info); int btrfs_make_block_group(struct btrfs_trans_handle *trans, u64 bytes_used, u64 type, u64 
chunk_offset, u64 size); diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index 28e202e89660..c652e19ad74e 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -220,6 +220,7 @@ struct btrfs_inode { /* Hook into fs_info->delayed_iputs */ struct list_head delayed_iput; + struct rw_semaphore i_mmap_lock; struct inode vfs_inode; }; @@ -299,24 +300,30 @@ static inline void btrfs_mod_outstanding_extents(struct btrfs_inode *inode, mod); } -static inline int btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) +/* + * Called every time after doing a buffered, direct IO or memory mapped write. + * + * This is to ensure that if we write to a file that was previously fsynced in + * the current transaction, then try to fsync it again in the same transaction, + * we will know that there were changes in the file and that it needs to be + * logged. + */ +static inline void btrfs_set_inode_last_sub_trans(struct btrfs_inode *inode) +{ + spin_lock(&inode->lock); + inode->last_sub_trans = inode->root->log_transid; + spin_unlock(&inode->lock); +} + +static inline bool btrfs_inode_in_log(struct btrfs_inode *inode, u64 generation) { - int ret = 0; + bool ret = false; spin_lock(&inode->lock); if (inode->logged_trans == generation && inode->last_sub_trans <= inode->last_log_commit && - inode->last_sub_trans <= inode->root->last_log_commit) { - /* - * After a ranged fsync we might have left some extent maps - * (that fall outside the fsync's range). So return false - * here if the list isn't empty, to make sure btrfs_log_inode() - * will be called and process those extent maps. 
- */ - smp_mb(); - if (list_empty(&inode->extent_tree.modified_extents)) - ret = 1; - } + inode->last_sub_trans <= inode->root->last_log_commit) + ret = true; spin_unlock(&inode->lock); return ret; } diff --git a/fs/btrfs/check-integrity.c b/fs/btrfs/check-integrity.c index 113cb85c1fd4..169508609324 100644 --- a/fs/btrfs/check-integrity.c +++ b/fs/btrfs/check-integrity.c @@ -1555,10 +1555,11 @@ static void btrfsic_release_block_ctx(struct btrfsic_block_data_ctx *block_ctx) BUG_ON(!block_ctx->pagev); num_pages = (block_ctx->len + (u64)PAGE_SIZE - 1) >> PAGE_SHIFT; + /* Pages must be unmapped in reverse order */ while (num_pages > 0) { num_pages--; if (block_ctx->datav[num_pages]) { - kunmap(block_ctx->pagev[num_pages]); + kunmap_local(block_ctx->datav[num_pages]); block_ctx->datav[num_pages] = NULL; } if (block_ctx->pagev[num_pages]) { @@ -1637,7 +1638,7 @@ static int btrfsic_read_block(struct btrfsic_state *state, i = j; } for (i = 0; i < num_pages; i++) - block_ctx->datav[i] = kmap(block_ctx->pagev[i]); + block_ctx->datav[i] = kmap_local_page(block_ctx->pagev[i]); return block_ctx->len; } @@ -2677,7 +2678,7 @@ static void __btrfsic_submit_bio(struct bio *bio) dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev); if (NULL != dev_state && (bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) { - unsigned int i = 0; + int i = 0; u64 dev_bytenr; u64 cur_bytenr; struct bio_vec bvec; @@ -2702,7 +2703,7 @@ static void __btrfsic_submit_bio(struct bio *bio) bio_for_each_segment(bvec, bio, iter) { BUG_ON(bvec.bv_len != PAGE_SIZE); - mapped_datav[i] = kmap(bvec.bv_page); + mapped_datav[i] = kmap_local_page(bvec.bv_page); i++; if (dev_state->state->print_mask & @@ -2715,8 +2716,9 @@ static void __btrfsic_submit_bio(struct bio *bio) mapped_datav, segs, bio, &bio_is_patched, bio->bi_opf); - bio_for_each_segment(bvec, bio, iter) - kunmap(bvec.bv_page); + /* Unmap in reverse order */ + for (--i; i >= 0; i--) + kunmap_local(mapped_datav[i]); kfree(mapped_datav); } else if 
(NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) { if (dev_state->state->print_mask & diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 3f4c832abfed..1346d698463a 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -28,6 +28,7 @@ #include "compression.h" #include "extent_io.h" #include "extent_map.h" +#include "zoned.h" static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" }; @@ -80,10 +81,15 @@ static int compression_compress_pages(int type, struct list_head *ws, case BTRFS_COMPRESS_NONE: default: /* - * This can't happen, the type is validated several times - * before we get here. As a sane fallback, return what the - * callers will understand as 'no compression happened'. + * This can happen when compression races with remount setting + * it to 'no compress', while caller doesn't call + * inode_need_compress() to check if we really need to + * compress. + * + * Not a big deal, just need to inform caller that we + * haven't allocated any pages yet. */ + *out_pages = 0; return -E2BIG; } } @@ -344,6 +350,7 @@ static void end_compressed_bio_write(struct bio *bio) */ inode = cb->inode; cb->compressed_pages[0]->mapping = cb->inode->i_mapping; + btrfs_record_physical_zoned(inode, cb->start, bio); btrfs_writepage_endio_finish_ordered(cb->compressed_pages[0], cb->start, cb->start + cb->len - 1, bio->bi_status == BLK_STS_OK); @@ -396,6 +403,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, u64 first_byte = disk_start; blk_status_t ret; int skip_sum = inode->flags & BTRFS_INODE_NODATASUM; + const bool use_append = btrfs_use_zone_append(inode, disk_start); + const unsigned int bio_op = use_append ? 
REQ_OP_ZONE_APPEND : REQ_OP_WRITE; WARN_ON(!PAGE_ALIGNED(start)); cb = kmalloc(compressed_bio_size(fs_info, compressed_len), GFP_NOFS); @@ -413,10 +422,31 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, cb->nr_pages = nr_pages; bio = btrfs_bio_alloc(first_byte); - bio->bi_opf = REQ_OP_WRITE | write_flags; + bio->bi_opf = bio_op | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; + if (use_append) { + struct extent_map *em; + struct map_lookup *map; + struct block_device *bdev; + + em = btrfs_get_chunk_map(fs_info, disk_start, PAGE_SIZE); + if (IS_ERR(em)) { + kfree(cb); + bio_put(bio); + return BLK_STS_NOTSUPP; + } + + map = em->map_lookup; + /* We only support single profile for now */ + ASSERT(map->num_stripes == 1); + bdev = map->stripes[0].dev->bdev; + + bio_set_dev(bio, bdev); + free_extent_map(em); + } + if (blkcg_css) { bio->bi_opf |= REQ_CGROUP_PUNT; kthread_associate_blkcg(blkcg_css); @@ -427,6 +457,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, bytes_left = compressed_len; for (pg_index = 0; pg_index < cb->nr_pages; pg_index++) { int submit = 0; + int len = 0; page = compressed_pages[pg_index]; page->mapping = inode->vfs_inode.i_mapping; @@ -434,9 +465,20 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, submit = btrfs_bio_fits_in_stripe(page, PAGE_SIZE, bio, 0); + /* + * Page can only be added to bio if the current bio fits in + * stripe. 
+ */ + if (!submit) { + if (pg_index == 0 && use_append) + len = bio_add_zone_append_page(bio, page, + PAGE_SIZE, 0); + else + len = bio_add_page(bio, page, PAGE_SIZE, 0); + } + page->mapping = NULL; - if (submit || bio_add_page(bio, page, PAGE_SIZE, 0) < - PAGE_SIZE) { + if (submit || len < PAGE_SIZE) { /* * inc the count before we submit the bio so * we know the end IO handler won't happen before @@ -460,11 +502,15 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start, } bio = btrfs_bio_alloc(first_byte); - bio->bi_opf = REQ_OP_WRITE | write_flags; + bio->bi_opf = bio_op | write_flags; bio->bi_private = cb; bio->bi_end_io = end_compressed_bio_write; if (blkcg_css) bio->bi_opf |= REQ_CGROUP_PUNT; + /* + * Use bio_add_page() to ensure the bio has at least one + * page. + */ bio_add_page(bio, page, PAGE_SIZE, 0); } if (bytes_left < PAGE_SIZE) { @@ -586,16 +632,13 @@ static noinline int add_ra_bio_pages(struct inode *inode, free_extent_map(em); if (page->index == end_index) { - char *userpage; size_t zero_offset = offset_in_page(isize); if (zero_offset) { int zeros; zeros = PAGE_SIZE - zero_offset; - userpage = kmap_atomic(page); - memset(userpage + zero_offset, 0, zeros); + memzero_page(page, zero_offset, zeros); flush_dcache_page(page); - kunmap_atomic(userpage); } } @@ -1611,7 +1654,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, curr_sample_pos = 0; while (index < index_end) { page = find_get_page(inode->i_mapping, index); - in_data = kmap(page); + in_data = kmap_local_page(page); /* Handle case where the start is not aligned to PAGE_SIZE */ i = start % PAGE_SIZE; while (i < PAGE_SIZE - SAMPLING_READ_SIZE) { @@ -1624,7 +1667,7 @@ static void heuristic_collect_sample(struct inode *inode, u64 start, u64 end, start += SAMPLING_INTERVAL; curr_sample_pos += SAMPLING_READ_SIZE; } - kunmap(page); + kunmap_local(in_data); put_page(page); index++; diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 
34b929bd5c1a..a484fb72a01f 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -14,6 +14,7 @@ #include "locking.h" #include "volumes.h" #include "qgroup.h" +#include "tree-mod-log.h" static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_path *path, int level); @@ -233,597 +234,6 @@ int btrfs_copy_root(struct btrfs_trans_handle *trans, return 0; } -enum mod_log_op { - MOD_LOG_KEY_REPLACE, - MOD_LOG_KEY_ADD, - MOD_LOG_KEY_REMOVE, - MOD_LOG_KEY_REMOVE_WHILE_FREEING, - MOD_LOG_KEY_REMOVE_WHILE_MOVING, - MOD_LOG_MOVE_KEYS, - MOD_LOG_ROOT_REPLACE, -}; - -struct tree_mod_root { - u64 logical; - u8 level; -}; - -struct tree_mod_elem { - struct rb_node node; - u64 logical; - u64 seq; - enum mod_log_op op; - - /* this is used for MOD_LOG_KEY_* and MOD_LOG_MOVE_KEYS operations */ - int slot; - - /* this is used for MOD_LOG_KEY* and MOD_LOG_ROOT_REPLACE */ - u64 generation; - - /* those are used for op == MOD_LOG_KEY_{REPLACE,REMOVE} */ - struct btrfs_disk_key key; - u64 blockptr; - - /* this is used for op == MOD_LOG_MOVE_KEYS */ - struct { - int dst_slot; - int nr_items; - } move; - - /* this is used for op == MOD_LOG_ROOT_REPLACE */ - struct tree_mod_root old_root; -}; - -/* - * Pull a new tree mod seq number for our operation. - */ -static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) -{ - return atomic64_inc_return(&fs_info->tree_mod_seq); -} - -/* - * This adds a new blocker to the tree mod log's blocker list if the @elem - * passed does not already have a sequence number set. So when a caller expects - * to record tree modifications, it should ensure to set elem->seq to zero - * before calling btrfs_get_tree_mod_seq. - * Returns a fresh, unused tree log modification sequence number, even if no new - * blocker was added. 
- */ -u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, - struct seq_list *elem) -{ - write_lock(&fs_info->tree_mod_log_lock); - if (!elem->seq) { - elem->seq = btrfs_inc_tree_mod_seq(fs_info); - list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); - } - write_unlock(&fs_info->tree_mod_log_lock); - - return elem->seq; -} - -void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, - struct seq_list *elem) -{ - struct rb_root *tm_root; - struct rb_node *node; - struct rb_node *next; - struct tree_mod_elem *tm; - u64 min_seq = (u64)-1; - u64 seq_putting = elem->seq; - - if (!seq_putting) - return; - - write_lock(&fs_info->tree_mod_log_lock); - list_del(&elem->list); - elem->seq = 0; - - if (!list_empty(&fs_info->tree_mod_seq_list)) { - struct seq_list *first; - - first = list_first_entry(&fs_info->tree_mod_seq_list, - struct seq_list, list); - if (seq_putting > first->seq) { - /* - * Blocker with lower sequence number exists, we - * cannot remove anything from the log. - */ - write_unlock(&fs_info->tree_mod_log_lock); - return; - } - min_seq = first->seq; - } - - /* - * anything that's lower than the lowest existing (read: blocked) - * sequence number can be removed from the tree. - */ - tm_root = &fs_info->tree_mod_log; - for (node = rb_first(tm_root); node; node = next) { - next = rb_next(node); - tm = rb_entry(node, struct tree_mod_elem, node); - if (tm->seq >= min_seq) - continue; - rb_erase(node, tm_root); - kfree(tm); - } - write_unlock(&fs_info->tree_mod_log_lock); -} - -/* - * key order of the log: - * node/leaf start address -> sequence - * - * The 'start address' is the logical address of the *new* root node - * for root replace operations, or the logical address of the affected - * block for all other operations. 
- */ -static noinline int -__tree_mod_log_insert(struct btrfs_fs_info *fs_info, struct tree_mod_elem *tm) -{ - struct rb_root *tm_root; - struct rb_node **new; - struct rb_node *parent = NULL; - struct tree_mod_elem *cur; - - lockdep_assert_held_write(&fs_info->tree_mod_log_lock); - - tm->seq = btrfs_inc_tree_mod_seq(fs_info); - - tm_root = &fs_info->tree_mod_log; - new = &tm_root->rb_node; - while (*new) { - cur = rb_entry(*new, struct tree_mod_elem, node); - parent = *new; - if (cur->logical < tm->logical) - new = &((*new)->rb_left); - else if (cur->logical > tm->logical) - new = &((*new)->rb_right); - else if (cur->seq < tm->seq) - new = &((*new)->rb_left); - else if (cur->seq > tm->seq) - new = &((*new)->rb_right); - else - return -EEXIST; - } - - rb_link_node(&tm->node, parent, new); - rb_insert_color(&tm->node, tm_root); - return 0; -} - -/* - * Determines if logging can be omitted. Returns 1 if it can. Otherwise, it - * returns zero with the tree_mod_log_lock acquired. The caller must hold - * this until all tree mod log insertions are recorded in the rb tree and then - * write unlock fs_info::tree_mod_log_lock. - */ -static inline int tree_mod_dont_log(struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) { - smp_mb(); - if (list_empty(&(fs_info)->tree_mod_seq_list)) - return 1; - if (eb && btrfs_header_level(eb) == 0) - return 1; - - write_lock(&fs_info->tree_mod_log_lock); - if (list_empty(&(fs_info)->tree_mod_seq_list)) { - write_unlock(&fs_info->tree_mod_log_lock); - return 1; - } - - return 0; -} - -/* Similar to tree_mod_dont_log, but doesn't acquire any locks. 
*/ -static inline int tree_mod_need_log(const struct btrfs_fs_info *fs_info, - struct extent_buffer *eb) -{ - smp_mb(); - if (list_empty(&(fs_info)->tree_mod_seq_list)) - return 0; - if (eb && btrfs_header_level(eb) == 0) - return 0; - - return 1; -} - -static struct tree_mod_elem * -alloc_tree_mod_elem(struct extent_buffer *eb, int slot, - enum mod_log_op op, gfp_t flags) -{ - struct tree_mod_elem *tm; - - tm = kzalloc(sizeof(*tm), flags); - if (!tm) - return NULL; - - tm->logical = eb->start; - if (op != MOD_LOG_KEY_ADD) { - btrfs_node_key(eb, &tm->key, slot); - tm->blockptr = btrfs_node_blockptr(eb, slot); - } - tm->op = op; - tm->slot = slot; - tm->generation = btrfs_node_ptr_generation(eb, slot); - RB_CLEAR_NODE(&tm->node); - - return tm; -} - -static noinline int tree_mod_log_insert_key(struct extent_buffer *eb, int slot, - enum mod_log_op op, gfp_t flags) -{ - struct tree_mod_elem *tm; - int ret; - - if (!tree_mod_need_log(eb->fs_info, eb)) - return 0; - - tm = alloc_tree_mod_elem(eb, slot, op, flags); - if (!tm) - return -ENOMEM; - - if (tree_mod_dont_log(eb->fs_info, eb)) { - kfree(tm); - return 0; - } - - ret = __tree_mod_log_insert(eb->fs_info, tm); - write_unlock(&eb->fs_info->tree_mod_log_lock); - if (ret) - kfree(tm); - - return ret; -} - -static noinline int tree_mod_log_insert_move(struct extent_buffer *eb, - int dst_slot, int src_slot, int nr_items) -{ - struct tree_mod_elem *tm = NULL; - struct tree_mod_elem **tm_list = NULL; - int ret = 0; - int i; - int locked = 0; - - if (!tree_mod_need_log(eb->fs_info, eb)) - return 0; - - tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS); - if (!tm_list) - return -ENOMEM; - - tm = kzalloc(sizeof(*tm), GFP_NOFS); - if (!tm) { - ret = -ENOMEM; - goto free_tms; - } - - tm->logical = eb->start; - tm->slot = src_slot; - tm->move.dst_slot = dst_slot; - tm->move.nr_items = nr_items; - tm->op = MOD_LOG_MOVE_KEYS; - - for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { - tm_list[i] = 
alloc_tree_mod_elem(eb, i + dst_slot, - MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); - if (!tm_list[i]) { - ret = -ENOMEM; - goto free_tms; - } - } - - if (tree_mod_dont_log(eb->fs_info, eb)) - goto free_tms; - locked = 1; - - /* - * When we override something during the move, we log these removals. - * This can only happen when we move towards the beginning of the - * buffer, i.e. dst_slot < src_slot. - */ - for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { - ret = __tree_mod_log_insert(eb->fs_info, tm_list[i]); - if (ret) - goto free_tms; - } - - ret = __tree_mod_log_insert(eb->fs_info, tm); - if (ret) - goto free_tms; - write_unlock(&eb->fs_info->tree_mod_log_lock); - kfree(tm_list); - - return 0; -free_tms: - for (i = 0; i < nr_items; i++) { - if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) - rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log); - kfree(tm_list[i]); - } - if (locked) - write_unlock(&eb->fs_info->tree_mod_log_lock); - kfree(tm_list); - kfree(tm); - - return ret; -} - -static inline int -__tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, - struct tree_mod_elem **tm_list, - int nritems) -{ - int i, j; - int ret; - - for (i = nritems - 1; i >= 0; i--) { - ret = __tree_mod_log_insert(fs_info, tm_list[i]); - if (ret) { - for (j = nritems - 1; j > i; j--) - rb_erase(&tm_list[j]->node, - &fs_info->tree_mod_log); - return ret; - } - } - - return 0; -} - -static noinline int tree_mod_log_insert_root(struct extent_buffer *old_root, - struct extent_buffer *new_root, int log_removal) -{ - struct btrfs_fs_info *fs_info = old_root->fs_info; - struct tree_mod_elem *tm = NULL; - struct tree_mod_elem **tm_list = NULL; - int nritems = 0; - int ret = 0; - int i; - - if (!tree_mod_need_log(fs_info, NULL)) - return 0; - - if (log_removal && btrfs_header_level(old_root) > 0) { - nritems = btrfs_header_nritems(old_root); - tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), - GFP_NOFS); - if (!tm_list) { - ret = -ENOMEM; - goto free_tms; - } 
- for (i = 0; i < nritems; i++) { - tm_list[i] = alloc_tree_mod_elem(old_root, i, - MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); - if (!tm_list[i]) { - ret = -ENOMEM; - goto free_tms; - } - } - } - - tm = kzalloc(sizeof(*tm), GFP_NOFS); - if (!tm) { - ret = -ENOMEM; - goto free_tms; - } - - tm->logical = new_root->start; - tm->old_root.logical = old_root->start; - tm->old_root.level = btrfs_header_level(old_root); - tm->generation = btrfs_header_generation(old_root); - tm->op = MOD_LOG_ROOT_REPLACE; - - if (tree_mod_dont_log(fs_info, NULL)) - goto free_tms; - - if (tm_list) - ret = __tree_mod_log_free_eb(fs_info, tm_list, nritems); - if (!ret) - ret = __tree_mod_log_insert(fs_info, tm); - - write_unlock(&fs_info->tree_mod_log_lock); - if (ret) - goto free_tms; - kfree(tm_list); - - return ret; - -free_tms: - if (tm_list) { - for (i = 0; i < nritems; i++) - kfree(tm_list[i]); - kfree(tm_list); - } - kfree(tm); - - return ret; -} - -static struct tree_mod_elem * -__tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq, - int smallest) -{ - struct rb_root *tm_root; - struct rb_node *node; - struct tree_mod_elem *cur = NULL; - struct tree_mod_elem *found = NULL; - - read_lock(&fs_info->tree_mod_log_lock); - tm_root = &fs_info->tree_mod_log; - node = tm_root->rb_node; - while (node) { - cur = rb_entry(node, struct tree_mod_elem, node); - if (cur->logical < start) { - node = node->rb_left; - } else if (cur->logical > start) { - node = node->rb_right; - } else if (cur->seq < min_seq) { - node = node->rb_left; - } else if (!smallest) { - /* we want the node with the highest seq */ - if (found) - BUG_ON(found->seq > cur->seq); - found = cur; - node = node->rb_left; - } else if (cur->seq > min_seq) { - /* we want the node with the smallest seq */ - if (found) - BUG_ON(found->seq < cur->seq); - found = cur; - node = node->rb_right; - } else { - found = cur; - break; - } - } - read_unlock(&fs_info->tree_mod_log_lock); - - return found; -} - -/* - * this 
returns the element from the log with the smallest time sequence - * value that's in the log (the oldest log item). any element with a time - * sequence lower than min_seq will be ignored. - */ -static struct tree_mod_elem * -tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, u64 start, - u64 min_seq) -{ - return __tree_mod_log_search(fs_info, start, min_seq, 1); -} - -/* - * this returns the element from the log with the largest time sequence - * value that's in the log (the most recent log item). any element with - * a time sequence lower than min_seq will be ignored. - */ -static struct tree_mod_elem * -tree_mod_log_search(struct btrfs_fs_info *fs_info, u64 start, u64 min_seq) -{ - return __tree_mod_log_search(fs_info, start, min_seq, 0); -} - -static noinline int tree_mod_log_eb_copy(struct extent_buffer *dst, - struct extent_buffer *src, unsigned long dst_offset, - unsigned long src_offset, int nr_items) -{ - struct btrfs_fs_info *fs_info = dst->fs_info; - int ret = 0; - struct tree_mod_elem **tm_list = NULL; - struct tree_mod_elem **tm_list_add, **tm_list_rem; - int i; - int locked = 0; - - if (!tree_mod_need_log(fs_info, NULL)) - return 0; - - if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) - return 0; - - tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *), - GFP_NOFS); - if (!tm_list) - return -ENOMEM; - - tm_list_add = tm_list; - tm_list_rem = tm_list + nr_items; - for (i = 0; i < nr_items; i++) { - tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset, - MOD_LOG_KEY_REMOVE, GFP_NOFS); - if (!tm_list_rem[i]) { - ret = -ENOMEM; - goto free_tms; - } - - tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, - MOD_LOG_KEY_ADD, GFP_NOFS); - if (!tm_list_add[i]) { - ret = -ENOMEM; - goto free_tms; - } - } - - if (tree_mod_dont_log(fs_info, NULL)) - goto free_tms; - locked = 1; - - for (i = 0; i < nr_items; i++) { - ret = __tree_mod_log_insert(fs_info, tm_list_rem[i]); - if (ret) - goto free_tms; - ret = 
__tree_mod_log_insert(fs_info, tm_list_add[i]); - if (ret) - goto free_tms; - } - - write_unlock(&fs_info->tree_mod_log_lock); - kfree(tm_list); - - return 0; - -free_tms: - for (i = 0; i < nr_items * 2; i++) { - if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) - rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); - kfree(tm_list[i]); - } - if (locked) - write_unlock(&fs_info->tree_mod_log_lock); - kfree(tm_list); - - return ret; -} - -static noinline int tree_mod_log_free_eb(struct extent_buffer *eb) -{ - struct tree_mod_elem **tm_list = NULL; - int nritems = 0; - int i; - int ret = 0; - - if (btrfs_header_level(eb) == 0) - return 0; - - if (!tree_mod_need_log(eb->fs_info, NULL)) - return 0; - - nritems = btrfs_header_nritems(eb); - tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS); - if (!tm_list) - return -ENOMEM; - - for (i = 0; i < nritems; i++) { - tm_list[i] = alloc_tree_mod_elem(eb, i, - MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); - if (!tm_list[i]) { - ret = -ENOMEM; - goto free_tms; - } - } - - if (tree_mod_dont_log(eb->fs_info, eb)) - goto free_tms; - - ret = __tree_mod_log_free_eb(eb->fs_info, tm_list, nritems); - write_unlock(&eb->fs_info->tree_mod_log_lock); - if (ret) - goto free_tms; - kfree(tm_list); - - return 0; - -free_tms: - for (i = 0; i < nritems; i++) - kfree(tm_list[i]); - kfree(tm_list); - - return ret; -} - /* * check if the tree block can be shared by multiple trees */ @@ -1090,7 +500,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, parent_start = buf->start; atomic_inc(&cow->refs); - ret = tree_mod_log_insert_root(root->node, cow, 1); + ret = btrfs_tree_mod_log_insert_root(root->node, cow, true); BUG_ON(ret < 0); rcu_assign_pointer(root->node, cow); @@ -1100,15 +510,15 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, add_root_to_dirty_list(root); } else { WARN_ON(trans->transid != btrfs_header_generation(parent)); - tree_mod_log_insert_key(parent, 
parent_slot, - MOD_LOG_KEY_REPLACE, GFP_NOFS); + btrfs_tree_mod_log_insert_key(parent, parent_slot, + BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); btrfs_set_node_blockptr(parent, parent_slot, cow->start); btrfs_set_node_ptr_generation(parent, parent_slot, trans->transid); btrfs_mark_buffer_dirty(parent); if (last_ref) { - ret = tree_mod_log_free_eb(buf); + ret = btrfs_tree_mod_log_free_eb(buf); if (ret) { btrfs_tree_unlock(cow); free_extent_buffer(cow); @@ -1127,298 +537,6 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans, return 0; } -/* - * returns the logical address of the oldest predecessor of the given root. - * entries older than time_seq are ignored. - */ -static struct tree_mod_elem *__tree_mod_log_oldest_root( - struct extent_buffer *eb_root, u64 time_seq) -{ - struct tree_mod_elem *tm; - struct tree_mod_elem *found = NULL; - u64 root_logical = eb_root->start; - int looped = 0; - - if (!time_seq) - return NULL; - - /* - * the very last operation that's logged for a root is the - * replacement operation (if it is replaced at all). this has - * the logical address of the *new* root, making it the very - * first operation that's logged for this root. - */ - while (1) { - tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical, - time_seq); - if (!looped && !tm) - return NULL; - /* - * if there are no tree operation for the oldest root, we simply - * return it. this should only happen if that (old) root is at - * level 0. - */ - if (!tm) - break; - - /* - * if there's an operation that's not a root replacement, we - * found the oldest version of our root. normally, we'll find a - * MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here. 
- */ - if (tm->op != MOD_LOG_ROOT_REPLACE) - break; - - found = tm; - root_logical = tm->old_root.logical; - looped = 1; - } - - /* if there's no old root to return, return what we found instead */ - if (!found) - found = tm; - - return found; -} - -/* - * tm is a pointer to the first operation to rewind within eb. then, all - * previous operations will be rewound (until we reach something older than - * time_seq). - */ -static void -__tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct extent_buffer *eb, - u64 time_seq, struct tree_mod_elem *first_tm) -{ - u32 n; - struct rb_node *next; - struct tree_mod_elem *tm = first_tm; - unsigned long o_dst; - unsigned long o_src; - unsigned long p_size = sizeof(struct btrfs_key_ptr); - - n = btrfs_header_nritems(eb); - read_lock(&fs_info->tree_mod_log_lock); - while (tm && tm->seq >= time_seq) { - /* - * all the operations are recorded with the operator used for - * the modification. as we're going backwards, we do the - * opposite of each operation here. - */ - switch (tm->op) { - case MOD_LOG_KEY_REMOVE_WHILE_FREEING: - BUG_ON(tm->slot < n); - fallthrough; - case MOD_LOG_KEY_REMOVE_WHILE_MOVING: - case MOD_LOG_KEY_REMOVE: - btrfs_set_node_key(eb, &tm->key, tm->slot); - btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); - btrfs_set_node_ptr_generation(eb, tm->slot, - tm->generation); - n++; - break; - case MOD_LOG_KEY_REPLACE: - BUG_ON(tm->slot >= n); - btrfs_set_node_key(eb, &tm->key, tm->slot); - btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); - btrfs_set_node_ptr_generation(eb, tm->slot, - tm->generation); - break; - case MOD_LOG_KEY_ADD: - /* if a move operation is needed it's in the log */ - n--; - break; - case MOD_LOG_MOVE_KEYS: - o_dst = btrfs_node_key_ptr_offset(tm->slot); - o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot); - memmove_extent_buffer(eb, o_dst, o_src, - tm->move.nr_items * p_size); - break; - case MOD_LOG_ROOT_REPLACE: - /* - * this operation is special. 
for roots, this must be - * handled explicitly before rewinding. - * for non-roots, this operation may exist if the node - * was a root: root A -> child B; then A gets empty and - * B is promoted to the new root. in the mod log, we'll - * have a root-replace operation for B, a tree block - * that is no root. we simply ignore that operation. - */ - break; - } - next = rb_next(&tm->node); - if (!next) - break; - tm = rb_entry(next, struct tree_mod_elem, node); - if (tm->logical != first_tm->logical) - break; - } - read_unlock(&fs_info->tree_mod_log_lock); - btrfs_set_header_nritems(eb, n); -} - -/* - * Called with eb read locked. If the buffer cannot be rewound, the same buffer - * is returned. If rewind operations happen, a fresh buffer is returned. The - * returned buffer is always read-locked. If the returned buffer is not the - * input buffer, the lock on the input buffer is released and the input buffer - * is freed (its refcount is decremented). - */ -static struct extent_buffer * -tree_mod_log_rewind(struct btrfs_fs_info *fs_info, struct btrfs_path *path, - struct extent_buffer *eb, u64 time_seq) -{ - struct extent_buffer *eb_rewin; - struct tree_mod_elem *tm; - - if (!time_seq) - return eb; - - if (btrfs_header_level(eb) == 0) - return eb; - - tm = tree_mod_log_search(fs_info, eb->start, time_seq); - if (!tm) - return eb; - - if (tm->op == MOD_LOG_KEY_REMOVE_WHILE_FREEING) { - BUG_ON(tm->slot != 0); - eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start); - if (!eb_rewin) { - btrfs_tree_read_unlock(eb); - free_extent_buffer(eb); - return NULL; - } - btrfs_set_header_bytenr(eb_rewin, eb->start); - btrfs_set_header_backref_rev(eb_rewin, - btrfs_header_backref_rev(eb)); - btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb)); - btrfs_set_header_level(eb_rewin, btrfs_header_level(eb)); - } else { - eb_rewin = btrfs_clone_extent_buffer(eb); - if (!eb_rewin) { - btrfs_tree_read_unlock(eb); - free_extent_buffer(eb); - return NULL; - } - } - - 
btrfs_tree_read_unlock(eb); - free_extent_buffer(eb); - - btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin), - eb_rewin, btrfs_header_level(eb_rewin)); - btrfs_tree_read_lock(eb_rewin); - __tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); - WARN_ON(btrfs_header_nritems(eb_rewin) > - BTRFS_NODEPTRS_PER_BLOCK(fs_info)); - - return eb_rewin; -} - -/* - * get_old_root() rewinds the state of @root's root node to the given @time_seq - * value. If there are no changes, the current root->root_node is returned. If - * anything changed in between, there's a fresh buffer allocated on which the - * rewind operations are done. In any case, the returned buffer is read locked. - * Returns NULL on error (with no locks held). - */ -static inline struct extent_buffer * -get_old_root(struct btrfs_root *root, u64 time_seq) -{ - struct btrfs_fs_info *fs_info = root->fs_info; - struct tree_mod_elem *tm; - struct extent_buffer *eb = NULL; - struct extent_buffer *eb_root; - u64 eb_root_owner = 0; - struct extent_buffer *old; - struct tree_mod_root *old_root = NULL; - u64 old_generation = 0; - u64 logical; - int level; - - eb_root = btrfs_read_lock_root_node(root); - tm = __tree_mod_log_oldest_root(eb_root, time_seq); - if (!tm) - return eb_root; - - if (tm->op == MOD_LOG_ROOT_REPLACE) { - old_root = &tm->old_root; - old_generation = tm->generation; - logical = old_root->logical; - level = old_root->level; - } else { - logical = eb_root->start; - level = btrfs_header_level(eb_root); - } - - tm = tree_mod_log_search(fs_info, logical, time_seq); - if (old_root && tm && tm->op != MOD_LOG_KEY_REMOVE_WHILE_FREEING) { - btrfs_tree_read_unlock(eb_root); - free_extent_buffer(eb_root); - old = read_tree_block(fs_info, logical, root->root_key.objectid, - 0, level, NULL); - if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { - if (!IS_ERR(old)) - free_extent_buffer(old); - btrfs_warn(fs_info, - "failed to read tree block %llu from get_old_root", - logical); - } else { - 
btrfs_tree_read_lock(old); - eb = btrfs_clone_extent_buffer(old); - btrfs_tree_read_unlock(old); - free_extent_buffer(old); - } - } else if (old_root) { - eb_root_owner = btrfs_header_owner(eb_root); - btrfs_tree_read_unlock(eb_root); - free_extent_buffer(eb_root); - eb = alloc_dummy_extent_buffer(fs_info, logical); - } else { - eb = btrfs_clone_extent_buffer(eb_root); - btrfs_tree_read_unlock(eb_root); - free_extent_buffer(eb_root); - } - - if (!eb) - return NULL; - if (old_root) { - btrfs_set_header_bytenr(eb, eb->start); - btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); - btrfs_set_header_owner(eb, eb_root_owner); - btrfs_set_header_level(eb, old_root->level); - btrfs_set_header_generation(eb, old_generation); - } - btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, - btrfs_header_level(eb)); - btrfs_tree_read_lock(eb); - if (tm) - __tree_mod_log_rewind(fs_info, eb, time_seq, tm); - else - WARN_ON(btrfs_header_level(eb) != 0); - WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info)); - - return eb; -} - -int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq) -{ - struct tree_mod_elem *tm; - int level; - struct extent_buffer *eb_root = btrfs_root_node(root); - - tm = __tree_mod_log_oldest_root(eb_root, time_seq); - if (tm && tm->op == MOD_LOG_ROOT_REPLACE) { - level = tm->old_root.level; - } else { - level = btrfs_header_level(eb_root); - } - free_extent_buffer(eb_root); - - return level; -} - static inline int should_cow_block(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct extent_buffer *buf) @@ -1840,7 +958,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, goto enospc; } - ret = tree_mod_log_insert_root(root->node, child, 1); + ret = btrfs_tree_mod_log_insert_root(root->node, child, true); BUG_ON(ret < 0); rcu_assign_pointer(root->node, child); @@ -1920,8 +1038,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, } else { struct btrfs_disk_key right_key; 
btrfs_node_key(right, &right_key, 0); - ret = tree_mod_log_insert_key(parent, pslot + 1, - MOD_LOG_KEY_REPLACE, GFP_NOFS); + ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, + BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); BUG_ON(ret < 0); btrfs_set_node_key(parent, &right_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -1966,8 +1084,8 @@ static noinline int balance_level(struct btrfs_trans_handle *trans, /* update the parent key to reflect our changes */ struct btrfs_disk_key mid_key; btrfs_node_key(mid, &mid_key, 0); - ret = tree_mod_log_insert_key(parent, pslot, - MOD_LOG_KEY_REPLACE, GFP_NOFS); + ret = btrfs_tree_mod_log_insert_key(parent, pslot, + BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); BUG_ON(ret < 0); btrfs_set_node_key(parent, &mid_key, pslot); btrfs_mark_buffer_dirty(parent); @@ -2068,8 +1186,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; orig_slot += left_nr; btrfs_node_key(mid, &disk_key, 0); - ret = tree_mod_log_insert_key(parent, pslot, - MOD_LOG_KEY_REPLACE, GFP_NOFS); + ret = btrfs_tree_mod_log_insert_key(parent, pslot, + BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); BUG_ON(ret < 0); btrfs_set_node_key(parent, &disk_key, pslot); btrfs_mark_buffer_dirty(parent); @@ -2122,8 +1240,8 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans, struct btrfs_disk_key disk_key; btrfs_node_key(right, &disk_key, 0); - ret = tree_mod_log_insert_key(parent, pslot + 1, - MOD_LOG_KEY_REPLACE, GFP_NOFS); + ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1, + BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS); BUG_ON(ret < 0); btrfs_set_node_key(parent, &disk_key, pslot + 1); btrfs_mark_buffer_dirty(parent); @@ -2161,12 +1279,13 @@ static void reada_for_search(struct btrfs_fs_info *fs_info, u64 search; u64 target; u64 nread = 0; + u64 nread_max; struct extent_buffer *eb; u32 nr; u32 blocksize; u32 nscan = 0; - if (level != 1) + if (level != 1 && path->reada != READA_FORWARD_ALWAYS) return; if 
(!path->nodes[level]) @@ -2174,6 +1293,20 @@ static void reada_for_search(struct btrfs_fs_info *fs_info, node = path->nodes[level]; + /* + * Since the time between visiting leaves is much shorter than the time + * between visiting nodes, limit read ahead of nodes to 1, to avoid too + * much IO at once (possibly random). + */ + if (path->reada == READA_FORWARD_ALWAYS) { + if (level > 1) + nread_max = node->fs_info->nodesize; + else + nread_max = SZ_128K; + } else { + nread_max = SZ_64K; + } + search = btrfs_node_blockptr(node, slot); blocksize = fs_info->nodesize; eb = find_extent_buffer(fs_info, search); @@ -2192,7 +1325,8 @@ static void reada_for_search(struct btrfs_fs_info *fs_info, if (nr == 0) break; nr--; - } else if (path->reada == READA_FORWARD) { + } else if (path->reada == READA_FORWARD || + path->reada == READA_FORWARD_ALWAYS) { nr++; if (nr >= nritems) break; @@ -2203,13 +1337,14 @@ static void reada_for_search(struct btrfs_fs_info *fs_info, break; } search = btrfs_node_blockptr(node, nr); - if ((search <= target && target - search <= 65536) || + if (path->reada == READA_FORWARD_ALWAYS || + (search <= target && target - search <= 65536) || (search > target && search - target <= 65536)) { btrfs_readahead_node_child(node, nr); nread += blocksize; } nscan++; - if ((nread > 65536 || nscan > 32)) + if (nread > nread_max || nscan > 32) break; } } @@ -2318,6 +1453,9 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p, tmp = find_extent_buffer(fs_info, blocknr); if (tmp) { + if (p->reada == READA_FORWARD_ALWAYS) + reada_for_search(fs_info, p, level, slot, key->objectid); + /* first we do an atomic uptodate check */ if (btrfs_buffer_uptodate(tmp, gen, 1) > 0) { /* @@ -2861,7 +1999,7 @@ int btrfs_search_old_slot(struct btrfs_root *root, const struct btrfs_key *key, } again: - b = get_old_root(root, time_seq); + b = btrfs_get_old_root(root, time_seq); if (!b) { ret = -EIO; goto done; @@ -2916,7 +2054,7 @@ again: level = btrfs_header_level(b); 
btrfs_tree_read_lock(b); - b = tree_mod_log_rewind(fs_info, p, b, time_seq); + b = btrfs_tree_mod_log_rewind(fs_info, p, b, time_seq); if (!b) { ret = -ENOMEM; goto done; @@ -3030,8 +2168,8 @@ static void fixup_low_keys(struct btrfs_path *path, if (!path->nodes[i]) break; t = path->nodes[i]; - ret = tree_mod_log_insert_key(t, tslot, MOD_LOG_KEY_REPLACE, - GFP_ATOMIC); + ret = btrfs_tree_mod_log_insert_key(t, tslot, + BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC); BUG_ON(ret < 0); btrfs_set_node_key(t, key, tslot); btrfs_mark_buffer_dirty(path->nodes[i]); @@ -3194,7 +2332,7 @@ static int push_node_left(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); return ret; } - ret = tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); + ret = btrfs_tree_mod_log_eb_copy(dst, src, dst_nritems, 0, push_items); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -3206,8 +2344,8 @@ static int push_node_left(struct btrfs_trans_handle *trans, if (push_items < src_nritems) { /* - * Don't call tree_mod_log_insert_move here, key removal was - * already fully logged by tree_mod_log_eb_copy above. + * Don't call btrfs_tree_mod_log_insert_move() here, key removal + * was already fully logged by btrfs_tree_mod_log_eb_copy() above. 
*/ memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0), btrfs_node_key_ptr_offset(push_items), @@ -3268,15 +2406,15 @@ static int balance_node_right(struct btrfs_trans_handle *trans, btrfs_abort_transaction(trans, ret); return ret; } - ret = tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); + ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems); BUG_ON(ret < 0); memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items), btrfs_node_key_ptr_offset(0), (dst_nritems) * sizeof(struct btrfs_key_ptr)); - ret = tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, - push_items); + ret = btrfs_tree_mod_log_eb_copy(dst, src, 0, src_nritems - push_items, + push_items); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -3342,7 +2480,7 @@ static noinline int insert_new_root(struct btrfs_trans_handle *trans, btrfs_mark_buffer_dirty(c); old = root->node; - ret = tree_mod_log_insert_root(root->node, c, 0); + ret = btrfs_tree_mod_log_insert_root(root->node, c, false); BUG_ON(ret < 0); rcu_assign_pointer(root->node, c); @@ -3381,8 +2519,8 @@ static void insert_ptr(struct btrfs_trans_handle *trans, BUG_ON(nritems == BTRFS_NODEPTRS_PER_BLOCK(trans->fs_info)); if (slot != nritems) { if (level) { - ret = tree_mod_log_insert_move(lower, slot + 1, slot, - nritems - slot); + ret = btrfs_tree_mod_log_insert_move(lower, slot + 1, + slot, nritems - slot); BUG_ON(ret < 0); } memmove_extent_buffer(lower, @@ -3391,8 +2529,8 @@ static void insert_ptr(struct btrfs_trans_handle *trans, (nritems - slot) * sizeof(struct btrfs_key_ptr)); } if (level) { - ret = tree_mod_log_insert_key(lower, slot, MOD_LOG_KEY_ADD, - GFP_NOFS); + ret = btrfs_tree_mod_log_insert_key(lower, slot, + BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS); BUG_ON(ret < 0); } btrfs_set_node_key(lower, key, slot); @@ -3433,9 +2571,9 @@ static noinline int split_node(struct btrfs_trans_handle *trans, * tree mod log: We don't log_removal old root in * insert_new_root, because that root buffer 
will be kept as a * normal node. We are going to log removal of half of the - * elements below with tree_mod_log_eb_copy. We're holding a - * tree lock on the buffer, which is why we cannot race with - * other tree_mod_log users. + * elements below with btrfs_tree_mod_log_eb_copy(). We're + * holding a tree lock on the buffer, which is why we cannot + * race with other tree_mod_log users. */ ret = insert_new_root(trans, root, path, level + 1); if (ret) @@ -3462,7 +2600,7 @@ static noinline int split_node(struct btrfs_trans_handle *trans, root_add_used(root, fs_info->nodesize); ASSERT(btrfs_header_level(c) == level); - ret = tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); + ret = btrfs_tree_mod_log_eb_copy(split, c, 0, mid, c_nritems - mid); if (ret) { btrfs_abort_transaction(trans, ret); return ret; @@ -4844,8 +3982,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, nritems = btrfs_header_nritems(parent); if (slot != nritems - 1) { if (level) { - ret = tree_mod_log_insert_move(parent, slot, slot + 1, - nritems - slot - 1); + ret = btrfs_tree_mod_log_insert_move(parent, slot, + slot + 1, nritems - slot - 1); BUG_ON(ret < 0); } memmove_extent_buffer(parent, @@ -4854,8 +3992,8 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path, sizeof(struct btrfs_key_ptr) * (nritems - slot - 1)); } else if (level) { - ret = tree_mod_log_insert_key(parent, slot, MOD_LOG_KEY_REMOVE, - GFP_NOFS); + ret = btrfs_tree_mod_log_insert_key(parent, slot, + BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS); BUG_ON(ret < 0); } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 9ae776ab3967..9fb76829a281 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -342,6 +342,27 @@ struct btrfs_node { struct btrfs_key_ptr ptrs[]; } __attribute__ ((__packed__)); +/* Read ahead values for struct btrfs_path.reada */ +enum { + READA_NONE, + READA_BACK, + READA_FORWARD, + /* + * Similar to READA_FORWARD but unlike it: + * + * 1) It will trigger readahead even for 
leaves that are not close to + * each other on disk; + * 2) It also triggers readahead for nodes; + * 3) During a search, even when a node or leaf is already in memory, it + * will still trigger readahead for other nodes and leaves that follow + * it. + * + * This is meant to be used only when we know we are iterating over the + * entire tree or a very large part of it. + */ + READA_FORWARD_ALWAYS, +}; + /* * btrfs_paths remember the path taken from the root down to the leaf. * level 0 is always the leaf, and nodes[1...BTRFS_MAX_LEVEL] will point @@ -350,7 +371,6 @@ struct btrfs_node { * The slots array records the index of the item or block pointer * used while walking the tree. */ -enum { READA_NONE, READA_BACK, READA_FORWARD }; struct btrfs_path { struct extent_buffer *nodes[BTRFS_MAX_LEVEL]; int slots[BTRFS_MAX_LEVEL]; @@ -482,16 +502,6 @@ struct btrfs_discard_ctl { atomic64_t discard_bytes_saved; }; -/* delayed seq elem */ -struct seq_list { - struct list_head list; - u64 seq; -}; - -#define SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 } - -#define SEQ_LAST ((u64)-1) - enum btrfs_orphan_cleanup_state { ORPHAN_CLEANUP_STARTED = 1, ORPHAN_CLEANUP_DONE = 2, @@ -572,6 +582,15 @@ enum { /* Indicate that we can't trust the free space tree for caching yet */ BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED, + + /* Indicate whether there are any tree modification log users */ + BTRFS_FS_TREE_MOD_LOG_USERS, + +#if BITS_PER_LONG == 32 + /* Indicate if we have error/warn message printed on 32bit systems */ + BTRFS_FS_32BIT_ERROR, + BTRFS_FS_32BIT_WARN, +#endif }; /* @@ -941,10 +960,16 @@ struct btrfs_fs_info { struct work_struct async_data_reclaim_work; struct work_struct preempt_reclaim_work; + /* Reclaim partially filled block groups in the background */ + struct work_struct reclaim_bgs_work; + struct list_head reclaim_bgs; + int bg_reclaim_threshold; + spinlock_t unused_bgs_lock; struct list_head unused_bgs; struct mutex unused_bg_unpin_mutex; - struct mutex 
delete_unused_bgs_mutex; + /* Protect block groups that are going to be deleted */ + struct mutex reclaim_bgs_lock; /* Cached block sizes */ u32 nodesize; @@ -2691,7 +2716,6 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref); -int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr); void btrfs_clear_space_info_full(struct btrfs_fs_info *info); /* @@ -2929,13 +2953,6 @@ static inline void btrfs_clear_sb_rdonly(struct super_block *sb) clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state); } -/* tree mod log functions from ctree.c */ -u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, - struct seq_list *elem); -void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, - struct seq_list *elem); -int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); - /* root-item.c */ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id, u64 ref_id, u64 dirid, u64 sequence, const char *name, @@ -3084,7 +3101,7 @@ u64 btrfs_file_extent_end(const struct btrfs_path *path); blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio, int mirror_num, unsigned long bio_flags); int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, - struct page *page, u64 start, u64 end, int mirror); + struct page *page, u64 start, u64 end); struct extent_map *btrfs_get_extent_fiemap(struct btrfs_inode *inode, u64 start, u64 len); noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, @@ -3110,7 +3127,7 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, u64 new_size, u32 min_type); -int btrfs_start_delalloc_snapshot(struct btrfs_root *root); +int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, bool in_reclaim_context); int btrfs_set_extent_delalloc(struct 
btrfs_inode *inode, u64 start, u64 end, @@ -3179,6 +3196,7 @@ extern const struct iomap_dio_ops btrfs_dio_ops; /* Inode locking type flags, by default the exclusive lock is taken */ #define BTRFS_ILOCK_SHARED (1U << 0) #define BTRFS_ILOCK_TRY (1U << 1) +#define BTRFS_ILOCK_MMAP (1U << 2) int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags); void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags); @@ -3189,6 +3207,9 @@ void btrfs_update_inode_bytes(struct btrfs_inode *inode, /* ioctl.c */ long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg); long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg); +int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); +int btrfs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa); int btrfs_ioctl_get_supported_features(void __user *arg); void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); int __pure btrfs_is_empty_uuid(u8 *uuid); @@ -3217,8 +3238,9 @@ extern const struct file_operations btrfs_file_operations; int btrfs_drop_extents(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_inode *inode, struct btrfs_drop_extents_args *args); -int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, - const u64 start, const u64 end, +int btrfs_replace_file_extents(struct btrfs_inode *inode, + struct btrfs_path *path, const u64 start, + const u64 end, struct btrfs_replace_extent_info *extent_info, struct btrfs_trans_handle **trans_out); int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, @@ -3405,6 +3427,19 @@ static inline void assertfail(const char *expr, const char* file, int line) { } #define ASSERT(expr) (void)(expr) #endif +#if BITS_PER_LONG == 32 +#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT) +/* + * The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical + * addresses of extents. 
+ * + * For 4K page size it's about 10T, for 64K it's 160T. + */ +#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8) +void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info); +void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info); +#endif + /* * Get the correct offset inside the page of extent buffer. * @@ -3732,8 +3767,6 @@ static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info) return signal_pending(current); } -#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len)) - /* Sanity test specific functions */ #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS void btrfs_test_destroy_inode(struct inode *inode); diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index bf25401c9768..1a88f6214ebc 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -602,7 +602,6 @@ static void btrfs_delayed_item_release_metadata(struct btrfs_root *root, static int btrfs_delayed_inode_reserve_metadata( struct btrfs_trans_handle *trans, struct btrfs_root *root, - struct btrfs_inode *inode, struct btrfs_delayed_node *node) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -633,32 +632,17 @@ static int btrfs_delayed_inode_reserve_metadata( return ret; ret = btrfs_block_rsv_add(root, dst_rsv, num_bytes, BTRFS_RESERVE_NO_FLUSH); - /* - * Since we're under a transaction reserve_metadata_bytes could - * try to commit the transaction which will make it return - * EAGAIN to make us stop the transaction we have, so return - * ENOSPC instead so that btrfs_dirty_inode knows what to do. 
- */ - if (ret == -EAGAIN) { - ret = -ENOSPC; - btrfs_qgroup_free_meta_prealloc(root, num_bytes); - } - if (!ret) { - node->bytes_reserved = num_bytes; - trace_btrfs_space_reservation(fs_info, - "delayed_inode", - btrfs_ino(inode), - num_bytes, 1); - } else { + /* NO_FLUSH could only fail with -ENOSPC */ + ASSERT(ret == 0 || ret == -ENOSPC); + if (ret) btrfs_qgroup_free_meta_prealloc(root, num_bytes); - } - return ret; + } else { + ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true); } - ret = btrfs_block_rsv_migrate(src_rsv, dst_rsv, num_bytes, true); if (!ret) { trace_btrfs_space_reservation(fs_info, "delayed_inode", - btrfs_ino(inode), num_bytes, 1); + node->inode_id, num_bytes, 1); node->bytes_reserved = num_bytes; } @@ -1589,8 +1573,8 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode, * We can only do one readdir with delayed items at a time because of * item->readdir_list. */ - inode_unlock_shared(inode); - inode_lock(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); + btrfs_inode_lock(inode, 0); mutex_lock(&delayed_node->mutex); item = __btrfs_first_delayed_insertion_item(delayed_node); @@ -1833,8 +1817,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, goto release_node; } - ret = btrfs_delayed_inode_reserve_metadata(trans, root, inode, - delayed_node); + ret = btrfs_delayed_inode_reserve_metadata(trans, root, delayed_node); if (ret) goto release_node; diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 63be7d01a9a3..c92d9d4f5f46 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -11,6 +11,7 @@ #include "transaction.h" #include "qgroup.h" #include "space-info.h" +#include "tree-mod-log.h" struct kmem_cache *btrfs_delayed_ref_head_cachep; struct kmem_cache *btrfs_delayed_tree_ref_cachep; @@ -494,16 +495,7 @@ void btrfs_merge_delayed_refs(struct btrfs_trans_handle *trans, if (head->is_data) return; - read_lock(&fs_info->tree_mod_log_lock); - if 
(!list_empty(&fs_info->tree_mod_seq_list)) { - struct seq_list *elem; - - elem = list_first_entry(&fs_info->tree_mod_seq_list, - struct seq_list, list); - seq = elem->seq; - } - read_unlock(&fs_info->tree_mod_log_lock); - + seq = btrfs_tree_mod_log_lowest_seq(fs_info); again: for (node = rb_first_cached(&head->ref_tree); node; node = rb_next(node)) { @@ -517,23 +509,16 @@ again: int btrfs_check_delayed_seq(struct btrfs_fs_info *fs_info, u64 seq) { - struct seq_list *elem; int ret = 0; + u64 min_seq = btrfs_tree_mod_log_lowest_seq(fs_info); - read_lock(&fs_info->tree_mod_log_lock); - if (!list_empty(&fs_info->tree_mod_seq_list)) { - elem = list_first_entry(&fs_info->tree_mod_seq_list, - struct seq_list, list); - if (seq >= elem->seq) { - btrfs_debug(fs_info, - "holding back delayed_ref %#x.%x, lowest is %#x.%x", - (u32)(seq >> 32), (u32)seq, - (u32)(elem->seq >> 32), (u32)elem->seq); - ret = 1; - } + if (min_seq != 0 && seq >= min_seq) { + btrfs_debug(fs_info, + "holding back delayed_ref %llu, lowest is %llu", + seq, min_seq); + ret = 1; } - read_unlock(&fs_info->tree_mod_log_lock); return ret; } diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index 3a9c1e046ebe..d05f73530af7 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -81,6 +81,9 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) struct btrfs_dev_replace_item *ptr; u64 src_devid; + if (!dev_root) + return 0; + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index 41b718cfea40..c9a3036c23bf 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -42,6 +42,7 @@ #include "discard.h" #include "space-info.h" #include "zoned.h" +#include "subpage.h" #define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\ BTRFS_HEADER_FLAG_RELOC |\ @@ -440,6 +441,74 @@ static int btree_read_extent_buffer_pages(struct extent_buffer *eb, return ret; } +static int csum_one_extent_buffer(struct extent_buffer *eb) +{ + struct 
btrfs_fs_info *fs_info = eb->fs_info; + u8 result[BTRFS_CSUM_SIZE]; + int ret; + + ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, + offsetof(struct btrfs_header, fsid), + BTRFS_FSID_SIZE) == 0); + csum_tree_block(eb, result); + + if (btrfs_header_level(eb)) + ret = btrfs_check_node(eb); + else + ret = btrfs_check_leaf_full(eb); + + if (ret < 0) { + btrfs_print_tree(eb, 0); + btrfs_err(fs_info, + "block=%llu write time tree block corruption detected", + eb->start); + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + return ret; + } + write_extent_buffer(eb, result, 0, fs_info->csum_size); + + return 0; +} + +/* Checksum all dirty extent buffers in one bio_vec */ +static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info, + struct bio_vec *bvec) +{ + struct page *page = bvec->bv_page; + u64 bvec_start = page_offset(page) + bvec->bv_offset; + u64 cur; + int ret = 0; + + for (cur = bvec_start; cur < bvec_start + bvec->bv_len; + cur += fs_info->nodesize) { + struct extent_buffer *eb; + bool uptodate; + + eb = find_extent_buffer(fs_info, cur); + uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur, + fs_info->nodesize); + + /* A dirty eb shouldn't disappear from buffer_radix */ + if (WARN_ON(!eb)) + return -EUCLEAN; + + if (WARN_ON(cur != btrfs_header_bytenr(eb))) { + free_extent_buffer(eb); + return -EUCLEAN; + } + if (WARN_ON(!uptodate)) { + free_extent_buffer(eb); + return -EUCLEAN; + } + + ret = csum_one_extent_buffer(eb); + free_extent_buffer(eb); + if (ret < 0) + return ret; + } + return ret; +} + /* * Checksum a dirty tree block before IO. This has extra checks to make sure * we only fill in the checksum field in the first page of a multi-page block. 
@@ -450,9 +519,10 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec struct page *page = bvec->bv_page; u64 start = page_offset(page); u64 found_start; - u8 result[BTRFS_CSUM_SIZE]; struct extent_buffer *eb; - int ret; + + if (fs_info->sectorsize < PAGE_SIZE) + return csum_dirty_subpage_buffers(fs_info, bvec); eb = (struct extent_buffer *)page->private; if (page != eb->pages[0]) @@ -474,28 +544,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec if (WARN_ON(!PageUptodate(page))) return -EUCLEAN; - ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, - offsetof(struct btrfs_header, fsid), - BTRFS_FSID_SIZE) == 0); - - csum_tree_block(eb, result); - - if (btrfs_header_level(eb)) - ret = btrfs_check_node(eb); - else - ret = btrfs_check_leaf_full(eb); - - if (ret < 0) { - btrfs_print_tree(eb, 0); - btrfs_err(fs_info, - "block=%llu write time tree block corruption detected", - eb->start); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); - return ret; - } - write_extent_buffer(eb, result, 0, fs_info->csum_size); - - return 0; + return csum_one_extent_buffer(eb); } static int check_tree_block_fsid(struct extent_buffer *eb) @@ -992,14 +1041,48 @@ static void btree_invalidatepage(struct page *page, unsigned int offset, static int btree_set_page_dirty(struct page *page) { #ifdef DEBUG + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + struct btrfs_subpage *subpage; struct extent_buffer *eb; + int cur_bit = 0; + u64 page_start = page_offset(page); + + if (fs_info->sectorsize == PAGE_SIZE) { + BUG_ON(!PagePrivate(page)); + eb = (struct extent_buffer *)page->private; + BUG_ON(!eb); + BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + BUG_ON(!atomic_read(&eb->refs)); + btrfs_assert_tree_locked(eb); + return __set_page_dirty_nobuffers(page); + } + ASSERT(PagePrivate(page) && page->private); + subpage = (struct btrfs_subpage *)page->private; + + ASSERT(subpage->dirty_bitmap); + 
while (cur_bit < BTRFS_SUBPAGE_BITMAP_SIZE) { + unsigned long flags; + u64 cur; + u16 tmp = (1 << cur_bit); + + spin_lock_irqsave(&subpage->lock, flags); + if (!(tmp & subpage->dirty_bitmap)) { + spin_unlock_irqrestore(&subpage->lock, flags); + cur_bit++; + continue; + } + spin_unlock_irqrestore(&subpage->lock, flags); + cur = page_start + cur_bit * fs_info->sectorsize; - BUG_ON(!PagePrivate(page)); - eb = (struct extent_buffer *)page->private; - BUG_ON(!eb); - BUG_ON(!test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); - BUG_ON(!atomic_read(&eb->refs)); - btrfs_assert_tree_locked(eb); + eb = find_extent_buffer(fs_info, cur); + ASSERT(eb); + ASSERT(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); + ASSERT(atomic_read(&eb->refs)); + btrfs_assert_tree_locked(eb); + free_extent_buffer(eb); + + cur_bit += (fs_info->nodesize >> fs_info->sectorsize_bits); + } #endif return __set_page_dirty_nobuffers(page); } @@ -1807,14 +1890,21 @@ static int cleaner_kthread(void *arg) btrfs_run_defrag_inodes(fs_info); /* - * Acquires fs_info->delete_unused_bgs_mutex to avoid racing + * Acquires fs_info->reclaim_bgs_lock to avoid racing * with relocation (btrfs_relocate_chunk) and relocation * acquires fs_info->cleaner_mutex (btrfs_relocate_block_group) - * after acquiring fs_info->delete_unused_bgs_mutex. So we + * after acquiring fs_info->reclaim_bgs_lock. So we * can't hold, nor need to, fs_info->cleaner_mutex when deleting * unused block groups. */ btrfs_delete_unused_bgs(fs_info); + + /* + * Reclaim block groups in the reclaim_bgs list after we deleted + * all unused block_groups. This possibly gives us some more free + * space. 
+ */ + btrfs_reclaim_bgs(fs_info); sleep: clear_and_wake_up_bit(BTRFS_FS_CLEANER_RUNNING, &fs_info->flags); if (kthread_should_park()) @@ -2387,8 +2477,9 @@ static int btrfs_read_roots(struct btrfs_fs_info *fs_info) } else { set_bit(BTRFS_ROOT_TRACK_DIRTY, &root->state); fs_info->dev_root = root; - btrfs_init_devices_late(fs_info); } + /* Initialize fs_info for all devices in any case */ + btrfs_init_devices_late(fs_info); /* If IGNOREDATACSUMS is set don't bother reading the csum root. */ if (!btrfs_test_opt(fs_info, IGNOREDATACSUMS)) { @@ -2792,7 +2883,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->treelog_bg_lock); rwlock_init(&fs_info->tree_mod_log_lock); mutex_init(&fs_info->unused_bg_unpin_mutex); - mutex_init(&fs_info->delete_unused_bgs_mutex); + mutex_init(&fs_info->reclaim_bgs_lock); mutex_init(&fs_info->reloc_mutex); mutex_init(&fs_info->delalloc_root_mutex); mutex_init(&fs_info->zoned_meta_io_lock); @@ -2802,6 +2893,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) INIT_LIST_HEAD(&fs_info->space_info); INIT_LIST_HEAD(&fs_info->tree_mod_seq_list); INIT_LIST_HEAD(&fs_info->unused_bgs); + INIT_LIST_HEAD(&fs_info->reclaim_bgs); #ifdef CONFIG_BTRFS_DEBUG INIT_LIST_HEAD(&fs_info->allocated_roots); INIT_LIST_HEAD(&fs_info->allocated_ebs); @@ -2890,6 +2982,9 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) fs_info->swapfile_pins = RB_ROOT; fs_info->send_in_progress = 0; + + fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH; + INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work); } static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block *sb) @@ -3009,6 +3104,21 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) } } + /* + * btrfs_find_orphan_roots() is responsible for finding all the dead + * roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load + * them into the fs_info->fs_roots_radix tree. 
This must be done before + * calling btrfs_orphan_cleanup() on the tree root. If we don't do it + * first, then btrfs_orphan_cleanup() will delete a dead root's orphan + * item before the root's tree is deleted - this means that if we unmount + * or crash before the deletion completes, on the next mount we will not + * delete what remains of the tree because the orphan item does not + * exists anymore, which is what tells us we have a pending deletion. + */ + ret = btrfs_find_orphan_roots(fs_info); + if (ret) + goto out; + ret = btrfs_cleanup_fs_roots(fs_info); if (ret) goto out; @@ -3068,7 +3178,6 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info) } } - ret = btrfs_find_orphan_roots(fs_info); out: return ret; } @@ -4234,6 +4343,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) cancel_work_sync(&fs_info->async_data_reclaim_work); cancel_work_sync(&fs_info->preempt_reclaim_work); + cancel_work_sync(&fs_info->reclaim_bgs_work); + /* Cancel or finish ongoing discard work */ btrfs_discard_cleanup(fs_info); diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 36a3c973fda1..3d5c35e4cb76 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -1340,12 +1340,16 @@ int btrfs_discard_extent(struct btrfs_fs_info *fs_info, u64 bytenr, stripe = bbio->stripes; for (i = 0; i < bbio->num_stripes; i++, stripe++) { u64 bytes; + struct btrfs_device *device = stripe->dev; - if (!stripe->dev->bdev) { + if (!device->bdev) { ASSERT(btrfs_test_opt(fs_info, DEGRADED)); continue; } + if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) + continue; + ret = do_discard_extent(stripe, &bytes); if (!ret) { discarded_bytes += bytes; @@ -1864,7 +1868,7 @@ static int cleanup_ref_head(struct btrfs_trans_handle *trans, trace_run_delayed_ref_head(fs_info, head, 0); btrfs_delayed_ref_unlock(head); btrfs_put_delayed_ref_head(head); - return 0; + return ret; } static struct btrfs_delayed_ref_head *btrfs_obtain_ref_head( @@ -2490,19 +2494,6 @@ int 
btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root, return __btrfs_mod_ref(trans, root, buf, full_backref, 0); } -int btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) -{ - struct btrfs_block_group *block_group; - int readonly = 0; - - block_group = btrfs_lookup_block_group(fs_info, bytenr); - if (!block_group || block_group->ro) - readonly = 1; - if (block_group) - btrfs_put_block_group(block_group); - return readonly; -} - static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -3355,11 +3346,9 @@ void btrfs_free_tree_block(struct btrfs_trans_handle *trans, * find a node pointing to this leaf and record operations that * point to this leaf. */ - if (btrfs_header_level(buf) == 0) { - read_lock(&fs_info->tree_mod_log_lock); - must_pin = !list_empty(&fs_info->tree_mod_seq_list); - read_unlock(&fs_info->tree_mod_log_lock); - } + if (btrfs_header_level(buf) == 0 && + test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) + must_pin = true; if (must_pin || btrfs_is_zoned(fs_info)) { btrfs_redirty_list_add(trans->transaction, buf); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index 910769d5fcdb..dee2dafbc872 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -13,6 +13,7 @@ #include <linux/pagevec.h> #include <linux/prefetch.h> #include <linux/cleancache.h> +#include "misc.h" #include "extent_io.h" #include "extent-io-tree.h" #include "extent_map.h" @@ -2983,8 +2984,7 @@ static void end_bio_extent_readpage(struct bio *bio) if (likely(uptodate)) { if (is_data_inode(inode)) ret = btrfs_verify_data_csum(io_bio, - bio_offset, page, start, end, - mirror); + bio_offset, page, start, end); else ret = btrfs_validate_metadata_buffer(io_bio, page, start, end, mirror); @@ -3421,15 +3421,12 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, } if (page->index == last_byte >> PAGE_SHIFT) { - char *userpage; size_t zero_offset = 
offset_in_page(last_byte); if (zero_offset) { iosize = PAGE_SIZE - zero_offset; - userpage = kmap_atomic(page); - memset(userpage + zero_offset, 0, iosize); + memzero_page(page, zero_offset, iosize); flush_dcache_page(page); - kunmap_atomic(userpage); } } begin_page_read(fs_info, page); @@ -3438,14 +3435,11 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, u64 disk_bytenr; if (cur >= last_byte) { - char *userpage; struct extent_state *cached = NULL; iosize = PAGE_SIZE - pg_offset; - userpage = kmap_atomic(page); - memset(userpage + pg_offset, 0, iosize); + memzero_page(page, pg_offset, iosize); flush_dcache_page(page); - kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); unlock_extent_cached(tree, cur, @@ -3528,13 +3522,10 @@ int btrfs_do_readpage(struct page *page, struct extent_map **em_cached, /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { - char *userpage; struct extent_state *cached = NULL; - userpage = kmap_atomic(page); - memset(userpage + pg_offset, 0, iosize); + memzero_page(page, pg_offset, iosize); flush_dcache_page(page); - kunmap_atomic(userpage); set_extent_uptodate(tree, cur, cur + iosize - 1, &cached, GFP_NOFS); @@ -3762,7 +3753,7 @@ static noinline_for_stack int __extent_writepage_io(struct btrfs_inode *inode, /* Note that em_end from extent_map_end() is exclusive */ iosize = min(em_end, end + 1) - cur; - if (btrfs_use_zone_append(inode, em)) + if (btrfs_use_zone_append(inode, em->block_start)) opf = REQ_OP_ZONE_APPEND; free_extent_map(em); @@ -3845,12 +3836,7 @@ static int __extent_writepage(struct page *page, struct writeback_control *wbc, } if (page->index == end_index) { - char *userpage; - - userpage = kmap_atomic(page); - memset(userpage + pg_offset, 0, - PAGE_SIZE - pg_offset); - kunmap_atomic(userpage); + memzero_page(page, pg_offset, PAGE_SIZE - pg_offset); flush_dcache_page(page); } @@ -3967,7 +3953,13 @@ static noinline_for_stack int 
lock_extent_buffer_for_io(struct extent_buffer *eb btrfs_tree_unlock(eb); - if (!ret) + /* + * Either we don't need to submit any tree block, or we're submitting + * subpage eb. + * Subpage metadata doesn't use page locking at all, so we can skip + * the page locking. + */ + if (!ret || fs_info->sectorsize < PAGE_SIZE) return ret; num_pages = num_extent_pages(eb); @@ -4012,12 +4004,11 @@ err_unlock: return ret; } -static void set_btree_ioerr(struct page *page) +static void set_btree_ioerr(struct page *page, struct extent_buffer *eb) { - struct extent_buffer *eb = (struct extent_buffer *)page->private; - struct btrfs_fs_info *fs_info; + struct btrfs_fs_info *fs_info = eb->fs_info; - SetPageError(page); + btrfs_page_set_error(fs_info, page, eb->start, eb->len); if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) return; @@ -4025,7 +4016,6 @@ static void set_btree_ioerr(struct page *page) * If we error out, we should add back the dirty_metadata_bytes * to make it consistent. */ - fs_info = eb->fs_info; percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, eb->len, fs_info->dirty_metadata_batch); @@ -4069,26 +4059,111 @@ static void set_btree_ioerr(struct page *page) */ switch (eb->log_index) { case -1: - set_bit(BTRFS_FS_BTREE_ERR, &eb->fs_info->flags); + set_bit(BTRFS_FS_BTREE_ERR, &fs_info->flags); break; case 0: - set_bit(BTRFS_FS_LOG1_ERR, &eb->fs_info->flags); + set_bit(BTRFS_FS_LOG1_ERR, &fs_info->flags); break; case 1: - set_bit(BTRFS_FS_LOG2_ERR, &eb->fs_info->flags); + set_bit(BTRFS_FS_LOG2_ERR, &fs_info->flags); break; default: BUG(); /* unexpected, logic error */ } } +/* + * The endio specific version which won't touch any unsafe spinlock in endio + * context. 
+ */ +static struct extent_buffer *find_extent_buffer_nolock( + struct btrfs_fs_info *fs_info, u64 start) +{ + struct extent_buffer *eb; + + rcu_read_lock(); + eb = radix_tree_lookup(&fs_info->buffer_radix, + start >> fs_info->sectorsize_bits); + if (eb && atomic_inc_not_zero(&eb->refs)) { + rcu_read_unlock(); + return eb; + } + rcu_read_unlock(); + return NULL; +} + +/* + * The endio function for subpage extent buffer write. + * + * Unlike end_bio_extent_buffer_writepage(), we only call end_page_writeback() + * after all extent buffers in the page has finished their writeback. + */ +static void end_bio_subpage_eb_writepage(struct btrfs_fs_info *fs_info, + struct bio *bio) +{ + struct bio_vec *bvec; + struct bvec_iter_all iter_all; + + ASSERT(!bio_flagged(bio, BIO_CLONED)); + bio_for_each_segment_all(bvec, bio, iter_all) { + struct page *page = bvec->bv_page; + u64 bvec_start = page_offset(page) + bvec->bv_offset; + u64 bvec_end = bvec_start + bvec->bv_len - 1; + u64 cur_bytenr = bvec_start; + + ASSERT(IS_ALIGNED(bvec->bv_len, fs_info->nodesize)); + + /* Iterate through all extent buffers in the range */ + while (cur_bytenr <= bvec_end) { + struct extent_buffer *eb; + int done; + + /* + * Here we can't use find_extent_buffer(), as it may + * try to lock eb->refs_lock, which is not safe in endio + * context. + */ + eb = find_extent_buffer_nolock(fs_info, cur_bytenr); + ASSERT(eb); + + cur_bytenr = eb->start + eb->len; + + ASSERT(test_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags)); + done = atomic_dec_and_test(&eb->io_pages); + ASSERT(done); + + if (bio->bi_status || + test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { + ClearPageUptodate(page); + set_btree_ioerr(page, eb); + } + + btrfs_subpage_clear_writeback(fs_info, page, eb->start, + eb->len); + end_extent_buffer_writeback(eb); + /* + * free_extent_buffer() will grab spinlock which is not + * safe in endio context. Thus here we manually dec + * the ref. 
+ */ + atomic_dec(&eb->refs); + } + } + bio_put(bio); +} + static void end_bio_extent_buffer_writepage(struct bio *bio) { + struct btrfs_fs_info *fs_info; struct bio_vec *bvec; struct extent_buffer *eb; int done; struct bvec_iter_all iter_all; + fs_info = btrfs_sb(bio_first_page_all(bio)->mapping->host->i_sb); + if (fs_info->sectorsize < PAGE_SIZE) + return end_bio_subpage_eb_writepage(fs_info, bio); + ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_segment_all(bvec, bio, iter_all) { struct page *page = bvec->bv_page; @@ -4100,7 +4175,7 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) if (bio->bi_status || test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) { ClearPageUptodate(page); - set_btree_ioerr(page); + set_btree_ioerr(page, eb); } end_page_writeback(page); @@ -4114,6 +4189,56 @@ static void end_bio_extent_buffer_writepage(struct bio *bio) bio_put(bio); } +/* + * Unlike the work in write_one_eb(), we rely completely on extent locking. + * Page locking is only utilized at minimum to keep the VMM code happy. + * + * Caller should still call write_one_eb() other than this function directly. + * As write_one_eb() has extra preparation before submitting the extent buffer. 
+ */ +static int write_one_subpage_eb(struct extent_buffer *eb, + struct writeback_control *wbc, + struct extent_page_data *epd) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + struct page *page = eb->pages[0]; + unsigned int write_flags = wbc_to_write_flags(wbc) | REQ_META; + bool no_dirty_ebs = false; + int ret; + + /* clear_page_dirty_for_io() in subpage helper needs page locked */ + lock_page(page); + btrfs_subpage_set_writeback(fs_info, page, eb->start, eb->len); + + /* Check if this is the last dirty bit to update nr_written */ + no_dirty_ebs = btrfs_subpage_clear_and_test_dirty(fs_info, page, + eb->start, eb->len); + if (no_dirty_ebs) + clear_page_dirty_for_io(page); + + ret = submit_extent_page(REQ_OP_WRITE | write_flags, wbc, page, + eb->start, eb->len, eb->start - page_offset(page), + &epd->bio, end_bio_extent_buffer_writepage, 0, 0, 0, + false); + if (ret) { + btrfs_subpage_clear_writeback(fs_info, page, eb->start, eb->len); + set_btree_ioerr(page, eb); + unlock_page(page); + + if (atomic_dec_and_test(&eb->io_pages)) + end_extent_buffer_writeback(eb); + return -EIO; + } + unlock_page(page); + /* + * Submission finished without problem, if no range of the page is + * dirty anymore, we have submitted a page. Update nr_written in wbc. 
+ */ + if (no_dirty_ebs) + update_nr_written(wbc, 1); + return ret; +} + static noinline_for_stack int write_one_eb(struct extent_buffer *eb, struct writeback_control *wbc, struct extent_page_data *epd) @@ -4145,6 +4270,9 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, memzero_extent_buffer(eb, start, end - start); } + if (eb->fs_info->sectorsize < PAGE_SIZE) + return write_one_subpage_eb(eb, wbc, epd); + for (i = 0; i < num_pages; i++) { struct page *p = eb->pages[i]; @@ -4156,7 +4284,7 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, end_bio_extent_buffer_writepage, 0, 0, 0, false); if (ret) { - set_btree_ioerr(p); + set_btree_ioerr(p, eb); if (PageWriteback(p)) end_page_writeback(p); if (atomic_sub_and_test(num_pages - i, &eb->io_pages)) @@ -4181,6 +4309,98 @@ static noinline_for_stack int write_one_eb(struct extent_buffer *eb, } /* + * Submit one subpage btree page. + * + * The main difference to submit_eb_page() is: + * - Page locking + * For subpage, we don't rely on page locking at all. + * + * - Flush write bio + * We only flush bio if we may be unable to fit current extent buffers into + * current bio. + * + * Return >=0 for the number of submitted extent buffers. + * Return <0 for fatal error. + */ +static int submit_eb_subpage(struct page *page, + struct writeback_control *wbc, + struct extent_page_data *epd) +{ + struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb); + int submitted = 0; + u64 page_start = page_offset(page); + int bit_start = 0; + const int nbits = BTRFS_SUBPAGE_BITMAP_SIZE; + int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; + int ret; + + /* Lock and write each dirty extent buffers in the range */ + while (bit_start < nbits) { + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + struct extent_buffer *eb; + unsigned long flags; + u64 start; + + /* + * Take private lock to ensure the subpage won't be detached + * in the meantime. 
+ */ + spin_lock(&page->mapping->private_lock); + if (!PagePrivate(page)) { + spin_unlock(&page->mapping->private_lock); + break; + } + spin_lock_irqsave(&subpage->lock, flags); + if (!((1 << bit_start) & subpage->dirty_bitmap)) { + spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock(&page->mapping->private_lock); + bit_start++; + continue; + } + + start = page_start + bit_start * fs_info->sectorsize; + bit_start += sectors_per_node; + + /* + * Here we just want to grab the eb without touching extra + * spin locks, so call find_extent_buffer_nolock(). + */ + eb = find_extent_buffer_nolock(fs_info, start); + spin_unlock_irqrestore(&subpage->lock, flags); + spin_unlock(&page->mapping->private_lock); + + /* + * The eb has already reached 0 refs thus find_extent_buffer() + * doesn't return it. We don't need to write back such eb + * anyway. + */ + if (!eb) + continue; + + ret = lock_extent_buffer_for_io(eb, epd); + if (ret == 0) { + free_extent_buffer(eb); + continue; + } + if (ret < 0) { + free_extent_buffer(eb); + goto cleanup; + } + ret = write_one_eb(eb, wbc, epd); + free_extent_buffer(eb); + if (ret < 0) + goto cleanup; + submitted++; + } + return submitted; + +cleanup: + /* We hit error, end bio for the submitted extent buffers */ + end_write_bio(epd, ret); + return ret; +} + +/* * Submit all page(s) of one extent buffer. 
* * @page: the page of one extent buffer @@ -4212,6 +4432,9 @@ static int submit_eb_page(struct page *page, struct writeback_control *wbc, if (!PagePrivate(page)) return 0; + if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE) + return submit_eb_subpage(page, wbc, epd); + spin_lock(&mapping->private_lock); if (!PagePrivate(page)) { spin_unlock(&mapping->private_lock); @@ -4652,10 +4875,8 @@ void extent_readahead(struct readahead_control *rac) int nr; while ((nr = readahead_page_batch(rac, pagepool))) { - u64 contig_start = page_offset(pagepool[0]); - u64 contig_end = page_offset(pagepool[nr - 1]) + PAGE_SIZE - 1; - - ASSERT(contig_start + nr * PAGE_SIZE - 1 == contig_end); + u64 contig_start = readahead_pos(rac); + u64 contig_end = contig_start + readahead_batch_length(rac) - 1; contiguous_readpages(pagepool, nr, contig_start, contig_end, &em_cached, &bio, &bio_flags, &prev_em_start); @@ -4975,7 +5196,7 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len) { int ret = 0; - u64 off = start; + u64 off; u64 max = start + len; u32 flags = 0; u32 found_type; @@ -5010,6 +5231,11 @@ int extent_fiemap(struct btrfs_inode *inode, struct fiemap_extent_info *fieinfo, goto out_free_ulist; } + /* + * We can't initialize that to 'start' as this could miss extents due + * to extent item merging + */ + off = 0; start = round_down(start, btrfs_inode_sectorsize(inode)); len = round_up(max, btrfs_inode_sectorsize(inode)) - start; @@ -5469,36 +5695,28 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, { struct extent_buffer *eb; - rcu_read_lock(); - eb = radix_tree_lookup(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits); - if (eb && atomic_inc_not_zero(&eb->refs)) { - rcu_read_unlock(); - /* - * Lock our eb's refs_lock to avoid races with - * free_extent_buffer. 
When we get our eb it might be flagged - * with EXTENT_BUFFER_STALE and another task running - * free_extent_buffer might have seen that flag set, - * eb->refs == 2, that the buffer isn't under IO (dirty and - * writeback flags not set) and it's still in the tree (flag - * EXTENT_BUFFER_TREE_REF set), therefore being in the process - * of decrementing the extent buffer's reference count twice. - * So here we could race and increment the eb's reference count, - * clear its stale flag, mark it as dirty and drop our reference - * before the other task finishes executing free_extent_buffer, - * which would later result in an attempt to free an extent - * buffer that is dirty. - */ - if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { - spin_lock(&eb->refs_lock); - spin_unlock(&eb->refs_lock); - } - mark_extent_buffer_accessed(eb, NULL); - return eb; + eb = find_extent_buffer_nolock(fs_info, start); + if (!eb) + return NULL; + /* + * Lock our eb's refs_lock to avoid races with free_extent_buffer(). + * When we get our eb it might be flagged with EXTENT_BUFFER_STALE and + * another task running free_extent_buffer() might have seen that flag + * set, eb->refs == 2, that the buffer isn't under IO (dirty and + * writeback flags not set) and it's still in the tree (flag + * EXTENT_BUFFER_TREE_REF set), therefore being in the process of + * decrementing the extent buffer's reference count twice. So here we + * could race and increment the eb's reference count, clear its stale + * flag, mark it as dirty and drop our reference before the other task + * finishes executing free_extent_buffer, which would later result in + * an attempt to free an extent buffer that is dirty. 
+ */ + if (test_bit(EXTENT_BUFFER_STALE, &eb->bflags)) { + spin_lock(&eb->refs_lock); + spin_unlock(&eb->refs_lock); } - rcu_read_unlock(); - - return NULL; + mark_extent_buffer_accessed(eb, NULL); + return eb; } #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS @@ -5594,6 +5812,17 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, return ERR_PTR(-EINVAL); } +#if BITS_PER_LONG == 32 + if (start >= MAX_LFS_FILESIZE) { + btrfs_err_rl(fs_info, + "extent buffer %llu is beyond 32bit page cache limit", start); + btrfs_err_32bit_limit(fs_info); + return ERR_PTR(-EOVERFLOW); + } + if (start >= BTRFS_32BIT_EARLY_WARN_THRESHOLD) + btrfs_warn_32bit_limit(fs_info); +#endif + if (fs_info->sectorsize < PAGE_SIZE && offset_in_page(start) + len > PAGE_SIZE) { btrfs_err(fs_info, @@ -5665,7 +5894,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, btrfs_page_inc_eb_refs(fs_info, p); spin_unlock(&mapping->private_lock); - WARN_ON(PageDirty(p)); + WARN_ON(btrfs_page_test_dirty(fs_info, p, eb->start, eb->len)); eb->pages[i] = p; if (!PageUptodate(p)) uptodate = 0; @@ -5814,28 +6043,51 @@ void free_extent_buffer_stale(struct extent_buffer *eb) release_extent_buffer(eb); } +static void btree_clear_page_dirty(struct page *page) +{ + ASSERT(PageDirty(page)); + ASSERT(PageLocked(page)); + clear_page_dirty_for_io(page); + xa_lock_irq(&page->mapping->i_pages); + if (!PageDirty(page)) + __xa_clear_mark(&page->mapping->i_pages, + page_index(page), PAGECACHE_TAG_DIRTY); + xa_unlock_irq(&page->mapping->i_pages); +} + +static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + struct page *page = eb->pages[0]; + bool last; + + /* btree_clear_page_dirty() needs page locked */ + lock_page(page); + last = btrfs_subpage_clear_and_test_dirty(fs_info, page, eb->start, + eb->len); + if (last) + btree_clear_page_dirty(page); + unlock_page(page); + WARN_ON(atomic_read(&eb->refs) == 0); +} + void 
clear_extent_buffer_dirty(const struct extent_buffer *eb) { int i; int num_pages; struct page *page; + if (eb->fs_info->sectorsize < PAGE_SIZE) + return clear_subpage_extent_buffer_dirty(eb); + num_pages = num_extent_pages(eb); for (i = 0; i < num_pages; i++) { page = eb->pages[i]; if (!PageDirty(page)) continue; - lock_page(page); - WARN_ON(!PagePrivate(page)); - - clear_page_dirty_for_io(page); - xa_lock_irq(&page->mapping->i_pages); - if (!PageDirty(page)) - __xa_clear_mark(&page->mapping->i_pages, - page_index(page), PAGECACHE_TAG_DIRTY); - xa_unlock_irq(&page->mapping->i_pages); + btree_clear_page_dirty(page); ClearPageError(page); unlock_page(page); } @@ -5856,10 +6108,28 @@ bool set_extent_buffer_dirty(struct extent_buffer *eb) WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); - if (!was_dirty) - for (i = 0; i < num_pages; i++) - set_page_dirty(eb->pages[i]); + if (!was_dirty) { + bool subpage = eb->fs_info->sectorsize < PAGE_SIZE; + /* + * For subpage case, we can have other extent buffers in the + * same page, and in clear_subpage_extent_buffer_dirty() we + * have to clear page dirty without subpage lock held. + * This can cause race where our page gets dirty cleared after + * we just set it. + * + * Thankfully, clear_subpage_extent_buffer_dirty() has locked + * its page for other reasons, we can use page lock to prevent + * the above race. + */ + if (subpage) + lock_page(eb->pages[0]); + for (i = 0; i < num_pages; i++) + btrfs_page_set_dirty(eb->fs_info, eb->pages[i], + eb->start, eb->len); + if (subpage) + unlock_page(eb->pages[0]); + } #ifdef CONFIG_BTRFS_DEBUG for (i = 0; i < num_pages; i++) ASSERT(PageDirty(eb->pages[i])); @@ -6217,12 +6487,34 @@ int memcmp_extent_buffer(const struct extent_buffer *eb, const void *ptrv, return ret; } +/* + * Check that the extent buffer is uptodate. + * + * For regular sector size == PAGE_SIZE case, check if @page is uptodate. 
+ * For subpage case, check if the range covered by the eb has EXTENT_UPTODATE. + */ +static void assert_eb_page_uptodate(const struct extent_buffer *eb, + struct page *page) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + + if (fs_info->sectorsize < PAGE_SIZE) { + bool uptodate; + + uptodate = btrfs_subpage_test_uptodate(fs_info, page, + eb->start, eb->len); + WARN_ON(!uptodate); + } else { + WARN_ON(!PageUptodate(page)); + } +} + void write_extent_buffer_chunk_tree_uuid(const struct extent_buffer *eb, const void *srcv) { char *kaddr; - WARN_ON(!PageUptodate(eb->pages[0])); + assert_eb_page_uptodate(eb, eb->pages[0]); kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0); memcpy(kaddr + offsetof(struct btrfs_header, chunk_tree_uuid), srcv, BTRFS_FSID_SIZE); @@ -6232,7 +6524,7 @@ void write_extent_buffer_fsid(const struct extent_buffer *eb, const void *srcv) { char *kaddr; - WARN_ON(!PageUptodate(eb->pages[0])); + assert_eb_page_uptodate(eb, eb->pages[0]); kaddr = page_address(eb->pages[0]) + get_eb_offset_in_page(eb, 0); memcpy(kaddr + offsetof(struct btrfs_header, fsid), srcv, BTRFS_FSID_SIZE); @@ -6257,7 +6549,7 @@ void write_extent_buffer(const struct extent_buffer *eb, const void *srcv, while (len > 0) { page = eb->pages[i]; - WARN_ON(!PageUptodate(page)); + assert_eb_page_uptodate(eb, page); cur = min(len, PAGE_SIZE - offset); kaddr = page_address(page); @@ -6286,7 +6578,7 @@ void memzero_extent_buffer(const struct extent_buffer *eb, unsigned long start, while (len > 0) { page = eb->pages[i]; - WARN_ON(!PageUptodate(page)); + assert_eb_page_uptodate(eb, page); cur = min(len, PAGE_SIZE - offset); kaddr = page_address(page); @@ -6344,7 +6636,7 @@ void copy_extent_buffer(const struct extent_buffer *dst, while (len > 0) { page = dst->pages[i]; - WARN_ON(!PageUptodate(page)); + assert_eb_page_uptodate(dst, page); cur = min(len, (unsigned long)(PAGE_SIZE - offset)); @@ -6406,7 +6698,7 @@ int extent_buffer_test_bit(const struct extent_buffer *eb, 
unsigned long start, eb_bitmap_offset(eb, start, nr, &i, &offset); page = eb->pages[i]; - WARN_ON(!PageUptodate(page)); + assert_eb_page_uptodate(eb, page); kaddr = page_address(page); return 1U & (kaddr[offset] >> (nr & (BITS_PER_BYTE - 1))); } @@ -6431,7 +6723,7 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star eb_bitmap_offset(eb, start, pos, &i, &offset); page = eb->pages[i]; - WARN_ON(!PageUptodate(page)); + assert_eb_page_uptodate(eb, page); kaddr = page_address(page); while (len >= bits_to_set) { @@ -6442,7 +6734,7 @@ void extent_buffer_bitmap_set(const struct extent_buffer *eb, unsigned long star if (++offset >= PAGE_SIZE && len > 0) { offset = 0; page = eb->pages[++i]; - WARN_ON(!PageUptodate(page)); + assert_eb_page_uptodate(eb, page); kaddr = page_address(page); } } @@ -6474,7 +6766,7 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb, eb_bitmap_offset(eb, start, pos, &i, &offset); page = eb->pages[i]; - WARN_ON(!PageUptodate(page)); + assert_eb_page_uptodate(eb, page); kaddr = page_address(page); while (len >= bits_to_clear) { @@ -6485,7 +6777,7 @@ void extent_buffer_bitmap_clear(const struct extent_buffer *eb, if (++offset >= PAGE_SIZE && len > 0) { offset = 0; page = eb->pages[++i]; - WARN_ON(!PageUptodate(page)); + assert_eb_page_uptodate(eb, page); kaddr = page_address(page); } } diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 824640cb0ace..227215a5722c 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -66,6 +66,7 @@ enum { struct btrfs_root; struct btrfs_inode; struct btrfs_io_bio; +struct btrfs_fs_info; struct io_failure_record; struct extent_io_tree; @@ -270,9 +271,6 @@ struct bio *btrfs_io_bio_alloc(unsigned int nr_iovecs); struct bio *btrfs_bio_clone(struct bio *bio); struct bio *btrfs_bio_clone_partial(struct bio *orig, int offset, int size); -struct btrfs_fs_info; -struct btrfs_inode; - int repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, u64 
length, u64 logical, struct page *page, unsigned int pg_offset, int mirror_num); diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index 47cd3a6dc635..441cee7fbb62 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -9,6 +9,7 @@ #include <linux/highmem.h> #include <linux/sched/mm.h> #include <crypto/hash.h> +#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "transaction.h" @@ -787,7 +788,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, u64 end_byte = bytenr + len; u64 csum_end; struct extent_buffer *leaf; - int ret; + int ret = 0; const u32 csum_size = fs_info->csum_size; u32 blocksize_bits = fs_info->sectorsize_bits; @@ -805,6 +806,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { + ret = 0; if (path->slots[0] == 0) break; path->slots[0]--; @@ -861,7 +863,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, ret = btrfs_del_items(trans, root, path, path->slots[0], del_nr); if (ret) - goto out; + break; if (key.offset == bytenr) break; } else if (key.offset < bytenr && csum_end > end_byte) { @@ -905,8 +907,9 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, ret = btrfs_split_item(trans, root, path, &key, offset); if (ret && ret != -EAGAIN) { btrfs_abort_transaction(trans, ret); - goto out; + break; } + ret = 0; key.offset = end_byte - 1; } else { @@ -916,12 +919,41 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, } btrfs_release_path(path); } - ret = 0; -out: btrfs_free_path(path); return ret; } +static int find_next_csum_offset(struct btrfs_root *root, + struct btrfs_path *path, + u64 *next_offset) +{ + const u32 nritems = btrfs_header_nritems(path->nodes[0]); + struct btrfs_key found_key; + int slot = path->slots[0] + 1; + int ret; + + if (nritems == 0 || slot >= nritems) { + ret = btrfs_next_leaf(root, path); + if (ret < 0) { + return ret; + } else if (ret > 0) { + *next_offset = (u64)-1; + return 0; + } + slot = 
path->slots[0]; + } + + btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); + + if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || + found_key.type != BTRFS_EXTENT_CSUM_KEY) + *next_offset = (u64)-1; + else + *next_offset = found_key.offset; + + return 0; +} + int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums) @@ -937,7 +969,6 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, u64 total_bytes = 0; u64 csum_offset; u64 bytenr; - u32 nritems; u32 ins_size; int index = 0; int found_next; @@ -980,26 +1011,10 @@ again: goto insert; } } else { - int slot = path->slots[0] + 1; - /* we didn't find a csum item, insert one */ - nritems = btrfs_header_nritems(path->nodes[0]); - if (!nritems || (path->slots[0] >= nritems - 1)) { - ret = btrfs_next_leaf(root, path); - if (ret < 0) { - goto out; - } else if (ret > 0) { - found_next = 1; - goto insert; - } - slot = path->slots[0]; - } - btrfs_item_key_to_cpu(path->nodes[0], &found_key, slot); - if (found_key.objectid != BTRFS_EXTENT_CSUM_OBJECTID || - found_key.type != BTRFS_EXTENT_CSUM_KEY) { - found_next = 1; - goto insert; - } - next_offset = found_key.offset; + /* We didn't find a csum item, insert one. */ + ret = find_next_csum_offset(root, path, &next_offset); + if (ret < 0) + goto out; found_next = 1; goto insert; } @@ -1055,8 +1070,48 @@ extend_csum: tmp = sums->len - total_bytes; tmp >>= fs_info->sectorsize_bits; WARN_ON(tmp < 1); + extend_nr = max_t(int, 1, tmp); + + /* + * A log tree can already have checksum items with a subset of + * the checksums we are trying to log. This can happen after + * doing a sequence of partial writes into prealloc extents and + * fsyncs in between, with a full fsync logging a larger subrange + * of an extent for which a previous fast fsync logged a smaller + * subrange. 
And this happens in particular due to merging file + * extent items when we complete an ordered extent for a range + * covered by a prealloc extent - this is done at + * btrfs_mark_extent_written(). + * + * So if we try to extend the previous checksum item, which has + * a range that ends at the start of the range we want to insert, + * make sure we don't extend beyond the start offset of the next + * checksum item. If we are at the last item in the leaf, then + * forget the optimization of extending and add a new checksum + * item - it is not worth the complexity of releasing the path, + * getting the first key for the next leaf, repeat the btree + * search, etc, because log trees are temporary anyway and it + * would only save a few bytes of leaf space. + */ + if (root->root_key.objectid == BTRFS_TREE_LOG_OBJECTID) { + if (path->slots[0] + 1 >= + btrfs_header_nritems(path->nodes[0])) { + ret = find_next_csum_offset(root, path, &next_offset); + if (ret < 0) + goto out; + found_next = 1; + goto insert; + } + + ret = find_next_csum_offset(root, path, &next_offset); + if (ret < 0) + goto out; + + tmp = (next_offset - bytenr) >> fs_info->sectorsize_bits; + if (tmp <= INT_MAX) + extend_nr = min_t(int, extend_nr, tmp); + } - extend_nr = max_t(int, 1, (int)tmp); diff = (csum_offset + extend_nr) * csum_size; diff = min(diff, MAX_CSUM_ITEMS(fs_info, csum_size) * csum_size); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index 0e155f013839..3b10d98b4ebb 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -2014,14 +2014,8 @@ static ssize_t btrfs_file_write_iter(struct kiocb *iocb, else num_written = btrfs_buffered_write(iocb, from); - /* - * We also have to set last_sub_trans to the current log transid, - * otherwise subsequent syncs to a file that's been synced in this - * transaction will appear to have already occurred. 
- */ - spin_lock(&inode->lock); - inode->last_sub_trans = inode->root->log_transid; - spin_unlock(&inode->lock); + btrfs_set_inode_last_sub_trans(inode); + if (num_written > 0) num_written = generic_write_sync(iocb, num_written); @@ -2073,6 +2067,30 @@ static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end) return ret; } +static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx) +{ + struct btrfs_inode *inode = BTRFS_I(ctx->inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + if (btrfs_inode_in_log(inode, fs_info->generation) && + list_empty(&ctx->ordered_extents)) + return true; + + /* + * If we are doing a fast fsync we can not bail out if the inode's + * last_trans is <= then the last committed transaction, because we only + * update the last_trans of the inode during ordered extent completion, + * and for a fast fsync we don't wait for that, we only wait for the + * writeback to complete. + */ + if (inode->last_trans <= fs_info->last_trans_committed && + (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) || + list_empty(&ctx->ordered_extents))) + return true; + + return false; +} + /* * fsync call for both files and directories. This logs the inode into * the tree log instead of forcing full commits whenever possible. @@ -2122,7 +2140,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) if (ret) goto out; - inode_lock(inode); + btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); atomic_inc(&root->log_batch); @@ -2135,11 +2153,11 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) &BTRFS_I(inode)->runtime_flags); /* - * Before we acquired the inode's lock, someone may have dirtied more - * pages in the target range. 
We need to make sure that writeback for - * any such pages does not start while we are logging the inode, because - * if it does, any of the following might happen when we are not doing a - * full inode sync: + * Before we acquired the inode's lock and the mmap lock, someone may + * have dirtied more pages in the target range. We need to make sure + * that writeback for any such pages does not start while we are logging + * the inode, because if it does, any of the following might happen when + * we are not doing a full inode sync: * * 1) We log an extent after its writeback finishes but before its * checksums are added to the csum tree, leading to -EIO errors @@ -2154,7 +2172,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) */ ret = start_ordered_ops(inode, start, end); if (ret) { - inode_unlock(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); goto out; } @@ -2191,17 +2209,8 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) atomic_inc(&root->log_batch); - /* - * If we are doing a fast fsync we can not bail out if the inode's - * last_trans is <= then the last committed transaction, because we only - * update the last_trans of the inode during ordered extent completion, - * and for a fast fsync we don't wait for that, we only wait for the - * writeback to complete. - */ smp_mb(); - if (btrfs_inode_in_log(BTRFS_I(inode), fs_info->generation) || - (BTRFS_I(inode)->last_trans <= fs_info->last_trans_committed && - (full_sync || list_empty(&ctx.ordered_extents)))) { + if (skip_inode_logging(&ctx)) { /* * We've had everything committed since the last time we were * modified so clear this flag in case it was set for whatever @@ -2255,7 +2264,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync) * file again, but that will end up using the synchronization * inside btrfs_sync_log to keep things safe. 
*/ - inode_unlock(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); if (ret != BTRFS_NO_LOG_SYNC) { if (!ret) { @@ -2285,7 +2294,7 @@ out: out_release_extents: btrfs_release_log_ctx_extents(&ctx); - inode_unlock(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); goto out; } @@ -2605,16 +2614,17 @@ static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans, * extents without inserting a new one, so we must abort the transaction to avoid * a corruption. */ -int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, - const u64 start, const u64 end, - struct btrfs_replace_extent_info *extent_info, - struct btrfs_trans_handle **trans_out) +int btrfs_replace_file_extents(struct btrfs_inode *inode, + struct btrfs_path *path, const u64 start, + const u64 end, + struct btrfs_replace_extent_info *extent_info, + struct btrfs_trans_handle **trans_out) { struct btrfs_drop_extents_args drop_args = { 0 }; - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1); - u64 ino_size = round_up(inode->i_size, fs_info->sectorsize); - struct btrfs_root *root = BTRFS_I(inode)->root; + u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize); struct btrfs_trans_handle *trans = NULL; struct btrfs_block_rsv *rsv; unsigned int rsv_count; @@ -2662,10 +2672,10 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, drop_args.drop_cache = true; while (cur_offset < end) { drop_args.start = cur_offset; - ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); /* If we are punching a hole decrement the inode's byte count */ if (!extent_info) - btrfs_update_inode_bytes(BTRFS_I(inode), 0, + btrfs_update_inode_bytes(inode, 0, drop_args.bytes_found); if (ret != -ENOSPC) { /* @@ -2685,8 +2695,8 @@ int 
btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, if (!extent_info && cur_offset < drop_args.drop_end && cur_offset < ino_size) { - ret = fill_holes(trans, BTRFS_I(inode), path, - cur_offset, drop_args.drop_end); + ret = fill_holes(trans, inode, path, cur_offset, + drop_args.drop_end); if (ret) { /* * If we failed then we didn't insert our hole @@ -2704,7 +2714,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, * know to not set disk_i_size in this area until a new * file extent is inserted here. */ - ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), + ret = btrfs_inode_clear_file_extent_range(inode, cur_offset, drop_args.drop_end - cur_offset); if (ret) { @@ -2723,8 +2733,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, u64 replace_len = drop_args.drop_end - extent_info->file_offset; - ret = btrfs_insert_replace_extent(trans, BTRFS_I(inode), - path, extent_info, replace_len, + ret = btrfs_insert_replace_extent(trans, inode, path, + extent_info, replace_len, drop_args.bytes_found); if (ret) { btrfs_abort_transaction(trans, ret); @@ -2735,9 +2745,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, extent_info->file_offset += replace_len; } - cur_offset = drop_args.drop_end; - - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, inode); if (ret) break; @@ -2756,9 +2764,10 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, BUG_ON(ret); /* shouldn't happen */ trans->block_rsv = rsv; - if (!extent_info) { - ret = find_first_non_hole(BTRFS_I(inode), &cur_offset, - &len); + cur_offset = drop_args.drop_end; + len = end - cur_offset; + if (!extent_info && len) { + ret = find_first_non_hole(inode, &cur_offset, &len); if (unlikely(ret < 0)) break; if (ret && !len) { @@ -2771,14 +2780,11 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, /* * If we were 
cloning, force the next fsync to be a full one since we * we replaced (or just dropped in the case of cloning holes when - * NO_HOLES is enabled) extents and extent maps. - * This is for the sake of simplicity, and cloning into files larger - * than 16Mb would force the full fsync any way (when - * try_release_extent_mapping() is invoked during page cache truncation. + * NO_HOLES is enabled) file extent items and did not setup new extent + * maps for the replacement extents (or holes). */ if (extent_info && !extent_info->is_new_extent) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); if (ret) goto out_trans; @@ -2804,8 +2810,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, */ if (!extent_info && cur_offset < ino_size && cur_offset < drop_args.drop_end) { - ret = fill_holes(trans, BTRFS_I(inode), path, - cur_offset, drop_args.drop_end); + ret = fill_holes(trans, inode, path, cur_offset, + drop_args.drop_end); if (ret) { /* Same comment as above. */ btrfs_abort_transaction(trans, ret); @@ -2813,8 +2819,8 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, } } else if (!extent_info && cur_offset < drop_args.drop_end) { /* See the comment in the loop above for the reasoning here. 
*/ - ret = btrfs_inode_clear_file_extent_range(BTRFS_I(inode), - cur_offset, drop_args.drop_end - cur_offset); + ret = btrfs_inode_clear_file_extent_range(inode, cur_offset, + drop_args.drop_end - cur_offset); if (ret) { btrfs_abort_transaction(trans, ret); goto out_trans; @@ -2822,7 +2828,7 @@ int btrfs_replace_file_extents(struct inode *inode, struct btrfs_path *path, } if (extent_info) { - ret = btrfs_insert_replace_extent(trans, BTRFS_I(inode), path, + ret = btrfs_insert_replace_extent(trans, inode, path, extent_info, extent_info->data_len, drop_args.bytes_found); if (ret) { @@ -2868,7 +2874,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) if (ret) return ret; - inode_lock(inode); + btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); ino_size = round_up(inode->i_size, fs_info->sectorsize); ret = find_first_non_hole(BTRFS_I(inode), &offset, &len); if (ret < 0) @@ -2908,7 +2914,7 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) truncated_block = true; ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); if (ret) { - inode_unlock(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); return ret; } } @@ -2967,8 +2973,8 @@ static int btrfs_punch_hole(struct inode *inode, loff_t offset, loff_t len) goto out; } - ret = btrfs_replace_file_extents(inode, path, lockstart, lockend, NULL, - &trans); + ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart, + lockend, NULL, &trans); btrfs_free_path(path); if (ret) goto out; @@ -3009,7 +3015,7 @@ out_only_mutex: ret = ret2; } } - inode_unlock(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); return ret; } @@ -3335,7 +3341,7 @@ static long btrfs_fallocate(struct file *file, int mode, return ret; } - btrfs_inode_lock(inode, 0); + btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP); if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) { ret = inode_newsize_ok(inode, offset + len); @@ -3377,7 +3383,7 @@ static long btrfs_fallocate(struct file *file, int 
mode, if (mode & FALLOC_FL_ZERO_RANGE) { ret = btrfs_zero_range(inode, offset, len, mode); - inode_unlock(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); return ret; } @@ -3487,7 +3493,7 @@ out_unlock: unlock_extent_cached(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, &cached_state); out: - inode_unlock(inode); + btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP); /* Let go of our reservation. */ if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE)) btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved, @@ -3496,13 +3502,13 @@ out: return ret; } -static loff_t find_desired_extent(struct inode *inode, loff_t offset, +static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset, int whence) { - struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map *em = NULL; struct extent_state *cached_state = NULL; - loff_t i_size = inode->i_size; + loff_t i_size = inode->vfs_inode.i_size; u64 lockstart; u64 lockend; u64 start; @@ -3525,11 +3531,10 @@ static loff_t find_desired_extent(struct inode *inode, loff_t offset, lockend--; len = lockend - lockstart + 1; - lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + lock_extent_bits(&inode->io_tree, lockstart, lockend, &cached_state); while (start < i_size) { - em = btrfs_get_extent_fiemap(BTRFS_I(inode), start, len); + em = btrfs_get_extent_fiemap(inode, start, len); if (IS_ERR(em)) { ret = PTR_ERR(em); em = NULL; @@ -3551,7 +3556,7 @@ static loff_t find_desired_extent(struct inode *inode, loff_t offset, cond_resched(); } free_extent_map(em); - unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart, lockend, + unlock_extent_cached(&inode->io_tree, lockstart, lockend, &cached_state); if (ret) { offset = ret; @@ -3575,7 +3580,7 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence) case SEEK_DATA: case SEEK_HOLE: btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED); - offset = 
find_desired_extent(inode, offset, whence); + offset = find_desired_extent(BTRFS_I(inode), offset, whence); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); break; } diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index 9988decd5717..4806295116d8 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c @@ -11,6 +11,7 @@ #include <linux/ratelimit.h> #include <linux/error-injection.h> #include <linux/sched/mm.h> +#include "misc.h" #include "ctree.h" #include "free-space-cache.h" #include "transaction.h" @@ -2539,6 +2540,7 @@ out: static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, u64 bytenr, u64 size, bool used) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl; u64 offset = bytenr - block_group->start; u64 to_free, to_unusable; @@ -2569,8 +2571,13 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group, } /* All the region is now unusable. 
Mark it as unused and reclaim */ - if (block_group->zone_unusable == block_group->length) + if (block_group->zone_unusable == block_group->length) { btrfs_mark_bg_unused(block_group); + } else if (block_group->zone_unusable >= + div_factor_fine(block_group->length, + fs_info->bg_reclaim_threshold)) { + btrfs_mark_bg_to_reclaim(block_group); + } return 0; } @@ -3942,7 +3949,7 @@ static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info, { struct btrfs_block_group *block_group; struct rb_node *node; - int ret; + int ret = 0; btrfs_info(fs_info, "cleaning free space cache v1"); diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index 7cdf65be3707..46f392943f4d 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -102,6 +102,7 @@ static void __endio_write_update_ordered(struct btrfs_inode *inode, * BTRFS_ILOCK_SHARED - acquire a shared lock on the inode * BTRFS_ILOCK_TRY - try to acquire the lock, if fails on first attempt * return -EAGAIN + * BTRFS_ILOCK_MMAP - acquire a write lock on the i_mmap_lock */ int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags) { @@ -122,6 +123,8 @@ int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags) } inode_lock(inode); } + if (ilock_flags & BTRFS_ILOCK_MMAP) + down_write(&BTRFS_I(inode)->i_mmap_lock); return 0; } @@ -133,6 +136,8 @@ int btrfs_inode_lock(struct inode *inode, unsigned int ilock_flags) */ void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags) { + if (ilock_flags & BTRFS_ILOCK_MMAP) + up_write(&BTRFS_I(inode)->i_mmap_lock); if (ilock_flags & BTRFS_ILOCK_SHARED) inode_unlock_shared(inode); else @@ -641,17 +646,12 @@ again: if (!ret) { unsigned long offset = offset_in_page(total_compressed); struct page *page = pages[nr_pages - 1]; - char *kaddr; /* zero the tail end of the last page, we might be * sending it down to disk */ - if (offset) { - kaddr = kmap_atomic(page); - memset(kaddr + offset, 0, - PAGE_SIZE - offset); - kunmap_atomic(kaddr); - } + if (offset) + 
memzero_page(page, offset, PAGE_SIZE - offset); will_compress = 1; } } @@ -1516,7 +1516,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page, static noinline int run_delalloc_nocow(struct btrfs_inode *inode, struct page *locked_page, const u64 start, const u64 end, - int *page_started, int force, + int *page_started, unsigned long *nr_written) { struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1530,6 +1530,7 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, u64 ino = btrfs_ino(inode); bool nocow = false; u64 disk_bytenr = 0; + const bool force = inode->flags & BTRFS_INODE_NODATACOW; path = btrfs_alloc_path(); if (!path) { @@ -1863,23 +1864,16 @@ error: return ret; } -static inline int need_force_cow(struct btrfs_inode *inode, u64 start, u64 end) +static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) { - - if (!(inode->flags & BTRFS_INODE_NODATACOW) && - !(inode->flags & BTRFS_INODE_PREALLOC)) - return 0; - - /* - * @defrag_bytes is a hint value, no spinlock held here, - * if is not zero, it means the file is defragging. - * Force cow if given extent needs to be defragged. 
- */ - if (inode->defrag_bytes && - test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, 0, NULL)) - return 1; - - return 0; + if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) { + if (inode->defrag_bytes && + test_range_bit(&inode->io_tree, start, end, EXTENT_DEFRAG, + 0, NULL)) + return false; + return true; + } + return false; } /* @@ -1891,17 +1885,12 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page struct writeback_control *wbc) { int ret; - int force_cow = need_force_cow(inode, start, end); const bool zoned = btrfs_is_zoned(inode->root->fs_info); - if (inode->flags & BTRFS_INODE_NODATACOW && !force_cow) { + if (should_nocow(inode, start, end)) { ASSERT(!zoned); ret = run_delalloc_nocow(inode, locked_page, start, end, - page_started, 1, nr_written); - } else if (inode->flags & BTRFS_INODE_PREALLOC && !force_cow) { - ASSERT(!zoned); - ret = run_delalloc_nocow(inode, locked_page, start, end, - page_started, 0, nr_written); + page_started, nr_written); } else if (!inode_can_compress(inode) || !inode_need_compress(inode, start, end)) { if (zoned) @@ -3011,6 +3000,18 @@ out: if (ret || truncated) { u64 unwritten_start = start; + /* + * If we failed to finish this ordered extent for any reason we + * need to make sure BTRFS_ORDERED_IOERR is set on the ordered + * extent, and mark the inode with the error if it wasn't + * already set. Any error during writeback would have already + * set the mapping error, so we need to set it if we're the ones + * marking this ordered extent as failed. 
+ */ + if (ret && !test_and_set_bit(BTRFS_ORDERED_IOERR, + &ordered_extent->flags)) + mapping_set_error(ordered_extent->inode->i_mapping, -EIO); + if (truncated) unwritten_start += logical_len; clear_extent_uptodate(io_tree, unwritten_start, end, NULL); @@ -3099,11 +3100,13 @@ void btrfs_writepage_endio_finish_ordered(struct page *page, u64 start, * @bio_offset: offset to the beginning of the bio (in bytes) * @page: page where is the data to be verified * @pgoff: offset inside the page + * @start: logical offset in the file * * The length of such check is always one sector size. */ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, - u32 bio_offset, struct page *page, u32 pgoff) + u32 bio_offset, struct page *page, u32 pgoff, + u64 start) { struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); @@ -3130,8 +3133,8 @@ static int check_data_csum(struct inode *inode, struct btrfs_io_bio *io_bio, kunmap_atomic(kaddr); return 0; zeroit: - btrfs_print_data_csum_error(BTRFS_I(inode), page_offset(page) + pgoff, - csum, csum_expected, io_bio->mirror_num); + btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected, + io_bio->mirror_num); if (io_bio->device) btrfs_dev_stat_inc_and_print(io_bio->device, BTRFS_DEV_STAT_CORRUPTION_ERRS); @@ -3149,10 +3152,9 @@ zeroit: * @bio_offset: offset to the beginning of the bio (in bytes) * @start: file offset of the range start * @end: file offset of the range end (inclusive) - * @mirror: mirror number */ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, - struct page *page, u64 start, u64 end, int mirror) + struct page *page, u64 start, u64 end) { struct inode *inode = page->mapping->host; struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; @@ -3184,7 +3186,8 @@ int btrfs_verify_data_csum(struct btrfs_io_bio *io_bio, u32 bio_offset, pg_off += sectorsize, bio_offset += sectorsize) { int ret; - ret = check_data_csum(inode, 
io_bio, bio_offset, page, pg_off); + ret = check_data_csum(inode, io_bio, bio_offset, page, pg_off, + page_offset(page) + pg_off); if (ret < 0) return -EIO; } @@ -3250,6 +3253,7 @@ void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info) inode = list_first_entry(&fs_info->delayed_iputs, struct btrfs_inode, delayed_iput); run_delayed_iput_locked(fs_info, inode); + cond_resched_lock(&fs_info->delayed_iput_lock); } spin_unlock(&fs_info->delayed_iput_lock); } @@ -3390,15 +3394,19 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) int is_dead_root = 0; /* - * this is an orphan in the tree root. Currently these + * This is an orphan in the tree root. Currently these * could come from 2 sources: - * a) a snapshot deletion in progress + * a) a root (snapshot/subvolume) deletion in progress * b) a free space cache inode - * We need to distinguish those two, as the snapshot - * orphan must not get deleted. - * find_dead_roots already ran before us, so if this - * is a snapshot deletion, we should find the root - * in the fs_roots radix tree. + * We need to distinguish those two, as the orphan item + * for a root must not get deleted before the deletion + * of the snapshot/subvolume's tree completes. + * + * btrfs_find_orphan_roots() ran before us, which has + * found all deleted roots and loaded them into + * fs_info->fs_roots_radix. So here we can find if an + * orphan item corresponds to a deleted root by looking + * up the root from that radix tree. 
*/ spin_lock(&fs_info->fs_roots_radix_lock); @@ -4329,7 +4337,11 @@ int btrfs_delete_subvolume(struct inode *dir, struct dentry *dentry) goto out_end_trans; } - btrfs_record_root_in_trans(trans, dest); + ret = btrfs_record_root_in_trans(trans, dest); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto out_end_trans; + } memset(&dest->root_item.drop_progress, 0, sizeof(dest->root_item.drop_progress)); @@ -4829,7 +4841,6 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; - char *kaddr; bool only_release_metadata = false; u32 blocksize = fs_info->sectorsize; pgoff_t index = from >> PAGE_SHIFT; @@ -4921,15 +4932,13 @@ again: if (offset != blocksize) { if (!len) len = blocksize - offset; - kaddr = kmap(page); if (front) - memset(kaddr + (block_start - page_offset(page)), - 0, offset); + memzero_page(page, (block_start - page_offset(page)), + offset); else - memset(kaddr + (block_start - page_offset(page)) + offset, - 0, len); + memzero_page(page, (block_start - page_offset(page)) + offset, + len); flush_dcache_page(page); - kunmap(page); } ClearPageChecked(page); set_page_dirty(page); @@ -6828,11 +6837,9 @@ static noinline int uncompress_inline(struct btrfs_path *path, * cover that region here. 
*/ - if (max_size + pg_offset < PAGE_SIZE) { - char *map = kmap(page); - memset(map + pg_offset + max_size, 0, PAGE_SIZE - max_size - pg_offset); - kunmap(page); - } + if (max_size + pg_offset < PAGE_SIZE) + memzero_page(page, pg_offset + max_size, + PAGE_SIZE - max_size - pg_offset); kfree(tmp); return ret; } @@ -7023,7 +7030,7 @@ next: if (ret) goto out; } else { - map = kmap(page); + map = kmap_local_page(page); read_extent_buffer(leaf, map + pg_offset, ptr, copy_size); if (pg_offset + copy_size < PAGE_SIZE) { @@ -7031,7 +7038,7 @@ next: PAGE_SIZE - pg_offset - copy_size); } - kunmap(page); + kunmap_local(map); } flush_dcache_page(page); } @@ -7259,6 +7266,19 @@ static struct extent_map *btrfs_new_extent_direct(struct btrfs_inode *inode, return em; } +static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) +{ + struct btrfs_block_group *block_group; + bool readonly = false; + + block_group = btrfs_lookup_block_group(fs_info, bytenr); + if (!block_group || block_group->ro) + readonly = true; + if (block_group) + btrfs_put_block_group(block_group); + return readonly; +} + /* * Check if we can do nocow write into the range [@offset, @offset + @len) * @@ -7778,7 +7798,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->bdev = fs_info->fs_devices->latest_bdev; iomap->length = len; - if (write && btrfs_use_zone_append(BTRFS_I(inode), em)) + if (write && btrfs_use_zone_append(BTRFS_I(inode), em->block_start)) iomap->flags |= IOMAP_F_ZONE_APPEND; free_extent_map(em); @@ -7910,7 +7930,8 @@ static blk_status_t btrfs_check_read_dio_bio(struct inode *inode, ASSERT(pgoff < PAGE_SIZE); if (uptodate && (!csum || !check_data_csum(inode, io_bio, - bio_offset, bvec.bv_page, pgoff))) { + bio_offset, bvec.bv_page, + pgoff, start))) { clean_io_failure(fs_info, failure_tree, io_tree, start, bvec.bv_page, btrfs_ino(BTRFS_I(inode)), @@ -8169,10 +8190,6 @@ static blk_qc_t btrfs_submit_direct(struct inode *inode, struct iomap *iomap, 
bio->bi_end_io = btrfs_end_dio_bio; btrfs_io_bio(bio)->logical = file_offset; - WARN_ON_ONCE(write && btrfs_is_zoned(fs_info) && - fs_info->max_zone_append_size && - bio_op(bio) != REQ_OP_ZONE_APPEND); - if (bio_op(bio) == REQ_OP_ZONE_APPEND) { status = extract_ordered_extent(BTRFS_I(inode), bio, file_offset); @@ -8403,17 +8420,11 @@ again: * for the finish_ordered_io */ if (TestClearPagePrivate2(page)) { - struct btrfs_ordered_inode_tree *tree; - u64 new_len; - - tree = &inode->ordered_tree; - - spin_lock_irq(&tree->lock); + spin_lock_irq(&inode->ordered_tree.lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); - new_len = start - ordered->file_offset; - if (new_len < ordered->truncated_len) - ordered->truncated_len = new_len; - spin_unlock_irq(&tree->lock); + ordered->truncated_len = min(ordered->truncated_len, + start - ordered->file_offset); + spin_unlock_irq(&inode->ordered_tree.lock); if (btrfs_dec_test_ordered_pending(inode, &ordered, start, @@ -8498,7 +8509,6 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) struct btrfs_ordered_extent *ordered; struct extent_state *cached_state = NULL; struct extent_changeset *data_reserved = NULL; - char *kaddr; unsigned long zero_start; loff_t size; vm_fault_t ret; @@ -8539,6 +8549,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) ret = VM_FAULT_NOPAGE; /* make the VM retry the fault */ again: + down_read(&BTRFS_I(inode)->i_mmap_lock); lock_page(page); size = i_size_read(inode); @@ -8567,6 +8578,7 @@ again: unlock_extent_cached(io_tree, page_start, page_end, &cached_state); unlock_page(page); + up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_start_ordered_extent(ordered, 1); btrfs_put_ordered_extent(ordered); goto again; @@ -8610,20 +8622,17 @@ again: zero_start = PAGE_SIZE; if (zero_start != PAGE_SIZE) { - kaddr = kmap(page); - memset(kaddr + zero_start, 0, PAGE_SIZE - zero_start); + memzero_page(page, zero_start, PAGE_SIZE - zero_start); flush_dcache_page(page); - kunmap(page); } ClearPageChecked(page); 
set_page_dirty(page); SetPageUptodate(page); - BTRFS_I(inode)->last_trans = fs_info->generation; - BTRFS_I(inode)->last_sub_trans = BTRFS_I(inode)->root->log_transid; - BTRFS_I(inode)->last_log_commit = BTRFS_I(inode)->root->last_log_commit; + btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); unlock_extent_cached(io_tree, page_start, page_end, &cached_state); + up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); sb_end_pagefault(inode->i_sb); @@ -8632,6 +8641,7 @@ again: out_unlock: unlock_page(page); + up_read(&BTRFS_I(inode)->i_mmap_lock); out: btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, @@ -8883,6 +8893,7 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) INIT_LIST_HEAD(&ei->delalloc_inodes); INIT_LIST_HEAD(&ei->delayed_iput); RB_CLEAR_NODE(&ei->rb_node); + init_rwsem(&ei->i_mmap_lock); return inode; } @@ -9077,6 +9088,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, int ret2; bool root_log_pinned = false; bool dest_log_pinned = false; + bool need_abort = false; /* we only allow rename subvolume link between subvolumes */ if (old_ino != BTRFS_FIRST_FREE_OBJECTID && root != dest) @@ -9101,8 +9113,11 @@ static int btrfs_rename_exchange(struct inode *old_dir, goto out_notrans; } - if (dest != root) - btrfs_record_root_in_trans(trans, dest); + if (dest != root) { + ret = btrfs_record_root_in_trans(trans, dest); + if (ret) + goto out_fail; + } /* * We need to find a free sequence number both in the source and @@ -9133,6 +9148,7 @@ static int btrfs_rename_exchange(struct inode *old_dir, old_idx); if (ret) goto out_fail; + need_abort = true; } /* And now for the dest. 
*/ @@ -9148,8 +9164,11 @@ static int btrfs_rename_exchange(struct inode *old_dir, new_ino, btrfs_ino(BTRFS_I(old_dir)), new_idx); - if (ret) + if (ret) { + if (need_abort) + btrfs_abort_transaction(trans, ret); goto out_fail; + } } /* Update inode version and ctime/mtime. */ @@ -9406,8 +9425,11 @@ static int btrfs_rename(struct inode *old_dir, struct dentry *old_dentry, goto out_notrans; } - if (dest != root) - btrfs_record_root_in_trans(trans, dest); + if (dest != root) { + ret = btrfs_record_root_in_trans(trans, dest); + if (ret) + goto out_fail; + } ret = btrfs_set_inode_index(BTRFS_I(new_dir), &index); if (ret) @@ -9674,7 +9696,7 @@ out: return ret; } -int btrfs_start_delalloc_snapshot(struct btrfs_root *root) +int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context) { struct writeback_control wbc = { .nr_to_write = LONG_MAX, @@ -9687,7 +9709,7 @@ int btrfs_start_delalloc_snapshot(struct btrfs_root *root) if (test_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state)) return -EROFS; - return start_delalloc_inodes(root, &wbc, true, false); + return start_delalloc_inodes(root, &wbc, true, in_reclaim_context); } int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, @@ -9919,7 +9941,7 @@ static struct btrfs_trans_handle *insert_prealloc_file_extent( goto free_qgroup; } - ret = btrfs_replace_file_extents(&inode->vfs_inode, path, file_offset, + ret = btrfs_replace_file_extents(inode, path, file_offset, file_offset + len - 1, &extent_info, &trans); btrfs_free_path(path); @@ -10603,6 +10625,8 @@ static const struct inode_operations btrfs_dir_inode_operations = { .set_acl = btrfs_set_acl, .update_time = btrfs_update_time, .tmpfile = btrfs_tmpfile, + .fileattr_get = btrfs_fileattr_get, + .fileattr_set = btrfs_fileattr_set, }; static const struct file_operations btrfs_dir_file_operations = { @@ -10656,6 +10680,8 @@ static const struct inode_operations btrfs_file_inode_operations = { .get_acl = btrfs_get_acl, .set_acl = btrfs_set_acl, 
.update_time = btrfs_update_time, + .fileattr_get = btrfs_fileattr_get, + .fileattr_set = btrfs_fileattr_set, }; static const struct inode_operations btrfs_special_inode_operations = { .getattr = btrfs_getattr, diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index e8d53fea4c61..5dc2fd843ae3 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -26,6 +26,7 @@ #include <linux/btrfs.h> #include <linux/uaccess.h> #include <linux/iversion.h> +#include <linux/fileattr.h> #include "ctree.h" #include "disk-io.h" #include "export.h" @@ -153,16 +154,6 @@ void btrfs_sync_inode_flags_to_i_flags(struct inode *inode) new_fl); } -static int btrfs_ioctl_getflags(struct file *file, void __user *arg) -{ - struct btrfs_inode *binode = BTRFS_I(file_inode(file)); - unsigned int flags = btrfs_inode_flags_to_fsflags(binode->flags); - - if (copy_to_user(arg, &flags, sizeof(flags))) - return -EFAULT; - return 0; -} - /* * Check if @flags are a supported and valid set of FS_*_FL flags and that * the old and new flags are not conflicting @@ -201,9 +192,22 @@ static int check_fsflags_compatible(struct btrfs_fs_info *fs_info, return 0; } -static int btrfs_ioctl_setflags(struct file *file, void __user *arg) +/* + * Set flags/xflags from the internal inode flags. The remaining items of + * fsxattr are zeroed. 
+ */ +int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) +{ + struct btrfs_inode *binode = BTRFS_I(d_inode(dentry)); + + fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode->flags)); + return 0; +} + +int btrfs_fileattr_set(struct user_namespace *mnt_userns, + struct dentry *dentry, struct fileattr *fa) { - struct inode *inode = file_inode(file); + struct inode *inode = d_inode(dentry); struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb); struct btrfs_inode *binode = BTRFS_I(inode); struct btrfs_root *root = binode->root; @@ -213,34 +217,21 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) const char *comp = NULL; u32 binode_flags; - if (!inode_owner_or_capable(&init_user_ns, inode)) - return -EPERM; - if (btrfs_root_readonly(root)) return -EROFS; - if (copy_from_user(&fsflags, arg, sizeof(fsflags))) - return -EFAULT; - - ret = mnt_want_write_file(file); - if (ret) - return ret; + if (fileattr_has_fsx(fa)) + return -EOPNOTSUPP; - inode_lock(inode); - fsflags = btrfs_mask_fsflags_for_type(inode, fsflags); + fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags); old_fsflags = btrfs_inode_flags_to_fsflags(binode->flags); - - ret = vfs_ioc_setflags_prepare(inode, old_fsflags, fsflags); - if (ret) - goto out_unlock; - ret = check_fsflags(old_fsflags, fsflags); if (ret) - goto out_unlock; + return ret; ret = check_fsflags_compatible(fs_info, fsflags); if (ret) - goto out_unlock; + return ret; binode_flags = binode->flags; if (fsflags & FS_SYNC_FL) @@ -263,6 +254,16 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) binode_flags |= BTRFS_INODE_NOATIME; else binode_flags &= ~BTRFS_INODE_NOATIME; + + /* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */ + if (!fa->flags_valid) { + /* 1 item for the inode */ + trans = btrfs_start_transaction(root, 1); + if (IS_ERR(trans)) + return PTR_ERR(trans); + goto update_flags; + } + if (fsflags & FS_DIRSYNC_FL) binode_flags |= BTRFS_INODE_DIRSYNC; else 
@@ -303,10 +304,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) binode_flags |= BTRFS_INODE_NOCOMPRESS; } else if (fsflags & FS_COMPR_FL) { - if (IS_SWAPFILE(inode)) { - ret = -ETXTBSY; - goto out_unlock; - } + if (IS_SWAPFILE(inode)) + return -ETXTBSY; binode_flags |= BTRFS_INODE_COMPRESS; binode_flags &= ~BTRFS_INODE_NOCOMPRESS; @@ -323,10 +322,8 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) * 2 for properties */ trans = btrfs_start_transaction(root, 3); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out_unlock; - } + if (IS_ERR(trans)) + return PTR_ERR(trans); if (comp) { ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp, @@ -344,6 +341,7 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) } } +update_flags: binode->flags = binode_flags; btrfs_sync_inode_flags_to_i_flags(inode); inode_inc_iversion(inode); @@ -352,44 +350,9 @@ static int btrfs_ioctl_setflags(struct file *file, void __user *arg) out_end_trans: btrfs_end_transaction(trans); - out_unlock: - inode_unlock(inode); - mnt_drop_write_file(file); return ret; } -/* - * Translate btrfs internal inode flags to xflags as expected by the - * FS_IOC_FSGETXATT ioctl. Filter only the supported ones, unknown flags are - * silently dropped. 
- */ -static unsigned int btrfs_inode_flags_to_xflags(unsigned int flags) -{ - unsigned int xflags = 0; - - if (flags & BTRFS_INODE_APPEND) - xflags |= FS_XFLAG_APPEND; - if (flags & BTRFS_INODE_IMMUTABLE) - xflags |= FS_XFLAG_IMMUTABLE; - if (flags & BTRFS_INODE_NOATIME) - xflags |= FS_XFLAG_NOATIME; - if (flags & BTRFS_INODE_NODUMP) - xflags |= FS_XFLAG_NODUMP; - if (flags & BTRFS_INODE_SYNC) - xflags |= FS_XFLAG_SYNC; - - return xflags; -} - -/* Check if @flags are a supported and valid set of FS_XFLAGS_* flags */ -static int check_xflags(unsigned int flags) -{ - if (flags & ~(FS_XFLAG_APPEND | FS_XFLAG_IMMUTABLE | FS_XFLAG_NOATIME | - FS_XFLAG_NODUMP | FS_XFLAG_SYNC)) - return -EOPNOTSUPP; - return 0; -} - bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type) { @@ -402,111 +365,6 @@ void btrfs_exclop_finish(struct btrfs_fs_info *fs_info) sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation"); } -/* - * Set the xflags from the internal inode flags. The remaining items of fsxattr - * are zeroed. 
- */ -static int btrfs_ioctl_fsgetxattr(struct file *file, void __user *arg) -{ - struct btrfs_inode *binode = BTRFS_I(file_inode(file)); - struct fsxattr fa; - - simple_fill_fsxattr(&fa, btrfs_inode_flags_to_xflags(binode->flags)); - if (copy_to_user(arg, &fa, sizeof(fa))) - return -EFAULT; - - return 0; -} - -static int btrfs_ioctl_fssetxattr(struct file *file, void __user *arg) -{ - struct inode *inode = file_inode(file); - struct btrfs_inode *binode = BTRFS_I(inode); - struct btrfs_root *root = binode->root; - struct btrfs_trans_handle *trans; - struct fsxattr fa, old_fa; - unsigned old_flags; - unsigned old_i_flags; - int ret = 0; - - if (!inode_owner_or_capable(&init_user_ns, inode)) - return -EPERM; - - if (btrfs_root_readonly(root)) - return -EROFS; - - if (copy_from_user(&fa, arg, sizeof(fa))) - return -EFAULT; - - ret = check_xflags(fa.fsx_xflags); - if (ret) - return ret; - - if (fa.fsx_extsize != 0 || fa.fsx_projid != 0 || fa.fsx_cowextsize != 0) - return -EOPNOTSUPP; - - ret = mnt_want_write_file(file); - if (ret) - return ret; - - inode_lock(inode); - - old_flags = binode->flags; - old_i_flags = inode->i_flags; - - simple_fill_fsxattr(&old_fa, - btrfs_inode_flags_to_xflags(binode->flags)); - ret = vfs_ioc_fssetxattr_check(inode, &old_fa, &fa); - if (ret) - goto out_unlock; - - if (fa.fsx_xflags & FS_XFLAG_SYNC) - binode->flags |= BTRFS_INODE_SYNC; - else - binode->flags &= ~BTRFS_INODE_SYNC; - if (fa.fsx_xflags & FS_XFLAG_IMMUTABLE) - binode->flags |= BTRFS_INODE_IMMUTABLE; - else - binode->flags &= ~BTRFS_INODE_IMMUTABLE; - if (fa.fsx_xflags & FS_XFLAG_APPEND) - binode->flags |= BTRFS_INODE_APPEND; - else - binode->flags &= ~BTRFS_INODE_APPEND; - if (fa.fsx_xflags & FS_XFLAG_NODUMP) - binode->flags |= BTRFS_INODE_NODUMP; - else - binode->flags &= ~BTRFS_INODE_NODUMP; - if (fa.fsx_xflags & FS_XFLAG_NOATIME) - binode->flags |= BTRFS_INODE_NOATIME; - else - binode->flags &= ~BTRFS_INODE_NOATIME; - - /* 1 item for the inode */ - trans = 
btrfs_start_transaction(root, 1); - if (IS_ERR(trans)) { - ret = PTR_ERR(trans); - goto out_unlock; - } - - btrfs_sync_inode_flags_to_i_flags(inode); - inode_inc_iversion(inode); - inode->i_ctime = current_time(inode); - ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); - - btrfs_end_transaction(trans); - -out_unlock: - if (ret) { - binode->flags = old_flags; - inode->i_flags = old_i_flags; - } - - inode_unlock(inode); - mnt_drop_write_file(file); - - return ret; -} - static int btrfs_ioctl_getversion(struct file *file, int __user *arg) { struct inode *inode = file_inode(file); @@ -697,8 +555,6 @@ static noinline int create_subvol(struct inode *dir, btrfs_set_root_otransid(root_item, trans->transid); btrfs_tree_unlock(leaf); - free_extent_buffer(leaf); - leaf = NULL; btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID); @@ -707,8 +563,22 @@ static noinline int create_subvol(struct inode *dir, key.type = BTRFS_ROOT_ITEM_KEY; ret = btrfs_insert_root(trans, fs_info->tree_root, &key, root_item); - if (ret) + if (ret) { + /* + * Since we don't abort the transaction in this case, free the + * tree block so that we don't leak space and leave the + * filesystem in an inconsistent state (an extent item in the + * extent tree without backreferences). Also no need to have + * the tree block locked since it is not in any tree at this + * point, so no other task can find it and use it. 
+ */ + btrfs_free_tree_block(trans, root, leaf, 0, 1); + free_extent_buffer(leaf); goto fail; + } + + free_extent_buffer(leaf); + leaf = NULL; key.offset = (u64)-1; new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev); @@ -721,7 +591,12 @@ static noinline int create_subvol(struct inode *dir, /* Freeing will be done in btrfs_put_root() of new_root */ anon_dev = 0; - btrfs_record_root_in_trans(trans, new_root); + ret = btrfs_record_root_in_trans(trans, new_root); + if (ret) { + btrfs_put_root(new_root); + btrfs_abort_transaction(trans, ret); + goto fail; + } ret = btrfs_create_subvol_root(trans, new_root, root); btrfs_put_root(new_root); @@ -1014,7 +889,7 @@ out_up_read: out_dput: dput(dentry); out_unlock: - inode_unlock(dir); + btrfs_inode_unlock(dir, 0); return error; } @@ -1034,7 +909,7 @@ static noinline int btrfs_mksnapshot(const struct path *parent, */ btrfs_drew_read_lock(&root->snapshot_lock); - ret = btrfs_start_delalloc_snapshot(root); + ret = btrfs_start_delalloc_snapshot(root, false); if (ret) goto out; @@ -1612,7 +1487,7 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ra_index += cluster; } - inode_lock(inode); + btrfs_inode_lock(inode, 0); if (IS_SWAPFILE(inode)) { ret = -ETXTBSY; } else { @@ -1621,13 +1496,13 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, ret = cluster_pages_for_defrag(inode, pages, i, cluster); } if (ret < 0) { - inode_unlock(inode); + btrfs_inode_unlock(inode, 0); goto out_ra; } defrag_count += ret; balance_dirty_pages_ratelimited(inode->i_mapping); - inode_unlock(inode); + btrfs_inode_unlock(inode, 0); if (newer_than) { if (newer_off == (u64)-1) @@ -1675,9 +1550,9 @@ int btrfs_defrag_file(struct inode *inode, struct file *file, out_ra: if (do_compress) { - inode_lock(inode); + btrfs_inode_lock(inode, 0); BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; - inode_unlock(inode); + btrfs_inode_unlock(inode, 0); } if (!file) kfree(ra); @@ -3112,9 +2987,9 @@ static noinline int 
btrfs_ioctl_snap_destroy(struct file *file, goto out_dput; } - inode_lock(inode); + btrfs_inode_lock(inode, 0); err = btrfs_delete_subvolume(dir, dentry); - inode_unlock(inode); + btrfs_inode_unlock(inode, 0); if (!err) { fsnotify_rmdir(dir, dentry); d_delete(dentry); @@ -3123,7 +2998,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, out_dput: dput(dentry); out_unlock_dir: - inode_unlock(dir); + btrfs_inode_unlock(dir, 0); free_subvol_name: kfree(subvol_name_ptr); free_parent: @@ -4915,10 +4790,6 @@ long btrfs_ioctl(struct file *file, unsigned int void __user *argp = (void __user *)arg; switch (cmd) { - case FS_IOC_GETFLAGS: - return btrfs_ioctl_getflags(file, argp); - case FS_IOC_SETFLAGS: - return btrfs_ioctl_setflags(file, argp); case FS_IOC_GETVERSION: return btrfs_ioctl_getversion(file, argp); case FS_IOC_GETFSLABEL: @@ -5044,10 +4915,6 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_get_features(fs_info, argp); case BTRFS_IOC_SET_FEATURES: return btrfs_ioctl_set_features(file, argp); - case FS_IOC_FSGETXATTR: - return btrfs_ioctl_fsgetxattr(file, argp); - case FS_IOC_FSSETXATTR: - return btrfs_ioctl_fssetxattr(file, argp); case BTRFS_IOC_GET_SUBVOL_INFO: return btrfs_ioctl_get_subvol_info(file, argp); case BTRFS_IOC_GET_SUBVOL_ROOTREF: @@ -5067,12 +4934,6 @@ long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) * handling is necessary. 
*/ switch (cmd) { - case FS_IOC32_GETFLAGS: - cmd = FS_IOC_GETFLAGS; - break; - case FS_IOC32_SETFLAGS: - cmd = FS_IOC_SETFLAGS; - break; case FS_IOC32_GETVERSION: cmd = FS_IOC_GETVERSION; break; diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index 9084a950dc09..cd042c7567a4 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -118,7 +118,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, struct workspace *workspace = list_entry(ws, struct workspace, list); int ret = 0; char *data_in; - char *cpage_out; + char *cpage_out, *sizes_ptr; int nr_pages = 0; struct page *in_page = NULL; struct page *out_page = NULL; @@ -258,10 +258,9 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping, } /* store the size of all chunks of compressed data */ - cpage_out = kmap(pages[0]); - write_compress_length(cpage_out, tot_out); - - kunmap(pages[0]); + sizes_ptr = kmap_local_page(pages[0]); + write_compress_length(sizes_ptr, tot_out); + kunmap_local(sizes_ptr); ret = 0; *total_out = tot_out; diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 985a21558437..6c413bb451a3 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -107,17 +107,6 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 file_offset, return NULL; } -/* - * helper to check if a given offset is inside a given entry - */ -static int offset_in_entry(struct btrfs_ordered_extent *entry, u64 file_offset) -{ - if (file_offset < entry->file_offset || - entry->file_offset + entry->num_bytes <= file_offset) - return 0; - return 1; -} - static int range_overlaps(struct btrfs_ordered_extent *entry, u64 file_offset, u64 len) { @@ -142,7 +131,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree, if (tree->last) { entry = rb_entry(tree->last, struct btrfs_ordered_extent, rb_node); - if (offset_in_entry(entry, file_offset)) + if (in_range(file_offset, entry->file_offset, entry->num_bytes)) return 
tree->last; } ret = __tree_search(root, file_offset, &prev); @@ -349,7 +338,7 @@ bool btrfs_dec_test_first_ordered_pending(struct btrfs_inode *inode, goto out; entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (!offset_in_entry(entry, *file_offset)) + if (!in_range(*file_offset, entry->file_offset, entry->num_bytes)) goto out; dec_start = max(*file_offset, entry->file_offset); @@ -428,7 +417,7 @@ bool btrfs_dec_test_ordered_pending(struct btrfs_inode *inode, entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); have_entry: - if (!offset_in_entry(entry, file_offset)) + if (!in_range(file_offset, entry->file_offset, entry->num_bytes)) goto out; if (io_size > entry->bytes_left) @@ -779,7 +768,7 @@ struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *ino goto out; entry = rb_entry(node, struct btrfs_ordered_extent, rb_node); - if (!offset_in_entry(entry, file_offset)) + if (!in_range(file_offset, entry->file_offset, entry->num_bytes)) entry = NULL; if (entry) refcount_inc(&entry->refs); @@ -995,7 +984,7 @@ int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre, if (pre) ret = clone_ordered_extent(ordered, 0, pre); - if (post) + if (ret == 0 && post) ret = clone_ordered_extent(ordered, pre + ordered->disk_num_bytes, post); diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 99e0853e4d3b..e60c07f36427 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -39,8 +39,8 @@ struct btrfs_ordered_sum { */ enum { /* - * Different types for direct io, one and only one of the 4 type can - * be set when creating ordered extent. + * Different types for ordered extents, one and only one of the 4 types + * need to be set when creating ordered extent. 
* * REGULAR: For regular non-compressed COW write * NOCOW: For NOCOW write into existing non-hole extent diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index 14ff388fd3bd..3ded812f522c 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -23,6 +23,7 @@ #include "qgroup.h" #include "block-group.h" #include "sysfs.h" +#include "tree-mod-log.h" /* TODO XXX FIXME * - subvol delete -> delete when ref goes to 0? delete limits also? @@ -226,7 +227,6 @@ static void __del_qgroup_rb(struct btrfs_fs_info *fs_info, { struct btrfs_qgroup_list *list; - btrfs_sysfs_del_one_qgroup(fs_info, qgroup); list_del(&qgroup->dirty); while (!list_empty(&qgroup->groups)) { list = list_first_entry(&qgroup->groups, @@ -243,7 +243,6 @@ static void __del_qgroup_rb(struct btrfs_fs_info *fs_info, list_del(&list->next_member); kfree(list); } - kfree(qgroup); } /* must be called with qgroup_lock held */ @@ -569,6 +568,8 @@ void btrfs_free_qgroup_config(struct btrfs_fs_info *fs_info) qgroup = rb_entry(n, struct btrfs_qgroup, node); rb_erase(n, &fs_info->qgroup_tree); __del_qgroup_rb(fs_info, qgroup); + btrfs_sysfs_del_one_qgroup(fs_info, qgroup); + kfree(qgroup); } /* * We call btrfs_free_qgroup_config() when unmounting @@ -1578,6 +1579,14 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) spin_lock(&fs_info->qgroup_lock); del_qgroup_rb(fs_info, qgroupid); spin_unlock(&fs_info->qgroup_lock); + + /* + * Remove the qgroup from sysfs now without holding the qgroup_lock + * spinlock, since the sysfs_remove_group() function needs to take + * the mutex kernfs_mutex through kernfs_remove_by_name_ns(). 
+ */ + btrfs_sysfs_del_one_qgroup(fs_info, qgroup); + kfree(qgroup); out: mutex_unlock(&fs_info->qgroup_ioctl_lock); return ret; @@ -2631,12 +2640,12 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans) record->data_rsv, BTRFS_QGROUP_RSV_DATA); /* - * Use SEQ_LAST as time_seq to do special search, which - * doesn't lock tree or delayed_refs and search current - * root. It's safe inside commit_transaction(). + * Use BTRFS_SEQ_LAST as time_seq to do special search, + * which doesn't lock tree or delayed_refs and search + * current root. It's safe inside commit_transaction(). */ ret = btrfs_find_all_roots(trans, fs_info, - record->bytenr, SEQ_LAST, &new_roots, false); + record->bytenr, BTRFS_SEQ_LAST, &new_roots, false); if (ret < 0) goto cleanup; if (qgroup_to_skip) { @@ -3535,43 +3544,29 @@ static int try_flush_qgroup(struct btrfs_root *root) { struct btrfs_trans_handle *trans; int ret; - bool can_commit = true; /* - * If current process holds a transaction, we shouldn't flush, as we - * assume all space reservation happens before a transaction handle is - * held. - * - * But there are cases like btrfs_delayed_item_reserve_metadata() where - * we try to reserve space with one transction handle already held. - * In that case we can't commit transaction, but at least try to end it - * and hope the started data writes can free some space. + * Can't hold an open transaction or we run the risk of deadlocking, + * and can't either be under the context of a send operation (where + * current->journal_info is set to BTRFS_SEND_TRANS_STUB), as that + * would result in a crash when starting a transaction and does not + * make sense either (send is a read-only operation). 
*/ - if (current->journal_info && - current->journal_info != BTRFS_SEND_TRANS_STUB) - can_commit = false; + ASSERT(current->journal_info == NULL); + if (WARN_ON(current->journal_info)) + return 0; /* * We don't want to run flush again and again, so if there is a running * one, we won't try to start a new flush, but exit directly. */ if (test_and_set_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)) { - /* - * We are already holding a transaction, thus we can block other - * threads from flushing. So exit right now. This increases - * the chance of EDQUOT for heavy load and near limit cases. - * But we can argue that if we're already near limit, EDQUOT is - * unavoidable anyway. - */ - if (!can_commit) - return 0; - wait_event(root->qgroup_flush_wait, !test_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state)); return 0; } - ret = btrfs_start_delalloc_snapshot(root); + ret = btrfs_start_delalloc_snapshot(root, true); if (ret < 0) goto out; btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1); @@ -3582,10 +3577,7 @@ static int try_flush_qgroup(struct btrfs_root *root) goto out; } - if (can_commit) - ret = btrfs_commit_transaction(trans); - else - ret = btrfs_end_transaction(trans); + ret = btrfs_commit_transaction(trans); out: clear_bit(BTRFS_ROOT_QGROUP_FLUSHING, &root->state); wake_up(&root->qgroup_flush_wait); @@ -3638,8 +3630,7 @@ cleanup: qgroup_unreserve_range(inode, reserved, start, len); out: if (new_reserved) { - extent_changeset_release(reserved); - kfree(reserved); + extent_changeset_free(reserved); *reserved_ret = NULL; } return ret; diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index 8c31357f08ed..244d499ebc72 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -13,6 +13,7 @@ #include <linux/list_sort.h> #include <linux/raid/xor.h> #include <linux/mm.h> +#include "misc.h" #include "ctree.h" #include "disk-io.h" #include "volumes.h" @@ -1231,13 +1232,13 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) /* first collect one page from each 
data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { p = page_in_rbio(rbio, stripe, pagenr, 0); - pointers[stripe] = kmap(p); + pointers[stripe] = kmap_local_page(p); } /* then add the parity stripe */ p = rbio_pstripe_page(rbio, pagenr); SetPageUptodate(p); - pointers[stripe++] = kmap(p); + pointers[stripe++] = kmap_local_page(p); if (has_qstripe) { @@ -1247,7 +1248,7 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) */ p = rbio_qstripe_page(rbio, pagenr); SetPageUptodate(p); - pointers[stripe++] = kmap(p); + pointers[stripe++] = kmap_local_page(p); raid6_call.gen_syndrome(rbio->real_stripes, PAGE_SIZE, pointers); @@ -1256,10 +1257,8 @@ static noinline void finish_rmw(struct btrfs_raid_bio *rbio) copy_page(pointers[nr_data], pointers[0]); run_xor(pointers + 1, nr_data - 1, PAGE_SIZE); } - - - for (stripe = 0; stripe < rbio->real_stripes; stripe++) - kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); + for (stripe = stripe - 1; stripe >= 0; stripe--) + kunmap_local(pointers[stripe]); } /* @@ -1634,7 +1633,8 @@ struct btrfs_plug_cb { /* * rbios on the plug list are sorted for easier merging. */ -static int plug_cmp(void *priv, struct list_head *a, struct list_head *b) +static int plug_cmp(void *priv, const struct list_head *a, + const struct list_head *b) { struct btrfs_raid_bio *ra = container_of(a, struct btrfs_raid_bio, plug_list); @@ -1776,6 +1776,7 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) { int pagenr, stripe; void **pointers; + void **unmap_array; int faila = -1, failb = -1; struct page *page; blk_status_t err; @@ -1787,6 +1788,16 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) goto cleanup_io; } + /* + * Store copy of pointers that does not get reordered during + * reconstruction so that kunmap_local works. 
+ */ + unmap_array = kcalloc(rbio->real_stripes, sizeof(void *), GFP_NOFS); + if (!unmap_array) { + err = BLK_STS_RESOURCE; + goto cleanup_pointers; + } + faila = rbio->faila; failb = rbio->failb; @@ -1808,8 +1819,11 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) !test_bit(pagenr, rbio->dbitmap)) continue; - /* setup our array of pointers with pages - * from each stripe + /* + * Setup our array of pointers with pages from each stripe + * + * NOTE: store a duplicate array of pointers to preserve the + * pointer order */ for (stripe = 0; stripe < rbio->real_stripes; stripe++) { /* @@ -1823,7 +1837,8 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio) } else { page = rbio_stripe_page(rbio, stripe, pagenr); } - pointers[stripe] = kmap(page); + pointers[stripe] = kmap_local_page(page); + unmap_array[stripe] = pointers[stripe]; } /* all raid6 handling here */ @@ -1916,24 +1931,14 @@ pstripe: } } } - for (stripe = 0; stripe < rbio->real_stripes; stripe++) { - /* - * if we're rebuilding a read, we have to use - * pages from the bio list - */ - if ((rbio->operation == BTRFS_RBIO_READ_REBUILD || - rbio->operation == BTRFS_RBIO_REBUILD_MISSING) && - (stripe == faila || stripe == failb)) { - page = page_in_rbio(rbio, stripe, pagenr, 0); - } else { - page = rbio_stripe_page(rbio, stripe, pagenr); - } - kunmap(page); - } + for (stripe = rbio->real_stripes - 1; stripe >= 0; stripe--) + kunmap_local(unmap_array[stripe]); } err = BLK_STS_OK; cleanup: + kfree(unmap_array); +cleanup_pointers: kfree(pointers); cleanup_io: @@ -2358,13 +2363,13 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, goto cleanup; } SetPageUptodate(q_page); - pointers[rbio->real_stripes - 1] = kmap(q_page); + pointers[rbio->real_stripes - 1] = kmap_local_page(q_page); } atomic_set(&rbio->error, 0); /* Map the parity stripe just once */ - pointers[nr_data] = kmap(p_page); + pointers[nr_data] = kmap_local_page(p_page); for_each_set_bit(pagenr, 
rbio->dbitmap, rbio->stripe_npages) { struct page *p; @@ -2372,7 +2377,7 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, /* first collect one page from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { p = page_in_rbio(rbio, stripe, pagenr, 0); - pointers[stripe] = kmap(p); + pointers[stripe] = kmap_local_page(p); } if (has_qstripe) { @@ -2387,22 +2392,22 @@ static noinline void finish_parity_scrub(struct btrfs_raid_bio *rbio, /* Check scrubbing parity and repair it */ p = rbio_stripe_page(rbio, rbio->scrubp, pagenr); - parity = kmap(p); + parity = kmap_local_page(p); if (memcmp(parity, pointers[rbio->scrubp], PAGE_SIZE)) copy_page(parity, pointers[rbio->scrubp]); else /* Parity is right, needn't writeback */ bitmap_clear(rbio->dbitmap, pagenr, 1); - kunmap(p); + kunmap_local(parity); - for (stripe = 0; stripe < nr_data; stripe++) - kunmap(page_in_rbio(rbio, stripe, pagenr, 0)); + for (stripe = nr_data - 1; stripe >= 0; stripe--) + kunmap_local(pointers[stripe]); } - kunmap(p_page); + kunmap_local(pointers[nr_data]); __free_page(p_page); if (q_page) { - kunmap(q_page); + kunmap_local(pointers[rbio->real_stripes - 1]); __free_page(q_page); } diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index 762881b777b3..9178da07cc9c 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -129,12 +129,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode, * So what's in the range [500, 4095] corresponds to zeroes. */ if (datal < block_size) { - char *map; - - map = kmap(page); - memset(map + datal, 0, block_size - datal); + memzero_page(page, datal, block_size - datal); flush_dcache_page(page); - kunmap(page); } SetPageUptodate(page); @@ -207,10 +203,7 @@ static int clone_copy_inline_extent(struct inode *dst, * inline extent's data to the page. 
*/ ASSERT(key.offset > 0); - ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, - inline_data, size, datal, - comp_type); - goto out; + goto copy_to_page; } } else if (i_size_read(dst) <= datal) { struct btrfs_file_extent_item *ei; @@ -226,13 +219,10 @@ static int clone_copy_inline_extent(struct inode *dst, BTRFS_FILE_EXTENT_INLINE) goto copy_inline_extent; - ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, - inline_data, size, datal, comp_type); - goto out; + goto copy_to_page; } copy_inline_extent: - ret = 0; /* * We have no extent items, or we have an extent at offset 0 which may * or may not be inlined. All these cases are dealt the same way. @@ -244,11 +234,13 @@ copy_inline_extent: * clone. Deal with all these cases by copying the inline extent * data into the respective page at the destination inode. */ - ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, - inline_data, size, datal, comp_type); - goto out; + goto copy_to_page; } + /* + * Release path before starting a new transaction so we don't hold locks + * that would confuse lockdep. + */ btrfs_release_path(path); /* * If we end up here it means were copy the inline extent into a leaf @@ -305,6 +297,21 @@ out: *trans_out = trans; return ret; + +copy_to_page: + /* + * Release our path because we don't need it anymore and also because + * copy_inline_to_page() needs to reserve data and metadata, which may + * need to flush delalloc when we are low on available space and + * therefore cause a deadlock if writeback of an inline extent needs to + * write to the same leaf or an ordered extent completion needs to write + * to the same leaf. 
+ */ + btrfs_release_path(path); + + ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + inline_data, size, datal, comp_type); + goto out; } /** @@ -478,9 +485,9 @@ process_slot: clone_info.file_offset = new_key.offset; clone_info.extent_buf = buf; clone_info.is_new_extent = false; - ret = btrfs_replace_file_extents(inode, path, drop_start, - new_key.offset + datal - 1, &clone_info, - &trans); + ret = btrfs_replace_file_extents(BTRFS_I(inode), path, + drop_start, new_key.offset + datal - 1, + &clone_info, &trans); if (ret) goto out; } else if (type == BTRFS_FILE_EXTENT_INLINE) { @@ -567,8 +574,8 @@ process_slot: set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &BTRFS_I(inode)->runtime_flags); - ret = btrfs_replace_file_extents(inode, path, last_dest_end, - destoff + len - 1, NULL, &trans); + ret = btrfs_replace_file_extents(BTRFS_I(inode), path, + last_dest_end, destoff + len - 1, NULL, &trans); if (ret) goto out; @@ -604,6 +611,20 @@ static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1, lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1); } +static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2) +{ + if (inode1 < inode2) + swap(inode1, inode2); + down_write(&BTRFS_I(inode1)->i_mmap_lock); + down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING); +} + +static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2) +{ + up_write(&BTRFS_I(inode1)->i_mmap_lock); + up_write(&BTRFS_I(inode2)->i_mmap_lock); +} + static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, struct inode *dst, u64 dst_loff) { @@ -820,6 +841,16 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, len, remap_flags); } +static bool file_sync_write(const struct file *file) +{ + if (file->f_flags & (__O_SYNC | O_DSYNC)) + return true; + if (IS_SYNC(file_inode(file))) + return true; + + return false; +} + loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, struct file 
*dst_file, loff_t destoff, loff_t len, unsigned int remap_flags) @@ -832,10 +863,12 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, if (remap_flags & ~(REMAP_FILE_DEDUP | REMAP_FILE_ADVISORY)) return -EINVAL; - if (same_inode) - inode_lock(src_inode); - else + if (same_inode) { + btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP); + } else { lock_two_nondirectories(src_inode, dst_inode); + btrfs_double_mmap_lock(src_inode, dst_inode); + } ret = btrfs_remap_file_range_prep(src_file, off, dst_file, destoff, &len, remap_flags); @@ -848,10 +881,27 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); out_unlock: - if (same_inode) - inode_unlock(src_inode); - else + if (same_inode) { + btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP); + } else { + btrfs_double_mmap_unlock(src_inode, dst_inode); unlock_two_nondirectories(src_inode, dst_inode); + } + + /* + * If either the source or the destination file was opened with O_SYNC, + * O_DSYNC or has the S_SYNC attribute, fsync both the destination and + * source files/ranges, so that after a successful return (0) followed + * by a power failure results in the reflinked data to be readable from + * both files/ranges. + */ + if (ret == 0 && len > 0 && + (file_sync_write(src_file) || file_sync_write(dst_file))) { + ret = btrfs_sync_file(src_file, off, off + len - 1, 0); + if (ret == 0) + ret = btrfs_sync_file(dst_file, destoff, + destoff + len - 1, 0); + } return ret < 0 ? 
ret : len; } diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index 232d5da7b7be..b70be2ac2e9e 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -638,9 +638,10 @@ static int __must_check __add_reloc_root(struct btrfs_root *root) node->bytenr, &node->rb_node); spin_unlock(&rc->reloc_root_tree.lock); if (rb_node) { - btrfs_panic(fs_info, -EEXIST, + btrfs_err(fs_info, "Duplicate root found for start=%llu while inserting into relocation tree", node->bytenr); + return -EEXIST; } list_add_tail(&root->root_list, &rc->reloc_roots); @@ -733,10 +734,12 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, struct extent_buffer *eb; struct btrfs_root_item *root_item; struct btrfs_key root_key; - int ret; + int ret = 0; + bool must_abort = false; root_item = kmalloc(sizeof(*root_item), GFP_NOFS); - BUG_ON(!root_item); + if (!root_item) + return ERR_PTR(-ENOMEM); root_key.objectid = BTRFS_TREE_RELOC_OBJECTID; root_key.type = BTRFS_ROOT_ITEM_KEY; @@ -748,7 +751,9 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, /* called by btrfs_init_reloc_root */ ret = btrfs_copy_root(trans, root, root->commit_root, &eb, BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(ret); + if (ret) + goto fail; + /* * Set the last_snapshot field to the generation of the commit * root - like this ctree.c:btrfs_block_can_be_shared() behaves @@ -769,9 +774,16 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, */ ret = btrfs_copy_root(trans, root, root->node, &eb, BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(ret); + if (ret) + goto fail; } + /* + * We have changed references at this point, we must abort the + * transaction if anything fails. 
+ */ + must_abort = true; + memcpy(root_item, &root->root_item, sizeof(*root_item)); btrfs_set_root_bytenr(root_item, eb->start); btrfs_set_root_level(root_item, btrfs_header_level(eb)); @@ -789,14 +801,25 @@ static struct btrfs_root *create_reloc_root(struct btrfs_trans_handle *trans, ret = btrfs_insert_root(trans, fs_info->tree_root, &root_key, root_item); - BUG_ON(ret); + if (ret) + goto fail; + kfree(root_item); reloc_root = btrfs_read_tree_root(fs_info->tree_root, &root_key); - BUG_ON(IS_ERR(reloc_root)); + if (IS_ERR(reloc_root)) { + ret = PTR_ERR(reloc_root); + goto abort; + } set_bit(BTRFS_ROOT_SHAREABLE, &reloc_root->state); reloc_root->last_trans = trans->transid; return reloc_root; +fail: + kfree(root_item); +abort: + if (must_abort) + btrfs_abort_transaction(trans, ret); + return ERR_PTR(ret); } /* @@ -856,9 +879,16 @@ int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, reloc_root = create_reloc_root(trans, root, root->root_key.objectid); if (clear_rsv) trans->block_rsv = rsv; + if (IS_ERR(reloc_root)) + return PTR_ERR(reloc_root); ret = __add_reloc_root(reloc_root); - BUG_ON(ret < 0); + ASSERT(ret != -EEXIST); + if (ret) { + /* Pairs with create_reloc_root */ + btrfs_put_root(reloc_root); + return ret; + } root->reloc_root = btrfs_grab_root(reloc_root); return 0; } @@ -875,7 +905,7 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, int ret; if (!have_reloc_root(root)) - goto out; + return 0; reloc_root = root->reloc_root; root_item = &reloc_root->root_item; @@ -908,10 +938,8 @@ int btrfs_update_reloc_root(struct btrfs_trans_handle *trans, ret = btrfs_update_root(trans, fs_info->tree_root, &reloc_root->root_key, root_item); - BUG_ON(ret); btrfs_put_root(reloc_root); -out: - return 0; + return ret; } /* @@ -1185,8 +1213,8 @@ int replace_path(struct btrfs_trans_handle *trans, struct reloc_control *rc, int ret; int slot; - BUG_ON(src->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); - BUG_ON(dest->root_key.objectid == 
BTRFS_TREE_RELOC_OBJECTID); + ASSERT(src->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID); + ASSERT(dest->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); last_snapshot = btrfs_root_last_snapshot(&src->root_item); again: @@ -1205,7 +1233,11 @@ again: if (cow) { ret = btrfs_cow_block(trans, dest, eb, NULL, 0, &eb, BTRFS_NESTING_COW); - BUG_ON(ret); + if (ret) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + return ret; + } } if (next_key) { @@ -1217,7 +1249,7 @@ again: parent = eb; while (1) { level = btrfs_header_level(parent); - BUG_ON(level < lowest_level); + ASSERT(level >= lowest_level); ret = btrfs_bin_search(parent, &key, &slot); if (ret < 0) @@ -1265,7 +1297,11 @@ again: ret = btrfs_cow_block(trans, dest, eb, parent, slot, &eb, BTRFS_NESTING_COW); - BUG_ON(ret); + if (ret) { + btrfs_tree_unlock(eb); + free_extent_buffer(eb); + break; + } } btrfs_tree_unlock(parent); @@ -1289,7 +1325,11 @@ again: path->lowest_level = level; ret = btrfs_search_slot(trans, src, &key, path, 0, 1); path->lowest_level = 0; - BUG_ON(ret); + if (ret) { + if (ret > 0) + ret = -ENOENT; + break; + } /* * Info qgroup to trace both subtrees. 
@@ -1329,27 +1369,39 @@ again: ref.skip_qgroup = true; btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid); ret = btrfs_inc_extent_ref(trans, &ref); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, new_bytenr, blocksize, 0); ref.skip_qgroup = true; btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid); ret = btrfs_inc_extent_ref(trans, &ref); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, new_bytenr, blocksize, path->nodes[level]->start); btrfs_init_tree_ref(&ref, level - 1, src->root_key.objectid); ref.skip_qgroup = true; ret = btrfs_free_extent(trans, &ref); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, old_bytenr, blocksize, 0); btrfs_init_tree_ref(&ref, level - 1, dest->root_key.objectid); ref.skip_qgroup = true; ret = btrfs_free_extent(trans, &ref); - BUG_ON(ret); + if (ret) { + btrfs_abort_transaction(trans, ret); + break; + } btrfs_unlock_up_safe(path, 0); @@ -1537,12 +1589,13 @@ static int find_next_key(struct btrfs_path *path, int level, /* * Insert current subvolume into reloc_control::dirty_subvol_roots */ -static void insert_dirty_subvol(struct btrfs_trans_handle *trans, - struct reloc_control *rc, - struct btrfs_root *root) +static int insert_dirty_subvol(struct btrfs_trans_handle *trans, + struct reloc_control *rc, + struct btrfs_root *root) { struct btrfs_root *reloc_root = root->reloc_root; struct btrfs_root_item *reloc_root_item; + int ret; /* @root must be a subvolume tree root with a valid reloc tree */ ASSERT(root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID); @@ -1553,12 +1606,16 @@ static void insert_dirty_subvol(struct btrfs_trans_handle *trans, sizeof(reloc_root_item->drop_progress)); btrfs_set_root_drop_level(reloc_root_item, 0); btrfs_set_root_refs(reloc_root_item, 0); 
- btrfs_update_reloc_root(trans, root); + ret = btrfs_update_reloc_root(trans, root); + if (ret) + return ret; if (list_empty(&root->reloc_dirty_list)) { btrfs_grab_root(root); list_add_tail(&root->reloc_dirty_list, &rc->dirty_subvol_roots); } + + return 0; } static int clean_dirty_subvols(struct reloc_control *rc) @@ -1760,8 +1817,11 @@ static noinline_for_stack int merge_reloc_root(struct reloc_control *rc, out: btrfs_free_path(path); - if (ret == 0) - insert_dirty_subvol(trans, rc, root); + if (ret == 0) { + ret = insert_dirty_subvol(trans, rc, root); + if (ret) + btrfs_abort_transaction(trans, ret); + } if (trans) btrfs_end_transaction_throttle(trans); @@ -1825,8 +1885,18 @@ again: root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); - BUG_ON(IS_ERR(root)); - BUG_ON(root->reloc_root != reloc_root); + if (IS_ERR(root)) { + /* + * Even if we have an error we need this reloc root + * back on our list so we can clean up properly. + */ + list_add(&reloc_root->root_list, &reloc_roots); + btrfs_abort_transaction(trans, (int)PTR_ERR(root)); + if (!err) + err = PTR_ERR(root); + break; + } + ASSERT(root->reloc_root == reloc_root); /* * set reference count to 1, so btrfs_recover_relocation @@ -1834,16 +1904,27 @@ again: */ if (!err) btrfs_set_root_refs(&reloc_root->root_item, 1); - btrfs_update_reloc_root(trans, root); + ret = btrfs_update_reloc_root(trans, root); + /* + * Even if we have an error we need this reloc root back on our + * list so we can clean up properly. 
+ */ list_add(&reloc_root->root_list, &reloc_roots); btrfs_put_root(root); + + if (ret) { + btrfs_abort_transaction(trans, ret); + if (!err) + err = ret; + break; + } } list_splice(&reloc_roots, &rc->reloc_roots); if (!err) - btrfs_commit_transaction(trans); + err = btrfs_commit_transaction(trans); else btrfs_end_transaction(trans); return err; @@ -1888,8 +1969,29 @@ again: root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); if (btrfs_root_refs(&reloc_root->root_item) > 0) { - BUG_ON(IS_ERR(root)); - BUG_ON(root->reloc_root != reloc_root); + if (IS_ERR(root)) { + /* + * For recovery we read the fs roots on mount, + * and if we didn't find the root then we marked + * the reloc root as a garbage root. For normal + * relocation obviously the root should exist in + * memory. However there's no reason we can't + * handle the error properly here just in case. + */ + ASSERT(0); + ret = PTR_ERR(root); + goto out; + } + if (root->reloc_root != reloc_root) { + /* + * This is actually impossible without something + * going really wrong (like weird race condition + * or cosmic rays). + */ + ASSERT(0); + ret = -EINVAL; + goto out; + } ret = merge_reloc_root(rc, root); btrfs_put_root(root); if (ret) { @@ -1971,8 +2073,27 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, return 0; root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); - BUG_ON(IS_ERR(root)); - BUG_ON(root->reloc_root != reloc_root); + + /* + * This should succeed, since we can't have a reloc root without having + * already looked up the actual root and created the reloc root for this + * root. + * + * However if there's some sort of corruption where we have a ref to a + * reloc root without a corresponding root this could return ENOENT. 
+ */ + if (IS_ERR(root)) { + ASSERT(0); + return PTR_ERR(root); + } + if (root->reloc_root != reloc_root) { + ASSERT(0); + btrfs_err(fs_info, + "root %llu has two reloc roots associated with it", + reloc_root->root_key.offset); + btrfs_put_root(root); + return -EUCLEAN; + } ret = btrfs_record_root_in_trans(trans, root); btrfs_put_root(root); @@ -1988,26 +2109,77 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_backref_node *next; struct btrfs_root *root; int index = 0; + int ret; next = node; while (1) { cond_resched(); next = walk_up_backref(next, edges, &index); root = next->root; - BUG_ON(!root); - BUG_ON(!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)); + + /* + * If there is no root, then our references for this block are + * incomplete, as we should be able to walk all the way up to a + * block that is owned by a root. + * + * This path is only for SHAREABLE roots, so if we come upon a + * non-SHAREABLE root then we have backrefs that resolve + * improperly. + * + * Both of these cases indicate file system corruption, or a bug + * in the backref walking code. + */ + if (!root) { + ASSERT(0); + btrfs_err(trans->fs_info, + "bytenr %llu doesn't have a backref path ending in a root", + node->bytenr); + return ERR_PTR(-EUCLEAN); + } + if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { + ASSERT(0); + btrfs_err(trans->fs_info, + "bytenr %llu has multiple refs with one ending in a non-shareable root", + node->bytenr); + return ERR_PTR(-EUCLEAN); + } if (root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) { - record_reloc_root_in_trans(trans, root); + ret = record_reloc_root_in_trans(trans, root); + if (ret) + return ERR_PTR(ret); break; } - btrfs_record_root_in_trans(trans, root); + ret = btrfs_record_root_in_trans(trans, root); + if (ret) + return ERR_PTR(ret); root = root->reloc_root; + /* + * We could have raced with another thread which failed, so + * root->reloc_root may not be set, return ENOENT in this case. 
+ */ + if (!root) + return ERR_PTR(-ENOENT); + if (next->new_bytenr != root->node->start) { - BUG_ON(next->new_bytenr); - BUG_ON(!list_empty(&next->list)); + /* + * We just created the reloc root, so we shouldn't have + * ->new_bytenr set and this shouldn't be in the changed + * list. If it is then we have multiple roots pointing + * at the same bytenr which indicates corruption, or + * we've made a mistake in the backref walking code. + */ + ASSERT(next->new_bytenr == 0); + ASSERT(list_empty(&next->list)); + if (next->new_bytenr || !list_empty(&next->list)) { + btrfs_err(trans->fs_info, + "bytenr %llu possibly has multiple roots pointing at the same bytenr %llu", + node->bytenr, next->bytenr); + return ERR_PTR(-EUCLEAN); + } + next->new_bytenr = root->node->start; btrfs_put_root(next->root); next->root = btrfs_grab_root(root); @@ -2024,8 +2196,14 @@ struct btrfs_root *select_reloc_root(struct btrfs_trans_handle *trans, if (!next || next->level <= node->level) break; } - if (!root) - return NULL; + if (!root) { + /* + * This can happen if there's fs corruption or if there's a bug + * in the backref lookup code. + */ + ASSERT(0); + return ERR_PTR(-ENOENT); + } next = node; /* setup backref node path for btrfs_reloc_cow_block */ @@ -2061,7 +2239,13 @@ struct btrfs_root *select_one_root(struct btrfs_backref_node *node) cond_resched(); next = walk_up_backref(next, edges, &index); root = next->root; - BUG_ON(!root); + + /* + * This can occur if we have incomplete extent refs leading all + * the way up a particular path, in this case return -EUCLEAN. + */ + if (!root) + return ERR_PTR(-EUCLEAN); /* No other choice for non-shareable tree */ if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) @@ -2181,7 +2365,11 @@ static int do_relocation(struct btrfs_trans_handle *trans, int slot; int ret = 0; - BUG_ON(lowest && node->eb); + /* + * If we are lowest then this is the first time we're processing this + * block, and thus shouldn't have an eb associated with it yet. 
+ */ + ASSERT(!lowest || !node->eb); path->lowest_level = node->level + 1; rc->backref_cache.path[node->level] = node; @@ -2192,7 +2380,10 @@ static int do_relocation(struct btrfs_trans_handle *trans, upper = edge->node[UPPER]; root = select_reloc_root(trans, rc, upper, edges); - BUG_ON(!root); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + goto next; + } if (upper->eb && !upper->locked) { if (!lowest) { @@ -2266,7 +2457,11 @@ static int do_relocation(struct btrfs_trans_handle *trans, free_extent_buffer(eb); if (ret < 0) goto next; - BUG_ON(node->eb != eb); + /* + * We've just COWed this block, it should have updated + * the correct backref node entry. + */ + ASSERT(node->eb == eb); } else { btrfs_set_node_blockptr(upper->eb, slot, node->eb->start); @@ -2281,10 +2476,11 @@ static int do_relocation(struct btrfs_trans_handle *trans, btrfs_init_tree_ref(&ref, node->level, btrfs_header_owner(upper->eb)); ret = btrfs_inc_extent_ref(trans, &ref); - BUG_ON(ret); - - ret = btrfs_drop_subtree(trans, root, eb, upper->eb); - BUG_ON(ret); + if (!ret) + ret = btrfs_drop_subtree(trans, root, eb, + upper->eb); + if (ret) + btrfs_abort_transaction(trans, ret); } next: if (!upper->pending) @@ -2302,7 +2498,12 @@ next: } path->lowest_level = 0; - BUG_ON(ret == -ENOSPC); + + /* + * We should have allocated all of our space in the block rsv and thus + * shouldn't ENOSPC. + */ + ASSERT(ret != -ENOSPC); return ret; } @@ -2434,16 +2635,53 @@ static int relocate_tree_block(struct btrfs_trans_handle *trans, BUG_ON(node->processed); root = select_one_root(node); - if (root == ERR_PTR(-ENOENT)) { - update_processed_blocks(rc, node); + if (IS_ERR(root)) { + ret = PTR_ERR(root); + + /* See explanation in select_one_root for the -EUCLEAN case. 
*/ + ASSERT(ret == -ENOENT); + if (ret == -ENOENT) { + ret = 0; + update_processed_blocks(rc, node); + } goto out; } if (root) { if (test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) { - BUG_ON(node->new_bytenr); - BUG_ON(!list_empty(&node->list)); - btrfs_record_root_in_trans(trans, root); + /* + * This block was the root block of a root, and this is + * the first time we're processing the block and thus it + * should not have had the ->new_bytenr modified and + * should have not been included on the changed list. + * + * However in the case of corruption we could have + * multiple refs pointing to the same block improperly, + * and thus we would trip over these checks. ASSERT() + * for the developer case, because it could indicate a + * bug in the backref code, however error out for a + * normal user in the case of corruption. + */ + ASSERT(node->new_bytenr == 0); + ASSERT(list_empty(&node->list)); + if (node->new_bytenr || !list_empty(&node->list)) { + btrfs_err(root->fs_info, + "bytenr %llu has improper references to it", + node->bytenr); + ret = -EUCLEAN; + goto out; + } + ret = btrfs_record_root_in_trans(trans, root); + if (ret) + goto out; + /* + * Another thread could have failed, need to check if we + * have reloc_root actually set. 
+ */ + if (!root->reloc_root) { + ret = -ENOENT; + goto out; + } root = root->reloc_root; node->new_bytenr = root->node->start; btrfs_put_root(node->root); @@ -2578,7 +2816,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( return btrfs_end_transaction(trans); } - inode_lock(&inode->vfs_inode); + btrfs_inode_lock(&inode->vfs_inode, 0); for (nr = 0; nr < cluster->nr; nr++) { start = cluster->boundary[nr] - offset; if (nr + 1 < cluster->nr) @@ -2596,7 +2834,7 @@ static noinline_for_stack int prealloc_file_extent_cluster( if (ret) break; } - inode_unlock(&inode->vfs_inode); + btrfs_inode_unlock(&inode->vfs_inode, 0); if (cur_offset < prealloc_end) btrfs_free_reserved_data_space_noquota(inode->root->fs_info, @@ -3220,20 +3458,6 @@ static void unset_reloc_control(struct reloc_control *rc) mutex_unlock(&fs_info->reloc_mutex); } -static int check_extent_flags(u64 flags) -{ - if ((flags & BTRFS_EXTENT_FLAG_DATA) && - (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) - return 1; - if (!(flags & BTRFS_EXTENT_FLAG_DATA) && - !(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK)) - return 1; - if ((flags & BTRFS_EXTENT_FLAG_DATA) && - (flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) - return 1; - return 0; -} - static noinline_for_stack int prepare_to_relocate(struct reloc_control *rc) { @@ -3272,8 +3496,7 @@ int prepare_to_relocate(struct reloc_control *rc) */ return PTR_ERR(trans); } - btrfs_commit_transaction(trans); - return 0; + return btrfs_commit_transaction(trans); } static noinline_for_stack int relocate_block_group(struct reloc_control *rc) @@ -3285,7 +3508,6 @@ static noinline_for_stack int relocate_block_group(struct reloc_control *rc) struct btrfs_path *path; struct btrfs_extent_item *ei; u64 flags; - u32 item_size; int ret; int err = 0; int progress = 0; @@ -3334,19 +3556,7 @@ restart: ei = btrfs_item_ptr(path->nodes[0], path->slots[0], struct btrfs_extent_item); - item_size = btrfs_item_size_nr(path->nodes[0], path->slots[0]); - if (item_size >= sizeof(*ei)) { - flags = 
btrfs_extent_flags(path->nodes[0], ei); - ret = check_extent_flags(flags); - BUG_ON(ret); - } else if (unlikely(item_size == sizeof(struct btrfs_extent_item_v0))) { - err = -EINVAL; - btrfs_print_v0_err(trans->fs_info); - btrfs_abort_transaction(trans, err); - break; - } else { - BUG(); - } + flags = btrfs_extent_flags(path->nodes[0], ei); if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { ret = add_tree_block(rc, &key, path, &blocks); @@ -3445,7 +3655,9 @@ restart: err = PTR_ERR(trans); goto out_free; } - btrfs_commit_transaction(trans); + ret = btrfs_commit_transaction(trans); + if (ret && !err) + err = ret; out_free: ret = clean_dirty_subvols(rc); if (ret < 0 && !err) @@ -3488,6 +3700,35 @@ out: return ret; } +static void delete_orphan_inode(struct btrfs_trans_handle *trans, + struct btrfs_root *root, u64 objectid) +{ + struct btrfs_path *path; + struct btrfs_key key; + int ret = 0; + + path = btrfs_alloc_path(); + if (!path) { + ret = -ENOMEM; + goto out; + } + + key.objectid = objectid; + key.type = BTRFS_INODE_ITEM_KEY; + key.offset = 0; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); + if (ret) { + if (ret > 0) + ret = -ENOENT; + goto out; + } + ret = btrfs_del_item(trans, root, path); +out: + if (ret) + btrfs_abort_transaction(trans, ret); + btrfs_free_path(path); +} + /* * helper to create inode for data relocation. 
* the inode is in data relocation tree and its link count is 0 @@ -3514,10 +3755,16 @@ struct inode *create_reloc_inode(struct btrfs_fs_info *fs_info, goto out; err = __insert_orphan_inode(trans, root, objectid); - BUG_ON(err); + if (err) + goto out; inode = btrfs_iget(fs_info->sb, objectid, root); - BUG_ON(IS_ERR(inode)); + if (IS_ERR(inode)) { + delete_orphan_inode(trans, root, objectid); + err = PTR_ERR(inode); + inode = NULL; + goto out; + } BTRFS_I(inode)->index_cnt = group->start; err = btrfs_orphan_add(trans, BTRFS_I(inode)); @@ -3859,7 +4106,13 @@ int btrfs_recover_relocation(struct btrfs_root *root) } err = __add_reloc_root(reloc_root); - BUG_ON(err < 0); /* -ENOMEM or logic error */ + ASSERT(err != -EEXIST); + if (err) { + list_add_tail(&reloc_root->root_list, &reloc_roots); + btrfs_put_root(fs_root); + btrfs_end_transaction(trans); + goto out_unset; + } fs_root->reloc_root = btrfs_grab_root(reloc_root); btrfs_put_root(fs_root); } @@ -4074,7 +4327,12 @@ int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans, return PTR_ERR(reloc_root); ret = __add_reloc_root(reloc_root); - BUG_ON(ret < 0); + ASSERT(ret != -EEXIST); + if (ret) { + /* Pairs with create_reloc_root */ + btrfs_put_root(reloc_root); + return ret; + } new_root->reloc_root = btrfs_grab_root(reloc_root); if (rc->create_reloc_tree) diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 3d9088eab2fc..485cda3eb8d7 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -206,9 +206,6 @@ struct full_stripe_lock { struct mutex mutex; }; -static void scrub_pending_bio_inc(struct scrub_ctx *sctx); -static void scrub_pending_bio_dec(struct scrub_ctx *sctx); -static int scrub_handle_errored_block(struct scrub_block *sblock_to_check); static int scrub_setup_recheck_block(struct scrub_block *original_sblock, struct scrub_block *sblocks_for_recheck); static void scrub_recheck_block(struct btrfs_fs_info *fs_info, @@ -226,14 +223,11 @@ static int scrub_write_page_to_dev_replace(struct scrub_block 
*sblock, static int scrub_checksum_data(struct scrub_block *sblock); static int scrub_checksum_tree_block(struct scrub_block *sblock); static int scrub_checksum_super(struct scrub_block *sblock); -static void scrub_block_get(struct scrub_block *sblock); static void scrub_block_put(struct scrub_block *sblock); static void scrub_page_get(struct scrub_page *spage); static void scrub_page_put(struct scrub_page *spage); static void scrub_parity_get(struct scrub_parity *sparity); static void scrub_parity_put(struct scrub_parity *sparity); -static int scrub_add_page_to_rd_bio(struct scrub_ctx *sctx, - struct scrub_page *spage); static int scrub_pages(struct scrub_ctx *sctx, u64 logical, u32 len, u64 physical, struct btrfs_device *dev, u64 flags, u64 gen, int mirror_num, u8 *csum, @@ -251,8 +245,6 @@ static int scrub_add_page_to_wr_bio(struct scrub_ctx *sctx, static void scrub_wr_submit(struct scrub_ctx *sctx); static void scrub_wr_bio_end_io(struct bio *bio); static void scrub_wr_bio_end_io_worker(struct btrfs_work *work); -static void __scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); -static void scrub_blocked_if_needed(struct btrfs_fs_info *fs_info); static void scrub_put_ctx(struct scrub_ctx *sctx); static inline int scrub_is_page_on_raid56(struct scrub_page *spage) @@ -3682,8 +3674,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, spin_lock(&cache->lock); if (!cache->to_copy) { spin_unlock(&cache->lock); - ro_set = 0; - goto done; + btrfs_put_block_group(cache); + goto skip; } spin_unlock(&cache->lock); } @@ -3841,7 +3833,6 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, cache, found_key.offset)) ro_set = 0; -done: down_write(&dev_replace->rwsem); dev_replace->cursor_left = dev_replace->cursor_right; dev_replace->item_needs_writeback = 1; diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index 8f323859156b..bd69db72acc5 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -6650,6 +6650,7 @@ static int full_send_tree(struct send_ctx *sctx) path = 
alloc_path_for_send(); if (!path) return -ENOMEM; + path->reada = READA_FORWARD_ALWAYS; key.objectid = BTRFS_FIRST_FREE_OBJECTID; key.type = BTRFS_INODE_ITEM_KEY; @@ -6688,15 +6689,35 @@ out: return ret; } -static int tree_move_down(struct btrfs_path *path, int *level) +static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen) { struct extent_buffer *eb; + struct extent_buffer *parent = path->nodes[*level]; + int slot = path->slots[*level]; + const int nritems = btrfs_header_nritems(parent); + u64 reada_max; + u64 reada_done = 0; BUG_ON(*level == 0); - eb = btrfs_read_node_slot(path->nodes[*level], path->slots[*level]); + eb = btrfs_read_node_slot(parent, slot); if (IS_ERR(eb)) return PTR_ERR(eb); + /* + * Trigger readahead for the next leaves we will process, so that it is + * very likely that when we need them they are already in memory and we + * will not block on disk IO. For nodes we only do readahead for one, + * since the time window between processing nodes is typically larger. + */ + reada_max = (*level == 1 ? 
SZ_128K : eb->fs_info->nodesize); + + for (slot++; slot < nritems && reada_done < reada_max; slot++) { + if (btrfs_node_ptr_generation(parent, slot) > reada_min_gen) { + btrfs_readahead_node_child(parent, slot); + reada_done += eb->fs_info->nodesize; + } + } + path->nodes[*level - 1] = eb; path->slots[*level - 1] = 0; (*level)--; @@ -6736,14 +6757,15 @@ static int tree_move_next_or_upnext(struct btrfs_path *path, static int tree_advance(struct btrfs_path *path, int *level, int root_level, int allow_down, - struct btrfs_key *key) + struct btrfs_key *key, + u64 reada_min_gen) { int ret; if (*level == 0 || !allow_down) { ret = tree_move_next_or_upnext(path, level, root_level); } else { - ret = tree_move_down(path, level); + ret = tree_move_down(path, level, reada_min_gen); } if (ret >= 0) { if (*level == 0) @@ -6817,6 +6839,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, u64 right_blockptr; u64 left_gen; u64 right_gen; + u64 reada_min_gen; left_path = btrfs_alloc_path(); if (!left_path) { @@ -6896,6 +6919,14 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = -ENOMEM; goto out; } + /* + * Our right root is the parent root, while the left root is the "send" + * root. We know that all new nodes/leaves in the left root must have + * a generation greater than the right root's generation, so we trigger + * readahead for those nodes and leaves of the left root, as we know we + * will need to read them at some point. 
+ */ + reada_min_gen = btrfs_header_generation(right_root->commit_root); up_read(&fs_info->commit_root_sem); if (left_level == 0) @@ -6920,7 +6951,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = tree_advance(left_path, &left_level, left_root_level, advance_left != ADVANCE_ONLY_NEXT, - &left_key); + &left_key, reada_min_gen); if (ret == -1) left_end_reached = ADVANCE; else if (ret < 0) @@ -6931,7 +6962,7 @@ static int btrfs_compare_trees(struct btrfs_root *left_root, ret = tree_advance(right_path, &right_level, right_root_level, advance_right != ADVANCE_ONLY_NEXT, - &right_key); + &right_key, reada_min_gen); if (ret == -1) right_end_reached = ADVANCE; else if (ret < 0) @@ -7139,7 +7170,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx) int i; if (root) { - ret = btrfs_start_delalloc_snapshot(root); + ret = btrfs_start_delalloc_snapshot(root, false); if (ret) return ret; btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); @@ -7147,7 +7178,7 @@ static int flush_delalloc_roots(struct send_ctx *sctx) for (i = 0; i < sctx->clone_roots_cnt; i++) { root = sctx->clone_roots[i].root; - ret = btrfs_start_delalloc_snapshot(root); + ret = btrfs_start_delalloc_snapshot(root, false); if (ret) return ret; btrfs_wait_ordered_extents(root, U64_MAX, 0, U64_MAX); diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index 2da6177f4b0b..2dc674b7c3b1 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -861,8 +861,8 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info, * of heavy DIO or ordered reservations, preemptive flushing will just * waste time and cause us to slow down. 
*/ - ordered = percpu_counter_sum_positive(&fs_info->ordered_bytes); - delalloc = percpu_counter_sum_positive(&fs_info->delalloc_bytes); + ordered = percpu_counter_read_positive(&fs_info->ordered_bytes); + delalloc = percpu_counter_read_positive(&fs_info->delalloc_bytes); if (ordered >= delalloc) used += fs_info->delayed_refs_rsv.reserved + fs_info->delayed_block_rsv.reserved; diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index c69049e7daa9..2d19089ab625 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -4,6 +4,64 @@ #include "ctree.h" #include "subpage.h" +/* + * Subpage (sectorsize < PAGE_SIZE) support overview: + * + * Limitations: + * + * - Only support 64K page size for now + * This is to make metadata handling easier, as 64K page would ensure + * all nodesize would fit inside one page, thus we don't need to handle + * cases where a tree block crosses several pages. + * + * - Only metadata read-write for now + * The data read-write part is in development. + * + * - Metadata can't cross 64K page boundary + * btrfs-progs and kernel have done that for a while, thus only ancient + * filesystems could have such problem. For such case, do a graceful + * rejection. + * + * Special behavior: + * + * - Metadata + * Metadata read is fully supported. + * Meaning when reading one tree block will only trigger the read for the + * needed range, other unrelated range in the same page will not be touched. + * + * Metadata write support is partial. + * The writeback is still for the full page, but we will only submit + * the dirty extent buffers in the page. + * + * This means, if we have a metadata page like this: + * + * Page offset + * 0 16K 32K 48K 64K + * |/////////| |///////////| + * \- Tree block A \- Tree block B + * + * Even if we just want to writeback tree block A, we will also writeback + * tree block B if it's also dirty. + * + * This may cause extra metadata writeback which results more COW. 
+ * + * Implementation: + * + * - Common + * Both metadata and data will use a new structure, btrfs_subpage, to + * record the status of each sector inside a page. This provides the extra + * granularity needed. + * + * - Metadata + * Since we have multiple tree blocks inside one page, we can't rely on page + * locking anymore, or we will have greatly reduced concurrency or even + * deadlocks (hold one tree lock while trying to lock another tree lock in + * the same page). + * + * Thus for metadata locking, subpage support relies on io_tree locking only. + * This means a slightly higher tree locking latency. + */ + int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct page *page, enum btrfs_subpage_type type) { @@ -220,6 +278,82 @@ void btrfs_subpage_clear_error(const struct btrfs_fs_info *fs_info, spin_unlock_irqrestore(&subpage->lock, flags); } +void btrfs_subpage_set_dirty(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->dirty_bitmap |= tmp; + spin_unlock_irqrestore(&subpage->lock, flags); + set_page_dirty(page); +} + +/* + * Extra clear_and_test function for subpage dirty bitmap. + * + * Return true if we're the last bits in the dirty_bitmap and clear the + * dirty_bitmap. + * Return false otherwise. + * + * NOTE: Callers should manually clear page dirty for true case, as we have + * extra handling for tree blocks. 
+ */ +bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + bool last = false; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->dirty_bitmap &= ~tmp; + if (subpage->dirty_bitmap == 0) + last = true; + spin_unlock_irqrestore(&subpage->lock, flags); + return last; +} + +void btrfs_subpage_clear_dirty(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + bool last; + + last = btrfs_subpage_clear_and_test_dirty(fs_info, page, start, len); + if (last) + clear_page_dirty_for_io(page); +} + +void btrfs_subpage_set_writeback(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->writeback_bitmap |= tmp; + set_page_writeback(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + +void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len) +{ + struct btrfs_subpage *subpage = (struct btrfs_subpage *)page->private; + u16 tmp = btrfs_subpage_calc_bitmap(fs_info, page, start, len); + unsigned long flags; + + spin_lock_irqsave(&subpage->lock, flags); + subpage->writeback_bitmap &= ~tmp; + if (subpage->writeback_bitmap == 0) + end_page_writeback(page); + spin_unlock_irqrestore(&subpage->lock, flags); +} + /* * Unlike set/clear which is dependent on each page status, for test all bits * are tested in the same way. 
@@ -240,6 +374,8 @@ bool btrfs_subpage_test_##name(const struct btrfs_fs_info *fs_info, \ } IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(uptodate); IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(error); +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(dirty); +IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(writeback); /* * Note that, in selftests (extent-io-tests), we can have empty fs_info passed @@ -276,3 +412,7 @@ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ IMPLEMENT_BTRFS_PAGE_OPS(uptodate, SetPageUptodate, ClearPageUptodate, PageUptodate); IMPLEMENT_BTRFS_PAGE_OPS(error, SetPageError, ClearPageError, PageError); +IMPLEMENT_BTRFS_PAGE_OPS(dirty, set_page_dirty, clear_page_dirty_for_io, + PageDirty); +IMPLEMENT_BTRFS_PAGE_OPS(writeback, set_page_writeback, end_page_writeback, + PageWriteback); diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index b86a4881475d..bfd626e955be 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -20,6 +20,8 @@ struct btrfs_subpage { spinlock_t lock; u16 uptodate_bitmap; u16 error_bitmap; + u16 dirty_bitmap; + u16 writeback_bitmap; union { /* * Structures only used by metadata @@ -87,5 +89,10 @@ bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \ DECLARE_BTRFS_SUBPAGE_OPS(uptodate); DECLARE_BTRFS_SUBPAGE_OPS(error); +DECLARE_BTRFS_SUBPAGE_OPS(dirty); +DECLARE_BTRFS_SUBPAGE_OPS(writeback); + +bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, + struct page *page, u64 start, u32 len); #endif diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index f7a4ad86adee..4a396c1147f1 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -252,6 +252,32 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, . 
} #endif +#if BITS_PER_LONG == 32 +void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info) +{ + if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) { + btrfs_warn(fs_info, "reaching 32bit limit for logical addresses"); + btrfs_warn(fs_info, +"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT", + BTRFS_32BIT_MAX_FILE_SIZE >> 40); + btrfs_warn(fs_info, + "please consider upgrading to 64bit kernel/hardware"); + } +} + +void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info) +{ + if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) { + btrfs_err(fs_info, "reached 32bit limit for logical addresses"); + btrfs_err(fs_info, +"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed", + BTRFS_32BIT_MAX_FILE_SIZE >> 40); + btrfs_err(fs_info, + "please consider upgrading to 64bit kernel/hardware"); + } +} +#endif + /* * We only mark the transaction aborted and then set the file system read-only. * This will prevent new transactions from starting or trying to join this diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 6eb1c50fa98c..436ac7b4b334 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -360,11 +360,26 @@ static ssize_t supported_rescue_options_show(struct kobject *kobj, BTRFS_ATTR(static_feature, supported_rescue_options, supported_rescue_options_show); +static ssize_t supported_sectorsizes_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + ssize_t ret = 0; + + /* Only sectorsize == PAGE_SIZE is now supported */ + ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%lu\n", PAGE_SIZE); + + return ret; +} +BTRFS_ATTR(static_feature, supported_sectorsizes, + supported_sectorsizes_show); + static struct attribute *btrfs_supported_static_feature_attrs[] = { BTRFS_ATTR_PTR(static_feature, rmdir_subvol), BTRFS_ATTR_PTR(static_feature, supported_checksums), BTRFS_ATTR_PTR(static_feature, send_stream_version), BTRFS_ATTR_PTR(static_feature, 
supported_rescue_options), + BTRFS_ATTR_PTR(static_feature, supported_sectorsizes), NULL }; @@ -965,6 +980,40 @@ static ssize_t btrfs_read_policy_store(struct kobject *kobj, } BTRFS_ATTR_RW(, read_policy, btrfs_read_policy_show, btrfs_read_policy_store); +static ssize_t btrfs_bg_reclaim_threshold_show(struct kobject *kobj, + struct kobj_attribute *a, + char *buf) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + ssize_t ret; + + ret = scnprintf(buf, PAGE_SIZE, "%d\n", fs_info->bg_reclaim_threshold); + + return ret; +} + +static ssize_t btrfs_bg_reclaim_threshold_store(struct kobject *kobj, + struct kobj_attribute *a, + const char *buf, size_t len) +{ + struct btrfs_fs_info *fs_info = to_fs_info(kobj); + int thresh; + int ret; + + ret = kstrtoint(buf, 10, &thresh); + if (ret) + return ret; + + if (thresh <= 50 || thresh > 100) + return -EINVAL; + + fs_info->bg_reclaim_threshold = thresh; + + return len; +} +BTRFS_ATTR_RW(, bg_reclaim_threshold, btrfs_bg_reclaim_threshold_show, + btrfs_bg_reclaim_threshold_store); + static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, label), BTRFS_ATTR_PTR(, nodesize), @@ -976,6 +1025,7 @@ static const struct attribute *btrfs_attrs[] = { BTRFS_ATTR_PTR(, exclusive_operation), BTRFS_ATTR_PTR(, generation), BTRFS_ATTR_PTR(, read_policy), + BTRFS_ATTR_PTR(, bg_reclaim_threshold), NULL, }; diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index acff6bb49a97..f75de9f6c0ad 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -260,6 +260,7 @@ static inline int extwriter_counter_read(struct btrfs_transaction *trans) void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) { struct btrfs_fs_info *fs_info = trans->fs_info; + struct btrfs_transaction *cur_trans = trans->transaction; if (!trans->chunk_bytes_reserved) return; @@ -268,6 +269,8 @@ void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans) btrfs_block_rsv_release(fs_info, &fs_info->chunk_block_rsv, 
trans->chunk_bytes_reserved, NULL); + atomic64_sub(trans->chunk_bytes_reserved, &cur_trans->chunk_bytes_reserved); + cond_wake_up(&cur_trans->chunk_reserve_wait); trans->chunk_bytes_reserved = 0; } @@ -383,6 +386,8 @@ loop: spin_lock_init(&cur_trans->dropped_roots_lock); INIT_LIST_HEAD(&cur_trans->releasing_ebs); spin_lock_init(&cur_trans->releasing_ebs_lock); + atomic64_set(&cur_trans->chunk_bytes_reserved, 0); + init_waitqueue_head(&cur_trans->chunk_reserve_wait); list_add_tail(&cur_trans->list, &fs_info->trans_list); extent_io_tree_init(fs_info, &cur_trans->dirty_pages, IO_TREE_TRANS_DIRTY_PAGES, fs_info->btree_inode); @@ -408,6 +413,7 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, int force) { struct btrfs_fs_info *fs_info = root->fs_info; + int ret = 0; if ((test_bit(BTRFS_ROOT_SHAREABLE, &root->state) && root->last_trans < trans->transid) || force) { @@ -456,11 +462,11 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans, * lock. smp_wmb() makes sure that all the writes above are * done before we pop in the zero below */ - btrfs_init_reloc_root(trans, root); + ret = btrfs_init_reloc_root(trans, root); smp_mb__before_atomic(); clear_bit(BTRFS_ROOT_IN_TRANS_SETUP, &root->state); } - return 0; + return ret; } @@ -487,6 +493,7 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; + int ret; if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state)) return 0; @@ -501,10 +508,10 @@ int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans, return 0; mutex_lock(&fs_info->reloc_mutex); - record_root_in_trans(trans, root, 0); + ret = record_root_in_trans(trans, root, 0); mutex_unlock(&fs_info->reloc_mutex); - return 0; + return ret; } static inline int is_transaction_blocked(struct btrfs_transaction *trans) @@ -741,7 +748,16 @@ got_it: * Thus it need to be called after current->journal_info initialized, * or we can deadlock. 
*/ - btrfs_record_root_in_trans(h, root); + ret = btrfs_record_root_in_trans(h, root); + if (ret) { + /* + * The transaction handle is fully initialized and linked with + * other structures so it needs to be ended in case of errors, + * not just freed. + */ + btrfs_end_transaction(h); + return ERR_PTR(ret); + } return h; @@ -1347,7 +1363,9 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans) spin_unlock(&fs_info->fs_roots_radix_lock); btrfs_free_log(trans, root); - btrfs_update_reloc_root(trans, root); + ret2 = btrfs_update_reloc_root(trans, root); + if (ret2) + return ret2; /* see comments in should_cow_block() */ clear_bit(BTRFS_ROOT_FORCE_COW, &root->state); @@ -1440,7 +1458,9 @@ static int qgroup_account_snapshot(struct btrfs_trans_handle *trans, * recorded root will never be updated again, causing an outdated root * item. */ - record_root_in_trans(trans, src, 1); + ret = record_root_in_trans(trans, src, 1); + if (ret) + return ret; /* * btrfs_qgroup_inherit relies on a consistent view of the usage for the @@ -1509,7 +1529,7 @@ out: * insert_dir_item() */ if (!ret) - record_root_in_trans(trans, parent, 1); + ret = record_root_in_trans(trans, parent, 1); return ret; } @@ -1586,8 +1606,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, dentry = pending->dentry; parent_inode = pending->dir; parent_root = BTRFS_I(parent_inode)->root; - record_root_in_trans(trans, parent_root, 0); - + ret = record_root_in_trans(trans, parent_root, 0); + if (ret) + goto fail; cur_time = current_time(parent_inode); /* @@ -1623,7 +1644,11 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; } - record_root_in_trans(trans, root, 0); + ret = record_root_in_trans(trans, root, 0); + if (ret) { + btrfs_abort_transaction(trans, ret); + goto fail; + } btrfs_set_root_last_snapshot(&root->root_item, trans->transid); memcpy(new_root_item, &root->root_item, sizeof(*new_root_item)); 
btrfs_check_and_init_root_item(new_root_item); @@ -1961,7 +1986,6 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) */ BUG_ON(list_empty(&cur_trans->list)); - list_del_init(&cur_trans->list); if (cur_trans == fs_info->running_transaction) { cur_trans->state = TRANS_STATE_COMMIT_DOING; spin_unlock(&fs_info->trans_lock); @@ -1970,6 +1994,17 @@ static void cleanup_transaction(struct btrfs_trans_handle *trans, int err) spin_lock(&fs_info->trans_lock); } + + /* + * Now that we know no one else is still using the transaction we can + * remove the transaction from the list of transactions. This avoids + * the transaction kthread from cleaning up the transaction while some + * other task is still using it, which could result in a use-after-free + * on things like log trees, as it forces the transaction kthread to + * wait for this transaction to be cleaned up by us. + */ + list_del_init(&cur_trans->list); + spin_unlock(&fs_info->trans_lock); btrfs_cleanup_one_transaction(trans->transaction, fs_info); diff --git a/fs/btrfs/transaction.h b/fs/btrfs/transaction.h index 6335716e513f..364cfbb4c5c5 100644 --- a/fs/btrfs/transaction.h +++ b/fs/btrfs/transaction.h @@ -96,6 +96,13 @@ struct btrfs_transaction { spinlock_t releasing_ebs_lock; struct list_head releasing_ebs; + + /* + * The number of bytes currently reserved, by all transaction handles + * attached to this transaction, for metadata extents of the chunk tree. 
+ */ + atomic64_t chunk_bytes_reserved; + wait_queue_head_t chunk_reserve_wait; }; #define __TRANS_FREEZABLE (1U << 0) @@ -175,7 +182,7 @@ static inline void btrfs_set_inode_last_trans(struct btrfs_trans_handle *trans, spin_lock(&inode->lock); inode->last_trans = trans->transaction->transid; inode->last_sub_trans = inode->root->log_transid; - inode->last_log_commit = inode->root->last_log_commit; + inode->last_log_commit = inode->last_sub_trans - 1; spin_unlock(&inode->lock); } diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index f4ade821307d..a8b2e0d2c025 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1290,6 +1290,11 @@ static int check_extent_item(struct extent_buffer *leaf, key->offset, fs_info->sectorsize); return -EUCLEAN; } + if (unlikely(flags & BTRFS_BLOCK_FLAG_FULL_BACKREF)) { + extent_err(leaf, slot, + "invalid extent flag, data has full backref set"); + return -EUCLEAN; + } } ptr = (unsigned long)(struct btrfs_extent_item *)(ei + 1); diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 92a368627791..362d14db1e38 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -1574,7 +1574,9 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; - btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + if (ret) + goto out; } ref_ptr = (unsigned long)(ref_ptr + ref_struct_size) + namelen; @@ -1749,7 +1751,9 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, if (nlink != inode->i_nlink) { set_nlink(inode, nlink); - btrfs_update_inode(trans, root, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); + if (ret) + goto out; } BTRFS_I(inode)->index_cnt = (u64)-1; @@ -1787,6 +1791,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, break; if (ret == 1) { + ret = 0; if (path->slots[0] == 0) break; path->slots[0]--; @@ -1799,17 +1804,19 @@ static 
noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, ret = btrfs_del_item(trans, root, path); if (ret) - goto out; + break; btrfs_release_path(path); inode = read_one_inode(root, key.offset); - if (!inode) - return -EIO; + if (!inode) { + ret = -EIO; + break; + } ret = fixup_inode_link_count(trans, root, inode); iput(inode); if (ret) - goto out; + break; /* * fixup on a directory may create new entries, @@ -1818,8 +1825,6 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, */ key.offset = (u64)-1; } - ret = 0; -out: btrfs_release_path(path); return ret; } @@ -1858,8 +1863,6 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, ret = btrfs_update_inode(trans, root, BTRFS_I(inode)); } else if (ret == -EEXIST) { ret = 0; - } else { - BUG(); /* Logic Error */ } iput(inode); @@ -3165,20 +3168,22 @@ int btrfs_sync_log(struct btrfs_trans_handle *trans, */ mutex_unlock(&root->log_mutex); - btrfs_init_log_ctx(&root_log_ctx, NULL); - - mutex_lock(&log_root_tree->log_mutex); - if (btrfs_is_zoned(fs_info)) { + mutex_lock(&fs_info->tree_root->log_mutex); if (!log_root_tree->node) { ret = btrfs_alloc_log_tree_node(trans, log_root_tree); if (ret) { - mutex_unlock(&log_root_tree->log_mutex); + mutex_unlock(&fs_info->tree_log_mutex); goto out; } } + mutex_unlock(&fs_info->tree_root->log_mutex); } + btrfs_init_log_ctx(&root_log_ctx, NULL); + + mutex_lock(&log_root_tree->log_mutex); + index2 = log_root_tree->log_transid % 2; list_add_tail(&root_log_ctx.list, &log_root_tree->log_ctxs[index2]); root_log_ctx.log_transid = log_root_tree->log_transid; @@ -4136,7 +4141,8 @@ static noinline int copy_items(struct btrfs_trans_handle *trans, return ret; } -static int extent_cmp(void *priv, struct list_head *a, struct list_head *b) +static int extent_cmp(void *priv, const struct list_head *a, + const struct list_head *b) { struct extent_map *em1, *em2; @@ -6058,7 +6064,8 @@ static int btrfs_log_inode_parent(struct 
btrfs_trans_handle *trans, * (since logging them is pointless, a link count of 0 means they * will never be accessible). */ - if (btrfs_inode_in_log(inode, trans->transid) || + if ((btrfs_inode_in_log(inode, trans->transid) && + list_empty(&ctx->ordered_extents)) || inode->vfs_inode.i_nlink == 0) { ret = BTRFS_NO_LOG_SYNC; goto end_no_trans; @@ -6278,8 +6285,13 @@ again: } wc.replay_dest->log_root = log; - btrfs_record_root_in_trans(trans, wc.replay_dest); - ret = walk_log_tree(trans, log, &wc); + ret = btrfs_record_root_in_trans(trans, wc.replay_dest); + if (ret) + /* The loop needs to continue due to the root refs */ + btrfs_handle_fs_error(fs_info, ret, + "failed to record the log root in transaction"); + else + ret = walk_log_tree(trans, log, &wc); if (!ret && wc.stage == LOG_WALK_REPLAY_ALL) { ret = fixup_inode_link_counts(trans, wc.replay_dest, @@ -6454,6 +6466,24 @@ void btrfs_log_new_name(struct btrfs_trans_handle *trans, (!old_dir || old_dir->logged_trans < trans->transid)) return; + /* + * If we are doing a rename (old_dir is not NULL) from a directory that + * was previously logged, make sure the next log attempt on the directory + * is not skipped and logs the inode again. This is because the log may + * not currently be authoritative for a range including the old + * BTRFS_DIR_ITEM_KEY and BTRFS_DIR_INDEX_KEY keys, so we want to make + * sure after a log replay we do not end up with both the new and old + * dentries around (in case the inode is a directory we would have a + * directory with two hard links and 2 inode references for different + * parents). The next log attempt of old_dir will happen at + * btrfs_log_all_parents(), called through btrfs_log_inode_parent() + * below, because we have previously set inode->last_unlink_trans to the + * current transaction ID, either here or at btrfs_record_unlink_dir() in + * case inode is a directory. 
+ */ + if (old_dir) + old_dir->logged_trans = 0; + btrfs_init_log_ctx(&ctx, &inode->vfs_inode); ctx.logging_new_name = true; /* diff --git a/fs/btrfs/tree-mod-log.c b/fs/btrfs/tree-mod-log.c new file mode 100644 index 000000000000..8a3a14686d3e --- /dev/null +++ b/fs/btrfs/tree-mod-log.c @@ -0,0 +1,929 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "tree-mod-log.h" +#include "disk-io.h" + +struct tree_mod_root { + u64 logical; + u8 level; +}; + +struct tree_mod_elem { + struct rb_node node; + u64 logical; + u64 seq; + enum btrfs_mod_log_op op; + + /* + * This is used for BTRFS_MOD_LOG_KEY_* and BTRFS_MOD_LOG_MOVE_KEYS + * operations. + */ + int slot; + + /* This is used for BTRFS_MOD_LOG_KEY* and BTRFS_MOD_LOG_ROOT_REPLACE. */ + u64 generation; + + /* Those are used for op == BTRFS_MOD_LOG_KEY_{REPLACE,REMOVE}. */ + struct btrfs_disk_key key; + u64 blockptr; + + /* This is used for op == BTRFS_MOD_LOG_MOVE_KEYS. */ + struct { + int dst_slot; + int nr_items; + } move; + + /* This is used for op == BTRFS_MOD_LOG_ROOT_REPLACE. */ + struct tree_mod_root old_root; +}; + +/* + * Pull a new tree mod seq number for our operation. + */ +static inline u64 btrfs_inc_tree_mod_seq(struct btrfs_fs_info *fs_info) +{ + return atomic64_inc_return(&fs_info->tree_mod_seq); +} + +/* + * This adds a new blocker to the tree mod log's blocker list if the @elem + * passed does not already have a sequence number set. So when a caller expects + * to record tree modifications, it should ensure to set elem->seq to zero + * before calling btrfs_get_tree_mod_seq. + * Returns a fresh, unused tree log modification sequence number, even if no new + * blocker was added. 
+ */ +u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct btrfs_seq_list *elem) +{ + write_lock(&fs_info->tree_mod_log_lock); + if (!elem->seq) { + elem->seq = btrfs_inc_tree_mod_seq(fs_info); + list_add_tail(&elem->list, &fs_info->tree_mod_seq_list); + set_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags); + } + write_unlock(&fs_info->tree_mod_log_lock); + + return elem->seq; +} + +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct btrfs_seq_list *elem) +{ + struct rb_root *tm_root; + struct rb_node *node; + struct rb_node *next; + struct tree_mod_elem *tm; + u64 min_seq = BTRFS_SEQ_LAST; + u64 seq_putting = elem->seq; + + if (!seq_putting) + return; + + write_lock(&fs_info->tree_mod_log_lock); + list_del(&elem->list); + elem->seq = 0; + + if (list_empty(&fs_info->tree_mod_seq_list)) { + clear_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags); + } else { + struct btrfs_seq_list *first; + + first = list_first_entry(&fs_info->tree_mod_seq_list, + struct btrfs_seq_list, list); + if (seq_putting > first->seq) { + /* + * Blocker with lower sequence number exists, we cannot + * remove anything from the log. + */ + write_unlock(&fs_info->tree_mod_log_lock); + return; + } + min_seq = first->seq; + } + + /* + * Anything that's lower than the lowest existing (read: blocked) + * sequence number can be removed from the tree. + */ + tm_root = &fs_info->tree_mod_log; + for (node = rb_first(tm_root); node; node = next) { + next = rb_next(node); + tm = rb_entry(node, struct tree_mod_elem, node); + if (tm->seq >= min_seq) + continue; + rb_erase(node, tm_root); + kfree(tm); + } + write_unlock(&fs_info->tree_mod_log_lock); +} + +/* + * Key order of the log: + * node/leaf start address -> sequence + * + * The 'start address' is the logical address of the *new* root node for root + * replace operations, or the logical address of the affected block for all + * other operations. 
+ */ +static noinline int tree_mod_log_insert(struct btrfs_fs_info *fs_info, + struct tree_mod_elem *tm) +{ + struct rb_root *tm_root; + struct rb_node **new; + struct rb_node *parent = NULL; + struct tree_mod_elem *cur; + + lockdep_assert_held_write(&fs_info->tree_mod_log_lock); + + tm->seq = btrfs_inc_tree_mod_seq(fs_info); + + tm_root = &fs_info->tree_mod_log; + new = &tm_root->rb_node; + while (*new) { + cur = rb_entry(*new, struct tree_mod_elem, node); + parent = *new; + if (cur->logical < tm->logical) + new = &((*new)->rb_left); + else if (cur->logical > tm->logical) + new = &((*new)->rb_right); + else if (cur->seq < tm->seq) + new = &((*new)->rb_left); + else if (cur->seq > tm->seq) + new = &((*new)->rb_right); + else + return -EEXIST; + } + + rb_link_node(&tm->node, parent, new); + rb_insert_color(&tm->node, tm_root); + return 0; +} + +/* + * Determines if logging can be omitted. Returns true if it can. Otherwise, it + * returns false with the tree_mod_log_lock acquired. The caller must hold + * this until all tree mod log insertions are recorded in the rb tree and then + * write unlock fs_info::tree_mod_log_lock. + */ +static inline bool tree_mod_dont_log(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) +{ + if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) + return true; + if (eb && btrfs_header_level(eb) == 0) + return true; + + write_lock(&fs_info->tree_mod_log_lock); + if (list_empty(&(fs_info)->tree_mod_seq_list)) { + write_unlock(&fs_info->tree_mod_log_lock); + return true; + } + + return false; +} + +/* Similar to tree_mod_dont_log, but doesn't acquire any locks. 
*/ +static inline bool tree_mod_need_log(const struct btrfs_fs_info *fs_info, + struct extent_buffer *eb) +{ + if (!test_bit(BTRFS_FS_TREE_MOD_LOG_USERS, &fs_info->flags)) + return false; + if (eb && btrfs_header_level(eb) == 0) + return false; + + return true; +} + +static struct tree_mod_elem *alloc_tree_mod_elem(struct extent_buffer *eb, + int slot, + enum btrfs_mod_log_op op, + gfp_t flags) +{ + struct tree_mod_elem *tm; + + tm = kzalloc(sizeof(*tm), flags); + if (!tm) + return NULL; + + tm->logical = eb->start; + if (op != BTRFS_MOD_LOG_KEY_ADD) { + btrfs_node_key(eb, &tm->key, slot); + tm->blockptr = btrfs_node_blockptr(eb, slot); + } + tm->op = op; + tm->slot = slot; + tm->generation = btrfs_node_ptr_generation(eb, slot); + RB_CLEAR_NODE(&tm->node); + + return tm; +} + +int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, + enum btrfs_mod_log_op op, gfp_t flags) +{ + struct tree_mod_elem *tm; + int ret; + + if (!tree_mod_need_log(eb->fs_info, eb)) + return 0; + + tm = alloc_tree_mod_elem(eb, slot, op, flags); + if (!tm) + return -ENOMEM; + + if (tree_mod_dont_log(eb->fs_info, eb)) { + kfree(tm); + return 0; + } + + ret = tree_mod_log_insert(eb->fs_info, tm); + write_unlock(&eb->fs_info->tree_mod_log_lock); + if (ret) + kfree(tm); + + return ret; +} + +int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, + int dst_slot, int src_slot, + int nr_items) +{ + struct tree_mod_elem *tm = NULL; + struct tree_mod_elem **tm_list = NULL; + int ret = 0; + int i; + bool locked = false; + + if (!tree_mod_need_log(eb->fs_info, eb)) + return 0; + + tm_list = kcalloc(nr_items, sizeof(struct tree_mod_elem *), GFP_NOFS); + if (!tm_list) + return -ENOMEM; + + tm = kzalloc(sizeof(*tm), GFP_NOFS); + if (!tm) { + ret = -ENOMEM; + goto free_tms; + } + + tm->logical = eb->start; + tm->slot = src_slot; + tm->move.dst_slot = dst_slot; + tm->move.nr_items = nr_items; + tm->op = BTRFS_MOD_LOG_MOVE_KEYS; + + for (i = 0; i + dst_slot < src_slot && i < nr_items; 
i++) { + tm_list[i] = alloc_tree_mod_elem(eb, i + dst_slot, + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, GFP_NOFS); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if (tree_mod_dont_log(eb->fs_info, eb)) + goto free_tms; + locked = true; + + /* + * When we override something during the move, we log these removals. + * This can only happen when we move towards the beginning of the + * buffer, i.e. dst_slot < src_slot. + */ + for (i = 0; i + dst_slot < src_slot && i < nr_items; i++) { + ret = tree_mod_log_insert(eb->fs_info, tm_list[i]); + if (ret) + goto free_tms; + } + + ret = tree_mod_log_insert(eb->fs_info, tm); + if (ret) + goto free_tms; + write_unlock(&eb->fs_info->tree_mod_log_lock); + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nr_items; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &eb->fs_info->tree_mod_log); + kfree(tm_list[i]); + } + if (locked) + write_unlock(&eb->fs_info->tree_mod_log_lock); + kfree(tm_list); + kfree(tm); + + return ret; +} + +static inline int tree_mod_log_free_eb(struct btrfs_fs_info *fs_info, + struct tree_mod_elem **tm_list, + int nritems) +{ + int i, j; + int ret; + + for (i = nritems - 1; i >= 0; i--) { + ret = tree_mod_log_insert(fs_info, tm_list[i]); + if (ret) { + for (j = nritems - 1; j > i; j--) + rb_erase(&tm_list[j]->node, + &fs_info->tree_mod_log); + return ret; + } + } + + return 0; +} + +int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, + struct extent_buffer *new_root, + bool log_removal) +{ + struct btrfs_fs_info *fs_info = old_root->fs_info; + struct tree_mod_elem *tm = NULL; + struct tree_mod_elem **tm_list = NULL; + int nritems = 0; + int ret = 0; + int i; + + if (!tree_mod_need_log(fs_info, NULL)) + return 0; + + if (log_removal && btrfs_header_level(old_root) > 0) { + nritems = btrfs_header_nritems(old_root); + tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), + GFP_NOFS); + if (!tm_list) { + ret = -ENOMEM; + 
goto free_tms; + } + for (i = 0; i < nritems; i++) { + tm_list[i] = alloc_tree_mod_elem(old_root, i, + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + } + + tm = kzalloc(sizeof(*tm), GFP_NOFS); + if (!tm) { + ret = -ENOMEM; + goto free_tms; + } + + tm->logical = new_root->start; + tm->old_root.logical = old_root->start; + tm->old_root.level = btrfs_header_level(old_root); + tm->generation = btrfs_header_generation(old_root); + tm->op = BTRFS_MOD_LOG_ROOT_REPLACE; + + if (tree_mod_dont_log(fs_info, NULL)) + goto free_tms; + + if (tm_list) + ret = tree_mod_log_free_eb(fs_info, tm_list, nritems); + if (!ret) + ret = tree_mod_log_insert(fs_info, tm); + + write_unlock(&fs_info->tree_mod_log_lock); + if (ret) + goto free_tms; + kfree(tm_list); + + return ret; + +free_tms: + if (tm_list) { + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + } + kfree(tm); + + return ret; +} + +static struct tree_mod_elem *__tree_mod_log_search(struct btrfs_fs_info *fs_info, + u64 start, u64 min_seq, + bool smallest) +{ + struct rb_root *tm_root; + struct rb_node *node; + struct tree_mod_elem *cur = NULL; + struct tree_mod_elem *found = NULL; + + read_lock(&fs_info->tree_mod_log_lock); + tm_root = &fs_info->tree_mod_log; + node = tm_root->rb_node; + while (node) { + cur = rb_entry(node, struct tree_mod_elem, node); + if (cur->logical < start) { + node = node->rb_left; + } else if (cur->logical > start) { + node = node->rb_right; + } else if (cur->seq < min_seq) { + node = node->rb_left; + } else if (!smallest) { + /* We want the node with the highest seq */ + if (found) + BUG_ON(found->seq > cur->seq); + found = cur; + node = node->rb_left; + } else if (cur->seq > min_seq) { + /* We want the node with the smallest seq */ + if (found) + BUG_ON(found->seq < cur->seq); + found = cur; + node = node->rb_right; + } else { + found = cur; + break; + } + } + read_unlock(&fs_info->tree_mod_log_lock); + + return 
found; +} + +/* + * This returns the element from the log with the smallest time sequence + * value that's in the log (the oldest log item). Any element with a time + * sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem *tree_mod_log_search_oldest(struct btrfs_fs_info *fs_info, + u64 start, u64 min_seq) +{ + return __tree_mod_log_search(fs_info, start, min_seq, true); +} + +/* + * This returns the element from the log with the largest time sequence + * value that's in the log (the most recent log item). Any element with + * a time sequence lower than min_seq will be ignored. + */ +static struct tree_mod_elem *tree_mod_log_search(struct btrfs_fs_info *fs_info, + u64 start, u64 min_seq) +{ + return __tree_mod_log_search(fs_info, start, min_seq, false); +} + +int btrfs_tree_mod_log_eb_copy(struct extent_buffer *dst, + struct extent_buffer *src, + unsigned long dst_offset, + unsigned long src_offset, + int nr_items) +{ + struct btrfs_fs_info *fs_info = dst->fs_info; + int ret = 0; + struct tree_mod_elem **tm_list = NULL; + struct tree_mod_elem **tm_list_add, **tm_list_rem; + int i; + bool locked = false; + + if (!tree_mod_need_log(fs_info, NULL)) + return 0; + + if (btrfs_header_level(dst) == 0 && btrfs_header_level(src) == 0) + return 0; + + tm_list = kcalloc(nr_items * 2, sizeof(struct tree_mod_elem *), + GFP_NOFS); + if (!tm_list) + return -ENOMEM; + + tm_list_add = tm_list; + tm_list_rem = tm_list + nr_items; + for (i = 0; i < nr_items; i++) { + tm_list_rem[i] = alloc_tree_mod_elem(src, i + src_offset, + BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS); + if (!tm_list_rem[i]) { + ret = -ENOMEM; + goto free_tms; + } + + tm_list_add[i] = alloc_tree_mod_elem(dst, i + dst_offset, + BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS); + if (!tm_list_add[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if (tree_mod_dont_log(fs_info, NULL)) + goto free_tms; + locked = true; + + for (i = 0; i < nr_items; i++) { + ret = tree_mod_log_insert(fs_info, tm_list_rem[i]); + if 
(ret) + goto free_tms; + ret = tree_mod_log_insert(fs_info, tm_list_add[i]); + if (ret) + goto free_tms; + } + + write_unlock(&fs_info->tree_mod_log_lock); + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nr_items * 2; i++) { + if (tm_list[i] && !RB_EMPTY_NODE(&tm_list[i]->node)) + rb_erase(&tm_list[i]->node, &fs_info->tree_mod_log); + kfree(tm_list[i]); + } + if (locked) + write_unlock(&fs_info->tree_mod_log_lock); + kfree(tm_list); + + return ret; +} + +int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb) +{ + struct tree_mod_elem **tm_list = NULL; + int nritems = 0; + int i; + int ret = 0; + + if (!tree_mod_need_log(eb->fs_info, eb)) + return 0; + + nritems = btrfs_header_nritems(eb); + tm_list = kcalloc(nritems, sizeof(struct tree_mod_elem *), GFP_NOFS); + if (!tm_list) + return -ENOMEM; + + for (i = 0; i < nritems; i++) { + tm_list[i] = alloc_tree_mod_elem(eb, i, + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, GFP_NOFS); + if (!tm_list[i]) { + ret = -ENOMEM; + goto free_tms; + } + } + + if (tree_mod_dont_log(eb->fs_info, eb)) + goto free_tms; + + ret = tree_mod_log_free_eb(eb->fs_info, tm_list, nritems); + write_unlock(&eb->fs_info->tree_mod_log_lock); + if (ret) + goto free_tms; + kfree(tm_list); + + return 0; + +free_tms: + for (i = 0; i < nritems; i++) + kfree(tm_list[i]); + kfree(tm_list); + + return ret; +} + +/* + * Returns the logical address of the oldest predecessor of the given root. + * Entries older than time_seq are ignored. + */ +static struct tree_mod_elem *tree_mod_log_oldest_root(struct extent_buffer *eb_root, + u64 time_seq) +{ + struct tree_mod_elem *tm; + struct tree_mod_elem *found = NULL; + u64 root_logical = eb_root->start; + bool looped = false; + + if (!time_seq) + return NULL; + + /* + * The very last operation that's logged for a root is the replacement + * operation (if it is replaced at all). This has the logical address + * of the *new* root, making it the very first operation that's logged + * for this root. 
+ */ + while (1) { + tm = tree_mod_log_search_oldest(eb_root->fs_info, root_logical, + time_seq); + if (!looped && !tm) + return NULL; + /* + * If there are no tree operation for the oldest root, we simply + * return it. This should only happen if that (old) root is at + * level 0. + */ + if (!tm) + break; + + /* + * If there's an operation that's not a root replacement, we + * found the oldest version of our root. Normally, we'll find a + * BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING operation here. + */ + if (tm->op != BTRFS_MOD_LOG_ROOT_REPLACE) + break; + + found = tm; + root_logical = tm->old_root.logical; + looped = true; + } + + /* If there's no old root to return, return what we found instead */ + if (!found) + found = tm; + + return found; +} + + +/* + * tm is a pointer to the first operation to rewind within eb. Then, all + * previous operations will be rewound (until we reach something older than + * time_seq). + */ +static void tree_mod_log_rewind(struct btrfs_fs_info *fs_info, + struct extent_buffer *eb, + u64 time_seq, + struct tree_mod_elem *first_tm) +{ + u32 n; + struct rb_node *next; + struct tree_mod_elem *tm = first_tm; + unsigned long o_dst; + unsigned long o_src; + unsigned long p_size = sizeof(struct btrfs_key_ptr); + + n = btrfs_header_nritems(eb); + read_lock(&fs_info->tree_mod_log_lock); + while (tm && tm->seq >= time_seq) { + /* + * All the operations are recorded with the operator used for + * the modification. As we're going backwards, we do the + * opposite of each operation here. 
+ */ + switch (tm->op) { + case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING: + BUG_ON(tm->slot < n); + fallthrough; + case BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING: + case BTRFS_MOD_LOG_KEY_REMOVE: + btrfs_set_node_key(eb, &tm->key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_ptr_generation(eb, tm->slot, + tm->generation); + n++; + break; + case BTRFS_MOD_LOG_KEY_REPLACE: + BUG_ON(tm->slot >= n); + btrfs_set_node_key(eb, &tm->key, tm->slot); + btrfs_set_node_blockptr(eb, tm->slot, tm->blockptr); + btrfs_set_node_ptr_generation(eb, tm->slot, + tm->generation); + break; + case BTRFS_MOD_LOG_KEY_ADD: + /* if a move operation is needed it's in the log */ + n--; + break; + case BTRFS_MOD_LOG_MOVE_KEYS: + o_dst = btrfs_node_key_ptr_offset(tm->slot); + o_src = btrfs_node_key_ptr_offset(tm->move.dst_slot); + memmove_extent_buffer(eb, o_dst, o_src, + tm->move.nr_items * p_size); + break; + case BTRFS_MOD_LOG_ROOT_REPLACE: + /* + * This operation is special. For roots, this must be + * handled explicitly before rewinding. + * For non-roots, this operation may exist if the node + * was a root: root A -> child B; then A gets empty and + * B is promoted to the new root. In the mod log, we'll + * have a root-replace operation for B, a tree block + * that is no root. We simply ignore that operation. + */ + break; + } + next = rb_next(&tm->node); + if (!next) + break; + tm = rb_entry(next, struct tree_mod_elem, node); + if (tm->logical != first_tm->logical) + break; + } + read_unlock(&fs_info->tree_mod_log_lock); + btrfs_set_header_nritems(eb, n); +} + +/* + * Called with eb read locked. If the buffer cannot be rewound, the same buffer + * is returned. If rewind operations happen, a fresh buffer is returned. The + * returned buffer is always read-locked. If the returned buffer is not the + * input buffer, the lock on the input buffer is released and the input buffer + * is freed (its refcount is decremented). 
+ */ +struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + struct extent_buffer *eb, + u64 time_seq) +{ + struct extent_buffer *eb_rewin; + struct tree_mod_elem *tm; + + if (!time_seq) + return eb; + + if (btrfs_header_level(eb) == 0) + return eb; + + tm = tree_mod_log_search(fs_info, eb->start, time_seq); + if (!tm) + return eb; + + if (tm->op == BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) { + BUG_ON(tm->slot != 0); + eb_rewin = alloc_dummy_extent_buffer(fs_info, eb->start); + if (!eb_rewin) { + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + return NULL; + } + btrfs_set_header_bytenr(eb_rewin, eb->start); + btrfs_set_header_backref_rev(eb_rewin, + btrfs_header_backref_rev(eb)); + btrfs_set_header_owner(eb_rewin, btrfs_header_owner(eb)); + btrfs_set_header_level(eb_rewin, btrfs_header_level(eb)); + } else { + eb_rewin = btrfs_clone_extent_buffer(eb); + if (!eb_rewin) { + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + return NULL; + } + } + + btrfs_tree_read_unlock(eb); + free_extent_buffer(eb); + + btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb_rewin), + eb_rewin, btrfs_header_level(eb_rewin)); + btrfs_tree_read_lock(eb_rewin); + tree_mod_log_rewind(fs_info, eb_rewin, time_seq, tm); + WARN_ON(btrfs_header_nritems(eb_rewin) > + BTRFS_NODEPTRS_PER_BLOCK(fs_info)); + + return eb_rewin; +} + +/* + * Rewind the state of @root's root node to the given @time_seq value. + * If there are no changes, the current root->root_node is returned. If anything + * changed in between, there's a fresh buffer allocated on which the rewind + * operations are done. In any case, the returned buffer is read locked. + * Returns NULL on error (with no locks held). 
+ */ +struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq) +{ + struct btrfs_fs_info *fs_info = root->fs_info; + struct tree_mod_elem *tm; + struct extent_buffer *eb = NULL; + struct extent_buffer *eb_root; + u64 eb_root_owner = 0; + struct extent_buffer *old; + struct tree_mod_root *old_root = NULL; + u64 old_generation = 0; + u64 logical; + int level; + + eb_root = btrfs_read_lock_root_node(root); + tm = tree_mod_log_oldest_root(eb_root, time_seq); + if (!tm) + return eb_root; + + if (tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) { + old_root = &tm->old_root; + old_generation = tm->generation; + logical = old_root->logical; + level = old_root->level; + } else { + logical = eb_root->start; + level = btrfs_header_level(eb_root); + } + + tm = tree_mod_log_search(fs_info, logical, time_seq); + if (old_root && tm && tm->op != BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING) { + btrfs_tree_read_unlock(eb_root); + free_extent_buffer(eb_root); + old = read_tree_block(fs_info, logical, root->root_key.objectid, + 0, level, NULL); + if (WARN_ON(IS_ERR(old) || !extent_buffer_uptodate(old))) { + if (!IS_ERR(old)) + free_extent_buffer(old); + btrfs_warn(fs_info, + "failed to read tree block %llu from get_old_root", + logical); + } else { + struct tree_mod_elem *tm2; + + btrfs_tree_read_lock(old); + eb = btrfs_clone_extent_buffer(old); + /* + * After the lookup for the most recent tree mod operation + * above and before we locked and cloned the extent buffer + * 'old', a new tree mod log operation may have been added. + * So lookup for a more recent one to make sure the number + * of mod log operations we replay is consistent with the + * number of items we have in the cloned extent buffer, + * otherwise we can hit a BUG_ON when rewinding the extent + * buffer. 
+ */ + tm2 = tree_mod_log_search(fs_info, logical, time_seq); + btrfs_tree_read_unlock(old); + free_extent_buffer(old); + ASSERT(tm2); + ASSERT(tm2 == tm || tm2->seq > tm->seq); + if (!tm2 || tm2->seq < tm->seq) { + free_extent_buffer(eb); + return NULL; + } + tm = tm2; + } + } else if (old_root) { + eb_root_owner = btrfs_header_owner(eb_root); + btrfs_tree_read_unlock(eb_root); + free_extent_buffer(eb_root); + eb = alloc_dummy_extent_buffer(fs_info, logical); + } else { + eb = btrfs_clone_extent_buffer(eb_root); + btrfs_tree_read_unlock(eb_root); + free_extent_buffer(eb_root); + } + + if (!eb) + return NULL; + if (old_root) { + btrfs_set_header_bytenr(eb, eb->start); + btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); + btrfs_set_header_owner(eb, eb_root_owner); + btrfs_set_header_level(eb, old_root->level); + btrfs_set_header_generation(eb, old_generation); + } + btrfs_set_buffer_lockdep_class(btrfs_header_owner(eb), eb, + btrfs_header_level(eb)); + btrfs_tree_read_lock(eb); + if (tm) + tree_mod_log_rewind(fs_info, eb, time_seq, tm); + else + WARN_ON(btrfs_header_level(eb) != 0); + WARN_ON(btrfs_header_nritems(eb) > BTRFS_NODEPTRS_PER_BLOCK(fs_info)); + + return eb; +} + +int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq) +{ + struct tree_mod_elem *tm; + int level; + struct extent_buffer *eb_root = btrfs_root_node(root); + + tm = tree_mod_log_oldest_root(eb_root, time_seq); + if (tm && tm->op == BTRFS_MOD_LOG_ROOT_REPLACE) + level = tm->old_root.level; + else + level = btrfs_header_level(eb_root); + + free_extent_buffer(eb_root); + + return level; +} + +/* + * Return the lowest sequence number in the tree modification log. + * + * Return the sequence number of the oldest tree modification log user, which + * corresponds to the lowest sequence number of all existing users. If there are + * no users it returns 0. 
+ */ +u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info) +{ + u64 ret = 0; + + read_lock(&fs_info->tree_mod_log_lock); + if (!list_empty(&fs_info->tree_mod_seq_list)) { + struct btrfs_seq_list *elem; + + elem = list_first_entry(&fs_info->tree_mod_seq_list, + struct btrfs_seq_list, list); + ret = elem->seq; + } + read_unlock(&fs_info->tree_mod_log_lock); + + return ret; +} diff --git a/fs/btrfs/tree-mod-log.h b/fs/btrfs/tree-mod-log.h new file mode 100644 index 000000000000..12605d19621b --- /dev/null +++ b/fs/btrfs/tree-mod-log.h @@ -0,0 +1,53 @@ +/* SPDX-License-Identifier: GPL-2.0 */ + +#ifndef BTRFS_TREE_MOD_LOG_H +#define BTRFS_TREE_MOD_LOG_H + +#include "ctree.h" + +/* Represents a tree mod log user. */ +struct btrfs_seq_list { + struct list_head list; + u64 seq; +}; + +#define BTRFS_SEQ_LIST_INIT(name) { .list = LIST_HEAD_INIT((name).list), .seq = 0 } +#define BTRFS_SEQ_LAST ((u64)-1) + +enum btrfs_mod_log_op { + BTRFS_MOD_LOG_KEY_REPLACE, + BTRFS_MOD_LOG_KEY_ADD, + BTRFS_MOD_LOG_KEY_REMOVE, + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_FREEING, + BTRFS_MOD_LOG_KEY_REMOVE_WHILE_MOVING, + BTRFS_MOD_LOG_MOVE_KEYS, + BTRFS_MOD_LOG_ROOT_REPLACE, +}; + +u64 btrfs_get_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct btrfs_seq_list *elem); +void btrfs_put_tree_mod_seq(struct btrfs_fs_info *fs_info, + struct btrfs_seq_list *elem); +int btrfs_tree_mod_log_insert_root(struct extent_buffer *old_root, + struct extent_buffer *new_root, + bool log_removal); +int btrfs_tree_mod_log_insert_key(struct extent_buffer *eb, int slot, + enum btrfs_mod_log_op op, gfp_t flags); +int btrfs_tree_mod_log_free_eb(struct extent_buffer *eb); +struct extent_buffer *btrfs_tree_mod_log_rewind(struct btrfs_fs_info *fs_info, + struct btrfs_path *path, + struct extent_buffer *eb, + u64 time_seq); +struct extent_buffer *btrfs_get_old_root(struct btrfs_root *root, u64 time_seq); +int btrfs_old_root_level(struct btrfs_root *root, u64 time_seq); +int btrfs_tree_mod_log_eb_copy(struct 
extent_buffer *dst, + struct extent_buffer *src, + unsigned long dst_offset, + unsigned long src_offset, + int nr_items); +int btrfs_tree_mod_log_insert_move(struct extent_buffer *eb, + int dst_slot, int src_slot, + int nr_items); +u64 btrfs_tree_mod_log_lowest_seq(struct btrfs_fs_info *fs_info); + +#endif diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index bc3b33efddc5..47d27059d064 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -1224,7 +1224,8 @@ static int open_fs_devices(struct btrfs_fs_devices *fs_devices, return 0; } -static int devid_cmp(void *priv, struct list_head *a, struct list_head *b) +static int devid_cmp(void *priv, const struct list_head *a, + const struct list_head *b) { struct btrfs_device *dev1, *dev2; @@ -1459,7 +1460,7 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device, if (ret == -ERANGE) { *hole_start += *hole_size; *hole_size = 0; - return 1; + return true; } *hole_start += zone_size; @@ -3098,11 +3099,12 @@ out: return ret; } -static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) +int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) { struct btrfs_root *root = fs_info->chunk_root; struct btrfs_trans_handle *trans; struct btrfs_block_group *block_group; + u64 length; int ret; /* @@ -3117,7 +3119,7 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) * we release the path used to search the chunk/dev tree and before * the current task acquires this mutex and calls us. 
*/ - lockdep_assert_held(&fs_info->delete_unused_bgs_mutex); + lockdep_assert_held(&fs_info->reclaim_bgs_lock); /* step one, relocate all the extents inside this chunk */ btrfs_scrub_pause(fs_info); @@ -3130,8 +3132,23 @@ static int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset) if (!block_group) return -ENOENT; btrfs_discard_cancel_work(&fs_info->discard_ctl, block_group); + length = block_group->length; btrfs_put_block_group(block_group); + /* + * On a zoned file system, discard the whole block group, this will + * trigger a REQ_OP_ZONE_RESET operation on the device zone. If + * resetting the zone fails, don't treat it as a fatal problem from the + * filesystem's point of view. + */ + if (btrfs_is_zoned(fs_info)) { + ret = btrfs_discard_extent(fs_info, chunk_offset, length, NULL); + if (ret) + btrfs_info(fs_info, + "failed to reset zone %llu after relocation", + chunk_offset); + } + trans = btrfs_start_trans_remove_block_group(root->fs_info, chunk_offset); if (IS_ERR(trans)) { @@ -3172,10 +3189,10 @@ again: key.type = BTRFS_CHUNK_ITEM_KEY; while (1) { - mutex_lock(&fs_info->delete_unused_bgs_mutex); + mutex_lock(&fs_info->reclaim_bgs_lock); ret = btrfs_search_slot(NULL, chunk_root, &key, path, 0, 0); if (ret < 0) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } BUG_ON(ret == 0); /* Corruption */ @@ -3183,7 +3200,7 @@ again: ret = btrfs_previous_item(chunk_root, path, key.objectid, key.type); if (ret) - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret < 0) goto error; if (ret > 0) @@ -3204,7 +3221,7 @@ again: else BUG_ON(ret); } - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); if (found_key.offset == 0) break; @@ -3744,10 +3761,10 @@ again: goto error; } - mutex_lock(&fs_info->delete_unused_bgs_mutex); + mutex_lock(&fs_info->reclaim_bgs_lock); ret = btrfs_search_slot(NULL, chunk_root, 
&key, path, 0, 0); if (ret < 0) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } @@ -3761,7 +3778,7 @@ again: ret = btrfs_previous_item(chunk_root, path, 0, BTRFS_CHUNK_ITEM_KEY); if (ret) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); ret = 0; break; } @@ -3771,7 +3788,7 @@ again: btrfs_item_key_to_cpu(leaf, &found_key, slot); if (found_key.objectid != key.objectid) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); break; } @@ -3788,12 +3805,12 @@ again: btrfs_release_path(path); if (!ret) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); goto loop; } if (counting) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); spin_lock(&fs_info->balance_lock); bctl->stat.expected++; spin_unlock(&fs_info->balance_lock); @@ -3818,7 +3835,7 @@ again: count_meta < bctl->meta.limit_min) || ((chunk_type & BTRFS_BLOCK_GROUP_SYSTEM) && count_sys < bctl->sys.limit_min)) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); goto loop; } @@ -3832,7 +3849,7 @@ again: ret = btrfs_may_alloc_data_chunk(fs_info, found_key.offset); if (ret < 0) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); goto error; } else if (ret == 1) { chunk_reserved = 1; @@ -3840,7 +3857,7 @@ again: } ret = btrfs_relocate_chunk(fs_info, found_key.offset); - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret == -ENOSPC) { enospc_errors++; } else if (ret == -ETXTBSY) { @@ -4725,16 +4742,16 @@ again: key.type = BTRFS_DEV_EXTENT_KEY; do { - mutex_lock(&fs_info->delete_unused_bgs_mutex); + mutex_lock(&fs_info->reclaim_bgs_lock); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { - 
mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); goto done; } ret = btrfs_previous_item(root, path, 0, key.type); if (ret) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret < 0) goto done; ret = 0; @@ -4747,7 +4764,7 @@ again: btrfs_item_key_to_cpu(l, &key, path->slots[0]); if (key.objectid != device->devid) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_release_path(path); break; } @@ -4756,7 +4773,7 @@ again: length = btrfs_dev_extent_length(l, dev_extent); if (key.offset + length <= new_size) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_release_path(path); break; } @@ -4772,12 +4789,12 @@ again: */ ret = btrfs_may_alloc_data_chunk(fs_info, chunk_offset); if (ret < 0) { - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); goto done; } ret = btrfs_relocate_chunk(fs_info, chunk_offset); - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); if (ret == -ENOSPC) { failed++; } else if (ret) { @@ -4989,6 +5006,8 @@ static void init_alloc_chunk_ctl_policy_zoned( ctl->max_chunk_size = 2 * ctl->max_stripe_size; ctl->devs_max = min_t(int, ctl->devs_max, BTRFS_MAX_DEVS_SYS_CHUNK); + } else { + BUG(); } /* We don't want a chunk larger than 10% of writable space */ @@ -6787,6 +6806,46 @@ static u64 calc_stripe_length(u64 type, u64 chunk_len, int num_stripes) return div_u64(chunk_len, data_stripes); } +#if BITS_PER_LONG == 32 +/* + * Due to page cache limit, metadata beyond BTRFS_32BIT_MAX_FILE_SIZE + * can't be accessed on 32bit systems. + * + * This function do mount time check to reject the fs if it already has + * metadata chunk beyond that limit. 
+ */ +static int check_32bit_meta_chunk(struct btrfs_fs_info *fs_info, + u64 logical, u64 length, u64 type) +{ + if (!(type & BTRFS_BLOCK_GROUP_METADATA)) + return 0; + + if (logical + length < MAX_LFS_FILESIZE) + return 0; + + btrfs_err_32bit_limit(fs_info); + return -EOVERFLOW; +} + +/* + * This is to give early warning for any metadata chunk reaching + * BTRFS_32BIT_EARLY_WARN_THRESHOLD. + * Although we can still access the metadata, it's not going to be possible + * once the limit is reached. + */ +static void warn_32bit_meta_chunk(struct btrfs_fs_info *fs_info, + u64 logical, u64 length, u64 type) +{ + if (!(type & BTRFS_BLOCK_GROUP_METADATA)) + return; + + if (logical + length < BTRFS_32BIT_EARLY_WARN_THRESHOLD) + return; + + btrfs_warn_32bit_limit(fs_info); +} +#endif + static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, struct btrfs_chunk *chunk) { @@ -6797,6 +6856,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, u64 logical; u64 length; u64 devid; + u64 type; u8 uuid[BTRFS_UUID_SIZE]; int num_stripes; int ret; @@ -6804,8 +6864,16 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, logical = key->offset; length = btrfs_chunk_length(leaf, chunk); + type = btrfs_chunk_type(leaf, chunk); num_stripes = btrfs_chunk_num_stripes(leaf, chunk); +#if BITS_PER_LONG == 32 + ret = check_32bit_meta_chunk(fs_info, logical, length, type); + if (ret < 0) + return ret; + warn_32bit_meta_chunk(fs_info, logical, length, type); +#endif + /* * Only need to verify chunk item if we're reading from sys chunk array, * as chunk item in tree block is already verified by tree-checker. 
@@ -6849,10 +6917,10 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, map->io_width = btrfs_chunk_io_width(leaf, chunk); map->io_align = btrfs_chunk_io_align(leaf, chunk); map->stripe_len = btrfs_chunk_stripe_len(leaf, chunk); - map->type = btrfs_chunk_type(leaf, chunk); + map->type = type; map->sub_stripes = btrfs_chunk_sub_stripes(leaf, chunk); map->verified_stripes = 0; - em->orig_block_len = calc_stripe_length(map->type, em->len, + em->orig_block_len = calc_stripe_length(type, em->len, map->num_stripes); for (i = 0; i < num_stripes; i++) { map->stripes[i].physical = @@ -7448,6 +7516,9 @@ static int btrfs_device_init_dev_stats(struct btrfs_device *device, int item_size; int i, ret, slot; + if (!device->fs_info->dev_root) + return 0; + key.objectid = BTRFS_DEV_STATS_OBJECTID; key.type = BTRFS_PERSISTENT_ITEM_KEY; key.offset = device->devid; @@ -7998,7 +8069,7 @@ static int relocating_repair_kthread(void *data) return -EBUSY; } - mutex_lock(&fs_info->delete_unused_bgs_mutex); + mutex_lock(&fs_info->reclaim_bgs_lock); /* Ensure block group still exists */ cache = btrfs_lookup_block_group(fs_info, target); @@ -8020,7 +8091,7 @@ static int relocating_repair_kthread(void *data) out: if (cache) btrfs_put_block_group(cache); - mutex_unlock(&fs_info->delete_unused_bgs_mutex); + mutex_unlock(&fs_info->reclaim_bgs_lock); btrfs_exclop_finish(fs_info); return ret; diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index d4c3e0dd32b8..9c0d84e5ec06 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -484,6 +484,7 @@ void btrfs_describe_block_groups(u64 flags, char *buf, u32 size_buf); int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info); int btrfs_recover_balance(struct btrfs_fs_info *fs_info); int btrfs_pause_balance(struct btrfs_fs_info *fs_info); +int btrfs_relocate_chunk(struct btrfs_fs_info *fs_info, u64 chunk_offset); int btrfs_cancel_balance(struct btrfs_fs_info *fs_info); int btrfs_create_uuid_tree(struct btrfs_fs_info 
*fs_info); int btrfs_uuid_scan_kthread(void *data); diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index d524acf7b3e5..c3fa7d3fa770 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -375,7 +375,6 @@ int zlib_decompress(struct list_head *ws, unsigned char *data_in, unsigned long bytes_left; unsigned long total_out = 0; unsigned long pg_offset = 0; - char *kaddr; destlen = min_t(unsigned long, destlen, PAGE_SIZE); bytes_left = destlen; @@ -455,9 +454,7 @@ next: * end of the inline extent (destlen) to the end of the page */ if (pg_offset < destlen) { - kaddr = kmap_atomic(dest_page); - memset(kaddr + pg_offset, 0, destlen - pg_offset); - kunmap_atomic(kaddr); + memzero_page(dest_page, pg_offset, destlen - pg_offset); } return ret; } diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 1f972b75a9ab..1bb8ee97aae0 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -21,9 +21,30 @@ /* Pseudo write pointer value for conventional zone */ #define WP_CONVENTIONAL ((u64)-2) +/* + * Location of the first zone of superblock logging zone pairs. + * + * - primary superblock: 0B (zone 0) + * - first copy: 512G (zone starting at that offset) + * - second copy: 4T (zone starting at that offset) + */ +#define BTRFS_SB_LOG_PRIMARY_OFFSET (0ULL) +#define BTRFS_SB_LOG_FIRST_OFFSET (512ULL * SZ_1G) +#define BTRFS_SB_LOG_SECOND_OFFSET (4096ULL * SZ_1G) + +#define BTRFS_SB_LOG_FIRST_SHIFT const_ilog2(BTRFS_SB_LOG_FIRST_OFFSET) +#define BTRFS_SB_LOG_SECOND_SHIFT const_ilog2(BTRFS_SB_LOG_SECOND_OFFSET) + /* Number of superblock log zones */ #define BTRFS_NR_SB_LOG_ZONES 2 +/* + * Maximum supported zone size. Currently, SMR disks have a zone size of + * 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not + * expect the zone size to become larger than 8GiB in the near future. 
+ */ +#define BTRFS_MAX_ZONE_SIZE SZ_8G + static int copy_zone_info_cb(struct blk_zone *zone, unsigned int idx, void *data) { struct blk_zone *zones = data; @@ -111,23 +132,22 @@ static int sb_write_pointer(struct block_device *bdev, struct blk_zone *zones, } /* - * The following zones are reserved as the circular buffer on ZONED btrfs. - * - The primary superblock: zones 0 and 1 - * - The first copy: zones 16 and 17 - * - The second copy: zones 1024 or zone at 256GB which is minimum, and - * the following one + * Get the first zone number of the superblock mirror */ static inline u32 sb_zone_number(int shift, int mirror) { - ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); + u64 zone; + ASSERT(mirror < BTRFS_SUPER_MIRROR_MAX); switch (mirror) { - case 0: return 0; - case 1: return 16; - case 2: return min_t(u64, btrfs_sb_offset(mirror) >> shift, 1024); + case 0: zone = 0; break; + case 1: zone = 1ULL << (BTRFS_SB_LOG_FIRST_SHIFT - shift); break; + case 2: zone = 1ULL << (BTRFS_SB_LOG_SECOND_SHIFT - shift); break; } - return 0; + ASSERT(zone <= U32_MAX); + + return (u32)zone; } /* @@ -300,10 +320,21 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device) zone_sectors = bdev_zone_sectors(bdev); } - nr_sectors = bdev_nr_sectors(bdev); /* Check if it's power of 2 (see is_power_of_2) */ ASSERT(zone_sectors != 0 && (zone_sectors & (zone_sectors - 1)) == 0); zone_info->zone_size = zone_sectors << SECTOR_SHIFT; + + /* We reject devices with a zone size larger than 8GB */ + if (zone_info->zone_size > BTRFS_MAX_ZONE_SIZE) { + btrfs_err_in_rcu(fs_info, + "zoned: %s: zone size %llu larger than supported maximum %llu", + rcu_str_deref(device->name), + zone_info->zone_size, BTRFS_MAX_ZONE_SIZE); + ret = -EINVAL; + goto out; + } + + nr_sectors = bdev_nr_sectors(bdev); zone_info->zone_size_shift = ilog2(zone_info->zone_size); zone_info->max_zone_append_size = (u64)queue_max_zone_append_sectors(queue) << SECTOR_SHIFT; @@ -311,6 +342,13 @@ int btrfs_get_dev_zone_info(struct 
btrfs_device *device) if (!IS_ALIGNED(nr_sectors, zone_sectors)) zone_info->nr_zones++; + if (bdev_is_zoned(bdev) && zone_info->max_zone_append_size == 0) { + btrfs_err(fs_info, "zoned: device %pg does not support zone append", + bdev); + ret = -EINVAL; + goto out; + } + zone_info->seq_zones = bitmap_zalloc(zone_info->nr_zones, GFP_KERNEL); if (!zone_info->seq_zones) { ret = -ENOMEM; @@ -1088,6 +1126,11 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) goto out; } + if (zone.type == BLK_ZONE_TYPE_CONVENTIONAL) { + ret = -EIO; + goto out; + } + switch (zone.cond) { case BLK_ZONE_COND_OFFLINE: case BLK_ZONE_COND_READONLY: @@ -1235,7 +1278,7 @@ void btrfs_free_redirty_list(struct btrfs_transaction *trans) spin_unlock(&trans->releasing_ebs_lock); } -bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em) +bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_block_group *cache; @@ -1250,7 +1293,7 @@ bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em) if (!is_data_inode(&inode->vfs_inode)) return false; - cache = btrfs_lookup_block_group(fs_info, em->block_start); + cache = btrfs_lookup_block_group(fs_info, start); ASSERT(cache); if (!cache) return false; diff --git a/fs/btrfs/zoned.h b/fs/btrfs/zoned.h index 61e969652fe1..e55d32595c2c 100644 --- a/fs/btrfs/zoned.h +++ b/fs/btrfs/zoned.h @@ -9,6 +9,12 @@ #include "disk-io.h" #include "block-group.h" +/* + * Block groups with more than this value (percents) of unusable space will be + * scheduled for background reclaim. 
+ */ +#define BTRFS_DEFAULT_RECLAIM_THRESH 75 + struct btrfs_zoned_device_info { /* * Number of zones, zone size and types of zones if bdev is a @@ -47,7 +53,7 @@ void btrfs_calc_zone_unusable(struct btrfs_block_group *cache); void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb); void btrfs_free_redirty_list(struct btrfs_transaction *trans); -bool btrfs_use_zone_append(struct btrfs_inode *inode, struct extent_map *em); +bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start); void btrfs_record_physical_zoned(struct inode *inode, u64 file_offset, struct bio *bio); void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered); @@ -146,8 +152,7 @@ static inline void btrfs_redirty_list_add(struct btrfs_transaction *trans, struct extent_buffer *eb) { } static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans) { } -static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, - struct extent_map *em) +static inline bool btrfs_use_zone_append(struct btrfs_inode *inode, u64 start) { return false; } diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 8e9626d63976..3e26b466476a 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -631,7 +631,6 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in, size_t ret2; unsigned long total_out = 0; unsigned long pg_offset = 0; - char *kaddr; stream = ZSTD_initDStream( ZSTD_BTRFS_MAX_INPUT, workspace->mem, workspace->size); @@ -696,9 +695,7 @@ int zstd_decompress(struct list_head *ws, unsigned char *data_in, ret = 0; finish: if (pg_offset < destlen) { - kaddr = kmap_atomic(dest_page); - memset(kaddr + pg_offset, 0, destlen - pg_offset); - kunmap_atomic(kaddr); + memzero_page(dest_page, pg_offset, destlen - pg_offset); } return ret; } diff --git a/fs/buffer.c b/fs/buffer.c index 0cb7ffd4977c..ea48c01fb76b 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -1020,11 +1020,7 @@ grow_buffers(struct block_device *bdev, sector_t block, int size, gfp_t gfp) 
pgoff_t index; int sizebits; - sizebits = -1; - do { - sizebits++; - } while ((size << sizebits) < PAGE_SIZE); - + sizebits = PAGE_SHIFT - __ffs(size); index = block >> sizebits; /* @@ -1264,6 +1260,15 @@ static void bh_lru_install(struct buffer_head *bh) int i; check_irqs_on(); + /* + * the refcount of buffer_head in bh_lru prevents dropping the + * attached page(i.e., try_to_free_buffers) so it could cause + * failing page migration. + * Skip putting upcoming bh into bh_lru until migration is done. + */ + if (lru_cache_disabled()) + return; + bh_lru_lock(); b = this_cpu_ptr(&bh_lrus); @@ -1404,6 +1409,15 @@ __bread_gfp(struct block_device *bdev, sector_t block, } EXPORT_SYMBOL(__bread_gfp); +static void __invalidate_bh_lrus(struct bh_lru *b) +{ + int i; + + for (i = 0; i < BH_LRU_SIZE; i++) { + brelse(b->bhs[i]); + b->bhs[i] = NULL; + } +} /* * invalidate_bh_lrus() is called rarely - but not only at unmount. * This doesn't race because it runs in each cpu either in irq @@ -1412,16 +1426,12 @@ EXPORT_SYMBOL(__bread_gfp); static void invalidate_bh_lru(void *arg) { struct bh_lru *b = &get_cpu_var(bh_lrus); - int i; - for (i = 0; i < BH_LRU_SIZE; i++) { - brelse(b->bhs[i]); - b->bhs[i] = NULL; - } + __invalidate_bh_lrus(b); put_cpu_var(bh_lrus); } -static bool has_bh_in_lru(int cpu, void *dummy) +bool has_bh_in_lru(int cpu, void *dummy) { struct bh_lru *b = per_cpu_ptr(&bh_lrus, cpu); int i; @@ -1440,6 +1450,16 @@ void invalidate_bh_lrus(void) } EXPORT_SYMBOL_GPL(invalidate_bh_lrus); +void invalidate_bh_lrus_cpu(int cpu) +{ + struct bh_lru *b; + + bh_lru_lock(); + b = per_cpu_ptr(&bh_lrus, cpu); + __invalidate_bh_lrus(b); + bh_lru_unlock(); +} + void set_bh_page(struct buffer_head *bh, struct page *page, unsigned long offset) { diff --git a/fs/cachefiles/Makefile b/fs/cachefiles/Makefile index 891dedda5905..2227dc2d5498 100644 --- a/fs/cachefiles/Makefile +++ b/fs/cachefiles/Makefile @@ -7,6 +7,7 @@ cachefiles-y := \ bind.o \ daemon.o \ interface.o \ + io.o \ key.o \ 
main.o \ namei.o \ diff --git a/fs/cachefiles/bind.c b/fs/cachefiles/bind.c index dfb14dbddf51..38bb7764b454 100644 --- a/fs/cachefiles/bind.c +++ b/fs/cachefiles/bind.c @@ -118,6 +118,12 @@ static int cachefiles_daemon_add_cache(struct cachefiles_cache *cache) cache->mnt = path.mnt; root = path.dentry; + ret = -EINVAL; + if (mnt_user_ns(path.mnt) != &init_user_ns) { + pr_warn("File cache on idmapped mounts not supported"); + goto error_unsupported; + } + /* check parameters */ ret = -EOPNOTSUPP; if (d_is_negative(root) || diff --git a/fs/cachefiles/interface.c b/fs/cachefiles/interface.c index 5efa6a3702c0..da3948fdb615 100644 --- a/fs/cachefiles/interface.c +++ b/fs/cachefiles/interface.c @@ -319,8 +319,8 @@ static void cachefiles_drop_object(struct fscache_object *_object) /* * dispose of a reference to an object */ -static void cachefiles_put_object(struct fscache_object *_object, - enum fscache_obj_ref_trace why) +void cachefiles_put_object(struct fscache_object *_object, + enum fscache_obj_ref_trace why) { struct cachefiles_object *object; struct fscache_cache *cache; @@ -568,4 +568,5 @@ const struct fscache_cache_ops cachefiles_cache_ops = { .uncache_page = cachefiles_uncache_page, .dissociate_pages = cachefiles_dissociate_pages, .check_consistency = cachefiles_check_consistency, + .begin_read_operation = cachefiles_begin_read_operation, }; diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index cf9bd6401c2d..4ed83aa5253b 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -150,6 +150,9 @@ extern int cachefiles_has_space(struct cachefiles_cache *cache, */ extern const struct fscache_cache_ops cachefiles_cache_ops; +void cachefiles_put_object(struct fscache_object *_object, + enum fscache_obj_ref_trace why); + /* * key.c */ @@ -218,6 +221,12 @@ extern int cachefiles_write_page(struct fscache_storage *, struct page *); extern void cachefiles_uncache_page(struct fscache_object *, struct page *); /* + * rdwr2.c + */ +extern int 
cachefiles_begin_read_operation(struct netfs_read_request *, + struct fscache_retrieval *); + +/* * security.c */ extern int cachefiles_get_security_ID(struct cachefiles_cache *cache); diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c new file mode 100644 index 000000000000..b13fb45fc3f3 --- /dev/null +++ b/fs/cachefiles/io.c @@ -0,0 +1,420 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* kiocb-using read/write + * + * Copyright (C) 2021 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/mount.h> +#include <linux/slab.h> +#include <linux/file.h> +#include <linux/uio.h> +#include <linux/sched/mm.h> +#include <linux/netfs.h> +#include "internal.h" + +struct cachefiles_kiocb { + struct kiocb iocb; + refcount_t ki_refcnt; + loff_t start; + union { + size_t skipped; + size_t len; + }; + netfs_io_terminated_t term_func; + void *term_func_priv; + bool was_async; +}; + +static inline void cachefiles_put_kiocb(struct cachefiles_kiocb *ki) +{ + if (refcount_dec_and_test(&ki->ki_refcnt)) { + fput(ki->iocb.ki_filp); + kfree(ki); + } +} + +/* + * Handle completion of a read from the cache. + */ +static void cachefiles_read_complete(struct kiocb *iocb, long ret, long ret2) +{ + struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb); + + _enter("%ld,%ld", ret, ret2); + + if (ki->term_func) { + if (ret >= 0) + ret += ki->skipped; + ki->term_func(ki->term_func_priv, ret, ki->was_async); + } + + cachefiles_put_kiocb(ki); +} + +/* + * Initiate a read from the cache. 
+ */ +static int cachefiles_read(struct netfs_cache_resources *cres, + loff_t start_pos, + struct iov_iter *iter, + bool seek_data, + netfs_io_terminated_t term_func, + void *term_func_priv) +{ + struct cachefiles_kiocb *ki; + struct file *file = cres->cache_priv2; + unsigned int old_nofs; + ssize_t ret = -ENOBUFS; + size_t len = iov_iter_count(iter), skipped = 0; + + _enter("%pD,%li,%llx,%zx/%llx", + file, file_inode(file)->i_ino, start_pos, len, + i_size_read(file->f_inode)); + + /* If the caller asked us to seek for data before doing the read, then + * we should do that now. If we find a gap, we fill it with zeros. + */ + if (seek_data) { + loff_t off = start_pos, off2; + + off2 = vfs_llseek(file, off, SEEK_DATA); + if (off2 < 0 && off2 >= (loff_t)-MAX_ERRNO && off2 != -ENXIO) { + skipped = 0; + ret = off2; + goto presubmission_error; + } + + if (off2 == -ENXIO || off2 >= start_pos + len) { + /* The region is beyond the EOF or there's no more data + * in the region, so clear the rest of the buffer and + * return success. 
+ */ + iov_iter_zero(len, iter); + skipped = len; + ret = 0; + goto presubmission_error; + } + + skipped = off2 - off; + iov_iter_zero(skipped, iter); + } + + ret = -ENOBUFS; + ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL); + if (!ki) + goto presubmission_error; + + refcount_set(&ki->ki_refcnt, 2); + ki->iocb.ki_filp = file; + ki->iocb.ki_pos = start_pos + skipped; + ki->iocb.ki_flags = IOCB_DIRECT; + ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file)); + ki->iocb.ki_ioprio = get_current_ioprio(); + ki->skipped = skipped; + ki->term_func = term_func; + ki->term_func_priv = term_func_priv; + ki->was_async = true; + + if (ki->term_func) + ki->iocb.ki_complete = cachefiles_read_complete; + + get_file(ki->iocb.ki_filp); + + old_nofs = memalloc_nofs_save(); + ret = vfs_iocb_iter_read(file, &ki->iocb, iter); + memalloc_nofs_restore(old_nofs); + switch (ret) { + case -EIOCBQUEUED: + goto in_progress; + + case -ERESTARTSYS: + case -ERESTARTNOINTR: + case -ERESTARTNOHAND: + case -ERESTART_RESTARTBLOCK: + /* There's no easy way to restart the syscall since other AIO's + * may be already running. Just fail this IO with EINTR. + */ + ret = -EINTR; + fallthrough; + default: + ki->was_async = false; + cachefiles_read_complete(&ki->iocb, ret, 0); + if (ret > 0) + ret = 0; + break; + } + +in_progress: + cachefiles_put_kiocb(ki); + _leave(" = %zd", ret); + return ret; + +presubmission_error: + if (term_func) + term_func(term_func_priv, ret < 0 ? ret : skipped, false); + return ret; +} + +/* + * Handle completion of a write to the cache. 
+ */ +static void cachefiles_write_complete(struct kiocb *iocb, long ret, long ret2) +{ + struct cachefiles_kiocb *ki = container_of(iocb, struct cachefiles_kiocb, iocb); + struct inode *inode = file_inode(ki->iocb.ki_filp); + + _enter("%ld,%ld", ret, ret2); + + /* Tell lockdep we inherited freeze protection from submission thread */ + __sb_writers_acquired(inode->i_sb, SB_FREEZE_WRITE); + __sb_end_write(inode->i_sb, SB_FREEZE_WRITE); + + if (ki->term_func) + ki->term_func(ki->term_func_priv, ret, ki->was_async); + + cachefiles_put_kiocb(ki); +} + +/* + * Initiate a write to the cache. + */ +static int cachefiles_write(struct netfs_cache_resources *cres, + loff_t start_pos, + struct iov_iter *iter, + netfs_io_terminated_t term_func, + void *term_func_priv) +{ + struct cachefiles_kiocb *ki; + struct inode *inode; + struct file *file = cres->cache_priv2; + unsigned int old_nofs; + ssize_t ret = -ENOBUFS; + size_t len = iov_iter_count(iter); + + _enter("%pD,%li,%llx,%zx/%llx", + file, file_inode(file)->i_ino, start_pos, len, + i_size_read(file->f_inode)); + + ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL); + if (!ki) + goto presubmission_error; + + refcount_set(&ki->ki_refcnt, 2); + ki->iocb.ki_filp = file; + ki->iocb.ki_pos = start_pos; + ki->iocb.ki_flags = IOCB_DIRECT | IOCB_WRITE; + ki->iocb.ki_hint = ki_hint_validate(file_write_hint(file)); + ki->iocb.ki_ioprio = get_current_ioprio(); + ki->start = start_pos; + ki->len = len; + ki->term_func = term_func; + ki->term_func_priv = term_func_priv; + ki->was_async = true; + + if (ki->term_func) + ki->iocb.ki_complete = cachefiles_write_complete; + + /* Open-code file_start_write here to grab freeze protection, which + * will be released by another thread in aio_complete_rw(). Fool + * lockdep by telling it the lock got released so that it doesn't + * complain about the held lock when we return to userspace. 
+ */ + inode = file_inode(file); + __sb_start_write(inode->i_sb, SB_FREEZE_WRITE); + __sb_writers_release(inode->i_sb, SB_FREEZE_WRITE); + + get_file(ki->iocb.ki_filp); + + old_nofs = memalloc_nofs_save(); + ret = vfs_iocb_iter_write(file, &ki->iocb, iter); + memalloc_nofs_restore(old_nofs); + switch (ret) { + case -EIOCBQUEUED: + goto in_progress; + + case -ERESTARTSYS: + case -ERESTARTNOINTR: + case -ERESTARTNOHAND: + case -ERESTART_RESTARTBLOCK: + /* There's no easy way to restart the syscall since other AIO's + * may be already running. Just fail this IO with EINTR. + */ + |