Diffstat (limited to 'fs')
911 files changed, 53868 insertions, 29377 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c
index 32619d146cbc..1286d96a29bc 100644
--- a/fs/9p/vfs_addr.c
+++ b/fs/9p/vfs_addr.c
@@ -164,4 +164,5 @@ const struct address_space_operations v9fs_addr_operations = {
 	.invalidate_folio = netfs_invalidate_folio,
 	.direct_IO = noop_direct_IO,
 	.writepages = netfs_writepages,
+	.migrate_folio = filemap_migrate_folio,
 };
diff --git a/fs/9p/vfs_inode.c b/fs/9p/vfs_inode.c
index 3e68521f4e2f..399d455d50d6 100644
--- a/fs/9p/vfs_inode.c
+++ b/fs/9p/vfs_inode.c
@@ -669,8 +669,8 @@ v9fs_vfs_create(struct mnt_idmap *idmap, struct inode *dir,
  *
  */
-static int v9fs_vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
-			  struct dentry *dentry, umode_t mode)
+static struct dentry *v9fs_vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+				     struct dentry *dentry, umode_t mode)
 {
 	int err;
 	u32 perm;
@@ -692,8 +692,7 @@ static int v9fs_vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 	if (fid)
 		p9_fid_put(fid);
-
-	return err;
+	return ERR_PTR(err);
 }
 
 /**
diff --git a/fs/9p/vfs_inode_dotl.c b/fs/9p/vfs_inode_dotl.c
index 143ac03b7425..5b5fda617b80 100644
--- a/fs/9p/vfs_inode_dotl.c
+++ b/fs/9p/vfs_inode_dotl.c
@@ -350,9 +350,9 @@ out:
  *
  */
-static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
-			       struct inode *dir, struct dentry *dentry,
-			       umode_t omode)
+static struct dentry *v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
+					   struct inode *dir, struct dentry *dentry,
+					   umode_t omode)
 {
 	int err;
 	struct v9fs_session_info *v9ses;
@@ -407,8 +407,8 @@ static int v9fs_vfs_mkdir_dotl(struct mnt_idmap *idmap,
 			 err);
 		goto error;
 	}
-	v9fs_fid_add(dentry, &fid);
 	v9fs_set_create_acl(inode, fid, dacl, pacl);
+	v9fs_fid_add(dentry, &fid);
 	d_instantiate(dentry, inode);
 	err = 0;
 	inc_nlink(dir);
@@ -417,7 +417,7 @@ error:
 	p9_fid_put(fid);
 	v9fs_put_acl(dacl, pacl);
 	p9_fid_put(dfid);
-	return err;
+	return ERR_PTR(err);
 }
 
 static int
diff --git a/fs/Kconfig b/fs/Kconfig
index 64d420e3c475..44b6cdd36dc1 100644
--- a/fs/Kconfig
+++ b/fs/Kconfig
@@ -286,6 +286,7 @@ config HUGETLB_PAGE_OPTIMIZE_VMEMMAP
 	def_bool HUGETLB_PAGE
 	depends on ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP
 	depends on SPARSEMEM_VMEMMAP
+	select SPARSEMEM_VMEMMAP_PREINIT if ARCH_WANT_HUGETLB_VMEMMAP_PREINIT
 
 config HUGETLB_PMD_PAGE_TABLE_SHARING
 	def_bool HUGETLB_PAGE
@@ -334,9 +335,9 @@ source "fs/omfs/Kconfig"
 source "fs/hpfs/Kconfig"
 source "fs/qnx4/Kconfig"
 source "fs/qnx6/Kconfig"
+source "fs/resctrl/Kconfig"
 source "fs/romfs/Kconfig"
 source "fs/pstore/Kconfig"
-source "fs/sysv/Kconfig"
 source "fs/ufs/Kconfig"
 source "fs/erofs/Kconfig"
 source "fs/vboxsf/Kconfig"
@@ -368,6 +369,7 @@ config GRACE_PERIOD
 config LOCKD
 	tristate
 	depends on FILE_LOCKING
+	select CRC32
 	select GRACE_PERIOD
 
 config LOCKD_V4
diff --git a/fs/Makefile b/fs/Makefile
index 15df0a923d3a..79c08b914c47 100644
--- a/fs/Makefile
+++ b/fs/Makefile
@@ -87,7 +87,6 @@ obj-$(CONFIG_NFSD) += nfsd/
 obj-$(CONFIG_LOCKD) += lockd/
 obj-$(CONFIG_NLS) += nls/
 obj-y += unicode/
-obj-$(CONFIG_SYSV_FS) += sysv/
 obj-$(CONFIG_SMBFS) += smb/
 obj-$(CONFIG_HPFS_FS) += hpfs/
 obj-$(CONFIG_NTFS3_FS) += ntfs3/
@@ -129,3 +128,4 @@ obj-$(CONFIG_EROFS_FS) += erofs/
 obj-$(CONFIG_VBOXSF_FS) += vboxsf/
 obj-$(CONFIG_ZONEFS_FS) += zonefs/
 obj-$(CONFIG_BPF_LSM) += bpf_fs_kfuncs.o
+obj-$(CONFIG_RESCTRL_FS) += resctrl/
diff --git a/fs/affs/affs.h b/fs/affs/affs.h
index e8c2c4535cb3..ac4e9a02910b 100644
--- a/fs/affs/affs.h
+++ b/fs/affs/affs.h
@@ -168,7 +168,7 @@ extern struct dentry *affs_lookup(struct inode *dir, struct dentry *dentry, unsi
 extern int	affs_unlink(struct inode *dir, struct dentry *dentry);
 extern int	affs_create(struct mnt_idmap *idmap, struct inode *dir,
 			struct dentry *dentry, umode_t mode, bool);
-extern int	affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+extern struct dentry *affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 			struct dentry *dentry, umode_t mode);
 extern int	affs_rmdir(struct inode *dir, struct dentry *dentry);
 extern int	affs_link(struct dentry *olddentry, struct inode *dir,
diff --git a/fs/affs/file.c b/fs/affs/file.c
index a5a861dd5223..7a71018e3f67 100644
--- a/fs/affs/file.c
+++ b/fs/affs/file.c
@@ -596,7 +596,7 @@ affs_extent_file_ofs(struct inode *inode, u32 newsize)
 		BUG_ON(tmp > bsize);
 		AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA);
 		AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino);
-		AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx);
+		AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx + 1);
 		AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp);
 		affs_fix_checksum(sb, bh);
 		bh->b_state &= ~(1UL << BH_New);
@@ -724,7 +724,8 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
 		tmp = min(bsize - boff, to - from);
 		BUG_ON(boff + tmp > bsize || tmp > bsize);
 		memcpy(AFFS_DATA(bh) + boff, data + from, tmp);
-		be32_add_cpu(&AFFS_DATA_HEAD(bh)->size, tmp);
+		AFFS_DATA_HEAD(bh)->size = cpu_to_be32(
+			max(boff + tmp, be32_to_cpu(AFFS_DATA_HEAD(bh)->size)));
 		affs_fix_checksum(sb, bh);
 		mark_buffer_dirty_inode(bh, inode);
 		written += tmp;
@@ -746,7 +747,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
 		if (buffer_new(bh)) {
 			AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA);
 			AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino);
-			AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx);
+			AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx + 1);
 			AFFS_DATA_HEAD(bh)->size = cpu_to_be32(bsize);
 			AFFS_DATA_HEAD(bh)->next = 0;
 			bh->b_state &= ~(1UL << BH_New);
@@ -780,7 +781,7 @@ static int affs_write_end_ofs(struct file *file, struct address_space *mapping,
 		if (buffer_new(bh)) {
 			AFFS_DATA_HEAD(bh)->ptype = cpu_to_be32(T_DATA);
 			AFFS_DATA_HEAD(bh)->key = cpu_to_be32(inode->i_ino);
-			AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx);
+			AFFS_DATA_HEAD(bh)->sequence = cpu_to_be32(bidx + 1);
 			AFFS_DATA_HEAD(bh)->size = cpu_to_be32(tmp);
 			AFFS_DATA_HEAD(bh)->next = 0;
 			bh->b_state &= ~(1UL << BH_New);
diff --git a/fs/affs/namei.c b/fs/affs/namei.c
index 8c154490a2d6..f883be50db12 100644
--- a/fs/affs/namei.c
+++ b/fs/affs/namei.c
@@ -273,7 +273,7 @@ affs_create(struct mnt_idmap *idmap, struct inode *dir,
 	return 0;
 }
 
-int
+struct dentry *
 affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 	   struct dentry *dentry, umode_t mode)
 {
@@ -285,7 +285,7 @@ affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 
 	inode = affs_new_inode(dir);
 	if (!inode)
-		return -ENOSPC;
+		return ERR_PTR(-ENOSPC);
 
 	inode->i_mode = S_IFDIR | mode;
 	affs_mode_to_prot(inode);
@@ -298,9 +298,9 @@ affs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 		clear_nlink(inode);
 		mark_inode_dirty(inode);
 		iput(inode);
-		return error;
+		return ERR_PTR(error);
 	}
-	return 0;
+	return NULL;
 }
 
 int
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index fc8ba9142f2f..682bd8ec2c10 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -5,6 +5,7 @@ config AFS_FS
 	select AF_RXRPC
 	select DNS_RESOLVER
 	select NETFS_SUPPORT
+	select CRYPTO_KRB5
 	help
 	  If you say Y here, you will get an experimental Andrew File System
 	  driver. It currently only supports unsecured read-only AFS access.
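[Editorial sketch, not part of the merge:] The 9p and affs hunks above (and the afs ones further down) all make the same conversion: an inode_operations ->mkdir that used to return an int now returns a struct dentry *, with success reported as NULL (keep the dentry the VFS passed in) and failure as an ERR_PTR()-encoded error. A minimal illustration of the new convention follows; examplefs_mkdir() and its helper examplefs_do_mkdir() are hypothetical stand-ins, not code from this merge.

	/* Hedged sketch of the new ->mkdir return convention. */
	static struct dentry *examplefs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
					      struct dentry *dentry, umode_t mode)
	{
		int err = examplefs_do_mkdir(dir, dentry, mode); /* hypothetical helper */

		if (err)
			return ERR_PTR(err);	/* failure: encode the error in the pointer */
		return NULL;			/* success: keep using the dentry passed in */
	}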
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 5efd7e13b304..b49b8fe682f3 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -8,6 +8,7 @@ kafs-y := \
 	addr_prefs.o \
 	callback.o \
 	cell.o \
+	cm_security.o \
 	cmservice.o \
 	dir.o \
 	dir_edit.o \
diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
index 6d42f85c6be5..e941da5b6dd9 100644
--- a/fs/afs/addr_list.c
+++ b/fs/afs/addr_list.c
@@ -362,3 +362,53 @@ int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *alist,
 	alist->nr_addrs++;
 	return 0;
 }
+
+/*
+ * Set the app data on the rxrpc peers an address list points to
+ */
+void afs_set_peer_appdata(struct afs_server *server,
+			  struct afs_addr_list *old_alist,
+			  struct afs_addr_list *new_alist)
+{
+	unsigned long data = (unsigned long)server;
+	int n = 0, o = 0;
+
+	if (!old_alist) {
+		/* New server.  Just set all. */
+		for (; n < new_alist->nr_addrs; n++)
+			rxrpc_kernel_set_peer_data(new_alist->addrs[n].peer, data);
+		return;
+	}
+	if (!new_alist) {
+		/* Dead server.  Just remove all. */
+		for (; o < old_alist->nr_addrs; o++)
+			rxrpc_kernel_set_peer_data(old_alist->addrs[o].peer, 0);
+		return;
+	}
+
+	/* Walk through the two lists simultaneously, setting new peers and
+	 * clearing old ones.  The two lists are ordered by pointer to peer
+	 * record.
+	 */
+	while (n < new_alist->nr_addrs && o < old_alist->nr_addrs) {
+		struct rxrpc_peer *pn = new_alist->addrs[n].peer;
+		struct rxrpc_peer *po = old_alist->addrs[o].peer;
+
+		if (pn == po)
+			continue;
+		if (pn < po) {
+			rxrpc_kernel_set_peer_data(pn, data);
+			n++;
+		} else {
+			rxrpc_kernel_set_peer_data(po, 0);
+			o++;
+		}
+	}
+
+	if (n < new_alist->nr_addrs)
+		for (; n < new_alist->nr_addrs; n++)
+			rxrpc_kernel_set_peer_data(new_alist->addrs[n].peer, data);
+	if (o < old_alist->nr_addrs)
+		for (; o < old_alist->nr_addrs; o++)
+			rxrpc_kernel_set_peer_data(old_alist->addrs[o].peer, 0);
+}
diff --git a/fs/afs/cell.c b/fs/afs/cell.c
index cee42646736c..0168bbf53fe0 100644
--- a/fs/afs/cell.c
+++ b/fs/afs/cell.c
@@ -20,8 +20,9 @@ static unsigned __read_mostly afs_cell_min_ttl = 10 * 60;
 static unsigned __read_mostly afs_cell_max_ttl = 24 * 60 * 60;
 static atomic_t cell_debug_id;
 
-static void afs_queue_cell_manager(struct afs_net *);
-static void afs_manage_cell_work(struct work_struct *);
+static void afs_cell_timer(struct timer_list *timer);
+static void afs_destroy_cell_work(struct work_struct *work);
+static void afs_manage_cell_work(struct work_struct *work);
 
 static void afs_dec_cells_outstanding(struct afs_net *net)
 {
@@ -29,19 +30,11 @@ static void afs_dec_cells_outstanding(struct afs_net *net)
 	wake_up_var(&net->cells_outstanding);
 }
 
-/*
- * Set the cell timer to fire after a given delay, assuming it's not already
- * set for an earlier time.
- */
-static void afs_set_cell_timer(struct afs_net *net, time64_t delay)
+static void afs_set_cell_state(struct afs_cell *cell, enum afs_cell_state state)
 {
-	if (net->live) {
-		atomic_inc(&net->cells_outstanding);
-		if (timer_reduce(&net->cells_timer, jiffies + delay * HZ))
-			afs_dec_cells_outstanding(net);
-	} else {
-		afs_queue_cell_manager(net);
-	}
+	smp_store_release(&cell->state, state); /* Commit cell changes before state */
+	smp_wmb(); /* Set cell state before task state */
+	wake_up_var(&cell->state);
 }
 
 /*
@@ -64,7 +57,8 @@ static struct afs_cell *afs_find_cell_locked(struct afs_net *net,
 		return ERR_PTR(-ENAMETOOLONG);
 
 	if (!name) {
-		cell = net->ws_cell;
+		cell = rcu_dereference_protected(net->ws_cell,
+						 lockdep_is_held(&net->cells_lock));
 		if (!cell)
 			return ERR_PTR(-EDESTADDRREQ);
 		goto found;
@@ -115,7 +109,7 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
 				       const char *name, unsigned int namelen,
 				       const char *addresses)
 {
-	struct afs_vlserver_list *vllist;
+	struct afs_vlserver_list *vllist = NULL;
 	struct afs_cell *cell;
 	int i, ret;
@@ -162,13 +156,15 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
 	cell->net = net;
 	refcount_set(&cell->ref, 1);
 	atomic_set(&cell->active, 0);
+	INIT_WORK(&cell->destroyer, afs_destroy_cell_work);
 	INIT_WORK(&cell->manager, afs_manage_cell_work);
+	timer_setup(&cell->management_timer, afs_cell_timer, 0);
 	init_rwsem(&cell->vs_lock);
 	cell->volumes = RB_ROOT;
 	INIT_HLIST_HEAD(&cell->proc_volumes);
 	seqlock_init(&cell->volume_lock);
 	cell->fs_servers = RB_ROOT;
-	seqlock_init(&cell->fs_lock);
+	init_rwsem(&cell->fs_lock);
 	rwlock_init(&cell->vl_servers_lock);
 	cell->flags = (1 << AFS_CELL_FL_CHECK_ALIAS);
 
@@ -203,7 +199,13 @@ static struct afs_cell *afs_alloc_cell(struct afs_net *net,
 	cell->dns_status = vllist->status;
 	smp_store_release(&cell->dns_lookup_count, 1); /* vs source/status */
 	atomic_inc(&net->cells_outstanding);
+	ret = idr_alloc_cyclic(&net->cells_dyn_ino, cell,
+			       2, INT_MAX / 2, GFP_KERNEL);
+	if (ret < 0)
+		goto error;
+	cell->dynroot_ino = ret;
 	cell->debug_id = atomic_inc_return(&cell_debug_id);
+	trace_afs_cell(cell->debug_id, 1, 0, afs_cell_trace_alloc);
 
 	_leave(" = %p", cell);
@@ -213,6 +215,7 @@ parse_failed:
 	if (ret == -EINVAL)
 		printk(KERN_ERR "kAFS: bad VL server IP address\n");
 error:
+	afs_put_vlserverlist(cell->net, vllist);
 	kfree(cell->name - 1);
 	kfree(cell);
 	_leave(" = %d", ret);
@@ -226,6 +229,7 @@ error:
 * @namesz: The strlen of the cell name.
 * @vllist: A colon/comma separated list of numeric IP addresses or NULL.
 * @excl: T if an error should be given if the cell name already exists.
+ * @trace: The reason to be logged if the lookup is successful.
 *
 * Look up a cell record by name and query the DNS for VL server addresses if
 * needed.  Note that that actual DNS query is punted off to the manager thread
@@ -234,7 +238,8 @@ error:
 */
 struct afs_cell *afs_lookup_cell(struct afs_net *net,
 				 const char *name, unsigned int namesz,
-				 const char *vllist, bool excl)
+				 const char *vllist, bool excl,
+				 enum afs_cell_trace trace)
 {
 	struct afs_cell *cell, *candidate, *cursor;
 	struct rb_node *parent, **pp;
@@ -244,7 +249,7 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
 	_enter("%s,%s", name, vllist);
 
 	if (!excl) {
-		cell = afs_find_cell(net, name, namesz, afs_cell_trace_use_lookup);
+		cell = afs_find_cell(net, name, namesz, trace);
 		if (!IS_ERR(cell))
 			goto wait_for_cell;
 	}
@@ -287,26 +292,28 @@ struct afs_cell *afs_lookup_cell(struct afs_net *net,
 
 	cell = candidate;
 	candidate = NULL;
-	atomic_set(&cell->active, 2);
-	trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), 2, afs_cell_trace_insert);
+	afs_use_cell(cell, trace);
 	rb_link_node_rcu(&cell->net_node, parent, pp);
 	rb_insert_color(&cell->net_node, &net->cells);
 	up_write(&net->cells_lock);
 
-	afs_queue_cell(cell, afs_cell_trace_get_queue_new);
+	afs_queue_cell(cell, afs_cell_trace_queue_new);
 
 wait_for_cell:
-	trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), atomic_read(&cell->active),
-		       afs_cell_trace_wait);
 	_debug("wait_for_cell");
-	wait_var_event(&cell->state,
-		       ({
-			       state = smp_load_acquire(&cell->state); /* vs error */
-			       state == AFS_CELL_ACTIVE || state == AFS_CELL_REMOVED;
-		       }));
+	state = smp_load_acquire(&cell->state); /* vs error */
+	if (state != AFS_CELL_ACTIVE &&
+	    state != AFS_CELL_DEAD) {
+		afs_see_cell(cell, afs_cell_trace_wait);
+		wait_var_event(&cell->state,
+			       ({
+				       state = smp_load_acquire(&cell->state); /* vs error */
+				       state == AFS_CELL_ACTIVE || state == AFS_CELL_DEAD;
+			       }));
+	}
 
 	/* Check the state obtained from the wait check. */
-	if (state == AFS_CELL_REMOVED) {
+	if (state == AFS_CELL_DEAD) {
 		ret = cell->error;
 		goto error;
 	}
@@ -320,7 +327,7 @@ cell_already_exists:
 	if (excl) {
 		ret = -EEXIST;
 	} else {
-		afs_use_cell(cursor, afs_cell_trace_use_lookup);
+		afs_use_cell(cursor, trace);
 		ret = 0;
 	}
 	up_write(&net->cells_lock);
@@ -330,7 +337,7 @@ cell_already_exists:
 		goto wait_for_cell;
 	goto error_noput;
 error:
-	afs_unuse_cell(net, cell, afs_cell_trace_unuse_lookup);
+	afs_unuse_cell(cell, afs_cell_trace_unuse_lookup_error);
 error_noput:
 	_leave(" = %d [error]", ret);
 	return ERR_PTR(ret);
@@ -375,8 +382,9 @@ int afs_cell_init(struct afs_net *net, const char *rootcell)
 	if (cp && cp < rootcell + len)
 		return -EINVAL;
 
-	/* allocate a cell record for the root cell */
-	new_root = afs_lookup_cell(net, rootcell, len, vllist, false);
+	/* allocate a cell record for the root/workstation cell */
+	new_root = afs_lookup_cell(net, rootcell, len, vllist, false,
+				   afs_cell_trace_use_lookup_ws);
 	if (IS_ERR(new_root)) {
 		_leave(" = %ld", PTR_ERR(new_root));
 		return PTR_ERR(new_root);
@@ -387,12 +395,11 @@ int afs_cell_init(struct afs_net *net, const char *rootcell)
 
 	/* install the new cell */
 	down_write(&net->cells_lock);
-	afs_see_cell(new_root, afs_cell_trace_see_ws);
-	old_root = net->ws_cell;
-	net->ws_cell = new_root;
+	old_root = rcu_replace_pointer(net->ws_cell, new_root,
+				       lockdep_is_held(&net->cells_lock));
 	up_write(&net->cells_lock);
 
-	afs_unuse_cell(net, old_root, afs_cell_trace_unuse_ws);
+	afs_unuse_cell(old_root, afs_cell_trace_unuse_ws);
 	_leave(" = 0");
 	return 0;
 }
@@ -510,8 +517,9 @@ static void afs_cell_destroy(struct rcu_head *rcu)
 	trace_afs_cell(cell->debug_id, r, atomic_read(&cell->active), afs_cell_trace_free);
 	afs_put_vlserverlist(net, rcu_access_pointer(cell->vl_servers));
-	afs_unuse_cell(net, cell->alias_of, afs_cell_trace_unuse_alias);
+	afs_unuse_cell(cell->alias_of, afs_cell_trace_unuse_alias);
 	key_put(cell->anonymous_key);
+	idr_remove(&net->cells_dyn_ino, cell->dynroot_ino);
 	kfree(cell->name - 1);
 	kfree(cell);
 
@@ -519,30 +527,14 @@ static void afs_cell_destroy(struct rcu_head *rcu)
 	_leave(" [destroyed]");
 }
 
-/*
- * Queue the cell manager.
- */
-static void afs_queue_cell_manager(struct afs_net *net)
-{
-	int outstanding = atomic_inc_return(&net->cells_outstanding);
-
-	_enter("%d", outstanding);
-
-	if (!queue_work(afs_wq, &net->cells_manager))
-		afs_dec_cells_outstanding(net);
-}
-
-/*
- * Cell management timer.  We have an increment on cells_outstanding that we
- * need to pass along to the work item.
- */
-void afs_cells_timer(struct timer_list *timer)
+static void afs_destroy_cell_work(struct work_struct *work)
 {
-	struct afs_net *net = container_of(timer, struct afs_net, cells_timer);
+	struct afs_cell *cell = container_of(work, struct afs_cell, destroyer);
 
-	_enter("");
-	if (!queue_work(afs_wq, &net->cells_manager))
-		afs_dec_cells_outstanding(net);
+	afs_see_cell(cell, afs_cell_trace_destroy);
+	timer_delete_sync(&cell->management_timer);
+	cancel_work_sync(&cell->manager);
+	call_rcu(&cell->rcu, afs_cell_destroy);
 }
 
 /*
@@ -574,7 +566,7 @@ void afs_put_cell(struct afs_cell *cell, enum afs_cell_trace reason)
 		if (zero) {
 			a = atomic_read(&cell->active);
 			WARN(a != 0, "Cell active count %u > 0\n", a);
-			call_rcu(&cell->rcu, afs_cell_destroy);
+			WARN_ON(!queue_work(afs_wq, &cell->destroyer));
 		}
 	}
 }
@@ -586,10 +578,9 @@ struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason)
 {
 	int r, a;
 
-	r = refcount_read(&cell->ref);
-	WARN_ON(r == 0);
+	__refcount_inc(&cell->ref, &r);
 	a = atomic_inc_return(&cell->active);
-	trace_afs_cell(cell->debug_id, r, a, reason);
+	trace_afs_cell(cell->debug_id, r + 1, a, reason);
 	return cell;
 }
 
@@ -597,10 +588,11 @@ struct afs_cell *afs_use_cell(struct afs_cell *cell, enum afs_cell_trace reason)
 * Record a cell becoming less active.  When the active counter reaches 1, it
 * is scheduled for destruction, but may get reactivated.
 */
-void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_trace reason)
+void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason)
 {
 	unsigned int debug_id;
 	time64_t now, expire_delay;
+	bool zero;
 	int r, a;
 
 	if (!cell)
@@ -615,13 +607,15 @@ void afs_unuse_cell(struct afs_net *net, struct afs_cell *cell, enum afs_cell_tr
 		expire_delay = afs_cell_gc_delay;
 
 	debug_id = cell->debug_id;
-	r = refcount_read(&cell->ref);
 	a = atomic_dec_return(&cell->active);
-	trace_afs_cell(debug_id, r, a, reason);
-	WARN_ON(a == 0);
-	if (a == 1)
+	if (!a)
 		/* 'cell' may now be garbage collected. */
-		afs_set_cell_timer(net, expire_delay);
+		afs_set_cell_timer(cell, expire_delay);
+
+	zero = __refcount_dec_and_test(&cell->ref, &r);
+	trace_afs_cell(debug_id, r - 1, a, reason);
+	if (zero)
+		WARN_ON(!queue_work(afs_wq, &cell->destroyer));
 }
 
 /*
@@ -641,9 +635,27 @@ void afs_see_cell(struct afs_cell *cell, enum afs_cell_trace reason)
 */
 void afs_queue_cell(struct afs_cell *cell, enum afs_cell_trace reason)
 {
-	afs_get_cell(cell, reason);
-	if (!queue_work(afs_wq, &cell->manager))
-		afs_put_cell(cell, afs_cell_trace_put_queue_fail);
+	queue_work(afs_wq, &cell->manager);
+}
+
+/*
+ * Cell-specific management timer.
+ */
+static void afs_cell_timer(struct timer_list *timer)
+{
+	struct afs_cell *cell = container_of(timer, struct afs_cell, management_timer);
+
+	afs_see_cell(cell, afs_cell_trace_see_mgmt_timer);
+	if (refcount_read(&cell->ref) > 0 && cell->net->live)
+		queue_work(afs_wq, &cell->manager);
+}
+
+/*
+ * Set/reduce the cell timer.
+ */
+void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs)
+{
+	timer_reduce(&cell->management_timer, jiffies + delay_secs * HZ);
 }
 
 /*
@@ -705,7 +717,6 @@ static int afs_activate_cell(struct afs_net *net, struct afs_cell *cell)
 	if (cell->proc_link.next)
 		cell->proc_link.next->pprev = &cell->proc_link.next;
 
-	afs_dynroot_mkdir(net, cell);
 	mutex_unlock(&net->proc_cells_lock);
 	return 0;
 }
@@ -722,241 +733,162 @@ static void afs_deactivate_cell(struct afs_net *net, struct afs_cell *cell)
 	mutex_lock(&net->proc_cells_lock);
 	if (!hlist_unhashed(&cell->proc_link))
 		hlist_del_rcu(&cell->proc_link);
-	afs_dynroot_rmdir(net, cell);
 	mutex_unlock(&net->proc_cells_lock);
 
 	_leave("");
 }
 
+static bool afs_has_cell_expired(struct afs_cell *cell, time64_t *_next_manage)
+{
+	const struct afs_vlserver_list *vllist;
+	time64_t expire_at = cell->last_inactive;
+	time64_t now = ktime_get_real_seconds();
+
+	if (atomic_read(&cell->active))
+		return false;
+	if (!cell->net->live)
+		return true;
+
+	vllist = rcu_dereference_protected(cell->vl_servers, true);
+	if (vllist && vllist->nr_servers > 0)
+		expire_at += afs_cell_gc_delay;
+
+	if (expire_at <= now)
+		return true;
+	if (expire_at < *_next_manage)
+		*_next_manage = expire_at;
+	return false;
+}
+
 /*
 * Manage a cell record, initialising and destroying it, maintaining its DNS
 * records.
 */
-static void afs_manage_cell(struct afs_cell *cell)
+static bool afs_manage_cell(struct afs_cell *cell)
 {
 	struct afs_net *net = cell->net;
-	int ret, active;
+	time64_t next_manage = TIME64_MAX;
+	int ret;
 
 	_enter("%s", cell->name);
 
-again:
 	_debug("state %u", cell->state);
 	switch (cell->state) {
-	case AFS_CELL_INACTIVE:
-	case AFS_CELL_FAILED:
-		down_write(&net->cells_lock);
-		active = 1;
-		if (atomic_try_cmpxchg_relaxed(&cell->active, &active, 0)) {
-			rb_erase(&cell->net_node, &net->cells);
-			trace_afs_cell(cell->debug_id, refcount_read(&cell->ref), 0,
-				       afs_cell_trace_unuse_delete);
-			smp_store_release(&cell->state, AFS_CELL_REMOVED);
-		}
-		up_write(&net->cells_lock);
-		if (cell->state == AFS_CELL_REMOVED) {
-			wake_up_var(&cell->state);
-			goto final_destruction;
-		}
-		if (cell->state == AFS_CELL_FAILED)
-			goto done;
-		smp_store_release(&cell->state, AFS_CELL_UNSET);
-		wake_up_var(&cell->state);
-		goto again;
-
-	case AFS_CELL_UNSET:
-		smp_store_release(&cell->state, AFS_CELL_ACTIVATING);
-		wake_up_var(&cell->state);
-		goto again;
-
-	case AFS_CELL_ACTIVATING:
-		ret = afs_activate_cell(net, cell);
-		if (ret < 0)
-			goto activation_failed;
+	case AFS_CELL_SETTING_UP:
+		goto set_up_cell;
+	case AFS_CELL_ACTIVE:
+		goto cell_is_active;
+	case AFS_CELL_REMOVING:
+		WARN_ON_ONCE(1);
+		return false;
+	case AFS_CELL_DEAD:
+		return false;
+	default:
+		_debug("bad state %u", cell->state);
+		WARN_ON_ONCE(1); /* Unhandled state */
+		return false;
+	}
 
-		smp_store_release(&cell->state, AFS_CELL_ACTIVE);
-		wake_up_var(&cell->state);
-		goto again;
+set_up_cell:
+	ret = afs_activate_cell(net, cell);
+	if (ret < 0) {
+		cell->error = ret;
+		goto remove_cell;
+	}
 
-	case AFS_CELL_ACTIVE:
-		if (atomic_read(&cell->active) > 1) {
-			if (test_and_clear_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) {
-				ret = afs_update_cell(cell);
-				if (ret < 0)
-					cell->error = ret;
-			}
-			goto done;
-		}
-		smp_store_release(&cell->state, AFS_CELL_DEACTIVATING);
-		wake_up_var(&cell->state);
-		goto again;
+	afs_set_cell_state(cell, AFS_CELL_ACTIVE);
+
+cell_is_active:
+	if (afs_has_cell_expired(cell, &next_manage))
+		goto remove_cell;
 
-	case AFS_CELL_DEACTIVATING:
-		if (atomic_read(&cell->active) > 1)
-			goto reverse_deactivation;
-		afs_deactivate_cell(net, cell);
-		smp_store_release(&cell->state, AFS_CELL_INACTIVE);
-		wake_up_var(&cell->state);
-		goto again;
+	if (test_and_clear_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags)) {
+		ret = afs_update_cell(cell);
+		if (ret < 0)
+			cell->error = ret;
+	}
 
-	case AFS_CELL_REMOVED:
-		goto done;
+	if (next_manage < TIME64_MAX && cell->net->live) {
+		time64_t now = ktime_get_real_seconds();
 
-	default:
-		break;
+		if (next_manage - now <= 0)
+			afs_queue_cell(cell, afs_cell_trace_queue_again);
+		else
+			afs_set_cell_timer(cell, next_manage - now);
 	}
-	_debug("bad state %u", cell->state);
-	BUG(); /* Unhandled state */
+	_leave(" [done %u]", cell->state);
+	return false;
 
-activation_failed:
-	cell->error = ret;
-	afs_deactivate_cell(net, cell);
+remove_cell:
+	down_write(&net->cells_lock);
 
-	smp_store_release(&cell->state, AFS_CELL_FAILED); /* vs error */
-	wake_up_var(&cell->state);
-	goto again;
+	if (atomic_read(&cell->active)) {
+		up_write(&net->cells_lock);
+		goto cell_is_active;
+	}
 
-reverse_deactivation:
-	smp_store_release(&cell->state, AFS_CELL_ACTIVE);
-	wake_up_var(&cell->state);
-	_leave(" [deact->act]");
-	return;
+	/* Make sure that the expiring server records are going to see the fact
+	 * that the cell is caput.
+	 */
+	afs_set_cell_state(cell, AFS_CELL_REMOVING);
 
-done:
-	_leave(" [done %u]", cell->state);
-	return;
+	afs_deactivate_cell(net, cell);
+	afs_purge_servers(cell);
+
+	rb_erase(&cell->net_node, &net->cells);
+	afs_see_cell(cell, afs_cell_trace_unuse_delete);
+	up_write(&net->cells_lock);
 
-final_destruction:
 	/* The root volume is pinning the cell */
 	afs_put_volume(cell->root_volume, afs_volume_trace_put_cell_root);
 	cell->root_volume = NULL;
-	afs_put_cell(cell, afs_cell_trace_put_destroy);
+
+	afs_set_cell_state(cell, AFS_CELL_DEAD);
+	return true;
 }
 
 static void afs_manage_cell_work(struct work_struct *work)
 {
 	struct afs_cell *cell = container_of(work, struct afs_cell, manager);
+	bool final_put;
 
-	afs_manage_cell(cell);
-	afs_put_cell(cell, afs_cell_trace_put_queue_work);
+	afs_see_cell(cell, afs_cell_trace_manage);
+	final_put = afs_manage_cell(cell);
+	afs_see_cell(cell, afs_cell_trace_managed);
+	if (final_put)
+		afs_put_cell(cell, afs_cell_trace_put_final);
 }
 
 /*
- * Manage the records of cells known to a network namespace.  This includes
- * updating the DNS records and garbage collecting unused cells that were
- * automatically added.
- *
- * Note that constructed cell records may only be removed from net->cells by
- * this work item, so it is safe for this work item to stash a cursor pointing
- * into the tree and then return to caller (provided it skips cells that are
- * still under construction).
- *
- * Note also that we were given an increment on net->cells_outstanding by
- * whoever queued us that we need to deal with before returning.
+ * Purge in-memory cell database.
 */
-void afs_manage_cells(struct work_struct *work)
+void afs_cell_purge(struct afs_net *net)
 {
-	struct afs_net *net = container_of(work, struct afs_net, cells_manager);
+	struct afs_cell *ws;
 	struct rb_node *cursor;
-	time64_t now = ktime_get_real_seconds(), next_manage = TIME64_MAX;
-	bool purging = !net->live;
 
 	_enter("");
 
-	/* Trawl the cell database looking for cells that have expired from
-	 * lack of use and cells whose DNS results have expired and dispatch
-	 * their managers.
-	 */
-	down_read(&net->cells_lock);
+	down_write(&net->cells_lock);
+	ws = rcu_replace_pointer(net->ws_cell, NULL,
+				 lockdep_is_held(&net->cells_lock));
+	up_write(&net->cells_lock);
+	afs_unuse_cell(ws, afs_cell_trace_unuse_ws);
 
+	_debug("kick cells");
+	down_read(&net->cells_lock);
 	for (cursor = rb_first(&net->cells); cursor; cursor = rb_next(cursor)) {
-		struct afs_cell *cell =
-			rb_entry(cursor, struct afs_cell, net_node);
-		unsigned active;
-		bool sched_cell = false;
-
-		active = atomic_read(&cell->active);
-		trace_afs_cell(cell->debug_id, refcount_read(&cell->ref),
-			       active, afs_cell_trace_manage);
-
-		ASSERTCMP(active, >=, 1);
-
-		if (purging) {
-			if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags)) {
-				active = atomic_dec_return(&cell->active);
-				trace_afs_cell(cell->debug_id, refcount_read(&cell->ref),
-					       active, afs_cell_trace_unuse_pin);
-			}
-		}
+		struct afs_cell *cell = rb_entry(cursor, struct afs_cell, net_node);
 
-		if (active == 1) {
-			struct afs_vlserver_list *vllist;
-			time64_t expire_at = cell->last_inactive;
-
-			read_lock(&cell->vl_servers_lock);
-			vllist = rcu_dereference_protected(
-				cell->vl_servers,
-				lockdep_is_held(&cell->vl_servers_lock));
-			if (vllist->nr_servers > 0)
-				expire_at += afs_cell_gc_delay;
-			read_unlock(&cell->vl_servers_lock);
-			if (purging || expire_at <= now)
-				sched_cell = true;
-			else if (expire_at < next_manage)
-				next_manage = expire_at;
-		}
+		afs_see_cell(cell, afs_cell_trace_purge);
 
-		if (!purging) {
-			if (test_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags))
-				sched_cell = true;
-		}
+		if (test_and_clear_bit(AFS_CELL_FL_NO_GC, &cell->flags))
+			afs_unuse_cell(cell, afs_cell_trace_unuse_pin);
 
-		if (sched_cell)
-			afs_queue_cell(cell, afs_cell_trace_get_queue_manage);
+		afs_queue_cell(cell, afs_cell_trace_queue_purge);
 	}
-
 	up_read(&net->cells_lock);
 
-	/* Update the timer on the way out.  We have to pass an increment on
-	 * cells_outstanding in the namespace that we are in to the timer or
-	 * the work scheduler.
-	 */
-	if (!purging && next_manage < TIME64_MAX) {
-		now = ktime_get_real_seconds();
-
-		if (next_manage - now <= 0) {
-			if (queue_work(afs_wq, &net->cells_manager))
-				atomic_inc(&net->cells_outstanding);
-		} else {
-			afs_set_cell_timer(net, next_manage - now);
-		}
-	}
-
-	afs_dec_cells_outstanding(net);
-	_leave(" [%d]", atomic_read(&net->cells_outstanding));
-}
-
-/*
- * Purge in-memory cell database.
- */
-void afs_cell_purge(struct afs_net *net)
-{
-	struct afs_cell *ws;
-
-	_enter("");
-
-	down_write(&net->cells_lock);
-	ws = net->ws_cell;
-	net->ws_cell = NULL;
-	up_write(&net->cells_lock);
-	afs_unuse_cell(net, ws, afs_cell_trace_unuse_ws);
-
-	_debug("del timer");
-	if (del_timer_sync(&net->cells_timer))
-		atomic_dec(&net->cells_outstanding);
-
-	_debug("kick mgr");
-	afs_queue_cell_manager(net);
-
 	_debug("wait");
 	wait_var_event(&net->cells_outstanding,
 		       !atomic_read(&net->cells_outstanding));
diff --git a/fs/afs/cm_security.c b/fs/afs/cm_security.c
new file mode 100644
index 000000000000..edcbd249d202
--- /dev/null
+++ b/fs/afs/cm_security.c
@@ -0,0 +1,340 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Cache manager security.
+ *
+ * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/slab.h>
+#include <crypto/krb5.h>
+#include "internal.h"
+#include "afs_cm.h"
+#include "afs_fs.h"
+#include "protocol_yfs.h"
+#define RXRPC_TRACE_ONLY_DEFINE_ENUMS
+#include <trace/events/rxrpc.h>
+
+#define RXGK_SERVER_ENC_TOKEN 1036U // 0x40c
+#define xdr_round_up(x) (round_up((x), sizeof(__be32)))
+#define xdr_len_object(x) (4 + round_up((x), sizeof(__be32)))
+
+#ifdef CONFIG_RXGK
+static int afs_create_yfs_cm_token(struct sk_buff *challenge,
+				   struct afs_server *server);
+#endif
+
+/*
+ * Respond to an RxGK challenge, adding appdata.
+ */
+static int afs_respond_to_challenge(struct sk_buff *challenge)
+{
+#ifdef CONFIG_RXGK
+	struct krb5_buffer appdata = {};
+	struct afs_server *server;
+#endif
+	struct rxrpc_peer *peer;
+	unsigned long peer_data;
+	u16 service_id;
+	u8 security_index;
+
+	rxrpc_kernel_query_challenge(challenge, &peer, &peer_data,
+				     &service_id, &security_index);
+
+	_enter("%u,%u", service_id, security_index);
+
+	switch (service_id) {
+		/* We don't send CM_SERVICE RPCs, so don't expect a challenge
+		 * therefrom.
+		 */
+	case FS_SERVICE:
+	case VL_SERVICE:
+	case YFS_FS_SERVICE:
+	case YFS_VL_SERVICE:
+		break;
+	default:
+		pr_warn("Can't respond to unknown challenge %u:%u",
+			service_id, security_index);
+		return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO,
+						     afs_abort_unsupported_sec_class);
+	}
+
+	switch (security_index) {
+#ifdef CONFIG_RXKAD
+	case RXRPC_SECURITY_RXKAD:
+		return rxkad_kernel_respond_to_challenge(challenge);
+#endif
+
+#ifdef CONFIG_RXGK
+	case RXRPC_SECURITY_RXGK:
+		return rxgk_kernel_respond_to_challenge(challenge, &appdata);
+
+	case RXRPC_SECURITY_YFS_RXGK:
+		switch (service_id) {
+		case FS_SERVICE:
+		case YFS_FS_SERVICE:
+			server = (struct afs_server *)peer_data;
+			if (!server->cm_rxgk_appdata.data) {
+				mutex_lock(&server->cm_token_lock);
+				if (!server->cm_rxgk_appdata.data)
+					afs_create_yfs_cm_token(challenge, server);
+				mutex_unlock(&server->cm_token_lock);
+			}
+			if (server->cm_rxgk_appdata.data)
+				appdata = server->cm_rxgk_appdata;
+			break;
+		}
+		return rxgk_kernel_respond_to_challenge(challenge, &appdata);
+#endif
+
+	default:
+		return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO,
+						     afs_abort_unsupported_sec_class);
+	}
+}
+
+/*
+ * Process the OOB message queue, processing challenge packets.
+ */
+void afs_process_oob_queue(struct work_struct *work)
+{
+	struct afs_net *net = container_of(work, struct afs_net, rx_oob_work);
+	struct sk_buff *oob;
+	enum rxrpc_oob_type type;
+
+	while ((oob = rxrpc_kernel_dequeue_oob(net->socket, &type))) {
+		switch (type) {
+		case RXRPC_OOB_CHALLENGE:
+			afs_respond_to_challenge(oob);
+			break;
+		}
+		rxrpc_kernel_free_oob(oob);
+	}
+}
+
+#ifdef CONFIG_RXGK
+/*
+ * Create a securities keyring for the cache manager and attach a key to it for
+ * the RxGK tokens we want to use to secure the callback connection back from
+ * the fileserver.
+ */
+int afs_create_token_key(struct afs_net *net, struct socket *socket)
+{
+	const struct krb5_enctype *krb5;
+	struct key *ring;
+	key_ref_t key;
+	char K0[32], *desc;
+	int ret;
+
+	ring = keyring_alloc("kafs",
+			     GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(),
+			     KEY_POS_SEARCH | KEY_POS_WRITE |
+			     KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH,
+			     KEY_ALLOC_NOT_IN_QUOTA,
+			     NULL, NULL);
+	if (IS_ERR(ring))
+		return PTR_ERR(ring);
+
+	ret = rxrpc_sock_set_security_keyring(socket->sk, ring);
+	if (ret < 0)
+		goto out;
+
+	ret = -ENOPKG;
+	krb5 = crypto_krb5_find_enctype(KRB5_ENCTYPE_AES128_CTS_HMAC_SHA1_96);
+	if (!krb5)
+		goto out;
+
+	if (WARN_ON_ONCE(krb5->key_len > sizeof(K0)))
+		goto out;
+
+	ret = -ENOMEM;
+	desc = kasprintf(GFP_KERNEL, "%u:%u:%u:%u",
+			 YFS_CM_SERVICE, RXRPC_SECURITY_YFS_RXGK, 1, krb5->etype);
+	if (!desc)
+		goto out;
+
+	wait_for_random_bytes();
+	get_random_bytes(K0, krb5->key_len);
+
+	key = key_create(make_key_ref(ring, true),
+			 "rxrpc_s", desc,
+			 K0, krb5->key_len,
+			 KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | KEY_USR_VIEW,
+			 KEY_ALLOC_NOT_IN_QUOTA);
+	kfree(desc);
+	if (IS_ERR(key)) {
+		ret = PTR_ERR(key);
+		goto out;
+	}
+
+	net->fs_cm_token_key = key_ref_to_ptr(key);
+	ret = 0;
+out:
+	key_put(ring);
+	return ret;
+}
+
+/*
+ * Create an YFS RxGK GSS token to use as a ticket to the specified fileserver.
+ */
+static int afs_create_yfs_cm_token(struct sk_buff *challenge,
+				   struct afs_server *server)
+{
+	const struct krb5_enctype *conn_krb5, *token_krb5;
+	const struct krb5_buffer *token_key;
+	struct crypto_aead *aead;
+	struct scatterlist sg;
+	struct afs_net *net = server->cell->net;
+	const struct key *key = net->fs_cm_token_key;
+	size_t keysize, uuidsize, authsize, toksize, encsize, contsize, adatasize, offset;
+	__be32 caps[1] = {
+		[0] = htonl(AFS_CAP_ERROR_TRANSLATION),
+	};
+	__be32 *xdr;
+	void *appdata, *K0, *encbase;
+	u32 enctype;
+	int ret;
+
+	if (!key)
+		return -ENOKEY;
+
+	/* Assume that the fileserver is happy to use the same encoding type as
+	 * we were told to use by the token obtained by the user.
+	 */
+	enctype = rxgk_kernel_query_challenge(challenge);
+
+	conn_krb5 = crypto_krb5_find_enctype(enctype);
+	if (!conn_krb5)
+		return -ENOPKG;
+	token_krb5 = key->payload.data[0];
+	token_key = (const struct krb5_buffer *)&key->payload.data[2];
+
+	/* struct rxgk_key {
+	 *	afs_uint32	enctype;
+	 *	opaque		key<>;
+	 * };
+	 */
+	keysize = 4 + xdr_len_object(conn_krb5->key_len);
+
+	/* struct RXGK_AuthName {
+	 *	afs_int32	kind;
+	 *	opaque		data<AUTHDATAMAX>;
+	 *	opaque		display<AUTHPRINTABLEMAX>;
+	 * };
+	 */
+	uuidsize = sizeof(server->uuid);
+	authsize = 4 + xdr_len_object(uuidsize) + xdr_len_object(0);
+
+	/* struct RXGK_Token {
+	 *	rxgk_key		K0;
+	 *	RXGK_Level		level;
+	 *	rxgkTime		starttime;
+	 *	afs_int32		lifetime;
+	 *	afs_int32		bytelife;
+	 *	rxgkTime		expirationtime;
+	 *	struct RXGK_AuthName	identities<>;
+	 * };
+	 */
+	toksize = keysize + 8 + 4 + 4 + 8 + xdr_len_object(authsize);
+
+	offset = 0;
+	encsize = crypto_krb5_how_much_buffer(token_krb5, KRB5_ENCRYPT_MODE, toksize, &offset);
+
+	/* struct RXGK_TokenContainer {
+	 *	afs_int32	kvno;
+	 *	afs_int32	enctype;
+	 *	opaque		encrypted_token<>;
+	 * };
+	 */
+	contsize = 4 + 4 + xdr_len_object(encsize);
+
+	/* struct YFSAppData {
+	 *	opr_uuid	initiatorUuid;
+	 *	opr_uuid	acceptorUuid;
+	 *	Capabilities	caps;
+	 *	afs_int32	enctype;
+	 *	opaque		callbackKey<>;
+	 *	opaque		callbackToken<>;
+	 * };
+	 */
+	adatasize = 16 + 16 +
+		xdr_len_object(sizeof(caps)) +
+		4 +
+		xdr_len_object(conn_krb5->key_len) +
+		xdr_len_object(contsize);
+
+	ret = -ENOMEM;
+	appdata = kzalloc(adatasize, GFP_KERNEL);
+	if (!appdata)
+		goto out;
+	xdr = appdata;
+
+	memcpy(xdr, &net->uuid, 16);		/* appdata.initiatorUuid */
+	xdr += 16 / 4;
+	memcpy(xdr, &server->uuid, 16);		/* appdata.acceptorUuid */
+	xdr += 16 / 4;
+	*xdr++ = htonl(ARRAY_SIZE(caps));	/* appdata.caps.len */
+	memcpy(xdr, &caps, sizeof(caps));	/* appdata.caps */
+	xdr += ARRAY_SIZE(caps);
+	*xdr++ = htonl(conn_krb5->etype);	/* appdata.enctype */
+
+	*xdr++ = htonl(conn_krb5->key_len);	/* appdata.callbackKey.len */
+	K0 = xdr;
+	get_random_bytes(K0, conn_krb5->key_len); /* appdata.callbackKey.data */
+	xdr += xdr_round_up(conn_krb5->key_len) / 4;
+
+	*xdr++ = htonl(contsize);		/* appdata.callbackToken.len */
+	*xdr++ = htonl(1);			/* cont.kvno */
+	*xdr++ = htonl(token_krb5->etype);	/* cont.enctype */
+	*xdr++ = htonl(encsize);		/* cont.encrypted_token.len */
+
+	encbase = xdr;
+	xdr += offset / 4;
+	*xdr++ = htonl(conn_krb5->etype);	/* token.K0.enctype */
+	*xdr++ = htonl(conn_krb5->key_len);	/* token.K0.key.len */
+	memcpy(xdr, K0, conn_krb5->key_len);	/* token.K0.key.data */
+	xdr += xdr_round_up(conn_krb5->key_len) / 4;
+
+	*xdr++ = htonl(RXRPC_SECURITY_ENCRYPT);	/* token.level */
+	*xdr++ = htonl(0);			/* token.starttime */
+	*xdr++ = htonl(0);			/* " */
+	*xdr++ = htonl(0);			/* token.lifetime */
+	*xdr++ = htonl(0);			/* token.bytelife */
+	*xdr++ = htonl(0);			/* token.expirationtime */
+	*xdr++ = htonl(0);			/* " */
+	*xdr++ = htonl(1);			/* token.identities.count */
+	*xdr++ = htonl(0);			/* token.identities[0].kind */
+	*xdr++ = htonl(uuidsize);		/* token.identities[0].data.len */
+	memcpy(xdr, &server->uuid, uuidsize);
+	xdr += xdr_round_up(uuidsize) / 4;
+	*xdr++ = htonl(0);			/* token.identities[0].display.len */
+
+	xdr = encbase + xdr_round_up(encsize);
+
+	if ((unsigned long)xdr - (unsigned long)appdata != adatasize)
+		pr_err("Appdata size incorrect %lx != %zx\n",
+		       (unsigned long)xdr - (unsigned long)appdata, adatasize);
+
+	aead = crypto_krb5_prepare_encryption(token_krb5, token_key, RXGK_SERVER_ENC_TOKEN,
+					      GFP_KERNEL);
+	if (IS_ERR(aead)) {
+		ret = PTR_ERR(aead);
+		goto out_token;
+	}
+
+	sg_init_one(&sg, encbase, encsize);
+	ret = crypto_krb5_encrypt(token_krb5, aead, &sg, 1, encsize, offset, toksize, false);
+	if (ret < 0)
+		goto out_aead;
+
+	server->cm_rxgk_appdata.len  = adatasize;
+	server->cm_rxgk_appdata.data = appdata;
+	appdata = NULL;
+
+out_aead:
+	crypto_free_aead(aead);
+out_token:
+	kfree(appdata);
+out:
+	return ret;
+}
+#endif /* CONFIG_RXGK */
diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
index 99a3f20bc786..1a906805a9e3 100644
--- a/fs/afs/cmservice.c
+++ b/fs/afs/cmservice.c
@@ -139,49 +139,6 @@ bool afs_cm_incoming_call(struct afs_call *call)
 }
 
 /*
- * Find the server record by peer address and record a probe to the cache
- * manager from a server.
- */
-static int afs_find_cm_server_by_peer(struct afs_call *call)
-{
-	struct sockaddr_rxrpc srx;
-	struct afs_server *server;
-	struct rxrpc_peer *peer;
-
-	peer = rxrpc_kernel_get_call_peer(call->net->socket, call->rxcall);
-
-	server = afs_find_server(call->net, peer);
-	if (!server) {
-		trace_afs_cm_no_server(call, &srx);
-		return 0;
-	}
-
-	call->server = server;
-	return 0;
-}
-
-/*
- * Find the server record by server UUID and record a probe to the cache
- * manager from a server.
- */
-static int afs_find_cm_server_by_uuid(struct afs_call *call,
-				      struct afs_uuid *uuid)
-{
-	struct afs_server *server;
-
-	rcu_read_lock();
-	server = afs_find_server_by_uuid(call->net, call->request);
-	rcu_read_unlock();
-	if (!server) {
-		trace_afs_cm_no_server_u(call, call->request);
-		return 0;
-	}
-
-	call->server = server;
-	return 0;
-}
-
-/*
 * Clean up a cache manager call.
 */
 static void afs_cm_destructor(struct afs_call *call)
@@ -322,10 +279,7 @@ static int afs_deliver_cb_callback(struct afs_call *call)
 
 	if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
 		return afs_io_error(call, afs_io_error_cm_reply);
-
-	/* we'll need the file server record as that tells us which set of
-	 * vnodes to operate upon */
-	return afs_find_cm_server_by_peer(call);
+	return 0;
 }
 
 /*
@@ -349,18 +303,10 @@ static void SRXAFSCB_InitCallBackState(struct work_struct *work)
 */
 static int afs_deliver_cb_init_call_back_state(struct afs_call *call)
 {
-	int ret;
-
 	_enter("");
 
 	afs_extract_discard(call, 0);
-	ret = afs_extract_data(call, false);
-	if (ret < 0)
-		return ret;
-
-	/* we'll need the file server record as that tells us which set of
-	 * vnodes to operate upon */
-	return afs_find_cm_server_by_peer(call);
+	return afs_extract_data(call, false);
 }
 
 /*
@@ -373,8 +319,6 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
 	__be32 *b;
 	int ret;
 
-	_enter("");
-
 	_enter("{%u}", call->unmarshall);
 
 	switch (call->unmarshall) {
@@ -421,9 +365,13 @@ static int afs_deliver_cb_init_call_back_state3(struct afs_call *call)
 	if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
 		return afs_io_error(call, afs_io_error_cm_reply);
 
-	/* we'll need the file server record as that tells us which set of
-	 * vnodes to operate upon */
-	return afs_find_cm_server_by_uuid(call, call->request);
+	if (memcmp(call->request, &call->server->_uuid, sizeof(call->server->_uuid)) != 0) {
+		pr_notice("Callback UUID does not match fileserver UUID\n");
+		trace_afs_cm_no_server_u(call, call->request);
+		return 0;
+	}
+
+	return 0;
 }
 
 /*
@@ -455,7 +403,7 @@ static int afs_deliver_cb_probe(struct afs_call *call)
 	if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
 		return afs_io_error(call, afs_io_error_cm_reply);
 
-	return afs_find_cm_server_by_peer(call);
+	return 0;
 }
 
 /*
@@ -533,7 +481,7 @@ static int afs_deliver_cb_probe_uuid(struct afs_call *call)
 	if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
 		return afs_io_error(call, afs_io_error_cm_reply);
 
-	return afs_find_cm_server_by_peer(call);
+	return 0;
 }
 
 /*
@@ -593,7 +541,7 @@ static int afs_deliver_cb_tell_me_about_yourself(struct afs_call *call)
 	if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
 		return afs_io_error(call, afs_io_error_cm_reply);
 
-	return afs_find_cm_server_by_peer(call);
+	return 0;
 }
 
 /*
@@ -667,9 +615,5 @@ static int afs_deliver_yfs_cb_callback(struct afs_call *call)
 	if (!afs_check_call_state(call, AFS_CALL_SV_REPLYING))
 		return afs_io_error(call, afs_io_error_cm_reply);
-
-	/* We'll need the file server record as that tells us which set of
-	 * vnodes to operate upon.
-	 */
-	return afs_find_cm_server_by_peer(call);
+	return 0;
 }
diff --git a/fs/afs/dir.c b/fs/afs/dir.c
index 02cbf38e1a77..bfb69e066672 100644
--- a/fs/afs/dir.c
+++ b/fs/afs/dir.c
@@ -33,8 +33,8 @@ static bool afs_lookup_filldir(struct dir_context *ctx, const char *name, int nl
 			      loff_t fpos, u64 ino, unsigned dtype);
 static int afs_create(struct mnt_idmap *idmap, struct inode *dir,
 		      struct dentry *dentry, umode_t mode, bool excl);
-static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
-		     struct dentry *dentry, umode_t mode);
+static struct dentry *afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+				struct dentry *dentry, umode_t mode);
 static int afs_rmdir(struct inode *dir, struct dentry *dentry);
 static int afs_unlink(struct inode *dir, struct dentry *dentry);
 static int afs_link(struct dentry *from, struct inode *dir,
@@ -943,7 +943,7 @@ static struct dentry *afs_lookup_atsys(struct inode *dir, struct dentry *dentry)
 		}
 
 		strcpy(p, name);
-		ret = lookup_one_len(buf, dentry->d_parent, len);
+		ret = lookup_noperm(&QSTR(buf), dentry->d_parent);
 		if (IS_ERR(ret) || d_is_positive(ret))
 			goto out_s;
 		dput(ret);
@@ -1004,9 +1004,8 @@ static struct dentry *afs_lookup(struct inode *dir, struct dentry *dentry,
 	afs_stat_v(dvnode, n_lookup);
 	inode = afs_do_lookup(dir, dentry);
 	if (inode == ERR_PTR(-ENOENT))
-		inode = afs_try_auto_mntpt(dentry, dir);
-
-	if (!IS_ERR_OR_NULL(inode))
+		inode = NULL;
+	else if (!IS_ERR_OR_NULL(inode))
 		fid = AFS_FS_I(inode)->fid;
 
 	_debug("splice %p", dentry->d_inode);
@@ -1315,8 +1314,8 @@ static const struct afs_operation_ops afs_mkdir_operation = {
 /*
 * create a directory on an AFS filesystem
 */
-static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
-		     struct dentry *dentry, umode_t mode)
+static struct dentry *afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+				struct dentry *dentry, umode_t mode)
 {
 	struct afs_operation *op;
 	struct afs_vnode *dvnode = AFS_FS_I(dir);
@@ -1328,7 +1327,7 @@ static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 	op = afs_alloc_operation(NULL, dvnode->volume);
 	if (IS_ERR(op)) {
 		d_drop(dentry);
-		return PTR_ERR(op);
+		return ERR_CAST(op);
 	}
 
 	fscache_use_cookie(afs_vnode_cache(dvnode), true);
@@ -1344,7 +1343,7 @@ static int afs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 	op->ops = &afs_mkdir_operation;
 	ret = afs_do_sync_operation(op);
 	afs_dir_unuse_cookie(dvnode, ret);
-	return ret;
+	return ERR_PTR(ret);
 }
 
 /*
diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
index a1e581946b93..0b80eb93fa40 100644
--- a/fs/afs/dir_silly.c
+++ b/fs/afs/dir_silly.c
@@ -113,16 +113,14 @@ int afs_sillyrename(struct afs_vnode *dvnode, struct afs_vnode *vnode,
 
 	sdentry = NULL;
 	do {
-		int slen;
-
 		dput(sdentry);
 		sillycounter++;
 
 		/* Create a silly name.  Note that the ".__afs" prefix is
 		 * understood by the salvager and must not be changed.
 		 */
-		slen = scnprintf(silly, sizeof(silly), ".__afs%04X", sillycounter);
-		sdentry = lookup_one_len(silly, dentry->d_parent, slen);
+		scnprintf(silly, sizeof(silly), ".__afs%04X", sillycounter);
+		sdentry = lookup_noperm(&QSTR(silly), dentry->d_parent);
 
 		/* N.B. Better to return EBUSY here ... it could be dangerous
 		 * to delete the file while it's in use.
diff --git a/fs/afs/dynroot.c b/fs/afs/dynroot.c
index d8bf52f77d93..8c6130789fde 100644
--- a/fs/afs/dynroot.c
+++ b/fs/afs/dynroot.c
@@ -10,16 +10,19 @@
 #include <linux/dns_resolver.h>
 #include "internal.h"
 
-static atomic_t afs_autocell_ino;
+#define AFS_MIN_DYNROOT_CELL_INO 4 /* Allow for ., .., @cell, .@cell */
+#define AFS_MAX_DYNROOT_CELL_INO ((unsigned int)INT_MAX)
+
+static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry, ino_t ino);
 
 /*
 * iget5() comparator for inode created by autocell operations
- *
- * These pseudo inodes don't match anything.
 */
 static int afs_iget5_pseudo_test(struct inode *inode, void *opaque)
 {
-	return 0;
+	struct afs_fid *fid = opaque;
+
+	return inode->i_ino == fid->vnode;
 }
 
 /*
@@ -39,28 +42,16 @@ static int afs_iget5_pseudo_set(struct inode *inode, void *opaque)
 }
 
 /*
- * Create an inode for a dynamic root directory or an autocell dynamic
- * automount dir.
+ * Create an inode for an autocell dynamic automount dir.
 */
-struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
+static struct inode *afs_iget_pseudo_dir(struct super_block *sb, ino_t ino)
 {
-	struct afs_super_info *as = AFS_FS_S(sb);
 	struct afs_vnode *vnode;
 	struct inode *inode;
-	struct afs_fid fid = {};
+	struct afs_fid fid = { .vnode = ino, .unique = 1, };
 
 	_enter("");
 
-	if (as->volume)
-		fid.vid = as->volume->vid;
-	if (root) {
-		fid.vnode = 1;
-		fid.unique = 1;
-	} else {
-		fid.vnode = atomic_inc_return(&afs_autocell_ino);
-		fid.unique = 0;
-	}
-
 	inode = iget5_locked(sb, fid.vnode,
 			     afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid);
 	if (!inode) {
@@ -73,115 +64,71 @@ struct inode *afs_iget_pseudo_dir(struct super_block *sb, bool root)
 
 	vnode = AFS_FS_I(inode);
 
-	/* there shouldn't be an existing inode */
-	BUG_ON(!(inode->i_state & I_NEW));
-
-	netfs_inode_init(&vnode->netfs, NULL, false);
-	inode->i_size		= 0;
-	inode->i_mode		= S_IFDIR | S_IRUGO | S_IXUGO;
-	if (root) {
-		inode->i_op	= &afs_dynroot_inode_operations;
-		inode->i_fop	= &simple_dir_operations;
-	} else {
-		inode->i_op	= &afs_autocell_inode_operations;
-	}
-	set_nlink(inode, 2);
-	inode->i_uid		= GLOBAL_ROOT_UID;
-	inode->i_gid		= GLOBAL_ROOT_GID;
-	simple_inode_init_ts(inode);
-	inode->i_blocks		= 0;
-	inode->i_generation	= 0;
-
-	set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
-	if (!root) {
+	if (inode->i_state & I_NEW) {
+		netfs_inode_init(&vnode->netfs, NULL, false);
+		simple_inode_init_ts(inode);
+		set_nlink(inode, 2);
+		inode->i_size		= 0;
+		inode->i_mode		= S_IFDIR | 0555;
+		inode->i_op		= &afs_autocell_inode_operations;
+		inode->i_uid		= GLOBAL_ROOT_UID;
+		inode->i_gid		= GLOBAL_ROOT_GID;
+		inode->i_blocks		= 0;
+		inode->i_generation	= 0;
+		inode->i_flags		|= S_AUTOMOUNT | S_NOATIME;
+
+		set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags);
 		set_bit(AFS_VNODE_MOUNTPOINT, &vnode->flags);
-		inode->i_flags |= S_AUTOMOUNT;
-	}
 
-	inode->i_flags |= S_NOATIME;
-	unlock_new_inode(inode);
+		unlock_new_inode(inode);
+	}
 	_leave(" = %p", inode);
 	return inode;
 }
 
 /*
- * Probe to see if a cell may exist.  This prevents positive dentries from
- * being created unnecessarily.
+ * Try to automount the mountpoint with pseudo directory, if the autocell
+ * option is set.
 */
-static int afs_probe_cell_name(struct dentry *dentry)
+static struct dentry *afs_dynroot_lookup_cell(struct inode *dir, struct dentry *dentry,
+					      unsigned int flags)
 {
-	struct afs_cell *cell;
+	struct afs_cell *cell = NULL;
 	struct afs_net *net = afs_d2net(dentry);
+	struct inode *inode = NULL;
 	const char *name = dentry->d_name.name;
 	size_t len = dentry->d_name.len;
-	char *result = NULL;
-	int ret;
+	bool dotted = false;
+	int ret = -ENOENT;
 
 	/* Names prefixed with a dot are R/W mounts. */
 	if (name[0] == '.') {
-		if (len == 1)
-			return -EINVAL;
 		name++;
 		len--;
+		dotted = true;
 	}
 
-	cell = afs_find_cell(net, name, len, afs_cell_trace_use_probe);
-	if (!IS_ERR(cell)) {
-		afs_unuse_cell(net, cell, afs_cell_trace_unuse_probe);
-		return 0;
-	}
-
-	ret = dns_query(net->net, "afsdb", name, len, "srv=1",
-			&result, NULL, false);
-	if (ret == -ENODATA || ret == -ENOKEY || ret == 0)
-		ret = -ENOENT;
-	if (ret > 0 && ret >= sizeof(struct dns_server_list_v1_header)) {
-		struct dns_server_list_v1_header *v1 = (void *)result;
-
-		if (v1->hdr.zero == 0 &&
-		    v1->hdr.content == DNS_PAYLOAD_IS_SERVER_LIST &&
-		    v1->hdr.version == 1 &&
-		    (v1->status != DNS_LOOKUP_GOOD &&
-		     v1->status != DNS_LOOKUP_GOOD_WITH_BAD))
-			return -ENOENT;
-
+	cell = afs_lookup_cell(net, name, len, NULL, false,
+			       afs_cell_trace_use_lookup_dynroot);
+	if (IS_ERR(cell)) {
+		ret = PTR_ERR(cell);
+		goto out_no_cell;
 	}
 
-	kfree(result);
-	return ret;
-}
-
-/*
- * Try to auto mount the mountpoint with pseudo directory, if the autocell
- * operation is setted.
- */
-struct inode *afs_try_auto_mntpt(struct dentry *dentry, struct inode *dir)
-{
-	struct afs_vnode *vnode = AFS_FS_I(dir);
-	struct inode *inode;
-	int ret = -ENOENT;
-
-	_enter("%p{%pd}, {%llx:%llu}",
-	       dentry, dentry, vnode->fid.vid, vnode->fid.vnode);
-
-	if (!test_bit(AFS_VNODE_AUTOCELL, &vnode->flags))
-		goto out;
-
-	ret = afs_probe_cell_name(dentry);
-	if (ret < 0)
-		goto out;
-
-	inode = afs_iget_pseudo_dir(dir->i_sb, false);
+	inode = afs_iget_pseudo_dir(dir->i_sb, cell->dynroot_ino * 2 + dotted);
 	if (IS_ERR(inode)) {
 		ret = PTR_ERR(inode);
 		goto out;
 	}
 
-	_leave("= %p", inode);
-	return inode;
+	dentry->d_fsdata = cell;
+	return d_splice_alias(inode, dentry);
 
 out:
-	_leave("= %d", ret);
+	afs_unuse_cell(cell, afs_cell_trace_unuse_lookup_dynroot);
+out_no_cell:
+	if (!inode)
+		return d_splice_alias(inode, dentry);
 	return ret == -ENOENT ? NULL : ERR_PTR(ret);
 }
 
@@ -193,8 +140,6 @@ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentr
 {
 	_enter("%pd", dentry);
 
-	ASSERTCMP(d_inode(dentry), ==, NULL);
-
 	if (flags & LOOKUP_CREATE)
 		return ERR_PTR(-EOPNOTSUPP);
 
@@ -203,98 +148,49 @@ static struct dentry *afs_dynroot_lookup(struct inode *dir, struct dentry *dentr
 		return ERR_PTR(-ENAMETOOLONG);
 	}
 
-	return d_splice_alias(afs_try_auto_mntpt(dentry, dir), dentry);
+	if (dentry->d_name.len == 5 &&
+	    memcmp(dentry->d_name.name, "@cell", 5) == 0)
+		return afs_lookup_atcell(dir, dentry, 2);
+
+	if (dentry->d_name.len == 6 &&
+	    memcmp(dentry->d_name.name, ".@cell", 6) == 0)
+		return afs_lookup_atcell(dir, dentry, 3);
+
+	return afs_dynroot_lookup_cell(dir, dentry, flags);
 }
 
 const struct inode_operations afs_dynroot_inode_operations = {
 	.lookup		= afs_dynroot_lookup,
 };
 
-const struct dentry_operations afs_dynroot_dentry_operations = {
-	.d_delete	= always_delete_dentry,
-	.d_release	= afs_d_release,
-	.d_automount	= afs_d_automount,
-};
-
-/*
- * Create a manually added cell mount directory.
- * - The caller must hold net->proc_cells_lock
- */
-int afs_dynroot_mkdir(struct afs_net *net, struct afs_cell *cell)
-{
-	struct super_block *sb = net->dynroot_sb;
-	struct dentry *root, *subdir, *dsubdir;
-	char *dotname = cell->name - 1;
-	int ret;
-
-	if (!sb || atomic_read(&sb->s_active) == 0)
-		return 0;
-
-	/* Let the ->lookup op do the creation */
-	root = sb->s_root;
-	inode_lock(root->d_inode);
-	subdir = lookup_one_len(cell->name, root, cell->name_len);
-	if (IS_ERR(subdir)) {
-		ret = PTR_ERR(subdir);
-		goto unlock;
-	}
-
-	dsubdir = lookup_one_len(dotname, root, cell->name_len + 1);
-	if (IS_ERR(dsubdir)) {
-		ret = PTR_ERR(dsubdir);
-		dput(subdir);
-		goto unlock;
-	}
-
-	/* Note that we're retaining extra refs on the dentries. */
-	subdir->d_fsdata = (void *)1UL;
-	dsubdir->d_fsdata = (void *)1UL;
-	ret = 0;
-unlock:
-	inode_unlock(root->d_inode);
-	return ret;
-}
-
-static void afs_dynroot_rm_one_dir(struct dentry *root, const char *name, size_t name_len)
+static void afs_dynroot_d_release(struct dentry *dentry)
 {
-	struct dentry *subdir;
-
-	/* Don't want to trigger a lookup call, which will re-add the cell */
-	subdir = try_lookup_one_len(name, root, name_len);
-	if (IS_ERR_OR_NULL(subdir)) {
-		_debug("lookup %ld", PTR_ERR(subdir));
-		return;
-	}
-
-	_debug("rmdir %pd %u", subdir, d_count(subdir));
+	struct afs_cell *cell = dentry->d_fsdata;
 
-	if (subdir->d_fsdata) {
-		_debug("unpin %u", d_count(subdir));
-		subdir->d_fsdata = NULL;
-		dput(subdir);
-	}
-	dput(subdir);
+	afs_unuse_cell(cell, afs_cell_trace_unuse_dynroot_mntpt);
 }
 
 /*
- * Remove a manually added cell mount directory.
- * - The caller must hold net->proc_cells_lock
+ * Keep @cell symlink dentries around, but only keep cell autodirs when they're
+ * being used.
*/ -void afs_dynroot_rmdir(struct afs_net *net, struct afs_cell *cell) +static int afs_dynroot_delete_dentry(const struct dentry *dentry) { - struct super_block *sb = net->dynroot_sb; - char *dotname = cell->name - 1; - - if (!sb || atomic_read(&sb->s_active) == 0) - return; + const struct qstr *name = &dentry->d_name; - inode_lock(sb->s_root->d_inode); - afs_dynroot_rm_one_dir(sb->s_root, cell->name, cell->name_len); - afs_dynroot_rm_one_dir(sb->s_root, dotname, cell->name_len + 1); - inode_unlock(sb->s_root->d_inode); - _leave(""); + if (name->len == 5 && memcmp(name->name, "@cell", 5) == 0) + return 0; + if (name->len == 6 && memcmp(name->name, ".@cell", 6) == 0) + return 0; + return 1; } +const struct dentry_operations afs_dynroot_dentry_operations = { + .d_delete = afs_dynroot_delete_dentry, + .d_release = afs_dynroot_d_release, + .d_automount = afs_d_automount, +}; + static void afs_atcell_delayed_put_cell(void *arg) { struct afs_cell *cell = arg; @@ -314,12 +210,23 @@ static const char *afs_atcell_get_link(struct dentry *dentry, struct inode *inod const char *name; bool dotted = vnode->fid.vnode == 3; - if (!net->ws_cell) + if (!rcu_access_pointer(net->ws_cell)) return ERR_PTR(-ENOENT); + if (!dentry) { + /* We're in RCU-pathwalk. */ + cell = rcu_dereference(net->ws_cell); + if (dotted) + name = cell->name - 1; + else + name = cell->name; + /* Shouldn't need to set a delayed call. */ + return name; + } + down_read(&net->cells_lock); - cell = net->ws_cell; + cell = rcu_dereference_protected(net->ws_cell, lockdep_is_held(&net->cells_lock)); if (dotted) name = cell->name - 1; else @@ -336,149 +243,163 @@ static const struct inode_operations afs_atcell_inode_operations = { }; /* - * Look up @cell or .@cell in a dynroot directory. This is a substitution for - * the local cell name for the net namespace. + * Create an inode for the @cell or .@cell symlinks. 
*/ -static struct dentry *afs_dynroot_create_symlink(struct dentry *root, const char *name) +static struct dentry *afs_lookup_atcell(struct inode *dir, struct dentry *dentry, ino_t ino) { struct afs_vnode *vnode; - struct afs_fid fid = { .vnode = 2, .unique = 1, }; - struct dentry *dentry; struct inode *inode; + struct afs_fid fid = { .vnode = ino, .unique = 1, }; - if (name[0] == '.') - fid.vnode = 3; - - dentry = d_alloc_name(root, name); - if (!dentry) - return ERR_PTR(-ENOMEM); - - inode = iget5_locked(dentry->d_sb, fid.vnode, + inode = iget5_locked(dir->i_sb, fid.vnode, afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid); - if (!inode) { - dput(dentry); + if (!inode) return ERR_PTR(-ENOMEM); - } vnode = AFS_FS_I(inode); - /* there shouldn't be an existing inode */ - if (WARN_ON_ONCE(!(inode->i_state & I_NEW))) { - iput(inode); - dput(dentry); - return ERR_PTR(-EIO); + if (inode->i_state & I_NEW) { + netfs_inode_init(&vnode->netfs, NULL, false); + simple_inode_init_ts(inode); + set_nlink(inode, 1); + inode->i_size = 0; + inode->i_mode = S_IFLNK | 0555; + inode->i_op = &afs_atcell_inode_operations; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_blocks = 0; + inode->i_generation = 0; + inode->i_flags |= S_NOATIME; + + unlock_new_inode(inode); } - - netfs_inode_init(&vnode->netfs, NULL, false); - simple_inode_init_ts(inode); - set_nlink(inode, 1); - inode->i_size = 0; - inode->i_mode = S_IFLNK | 0555; - inode->i_op = &afs_atcell_inode_operations; - inode->i_uid = GLOBAL_ROOT_UID; - inode->i_gid = GLOBAL_ROOT_GID; - inode->i_blocks = 0; - inode->i_generation = 0; - inode->i_flags |= S_NOATIME; - - unlock_new_inode(inode); - d_splice_alias(inode, dentry); - return dentry; + return d_splice_alias(inode, dentry); } /* - * Create @cell and .@cell symlinks. + * Transcribe the cell database into readdir content under the RCU read lock. + * Each cell produces two entries, one prefixed with a dot and one not. */ -static int afs_dynroot_symlink(struct afs_net *net) +static int afs_dynroot_readdir_cells(struct afs_net *net, struct dir_context *ctx) { - struct super_block *sb = net->dynroot_sb; - struct dentry *root, *symlink, *dsymlink; - int ret; - - /* Let the ->lookup op do the creation */ - root = sb->s_root; - inode_lock(root->d_inode); - symlink = afs_dynroot_create_symlink(root, "@cell"); - if (IS_ERR(symlink)) { - ret = PTR_ERR(symlink); - goto unlock; - } + const struct afs_cell *cell; + loff_t newpos; + + _enter("%llu", ctx->pos); + + for (;;) { + unsigned int ix = ctx->pos >> 1; + + cell = idr_get_next(&net->cells_dyn_ino, &ix); + if (!cell) + return 0; + if (READ_ONCE(cell->state) == AFS_CELL_REMOVING || + READ_ONCE(cell->state) == AFS_CELL_DEAD) { + ctx->pos += 2; + ctx->pos &= ~1; + continue; + } - dsymlink = afs_dynroot_create_symlink(root, ".@cell"); - if (IS_ERR(dsymlink)) { - ret = PTR_ERR(dsymlink); - dput(symlink); - goto unlock; - } + newpos = ix << 1; + if (newpos > ctx->pos) + ctx->pos = newpos; - /* Note that we're retaining extra refs on the dentries. 
*/ - symlink->d_fsdata = (void *)1UL; - dsymlink->d_fsdata = (void *)1UL; - ret = 0; -unlock: - inode_unlock(root->d_inode); - return ret; + _debug("pos %llu -> cell %u", ctx->pos, cell->dynroot_ino); + + if ((ctx->pos & 1) == 0) { + if (!dir_emit(ctx, cell->name, cell->name_len, + cell->dynroot_ino, DT_DIR)) + return 0; + ctx->pos++; + } + if ((ctx->pos & 1) == 1) { + if (!dir_emit(ctx, cell->name - 1, cell->name_len + 1, + cell->dynroot_ino + 1, DT_DIR)) + return 0; + ctx->pos++; + } + } + return 0; } /* - * Populate a newly created dynamic root with cell names. + * Read the AFS dynamic root directory. This produces a list of cellnames, + * dotted and undotted, along with @cell and .@cell links if configured. */ -int afs_dynroot_populate(struct super_block *sb) +static int afs_dynroot_readdir(struct file *file, struct dir_context *ctx) { - struct afs_cell *cell; - struct afs_net *net = afs_sb2net(sb); - int ret; - - mutex_lock(&net->proc_cells_lock); + struct afs_net *net = afs_d2net(file->f_path.dentry); + int ret = 0; - net->dynroot_sb = sb; - ret = afs_dynroot_symlink(net); - if (ret < 0) - goto error; + if (!dir_emit_dots(file, ctx)) + return 0; - hlist_for_each_entry(cell, &net->proc_cells, proc_link) { - ret = afs_dynroot_mkdir(net, cell); - if (ret < 0) - goto error; + if (ctx->pos == 2) { + if (rcu_access_pointer(net->ws_cell) && + !dir_emit(ctx, "@cell", 5, 2, DT_LNK)) + return 0; + ctx->pos = 3; + } + if (ctx->pos == 3) { + if (rcu_access_pointer(net->ws_cell) && + !dir_emit(ctx, ".@cell", 6, 3, DT_LNK)) + return 0; + ctx->pos = 4; } - ret = 0; -out: - mutex_unlock(&net->proc_cells_lock); + if ((unsigned long long)ctx->pos <= AFS_MAX_DYNROOT_CELL_INO) { + down_read(&net->cells_lock); + ret = afs_dynroot_readdir_cells(net, ctx); + up_read(&net->cells_lock); + } return ret; - -error: - net->dynroot_sb = NULL; - goto out; } +static const struct file_operations afs_dynroot_file_operations = { + .llseek = generic_file_llseek, + .read = generic_read_dir, + .iterate_shared = afs_dynroot_readdir, + .fsync = noop_fsync, +}; + /* - * When a dynamic root that's in the process of being destroyed, depopulate it - * of pinned directories. + * Create an inode for a dynamic root directory. 
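A note on the readdir position encoding used above: bit 0 of ctx->pos selects the undotted (0) or dotted (1) form of a cell's name, and the remaining bits index the cells_dyn_ino IDR, so every cell owns a pair of adjacent positions. A sketch of the decode and skip arithmetic, assuming only what the hunk above shows:

	unsigned int cell_index = ctx->pos >> 1;	/* which cell */
	bool dotted = ctx->pos & 1;			/* ".cell" vs "cell" form */

	/* Skipping a dead cell: advance past both halves of its pair,
	 * landing on the next cell's undotted entry.
	 */
	ctx->pos += 2;
	ctx->pos &= ~1;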
*/ -void afs_dynroot_depopulate(struct super_block *sb) +struct inode *afs_dynroot_iget_root(struct super_block *sb) { - struct afs_net *net = afs_sb2net(sb); - struct dentry *root = sb->s_root, *subdir; - - /* Prevent more subdirs from being created */ - mutex_lock(&net->proc_cells_lock); - if (net->dynroot_sb == sb) - net->dynroot_sb = NULL; - mutex_unlock(&net->proc_cells_lock); - - if (root) { - struct hlist_node *n; - inode_lock(root->d_inode); - - /* Remove all the pins for dirs created for manually added cells */ - hlist_for_each_entry_safe(subdir, n, &root->d_children, d_sib) { - if (subdir->d_fsdata) { - subdir->d_fsdata = NULL; - dput(subdir); - } - } + struct afs_super_info *as = AFS_FS_S(sb); + struct afs_vnode *vnode; + struct inode *inode; + struct afs_fid fid = { .vid = 0, .vnode = 1, .unique = 1,}; + + if (as->volume) + fid.vid = as->volume->vid; - inode_unlock(root->d_inode); + inode = iget5_locked(sb, fid.vnode, + afs_iget5_pseudo_test, afs_iget5_pseudo_set, &fid); + if (!inode) + return ERR_PTR(-ENOMEM); + + vnode = AFS_FS_I(inode); + + /* there shouldn't be an existing inode */ + if (inode->i_state & I_NEW) { + netfs_inode_init(&vnode->netfs, NULL, false); + simple_inode_init_ts(inode); + set_nlink(inode, 2); + inode->i_size = 0; + inode->i_mode = S_IFDIR | 0555; + inode->i_op = &afs_dynroot_inode_operations; + inode->i_fop = &afs_dynroot_file_operations; + inode->i_uid = GLOBAL_ROOT_UID; + inode->i_gid = GLOBAL_ROOT_GID; + inode->i_blocks = 0; + inode->i_generation = 0; + inode->i_flags |= S_NOATIME; + + set_bit(AFS_VNODE_PSEUDODIR, &vnode->flags); + unlock_new_inode(inode); } + _leave(" = %p", inode); + return inode; } diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c index b516d05b0fef..e0030ac74ea0 100644 --- a/fs/afs/fs_probe.c +++ b/fs/afs/fs_probe.c @@ -235,20 +235,20 @@ out: * Probe all of a fileserver's addresses to find out the best route and to * query its capabilities. 
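The rewritten prober below replaces a server's endpoint state using the standard RCU publish sequence: allocate and fill the new object, swap it in with rcu_assign_pointer() while holding the write lock, then mark the old state superseded once the lock is dropped. In outline (a sketch of the pattern, not a verbatim excerpt):

	write_lock(&server->fs_lock);
	old = rcu_dereference_protected(server->endpoint_state,
					lockdep_is_held(&server->fs_lock));
	rcu_assign_pointer(server->endpoint_state, estate);
	write_unlock(&server->fs_lock);
	if (old)
		set_bit(AFS_ESTATE_SUPERSEDED, &old->flags);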
*/ -void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, - struct afs_addr_list *new_alist, struct key *key) +int afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, + struct afs_addr_list *new_alist, struct key *key) { struct afs_endpoint_state *estate, *old; - struct afs_addr_list *alist; + struct afs_addr_list *old_alist = NULL, *alist; unsigned long unprobed; _enter("%pU", &server->uuid); estate = kzalloc(sizeof(*estate), GFP_KERNEL); if (!estate) - return; + return -ENOMEM; - refcount_set(&estate->ref, 1); + refcount_set(&estate->ref, 2); estate->server_id = server->debug_id; estate->rtt = UINT_MAX; @@ -256,21 +256,31 @@ void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, old = rcu_dereference_protected(server->endpoint_state, lockdep_is_held(&server->fs_lock)); - estate->responsive_set = old->responsive_set; - estate->addresses = afs_get_addrlist(new_alist ?: old->addresses, - afs_alist_trace_get_estate); + if (old) { + estate->responsive_set = old->responsive_set; + if (!new_alist) + new_alist = old->addresses; + } + + if (old_alist != new_alist) + afs_set_peer_appdata(server, old_alist, new_alist); + + estate->addresses = afs_get_addrlist(new_alist, afs_alist_trace_get_estate); alist = estate->addresses; estate->probe_seq = ++server->probe_counter; atomic_set(&estate->nr_probing, alist->nr_addrs); + if (new_alist) + server->addr_version = new_alist->version; rcu_assign_pointer(server->endpoint_state, estate); - set_bit(AFS_ESTATE_SUPERSEDED, &old->flags); write_unlock(&server->fs_lock); + if (old) + set_bit(AFS_ESTATE_SUPERSEDED, &old->flags); trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref), afs_estate_trace_alloc_probe); - afs_get_address_preferences(net, alist); + afs_get_address_preferences(net, new_alist); server->probed_at = jiffies; unprobed = (1UL << alist->nr_addrs) - 1; @@ -293,6 +303,8 @@ void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, } afs_put_endpoint_state(old, afs_estate_trace_put_probe); + afs_put_endpoint_state(estate, afs_estate_trace_put_probe); + return 0; } /* @@ -522,6 +534,6 @@ dont_wait: */ void afs_fs_probe_cleanup(struct afs_net *net) { - if (del_timer_sync(&net->fs_probe_timer)) + if (timer_delete_sync(&net->fs_probe_timer)) afs_dec_servers_outstanding(net); } diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c index 1d9ecd5418d8..bc9556991d7c 100644 --- a/fs/afs/fsclient.c +++ b/fs/afs/fsclient.c @@ -1653,7 +1653,7 @@ int afs_fs_give_up_all_callbacks(struct afs_net *net, struct afs_server *server, bp = call->request; *bp++ = htonl(FSGIVEUPALLCALLBACKS); - call->server = afs_use_server(server, afs_server_trace_give_up_cb); + call->server = afs_use_server(server, false, afs_server_trace_use_give_up_cb); afs_make_call(call, GFP_NOFS); afs_wait_for_call_to_complete(call); ret = call->error; @@ -1760,7 +1760,7 @@ bool afs_fs_get_capabilities(struct afs_net *net, struct afs_server *server, return false; call->key = key; - call->server = afs_use_server(server, afs_server_trace_get_caps); + call->server = afs_use_server(server, false, afs_server_trace_use_get_caps); call->peer = rxrpc_kernel_get_peer(estate->addresses->addrs[addr_index].peer); call->probe = afs_get_endpoint_state(estate, afs_estate_trace_get_getcaps); call->probe_index = addr_index; diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 90f407774a9a..1124ea4000cb 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -20,6 +20,7 @@ #include <linux/uuid.h> 
#include <linux/mm_types.h> #include <linux/dns_resolver.h> +#include <crypto/krb5.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> @@ -176,8 +177,10 @@ struct afs_call { bool intr; /* T if interruptible */ bool unmarshalling_error; /* T if an unmarshalling error occurred */ bool responded; /* Got a response from the call (may be abort) */ + u8 security_ix; /* Security class */ u16 service_id; /* Actual service ID (after upgrade) */ unsigned int debug_id; /* Trace ID */ + u32 enctype; /* Security encoding type */ u32 operation_ID; /* operation ID for an incoming call */ u32 count; /* count for use in unmarshalling */ union { /* place to extract temporary data */ @@ -281,15 +284,15 @@ struct afs_net { struct socket *socket; struct afs_call *spare_incoming_call; struct work_struct charge_preallocation_work; + struct work_struct rx_oob_work; struct mutex socket_mutex; atomic_t nr_outstanding_calls; atomic_t nr_superblocks; /* Cell database */ struct rb_root cells; - struct afs_cell *ws_cell; - struct work_struct cells_manager; - struct timer_list cells_timer; + struct idr cells_dyn_ino; /* cell->dynroot_ino mapping */ + struct afs_cell __rcu *ws_cell; atomic_t cells_outstanding; struct rw_semaphore cells_lock; struct mutex cells_alias_lock; @@ -301,18 +304,12 @@ struct afs_net { * cell, but in practice, people create aliases and subsets and there's * no easy way to distinguish them. */ - seqlock_t fs_lock; /* For fs_servers, fs_probe_*, fs_proc */ - struct rb_root fs_servers; /* afs_server (by server UUID or address) */ + seqlock_t fs_lock; /* For fs_probe_*, fs_proc */ struct list_head fs_probe_fast; /* List of afs_server to probe at 30s intervals */ struct list_head fs_probe_slow; /* List of afs_server to probe at 5m intervals */ struct hlist_head fs_proc; /* procfs servers list */ - struct hlist_head fs_addresses; /* afs_server (by lowest IPv6 addr) */ - seqlock_t fs_addr_lock; /* For fs_addresses[46] */ - - struct work_struct fs_manager; - struct timer_list fs_timer; - + struct key *fs_cm_token_key; /* Key for creating CM tokens */ struct work_struct fs_prober; struct timer_list fs_probe_timer; atomic_t servers_outstanding; @@ -345,13 +342,10 @@ struct afs_net { extern const char afs_init_sysname[]; enum afs_cell_state { - AFS_CELL_UNSET, - AFS_CELL_ACTIVATING, + AFS_CELL_SETTING_UP, AFS_CELL_ACTIVE, - AFS_CELL_DEACTIVATING, - AFS_CELL_INACTIVE, - AFS_CELL_FAILED, - AFS_CELL_REMOVED, + AFS_CELL_REMOVING, + AFS_CELL_DEAD, }; /* @@ -382,7 +376,9 @@ struct afs_cell { struct afs_cell *alias_of; /* The cell this is an alias of */ struct afs_volume *root_volume; /* The root.cell volume if there is one */ struct key *anonymous_key; /* anonymous user key for this cell */ + struct work_struct destroyer; /* Destroyer for cell */ struct work_struct manager; /* Manager for init/deinit/dns */ + struct timer_list management_timer; /* General management timer */ struct hlist_node proc_link; /* /proc cell list link */ time64_t dns_expiry; /* Time AFSDB/SRV record expires */ time64_t last_inactive; /* Time of last drop of usage count */ @@ -398,6 +394,7 @@ struct afs_cell { enum dns_lookup_status dns_status:8; /* Latest status of data from lookup */ unsigned int dns_lookup_count; /* Counter of DNS lookups */ unsigned int debug_id; + unsigned int dynroot_ino; /* Inode numbers for dynroot (a pair) */ /* The volumes belonging to this cell */ struct rw_semaphore vs_lock; /* Lock for server->volumes */ @@ -407,7 +404,7 @@ struct afs_cell { /* Active fileserver interaction 
state. */ struct rb_root fs_servers; /* afs_server (by server UUID) */ - seqlock_t fs_lock; /* For fs_servers */ + struct rw_semaphore fs_lock; /* For fs_servers */ /* VL server list. */ rwlock_t vl_servers_lock; /* Lock on vl_servers */ @@ -542,22 +539,24 @@ struct afs_server { }; struct afs_cell *cell; /* Cell to which belongs (pins ref) */ - struct rb_node uuid_rb; /* Link in net->fs_servers */ - struct afs_server __rcu *uuid_next; /* Next server with same UUID */ - struct afs_server *uuid_prev; /* Previous server with same UUID */ - struct list_head probe_link; /* Link in net->fs_probe_list */ - struct hlist_node addr_link; /* Link in net->fs_addresses6 */ + struct rb_node uuid_rb; /* Link in cell->fs_servers */ + struct list_head probe_link; /* Link in net->fs_probe_* */ struct hlist_node proc_link; /* Link in net->fs_proc */ struct list_head volumes; /* RCU list of afs_server_entry objects */ - struct afs_server *gc_next; /* Next server in manager's list */ + struct work_struct destroyer; /* Work item to try and destroy a server */ + struct timer_list timer; /* Management timer */ + struct mutex cm_token_lock; /* Lock governing creation of appdata */ + struct krb5_buffer cm_rxgk_appdata; /* Appdata to be included in RESPONSE packet */ time64_t unuse_time; /* Time at which last unused */ unsigned long flags; #define AFS_SERVER_FL_RESPONDING 0 /* The server is responding */ #define AFS_SERVER_FL_UPDATING 1 #define AFS_SERVER_FL_NEEDS_UPDATE 2 /* Fileserver address list is out of date */ -#define AFS_SERVER_FL_NOT_READY 4 /* The record is not ready for use */ -#define AFS_SERVER_FL_NOT_FOUND 5 /* VL server says no such server */ -#define AFS_SERVER_FL_VL_FAIL 6 /* Failed to access VL server */ +#define AFS_SERVER_FL_UNCREATED 3 /* The record needs creating */ +#define AFS_SERVER_FL_CREATING 4 /* The record is being created */ +#define AFS_SERVER_FL_EXPIRED 5 /* The record has expired */ +#define AFS_SERVER_FL_NOT_FOUND 6 /* VL server says no such server */ +#define AFS_SERVER_FL_VL_FAIL 7 /* Failed to access VL server */ #define AFS_SERVER_FL_MAY_HAVE_CB 8 /* May have callbacks on this fileserver */ #define AFS_SERVER_FL_IS_YFS 16 /* Server is YFS not AFS */ #define AFS_SERVER_FL_NO_IBULK 17 /* Fileserver doesn't support FS.InlineBulkStatus */ @@ -567,6 +566,7 @@ struct afs_server { atomic_t active; /* Active user count */ u32 addr_version; /* Address list version */ u16 service_id; /* Service ID we're using. 
*/ + short create_error; /* Creation error */ unsigned int rtt; /* Server's current RTT in uS */ unsigned int debug_id; /* Debugging ID for traces */ @@ -621,6 +621,7 @@ struct afs_volume { afs_volid_t vid; /* The volume ID of this volume */ afs_volid_t vids[AFS_MAXTYPES]; /* All associated volume IDs */ refcount_t ref; + unsigned int debug_id; /* Debugging ID for traces */ time64_t update_at; /* Time at which to next update */ struct afs_cell *cell; /* Cell to which belongs (pins ref) */ struct rb_node cell_node; /* Link in cell->volumes */ @@ -700,7 +701,6 @@ struct afs_vnode { #define AFS_VNODE_ZAP_DATA 3 /* set if vnode's data should be invalidated */ #define AFS_VNODE_DELETED 4 /* set if vnode deleted on server */ #define AFS_VNODE_MOUNTPOINT 5 /* set if vnode is a mountpoint symlink */ -#define AFS_VNODE_AUTOCELL 6 /* set if Vnode is an auto mount point */ #define AFS_VNODE_PSEUDODIR 7 /* set if Vnode is a pseudo directory */ #define AFS_VNODE_NEW_CONTENT 8 /* Set if file has new content (create/trunc-0) */ #define AFS_VNODE_SILLY_DELETED 9 /* Set if file has been silly-deleted */ @@ -1008,6 +1008,9 @@ extern int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *addr, __be32 xdr, u16 port); extern int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *addr, __be32 *xdr, u16 port); +void afs_set_peer_appdata(struct afs_server *server, + struct afs_addr_list *old_alist, + struct afs_addr_list *new_alist); /* * addr_prefs.c @@ -1044,16 +1047,17 @@ static inline bool afs_cb_is_broken(unsigned int cb_break, extern int afs_cell_init(struct afs_net *, const char *); extern struct afs_cell *afs_find_cell(struct afs_net *, const char *, unsigned, enum afs_cell_trace); -extern struct afs_cell *afs_lookup_cell(struct afs_net *, const char *, unsigned, - const char *, bool); +struct afs_cell *afs_lookup_cell(struct afs_net *net, + const char *name, unsigned int namesz, + const char *vllist, bool excl, + enum afs_cell_trace trace); extern struct afs_cell *afs_use_cell(struct afs_cell *, enum afs_cell_trace); -extern void afs_unuse_cell(struct afs_net *, struct afs_cell *, enum afs_cell_trace); +void afs_unuse_cell(struct afs_cell *cell, enum afs_cell_trace reason); extern struct afs_cell *afs_get_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_see_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_put_cell(struct afs_cell *, enum afs_cell_trace); extern void afs_queue_cell(struct afs_cell *, enum afs_cell_trace); -extern void afs_manage_cells(struct work_struct *); -extern void afs_cells_timer(struct timer_list *); +void afs_set_cell_timer(struct afs_cell *cell, unsigned int delay_secs); extern void __net_exit afs_cell_purge(struct afs_net *); /* @@ -1062,6 +1066,19 @@ extern void __net_exit afs_cell_purge(struct afs_net *); extern bool afs_cm_incoming_call(struct afs_call *); /* + * cm_security.c + */ +void afs_process_oob_queue(struct work_struct *work); +#ifdef CONFIG_RXGK +int afs_create_token_key(struct afs_net *net, struct socket *socket); +#else +static inline int afs_create_token_key(struct afs_net *net, struct socket *socket) +{ + return 0; +} +#endif + +/* * dir.c */ extern const struct file_operations afs_dir_file_operations; @@ -1111,11 +1128,7 @@ extern int afs_silly_iput(struct dentry *, struct inode *); extern const struct inode_operations afs_dynroot_inode_operations; extern const struct dentry_operations afs_dynroot_dentry_operations; -extern struct inode *afs_try_auto_mntpt(struct dentry *, struct inode *); -extern int 
afs_dynroot_mkdir(struct afs_net *, struct afs_cell *); -extern void afs_dynroot_rmdir(struct afs_net *, struct afs_cell *); -extern int afs_dynroot_populate(struct super_block *); -extern void afs_dynroot_depopulate(struct super_block *); +struct inode *afs_dynroot_iget_root(struct super_block *sb); /* * file.c @@ -1207,8 +1220,8 @@ struct afs_endpoint_state *afs_get_endpoint_state(struct afs_endpoint_state *est enum afs_estate_trace where); void afs_put_endpoint_state(struct afs_endpoint_state *estate, enum afs_estate_trace where); extern void afs_fileserver_probe_result(struct afs_call *); -void afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, - struct afs_addr_list *new_addrs, struct key *key); +int afs_fs_probe_fileserver(struct afs_net *net, struct afs_server *server, + struct afs_addr_list *new_alist, struct key *key); int afs_wait_for_fs_probes(struct afs_operation *op, struct afs_server_state *states, bool intr); extern void afs_probe_fileserver(struct afs_net *, struct afs_server *); extern void afs_fs_probe_dispatcher(struct work_struct *); @@ -1228,7 +1241,6 @@ int afs_readlink(struct dentry *dentry, char __user *buffer, int buflen); extern void afs_vnode_commit_status(struct afs_operation *, struct afs_vnode_param *); extern int afs_fetch_status(struct afs_vnode *, struct key *, bool, afs_access_t *); extern int afs_ilookup5_test_by_fid(struct inode *, void *); -extern struct inode *afs_iget_pseudo_dir(struct super_block *, bool); extern struct inode *afs_iget(struct afs_operation *, struct afs_vnode_param *); extern struct inode *afs_root_iget(struct super_block *, struct key *); extern int afs_getattr(struct mnt_idmap *idmap, const struct path *, @@ -1510,20 +1522,30 @@ extern void __exit afs_clean_up_permit_cache(void); */ extern spinlock_t afs_server_peer_lock; -extern struct afs_server *afs_find_server(struct afs_net *, const struct rxrpc_peer *); -extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *); +struct afs_server *afs_find_server(const struct rxrpc_peer *peer); extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *, u32); extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace); -extern struct afs_server *afs_use_server(struct afs_server *, enum afs_server_trace); -extern void afs_unuse_server(struct afs_net *, struct afs_server *, enum afs_server_trace); -extern void afs_unuse_server_notime(struct afs_net *, struct afs_server *, enum afs_server_trace); +struct afs_server *afs_use_server(struct afs_server *server, bool activate, + enum afs_server_trace reason); +void afs_unuse_server(struct afs_net *net, struct afs_server *server, + enum afs_server_trace reason); +void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server, + enum afs_server_trace reason); extern void afs_put_server(struct afs_net *, struct afs_server *, enum afs_server_trace); -extern void afs_manage_servers(struct work_struct *); -extern void afs_servers_timer(struct timer_list *); +void afs_purge_servers(struct afs_cell *cell); extern void afs_fs_probe_timer(struct timer_list *); -extern void __net_exit afs_purge_servers(struct afs_net *); +void __net_exit afs_wait_for_servers(struct afs_net *net); bool afs_check_server_record(struct afs_operation *op, struct afs_server *server, struct key *key); +static inline void afs_see_server(struct afs_server *server, enum afs_server_trace trace) +{ + int r = refcount_read(&server->ref); + int a = 
atomic_read(&server->active); + + trace_afs_server(server->debug_id, r, a, trace); +} + static inline void afs_inc_servers_outstanding(struct afs_net *net) { atomic_inc(&net->servers_outstanding); diff --git a/fs/afs/main.c b/fs/afs/main.c index 1ae0067f772d..02475d415d88 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -73,28 +73,21 @@ static int __net_init afs_net_init(struct net *net_ns) generate_random_uuid((unsigned char *)&net->uuid); INIT_WORK(&net->charge_preallocation_work, afs_charge_preallocation); + INIT_WORK(&net->rx_oob_work, afs_process_oob_queue); mutex_init(&net->socket_mutex); net->cells = RB_ROOT; + idr_init(&net->cells_dyn_ino); init_rwsem(&net->cells_lock); - INIT_WORK(&net->cells_manager, afs_manage_cells); - timer_setup(&net->cells_timer, afs_cells_timer, 0); - mutex_init(&net->cells_alias_lock); mutex_init(&net->proc_cells_lock); INIT_HLIST_HEAD(&net->proc_cells); seqlock_init(&net->fs_lock); - net->fs_servers = RB_ROOT; INIT_LIST_HEAD(&net->fs_probe_fast); INIT_LIST_HEAD(&net->fs_probe_slow); INIT_HLIST_HEAD(&net->fs_proc); - INIT_HLIST_HEAD(&net->fs_addresses); - seqlock_init(&net->fs_addr_lock); - - INIT_WORK(&net->fs_manager, afs_manage_servers); - timer_setup(&net->fs_timer, afs_servers_timer, 0); INIT_WORK(&net->fs_prober, afs_fs_probe_dispatcher); timer_setup(&net->fs_probe_timer, afs_fs_probe_timer, 0); atomic_set(&net->servers_outstanding, 1); @@ -130,13 +123,14 @@ error_open_socket: net->live = false; afs_fs_probe_cleanup(net); afs_cell_purge(net); - afs_purge_servers(net); + afs_wait_for_servers(net); error_cell_init: net->live = false; afs_proc_cleanup(net); error_proc: afs_put_sysnames(net->sysnames); error_sysnames: + idr_destroy(&net->cells_dyn_ino); net->live = false; return ret; } @@ -151,10 +145,11 @@ static void __net_exit afs_net_exit(struct net *net_ns) net->live = false; afs_fs_probe_cleanup(net); afs_cell_purge(net); - afs_purge_servers(net); + afs_wait_for_servers(net); afs_close_socket(net); afs_proc_cleanup(net); afs_put_sysnames(net->sysnames); + idr_destroy(&net->cells_dyn_ino); kfree_rcu(rcu_access_pointer(net->address_prefs), rcu); } diff --git a/fs/afs/misc.c b/fs/afs/misc.c index b8180bf2281f..8f2b3a177690 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -8,6 +8,7 @@ #include <linux/kernel.h> #include <linux/module.h> #include <linux/errno.h> +#include <crypto/krb5.h> #include "internal.h" #include "afs_fs.h" #include "protocol_uae.h" @@ -103,6 +104,32 @@ int afs_abort_to_error(u32 abort_code) case RXKADDATALEN: return -EKEYREJECTED; case RXKADILLEGALLEVEL: return -EKEYREJECTED; + case RXGK_INCONSISTENCY: return -EPROTO; + case RXGK_PACKETSHORT: return -EPROTO; + case RXGK_BADCHALLENGE: return -EPROTO; + case RXGK_SEALEDINCON: return -EKEYREJECTED; + case RXGK_NOTAUTH: return -EKEYREJECTED; + case RXGK_EXPIRED: return -EKEYEXPIRED; + case RXGK_BADLEVEL: return -EKEYREJECTED; + case RXGK_BADKEYNO: return -EKEYREJECTED; + case RXGK_NOTRXGK: return -EKEYREJECTED; + case RXGK_UNSUPPORTED: return -EKEYREJECTED; + case RXGK_GSSERROR: return -EKEYREJECTED; +#ifdef RXGK_BADETYPE + case RXGK_BADETYPE: return -ENOPKG; +#endif +#ifdef RXGK_BADTOKEN + case RXGK_BADTOKEN: return -EKEYREJECTED; +#endif +#ifdef RXGK_DATALEN + case RXGK_DATALEN: return -EPROTO; +#endif +#ifdef RXGK_BADQOP + case RXGK_BADQOP: return -EKEYREJECTED; +#endif + + case KRB5_PROG_KEYTYPE_NOSUPP: return -ENOPKG; + case RXGEN_OPCODE: return -ENOTSUPP; default: return -EREMOTEIO; diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 507c25a5b2cb..45cee6534122 100644 ---
a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -87,7 +87,7 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) ctx->force = true; } if (ctx->cell) { - afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_mntpt); + afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_mntpt); ctx->cell = NULL; } if (test_bit(AFS_VNODE_PSEUDODIR, &vnode->flags)) { @@ -107,7 +107,8 @@ static int afs_mntpt_set_params(struct fs_context *fc, struct dentry *mntpt) if (size > AFS_MAXCELLNAME) return -ENAMETOOLONG; - cell = afs_lookup_cell(ctx->net, p, size, NULL, false); + cell = afs_lookup_cell(ctx->net, p, size, NULL, false, + afs_cell_trace_use_lookup_mntpt); if (IS_ERR(cell)) { pr_err("kAFS: unable to lookup cell '%pd'\n", mntpt); return PTR_ERR(cell); diff --git a/fs/afs/proc.c b/fs/afs/proc.c index e7614f4f30c2..40e879c8ca77 100644 --- a/fs/afs/proc.c +++ b/fs/afs/proc.c @@ -122,14 +122,15 @@ static int afs_proc_cells_write(struct file *file, char *buf, size_t size) if (strcmp(buf, "add") == 0) { struct afs_cell *cell; - cell = afs_lookup_cell(net, name, strlen(name), args, true); + cell = afs_lookup_cell(net, name, strlen(name), args, true, + afs_cell_trace_use_lookup_add); if (IS_ERR(cell)) { ret = PTR_ERR(cell); goto done; } if (test_and_set_bit(AFS_CELL_FL_NO_GC, &cell->flags)) - afs_unuse_cell(net, cell, afs_cell_trace_unuse_no_pin); + afs_unuse_cell(cell, afs_cell_trace_unuse_no_pin); } else { goto inval; } @@ -206,7 +207,7 @@ static int afs_proc_rootcell_show(struct seq_file *m, void *v) net = afs_seq2net_single(m); down_read(&net->cells_lock); - cell = net->ws_cell; + cell = rcu_dereference_protected(net->ws_cell, lockdep_is_held(&net->cells_lock)); if (cell) seq_printf(m, "%s\n", cell->name); up_read(&net->cells_lock); @@ -242,7 +243,7 @@ static int afs_proc_rootcell_write(struct file *file, char *buf, size_t size) ret = -EEXIST; inode_lock(file_inode(file)); - if (!net->ws_cell) + if (!rcu_access_pointer(net->ws_cell)) ret = afs_cell_init(net, buf); else printk("busy\n"); @@ -443,8 +444,6 @@ static int afs_proc_servers_show(struct seq_file *m, void *v) } server = list_entry(v, struct afs_server, proc_link); - estate = rcu_dereference(server->endpoint_state); - alist = estate->addresses; seq_printf(m, "%pU %3d %3d %s\n", &server->uuid, refcount_read(&server->ref), @@ -454,10 +453,16 @@ static int afs_proc_servers_show(struct seq_file *m, void *v) server->flags, server->rtt); seq_printf(m, " - probe: last=%d\n", (int)(jiffies - server->probed_at) / HZ); + + estate = rcu_dereference(server->endpoint_state); + if (!estate) + goto out; failed = estate->failed_set; seq_printf(m, " - ESTATE pq=%x np=%u rsp=%lx f=%lx\n", estate->probe_seq, atomic_read(&estate->nr_probing), estate->responsive_set, estate->failed_set); + + alist = estate->addresses; seq_printf(m, " - ALIST v=%u ap=%u\n", alist->version, alist->addr_pref_version); for (i = 0; i < alist->nr_addrs; i++) { @@ -470,6 +475,8 @@ static int afs_proc_servers_show(struct seq_file *m, void *v) rxrpc_kernel_get_srtt(addr->peer), addr->last_error, addr->prio); } + +out: return 0; } diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index 886416ea1d96..c1cadf8fb346 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -24,8 +24,17 @@ static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned static void afs_process_async_call(struct work_struct *); static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long); +static void 
afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID); +static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob); static int afs_deliver_cm_op_id(struct afs_call *); +static const struct rxrpc_kernel_ops afs_rxrpc_callback_ops = { + .notify_new_call = afs_rx_new_call, + .discard_new_call = afs_rx_discard_new_call, + .user_attach_call = afs_rx_attach, + .notify_oob = afs_rx_notify_oob, +}; + /* asynchronous incoming call initial processing */ static const struct afs_call_type afs_RXCMxxxx = { .name = "CB.xxxx", @@ -49,6 +58,7 @@ int afs_open_socket(struct afs_net *net) goto error_1; socket->sk->sk_allocation = GFP_NOFS; + socket->sk->sk_user_data = net; /* bind the callback manager's address to make this a server socket */ memset(&srx, 0, sizeof(srx)); @@ -64,6 +74,14 @@ int afs_open_socket(struct afs_net *net) if (ret < 0) goto error_2; + ret = rxrpc_sock_set_manage_response(socket->sk, true); + if (ret < 0) + goto error_2; + + ret = afs_create_token_key(net, socket); + if (ret < 0) + pr_err("Couldn't create RxGK CM key: %d\n", ret); + ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); if (ret == -EADDRINUSE) { srx.transport.sin6.sin6_port = 0; @@ -84,8 +102,7 @@ int afs_open_socket(struct afs_net *net) * it sends back to us. */ - rxrpc_kernel_new_call_notification(socket, afs_rx_new_call, - afs_rx_discard_new_call); + rxrpc_kernel_set_notifications(socket, &afs_rxrpc_callback_ops); ret = kernel_listen(socket, INT_MAX); if (ret < 0) @@ -125,7 +142,9 @@ void afs_close_socket(struct afs_net *net) kernel_sock_shutdown(net->socket, SHUT_RDWR); flush_workqueue(afs_async_calls); + net->socket->sk->sk_user_data = NULL; sock_release(net->socket); + key_put(net->fs_cm_token_key); _debug("dework"); _leave(""); @@ -179,7 +198,7 @@ static void afs_free_call(struct afs_call *call) if (call->type->destructor) call->type->destructor(call); - afs_unuse_server_notime(call->net, call->server, afs_server_trace_put_call); + afs_unuse_server_notime(call->net, call->server, afs_server_trace_unuse_call); kfree(call->request); o = atomic_read(&net->nr_outstanding_calls); @@ -738,7 +757,6 @@ void afs_charge_preallocation(struct work_struct *work) if (rxrpc_kernel_charge_accept(net->socket, afs_wake_up_async_call, - afs_rx_attach, (unsigned long)call, GFP_KERNEL, call->debug_id) < 0) @@ -766,8 +784,14 @@ static void afs_rx_discard_new_call(struct rxrpc_call *rxcall, static void afs_rx_new_call(struct sock *sk, struct rxrpc_call *rxcall, unsigned long user_call_ID) { + struct afs_call *call = (struct afs_call *)user_call_ID; struct afs_net *net = afs_sock2net(sk); + call->peer = rxrpc_kernel_get_call_peer(sk->sk_socket, call->rxcall); + call->server = afs_find_server(call->peer); + if (!call->server) + trace_afs_cm_no_server(call, rxrpc_kernel_remote_srx(call->peer)); + queue_work(afs_wq, &net->charge_preallocation_work); } @@ -794,10 +818,14 @@ static int afs_deliver_cm_op_id(struct afs_call *call) if (!afs_cm_incoming_call(call)) return -ENOTSUPP; + call->security_ix = rxrpc_kernel_query_call_security(call->rxcall, + &call->service_id, + &call->enctype); + trace_afs_cb_call(call); call->work.func = call->type->work; - /* pass responsibility for the remainer of this message off to the + /* pass responsibility for the remainder of this message off to the * cache manager op */ return call->type->deliver(call); } @@ -946,3 +974,13 @@ noinline int afs_protocol_error(struct afs_call *call, call->unmarshalling_error = true; return -EBADMSG; } + +/* + * Wake up OOB notification 
processing. + */ +static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob) +{ + struct afs_net *net = sk->sk_user_data; + + schedule_work(&net->rx_oob_work); +} diff --git a/fs/afs/server.c b/fs/afs/server.c index 038f9d0ae3af..a97562f831eb 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -14,188 +14,104 @@ static unsigned afs_server_gc_delay = 10; /* Server record timeout in seconds */ static atomic_t afs_server_debug_id; -static struct afs_server *afs_maybe_use_server(struct afs_server *, - enum afs_server_trace); static void __afs_put_server(struct afs_net *, struct afs_server *); +static void afs_server_timer(struct timer_list *timer); +static void afs_server_destroyer(struct work_struct *work); /* * Find a server by one of its addresses. */ -struct afs_server *afs_find_server(struct afs_net *net, const struct rxrpc_peer *peer) +struct afs_server *afs_find_server(const struct rxrpc_peer *peer) { - const struct afs_endpoint_state *estate; - const struct afs_addr_list *alist; - struct afs_server *server = NULL; - unsigned int i; - int seq = 1; + struct afs_server *server = (struct afs_server *)rxrpc_kernel_get_peer_data(peer); - rcu_read_lock(); - - do { - if (server) - afs_unuse_server_notime(net, server, afs_server_trace_put_find_rsq); - server = NULL; - seq++; /* 2 on the 1st/lockless path, otherwise odd */ - read_seqbegin_or_lock(&net->fs_addr_lock, &seq); - - hlist_for_each_entry_rcu(server, &net->fs_addresses, addr_link) { - estate = rcu_dereference(server->endpoint_state); - alist = estate->addresses; - for (i = 0; i < alist->nr_addrs; i++) - if (alist->addrs[i].peer == peer) - goto found; - } - - server = NULL; - continue; - found: - server = afs_maybe_use_server(server, afs_server_trace_get_by_addr); - - } while (need_seqretry(&net->fs_addr_lock, seq)); - - done_seqretry(&net->fs_addr_lock, seq); - - rcu_read_unlock(); - return server; + if (!server) + return NULL; + return afs_use_server(server, false, afs_server_trace_use_cm_call); } /* - * Look up a server by its UUID and mark it active. + * Look up a server by its UUID and mark it active. The caller must hold + * cell->fs_lock. */ -struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uuid) +static struct afs_server *afs_find_server_by_uuid(struct afs_cell *cell, const uuid_t *uuid) { - struct afs_server *server = NULL; + struct afs_server *server; struct rb_node *p; - int diff, seq = 1; + int diff; _enter("%pU", uuid); - do { - /* Unfortunately, rbtree walking doesn't give reliable results - * under just the RCU read lock, so we have to check for - * changes. 
- */ - if (server) - afs_unuse_server(net, server, afs_server_trace_put_uuid_rsq); - server = NULL; - seq++; /* 2 on the 1st/lockless path, otherwise odd */ - read_seqbegin_or_lock(&net->fs_lock, &seq); - - p = net->fs_servers.rb_node; - while (p) { - server = rb_entry(p, struct afs_server, uuid_rb); - - diff = memcmp(uuid, &server->uuid, sizeof(*uuid)); - if (diff < 0) { - p = p->rb_left; - } else if (diff > 0) { - p = p->rb_right; - } else { - afs_use_server(server, afs_server_trace_get_by_uuid); - break; - } - - server = NULL; - } - } while (need_seqretry(&net->fs_lock, seq)); + p = cell->fs_servers.rb_node; + while (p) { + server = rb_entry(p, struct afs_server, uuid_rb); - done_seqretry(&net->fs_lock, seq); + diff = memcmp(uuid, &server->uuid, sizeof(*uuid)); + if (diff < 0) { + p = p->rb_left; + } else if (diff > 0) { + p = p->rb_right; + } else { + if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags)) + return NULL; /* Need a write lock */ + afs_use_server(server, true, afs_server_trace_use_by_uuid); + return server; + } + } - _leave(" = %p", server); - return server; + return NULL; } /* - * Install a server record in the namespace tree. If there's a clash, we stick - * it into a list anchored on whichever afs_server struct is actually in the - * tree. + * Install a server record in the cell tree. The caller must hold an exclusive + * lock on cell->fs_lock. */ static struct afs_server *afs_install_server(struct afs_cell *cell, - struct afs_server *candidate) + struct afs_server **candidate) { - const struct afs_endpoint_state *estate; - const struct afs_addr_list *alist; - struct afs_server *server, *next; + struct afs_server *server; struct afs_net *net = cell->net; struct rb_node **pp, *p; int diff; _enter("%p", candidate); - write_seqlock(&net->fs_lock); - /* Firstly install the server in the UUID lookup tree */ - pp = &net->fs_servers.rb_node; + pp = &cell->fs_servers.rb_node; p = NULL; while (*pp) { p = *pp; _debug("- consider %p", p); server = rb_entry(p, struct afs_server, uuid_rb); - diff = memcmp(&candidate->uuid, &server->uuid, sizeof(uuid_t)); - if (diff < 0) { + diff = memcmp(&(*candidate)->uuid, &server->uuid, sizeof(uuid_t)); + if (diff < 0) pp = &(*pp)->rb_left; - } else if (diff > 0) { + else if (diff > 0) pp = &(*pp)->rb_right; - } else { - if (server->cell == cell) - goto exists; - - /* We have the same UUID representing servers in - * different cells. Append the new server to the list. - */ - for (;;) { - next = rcu_dereference_protected( - server->uuid_next, - lockdep_is_held(&net->fs_lock.lock)); - if (!next) - break; - server = next; - } - rcu_assign_pointer(server->uuid_next, candidate); - candidate->uuid_prev = server; - server = candidate; - goto added_dup; - } + else + goto exists; } - server = candidate; + server = *candidate; + *candidate = NULL; rb_link_node(&server->uuid_rb, p, pp); - rb_insert_color(&server->uuid_rb, &net->fs_servers); + rb_insert_color(&server->uuid_rb, &cell->fs_servers); + write_seqlock(&net->fs_lock); hlist_add_head_rcu(&server->proc_link, &net->fs_proc); + write_sequnlock(&net->fs_lock); -added_dup: - write_seqlock(&net->fs_addr_lock); - estate = rcu_dereference_protected(server->endpoint_state, - lockdep_is_held(&net->fs_addr_lock.lock)); - alist = estate->addresses; - - /* Secondly, if the server has any IPv4 and/or IPv6 addresses, install - * it in the IPv4 and/or IPv6 reverse-map lists. 
- * - * TODO: For speed we want to use something other than a flat list - * here; even sorting the list in terms of lowest address would help a - * bit, but anything we might want to do gets messy and memory - * intensive. - */ - if (alist->nr_addrs > 0) - hlist_add_head_rcu(&server->addr_link, &net->fs_addresses); - - write_sequnlock(&net->fs_addr_lock); + afs_get_cell(cell, afs_cell_trace_get_server); exists: - afs_get_server(server, afs_server_trace_get_install); - write_sequnlock(&net->fs_lock); + afs_use_server(server, true, afs_server_trace_use_install); return server; } /* - * Allocate a new server record and mark it active. + * Allocate a new server record and mark it as active but uncreated. */ -static struct afs_server *afs_alloc_server(struct afs_cell *cell, - const uuid_t *uuid, - struct afs_addr_list *alist) +static struct afs_server *afs_alloc_server(struct afs_cell *cell, const uuid_t *uuid) { - struct afs_endpoint_state *estate; struct afs_server *server; struct afs_net *net = cell->net; @@ -203,65 +119,50 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell, server = kzalloc(sizeof(struct afs_server), GFP_KERNEL); if (!server) - goto enomem; - - estate = kzalloc(sizeof(struct afs_endpoint_state), GFP_KERNEL); - if (!estate) - goto enomem_server; + return NULL; refcount_set(&server->ref, 1); - atomic_set(&server->active, 1); + atomic_set(&server->active, 0); + __set_bit(AFS_SERVER_FL_UNCREATED, &server->flags); server->debug_id = atomic_inc_return(&afs_server_debug_id); - server->addr_version = alist->version; server->uuid = *uuid; rwlock_init(&server->fs_lock); + INIT_WORK(&server->destroyer, &afs_server_destroyer); + timer_setup(&server->timer, afs_server_timer, 0); INIT_LIST_HEAD(&server->volumes); init_waitqueue_head(&server->probe_wq); + mutex_init(&server->cm_token_lock); INIT_LIST_HEAD(&server->probe_link); + INIT_HLIST_NODE(&server->proc_link); spin_lock_init(&server->probe_lock); server->cell = cell; server->rtt = UINT_MAX; server->service_id = FS_SERVICE; - server->probe_counter = 1; server->probed_at = jiffies - LONG_MAX / 2; - refcount_set(&estate->ref, 1); - estate->addresses = alist; - estate->server_id = server->debug_id; - estate->probe_seq = 1; - rcu_assign_pointer(server->endpoint_state, estate); afs_inc_servers_outstanding(net); - trace_afs_server(server->debug_id, 1, 1, afs_server_trace_alloc); - trace_afs_estate(estate->server_id, estate->probe_seq, refcount_read(&estate->ref), - afs_estate_trace_alloc_server); _leave(" = %p", server); return server; - -enomem_server: - kfree(server); -enomem: - _leave(" = NULL [nomem]"); - return NULL; } /* * Look up an address record for a server */ -static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell, - struct key *key, const uuid_t *uuid) +static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_server *server, + struct key *key) { struct afs_vl_cursor vc; struct afs_addr_list *alist = NULL; int ret; ret = -ERESTARTSYS; - if (afs_begin_vlserver_operation(&vc, cell, key)) { + if (afs_begin_vlserver_operation(&vc, server->cell, key)) { while (afs_select_vlserver(&vc)) { if (test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags)) - alist = afs_yfsvl_get_endpoints(&vc, uuid); + alist = afs_yfsvl_get_endpoints(&vc, &server->uuid); else - alist = afs_vl_get_addrs_u(&vc, uuid); + alist = afs_vl_get_addrs_u(&vc, &server->uuid); } ret = afs_end_vlserver_operation(&vc); @@ -271,72 +172,122 @@ static struct afs_addr_list *afs_vl_lookup_addrs(struct afs_cell *cell, } /* - * Get or create a 
fileserver record. + * Get or create a fileserver record and return it with an active-use count on + * it. */ struct afs_server *afs_lookup_server(struct afs_cell *cell, struct key *key, const uuid_t *uuid, u32 addr_version) { - struct afs_addr_list *alist; - struct afs_server *server, *candidate; + struct afs_addr_list *alist = NULL; + struct afs_server *server, *candidate = NULL; + bool creating = false; + int ret; _enter("%p,%pU", cell->net, uuid); - server = afs_find_server_by_uuid(cell->net, uuid); + down_read(&cell->fs_lock); + server = afs_find_server_by_uuid(cell, uuid); + /* Won't see servers marked uncreated. */ + up_read(&cell->fs_lock); + if (server) { + timer_delete_sync(&server->timer); + if (test_bit(AFS_SERVER_FL_CREATING, &server->flags)) + goto wait_for_creation; if (server->addr_version != addr_version) set_bit(AFS_SERVER_FL_NEEDS_UPDATE, &server->flags); return server; } - alist = afs_vl_lookup_addrs(cell, key, uuid); - if (IS_ERR(alist)) - return ERR_CAST(alist); - - candidate = afs_alloc_server(cell, uuid, alist); + candidate = afs_alloc_server(cell, uuid); if (!candidate) { afs_put_addrlist(alist, afs_alist_trace_put_server_oom); return ERR_PTR(-ENOMEM); } - server = afs_install_server(cell, candidate); - if (server != candidate) { - afs_put_addrlist(alist, afs_alist_trace_put_server_dup); + down_write(&cell->fs_lock); + server = afs_install_server(cell, &candidate); + if (test_bit(AFS_SERVER_FL_CREATING, &server->flags)) { + /* We need to wait for creation to complete. */ + up_write(&cell->fs_lock); + goto wait_for_creation; + } + if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags)) { + set_bit(AFS_SERVER_FL_CREATING, &server->flags); + clear_bit(AFS_SERVER_FL_UNCREATED, &server->flags); + creating = true; + } + up_write(&cell->fs_lock); + timer_delete_sync(&server->timer); + + /* If we get to create the server, we look up the addresses and then + * immediately dispatch an asynchronous probe to each interface on the + * fileserver. This will make sure the repeat-probing service is + * started. + */ + if (creating) { + alist = afs_vl_lookup_addrs(server, key); + if (IS_ERR(alist)) { + ret = PTR_ERR(alist); + goto create_failed; + } + + ret = afs_fs_probe_fileserver(cell->net, server, alist, key); + if (ret) + goto create_failed; + + clear_and_wake_up_bit(AFS_SERVER_FL_CREATING, &server->flags); + } + +out: + afs_put_addrlist(alist, afs_alist_trace_put_server_create); + if (candidate) { + kfree(rcu_access_pointer(server->endpoint_state)); kfree(candidate); - } else { - /* Immediately dispatch an asynchronous probe to each interface - * on the fileserver. This will make sure the repeat-probing - * service is started. - */ - afs_fs_probe_fileserver(cell->net, server, alist, key); + afs_dec_servers_outstanding(cell->net); + } + return server ?: ERR_PTR(ret); + +wait_for_creation: + afs_see_server(server, afs_server_trace_wait_create); + wait_on_bit(&server->flags, AFS_SERVER_FL_CREATING, TASK_UNINTERRUPTIBLE); + if (test_bit_acquire(AFS_SERVER_FL_UNCREATED, &server->flags)) { + /* Barrier: read flag before error */ + ret = READ_ONCE(server->create_error); + afs_put_server(cell->net, server, afs_server_trace_unuse_create_fail); + server = NULL; + goto out; } - return server; -} + ret = 0; + goto out; -/* - * Set the server timer to fire after a given delay, assuming it's not already - * set for an earlier time. 
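The creation handshake in afs_lookup_server() above is a bitlock idiom: the first caller turns AFS_SERVER_FL_UNCREATED into AFS_SERVER_FL_CREATING under cell->fs_lock and performs the address lookup and probe, while every other caller parks in wait_on_bit() until clear_and_wake_up_bit() releases them. Stripped to its bones (MY_FL_* and do_creation() are hypothetical stand-ins; the real code orders the flag updates under the cell lock):

	if (test_bit(MY_FL_UNCREATED, &obj->flags)) {
		/* Winner: claim the work under the lock. */
		set_bit(MY_FL_CREATING, &obj->flags);
		clear_bit(MY_FL_UNCREATED, &obj->flags);
		do_creation(obj);
		clear_and_wake_up_bit(MY_FL_CREATING, &obj->flags);
	} else {
		/* Everyone else: sleep until the winner finishes. */
		wait_on_bit(&obj->flags, MY_FL_CREATING, TASK_UNINTERRUPTIBLE);
	}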
- */ -static void afs_set_server_timer(struct afs_net *net, time64_t delay) -{ - if (net->live) { - afs_inc_servers_outstanding(net); - if (timer_reduce(&net->fs_timer, jiffies + delay * HZ)) - afs_dec_servers_outstanding(net); +create_failed: + down_write(&cell->fs_lock); + + WRITE_ONCE(server->create_error, ret); + smp_wmb(); /* Barrier: set error before flag. */ + set_bit(AFS_SERVER_FL_UNCREATED, &server->flags); + + clear_and_wake_up_bit(AFS_SERVER_FL_CREATING, &server->flags); + + if (test_bit(AFS_SERVER_FL_UNCREATED, &server->flags)) { + clear_bit(AFS_SERVER_FL_UNCREATED, &server->flags); + creating = true; } + afs_unuse_server(cell->net, server, afs_server_trace_unuse_create_fail); + server = NULL; + + up_write(&cell->fs_lock); + goto out; } /* - * Server management timer. We have an increment on fs_outstanding that we - * need to pass along to the work item. + * Set/reduce a server's timer. */ -void afs_servers_timer(struct timer_list *timer) +static void afs_set_server_timer(struct afs_server *server, unsigned int delay_secs) { - struct afs_net *net = container_of(timer, struct afs_net, fs_timer); - - _enter(""); - if (!queue_work(afs_wq, &net->fs_manager)) - afs_dec_servers_outstanding(net); + mod_timer(&server->timer, jiffies + delay_secs * HZ); } /* @@ -355,32 +306,20 @@ struct afs_server *afs_get_server(struct afs_server *server, } /* - * Try to get a reference on a server object. + * Get an active count on a server object and maybe remove from the inactive + * list. */ -static struct afs_server *afs_maybe_use_server(struct afs_server *server, - enum afs_server_trace reason) -{ - unsigned int a; - int r; - - if (!__refcount_inc_not_zero(&server->ref, &r)) - return NULL; - - a = atomic_inc_return(&server->active); - trace_afs_server(server->debug_id, r + 1, a, reason); - return server; -} - -/* - * Get an active count on a server object. 
- */ -struct afs_server *afs_use_server(struct afs_server *server, enum afs_server_trace reason) +struct afs_server *afs_use_server(struct afs_server *server, bool activate, + enum afs_server_trace reason) { unsigned int a; int r; __refcount_inc(&server->ref, &r); a = atomic_inc_return(&server->active); + if (a == 1 && activate && + !test_bit(AFS_SERVER_FL_EXPIRED, &server->flags)) + timer_delete(&server->timer); trace_afs_server(server->debug_id, r + 1, a, reason); return server; @@ -413,13 +352,16 @@ void afs_put_server(struct afs_net *net, struct afs_server *server, void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server, enum afs_server_trace reason) { - if (server) { - unsigned int active = atomic_dec_return(&server->active); + if (!server) + return; - if (active == 0) - afs_set_server_timer(net, afs_server_gc_delay); - afs_put_server(net, server, reason); + if (atomic_dec_and_test(&server->active)) { + if (test_bit(AFS_SERVER_FL_EXPIRED, &server->flags) || + READ_ONCE(server->cell->state) >= AFS_CELL_REMOVING) + schedule_work(&server->destroyer); } + + afs_put_server(net, server, reason); } /* @@ -428,10 +370,22 @@ void afs_unuse_server_notime(struct afs_net *net, struct afs_server *server, void afs_unuse_server(struct afs_net *net, struct afs_server *server, enum afs_server_trace reason) { - if (server) { - server->unuse_time = ktime_get_real_seconds(); - afs_unuse_server_notime(net, server, reason); + if (!server) + return; + + if (atomic_dec_and_test(&server->active)) { + if (!test_bit(AFS_SERVER_FL_EXPIRED, &server->flags) && + READ_ONCE(server->cell->state) < AFS_CELL_REMOVING) { + time64_t unuse_time = ktime_get_real_seconds(); + + server->unuse_time = unuse_time; + afs_set_server_timer(server, afs_server_gc_delay); + } else { + schedule_work(&server->destroyer); + } } + + afs_put_server(net, server, reason); } static void afs_server_rcu(struct rcu_head *rcu) @@ -442,6 +396,8 @@ static void afs_server_rcu(struct rcu_head *rcu) atomic_read(&server->active), afs_server_trace_free); afs_put_endpoint_state(rcu_access_pointer(server->endpoint_state), afs_estate_trace_put_server); + afs_put_cell(server->cell, afs_cell_trace_put_server); + kfree(server->cm_rxgk_appdata.data); kfree(server); } @@ -460,159 +416,119 @@ static void afs_give_up_callbacks(struct afs_net *net, struct afs_server *server } /* - * destroy a dead server + * Check to see if the server record has expired. */ -static void afs_destroy_server(struct afs_net *net, struct afs_server *server) +static bool afs_has_server_expired(const struct afs_server *server) { - if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags)) - afs_give_up_callbacks(net, server); + time64_t expires_at; - afs_put_server(net, server, afs_server_trace_destroy); + if (atomic_read(&server->active)) + return false; + + if (!server->cell->net->live || + server->cell->state >= AFS_CELL_REMOVING) { + trace_afs_server(server->debug_id, refcount_read(&server->ref), + 0, afs_server_trace_purging); + return true; + } + + expires_at = server->unuse_time; + if (!test_bit(AFS_SERVER_FL_VL_FAIL, &server->flags) && + !test_bit(AFS_SERVER_FL_NOT_FOUND, &server->flags)) + expires_at += afs_server_gc_delay; + + return ktime_get_real_seconds() > expires_at; } /* - * Garbage collect any expired servers. + * Remove a server record from its parent cell's database.
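The lifecycle choreography above is worth tracing end to end: dropping the last active use no longer frees anything, it just arms the per-server timer; the timer punts to the destroyer work item; and only the destroyer, via afs_remove_server_from_cell() below, takes cell->fs_lock and re-checks expiry before unlinking. A condensed sketch of the chain, using only calls shown in this patch:

	/* last active put: arm the reaper instead of freeing */
	if (atomic_dec_and_test(&server->active))
		afs_set_server_timer(server, afs_server_gc_delay);

	/* timer handler: defer to process context */
	schedule_work(&server->destroyer);

	/* destroyer: re-check under the lock, then tear down */
	if (afs_remove_server_from_cell(server))
		afs_put_server(net, server, afs_server_trace_destroy);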
*/ -static void afs_gc_servers(struct afs_net *net, struct afs_server *gc_list) +static bool afs_remove_server_from_cell(struct afs_server *server) { - struct afs_server *server, *next, *prev; - int active; - - while ((server = gc_list)) { - gc_list = server->gc_next; - - write_seqlock(&net->fs_lock); - - active = atomic_read(&server->active); - if (active == 0) { - trace_afs_server(server->debug_id, refcount_read(&server->ref), - active, afs_server_trace_gc); - next = rcu_dereference_protected( - server->uuid_next, lockdep_is_held(&net->fs_lock.lock)); - prev = server->uuid_prev; - if (!prev) { - /* The one at the front is in the tree */ - if (!next) { - rb_erase(&server->uuid_rb, &net->fs_servers); - } else { - rb_replace_node_rcu(&server->uuid_rb, - &next->uuid_rb, - &net->fs_servers); - next->uuid_prev = NULL; - } - } else { - /* This server is not at the front */ - rcu_assign_pointer(prev->uuid_next, next); - if (next) - next->uuid_prev = prev; - } - - list_del(&server->probe_link); - hlist_del_rcu(&server->proc_link); - if (!hlist_unhashed(&server->addr_link)) - hlist_del_rcu(&server->addr_link); - } - write_sequnlock(&net->fs_lock); + struct afs_cell *cell = server->cell; + + down_write(&cell->fs_lock); - if (active == 0) - afs_destroy_server(net, server); + if (!afs_has_server_expired(server)) { + up_write(&cell->fs_lock); + return false; } + + set_bit(AFS_SERVER_FL_EXPIRED, &server->flags); + _debug("expire %pU %u", &server->uuid, atomic_read(&server->active)); + afs_see_server(server, afs_server_trace_see_expired); + rb_erase(&server->uuid_rb, &cell->fs_servers); + up_write(&cell->fs_lock); + return true; } -/* - * Manage the records of servers known to be within a network namespace. This - * includes garbage collecting unused servers. - * - * Note also that we were given an increment on net->servers_outstanding by - * whoever queued us that we need to deal with before returning. - */ -void afs_manage_servers(struct work_struct *work) +static void afs_server_destroyer(struct work_struct *work) { - struct afs_net *net = container_of(work, struct afs_net, fs_manager); - struct afs_server *gc_list = NULL; - struct rb_node *cursor; - time64_t now = ktime_get_real_seconds(), next_manage = TIME64_MAX; - bool purging = !net->live; - - _enter(""); + struct afs_endpoint_state *estate; + struct afs_server *server = container_of(work, struct afs_server, destroyer); + struct afs_net *net = server->cell->net; - /* Trawl the server list looking for servers that have expired from - * lack of use. 
- */ - read_seqlock_excl(&net->fs_lock); + afs_see_server(server, afs_server_trace_see_destroyer); - for (cursor = rb_first(&net->fs_servers); cursor; cursor = rb_next(cursor)) { - struct afs_server *server = - rb_entry(cursor, struct afs_server, uuid_rb); - int active = atomic_read(&server->active); + if (test_bit(AFS_SERVER_FL_EXPIRED, &server->flags)) + return; - _debug("manage %pU %u", &server->uuid, active); + if (!afs_remove_server_from_cell(server)) + return; - if (purging) { - trace_afs_server(server->debug_id, refcount_read(&server->ref), - active, afs_server_trace_purging); - if (active != 0) - pr_notice("Can't purge s=%08x\n", server->debug_id); - } + timer_shutdown_sync(&server->timer); + cancel_work(&server->destroyer); - if (active == 0) { - time64_t expire_at = server->unuse_time; - - if (!test_bit(AFS_SERVER_FL_VL_FAIL, &server->flags) && - !test_bit(AFS_SERVER_FL_NOT_FOUND, &server->flags)) - expire_at += afs_server_gc_delay; - if (purging || expire_at <= now) { - server->gc_next = gc_list; - gc_list = server; - } else if (expire_at < next_manage) { - next_manage = expire_at; - } - } - } + if (test_bit(AFS_SERVER_FL_MAY_HAVE_CB, &server->flags)) + afs_give_up_callbacks(net, server); - read_sequnlock_excl(&net->fs_lock); + /* Unbind the rxrpc_peer records from the server. */ + estate = rcu_access_pointer(server->endpoint_state); + if (estate) + afs_set_peer_appdata(server, estate->addresses, NULL); - /* Update the timer on the way out. We have to pass an increment on - * servers_outstanding in the namespace that we are in to the timer or - * the work scheduler. - */ - if (!purging && next_manage < TIME64_MAX) { - now = ktime_get_real_seconds(); + write_seqlock(&net->fs_lock); + list_del_init(&server->probe_link); + if (!hlist_unhashed(&server->proc_link)) + hlist_del_rcu(&server->proc_link); + write_sequnlock(&net->fs_lock); - if (next_manage - now <= 0) { - if (queue_work(afs_wq, &net->fs_manager)) - afs_inc_servers_outstanding(net); - } else { - afs_set_server_timer(net, next_manage - now); - } - } + afs_put_server(net, server, afs_server_trace_destroy); +} - afs_gc_servers(net, gc_list); +static void afs_server_timer(struct timer_list *timer) +{ + struct afs_server *server = container_of(timer, struct afs_server, timer); - afs_dec_servers_outstanding(net); - _leave(" [%d]", atomic_read(&net->servers_outstanding)); + afs_see_server(server, afs_server_trace_see_timer); + if (!test_bit(AFS_SERVER_FL_EXPIRED, &server->flags)) + schedule_work(&server->destroyer); } -static void afs_queue_server_manager(struct afs_net *net) +/* + * Wake up all the servers in a cell so that they can purge themselves. + */ +void afs_purge_servers(struct afs_cell *cell) { - afs_inc_servers_outstanding(net); - if (!queue_work(afs_wq, &net->fs_manager)) - afs_dec_servers_outstanding(net); + struct afs_server *server; + struct rb_node *rb; + + down_read(&cell->fs_lock); + for (rb = rb_first(&cell->fs_servers); rb; rb = rb_next(rb)) { + server = rb_entry(rb, struct afs_server, uuid_rb); + afs_see_server(server, afs_server_trace_see_purge); + schedule_work(&server->destroyer); + } + up_read(&cell->fs_lock); } /* - * Purge list of servers. + * Wait for outstanding servers. 
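afs_wait_for_servers() below is the usual biased-counter quiesce pattern: net->servers_outstanding is initialised to 1, each live server record holds one unit, and teardown drops the bias and sleeps until the counter drains. The decrement side (afs_dec_servers_outstanding(), not shown in this hunk) has to pair the final decrement with a wake-up, roughly:

	if (atomic_dec_and_test(&net->servers_outstanding))
		wake_up_var(&net->servers_outstanding);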
*/ -void afs_purge_servers(struct afs_net *net) +void afs_wait_for_servers(struct afs_net *net) { _enter(""); - if (del_timer_sync(&net->fs_timer)) - afs_dec_servers_outstanding(net); - - afs_queue_server_manager(net); - - _debug("wait"); atomic_dec(&net->servers_outstanding); wait_var_event(&net->servers_outstanding, !atomic_read(&net->servers_outstanding)); @@ -636,7 +552,7 @@ static noinline bool afs_update_server_record(struct afs_operation *op, atomic_read(&server->active), afs_server_trace_update); - alist = afs_vl_lookup_addrs(op->volume->cell, op->key, &server->uuid); + alist = afs_vl_lookup_addrs(server, op->key); if (IS_ERR(alist)) { rcu_read_lock(); estate = rcu_dereference(server->endpoint_state); diff --git a/fs/afs/server_list.c b/fs/afs/server_list.c index 7e7e567a7f8a..20d5474837df 100644 --- a/fs/afs/server_list.c +++ b/fs/afs/server_list.c @@ -16,7 +16,7 @@ void afs_put_serverlist(struct afs_net *net, struct afs_server_list *slist) if (slist && refcount_dec_and_test(&slist->usage)) { for (i = 0; i < slist->nr_servers; i++) afs_unuse_server(net, slist->servers[i].server, - afs_server_trace_put_slist); + afs_server_trace_unuse_slist); kfree_rcu(slist, rcu); } } @@ -97,8 +97,8 @@ struct afs_server_list *afs_alloc_server_list(struct afs_volume *volume, break; if (j < slist->nr_servers) { if (slist->servers[j].server == server) { - afs_put_server(volume->cell->net, server, - afs_server_trace_put_slist_isort); + afs_unuse_server_notime(volume->cell->net, server, + afs_server_trace_unuse_slist_isort); continue; } diff --git a/fs/afs/super.c b/fs/afs/super.c index a9bee610674e..25b306db6992 100644 --- a/fs/afs/super.c +++ b/fs/afs/super.c @@ -194,8 +194,6 @@ static int afs_show_options(struct seq_file *m, struct dentry *root) if (as->dyn_root) seq_puts(m, ",dyn"); - if (test_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(d_inode(root))->flags)) - seq_puts(m, ",autocell"); switch (as->flock_mode) { case afs_flock_mode_unset: break; case afs_flock_mode_local: p = "local"; break; @@ -292,13 +290,14 @@ static int afs_parse_source(struct fs_context *fc, struct fs_parameter *param) /* lookup the cell record */ if (cellname) { cell = afs_lookup_cell(ctx->net, cellname, cellnamesz, - NULL, false); + NULL, false, + afs_cell_trace_use_lookup_mount); if (IS_ERR(cell)) { pr_err("kAFS: unable to lookup cell '%*.*s'\n", cellnamesz, cellnamesz, cellname ?: ""); return PTR_ERR(cell); } - afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_parse); + afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_parse); afs_see_cell(cell, afs_cell_trace_see_source); ctx->cell = cell; } @@ -395,7 +394,7 @@ static int afs_validate_fc(struct fs_context *fc) ctx->key = NULL; cell = afs_use_cell(ctx->cell->alias_of, afs_cell_trace_use_fc_alias); - afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc); + afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_fc); ctx->cell = cell; goto reget_key; } @@ -468,7 +467,7 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) /* allocate the root inode and dentry */ if (as->dyn_root) { - inode = afs_iget_pseudo_dir(sb, true); + inode = afs_dynroot_iget_root(sb); } else { sprintf(sb->s_id, "%llu", as->volume->vid); afs_activate_volume(as->volume); @@ -478,9 +477,6 @@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) if (IS_ERR(inode)) return PTR_ERR(inode); - if (ctx->autocell || as->dyn_root) - set_bit(AFS_VNODE_AUTOCELL, &AFS_FS_I(inode)->flags); - ret = -ENOMEM; sb->s_root = d_make_root(inode); if (!sb->s_root) @@ -488,9 +484,6 
@@ static int afs_fill_super(struct super_block *sb, struct afs_fs_context *ctx) if (as->dyn_root) { sb->s_d_op = &afs_dynroot_dentry_operations; - ret = afs_dynroot_populate(sb); - if (ret < 0) - goto error; } else { sb->s_d_op = &afs_fs_dentry_operations; rcu_assign_pointer(as->volume->sb, sb); @@ -527,9 +520,8 @@ static struct afs_super_info *afs_alloc_sbi(struct fs_context *fc) static void afs_destroy_sbi(struct afs_super_info *as) { if (as) { - struct afs_net *net = afs_net(as->net_ns); afs_put_volume(as->volume, afs_volume_trace_put_destroy_sbi); - afs_unuse_cell(net, as->cell, afs_cell_trace_unuse_sbi); + afs_unuse_cell(as->cell, afs_cell_trace_unuse_sbi); put_net(as->net_ns); kfree(as); } @@ -539,9 +531,6 @@ static void afs_kill_super(struct super_block *sb) { struct afs_super_info *as = AFS_FS_S(sb); - if (as->dyn_root) - afs_dynroot_depopulate(sb); - /* Clear the callback interests (which will do ilookup5) before * deactivating the superblock. */ @@ -615,7 +604,7 @@ static void afs_free_fc(struct fs_context *fc) afs_destroy_sbi(fc->s_fs_info); afs_put_volume(ctx->volume, afs_volume_trace_put_free_fc); - afs_unuse_cell(ctx->net, ctx->cell, afs_cell_trace_unuse_fc); + afs_unuse_cell(ctx->cell, afs_cell_trace_unuse_fc); key_put(ctx->key); kfree(ctx); } diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c index f9e76b604f31..709b4cdb723e 100644 --- a/fs/afs/vl_alias.c +++ b/fs/afs/vl_alias.c @@ -205,11 +205,11 @@ static int afs_query_for_alias(struct afs_cell *cell, struct key *key) goto is_alias; if (mutex_lock_interruptible(&cell->net->proc_cells_lock) < 0) { - afs_unuse_cell(cell->net, p, afs_cell_trace_unuse_check_alias); + afs_unuse_cell(p, afs_cell_trace_unuse_check_alias); return -ERESTARTSYS; } - afs_unuse_cell(cell->net, p, afs_cell_trace_unuse_check_alias); + afs_unuse_cell(p, afs_cell_trace_unuse_check_alias); } mutex_unlock(&cell->net->proc_cells_lock); @@ -269,7 +269,8 @@ static int yfs_check_canonical_cell_name(struct afs_cell *cell, struct key *key) if (!name_len || name_len > AFS_MAXCELLNAME) master = ERR_PTR(-EOPNOTSUPP); else - master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, false); + master = afs_lookup_cell(cell->net, cell_name, name_len, NULL, false, + afs_cell_trace_use_lookup_canonical); kfree(cell_name); if (IS_ERR(master)) return PTR_ERR(master); diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c index d8f79f6ada3d..6ad9688d8f4b 100644 --- a/fs/afs/vl_rotate.c +++ b/fs/afs/vl_rotate.c @@ -48,7 +48,7 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc) cell->dns_expiry <= ktime_get_real_seconds()) { dns_lookup_count = smp_load_acquire(&cell->dns_lookup_count); set_bit(AFS_CELL_FL_DO_LOOKUP, &cell->flags); - afs_queue_cell(cell, afs_cell_trace_get_queue_dns); + afs_queue_cell(cell, afs_cell_trace_queue_dns); if (cell->dns_source == DNS_RECORD_UNAVAILABLE) { if (wait_var_event_interruptible( diff --git a/fs/afs/volume.c b/fs/afs/volume.c index af3a3f57c1b3..0efff3d25133 100644 --- a/fs/afs/volume.c +++ b/fs/afs/volume.c @@ -10,6 +10,7 @@ #include "internal.h" static unsigned __read_mostly afs_volume_record_life = 60 * 60; +static atomic_t afs_volume_debug_id; static void afs_destroy_volume(struct work_struct *work); @@ -59,7 +60,7 @@ static void afs_remove_volume_from_cell(struct afs_volume *volume) struct afs_cell *cell = volume->cell; if (!hlist_unhashed(&volume->proc_link)) { - trace_afs_volume(volume->vid, refcount_read(&cell->ref), + trace_afs_volume(volume->debug_id, volume->vid, refcount_read(&volume->ref), 
afs_volume_trace_remove); write_seqlock(&cell->volume_lock); hlist_del_rcu(&volume->proc_link); @@ -84,6 +85,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, if (!volume) goto error_0; + volume->debug_id = atomic_inc_return(&afs_volume_debug_id); volume->vid = vldb->vid[params->type]; volume->update_at = ktime_get_real_seconds() + afs_volume_record_life; volume->cell = afs_get_cell(params->cell, afs_cell_trace_get_vol); @@ -115,7 +117,7 @@ static struct afs_volume *afs_alloc_volume(struct afs_fs_context *params, *_slist = slist; rcu_assign_pointer(volume->servers, slist); - trace_afs_volume(volume->vid, 1, afs_volume_trace_alloc); + trace_afs_volume(volume->debug_id, volume->vid, 1, afs_volume_trace_alloc); return volume; error_1: @@ -247,7 +249,7 @@ static void afs_destroy_volume(struct work_struct *work) afs_remove_volume_from_cell(volume); afs_put_serverlist(volume->cell->net, slist); afs_put_cell(volume->cell, afs_cell_trace_put_vol); - trace_afs_volume(volume->vid, refcount_read(&volume->ref), + trace_afs_volume(volume->debug_id, volume->vid, refcount_read(&volume->ref), afs_volume_trace_free); kfree_rcu(volume, rcu); @@ -262,7 +264,7 @@ bool afs_try_get_volume(struct afs_volume *volume, enum afs_volume_trace reason) int r; if (__refcount_inc_not_zero(&volume->ref, &r)) { - trace_afs_volume(volume->vid, r + 1, reason); + trace_afs_volume(volume->debug_id, volume->vid, r + 1, reason); return true; } return false; @@ -278,7 +280,7 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume, int r; __refcount_inc(&volume->ref, &r); - trace_afs_volume(volume->vid, r + 1, reason); + trace_afs_volume(volume->debug_id, volume->vid, r + 1, reason); } return volume; } @@ -290,12 +292,13 @@ struct afs_volume *afs_get_volume(struct afs_volume *volume, void afs_put_volume(struct afs_volume *volume, enum afs_volume_trace reason) { if (volume) { + unsigned int debug_id = volume->debug_id; afs_volid_t vid = volume->vid; bool zero; int r; zero = __refcount_dec_and_test(&volume->ref, &r); - trace_afs_volume(vid, r - 1, reason); + trace_afs_volume(debug_id, vid, r - 1, reason); if (zero) schedule_work(&volume->destructor); } diff --git a/fs/aio.c b/fs/aio.c @@ -1511,6 +1511,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb, int rw_type) { int ret; + req->ki_write_stream = 0; req->ki_complete = aio_complete_rw; req->private = NULL; req->ki_pos = iocb->aio_offset; diff --git a/fs/anon_inodes.c b/fs/anon_inodes.c index 583ac81669c2..e51e7d88980a 100644 --- a/fs/anon_inodes.c +++ b/fs/anon_inodes.c @@ -24,10 +24,51 @@ #include <linux/uaccess.h> +#include "internal.h" + static struct vfsmount *anon_inode_mnt __ro_after_init; static struct inode *anon_inode_inode __ro_after_init; /* + * User space expects anonymous inodes to have no file type in st_mode. + * + * In particular, 'lsof' has this legacy logic: + * + * type = s->st_mode & S_IFMT; + * switch (type) { + * ... + * case 0: + * if (!strcmp(p, "anon_inode")) + * Lf->ntype = Ntype = N_ANON_INODE; + * + * to detect our old anon_inode logic. + * + * Rather than mess with our internal sane inode data, just fix it + * up here in getattr() by masking off the format bits.
+ */ +int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags) +{ + struct inode *inode = d_inode(path->dentry); + + generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); + stat->mode &= ~S_IFMT; + return 0; +} + +int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr) +{ + return -EOPNOTSUPP; +} + +static const struct inode_operations anon_inode_operations = { + .getattr = anon_inode_getattr, + .setattr = anon_inode_setattr, +}; + +/* * anon_inodefs_dname() is called from d_path(). */ static char *anon_inodefs_dname(struct dentry *dentry, char *buffer, int buflen) @@ -45,6 +86,8 @@ static int anon_inodefs_init_fs_context(struct fs_context *fc) struct pseudo_fs_context *ctx = init_pseudo(fc, ANON_INODE_FS_MAGIC); if (!ctx) return -ENOMEM; + fc->s_iflags |= SB_I_NOEXEC; + fc->s_iflags |= SB_I_NODEV; ctx->dops = &anon_inodefs_dentry_operations; return 0; } @@ -66,6 +109,7 @@ static struct inode *anon_inode_make_secure_inode( if (IS_ERR(inode)) return inode; inode->i_flags &= ~S_PRIVATE; + inode->i_op = &anon_inode_operations; error = security_inode_init_security_anon(inode, &QSTR(name), context_inode); if (error) { @@ -313,6 +357,7 @@ static int __init anon_inode_init(void) anon_inode_inode = alloc_anon_inode(anon_inode_mnt->mnt_sb); if (IS_ERR(anon_inode_inode)) panic("anon_inode_init() inode allocation failed (%ld)\n", PTR_ERR(anon_inode_inode)); + anon_inode_inode->i_op = &anon_inode_operations; return 0; } diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 77c7991d89aa..23cea74f9933 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -218,6 +218,8 @@ void autofs_clean_ino(struct autofs_info *); static inline int autofs_check_pipe(struct file *pipe) { + if (pipe->f_mode & FMODE_PATH) + return -EINVAL; if (!(pipe->f_mode & FMODE_CAN_WRITE)) return -EINVAL; if (!S_ISFIFO(file_inode(pipe)->i_mode)) diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index 6d57efbb8110..d8dd150cbd74 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -442,7 +442,6 @@ static int autofs_dev_ioctl_timeout(struct file *fp, sbi->exp_timeout = timeout * HZ; } else { struct dentry *base = fp->f_path.dentry; - struct inode *inode = base->d_inode; int path_len = param->size - AUTOFS_DEV_IOCTL_SIZE - 1; struct dentry *dentry; struct autofs_info *ino; @@ -460,9 +459,8 @@ static int autofs_dev_ioctl_timeout(struct file *fp, "the parent autofs mount timeout which could " "prevent shutdown\n"); - inode_lock_shared(inode); - dentry = try_lookup_one_len(param->path, base, path_len); - inode_unlock_shared(inode); + dentry = try_lookup_noperm(&QSTR_LEN(param->path, path_len), + base); if (IS_ERR_OR_NULL(dentry)) return dentry ? 
PTR_ERR(dentry) : -ENOENT; ino = autofs_dentry_ino(dentry); diff --git a/fs/autofs/root.c b/fs/autofs/root.c index 530d18827e35..174c7205fee4 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -15,8 +15,8 @@ static int autofs_dir_symlink(struct mnt_idmap *, struct inode *, struct dentry *, const char *); static int autofs_dir_unlink(struct inode *, struct dentry *); static int autofs_dir_rmdir(struct inode *, struct dentry *); -static int autofs_dir_mkdir(struct mnt_idmap *, struct inode *, - struct dentry *, umode_t); +static struct dentry *autofs_dir_mkdir(struct mnt_idmap *, struct inode *, + struct dentry *, umode_t); static long autofs_root_ioctl(struct file *, unsigned int, unsigned long); #ifdef CONFIG_COMPAT static long autofs_root_compat_ioctl(struct file *, @@ -720,9 +720,9 @@ static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) return 0; } -static int autofs_dir_mkdir(struct mnt_idmap *idmap, - struct inode *dir, struct dentry *dentry, - umode_t mode) +static struct dentry *autofs_dir_mkdir(struct mnt_idmap *idmap, + struct inode *dir, struct dentry *dentry, + umode_t mode) { struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); struct autofs_info *ino = autofs_dentry_ino(dentry); @@ -739,7 +739,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap, inode = autofs_get_inode(dir->i_sb, S_IFDIR | mode); if (!inode) - return -ENOMEM; + return ERR_PTR(-ENOMEM); d_add(dentry, inode); if (sbi->version < 5) @@ -751,7 +751,7 @@ static int autofs_dir_mkdir(struct mnt_idmap *idmap, inc_nlink(dir); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); - return 0; + return NULL; } /* Get/set timeout ioctl() operation */ diff --git a/fs/bad_inode.c b/fs/bad_inode.c index 316d88da2ce1..0ef9bcb744dd 100644 --- a/fs/bad_inode.c +++ b/fs/bad_inode.c @@ -58,10 +58,10 @@ static int bad_inode_symlink(struct mnt_idmap *idmap, return -EIO; } -static int bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *bad_inode_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { - return -EIO; + return ERR_PTR(-EIO); } static int bad_inode_rmdir (struct inode *dir, struct dentry *dentry) diff --git a/fs/bcachefs/Kconfig b/fs/bcachefs/Kconfig index fc7efd0a7525..8cb2b9d5da96 100644 --- a/fs/bcachefs/Kconfig +++ b/fs/bcachefs/Kconfig @@ -4,7 +4,7 @@ config BCACHEFS_FS depends on BLOCK select EXPORTFS select CLOSURES - select LIBCRC32C + select CRC32 select CRC64 select FS_POSIX_ACL select LZ4_COMPRESS @@ -15,10 +15,9 @@ config BCACHEFS_FS select ZLIB_INFLATE select ZSTD_COMPRESS select ZSTD_DECOMPRESS - select CRYPTO - select CRYPTO_SHA256 - select CRYPTO_CHACHA20 - select CRYPTO_POLY1305 + select CRYPTO_LIB_SHA256 + select CRYPTO_LIB_CHACHA + select CRYPTO_LIB_POLY1305 select KEYS select RAID6_PQ select XOR_BLOCKS @@ -26,6 +25,7 @@ config BCACHEFS_FS select SRCU select SYMBOLIC_ERRNAME select MIN_HEAP + select XARRAY_MULTI help The bcachefs filesystem - a modern, copy on write filesystem, with support for multiple devices, compression, checksumming, etc. @@ -103,6 +103,14 @@ config BCACHEFS_PATH_TRACEPOINTS Enable extra tracepoints for debugging btree_path operations; we don't normally want these enabled because they happen at very high rates. 
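[Editor's note] The autofs and bad_inode hunks above are part of the tree-wide conversion of the ->mkdir inode operation from returning int to returning struct dentry *. As an illustrative sketch only (examplefs and examplefs_get_inode are hypothetical names, not part of this diff), a converted mkdir follows the shape the autofs conversion above suggests: ERR_PTR() on failure, NULL when the dentry passed in was instantiated and used as-is.

static struct dentry *examplefs_mkdir(struct mnt_idmap *idmap,
				      struct inode *dir, struct dentry *dentry,
				      umode_t mode)
{
	/* examplefs_get_inode() is a stand-in for a filesystem's inode allocator */
	struct inode *inode = examplefs_get_inode(dir->i_sb, S_IFDIR | mode);

	if (!inode)
		return ERR_PTR(-ENOMEM);	/* errors are now ERR_PTR(), not negative ints */

	d_add(dentry, inode);
	inc_nlink(dir);
	return NULL;	/* NULL: the caller's dentry was used unchanged */
}

Under this convention a non-NULL return would hand the caller a different dentry than the one passed in, which is why the conversions above map a zero error code to ERR_PTR(0) == NULL.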
+config BCACHEFS_TRANS_KMALLOC_TRACE + bool "Trace bch2_trans_kmalloc() calls" + depends on BCACHEFS_FS + +config BCACHEFS_ASYNC_OBJECT_LISTS + bool "Keep async objects on fast_lists for debugfs visibility" + depends on BCACHEFS_FS && DEBUG_FS + config MEAN_AND_VARIANCE_UNIT_TEST tristate "mean_and_variance unit tests" if !KUNIT_ALL_TESTS depends on KUNIT diff --git a/fs/bcachefs/Makefile b/fs/bcachefs/Makefile index d2689388d5e8..93c8ee5425c8 100644 --- a/fs/bcachefs/Makefile +++ b/fs/bcachefs/Makefile @@ -35,13 +35,14 @@ bcachefs-y := \ disk_accounting.o \ disk_groups.o \ ec.o \ + enumerated_ref.o \ errcode.o \ error.o \ extents.o \ extent_update.o \ eytzinger.o \ + fast_list.o \ fs.o \ - fs-common.o \ fs-ioctl.o \ fs-io.o \ fs-io-buffered.o \ @@ -64,9 +65,11 @@ bcachefs-y := \ migrate.o \ move.o \ movinggc.o \ + namei.o \ nocow_locking.o \ opts.o \ printbuf.o \ + progress.o \ quota.o \ rebalance.o \ rcu_pending.o \ @@ -96,6 +99,8 @@ bcachefs-y := \ varint.o \ xattr.o +bcachefs-$(CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS) += async_objs.o + obj-$(CONFIG_MEAN_AND_VARIANCE_UNIT_TEST) += mean_and_variance_test.o # Silence "note: xyz changed in GCC X.X" messages diff --git a/fs/bcachefs/acl.c b/fs/bcachefs/acl.c index 99487727ae64..d03adc36100e 100644 --- a/fs/bcachefs/acl.c +++ b/fs/bcachefs/acl.c @@ -273,7 +273,7 @@ struct posix_acl *bch2_get_acl(struct inode *vinode, int type, bool rcu) struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode); struct xattr_search_key search = X_SEARCH(acl_to_xattr_type(type), "", 0); - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; struct posix_acl *acl = NULL; if (rcu) @@ -344,7 +344,7 @@ int bch2_set_acl(struct mnt_idmap *idmap, { struct bch_inode_info *inode = to_bch_ei(dentry->d_inode); struct bch_fs *c = inode->v.i_sb->s_fs_info; - struct btree_iter inode_iter = { NULL }; + struct btree_iter inode_iter = {}; struct bch_inode_unpacked inode_u; struct posix_acl *acl; umode_t mode; diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 3ea809990ef1..173e81c2bbcb 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -17,6 +17,7 @@ #include "debug.h" #include "disk_accounting.h" #include "ec.h" +#include "enumerated_ref.h" #include "error.h" #include "lru.h" #include "recovery.h" @@ -232,7 +233,7 @@ int bch2_alloc_v3_validate(struct bch_fs *c, struct bkey_s_c k, int ret = 0; bkey_fsck_err_on(bch2_alloc_unpack_v3(&u, k), - c, alloc_v2_unpack_error, + c, alloc_v3_unpack_error, "unpack error"); fsck_err: return ret; @@ -308,7 +309,8 @@ int bch2_alloc_v4_validate(struct bch_fs *c, struct bkey_s_c k, "data type inconsistency"); bkey_fsck_err_on(!a.io_time[READ] && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_alloc_to_lru_refs, + !(c->recovery.passes_to_run & + BIT_ULL(BCH_RECOVERY_PASS_check_alloc_to_lru_refs)), c, alloc_key_cached_but_read_time_zero, "cached bucket with read_time == 0"); break; @@ -478,12 +480,27 @@ struct bkey_i_alloc_v4 *bch2_trans_start_alloc_update(struct btree_trans *trans, enum btree_iter_update_trigger_flags flags) { struct btree_iter iter; - struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update_noupdate(trans, &iter, pos); - int ret = PTR_ERR_OR_ZERO(a); - if (ret) + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, pos, + BTREE_ITER_with_updates| + BTREE_ITER_cached| + BTREE_ITER_intent); + int ret = bkey_err(k); + if (unlikely(ret)) return ERR_PTR(ret); - ret = 
bch2_trans_update(trans, &iter, &a->k_i, flags); + if ((void *) k.v >= trans->mem && + (void *) k.v < trans->mem + trans->mem_top) { + bch2_trans_iter_exit(trans, &iter); + return container_of(bkey_s_c_to_alloc_v4(k).v, struct bkey_i_alloc_v4, v); + } + + struct bkey_i_alloc_v4 *a = bch2_alloc_to_v4_mut_inlined(trans, k); + if (IS_ERR(a)) { + bch2_trans_iter_exit(trans, &iter); + return a; + } + + ret = bch2_trans_update_ip(trans, &iter, &a->k_i, flags, _RET_IP_); bch2_trans_iter_exit(trans, &iter); return unlikely(ret) ? ERR_PTR(ret) : a; } @@ -589,6 +606,8 @@ iter_err: int bch2_alloc_read(struct bch_fs *c) { + down_read(&c->state_lock); + struct btree_trans *trans = bch2_trans_get(c); struct bch_dev *ca = NULL; int ret; @@ -608,7 +627,7 @@ int bch2_alloc_read(struct bch_fs *c) * bch2_check_alloc_key() which runs later: */ if (!ca) { - bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); continue; } @@ -629,17 +648,17 @@ int bch2_alloc_read(struct bch_fs *c) * bch2_check_alloc_key() which runs later: */ if (!ca) { - bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); continue; } if (k.k->p.offset < ca->mi.first_bucket) { - bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode, ca->mi.first_bucket)); + bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode, ca->mi.first_bucket)); continue; } if (k.k->p.offset >= ca->mi.nbuckets) { - bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); continue; } @@ -652,6 +671,7 @@ int bch2_alloc_read(struct bch_fs *c) bch2_dev_put(ca); bch2_trans_put(trans); + up_read(&c->state_lock); bch_err_fn(c, ret); return ret; } @@ -673,8 +693,7 @@ static int __need_discard_or_freespace_err(struct btree_trans *trans, bch2_bkey_val_to_text(&buf, c, alloc_k); int ret = __bch2_fsck_err(NULL, trans, flags, err_id, - "bucket incorrectly %sset in %s btree\n" - " %s", + "bucket incorrectly %sset in %s btree\n%s", set ? 
"" : "un", bch2_btree_id_str(btree), buf.buf); @@ -777,14 +796,12 @@ static inline int bch2_dev_data_type_accounting_mod(struct btree_trans *trans, s s64 delta_sectors, s64 delta_fragmented, unsigned flags) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_dev_data_type, - .dev_data_type.dev = ca->dev_idx, - .dev_data_type.data_type = data_type, - }; s64 d[3] = { delta_buckets, delta_sectors, delta_fragmented }; - return bch2_disk_accounting_mod(trans, &acc, d, 3, flags & BTREE_TRIGGER_gc); + return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, + d, dev_data_type, + .dev = ca->dev_idx, + .data_type = data_type); } int bch2_alloc_key_to_dev_counters(struct btree_trans *trans, struct bch_dev *ca, @@ -837,7 +854,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p); if (!ca) - return -EIO; + return -BCH_ERR_trigger_alloc; struct bch_alloc_v4 old_a_convert; const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); @@ -871,6 +888,9 @@ int bch2_trigger_alloc(struct btree_trans *trans, if (data_type_is_empty(new_a->data_type) && BCH_ALLOC_V4_NEED_INC_GEN(new_a) && !bch2_bucket_is_open_safe(c, new.k->p.inode, new.k->p.offset)) { + if (new_a->oldest_gen == new_a->gen && + !bch2_bucket_sectors_total(*new_a)) + new_a->oldest_gen++; new_a->gen++; SET_BCH_ALLOC_V4_NEED_INC_GEN(new_a, false); alloc_data_type_set(new_a, new_a->data_type); @@ -889,26 +909,20 @@ int bch2_trigger_alloc(struct btree_trans *trans, !new_a->io_time[READ]) new_a->io_time[READ] = bch2_current_io_time(c, READ); - u64 old_lru = alloc_lru_idx_read(*old_a); - u64 new_lru = alloc_lru_idx_read(*new_a); - if (old_lru != new_lru) { - ret = bch2_lru_change(trans, new.k->p.inode, - bucket_to_u64(new.k->p), - old_lru, new_lru); - if (ret) - goto err; - } + ret = bch2_lru_change(trans, new.k->p.inode, + bucket_to_u64(new.k->p), + alloc_lru_idx_read(*old_a), + alloc_lru_idx_read(*new_a)); + if (ret) + goto err; - old_lru = alloc_lru_idx_fragmentation(*old_a, ca); - new_lru = alloc_lru_idx_fragmentation(*new_a, ca); - if (old_lru != new_lru) { - ret = bch2_lru_change(trans, - BCH_LRU_FRAGMENTATION_START, - bucket_to_u64(new.k->p), - old_lru, new_lru); - if (ret) - goto err; - } + ret = bch2_lru_change(trans, + BCH_LRU_BUCKET_FRAGMENTATION, + bucket_to_u64(new.k->p), + alloc_lru_idx_fragmentation(*old_a, ca), + alloc_lru_idx_fragmentation(*new_a, ca)); + if (ret) + goto err; if (old_a->gen != new_a->gen) { ret = bch2_bucket_gen_update(trans, new.k->p, new_a->gen); @@ -916,15 +930,6 @@ int bch2_trigger_alloc(struct btree_trans *trans, goto err; } - if ((flags & BTREE_TRIGGER_bucket_invalidate) && - old_a->cached_sectors) { - ret = bch2_mod_dev_cached_sectors(trans, ca->dev_idx, - -((s64) old_a->cached_sectors), - flags & BTREE_TRIGGER_gc); - if (ret) - goto err; - } - ret = bch2_alloc_key_to_dev_counters(trans, ca, old_a, new_a, flags); if (ret) goto err; @@ -1032,9 +1037,9 @@ fsck_err: bch2_dev_put(ca); return ret; invalid_bucket: - bch2_fs_inconsistent(c, "reference to invalid bucket\n %s", + bch2_fs_inconsistent(c, "reference to invalid bucket\n%s", (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)); - ret = -EIO; + ret = -BCH_ERR_trigger_alloc; goto err; } @@ -1042,9 +1047,10 @@ invalid_bucket: * This synthesizes deleted extents for holes, similar to BTREE_ITER_slots for * extents style btrees, but works on non-extents btrees: */ -static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos end, struct bkey *hole) 
+static struct bkey_s_c bch2_get_key_or_hole(struct btree_trans *trans, struct btree_iter *iter, + struct bpos end, struct bkey *hole) { - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); if (bkey_err(k)) return k; @@ -1055,9 +1061,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos struct btree_iter iter2; struct bpos next; - bch2_trans_copy_iter(&iter2, iter); + bch2_trans_copy_iter(trans, &iter2, iter); - struct btree_path *path = btree_iter_path(iter->trans, iter); + struct btree_path *path = btree_iter_path(trans, iter); if (!bpos_eq(path->l[0].b->key.k.p, SPOS_MAX)) end = bkey_min(end, bpos_nosnap_successor(path->l[0].b->key.k.p)); @@ -1067,9 +1073,9 @@ static struct bkey_s_c bch2_get_key_or_hole(struct btree_iter *iter, struct bpos * btree node min/max is a closed interval, upto takes a half * open interval: */ - k = bch2_btree_iter_peek_max(&iter2, end); + k = bch2_btree_iter_peek_max(trans, &iter2, end); next = iter2.pos; - bch2_trans_iter_exit(iter->trans, &iter2); + bch2_trans_iter_exit(trans, &iter2); BUG_ON(next.offset >= iter->pos.offset + U32_MAX); @@ -1110,13 +1116,14 @@ static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *buck return *ca != NULL; } -static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_iter *iter, - struct bch_dev **ca, struct bkey *hole) +static struct bkey_s_c bch2_get_key_or_real_bucket_hole(struct btree_trans *trans, + struct btree_iter *iter, + struct bch_dev **ca, struct bkey *hole) { - struct bch_fs *c = iter->trans->c; + struct bch_fs *c = trans->c; struct bkey_s_c k; again: - k = bch2_get_key_or_hole(iter, POS_MAX, hole); + k = bch2_get_key_or_hole(trans, iter, POS_MAX, hole); if (bkey_err(k)) return k; @@ -1129,7 +1136,7 @@ again: if (!next_bucket(c, ca, &hole_start)) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, hole_start); + bch2_btree_iter_set_pos(trans, iter, hole_start); goto again; } @@ -1170,8 +1177,8 @@ int bch2_check_alloc_key(struct btree_trans *trans, a = bch2_alloc_to_v4(alloc_k, &a_convert); - bch2_btree_iter_set_pos(discard_iter, alloc_k.k->p); - k = bch2_btree_iter_peek_slot(discard_iter); + bch2_btree_iter_set_pos(trans, discard_iter, alloc_k.k->p); + k = bch2_btree_iter_peek_slot(trans, discard_iter); ret = bkey_err(k); if (ret) goto err; @@ -1184,8 +1191,8 @@ int bch2_check_alloc_key(struct btree_trans *trans, goto err; } - bch2_btree_iter_set_pos(freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); - k = bch2_btree_iter_peek_slot(freespace_iter); + bch2_btree_iter_set_pos(trans, freespace_iter, alloc_freespace_pos(alloc_k.k->p, *a)); + k = bch2_btree_iter_peek_slot(trans, freespace_iter); ret = bkey_err(k); if (ret) goto err; @@ -1198,16 +1205,15 @@ int bch2_check_alloc_key(struct btree_trans *trans, goto err; } - bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset)); - k = bch2_btree_iter_peek_slot(bucket_gens_iter); + bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(alloc_k.k->p, &gens_offset)); + k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter); ret = bkey_err(k); if (ret) goto err; if (fsck_err_on(a->gen != alloc_gen(k, gens_offset), trans, bucket_gens_key_wrong, - "incorrect gen in bucket_gens btree (got %u should be %u)\n" - " %s", + "incorrect gen in bucket_gens btree (got %u should be %u)\n%s", alloc_gen(k, gens_offset), a->gen, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { @@ -1253,9 
+1259,9 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans, if (!ca->mi.freespace_initialized) return 0; - bch2_btree_iter_set_pos(freespace_iter, start); + bch2_btree_iter_set_pos(trans, freespace_iter, start); - k = bch2_btree_iter_peek_slot(freespace_iter); + k = bch2_btree_iter_peek_slot(trans, freespace_iter); ret = bkey_err(k); if (ret) goto err; @@ -1265,7 +1271,7 @@ int bch2_check_alloc_hole_freespace(struct btree_trans *trans, if (fsck_err_on(k.k->type != KEY_TYPE_set, trans, freespace_hole_missing, "hole in alloc btree missing in freespace btree\n" - " device %llu buckets %llu-%llu", + "device %llu buckets %llu-%llu", freespace_iter->pos.inode, freespace_iter->pos.offset, end->offset)) { @@ -1304,9 +1310,9 @@ int bch2_check_alloc_hole_bucket_gens(struct btree_trans *trans, unsigned i, gens_offset, gens_end_offset; int ret; - bch2_btree_iter_set_pos(bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); + bch2_btree_iter_set_pos(trans, bucket_gens_iter, alloc_gens_pos(start, &gens_offset)); - k = bch2_btree_iter_peek_slot(bucket_gens_iter); + k = bch2_btree_iter_peek_slot(trans, bucket_gens_iter); ret = bkey_err(k); if (ret) goto err; @@ -1383,7 +1389,7 @@ static void check_discard_freespace_key_work(struct work_struct *work) container_of(work, struct check_discard_freespace_key_async, work); bch2_trans_do(w->c, bch2_recheck_discard_freespace_key(trans, w->pos)); - bch2_write_ref_put(w->c, BCH_WRITE_REF_check_discard_freespace_key); + enumerated_ref_put(&w->c->writes, BCH_WRITE_REF_check_discard_freespace_key); kfree(w); } @@ -1424,7 +1430,7 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite (state == BCH_DATA_free && genbits != alloc_freespace_genbits(*a))) { if (fsck_err(trans, need_discard_freespace_key_bad, - "%s\n incorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", + "%s\nincorrectly set at %s:%llu:%llu:0 (free %u, genbits %llu should be %llu)", (bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf), bch2_btree_id_str(iter->btree_id), iter->pos.inode, @@ -1439,7 +1445,7 @@ int bch2_check_discard_freespace_key(struct btree_trans *trans, struct btree_ite *gen = a->gen; out: fsck_err: - bch2_set_btree_iter_dontneed(&alloc_iter); + bch2_set_btree_iter_dontneed(trans, &alloc_iter); bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; @@ -1460,7 +1466,7 @@ delete: if (!w) goto out; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_check_discard_freespace_key)) { + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_check_discard_freespace_key)) { kfree(w); goto out; } @@ -1469,6 +1475,8 @@ delete: w->c = c; w->pos = BBPOS(iter->btree_id, iter->pos); queue_work(c->write_ref_wq, &w->work); + + ret = 1; /* don't allocate from this bucket */ goto out; } } @@ -1505,7 +1513,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_tryget_noerror(c, k.k->p.inode); if (!ca) { if (fsck_err(trans, bucket_gens_to_invalid_dev, - "bucket_gens key for invalid device:\n %s", + "bucket_gens key for invalid device:\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, 0); goto out; @@ -1514,7 +1522,7 @@ int bch2_check_bucket_gens_key(struct btree_trans *trans, if (fsck_err_on(end <= ca->mi.first_bucket || start >= ca->mi.nbuckets, trans, bucket_gens_to_invalid_buckets, - "bucket_gens key for invalid buckets:\n %s", + "bucket_gens key for invalid buckets:\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = bch2_btree_delete_at(trans, 
iter, 0); goto out; @@ -1576,7 +1584,7 @@ int bch2_check_alloc_info(struct bch_fs *c) bch2_trans_begin(trans); - k = bch2_get_key_or_real_bucket_hole(&iter, &ca, &hole); + k = bch2_get_key_or_real_bucket_hole(trans, &iter, &ca, &hole); ret = bkey_err(k); if (ret) goto bkey_err; @@ -1614,7 +1622,7 @@ int bch2_check_alloc_info(struct bch_fs *c) if (ret) goto bkey_err; - bch2_btree_iter_set_pos(&iter, next); + bch2_btree_iter_set_pos(trans, &iter, next); bkey_err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; @@ -1642,7 +1650,7 @@ bkey_err: BTREE_ITER_prefetch); while (1) { bch2_trans_begin(trans); - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek(trans, &iter); if (!k.k) break; @@ -1661,7 +1669,7 @@ bkey_err: break; } - bch2_btree_iter_set_pos(&iter, bpos_nosnap_successor(iter.pos)); + bch2_btree_iter_set_pos(trans, &iter, bpos_nosnap_successor(iter.pos)); } bch2_trans_iter_exit(trans, &iter); if (ret) @@ -1689,7 +1697,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret; - alloc_k = bch2_btree_iter_peek(alloc_iter); + alloc_k = bch2_btree_iter_peek(trans, alloc_iter); if (!alloc_k.k) return 0; @@ -1705,7 +1713,8 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); if (lru_idx) { - ret = bch2_lru_check_set(trans, BCH_LRU_FRAGMENTATION_START, + ret = bch2_lru_check_set(trans, BCH_LRU_BUCKET_FRAGMENTATION, + bucket_to_u64(alloc_k.k->p), lru_idx, alloc_k, last_flushed); if (ret) goto err; @@ -1716,8 +1725,7 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, if (fsck_err_on(!a->io_time[READ], trans, alloc_key_cached_but_read_time_zero, - "cached bucket with read_time 0\n" - " %s", + "cached bucket with read_time 0\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, alloc_k), buf.buf))) { struct bkey_i_alloc_v4 *a_mut = @@ -1735,7 +1743,9 @@ static int bch2_check_alloc_to_lru_ref(struct btree_trans *trans, a = &a_mut->v; } - ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, a->io_time[READ], + ret = bch2_lru_check_set(trans, alloc_k.k->p.inode, + bucket_to_u64(alloc_k.k->p), + a->io_time[READ], alloc_k, last_flushed); if (ret) goto err; @@ -1757,7 +1767,8 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) for_each_btree_key_commit(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_prefetch, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))); + bch2_check_alloc_to_lru_ref(trans, &iter, &last_flushed))) ?: + bch2_check_stripe_to_lru_refs(c); bch2_bkey_buf_exit(&last_flushed, c); bch_err_fn(c, ret); @@ -1814,7 +1825,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bpos pos = need_discard_iter->pos; - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; struct bkey_s_c k; struct bkey_i_alloc_v4 *a; struct printbuf buf = PRINTBUF; @@ -1868,7 +1879,7 @@ static int bch2_discard_one_bucket(struct btree_trans *trans, s->discarded++; *discard_pos_done = iter.pos; - if (ca->mi.discard && !c->opts.nochanges) { + if (bch2_discard_opt_enabled(c, ca) && !c->opts.nochanges) { /* * This works without any other locks because this is the only * thread that removes items from the need_discard tree @@ -1897,7 +1908,10 @@ commit: if (ret) goto out; - count_event(c, bucket_discard); + if (!fastpath) + count_event(c, bucket_discard); + else + count_event(c, bucket_discard_fast); out: fsck_err: if (discard_locked) @@ 
-1935,26 +1949,26 @@ static void bch2_do_discards_work(struct work_struct *work) trace_discard_buckets(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); - percpu_ref_put(&ca->io_ref); - bch2_write_ref_put(c, BCH_WRITE_REF_discard); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard); } void bch2_dev_do_discards(struct bch_dev *ca) { struct bch_fs *c = ca->fs; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard)) return; - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_dev_do_discards)) goto put_write_ref; if (queue_work(c->write_ref_wq, &ca->discard_work)) return; - percpu_ref_put(&ca->io_ref); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_dev_do_discards); put_write_ref: - bch2_write_ref_put(c, BCH_WRITE_REF_discard); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard); } void bch2_do_discards(struct bch_fs *c) @@ -2030,8 +2044,8 @@ static void bch2_do_discards_fast_work(struct work_struct *work) trace_discard_buckets_fast(c, s.seen, s.open, s.need_journal_commit, s.discarded, bch2_err_str(ret)); bch2_trans_put(trans); - percpu_ref_put(&ca->io_ref); - bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast); } static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) @@ -2041,30 +2055,88 @@ static void bch2_discard_one_bucket_fast(struct bch_dev *ca, u64 bucket) if (discard_in_flight_add(ca, bucket, false)) return; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_discard_fast)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_discard_fast)) return; - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_discard_one_bucket_fast)) goto put_ref; if (queue_work(c->write_ref_wq, &ca->discard_fast_work)) return; - percpu_ref_put(&ca->io_ref); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_discard_one_bucket_fast); put_ref: - bch2_write_ref_put(c, BCH_WRITE_REF_discard_fast); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_discard_fast); +} + +static int invalidate_one_bp(struct btree_trans *trans, + struct bch_dev *ca, + struct bkey_s_c_backpointer bp, + struct bkey_buf *last_flushed) +{ + struct btree_iter extent_iter; + struct bkey_s_c extent_k = + bch2_backpointer_get_key(trans, bp, &extent_iter, 0, last_flushed); + int ret = bkey_err(extent_k); + if (ret) + return ret; + + if (!extent_k.k) + return 0; + + struct bkey_i *n = + bch2_bkey_make_mut(trans, &extent_iter, &extent_k, + BTREE_UPDATE_internal_snapshot_node); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + bch2_bkey_drop_device(bkey_i_to_s(n), ca->dev_idx); +err: + bch2_trans_iter_exit(trans, &extent_iter); + return ret; } +static int invalidate_one_bucket_by_bps(struct btree_trans *trans, + struct bch_dev *ca, + struct bpos bucket, + u8 gen, + struct bkey_buf *last_flushed) +{ + struct bpos bp_start = bucket_pos_to_bp_start(ca, bucket); + struct bpos bp_end = bucket_pos_to_bp_end(ca, bucket); + + return for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers, + bp_start, bp_end, 0, k, + NULL, NULL, + BCH_WATERMARK_btree| + BCH_TRANS_COMMIT_no_enospc, ({ + if (k.k->type != KEY_TYPE_backpointer) + continue; + + struct bkey_s_c_backpointer bp 
= bkey_s_c_to_backpointer(k); + + if (bp.v->bucket_gen != gen) + continue; + + /* filter out bps with gens that don't match */ + + invalidate_one_bp(trans, ca, bp, last_flushed); + })); +} + +noinline_for_stack static int invalidate_one_bucket(struct btree_trans *trans, + struct bch_dev *ca, struct btree_iter *lru_iter, struct bkey_s_c lru_k, + struct bkey_buf *last_flushed, s64 *nr_to_invalidate) { struct bch_fs *c = trans->c; - struct bkey_i_alloc_v4 *a = NULL; struct printbuf buf = PRINTBUF; struct bpos bucket = u64_to_bucket(lru_k.k->p.offset); - unsigned cached_sectors; + struct btree_iter alloc_iter = {}; int ret = 0; if (*nr_to_invalidate <= 0) @@ -2081,35 +2153,40 @@ static int invalidate_one_bucket(struct btree_trans *trans, if (bch2_bucket_is_open_safe(c, bucket.inode, bucket.offset)) return 0; - a = bch2_trans_start_alloc_update(trans, bucket, BTREE_TRIGGER_bucket_invalidate); - ret = PTR_ERR_OR_ZERO(a); + struct bkey_s_c alloc_k = bch2_bkey_get_iter(trans, &alloc_iter, + BTREE_ID_alloc, bucket, + BTREE_ITER_cached); + ret = bkey_err(alloc_k); if (ret) - goto out; + return ret; + + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); /* We expect harmless races here due to the btree write buffer: */ - if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(a->v)) + if (lru_pos_time(lru_iter->pos) != alloc_lru_idx_read(*a)) goto out; - BUG_ON(a->v.data_type != BCH_DATA_cached); - BUG_ON(a->v.dirty_sectors); - - if (!a->v.cached_sectors) - bch_err(c, "invalidating empty bucket, confused"); + /* + * Impossible since alloc_lru_idx_read() only returns nonzero if the + * bucket is supposed to be on the cached bucket LRU (i.e. + * BCH_DATA_cached) + * + * bch2_lru_validate() also disallows lru keys with lru_pos_time() == 0 + */ + BUG_ON(a->data_type != BCH_DATA_cached); + BUG_ON(a->dirty_sectors); - cached_sectors = a->v.cached_sectors; + if (!a->cached_sectors) { + bch2_check_bucket_backpointer_mismatch(trans, ca, bucket.offset, + true, last_flushed); + goto out; + } - SET_BCH_ALLOC_V4_NEED_INC_GEN(&a->v, false); - a->v.gen++; - a->v.data_type = 0; - a->v.dirty_sectors = 0; - a->v.stripe_sectors = 0; - a->v.cached_sectors = 0; - a->v.io_time[READ] = bch2_current_io_time(c, READ); - a->v.io_time[WRITE] = bch2_current_io_time(c, WRITE); + unsigned cached_sectors = a->cached_sectors; + u8 gen = a->gen; - ret = bch2_trans_commit(trans, NULL, NULL, - BCH_WATERMARK_btree| - BCH_TRANS_COMMIT_no_enospc); + ret = invalidate_one_bucket_by_bps(trans, ca, bucket, gen, last_flushed); if (ret) goto out; @@ -2117,6 +2194,7 @@ static int invalidate_one_bucket(struct btree_trans *trans, --*nr_to_invalidate; out: fsck_err: + bch2_trans_iter_exit(trans, &alloc_iter); printbuf_exit(&buf); return ret; } @@ -2126,9 +2204,9 @@ static struct bkey_s_c next_lru_key(struct btree_trans *trans, struct btree_iter { struct bkey_s_c k; again: - k = bch2_btree_iter_peek_max(iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); + k = bch2_btree_iter_peek_max(trans, iter, lru_pos(ca->dev_idx, U64_MAX, LRU_TIME_MAX)); if (!k.k && !*wrapped) { - bch2_btree_iter_set_pos(iter, lru_pos(ca->dev_idx, 0, 0)); + bch2_btree_iter_set_pos(trans, iter, lru_pos(ca->dev_idx, 0, 0)); *wrapped = true; goto again; } @@ -2143,6 +2221,10 @@ static void bch2_do_invalidates_work(struct work_struct *work) struct btree_trans *trans = bch2_trans_get(c); int ret = 0; + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + ret = 
bch2_btree_write_buffer_tryflush(trans); if (ret) goto err; @@ -2167,38 +2249,39 @@ static void bch2_do_invalidates_work(struct work_struct *work) if (!k.k) break; - ret = invalidate_one_bucket(trans, &iter, k, &nr_to_invalidate); + ret = invalidate_one_bucket(trans, ca, &iter, k, &last_flushed, &nr_to_invalidate); restart_err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); } bch2_trans_iter_exit(trans, &iter); err: bch2_trans_put(trans); - percpu_ref_put(&ca->io_ref); - bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); + bch2_bkey_buf_exit(&last_flushed, c); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate); } void bch2_dev_do_invalidates(struct bch_dev *ca) { struct bch_fs *c = ca->fs; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_invalidate)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_invalidate)) return; - if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE)) + if (!bch2_dev_get_ioref(c, ca->dev_idx, WRITE, BCH_DEV_WRITE_REF_do_invalidates)) goto put_ref; if (queue_work(c->write_ref_wq, &ca->invalidate_work)) return; - percpu_ref_put(&ca->io_ref); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_do_invalidates); put_ref: - bch2_write_ref_put(c, BCH_WRITE_REF_invalidate); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_invalidate); } void bch2_do_invalidates(struct bch_fs *c) @@ -2243,7 +2326,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, break; } - k = bch2_get_key_or_hole(&iter, end, &hole); + k = bch2_get_key_or_hole(trans, &iter, end, &hole); ret = bkey_err(k); if (ret) goto bkey_err; @@ -2262,7 +2345,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, if (ret) goto bkey_err; - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); } else { struct bkey_i *freespace; @@ -2282,7 +2365,7 @@ int bch2_dev_freespace_init(struct bch_fs *c, struct bch_dev *ca, if (ret) goto bkey_err; - bch2_btree_iter_set_pos(&iter, k.k->p); + bch2_btree_iter_set_pos(trans, &iter, k.k->p); } bkey_err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -2309,14 +2392,16 @@ bkey_err: int bch2_fs_freespace_init(struct bch_fs *c) { - int ret = 0; - bool doing_init = false; + if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) + return 0; + /* * We can crash during the device add path, so we need to check this on * every mount: */ + bool doing_init = false; for_each_member_device(c, ca) { if (ca->mi.freespace_initialized) continue; @@ -2326,7 +2411,7 @@ int bch2_fs_freespace_init(struct bch_fs *c) doing_init = true; } - ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); + int ret = bch2_dev_freespace_init(c, ca, 0, ca->mi.nbuckets); if (ret) { bch2_dev_put(ca); bch_err_fn(c, ret); @@ -2356,8 +2441,7 @@ int bch2_dev_remove_alloc(struct bch_fs *c, struct bch_dev *ca) * We clear the LRU and need_discard btrees first so that we don't race * with bch2_do_invalidates() and bch2_do_discards() */ - ret = bch2_dev_remove_stripes(c, ca->dev_idx) ?: - bch2_btree_delete_range(c, BTREE_ID_lru, start, end, + ret = bch2_btree_delete_range(c, BTREE_ID_lru, start, end, BTREE_TRIGGER_norun, NULL) ?: bch2_btree_delete_range(c, BTREE_ID_need_discard, start, end, BTREE_TRIGGER_norun, NULL) ?: @@ -2420,15 +2504,15 @@ void bch2_recalc_capacity(struct bch_fs *c) lockdep_assert_held(&c->state_lock); - for_each_online_member(c, ca) { - struct 
backing_dev_info *bdi = ca->disk_sb.bdev->bd_disk->bdi; - - ra_pages += bdi->ra_pages; - } + rcu_read_lock(); + for_each_member_device_rcu(c, ca, NULL) { + struct block_device *bdev = READ_ONCE(ca->disk_sb.bdev); + if (bdev) + ra_pages += bdev->bd_disk->bdi->ra_pages; - bch2_set_ra_pages(c, ra_pages); + if (ca->mi.state != BCH_MEMBER_STATE_rw) + continue; - for_each_rw_member(c, ca) { u64 dev_reserve = 0; /* @@ -2465,6 +2549,9 @@ void bch2_recalc_capacity(struct bch_fs *c) bucket_size_max = max_t(unsigned, bucket_size_max, ca->mi.bucket_size); } + rcu_read_unlock(); + + bch2_set_ra_pages(c, ra_pages); gc_reserve = c->opts.gc_reserve_bytes ? c->opts.gc_reserve_bytes >> 9 @@ -2487,27 +2574,41 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *c) { u64 ret = U64_MAX; - for_each_rw_member(c, ca) + rcu_read_lock(); + for_each_rw_member_rcu(c, ca) ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size); + rcu_read_unlock(); return ret; } static bool bch2_dev_has_open_write_point(struct bch_fs *c, struct bch_dev *ca) { struct open_bucket *ob; - bool ret = false; for (ob = c->open_buckets; ob < c->open_buckets + ARRAY_SIZE(c->open_buckets); ob++) { - spin_lock(&ob->lock); - if (ob->valid && !ob->on_partial_list && - ob->dev == ca->dev_idx) - ret = true; - spin_unlock(&ob->lock); + scoped_guard(spinlock, &ob->lock) { + if (ob->valid && !ob->on_partial_list && + ob->dev == ca->dev_idx) + return true; + } } - return ret; + return false; +} + +void bch2_dev_allocator_set_rw(struct bch_fs *c, struct bch_dev *ca, bool rw) +{ + /* BCH_DATA_free == all rw devs */ + + for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) + if (rw && + (i == BCH_DATA_free || + (ca->mi.data_allowed & BIT(i)))) + set_bit(ca->dev_idx, c->rw_devs[i].d); + else + clear_bit(ca->dev_idx, c->rw_devs[i].d); } /* device goes ro: */ @@ -2516,9 +2617,7 @@ void bch2_dev_allocator_remove(struct bch_fs *c, struct bch_dev *ca) lockdep_assert_held(&c->state_lock); /* First, remove device from allocation groups: */ - - for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) - clear_bit(ca->dev_idx, c->rw_devs[i].d); + bch2_dev_allocator_set_rw(c, ca, false); c->rw_devs_change_count++; @@ -2552,10 +2651,7 @@ void bch2_dev_allocator_add(struct bch_fs *c, struct bch_dev *ca) { lockdep_assert_held(&c->state_lock); - for (unsigned i = 0; i < ARRAY_SIZE(c->rw_devs); i++) - if (ca->mi.data_allowed & (1 << i)) - set_bit(ca->dev_idx, c->rw_devs[i].d); - + bch2_dev_allocator_set_rw(c, ca, true); c->rw_devs_change_count++; } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index de25ba4ee94b..4f94c6a661bf 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -131,7 +131,7 @@ static inline enum bch_data_type alloc_data_type(struct bch_alloc_v4 a, if (a.stripe) return data_type == BCH_DATA_parity ? 
data_type : BCH_DATA_stripe; if (bch2_bucket_sectors_dirty(a)) - return data_type; + return bucket_data_type(data_type); if (a.cached_sectors) return BCH_DATA_cached; if (BCH_ALLOC_V4_NEED_DISCARD(&a)) @@ -321,11 +321,11 @@ static inline u64 should_invalidate_buckets(struct bch_dev *ca, { u64 want_free = ca->mi.nbuckets >> 7; u64 free = max_t(s64, 0, - u.d[BCH_DATA_free].buckets - + u.d[BCH_DATA_need_discard].buckets + u.buckets[BCH_DATA_free] + + u.buckets[BCH_DATA_need_discard] - bch2_dev_buckets_reserved(ca, BCH_WATERMARK_stripe)); - return clamp_t(s64, want_free - free, 0, u.d[BCH_DATA_cached].buckets); + return clamp_t(s64, want_free - free, 0, u.buckets[BCH_DATA_cached]); } void bch2_dev_do_invalidates(struct bch_dev *); @@ -350,6 +350,7 @@ int bch2_dev_remove_alloc(struct bch_fs *, struct bch_dev *); void bch2_recalc_capacity(struct bch_fs *); u64 bch2_min_rw_member_capacity(struct bch_fs *); +void bch2_dev_allocator_set_rw(struct bch_fs *, struct bch_dev *, bool); void bch2_dev_allocator_remove(struct bch_fs *, struct bch_dev *); void bch2_dev_allocator_add(struct bch_fs *, struct bch_dev *); diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 5a781fb4c794..1a52c12c51ae 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -127,14 +127,14 @@ void __bch2_open_bucket_put(struct bch_fs *c, struct open_bucket *ob) void bch2_open_bucket_write_error(struct bch_fs *c, struct open_buckets *obs, - unsigned dev) + unsigned dev, int err) { struct open_bucket *ob; unsigned i; open_bucket_for_each(c, obs, ob, i) if (ob->dev == dev && ob->ec) - bch2_ec_bucket_cancel(c, ob); + bch2_ec_bucket_cancel(c, ob, err); } static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) @@ -154,7 +154,7 @@ static struct open_bucket *bch2_open_bucket_alloc(struct bch_fs *c) static inline bool is_superblock_bucket(struct bch_fs *c, struct bch_dev *ca, u64 b) { - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_trans_mark_dev_sbs) + if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_trans_mark_dev_sbs)) return false; return bch2_is_superblock_bucket(ca, b); @@ -179,29 +179,12 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) closure_wake_up(&c->freelist_wait); } -static inline unsigned open_buckets_reserved(enum bch_watermark watermark) -{ - switch (watermark) { - case BCH_WATERMARK_interior_updates: - return 0; - case BCH_WATERMARK_reclaim: - return OPEN_BUCKETS_COUNT / 6; - case BCH_WATERMARK_btree: - case BCH_WATERMARK_btree_copygc: - return OPEN_BUCKETS_COUNT / 4; - case BCH_WATERMARK_copygc: - return OPEN_BUCKETS_COUNT / 3; - default: - return OPEN_BUCKETS_COUNT / 2; - } -} - static inline bool may_alloc_bucket(struct bch_fs *c, - struct bpos bucket, - struct bucket_alloc_state *s) + struct alloc_request *req, + struct bpos bucket) { if (bch2_bucket_is_open(c, bucket.inode, bucket.offset)) { - s->skipped_open++; + req->counters.skipped_open++; return false; } @@ -210,36 +193,37 @@ static inline bool may_alloc_bucket(struct bch_fs *c, bucket.inode, bucket.offset); if (journal_seq_ready > c->journal.flushed_seq_ondisk) { if (journal_seq_ready > c->journal.flushing_seq) - s->need_journal_commit++; - s->skipped_need_journal_commit++; + req->counters.need_journal_commit++; + req->counters.skipped_need_journal_commit++; return false; } if (bch2_bucket_nocow_is_locked(&c->nocow_locks, bucket)) { - s->skipped_nocow++; + req->counters.skipped_nocow++; return false; } return true; } -static struct open_bucket 
*__try_alloc_bucket(struct bch_fs *c, struct bch_dev *ca, +static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, + struct alloc_request *req, u64 bucket, u8 gen, - enum bch_watermark watermark, - struct bucket_alloc_state *s, struct closure *cl) { + struct bch_dev *ca = req->ca; + if (unlikely(is_superblock_bucket(c, ca, bucket))) return NULL; if (unlikely(ca->buckets_nouse && test_bit(bucket, ca->buckets_nouse))) { - s->skipped_nouse++; + req->counters.skipped_nouse++; return NULL; } spin_lock(&c->freelist_lock); - if (unlikely(c->open_buckets_nr_free <= open_buckets_reserved(watermark))) { + if (unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(req->watermark))) { if (cl) closure_wait(&c->open_buckets_wait, cl); @@ -251,7 +235,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * /* Recheck under lock: */ if (bch2_bucket_is_open(c, ca->dev_idx, bucket)) { spin_unlock(&c->freelist_lock); - s->skipped_open++; + req->counters.skipped_open++; return NULL; } @@ -275,16 +259,15 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, struct bch_dev * return ob; } -static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bch_dev *ca, - enum bch_watermark watermark, - struct bucket_alloc_state *s, +static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, + struct alloc_request *req, struct btree_iter *freespace_iter, struct closure *cl) { struct bch_fs *c = trans->c; u64 b = freespace_iter->pos.offset & ~(~0ULL << 56); - if (!may_alloc_bucket(c, POS(ca->dev_idx, b), s)) + if (!may_alloc_bucket(c, req, POS(req->ca->dev_idx, b))) return NULL; u8 gen; @@ -294,7 +277,7 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc if (ret) return NULL; - return __try_alloc_bucket(c, ca, b, gen, watermark, s, cl); + return __try_alloc_bucket(c, req, b, gen, cl); } /* @@ -302,17 +285,16 @@ static struct open_bucket *try_alloc_bucket(struct btree_trans *trans, struct bc */ static noinline struct open_bucket * bch2_bucket_alloc_early(struct btree_trans *trans, - struct bch_dev *ca, - enum bch_watermark watermark, - struct bucket_alloc_state *s, + struct alloc_request *req, struct closure *cl) { struct bch_fs *c = trans->c; + struct bch_dev *ca = req->ca; struct btree_iter iter, citer; struct bkey_s_c k, ck; struct open_bucket *ob = NULL; u64 first_bucket = ca->mi.first_bucket; - u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; + u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap]; u64 alloc_start = max(first_bucket, *dev_alloc_cursor); u64 alloc_cursor = alloc_start; int ret; @@ -334,19 +316,19 @@ again: if (bkey_ge(k.k->p, POS(ca->dev_idx, ca->mi.nbuckets))) break; - if (s->btree_bitmap != BTREE_BITMAP_ANY && - s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, + if (req->btree_bitmap != BTREE_BITMAP_ANY && + req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { - if (s->btree_bitmap == BTREE_BITMAP_YES && + if (req->btree_bitmap == BTREE_BITMAP_YES && bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) break; bucket = sector_to_bucket(ca, round_up(bucket_to_sector(ca, bucket) + 1, 1ULL << ca->mi.btree_bitmap_shift)); - bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, bucket)); - s->buckets_seen++; - s->skipped_mi_btree_bitmap++; + bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, bucket)); + req->counters.buckets_seen++; + req->counters.skipped_mi_btree_bitmap++; continue; } 
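[Editor's note] The alloc_foreground.c hunks above and below fold the old loose parameter lists (device, watermark, data type, plus a separate bucket_alloc_state) into a single struct alloc_request threaded through every helper. A minimal self-contained sketch of the pattern, using hypothetical example_* names rather than the real bcachefs definitions:

#include <stdbool.h>
#include <stdint.h>

/* Hypothetical, stripped-down analogue of the request struct. */
struct example_alloc_request {
	unsigned	watermark;	/* allocation urgency */
	unsigned	data_type;	/* btree, user data, ... */
	struct {
		uint64_t buckets_seen;
		uint64_t skipped_open;	/* bucket already open elsewhere */
	} counters;			/* zeroed per allocation attempt */
};

/* Stub standing in for the real open-bucket check. */
static bool example_bucket_is_open(uint64_t bucket)
{
	return bucket % 2 == 0;		/* arbitrary, for illustration only */
}

/*
 * Helpers take one request pointer instead of six or seven loose
 * arguments; skip counters update in place, and new state can be added
 * without touching every call site.
 */
static bool example_may_alloc_bucket(struct example_alloc_request *req,
				     uint64_t bucket)
{
	req->counters.buckets_seen++;
	if (example_bucket_is_open(bucket)) {
		req->counters.skipped_open++;
		return false;
	}
	return true;
}

In the real code the same struct also carries the device pointer, the usage snapshot, and the devs_may_alloc mask, which is what lets bch2_bucket_alloc_trans() shrink to (trans, req, cl, nowait) in the hunks that follow.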
@@ -365,14 +347,13 @@ again: if (a->data_type != BCH_DATA_free) goto next; - s->buckets_seen++; + req->counters.buckets_seen++; - ob = may_alloc_bucket(c, k.k->p, s) - ? __try_alloc_bucket(c, ca, k.k->p.offset, a->gen, - watermark, s, cl) + ob = may_alloc_bucket(c, req, k.k->p) + ? __try_alloc_bucket(c, req, k.k->p.offset, a->gen, cl) : NULL; next: - bch2_set_btree_iter_dontneed(&citer); + bch2_set_btree_iter_dontneed(trans, &citer); bch2_trans_iter_exit(trans, &citer); if (ob) break; @@ -395,15 +376,14 @@ next: } static struct open_bucket *bch2_bucket_alloc_freelist(struct btree_trans *trans, - struct bch_dev *ca, - enum bch_watermark watermark, - struct bucket_alloc_state *s, - struct closure *cl) + struct alloc_request *req, + struct closure *cl) { + struct bch_dev *ca = req->ca; struct btree_iter iter; struct bkey_s_c k; struct open_bucket *ob = NULL; - u64 *dev_alloc_cursor = &ca->alloc_cursor[s->btree_bitmap]; + u64 *dev_alloc_cursor = &ca->alloc_cursor[req->btree_bitmap]; u64 alloc_start = max_t(u64, ca->mi.first_bucket, READ_ONCE(*dev_alloc_cursor)); u64 alloc_cursor = alloc_start; int ret; @@ -419,13 +399,13 @@ again: iter.k.size = iter.k.p.offset - iter.pos.offset; while (iter.k.size) { - s->buckets_seen++; + req->counters.buckets_seen++; u64 bucket = iter.pos.offset & ~(~0ULL << 56); - if (s->btree_bitmap != BTREE_BITMAP_ANY && - s->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, + if (req->btree_bitmap != BTREE_BITMAP_ANY && + req->btree_bitmap != bch2_dev_btree_bitmap_marked_sectors(ca, bucket_to_sector(ca, bucket), ca->mi.bucket_size)) { - if (s->btree_bitmap == BTREE_BITMAP_YES && + if (req->btree_bitmap == BTREE_BITMAP_YES && bucket_to_sector(ca, bucket) > 64ULL << ca->mi.btree_bitmap_shift) goto fail; @@ -434,16 +414,16 @@ again: 1ULL << ca->mi.btree_bitmap_shift)); alloc_cursor = bucket|(iter.pos.offset & (~0ULL << 56)); - bch2_btree_iter_set_pos(&iter, POS(ca->dev_idx, alloc_cursor)); - s->skipped_mi_btree_bitmap++; + bch2_btree_iter_set_pos(trans, &iter, POS(ca->dev_idx, alloc_cursor)); + req->counters.skipped_mi_btree_bitmap++; goto next; } - ob = try_alloc_bucket(trans, ca, watermark, s, &iter, cl); + ob = try_alloc_bucket(trans, req, &iter, cl); if (ob) { if (!IS_ERR(ob)) *dev_alloc_cursor = iter.pos.offset; - bch2_set_btree_iter_dontneed(&iter); + bch2_set_btree_iter_dontneed(trans, &iter); break; } @@ -470,33 +450,30 @@ fail: return ob; } -static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, - enum bch_watermark watermark, - enum bch_data_type data_type, +static noinline void trace_bucket_alloc2(struct bch_fs *c, + struct alloc_request *req, struct closure *cl, - struct bch_dev_usage *usage, - struct bucket_alloc_state *s, struct open_bucket *ob) { struct printbuf buf = PRINTBUF; printbuf_tabstop_push(&buf, 24); - prt_printf(&buf, "dev\t%s (%u)\n", ca->name, ca->dev_idx); - prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[watermark]); - prt_printf(&buf, "data type\t%s\n", __bch2_data_types[data_type]); + prt_printf(&buf, "dev\t%s (%u)\n", req->ca->name, req->ca->dev_idx); + prt_printf(&buf, "watermark\t%s\n", bch2_watermarks[req->watermark]); + prt_printf(&buf, "data type\t%s\n", __bch2_data_types[req->data_type]); prt_printf(&buf, "blocking\t%u\n", cl != NULL); - prt_printf(&buf, "free\t%llu\n", usage->d[BCH_DATA_free].buckets); - prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(ca, *usage, watermark)); - prt_printf(&buf, "copygc_wait\t%lu/%lli\n", + prt_printf(&buf, "free\t%llu\n", req->usage.buckets[BCH_DATA_free]); + 
prt_printf(&buf, "avail\t%llu\n", dev_buckets_free(req->ca, req->usage, req->watermark)); + prt_printf(&buf, "copygc_wait\t%llu/%lli\n", bch2_copygc_wait_amount(c), c->copygc_wait - atomic64_read(&c->io_clock[WRITE].now)); - prt_printf(&buf, "seen\t%llu\n", s->buckets_seen); - prt_printf(&buf, "open\t%llu\n", s->skipped_open); - prt_printf(&buf, "need journal commit\t%llu\n", s->skipped_need_journal_commit); - prt_printf(&buf, "nocow\t%llu\n", s->skipped_nocow); - prt_printf(&buf, "nouse\t%llu\n", s->skipped_nouse); - prt_printf(&buf, "mi_btree_bitmap\t%llu\n", s->skipped_mi_btree_bitmap); + prt_printf(&buf, "seen\t%llu\n", req->counters.buckets_seen); + prt_printf(&buf, "open\t%llu\n", req->counters.skipped_open); + prt_printf(&buf, "need journal commit\t%llu\n", req->counters.skipped_need_journal_commit); + prt_printf(&buf, "nocow\t%llu\n", req->counters.skipped_nocow); + prt_printf(&buf, "nouse\t%llu\n", req->counters.skipped_nouse); + prt_printf(&buf, "mi_btree_bitmap\t%llu\n", req->counters.skipped_mi_btree_bitmap); if (!IS_ERR(ob)) { prt_printf(&buf, "allocated\t%llu\n", ob->bucket); @@ -512,47 +489,42 @@ static noinline void trace_bucket_alloc2(struct bch_fs *c, struct bch_dev *ca, /** * bch2_bucket_alloc_trans - allocate a single bucket from a specific device * @trans: transaction object - * @ca: device to allocate from - * @watermark: how important is this allocation? - * @data_type: BCH_DATA_journal, btree, user... + * @req: state for the entire allocation * @cl: if not NULL, closure to be used to wait if buckets not available * @nowait: if true, do not wait for buckets to become available - * @usage: for secondarily also returning the current device usage * * Returns: an open_bucket on success, or an ERR_PTR() on failure. */ static struct open_bucket *bch2_bucket_alloc_trans(struct btree_trans *trans, - struct bch_dev *ca, - enum bch_watermark watermark, - enum bch_data_type data_type, - struct closure *cl, - bool nowait, - struct bch_dev_usage *usage) + struct alloc_request *req, + struct closure *cl, + bool nowait) { struct bch_fs *c = trans->c; + struct bch_dev *ca = req->ca; struct open_bucket *ob = NULL; bool freespace = READ_ONCE(ca->mi.freespace_initialized); u64 avail; - struct bucket_alloc_state s = { - .btree_bitmap = data_type == BCH_DATA_btree, - }; bool waiting = nowait; + + req->btree_bitmap = req->data_type == BCH_DATA_btree; + memset(&req->counters, 0, sizeof(req->counters)); again: - bch2_dev_usage_read_fast(ca, usage); - avail = dev_buckets_free(ca, *usage, watermark); + bch2_dev_usage_read_fast(ca, &req->usage); + avail = dev_buckets_free(ca, req->usage, req->watermark); - if (usage->d[BCH_DATA_need_discard].buckets > avail) + if (req->usage.buckets[BCH_DATA_need_discard] > avail) bch2_dev_do_discards(ca); - if (usage->d[BCH_DATA_need_gc_gens].buckets > avail) + if (req->usage.buckets[BCH_DATA_need_gc_gens] > avail) bch2_gc_gens_async(c); - if (should_invalidate_buckets(ca, *usage)) + if (should_invalidate_buckets(ca, req->usage)) bch2_dev_do_invalidates(ca); if (!avail) { - if (watermark > BCH_WATERMARK_normal && - c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) + if (req->watermark > BCH_WATERMARK_normal && + c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations) goto alloc; if (cl && !waiting) { @@ -571,18 +543,18 @@ again: closure_wake_up(&c->freelist_wait); alloc: ob = likely(freespace) - ? bch2_bucket_alloc_freelist(trans, ca, watermark, &s, cl) - : bch2_bucket_alloc_early(trans, ca, watermark, &s, cl); + ? 
bch2_bucket_alloc_freelist(trans, req, cl) + : bch2_bucket_alloc_early(trans, req, cl); - if (s.need_journal_commit * 2 > avail) + if (req->counters.need_journal_commit * 2 > avail) bch2_journal_flush_async(&c->journal, NULL); - if (!ob && s.btree_bitmap != BTREE_BITMAP_ANY) { - s.btree_bitmap = BTREE_BITMAP_ANY; + if (!ob && req->btree_bitmap != BTREE_BITMAP_ANY) { + req->btree_bitmap = BTREE_BITMAP_ANY; goto alloc; } - if (!ob && freespace && c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_alloc_info) { + if (!ob && freespace && c->recovery.pass_done < BCH_RECOVERY_PASS_check_alloc_info) { freespace = false; goto alloc; } @@ -591,7 +563,7 @@ err: ob = ERR_PTR(-BCH_ERR_no_buckets_found); if (!IS_ERR(ob)) - ob->data_type = data_type; + ob->data_type = req->data_type; if (!IS_ERR(ob)) count_event(c, bucket_alloc); @@ -601,7 +573,7 @@ err: if (!IS_ERR(ob) ? trace_bucket_alloc_enabled() : trace_bucket_alloc_fail_enabled()) - trace_bucket_alloc2(c, ca, watermark, data_type, cl, usage, &s, ob); + trace_bucket_alloc2(c, req, cl, ob); return ob; } @@ -611,20 +583,22 @@ struct open_bucket *bch2_bucket_alloc(struct bch_fs *c, struct bch_dev *ca, enum bch_data_type data_type, struct closure *cl) { - struct bch_dev_usage usage; struct open_bucket *ob; + struct alloc_request req = { + .watermark = watermark, + .data_type = data_type, + .ca = ca, + }; bch2_trans_do(c, - PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, ca, watermark, - data_type, cl, false, &usage))); + PTR_ERR_OR_ZERO(ob = bch2_bucket_alloc_trans(trans, &req, cl, false))); return ob; } static int __dev_stripe_cmp(struct dev_stripe_state *stripe, unsigned l, unsigned r) { - return ((stripe->next_alloc[l] > stripe->next_alloc[r]) - - (stripe->next_alloc[l] < stripe->next_alloc[r])); + return cmp_int(stripe->next_alloc[l], stripe->next_alloc[r]); } #define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) @@ -643,25 +617,62 @@ struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, return ret; } +static const u64 stripe_clock_hand_rescale = 1ULL << 62; /* trigger rescale at */ +static const u64 stripe_clock_hand_max = 1ULL << 56; /* max after rescale */ +static const u64 stripe_clock_hand_inv = 1ULL << 52; /* max increment, if a device is empty */ + +static noinline void bch2_stripe_state_rescale(struct dev_stripe_state *stripe) +{ + /* + * Avoid underflowing clock hands if at all possible, if clock hands go + * to 0 then we lose information - clock hands can be in a wide range if + * we have devices we rarely try to allocate from, if we generally + * allocate from a specified target but only sometimes have to fall back + * to the whole filesystem. + */ + u64 scale_max = U64_MAX; /* maximum we can subtract without underflow */ + u64 scale_min = 0; /* minimum we must subtract to avoid overflow */ + + for (u64 *v = stripe->next_alloc; + v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) { + if (*v) + scale_max = min(scale_max, *v); + if (*v > stripe_clock_hand_max) + scale_min = max(scale_min, *v - stripe_clock_hand_max); + } + + u64 scale = max(scale_min, scale_max); + + for (u64 *v = stripe->next_alloc; + v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) + *v = *v < scale ? 0 : *v - scale; +} + static inline void bch2_dev_stripe_increment_inlined(struct bch_dev *ca, struct dev_stripe_state *stripe, struct bch_dev_usage *usage) { + /* + * Stripe state has a per device clock hand: we allocate from the device + * with the smallest clock hand. 
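
As an aside, the clock-hand scheme this comment describes can be sketched in a few lines of userspace C. NR_DEVS, rescale() and alloc_dev() below are illustrative names that mirror the stripe_clock_hand_* constants and bch2_stripe_state_rescale() above; this is a model of the algorithm under those assumptions, not the in-tree implementation:

    /*
     * Compact sketch of the stripe clock-hand allocator: illustrative only
     * (fixed NR_DEVS, hand-rolled min/max), not the bcachefs code.
     */
    #include <stddef.h>
    #include <stdint.h>

    #define NR_DEVS            8
    #define CLOCK_HAND_RESCALE (1ULL << 62) /* trigger rescale at */
    #define CLOCK_HAND_MAX     (1ULL << 56) /* max after rescale */
    #define CLOCK_HAND_INV     (1ULL << 52) /* increment if a device is empty */

    static uint64_t next_alloc[NR_DEVS];    /* per device clock hands */

    static void rescale(void)
    {
            /* Shift all hands down as far as possible without underflowing,
             * but far enough that none exceeds CLOCK_HAND_MAX: */
            uint64_t scale_max = UINT64_MAX, scale_min = 0;

            for (size_t i = 0; i < NR_DEVS; i++) {
                    if (next_alloc[i] && next_alloc[i] < scale_max)
                            scale_max = next_alloc[i];
                    if (next_alloc[i] > CLOCK_HAND_MAX &&
                        next_alloc[i] - CLOCK_HAND_MAX > scale_min)
                            scale_min = next_alloc[i] - CLOCK_HAND_MAX;
            }

            uint64_t scale = scale_min > scale_max ? scale_min : scale_max;

            for (size_t i = 0; i < NR_DEVS; i++)
                    next_alloc[i] = next_alloc[i] < scale
                            ? 0 : next_alloc[i] - scale;
    }

    /* Allocate from the device with the smallest hand, then advance that
     * hand by the inverse of the device's free space: emptier devices get
     * bigger increments and are chosen less often. */
    static size_t alloc_dev(const uint64_t free_space[NR_DEVS])
    {
            size_t best = 0;

            for (size_t i = 1; i < NR_DEVS; i++)
                    if (next_alloc[i] < next_alloc[best])
                            best = i;

            uint64_t inv = free_space[best]
                    ? CLOCK_HAND_INV / free_space[best]
                    : CLOCK_HAND_INV;

            uint64_t sum = next_alloc[best] + inv;  /* saturating add */
            next_alloc[best] = sum >= next_alloc[best] ? sum : UINT64_MAX;

            if (next_alloc[best] > CLOCK_HAND_RESCALE)
                    rescale();

            return best;
    }
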
+ * + * When we allocate, we don't do a simple increment; we add the inverse + * of the device's free space. This results in round robin behavior that + * biases in favor of the device(s) with more free space. + */ + u64 *v = stripe->next_alloc + ca->dev_idx; - u64 free_space = dev_buckets_available(ca, BCH_WATERMARK_normal); + u64 free_space = __dev_buckets_available(ca, *usage, BCH_WATERMARK_normal); u64 free_space_inv = free_space - ? div64_u64(1ULL << 48, free_space) - : 1ULL << 48; - u64 scale = *v / 4; + ? div64_u64(stripe_clock_hand_inv, free_space) + : stripe_clock_hand_inv; - if (*v + free_space_inv >= *v) - *v += free_space_inv; - else - *v = U64_MAX; + /* Saturating add, avoid overflow: */ + u64 sum = *v + free_space_inv; + *v = sum >= *v ? sum : U64_MAX; - for (v = stripe->next_alloc; - v < stripe->next_alloc + ARRAY_SIZE(stripe->next_alloc); v++) - *v = *v < scale ? 0 : *v - scale; + if (unlikely(*v > stripe_clock_hand_rescale)) + bch2_stripe_state_rescale(stripe); } void bch2_dev_stripe_increment(struct bch_dev *ca, @@ -674,24 +685,20 @@ void bch2_dev_stripe_increment(struct bch_dev *ca, } static int add_new_bucket(struct bch_fs *c, - struct open_buckets *ptrs, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - struct open_bucket *ob) + struct alloc_request *req, + struct open_bucket *ob) { unsigned durability = ob_dev(c, ob)->mi.durability; - BUG_ON(*nr_effective >= nr_replicas); + BUG_ON(req->nr_effective >= req->nr_replicas); - __clear_bit(ob->dev, devs_may_alloc->d); - *nr_effective += durability; - *have_cache |= !durability; + __clear_bit(ob->dev, req->devs_may_alloc.d); + req->nr_effective += durability; + req->have_cache |= !durability; - ob_push(c, ptrs, ob); + ob_push(c, &req->ptrs, ob); - if (*nr_effective >= nr_replicas) + if (req->nr_effective >= req->nr_replicas) return 1; if (ob->ec) return 1; @@ -699,39 +706,31 @@ static int add_new_bucket(struct bch_fs *c, } int bch2_bucket_alloc_set_trans(struct btree_trans *trans, - struct open_buckets *ptrs, - struct dev_stripe_state *stripe, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum bch_write_flags flags, - enum bch_data_type data_type, - enum bch_watermark watermark, - struct closure *cl) + struct alloc_request *req, + struct dev_stripe_state *stripe, + struct closure *cl) { struct bch_fs *c = trans->c; int ret = -BCH_ERR_insufficient_devices; - BUG_ON(*nr_effective >= nr_replicas); + BUG_ON(req->nr_effective >= req->nr_replicas); - struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, devs_may_alloc); + struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc); darray_for_each(devs_sorted, i) { - struct bch_dev *ca = bch2_dev_tryget_noerror(c, *i); - if (!ca) + req->ca = bch2_dev_tryget_noerror(c, *i); + if (!req->ca) continue; - if (!ca->mi.durability && *have_cache) { - bch2_dev_put(ca); + if (!req->ca->mi.durability && req->have_cache) { + bch2_dev_put(req->ca); continue; } - struct bch_dev_usage usage; - struct open_bucket *ob = bch2_bucket_alloc_trans(trans, ca, watermark, data_type, - cl, flags & BCH_WRITE_ALLOC_NOWAIT, &usage); + struct open_bucket *ob = bch2_bucket_alloc_trans(trans, req, cl, + req->flags & BCH_WRITE_alloc_nowait); if (!IS_ERR(ob)) - bch2_dev_stripe_increment_inlined(ca, stripe, &usage); - bch2_dev_put(ca); + bch2_dev_stripe_increment_inlined(req->ca, stripe, &req->usage); + bch2_dev_put(req->ca); if (IS_ERR(ob)) { ret 
= PTR_ERR(ob); @@ -740,9 +739,7 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - if (add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob)) { + if (add_new_bucket(c, req, ob)) { ret = 0; break; } @@ -760,34 +757,27 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, */ static int bucket_alloc_from_stripe(struct btree_trans *trans, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - u16 target, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum bch_watermark watermark, - enum bch_write_flags flags, - struct closure *cl) + struct alloc_request *req, + struct closure *cl) { struct bch_fs *c = trans->c; int ret = 0; - if (nr_replicas < 2) + if (req->nr_replicas < 2) return 0; - if (ec_open_bucket(c, ptrs)) + if (ec_open_bucket(c, &req->ptrs)) return 0; struct ec_stripe_head *h = - bch2_ec_stripe_head_get(trans, target, 0, nr_replicas - 1, watermark, cl); + bch2_ec_stripe_head_get(trans, req, 0, cl); if (IS_ERR(h)) return PTR_ERR(h); if (!h) return 0; - struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, &wp->stripe, devs_may_alloc); + struct dev_alloc_list devs_sorted = + bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc); darray_for_each(devs_sorted, i) for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { if (!h->s->blocks[ec_idx]) @@ -799,9 +789,7 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, ob->ec = h->s; ec_stripe_new_get(h->s, STRIPE_REF_io); - ret = add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob); + ret = add_new_bucket(c, req, ob); goto out; } } @@ -813,65 +801,49 @@ out: /* Sector allocator */ static bool want_bucket(struct bch_fs *c, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - bool *have_cache, bool ec, + struct alloc_request *req, struct open_bucket *ob) { struct bch_dev *ca = ob_dev(c, ob); - if (!test_bit(ob->dev, devs_may_alloc->d)) + if (!test_bit(ob->dev, req->devs_may_alloc.d)) return false; - if (ob->data_type != wp->data_type) + if (ob->data_type != req->wp->data_type) return false; if (!ca->mi.durability && - (wp->data_type == BCH_DATA_btree || ec || *have_cache)) + (req->wp->data_type == BCH_DATA_btree || req->ec || req->have_cache)) return false; - if (ec != (ob->ec != NULL)) + if (req->ec != (ob->ec != NULL)) return false; return true; } static int bucket_alloc_set_writepoint(struct bch_fs *c, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - bool ec) + struct alloc_request *req) { - struct open_buckets ptrs_skip = { .nr = 0 }; struct open_bucket *ob; unsigned i; int ret = 0; - open_bucket_for_each(c, &wp->ptrs, ob, i) { - if (!ret && want_bucket(c, wp, devs_may_alloc, - have_cache, ec, ob)) - ret = add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob); + req->scratch_ptrs.nr = 0; + + open_bucket_for_each(c, &req->wp->ptrs, ob, i) { + if (!ret && want_bucket(c, req, ob)) + ret = add_new_bucket(c, req, ob); else - ob_push(c, &ptrs_skip, ob); + ob_push(c, &req->scratch_ptrs, ob); } - wp->ptrs = ptrs_skip; + req->wp->ptrs = req->scratch_ptrs; return ret; } static int bucket_alloc_set_partial(struct bch_fs *c, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_mask *devs_may_alloc, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, bool ec, 
- enum bch_watermark watermark) + struct alloc_request *req) { int i, ret = 0; @@ -886,13 +858,12 @@ static int bucket_alloc_set_partial(struct bch_fs *c, for (i = c->open_buckets_partial_nr - 1; i >= 0; --i) { struct open_bucket *ob = c->open_buckets + c->open_buckets_partial[i]; - if (want_bucket(c, wp, devs_may_alloc, have_cache, ec, ob)) { + if (want_bucket(c, req, ob)) { struct bch_dev *ca = ob_dev(c, ob); - struct bch_dev_usage usage; u64 avail; - bch2_dev_usage_read_fast(ca, &usage); - avail = dev_buckets_free(ca, usage, watermark) + ca->nr_partial_buckets; + bch2_dev_usage_read_fast(ca, &req->usage); + avail = dev_buckets_free(ca, req->usage, req->watermark) + ca->nr_partial_buckets; if (!avail) continue; @@ -905,9 +876,7 @@ static int bucket_alloc_set_partial(struct bch_fs *c, bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; rcu_read_unlock(); - ret = add_new_bucket(c, ptrs, devs_may_alloc, - nr_replicas, nr_effective, - have_cache, ob); + ret = add_new_bucket(c, req, ob); if (ret) break; } @@ -918,61 +887,41 @@ unlock: } static int __open_bucket_add_buckets(struct btree_trans *trans, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_list *devs_have, - u16 target, - bool erasure_code, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum bch_watermark watermark, - enum bch_write_flags flags, - struct closure *_cl) + struct alloc_request *req, + struct closure *_cl) { struct bch_fs *c = trans->c; - struct bch_devs_mask devs; struct open_bucket *ob; struct closure *cl = NULL; unsigned i; int ret; - devs = target_rw_devs(c, wp->data_type, target); + req->devs_may_alloc = target_rw_devs(c, req->wp->data_type, req->target); /* Don't allocate from devices we already have pointers to: */ - darray_for_each(*devs_have, i) - __clear_bit(*i, devs.d); + darray_for_each(*req->devs_have, i) + __clear_bit(*i, req->devs_may_alloc.d); - open_bucket_for_each(c, ptrs, ob, i) - __clear_bit(ob->dev, devs.d); + open_bucket_for_each(c, &req->ptrs, ob, i) + __clear_bit(ob->dev, req->devs_may_alloc.d); - ret = bucket_alloc_set_writepoint(c, ptrs, wp, &devs, - nr_replicas, nr_effective, - have_cache, erasure_code); + ret = bucket_alloc_set_writepoint(c, req); if (ret) return ret; - ret = bucket_alloc_set_partial(c, ptrs, wp, &devs, - nr_replicas, nr_effective, - have_cache, erasure_code, watermark); + ret = bucket_alloc_set_partial(c, req); if (ret) return ret; - if (erasure_code) { - ret = bucket_alloc_from_stripe(trans, ptrs, wp, &devs, - target, - nr_replicas, nr_effective, - have_cache, - watermark, flags, _cl); + if (req->ec) { + ret = bucket_alloc_from_stripe(trans, req, _cl); } else { retry_blocking: /* * Try nonblocking first, so that if one device is full we'll try from * other devices: */ - ret = bch2_bucket_alloc_set_trans(trans, ptrs, &wp->stripe, &devs, - nr_replicas, nr_effective, have_cache, - flags, wp->data_type, watermark, cl); + ret = bch2_bucket_alloc_set_trans(trans, req, &req->wp->stripe, cl); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && !bch2_err_matches(ret, BCH_ERR_insufficient_devices) && @@ -986,38 +935,27 @@ retry_blocking: } static int open_bucket_add_buckets(struct btree_trans *trans, - struct open_buckets *ptrs, - struct write_point *wp, - struct bch_devs_list *devs_have, - u16 target, - unsigned erasure_code, - unsigned nr_replicas, - unsigned *nr_effective, - bool *have_cache, - enum bch_watermark watermark, - enum bch_write_flags flags, - struct closure *cl) + struct alloc_request *req, + struct closure *cl) 
{ int ret; - if (erasure_code && !ec_open_bucket(trans->c, ptrs)) { - ret = __open_bucket_add_buckets(trans, ptrs, wp, - devs_have, target, erasure_code, - nr_replicas, nr_effective, have_cache, - watermark, flags, cl); + if (req->ec && !ec_open_bucket(trans->c, &req->ptrs)) { + ret = __open_bucket_add_buckets(trans, req, cl); if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || bch2_err_matches(ret, BCH_ERR_operation_blocked) || bch2_err_matches(ret, BCH_ERR_freelist_empty) || bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) return ret; - if (*nr_effective >= nr_replicas) + if (req->nr_effective >= req->nr_replicas) return 0; } - ret = __open_bucket_add_buckets(trans, ptrs, wp, - devs_have, target, false, - nr_replicas, nr_effective, have_cache, - watermark, flags, cl); + bool ec = false; + swap(ec, req->ec); + ret = __open_bucket_add_buckets(trans, req, cl); + swap(ec, req->ec); + return ret < 0 ? ret : 0; } @@ -1270,26 +1208,26 @@ out: static noinline void deallocate_extra_replicas(struct bch_fs *c, - struct open_buckets *ptrs, - struct open_buckets *ptrs_no_use, - unsigned extra_replicas) + struct alloc_request *req) { - struct open_buckets ptrs2 = { 0 }; struct open_bucket *ob; + unsigned extra_replicas = req->nr_effective - req->nr_replicas; unsigned i; - open_bucket_for_each(c, ptrs, ob, i) { + req->scratch_ptrs.nr = 0; + + open_bucket_for_each(c, &req->ptrs, ob, i) { unsigned d = ob_dev(c, ob)->mi.durability; if (d && d <= extra_replicas) { extra_replicas -= d; - ob_push(c, ptrs_no_use, ob); + ob_push(c, &req->wp->ptrs, ob); } else { - ob_push(c, &ptrs2, ob); + ob_push(c, &req->scratch_ptrs, ob); } } - *ptrs = ptrs2; + req->ptrs = req->scratch_ptrs; } /* @@ -1308,51 +1246,53 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *trans, struct write_point **wp_ret) { struct bch_fs *c = trans->c; - struct write_point *wp; struct open_bucket *ob; - struct open_buckets ptrs; - unsigned nr_effective, write_points_nr; - bool have_cache; - int ret; + unsigned write_points_nr; int i; + struct alloc_request *req = bch2_trans_kmalloc_nomemzero(trans, sizeof(*req)); + int ret = PTR_ERR_OR_ZERO(req); + if (unlikely(ret)) + return ret; + if (!IS_ENABLED(CONFIG_BCACHEFS_ERASURE_CODING)) erasure_code = false; + req->nr_replicas = nr_replicas; + req->target = target; + req->ec = erasure_code; + req->watermark = watermark; + req->flags = flags; + req->devs_have = devs_have; + BUG_ON(!nr_replicas || !nr_replicas_required); retry: - ptrs.nr = 0; - nr_effective = 0; - write_points_nr = c->write_points_nr; - have_cache = false; + req->ptrs.nr = 0; + req->nr_effective = 0; + req->have_cache = false; + write_points_nr = c->write_points_nr; + + *wp_ret = req->wp = writepoint_find(trans, write_point.v); - *wp_ret = wp = writepoint_find(trans, write_point.v); + req->data_type = req->wp->data_type; ret = bch2_trans_relock(trans); if (ret) goto err; /* metadata may not allocate on cache devices: */ - if (wp->data_type != BCH_DATA_user) - have_cache = true; - - if (target && !(flags & BCH_WRITE_ONLY_SPECIFIED_DEVS)) { - ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - target, erasure_code, - nr_replicas, &nr_effective, - &have_cache, watermark, - flags, NULL); + if (req->data_type != BCH_DATA_user) + req->have_cache = true; + + if (target && !(flags & BCH_WRITE_only_specified_devs)) { + ret = open_bucket_add_buckets(trans, req, NULL); if (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto alloc_done; /* Don't retry from all devices if we're out of open buckets: */ if 
(bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) { - int ret2 = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - target, erasure_code, - nr_replicas, &nr_effective, - &have_cache, watermark, - flags, cl); + int ret2 = open_bucket_add_buckets(trans, req, cl); if (!ret2 || bch2_err_matches(ret2, BCH_ERR_transaction_restart) || bch2_err_matches(ret2, BCH_ERR_open_buckets_empty)) { @@ -1365,59 +1305,74 @@ retry: * Only try to allocate cache (durability = 0 devices) from the * specified target: */ - have_cache = true; + req->have_cache = true; + req->target = 0; - ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - 0, erasure_code, - nr_replicas, &nr_effective, - &have_cache, watermark, - flags, cl); + ret = open_bucket_add_buckets(trans, req, cl); } else { - ret = open_bucket_add_buckets(trans, &ptrs, wp, devs_have, - target, erasure_code, - nr_replicas, &nr_effective, - &have_cache, watermark, - flags, cl); + ret = open_bucket_add_buckets(trans, req, cl); } alloc_done: - BUG_ON(!ret && nr_effective < nr_replicas); + BUG_ON(!ret && req->nr_effective < req->nr_replicas); - if (erasure_code && !ec_open_bucket(c, &ptrs)) + if (erasure_code && !ec_open_bucket(c, &req->ptrs)) pr_debug("failed to get ec bucket: ret %u", ret); if (ret == -BCH_ERR_insufficient_devices && - nr_effective >= nr_replicas_required) + req->nr_effective >= nr_replicas_required) ret = 0; if (ret) goto err; - if (nr_effective > nr_replicas) - deallocate_extra_replicas(c, &ptrs, &wp->ptrs, nr_effective - nr_replicas); + if (req->nr_effective > req->nr_replicas) + deallocate_extra_replicas(c, req); /* Free buckets we didn't use: */ - open_bucket_for_each(c, &wp->ptrs, ob, i) + open_bucket_for_each(c, &req->wp->ptrs, ob, i) open_bucket_free_unused(c, ob); - wp->ptrs = ptrs; + req->wp->ptrs = req->ptrs; - wp->sectors_free = UINT_MAX; + req->wp->sectors_free = UINT_MAX; - open_bucket_for_each(c, &wp->ptrs, ob, i) - wp->sectors_free = min(wp->sectors_free, ob->sectors_free); + open_bucket_for_each(c, &req->wp->ptrs, ob, i) { + /* + * Ensure proper write alignment - either due to misaligned + * bucket sizes (from buggy bcachefs-tools), or writes that mix + * logical/physical alignment: + */ + struct bch_dev *ca = ob_dev(c, ob); + u64 offset = bucket_to_sector(ca, ob->bucket) + + ca->mi.bucket_size - + ob->sectors_free; + unsigned align = round_up(offset, block_sectors(c)) - offset; - BUG_ON(!wp->sectors_free || wp->sectors_free == UINT_MAX); + ob->sectors_free = max_t(int, 0, ob->sectors_free - align); + + req->wp->sectors_free = min(req->wp->sectors_free, ob->sectors_free); + } + + req->wp->sectors_free = rounddown(req->wp->sectors_free, block_sectors(c)); + + /* Did alignment use up space in an open_bucket? 
*/ + if (unlikely(!req->wp->sectors_free)) { + bch2_alloc_sectors_done(c, req->wp); + goto retry; + } + + BUG_ON(!req->wp->sectors_free || req->wp->sectors_free == UINT_MAX); return 0; err: - open_bucket_for_each(c, &wp->ptrs, ob, i) - if (ptrs.nr < ARRAY_SIZE(ptrs.v)) - ob_push(c, &ptrs, ob); + open_bucket_for_each(c, &req->wp->ptrs, ob, i) + if (req->ptrs.nr < ARRAY_SIZE(req->ptrs.v)) + ob_push(c, &req->ptrs, ob); else open_bucket_free_unused(c, ob); - wp->ptrs = ptrs; + req->wp->ptrs = req->ptrs; - mutex_unlock(&wp->lock); + mutex_unlock(&req->wp->lock); if (bch2_err_matches(ret, BCH_ERR_freelist_empty) && try_decrease_writepoints(trans, write_points_nr)) @@ -1426,27 +1381,13 @@ err: if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) ret = -BCH_ERR_bucket_alloc_blocked; - if (cl && !(flags & BCH_WRITE_ALLOC_NOWAIT) && + if (cl && !(flags & BCH_WRITE_alloc_nowait) && bch2_err_matches(ret, BCH_ERR_freelist_empty)) ret = -BCH_ERR_bucket_alloc_blocked; return ret; } -struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) -{ - struct bch_dev *ca = ob_dev(c, ob); - - return (struct bch_extent_ptr) { - .type = 1 << BCH_EXTENT_ENTRY_ptr, - .gen = ob->gen, - .dev = ob->dev, - .offset = bucket_to_sector(ca, ob->bucket) + - ca->mi.bucket_size - - ob->sectors_free, - }; -} - void bch2_alloc_sectors_append_ptrs(struct bch_fs *c, struct write_point *wp, struct bkey_i *k, unsigned sectors, bool cached) @@ -1576,8 +1517,10 @@ static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c, struct open_bucket *ob; unsigned i; + mutex_lock(&wp->lock); + prt_printf(out, "%lu: ", wp->write_point); - prt_human_readable_u64(out, wp->sectors_allocated); + prt_human_readable_u64(out, wp->sectors_allocated << 9); prt_printf(out, " last wrote: "); bch2_pr_time_units(out, sched_clock() - wp->last_used); @@ -1593,6 +1536,8 @@ static void bch2_write_point_to_text(struct printbuf *out, struct bch_fs *c, open_bucket_for_each(c, &wp->ptrs, ob, i) bch2_open_bucket_to_text(out, c, ob); printbuf_indent_sub(out, 2); + + mutex_unlock(&wp->lock); } void bch2_write_points_to_text(struct printbuf *out, struct bch_fs *c) @@ -1650,7 +1595,7 @@ void bch2_fs_alloc_debug_to_text(struct printbuf *out, struct bch_fs *c) void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) { struct bch_fs *c = ca->fs; - struct bch_dev_usage stats = bch2_dev_usage_read(ca); + struct bch_dev_usage_full stats = bch2_dev_usage_full_read(ca); unsigned nr[BCH_DATA_NR]; memset(nr, 0, sizeof(nr)); @@ -1673,7 +1618,8 @@ void bch2_dev_alloc_debug_to_text(struct printbuf *out, struct bch_dev *ca) printbuf_tabstop_push(out, 16); prt_printf(out, "open buckets\t%i\r\n", ca->nr_open_buckets); - prt_printf(out, "buckets to invalidate\t%llu\r\n", should_invalidate_buckets(ca, stats)); + prt_printf(out, "buckets to invalidate\t%llu\r\n", + should_invalidate_buckets(ca, bch2_dev_usage_read(ca))); } static noinline void bch2_print_allocator_stuck(struct bch_fs *c) @@ -1689,7 +1635,12 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c) printbuf_indent_sub(&buf, 2); prt_newline(&buf); - for_each_online_member(c, ca) { + bch2_printbuf_make_room(&buf, 4096); + + rcu_read_lock(); + buf.atomic++; + + for_each_online_member_rcu(c, ca) { prt_printf(&buf, "Dev %u:\n", ca->dev_idx); printbuf_indent_add(&buf, 2); bch2_dev_alloc_debug_to_text(&buf, ca); @@ -1697,6 +1648,9 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c) prt_newline(&buf); } + --buf.atomic; + rcu_read_unlock(); + 
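
For reference, the alignment arithmetic added to bch2_alloc_sectors_start_trans() above can be checked standalone. round_up_pow2() and aligned_sectors_free() below are hypothetical stand-ins for the kernel's round_up() and the in-loop computation; a power-of-two block size is assumed:

    #include <assert.h>
    #include <stdint.h>

    /* Round x up to a multiple of align (align must be a power of two). */
    static uint64_t round_up_pow2(uint64_t x, uint64_t align)
    {
            return (x + align - 1) & ~(align - 1);
    }

    /*
     * A bucket's current write offset is bucket start + bucket size -
     * sectors still free; if that offset isn't block aligned, the
     * unaligned head is unusable and comes off sectors_free.
     */
    static uint32_t aligned_sectors_free(uint64_t bucket_start_sector,
                                         uint32_t bucket_size,
                                         uint32_t sectors_free,
                                         uint32_t block_sectors)
    {
            uint64_t offset = bucket_start_sector + bucket_size - sectors_free;
            uint64_t align  = round_up_pow2(offset, block_sectors) - offset;

            return sectors_free > align ? sectors_free - (uint32_t)align : 0;
    }

    int main(void)
    {
            /* Bucket of 1024 sectors starting at sector 7, 8-sector blocks:
             * offset = 7 + 1024 - 512 = 519, rounded up to 520, so one
             * sector of slack is lost and 511 remain writable. */
            assert(aligned_sectors_free(7, 1024, 512, 8) == 511);
            return 0;
    }
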
prt_printf(&buf, "Copygc debug:\n"); printbuf_indent_add(&buf, 2); bch2_copygc_wait_to_text(&buf, c); @@ -1708,7 +1662,7 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c) bch2_journal_debug_to_text(&buf, &c->journal); printbuf_indent_sub(&buf, 2); - bch2_print_string_as_lines(KERN_ERR, buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); } diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index f25481a0d1a0..2e01c7b61ed1 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -3,8 +3,10 @@ #define _BCACHEFS_ALLOC_FOREGROUND_H #include "bcachefs.h" +#include "buckets.h" #include "alloc_types.h" #include "extents.h" +#include "io_write_types.h" #include "sb-members.h" #include <linux/hash.h> @@ -23,6 +25,52 @@ struct dev_alloc_list { u8 data[BCH_SB_MEMBERS_MAX]; }; +struct alloc_request { + unsigned nr_replicas; + unsigned target; + bool ec; + enum bch_watermark watermark; + enum bch_write_flags flags; + enum bch_data_type data_type; + struct bch_devs_list *devs_have; + struct write_point *wp; + + /* These fields are used primarily by open_bucket_add_buckets */ + struct open_buckets ptrs; + unsigned nr_effective; /* sum of @ptrs durability */ + bool have_cache; /* have we allocated from a 0 durability dev */ + struct bch_devs_mask devs_may_alloc; + + /* bch2_bucket_alloc_set_trans(): */ + struct bch_dev_usage usage; + + /* bch2_bucket_alloc_trans(): */ + struct bch_dev *ca; + + enum { + BTREE_BITMAP_NO, + BTREE_BITMAP_YES, + BTREE_BITMAP_ANY, + } btree_bitmap; + + struct { + u64 buckets_seen; + u64 skipped_open; + u64 skipped_need_journal_commit; + u64 need_journal_commit; + u64 skipped_nocow; + u64 skipped_nouse; + u64 skipped_mi_btree_bitmap; + } counters; + + unsigned scratch_nr_replicas; + unsigned scratch_nr_effective; + bool scratch_have_cache; + enum bch_data_type scratch_data_type; + struct open_buckets scratch_ptrs; + struct bch_devs_mask scratch_devs_may_alloc; +}; + struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, struct dev_stripe_state *, struct bch_devs_mask *); @@ -33,6 +81,23 @@ static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) return bch2_dev_have_ref(c, ob->dev); } +static inline unsigned bch2_open_buckets_reserved(enum bch_watermark watermark) +{ + switch (watermark) { + case BCH_WATERMARK_interior_updates: + return 0; + case BCH_WATERMARK_reclaim: + return OPEN_BUCKETS_COUNT / 6; + case BCH_WATERMARK_btree: + case BCH_WATERMARK_btree_copygc: + return OPEN_BUCKETS_COUNT / 4; + case BCH_WATERMARK_copygc: + return OPEN_BUCKETS_COUNT / 3; + default: + return OPEN_BUCKETS_COUNT / 2; + } +} + struct open_bucket *bch2_bucket_alloc(struct bch_fs *, struct bch_dev *, enum bch_watermark, enum bch_data_type, struct closure *); @@ -65,7 +130,7 @@ static inline struct open_bucket *ec_open_bucket(struct bch_fs *c, } void bch2_open_bucket_write_error(struct bch_fs *, - struct open_buckets *, unsigned); + struct open_buckets *, unsigned, int); void __bch2_open_bucket_put(struct bch_fs *, struct open_bucket *); @@ -93,7 +158,9 @@ static inline void bch2_alloc_sectors_done_inlined(struct bch_fs *c, struct writ unsigned i; open_bucket_for_each(c, &wp->ptrs, ob, i) - ob_push(c, !ob->sectors_free ? &ptrs : &keep, ob); + ob_push(c, ob->sectors_free < block_sectors(c) + ? 
&ptrs + : &keep, ob); wp->ptrs = keep; mutex_unlock(&wp->lock); @@ -154,11 +221,8 @@ static inline bool bch2_bucket_is_open_safe(struct bch_fs *c, unsigned dev, u64 } enum bch_write_flags; -int bch2_bucket_alloc_set_trans(struct btree_trans *, struct open_buckets *, - struct dev_stripe_state *, struct bch_devs_mask *, - unsigned, unsigned *, bool *, enum bch_write_flags, - enum bch_data_type, enum bch_watermark, - struct closure *); +int bch2_bucket_alloc_set_trans(struct btree_trans *, struct alloc_request *, + struct dev_stripe_state *, struct closure *); int bch2_alloc_sectors_start_trans(struct btree_trans *, unsigned, unsigned, @@ -170,7 +234,19 @@ int bch2_alloc_sectors_start_trans(struct btree_trans *, struct closure *, struct write_point **); -struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *, struct open_bucket *); +static inline struct bch_extent_ptr bch2_ob_ptr(struct bch_fs *c, struct open_bucket *ob) +{ + struct bch_dev *ca = ob_dev(c, ob); + + return (struct bch_extent_ptr) { + .type = 1 << BCH_EXTENT_ENTRY_ptr, + .gen = ob->gen, + .dev = ob->dev, + .offset = bucket_to_sector(ca, ob->bucket) + + ca->mi.bucket_size - + ob->sectors_free, + }; +} /* * Append pointers to the space we just allocated to @k, and mark @sectors space diff --git a/fs/bcachefs/alloc_types.h b/fs/bcachefs/alloc_types.h index 4aa8ee026cb8..e7becdf22cba 100644 --- a/fs/bcachefs/alloc_types.h +++ b/fs/bcachefs/alloc_types.h @@ -8,22 +8,6 @@ #include "clock_types.h" #include "fifo.h" -struct bucket_alloc_state { - enum { - BTREE_BITMAP_NO, - BTREE_BITMAP_YES, - BTREE_BITMAP_ANY, - } btree_bitmap; - - u64 buckets_seen; - u64 skipped_open; - u64 skipped_need_journal_commit; - u64 need_journal_commit; - u64 skipped_nocow; - u64 skipped_nouse; - u64 skipped_mi_btree_bitmap; -}; - #define BCH_WATERMARKS() \ x(stripe) \ x(normal) \ @@ -90,6 +74,7 @@ struct dev_stripe_state { x(stopped) \ x(waiting_io) \ x(waiting_work) \ + x(runnable) \ x(running) enum write_point_state { @@ -125,6 +110,7 @@ struct write_point { enum write_point_state state; u64 last_state_change; u64 time[WRITE_POINT_STATE_NR]; + u64 last_runtime; } __aligned(SMP_CACHE_BYTES); }; diff --git a/fs/bcachefs/async_objs.c b/fs/bcachefs/async_objs.c new file mode 100644 index 000000000000..a7cd1f0f0964 --- /dev/null +++ b/fs/bcachefs/async_objs.c @@ -0,0 +1,132 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Async obj debugging: keep asynchronous objects on (very fast) lists, make + * them visible in debugfs: + */ + +#include "bcachefs.h" +#include "async_objs.h" +#include "btree_io.h" +#include "debug.h" +#include "io_read.h" +#include "io_write.h" + +#include <linux/debugfs.h> + +static void promote_obj_to_text(struct printbuf *out, void *obj) +{ + bch2_promote_op_to_text(out, obj); +} + +static void rbio_obj_to_text(struct printbuf *out, void *obj) +{ + bch2_read_bio_to_text(out, obj); +} + +static void write_op_obj_to_text(struct printbuf *out, void *obj) +{ + bch2_write_op_to_text(out, obj); +} + +static void btree_read_bio_obj_to_text(struct printbuf *out, void *obj) +{ + struct btree_read_bio *rbio = obj; + bch2_btree_read_bio_to_text(out, rbio); +} + +static void btree_write_bio_obj_to_text(struct printbuf *out, void *obj) +{ + struct btree_write_bio *wbio = obj; + bch2_bio_to_text(out, &wbio->wbio.bio); +} + +static int bch2_async_obj_list_open(struct inode *inode, struct file *file) +{ + struct async_obj_list *list = inode->i_private; + struct dump_iter *i; + + i = kzalloc(sizeof(struct dump_iter), GFP_KERNEL); + if (!i) + return -ENOMEM; + 
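
The debugfs plumbing here enumerates whatever is currently on each fast_list; the expected lifecycle for an object on such a list looks roughly like the sketch below. struct my_async_op and its two functions are hypothetical, shown only to illustrate the add/del wrappers defined in async_objs.h further down (the real users are the promote/rbio/write_op/btree bio paths wired up in bch2_fs_async_obj_init()):

    /*
     * Illustrative kernel-style sketch, not in-tree code: an in-flight
     * operation registers itself so the debugfs files above can show it,
     * and unregisters on completion.
     */
    struct my_async_op {
            unsigned        list_idx;   /* slot handed back by fast_list_add() */
            /* ... */
    };

    static int my_async_op_start(struct bch_fs *c, struct my_async_op *op)
    {
            /* visible in debugfs for as long as the op is in flight: */
            return async_object_list_add(c, write_op, op, &op->list_idx);
    }

    static void my_async_op_finish(struct bch_fs *c, struct my_async_op *op)
    {
            async_object_list_del(c, write_op, op->list_idx);
    }
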
+ file->private_data = i; + i->from = POS_MIN; + i->iter = 0; + i->c = container_of(list, struct bch_fs, async_objs[list->idx]); + i->list = list; + i->buf = PRINTBUF; + return 0; +} + +static ssize_t bch2_async_obj_list_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + struct dump_iter *i = file->private_data; + struct async_obj_list *list = i->list; + ssize_t ret = 0; + + i->ubuf = buf; + i->size = size; + i->ret = 0; + + struct genradix_iter iter; + void *obj; + fast_list_for_each_from(&list->list, iter, obj, i->iter) { + ret = bch2_debugfs_flush_buf(i); + if (ret) + return ret; + + if (!i->size) + break; + + list->obj_to_text(&i->buf, obj); + } + + if (i->buf.allocation_failure) + ret = -ENOMEM; + else + i->iter = iter.pos; + + if (!ret) + ret = bch2_debugfs_flush_buf(i); + + return ret ?: i->ret; +} + +static const struct file_operations async_obj_ops = { + .owner = THIS_MODULE, + .open = bch2_async_obj_list_open, + .release = bch2_dump_release, + .read = bch2_async_obj_list_read, +}; + +void bch2_fs_async_obj_debugfs_init(struct bch_fs *c) +{ + c->async_obj_dir = debugfs_create_dir("async_objs", c->fs_debug_dir); + +#define x(n) debugfs_create_file(#n, 0400, c->async_obj_dir, \ + &c->async_objs[BCH_ASYNC_OBJ_LIST_##n], &async_obj_ops); + BCH_ASYNC_OBJ_LISTS() +#undef x +} + +void bch2_fs_async_obj_exit(struct bch_fs *c) +{ + for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++) + fast_list_exit(&c->async_objs[i].list); +} + +int bch2_fs_async_obj_init(struct bch_fs *c) +{ + for (unsigned i = 0; i < ARRAY_SIZE(c->async_objs); i++) { + if (fast_list_init(&c->async_objs[i].list)) + return -BCH_ERR_ENOMEM_async_obj_init; + c->async_objs[i].idx = i; + } + +#define x(n) c->async_objs[BCH_ASYNC_OBJ_LIST_##n].obj_to_text = n##_obj_to_text; + BCH_ASYNC_OBJ_LISTS() +#undef x + + return 0; +} diff --git a/fs/bcachefs/async_objs.h b/fs/bcachefs/async_objs.h new file mode 100644 index 000000000000..cd6489b8cf76 --- /dev/null +++ b/fs/bcachefs/async_objs.h @@ -0,0 +1,44 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ASYNC_OBJS_H +#define _BCACHEFS_ASYNC_OBJS_H + +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS +static inline void __async_object_list_del(struct fast_list *head, unsigned idx) +{ + fast_list_remove(head, idx); +} + +static inline int __async_object_list_add(struct fast_list *head, void *obj, unsigned *idx) +{ + int ret = fast_list_add(head, obj); + *idx = ret > 0 ? ret : 0; + return ret < 0 ? 
ret : 0; +} + +#define async_object_list_del(_c, _list, idx) \ + __async_object_list_del(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, idx) + +#define async_object_list_add(_c, _list, obj, idx) \ + __async_object_list_add(&(_c)->async_objs[BCH_ASYNC_OBJ_LIST_##_list].list, obj, idx) + +void bch2_fs_async_obj_debugfs_init(struct bch_fs *); +void bch2_fs_async_obj_exit(struct bch_fs *); +int bch2_fs_async_obj_init(struct bch_fs *); + +#else /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */ + +#define async_object_list_del(_c, _n, idx) do {} while (0) + +static inline int __async_object_list_add(void) +{ + return 0; +} +#define async_object_list_add(_c, _n, obj, idx) __async_object_list_add() + +static inline void bch2_fs_async_obj_debugfs_init(struct bch_fs *c) {} +static inline void bch2_fs_async_obj_exit(struct bch_fs *c) {} +static inline int bch2_fs_async_obj_init(struct bch_fs *c) { return 0; } + +#endif /* CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS */ + +#endif /* _BCACHEFS_ASYNC_OBJS_H */ diff --git a/fs/bcachefs/async_objs_types.h b/fs/bcachefs/async_objs_types.h new file mode 100644 index 000000000000..8d713c0f5841 --- /dev/null +++ b/fs/bcachefs/async_objs_types.h @@ -0,0 +1,25 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ASYNC_OBJS_TYPES_H +#define _BCACHEFS_ASYNC_OBJS_TYPES_H + +#define BCH_ASYNC_OBJ_LISTS() \ + x(promote) \ + x(rbio) \ + x(write_op) \ + x(btree_read_bio) \ + x(btree_write_bio) + +enum bch_async_obj_lists { +#define x(n) BCH_ASYNC_OBJ_LIST_##n, + BCH_ASYNC_OBJ_LISTS() +#undef x + BCH_ASYNC_OBJ_NR +}; + +struct async_obj_list { + struct fast_list list; + void (*obj_to_text)(struct printbuf *, void *); + unsigned idx; +}; + +#endif /* _BCACHEFS_ASYNC_OBJS_TYPES_H */ diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index ebeb6a5ff9d2..cde7dd115267 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -11,9 +11,21 @@ #include "checksum.h" #include "disk_accounting.h" #include "error.h" +#include "progress.h" +#include "recovery_passes.h" #include <linux/mm.h> +static int bch2_bucket_bitmap_set(struct bch_dev *, struct bucket_bitmap *, u64); + +static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) +{ + return (struct bbpos) { + .btree = bp.btree_id, + .pos = bp.pos, + }; +} + int bch2_backpointer_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { @@ -49,6 +61,8 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke } bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); + prt_str(out, " data_type="); + bch2_prt_data_type(out, bp.v->data_type); prt_printf(out, " suboffset=%u len=%u gen=%u pos=", (u32) bp.k->p.offset & ~(~0U << MAX_EXTENT_COMPRESS_RATIO_SHIFT), bp.v->bucket_len, @@ -93,6 +107,9 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; + bool will_check = c->recovery.passes_to_run & + BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers); + int ret = 0; if (insert) { prt_printf(&buf, "existing backpointer found when inserting "); @@ -106,9 +123,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, prt_printf(&buf, "for "); bch2_bkey_val_to_text(&buf, c, orig_k); - - bch_err(c, "%s", buf.buf); - } else if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { + } else if (!will_check) { prt_printf(&buf, "backpointer not found when deleting\n"); printbuf_indent_add(&buf, 2); @@ -122,17 +137,14 @@ static noinline 
int backpointer_mod_err(struct btree_trans *trans, prt_printf(&buf, "for "); bch2_bkey_val_to_text(&buf, c, orig_k); - - bch_err(c, "%s", buf.buf); } - printbuf_exit(&buf); + if (!will_check && __bch2_inconsistent_error(c, &buf)) + ret = -BCH_ERR_erofs_unfixed_errors; - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_check_extents_to_backpointers) { - return bch2_inconsistent_error(c) ? BCH_ERR_erofs_unfixed_errors : 0; - } else { - return 0; - } + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); + return ret; } int bch2_bucket_backpointer_mod_nowritebuffer(struct btree_trans *trans, @@ -172,7 +184,7 @@ err: static int bch2_backpointer_del(struct btree_trans *trans, struct bpos pos) { - return (likely(!bch2_backpointers_no_use_write_buffer) + return (!static_branch_unlikely(&bch2_backpointers_no_use_write_buffer) ? bch2_btree_delete_at_buffered(trans, BTREE_ID_backpointers, pos) : bch2_btree_delete(trans, BTREE_ID_backpointers, pos, 0)) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); @@ -182,7 +194,7 @@ static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, struct bkey_s_c visiting_k, struct bkey_buf *last_flushed) { - return likely(!bch2_backpointers_no_use_write_buffer) + return !static_branch_unlikely(&bch2_backpointers_no_use_write_buffer) ? bch2_btree_write_buffer_maybe_flush(trans, visiting_k, last_flushed) : 0; } @@ -190,7 +202,8 @@ static inline int bch2_backpointers_maybe_flush(struct btree_trans *trans, static int backpointer_target_not_found(struct btree_trans *trans, struct bkey_s_c_backpointer bp, struct bkey_s_c target_k, - struct bkey_buf *last_flushed) + struct bkey_buf *last_flushed, + bool commit) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; @@ -207,11 +220,11 @@ static int backpointer_target_not_found(struct btree_trans *trans, if (ret) return ret; - prt_printf(&buf, "backpointer doesn't match %s it points to:\n ", + prt_printf(&buf, "backpointer doesn't match %s it points to:\n", bp.v->level ? "btree node" : "extent"); bch2_bkey_val_to_text(&buf, c, bp.s_c); - prt_printf(&buf, "\n "); + prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, target_k); struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(target_k); @@ -219,63 +232,45 @@ static int backpointer_target_not_found(struct btree_trans *trans, struct extent_ptr_decoded p; bkey_for_each_ptr_decode(target_k.k, ptrs, p, entry) if (p.ptr.dev == bp.k->p.inode) { - prt_printf(&buf, "\n "); + prt_newline(&buf); struct bkey_i_backpointer bp2; bch2_extent_ptr_to_bp(c, bp.v->btree_id, bp.v->level, target_k, p, entry, &bp2); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp2.k_i)); } if (fsck_err(trans, backpointer_to_missing_ptr, - "%s", buf.buf)) + "%s", buf.buf)) { ret = bch2_backpointer_del(trans, bp.k->p); + if (ret || !commit) + goto out; + + /* + * Normally, on transaction commit from inside a transaction, + * we'll return -BCH_ERR_transaction_restart_nested, since a + * transaction commit invalidates pointers given out by peek(). + * + * However, since we're updating a write buffer btree, if we + * return a transaction restart and loop we won't see that the + * backpointer has been deleted without an additional write + * buffer flush - and those are expensive. 
+ * + * So we're relying on the caller immediately advancing to the + * next backpointer and starting a new transaction immediately + * after backpointer_get_key() returns NULL: + */ + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + } +out: fsck_err: printbuf_exit(&buf); return ret; } -struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, - struct bkey_s_c_backpointer bp, - struct btree_iter *iter, - unsigned iter_flags, - struct bkey_buf *last_flushed) -{ - struct bch_fs *c = trans->c; - - if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c))) - return bkey_s_c_null; - - if (likely(!bp.v->level)) { - bch2_trans_node_iter_init(trans, iter, - bp.v->btree_id, - bp.v->pos, - 0, 0, - iter_flags); - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); - if (bkey_err(k)) { - bch2_trans_iter_exit(trans, iter); - return k; - } - - if (k.k && - extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) - return k; - - bch2_trans_iter_exit(trans, iter); - int ret = backpointer_target_not_found(trans, bp, k, last_flushed); - return ret ? bkey_s_c_err(ret) : bkey_s_c_null; - } else { - struct btree *b = bch2_backpointer_get_node(trans, bp, iter, last_flushed); - if (IS_ERR_OR_NULL(b)) - return ((struct bkey_s_c) { .k = ERR_CAST(b) }); - - return bkey_i_to_s_c(&b->key); - } -} - -struct btree *bch2_backpointer_get_node(struct btree_trans *trans, - struct bkey_s_c_backpointer bp, - struct btree_iter *iter, - struct bkey_buf *last_flushed) +static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, + struct btree_iter *iter, + struct bkey_buf *last_flushed, + bool commit) { struct bch_fs *c = trans->c; @@ -287,7 +282,7 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, 0, bp.v->level - 1, 0); - struct btree *b = bch2_btree_iter_peek_node(iter); + struct btree *b = bch2_btree_iter_peek_node(trans, iter); if (IS_ERR_OR_NULL(b)) goto err; @@ -300,7 +295,8 @@ struct btree *bch2_backpointer_get_node(struct btree_trans *trans, if (btree_node_will_make_reachable(b)) { b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); } else { - int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed); + int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), + last_flushed, commit); b = ret ? ERR_PTR(ret) : NULL; } err: @@ -308,6 +304,79 @@ err: return b; } +static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, + struct btree_iter *iter, + unsigned iter_flags, + struct bkey_buf *last_flushed, + bool commit) +{ + struct bch_fs *c = trans->c; + + if (unlikely(bp.v->btree_id >= btree_id_nr_alive(c))) + return bkey_s_c_null; + + bch2_trans_node_iter_init(trans, iter, + bp.v->btree_id, + bp.v->pos, + 0, + bp.v->level, + iter_flags); + struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); + if (bkey_err(k)) { + bch2_trans_iter_exit(trans, iter); + return k; + } + + /* + * peek_slot() doesn't normally return NULL - except when we ask for a + * key at a btree level that doesn't exist. + * + * We may want to revisit this and change peek_slot(): + */ + if (!k.k) { + bkey_init(&iter->k); + iter->k.p = bp.v->pos; + k.k = &iter->k; + } + + if (k.k && + extent_matches_bp(c, bp.v->btree_id, bp.v->level, k, bp)) + return k; + + bch2_trans_iter_exit(trans, iter); + + if (!bp.v->level) { + int ret = backpointer_target_not_found(trans, bp, k, last_flushed, commit); + return ret ? 
bkey_s_c_err(ret) : bkey_s_c_null; + } else { + struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit); + if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node)) + return bkey_s_c_null; + if (IS_ERR_OR_NULL(b)) + return ((struct bkey_s_c) { .k = ERR_CAST(b) }); + + return bkey_i_to_s_c(&b->key); + } +} + +struct btree *bch2_backpointer_get_node(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, + struct btree_iter *iter, + struct bkey_buf *last_flushed) +{ + return __bch2_backpointer_get_node(trans, bp, iter, last_flushed, true); +} + +struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, + struct btree_iter *iter, + unsigned iter_flags, + struct bkey_buf *last_flushed) +{ + return __bch2_backpointer_get_key(trans, bp, iter, iter_flags, last_flushed, true); +} + static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, struct bkey_s_c k, struct bkey_buf *last_flushed) { @@ -315,7 +384,7 @@ static int bch2_check_backpointer_has_valid_bucket(struct btree_trans *trans, st return 0; struct bch_fs *c = trans->c; - struct btree_iter alloc_iter = { NULL }; + struct btree_iter alloc_iter = {}; struct bkey_s_c alloc_k; struct printbuf buf = PRINTBUF; int ret = 0; @@ -419,7 +488,8 @@ found: bytes = p.crc.compressed_size << 9; - struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ); + struct bch_dev *ca = bch2_dev_get_ioref(c, dev, READ, + BCH_DEV_READ_REF_check_extent_checksums); if (!ca) return false; @@ -436,12 +506,11 @@ found: if (ret) goto err; - prt_str(&buf, "extents pointing to same space, but first extent checksum bad:"); - prt_printf(&buf, "\n "); + prt_printf(&buf, "extents pointing to same space, but first extent checksum bad:\n"); bch2_btree_id_to_text(&buf, btree); prt_str(&buf, " "); bch2_bkey_val_to_text(&buf, c, extent); - prt_printf(&buf, "\n "); + prt_newline(&buf); bch2_btree_id_to_text(&buf, o_btree); prt_str(&buf, " "); bch2_bkey_val_to_text(&buf, c, extent2); @@ -457,7 +526,8 @@ err: if (bio) bio_put(bio); kvfree(data_buf); - percpu_ref_put(&ca->io_ref); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_check_extent_checksums); printbuf_exit(&buf); return ret; } @@ -504,7 +574,7 @@ check_existing_bp: struct bkey_s_c_backpointer other_bp = bkey_s_c_to_backpointer(bp_k); struct bkey_s_c other_extent = - bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL); + __bch2_backpointer_get_key(trans, other_bp, &other_extent_iter, 0, NULL, false); ret = bkey_err(other_extent); if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) ret = 0; @@ -514,11 +584,27 @@ check_existing_bp: if (!other_extent.k) goto missing; + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp->k.p.inode); + if (ca) { + struct bkey_ptrs_c other_extent_ptrs = bch2_bkey_ptrs_c(other_extent); + bkey_for_each_ptr(other_extent_ptrs, ptr) + if (ptr->dev == bp->k.p.inode && + dev_ptr_stale_rcu(ca, ptr)) { + ret = drop_dev_and_update(trans, other_bp.v->btree_id, + other_extent, bp->k.p.inode); + if (ret) + goto err; + goto out; + } + } + rcu_read_unlock(); + if (bch2_extents_match(orig_k, other_extent)) { printbuf_reset(&buf); - prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n "); + prt_printf(&buf, "duplicate versions of same extent, deleting smaller\n"); bch2_bkey_val_to_text(&buf, c, orig_k); - prt_str(&buf, "\n "); + prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, other_extent); bch_err(c, "%s", buf.buf); @@ -557,20 +643,20 @@ 
check_existing_bp: } printbuf_reset(&buf); - prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n ", bp->k.p.inode); + prt_printf(&buf, "duplicate extents pointing to same space on dev %llu\n", bp->k.p.inode); bch2_bkey_val_to_text(&buf, c, orig_k); - prt_str(&buf, "\n "); + prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, other_extent); bch_err(c, "%s", buf.buf); ret = -BCH_ERR_fsck_repair_unimplemented; goto err; missing: printbuf_reset(&buf); - prt_str(&buf, "missing backpointer\n for: "); + prt_str(&buf, "missing backpointer\nfor: "); bch2_bkey_val_to_text(&buf, c, orig_k); - prt_printf(&buf, "\n want: "); + prt_printf(&buf, "\nwant: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&bp->k_i)); - prt_printf(&buf, "\n got: "); + prt_printf(&buf, "\ngot: "); bch2_bkey_val_to_text(&buf, c, bp_k); if (fsck_err(trans, ptr_to_missing_backpointer, "%s", buf.buf)) @@ -590,28 +676,38 @@ static int check_extent_to_backpointers(struct btree_trans *trans, struct extent_ptr_decoded p; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { - if (p.ptr.cached) - continue; - if (p.ptr.dev == BCH_SB_MEMBER_INVALID) continue; rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); - bool check = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_mismatches); - bool empty = ca && test_bit(PTR_BUCKET_NR(ca, &p.ptr), ca->bucket_backpointer_empty); - rcu_read_unlock(); + if (!ca) { + rcu_read_unlock(); + continue; + } - if (check || empty) { - struct bkey_i_backpointer bp; - bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); + if (p.ptr.cached && dev_ptr_stale_rcu(ca, &p.ptr)) { + rcu_read_unlock(); + continue; + } - int ret = check - ? check_bp_exists(trans, s, &bp, k) - : bch2_bucket_backpointer_mod(trans, k, &bp, true); - if (ret) - return ret; + u64 b = PTR_BUCKET_NR(ca, &p.ptr); + if (!bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b)) { + rcu_read_unlock(); + continue; } + + bool empty = bch2_bucket_bitmap_test(&ca->bucket_backpointer_empty, b); + rcu_read_unlock(); + + struct bkey_i_backpointer bp; + bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); + + int ret = !empty + ? 
check_bp_exists(trans, s, &bp, k) + : bch2_bucket_backpointer_mod(trans, k, &bp, true); + if (ret) + return ret; } return 0; @@ -630,7 +726,7 @@ static int check_btree_root_to_backpointers(struct btree_trans *trans, retry: bch2_trans_node_iter_init(trans, &iter, btree_id, POS_MIN, 0, bch2_btree_id_root(c, btree_id)->b->c.level, 0); - b = bch2_btree_iter_peek_node(&iter); + b = bch2_btree_iter_peek_node(trans, &iter); ret = PTR_ERR_OR_ZERO(b); if (ret) goto err; @@ -649,14 +745,6 @@ err: return ret; } -static inline struct bbpos bp_to_bbpos(struct bch_backpointer bp) -{ - return (struct bbpos) { - .btree = bp.btree_id, - .pos = bp.pos, - }; -} - static u64 mem_may_pin_bytes(struct bch_fs *c) { struct sysinfo i; @@ -715,69 +803,11 @@ static int bch2_get_btree_in_memory_pos(struct btree_trans *trans, return ret; } -struct progress_indicator_state { - unsigned long next_print; - u64 nodes_seen; - u64 nodes_total; - struct btree *last_node; -}; - -static inline void progress_init(struct progress_indicator_state *s, - struct bch_fs *c, - u64 btree_id_mask) +static inline int bch2_fs_going_ro(struct bch_fs *c) { - memset(s, 0, sizeof(*s)); - - s->next_print = jiffies + HZ * 10; - - for (unsigned i = 0; i < BTREE_ID_NR; i++) { - if (!(btree_id_mask & BIT_ULL(i))) - continue; - - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_btree, - .btree.id = i, - }; - - u64 v; - bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); - s->nodes_total += div64_ul(v, btree_sectors(c)); - } -} - -static inline bool progress_update_p(struct progress_indicator_state *s) -{ - bool ret = time_after_eq(jiffies, s->next_print); - - if (ret) - s->next_print = jiffies + HZ * 10; - return ret; -} - -static void progress_update_iter(struct btree_trans *trans, - struct progress_indicator_state *s, - struct btree_iter *iter, - const char *msg) -{ - struct bch_fs *c = trans->c; - struct btree *b = path_l(btree_iter_path(trans, iter))->b; - - s->nodes_seen += b != s->last_node; - s->last_node = b; - - if (progress_update_p(s)) { - struct printbuf buf = PRINTBUF; - unsigned percent = s->nodes_total - ? div64_u64(s->nodes_seen * 100, s->nodes_total) - : 0; - - prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ", - msg, percent, s->nodes_seen, s->nodes_total); - bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos)); - - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - } + return test_bit(BCH_FS_going_ro, &c->flags) + ? 
-EROFS + : 0; } static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, @@ -787,7 +817,7 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, struct progress_indicator_state progress; int ret = 0; - progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink)); + bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_extents)|BIT_ULL(BTREE_ID_reflink)); for (enum btree_id btree_id = 0; btree_id < btree_id_nr_alive(c); @@ -806,7 +836,8 @@ static int bch2_check_extents_to_backpointers_pass(struct btree_trans *trans, BTREE_ITER_prefetch); ret = for_each_btree_key_continue(trans, iter, 0, k, ({ - progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); + bch2_progress_update_iter(trans, &progress, &iter, "extents_to_backpointers"); + bch2_fs_going_ro(c) ?: check_extent_to_backpointers(trans, s, btree_id, level, k) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); })); @@ -827,7 +858,7 @@ enum alloc_sector_counter { ALLOC_SECTORS_NR }; -static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t) +static int data_type_to_alloc_counter(enum bch_data_type t) { switch (t) { case BCH_DATA_btree: @@ -836,15 +867,17 @@ static enum alloc_sector_counter data_type_to_alloc_counter(enum bch_data_type t case BCH_DATA_cached: return ALLOC_cached; case BCH_DATA_stripe: + case BCH_DATA_parity: return ALLOC_stripe; default: - BUG(); + return -1; } } static int check_bucket_backpointers_to_extents(struct btree_trans *, struct bch_dev *, struct bpos); static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct bkey_s_c alloc_k, + bool *had_mismatch, struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; @@ -852,6 +885,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b const struct bch_alloc_v4 *a = bch2_alloc_to_v4(alloc_k, &a_convert); bool need_commit = false; + *had_mismatch = false; + if (a->data_type == BCH_DATA_sb || a->data_type == BCH_DATA_journal || a->data_type == BCH_DATA_parity) @@ -889,7 +924,11 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b if (bp.v->bucket_gen != a->gen) continue; - sectors[data_type_to_alloc_counter(bp.v->data_type)] += bp.v->bucket_len; + int alloc_counter = data_type_to_alloc_counter(bp.v->data_type); + if (alloc_counter < 0) + continue; + + sectors[alloc_counter] += bp.v->bucket_len; }; bch2_trans_iter_exit(trans, &iter); if (ret) @@ -901,9 +940,8 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b goto err; } - /* Cached pointers don't have backpointers: */ - if (sectors[ALLOC_dirty] != a->dirty_sectors || + sectors[ALLOC_cached] != a->cached_sectors || sectors[ALLOC_stripe] != a->stripe_sectors) { if (c->sb.version_upgrade_complete >= bcachefs_metadata_version_backpointer_bucket_gen) { ret = bch2_backpointers_maybe_flush(trans, alloc_k, last_flushed); @@ -912,17 +950,25 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b } if (sectors[ALLOC_dirty] > a->dirty_sectors || + sectors[ALLOC_cached] > a->cached_sectors || sectors[ALLOC_stripe] > a->stripe_sectors) { ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: -BCH_ERR_transaction_restart_nested; goto err; } - if (!sectors[ALLOC_dirty] && - !sectors[ALLOC_stripe]) - __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_empty); - else - __set_bit(alloc_k.k->p.offset, ca->bucket_backpointer_mismatches); + bool empty 
= (sectors[ALLOC_dirty] + + sectors[ALLOC_stripe] + + sectors[ALLOC_cached]) == 0; + + ret = bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_mismatch, + alloc_k.k->p.offset) ?: + (empty + ? bch2_bucket_bitmap_set(ca, &ca->bucket_backpointer_empty, + alloc_k.k->p.offset) + : 0); + + *had_mismatch = true; } err: bch2_dev_put(ca); @@ -946,8 +992,14 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) goto next; struct bpos bucket = bp_pos_to_bucket(ca, pos); - bucket.offset = find_next_bit(ca->bucket_backpointer_mismatches, - ca->mi.nbuckets, bucket.offset); + u64 next = ca->mi.nbuckets; + + unsigned long *bitmap = READ_ONCE(ca->bucket_backpointer_mismatch.buckets); + if (bitmap) + next = min_t(u64, next, + find_next_bit(bitmap, ca->mi.nbuckets, bucket.offset)); + + bucket.offset = next; if (bucket.offset == ca->mi.nbuckets) goto next; @@ -973,7 +1025,7 @@ static int btree_node_get_and_pin(struct btree_trans *trans, struct bkey_i *k, { struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree, k->k.p, 0, level, 0); - struct btree *b = bch2_btree_iter_peek_node(&iter); + struct btree *b = bch2_btree_iter_peek_node(trans, &iter); int ret = PTR_ERR_OR_ZERO(b); if (ret) goto err; @@ -1056,28 +1108,6 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) { int ret = 0; - /* - * Can't allow devices to come/go/resize while we have bucket bitmaps - * allocated - */ - lockdep_assert_held(&c->state_lock); - - for_each_member_device(c, ca) { - BUG_ON(ca->bucket_backpointer_mismatches); - ca->bucket_backpointer_mismatches = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), - sizeof(unsigned long), - GFP_KERNEL); - ca->bucket_backpointer_empty = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), - sizeof(unsigned long), - GFP_KERNEL); - if (!ca->bucket_backpointer_mismatches || - !ca->bucket_backpointer_empty) { - bch2_dev_put(ca); - ret = -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; - goto err_free_bitmaps; - } - } - struct btree_trans *trans = bch2_trans_get(c); struct extents_to_bp_state s = { .bp_start = POS_MIN }; @@ -1086,23 +1116,24 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) ret = for_each_btree_key(trans, iter, BTREE_ID_alloc, POS_MIN, BTREE_ITER_prefetch, k, ({ - check_bucket_backpointer_mismatch(trans, k, &s.last_flushed); + bool had_mismatch; + bch2_fs_going_ro(c) ?: + check_bucket_backpointer_mismatch(trans, k, &had_mismatch, &s.last_flushed); })); if (ret) goto err; - u64 nr_buckets = 0, nr_mismatches = 0, nr_empty = 0; + u64 nr_buckets = 0, nr_mismatches = 0; for_each_member_device(c, ca) { nr_buckets += ca->mi.nbuckets; - nr_mismatches += bitmap_weight(ca->bucket_backpointer_mismatches, ca->mi.nbuckets); - nr_empty += bitmap_weight(ca->bucket_backpointer_empty, ca->mi.nbuckets); + nr_mismatches += ca->bucket_backpointer_mismatch.nr; } - if (!nr_mismatches && !nr_empty) + if (!nr_mismatches) goto err; bch_info(c, "scanning for missing backpointers in %llu/%llu buckets", - nr_mismatches + nr_empty, nr_buckets); + nr_mismatches, nr_buckets); while (1) { ret = bch2_pin_backpointer_nodes_with_missing(trans, s.bp_start, &s.bp_end); @@ -1133,22 +1164,71 @@ int bch2_check_extents_to_backpointers(struct bch_fs *c) s.bp_start = bpos_successor(s.bp_end); } + + for_each_member_device(c, ca) { + bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch); + bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty); + } err: bch2_trans_put(trans); bch2_bkey_buf_exit(&s.last_flushed, c); bch2_btree_cache_unpin(c); -err_free_bitmaps: - for_each_member_device(c, 
ca) { - kvfree(ca->bucket_backpointer_empty); - ca->bucket_backpointer_empty = NULL; - kvfree(ca->bucket_backpointer_mismatches); - ca->bucket_backpointer_mismatches = NULL; - } bch_err_fn(c, ret); return ret; } +static int check_bucket_backpointer_pos_mismatch(struct btree_trans *trans, + struct bpos bucket, + bool *had_mismatch, + struct bkey_buf *last_flushed) +{ + struct btree_iter alloc_iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &alloc_iter, + BTREE_ID_alloc, bucket, + BTREE_ITER_cached); + int ret = bkey_err(k); + if (ret) + return ret; + + ret = check_bucket_backpointer_mismatch(trans, k, had_mismatch, last_flushed); + bch2_trans_iter_exit(trans, &alloc_iter); + return ret; +} + +int bch2_check_bucket_backpointer_mismatch(struct btree_trans *trans, + struct bch_dev *ca, u64 bucket, + bool copygc, + struct bkey_buf *last_flushed) +{ + struct bch_fs *c = trans->c; + bool had_mismatch; + int ret = lockrestart_do(trans, + check_bucket_backpointer_pos_mismatch(trans, POS(ca->dev_idx, bucket), + &had_mismatch, last_flushed)); + if (ret || !had_mismatch) + return ret; + + u64 nr = ca->bucket_backpointer_mismatch.nr; + u64 allowed = copygc ? ca->mi.nbuckets >> 7 : 0; + + struct printbuf buf = PRINTBUF; + __bch2_log_msg_start(ca->name, &buf); + + prt_printf(&buf, "Detected missing backpointers in bucket %llu, now have %llu/%llu with missing\n", + bucket, nr, ca->mi.nbuckets); + + bch2_run_explicit_recovery_pass(c, &buf, + BCH_RECOVERY_PASS_check_extents_to_backpointers, + nr < allowed ? RUN_RECOVERY_PASS_ratelimit : 0); + + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + return 0; +} + +/* backpointers -> extents */ + static int check_one_backpointer(struct btree_trans *trans, struct bbpos start, struct bbpos end, @@ -1206,11 +1286,11 @@ static int bch2_check_backpointers_to_extents_pass(struct btree_trans *trans, bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); - progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); + bch2_progress_init(&progress, trans->c, BIT_ULL(BTREE_ID_backpointers)); int ret = for_each_btree_key(trans, iter, BTREE_ID_backpointers, POS_MIN, BTREE_ITER_prefetch, k, ({ - progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); + bch2_progress_update_iter(trans, &progress, &iter, "backpointers_to_extents"); check_one_backpointer(trans, start, end, k, &last_flushed); })); @@ -1264,3 +1344,48 @@ int bch2_check_backpointers_to_extents(struct bch_fs *c) bch_err_fn(c, ret); return ret; } + +static int bch2_bucket_bitmap_set(struct bch_dev *ca, struct bucket_bitmap *b, u64 bit) +{ + scoped_guard(mutex, &b->lock) { + if (!b->buckets) { + b->buckets = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), + sizeof(unsigned long), GFP_KERNEL); + if (!b->buckets) + return -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; + } + + b->nr += !__test_and_set_bit(bit, b->buckets); + } + + return 0; +} + +int bch2_bucket_bitmap_resize(struct bucket_bitmap *b, u64 old_size, u64 new_size) +{ + scoped_guard(mutex, &b->lock) { + if (!b->buckets) + return 0; + + unsigned long *n = kvcalloc(BITS_TO_LONGS(new_size), + sizeof(unsigned long), GFP_KERNEL); + if (!n) + return -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; + + memcpy(n, b->buckets, + BITS_TO_LONGS(min(old_size, new_size)) * sizeof(unsigned long)); + kvfree(b->buckets); + b->buckets = n; + } + + return 0; +} + +void bch2_bucket_bitmap_free(struct bucket_bitmap *b) +{ + mutex_lock(&b->lock); + kvfree(b->buckets); + b->buckets = NULL; + b->nr = 0; + mutex_unlock(&b->lock); +} diff 
--git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 060dad1521ee..6840561084ce 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_BACKPOINTERS_BACKGROUND_H -#define _BCACHEFS_BACKPOINTERS_BACKGROUND_H +#ifndef _BCACHEFS_BACKPOINTERS_H +#define _BCACHEFS_BACKPOINTERS_H #include "btree_cache.h" #include "btree_iter.h" @@ -102,7 +102,7 @@ static inline int bch2_bucket_backpointer_mod(struct btree_trans *trans, struct bkey_i_backpointer *bp, bool insert) { - if (unlikely(bch2_backpointers_no_use_write_buffer)) + if (static_branch_unlikely(&bch2_backpointers_no_use_write_buffer)) return bch2_bucket_backpointer_mod_nowritebuffer(trans, orig_k, bp, insert); if (!insert) { @@ -123,7 +123,12 @@ static inline enum bch_data_type bch2_bkey_ptr_data_type(struct bkey_s_c k, return BCH_DATA_btree; case KEY_TYPE_extent: case KEY_TYPE_reflink_v: - return p.has_ec ? BCH_DATA_stripe : BCH_DATA_user; + if (p.has_ec) + return BCH_DATA_stripe; + if (p.ptr.cached) + return BCH_DATA_cached; + else + return BCH_DATA_user; case KEY_TYPE_stripe: { const struct bch_extent_ptr *ptr = &entry->ptr; struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); @@ -147,7 +152,20 @@ static inline void bch2_extent_ptr_to_bp(struct bch_fs *c, struct bkey_i_backpointer *bp) { bkey_backpointer_init(&bp->k_i); - bp->k.p = POS(p.ptr.dev, ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset); + bp->k.p.inode = p.ptr.dev; + + if (k.k->type != KEY_TYPE_stripe) + bp->k.p.offset = ((u64) p.ptr.offset << MAX_EXTENT_COMPRESS_RATIO_SHIFT) + p.crc.offset; + else { + /* + * Put stripe backpointers where they won't collide with the + * extent backpointers within the stripe: + */ + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + bp->k.p.offset = ((u64) (p.ptr.offset + le16_to_cpu(s.v->sectors)) << + MAX_EXTENT_COMPRESS_RATIO_SHIFT) - 1; + } + bp->v = (struct bch_backpointer) { .btree_id = btree_id, .level = level, @@ -164,8 +182,20 @@ struct bkey_s_c bch2_backpointer_get_key(struct btree_trans *, struct bkey_s_c_b struct btree *bch2_backpointer_get_node(struct btree_trans *, struct bkey_s_c_backpointer, struct btree_iter *, struct bkey_buf *); +int bch2_check_bucket_backpointer_mismatch(struct btree_trans *, struct bch_dev *, u64, + bool, struct bkey_buf *); + int bch2_check_btree_backpointers(struct bch_fs *); int bch2_check_extents_to_backpointers(struct bch_fs *); int bch2_check_backpointers_to_extents(struct bch_fs *); +static inline bool bch2_bucket_bitmap_test(struct bucket_bitmap *b, u64 i) +{ + unsigned long *bitmap = READ_ONCE(b->buckets); + return bitmap && test_bit(i, bitmap); +} + +int bch2_bucket_bitmap_resize(struct bucket_bitmap *, u64, u64); +void bch2_bucket_bitmap_free(struct bucket_bitmap *); + #endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 161cf2f05d2a..7824da2af9d0 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -203,22 +203,24 @@ #include <linux/types.h> #include <linux/workqueue.h> #include <linux/zstd.h> +#include <linux/unicode.h> #include "bcachefs_format.h" #include "btree_journal_iter_types.h" #include "disk_accounting_types.h" #include "errcode.h" +#include "fast_list.h" #include "fifo.h" #include "nocow_locking_types.h" #include "opts.h" -#include "recovery_passes_types.h" #include "sb-errors_types.h" #include "seqmutex.h" +#include "snapshot_types.h" #include "time_stats.h" #include "util.h" #ifdef 
CONFIG_BCACHEFS_DEBUG -#define BCH_WRITE_REF_DEBUG +#define ENUMERATED_REF_DEBUG #endif #ifndef dynamic_fault @@ -268,7 +270,8 @@ do { \ #define bch2_fmt(_c, fmt) bch2_log_msg(_c, fmt "\n") -void bch2_print_str(struct bch_fs *, const char *); +void bch2_print_str(struct bch_fs *, const char *, const char *); +void bch2_print_str_nonblocking(struct bch_fs *, const char *, const char *); __printf(2, 3) void bch2_print_opts(struct bch_opts *, const char *, ...); @@ -292,6 +295,16 @@ do { \ bch2_print(_c, __VA_ARGS__); \ } while (0) +#define bch2_print_str_ratelimited(_c, ...) \ +do { \ + static DEFINE_RATELIMIT_STATE(_rs, \ + DEFAULT_RATELIMIT_INTERVAL, \ + DEFAULT_RATELIMIT_BURST); \ + \ + if (__ratelimit(&_rs)) \ + bch2_print_str(_c, __VA_ARGS__); \ +} while (0) + #define bch_info(c, fmt, ...) \ bch2_print(c, KERN_INFO bch2_fmt(c, fmt), ##__VA_ARGS__) #define bch_info_ratelimited(c, fmt, ...) \ @@ -389,17 +402,20 @@ do { \ "compare them") \ BCH_DEBUG_PARAM(backpointers_no_use_write_buffer, \ "Don't use the write buffer for backpointers, enabling "\ - "extra runtime checks") - -/* Parameters that should only be compiled in debug mode: */ -#define BCH_DEBUG_PARAMS_DEBUG() \ - BCH_DEBUG_PARAM(expensive_debug_checks, \ - "Enables various runtime debugging checks that " \ - "significantly affect performance") \ + "extra runtime checks") \ + BCH_DEBUG_PARAM(debug_check_btree_locking, \ + "Enable additional asserts for btree locking") \ BCH_DEBUG_PARAM(debug_check_iterators, \ "Enables extra verification for btree iterators") \ + BCH_DEBUG_PARAM(debug_check_bset_lookups, \ + "Enables extra verification for bset lookups") \ BCH_DEBUG_PARAM(debug_check_btree_accounting, \ "Verify btree accounting for keys within a node") \ + BCH_DEBUG_PARAM(debug_check_bkey_unpack, \ + "Enables extra verification for bkey unpack") + +/* Parameters that should only be compiled in debug mode: */ +#define BCH_DEBUG_PARAMS_DEBUG() \ BCH_DEBUG_PARAM(journal_seq_verify, \ "Store the journal sequence number in the version " \ "number of every btree key, and verify that btree " \ @@ -426,28 +442,28 @@ do { \ #define BCH_DEBUG_PARAMS() BCH_DEBUG_PARAMS_ALWAYS() #endif -#define BCH_DEBUG_PARAM(name, description) extern bool bch2_##name; -BCH_DEBUG_PARAMS() +#define BCH_DEBUG_PARAM(name, description) extern struct static_key_false bch2_##name; +BCH_DEBUG_PARAMS_ALL() #undef BCH_DEBUG_PARAM -#ifndef CONFIG_BCACHEFS_DEBUG -#define BCH_DEBUG_PARAM(name, description) static const __maybe_unused bool bch2_##name; -BCH_DEBUG_PARAMS_DEBUG() -#undef BCH_DEBUG_PARAM -#endif - #define BCH_TIME_STATS() \ x(btree_node_mem_alloc) \ x(btree_node_split) \ x(btree_node_compact) \ x(btree_node_merge) \ x(btree_node_sort) \ + x(btree_node_get) \ x(btree_node_read) \ x(btree_node_read_done) \ + x(btree_node_write) \ x(btree_interior_update_foreground) \ x(btree_interior_update_total) \ x(btree_gc) \ x(data_write) \ + x(data_write_to_submit) \ + x(data_write_to_queue) \ + x(data_write_to_btree_update) \ + x(data_write_btree_update) \ x(data_read) \ x(data_promote) \ x(journal_flush_write) \ @@ -456,6 +472,7 @@ BCH_DEBUG_PARAMS_DEBUG() x(blocked_journal_low_on_space) \ x(blocked_journal_low_on_pin) \ x(blocked_journal_max_in_flight) \ + x(blocked_journal_max_open) \ x(blocked_key_cache_flush) \ x(blocked_allocate) \ x(blocked_allocate_open_bucket) \ @@ -470,6 +487,7 @@ enum bch_time_stats { }; #include "alloc_types.h" +#include "async_objs_types.h" #include "btree_gc_types.h" #include "btree_types.h" #include "btree_node_scan_types.h" @@ -479,10 
+497,12 @@ enum bch_time_stats { #include "clock_types.h" #include "disk_groups_types.h" #include "ec_types.h" +#include "enumerated_ref_types.h" #include "journal_types.h" #include "keylist_types.h" #include "quota_types.h" #include "rebalance_types.h" +#include "recovery_passes_types.h" #include "replicas_types.h" #include "sb-members_types.h" #include "subvolume_types.h" @@ -511,6 +531,57 @@ struct discard_in_flight { u64 bucket:63; }; +#define BCH_DEV_READ_REFS() \ + x(bch2_online_devs) \ + x(trans_mark_dev_sbs) \ + x(read_fua_test) \ + x(sb_field_resize) \ + x(write_super) \ + x(journal_read) \ + x(fs_journal_alloc) \ + x(fs_resize_on_mount) \ + x(btree_node_read) \ + x(btree_node_read_all_replicas) \ + x(btree_node_scrub) \ + x(btree_node_write) \ + x(btree_node_scan) \ + x(btree_verify_replicas) \ + x(btree_node_ondisk_to_text) \ + x(io_read) \ + x(check_extent_checksums) \ + x(ec_block) + +enum bch_dev_read_ref { +#define x(n) BCH_DEV_READ_REF_##n, + BCH_DEV_READ_REFS() +#undef x + BCH_DEV_READ_REF_NR, +}; + +#define BCH_DEV_WRITE_REFS() \ + x(journal_write) \ + x(journal_do_discards) \ + x(dev_do_discards) \ + x(discard_one_bucket_fast) \ + x(do_invalidates) \ + x(nocow_flush) \ + x(io_write) \ + x(ec_block) \ + x(ec_bucket_zero) + +enum bch_dev_write_ref { +#define x(n) BCH_DEV_WRITE_REF_##n, + BCH_DEV_WRITE_REFS() +#undef x + BCH_DEV_WRITE_REF_NR, +}; + +struct bucket_bitmap { + unsigned long *buckets; + u64 nr; + struct mutex lock; +}; + struct bch_dev { struct kobject kobj; #ifdef CONFIG_BCACHEFS_DEBUG @@ -521,8 +592,7 @@ struct bch_dev { struct percpu_ref ref; #endif struct completion ref_completion; - struct percpu_ref io_ref; - struct completion io_ref_completion; + struct enumerated_ref io_ref[2]; struct bch_fs *fs; @@ -533,6 +603,7 @@ struct bch_dev { */ struct bch_member_cpu mi; atomic64_t errors[BCH_MEMBER_ERROR_NR]; + unsigned long write_errors_start; __uuid_t uuid; char name[BDEVNAME_SIZE]; @@ -555,10 +626,11 @@ struct bch_dev { u8 *oldest_gen; unsigned long *buckets_nouse; - unsigned long *bucket_backpointer_mismatches; - unsigned long *bucket_backpointer_empty; + struct bucket_bitmap bucket_backpointer_mismatch; + struct bucket_bitmap bucket_backpointer_empty; - struct bch_dev_usage __percpu *usage; + struct bch_dev_usage_full __percpu + *usage; /* Allocator: */ u64 alloc_cursor[3]; @@ -567,10 +639,6 @@ struct bch_dev { unsigned nr_partial_buckets; unsigned nr_btree_reserve; - size_t inc_gen_needs_gc; - size_t inc_gen_really_needs_gc; - size_t buckets_waiting_on_journal; - struct work_struct invalidate_work; struct work_struct discard_work; struct mutex discard_buckets_in_flight_lock; @@ -609,21 +677,23 @@ struct bch_dev { x(accounting_replay_done) \ x(may_go_rw) \ x(rw) \ + x(rw_init_done) \ x(was_rw) \ x(stopping) \ x(emergency_ro) \ x(going_ro) \ x(write_disable_complete) \ x(clean_shutdown) \ - x(recovery_running) \ - x(fsck_running) \ + x(in_recovery) \ + x(in_fsck) \ x(initial_gc_unfixed) \ x(need_delete_dead_snapshots) \ x(error) \ x(topology_error) \ x(errors_fixed) \ x(errors_not_fixed) \ - x(no_invalid_checks) + x(no_invalid_checks) \ + x(discard_mount_opt_set) \ enum bch_fs_flags { #define x(n) BCH_FS_##n, @@ -642,8 +712,10 @@ struct btree_transaction_stats { struct bch2_time_stats lock_hold_times; struct mutex lock; unsigned nr_max_paths; - unsigned journal_entries_size; unsigned max_mem; +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_trans_kmalloc_trace trans_kmalloc_trace; +#endif char *max_paths_text; }; @@ -664,9 +736,6 @@ struct 
btree_trans_buf { struct btree_trans *trans; }; -#define BCACHEFS_ROOT_SUBVOL_INUM \ - ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) - #define BCH_WRITE_REFS() \ x(journal) \ x(trans) \ @@ -687,7 +756,9 @@ struct btree_trans_buf { x(gc_gens) \ x(snapshot_delete_pagecache) \ x(sysfs) \ - x(btree_write_buffer) + x(btree_write_buffer) \ + x(btree_node_scrub) \ + x(async_recovery_passes) enum bch_write_ref { #define x(n) BCH_WRITE_REF_##n, @@ -696,6 +767,8 @@ enum bch_write_ref { BCH_WRITE_REF_NR, }; +#define BCH_FS_DEFAULT_UTF8_ENCODING UNICODE_AGE(12, 1, 0) + struct bch_fs { struct closure cl; @@ -719,11 +792,7 @@ struct bch_fs { struct rw_semaphore state_lock; /* Counts outstanding writes, for clean transition to read-only */ -#ifdef BCH_WRITE_REF_DEBUG - atomic_long_t writes[BCH_WRITE_REF_NR]; -#else - struct percpu_ref writes; -#endif + struct enumerated_ref writes; /* * Certain operations are only allowed in single threaded mode, during * recovery, and we want to assert that this is the case: @@ -767,6 +836,7 @@ struct bch_fs { u8 nr_devices; u8 clean; + bool multi_device; /* true if we've ever had more than one device */ u8 encryption_type; @@ -776,10 +846,16 @@ struct bch_fs { unsigned nsec_per_time_unit; u64 features; u64 compat; + u64 recovery_passes_required; unsigned long errors_silent[BITS_TO_LONGS(BCH_FSCK_ERR_MAX)]; u64 btrees_lost_data; } sb; + DARRAY(enum bcachefs_metadata_version) + incompat_versions_requested; +#ifdef CONFIG_UNICODE + struct unicode_map *cf_encoding; +#endif struct bch_sb_handle disk_sb; @@ -795,7 +871,7 @@ struct bch_fs { struct mutex snapshot_table_lock; struct rw_semaphore snapshot_create_lock; - struct work_struct snapshot_delete_work; + struct snapshot_delete snapshot_delete; struct work_struct snapshot_wait_for_pagecache_and_delete_work; snapshot_id_list snapshots_unlinked; struct mutex snapshots_unlinked_lock; @@ -860,7 +936,7 @@ struct bch_fs { struct btree_write_buffer btree_write_buffer; struct workqueue_struct *btree_update_wq; - struct workqueue_struct *btree_io_complete_wq; + struct workqueue_struct *btree_write_complete_wq; /* copygc needs its own workqueue for index updates.. */ struct workqueue_struct *copygc_wq; /* @@ -871,6 +947,7 @@ struct bch_fs { struct workqueue_struct *write_ref_wq; /* ALLOCATION */ + struct bch_devs_mask online_devs; struct bch_devs_mask rw_devs[BCH_DATA_NR]; unsigned long rw_devs_change_count; @@ -965,13 +1042,16 @@ struct bch_fs { nocow_locks; struct rhashtable promote_table; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + struct async_obj_list async_objs[BCH_ASYNC_OBJ_NR]; +#endif + mempool_t compression_bounce[2]; mempool_t compress_workspace[BCH_COMPRESSION_OPT_NR]; size_t zstd_workspace_size; - struct crypto_shash *sha256; - struct crypto_sync_skcipher *chacha20; - struct crypto_shash *poly1305; + struct bch_key chacha20_key; + bool chacha20_key_set; atomic64_t key_version; @@ -993,15 +1073,11 @@ struct bch_fs { wait_queue_head_t copygc_running_wq; /* STRIPES: */ - GENRADIX(struct stripe) stripes; GENRADIX(struct gc_stripe) gc_stripes; struct hlist_head ec_stripes_new[32]; spinlock_t ec_stripes_new_lock; - ec_stripes_heap ec_stripes_heap; - struct mutex ec_stripes_heap_lock; - /* ERASURE CODING */ struct list_head ec_stripe_head_list; struct mutex ec_stripe_head_lock; @@ -1039,25 +1115,12 @@ struct bch_fs { /* RECOVERY */ u64 journal_replay_seq_start; u64 journal_replay_seq_end; - /* - * Two different uses: - * "Has this fsck pass?" - i.e. 
should this type of error be an - * emergency read-only - * And, in certain situations fsck will rewind to an earlier pass: used - * for signaling to the toplevel code which pass we want to run now. - */ - enum bch_recovery_pass curr_recovery_pass; - enum bch_recovery_pass next_recovery_pass; - /* bitmask of recovery passes that we actually ran */ - u64 recovery_passes_complete; - /* never rewinds version of curr_recovery_pass */ - enum bch_recovery_pass recovery_pass_done; - spinlock_t recovery_pass_lock; - struct semaphore online_fsck_mutex; + struct bch_fs_recovery recovery; /* DEBUG JUNK */ struct dentry *fs_debug_dir; struct dentry *btree_debug_dir; + struct dentry *async_obj_dir; struct btree_debug btree_debug[BTREE_ID_NR]; struct btree *verify_data; struct btree_node *verify_ondisk; @@ -1099,54 +1162,6 @@ struct bch_fs { extern struct wait_queue_head bch2_read_only_wait; -static inline void bch2_write_ref_get(struct bch_fs *c, enum bch_write_ref ref) -{ -#ifdef BCH_WRITE_REF_DEBUG - atomic_long_inc(&c->writes[ref]); -#else - percpu_ref_get(&c->writes); -#endif -} - -static inline bool __bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) -{ -#ifdef BCH_WRITE_REF_DEBUG - return !test_bit(BCH_FS_going_ro, &c->flags) && - atomic_long_inc_not_zero(&c->writes[ref]); -#else - return percpu_ref_tryget(&c->writes); -#endif -} - -static inline bool bch2_write_ref_tryget(struct bch_fs *c, enum bch_write_ref ref) -{ -#ifdef BCH_WRITE_REF_DEBUG - return !test_bit(BCH_FS_going_ro, &c->flags) && - atomic_long_inc_not_zero(&c->writes[ref]); -#else - return percpu_ref_tryget_live(&c->writes); -#endif -} - -static inline void bch2_write_ref_put(struct bch_fs *c, enum bch_write_ref ref) -{ -#ifdef BCH_WRITE_REF_DEBUG - long v = atomic_long_dec_return(&c->writes[ref]); - - BUG_ON(v < 0); - if (v) - return; - for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) - if (atomic_long_read(&c->writes[i])) - return; - - set_bit(BCH_FS_write_disable_complete, &c->flags); - wake_up(&bch2_read_only_wait); -#else - percpu_ref_put(&c->writes); -#endif -} - static inline bool bch2_ro_ref_tryget(struct bch_fs *c) { if (test_bit(BCH_FS_stopping, &c->flags)) @@ -1247,4 +1262,17 @@ static inline unsigned data_replicas_required(struct bch_fs *c) #define BKEY_PADDED_ONSTACK(key, pad) \ struct { struct bkey_i key; __u64 key ## _pad[pad]; } +/* + * This is needed because discard is both a filesystem option and a device + * option, and mount options are supposed to apply to that mount and not be + * persisted, i.e. if it's set as a mount option we can't propagate it to the + * device. + */ +static inline bool bch2_discard_opt_enabled(struct bch_fs *c, struct bch_dev *ca) +{ + return test_bit(BCH_FS_discard_mount_opt_set, &c->flags) + ? c->opts.discard + : ca->mi.discard; +} + #endif /* _BCACHEFS_H */ diff --git a/fs/bcachefs/bcachefs_format.h b/fs/bcachefs/bcachefs_format.h index f70f0108401f..b4a04df5ea95 100644 --- a/fs/bcachefs/bcachefs_format.h +++ b/fs/bcachefs/bcachefs_format.h @@ -366,6 +366,10 @@ static inline void bkey_init(struct bkey *k) #define __BKEY_PADDED(key, pad) \ struct bkey_i key; __u64 key ## _pad[pad] +enum bch_bkey_type_flags { + BKEY_TYPE_strict_btree_checks = BIT(0), +}; + /* * - DELETED keys are used internally to mark keys that should be ignored but * override keys in composition order. Their version number is ignored. 
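/*
 * The hunk below widens the BCH_BKEY_TYPES() x-macro from (name, nr) to
 * (name, nr, flags), so each key type can carry bch_bkey_type_flags bits such
 * as BKEY_TYPE_strict_btree_checks; consumers that don't care about the new
 * column absorb it with "...". A minimal sketch of the same pattern, assuming
 * nothing beyond standard C -- EXAMPLE_TYPES and the EXAMPLE_* names are
 * illustrative, not from the patch:
 */
#define EXAMPLE_FLAG_strict	(1U << 0)

#define EXAMPLE_TYPES()				\
	x(foo,	0,	0)			\
	x(bar,	1,	EXAMPLE_FLAG_strict)

enum example_type {
#define x(name, nr, flags)	EXAMPLE_##name = nr,
	EXAMPLE_TYPES()
#undef x
};

static const unsigned example_type_flags[] = {
#define x(name, nr, flags)	[EXAMPLE_##name] = flags,
	EXAMPLE_TYPES()
#undef x
};
/*
 * Keeping one x-macro as the single source of truth means the enum and the
 * flags table cannot drift apart -- the property the patch relies on for its
 * bch2_bkey_type_flags[] lookup in bkey_methods.c.
 */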
@@ -383,46 +387,46 @@ static inline void bkey_init(struct bkey *k) * * - WHITEOUT: for hash table btrees */ -#define BCH_BKEY_TYPES() \ - x(deleted, 0) \ - x(whiteout, 1) \ - x(error, 2) \ - x(cookie, 3) \ - x(hash_whiteout, 4) \ - x(btree_ptr, 5) \ - x(extent, 6) \ - x(reservation, 7) \ - x(inode, 8) \ - x(inode_generation, 9) \ - x(dirent, 10) \ - x(xattr, 11) \ - x(alloc, 12) \ - x(quota, 13) \ - x(stripe, 14) \ - x(reflink_p, 15) \ - x(reflink_v, 16) \ - x(inline_data, 17) \ - x(btree_ptr_v2, 18) \ - x(indirect_inline_data, 19) \ - x(alloc_v2, 20) \ - x(subvolume, 21) \ - x(snapshot, 22) \ - x(inode_v2, 23) \ - x(alloc_v3, 24) \ - x(set, 25) \ - x(lru, 26) \ - x(alloc_v4, 27) \ - x(backpointer, 28) \ - x(inode_v3, 29) \ - x(bucket_gens, 30) \ - x(snapshot_tree, 31) \ - x(logged_op_truncate, 32) \ - x(logged_op_finsert, 33) \ - x(accounting, 34) \ - x(inode_alloc_cursor, 35) +#define BCH_BKEY_TYPES() \ + x(deleted, 0, 0) \ + x(whiteout, 1, 0) \ + x(error, 2, 0) \ + x(cookie, 3, 0) \ + x(hash_whiteout, 4, BKEY_TYPE_strict_btree_checks) \ + x(btree_ptr, 5, BKEY_TYPE_strict_btree_checks) \ + x(extent, 6, BKEY_TYPE_strict_btree_checks) \ + x(reservation, 7, BKEY_TYPE_strict_btree_checks) \ + x(inode, 8, BKEY_TYPE_strict_btree_checks) \ + x(inode_generation, 9, BKEY_TYPE_strict_btree_checks) \ + x(dirent, 10, BKEY_TYPE_strict_btree_checks) \ + x(xattr, 11, BKEY_TYPE_strict_btree_checks) \ + x(alloc, 12, BKEY_TYPE_strict_btree_checks) \ + x(quota, 13, BKEY_TYPE_strict_btree_checks) \ + x(stripe, 14, BKEY_TYPE_strict_btree_checks) \ + x(reflink_p, 15, BKEY_TYPE_strict_btree_checks) \ + x(reflink_v, 16, BKEY_TYPE_strict_btree_checks) \ + x(inline_data, 17, BKEY_TYPE_strict_btree_checks) \ + x(btree_ptr_v2, 18, BKEY_TYPE_strict_btree_checks) \ + x(indirect_inline_data, 19, BKEY_TYPE_strict_btree_checks) \ + x(alloc_v2, 20, BKEY_TYPE_strict_btree_checks) \ + x(subvolume, 21, BKEY_TYPE_strict_btree_checks) \ + x(snapshot, 22, BKEY_TYPE_strict_btree_checks) \ + x(inode_v2, 23, BKEY_TYPE_strict_btree_checks) \ + x(alloc_v3, 24, BKEY_TYPE_strict_btree_checks) \ + x(set, 25, 0) \ + x(lru, 26, BKEY_TYPE_strict_btree_checks) \ + x(alloc_v4, 27, BKEY_TYPE_strict_btree_checks) \ + x(backpointer, 28, BKEY_TYPE_strict_btree_checks) \ + x(inode_v3, 29, BKEY_TYPE_strict_btree_checks) \ + x(bucket_gens, 30, BKEY_TYPE_strict_btree_checks) \ + x(snapshot_tree, 31, BKEY_TYPE_strict_btree_checks) \ + x(logged_op_truncate, 32, BKEY_TYPE_strict_btree_checks) \ + x(logged_op_finsert, 33, BKEY_TYPE_strict_btree_checks) \ + x(accounting, 34, BKEY_TYPE_strict_btree_checks) \ + x(inode_alloc_cursor, 35, BKEY_TYPE_strict_btree_checks) enum bch_bkey_type { -#define x(name, nr) KEY_TYPE_##name = nr, +#define x(name, nr, ...) 
KEY_TYPE_##name = nr, BCH_BKEY_TYPES() #undef x KEY_TYPE_MAX, @@ -493,7 +497,8 @@ struct bch_sb_field { x(members_v2, 11) \ x(errors, 12) \ x(ext, 13) \ - x(downgrade, 14) + x(downgrade, 14) \ + x(recovery_passes, 15) #include "alloc_background_format.h" #include "dirent_format.h" @@ -506,6 +511,7 @@ struct bch_sb_field { #include "logged_ops_format.h" #include "lru_format.h" #include "quota_format.h" +#include "recovery_passes_format.h" #include "reflink_format.h" #include "replicas_format.h" #include "snapshot_format.h" @@ -686,7 +692,15 @@ struct bch_sb_field_ext { x(inode_depth, BCH_VERSION(1, 17)) \ x(persistent_inode_cursors, BCH_VERSION(1, 18)) \ x(autofix_errors, BCH_VERSION(1, 19)) \ - x(directory_size, BCH_VERSION(1, 20)) + x(directory_size, BCH_VERSION(1, 20)) \ + x(cached_backpointers, BCH_VERSION(1, 21)) \ + x(stripe_backpointers, BCH_VERSION(1, 22)) \ + x(stripe_lru, BCH_VERSION(1, 23)) \ + x(casefolding, BCH_VERSION(1, 24)) \ + x(extent_flags, BCH_VERSION(1, 25)) \ + x(snapshot_deletion_v2, BCH_VERSION(1, 26)) \ + x(fast_device_removal, BCH_VERSION(1, 27)) \ + x(inode_has_case_insensitive, BCH_VERSION(1, 28)) enum bcachefs_metadata_version { bcachefs_metadata_version_min = 9, @@ -837,6 +851,7 @@ LE64_BITMASK(BCH_SB_SHARD_INUMS, struct bch_sb, flags[3], 28, 29); LE64_BITMASK(BCH_SB_INODES_USE_KEY_CACHE,struct bch_sb, flags[3], 29, 30); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DELAY,struct bch_sb, flags[3], 30, 62); LE64_BITMASK(BCH_SB_JOURNAL_FLUSH_DISABLED,struct bch_sb, flags[3], 62, 63); +LE64_BITMASK(BCH_SB_MULTI_DEVICE, struct bch_sb, flags[3], 63, 64); LE64_BITMASK(BCH_SB_JOURNAL_RECLAIM_DELAY,struct bch_sb, flags[4], 0, 32); LE64_BITMASK(BCH_SB_JOURNAL_TRANSACTION_NAMES,struct bch_sb, flags[4], 32, 33); LE64_BITMASK(BCH_SB_NOCOW, struct bch_sb, flags[4], 33, 34); @@ -855,6 +870,11 @@ LE64_BITMASK(BCH_SB_VERSION_INCOMPAT, struct bch_sb, flags[5], 32, 48); LE64_BITMASK(BCH_SB_VERSION_INCOMPAT_ALLOWED, struct bch_sb, flags[5], 48, 64); LE64_BITMASK(BCH_SB_SHARD_INUMS_NBITS, struct bch_sb, flags[6], 0, 4); +LE64_BITMASK(BCH_SB_WRITE_ERROR_TIMEOUT,struct bch_sb, flags[6], 4, 14); +LE64_BITMASK(BCH_SB_CSUM_ERR_RETRY_NR, struct bch_sb, flags[6], 14, 20); +LE64_BITMASK(BCH_SB_DEGRADED_ACTION, struct bch_sb, flags[6], 20, 22); +LE64_BITMASK(BCH_SB_CASEFOLD, struct bch_sb, flags[6], 22, 23); +LE64_BITMASK(BCH_SB_REBALANCE_AC_ONLY, struct bch_sb, flags[6], 23, 24); static inline __u64 BCH_SB_COMPRESSION_TYPE(const struct bch_sb *sb) { @@ -908,7 +928,10 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u x(journal_no_flush, 16) \ x(alloc_v2, 17) \ x(extents_across_btree_nodes, 18) \ - x(incompat_version_field, 19) + x(incompat_version_field, 19) \ + x(casefolding, 20) \ + x(no_alloc_info, 21) \ + x(small_image, 22) #define BCH_SB_FEATURES_ALWAYS \ (BIT_ULL(BCH_FEATURE_new_extent_overwrite)| \ @@ -922,7 +945,8 @@ static inline void SET_BCH_SB_BACKGROUND_COMPRESSION_TYPE(struct bch_sb *sb, __u BIT_ULL(BCH_FEATURE_new_siphash)| \ BIT_ULL(BCH_FEATURE_btree_ptr_v2)| \ BIT_ULL(BCH_FEATURE_new_varint)| \ - BIT_ULL(BCH_FEATURE_journal_no_flush)) + BIT_ULL(BCH_FEATURE_journal_no_flush)| \ + BIT_ULL(BCH_FEATURE_incompat_version_field)) enum bch_sb_feature { #define x(f, n) BCH_FEATURE_##f, @@ -974,6 +998,19 @@ enum bch_error_actions { BCH_ON_ERROR_NR }; +#define BCH_DEGRADED_ACTIONS() \ + x(ask, 0) \ + x(yes, 1) \ + x(very, 2) \ + x(no, 3) + +enum bch_degraded_actions { +#define x(t, n) BCH_DEGRADED_##t = n, + BCH_DEGRADED_ACTIONS() +#undef x + 
BCH_DEGRADED_ACTIONS_NR +}; + #define BCH_STR_HASH_TYPES() \ x(crc32c, 0) \ x(crc64, 1) \ @@ -1133,7 +1170,8 @@ static inline __u64 __bset_magic(struct bch_sb *sb) x(log, 9) \ x(overwrite, 10) \ x(write_buffer_keys, 11) \ - x(datetime, 12) + x(datetime, 12) \ + x(log_bkey, 13) enum bch_jset_entry_type { #define x(f, nr) BCH_JSET_ENTRY_##f = nr, diff --git a/fs/bcachefs/bcachefs_ioctl.h b/fs/bcachefs/bcachefs_ioctl.h index 3c23bdf788ce..52594e925eb7 100644 --- a/fs/bcachefs/bcachefs_ioctl.h +++ b/fs/bcachefs/bcachefs_ioctl.h @@ -87,6 +87,7 @@ struct bch_ioctl_incremental { #define BCH_IOCTL_FSCK_OFFLINE _IOW(0xbc, 19, struct bch_ioctl_fsck_offline) #define BCH_IOCTL_FSCK_ONLINE _IOW(0xbc, 20, struct bch_ioctl_fsck_online) #define BCH_IOCTL_QUERY_ACCOUNTING _IOW(0xbc, 21, struct bch_ioctl_query_accounting) +#define BCH_IOCTL_QUERY_COUNTERS _IOW(0xbc, 21, struct bch_ioctl_query_counters) /* ioctl below act on a particular file, not the filesystem as a whole: */ @@ -215,6 +216,10 @@ struct bch_ioctl_data { union { struct { __u32 dev; + __u32 data_types; + } scrub; + struct { + __u32 dev; __u32 pad; } migrate; struct { @@ -229,6 +234,11 @@ enum bch_data_event { BCH_DATA_EVENT_NR = 1, }; +enum data_progress_data_type_special { + DATA_PROGRESS_DATA_TYPE_phys = 254, + DATA_PROGRESS_DATA_TYPE_done = 255, +}; + struct bch_ioctl_data_progress { __u8 data_type; __u8 btree_id; @@ -237,11 +247,19 @@ struct bch_ioctl_data_progress { __u64 sectors_done; __u64 sectors_total; + __u64 sectors_error_corrected; + __u64 sectors_error_uncorrected; } __packed __aligned(8); +enum bch_ioctl_data_event_ret { + BCH_IOCTL_DATA_EVENT_RET_done = 1, + BCH_IOCTL_DATA_EVENT_RET_device_offline = 2, +}; + struct bch_ioctl_data_event { __u8 type; - __u8 pad[7]; + __u8 ret; + __u8 pad[6]; union { struct bch_ioctl_data_progress p; __u64 pad2[15]; @@ -443,4 +461,13 @@ struct bch_ioctl_query_accounting { struct bkey_i_accounting accounting[]; }; +#define BCH_IOCTL_QUERY_COUNTERS_MOUNT (1 << 0) + +struct bch_ioctl_query_counters { + __u16 nr; + __u16 flags; + __u32 pad; + __u64 d[]; +}; + #endif /* _BCACHEFS_IOCTL_H */ diff --git a/fs/bcachefs/bkey.c b/fs/bcachefs/bkey.c index 995ba32e9b6e..ee823c640642 100644 --- a/fs/bcachefs/bkey.c +++ b/fs/bcachefs/bkey.c @@ -47,11 +47,9 @@ void bch2_bkey_packed_to_binary_text(struct printbuf *out, } } -#ifdef CONFIG_BCACHEFS_DEBUG - -static void bch2_bkey_pack_verify(const struct bkey_packed *packed, - const struct bkey *unpacked, - const struct bkey_format *format) +static void __bch2_bkey_pack_verify(const struct bkey_packed *packed, + const struct bkey *unpacked, + const struct bkey_format *format) { struct bkey tmp; @@ -95,11 +93,13 @@ static void bch2_bkey_pack_verify(const struct bkey_packed *packed, } } -#else static inline void bch2_bkey_pack_verify(const struct bkey_packed *packed, - const struct bkey *unpacked, - const struct bkey_format *format) {} -#endif + const struct bkey *unpacked, + const struct bkey_format *format) +{ + if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) + __bch2_bkey_pack_verify(packed, unpacked, format); +} struct pack_state { const struct bkey_format *format; @@ -398,7 +398,6 @@ static bool set_inc_field_lossy(struct pack_state *state, unsigned field, u64 v) return ret; } -#ifdef CONFIG_BCACHEFS_DEBUG static bool bkey_packed_successor(struct bkey_packed *out, const struct btree *b, struct bkey_packed k) @@ -455,7 +454,6 @@ static bool bkey_format_has_too_big_fields(const struct bkey_format *f) return false; } -#endif /* * Returns a packed key that 
compares <= in @@ -472,9 +470,7 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, const struct bkey_format *f = &b->format; struct pack_state state = pack_state_init(f, out); u64 *w = out->_data; -#ifdef CONFIG_BCACHEFS_DEBUG struct bpos orig = in; -#endif bool exact = true; unsigned i; @@ -527,18 +523,18 @@ enum bkey_pack_pos_ret bch2_bkey_pack_pos_lossy(struct bkey_packed *out, out->format = KEY_FORMAT_LOCAL_BTREE; out->type = KEY_TYPE_deleted; -#ifdef CONFIG_BCACHEFS_DEBUG - if (exact) { - BUG_ON(bkey_cmp_left_packed(b, out, &orig)); - } else { - struct bkey_packed successor; + if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { + if (exact) { + BUG_ON(bkey_cmp_left_packed(b, out, &orig)); + } else { + struct bkey_packed successor; - BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); - BUG_ON(bkey_packed_successor(&successor, b, *out) && - bkey_cmp_left_packed(b, &successor, &orig) < 0 && - !bkey_format_has_too_big_fields(f)); + BUG_ON(bkey_cmp_left_packed(b, out, &orig) >= 0); + BUG_ON(bkey_packed_successor(&successor, b, *out) && + bkey_cmp_left_packed(b, &successor, &orig) < 0 && + !bkey_format_has_too_big_fields(f)); + } } -#endif return exact ? BKEY_PACK_POS_EXACT : BKEY_PACK_POS_SMALLER; } @@ -627,14 +623,13 @@ struct bkey_format bch2_bkey_format_done(struct bkey_format_state *s) } } -#ifdef CONFIG_BCACHEFS_DEBUG - { + if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { struct printbuf buf = PRINTBUF; BUG_ON(bch2_bkey_format_invalid(NULL, &ret, 0, &buf)); printbuf_exit(&buf); } -#endif + return ret; } diff --git a/fs/bcachefs/bkey.h b/fs/bcachefs/bkey.h index 054e2d5e8448..3ccd521c190a 100644 --- a/fs/bcachefs/bkey.h +++ b/fs/bcachefs/bkey.h @@ -191,6 +191,7 @@ static inline struct bpos bkey_max(struct bpos l, struct bpos r) static inline bool bkey_and_val_eq(struct bkey_s_c l, struct bkey_s_c r) { return bpos_eq(l.k->p, r.k->p) && + l.k->size == r.k->size && bkey_bytes(l.k) == bkey_bytes(r.k) && !memcmp(l.v, r.v, bkey_val_bytes(l.k)); } @@ -397,8 +398,7 @@ __bkey_unpack_key_format_checked(const struct btree *b, compiled_unpack_fn unpack_fn = b->aux_data; unpack_fn(dst, src); - if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && - bch2_expensive_debug_checks) { + if (static_branch_unlikely(&bch2_debug_check_bkey_unpack)) { struct bkey dst2 = __bch2_bkey_unpack_key(&b->format, src); BUG_ON(memcmp(dst, &dst2, sizeof(*dst))); diff --git a/fs/bcachefs/bkey_methods.c b/fs/bcachefs/bkey_methods.c index 15c93576b5c2..fcd8c82cba4f 100644 --- a/fs/bcachefs/bkey_methods.c +++ b/fs/bcachefs/bkey_methods.c @@ -21,7 +21,7 @@ #include "xattr.h" const char * const bch2_bkey_types[] = { -#define x(name, nr) #name, +#define x(name, nr, ...) #name, BCH_BKEY_TYPES() #undef x NULL @@ -115,7 +115,7 @@ static bool key_type_set_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_ }) const struct bkey_ops bch2_bkey_ops[] = { -#define x(name, nr) [KEY_TYPE_##name] = bch2_bkey_ops_##name, +#define x(name, nr, ...) [KEY_TYPE_##name] = bch2_bkey_ops_##name, BCH_BKEY_TYPES() #undef x }; @@ -155,6 +155,12 @@ static u64 bch2_key_types_allowed[] = { #undef x }; +static const enum bch_bkey_type_flags bch2_bkey_type_flags[] = { +#define x(name, nr, flags) [KEY_TYPE_##name] = flags, + BCH_BKEY_TYPES() +#undef x +}; + const char *bch2_btree_node_type_str(enum btree_node_type type) { return type == BKEY_TYPE_btree ? 
"internal btree node" : bch2_btree_id_str(type - 1); @@ -177,8 +183,18 @@ int __bch2_bkey_validate(struct bch_fs *c, struct bkey_s_c k, if (type >= BKEY_TYPE_NR) return 0; - bkey_fsck_err_on(k.k->type < KEY_TYPE_MAX && - (type == BKEY_TYPE_btree || (from.flags & BCH_VALIDATE_commit)) && + enum bch_bkey_type_flags bkey_flags = k.k->type < KEY_TYPE_MAX + ? bch2_bkey_type_flags[k.k->type] + : 0; + + bool strict_key_type_allowed = + (from.flags & BCH_VALIDATE_commit) || + type == BKEY_TYPE_btree || + (from.btree < BTREE_ID_NR && + (bkey_flags & BKEY_TYPE_strict_btree_checks)); + + bkey_fsck_err_on(strict_key_type_allowed && + k.k->type < KEY_TYPE_MAX && !(bch2_key_types_allowed[type] & BIT_ULL(k.k->type)), c, bkey_invalid_type_for_btree, "invalid key type for btree %s (%s)", @@ -340,7 +356,7 @@ bool bch2_bkey_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) return ops->key_merge && bch2_bkey_maybe_mergable(l.k, r.k) && (u64) l.k->size + r.k->size <= KEY_SIZE_MAX && - !bch2_key_merging_disabled && + !static_branch_unlikely(&bch2_key_merging_disabled) && ops->key_merge(c, l, r); } diff --git a/fs/bcachefs/bset.c b/fs/bcachefs/bset.c index 9a4a83d6fd2d..32841f762eb2 100644 --- a/fs/bcachefs/bset.c +++ b/fs/bcachefs/bset.c @@ -144,8 +144,6 @@ struct btree_nr_keys bch2_btree_node_count_keys(struct btree *b) return nr; } -#ifdef CONFIG_BCACHEFS_DEBUG - void __bch2_verify_btree_nr_keys(struct btree *b) { struct btree_nr_keys nr = bch2_btree_node_count_keys(b); @@ -153,7 +151,7 @@ void __bch2_verify_btree_nr_keys(struct btree *b) BUG_ON(memcmp(&nr, &b->nr, sizeof(nr))); } -static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, +static void __bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, struct btree *b) { struct btree_node_iter iter = *_iter; @@ -190,8 +188,8 @@ static void bch2_btree_node_iter_next_check(struct btree_node_iter *_iter, } } -void bch2_btree_node_iter_verify(struct btree_node_iter *iter, - struct btree *b) +void __bch2_btree_node_iter_verify(struct btree_node_iter *iter, + struct btree *b) { struct btree_node_iter_set *set, *s2; struct bkey_packed *k, *p; @@ -237,8 +235,8 @@ found: } } -void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, - struct bkey_packed *insert, unsigned clobber_u64s) +static void __bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, + struct bkey_packed *insert, unsigned clobber_u64s) { struct bset_tree *t = bch2_bkey_to_bset(b, where); struct bkey_packed *prev = bch2_bkey_prev_all(b, t, where); @@ -285,12 +283,15 @@ void bch2_verify_insert_pos(struct btree *b, struct bkey_packed *where, #endif } -#else - -static inline void bch2_btree_node_iter_next_check(struct btree_node_iter *iter, - struct btree *b) {} +static inline void bch2_verify_insert_pos(struct btree *b, + struct bkey_packed *where, + struct bkey_packed *insert, + unsigned clobber_u64s) +{ + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) + __bch2_verify_insert_pos(b, where, insert, clobber_u64s); +} -#endif /* Auxiliary search trees */ @@ -361,9 +362,8 @@ static struct bkey_float *bkey_float(const struct btree *b, return ro_aux_tree_base(b, t)->f + idx; } -static void bset_aux_tree_verify(struct btree *b) +static void __bset_aux_tree_verify(struct btree *b) { -#ifdef CONFIG_BCACHEFS_DEBUG for_each_bset(b, t) { if (t->aux_data_offset == U16_MAX) continue; @@ -375,7 +375,12 @@ static void bset_aux_tree_verify(struct btree *b) BUG_ON(t->aux_data_offset > btree_aux_data_u64s(b)); BUG_ON(bset_aux_tree_buf_end(t) 
> btree_aux_data_u64s(b)); } -#endif +} + +static inline void bset_aux_tree_verify(struct btree *b) +{ + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) + __bset_aux_tree_verify(b); } void bch2_btree_keys_init(struct btree *b) @@ -495,15 +500,11 @@ static void rw_aux_tree_set(const struct btree *b, struct bset_tree *t, }; } -static void bch2_bset_verify_rw_aux_tree(struct btree *b, - struct bset_tree *t) +static void __bch2_bset_verify_rw_aux_tree(struct btree *b, struct bset_tree *t) { struct bkey_packed *k = btree_bkey_first(b, t); unsigned j = 0; - if (!bch2_expensive_debug_checks) - return; - BUG_ON(bset_has_ro_aux_tree(t)); if (!bset_has_rw_aux_tree(t)) @@ -530,6 +531,13 @@ start: } } +static inline void bch2_bset_verify_rw_aux_tree(struct btree *b, + struct bset_tree *t) +{ + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) + __bch2_bset_verify_rw_aux_tree(b, t); +} + /* returns idx of first entry >= offset: */ static unsigned rw_aux_tree_bsearch(struct btree *b, struct bset_tree *t, @@ -869,7 +877,7 @@ struct bkey_packed *bch2_bkey_prev_filter(struct btree *b, k = p; } - if (bch2_expensive_debug_checks) { + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { BUG_ON(ret >= orig_k); for (i = ret @@ -1195,7 +1203,7 @@ struct bkey_packed *bch2_bset_search_linear(struct btree *b, bkey_iter_pos_cmp(b, m, search) < 0) m = bkey_p_next(m); - if (bch2_expensive_debug_checks) { + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { struct bkey_packed *prev = bch2_bkey_prev_all(b, t, m); BUG_ON(prev && @@ -1435,9 +1443,9 @@ static inline void __bch2_btree_node_iter_advance(struct btree_node_iter *iter, void bch2_btree_node_iter_advance(struct btree_node_iter *iter, struct btree *b) { - if (bch2_expensive_debug_checks) { - bch2_btree_node_iter_verify(iter, b); - bch2_btree_node_iter_next_check(iter, b); + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) { + __bch2_btree_node_iter_verify(iter, b); + __bch2_btree_node_iter_next_check(iter, b); } __bch2_btree_node_iter_advance(iter, b); @@ -1453,8 +1461,7 @@ struct bkey_packed *bch2_btree_node_iter_prev_all(struct btree_node_iter *iter, struct btree_node_iter_set *set; unsigned end = 0; - if (bch2_expensive_debug_checks) - bch2_btree_node_iter_verify(iter, b); + bch2_btree_node_iter_verify(iter, b); for_each_bset(b, t) { k = bch2_bkey_prev_all(b, t, @@ -1489,8 +1496,7 @@ found: iter->data[0].k = __btree_node_key_to_offset(b, prev); iter->data[0].end = end; - if (bch2_expensive_debug_checks) - bch2_btree_node_iter_verify(iter, b); + bch2_btree_node_iter_verify(iter, b); return prev; } diff --git a/fs/bcachefs/bset.h b/fs/bcachefs/bset.h index 6953d55b72cc..a15ecf9d006e 100644 --- a/fs/bcachefs/bset.h +++ b/fs/bcachefs/bset.h @@ -517,27 +517,19 @@ void bch2_dump_bset(struct bch_fs *, struct btree *, struct bset *, unsigned); void bch2_dump_btree_node(struct bch_fs *, struct btree *); void bch2_dump_btree_node_iter(struct btree *, struct btree_node_iter *); -#ifdef CONFIG_BCACHEFS_DEBUG - void __bch2_verify_btree_nr_keys(struct btree *); -void bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); -void bch2_verify_insert_pos(struct btree *, struct bkey_packed *, - struct bkey_packed *, unsigned); - -#else +void __bch2_btree_node_iter_verify(struct btree_node_iter *, struct btree *); -static inline void __bch2_verify_btree_nr_keys(struct btree *b) {} static inline void bch2_btree_node_iter_verify(struct btree_node_iter *iter, - struct btree *b) {} -static inline void 
bch2_verify_insert_pos(struct btree *b, - struct bkey_packed *where, - struct bkey_packed *insert, - unsigned clobber_u64s) {} -#endif + struct btree *b) +{ + if (static_branch_unlikely(&bch2_debug_check_bset_lookups)) + __bch2_btree_node_iter_verify(iter, b); +} static inline void bch2_verify_btree_nr_keys(struct btree *b) { - if (bch2_debug_check_btree_accounting) + if (static_branch_unlikely(&bch2_debug_check_btree_accounting)) __bch2_verify_btree_nr_keys(b); } diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index ca755e8d1a37..8557cbd3d818 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -17,12 +17,6 @@ #include <linux/sched/mm.h> #include <linux/swap.h> -#define BTREE_CACHE_NOT_FREED_INCREMENT(counter) \ -do { \ - if (shrinker_counter) \ - bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_##counter]++; \ -} while (0) - const char * const bch2_btree_node_flags[] = { "typebit", "typebit", @@ -203,7 +197,7 @@ struct btree *__bch2_btree_node_mem_alloc(struct bch_fs *c) return NULL; } - bch2_btree_lock_init(&b->c, 0); + bch2_btree_lock_init(&b->c, 0, GFP_KERNEL); __bch2_btree_node_to_freelist(bc, b); return b; @@ -350,115 +344,118 @@ static inline struct btree *btree_cache_find(struct btree_cache *bc, return rhashtable_lookup_fast(&bc->table, &v, bch_btree_cache_params); } -/* - * this version is for btree nodes that have already been freed (we're not - * reaping a real btree node) - */ -static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush, bool shrinker_counter) +static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b, + bool flush, bool locked) { struct btree_cache *bc = &c->btree_cache; - int ret = 0; lockdep_assert_held(&bc->lock); -wait_on_io: - if (b->flags & ((1U << BTREE_NODE_dirty)| - (1U << BTREE_NODE_read_in_flight)| + + if (btree_node_noevict(b)) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++; + return -BCH_ERR_ENOMEM_btree_node_reclaim; + } + if (btree_node_write_blocked(b)) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++; + return -BCH_ERR_ENOMEM_btree_node_reclaim; + } + if (btree_node_will_make_reachable(b)) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++; + return -BCH_ERR_ENOMEM_btree_node_reclaim; + } + + if (btree_node_dirty(b)) { + if (!flush) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++; + return -BCH_ERR_ENOMEM_btree_node_reclaim; + } + + if (locked) { + /* + * Using the underscore version because we don't want to compact + * bsets after the write, since this node is about to be evicted + * - unless btree verify mode is enabled, since it runs out of + * the post write cleanup: + */ + if (static_branch_unlikely(&bch2_verify_btree_ondisk)) + bch2_btree_node_write(c, b, SIX_LOCK_intent, + BTREE_WRITE_cache_reclaim); + else + __bch2_btree_node_write(c, b, + BTREE_WRITE_cache_reclaim); + } + } + + if (b->flags & ((1U << BTREE_NODE_read_in_flight)| (1U << BTREE_NODE_write_in_flight))) { if (!flush) { - if (btree_node_dirty(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(dirty); - else if (btree_node_read_in_flight(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); + if (btree_node_read_in_flight(b)) + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++; else if (btree_node_write_in_flight(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++; return -BCH_ERR_ENOMEM_btree_node_reclaim; } + if (locked) + return -EINTR; + /* XXX: waiting on IO with btree cache lock held */ 
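/*
 * Context for the btree_cache.c hunks around this point: reclaim is split
 * into __btree_node_reclaim_checks(), run once without the node lock and
 * again after trylocking intent+write; -EINTR from the locked pass means the
 * node needs to wait on IO, so the caller drops both locks and retries from
 * the unlocked state rather than blocking while locked. A rough sketch of
 * that retry shape -- check_node, trylock_node, unlock_node and struct node
 * are illustrative stand-ins, not bcachefs APIs:
 */
#include <errno.h>
#include <stdbool.h>

struct node { bool locked; bool io_busy; };

static bool trylock_node(struct node *n)
{
	if (n->locked)
		return false;
	n->locked = true;
	return true;
}

static void unlock_node(struct node *n)
{
	n->locked = false;
}

/* -EINTR here plays the role it does in __btree_node_reclaim_checks(): */
static int check_node(struct node *n, bool locked)
{
	if (!n->io_busy)
		return 0;
	if (locked)
		return -EINTR;		/* can't block here; caller unlocks and retries */
	n->io_busy = false;		/* stand-in for waiting on IO to finish */
	return 0;
}

static int reclaim_sketch(struct node *n)
{
	int ret;
retry_unlocked:
	ret = check_node(n, false);	/* cheap checks, no node lock held */
	if (ret)
		return ret;
	if (!trylock_node(n))
		return -EBUSY;
	ret = check_node(n, true);	/* recheck under the lock */
	if (ret) {
		unlock_node(n);		/* drop locks before blocking on IO */
		if (ret == -EINTR)
			goto retry_unlocked;
		return ret;
	}
	return 0;			/* success: node is still locked, as in the real code */
}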
bch2_btree_node_wait_on_read(b); bch2_btree_node_wait_on_write(b); } + return 0; +} + +/* + * this version is for btree nodes that have already been freed (we're not + * reaping a real btree node) + */ +static int __btree_node_reclaim(struct bch_fs *c, struct btree *b, bool flush) +{ + struct btree_cache *bc = &c->btree_cache; + int ret = 0; + + lockdep_assert_held(&bc->lock); +retry_unlocked: + ret = __btree_node_reclaim_checks(c, b, flush, false); + if (ret) + return ret; + if (!six_trylock_intent(&b->c.lock)) { - BTREE_CACHE_NOT_FREED_INCREMENT(lock_intent); + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_intent]++; return -BCH_ERR_ENOMEM_btree_node_reclaim; } if (!six_trylock_write(&b->c.lock)) { - BTREE_CACHE_NOT_FREED_INCREMENT(lock_write); - goto out_unlock_intent; + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_write]++; + six_unlock_intent(&b->c.lock); + return -BCH_ERR_ENOMEM_btree_node_reclaim; } /* recheck under lock */ - if (b->flags & ((1U << BTREE_NODE_read_in_flight)| - (1U << BTREE_NODE_write_in_flight))) { - if (!flush) { - if (btree_node_read_in_flight(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(read_in_flight); - else if (btree_node_write_in_flight(b)) - BTREE_CACHE_NOT_FREED_INCREMENT(write_in_flight); - goto out_unlock; - } + ret = __btree_node_reclaim_checks(c, b, flush, true); + if (ret) { six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); - goto wait_on_io; - } - - if (btree_node_noevict(b)) { - BTREE_CACHE_NOT_FREED_INCREMENT(noevict); - goto out_unlock; - } - if (btree_node_write_blocked(b)) { - BTREE_CACHE_NOT_FREED_INCREMENT(write_blocked); - goto out_unlock; - } - if (btree_node_will_make_reachable(b)) { - BTREE_CACHE_NOT_FREED_INCREMENT(will_make_reachable); - goto out_unlock; + if (ret == -EINTR) + goto retry_unlocked; + return ret; } - if (btree_node_dirty(b)) { - if (!flush) { - BTREE_CACHE_NOT_FREED_INCREMENT(dirty); - goto out_unlock; - } - /* - * Using the underscore version because we don't want to compact - * bsets after the write, since this node is about to be evicted - * - unless btree verify mode is enabled, since it runs out of - * the post write cleanup: - */ - if (bch2_verify_btree_ondisk) - bch2_btree_node_write(c, b, SIX_LOCK_intent, - BTREE_WRITE_cache_reclaim); - else - __bch2_btree_node_write(c, b, - BTREE_WRITE_cache_reclaim); - - six_unlock_write(&b->c.lock); - six_unlock_intent(&b->c.lock); - goto wait_on_io; - } -out: if (b->hash_val && !ret) trace_and_count(c, btree_cache_reap, c, b); - return ret; -out_unlock: - six_unlock_write(&b->c.lock); -out_unlock_intent: - six_unlock_intent(&b->c.lock); - ret = -BCH_ERR_ENOMEM_btree_node_reclaim; - goto out; + return 0; } -static int btree_node_reclaim(struct bch_fs *c, struct btree *b, bool shrinker_counter) +static int btree_node_reclaim(struct bch_fs *c, struct btree *b) { - return __btree_node_reclaim(c, b, false, shrinker_counter); + return __btree_node_reclaim(c, b, false); } static int btree_node_write_and_reclaim(struct bch_fs *c, struct btree *b) { - return __btree_node_reclaim(c, b, true, false); + return __btree_node_reclaim(c, b, true); } static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, @@ -476,7 +473,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, unsigned long ret = SHRINK_STOP; bool trigger_writes = atomic_long_read(&bc->nr_dirty) + nr >= list->nr * 3 / 4; - if (bch2_btree_shrinker_disabled) + if (static_branch_unlikely(&bch2_btree_shrinker_disabled)) return SHRINK_STOP; mutex_lock(&bc->lock); @@ -490,7 +487,10 @@ static unsigned 
long bch2_btree_cache_scan(struct shrinker *shrink, * IO can always make forward progress: */ can_free = btree_cache_can_free(list); - nr = min_t(unsigned long, nr, can_free); + if (nr > can_free) { + bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_cache_reserve] += nr - can_free; + nr = can_free; + } i = 0; list_for_each_entry_safe(b, t, &bc->freeable, list) { @@ -506,7 +506,7 @@ static unsigned long bch2_btree_cache_scan(struct shrinker *shrink, if (touched >= nr) goto out; - if (!btree_node_reclaim(c, b, true)) { + if (!btree_node_reclaim(c, b)) { btree_node_data_free(bc, b); six_unlock_write(&b->c.lock); six_unlock_intent(&b->c.lock); @@ -522,7 +522,7 @@ restart: clear_btree_node_accessed(b); bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_access_bit]++; --touched;; - } else if (!btree_node_reclaim(c, b, true)) { + } else if (!btree_node_reclaim(c, b)) { __bch2_btree_node_hash_remove(bc, b); __btree_node_data_free(bc, b); @@ -569,7 +569,7 @@ static unsigned long bch2_btree_cache_count(struct shrinker *shrink, { struct btree_cache_list *list = shrink->private_data; - if (bch2_btree_shrinker_disabled) + if (static_branch_unlikely(&bch2_btree_shrinker_disabled)) return 0; return btree_cache_can_free(list); @@ -610,6 +610,7 @@ void bch2_fs_btree_cache_exit(struct bch_fs *c) btree_node_write_in_flight(b)); btree_node_data_free(bc, b); + cond_resched(); } BUG_ON(!bch2_journal_error(&c->journal) && @@ -754,7 +755,7 @@ static struct btree *btree_node_cannibalize(struct bch_fs *c) for (unsigned i = 0; i < ARRAY_SIZE(bc->live); i++) list_for_each_entry_reverse(b, &bc->live[i].list, list) - if (!btree_node_reclaim(c, b, false)) + if (!btree_node_reclaim(c, b)) return b; while (1) { @@ -789,23 +790,24 @@ struct btree *bch2_btree_node_mem_alloc(struct btree_trans *trans, bool pcpu_rea * disk node. Check the freed list before allocating a new one: */ list_for_each_entry(b, freed, list) - if (!btree_node_reclaim(c, b, false)) { + if (!btree_node_reclaim(c, b)) { list_del_init(&b->list); goto got_node; } b = __btree_node_mem_alloc(c, GFP_NOWAIT|__GFP_NOWARN); - if (!b) { + if (b) { + bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_NOWAIT); + } else { mutex_unlock(&bc->lock); bch2_trans_unlock(trans); b = __btree_node_mem_alloc(c, GFP_KERNEL); if (!b) goto err; + bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL); mutex_lock(&bc->lock); } - bch2_btree_lock_init(&b->c, pcpu_read_locks ? SIX_LOCK_INIT_PCPU : 0); - BUG_ON(!six_trylock_intent(&b->c.lock)); BUG_ON(!six_trylock_write(&b->c.lock)); @@ -815,7 +817,7 @@ got_node: * the list. 
Check if there's any freed nodes there: */ list_for_each_entry(b2, &bc->freeable, list) - if (!btree_node_reclaim(c, b2, false)) { + if (!btree_node_reclaim(c, b2)) { swap(b->data, b2->data); swap(b->aux_data, b2->aux_data); @@ -850,7 +852,6 @@ out: b->sib_u64s[1] = 0; b->whiteout_u64s = 0; bch2_btree_keys_init(b); - set_btree_node_accessed(b); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_mem_alloc], start_time); @@ -976,7 +977,7 @@ static noinline struct btree *bch2_btree_node_fill(struct btree_trans *trans, /* Unlock before doing IO: */ six_unlock_intent(&b->c.lock); - bch2_trans_unlock_noassert(trans); + bch2_trans_unlock(trans); bch2_btree_node_read(trans, b, sync); @@ -1002,7 +1003,7 @@ static noinline void btree_bad_header(struct bch_fs *c, struct btree *b) { struct printbuf buf = PRINTBUF; - if (c->curr_recovery_pass <= BCH_RECOVERY_PASS_check_allocations) + if (c->recovery.pass_done < BCH_RECOVERY_PASS_check_allocations) return; prt_printf(&buf, @@ -1284,6 +1285,10 @@ lock_node: six_unlock_read(&b->c.lock); goto retry; } + + /* avoid atomic set bit if it's not needed: */ + if (!btree_node_accessed(b)) + set_btree_node_accessed(b); } /* XXX: waiting on IO with btree locks held: */ @@ -1299,10 +1304,6 @@ lock_node: prefetch(p + L1_CACHE_BYTES * 2); } - /* avoid atomic set bit if it's not needed: */ - if (!btree_node_accessed(b)) - set_btree_node_accessed(b); - if (unlikely(btree_node_read_error(b))) { six_unlock_read(&b->c.lock); b = ERR_PTR(-BCH_ERR_btree_node_read_err_cached); @@ -1415,7 +1416,7 @@ void __bch2_btree_pos_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, "%u", r->level); else prt_printf(out, "(unknown)"); - prt_printf(out, "\n "); + prt_newline(out); bch2_bkey_val_to_text(out, c, k); } @@ -1491,9 +1492,10 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc prt_btree_cache_line(out, c, "live:", bc->live[0].nr); prt_btree_cache_line(out, c, "pinned:", bc->live[1].nr); - prt_btree_cache_line(out, c, "freeable:", bc->nr_freeable); + prt_btree_cache_line(out, c, "reserve:", bc->nr_reserve); + prt_btree_cache_line(out, c, "freed:", bc->nr_freeable); prt_btree_cache_line(out, c, "dirty:", atomic_long_read(&bc->nr_dirty)); - prt_printf(out, "cannibalize lock:\t%p\n", bc->alloc_lock); + prt_printf(out, "cannibalize lock:\t%s\n", bc->alloc_lock ? 
"held" : "not held"); prt_newline(out); for (unsigned i = 0; i < ARRAY_SIZE(bc->nr_by_btree); i++) { @@ -1504,6 +1506,7 @@ void bch2_btree_cache_to_text(struct printbuf *out, const struct btree_cache *bc } prt_newline(out); + prt_printf(out, "counters since mount:\n"); prt_printf(out, "freed:\t%zu\n", bc->nr_freed); prt_printf(out, "not freed:\n"); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index dd1d9b74076e..91b6395421df 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -22,11 +22,13 @@ #include "debug.h" #include "disk_accounting.h" #include "ec.h" +#include "enumerated_ref.h" #include "error.h" #include "extents.h" #include "journal.h" #include "keylist.h" #include "move.h" +#include "progress.h" #include "recovery_passes.h" #include "reflink.h" #include "recovery.h" @@ -46,6 +48,27 @@ #define DROP_PREV_NODE 11 #define DID_FILL_FROM_SCAN 12 +/* + * Returns true if it's a btree we can easily reconstruct, or otherwise won't + * cause data loss if it's missing: + */ +static bool btree_id_important(enum btree_id btree) +{ + if (btree_id_is_alloc(btree)) + return false; + + switch (btree) { + case BTREE_ID_quotas: + case BTREE_ID_snapshot_trees: + case BTREE_ID_logged_ops: + case BTREE_ID_rebalance_work: + case BTREE_ID_subvolume_children: + return false; + default: + return true; + } +} + static const char * const bch2_gc_phase_strs[] = { #define x(n) #n, GC_PHASES() @@ -212,15 +235,15 @@ static int btree_check_node_boundaries(struct btree_trans *trans, struct btree * prt_printf(&buf, " at "); bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_printf(&buf, ":\n parent: "); + prt_printf(&buf, ":\nparent: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); if (prev) { - prt_printf(&buf, "\n prev: "); + prt_printf(&buf, "\nprev: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&prev->key)); } - prt_str(&buf, "\n next: "); + prt_str(&buf, "\nnext: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&cur->key)); if (bpos_lt(expected_start, cur->data->min_key)) { /* gap */ @@ -279,12 +302,12 @@ static int btree_repair_node_end(struct btree_trans *trans, struct btree *b, if (bpos_eq(child->key.k.p, b->key.k.p)) return 0; - prt_printf(&buf, " at "); + prt_printf(&buf, "\nat: "); bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_printf(&buf, ":\n parent: "); + prt_printf(&buf, "\nparent: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_str(&buf, "\n child: "); + prt_str(&buf, "\nchild: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&child->key)); if (mustfix_fsck_err(trans, btree_node_topology_bad_max_key, @@ -348,21 +371,13 @@ again: prt_char(&buf, ' '); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(cur_k.k)); - if (mustfix_fsck_err_on(bch2_err_matches(ret, EIO), - trans, btree_node_read_error, - "Topology repair: unreadable btree node at\n" - " %s", - buf.buf)) { + if (bch2_err_matches(ret, EIO)) { bch2_btree_node_evict(trans, cur_k.k); cur = NULL; ret = bch2_journal_key_delete(c, b->c.btree_id, b->c.level, cur_k.k->k.p); if (ret) break; - - ret = bch2_btree_lost_data(c, b->c.btree_id); - if (ret) - break; continue; } @@ -524,9 +539,6 @@ int bch2_check_topology(struct bch_fs *c) bch2_btree_id_to_text(&buf, i); if (r->error) { - ret = bch2_btree_lost_data(c, i); - if (ret) - break; reconstruct_root: bch_info(c, "btree root %s unreadable, must recover from scan", buf.buf); @@ -534,8 +546,10 @@ reconstruct_root: r->error = 0; if (!bch2_btree_has_scanned_nodes(c, i)) { - mustfix_fsck_err(trans, 
btree_root_unreadable_and_scan_found_nothing, - "no nodes found for btree %s, continue?", buf.buf); + __fsck_err(trans, + FSCK_CAN_FIX|(!btree_id_important(i) ? FSCK_AUTOFIX : 0), + btree_root_unreadable_and_scan_found_nothing, + "no nodes found for btree %s, continue?", buf.buf); bch2_btree_root_alloc_fake_trans(trans, i, 0); } else { bch2_btree_root_alloc_fake_trans(trans, i, 1); @@ -605,13 +619,13 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, deleted.p = k.k->p; if (initial) { - BUG_ON(bch2_journal_seq_verify && + BUG_ON(static_branch_unlikely(&bch2_journal_seq_verify) && k.k->bversion.lo > atomic64_read(&c->journal.seq)); if (fsck_err_on(btree_id != BTREE_ID_accounting && k.k->bversion.lo > atomic64_read(&c->key_version), trans, bkey_version_in_future, - "key version number higher than recorded %llu\n %s", + "key version number higher than recorded %llu\n%s", atomic64_read(&c->key_version), (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) atomic64_set(&c->key_version, k.k->bversion.lo); @@ -619,7 +633,7 @@ static int bch2_gc_mark_key(struct btree_trans *trans, enum btree_id btree_id, if (mustfix_fsck_err_on(level && !bch2_dev_btree_bitmap_marked(c, k), trans, btree_bitmap_not_marked, - "btree ptr not marked in member info btree allocated bitmap\n %s", + "btree ptr not marked in member info btree allocated bitmap\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { @@ -656,7 +670,9 @@ fsck_err: return ret; } -static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool initial) +static int bch2_gc_btree(struct btree_trans *trans, + struct progress_indicator_state *progress, + enum btree_id btree, bool initial) { struct bch_fs *c = trans->c; unsigned target_depth = btree_node_type_has_triggers(__btree_node_type(0, btree)) ? 0 : 1; @@ -673,6 +689,7 @@ static int bch2_gc_btree(struct btree_trans *trans, enum btree_id btree, bool in BTREE_ITER_prefetch); ret = for_each_btree_key_continue(trans, iter, 0, k, ({ + bch2_progress_update_iter(trans, progress, &iter, "check_allocations"); gc_pos_set(c, gc_pos_btree(btree, level, k.k->p)); bch2_gc_mark_key(trans, btree, level, &prev, &iter, k, initial); })); @@ -688,7 +705,7 @@ retry_root: struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree, POS_MIN, 0, bch2_btree_id_root(c, btree)->b->c.level, 0); - struct btree *b = bch2_btree_iter_peek_node(&iter); + struct btree *b = bch2_btree_iter_peek_node(trans, &iter); ret = PTR_ERR_OR_ZERO(b); if (ret) goto err_root; @@ -717,22 +734,24 @@ static inline int btree_id_gc_phase_cmp(enum btree_id l, enum btree_id r) static int bch2_gc_btrees(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - enum btree_id ids[BTREE_ID_NR]; struct printbuf buf = PRINTBUF; - unsigned i; int ret = 0; - for (i = 0; i < BTREE_ID_NR; i++) + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, ~0ULL); + + enum btree_id ids[BTREE_ID_NR]; + for (unsigned i = 0; i < BTREE_ID_NR; i++) ids[i] = i; bubble_sort(ids, BTREE_ID_NR, btree_id_gc_phase_cmp); - for (i = 0; i < btree_id_nr_alive(c) && !ret; i++) { + for (unsigned i = 0; i < btree_id_nr_alive(c) && !ret; i++) { unsigned btree = i < BTREE_ID_NR ? 
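
The bch2_journal_seq_verify test above, like bch2_debug_check_iterators and bch2_inject_invalid_keys later in this diff, is now read through static_branch_unlikely(): the debug knob becomes a static key, so the disabled case costs a patched-out jump rather than a memory load and test on every key. A standalone model of the pattern, using only &lt;linux/jump_label.h&gt; (the key and function names are illustrative):

	#include <linux/jump_label.h>

	DEFINE_STATIC_KEY_FALSE(example_verify);	/* off by default */

	static void expensive_verify(void) { /* ... */ }

	static inline void maybe_verify(void)
	{
		/* compiles to a no-op branch until the key is enabled */
		if (static_branch_unlikely(&example_verify))
			expensive_verify();
	}

	/* flipped at runtime, e.g. from a debug knob:
	 *	static_branch_enable(&example_verify);
	 */
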
ids[i] : i; if (IS_ERR_OR_NULL(bch2_btree_id_root(c, btree)->b)) continue; - ret = bch2_gc_btree(trans, btree, true); + ret = bch2_gc_btree(trans, &progress, btree, true); } printbuf_exit(&buf); @@ -1015,8 +1034,7 @@ int bch2_check_allocations(struct bch_fs *c) { int ret; - lockdep_assert_held(&c->state_lock); - + down_read(&c->state_lock); down_write(&c->gc_lock); bch2_btree_interior_updates_flush(c); @@ -1054,12 +1072,17 @@ out: percpu_up_write(&c->mark_lock); up_write(&c->gc_lock); + up_read(&c->state_lock); /* * At startup, allocations can happen directly instead of via the * allocator thread - issue wakeup in case they blocked on gc_lock: */ closure_wake_up(&c->freelist_wait); + + if (!ret && !test_bit(BCH_FS_errors_not_fixed, &c->flags)) + bch2_sb_members_clean_deleted(c); + bch_err_fn(c, ret); return ret; } @@ -1194,7 +1217,7 @@ int bch2_gc_gens(struct bch_fs *c) BCH_TRANS_COMMIT_no_enospc, ({ ca = bch2_dev_iterate(c, ca, k.k->p.inode); if (!ca) { - bch2_btree_iter_set_pos(&iter, POS(k.k->p.inode + 1, 0)); + bch2_btree_iter_set_pos(trans, &iter, POS(k.k->p.inode + 1, 0)); continue; } bch2_alloc_write_oldest_gen(trans, ca, &iter, k); @@ -1228,26 +1251,21 @@ static void bch2_gc_gens_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, gc_gens_work); bch2_gc_gens(c); - bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens); } void bch2_gc_gens_async(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_gc_gens) && + if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_gc_gens) && !queue_work(c->write_ref_wq, &c->gc_gens_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_gc_gens); -} - -void bch2_fs_btree_gc_exit(struct bch_fs *c) -{ + enumerated_ref_put(&c->writes, BCH_WRITE_REF_gc_gens); } -int bch2_fs_btree_gc_init(struct bch_fs *c) +void bch2_fs_btree_gc_init_early(struct bch_fs *c) { seqcount_init(&c->gc_pos_lock); INIT_WORK(&c->gc_gens_work, bch2_gc_gens_work); init_rwsem(&c->gc_lock); mutex_init(&c->gc_gens_lock); - return 0; } diff --git a/fs/bcachefs/btree_gc.h b/fs/bcachefs/btree_gc.h index 9693a90a48a2..ec77662369a2 100644 --- a/fs/bcachefs/btree_gc.h +++ b/fs/bcachefs/btree_gc.h @@ -83,7 +83,6 @@ void bch2_gc_pos_to_text(struct printbuf *, struct gc_pos *); int bch2_gc_gens(struct bch_fs *); void bch2_gc_gens_async(struct bch_fs *); -void bch2_fs_btree_gc_exit(struct bch_fs *); -int bch2_fs_btree_gc_init(struct bch_fs *); +void bch2_fs_btree_gc_init_early(struct bch_fs *); #endif /* _BCACHEFS_BTREE_GC_H */ diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index e371e60e3133..34018296053a 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -1,6 +1,8 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "async_objs.h" +#include "bkey_buf.h" #include "bkey_methods.h" #include "bkey_sort.h" #include "btree_cache.h" @@ -12,6 +14,7 @@ #include "buckets.h" #include "checksum.h" #include "debug.h" +#include "enumerated_ref.h" #include "error.h" #include "extents.h" #include "io_write.h" @@ -40,6 +43,7 @@ void bch2_btree_node_io_unlock(struct btree *b) clear_btree_node_write_in_flight_inner(b); clear_btree_node_write_in_flight(b); + smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); } @@ -512,21 +516,23 @@ void bch2_btree_init_next(struct btree_trans *trans, struct btree *b) static void btree_err_msg(struct printbuf *out, struct bch_fs *c, struct bch_dev *ca, + bool print_pos, struct btree *b, struct bset *i, struct bkey_packed *k, - 
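
bch2_gc_btree() now threads a progress_indicator_state through the mark loop: bch2_progress_init() runs once in bch2_gc_btrees(), and bch2_progress_update_iter() is called per key under the "check_allocations" label. A rough userspace model of such a rate-limited progress counter, assuming the real one counts keys against a per-btree estimate (names and layout are guesses, not the patch's types):

	#include <stdint.h>
	#include <stdio.h>

	struct progress {
		uint64_t done, total, last_pct;
	};

	static void progress_update(struct progress *p, const char *task)
	{
		uint64_t pct = p->total ? ++p->done * 100 / p->total : 0;

		/* log at most once per percent so the hot loop stays cheap */
		if (pct != p->last_pct) {
			p->last_pct = pct;
			fprintf(stderr, "%s: %llu%%\n", task,
				(unsigned long long) pct);
		}
	}
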
unsigned offset, int write) + unsigned offset, int rw) { - prt_printf(out, bch2_log_msg(c, "%s"), - write == READ - ? "error validating btree node " - : "corrupt btree node before write "); - if (ca) - prt_printf(out, "on %s ", ca->name); - prt_printf(out, "at btree "); - bch2_btree_pos_to_text(out, c, b); + if (print_pos) { + prt_str(out, rw == READ + ? "error validating btree node " + : "corrupt btree node before write "); + prt_printf(out, "at btree "); + bch2_btree_pos_to_text(out, c, b); + prt_newline(out); + } - printbuf_indent_add(out, 2); + if (ca) + prt_printf(out, "%s ", ca->name); - prt_printf(out, "\nnode offset %u/%u", + prt_printf(out, "node offset %u/%u", b->written, btree_ptr_sectors_written(bkey_i_to_s_c(&b->key))); if (i) prt_printf(out, " bset u64s %u", le16_to_cpu(i->u64s)); @@ -537,34 +543,32 @@ static void btree_err_msg(struct printbuf *out, struct bch_fs *c, prt_str(out, ": "); } -__printf(10, 11) +__printf(11, 12) static int __btree_err(int ret, struct bch_fs *c, struct bch_dev *ca, struct btree *b, struct bset *i, struct bkey_packed *k, - int write, - bool have_retry, + int rw, enum bch_sb_error_id err_type, + struct bch_io_failures *failed, + struct printbuf *err_msg, const char *fmt, ...) { - struct printbuf out = PRINTBUF; - bool silent = c->curr_recovery_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes; - va_list args; + if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes) + return -BCH_ERR_fsck_fix; - btree_err_msg(&out, c, ca, b, i, k, b->written, write); + bool have_retry = false; + int ret2; - va_start(args, fmt); - prt_vprintf(&out, fmt, args); - va_end(args); + if (ca) { + bch2_mark_btree_validate_failure(failed, ca->dev_idx); - if (write == WRITE) { - bch2_print_string_as_lines(KERN_ERR, out.buf); - ret = c->opts.errors == BCH_ON_ERROR_continue - ? 0 - : -BCH_ERR_fsck_errors_not_fixed; - goto out; + struct extent_ptr_decoded pick; + have_retry = !bch2_bkey_pick_read_device(c, + bkey_i_to_s_c(&b->key), + failed, &pick, -1); } if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry) @@ -572,37 +576,77 @@ static int __btree_err(int ret, if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) ret = -BCH_ERR_btree_node_read_err_bad_node; - if (!silent && ret != -BCH_ERR_btree_node_read_err_fixable) - bch2_sb_error_count(c, err_type); + bch2_sb_error_count(c, err_type); + + bool print_deferred = err_msg && + rw == READ && + !(test_bit(BCH_FS_in_fsck, &c->flags) && + c->opts.fix_errors == FSCK_FIX_ask); + + struct printbuf out = PRINTBUF; + bch2_log_msg_start(c, &out); + + if (!print_deferred) + err_msg = &out; + + btree_err_msg(err_msg, c, ca, !print_deferred, b, i, k, b->written, rw); + + va_list args; + va_start(args, fmt); + prt_vprintf(err_msg, fmt, args); + va_end(args); + + if (print_deferred) { + prt_newline(err_msg); + + switch (ret) { + case -BCH_ERR_btree_node_read_err_fixable: + ret2 = bch2_fsck_err_opt(c, FSCK_CAN_FIX, err_type); + if (ret2 != -BCH_ERR_fsck_fix && + ret2 != -BCH_ERR_fsck_ignore) { + ret = ret2; + goto fsck_err; + } + + if (!have_retry) + ret = -BCH_ERR_fsck_fix; + goto out; + case -BCH_ERR_btree_node_read_err_bad_node: + prt_str(&out, ", "); + ret = __bch2_topology_error(c, &out); + break; + } + + goto out; + } + + if (rw == WRITE) { + prt_str(&out, ", "); + ret = __bch2_inconsistent_error(c, &out) + ? -BCH_ERR_fsck_errors_not_fixed + : 0; + goto print; + } switch (ret) { case -BCH_ERR_btree_node_read_err_fixable: - ret = !silent - ? 
__bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf) - : -BCH_ERR_fsck_fix; - if (ret != -BCH_ERR_fsck_fix && - ret != -BCH_ERR_fsck_ignore) + ret2 = __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf); + if (ret2 != -BCH_ERR_fsck_fix && + ret2 != -BCH_ERR_fsck_ignore) { + ret = ret2; goto fsck_err; - ret = -BCH_ERR_fsck_fix; - break; - case -BCH_ERR_btree_node_read_err_want_retry: - case -BCH_ERR_btree_node_read_err_must_retry: - if (!silent) - bch2_print_string_as_lines(KERN_ERR, out.buf); - break; + } + + if (!have_retry) + ret = -BCH_ERR_fsck_fix; + goto out; case -BCH_ERR_btree_node_read_err_bad_node: - if (!silent) - bch2_print_string_as_lines(KERN_ERR, out.buf); - ret = bch2_topology_error(c); - break; - case -BCH_ERR_btree_node_read_err_incompatible: - if (!silent) - bch2_print_string_as_lines(KERN_ERR, out.buf); - ret = -BCH_ERR_fsck_errors_not_fixed; + prt_str(&out, ", "); + ret = __bch2_topology_error(c, &out); break; - default: - BUG(); } +print: + bch2_print_str(c, KERN_ERR, out.buf); out: fsck_err: printbuf_exit(&out); @@ -611,8 +655,9 @@ fsck_err: #define btree_err(type, c, ca, b, i, k, _err_type, msg, ...) \ ({ \ - int _ret = __btree_err(type, c, ca, b, i, k, write, have_retry, \ + int _ret = __btree_err(type, c, ca, b, i, k, write, \ BCH_FSCK_ERR_##_err_type, \ + failed, err_msg, \ msg, ##__VA_ARGS__); \ \ if (_ret != -BCH_ERR_fsck_fix) { \ @@ -620,7 +665,7 @@ fsck_err: goto fsck_err; \ } \ \ - *saw_error = true; \ + true; \ }) #define btree_err_on(cond, ...) ((cond) ? btree_err(__VA_ARGS__) : false) @@ -678,8 +723,9 @@ void bch2_btree_node_drop_keys_outside_node(struct btree *b) static int validate_bset(struct bch_fs *c, struct bch_dev *ca, struct btree *b, struct bset *i, - unsigned offset, unsigned sectors, - int write, bool have_retry, bool *saw_error) + unsigned offset, unsigned sectors, int write, + struct bch_io_failures *failed, + struct printbuf *err_msg) { unsigned version = le16_to_cpu(i->version); unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); @@ -816,7 +862,7 @@ static int validate_bset(struct bch_fs *c, struct bch_dev *ca, -BCH_ERR_btree_node_read_err_bad_node, c, ca, b, i, NULL, btree_node_bad_format, - "invalid bkey format: %s\n %s", buf1.buf, + "invalid bkey format: %s\n%s", buf1.buf, (printbuf_reset(&buf2), bch2_bkey_format_to_text(&buf2, &bn->format), buf2.buf)); printbuf_reset(&buf1); @@ -892,7 +938,8 @@ static inline int btree_node_read_bkey_cmp(const struct btree *b, static int validate_bset_keys(struct bch_fs *c, struct btree *b, struct bset *i, int write, - bool have_retry, bool *saw_error) + struct bch_io_failures *failed, + struct printbuf *err_msg) { unsigned version = le16_to_cpu(i->version); struct bkey_packed *k, *prev = NULL; @@ -996,7 +1043,7 @@ drop_this_key: } got_good_key: le16_add_cpu(&i->u64s, -next_good_key); - memmove_u64s_down(k, bkey_p_next(k), (u64 *) vstruct_end(i) - (u64 *) k); + memmove_u64s_down(k, (u64 *) k + next_good_key, (u64 *) vstruct_end(i) - (u64 *) k); set_btree_node_need_rewrite(b); } fsck_err: @@ -1005,7 +1052,9 @@ fsck_err: } int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, - struct btree *b, bool have_retry, bool *saw_error) + struct btree *b, + struct bch_io_failures *failed, + struct printbuf *err_msg) { struct btree_node_entry *bne; struct sort_iter *iter; @@ -1015,11 +1064,10 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, bool used_mempool, blacklisted; bool updated_range = b->key.k.type == KEY_TYPE_btree_ptr_v2 && 
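
The validate_bset_keys() change above is a subtle bug fix. When a key fails validation, next_good_key is found by scanning forward and may not equal the key's own u64s field, which is precisely the field that cannot be trusted at this point; the old source operand bkey_p_next(k) is computed from k->u64s. Copying from (u64 *) k + next_good_key uses the validated offset instead. A model of the primitive, assuming it is a u64-granularity overlapping move (consistent with its callers here):

	#include <string.h>
	#include <stdint.h>

	/* move 'u64s' 64-bit words down to 'dst'; ranges may overlap */
	static inline void memmove_u64s_down(void *dst, const void *src,
					     size_t u64s)
	{
		memmove(dst, src, u64s * sizeof(uint64_t));
	}
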
BTREE_PTR_RANGE_UPDATED(&bkey_i_to_btree_ptr_v2(&b->key)->v); - unsigned u64s; unsigned ptr_written = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); u64 max_journal_seq = 0; struct printbuf buf = PRINTBUF; - int ret = 0, retry_read = 0, write = READ; + int ret = 0, write = READ; u64 start_time = local_clock(); b->version_ondisk = U16_MAX; @@ -1153,15 +1201,14 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, b->version_ondisk = min(b->version_ondisk, le16_to_cpu(i->version)); - ret = validate_bset(c, ca, b, i, b->written, sectors, - READ, have_retry, saw_error); + ret = validate_bset(c, ca, b, i, b->written, sectors, READ, failed, err_msg); if (ret) goto fsck_err; if (!b->written) btree_node_set_format(b, b->data->format); - ret = validate_bset_keys(c, b, i, READ, have_retry, saw_error); + ret = validate_bset_keys(c, b, i, READ, failed, err_msg); if (ret) goto fsck_err; @@ -1186,7 +1233,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, le64_to_cpu(i->journal_seq), b->written, b->written + sectors, ptr_written); - b->written += sectors; + b->written = min(b->written + sectors, btree_sectors(c)); if (blacklisted && !first) continue; @@ -1222,23 +1269,20 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, sorted = btree_bounce_alloc(c, btree_buf_bytes(b), &used_mempool); sorted->keys.u64s = 0; - set_btree_bset(b, b->set, &b->data->keys); - b->nr = bch2_key_sort_fix_overlapping(c, &sorted->keys, iter); memset((uint8_t *)(sorted + 1) + b->nr.live_u64s * sizeof(u64), 0, btree_buf_bytes(b) - sizeof(struct btree_node) - b->nr.live_u64s * sizeof(u64)); - u64s = le16_to_cpu(sorted->keys.u64s); + b->data->keys.u64s = sorted->keys.u64s; *sorted = *b->data; - sorted->keys.u64s = cpu_to_le16(u64s); swap(sorted, b->data); set_btree_bset(b, b->set, &b->data->keys); b->nsets = 1; b->data->keys.journal_seq = cpu_to_le64(max_journal_seq); - BUG_ON(b->nr.live_u64s != u64s); + BUG_ON(b->nr.live_u64s != le16_to_cpu(b->data->keys.u64s)); btree_bounce_free(c, btree_buf_bytes(b), used_mempool, sorted); @@ -1252,7 +1296,7 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, ret = btree_node_bkey_val_validate(c, b, u.s_c, READ); if (ret == -BCH_ERR_fsck_delete_bkey || - (bch2_inject_invalid_keys && + (static_branch_unlikely(&bch2_inject_invalid_keys) && !bversion_cmp(u.k->bversion, MAX_VERSION))) { btree_keys_account_key_drop(&b->nr, 0, k); @@ -1292,20 +1336,11 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, if (!ptr_written) set_btree_node_need_rewrite(b); -out: +fsck_err: mempool_free(iter, &c->fill_iter); printbuf_exit(&buf); bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read_done], start_time); - return retry_read; -fsck_err: - if (ret == -BCH_ERR_btree_node_read_err_want_retry || - ret == -BCH_ERR_btree_node_read_err_must_retry) { - retry_read = 1; - } else { - set_btree_node_read_error(b); - bch2_btree_lost_data(c, b->c.btree_id); - } - goto out; + return ret; } static void btree_node_read_work(struct work_struct *work) @@ -1317,17 +1352,28 @@ static void btree_node_read_work(struct work_struct *work) struct btree *b = rb->b; struct bio *bio = &rb->bio; struct bch_io_failures failed = { .nr = 0 }; + int ret = 0; + struct printbuf buf = PRINTBUF; - bool saw_error = false; - bool retry = false; - bool can_retry; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "btree node read error at btree "); + bch2_btree_pos_to_text(&buf, c, b); + prt_newline(&buf); goto start; while (1) { - retry = true; - 
bch_info(c, "retrying read"); - ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ); + ret = bch2_bkey_pick_read_device(c, + bkey_i_to_s_c(&b->key), + &failed, &rb->pick, -1); + if (ret) { + set_btree_node_read_error(b); + break; + } + + ca = bch2_dev_get_ioref(c, rb->pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read); rb->have_ioref = ca != NULL; + rb->start_time = local_clock(); bio_reset(bio, NULL, REQ_OP_READ|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = rb->pick.ptr.offset; bio->bi_iter.bi_size = btree_buf_bytes(b); @@ -1338,60 +1384,66 @@ static void btree_node_read_work(struct work_struct *work) } else { bio->bi_status = BLK_STS_REMOVED; } + + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, + rb->start_time, !bio->bi_status); start: - printbuf_reset(&buf); - bch2_btree_pos_to_text(&buf, c, b); - bch2_dev_io_err_on(ca && bio->bi_status, ca, BCH_MEMBER_ERROR_read, - "btree read error %s for %s", - bch2_blk_status_to_str(bio->bi_status), buf.buf); if (rb->have_ioref) - percpu_ref_put(&ca->io_ref); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_read); rb->have_ioref = false; - bch2_mark_io_failure(&failed, &rb->pick); - - can_retry = bch2_bkey_pick_read_device(c, - bkey_i_to_s_c(&b->key), - &failed, &rb->pick) > 0; - - if (!bio->bi_status && - !bch2_btree_node_read_done(c, ca, b, can_retry, &saw_error)) { - if (retry) - bch_info(c, "retry success"); - break; + if (bio->bi_status) { + bch2_mark_io_failure(&failed, &rb->pick, false); + continue; } - saw_error = true; + ret = bch2_btree_node_read_done(c, ca, b, &failed, &buf); + if (ret == -BCH_ERR_btree_node_read_err_want_retry || + ret == -BCH_ERR_btree_node_read_err_must_retry) + continue; - if (!can_retry) { + if (ret) set_btree_node_read_error(b); - bch2_btree_lost_data(c, b->c.btree_id); - break; - } + + break; } - bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], - rb->start_time); - bio_put(&rb->bio); + bch2_io_failures_to_text(&buf, c, &failed); - if ((saw_error || + if (btree_node_read_error(b)) + bch2_btree_lost_data(c, &buf, b->c.btree_id); + + /* + * only print retry success if we read from a replica with no errors + */ + if (btree_node_read_error(b)) + prt_printf(&buf, "ret %s", bch2_err_str(ret)); + else if (failed.nr) { + if (!bch2_dev_io_failures(&failed, rb->pick.ptr.dev)) + prt_printf(&buf, "retry success"); + else + prt_printf(&buf, "repair success"); + } + + if ((failed.nr || btree_node_need_rewrite(b)) && !btree_node_read_error(b) && - c->curr_recovery_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { - if (saw_error) { - printbuf_reset(&buf); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, " "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - bch_err_ratelimited(c, "%s: rewriting btree node at due to error\n %s", - __func__, buf.buf); - } - + c->recovery.curr_pass != BCH_RECOVERY_PASS_scan_for_btree_nodes) { + prt_printf(&buf, " (rewriting node)"); bch2_btree_node_rewrite_async(c, b); } + prt_newline(&buf); + if (failed.nr) + bch2_print_str_ratelimited(c, KERN_ERR, buf.buf); + + async_object_list_del(c, btree_read_bio, rb->list_idx); + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_read], + rb->start_time); + bio_put(&rb->bio); printbuf_exit(&buf); clear_btree_node_read_in_flight(b); + smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); } @@ -1400,16 +1452,20 @@ static void btree_node_read_endio(struct bio *bio) struct btree_read_bio *rb = container_of(bio, struct btree_read_bio, bio); struct bch_fs *c = 
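
The rewritten retry loop in btree_node_read_work() above inverts the old control flow: rather than marking a failure and then asking whether a retry is still possible, each pass asks bch2_bkey_pick_read_device() for the next replica given the accumulated failure list, and the loop only terminates on success or when no pick remains. Its shape, reduced to a C sketch; every helper name below is a placeholder for the corresponding call in the hunk:

	for (;;) {
		if (pick_read_device(key, &failed, &pick))
			break;		/* no replicas left: hard error */

		submit_and_wait(&pick);

		if (io_error(&pick)) {
			mark_io_failure(&failed, &pick);
			continue;	/* try the next replica */
		}

		if (validate_node(&failed) == WANT_RETRY)
			continue;	/* csum/parse error: next replica */

		break;			/* success, or unrecoverable */
	}
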
rb->c; + struct bch_dev *ca = rb->have_ioref + ? bch2_dev_have_ref(c, rb->pick.ptr.dev) : NULL; - if (rb->have_ioref) { - struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); - - bch2_latency_acct(ca, rb->start_time, READ); - } + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, + rb->start_time, !bio->bi_status); queue_work(c->btree_read_complete_wq, &rb->work); } +void bch2_btree_read_bio_to_text(struct printbuf *out, struct btree_read_bio *rbio) +{ + bch2_bio_to_text(out, &rbio->bio); +} + struct btree_node_read_all { struct closure cl; struct bch_fs *c; @@ -1469,12 +1525,13 @@ static CLOSURE_CALLBACK(btree_node_read_all_replicas_done) struct btree *b = ra->b; struct printbuf buf = PRINTBUF; bool dump_bset_maps = false; - bool have_retry = false; int ret = 0, best = -1, write = READ; unsigned i, written = 0, written2 = 0; __le64 seq = b->key.k.type == KEY_TYPE_btree_ptr_v2 ? bkey_i_to_btree_ptr_v2(&b->key)->v.seq : 0; bool _saw_error = false, *saw_error = &_saw_error; + struct printbuf *err_msg = NULL; + struct bch_io_failures *failed = NULL; for (i = 0; i < ra->nr; i++) { struct btree_node *bn = ra->buf[i]; @@ -1567,14 +1624,19 @@ fsck_err: if (best >= 0) { memcpy(b->data, ra->buf[best], btree_buf_bytes(b)); - ret = bch2_btree_node_read_done(c, NULL, b, false, saw_error); + ret = bch2_btree_node_read_done(c, NULL, b, NULL, NULL); } else { ret = -1; } if (ret) { set_btree_node_read_error(b); - bch2_btree_lost_data(c, b->c.btree_id); + + struct printbuf buf = PRINTBUF; + bch2_btree_lost_data(c, &buf, b->c.btree_id); + if (buf.pos) + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); } else if (*saw_error) bch2_btree_node_rewrite_async(c, b); @@ -1588,6 +1650,7 @@ fsck_err: printbuf_exit(&buf); clear_btree_node_read_in_flight(b); + smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); } @@ -1602,6 +1665,8 @@ static void btree_node_read_all_replicas_endio(struct bio *bio) struct bch_dev *ca = bch2_dev_have_ref(c, rb->pick.ptr.dev); bch2_latency_acct(ca, rb->start_time, READ); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_btree_node_read_all_replicas); } ra->err[rb->idx] = bio->bi_status; @@ -1641,7 +1706,8 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool i = 0; bkey_for_each_ptr_decode(k.k, ptrs, pick, entry) { - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, + BCH_DEV_READ_REF_btree_node_read_all_replicas); struct btree_read_bio *rb = container_of(ra->bio[i], struct btree_read_bio, bio); rb->c = c; @@ -1692,33 +1758,42 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, trace_and_count(c, btree_node_read, trans, b); - if (bch2_verify_all_btree_replicas && + if (static_branch_unlikely(&bch2_verify_all_btree_replicas) && !btree_node_read_all_replicas(c, b, sync)) return; ret = bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), - NULL, &pick); + NULL, &pick, -1); if (ret <= 0) { + bool ratelimit = true; struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); prt_str(&buf, "btree node read error: no device to read from\n at "); bch2_btree_pos_to_text(&buf, c, b); - bch_err_ratelimited(c, "%s", buf.buf); - - if (c->opts.recovery_passes & BIT_ULL(BCH_RECOVERY_PASS_check_topology) && - c->curr_recovery_pass > BCH_RECOVERY_PASS_check_topology) - bch2_fatal_error(c); + prt_newline(&buf); + bch2_btree_lost_data(c, &buf, b->c.btree_id); + + if (c->recovery.passes_complete & 
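
Several completion paths in this patch gain an smp_mb__after_atomic() between clearing an in-flight flag and wake_up_bit(). That is the documented pairing for bit waitqueues: clear_bit() is not a barrier, so without it the waker's check of the waitqueue can be reordered before the bit clear, and a concurrent waiter can go to sleep on a bit that is already clear. Both sides of the handshake, as standard &lt;linux/wait_bit.h&gt; usage:

	/* waker */
	clear_bit(BTREE_NODE_read_in_flight, &b->flags);
	smp_mb__after_atomic();	/* order the clear against the waitqueue check */
	wake_up_bit(&b->flags, BTREE_NODE_read_in_flight);

	/* waiter */
	wait_on_bit(&b->flags, BTREE_NODE_read_in_flight,
		    TASK_UNINTERRUPTIBLE);
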
BIT_ULL(BCH_RECOVERY_PASS_check_topology) && + bch2_fs_emergency_read_only2(c, &buf)) + ratelimit = false; + + static DEFINE_RATELIMIT_STATE(rs, + DEFAULT_RATELIMIT_INTERVAL, + DEFAULT_RATELIMIT_BURST); + if (!ratelimit || __ratelimit(&rs)) + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); set_btree_node_read_error(b); - bch2_btree_lost_data(c, b->c.btree_id); clear_btree_node_read_in_flight(b); + smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_read_in_flight); - printbuf_exit(&buf); return; } - ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_read); bio = bio_alloc_bioset(NULL, buf_pages(b->data, btree_buf_bytes(b)), @@ -1737,6 +1812,8 @@ void bch2_btree_node_read(struct btree_trans *trans, struct btree *b, bio->bi_end_io = btree_node_read_endio; bch2_bio_map(bio, b->data, btree_buf_bytes(b)); + async_object_list_add(c, btree_read_bio, rb, &rb->list_idx); + if (rb->have_ioref) { this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_btree], bio_sectors(bio)); @@ -1811,6 +1888,192 @@ int bch2_btree_root_read(struct bch_fs *c, enum btree_id id, return bch2_trans_run(c, __bch2_btree_root_read(trans, id, k, level)); } +struct btree_node_scrub { + struct bch_fs *c; + struct bch_dev *ca; + void *buf; + bool used_mempool; + unsigned written; + + enum btree_id btree; + unsigned level; + struct bkey_buf key; + __le64 seq; + + struct work_struct work; + struct bio bio; +}; + +static bool btree_node_scrub_check(struct bch_fs *c, struct btree_node *data, unsigned ptr_written, + struct printbuf *err) +{ + unsigned written = 0; + + if (le64_to_cpu(data->magic) != bset_magic(c)) { + prt_printf(err, "bad magic: want %llx, got %llx", + bset_magic(c), le64_to_cpu(data->magic)); + return false; + } + + while (written < (ptr_written ?: btree_sectors(c))) { + struct btree_node_entry *bne; + struct bset *i; + bool first = !written; + + if (first) { + bne = NULL; + i = &data->keys; + } else { + bne = (void *) data + (written << 9); + i = &bne->keys; + + if (!ptr_written && i->seq != data->keys.seq) + break; + } + + struct nonce nonce = btree_nonce(i, written << 9); + bool good_csum_type = bch2_checksum_type_valid(c, BSET_CSUM_TYPE(i)); + + if (first) { + if (good_csum_type) { + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, data); + if (bch2_crc_cmp(data->csum, csum)) { + bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), data->csum, csum); + return false; + } + } + + written += vstruct_sectors(data, c->block_bits); + } else { + if (good_csum_type) { + struct bch_csum csum = csum_vstruct(c, BSET_CSUM_TYPE(i), nonce, bne); + if (bch2_crc_cmp(bne->csum, csum)) { + bch2_csum_err_msg(err, BSET_CSUM_TYPE(i), bne->csum, csum); + return false; + } + } + + written += vstruct_sectors(bne, c->block_bits); + } + } + + return true; +} + +static void btree_node_scrub_work(struct work_struct *work) +{ + struct btree_node_scrub *scrub = container_of(work, struct btree_node_scrub, work); + struct bch_fs *c = scrub->c; + struct printbuf err = PRINTBUF; + + __bch2_btree_pos_to_text(&err, c, scrub->btree, scrub->level, + bkey_i_to_s_c(scrub->key.k)); + prt_newline(&err); + + if (!btree_node_scrub_check(c, scrub->buf, scrub->written, &err)) { + struct btree_trans *trans = bch2_trans_get(c); + + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, scrub->btree, + scrub->key.k->k.p, 0, scrub->level - 1, 0); + + struct btree *b; + int ret = lockrestart_do(trans, + PTR_ERR_OR_ZERO(b = bch2_btree_iter_peek_node(trans, 
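
btree_node_scrub_check() above walks the node the same way the read path does: the first bset lives in the btree_node header, each later one in a btree_node_entry at the current sector offset, and written advances by vstruct_sectors() per entry, stopping early when an entry's seq no longer matches the header's. A stripped-down model of that walk over self-describing checksummed entries (the struct layout and helpers here are invented for illustration):

	#include <stdbool.h>
	#include <stddef.h>
	#include <stdint.h>

	struct entry {
		uint64_t seq;		/* must match the node's seq */
		uint32_t csum;
		uint32_t sectors;	/* length of this entry, in sectors */
	};

	static bool scrub_walk(const char *buf, unsigned total_sectors,
			       uint64_t node_seq,
			       bool (*csum_ok)(const struct entry *))
	{
		unsigned written = 0;

		while (written < total_sectors) {
			const struct entry *e =
				(const void *) (buf + ((size_t) written << 9));

			/* a non-matching seq marks the end of valid data */
			if (written && e->seq != node_seq)
				break;
			if (!csum_ok(e))
				return false;	/* caller rewrites the node */
			if (!e->sectors)
				break;		/* refuse to loop on garbage */
			written += e->sectors;
		}
		return true;
	}
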
&iter))); + if (ret) + goto err; + + if (bkey_i_to_btree_ptr_v2(&b->key)->v.seq == scrub->seq) { + bch_err(c, "error validating btree node during scrub on %s at btree %s", + scrub->ca->name, err.buf); + + ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0); + } +err: + bch2_trans_iter_exit(trans, &iter); + bch2_trans_begin(trans); + bch2_trans_put(trans); + } + + printbuf_exit(&err); + bch2_bkey_buf_exit(&scrub->key, c);; + btree_bounce_free(c, c->opts.btree_node_size, scrub->used_mempool, scrub->buf); + enumerated_ref_put(&scrub->ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub); + kfree(scrub); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub); +} + +static void btree_node_scrub_endio(struct bio *bio) +{ + struct btree_node_scrub *scrub = container_of(bio, struct btree_node_scrub, bio); + + queue_work(scrub->c->btree_read_complete_wq, &scrub->work); +} + +int bch2_btree_node_scrub(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_s_c k, unsigned dev) +{ + if (k.k->type != KEY_TYPE_btree_ptr_v2) + return 0; + + struct bch_fs *c = trans->c; + + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_node_scrub)) + return -BCH_ERR_erofs_no_writes; + + struct extent_ptr_decoded pick; + int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev); + if (ret <= 0) + goto err; + + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, + BCH_DEV_READ_REF_btree_node_scrub); + if (!ca) { + ret = -BCH_ERR_device_offline; + goto err; + } + + bool used_mempool = false; + void *buf = btree_bounce_alloc(c, c->opts.btree_node_size, &used_mempool); + + unsigned vecs = buf_pages(buf, c->opts.btree_node_size); + + struct btree_node_scrub *scrub = + kzalloc(sizeof(*scrub) + sizeof(struct bio_vec) * vecs, GFP_KERNEL); + if (!scrub) { + ret = -ENOMEM; + goto err_free; + } + + scrub->c = c; + scrub->ca = ca; + scrub->buf = buf; + scrub->used_mempool = used_mempool; + scrub->written = btree_ptr_sectors_written(k); + + scrub->btree = btree; + scrub->level = level; + bch2_bkey_buf_init(&scrub->key); + bch2_bkey_buf_reassemble(&scrub->key, c, k); + scrub->seq = bkey_s_c_to_btree_ptr_v2(k).v->seq; + + INIT_WORK(&scrub->work, btree_node_scrub_work); + + bio_init(&scrub->bio, ca->disk_sb.bdev, scrub->bio.bi_inline_vecs, vecs, REQ_OP_READ); + bch2_bio_map(&scrub->bio, scrub->buf, c->opts.btree_node_size); + scrub->bio.bi_iter.bi_sector = pick.ptr.offset; + scrub->bio.bi_end_io = btree_node_scrub_endio; + submit_bio(&scrub->bio); + return 0; +err_free: + btree_bounce_free(c, c->opts.btree_node_size, used_mempool, buf); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scrub); +err: + enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_node_scrub); + return ret; +} + static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, struct btree_write *w) { @@ -1831,7 +2094,7 @@ static void bch2_btree_complete_write(struct bch_fs *c, struct btree *b, bch2_journal_pin_drop(&c->journal, &w->journal); } -static void __btree_node_write_done(struct bch_fs *c, struct btree *b) +static void __btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) { struct btree_write *w = btree_prev_write(b); unsigned long old, new; @@ -1839,6 +2102,9 @@ static void __btree_node_write_done(struct bch_fs *c, struct btree *b) bch2_btree_complete_write(c, b, w); + if (start_time) + bch2_time_stats_update(&c->times[BCH_TIME_btree_node_write], start_time); + old = READ_ONCE(b->flags); do { new = old; @@ -1865,11 +2131,13 @@ static void 
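
bch2_btree_node_scrub() above is a compact illustration of the enumerated_ref discipline running through this series: a filesystem-wide writes ref is tried first (failing with -BCH_ERR_erofs_no_writes if the fs is going read-only), a per-device io_ref second, and each error label unwinds exactly the refs taken so far, while on success both refs travel with the work item and are dropped in btree_node_scrub_work(). The unwind skeleton, with placeholder names for everything but the structure:

	/* sketch only: ref_tryget/ref_put etc. stand in for the
	 * enumerated_ref calls in the function above */
	int start_async_op(struct ctx *c)
	{
		if (!ref_tryget(&c->global_ref))
			return -EROFS;		/* shutting down */

		int ret = get_device_ref(c);
		if (ret)
			goto err_global;

		ret = alloc_and_submit(c);	/* completion drops both refs */
		if (ret)
			goto err_device;
		return 0;

	err_device:
		put_device_ref(c);
	err_global:
		ref_put(&c->global_ref);
		return ret;
	}
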
__btree_node_write_done(struct bch_fs *c, struct btree *b) if (new & (1U << BTREE_NODE_write_in_flight)) __bch2_btree_node_write(c, b, BTREE_WRITE_ALREADY_STARTED|type); - else + else { + smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_write_in_flight); + } } -static void btree_node_write_done(struct bch_fs *c, struct btree *b) +static void btree_node_write_done(struct bch_fs *c, struct btree *b, u64 start_time) { struct btree_trans *trans = bch2_trans_get(c); @@ -1877,7 +2145,7 @@ static void btree_node_write_done(struct bch_fs *c, struct btree *b) /* we don't need transaction context anymore after we got the lock. */ bch2_trans_put(trans); - __btree_node_write_done(c, b); + __btree_node_write_done(c, b, start_time); six_unlock_read(&b->c.lock); } @@ -1887,6 +2155,7 @@ static void btree_node_write_work(struct work_struct *work) container_of(work, struct btree_write_bio, work); struct bch_fs *c = wbio->wbio.c; struct btree *b = wbio->wbio.bio.bi_private; + u64 start_time = wbio->start_time; int ret = 0; btree_bounce_free(c, @@ -1918,13 +2187,20 @@ static void btree_node_write_work(struct work_struct *work) goto err; } out: + async_object_list_del(c, btree_write_bio, wbio->list_idx); bio_put(&wbio->wbio.bio); - btree_node_write_done(c, b); + btree_node_write_done(c, b, start_time); return; err: set_btree_node_noevict(b); - bch2_fs_fatal_err_on(!bch2_err_matches(ret, EROFS), c, - "writing btree node: %s", bch2_err_str(ret)); + + if (!bch2_err_matches(ret, EROFS)) { + struct printbuf buf = PRINTBUF; + prt_printf(&buf, "writing btree node: %s\n ", bch2_err_str(ret)); + bch2_btree_pos_to_text(&buf, c, b); + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); + } goto out; } @@ -1937,23 +2213,34 @@ static void btree_node_write_endio(struct bio *bio) struct bch_fs *c = wbio->c; struct btree *b = wbio->bio.bi_private; struct bch_dev *ca = wbio->have_ioref ? 
bch2_dev_have_ref(c, wbio->dev) : NULL; - unsigned long flags; - if (wbio->have_ioref) - bch2_latency_acct(ca, wbio->submit_time, WRITE); + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, + wbio->submit_time, !bio->bi_status); + + if (ca && bio->bi_status) { + struct printbuf buf = PRINTBUF; + buf.atomic++; + prt_printf(&buf, "btree write error: %s\n ", + bch2_blk_status_to_str(bio->bi_status)); + bch2_btree_pos_to_text(&buf, c, b); + bch_err_dev_ratelimited(ca, "%s", buf.buf); + printbuf_exit(&buf); + } - if (!ca || - bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, - "btree write error: %s", - bch2_blk_status_to_str(bio->bi_status)) || - bch2_meta_write_fault("btree")) { + if (bio->bi_status) { + unsigned long flags; spin_lock_irqsave(&c->btree_write_error_lock, flags); bch2_dev_list_add_dev(&orig->failed, wbio->dev); spin_unlock_irqrestore(&c->btree_write_error_lock, flags); } + /* + * XXX: we should be using io_ref[WRITE], but we aren't retrying failed + * btree writes yet (due to device removal/ro): + */ if (wbio->have_ioref) - percpu_ref_put(&ca->io_ref); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_btree_node_write); if (parent) { bio_put(bio); @@ -1962,16 +2249,15 @@ static void btree_node_write_endio(struct bio *bio) } clear_btree_node_write_in_flight_inner(b); + smp_mb__after_atomic(); wake_up_bit(&b->flags, BTREE_NODE_write_in_flight_inner); INIT_WORK(&wb->work, btree_node_write_work); - queue_work(c->btree_io_complete_wq, &wb->work); + queue_work(c->btree_write_complete_wq, &wb->work); } static int validate_bset_for_write(struct bch_fs *c, struct btree *b, struct bset *i, unsigned sectors) { - bool saw_error; - int ret = bch2_bkey_validate(c, bkey_i_to_s_c(&b->key), (struct bkey_validate_context) { .from = BKEY_VALIDATE_btree_node, @@ -1984,8 +2270,8 @@ static int validate_bset_for_write(struct bch_fs *c, struct btree *b, return ret; } - ret = validate_bset_keys(c, b, i, WRITE, false, &saw_error) ?: - validate_bset(c, NULL, b, i, b->written, sectors, WRITE, false, &saw_error); + ret = validate_bset_keys(c, b, i, WRITE, NULL, NULL) ?: + validate_bset(c, NULL, b, i, b->written, sectors, WRITE, NULL, NULL); if (ret) { bch2_inconsistent_error(c); dump_stack(); @@ -2023,6 +2309,7 @@ void __bch2_btree_node_write(struct bch_fs *c, struct btree *b, unsigned flags) bool validate_before_checksum = false; enum btree_write_type type = flags & BTREE_WRITE_TYPE_MASK; void *data; + u64 start_time = local_clock(); int ret; if (flags & BTREE_WRITE_ALREADY_STARTED) @@ -2231,6 +2518,7 @@ do_write: wbio->data = data; wbio->data_bytes = bytes; wbio->sector_offset = b->written; + wbio->start_time = start_time; wbio->wbio.c = c; wbio->wbio.used_mempool = used_mempool; wbio->wbio.first_btree_write = !b->written; @@ -2250,6 +2538,8 @@ do_write: atomic64_inc(&c->btree_write_stats[type].nr); atomic64_add(bytes_to_write, &c->btree_write_stats[type].bytes); + async_object_list_add(c, btree_write_bio, wbio, &wbio->list_idx); + INIT_WORK(&wbio->work, btree_write_submit); queue_work(c->btree_write_submit_wq, &wbio->work); return; @@ -2258,7 +2548,7 @@ err: b->written += sectors_to_write; nowrite: btree_bounce_free(c, bytes, used_mempool, data); - __btree_node_write_done(c, b); + __btree_node_write_done(c, b, 0); } /* diff --git a/fs/bcachefs/btree_io.h b/fs/bcachefs/btree_io.h index 6f9e4a6dacf7..30a5180532c8 100644 --- a/fs/bcachefs/btree_io.h +++ b/fs/bcachefs/btree_io.h @@ -41,6 +41,9 @@ struct btree_read_bio { u64 start_time; unsigned have_ioref:1; unsigned idx:7; 
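
Read and write bios are now tracked for the duration of the I/O: async_object_list_add() at submission stores a slot index in the new list_idx field (added to btree_read_bio and btree_write_bio in the header hunk below, under CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS), and the completion handlers call async_object_list_del() before freeing the bio, so stuck or in-flight I/Os can be enumerated for debugging. A toy model of such a registry, assuming a simple lock-protected slot table (the in-kernel structure is presumably smarter):

	#include <pthread.h>
	#include <stddef.h>

	#define NR_SLOTS 256

	struct obj_list {
		pthread_mutex_t lock;
		void *slots[NR_SLOTS];
	};

	static int obj_list_add(struct obj_list *l, void *obj)
	{
		int idx = -1;

		pthread_mutex_lock(&l->lock);
		for (int i = 0; i < NR_SLOTS; i++)
			if (!l->slots[i]) {
				l->slots[i] = obj;
				idx = i;	/* stored as ->list_idx */
				break;
			}
		pthread_mutex_unlock(&l->lock);
		return idx;
	}

	static void obj_list_del(struct obj_list *l, int idx)
	{
		pthread_mutex_lock(&l->lock);
		if (idx >= 0)
			l->slots[idx] = NULL;
		pthread_mutex_unlock(&l->lock);
	}
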
+#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + unsigned list_idx; +#endif struct extent_ptr_decoded pick; struct work_struct work; struct bio bio; @@ -52,6 +55,10 @@ struct btree_write_bio { void *data; unsigned data_bytes; unsigned sector_offset; + u64 start_time; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + unsigned list_idx; +#endif struct bch_write_bio wbio; }; @@ -127,11 +134,18 @@ void bch2_btree_build_aux_trees(struct btree *); void bch2_btree_init_next(struct btree_trans *, struct btree *); int bch2_btree_node_read_done(struct bch_fs *, struct bch_dev *, - struct btree *, bool, bool *); + struct btree *, + struct bch_io_failures *, + struct printbuf *); void bch2_btree_node_read(struct btree_trans *, struct btree *, bool); int bch2_btree_root_read(struct bch_fs *, enum btree_id, const struct bkey_i *, unsigned); +void bch2_btree_read_bio_to_text(struct printbuf *, struct btree_read_bio *); + +int bch2_btree_node_scrub(struct btree_trans *, enum btree_id, unsigned, + struct bkey_s_c, unsigned); + bool bch2_btree_post_write_cleanup(struct bch_fs *, struct btree *); enum btree_write_flags { diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index e32fce4fd258..b4bf4217a3fa 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -16,6 +16,7 @@ #include "journal_io.h" #include "replicas.h" #include "snapshot.h" +#include "super.h" #include "trace.h" #include <linux/random.h> @@ -114,11 +115,9 @@ static inline bool btree_path_pos_in_node(struct btree_path *path, !btree_path_pos_after_node(path, b); } -/* Btree iterator: */ +/* Debug: */ -#ifdef CONFIG_BCACHEFS_DEBUG - -static void bch2_btree_path_verify_cached(struct btree_trans *trans, +static void __bch2_btree_path_verify_cached(struct btree_trans *trans, struct btree_path *path) { struct bkey_cached *ck; @@ -135,7 +134,7 @@ static void bch2_btree_path_verify_cached(struct btree_trans *trans, btree_node_unlock(trans, path, 0); } -static void bch2_btree_path_verify_level(struct btree_trans *trans, +static void __bch2_btree_path_verify_level(struct btree_trans *trans, struct btree_path *path, unsigned level) { struct btree_path_level *l; @@ -147,16 +146,13 @@ static void bch2_btree_path_verify_level(struct btree_trans *trans, struct printbuf buf3 = PRINTBUF; const char *msg; - if (!bch2_debug_check_iterators) - return; - l = &path->l[level]; tmp = l->iter; locked = btree_node_locked(path, level); if (path->cached) { if (!level) - bch2_btree_path_verify_cached(trans, path); + __bch2_btree_path_verify_cached(trans, path); return; } @@ -217,7 +213,7 @@ err: msg, level, buf1.buf, buf2.buf, buf3.buf); } -static void bch2_btree_path_verify(struct btree_trans *trans, +static void __bch2_btree_path_verify(struct btree_trans *trans, struct btree_path *path) { struct bch_fs *c = trans->c; @@ -229,25 +225,23 @@ static void bch2_btree_path_verify(struct btree_trans *trans, break; } - bch2_btree_path_verify_level(trans, path, i); + __bch2_btree_path_verify_level(trans, path, i); } - bch2_btree_path_verify_locks(path); + bch2_btree_path_verify_locks(trans, path); } -void bch2_trans_verify_paths(struct btree_trans *trans) +void __bch2_trans_verify_paths(struct btree_trans *trans) { struct btree_path *path; unsigned iter; trans_for_each_path(trans, path, iter) - bch2_btree_path_verify(trans, path); + __bch2_btree_path_verify(trans, path); } -static void bch2_btree_iter_verify(struct btree_iter *iter) +static void __bch2_btree_iter_verify(struct btree_trans *trans, struct btree_iter *iter) { - struct btree_trans *trans = 
iter->trans; - BUG_ON(!!(iter->flags & BTREE_ITER_cached) != btree_iter_path(trans, iter)->cached); BUG_ON((iter->flags & BTREE_ITER_is_extents) && @@ -258,11 +252,11 @@ static void bch2_btree_iter_verify(struct btree_iter *iter) !btree_type_has_snapshot_field(iter->btree_id)); if (iter->update_path) - bch2_btree_path_verify(trans, &trans->paths[iter->update_path]); - bch2_btree_path_verify(trans, btree_iter_path(trans, iter)); + __bch2_btree_path_verify(trans, &trans->paths[iter->update_path]); + __bch2_btree_path_verify(trans, btree_iter_path(trans, iter)); } -static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) +static void __bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) { BUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && !iter->pos.snapshot); @@ -276,16 +270,13 @@ static void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) bkey_gt(iter->pos, iter->k.p))); } -static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) +static int __bch2_btree_iter_verify_ret(struct btree_trans *trans, + struct btree_iter *iter, struct bkey_s_c k) { - struct btree_trans *trans = iter->trans; struct btree_iter copy; struct bkey_s_c prev; int ret = 0; - if (!bch2_debug_check_iterators) - return 0; - if (!(iter->flags & BTREE_ITER_filter_snapshots)) return 0; @@ -299,7 +290,7 @@ static int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k bch2_trans_iter_init(trans, ©, iter->btree_id, iter->pos, BTREE_ITER_nopreserve| BTREE_ITER_all_snapshots); - prev = bch2_btree_iter_prev(©); + prev = bch2_btree_iter_prev(trans, ©); if (!prev.k) goto out; @@ -326,7 +317,7 @@ out: return ret; } -void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, +void __bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, struct bpos pos) { bch2_trans_verify_not_unlocked_or_in_restart(trans); @@ -359,17 +350,40 @@ void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, panic("not locked: %s %s\n", bch2_btree_id_str(id), buf.buf); } -#else - static inline void bch2_btree_path_verify_level(struct btree_trans *trans, - struct btree_path *path, unsigned l) {} + struct btree_path *path, unsigned l) +{ + if (static_branch_unlikely(&bch2_debug_check_iterators)) + __bch2_btree_path_verify_level(trans, path, l); +} + static inline void bch2_btree_path_verify(struct btree_trans *trans, - struct btree_path *path) {} -static inline void bch2_btree_iter_verify(struct btree_iter *iter) {} -static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) {} -static inline int bch2_btree_iter_verify_ret(struct btree_iter *iter, struct bkey_s_c k) { return 0; } + struct btree_path *path) +{ + if (static_branch_unlikely(&bch2_debug_check_iterators)) + __bch2_btree_path_verify(trans, path); +} -#endif +static inline void bch2_btree_iter_verify(struct btree_trans *trans, + struct btree_iter *iter) +{ + if (static_branch_unlikely(&bch2_debug_check_iterators)) + __bch2_btree_iter_verify(trans, iter); +} + +static inline void bch2_btree_iter_verify_entry_exit(struct btree_iter *iter) +{ + if (static_branch_unlikely(&bch2_debug_check_iterators)) + __bch2_btree_iter_verify_entry_exit(iter); +} + +static inline int bch2_btree_iter_verify_ret(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_s_c k) +{ + return static_branch_unlikely(&bch2_debug_check_iterators) + ? 
__bch2_btree_iter_verify_ret(trans, iter, k) + : 0; +} /* Btree path: fixups after btree updates */ @@ -523,7 +537,7 @@ void bch2_btree_node_iter_fix(struct btree_trans *trans, __bch2_btree_node_iter_fix(path, b, node_iter, t, where, clobber_u64s, new_u64s); - if (bch2_debug_check_iterators) + if (static_branch_unlikely(&bch2_debug_check_iterators)) bch2_btree_node_iter_verify(node_iter, b); } @@ -562,20 +576,6 @@ static inline struct bkey_s_c btree_path_level_peek_all(struct bch_fs *c, bch2_btree_node_iter_peek_all(&l->iter, l->b)); } -static inline struct bkey_s_c btree_path_level_peek(struct btree_trans *trans, - struct btree_path *path, - struct btree_path_level *l, - struct bkey *u) -{ - struct bkey_s_c k = __btree_iter_unpack(trans->c, l, u, - bch2_btree_node_iter_peek(&l->iter, l->b)); - - path->pos = k.k ? k.k->p : l->b->key.k.p; - trans->paths_sorted = false; - bch2_btree_path_verify_level(trans, path, l - path->l); - return k; -} - static inline struct bkey_s_c btree_path_level_prev(struct btree_trans *trans, struct btree_path *path, struct btree_path_level *l, @@ -991,7 +991,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans, path->level = level; bch2_btree_path_level_init(trans, path, b); - bch2_btree_path_verify_locks(path); + bch2_btree_path_verify_locks(trans, path); err: bch2_bkey_buf_exit(&tmp, c); return ret; } @@ -1103,7 +1103,7 @@ static void btree_path_set_level_down(struct btree_trans *trans, if (btree_lock_want(path, l) == BTREE_NODE_UNLOCKED) btree_node_unlock(trans, path, l); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); bch2_btree_path_verify(trans, path); } @@ -1176,7 +1176,7 @@ int bch2_btree_path_traverse_one(struct btree_trans *trans, } if (path->cached) { - ret = bch2_btree_path_traverse_cached(trans, path, flags); + ret = bch2_btree_path_traverse_cached(trans, path_idx, flags); goto out; } @@ -1301,7 +1301,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, if (unlikely(path->cached)) { btree_node_unlock(trans, path, 0); path->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_up); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); goto out; } @@ -1330,7 +1330,7 @@ __bch2_btree_path_set_pos(struct btree_trans *trans, } if (unlikely(level != path->level)) { - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); __bch2_btree_path_unlock(trans, path); } out: @@ -1399,45 +1399,45 @@ static bool bch2_btree_path_can_relock(struct btree_trans *trans, struct btree_p void bch2_path_put(struct btree_trans *trans, btree_path_idx_t path_idx, bool intent) { - struct btree_path *path = trans->paths + path_idx, *dup; + struct btree_path *path = trans->paths + path_idx, *dup = NULL; if (!__btree_path_put(trans, path, intent)) return; + if (!path->preserve && !path->should_be_locked) + goto free; + dup = path->preserve ? have_path_at_pos(trans, path) : have_node_at_pos(trans, path); - - trace_btree_path_free(trans, path_idx, dup); - - if (!dup && !(!path->preserve && !is_btree_node(path, path->level))) + if (!dup) return; - if (path->should_be_locked && !trans->restarted) { - if (!dup) - return; - + /* + * If we need this path locked, the duplicate also has to be locked + * before we free this one: + */ + if (path->should_be_locked && + !dup->should_be_locked && + !trans->restarted) { if (!(trans->locked ? 
bch2_btree_path_relock_norestart(trans, dup) : bch2_btree_path_can_relock(trans, dup))) return; - } - if (dup) { - dup->preserve |= path->preserve; - dup->should_be_locked |= path->should_be_locked; + dup->should_be_locked = true; } - __bch2_path_free(trans, path_idx); -} - -static void bch2_path_put_nokeep(struct btree_trans *trans, btree_path_idx_t path, - bool intent) -{ - if (!__btree_path_put(trans, trans->paths + path, intent)) - return; + BUG_ON(path->should_be_locked && + !trans->restarted && + trans->locked && + !btree_node_locked(dup, dup->level)); - __bch2_path_free(trans, path); + path->should_be_locked = false; + dup->preserve |= path->preserve; +free: + trace_btree_path_free(trans, path_idx, dup); + __bch2_path_free(trans, path_idx); } void __noreturn bch2_trans_restart_error(struct btree_trans *trans, u32 restart_count) @@ -1499,24 +1499,16 @@ void bch2_trans_updates_to_text(struct printbuf *buf, struct btree_trans *trans) prt_newline(buf); } - for (struct jset_entry *e = trans->journal_entries; + for (struct jset_entry *e = btree_trans_journal_entries_start(trans); e != btree_trans_journal_entries_top(trans); - e = vstruct_next(e)) + e = vstruct_next(e)) { bch2_journal_entry_to_text(buf, trans->c, e); + prt_newline(buf); + } printbuf_indent_sub(buf, 2); } -noinline __cold -void bch2_dump_trans_updates(struct btree_trans *trans) -{ - struct printbuf buf = PRINTBUF; - - bch2_trans_updates_to_text(&buf, trans); - bch2_print_str(trans->c, buf.buf); - printbuf_exit(&buf); -} - static void bch2_btree_path_to_text_short(struct printbuf *out, struct btree_trans *trans, btree_path_idx_t path_idx) { struct btree_path *path = trans->paths + path_idx; @@ -1613,7 +1605,7 @@ void __bch2_dump_trans_paths_updates(struct btree_trans *trans, bool nosort) __bch2_trans_paths_to_text(&buf, trans, nosort); bch2_trans_updates_to_text(&buf, trans); - bch2_print_str(trans->c, buf.buf); + bch2_print_str(trans->c, KERN_ERR, buf.buf); printbuf_exit(&buf); } @@ -1757,6 +1749,10 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, btree_trans_sort_paths(trans); + if (intent) + locks_want = max(locks_want, level + 1); + locks_want = min(locks_want, BTREE_MAX_DEPTH); + trans_for_each_path_inorder(trans, path, iter) { if (__btree_path_cmp(path, btree_id, @@ -1771,7 +1767,8 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, if (path_pos && trans->paths[path_pos].cached == cached && trans->paths[path_pos].btree_id == btree_id && - trans->paths[path_pos].level == level) { + trans->paths[path_pos].level == level && + bch2_btree_path_upgrade_norestart(trans, trans->paths + path_pos, locks_want)) { trace_btree_path_get(trans, trans->paths + path_pos, &pos); __btree_path_get(trans, trans->paths + path_pos, intent); @@ -1803,9 +1800,6 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, if (!(flags & BTREE_ITER_nopreserve)) path->preserve = true; - if (path->intent_ref) - locks_want = max(locks_want, level + 1); - /* * If the path has locks_want greater than requested, we don't downgrade * it here - on transaction restart because btree node split needs to @@ -1814,10 +1808,6 @@ btree_path_idx_t bch2_path_get(struct btree_trans *trans, * a successful transaction commit. 
*/ - locks_want = min(locks_want, BTREE_MAX_DEPTH); - if (locks_want > path->locks_want) - bch2_btree_path_upgrade_noupgrade_sibs(trans, path, locks_want, NULL); - return path_idx; } @@ -1877,10 +1867,8 @@ hole: return (struct bkey_s_c) { u, NULL }; } -void bch2_set_btree_iter_dontneed(struct btree_iter *iter) +void bch2_set_btree_iter_dontneed(struct btree_trans *trans, struct btree_iter *iter) { - struct btree_trans *trans = iter->trans; - if (!iter->path || trans->restarted) return; @@ -1892,17 +1880,14 @@ void bch2_set_btree_iter_dontneed(struct btree_iter *iter) /* Btree iterators: */ int __must_check -__bch2_btree_iter_traverse(struct btree_iter *iter) +__bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter) { - return bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); + return bch2_btree_path_traverse(trans, iter->path, iter->flags); } int __must_check -bch2_btree_iter_traverse(struct btree_iter *iter) +bch2_btree_iter_traverse(struct btree_trans *trans, struct btree_iter *iter) { - struct btree_trans *trans = iter->trans; - int ret; - bch2_trans_verify_not_unlocked_or_in_restart(trans); iter->path = bch2_btree_path_set_pos(trans, iter->path, @@ -1910,7 +1895,7 @@ bch2_btree_iter_traverse(struct btree_iter *iter) iter->flags & BTREE_ITER_intent, btree_iter_ip_allocated(iter)); - ret = bch2_btree_path_traverse(iter->trans, iter->path, iter->flags); + int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (ret) return ret; @@ -1922,14 +1907,14 @@ bch2_btree_iter_traverse(struct btree_iter *iter) /* Iterate across nodes (leaf and interior nodes) */ -struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) +struct btree *bch2_btree_iter_peek_node(struct btree_trans *trans, + struct btree_iter *iter) { - struct btree_trans *trans = iter->trans; struct btree *b = NULL; int ret; EBUG_ON(trans->paths[iter->path].cached); - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (ret) @@ -1951,7 +1936,7 @@ struct btree *bch2_btree_iter_peek_node(struct btree_iter *iter) btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); out: bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); return b; err: @@ -1960,26 +1945,26 @@ err: } /* Only kept for -tools */ -struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *iter) +struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *trans, + struct btree_iter *iter) { struct btree *b; - while (b = bch2_btree_iter_peek_node(iter), + while (b = bch2_btree_iter_peek_node(trans, iter), bch2_err_matches(PTR_ERR_OR_ZERO(b), BCH_ERR_transaction_restart)) - bch2_trans_begin(iter->trans); + bch2_trans_begin(trans); return b; } -struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) +struct btree *bch2_btree_iter_next_node(struct btree_trans *trans, struct btree_iter *iter) { - struct btree_trans *trans = iter->trans; struct btree *b = NULL; int ret; EBUG_ON(trans->paths[iter->path].cached); bch2_trans_verify_not_unlocked_or_in_restart(trans); - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (ret) @@ -1994,17 +1979,24 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) /* got to end? 
*/ if (!btree_path_node(path, path->level + 1)) { + path->should_be_locked = false; btree_path_set_level_up(trans, path); return NULL; } + /* + * We don't correctly handle nodes with extra intent locks here: + * downgrade so we don't violate locking invariants + */ + bch2_btree_path_downgrade(trans, path); + if (!bch2_btree_node_relock(trans, path, path->level + 1)) { + trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path); + ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); __bch2_btree_path_unlock(trans, path); path->l[path->level].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); path->l[path->level + 1].b = ERR_PTR(-BCH_ERR_no_btree_node_relock); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - trace_and_count(trans->c, trans_restart_relock_next_node, trans, _THIS_IP_, path); - ret = btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); goto err; } @@ -2046,7 +2038,7 @@ struct btree *bch2_btree_iter_next_node(struct btree_iter *iter) EBUG_ON(btree_iter_path(trans, iter)->uptodate); out: bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); return b; err: @@ -2056,7 +2048,7 @@ err: /* Iterate across keys (in leaf nodes only) */ -inline bool bch2_btree_iter_advance(struct btree_iter *iter) +inline bool bch2_btree_iter_advance(struct btree_trans *trans, struct btree_iter *iter) { struct bpos pos = iter->k.p; bool ret = !(iter->flags & BTREE_ITER_all_snapshots @@ -2065,11 +2057,11 @@ inline bool bch2_btree_iter_advance(struct btree_iter *iter) if (ret && !(iter->flags & BTREE_ITER_is_extents)) pos = bkey_successor(iter, pos); - bch2_btree_iter_set_pos(iter, pos); + bch2_btree_iter_set_pos(trans, iter, pos); return ret; } -inline bool bch2_btree_iter_rewind(struct btree_iter *iter) +inline bool bch2_btree_iter_rewind(struct btree_trans *trans, struct btree_iter *iter) { struct bpos pos = bkey_start_pos(&iter->k); bool ret = !(iter->flags & BTREE_ITER_all_snapshots @@ -2078,7 +2070,7 @@ inline bool bch2_btree_iter_rewind(struct btree_iter *iter) if (ret && !(iter->flags & BTREE_ITER_is_extents)) pos = bkey_predecessor(iter, pos); - bch2_btree_iter_set_pos(iter, pos); + bch2_btree_iter_set_pos(trans, iter, pos); return ret; } @@ -2205,9 +2197,9 @@ void btree_trans_peek_prev_journal(struct btree_trans *trans, * bkey_s_c_null: */ static noinline -struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos pos) +struct bkey_s_c btree_trans_peek_key_cache(struct btree_trans *trans, struct btree_iter *iter, + struct bpos pos) { - struct btree_trans *trans = iter->trans; struct bch_fs *c = trans->c; struct bkey u; struct bkey_s_c k; @@ -2253,14 +2245,14 @@ struct bkey_s_c btree_trans_peek_key_cache(struct btree_iter *iter, struct bpos return k; } -static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bpos search_key) +static struct bkey_s_c __bch2_btree_iter_peek(struct btree_trans *trans, struct btree_iter *iter, + struct bpos search_key) { - struct btree_trans *trans = iter->trans; struct bkey_s_c k, k2; int ret; EBUG_ON(btree_iter_path(trans, iter)->cached); - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); while (1) { iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, @@ -2270,7 +2262,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if 
(unlikely(ret)) { /* ensure that iter->k is consistent with iter->pos: */ - bch2_btree_iter_set_pos(iter, iter->pos); + bch2_btree_iter_set_pos(trans, iter, iter->pos); k = bkey_s_c_err(ret); break; } @@ -2280,7 +2272,7 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp if (unlikely(!l->b)) { /* No btree nodes at requested level: */ - bch2_btree_iter_set_pos(iter, SPOS_MAX); + bch2_btree_iter_set_pos(trans, iter, SPOS_MAX); k = bkey_s_c_null; break; } @@ -2291,10 +2283,10 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && k.k && - (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { + (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) { k = k2; if (bkey_err(k)) { - bch2_btree_iter_set_pos(iter, iter->pos); + bch2_btree_iter_set_pos(trans, iter, iter->pos); break; } } @@ -2327,27 +2319,28 @@ static struct bkey_s_c __bch2_btree_iter_peek(struct btree_iter *iter, struct bp search_key = bpos_successor(l->b->key.k.p); } else { /* End of btree: */ - bch2_btree_iter_set_pos(iter, SPOS_MAX); + bch2_btree_iter_set_pos(trans, iter, SPOS_MAX); k = bkey_s_c_null; break; } } - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); return k; } /** * bch2_btree_iter_peek_max() - returns first key greater than or equal to * iterator's current position + * @trans: btree transaction object * @iter: iterator to peek from * @end: search limit: returns keys less than or equal to @end * * Returns: key if found, or an error extractable with bkey_err(). */ -struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos end) +struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *trans, struct btree_iter *iter, + struct bpos end) { - struct btree_trans *trans = iter->trans; struct bpos search_key = btree_iter_search_key(iter); struct bkey_s_c k; struct bpos iter_pos = iter->pos; @@ -2364,13 +2357,12 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en } if (iter->update_path) { - bch2_path_put_nokeep(trans, iter->update_path, - iter->flags & BTREE_ITER_intent); + bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent); iter->update_path = 0; } while (1) { - k = __bch2_btree_iter_peek(iter, search_key); + k = __bch2_btree_iter_peek(trans, iter, search_key); if (unlikely(!k.k)) goto end; if (unlikely(bkey_err(k))) @@ -2394,8 +2386,8 @@ struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *iter, struct bpos en if (iter->update_path && !bkey_eq(trans->paths[iter->update_path].pos, k.k->p)) { - bch2_path_put_nokeep(trans, iter->update_path, - iter->flags & BTREE_ITER_intent); + bch2_path_put(trans, iter->update_path, + iter->flags & BTREE_ITER_intent); iter->update_path = 0; } @@ -2484,9 +2476,9 @@ out_no_locked: if (!(iter->flags & BTREE_ITER_all_snapshots)) iter->pos.snapshot = iter->snapshot; - ret = bch2_btree_iter_verify_ret(iter, k); + ret = bch2_btree_iter_verify_ret(trans, iter, k); if (unlikely(ret)) { - bch2_btree_iter_set_pos(iter, iter->pos); + bch2_btree_iter_set_pos(trans, iter, iter->pos); k = bkey_s_c_err(ret); } @@ -2494,7 +2486,7 @@ out_no_locked: return k; end: - bch2_btree_iter_set_pos(iter, end); + bch2_btree_iter_set_pos(trans, iter, end); k = bkey_s_c_null; goto out_no_locked; } @@ -2502,24 +2494,25 @@ end: /** * bch2_btree_iter_next() - returns first key greater than iterator's current * position + * @trans: btree transaction object * @iter: iterator to peek from * * Returns: key 
if found, or an error extractable with bkey_err(). */ -struct bkey_s_c bch2_btree_iter_next(struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_next(struct btree_trans *trans, struct btree_iter *iter) { - if (!bch2_btree_iter_advance(iter)) + if (!bch2_btree_iter_advance(trans, iter)) return bkey_s_c_null; - return bch2_btree_iter_peek(iter); + return bch2_btree_iter_peek(trans, iter); } -static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, struct bpos search_key) +static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter, + struct bpos search_key) { - struct btree_trans *trans = iter->trans; struct bkey_s_c k, k2; - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); while (1) { iter->path = bch2_btree_path_set_pos(trans, iter->path, search_key, @@ -2529,7 +2522,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru int ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) { /* ensure that iter->k is consistent with iter->pos: */ - bch2_btree_iter_set_pos(iter, iter->pos); + bch2_btree_iter_set_pos(trans, iter, iter->pos); k = bkey_s_c_err(ret); break; } @@ -2539,7 +2532,7 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru if (unlikely(!l->b)) { /* No btree nodes at requested level: */ - bch2_btree_iter_set_pos(iter, SPOS_MAX); + bch2_btree_iter_set_pos(trans, iter, SPOS_MAX); k = bkey_s_c_null; break; } @@ -2555,10 +2548,10 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && k.k && - (k2 = btree_trans_peek_key_cache(iter, k.k->p)).k) { + (k2 = btree_trans_peek_key_cache(trans, iter, k.k->p)).k) { k = k2; if (bkey_err(k2)) { - bch2_btree_iter_set_pos(iter, iter->pos); + bch2_btree_iter_set_pos(trans, iter, iter->pos); break; } } @@ -2579,28 +2572,33 @@ static struct bkey_s_c __bch2_btree_iter_peek_prev(struct btree_iter *iter, stru search_key = bpos_predecessor(path->l[0].b->data->min_key); } else { /* Start of btree: */ - bch2_btree_iter_set_pos(iter, POS_MIN); + bch2_btree_iter_set_pos(trans, iter, POS_MIN); k = bkey_s_c_null; break; } } - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); return k; } /** * bch2_btree_iter_peek_prev_min() - returns first key less than or equal to * iterator's current position + * @trans: btree transaction object * @iter: iterator to peek from * @end: search limit: returns keys greater than or equal to @end * * Returns: key if found, or an error extractable with bkey_err(). 
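 * A minimal usage sketch (hedged: assumes @trans and @iter were already set
 * up via bch2_trans_iter_init(), `inum` is a hypothetical inode number, and
 * the end bound shares the iterator position's inode):
 *
 *	k = bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum, 0));
 *	ret = bkey_err(k);
 *	if (!ret && k.k)
 *		handle(k);	(hypothetical consumer of the last key for
 *				 inum at or before iter->pos)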
*/ -struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bpos end) +struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *trans, struct btree_iter *iter, + struct bpos end) { if ((iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots)) && - !bkey_eq(iter->pos, POS_MAX)) { + !bkey_eq(iter->pos, POS_MAX) && + !((iter->flags & BTREE_ITER_is_extents) && + iter->pos.offset == U64_MAX)) { + /* * bkey_start_pos(), for extents, is not monotonically * increasing until after filtering for snapshots: @@ -2609,7 +2607,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp * real visible extents - easiest to just use peek_slot() (which * internally uses peek() for extents) */ - struct bkey_s_c k = bch2_btree_iter_peek_slot(iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, iter); if (bkey_err(k)) return k; @@ -2619,14 +2617,13 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp return k; } - struct btree_trans *trans = iter->trans; struct bpos search_key = iter->pos; struct bkey_s_c k; btree_path_idx_t saved_path = 0; bch2_trans_verify_not_unlocked_or_in_restart(trans); bch2_btree_iter_verify_entry_exit(iter); - EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && bpos_eq(end, POS_MIN)); + EBUG_ON((iter->flags & BTREE_ITER_filter_snapshots) && iter->pos.inode != end.inode); int ret = trans_maybe_inject_restart(trans, _RET_IP_); if (unlikely(ret)) { @@ -2635,7 +2632,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp } while (1) { - k = __bch2_btree_iter_peek_prev(iter, search_key); + k = __bch2_btree_iter_peek_prev(trans, iter, search_key); if (unlikely(!k.k)) goto end; if (unlikely(bkey_err(k))) @@ -2649,7 +2646,7 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp * the last possible snapshot overwrite, return * it: */ - bch2_path_put_nokeep(trans, iter->path, + bch2_path_put(trans, iter->path, iter->flags & BTREE_ITER_intent); iter->path = saved_path; saved_path = 0; @@ -2679,8 +2676,8 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp * our previous saved candidate: */ if (saved_path) { - bch2_path_put_nokeep(trans, saved_path, - iter->flags & BTREE_ITER_intent); + bch2_path_put(trans, saved_path, + iter->flags & BTREE_ITER_intent); saved_path = 0; } @@ -2723,13 +2720,13 @@ struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *iter, struct bp iter->pos.snapshot = iter->snapshot; out_no_locked: if (saved_path) - bch2_path_put_nokeep(trans, saved_path, iter->flags & BTREE_ITER_intent); + bch2_path_put(trans, saved_path, iter->flags & BTREE_ITER_intent); bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); return k; end: - bch2_btree_iter_set_pos(iter, end); + bch2_btree_iter_set_pos(trans, iter, end); k = bkey_s_c_null; goto out_no_locked; } @@ -2737,34 +2734,34 @@ end: /** * bch2_btree_iter_prev() - returns first key less than iterator's current * position + * @trans: btree transaction object * @iter: iterator to peek from * * Returns: key if found, or an error extractable with bkey_err(). 
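 * For full reverse scans this is typically wrapped by the
 * for_each_btree_key_reverse_norestart() macro in btree_iter.h; a hedged
 * sketch, assuming a held transaction and a hypothetical `inum`:
 *
 *	for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_extents,
 *			POS(inum, U64_MAX), 0, k, ret)
 *		...;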
*/ -struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *trans, struct btree_iter *iter) { - if (!bch2_btree_iter_rewind(iter)) + if (!bch2_btree_iter_rewind(trans, iter)) return bkey_s_c_null; - return bch2_btree_iter_peek_prev(iter); + return bch2_btree_iter_peek_prev(trans, iter); } -struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *trans, struct btree_iter *iter) { - struct btree_trans *trans = iter->trans; struct bpos search_key; struct bkey_s_c k; int ret; bch2_trans_verify_not_unlocked_or_in_restart(trans); - bch2_btree_iter_verify(iter); + bch2_btree_iter_verify(trans, iter); bch2_btree_iter_verify_entry_exit(iter); EBUG_ON(btree_iter_path(trans, iter)->level && (iter->flags & BTREE_ITER_with_key_cache)); ret = trans_maybe_inject_restart(trans, _RET_IP_); if (unlikely(ret)) { k = bkey_s_c_err(ret); - goto out_no_locked; + goto out; } /* extents can't span inode numbers: */ @@ -2773,7 +2770,7 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (iter->pos.inode == KEY_INODE_MAX) return bkey_s_c_null; - bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); + bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos)); } search_key = btree_iter_search_key(iter); @@ -2784,13 +2781,15 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) ret = bch2_btree_path_traverse(trans, iter->path, iter->flags); if (unlikely(ret)) { k = bkey_s_c_err(ret); - goto out_no_locked; + goto out; } struct btree_path *path = btree_iter_path(trans, iter); if (unlikely(!btree_path_node(path, path->level))) return bkey_s_c_null; + btree_path_set_should_be_locked(trans, path); + if ((iter->flags & BTREE_ITER_cached) || !(iter->flags & (BTREE_ITER_is_extents|BTREE_ITER_filter_snapshots))) { k = bkey_s_c_null; @@ -2807,16 +2806,16 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) goto out; if (unlikely(iter->flags & BTREE_ITER_with_key_cache) && - (k = btree_trans_peek_key_cache(iter, iter->pos)).k) { + (k = btree_trans_peek_key_cache(trans, iter, iter->pos)).k) { if (!bkey_err(k)) iter->k = *k.k; /* We're not returning a key from iter->path: */ - goto out_no_locked; + goto out; } - k = bch2_btree_path_peek_slot(trans->paths + iter->path, &iter->k); + k = bch2_btree_path_peek_slot(btree_iter_path(trans, iter), &iter->k); if (unlikely(!k.k)) - goto out_no_locked; + goto out; if (unlikely(k.k->type == KEY_TYPE_whiteout && (iter->flags & BTREE_ITER_filter_snapshots) && @@ -2834,8 +2833,8 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) if (iter->flags & BTREE_ITER_intent) { struct btree_iter iter2; - bch2_trans_copy_iter(&iter2, iter); - k = bch2_btree_iter_peek_max(&iter2, end); + bch2_trans_copy_iter(trans, &iter2, iter); + k = bch2_btree_iter_peek_max(trans, &iter2, end); if (k.k && !bkey_err(k)) { swap(iter->key_cache_path, iter2.key_cache_path); @@ -2846,15 +2845,15 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } else { struct bpos pos = iter->pos; - k = bch2_btree_iter_peek_max(iter, end); + k = bch2_btree_iter_peek_max(trans, iter, end); if (unlikely(bkey_err(k))) - bch2_btree_iter_set_pos(iter, pos); + bch2_btree_iter_set_pos(trans, iter, pos); else iter->pos = pos; } if (unlikely(bkey_err(k))) - goto out_no_locked; + goto out; next = k.k ? 
bkey_start_pos(k.k) : POS_MAX; @@ -2876,42 +2875,40 @@ struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *iter) } } out: - btree_path_set_should_be_locked(trans, btree_iter_path(trans, iter)); -out_no_locked: bch2_btree_iter_verify_entry_exit(iter); - bch2_btree_iter_verify(iter); - ret = bch2_btree_iter_verify_ret(iter, k); + bch2_btree_iter_verify(trans, iter); + ret = bch2_btree_iter_verify_ret(trans, iter, k); if (unlikely(ret)) return bkey_s_c_err(ret); return k; } -struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *trans, struct btree_iter *iter) { - if (!bch2_btree_iter_advance(iter)) + if (!bch2_btree_iter_advance(trans, iter)) return bkey_s_c_null; - return bch2_btree_iter_peek_slot(iter); + return bch2_btree_iter_peek_slot(trans, iter); } -struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *trans, struct btree_iter *iter) { - if (!bch2_btree_iter_rewind(iter)) + if (!bch2_btree_iter_rewind(trans, iter)) return bkey_s_c_null; - return bch2_btree_iter_peek_slot(iter); + return bch2_btree_iter_peek_slot(trans, iter); } /* Obsolete, but still used by rust wrapper in -tools */ -struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *iter) +struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *trans, struct btree_iter *iter) { struct bkey_s_c k; - while (btree_trans_too_many_iters(iter->trans) || - (k = bch2_btree_iter_peek_type(iter, iter->flags), + while (btree_trans_too_many_iters(trans) || + (k = bch2_btree_iter_peek_type(trans, iter, iter->flags), bch2_err_matches(bkey_err(k), BCH_ERR_transaction_restart))) - bch2_trans_begin(iter->trans); + bch2_trans_begin(trans); return k; } @@ -2944,7 +2941,7 @@ static void btree_trans_verify_sorted(struct btree_trans *trans) struct btree_path *path, *prev = NULL; struct trans_for_each_path_inorder_iter iter; - if (!bch2_debug_check_iterators) + if (!static_branch_unlikely(&bch2_debug_check_iterators)) return; trans_for_each_path_inorder(trans, path, iter) { @@ -3046,7 +3043,7 @@ static inline void btree_path_list_add(struct btree_trans *trans, void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) { if (iter->update_path) - bch2_path_put_nokeep(trans, iter->update_path, + bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent); if (iter->path) bch2_path_put(trans, iter->path, @@ -3057,7 +3054,6 @@ void bch2_trans_iter_exit(struct btree_trans *trans, struct btree_iter *iter) iter->path = 0; iter->update_path = 0; iter->key_cache_path = 0; - iter->trans = NULL; } void bch2_trans_iter_init_outlined(struct btree_trans *trans, @@ -3097,10 +3093,9 @@ void bch2_trans_node_iter_init(struct btree_trans *trans, BUG_ON(iter->min_depth != depth); } -void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) +void bch2_trans_copy_iter(struct btree_trans *trans, + struct btree_iter *dst, struct btree_iter *src) { - struct btree_trans *trans = src->trans; - *dst = *src; #ifdef TRACK_PATH_ALLOCATED dst->ip_allocated = _RET_IP_; @@ -3112,7 +3107,19 @@ void bch2_trans_copy_iter(struct btree_iter *dst, struct btree_iter *src) dst->key_cache_path = 0; } -void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE +void bch2_trans_kmalloc_trace_to_text(struct printbuf *out, + darray_trans_kmalloc_trace *trace) +{ + printbuf_tabstops_reset(out); + 
printbuf_tabstop_push(out, 60); + + darray_for_each(*trace, i) + prt_printf(out, "%pS\t%zu\n", (void *) i->ip, i->bytes); +} +#endif + +void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size, unsigned long ip) { struct bch_fs *c = trans->c; unsigned new_top = trans->mem_top + size; @@ -3122,14 +3129,35 @@ void *__bch2_trans_kmalloc(struct btree_trans *trans, size_t size) void *new_mem; void *p; - WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX); + if (WARN_ON_ONCE(new_bytes > BTREE_TRANS_MEM_MAX)) { +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + struct printbuf buf = PRINTBUF; + bch2_trans_kmalloc_trace_to_text(&buf, &trans->trans_kmalloc_trace); + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); +#endif + } ret = trans_maybe_inject_restart(trans, _RET_IP_); if (ret) return ERR_PTR(ret); struct btree_transaction_stats *s = btree_trans_stats(trans); - s->max_mem = max(s->max_mem, new_bytes); + if (new_bytes > s->max_mem) { + mutex_lock(&s->lock); +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_resize(&s->trans_kmalloc_trace, trans->trans_kmalloc_trace.nr); + s->trans_kmalloc_trace.nr = min(s->trans_kmalloc_trace.size, + trans->trans_kmalloc_trace.nr); + + memcpy(s->trans_kmalloc_trace.data, + trans->trans_kmalloc_trace.data, + sizeof(s->trans_kmalloc_trace.data[0]) * + s->trans_kmalloc_trace.nr); +#endif + s->max_mem = new_bytes; + mutex_unlock(&s->lock); + } if (trans->used_mempool) { if (trans->mem_bytes >= new_bytes) @@ -3189,6 +3217,8 @@ out_new_mem: BCH_ERR_transaction_restart_mem_realloced, _RET_IP_)); } out_change_top: + bch2_trans_kmalloc_trace(trans, size, ip); + p = trans->mem + trans->mem_top; trans->mem_top += size; memset(p, 0, size); @@ -3248,7 +3278,6 @@ u32 bch2_trans_begin(struct btree_trans *trans) trans->restart_count++; trans->mem_top = 0; - trans->journal_entries = NULL; trans_for_each_path(trans, path, i) { path->should_be_locked = false; @@ -3302,6 +3331,10 @@ u32 bch2_trans_begin(struct btree_trans *trans) } #endif +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + trans->trans_kmalloc_trace.nr = 0; +#endif + trans_set_locked(trans, false); if (trans->restarted) { @@ -3402,7 +3435,6 @@ got_trans: } trans->nr_paths_max = s->nr_max_paths; - trans->journal_entries_size = s->journal_entries_size; } trans->srcu_idx = srcu_read_lock(&c->btree_trans_barrier); @@ -3414,29 +3446,45 @@ got_trans: return trans; } -static void check_btree_paths_leaked(struct btree_trans *trans) -{ #ifdef CONFIG_BCACHEFS_DEBUG - struct bch_fs *c = trans->c; + +static bool btree_paths_leaked(struct btree_trans *trans) +{ struct btree_path *path; unsigned i; trans_for_each_path(trans, path, i) if (path->ref) - goto leaked; - return; -leaked: - bch_err(c, "btree paths leaked from %s!", trans->fn); - trans_for_each_path(trans, path, i) - if (path->ref) - printk(KERN_ERR " btree %s %pS\n", - bch2_btree_id_str(path->btree_id), - (void *) path->ip_allocated); - /* Be noisy about this: */ - bch2_fatal_error(c); -#endif + return true; + return false; } +static void check_btree_paths_leaked(struct btree_trans *trans) +{ + if (btree_paths_leaked(trans)) { + struct bch_fs *c = trans->c; + struct btree_path *path; + unsigned i; + + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "btree paths leaked from %s!\n", trans->fn); + trans_for_each_path(trans, path, i) + if (path->ref) + prt_printf(&buf, "btree %s %pS\n", + bch2_btree_id_str(path->btree_id), + (void *) path->ip_allocated); + + bch2_fs_emergency_read_only2(c, &buf); + bch2_print_str(c, KERN_ERR, 
buf.buf); + printbuf_exit(&buf); + } +} +#else +static inline void check_btree_paths_leaked(struct btree_trans *trans) {} +#endif + void bch2_trans_put(struct btree_trans *trans) __releases(&c->btree_trans_barrier) { @@ -3471,6 +3519,9 @@ void bch2_trans_put(struct btree_trans *trans) #ifdef CONFIG_BCACHEFS_DEBUG darray_exit(&trans->last_restarted_trace); #endif +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_exit(&trans->trans_kmalloc_trace); +#endif unsigned long *paths_allocated = trans->paths_allocated; trans->paths_allocated = NULL; @@ -3625,6 +3676,9 @@ void bch2_fs_btree_iter_exit(struct bch_fs *c) for (s = c->btree_transaction_stats; s < c->btree_transaction_stats + ARRAY_SIZE(c->btree_transaction_stats); s++) { +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_exit(&s->trans_kmalloc_trace); +#endif kfree(s->max_paths_text); bch2_time_stats_exit(&s->lock_hold_times); } diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index b96157f3dc9c..2cabb5f0f484 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -9,7 +9,6 @@ void bch2_trans_updates_to_text(struct printbuf *, struct btree_trans *); void bch2_btree_path_to_text(struct printbuf *, struct btree_trans *, btree_path_idx_t); void bch2_trans_paths_to_text(struct printbuf *, struct btree_trans *); -void bch2_dump_trans_updates(struct btree_trans *); void bch2_dump_trans_paths_updates(struct btree_trans *); static inline int __bkey_err(const struct bkey *k) @@ -47,9 +46,11 @@ static inline bool __btree_path_put(struct btree_trans *trans, struct btree_path return --path->ref == 0; } -static inline void btree_path_set_dirty(struct btree_path *path, +static inline void btree_path_set_dirty(struct btree_trans *trans, + struct btree_path *path, enum btree_path_uptodate u) { + BUG_ON(path->should_be_locked && trans->locked && !trans->restarted); path->uptodate = max_t(unsigned, path->uptodate, u); } @@ -286,14 +287,23 @@ static inline int bch2_trans_mutex_lock(struct btree_trans *trans, struct mutex : __bch2_trans_mutex_lock(trans, lock); } -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_trans_verify_paths(struct btree_trans *); -void bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos); -#else -static inline void bch2_trans_verify_paths(struct btree_trans *trans) {} -static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id id, - struct bpos pos) {} -#endif +/* Debug: */ + +void __bch2_trans_verify_paths(struct btree_trans *); +void __bch2_assert_pos_locked(struct btree_trans *, enum btree_id, struct bpos); + +static inline void bch2_trans_verify_paths(struct btree_trans *trans) +{ + if (static_branch_unlikely(&bch2_debug_check_iterators)) + __bch2_trans_verify_paths(trans); +} + +static inline void bch2_assert_pos_locked(struct btree_trans *trans, enum btree_id btree, + struct bpos pos) +{ + if (static_branch_unlikely(&bch2_debug_check_iterators)) + __bch2_assert_pos_locked(trans, btree, pos); +} void bch2_btree_path_fix_key_modified(struct btree_trans *trans, struct btree *, struct bkey_packed *); @@ -335,13 +345,20 @@ static inline void bch2_trans_verify_not_unlocked_or_in_restart(struct btree_tra } __always_inline -static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip) +static int btree_trans_restart_foreign_task(struct btree_trans *trans, int err, unsigned long ip) { BUG_ON(err <= 0); BUG_ON(!bch2_err_matches(-err, BCH_ERR_transaction_restart)); trans->restarted = err; trans->last_restarted_ip = ip; + return -err; +} + 
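+/*
+ * Context, sketched under hedged assumptions: the restart error these
+ * helpers record in trans->restarted is consumed by a retry loop at the
+ * outermost transaction level, e.g. via lockrestart_do() as used by the
+ * iteration macros later in this header; do_work() is a hypothetical
+ * callback:
+ *
+ *	ret = lockrestart_do(trans,
+ *		bch2_btree_iter_traverse(trans, &iter) ?:
+ *		do_work(trans, &iter));
+ */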
+__always_inline +static int btree_trans_restart_ip(struct btree_trans *trans, int err, unsigned long ip) +{ + btree_trans_restart_foreign_task(trans, err, ip); #ifdef CONFIG_BCACHEFS_DEBUG darray_exit(&trans->last_restarted_trace); bch2_save_backtrace(&trans->last_restarted_trace, current, 0, GFP_NOWAIT); @@ -387,36 +404,37 @@ void bch2_trans_node_add(struct btree_trans *trans, struct btree_path *, struct void bch2_trans_node_drop(struct btree_trans *trans, struct btree *); void bch2_trans_node_reinit_iter(struct btree_trans *, struct btree *); -int __must_check __bch2_btree_iter_traverse(struct btree_iter *iter); -int __must_check bch2_btree_iter_traverse(struct btree_iter *); +int __must_check __bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *); +int __must_check bch2_btree_iter_traverse(struct btree_trans *, struct btree_iter *); -struct btree *bch2_btree_iter_peek_node(struct btree_iter *); -struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_iter *); -struct btree *bch2_btree_iter_next_node(struct btree_iter *); +struct btree *bch2_btree_iter_peek_node(struct btree_trans *, struct btree_iter *); +struct btree *bch2_btree_iter_peek_node_and_restart(struct btree_trans *, struct btree_iter *); +struct btree *bch2_btree_iter_next_node(struct btree_trans *, struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek_max(struct btree_iter *, struct bpos); -struct bkey_s_c bch2_btree_iter_next(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_peek_max(struct btree_trans *, struct btree_iter *, struct bpos); +struct bkey_s_c bch2_btree_iter_next(struct btree_trans *, struct btree_iter *); -static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_iter *iter) +static inline struct bkey_s_c bch2_btree_iter_peek(struct btree_trans *trans, + struct btree_iter *iter) { - return bch2_btree_iter_peek_max(iter, SPOS_MAX); + return bch2_btree_iter_peek_max(trans, iter, SPOS_MAX); } -struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_iter *, struct bpos); +struct bkey_s_c bch2_btree_iter_peek_prev_min(struct btree_trans *, struct btree_iter *, struct bpos); -static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_iter *iter) +static inline struct bkey_s_c bch2_btree_iter_peek_prev(struct btree_trans *trans, struct btree_iter *iter) { - return bch2_btree_iter_peek_prev_min(iter, POS_MIN); + return bch2_btree_iter_peek_prev_min(trans, iter, POS_MIN); } -struct bkey_s_c bch2_btree_iter_prev(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_prev(struct btree_trans *, struct btree_iter *); -struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_next_slot(struct btree_iter *); -struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_peek_slot(struct btree_trans *, struct btree_iter *); +struct bkey_s_c bch2_btree_iter_next_slot(struct btree_trans *, struct btree_iter *); +struct bkey_s_c bch2_btree_iter_prev_slot(struct btree_trans *, struct btree_iter *); -bool bch2_btree_iter_advance(struct btree_iter *); -bool bch2_btree_iter_rewind(struct btree_iter *); +bool bch2_btree_iter_advance(struct btree_trans *, struct btree_iter *); +bool bch2_btree_iter_rewind(struct btree_trans *, struct btree_iter *); static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) { @@ -427,10 +445,9 @@ static inline void __bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpo iter->k.size = 0; } -static inline void 
bch2_btree_iter_set_pos(struct btree_iter *iter, struct bpos new_pos) +static inline void bch2_btree_iter_set_pos(struct btree_trans *trans, + struct btree_iter *iter, struct bpos new_pos) { - struct btree_trans *trans = iter->trans; - if (unlikely(iter->update_path)) bch2_path_put(trans, iter->update_path, iter->flags & BTREE_ITER_intent); @@ -448,13 +465,14 @@ static inline void bch2_btree_iter_set_pos_to_extent_start(struct btree_iter *it iter->pos = bkey_start_pos(&iter->k); } -static inline void bch2_btree_iter_set_snapshot(struct btree_iter *iter, u32 snapshot) +static inline void bch2_btree_iter_set_snapshot(struct btree_trans *trans, + struct btree_iter *iter, u32 snapshot) { struct bpos pos = iter->pos; iter->snapshot = snapshot; pos.snapshot = snapshot; - bch2_btree_iter_set_pos(iter, pos); + bch2_btree_iter_set_pos(trans, iter, pos); } void bch2_trans_iter_exit(struct btree_trans *, struct btree_iter *); @@ -496,7 +514,6 @@ static inline void bch2_trans_iter_init_common(struct btree_trans *trans, unsigned flags, unsigned long ip) { - iter->trans = trans; iter->update_path = 0; iter->key_cache_path = 0; iter->btree_id = btree_id; @@ -533,47 +550,77 @@ static inline void bch2_trans_iter_init(struct btree_trans *trans, void bch2_trans_node_iter_init(struct btree_trans *, struct btree_iter *, enum btree_id, struct bpos, unsigned, unsigned, unsigned); -void bch2_trans_copy_iter(struct btree_iter *, struct btree_iter *); +void bch2_trans_copy_iter(struct btree_trans *, struct btree_iter *, struct btree_iter *); -void bch2_set_btree_iter_dontneed(struct btree_iter *); +void bch2_set_btree_iter_dontneed(struct btree_trans *, struct btree_iter *); -void *__bch2_trans_kmalloc(struct btree_trans *, size_t); +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE +void bch2_trans_kmalloc_trace_to_text(struct printbuf *, + darray_trans_kmalloc_trace *); +#endif -/** - * bch2_trans_kmalloc - allocate memory for use by the current transaction - * - * Must be called after bch2_trans_begin, which on second and further calls - * frees all memory allocated in this transaction - */ -static inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +void *__bch2_trans_kmalloc(struct btree_trans *, size_t, unsigned long); + +static inline void bch2_trans_kmalloc_trace(struct btree_trans *trans, size_t size, + unsigned long ip) +{ +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_push(&trans->trans_kmalloc_trace, + ((struct trans_kmalloc_trace) { .ip = ip, .bytes = size })); +#endif +} + +static __always_inline void *bch2_trans_kmalloc_nomemzero_ip(struct btree_trans *trans, size_t size, + unsigned long ip) { size = roundup(size, 8); + bch2_trans_kmalloc_trace(trans, size, ip); + if (likely(trans->mem_top + size <= trans->mem_bytes)) { void *p = trans->mem + trans->mem_top; trans->mem_top += size; - memset(p, 0, size); return p; } else { - return __bch2_trans_kmalloc(trans, size); + return __bch2_trans_kmalloc(trans, size, ip); } } -static inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) +static __always_inline void *bch2_trans_kmalloc_ip(struct btree_trans *trans, size_t size, + unsigned long ip) { - size = round_up(size, 8); + size = roundup(size, 8); + + bch2_trans_kmalloc_trace(trans, size, ip); if (likely(trans->mem_top + size <= trans->mem_bytes)) { void *p = trans->mem + trans->mem_top; trans->mem_top += size; + memset(p, 0, size); return p; } else { - return __bch2_trans_kmalloc(trans, size); + return __bch2_trans_kmalloc(trans, size, ip); } } +/** + * 
bch2_trans_kmalloc - allocate memory for use by the current transaction + * + * Must be called after bch2_trans_begin, which on second and further calls + * frees all memory allocated in this transaction + */ +static __always_inline void *bch2_trans_kmalloc(struct btree_trans *trans, size_t size) +{ + return bch2_trans_kmalloc_ip(trans, size, _THIS_IP_); +} + +static __always_inline void *bch2_trans_kmalloc_nomemzero(struct btree_trans *trans, size_t size) +{ + return bch2_trans_kmalloc_nomemzero_ip(trans, size, _THIS_IP_); +} + static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans, struct btree_iter *iter, unsigned btree_id, struct bpos pos, @@ -582,7 +629,7 @@ static inline struct bkey_s_c __bch2_bkey_get_iter(struct btree_trans *trans, struct bkey_s_c k; bch2_trans_iter_init(trans, iter, btree_id, pos, flags); - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(trans, iter); if (!bkey_err(k) && type && k.k->type != type) k = bkey_s_c_err(-BCH_ERR_ENOENT_bkey_type_mismatch); @@ -652,14 +699,14 @@ u32 bch2_trans_begin(struct btree_trans *); int _ret3 = 0; \ do { \ _ret3 = lockrestart_do((_trans), ({ \ - struct btree *_b = bch2_btree_iter_peek_node(&_iter); \ + struct btree *_b = bch2_btree_iter_peek_node(_trans, &_iter);\ if (!_b) \ break; \ \ PTR_ERR_OR_ZERO(_b) ?: (_do); \ })) ?: \ lockrestart_do((_trans), \ - PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(&_iter))); \ + PTR_ERR_OR_ZERO(bch2_btree_iter_next_node(_trans, &_iter)));\ } while (!_ret3); \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ @@ -671,31 +718,34 @@ u32 bch2_trans_begin(struct btree_trans *); __for_each_btree_node(_trans, _iter, _btree_id, _start, \ 0, 0, _flags, _b, _do) -static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_iter *iter, +static inline struct bkey_s_c bch2_btree_iter_peek_prev_type(struct btree_trans *trans, + struct btree_iter *iter, unsigned flags) { - return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) : - bch2_btree_iter_peek_prev(iter); + return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(trans, iter) : + bch2_btree_iter_peek_prev(trans, iter); } -static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_iter *iter, +static inline struct bkey_s_c bch2_btree_iter_peek_type(struct btree_trans *trans, + struct btree_iter *iter, unsigned flags) { - return flags & BTREE_ITER_slots ? bch2_btree_iter_peek_slot(iter) : - bch2_btree_iter_peek(iter); + return flags & BTREE_ITER_slots ? 
bch2_btree_iter_peek_slot(trans, iter) : + bch2_btree_iter_peek(trans, iter); } -static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_iter *iter, - struct bpos end, - unsigned flags) +static inline struct bkey_s_c bch2_btree_iter_peek_max_type(struct btree_trans *trans, + struct btree_iter *iter, + struct bpos end, + unsigned flags) { if (!(flags & BTREE_ITER_slots)) - return bch2_btree_iter_peek_max(iter, end); + return bch2_btree_iter_peek_max(trans, iter, end); if (bkey_gt(iter->pos, end)) return bkey_s_c_null; - return bch2_btree_iter_peek_slot(iter); + return bch2_btree_iter_peek_slot(trans, iter); } int __bch2_btree_trans_too_many_iters(struct btree_trans *); @@ -762,14 +812,14 @@ transaction_restart: \ \ do { \ _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_max_type(&(_iter), \ + (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), \ _end, (_flags)); \ if (!(_k).k) \ break; \ \ bkey_err(_k) ?: (_do); \ })); \ - } while (!_ret3 && bch2_btree_iter_advance(&(_iter))); \ + } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ _ret3; \ @@ -807,14 +857,14 @@ transaction_restart: \ \ do { \ _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_prev_type(&(_iter), \ + (_k) = bch2_btree_iter_peek_prev_type(_trans, &(_iter), \ (_flags)); \ if (!(_k).k) \ break; \ \ bkey_err(_k) ?: (_do); \ })); \ - } while (!_ret3 && bch2_btree_iter_rewind(&(_iter))); \ + } while (!_ret3 && bch2_btree_iter_rewind(_trans, &(_iter))); \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ _ret3; \ @@ -844,37 +894,38 @@ transaction_restart: \ (_do) ?: bch2_trans_commit(_trans, (_disk_res),\ (_journal_seq), (_commit_flags))) -struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_iter *); +struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *, + struct btree_iter *); #define for_each_btree_key_max_norestart(_trans, _iter, _btree_id, \ _start, _end, _flags, _k, _ret) \ for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ (_start), (_flags)); \ - (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags),\ + (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags),\ !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(&(_iter))) + bch2_btree_iter_advance(_trans, &(_iter))) -#define for_each_btree_key_max_continue_norestart(_iter, _end, _flags, _k, _ret)\ +#define for_each_btree_key_max_continue_norestart(_trans, _iter, _end, _flags, _k, _ret)\ for (; \ - (_k) = bch2_btree_iter_peek_max_type(&(_iter), _end, _flags), \ + (_k) = bch2_btree_iter_peek_max_type(_trans, &(_iter), _end, _flags), \ !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_advance(&(_iter))) + bch2_btree_iter_advance(_trans, &(_iter))) #define for_each_btree_key_norestart(_trans, _iter, _btree_id, \ _start, _flags, _k, _ret) \ for_each_btree_key_max_norestart(_trans, _iter, _btree_id, _start,\ SPOS_MAX, _flags, _k, _ret) -#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ - _start, _flags, _k, _ret) \ - for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ - (_start), (_flags)); \ - (_k) = bch2_btree_iter_peek_prev_type(&(_iter), _flags), \ - !((_ret) = bkey_err(_k)) && (_k).k; \ - bch2_btree_iter_rewind(&(_iter))) +#define for_each_btree_key_reverse_norestart(_trans, _iter, _btree_id, \ + _start, _flags, _k, _ret) \ + for (bch2_trans_iter_init((_trans), &(_iter), (_btree_id), \ + (_start), (_flags)); \ + (_k) = 
bch2_btree_iter_peek_prev_type(_trans, &(_iter), _flags), \ + !((_ret) = bkey_err(_k)) && (_k).k; \ + bch2_btree_iter_rewind(_trans, &(_iter))) -#define for_each_btree_key_continue_norestart(_iter, _flags, _k, _ret) \ - for_each_btree_key_max_continue_norestart(_iter, SPOS_MAX, _flags, _k, _ret) +#define for_each_btree_key_continue_norestart(_trans, _iter, _flags, _k, _ret) \ + for_each_btree_key_max_continue_norestart(_trans, _iter, SPOS_MAX, _flags, _k, _ret) /* * This should not be used in a fastpath, without first trying _do in diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index 6d25e3f85ce8..ade3b5addd75 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -288,7 +288,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, .size = max_t(size_t, keys->size, 8) * 2, }; - new_keys.data = kvmalloc_array(new_keys.size, sizeof(new_keys.data[0]), GFP_KERNEL); + new_keys.data = bch2_kvmalloc(new_keys.size * sizeof(new_keys.data[0]), GFP_KERNEL); if (!new_keys.data) { bch_err(c, "%s: error allocating new key array (size %zu)", __func__, new_keys.size); @@ -687,7 +687,8 @@ void bch2_journal_keys_put(struct bch_fs *c) static void __journal_keys_sort(struct journal_keys *keys) { - sort(keys->data, keys->nr, sizeof(keys->data[0]), journal_sort_key_cmp, NULL); + sort_nonatomic(keys->data, keys->nr, sizeof(keys->data[0]), + journal_sort_key_cmp, NULL); cond_resched(); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 1821f40c161a..9da950e7eb7d 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -101,8 +101,8 @@ static void __bkey_cached_free(struct rcu_pending *pending, struct rcu_head *rcu kmem_cache_free(bch2_key_cache, ck); } -static void bkey_cached_free(struct btree_key_cache *bc, - struct bkey_cached *ck) +static inline void bkey_cached_free_noassert(struct btree_key_cache *bc, + struct bkey_cached *ck) { kfree(ck->k); ck->k = NULL; @@ -116,6 +116,19 @@ static void bkey_cached_free(struct btree_key_cache *bc, this_cpu_inc(*bc->nr_pending); } +static void bkey_cached_free(struct btree_trans *trans, + struct btree_key_cache *bc, + struct bkey_cached *ck) +{ + /* + * we'll hit strange issues in the SRCU code if we aren't holding an + * SRCU read lock... + */ + EBUG_ON(!trans->srcu_held); + + bkey_cached_free_noassert(bc, ck); +} + static struct bkey_cached *__bkey_cached_alloc(unsigned key_u64s, gfp_t gfp) { gfp |= __GFP_ACCOUNT|__GFP_RECLAIMABLE; @@ -156,7 +169,7 @@ bkey_cached_alloc(struct btree_trans *trans, struct btree_path *path, unsigned k } if (ck) { - bch2_btree_lock_init(&ck->c, pcpu_readers ? SIX_LOCK_INIT_PCPU : 0); + bch2_btree_lock_init(&ck->c, pcpu_readers ? 
SIX_LOCK_INIT_PCPU : 0, GFP_KERNEL); ck->c.cached = true; goto lock; } @@ -281,16 +294,31 @@ static int btree_key_cache_create(struct btree_trans *trans, ck_path->uptodate = BTREE_ITER_UPTODATE; return 0; err: - bkey_cached_free(bc, ck); + bkey_cached_free(trans, bc, ck); mark_btree_node_locked_noreset(ck_path, 0, BTREE_NODE_UNLOCKED); return ret; } +static noinline_for_stack void do_trace_key_cache_fill(struct btree_trans *trans, + struct btree_path *ck_path, + struct bkey_s_c k) +{ + struct printbuf buf = PRINTBUF; + + bch2_bpos_to_text(&buf, ck_path->pos); + prt_char(&buf, ' '); + bch2_bkey_val_to_text(&buf, trans->c, k); + trace_key_cache_fill(trans, buf.buf); + printbuf_exit(&buf); +} + static noinline int btree_key_cache_fill(struct btree_trans *trans, - struct btree_path *ck_path, + btree_path_idx_t ck_path_idx, unsigned flags) { + struct btree_path *ck_path = trans->paths + ck_path_idx; + if (flags & BTREE_ITER_cached_nofill) { ck_path->l[0].b = NULL; return 0; @@ -306,12 +334,13 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, BTREE_ITER_key_cache_fill| BTREE_ITER_cached_nofill); iter.flags &= ~BTREE_ITER_with_journal; - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) goto err; /* Recheck after btree lookup, before allocating: */ + ck_path = trans->paths + ck_path_idx; ret = bch2_btree_key_cache_find(c, ck_path->btree_id, ck_path->pos) ? -EEXIST : 0; if (unlikely(ret)) goto out; @@ -320,28 +349,22 @@ static noinline int btree_key_cache_fill(struct btree_trans *trans, if (ret) goto err; - if (trace_key_cache_fill_enabled()) { - struct printbuf buf = PRINTBUF; - - bch2_bpos_to_text(&buf, ck_path->pos); - prt_char(&buf, ' '); - bch2_bkey_val_to_text(&buf, trans->c, k); - trace_key_cache_fill(trans, buf.buf); - printbuf_exit(&buf); - } + if (trace_key_cache_fill_enabled()) + do_trace_key_cache_fill(trans, ck_path, k); out: /* We're not likely to need this iterator again: */ - bch2_set_btree_iter_dontneed(&iter); + bch2_set_btree_iter_dontneed(trans, &iter); err: bch2_trans_iter_exit(trans, &iter); return ret; } static inline int btree_path_traverse_cached_fast(struct btree_trans *trans, - struct btree_path *path) + btree_path_idx_t path_idx) { struct bch_fs *c = trans->c; struct bkey_cached *ck; + struct btree_path *path = trans->paths + path_idx; retry: ck = bch2_btree_key_cache_find(c, path->btree_id, path->pos); if (!ck) @@ -367,27 +390,32 @@ retry: return 0; } -int bch2_btree_path_traverse_cached(struct btree_trans *trans, struct btree_path *path, +int bch2_btree_path_traverse_cached(struct btree_trans *trans, + btree_path_idx_t path_idx, unsigned flags) { - EBUG_ON(path->level); - - path->l[1].b = NULL; + EBUG_ON(trans->paths[path_idx].level); int ret; do { - ret = btree_path_traverse_cached_fast(trans, path); + ret = btree_path_traverse_cached_fast(trans, path_idx); if (unlikely(ret == -ENOENT)) - ret = btree_key_cache_fill(trans, path, flags); + ret = btree_key_cache_fill(trans, path_idx, flags); } while (ret == -EEXIST); + struct btree_path *path = trans->paths + path_idx; + if (unlikely(ret)) { path->uptodate = BTREE_ITER_NEED_TRAVERSE; if (!bch2_err_matches(ret, BCH_ERR_transaction_restart)) { btree_node_unlock(trans, path, 0); path->l[0].b = ERR_PTR(ret); } + } else { + BUG_ON(path->uptodate); + BUG_ON(!path->nodes_locked); } + return ret; } @@ -412,7 +440,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, BTREE_ITER_intent); b_iter.flags &= ~BTREE_ITER_with_key_cache; - 
ret = bch2_btree_iter_traverse(&c_iter); + ret = bch2_btree_iter_traverse(trans, &c_iter); if (ret) goto out; @@ -444,7 +472,7 @@ static int btree_key_cache_flush_pos(struct btree_trans *trans, !test_bit(JOURNAL_space_low, &c->journal.flags)) commit_flags |= BCH_TRANS_COMMIT_no_journal_res; - struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(&b_iter); + struct bkey_s_c btree_k = bch2_btree_iter_peek_slot(trans, &b_iter); ret = bkey_err(btree_k); if (ret) goto err; @@ -496,7 +524,7 @@ evict: mark_btree_node_locked_noreset(path, 0, BTREE_NODE_UNLOCKED); if (bkey_cached_evict(&c->btree_key_cache, ck)) { - bkey_cached_free(&c->btree_key_cache, ck); + bkey_cached_free(trans, &c->btree_key_cache, ck); } else { six_unlock_write(&ck->c.lock); six_unlock_intent(&ck->c.lock); @@ -610,7 +638,7 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, } bkey_cached_evict(bc, ck); - bkey_cached_free(bc, ck); + bkey_cached_free(trans, bc, ck); mark_btree_node_locked(trans, path, 0, BTREE_NODE_UNLOCKED); @@ -618,10 +646,17 @@ void bch2_btree_key_cache_drop(struct btree_trans *trans, unsigned i; trans_for_each_path(trans, path2, i) if (path2->l[0].b == (void *) ck) { + /* + * It's safe to clear should_be_locked here because + * we're evicting from the key cache, and we still have + * the underlying btree locked: filling into the key + * cache would require taking a write lock on the btree + * node + */ + path2->should_be_locked = false; __bch2_btree_path_unlock(trans, path2); path2->l[0].b = ERR_PTR(-BCH_ERR_no_btree_node_drop); - path2->should_be_locked = false; - btree_path_set_dirty(path2, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path2, BTREE_ITER_NEED_TRAVERSE); } bch2_trans_verify_locks(trans); @@ -678,7 +713,7 @@ static unsigned long bch2_btree_key_cache_scan(struct shrinker *shrink, } else if (!bkey_cached_lock_for_evict(ck)) { bc->skipped_lock_fail++; } else if (bkey_cached_evict(bc, ck)) { - bkey_cached_free(bc, ck); + bkey_cached_free_noassert(bc, ck); bc->freed++; freed++; } else { diff --git a/fs/bcachefs/btree_key_cache.h b/fs/bcachefs/btree_key_cache.h index 51d6289b8dee..82d8c72512a9 100644 --- a/fs/bcachefs/btree_key_cache.h +++ b/fs/bcachefs/btree_key_cache.h @@ -40,8 +40,7 @@ int bch2_btree_key_cache_journal_flush(struct journal *, struct bkey_cached * bch2_btree_key_cache_find(struct bch_fs *, enum btree_id, struct bpos); -int bch2_btree_path_traverse_cached(struct btree_trans *, struct btree_path *, - unsigned); +int bch2_btree_path_traverse_cached(struct btree_trans *, btree_path_idx_t, unsigned); bool bch2_btree_insert_key_cached(struct btree_trans *, unsigned, struct btree_insert_entry *); diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 10b805a60f52..2f2aed0c9916 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -1,15 +1,17 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "btree_cache.h" #include "btree_locking.h" #include "btree_types.h" static struct lock_class_key bch2_btree_node_lock_key; void bch2_btree_lock_init(struct btree_bkey_cached_common *b, - enum six_lock_init_flags flags) + enum six_lock_init_flags flags, + gfp_t gfp) { - __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags); + __six_lock_init(&b->lock, "b->c.lock", &bch2_btree_node_lock_key, flags, gfp); lockdep_set_notrack_class(&b->lock); } @@ -90,10 +92,10 @@ static noinline void print_chain(struct printbuf *out, struct lock_graph *g) struct trans_waiting_for_lock *i; for (i = g->g; i != g->g + 
g->nr; i++) { - struct task_struct *task = i->trans->locking_wait.task; + struct task_struct *task = READ_ONCE(i->trans->locking_wait.task); if (i != g->g) prt_str(out, "<- "); - prt_printf(out, "%u ", task ?task->pid : 0); + prt_printf(out, "%u ", task ? task->pid : 0); } prt_newline(out); } @@ -171,7 +173,9 @@ static int abort_lock(struct lock_graph *g, struct trans_waiting_for_lock *i) { if (i == g->g) { trace_would_deadlock(g, i->trans); - return btree_trans_restart(i->trans, BCH_ERR_transaction_restart_would_deadlock); + return btree_trans_restart_foreign_task(i->trans, + BCH_ERR_transaction_restart_would_deadlock, + _THIS_IP_); } else { i->trans->lock_must_abort = true; wake_up_process(i->trans->locking_wait.task); @@ -233,7 +237,7 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle, prt_newline(&buf); } - bch2_print_string_as_lines_nonblocking(KERN_ERR, buf.buf); + bch2_print_str_nonblocking(g->g->trans->c, KERN_ERR, buf.buf); printbuf_exit(&buf); BUG(); } @@ -447,13 +451,13 @@ void bch2_btree_node_lock_write_nofail(struct btree_trans *trans, /* relock */ -static inline bool btree_path_get_locks(struct btree_trans *trans, - struct btree_path *path, - bool upgrade, - struct get_locks_fail *f) +static int btree_path_get_locks(struct btree_trans *trans, + struct btree_path *path, + bool upgrade, + struct get_locks_fail *f, + int restart_err) { unsigned l = path->level; - int fail_idx = -1; do { if (!btree_path_node(path, l)) @@ -461,39 +465,49 @@ static inline bool btree_path_get_locks(struct btree_trans *trans, if (!(upgrade ? bch2_btree_node_upgrade(trans, path, l) - : bch2_btree_node_relock(trans, path, l))) { - fail_idx = l; - - if (f) { - f->l = l; - f->b = path->l[l].b; - } - } + : bch2_btree_node_relock(trans, path, l))) + goto err; l++; } while (l < path->locks_want); + if (path->uptodate == BTREE_ITER_NEED_RELOCK) + path->uptodate = BTREE_ITER_UPTODATE; + + return path->uptodate < BTREE_ITER_NEED_RELOCK ? 0 : -1; +err: + if (f) { + f->l = l; + f->b = path->l[l].b; + } + + /* + * Do transaction restart before unlocking, so we don't pop + * should_be_locked asserts + */ + if (restart_err) { + btree_trans_restart(trans, restart_err); + } else if (path->should_be_locked && !trans->restarted) { + if (upgrade) + path->locks_want = l; + return -1; + } + + __bch2_btree_path_unlock(trans, path); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); + /* * When we fail to get a lock, we have to ensure that any child nodes * can't be relocked so bch2_btree_path_traverse has to walk back up to * the node that we failed to relock: */ - if (fail_idx >= 0) { - __bch2_btree_path_unlock(trans, path); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); - - do { - path->l[fail_idx].b = upgrade - ? ERR_PTR(-BCH_ERR_no_btree_node_upgrade) - : ERR_PTR(-BCH_ERR_no_btree_node_relock); - --fail_idx; - } while (fail_idx >= 0); - } - - if (path->uptodate == BTREE_ITER_NEED_RELOCK) - path->uptodate = BTREE_ITER_UPTODATE; + do { + path->l[l].b = upgrade + ? 
ERR_PTR(-BCH_ERR_no_btree_node_upgrade) + : ERR_PTR(-BCH_ERR_no_btree_node_relock); + } while (l--); - return path->uptodate < BTREE_ITER_NEED_RELOCK; + return -restart_err ?: -1; } bool __bch2_btree_node_relock(struct btree_trans *trans, @@ -580,7 +594,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans, l++) { if (!bch2_btree_node_relock(trans, path, l)) { __bch2_btree_path_unlock(trans, path); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); trace_and_count(trans->c, trans_restart_relock_path_intent, trans, _RET_IP_, path); return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock_path_intent); } @@ -592,9 +606,7 @@ int bch2_btree_path_relock_intent(struct btree_trans *trans, __flatten bool bch2_btree_path_relock_norestart(struct btree_trans *trans, struct btree_path *path) { - struct get_locks_fail f; - - bool ret = btree_path_get_locks(trans, path, false, &f); + bool ret = !btree_path_get_locks(trans, path, false, NULL, 0); bch2_trans_verify_locks(trans); return ret; } @@ -610,27 +622,37 @@ int __bch2_btree_path_relock(struct btree_trans *trans, return 0; } -bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want, - struct get_locks_fail *f) +bool __bch2_btree_path_upgrade_norestart(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) { - EBUG_ON(path->locks_want >= new_locks_want); - path->locks_want = new_locks_want; - bool ret = btree_path_get_locks(trans, path, true, f); - bch2_trans_verify_locks(trans); + /* + * If we need it locked, we can't touch it. Otherwise, we can return + * success - bch2_path_get() will use this path, and it'll just be + * retraversed: + */ + bool ret = !btree_path_get_locks(trans, path, true, NULL, 0) || + !path->should_be_locked; + + bch2_btree_path_verify_locks(trans, path); return ret; } -bool __bch2_btree_path_upgrade(struct btree_trans *trans, - struct btree_path *path, - unsigned new_locks_want, - struct get_locks_fail *f) +int __bch2_btree_path_upgrade(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) { - bool ret = bch2_btree_path_upgrade_noupgrade_sibs(trans, path, new_locks_want, f); - if (ret) + unsigned old_locks = path->nodes_locked; + unsigned old_locks_want = path->locks_want; + + path->locks_want = max_t(unsigned, path->locks_want, new_locks_want); + + struct get_locks_fail f = {}; + int ret = btree_path_get_locks(trans, path, true, &f, + BCH_ERR_transaction_restart_upgrade); + if (!ret) goto out; /* @@ -662,9 +684,30 @@ bool __bch2_btree_path_upgrade(struct btree_trans *trans, linked->btree_id == path->btree_id && linked->locks_want < new_locks_want) { linked->locks_want = new_locks_want; - btree_path_get_locks(trans, linked, true, NULL); + btree_path_get_locks(trans, linked, true, NULL, 0); } } + + count_event(trans->c, trans_restart_upgrade); + if (trace_trans_restart_upgrade_enabled()) { + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "%s %pS\n", trans->fn, (void *) _RET_IP_); + prt_printf(&buf, "btree %s pos\n", bch2_btree_id_str(path->btree_id)); + bch2_bpos_to_text(&buf, path->pos); + prt_printf(&buf, "locks want %u -> %u level %u\n", + old_locks_want, new_locks_want, f.l); + prt_printf(&buf, "nodes_locked %x -> %x\n", + old_locks, path->nodes_locked); + prt_printf(&buf, "node %s ", IS_ERR(f.b) ? bch2_err_str(PTR_ERR(f.b)) : + !f.b ? 
"(null)" : "(node)"); + prt_printf(&buf, "path seq %u node seq %u\n", + IS_ERR_OR_NULL(f.b) ? 0 : f.b->c.lock.seq, + path->l[f.l].lock_seq); + + trace_trans_restart_upgrade(trans->c, buf.buf); + printbuf_exit(&buf); + } out: bch2_trans_verify_locks(trans); return ret; @@ -696,7 +739,7 @@ void __bch2_btree_path_downgrade(struct btree_trans *trans, } } - bch2_btree_path_verify_locks(path); + bch2_btree_path_verify_locks(trans, path); trace_path_downgrade(trans, _RET_IP_, path, old_locks_want); } @@ -725,7 +768,7 @@ static inline void __bch2_trans_unlock(struct btree_trans *trans) __bch2_btree_path_unlock(trans, path); } -static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path, +static noinline __cold void bch2_trans_relock_fail(struct btree_trans *trans, struct btree_path *path, struct get_locks_fail *f, bool trace) { if (!trace) @@ -735,7 +778,9 @@ static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, str struct printbuf buf = PRINTBUF; bch2_bpos_to_text(&buf, path->pos); - prt_printf(&buf, " l=%u seq=%u node seq=", f->l, path->l[f->l].lock_seq); + prt_printf(&buf, " %s l=%u seq=%u node seq=", + bch2_btree_id_str(path->btree_id), + f->l, path->l[f->l].lock_seq); if (IS_ERR_OR_NULL(f->b)) { prt_str(&buf, bch2_err_str(PTR_ERR(f->b))); } else { @@ -757,7 +802,6 @@ static noinline __cold int bch2_trans_relock_fail(struct btree_trans *trans, str out: __bch2_trans_unlock(trans); bch2_trans_verify_locks(trans); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_relock); } static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) @@ -774,10 +818,14 @@ static inline int __bch2_trans_relock(struct btree_trans *trans, bool trace) trans_for_each_path(trans, path, i) { struct get_locks_fail f; + int ret; if (path->should_be_locked && - !btree_path_get_locks(trans, path, false, &f)) - return bch2_trans_relock_fail(trans, path, &f, trace); + (ret = btree_path_get_locks(trans, path, false, &f, + BCH_ERR_transaction_restart_relock))) { + bch2_trans_relock_fail(trans, path, &f, trace); + return ret; + } } trans_set_locked(trans, true); @@ -796,18 +844,11 @@ int bch2_trans_relock_notrace(struct btree_trans *trans) return __bch2_trans_relock(trans, false); } -void bch2_trans_unlock_noassert(struct btree_trans *trans) +void bch2_trans_unlock(struct btree_trans *trans) { - __bch2_trans_unlock(trans); - trans_set_unlocked(trans); -} -void bch2_trans_unlock(struct btree_trans *trans) -{ __bch2_trans_unlock(trans); - - trans_set_unlocked(trans); } void bch2_trans_unlock_long(struct btree_trans *trans) @@ -839,32 +880,28 @@ int __bch2_trans_mutex_lock(struct btree_trans *trans, /* Debug */ -#ifdef CONFIG_BCACHEFS_DEBUG - -void bch2_btree_path_verify_locks(struct btree_path *path) +void __bch2_btree_path_verify_locks(struct btree_trans *trans, struct btree_path *path) { - /* - * A path may be uptodate and yet have nothing locked if and only if - * there is no node at path->level, which generally means we were - * iterating over all nodes and got to the end of the btree - */ - BUG_ON(path->uptodate == BTREE_ITER_UPTODATE && - btree_path_node(path, path->level) && - !path->nodes_locked); + if (!path->nodes_locked && btree_path_node(path, path->level)) { + /* + * A path may be uptodate and yet have nothing locked if and only if + * there is no node at path->level, which generally means we were + * iterating over all nodes and got to the end of the btree + */ + BUG_ON(path->uptodate == BTREE_ITER_UPTODATE); + 
BUG_ON(path->should_be_locked && trans->locked && !trans->restarted); + } if (!path->nodes_locked) return; for (unsigned l = 0; l < BTREE_MAX_DEPTH; l++) { int want = btree_lock_want(path, l); - int have = btree_node_locked_type(path, l); + int have = btree_node_locked_type_nowrite(path, l); BUG_ON(!is_btree_node(path, l) && have != BTREE_NODE_UNLOCKED); - BUG_ON(is_btree_node(path, l) && - (want == BTREE_NODE_UNLOCKED || - have != BTREE_NODE_WRITE_LOCKED) && - want != have); + BUG_ON(is_btree_node(path, l) && want != have); BUG_ON(btree_node_locked(path, l) && path->l[l].lock_seq != six_lock_seq(&path->l[l].b->c.lock)); @@ -882,7 +919,7 @@ static bool bch2_trans_locked(struct btree_trans *trans) return false; } -void bch2_trans_verify_locks(struct btree_trans *trans) +void __bch2_trans_verify_locks(struct btree_trans *trans) { if (!trans->locked) { BUG_ON(bch2_trans_locked(trans)); @@ -893,7 +930,5 @@ void bch2_trans_verify_locks(struct btree_trans *trans) unsigned i; trans_for_each_path(trans, path, i) - bch2_btree_path_verify_locks(path); + __bch2_btree_path_verify_locks(trans, path); } - -#endif diff --git a/fs/bcachefs/btree_locking.h b/fs/bcachefs/btree_locking.h index b54ef48eb8cc..9adca77e2580 100644 --- a/fs/bcachefs/btree_locking.h +++ b/fs/bcachefs/btree_locking.h @@ -13,9 +13,8 @@ #include "btree_iter.h" #include "six.h" -void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags); +void bch2_btree_lock_init(struct btree_bkey_cached_common *, enum six_lock_init_flags, gfp_t gfp); -void bch2_trans_unlock_noassert(struct btree_trans *); void bch2_trans_unlock_write(struct btree_trans *); static inline bool is_btree_node(struct btree_path *path, unsigned l) @@ -44,6 +43,15 @@ static inline int btree_node_locked_type(struct btree_path *path, return BTREE_NODE_UNLOCKED + ((path->nodes_locked >> (level << 1)) & 3); } +static inline int btree_node_locked_type_nowrite(struct btree_path *path, + unsigned level) +{ + int have = btree_node_locked_type(path, level); + return have == BTREE_NODE_WRITE_LOCKED + ? 
BTREE_NODE_INTENT_LOCKED + : have; +} + static inline bool btree_node_write_locked(struct btree_path *path, unsigned l) { return btree_node_locked_type(path, l) == BTREE_NODE_WRITE_LOCKED; @@ -152,7 +160,7 @@ static inline int btree_path_highest_level_locked(struct btree_path *path) static inline void __bch2_btree_path_unlock(struct btree_trans *trans, struct btree_path *path) { - btree_path_set_dirty(path, BTREE_ITER_NEED_RELOCK); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_RELOCK); while (path->nodes_locked) btree_node_unlock(trans, path, btree_path_lowest_level_locked(path)); @@ -367,8 +375,8 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, struct btree_path *path, unsigned level) { EBUG_ON(btree_node_locked(path, level) && - !btree_node_write_locked(path, level) && - btree_node_locked_type(path, level) != __btree_lock_want(path, level)); + btree_node_locked_type_nowrite(path, level) != + __btree_lock_want(path, level)); return likely(btree_node_locked(path, level)) || (!IS_ERR_OR_NULL(path->l[level].b) && @@ -377,31 +385,29 @@ static inline bool bch2_btree_node_relock_notrace(struct btree_trans *trans, /* upgrade */ -bool bch2_btree_path_upgrade_noupgrade_sibs(struct btree_trans *, - struct btree_path *, unsigned, - struct get_locks_fail *); +bool __bch2_btree_path_upgrade_norestart(struct btree_trans *, struct btree_path *, unsigned); -bool __bch2_btree_path_upgrade(struct btree_trans *, - struct btree_path *, unsigned, - struct get_locks_fail *); +static inline bool bch2_btree_path_upgrade_norestart(struct btree_trans *trans, + struct btree_path *path, + unsigned new_locks_want) +{ + return new_locks_want > path->locks_want + ? __bch2_btree_path_upgrade_norestart(trans, path, new_locks_want) + : true; +} + +int __bch2_btree_path_upgrade(struct btree_trans *, + struct btree_path *, unsigned); static inline int bch2_btree_path_upgrade(struct btree_trans *trans, struct btree_path *path, unsigned new_locks_want) { - struct get_locks_fail f = {}; - unsigned old_locks_want = path->locks_want; - new_locks_want = min(new_locks_want, BTREE_MAX_DEPTH); - if (path->locks_want < new_locks_want - ? __bch2_btree_path_upgrade(trans, path, new_locks_want, &f) - : path->nodes_locked) - return 0; - - trace_and_count(trans->c, trans_restart_upgrade, trans, _THIS_IP_, path, - old_locks_want, new_locks_want, &f); - return btree_trans_restart(trans, BCH_ERR_transaction_restart_upgrade); + return likely(path->locks_want >= new_locks_want && path->nodes_locked) + ? 
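/* editor's note, not part of the commit: fast path when enough intent locks are already held */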
0 + : __bch2_btree_path_upgrade(trans, path, new_locks_want); } /* misc: */ @@ -427,7 +433,7 @@ static inline void btree_path_set_level_up(struct btree_trans *trans, struct btree_path *path) { __btree_path_set_level_up(trans, path, path->level++); - btree_path_set_dirty(path, BTREE_ITER_NEED_TRAVERSE); + btree_path_set_dirty(trans, path, BTREE_ITER_NEED_TRAVERSE); } /* debug */ @@ -439,12 +445,20 @@ struct six_lock_count bch2_btree_node_lock_counts(struct btree_trans *, int bch2_check_for_deadlock(struct btree_trans *, struct printbuf *); -#ifdef CONFIG_BCACHEFS_DEBUG -void bch2_btree_path_verify_locks(struct btree_path *); -void bch2_trans_verify_locks(struct btree_trans *); -#else -static inline void bch2_btree_path_verify_locks(struct btree_path *path) {} -static inline void bch2_trans_verify_locks(struct btree_trans *trans) {} -#endif +void __bch2_btree_path_verify_locks(struct btree_trans *, struct btree_path *); +void __bch2_trans_verify_locks(struct btree_trans *); + +static inline void bch2_btree_path_verify_locks(struct btree_trans *trans, + struct btree_path *path) +{ + if (static_branch_unlikely(&bch2_debug_check_btree_locking)) + __bch2_btree_path_verify_locks(trans, path); +} + +static inline void bch2_trans_verify_locks(struct btree_trans *trans) +{ + if (static_branch_unlikely(&bch2_debug_check_btree_locking)) + __bch2_trans_verify_locks(trans); +} #endif /* _BCACHEFS_BTREE_LOCKING_H */ diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index a7f06deee13c..5a97a6b8a757 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -13,6 +13,7 @@ #include <linux/kthread.h> #include <linux/min_heap.h> +#include <linux/sched/sysctl.h> #include <linux/sort.h> struct find_btree_nodes_worker { @@ -166,17 +167,23 @@ static void try_read_btree_node(struct find_btree_nodes *f, struct bch_dev *ca, bio->bi_iter.bi_sector = offset; bch2_bio_map(bio, bn, PAGE_SIZE); + u64 submit_time = local_clock(); submit_bio_wait(bio); - if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_read, - "IO error in try_read_btree_node() at %llu: %s", - offset, bch2_blk_status_to_str(bio->bi_status))) + + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !bio->bi_status); + + if (bio->bi_status) { + bch_err_dev_ratelimited(ca, + "IO error in try_read_btree_node() at %llu: %s", + offset, bch2_blk_status_to_str(bio->bi_status)); return; + } if (le64_to_cpu(bn->magic) != bset_magic(c)) return; if (bch2_csum_type_is_encryption(BSET_CSUM_TYPE(&bn->keys))) { - if (!c->chacha20) + if (!c->chacha20_key_set) return; struct nonce nonce = btree_nonce(&bn->keys, 0); @@ -264,7 +271,7 @@ static int read_btree_nodes_worker(void *p) err: bio_put(bio); free_page((unsigned long) buf); - percpu_ref_get(&ca->io_ref); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); closure_put(w->cl); kfree(w); return 0; @@ -278,37 +285,37 @@ static int read_btree_nodes(struct find_btree_nodes *f) closure_init_stack(&cl); - for_each_online_member(c, ca) { + for_each_online_member(c, ca, BCH_DEV_READ_REF_btree_node_scan) { if (!(ca->mi.data_allowed & BIT(BCH_DATA_btree))) continue; struct find_btree_nodes_worker *w = kmalloc(sizeof(*w), GFP_KERNEL); - struct task_struct *t; - if (!w) { - percpu_ref_put(&ca->io_ref); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); ret = -ENOMEM; goto err; } - percpu_ref_get(&ca->io_ref); - closure_get(&cl); w->cl = &cl; w->f = f; w->ca = ca; - t = kthread_run(read_btree_nodes_worker, w, 
"read_btree_nodes/%s", ca->name); + struct task_struct *t = kthread_create(read_btree_nodes_worker, w, "read_btree_nodes/%s", ca->name); ret = PTR_ERR_OR_ZERO(t); if (ret) { - percpu_ref_put(&ca->io_ref); - closure_put(&cl); - f->ret = ret; - bch_err(c, "error starting kthread: %i", ret); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); + kfree(w); + bch_err_msg(c, ret, "starting kthread"); break; } + + closure_get(&cl); + enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_btree_node_scan); + wake_up_process(t); } err: - closure_sync(&cl); + while (closure_sync_timeout(&cl, sysctl_hung_task_timeout_secs * HZ / 2)) + ; return f->ret ?: ret; } @@ -388,10 +395,10 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) printbuf_reset(&buf); prt_printf(&buf, "%s: nodes found:\n", __func__); found_btree_nodes_to_text(&buf, c, f->nodes); - bch2_print_string_as_lines(KERN_INFO, buf.buf); + bch2_print_str(c, KERN_INFO, buf.buf); } - sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL); + sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_cookie, NULL); dst = 0; darray_for_each(f->nodes, i) { @@ -411,13 +418,13 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) } f->nodes.nr = dst; - sort(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); + sort_nonatomic(f->nodes.data, f->nodes.nr, sizeof(f->nodes.data[0]), found_btree_node_cmp_pos, NULL); if (0 && c->opts.verbose) { printbuf_reset(&buf); prt_printf(&buf, "%s: nodes after merging replicas:\n", __func__); found_btree_nodes_to_text(&buf, c, f->nodes); - bch2_print_string_as_lines(KERN_INFO, buf.buf); + bch2_print_str(c, KERN_INFO, buf.buf); } swap(nodes_heap, f->nodes); @@ -463,7 +470,7 @@ int bch2_scan_for_btree_nodes(struct bch_fs *c) printbuf_reset(&buf); prt_printf(&buf, "%s: nodes found after overwrites:\n", __func__); found_btree_nodes_to_text(&buf, c, f->nodes); - bch2_print_string_as_lines(KERN_INFO, buf.buf); + bch2_print_str(c, KERN_INFO, buf.buf); } else { bch_info(c, "btree node scan found %zu nodes after overwrites", f->nodes.nr); } @@ -534,7 +541,7 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, struct find_btree_nodes *f = &c->found_btree_nodes; - int ret = bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); + int ret = bch2_run_print_explicit_recovery_pass(c, BCH_RECOVERY_PASS_scan_for_btree_nodes); if (ret) return ret; @@ -572,10 +579,12 @@ int bch2_get_scanned_nodes(struct bch_fs *c, enum btree_id btree, found_btree_node_to_key(&tmp.k, &n); - struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k)); - bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); - printbuf_exit(&buf); + if (c->opts.verbose) { + struct printbuf buf = PRINTBUF; + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&tmp.k)); + bch_verbose(c, "%s(): recovering %s", __func__, buf.buf); + printbuf_exit(&buf); + } BUG_ON(bch2_bkey_validate(c, bkey_i_to_s_c(&tmp.k), (struct bkey_validate_context) { diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index c4f524b2ca9a..1c03c965d836 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -11,6 +11,7 @@ #include "btree_write_buffer.h" #include "buckets.h" #include "disk_accounting.h" +#include "enumerated_ref.h" #include "errcode.h" #include "error.h" #include "journal.h" @@ -20,6 +21,7 @@ #include "snapshot.h" #include <linux/prefetch.h> 
+#include <linux/string_helpers.h> static const char * const trans_commit_flags_strs[] = { #define x(n, ...) #n, @@ -164,6 +166,7 @@ bool bch2_btree_bset_insert_key(struct btree_trans *trans, EBUG_ON(bpos_gt(insert->k.p, b->data->max_key)); EBUG_ON(insert->k.u64s > bch2_btree_keys_u64s_remaining(b)); EBUG_ON(!b->c.level && !bpos_eq(insert->k.p, path->pos)); + kmsan_check_memory(insert, bkey_bytes(&insert->k)); k = bch2_btree_node_iter_peek_all(node_iter, b); if (k && bkey_cmp_left_packed(b, k, &insert->k.p)) @@ -336,6 +339,7 @@ static inline void btree_insert_entry_checks(struct btree_trans *trans, BUG_ON(i->cached != path->cached); BUG_ON(i->level != path->level); BUG_ON(i->btree_id != path->btree_id); + BUG_ON(i->bkey_type != __btree_node_type(path->level, path->btree_id)); EBUG_ON(!i->level && btree_type_has_snapshots(i->btree_id) && !(i->flags & BTREE_UPDATE_internal_snapshot_node) && @@ -364,7 +368,8 @@ static noinline void journal_transaction_name(struct btree_trans *trans) struct jset_entry_log *l = container_of(entry, struct jset_entry_log, entry); - strncpy(l->d, trans->fn, JSET_ENTRY_LOG_U64s * sizeof(u64)); + memcpy_and_pad(l->d, JSET_ENTRY_LOG_U64s * sizeof(u64), + trans->fn, strlen(trans->fn), 0); } static inline int btree_key_can_insert(struct btree_trans *trans, @@ -517,69 +522,45 @@ static int run_one_trans_trigger(struct btree_trans *trans, struct btree_insert_ } } -static int run_btree_triggers(struct btree_trans *trans, enum btree_id btree_id, - unsigned *btree_id_updates_start) +static int bch2_trans_commit_run_triggers(struct btree_trans *trans) { - bool trans_trigger_run; + unsigned sort_id_start = 0; - /* - * Running triggers will append more updates to the list of updates as - * we're walking it: - */ - do { - trans_trigger_run = false; + while (sort_id_start < trans->nr_updates) { + unsigned i, sort_id = trans->updates[sort_id_start].sort_order; + bool trans_trigger_run; - for (unsigned i = *btree_id_updates_start; - i < trans->nr_updates && trans->updates[i].btree_id <= btree_id; - i++) { - if (trans->updates[i].btree_id < btree_id) { - *btree_id_updates_start = i; - continue; + /* + * For a given btree, this algorithm runs insert triggers before + * overwrite triggers: this is so that when extents are being + * moved (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop + * references before they are re-added. 
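+ * (Editor's aside, not part of the commit: sort_order comes from
+ * btree_trigger_order(), which preserves btree-ID order for most
+ * btrees but maps stripes to U8_MAX - 1 and alloc to U8_MAX, so
+ * alloc triggers always run last; this replaces the old explicit
+ * "loop over every btree, then do alloc" pass.)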
+ * + * Running triggers will append more updates to the list of + * updates as we're walking it: + */ + do { + trans_trigger_run = false; + + for (i = sort_id_start; + i < trans->nr_updates && trans->updates[i].sort_order <= sort_id; + i++) { + if (trans->updates[i].sort_order < sort_id) { + sort_id_start = i; + continue; + } + + int ret = run_one_trans_trigger(trans, trans->updates + i); + if (ret < 0) + return ret; + if (ret) + trans_trigger_run = true; } + } while (trans_trigger_run); - int ret = run_one_trans_trigger(trans, trans->updates + i); - if (ret < 0) - return ret; - if (ret) - trans_trigger_run = true; - } - } while (trans_trigger_run); - - trans_for_each_update(trans, i) - BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && - i->btree_id == btree_id && - btree_node_type_has_trans_triggers(i->bkey_type) && - (!i->insert_trigger_run || !i->overwrite_trigger_run)); - - return 0; -} - -static int bch2_trans_commit_run_triggers(struct btree_trans *trans) -{ - unsigned btree_id = 0, btree_id_updates_start = 0; - int ret = 0; - - /* - * - * For a given btree, this algorithm runs insert triggers before - * overwrite triggers: this is so that when extents are being moved - * (e.g. by FALLOCATE_FL_INSERT_RANGE), we don't drop references before - * they are re-added. - */ - for (btree_id = 0; btree_id < BTREE_ID_NR; btree_id++) { - if (btree_id == BTREE_ID_alloc) - continue; - - ret = run_btree_triggers(trans, btree_id, &btree_id_updates_start); - if (ret) - return ret; + sort_id_start = i; } - btree_id_updates_start = 0; - ret = run_btree_triggers(trans, BTREE_ID_alloc, &btree_id_updates_start); - if (ret) - return ret; - #ifdef CONFIG_BCACHEFS_DEBUG trans_for_each_update(trans, i) BUG_ON(!(i->flags & BTREE_TRIGGER_norun) && @@ -666,10 +647,10 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, if (IS_ENABLED(CONFIG_BCACHEFS_DEBUG) && !(flags & BCH_TRANS_COMMIT_no_journal_res)) { - if (bch2_journal_seq_verify) + if (static_branch_unlikely(&bch2_journal_seq_verify)) trans_for_each_update(trans, i) i->k->k.bversion.lo = trans->journal_res.seq; - else if (bch2_inject_invalid_keys) + else if (static_branch_unlikely(&bch2_inject_invalid_keys)) trans_for_each_update(trans, i) i->k->k.bversion = MAX_VERSION; } @@ -682,18 +663,17 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, h = h->next; } - struct jset_entry *entry = trans->journal_entries; + struct bkey_i *accounting; percpu_down_read(&c->mark_lock); - for (entry = trans->journal_entries; - entry != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); - entry = vstruct_next(entry)) - if (entry->type == BCH_JSET_ENTRY_write_buffer_keys && - entry->start->k.type == KEY_TYPE_accounting) { - ret = bch2_accounting_trans_commit_hook(trans, bkey_i_to_accounting(entry->start), flags); - if (ret) - goto revert_fs_usage; - } + for (accounting = btree_trans_subbuf_base(trans, &trans->accounting); + accounting != btree_trans_subbuf_top(trans, &trans->accounting); + accounting = bkey_next(accounting)) { + ret = bch2_accounting_trans_commit_hook(trans, + bkey_i_to_accounting(accounting), flags); + if (ret) + goto revert_fs_usage; + } percpu_up_read(&c->mark_lock); /* XXX: we only want to run this if deltas are nonzero */ @@ -717,8 +697,8 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, if (!(flags & BCH_TRANS_COMMIT_no_journal_res)) validate_context.flags = BCH_VALIDATE_write|BCH_VALIDATE_commit; - for (struct jset_entry *i = trans->journal_entries; - i != 
(void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + for (struct jset_entry *i = btree_trans_journal_entries_start(trans); + i != btree_trans_journal_entries_top(trans); i = vstruct_next(i)) { ret = bch2_journal_entry_validate(c, NULL, i, bcachefs_metadata_version_current, @@ -773,11 +753,18 @@ bch2_trans_commit_write_locked(struct btree_trans *trans, unsigned flags, } memcpy_u64s_small(journal_res_entry(&c->journal, &trans->journal_res), - trans->journal_entries, - trans->journal_entries_u64s); + btree_trans_journal_entries_start(trans), + trans->journal_entries.u64s); + + trans->journal_res.offset += trans->journal_entries.u64s; + trans->journal_res.u64s -= trans->journal_entries.u64s; - trans->journal_res.offset += trans->journal_entries_u64s; - trans->journal_res.u64s -= trans->journal_entries_u64s; + memcpy_u64s_small(bch2_journal_add_entry(j, &trans->journal_res, + BCH_JSET_ENTRY_write_buffer_keys, + BTREE_ID_accounting, 0, + trans->accounting.u64s)->_data, + btree_trans_subbuf_base(trans, &trans->accounting), + trans->accounting.u64s); if (trans->journal_seq) *trans->journal_seq = trans->journal_res.seq; @@ -799,13 +786,10 @@ fatal_err: bch2_fs_fatal_error(c, "fatal error in transaction commit: %s", bch2_err_str(ret)); percpu_down_read(&c->mark_lock); revert_fs_usage: - for (struct jset_entry *entry2 = trans->journal_entries; - entry2 != entry; - entry2 = vstruct_next(entry2)) - if (entry2->type == BCH_JSET_ENTRY_write_buffer_keys && - entry2->start->k.type == KEY_TYPE_accounting) - bch2_accounting_trans_commit_revert(trans, - bkey_i_to_accounting(entry2->start), flags); + for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); + i != accounting; + i = bkey_next(i)) + bch2_accounting_trans_commit_revert(trans, bkey_i_to_accounting(i), flags); percpu_up_read(&c->mark_lock); return ret; } @@ -903,18 +887,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, struct bch_fs *c = trans->c; enum bch_watermark watermark = flags & BCH_WATERMARK_MASK; - switch (ret) { - case -BCH_ERR_btree_insert_btree_node_full: - ret = bch2_btree_split_leaf(trans, i->path, flags); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - trace_and_count(c, trans_restart_btree_node_split, trans, - trace_ip, trans->paths + i->path); - break; - case -BCH_ERR_btree_insert_need_mark_replicas: - ret = drop_locks_do(trans, - bch2_accounting_update_sb(trans)); - break; - case -BCH_ERR_journal_res_get_blocked: + if (bch2_err_matches(ret, BCH_ERR_journal_res_blocked)) { /* * XXX: this should probably be a separate BTREE_INSERT_NONBLOCK * flag @@ -922,13 +895,26 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && watermark < BCH_WATERMARK_reclaim) { ret = -BCH_ERR_journal_reclaim_would_deadlock; - break; + goto out; } ret = drop_locks_do(trans, bch2_trans_journal_res_get(trans, (flags & BCH_WATERMARK_MASK)| JOURNAL_RES_GET_CHECK)); + goto out; + } + + switch (ret) { + case -BCH_ERR_btree_insert_btree_node_full: + ret = bch2_btree_split_leaf(trans, i->path, flags); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + trace_and_count(c, trans_restart_btree_node_split, trans, + trace_ip, trans->paths + i->path); + break; + case -BCH_ERR_btree_insert_need_mark_replicas: + ret = drop_locks_do(trans, + bch2_accounting_update_sb(trans)); break; case -BCH_ERR_btree_insert_need_journal_reclaim: bch2_trans_unlock(trans); @@ -950,7 +936,7 @@ int bch2_trans_commit_error(struct 
btree_trans *trans, unsigned flags, BUG_ON(ret >= 0); break; } - +out: BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart) != !!trans->restarted); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOSPC) && @@ -978,8 +964,8 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) return ret; } - for (struct jset_entry *i = trans->journal_entries; - i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); + for (struct jset_entry *i = btree_trans_journal_entries_start(trans); + i != btree_trans_journal_entries_top(trans); i = vstruct_next(i)) if (i->type == BCH_JSET_ENTRY_btree_keys || i->type == BCH_JSET_ENTRY_write_buffer_keys) { @@ -988,6 +974,14 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) return ret; } + for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); + i != btree_trans_subbuf_top(trans, &trans->accounting); + i = bkey_next(i)) { + int ret = bch2_journal_key_insert(c, BTREE_ID_accounting, 0, i); + if (ret) + return ret; + } + return 0; } @@ -1004,7 +998,8 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) goto out_reset; if (!trans->nr_updates && - !trans->journal_entries_u64s) + !trans->journal_entries.u64s && + !trans->accounting.u64s) goto out_reset; ret = bch2_trans_commit_run_triggers(trans); @@ -1012,7 +1007,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) goto out_reset; if (!(flags & BCH_TRANS_COMMIT_no_check_rw) && - unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_trans))) { + unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_trans))) { if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) ret = do_bch2_trans_commit_to_journal_replay(trans); else @@ -1022,7 +1017,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) EBUG_ON(test_bit(BCH_FS_clean_shutdown, &c->flags)); - trans->journal_u64s = trans->journal_entries_u64s; + trans->journal_u64s = trans->journal_entries.u64s + jset_u64s(trans->accounting.u64s); trans->journal_transaction_names = READ_ONCE(c->opts.journal_transaction_names); if (trans->journal_transaction_names) trans->journal_u64s += jset_u64s(JSET_ENTRY_LOG_U64s); @@ -1078,7 +1073,7 @@ retry: trace_and_count(c, transaction_commit, trans, _RET_IP_); out: if (likely(!(flags & BCH_TRANS_COMMIT_no_check_rw))) - bch2_write_ref_put(c, BCH_WRITE_REF_trans); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_trans); out_reset: if (!ret) bch2_trans_downgrade(trans); diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index a09cbe9cd94f..9d641bf9d2a2 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -139,6 +139,7 @@ struct btree { }; #define BCH_BTREE_CACHE_NOT_FREED_REASONS() \ + x(cache_reserve) \ x(lock_intent) \ x(lock_write) \ x(dirty) \ @@ -257,9 +258,6 @@ struct btree_node_iter { * * BTREE_TRIGGER_insert - @new is entering the btree * BTREE_TRIGGER_overwrite - @old is leaving the btree - * - * BTREE_TRIGGER_bucket_invalidate - signal from bucket invalidate path to alloc - * trigger */ #define BTREE_TRIGGER_FLAGS() \ x(norun) \ @@ -269,8 +267,7 @@ struct btree_node_iter { x(gc) \ x(insert) \ x(overwrite) \ - x(is_root) \ - x(bucket_invalidate) + x(is_root) enum { #define x(n) BTREE_ITER_FLAG_BIT_##n, @@ -367,7 +364,6 @@ static inline unsigned long btree_path_ip_allocated(struct btree_path *path) * @nodes_intent_locked - bitmask indicating which locks are intent locks */ struct btree_iter { - struct btree_trans *trans; btree_path_idx_t path; btree_path_idx_t update_path; 
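/* (Editor's aside, not part of the commit: removing the trans
 * back-pointer above is why iterator helpers throughout this diff grow
 * an explicit trans argument, e.g. bch2_btree_iter_traverse(&iter)
 * becomes bch2_btree_iter_traverse(trans, &iter).) */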
btree_path_idx_t key_cache_path; @@ -423,6 +419,7 @@ static inline struct bpos btree_node_pos(struct btree_bkey_cached_common *b) struct btree_insert_entry { unsigned flags; + u8 sort_order; u8 bkey_type; enum btree_id btree_id:8; u8 level:4; @@ -477,6 +474,18 @@ struct btree_trans_paths { struct btree_path paths[]; }; +struct trans_kmalloc_trace { + unsigned long ip; + size_t bytes; +}; +typedef DARRAY(struct trans_kmalloc_trace) darray_trans_kmalloc_trace; + +struct btree_trans_subbuf { + u16 base; + u16 u64s; + u16 size; +}; + struct btree_trans { struct bch_fs *c; @@ -488,6 +497,9 @@ struct btree_trans { void *mem; unsigned mem_top; unsigned mem_bytes; +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + darray_trans_kmalloc_trace trans_kmalloc_trace; +#endif btree_path_idx_t nr_sorted; btree_path_idx_t nr_paths; @@ -528,9 +540,8 @@ struct btree_trans { int srcu_idx; /* update path: */ - u16 journal_entries_u64s; - u16 journal_entries_size; - struct jset_entry *journal_entries; + struct btree_trans_subbuf journal_entries; + struct btree_trans_subbuf accounting; struct btree_trans_commit_hook *hooks; struct journal_entry_pin *journal_pin; @@ -647,13 +658,13 @@ static inline struct bset_tree *bset_tree_last(struct btree *b) static inline void * __btree_node_offset_to_ptr(const struct btree *b, u16 offset) { - return (void *) ((u64 *) b->data + 1 + offset); + return (void *) ((u64 *) b->data + offset); } static inline u16 __btree_node_ptr_to_offset(const struct btree *b, const void *p) { - u16 ret = (u64 *) p - 1 - (u64 *) b->data; + u16 ret = (u64 *) p - (u64 *) b->data; EBUG_ON(__btree_node_offset_to_ptr(b, ret) != p); return ret; @@ -853,6 +864,18 @@ static inline bool btree_type_uses_write_buffer(enum btree_id btree) return BIT_ULL(btree) & mask; } +static inline u8 btree_trigger_order(enum btree_id btree) +{ + switch (btree) { + case BTREE_ID_alloc: + return U8_MAX; + case BTREE_ID_stripes: + return U8_MAX - 1; + default: + return btree; + } +} + struct btree_root { struct btree *b; diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 13d794f201a5..5dac09c98026 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -14,10 +14,12 @@ #include "snapshot.h" #include "trace.h" +#include <linux/string_helpers.h> + static inline int btree_insert_entry_cmp(const struct btree_insert_entry *l, const struct btree_insert_entry *r) { - return cmp_int(l->btree_id, r->btree_id) ?: + return cmp_int(l->sort_order, r->sort_order) ?: cmp_int(l->cached, r->cached) ?: -cmp_int(l->level, r->level) ?: bpos_cmp(l->k->k.p, r->k->k.p); @@ -126,7 +128,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, struct bpos new_pos) { struct bch_fs *c = trans->c; - struct btree_iter old_iter, new_iter = { NULL }; + struct btree_iter old_iter, new_iter = {}; struct bkey_s_c old_k, new_k; snapshot_id_list s; struct bkey_i *update; @@ -140,7 +142,7 @@ int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, bch2_trans_iter_init(trans, &old_iter, id, old_pos, BTREE_ITER_not_extents| BTREE_ITER_all_snapshots); - while ((old_k = bch2_btree_iter_prev(&old_iter)).k && + while ((old_k = bch2_btree_iter_prev(trans, &old_iter)).k && !(ret = bkey_err(old_k)) && bkey_eq(old_pos, old_k.k->p)) { struct bpos whiteout_pos = @@ -296,7 +298,7 @@ static int bch2_trans_update_extent(struct btree_trans *trans, BTREE_ITER_intent| BTREE_ITER_with_updates| BTREE_ITER_not_extents); - k = 
bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX)); if ((ret = bkey_err(k))) goto err; if (!k.k) @@ -322,8 +324,8 @@ static int bch2_trans_update_extent(struct btree_trans *trans, if (done) goto out; next: - bch2_btree_iter_advance(&iter); - k = bch2_btree_iter_peek_max(&iter, POS(insert->k.p.inode, U64_MAX)); + bch2_btree_iter_advance(trans, &iter); + k = bch2_btree_iter_peek_max(trans, &iter, POS(insert->k.p.inode, U64_MAX)); if ((ret = bkey_err(k))) goto err; if (!k.k) @@ -397,6 +399,7 @@ bch2_trans_update_by_path(struct btree_trans *trans, btree_path_idx_t path_idx, n = (struct btree_insert_entry) { .flags = flags, + .sort_order = btree_trigger_order(path->btree_id), .bkey_type = __btree_node_type(path->level, path->btree_id), .btree_id = path->btree_id, .level = path->level, @@ -508,9 +511,12 @@ static noinline int bch2_trans_update_get_key_cache(struct btree_trans *trans, return 0; } -int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *k, enum btree_iter_update_trigger_flags flags) +int __must_check bch2_trans_update_ip(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_iter_update_trigger_flags flags, + unsigned long ip) { + kmsan_check_memory(k, bkey_bytes(&k->k)); + btree_path_idx_t path_idx = iter->update_path ?: iter->path; int ret; @@ -543,7 +549,7 @@ int __must_check bch2_trans_update(struct btree_trans *trans, struct btree_iter path_idx = iter->key_cache_path; } - return bch2_trans_update_by_path(trans, path_idx, k, flags, _RET_IP_); + return bch2_trans_update_by_path(trans, path_idx, k, flags, ip); } int bch2_btree_insert_clone_trans(struct btree_trans *trans, @@ -559,43 +565,42 @@ int bch2_btree_insert_clone_trans(struct btree_trans *trans, return bch2_btree_insert_trans(trans, btree, n, 0); } -struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) +void *__bch2_trans_subbuf_alloc(struct btree_trans *trans, + struct btree_trans_subbuf *buf, + unsigned u64s) { - unsigned new_top = trans->journal_entries_u64s + u64s; - unsigned old_size = trans->journal_entries_size; + unsigned new_top = buf->u64s + u64s; + unsigned old_size = buf->size; - if (new_top > trans->journal_entries_size) { - trans->journal_entries_size = roundup_pow_of_two(new_top); - - btree_trans_stats(trans)->journal_entries_size = trans->journal_entries_size; - } + if (new_top > buf->size) + buf->size = roundup_pow_of_two(new_top); - struct jset_entry *n = - bch2_trans_kmalloc_nomemzero(trans, - trans->journal_entries_size * sizeof(u64)); + void *n = bch2_trans_kmalloc_nomemzero(trans, buf->size * sizeof(u64)); if (IS_ERR(n)) - return ERR_CAST(n); + return n; - if (trans->journal_entries) - memcpy(n, trans->journal_entries, old_size * sizeof(u64)); - trans->journal_entries = n; + if (buf->u64s) + memcpy(n, + btree_trans_subbuf_base(trans, buf), + old_size * sizeof(u64)); + buf->base = (u64 *) n - (u64 *) trans->mem; - struct jset_entry *e = btree_trans_journal_entries_top(trans); - trans->journal_entries_u64s = new_top; - return e; + void *p = btree_trans_subbuf_top(trans, buf); + buf->u64s = new_top; + return p; } int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, enum btree_id btree, struct bpos end) { bch2_trans_iter_init(trans, iter, btree, end, BTREE_ITER_intent); - struct bkey_s_c k = bch2_btree_iter_peek_prev(iter); + struct bkey_s_c k = bch2_btree_iter_peek_prev(trans, iter); int ret = bkey_err(k); if (ret) goto err; - 
bch2_btree_iter_advance(iter); - k = bch2_btree_iter_peek_slot(iter); + bch2_btree_iter_advance(trans, iter); + k = bch2_btree_iter_peek_slot(trans, iter); ret = bkey_err(k); if (ret) goto err; @@ -631,7 +636,7 @@ int bch2_btree_insert_nonextent(struct btree_trans *trans, BTREE_ITER_cached| BTREE_ITER_not_extents| BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(&iter) ?: + ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_trans_update(trans, &iter, k, flags); bch2_trans_iter_exit(trans, &iter); return ret; @@ -643,7 +648,7 @@ int bch2_btree_insert_trans(struct btree_trans *trans, enum btree_id id, struct btree_iter iter; bch2_trans_iter_init(trans, &iter, id, bkey_start_pos(&k->k), BTREE_ITER_intent|flags); - int ret = bch2_btree_iter_traverse(&iter) ?: + int ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_trans_update(trans, &iter, k, flags); bch2_trans_iter_exit(trans, &iter); return ret; @@ -692,7 +697,7 @@ int bch2_btree_delete(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_cached| BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(&iter) ?: + ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_btree_delete_at(trans, &iter, update_flags); bch2_trans_iter_exit(trans, &iter); @@ -710,7 +715,7 @@ int bch2_btree_delete_range_trans(struct btree_trans *trans, enum btree_id id, int ret = 0; bch2_trans_iter_init(trans, &iter, id, start, BTREE_ITER_intent); - while ((k = bch2_btree_iter_peek_max(&iter, end)).k) { + while ((k = bch2_btree_iter_peek_max(trans, &iter, end)).k) { struct disk_reservation disk_res = bch2_disk_reservation_init(trans->c, 0); struct bkey_i delete; @@ -805,7 +810,7 @@ int bch2_btree_bit_mod(struct btree_trans *trans, enum btree_id btree, struct btree_iter iter; bch2_trans_iter_init(trans, &iter, btree, pos, BTREE_ITER_intent); - int ret = bch2_btree_iter_traverse(&iter) ?: + int ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_btree_bit_mod_iter(trans, &iter, set); bch2_trans_iter_exit(trans, &iter); return ret; @@ -823,23 +828,45 @@ int bch2_btree_bit_mod_buffered(struct btree_trans *trans, enum btree_id btree, return bch2_trans_update_buffered(trans, btree, &k); } -int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) +static int __bch2_trans_log_str(struct btree_trans *trans, const char *str, unsigned len) { - unsigned u64s = DIV_ROUND_UP(buf->pos, sizeof(u64)); - prt_chars(buf, '\0', u64s * sizeof(u64) - buf->pos); + unsigned u64s = DIV_ROUND_UP(len, sizeof(u64)); + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s)); + int ret = PTR_ERR_OR_ZERO(e); + if (ret) + return ret; + + struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry); + journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s); + memcpy_and_pad(l->d, u64s * sizeof(u64), str, len, 0); + return 0; +} + +int bch2_trans_log_str(struct btree_trans *trans, const char *str) +{ + return __bch2_trans_log_str(trans, str, strlen(str)); +} + +int bch2_trans_log_msg(struct btree_trans *trans, struct printbuf *buf) +{ int ret = buf->allocation_failure ? 
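/* editor's note, not part of the commit: a printbuf that failed an allocation holds truncated output, so report ENOMEM rather than log a partial message */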
-BCH_ERR_ENOMEM_trans_log_msg : 0; if (ret) return ret; - struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(u64s)); - ret = PTR_ERR_OR_ZERO(e); + return __bch2_trans_log_str(trans, buf->buf, buf->pos); +} + +int bch2_trans_log_bkey(struct btree_trans *trans, enum btree_id btree, + unsigned level, struct bkey_i *k) +{ + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, jset_u64s(k->k.u64s)); + int ret = PTR_ERR_OR_ZERO(e); if (ret) return ret; - struct jset_entry_log *l = container_of(e, struct jset_entry_log, entry); - journal_entry_init(e, BCH_JSET_ENTRY_log, 0, 1, u64s); - memcpy(l->d, buf->buf, buf->pos); + journal_entry_init(e, BCH_JSET_ENTRY_log_bkey, btree, level, k->k.u64s); + bkey_copy(e->start, k); return 0; } @@ -852,7 +879,6 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, prt_vprintf(&buf, fmt, args); unsigned u64s = DIV_ROUND_UP(buf.pos, sizeof(u64)); - prt_chars(&buf, '\0', u64s * sizeof(u64) - buf.pos); int ret = buf.allocation_failure ? -BCH_ERR_ENOMEM_trans_log_msg : 0; if (ret) @@ -865,7 +891,7 @@ __bch2_fs_log_msg(struct bch_fs *c, unsigned commit_flags, const char *fmt, struct jset_entry_log *l = (void *) &darray_top(c->journal.early_journal_entries); journal_entry_init(&l->entry, BCH_JSET_ENTRY_log, 0, 1, u64s); - memcpy(l->d, buf.buf, buf.pos); + memcpy_and_pad(l->d, u64s * sizeof(u64), buf.buf, buf.pos, 0); c->journal.early_journal_entries.nr += jset_u64s(u64s); } else { ret = bch2_trans_commit_do(c, NULL, NULL, commit_flags, diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index 8f22ef9a7651..f907eaa8b185 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -102,34 +102,80 @@ int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter * int bch2_bkey_get_empty_slot(struct btree_trans *, struct btree_iter *, enum btree_id, struct bpos); -int __must_check bch2_trans_update(struct btree_trans *, struct btree_iter *, - struct bkey_i *, enum btree_iter_update_trigger_flags); +int __must_check bch2_trans_update_ip(struct btree_trans *, struct btree_iter *, + struct bkey_i *, enum btree_iter_update_trigger_flags, + unsigned long); -struct jset_entry *__bch2_trans_jset_entry_alloc(struct btree_trans *, unsigned); +static inline int __must_check +bch2_trans_update(struct btree_trans *trans, struct btree_iter *iter, + struct bkey_i *k, enum btree_iter_update_trigger_flags flags) +{ + return bch2_trans_update_ip(trans, iter, k, flags, _THIS_IP_); +} + +static inline void *btree_trans_subbuf_base(struct btree_trans *trans, + struct btree_trans_subbuf *buf) +{ + return (u64 *) trans->mem + buf->base; +} + +static inline void *btree_trans_subbuf_top(struct btree_trans *trans, + struct btree_trans_subbuf *buf) +{ + return (u64 *) trans->mem + buf->base + buf->u64s; +} + +void *__bch2_trans_subbuf_alloc(struct btree_trans *, + struct btree_trans_subbuf *, + unsigned); + +static inline void * +bch2_trans_subbuf_alloc(struct btree_trans *trans, + struct btree_trans_subbuf *buf, + unsigned u64s) +{ + if (buf->u64s + u64s > buf->size) + return __bch2_trans_subbuf_alloc(trans, buf, u64s); + + void *p = btree_trans_subbuf_top(trans, buf); + buf->u64s += u64s; + return p; +} + +static inline struct jset_entry *btree_trans_journal_entries_start(struct btree_trans *trans) +{ + return btree_trans_subbuf_base(trans, &trans->journal_entries); +} static inline struct jset_entry *btree_trans_journal_entries_top(struct btree_trans *trans) { - return (void *) ((u64 *) 
trans->journal_entries + trans->journal_entries_u64s); + return btree_trans_subbuf_top(trans, &trans->journal_entries); } static inline struct jset_entry * bch2_trans_jset_entry_alloc(struct btree_trans *trans, unsigned u64s) { - if (!trans->journal_entries || - trans->journal_entries_u64s + u64s > trans->journal_entries_size) - return __bch2_trans_jset_entry_alloc(trans, u64s); - - struct jset_entry *e = btree_trans_journal_entries_top(trans); - trans->journal_entries_u64s += u64s; - return e; + return bch2_trans_subbuf_alloc(trans, &trans->journal_entries, u64s); } int bch2_btree_insert_clone_trans(struct btree_trans *, enum btree_id, struct bkey_i *); +int bch2_btree_write_buffer_insert_err(struct btree_trans *, + enum btree_id, struct bkey_i *); + static inline int __must_check bch2_trans_update_buffered(struct btree_trans *trans, enum btree_id btree, struct bkey_i *k) { + kmsan_check_memory(k, bkey_bytes(&k->k)); + + EBUG_ON(k->k.u64s > BTREE_WRITE_BUFERED_U64s_MAX); + + if (unlikely(!btree_type_uses_write_buffer(btree))) { + int ret = bch2_btree_write_buffer_insert_err(trans, btree, k); + dump_stack(); + return ret; + } /* * Most updates skip the btree write buffer until journal replay is * finished because synchronization with journal replay relies on having @@ -159,7 +205,10 @@ void bch2_trans_commit_hook(struct btree_trans *, struct btree_trans_commit_hook *); int __bch2_trans_commit(struct btree_trans *, unsigned); +int bch2_trans_log_str(struct btree_trans *, const char *); int bch2_trans_log_msg(struct btree_trans *, struct printbuf *); +int bch2_trans_log_bkey(struct btree_trans *, enum btree_id, unsigned, struct bkey_i *); + __printf(2, 3) int bch2_fs_log_msg(struct bch_fs *, const char *, ...); __printf(2, 3) int bch2_journal_log_msg(struct bch_fs *, const char *, ...); @@ -205,12 +254,15 @@ static inline void bch2_trans_reset_updates(struct btree_trans *trans) bch2_path_put(trans, i->path, true); trans->nr_updates = 0; - trans->journal_entries_u64s = 0; + trans->journal_entries.u64s = 0; + trans->journal_entries.size = 0; + trans->accounting.u64s = 0; + trans->accounting.size = 0; trans->hooks = NULL; trans->extra_disk_res = 0; } -static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, +static __always_inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k, unsigned type, unsigned min_bytes) { unsigned bytes = max_t(unsigned, min_bytes, bkey_bytes(k.k)); @@ -233,7 +285,7 @@ static inline struct bkey_i *__bch2_bkey_make_mut_noupdate(struct btree_trans *t return mut; } -static inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k) +static __always_inline struct bkey_i *bch2_bkey_make_mut_noupdate(struct btree_trans *trans, struct bkey_s_c k) { return __bch2_bkey_make_mut_noupdate(trans, k, 0, 0); } diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index e4e7c804625e..74e65714fecd 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -14,6 +14,7 @@ #include "btree_locking.h" #include "buckets.h" #include "clock.h" +#include "enumerated_ref.h" #include "error.h" #include "extents.h" #include "io_write.h" @@ -35,6 +36,8 @@ static const char * const bch2_btree_update_modes[] = { NULL }; +static void bch2_btree_update_to_text(struct printbuf *, struct btree_update *); + static int bch2_btree_insert_node(struct btree_update *, struct btree_trans *, btree_path_idx_t, struct 
btree *, struct keylist *); static void bch2_btree_update_add_new_node(struct btree_update *, struct btree *); @@ -54,6 +57,8 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) struct bkey_buf prev; int ret = 0; + printbuf_indent_add_nextline(&buf, 2); + BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, b->data->min_key)); @@ -64,19 +69,20 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) if (b == btree_node_root(c, b)) { if (!bpos_eq(b->data->min_key, POS_MIN)) { - printbuf_reset(&buf); + ret = __bch2_topology_error(c, &buf); + bch2_bpos_to_text(&buf, b->data->min_key); log_fsck_err(trans, btree_root_bad_min_key, "btree root with incorrect min_key: %s", buf.buf); - goto topology_repair; + goto out; } if (!bpos_eq(b->data->max_key, SPOS_MAX)) { - printbuf_reset(&buf); + ret = __bch2_topology_error(c, &buf); bch2_bpos_to_text(&buf, b->data->max_key); log_fsck_err(trans, btree_root_bad_max_key, "btree root with incorrect max_key: %s", buf.buf); - goto topology_repair; + goto out; } } @@ -94,20 +100,19 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) : bpos_successor(prev.k->k.p); if (!bpos_eq(expected_min, bp.v->min_key)) { - bch2_topology_error(c); + ret = __bch2_topology_error(c, &buf); - printbuf_reset(&buf); - prt_str(&buf, "end of prev node doesn't match start of next node\n in "); + prt_str(&buf, "end of prev node doesn't match start of next node\nin "); bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); prt_str(&buf, " node "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_str(&buf, "\n prev "); + prt_str(&buf, "\nprev "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); - prt_str(&buf, "\n next "); + prt_str(&buf, "\nnext "); bch2_bkey_val_to_text(&buf, c, k); log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf); - goto topology_repair; + goto out; } bch2_bkey_buf_reassemble(&prev, c, k); @@ -115,29 +120,25 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) } if (bkey_deleted(&prev.k->k)) { - bch2_topology_error(c); + ret = __bch2_topology_error(c, &buf); - printbuf_reset(&buf); - prt_str(&buf, "empty interior node\n in "); + prt_str(&buf, "empty interior node\nin "); bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); prt_str(&buf, " node "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); - goto topology_repair; } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) { - bch2_topology_error(c); + ret = __bch2_topology_error(c, &buf); - printbuf_reset(&buf); - prt_str(&buf, "last child node doesn't end at end of parent node\n in "); + prt_str(&buf, "last child node doesn't end at end of parent node\nin "); bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); prt_str(&buf, " node "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_str(&buf, "\n last key "); + prt_str(&buf, "\nlast key "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf); - goto topology_repair; } out: fsck_err: @@ -145,9 +146,6 @@ fsck_err: bch2_bkey_buf_exit(&prev, c); printbuf_exit(&buf); return ret; -topology_repair: - ret = bch2_topology_error(c); - goto out; } /* Calculate ideal packed bkey format for new btree nodes: */ @@ -287,6 +285,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, 
struct disk_reservation *res, struct closure *cl, bool interior_node, + unsigned target, unsigned flags) { struct bch_fs *c = trans->c; @@ -320,6 +319,7 @@ static struct btree *__bch2_btree_node_alloc(struct btree_trans *trans, mutex_unlock(&c->btree_reserve_cache_lock); retry: ret = bch2_alloc_sectors_start_trans(trans, + target ?: c->opts.metadata_target ?: c->opts.foreground_target, 0, @@ -328,7 +328,9 @@ retry: res->nr_replicas, min(res->nr_replicas, c->opts.metadata_replicas_required), - watermark, 0, cl, &wp); + watermark, + target ? BCH_WRITE_only_specified_devs : 0, + cl, &wp); if (unlikely(ret)) goto err; @@ -508,6 +510,7 @@ static void bch2_btree_reserve_put(struct btree_update *as, struct btree_trans * static int bch2_btree_reserve_get(struct btree_trans *trans, struct btree_update *as, unsigned nr_nodes[2], + unsigned target, unsigned flags, struct closure *cl) { @@ -530,7 +533,7 @@ static int bch2_btree_reserve_get(struct btree_trans *trans, while (p->nr < nr_nodes[interior]) { b = __bch2_btree_node_alloc(trans, &as->disk_res, cl, - interior, flags); + interior, target, flags); if (IS_ERR(b)) { ret = PTR_ERR(b); goto err; @@ -649,6 +652,14 @@ static int btree_update_nodes_written_trans(struct btree_trans *trans, return 0; } +/* If the node has been reused, we might be reading uninitialized memory - that's fine: */ +static noinline __no_kmsan_checks bool btree_node_seq_matches(struct btree *b, __le64 seq) +{ + struct btree_node *b_data = READ_ONCE(b->data); + + return (b_data ? b_data->keys.seq : 0) == seq; +} + static void btree_update_nodes_written(struct btree_update *as) { struct bch_fs *c = as->c; @@ -677,17 +688,9 @@ static void btree_update_nodes_written(struct btree_update *as) * on disk: */ for (i = 0; i < as->nr_old_nodes; i++) { - __le64 seq; - b = as->old_nodes[i]; - bch2_trans_begin(trans); - btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); - seq = b->data ? 
b->data->keys.seq : 0; - six_unlock_read(&b->c.lock); - bch2_trans_unlock_long(trans); - - if (seq == as->old_nodes_seq[i]) + if (btree_node_seq_matches(b, as->old_nodes_seq[i])) wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, TASK_UNINTERRUPTIBLE); } @@ -1119,7 +1122,8 @@ static void bch2_btree_update_done(struct btree_update *as, struct btree_trans * static struct btree_update * bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, - unsigned level_start, bool split, unsigned flags) + unsigned level_start, bool split, + unsigned target, unsigned flags) { struct bch_fs *c = trans->c; struct btree_update *as; @@ -1224,12 +1228,12 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, ret = bch2_disk_reservation_get(c, &as->disk_res, (nr_nodes[0] + nr_nodes[1]) * btree_sectors(c), - c->opts.metadata_replicas, + READ_ONCE(c->opts.metadata_replicas), disk_res_flags); if (ret) goto err; - ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, NULL); + ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, NULL); if (bch2_err_matches(ret, ENOSPC) || bch2_err_matches(ret, ENOMEM)) { struct closure cl; @@ -1248,7 +1252,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, closure_init_stack(&cl); do { - ret = bch2_btree_reserve_get(trans, as, nr_nodes, flags, &cl); + ret = bch2_btree_reserve_get(trans, as, nr_nodes, target, flags, &cl); bch2_trans_unlock(trans); bch2_wait_on_allocator(c, &cl); @@ -1271,7 +1275,8 @@ err: bch2_btree_update_free(as, trans); if (!bch2_err_matches(ret, ENOSPC) && !bch2_err_matches(ret, EROFS) && - ret != -BCH_ERR_journal_reclaim_would_deadlock) + ret != -BCH_ERR_journal_reclaim_would_deadlock && + ret != -BCH_ERR_journal_shutdown) bch_err_fn_ratelimited(c, ret); return ERR_PTR(ret); } @@ -1391,7 +1396,7 @@ static void bch2_insert_fixup_btree_ptr(struct btree_update *as, printbuf_exit(&buf); } -static void +static int bch2_btree_insert_keys_interior(struct btree_update *as, struct btree_trans *trans, struct btree_path *path, @@ -1413,7 +1418,8 @@ bch2_btree_insert_keys_interior(struct btree_update *as, insert = bkey_next(insert)) bch2_insert_fixup_btree_ptr(as, trans, path, b, &node_iter, insert); - if (bch2_btree_node_check_topology(trans, b)) { + int ret = bch2_btree_node_check_topology(trans, b); + if (ret) { struct printbuf buf = PRINTBUF; for (struct bkey_i *k = keys->keys; @@ -1423,11 +1429,15 @@ bch2_btree_insert_keys_interior(struct btree_update *as, prt_newline(&buf); } - panic("%s(): check_topology error: inserted keys\n%s", __func__, buf.buf); + bch2_fs_fatal_error(as->c, "%ps -> %s(): check_topology error %s: inserted keys\n%s", + (void *) _RET_IP_, __func__, bch2_err_str(ret), buf.buf); + dump_stack(); + return ret; } memmove_u64s_down(keys->keys, insert, keys->top_p - insert->_data); keys->top_p -= insert->_data - keys->keys_p; + return 0; } static bool key_deleted_in_insert(struct keylist *insert_keys, struct bpos pos) @@ -1561,11 +1571,11 @@ static void __btree_split_node(struct btree_update *as, * nodes that were coalesced, and thus in the middle of a child node post * coalescing: */ -static void btree_split_insert_keys(struct btree_update *as, - struct btree_trans *trans, - btree_path_idx_t path_idx, - struct btree *b, - struct keylist *keys) +static int btree_split_insert_keys(struct btree_update *as, + struct btree_trans *trans, + btree_path_idx_t path_idx, + struct btree *b, + struct keylist *keys) { struct btree_path *path = trans->paths + path_idx; @@ 
-1575,8 +1585,12 @@ static void btree_split_insert_keys(struct btree_update *as, bch2_btree_node_iter_init(&node_iter, b, &bch2_keylist_front(keys)->k.p); - bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); + int ret = bch2_btree_insert_keys_interior(as, trans, path, b, node_iter, keys); + if (ret) + return ret; } + + return 0; } static int btree_split(struct btree_update *as, struct btree_trans *trans, @@ -1609,8 +1623,10 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, __btree_split_node(as, trans, b, n, keys); if (keys) { - btree_split_insert_keys(as, trans, path, n1, keys); - btree_split_insert_keys(as, trans, path, n2, keys); + ret = btree_split_insert_keys(as, trans, path, n1, keys) ?: + btree_split_insert_keys(as, trans, path, n2, keys); + if (ret) + goto err; BUG_ON(!bch2_keylist_empty(keys)); } @@ -1656,7 +1672,9 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, n3->sib_u64s[0] = U16_MAX; n3->sib_u64s[1] = U16_MAX; - btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); + ret = btree_split_insert_keys(as, trans, path, n3, &as->parent_keys); + if (ret) + goto err; } } else { trace_and_count(c, btree_node_compact, trans, b); @@ -1664,7 +1682,9 @@ static int btree_split(struct btree_update *as, struct btree_trans *trans, n1 = bch2_btree_node_alloc_replacement(as, trans, b); if (keys) { - btree_split_insert_keys(as, trans, path, n1, keys); + ret = btree_split_insert_keys(as, trans, path, n1, keys); + if (ret) + goto err; BUG_ON(!bch2_keylist_empty(keys)); } @@ -1782,11 +1802,24 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t int ret; lockdep_assert_held(&c->gc_lock); - BUG_ON(!btree_node_intent_locked(path, b->c.level)); BUG_ON(!b->c.level); BUG_ON(!as || as->b); bch2_verify_keylist_sorted(keys); + if (!btree_node_intent_locked(path, b->c.level)) { + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "%s(): node not locked at level %u\n", + __func__, b->c.level); + bch2_btree_update_to_text(&buf, as); + bch2_btree_path_to_text(&buf, trans, path_idx); + bch2_fs_emergency_read_only2(c, &buf); + + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + return -EIO; + } + ret = bch2_btree_node_lock_write(trans, path, &b->c); if (ret) return ret; @@ -1798,15 +1831,15 @@ static int bch2_btree_insert_node(struct btree_update *as, struct btree_trans *t goto split; } - ret = bch2_btree_node_check_topology(trans, b); + + ret = bch2_btree_node_check_topology(trans, b) ?: + bch2_btree_insert_keys_interior(as, trans, path, b, + path->l[b->c.level].iter, keys); if (ret) { bch2_btree_node_unlock_write(trans, path, b); return ret; } - bch2_btree_insert_keys_interior(as, trans, path, b, - path->l[b->c.level].iter, keys); - trans_for_each_path_with_node(trans, b, linked, i) bch2_btree_node_iter_peek(&linked->l[b->c.level].iter, b); @@ -1852,7 +1885,7 @@ int bch2_btree_split_leaf(struct btree_trans *trans, as = bch2_btree_update_start(trans, trans->paths + path, trans->paths[path].level, - true, flags); + true, 0, flags); if (IS_ERR(as)) return PTR_ERR(as); @@ -1922,7 +1955,8 @@ int bch2_btree_increase_depth(struct btree_trans *trans, btree_path_idx_t path, return bch2_btree_split_leaf(trans, path, flags); struct btree_update *as = - bch2_btree_update_start(trans, trans->paths + path, b->c.level, true, flags); + bch2_btree_update_start(trans, trans->paths + path, b->c.level, + true, 0, flags); if (IS_ERR(as)) return PTR_ERR(as); @@ -2007,18 
+2041,22 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, } if (!bpos_eq(bpos_successor(prev->data->max_key), next->data->min_key)) { - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; - - bch2_bpos_to_text(&buf1, prev->data->max_key); - bch2_bpos_to_text(&buf2, next->data->min_key); - bch_err(c, - "%s(): btree topology error:\n" - " prev ends at %s\n" - " next starts at %s", - __func__, buf1.buf, buf2.buf); - printbuf_exit(&buf1); - printbuf_exit(&buf2); - ret = bch2_topology_error(c); + struct printbuf buf = PRINTBUF; + + printbuf_indent_add_nextline(&buf, 2); + prt_printf(&buf, "%s(): ", __func__); + ret = __bch2_topology_error(c, &buf); + prt_newline(&buf); + + prt_printf(&buf, "prev ends at "); + bch2_bpos_to_text(&buf, prev->data->max_key); + prt_newline(&buf); + + prt_printf(&buf, "next starts at "); + bch2_bpos_to_text(&buf, next->data->min_key); + + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); goto err; } @@ -2047,7 +2085,7 @@ int __bch2_foreground_maybe_merge(struct btree_trans *trans, parent = btree_node_parent(trans->paths + path, b); as = bch2_btree_update_start(trans, trans->paths + path, level, false, - BCH_TRANS_COMMIT_no_enospc|flags); + 0, BCH_TRANS_COMMIT_no_enospc|flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto err; @@ -2126,9 +2164,35 @@ err_free_update: goto out; } +static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter, + struct btree *b) +{ + bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p, + BTREE_MAX_DEPTH, b->c.level, + BTREE_ITER_intent); + int ret = bch2_btree_iter_traverse(trans, iter); + if (ret) + goto err; + + /* has node been freed? */ + if (btree_iter_path(trans, iter)->l[b->c.level].b != b) { + /* node has been freed: */ + BUG_ON(!btree_node_dying(b)); + ret = -BCH_ERR_btree_node_dying; + goto err; + } + + BUG_ON(!btree_node_hashed(b)); + return 0; +err: + bch2_trans_iter_exit(trans, iter); + return ret; +} + int bch2_btree_node_rewrite(struct btree_trans *trans, struct btree_iter *iter, struct btree *b, + unsigned target, unsigned flags) { struct bch_fs *c = trans->c; @@ -2141,7 +2205,8 @@ int bch2_btree_node_rewrite(struct btree_trans *trans, struct btree_path *path = btree_iter_path(trans, iter); parent = btree_node_parent(path, b); - as = bch2_btree_update_start(trans, path, b->c.level, false, flags); + as = bch2_btree_update_start(trans, path, b->c.level, + false, target, flags); ret = PTR_ERR_OR_ZERO(as); if (ret) goto out; @@ -2191,67 +2256,83 @@ err: goto out; } -struct async_btree_rewrite { - struct bch_fs *c; - struct work_struct work; - struct list_head list; - enum btree_id btree_id; - unsigned level; - struct bkey_buf key; -}; - -static int async_btree_node_rewrite_trans(struct btree_trans *trans, - struct async_btree_rewrite *a) +static int bch2_btree_node_rewrite_key(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bkey_i *k, unsigned flags) { struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, - a->btree_id, a->key.k->k.p, - BTREE_MAX_DEPTH, a->level, 0); - struct btree *b = bch2_btree_iter_peek_node(&iter); + btree, k->k.p, + BTREE_MAX_DEPTH, level, 0); + struct btree *b = bch2_btree_iter_peek_node(trans, &iter); int ret = PTR_ERR_OR_ZERO(b); if (ret) goto out; - bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(a->key.k); + bool found = b && btree_ptr_hash_val(&b->key) == btree_ptr_hash_val(k); ret = found - ? bch2_btree_node_rewrite(trans, &iter, b, 0) + ? 
bch2_btree_node_rewrite(trans, &iter, b, 0, flags) : -ENOENT; +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} -#if 0 - /* Tracepoint... */ - if (!ret || ret == -ENOENT) { - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; +int bch2_btree_node_rewrite_pos(struct btree_trans *trans, + enum btree_id btree, unsigned level, + struct bpos pos, + unsigned target, + unsigned flags) +{ + BUG_ON(!level); - if (!ret) { - prt_printf(&buf, "rewrite node:\n "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k)); - } else { - prt_printf(&buf, "node to rewrite not found:\n want: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(a->key.k)); - prt_printf(&buf, "\n got: "); - if (b) - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - else - prt_str(&buf, "(null)"); - } - bch_info(c, "%s", buf.buf); - printbuf_exit(&buf); - } -#endif -out: + /* Traverse one depth lower to get a pointer to the node itself: */ + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, pos, 0, level - 1, 0); + struct btree *b = bch2_btree_iter_peek_node(trans, &iter); + int ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto err; + + ret = bch2_btree_node_rewrite(trans, &iter, b, target, flags); +err: bch2_trans_iter_exit(trans, &iter); return ret; } +int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *trans, + struct btree *b, unsigned flags) +{ + struct btree_iter iter; + int ret = get_iter_to_node(trans, &iter, b); + if (ret) + return ret == -BCH_ERR_btree_node_dying ? 0 : ret; + + ret = bch2_btree_node_rewrite(trans, &iter, b, 0, flags); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +struct async_btree_rewrite { + struct bch_fs *c; + struct work_struct work; + struct list_head list; + enum btree_id btree_id; + unsigned level; + struct bkey_buf key; +}; + static void async_btree_node_rewrite_work(struct work_struct *work) { struct async_btree_rewrite *a = container_of(work, struct async_btree_rewrite, work); struct bch_fs *c = a->c; - int ret = bch2_trans_do(c, async_btree_node_rewrite_trans(trans, a)); - if (ret != -ENOENT) + int ret = bch2_trans_do(c, bch2_btree_node_rewrite_key(trans, + a->btree_id, a->level, a->key.k, 0)); + if (ret != -ENOENT && + !bch2_err_matches(ret, EROFS) && + ret != -BCH_ERR_journal_shutdown) bch_err_fn_ratelimited(c, ret); spin_lock(&c->btree_node_rewrites_lock); @@ -2261,7 +2342,7 @@ static void async_btree_node_rewrite_work(struct work_struct *work) closure_wake_up(&c->btree_node_rewrites_wait); bch2_bkey_buf_exit(&a->key, c); - bch2_write_ref_put(c, BCH_WRITE_REF_node_rewrite); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_node_rewrite); kfree(a); } @@ -2282,8 +2363,8 @@ void bch2_btree_node_rewrite_async(struct bch_fs *c, struct btree *b) bool now = false, pending = false; spin_lock(&c->btree_node_rewrites_lock); - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_journal_replay && - bch2_write_ref_tryget(c, BCH_WRITE_REF_node_rewrite)) { + if (c->recovery.passes_complete & BIT_ULL(BCH_RECOVERY_PASS_journal_replay) && + enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_node_rewrite)) { list_add(&a->list, &c->btree_node_rewrites); now = true; } else if (!test_bit(BCH_FS_may_go_rw, &c->flags)) { @@ -2322,7 +2403,7 @@ void bch2_do_pending_node_rewrites(struct bch_fs *c) if (!a) break; - bch2_write_ref_get(c, BCH_WRITE_REF_node_rewrite); + enumerated_ref_get(&c->writes, BCH_WRITE_REF_node_rewrite); queue_work(c->btree_node_rewrite_worker, &a->work); } } @@ -2352,7 +2433,7 @@ static int __bch2_btree_node_update_key(struct btree_trans 
*trans, bool skip_triggers) { struct bch_fs *c = trans->c; - struct btree_iter iter2 = { NULL }; + struct btree_iter iter2 = {}; struct btree *parent; int ret; @@ -2376,7 +2457,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, parent = btree_node_parent(btree_iter_path(trans, iter), b); if (parent) { - bch2_trans_copy_iter(&iter2, iter); + bch2_trans_copy_iter(trans, &iter2, iter); iter2.path = bch2_btree_path_make_mut(trans, iter2.path, iter2.flags & BTREE_ITER_intent, @@ -2390,7 +2471,7 @@ static int __bch2_btree_node_update_key(struct btree_trans *trans, trans->paths_sorted = false; - ret = bch2_btree_iter_traverse(&iter2) ?: + ret = bch2_btree_iter_traverse(trans, &iter2) ?: bch2_trans_update(trans, &iter2, new_key, BTREE_TRIGGER_norun); if (ret) goto err; @@ -2494,30 +2575,15 @@ int bch2_btree_node_update_key_get_iter(struct btree_trans *trans, unsigned commit_flags, bool skip_triggers) { struct btree_iter iter; - int ret; - - bch2_trans_node_iter_init(trans, &iter, b->c.btree_id, b->key.k.p, - BTREE_MAX_DEPTH, b->c.level, - BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(&iter); + int ret = get_iter_to_node(trans, &iter, b); if (ret) - goto out; - - /* has node been freed? */ - if (btree_iter_path(trans, &iter)->l[b->c.level].b != b) { - /* node has been freed: */ - BUG_ON(!btree_node_dying(b)); - goto out; - } - - BUG_ON(!btree_node_hashed(b)); + return ret == -BCH_ERR_btree_node_dying ? 0 : ret; bch2_bkey_drop_ptrs(bkey_i_to_s(new_key), ptr, !bch2_bkey_has_device(bkey_i_to_s(&b->key), ptr->dev)); ret = bch2_btree_node_update_key(trans, &iter, b, new_key, commit_flags, skip_triggers); -out: bch2_trans_iter_exit(trans, &iter); return ret; } diff --git a/fs/bcachefs/btree_update_interior.h b/fs/bcachefs/btree_update_interior.h index 26d646e1275c..7fe793788a79 100644 --- a/fs/bcachefs/btree_update_interior.h +++ b/fs/bcachefs/btree_update_interior.h @@ -144,7 +144,7 @@ static inline int bch2_foreground_maybe_merge_sibling(struct btree_trans *trans, EBUG_ON(!btree_node_locked(path, level)); - if (bch2_btree_node_merging_disabled) + if (static_branch_unlikely(&bch2_btree_node_merging_disabled)) return 0; b = path->l[level].b; @@ -168,8 +168,15 @@ static inline int bch2_foreground_maybe_merge(struct btree_trans *trans, } int bch2_btree_node_rewrite(struct btree_trans *, struct btree_iter *, - struct btree *, unsigned); + struct btree *, unsigned, unsigned); +int bch2_btree_node_rewrite_pos(struct btree_trans *, + enum btree_id, unsigned, + struct bpos, unsigned, unsigned); +int bch2_btree_node_rewrite_key_get_iter(struct btree_trans *, + struct btree *, unsigned); + void bch2_btree_node_rewrite_async(struct bch_fs *, struct btree *); + int bch2_btree_node_update_key(struct btree_trans *, struct btree_iter *, struct btree *, struct bkey_i *, unsigned, bool); diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index b56c4987b8c9..efb0c64d0aac 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -7,6 +7,7 @@ #include "btree_update_interior.h" #include "btree_write_buffer.h" #include "disk_accounting.h" +#include "enumerated_ref.h" #include "error.h" #include "extents.h" #include "journal.h" @@ -144,7 +145,7 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite EBUG_ON(!trans->c->btree_write_buffer.flushing.pin.seq); EBUG_ON(trans->c->btree_write_buffer.flushing.pin.seq > wb->journal_seq); - ret = bch2_btree_iter_traverse(iter); + ret = bch2_btree_iter_traverse(trans, 
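The get_iter_to_node() helper that both callers above now share is not itself shown in this diff; a plausible reconstruction from the open-coded sequence the second hunk deletes (the -BCH_ERR_btree_node_dying convention matches how both call sites treat it), which may differ from the real helper:

/* Sketch reconstructed from the removed code above, not taken from the
 * patch: get an iterator pointing at node @b, detecting whether the node
 * was freed while we weren't holding locks. */
static int get_iter_to_node_sketch(struct btree_trans *trans,
				   struct btree_iter *iter, struct btree *b)
{
	bch2_trans_node_iter_init(trans, iter, b->c.btree_id, b->key.k.p,
				  BTREE_MAX_DEPTH, b->c.level,
				  BTREE_ITER_intent);
	int ret = bch2_btree_iter_traverse(trans, iter);
	if (ret)
		goto err;

	/* Has the node been freed under us? */
	if (btree_iter_path(trans, iter)->l[b->c.level].b != b) {
		BUG_ON(!btree_node_dying(b));
		ret = -BCH_ERR_btree_node_dying;
		goto err;
	}

	BUG_ON(!btree_node_hashed(b));
	return 0;
err:
	bch2_trans_iter_exit(trans, iter);
	return ret;
}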
iter); if (ret) return ret; @@ -181,6 +182,8 @@ static inline int wb_flush_one(struct btree_trans *trans, struct btree_iter *ite return wb_flush_one_slowpath(trans, iter, wb); } + EBUG_ON(!bpos_eq(wb->k.k.p, path->pos)); + bch2_btree_insert_key_leaf(trans, path, &wb->k, wb->journal_seq); (*fast)++; return 0; @@ -208,7 +211,7 @@ btree_write_buffered_insert(struct btree_trans *trans, trans->journal_res.seq = wb->journal_seq; - ret = bch2_btree_iter_traverse(&iter) ?: + ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_trans_update(trans, &iter, &wb->k, BTREE_UPDATE_internal_snapshot_node); bch2_trans_iter_exit(trans, &iter); @@ -264,12 +267,28 @@ out: BUG_ON(wb->sorted.size < wb->flushing.keys.nr); } +int bch2_btree_write_buffer_insert_err(struct btree_trans *trans, + enum btree_id btree, struct bkey_i *k) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "attempting to do write buffer update on non wb btree="); + bch2_btree_id_to_text(&buf, btree); + prt_str(&buf, "\n"); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(k)); + + bch2_fs_inconsistent(c, "%s", buf.buf); + printbuf_exit(&buf); + return -EROFS; +} + static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) { struct bch_fs *c = trans->c; struct journal *j = &c->journal; struct btree_write_buffer *wb = &c->btree_write_buffer; - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; size_t overwritten = 0, fast = 0, slowpath = 0, could_not_insert = 0; bool write_locked = false; bool accounting_replay_done = test_bit(BCH_FS_accounting_replay_done, &c->flags); @@ -312,7 +331,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) darray_for_each(wb->sorted, i) { struct btree_write_buffered_key *k = &wb->flushing.keys.data[i->idx]; - BUG_ON(!btree_type_uses_write_buffer(k->btree)); + if (unlikely(!btree_type_uses_write_buffer(k->btree))) { + ret = bch2_btree_write_buffer_insert_err(trans, k->btree, &k->k); + goto err; + } for (struct wb_key_ref *n = i + 1; n < min(i + 4, &darray_top(wb->sorted)); n++) prefetch(&wb->flushing.keys.data[n->idx]); @@ -349,7 +371,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) write_locked = false; ret = lockrestart_do(trans, - bch2_btree_iter_traverse(&iter) ?: + bch2_btree_iter_traverse(trans, &iter) ?: bch2_foreground_maybe_merge(trans, iter.path, 0, BCH_WATERMARK_reclaim| BCH_TRANS_COMMIT_journal_reclaim| @@ -366,7 +388,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) BTREE_ITER_intent|BTREE_ITER_all_snapshots); } - bch2_btree_iter_set_pos(&iter, k->k.k.p); + bch2_btree_iter_set_pos(trans, &iter, k->k.k.p); btree_iter_path(trans, &iter)->preserve = false; bool accounting_accumulated = false; @@ -409,10 +431,10 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) */ trace_and_count(c, write_buffer_flush_slowpath, trans, slowpath, wb->flushing.keys.nr); - sort(wb->flushing.keys.data, - wb->flushing.keys.nr, - sizeof(wb->flushing.keys.data[0]), - wb_key_seq_cmp, NULL); + sort_nonatomic(wb->flushing.keys.data, + wb->flushing.keys.nr, + sizeof(wb->flushing.keys.data[0]), + wb_key_seq_cmp, NULL); darray_for_each(wb->flushing.keys, i) { if (!i->journal_seq) @@ -610,11 +632,11 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans) { struct bch_fs *c = trans->c; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer)) return 
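The new bch2_btree_write_buffer_insert_err() above follows the printbuf reporting idiom used throughout this series; distilled to its general shape as a sketch (report_example() is illustrative only — every call inside it appears elsewhere in this diff):

/* Illustrative only: build a message in a heap printbuf, hand it to the
 * inconsistency machinery, then free the buffer. */
static void report_example(struct bch_fs *c, struct bkey_s_c k)
{
	struct printbuf buf = PRINTBUF;

	prt_str(&buf, "unexpected key: ");
	bch2_bkey_val_to_text(&buf, c, k);

	bch2_fs_inconsistent(c, "%s", buf.buf);
	printbuf_exit(&buf);
}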
-BCH_ERR_erofs_no_writes; int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans); - bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); return ret; } @@ -673,7 +695,7 @@ static void bch2_btree_write_buffer_flush_work(struct work_struct *work) } while (!ret && bch2_btree_write_buffer_should_flush(c)); mutex_unlock(&wb->flushing.lock); - bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); } static void wb_accounting_sort(struct btree_write_buffer *wb) @@ -802,9 +824,9 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *c, struct journal_keys_ bch2_journal_pin_drop(&c->journal, &dst->wb->pin); if (bch2_btree_write_buffer_should_flush(c) && - __bch2_write_ref_tryget(c, BCH_WRITE_REF_btree_write_buffer) && + __enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer) && !queue_work(system_unbound_wq, &c->btree_write_buffer.flush_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_btree_write_buffer); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); if (dst->wb == &wb->flushing) mutex_unlock(&wb->flushing.lock); @@ -847,13 +869,18 @@ void bch2_fs_btree_write_buffer_exit(struct bch_fs *c) darray_exit(&wb->inc.keys); } -int bch2_fs_btree_write_buffer_init(struct bch_fs *c) +void bch2_fs_btree_write_buffer_init_early(struct bch_fs *c) { struct btree_write_buffer *wb = &c->btree_write_buffer; mutex_init(&wb->inc.lock); mutex_init(&wb->flushing.lock); INIT_WORK(&wb->flush_work, bch2_btree_write_buffer_flush_work); +} + +int bch2_fs_btree_write_buffer_init(struct bch_fs *c) +{ + struct btree_write_buffer *wb = &c->btree_write_buffer; /* Will be resized by journal as needed: */ unsigned initial_size = 1 << 16; diff --git a/fs/bcachefs/btree_write_buffer.h b/fs/bcachefs/btree_write_buffer.h index d535cea28bde..05f56fd1eed0 100644 --- a/fs/bcachefs/btree_write_buffer.h +++ b/fs/bcachefs/btree_write_buffer.h @@ -101,6 +101,7 @@ int bch2_journal_keys_to_write_buffer_end(struct bch_fs *, struct journal_keys_t int bch2_btree_write_buffer_resize(struct bch_fs *, size_t); void bch2_fs_btree_write_buffer_exit(struct bch_fs *); +void bch2_fs_btree_write_buffer_init_early(struct bch_fs *); int bch2_fs_btree_write_buffer_init(struct bch_fs *); #endif /* _BCACHEFS_BTREE_WRITE_BUFFER_H */ diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 345b117a4a4a..09eb5a543ae4 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -30,8 +30,15 @@ void bch2_dev_usage_read_fast(struct bch_dev *ca, struct bch_dev_usage *usage) { + for (unsigned i = 0; i < BCH_DATA_NR; i++) + usage->buckets[i] = percpu_u64_get(&ca->usage->d[i].buckets); +} + +void bch2_dev_usage_full_read_fast(struct bch_dev *ca, struct bch_dev_usage_full *usage) +{ memset(usage, 0, sizeof(*usage)); - acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, dev_usage_u64s()); + acc_u64s_percpu((u64 *) usage, (u64 __percpu *) ca->usage, + sizeof(struct bch_dev_usage_full) / sizeof(u64)); } static u64 reserve_factor(u64 r) @@ -75,7 +82,7 @@ bch2_fs_usage_read_short(struct bch_fs *c) void bch2_dev_usage_to_text(struct printbuf *out, struct bch_dev *ca, - struct bch_dev_usage *usage) + struct bch_dev_usage_full *usage) { if (out->nr_tabstops < 5) { printbuf_tabstops_reset(out); @@ -149,10 +156,14 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, g->gen_valid = true; g->gen = p.ptr.gen; } else { + /* this pointer will be dropped */ *do_update = true; + goto 
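The bch2_write_ref_* to enumerated_ref_* conversion above keeps the same tryget/put discipline around a unit of work; the guard shape, as a sketch (do_flush_work() is a placeholder for whatever the reference protects):

/* Sketch of the guard pattern, using only the tryget/put calls visible
 * above; do_flush_work() is a placeholder. */
static int with_write_ref(struct bch_fs *c)
{
	if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer))
		return -BCH_ERR_erofs_no_writes;

	int ret = do_flush_work(c);

	enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer);
	return ret;
}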
out; } } + /* g->gen_valid == true */ + if (fsck_err_on(gen_cmp(p.ptr.gen, g->gen) > 0, trans, ptr_gen_newer_than_bucket_gen, "bucket %u:%zu data type %s ptr gen in the future: %u > %u\n" @@ -165,15 +176,13 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, if (!p.ptr.cached && (g->data_type != BCH_DATA_btree || data_type == BCH_DATA_btree)) { - g->gen_valid = true; - g->gen = p.ptr.gen; - g->data_type = 0; + g->data_type = data_type; g->stripe_sectors = 0; g->dirty_sectors = 0; g->cached_sectors = 0; - } else { - *do_update = true; } + + *do_update = true; } if (fsck_err_on(gen_cmp(g->gen, p.ptr.gen) > BUCKET_GC_GEN_MAX, @@ -210,9 +219,8 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, bch2_data_type_str(data_type), (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - if (data_type == BCH_DATA_btree) { - g->gen_valid = true; - g->gen = p.ptr.gen; + if (!p.ptr.cached && + data_type == BCH_DATA_btree) { g->data_type = data_type; g->stripe_sectors = 0; g->dirty_sectors = 0; @@ -365,7 +373,7 @@ found: struct btree_iter iter; bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, BTREE_ITER_intent|BTREE_ITER_all_snapshots); - ret = bch2_btree_iter_traverse(&iter) ?: + ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node| BTREE_TRIGGER_norun); @@ -381,6 +389,31 @@ err: return ret; } +static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf, + struct bkey_s_c k, bool insert, enum bch_sb_error_id id) +{ + struct bch_fs *c = trans->c; + + prt_printf(buf, "\nwhile marking "); + bch2_bkey_val_to_text(buf, c, k); + prt_newline(buf); + + bool print = __bch2_count_fsck_err(c, id, buf); + + int ret = bch2_run_explicit_recovery_pass(c, buf, + BCH_RECOVERY_PASS_check_allocations, 0); + + if (insert) { + bch2_trans_updates_to_text(buf, trans); + __bch2_inconsistent_error(c, buf); + ret = -BCH_ERR_bucket_ref_update; + } + + if (print || insert) + bch2_print_str(c, KERN_ERR, buf->buf); + return ret; +} + int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, struct bkey_s_c k, const struct bch_extent_ptr *ptr, @@ -396,32 +429,29 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, BUG_ON(!sectors); - if (gen_after(ptr->gen, b_gen)) { - bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); - log_fsck_err(trans, ptr_gen_newer_than_bucket_gen, - "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen\n" - "while marking %s", + if (unlikely(gen_after(ptr->gen, b_gen))) { + bch2_log_msg_start(c, &buf); + prt_printf(&buf, + "bucket %u:%zu gen %u data type %s: ptr gen %u newer than bucket gen", ptr->dev, bucket_nr, b_gen, bch2_data_type_str(bucket_data_type ?: ptr_data_type), - ptr->gen, - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - if (inserting) - goto err; + ptr->gen); + + ret = bucket_ref_update_err(trans, &buf, k, inserting, + BCH_FSCK_ERR_ptr_gen_newer_than_bucket_gen); goto out; } - if (gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX) { - bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); - log_fsck_err(trans, ptr_too_stale, - "bucket %u:%zu gen %u data type %s: ptr gen %u too stale\n" - "while marking %s", + if (unlikely(gen_cmp(b_gen, ptr->gen) > BUCKET_GC_GEN_MAX)) { + bch2_log_msg_start(c, &buf); + prt_printf(&buf, + "bucket %u:%zu gen %u data type %s: ptr gen %u too stale", ptr->dev, bucket_nr, b_gen, bch2_data_type_str(bucket_data_type ?: ptr_data_type), - 
ptr->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - if (inserting) - goto err; + ptr->gen); + + ret = bucket_ref_update_err(trans, &buf, k, inserting, + BCH_FSCK_ERR_ptr_too_stale); goto out; } @@ -430,62 +460,50 @@ int bch2_bucket_ref_update(struct btree_trans *trans, struct bch_dev *ca, goto out; } - if (b_gen != ptr->gen) { - bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); - log_fsck_err(trans, stale_dirty_ptr, - "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)\n" - "while marking %s", + if (unlikely(b_gen != ptr->gen)) { + bch2_log_msg_start(c, &buf); + prt_printf(&buf, + "bucket %u:%zu gen %u (mem gen %u) data type %s: stale dirty ptr (gen %u)", ptr->dev, bucket_nr, b_gen, bucket_gen_get(ca, bucket_nr), bch2_data_type_str(bucket_data_type ?: ptr_data_type), - ptr->gen, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - if (inserting) - goto err; + ptr->gen); + + ret = bucket_ref_update_err(trans, &buf, k, inserting, + BCH_FSCK_ERR_stale_dirty_ptr); goto out; } - if (bucket_data_type_mismatch(bucket_data_type, ptr_data_type)) { - bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); - log_fsck_err(trans, ptr_bucket_data_type_mismatch, - "bucket %u:%zu gen %u different types of data in same bucket: %s, %s\n" - "while marking %s", - ptr->dev, bucket_nr, b_gen, - bch2_data_type_str(bucket_data_type), - bch2_data_type_str(ptr_data_type), - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - if (inserting) - goto err; + if (unlikely(bucket_data_type_mismatch(bucket_data_type, ptr_data_type))) { + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "bucket %u:%zu gen %u different types of data in same bucket: %s, %s", + ptr->dev, bucket_nr, b_gen, + bch2_data_type_str(bucket_data_type), + bch2_data_type_str(ptr_data_type)); + + ret = bucket_ref_update_err(trans, &buf, k, inserting, + BCH_FSCK_ERR_ptr_bucket_data_type_mismatch); goto out; } - if ((u64) *bucket_sectors + sectors > U32_MAX) { - bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); - log_fsck_err(trans, bucket_sector_count_overflow, - "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX\n" - "while marking %s", + if (unlikely((u64) *bucket_sectors + sectors > U32_MAX)) { + bch2_log_msg_start(c, &buf); + prt_printf(&buf, + "bucket %u:%zu gen %u data type %s sector count overflow: %u + %lli > U32_MAX", ptr->dev, bucket_nr, b_gen, bch2_data_type_str(bucket_data_type ?: ptr_data_type), - *bucket_sectors, sectors, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, k), buf.buf)); - if (inserting) - goto err; + *bucket_sectors, sectors); + + ret = bucket_ref_update_err(trans, &buf, k, inserting, + BCH_FSCK_ERR_bucket_sector_count_overflow); sectors = -*bucket_sectors; + goto out; } *bucket_sectors += sectors; out: printbuf_exit(&buf); return ret; -err: -fsck_err: - bch2_dump_trans_updates(trans); - bch2_inconsistent_error(c); - ret = -BCH_ERR_bucket_ref_update; - goto out; } void bch2_trans_account_disk_usage_change(struct btree_trans *trans) @@ -582,6 +600,13 @@ static int bch2_trigger_pointer(struct btree_trans *trans, } struct bpos bucket = PTR_BUCKET_POS(ca, &p.ptr); + if (!bucket_valid(ca, bucket.offset)) { + if (insert) { + bch2_dev_bucket_missing(ca, bucket.offset); + ret = -BCH_ERR_trigger_pointer; + } + goto err; + } if (flags & BTREE_TRIGGER_transactional) { struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); 
@@ -590,11 +615,9 @@ static int bch2_trigger_pointer(struct btree_trans *trans, if (ret) goto err; - if (!p.ptr.cached) { - ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert); - if (ret) - goto err; - } + ret = bch2_bucket_backpointer_mod(trans, k, &bp, insert); + if (ret) + goto err; } if (flags & BTREE_TRIGGER_gc) { @@ -653,9 +676,9 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, stripe_blockcount_get(&s->v, p.ec.block) + sectors); - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_replicas, - }; + struct disk_accounting_pos acc; + memset(&acc, 0, sizeof(acc)); + acc.type = BCH_DISK_ACCOUNTING_replicas; bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); acc.replicas.data_type = data_type; ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); @@ -674,26 +697,28 @@ err: return -BCH_ERR_ENOMEM_mark_stripe_ptr; } - mutex_lock(&c->ec_stripes_heap_lock); + gc_stripe_lock(m); if (!m || !m->alive) { - mutex_unlock(&c->ec_stripes_heap_lock); + gc_stripe_unlock(m); struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "pointer to nonexistent stripe %llu\n while marking ", + (u64) p.ec.idx); bch2_bkey_val_to_text(&buf, c, k); - bch_err_ratelimited(c, "pointer to nonexistent stripe %llu\n while marking %s", - (u64) p.ec.idx, buf.buf); + __bch2_inconsistent_error(c, &buf); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); - bch2_inconsistent_error(c); return -BCH_ERR_trigger_stripe_pointer; } m->block_sectors[p.ec.block] += sectors; - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_replicas, - }; - memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e)); - mutex_unlock(&c->ec_stripes_heap_lock); + struct disk_accounting_pos acc; + memset(&acc, 0, sizeof(acc)); + acc.type = BCH_DISK_ACCOUNTING_replicas; + unsafe_memcpy(&acc.replicas, &m->r.e, replicas_entry_bytes(&m->r.e), "VLA"); + gc_stripe_unlock(m); acc.replicas.data_type = data_type; int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, true); @@ -719,16 +744,14 @@ static int __trigger_extent(struct btree_trans *trans, : BCH_DATA_user; int ret = 0; - struct disk_accounting_pos acc_replicas_key = { - .type = BCH_DISK_ACCOUNTING_replicas, - .replicas.data_type = data_type, - .replicas.nr_devs = 0, - .replicas.nr_required = 1, - }; + struct disk_accounting_pos acc_replicas_key; + memset(&acc_replicas_key, 0, sizeof(acc_replicas_key)); + acc_replicas_key.type = BCH_DISK_ACCOUNTING_replicas; + acc_replicas_key.replicas.data_type = data_type; + acc_replicas_key.replicas.nr_devs = 0; + acc_replicas_key.replicas.nr_required = 1; - struct disk_accounting_pos acct_compression_key = { - .type = BCH_DISK_ACCOUNTING_compression, - }; + unsigned cur_compression_type = 0; u64 compression_acct[3] = { 1, 0, 0 }; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { @@ -762,13 +785,13 @@ static int __trigger_extent(struct btree_trans *trans, acc_replicas_key.replicas.nr_required = 0; } - if (acct_compression_key.compression.type && - acct_compression_key.compression.type != p.crc.compression_type) { + if (cur_compression_type && + cur_compression_type != p.crc.compression_type) { if (flags & BTREE_TRIGGER_overwrite) bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); - ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct, - ARRAY_SIZE(compression_acct), gc); + ret = bch2_disk_accounting_mod2(trans, gc, compression_acct, + compression, cur_compression_type); if (ret) return ret; @@ -777,7 +800,7 @@ static int
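The designated-initializer to memset() conversions for struct disk_accounting_pos above are the kind of change made when a struct is consumed as raw bytes (disk_accounting_pos_to_bpos() appears later in this diff): an initializer leaves padding bytes unspecified, while memset() zeroes everything. A standalone userspace illustration, assuming a struct with internal padding:

#include <stdio.h>
#include <string.h>

struct key { char type; long payload; };	/* padding bytes after 'type' */

int main(void)
{
	struct key a = { .type = 1 };	/* padding contents unspecified */
	struct key b;

	memset(&b, 0, sizeof(b));	/* every byte zeroed, padding included */
	b.type = 1;

	/* May print 0: equal members don't guarantee equal raw bytes */
	printf("raw bytes equal: %d\n", !memcmp(&a, &b, sizeof(a)));
	return 0;
}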
__trigger_extent(struct btree_trans *trans, compression_acct[2] = 0; } - acct_compression_key.compression.type = p.crc.compression_type; + cur_compression_type = p.crc.compression_type; if (p.crc.compression_type) { compression_acct[1] += p.crc.uncompressed_size; compression_acct[2] += p.crc.compressed_size; @@ -791,45 +814,34 @@ static int __trigger_extent(struct btree_trans *trans, } if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) { - struct disk_accounting_pos acc_snapshot_key = { - .type = BCH_DISK_ACCOUNTING_snapshot, - .snapshot.id = k.k->p.snapshot, - }; - ret = bch2_disk_accounting_mod(trans, &acc_snapshot_key, replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, snapshot, k.k->p.snapshot); if (ret) return ret; } - if (acct_compression_key.compression.type) { + if (cur_compression_type) { if (flags & BTREE_TRIGGER_overwrite) bch2_u64s_neg(compression_acct, ARRAY_SIZE(compression_acct)); - ret = bch2_disk_accounting_mod(trans, &acct_compression_key, compression_acct, - ARRAY_SIZE(compression_acct), gc); + ret = bch2_disk_accounting_mod2(trans, gc, compression_acct, + compression, cur_compression_type); if (ret) return ret; } if (level) { - struct disk_accounting_pos acc_btree_key = { - .type = BCH_DISK_ACCOUNTING_btree, - .btree.id = btree_id, - }; - ret = bch2_disk_accounting_mod(trans, &acc_btree_key, replicas_sectors, 1, gc); + ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, btree, btree_id); if (ret) return ret; } else { bool insert = !(flags & BTREE_TRIGGER_overwrite); - struct disk_accounting_pos acc_inum_key = { - .type = BCH_DISK_ACCOUNTING_inum, - .inum.inum = k.k->p.inode, - }; + s64 v[3] = { insert ? 1 : -1, insert ? k.k->size : -((s64) k.k->size), *replicas_sectors, }; - ret = bch2_disk_accounting_mod(trans, &acc_inum_key, v, ARRAY_SIZE(v), gc); + ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode); if (ret) return ret; } @@ -878,15 +890,15 @@ int bch2_trigger_extent(struct btree_trans *trans, } int need_rebalance_delta = 0; - s64 need_rebalance_sectors_delta = 0; + s64 need_rebalance_sectors_delta[1] = { 0 }; s64 s = bch2_bkey_sectors_need_rebalance(c, old); need_rebalance_delta -= s != 0; - need_rebalance_sectors_delta -= s; + need_rebalance_sectors_delta[0] -= s; s = bch2_bkey_sectors_need_rebalance(c, new.s_c); need_rebalance_delta += s != 0; - need_rebalance_sectors_delta += s; + need_rebalance_sectors_delta[0] += s; if ((flags & BTREE_TRIGGER_transactional) && need_rebalance_delta) { int ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, @@ -895,12 +907,9 @@ int bch2_trigger_extent(struct btree_trans *trans, return ret; } - if (need_rebalance_sectors_delta) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_rebalance_work, - }; - int ret = bch2_disk_accounting_mod(trans, &acc, &need_rebalance_sectors_delta, 1, - flags & BTREE_TRIGGER_gc); + if (need_rebalance_sectors_delta[0]) { + int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, + need_rebalance_sectors_delta, rebalance_work); if (ret) return ret; } @@ -916,17 +925,13 @@ static int __trigger_reservation(struct btree_trans *trans, enum btree_iter_update_trigger_flags flags) { if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { - s64 sectors = k.k->size; + s64 sectors[1] = { k.k->size }; if (flags & BTREE_TRIGGER_overwrite) - sectors = -sectors; - - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_persistent_reserved, - 
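bch2_disk_accounting_mod2() above, and disk_accounting_key_init() used later in this diff (in the chardev.c hunk), presumably hide the memset-plus-assign boilerplate behind the counter-type name. A plausible shape for the key-init macro, inferred from the call sites and not taken from the patch:

/* Inferred sketch only; the real macro may differ.  Zero the whole key
 * (padding included), set the type tag, then initialize the union member
 * named after the counter type. */
#define disk_accounting_key_init_sketch(_k, _type, ...)			\
do {									\
	memset(&(_k), 0, sizeof(_k));					\
	(_k).type = BCH_DISK_ACCOUNTING_##_type;			\
	(_k)._type = (typeof((_k)._type)) { __VA_ARGS__ };		\
} while (0)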
.persistent_reserved.nr_replicas = bkey_s_c_to_reservation(k).v->nr_replicas, - }; + sectors[0] = -sectors[0]; - return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, flags & BTREE_TRIGGER_gc); + return bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, sectors, + persistent_reserved, bkey_s_c_to_reservation(k).v->nr_replicas); } return 0; @@ -957,14 +962,23 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, return PTR_ERR(a); if (a->v.data_type && type && a->v.data_type != type) { - bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_allocations); - log_fsck_err(trans, bucket_metadata_type_mismatch, - "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" - "while marking %s", - iter.pos.inode, iter.pos.offset, a->v.gen, - bch2_data_type_str(a->v.data_type), - bch2_data_type_str(type), - bch2_data_type_str(type)); + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "bucket %llu:%llu gen %u different types of data in same bucket: %s, %s\n" + "while marking %s\n", + iter.pos.inode, iter.pos.offset, a->v.gen, + bch2_data_type_str(a->v.data_type), + bch2_data_type_str(type), + bch2_data_type_str(type)); + + bool print = bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf); + + bch2_run_explicit_recovery_pass(c, &buf, + BCH_RECOVERY_PASS_check_allocations, 0); + + if (print) + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); ret = -BCH_ERR_metadata_bucket_inconsistency; goto err; } @@ -976,7 +990,6 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, ret = bch2_trans_update(trans, &iter, &a->k_i, 0); } err: -fsck_err: bch2_trans_iter_exit(trans, &iter); return ret; } @@ -1134,10 +1147,10 @@ int bch2_trans_mark_dev_sb(struct bch_fs *c, struct bch_dev *ca, int bch2_trans_mark_dev_sbs_flags(struct bch_fs *c, enum btree_iter_update_trigger_flags flags) { - for_each_online_member(c, ca) { + for_each_online_member(c, ca, BCH_DEV_READ_REF_trans_mark_dev_sbs) { int ret = bch2_trans_mark_dev_sb(c, ca, flags); if (ret) { - percpu_ref_put(&ca->io_ref); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_trans_mark_dev_sbs); return ret; } } @@ -1305,15 +1318,18 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) old_bucket_gens = rcu_dereference_protected(ca->bucket_gens, 1); if (resize) { - bucket_gens->nbuckets = min(bucket_gens->nbuckets, - old_bucket_gens->nbuckets); - bucket_gens->nbuckets_minus_first = - bucket_gens->nbuckets - bucket_gens->first_bucket; + u64 copy = min(bucket_gens->nbuckets, + old_bucket_gens->nbuckets); memcpy(bucket_gens->b, old_bucket_gens->b, - bucket_gens->nbuckets); + sizeof(bucket_gens->b[0]) * copy); } + ret = bch2_bucket_bitmap_resize(&ca->bucket_backpointer_mismatch, + ca->mi.nbuckets, nbuckets) ?: + bch2_bucket_bitmap_resize(&ca->bucket_backpointer_empty, + ca->mi.nbuckets, nbuckets); + rcu_assign_pointer(ca->bucket_gens, bucket_gens); bucket_gens = old_bucket_gens; @@ -1336,7 +1352,7 @@ void bch2_dev_buckets_free(struct bch_dev *ca) int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) { - ca->usage = alloc_percpu(struct bch_dev_usage); + ca->usage = alloc_percpu(struct bch_dev_usage_full); if (!ca->usage) return -BCH_ERR_ENOMEM_usage_init; diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index a9acdd6c0c86..af1532de4a37 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -39,38 +39,12 @@ static inline u64 sector_to_bucket_and_offset(const struct bch_dev *ca, sector_t
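The bucket_gens resize fix above changes the copy rule: copy min(old, new) entries, scaled by the element size, and leave the new nbuckets alone, instead of clamping nbuckets down to the old size. The same rule in a self-contained userspace sketch:

#include <stdlib.h>
#include <string.h>

/* Resize an array of old_n elements (each esz bytes) to new_n elements:
 * the new buffer keeps its full capacity, and only min(old_n, new_n)
 * elements are copied over. */
static void *resize_copy(const void *old, size_t old_n, size_t new_n, size_t esz)
{
	void *dst = calloc(new_n, esz);

	if (dst && old) {
		size_t copy = old_n < new_n ? old_n : new_n;

		memcpy(dst, old, copy * esz);
	}
	return dst;
}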
for (_b = (_buckets)->b + (_buckets)->first_bucket; \ _b < (_buckets)->b + (_buckets)->nbuckets; _b++) -/* - * Ugly hack alert: - * - * We need to cram a spinlock in a single byte, because that's what we have left - * in struct bucket, and we care about the size of these - during fsck, we need - * in memory state for every single bucket on every device. - * - * We used to do - * while (xchg(&b->lock, 1)) cpu_relax(); - * but, it turns out not all architectures support xchg on a single byte. - * - * So now we use bit_spin_lock(), with fun games since we can't burn a whole - * ulong for this - we just need to make sure the lock bit always ends up in the - * first byte. - */ - -#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ -#define BUCKET_LOCK_BITNR 0 -#else -#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1) -#endif - -union ulong_byte_assert { - ulong ulong; - u8 byte; -}; - static inline void bucket_unlock(struct bucket *b) { BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &b->lock); + smp_mb__after_atomic(); wake_up_bit((void *) &b->lock, BUCKET_LOCK_BITNR); } @@ -167,9 +141,7 @@ static inline int gen_cmp(u8 a, u8 b) static inline int gen_after(u8 a, u8 b) { - int r = gen_cmp(a, b); - - return r > 0 ? r : 0; + return max(0, gen_cmp(a, b)); } static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ptr *ptr) @@ -201,7 +173,16 @@ static inline struct bch_dev_usage bch2_dev_usage_read(struct bch_dev *ca) return ret; } -void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage *); +void bch2_dev_usage_full_read_fast(struct bch_dev *, struct bch_dev_usage_full *); +static inline struct bch_dev_usage_full bch2_dev_usage_full_read(struct bch_dev *ca) +{ + struct bch_dev_usage_full ret; + + bch2_dev_usage_full_read_fast(ca, &ret); + return ret; +} + +void bch2_dev_usage_to_text(struct printbuf *, struct bch_dev *, struct bch_dev_usage_full *); static inline u64 bch2_dev_buckets_reserved(struct bch_dev *ca, enum bch_watermark watermark) { @@ -236,7 +217,7 @@ static inline u64 dev_buckets_free(struct bch_dev *ca, enum bch_watermark watermark) { return max_t(s64, 0, - usage.d[BCH_DATA_free].buckets - + usage.buckets[BCH_DATA_free] - ca->nr_open_buckets - bch2_dev_buckets_reserved(ca, watermark)); } @@ -246,10 +227,10 @@ static inline u64 __dev_buckets_available(struct bch_dev *ca, enum bch_watermark watermark) { return max_t(s64, 0, - usage.d[BCH_DATA_free].buckets - + usage.d[BCH_DATA_cached].buckets - + usage.d[BCH_DATA_need_gc_gens].buckets - + usage.d[BCH_DATA_need_discard].buckets + usage.buckets[BCH_DATA_free] + + usage.buckets[BCH_DATA_cached] + + usage.buckets[BCH_DATA_need_gc_gens] + + usage.buckets[BCH_DATA_need_discard] - ca->nr_open_buckets - bch2_dev_buckets_reserved(ca, watermark)); } @@ -262,11 +243,6 @@ static inline u64 dev_buckets_available(struct bch_dev *ca, /* Filesystem usage: */ -static inline unsigned dev_usage_u64s(void) -{ - return sizeof(struct bch_dev_usage) / sizeof(u64); -} - struct bch_fs_usage_short bch2_fs_usage_read_short(struct bch_fs *); diff --git a/fs/bcachefs/buckets_types.h b/fs/bcachefs/buckets_types.h index 7174047b8e92..0aed2500ade3 100644 --- a/fs/bcachefs/buckets_types.h +++ b/fs/bcachefs/buckets_types.h @@ -7,6 +7,33 @@ #define BUCKET_JOURNAL_SEQ_BITS 16 +/* + * Ugly hack alert: + * + * We need to cram a spinlock in a single byte, because that's what we have left + * in struct bucket, and we care about the size of these - during
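gen_after() above becomes max(0, gen_cmp(a, b)); the point of these helpers is that bucket generations are 8-bit counters that wrap. Assuming the usual modular comparison for gen_cmp() (its body is outside this hunk), a runnable demonstration:

#include <stdio.h>

/* Assumed body of gen_cmp(): modular 8-bit comparison via two's-complement
 * wrap.  The real definition is not visible in this hunk. */
static int gen_cmp(unsigned char a, unsigned char b)
{
	return (signed char)(a - b);
}

static int gen_after(unsigned char a, unsigned char b)
{
	int r = gen_cmp(a, b);

	return r > 0 ? r : 0;	/* i.e. max(0, gen_cmp(a, b)) */
}

int main(void)
{
	printf("%d\n", gen_after(1, 255));	/* 2: gen 1 is newer, counter wrapped */
	printf("%d\n", gen_after(255, 1));	/* 0: gen 255 is older */
	return 0;
}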
fsck, we need + in memory state for every single bucket on every device. + * + * We used to do + * while (xchg(&b->lock, 1)) cpu_relax(); + * but, it turns out not all architectures support xchg on a single byte. + * + * So now we use bit_spin_lock(), with fun games since we can't burn a whole + * ulong for this - we just need to make sure the lock bit always ends up in the + * first byte. + */ + +#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__ +#define BUCKET_LOCK_BITNR 0 +#else +#define BUCKET_LOCK_BITNR (BITS_PER_LONG - 1) +#endif + +union ulong_byte_assert { + ulong ulong; + u8 byte; +}; + struct bucket { u8 lock; u8 gen_valid:1; @@ -27,7 +54,12 @@ struct bucket_gens { u8 b[] __counted_by(nbuckets); }; +/* Only info on bucket counts: */ struct bch_dev_usage { + u64 buckets[BCH_DATA_NR]; +}; + +struct bch_dev_usage_full { struct bch_dev_usage_type { u64 buckets; u64 sectors; /* _compressed_ sectors: */ diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 46e9e32105a9..4066946b26bc 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -11,6 +11,7 @@ #include "move.h" #include "recovery_passes.h" #include "replicas.h" +#include "sb-counters.h" #include "super-io.h" #include "thread_with_file.h" @@ -312,7 +313,12 @@ static int bch2_data_thread(void *arg) struct bch_data_ctx *ctx = container_of(arg, struct bch_data_ctx, thr); ctx->thr.ret = bch2_data_job(ctx->c, &ctx->stats, ctx->arg); - ctx->stats.data_type = U8_MAX; + if (ctx->thr.ret == -BCH_ERR_device_offline) + ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_device_offline; + else { + ctx->stats.ret = BCH_IOCTL_DATA_EVENT_RET_done; + ctx->stats.data_type = (int) DATA_PROGRESS_DATA_TYPE_done; + } return 0; } @@ -331,14 +337,30 @@ static ssize_t bch2_data_job_read(struct file *file, char __user *buf, struct bch_data_ctx *ctx = container_of(file->private_data, struct bch_data_ctx, thr); struct bch_fs *c = ctx->c; struct bch_ioctl_data_event e = { - .type = BCH_DATA_EVENT_PROGRESS, - .p.data_type = ctx->stats.data_type, - .p.btree_id = ctx->stats.pos.btree, - .p.pos = ctx->stats.pos.pos, - .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), - .p.sectors_total = bch2_fs_usage_read_short(c).used, + .type = BCH_DATA_EVENT_PROGRESS, + .ret = ctx->stats.ret, + .p.data_type = ctx->stats.data_type, + .p.btree_id = ctx->stats.pos.btree, + .p.pos = ctx->stats.pos.pos, + .p.sectors_done = atomic64_read(&ctx->stats.sectors_seen), + .p.sectors_error_corrected = atomic64_read(&ctx->stats.sectors_error_corrected), + .p.sectors_error_uncorrected = atomic64_read(&ctx->stats.sectors_error_uncorrected), }; + if (ctx->arg.op == BCH_DATA_OP_scrub) { + struct bch_dev *ca = bch2_dev_tryget(c, ctx->arg.scrub.dev); + if (ca) { + struct bch_dev_usage_full u; + bch2_dev_usage_full_read_fast(ca, &u); + for (unsigned i = BCH_DATA_btree; i < ARRAY_SIZE(u.d); i++) + if (ctx->arg.scrub.data_types & BIT(i)) + e.p.sectors_total += u.d[i].sectors; + bch2_dev_put(ca); + } + } else { + e.p.sectors_total = bch2_fs_usage_read_short(c).used; + } + if (len < sizeof(e)) return -EINVAL; @@ -404,10 +426,8 @@ static long bch2_ioctl_fs_usage(struct bch_fs *c, arg.replica_entries_bytes = replicas.nr; for (unsigned i = 0; i < BCH_REPLICAS_MAX; i++) { - struct disk_accounting_pos k = { - .type = BCH_DISK_ACCOUNTING_persistent_reserved, - .persistent_reserved.nr_replicas = i, - }; + struct disk_accounting_pos k; + disk_accounting_key_init(k, persistent_reserved, .nr_replicas = i); bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&k), @@ -453,7 +473,7 @@
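The relocated BUCKET_LOCK_BITNR comment above is about making bit_spin_lock()'s bit land in struct bucket's single-byte lock field; the union ulong_byte_assert BUILD_BUG_ON is exactly that check. The endian arithmetic can be verified in userspace:

#include <stdio.h>

#if __BYTE_ORDER__ == __ORDER_LITTLE_ENDIAN__
#define BUCKET_LOCK_BITNR 0
#else
#define BUCKET_LOCK_BITNR (sizeof(long) * 8 - 1)
#endif

union ulong_byte_assert {
	unsigned long ulong;
	unsigned char byte;
};

int main(void)
{
	union ulong_byte_assert u = { .ulong = 1UL << BUCKET_LOCK_BITNR };

	/* Nonzero on both endiannesses: bit 0 on little-endian, the top bit
	 * on big-endian - either way the lock bit aliases the first byte. */
	printf("lock bit in first byte: %d\n", !!u.byte);
	return 0;
}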
static long bch2_ioctl_dev_usage(struct bch_fs *c, struct bch_ioctl_dev_usage __user *user_arg) { struct bch_ioctl_dev_usage arg; - struct bch_dev_usage src; + struct bch_dev_usage_full src; struct bch_dev *ca; unsigned i; @@ -473,7 +493,7 @@ static long bch2_ioctl_dev_usage(struct bch_fs *c, if (IS_ERR(ca)) return PTR_ERR(ca); - src = bch2_dev_usage_read(ca); + src = bch2_dev_usage_full_read(ca); arg.state = ca->mi.state; arg.bucket_size = ca->mi.bucket_size; @@ -494,7 +514,7 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c, struct bch_ioctl_dev_usage_v2 __user *user_arg) { struct bch_ioctl_dev_usage_v2 arg; - struct bch_dev_usage src; + struct bch_dev_usage_full src; struct bch_dev *ca; int ret = 0; @@ -514,7 +534,7 @@ static long bch2_ioctl_dev_usage_v2(struct bch_fs *c, if (IS_ERR(ca)) return PTR_ERR(ca); - src = bch2_dev_usage_read(ca); + src = bch2_dev_usage_full_read(ca); arg.state = ca->mi.state; arg.bucket_size = ca->mi.bucket_size; @@ -593,11 +613,13 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, if (!dev) return -EINVAL; - for_each_online_member(c, ca) + rcu_read_lock(); + for_each_online_member_rcu(c, ca) if (ca->dev == dev) { - percpu_ref_put(&ca->io_ref); + rcu_read_unlock(); return ca->dev_idx; } + rcu_read_unlock(); return -BCH_ERR_ENOENT_dev_idx_not_found; } @@ -710,6 +732,8 @@ long bch2_fs_ioctl(struct bch_fs *c, unsigned cmd, void __user *arg) BCH_IOCTL(fsck_online, struct bch_ioctl_fsck_online); case BCH_IOCTL_QUERY_ACCOUNTING: return bch2_ioctl_query_accounting(c, arg); + case BCH_IOCTL_QUERY_COUNTERS: + return bch2_ioctl_query_counters(c, arg); default: return -ENOTTY; } diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index 23a383577d4c..d3e2e4f776c6 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -7,17 +7,12 @@ #include "super-io.h" #include <linux/crc32c.h> -#include <linux/crypto.h> #include <linux/xxhash.h> #include <linux/key.h> #include <linux/random.h> #include <linux/ratelimit.h> -#include <linux/scatterlist.h> -#include <crypto/algapi.h> #include <crypto/chacha.h> -#include <crypto/hash.h> #include <crypto/poly1305.h> -#include <crypto/skcipher.h> #include <keys/user-type.h> /* @@ -96,116 +91,40 @@ static void bch2_checksum_update(struct bch2_checksum_state *state, const void * } } -static inline int do_encrypt_sg(struct crypto_sync_skcipher *tfm, - struct nonce nonce, - struct scatterlist *sg, size_t len) +static void bch2_chacha20_init(struct chacha_state *state, + const struct bch_key *key, struct nonce nonce) { - SYNC_SKCIPHER_REQUEST_ON_STACK(req, tfm); + u32 key_words[CHACHA_KEY_SIZE / sizeof(u32)]; - skcipher_request_set_sync_tfm(req, tfm); - skcipher_request_set_callback(req, 0, NULL, NULL); - skcipher_request_set_crypt(req, sg, sg, len, nonce.d); + BUILD_BUG_ON(sizeof(key_words) != sizeof(*key)); + memcpy(key_words, key, sizeof(key_words)); + le32_to_cpu_array(key_words, ARRAY_SIZE(key_words)); - int ret = crypto_skcipher_encrypt(req); - if (ret) - pr_err("got error %i from crypto_skcipher_encrypt()", ret); - - return ret; -} - -static inline int do_encrypt(struct crypto_sync_skcipher *tfm, - struct nonce nonce, - void *buf, size_t len) -{ - if (!is_vmalloc_addr(buf)) { - struct scatterlist sg = {}; - - sg_mark_end(&sg); - sg_set_page(&sg, virt_to_page(buf), len, offset_in_page(buf)); - return do_encrypt_sg(tfm, nonce, &sg, len); - } else { - DARRAY_PREALLOCATED(struct scatterlist, 4) sgl; - size_t sgl_len = 0; - int ret; - - darray_init(&sgl); - - while (len) { - unsigned offset = 
offset_in_page(buf); - struct scatterlist sg = { - .page_link = (unsigned long) vmalloc_to_page(buf), - .offset = offset, - .length = min(len, PAGE_SIZE - offset), - }; - - if (darray_push(&sgl, sg)) { - sg_mark_end(&darray_last(sgl)); - ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len); - if (ret) - goto err; - - nonce = nonce_add(nonce, sgl_len); - sgl_len = 0; - sgl.nr = 0; - BUG_ON(darray_push(&sgl, sg)); - } - - buf += sg.length; - len -= sg.length; - sgl_len += sg.length; - } + BUILD_BUG_ON(sizeof(nonce) != CHACHA_IV_SIZE); + chacha_init(state, key_words, (const u8 *)nonce.d); - sg_mark_end(&darray_last(sgl)); - ret = do_encrypt_sg(tfm, nonce, sgl.data, sgl_len); -err: - darray_exit(&sgl); - return ret; - } + memzero_explicit(key_words, sizeof(key_words)); } -int bch2_chacha_encrypt_key(struct bch_key *key, struct nonce nonce, - void *buf, size_t len) +void bch2_chacha20(const struct bch_key *key, struct nonce nonce, + void *data, size_t len) { - struct crypto_sync_skcipher *chacha20 = - crypto_alloc_sync_skcipher("chacha20", 0, 0); - int ret; - - ret = PTR_ERR_OR_ZERO(chacha20); - if (ret) { - pr_err("error requesting chacha20 cipher: %s", bch2_err_str(ret)); - return ret; - } - - ret = crypto_skcipher_setkey(&chacha20->base, - (void *) key, sizeof(*key)); - if (ret) { - pr_err("error from crypto_skcipher_setkey(): %s", bch2_err_str(ret)); - goto err; - } + struct chacha_state state; - ret = do_encrypt(chacha20, nonce, buf, len); -err: - crypto_free_sync_skcipher(chacha20); - return ret; + bch2_chacha20_init(&state, key, nonce); + chacha20_crypt(&state, data, data, len); + chacha_zeroize_state(&state); } -static int gen_poly_key(struct bch_fs *c, struct shash_desc *desc, - struct nonce nonce) +static void bch2_poly1305_init(struct poly1305_desc_ctx *desc, + struct bch_fs *c, struct nonce nonce) { - u8 key[POLY1305_KEY_SIZE]; - int ret; + u8 key[POLY1305_KEY_SIZE] = { 0 }; nonce.d[3] ^= BCH_NONCE_POLY; - memset(key, 0, sizeof(key)); - ret = do_encrypt(c->chacha20, nonce, key, sizeof(key)); - if (ret) - return ret; - - desc->tfm = c->poly1305; - crypto_shash_init(desc); - crypto_shash_update(desc, key, sizeof(key)); - return 0; + bch2_chacha20(&c->chacha20_key, nonce, key, sizeof(key)); + poly1305_init(desc, key); } struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, @@ -230,14 +149,13 @@ struct bch_csum bch2_checksum(struct bch_fs *c, unsigned type, case BCH_CSUM_chacha20_poly1305_80: case BCH_CSUM_chacha20_poly1305_128: { - SHASH_DESC_ON_STACK(desc, c->poly1305); + struct poly1305_desc_ctx dctx; u8 digest[POLY1305_DIGEST_SIZE]; struct bch_csum ret = { 0 }; - gen_poly_key(c, desc, nonce); - - crypto_shash_update(desc, data, len); - crypto_shash_final(desc, digest); + bch2_poly1305_init(&dctx, c, nonce); + poly1305_update(&dctx, data, len); + poly1305_final(&dctx, digest); memcpy(&ret, digest, bch_crc_bytes[type]); return ret; @@ -253,11 +171,12 @@ int bch2_encrypt(struct bch_fs *c, unsigned type, if (!bch2_csum_type_is_encryption(type)) return 0; - if (bch2_fs_inconsistent_on(!c->chacha20, + if (bch2_fs_inconsistent_on(!c->chacha20_key_set, c, "attempting to encrypt without encryption key")) return -BCH_ERR_no_encryption_key; - return do_encrypt(c->chacha20, nonce, data, len); + bch2_chacha20(&c->chacha20_key, nonce, data, len); + return 0; } static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, @@ -296,26 +215,26 @@ static struct bch_csum __bch2_checksum_bio(struct bch_fs *c, unsigned type, case BCH_CSUM_chacha20_poly1305_80: case 
BCH_CSUM_chacha20_poly1305_128: { - SHASH_DESC_ON_STACK(desc, c->poly1305); + struct poly1305_desc_ctx dctx; u8 digest[POLY1305_DIGEST_SIZE]; struct bch_csum ret = { 0 }; - gen_poly_key(c, desc, nonce); + bch2_poly1305_init(&dctx, c, nonce); #ifdef CONFIG_HIGHMEM __bio_for_each_segment(bv, bio, *iter, *iter) { void *p = kmap_local_page(bv.bv_page) + bv.bv_offset; - crypto_shash_update(desc, p, bv.bv_len); + poly1305_update(&dctx, p, bv.bv_len); kunmap_local(p); } #else __bio_for_each_bvec(bv, bio, *iter, *iter) - crypto_shash_update(desc, + poly1305_update(&dctx, page_address(bv.bv_page) + bv.bv_offset, bv.bv_len); #endif - crypto_shash_final(desc, digest); + poly1305_final(&dctx, digest); memcpy(&ret, digest, bch_crc_bytes[type]); return ret; @@ -338,43 +257,33 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, { struct bio_vec bv; struct bvec_iter iter; - DARRAY_PREALLOCATED(struct scatterlist, 4) sgl; - size_t sgl_len = 0; + struct chacha_state chacha_state; int ret = 0; - if (bch2_fs_inconsistent_on(!c->chacha20, + if (bch2_fs_inconsistent_on(!c->chacha20_key_set, c, "attempting to encrypt without encryption key")) return -BCH_ERR_no_encryption_key; - darray_init(&sgl); + bch2_chacha20_init(&chacha_state, &c->chacha20_key, nonce); bio_for_each_segment(bv, bio, iter) { - struct scatterlist sg = { - .page_link = (unsigned long) bv.bv_page, - .offset = bv.bv_offset, - .length = bv.bv_len, - }; - - if (darray_push(&sgl, sg)) { - sg_mark_end(&darray_last(sgl)); - ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len); - if (ret) - goto err; - - nonce = nonce_add(nonce, sgl_len); - sgl_len = 0; - sgl.nr = 0; - - BUG_ON(darray_push(&sgl, sg)); + void *p; + + /* + * chacha_crypt() assumes that the length is a multiple of + * CHACHA_BLOCK_SIZE on any non-final call. 
+ */ + if (!IS_ALIGNED(bv.bv_len, CHACHA_BLOCK_SIZE)) { + bch_err_ratelimited(c, "bio not aligned for encryption"); + ret = -EIO; + break; } - sgl_len += sg.length; + p = bvec_kmap_local(&bv); + chacha20_crypt(&chacha_state, p, p, bv.bv_len); + kunmap_local(p); } - - sg_mark_end(&darray_last(sgl)); - ret = do_encrypt_sg(c->chacha20, nonce, sgl.data, sgl_len); -err: - darray_exit(&sgl); + chacha_zeroize_state(&chacha_state); return ret; } @@ -466,7 +375,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, prt_str(&buf, ")"); WARN_RATELIMIT(1, "%s", buf.buf); printbuf_exit(&buf); - return -EIO; + return -BCH_ERR_recompute_checksum; } for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { @@ -650,10 +559,7 @@ int bch2_decrypt_sb_key(struct bch_fs *c, } /* decrypt real key: */ - ret = bch2_chacha_encrypt_key(&user_key, bch2_sb_key_nonce(c), - &sb_key, sizeof(sb_key)); - if (ret) - goto err; + bch2_chacha20(&user_key, bch2_sb_key_nonce(c), &sb_key, sizeof(sb_key)); if (bch2_key_is_encrypted(&sb_key)) { bch_err(c, "incorrect encryption key"); @@ -668,31 +574,14 @@ err: return ret; } -static int bch2_alloc_ciphers(struct bch_fs *c) -{ - if (c->chacha20) - return 0; - - struct crypto_sync_skcipher *chacha20 = crypto_alloc_sync_skcipher("chacha20", 0, 0); - int ret = PTR_ERR_OR_ZERO(chacha20); - if (ret) { - bch_err(c, "error requesting chacha20 module: %s", bch2_err_str(ret)); - return ret; - } - - struct crypto_shash *poly1305 = crypto_alloc_shash("poly1305", 0, 0); - ret = PTR_ERR_OR_ZERO(poly1305); - if (ret) { - bch_err(c, "error requesting poly1305 module: %s", bch2_err_str(ret)); - crypto_free_sync_skcipher(chacha20); - return ret; - } - - c->chacha20 = chacha20; - c->poly1305 = poly1305; - return 0; -} +#if 0 +/* + * This seems to be duplicating code in cmd_remove_passphrase() in + * bcachefs-tools, but we might want to switch userspace to use this - and + * perhaps add an ioctl for calling this at runtime, so we can take the + * passphrase off of a mounted filesystem (which has come up). 
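The alignment check added to __bch2_encrypt_bio() above exists because a ChaCha20 state advances in whole 64-byte keystream blocks: a non-final chacha20_crypt() call with a partial block leaves the rest of that block's keystream unused, so chunked output would diverge from one-shot output. A toy userspace model of the effect (not real ChaCha, just a position-indexed keystream with the same block-consumption rule):

#include <stdio.h>
#include <string.h>

#define BLOCK 64	/* stands in for CHACHA_BLOCK_SIZE */

struct toy_state { unsigned long block; };

/* Toy stream cipher: keystream byte at absolute position p is (p & 0xff).
 * Like the kernel's chacha20_crypt(), each call consumes whole keystream
 * blocks, so a non-final call with a partial block skips the block's tail. */
static void toy_crypt(struct toy_state *s, unsigned char *buf, size_t len)
{
	for (size_t i = 0; i < len; i++)
		buf[i] ^= (unsigned char)((s->block * BLOCK + i) & 0xff);
	s->block += (len + BLOCK - 1) / BLOCK;	/* partial block consumed whole */
}

int main(void)
{
	unsigned char a[100] = {0}, b[100] = {0};
	struct toy_state s1 = {0}, s2 = {0};

	toy_crypt(&s1, a, 100);		/* one shot */
	toy_crypt(&s2, b, 30);		/* 30 % 64 != 0 on a non-final call... */
	toy_crypt(&s2, b + 30, 70);	/* ...so the streams diverge here */

	printf("outputs equal: %d\n", !memcmp(a, b, sizeof(a)));	/* 0 */
	return 0;
}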
+ */ int bch2_disable_encryption(struct bch_fs *c) { struct bch_sb_field_crypt *crypt; @@ -725,6 +614,10 @@ out: return ret; } +/* + * For enabling encryption on an existing filesystem: not hooked up yet, but it + * should be + */ int bch2_enable_encryption(struct bch_fs *c, bool keyed) { struct bch_encrypted_key key; @@ -781,48 +674,25 @@ err: memzero_explicit(&key, sizeof(key)); return ret; } +#endif void bch2_fs_encryption_exit(struct bch_fs *c) { - if (c->poly1305) - crypto_free_shash(c->poly1305); - if (c->chacha20) - crypto_free_sync_skcipher(c->chacha20); - if (c->sha256) - crypto_free_shash(c->sha256); + memzero_explicit(&c->chacha20_key, sizeof(c->chacha20_key)); } int bch2_fs_encryption_init(struct bch_fs *c) { struct bch_sb_field_crypt *crypt; - struct bch_key key; - int ret = 0; - - c->sha256 = crypto_alloc_shash("sha256", 0, 0); - ret = PTR_ERR_OR_ZERO(c->sha256); - if (ret) { - c->sha256 = NULL; - bch_err(c, "error requesting sha256 module: %s", bch2_err_str(ret)); - goto out; - } + int ret; crypt = bch2_sb_field_get(c->disk_sb.sb, crypt); if (!crypt) - goto out; - - ret = bch2_alloc_ciphers(c); - if (ret) - goto out; - - ret = bch2_decrypt_sb_key(c, crypt, &key); - if (ret) - goto out; + return 0; - ret = crypto_skcipher_setkey(&c->chacha20->base, - (void *) &key.key, sizeof(key.key)); + ret = bch2_decrypt_sb_key(c, crypt, &c->chacha20_key); if (ret) - goto out; -out: - memzero_explicit(&key, sizeof(key)); - return ret; + return ret; + c->chacha20_key_set = true; + return 0; } diff --git a/fs/bcachefs/checksum.h b/fs/bcachefs/checksum.h index 43b9d71f2f2b..7bd9cf6104ca 100644 --- a/fs/bcachefs/checksum.h +++ b/fs/bcachefs/checksum.h @@ -69,7 +69,8 @@ static inline void bch2_csum_err_msg(struct printbuf *out, bch2_csum_to_text(out, type, expected); } -int bch2_chacha_encrypt_key(struct bch_key *, struct nonce, void *, size_t); +void bch2_chacha20(const struct bch_key *, struct nonce, void *, size_t); + int bch2_request_key(struct bch_sb *, struct bch_key *); #ifndef __KERNEL__ int bch2_revoke_key(struct bch_sb *); @@ -103,8 +104,10 @@ extern const struct bch_sb_field_ops bch_sb_field_ops_crypt; int bch2_decrypt_sb_key(struct bch_fs *, struct bch_sb_field_crypt *, struct bch_key *); +#if 0 int bch2_disable_encryption(struct bch_fs *); int bch2_enable_encryption(struct bch_fs *, bool); +#endif void bch2_fs_encryption_exit(struct bch_fs *); int bch2_fs_encryption_init(struct bch_fs *); @@ -154,7 +157,7 @@ static inline bool bch2_checksum_type_valid(const struct bch_fs *c, if (type >= BCH_CSUM_NR) return false; - if (bch2_csum_type_is_encryption(type) && !c->chacha20) + if (bch2_csum_type_is_encryption(type) && !c->chacha20_key_set) return false; return true; diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index 1f8e035d7119..f57f9f4774e6 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -121,8 +121,8 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, } while (0); __set_current_state(TASK_RUNNING); - del_timer_sync(&wait.cpu_timer); - destroy_timer_on_stack(&wait.cpu_timer); + timer_delete_sync(&wait.cpu_timer); + timer_destroy_on_stack(&wait.cpu_timer); bch2_io_timer_del(clock, &wait.io_timer); } diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 114bf2f3879f..1bca61d17092 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -177,7 +177,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, size_t src_len = src->bi_iter.bi_size; size_t dst_len = crc.uncompressed_size << 9; void *workspace; - int ret; + int 
ret = 0, ret2; enum bch_compression_opts opt = bch2_compression_type_to_opt(crc.compression_type); mempool_t *workspace_pool = &c->compress_workspace[opt]; @@ -189,7 +189,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, else ret = -BCH_ERR_compression_workspace_not_initialized; if (ret) - goto out; + goto err; } src_data = bio_map_or_bounce(c, src, READ); @@ -197,10 +197,10 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, switch (crc.compression_type) { case BCH_COMPRESSION_TYPE_lz4_old: case BCH_COMPRESSION_TYPE_lz4: - ret = LZ4_decompress_safe_partial(src_data.b, dst_data, - src_len, dst_len, dst_len); - if (ret != dst_len) - goto err; + ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data, + src_len, dst_len, dst_len); + if (ret2 != dst_len) + ret = -BCH_ERR_decompress_lz4; break; case BCH_COMPRESSION_TYPE_gzip: { z_stream strm = { @@ -214,45 +214,43 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, zlib_set_workspace(&strm, workspace); zlib_inflateInit2(&strm, -MAX_WBITS); - ret = zlib_inflate(&strm, Z_FINISH); + ret2 = zlib_inflate(&strm, Z_FINISH); mempool_free(workspace, workspace_pool); - if (ret != Z_STREAM_END) - goto err; + if (ret2 != Z_STREAM_END) + ret = -BCH_ERR_decompress_gzip; break; } case BCH_COMPRESSION_TYPE_zstd: { ZSTD_DCtx *ctx; size_t real_src_len = le32_to_cpup(src_data.b); - if (real_src_len > src_len - 4) + if (real_src_len > src_len - 4) { + ret = -BCH_ERR_decompress_zstd_src_len_bad; goto err; + } workspace = mempool_alloc(workspace_pool, GFP_NOFS); ctx = zstd_init_dctx(workspace, zstd_dctx_workspace_bound()); - ret = zstd_decompress_dctx(ctx, + ret2 = zstd_decompress_dctx(ctx, dst_data, dst_len, src_data.b + 4, real_src_len); mempool_free(workspace, workspace_pool); - if (ret != dst_len) - goto err; + if (ret2 != dst_len) + ret = -BCH_ERR_decompress_zstd; break; } default: BUG(); } - ret = 0; +err: fsck_err: -out: bio_unmap_or_unbounce(c, src_data); return ret; -err: - ret = -EIO; - goto out; } int bch2_bio_uncompress_inplace(struct bch_write_op *op, @@ -268,27 +266,22 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op, BUG_ON(!bio->bi_vcnt); BUG_ON(DIV_ROUND_UP(crc->live_size, PAGE_SECTORS) > bio->bi_max_vecs); - if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max || - crc->compressed_size << 9 > c->opts.encoded_extent_max) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "error rewriting existing data: extent too big"); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - return -EIO; + if (crc->uncompressed_size << 9 > c->opts.encoded_extent_max) { + bch2_write_op_error(op, op->pos.offset, + "extent too big to decompress (%u > %u)", + crc->uncompressed_size << 9, c->opts.encoded_extent_max); + return -BCH_ERR_decompress_exceeded_max_encoded_extent; } data = __bounce_alloc(c, dst_len, WRITE); - if (__bio_uncompress(c, bio, data.b, *crc)) { - if (!c->opts.no_data_io) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "error rewriting existing data: decompression error"); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } - ret = -EIO; + ret = __bio_uncompress(c, bio, data.b, *crc); + + if (c->opts.no_data_io) + ret = 0; + + if (ret) { + bch2_write_op_error(op, op->pos.offset, "%s", bch2_err_str(ret)); goto err; } @@ -321,7 +314,7 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || crc.compressed_size << 9 > 
c->opts.encoded_extent_max) - return -EIO; + return -BCH_ERR_decompress_exceeded_max_encoded_extent; dst_data = dst_len == dst_iter.bi_size ? __bio_map_or_bounce(c, dst, dst_iter, WRITE) @@ -378,13 +371,14 @@ static int attempt_compress(struct bch_fs *c, }; zlib_set_workspace(&strm, workspace); - zlib_deflateInit2(&strm, + if (zlib_deflateInit2(&strm, compression.level ? clamp_t(unsigned, compression.level, Z_BEST_SPEED, Z_BEST_COMPRESSION) : Z_DEFAULT_COMPRESSION, Z_DEFLATED, -MAX_WBITS, DEF_MEM_LEVEL, - Z_DEFAULT_STRATEGY); + Z_DEFAULT_STRATEGY) != Z_OK) + return 0; if (zlib_deflate(&strm, Z_FINISH) != Z_STREAM_END) return 0; @@ -720,7 +714,7 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, ret = match_string(bch2_compression_opts, -1, type_str); if (ret < 0 && err) - prt_str(err, "invalid compression type"); + prt_printf(err, "invalid compression type\n"); if (ret < 0) goto err; @@ -735,7 +729,7 @@ int bch2_opt_compression_parse(struct bch_fs *c, const char *_val, u64 *res, if (!ret && level > 15) ret = -EINVAL; if (ret < 0 && err) - prt_str(err, "invalid compression level"); + prt_printf(err, "invalid compression level\n"); if (ret < 0) goto err; diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index c6151495985f..50ec3decfe8c 100644 --- a/fs/bcachefs/darray.h +++ b/fs/bcachefs/darray.h @@ -20,7 +20,18 @@ struct { \ #define DARRAY(_type) DARRAY_PREALLOCATED(_type, 0) typedef DARRAY(char) darray_char; -typedef DARRAY(char *) darray_str; +typedef DARRAY(char *) darray_str; +typedef DARRAY(const char *) darray_const_str; + +typedef DARRAY(u8) darray_u8; +typedef DARRAY(u16) darray_u16; +typedef DARRAY(u32) darray_u32; +typedef DARRAY(u64) darray_u64; + +typedef DARRAY(s8) darray_s8; +typedef DARRAY(s16) darray_s16; +typedef DARRAY(s32) darray_s32; +typedef DARRAY(s64) darray_s64; int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index 337494facac6..c34e5b88ba9d 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -20,6 +20,15 @@ #include "subvolume.h" #include "trace.h" +#include <linux/ioprio.h> + +static const char * const bch2_data_update_type_strs[] = { +#define x(t, n, ...) 
[n] = #t, + BCH_DATA_UPDATE_TYPES() +#undef x + NULL +}; + static void bkey_put_dev_refs(struct bch_fs *c, struct bkey_s_c k) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -33,7 +42,7 @@ static bool bkey_get_dev_refs(struct bch_fs *c, struct bkey_s_c k) struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr) { - if (!bch2_dev_tryget(c, ptr->dev)) { + if (unlikely(!bch2_dev_tryget(c, ptr->dev))) { bkey_for_each_ptr(ptrs, ptr2) { if (ptr2 == ptr) break; @@ -91,9 +100,10 @@ static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struc return true; } -static noinline void trace_move_extent_finish2(struct data_update *u, - struct bkey_i *new, - struct bkey_i *insert) +noinline_for_stack +static void trace_io_move_finish2(struct data_update *u, + struct bkey_i *new, + struct bkey_i *insert) { struct bch_fs *c = u->op.c; struct printbuf buf = PRINTBUF; @@ -111,11 +121,12 @@ static noinline void trace_move_extent_finish2(struct data_update *u, bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); prt_newline(&buf); - trace_move_extent_finish(c, buf.buf); + trace_io_move_finish(c, buf.buf); printbuf_exit(&buf); } -static void trace_move_extent_fail2(struct data_update *m, +noinline_for_stack +static void trace_io_move_fail2(struct data_update *m, struct bkey_s_c new, struct bkey_s_c wrote, struct bkey_i *insert, @@ -126,7 +137,7 @@ static void trace_move_extent_fail2(struct data_update *m, struct printbuf buf = PRINTBUF; unsigned rewrites_found = 0; - if (!trace_move_extent_fail_enabled()) + if (!trace_io_move_fail_enabled()) return; prt_str(&buf, msg); @@ -166,8 +177,76 @@ static void trace_move_extent_fail2(struct data_update *m, bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); } - trace_move_extent_fail(c, buf.buf); + trace_io_move_fail(c, buf.buf); + printbuf_exit(&buf); +} + +noinline_for_stack +static void trace_data_update2(struct data_update *m, + struct bkey_s_c old, struct bkey_s_c k, + struct bkey_i *insert) +{ + struct bch_fs *c = m->op.c; + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + prt_str(&buf, "\nk: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + + trace_data_update(c, buf.buf); + printbuf_exit(&buf); +} + +noinline_for_stack +static void trace_io_move_created_rebalance2(struct data_update *m, + struct bkey_s_c old, struct bkey_s_c k, + struct bkey_i *insert) +{ + struct bch_fs *c = m->op.c; + struct printbuf buf = PRINTBUF; + + bch2_data_update_opts_to_text(&buf, c, &m->op.opts, &m->data_opts); + + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + prt_str(&buf, "\nk: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + + trace_io_move_created_rebalance(c, buf.buf); + printbuf_exit(&buf); + + this_cpu_inc(c->counters[BCH_COUNTER_io_move_created_rebalance]); +} + +noinline_for_stack +static int data_update_invalid_bkey(struct data_update *m, + struct bkey_s_c old, struct bkey_s_c k, + struct bkey_i *insert) +{ + struct bch_fs *c = m->op.c; + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_str(&buf, "about to insert invalid key in data update path"); + prt_printf(&buf, "\nop.nonce: %u", m->op.nonce); + prt_str(&buf, "\nold: "); + bch2_bkey_val_to_text(&buf, c, old); + prt_str(&buf, "\nk: "); + bch2_bkey_val_to_text(&buf, c, k); + prt_str(&buf, "\nnew: "); + 
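bch2_data_update_type_strs[] at the top of the data_update.c diff is built with the usual x-macro string-table idiom; BCH_DATA_UPDATE_TYPES() itself is defined elsewhere, but the pattern is self-contained enough to demonstrate with a hypothetical list (EXAMPLE_TYPES() and its entries are made up for illustration):

#include <stdio.h>

/* Hypothetical x-macro list standing in for BCH_DATA_UPDATE_TYPES() */
#define EXAMPLE_TYPES()		\
	x(copygc,	0)	\
	x(rebalance,	1)	\
	x(promote,	2)

static const char * const example_type_strs[] = {
#define x(t, n, ...) [n] = #t,
	EXAMPLE_TYPES()
#undef x
	NULL
};

int main(void)
{
	for (unsigned i = 0; example_type_strs[i]; i++)
		printf("%u = %s\n", i, example_type_strs[i]);
	return 0;
}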
bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); + + bch2_fs_emergency_read_only2(c, &buf); + + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); + + return -BCH_ERR_invalid_bkey; } static int __bch2_data_update_index_update(struct btree_trans *trans, @@ -175,18 +254,11 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, { struct bch_fs *c = op->c; struct btree_iter iter; - struct data_update *m = - container_of(op, struct data_update, op); - struct keylist *keys = &op->insert_keys; - struct bkey_buf _new, _insert; + struct data_update *m = container_of(op, struct data_update, op); int ret = 0; - bch2_bkey_buf_init(&_new); - bch2_bkey_buf_init(&_insert); - bch2_bkey_buf_realloc(&_insert, c, U8_MAX); - bch2_trans_iter_init(trans, &iter, m->btree_id, - bkey_start_pos(&bch2_keylist_front(keys)->k), + bkey_start_pos(&bch2_keylist_front(&op->insert_keys)->k), BTREE_ITER_slots|BTREE_ITER_intent); while (1) { @@ -206,24 +278,35 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, bch2_trans_begin(trans); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) goto err; - new = bkey_i_to_extent(bch2_keylist_front(keys)); + new = bkey_i_to_extent(bch2_keylist_front(&op->insert_keys)); if (!bch2_extents_match(k, old)) { - trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), - NULL, "no match:"); + trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), + NULL, "no match:"); goto nowork; } - bkey_reassemble(_insert.k, k); - insert = _insert.k; + insert = bch2_trans_kmalloc(trans, + bkey_bytes(k.k) + + bkey_val_bytes(&new->k) + + sizeof(struct bch_extent_rebalance)); + ret = PTR_ERR_OR_ZERO(insert); + if (ret) + goto err; - bch2_bkey_buf_copy(&_new, c, bch2_keylist_front(keys)); - new = bkey_i_to_extent(_new.k); + bkey_reassemble(insert, k); + + new = bch2_trans_kmalloc(trans, bkey_bytes(&new->k)); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto err; + + bkey_copy(&new->k_i, bch2_keylist_front(&op->insert_keys)); bch2_cut_front(iter.pos, &new->k_i); bch2_cut_front(iter.pos, insert); @@ -254,7 +337,7 @@ static int __bch2_data_update_index_update(struct btree_trans *trans, if (m->data_opts.rewrite_ptrs && !rewrites_found && bch2_bkey_durability(c, k) >= m->op.opts.data_replicas) { - trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:"); + trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "no rewrites found:"); goto nowork; } @@ -271,7 +354,7 @@ restart_drop_conflicting_replicas: } if (!bkey_val_u64s(&new->k)) { - trace_move_extent_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:"); + trace_io_move_fail2(m, k, bkey_i_to_s_c(&new->k_i), insert, "new replicas conflicted:"); goto nowork; } @@ -336,67 +419,52 @@ restart_drop_extra_replicas: .btree = m->btree_id, .flags = BCH_VALIDATE_commit, }); - if (invalid) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "about to insert invalid key in data update path"); - prt_str(&buf, "\nold: "); - bch2_bkey_val_to_text(&buf, c, old); - prt_str(&buf, "\nk: "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - - bch2_print_string_as_lines(KERN_ERR, buf.buf); - printbuf_exit(&buf); - - bch2_fatal_error(c); - ret = -EIO; + if (unlikely(invalid)) { + ret = data_update_invalid_bkey(m, old, k, insert); goto out; } - if (trace_data_update_enabled()) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "\nold: "); - 
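/*
 * Aside: the PTR_ERR_OR_ZERO() checks on the bch2_trans_kmalloc() results
 * above rely on the kernel's error-pointer convention: the top 4095
 * values of the address space never back real allocations, so one pointer
 * can carry either a valid address or a negative errno.  A minimal
 * user-space model (demo_alloc() is a made-up example function):
 */
#include <stdio.h>
#include <stdlib.h>
#include <errno.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}
static inline int PTR_ERR_OR_ZERO(const void *ptr)
{
	return IS_ERR(ptr) ? (int)PTR_ERR(ptr) : 0;
}

static void *demo_alloc(size_t n)
{
	void *p = malloc(n);
	return p ? p : ERR_PTR(-ENOMEM);	/* encode failure in the pointer */
}

int main(void)
{
	void *p = demo_alloc(64);
	int ret = PTR_ERR_OR_ZERO(p);		/* 0 on success, -ENOMEM on failure */

	printf("ret = %d\n", ret);
	if (!ret)
		free(p);
	return 0;
}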
bch2_bkey_val_to_text(&buf, c, old); - prt_str(&buf, "\nk: "); - bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\nnew: "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(insert)); - - trace_data_update(c, buf.buf); - printbuf_exit(&buf); - } - - ret = bch2_insert_snapshot_whiteouts(trans, m->btree_id, + ret = bch2_trans_log_str(trans, bch2_data_update_type_strs[m->type]) ?: + bch2_trans_log_bkey(trans, m->btree_id, 0, m->k.k) ?: + bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, bkey_start_pos(&insert->k)) ?: bch2_insert_snapshot_whiteouts(trans, m->btree_id, k.k->p, insert->k.p) ?: bch2_bkey_set_needs_rebalance(c, &op->opts, insert) ?: bch2_trans_update(trans, &iter, insert, - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_trans_commit(trans, &op->res, + BTREE_UPDATE_internal_snapshot_node); + if (ret) + goto err; + + if (trace_data_update_enabled()) + trace_data_update2(m, old, k, insert); + + if (bch2_bkey_sectors_need_rebalance(c, bkey_i_to_s_c(insert)) * k.k->size > + bch2_bkey_sectors_need_rebalance(c, k) * insert->k.size) + trace_io_move_created_rebalance2(m, old, k, insert); + + ret = bch2_trans_commit(trans, &op->res, NULL, BCH_TRANS_COMMIT_no_check_rw| BCH_TRANS_COMMIT_no_enospc| m->data_opts.btree_insert_flags); - if (!ret) { - bch2_btree_iter_set_pos(&iter, next_pos); + if (ret) + goto err; - this_cpu_add(c->counters[BCH_COUNTER_move_extent_finish], new->k.size); - if (trace_move_extent_finish_enabled()) - trace_move_extent_finish2(m, &new->k_i, insert); - } + bch2_btree_iter_set_pos(trans, &iter, next_pos); + + this_cpu_add(c->counters[BCH_COUNTER_io_move_finish], new->k.size); + if (trace_io_move_finish_enabled()) + trace_io_move_finish2(m, &new->k_i, insert); err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) ret = 0; if (ret) break; next: - while (bkey_ge(iter.pos, bch2_keylist_front(keys)->k.p)) { - bch2_keylist_pop_front(keys); - if (bch2_keylist_empty(keys)) + while (bkey_ge(iter.pos, bch2_keylist_front(&op->insert_keys)->k.p)) { + bch2_keylist_pop_front(&op->insert_keys); + if (bch2_keylist_empty(&op->insert_keys)) goto out; } continue; @@ -408,15 +476,13 @@ nowork: &m->stats->sectors_raced); } - count_event(c, move_extent_fail); + count_event(c, io_move_fail); - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); goto next; } out: bch2_trans_iter_exit(trans, &iter); - bch2_bkey_buf_exit(&_insert, c); - bch2_bkey_buf_exit(&_new, c); BUG_ON(bch2_err_matches(ret, BCH_ERR_transaction_restart)); return ret; } @@ -426,14 +492,17 @@ int bch2_data_update_index_update(struct bch_write_op *op) return bch2_trans_run(op->c, __bch2_data_update_index_update(trans, op)); } -void bch2_data_update_read_done(struct data_update *m, - struct bch_extent_crc_unpacked crc) +void bch2_data_update_read_done(struct data_update *m) { + m->read_done = true; + /* write bio must own pages: */ BUG_ON(!m->op.wbio.bio.bi_vcnt); - m->op.crc = crc; - m->op.wbio.bio.bi_iter.bi_size = crc.compressed_size << 9; + m->op.crc = m->rbio.pick.crc; + m->op.wbio.bio.bi_iter.bi_size = m->op.crc.compressed_size << 9; + + this_cpu_add(m->op.c->counters[BCH_COUNTER_io_move_write], m->k.k->k.size); closure_call(&m->op.cl, bch2_write, NULL, NULL); } @@ -443,38 +512,41 @@ void bch2_data_update_exit(struct data_update *update) struct bch_fs *c = update->op.c; struct bkey_s_c k = bkey_i_to_s_c(update->k.k); + bch2_bio_free_pages_pool(c, &update->op.wbio.bio); + kfree(update->bvecs); + update->bvecs = NULL; + if (c->opts.nocow_enabled) bkey_nocow_unlock(c, k); 
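/*
 * Aside: the trace_*_enabled() guards above exist so the expensive
 * printbuf formatting in trace_data_update2() and friends only runs when
 * the tracepoint is actually active.  A sketch of the shape, with a plain
 * bool standing in for the kernel's static-key test (demo_* names are
 * invented):
 */
#include <stdbool.h>
#include <stdio.h>

static bool demo_trace_active;		/* stands in for a static key */

static bool trace_demo_enabled(void) { return demo_trace_active; }

static void trace_demo_slowpath(int arg)
{
	char buf[64];

	/* expensive formatting happens only behind the guard */
	snprintf(buf, sizeof(buf), "demo event: arg=%d", arg);
	puts(buf);
}

static void do_update(int arg)
{
	/* fast path pays only a branch when tracing is off */
	if (trace_demo_enabled())
		trace_demo_slowpath(arg);
}

int main(void)
{
	do_update(1);			/* silent */
	demo_trace_active = true;
	do_update(2);			/* prints */
	return 0;
}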
bkey_put_dev_refs(c, k); - bch2_bkey_buf_exit(&update->k, c); bch2_disk_reservation_put(c, &update->op.res); - bch2_bio_free_pages_pool(c, &update->op.wbio.bio); + bch2_bkey_buf_exit(&update->k, c); } -static void bch2_update_unwritten_extent(struct btree_trans *trans, - struct data_update *update) +static int bch2_update_unwritten_extent(struct btree_trans *trans, + struct data_update *update) { struct bch_fs *c = update->op.c; - struct bio *bio = &update->op.wbio.bio; struct bkey_i_extent *e; struct write_point *wp; struct closure cl; struct btree_iter iter; struct bkey_s_c k; - int ret; + int ret = 0; closure_init_stack(&cl); bch2_keylist_init(&update->op.insert_keys, update->op.inline_keys); - while (bio_sectors(bio)) { - unsigned sectors = bio_sectors(bio); + while (bpos_lt(update->op.pos, update->k.k->k.p)) { + unsigned sectors = update->k.k->k.p.offset - + update->op.pos.offset; bch2_trans_begin(trans); bch2_trans_iter_init(trans, &iter, update->btree_id, update->op.pos, BTREE_ITER_slots); ret = lockrestart_do(trans, ({ - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); bkey_err(k); })); bch2_trans_iter_exit(trans, &iter); @@ -503,7 +575,7 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, bch_err_fn_ratelimited(c, ret); if (ret) - return; + break; sectors = min(sectors, wp->sectors_free); @@ -513,7 +585,6 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, bch2_alloc_sectors_append_ptrs(c, wp, &e->k_i, sectors, false); bch2_alloc_sectors_done(c, wp); - bio_advance(bio, sectors << 9); update->op.pos.offset += sectors; extent_for_each_ptr(extent_i_to_s(e), ptr) @@ -532,13 +603,16 @@ static void bch2_update_unwritten_extent(struct btree_trans *trans, bch2_trans_unlock(trans); closure_sync(&cl); } + + return ret; } void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { - printbuf_tabstop_push(out, 20); + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 20); prt_str_indented(out, "rewrite ptrs:\t"); bch2_prt_u64_base2(out, data_opts->rewrite_ptrs); @@ -562,10 +636,17 @@ void bch2_data_update_opts_to_text(struct printbuf *out, struct bch_fs *c, prt_str_indented(out, "extra replicas:\t"); prt_u64(out, data_opts->extra_replicas); + prt_newline(out); + + prt_str_indented(out, "scrub:\t"); + prt_u64(out, data_opts->scrub); } void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) { + prt_str(out, bch2_data_update_type_strs[m->type]); + prt_newline(out); + bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); prt_newline(out); @@ -573,6 +654,25 @@ void bch2_data_update_to_text(struct printbuf *out, struct data_update *m) bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); } +void bch2_data_update_inflight_to_text(struct printbuf *out, struct data_update *m) +{ + bch2_bkey_val_to_text(out, m->op.c, bkey_i_to_s_c(m->k.k)); + prt_newline(out); + printbuf_indent_add(out, 2); + bch2_data_update_opts_to_text(out, m->op.c, &m->op.opts, &m->data_opts); + + if (!m->read_done) { + prt_printf(out, "read:\n"); + printbuf_indent_add(out, 2); + bch2_read_bio_to_text(out, &m->rbio); + } else { + prt_printf(out, "write:\n"); + printbuf_indent_add(out, 2); + bch2_write_op_to_text(out, &m->op); + } + printbuf_indent_sub(out, 4); +} + int bch2_extent_drop_ptrs(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -616,12 +716,87 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, 
bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); } +int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, + struct bch_io_opts *io_opts) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + /* write path might have to decompress data: */ + unsigned buf_bytes = 0; + bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry) + buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); + + unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE); + + m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL); + if (!m->bvecs) + return -ENOMEM; + + bio_init(&m->rbio.bio, NULL, m->bvecs, nr_vecs, REQ_OP_READ); + bio_init(&m->op.wbio.bio, NULL, m->bvecs, nr_vecs, 0); + + if (bch2_bio_alloc_pages(&m->op.wbio.bio, buf_bytes, GFP_KERNEL)) { + kfree(m->bvecs); + m->bvecs = NULL; + return -ENOMEM; + } + + rbio_init(&m->rbio.bio, c, *io_opts, NULL); + m->rbio.data_update = true; + m->rbio.bio.bi_iter.bi_size = buf_bytes; + m->rbio.bio.bi_iter.bi_sector = bkey_start_offset(&m->k.k->k); + m->op.wbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); + return 0; +} + +static int can_write_extent(struct bch_fs *c, struct data_update *m) +{ + if ((m->op.flags & BCH_WRITE_alloc_nowait) && + unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark))) + return -BCH_ERR_data_update_done_would_block; + + unsigned target = m->op.flags & BCH_WRITE_only_specified_devs + ? m->op.target + : 0; + struct bch_devs_mask devs = target_rw_devs(c, BCH_DATA_user, target); + + darray_for_each(m->op.devs_have, i) + __clear_bit(*i, devs.d); + + rcu_read_lock(); + unsigned nr_replicas = 0, i; + for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { + struct bch_dev *ca = bch2_dev_rcu_noerror(c, i); + if (!ca) + continue; + + struct bch_dev_usage usage; + bch2_dev_usage_read_fast(ca, &usage); + + if (!dev_buckets_free(ca, usage, m->op.watermark)) + continue; + + nr_replicas += ca->mi.durability; + if (nr_replicas >= m->op.nr_replicas) + break; + } + rcu_read_unlock(); + + if (!nr_replicas) + return -BCH_ERR_data_update_done_no_rw_devs; + if (nr_replicas < m->op.nr_replicas) + return -BCH_ERR_insufficient_devices; + return 0; +} + int bch2_data_update_init(struct btree_trans *trans, struct btree_iter *iter, struct moving_context *ctxt, struct data_update *m, struct write_point_specifier wp, - struct bch_io_opts io_opts, + struct bch_io_opts *io_opts, struct data_update_opts data_opts, enum btree_id btree_id, struct bkey_s_c k) @@ -639,36 +814,30 @@ int bch2_data_update_init(struct btree_trans *trans, * snapshots table - just skip it, we can move it later. */ if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot))) - return -BCH_ERR_data_update_done; - - if (!bkey_get_dev_refs(c, k)) - return -BCH_ERR_data_update_done; - - if (c->opts.nocow_enabled && - !bkey_nocow_lock(c, ctxt, k)) { - bkey_put_dev_refs(c, k); - return -BCH_ERR_nocow_lock_blocked; - } + return -BCH_ERR_data_update_done_no_snapshot; bch2_bkey_buf_init(&m->k); bch2_bkey_buf_reassemble(&m->k, c, k); + m->type = data_opts.btree_insert_flags & BCH_WATERMARK_copygc + ? BCH_DATA_UPDATE_copygc + : BCH_DATA_UPDATE_rebalance; m->btree_id = btree_id; m->data_opts = data_opts; m->ctxt = ctxt; m->stats = ctxt ? 
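/*
 * Aside: can_write_extent() above is a pre-flight check -- sum the
 * durability of candidate devices that still have free buckets and bail
 * out before issuing a useless read if the write could not succeed
 * anyway.  A toy model over plain arrays (all demo_* names invented,
 * return codes simplified):
 */
#include <stdio.h>

struct demo_dev {
	int online;
	int buckets_free;
	int durability;
};

static int demo_can_write(const struct demo_dev *devs, int nr_devs,
			  int replicas_wanted)
{
	int durability = 0;

	for (int i = 0; i < nr_devs; i++) {
		if (!devs[i].online || !devs[i].buckets_free)
			continue;

		durability += devs[i].durability;
		if (durability >= replicas_wanted)
			return 0;	/* enough destinations exist */
	}

	return durability ? -1 /* insufficient devices */
			  : -2 /* no rw devices at all */;
}

int main(void)
{
	struct demo_dev devs[] = {
		{ .online = 1, .buckets_free = 10, .durability = 1 },
		{ .online = 0, .buckets_free = 10, .durability = 1 },
	};

	printf("%d\n", demo_can_write(devs, 2, 2)); /* -1: one device short */
	return 0;
}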
ctxt->stats : NULL; - bch2_write_op_init(&m->op, c, io_opts); + bch2_write_op_init(&m->op, c, *io_opts); m->op.pos = bkey_start_pos(k.k); m->op.version = k.k->bversion; m->op.target = data_opts.target; m->op.write_point = wp; m->op.nr_replicas = 0; - m->op.flags |= BCH_WRITE_PAGES_STABLE| - BCH_WRITE_PAGES_OWNED| - BCH_WRITE_DATA_ENCODED| - BCH_WRITE_MOVE| + m->op.flags |= BCH_WRITE_pages_stable| + BCH_WRITE_pages_owned| + BCH_WRITE_data_encoded| + BCH_WRITE_move| m->data_opts.write_flags; - m->op.compression_opt = io_opts.background_compression; + m->op.compression_opt = io_opts->background_compression; m->op.watermark = m->data_opts.btree_insert_flags & BCH_WATERMARK_MASK; unsigned durability_have = 0, durability_removing = 0; @@ -706,7 +875,7 @@ int bch2_data_update_init(struct btree_trans *trans, ptr_bit <<= 1; } - unsigned durability_required = max(0, (int) (io_opts.data_replicas - durability_have)); + unsigned durability_required = max(0, (int) (io_opts->data_replicas - durability_have)); /* * If current extent durability is less than io_opts.data_replicas, @@ -739,28 +908,70 @@ int bch2_data_update_init(struct btree_trans *trans, m->data_opts.rewrite_ptrs = 0; /* if iter == NULL, it's just a promote */ if (iter) - ret = bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &m->data_opts); - goto out; + ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts); + if (!ret) + ret = -BCH_ERR_data_update_done_no_writes_needed; + goto out_bkey_buf_exit; } + /* + * Check if the allocation will succeed, to avoid getting an error later + * in bch2_write() -> bch2_alloc_sectors_start() and doing a useless + * read: + * + * This guards against + * - BCH_WRITE_alloc_nowait allocations failing (promotes) + * - Destination target full + * - Device(s) in destination target offline + * - Insufficient durability available in destination target + * (i.e. trying to move a durability=2 replica to a target with a + * single durability=2 device) + */ + ret = can_write_extent(c, m); + if (ret) + goto out_bkey_buf_exit; + if (reserve_sectors) { ret = bch2_disk_reservation_add(c, &m->op.res, reserve_sectors, m->data_opts.extra_replicas ? 
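/*
 * Aside: the reworked exit path of bch2_data_update_init(), continuing
 * below, is the classic acquire-in-order / release-in-reverse goto
 * ladder: each failure point jumps to a label that unwinds exactly what
 * was taken so far.  Toy shape of the pattern (demo_* names invented):
 */
#include <stdlib.h>

struct demo_res { void *a, *b, *c; };

static int demo_init(struct demo_res *r)
{
	r->a = malloc(16);
	if (!r->a)
		goto out;
	r->b = malloc(16);
	if (!r->b)
		goto out_free_a;
	r->c = malloc(16);
	if (!r->c)
		goto out_free_b;
	return 0;		/* success: caller owns a, b and c */

out_free_b:
	free(r->b);
out_free_a:
	free(r->a);
out:
	return -1;
}

int main(void)
{
	struct demo_res r;

	if (demo_init(&r))
		return 1;
	free(r.c);
	free(r.b);
	free(r.a);
	return 0;
}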
0 : BCH_DISK_RESERVATION_NOFAIL); if (ret) - goto out; + goto out_bkey_buf_exit; + } + + if (!bkey_get_dev_refs(c, k)) { + ret = -BCH_ERR_data_update_done_no_dev_refs; + goto out_put_disk_res; + } + + if (c->opts.nocow_enabled && + !bkey_nocow_lock(c, ctxt, k)) { + ret = -BCH_ERR_nocow_lock_blocked; + goto out_put_dev_refs; } if (bkey_extent_is_unwritten(k)) { - bch2_update_unwritten_extent(trans, m); - goto out; + ret = bch2_update_unwritten_extent(trans, m) ?: + -BCH_ERR_data_update_done_unwritten; + goto out_nocow_unlock; } + ret = bch2_data_update_bios_init(m, c, io_opts); + if (ret) + goto out_nocow_unlock; + return 0; -out: - bch2_data_update_exit(m); - return ret ?: -BCH_ERR_data_update_done; +out_nocow_unlock: + if (c->opts.nocow_enabled) + bkey_nocow_unlock(c, k); +out_put_dev_refs: + bkey_put_dev_refs(c, k); +out_put_disk_res: + bch2_disk_reservation_put(c, &m->op.res); +out_bkey_buf_exit: + bch2_bkey_buf_exit(&m->k, c); + return ret; } void bch2_data_update_opts_normalize(struct bkey_s_c k, struct data_update_opts *opts) diff --git a/fs/bcachefs/data_update.h b/fs/bcachefs/data_update.h index e4b50723428e..5e14d13568de 100644 --- a/fs/bcachefs/data_update.h +++ b/fs/bcachefs/data_update.h @@ -4,6 +4,7 @@ #define _BCACHEFS_DATA_UPDATE_H #include "bkey_buf.h" +#include "io_read.h" #include "io_write_types.h" struct moving_context; @@ -15,27 +16,61 @@ struct data_update_opts { u8 extra_replicas; unsigned btree_insert_flags; unsigned write_flags; + + int read_dev; + bool scrub; }; void bch2_data_update_opts_to_text(struct printbuf *, struct bch_fs *, struct bch_io_opts *, struct data_update_opts *); +#define BCH_DATA_UPDATE_TYPES() \ + x(copygc, 0) \ + x(rebalance, 1) \ + x(promote, 2) + +enum bch_data_update_types { +#define x(n, id) BCH_DATA_UPDATE_##n = id, + BCH_DATA_UPDATE_TYPES() +#undef x +}; + struct data_update { + enum bch_data_update_types type; /* extent being updated: */ + bool read_done; enum btree_id btree_id; struct bkey_buf k; struct data_update_opts data_opts; struct moving_context *ctxt; struct bch_move_stats *stats; + + struct bch_read_bio rbio; struct bch_write_op op; + struct bio_vec *bvecs; +}; + +struct promote_op { + struct rcu_head rcu; + u64 start_time; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + unsigned list_idx; +#endif + + struct rhash_head hash; + struct bpos pos; + + struct work_struct work; + struct data_update write; + struct bio_vec bi_inline_vecs[]; /* must be last */ }; void bch2_data_update_to_text(struct printbuf *, struct data_update *); +void bch2_data_update_inflight_to_text(struct printbuf *, struct data_update *); int bch2_data_update_index_update(struct bch_write_op *); -void bch2_data_update_read_done(struct data_update *, - struct bch_extent_crc_unpacked); +void bch2_data_update_read_done(struct data_update *); int bch2_extent_drop_ptrs(struct btree_trans *, struct btree_iter *, @@ -43,12 +78,15 @@ int bch2_extent_drop_ptrs(struct btree_trans *, struct bch_io_opts *, struct data_update_opts *); +int bch2_data_update_bios_init(struct data_update *, struct bch_fs *, + struct bch_io_opts *); + void bch2_data_update_exit(struct data_update *); int bch2_data_update_init(struct btree_trans *, struct btree_iter *, struct moving_context *, struct data_update *, struct write_point_specifier, - struct bch_io_opts, struct data_update_opts, + struct bch_io_opts *, struct data_update_opts, enum btree_id, struct bkey_s_c); void bch2_data_update_opts_normalize(struct bkey_s_c, struct data_update_opts *); diff --git a/fs/bcachefs/debug.c 
b/fs/bcachefs/debug.c index 55333e82d1fe..4fa70634c90e 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -7,6 +7,8 @@ */ #include "bcachefs.h" +#include "alloc_foreground.h" +#include "async_objs.h" #include "bkey_methods.h" #include "btree_cache.h" #include "btree_io.h" @@ -15,6 +17,7 @@ #include "btree_update.h" #include "btree_update_interior.h" #include "buckets.h" +#include "data_update.h" #include "debug.h" #include "error.h" #include "extents.h" @@ -39,9 +42,10 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, struct btree_node *n_sorted = c->verify_data->data; struct bset *sorted, *inmemory = &b->data->keys; struct bio *bio; - bool failed = false, saw_error = false; + bool failed = false; - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, + BCH_DEV_READ_REF_btree_verify_replicas); if (!ca) return false; @@ -56,12 +60,13 @@ static bool bch2_btree_verify_replica(struct bch_fs *c, struct btree *b, submit_bio_wait(bio); bio_put(bio); - percpu_ref_put(&ca->io_ref); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_btree_verify_replicas); memcpy(n_ondisk, n_sorted, btree_buf_bytes(b)); v->written = 0; - if (bch2_btree_node_read_done(c, ca, v, false, &saw_error) || saw_error) + if (bch2_btree_node_read_done(c, ca, v, NULL, NULL)) return false; n_sorted = c->verify_data->data; @@ -190,12 +195,13 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *out, struct bch_fs *c, unsigned offset = 0; int ret; - if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick) <= 0) { + if (bch2_bkey_pick_read_device(c, bkey_i_to_s_c(&b->key), NULL, &pick, -1) <= 0) { prt_printf(out, "error getting device to read from: invalid device\n"); return; } - ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, + BCH_DEV_READ_REF_btree_node_ondisk_to_text); if (!ca) { prt_printf(out, "error getting device to read from: not online\n"); return; @@ -296,28 +302,13 @@ out: if (bio) bio_put(bio); kvfree(n_ondisk); - percpu_ref_put(&ca->io_ref); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_btree_node_ondisk_to_text); } #ifdef CONFIG_DEBUG_FS -/* XXX: bch_fs refcounting */ - -struct dump_iter { - struct bch_fs *c; - enum btree_id id; - struct bpos from; - struct bpos prev_node; - u64 iter; - - struct printbuf buf; - - char __user *ubuf; /* destination user buffer */ - size_t size; /* size of requested read */ - ssize_t ret; /* bytes read so far */ -}; - -static ssize_t flush_buf(struct dump_iter *i) +ssize_t bch2_debugfs_flush_buf(struct dump_iter *i) { if (i->buf.pos) { size_t bytes = min_t(size_t, i->buf.pos, i->size); @@ -329,6 +320,11 @@ static ssize_t flush_buf(struct dump_iter *i) i->buf.pos -= copied; memmove(i->buf.buf, i->buf.buf + copied, i->buf.pos); + if (i->buf.last_newline >= copied) + i->buf.last_newline -= copied; + if (i->buf.last_field >= copied) + i->buf.last_field -= copied; + if (copied != bytes) return -EFAULT; } @@ -355,7 +351,7 @@ static int bch2_dump_open(struct inode *inode, struct file *file) return 0; } -static int bch2_dump_release(struct inode *inode, struct file *file) +int bch2_dump_release(struct inode *inode, struct file *file) { struct dump_iter *i = file->private_data; @@ -373,7 +369,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, i->size = size; i->ret = 0; - return flush_buf(i) ?: + return bch2_debugfs_flush_buf(i) ?: bch2_trans_run(i->c, for_each_btree_key(trans, 
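/*
 * Aside: the ioref changes above now name *why* each reference is held
 * (BCH_DEV_READ_REF_btree_verify_replicas, ...), so a leaked reference
 * can be attributed to its taker.  A toy enumerated-refcount, purely
 * illustrative (demo_* names invented; the kernel version is atomic and
 * more involved):
 */
#include <stdio.h>

enum demo_ref_reason { DEMO_REF_scan, DEMO_REF_verify, DEMO_REF_NR };

struct demo_eref { unsigned long count[DEMO_REF_NR]; };

static void demo_ref_get(struct demo_eref *r, enum demo_ref_reason why)
{
	r->count[why]++;		/* per-reason counter, not one blob */
}

static void demo_ref_put(struct demo_eref *r, enum demo_ref_reason why)
{
	r->count[why]--;
}

static unsigned long demo_ref_total(const struct demo_eref *r)
{
	unsigned long t = 0;

	for (int i = 0; i < DEMO_REF_NR; i++)
		t += r->count[i];
	return t;
}

int main(void)
{
	struct demo_eref r = { .count = {0} };

	demo_ref_get(&r, DEMO_REF_verify);
	printf("held: %lu (verify=%lu)\n",
	       demo_ref_total(&r), r.count[DEMO_REF_verify]);
	demo_ref_put(&r, DEMO_REF_verify);
	return 0;
}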
iter, i->id, i->from, BTREE_ITER_prefetch| @@ -382,7 +378,7 @@ static ssize_t bch2_read_btree(struct file *file, char __user *buf, prt_newline(&i->buf); bch2_trans_unlock(trans); i->from = bpos_successor(iter.pos); - flush_buf(i); + bch2_debugfs_flush_buf(i); }))) ?: i->ret; } @@ -403,7 +399,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, i->size = size; i->ret = 0; - ssize_t ret = flush_buf(i); + ssize_t ret = bch2_debugfs_flush_buf(i); if (ret) return ret; @@ -417,7 +413,7 @@ static ssize_t bch2_read_btree_formats(struct file *file, char __user *buf, ? bpos_successor(b->key.k.p) : b->key.k.p; - drop_locks_do(trans, flush_buf(i)); + drop_locks_do(trans, bch2_debugfs_flush_buf(i)); }))) ?: i->ret; } @@ -437,7 +433,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, i->size = size; i->ret = 0; - return flush_buf(i) ?: + return bch2_debugfs_flush_buf(i) ?: bch2_trans_run(i->c, for_each_btree_key(trans, iter, i->id, i->from, BTREE_ITER_prefetch| @@ -455,7 +451,7 @@ static ssize_t bch2_read_bfloat_failed(struct file *file, char __user *buf, bch2_bfloat_to_text(&i->buf, l->b, _k); bch2_trans_unlock(trans); i->from = bpos_successor(iter.pos); - flush_buf(i); + bch2_debugfs_flush_buf(i); }))) ?: i->ret; } @@ -516,7 +512,7 @@ static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, struct rhash_head *pos; struct btree *b; - ret = flush_buf(i); + ret = bch2_debugfs_flush_buf(i); if (ret) return ret; @@ -539,7 +535,7 @@ static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, ret = -ENOMEM; if (!ret) - ret = flush_buf(i); + ret = bch2_debugfs_flush_buf(i); return ret ?: i->ret; } @@ -613,7 +609,7 @@ restart: closure_put(&trans->ref); - ret = flush_buf(i); + ret = bch2_debugfs_flush_buf(i); if (ret) goto unlocked; @@ -626,7 +622,7 @@ unlocked: ret = -ENOMEM; if (!ret) - ret = flush_buf(i); + ret = bch2_debugfs_flush_buf(i); return ret ?: i->ret; } @@ -651,7 +647,7 @@ static ssize_t bch2_journal_pins_read(struct file *file, char __user *buf, i->ret = 0; while (1) { - err = flush_buf(i); + err = bch2_debugfs_flush_buf(i); if (err) return err; @@ -694,7 +690,7 @@ static ssize_t bch2_btree_updates_read(struct file *file, char __user *buf, i->iter++; } - err = flush_buf(i); + err = bch2_debugfs_flush_buf(i); if (err) return err; @@ -752,7 +748,7 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, while (1) { struct btree_transaction_stats *s = &c->btree_transaction_stats[i->iter]; - err = flush_buf(i); + err = bch2_debugfs_flush_buf(i); if (err) return err; @@ -769,6 +765,12 @@ static ssize_t btree_transaction_stats_read(struct file *file, char __user *buf, mutex_lock(&s->lock); prt_printf(&i->buf, "Max mem used: %u\n", s->max_mem); +#ifdef CONFIG_BCACHEFS_TRANS_KMALLOC_TRACE + printbuf_indent_add(&i->buf, 2); + bch2_trans_kmalloc_trace_to_text(&i->buf, &s->trans_kmalloc_trace); + printbuf_indent_sub(&i->buf, 2); +#endif + prt_printf(&i->buf, "Transaction duration:\n"); printbuf_indent_add(&i->buf, 2); @@ -844,8 +846,11 @@ restart: seqmutex_unlock(&c->btree_trans_lock); } -static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, - size_t size, loff_t *ppos) +typedef void (*fs_to_text_fn)(struct printbuf *, struct bch_fs *); + +static ssize_t bch2_simple_print(struct file *file, char __user *buf, + size_t size, loff_t *ppos, + fs_to_text_fn fn) { struct dump_iter *i = file->private_data; struct bch_fs *c = i->c; @@ -856,7 +861,7 @@ static ssize_t 
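/*
 * Aside: bch2_debugfs_flush_buf() above drains a printbuf toward the
 * reader in chunks: copy what fits, memmove the remainder to the front,
 * and remember how much has been handed out so far.  Minimal user-space
 * model (demo_* names invented; plain memcpy stands in for
 * copy_to_user()):
 */
#include <stdio.h>
#include <string.h>

struct demo_dump {
	char buf[32];
	size_t pos;		/* bytes currently buffered */
	size_t ret;		/* bytes handed to the reader so far */
};

static size_t demo_flush(struct demo_dump *i, char *ubuf, size_t size)
{
	size_t bytes = i->pos < size ? i->pos : size;

	memcpy(ubuf, i->buf, bytes);		/* copy_to_user() in the kernel */
	i->ret += bytes;
	i->pos -= bytes;
	memmove(i->buf, i->buf + bytes, i->pos);
	return bytes;
}

int main(void)
{
	struct demo_dump d = { .pos = 11 };
	char out[6];

	memcpy(d.buf, "hello world", 11);
	size_t n = demo_flush(&d, out, sizeof(out));
	printf("%zu bytes, %zu left\n", n, d.pos);	/* 6 bytes, 5 left */
	return 0;
}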
bch2_btree_deadlock_read(struct file *file, char __user *buf, i->ret = 0; if (!i->iter) { - btree_deadlock_to_text(&i->buf, c); + fn(&i->buf, c); i->iter++; } @@ -864,11 +869,17 @@ static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, ret = -ENOMEM; if (!ret) - ret = flush_buf(i); + ret = bch2_debugfs_flush_buf(i); return ret ?: i->ret; } +static ssize_t bch2_btree_deadlock_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + return bch2_simple_print(file, buf, size, ppos, btree_deadlock_to_text); +} + static const struct file_operations btree_deadlock_ops = { .owner = THIS_MODULE, .open = bch2_dump_open, @@ -876,6 +887,19 @@ static const struct file_operations btree_deadlock_ops = { .read = bch2_btree_deadlock_read, }; +static ssize_t bch2_write_points_read(struct file *file, char __user *buf, + size_t size, loff_t *ppos) +{ + return bch2_simple_print(file, buf, size, ppos, bch2_write_points_to_text); +} + +static const struct file_operations write_points_ops = { + .owner = THIS_MODULE, + .open = bch2_dump_open, + .release = bch2_dump_release, + .read = bch2_write_points_read, +}; + void bch2_fs_debug_exit(struct bch_fs *c) { if (!IS_ERR_OR_NULL(c->fs_debug_dir)) @@ -904,7 +928,11 @@ void bch2_fs_debug_init(struct bch_fs *c) if (IS_ERR_OR_NULL(bch_debug)) return; - snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); + if (c->sb.multi_device) + snprintf(name, sizeof(name), "%pU", c->sb.user_uuid.b); + else + strscpy(name, c->name, sizeof(name)); + c->fs_debug_dir = debugfs_create_dir(name, bch_debug); if (IS_ERR_OR_NULL(c->fs_debug_dir)) return; @@ -927,6 +955,11 @@ void bch2_fs_debug_init(struct bch_fs *c) debugfs_create_file("btree_deadlock", 0400, c->fs_debug_dir, c->btree_debug, &btree_deadlock_ops); + debugfs_create_file("write_points", 0400, c->fs_debug_dir, + c->btree_debug, &write_points_ops); + + bch2_fs_async_obj_debugfs_init(c); + c->btree_debug_dir = debugfs_create_dir("btrees", c->fs_debug_dir); if (IS_ERR_OR_NULL(c->btree_debug_dir)) return; diff --git a/fs/bcachefs/debug.h b/fs/bcachefs/debug.h index 2c37143b5fd1..d88b1194b8ac 100644 --- a/fs/bcachefs/debug.h +++ b/fs/bcachefs/debug.h @@ -14,11 +14,29 @@ void bch2_btree_node_ondisk_to_text(struct printbuf *, struct bch_fs *, static inline void bch2_btree_verify(struct bch_fs *c, struct btree *b) { - if (bch2_verify_btree_ondisk) + if (static_branch_unlikely(&bch2_verify_btree_ondisk)) __bch2_btree_verify(c, b); } #ifdef CONFIG_DEBUG_FS +struct dump_iter { + struct bch_fs *c; + struct async_obj_list *list; + enum btree_id id; + struct bpos from; + struct bpos prev_node; + u64 iter; + + struct printbuf buf; + + char __user *ubuf; /* destination user buffer */ + size_t size; /* size of requested read */ + ssize_t ret; /* bytes read so far */ +}; + +ssize_t bch2_debugfs_flush_buf(struct dump_iter *); +int bch2_dump_release(struct inode *, struct file *); + void bch2_fs_debug_exit(struct bch_fs *); void bch2_fs_debug_init(struct bch_fs *); #else diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index 600eee936f13..d198001838f3 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -13,6 +13,28 @@ #include <linux/dcache.h> +int bch2_casefold(struct btree_trans *trans, const struct bch_hash_info *info, + const struct qstr *str, struct qstr *out_cf) +{ + *out_cf = (struct qstr) QSTR_INIT(NULL, 0); + +#ifdef CONFIG_UNICODE + unsigned char *buf = bch2_trans_kmalloc(trans, BCH_NAME_MAX + 1); + int ret = PTR_ERR_OR_ZERO(buf); + if (ret) + return ret; + + ret = 
utf8_casefold(info->cf_encoding, str, buf, BCH_NAME_MAX + 1); + if (ret <= 0) + return ret; + + *out_cf = (struct qstr) QSTR_INIT(buf, ret); + return 0; +#else + return -EOPNOTSUPP; +#endif +} + static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) { if (bkey_val_bytes(d.k) < offsetof(struct bch_dirent, d_name)) @@ -28,13 +50,38 @@ static unsigned bch2_dirent_name_bytes(struct bkey_s_c_dirent d) #endif return bkey_bytes - - offsetof(struct bch_dirent, d_name) - + (d.v->d_casefold + ? offsetof(struct bch_dirent, d_cf_name_block.d_names) + : offsetof(struct bch_dirent, d_name)) - trailing_nuls; } struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d) { - return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); + if (d.v->d_casefold) { + unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len); + return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[0], name_len); + } else { + return (struct qstr) QSTR_INIT(d.v->d_name, bch2_dirent_name_bytes(d)); + } +} + +static struct qstr bch2_dirent_get_casefold_name(struct bkey_s_c_dirent d) +{ + if (d.v->d_casefold) { + unsigned name_len = le16_to_cpu(d.v->d_cf_name_block.d_name_len); + unsigned cf_name_len = le16_to_cpu(d.v->d_cf_name_block.d_cf_name_len); + return (struct qstr) QSTR_INIT(&d.v->d_cf_name_block.d_names[name_len], cf_name_len); + } else { + return (struct qstr) QSTR_INIT(NULL, 0); + } +} + +static inline struct qstr bch2_dirent_get_lookup_name(struct bkey_s_c_dirent d) +{ + return d.v->d_casefold + ? bch2_dirent_get_casefold_name(d) + : bch2_dirent_get_name(d); } static u64 bch2_dirent_hash(const struct bch_hash_info *info, @@ -57,7 +104,7 @@ static u64 dirent_hash_key(const struct bch_hash_info *info, const void *key) static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); - struct qstr name = bch2_dirent_get_name(d); + struct qstr name = bch2_dirent_get_lookup_name(d); return bch2_dirent_hash(info, &name); } @@ -65,7 +112,7 @@ static u64 dirent_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) static bool dirent_cmp_key(struct bkey_s_c _l, const void *_r) { struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); - const struct qstr l_name = bch2_dirent_get_name(l); + const struct qstr l_name = bch2_dirent_get_lookup_name(l); const struct qstr *r_name = _r; return !qstr_eq(l_name, *r_name); @@ -75,8 +122,8 @@ static bool dirent_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) { struct bkey_s_c_dirent l = bkey_s_c_to_dirent(_l); struct bkey_s_c_dirent r = bkey_s_c_to_dirent(_r); - const struct qstr l_name = bch2_dirent_get_name(l); - const struct qstr r_name = bch2_dirent_get_name(r); + const struct qstr l_name = bch2_dirent_get_lookup_name(l); + const struct qstr r_name = bch2_dirent_get_lookup_name(r); return !qstr_eq(l_name, r_name); } @@ -104,17 +151,19 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + unsigned name_block_len = bch2_dirent_name_bytes(d); struct qstr d_name = bch2_dirent_get_name(d); + struct qstr d_cf_name = bch2_dirent_get_casefold_name(d); int ret = 0; bkey_fsck_err_on(!d_name.len, c, dirent_empty_name, "empty name"); - bkey_fsck_err_on(bkey_val_u64s(k.k) > dirent_val_u64s(d_name.len), + bkey_fsck_err_on(d_name.len + d_cf_name.len > name_block_len, c, dirent_val_too_big, - "value too big (%zu > %u)", - bkey_val_u64s(k.k), dirent_val_u64s(d_name.len)); + "dirent names exceed bkey size 
(%d + %d > %d)", + d_name.len, d_cf_name.len, name_block_len); /* * Check new keys don't exceed the max length @@ -142,6 +191,18 @@ int bch2_dirent_validate(struct bch_fs *c, struct bkey_s_c k, le64_to_cpu(d.v->d_inum) == d.k->p.inode, c, dirent_to_itself, "dirent points to own directory"); + + if (d.v->d_casefold) { + bkey_fsck_err_on(from.from == BKEY_VALIDATE_commit && + d_cf_name.len > BCH_NAME_MAX, + c, dirent_cf_name_too_big, + "dirent w/ cf name too big (%u > %u)", + d_cf_name.len, BCH_NAME_MAX); + + bkey_fsck_err_on(d_cf_name.len != strnlen(d_cf_name.name, d_cf_name.len), + c, dirent_stray_data_after_cf_name, + "dirent has stray data after cf name's NUL"); + } fsck_err: return ret; } @@ -151,27 +212,33 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); struct qstr d_name = bch2_dirent_get_name(d); - prt_printf(out, "%.*s -> ", d_name.len, d_name.name); + prt_printf(out, "%.*s", d_name.len, d_name.name); + + if (d.v->d_casefold) { + struct qstr d_name = bch2_dirent_get_lookup_name(d); + prt_printf(out, " (casefold %.*s)", d_name.len, d_name.name); + } + + prt_str(out, " ->"); if (d.v->d_type != DT_SUBVOL) - prt_printf(out, "%llu", le64_to_cpu(d.v->d_inum)); + prt_printf(out, " %llu", le64_to_cpu(d.v->d_inum)); else - prt_printf(out, "%u -> %u", + prt_printf(out, " %u -> %u", le32_to_cpu(d.v->d_parent_subvol), le32_to_cpu(d.v->d_child_subvol)); prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type)); } -static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, - subvol_inum dir, u8 type, - const struct qstr *name, u64 dst) +static struct bkey_i_dirent *dirent_alloc_key(struct btree_trans *trans, + subvol_inum dir, + u8 type, + int name_len, int cf_name_len, + u64 dst) { struct bkey_i_dirent *dirent; - unsigned u64s = BKEY_U64s + dirent_val_u64s(name->len); - - if (name->len > BCH_NAME_MAX) - return ERR_PTR(-ENAMETOOLONG); + unsigned u64s = BKEY_U64s + dirent_val_u64s(name_len, cf_name_len); BUG_ON(u64s > U8_MAX); @@ -190,14 +257,75 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, } dirent->v.d_type = type; + dirent->v.d_unused = 0; + dirent->v.d_casefold = cf_name_len ? 
1 : 0; + + return dirent; +} + +static void dirent_init_regular_name(struct bkey_i_dirent *dirent, + const struct qstr *name) +{ + EBUG_ON(dirent->v.d_casefold); + + memcpy(&dirent->v.d_name[0], name->name, name->len); + memset(&dirent->v.d_name[name->len], 0, + bkey_val_bytes(&dirent->k) - + offsetof(struct bch_dirent, d_name) - + name->len); +} + +static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent, + const struct qstr *name, + const struct qstr *cf_name) +{ + EBUG_ON(!dirent->v.d_casefold); + EBUG_ON(!cf_name->len); + + dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len); + dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_name->len); + memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len); + memcpy(&dirent->v.d_cf_name_block.d_names[name->len], cf_name->name, cf_name->len); + memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_name->len], 0, + bkey_val_bytes(&dirent->k) - + offsetof(struct bch_dirent, d_cf_name_block.d_names) - + (name->len + cf_name->len)); + + EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_name->len); +} + +static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, + const struct bch_hash_info *hash_info, + subvol_inum dir, + u8 type, + const struct qstr *name, + const struct qstr *cf_name, + u64 dst) +{ + struct bkey_i_dirent *dirent; + struct qstr _cf_name; - if (name->len > BCH_NAME_MAX) - return ERR_PTR(-ENAMETOOLONG); + if (name->len > BCH_NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + if (hash_info->cf_encoding && !cf_name) { + int ret = bch2_casefold(trans, hash_info, name, &_cf_name); + if (ret) + return ERR_PTR(ret); + + cf_name = &_cf_name; + } - memcpy(dirent->v.d_name, name->name, name->len); - memset(dirent->v.d_name + name->len, 0, - bkey_val_bytes(&dirent->k) - - offsetof(struct bch_dirent, d_name) - - name->len); + dirent = dirent_alloc_key(trans, dir, type, name->len, cf_name ? 
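/*
 * Aside: a casefolded dirent stores both spellings back to back --
 * d_name_len bytes of the name as created, then d_cf_name_len bytes of
 * the folded form -- and the key size is rounded up to whole u64s.  A
 * self-contained sketch of that size calculation (the prefix layout is
 * approximated with a local struct, not the on-disk bch_dirent):
 */
#include <stdio.h>
#include <stddef.h>
#include <stdint.h>

struct demo_dirent_prefix {
	uint64_t d_inum;
	uint8_t  d_type;
	uint8_t  d_pad;
	uint16_t d_name_len;
	uint16_t d_cf_name_len;
	uint8_t  d_names[];	/* name bytes, then casefolded name bytes */
};

#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))

static unsigned demo_val_u64s(unsigned len, unsigned cf_len)
{
	size_t bytes = offsetof(struct demo_dirent_prefix, d_names) +
		       len + cf_len;

	return (unsigned)DIV_ROUND_UP(bytes, sizeof(uint64_t));
}

int main(void)
{
	/* "ReadMe.MD" (9 bytes) plus folded "readme.md" (9 bytes) */
	printf("%u u64s\n", demo_val_u64s(9, 9));
	return 0;
}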
cf_name->len : 0, dst); + if (IS_ERR(dirent)) + return dirent; + + if (cf_name) + dirent_init_casefolded_name(dirent, name, cf_name); + else + dirent_init_regular_name(dirent, name); + + EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len); return dirent; } @@ -213,7 +341,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, dir_inum, type, name, dst_inum); + dirent = dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum); ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; @@ -222,8 +350,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, dirent->k.p.snapshot = snapshot; ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, - dir_inum, snapshot, &dirent->k_i, - flags|BTREE_UPDATE_internal_snapshot_node); + dir_inum, snapshot, &dirent->k_i, flags); *dir_offset = dirent->k.p.offset; return ret; @@ -238,7 +365,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, dir, type, name, dst_inum); + dirent = dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum); ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; @@ -275,14 +402,15 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, } int bch2_dirent_rename(struct btree_trans *trans, - subvol_inum src_dir, struct bch_hash_info *src_hash, - subvol_inum dst_dir, struct bch_hash_info *dst_hash, + subvol_inum src_dir, struct bch_hash_info *src_hash, u64 *src_dir_i_size, + subvol_inum dst_dir, struct bch_hash_info *dst_hash, u64 *dst_dir_i_size, const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset, const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset, enum bch_rename_mode mode) { - struct btree_iter src_iter = { NULL }; - struct btree_iter dst_iter = { NULL }; + struct qstr src_name_lookup, dst_name_lookup; + struct btree_iter src_iter = {}; + struct btree_iter dst_iter = {}; struct bkey_s_c old_src, old_dst = bkey_s_c_null; struct bkey_i_dirent *new_src = NULL, *new_dst = NULL; struct bpos dst_pos = @@ -295,8 +423,11 @@ int bch2_dirent_rename(struct btree_trans *trans, memset(dst_inum, 0, sizeof(*dst_inum)); /* Lookup src: */ + ret = bch2_maybe_casefold(trans, src_hash, src_name, &src_name_lookup); + if (ret) + goto out; old_src = bch2_hash_lookup(trans, &src_iter, bch2_dirent_hash_desc, - src_hash, src_dir, src_name, + src_hash, src_dir, &src_name_lookup, BTREE_ITER_intent); ret = bkey_err(old_src); if (ret) @@ -308,6 +439,9 @@ int bch2_dirent_rename(struct btree_trans *trans, goto out; /* Lookup dst: */ + ret = bch2_maybe_casefold(trans, dst_hash, dst_name, &dst_name_lookup); + if (ret) + goto out; if (mode == BCH_RENAME) { /* * Note that we're _not_ checking if the target already exists - @@ -315,12 +449,12 @@ int bch2_dirent_rename(struct btree_trans *trans, * correctness: */ ret = bch2_hash_hole(trans, &dst_iter, bch2_dirent_hash_desc, - dst_hash, dst_dir, dst_name); + dst_hash, dst_dir, &dst_name_lookup); if (ret) goto out; } else { old_dst = bch2_hash_lookup(trans, &dst_iter, bch2_dirent_hash_desc, - dst_hash, dst_dir, dst_name, + dst_hash, dst_dir, &dst_name_lookup, BTREE_ITER_intent); ret = bkey_err(old_dst); if (ret) @@ -336,7 +470,8 @@ int bch2_dirent_rename(struct btree_trans *trans, *src_offset = dst_iter.pos.offset; /* Create new dst key: */ - new_dst = dirent_create_key(trans, dst_dir, 0, dst_name, 0); + new_dst = 
dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name, + dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0); ret = PTR_ERR_OR_ZERO(new_dst); if (ret) goto out; @@ -346,7 +481,8 @@ int bch2_dirent_rename(struct btree_trans *trans, /* Create new src key: */ if (mode == BCH_RENAME_EXCHANGE) { - new_src = dirent_create_key(trans, src_dir, 0, src_name, 0); + new_src = dirent_create_key(trans, src_hash, src_dir, 0, src_name, + src_hash->cf_encoding ? &src_name_lookup : NULL, 0); ret = PTR_ERR_OR_ZERO(new_src); if (ret) goto out; @@ -406,6 +542,14 @@ int bch2_dirent_rename(struct btree_trans *trans, new_src->v.d_type == DT_SUBVOL) new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol); + if (old_dst.k) + *dst_dir_i_size -= bkey_bytes(old_dst.k); + *src_dir_i_size -= bkey_bytes(old_src.k); + + if (mode == BCH_RENAME_EXCHANGE) + *src_dir_i_size += bkey_bytes(&new_src->k); + *dst_dir_i_size += bkey_bytes(&new_dst->k); + ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); if (ret) goto out; @@ -434,16 +578,16 @@ out_set_src: } if (delete_src) { - bch2_btree_iter_set_snapshot(&src_iter, old_src.k->p.snapshot); - ret = bch2_btree_iter_traverse(&src_iter) ?: + bch2_btree_iter_set_snapshot(trans, &src_iter, old_src.k->p.snapshot); + ret = bch2_btree_iter_traverse(trans, &src_iter) ?: bch2_btree_delete_at(trans, &src_iter, BTREE_UPDATE_internal_snapshot_node); if (ret) goto out; } if (delete_dst) { - bch2_btree_iter_set_snapshot(&dst_iter, old_dst.k->p.snapshot); - ret = bch2_btree_iter_traverse(&dst_iter) ?: + bch2_btree_iter_set_snapshot(trans, &dst_iter, old_dst.k->p.snapshot); + ret = bch2_btree_iter_traverse(trans, &dst_iter) ?: bch2_btree_delete_at(trans, &dst_iter, BTREE_UPDATE_internal_snapshot_node); if (ret) goto out; @@ -465,9 +609,14 @@ int bch2_dirent_lookup_trans(struct btree_trans *trans, const struct qstr *name, subvol_inum *inum, unsigned flags) { + struct qstr lookup_name; + int ret = bch2_maybe_casefold(trans, hash_info, name, &lookup_name); + if (ret) + return ret; + struct bkey_s_c k = bch2_hash_lookup(trans, iter, bch2_dirent_hash_desc, - hash_info, dir, name, flags); - int ret = bkey_err(k); + hash_info, dir, &lookup_name, flags); + ret = bkey_err(k); if (ret) goto err; @@ -485,7 +634,7 @@ u64 bch2_dirent_lookup(struct bch_fs *c, subvol_inum dir, const struct qstr *name, subvol_inum *inum) { struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; int ret = lockrestart_do(trans, bch2_dirent_lookup_trans(trans, &iter, dir, hash_info, name, inum, 0)); @@ -540,7 +689,7 @@ static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subv vfs_d_type(d.v->d_type)); if (ret) ctx->pos = d.k->p.offset + 1; - return ret; + return !ret; } int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) @@ -565,10 +714,61 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) if (ret2 > 0) continue; - ret2 ?: drop_locks_do(trans, bch2_dir_emit(ctx, dirent, target)); + ret2 ?: (bch2_trans_unlock(trans), bch2_dir_emit(ctx, dirent, target)); }))); bch2_bkey_buf_exit(&sk, c); return ret < 0 ? 
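/*
 * Aside: the rename path above keeps each directory's i_size (measured in
 * dirent key bytes) consistent by applying deltas: subtract the keys
 * being removed, add the keys being inserted, with the exchange case
 * adding both new keys.  Worked toy with invented sizes:
 */
#include <stdio.h>

int main(void)
{
	unsigned long src_dir_size = 4096, dst_dir_size = 8192;
	unsigned old_src = 40, old_dst = 48, new_dst = 40, new_src = 48;
	int exchange = 1, dst_existed = 1;

	if (dst_existed)
		dst_dir_size -= old_dst;	/* key overwritten at dst */
	src_dir_size -= old_src;		/* key leaving src */

	if (exchange)
		src_dir_size += new_src;	/* key arriving at src */
	dst_dir_size += new_dst;		/* key arriving at dst */

	printf("src=%lu dst=%lu\n", src_dir_size, dst_dir_size);
	return 0;
}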
ret : 0; } + +/* fsck */ + +static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inode_nr) + break; + if (!bkey_is_inode(k.k)) + continue; + ret = bch2_inode_unpack(k, inode); + goto found; + } + ret = -BCH_ERR_ENOENT_inode; +found: + bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_fsck_remove_dirent(struct btree_trans *trans, struct bpos pos) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bch_inode_unpacked dir_inode; + struct bch_hash_info dir_hash_info; + int ret; + + ret = lookup_first_inode(trans, pos.inode, &dir_inode); + if (ret) + goto err; + + dir_hash_info = bch2_hash_info_init(c, &dir_inode); + + bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); + + ret = bch2_btree_iter_traverse(trans, &iter) ?: + bch2_hash_delete_at(trans, bch2_dirent_hash_desc, + &dir_hash_info, &iter, + BTREE_UPDATE_internal_snapshot_node); + bch2_trans_iter_exit(trans, &iter); +err: + bch_err_fn(c, ret); + return ret; +} diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index a633f83c1ac7..d3e7ae669575 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -23,17 +23,30 @@ struct bch_fs; struct bch_hash_info; struct bch_inode_info; -struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d); +int bch2_casefold(struct btree_trans *, const struct bch_hash_info *, + const struct qstr *, struct qstr *); -static inline unsigned dirent_val_u64s(unsigned len) +static inline int bch2_maybe_casefold(struct btree_trans *trans, + const struct bch_hash_info *info, + const struct qstr *str, struct qstr *out_cf) { - return DIV_ROUND_UP(offsetof(struct bch_dirent, d_name) + len, - sizeof(u64)); + if (likely(!info->cf_encoding)) { + *out_cf = *str; + return 0; + } else { + return bch2_casefold(trans, info, str, out_cf); + } } -static inline unsigned int dirent_occupied_size(const struct qstr *name) +struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d); + +static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len) { - return (BKEY_U64s + dirent_val_u64s(name->len)) * sizeof(u64); + unsigned bytes = cf_len + ? 
offsetof(struct bch_dirent, d_cf_name_block.d_names) + len + cf_len + : offsetof(struct bch_dirent, d_name) + len; + + return DIV_ROUND_UP(bytes, sizeof(u64)); } int bch2_dirent_read_target(struct btree_trans *, subvol_inum, @@ -67,8 +80,8 @@ enum bch_rename_mode { }; int bch2_dirent_rename(struct btree_trans *, - subvol_inum, struct bch_hash_info *, - subvol_inum, struct bch_hash_info *, + subvol_inum, struct bch_hash_info *, u64 *, + subvol_inum, struct bch_hash_info *, u64 *, const struct qstr *, subvol_inum *, u64 *, const struct qstr *, subvol_inum *, u64 *, enum bch_rename_mode); @@ -84,4 +97,6 @@ int bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32); int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); +int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos); + #endif /* _BCACHEFS_DIRENT_H */ diff --git a/fs/bcachefs/dirent_format.h b/fs/bcachefs/dirent_format.h index 5e116b88e814..a46dbddd21aa 100644 --- a/fs/bcachefs/dirent_format.h +++ b/fs/bcachefs/dirent_format.h @@ -29,9 +29,25 @@ struct bch_dirent { * Copy of mode bits 12-15 from the target inode - so userspace can get * the filetype without having to do a stat() */ - __u8 d_type; +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u8 d_type:5, + d_unused:2, + d_casefold:1; +#elif defined(__BIG_ENDIAN_BITFIELD) + __u8 d_casefold:1, + d_unused:2, + d_type:5; +#endif - __u8 d_name[]; + union { + struct { + __u8 d_pad; + __le16 d_name_len; + __le16 d_cf_name_len; + __u8 d_names[]; + } d_cf_name_block __packed; + __DECLARE_FLEX_ARRAY(__u8, d_name); + } __packed; } __packed __aligned(8); #define DT_SUBVOL 16 diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index b32e91ba8be8..b3840ff7c407 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -68,23 +68,31 @@ static const char * const disk_accounting_type_strs[] = { NULL }; -static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos, - s64 *d, unsigned nr) +static inline void __accounting_key_init(struct bkey_i *k, struct bpos pos, + s64 *d, unsigned nr) { struct bkey_i_accounting *acc = bkey_accounting_init(k); - acc->k.p = disk_accounting_pos_to_bpos(pos); + acc->k.p = pos; set_bkey_val_u64s(&acc->k, sizeof(struct bch_accounting) / sizeof(u64) + nr); memcpy_u64s_small(acc->v.d, d, nr); } +static inline void accounting_key_init(struct bkey_i *k, struct disk_accounting_pos *pos, + s64 *d, unsigned nr) +{ + return __accounting_key_init(k, disk_accounting_pos_to_bpos(pos), d, nr); +} + static int bch2_accounting_update_sb_one(struct bch_fs *, struct bpos); int bch2_disk_accounting_mod(struct btree_trans *trans, struct disk_accounting_pos *k, s64 *d, unsigned nr, bool gc) { + BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS); + /* Normalize: */ switch (k->type) { case BCH_DISK_ACCOUNTING_replicas: @@ -92,21 +100,49 @@ int bch2_disk_accounting_mod(struct btree_trans *trans, break; } - BUG_ON(nr > BCH_ACCOUNTING_MAX_COUNTERS); + struct bpos pos = disk_accounting_pos_to_bpos(k); + + if (likely(!gc)) { + struct bkey_i_accounting *a; +#if 0 + for (a = btree_trans_subbuf_base(trans, &trans->accounting); + a != btree_trans_subbuf_top(trans, &trans->accounting); + a = (void *) bkey_next(&a->k_i)) + if (bpos_eq(a->k.p, pos)) { + BUG_ON(nr != bch2_accounting_counters(&a->k)); + acc_u64s(a->v.d, d, nr); + + if (bch2_accounting_key_is_zero(accounting_i_to_s_c(a))) { + unsigned offset = (u64 *) a - + (u64 *) btree_trans_subbuf_base(trans, 
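/*
 * Aside: the bch_dirent change above carves the old d_type byte into
 * bitfields with explicit little- and big-endian orderings, so the
 * on-disk bit layout is identical on both; existing keys with
 * d_casefold == 0 keep meaning plain names.  A sketch of the guard
 * pattern (the selector macro is set by arch headers in the kernel,
 * picked by hand here for the demo):
 */
#include <stdio.h>

#define DEMO_LITTLE_ENDIAN_BITFIELD 1

struct demo_flags {
#if DEMO_LITTLE_ENDIAN_BITFIELD
	unsigned char type:5, unused:2, casefold:1;
#else
	unsigned char casefold:1, unused:2, type:5;
#endif
};

int main(void)
{
	struct demo_flags f = { .type = 16, .casefold = 1 };

	/* on a typical little-endian ABI this byte is 0x90:
	 * type in bits 0-4, casefold in bit 7 */
	printf("0x%02x\n", *(unsigned char *)&f);
	return 0;
}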
&trans->accounting); + + trans->accounting.u64s -= a->k.u64s; + memmove_u64s_down(a, + bkey_next(&a->k_i), + trans->accounting.u64s - offset); + } + return 0; + } +#endif + unsigned u64s = sizeof(*a) / sizeof(u64) + nr; + a = bch2_trans_subbuf_alloc(trans, &trans->accounting, u64s); + int ret = PTR_ERR_OR_ZERO(a); + if (ret) + return ret; - struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; + __accounting_key_init(&a->k_i, pos, d, nr); + return 0; + } else { + struct { __BKEY_PADDED(k, BCH_ACCOUNTING_MAX_COUNTERS); } k_i; - accounting_key_init(&k_i.k, k, d, nr); + __accounting_key_init(&k_i.k, pos, d, nr); - if (unlikely(gc)) { int ret = bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); if (ret == -BCH_ERR_btree_insert_need_mark_replicas) ret = drop_locks_do(trans, bch2_accounting_update_sb_one(trans->c, disk_accounting_pos_to_bpos(k))) ?: bch2_accounting_mem_add(trans, bkey_i_to_s_c_accounting(&k_i.k), true); return ret; - } else { - return bch2_trans_update_buffered(trans, BTREE_ID_accounting, &k_i.k); } } @@ -114,10 +150,9 @@ int bch2_mod_dev_cached_sectors(struct btree_trans *trans, unsigned dev, s64 sectors, bool gc) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_replicas, - }; - + struct disk_accounting_pos acc; + memset(&acc, 0, sizeof(acc)); + acc.type = BCH_DISK_ACCOUNTING_replicas; bch2_replicas_entry_cached(&acc.replicas, dev); return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc); } @@ -135,6 +170,12 @@ static inline bool is_zero(char *start, char *end) #define field_end(p, member) (((void *) (&p.member)) + sizeof(p.member)) +static const unsigned bch2_accounting_type_nr_counters[] = { +#define x(f, id, nr) [BCH_DISK_ACCOUNTING_##f] = nr, + BCH_DISK_ACCOUNTING_TYPES() +#undef x +}; + int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, struct bkey_validate_context from) { @@ -193,6 +234,11 @@ int bch2_accounting_validate(struct bch_fs *c, struct bkey_s_c k, bkey_fsck_err_on(!is_zero(end, (void *) (&acc_k + 1)), c, accounting_key_junk_at_end, "junk at end of accounting key"); + + bkey_fsck_err_on(bch2_accounting_counters(k.k) != bch2_accounting_type_nr_counters[acc_k.type], + c, accounting_key_nr_counters_wrong, + "accounting key with %u counters, should be %u", + bch2_accounting_counters(k.k), bch2_accounting_type_nr_counters[acc_k.type]); fsck_err: return ret; } @@ -277,7 +323,7 @@ static inline bool accounting_to_replicas(struct bch_replicas_entry_v1 *r, struc static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p) { - struct bch_replicas_padded r; + union bch_replicas_padded r; return accounting_to_replicas(&r.e, p) ? 
bch2_mark_replicas(c, &r.e) : 0; @@ -289,14 +335,13 @@ static int bch2_accounting_update_sb_one(struct bch_fs *c, struct bpos p) */ int bch2_accounting_update_sb(struct btree_trans *trans) { - for (struct jset_entry *i = trans->journal_entries; - i != (void *) ((u64 *) trans->journal_entries + trans->journal_entries_u64s); - i = vstruct_next(i)) - if (jset_entry_is_key(i) && i->start->k.type == KEY_TYPE_accounting) { - int ret = bch2_accounting_update_sb_one(trans->c, i->start->k.p); - if (ret) - return ret; - } + for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); + i != btree_trans_subbuf_top(trans, &trans->accounting); + i = bkey_next(i)) { + int ret = bch2_accounting_update_sb_one(trans->c, i->k.p); + if (ret) + return ret; + } return 0; } @@ -351,7 +396,7 @@ err: int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, enum bch_accounting_mode mode) { - struct bch_replicas_padded r; + union bch_replicas_padded r; if (mode != BCH_ACCOUNTING_read && accounting_to_replicas(&r.e, a.k->p) && @@ -366,6 +411,19 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, return ret; } +int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounting a, + enum bch_accounting_mode mode) +{ + union bch_replicas_padded r; + + if (mode != BCH_ACCOUNTING_read && + accounting_to_replicas(&r.e, a.k->p) && + !bch2_replicas_marked_locked(c, &r.e)) + return -BCH_ERR_btree_insert_need_mark_replicas; + + return __bch2_accounting_mem_insert(c, a); +} + static bool accounting_mem_entry_is_zero(struct accounting_mem_entry *e) { for (unsigned i = 0; i < e->nr_counters; i++) @@ -415,10 +473,12 @@ int bch2_fs_replicas_usage_read(struct bch_fs *c, darray_char *usage) percpu_down_read(&c->mark_lock); darray_for_each(acc->k, i) { - struct { + union { + u8 bytes[struct_size_t(struct bch_replicas_usage, r.devs, + BCH_BKEY_PTRS_MAX)]; struct bch_replicas_usage r; - u8 pad[BCH_BKEY_PTRS_MAX]; } u; + u.r.r.nr_devs = BCH_BKEY_PTRS_MAX; if (!accounting_to_replicas(&u.r.r, i->pos)) continue; @@ -547,11 +607,11 @@ int bch2_gc_accounting_done(struct bch_fs *c) prt_str(&buf, "accounting mismatch for "); bch2_accounting_key_to_text(&buf, &acc_k); - prt_str(&buf, ": got"); + prt_str(&buf, ":\n got"); for (unsigned j = 0; j < nr; j++) prt_printf(&buf, " %llu", dst_v[j]); - prt_str(&buf, " should be"); + prt_str(&buf, "\nshould be"); for (unsigned j = 0; j < nr; j++) prt_printf(&buf, " %llu", src_v[j]); @@ -573,7 +633,7 @@ int bch2_gc_accounting_done(struct bch_fs *c) accounting_key_init(&k_i.k, &acc_k, src_v, nr); bch2_accounting_mem_mod_locked(trans, bkey_i_to_s_c_accounting(&k_i.k), - BCH_ACCOUNTING_normal); + BCH_ACCOUNTING_normal, true); preempt_disable(); struct bch_fs_usage_base *dst = this_cpu_ptr(c->usage); @@ -602,23 +662,23 @@ static int accounting_read_key(struct btree_trans *trans, struct bkey_s_c k) percpu_down_read(&c->mark_lock); int ret = bch2_accounting_mem_mod_locked(trans, bkey_s_c_to_accounting(k), - BCH_ACCOUNTING_read); + BCH_ACCOUNTING_read, false); percpu_up_read(&c->mark_lock); return ret; } static int bch2_disk_accounting_validate_late(struct btree_trans *trans, - struct disk_accounting_pos acc, + struct disk_accounting_pos *acc, u64 *v, unsigned nr) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; int ret = 0, invalid_dev = -1; - switch (acc.type) { + switch (acc->type) { case BCH_DISK_ACCOUNTING_replicas: { - struct bch_replicas_padded r; - __accounting_to_replicas(&r.e, &acc); + union 
bch_replicas_padded r; + __accounting_to_replicas(&r.e, acc); for (unsigned i = 0; i < r.e.nr_devs; i++) if (r.e.devs[i] != BCH_SB_MEMBER_INVALID && @@ -635,9 +695,9 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, if (fsck_err_on(!bch2_replicas_marked_locked(c, &r.e), trans, accounting_replicas_not_marked, - "accounting not marked in superblock replicas\n %s", + "accounting not marked in superblock replicas\n%s", (printbuf_reset(&buf), - bch2_accounting_key_to_text(&buf, &acc), + bch2_accounting_key_to_text(&buf, acc), buf.buf))) { /* * We're not RW yet and still single threaded, dropping @@ -653,8 +713,8 @@ static int bch2_disk_accounting_validate_late(struct btree_trans *trans, } case BCH_DISK_ACCOUNTING_dev_data_type: - if (!bch2_dev_exists(c, acc.dev_data_type.dev)) { - invalid_dev = acc.dev_data_type.dev; + if (!bch2_dev_exists(c, acc->dev_data_type.dev)) { + invalid_dev = acc->dev_data_type.dev; goto invalid_device; } break; @@ -665,16 +725,16 @@ fsck_err: return ret; invalid_device: if (fsck_err(trans, accounting_to_invalid_device, - "accounting entry points to invalid device %i\n %s", + "accounting entry points to invalid device %i\n%s", invalid_dev, (printbuf_reset(&buf), - bch2_accounting_key_to_text(&buf, &acc), + bch2_accounting_key_to_text(&buf, acc), buf.buf))) { for (unsigned i = 0; i < nr; i++) v[i] = -v[i]; ret = commit_do(trans, NULL, NULL, 0, - bch2_disk_accounting_mod(trans, &acc, v, nr, false)) ?: + bch2_disk_accounting_mod(trans, acc, v, nr, false)) ?: -BCH_ERR_remove_disk_accounting_entry; } else { ret = -BCH_ERR_remove_disk_accounting_entry; @@ -725,9 +785,11 @@ int bch2_accounting_read(struct bch_fs *c) if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) break; - if (!bch2_accounting_is_mem(acc_k)) { - struct disk_accounting_pos next = { .type = acc_k.type + 1 }; - bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); + if (!bch2_accounting_is_mem(&acc_k)) { + struct disk_accounting_pos next; + memset(&next, 0, sizeof(next)); + next.type = acc_k.type + 1; + bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next)); continue; } @@ -745,7 +807,7 @@ int bch2_accounting_read(struct bch_fs *c) struct disk_accounting_pos acc_k; bpos_to_disk_accounting_pos(&acc_k, i->k->k.p); - if (!bch2_accounting_is_mem(acc_k)) + if (!bch2_accounting_is_mem(&acc_k)) continue; struct bkey_s_c k = bkey_i_to_s_c(i->k); @@ -801,7 +863,7 @@ int bch2_accounting_read(struct bch_fs *c) */ ret = bch2_is_zero(v, sizeof(v[0]) * i->nr_counters) ? -BCH_ERR_remove_disk_accounting_entry - : bch2_disk_accounting_validate_late(trans, acc_k, v, i->nr_counters); + : bch2_disk_accounting_validate_late(trans, &acc_k, v, i->nr_counters); if (ret == -BCH_ERR_remove_disk_accounting_entry) { free_percpu(i->v[0]); @@ -882,15 +944,13 @@ int bch2_dev_usage_remove(struct bch_fs *c, unsigned dev) int bch2_dev_usage_init(struct bch_dev *ca, bool gc) { struct bch_fs *c = ca->fs; - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_dev_data_type, - .dev_data_type.dev = ca->dev_idx, - .dev_data_type.data_type = BCH_DATA_free, - }; u64 v[3] = { ca->mi.nbuckets - ca->mi.first_bucket, 0, 0 }; int ret = bch2_trans_do(c, ({ - bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), gc) ?: + bch2_disk_accounting_mod2(trans, gc, + v, dev_data_type, + .dev = ca->dev_idx, + .data_type = BCH_DATA_free) ?: (!gc ? 
bch2_trans_commit(trans, NULL, NULL, 0) : 0); })); bch_err_fn(c, ret); @@ -916,9 +976,11 @@ void bch2_verify_accounting_clean(struct bch_fs *c) if (acc_k.type >= BCH_DISK_ACCOUNTING_TYPE_NR) break; - if (!bch2_accounting_is_mem(acc_k)) { - struct disk_accounting_pos next = { .type = acc_k.type + 1 }; - bch2_btree_iter_set_pos(&iter, disk_accounting_pos_to_bpos(&next)); + if (!bch2_accounting_is_mem(&acc_k)) { + struct disk_accounting_pos next; + memset(&next, 0, sizeof(next)); + next.type = acc_k.type + 1; + bch2_btree_iter_set_pos(trans, &iter, disk_accounting_pos_to_bpos(&next)); continue; } diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index f4372cafea2e..f6098e33ab30 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -33,10 +33,12 @@ static inline bool bch2_accounting_key_is_zero(struct bkey_s_c_accounting a) static inline void bch2_accounting_accumulate(struct bkey_i_accounting *dst, struct bkey_s_c_accounting src) { - EBUG_ON(dst->k.u64s != src.k->u64s); - - for (unsigned i = 0; i < bch2_accounting_counters(&dst->k); i++) + for (unsigned i = 0; + i < min(bch2_accounting_counters(&dst->k), + bch2_accounting_counters(src.k)); + i++) dst->v.d[i] += src.v->d[i]; + if (bversion_cmp(dst->k.bversion, src.k->bversion) < 0) dst->k.bversion = src.k->bversion; } @@ -85,6 +87,24 @@ static inline struct bpos disk_accounting_pos_to_bpos(struct disk_accounting_pos int bch2_disk_accounting_mod(struct btree_trans *, struct disk_accounting_pos *, s64 *, unsigned, bool); + +#define disk_accounting_key_init(_k, _type, ...) \ +do { \ + memset(&(_k), 0, sizeof(_k)); \ + (_k).type = BCH_DISK_ACCOUNTING_##_type; \ + (_k)._type = (struct bch_acct_##_type) { __VA_ARGS__ }; \ +} while (0) + +#define bch2_disk_accounting_mod2_nr(_trans, _gc, _v, _nr, ...) \ +({ \ + struct disk_accounting_pos pos; \ + disk_accounting_key_init(pos, __VA_ARGS__); \ + bch2_disk_accounting_mod(trans, &pos, _v, _nr, _gc); \ +}) + +#define bch2_disk_accounting_mod2(_trans, _gc, _v, ...) 
\ + bch2_disk_accounting_mod2_nr(_trans, _gc, _v, ARRAY_SIZE(_v), __VA_ARGS__) + int bch2_mod_dev_cached_sectors(struct btree_trans *, unsigned, s64, bool); int bch2_accounting_validate(struct bch_fs *, struct bkey_s_c, @@ -116,12 +136,13 @@ enum bch_accounting_mode { }; int bch2_accounting_mem_insert(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); +int bch2_accounting_mem_insert_locked(struct bch_fs *, struct bkey_s_c_accounting, enum bch_accounting_mode); void bch2_accounting_mem_gc(struct bch_fs *); -static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc) +static inline bool bch2_accounting_is_mem(struct disk_accounting_pos *acc) { - return acc.type < BCH_DISK_ACCOUNTING_TYPE_NR && - acc.type != BCH_DISK_ACCOUNTING_inum; + return acc->type < BCH_DISK_ACCOUNTING_TYPE_NR && + acc->type != BCH_DISK_ACCOUNTING_inum; } /* @@ -130,7 +151,8 @@ static inline bool bch2_accounting_is_mem(struct disk_accounting_pos acc) */ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, struct bkey_s_c_accounting a, - enum bch_accounting_mode mode) + enum bch_accounting_mode mode, + bool write_locked) { struct bch_fs *c = trans->c; struct bch_accounting_mem *acc = &c->accounting; @@ -141,7 +163,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, if (gc && !acc->gc_running) return 0; - if (!bch2_accounting_is_mem(acc_k)) + if (!bch2_accounting_is_mem(&acc_k)) return 0; if (mode == BCH_ACCOUNTING_normal) { @@ -169,7 +191,11 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, while ((idx = eytzinger0_find(acc->k.data, acc->k.nr, sizeof(acc->k.data[0]), accounting_pos_cmp, &a.k->p)) >= acc->k.nr) { - int ret = bch2_accounting_mem_insert(c, a, mode); + int ret = 0; + if (unlikely(write_locked)) + ret = bch2_accounting_mem_insert_locked(c, a, mode); + else + ret = bch2_accounting_mem_insert(c, a, mode); if (ret) return ret; } @@ -186,7 +212,7 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, static inline int bch2_accounting_mem_add(struct btree_trans *trans, struct bkey_s_c_accounting a, bool gc) { percpu_down_read(&trans->c->mark_lock); - int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal); + int ret = bch2_accounting_mem_mod_locked(trans, a, gc ? BCH_ACCOUNTING_gc : BCH_ACCOUNTING_normal, false); percpu_up_read(&trans->c->mark_lock); return ret; } @@ -233,13 +259,13 @@ static inline int bch2_accounting_trans_commit_hook(struct btree_trans *trans, struct bkey_i_accounting *a, unsigned commit_flags) { - a->k.bversion = journal_pos_to_bversion(&trans->journal_res, - (u64 *) a - (u64 *) trans->journal_entries); + u64 *base = (u64 *) btree_trans_subbuf_base(trans, &trans->accounting); + a->k.bversion = journal_pos_to_bversion(&trans->journal_res, (u64 *) a - base); EBUG_ON(bversion_zero(a->k.bversion)); return likely(!(commit_flags & BCH_TRANS_COMMIT_skip_accounting_apply)) - ? bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal) + ? 
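/*
 * How the two macro layers above expand, using the bch2_dev_usage_init()
 * call from this patch as the example (expansion written out by hand;
 * sketch only):
 *
 *	bch2_disk_accounting_mod2(trans, gc, v, dev_data_type,
 *				  .dev = ca->dev_idx,
 *				  .data_type = BCH_DATA_free);
 *
 * becomes
 *
 *	struct disk_accounting_pos pos;
 *	memset(&pos, 0, sizeof(pos));
 *	pos.type = BCH_DISK_ACCOUNTING_dev_data_type;
 *	pos.dev_data_type = (struct bch_acct_dev_data_type) {
 *		.dev		= ca->dev_idx,
 *		.data_type	= BCH_DATA_free,
 *	};
 *	bch2_disk_accounting_mod(trans, &pos, v, ARRAY_SIZE(v), gc);
 *
 * The memset-then-assign style (used for every disk_accounting_pos in
 * this patch) zeroes padding bytes too, which a designated initializer
 * does not guarantee - presumably why the initializers were converted,
 * given that the pos is copied bit-for-bit into a bpos key.
 */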
bch2_accounting_mem_mod_locked(trans, accounting_i_to_s_c(a), BCH_ACCOUNTING_normal, false) : 0; } @@ -251,7 +277,7 @@ static inline void bch2_accounting_trans_commit_revert(struct btree_trans *trans struct bkey_s_accounting a = accounting_i_to_s(a_i); bch2_accounting_neg(a); - bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal); + bch2_accounting_mem_mod_locked(trans, a.c, BCH_ACCOUNTING_normal, false); bch2_accounting_neg(a); } } diff --git a/fs/bcachefs/disk_accounting_format.h b/fs/bcachefs/disk_accounting_format.h index 7b6e6c97e6aa..8269af1dbe2a 100644 --- a/fs/bcachefs/disk_accounting_format.h +++ b/fs/bcachefs/disk_accounting_format.h @@ -95,40 +95,81 @@ static inline bool data_type_is_hidden(enum bch_data_type type) } } +/* + * field 1: name + * field 2: id + * field 3: number of counters (max 3) + */ + #define BCH_DISK_ACCOUNTING_TYPES() \ - x(nr_inodes, 0) \ - x(persistent_reserved, 1) \ - x(replicas, 2) \ - x(dev_data_type, 3) \ - x(compression, 4) \ - x(snapshot, 5) \ - x(btree, 6) \ - x(rebalance_work, 7) \ - x(inum, 8) + x(nr_inodes, 0, 1) \ + x(persistent_reserved, 1, 1) \ + x(replicas, 2, 1) \ + x(dev_data_type, 3, 3) \ + x(compression, 4, 3) \ + x(snapshot, 5, 1) \ + x(btree, 6, 1) \ + x(rebalance_work, 7, 1) \ + x(inum, 8, 3) enum disk_accounting_type { -#define x(f, nr) BCH_DISK_ACCOUNTING_##f = nr, +#define x(f, nr, ...) BCH_DISK_ACCOUNTING_##f = nr, BCH_DISK_ACCOUNTING_TYPES() #undef x BCH_DISK_ACCOUNTING_TYPE_NR, }; -struct bch_nr_inodes { +/* + * No subtypes - number of inodes in the entire filesystem + * + * XXX: perhaps we could add a per-subvolume counter? + */ +struct bch_acct_nr_inodes { }; -struct bch_persistent_reserved { +/* + * Tracks KEY_TYPE_reservation sectors, broken out by number of replicas for the + * reservation: + */ +struct bch_acct_persistent_reserved { __u8 nr_replicas; }; -struct bch_dev_data_type { +/* + * device, data type counter fields: + * [ + * nr_buckets + * live sectors (in buckets of that data type) + * sectors of internal fragmentation + * ] + * + * XXX: live sectors should've been done differently, you can have multiple data + * types in the same bucket (user, stripe, cached) and this collapses them to + * the bucket data type, and makes the internal fragmentation counter redundant + */ +struct bch_acct_dev_data_type { __u8 dev; __u8 data_type; }; +/* + * Compression type fields: + * [ + * number of extents + * uncompressed size + * compressed size + * ] + * + * Compression ratio, average extent size (fragmentation). + */ struct bch_acct_compression { __u8 type; }; +/* + * On disk usage by snapshot id; counts same values as replicas counter, but + * aggregated differently + */ struct bch_acct_snapshot { __u32 id; } __packed; @@ -137,10 +178,27 @@ struct bch_acct_btree { __u32 id; } __packed; +/* + * inum counter fields: + * [ + * number of extents + * sum of extent sizes - bkey size + * this field is similar to inode.bi_sectors, except here extents in + * different snapshots but the same inode number are all collapsed to the + * same counter + * sum of on disk size - same values tracked by replicas counters + * ] + * + * This tracks on disk fragmentation. + */ struct bch_acct_inum { __u64 inum; } __packed; +/* + * Simple counter of the amount of data (on disk sectors) rebalance needs to + * move, extents counted here are also in the rebalance_work btree. 
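/*
 * Worked example for the compression counter layout documented above
 * (sketch, values hypothetical): with d[0..2] = { nr_extents,
 * uncompressed sectors, compressed sectors }, the compression ratio and
 * average extent size fall out directly.
 */
#include <stdint.h>
#include <stdio.h>

int main(void)
{
	uint64_t d[3] = { 1000, 81920, 20480 };

	printf("avg extent size: %llu sectors\n",
	       (unsigned long long) (d[1] / d[0]));
	printf("compression ratio: %.2fx\n",
	       (double) d[1] / (double) d[2]);
	return 0;
}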
+ */ struct bch_acct_rebalance_work { }; @@ -149,10 +207,10 @@ struct disk_accounting_pos { struct { __u8 type; union { - struct bch_nr_inodes nr_inodes; - struct bch_persistent_reserved persistent_reserved; + struct bch_acct_nr_inodes nr_inodes; + struct bch_acct_persistent_reserved persistent_reserved; struct bch_replicas_entry_v1 replicas; - struct bch_dev_data_type dev_data_type; + struct bch_acct_dev_data_type dev_data_type; struct bch_acct_compression compression; struct bch_acct_snapshot snapshot; struct bch_acct_btree btree; diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index 5df8de0b8c02..c20ecf5e5381 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -86,35 +86,6 @@ err: return ret; } -void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) -{ - out->atomic++; - rcu_read_lock(); - - struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); - if (!g) - goto out; - - for (unsigned i = 0; i < g->nr; i++) { - if (i) - prt_printf(out, " "); - - if (g->entries[i].deleted) { - prt_printf(out, "[deleted]"); - continue; - } - - prt_printf(out, "[parent %d devs", g->entries[i].parent); - for_each_member_device_rcu(c, ca, &g->entries[i].devs) - prt_printf(out, " %s", ca->name); - prt_printf(out, "]"); - } - -out: - rcu_read_unlock(); - out->atomic--; -} - static void bch2_sb_disk_groups_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) @@ -241,20 +212,13 @@ bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) case TARGET_DEV: return dev == t.dev; case TARGET_GROUP: { - struct bch_disk_groups_cpu *g; - const struct bch_devs_mask *m; - bool ret; - - rcu_read_lock(); - g = rcu_dereference(c->disk_groups); - m = g && t.group < g->nr && !g->entries[t.group].deleted + struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); + const struct bch_devs_mask *m = + g && t.group < g->nr && !g->entries[t.group].deleted ? &g->entries[t.group].devs : NULL; - ret = m ? test_bit(dev, m->d) : false; - rcu_read_unlock(); - - return ret; + return m ? test_bit(dev, m->d) : false; } default: BUG(); @@ -377,54 +341,81 @@ int bch2_disk_path_find_or_create(struct bch_sb_handle *sb, const char *name) return v; } -void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) +static void __bch2_disk_path_to_text(struct printbuf *out, struct bch_disk_groups_cpu *g, + unsigned v) { - struct bch_disk_groups_cpu *groups; - struct bch_disk_group_cpu *g; - unsigned nr = 0; u16 path[32]; - - out->atomic++; - rcu_read_lock(); - groups = rcu_dereference(c->disk_groups); - if (!groups) - goto invalid; + unsigned nr = 0; while (1) { if (nr == ARRAY_SIZE(path)) goto invalid; - if (v >= groups->nr) + if (v >= (g ? 
g->nr : 0)) goto invalid; - g = groups->entries + v; + struct bch_disk_group_cpu *e = g->entries + v; - if (g->deleted) + if (e->deleted) goto invalid; path[nr++] = v; - if (!g->parent) + if (!e->parent) break; - v = g->parent - 1; + v = e->parent - 1; } while (nr) { - v = path[--nr]; - g = groups->entries + v; + struct bch_disk_group_cpu *e = g->entries + path[--nr]; - prt_printf(out, "%.*s", (int) sizeof(g->label), g->label); + prt_printf(out, "%.*s", (int) sizeof(e->label), e->label); if (nr) prt_printf(out, "."); } -out: - rcu_read_unlock(); - out->atomic--; return; invalid: prt_printf(out, "invalid label %u", v); - goto out; +} + +void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) +{ + bch2_printbuf_make_room(out, 4096); + + out->atomic++; + rcu_read_lock(); + struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); + + for (unsigned i = 0; i < (g ? g->nr : 0); i++) { + prt_printf(out, "%2u: ", i); + + if (g->entries[i].deleted) { + prt_printf(out, "[deleted]"); + goto next; + } + + __bch2_disk_path_to_text(out, g, i); + + prt_printf(out, " devs"); + + for_each_member_device_rcu(c, ca, &g->entries[i].devs) + prt_printf(out, " %s", ca->name); +next: + prt_newline(out); + } + + rcu_read_unlock(); + out->atomic--; +} + +void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) +{ + out->atomic++; + rcu_read_lock(); + __bch2_disk_path_to_text(out, rcu_dereference(c->disk_groups), v), + rcu_read_unlock(); + --out->atomic; } void bch2_disk_path_to_text_sb(struct printbuf *out, struct bch_sb *sb, unsigned v) @@ -470,23 +461,22 @@ inval: int __bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) { - struct bch_member *mi; - int ret, v = -1; + lockdep_assert_held(&c->sb_lock); - if (!strlen(name) || !strcmp(name, "none")) - return 0; - v = bch2_disk_path_find_or_create(&c->disk_sb, name); - if (v < 0) - return v; + if (!strlen(name) || !strcmp(name, "none")) { + struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + SET_BCH_MEMBER_GROUP(mi, 0); + } else { + int v = bch2_disk_path_find_or_create(&c->disk_sb, name); + if (v < 0) + return v; - ret = bch2_sb_disk_groups_to_cpu(c); - if (ret) - return ret; + struct bch_member *mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + SET_BCH_MEMBER_GROUP(mi, v + 1); + } - mi = bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); - SET_BCH_MEMBER_GROUP(mi, v + 1); - return 0; + return bch2_sb_disk_groups_to_cpu(c); } int bch2_dev_group_set(struct bch_fs *c, struct bch_dev *ca, const char *name) @@ -555,14 +545,12 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) ? 
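/*
 * Standalone sketch of the two-phase walk __bch2_disk_path_to_text()
 * performs above: parent links are stored biased by one (0 means "no
 * parent"), so the walk pushes indices onto a small stack and then
 * prints them in reverse, dot-separated. Toy group table, hypothetical
 * labels:
 */
#include <stdio.h>

struct toy_group {
	unsigned	parent;		/* index + 1; 0 = top level */
	const char	*label;
};

static void toy_path_to_text(const struct toy_group *g, unsigned nr, unsigned v)
{
	unsigned path[32], n = 0;

	while (1) {
		if (n == 32 || v >= nr) {
			printf("invalid label %u\n", v);
			return;
		}
		path[n++] = v;
		if (!g[v].parent)
			break;
		v = g[v].parent - 1;
	}

	while (n) {
		printf("%s", g[path[--n]].label);
		if (n)
			printf(".");
	}
	printf("\n");
}

int main(void)
{
	const struct toy_group g[] = {
		{ 0, "ssd"  },		/* group 0, top level */
		{ 1, "fast" },		/* group 1, child of 0 */
	};

	toy_path_to_text(g, 2, 1);	/* prints "ssd.fast" */
	return 0;
}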
rcu_dereference(c->devs[t.dev]) : NULL; - if (ca && percpu_ref_tryget(&ca->io_ref)) { + if (ca && ca->disk_sb.bdev) prt_printf(out, "/dev/%s", ca->name); - percpu_ref_put(&ca->io_ref); - } else if (ca) { + else if (ca) prt_printf(out, "offline device %u", t.dev); - } else { + else prt_printf(out, "invalid device %u", t.dev); - } rcu_read_unlock(); out->atomic--; diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index d2a5e76e6479..c581426e3894 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -16,10 +16,12 @@ #include "disk_accounting.h" #include "disk_groups.h" #include "ec.h" +#include "enumerated_ref.h" #include "error.h" #include "io_read.h" #include "io_write.h" #include "keylist.h" +#include "lru.h" #include "recovery.h" #include "replicas.h" #include "super-io.h" @@ -104,6 +106,8 @@ struct ec_bio { struct bch_dev *ca; struct ec_stripe_buf *buf; size_t idx; + int rw; + u64 submit_time; struct bio bio; }; @@ -298,15 +302,27 @@ static int mark_stripe_bucket(struct btree_trans *trans, struct bpos bucket = PTR_BUCKET_POS(ca, ptr); if (flags & BTREE_TRIGGER_transactional) { + struct extent_ptr_decoded p = { + .ptr = *ptr, + .crc = bch2_extent_crc_unpack(s.k, NULL), + }; + struct bkey_i_backpointer bp; + bch2_extent_ptr_to_bp(c, BTREE_ID_stripes, 0, s.s_c, p, + (const union bch_extent_entry *) ptr, &bp); + struct bkey_i_alloc_v4 *a = bch2_trans_start_alloc_update(trans, bucket, 0); - ret = PTR_ERR_OR_ZERO(a) ?: - __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags); + ret = PTR_ERR_OR_ZERO(a) ?: + __mark_stripe_bucket(trans, ca, s, ptr_idx, deleting, bucket, &a->v, flags) ?: + bch2_bucket_backpointer_mod(trans, s.s_c, &bp, + !(flags & BTREE_TRIGGER_overwrite)); + if (ret) + goto err; } if (flags & BTREE_TRIGGER_gc) { struct bucket *g = gc_bucket(ca, bucket.offset); - if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", + if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n%s", ptr->dev, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ret = -BCH_ERR_mark_stripe; @@ -366,19 +382,6 @@ static int mark_stripe_buckets(struct btree_trans *trans, return 0; } -static inline void stripe_to_mem(struct stripe *m, const struct bch_stripe *s) -{ - m->sectors = le16_to_cpu(s->sectors); - m->algorithm = s->algorithm; - m->nr_blocks = s->nr_blocks; - m->nr_redundant = s->nr_redundant; - m->disk_label = s->disk_label; - m->blocks_nonempty = 0; - - for (unsigned i = 0; i < s->nr_blocks; i++) - m->blocks_nonempty += !!stripe_blockcount_get(s, i); -} - int bch2_trigger_stripe(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s _new, @@ -399,6 +402,15 @@ int bch2_trigger_stripe(struct btree_trans *trans, (new_s->nr_blocks != old_s->nr_blocks || new_s->nr_redundant != old_s->nr_redundant)); + if (flags & BTREE_TRIGGER_transactional) { + int ret = bch2_lru_change(trans, + BCH_LRU_STRIPE_FRAGMENTATION, + idx, + stripe_lru_pos(old_s), + stripe_lru_pos(new_s)); + if (ret) + return ret; + } if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { /* @@ -443,24 +455,25 @@ int bch2_trigger_stripe(struct btree_trans *trans, if (new_s) { s64 sectors = (u64) le16_to_cpu(new_s->sectors) * new_s->nr_redundant; - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_replicas, - }; + struct disk_accounting_pos acc; + memset(&acc, 0, sizeof(acc)); + acc.type = BCH_DISK_ACCOUNTING_replicas; bch2_bkey_to_replicas(&acc.replicas, new); int ret = bch2_disk_accounting_mod(trans, 
&acc, &sectors, 1, gc); if (ret) return ret; if (gc) - memcpy(&gc->r.e, &acc.replicas, replicas_entry_bytes(&acc.replicas)); + unsafe_memcpy(&gc->r.e, &acc.replicas, + replicas_entry_bytes(&acc.replicas), "VLA"); } if (old_s) { s64 sectors = -((s64) le16_to_cpu(old_s->sectors)) * old_s->nr_redundant; - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_replicas, - }; + struct disk_accounting_pos acc; + memset(&acc, 0, sizeof(acc)); + acc.type = BCH_DISK_ACCOUNTING_replicas; bch2_bkey_to_replicas(&acc.replicas, old); int ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, gc); if (ret) @@ -472,38 +485,6 @@ int bch2_trigger_stripe(struct btree_trans *trans, return ret; } - if (flags & BTREE_TRIGGER_atomic) { - struct stripe *m = genradix_ptr(&c->stripes, idx); - - if (!m) { - struct printbuf buf1 = PRINTBUF; - struct printbuf buf2 = PRINTBUF; - - bch2_bkey_val_to_text(&buf1, c, old); - bch2_bkey_val_to_text(&buf2, c, new); - bch_err_ratelimited(c, "error marking nonexistent stripe %llu while marking\n" - "old %s\n" - "new %s", idx, buf1.buf, buf2.buf); - printbuf_exit(&buf2); - printbuf_exit(&buf1); - bch2_inconsistent_error(c); - return -1; - } - - if (!new_s) { - bch2_stripes_heap_del(c, m, idx); - - memset(m, 0, sizeof(*m)); - } else { - stripe_to_mem(m, new_s); - - if (!old_s) - bch2_stripes_heap_insert(c, m, idx); - else - bch2_stripes_heap_update(c, m, idx); - } - } - return 0; } @@ -527,20 +508,14 @@ static const struct bch_extent_ptr *bkey_matches_stripe(struct bch_stripe *s, static bool extent_has_stripe_ptr(struct bkey_s_c k, u64 idx) { - switch (k.k->type) { - case KEY_TYPE_extent: { - struct bkey_s_c_extent e = bkey_s_c_to_extent(k); - const union bch_extent_entry *entry; - - extent_for_each_entry(e, entry) - if (extent_entry_type(entry) == - BCH_EXTENT_ENTRY_stripe_ptr && - entry->stripe_ptr.idx == idx) - return true; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; - break; - } - } + bkey_extent_entry_for_each(ptrs, entry) + if (extent_entry_type(entry) == + BCH_EXTENT_ENTRY_stripe_ptr && + entry->stripe_ptr.idx == idx) + return true; return false; } @@ -725,15 +700,20 @@ static void ec_block_endio(struct bio *bio) struct bch_extent_ptr *ptr = &v->ptrs[ec_bio->idx]; struct bch_dev *ca = ec_bio->ca; struct closure *cl = bio->bi_private; + int rw = ec_bio->rw; + unsigned ref = rw == READ + ? BCH_DEV_READ_REF_ec_block + : BCH_DEV_WRITE_REF_ec_block; - if (bch2_dev_io_err_on(bio->bi_status, ca, - bio_data_dir(bio) - ? BCH_MEMBER_ERROR_write - : BCH_MEMBER_ERROR_read, - "erasure coding %s error: %s", + bch2_account_io_completion(ca, bio_data_dir(bio), + ec_bio->submit_time, !bio->bi_status); + + if (bio->bi_status) { + bch_err_dev_ratelimited(ca, "erasure coding %s error: %s", str_write_read(bio_data_dir(bio)), - bch2_blk_status_to_str(bio->bi_status))) + bch2_blk_status_to_str(bio->bi_status)); clear_bit(ec_bio->idx, ec_bio->buf->valid); + } int stale = dev_ptr_stale(ca, ptr); if (stale) { @@ -745,7 +725,7 @@ static void ec_block_endio(struct bio *bio) } bio_put(&ec_bio->bio); - percpu_ref_put(&ca->io_ref); + enumerated_ref_put(&ca->io_ref[rw], ref); closure_put(cl); } @@ -759,8 +739,11 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, ? BCH_DATA_user : BCH_DATA_parity; int rw = op_is_write(opf); + unsigned ref = rw == READ + ?
BCH_DEV_READ_REF_ec_block + : BCH_DEV_WRITE_REF_ec_block; - struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw); + struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, rw, ref); if (!ca) { clear_bit(idx, buf->valid); return; @@ -796,6 +779,8 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, ec_bio->ca = ca; ec_bio->buf = buf; ec_bio->idx = idx; + ec_bio->rw = rw; + ec_bio->submit_time = local_clock(); ec_bio->bio.bi_iter.bi_sector = ptr->offset + buf->offset + (offset >> 9); ec_bio->bio.bi_end_io = ec_block_endio; @@ -804,14 +789,14 @@ static void ec_block_io(struct bch_fs *c, struct ec_stripe_buf *buf, bch2_bio_map(&ec_bio->bio, buf->data[idx] + offset, b); closure_get(cl); - percpu_ref_get(&ca->io_ref); + enumerated_ref_get(&ca->io_ref[rw], ref); submit_bio(&ec_bio->bio); offset += b; } - percpu_ref_put(&ca->io_ref); + enumerated_ref_put(&ca->io_ref[rw], ref); } static int get_stripe_key_trans(struct btree_trans *trans, u64 idx, @@ -917,26 +902,6 @@ err: static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) { - ec_stripes_heap n, *h = &c->ec_stripes_heap; - - if (idx >= h->size) { - if (!init_heap(&n, max(1024UL, roundup_pow_of_two(idx + 1)), gfp)) - return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; - - mutex_lock(&c->ec_stripes_heap_lock); - if (n.size > h->size) { - memcpy(n.data, h->data, h->nr * sizeof(h->data[0])); - n.nr = h->nr; - swap(*h, n); - } - mutex_unlock(&c->ec_stripes_heap_lock); - - free_heap(&n); - } - - if (!genradix_ptr_alloc(&c->stripes, idx, gfp)) - return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; - if (c->gc_pos.phase != GC_PHASE_not_running && !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; @@ -1009,188 +974,58 @@ static void bch2_stripe_close(struct bch_fs *c, struct ec_stripe_new *s) s->idx = 0; } -/* Heap of all existing stripes, ordered by blocks_nonempty */ - -static u64 stripe_idx_to_delete(struct bch_fs *c) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - - lockdep_assert_held(&c->ec_stripes_heap_lock); - - if (h->nr && - h->data[0].blocks_nonempty == 0 && - !bch2_stripe_is_open(c, h->data[0].idx)) - return h->data[0].idx; - - return 0; -} - -static inline void ec_stripes_heap_set_backpointer(ec_stripes_heap *h, - size_t i) -{ - struct bch_fs *c = container_of(h, struct bch_fs, ec_stripes_heap); - - genradix_ptr(&c->stripes, h->data[i].idx)->heap_idx = i; -} - -static inline bool ec_stripes_heap_cmp(const void *l, const void *r, void __always_unused *args) -{ - struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l; - struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r; - - return ((_l->blocks_nonempty > _r->blocks_nonempty) < - (_l->blocks_nonempty < _r->blocks_nonempty)); -} - -static inline void ec_stripes_heap_swap(void *l, void *r, void *h) -{ - struct ec_stripe_heap_entry *_l = (struct ec_stripe_heap_entry *)l; - struct ec_stripe_heap_entry *_r = (struct ec_stripe_heap_entry *)r; - ec_stripes_heap *_h = (ec_stripes_heap *)h; - size_t i = _l - _h->data; - size_t j = _r - _h->data; - - swap(*_l, *_r); - - ec_stripes_heap_set_backpointer(_h, i); - ec_stripes_heap_set_backpointer(_h, j); -} - -static const struct min_heap_callbacks callbacks = { - .less = ec_stripes_heap_cmp, - .swp = ec_stripes_heap_swap, -}; - -static void heap_verify_backpointer(struct bch_fs *c, size_t idx) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - struct stripe *m = genradix_ptr(&c->stripes, idx); - - BUG_ON(m->heap_idx >= h->nr); - BUG_ON(h->data[m->heap_idx].idx != idx); 
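/*
 * Reference-counting shape of ec_block_io()/ec_block_endio() above -
 * the standard one-ref-per-in-flight-bio pattern, now against the
 * enumerated per-device io_ref (pseudo-code sketch, not verbatim):
 *
 *	ca = bch2_dev_get_ioref(c, dev, rw, ref);	// submit path's ref
 *	if (!ca)
 *		return;
 *	while (more chunks) {
 *		enumerated_ref_get(&ca->io_ref[rw], ref);  // bio's own ref
 *		submit_bio(...);	// dropped in ec_block_endio()
 *	}
 *	enumerated_ref_put(&ca->io_ref[rw], ref);	// submit path done
 *
 * so the device stays pinned exactly as long as the last outstanding
 * bio, and with ENUMERATED_REF_DEBUG a leak is attributed to
 * BCH_DEV_{READ,WRITE}_REF_ec_block specifically.
 */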
-} - -void bch2_stripes_heap_del(struct bch_fs *c, - struct stripe *m, size_t idx) -{ - mutex_lock(&c->ec_stripes_heap_lock); - heap_verify_backpointer(c, idx); - - min_heap_del(&c->ec_stripes_heap, m->heap_idx, &callbacks, &c->ec_stripes_heap); - mutex_unlock(&c->ec_stripes_heap_lock); -} - -void bch2_stripes_heap_insert(struct bch_fs *c, - struct stripe *m, size_t idx) -{ - mutex_lock(&c->ec_stripes_heap_lock); - BUG_ON(min_heap_full(&c->ec_stripes_heap)); - - genradix_ptr(&c->stripes, idx)->heap_idx = c->ec_stripes_heap.nr; - min_heap_push(&c->ec_stripes_heap, &((struct ec_stripe_heap_entry) { - .idx = idx, - .blocks_nonempty = m->blocks_nonempty, - }), - &callbacks, - &c->ec_stripes_heap); - - heap_verify_backpointer(c, idx); - mutex_unlock(&c->ec_stripes_heap_lock); -} - -void bch2_stripes_heap_update(struct bch_fs *c, - struct stripe *m, size_t idx) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - bool do_deletes; - size_t i; - - mutex_lock(&c->ec_stripes_heap_lock); - heap_verify_backpointer(c, idx); - - h->data[m->heap_idx].blocks_nonempty = m->blocks_nonempty; - - i = m->heap_idx; - min_heap_sift_up(h, i, &callbacks, &c->ec_stripes_heap); - min_heap_sift_down(h, i, &callbacks, &c->ec_stripes_heap); - - heap_verify_backpointer(c, idx); - - do_deletes = stripe_idx_to_delete(c) != 0; - mutex_unlock(&c->ec_stripes_heap_lock); - - if (do_deletes) - bch2_do_stripe_deletes(c); -} - /* stripe deletion */ static int ec_stripe_delete(struct btree_trans *trans, u64 idx) { - struct bch_fs *c = trans->c; struct btree_iter iter; - struct bkey_s_c k; - struct bkey_s_c_stripe s; - int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_stripes, POS(0, idx), - BTREE_ITER_intent); - ret = bkey_err(k); + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, + BTREE_ID_stripes, POS(0, idx), + BTREE_ITER_intent); + int ret = bkey_err(k); if (ret) goto err; - if (k.k->type != KEY_TYPE_stripe) { - bch2_fs_inconsistent(c, "attempting to delete nonexistent stripe %llu", idx); - ret = -EINVAL; - goto err; - } - - s = bkey_s_c_to_stripe(k); - for (unsigned i = 0; i < s.v->nr_blocks; i++) - if (stripe_blockcount_get(s.v, i)) { - struct printbuf buf = PRINTBUF; - - bch2_bkey_val_to_text(&buf, c, k); - bch2_fs_inconsistent(c, "attempting to delete nonempty stripe %s", buf.buf); - printbuf_exit(&buf); - ret = -EINVAL; - goto err; - } - - ret = bch2_btree_delete_at(trans, &iter, 0); + /* + * We expect write buffer races here + * Important: check stripe_is_open with stripe key locked: + */ + if (k.k->type == KEY_TYPE_stripe && + !bch2_stripe_is_open(trans->c, idx) && + stripe_lru_pos(bkey_s_c_to_stripe(k).v) == 1) + ret = bch2_btree_delete_at(trans, &iter, 0); err: bch2_trans_iter_exit(trans, &iter); return ret; } +/* + * XXX + * can we kill this and delete stripes from the trigger? 
+ */ static void ec_stripe_delete_work(struct work_struct *work) { struct bch_fs *c = container_of(work, struct bch_fs, ec_stripe_delete_work); - while (1) { - mutex_lock(&c->ec_stripes_heap_lock); - u64 idx = stripe_idx_to_delete(c); - mutex_unlock(&c->ec_stripes_heap_lock); - - if (!idx) - break; - - int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - ec_stripe_delete(trans, idx)); - bch_err_fn(c, ret); - if (ret) - break; - } - - bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); + bch2_trans_run(c, + bch2_btree_write_buffer_tryflush(trans) ?: + for_each_btree_key_max_commit(trans, lru_iter, BTREE_ID_lru, + lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, 0), + lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 1, LRU_TIME_MAX), + 0, lru_k, + NULL, NULL, + BCH_TRANS_COMMIT_no_enospc, ({ + ec_stripe_delete(trans, lru_k.k->p.offset); + }))); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete); } void bch2_do_stripe_deletes(struct bch_fs *c) { - if (bch2_write_ref_tryget(c, BCH_WRITE_REF_stripe_delete) && + if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_stripe_delete) && !queue_work(c->write_ref_wq, &c->ec_stripe_delete_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_stripe_delete); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete); } /* stripe creation: */ @@ -1294,7 +1129,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, bch2_fs_inconsistent(c, "%s", buf.buf); printbuf_exit(&buf); - return -EIO; + return -BCH_ERR_erasure_coding_found_btree_node; } k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed); @@ -1360,7 +1195,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); if (!ca) - return -EIO; + return -BCH_ERR_ENOENT_dev_not_found; struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); @@ -1380,8 +1215,12 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b if (bp_k.k->type != KEY_TYPE_backpointer) continue; + struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(bp_k); + if (bp.v->btree_id == BTREE_ID_stripes) + continue; + ec_stripe_update_extent(trans, ca, bucket_pos, ptr.gen, s, - bkey_s_c_to_backpointer(bp_k), &last_flushed); + bp, &last_flushed); })); bch2_bkey_buf_exit(&last_flushed, c); @@ -1393,21 +1232,19 @@ static int ec_stripe_update_extents(struct bch_fs *c, struct ec_stripe_buf *s) { struct btree_trans *trans = bch2_trans_get(c); struct bch_stripe *v = &bkey_i_to_stripe(&s->key)->v; - unsigned i, nr_data = v->nr_blocks - v->nr_redundant; - int ret = 0; + unsigned nr_data = v->nr_blocks - v->nr_redundant; - ret = bch2_btree_write_buffer_flush_sync(trans); + int ret = bch2_btree_write_buffer_flush_sync(trans); if (ret) goto err; - for (i = 0; i < nr_data; i++) { + for (unsigned i = 0; i < nr_data; i++) { ret = ec_stripe_update_bucket(trans, s, i); if (ret) break; } err: bch2_trans_put(trans); - return ret; } @@ -1416,7 +1253,8 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c, unsigned block, struct open_bucket *ob) { - struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE); + struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE, + BCH_DEV_WRITE_REF_ec_bucket_zero); if (!ca) { s->err = -BCH_ERR_erofs_no_writes; return; @@ -1432,7 +1270,7 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c, ob->sectors_free, GFP_KERNEL, 0); - percpu_ref_put(&ca->io_ref); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_ec_bucket_zero); if (ret) s->err = ret; @@ -1473,6 +1311,7 
@@ static void ec_stripe_create(struct ec_stripe_new *s) if (s->err) { if (!bch2_err_matches(s->err, EROFS)) bch_err(c, "error creating stripe: error writing data buckets"); + ret = s->err; goto err; } @@ -1481,6 +1320,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ec_do_recov(c, &s->existing_stripe)) { bch_err(c, "error creating stripe: error reading existing stripe"); + ret = -BCH_ERR_ec_block_read; goto err; } @@ -1506,6 +1346,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ec_nr_failed(&s->new_stripe)) { bch_err(c, "error creating stripe: error writing redundancy buckets"); + ret = -BCH_ERR_ec_block_write; goto err; } @@ -1527,6 +1368,8 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ret) goto err; err: + trace_stripe_create(c, s->idx, ret); + bch2_disk_reservation_put(c, &s->res); for (i = 0; i < v->nr_blocks; i++) @@ -1577,15 +1420,15 @@ static void ec_stripe_create_work(struct work_struct *work) while ((s = get_pending_stripe(c))) ec_stripe_create(s); - bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create); } void bch2_ec_do_stripe_creates(struct bch_fs *c) { - bch2_write_ref_get(c, BCH_WRITE_REF_stripe_create); + enumerated_ref_get(&c->writes, BCH_WRITE_REF_stripe_create); if (!queue_work(system_long_wq, &c->ec_stripe_create_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_stripe_create); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_create); } static void ec_stripe_new_set_pending(struct bch_fs *c, struct ec_stripe_head *h) @@ -1612,11 +1455,11 @@ static void ec_stripe_new_cancel(struct bch_fs *c, struct ec_stripe_head *h, int ec_stripe_new_set_pending(c, h); } -void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob) +void bch2_ec_bucket_cancel(struct bch_fs *c, struct open_bucket *ob, int err) { struct ec_stripe_new *s = ob->ec; - s->err = -EIO; + s->err = err; } void *bch2_writepoint_ec_buf(struct bch_fs *c, struct write_point *wp) @@ -1875,23 +1718,32 @@ err: } static int new_stripe_alloc_buckets(struct btree_trans *trans, + struct alloc_request *req, struct ec_stripe_head *h, struct ec_stripe_new *s, - enum bch_watermark watermark, struct closure *cl) + struct closure *cl) { struct bch_fs *c = trans->c; - struct bch_devs_mask devs = h->devs; struct open_bucket *ob; - struct open_buckets buckets; struct bch_stripe *v = &bkey_i_to_stripe(&s->new_stripe.key)->v; unsigned i, j, nr_have_parity = 0, nr_have_data = 0; - bool have_cache = true; int ret = 0; + req->scratch_data_type = req->data_type; + req->scratch_ptrs = req->ptrs; + req->scratch_nr_replicas = req->nr_replicas; + req->scratch_nr_effective = req->nr_effective; + req->scratch_have_cache = req->have_cache; + req->scratch_devs_may_alloc = req->devs_may_alloc; + + req->devs_may_alloc = h->devs; + req->have_cache = true; + BUG_ON(v->nr_blocks != s->nr_data + s->nr_parity); BUG_ON(v->nr_redundant != s->nr_parity); /* * We bypass the sector allocator which normally does this: */ - bitmap_and(devs.d, devs.d, c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); + bitmap_and(req->devs_may_alloc.d, req->devs_may_alloc.d, + c->rw_devs[BCH_DATA_user].d, BCH_SB_MEMBERS_MAX); for_each_set_bit(i, s->blocks_gotten, v->nr_blocks) { /* @@ -1901,7 +1753,7 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, * block when updating the stripe */ if (v->ptrs[i].dev != BCH_SB_MEMBER_INVALID) - __clear_bit(v->ptrs[i].dev, devs.d); + __clear_bit(v->ptrs[i].dev, req->devs_may_alloc.d); if (i < s->nr_data) 
nr_have_data++; @@ -1912,95 +1764,94 @@ static int new_stripe_alloc_buckets(struct btree_trans *trans, BUG_ON(nr_have_data > s->nr_data); BUG_ON(nr_have_parity > s->nr_parity); - buckets.nr = 0; + req->ptrs.nr = 0; if (nr_have_parity < s->nr_parity) { - ret = bch2_bucket_alloc_set_trans(trans, &buckets, - &h->parity_stripe, - &devs, - s->nr_parity, - &nr_have_parity, - &have_cache, 0, - BCH_DATA_parity, - watermark, - cl); - - open_bucket_for_each(c, &buckets, ob, i) { + req->nr_replicas = s->nr_parity; + req->nr_effective = nr_have_parity; + req->data_type = BCH_DATA_parity; + + ret = bch2_bucket_alloc_set_trans(trans, req, &h->parity_stripe, cl); + + open_bucket_for_each(c, &req->ptrs, ob, i) { j = find_next_zero_bit(s->blocks_gotten, s->nr_data + s->nr_parity, s->nr_data); BUG_ON(j >= s->nr_data + s->nr_parity); - s->blocks[j] = buckets.v[i]; + s->blocks[j] = req->ptrs.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, s->blocks_gotten); } if (ret) - return ret; + goto err; } - buckets.nr = 0; + req->ptrs.nr = 0; if (nr_have_data < s->nr_data) { - ret = bch2_bucket_alloc_set_trans(trans, &buckets, - &h->block_stripe, - &devs, - s->nr_data, - &nr_have_data, - &have_cache, 0, - BCH_DATA_user, - watermark, - cl); - - open_bucket_for_each(c, &buckets, ob, i) { + req->nr_replicas = s->nr_data; + req->nr_effective = nr_have_data; + req->data_type = BCH_DATA_user; + + ret = bch2_bucket_alloc_set_trans(trans, req, &h->block_stripe, cl); + + open_bucket_for_each(c, &req->ptrs, ob, i) { j = find_next_zero_bit(s->blocks_gotten, s->nr_data, 0); BUG_ON(j >= s->nr_data); - s->blocks[j] = buckets.v[i]; + s->blocks[j] = req->ptrs.v[i]; v->ptrs[j] = bch2_ob_ptr(c, ob); __set_bit(j, s->blocks_gotten); } if (ret) - return ret; + goto err; } - - return 0; +err: + req->data_type = req->scratch_data_type; + req->ptrs = req->scratch_ptrs; + req->nr_replicas = req->scratch_nr_replicas; + req->nr_effective = req->scratch_nr_effective; + req->have_cache = req->scratch_have_cache; + req->devs_may_alloc = req->scratch_devs_may_alloc; + return ret; } -static s64 get_existing_stripe(struct bch_fs *c, - struct ec_stripe_head *head) +static int __get_existing_stripe(struct btree_trans *trans, + struct ec_stripe_head *head, + struct ec_stripe_buf *stripe, + u64 idx) { - ec_stripes_heap *h = &c->ec_stripes_heap; - struct stripe *m; - size_t heap_idx; - u64 stripe_idx; - s64 ret = -1; - - if (may_create_new_stripe(c)) - return -1; + struct bch_fs *c = trans->c; - mutex_lock(&c->ec_stripes_heap_lock); - for (heap_idx = 0; heap_idx < h->nr; heap_idx++) { - /* No blocks worth reusing, stripe will just be deleted: */ - if (!h->data[heap_idx].blocks_nonempty) - continue; + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, + BTREE_ID_stripes, POS(0, idx), 0); + int ret = bkey_err(k); + if (ret) + goto err; - stripe_idx = h->data[heap_idx].idx; + /* We expect write buffer races here */ + if (k.k->type != KEY_TYPE_stripe) + goto out; - m = genradix_ptr(&c->stripes, stripe_idx); + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + if (stripe_lru_pos(s.v) <= 1) + goto out; - if (m->disk_label == head->disk_label && - m->algorithm == head->algo && - m->nr_redundant == head->redundancy && - m->sectors == head->blocksize && - m->blocks_nonempty < m->nr_blocks - m->nr_redundant && - bch2_try_open_stripe(c, head->s, stripe_idx)) { - ret = stripe_idx; - break; - } + if (s.v->disk_label == head->disk_label && + s.v->algorithm == head->algo && + s.v->nr_redundant == head->redundancy && + 
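/*
 * Pattern note for new_stripe_alloc_buckets() above: it borrows the
 * caller's alloc_request instead of building its own, so each field it
 * overrides is snapshotted into the matching scratch_* member and
 * restored on the shared err path - on success as well as on error
 * (sketch):
 *
 *	req->scratch_X = req->X;	// for every clobbered field
 *	req->X = <stripe-specific value>;
 *	ret = allocate(...);
 * err:
 *	req->X = req->scratch_X;	// unconditional restore
 *	return ret;
 */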
le16_to_cpu(s.v->sectors) == head->blocksize && + bch2_try_open_stripe(c, head->s, idx)) { + bkey_reassemble(&stripe->key, k); + ret = 1; } - mutex_unlock(&c->ec_stripes_heap_lock); +out: + bch2_set_btree_iter_dontneed(trans, &iter); +err: + bch2_trans_iter_exit(trans, &iter); return ret; } @@ -2052,24 +1903,33 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri struct ec_stripe_new *s) { struct bch_fs *c = trans->c; - s64 idx; - int ret; /* * If we can't allocate a new stripe, and there's no stripes with empty * blocks for us to reuse, that means we have to wait on copygc: */ - idx = get_existing_stripe(c, h); - if (idx < 0) - return -BCH_ERR_stripe_alloc_blocked; + if (may_create_new_stripe(c)) + return -1; - ret = get_stripe_key_trans(trans, idx, &s->existing_stripe); - bch2_fs_fatal_err_on(ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart), c, - "reading stripe key: %s", bch2_err_str(ret)); - if (ret) { - bch2_stripe_close(c, s); - return ret; + struct btree_iter lru_iter; + struct bkey_s_c lru_k; + int ret = 0; + + for_each_btree_key_max_norestart(trans, lru_iter, BTREE_ID_lru, + lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, 0), + lru_pos(BCH_LRU_STRIPE_FRAGMENTATION, 2, LRU_TIME_MAX), + 0, lru_k, ret) { + ret = __get_existing_stripe(trans, h, &s->existing_stripe, lru_k.k->p.offset); + if (ret) + break; } + bch2_trans_iter_exit(trans, &lru_iter); + if (!ret) + ret = -BCH_ERR_stripe_alloc_blocked; + if (ret == 1) + ret = 0; + if (ret) + return ret; return init_new_stripe_from_existing(c, s); } @@ -2102,7 +1962,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st if (bkey_gt(k.k->p, POS(0, U32_MAX))) { if (start_pos.offset) { start_pos = min_pos; - bch2_btree_iter_set_pos(&iter, start_pos); + bch2_btree_iter_set_pos(trans, &iter, start_pos); continue; } @@ -2136,17 +1996,15 @@ err: } struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, - unsigned target, + struct alloc_request *req, unsigned algo, - unsigned redundancy, - enum bch_watermark watermark, struct closure *cl) { struct bch_fs *c = trans->c; - struct ec_stripe_head *h; - bool waiting = false; + unsigned redundancy = req->nr_replicas - 1; unsigned disk_label = 0; - struct target t = target_decode(target); + struct target t = target_decode(req->target); + bool waiting = false; int ret; if (t.type == TARGET_GROUP) { @@ -2157,7 +2015,9 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, disk_label = t.group + 1; /* 0 == no label */ } - h = __bch2_ec_stripe_head_get(trans, disk_label, algo, redundancy, watermark); + struct ec_stripe_head *h = + __bch2_ec_stripe_head_get(trans, disk_label, algo, + redundancy, req->watermark); if (IS_ERR_OR_NULL(h)) return h; @@ -2181,8 +2041,12 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, goto alloc_existing; /* First, try to allocate a full stripe: */ - ret = new_stripe_alloc_buckets(trans, h, s, BCH_WATERMARK_stripe, NULL) ?: + enum bch_watermark saved_watermark = BCH_WATERMARK_stripe; + swap(req->watermark, saved_watermark); + ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?: __bch2_ec_stripe_head_reserve(trans, h, s); + swap(req->watermark, saved_watermark); + if (!ret) goto allocate_buf; if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || @@ -2200,8 +2064,8 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, if (waiting || !cl || ret != -BCH_ERR_stripe_alloc_blocked) goto err; - if (watermark == 
BCH_WATERMARK_copygc) { - ret = new_stripe_alloc_buckets(trans, h, s, watermark, NULL) ?: + ret = new_stripe_alloc_buckets(trans, req, h, s, NULL) ?: __bch2_ec_stripe_head_reserve(trans, h, s); if (ret) goto err; @@ -2220,7 +2084,7 @@ alloc_existing: * Retry allocating buckets, with the watermark for this * particular write: */ - ret = new_stripe_alloc_buckets(trans, h, s, watermark, cl); + ret = new_stripe_alloc_buckets(trans, req, h, s, cl); if (ret) goto err; @@ -2242,67 +2106,106 @@ err: /* device removal */ -static int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, struct bkey_s_c k_a) +int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k, + unsigned dev_idx, + unsigned flags) { - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert); - - if (!a->stripe) + if (k.k->type != KEY_TYPE_stripe) return 0; - if (a->stripe_sectors) { - bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data"); - return -BCH_ERR_invalidate_stripe_to_dev; - } - - struct btree_iter iter; struct bkey_i_stripe *s = - bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe), - BTREE_ITER_slots, stripe); + bch2_bkey_make_mut_typed(trans, iter, &k, 0, stripe); int ret = PTR_ERR_OR_ZERO(s); if (ret) return ret; - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_replicas, - }; + struct disk_accounting_pos acc; s64 sectors = 0; for (unsigned i = 0; i < s->v.nr_blocks; i++) sectors -= stripe_blockcount_get(&s->v, i); + memset(&acc, 0, sizeof(acc)); + acc.type = BCH_DISK_ACCOUNTING_replicas; bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); acc.replicas.data_type = BCH_DATA_user; ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); if (ret) - goto err; + return ret; struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(&s->k_i)); - bkey_for_each_ptr(ptrs, ptr) - if (ptr->dev == k_a.k->p.inode) + + /* XXX: how much redundancy do we still have?
check degraded flags */ + + unsigned nr_good = 0; + + rcu_read_lock(); + bkey_for_each_ptr(ptrs, ptr) { + if (ptr->dev == dev_idx) ptr->dev = BCH_SB_MEMBER_INVALID; + struct bch_dev *ca = bch2_dev_rcu(trans->c, ptr->dev); + nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed; + } + rcu_read_unlock(); + + if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED)) + return -BCH_ERR_remove_would_lose_data; + + unsigned nr_data = s->v.nr_blocks - s->v.nr_redundant; + + if (nr_good < nr_data && !(flags & BCH_FORCE_IF_DATA_LOST)) + return -BCH_ERR_remove_would_lose_data; + sectors = -sectors; + memset(&acc, 0, sizeof(acc)); + acc.type = BCH_DISK_ACCOUNTING_replicas; bch2_bkey_to_replicas(&acc.replicas, bkey_i_to_s_c(&s->k_i)); acc.replicas.data_type = BCH_DATA_user; - ret = bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); + return bch2_disk_accounting_mod(trans, &acc, &sectors, 1, false); +} + +static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, struct bkey_s_c k_a, + unsigned flags) +{ + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k_a, &a_convert); + + if (!a->stripe) + return 0; + + if (a->stripe_sectors) { + bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data"); + return -BCH_ERR_invalidate_stripe_to_dev; + } + + struct btree_iter iter; + struct bkey_s_c_stripe s = + bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe), + BTREE_ITER_slots, stripe); + int ret = bkey_err(s); if (ret) - return ret; + + ret = bch2_invalidate_stripe_to_dev(trans, &iter, s.s_c, k_a.k->p.inode, flags); bch2_trans_iter_exit(trans, &iter); return ret; } -int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx) +int bch2_dev_remove_stripes(struct bch_fs *c, unsigned dev_idx, unsigned flags) { - return bch2_trans_run(c, + int ret = bch2_trans_run(c, for_each_btree_key_max_commit(trans, iter, BTREE_ID_alloc, POS(dev_idx, 0), POS(dev_idx, U64_MAX), BTREE_ITER_intent, k, NULL, NULL, 0, ({ - bch2_invalidate_stripe_to_dev(trans, k); + bch2_invalidate_stripe_to_dev_from_alloc(trans, k, flags); }))); + bch_err_fn(c, ret); + return ret; } /* startup/shutdown */ @@ -2351,10 +2254,10 @@ void bch2_fs_ec_stop(struct bch_fs *c) static bool bch2_fs_ec_flush_done(struct bch_fs *c) { - bool ret; + sched_annotate_sleep(); mutex_lock(&c->ec_stripe_new_lock); - ret = list_empty(&c->ec_stripe_new_list); + bool ret = list_empty(&c->ec_stripe_new_list); mutex_unlock(&c->ec_stripe_new_lock); return ret; @@ -2367,46 +2270,7 @@ void bch2_fs_ec_flush(struct bch_fs *c) int bch2_stripes_read(struct bch_fs *c) { - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_stripes, POS_MIN, - BTREE_ITER_prefetch, k, ({ - if (k.k->type != KEY_TYPE_stripe) - continue; - - ret = __ec_stripe_mem_alloc(c, k.k->p.offset, GFP_KERNEL); - if (ret) - break; - - struct stripe *m = genradix_ptr(&c->stripes, k.k->p.offset); - - stripe_to_mem(m, bkey_s_c_to_stripe(k).v); - - bch2_stripes_heap_insert(c, m, k.k->p.offset); - 0; - }))); - bch_err_fn(c, ret); - return ret; -} - -void bch2_stripes_heap_to_text(struct printbuf *out, struct bch_fs *c) -{ - ec_stripes_heap *h = &c->ec_stripes_heap; - struct stripe *m; - size_t i; - - mutex_lock(&c->ec_stripes_heap_lock); - for (i = 0; i < min_t(size_t, h->nr, 50); i++) { - m = genradix_ptr(&c->stripes, h->data[i].idx); - - prt_printf(out, "%zu %u/%u+%u", h->data[i].idx, - h->data[i].blocks_nonempty, - m->nr_blocks - m->nr_redundant, -
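/*
 * Worked example for the nr_good checks added above (sketch): consider
 * a 4+2 stripe, i.e. nr_blocks = 6, nr_redundant = 2, nr_data = 4,
 * after the device being removed has been marked invalid:
 *
 *	nr_good = 6: nothing degraded, proceed.
 *	nr_good = 5: 5 < nr_blocks, the stripe becomes degraded ->
 *		refused without BCH_FORCE_IF_DATA_DEGRADED.
 *	nr_good = 3: 3 < nr_data, fewer live blocks than data blocks,
 *		so the contents could no longer be reconstructed ->
 *		additionally requires BCH_FORCE_IF_DATA_LOST.
 */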
m->nr_redundant); - if (bch2_stripe_is_open(c, h->data[i].idx)) - prt_str(out, " open"); - prt_newline(out); - } - mutex_unlock(&c->ec_stripes_heap_lock); + return 0; } static void bch2_new_stripe_to_text(struct printbuf *out, struct bch_fs *c, @@ -2477,15 +2341,12 @@ void bch2_fs_ec_exit(struct bch_fs *c) BUG_ON(!list_empty(&c->ec_stripe_new_list)); - free_heap(&c->ec_stripes_heap); - genradix_free(&c->stripes); bioset_exit(&c->ec_bioset); } void bch2_fs_ec_init_early(struct bch_fs *c) { spin_lock_init(&c->ec_stripes_new_lock); - mutex_init(&c->ec_stripes_heap_lock); INIT_LIST_HEAD(&c->ec_stripe_head_list); mutex_init(&c->ec_stripe_head_lock); @@ -2503,3 +2364,40 @@ int bch2_fs_ec_init(struct bch_fs *c) return bioset_init(&c->ec_bioset, 1, offsetof(struct ec_bio, bio), BIOSET_NEED_BVECS); } + +static int bch2_check_stripe_to_lru_ref(struct btree_trans *trans, + struct bkey_s_c k, + struct bkey_buf *last_flushed) +{ + if (k.k->type != KEY_TYPE_stripe) + return 0; + + struct bkey_s_c_stripe s = bkey_s_c_to_stripe(k); + + u64 lru_idx = stripe_lru_pos(s.v); + if (lru_idx) { + int ret = bch2_lru_check_set(trans, BCH_LRU_STRIPE_FRAGMENTATION, + k.k->p.offset, lru_idx, k, last_flushed); + if (ret) + return ret; + } + return 0; +} + +int bch2_check_stripe_to_lru_refs(struct bch_fs *c) +{ + struct bkey_buf last_flushed; + + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + int ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_stripes, + POS_MIN, BTREE_ITER_prefetch, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_check_stripe_to_lru_ref(trans, k, &last_flushed))); + + bch2_bkey_buf_exit(&last_flushed, c); + bch_err_fn(c, ret); + return ret; +} diff --git a/fs/bcachefs/ec.h b/fs/bcachefs/ec.h index 583ca6a226da..548048adf0d5 100644 --- a/fs/bcachefs/ec.h +++ b/fs/bcachefs/ec.h @@ -92,6 +92,29 @@ static inline void stripe_csum_set(struct bch_stripe *s, memcpy(stripe_csum(s, block, csum_idx), &csum, bch_crc_bytes[s->csum_type]); } +#define STRIPE_LRU_POS_EMPTY 1 + +static inline u64 stripe_lru_pos(const struct bch_stripe *s) +{ + if (!s) + return 0; + + unsigned nr_data = s->nr_blocks - s->nr_redundant, blocks_empty = 0; + + for (unsigned i = 0; i < nr_data; i++) + blocks_empty += !stripe_blockcount_get(s, i); + + /* Will be picked up by the stripe_delete worker */ + if (blocks_empty == nr_data) + return STRIPE_LRU_POS_EMPTY; + + if (!blocks_empty) + return 0; + + /* invert: more blocks empty = reuse first */ + return LRU_TIME_MAX - blocks_empty; +} + static inline bool __bch2_ptr_matches_stripe(const struct bch_extent_ptr *stripe_ptr, const struct bch_extent_ptr *data_ptr, unsigned sectors) @@ -132,6 +155,21 @@ static inline bool bch2_ptr_matches_stripe_m(const struct gc_stripe *m, m->sectors); } +static inline void gc_stripe_unlock(struct gc_stripe *s) +{ + BUILD_BUG_ON(!((union ulong_byte_assert) { .ulong = 1UL << BUCKET_LOCK_BITNR }).byte); + + clear_bit_unlock(BUCKET_LOCK_BITNR, (void *) &s->lock); + smp_mb__after_atomic(); + wake_up_bit((void *) &s->lock, BUCKET_LOCK_BITNR); +} + +static inline void gc_stripe_lock(struct gc_stripe *s) +{ + wait_on_bit_lock((void *) &s->lock, BUCKET_LOCK_BITNR, + TASK_UNINTERRUPTIBLE); +} + struct bch_read_bio; struct ec_stripe_buf { @@ -212,18 +250,15 @@ int bch2_ec_read_extent(struct btree_trans *, struct bch_read_bio *, struct bkey void *bch2_writepoint_ec_buf(struct bch_fs *, struct write_point *); -void bch2_ec_bucket_cancel(struct bch_fs *, struct open_bucket *); +void bch2_ec_bucket_cancel(struct bch_fs *, 
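/*
 * Worked examples for stripe_lru_pos() above, for a stripe with four
 * data blocks (values follow directly from the code):
 *
 *	0 blocks empty -> 0: fully used, not on the LRU at all
 *	2 blocks empty -> LRU_TIME_MAX - 2: reusable; the inversion
 *		makes emptier stripes sort first for reuse
 *	4 blocks empty -> STRIPE_LRU_POS_EMPTY (1): sorts ahead of
 *		everything else and is picked up by the stripe delete
 *		worker
 */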
struct open_bucket *, int); int bch2_ec_stripe_new_alloc(struct bch_fs *, struct ec_stripe_head *); void bch2_ec_stripe_head_put(struct bch_fs *, struct ec_stripe_head *); -struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *, - unsigned, unsigned, unsigned, - enum bch_watermark, struct closure *); -void bch2_stripes_heap_update(struct bch_fs *, struct stripe *, size_t); -void bch2_stripes_heap_del(struct bch_fs *, struct stripe *, size_t); -void bch2_stripes_heap_insert(struct bch_fs *, struct stripe *, size_t); +struct alloc_request; +struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *, + struct alloc_request *, unsigned, struct closure *); void bch2_do_stripe_deletes(struct bch_fs *); void bch2_ec_do_stripe_creates(struct bch_fs *); @@ -253,7 +288,9 @@ static inline void ec_stripe_new_put(struct bch_fs *c, struct ec_stripe_new *s, } } -int bch2_dev_remove_stripes(struct bch_fs *, unsigned); +int bch2_invalidate_stripe_to_dev(struct btree_trans *, struct btree_iter *, + struct bkey_s_c, unsigned, unsigned); +int bch2_dev_remove_stripes(struct bch_fs *, unsigned, unsigned); void bch2_ec_stop_dev(struct bch_fs *, struct bch_dev *); void bch2_fs_ec_stop(struct bch_fs *); @@ -261,11 +298,12 @@ void bch2_fs_ec_flush(struct bch_fs *); int bch2_stripes_read(struct bch_fs *); -void bch2_stripes_heap_to_text(struct printbuf *, struct bch_fs *); void bch2_new_stripes_to_text(struct printbuf *, struct bch_fs *); void bch2_fs_ec_exit(struct bch_fs *); void bch2_fs_ec_init_early(struct bch_fs *); int bch2_fs_ec_init(struct bch_fs *); +int bch2_check_stripe_to_lru_refs(struct bch_fs *); + #endif /* _BCACHEFS_EC_H */ diff --git a/fs/bcachefs/ec_types.h b/fs/bcachefs/ec_types.h index 8d1e70e830ac..809446c78951 100644 --- a/fs/bcachefs/ec_types.h +++ b/fs/bcachefs/ec_types.h @@ -4,9 +4,10 @@ #include "bcachefs_format.h" -struct bch_replicas_padded { +union bch_replicas_padded { + u8 bytes[struct_size_t(struct bch_replicas_entry_v1, + devs, BCH_BKEY_PTRS_MAX)]; struct bch_replicas_entry_v1 e; - u8 pad[BCH_BKEY_PTRS_MAX]; }; struct stripe { @@ -20,23 +21,15 @@ struct stripe { }; struct gc_stripe { + u8 lock; + unsigned alive:1; /* does a corresponding key exist in stripes btree? */ u16 sectors; - u8 nr_blocks; u8 nr_redundant; - - unsigned alive:1; /* does a corresponding key exist in stripes btree? 
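/*
 * The bch_replicas_padded change above (struct with a trailing pad
 * array -> union with an explicitly sized byte array) is the usual
 * idiom for reserving worst-case space behind a struct that ends in a
 * flexible array member: struct_size_t() accounts for the flex array,
 * so the union is guaranteed large enough for BCH_BKEY_PTRS_MAX devs.
 * Generic shape of the idiom (sketch; struct_size_t_() here is a
 * local stand-in for the kernel's struct_size_t() from
 * <linux/overflow.h>, and the toy types are hypothetical):
 */
struct toy_entry {
	unsigned char	nr_devs;
	unsigned char	devs[];		/* flexible array member */
};

#define struct_size_t_(type, member, count) \
	(sizeof(type) + sizeof(((type *) 0)->member[0]) * (count))

union toy_entry_padded {
	unsigned char	bytes[struct_size_t_(struct toy_entry, devs, 16)];
	struct toy_entry e;		/* e.devs[] now has room for 16 */
};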
*/ u16 block_sectors[BCH_BKEY_PTRS_MAX]; struct bch_extent_ptr ptrs[BCH_BKEY_PTRS_MAX]; - struct bch_replicas_padded r; + union bch_replicas_padded r; }; -struct ec_stripe_heap_entry { - size_t idx; - unsigned blocks_nonempty; -}; - -typedef DEFINE_MIN_HEAP(struct ec_stripe_heap_entry, ec_stripes_heap) ec_stripes_heap; - #endif /* _BCACHEFS_EC_TYPES_H */ diff --git a/fs/bcachefs/enumerated_ref.c b/fs/bcachefs/enumerated_ref.c new file mode 100644 index 000000000000..56ab430f209f --- /dev/null +++ b/fs/bcachefs/enumerated_ref.c @@ -0,0 +1,144 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include "bcachefs.h" +#include "enumerated_ref.h" +#include "util.h" + +#include <linux/completion.h> + +#ifdef ENUMERATED_REF_DEBUG +void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx) +{ + BUG_ON(idx >= ref->nr); + atomic_long_inc(&ref->refs[idx]); +} + +bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) +{ + BUG_ON(idx >= ref->nr); + return atomic_long_inc_not_zero(&ref->refs[idx]); +} + +bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) +{ + BUG_ON(idx >= ref->nr); + return !ref->dying && + atomic_long_inc_not_zero(&ref->refs[idx]); +} + +void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx) +{ + BUG_ON(idx >= ref->nr); + long v = atomic_long_dec_return(&ref->refs[idx]); + + BUG_ON(v < 0); + if (v) + return; + + for (unsigned i = 0; i < ref->nr; i++) + if (atomic_long_read(&ref->refs[i])) + return; + + if (ref->stop_fn) + ref->stop_fn(ref); + complete(&ref->stop_complete); +} +#endif + +#ifndef ENUMERATED_REF_DEBUG +static void enumerated_ref_kill_cb(struct percpu_ref *percpu_ref) +{ + struct enumerated_ref *ref = + container_of(percpu_ref, struct enumerated_ref, ref); + + if (ref->stop_fn) + ref->stop_fn(ref); + complete(&ref->stop_complete); +} +#endif + +void enumerated_ref_stop_async(struct enumerated_ref *ref) +{ + reinit_completion(&ref->stop_complete); + +#ifndef ENUMERATED_REF_DEBUG + percpu_ref_kill(&ref->ref); +#else + ref->dying = true; + for (unsigned i = 0; i < ref->nr; i++) + enumerated_ref_put(ref, i); +#endif +} + +void enumerated_ref_stop(struct enumerated_ref *ref, + const char * const names[]) +{ + enumerated_ref_stop_async(ref); + while (!wait_for_completion_timeout(&ref->stop_complete, HZ * 10)) { + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "Waited for 10 seconds to shutdown enumerated ref\n"); + prt_str(&buf, "Outstanding refs:\n"); + enumerated_ref_to_text(&buf, ref, names); + printk(KERN_ERR "%s", buf.buf); + printbuf_exit(&buf); + } +} + +void enumerated_ref_start(struct enumerated_ref *ref) +{ +#ifndef ENUMERATED_REF_DEBUG + percpu_ref_reinit(&ref->ref); +#else + ref->dying = false; + for (unsigned i = 0; i < ref->nr; i++) { + BUG_ON(atomic_long_read(&ref->refs[i])); + atomic_long_inc(&ref->refs[i]); + } +#endif +} + +void enumerated_ref_exit(struct enumerated_ref *ref) +{ +#ifndef ENUMERATED_REF_DEBUG + percpu_ref_exit(&ref->ref); +#else + kfree(ref->refs); + ref->refs = NULL; + ref->nr = 0; +#endif +} + +int enumerated_ref_init(struct enumerated_ref *ref, unsigned nr, + void (*stop_fn)(struct enumerated_ref *)) +{ + init_completion(&ref->stop_complete); + ref->stop_fn = stop_fn; + +#ifndef ENUMERATED_REF_DEBUG + return percpu_ref_init(&ref->ref, enumerated_ref_kill_cb, + PERCPU_REF_INIT_DEAD, GFP_KERNEL); +#else + ref->refs = kzalloc(sizeof(ref->refs[0]) * nr, GFP_KERNEL); + if (!ref->refs) + return -ENOMEM; + + ref->nr = nr; + return 0; +#endif +} + +void enumerated_ref_to_text(struct printbuf *out, + 
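/*
 * Typical lifecycle of the enumerated_ref introduced above (sketch;
 * the enumerated_ref_* calls are from this patch, while the use of
 * c->writes and a BCH_WRITE_REF_NR count is assumed from context):
 *
 *	enumerated_ref_init(&c->writes, BCH_WRITE_REF_NR, stop_fn);
 *	enumerated_ref_start(&c->writes);
 *
 *	if (enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_stripe_delete)) {
 *		// ... do write-side work ...
 *		enumerated_ref_put(&c->writes, BCH_WRITE_REF_stripe_delete);
 *	}
 *
 *	enumerated_ref_stop(&c->writes, names);	// waits; dumps holders
 *						// every 10s if stuck
 *	enumerated_ref_exit(&c->writes);
 *
 * In release builds every index collapses onto one percpu_ref; with
 * ENUMERATED_REF_DEBUG each index gets its own atomic_long_t, so a
 * leaked reference is attributed to a specific named user.
 */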
struct enumerated_ref *ref, + const char * const names[]) +{ +#ifdef ENUMERATED_REF_DEBUG + bch2_printbuf_tabstop_push(out, 32); + + for (unsigned i = 0; i < ref->nr; i++) + prt_printf(out, "%s\t%li\n", names[i], + atomic_long_read(&ref->refs[i])); +#else + prt_str(out, "(not in debug mode)\n"); +#endif +} diff --git a/fs/bcachefs/enumerated_ref.h b/fs/bcachefs/enumerated_ref.h new file mode 100644 index 000000000000..ec01cf59ef80 --- /dev/null +++ b/fs/bcachefs/enumerated_ref.h @@ -0,0 +1,66 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ENUMERATED_REF_H +#define _BCACHEFS_ENUMERATED_REF_H + +#include "enumerated_ref_types.h" + +/* + * A refcount where the users are enumerated: in debug mode, we create separate + * refcounts for each user, to make leaks and refcount errors easy to track + * down: + */ + +#ifdef ENUMERATED_REF_DEBUG +void enumerated_ref_get(struct enumerated_ref *, unsigned); +bool __enumerated_ref_tryget(struct enumerated_ref *, unsigned); +bool enumerated_ref_tryget(struct enumerated_ref *, unsigned); +void enumerated_ref_put(struct enumerated_ref *, unsigned); +#else + +static inline void enumerated_ref_get(struct enumerated_ref *ref, unsigned idx) +{ + percpu_ref_get(&ref->ref); +} + +static inline bool __enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) +{ + return percpu_ref_tryget(&ref->ref); +} + +static inline bool enumerated_ref_tryget(struct enumerated_ref *ref, unsigned idx) +{ + return percpu_ref_tryget_live(&ref->ref); +} + +static inline void enumerated_ref_put(struct enumerated_ref *ref, unsigned idx) +{ + percpu_ref_put(&ref->ref); +} +#endif + +static inline bool enumerated_ref_is_zero(struct enumerated_ref *ref) +{ +#ifndef ENUMERATED_REF_DEBUG + return percpu_ref_is_zero(&ref->ref); +#else + for (unsigned i = 0; i < ref->nr; i++) + if (atomic_long_read(&ref->refs[i])) + return false; + return true; +#endif +} + +void enumerated_ref_stop_async(struct enumerated_ref *); +void enumerated_ref_stop(struct enumerated_ref *, const char * const[]); +void enumerated_ref_start(struct enumerated_ref *); + +void enumerated_ref_exit(struct enumerated_ref *); +int enumerated_ref_init(struct enumerated_ref *, unsigned, + void (*stop_fn)(struct enumerated_ref *)); + +struct printbuf; +void enumerated_ref_to_text(struct printbuf *, + struct enumerated_ref *, + const char * const[]); + +#endif /* _BCACHEFS_ENUMERATED_REF_H */ diff --git a/fs/bcachefs/enumerated_ref_types.h b/fs/bcachefs/enumerated_ref_types.h new file mode 100644 index 000000000000..0e6076f466d3 --- /dev/null +++ b/fs/bcachefs/enumerated_ref_types.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_ENUMERATED_REF_TYPES_H +#define _BCACHEFS_ENUMERATED_REF_TYPES_H + +#include <linux/percpu-refcount.h> + +struct enumerated_ref { +#ifdef ENUMERATED_REF_DEBUG + unsigned nr; + bool dying; + atomic_long_t *refs; +#else + struct percpu_ref ref; +#endif + void (*stop_fn)(struct enumerated_ref *); + struct completion stop_complete; +}; + +#endif /* _BCACHEFS_ENUMERATED_REF_TYPES_H */ diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 4590cd0c7c90..62843e772b2c 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -5,6 +5,8 @@ #define BCH_ERRCODES() \ x(ERANGE, ERANGE_option_too_small) \ x(ERANGE, ERANGE_option_too_big) \ + x(EINVAL, injected) \ + x(BCH_ERR_injected, injected_fs_start) \ x(EINVAL, mount_option) \ x(BCH_ERR_mount_option, option_name) \ x(BCH_ERR_mount_option, option_value) \ @@ -51,6 +53,7 @@ x(ENOMEM,
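The enumerated_ref API above compiles to a plain percpu_ref in normal builds, but keeps one atomic counter per enumerated user under ENUMERATED_REF_DEBUG so that leaked references can be attributed to a specific user. A minimal usage sketch for a hypothetical two-user subsystem (the demo_* names are illustrative, not from this patch):

enum demo_ref {
	DEMO_REF_journal,
	DEMO_REF_gc,
	DEMO_REF_NR,
};

static const char * const demo_ref_names[] = { "journal", "gc" };

static void demo_stop_fn(struct enumerated_ref *ref)
{
	/* called once the last reference is dropped after a stop request */
}

static int demo_use(struct enumerated_ref *ref)
{
	int ret = enumerated_ref_init(ref, DEMO_REF_NR, demo_stop_fn);
	if (ret)
		return ret;

	/* refs are initialized dead (PERCPU_REF_INIT_DEAD); bring them up: */
	enumerated_ref_start(ref);

	if (enumerated_ref_tryget(ref, DEMO_REF_gc)) {
		/* ... guarded work ... */
		enumerated_ref_put(ref, DEMO_REF_gc);
	}

	/* waits for outstanding refs, dumping per-user counts if stuck: */
	enumerated_ref_stop(ref, demo_ref_names);
	enumerated_ref_exit(ref);
	return 0;
}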
ENOMEM_dio_write_bioset_init) \ x(ENOMEM, ENOMEM_nocow_flush_bioset_init) \ x(ENOMEM, ENOMEM_promote_table_init) \ + x(ENOMEM, ENOMEM_async_obj_init) \ x(ENOMEM, ENOMEM_compression_bounce_read_init) \ x(ENOMEM, ENOMEM_compression_bounce_write_init) \ x(ENOMEM, ENOMEM_compression_workspace_init) \ @@ -116,9 +119,11 @@ x(ENOENT, ENOENT_snapshot_tree) \ x(ENOENT, ENOENT_dirent_doesnt_match_inode) \ x(ENOENT, ENOENT_dev_not_found) \ + x(ENOENT, ENOENT_dev_bucket_not_found) \ x(ENOENT, ENOENT_dev_idx_not_found) \ x(ENOENT, ENOENT_inode_no_backpointer) \ x(ENOENT, ENOENT_no_snapshot_tree_subvol) \ + x(ENOENT, btree_node_dying) \ x(ENOTEMPTY, ENOTEMPTY_dir_not_empty) \ x(ENOTEMPTY, ENOTEMPTY_subvol_not_empty) \ x(EEXIST, EEXIST_str_hash_set) \ @@ -170,6 +175,7 @@ x(0, backpointer_to_overwritten_btree_node) \ x(0, journal_reclaim_would_deadlock) \ x(EINVAL, fsck) \ + x(BCH_ERR_fsck, fsck_ask) \ x(BCH_ERR_fsck, fsck_fix) \ x(BCH_ERR_fsck, fsck_delete_bkey) \ x(BCH_ERR_fsck, fsck_ignore) \ @@ -177,9 +183,14 @@ x(BCH_ERR_fsck, fsck_repair_unimplemented) \ x(BCH_ERR_fsck, fsck_repair_impossible) \ x(EINVAL, restart_recovery) \ - x(EINVAL, not_in_recovery) \ x(EINVAL, cannot_rewind_recovery) \ x(0, data_update_done) \ + x(BCH_ERR_data_update_done, data_update_done_would_block) \ + x(BCH_ERR_data_update_done, data_update_done_unwritten) \ + x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \ + x(BCH_ERR_data_update_done, data_update_done_no_snapshot) \ + x(BCH_ERR_data_update_done, data_update_done_no_dev_refs) \ + x(BCH_ERR_data_update_done, data_update_done_no_rw_devs) \ x(EINVAL, device_state_not_allowed) \ x(EINVAL, member_info_missing) \ x(EINVAL, mismatched_block_size) \ @@ -191,6 +202,7 @@ x(EINVAL, device_has_been_removed) \ x(EINVAL, device_splitbrain) \ x(EINVAL, device_already_online) \ + x(EINVAL, filesystem_uuid_already_open) \ x(EINVAL, insufficient_devices_to_start) \ x(EINVAL, invalid) \ x(EINVAL, internal_fsck_err) \ @@ -200,6 +212,9 @@ x(EINVAL, no_resize_with_buckets_nouse) \ x(EINVAL, inode_unpack_error) \ x(EINVAL, varint_decode_error) \ + x(EINVAL, erasure_coding_found_btree_node) \ + x(EINVAL, option_negative) \ + x(EOPNOTSUPP, may_not_use_incompat_feature) \ x(EROFS, erofs_trans_commit) \ x(EROFS, erofs_no_writes) \ x(EROFS, erofs_journal_err) \ @@ -207,13 +222,23 @@ x(EROFS, erofs_unfixed_errors) \ x(EROFS, erofs_norecovery) \ x(EROFS, erofs_nochanges) \ + x(EROFS, erofs_no_alloc_info) \ + x(EROFS, erofs_filesystem_full) \ x(EROFS, insufficient_devices) \ x(0, operation_blocked) \ x(BCH_ERR_operation_blocked, btree_cache_cannibalize_lock_blocked) \ - x(BCH_ERR_operation_blocked, journal_res_get_blocked) \ - x(BCH_ERR_operation_blocked, journal_preres_get_blocked) \ - x(BCH_ERR_operation_blocked, bucket_alloc_blocked) \ - x(BCH_ERR_operation_blocked, stripe_alloc_blocked) \ + x(BCH_ERR_operation_blocked, journal_res_blocked) \ + x(BCH_ERR_journal_res_blocked, journal_blocked) \ + x(BCH_ERR_journal_res_blocked, journal_max_in_flight) \ + x(BCH_ERR_journal_res_blocked, journal_max_open) \ + x(BCH_ERR_journal_res_blocked, journal_full) \ + x(BCH_ERR_journal_res_blocked, journal_pin_full) \ + x(BCH_ERR_journal_res_blocked, journal_buf_enomem) \ + x(BCH_ERR_journal_res_blocked, journal_stuck) \ + x(BCH_ERR_journal_res_blocked, journal_retry_open) \ + x(BCH_ERR_journal_res_blocked, journal_preres_get_blocked) \ + x(BCH_ERR_journal_res_blocked, bucket_alloc_blocked) \ + x(BCH_ERR_journal_res_blocked, stripe_alloc_blocked) \ x(BCH_ERR_invalid, invalid_sb) \ 
x(BCH_ERR_invalid_sb, invalid_sb_magic) \ x(BCH_ERR_invalid_sb, invalid_sb_version) \ @@ -223,6 +248,7 @@ x(BCH_ERR_invalid_sb, invalid_sb_csum) \ x(BCH_ERR_invalid_sb, invalid_sb_block_size) \ x(BCH_ERR_invalid_sb, invalid_sb_uuid) \ + x(BCH_ERR_invalid_sb, invalid_sb_offset) \ x(BCH_ERR_invalid_sb, invalid_sb_too_many_members) \ x(BCH_ERR_invalid_sb, invalid_sb_dev_idx) \ x(BCH_ERR_invalid_sb, invalid_sb_time_precision) \ @@ -248,8 +274,9 @@ x(BCH_ERR_invalid_sb, invalid_sb_downgrade) \ x(BCH_ERR_invalid, invalid_bkey) \ x(BCH_ERR_operation_blocked, nocow_lock_blocked) \ - x(EIO, journal_shutdown) \ + x(EROFS, journal_shutdown) \ x(EIO, journal_flush_err) \ + x(EIO, journal_write_err) \ x(EIO, btree_node_read_err) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_cached) \ x(EIO, sb_not_downgraded) \ @@ -258,17 +285,53 @@ x(EIO, btree_node_read_validate_error) \ x(EIO, btree_need_topology_repair) \ x(EIO, bucket_ref_update) \ + x(EIO, trigger_alloc) \ x(EIO, trigger_pointer) \ x(EIO, trigger_stripe_pointer) \ x(EIO, metadata_bucket_inconsistency) \ x(EIO, mark_stripe) \ x(EIO, stripe_reconstruct) \ x(EIO, key_type_error) \ - x(EIO, no_device_to_read_from) \ + x(EIO, extent_poisoned) \ x(EIO, missing_indirect_extent) \ x(EIO, invalidate_stripe_to_dev) \ x(EIO, no_encryption_key) \ x(EIO, insufficient_journal_devices) \ + x(EIO, device_offline) \ + x(EIO, EIO_fault_injected) \ + x(EIO, ec_block_read) \ + x(EIO, ec_block_write) \ + x(EIO, recompute_checksum) \ + x(EIO, decompress) \ + x(BCH_ERR_decompress, decompress_exceeded_max_encoded_extent) \ + x(BCH_ERR_decompress, decompress_lz4) \ + x(BCH_ERR_decompress, decompress_gzip) \ + x(BCH_ERR_decompress, decompress_zstd_src_len_bad) \ + x(BCH_ERR_decompress, decompress_zstd) \ + x(EIO, data_write) \ + x(BCH_ERR_data_write, data_write_io) \ + x(BCH_ERR_data_write, data_write_csum) \ + x(BCH_ERR_data_write, data_write_invalid_ptr) \ + x(BCH_ERR_data_write, data_write_misaligned) \ + x(BCH_ERR_decompress, data_read) \ + x(BCH_ERR_data_read, no_device_to_read_from) \ + x(BCH_ERR_data_read, no_devices_valid) \ + x(BCH_ERR_data_read, data_read_io_err) \ + x(BCH_ERR_data_read, data_read_csum_err) \ + x(BCH_ERR_data_read, data_read_retry) \ + x(BCH_ERR_data_read_retry, data_read_retry_avoid) \ + x(BCH_ERR_data_read_retry_avoid,data_read_retry_device_offline) \ + x(BCH_ERR_data_read_retry_avoid,data_read_retry_io_err) \ + x(BCH_ERR_data_read_retry_avoid,data_read_retry_ec_reconstruct_err) \ + x(BCH_ERR_data_read_retry_avoid,data_read_retry_csum_err) \ + x(BCH_ERR_data_read_retry, data_read_retry_csum_err_maybe_userspace)\ + x(BCH_ERR_data_read, data_read_decompress_err) \ + x(BCH_ERR_data_read, data_read_decrypt_err) \ + x(BCH_ERR_data_read, data_read_ptr_stale_race) \ + x(BCH_ERR_data_read_retry, data_read_ptr_stale_retry) \ + x(BCH_ERR_data_read, data_read_no_encryption_key) \ + x(BCH_ERR_data_read, data_read_buffer_too_small) \ + x(BCH_ERR_data_read, data_read_key_overwritten) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_fixable) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_want_retry) \ x(BCH_ERR_btree_node_read_err, btree_node_read_err_must_retry) \ diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index 038da6a61f6b..c2cad28635bf 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -3,15 +3,24 @@ #include "btree_cache.h" #include "btree_iter.h" #include "error.h" -#include "fs-common.h" #include "journal.h" +#include "namei.h" #include "recovery_passes.h" #include "super.h" #include "thread_with_file.h" #define 
FSCK_ERR_RATELIMIT_NR 10 -bool bch2_inconsistent_error(struct bch_fs *c) +void __bch2_log_msg_start(const char *fs_or_dev_name, struct printbuf *out) +{ + printbuf_indent_add_nextline(out, 2); + +#ifdef BCACHEFS_LOG_PREFIX + prt_printf(out, "bcachefs (%s): ", fs_or_dev_name); +#endif +} + +bool __bch2_inconsistent_error(struct bch_fs *c, struct printbuf *out) { set_bit(BCH_FS_error, &c->flags); @@ -20,11 +29,10 @@ bool bch2_inconsistent_error(struct bch_fs *c) return false; case BCH_ON_ERROR_fix_safe: case BCH_ON_ERROR_ro: - if (bch2_fs_emergency_read_only(c)) - bch_err(c, "inconsistency detected - emergency read only at journal seq %llu", - journal_cur_seq(&c->journal)); + bch2_fs_emergency_read_only2(c, out); return true; case BCH_ON_ERROR_panic: + bch2_print_str(c, KERN_ERR, out->buf); panic(bch2_fmt(c, "panic after error")); return true; default: @@ -32,18 +40,91 @@ bool bch2_inconsistent_error(struct bch_fs *c) } } -int bch2_topology_error(struct bch_fs *c) +bool bch2_inconsistent_error(struct bch_fs *c) +{ + struct printbuf buf = PRINTBUF; + buf.atomic++; + + printbuf_indent_add_nextline(&buf, 2); + + bool ret = __bch2_inconsistent_error(c, &buf); + if (ret) + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); + return ret; +} + +__printf(3, 0) +static bool bch2_fs_trans_inconsistent(struct bch_fs *c, struct btree_trans *trans, + const char *fmt, va_list args) +{ + struct printbuf buf = PRINTBUF; + buf.atomic++; + + bch2_log_msg_start(c, &buf); + + prt_vprintf(&buf, fmt, args); + prt_newline(&buf); + + if (trans) + bch2_trans_updates_to_text(&buf, trans); + bool ret = __bch2_inconsistent_error(c, &buf); + bch2_print_str_nonblocking(c, KERN_ERR, buf.buf); + + printbuf_exit(&buf); + return ret; +} + +bool bch2_fs_inconsistent(struct bch_fs *c, const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + bool ret = bch2_fs_trans_inconsistent(c, NULL, fmt, args); + va_end(args); + return ret; +} + +bool bch2_trans_inconsistent(struct btree_trans *trans, const char *fmt, ...) +{ + va_list args; + va_start(args, fmt); + bool ret = bch2_fs_trans_inconsistent(trans->c, trans, fmt, args); + va_end(args); + return ret; +} + +int __bch2_topology_error(struct bch_fs *c, struct printbuf *out) { + prt_printf(out, "btree topology error: "); + set_bit(BCH_FS_topology_error, &c->flags); - if (!test_bit(BCH_FS_recovery_running, &c->flags)) { - bch2_inconsistent_error(c); + if (!test_bit(BCH_FS_in_recovery, &c->flags)) { + __bch2_inconsistent_error(c, out); return -BCH_ERR_btree_need_topology_repair; } else { - return bch2_run_explicit_recovery_pass(c, BCH_RECOVERY_PASS_check_topology) ?: + return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?: -BCH_ERR_btree_node_read_validate_error; } } +int bch2_fs_topology_error(struct bch_fs *c, const char *fmt, ...) 
+{ + struct printbuf buf = PRINTBUF; + + bch2_log_msg_start(c, &buf); + + va_list args; + va_start(args, fmt); + prt_vprintf(&buf, fmt, args); + va_end(args); + + int ret = __bch2_topology_error(c, &buf); + bch2_print_str(c, KERN_ERR, buf.buf); + + printbuf_exit(&buf); + return ret; +} + void bch2_fatal_error(struct bch_fs *c) { if (bch2_fs_emergency_read_only(c)) @@ -54,25 +135,44 @@ void bch2_io_error_work(struct work_struct *work) { struct bch_dev *ca = container_of(work, struct bch_dev, io_error_work); struct bch_fs *c = ca->fs; - bool dev; + + /* XXX: if it's reads or checksums that are failing, set it to failed */ down_write(&c->state_lock); - dev = bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_ro, - BCH_FORCE_IF_DEGRADED); - if (dev - ? __bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, - BCH_FORCE_IF_DEGRADED) - : bch2_fs_emergency_read_only(c)) - bch_err(ca, - "too many IO errors, setting %s RO", + unsigned long write_errors_start = READ_ONCE(ca->write_errors_start); + + if (write_errors_start && + time_after(jiffies, + write_errors_start + c->opts.write_error_timeout * HZ)) { + if (ca->mi.state >= BCH_MEMBER_STATE_ro) + goto out; + + bool dev = !__bch2_dev_set_state(c, ca, BCH_MEMBER_STATE_ro, + BCH_FORCE_IF_DEGRADED); + struct printbuf buf = PRINTBUF; + __bch2_log_msg_start(ca->name, &buf); + + prt_printf(&buf, "writes erroring for %u seconds, setting %s ro", + c->opts.write_error_timeout, dev ? "device" : "filesystem"); + if (!dev) + bch2_fs_emergency_read_only2(c, &buf); + + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + } +out: up_write(&c->state_lock); } void bch2_io_error(struct bch_dev *ca, enum bch_member_error_type type) { atomic64_inc(&ca->errors[type]); - //queue_work(system_long_wq, &ca->io_error_work); + + if (type == BCH_MEMBER_ERROR_write && !ca->write_errors_start) + ca->write_errors_start = jiffies; + + queue_work(system_long_wq, &ca->io_error_work); } enum ask_yn { @@ -168,15 +268,13 @@ static enum ask_yn bch2_fsck_ask_yn(struct bch_fs *c, struct btree_trans *trans) #endif -static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) +static struct fsck_err_state *fsck_err_get(struct bch_fs *c, + enum bch_sb_error_id id) { struct fsck_err_state *s; - if (!test_bit(BCH_FS_fsck_running, &c->flags)) - return NULL; - list_for_each_entry(s, &c->fsck_error_msgs, list) - if (s->fmt == fmt) { + if (s->id == id) { /* * move it to the head of the list: repeated fsck errors * are common @@ -194,7 +292,7 @@ static struct fsck_err_state *fsck_err_get(struct bch_fs *c, const char *fmt) } INIT_LIST_HEAD(&s->list); - s->fmt = fmt; + s->id = id; list_add(&s->list, &c->fsck_error_msgs); return s; } @@ -231,7 +329,7 @@ static int do_fsck_ask_yn(struct bch_fs *c, if (bch2_fs_stdio_redirect(c)) bch2_print(c, "%s", question->buf); else - bch2_print_string_as_lines(KERN_ERR, question->buf); + bch2_print_str(c, KERN_ERR, question->buf); int ask = bch2_fsck_ask_yn(c, trans); @@ -244,15 +342,107 @@ static int do_fsck_ask_yn(struct bch_fs *c, return ask; } +static struct fsck_err_state *count_fsck_err_locked(struct bch_fs *c, + enum bch_sb_error_id id, const char *msg, + bool *repeat, bool *print, bool *suppress) +{ + bch2_sb_error_count(c, id); + + struct fsck_err_state *s = fsck_err_get(c, id); + if (s) { + /* + * We may be called multiple times for the same error on + * transaction restart - this memoizes instead of asking the user + * multiple times for the same error: + */ + if (s->last_msg && !strcmp(msg, s->last_msg)) { + *repeat = true; + *print = 
false; + return s; + } + + kfree(s->last_msg); + s->last_msg = kstrdup(msg, GFP_KERNEL); + + if (c->opts.ratelimit_errors && + s->nr >= FSCK_ERR_RATELIMIT_NR) { + if (s->nr == FSCK_ERR_RATELIMIT_NR) + *suppress = true; + else + *print = false; + } + + s->nr++; + } + return s; +} + +bool __bch2_count_fsck_err(struct bch_fs *c, + enum bch_sb_error_id id, struct printbuf *msg) +{ + bch2_sb_error_count(c, id); + + mutex_lock(&c->fsck_error_msgs_lock); + bool print = true, repeat = false, suppress = false; + + count_fsck_err_locked(c, id, msg->buf, &repeat, &print, &suppress); + mutex_unlock(&c->fsck_error_msgs_lock); + + if (suppress) + prt_printf(msg, "Ratelimiting new instances of previous error\n"); + + return print && !repeat; +} + +int bch2_fsck_err_opt(struct bch_fs *c, + enum bch_fsck_flags flags, + enum bch_sb_error_id err) +{ + if (!WARN_ON(err >= ARRAY_SIZE(fsck_flags_extra))) + flags |= fsck_flags_extra[err]; + + if (test_bit(BCH_FS_in_fsck, &c->flags)) { + if (!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) + return -BCH_ERR_fsck_repair_unimplemented; + + switch (c->opts.fix_errors) { + case FSCK_FIX_exit: + return -BCH_ERR_fsck_errors_not_fixed; + case FSCK_FIX_yes: + if (flags & FSCK_CAN_FIX) + return -BCH_ERR_fsck_fix; + fallthrough; + case FSCK_FIX_no: + if (flags & FSCK_CAN_IGNORE) + return -BCH_ERR_fsck_ignore; + return -BCH_ERR_fsck_errors_not_fixed; + case FSCK_FIX_ask: + if (flags & FSCK_AUTOFIX) + return -BCH_ERR_fsck_fix; + return -BCH_ERR_fsck_ask; + default: + BUG(); + } + } else { + if ((flags & FSCK_AUTOFIX) && + (c->opts.errors == BCH_ON_ERROR_continue || + c->opts.errors == BCH_ON_ERROR_fix_safe)) + return -BCH_ERR_fsck_fix; + + if (c->opts.errors == BCH_ON_ERROR_continue && + (flags & FSCK_CAN_IGNORE)) + return -BCH_ERR_fsck_ignore; + return -BCH_ERR_fsck_errors_not_fixed; + } +} + int __bch2_fsck_err(struct bch_fs *c, struct btree_trans *trans, enum bch_fsck_flags flags, enum bch_sb_error_id err, const char *fmt, ...) { - struct fsck_err_state *s = NULL; va_list args; - bool print = true, suppressing = false, inconsistent = false, exiting = false; struct printbuf buf = PRINTBUF, *out = &buf; int ret = -BCH_ERR_fsck_ignore; const char *action_orig = "fix?", *action = action_orig; @@ -287,7 +477,12 @@ int __bch2_fsck_err(struct bch_fs *c, ? 
-BCH_ERR_fsck_fix : -BCH_ERR_fsck_ignore; - bch2_sb_error_count(c, err); + printbuf_indent_add_nextline(out, 2); + +#ifdef BCACHEFS_LOG_PREFIX + if (strncmp(fmt, "bcachefs", 8)) + prt_printf(out, bch2_log_msg(c, "")); +#endif va_start(args, fmt); prt_vprintf(out, fmt, args); @@ -307,42 +502,15 @@ int __bch2_fsck_err(struct bch_fs *c, } mutex_lock(&c->fsck_error_msgs_lock); - s = fsck_err_get(c, fmt); - if (s) { - /* - * We may be called multiple times for the same error on - * transaction restart - this memoizes instead of asking the user - * multiple times for the same error: - */ - if (s->last_msg && !strcmp(buf.buf, s->last_msg)) { - ret = s->ret; - goto err_unlock; - } - - kfree(s->last_msg); - s->last_msg = kstrdup(buf.buf, GFP_KERNEL); - if (!s->last_msg) { - ret = -ENOMEM; - goto err_unlock; - } - - if (c->opts.ratelimit_errors && - !(flags & FSCK_NO_RATELIMIT) && - s->nr >= FSCK_ERR_RATELIMIT_NR) { - if (s->nr == FSCK_ERR_RATELIMIT_NR) - suppressing = true; - else - print = false; - } - - s->nr++; + bool repeat = false, print = true, suppress = false; + bool inconsistent = false, exiting = false; + struct fsck_err_state *s = + count_fsck_err_locked(c, err, buf.buf, &repeat, &print, &suppress); + if (repeat) { + ret = s->ret; + goto err_unlock; } -#ifdef BCACHEFS_LOG_PREFIX - if (!strncmp(fmt, "bcachefs:", 9)) - prt_printf(out, bch2_log_msg(c, "")); -#endif - if ((flags & FSCK_AUTOFIX) && (c->opts.errors == BCH_ON_ERROR_continue || c->opts.errors == BCH_ON_ERROR_fix_safe)) { @@ -356,11 +524,14 @@ int __bch2_fsck_err(struct bch_fs *c, } goto print; - } else if (!test_bit(BCH_FS_fsck_running, &c->flags)) { + } else if (!test_bit(BCH_FS_in_fsck, &c->flags)) { if (c->opts.errors != BCH_ON_ERROR_continue || !(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) { - prt_str(out, ", shutting down"); + prt_str_indented(out, ", shutting down\n" + "error not marked as autofix and not in fsck\n" + "run fsck, and forward to devs so error can be marked for self-healing"); inconsistent = true; + print = true; ret = -BCH_ERR_fsck_errors_not_fixed; } else if (flags & FSCK_CAN_FIX) { prt_str(out, ", "); @@ -412,31 +583,37 @@ int __bch2_fsck_err(struct bch_fs *c, !(flags & FSCK_CAN_IGNORE))) ret = -BCH_ERR_fsck_errors_not_fixed; - if (test_bit(BCH_FS_fsck_running, &c->flags) && + if (test_bit(BCH_FS_in_fsck, &c->flags) && (ret != -BCH_ERR_fsck_fix && ret != -BCH_ERR_fsck_ignore)) { exiting = true; print = true; } print: + prt_newline(out); + + if (inconsistent) + __bch2_inconsistent_error(c, out); + else if (exiting) + prt_printf(out, "Unable to continue, halting\n"); + else if (suppress) + prt_printf(out, "Ratelimiting new instances of previous error\n"); + if (print) { + /* possibly strip an empty line, from printbuf_indent_add */ + while (out->pos && out->buf[out->pos - 1] == ' ') + --out->pos; + printbuf_nul_terminate(out); + if (bch2_fs_stdio_redirect(c)) - bch2_print(c, "%s\n", out->buf); + bch2_print(c, "%s", out->buf); else - bch2_print_string_as_lines(KERN_ERR, out->buf); + bch2_print_str(c, KERN_ERR, out->buf); } - if (exiting) - bch_err(c, "Unable to continue, halting"); - else if (suppressing) - bch_err(c, "Ratelimiting new instances of previous error"); - if (s) s->ret = ret; - if (inconsistent) - bch2_inconsistent_error(c); - /* * We don't yet track whether the filesystem currently has errors, for * log_fsck_err()s: that would require us to track for every error type @@ -498,29 +675,27 @@ int __bch2_bkey_fsck_err(struct bch_fs *c, prt_printf(&buf, " level=%u: ", from.level); 
bch2_bkey_val_to_text(&buf, c, k); - prt_str(&buf, "\n "); + prt_newline(&buf); va_list args; va_start(args, fmt); prt_vprintf(&buf, fmt, args); va_end(args); - prt_str(&buf, ": delete?"); - - int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s", buf.buf); + int ret = __bch2_fsck_err(c, NULL, fsck_flags, err, "%s, delete?", buf.buf); printbuf_exit(&buf); return ret; } -void bch2_flush_fsck_errs(struct bch_fs *c) +static void __bch2_flush_fsck_errs(struct bch_fs *c, bool print) { struct fsck_err_state *s, *n; mutex_lock(&c->fsck_error_msgs_lock); list_for_each_entry_safe(s, n, &c->fsck_error_msgs, list) { - if (s->ratelimited && s->last_msg) - bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg); + if (print && s->ratelimited && s->last_msg) + bch_err(c, "Saw %llu errors like:\n %s", s->nr, s->last_msg); list_del(&s->list); kfree(s->last_msg); @@ -530,35 +705,53 @@ void bch2_flush_fsck_errs(struct bch_fs *c) mutex_unlock(&c->fsck_error_msgs_lock); } -int bch2_inum_err_msg_trans(struct btree_trans *trans, struct printbuf *out, subvol_inum inum) +void bch2_flush_fsck_errs(struct bch_fs *c) +{ + __bch2_flush_fsck_errs(c, true); +} + +void bch2_free_fsck_errs(struct bch_fs *c) +{ + __bch2_flush_fsck_errs(c, false); +} + +int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, + subvol_inum inum, u64 offset) { u32 restart_count = trans->restart_count; int ret = 0; - /* XXX: we don't yet attempt to print paths when we don't know the subvol */ - if (inum.subvol) - ret = lockrestart_do(trans, bch2_inum_to_path(trans, inum, out)); + if (inum.subvol) { + ret = bch2_inum_to_path(trans, inum, out); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + return ret; + } if (!inum.subvol || ret) prt_printf(out, "inum %llu:%llu", inum.subvol, inum.inum); + prt_printf(out, " offset %llu: ", offset); return trans_was_restarted(trans, restart_count); } -int bch2_inum_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, - subvol_inum inum, u64 offset) +void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, + subvol_inum inum, u64 offset) { - int ret = bch2_inum_err_msg_trans(trans, out, inum); - prt_printf(out, " offset %llu: ", offset); - return ret; + bch2_trans_do(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); } -void bch2_inum_err_msg(struct bch_fs *c, struct printbuf *out, subvol_inum inum) +int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *trans, struct printbuf *out, + struct bpos pos) { - bch2_trans_run(c, bch2_inum_err_msg_trans(trans, out, inum)); + int ret = bch2_inum_snapshot_to_path(trans, pos.inode, pos.snapshot, NULL, out); + if (ret) + return ret; + + prt_printf(out, " offset %llu: ", pos.offset << 8); + return 0; } -void bch2_inum_offset_err_msg(struct bch_fs *c, struct printbuf *out, - subvol_inum inum, u64 offset) +void bch2_inum_snap_offset_err_msg(struct bch_fs *c, struct printbuf *out, + struct bpos pos) { - bch2_trans_run(c, bch2_inum_offset_err_msg_trans(trans, out, inum, offset)); + bch2_trans_do(c, bch2_inum_snap_offset_err_msg_trans(trans, out, pos)); } diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 7acf2a27ca28..5123d4c86770 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -18,6 +18,13 @@ struct work_struct; /* Error messages: */ +void __bch2_log_msg_start(const char *, struct printbuf *); + +static inline void bch2_log_msg_start(struct bch_fs *c, struct printbuf *out) +{ + __bch2_log_msg_start(c->name, out); +} + /* * Inconsistency errors: The on disk 
data is inconsistent. If these occur during * initial recovery, they don't indicate a bug in the running code - we walk all @@ -29,21 +36,10 @@ struct work_struct; * BCH_ON_ERROR_CONTINUE mode */ +bool __bch2_inconsistent_error(struct bch_fs *, struct printbuf *); bool bch2_inconsistent_error(struct bch_fs *); - -int bch2_topology_error(struct bch_fs *); - -#define bch2_fs_topology_error(c, ...) \ -({ \ - bch_err(c, "btree topology error: " __VA_ARGS__); \ - bch2_topology_error(c); \ -}) - -#define bch2_fs_inconsistent(c, ...) \ -({ \ - bch_err(c, __VA_ARGS__); \ - bch2_inconsistent_error(c); \ -}) +__printf(2, 3) +bool bch2_fs_inconsistent(struct bch_fs *, const char *, ...); #define bch2_fs_inconsistent_on(cond, ...) \ ({ \ @@ -53,26 +49,21 @@ int bch2_topology_error(struct bch_fs *); _ret; \ }) -/* - * When a transaction update discovers or is causing a fs inconsistency, it's - * helpful to also dump the pending updates: - */ -#define bch2_trans_inconsistent(trans, ...) \ -({ \ - bch_err(trans->c, __VA_ARGS__); \ - bch2_dump_trans_updates(trans); \ - bch2_inconsistent_error(trans->c); \ -}) +__printf(2, 3) +bool bch2_trans_inconsistent(struct btree_trans *, const char *, ...); -#define bch2_trans_inconsistent_on(cond, trans, ...) \ +#define bch2_trans_inconsistent_on(cond, ...) \ ({ \ bool _ret = unlikely(!!(cond)); \ - \ if (_ret) \ - bch2_trans_inconsistent(trans, __VA_ARGS__); \ + bch2_trans_inconsistent(__VA_ARGS__); \ _ret; \ }) +int __bch2_topology_error(struct bch_fs *, struct printbuf *); +__printf(2, 3) +int bch2_fs_topology_error(struct bch_fs *, const char *, ...); + /* * Fsck errors: inconsistency errors we detect at mount time, and should ideally * be able to repair: @@ -80,7 +71,7 @@ int bch2_topology_error(struct bch_fs *); struct fsck_err_state { struct list_head list; - const char *fmt; + enum bch_sb_error_id id; u64 nr; bool ratelimited; int ret; @@ -90,6 +81,14 @@ struct fsck_err_state { #define fsck_err_count(_c, _err) bch2_sb_err_count(_c, BCH_FSCK_ERR_##_err) +bool __bch2_count_fsck_err(struct bch_fs *, enum bch_sb_error_id, struct printbuf *); +#define bch2_count_fsck_err(_c, _err, ...) \ + __bch2_count_fsck_err(_c, BCH_FSCK_ERR_##_err, __VA_ARGS__) + +int bch2_fsck_err_opt(struct bch_fs *, + enum bch_fsck_flags, + enum bch_sb_error_id); + __printf(5, 6) __cold int __bch2_fsck_err(struct bch_fs *, struct btree_trans *, enum bch_fsck_flags, @@ -101,6 +100,7 @@ int __bch2_fsck_err(struct bch_fs *, struct btree_trans *, _flags, BCH_FSCK_ERR_##_err_type, __VA_ARGS__) void bch2_flush_fsck_errs(struct bch_fs *); +void bch2_free_fsck_errs(struct bch_fs *); #define fsck_err_wrap(_do) \ ({ \ @@ -216,32 +216,43 @@ void bch2_io_error_work(struct work_struct *); /* Does the error handling without logging a message */ void bch2_io_error(struct bch_dev *, enum bch_member_error_type); -#define bch2_dev_io_err_on(cond, ca, _type, ...) \ -({ \ - bool _ret = (cond); \ - \ - if (_ret) { \ - bch_err_dev_ratelimited(ca, __VA_ARGS__); \ - bch2_io_error(ca, _type); \ - } \ - _ret; \ -}) - -#define bch2_dev_inum_io_err_on(cond, ca, _type, ...) 
\ -({ \ - bool _ret = (cond); \ - \ - if (_ret) { \ - bch_err_inum_offset_ratelimited(ca, __VA_ARGS__); \ - bch2_io_error(ca, _type); \ - } \ - _ret; \ -}) +#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT +void bch2_latency_acct(struct bch_dev *, u64, int); +#else +static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} +#endif + +static inline void bch2_account_io_success_fail(struct bch_dev *ca, + enum bch_member_error_type type, + bool success) +{ + if (likely(success)) { + if (type == BCH_MEMBER_ERROR_write && + ca->write_errors_start) + ca->write_errors_start = 0; + } else { + bch2_io_error(ca, type); + } +} + +static inline void bch2_account_io_completion(struct bch_dev *ca, + enum bch_member_error_type type, + u64 submit_time, bool success) +{ + if (unlikely(!ca)) + return; + + if (type != BCH_MEMBER_ERROR_checksum) + bch2_latency_acct(ca, submit_time, type); + + bch2_account_io_success_fail(ca, type, success); +} -int bch2_inum_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum); int bch2_inum_offset_err_msg_trans(struct btree_trans *, struct printbuf *, subvol_inum, u64); -void bch2_inum_err_msg(struct bch_fs *, struct printbuf *, subvol_inum); void bch2_inum_offset_err_msg(struct bch_fs *, struct printbuf *, subvol_inum, u64); +int bch2_inum_snap_offset_err_msg_trans(struct btree_trans *, struct printbuf *, struct bpos); +void bch2_inum_snap_offset_err_msg(struct bch_fs *, struct printbuf *, struct bpos); + #endif /* _BCACHEFS_ERROR_H */ diff --git a/fs/bcachefs/extent_update.c b/fs/bcachefs/extent_update.c index 6aac579a692a..b899ee75f5b9 100644 --- a/fs/bcachefs/extent_update.c +++ b/fs/bcachefs/extent_update.c @@ -37,16 +37,17 @@ static unsigned bch2_bkey_nr_alloc_ptrs(struct bkey_s_c k) return lru + ret * 2; } +#define EXTENT_ITERS_MAX 64 + static int count_iters_for_insert(struct btree_trans *trans, struct bkey_s_c k, unsigned offset, struct bpos *end, - unsigned *nr_iters, - unsigned max_iters) + unsigned *nr_iters) { int ret = 0, ret2 = 0; - if (*nr_iters >= max_iters) { + if (*nr_iters >= EXTENT_ITERS_MAX) { *end = bpos_min(*end, k.k->p); ret = 1; } @@ -56,7 +57,7 @@ static int count_iters_for_insert(struct btree_trans *trans, case KEY_TYPE_reflink_v: *nr_iters += bch2_bkey_nr_alloc_ptrs(k); - if (*nr_iters >= max_iters) { + if (*nr_iters >= EXTENT_ITERS_MAX) { *end = bpos_min(*end, k.k->p); ret = 1; } @@ -81,7 +82,7 @@ static int count_iters_for_insert(struct btree_trans *trans, *nr_iters += 1 + bch2_bkey_nr_alloc_ptrs(r_k); - if (*nr_iters >= max_iters) { + if (*nr_iters >= EXTENT_ITERS_MAX) { struct bpos pos = bkey_start_pos(k.k); pos.offset += min_t(u64, k.k->size, r_k.k->p.offset - idx); @@ -100,59 +101,31 @@ static int count_iters_for_insert(struct btree_trans *trans, return ret2 ?: ret; } -#define EXTENT_ITERS_MAX (BTREE_ITER_INITIAL / 3) - int bch2_extent_atomic_end(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_i *insert, struct bpos *end) { - struct btree_iter copy; - struct bkey_s_c k; unsigned nr_iters = 0; - int ret; - - ret = bch2_btree_iter_traverse(iter); - if (ret) - return ret; - - *end = insert->k.p; - /* extent_update_to_keys(): */ - nr_iters += 1; - - ret = count_iters_for_insert(trans, bkey_i_to_s_c(insert), 0, end, - &nr_iters, EXTENT_ITERS_MAX / 2); - if (ret < 0) - return ret; + struct btree_iter copy; + bch2_trans_copy_iter(trans, ©, iter); - bch2_trans_copy_iter(©, iter); + int ret = bch2_btree_iter_traverse(trans, ©); + if (ret) + goto err; - for_each_btree_key_max_continue_norestart(copy, 
insert->k.p, 0, k, ret) { + struct bkey_s_c k; + for_each_btree_key_max_continue_norestart(trans, copy, *end, 0, k, ret) { unsigned offset = 0; - if (bkey_gt(bkey_start_pos(&insert->k), bkey_start_pos(k.k))) - offset = bkey_start_offset(&insert->k) - - bkey_start_offset(k.k); + if (bkey_gt(iter->pos, bkey_start_pos(k.k))) + offset = iter->pos.offset - bkey_start_offset(k.k); - /* extent_handle_overwrites(): */ - switch (bch2_extent_overlap(&insert->k, k.k)) { - case BCH_EXTENT_OVERLAP_ALL: - case BCH_EXTENT_OVERLAP_FRONT: - nr_iters += 1; - break; - case BCH_EXTENT_OVERLAP_BACK: - case BCH_EXTENT_OVERLAP_MIDDLE: - nr_iters += 2; - break; - } - - ret = count_iters_for_insert(trans, k, offset, end, - &nr_iters, EXTENT_ITERS_MAX); + ret = count_iters_for_insert(trans, k, offset, end, &nr_iters); if (ret) break; } - +err: bch2_trans_iter_exit(trans, ©); return ret < 0 ? ret : 0; } @@ -161,10 +134,8 @@ int bch2_extent_trim_atomic(struct btree_trans *trans, struct btree_iter *iter, struct bkey_i *k) { - struct bpos end; - int ret; - - ret = bch2_extent_atomic_end(trans, iter, k, &end); + struct bpos end = k->k.p; + int ret = bch2_extent_atomic_end(trans, iter, &end); if (ret) return ret; diff --git a/fs/bcachefs/extent_update.h b/fs/bcachefs/extent_update.h index 6f5cf449361a..34467db53f45 100644 --- a/fs/bcachefs/extent_update.h +++ b/fs/bcachefs/extent_update.h @@ -5,7 +5,7 @@ #include "bcachefs.h" int bch2_extent_atomic_end(struct btree_trans *, struct btree_iter *, - struct bkey_i *, struct bpos *); + struct bpos *); int bch2_extent_trim_atomic(struct btree_trans *, struct btree_iter *, struct bkey_i *); diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 05d5f71a7ca9..1ac9897f189d 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -28,6 +28,13 @@ #include "trace.h" #include "util.h" +static const char * const bch2_extent_flags_strs[] = { +#define x(n, v) [BCH_EXTENT_FLAG_##n] = #n, + BCH_EXTENT_FLAGS() +#undef x + NULL, +}; + static unsigned bch2_crc_field_size_max[] = { [BCH_EXTENT_ENTRY_crc32] = CRC32_SIZE_MAX, [BCH_EXTENT_ENTRY_crc64] = CRC64_SIZE_MAX, @@ -38,6 +45,49 @@ static void bch2_extent_crc_pack(union bch_extent_crc *, struct bch_extent_crc_unpacked, enum bch_extent_entry_type); +void bch2_io_failures_to_text(struct printbuf *out, + struct bch_fs *c, + struct bch_io_failures *failed) +{ + static const char * const error_types[] = { + "io", "checksum", "ec reconstruct", NULL + }; + + for (struct bch_dev_io_failures *f = failed->devs; + f < failed->devs + failed->nr; + f++) { + unsigned errflags = + ((!!f->failed_io) << 0) | + ((!!f->failed_csum_nr) << 1) | + ((!!f->failed_ec) << 2); + + if (!errflags) + continue; + + bch2_printbuf_make_room(out, 1024); + rcu_read_lock(); + out->atomic++; + struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev); + if (ca) + prt_str(out, ca->name); + else + prt_printf(out, "(invalid device %u)", f->dev); + --out->atomic; + rcu_read_unlock(); + + prt_char(out, ' '); + + if (is_power_of_2(errflags)) { + prt_bitflags(out, error_types, errflags); + prt_str(out, " error"); + } else { + prt_str(out, "errors: "); + prt_bitflags(out, error_types, errflags); + } + prt_newline(out); + } +} + struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f, unsigned dev) { @@ -51,7 +101,8 @@ struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *f, } void bch2_mark_io_failure(struct bch_io_failures *failed, - struct extent_ptr_decoded *p) + struct extent_ptr_decoded *p, + bool csum_error) { struct 
bch_dev_io_failures *f = bch2_dev_io_failures(failed, p->ptr.dev); @@ -59,53 +110,73 @@ void bch2_mark_io_failure(struct bch_io_failures *failed, BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); f = &failed->devs[failed->nr++]; - f->dev = p->ptr.dev; - f->idx = p->idx; - f->nr_failed = 1; - f->nr_retries = 0; - } else if (p->idx != f->idx) { - f->idx = p->idx; - f->nr_failed = 1; - f->nr_retries = 0; - } else { - f->nr_failed++; + memset(f, 0, sizeof(*f)); + f->dev = p->ptr.dev; + } + + if (p->do_ec_reconstruct) + f->failed_ec = true; + else if (!csum_error) + f->failed_io = true; + else + f->failed_csum_nr++; +} + +void bch2_mark_btree_validate_failure(struct bch_io_failures *failed, + unsigned dev) +{ + struct bch_dev_io_failures *f = bch2_dev_io_failures(failed, dev); + + if (!f) { + BUG_ON(failed->nr >= ARRAY_SIZE(failed->devs)); + + f = &failed->devs[failed->nr++]; + memset(f, 0, sizeof(*f)); + f->dev = dev; } + + f->failed_btree_validate = true; } -static inline u64 dev_latency(struct bch_fs *c, unsigned dev) +static inline u64 dev_latency(struct bch_dev *ca) { - struct bch_dev *ca = bch2_dev_rcu(c, dev); return ca ? atomic64_read(&ca->cur_latency[READ]) : S64_MAX; } +static inline int dev_failed(struct bch_dev *ca) +{ + return !ca || ca->mi.state == BCH_MEMBER_STATE_failed; +} + /* * returns true if p1 is better than p2: */ static inline bool ptr_better(struct bch_fs *c, const struct extent_ptr_decoded p1, - const struct extent_ptr_decoded p2) + u64 p1_latency, + struct bch_dev *ca1, + const struct extent_ptr_decoded p2, + u64 p2_latency) { - if (likely(!p1.idx && !p2.idx)) { - u64 l1 = dev_latency(c, p1.ptr.dev); - u64 l2 = dev_latency(c, p2.ptr.dev); + struct bch_dev *ca2 = bch2_dev_rcu(c, p2.ptr.dev); - /* - * Square the latencies, to bias more in favor of the faster - * device - we never want to stop issuing reads to the slower - * device altogether, so that we can update our latency numbers: - */ - l1 *= l1; - l2 *= l2; + int failed_delta = dev_failed(ca1) - dev_failed(ca2); + if (unlikely(failed_delta)) + return failed_delta < 0; - /* Pick at random, biased in favor of the faster device: */ + if (static_branch_unlikely(&bch2_force_reconstruct_read)) + return p1.do_ec_reconstruct > p2.do_ec_reconstruct; - return bch2_rand_range(l1 + l2) > l1; - } + if (unlikely(p1.do_ec_reconstruct || p2.do_ec_reconstruct)) + return p1.do_ec_reconstruct < p2.do_ec_reconstruct; - if (bch2_force_reconstruct_read) - return p1.idx > p2.idx; + int crc_retry_delta = (int) p1.crc_retry_nr - (int) p2.crc_retry_nr; + if (unlikely(crc_retry_delta)) + return crc_retry_delta < 0; - return p1.idx < p2.idx; + /* Pick at random, biased in favor of the faster device: */ + + return bch2_get_random_u64_below(p1_latency + p2_latency) > p1_latency; } /* @@ -115,64 +186,117 @@ static inline bool ptr_better(struct bch_fs *c, */ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, struct bch_io_failures *failed, - struct extent_ptr_decoded *pick) + struct extent_ptr_decoded *pick, + int dev) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - struct bch_dev_io_failures *f; - int ret = 0; + bool have_csum_errors = false, have_io_errors = false, have_missing_devs = false; + bool have_dirty_ptrs = false, have_pick = false; if (k.k->type == KEY_TYPE_error) return -BCH_ERR_key_type_error; rcu_read_lock(); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + u64 
pick_latency; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + have_dirty_ptrs |= !p.ptr.cached; + /* * Unwritten extent: no need to actually read, treat it as a * hole and return 0s: */ if (p.ptr.unwritten) { - ret = 0; - break; + rcu_read_unlock(); + return 0; } - /* - * If there are any dirty pointers it's an error if we can't - * read: - */ - if (!ret && !p.ptr.cached) - ret = -BCH_ERR_no_device_to_read_from; + /* Are we being asked to read from a specific device? */ + if (dev >= 0 && p.ptr.dev != dev) + continue; + + struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); - struct bch_dev *ca = bch2_dev_rcu(c, p.ptr.dev); + if (unlikely(!ca && p.ptr.dev != BCH_SB_MEMBER_INVALID)) { + rcu_read_unlock(); + int ret = bch2_dev_missing_bkey(c, k, p.ptr.dev); + if (ret) + return ret; + rcu_read_lock(); + } if (p.ptr.cached && (!ca || dev_ptr_stale_rcu(ca, &p.ptr))) continue; - f = failed ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL; - if (f) - p.idx = f->nr_failed < f->nr_retries - ? f->idx - : f->idx + 1; + struct bch_dev_io_failures *f = + unlikely(failed) ? bch2_dev_io_failures(failed, p.ptr.dev) : NULL; + if (unlikely(f)) { + p.crc_retry_nr = f->failed_csum_nr; + p.has_ec &= ~f->failed_ec; - if (!p.idx && (!ca || !bch2_dev_is_readable(ca))) - p.idx++; + if (ca && ca->mi.state != BCH_MEMBER_STATE_failed) { + have_io_errors |= f->failed_io; + have_io_errors |= f->failed_btree_validate; + have_io_errors |= f->failed_ec; + } + have_csum_errors |= !!f->failed_csum_nr; + + if (p.has_ec && (f->failed_io || f->failed_csum_nr)) + p.do_ec_reconstruct = true; + else if (f->failed_io || + f->failed_btree_validate || + f->failed_csum_nr > c->opts.checksum_err_retry_nr) + continue; + } - if (!p.idx && p.has_ec && bch2_force_reconstruct_read) - p.idx++; + have_missing_devs |= ca && !bch2_dev_is_online(ca); - if (p.idx > (unsigned) p.has_ec) - continue; + if (!ca || !bch2_dev_is_online(ca)) { + if (!p.has_ec) + continue; + p.do_ec_reconstruct = true; + } - if (ret > 0 && !ptr_better(c, p, *pick)) - continue; + if (static_branch_unlikely(&bch2_force_reconstruct_read) && p.has_ec) + p.do_ec_reconstruct = true; - *pick = p; - ret = 1; + u64 p_latency = dev_latency(ca); + /* + * Square the latencies, to bias more in favor of the faster + * device - we never want to stop issuing reads to the slower + * device altogether, so that we can update our latency numbers: + */ + p_latency *= p_latency; + + if (!have_pick || + ptr_better(c, + p, p_latency, ca, + *pick, pick_latency)) { + *pick = p; + pick_latency = p_latency; + have_pick = true; + } } rcu_read_unlock(); - return ret; + if (have_pick) + return 1; + if (!have_dirty_ptrs) + return 0; + if (have_missing_devs) + return -BCH_ERR_no_device_to_read_from; + if (have_csum_errors) + return -BCH_ERR_data_read_csum_err; + if (have_io_errors) + return -BCH_ERR_data_read_io_err; + + /* + * If we get here, we have pointers (bkey_ptrs_validate() ensures that), + * but they don't point to valid devices: + */ + return -BCH_ERR_no_devices_valid; } /* KEY_TYPE_btree_ptr: */ @@ -536,29 +660,35 @@ static void bch2_extent_crc_pack(union bch_extent_crc *dst, struct bch_extent_crc_unpacked src, enum bch_extent_entry_type type) { -#define set_common_fields(_dst, _src) \ - _dst.type = 1 << type; \ - _dst.csum_type = _src.csum_type, \ - _dst.compression_type = _src.compression_type, \ - _dst._compressed_size = _src.compressed_size - 1, \ - _dst._uncompressed_size = _src.uncompressed_size - 1, \ - _dst.offset = _src.offset +#define common_fields(_src) \ + .type = 
BIT(type), \ + .csum_type = _src.csum_type, \ + .compression_type = _src.compression_type, \ + ._compressed_size = _src.compressed_size - 1, \ + ._uncompressed_size = _src.uncompressed_size - 1, \ + .offset = _src.offset switch (type) { case BCH_EXTENT_ENTRY_crc32: - set_common_fields(dst->crc32, src); - dst->crc32.csum = (u32 __force) *((__le32 *) &src.csum.lo); + dst->crc32 = (struct bch_extent_crc32) { + common_fields(src), + .csum = (u32 __force) *((__le32 *) &src.csum.lo), + }; break; case BCH_EXTENT_ENTRY_crc64: - set_common_fields(dst->crc64, src); - dst->crc64.nonce = src.nonce; - dst->crc64.csum_lo = (u64 __force) src.csum.lo; - dst->crc64.csum_hi = (u64 __force) *((__le16 *) &src.csum.hi); + dst->crc64 = (struct bch_extent_crc64) { + common_fields(src), + .nonce = src.nonce, + .csum_lo = (u64 __force) src.csum.lo, + .csum_hi = (u64 __force) *((__le16 *) &src.csum.hi), + }; break; case BCH_EXTENT_ENTRY_crc128: - set_common_fields(dst->crc128, src); - dst->crc128.nonce = src.nonce; - dst->crc128.csum = src.csum; + dst->crc128 = (struct bch_extent_crc128) { + common_fields(src), + .nonce = src.nonce, + .csum = src.csum, + }; break; default: BUG(); @@ -991,13 +1121,14 @@ bch2_extent_has_ptr(struct bkey_s_c k1, struct extent_ptr_decoded p1, struct bke static bool want_cached_ptr(struct bch_fs *c, struct bch_io_opts *opts, struct bch_extent_ptr *ptr) { - if (!opts->promote_target || - !bch2_dev_in_target(c, ptr->dev, opts->promote_target)) + unsigned target = opts->promote_target ?: opts->foreground_target; + + if (target && !bch2_dev_in_target(c, ptr->dev, target)) return false; struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); - return ca && bch2_dev_is_readable(ca) && !dev_ptr_stale_rcu(ca, ptr); + return ca && bch2_dev_is_healthy(ca) && !dev_ptr_stale_rcu(ca, ptr); } void bch2_extent_ptr_set_cached(struct bch_fs *c, @@ -1005,33 +1136,50 @@ void bch2_extent_ptr_set_cached(struct bch_fs *c, struct bkey_s k, struct bch_extent_ptr *ptr) { - struct bkey_ptrs ptrs = bch2_bkey_ptrs(k); + struct bkey_ptrs ptrs; union bch_extent_entry *entry; struct extent_ptr_decoded p; + bool have_cached_ptr; + unsigned drop_dev = ptr->dev; rcu_read_lock(); - if (!want_cached_ptr(c, opts, ptr)) { - bch2_bkey_drop_ptr_noerror(k, ptr); - goto out; - } +restart_drop_ptrs: + ptrs = bch2_bkey_ptrs(k); + have_cached_ptr = false; - /* - * Stripes can't contain cached data, for - reasons. - * - * Possibly something we can fix in the future? - */ - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (&entry->ptr == ptr) { - if (p.has_ec) - bch2_bkey_drop_ptr_noerror(k, ptr); - else - ptr->cached = true; - goto out; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { + /* + * Check if it's erasure coded - stripes can't contain cached + * data. Possibly something we can fix in the future? 
+ */ + if (&entry->ptr == ptr && p.has_ec) + goto drop; + + if (p.ptr.cached) { + if (have_cached_ptr || !want_cached_ptr(c, opts, &p.ptr)) { + bch2_bkey_drop_ptr_noerror(k, &entry->ptr); + ptr = NULL; + goto restart_drop_ptrs; + } + + have_cached_ptr = true; } + } + + if (!ptr) + bkey_for_each_ptr(ptrs, ptr2) + if (ptr2->dev == drop_dev) + ptr = ptr2; - BUG(); -out: + if (have_cached_ptr || !want_cached_ptr(c, opts, ptr)) + goto drop; + + ptr->cached = true; + rcu_read_unlock(); + return; +drop: rcu_read_unlock(); + bch2_bkey_drop_ptr_noerror(k, ptr); } /* @@ -1220,6 +1368,10 @@ void bch2_bkey_ptrs_to_text(struct printbuf *out, struct bch_fs *c, bch2_extent_rebalance_to_text(out, c, &entry->rebalance); break; + case BCH_EXTENT_ENTRY_flags: + prt_bitflags(out, bch2_extent_flags_strs, entry->flags.flags); + break; + default: prt_printf(out, "(invalid extent entry %.16llx)", *((u64 *) entry)); return; @@ -1381,6 +1533,11 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, #endif break; } + case BCH_EXTENT_ENTRY_flags: + bkey_fsck_err_on(entry != ptrs.start, + c, extent_flags_not_at_start, + "extent flags entry not at start"); + break; } } @@ -1447,6 +1604,28 @@ void bch2_ptr_swab(struct bkey_s k) } } +int bch2_bkey_extent_flags_set(struct bch_fs *c, struct bkey_i *k, u64 flags) +{ + int ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_extent_flags); + if (ret) + return ret; + + struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(k)); + + if (ptrs.start != ptrs.end && + extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) { + ptrs.start->flags.flags = flags; + } else { + struct bch_extent_flags f = { + .type = BIT(BCH_EXTENT_ENTRY_flags), + .flags = flags, + }; + __extent_entry_insert(k, ptrs.start, (union bch_extent_entry *) &f); + } + + return 0; +} + /* Generic extent code: */ int bch2_cut_front_s(struct bpos where, struct bkey_s k) @@ -1492,8 +1671,8 @@ int bch2_cut_front_s(struct bpos where, struct bkey_s k) entry->crc128.offset += sub; break; case BCH_EXTENT_ENTRY_stripe_ptr: - break; case BCH_EXTENT_ENTRY_rebalance: + case BCH_EXTENT_ENTRY_flags: break; } diff --git a/fs/bcachefs/extents.h b/fs/bcachefs/extents.h index 620b284aa34f..b8590e51b76e 100644 --- a/fs/bcachefs/extents.h +++ b/fs/bcachefs/extents.h @@ -320,8 +320,9 @@ static inline struct bkey_ptrs bch2_bkey_ptrs(struct bkey_s k) ({ \ __label__ out; \ \ - (_ptr).idx = 0; \ - (_ptr).has_ec = false; \ + (_ptr).has_ec = false; \ + (_ptr).do_ec_reconstruct = false; \ + (_ptr).crc_retry_nr = 0; \ \ __bkey_extent_entry_for_each_from(_entry, _end, _entry) \ switch (__extent_entry_type(_entry)) { \ @@ -379,13 +380,6 @@ out: \ /* Iterate over pointers in KEY_TYPE_extent: */ -#define extent_for_each_entry_from(_e, _entry, _start) \ - __bkey_extent_entry_for_each_from(_start, \ - extent_entry_last(_e), _entry) - -#define extent_for_each_entry(_e, _entry) \ - extent_for_each_entry_from(_e, _entry, (_e).v->start) - #define extent_ptr_next(_e, _ptr) \ __bkey_ptr_next(_ptr, extent_entry_last(_e)) @@ -398,13 +392,16 @@ out: \ /* utility code common to all keys with pointers: */ +void bch2_io_failures_to_text(struct printbuf *, struct bch_fs *, + struct bch_io_failures *); struct bch_dev_io_failures *bch2_dev_io_failures(struct bch_io_failures *, unsigned); void bch2_mark_io_failure(struct bch_io_failures *, - struct extent_ptr_decoded *); + struct extent_ptr_decoded *, bool); +void bch2_mark_btree_validate_failure(struct bch_io_failures *, unsigned); int bch2_bkey_pick_read_device(struct bch_fs *, struct 
bkey_s_c, struct bch_io_failures *, - struct extent_ptr_decoded *); + struct extent_ptr_decoded *, int); /* KEY_TYPE_btree_ptr: */ @@ -704,7 +701,7 @@ static inline bool bch2_extent_ptr_eq(struct bch_extent_ptr ptr1, ptr1.unwritten == ptr2.unwritten && ptr1.offset == ptr2.offset && ptr1.dev == ptr2.dev && - ptr1.dev == ptr2.dev); + ptr1.gen == ptr2.gen); } void bch2_ptr_swab(struct bkey_s); @@ -753,4 +750,19 @@ static inline void bch2_key_resize(struct bkey *k, unsigned new_size) k->size = new_size; } +static inline u64 bch2_bkey_extent_ptrs_flags(struct bkey_ptrs_c ptrs) +{ + if (ptrs.start != ptrs.end && + extent_entry_type(ptrs.start) == BCH_EXTENT_ENTRY_flags) + return ptrs.start->flags.flags; + return 0; +} + +static inline u64 bch2_bkey_extent_flags(struct bkey_s_c k) +{ + return bch2_bkey_extent_ptrs_flags(bch2_bkey_ptrs_c(k)); +} + +int bch2_bkey_extent_flags_set(struct bch_fs *, struct bkey_i *, u64); + #endif /* _BCACHEFS_EXTENTS_H */ diff --git a/fs/bcachefs/extents_format.h b/fs/bcachefs/extents_format.h index c198dfc376d6..74c0252cbd98 100644 --- a/fs/bcachefs/extents_format.h +++ b/fs/bcachefs/extents_format.h @@ -79,8 +79,9 @@ x(crc64, 2) \ x(crc128, 3) \ x(stripe_ptr, 4) \ - x(rebalance, 5) -#define BCH_EXTENT_ENTRY_MAX 6 + x(rebalance, 5) \ + x(flags, 6) +#define BCH_EXTENT_ENTRY_MAX 7 enum bch_extent_entry_type { #define x(f, n) BCH_EXTENT_ENTRY_##f = n, @@ -201,6 +202,25 @@ struct bch_extent_stripe_ptr { #endif }; +#define BCH_EXTENT_FLAGS() \ + x(poisoned, 0) + +enum bch_extent_flags_e { +#define x(n, v) BCH_EXTENT_FLAG_##n = v, + BCH_EXTENT_FLAGS() +#undef x +}; + +struct bch_extent_flags { +#if defined(__LITTLE_ENDIAN_BITFIELD) + __u64 type:7, + flags:57; +#elif defined (__BIG_ENDIAN_BITFIELD) + __u64 flags:57, + type:7; +#endif +}; + /* bch_extent_rebalance: */ #include "rebalance_format.h" diff --git a/fs/bcachefs/extents_types.h b/fs/bcachefs/extents_types.h index 43d6c341ecca..b23ce4a373c0 100644 --- a/fs/bcachefs/extents_types.h +++ b/fs/bcachefs/extents_types.h @@ -20,8 +20,9 @@ struct bch_extent_crc_unpacked { }; struct extent_ptr_decoded { - unsigned idx; bool has_ec; + bool do_ec_reconstruct; + u8 crc_retry_nr; struct bch_extent_crc_unpacked crc; struct bch_extent_ptr ptr; struct bch_extent_stripe_ptr ec; @@ -31,10 +32,11 @@ struct bch_io_failures { u8 nr; struct bch_dev_io_failures { u8 dev; - u8 idx; - u8 nr_failed; - u8 nr_retries; - } devs[BCH_REPLICAS_MAX]; + unsigned failed_csum_nr:6, + failed_io:1, + failed_btree_validate:1, + failed_ec:1; + } devs[BCH_REPLICAS_MAX + 1]; }; #endif /* _BCACHEFS_EXTENTS_TYPES_H */ diff --git a/fs/bcachefs/eytzinger.c b/fs/bcachefs/eytzinger.c index 2eaffe37b5e7..0e742555cb0a 100644 --- a/fs/bcachefs/eytzinger.c +++ b/fs/bcachefs/eytzinger.c @@ -148,89 +148,99 @@ static int do_cmp(const void *a, const void *b, cmp_r_func_t cmp, const void *pr return cmp(a, b, priv); } -static inline int eytzinger0_do_cmp(void *base, size_t n, size_t size, +static inline int eytzinger1_do_cmp(void *base1, size_t n, size_t size, cmp_r_func_t cmp_func, const void *priv, size_t l, size_t r) { - return do_cmp(base + inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, + return do_cmp(base1 + inorder_to_eytzinger1(l, n) * size, + base1 + inorder_to_eytzinger1(r, n) * size, cmp_func, priv); } -static inline void eytzinger0_do_swap(void *base, size_t n, size_t size, +static inline void eytzinger1_do_swap(void *base1, size_t n, size_t size, swap_r_func_t swap_func, const void *priv, size_t l, size_t r) { - do_swap(base 
+ inorder_to_eytzinger0(l, n) * size, - base + inorder_to_eytzinger0(r, n) * size, + do_swap(base1 + inorder_to_eytzinger1(l, n) * size, + base1 + inorder_to_eytzinger1(r, n) * size, size, swap_func, priv); } -void eytzinger0_sort_r(void *base, size_t n, size_t size, - cmp_r_func_t cmp_func, - swap_r_func_t swap_func, - const void *priv) +static void eytzinger1_sort_r(void *base1, size_t n, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv) { - int i, j, k; + unsigned i, j, k; /* called from 'sort' without swap function, let's pick the default */ if (swap_func == SWAP_WRAPPER && !((struct wrapper *)priv)->swap_func) swap_func = NULL; if (!swap_func) { - if (is_aligned(base, size, 8)) + if (is_aligned(base1, size, 8)) swap_func = SWAP_WORDS_64; - else if (is_aligned(base, size, 4)) + else if (is_aligned(base1, size, 4)) swap_func = SWAP_WORDS_32; else swap_func = SWAP_BYTES; } /* heapify */ - for (i = n / 2 - 1; i >= 0; --i) { + for (i = n / 2; i >= 1; --i) { /* Find the sift-down path all the way to the leaves. */ - for (j = i; k = j * 2 + 1, k + 1 < n;) - j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; + for (j = i; k = j * 2, k < n;) + j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; /* Special case for the last leaf with no sibling. */ - if (j * 2 + 2 == n) - j = j * 2 + 1; + if (j * 2 == n) + j *= 2; /* Backtrack to the correct location. */ - while (j != i && eytzinger0_do_cmp(base, n, size, cmp_func, priv, i, j) >= 0) - j = (j - 1) / 2; + while (j != i && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, i, j) >= 0) + j /= 2; /* Shift the element into its correct place. */ for (k = j; j != i;) { - j = (j - 1) / 2; - eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); + j /= 2; + eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k); } } /* sort */ - for (i = n - 1; i > 0; --i) { - eytzinger0_do_swap(base, n, size, swap_func, priv, 0, i); + for (i = n; i > 1; --i) { + eytzinger1_do_swap(base1, n, size, swap_func, priv, 1, i); /* Find the sift-down path all the way to the leaves. */ - for (j = 0; k = j * 2 + 1, k + 1 < i;) - j = eytzinger0_do_cmp(base, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; + for (j = 1; k = j * 2, k + 1 < i;) + j = eytzinger1_do_cmp(base1, n, size, cmp_func, priv, k, k + 1) > 0 ? k : k + 1; /* Special case for the last leaf with no sibling. */ - if (j * 2 + 2 == i) - j = j * 2 + 1; + if (j * 2 + 1 == i) + j *= 2; /* Backtrack to the correct location. */ - while (j && eytzinger0_do_cmp(base, n, size, cmp_func, priv, 0, j) >= 0) - j = (j - 1) / 2; + while (j >= 1 && eytzinger1_do_cmp(base1, n, size, cmp_func, priv, 1, j) >= 0) + j /= 2; /* Shift the element into its correct place. 
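The switch above from 0-based eytzinger0 indexing to 1-based eytzinger1 indexing is what lets the heapify and sift-down loops use bare shifts: with 1-based nodes, the children of node i are 2i and 2i+1 and its parent is i/2, with no +1/-1 corrections. A sketch of the index arithmetic and of the base1 = base - size trick used by the eytzinger0_sort_r() wrapper (helper names here are illustrative):

static inline unsigned eyt1_left(unsigned i)	{ return 2 * i; }
static inline unsigned eyt1_right(unsigned i)	{ return 2 * i + 1; }
static inline unsigned eyt1_parent(unsigned i)	{ return i / 2; }

/*
 * Shifting the base pointer back by one element makes 1-based node i
 * land on 0-based array slot i - 1, so the hot loops never subtract:
 */
static inline void *eyt1_node(void *base, size_t size, unsigned i)
{
	void *base1 = base - size;

	return base1 + i * size;	/* == base + (i - 1) * size */
}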
*/ - for (k = j; j;) { - j = (j - 1) / 2; - eytzinger0_do_swap(base, n, size, swap_func, priv, j, k); + for (k = j; j > 1;) { + j /= 2; + eytzinger1_do_swap(base1, n, size, swap_func, priv, j, k); } } } +void eytzinger0_sort_r(void *base, size_t n, size_t size, + cmp_r_func_t cmp_func, + swap_r_func_t swap_func, + const void *priv) +{ + void *base1 = base - size; + + return eytzinger1_sort_r(base1, n, size, cmp_func, swap_func, priv); +} + void eytzinger0_sort(void *base, size_t n, size_t size, cmp_func_t cmp_func, swap_func_t swap_func) diff --git a/fs/bcachefs/eytzinger.h b/fs/bcachefs/eytzinger.h index 0541192d7bc0..643c1f716061 100644 --- a/fs/bcachefs/eytzinger.h +++ b/fs/bcachefs/eytzinger.h @@ -6,6 +6,7 @@ #include <linux/log2.h> #ifdef EYTZINGER_DEBUG +#include <linux/bug.h> #define EYTZINGER_BUG_ON(cond) BUG_ON(cond) #else #define EYTZINGER_BUG_ON(cond) @@ -56,24 +57,14 @@ static inline unsigned eytzinger1_last(unsigned size) return rounddown_pow_of_two(size + 1) - 1; } -/* - * eytzinger1_next() and eytzinger1_prev() have the nice properties that - * - * eytzinger1_next(0) == eytzinger1_first()) - * eytzinger1_prev(0) == eytzinger1_last()) - * - * eytzinger1_prev(eytzinger1_first()) == 0 - * eytzinger1_next(eytzinger1_last()) == 0 - */ - static inline unsigned eytzinger1_next(unsigned i, unsigned size) { - EYTZINGER_BUG_ON(i > size); + EYTZINGER_BUG_ON(i == 0 || i > size); if (eytzinger1_right_child(i) <= size) { i = eytzinger1_right_child(i); - i <<= __fls(size + 1) - __fls(i); + i <<= __fls(size) - __fls(i); i >>= i > size; } else { i >>= ffz(i) + 1; @@ -84,12 +75,12 @@ static inline unsigned eytzinger1_next(unsigned i, unsigned size) static inline unsigned eytzinger1_prev(unsigned i, unsigned size) { - EYTZINGER_BUG_ON(i > size); + EYTZINGER_BUG_ON(i == 0 || i > size); if (eytzinger1_left_child(i) <= size) { i = eytzinger1_left_child(i) + 1; - i <<= __fls(size + 1) - __fls(i); + i <<= __fls(size) - __fls(i); i -= 1; i >>= i > size; } else { @@ -243,73 +234,63 @@ static inline unsigned inorder_to_eytzinger0(unsigned i, unsigned size) (_i) != -1; \ (_i) = eytzinger0_next((_i), (_size))) +#define eytzinger0_for_each_prev(_i, _size) \ + for (unsigned (_i) = eytzinger0_last((_size)); \ + (_i) != -1; \ + (_i) = eytzinger0_prev((_i), (_size))) + /* return greatest node <= @search, or -1 if not found */ static inline int eytzinger0_find_le(void *base, size_t nr, size_t size, cmp_func_t cmp, const void *search) { - unsigned i, n = 0; - - if (!nr) - return -1; - - do { - i = n; - n = eytzinger0_child(i, cmp(base + i * size, search) <= 0); - } while (n < nr); - - if (n & 1) { - /* - * @i was greater than @search, return previous node: - * - * if @i was leftmost/smallest element, - * eytzinger0_prev(eytzinger0_first())) returns -1, as expected - */ - return eytzinger0_prev(i, nr); - } else { - return i; - } + void *base1 = base - size; + unsigned n = 1; + + while (n <= nr) + n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0); + n >>= __ffs(n) + 1; + return n - 1; } +/* return smallest node > @search, or -1 if not found */ static inline int eytzinger0_find_gt(void *base, size_t nr, size_t size, cmp_func_t cmp, const void *search) { - ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); + void *base1 = base - size; + unsigned n = 1; - /* - * if eytitzinger0_find_le() returned -1 - no element was <= search - we - * want to return the first element; next/prev identities mean this work - * as expected - * - * similarly if find_le() returns last element, we should return -1; - 
* identities mean this all works out: - */ - return eytzinger0_next(idx, nr); + while (n <= nr) + n = eytzinger1_child(n, cmp(base1 + n * size, search) <= 0); + n >>= __ffs(n + 1) + 1; + return n - 1; } +/* return smallest node >= @search, or -1 if not found */ static inline int eytzinger0_find_ge(void *base, size_t nr, size_t size, cmp_func_t cmp, const void *search) { - ssize_t idx = eytzinger0_find_le(base, nr, size, cmp, search); - - if (idx < nr && !cmp(base + idx * size, search)) - return idx; + void *base1 = base - size; + unsigned n = 1; - return eytzinger0_next(idx, nr); + while (n <= nr) + n = eytzinger1_child(n, cmp(base1 + n * size, search) < 0); + n >>= __ffs(n + 1) + 1; + return n - 1; } #define eytzinger0_find(base, nr, size, _cmp, search) \ ({ \ - void *_base = (base); \ + size_t _size = (size); \ + void *_base1 = (void *)(base) - _size; \ const void *_search = (search); \ size_t _nr = (nr); \ - size_t _size = (size); \ - size_t _i = 0; \ + size_t _i = 1; \ int _res; \ \ - while (_i < _nr && \ - (_res = _cmp(_search, _base + _i * _size))) \ - _i = eytzinger0_child(_i, _res > 0); \ - _i; \ + while (_i <= _nr && \ + (_res = _cmp(_search, _base1 + _i * _size))) \ + _i = eytzinger1_child(_i, _res > 0); \ + _i - 1; \ }) void eytzinger0_sort_r(void *, size_t, size_t, diff --git a/fs/bcachefs/fast_list.c b/fs/bcachefs/fast_list.c new file mode 100644 index 000000000000..2faec143eb31 --- /dev/null +++ b/fs/bcachefs/fast_list.c @@ -0,0 +1,156 @@ +// SPDX-License-Identifier: GPL-2.0 + +/* + * Fast, unordered lists + * + * Supports add, remove, and iterate + * + * Underneath, they're a radix tree and an IDA, with a percpu buffer for slot + * allocation and freeing. + * + * This means that adding, removing, and iterating over items is lockless, + * except when refilling/emptying the percpu slot buffers. + */ + +#include "fast_list.h" + +struct fast_list_pcpu { + u32 nr; + u32 entries[31]; +}; + +static int fast_list_alloc_idx(struct fast_list *l, gfp_t gfp) +{ + int idx = ida_alloc_range(&l->slots_allocated, 1, INT_MAX, gfp); + if (unlikely(idx < 0)) + return 0; + + if (unlikely(!genradix_ptr_alloc_inlined(&l->items, idx, gfp))) { + ida_free(&l->slots_allocated, idx); + return 0; + } + + return idx; +} + +/** + * fast_list_get_idx - get a slot in a fast_list + * @l: list to get slot in + * + * This allocates a slot in the radix tree without storing to it, so that we can + * take the potential memory allocation failure early and do the list add later + * when we can't take an allocation failure. 
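+ *
+ * A sketch of the intended pattern (this is what fast_list_add() below
+ * does):
+ *
+ *	int idx = fast_list_get_idx(l);
+ *	if (idx < 0)
+ *		return idx;
+ *	*genradix_ptr_inlined(&l->items, idx) = item;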
+ *
+ * Returns: positive integer on success, -ENOMEM on failure
+ */
+int fast_list_get_idx(struct fast_list *l)
+{
+	unsigned long flags;
+	int idx;
+retry:
+	local_irq_save(flags);
+	struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer);
+
+	if (unlikely(!lp->nr)) {
+		u32 entries[16], nr = 0;
+
+		local_irq_restore(flags);
+		while (nr < ARRAY_SIZE(entries) &&
+		       (idx = fast_list_alloc_idx(l, GFP_KERNEL)))
+			entries[nr++] = idx;
+		local_irq_save(flags);
+
+		lp = this_cpu_ptr(l->buffer);
+
+		while (nr && lp->nr < ARRAY_SIZE(lp->entries))
+			lp->entries[lp->nr++] = entries[--nr];
+
+		if (unlikely(nr)) {
+			local_irq_restore(flags);
+			while (nr)
+				ida_free(&l->slots_allocated, entries[--nr]);
+			goto retry;
+		}
+
+		if (unlikely(!lp->nr)) {
+			local_irq_restore(flags);
+			return -ENOMEM;
+		}
+	}
+
+	idx = lp->entries[--lp->nr];
+	local_irq_restore(flags);
+
+	return idx;
+}
+
+/**
+ * fast_list_add - add an item to a fast_list
+ * @l: list
+ * @item: item to add
+ *
+ * Allocates a slot in the radix tree and stores to it and then returns the
+ * slot index, which must be passed to fast_list_remove().
+ *
+ * Returns: positive integer on success, -ENOMEM on failure
+ */
+int fast_list_add(struct fast_list *l, void *item)
+{
+	int idx = fast_list_get_idx(l);
+	if (idx < 0)
+		return idx;
+
+	*genradix_ptr_inlined(&l->items, idx) = item;
+	return idx;
+}
+
+/**
+ * fast_list_remove - remove an item from a fast_list
+ * @l: list
+ * @idx: item's slot index
+ *
+ * Zeroes out the slot in the radix tree and frees the slot for future
+ * fast_list_add() operations.
+ */
+void fast_list_remove(struct fast_list *l, unsigned idx)
+{
+	u32 entries[16], nr = 0;
+	unsigned long flags;
+
+	if (!idx)
+		return;
+
+	*genradix_ptr_inlined(&l->items, idx) = NULL;
+
+	local_irq_save(flags);
+	struct fast_list_pcpu *lp = this_cpu_ptr(l->buffer);
+
+	if (unlikely(lp->nr == ARRAY_SIZE(lp->entries)))
+		while (nr < ARRAY_SIZE(entries))
+			entries[nr++] = lp->entries[--lp->nr];
+
+	lp->entries[lp->nr++] = idx;
+	local_irq_restore(flags);
+
+	if (unlikely(nr))
+		while (nr)
+			ida_free(&l->slots_allocated, entries[--nr]);
+}
+
+void fast_list_exit(struct fast_list *l)
+{
+	/* XXX: warn if list isn't empty */
+	free_percpu(l->buffer);
+	ida_destroy(&l->slots_allocated);
+	genradix_free(&l->items);
+}
+
+int fast_list_init(struct fast_list *l)
+{
+	genradix_init(&l->items);
+	ida_init(&l->slots_allocated);
+	l->buffer = alloc_percpu(*l->buffer);
+	if (!l->buffer)
+		return -ENOMEM;
+	return 0;
+}
diff --git a/fs/bcachefs/fast_list.h b/fs/bcachefs/fast_list.h
new file mode 100644
index 000000000000..73c9bf591fd6
--- /dev/null
+++ b/fs/bcachefs/fast_list.h
@@ -0,0 +1,41 @@
+#ifndef _LINUX_FAST_LIST_H
+#define _LINUX_FAST_LIST_H
+
+#include <linux/generic-radix-tree.h>
+#include <linux/idr.h>
+#include <linux/percpu.h>
+
+struct fast_list_pcpu;
+
+struct fast_list {
+	GENRADIX(void *)	items;
+	struct ida		slots_allocated;
+	struct fast_list_pcpu __percpu
+				*buffer;
+};
+
+static inline void *fast_list_iter_peek(struct genradix_iter *iter,
+					struct fast_list *list)
+{
+	void **p;
+	while ((p = genradix_iter_peek(iter, &list->items)) && !*p)
+		genradix_iter_advance(iter, &list->items);
+
+	return p ?
*p : NULL; +} + +#define fast_list_for_each_from(_list, _iter, _i, _start) \ + for (_iter = genradix_iter_init(&(_list)->items, _start); \ + (_i = fast_list_iter_peek(&(_iter), _list)) != NULL; \ + genradix_iter_advance(&(_iter), &(_list)->items)) + +#define fast_list_for_each(_list, _iter, _i) \ + fast_list_for_each_from(_list, _iter, _i, 0) + +int fast_list_get_idx(struct fast_list *l); +int fast_list_add(struct fast_list *l, void *item); +void fast_list_remove(struct fast_list *l, unsigned idx); +void fast_list_exit(struct fast_list *l); +int fast_list_init(struct fast_list *l); + +#endif /* _LINUX_FAST_LIST_H */ diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index ab1d5db2fa56..e3a75dcca60c 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -110,11 +110,21 @@ static int readpage_bio_extend(struct btree_trans *trans, if (!get_more) break; + unsigned sectors_remaining = sectors_this_extent - bio_sectors(bio); + + if (sectors_remaining < PAGE_SECTORS << mapping_min_folio_order(iter->mapping)) + break; + + unsigned order = ilog2(rounddown_pow_of_two(sectors_remaining) / PAGE_SECTORS); + + /* ensure proper alignment */ + order = min(order, __ffs(folio_offset|BIT(31))); + folio = xa_load(&iter->mapping->i_pages, folio_offset); if (folio && !xa_is_value(folio)) break; - folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), 0); + folio = filemap_alloc_folio(readahead_gfp_mask(iter->mapping), order); if (!folio) break; @@ -149,12 +159,10 @@ static void bchfs_read(struct btree_trans *trans, struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_buf sk; - int flags = BCH_READ_RETRY_IF_STALE| - BCH_READ_MAY_PROMOTE; + int flags = BCH_READ_retry_if_stale| + BCH_READ_may_promote; int ret = 0; - rbio->c = c; - rbio->start_time = local_clock(); rbio->subvol = inum.subvol; bch2_bkey_buf_init(&sk); @@ -175,12 +183,12 @@ static void bchfs_read(struct btree_trans *trans, if (ret) goto err; - bch2_btree_iter_set_snapshot(&iter, snapshot); + bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - bch2_btree_iter_set_pos(&iter, + bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, rbio->bio.bi_iter.bi_sector)); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) goto err; @@ -211,14 +219,29 @@ static void bchfs_read(struct btree_trans *trans, swap(rbio->bio.bi_iter.bi_size, bytes); if (rbio->bio.bi_iter.bi_size == bytes) - flags |= BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_last_fragment; bch2_bio_page_state_set(&rbio->bio, k); bch2_read_extent(trans, rbio, iter.pos, data_btree, k, offset_into_extent, flags); + /* + * Careful there's a landmine here if bch2_read_extent() ever + * starts returning transaction restarts here. + * + * We've changed rbio->bi_iter.bi_size to be "bytes we can read + * from this extent" with the swap call, and we restore it + * below. That restore needs to come before checking for + * errors. + * + * But unlike __bch2_read(), we use the rbio bvec iter, not one + * on the stack, so we can't do the restore right after the + * bch2_read_extent() call: we don't own that iterator anymore + * if BCH_READ_last_fragment is set, since we may have submitted + * that rbio instead of cloning it. 
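+	 *
+	 * (Concretely: the swap() below that restores bi_iter.bi_size only
+	 * runs on the !BCH_READ_last_fragment path, after the break check.)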
+ */ - if (flags & BCH_READ_LAST_FRAGMENT) + if (flags & BCH_READ_last_fragment) break; swap(rbio->bio.bi_iter.bi_size, bytes); @@ -232,7 +255,8 @@ err: if (ret) { struct printbuf buf = PRINTBUF; - bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9); + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter.pos.offset << 9)); prt_printf(&buf, "read error %i from btree lookup", ret); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); @@ -280,12 +304,13 @@ void bch2_readahead(struct readahead_control *ractl) struct bch_read_bio *rbio = rbio_init(bio_alloc_bioset(NULL, n, REQ_OP_READ, GFP_KERNEL, &c->bio_read), - opts); + c, + opts, + bch2_readpages_end_io); readpage_iter_advance(&readpages_iter); rbio->bio.bi_iter.bi_sector = folio_sector(folio); - rbio->bio.bi_end_io = bch2_readpages_end_io; BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); bchfs_read(trans, rbio, inode_inum(inode), @@ -323,10 +348,10 @@ int bch2_read_single_folio(struct folio *folio, struct address_space *mapping) bch2_inode_opts_get(&opts, c, &inode->ei_inode); rbio = rbio_init(bio_alloc_bioset(NULL, 1, REQ_OP_READ, GFP_KERNEL, &c->bio_read), - opts); + c, + opts, + bch2_read_single_folio_end_io); rbio->bio.bi_private = &done; - rbio->bio.bi_end_io = bch2_read_single_folio_end_io; - rbio->bio.bi_opf = REQ_OP_READ|REQ_SYNC; rbio->bio.bi_iter.bi_sector = folio_sector(folio); BUG_ON(!bio_add_folio(&rbio->bio, folio, folio_size(folio), 0)); @@ -420,7 +445,7 @@ static void bch2_writepage_io_done(struct bch_write_op *op) } } - if (io->op.flags & BCH_WRITE_WROTE_DATA_INLINE) { + if (io->op.flags & BCH_WRITE_wrote_data_inline) { bio_for_each_folio_all(fi, bio) { struct bch_folio *s; diff --git a/fs/bcachefs/fs-io-direct.c b/fs/bcachefs/fs-io-direct.c index 2089c36b5866..1f5154d9676b 100644 --- a/fs/bcachefs/fs-io-direct.c +++ b/fs/bcachefs/fs-io-direct.c @@ -3,6 +3,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "enumerated_ref.h" #include "fs.h" #include "fs-io.h" #include "fs-io-direct.h" @@ -73,6 +74,7 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) struct blk_plug plug; loff_t offset = req->ki_pos; bool sync = is_sync_kiocb(req); + bool split = false; size_t shorten; ssize_t ret; @@ -99,8 +101,6 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) GFP_KERNEL, &c->dio_read_bioset); - bio->bi_end_io = bch2_direct_IO_read_endio; - dio = container_of(bio, struct dio_read, rbio.bio); closure_init(&dio->cl, NULL); @@ -133,12 +133,13 @@ static int bch2_direct_IO_read(struct kiocb *req, struct iov_iter *iter) goto start; while (iter->count) { + split = true; + bio = bio_alloc_bioset(NULL, bio_iov_vecs_to_alloc(iter, BIO_MAX_VECS), REQ_OP_READ, GFP_KERNEL, &c->bio_read); - bio->bi_end_io = bch2_direct_IO_read_split_endio; start: bio->bi_opf = REQ_OP_READ|REQ_SYNC; bio->bi_iter.bi_sector = offset >> 9; @@ -160,7 +161,15 @@ start: if (iter->count) closure_get(&dio->cl); - bch2_read(c, rbio_init(bio, opts), inode_inum(inode)); + struct bch_read_bio *rbio = + rbio_init(bio, + c, + opts, + split + ? 
bch2_direct_IO_read_split_endio + : bch2_direct_IO_read_endio); + + bch2_read(c, rbio, inode_inum(inode)); } blk_finish_plug(&plug); @@ -393,7 +402,7 @@ static __always_inline long bch2_dio_write_done(struct dio_write *dio) ret = dio->op.error ?: ((long) dio->written << 9); bio_put(&dio->op.wbio.bio); - bch2_write_ref_put(c, BCH_WRITE_REF_dio_write); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write); /* inode->i_dio_count is our ref on inode and thus bch_fs */ inode_dio_end(&inode->v); @@ -511,8 +520,8 @@ static __always_inline long bch2_dio_write_loop(struct dio_write *dio) dio->op.devs_need_flush = &inode->ei_devs_need_flush; if (sync) - dio->op.flags |= BCH_WRITE_SYNC; - dio->op.flags |= BCH_WRITE_CHECK_ENOSPC; + dio->op.flags |= BCH_WRITE_sync; + dio->op.flags |= BCH_WRITE_check_enospc; ret = bch2_quota_reservation_add(c, inode, &dio->quota_res, bio_sectors(bio), true); @@ -598,7 +607,7 @@ ssize_t bch2_direct_write(struct kiocb *req, struct iov_iter *iter) prefetch(&inode->ei_inode); prefetch((void *) &inode->ei_inode + 64); - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_dio_write)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_dio_write)) return -EROFS; inode_lock(&inode->v); @@ -667,7 +676,7 @@ err_put_bio: bio_put(bio); inode_dio_end(&inode->v); err_put_write_ref: - bch2_write_ref_put(c, BCH_WRITE_REF_dio_write); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_dio_write); goto out; } diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index e072900e6a5b..fbae9c1de746 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -605,10 +605,14 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) struct address_space *mapping = file->f_mapping; struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch2_folio_reservation res; - unsigned len; - loff_t isize; vm_fault_t ret; + loff_t file_offset = round_down(vmf->pgoff << PAGE_SHIFT, block_bytes(c)); + unsigned offset = file_offset - folio_pos(folio); + unsigned len = max(PAGE_SIZE, block_bytes(c)); + + BUG_ON(offset + len > folio_size(folio)); + bch2_folio_reservation_init(c, inode, &res); sb_start_pagefault(inode->v.i_sb); @@ -623,24 +627,24 @@ vm_fault_t bch2_page_mkwrite(struct vm_fault *vmf) bch2_pagecache_add_get(inode); folio_lock(folio); - isize = i_size_read(&inode->v); + u64 isize = i_size_read(&inode->v); - if (folio->mapping != mapping || folio_pos(folio) >= isize) { + if (folio->mapping != mapping || file_offset >= isize) { folio_unlock(folio); ret = VM_FAULT_NOPAGE; goto out; } - len = min_t(loff_t, folio_size(folio), isize - folio_pos(folio)); + len = min_t(unsigned, len, isize - file_offset); if (bch2_folio_set(c, inode_inum(inode), &folio, 1) ?: - bch2_folio_reservation_get(c, inode, folio, &res, 0, len)) { + bch2_folio_reservation_get(c, inode, folio, &res, offset, len)) { folio_unlock(folio); ret = VM_FAULT_SIGBUS; goto out; } - bch2_set_folio_dirty(c, inode, folio, &res, 0, len); + bch2_set_folio_dirty(c, inode, folio, &res, offset, len); bch2_folio_reservation_put(c, inode, &res); folio_wait_stable(folio); diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index 94bf34b9b65f..b1e9ee28fc0f 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -7,6 +7,7 @@ #include "btree_update.h" #include "buckets.h" #include "clock.h" +#include "enumerated_ref.h" #include "error.h" #include "extents.h" #include "extent_update.h" @@ -48,7 +49,8 @@ static void nocow_flush_endio(struct bio *_bio) struct nocow_flush *bio = container_of(_bio, struct nocow_flush, bio); 
closure_put(bio->cl); - percpu_ref_put(&bio->ca->io_ref); + enumerated_ref_put(&bio->ca->io_ref[WRITE], + BCH_DEV_WRITE_REF_nocow_flush); bio_put(&bio->bio); } @@ -71,7 +73,8 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { rcu_read_lock(); ca = rcu_dereference(c->devs[dev]); - if (ca && !percpu_ref_tryget(&ca->io_ref)) + if (ca && !enumerated_ref_tryget(&ca->io_ref[WRITE], + BCH_DEV_WRITE_REF_nocow_flush)) ca = NULL; rcu_read_unlock(); @@ -144,10 +147,24 @@ int __must_check bch2_write_inode_size(struct bch_fs *c, void __bch2_i_sectors_acct(struct bch_fs *c, struct bch_inode_info *inode, struct quota_res *quota_res, s64 sectors) { - bch2_fs_inconsistent_on((s64) inode->v.i_blocks + sectors < 0, c, - "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", - inode->v.i_ino, (u64) inode->v.i_blocks, sectors, - inode->ei_inode.bi_sectors); + if (unlikely((s64) inode->v.i_blocks + sectors < 0)) { + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "inode %lu i_blocks underflow: %llu + %lli < 0 (ondisk %lli)", + inode->v.i_ino, (u64) inode->v.i_blocks, sectors, + inode->ei_inode.bi_sectors); + + bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_underflow, &buf); + if (print) + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + + if (sectors < 0) + sectors = -inode->v.i_blocks; + else + sectors = 0; + } + inode->v.i_blocks += sectors; #ifdef CONFIG_BCACHEFS_QUOTA @@ -205,7 +222,7 @@ static int bch2_flush_inode(struct bch_fs *c, if (c->opts.journal_flush_disabled) return 0; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fsync)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fsync)) return -EROFS; u64 seq; @@ -213,7 +230,7 @@ static int bch2_flush_inode(struct bch_fs *c, bch2_get_inode_journal_seq_trans(trans, inode_inum(inode), &seq)) ?: bch2_journal_flush_seq(&c->journal, seq, TASK_INTERRUPTIBLE) ?: bch2_inode_flush_nocow_writes(c, inode); - bch2_write_ref_put(c, BCH_WRITE_REF_fsync); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_fsync); return ret; } @@ -466,6 +483,7 @@ int bchfs_truncate(struct mnt_idmap *idmap, ret = bch2_truncate_folio(inode, iattr->ia_size); if (unlikely(ret < 0)) goto err; + ret = 0; truncate_setsize(&inode->v, iattr->ia_size); @@ -501,11 +519,20 @@ int bchfs_truncate(struct mnt_idmap *idmap, goto err; } - bch2_fs_inconsistent_on(!inode->v.i_size && inode->v.i_blocks && - !bch2_journal_error(&c->journal), c, - "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", - inode->v.i_ino, (u64) inode->v.i_blocks, - inode->ei_inode.bi_sectors); + if (unlikely(!inode->v.i_size && inode->v.i_blocks && + !bch2_journal_error(&c->journal))) { + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + prt_printf(&buf, + "inode %lu truncated to 0 but i_blocks %llu (ondisk %lli)", + inode->v.i_ino, (u64) inode->v.i_blocks, + inode->ei_inode.bi_sectors); + + bool print = bch2_count_fsck_err(c, vfs_inode_i_blocks_not_zero_at_truncate, &buf); + if (print) + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + } ret = bch2_setattr_nonsize(idmap, inode, iattr); err: @@ -635,9 +662,9 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, if (ret) goto bkey_err; - bch2_btree_iter_set_snapshot(&iter, snapshot); + bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); if ((ret = bkey_err(k))) goto bkey_err; @@ -648,13 +675,13 @@ static 
noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, /* already reserved */ if (bkey_extent_is_reservation(k) && bch2_bkey_nr_ptrs_fully_allocated(k) >= opts.data_replicas) { - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); continue; } if (bkey_extent_is_data(k.k) && !(mode & FALLOC_FL_ZERO_RANGE)) { - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); continue; } @@ -675,7 +702,7 @@ static noinline int __bchfs_fallocate(struct bch_inode_info *inode, int mode, if (ret) goto bkey_err; } - bch2_btree_iter_set_pos(&iter, POS(iter.pos.inode, hole_start)); + bch2_btree_iter_set_pos(trans, &iter, POS(iter.pos.inode, hole_start)); if (ret) goto bkey_err; @@ -794,7 +821,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, struct bch_fs *c = inode->v.i_sb->s_fs_info; long ret; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_fallocate)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_fallocate)) return -EROFS; inode_lock(&inode->v); @@ -818,7 +845,7 @@ long bch2_fallocate_dispatch(struct file *file, int mode, err: bch2_pagecache_block_put(inode); inode_unlock(&inode->v); - bch2_write_ref_put(c, BCH_WRITE_REF_fallocate); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_fallocate); return bch2_err_class(ret); } @@ -998,17 +1025,28 @@ static loff_t bch2_seek_hole(struct file *file, u64 offset) POS(inode->v.i_ino, offset >> 9), POS(inode->v.i_ino, U64_MAX), inum.subvol, BTREE_ITER_slots, k, ({ - if (k.k->p.inode != inode->v.i_ino) { - next_hole = bch2_seek_pagecache_hole(&inode->v, - offset, MAX_LFS_FILESIZE, 0, false); - break; - } else if (!bkey_extent_is_data(k.k)) { - next_hole = bch2_seek_pagecache_hole(&inode->v, - max(offset, bkey_start_offset(k.k) << 9), - k.k->p.offset << 9, 0, false); - - if (next_hole < k.k->p.offset << 9) + if (k.k->p.inode != inode->v.i_ino || + !bkey_extent_is_data(k.k)) { + loff_t start_offset = k.k->p.inode == inode->v.i_ino + ? max(offset, bkey_start_offset(k.k) << 9) + : offset; + loff_t end_offset = k.k->p.inode == inode->v.i_ino + ? MAX_LFS_FILESIZE + : k.k->p.offset << 9; + + /* + * Found a hole in the btree, now make sure it's + * a hole in the pagecache. 
We might have to + * keep searching if this hole is entirely dirty + * in the page cache: + */ + bch2_trans_unlock(trans); + loff_t pagecache_hole = bch2_seek_pagecache_hole(&inode->v, + start_offset, end_offset, 0, false); + if (pagecache_hole < end_offset) { + next_hole = pagecache_hole; break; + } } else { offset = max(offset, bkey_start_offset(k.k) << 9); } diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 15725b4ce393..05361a793206 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -5,8 +5,8 @@ #include "chardev.h" #include "dirent.h" #include "fs.h" -#include "fs-common.h" #include "fs-ioctl.h" +#include "namei.h" #include "quota.h" #include <linux/compat.h> @@ -21,180 +21,6 @@ #define FSOP_GOING_FLAGS_LOGFLUSH 0x1 /* flush log but not data */ #define FSOP_GOING_FLAGS_NOLOGFLUSH 0x2 /* don't flush log nor data */ -struct flags_set { - unsigned mask; - unsigned flags; - - unsigned projid; - - bool set_projinherit; - bool projinherit; -}; - -static int bch2_inode_flags_set(struct btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct bch_fs *c = inode->v.i_sb->s_fs_info; - /* - * We're relying on btree locking here for exclusion with other ioctl - * calls - use the flags in the btree (@bi), not inode->i_flags: - */ - struct flags_set *s = p; - unsigned newflags = s->flags; - unsigned oldflags = bi->bi_flags & s->mask; - - if (((newflags ^ oldflags) & (BCH_INODE_append|BCH_INODE_immutable)) && - !capable(CAP_LINUX_IMMUTABLE)) - return -EPERM; - - if (!S_ISREG(bi->bi_mode) && - !S_ISDIR(bi->bi_mode) && - (newflags & (BCH_INODE_nodump|BCH_INODE_noatime)) != newflags) - return -EINVAL; - - if (s->set_projinherit) { - bi->bi_fields_set &= ~(1 << Inode_opt_project); - bi->bi_fields_set |= ((int) s->projinherit << Inode_opt_project); - } - - bi->bi_flags &= ~s->mask; - bi->bi_flags |= newflags; - - bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); - return 0; -} - -static int bch2_ioc_getflags(struct bch_inode_info *inode, int __user *arg) -{ - unsigned flags = map_flags(bch_flags_to_uflags, inode->ei_inode.bi_flags); - - return put_user(flags, arg); -} - -static int bch2_ioc_setflags(struct bch_fs *c, - struct file *file, - struct bch_inode_info *inode, - void __user *arg) -{ - struct flags_set s = { .mask = map_defined(bch_flags_to_uflags) }; - unsigned uflags; - int ret; - - if (get_user(uflags, (int __user *) arg)) - return -EFAULT; - - s.flags = map_flags_rev(bch_flags_to_uflags, uflags); - if (uflags) - return -EOPNOTSUPP; - - ret = mnt_want_write_file(file); - if (ret) - return ret; - - inode_lock(&inode->v); - if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) { - ret = -EACCES; - goto setflags_out; - } - - mutex_lock(&inode->ei_update_lock); - ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: - bch2_write_inode(c, inode, bch2_inode_flags_set, &s, - ATTR_CTIME); - mutex_unlock(&inode->ei_update_lock); - -setflags_out: - inode_unlock(&inode->v); - mnt_drop_write_file(file); - return ret; -} - -static int bch2_ioc_fsgetxattr(struct bch_inode_info *inode, - struct fsxattr __user *arg) -{ - struct fsxattr fa = { 0 }; - - fa.fsx_xflags = map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags); - - if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project)) - fa.fsx_xflags |= FS_XFLAG_PROJINHERIT; - - fa.fsx_projid = inode->ei_qid.q[QTYP_PRJ]; - - if (copy_to_user(arg, &fa, sizeof(fa))) - return -EFAULT; - - return 0; -} - -static int fssetxattr_inode_update_fn(struct 
btree_trans *trans, - struct bch_inode_info *inode, - struct bch_inode_unpacked *bi, - void *p) -{ - struct flags_set *s = p; - - if (s->projid != bi->bi_project) { - bi->bi_fields_set |= 1U << Inode_opt_project; - bi->bi_project = s->projid; - } - - return bch2_inode_flags_set(trans, inode, bi, p); -} - -static int bch2_ioc_fssetxattr(struct bch_fs *c, - struct file *file, - struct bch_inode_info *inode, - struct fsxattr __user *arg) -{ - struct flags_set s = { .mask = map_defined(bch_flags_to_xflags) }; - struct fsxattr fa; - int ret; - - if (copy_from_user(&fa, arg, sizeof(fa))) - return -EFAULT; - - s.set_projinherit = true; - s.projinherit = (fa.fsx_xflags & FS_XFLAG_PROJINHERIT) != 0; - fa.fsx_xflags &= ~FS_XFLAG_PROJINHERIT; - - s.flags = map_flags_rev(bch_flags_to_xflags, fa.fsx_xflags); - if (fa.fsx_xflags) - return -EOPNOTSUPP; - - if (fa.fsx_projid >= U32_MAX) - return -EINVAL; - - /* - * inode fields accessible via the xattr interface are stored with a +1 - * bias, so that 0 means unset: - */ - s.projid = fa.fsx_projid + 1; - - ret = mnt_want_write_file(file); - if (ret) - return ret; - - inode_lock(&inode->v); - if (!inode_owner_or_capable(file_mnt_idmap(file), &inode->v)) { - ret = -EACCES; - goto err; - } - - mutex_lock(&inode->ei_update_lock); - ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: - bch2_set_projid(c, inode, fa.fsx_projid) ?: - bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, - ATTR_CTIME); - mutex_unlock(&inode->ei_update_lock); -err: - inode_unlock(&inode->v); - mnt_drop_write_file(file); - return ret; -} - static int bch2_reinherit_attrs_fn(struct btree_trans *trans, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, @@ -218,7 +44,7 @@ static int bch2_ioc_reinherit_attrs(struct bch_fs *c, int ret = 0; subvol_inum inum; - kname = kmalloc(BCH_NAME_MAX + 1, GFP_KERNEL); + kname = kmalloc(BCH_NAME_MAX, GFP_KERNEL); if (!kname) return -ENOMEM; @@ -346,7 +172,10 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) if (get_user(flags, arg)) return -EFAULT; - bch_notice(c, "shutdown by ioctl type %u", flags); + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "shutdown by ioctl type %u", flags); switch (flags) { case FSOP_GOING_FLAGS_DEFAULT: @@ -354,20 +183,23 @@ static int bch2_ioc_goingdown(struct bch_fs *c, u32 __user *arg) if (ret) break; bch2_journal_flush(&c->journal); - bch2_fs_emergency_read_only(c); + bch2_fs_emergency_read_only2(c, &buf); bdev_thaw(c->vfs_sb->s_bdev); break; case FSOP_GOING_FLAGS_LOGFLUSH: bch2_journal_flush(&c->journal); fallthrough; case FSOP_GOING_FLAGS_NOLOGFLUSH: - bch2_fs_emergency_read_only(c); + bch2_fs_emergency_read_only2(c, &buf); break; default: ret = -EINVAL; - break; + goto noprint; } + bch2_print_str(c, KERN_ERR, buf.buf); +noprint: + printbuf_exit(&buf); return ret; } @@ -511,14 +343,12 @@ static long bch2_ioctl_subvolume_destroy(struct bch_fs *c, struct file *filp, ret = -EXDEV; goto err; } - if (!d_is_positive(victim)) { - ret = -ENOENT; - goto err; - } - ret = __bch2_unlink(dir, victim, true); + + ret = inode_permission(file_mnt_idmap(filp), d_inode(victim), MAY_WRITE) ?: + __bch2_unlink(dir, victim, true); if (!ret) { fsnotify_rmdir(dir, victim); - d_delete(victim); + d_invalidate(victim); } err: inode_unlock(dir); @@ -534,23 +364,6 @@ long bch2_fs_file_ioctl(struct file *file, unsigned cmd, unsigned long arg) long ret; switch (cmd) { - case FS_IOC_GETFLAGS: - ret = bch2_ioc_getflags(inode, (int __user *) arg); - break; - - case FS_IOC_SETFLAGS: 
- ret = bch2_ioc_setflags(c, file, inode, (int __user *) arg); - break; - - case FS_IOC_FSGETXATTR: - ret = bch2_ioc_fsgetxattr(inode, (void __user *) arg); - break; - - case FS_IOC_FSSETXATTR: - ret = bch2_ioc_fssetxattr(c, file, inode, - (void __user *) arg); - break; - case BCHFS_IOC_REINHERIT_ATTRS: ret = bch2_ioc_reinherit_attrs(c, file, inode, (void __user *) arg); diff --git a/fs/bcachefs/fs-ioctl.h b/fs/bcachefs/fs-ioctl.h index d30f9bb056fd..a657e4994b71 100644 --- a/fs/bcachefs/fs-ioctl.h +++ b/fs/bcachefs/fs-ioctl.h @@ -2,79 +2,6 @@ #ifndef _BCACHEFS_FS_IOCTL_H #define _BCACHEFS_FS_IOCTL_H -/* Inode flags: */ - -/* bcachefs inode flags -> vfs inode flags: */ -static const __maybe_unused unsigned bch_flags_to_vfs[] = { - [__BCH_INODE_sync] = S_SYNC, - [__BCH_INODE_immutable] = S_IMMUTABLE, - [__BCH_INODE_append] = S_APPEND, - [__BCH_INODE_noatime] = S_NOATIME, -}; - -/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ -static const __maybe_unused unsigned bch_flags_to_uflags[] = { - [__BCH_INODE_sync] = FS_SYNC_FL, - [__BCH_INODE_immutable] = FS_IMMUTABLE_FL, - [__BCH_INODE_append] = FS_APPEND_FL, - [__BCH_INODE_nodump] = FS_NODUMP_FL, - [__BCH_INODE_noatime] = FS_NOATIME_FL, -}; - -/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ -static const __maybe_unused unsigned bch_flags_to_xflags[] = { - [__BCH_INODE_sync] = FS_XFLAG_SYNC, - [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE, - [__BCH_INODE_append] = FS_XFLAG_APPEND, - [__BCH_INODE_nodump] = FS_XFLAG_NODUMP, - [__BCH_INODE_noatime] = FS_XFLAG_NOATIME, - //[__BCH_INODE_PROJINHERIT] = FS_XFLAG_PROJINHERIT; -}; - -#define set_flags(_map, _in, _out) \ -do { \ - unsigned _i; \ - \ - for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ - if ((_in) & (1 << _i)) \ - (_out) |= _map[_i]; \ - else \ - (_out) &= ~_map[_i]; \ -} while (0) - -#define map_flags(_map, _in) \ -({ \ - unsigned _out = 0; \ - \ - set_flags(_map, _in, _out); \ - _out; \ -}) - -#define map_flags_rev(_map, _in) \ -({ \ - unsigned _i, _out = 0; \ - \ - for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ - if ((_in) & _map[_i]) { \ - (_out) |= 1 << _i; \ - (_in) &= ~_map[_i]; \ - } \ - (_out); \ -}) - -#define map_defined(_map) \ -({ \ - unsigned _in = ~0; \ - \ - map_flags_rev(_map, _in); \ -}) - -/* Set VFS inode flags from bcachefs inode: */ -static inline void bch2_inode_flags_to_vfs(struct bch_inode_info *inode) -{ - set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); -} - long bch2_fs_file_ioctl(struct file *, unsigned, unsigned long); long bch2_compat_fs_ioctl(struct file *, unsigned, unsigned long); diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index 90ade8f648d9..ddfe89d84966 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -11,7 +11,6 @@ #include "errcode.h" #include "extents.h" #include "fs.h" -#include "fs-common.h" #include "fs-io.h" #include "fs-ioctl.h" #include "fs-io-buffered.h" @@ -22,6 +21,7 @@ #include "io_read.h" #include "journal.h" #include "keylist.h" +#include "namei.h" #include "quota.h" #include "rebalance.h" #include "snapshot.h" @@ -33,6 +33,7 @@ #include <linux/backing-dev.h> #include <linux/exportfs.h> #include <linux/fiemap.h> +#include <linux/fileattr.h> #include <linux/fs_context.h> #include <linux/module.h> #include <linux/pagemap.h> @@ -51,6 +52,24 @@ static void bch2_vfs_inode_init(struct btree_trans *, subvol_inum, struct bch_inode_unpacked *, struct bch_subvolume *); +/* Set VFS inode flags from bcachefs inode: */ +static inline void bch2_inode_flags_to_vfs(struct bch_fs *c, struct bch_inode_info *inode) +{ + static const 
__maybe_unused unsigned bch_flags_to_vfs[] = { + [__BCH_INODE_sync] = S_SYNC, + [__BCH_INODE_immutable] = S_IMMUTABLE, + [__BCH_INODE_append] = S_APPEND, + [__BCH_INODE_noatime] = S_NOATIME, + }; + + set_flags(bch_flags_to_vfs, inode->ei_inode.bi_flags, inode->v.i_flags); + + if (bch2_inode_casefold(c, &inode->ei_inode)) + inode->v.i_flags |= S_CASEFOLD; + else + inode->v.i_flags &= ~S_CASEFOLD; +} + void bch2_inode_update_after_write(struct btree_trans *trans, struct bch_inode_info *inode, struct bch_inode_unpacked *bi, @@ -79,7 +98,7 @@ void bch2_inode_update_after_write(struct btree_trans *trans, inode->ei_inode = *bi; - bch2_inode_flags_to_vfs(inode); + bch2_inode_flags_to_vfs(c, inode); } int __must_check bch2_write_inode(struct bch_fs *c, @@ -88,7 +107,7 @@ int __must_check bch2_write_inode(struct bch_fs *c, void *p, unsigned fields) { struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; struct bch_inode_unpacked inode_u; int ret; retry: @@ -172,11 +191,6 @@ int bch2_fs_quota_transfer(struct bch_fs *c, return ret; } -static bool subvol_inum_eq(subvol_inum a, subvol_inum b) -{ - return a.subvol == b.subvol && a.inum == b.inum; -} - static u32 bch2_vfs_inode_hash_fn(const void *data, u32 len, u32 seed) { const subvol_inum *inum = data; @@ -333,9 +347,8 @@ repeat: if (!trans) { __wait_on_freeing_inode(c, inode, inum); } else { - bch2_trans_unlock(trans); - __wait_on_freeing_inode(c, inode, inum); - int ret = bch2_trans_relock(trans); + int ret = drop_locks_do(trans, + (__wait_on_freeing_inode(c, inode, inum), 0)); if (ret) return ERR_PTR(ret); } @@ -631,17 +644,24 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, const struct qstr *name) { struct bch_fs *c = trans->c; - struct btree_iter dirent_iter = {}; subvol_inum inum = {}; struct printbuf buf = PRINTBUF; + struct qstr lookup_name; + int ret = bch2_maybe_casefold(trans, dir_hash_info, name, &lookup_name); + if (ret) + return ERR_PTR(ret); + + struct btree_iter dirent_iter = {}; struct bkey_s_c k = bch2_hash_lookup(trans, &dirent_iter, bch2_dirent_hash_desc, - dir_hash_info, dir, name, 0); - int ret = bkey_err(k); + dir_hash_info, dir, &lookup_name, 0); + ret = bkey_err(k); if (ret) return ERR_PTR(ret); - ret = bch2_dirent_read_target(trans, dir, bkey_s_c_to_dirent(k), &inum); + struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + + ret = bch2_dirent_read_target(trans, dir, d, &inum); if (ret > 0) ret = -ENOENT; if (ret) @@ -651,30 +671,30 @@ static struct bch_inode_info *bch2_lookup_trans(struct btree_trans *trans, if (inode) goto out; + /* + * Note: if check/repair needs it, we commit before + * bch2_inode_hash_init_insert(), as after that point we can't take a + * restart - not in the top level loop with a commit_do(), like we + * usually do: + */ + struct bch_subvolume subvol; struct bch_inode_unpacked inode_u; ret = bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: bch2_inode_find_by_inum_nowarn_trans(trans, inum, &inode_u) ?: + bch2_check_dirent_target(trans, &dirent_iter, d, &inode_u, false) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: PTR_ERR_OR_ZERO(inode = bch2_inode_hash_init_insert(trans, inum, &inode_u, &subvol)); + /* + * don't remove it: check_inodes might find another inode that points + * back to this dirent + */ bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), - c, "dirent to missing inode:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), buf.buf)); + c, "dirent to missing inode:\n%s", + 
(bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf)); if (ret) goto err; - - /* regular files may have hardlinks: */ - if (bch2_fs_inconsistent_on(bch2_inode_should_have_single_bp(&inode_u) && - !bkey_eq(k.k->p, POS(inode_u.bi_dir, inode_u.bi_dir_offset)), - c, - "dirent points to inode that does not point back:\n %s", - (bch2_bkey_val_to_text(&buf, c, k), - prt_printf(&buf, "\n "), - bch2_inode_unpacked_to_text(&buf, &inode_u), - buf.buf))) { - ret = -ENOENT; - goto err; - } out: bch2_trans_iter_exit(trans, &dirent_iter); printbuf_exit(&buf); @@ -698,6 +718,23 @@ static struct dentry *bch2_lookup(struct inode *vdir, struct dentry *dentry, if (IS_ERR(inode)) inode = NULL; +#ifdef CONFIG_UNICODE + if (!inode && IS_CASEFOLDED(vdir)) { + /* + * Do not cache a negative dentry in casefolded directories + * as it would need to be invalidated in the following situation: + * - Lookup file "blAH" in a casefolded directory + * - Creation of file "BLAH" in a casefolded directory + * - Lookup file "blAH" in a casefolded directory + * which would fail if we had a negative dentry. + * + * We should come back to this when VFS has a method to handle + * this edgecase. + */ + return NULL; + } +#endif + return d_splice_alias(&inode->v, dentry); } @@ -806,6 +843,9 @@ int __bch2_unlink(struct inode *vdir, struct dentry *dentry, */ set_nlink(&inode->v, 0); } + + if (IS_CASEFOLDED(vdir)) + d_invalidate(dentry); err: bch2_trans_put(trans); bch2_unlock_inodes(INODE_UPDATE_LOCK, dir, inode); @@ -858,10 +898,10 @@ err: return bch2_err_class(ret); } -static int bch2_mkdir(struct mnt_idmap *idmap, - struct inode *vdir, struct dentry *dentry, umode_t mode) +static struct dentry *bch2_mkdir(struct mnt_idmap *idmap, + struct inode *vdir, struct dentry *dentry, umode_t mode) { - return bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0); + return ERR_PTR(bch2_mknod(idmap, vdir, dentry, mode|S_IFDIR, 0)); } static int bch2_rename2(struct mnt_idmap *idmap, @@ -1056,7 +1096,7 @@ int bch2_setattr_nonsize(struct mnt_idmap *idmap, struct bch_fs *c = inode->v.i_sb->s_fs_info; struct bch_qid qid; struct btree_trans *trans; - struct btree_iter inode_iter = { NULL }; + struct btree_iter inode_iter = {}; struct bch_inode_unpacked inode_u; struct posix_acl *acl = NULL; kuid_t kuid; @@ -1216,10 +1256,20 @@ static int bch2_tmpfile(struct mnt_idmap *idmap, return finish_open_simple(file, 0); } +struct bch_fiemap_extent { + struct bkey_buf kbuf; + unsigned flags; +}; + static int bch2_fill_extent(struct bch_fs *c, struct fiemap_extent_info *info, - struct bkey_s_c k, unsigned flags) + struct bch_fiemap_extent *fe) { + struct bkey_s_c k = bkey_i_to_s_c(fe->kbuf.k); + unsigned flags = fe->flags; + + BUG_ON(!k.k->size); + if (bkey_extent_is_direct_data(k.k)) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; @@ -1272,110 +1322,225 @@ static int bch2_fill_extent(struct bch_fs *c, } } -static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, - u64 start, u64 len) +/* + * Scan a range of an inode for data in pagecache. + * + * Intended to be retryable, so don't modify the output params until success is + * imminent. 
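+ *
+ * (It brackets a run of cached data: seek data from *start, then seek the
+ * next hole from that point; *start and *end are written back only once
+ * both seeks have succeeded.)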
+ */ +static int +bch2_fiemap_hole_pagecache(struct inode *vinode, u64 *start, u64 *end, + bool nonblock) { - struct bch_fs *c = vinode->i_sb->s_fs_info; - struct bch_inode_info *ei = to_bch_ei(vinode); - struct btree_trans *trans; - struct btree_iter iter; - struct bkey_s_c k; - struct bkey_buf cur, prev; - bool have_extent = false; - int ret = 0; + loff_t dstart, dend; - ret = fiemap_prep(&ei->v, info, start, &len, FIEMAP_FLAG_SYNC); - if (ret) + dstart = bch2_seek_pagecache_data(vinode, *start, *end, 0, nonblock); + if (dstart < 0) + return dstart; + + if (dstart == *end) { + *start = dstart; + return 0; + } + + dend = bch2_seek_pagecache_hole(vinode, dstart, *end, 0, nonblock); + if (dend < 0) + return dend; + + /* race */ + BUG_ON(dstart == dend); + + *start = dstart; + *end = dend; + return 0; +} + +/* + * Scan a range of pagecache that corresponds to a file mapping hole in the + * extent btree. If data is found, fake up an extent key so it looks like a + * delalloc extent to the rest of the fiemap processing code. + */ +static int +bch2_next_fiemap_pagecache_extent(struct btree_trans *trans, struct bch_inode_info *inode, + u64 start, u64 end, struct bch_fiemap_extent *cur) +{ + struct bch_fs *c = trans->c; + struct bkey_i_extent *delextent; + struct bch_extent_ptr ptr = {}; + loff_t dstart = start << 9, dend = end << 9; + int ret; + + /* + * We hold btree locks here so we cannot block on folio locks without + * dropping trans locks first. Run a nonblocking scan for the common + * case of no folios over holes and fall back on failure. + * + * Note that dropping locks like this is technically racy against + * writeback inserting to the extent tree, but a non-sync fiemap scan is + * fundamentally racy with writeback anyways. Therefore, just report the + * range as delalloc regardless of whether we have to cycle trans locks. + */ + ret = bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, true); + if (ret == -EAGAIN) + ret = drop_locks_do(trans, + bch2_fiemap_hole_pagecache(&inode->v, &dstart, &dend, false)); + if (ret < 0) return ret; - struct bpos end = POS(ei->v.i_ino, (start + len) >> 9); - if (start + len < start) - return -EINVAL; + /* + * Create a fake extent key in the buffer. We have to add a dummy extent + * pointer for the fill code to add an extent entry. It's explicitly + * zeroed to reflect delayed allocation (i.e. phys offset 0). 
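+	 *
+	 * (The fake key spans [dstart, dend) from the pagecache scan above,
+	 * in 512-byte sectors, and is reported as FIEMAP_EXTENT_DELALLOC.)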
+ */ + bch2_bkey_buf_realloc(&cur->kbuf, c, sizeof(*delextent) / sizeof(u64)); + delextent = bkey_extent_init(cur->kbuf.k); + delextent->k.p = POS(inode->ei_inum.inum, dend >> 9); + delextent->k.size = (dend - dstart) >> 9; + bch2_bkey_append_ptr(&delextent->k_i, ptr); - start >>= 9; + cur->flags = FIEMAP_EXTENT_DELALLOC; - bch2_bkey_buf_init(&cur); - bch2_bkey_buf_init(&prev); - trans = bch2_trans_get(c); + return 0; +} + +static int bch2_next_fiemap_extent(struct btree_trans *trans, + struct bch_inode_info *inode, + u64 start, u64 end, + struct bch_fiemap_extent *cur) +{ + u32 snapshot; + int ret = bch2_subvolume_get_snapshot(trans, inode->ei_inum.subvol, &snapshot); + if (ret) + return ret; + struct btree_iter iter; bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, - POS(ei->v.i_ino, start), 0); + SPOS(inode->ei_inum.inum, start, snapshot), 0); - while (!ret || bch2_err_matches(ret, BCH_ERR_transaction_restart)) { - enum btree_id data_btree = BTREE_ID_extents; + struct bkey_s_c k = + bch2_btree_iter_peek_max(trans, &iter, POS(inode->ei_inum.inum, end)); + ret = bkey_err(k); + if (ret) + goto err; - bch2_trans_begin(trans); + u64 pagecache_end = k.k ? max(start, bkey_start_offset(k.k)) : end; - u32 snapshot; - ret = bch2_subvolume_get_snapshot(trans, ei->ei_inum.subvol, &snapshot); - if (ret) - continue; + ret = bch2_next_fiemap_pagecache_extent(trans, inode, start, pagecache_end, cur); + if (ret) + goto err; - bch2_btree_iter_set_snapshot(&iter, snapshot); + struct bpos pagecache_start = bkey_start_pos(&cur->kbuf.k->k); - k = bch2_btree_iter_peek_max(&iter, end); - ret = bkey_err(k); + /* + * Does the pagecache or the btree take precedence? + * + * It _should_ be the pagecache, so that we correctly report delalloc + * extents when dirty in the pagecache (we're COW, after all). + * + * But we'd have to add per-sector writeback tracking to + * bch_folio_state, otherwise we report delalloc extents for clean + * cached data in the pagecache. + * + * We should do this, but even then fiemap won't report stable mappings: + * on bcachefs data moves around in the background (copygc, rebalance) + * and we don't provide a way for userspace to lock that out. 
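+	 *
+	 * (So for now the btree takes precedence: an extent that starts at or
+	 * before the pagecache extent replaces it entirely; otherwise the
+	 * pagecache extent is cut back to end where the btree extent begins.)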
+ */ + if (k.k && + bkey_le(bpos_max(iter.pos, bkey_start_pos(k.k)), + pagecache_start)) { + bch2_bkey_buf_reassemble(&cur->kbuf, trans->c, k); + bch2_cut_front(iter.pos, cur->kbuf.k); + bch2_cut_back(POS(inode->ei_inum.inum, end), cur->kbuf.k); + cur->flags = 0; + } else if (k.k) { + bch2_cut_back(bkey_start_pos(k.k), cur->kbuf.k); + } + + if (cur->kbuf.k->k.type == KEY_TYPE_reflink_p) { + unsigned sectors = cur->kbuf.k->k.size; + s64 offset_into_extent = 0; + enum btree_id data_btree = BTREE_ID_extents; + ret = bch2_read_indirect_extent(trans, &data_btree, &offset_into_extent, + &cur->kbuf); if (ret) - continue; + goto err; - if (!k.k) - break; + struct bkey_i *k = cur->kbuf.k; + sectors = min_t(unsigned, sectors, k->k.size - offset_into_extent); - if (!bkey_extent_is_data(k.k) && - k.k->type != KEY_TYPE_reservation) { - bch2_btree_iter_advance(&iter); - continue; - } + bch2_cut_front(POS(k->k.p.inode, + bkey_start_offset(&k->k) + offset_into_extent), + k); + bch2_key_resize(&k->k, sectors); + k->k.p = iter.pos; + k->k.p.offset += k->k.size; + } +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} - s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); - unsigned sectors = k.k->size - offset_into_extent; +static int bch2_fiemap(struct inode *vinode, struct fiemap_extent_info *info, + u64 start, u64 len) +{ + struct bch_fs *c = vinode->i_sb->s_fs_info; + struct bch_inode_info *ei = to_bch_ei(vinode); + struct btree_trans *trans; + struct bch_fiemap_extent cur, prev; + int ret = 0; - bch2_bkey_buf_reassemble(&cur, c, k); + ret = fiemap_prep(&ei->v, info, start, &len, 0); + if (ret) + return ret; + + if (start + len < start) + return -EINVAL; - ret = bch2_read_indirect_extent(trans, &data_btree, - &offset_into_extent, &cur); + start >>= 9; + u64 end = (start + len) >> 9; + + bch2_bkey_buf_init(&cur.kbuf); + bch2_bkey_buf_init(&prev.kbuf); + bkey_init(&prev.kbuf.k->k); + + trans = bch2_trans_get(c); + + while (start < end) { + ret = lockrestart_do(trans, + bch2_next_fiemap_extent(trans, ei, start, end, &cur)); if (ret) - continue; + goto err; - k = bkey_i_to_s_c(cur.k); - bch2_bkey_buf_realloc(&prev, c, k.k->u64s); + BUG_ON(bkey_start_offset(&cur.kbuf.k->k) < start); + BUG_ON(cur.kbuf.k->k.p.offset > end); - sectors = min_t(unsigned, sectors, k.k->size - offset_into_extent); + if (bkey_start_offset(&cur.kbuf.k->k) == end) + break; - bch2_cut_front(POS(k.k->p.inode, - bkey_start_offset(k.k) + - offset_into_extent), - cur.k); - bch2_key_resize(&cur.k->k, sectors); - cur.k->k.p = iter.pos; - cur.k->k.p.offset += cur.k->k.size; + start = cur.kbuf.k->k.p.offset; - if (have_extent) { + if (!bkey_deleted(&prev.kbuf.k->k)) { bch2_trans_unlock(trans); - ret = bch2_fill_extent(c, info, - bkey_i_to_s_c(prev.k), 0); + ret = bch2_fill_extent(c, info, &prev); if (ret) - break; + goto err; } - bkey_copy(prev.k, cur.k); - have_extent = true; - - bch2_btree_iter_set_pos(&iter, - POS(iter.pos.inode, iter.pos.offset + sectors)); + bch2_bkey_buf_copy(&prev.kbuf, c, cur.kbuf.k); + prev.flags = cur.flags; } - bch2_trans_iter_exit(trans, &iter); - if (!ret && have_extent) { + if (!bkey_deleted(&prev.kbuf.k->k)) { bch2_trans_unlock(trans); - ret = bch2_fill_extent(c, info, bkey_i_to_s_c(prev.k), - FIEMAP_EXTENT_LAST); + prev.flags |= FIEMAP_EXTENT_LAST; + ret = bch2_fill_extent(c, info, &prev); } - +err: bch2_trans_put(trans); - bch2_bkey_buf_exit(&cur, c); - bch2_bkey_buf_exit(&prev, c); - return ret < 0 ? 
ret : 0; + bch2_bkey_buf_exit(&cur.kbuf, c); + bch2_bkey_buf_exit(&prev.kbuf, c); + + return bch2_err_class(ret < 0 ? ret : 0); } static const struct vm_operations_struct bch_vm_ops = { @@ -1430,6 +1595,141 @@ static int bch2_open(struct inode *vinode, struct file *file) return generic_file_open(vinode, file); } +/* bcachefs inode flags -> FS_IOC_GETFLAGS: */ +static const __maybe_unused unsigned bch_flags_to_uflags[] = { + [__BCH_INODE_sync] = FS_SYNC_FL, + [__BCH_INODE_immutable] = FS_IMMUTABLE_FL, + [__BCH_INODE_append] = FS_APPEND_FL, + [__BCH_INODE_nodump] = FS_NODUMP_FL, + [__BCH_INODE_noatime] = FS_NOATIME_FL, +}; + +/* bcachefs inode flags -> FS_IOC_FSGETXATTR: */ +static const __maybe_unused unsigned bch_flags_to_xflags[] = { + [__BCH_INODE_sync] = FS_XFLAG_SYNC, + [__BCH_INODE_immutable] = FS_XFLAG_IMMUTABLE, + [__BCH_INODE_append] = FS_XFLAG_APPEND, + [__BCH_INODE_nodump] = FS_XFLAG_NODUMP, + [__BCH_INODE_noatime] = FS_XFLAG_NOATIME, +}; + +static int bch2_fileattr_get(struct dentry *dentry, + struct fileattr *fa) +{ + struct bch_inode_info *inode = to_bch_ei(d_inode(dentry)); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + + fileattr_fill_xflags(fa, map_flags(bch_flags_to_xflags, inode->ei_inode.bi_flags)); + + if (inode->ei_inode.bi_fields_set & (1 << Inode_opt_project)) + fa->fsx_xflags |= FS_XFLAG_PROJINHERIT; + + if (bch2_inode_casefold(c, &inode->ei_inode)) + fa->flags |= FS_CASEFOLD_FL; + + fa->fsx_projid = inode->ei_qid.q[QTYP_PRJ]; + return 0; +} + +struct flags_set { + unsigned mask; + unsigned flags; + unsigned projid; + bool set_project; + bool set_casefold; + bool casefold; +}; + +static int fssetxattr_inode_update_fn(struct btree_trans *trans, + struct bch_inode_info *inode, + struct bch_inode_unpacked *bi, + void *p) +{ + struct bch_fs *c = trans->c; + struct flags_set *s = p; + + /* + * We're relying on btree locking here for exclusion with other ioctl + * calls - use the flags in the btree (@bi), not inode->i_flags: + */ + if (!S_ISREG(bi->bi_mode) && + !S_ISDIR(bi->bi_mode) && + (s->flags & (BCH_INODE_nodump|BCH_INODE_noatime)) != s->flags) + return -EINVAL; + + if (s->casefold != bch2_inode_casefold(c, bi)) { + int ret = bch2_inode_set_casefold(trans, inode_inum(inode), bi, s->casefold); + if (ret) + return ret; + } + + if (s->set_project) { + bi->bi_project = s->projid; + bi->bi_fields_set |= BIT(Inode_opt_project); + } + + bi->bi_flags &= ~s->mask; + bi->bi_flags |= s->flags; + + bi->bi_ctime = timespec_to_bch2_time(c, current_time(&inode->v)); + return 0; +} + +static int bch2_fileattr_set(struct mnt_idmap *idmap, + struct dentry *dentry, + struct fileattr *fa) +{ + struct bch_inode_info *inode = to_bch_ei(d_inode(dentry)); + struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct flags_set s = {}; + int ret; + + if (fa->fsx_valid) { + fa->fsx_xflags &= ~FS_XFLAG_PROJINHERIT; + + s.mask = map_defined(bch_flags_to_xflags); + s.flags |= map_flags_rev(bch_flags_to_xflags, fa->fsx_xflags); + if (fa->fsx_xflags) + return -EOPNOTSUPP; + + if (fa->fsx_projid >= U32_MAX) + return -EINVAL; + + /* + * inode fields accessible via the xattr interface are stored with a +1 + * bias, so that 0 means unset: + */ + if ((inode->ei_inode.bi_project || + fa->fsx_projid) && + inode->ei_inode.bi_project != fa->fsx_projid + 1) { + s.projid = fa->fsx_projid + 1; + s.set_project = true; + } + } + + if (fa->flags_valid) { + s.mask = map_defined(bch_flags_to_uflags); + + s.set_casefold = true; + s.casefold = (fa->flags & FS_CASEFOLD_FL) != 0; + fa->flags &= ~FS_CASEFOLD_FL; + + 
s.flags |= map_flags_rev(bch_flags_to_uflags, fa->flags); + if (fa->flags) + return -EOPNOTSUPP; + } + + mutex_lock(&inode->ei_update_lock); + ret = bch2_subvol_is_ro(c, inode->ei_inum.subvol) ?: + (s.set_project + ? bch2_set_projid(c, inode, fa->fsx_projid) + : 0) ?: + bch2_write_inode(c, inode, fssetxattr_inode_update_fn, &s, + ATTR_CTIME); + mutex_unlock(&inode->ei_update_lock); + return ret; +} + static const struct file_operations bch_file_operations = { .open = bch2_open, .llseek = bch2_llseek, @@ -1457,6 +1757,8 @@ static const struct inode_operations bch_file_inode_operations = { .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif + .fileattr_get = bch2_fileattr_get, + .fileattr_set = bch2_fileattr_set, }; static const struct inode_operations bch_dir_inode_operations = { @@ -1477,6 +1779,8 @@ static const struct inode_operations bch_dir_inode_operations = { .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif + .fileattr_get = bch2_fileattr_get, + .fileattr_set = bch2_fileattr_set, }; static const struct file_operations bch_dir_file_operations = { @@ -1499,6 +1803,8 @@ static const struct inode_operations bch_symlink_inode_operations = { .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif + .fileattr_get = bch2_fileattr_get, + .fileattr_set = bch2_fileattr_set, }; static const struct inode_operations bch_special_inode_operations = { @@ -1509,6 +1815,8 @@ static const struct inode_operations bch_special_inode_operations = { .get_inode_acl = bch2_get_acl, .set_acl = bch2_set_acl, #endif + .fileattr_get = bch2_fileattr_get, + .fileattr_set = bch2_fileattr_set, }; static const struct address_space_operations bch_address_space_operations = { @@ -1678,17 +1986,17 @@ retry: if (ret) goto err; - bch2_btree_iter_set_snapshot(&iter1, snapshot); - bch2_btree_iter_set_snapshot(&iter2, snapshot); + bch2_btree_iter_set_snapshot(trans, &iter1, snapshot); + bch2_btree_iter_set_snapshot(trans, &iter2, snapshot); ret = bch2_inode_find_by_inum_trans(trans, inode_inum(inode), &inode_u); if (ret) goto err; if (inode_u.bi_dir == dir->ei_inode.bi_inum) { - bch2_btree_iter_set_pos(&iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); + bch2_btree_iter_set_pos(trans, &iter1, POS(inode_u.bi_dir, inode_u.bi_dir_offset)); - k = bch2_btree_iter_peek_slot(&iter1); + k = bch2_btree_iter_peek_slot(trans, &iter1); ret = bkey_err(k); if (ret) goto err; @@ -1712,7 +2020,7 @@ retry: * File with multiple hardlinks and our backref is to the wrong * directory - linear search: */ - for_each_btree_key_continue_norestart(iter2, 0, k, ret) { + for_each_btree_key_continue_norestart(trans, iter2, 0, k, ret) { if (k.k->p.inode > dir->ei_inode.bi_inum) break; @@ -1802,7 +2110,8 @@ static void bch2_vfs_inode_init(struct btree_trans *trans, break; } - mapping_set_large_folios(inode->v.i_mapping); + mapping_set_folio_min_order(inode->v.i_mapping, + get_order(trans->c->opts.block_size)); } static void bch2_free_inode(struct inode *vinode) @@ -2008,55 +2317,19 @@ static struct bch_fs *bch2_path_to_fs(const char *path) return c ?: ERR_PTR(-ENOENT); } -static int bch2_remount(struct super_block *sb, int *flags, - struct bch_opts opts) -{ - struct bch_fs *c = sb->s_fs_info; - int ret = 0; - - opt_set(opts, read_only, (*flags & SB_RDONLY) != 0); - - if (opts.read_only != c->opts.read_only) { - down_write(&c->state_lock); - - if (opts.read_only) { - bch2_fs_read_only(c); - - sb->s_flags |= SB_RDONLY; - } else { - ret = bch2_fs_read_write(c); - if (ret) { - bch_err(c, "error going rw: %i", ret); - 
up_write(&c->state_lock); - ret = -EINVAL; - goto err; - } - - sb->s_flags &= ~SB_RDONLY; - } - - c->opts.read_only = opts.read_only; - - up_write(&c->state_lock); - } - - if (opt_defined(opts, errors)) - c->opts.errors = opts.errors; -err: - return bch2_err_class(ret); -} - static int bch2_show_devname(struct seq_file *seq, struct dentry *root) { struct bch_fs *c = root->d_sb->s_fs_info; bool first = true; - for_each_online_member(c, ca) { + rcu_read_lock(); + for_each_online_member_rcu(c, ca) { if (!first) seq_putc(seq, ':'); first = false; seq_puts(seq, ca->disk_sb.sb_name); } + rcu_read_unlock(); return 0; } @@ -2163,7 +2436,7 @@ static int bch2_fs_get_tree(struct fs_context *fc) struct inode *vinode; struct bch2_opts_parse *opts_parse = fc->fs_private; struct bch_opts opts = opts_parse->opts; - darray_str devs; + darray_const_str devs; darray_fs devs_to_fs = {}; int ret; @@ -2187,14 +2460,17 @@ static int bch2_fs_get_tree(struct fs_context *fc) if (!IS_ERR(sb)) goto got_sb; - c = bch2_fs_open(devs.data, devs.nr, opts); + c = bch2_fs_open(&devs, &opts); ret = PTR_ERR_OR_ZERO(c); if (ret) goto err; + if (opt_defined(opts, discard)) + set_bit(BCH_FS_discard_mount_opt_set, &c->flags); + /* Some options can't be parsed until after the fs is started: */ opts = bch2_opts_empty(); - ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf); + ret = bch2_parse_mount_opts(c, &opts, NULL, opts_parse->parse_later.buf, false); if (ret) goto err_stop_fs; @@ -2234,7 +2510,12 @@ got_sb: sb->s_time_min = div_s64(S64_MIN, c->sb.time_units_per_sec) + 1; sb->s_time_max = div_s64(S64_MAX, c->sb.time_units_per_sec); super_set_uuid(sb, c->sb.user_uuid.b, sizeof(c->sb.user_uuid)); - super_set_sysfs_name_uuid(sb); + + if (c->sb.multi_device) + super_set_sysfs_name_uuid(sb); + else + strscpy(sb->s_sysfs_name, c->name, sizeof(sb->s_sysfs_name)); + sb->s_shrink->seeks = 0; c->vfs_sb = sb; strscpy(sb->s_id, c->name, sizeof(sb->s_id)); @@ -2245,15 +2526,16 @@ got_sb: sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; - for_each_online_member(c, ca) { + rcu_read_lock(); + for_each_online_member_rcu(c, ca) { struct block_device *bdev = ca->disk_sb.bdev; /* XXX: create an anonymous device for multi device filesystems */ sb->s_bdev = bdev; sb->s_dev = bdev->bd_dev; - percpu_ref_put(&ca->io_ref); break; } + rcu_read_unlock(); c->dev = sb->s_dev; @@ -2264,6 +2546,11 @@ got_sb: sb->s_shrink->seeks = 0; +#ifdef CONFIG_UNICODE + sb->s_encoding = c->cf_encoding; +#endif + generic_set_sb_d_ops(sb); + vinode = bch2_vfs_inode_get(c, BCACHEFS_ROOT_SUBVOL_INUM); ret = PTR_ERR_OR_ZERO(vinode); bch_err_msg(c, ret, "mounting: error getting root inode"); @@ -2300,7 +2587,8 @@ err_stop_fs: goto err; err_put_super: - __bch2_fs_stop(c); + if (!sb->s_root) + __bch2_fs_stop(c); deactivate_locked_super(sb); goto err; } @@ -2343,6 +2631,8 @@ static int bch2_fs_parse_param(struct fs_context *fc, int ret = bch2_parse_one_mount_opt(c, &opts->opts, &opts->parse_later, param->key, param->string); + if (ret) + pr_err("Error parsing option %s: %s", param->key, bch2_err_str(ret)); return bch2_err_class(ret); } @@ -2351,8 +2641,39 @@ static int bch2_fs_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; struct bch2_opts_parse *opts = fc->fs_private; + struct bch_fs *c = sb->s_fs_info; + int ret = 0; + + opt_set(opts->opts, read_only, (fc->sb_flags & SB_RDONLY) != 0); - return bch2_remount(sb, &fc->sb_flags, opts->opts); + if (opts->opts.read_only != c->opts.read_only) { + down_write(&c->state_lock); + + if 
(opts->opts.read_only) { + bch2_fs_read_only(c); + + sb->s_flags |= SB_RDONLY; + } else { + ret = bch2_fs_read_write(c); + if (ret) { + bch_err(c, "error going rw: %i", ret); + up_write(&c->state_lock); + ret = -EINVAL; + goto err; + } + + sb->s_flags &= ~SB_RDONLY; + } + + c->opts.read_only = opts->opts.read_only; + + up_write(&c->state_lock); + } + + if (opt_defined(opts->opts, errors)) + c->opts.errors = opts->opts.errors; +err: + return bch2_err_class(ret); } static const struct fs_context_operations bch2_context_ops = { @@ -2396,7 +2717,7 @@ static struct file_system_type bcache_fs_type = { .name = "bcachefs", .init_fs_context = bch2_init_fs_context, .kill_sb = bch2_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_LBS, }; MODULE_ALIAS_FS("bcachefs"); diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 9bf316e7b845..49f46df8340e 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -10,10 +10,10 @@ #include "dirent.h" #include "error.h" #include "fs.h" -#include "fs-common.h" #include "fsck.h" #include "inode.h" #include "keylist.h" +#include "namei.h" #include "recovery_passes.h" #include "snapshot.h" #include "super.h" @@ -23,13 +23,6 @@ #include <linux/bsearch.h> #include <linux/dcache.h> /* struct qstr */ -static bool inode_points_to_dirent(struct bch_inode_unpacked *inode, - struct bkey_s_c_dirent d) -{ - return inode->bi_dir == d.k->p.inode && - inode->bi_dir_offset == d.k->p.offset; -} - static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d, struct bch_inode_unpacked *inode) { @@ -116,50 +109,6 @@ static int subvol_lookup(struct btree_trans *trans, u32 subvol, return ret; } -static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inode_nr), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inode_nr) - break; - if (!bkey_is_inode(k.k)) - continue; - ret = bch2_inode_unpack(k, inode); - goto found; - } - ret = -BCH_ERR_ENOENT_inode; -found: - bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -static int lookup_inode(struct btree_trans *trans, u64 inode_nr, u32 snapshot, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, - SPOS(0, inode_nr, snapshot), 0); - ret = bkey_err(k); - if (ret) - goto err; - - ret = bkey_is_inode(k.k) - ? 
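/*
 * (Aside: lookup_first_inode() and lookup_inode(), deleted from fsck.c
 * here, are superseded by shared helpers that this diff adds to
 * inode.c; fsck call sites become, e.g.:
 *
 *      ret = bch2_inode_find_by_inum_snapshot(trans, inode_nr, snapshot,
 *                                             &inode, 0);
 *
 * so the lookup logic is no longer duplicated privately in fsck.)
 */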
bch2_inode_unpack(k, inode) - : -BCH_ERR_ENOENT_inode; -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static int lookup_dirent_in_snapshot(struct btree_trans *trans, struct bch_hash_info hash_info, subvol_inum dir, struct qstr *name, @@ -179,32 +128,6 @@ static int lookup_dirent_in_snapshot(struct btree_trans *trans, return 0; } -static int __remove_dirent(struct btree_trans *trans, struct bpos pos) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bch_inode_unpacked dir_inode; - struct bch_hash_info dir_hash_info; - int ret; - - ret = lookup_first_inode(trans, pos.inode, &dir_inode); - if (ret) - goto err; - - dir_hash_info = bch2_hash_info_init(c, &dir_inode); - - bch2_trans_iter_init(trans, &iter, BTREE_ID_dirents, pos, BTREE_ITER_intent); - - ret = bch2_btree_iter_traverse(&iter) ?: - bch2_hash_delete_at(trans, bch2_dirent_hash_desc, - &dir_hash_info, &iter, - BTREE_UPDATE_internal_snapshot_node); - bch2_trans_iter_exit(trans, &iter); -err: - bch_err_fn(c, ret); - return ret; -} - /* * Find any subvolume associated with a tree of snapshots * We can't rely on master_subvol - it might have been deleted. @@ -242,7 +165,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, { struct bch_fs *c = trans->c; struct qstr lostfound_str = QSTR("lost+found"); - struct btree_iter lostfound_iter = { NULL }; + struct btree_iter lostfound_iter = {}; u64 inum = 0; unsigned d_type = 0; int ret; @@ -287,7 +210,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, struct bch_inode_unpacked root_inode; struct bch_hash_info root_hash_info; - ret = lookup_inode(trans, root_inum.inum, snapshot, &root_inode); + ret = bch2_inode_find_by_inum_snapshot(trans, root_inum.inum, snapshot, &root_inode, 0); bch_err_msg(c, ret, "looking up root inode %llu for subvol %u", root_inum.inum, subvolid); if (ret) @@ -313,7 +236,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, * The bch2_check_dirents pass has already run, dangling dirents * shouldn't exist here: */ - ret = lookup_inode(trans, inum, snapshot, lostfound); + ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, lostfound, 0); bch_err_msg(c, ret, "looking up lost+found %llu:%u in (root inode %llu, snapshot root %u)", inum, snapshot, root_inum.inum, bch2_snapshot_root(c, snapshot)); return ret; @@ -341,7 +264,7 @@ create_lostfound: u64 cpu = raw_smp_processor_id(); bch2_inode_init_early(c, lostfound); - bch2_inode_init_late(lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode); + bch2_inode_init_late(c, lostfound, now, 0, 0, S_IFDIR|0700, 0, &root_inode); lostfound->bi_dir = root_inode.bi_inum; lostfound->bi_snapshot = le32_to_cpu(st.root_snapshot); @@ -351,8 +274,8 @@ create_lostfound: if (ret) goto err; - bch2_btree_iter_set_snapshot(&lostfound_iter, snapshot); - ret = bch2_btree_iter_traverse(&lostfound_iter); + bch2_btree_iter_set_snapshot(trans, &lostfound_iter, snapshot); + ret = bch2_btree_iter_traverse(trans, &lostfound_iter); if (ret) goto err; @@ -362,6 +285,7 @@ create_lostfound: &lostfound_str, lostfound->bi_inum, &lostfound->bi_dir_offset, + BTREE_UPDATE_internal_snapshot_node| STR_HASH_must_create) ?: bch2_inode_write_flags(trans, &lostfound_iter, lostfound, BTREE_UPDATE_internal_snapshot_node); @@ -377,6 +301,31 @@ static inline bool inode_should_reattach(struct bch_inode_unpacked *inode) inode->bi_subvol == BCACHEFS_ROOT_SUBVOL) return false; + /* + * Subvolume roots are special: older versions of subvolume roots may be + * disconnected, it's 
only the newest version that matters. + * + * We only keep a single dirent pointing to a subvolume root, i.e. + * older versions of snapshots will not have a different dirent pointing + * to the same subvolume root. + * + * This is because dirents that point to subvolumes are only visible in + * the parent subvolume - versioning is not needed - and keeping them + * around would break fsck, because when we're crossing subvolumes we + * don't have a consistent snapshot ID to do check the inode <-> dirent + * relationships. + * + * Thus, a subvolume root that's been renamed after a snapshot will have + * a disconnected older version - that's expected. + * + * Note that taking a snapshot always updates the root inode (to update + * the dirent backpointer), so a subvolume root inode with + * BCH_INODE_has_child_snapshot is never visible. + */ + if (inode->bi_subvol && + (inode->bi_flags & BCH_INODE_has_child_snapshot)) + return false; + return !inode->bi_dir && !(inode->bi_flags & BCH_INODE_unlinked); } @@ -462,6 +411,7 @@ static int reattach_inode(struct btree_trans *trans, struct bch_inode_unpacked * &name, inode->bi_subvol ?: inode->bi_inum, &inode->bi_dir_offset, + BTREE_UPDATE_internal_snapshot_node| STR_HASH_must_create); if (ret) { bch_err_msg(c, ret, "error creating dirent"); @@ -548,7 +498,7 @@ static int remove_backpointer(struct btree_trans *trans, SPOS(inode->bi_dir, inode->bi_dir_offset, inode->bi_snapshot)); int ret = bkey_err(d) ?: dirent_points_to_inode(c, d, inode) ?: - __remove_dirent(trans, d.k->p); + bch2_fsck_remove_dirent(trans, d.k->p); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -595,12 +545,12 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub u64 cpu = raw_smp_processor_id(); bch2_inode_init_early(c, &new_inode); - bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL); + bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, S_IFDIR|0755, 0, NULL); new_inode.bi_subvol = subvolid; int ret = bch2_inode_create(trans, &inode_iter, &new_inode, snapshotid, cpu) ?: - bch2_btree_iter_traverse(&inode_iter) ?: + bch2_btree_iter_traverse(trans, &inode_iter) ?: bch2_inode_write(trans, &inode_iter, &new_inode); bch2_trans_iter_exit(trans, &inode_iter); if (ret) @@ -665,7 +615,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 struct btree_iter iter = {}; bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(inum, U64_MAX, snapshot), 0); - struct bkey_s_c k = bch2_btree_iter_peek_prev_min(&iter, POS(inum, 0)); + struct bkey_s_c k = bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum, 0)); bch2_trans_iter_exit(trans, &iter); int ret = bkey_err(k); if (ret) @@ -685,7 +635,7 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 struct bch_inode_unpacked new_inode; bch2_inode_init_early(c, &new_inode); - bch2_inode_init_late(&new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL); + bch2_inode_init_late(c, &new_inode, bch2_current_time(c), 0, 0, i_mode|0600, 0, NULL); new_inode.bi_size = i_size; new_inode.bi_inum = inum; new_inode.bi_snapshot = snapshot; @@ -816,12 +766,12 @@ static int ref_visible2(struct bch_fs *c, #define for_each_visible_inode(_c, _s, _w, _snapshot, _i) \ for (_i = (_w)->inodes.data; _i < (_w)->inodes.data + (_w)->inodes.nr && \ - (_i)->snapshot <= (_snapshot); _i++) \ - if (key_visible_in_snapshot(_c, _s, _i->snapshot, _snapshot)) + (_i)->inode.bi_snapshot <= (_snapshot); _i++) \ + if 
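/*
 * (Worked example for the rationale above: snapshot a subvolume S, then
 * rename S's root directory.  Only the current version's dirent and
 * backpointer are updated; the old snapshot's version of the root keeps
 * bi_subvol set and now carries BCH_INODE_has_child_snapshot, so
 * inode_should_reattach() deliberately leaves it disconnected instead
 * of moving it to lost+found.)
 */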
(key_visible_in_snapshot(_c, _s, _i->inode.bi_snapshot, _snapshot)) struct inode_walker_entry { struct bch_inode_unpacked inode; - u32 snapshot; + bool whiteout; u64 count; u64 i_size; }; @@ -850,13 +800,20 @@ static struct inode_walker inode_walker_init(void) static int add_inode(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c inode) { - struct bch_inode_unpacked u; - - return bch2_inode_unpack(inode, &u) ?: - darray_push(&w->inodes, ((struct inode_walker_entry) { - .inode = u, - .snapshot = inode.k->p.snapshot, + int ret = darray_push(&w->inodes, ((struct inode_walker_entry) { + .whiteout = !bkey_is_inode(inode.k), })); + if (ret) + return ret; + + struct inode_walker_entry *n = &darray_last(w->inodes); + if (!n->whiteout) { + return bch2_inode_unpack(inode, &n->inode); + } else { + n->inode.bi_inum = inode.k->p.inode; + n->inode.bi_snapshot = inode.k->p.snapshot; + return 0; + } } static int get_inodes_all_snapshots(struct btree_trans *trans, @@ -876,13 +833,12 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, w->recalculate_sums = false; w->inodes.nr = 0; - for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, POS(0, inum), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inum) + for_each_btree_key_max_norestart(trans, iter, + BTREE_ID_inodes, POS(0, inum), SPOS(0, inum, U32_MAX), + BTREE_ITER_all_snapshots, k, ret) { + ret = add_inode(c, w, k); + if (ret) break; - - if (bkey_is_inode(k.k)) - add_inode(c, w, k); } bch2_trans_iter_exit(trans, &iter); @@ -894,48 +850,112 @@ static int get_inodes_all_snapshots(struct btree_trans *trans, return 0; } +static int get_visible_inodes(struct btree_trans *trans, + struct inode_walker *w, + struct snapshots_seen *s, + u64 inum) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret; + + w->inodes.nr = 0; + w->deletes.nr = 0; + + for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inum) + break; + + if (!ref_visible(c, s, s->pos.snapshot, k.k->p.snapshot)) + continue; + + if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot)) + continue; + + ret = bkey_is_inode(k.k) + ? 
add_inode(c, w, k) + : snapshot_list_add(c, &w->deletes, k.k->p.snapshot); + if (ret) + break; + } + bch2_trans_iter_exit(trans, &iter); + + return ret; +} + static struct inode_walker_entry * -lookup_inode_for_snapshot(struct bch_fs *c, struct inode_walker *w, struct bkey_s_c k) +lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, struct bkey_s_c k) { - bool is_whiteout = k.k->type == KEY_TYPE_whiteout; + struct bch_fs *c = trans->c; struct inode_walker_entry *i; __darray_for_each(w->inodes, i) - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->snapshot)) + if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot)) goto found; return NULL; found: - BUG_ON(k.k->p.snapshot > i->snapshot); + BUG_ON(k.k->p.snapshot > i->inode.bi_snapshot); - if (k.k->p.snapshot != i->snapshot && !is_whiteout) { - struct inode_walker_entry new = *i; - - new.snapshot = k.k->p.snapshot; - new.count = 0; - new.i_size = 0; - - struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, k); + struct printbuf buf = PRINTBUF; + int ret = 0; - bch_info(c, "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" + if (fsck_err_on(k.k->p.snapshot != i->inode.bi_snapshot, + trans, snapshot_key_missing_inode_snapshot, + "have key for inode %llu:%u but have inode in ancestor snapshot %u\n" "unexpected because we should always update the inode when we update a key in that inode\n" "%s", - w->last_pos.inode, k.k->p.snapshot, i->snapshot, buf.buf); - printbuf_exit(&buf); + w->last_pos.inode, k.k->p.snapshot, i->inode.bi_snapshot, + (bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) { + struct bch_inode_unpacked new = i->inode; + struct bkey_i whiteout; + + new.bi_snapshot = k.k->p.snapshot; + + if (!i->whiteout) { + ret = __bch2_fsck_write_inode(trans, &new); + } else { + bkey_init(&whiteout.k); + whiteout.k.type = KEY_TYPE_whiteout; + whiteout.k.p = SPOS(0, i->inode.bi_inum, i->inode.bi_snapshot); + ret = bch2_btree_insert_nonextent(trans, BTREE_ID_inodes, + &whiteout, + BTREE_UPDATE_internal_snapshot_node); + } + + if (ret) + goto fsck_err; + + ret = bch2_trans_commit(trans, NULL, NULL, 0); + if (ret) + goto fsck_err; + + struct inode_walker_entry new_entry = *i; + + new_entry.inode.bi_snapshot = k.k->p.snapshot; + new_entry.count = 0; + new_entry.i_size = 0; - while (i > w->inodes.data && i[-1].snapshot > k.k->p.snapshot) + while (i > w->inodes.data && i[-1].inode.bi_snapshot > k.k->p.snapshot) --i; size_t pos = i - w->inodes.data; - int ret = darray_insert_item(&w->inodes, pos, new); + ret = darray_insert_item(&w->inodes, pos, new_entry); if (ret) - return ERR_PTR(ret); + goto fsck_err; - i = w->inodes.data + pos; + ret = -BCH_ERR_transaction_restart_nested; + goto fsck_err; } + printbuf_exit(&buf); return i; +fsck_err: + printbuf_exit(&buf); + return ERR_PTR(ret); } static struct inode_walker_entry *walk_inode(struct btree_trans *trans, @@ -950,42 +970,7 @@ static struct inode_walker_entry *walk_inode(struct btree_trans *trans, w->last_pos = k.k->p; - return lookup_inode_for_snapshot(trans->c, w, k); -} - -static int get_visible_inodes(struct btree_trans *trans, - struct inode_walker *w, - struct snapshots_seen *s, - u64 inum) -{ - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret; - - w->inodes.nr = 0; - w->deletes.nr = 0; - - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, s->pos.snapshot), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inum) - break; - - if (!ref_visible(c, s, 
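/*
 * (Aside on the lookup_inode_for_snapshot() rework above: a key whose
 * snapshot has no matching inode version used to get only a bch_info()
 * log plus an in-memory walker entry; it is now a proper fsck error,
 * repaired by writing an inode (or whiteout) at the key's snapshot and
 * restarting the transaction.  Walker entries also record whiteouts
 * explicitly (.whiteout = !bkey_is_inode(k)), so "inode deleted in this
 * snapshot" stays distinguishable from "no inode at all".)
 */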
s->pos.snapshot, k.k->p.snapshot)) - continue; - - if (snapshot_list_has_ancestor(c, &w->deletes, k.k->p.snapshot)) - continue; - - ret = bkey_is_inode(k.k) - ? add_inode(c, w, k) - : snapshot_list_add(c, &w->deletes, k.k->p.snapshot); - if (ret) - break; - } - bch2_trans_iter_exit(trans, &iter); - - return ret; + return lookup_inode_for_snapshot(trans, w, k); } /* @@ -1063,6 +1048,23 @@ static int check_inode_dirent_inode(struct btree_trans *trans, if (ret && !bch2_err_matches(ret, ENOENT)) return ret; + if ((ret || dirent_points_to_inode_nowarn(d, inode)) && + inode->bi_subvol && + (inode->bi_flags & BCH_INODE_has_child_snapshot)) { + /* Older version of a renamed subvolume root: we won't have a + * correct dirent for it. That's expected, see + * inode_should_reattach(). + * + * We don't clear the backpointer field when doing the rename + * because there might be arbitrarily many versions in older + * snapshots. + */ + inode->bi_dir = 0; + inode->bi_dir_offset = 0; + *write_inode = true; + goto out; + } + if (fsck_err_on(ret, trans, inode_points_to_missing_dirent, "inode points to missing dirent\n%s", @@ -1083,7 +1085,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans, inode->bi_dir_offset = 0; *write_inode = true; } - +out: ret = 0; fsck_err: bch2_trans_iter_exit(trans, &dirent_iter); @@ -1092,32 +1094,6 @@ fsck_err: return ret; } -static int get_snapshot_root_inode(struct btree_trans *trans, - struct bch_inode_unpacked *root, - u64 inum) -{ - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, - SPOS(0, inum, U32_MAX), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inum) - break; - if (bkey_is_inode(k.k)) - goto found_root; - } - if (ret) - goto err; - BUG(); -found_root: - ret = bch2_inode_unpack(k, root); -err: - bch2_trans_iter_exit(trans, &iter); - return ret; -} - static int check_inode(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, @@ -1148,20 +1124,23 @@ static int check_inode(struct btree_trans *trans, goto err; if (snapshot_root->bi_inum != u.bi_inum) { - ret = get_snapshot_root_inode(trans, snapshot_root, u.bi_inum); + ret = bch2_inode_find_snapshot_root(trans, u.bi_inum, snapshot_root); if (ret) goto err; } - if (fsck_err_on(u.bi_hash_seed != snapshot_root->bi_hash_seed || - INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root), - trans, inode_snapshot_mismatch, - "inode hash info in different snapshots don't match")) { - u.bi_hash_seed = snapshot_root->bi_hash_seed; - SET_INODE_STR_HASH(&u, INODE_STR_HASH(snapshot_root)); - do_update = true; + if (u.bi_hash_seed != snapshot_root->bi_hash_seed || + INODE_STR_HASH(&u) != INODE_STR_HASH(snapshot_root)) { + ret = bch2_repair_inode_hash_info(trans, snapshot_root); + BUG_ON(ret == -BCH_ERR_fsck_repair_unimplemented); + if (ret) + goto err; } + ret = bch2_check_inode_has_case_insensitive(trans, &u, &s->ids, &do_update); + if (ret) + goto err; + if (u.bi_dir || u.bi_dir_offset) { ret = check_inode_dirent_inode(trans, &u, &do_update); if (ret) @@ -1464,7 +1443,9 @@ static int check_key_has_inode(struct btree_trans *trans, if (k.k->type == KEY_TYPE_whiteout) goto out; - if (!i && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { + bool have_inode = i && !i->whiteout; + + if (!have_inode && (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_inodes))) { ret = reconstruct_inode(trans, iter->btree_id, k.k->p.snapshot, k.k->p.inode) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); if (ret) @@ 
-1475,16 +1456,16 @@ static int check_key_has_inode(struct btree_trans *trans, goto err; } - if (fsck_err_on(!i, + if (fsck_err_on(!have_inode, trans, key_in_missing_inode, - "key in missing inode:\n %s", + "key in missing inode:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) goto delete; - if (fsck_err_on(i && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), + if (fsck_err_on(have_inode && !btree_matches_i_mode(iter->btree_id, i->inode.bi_mode), trans, key_in_wrong_inode_type, - "key for wrong inode mode %o:\n %s", + "key for wrong inode mode %o:\n%s", i->inode.bi_mode, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) @@ -1510,21 +1491,21 @@ static int check_i_sectors_notnested(struct btree_trans *trans, struct inode_wal if (i->inode.bi_sectors == i->count) continue; - count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->snapshot); + count2 = bch2_count_inode_sectors(trans, w->last_pos.inode, i->inode.bi_snapshot); if (w->recalculate_sums) i->count = count2; if (i->count != count2) { bch_err_ratelimited(c, "fsck counted i_sectors wrong for inode %llu:%u: got %llu should be %llu", - w->last_pos.inode, i->snapshot, i->count, count2); + w->last_pos.inode, i->inode.bi_snapshot, i->count, count2); i->count = count2; } if (fsck_err_on(!(i->inode.bi_flags & BCH_INODE_i_sectors_dirty), trans, inode_i_sectors_wrong, "inode %llu:%u has incorrect i_sectors: got %llu, should be %llu", - w->last_pos.inode, i->snapshot, + w->last_pos.inode, i->inode.bi_snapshot, i->inode.bi_sectors, i->count)) { i->inode.bi_sectors = i->count; ret = bch2_fsck_write_inode(trans, &i->inode); @@ -1613,7 +1594,7 @@ static int overlapping_extents_found(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; - struct btree_iter iter1, iter2 = { NULL }; + struct btree_iter iter1, iter2 = {}; struct bkey_s_c k1, k2; int ret; @@ -1622,18 +1603,18 @@ static int overlapping_extents_found(struct btree_trans *trans, bch2_trans_iter_init(trans, &iter1, btree, pos1, BTREE_ITER_all_snapshots| BTREE_ITER_not_extents); - k1 = bch2_btree_iter_peek_max(&iter1, POS(pos1.inode, U64_MAX)); + k1 = bch2_btree_iter_peek_max(trans, &iter1, POS(pos1.inode, U64_MAX)); ret = bkey_err(k1); if (ret) goto err; - prt_str(&buf, "\n "); + prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, k1); if (!bpos_eq(pos1, k1.k->p)) { - prt_str(&buf, "\n wanted\n "); + prt_str(&buf, "\nwanted\n "); bch2_bpos_to_text(&buf, pos1); - prt_str(&buf, "\n "); + prt_str(&buf, "\n"); bch2_bkey_to_text(&buf, &pos2); bch_err(c, "%s: error finding first overlapping extent when repairing, got%s", @@ -1642,12 +1623,12 @@ static int overlapping_extents_found(struct btree_trans *trans, goto err; } - bch2_trans_copy_iter(&iter2, &iter1); + bch2_trans_copy_iter(trans, &iter2, &iter1); while (1) { - bch2_btree_iter_advance(&iter2); + bch2_btree_iter_advance(trans, &iter2); - k2 = bch2_btree_iter_peek_max(&iter2, POS(pos1.inode, U64_MAX)); + k2 = bch2_btree_iter_peek_max(trans, &iter2, POS(pos1.inode, U64_MAX)); ret = bkey_err(k2); if (ret) goto err; @@ -1656,7 +1637,7 @@ static int overlapping_extents_found(struct btree_trans *trans, break; } - prt_str(&buf, "\n "); + prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, k2); if (bpos_gt(k2.k->p, pos2.p) || @@ -1667,7 +1648,7 @@ static int overlapping_extents_found(struct btree_trans *trans, goto err; } - prt_printf(&buf, "\n overwriting %s extent", + prt_printf(&buf, "\noverwriting %s extent", pos1.snapshot >= pos2.p.snapshot ? 
"first" : "second"); if (fsck_err(trans, extent_overlapping, @@ -1688,6 +1669,8 @@ static int overlapping_extents_found(struct btree_trans *trans, bch2_trans_commit(trans, &res, NULL, BCH_TRANS_COMMIT_no_enospc); bch2_disk_reservation_put(c, &res); + bch_info(c, "repair ret %s", bch2_err_str(ret)); + if (ret) goto err; @@ -1833,21 +1816,21 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); inode->inodes.data && i >= inode->inodes.data; --i) { - if (i->snapshot > k.k->p.snapshot || - !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) + if (i->inode.bi_snapshot > k.k->p.snapshot || + !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) continue; if (fsck_err_on(k.k->p.offset > round_up(i->inode.bi_size, block_bytes(c)) >> 9 && !bkey_extent_is_reservation(k), trans, extent_past_end_of_inode, - "extent type past end of inode %llu:%u, i_size %llu\n %s", - i->inode.bi_inum, i->snapshot, i->inode.bi_size, + "extent type past end of inode %llu:%u, i_size %llu\n%s", + i->inode.bi_inum, i->inode.bi_snapshot, i->inode.bi_size, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { struct btree_iter iter2; - bch2_trans_copy_iter(&iter2, iter); - bch2_btree_iter_set_snapshot(&iter2, i->snapshot); - ret = bch2_btree_iter_traverse(&iter2) ?: + bch2_trans_copy_iter(trans, &iter2, iter); + bch2_btree_iter_set_snapshot(trans, &iter2, i->inode.bi_snapshot); + ret = bch2_btree_iter_traverse(trans, &iter2) ?: bch2_btree_delete_at(trans, &iter2, BTREE_UPDATE_internal_snapshot_node); bch2_trans_iter_exit(trans, &iter2); @@ -1868,8 +1851,9 @@ static int check_extent(struct btree_trans *trans, struct btree_iter *iter, for (struct inode_walker_entry *i = extent_i ?: &darray_last(inode->inodes); inode->inodes.data && i >= inode->inodes.data; --i) { - if (i->snapshot > k.k->p.snapshot || - !key_visible_in_snapshot(c, s, i->snapshot, k.k->p.snapshot)) + if (i->whiteout || + i->inode.bi_snapshot > k.k->p.snapshot || + !key_visible_in_snapshot(c, s, i->inode.bi_snapshot, k.k->p.snapshot)) continue; i->count += k.k->size; @@ -1951,13 +1935,13 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ if (i->inode.bi_nlink == i->count) continue; - count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->snapshot); + count2 = bch2_count_subdirs(trans, w->last_pos.inode, i->inode.bi_snapshot); if (count2 < 0) return count2; if (i->count != count2) { bch_err_ratelimited(c, "fsck counted subdirectories wrong for inum %llu:%u: got %llu should be %llu", - w->last_pos.inode, i->snapshot, i->count, count2); + w->last_pos.inode, i->inode.bi_snapshot, i->count, count2); i->count = count2; if (i->inode.bi_nlink == i->count) continue; @@ -1966,7 +1950,7 @@ static int check_subdir_count_notnested(struct btree_trans *trans, struct inode_ if (fsck_err_on(i->inode.bi_nlink != i->count, trans, inode_dir_wrong_nlink, "directory %llu:%u with wrong i_nlink: got %u, should be %llu", - w->last_pos.inode, i->snapshot, i->inode.bi_nlink, i->count)) { + w->last_pos.inode, i->inode.bi_snapshot, i->inode.bi_nlink, i->count)) { i->inode.bi_nlink = i->count; ret = bch2_fsck_write_inode(trans, &i->inode); if (ret) @@ -1978,197 +1962,13 @@ fsck_err: return ret; } -static int check_dir_i_size_notnested(struct btree_trans *trans, struct inode_walker *w) -{ - struct bch_fs *c = trans->c; - int ret = 0; - - darray_for_each(w->inodes, i) - if (fsck_err_on(i->inode.bi_size != i->i_size, - trans, 
inode_dir_wrong_nlink, - "directory %llu:%u with wrong i_size: got %llu, should be %llu", - w->last_pos.inode, i->snapshot, i->inode.bi_size, i->i_size)) { - i->inode.bi_size = i->i_size; - ret = bch2_fsck_write_inode(trans, &i->inode); - if (ret) - break; - } -fsck_err: - bch_err_fn(c, ret); - return ret; -} - static int check_subdir_dirents_count(struct btree_trans *trans, struct inode_walker *w) { u32 restart_count = trans->restart_count; return check_subdir_count_notnested(trans, w) ?: - check_dir_i_size_notnested(trans, w) ?: trans_was_restarted(trans, restart_count); } -noinline_for_stack -static int check_dirent_inode_dirent(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target) -{ - struct bch_fs *c = trans->c; - struct printbuf buf = PRINTBUF; - struct btree_iter bp_iter = { NULL }; - int ret = 0; - - if (inode_points_to_dirent(target, d)) - return 0; - - if (!target->bi_dir && - !target->bi_dir_offset) { - fsck_err_on(S_ISDIR(target->bi_mode), - trans, inode_dir_missing_backpointer, - "directory with missing backpointer\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_printf(&buf, "\n"), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf)); - - fsck_err_on(target->bi_flags & BCH_INODE_unlinked, - trans, inode_unlinked_but_has_dirent, - "inode unlinked but has dirent\n%s", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_printf(&buf, "\n"), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf)); - - target->bi_flags &= ~BCH_INODE_unlinked; - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - return __bch2_fsck_write_inode(trans, target); - } - - if (bch2_inode_should_have_single_bp(target) && - !fsck_err(trans, inode_wrong_backpointer, - "dirent points to inode that does not point back:\n %s", - (bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_printf(&buf, "\n "), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf))) - goto err; - - struct bkey_s_c_dirent bp_dirent = dirent_get_by_pos(trans, &bp_iter, - SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot)); - ret = bkey_err(bp_dirent); - if (ret && !bch2_err_matches(ret, ENOENT)) - goto err; - - bool backpointer_exists = !ret; - ret = 0; - - if (fsck_err_on(!backpointer_exists, - trans, inode_wrong_backpointer, - "inode %llu:%u has wrong backpointer:\n" - "got %llu:%llu\n" - "should be %llu:%llu", - target->bi_inum, target->bi_snapshot, - target->bi_dir, - target->bi_dir_offset, - d.k->p.inode, - d.k->p.offset)) { - target->bi_dir = d.k->p.inode; - target->bi_dir_offset = d.k->p.offset; - ret = __bch2_fsck_write_inode(trans, target); - goto out; - } - - bch2_bkey_val_to_text(&buf, c, d.s_c); - prt_newline(&buf); - if (backpointer_exists) - bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); - - if (fsck_err_on(backpointer_exists && - (S_ISDIR(target->bi_mode) || - target->bi_subvol), - trans, inode_dir_multiple_links, - "%s %llu:%u with multiple links\n%s", - S_ISDIR(target->bi_mode) ? 
"directory" : "subvolume", - target->bi_inum, target->bi_snapshot, buf.buf)) { - ret = __remove_dirent(trans, d.k->p); - goto out; - } - - /* - * hardlinked file with nlink 0: - * We're just adjusting nlink here so check_nlinks() will pick - * it up, it ignores inodes with nlink 0 - */ - if (fsck_err_on(backpointer_exists && !target->bi_nlink, - trans, inode_multiple_links_but_nlink_0, - "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", - target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { - target->bi_nlink++; - target->bi_flags &= ~BCH_INODE_unlinked; - ret = __bch2_fsck_write_inode(trans, target); - if (ret) - goto err; - } -out: -err: -fsck_err: - bch2_trans_iter_exit(trans, &bp_iter); - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - -noinline_for_stack -static int check_dirent_target(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c_dirent d, - struct bch_inode_unpacked *target) -{ - struct bch_fs *c = trans->c; - struct bkey_i_dirent *n; - struct printbuf buf = PRINTBUF; - int ret = 0; - - ret = check_dirent_inode_dirent(trans, iter, d, target); - if (ret) - goto err; - - if (fsck_err_on(d.v->d_type != inode_d_type(target), - trans, dirent_d_type_wrong, - "incorrect d_type: got %s, should be %s:\n%s", - bch2_d_type_str(d.v->d_type), - bch2_d_type_str(inode_d_type(target)), - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { - n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); - ret = PTR_ERR_OR_ZERO(n); - if (ret) - goto err; - - bkey_reassemble(&n->k_i, d.s_c); - n->v.d_type = inode_d_type(target); - if (n->v.d_type == DT_SUBVOL) { - n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); - n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); - } else { - n->v.d_inum = cpu_to_le64(target->bi_inum); - } - - ret = bch2_trans_update(trans, iter, &n->k_i, 0); - if (ret) - goto err; - - d = dirent_i_to_s_c(n); - } -err: -fsck_err: - printbuf_exit(&buf); - bch_err_fn(c, ret); - return ret; -} - /* find a subvolume that's a descendent of @snapshot: */ static int find_snapshot_subvol(struct btree_trans *trans, u32 snapshot, u32 *subvolid) { @@ -2262,35 +2062,46 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * 0, subvolume); ret = bkey_err(s.s_c); if (ret && !bch2_err_matches(ret, ENOENT)) - return ret; + goto err; if (ret) { if (fsck_err(trans, dirent_to_missing_subvol, "dirent points to missing subvolume\n%s", (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) - return __remove_dirent(trans, d.k->p); + return bch2_fsck_remove_dirent(trans, d.k->p); ret = 0; goto out; } - if (fsck_err_on(le32_to_cpu(s.v->fs_path_parent) != parent_subvol, - trans, subvol_fs_path_parent_wrong, - "subvol with wrong fs_path_parent, should be be %u\n%s", - parent_subvol, - (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - struct bkey_i_subvolume *n = - bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); - ret = PTR_ERR_OR_ZERO(n); + if (le32_to_cpu(s.v->fs_path_parent) != parent_subvol) { + printbuf_reset(&buf); + + prt_printf(&buf, "subvol with wrong fs_path_parent, should be be %u\n", + parent_subvol); + + ret = bch2_inum_to_path(trans, (subvol_inum) { s.k->p.offset, + le64_to_cpu(s.v->inode) }, &buf); if (ret) goto err; + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, s.s_c); - n->v.fs_path_parent = cpu_to_le32(parent_subvol); + if (fsck_err(trans, subvol_fs_path_parent_wrong, "%s", buf.buf)) { + struct bkey_i_subvolume *n = + 
bch2_bkey_make_mut_typed(trans, &subvol_iter, &s.s_c, 0, subvolume); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + n->v.fs_path_parent = cpu_to_le32(parent_subvol); + } } u64 target_inum = le64_to_cpu(s.v->inode); u32 target_snapshot = le32_to_cpu(s.v->snapshot); - ret = lookup_inode(trans, target_inum, target_snapshot, &subvol_root); + ret = bch2_inode_find_by_inum_snapshot(trans, target_inum, target_snapshot, + &subvol_root, 0); if (ret && !bch2_err_matches(ret, ENOENT)) goto err; @@ -2312,7 +2123,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * goto err; } - ret = check_dirent_target(trans, iter, d, &subvol_root); + ret = bch2_check_dirent_target(trans, iter, d, &subvol_root, true); if (ret) goto err; out: @@ -2363,7 +2174,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, if (ret) goto err; - if (!i) + if (!i || i->whiteout) goto out; if (dir->first_this_inode) @@ -2384,6 +2195,41 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); + /* check casefold */ + if (fsck_err_on(d.v->d_casefold != !!hash_info->cf_encoding, + trans, dirent_casefold_mismatch, + "dirent casefold does not match dir casefold\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), + buf.buf))) { + struct qstr name = bch2_dirent_get_name(d); + u32 subvol = d.v->d_type == DT_SUBVOL + ? le32_to_cpu(d.v->d_parent_subvol) + : 0; + u64 target = d.v->d_type == DT_SUBVOL + ? le32_to_cpu(d.v->d_child_subvol) + : le64_to_cpu(d.v->d_inum); + u64 dir_offset; + + ret = bch2_hash_delete_at(trans, + bch2_dirent_hash_desc, hash_info, iter, + BTREE_UPDATE_internal_snapshot_node) ?: + bch2_dirent_create_snapshot(trans, subvol, + d.k->p.inode, d.k->p.snapshot, + hash_info, + d.v->d_type, + &name, + target, + &dir_offset, + BTREE_ITER_with_updates| + BTREE_UPDATE_internal_snapshot_node| + STR_HASH_must_create) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); + + /* might need another check_dirents pass */ + goto out; + } + if (d.v->d_type == DT_SUBVOL) { ret = check_dirent_to_subvol(trans, iter, d); if (ret) @@ -2399,13 +2245,13 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = __remove_dirent(trans, d.k->p); + ret = bch2_fsck_remove_dirent(trans, d.k->p); if (ret) goto err; } darray_for_each(target->inodes, i) { - ret = check_dirent_target(trans, iter, d, &i->inode); + ret = bch2_check_dirent_target(trans, iter, d, &i->inode, true); if (ret) goto err; } @@ -2423,7 +2269,7 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, BTREE_ID_dirents, SPOS(k.k->p.inode, k.k->p.offset, *i), BTREE_ITER_intent); - ret = bch2_btree_iter_traverse(&delete_iter) ?: + ret = bch2_btree_iter_traverse(trans, &delete_iter) ?: bch2_hash_delete_at(trans, bch2_dirent_hash_desc, hash_info, &delete_iter, @@ -2503,7 +2349,7 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, if (ret) return ret; - if (!i) + if (!i || i->whiteout) return 0; if (inode->first_this_inode) @@ -2572,7 +2418,8 @@ static int check_root_trans(struct btree_trans *trans) goto err; } - ret = lookup_inode(trans, BCACHEFS_ROOT_INO, snapshot, &root_inode); + ret = bch2_inode_find_by_inum_snapshot(trans, BCACHEFS_ROOT_INO, snapshot, + &root_inode, 0); if (ret && !bch2_err_matches(ret, ENOENT)) return ret; @@ -2604,8 +2451,6 @@ int bch2_check_root(struct 
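/*
 * (Aside on the casefold repair above: a dirent's position in the
 * dirents btree is derived from hashing its name, and that hash differs
 * depending on whether the name is casefolded first.  A d_casefold bit
 * that disagrees with the directory's hash_info therefore can't be
 * fixed in place - the repair deletes the dirent and recreates it at
 * the correctly hashed position, which is why the code notes that
 * another check_dirents pass may be needed.)
 */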
bch_fs *c) return ret; } -typedef DARRAY(u32) darray_u32; - static bool darray_u32_has(darray_u32 *d, u32 v) { darray_for_each(*d, i) @@ -2642,7 +2487,14 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, u32 parent = le32_to_cpu(s.v->fs_path_parent); if (darray_u32_has(&subvol_path, parent)) { - if (fsck_err(c, subvol_loop, "subvolume loop")) + printbuf_reset(&buf); + prt_printf(&buf, "subvolume loop:\n"); + + darray_for_each_reverse(subvol_path, i) + prt_printf(&buf, "%u ", *i); + prt_printf(&buf, "%u", parent); + + if (fsck_err(trans, subvol_loop, "%s", buf.buf)) ret = reattach_subvol(trans, s); break; } @@ -2650,7 +2502,7 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, bch2_trans_iter_exit(trans, &parent_iter); bch2_trans_iter_init(trans, &parent_iter, BTREE_ID_subvolumes, POS(0, parent), 0); - k = bch2_btree_iter_peek_slot(&parent_iter); + k = bch2_btree_iter_peek_slot(trans, &parent_iter); ret = bkey_err(k); if (ret) goto err; @@ -2658,7 +2510,8 @@ static int check_subvol_path(struct btree_trans *trans, struct btree_iter *iter, if (fsck_err_on(k.k->type != KEY_TYPE_subvolume, trans, subvol_unreachable, "unreachable subvolume %s", - (bch2_bkey_val_to_text(&buf, c, s.s_c), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { ret = reattach_subvol(trans, s); break; @@ -2814,14 +2667,13 @@ static int check_path_loop(struct btree_trans *trans, struct bkey_s_c inode_k) redo_bi_depth = true; if (path_is_dup(&path, inode.bi_inum, snapshot)) { - /* XXX print path */ - bch_err(c, "directory structure loop"); - - darray_for_each(path, i) - pr_err("%llu:%u", i->inum, i->snapshot); - pr_err("%llu:%u", inode.bi_inum, snapshot); + printbuf_reset(&buf); + prt_printf(&buf, "directory structure loop:\n"); + darray_for_each_reverse(path, i) + prt_printf(&buf, "%llu:%u ", i->inum, i->snapshot); + prt_printf(&buf, "%llu:%u", inode.bi_inum, snapshot); - if (fsck_err(trans, dir_loop, "directory structure loop")) { + if (fsck_err(trans, dir_loop, "%s", buf.buf)) { ret = remove_backpointer(trans, &inode); bch_err_msg(c, ret, "removing dirent"); if (ret) @@ -3220,7 +3072,7 @@ long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) { struct bch_ioctl_fsck_offline arg; struct fsck_thread *thr = NULL; - darray_str(devs) = {}; + darray_const_str devs = {}; long ret = 0; if (copy_from_user(&arg, user_arg, sizeof(arg))) @@ -3261,7 +3113,7 @@ long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) if (arg.opts) { char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); ret = PTR_ERR_OR_ZERO(optstr) ?: - bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr); + bch2_parse_mount_opts(NULL, &thr->opts, NULL, optstr, false); if (!IS_ERR(optstr)) kfree(optstr); @@ -3278,7 +3130,7 @@ long bch2_ioctl_fsck_offline(struct bch_ioctl_fsck_offline __user *user_arg) bch2_thread_with_stdio_init(&thr->thr, &bch2_offline_fsck_ops); - thr->c = bch2_fs_open(devs.data, arg.nr_devs, thr->opts); + thr->c = bch2_fs_open(&devs, &thr->opts); if (!IS_ERR(thr->c) && thr->c->opts.errors == BCH_ON_ERROR_panic) @@ -3315,19 +3167,18 @@ static int bch2_fsck_online_thread_fn(struct thread_with_stdio *stdio) c->opts.fix_errors = FSCK_FIX_ask; c->opts.fsck = true; - set_bit(BCH_FS_fsck_running, &c->flags); + set_bit(BCH_FS_in_fsck, &c->flags); - c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; - int ret = bch2_run_online_recovery_passes(c); + int ret = 
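/*
 * (Aside: bch2_run_online_recovery_passes() now takes what appears to
 * be an explicit mask of passes to run - ~0ULL requesting all of them -
 * replacing the old scheme of seeding c->curr_recovery_pass before the
 * call.)
 */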
bch2_run_online_recovery_passes(c, ~0ULL); - clear_bit(BCH_FS_fsck_running, &c->flags); + clear_bit(BCH_FS_in_fsck, &c->flags); bch_err_fn(c, ret); c->stdio = NULL; c->stdio_filter = NULL; c->opts.fix_errors = old_fix_errors; - up(&c->online_fsck_mutex); + up(&c->recovery.run_lock); bch2_ro_ref_put(c); return ret; } @@ -3351,7 +3202,7 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) if (!bch2_ro_ref_tryget(c)) return -EROFS; - if (down_trylock(&c->online_fsck_mutex)) { + if (down_trylock(&c->recovery.run_lock)) { bch2_ro_ref_put(c); return -EAGAIN; } @@ -3369,7 +3220,7 @@ long bch2_ioctl_fsck_online(struct bch_fs *c, struct bch_ioctl_fsck_online arg) char *optstr = strndup_user((char __user *)(unsigned long) arg.opts, 1 << 16); ret = PTR_ERR_OR_ZERO(optstr) ?: - bch2_parse_mount_opts(c, &thr->opts, NULL, optstr); + bch2_parse_mount_opts(c, &thr->opts, NULL, optstr, false); if (!IS_ERR(optstr)) kfree(optstr); @@ -3383,7 +3234,7 @@ err: bch_err_fn(c, ret); if (thr) bch2_fsck_thread_exit(&thr->thr); - up(&c->online_fsck_mutex); + up(&c->recovery.run_lock); bch2_ro_ref_put(c); } return ret; diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 04ec05206f8c..5cf70108ae2f 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -14,6 +14,7 @@ #include "extent_update.h" #include "fs.h" #include "inode.h" +#include "namei.h" #include "opts.h" #include "str_hash.h" #include "snapshot.h" @@ -240,6 +241,7 @@ static int bch2_inode_unpack_v3(struct bkey_s_c k, u64 v[2]; unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_snapshot = inode.k->p.snapshot; unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); unpacked->bi_hash_seed = inode.v->bi_hash_seed; unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); @@ -284,13 +286,12 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, { memset(unpacked, 0, sizeof(*unpacked)); - unpacked->bi_snapshot = k.k->p.snapshot; - switch (k.k->type) { case KEY_TYPE_inode: { struct bkey_s_c_inode inode = bkey_s_c_to_inode(k); unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_snapshot = inode.k->p.snapshot; unpacked->bi_journal_seq= 0; unpacked->bi_hash_seed = inode.v->bi_hash_seed; unpacked->bi_flags = le32_to_cpu(inode.v->bi_flags); @@ -309,6 +310,7 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, struct bkey_s_c_inode_v2 inode = bkey_s_c_to_inode_v2(k); unpacked->bi_inum = inode.k->p.offset; + unpacked->bi_snapshot = inode.k->p.snapshot; unpacked->bi_journal_seq= le64_to_cpu(inode.v->bi_journal_seq); unpacked->bi_hash_seed = inode.v->bi_hash_seed; unpacked->bi_flags = le64_to_cpu(inode.v->bi_flags); @@ -326,8 +328,6 @@ static noinline int bch2_inode_unpack_slowpath(struct bkey_s_c k, int bch2_inode_unpack(struct bkey_s_c k, struct bch_inode_unpacked *unpacked) { - unpacked->bi_snapshot = k.k->p.snapshot; - return likely(k.k->type == KEY_TYPE_inode_v3) ? bch2_inode_unpack_v3(k, unpacked) : bch2_inode_unpack_slowpath(k, unpacked); @@ -367,6 +367,82 @@ err: return ret; } +int bch2_inode_find_by_inum_snapshot(struct btree_trans *trans, + u64 inode_nr, u32 snapshot, + struct bch_inode_unpacked *inode, + unsigned flags) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_inodes, + SPOS(0, inode_nr, snapshot), flags); + int ret = bkey_err(k); + if (ret) + goto err; + + ret = bkey_is_inode(k.k) + ? 
bch2_inode_unpack(k, inode) + : -BCH_ERR_ENOENT_inode; +err: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans, + subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + int ret; + + ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0); + if (!ret) + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_inode_find_by_inum_trans(struct btree_trans *trans, + subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + struct btree_iter iter; + int ret; + + ret = bch2_inode_peek(trans, &iter, inode, inum, 0); + if (!ret) + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode)); +} + +int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum, + struct bch_inode_unpacked *root) +{ + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, + SPOS(0, inum, U32_MAX), + BTREE_ITER_all_snapshots, k, ret) { + if (k.k->p.offset != inum) + break; + if (bkey_is_inode(k.k)) { + ret = bch2_inode_unpack(k, root); + goto out; + } + } + /* We're only called when we know we have an inode for @inum */ + BUG_ON(!ret); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + int bch2_inode_write_flags(struct btree_trans *trans, struct btree_iter *iter, struct bch_inode_unpacked *inode, @@ -731,10 +807,9 @@ int bch2_trigger_inode(struct btree_trans *trans, bkey_s_to_inode_v3(new).v->bi_journal_seq = cpu_to_le64(trans->journal_res.seq); } - s64 nr = bkey_is_inode(new.k) - bkey_is_inode(old.k); - if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr) { - struct disk_accounting_pos acc = { .type = BCH_DISK_ACCOUNTING_nr_inodes }; - int ret = bch2_disk_accounting_mod(trans, &acc, &nr, 1, flags & BTREE_TRIGGER_gc); + s64 nr[1] = { bkey_is_inode(new.k) - bkey_is_inode(old.k) }; + if ((flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) && nr[0]) { + int ret = bch2_disk_accounting_mod2(trans, flags & BTREE_TRIGGER_gc, nr, nr_inodes); if (ret) return ret; } @@ -833,7 +908,8 @@ void bch2_inode_init_early(struct bch_fs *c, get_random_bytes(&inode_u->bi_hash_seed, sizeof(inode_u->bi_hash_seed)); } -void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, +void bch2_inode_init_late(struct bch_fs *c, + struct bch_inode_unpacked *inode_u, u64 now, uid_t uid, gid_t gid, umode_t mode, dev_t rdev, struct bch_inode_unpacked *parent) { @@ -857,6 +933,12 @@ void bch2_inode_init_late(struct bch_inode_unpacked *inode_u, u64 now, BCH_INODE_OPTS() #undef x } + + if (!S_ISDIR(mode)) + inode_u->bi_casefold = 0; + + if (bch2_inode_casefold(c, inode_u)) + inode_u->bi_flags |= BCH_INODE_has_case_insensitive; } void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, @@ -864,23 +946,10 @@ void bch2_inode_init(struct bch_fs *c, struct bch_inode_unpacked *inode_u, struct bch_inode_unpacked *parent) { bch2_inode_init_early(c, inode_u); - bch2_inode_init_late(inode_u, bch2_current_time(c), + bch2_inode_init_late(c, inode_u, bch2_current_time(c), uid, gid, mode, rdev, parent); } -static inline u32 bkey_generation(struct bkey_s_c k) -{ - switch (k.k->type) { - case KEY_TYPE_inode: - case KEY_TYPE_inode_v2: - BUG(); - case KEY_TYPE_inode_generation: - return 
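/*
 * (Aside: snapshot IDs decrease from ancestor to descendant in
 * bcachefs, so the snapshot root of an inode is the version with the
 * highest snapshot ID - hence bch2_inode_find_snapshot_root() above
 * scans BTREE_ID_inodes in reverse from SPOS(0, inum, U32_MAX) and
 * takes the first inode it finds.)
 */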
le32_to_cpu(bkey_s_c_to_inode_generation(k).v->bi_generation); - default: - return 0; - } -} - static struct bkey_i_inode_alloc_cursor * bch2_inode_alloc_cursor_get(struct btree_trans *trans, u64 cpu, u64 *min, u64 *max) { @@ -954,7 +1023,7 @@ int bch2_inode_create(struct btree_trans *trans, BTREE_ITER_intent); struct bkey_s_c k; again: - while ((k = bch2_btree_iter_peek(iter)).k && + while ((k = bch2_btree_iter_peek(trans, iter)).k && !(ret = bkey_err(k)) && bkey_lt(k.k->p, POS(0, max))) { if (pos < iter->pos.offset) @@ -965,7 +1034,7 @@ again: * we've found just one: */ pos = iter->pos.offset + 1; - bch2_btree_iter_set_pos(iter, POS(0, pos)); + bch2_btree_iter_set_pos(trans, iter, POS(0, pos)); } if (!ret && pos < max) @@ -981,12 +1050,12 @@ again: /* Retry from start */ pos = start = min; - bch2_btree_iter_set_pos(iter, POS(0, pos)); + bch2_btree_iter_set_pos(trans, iter, POS(0, pos)); le32_add_cpu(&cursor->v.gen, 1); goto again; found_slot: - bch2_btree_iter_set_pos(iter, SPOS(0, pos, snapshot)); - k = bch2_btree_iter_peek_slot(iter); + bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, snapshot)); + k = bch2_btree_iter_peek_slot(trans, iter); ret = bkey_err(k); if (ret) { bch2_trans_iter_exit(trans, iter); @@ -1023,9 +1092,9 @@ static int bch2_inode_delete_keys(struct btree_trans *trans, if (ret) goto err; - bch2_btree_iter_set_snapshot(&iter, snapshot); + bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - k = bch2_btree_iter_peek_max(&iter, end); + k = bch2_btree_iter_peek_max(trans, &iter, end); ret = bkey_err(k); if (ret) goto err; @@ -1056,7 +1125,7 @@ err: int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) { struct btree_trans *trans = bch2_trans_get(c); - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; struct bkey_s_c k; u32 snapshot; int ret; @@ -1092,7 +1161,7 @@ retry: bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum.inum, snapshot); - ret = -EIO; + ret = -BCH_ERR_ENOENT_inode; goto err; } @@ -1113,38 +1182,6 @@ err2: return ret; } -int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *trans, - subvol_inum inum, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - int ret; - - ret = bch2_inode_peek_nowarn(trans, &iter, inode, inum, 0); - if (!ret) - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_inode_find_by_inum_trans(struct btree_trans *trans, - subvol_inum inum, - struct bch_inode_unpacked *inode) -{ - struct btree_iter iter; - int ret; - - ret = bch2_inode_peek(trans, &iter, inode, inum, 0); - if (!ret) - bch2_trans_iter_exit(trans, &iter); - return ret; -} - -int bch2_inode_find_by_inum(struct bch_fs *c, subvol_inum inum, - struct bch_inode_unpacked *inode) -{ - return bch2_trans_do(c, bch2_inode_find_by_inum_trans(trans, inum, inode)); -} - int bch2_inode_nlink_inc(struct bch_inode_unpacked *bi) { if (bi->bi_flags & BCH_INODE_unlinked) @@ -1198,6 +1235,7 @@ void bch2_inode_opts_get(struct bch_io_opts *opts, struct bch_fs *c, opts->_name##_from_inode = true; \ } else { \ opts->_name = c->opts._name; \ + opts->_name##_from_inode = false; \ } BCH_INODE_OPTS() #undef x @@ -1217,10 +1255,45 @@ int bch2_inum_opts_get(struct btree_trans *trans, subvol_inum inum, struct bch_i return 0; } +int bch2_inode_set_casefold(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *bi, unsigned v) +{ + struct bch_fs *c = trans->c; + +#ifdef CONFIG_UNICODE + int ret = 0; + /* Not supported on individual files. 
*/ + if (!S_ISDIR(bi->bi_mode)) + return -EOPNOTSUPP; + + /* + * Make sure the dir is empty, as otherwise we'd need to + * rehash everything and update the dirent keys. + */ + ret = bch2_empty_dir_trans(trans, inum); + if (ret < 0) + return ret; + + ret = bch2_request_incompat_feature(c, bcachefs_metadata_version_casefolding); + if (ret) + return ret; + + bch2_check_set_feature(c, BCH_FEATURE_casefolding); + + bi->bi_casefold = v + 1; + bi->bi_fields_set |= BIT(Inode_opt_casefold); + + return bch2_maybe_propagate_has_case_insensitive(trans, inum, bi); +#else + bch_err(c, "Cannot use casefolding on a kernel without CONFIG_UNICODE"); + return -EOPNOTSUPP; +#endif +} + static noinline int __bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) { struct bch_fs *c = trans->c; - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; struct bkey_i_inode_generation delete; struct bch_inode_unpacked inode_u; struct bkey_s_c k; @@ -1255,7 +1328,7 @@ retry: bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum, snapshot); - ret = -EIO; + ret = -BCH_ERR_ENOENT_inode; goto err; } diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 428b9be6af34..77ad2d549541 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -134,10 +134,21 @@ static inline int bch2_inode_peek(struct btree_trans *trans, subvol_inum inum, unsigned flags) { return __bch2_inode_peek(trans, iter, inode, inum, flags, true); - int ret = bch2_inode_peek_nowarn(trans, iter, inode, inum, flags); - return ret; } +int bch2_inode_find_by_inum_snapshot(struct btree_trans *, u64, u32, + struct bch_inode_unpacked *, unsigned); +int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *, + subvol_inum, + struct bch_inode_unpacked *); +int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *); +int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, + struct bch_inode_unpacked *); + +int bch2_inode_find_snapshot_root(struct btree_trans *trans, u64 inum, + struct bch_inode_unpacked *root); + int bch2_inode_write_flags(struct btree_trans *, struct btree_iter *, struct bch_inode_unpacked *, enum btree_iter_update_trigger_flags); @@ -153,7 +164,7 @@ int bch2_fsck_write_inode(struct btree_trans *, struct bch_inode_unpacked *); void bch2_inode_init_early(struct bch_fs *, struct bch_inode_unpacked *); -void bch2_inode_init_late(struct bch_inode_unpacked *, u64, +void bch2_inode_init_late(struct bch_fs *, struct bch_inode_unpacked *, u64, uid_t, gid_t, umode_t, dev_t, struct bch_inode_unpacked *); void bch2_inode_init(struct bch_fs *, struct bch_inode_unpacked *, @@ -165,14 +176,6 @@ int bch2_inode_create(struct btree_trans *, struct btree_iter *, int bch2_inode_rm(struct bch_fs *, subvol_inum); -int bch2_inode_find_by_inum_nowarn_trans(struct btree_trans *, - subvol_inum, - struct bch_inode_unpacked *); -int bch2_inode_find_by_inum_trans(struct btree_trans *, subvol_inum, - struct bch_inode_unpacked *); -int bch2_inode_find_by_inum(struct bch_fs *, subvol_inum, - struct bch_inode_unpacked *); - #define inode_opt_get(_c, _inode, _name) \ ((_inode)->bi_##_name ? (_inode)->bi_##_name - 1 : (_c)->opts._name) @@ -243,6 +246,14 @@ static inline unsigned bkey_inode_mode(struct bkey_s_c k) } } +static inline bool bch2_inode_casefold(struct bch_fs *c, const struct bch_inode_unpacked *bi) +{ + /* inode opts are stored with a +1 bias: 0 means "unset, use fs opt" */ + return bi->bi_casefold + ? 
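/*
 * (Worked example of the +1 bias noted above: bi_casefold == 0 means
 * "unset, inherit c->opts.casefold"; bi_casefold == 1 means casefold
 * explicitly off; bi_casefold == 2 means explicitly on - which is why
 * bch2_inode_set_casefold() earlier in this diff stores v + 1.)
 */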
bi->bi_casefold - 1 + : c->opts.casefold; +} + /* i_nlink: */ static inline unsigned nlink_bias(umode_t mode) @@ -277,13 +288,16 @@ static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *i bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset; return S_ISDIR(inode->bi_mode) || + inode->bi_subvol || (!inode->bi_nlink && inode_has_bp); } struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, struct bch_inode_unpacked *); -int bch2_inum_opts_get(struct btree_trans*, subvol_inum, struct bch_io_opts *); +int bch2_inum_opts_get(struct btree_trans *, subvol_inum, struct bch_io_opts *); +int bch2_inode_set_casefold(struct btree_trans *, subvol_inum, + struct bch_inode_unpacked *, unsigned); #include "rebalance.h" @@ -295,6 +309,14 @@ bch2_inode_rebalance_opts_get(struct bch_fs *c, struct bch_inode_unpacked *inode return io_opts_to_rebalance_opts(c, &io_opts); } +#define BCACHEFS_ROOT_SUBVOL_INUM \ + ((subvol_inum) { BCACHEFS_ROOT_SUBVOL, BCACHEFS_ROOT_INO }) + +static inline bool subvol_inum_eq(subvol_inum a, subvol_inum b) +{ + return a.subvol == b.subvol && a.inum == b.inum; +} + int bch2_inode_rm_snapshot(struct btree_trans *, u64, u32); int bch2_delete_dead_inodes(struct bch_fs *); diff --git a/fs/bcachefs/inode_format.h b/fs/bcachefs/inode_format.h index b99a5bf1a75e..1f00938b1bdc 100644 --- a/fs/bcachefs/inode_format.h +++ b/fs/bcachefs/inode_format.h @@ -103,7 +103,8 @@ struct bch_inode_generation { x(bi_parent_subvol, 32) \ x(bi_nocow, 8) \ x(bi_depth, 32) \ - x(bi_inodes_32bit, 8) + x(bi_inodes_32bit, 8) \ + x(bi_casefold, 8) /* subset of BCH_INODE_FIELDS */ #define BCH_INODE_OPTS() \ @@ -117,7 +118,8 @@ struct bch_inode_generation { x(background_target, 16) \ x(erasure_code, 16) \ x(nocow, 8) \ - x(inodes_32bit, 8) + x(inodes_32bit, 8) \ + x(casefold, 8) enum inode_opt_id { #define x(name, ...) 
\ @@ -127,6 +129,10 @@ enum inode_opt_id { Inode_opt_nr, }; +/* + * BCH_INODE_has_case_insensitive is set if any descendent is case insensitive - + * for overlayfs + */ #define BCH_INODE_FLAGS() \ x(sync, 0) \ x(immutable, 1) \ @@ -137,7 +143,8 @@ enum inode_opt_id { x(i_sectors_dirty, 6) \ x(unlinked, 7) \ x(backptr_untrusted, 8) \ - x(has_child_snapshot, 9) + x(has_child_snapshot, 9) \ + x(has_case_insensitive, 10) /* bits 20+ reserved for packed fields below: */ diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index 5353979117b0..cc07729a4b62 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -43,7 +43,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, bch2_bkey_buf_init(&new); closure_init_stack(&cl); - k = bch2_btree_iter_peek_slot(iter); + k = bch2_btree_iter_peek_slot(trans, iter); ret = bkey_err(k); if (ret) return ret; @@ -115,7 +115,8 @@ err: bch2_increment_clock(c, sectors_allocated, WRITE); if (should_print_err(ret)) { struct printbuf buf = PRINTBUF; - bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9); + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, &buf, inum, iter->pos.offset << 9)); prt_printf(&buf, "fallocate error: %s", bch2_err_str(ret)); bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); @@ -163,12 +164,12 @@ int bch2_fpunch_at(struct btree_trans *trans, struct btree_iter *iter, if (ret) continue; - bch2_btree_iter_set_snapshot(iter, snapshot); + bch2_btree_iter_set_snapshot(trans, iter, snapshot); /* * peek_max() doesn't have ideal semantics for extents: */ - k = bch2_btree_iter_peek_max(iter, end_pos); + k = bch2_btree_iter_peek_max(trans, iter, end_pos); if (!k.k) break; @@ -229,7 +230,7 @@ static int truncate_set_isize(struct btree_trans *trans, u64 new_i_size, bool warn) { - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; struct bch_inode_unpacked inode_u; int ret; @@ -398,7 +399,7 @@ case LOGGED_OP_FINSERT_start: if (ret) goto err; } else { - bch2_btree_iter_set_pos(&iter, POS(inum.inum, src_offset)); + bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, src_offset)); ret = bch2_fpunch_at(trans, &iter, inum, src_offset + len, i_sectors_delta); if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -424,12 +425,12 @@ case LOGGED_OP_FINSERT_shift_extents: if (ret) goto btree_err; - bch2_btree_iter_set_snapshot(&iter, snapshot); - bch2_btree_iter_set_pos(&iter, SPOS(inum.inum, pos, snapshot)); + bch2_btree_iter_set_snapshot(trans, &iter, snapshot); + bch2_btree_iter_set_pos(trans, &iter, SPOS(inum.inum, pos, snapshot)); k = insert - ? bch2_btree_iter_peek_prev_min(&iter, POS(inum.inum, 0)) - : bch2_btree_iter_peek_max(&iter, POS(inum.inum, U64_MAX)); + ? 
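/*
 * (Aside on a pattern repeated throughout this diff: the btree iterator
 * entry points - peek, peek_slot, peek_max, traverse, set_pos,
 * set_snapshot, copy_iter - now take the btree_trans explicitly as
 * their first argument rather than recovering it from state cached in
 * the iterator.)
 */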
bch2_btree_iter_peek_prev_min(trans, &iter, POS(inum.inum, 0)) + : bch2_btree_iter_peek_max(trans, &iter, POS(inum.inum, U64_MAX)); if ((ret = bkey_err(k))) goto btree_err; diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index 8c7b2d3d779d..cc708d46557e 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -9,6 +9,7 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "async_objs.h" #include "btree_update.h" #include "buckets.h" #include "checksum.h" @@ -17,6 +18,7 @@ #include "data_update.h" #include "disk_groups.h" #include "ec.h" +#include "enumerated_ref.h" #include "error.h" #include "io_read.h" #include "io_misc.h" @@ -25,8 +27,22 @@ #include "subvolume.h" #include "trace.h" +#include <linux/moduleparam.h> +#include <linux/random.h> #include <linux/sched/mm.h> +#ifdef CONFIG_BCACHEFS_DEBUG +static unsigned bch2_read_corrupt_ratio; +module_param_named(read_corrupt_ratio, bch2_read_corrupt_ratio, uint, 0644); +MODULE_PARM_DESC(read_corrupt_ratio, ""); +#endif + +static bool bch2_poison_extents_on_checksum_error; +module_param_named(poison_extents_on_checksum_error, + bch2_poison_extents_on_checksum_error, bool, 0644); +MODULE_PARM_DESC(poison_extents_on_checksum_error, + "Extents with checksum errors are marked as poisoned - unsafe without read fua support"); + #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT static bool bch2_target_congested(struct bch_fs *c, u16 target) @@ -59,7 +75,7 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target) } rcu_read_unlock(); - return bch2_rand_range(nr * CONGESTED_MAX) < total; + return get_random_u32_below(nr * CONGESTED_MAX) < total; } #else @@ -73,17 +89,6 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target) /* Cache promotion on read */ -struct promote_op { - struct rcu_head rcu; - u64 start_time; - - struct rhash_head hash; - struct bpos pos; - - struct data_update write; - struct bio_vec bi_inline_vecs[]; /* must be last */ -}; - static const struct rhashtable_params bch_promote_params = { .head_offset = offsetof(struct promote_op, hash), .key_offset = offsetof(struct promote_op, pos), @@ -96,6 +101,33 @@ static inline bool have_io_error(struct bch_io_failures *failed) return failed && failed->nr; } +static inline struct data_update *rbio_data_update(struct bch_read_bio *rbio) +{ + EBUG_ON(rbio->split); + + return rbio->data_update + ? 
container_of(rbio, struct data_update, rbio) + : NULL; +} + +static bool ptr_being_rewritten(struct bch_read_bio *orig, unsigned dev) +{ + struct data_update *u = rbio_data_update(orig); + if (!u) + return false; + + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(u->k.k)); + unsigned i = 0; + bkey_for_each_ptr(ptrs, ptr) { + if (ptr->dev == dev && + u->data_opts.rewrite_ptrs & BIT(i)) + return true; + i++; + } + + return false; +} + static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, struct bpos pos, struct bch_io_opts opts, @@ -105,7 +137,7 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, if (!have_io_error(failed)) { BUG_ON(!opts.promote_target); - if (!(flags & BCH_READ_MAY_PROMOTE)) + if (!(flags & BCH_READ_may_promote)) return -BCH_ERR_nopromote_may_not; if (bch2_bkey_has_target(c, k, opts.promote_target)) @@ -125,98 +157,95 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, return 0; } -static void promote_free(struct bch_fs *c, struct promote_op *op) +static noinline void promote_free(struct bch_read_bio *rbio) { - int ret; + struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); + struct bch_fs *c = rbio->c; + + int ret = rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params); + BUG_ON(ret); + + async_object_list_del(c, promote, op->list_idx); bch2_data_update_exit(&op->write); - ret = rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params); - BUG_ON(ret); - bch2_write_ref_put(c, BCH_WRITE_REF_promote); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote); kfree_rcu(op, rcu); } static void promote_done(struct bch_write_op *wop) { - struct promote_op *op = - container_of(wop, struct promote_op, write.op); - struct bch_fs *c = op->write.op.c; + struct promote_op *op = container_of(wop, struct promote_op, write.op); + struct bch_fs *c = op->write.rbio.c; - bch2_time_stats_update(&c->times[BCH_TIME_data_promote], - op->start_time); - promote_free(c, op); + bch2_time_stats_update(&c->times[BCH_TIME_data_promote], op->start_time); + promote_free(&op->write.rbio); } -static void promote_start(struct promote_op *op, struct bch_read_bio *rbio) +static void promote_start_work(struct work_struct *work) { - struct bio *bio = &op->write.op.wbio.bio; + struct promote_op *op = container_of(work, struct promote_op, work); - trace_and_count(op->write.op.c, read_promote, &rbio->bio); + bch2_data_update_read_done(&op->write); +} - /* we now own pages: */ - BUG_ON(!rbio->bounce); - BUG_ON(rbio->bio.bi_vcnt > bio->bi_max_vecs); +static noinline void promote_start(struct bch_read_bio *rbio) +{ + struct promote_op *op = container_of(rbio, struct promote_op, write.rbio); - memcpy(bio->bi_io_vec, rbio->bio.bi_io_vec, - sizeof(struct bio_vec) * rbio->bio.bi_vcnt); - swap(bio->bi_vcnt, rbio->bio.bi_vcnt); + trace_and_count(op->write.op.c, io_read_promote, &rbio->bio); - bch2_data_update_read_done(&op->write, rbio->pick.crc); + INIT_WORK(&op->work, promote_start_work); + queue_work(rbio->c->write_ref_wq, &op->work); } -static struct promote_op *__promote_alloc(struct btree_trans *trans, - enum btree_id btree_id, - struct bkey_s_c k, - struct bpos pos, - struct extent_ptr_decoded *pick, - struct bch_io_opts opts, - unsigned sectors, - struct bch_read_bio **rbio, - struct bch_io_failures *failed) +static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, + enum btree_id btree_id, + struct bkey_s_c k, + struct bpos pos, + struct extent_ptr_decoded *pick, + unsigned 
sectors, + struct bch_read_bio *orig, + struct bch_io_failures *failed) { struct bch_fs *c = trans->c; - struct promote_op *op = NULL; - struct bio *bio; - unsigned pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); int ret; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_promote)) - return ERR_PTR(-BCH_ERR_nopromote_no_writes); + struct data_update_opts update_opts = { .write_flags = BCH_WRITE_alloc_nowait }; - op = kzalloc(struct_size(op, bi_inline_vecs, pages), GFP_KERNEL); - if (!op) { - ret = -BCH_ERR_nopromote_enomem; - goto err; - } + if (!have_io_error(failed)) { + update_opts.target = orig->opts.promote_target; + update_opts.extra_replicas = 1; + update_opts.write_flags |= BCH_WRITE_cached; + update_opts.write_flags |= BCH_WRITE_only_specified_devs; + } else { + update_opts.target = orig->opts.foreground_target; - op->start_time = local_clock(); - op->pos = pos; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + unsigned ptr_bit = 1; + bkey_for_each_ptr(ptrs, ptr) { + if (bch2_dev_io_failures(failed, ptr->dev) && + !ptr_being_rewritten(orig, ptr->dev)) + update_opts.rewrite_ptrs |= ptr_bit; + ptr_bit <<= 1; + } - /* - * We don't use the mempool here because extents that aren't - * checksummed or compressed can be too big for the mempool: - */ - *rbio = kzalloc(sizeof(struct bch_read_bio) + - sizeof(struct bio_vec) * pages, - GFP_KERNEL); - if (!*rbio) { - ret = -BCH_ERR_nopromote_enomem; - goto err; + if (!update_opts.rewrite_ptrs) + return NULL; } - rbio_init(&(*rbio)->bio, opts); - bio_init(&(*rbio)->bio, NULL, (*rbio)->bio.bi_inline_vecs, pages, 0); + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_promote)) + return ERR_PTR(-BCH_ERR_nopromote_no_writes); - if (bch2_bio_alloc_pages(&(*rbio)->bio, sectors << 9, GFP_KERNEL)) { + struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL); + if (!op) { ret = -BCH_ERR_nopromote_enomem; - goto err; + goto err_put; } - (*rbio)->bounce = true; - (*rbio)->split = true; - (*rbio)->kmalloc = true; + op->start_time = local_clock(); + op->pos = pos; if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, bch_promote_params)) { @@ -224,68 +253,61 @@ static struct promote_op *__promote_alloc(struct btree_trans *trans, goto err; } - bio = &op->write.op.wbio.bio; - bio_init(bio, NULL, bio->bi_inline_vecs, pages, 0); - - struct data_update_opts update_opts = {}; - - if (!have_io_error(failed)) { - update_opts.target = opts.promote_target; - update_opts.extra_replicas = 1; - update_opts.write_flags = BCH_WRITE_ALLOC_NOWAIT|BCH_WRITE_CACHED; - } else { - update_opts.target = opts.foreground_target; - - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - unsigned ptr_bit = 1; - bkey_for_each_ptr(ptrs, ptr) { - if (bch2_dev_io_failures(failed, ptr->dev)) - update_opts.rewrite_ptrs |= ptr_bit; - ptr_bit <<= 1; - } - } + ret = async_object_list_add(c, promote, op, &op->list_idx); + if (ret < 0) + goto err_remove_hash; ret = bch2_data_update_init(trans, NULL, NULL, &op->write, writepoint_hashed((unsigned long) current), - opts, + &orig->opts, update_opts, btree_id, k); + op->write.type = BCH_DATA_UPDATE_promote; /* * possible errors: -BCH_ERR_nocow_lock_blocked, * -BCH_ERR_ENOSPC_disk_reservation: */ - if (ret) { - BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, - bch_promote_params)); - goto err; - } + if (ret) + goto err_remove_list; + rbio_init_fragment(&op->write.rbio.bio, orig); + op->write.rbio.bounce = true; + op->write.rbio.promote = true; op->write.op.end_io = promote_done; - return op; + return &op->write.rbio; +err_remove_list: + 
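
In the retry branch of __promote_alloc() above, rewrite_ptrs is accumulated one bit per pointer position, marking exactly the replicas that saw I/O failures and are not already being rewritten. A standalone sketch of that accumulation, assuming a simplified pointer array in place of bkey_for_each_ptr():

    #include <stdbool.h>
    #include <stdio.h>

    struct eptr { unsigned dev; };

    static bool dev_failed(const unsigned *failed, unsigned nr, unsigned dev)
    {
        for (unsigned i = 0; i < nr; i++)
            if (failed[i] == dev)
                return true;
        return false;
    }

    int main(void)
    {
        struct eptr ptrs[] = { { .dev = 0 }, { .dev = 3 }, { .dev = 5 } };
        unsigned failed[] = { 3 };
        unsigned rewrite_ptrs = 0, ptr_bit = 1;

        /* one bit per pointer position in the key, not per device number */
        for (unsigned i = 0; i < 3; i++) {
            if (dev_failed(failed, 1, ptrs[i].dev))
                rewrite_ptrs |= ptr_bit;
            ptr_bit <<= 1;
        }

        printf("rewrite_ptrs = 0x%x\n", rewrite_ptrs);  /* 0x2: 2nd pointer */
        return 0;
    }
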
async_object_list_del(c, promote, op->list_idx); +err_remove_hash: + BUG_ON(rhashtable_remove_fast(&c->promote_table, &op->hash, + bch_promote_params)); err: - if (*rbio) - bio_free_pages(&(*rbio)->bio); - kfree(*rbio); - *rbio = NULL; + bio_free_pages(&op->write.op.wbio.bio); /* We may have added to the rhashtable and thus need rcu freeing: */ kfree_rcu(op, rcu); - bch2_write_ref_put(c, BCH_WRITE_REF_promote); +err_put: + enumerated_ref_put(&c->writes, BCH_WRITE_REF_promote); return ERR_PTR(ret); } noinline -static struct promote_op *promote_alloc(struct btree_trans *trans, +static struct bch_read_bio *promote_alloc(struct btree_trans *trans, struct bvec_iter iter, struct bkey_s_c k, struct extent_ptr_decoded *pick, - struct bch_io_opts opts, unsigned flags, - struct bch_read_bio **rbio, + struct bch_read_bio *orig, bool *bounce, bool *read_full, struct bch_io_failures *failed) { + /* + * We're in the retry path, but we don't know what to repair yet, and we + * don't want to do a promote here: + */ + if (failed && !failed->nr) + return NULL; + struct bch_fs *c = trans->c; /* * if failed != NULL we're not actually doing a promote, we're @@ -301,18 +323,21 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, struct bpos pos = promote_full ? bkey_start_pos(k.k) : POS(k.k->p.inode, iter.bi_sector); - struct promote_op *promote; int ret; - ret = should_promote(c, k, pos, opts, flags, failed); + ret = should_promote(c, k, pos, orig->opts, flags, failed); if (ret) goto nopromote; - promote = __promote_alloc(trans, - k.k->type == KEY_TYPE_reflink_v - ? BTREE_ID_reflink - : BTREE_ID_extents, - k, pos, pick, opts, sectors, rbio, failed); + struct bch_read_bio *promote = + __promote_alloc(trans, + k.k->type == KEY_TYPE_reflink_v + ? BTREE_ID_reflink + : BTREE_ID_extents, + k, pos, pick, sectors, orig, failed); + if (!promote) + return NULL; + ret = PTR_ERR_OR_ZERO(promote); if (ret) goto nopromote; @@ -321,18 +346,38 @@ static struct promote_op *promote_alloc(struct btree_trans *trans, *read_full = promote_full; return promote; nopromote: - trace_read_nopromote(c, ret); + trace_io_read_nopromote(c, ret); return NULL; } +void bch2_promote_op_to_text(struct printbuf *out, struct promote_op *op) +{ + if (!op->write.read_done) { + prt_printf(out, "parent read: %px\n", op->write.rbio.parent); + printbuf_indent_add(out, 2); + bch2_read_bio_to_text(out, op->write.rbio.parent); + printbuf_indent_sub(out, 2); + } + + bch2_data_update_to_text(out, &op->write); +} + /* Read */ static int bch2_read_err_msg_trans(struct btree_trans *trans, struct printbuf *out, struct bch_read_bio *rbio, struct bpos read_pos) { - return bch2_inum_offset_err_msg_trans(trans, out, - (subvol_inum) { rbio->subvol, read_pos.inode }, - read_pos.offset << 9); + int ret = lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, out, + (subvol_inum) { rbio->subvol, read_pos.inode }, + read_pos.offset << 9)); + if (ret) + return ret; + + if (rbio->data_update) + prt_str(out, "(internal move) "); + + return 0; } static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, @@ -341,10 +386,6 @@ static void bch2_read_err_msg(struct bch_fs *c, struct printbuf *out, bch2_trans_run(c, bch2_read_err_msg_trans(trans, out, rbio, read_pos)); } -#define READ_RETRY_AVOID 1 -#define READ_RETRY 2 -#define READ_ERR 3 - enum rbio_context { RBIO_CONTEXT_NULL, RBIO_CONTEXT_HIGHPRI, @@ -375,20 +416,27 @@ static inline struct bch_read_bio *bch2_rbio_free(struct bch_read_bio *rbio) { BUG_ON(rbio->bounce && !rbio->split); - if 
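
The err_remove_list / err_remove_hash / err / err_put labels above follow the standard kernel unwind ladder: each setup step that can fail jumps to the label that releases everything acquired so far, in reverse order, with the labels falling through into one another. A compact standalone illustration with hypothetical acquire/release pairs:

    #include <stdlib.h>

    static void *acquire(int fail) { return fail ? NULL : malloc(1); }

    static int setup(void)
    {
        void *a = acquire(0);
        if (!a)
            goto err;

        void *b = acquire(0);
        if (!b)
            goto err_free_a;

        void *c = acquire(1);       /* simulate the last step failing */
        if (!c)
            goto err_free_b;        /* releases b, falls through to a */

        return 0;                   /* success: a, b, c stay live */
    err_free_b:
        free(b);
    err_free_a:
        free(a);
    err:
        return -1;
    }

    int main(void) { return setup() == -1 ? 0 : 1; }
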
(rbio->promote) - promote_free(rbio->c, rbio->promote); - rbio->promote = NULL; - - if (rbio->bounce) - bch2_bio_free_pages_pool(rbio->c, &rbio->bio); + if (rbio->have_ioref) { + struct bch_dev *ca = bch2_dev_have_ref(rbio->c, rbio->pick.ptr.dev); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read); + } if (rbio->split) { struct bch_read_bio *parent = rbio->parent; - if (rbio->kmalloc) - kfree(rbio); - else + if (unlikely(rbio->promote)) { + if (!rbio->bio.bi_status) + promote_start(rbio); + else + promote_free(rbio); + } else { + async_object_list_del(rbio->c, rbio, rbio->list_idx); + + if (rbio->bounce) + bch2_bio_free_pages_pool(rbio->c, &rbio->bio); + bio_put(&rbio->bio); + } rbio = parent; } @@ -408,61 +456,118 @@ static void bch2_rbio_done(struct bch_read_bio *rbio) bio_endio(&rbio->bio); } -static void bch2_read_retry_nodecode(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, - struct bch_io_failures *failed, - unsigned flags) +static void get_rbio_extent(struct btree_trans *trans, + struct bch_read_bio *rbio, + struct bkey_buf *sk) { - struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; - struct bkey_buf sk; struct bkey_s_c k; - int ret; + int ret = lockrestart_do(trans, + bkey_err(k = bch2_bkey_get_iter(trans, &iter, + rbio->data_btree, rbio->data_pos, 0))); + if (ret) + return; + + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + bkey_for_each_ptr(ptrs, ptr) + if (bch2_extent_ptr_eq(*ptr, rbio->pick.ptr)) { + bch2_bkey_buf_reassemble(sk, trans->c, k); + break; + } - flags &= ~BCH_READ_LAST_FRAGMENT; - flags |= BCH_READ_MUST_CLONE; + bch2_trans_iter_exit(trans, &iter); +} - bch2_bkey_buf_init(&sk); +static noinline int maybe_poison_extent(struct btree_trans *trans, struct bch_read_bio *rbio, + enum btree_id btree, struct bkey_s_c read_k) +{ + if (!bch2_poison_extents_on_checksum_error) + return 0; + + struct bch_fs *c = trans->c; + + struct data_update *u = rbio_data_update(rbio); + if (u) + read_k = bkey_i_to_s_c(u->k.k); + + u64 flags = bch2_bkey_extent_flags(read_k); + if (flags & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) + return 0; + + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, bkey_start_pos(read_k.k), + BTREE_ITER_intent); + int ret = bkey_err(k); + if (ret) + return ret; + + if (!bkey_and_val_eq(k, read_k)) + goto out; + + struct bkey_i *new = bch2_trans_kmalloc(trans, + bkey_bytes(k.k) + sizeof(struct bch_extent_flags)); + ret = PTR_ERR_OR_ZERO(new) ?: + (bkey_reassemble(new, k), 0) ?: + bch2_bkey_extent_flags_set(c, new, flags|BIT_ULL(BCH_EXTENT_FLAG_poisoned)) ?: + bch2_trans_update(trans, &iter, new, BTREE_UPDATE_internal_snapshot_node) ?: + bch2_trans_commit(trans, NULL, NULL, 0); + + /* + * Propagate key change back to data update path, in particular so it + * knows the extent has been poisoned and it's safe to change the + * checksum + */ + if (u && !ret) + bch2_bkey_buf_copy(&u->k, c, new); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} - bch2_trans_iter_init(trans, &iter, rbio->data_btree, - rbio->read_pos, BTREE_ITER_slots); +static noinline int bch2_read_retry_nodecode(struct btree_trans *trans, + struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, + struct bch_io_failures *failed, + unsigned flags) +{ + struct data_update *u = container_of(rbio, struct data_update, rbio); retry: bch2_trans_begin(trans); - rbio->bio.bi_status = 0; - ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + struct btree_iter iter; + struct bkey_s_c 
k; + int ret = lockrestart_do(trans, + bkey_err(k = bch2_bkey_get_iter(trans, &iter, + u->btree_id, bkey_start_pos(&u->k.k->k), + 0))); if (ret) goto err; - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); - - if (!bch2_bkey_matches_ptr(c, k, - rbio->pick.ptr, - rbio->data_pos.offset - - rbio->pick.crc.offset)) { + if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) { /* extent we wanted to read no longer exists: */ - rbio->hole = true; - goto out; + rbio->ret = -BCH_ERR_data_read_key_overwritten; + goto err; } ret = __bch2_read_extent(trans, rbio, bvec_iter, - rbio->read_pos, - rbio->data_btree, - k, 0, failed, flags); - if (ret == READ_RETRY) - goto retry; - if (ret) - goto err; -out: - bch2_rbio_done(rbio); - bch2_trans_iter_exit(trans, &iter); - bch2_trans_put(trans); - bch2_bkey_buf_exit(&sk, c); - return; + bkey_start_pos(&u->k.k->k), + u->btree_id, + bkey_i_to_s_c(u->k.k), + 0, failed, flags, -1); err: - rbio->bio.bi_status = BLK_STS_IOERR; - goto out; + bch2_trans_iter_exit(trans, &iter); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart) || + bch2_err_matches(ret, BCH_ERR_data_read_retry)) + goto retry; + + if (ret) { + rbio->bio.bi_status = BLK_STS_IOERR; + rbio->ret = ret; + } + + BUG_ON(atomic_read(&rbio->bio.__bi_remaining) != 1); + return ret; } static void bch2_rbio_retry(struct work_struct *work) @@ -478,68 +583,105 @@ static void bch2_rbio_retry(struct work_struct *work) }; struct bch_io_failures failed = { .nr = 0 }; - trace_and_count(c, read_retry, &rbio->bio); + struct btree_trans *trans = bch2_trans_get(c); - if (rbio->retry == READ_RETRY_AVOID) - bch2_mark_io_failure(&failed, &rbio->pick); + struct bkey_buf sk; + bch2_bkey_buf_init(&sk); + bkey_init(&sk.k->k); + + trace_io_read_retry(&rbio->bio); + this_cpu_add(c->counters[BCH_COUNTER_io_read_retry], + bvec_iter_sectors(rbio->bvec_iter)); + + get_rbio_extent(trans, rbio, &sk); + + if (!bkey_deleted(&sk.k->k) && + bch2_err_matches(rbio->ret, BCH_ERR_data_read_retry_avoid)) + bch2_mark_io_failure(&failed, &rbio->pick, + rbio->ret == -BCH_ERR_data_read_retry_csum_err); + + if (!rbio->split) { + rbio->bio.bi_status = 0; + rbio->ret = 0; + } - rbio->bio.bi_status = 0; + unsigned subvol = rbio->subvol; + struct bpos read_pos = rbio->read_pos; rbio = bch2_rbio_free(rbio); - flags |= BCH_READ_IN_RETRY; - flags &= ~BCH_READ_MAY_PROMOTE; + flags |= BCH_READ_in_retry; + flags &= ~BCH_READ_may_promote; + flags &= ~BCH_READ_last_fragment; + flags |= BCH_READ_must_clone; - if (flags & BCH_READ_NODECODE) { - bch2_read_retry_nodecode(c, rbio, iter, &failed, flags); - } else { - flags &= ~BCH_READ_LAST_FRAGMENT; - flags |= BCH_READ_MUST_CLONE; + int ret = rbio->data_update + ? 
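
Several call sites in these hunks wrap btree operations in lockrestart_do(trans, expr). The real macro is not reproduced here; roughly, it begins the transaction and re-evaluates the expression for as long as the result is a transaction-restart error. A hedged standalone sketch of that retry shape, using stand-in names and error codes:

    /* stand-in; bcachefs uses BCH_ERR_transaction_restart via bch2_err_matches() */
    #define ERR_TRANSACTION_RESTART (-100)

    struct trans { int restarts_left; };

    static void trans_begin(struct trans *t) { (void)t; /* drop locks, reset iters */ }

    static int do_op(struct trans *t)
    {
        return t->restarts_left-- > 0 ? ERR_TRANSACTION_RESTART : 0;
    }

    /* simplified sketch of the loop the macro expands to */
    static int lockrestart_do_sketch(struct trans *t, int (*op)(struct trans *))
    {
        int ret;
        do {
            trans_begin(t);
            ret = op(t);
        } while (ret == ERR_TRANSACTION_RESTART);
        return ret;
    }

    int main(void)
    {
        struct trans t = { .restarts_left = 2 };
        return lockrestart_do_sketch(&t, do_op);  /* 0 after two retries */
    }
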
bch2_read_retry_nodecode(trans, rbio, iter, &failed, flags) + : __bch2_read(trans, rbio, iter, inum, &failed, &sk, flags); - __bch2_read(c, rbio, iter, inum, &failed, flags); + if (ret) { + rbio->ret = ret; + rbio->bio.bi_status = BLK_STS_IOERR; } -} -static void bch2_rbio_error(struct bch_read_bio *rbio, int retry, - blk_status_t error) -{ - rbio->retry = retry; + if (failed.nr || ret) { + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); - if (rbio->flags & BCH_READ_IN_RETRY) - return; + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, &buf, + (subvol_inum) { subvol, read_pos.inode }, + read_pos.offset << 9)); + if (rbio->data_update) + prt_str(&buf, "(internal move) "); - if (retry == READ_ERR) { - rbio = bch2_rbio_free(rbio); + prt_str(&buf, "data read error, "); + if (!ret) + prt_str(&buf, "successful retry"); + else + prt_str(&buf, bch2_err_str(ret)); + prt_newline(&buf); - rbio->bio.bi_status = error; - bch2_rbio_done(rbio); - } else { - bch2_rbio_punt(rbio, bch2_rbio_retry, - RBIO_CONTEXT_UNBOUND, system_unbound_wq); + if (!bkey_deleted(&sk.k->k)) { + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(sk.k)); + prt_newline(&buf); + } + + bch2_io_failures_to_text(&buf, c, &failed); + + bch2_print_str_ratelimited(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); } + + bch2_rbio_done(rbio); + bch2_bkey_buf_exit(&sk, c); + bch2_trans_put(trans); } -static void bch2_read_io_err(struct work_struct *work) +static void bch2_rbio_error(struct bch_read_bio *rbio, + int ret, blk_status_t blk_error) { - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bio *bio = &rbio->bio; - struct bch_fs *c = rbio->c; - struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - struct printbuf buf = PRINTBUF; + BUG_ON(ret >= 0); - bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); - prt_printf(&buf, "data read error: %s", bch2_blk_status_to_str(bio->bi_status)); + rbio->ret = ret; + rbio->bio.bi_status = blk_error; - if (ca) { - bch2_io_error(ca, BCH_MEMBER_ERROR_read); - bch_err_ratelimited(ca, "%s", buf.buf); + bch2_rbio_parent(rbio)->saw_error = true; + + if (rbio->flags & BCH_READ_in_retry) + return; + + if (bch2_err_matches(ret, BCH_ERR_data_read_retry)) { + bch2_rbio_punt(rbio, bch2_rbio_retry, + RBIO_CONTEXT_UNBOUND, system_unbound_wq); } else { - bch_err_ratelimited(c, "%s", buf.buf); - } + rbio = bch2_rbio_free(rbio); - printbuf_exit(&buf); - bch2_rbio_error(rbio, READ_RETRY_AVOID, bio->bi_status); + rbio->ret = ret; + rbio->bio.bi_status = blk_error; + + bch2_rbio_done(rbio); + } } static int __bch2_rbio_narrow_crcs(struct btree_trans *trans, @@ -605,33 +747,6 @@ static noinline void bch2_rbio_narrow_crcs(struct bch_read_bio *rbio) __bch2_rbio_narrow_crcs(trans, rbio)); } -static void bch2_read_csum_err(struct work_struct *work) -{ - struct bch_read_bio *rbio = - container_of(work, struct bch_read_bio, work); - struct bch_fs *c = rbio->c; - struct bio *src = &rbio->bio; - struct bch_extent_crc_unpacked crc = rbio->pick.crc; - struct nonce nonce = extent_nonce(rbio->version, crc); - struct bch_csum csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); - struct printbuf buf = PRINTBUF; - - bch2_read_err_msg(c, &buf, rbio, rbio->read_pos); - prt_str(&buf, "data "); - bch2_csum_err_msg(&buf, crc.csum_type, rbio->pick.crc.csum, csum); - - struct bch_dev *ca = rbio->have_ioref ? 
bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; - if (ca) { - bch2_io_error(ca, BCH_MEMBER_ERROR_checksum); - bch_err_ratelimited(ca, "%s", buf.buf); - } else { - bch_err_ratelimited(c, "%s", buf.buf); - } - - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); - printbuf_exit(&buf); -} - static void bch2_read_decompress_err(struct work_struct *work) { struct bch_read_bio *rbio = @@ -648,7 +763,7 @@ static void bch2_read_decompress_err(struct work_struct *work) else bch_err_ratelimited(c, "%s", buf.buf); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + bch2_rbio_error(rbio, -BCH_ERR_data_read_decompress_err, BLK_STS_IOERR); printbuf_exit(&buf); } @@ -668,7 +783,7 @@ static void bch2_read_decrypt_err(struct work_struct *work) else bch_err_ratelimited(c, "%s", buf.buf); - bch2_rbio_error(rbio, READ_ERR, BLK_STS_IOERR); + bch2_rbio_error(rbio, -BCH_ERR_data_read_decrypt_err, BLK_STS_IOERR); printbuf_exit(&buf); } @@ -678,9 +793,11 @@ static void __bch2_read_endio(struct work_struct *work) struct bch_read_bio *rbio = container_of(work, struct bch_read_bio, work); struct bch_fs *c = rbio->c; - struct bio *src = &rbio->bio; - struct bio *dst = &bch2_rbio_parent(rbio)->bio; - struct bvec_iter dst_iter = rbio->bvec_iter; + struct bch_dev *ca = rbio->have_ioref ? bch2_dev_have_ref(c, rbio->pick.ptr.dev) : NULL; + struct bch_read_bio *parent = bch2_rbio_parent(rbio); + struct bio *src = &rbio->bio; + struct bio *dst = &parent->bio; + struct bvec_iter dst_iter = rbio->bvec_iter; struct bch_extent_crc_unpacked crc = rbio->pick.crc; struct nonce nonce = extent_nonce(rbio->version, crc); unsigned nofs_flags; @@ -698,8 +815,26 @@ static void __bch2_read_endio(struct work_struct *work) src->bi_iter = rbio->bvec_iter; } + bch2_maybe_corrupt_bio(src, bch2_read_corrupt_ratio); + csum = bch2_checksum_bio(c, crc.csum_type, nonce, src); - if (bch2_crc_cmp(csum, rbio->pick.crc.csum) && !c->opts.no_data_io) + bool csum_good = !bch2_crc_cmp(csum, rbio->pick.crc.csum) || c->opts.no_data_io; + + /* + * Checksum error: if the bio wasn't bounced, we may have been + * reading into buffers owned by userspace (that userspace can + * scribble over) - retry the read, bouncing it this time: + */ + if (!csum_good && !rbio->bounce && (rbio->flags & BCH_READ_user_mapped)) { + rbio->flags |= BCH_READ_must_bounce; + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err_maybe_userspace, + BLK_STS_IOERR); + goto out; + } + + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); + + if (!csum_good) goto csum_err; /* @@ -712,32 +847,40 @@ static void __bch2_read_endio(struct work_struct *work) if (unlikely(rbio->narrow_crcs)) bch2_rbio_narrow_crcs(rbio); - if (rbio->flags & BCH_READ_NODECODE) - goto nodecode; + if (likely(!parent->data_update)) { + /* Adjust crc to point to subset of data we want: */ + crc.offset += rbio->offset_into_extent; + crc.live_size = bvec_iter_sectors(rbio->bvec_iter); - /* Adjust crc to point to subset of data we want: */ - crc.offset += rbio->offset_into_extent; - crc.live_size = bvec_iter_sectors(rbio->bvec_iter); + if (crc_is_compressed(crc)) { + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; - if (crc_is_compressed(crc)) { - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; + if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && + !c->opts.no_data_io) + goto decompression_err; + } else { + /* don't need to decrypt the entire bio: */ + nonce = nonce_add(nonce, crc.offset << 9); + bio_advance(src, 
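
The relocated comment above ("reading into buffers owned by userspace (that userspace can scribble over)") is why a checksum failure on a non-bounced read now retries with BCH_READ_must_bounce rather than erroring out: with user-mapped destination pages, a concurrent writer can modify the buffer between the device transfer and the checksum pass, producing a false mismatch. A standalone sketch of that decision, with simplified flag names:

    #include <assert.h>
    #include <stdbool.h>

    enum {
        READ_user_mapped = 1 << 0,
        READ_must_bounce = 1 << 1,
    };

    /*
     * A mismatch on an un-bounced, user-mapped read is ambiguous: device
     * corruption, or userspace racing with the read. Retry into a private
     * bounce buffer; only a mismatch there is a real error.
     */
    static bool retry_bounced(bool csum_good, bool bounced, unsigned *flags)
    {
        if (csum_good || bounced || !(*flags & READ_user_mapped))
            return false;

        *flags |= READ_must_bounce;
        return true;
    }

    int main(void)
    {
        unsigned flags = READ_user_mapped;
        assert(retry_bounced(false, false, &flags));  /* first failure: retry */
        assert(!retry_bounced(false, true, &flags));  /* bounced failure: error */
        return 0;
    }
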
crc.offset << 9); - if (bch2_bio_uncompress(c, src, dst, dst_iter, crc) && - !c->opts.no_data_io) - goto decompression_err; - } else { - /* don't need to decrypt the entire bio: */ - nonce = nonce_add(nonce, crc.offset << 9); - bio_advance(src, crc.offset << 9); + BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); + src->bi_iter.bi_size = dst_iter.bi_size; - BUG_ON(src->bi_iter.bi_size < dst_iter.bi_size); - src->bi_iter.bi_size = dst_iter.bi_size; + ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); + if (ret) + goto decrypt_err; - ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); - if (ret) - goto decrypt_err; + if (rbio->bounce) { + struct bvec_iter src_iter = src->bi_iter; + + bio_copy_data_iter(dst, &dst_iter, src, &src_iter); + } + } + } else { + if (rbio->split) + rbio->parent->pick = rbio->pick; if (rbio->bounce) { struct bvec_iter src_iter = src->bi_iter; @@ -754,12 +897,9 @@ static void __bch2_read_endio(struct work_struct *work) ret = bch2_encrypt_bio(c, crc.csum_type, nonce, src); if (ret) goto decrypt_err; - - promote_start(rbio->promote, rbio); - rbio->promote = NULL; } -nodecode: - if (likely(!(rbio->flags & BCH_READ_IN_RETRY))) { + + if (likely(!(rbio->flags & BCH_READ_in_retry))) { rbio = bch2_rbio_free(rbio); bch2_rbio_done(rbio); } @@ -767,18 +907,7 @@ out: memalloc_nofs_restore(nofs_flags); return; csum_err: - /* - * Checksum error: if the bio wasn't bounced, we may have been - * reading into buffers owned by userspace (that userspace can - * scribble over) - retry the read, bouncing it this time: - */ - if (!rbio->bounce && (rbio->flags & BCH_READ_USER_MAPPED)) { - rbio->flags |= BCH_READ_MUST_BOUNCE; - bch2_rbio_error(rbio, READ_RETRY, BLK_STS_IOERR); - goto out; - } - - bch2_rbio_punt(rbio, bch2_read_csum_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_csum_err, BLK_STS_IOERR); goto out; decompression_err: bch2_rbio_punt(rbio, bch2_read_decompress_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); @@ -797,27 +926,25 @@ static void bch2_read_endio(struct bio *bio) struct workqueue_struct *wq = NULL; enum rbio_context context = RBIO_CONTEXT_NULL; - if (rbio->have_ioref) { - bch2_latency_acct(ca, rbio->submit_time, READ); - percpu_ref_put(&ca->io_ref); - } + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, + rbio->submit_time, !bio->bi_status); if (!rbio->split) rbio->bio.bi_end_io = rbio->end_io; if (unlikely(bio->bi_status)) { - bch2_rbio_punt(rbio, bch2_read_io_err, RBIO_CONTEXT_UNBOUND, system_unbound_wq); + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_io_err, bio->bi_status); return; } - if (((rbio->flags & BCH_READ_RETRY_IF_STALE) && race_fault()) || + if (((rbio->flags & BCH_READ_retry_if_stale) && race_fault()) || (ca && dev_ptr_stale(ca, &rbio->pick.ptr))) { - trace_and_count(c, read_reuse_race, &rbio->bio); + trace_and_count(c, io_read_reuse_race, &rbio->bio); - if (rbio->flags & BCH_READ_RETRY_IF_STALE) - bch2_rbio_error(rbio, READ_RETRY, BLK_STS_AGAIN); + if (rbio->flags & BCH_READ_retry_if_stale) + bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_retry, BLK_STS_AGAIN); else - bch2_rbio_error(rbio, READ_ERR, BLK_STS_AGAIN); + bch2_rbio_error(rbio, -BCH_ERR_data_read_ptr_stale_race, BLK_STS_AGAIN); return; } @@ -856,7 +983,7 @@ static noinline void read_from_stale_dirty_pointer(struct btree_trans *trans, prt_printf(&buf, "memory gen: %u", gen); - ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(&iter))); + ret = lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_slot(trans, 
&iter))); if (!ret) { prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, k); @@ -883,15 +1010,15 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, struct bvec_iter iter, struct bpos read_pos, enum btree_id data_btree, struct bkey_s_c k, unsigned offset_into_extent, - struct bch_io_failures *failed, unsigned flags) + struct bch_io_failures *failed, unsigned flags, int dev) { struct bch_fs *c = trans->c; struct extent_ptr_decoded pick; struct bch_read_bio *rbio = NULL; - struct promote_op *promote = NULL; bool bounce = false, read_full = false, narrow_crcs = false; struct bpos data_pos = bkey_start_pos(k.k); - int pick_ret; + struct data_update *u = rbio_data_update(orig); + int ret = 0; if (bkey_extent_is_inline_data(k.k)) { unsigned bytes = min_t(unsigned, iter.bi_size, @@ -902,19 +1029,35 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, swap(iter.bi_size, bytes); bio_advance_iter(&orig->bio, &iter, bytes); zero_fill_bio_iter(&orig->bio, iter); + this_cpu_add(c->counters[BCH_COUNTER_io_read_inline], + bvec_iter_sectors(iter)); goto out_read_done; } + + if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) && + !orig->data_update) + return -BCH_ERR_extent_poisoned; retry_pick: - pick_ret = bch2_bkey_pick_read_device(c, k, failed, &pick); + ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); /* hole or reservation - just zero fill: */ - if (!pick_ret) + if (!ret) goto hole; - if (unlikely(pick_ret < 0)) { + if (unlikely(ret < 0)) { + if (ret == -BCH_ERR_data_read_csum_err) { + int ret2 = maybe_poison_extent(trans, orig, data_btree, k); + if (ret2) { + ret = ret2; + goto err; + } + + trace_and_count(c, io_read_fail_and_poison, &orig->bio); + } + struct printbuf buf = PRINTBUF; bch2_read_err_msg_trans(trans, &buf, orig, read_pos); - prt_printf(&buf, "no device to read from: %s\n ", bch2_err_str(pick_ret)); + prt_printf(&buf, "%s\n ", bch2_err_str(ret)); bch2_bkey_val_to_text(&buf, c, k); bch_err_ratelimited(c, "%s", buf.buf); @@ -922,7 +1065,8 @@ retry_pick: goto err; } - if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && !c->chacha20) { + if (unlikely(bch2_csum_type_is_encryption(pick.crc.csum_type)) && + !c->chacha20_key_set) { struct printbuf buf = PRINTBUF; bch2_read_err_msg_trans(trans, &buf, orig, read_pos); prt_printf(&buf, "attempting to read encrypted data without encryption key\n "); @@ -930,10 +1074,12 @@ retry_pick: bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); + ret = -BCH_ERR_data_read_no_encryption_key; goto err; } - struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ); + struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, + BCH_DEV_READ_REF_io_read); /* * Stale dirty pointers are treated as IO errors, but @failed isn't @@ -941,62 +1087,58 @@ retry_pick: * retry path, don't check here, it'll be caught in bch2_read_endio() * and we'll end up in the retry path: */ - if ((flags & BCH_READ_IN_RETRY) && + if ((flags & BCH_READ_in_retry) && !pick.ptr.cached && ca && unlikely(dev_ptr_stale(ca, &pick.ptr))) { read_from_stale_dirty_pointer(trans, ca, k, pick.ptr); - bch2_mark_io_failure(failed, &pick); - percpu_ref_put(&ca->io_ref); + bch2_mark_io_failure(failed, &pick, false); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read); goto retry_pick; } - /* - * Unlock the iterator while the btree node's lock is still in - * cache, before doing the IO: - */ - bch2_trans_unlock(trans); + if (likely(!u)) { + if (!(flags & BCH_READ_last_fragment) || + 
bio_flagged(&orig->bio, BIO_CHAIN)) + flags |= BCH_READ_must_clone; + + narrow_crcs = !(flags & BCH_READ_in_retry) && + bch2_can_narrow_extent_crcs(k, pick.crc); + + if (narrow_crcs && (flags & BCH_READ_user_mapped)) + flags |= BCH_READ_must_bounce; - if (flags & BCH_READ_NODECODE) { + EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); + + if (crc_is_compressed(pick.crc) || + (pick.crc.csum_type != BCH_CSUM_none && + (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || + (bch2_csum_type_is_encryption(pick.crc.csum_type) && + (flags & BCH_READ_user_mapped)) || + (flags & BCH_READ_must_bounce)))) { + read_full = true; + bounce = true; + } + } else { /* * can happen if we retry, and the extent we were going to read * has been merged in the meantime: */ - if (pick.crc.compressed_size > orig->bio.bi_vcnt * PAGE_SECTORS) { + if (pick.crc.compressed_size > u->op.wbio.bio.bi_iter.bi_size) { if (ca) - percpu_ref_put(&ca->io_ref); - goto hole; + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_io_read); + rbio->ret = -BCH_ERR_data_read_buffer_too_small; + goto out_read_done; } iter.bi_size = pick.crc.compressed_size << 9; - goto get_bio; - } - - if (!(flags & BCH_READ_LAST_FRAGMENT) || - bio_flagged(&orig->bio, BIO_CHAIN)) - flags |= BCH_READ_MUST_CLONE; - - narrow_crcs = !(flags & BCH_READ_IN_RETRY) && - bch2_can_narrow_extent_crcs(k, pick.crc); - - if (narrow_crcs && (flags & BCH_READ_USER_MAPPED)) - flags |= BCH_READ_MUST_BOUNCE; - - EBUG_ON(offset_into_extent + bvec_iter_sectors(iter) > k.k->size); - - if (crc_is_compressed(pick.crc) || - (pick.crc.csum_type != BCH_CSUM_none && - (bvec_iter_sectors(iter) != pick.crc.uncompressed_size || - (bch2_csum_type_is_encryption(pick.crc.csum_type) && - (flags & BCH_READ_USER_MAPPED)) || - (flags & BCH_READ_MUST_BOUNCE)))) { read_full = true; - bounce = true; } if (orig->opts.promote_target || have_io_error(failed)) - promote = promote_alloc(trans, iter, k, &pick, orig->opts, flags, - &rbio, &bounce, &read_full, failed); + rbio = promote_alloc(trans, iter, k, &pick, flags, orig, + &bounce, &read_full, failed); if (!read_full) { EBUG_ON(crc_is_compressed(pick.crc)); @@ -1015,7 +1157,7 @@ retry_pick: pick.crc.offset = 0; pick.crc.live_size = bvec_iter_sectors(iter); } -get_bio: + if (rbio) { /* * promote already allocated bounce rbio: @@ -1030,17 +1172,16 @@ get_bio: } else if (bounce) { unsigned sectors = pick.crc.compressed_size; - rbio = rbio_init(bio_alloc_bioset(NULL, + rbio = rbio_init_fragment(bio_alloc_bioset(NULL, DIV_ROUND_UP(sectors, PAGE_SECTORS), 0, GFP_NOFS, &c->bio_read_split), - orig->opts); + orig); bch2_bio_alloc_pages_pool(c, &rbio->bio, sectors << 9); rbio->bounce = true; - rbio->split = true; - } else if (flags & BCH_READ_MUST_CLONE) { + } else if (flags & BCH_READ_must_clone) { /* * Have to clone if there were any splits, due to error * reporting issues (if a split errored, and retrying didn't @@ -1049,11 +1190,10 @@ get_bio: * from the whole bio, in which case we don't want to retry and * lose the error) */ - rbio = rbio_init(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, + rbio = rbio_init_fragment(bio_alloc_clone(NULL, &orig->bio, GFP_NOFS, &c->bio_read_split), - orig->opts); + orig); rbio->bio.bi_iter = iter; - rbio->split = true; } else { rbio = orig; rbio->bio.bi_iter = iter; @@ -1062,68 +1202,66 @@ get_bio: EBUG_ON(bio_sectors(&rbio->bio) != pick.crc.compressed_size); - rbio->c = c; rbio->submit_time = local_clock(); - if (rbio->split) - rbio->parent = orig; - else + if (!rbio->split) rbio->end_io = 
orig->bio.bi_end_io; rbio->bvec_iter = iter; rbio->offset_into_extent= offset_into_extent; rbio->flags = flags; rbio->have_ioref = ca != NULL; rbio->narrow_crcs = narrow_crcs; - rbio->hole = 0; - rbio->retry = 0; + rbio->ret = 0; rbio->context = 0; - /* XXX: only initialize this if needed */ - rbio->devs_have = bch2_bkey_devs(k); rbio->pick = pick; rbio->subvol = orig->subvol; rbio->read_pos = read_pos; rbio->data_btree = data_btree; rbio->data_pos = data_pos; rbio->version = k.k->bversion; - rbio->promote = promote; INIT_WORK(&rbio->work, NULL); - if (flags & BCH_READ_NODECODE) - orig->pick = pick; - rbio->bio.bi_opf = orig->bio.bi_opf; rbio->bio.bi_iter.bi_sector = pick.ptr.offset; rbio->bio.bi_end_io = bch2_read_endio; + async_object_list_add(c, rbio, rbio, &rbio->list_idx); + if (rbio->bounce) - trace_and_count(c, read_bounce, &rbio->bio); + trace_and_count(c, io_read_bounce, &rbio->bio); - this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); + if (!u) + this_cpu_add(c->counters[BCH_COUNTER_io_read], bio_sectors(&rbio->bio)); + else + this_cpu_add(c->counters[BCH_COUNTER_io_move_read], bio_sectors(&rbio->bio)); bch2_increment_clock(c, bio_sectors(&rbio->bio), READ); /* * If it's being moved internally, we don't want to flag it as a cache * hit: */ - if (ca && pick.ptr.cached && !(flags & BCH_READ_NODECODE)) + if (ca && pick.ptr.cached && !u) bch2_bucket_io_time_reset(trans, pick.ptr.dev, PTR_BUCKET_NR(ca, &pick.ptr), READ); - if (!(flags & (BCH_READ_IN_RETRY|BCH_READ_LAST_FRAGMENT))) { + if (!(flags & (BCH_READ_in_retry|BCH_READ_last_fragment))) { bio_inc_remaining(&orig->bio); - trace_and_count(c, read_split, &orig->bio); + trace_and_count(c, io_read_split, &orig->bio); } - if (!rbio->pick.idx) { - if (unlikely(!rbio->have_ioref)) { - struct printbuf buf = PRINTBUF; - bch2_read_err_msg_trans(trans, &buf, rbio, read_pos); - prt_printf(&buf, "no device to read from:\n "); - bch2_bkey_val_to_text(&buf, c, k); - - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); + /* + * Unlock the iterator while the btree node's lock is still in + * cache, before doing the IO: + */ + if (!(flags & BCH_READ_in_retry)) + bch2_trans_unlock(trans); + else + bch2_trans_unlock_long(trans); - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + if (likely(!rbio->pick.do_ec_reconstruct)) { + if (unlikely(!rbio->have_ioref)) { + bch2_rbio_error(rbio, + -BCH_ERR_data_read_retry_device_offline, + BLK_STS_IOERR); goto out; } @@ -1132,10 +1270,10 @@ get_bio: bio_set_dev(&rbio->bio, ca->disk_sb.bdev); if (unlikely(c->opts.no_data_io)) { - if (likely(!(flags & BCH_READ_IN_RETRY))) + if (likely(!(flags & BCH_READ_in_retry))) bio_endio(&rbio->bio); } else { - if (likely(!(flags & BCH_READ_IN_RETRY))) + if (likely(!(flags & BCH_READ_in_retry))) submit_bio(&rbio->bio); else submit_bio_wait(&rbio->bio); @@ -1149,70 +1287,76 @@ get_bio: } else { /* Attempting reconstruct read: */ if (bch2_ec_read_extent(trans, rbio, k)) { - bch2_rbio_error(rbio, READ_RETRY_AVOID, BLK_STS_IOERR); + bch2_rbio_error(rbio, -BCH_ERR_data_read_retry_ec_reconstruct_err, + BLK_STS_IOERR); goto out; } - if (likely(!(flags & BCH_READ_IN_RETRY))) + if (likely(!(flags & BCH_READ_in_retry))) bio_endio(&rbio->bio); } out: - if (likely(!(flags & BCH_READ_IN_RETRY))) { + if (likely(!(flags & BCH_READ_in_retry))) { return 0; } else { + bch2_trans_unlock(trans); + int ret; rbio->context = RBIO_CONTEXT_UNBOUND; bch2_read_endio(&rbio->bio); - ret = rbio->retry; + ret = rbio->ret; rbio = bch2_rbio_free(rbio); - if (ret == 
READ_RETRY_AVOID) { - bch2_mark_io_failure(failed, &pick); - ret = READ_RETRY; - } - - if (!ret) - goto out_read_done; + if (bch2_err_matches(ret, BCH_ERR_data_read_retry_avoid)) + bch2_mark_io_failure(failed, &pick, + ret == -BCH_ERR_data_read_retry_csum_err); return ret; } err: - if (flags & BCH_READ_IN_RETRY) - return READ_ERR; + if (flags & BCH_READ_in_retry) + return ret; - orig->bio.bi_status = BLK_STS_IOERR; + orig->bio.bi_status = BLK_STS_IOERR; + orig->ret = ret; goto out_read_done; hole: + this_cpu_add(c->counters[BCH_COUNTER_io_read_hole], + bvec_iter_sectors(iter)); /* - * won't normally happen in the BCH_READ_NODECODE - * (bch2_move_extent()) path, but if we retry and the extent we wanted - * to read no longer exists we have to signal that: + * won't normally happen in the data update (bch2_move_extent()) path, + * but if we retry and the extent we wanted to read no longer exists we + * have to signal that: */ - if (flags & BCH_READ_NODECODE) - orig->hole = true; + if (u) + orig->ret = -BCH_ERR_data_read_key_overwritten; zero_fill_bio_iter(&orig->bio, iter); out_read_done: - if (flags & BCH_READ_LAST_FRAGMENT) + if ((flags & BCH_READ_last_fragment) && + !(flags & BCH_READ_in_retry)) bch2_rbio_done(orig); return 0; } -void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, - struct bvec_iter bvec_iter, subvol_inum inum, - struct bch_io_failures *failed, unsigned flags) +int __bch2_read(struct btree_trans *trans, struct bch_read_bio *rbio, + struct bvec_iter bvec_iter, subvol_inum inum, + struct bch_io_failures *failed, + struct bkey_buf *prev_read, + unsigned flags) { - struct btree_trans *trans = bch2_trans_get(c); + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_buf sk; struct bkey_s_c k; + enum btree_id data_btree; int ret; - BUG_ON(flags & BCH_READ_NODECODE); + EBUG_ON(rbio->data_update); bch2_bkey_buf_init(&sk); bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, @@ -1220,7 +1364,7 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, BTREE_ITER_slots); while (1) { - enum btree_id data_btree = BTREE_ID_extents; + data_btree = BTREE_ID_extents; bch2_trans_begin(trans); @@ -1229,12 +1373,12 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, if (ret) goto err; - bch2_btree_iter_set_snapshot(&iter, snapshot); + bch2_btree_iter_set_snapshot(trans, &iter, snapshot); - bch2_btree_iter_set_pos(&iter, + bch2_btree_iter_set_pos(trans, &iter, POS(inum.inum, bvec_iter.bi_sector)); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) goto err; @@ -1252,6 +1396,12 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, k = bkey_i_to_s_c(sk.k); + if (unlikely(flags & BCH_READ_in_retry)) { + if (!bkey_and_val_eq(k, bkey_i_to_s_c(prev_read->k))) + failed->nr = 0; + bch2_bkey_buf_copy(prev_read, c, sk.k); + } + /* * With indirect extents, the amount of data to read is the min * of the original extent and the indirect extent: @@ -1262,42 +1412,86 @@ void __bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, swap(bvec_iter.bi_size, bytes); if (bvec_iter.bi_size == bytes) - flags |= BCH_READ_LAST_FRAGMENT; + flags |= BCH_READ_last_fragment; ret = __bch2_read_extent(trans, rbio, bvec_iter, iter.pos, data_btree, k, - offset_into_extent, failed, flags); + offset_into_extent, failed, flags, -1); + swap(bvec_iter.bi_size, bytes); + if (ret) goto err; - if (flags & BCH_READ_LAST_FRAGMENT) + if (flags & BCH_READ_last_fragment) break; - swap(bvec_iter.bi_size, bytes); 
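
The __bch2_read() loop above handles one extent per iteration: bvec_iter.bi_size is temporarily swapped down to the current extent's contribution, the extent read is issued (flagging the last fragment when the extent covers the rest of the request), and the iterator is advanced before the next lookup. A standalone sketch of that chunking, assuming a fixed extent size:

    #include <stdio.h>

    int main(void)
    {
        unsigned remaining = 24;             /* sectors still to read */
        unsigned sector = 0;
        const unsigned extent_sectors = 10;  /* pretend extents are 10 sectors */

        while (remaining) {
            unsigned in_extent = extent_sectors - (sector % extent_sectors);
            unsigned bytes = remaining < in_extent ? remaining : in_extent;
            int last_fragment = (bytes == remaining);

            printf("read %u sectors at %u%s\n",
                   bytes, sector, last_fragment ? " (last fragment)" : "");

            if (last_fragment)
                break;

            sector += bytes;
            remaining -= bytes;
        }
        return 0;
    }
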
bio_advance_iter(&rbio->bio, &bvec_iter, bytes); err: + if (ret == -BCH_ERR_data_read_retry_csum_err_maybe_userspace) + flags |= BCH_READ_must_bounce; + if (ret && !bch2_err_matches(ret, BCH_ERR_transaction_restart) && - ret != READ_RETRY && - ret != READ_RETRY_AVOID) + !bch2_err_matches(ret, BCH_ERR_data_read_retry)) break; } - bch2_trans_iter_exit(trans, &iter); + if (unlikely(ret)) { + if (ret != -BCH_ERR_extent_poisoned) { + struct printbuf buf = PRINTBUF; + lockrestart_do(trans, + bch2_inum_offset_err_msg_trans(trans, &buf, inum, + bvec_iter.bi_sector << 9)); + prt_printf(&buf, "data read error: %s", bch2_err_str(ret)); + bch_err_ratelimited(c, "%s", buf.buf); + printbuf_exit(&buf); + } - if (ret) { - struct printbuf buf = PRINTBUF; - bch2_inum_offset_err_msg_trans(trans, &buf, inum, bvec_iter.bi_sector << 9); - prt_printf(&buf, "read error %i from btree lookup", ret); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); + rbio->bio.bi_status = BLK_STS_IOERR; + rbio->ret = ret; - rbio->bio.bi_status = BLK_STS_IOERR; - bch2_rbio_done(rbio); + if (!(flags & BCH_READ_in_retry)) + bch2_rbio_done(rbio); } - bch2_trans_put(trans); + bch2_trans_iter_exit(trans, &iter); bch2_bkey_buf_exit(&sk, c); + return ret; +} + +static const char * const bch2_read_bio_flags[] = { +#define x(n) #n, + BCH_READ_FLAGS() +#undef x + NULL +}; + +void bch2_read_bio_to_text(struct printbuf *out, struct bch_read_bio *rbio) +{ + u64 now = local_clock(); + prt_printf(out, "start_time:\t%llu\n", rbio->start_time ? now - rbio->start_time : 0); + prt_printf(out, "submit_time:\t%llu\n", rbio->submit_time ? now - rbio->submit_time : 0); + + if (!rbio->split) + prt_printf(out, "end_io:\t%ps\n", rbio->end_io); + else + prt_printf(out, "parent:\t%px\n", rbio->parent); + + prt_printf(out, "bi_end_io:\t%ps\n", rbio->bio.bi_end_io); + + prt_printf(out, "promote:\t%u\n", rbio->promote); + prt_printf(out, "bounce:\t%u\n", rbio->bounce); + prt_printf(out, "split:\t%u\n", rbio->split); + prt_printf(out, "have_ioref:\t%u\n", rbio->have_ioref); + prt_printf(out, "narrow_crcs:\t%u\n", rbio->narrow_crcs); + prt_printf(out, "context:\t%u\n", rbio->context); + prt_printf(out, "ret:\t%s\n", bch2_err_str(rbio->ret)); + + prt_printf(out, "flags:\t"); + bch2_prt_bitflags(out, bch2_read_bio_flags, rbio->flags); + prt_newline(out); + + bch2_bio_to_text(out, &rbio->bio); } void bch2_fs_io_read_exit(struct bch_fs *c) @@ -1306,10 +1500,18 @@ void bch2_fs_io_read_exit(struct bch_fs *c) rhashtable_destroy(&c->promote_table); bioset_exit(&c->bio_read_split); bioset_exit(&c->bio_read); + mempool_exit(&c->bio_bounce_pages); } int bch2_fs_io_read_init(struct bch_fs *c) { + if (mempool_init_page_pool(&c->bio_bounce_pages, + max_t(unsigned, + c->opts.btree_node_size, + c->opts.encoded_extent_max) / + PAGE_SIZE, 0)) + return -BCH_ERR_ENOMEM_bio_bounce_pages_init; + if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), BIOSET_NEED_BVECS)) return -BCH_ERR_ENOMEM_bio_read_init; diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index a82e8a94ccb6..c08b9c047b3e 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -3,6 +3,8 @@ #define _BCACHEFS_IO_READ_H #include "bkey_buf.h" +#include "btree_iter.h" +#include "extents_types.h" #include "reflink.h" struct bch_read_bio { @@ -35,19 +37,21 @@ struct bch_read_bio { u16 flags; union { struct { - u16 bounce:1, + u16 data_update:1, + promote:1, + bounce:1, split:1, - kmalloc:1, have_ioref:1, narrow_crcs:1, - hole:1, - retry:2, + saw_error:1, context:2; }; u16 _state; 
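
The union just above overlays the per-read bitfields with a single u16 _state, so every flag can be tested or cleared in one access instead of one assignment per bit (compare the BUG_ON(rbio->_state) in bch2_read() and the rbio->_state = 0 in the init helpers). A standalone sketch of the idiom; bitfield layout is implementation-defined, which is fine for purely in-memory state:

    #include <assert.h>
    #include <stdint.h>

    struct read_state {
        union {
            struct {
                uint16_t data_update:1,
                         promote:1,
                         bounce:1,
                         split:1;
            };
            uint16_t _state;  /* overlays every bit above */
        };
    };

    int main(void)
    {
        struct read_state s = { ._state = 0 };

        s.bounce = 1;
        s.split  = 1;
        assert(s._state != 0);  /* any flag set is visible in one load */

        s._state = 0;           /* clears every flag in one store */
        assert(!s.bounce && !s.split);
        return 0;
    }
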
}; - - struct bch_devs_list devs_have; + s16 ret; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + unsigned list_idx; +#endif struct extent_ptr_decoded pick; @@ -65,8 +69,6 @@ struct bch_read_bio { struct bpos data_pos; struct bversion version; - struct promote_op *promote; - struct bch_io_opts opts; struct work_struct work; @@ -108,64 +110,103 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans, return 0; } +#define BCH_READ_FLAGS() \ + x(retry_if_stale) \ + x(may_promote) \ + x(user_mapped) \ + x(last_fragment) \ + x(must_bounce) \ + x(must_clone) \ + x(in_retry) + +enum __bch_read_flags { +#define x(n) __BCH_READ_##n, + BCH_READ_FLAGS() +#undef x +}; + enum bch_read_flags { - BCH_READ_RETRY_IF_STALE = 1 << 0, - BCH_READ_MAY_PROMOTE = 1 << 1, - BCH_READ_USER_MAPPED = 1 << 2, - BCH_READ_NODECODE = 1 << 3, - BCH_READ_LAST_FRAGMENT = 1 << 4, - - /* internal: */ - BCH_READ_MUST_BOUNCE = 1 << 5, - BCH_READ_MUST_CLONE = 1 << 6, - BCH_READ_IN_RETRY = 1 << 7, +#define x(n) BCH_READ_##n = BIT(__BCH_READ_##n), + BCH_READ_FLAGS() +#undef x }; int __bch2_read_extent(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, struct bpos, enum btree_id, struct bkey_s_c, unsigned, - struct bch_io_failures *, unsigned); + struct bch_io_failures *, unsigned, int); static inline void bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, struct bpos read_pos, enum btree_id data_btree, struct bkey_s_c k, unsigned offset_into_extent, unsigned flags) { - __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, - data_btree, k, offset_into_extent, NULL, flags); + int ret = __bch2_read_extent(trans, rbio, rbio->bio.bi_iter, read_pos, + data_btree, k, offset_into_extent, NULL, flags, -1); + /* __bch2_read_extent only returns errors if BCH_READ_in_retry is set */ + WARN(ret, "unhandled error from __bch2_read_extent()"); } -void __bch2_read(struct bch_fs *, struct bch_read_bio *, struct bvec_iter, - subvol_inum, struct bch_io_failures *, unsigned flags); +int __bch2_read(struct btree_trans *, struct bch_read_bio *, struct bvec_iter, + subvol_inum, + struct bch_io_failures *, struct bkey_buf *, unsigned flags); static inline void bch2_read(struct bch_fs *c, struct bch_read_bio *rbio, subvol_inum inum) { - struct bch_io_failures failed = { .nr = 0 }; - BUG_ON(rbio->_state); - rbio->c = c; - rbio->start_time = local_clock(); rbio->subvol = inum.subvol; - __bch2_read(c, rbio, rbio->bio.bi_iter, inum, &failed, - BCH_READ_RETRY_IF_STALE| - BCH_READ_MAY_PROMOTE| - BCH_READ_USER_MAPPED); + bch2_trans_run(c, + __bch2_read(trans, rbio, rbio->bio.bi_iter, inum, NULL, NULL, + BCH_READ_retry_if_stale| + BCH_READ_may_promote| + BCH_READ_user_mapped)); +} + +static inline struct bch_read_bio *rbio_init_fragment(struct bio *bio, + struct bch_read_bio *orig) +{ + struct bch_read_bio *rbio = to_rbio(bio); + + rbio->c = orig->c; + rbio->_state = 0; + rbio->flags = 0; + rbio->ret = 0; + rbio->split = true; + rbio->parent = orig; + rbio->opts = orig->opts; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + rbio->list_idx = 0; +#endif + return rbio; } static inline struct bch_read_bio *rbio_init(struct bio *bio, - struct bch_io_opts opts) + struct bch_fs *c, + struct bch_io_opts opts, + bio_end_io_t end_io) { struct bch_read_bio *rbio = to_rbio(bio); - rbio->_state = 0; - rbio->promote = NULL; - rbio->opts = opts; + rbio->start_time = local_clock(); + rbio->c = c; + rbio->_state = 0; + rbio->flags = 0; + rbio->ret = 0; + rbio->opts = opts; + rbio->bio.bi_end_io = end_io; +#ifdef 
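
The BCH_READ_FLAGS() rework above replaces hand-numbered 1 << n constants with one x-macro list that generates the bit positions, the flag values, and the name table consumed by bch2_read_bio_to_text(). A standalone sketch of the same generation scheme, with a shortened list:

    #include <stdio.h>

    #define BIT(n) (1u << (n))

    #define READ_FLAGS()      \
        x(retry_if_stale)     \
        x(may_promote)        \
        x(user_mapped)

    enum { /* bit positions */
    #define x(n) __READ_##n,
        READ_FLAGS()
    #undef x
    };

    enum { /* flag values */
    #define x(n) READ_##n = BIT(__READ_##n),
        READ_FLAGS()
    #undef x
    };

    static const char * const read_flag_names[] = {
    #define x(n) #n,
        READ_FLAGS()
    #undef x
        NULL,
    };

    int main(void)
    {
        unsigned flags = READ_may_promote | READ_user_mapped;

        for (unsigned i = 0; read_flag_names[i]; i++)
            if (flags & BIT(i))
                printf("%s ", read_flag_names[i]);
        printf("\n");  /* prints: may_promote user_mapped */
        return 0;
    }
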
CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + rbio->list_idx = 0; +#endif return rbio; } +struct promote_op; +void bch2_promote_op_to_text(struct printbuf *, struct promote_op *); +void bch2_read_bio_to_text(struct printbuf *, struct bch_read_bio *); + void bch2_fs_io_read_exit(struct bch_fs *); int bch2_fs_io_read_init(struct bch_fs *); diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 03892388832b..52a60982a66b 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -6,6 +6,7 @@ #include "bcachefs.h" #include "alloc_foreground.h" +#include "async_objs.h" #include "bkey_buf.h" #include "bset.h" #include "btree_update.h" @@ -15,6 +16,7 @@ #include "compress.h" #include "debug.h" #include "ec.h" +#include "enumerated_ref.h" #include "error.h" #include "extent_update.h" #include "inode.h" @@ -34,6 +36,12 @@ #include <linux/random.h> #include <linux/sched/mm.h> +#ifdef CONFIG_BCACHEFS_DEBUG +static unsigned bch2_write_corrupt_ratio; +module_param_named(write_corrupt_ratio, bch2_write_corrupt_ratio, uint, 0644); +MODULE_PARM_DESC(write_corrupt_ratio, ""); +#endif + #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT static inline void bch2_congested_acct(struct bch_dev *ca, u64 io_latency, @@ -162,9 +170,9 @@ int bch2_sum_sector_overwrites(struct btree_trans *trans, *i_sectors_delta = 0; *disk_sectors_delta = 0; - bch2_trans_copy_iter(&iter, extent_iter); + bch2_trans_copy_iter(trans, &iter, extent_iter); - for_each_btree_key_max_continue_norestart(iter, + for_each_btree_key_max_continue_norestart(trans, iter, new->k.p, BTREE_ITER_slots, old, ret) { s64 sectors = min(new->k.p.offset, old.k->p.offset) - max(bkey_start_offset(&new->k), @@ -249,10 +257,35 @@ static inline int bch2_extent_update_i_size_sectors(struct btree_trans *trans, } if (i_sectors_delta) { + s64 bi_sectors = le64_to_cpu(inode->v.bi_sectors); + if (unlikely(bi_sectors + i_sectors_delta < 0)) { + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "inode %llu i_sectors underflow: %lli + %lli < 0", + extent_iter->pos.inode, bi_sectors, i_sectors_delta); + + bool print = bch2_count_fsck_err(c, inode_i_sectors_underflow, &buf); + if (print) + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + + if (i_sectors_delta < 0) + i_sectors_delta = -bi_sectors; + else + i_sectors_delta = 0; + } + le64_add_cpu(&inode->v.bi_sectors, i_sectors_delta); inode_update_flags = 0; } + /* + * extents, dirents and xattrs updates require that an inode update also + * happens - to ensure that if a key exists in one of those btrees with + * a given snapshot ID an inode is also present - so we may have to skip + * the nojournal optimization: + */ if (inode->k.p.snapshot != iter.snapshot) { inode->k.p.snapshot = iter.snapshot; inode_update_flags = 0; @@ -286,7 +319,7 @@ int bch2_extent_update(struct btree_trans *trans, * path already traversed at iter->pos because * bch2_trans_extent_update() will use it to attempt extent merging */ - ret = __bch2_btree_iter_traverse(iter); + ret = __bch2_btree_iter_traverse(trans, iter); if (ret) return ret; @@ -331,7 +364,7 @@ int bch2_extent_update(struct btree_trans *trans, if (i_sectors_delta_total) *i_sectors_delta_total += i_sectors_delta; - bch2_btree_iter_set_pos(iter, next_pos); + bch2_btree_iter_set_pos(trans, iter, next_pos); return 0; } @@ -370,11 +403,10 @@ static int bch2_write_index_default(struct bch_write_op *op) bkey_start_pos(&sk.k->k), BTREE_ITER_slots|BTREE_ITER_intent); - ret = bch2_bkey_set_needs_rebalance(c, &op->opts, 
sk.k) ?: - bch2_extent_update(trans, inum, &iter, sk.k, + ret = bch2_extent_update(trans, inum, &iter, sk.k, &op->res, op->new_i_size, &op->i_sectors_delta, - op->flags & BCH_WRITE_CHECK_ENOSPC); + op->flags & BCH_WRITE_check_enospc); bch2_trans_iter_exit(trans, &iter); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -396,29 +428,36 @@ static int bch2_write_index_default(struct bch_write_op *op) /* Writes */ -static void __bch2_write_op_error(struct printbuf *out, struct bch_write_op *op, - u64 offset) +void bch2_write_op_error(struct bch_write_op *op, u64 offset, const char *fmt, ...) { - bch2_inum_offset_err_msg(op->c, out, - (subvol_inum) { op->subvol, op->pos.inode, }, - offset << 9); - prt_printf(out, "write error%s: ", - op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); -} + struct printbuf buf = PRINTBUF; -void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op) -{ - __bch2_write_op_error(out, op, op->pos.offset); -} + if (op->subvol) { + bch2_inum_offset_err_msg(op->c, &buf, + (subvol_inum) { op->subvol, op->pos.inode, }, + offset << 9); + } else { + struct bpos pos = op->pos; + pos.offset = offset; + bch2_inum_snap_offset_err_msg(op->c, &buf, pos); + } -static void bch2_write_op_error_trans(struct btree_trans *trans, struct printbuf *out, - struct bch_write_op *op, u64 offset) -{ - bch2_inum_offset_err_msg_trans(trans, out, - (subvol_inum) { op->subvol, op->pos.inode, }, - offset << 9); - prt_printf(out, "write error%s: ", - op->flags & BCH_WRITE_MOVE ? "(internal move)" : ""); + prt_str(&buf, "write error: "); + + va_list args; + va_start(args, fmt); + prt_vprintf(&buf, fmt, args); + va_end(args); + + if (op->flags & BCH_WRITE_move) { + struct data_update *u = container_of(op, struct data_update, op); + + prt_printf(&buf, "\n from internal move "); + bch2_bkey_val_to_text(&buf, op->c, bkey_i_to_s_c(u->k.k)); + } + + bch_err_ratelimited(op->c, "%s", buf.buf); + printbuf_exit(&buf); } void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, @@ -428,15 +467,28 @@ void bch2_submit_wbio_replicas(struct bch_write_bio *wbio, struct bch_fs *c, { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(k)); struct bch_write_bio *n; + unsigned ref_rw = type == BCH_DATA_btree ? READ : WRITE; + unsigned ref_idx = type == BCH_DATA_btree + ? BCH_DEV_READ_REF_btree_node_write + : BCH_DEV_WRITE_REF_io_write; BUG_ON(c->opts.nochanges); + const struct bch_extent_ptr *last = NULL; + bkey_for_each_ptr(ptrs, ptr) + last = ptr; + bkey_for_each_ptr(ptrs, ptr) { + /* + * XXX: btree writes should be using io_ref[WRITE], but we + * aren't retrying failed btree writes yet (due to device + * removal/ro): + */ struct bch_dev *ca = nocow ? bch2_dev_have_ref(c, ptr->dev) - : bch2_dev_get_ioref(c, ptr->dev, type == BCH_DATA_btree ? 
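
bch2_submit_wbio_replicas() above now finds the last pointer first so the loop can clone the bio for every replica except the final one, which reuses the caller's bio and keeps its original completion. A standalone sketch of that fan-out, with hypothetical clone/submit helpers:

    #include <stdio.h>

    struct demo_bio { int id; };

    static struct demo_bio *clone_bio(const struct demo_bio *b)
    {
        static struct demo_bio clones[8];
        static int n;

        clones[n] = *b;
        clones[n].id = 100 + n;  /* distinguish clones from the original */
        return &clones[n++];
    }

    static void submit(struct demo_bio *b, unsigned dev)
    {
        printf("submit bio %d to dev %u\n", b->id, dev);
    }

    int main(void)
    {
        unsigned devs[] = { 0, 2, 5 };
        unsigned nr = 3;
        struct demo_bio orig = { .id = 1 };

        /* clone for every replica but the last; the last takes the original */
        for (unsigned i = 0; i < nr; i++)
            submit(i + 1 < nr ? clone_bio(&orig) : &orig, devs[i]);
        return 0;
    }
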
READ : WRITE); + : bch2_dev_get_ioref(c, ptr->dev, ref_rw, ref_idx); - if (to_entry(ptr + 1) < ptrs.end) { + if (ptr != last) { n = to_wbio(bio_alloc_clone(NULL, &wbio->bio, GFP_NOFS, &c->replica_set)); n->bio.bi_end_io = wbio->bio.bi_end_io; @@ -493,12 +545,13 @@ static void bch2_write_done(struct closure *cl) bch2_time_stats_update(&c->times[BCH_TIME_data_write], op->start_time); bch2_disk_reservation_put(c, &op->res); - if (!(op->flags & BCH_WRITE_MOVE)) - bch2_write_ref_put(c, BCH_WRITE_REF_write); + if (!(op->flags & BCH_WRITE_move)) + enumerated_ref_put(&c->writes, BCH_WRITE_REF_write); bch2_keylist_free(&op->insert_keys, op->inline_keys); EBUG_ON(cl->parent); closure_debug_destroy(cl); + async_object_list_del(c, write_op, op->list_idx); if (op->end_io) op->end_io(op); } @@ -516,7 +569,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) test_bit(ptr->dev, op->failed.d)); if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) - return -EIO; + return -BCH_ERR_data_write_io; } if (dst != src) @@ -539,7 +592,7 @@ static void __bch2_write_index(struct bch_write_op *op) unsigned dev; int ret = 0; - if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { + if (unlikely(op->flags & BCH_WRITE_io_error)) { ret = bch2_write_drop_io_error_ptrs(op); if (ret) goto err; @@ -548,7 +601,7 @@ static void __bch2_write_index(struct bch_write_op *op) if (!bch2_keylist_empty(keys)) { u64 sectors_start = keylist_sectors(keys); - ret = !(op->flags & BCH_WRITE_MOVE) + ret = !(op->flags & BCH_WRITE_move) ? bch2_write_index_default(op) : bch2_data_update_index_update(op); @@ -560,11 +613,8 @@ static void __bch2_write_index(struct bch_write_op *op) if (unlikely(ret && !bch2_err_matches(ret, EROFS))) { struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - struct printbuf buf = PRINTBUF; - __bch2_write_op_error(&buf, op, bkey_start_offset(&insert->k)); - prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); + bch2_write_op_error(op, bkey_start_offset(&insert->k), + "btree update error: %s", bch2_err_str(ret)); } if (ret) @@ -573,21 +623,29 @@ static void __bch2_write_index(struct bch_write_op *op) out: /* If a bucket wasn't written, we can't erasure code it: */ for_each_set_bit(dev, op->failed.d, BCH_SB_MEMBERS_MAX) - bch2_open_bucket_write_error(c, &op->open_buckets, dev); + bch2_open_bucket_write_error(c, &op->open_buckets, dev, -BCH_ERR_data_write_io); bch2_open_buckets_put(c, &op->open_buckets); return; err: keys->top = keys->keys; op->error = ret; - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_submitted; goto out; } static inline void __wp_update_state(struct write_point *wp, enum write_point_state state) { if (state != wp->state) { + struct task_struct *p = current; u64 now = ktime_get_ns(); + u64 runtime = p->se.sum_exec_runtime + + (now - p->se.exec_start); + + if (state == WRITE_POINT_runnable) + wp->last_runtime = runtime; + else if (wp->state == WRITE_POINT_runnable) + wp->time[WRITE_POINT_running] += runtime - wp->last_runtime; if (wp->last_state_change && time_after64(now, wp->last_state_change)) @@ -601,7 +659,7 @@ static inline void wp_update_state(struct write_point *wp, bool running) { enum write_point_state state; - state = running ? WRITE_POINT_running : + state = running ? WRITE_POINT_runnable : !list_empty(&wp->writes) ? 
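
The new accounting in __wp_update_state() above charges a write point for the task runtime that elapses between entering and leaving the runnable state. A rough userspace analog of that bookkeeping, using the thread CPU clock in place of se.sum_exec_runtime (all names illustrative):

    #define _POSIX_C_SOURCE 199309L
    #include <time.h>

    enum wp_state { WP_STOPPED, WP_WAITING_IO, WP_RUNNABLE, WP_RUNNING };

    struct write_point_stats {
            enum wp_state state;
            long long     last_runtime_ns;
            long long     time_running_ns;
    };

    static long long task_runtime_ns(void)
    {
            struct timespec ts;

            clock_gettime(CLOCK_THREAD_CPUTIME_ID, &ts);
            return ts.tv_sec * 1000000000LL + ts.tv_nsec;
    }

    static void wp_set_state(struct write_point_stats *wp, enum wp_state state)
    {
            if (state == wp->state)
                    return;

            long long runtime = task_runtime_ns();

            if (state == WP_RUNNABLE)
                    wp->last_runtime_ns = runtime;  /* interval starts */
            else if (wp->state == WP_RUNNABLE)
                    wp->time_running_ns += runtime - wp->last_runtime_ns;

            wp->state = state;
    }
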
WRITE_POINT_waiting_io : WRITE_POINT_stopped; @@ -615,8 +673,8 @@ static CLOSURE_CALLBACK(bch2_write_index) struct workqueue_struct *wq = index_update_wq(op); unsigned long flags; - if ((op->flags & BCH_WRITE_SUBMITTED) && - (op->flags & BCH_WRITE_MOVE)) + if ((op->flags & BCH_WRITE_submitted) && + (op->flags & BCH_WRITE_move)) bch2_bio_free_pages_pool(op->c, &op->wbio.bio); spin_lock_irqsave(&wp->writes_lock, flags); @@ -654,11 +712,11 @@ void bch2_write_point_do_index_updates(struct work_struct *work) if (!op) break; - op->flags |= BCH_WRITE_IN_WORKER; + op->flags |= BCH_WRITE_in_worker; __bch2_write_index(op); - if (!(op->flags & BCH_WRITE_SUBMITTED)) + if (!(op->flags & BCH_WRITE_submitted)) __bch2_write(op); else bch2_write_done(&op->cl); @@ -676,13 +734,24 @@ static void bch2_write_endio(struct bio *bio) ? bch2_dev_have_ref(c, wbio->dev) : NULL; - if (bch2_dev_inum_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, - op->pos.inode, - wbio->inode_offset << 9, - "data write error: %s", - bch2_blk_status_to_str(bio->bi_status))) { + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, + wbio->submit_time, !bio->bi_status); + + if (unlikely(bio->bi_status)) { + if (ca) + bch_err_inum_offset_ratelimited(ca, + op->pos.inode, + wbio->inode_offset << 9, + "data write error: %s", + bch2_blk_status_to_str(bio->bi_status)); + else + bch_err_inum_offset_ratelimited(c, + op->pos.inode, + wbio->inode_offset << 9, + "data write error: %s", + bch2_blk_status_to_str(bio->bi_status)); set_bit(wbio->dev, op->failed.d); - op->flags |= BCH_WRITE_IO_ERROR; + op->flags |= BCH_WRITE_io_error; } if (wbio->nocow) { @@ -692,10 +761,9 @@ static void bch2_write_endio(struct bio *bio) set_bit(wbio->dev, op->devs_need_flush->d); } - if (wbio->have_ioref) { - bch2_latency_acct(ca, wbio->submit_time, WRITE); - percpu_ref_put(&ca->io_ref); - } + if (wbio->have_ioref) + enumerated_ref_put(&ca->io_ref[WRITE], + BCH_DEV_WRITE_REF_io_write); if (wbio->bounce) bch2_bio_free_pages_pool(c, bio); @@ -729,7 +797,10 @@ static void init_append_extent(struct bch_write_op *op, bch2_extent_crc_append(&e->k_i, crc); bch2_alloc_sectors_append_ptrs_inlined(op->c, wp, &e->k_i, crc.compressed_size, - op->flags & BCH_WRITE_CACHED); + op->flags & BCH_WRITE_cached); + + if (!(op->flags & BCH_WRITE_move)) + bch2_bkey_set_needs_rebalance(op->c, &op->opts, &e->k_i); bch2_keylist_push(&op->insert_keys); } @@ -789,7 +860,6 @@ static int bch2_write_rechecksum(struct bch_fs *c, { struct bio *bio = &op->wbio.bio; struct bch_extent_crc_unpacked new_crc; - int ret; /* bch2_rechecksum_bio() can't encrypt or decrypt data: */ @@ -797,10 +867,10 @@ static int bch2_write_rechecksum(struct bch_fs *c, bch2_csum_type_is_encryption(new_csum_type)) new_csum_type = op->crc.csum_type; - ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, - NULL, &new_crc, - op->crc.offset, op->crc.live_size, - new_csum_type); + int ret = bch2_rechecksum_bio(c, bio, op->version, op->crc, + NULL, &new_crc, + op->crc.offset, op->crc.live_size, + new_csum_type); if (ret) return ret; @@ -810,44 +880,12 @@ static int bch2_write_rechecksum(struct bch_fs *c, return 0; } -static int bch2_write_decrypt(struct bch_write_op *op) -{ - struct bch_fs *c = op->c; - struct nonce nonce = extent_nonce(op->version, op->crc); - struct bch_csum csum; - int ret; - - if (!bch2_csum_type_is_encryption(op->crc.csum_type)) - return 0; - - /* - * If we need to decrypt data in the write path, we'll no longer be able - * to verify the existing checksum (poly1305 mac, in this case) after - * it's 
decrypted - this is the last point we'll be able to reverify the - * checksum: - */ - csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); - if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) - return -EIO; - - ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, &op->wbio.bio); - op->crc.csum_type = 0; - op->crc.csum = (struct bch_csum) { 0, 0 }; - return ret; -} - -static enum prep_encoded_ret { - PREP_ENCODED_OK, - PREP_ENCODED_ERR, - PREP_ENCODED_CHECKSUM_ERR, - PREP_ENCODED_DO_WRITE, -} bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) +static noinline int bch2_write_prep_encoded_data(struct bch_write_op *op, struct write_point *wp) { struct bch_fs *c = op->c; struct bio *bio = &op->wbio.bio; - - if (!(op->flags & BCH_WRITE_DATA_ENCODED)) - return PREP_ENCODED_OK; + struct bch_csum csum; + int ret = 0; BUG_ON(bio_sectors(bio) != op->crc.compressed_size); @@ -858,12 +896,13 @@ static enum prep_encoded_ret { (op->crc.compression_type == bch2_compression_opt_to_type(op->compression_opt) || op->incompressible)) { if (!crc_is_compressed(op->crc) && - op->csum_type != op->crc.csum_type && - bch2_write_rechecksum(c, op, op->csum_type) && - !c->opts.no_data_io) - return PREP_ENCODED_CHECKSUM_ERR; + op->csum_type != op->crc.csum_type) { + ret = bch2_write_rechecksum(c, op, op->csum_type); + if (ret) + return ret; + } - return PREP_ENCODED_DO_WRITE; + return 1; } /* @@ -871,20 +910,24 @@ static enum prep_encoded_ret { * is, we have to decompress it: */ if (crc_is_compressed(op->crc)) { - struct bch_csum csum; - - if (bch2_write_decrypt(op)) - return PREP_ENCODED_CHECKSUM_ERR; - /* Last point we can still verify checksum: */ - csum = bch2_checksum_bio(c, op->crc.csum_type, - extent_nonce(op->version, op->crc), - bio); + struct nonce nonce = extent_nonce(op->version, op->crc); + csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio); if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) - return PREP_ENCODED_CHECKSUM_ERR; + goto csum_err; + + if (bch2_csum_type_is_encryption(op->crc.csum_type)) { + ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio); + if (ret) + return ret; + + op->crc.csum_type = 0; + op->crc.csum = (struct bch_csum) { 0, 0 }; + } - if (bch2_bio_uncompress_inplace(op, bio)) - return PREP_ENCODED_ERR; + ret = bch2_bio_uncompress_inplace(op, bio); + if (ret) + return ret; } /* @@ -896,22 +939,44 @@ static enum prep_encoded_ret { * If the data is checksummed and we're only writing a subset, * rechecksum and adjust bio to point to currently live data: */ - if ((op->crc.live_size != op->crc.uncompressed_size || - op->crc.csum_type != op->csum_type) && - bch2_write_rechecksum(c, op, op->csum_type) && - !c->opts.no_data_io) - return PREP_ENCODED_CHECKSUM_ERR; + if (op->crc.live_size != op->crc.uncompressed_size || + op->crc.csum_type != op->csum_type) { + ret = bch2_write_rechecksum(c, op, op->csum_type); + if (ret) + return ret; + } /* * If we want to compress the data, it has to be decrypted: */ - if ((op->compression_opt || - bch2_csum_type_is_encryption(op->crc.csum_type) != - bch2_csum_type_is_encryption(op->csum_type)) && - bch2_write_decrypt(op)) - return PREP_ENCODED_CHECKSUM_ERR; + if (bch2_csum_type_is_encryption(op->crc.csum_type) && + (op->compression_opt || op->crc.csum_type != op->csum_type)) { + struct nonce nonce = extent_nonce(op->version, op->crc); + csum = bch2_checksum_bio(c, op->crc.csum_type, nonce, bio); + if (bch2_crc_cmp(op->crc.csum, csum) && !c->opts.no_data_io) + goto csum_err; - return 
PREP_ENCODED_OK; + ret = bch2_encrypt_bio(c, op->crc.csum_type, nonce, bio); + if (ret) + return ret; + + op->crc.csum_type = 0; + op->crc.csum = (struct bch_csum) { 0, 0 }; + } + + return 0; +csum_err: + bch2_write_op_error(op, op->pos.offset, + "error verifying existing checksum while moving existing data (memory corruption?)\n" + " expected %0llx:%0llx got %0llx:%0llx type %s", + op->crc.csum.hi, + op->crc.csum.lo, + csum.hi, + csum.lo, + op->crc.csum_type < BCH_CSUM_NR + ? __bch2_csum_types[op->crc.csum_type] + : "(unknown)"); + return -BCH_ERR_data_write_csum; } static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, @@ -926,43 +991,51 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, bool page_alloc_failed = false; int ret, more = 0; + if (op->incompressible) + op->compression_opt = 0; + BUG_ON(!bio_sectors(src)); ec_buf = bch2_writepoint_ec_buf(c, wp); - switch (bch2_write_prep_encoded_data(op, wp)) { - case PREP_ENCODED_OK: - break; - case PREP_ENCODED_ERR: - ret = -EIO; - goto err; - case PREP_ENCODED_CHECKSUM_ERR: - goto csum_err; - case PREP_ENCODED_DO_WRITE: - /* XXX look for bug here */ - if (ec_buf) { - dst = bch2_write_bio_alloc(c, wp, src, - &page_alloc_failed, - ec_buf); - bio_copy_data(dst, src); - bounce = true; + if (unlikely(op->flags & BCH_WRITE_data_encoded)) { + ret = bch2_write_prep_encoded_data(op, wp); + if (ret < 0) + goto err; + if (ret) { + if (ec_buf) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bio_copy_data(dst, src); + bounce = true; + } + init_append_extent(op, wp, op->version, op->crc); + goto do_write; } - init_append_extent(op, wp, op->version, op->crc); - goto do_write; } if (ec_buf || op->compression_opt || (op->csum_type && - !(op->flags & BCH_WRITE_PAGES_STABLE)) || + !(op->flags & BCH_WRITE_pages_stable)) || (bch2_csum_type_is_encryption(op->csum_type) && - !(op->flags & BCH_WRITE_PAGES_OWNED))) { + !(op->flags & BCH_WRITE_pages_owned))) { dst = bch2_write_bio_alloc(c, wp, src, &page_alloc_failed, ec_buf); bounce = true; } +#ifdef CONFIG_BCACHEFS_DEBUG + unsigned write_corrupt_ratio = READ_ONCE(bch2_write_corrupt_ratio); + if (!bounce && write_corrupt_ratio) { + dst = bch2_write_bio_alloc(c, wp, src, + &page_alloc_failed, + ec_buf); + bounce = true; + } +#endif saved_iter = dst->bi_iter; do { @@ -976,7 +1049,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, break; BUG_ON(op->compression_opt && - (op->flags & BCH_WRITE_DATA_ENCODED) && + (op->flags & BCH_WRITE_data_encoded) && bch2_csum_type_is_encryption(op->crc.csum_type)); BUG_ON(op->compression_opt && !bounce); @@ -1014,7 +1087,7 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, } } - if ((op->flags & BCH_WRITE_DATA_ENCODED) && + if ((op->flags & BCH_WRITE_data_encoded) && !crc_is_compressed(crc) && bch2_csum_type_is_encryption(op->crc.csum_type) == bch2_csum_type_is_encryption(op->csum_type)) { @@ -1032,12 +1105,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, * data can't be modified (by userspace) while it's in * flight. 
*/ - if (bch2_rechecksum_bio(c, src, version, op->crc, + ret = bch2_rechecksum_bio(c, src, version, op->crc, &crc, &op->crc, src_len >> 9, bio_sectors(src) - (src_len >> 9), - op->csum_type)) - goto csum_err; + op->csum_type); + if (ret) + goto err; /* * rchecksum_bio sets compression_type on crc from op->crc, * this isn't always correct as sometimes we're changing @@ -1046,13 +1120,13 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, crc.compression_type = compression_type; crc.nonce = nonce; } else { - if ((op->flags & BCH_WRITE_DATA_ENCODED) && - bch2_rechecksum_bio(c, src, version, op->crc, + if ((op->flags & BCH_WRITE_data_encoded) && + (ret = bch2_rechecksum_bio(c, src, version, op->crc, NULL, &op->crc, src_len >> 9, bio_sectors(src) - (src_len >> 9), - op->crc.csum_type)) - goto csum_err; + op->crc.csum_type))) + goto err; crc.compressed_size = dst_len >> 9; crc.uncompressed_size = src_len >> 9; @@ -1072,6 +1146,14 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, init_append_extent(op, wp, version, crc); +#ifdef CONFIG_BCACHEFS_DEBUG + if (write_corrupt_ratio) { + swap(dst->bi_iter.bi_size, dst_len); + bch2_maybe_corrupt_bio(dst, write_corrupt_ratio); + swap(dst->bi_iter.bi_size, dst_len); + } +#endif + if (dst != src) bio_advance(dst, dst_len); bio_advance(src, src_len); @@ -1103,16 +1185,6 @@ static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, do_write: *_dst = dst; return more; -csum_err: - { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "error verifying existing checksum while rewriting existing data (memory corruption?)"); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } - - ret = -EIO; err: if (to_wbio(dst)->bounce) bch2_bio_free_pages_pool(c, dst); @@ -1190,39 +1262,36 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) { struct bch_fs *c = op->c; struct btree_trans *trans = bch2_trans_get(c); + int ret = 0; for_each_keylist_key(&op->insert_keys, orig) { - int ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, + ret = for_each_btree_key_max_commit(trans, iter, BTREE_ID_extents, bkey_start_pos(&orig->k), orig->k.p, BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ bch2_nocow_write_convert_one_unwritten(trans, &iter, orig, k, op->new_i_size); })); - - if (ret && !bch2_err_matches(ret, EROFS)) { - struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); - - struct printbuf buf = PRINTBUF; - bch2_write_op_error_trans(trans, &buf, op, bkey_start_offset(&insert->k)); - prt_printf(&buf, "btree update error: %s", bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } - - if (ret) { - op->error = ret; + if (ret) break; - } } bch2_trans_put(trans); + + if (ret && !bch2_err_matches(ret, EROFS)) { + struct bkey_i *insert = bch2_keylist_front(&op->insert_keys); + bch2_write_op_error(op, bkey_start_offset(&insert->k), + "btree update error: %s", bch2_err_str(ret)); + } + + if (ret) + op->error = ret; } static void __bch2_nocow_write_done(struct bch_write_op *op) { - if (unlikely(op->flags & BCH_WRITE_IO_ERROR)) { - op->error = -EIO; - } else if (unlikely(op->flags & BCH_WRITE_CONVERT_UNWRITTEN)) + if (unlikely(op->flags & BCH_WRITE_io_error)) { + op->error = -BCH_ERR_data_write_io; + } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten)) bch2_nocow_write_convert_unwritten(op); } @@ -1251,7 +1320,7 @@ static void bch2_nocow_write(struct 
bch_write_op *op) struct bucket_to_lock *stale_at; int stale, ret; - if (op->flags & BCH_WRITE_MOVE) + if (op->flags & BCH_WRITE_move) return; darray_init(&buckets); @@ -1275,7 +1344,7 @@ retry: if (ret) break; - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) break; @@ -1294,7 +1363,8 @@ retry: /* Get iorefs before dropping btree locks: */ struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); + struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE, + BCH_DEV_WRITE_REF_io_write); if (unlikely(!ca)) goto err_get_ioref; @@ -1309,7 +1379,7 @@ retry: }), GFP_KERNEL|__GFP_NOFAIL); if (ptr->unwritten) - op->flags |= BCH_WRITE_CONVERT_UNWRITTEN; + op->flags |= BCH_WRITE_convert_unwritten; } /* Unlock before taking nocow locks, doing IO: */ @@ -1317,7 +1387,7 @@ retry: bch2_trans_unlock(trans); bch2_cut_front(op->pos, op->insert_keys.top); - if (op->flags & BCH_WRITE_CONVERT_UNWRITTEN) + if (op->flags & BCH_WRITE_convert_unwritten) bch2_cut_back(POS(op->pos.inode, op->pos.offset + bio_sectors(bio)), op->insert_keys.top); darray_for_each(buckets, i) { @@ -1342,7 +1412,7 @@ retry: wbio_init(bio)->put_bio = true; bio->bi_opf = op->wbio.bio.bi_opf; } else { - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_submitted; } op->pos.offset += bio_sectors(bio); @@ -1352,13 +1422,14 @@ retry: bio->bi_private = &op->cl; bio->bi_opf |= REQ_OP_WRITE; closure_get(&op->cl); + bch2_submit_wbio_replicas(to_wbio(bio), c, BCH_DATA_user, op->insert_keys.top, true); bch2_keylist_push(&op->insert_keys); - if (op->flags & BCH_WRITE_SUBMITTED) + if (op->flags & BCH_WRITE_submitted) break; - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); } out: bch2_trans_iter_exit(trans, &iter); @@ -1370,21 +1441,18 @@ err: darray_exit(&buckets); if (ret) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); + bch2_write_op_error(op, op->pos.offset, + "%s(): btree lookup error: %s", __func__, bch2_err_str(ret)); op->error = ret; - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_submitted; } /* fallback to cow write path? 
*/ - if (!(op->flags & BCH_WRITE_SUBMITTED)) { + if (!(op->flags & BCH_WRITE_submitted)) { closure_sync(&op->cl); __bch2_nocow_write_done(op); op->insert_keys.top = op->insert_keys.keys; - } else if (op->flags & BCH_WRITE_SYNC) { + } else if (op->flags & BCH_WRITE_sync) { closure_sync(&op->cl); bch2_nocow_write_done(&op->cl.work); } else { @@ -1398,7 +1466,8 @@ err: return; err_get_ioref: darray_for_each(buckets, i) - percpu_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref); + enumerated_ref_put(&bch2_dev_have_ref(c, i->b.inode)->io_ref[WRITE], + BCH_DEV_WRITE_REF_io_write); /* Fall back to COW path: */ goto out; @@ -1414,7 +1483,7 @@ err_bucket_stale: "pointer to invalid bucket in nocow path on device %llu\n %s", stale_at->b.inode, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = -EIO; + ret = -BCH_ERR_data_write_invalid_ptr; } else { /* We can retry this: */ ret = -BCH_ERR_transaction_restart; @@ -1436,7 +1505,7 @@ static void __bch2_write(struct bch_write_op *op) if (unlikely(op->opts.nocow && c->opts.nocow_enabled)) { bch2_nocow_write(op); - if (op->flags & BCH_WRITE_SUBMITTED) + if (op->flags & BCH_WRITE_submitted) goto out_nofs_restore; } again: @@ -1466,7 +1535,7 @@ again: ret = bch2_trans_run(c, lockrestart_do(trans, bch2_alloc_sectors_start_trans(trans, op->target, - op->opts.erasure_code && !(op->flags & BCH_WRITE_CACHED), + op->opts.erasure_code && !(op->flags & BCH_WRITE_cached), op->write_point, &op->devs_have, op->nr_replicas, @@ -1489,16 +1558,12 @@ again: bch2_alloc_sectors_done_inlined(c, wp); err: if (ret <= 0) { - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_submitted; if (unlikely(ret < 0)) { - if (!(op->flags & BCH_WRITE_ALLOC_NOWAIT)) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "%s(): %s", __func__, bch2_err_str(ret)); - bch_err_ratelimited(c, "%s", buf.buf); - printbuf_exit(&buf); - } + if (!(op->flags & BCH_WRITE_alloc_nowait)) + bch2_write_op_error(op, op->pos.offset, + "%s(): %s", __func__, bch2_err_str(ret)); op->error = ret; break; } @@ -1524,14 +1589,14 @@ err: * synchronously here if we weren't able to submit all of the IO at * once, as that signals backpressure to the caller. 
*/ - if ((op->flags & BCH_WRITE_SYNC) || - (!(op->flags & BCH_WRITE_SUBMITTED) && - !(op->flags & BCH_WRITE_IN_WORKER))) { + if ((op->flags & BCH_WRITE_sync) || + (!(op->flags & BCH_WRITE_submitted) && + !(op->flags & BCH_WRITE_in_worker))) { bch2_wait_on_allocator(c, &op->cl); __bch2_write_index(op); - if (!(op->flags & BCH_WRITE_SUBMITTED)) + if (!(op->flags & BCH_WRITE_submitted)) goto again; bch2_write_done(&op->cl); } else { @@ -1552,8 +1617,8 @@ static void bch2_write_data_inline(struct bch_write_op *op, unsigned data_len) memset(&op->failed, 0, sizeof(op->failed)); - op->flags |= BCH_WRITE_WROTE_DATA_INLINE; - op->flags |= BCH_WRITE_SUBMITTED; + op->flags |= BCH_WRITE_wrote_data_inline; + op->flags |= BCH_WRITE_submitted; bch2_check_set_feature(op->c, BCH_FEATURE_inline_data); @@ -1616,8 +1681,10 @@ CLOSURE_CALLBACK(bch2_write) BUG_ON(!op->write_point.v); BUG_ON(bkey_eq(op->pos, POS_MAX)); - if (op->flags & BCH_WRITE_ONLY_SPECIFIED_DEVS) - op->flags |= BCH_WRITE_ALLOC_NOWAIT; + async_object_list_add(c, write_op, op, &op->list_idx); + + if (op->flags & BCH_WRITE_only_specified_devs) + op->flags |= BCH_WRITE_alloc_nowait; op->nr_replicas_required = min_t(unsigned, op->nr_replicas_required, op->nr_replicas); op->start_time = local_clock(); @@ -1625,11 +1692,8 @@ CLOSURE_CALLBACK(bch2_write) wbio_init(bio)->put_bio = false; if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { - struct printbuf buf = PRINTBUF; - bch2_write_op_error(&buf, op); - prt_printf(&buf, "misaligned write"); - printbuf_exit(&buf); - op->error = -EIO; + bch2_write_op_error(op, op->pos.offset, "misaligned write"); + op->error = -BCH_ERR_data_write_misaligned; goto err; } @@ -1638,13 +1702,14 @@ CLOSURE_CALLBACK(bch2_write) goto err; } - if (!(op->flags & BCH_WRITE_MOVE) && - !bch2_write_ref_tryget(c, BCH_WRITE_REF_write)) { + if (!(op->flags & BCH_WRITE_move) && + !enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_write)) { op->error = -BCH_ERR_erofs_no_writes; goto err; } - this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); + if (!(op->flags & BCH_WRITE_move)) + this_cpu_add(c->counters[BCH_COUNTER_io_write], bio_sectors(bio)); bch2_increment_clock(c, bio_sectors(bio), WRITE); data_len = min_t(u64, bio->bi_iter.bi_size, @@ -1662,6 +1727,7 @@ err: bch2_disk_reservation_put(c, &op->res); closure_debug_destroy(&op->cl); + async_object_list_del(c, write_op, op->list_idx); if (op->end_io) op->end_io(op); } @@ -1675,27 +1741,33 @@ static const char * const bch2_write_flags[] = { void bch2_write_op_to_text(struct printbuf *out, struct bch_write_op *op) { - prt_str(out, "pos: "); + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); + + prt_printf(out, "pos:\t"); bch2_bpos_to_text(out, op->pos); prt_newline(out); printbuf_indent_add(out, 2); - prt_str(out, "started: "); + prt_printf(out, "started:\t"); bch2_pr_time_units(out, local_clock() - op->start_time); prt_newline(out); - prt_str(out, "flags: "); + prt_printf(out, "flags:\t"); prt_bitflags(out, bch2_write_flags, op->flags); prt_newline(out); - prt_printf(out, "ref: %u\n", closure_nr_remaining(&op->cl)); + prt_printf(out, "nr_replicas:\t%u\n", op->nr_replicas); + prt_printf(out, "nr_replicas_required:\t%u\n", op->nr_replicas_required); + + prt_printf(out, "ref:\t%u\n", closure_nr_remaining(&op->cl)); + prt_printf(out, "ret\t%s\n", bch2_err_str(op->error)); printbuf_indent_sub(out, 2); } void bch2_fs_io_write_exit(struct bch_fs *c) { - mempool_exit(&c->bio_bounce_pages); bioset_exit(&c->replica_set); bioset_exit(&c->bio_write); } @@ 
-1706,12 +1778,5 @@ int bch2_fs_io_write_init(struct bch_fs *c) bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0)) return -BCH_ERR_ENOMEM_bio_write_init; - if (mempool_init_page_pool(&c->bio_bounce_pages, - max_t(unsigned, - c->opts.btree_node_size, - c->opts.encoded_extent_max) / - PAGE_SIZE, 0)) - return -BCH_ERR_ENOMEM_bio_bounce_pages_init; - return 0; } diff --git a/fs/bcachefs/io_write.h b/fs/bcachefs/io_write.h index b4626013abc8..2c0a8f35ee1f 100644 --- a/fs/bcachefs/io_write.h +++ b/fs/bcachefs/io_write.h @@ -11,45 +11,11 @@ void bch2_bio_free_pages_pool(struct bch_fs *, struct bio *); void bch2_bio_alloc_pages_pool(struct bch_fs *, struct bio *, size_t); -#ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT -void bch2_latency_acct(struct bch_dev *, u64, int); -#else -static inline void bch2_latency_acct(struct bch_dev *ca, u64 submit_time, int rw) {} -#endif - void bch2_submit_wbio_replicas(struct bch_write_bio *, struct bch_fs *, enum bch_data_type, const struct bkey_i *, bool); -void bch2_write_op_error(struct printbuf *out, struct bch_write_op *op); - -#define BCH_WRITE_FLAGS() \ - x(ALLOC_NOWAIT) \ - x(CACHED) \ - x(DATA_ENCODED) \ - x(PAGES_STABLE) \ - x(PAGES_OWNED) \ - x(ONLY_SPECIFIED_DEVS) \ - x(WROTE_DATA_INLINE) \ - x(FROM_INTERNAL) \ - x(CHECK_ENOSPC) \ - x(SYNC) \ - x(MOVE) \ - x(IN_WORKER) \ - x(SUBMITTED) \ - x(IO_ERROR) \ - x(CONVERT_UNWRITTEN) - -enum __bch_write_flags { -#define x(f) __BCH_WRITE_##f, - BCH_WRITE_FLAGS() -#undef x -}; - -enum bch_write_flags { -#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), - BCH_WRITE_FLAGS() -#undef x -}; +__printf(3, 4) +void bch2_write_op_error(struct bch_write_op *op, u64, const char *, ...); static inline struct workqueue_struct *index_update_wq(struct bch_write_op *op) { diff --git a/fs/bcachefs/io_write_types.h b/fs/bcachefs/io_write_types.h index 6e878a6f2f0b..5da4eb8bb6f6 100644 --- a/fs/bcachefs/io_write_types.h +++ b/fs/bcachefs/io_write_types.h @@ -13,6 +13,34 @@ #include <linux/llist.h> #include <linux/workqueue.h> +#define BCH_WRITE_FLAGS() \ + x(alloc_nowait) \ + x(cached) \ + x(data_encoded) \ + x(pages_stable) \ + x(pages_owned) \ + x(only_specified_devs) \ + x(wrote_data_inline) \ + x(check_enospc) \ + x(sync) \ + x(move) \ + x(in_worker) \ + x(submitted) \ + x(io_error) \ + x(convert_unwritten) + +enum __bch_write_flags { +#define x(f) __BCH_WRITE_##f, + BCH_WRITE_FLAGS() +#undef x +}; + +enum bch_write_flags { +#define x(f) BCH_WRITE_##f = BIT(__BCH_WRITE_##f), + BCH_WRITE_FLAGS() +#undef x +}; + struct bch_write_bio { struct_group(wbio, struct bch_fs *c; @@ -43,6 +71,10 @@ struct bch_write_op { void (*end_io)(struct bch_write_op *); u64 start_time; +#ifdef CONFIG_BCACHEFS_ASYNC_OBJECT_LISTS + unsigned list_idx; +#endif + unsigned written; /* sectors */ u16 flags; s16 error; /* dio write path expects it to hold -ERESTARTSYS... 
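
The BCH_WRITE_FLAGS() list that moves into io_write_types.h here drives the index enum, the bitmask enum, and the bch2_write_flags[] name table in io_write.c from a single definition. A freestanding sketch of that x-macro pattern (flag names abbreviated):

    #include <stdio.h>

    #define WRITE_FLAGS()       \
            x(sync)             \
            x(move)             \
            x(submitted)

    /* One expansion per use: indices, bit masks, and printable names. */
    enum __write_flags {
    #define x(f) __WRITE_##f,
            WRITE_FLAGS()
    #undef x
    };

    enum write_flags {
    #define x(f) WRITE_##f = 1U << __WRITE_##f,
            WRITE_FLAGS()
    #undef x
    };

    static const char * const write_flag_names[] = {
    #define x(f) #f,
            WRITE_FLAGS()
    #undef x
            NULL
    };

    int main(void)
    {
            unsigned flags = WRITE_sync | WRITE_submitted;

            for (unsigned i = 0; write_flag_names[i]; i++)
                    if (flags & (1U << i))
                            printf("%s\n", write_flag_names[i]);
            return 0;
    }
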
*/ @@ -64,7 +96,7 @@ struct bch_write_op { struct bpos pos; struct bversion version; - /* For BCH_WRITE_DATA_ENCODED: */ + /* For BCH_WRITE_data_encoded: */ struct bch_extent_crc_unpacked crc; struct write_point_specifier write_point; diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 24c294d4634e..09b70fd140a1 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -12,6 +12,7 @@ #include "btree_update.h" #include "btree_write_buffer.h" #include "buckets.h" +#include "enumerated_ref.h" #include "error.h" #include "journal.h" #include "journal_io.h" @@ -20,13 +21,6 @@ #include "journal_seq_blacklist.h" #include "trace.h" -static const char * const bch2_journal_errors[] = { -#define x(n) #n, - JOURNAL_ERRORS() -#undef x - NULL -}; - static inline bool journal_seq_unwritten(struct journal *j, u64 seq) { return seq > j->seq_ondisk; @@ -56,14 +50,20 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 prt_printf(out, "seq:\t%llu\n", seq); printbuf_indent_add(out, 2); - prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i)); + if (!buf->write_started) + prt_printf(out, "refcount:\t%u\n", journal_state_count(s, i & JOURNAL_STATE_BUF_MASK)); - prt_printf(out, "size:\t"); - prt_human_readable_u64(out, vstruct_bytes(buf->data)); - prt_newline(out); + struct closure *cl = &buf->io; + int r = atomic_read(&cl->remaining); + prt_printf(out, "io:\t%pS r %i\n", cl->fn, r & CLOSURE_REMAINING_MASK); + + if (buf->data) { + prt_printf(out, "size:\t"); + prt_human_readable_u64(out, vstruct_bytes(buf->data)); + prt_newline(out); + } - prt_printf(out, "expires:\t"); - prt_printf(out, "%li jiffies\n", buf->expires - jiffies); + prt_printf(out, "expires:\t%li jiffies\n", buf->expires - jiffies); prt_printf(out, "flags:\t"); if (buf->noflush) @@ -87,6 +87,9 @@ static void bch2_journal_buf_to_text(struct printbuf *out, struct journal *j, u6 static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) { + lockdep_assert_held(&j->lock); + out->atomic++; + if (!out->nr_tabstops) printbuf_tabstop_push(out, 24); @@ -95,6 +98,8 @@ static void bch2_journal_bufs_to_text(struct printbuf *out, struct journal *j) seq++) bch2_journal_buf_to_text(out, j, seq); prt_printf(out, "last buf %s\n", journal_entry_is_open(j) ? "open" : "closed"); + + --out->atomic; } static inline struct journal_buf * @@ -104,10 +109,8 @@ journal_seq_to_buf(struct journal *j, u64 seq) EBUG_ON(seq > journal_cur_seq(j)); - if (journal_seq_unwritten(j, seq)) { + if (journal_seq_unwritten(j, seq)) buf = j->buf + (seq & JOURNAL_BUF_MASK); - EBUG_ON(le64_to_cpu(buf->data->seq) != seq); - } return buf; } @@ -139,8 +142,10 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) bool stuck = false; struct printbuf buf = PRINTBUF; - if (!(error == JOURNAL_ERR_journal_full || - error == JOURNAL_ERR_journal_pin_full) || + buf.atomic++; + + if (!(error == -BCH_ERR_journal_full || + error == -BCH_ERR_journal_pin_full) || nr_unwritten_journal_entries(j) || (flags & BCH_WATERMARK_MASK) != BCH_WATERMARK_reclaim) return stuck; @@ -164,12 +169,12 @@ journal_error_check_stuck(struct journal *j, int error, unsigned flags) return stuck; } j->err_seq = journal_cur_seq(j); - spin_unlock(&j->lock); - bch_err(c, "Journal stuck! 
Have a pre-reservation but journal full (error %s)", - bch2_journal_errors[error]); - bch2_journal_debug_to_text(&buf, j); - bch_err(c, "%s", buf.buf); + __bch2_journal_debug_to_text(&buf, j); + spin_unlock(&j->lock); + prt_printf(&buf, bch2_fmt(c, "Journal stuck! Have a pre-reservation but journal full (error %s)"), + bch2_err_str(error)); + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_reset(&buf); bch2_journal_pins_to_text(&buf, j); @@ -195,7 +200,8 @@ void bch2_journal_do_writes(struct journal *j) if (w->write_started) continue; - if (!journal_state_count(j->reservations, idx)) { + if (!journal_state_seq_count(j, j->reservations, seq)) { + j->seq_write_started = seq; w->write_started = true; closure_call(&w->io, bch2_journal_write, j->wq, NULL); } @@ -276,7 +282,24 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t sectors = vstruct_blocks_plus(buf->data, c->block_bits, buf->u64s_reserved) << c->block_bits; - BUG_ON(sectors > buf->sectors); + if (unlikely(sectors > buf->sectors)) { + struct printbuf err = PRINTBUF; + err.atomic++; + + prt_printf(&err, "journal entry overran reserved space: %u > %u\n", + sectors, buf->sectors); + prt_printf(&err, "buf u64s %u u64s reserved %u cur_entry_u64s %u block_bits %u\n", + le32_to_cpu(buf->data->u64s), buf->u64s_reserved, + j->cur_entry_u64s, + c->block_bits); + prt_printf(&err, "fatal error - emergency read only"); + bch2_journal_halt_locked(j); + + bch_err(c, "%s", err.buf); + printbuf_exit(&err); + return; + } + buf->sectors = sectors; /* @@ -306,17 +329,7 @@ static void __journal_entry_close(struct journal *j, unsigned closed_val, bool t bch2_journal_space_available(j); - __bch2_journal_buf_put(j, old.idx, le64_to_cpu(buf->data->seq)); -} - -void bch2_journal_halt(struct journal *j) -{ - spin_lock(&j->lock); - __journal_entry_close(j, JOURNAL_ENTRY_ERROR_VAL, true); - if (!j->err_seq) - j->err_seq = journal_cur_seq(j); - journal_wake(j); - spin_unlock(&j->lock); + __bch2_journal_buf_put(j, le64_to_cpu(buf->data->seq)); } void bch2_journal_halt_locked(struct journal *j) @@ -329,6 +342,13 @@ void bch2_journal_halt_locked(struct journal *j) journal_wake(j); } +void bch2_journal_halt(struct journal *j) +{ + spin_lock(&j->lock); + bch2_journal_halt_locked(j); + spin_unlock(&j->lock); +} + static bool journal_entry_want_write(struct journal *j) { bool ret = !journal_entry_is_open(j) || @@ -377,29 +397,41 @@ static int journal_entry_open(struct journal *j) BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); if (j->blocked) - return JOURNAL_ERR_blocked; + return -BCH_ERR_journal_blocked; if (j->cur_entry_error) return j->cur_entry_error; - if (bch2_journal_error(j)) - return JOURNAL_ERR_insufficient_devices; /* -EROFS */ + int ret = bch2_journal_error(j); + if (unlikely(ret)) + return ret; if (!fifo_free(&j->pin)) - return JOURNAL_ERR_journal_pin_full; + return -BCH_ERR_journal_pin_full; if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) - return JOURNAL_ERR_max_in_flight; + return -BCH_ERR_journal_max_in_flight; - if (journal_cur_seq(j) >= JOURNAL_SEQ_MAX) { + if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR) + return -BCH_ERR_journal_max_open; + + if (unlikely(journal_cur_seq(j) >= JOURNAL_SEQ_MAX)) { bch_err(c, "cannot start: journal seq overflow"); if (bch2_fs_emergency_read_only_locked(c)) bch_err(c, "fatal error - emergency read only"); - return JOURNAL_ERR_insufficient_devices; /* -EROFS */ + return -BCH_ERR_journal_shutdown; } + if (!j->free_buf && !buf->data) + return 
-BCH_ERR_journal_buf_enomem; /* will retry after write completion frees up a buf */ + BUG_ON(!j->cur_entry_sectors); + if (!buf->data) { + swap(buf->data, j->free_buf); + swap(buf->buf_size, j->free_buf_size); + } + buf->expires = (journal_cur_seq(j) == j->flushed_seq_ondisk ? jiffies @@ -415,7 +447,7 @@ static int journal_entry_open(struct journal *j) u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); if (u64s <= (ssize_t) j->early_journal_entries.nr) - return JOURNAL_ERR_journal_full; + return -BCH_ERR_journal_full; if (fifo_empty(&j->pin) && j->reclaim_thread) wake_up_process(j->reclaim_thread); @@ -427,6 +459,14 @@ static int journal_entry_open(struct journal *j) atomic64_inc(&j->seq); journal_pin_list_init(fifo_push_ref(&j->pin), 1); + if (unlikely(bch2_journal_seq_is_blacklisted(c, journal_cur_seq(j), false))) { + bch_err(c, "attempting to open blacklisted journal seq %llu", + journal_cur_seq(j)); + if (bch2_fs_emergency_read_only_locked(c)) + bch_err(c, "fatal error - emergency read only"); + return -BCH_ERR_journal_shutdown; + } + BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); BUG_ON(j->buf + (journal_cur_seq(j) & JOURNAL_BUF_MASK) != buf); @@ -464,7 +504,7 @@ static int journal_entry_open(struct journal *j) new.idx++; BUG_ON(journal_state_count(new, new.idx)); - BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_BUF_MASK)); + BUG_ON(new.idx != (journal_cur_seq(j) & JOURNAL_STATE_BUF_MASK)); journal_state_inc(&new); @@ -514,6 +554,33 @@ static void journal_write_work(struct work_struct *work) spin_unlock(&j->lock); } +static void journal_buf_prealloc(struct journal *j) +{ + if (j->free_buf && + j->free_buf_size >= j->buf_size_want) + return; + + unsigned buf_size = j->buf_size_want; + + spin_unlock(&j->lock); + void *buf = kvmalloc(buf_size, GFP_NOFS); + spin_lock(&j->lock); + + if (buf && + (!j->free_buf || + buf_size > j->free_buf_size)) { + swap(buf, j->free_buf); + swap(buf_size, j->free_buf_size); + } + + if (unlikely(buf)) { + spin_unlock(&j->lock); + /* kvfree can sleep */ + kvfree(buf); + spin_lock(&j->lock); + } +} + static int __journal_res_get(struct journal *j, struct journal_res *res, unsigned flags) { @@ -525,25 +592,28 @@ retry: if (journal_res_get_fast(j, res, flags)) return 0; - if (bch2_journal_error(j)) - return -BCH_ERR_erofs_journal_err; + ret = bch2_journal_error(j); + if (unlikely(ret)) + return ret; if (j->blocked) - return -BCH_ERR_journal_res_get_blocked; + return -BCH_ERR_journal_blocked; if ((flags & BCH_WATERMARK_MASK) < j->watermark) { - ret = JOURNAL_ERR_journal_full; + ret = -BCH_ERR_journal_full; can_discard = j->can_discard; goto out; } if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) { - ret = JOURNAL_ERR_max_in_flight; + ret = -BCH_ERR_journal_max_in_flight; goto out; } spin_lock(&j->lock); + journal_buf_prealloc(j); + /* * Recheck after taking the lock, so we don't race with another thread * that just did journal_entry_open() and call bch2_journal_entry_close() @@ -566,25 +636,48 @@ retry: j->buf_size_want = max(j->buf_size_want, buf->buf_size << 1); __journal_entry_close(j, JOURNAL_ENTRY_CLOSED_VAL, false); - ret = journal_entry_open(j) ?: JOURNAL_ERR_retry; + ret = journal_entry_open(j) ?: -BCH_ERR_journal_retry_open; unlock: can_discard = j->can_discard; spin_unlock(&j->lock); out: - if (ret == JOURNAL_ERR_retry) - goto retry; - if (!ret) + if (likely(!ret)) return 0; + if (ret == -BCH_ERR_journal_retry_open) + goto retry; if (journal_error_check_stuck(j, ret, flags)) - ret = 
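
journal_buf_prealloc() above follows a drop-the-lock allocation pattern: release j->lock around the sleeping kvmalloc(), retake it, install the buffer only if it still beats the current free_buf, and free any loser only after unlocking again. A pthread sketch of the same shape (self-locking here for a standalone demo; the kernel helper runs with j->lock already held):

    #include <pthread.h>
    #include <stdlib.h>

    struct jnl {
            pthread_mutex_t lock;
            void           *free_buf;
            size_t          free_buf_size;
            size_t          buf_size_want;
    };

    static void buf_prealloc(struct jnl *j)
    {
            pthread_mutex_lock(&j->lock);
            if (j->free_buf && j->free_buf_size >= j->buf_size_want) {
                    pthread_mutex_unlock(&j->lock);
                    return;
            }

            size_t size = j->free_buf_size = j->free_buf_size, want = j->buf_size_want;
            pthread_mutex_unlock(&j->lock);

            void *buf = malloc(want);       /* sleeping allocation, lock dropped */

            pthread_mutex_lock(&j->lock);
            if (buf && (!j->free_buf || want > j->free_buf_size)) {
                    void *old = j->free_buf;        /* install winner, keep loser */

                    j->free_buf      = buf;
                    j->free_buf_size = want;
                    buf = old;
            }
            pthread_mutex_unlock(&j->lock);
            (void)size;

            free(buf);      /* loser (possibly NULL) freed outside the lock */
    }
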
-BCH_ERR_journal_res_get_blocked; + ret = -BCH_ERR_journal_stuck; - if (ret == JOURNAL_ERR_max_in_flight && - track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true)) { + if (ret == -BCH_ERR_journal_max_in_flight && + track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) && + trace_journal_entry_full_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_printbuf_make_room(&buf, 4096); + spin_lock(&j->lock); + prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); + bch2_journal_bufs_to_text(&buf, j); + spin_unlock(&j->lock); + + trace_journal_entry_full(c, buf.buf); + printbuf_exit(&buf); + count_event(c, journal_entry_full); + } + + if (ret == -BCH_ERR_journal_max_open && + track_event_change(&c->times[BCH_TIME_blocked_journal_max_open], true) && + trace_journal_entry_full_enabled()) { struct printbuf buf = PRINTBUF; + + bch2_printbuf_make_room(&buf, 4096); + + spin_lock(&j->lock); prt_printf(&buf, "seq %llu\n", journal_cur_seq(j)); bch2_journal_bufs_to_text(&buf, j); + spin_unlock(&j->lock); + trace_journal_entry_full(c, buf.buf); printbuf_exit(&buf); count_event(c, journal_entry_full); @@ -594,8 +687,8 @@ out: * Journal is full - can't rely on reclaim from work item due to * freezing: */ - if ((ret == JOURNAL_ERR_journal_full || - ret == JOURNAL_ERR_journal_pin_full) && + if ((ret == -BCH_ERR_journal_full || + ret == -BCH_ERR_journal_pin_full) && !(flags & JOURNAL_RES_GET_NONBLOCK)) { if (can_discard) { bch2_journal_do_discards(j); @@ -608,17 +701,17 @@ out: } } - return ret == JOURNAL_ERR_insufficient_devices - ? -BCH_ERR_erofs_journal_err - : -BCH_ERR_journal_res_get_blocked; + return ret; } static unsigned max_dev_latency(struct bch_fs *c) { u64 nsecs = 0; - for_each_rw_member(c, ca) + rcu_read_lock(); + for_each_rw_member_rcu(c, ca) nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration); + rcu_read_unlock(); return nsecs_to_jiffies(nsecs); } @@ -640,7 +733,7 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, int ret; if (closure_wait_event_timeout(&j->async_wait, - (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || (flags & JOURNAL_RES_GET_NONBLOCK), HZ)) return ret; @@ -654,19 +747,19 @@ int bch2_journal_res_get_slowpath(struct journal *j, struct journal_res *res, remaining_wait = max(0, remaining_wait - HZ); if (closure_wait_event_timeout(&j->async_wait, - (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || (flags & JOURNAL_RES_GET_NONBLOCK), remaining_wait)) return ret; struct printbuf buf = PRINTBUF; bch2_journal_debug_to_text(&buf, j); - bch_err(c, "Journal stuck? Waited for 10 seconds...\n%s", - buf.buf); + bch2_print_str(c, KERN_ERR, buf.buf); + prt_printf(&buf, bch2_fmt(c, "Journal stuck? 
Waited for 10 seconds, err %s"), bch2_err_str(ret)); printbuf_exit(&buf); closure_wait_event(&j->async_wait, - (ret = __journal_res_get(j, res, flags)) != -BCH_ERR_journal_res_get_blocked || + !bch2_err_matches(ret = __journal_res_get(j, res, flags), BCH_ERR_operation_blocked) || (flags & JOURNAL_RES_GET_NONBLOCK)); return ret; } @@ -687,7 +780,6 @@ void bch2_journal_entry_res_resize(struct journal *j, goto out; j->cur_entry_u64s = max_t(int, 0, j->cur_entry_u64s - d); - smp_mb(); state = READ_ONCE(j->reservations); if (state.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL && @@ -906,11 +998,11 @@ int bch2_journal_meta(struct journal *j) { struct bch_fs *c = container_of(j, struct bch_fs, journal); - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_journal)) - return -EROFS; + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_journal)) + return -BCH_ERR_erofs_no_writes; int ret = __bch2_journal_meta(j); - bch2_write_ref_put(c, BCH_WRITE_REF_journal); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_journal); return ret; } @@ -951,7 +1043,8 @@ static void __bch2_journal_block(struct journal *j) new.cur_entry_offset = JOURNAL_ENTRY_BLOCKED_VAL; } while (!atomic64_try_cmpxchg(&j->reservations.counter, &old.v, new.v)); - journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); + if (old.cur_entry_offset < JOURNAL_ENTRY_BLOCKED_VAL) + journal_cur_buf(j)->data->u64s = cpu_to_le32(old.cur_entry_offset); } } @@ -992,7 +1085,7 @@ static struct journal_buf *__bch2_next_write_buffer_flush_journal_buf(struct jou *blocked = true; } - ret = journal_state_count(s, idx) > open + ret = journal_state_count(s, idx & JOURNAL_STATE_BUF_MASK) > open ? ERR_PTR(-EAGAIN) : buf; break; @@ -1021,8 +1114,8 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *j, /* allocate journal on a device: */ -static int __bch2_set_nr_journal_buckets(struct bch_dev *ca, unsigned nr, - bool new_fs, struct closure *cl) +static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr, + bool new_fs, struct closure *cl) { struct bch_fs *c = ca->fs; struct journal_device *ja = &ca->journal; @@ -1150,26 +1243,20 @@ err_free: return ret; } -/* - * Allocate more journal space at runtime - not currently making use of it, but - * the code works: - */ -int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, - unsigned nr) +static int bch2_set_nr_journal_buckets_loop(struct bch_fs *c, struct bch_dev *ca, + unsigned nr, bool new_fs) { struct journal_device *ja = &ca->journal; - struct closure cl; int ret = 0; + struct closure cl; closure_init_stack(&cl); - down_write(&c->state_lock); - /* don't handle reducing nr of buckets yet: */ if (nr < ja->nr) - goto unlock; + return 0; - while (ja->nr < nr) { + while (!ret && ja->nr < nr) { struct disk_reservation disk_res = { 0, 0, 0 }; /* @@ -1182,30 +1269,53 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, * filesystem-wide allocation will succeed, this is a device * specific allocation - we can hang here: */ + if (!new_fs) { + ret = bch2_disk_reservation_get(c, &disk_res, + bucket_to_sector(ca, nr - ja->nr), 1, 0); + if (ret) + break; + } - ret = bch2_disk_reservation_get(c, &disk_res, - bucket_to_sector(ca, nr - ja->nr), 1, 0); - if (ret) - break; + ret = bch2_set_nr_journal_buckets_iter(ca, nr, new_fs, &cl); - ret = __bch2_set_nr_journal_buckets(ca, nr, false, &cl); + if (ret == -BCH_ERR_bucket_alloc_blocked || + ret == -BCH_ERR_open_buckets_empty) + ret = 0; /* wait and retry */ bch2_disk_reservation_put(c, 
&disk_res); - closure_sync(&cl); - - if (ret && ret != -BCH_ERR_bucket_alloc_blocked) - break; } - bch_err_fn(c, ret); -unlock: + return ret; +} + +/* + * Allocate more journal space at runtime - not currently making use of it, but + * the code works: + */ +int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, + unsigned nr) +{ + down_write(&c->state_lock); + int ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, false); up_write(&c->state_lock); + + bch_err_fn(c, ret); return ret; } int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) { + struct bch_fs *c = ca->fs; + + if (!(ca->mi.data_allowed & BIT(BCH_DATA_journal))) + return 0; + + if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { + bch_err(c, "cannot allocate journal, filesystem is an unresized image file"); + return -BCH_ERR_erofs_filesystem_full; + } + unsigned nr; int ret; @@ -1226,7 +1336,7 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) min(1 << 13, (1 << 24) / ca->mi.bucket_size)); - ret = __bch2_set_nr_journal_buckets(ca, nr, new_fs, NULL); + ret = bch2_set_nr_journal_buckets_loop(c, ca, nr, new_fs); err: bch_err_fn(ca, ret); return ret; @@ -1234,13 +1344,14 @@ err: int bch2_fs_journal_alloc(struct bch_fs *c) { - for_each_online_member(c, ca) { + for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_journal_alloc) { if (ca->journal.nr) continue; int ret = bch2_dev_journal_alloc(ca, true); if (ret) { - percpu_ref_put(&ca->io_ref); + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_fs_journal_alloc); return ret; } } @@ -1312,6 +1423,13 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) bool had_entries = false; u64 last_seq = cur_seq, nr, seq; + /* + * + * XXX pick most recent non-blacklisted sequence number + */ + + cur_seq = max(cur_seq, bch2_journal_last_blacklisted_seq(c)); + if (cur_seq >= JOURNAL_SEQ_MAX) { bch_err(c, "cannot start: journal seq overflow"); return -EINVAL; @@ -1329,19 +1447,26 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) nr = cur_seq - last_seq; - if (nr + 1 > j->pin.size) { - free_fifo(&j->pin); - init_fifo(&j->pin, roundup_pow_of_two(nr + 1), GFP_KERNEL); - if (!j->pin.data) { - bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); - return -BCH_ERR_ENOMEM_journal_pin_fifo; - } + /* + * Extra fudge factor, in case we crashed when the journal pin fifo was + * nearly or completely full. 
We'll need to be able to open additional + * journal entries (at least a few) in order for journal replay to get + * going: + */ + nr += nr / 4; + + nr = max(nr, JOURNAL_PIN); + init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); + if (!j->pin.data) { + bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); + return -BCH_ERR_ENOMEM_journal_pin_fifo; } j->replay_journal_seq = last_seq; j->replay_journal_seq_end = cur_seq; j->last_seq_ondisk = last_seq; j->flushed_seq_ondisk = cur_seq - 1; + j->seq_write_started = cur_seq - 1; j->seq_ondisk = cur_seq - 1; j->pin.front = last_seq; j->pin.back = cur_seq; @@ -1378,19 +1503,29 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) j->last_empty_seq = cur_seq - 1; /* to match j->seq */ spin_lock(&j->lock); - - set_bit(JOURNAL_running, &j->flags); j->last_flush_write = jiffies; - j->reservations.idx = j->reservations.unwritten_idx = journal_cur_seq(j); - j->reservations.unwritten_idx++; + j->reservations.idx = journal_cur_seq(j); c->last_bucket_seq_cleanup = journal_cur_seq(j); + spin_unlock(&j->lock); + return 0; +} + +void bch2_journal_set_replay_done(struct journal *j) +{ + /* + * journal_space_available must happen before setting JOURNAL_running + * JOURNAL_running must happen before JOURNAL_replay_done + */ + spin_lock(&j->lock); bch2_journal_space_available(j); - spin_unlock(&j->lock); - return bch2_journal_reclaim_start(j); + set_bit(JOURNAL_need_flush_write, &j->flags); + set_bit(JOURNAL_running, &j->flags); + set_bit(JOURNAL_replay_done, &j->flags); + spin_unlock(&j->lock); } /* init/exit: */ @@ -1436,7 +1571,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); for (unsigned i = 0; i < ARRAY_SIZE(ja->bio); i++) { - ja->bio[i] = kmalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, + ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, nr_bvecs), GFP_KERNEL); if (!ja->bio[i]) return -BCH_ERR_ENOMEM_dev_journal_init; @@ -1475,10 +1610,11 @@ void bch2_fs_journal_exit(struct journal *j) for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) kvfree(j->buf[i].data); + kvfree(j->free_buf); free_fifo(&j->pin); } -int bch2_fs_journal_init(struct journal *j) +void bch2_fs_journal_init_early(struct journal *j) { static struct lock_class_key res_key; @@ -1497,19 +1633,17 @@ int bch2_fs_journal_init(struct journal *j) atomic64_set(&j->reservations.counter, ((union journal_res_state) { .cur_entry_offset = JOURNAL_ENTRY_CLOSED_VAL }).v); +} - if (!(init_fifo(&j->pin, JOURNAL_PIN, GFP_KERNEL))) - return -BCH_ERR_ENOMEM_journal_pin_fifo; +int bch2_fs_journal_init(struct journal *j) +{ + j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN; + j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL); + if (!j->free_buf) + return -BCH_ERR_ENOMEM_journal_buf; - for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) { - j->buf[i].buf_size = JOURNAL_ENTRY_SIZE_MIN; - j->buf[i].data = kvmalloc(j->buf[i].buf_size, GFP_KERNEL); - if (!j->buf[i].data) - return -BCH_ERR_ENOMEM_journal_buf; + for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) j->buf[i].idx = i; - } - - j->pin.front = j->pin.back = 1; j->wq = alloc_workqueue("bcachefs_journal", WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512); @@ -1557,6 +1691,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "average write size:\t"); prt_human_readable_u64(out, nr_writes ? 
div64_u64(j->entry_bytes_written, nr_writes) : 0); prt_newline(out); + prt_printf(out, "free buf:\t%u\n", j->free_buf ? j->free_buf_size : 0); prt_printf(out, "nr direct reclaim:\t%llu\n", j->nr_direct_reclaim); prt_printf(out, "nr background reclaim:\t%llu\n", j->nr_background_reclaim); prt_printf(out, "reclaim kicked:\t%u\n", j->reclaim_kicked); @@ -1564,7 +1699,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) ? jiffies_to_msecs(j->next_reclaim - jiffies) : 0); prt_printf(out, "blocked:\t%u\n", j->blocked); prt_printf(out, "current entry sectors:\t%u\n", j->cur_entry_sectors); - prt_printf(out, "current entry error:\t%s\n", bch2_journal_errors[j->cur_entry_error]); + prt_printf(out, "current entry error:\t%s\n", bch2_err_str(j->cur_entry_error)); prt_printf(out, "current entry:\t"); switch (s.cur_entry_offset) { diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 107f7f901cd9..8ff00a0ec778 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -121,11 +121,6 @@ static inline void journal_wake(struct journal *j) closure_wake_up(&j->async_wait); } -static inline struct journal_buf *journal_cur_buf(struct journal *j) -{ - return j->buf + j->reservations.idx; -} - /* Sequence number of oldest dirty journal entry */ static inline u64 journal_last_seq(struct journal *j) @@ -143,6 +138,15 @@ static inline u64 journal_last_unwritten_seq(struct journal *j) return j->seq_ondisk + 1; } +static inline struct journal_buf *journal_cur_buf(struct journal *j) +{ + unsigned idx = (journal_cur_seq(j) & + JOURNAL_BUF_MASK & + ~JOURNAL_STATE_BUF_MASK) + j->reservations.idx; + + return j->buf + idx; +} + static inline int journal_state_count(union journal_res_state s, int idx) { switch (idx) { @@ -154,6 +158,15 @@ static inline int journal_state_count(union journal_res_state s, int idx) BUG(); } +static inline int journal_state_seq_count(struct journal *j, + union journal_res_state s, u64 seq) +{ + if (journal_cur_seq(j) - seq < JOURNAL_STATE_BUF_NR) + return journal_state_count(s, seq & JOURNAL_STATE_BUF_MASK); + else + return 0; +} + static inline void journal_state_inc(union journal_res_state *s) { s->buf0_count += s->idx == 0; @@ -193,7 +206,7 @@ bch2_journal_add_entry_noreservation(struct journal_buf *buf, size_t u64s) static inline struct jset_entry * journal_res_entry(struct journal *j, struct journal_res *res) { - return vstruct_idx(j->buf[res->idx].data, res->offset); + return vstruct_idx(j->buf[res->seq & JOURNAL_BUF_MASK].data, res->offset); } static inline unsigned journal_entry_init(struct jset_entry *entry, unsigned type, @@ -267,8 +280,9 @@ bool bch2_journal_entry_close(struct journal *); void bch2_journal_do_writes(struct journal *); void bch2_journal_buf_put_final(struct journal *, u64); -static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) +static inline void __bch2_journal_buf_put(struct journal *j, u64 seq) { + unsigned idx = seq & JOURNAL_STATE_BUF_MASK; union journal_res_state s; s = journal_state_buf_put(j, idx); @@ -276,8 +290,9 @@ static inline void __bch2_journal_buf_put(struct journal *j, unsigned idx, u64 s bch2_journal_buf_put_final(j, seq); } -static inline void bch2_journal_buf_put(struct journal *j, unsigned idx, u64 seq) +static inline void bch2_journal_buf_put(struct journal *j, u64 seq) { + unsigned idx = seq & JOURNAL_STATE_BUF_MASK; union journal_res_state s; s = journal_state_buf_put(j, idx); @@ -306,7 +321,7 @@ static inline void bch2_journal_res_put(struct journal *j, 
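
With journal buffers now addressed by sequence number, a slot in the buffer ring is just the seq masked by the ring size, and the smaller reservation-state ring only distinguishes the low bits; a count read is meaningful only for the newest JOURNAL_STATE_BUF_NR sequences, as journal_state_seq_count() above checks. A small sketch of that indexing, with illustrative ring sizes:

    #include <stdio.h>

    #define BUF_NR          4       /* stands in for JOURNAL_BUF_NR */
    #define BUF_MASK        (BUF_NR - 1)
    #define STATE_BUF_NR    2       /* stands in for JOURNAL_STATE_BUF_NR */
    #define STATE_BUF_MASK  (STATE_BUF_NR - 1)

    /* Counts from older, reused state slots must read as zero. */
    static int state_seq_count(unsigned long long cur_seq,
                               unsigned long long seq,
                               const int counts[STATE_BUF_NR])
    {
            return cur_seq - seq < STATE_BUF_NR
                    ? counts[seq & STATE_BUF_MASK] : 0;
    }

    int main(void)
    {
            int counts[STATE_BUF_NR] = { 3, 1 };    /* fake refcounts */

            for (unsigned long long seq = 20; seq <= 23; seq++)
                    printf("seq %llu -> buf slot %llu, state count %d\n",
                           seq, seq & BUF_MASK,
                           state_seq_count(23, seq, counts));
            return 0;
    }
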
BCH_JSET_ENTRY_btree_keys, 0, 0, 0); - bch2_journal_buf_put(j, res->idx, res->seq); + bch2_journal_buf_put(j, res->seq); res->ref = 0; } @@ -335,8 +350,10 @@ static inline int journal_res_get_fast(struct journal *j, /* * Check if there is still room in the current journal - * entry: + * entry, smp_rmb() guarantees that reads from reservations.counter + * occur before accessing cur_entry_u64s: */ + smp_rmb(); if (new.cur_entry_offset + res->u64s > j->cur_entry_u64s) return 0; @@ -361,9 +378,9 @@ static inline int journal_res_get_fast(struct journal *j, &old.v, new.v)); res->ref = true; - res->idx = old.idx; res->offset = old.cur_entry_offset; - res->seq = le64_to_cpu(j->buf[old.idx].data->seq); + res->seq = journal_cur_seq(j); + res->seq -= (res->seq - old.idx) & JOURNAL_STATE_BUF_MASK; return 1; } @@ -390,6 +407,7 @@ out: (flags & JOURNAL_RES_GET_NONBLOCK) != 0, NULL, _THIS_IP_); EBUG_ON(!res->ref); + BUG_ON(!res->seq); } return 0; } @@ -408,8 +426,8 @@ int bch2_journal_flush(struct journal *); bool bch2_journal_noflush_seq(struct journal *, u64, u64); int bch2_journal_meta(struct journal *); -void bch2_journal_halt(struct journal *); void bch2_journal_halt_locked(struct journal *); +void bch2_journal_halt(struct journal *); static inline int bch2_journal_error(struct journal *j) { @@ -419,12 +437,6 @@ static inline int bch2_journal_error(struct journal *j) struct bch_dev; -static inline void bch2_journal_set_replay_done(struct journal *j) -{ - BUG_ON(!test_bit(JOURNAL_running, &j->flags)); - set_bit(JOURNAL_replay_done, &j->flags); -} - void bch2_journal_unblock(struct journal *); void bch2_journal_block(struct journal *); struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u64, bool *); @@ -441,10 +453,12 @@ void bch2_dev_journal_stop(struct journal *, struct bch_dev *); void bch2_fs_journal_stop(struct journal *); int bch2_fs_journal_start(struct journal *, u64); +void bch2_journal_set_replay_done(struct journal *); void bch2_dev_journal_exit(struct bch_dev *); int bch2_dev_journal_init(struct bch_dev *, struct bch_sb *); void bch2_fs_journal_exit(struct journal *); +void bch2_fs_journal_init_early(struct journal *); int bch2_fs_journal_init(struct journal *); #endif /* _BCACHEFS_JOURNAL_H */ diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 11c39e0c34f4..63bb207208b2 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -19,6 +19,7 @@ #include <linux/ioprio.h> #include <linux/string_choices.h> +#include <linux/sched/sysctl.h> void bch2_journal_pos_from_member_info_set(struct bch_fs *c) { @@ -214,12 +215,12 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, fsck_err_on(same_device, c, journal_entry_dup_same_device, - "duplicate journal entry on same device\n %s", + "duplicate journal entry on same device\n%s", buf.buf); fsck_err_on(not_identical, c, journal_entry_replicas_data_mismatch, - "found duplicate but non identical journal entries\n %s", + "found duplicate but non identical journal entries\n%s", buf.buf); if (entry_ptr.csum_good && !identical) @@ -308,8 +309,8 @@ static void journal_entry_err_msg(struct printbuf *out, break; \ case WRITE: \ bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ - bch_err(c, "corrupt metadata before write: %s\n", _buf.buf);\ - if (bch2_fs_inconsistent(c)) { \ + if (bch2_fs_inconsistent(c, \ + "corrupt metadata before write: %s\n", _buf.buf)) {\ ret = -BCH_ERR_fsck_errors_not_fixed; \ goto fsck_err; \ } \ @@ -764,6 +765,23 @@ static void 
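
The smp_rmb() added to journal_res_get_fast() above orders the read of the reservation word before the read of cur_entry_u64s, so a shrinking entry is never observed with a stale size. A portable C11 sketch of the same guarantee expressed as a read-acquire (field layout and mask are illustrative, not the real packed format):

    #include <stdatomic.h>

    #define CUR_ENTRY_OFFSET_MASK   0xfffffUL       /* illustrative field mask */

    struct jnl {
            _Atomic unsigned long   reservations;   /* packed state word */
            unsigned                cur_entry_u64s;
    };

    /* Nonzero if u64s more words still fit in the open journal entry. */
    static int entry_has_room(struct jnl *j, unsigned u64s)
    {
            /* acquire: reservations is read before cur_entry_u64s */
            unsigned long res = atomic_load_explicit(&j->reservations,
                                                     memory_order_acquire);
            unsigned offset = res & CUR_ENTRY_OFFSET_MASK;

            return offset + u64s <= j->cur_entry_u64s;
    }
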
journal_entry_overwrite_to_text(struct printbuf *out, struct bch_fs journal_entry_btree_keys_to_text(out, c, entry); } +static int journal_entry_log_bkey_validate(struct bch_fs *c, + struct jset *jset, + struct jset_entry *entry, + unsigned version, int big_endian, + struct bkey_validate_context from) +{ + from.flags = 0; + return journal_entry_btree_keys_validate(c, jset, entry, + version, big_endian, from); +} + +static void journal_entry_log_bkey_to_text(struct printbuf *out, struct bch_fs *c, + struct jset_entry *entry) +{ + journal_entry_btree_keys_to_text(out, c, entry); +} + static int journal_entry_write_buffer_keys_validate(struct bch_fs *c, struct jset *jset, struct jset_entry *entry, @@ -1041,13 +1059,19 @@ reread: bio->bi_iter.bi_sector = offset; bch2_bio_map(bio, buf->data, sectors_read << 9); + u64 submit_time = local_clock(); ret = submit_bio_wait(bio); kfree(bio); - if (bch2_dev_io_err_on(ret, ca, BCH_MEMBER_ERROR_read, - "journal read error: sector %llu", - offset) || - bch2_meta_read_fault("journal")) { + if (!ret && bch2_meta_read_fault("journal")) + ret = -BCH_ERR_EIO_fault_injected; + + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, + submit_time, !ret); + + if (ret) { + bch_err_dev_ratelimited(ca, + "journal read error: sector %llu", offset); /* * We don't error out of the recovery process * here, since the relevant journal entry may be @@ -1110,13 +1134,16 @@ reread: struct bch_csum csum; csum_good = jset_csum_good(c, j, &csum); - if (bch2_dev_io_err_on(!csum_good, ca, BCH_MEMBER_ERROR_checksum, - "%s", - (printbuf_reset(&err), - prt_str(&err, "journal "), - bch2_csum_err_msg(&err, csum_type, j->csum, csum), - err.buf))) + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); + + if (!csum_good) { + bch_err_dev_ratelimited(ca, "%s", + (printbuf_reset(&err), + prt_str(&err, "journal "), + bch2_csum_err_msg(&err, csum_type, j->csum, csum), + err.buf)); saw_bad = true; + } ret = bch2_encrypt(c, JSET_CSUM_TYPE(j), journal_nonce(j), j->encrypted_start, @@ -1192,7 +1219,7 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) out: bch_verbose(c, "journal read done on device %s, ret %i", ca->name, ret); kvfree(buf.data); - percpu_ref_put(&ca->io_ref); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_journal_read); closure_return(cl); return; err: @@ -1227,7 +1254,8 @@ int bch2_journal_read(struct bch_fs *c, if ((ca->mi.state == BCH_MEMBER_STATE_rw || ca->mi.state == BCH_MEMBER_STATE_ro) && - percpu_ref_tryget(&ca->io_ref)) + enumerated_ref_tryget(&ca->io_ref[READ], + BCH_DEV_READ_REF_journal_read)) closure_call(&ca->journal.read, bch2_journal_read_device, system_unbound_wq, @@ -1236,7 +1264,8 @@ int bch2_journal_read(struct bch_fs *c, degraded = true; } - closure_sync(&jlist.cl); + while (closure_sync_timeout(&jlist.cl, sysctl_hung_task_timeout_secs * HZ / 2)) + ; if (jlist.ret) return jlist.ret; @@ -1362,8 +1391,8 @@ int bch2_journal_read(struct bch_fs *c, missing_end = seq - 1; fsck_err(c, journal_entries_missing, "journal entries %llu-%llu missing! 
(replaying %llu-%llu)\n" - " prev at %s\n" - " next at %s, continue?", + "prev at %s\n" + "next at %s, continue?", missing_start, missing_end, *last_seq, *blacklist_seq - 1, buf1.buf, buf2.buf); @@ -1377,7 +1406,7 @@ int bch2_journal_read(struct bch_fs *c, } genradix_for_each(&c->journal_entries, radix_iter, _i) { - struct bch_replicas_padded replicas = { + union bch_replicas_padded replicas = { .e.data_type = BCH_DATA_journal, .e.nr_devs = 0, .e.nr_required = 1, @@ -1417,7 +1446,7 @@ int bch2_journal_read(struct bch_fs *c, !bch2_replicas_marked(c, &replicas.e) && (le64_to_cpu(i->j.seq) == *last_seq || fsck_err(c, journal_entry_replicas_not_marked, - "superblock not marked as containing replicas for journal entry %llu\n %s", + "superblock not marked as containing replicas for journal entry %llu\n%s", le64_to_cpu(i->j.seq), buf.buf))) { ret = bch2_mark_replicas(c, &replicas.e); if (ret) @@ -1434,10 +1463,11 @@ fsck_err: static void journal_advance_devs_to_next_bucket(struct journal *j, struct dev_alloc_list *devs, - unsigned sectors, u64 seq) + unsigned sectors, __le64 seq) { struct bch_fs *c = container_of(j, struct bch_fs, journal); + rcu_read_lock(); darray_for_each(*devs, i) { struct bch_dev *ca = rcu_dereference(c->devs[*i]); if (!ca) @@ -1459,6 +1489,7 @@ static void journal_advance_devs_to_next_bucket(struct journal *j, ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq); } } + rcu_read_unlock(); } static void __journal_write_alloc(struct journal *j, @@ -1471,7 +1502,8 @@ static void __journal_write_alloc(struct journal *j, struct bch_fs *c = container_of(j, struct bch_fs, journal); darray_for_each(*devs, i) { - struct bch_dev *ca = rcu_dereference(c->devs[*i]); + struct bch_dev *ca = bch2_dev_get_ioref(c, *i, WRITE, + BCH_DEV_WRITE_REF_journal_write); if (!ca) continue; @@ -1485,8 +1517,10 @@ static void __journal_write_alloc(struct journal *j, ca->mi.state != BCH_MEMBER_STATE_rw || !ja->nr || bch2_bkey_has_device_c(bkey_i_to_s_c(&w->key), ca->dev_idx) || - sectors > ja->sectors_free) + sectors > ja->sectors_free) { + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); continue; + } bch2_dev_stripe_increment(ca, &j->wp.stripe); @@ -1509,15 +1543,8 @@ static void __journal_write_alloc(struct journal *j, } } -/** - * journal_write_alloc - decide where to write next journal entry - * - * @j: journal object - * @w: journal buf (entry to be written) - * - * Returns: 0 on success, or -EROFS on failure - */ -static int journal_write_alloc(struct journal *j, struct journal_buf *w) +static int journal_write_alloc(struct journal *j, struct journal_buf *w, + unsigned *replicas) { struct bch_fs *c = container_of(j, struct bch_fs, journal); struct bch_devs_mask devs; @@ -1525,29 +1552,18 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w) unsigned sectors = vstruct_sectors(w->data, c->block_bits); unsigned target = c->opts.metadata_target ?: c->opts.foreground_target; - unsigned replicas = 0, replicas_want = - READ_ONCE(c->opts.metadata_replicas); + unsigned replicas_want = READ_ONCE(c->opts.metadata_replicas); unsigned replicas_need = min_t(unsigned, replicas_want, READ_ONCE(c->opts.metadata_replicas_required)); bool advance_done = false; - rcu_read_lock(); - - /* We might run more than once if we have to stop and do discards: */ - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(&w->key)); - bkey_for_each_ptr(ptrs, p) { - struct bch_dev *ca = bch2_dev_rcu_noerror(c, p->dev); - if (ca) - replicas += ca->mi.durability; - } - retry_target: devs = 
target_rw_devs(c, BCH_DATA_journal, target); devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); retry_alloc: - __journal_write_alloc(j, w, &devs_sorted, sectors, &replicas, replicas_want); + __journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want); - if (likely(replicas >= replicas_want)) + if (likely(*replicas >= replicas_want)) goto done; if (!advance_done) { @@ -1556,18 +1572,16 @@ retry_alloc: goto retry_alloc; } - if (replicas < replicas_want && target) { + if (*replicas < replicas_want && target) { /* Retry from all devices: */ target = 0; advance_done = false; goto retry_target; } done: - rcu_read_unlock(); - BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); - return replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; + return *replicas >= replicas_need ? 0 : -BCH_ERR_insufficient_journal_devices; } static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) @@ -1600,18 +1614,12 @@ static void journal_buf_realloc(struct journal *j, struct journal_buf *buf) kvfree(new_buf); } -static inline struct journal_buf *journal_last_unwritten_buf(struct journal *j) -{ - return j->buf + (journal_last_unwritten_seq(j) & JOURNAL_BUF_MASK); -} - static CLOSURE_CALLBACK(journal_write_done) { closure_type(w, struct journal_buf, io); struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_replicas_padded replicas; - union journal_res_state old, new; + union bch_replicas_padded replicas; u64 seq = le64_to_cpu(w->data->seq); int err = 0; @@ -1620,17 +1628,27 @@ static CLOSURE_CALLBACK(journal_write_done) : j->noflush_write_time, j->write_start_time); if (!w->devs_written.nr) { - bch_err(c, "unable to write journal to sufficient devices"); - err = -EIO; + err = -BCH_ERR_journal_write_err; } else { bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, w->devs_written); - if (bch2_mark_replicas(c, &replicas.e)) - err = -EIO; + err = bch2_mark_replicas(c, &replicas.e); } - if (err) - bch2_fatal_error(c); + if (err && !bch2_journal_error(j)) { + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + if (err == -BCH_ERR_journal_write_err) + prt_printf(&buf, "unable to write journal to sufficient devices"); + else + prt_printf(&buf, "journal write error marking replicas: %s", bch2_err_str(err)); + + bch2_fs_emergency_read_only2(c, &buf); + + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + } closure_debug_destroy(cl); @@ -1641,7 +1659,23 @@ static CLOSURE_CALLBACK(journal_write_done) j->err_seq = seq; w->write_done = true; + if (!j->free_buf || j->free_buf_size < w->buf_size) { + swap(j->free_buf, w->data); + swap(j->free_buf_size, w->buf_size); + } + + if (w->data) { + void *buf = w->data; + w->data = NULL; + w->buf_size = 0; + + spin_unlock(&j->lock); + kvfree(buf); + spin_lock(&j->lock); + } + bool completed = false; + bool do_discards = false; for (seq = journal_last_unwritten_seq(j); seq <= journal_cur_seq(j); @@ -1650,11 +1684,10 @@ static CLOSURE_CALLBACK(journal_write_done) if (!w->write_done) break; - if (!j->err_seq && !JSET_NO_FLUSH(w->data)) { + if (!j->err_seq && !w->noflush) { j->flushed_seq_ondisk = seq; j->last_seq_ondisk = w->last_seq; - bch2_do_discards(c); closure_wake_up(&c->freelist_wait); bch2_reset_alloc_cursors(c); } @@ -1671,16 +1704,6 @@ static CLOSURE_CALLBACK(journal_write_done) if (j->watermark != BCH_WATERMARK_stripe) journal_reclaim_kick(&c->journal); - old.v = atomic64_read(&j->reservations.counter); - 
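journal_write_done() above now recycles its entry buffer instead of unconditionally freeing it: the largest buffer seen is parked in j->free_buf for the next journal entry, and the loser is kvfree'd only after dropping j->lock, since kvfree() may sleep. A condensed sketch of that logic (locking elided; j->free_buf and j->free_buf_size are the journal_types.h additions further down):

static void journal_buf_recycle(struct journal *j, struct journal_buf *w)
{
        /* Keep whichever buffer is larger for the next journal entry: */
        if (!j->free_buf || j->free_buf_size < w->buf_size) {
                swap(j->free_buf, w->data);
                swap(j->free_buf_size, w->buf_size);
        }

        /* Free the smaller one; the real code drops j->lock around this: */
        if (w->data) {
                kvfree(w->data);
                w->data = NULL;
                w->buf_size = 0;
        }
}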
do { - new.v = old.v; BUG_ON(journal_state_count(new, new.unwritten_idx)); BUG_ON(new.unwritten_idx != (seq & JOURNAL_BUF_MASK)); - new.unwritten_idx++; } while (!atomic64_try_cmpxchg(&j->reservations.counter, - &old.v, new.v)); closure_wake_up(&w->wait); completed = true; } @@ -1695,7 +1718,7 @@ static CLOSURE_CALLBACK(journal_write_done) } if (journal_last_unwritten_seq(j) == journal_cur_seq(j) && - new.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { + j->reservations.cur_entry_offset < JOURNAL_ENTRY_CLOSED_VAL) { struct journal_buf *buf = journal_cur_buf(j); long delta = buf->expires - jiffies; @@ -1715,6 +1738,9 @@ static CLOSURE_CALLBACK(journal_write_done) */ bch2_journal_do_writes(j); spin_unlock(&j->lock); + + if (do_discards) + bch2_do_discards(c); } static void journal_write_endio(struct bio *bio) { @@ -1724,20 +1750,23 @@ static void journal_write_endio(struct bio *bio) struct journal *j = &ca->fs->journal; struct journal_buf *w = j->buf + jbio->buf_idx; - if (bch2_dev_io_err_on(bio->bi_status, ca, BCH_MEMBER_ERROR_write, + bch2_account_io_completion(ca, BCH_MEMBER_ERROR_write, + jbio->submit_time, !bio->bi_status); + + if (bio->bi_status) { + bch_err_dev_ratelimited(ca, "error writing journal entry %llu: %s", le64_to_cpu(w->data->seq), - bch2_blk_status_to_str(bio->bi_status)) || - bch2_meta_write_fault("journal")) { - unsigned long flags; + bch2_blk_status_to_str(bio->bi_status)); + unsigned long flags; spin_lock_irqsave(&j->err_lock, flags); bch2_dev_list_drop_dev(&w->devs_written, ca->dev_idx); spin_unlock_irqrestore(&j->err_lock, flags); } closure_put(&w->io); - percpu_ref_put(&ca->io_ref); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); } static CLOSURE_CALLBACK(journal_write_submit) @@ -1748,18 +1777,17 @@ static CLOSURE_CALLBACK(journal_write_submit) unsigned sectors = vstruct_sectors(w->data, c->block_bits); extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { - struct bch_dev *ca = bch2_dev_get_ioref(c, ptr->dev, WRITE); - if (!ca) { - /* XXX: fix this */ - bch_err(c, "missing device for journal write\n"); - continue; - } + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_journal], sectors); struct journal_device *ja = &ca->journal; - struct bio *bio = &ja->bio[w->idx]->bio; + struct journal_bio *jbio = ja->bio[w->idx]; + struct bio *bio = &jbio->bio; + + jbio->submit_time = local_clock(); + bio_reset(bio, ca->disk_sb.bdev, REQ_OP_WRITE|REQ_SYNC|REQ_META); bio->bi_iter.bi_sector = ptr->offset; bio->bi_end_io = journal_write_endio; @@ -1791,6 +1819,10 @@ static CLOSURE_CALLBACK(journal_write_preflush) struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); + /* + * Wait for previous journal writes to complete; they won't necessarily + * be flushed if they're still in flight + */ if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { spin_lock(&j->lock); if (j->seq_ondisk + 1 != le64_to_cpu(w->data->seq)) { @@ -1803,8 +1835,9 @@ static CLOSURE_CALLBACK(journal_write_preflush) } if (w->separate_flush) { - for_each_rw_member(c, ca) { - percpu_ref_get(&ca->io_ref); + for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_write) { + enumerated_ref_get(&ca->io_ref[WRITE], + BCH_DEV_WRITE_REF_journal_write); struct journal_device *ja = &ca->journal; struct bio *bio = &ja->bio[w->idx]->bio; @@ -1831,9 +1864,8 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) struct jset_entry *start, *end; 
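When a flush entry follows writes that may still be in flight on other devices, journal_write_preflush() above issues empty flush-only bios rather than flagging the data write itself. A minimal sketch of one such bio, assuming the same endio plumbing as the data write path (not the patch's exact code):

/* An empty preflush bio: carries no data, only a cache flush. */
bio_reset(bio, ca->disk_sb.bdev, REQ_OP_FLUSH);
bio->bi_end_io = journal_write_endio;
closure_bio_submit(bio, cl);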
struct jset *jset = w->data; struct journal_keys_to_wb wb = { NULL }; - unsigned sectors, bytes, u64s; + unsigned u64s; unsigned long btree_roots_have = 0; - bool validate_before_checksum = false; u64 seq = le64_to_cpu(jset->seq); int ret; @@ -1916,8 +1948,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) le32_add_cpu(&jset->u64s, u64s); - sectors = vstruct_sectors(jset, c->block_bits); - bytes = vstruct_bytes(jset); + unsigned sectors = vstruct_sectors(jset, c->block_bits); if (sectors > w->sectors) { bch2_fs_fatal_error(c, ": journal write overran available space, %zu > %u (extra %u reserved %u/%u)", @@ -1926,6 +1957,17 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) return -EINVAL; } + return 0; +} + +static int bch2_journal_write_checksum(struct journal *j, struct journal_buf *w) +{ + struct bch_fs *c = container_of(j, struct bch_fs, journal); + struct jset *jset = w->data; + u64 seq = le64_to_cpu(jset->seq); + bool validate_before_checksum = false; + int ret = 0; + jset->magic = cpu_to_le64(jset_magic(c)); jset->version = cpu_to_le32(c->sb.version); @@ -1948,7 +1990,7 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) ret = bch2_encrypt(c, JSET_CSUM_TYPE(jset), journal_nonce(jset), jset->encrypted_start, vstruct_end(jset) - (void *) jset->encrypted_start); - if (bch2_fs_fatal_err_on(ret, c, "decrypting journal entry: %s", bch2_err_str(ret))) + if (bch2_fs_fatal_err_on(ret, c, "encrypting journal entry: %s", bch2_err_str(ret))) return ret; jset->csum = csum_vstruct(c, JSET_CSUM_TYPE(jset), @@ -1958,6 +2000,8 @@ static int bch2_journal_write_prep(struct journal *j, struct journal_buf *w) (ret = jset_validate(c, NULL, jset, 0, WRITE))) return ret; + unsigned sectors = vstruct_sectors(jset, c->block_bits); + unsigned bytes = vstruct_bytes(jset); memset((void *) jset + bytes, 0, (sectors << 9) - bytes); return 0; } @@ -1984,7 +2028,7 @@ static int bch2_journal_write_pick_flush(struct journal *j, struct journal_buf * * write anything at all. 
*/ if (error && test_bit(JOURNAL_need_flush_write, &j->flags)) - return -EIO; + return error; if (error || w->noflush || @@ -2013,13 +2057,10 @@ CLOSURE_CALLBACK(bch2_journal_write) closure_type(w, struct journal_buf, io); struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); - struct bch_replicas_padded replicas; - unsigned nr_rw_members = 0; + union bch_replicas_padded replicas; + unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_journal]); int ret; - for_each_rw_member(c, ca) - nr_rw_members++; - BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); BUG_ON(!w->write_started); BUG_ON(w->write_allocated); @@ -2033,7 +2074,8 @@ CLOSURE_CALLBACK(bch2_journal_write) ret = bch2_journal_write_pick_flush(j, w); spin_unlock(&j->lock); - if (ret) + + if (unlikely(ret)) goto err; mutex_lock(&j->buf_lock); @@ -2041,43 +2083,34 @@ CLOSURE_CALLBACK(bch2_journal_write) ret = bch2_journal_write_prep(j, w); mutex_unlock(&j->buf_lock); - if (ret) - goto err; - j->entry_bytes_written += vstruct_bytes(w->data); + if (unlikely(ret)) + goto err; + unsigned replicas_allocated = 0; while (1) { - spin_lock(&j->lock); - ret = journal_write_alloc(j, w); + ret = journal_write_alloc(j, w, &replicas_allocated); if (!ret || !j->can_discard) break; - spin_unlock(&j->lock); bch2_journal_do_discards(j); } - if (ret && !bch2_journal_error(j)) { - struct printbuf buf = PRINTBUF; - buf.atomic++; + if (unlikely(ret)) + goto err_allocate_write; - prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), - le64_to_cpu(w->data->seq), - vstruct_sectors(w->data, c->block_bits), - bch2_err_str(ret)); - __bch2_journal_debug_to_text(&buf, j); - spin_unlock(&j->lock); - bch2_print_string_as_lines(KERN_ERR, buf.buf); - printbuf_exit(&buf); - } - if (ret) + ret = bch2_journal_write_checksum(j, w); + if (unlikely(ret)) goto err; + spin_lock(&j->lock); /* * write is allocated, no longer need to account for it in * bch2_journal_space_available(): */ w->sectors = 0; w->write_allocated = true; + j->entry_bytes_written += vstruct_bytes(w->data); /* * journal entry has been compacted and allocated, recalculate space @@ -2089,9 +2122,6 @@ CLOSURE_CALLBACK(bch2_journal_write) w->devs_written = bch2_bkey_devs(bkey_i_to_s_c(&w->key)); - if (c->opts.nochanges) - goto no_io; - /* * Mark journal replicas before we submit the write to guarantee * recovery will find the journal entries after a crash. 
@@ -2102,15 +2132,33 @@ CLOSURE_CALLBACK(bch2_journal_write) if (ret) goto err; + if (c->opts.nochanges) + goto no_io; + if (!JSET_NO_FLUSH(w->data)) continue_at(cl, journal_write_preflush, j->wq); else continue_at(cl, journal_write_submit, j->wq); return; -no_io: - continue_at(cl, journal_write_done, j->wq); - return; +err_allocate_write: + if (!bch2_journal_error(j)) { + struct printbuf buf = PRINTBUF; + + bch2_journal_debug_to_text(&buf, j); + prt_printf(&buf, bch2_fmt(c, "Unable to allocate journal write at seq %llu for %zu sectors: %s"), + le64_to_cpu(w->data->seq), + vstruct_sectors(w->data, c->block_bits), + bch2_err_str(ret)); + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + } err: bch2_fatal_error(c); +no_io: + extent_for_each_ptr(bkey_i_to_s_extent(&w->key), ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + enumerated_ref_put(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_journal_write); + } + continue_at(cl, journal_write_done, j->wq); } diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index d373cd181a7f..70f36f6bc482 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -17,6 +17,8 @@ #include <linux/kthread.h> #include <linux/sched/mm.h> +static bool __should_discard_bucket(struct journal *, struct journal_device *); + /* Free space calculations: */ static unsigned journal_space_from(struct journal_device *ja, @@ -203,8 +205,7 @@ void bch2_journal_space_available(struct journal *j) ja->bucket_seq[ja->dirty_idx_ondisk] < j->last_seq_ondisk) ja->dirty_idx_ondisk = (ja->dirty_idx_ondisk + 1) % ja->nr; - if (ja->discard_idx != ja->dirty_idx_ondisk) - can_discard = true; + can_discard |= __should_discard_bucket(j, ja); max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); nr_online++; @@ -214,19 +215,21 @@ void bch2_journal_space_available(struct journal *j) j->can_discard = can_discard; if (nr_online < metadata_replicas_required(c)) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n" - "rw journal devs:", nr_online, metadata_replicas_required(c)); - - rcu_read_lock(); - for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) - prt_printf(&buf, " %s", ca->name); - rcu_read_unlock(); - - bch_err(c, "%s", buf.buf); - printbuf_exit(&buf); - ret = JOURNAL_ERR_insufficient_devices; + if (!(c->sb.features & BIT_ULL(BCH_FEATURE_small_image))) { + struct printbuf buf = PRINTBUF; + buf.atomic++; + prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n" + "rw journal devs:", nr_online, metadata_replicas_required(c)); + + rcu_read_lock(); + for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) + prt_printf(&buf, " %s", ca->name); + rcu_read_unlock(); + + bch_err(c, "%s", buf.buf); + printbuf_exit(&buf); + } + ret = -BCH_ERR_insufficient_journal_devices; goto out; } @@ -240,7 +243,7 @@ void bch2_journal_space_available(struct journal *j) total = j->space[journal_space_total].total; if (!j->space[journal_space_discarded].next_entry) - ret = JOURNAL_ERR_journal_full; + ret = -BCH_ERR_journal_full; if ((j->space[journal_space_clean_ondisk].next_entry < j->space[journal_space_clean_ondisk].total) && @@ -252,7 +255,10 @@ void bch2_journal_space_available(struct journal *j) bch2_journal_set_watermark(j); out: - j->cur_entry_sectors = !ret ? j->space[journal_space_discarded].next_entry : 0; + j->cur_entry_sectors = !ret + ? 
round_down(j->space[journal_space_discarded].next_entry, + block_sectors(c)) + : 0; j->cur_entry_error = ret; if (!ret) @@ -261,12 +267,19 @@ out: /* Discards - last part of journal reclaim: */ -static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +static bool __should_discard_bucket(struct journal *j, struct journal_device *ja) { - bool ret; + unsigned min_free = max(4, ja->nr / 8); + return bch2_journal_dev_buckets_available(j, ja, journal_space_discarded) < + min_free && + ja->discard_idx != ja->dirty_idx_ondisk; +} + +static bool should_discard_bucket(struct journal *j, struct journal_device *ja) +{ spin_lock(&j->lock); - ret = ja->discard_idx != ja->dirty_idx_ondisk; + bool ret = __should_discard_bucket(j, ja); spin_unlock(&j->lock); return ret; @@ -282,12 +295,12 @@ void bch2_journal_do_discards(struct journal *j) mutex_lock(&j->discard_lock); - for_each_rw_member(c, ca) { + for_each_rw_member(c, ca, BCH_DEV_WRITE_REF_journal_do_discards) { struct journal_device *ja = &ca->journal; while (should_discard_bucket(j, ja)) { if (!c->opts.nochanges && - ca->mi.discard && + bch2_discard_opt_enabled(c, ca) && bdev_max_discard_sectors(ca->disk_sb.bdev)) blkdev_issue_discard(ca->disk_sb.bdev, bucket_to_sector(ca, @@ -614,7 +627,8 @@ static u64 journal_seq_to_flush(struct journal *j) spin_lock(&j->lock); - for_each_rw_member(c, ca) { + rcu_read_lock(); + for_each_rw_member_rcu(c, ca) { struct journal_device *ja = &ca->journal; unsigned nr_buckets, bucket_to_flush; @@ -624,12 +638,11 @@ static u64 journal_seq_to_flush(struct journal *j) /* Try to keep the journal at most half full: */ nr_buckets = ja->nr / 2; - nr_buckets = min(nr_buckets, ja->nr); - bucket_to_flush = (ja->cur_idx + nr_buckets) % ja->nr; seq_to_flush = max(seq_to_flush, ja->bucket_seq[bucket_to_flush]); } + rcu_read_unlock(); /* Also flush if the pin fifo is more than half full */ seq_to_flush = max_t(s64, seq_to_flush, @@ -645,7 +658,6 @@ static u64 journal_seq_to_flush(struct journal *j) * @j: journal object * @direct: direct or background reclaim? * @kicked: requested to run since we last ran? - * Returns: 0 on success, or -EIO if the journal has been shutdown * * Background journal reclaim writes out btree nodes. It should be run * early enough so that we never completely run out of journal buckets. 
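A worked example of the new threshold in __should_discard_bucket() above, for a hypothetical device with 64 journal buckets:

/* Hypothetical: ja->nr == 64 journal buckets on this device. */
unsigned min_free = max(4, 64 / 8);     /* == 8 */

/*
 * Discards are now issued only once fewer than 8 already-discarded
 * buckets remain available, rather than whenever discard_idx trails
 * dirty_idx_ondisk; buckets stay undiscarded as long as possible.
 */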
@@ -685,11 +697,11 @@ static int __bch2_journal_reclaim(struct journal *j, bool direct, bool kicked) if (kthread && kthread_should_stop()) break; - if (bch2_journal_error(j)) { - ret = -EIO; + ret = bch2_journal_error(j); + if (ret) break; - } + /* XXX shove journal discards off to another thread */ bch2_journal_do_discards(j); seq_to_flush = journal_seq_to_flush(j); @@ -952,7 +964,7 @@ int bch2_journal_flush_device_pins(struct journal *j, int dev_idx) seq = 0; spin_lock(&j->lock); while (!ret) { - struct bch_replicas_padded replicas; + union bch_replicas_padded replicas; seq = max(seq, journal_last_seq(j)); if (seq >= j->pin.back) diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index 1f25c111c54c..c5a7d800a0f5 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -130,6 +130,16 @@ bool bch2_journal_seq_is_blacklisted(struct bch_fs *c, u64 seq, return true; } +u64 bch2_journal_last_blacklisted_seq(struct bch_fs *c) +{ + struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; + + if (!t || !t->nr) + return 0; + + return t->entries[eytzinger0_last(t->nr)].end - 1; +} + int bch2_blacklist_table_initialize(struct bch_fs *c) { struct bch_sb_field_journal_seq_blacklist *bl = @@ -231,15 +241,14 @@ bool bch2_blacklist_entries_gc(struct bch_fs *c) struct journal_seq_blacklist_table *t = c->journal_seq_blacklist_table; BUG_ON(nr != t->nr); - unsigned i; - for (src = bl->start, i = t->nr == 0 ? 0 : eytzinger0_first(t->nr); - src < bl->start + nr; - src++, i = eytzinger0_next(i, nr)) { + src = bl->start; + eytzinger0_for_each(i, nr) { BUG_ON(t->entries[i].start != le64_to_cpu(src->start)); BUG_ON(t->entries[i].end != le64_to_cpu(src->end)); if (t->entries[i].dirty || t->entries[i].end >= c->journal.oldest_seq_found_ondisk) *dst++ = *src; + src++; } unsigned new_nr = dst - bl->start; diff --git a/fs/bcachefs/journal_seq_blacklist.h b/fs/bcachefs/journal_seq_blacklist.h index d47636f96fdc..f06942ccfcdd 100644 --- a/fs/bcachefs/journal_seq_blacklist.h +++ b/fs/bcachefs/journal_seq_blacklist.h @@ -12,6 +12,7 @@ blacklist_nr_entries(struct bch_sb_field_journal_seq_blacklist *bl) } bool bch2_journal_seq_is_blacklisted(struct bch_fs *, u64, bool); +u64 bch2_journal_last_blacklisted_seq(struct bch_fs *); int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64, u64); int bch2_blacklist_table_initialize(struct bch_fs *); diff --git a/fs/bcachefs/journal_types.h b/fs/bcachefs/journal_types.h index 1ef3a28ed6ab..51104bbb99da 100644 --- a/fs/bcachefs/journal_types.h +++ b/fs/bcachefs/journal_types.h @@ -12,7 +12,11 @@ /* btree write buffer steals 8 bits for its own purposes: */ #define JOURNAL_SEQ_MAX ((1ULL << 56) - 1) -#define JOURNAL_BUF_BITS 2 +#define JOURNAL_STATE_BUF_BITS 2 +#define JOURNAL_STATE_BUF_NR (1U << JOURNAL_STATE_BUF_BITS) +#define JOURNAL_STATE_BUF_MASK (JOURNAL_STATE_BUF_NR - 1) + +#define JOURNAL_BUF_BITS 4 #define JOURNAL_BUF_NR (1U << JOURNAL_BUF_BITS) #define JOURNAL_BUF_MASK (JOURNAL_BUF_NR - 1) @@ -82,7 +86,6 @@ struct journal_entry_pin { struct journal_res { bool ref; - u8 idx; u16 u64s; u32 offset; u64 seq; @@ -98,9 +101,8 @@ union journal_res_state { }; struct { - u64 cur_entry_offset:20, + u64 cur_entry_offset:22, idx:2, - unwritten_idx:2, buf0_count:10, buf1_count:10, buf2_count:10, @@ -110,13 +112,13 @@ union journal_res_state { /* bytes: */ #define JOURNAL_ENTRY_SIZE_MIN (64U << 10) /* 64k */ -#define JOURNAL_ENTRY_SIZE_MAX (4U << 20) /* 4M */ +#define JOURNAL_ENTRY_SIZE_MAX (4U << 22) 
/* 16M */ /* * We stash some journal state as sentinel values in cur_entry_offset: * note - cur_entry_offset is in units of u64s */ -#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 20) - 1) +#define JOURNAL_ENTRY_OFFSET_MAX ((1U << 22) - 1) #define JOURNAL_ENTRY_BLOCKED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 2) #define JOURNAL_ENTRY_CLOSED_VAL (JOURNAL_ENTRY_OFFSET_MAX - 1) @@ -149,28 +151,10 @@ enum journal_flags { #undef x }; -/* Reasons we may fail to get a journal reservation: */ -#define JOURNAL_ERRORS() \ - x(ok) \ - x(retry) \ - x(blocked) \ - x(max_in_flight) \ - x(journal_full) \ - x(journal_pin_full) \ - x(journal_stuck) \ - x(insufficient_devices) - -enum journal_errors { -#define x(n) JOURNAL_ERR_##n, - JOURNAL_ERRORS() -#undef x -}; - -typedef DARRAY(u64) darray_u64; - struct journal_bio { struct bch_dev *ca; unsigned buf_idx; + u64 submit_time; struct bio bio; }; @@ -199,7 +183,7 @@ struct journal { * 0, or -ENOSPC if waiting on journal reclaim, or -EROFS if * insufficient devices: */ - enum journal_errors cur_entry_error; + int cur_entry_error; unsigned cur_entry_offset_if_blocked; unsigned buf_size_want; @@ -220,6 +204,8 @@ struct journal { * other is possibly being written out. */ struct journal_buf buf[JOURNAL_BUF_NR]; + void *free_buf; + unsigned free_buf_size; spinlock_t lock; @@ -237,6 +223,7 @@ struct journal { /* Sequence number of most recent journal entry (last entry in @pin) */ atomic64_t seq; + u64 seq_write_started; /* seq, last_seq from the most recent journal entry successfully written */ u64 seq_ondisk; u64 flushed_seq_ondisk; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index ce794d55818f..2f63fc6d456f 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -6,6 +6,7 @@ #include "btree_iter.h" #include "btree_update.h" #include "btree_write_buffer.h" +#include "ec.h" #include "error.h" #include "lru.h" #include "recovery.h" @@ -59,9 +60,9 @@ int bch2_lru_set(struct btree_trans *trans, u16 lru_id, u64 dev_bucket, u64 time return __bch2_lru_set(trans, lru_id, dev_bucket, time, KEY_TYPE_set); } -int bch2_lru_change(struct btree_trans *trans, - u16 lru_id, u64 dev_bucket, - u64 old_time, u64 new_time) +int __bch2_lru_change(struct btree_trans *trans, + u16 lru_id, u64 dev_bucket, + u64 old_time, u64 new_time) { if (old_time == new_time) return 0; @@ -78,7 +79,9 @@ static const char * const bch2_lru_types[] = { }; int bch2_lru_check_set(struct btree_trans *trans, - u16 lru_id, u64 time, + u16 lru_id, + u64 dev_bucket, + u64 time, struct bkey_s_c referring_k, struct bkey_buf *last_flushed) { @@ -87,9 +90,7 @@ int bch2_lru_check_set(struct btree_trans *trans, struct btree_iter lru_iter; struct bkey_s_c lru_k = bch2_bkey_get_iter(trans, &lru_iter, BTREE_ID_lru, - lru_pos(lru_id, - bucket_to_u64(referring_k.k->p), - time), 0); + lru_pos(lru_id, dev_bucket, time), 0); int ret = bkey_err(lru_k); if (ret) return ret; @@ -100,11 +101,10 @@ int bch2_lru_check_set(struct btree_trans *trans, goto err; if (fsck_err(trans, alloc_key_to_missing_lru_entry, - "missing %s lru entry\n" - " %s", + "missing %s lru entry\n%s", bch2_lru_types[lru_type(lru_k)], (bch2_bkey_val_to_text(&buf, c, referring_k), buf.buf))) { - ret = bch2_lru_set(trans, lru_id, bucket_to_u64(referring_k.k->p), time); + ret = bch2_lru_set(trans, lru_id, dev_bucket, time); if (ret) goto err; } @@ -116,57 +116,81 @@ fsck_err: return ret; } +static struct bbpos lru_pos_to_bp(struct bkey_s_c lru_k) +{ + enum bch_lru_type type = lru_type(lru_k); + + switch (type) { + case BCH_LRU_read: + case BCH_LRU_fragmentation: + 
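Returning to the journal_types.h change above: cur_entry_offset counts u64s, so widening the bitfield from 20 to 22 bits is what lets JOURNAL_ENTRY_SIZE_MAX grow to 16M while the sentinel values stay parked at the top of the offset space. A quick consistency check (illustrative):

/* Largest real entry: 16M / sizeof(u64) == 1U << 21 u64s, comfortably
 * below JOURNAL_ENTRY_BLOCKED_VAL == (1U << 22) - 3: */
BUILD_BUG_ON(JOURNAL_ENTRY_SIZE_MAX / sizeof(u64) > JOURNAL_ENTRY_BLOCKED_VAL);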
return BBPOS(BTREE_ID_alloc, u64_to_bucket(lru_k.k->p.offset)); + case BCH_LRU_stripes: + return BBPOS(BTREE_ID_stripes, POS(0, lru_k.k->p.offset)); + default: + BUG(); + } +} + +static u64 bkey_lru_type_idx(struct bch_fs *c, + enum bch_lru_type type, + struct bkey_s_c k) +{ + struct bch_alloc_v4 a_convert; + const struct bch_alloc_v4 *a; + + switch (type) { + case BCH_LRU_read: + a = bch2_alloc_to_v4(k, &a_convert); + return alloc_lru_idx_read(*a); + case BCH_LRU_fragmentation: { + a = bch2_alloc_to_v4(k, &a_convert); + + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode); + u64 idx = ca + ? alloc_lru_idx_fragmentation(*a, ca) + : 0; + rcu_read_unlock(); + return idx; + } + case BCH_LRU_stripes: + return k.k->type == KEY_TYPE_stripe + ? stripe_lru_pos(bkey_s_c_to_stripe(k).v) + : 0; + default: + BUG(); + } +} + static int bch2_check_lru_key(struct btree_trans *trans, struct btree_iter *lru_iter, struct bkey_s_c lru_k, struct bkey_buf *last_flushed) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_alloc_v4 a_convert; - const struct bch_alloc_v4 *a; struct printbuf buf1 = PRINTBUF; struct printbuf buf2 = PRINTBUF; - enum bch_lru_type type = lru_type(lru_k); - struct bpos alloc_pos = u64_to_bucket(lru_k.k->p.offset); - u64 idx; - int ret; - - struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, alloc_pos); - if (fsck_err_on(!ca, - trans, lru_entry_to_invalid_bucket, - "lru key points to nonexistent device:bucket %llu:%llu", - alloc_pos.inode, alloc_pos.offset)) - return bch2_btree_bit_mod_buffered(trans, BTREE_ID_lru, lru_iter->pos, false); + struct bbpos bp = lru_pos_to_bp(lru_k); - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, alloc_pos, 0); - ret = bkey_err(k); + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, bp.btree, bp.pos, 0); + int ret = bkey_err(k); if (ret) goto err; - a = bch2_alloc_to_v4(k, &a_convert); - - switch (type) { - case BCH_LRU_read: - idx = alloc_lru_idx_read(*a); - break; - case BCH_LRU_fragmentation: - idx = alloc_lru_idx_fragmentation(*a, ca); - break; - } + enum bch_lru_type type = lru_type(lru_k); + u64 idx = bkey_lru_type_idx(c, type, k); - if (lru_k.k->type != KEY_TYPE_set || - lru_pos_time(lru_k.k->p) != idx) { + if (lru_pos_time(lru_k.k->p) != idx) { ret = bch2_btree_write_buffer_maybe_flush(trans, lru_k, last_flushed); if (ret) goto err; if (fsck_err(trans, lru_entry_bad, "incorrect lru entry: lru %s time %llu\n" - " %s\n" - " for %s", + "%s\n" + "for %s", bch2_lru_types[type], lru_pos_time(lru_k.k->p), (bch2_bkey_val_to_text(&buf1, c, lru_k), buf1.buf), @@ -176,7 +200,6 @@ static int bch2_check_lru_key(struct btree_trans *trans, err: fsck_err: bch2_trans_iter_exit(trans, &iter); - bch2_dev_put(ca); printbuf_exit(&buf2); printbuf_exit(&buf1); return ret; diff --git a/fs/bcachefs/lru.h b/fs/bcachefs/lru.h index f31a6cf1514c..8abd0aa2083a 100644 --- a/fs/bcachefs/lru.h +++ b/fs/bcachefs/lru.h @@ -28,9 +28,14 @@ static inline enum bch_lru_type lru_type(struct bkey_s_c l) { u16 lru_id = l.k->p.inode >> 48; - if (lru_id == BCH_LRU_FRAGMENTATION_START) + switch (lru_id) { + case BCH_LRU_BUCKET_FRAGMENTATION: return BCH_LRU_fragmentation; - return BCH_LRU_read; + case BCH_LRU_STRIPE_FRAGMENTATION: + return BCH_LRU_stripes; + default: + return BCH_LRU_read; + } } int bch2_lru_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); @@ -46,10 +51,19 @@ void bch2_lru_pos_to_text(struct printbuf *, struct bpos); int bch2_lru_del(struct 
btree_trans *, u16, u64, u64); int bch2_lru_set(struct btree_trans *, u16, u64, u64); -int bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); +int __bch2_lru_change(struct btree_trans *, u16, u64, u64, u64); + +static inline int bch2_lru_change(struct btree_trans *trans, + u16 lru_id, u64 dev_bucket, + u64 old_time, u64 new_time) +{ + return old_time != new_time + ? __bch2_lru_change(trans, lru_id, dev_bucket, old_time, new_time) + : 0; +} struct bkey_buf; -int bch2_lru_check_set(struct btree_trans *, u16, u64, struct bkey_s_c, struct bkey_buf *); +int bch2_lru_check_set(struct btree_trans *, u16, u64, u64, struct bkey_s_c, struct bkey_buf *); int bch2_check_lrus(struct bch_fs *); diff --git a/fs/bcachefs/lru_format.h b/fs/bcachefs/lru_format.h index f372cb3b8cda..b7392ad8e41f 100644 --- a/fs/bcachefs/lru_format.h +++ b/fs/bcachefs/lru_format.h @@ -9,7 +9,8 @@ struct bch_lru { #define BCH_LRU_TYPES() \ x(read) \ - x(fragmentation) + x(fragmentation) \ + x(stripes) enum bch_lru_type { #define x(n) BCH_LRU_##n, @@ -17,7 +18,8 @@ enum bch_lru_type { #undef x }; -#define BCH_LRU_FRAGMENTATION_START ((1U << 16) - 1) +#define BCH_LRU_BUCKET_FRAGMENTATION ((1U << 16) - 1) +#define BCH_LRU_STRIPE_FRAGMENTATION ((1U << 16) - 2) #define LRU_TIME_BITS 48 #define LRU_TIME_MAX ((1ULL << LRU_TIME_BITS) - 1) diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index ddc187fb693d..bb7a92270c09 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -4,10 +4,13 @@ */ #include "bcachefs.h" +#include "backpointers.h" #include "bkey_buf.h" #include "btree_update.h" #include "btree_update_interior.h" +#include "btree_write_buffer.h" #include "buckets.h" +#include "ec.h" #include "errcode.h" #include "extents.h" #include "io_write.h" @@ -15,11 +18,12 @@ #include "keylist.h" #include "migrate.h" #include "move.h" +#include "progress.h" #include "replicas.h" #include "super-io.h" static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, - unsigned dev_idx, int flags, bool metadata) + unsigned dev_idx, unsigned flags, bool metadata) { unsigned replicas = metadata ? c->opts.metadata_replicas : c->opts.data_replicas; unsigned lost = metadata ? BCH_FORCE_IF_METADATA_LOST : BCH_FORCE_IF_DATA_LOST; @@ -36,11 +40,28 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, return 0; } +static int drop_btree_ptrs(struct btree_trans *trans, struct btree_iter *iter, + struct btree *b, unsigned dev_idx, unsigned flags) +{ + struct bch_fs *c = trans->c; + struct bkey_buf k; + + bch2_bkey_buf_init(&k); + bch2_bkey_buf_copy(&k, c, &b->key); + + int ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), dev_idx, flags, true) ?: + bch2_btree_node_update_key(trans, iter, b, k.k, 0, false); + + bch_err_fn(c, ret); + bch2_bkey_buf_exit(&k, c); + return ret; +} + static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k, unsigned dev_idx, - int flags) + unsigned flags) { struct bch_fs *c = trans->c; struct bkey_i *n; @@ -76,7 +97,27 @@ static int bch2_dev_usrdata_drop_key(struct btree_trans *trans, return 0; } -static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +static int bch2_dev_btree_drop_key(struct btree_trans *trans, + struct bkey_s_c_backpointer bp, + unsigned dev_idx, + struct bkey_buf *last_flushed, + unsigned flags) +{ + struct btree_iter iter; + struct btree *b = bch2_backpointer_get_node(trans, bp, &iter, last_flushed); + int ret = PTR_ERR_OR_ZERO(b); + if (ret) + return ret == -BCH_ERR_backpointer_to_overwritten_btree_node ? 
0 : ret; + + ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags); + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +static int bch2_dev_usrdata_drop(struct bch_fs *c, + struct progress_indicator_state *progress, + unsigned dev_idx, unsigned flags) { struct btree_trans *trans = bch2_trans_get(c); enum btree_id id; @@ -88,8 +129,10 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) ret = for_each_btree_key_commit(trans, iter, id, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags)); + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + bch2_progress_update_iter(trans, progress, &iter, "dropping user data"); + bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags); + })); if (ret) break; } @@ -99,7 +142,9 @@ static int bch2_dev_usrdata_drop(struct bch_fs *c, unsigned dev_idx, int flags) return ret; } -static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) +static int bch2_dev_metadata_drop(struct bch_fs *c, + struct progress_indicator_state *progress, + unsigned dev_idx, unsigned flags) { struct btree_trans *trans; struct btree_iter iter; @@ -123,29 +168,23 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, unsigned dev_idx, int flags) retry: ret = 0; while (bch2_trans_begin(trans), - (b = bch2_btree_iter_peek_node(&iter)) && + (b = bch2_btree_iter_peek_node(trans, &iter)) && !(ret = PTR_ERR_OR_ZERO(b))) { + bch2_progress_update_iter(trans, progress, &iter, "dropping metadata"); + if (!bch2_bkey_has_device_c(bkey_i_to_s_c(&b->key), dev_idx)) goto next; - bch2_bkey_buf_copy(&k, c, &b->key); - - ret = drop_dev_ptrs(c, bkey_i_to_s(k.k), - dev_idx, flags, true); - if (ret) - break; - - ret = bch2_btree_node_update_key(trans, &iter, b, k.k, 0, false); + ret = drop_btree_ptrs(trans, &iter, b, dev_idx, flags); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { ret = 0; continue; } - bch_err_msg(c, ret, "updating btree node key"); if (ret) break; next: - bch2_btree_iter_next_node(&iter); + bch2_btree_iter_next_node(trans, &iter); } if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; @@ -167,8 +206,72 @@ err: return ret; } -int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, int flags) +static int data_drop_bp(struct btree_trans *trans, unsigned dev_idx, + struct bkey_s_c_backpointer bp, struct bkey_buf *last_flushed, + unsigned flags) +{ + struct btree_iter iter; + struct bkey_s_c k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, + last_flushed); + int ret = bkey_err(k); + if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) + return 0; + if (ret) + return ret; + + if (!k.k || !bch2_bkey_has_device_c(k, dev_idx)) + goto out; + + /* + * XXX: pass flags arg to invalidate_stripe_to_dev and handle it + * properly + */ + + if (bkey_is_btree_ptr(k.k)) + ret = bch2_dev_btree_drop_key(trans, bp, dev_idx, last_flushed, flags); + else if (k.k->type == KEY_TYPE_stripe) + ret = bch2_invalidate_stripe_to_dev(trans, &iter, k, dev_idx, flags); + else + ret = bch2_dev_usrdata_drop_key(trans, &iter, k, dev_idx, flags); +out: + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_dev_data_drop_by_backpointers(struct bch_fs *c, unsigned dev_idx, unsigned flags) { - return bch2_dev_usrdata_drop(c, dev_idx, flags) ?: - bch2_dev_metadata_drop(c, dev_idx, flags); + struct btree_trans *trans = bch2_trans_get(c); + + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + 
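The migrate paths above now report progress as they walk the btrees. A minimal usage sketch of the two helpers involved (both calls appear in this diff; the surrounding loop is elided):

struct progress_indicator_state progress;

bch2_progress_init(&progress, c,
                   BIT_ULL(BTREE_ID_extents)|
                   BIT_ULL(BTREE_ID_reflink));

/* ...then once per key inside the btree iteration: */
bch2_progress_update_iter(trans, &progress, &iter, "dropping user data");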
bkey_init(&last_flushed.k->k); + + int ret = bch2_btree_write_buffer_flush_sync(trans) ?: + for_each_btree_key_max_commit(trans, iter, BTREE_ID_backpointers, + POS(dev_idx, 0), + POS(dev_idx, U64_MAX), 0, k, + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + if (k.k->type != KEY_TYPE_backpointer) + continue; + + data_drop_bp(trans, dev_idx, bkey_s_c_to_backpointer(k), + &last_flushed, flags); + + })); + + bch2_bkey_buf_exit(&last_flushed, trans->c); + bch2_trans_put(trans); + bch_err_fn(c, ret); + return ret; +} + +int bch2_dev_data_drop(struct bch_fs *c, unsigned dev_idx, unsigned flags) +{ + struct progress_indicator_state progress; + bch2_progress_init(&progress, c, + BIT_ULL(BTREE_ID_extents)| + BIT_ULL(BTREE_ID_reflink)); + + return bch2_dev_usrdata_drop(c, &progress, dev_idx, flags) ?: + bch2_dev_metadata_drop(c, &progress, dev_idx, flags); } diff --git a/fs/bcachefs/migrate.h b/fs/bcachefs/migrate.h index 027efaa0d575..30018140711b 100644 --- a/fs/bcachefs/migrate.h +++ b/fs/bcachefs/migrate.h @@ -2,6 +2,7 @@ #ifndef _BCACHEFS_MIGRATE_H #define _BCACHEFS_MIGRATE_H -int bch2_dev_data_drop(struct bch_fs *, unsigned, int); +int bch2_dev_data_drop_by_backpointers(struct bch_fs *, unsigned, unsigned); +int bch2_dev_data_drop(struct bch_fs *, unsigned, unsigned); #endif /* _BCACHEFS_MIGRATE_H */ diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 160b4374160a..79f4722621d5 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -38,28 +38,28 @@ const char * const bch2_data_ops_strs[] = { NULL }; -static void trace_move_extent2(struct bch_fs *c, struct bkey_s_c k, +static void trace_io_move2(struct bch_fs *c, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { - if (trace_move_extent_enabled()) { + if (trace_io_move_enabled()) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); prt_newline(&buf); bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); - trace_move_extent(c, buf.buf); + trace_io_move(c, buf.buf); printbuf_exit(&buf); } } -static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k) +static void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k) { - if (trace_move_extent_read_enabled()) { + if (trace_io_move_read_enabled()) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); - trace_move_extent_read(c, buf.buf); + trace_io_move_read(c, buf.buf); printbuf_exit(&buf); } } @@ -67,18 +67,14 @@ static void trace_move_extent_read2(struct bch_fs *c, struct bkey_s_c k) struct moving_io { struct list_head read_list; struct list_head io_list; - struct move_bucket_in_flight *b; + struct move_bucket *b; struct closure cl; bool read_completed; unsigned read_sectors; unsigned write_sectors; - struct bch_read_bio rbio; - struct data_update write; - /* Must be last since it is variable size */ - struct bio_vec bi_inline_vecs[]; }; static void move_free(struct moving_io *io) @@ -88,43 +84,85 @@ static void move_free(struct moving_io *io) if (io->b) atomic_dec(&io->b->count); - bch2_data_update_exit(&io->write); - mutex_lock(&ctxt->lock); list_del(&io->io_list); wake_up(&ctxt->wait); mutex_unlock(&ctxt->lock); + if (!io->write.data_opts.scrub) { + bch2_data_update_exit(&io->write); + } else { + bch2_bio_free_pages_pool(io->write.op.c, &io->write.op.wbio.bio); + kfree(io->write.bvecs); + } kfree(io); } static void move_write_done(struct bch_write_op *op) { struct moving_io *io = container_of(op, struct moving_io, write.op); + struct bch_fs *c = op->c; struct moving_context *ctxt = io->write.ctxt; - 
if (io->write.op.error) + if (op->error) { + if (trace_io_move_write_fail_enabled()) { + struct printbuf buf = PRINTBUF; + + bch2_write_op_to_text(&buf, op); + trace_io_move_write_fail(c, buf.buf); + printbuf_exit(&buf); + } + this_cpu_inc(c->counters[BCH_COUNTER_io_move_write_fail]); + ctxt->write_error = true; + } - atomic_sub(io->write_sectors, &io->write.ctxt->write_sectors); - atomic_dec(&io->write.ctxt->write_ios); + atomic_sub(io->write_sectors, &ctxt->write_sectors); + atomic_dec(&ctxt->write_ios); move_free(io); closure_put(&ctxt->cl); } static void move_write(struct moving_io *io) { - if (unlikely(io->rbio.bio.bi_status || io->rbio.hole)) { + struct bch_fs *c = io->write.op.c; + struct moving_context *ctxt = io->write.ctxt; + struct bch_read_bio *rbio = &io->write.rbio; + + if (ctxt->stats) { + if (rbio->bio.bi_status) + atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, + &ctxt->stats->sectors_error_uncorrected); + else if (rbio->saw_error) + atomic64_add(io->write.rbio.bvec_iter.bi_size >> 9, + &ctxt->stats->sectors_error_corrected); + } + + /* + * If the extent has been bitrotted, we're going to have to give it a + * new checksum in order to move it - but the poison bit will ensure + * that userspace still gets the appropriate error. + */ + if (unlikely(rbio->ret == -BCH_ERR_data_read_csum_err && + (bch2_bkey_extent_flags(bkey_i_to_s_c(io->write.k.k)) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)))) { + struct bch_extent_crc_unpacked crc = rbio->pick.crc; + struct nonce nonce = extent_nonce(rbio->version, crc); + + rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type, + nonce, &rbio->bio); + rbio->ret = 0; + } + + if (unlikely(rbio->ret || io->write.data_opts.scrub)) { move_free(io); return; } - if (trace_move_extent_write_enabled()) { - struct bch_fs *c = io->write.op.c; + if (trace_io_move_write_enabled()) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(io->write.k.k)); - trace_move_extent_write(c, buf.buf); + trace_io_move_write(c, buf.buf); printbuf_exit(&buf); } @@ -132,7 +170,7 @@ static void move_write(struct moving_io *io) atomic_add(io->write_sectors, &io->write.ctxt->write_sectors); atomic_inc(&io->write.ctxt->write_ios); - bch2_data_update_read_done(&io->write, io->rbio.pick.crc); + bch2_data_update_read_done(&io->write); } struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctxt) @@ -145,7 +183,7 @@ struct moving_io *bch2_moving_ctxt_next_pending_write(struct moving_context *ctx static void move_read_endio(struct bio *bio) { - struct moving_io *io = container_of(bio, struct moving_io, rbio.bio); + struct moving_io *io = container_of(bio, struct moving_io, write.rbio.bio); struct moving_context *ctxt = io->write.ctxt; atomic_sub(io->read_sectors, &ctxt->read_sectors); @@ -250,7 +288,7 @@ void bch2_move_stats_init(struct bch_move_stats *stats, const char *name) } int bch2_move_extent(struct moving_context *ctxt, - struct move_bucket_in_flight *bucket_in_flight, + struct move_bucket *bucket_in_flight, struct btree_iter *iter, struct bkey_s_c k, struct bch_io_opts io_opts, @@ -258,14 +296,10 @@ int bch2_move_extent(struct moving_context *ctxt, { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct moving_io *io; - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned sectors = k.k->size, pages; int ret = -ENOMEM; - trace_move_extent2(c, k, &io_opts, &data_opts); + trace_io_move2(c, k, &io_opts, &data_opts); + 
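move_write() above adds special handling for poisoned extents: a checksum failure would otherwise make the extent immovable, so the data is re-checksummed as-is and the move proceeds, while the poison flag keeps the error visible to readers. Condensed sketch (extent_poisoned stands in for the bch2_bkey_extent_flags() test in the real code):

if (rbio->ret == -BCH_ERR_data_read_csum_err && extent_poisoned) {
        struct bch_extent_crc_unpacked crc = rbio->pick.crc;
        struct nonce nonce = extent_nonce(rbio->version, crc);

        /* Re-checksum the bitrotted data so it can be rewritten: */
        rbio->pick.crc.csum = bch2_checksum_bio(c, rbio->pick.crc.csum_type,
                                                nonce, &rbio->bio);
        rbio->ret = 0;  /* move proceeds; readers still see the poison bit */
}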
this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); if (ctxt->stats) ctxt->stats->pos = BBPOS(iter->btree_id, iter->pos); @@ -273,7 +307,8 @@ int bch2_move_extent(struct moving_context *ctxt, bch2_data_update_opts_normalize(k, &data_opts); if (!data_opts.rewrite_ptrs && - !data_opts.extra_replicas) { + !data_opts.extra_replicas && + !data_opts.scrub) { if (data_opts.kill_ptrs) return bch2_extent_drop_ptrs(trans, iter, k, &io_opts, &data_opts); return 0; @@ -285,13 +320,7 @@ int bch2_move_extent(struct moving_context *ctxt, */ bch2_trans_unlock(trans); - /* write path might have to decompress data: */ - bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - sectors = max_t(unsigned, sectors, p.crc.uncompressed_size); - - pages = DIV_ROUND_UP(sectors, PAGE_SECTORS); - io = kzalloc(sizeof(struct moving_io) + - sizeof(struct bio_vec) * pages, GFP_KERNEL); + struct moving_io *io = kzalloc(sizeof(struct moving_io), GFP_KERNEL); if (!io) goto err; @@ -300,31 +329,27 @@ int bch2_move_extent(struct moving_context *ctxt, io->read_sectors = k.k->size; io->write_sectors = k.k->size; - bio_init(&io->write.op.wbio.bio, NULL, io->bi_inline_vecs, pages, 0); - io->write.op.wbio.bio.bi_ioprio = - IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); - - if (bch2_bio_alloc_pages(&io->write.op.wbio.bio, sectors << 9, - GFP_KERNEL)) - goto err_free; + if (!data_opts.scrub) { + ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, + &io_opts, data_opts, iter->btree_id, k); + if (ret) + goto err_free; - io->rbio.c = c; - io->rbio.opts = io_opts; - bio_init(&io->rbio.bio, NULL, io->bi_inline_vecs, pages, 0); - io->rbio.bio.bi_vcnt = pages; - io->rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); - io->rbio.bio.bi_iter.bi_size = sectors << 9; + io->write.op.end_io = move_write_done; + } else { + bch2_bkey_buf_init(&io->write.k); + bch2_bkey_buf_reassemble(&io->write.k, c, k); - io->rbio.bio.bi_opf = REQ_OP_READ; - io->rbio.bio.bi_iter.bi_sector = bkey_start_offset(k.k); - io->rbio.bio.bi_end_io = move_read_endio; + io->write.op.c = c; + io->write.data_opts = data_opts; - ret = bch2_data_update_init(trans, iter, ctxt, &io->write, ctxt->wp, - io_opts, data_opts, iter->btree_id, k); - if (ret) - goto err_free_pages; + ret = bch2_data_update_bios_init(&io->write, c, &io_opts); + if (ret) + goto err_free; + } - io->write.op.end_io = move_write_done; + io->write.rbio.bio.bi_end_io = move_read_endio; + io->write.rbio.bio.bi_ioprio = IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0); if (ctxt->rate) bch2_ratelimit_increment(ctxt->rate, k.k->size); @@ -339,9 +364,7 @@ int bch2_move_extent(struct moving_context *ctxt, atomic_inc(&io->b->count); } - this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); - this_cpu_add(c->counters[BCH_COUNTER_move_extent_read], k.k->size); - trace_move_extent_read2(c, k); + trace_io_move_read2(c, k); mutex_lock(&ctxt->lock); atomic_add(io->read_sectors, &ctxt->read_sectors); @@ -356,39 +379,39 @@ int bch2_move_extent(struct moving_context *ctxt, * ctxt when doing wakeup */ closure_get(&ctxt->cl); - bch2_read_extent(trans, &io->rbio, - bkey_start_pos(k.k), - iter->btree_id, k, 0, - BCH_READ_NODECODE| - BCH_READ_LAST_FRAGMENT); + __bch2_read_extent(trans, &io->write.rbio, + io->write.rbio.bio.bi_iter, + bkey_start_pos(k.k), + iter->btree_id, k, 0, + NULL, + BCH_READ_last_fragment, + data_opts.scrub ? 
data_opts.read_dev : -1); return 0; -err_free_pages: - bio_free_pages(&io->write.op.wbio.bio); err_free: kfree(io); err: - if (ret == -BCH_ERR_data_update_done) + if (bch2_err_matches(ret, BCH_ERR_data_update_done)) return 0; if (bch2_err_matches(ret, EROFS) || bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; - count_event(c, move_extent_start_fail); + count_event(c, io_move_start_fail); - if (trace_move_extent_start_fail_enabled()) { + if (trace_io_move_start_fail_enabled()) { struct printbuf buf = PRINTBUF; bch2_bkey_val_to_text(&buf, c, k); prt_str(&buf, ": "); prt_str(&buf, bch2_err_str(ret)); - trace_move_extent_start_fail(c, buf.buf); + trace_io_move_start_fail(c, buf.buf); printbuf_exit(&buf); } return ret; } -static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, +struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, struct per_snapshot_io_opts *io_opts, struct bpos extent_pos, /* extent_iter, extent_k may be in reflink btree */ struct btree_iter *extent_iter, @@ -399,6 +422,9 @@ static struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *trans, struct bch_io_opts *opts_ret = &io_opts->fs_io_opts; int ret = 0; + if (extent_iter->min_depth) + return opts_ret; + if (extent_k.k->type == KEY_TYPE_reflink_v) goto out; @@ -518,11 +544,42 @@ int bch2_move_ratelimit(struct moving_context *ctxt) return 0; } -static int bch2_move_data_btree(struct moving_context *ctxt, - struct bpos start, - struct bpos end, - move_pred_fn pred, void *arg, - enum btree_id btree_id) +/* + * Move requires non extents iterators, and there's also no need for it to + * signal indirect_extent_missing_error: + */ +static struct bkey_s_c bch2_lookup_indirect_extent_for_move(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c_reflink_p p) +{ + if (unlikely(REFLINK_P_ERROR(p.v))) + return bkey_s_c_null; + + struct bpos reflink_pos = POS(0, REFLINK_P_IDX(p.v)); + + bch2_trans_iter_init(trans, iter, + BTREE_ID_reflink, reflink_pos, + BTREE_ITER_not_extents); + + struct bkey_s_c k = bch2_btree_iter_peek(trans, iter); + if (!k.k || bkey_err(k)) { + bch2_trans_iter_exit(trans, iter); + return k; + } + + if (bkey_lt(reflink_pos, bkey_start_pos(k.k))) { + bch2_trans_iter_exit(trans, iter); + return bkey_s_c_null; + } + + return k; +} + +int bch2_move_data_btree(struct moving_context *ctxt, + struct bpos start, + struct bpos end, + move_pred_fn pred, void *arg, + enum btree_id btree_id, unsigned level) { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; @@ -548,10 +605,56 @@ static int bch2_move_data_btree(struct moving_context *ctxt, ctxt->stats->pos = BBPOS(btree_id, start); } +retry_root: bch2_trans_begin(trans); - bch2_trans_iter_init(trans, &iter, btree_id, start, - BTREE_ITER_prefetch| - BTREE_ITER_all_snapshots); + + if (level == bch2_btree_id_root(c, btree_id)->level + 1) { + bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level - 1, + BTREE_ITER_prefetch| + BTREE_ITER_not_extents| + BTREE_ITER_all_snapshots); + struct btree *b = bch2_btree_iter_peek_node(trans, &iter); + ret = PTR_ERR_OR_ZERO(b); + if (ret) + goto root_err; + + if (b != btree_node_root(c, b)) { + bch2_trans_iter_exit(trans, &iter); + goto retry_root; + } + + k = bkey_i_to_s_c(&b->key); + + io_opts = bch2_move_get_io_opts(trans, &snapshot_io_opts, + iter.pos, &iter, k); + ret = PTR_ERR_OR_ZERO(io_opts); + if (ret) + goto root_err; + + memset(&data_opts, 0, sizeof(data_opts)); + if (!pred(c, arg, iter.btree_id, k, io_opts, &data_opts)) + goto 
out; + + + if (!data_opts.scrub) + ret = bch2_btree_node_rewrite_pos(trans, btree_id, level, + k.k->p, data_opts.target, 0); + else + ret = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev); + +root_err: + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) { + bch2_trans_iter_exit(trans, &iter); + goto retry_root; + } + + goto out; + } + + bch2_trans_node_iter_init(trans, &iter, btree_id, start, 0, level, + BTREE_ITER_prefetch| + BTREE_ITER_not_extents| + BTREE_ITER_all_snapshots); if (ctxt->rate) bch2_ratelimit_reset(ctxt->rate); @@ -561,7 +664,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, bch2_trans_begin(trans); - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek(trans, &iter); if (!k.k) break; @@ -571,7 +674,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, if (ret) break; - if (bkey_ge(bkey_start_pos(k.k), end)) + if (bkey_gt(bkey_start_pos(k.k), end)) break; if (ctxt->stats) @@ -581,17 +684,16 @@ static int bch2_move_data_btree(struct moving_context *ctxt, k.k->type == KEY_TYPE_reflink_p && REFLINK_P_MAY_UPDATE_OPTIONS(bkey_s_c_to_reflink_p(k).v)) { struct bkey_s_c_reflink_p p = bkey_s_c_to_reflink_p(k); - s64 offset_into_extent = iter.pos.offset - bkey_start_offset(k.k); bch2_trans_iter_exit(trans, &reflink_iter); - k = bch2_lookup_indirect_extent(trans, &reflink_iter, &offset_into_extent, p, true, 0); + k = bch2_lookup_indirect_extent_for_move(trans, &reflink_iter, p); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; - if (bkey_deleted(k.k)) + if (!k.k) goto next_nondata; /* @@ -612,7 +714,7 @@ static int bch2_move_data_btree(struct moving_context *ctxt, continue; memset(&data_opts, 0, sizeof(data_opts)); - if (!pred(c, arg, k, io_opts, &data_opts)) + if (!pred(c, arg, extent_iter->btree_id, k, io_opts, &data_opts)) goto next; /* @@ -622,12 +724,19 @@ static int bch2_move_data_btree(struct moving_context *ctxt, bch2_bkey_buf_reassemble(&sk, c, k); k = bkey_i_to_s_c(sk.k); - ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); + if (!level) + ret2 = bch2_move_extent(ctxt, NULL, extent_iter, k, *io_opts, data_opts); + else if (!data_opts.scrub) + ret2 = bch2_btree_node_rewrite_pos(trans, btree_id, level, + k.k->p, data_opts.target, 0); + else + ret2 = bch2_btree_node_scrub(trans, btree_id, level, k, data_opts.read_dev); + if (ret2) { if (bch2_err_matches(ret2, BCH_ERR_transaction_restart)) continue; - if (ret2 == -ENOMEM) { + if (bch2_err_matches(ret2, ENOMEM)) { /* memory allocation failure, wait for some IO to finish */ bch2_move_ctxt_wait_for_io(ctxt); continue; @@ -640,9 +749,10 @@ next: if (ctxt->stats) atomic64_add(k.k->size, &ctxt->stats->sectors_seen); next_nondata: - bch2_btree_iter_advance(&iter); + if (!bch2_btree_iter_advance(trans, &iter)) + break; } - +out: bch2_trans_iter_exit(trans, &reflink_iter); bch2_trans_iter_exit(trans, &iter); bch2_bkey_buf_exit(&sk, c); @@ -672,7 +782,7 @@ int __bch2_move_data(struct moving_context *ctxt, ret = bch2_move_data_btree(ctxt, id == start.btree ? start.pos : POS_MIN, id == end.btree ? 
end.pos : POS_MAX, - pred, arg, id); + pred, arg, id, 0); if (ret) break; } @@ -689,21 +799,23 @@ int bch2_move_data(struct bch_fs *c, bool wait_on_copygc, move_pred_fn pred, void *arg) { - struct moving_context ctxt; - int ret; bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); - ret = __bch2_move_data(&ctxt, start, end, pred, arg); + int ret = __bch2_move_data(&ctxt, start, end, pred, arg); bch2_moving_ctxt_exit(&ctxt); return ret; } -int bch2_evacuate_bucket(struct moving_context *ctxt, - struct move_bucket_in_flight *bucket_in_flight, - struct bpos bucket, int gen, - struct data_update_opts _data_opts) +static int __bch2_move_data_phys(struct moving_context *ctxt, + struct move_bucket *bucket_in_flight, + unsigned dev, + u64 bucket_start, + u64 bucket_end, + unsigned data_types, + bool copygc, + move_pred_fn pred, void *arg) { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; @@ -712,16 +824,18 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct btree_iter iter = {}, bp_iter = {}; struct bkey_buf sk; struct bkey_s_c k; - struct data_update_opts data_opts; - unsigned sectors_moved = 0; struct bkey_buf last_flushed; + u64 check_mismatch_done = bucket_start; int ret = 0; - struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); + struct bch_dev *ca = bch2_dev_tryget(c, dev); if (!ca) return 0; - trace_bucket_evacuate(c, &bucket); + bucket_end = min(bucket_end, ca->mi.nbuckets); + + struct bpos bp_start = bucket_pos_to_bp_start(ca, POS(dev, bucket_start)); + struct bpos bp_end = bucket_pos_to_bp_end(ca, POS(dev, bucket_end)); bch2_bkey_buf_init(&last_flushed); bkey_init(&last_flushed.k->k); @@ -732,15 +846,11 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, */ bch2_trans_begin(trans); - bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, - bucket_pos_to_bp_start(ca, bucket), 0); - - bch_err_msg(c, ret, "looking up alloc key"); - if (ret) - goto err; + bch2_trans_iter_init(trans, &bp_iter, BTREE_ID_backpointers, bp_start, 0); ret = bch2_btree_write_buffer_tryflush(trans); - bch_err_msg(c, ret, "flushing btree write buffer"); + if (!bch2_err_matches(ret, EROFS)) + bch_err_msg(c, ret, "flushing btree write buffer"); if (ret) goto err; @@ -750,122 +860,182 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, bch2_trans_begin(trans); - k = bch2_btree_iter_peek(&bp_iter); + k = bch2_btree_iter_peek(trans, &bp_iter); ret = bkey_err(k); if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) goto err; - if (!k.k || bkey_gt(k.k->p, bucket_pos_to_bp_end(ca, bucket))) + if (!k.k || bkey_gt(k.k->p, bp_end)) break; + if (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) { + while (check_mismatch_done < bp_pos_to_bucket(ca, k.k->p).offset) { + bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++, + copygc, &last_flushed); + } + continue; + } + if (k.k->type != KEY_TYPE_backpointer) goto next; struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - if (!bp.v->level) { - k = bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); - ret = bkey_err(k); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; - if (!k.k) - goto next; + if (ctxt->stats) + ctxt->stats->offset = bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT; - bch2_bkey_buf_reassemble(&sk, c, k); - k = bkey_i_to_s_c(sk.k); + if (!(data_types & BIT(bp.v->data_type))) + goto next; + + if (!bp.v->level && bp.v->btree_id == BTREE_ID_stripes) + goto next; + + k = 
bch2_backpointer_get_key(trans, bp, &iter, 0, &last_flushed); + ret = bkey_err(k); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret) + goto err; + if (!k.k) + goto next; + if (!bp.v->level) { ret = bch2_move_get_io_opts_one(trans, &io_opts, &iter, k); if (ret) { bch2_trans_iter_exit(trans, &iter); continue; } + } - data_opts = _data_opts; - data_opts.target = io_opts.background_target; - data_opts.rewrite_ptrs = 0; - - unsigned sectors = bp.v->bucket_len; /* move_extent will drop locks */ - unsigned i = 0; - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { - if (p.ptr.dev == bucket.inode) { - if (p.ptr.cached) { - bch2_trans_iter_exit(trans, &iter); - goto next; - } - data_opts.rewrite_ptrs |= 1U << i; - break; - } - i++; - } - - ret = bch2_move_extent(ctxt, bucket_in_flight, - &iter, k, io_opts, data_opts); + struct data_update_opts data_opts = {}; + if (!pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts)) { bch2_trans_iter_exit(trans, &iter); + goto next; + } - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret == -ENOMEM) { - /* memory allocation failure, wait for some IO to finish */ - bch2_move_ctxt_wait_for_io(ctxt); - continue; - } - if (ret) - goto err; - - if (ctxt->stats) - atomic64_add(sectors, &ctxt->stats->sectors_seen); - sectors_moved += sectors; - } else { - struct btree *b; + if (data_opts.scrub && + !bch2_dev_idx_is_online(c, data_opts.read_dev)) { + bch2_trans_iter_exit(trans, &iter); + ret = -BCH_ERR_device_offline; + break; + } - b = bch2_backpointer_get_node(trans, bp, &iter, &last_flushed); - ret = PTR_ERR_OR_ZERO(b); - if (ret == -BCH_ERR_backpointer_to_overwritten_btree_node) - goto next; - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; - if (!b) - goto next; + bch2_bkey_buf_reassemble(&sk, c, k); + k = bkey_i_to_s_c(sk.k); - unsigned sectors = btree_ptr_sectors_written(bkey_i_to_s_c(&b->key)); + /* move_extent will drop locks */ + unsigned sectors = bp.v->bucket_len; - ret = bch2_btree_node_rewrite(trans, &iter, b, 0); - bch2_trans_iter_exit(trans, &iter); + if (!bp.v->level) + ret = bch2_move_extent(ctxt, bucket_in_flight, &iter, k, io_opts, data_opts); + else if (!data_opts.scrub) + ret = bch2_btree_node_rewrite_pos(trans, bp.v->btree_id, bp.v->level, + k.k->p, data_opts.target, 0); + else + ret = bch2_btree_node_scrub(trans, bp.v->btree_id, bp.v->level, k, data_opts.read_dev); - if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) - continue; - if (ret) - goto err; + bch2_trans_iter_exit(trans, &iter); - if (ctxt->rate) - bch2_ratelimit_increment(ctxt->rate, sectors); - if (ctxt->stats) { - atomic64_add(sectors, &ctxt->stats->sectors_seen); - atomic64_add(sectors, &ctxt->stats->sectors_moved); - } - sectors_moved += btree_sectors(c); + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + continue; + if (ret == -ENOMEM) { + /* memory allocation failure, wait for some IO to finish */ + bch2_move_ctxt_wait_for_io(ctxt); + continue; } + if (ret) + goto err; + + if (ctxt->stats) + atomic64_add(sectors, &ctxt->stats->sectors_seen); next: - bch2_btree_iter_advance(&bp_iter); + bch2_btree_iter_advance(trans, &bp_iter); } - trace_evacuate_bucket(c, &bucket, sectors_moved, ca->mi.bucket_size, ret); + while (check_mismatch_done < bucket_end) + bch2_check_bucket_backpointer_mismatch(trans, ca, check_mismatch_done++, + copygc, &last_flushed); err: bch2_trans_iter_exit(trans, 
&bp_iter); - bch2_dev_put(ca); bch2_bkey_buf_exit(&sk, c); bch2_bkey_buf_exit(&last_flushed, c); + bch2_dev_put(ca); return ret; } +int bch2_move_data_phys(struct bch_fs *c, + unsigned dev, + u64 start, + u64 end, + unsigned data_types, + struct bch_ratelimit *rate, + struct bch_move_stats *stats, + struct write_point_specifier wp, + bool wait_on_copygc, + move_pred_fn pred, void *arg) +{ + struct moving_context ctxt; + + bch2_trans_run(c, bch2_btree_write_buffer_flush_sync(trans)); + + bch2_moving_ctxt_init(&ctxt, c, rate, stats, wp, wait_on_copygc); + if (ctxt.stats) { + ctxt.stats->phys = true; + ctxt.stats->data_type = (int) DATA_PROGRESS_DATA_TYPE_phys; + } + + int ret = __bch2_move_data_phys(&ctxt, NULL, dev, start, end, + data_types, false, pred, arg); + bch2_moving_ctxt_exit(&ctxt); + + return ret; +} + +struct evacuate_bucket_arg { + struct bpos bucket; + int gen; + struct data_update_opts data_opts; +}; + +static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, + enum btree_id btree, struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + struct evacuate_bucket_arg *arg = _arg; + + *data_opts = arg->data_opts; + + unsigned i = 0; + bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { + if (ptr->dev == arg->bucket.inode && + (arg->gen < 0 || arg->gen == ptr->gen) && + !ptr->cached) + data_opts->rewrite_ptrs |= BIT(i); + i++; + } + + return data_opts->rewrite_ptrs != 0; +} + +int bch2_evacuate_bucket(struct moving_context *ctxt, + struct move_bucket *bucket_in_flight, + struct bpos bucket, int gen, + struct data_update_opts data_opts) +{ + struct evacuate_bucket_arg arg = { bucket, gen, data_opts, }; + + return __bch2_move_data_phys(ctxt, bucket_in_flight, + bucket.inode, + bucket.offset, + bucket.offset + 1, + ~0, + true, + evacuate_bucket_pred, &arg); +} + typedef bool (*move_btree_pred)(struct bch_fs *, void *, struct btree *, struct bch_io_opts *, struct data_update_opts *); @@ -906,7 +1076,7 @@ static int bch2_move_btree(struct bch_fs *c, retry: ret = 0; while (bch2_trans_begin(trans), - (b = bch2_btree_iter_peek_node(&iter)) && + (b = bch2_btree_iter_peek_node(trans, &iter)) && !(ret = PTR_ERR_OR_ZERO(b))) { if (kthread && kthread_should_stop()) break; @@ -920,13 +1090,13 @@ retry: if (!pred(c, arg, b, &io_opts, &data_opts)) goto next; - ret = bch2_btree_node_rewrite(trans, &iter, b, 0) ?: ret; + ret = bch2_btree_node_rewrite(trans, &iter, b, 0, 0) ?: ret; if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) continue; if (ret) break; next: - bch2_btree_iter_next_node(&iter); + bch2_btree_iter_next_node(trans, &iter); } if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; @@ -945,7 +1115,7 @@ next: } static bool rereplicate_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { @@ -977,7 +1147,7 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg, } static bool migrate_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { @@ -1004,15 +1174,7 @@ static bool rereplicate_btree_pred(struct bch_fs *c, void *arg, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { - return rereplicate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); -} - -static bool migrate_btree_pred(struct bch_fs *c, void *arg, - struct btree *b, - struct bch_io_opts *io_opts, - struct data_update_opts 
*data_opts) -{ - return migrate_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); + return rereplicate_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), io_opts, data_opts); } /* @@ -1068,7 +1230,7 @@ int bch2_scan_old_btree_nodes(struct bch_fs *c, struct bch_move_stats *stats) } static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, + enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { @@ -1101,7 +1263,32 @@ static bool drop_extra_replicas_btree_pred(struct bch_fs *c, void *arg, struct bch_io_opts *io_opts, struct data_update_opts *data_opts) { - return drop_extra_replicas_pred(c, arg, bkey_i_to_s_c(&b->key), io_opts, data_opts); + return drop_extra_replicas_pred(c, arg, b->c.btree_id, bkey_i_to_s_c(&b->key), + io_opts, data_opts); +} + +static bool scrub_pred(struct bch_fs *c, void *_arg, + enum btree_id btree, struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) +{ + struct bch_ioctl_data *arg = _arg; + + if (k.k->type != KEY_TYPE_btree_ptr_v2) { + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + bkey_for_each_ptr_decode(k.k, ptrs, p, entry) + if (p.ptr.dev == arg->migrate.dev) { + if (!p.crc.csum_type) + return false; + break; + } + } + + data_opts->scrub = true; + data_opts->read_dev = arg->migrate.dev; + return true; } int bch2_data_job(struct bch_fs *c, @@ -1118,6 +1305,22 @@ int bch2_data_job(struct bch_fs *c, bch2_move_stats_init(stats, bch2_data_ops_strs[op.op]); switch (op.op) { + case BCH_DATA_OP_scrub: + /* + * prevent tests from spuriously failing, make sure we see all + * btree nodes that need to be repaired + */ + bch2_btree_interior_updates_flush(c); + + ret = bch2_move_data_phys(c, op.scrub.dev, 0, U64_MAX, + op.scrub.data_types, + NULL, + stats, + writepoint_hashed((unsigned long) current), + false, + scrub_pred, &op) ?: ret; + break; + case BCH_DATA_OP_rereplicate: stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, -1); @@ -1137,14 +1340,14 @@ int bch2_data_job(struct bch_fs *c, stats->data_type = BCH_DATA_journal; ret = bch2_journal_flush_device_pins(&c->journal, op.migrate.dev); - ret = bch2_move_btree(c, start, end, - migrate_btree_pred, &op, stats) ?: ret; - ret = bch2_move_data(c, start, end, - NULL, - stats, - writepoint_hashed((unsigned long) current), - true, - migrate_pred, &op) ?: ret; + ret = bch2_move_data_phys(c, op.migrate.dev, 0, U64_MAX, + ~0, + NULL, + stats, + writepoint_hashed((unsigned long) current), + true, + migrate_pred, &op) ?: ret; + bch2_btree_interior_updates_flush(c); ret = bch2_replicas_gc2(c) ?: ret; break; case BCH_DATA_OP_rewrite_old_nodes: @@ -1176,17 +1379,17 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) prt_newline(out); printbuf_indent_add(out, 2); - prt_printf(out, "keys moved: %llu\n", atomic64_read(&stats->keys_moved)); - prt_printf(out, "keys raced: %llu\n", atomic64_read(&stats->keys_raced)); - prt_printf(out, "bytes seen: "); + prt_printf(out, "keys moved:\t%llu\n", atomic64_read(&stats->keys_moved)); + prt_printf(out, "keys raced:\t%llu\n", atomic64_read(&stats->keys_raced)); + prt_printf(out, "bytes seen:\t"); prt_human_readable_u64(out, atomic64_read(&stats->sectors_seen) << 9); prt_newline(out); - prt_printf(out, "bytes moved: "); + prt_printf(out, "bytes moved:\t"); prt_human_readable_u64(out, atomic64_read(&stats->sectors_moved) << 9); 
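/*
 * Editorial aside, not part of this patch: move_pred_fn (see the move.h
 * hunk below) now also receives the btree ID, so a single predicate can
 * drive both the extent and btree-node paths of __bch2_move_data_phys().
 * A hypothetical predicate in the style of evacuate_bucket_pred() above,
 * draining every non-cached pointer on one device, might look like:
 *
 *	static bool drain_dev_pred(struct bch_fs *c, void *_arg,
 *				   enum btree_id btree, struct bkey_s_c k,
 *				   struct bch_io_opts *io_opts,
 *				   struct data_update_opts *data_opts)
 *	{
 *		unsigned *dev = _arg;
 *		unsigned i = 0;
 *
 *		bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) {
 *			if (ptr->dev == *dev && !ptr->cached)
 *				data_opts->rewrite_ptrs |= BIT(i);
 *			i++;
 *		}
 *		return data_opts->rewrite_ptrs != 0;
 *	}
 *
 * Such a predicate would be passed to bch2_move_data_phys() along with
 * &dev as the opaque arg.
 */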
prt_newline(out); - prt_printf(out, "bytes raced: "); + prt_printf(out, "bytes raced:\t"); prt_human_readable_u64(out, atomic64_read(&stats->sectors_raced) << 9); prt_newline(out); @@ -1195,7 +1398,8 @@ void bch2_move_stats_to_text(struct printbuf *out, struct bch_move_stats *stats) static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, struct moving_context *ctxt) { - struct moving_io *io; + if (!out->nr_tabstops) + printbuf_tabstop_push(out, 32); bch2_move_stats_to_text(out, ctxt->stats); printbuf_indent_add(out, 2); @@ -1215,8 +1419,9 @@ static void bch2_moving_ctxt_to_text(struct printbuf *out, struct bch_fs *c, str printbuf_indent_add(out, 2); mutex_lock(&ctxt->lock); + struct moving_io *io; list_for_each_entry(io, &ctxt->ios, io_list) - bch2_write_op_to_text(out, &io->write.op); + bch2_data_update_inflight_to_text(out, &io->write); mutex_unlock(&ctxt->lock); printbuf_indent_sub(out, 4); diff --git a/fs/bcachefs/move.h b/fs/bcachefs/move.h index 51e0505a8156..86b80499ac55 100644 --- a/fs/bcachefs/move.h +++ b/fs/bcachefs/move.h @@ -72,7 +72,7 @@ do { \ break; \ } while (1) -typedef bool (*move_pred_fn)(struct bch_fs *, void *, struct bkey_s_c, +typedef bool (*move_pred_fn)(struct bch_fs *, void *, enum btree_id, struct bkey_s_c, struct bch_io_opts *, struct data_update_opts *); extern const char * const bch2_data_ops_strs[]; @@ -116,12 +116,18 @@ int bch2_move_get_io_opts_one(struct btree_trans *, struct bch_io_opts *, int bch2_scan_old_btree_nodes(struct bch_fs *, struct bch_move_stats *); int bch2_move_extent(struct moving_context *, - struct move_bucket_in_flight *, + struct move_bucket *, struct btree_iter *, struct bkey_s_c, struct bch_io_opts, struct data_update_opts); +struct bch_io_opts *bch2_move_get_io_opts(struct btree_trans *, + struct per_snapshot_io_opts *, struct bpos, + struct btree_iter *, struct bkey_s_c); + +int bch2_move_data_btree(struct moving_context *, struct bpos, struct bpos, + move_pred_fn, void *, enum btree_id, unsigned); int __bch2_move_data(struct moving_context *, struct bbpos, struct bbpos, @@ -135,8 +141,13 @@ int bch2_move_data(struct bch_fs *, bool, move_pred_fn, void *); +int bch2_move_data_phys(struct bch_fs *, unsigned, u64, u64, unsigned, + struct bch_ratelimit *, struct bch_move_stats *, + struct write_point_specifier, bool, + move_pred_fn, void *); + int bch2_evacuate_bucket(struct moving_context *, - struct move_bucket_in_flight *, + struct move_bucket *, struct bpos, int, struct data_update_opts); int bch2_data_job(struct bch_fs *, diff --git a/fs/bcachefs/move_types.h b/fs/bcachefs/move_types.h index e22841ef31e4..c5c62cd600de 100644 --- a/fs/bcachefs/move_types.h +++ b/fs/bcachefs/move_types.h @@ -3,33 +3,43 @@ #define _BCACHEFS_MOVE_TYPES_H #include "bbpos_types.h" +#include "bcachefs_ioctl.h" struct bch_move_stats { - enum bch_data_type data_type; - struct bbpos pos; char name[32]; + bool phys; + enum bch_ioctl_data_event_ret ret; + + union { + struct { + enum bch_data_type data_type; + struct bbpos pos; + }; + struct { + unsigned dev; + u64 offset; + }; + }; atomic64_t keys_moved; atomic64_t keys_raced; atomic64_t sectors_seen; atomic64_t sectors_moved; atomic64_t sectors_raced; + atomic64_t sectors_error_corrected; + atomic64_t sectors_error_uncorrected; }; struct move_bucket_key { struct bpos bucket; - u8 gen; + unsigned gen; }; struct move_bucket { + struct move_bucket *next; + struct rhash_head hash; struct move_bucket_key k; unsigned sectors; -}; - -struct move_bucket_in_flight { - struct move_bucket_in_flight 
*next; - struct rhash_head hash; - struct move_bucket bucket; atomic_t count; }; diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index 21805509ab9e..e7a2a13554d7 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -8,6 +8,7 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "backpointers.h" #include "btree_iter.h" #include "btree_update.h" #include "btree_write_buffer.h" @@ -27,67 +28,46 @@ #include <linux/wait.h> struct buckets_in_flight { - struct rhashtable table; - struct move_bucket_in_flight *first; - struct move_bucket_in_flight *last; - size_t nr; - size_t sectors; + struct rhashtable table; + struct move_bucket *first; + struct move_bucket *last; + size_t nr; + size_t sectors; + + DARRAY(struct move_bucket *) to_evacuate; }; static const struct rhashtable_params bch_move_bucket_params = { - .head_offset = offsetof(struct move_bucket_in_flight, hash), - .key_offset = offsetof(struct move_bucket_in_flight, bucket.k), + .head_offset = offsetof(struct move_bucket, hash), + .key_offset = offsetof(struct move_bucket, k), .key_len = sizeof(struct move_bucket_key), .automatic_shrinking = true, }; -static struct move_bucket_in_flight * -move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket b) +static void move_bucket_in_flight_add(struct buckets_in_flight *list, struct move_bucket *b) { - struct move_bucket_in_flight *new = kzalloc(sizeof(*new), GFP_KERNEL); - int ret; - - if (!new) - return ERR_PTR(-ENOMEM); - - new->bucket = b; - - ret = rhashtable_lookup_insert_fast(&list->table, &new->hash, - bch_move_bucket_params); - if (ret) { - kfree(new); - return ERR_PTR(ret); - } - if (!list->first) - list->first = new; + list->first = b; else - list->last->next = new; + list->last->next = b; - list->last = new; + list->last = b; list->nr++; - list->sectors += b.sectors; - return new; + list->sectors += b->sectors; } static int bch2_bucket_is_movable(struct btree_trans *trans, struct move_bucket *b, u64 time) { struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - struct bch_alloc_v4 _a; - const struct bch_alloc_v4 *a; - int ret; - if (bch2_bucket_is_open(trans->c, - b->k.bucket.inode, - b->k.bucket.offset)) + if (bch2_bucket_is_open(c, b->k.bucket.inode, b->k.bucket.offset)) return 0; - k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, - b->k.bucket, BTREE_ITER_cached); - ret = bkey_err(k); + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, BTREE_ID_alloc, + b->k.bucket, BTREE_ITER_cached); + int ret = bkey_err(k); if (ret) return ret; @@ -95,25 +75,40 @@ static int bch2_bucket_is_movable(struct btree_trans *trans, if (!ca) goto out; - a = bch2_alloc_to_v4(k, &_a); + if (bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b->k.bucket.offset)) + goto out; + + if (ca->mi.state != BCH_MEMBER_STATE_rw || + !bch2_dev_is_online(ca)) + goto out; + + struct bch_alloc_v4 _a; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); b->k.gen = a->gen; b->sectors = bch2_bucket_sectors_dirty(*a); u64 lru_idx = alloc_lru_idx_fragmentation(*a, ca); ret = lru_idx && lru_idx <= time; - - bch2_dev_put(ca); out: + bch2_dev_put(ca); bch2_trans_iter_exit(trans, &iter); return ret; } +static void move_bucket_free(struct buckets_in_flight *list, + struct move_bucket *b) +{ + int ret = rhashtable_remove_fast(&list->table, &b->hash, + bch_move_bucket_params); + BUG_ON(ret); + kfree(b); +} + static void move_buckets_wait(struct moving_context *ctxt, struct 
buckets_in_flight *list, bool flush) { - struct move_bucket_in_flight *i; - int ret; + struct move_bucket *i; while ((i = list->first)) { if (flush) @@ -127,12 +122,9 @@ static void move_buckets_wait(struct moving_context *ctxt, list->last = NULL; list->nr--; - list->sectors -= i->bucket.sectors; + list->sectors -= i->sectors; - ret = rhashtable_remove_fast(&list->table, &i->hash, - bch_move_bucket_params); - BUG_ON(ret); - kfree(i); + move_bucket_free(list, i); } bch2_trans_unlock_long(ctxt->trans); @@ -144,11 +136,8 @@ static bool bucket_in_flight(struct buckets_in_flight *list, return rhashtable_lookup_fast(&list->table, &k, bch_move_bucket_params); } -typedef DARRAY(struct move_bucket) move_buckets; - static int bch2_copygc_get_buckets(struct moving_context *ctxt, - struct buckets_in_flight *buckets_in_flight, - move_buckets *buckets) + struct buckets_in_flight *buckets_in_flight) { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; @@ -165,11 +154,9 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, if (bch2_fs_fatal_err_on(ret, c, "%s: from bch2_btree_write_buffer_tryflush()", bch2_err_str(ret))) return ret; - bch2_trans_begin(trans); - ret = for_each_btree_key_max(trans, iter, BTREE_ID_lru, - lru_pos(BCH_LRU_FRAGMENTATION_START, 0, 0), - lru_pos(BCH_LRU_FRAGMENTATION_START, U64_MAX, LRU_TIME_MAX), + lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, 0, 0), + lru_pos(BCH_LRU_BUCKET_FRAGMENTATION, U64_MAX, LRU_TIME_MAX), 0, k, ({ struct move_bucket b = { .k.bucket = u64_to_bucket(k.k->p.offset) }; int ret2 = 0; @@ -185,20 +172,34 @@ static int bch2_copygc_get_buckets(struct moving_context *ctxt, else if (bucket_in_flight(buckets_in_flight, b.k)) in_flight++; else { - ret2 = darray_push(buckets, b); + struct move_bucket *b_i = kmalloc(sizeof(*b_i), GFP_KERNEL); + ret2 = b_i ? 0 : -ENOMEM; if (ret2) goto err; + + *b_i = b; + + ret2 = darray_push(&buckets_in_flight->to_evacuate, b_i); + if (ret2) { + kfree(b_i); + goto err; + } + + ret2 = rhashtable_lookup_insert_fast(&buckets_in_flight->table, &b_i->hash, + bch_move_bucket_params); + BUG_ON(ret2); + sectors += b.sectors; } - ret2 = buckets->nr >= nr_to_get; + ret2 = buckets_in_flight->to_evacuate.nr >= nr_to_get; err: ret2; })); pr_debug("have: %zu (%zu) saw %zu in flight %zu not movable %zu got %zu (%zu)/%zu buckets ret %i", buckets_in_flight->nr, buckets_in_flight->sectors, - saw, in_flight, not_movable, buckets->nr, sectors, nr_to_get, ret); + saw, in_flight, not_movable, buckets_in_flight->to_evacuate.nr, sectors, nr_to_get, ret); return ret < 0 ? 
ret : 0; } @@ -213,40 +214,30 @@ static int bch2_copygc(struct moving_context *ctxt, struct data_update_opts data_opts = { .btree_insert_flags = BCH_WATERMARK_copygc, }; - move_buckets buckets = { 0 }; - struct move_bucket_in_flight *f; u64 sectors_seen = atomic64_read(&ctxt->stats->sectors_seen); u64 sectors_moved = atomic64_read(&ctxt->stats->sectors_moved); int ret = 0; - ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight, &buckets); + ret = bch2_copygc_get_buckets(ctxt, buckets_in_flight); if (ret) goto err; - darray_for_each(buckets, i) { + darray_for_each(buckets_in_flight->to_evacuate, i) { if (kthread_should_stop() || freezing(current)) break; - f = move_bucket_in_flight_add(buckets_in_flight, *i); - ret = PTR_ERR_OR_ZERO(f); - if (ret == -EEXIST) { /* rare race: copygc_get_buckets returned same bucket more than once */ - ret = 0; - continue; - } - if (ret == -ENOMEM) { /* flush IO, continue later */ - ret = 0; - break; - } + struct move_bucket *b = *i; + *i = NULL; - ret = bch2_evacuate_bucket(ctxt, f, f->bucket.k.bucket, - f->bucket.k.gen, data_opts); + move_bucket_in_flight_add(buckets_in_flight, b); + + ret = bch2_evacuate_bucket(ctxt, b, b->k.bucket, b->k.gen, data_opts); if (ret) goto err; *did_work = true; } err: - /* no entries in LRU btree found, or got to end: */ if (bch2_err_matches(ret, ENOENT)) ret = 0; @@ -256,12 +247,34 @@ err: sectors_seen = atomic64_read(&ctxt->stats->sectors_seen) - sectors_seen; sectors_moved = atomic64_read(&ctxt->stats->sectors_moved) - sectors_moved; - trace_and_count(c, copygc, c, buckets.nr, sectors_seen, sectors_moved); + trace_and_count(c, copygc, c, buckets_in_flight->to_evacuate.nr, sectors_seen, sectors_moved); - darray_exit(&buckets); + darray_for_each(buckets_in_flight->to_evacuate, i) + if (*i) + move_bucket_free(buckets_in_flight, *i); + darray_exit(&buckets_in_flight->to_evacuate); return ret; } +static u64 bch2_copygc_dev_wait_amount(struct bch_dev *ca) +{ + struct bch_dev_usage_full usage_full = bch2_dev_usage_full_read(ca); + struct bch_dev_usage usage; + + for (unsigned i = 0; i < BCH_DATA_NR; i++) + usage.buckets[i] = usage_full.d[i].buckets; + + s64 fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * + ca->mi.bucket_size) >> 1); + s64 fragmented = 0; + + for (unsigned i = 0; i < BCH_DATA_NR; i++) + if (data_type_movable(i)) + fragmented += usage_full.d[i].fragmented; + + return max(0LL, fragmented_allowed - fragmented); +} + /* * Copygc runs when the amount of fragmented data is above some arbitrary * threshold: @@ -276,23 +289,14 @@ err: * often and continually reduce the amount of fragmented space as the device * fills up. So, we increase the threshold by half the current free space. 
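 *
 * A worked example with illustrative numbers, not taken from this patch:
 * per bch2_copygc_dev_wait_amount() above, a device with 1024 buckets
 * available and 2048 sector buckets gets
 * fragmented_allowed = 1024 * 2048 / 2 = 1048576 sectors (512MiB). If
 * 614400 sectors (300MiB) of movable data are fragmented, copygc sleeps
 * on the io_clock until roughly 434176 more sectors (~212MiB) have been
 * written.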
*/ -unsigned long bch2_copygc_wait_amount(struct bch_fs *c) +u64 bch2_copygc_wait_amount(struct bch_fs *c) { - s64 wait = S64_MAX, fragmented_allowed, fragmented; - - for_each_rw_member(c, ca) { - struct bch_dev_usage usage = bch2_dev_usage_read(ca); - - fragmented_allowed = ((__dev_buckets_available(ca, usage, BCH_WATERMARK_stripe) * - ca->mi.bucket_size) >> 1); - fragmented = 0; + u64 wait = U64_MAX; - for (unsigned i = 0; i < BCH_DATA_NR; i++) - if (data_type_movable(i)) - fragmented += usage.d[i].fragmented; - - wait = min(wait, max(0LL, fragmented_allowed - fragmented)); - } + rcu_read_lock(); + for_each_rw_member_rcu(c, ca) + wait = min(wait, bch2_copygc_dev_wait_amount(ca)); + rcu_read_unlock(); return wait; } @@ -315,9 +319,28 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) c->copygc_wait_at) << 9); prt_newline(out); - prt_printf(out, "Currently calculated wait:\t"); - prt_human_readable_u64(out, bch2_copygc_wait_amount(c)); - prt_newline(out); + bch2_printbuf_make_room(out, 4096); + + rcu_read_lock(); + out->atomic++; + + prt_printf(out, "Currently calculated wait:\n"); + for_each_rw_member_rcu(c, ca) { + prt_printf(out, " %s:\t", ca->name); + prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca)); + prt_newline(out); + } + + struct task_struct *t = rcu_dereference(c->copygc_thread); + if (t) + get_task_struct(t); + --out->atomic; + rcu_read_unlock(); + + if (t) { + bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); + put_task_struct(t); + } } static int bch2_copygc_thread(void *arg) @@ -326,22 +349,23 @@ static int bch2_copygc_thread(void *arg) struct moving_context ctxt; struct bch_move_stats move_stats; struct io_clock *clock = &c->io_clock[WRITE]; - struct buckets_in_flight *buckets; + struct buckets_in_flight buckets = {}; u64 last, wait; - int ret = 0; - buckets = kzalloc(sizeof(struct buckets_in_flight), GFP_KERNEL); - if (!buckets) - return -ENOMEM; - ret = rhashtable_init(&buckets->table, &bch_move_bucket_params); + int ret = rhashtable_init(&buckets.table, &bch_move_bucket_params); bch_err_msg(c, ret, "allocating copygc buckets in flight"); - if (ret) { - kfree(buckets); + if (ret) return ret; - } set_freezable(); + /* + * Data move operations can't run until after check_snapshots has + * completed, and bch2_snapshot_is_ancestor() is available. 
+ */ + kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots || + kthread_should_stop()); + bch2_move_stats_init(&move_stats, "copygc"); bch2_moving_ctxt_init(&ctxt, c, NULL, &move_stats, writepoint_ptr(&c->copygc_write_point), @@ -354,13 +378,13 @@ static int bch2_copygc_thread(void *arg) cond_resched(); if (!c->opts.copygc_enabled) { - move_buckets_wait(&ctxt, buckets, true); + move_buckets_wait(&ctxt, &buckets, true); kthread_wait_freezable(c->opts.copygc_enabled || kthread_should_stop()); } if (unlikely(freezing(current))) { - move_buckets_wait(&ctxt, buckets, true); + move_buckets_wait(&ctxt, &buckets, true); __refrigerator(false); continue; } @@ -371,7 +395,7 @@ static int bch2_copygc_thread(void *arg) if (wait > clock->max_slop) { c->copygc_wait_at = last; c->copygc_wait = last + wait; - move_buckets_wait(&ctxt, buckets, true); + move_buckets_wait(&ctxt, &buckets, true); trace_and_count(c, copygc_wait, c, wait, last + wait); bch2_kthread_io_clock_wait(clock, last + wait, MAX_SCHEDULE_TIMEOUT); @@ -381,7 +405,7 @@ static int bch2_copygc_thread(void *arg) c->copygc_wait = 0; c->copygc_running = true; - ret = bch2_copygc(&ctxt, buckets, &did_work); + ret = bch2_copygc(&ctxt, &buckets, &did_work); c->copygc_running = false; wake_up(&c->copygc_running_wq); @@ -392,16 +416,14 @@ static int bch2_copygc_thread(void *arg) if (min_member_capacity == U64_MAX) min_member_capacity = 128 * 2048; - move_buckets_wait(&ctxt, buckets, true); + move_buckets_wait(&ctxt, &buckets, true); bch2_kthread_io_clock_wait(clock, last + (min_member_capacity >> 6), MAX_SCHEDULE_TIMEOUT); } } - move_buckets_wait(&ctxt, buckets, true); - - rhashtable_destroy(&buckets->table); - kfree(buckets); + move_buckets_wait(&ctxt, &buckets, true); + rhashtable_destroy(&buckets.table); bch2_moving_ctxt_exit(&ctxt); bch2_move_stats_exit(&move_stats, c); diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h index ea181fef5bc9..b9683d22bab0 100644 --- a/fs/bcachefs/movinggc.h +++ b/fs/bcachefs/movinggc.h @@ -2,9 +2,18 @@ #ifndef _BCACHEFS_MOVINGGC_H #define _BCACHEFS_MOVINGGC_H -unsigned long bch2_copygc_wait_amount(struct bch_fs *); +u64 bch2_copygc_wait_amount(struct bch_fs *); void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *); +static inline void bch2_copygc_wakeup(struct bch_fs *c) +{ + rcu_read_lock(); + struct task_struct *p = rcu_dereference(c->copygc_thread); + if (p) + wake_up_process(p); + rcu_read_unlock(); +} + void bch2_copygc_stop(struct bch_fs *); int bch2_copygc_start(struct bch_fs *); void bch2_fs_copygc_init(struct bch_fs *); diff --git a/fs/bcachefs/fs-common.c b/fs/bcachefs/namei.c index d70d9f634cea..a84b69d6caef 100644 --- a/fs/bcachefs/fs-common.c +++ b/fs/bcachefs/namei.c @@ -4,13 +4,21 @@ #include "acl.h" #include "btree_update.h" #include "dirent.h" -#include "fs-common.h" #include "inode.h" +#include "namei.h" #include "subvolume.h" #include "xattr.h" #include <linux/posix_acl.h> +static inline subvol_inum parent_inum(subvol_inum inum, struct bch_inode_unpacked *inode) +{ + return (subvol_inum) { + .subvol = inode->bi_parent_subvol ?: inum.subvol, + .inum = inode->bi_dir, + }; +} + static inline int is_subdir_for_nlink(struct bch_inode_unpacked *inode) { return S_ISDIR(inode->bi_mode) && !inode->bi_subvol; @@ -28,8 +36,8 @@ int bch2_create_trans(struct btree_trans *trans, unsigned flags) { struct bch_fs *c = trans->c; - struct btree_iter dir_iter = { NULL }; - struct btree_iter inode_iter = { NULL }; + struct btree_iter dir_iter = {}; + struct btree_iter 
inode_iter = {}; subvol_inum new_inum = dir; u64 now = bch2_current_time(c); u64 cpu = raw_smp_processor_id(); @@ -49,7 +57,7 @@ int bch2_create_trans(struct btree_trans *trans, if (!(flags & BCH_CREATE_SNAPSHOT)) { /* Normal create path - allocate a new inode: */ - bch2_inode_init_late(new_inode, now, uid, gid, mode, rdev, dir_u); + bch2_inode_init_late(c, new_inode, now, uid, gid, mode, rdev, dir_u); if (flags & BCH_CREATE_TMPFILE) new_inode->bi_flags |= BCH_INODE_unlinked; @@ -123,8 +131,8 @@ int bch2_create_trans(struct btree_trans *trans, if (ret) goto err; - bch2_btree_iter_set_snapshot(&dir_iter, dir_snapshot); - ret = bch2_btree_iter_traverse(&dir_iter); + bch2_btree_iter_set_snapshot(trans, &dir_iter, dir_snapshot); + ret = bch2_btree_iter_traverse(trans, &dir_iter); if (ret) goto err; } @@ -152,18 +160,14 @@ int bch2_create_trans(struct btree_trans *trans, if (is_subdir_for_nlink(new_inode)) dir_u->bi_nlink++; dir_u->bi_mtime = dir_u->bi_ctime = now; - dir_u->bi_size += dirent_occupied_size(name); - - ret = bch2_inode_write(trans, &dir_iter, dir_u); - if (ret) - goto err; - ret = bch2_dirent_create(trans, dir, &dir_hash, - dir_type, - name, - dir_target, - &dir_offset, - STR_HASH_must_create|BTREE_ITER_with_updates); + ret = bch2_dirent_create(trans, dir, &dir_hash, + dir_type, + name, + dir_target, + &dir_offset, + STR_HASH_must_create|BTREE_ITER_with_updates) ?: + bch2_inode_write(trans, &dir_iter, dir_u); if (ret) goto err; @@ -176,9 +180,9 @@ int bch2_create_trans(struct btree_trans *trans, new_inode->bi_depth = dir_u->bi_depth + 1; inode_iter.flags &= ~BTREE_ITER_all_snapshots; - bch2_btree_iter_set_snapshot(&inode_iter, snapshot); + bch2_btree_iter_set_snapshot(trans, &inode_iter, snapshot); - ret = bch2_btree_iter_traverse(&inode_iter) ?: + ret = bch2_btree_iter_traverse(trans, &inode_iter) ?: bch2_inode_write(trans, &inode_iter, new_inode); err: bch2_trans_iter_exit(trans, &inode_iter); @@ -192,8 +196,8 @@ int bch2_link_trans(struct btree_trans *trans, const struct qstr *name) { struct bch_fs *c = trans->c; - struct btree_iter dir_iter = { NULL }; - struct btree_iter inode_iter = { NULL }; + struct btree_iter dir_iter = {}; + struct btree_iter inode_iter = {}; struct bch_hash_info dir_hash; u64 now = bch2_current_time(c); u64 dir_offset = 0; @@ -221,13 +225,13 @@ int bch2_link_trans(struct btree_trans *trans, } dir_u->bi_mtime = dir_u->bi_ctime = now; - dir_u->bi_size += dirent_occupied_size(name); dir_hash = bch2_hash_info_init(c, dir_u); ret = bch2_dirent_create(trans, dir, &dir_hash, mode_to_type(inode_u->bi_mode), - name, inum.inum, &dir_offset, + name, inum.inum, + &dir_offset, STR_HASH_must_create); if (ret) goto err; @@ -251,9 +255,9 @@ int bch2_unlink_trans(struct btree_trans *trans, bool deleting_subvol) { struct bch_fs *c = trans->c; - struct btree_iter dir_iter = { NULL }; - struct btree_iter dirent_iter = { NULL }; - struct btree_iter inode_iter = { NULL }; + struct btree_iter dir_iter = {}; + struct btree_iter dirent_iter = {}; + struct btree_iter inode_iter = {}; struct bch_hash_info dir_hash; subvol_inum inum; u64 now = bch2_current_time(c); @@ -299,7 +303,7 @@ int bch2_unlink_trans(struct btree_trans *trans, if (ret) goto err; - k = bch2_btree_iter_peek_slot(&dirent_iter); + k = bch2_btree_iter_peek_slot(trans, &dirent_iter); ret = bkey_err(k); if (ret) goto err; @@ -308,8 +312,8 @@ int bch2_unlink_trans(struct btree_trans *trans, * If we're deleting a subvolume, we need to really delete the * dirent, not just emit a whiteout in the current snapshot: */ - 
bch2_btree_iter_set_snapshot(&dirent_iter, k.k->p.snapshot); - ret = bch2_btree_iter_traverse(&dirent_iter); + bch2_btree_iter_set_snapshot(trans, &dirent_iter, k.k->p.snapshot); + ret = bch2_btree_iter_traverse(trans, &dirent_iter); if (ret) goto err; } else { @@ -324,7 +328,6 @@ int bch2_unlink_trans(struct btree_trans *trans, dir_u->bi_mtime = dir_u->bi_ctime = inode_u->bi_ctime = now; dir_u->bi_nlink -= is_subdir_for_nlink(inode_u); - dir_u->bi_size -= dirent_occupied_size(name); ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, &dir_hash, &dirent_iter, @@ -346,6 +349,9 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *dst_u, bool ret = false; for (id = 0; id < Inode_opt_nr; id++) { + if (!S_ISDIR(dst_u->bi_mode) && id == Inode_opt_casefold) + continue; + /* Skip attributes that were explicitly set on this inode */ if (dst_u->bi_fields_set & (1 << id)) continue; @@ -389,10 +395,10 @@ int bch2_rename_trans(struct btree_trans *trans, enum bch_rename_mode mode) { struct bch_fs *c = trans->c; - struct btree_iter src_dir_iter = { NULL }; - struct btree_iter dst_dir_iter = { NULL }; - struct btree_iter src_inode_iter = { NULL }; - struct btree_iter dst_inode_iter = { NULL }; + struct btree_iter src_dir_iter = {}; + struct btree_iter dst_dir_iter = {}; + struct btree_iter src_inode_iter = {}; + struct btree_iter dst_inode_iter = {}; struct bch_hash_info src_hash, dst_hash; subvol_inum src_inum, dst_inum; u64 src_offset, dst_offset; @@ -406,8 +412,7 @@ int bch2_rename_trans(struct btree_trans *trans, src_hash = bch2_hash_info_init(c, src_dir_u); - if (dst_dir.inum != src_dir.inum || - dst_dir.subvol != src_dir.subvol) { + if (!subvol_inum_eq(dst_dir, src_dir)) { ret = bch2_inode_peek(trans, &dst_dir_iter, dst_dir_u, dst_dir, BTREE_ITER_intent); if (ret) @@ -420,8 +425,8 @@ int bch2_rename_trans(struct btree_trans *trans, } ret = bch2_dirent_rename(trans, - src_dir, &src_hash, - dst_dir, &dst_hash, + src_dir, &src_hash, &src_dir_u->bi_size, + dst_dir, &dst_hash, &dst_dir_u->bi_size, src_name, &src_inum, &src_offset, dst_name, &dst_inum, &dst_offset, mode); @@ -463,14 +468,6 @@ int bch2_rename_trans(struct btree_trans *trans, goto err; } - if (mode == BCH_RENAME) { - src_dir_u->bi_size -= dirent_occupied_size(src_name); - dst_dir_u->bi_size += dirent_occupied_size(dst_name); - } - - if (mode == BCH_RENAME_OVERWRITE) - src_dir_u->bi_size -= dirent_occupied_size(src_name); - if (src_inode_u->bi_parent_subvol) src_inode_u->bi_parent_subvol = dst_dir.subvol; @@ -507,32 +504,41 @@ int bch2_rename_trans(struct btree_trans *trans, } } - if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && - S_ISDIR(src_inode_u->bi_mode)) { - ret = -EXDEV; - goto err; - } + if (!subvol_inum_eq(dst_dir, src_dir)) { + if (bch2_reinherit_attrs(src_inode_u, dst_dir_u) && + S_ISDIR(src_inode_u->bi_mode)) { + ret = -EXDEV; + goto err; + } - if (mode == BCH_RENAME_EXCHANGE && - bch2_reinherit_attrs(dst_inode_u, src_dir_u) && - S_ISDIR(dst_inode_u->bi_mode)) { - ret = -EXDEV; - goto err; - } + if (mode == BCH_RENAME_EXCHANGE && + bch2_reinherit_attrs(dst_inode_u, src_dir_u) && + S_ISDIR(dst_inode_u->bi_mode)) { + ret = -EXDEV; + goto err; + } - if (is_subdir_for_nlink(src_inode_u)) { - src_dir_u->bi_nlink--; - dst_dir_u->bi_nlink++; - } + ret = bch2_maybe_propagate_has_case_insensitive(trans, src_inum, src_inode_u) ?: + (mode == BCH_RENAME_EXCHANGE + ? 
bch2_maybe_propagate_has_case_insensitive(trans, dst_inum, dst_inode_u) + : 0); + if (ret) + goto err; - if (S_ISDIR(src_inode_u->bi_mode) && - !src_inode_u->bi_subvol) - src_inode_u->bi_depth = dst_dir_u->bi_depth + 1; + if (is_subdir_for_nlink(src_inode_u)) { + src_dir_u->bi_nlink--; + dst_dir_u->bi_nlink++; + } - if (mode == BCH_RENAME_EXCHANGE && - S_ISDIR(dst_inode_u->bi_mode) && - !dst_inode_u->bi_subvol) - dst_inode_u->bi_depth = src_dir_u->bi_depth + 1; + if (S_ISDIR(src_inode_u->bi_mode) && + !src_inode_u->bi_subvol) + src_inode_u->bi_depth = dst_dir_u->bi_depth + 1; + + if (mode == BCH_RENAME_EXCHANGE && + S_ISDIR(dst_inode_u->bi_mode) && + !dst_inode_u->bi_subvol) + dst_inode_u->bi_depth = src_dir_u->bi_depth + 1; + } if (dst_inum.inum && is_subdir_for_nlink(dst_inode_u)) { dst_dir_u->bi_nlink--; @@ -571,6 +577,8 @@ err: return ret; } +/* inum_to_path */ + static inline void prt_bytes_reversed(struct printbuf *out, const void *b, unsigned n) { bch2_printbuf_make_room(out, n); @@ -601,31 +609,39 @@ static inline void reverse_bytes(void *b, size_t n) } } -/* XXX: we don't yet attempt to print paths when we don't know the subvol */ -int bch2_inum_to_path(struct btree_trans *trans, subvol_inum inum, struct printbuf *path) +static int __bch2_inum_to_path(struct btree_trans *trans, + u32 subvol, u64 inum, u32 snapshot, + struct printbuf *path) { unsigned orig_pos = path->pos; int ret = 0; - while (!(inum.subvol == BCACHEFS_ROOT_SUBVOL && - inum.inum == BCACHEFS_ROOT_INO)) { + while (true) { + if (!snapshot) { + ret = bch2_subvolume_get_snapshot(trans, subvol, &snapshot); + if (ret) + goto disconnected; + } + struct bch_inode_unpacked inode; - ret = bch2_inode_find_by_inum_trans(trans, inum, &inode); + ret = bch2_inode_find_by_inum_snapshot(trans, inum, snapshot, &inode, 0); if (ret) goto disconnected; + if (inode.bi_subvol == BCACHEFS_ROOT_SUBVOL && + inode.bi_inum == BCACHEFS_ROOT_INO) + break; + if (!inode.bi_dir && !inode.bi_dir_offset) { ret = -BCH_ERR_ENOENT_inode_no_backpointer; goto disconnected; } - inum.subvol = inode.bi_parent_subvol ?: inum.subvol; - inum.inum = inode.bi_dir; - - u32 snapshot; - ret = bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot); - if (ret) - goto disconnected; + inum = inode.bi_dir; + if (inode.bi_parent_subvol) { + subvol = inode.bi_parent_subvol; + snapshot = 0; + } struct btree_iter d_iter; struct bkey_s_c_dirent d = bch2_bkey_get_iter_typed(trans, &d_iter, @@ -661,3 +677,339 @@ disconnected: prt_str_reversed(path, "(disconnected)"); goto out; } + +int bch2_inum_to_path(struct btree_trans *trans, + subvol_inum inum, + struct printbuf *path) +{ + return __bch2_inum_to_path(trans, inum.subvol, inum.inum, 0, path); +} + +int bch2_inum_snapshot_to_path(struct btree_trans *trans, u64 inum, u32 snapshot, + snapshot_id_list *snapshot_overwrites, + struct printbuf *path) +{ + return __bch2_inum_to_path(trans, 0, inum, snapshot, path); +} + +/* fsck */ + +static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + bool in_fsck) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + struct btree_iter bp_iter = {}; + int ret = 0; + + if (inode_points_to_dirent(target, d)) + return 0; + + if (!target->bi_dir && + !target->bi_dir_offset) { + fsck_err_on(S_ISDIR(target->bi_mode), + trans, inode_dir_missing_backpointer, + "directory with missing backpointer\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), + prt_printf(&buf, "\n"), + 
bch2_inode_unpacked_to_text(&buf, target), + buf.buf)); + + fsck_err_on(target->bi_flags & BCH_INODE_unlinked, + trans, inode_unlinked_but_has_dirent, + "inode unlinked but has dirent\n%s", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), + prt_printf(&buf, "\n"), + bch2_inode_unpacked_to_text(&buf, target), + buf.buf)); + + target->bi_flags &= ~BCH_INODE_unlinked; + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + return __bch2_fsck_write_inode(trans, target); + } + + if (bch2_inode_should_have_single_bp(target) && + !fsck_err(trans, inode_wrong_backpointer, + "dirent points to inode that does not point back:\n%s", + (bch2_bkey_val_to_text(&buf, c, d.s_c), + prt_newline(&buf), + bch2_inode_unpacked_to_text(&buf, target), + buf.buf))) + goto err; + + struct bkey_s_c_dirent bp_dirent = + bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents, + SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot), + 0, dirent); + ret = bkey_err(bp_dirent); + if (ret && !bch2_err_matches(ret, ENOENT)) + goto err; + + bool backpointer_exists = !ret; + ret = 0; + + if (!backpointer_exists) { + if (fsck_err(trans, inode_wrong_backpointer, + "inode %llu:%u has wrong backpointer:\n" + "got %llu:%llu\n" + "should be %llu:%llu", + target->bi_inum, target->bi_snapshot, + target->bi_dir, + target->bi_dir_offset, + d.k->p.inode, + d.k->p.offset)) { + target->bi_dir = d.k->p.inode; + target->bi_dir_offset = d.k->p.offset; + ret = __bch2_fsck_write_inode(trans, target); + } + } else { + bch2_bkey_val_to_text(&buf, c, d.s_c); + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); + + if (S_ISDIR(target->bi_mode) || target->bi_subvol) { + /* + * XXX: verify connectivity of the other dirent + * up to the root before removing this one + * + * Additionally, bch2_lookup would need to cope with the + * dirent it found being removed - or should we remove + * the other one, even though the inode points to it? + */ + if (in_fsck) { + if (fsck_err(trans, inode_dir_multiple_links, + "%s %llu:%u with multiple links\n%s", + S_ISDIR(target->bi_mode) ? "directory" : "subvolume", + target->bi_inum, target->bi_snapshot, buf.buf)) + ret = bch2_fsck_remove_dirent(trans, d.k->p); + } else { + bch2_fs_inconsistent(c, + "%s %llu:%u with multiple links\n%s", + S_ISDIR(target->bi_mode) ? 
"directory" : "subvolume", + target->bi_inum, target->bi_snapshot, buf.buf); + } + + goto out; + } else { + /* + * hardlinked file with nlink 0: + * We're just adjusting nlink here so check_nlinks() will pick + * it up, it ignores inodes with nlink 0 + */ + if (fsck_err_on(!target->bi_nlink, + trans, inode_multiple_links_but_nlink_0, + "inode %llu:%u type %s has multiple links but i_nlink 0\n%s", + target->bi_inum, target->bi_snapshot, bch2_d_types[d.v->d_type], buf.buf)) { + target->bi_nlink++; + target->bi_flags &= ~BCH_INODE_unlinked; + ret = __bch2_fsck_write_inode(trans, target); + if (ret) + goto err; + } + } + } +out: +err: +fsck_err: + bch2_trans_iter_exit(trans, &bp_iter); + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} + +int __bch2_check_dirent_target(struct btree_trans *trans, + struct btree_iter *dirent_iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + bool in_fsck) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + int ret = 0; + + ret = bch2_check_dirent_inode_dirent(trans, d, target, in_fsck); + if (ret) + goto err; + + if (fsck_err_on(d.v->d_type != inode_d_type(target), + trans, dirent_d_type_wrong, + "incorrect d_type: got %s, should be %s:\n%s", + bch2_d_type_str(d.v->d_type), + bch2_d_type_str(inode_d_type(target)), + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { + struct bkey_i_dirent *n = bch2_trans_kmalloc(trans, bkey_bytes(d.k)); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto err; + + bkey_reassemble(&n->k_i, d.s_c); + n->v.d_type = inode_d_type(target); + if (n->v.d_type == DT_SUBVOL) { + n->v.d_parent_subvol = cpu_to_le32(target->bi_parent_subvol); + n->v.d_child_subvol = cpu_to_le32(target->bi_subvol); + } else { + n->v.d_inum = cpu_to_le64(target->bi_inum); + } + + ret = bch2_trans_update(trans, dirent_iter, &n->k_i, 0); + if (ret) + goto err; + } +err: +fsck_err: + printbuf_exit(&buf); + bch_err_fn(c, ret); + return ret; +} + +/* + * BCH_INODE_has_case_insensitive: + * We have to track whether directories have any descendent directory that is + * casefolded - for overlayfs: + */ + +static int bch2_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum) +{ + struct btree_iter iter = {}; + int ret = 0; + + while (true) { + struct bch_inode_unpacked inode; + ret = bch2_inode_peek(trans, &iter, &inode, inum, + BTREE_ITER_intent|BTREE_ITER_with_updates); + if (ret) + break; + + if (inode.bi_flags & BCH_INODE_has_case_insensitive) + break; + + inode.bi_flags |= BCH_INODE_has_case_insensitive; + ret = bch2_inode_write(trans, &iter, &inode); + if (ret) + break; + + bch2_trans_iter_exit(trans, &iter); + if (subvol_inum_eq(inum, BCACHEFS_ROOT_SUBVOL_INUM)) + break; + + inum = parent_inum(inum, &inode); + } + + bch2_trans_iter_exit(trans, &iter); + return ret; +} + +int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *trans, subvol_inum inum, + struct bch_inode_unpacked *inode) +{ + if (!bch2_inode_casefold(trans->c, inode)) + return 0; + + inode->bi_flags |= BCH_INODE_has_case_insensitive; + + return bch2_propagate_has_case_insensitive(trans, parent_inum(inum, inode)); +} + +int bch2_check_inode_has_case_insensitive(struct btree_trans *trans, + struct bch_inode_unpacked *inode, + snapshot_id_list *snapshot_overwrites, + bool *do_update) +{ + struct printbuf buf = PRINTBUF; + bool repairing_parents = false; + int ret = 0; + + if (!S_ISDIR(inode->bi_mode)) { + /* + * Old versions set bi_casefold for non dirs, but that's + * unnecessary and wasteful + */ + 
if (inode->bi_casefold) { + inode->bi_casefold = 0; + *do_update = true; + } + return 0; + } + + if (trans->c->sb.version < bcachefs_metadata_version_inode_has_case_insensitive) + return 0; + + if (bch2_inode_casefold(trans->c, inode) && + !(inode->bi_flags & BCH_INODE_has_case_insensitive)) { + prt_printf(&buf, "casefolded dir with has_case_insensitive not set\ninum %llu:%u ", + inode->bi_inum, inode->bi_snapshot); + + ret = bch2_inum_snapshot_to_path(trans, inode->bi_inum, inode->bi_snapshot, + snapshot_overwrites, &buf); + if (ret) + goto err; + + if (fsck_err(trans, inode_has_case_insensitive_not_set, "%s", buf.buf)) { + inode->bi_flags |= BCH_INODE_has_case_insensitive; + *do_update = true; + } + } + + if (!(inode->bi_flags & BCH_INODE_has_case_insensitive)) + goto out; + + struct bch_inode_unpacked dir = *inode; + u32 snapshot = dir.bi_snapshot; + + while (!(dir.bi_inum == BCACHEFS_ROOT_INO && + dir.bi_subvol == BCACHEFS_ROOT_SUBVOL)) { + if (dir.bi_parent_subvol) { + ret = bch2_subvolume_get_snapshot(trans, dir.bi_parent_subvol, &snapshot); + if (ret) + goto err; + + snapshot_overwrites = NULL; + } + + ret = bch2_inode_find_by_inum_snapshot(trans, dir.bi_dir, snapshot, &dir, 0); + if (ret) + goto err; + + if (!(dir.bi_flags & BCH_INODE_has_case_insensitive)) { + prt_printf(&buf, "parent of casefolded dir with has_case_insensitive not set\n"); + + ret = bch2_inum_snapshot_to_path(trans, dir.bi_inum, dir.bi_snapshot, + snapshot_overwrites, &buf); + if (ret) + goto err; + + if (fsck_err(trans, inode_parent_has_case_insensitive_not_set, "%s", buf.buf)) { + dir.bi_flags |= BCH_INODE_has_case_insensitive; + ret = __bch2_fsck_write_inode(trans, &dir); + if (ret) + goto err; + } + } + + /* + * We only need to check the first parent, unless we find an + * inconsistency + */ + if (!repairing_parents) + break; + } +out: +err: +fsck_err: + printbuf_exit(&buf); + if (ret) + return ret; + + if (repairing_parents) { + return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; + } + + return 0; +} diff --git a/fs/bcachefs/fs-common.h b/fs/bcachefs/namei.h index 2b59210bb5e8..ae6ebc2d0785 100644 --- a/fs/bcachefs/fs-common.h +++ b/fs/bcachefs/namei.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _BCACHEFS_FS_COMMON_H -#define _BCACHEFS_FS_COMMON_H +#ifndef _BCACHEFS_NAMEI_H +#define _BCACHEFS_NAMEI_H #include "dirent.h" @@ -43,5 +43,37 @@ bool bch2_reinherit_attrs(struct bch_inode_unpacked *, struct bch_inode_unpacked *); int bch2_inum_to_path(struct btree_trans *, subvol_inum, struct printbuf *); +int bch2_inum_snapshot_to_path(struct btree_trans *, u64, u32, + snapshot_id_list *, struct printbuf *); -#endif /* _BCACHEFS_FS_COMMON_H */ +int __bch2_check_dirent_target(struct btree_trans *, + struct btree_iter *, + struct bkey_s_c_dirent, + struct bch_inode_unpacked *, bool); + +static inline bool inode_points_to_dirent(struct bch_inode_unpacked *inode, + struct bkey_s_c_dirent d) +{ + return inode->bi_dir == d.k->p.inode && + inode->bi_dir_offset == d.k->p.offset; +} + +static inline int bch2_check_dirent_target(struct btree_trans *trans, + struct btree_iter *dirent_iter, + struct bkey_s_c_dirent d, + struct bch_inode_unpacked *target, + bool in_fsck) +{ + if (likely(inode_points_to_dirent(target, d) && + d.v->d_type == inode_d_type(target))) + return 0; + + return __bch2_check_dirent_target(trans, dirent_iter, d, target, in_fsck); +} + +int bch2_maybe_propagate_has_case_insensitive(struct btree_trans *, subvol_inum, + struct 
bch_inode_unpacked *); +int bch2_check_inode_has_case_insensitive(struct btree_trans *, struct bch_inode_unpacked *, + snapshot_id_list *, bool *); + +#endif /* _BCACHEFS_NAMEI_H */ diff --git a/fs/bcachefs/nocow_locking.c b/fs/bcachefs/nocow_locking.c index 3c21981a4a1c..962218fa68ec 100644 --- a/fs/bcachefs/nocow_locking.c +++ b/fs/bcachefs/nocow_locking.c @@ -133,12 +133,10 @@ void bch2_fs_nocow_locking_exit(struct bch_fs *c) BUG_ON(atomic_read(&l->l[j])); } -int bch2_fs_nocow_locking_init(struct bch_fs *c) +void bch2_fs_nocow_locking_init_early(struct bch_fs *c) { struct bucket_nocow_lock_table *t = &c->nocow_locks; for (struct nocow_lock_bucket *l = t->l; l < t->l + ARRAY_SIZE(t->l); l++) spin_lock_init(&l->lock); - - return 0; } diff --git a/fs/bcachefs/nocow_locking.h b/fs/bcachefs/nocow_locking.h index f9d6a426a960..48b8a003c0d2 100644 --- a/fs/bcachefs/nocow_locking.h +++ b/fs/bcachefs/nocow_locking.h @@ -45,6 +45,6 @@ static inline bool bch2_bucket_nocow_trylock(struct bucket_nocow_lock_table *t, void bch2_nocow_locks_to_text(struct printbuf *, struct bucket_nocow_lock_table *); void bch2_fs_nocow_locking_exit(struct bch_fs *); -int bch2_fs_nocow_locking_init(struct bch_fs *); +void bch2_fs_nocow_locking_init_early(struct bch_fs *); #endif /* _BCACHEFS_NOCOW_LOCKING_H */ diff --git a/fs/bcachefs/opts.c b/fs/bcachefs/opts.c index 6772faf385a5..b1cf88905b81 100644 --- a/fs/bcachefs/opts.c +++ b/fs/bcachefs/opts.c @@ -7,7 +7,9 @@ #include "compress.h" #include "disk_groups.h" #include "error.h" +#include "movinggc.h" #include "opts.h" +#include "rebalance.h" #include "recovery_passes.h" #include "super-io.h" #include "util.h" @@ -19,6 +21,11 @@ const char * const bch2_error_actions[] = { NULL }; +const char * const bch2_degraded_actions[] = { + BCH_DEGRADED_ACTIONS() + NULL +}; + const char * const bch2_fsck_fix_opts[] = { BCH_FIX_ERRORS_OPTS() NULL @@ -44,7 +51,7 @@ const char * const __bch2_btree_ids[] = { NULL }; -static const char * const __bch2_csum_types[] = { +const char * const __bch2_csum_types[] = { BCH_CSUM_TYPES() NULL }; @@ -163,16 +170,6 @@ const char * const bch2_d_types[BCH_DT_MAX] = { [DT_SUBVOL] = "subvol", }; -u64 BCH2_NO_SB_OPT(const struct bch_sb *sb) -{ - BUG(); -} - -void SET_BCH2_NO_SB_OPT(struct bch_sb *sb, u64 v) -{ - BUG(); -} - void bch2_opts_apply(struct bch_opts *dst, struct bch_opts src) { #define x(_name, ...) 
\ @@ -223,6 +220,21 @@ void bch2_opt_set_by_id(struct bch_opts *opts, enum bch_opt_id id, u64 v) } } +/* dummy option, for options that aren't stored in the superblock */ +typedef u64 (*sb_opt_get_fn)(const struct bch_sb *); +typedef void (*sb_opt_set_fn)(struct bch_sb *, u64); +typedef u64 (*member_opt_get_fn)(const struct bch_member *); +typedef void (*member_opt_set_fn)(struct bch_member *, u64); + +__maybe_unused static const sb_opt_get_fn BCH2_NO_SB_OPT = NULL; +__maybe_unused static const sb_opt_set_fn SET_BCH2_NO_SB_OPT = NULL; +__maybe_unused static const member_opt_get_fn BCH2_NO_MEMBER_OPT = NULL; +__maybe_unused static const member_opt_set_fn SET_BCH2_NO_MEMBER_OPT = NULL; + +#define type_compatible_or_null(_p, _type) \ + __builtin_choose_expr( \ + __builtin_types_compatible_p(typeof(_p), typeof(_type)), _p, NULL) + const struct bch_option bch2_opt_table[] = { #define OPT_BOOL() .type = BCH_OPT_BOOL, .min = 0, .max = 2 #define OPT_UINT(_min, _max) .type = BCH_OPT_UINT, \ @@ -239,15 +251,15 @@ const struct bch_option bch2_opt_table[] = { #define x(_name, _bits, _flags, _type, _sb_opt, _default, _hint, _help) \ [Opt_##_name] = { \ - .attr = { \ - .name = #_name, \ - .mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \ - }, \ - .flags = _flags, \ - .hint = _hint, \ - .help = _help, \ - .get_sb = _sb_opt, \ - .set_sb = SET_##_sb_opt, \ + .attr.name = #_name, \ + .attr.mode = (_flags) & OPT_RUNTIME ? 0644 : 0444, \ + .flags = _flags, \ + .hint = _hint, \ + .help = _help, \ + .get_sb = type_compatible_or_null(_sb_opt, *BCH2_NO_SB_OPT), \ + .set_sb = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_SB_OPT), \ + .get_member = type_compatible_or_null(_sb_opt, *BCH2_NO_MEMBER_OPT), \ + .set_member = type_compatible_or_null(SET_##_sb_opt,*SET_BCH2_NO_MEMBER_OPT),\ _type \ }, @@ -268,20 +280,20 @@ int bch2_opt_lookup(const char *name) return -1; } -struct synonym { +struct opt_synonym { const char *s1, *s2; }; -static const struct synonym bch_opt_synonyms[] = { +static const struct opt_synonym bch2_opt_synonyms[] = { { "quota", "usrquota" }, }; static int bch2_mount_opt_lookup(const char *name) { - const struct synonym *i; + const struct opt_synonym *i; - for (i = bch_opt_synonyms; - i < bch_opt_synonyms + ARRAY_SIZE(bch_opt_synonyms); + for (i = bch2_opt_synonyms; + i < bch2_opt_synonyms + ARRAY_SIZE(bch2_opt_synonyms); i++) if (!strcmp(name, i->s1)) name = i->s2; @@ -289,6 +301,30 @@ static int bch2_mount_opt_lookup(const char *name) return bch2_opt_lookup(name); } +struct opt_val_synonym { + const char *opt, *v1, *v2; +}; + +static const struct opt_val_synonym bch2_opt_val_synonyms[] = { + { "degraded", "true", "yes" }, + { "degraded", "false", "no" }, + { "degraded", "1", "yes" }, + { "degraded", "0", "no" }, +}; + +static const char *bch2_opt_val_synonym_lookup(const char *opt, const char *val) +{ + const struct opt_val_synonym *i; + + for (i = bch2_opt_val_synonyms; + i < bch2_opt_val_synonyms + ARRAY_SIZE(bch2_opt_val_synonyms); + i++) + if (!strcmp(opt, i->opt) && !strcmp(val, i->v1)) + return i->v2; + + return val; +} + int bch2_opt_validate(const struct bch_option *opt, u64 v, struct printbuf *err) { if (v < opt->min) { @@ -332,21 +368,22 @@ int bch2_opt_parse(struct bch_fs *c, { ssize_t ret; + if (err) + printbuf_indent_add_nextline(err, 2); + switch (opt->type) { case BCH_OPT_BOOL: - if (val) { - ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool); - if (ret != -BCH_ERR_option_not_bool) { - *res = ret; - } else { - if (err) - prt_printf(err, "%s: must be bool", 
opt->attr.name); - return ret; - } + if (!val) + val = "1"; + + ret = lookup_constant(bool_names, val, -BCH_ERR_option_not_bool); + if (ret != -BCH_ERR_option_not_bool) { + *res = ret; } else { - *res = 1; + if (err) + prt_printf(err, "%s: must be bool", opt->attr.name); + return ret; } - break; case BCH_OPT_UINT: if (!val) { @@ -355,9 +392,15 @@ int bch2_opt_parse(struct bch_fs *c, return -EINVAL; } - ret = opt->flags & OPT_HUMAN_READABLE - ? bch2_strtou64_h(val, res) - : kstrtou64(val, 10, res); + if (*val != '-') { + ret = opt->flags & OPT_HUMAN_READABLE + ? bch2_strtou64_h(val, res) + : kstrtou64(val, 10, res); + } else { + prt_printf(err, "%s: must be a non-negative number", opt->attr.name); + return -BCH_ERR_option_negative; + } + if (ret < 0) { if (err) prt_printf(err, "%s: must be a number", @@ -475,11 +518,16 @@ void bch2_opts_to_text(struct printbuf *out, } } -int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) +int bch2_opt_hook_pre_set(struct bch_fs *c, struct bch_dev *ca, enum bch_opt_id id, u64 v) { int ret = 0; switch (id) { + case Opt_state: + if (ca) + return bch2_dev_set_state(c, ca, v, BCH_FORCE_IF_DEGRADED); + break; + case Opt_compression: case Opt_background_compression: ret = bch2_check_set_has_compressed_data(c, v); @@ -488,19 +536,17 @@ int bch2_opt_check_may_set(struct bch_fs *c, int id, u64 v) if (v) bch2_check_set_feature(c, BCH_FEATURE_ec); break; + default: + break; } return ret; } -int bch2_opts_check_may_set(struct bch_fs *c) +int bch2_opts_hooks_pre_set(struct bch_fs *c) { - unsigned i; - int ret; - - for (i = 0; i < bch2_opts_nr; i++) { - ret = bch2_opt_check_may_set(c, i, - bch2_opt_get_by_id(&c->opts, i)); + for (unsigned i = 0; i < bch2_opts_nr; i++) { + int ret = bch2_opt_hook_pre_set(c, NULL, i, bch2_opt_get_by_id(&c->opts, i)); if (ret) return ret; } @@ -508,6 +554,61 @@ int bch2_opts_check_may_set(struct bch_fs *c) return 0; } +void bch2_opt_hook_post_set(struct bch_fs *c, struct bch_dev *ca, u64 inum, + struct bch_opts *new_opts, enum bch_opt_id id) +{ + switch (id) { + case Opt_foreground_target: + if (new_opts->foreground_target && + !new_opts->background_target) + bch2_set_rebalance_needs_scan(c, inum); + break; + case Opt_compression: + if (new_opts->compression && + !new_opts->background_compression) + bch2_set_rebalance_needs_scan(c, inum); + break; + case Opt_background_target: + if (new_opts->background_target) + bch2_set_rebalance_needs_scan(c, inum); + break; + case Opt_background_compression: + if (new_opts->background_compression) + bch2_set_rebalance_needs_scan(c, inum); + break; + case Opt_rebalance_enabled: + bch2_rebalance_wakeup(c); + break; + case Opt_copygc_enabled: + bch2_copygc_wakeup(c); + break; + case Opt_discard: + if (!ca) { + mutex_lock(&c->sb_lock); + for_each_member_device(c, ca) { + struct bch_member *m = + bch2_members_v2_get_mut(ca->disk_sb.sb, ca->dev_idx); + SET_BCH_MEMBER_DISCARD(m, c->opts.discard); + } + + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + } + break; + case Opt_version_upgrade: + /* + * XXX: in the future we'll likely want to do compatible + * upgrades at runtime as well, but right now there's nothing + * that does that: + */ + if (new_opts->version_upgrade == BCH_VERSION_UPGRADE_incompatible) + bch2_sb_upgrade_incompat(c); + break; + default: + break; + } +} + int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, struct printbuf *parse_later, const char *name, const char *val) @@ -530,6 +631,12 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, if 
(id < 0) return 0; + /* must have a value for synonym lookup - but OPT_FN is weird */ + if (!val && bch2_opt_table[id].type != BCH_OPT_FN) + val = "1"; + + val = bch2_opt_val_synonym_lookup(name, val); + if (!(bch2_opt_table[id].flags & OPT_MOUNT)) goto bad_opt; @@ -543,14 +650,15 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, goto bad_opt; ret = bch2_opt_parse(c, &bch2_opt_table[id], val, &v, &err); - if (ret == -BCH_ERR_option_needs_open_fs && parse_later) { - prt_printf(parse_later, "%s=%s,", name, val); - if (parse_later->allocation_failure) { - ret = -ENOMEM; - goto out; + if (ret == -BCH_ERR_option_needs_open_fs) { + ret = 0; + + if (parse_later) { + prt_printf(parse_later, "%s=%s,", name, val); + if (parse_later->allocation_failure) + ret = -ENOMEM; } - ret = 0; goto out; } @@ -561,28 +669,24 @@ int bch2_parse_one_mount_opt(struct bch_fs *c, struct bch_opts *opts, bch2_opt_set_by_id(opts, id, v); ret = 0; - goto out; - +out: + printbuf_exit(&err); + return ret; bad_opt: - pr_err("Bad mount option %s", name); ret = -BCH_ERR_option_name; goto out; - bad_val: - pr_err("Invalid mount option %s", err.buf); ret = -BCH_ERR_option_value; - -out: - printbuf_exit(&err); - return ret; + goto out; } int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, - struct printbuf *parse_later, char *options) + struct printbuf *parse_later, char *options, + bool ignore_unknown) { char *copied_opts, *copied_opts_start; char *opt, *name, *val; - int ret; + int ret = 0; if (!options) return 0; @@ -607,24 +711,37 @@ int bch2_parse_mount_opts(struct bch_fs *c, struct bch_opts *opts, val = opt; ret = bch2_parse_one_mount_opt(c, opts, parse_later, name, val); - if (ret < 0) - goto out; + if (ret == -BCH_ERR_option_name && ignore_unknown) + ret = 0; + if (ret) { + pr_err("Error parsing option %s: %s", name, bch2_err_str(ret)); + break; + } } - ret = 0; - goto out; - -out: kfree(copied_opts_start); return ret; } -u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id) +u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id, int dev_idx) { const struct bch_option *opt = bch2_opt_table + id; u64 v; - v = opt->get_sb(sb); + if (dev_idx < 0) { + v = opt->get_sb(sb); + } else { + if (WARN(!bch2_member_exists(sb, dev_idx), + "tried to set device option %s on nonexistent device %i", + opt->attr.name, dev_idx)) + return 0; + + struct bch_member m = bch2_sb_member_get(sb, dev_idx); + v = opt->get_member(&m); + } + + if (opt->flags & OPT_SB_FIELD_ONE_BIAS) + --v; if (opt->flags & OPT_SB_FIELD_ILOG2) v = 1ULL << v; @@ -641,34 +758,20 @@ u64 bch2_opt_from_sb(struct bch_sb *sb, enum bch_opt_id id) */ int bch2_opts_from_sb(struct bch_opts *opts, struct bch_sb *sb) { - unsigned id; - - for (id = 0; id < bch2_opts_nr; id++) { + for (unsigned id = 0; id < bch2_opts_nr; id++) { const struct bch_option *opt = bch2_opt_table + id; - if (opt->get_sb == BCH2_NO_SB_OPT) - continue; - - bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id)); + if (opt->get_sb) + bch2_opt_set_by_id(opts, id, bch2_opt_from_sb(sb, id, -1)); } return 0; } -struct bch_dev_sb_opt_set { - void (*set_sb)(struct bch_member *, u64); -}; - -static const struct bch_dev_sb_opt_set bch2_dev_sb_opt_setters [] = { -#define x(n, set) [Opt_##n] = { .set_sb = SET_##set }, - BCH_DEV_OPT_SETTERS() -#undef x -}; - -void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, +bool __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, const struct bch_option *opt, u64 v) { - enum bch_opt_id id = opt - bch2_opt_table; + bool changed = false; 
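+	/*
+	 * Superblock fields don't store the user-visible encoding directly:
+	 * OPT_SB_FIELD_SECTORS values are kept in 512-byte sectors and
+	 * OPT_SB_FIELD_ONE_BIAS values are stored with a +1 bias, so convert
+	 * before comparing against what's currently stored:
+	 */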
if (opt->flags & OPT_SB_FIELD_SECTORS) v >>= 9; @@ -679,34 +782,35 @@ void __bch2_opt_set_sb(struct bch_sb *sb, int dev_idx, if (opt->flags & OPT_SB_FIELD_ONE_BIAS) v++; - if (opt->flags & OPT_FS) { - if (opt->set_sb != SET_BCH2_NO_SB_OPT) - opt->set_sb(sb, v); + if ((opt->flags & OPT_FS) && opt->set_sb && dev_idx < 0) { + changed = v != opt->get_sb(sb); + + opt->set_sb(sb, v); } - if ((opt->flags & OPT_DEVICE) && dev_idx >= 0) { + if ((opt->flags & OPT_DEVICE) && opt->set_member && dev_idx >= 0) { if (WARN(!bch2_member_exists(sb, dev_idx), "tried to set device option %s on nonexistent device %i", opt->attr.name, dev_idx)) - return; + return false; struct bch_member *m = bch2_members_v2_get_mut(sb, dev_idx); - - const struct bch_dev_sb_opt_set *set = bch2_dev_sb_opt_setters + id; - if (set->set_sb) - set->set_sb(m, v); - else - pr_err("option %s cannot be set via opt_set_sb()", opt->attr.name); + changed = v != opt->get_member(m); + opt->set_member(m, v); } + + return changed; } -void bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, +bool bch2_opt_set_sb(struct bch_fs *c, struct bch_dev *ca, const struct bch_option *opt, u64 v) { mutex_lock(&c->sb_lock); - __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v); - bch2_write_super(c); + bool changed = __bch2_opt_set_sb(c->disk_sb.sb, ca ? ca->dev_idx : -1, opt, v); + if (changed) + bch2_write_super(c); mutex_unlock(&c->sb_lock); + return changed; } /* io opts: */ diff --git a/fs/bcachefs/opts.h b/fs/bcachefs/opts.h index 9d397fc2a1f0..2a02606254b3 100644 --- a/fs/bcachefs/opts.h +++ b/fs/bcachefs/opts.h @@ -11,11 +11,13 @@ struct bch_fs; extern const char * const bch2_error_actions[]; +extern const char * const bch2_degraded_actions[]; extern const char * const bch2_fsck_fix_opts[]; extern const char * const bch2_version_upgrade_opts[]; extern const char * const bch2_sb_features[]; extern const char * const bch2_sb_compat[]; extern const char * const __bch2_btree_ids[]; +extern const char * const __bch2_csum_types[]; extern const char * const __bch2_csum_opts[]; extern const char * const __bch2_compression_types[]; extern const char * const bch2_compression_opts[]; @@ -50,10 +52,6 @@ static inline const char *bch2_d_type_str(unsigned d_type) * apply the options from that struct that are defined. 
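+ *
+ * Roughly, the generated apply loop looks like this (a sketch, not a
+ * literal quote of the macro expansion):
+ *
+ *	if (src.foreground_target_defined)
+ *		opt_set(*dst, foreground_target, src.foreground_target);
+ *
+ * so options parsed at mount time override superblock defaults field by
+ * field.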
*/ -/* dummy option, for options that aren't stored in the superblock */ -u64 BCH2_NO_SB_OPT(const struct bch_sb *); -void SET_BCH2_NO_SB_OPT(struct bch_sb *, u64); - /* When can be set: */ enum opt_flags { OPT_FS = BIT(0), /* Filesystem option */ @@ -132,19 +130,24 @@ enum fsck_err_opts { OPT_FS|OPT_FORMAT| \ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ OPT_UINT(512, 1U << 16), \ - BCH_SB_BLOCK_SIZE, 8, \ + BCH_SB_BLOCK_SIZE, 4 << 10, \ "size", NULL) \ x(btree_node_size, u32, \ OPT_FS|OPT_FORMAT| \ OPT_HUMAN_READABLE|OPT_MUST_BE_POW_2|OPT_SB_FIELD_SECTORS, \ OPT_UINT(512, 1U << 20), \ - BCH_SB_BTREE_NODE_SIZE, 512, \ + BCH_SB_BTREE_NODE_SIZE, 256 << 10, \ "size", "Btree node size, default 256k") \ x(errors, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_error_actions), \ BCH_SB_ERROR_ACTION, BCH_ON_ERROR_fix_safe, \ NULL, "Action to take on filesystem error") \ + x(write_error_timeout, u16, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(1, 300), \ + BCH_SB_WRITE_ERROR_TIMEOUT, 30, \ + NULL, "Number of consecutive write errors allowed before kicking out a device")\ x(metadata_replicas, u8, \ OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_UINT(1, BCH_REPLICAS_MAX), \ @@ -181,6 +184,11 @@ enum fsck_err_opts { OPT_STR(__bch2_csum_opts), \ BCH_SB_DATA_CSUM_TYPE, BCH_CSUM_OPT_crc32c, \ NULL, NULL) \ + x(checksum_err_retry_nr, u8, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_UINT(0, 32), \ + BCH_SB_CSUM_ERR_RETRY_NR, 3, \ + NULL, NULL) \ x(compression, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_compression), \ @@ -197,7 +205,7 @@ enum fsck_err_opts { BCH_SB_STR_HASH_TYPE, BCH_STR_HASH_OPT_siphash, \ NULL, "Hash function for directory entries and xattrs")\ x(metadata_target, u16, \ - OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ + OPT_FS|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_FN(bch2_opt_target), \ BCH_SB_METADATA_TARGET, 0, \ "(target)", "Device or label for metadata writes") \ @@ -221,6 +229,11 @@ enum fsck_err_opts { OPT_BOOL(), \ BCH_SB_ERASURE_CODE, false, \ NULL, "Enable erasure coding (DO NOT USE YET)") \ + x(casefold, u8, \ + OPT_FS|OPT_INODE|OPT_FORMAT, \ + OPT_BOOL(), \ + BCH_SB_CASEFOLD, false, \ + NULL, "Dirent lookups are casefolded") \ x(inodes_32bit, u8, \ OPT_FS|OPT_INODE|OPT_FORMAT|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ @@ -295,24 +308,14 @@ enum fsck_err_opts { NULL, "Enable project quotas") \ x(degraded, u8, \ OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ + OPT_STR(bch2_degraded_actions), \ + BCH_SB_DEGRADED_ACTION, BCH_DEGRADED_ask, \ NULL, "Allow mounting in degraded mode") \ - x(very_degraded, u8, \ - OPT_FS|OPT_MOUNT, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, false, \ - NULL, "Allow mounting in when data will be missing") \ x(no_splitbrain_check, u8, \ OPT_FS|OPT_MOUNT, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Don't kick drives out when splitbrain detected")\ - x(discard, u8, \ - OPT_FS|OPT_MOUNT|OPT_DEVICE, \ - OPT_BOOL(), \ - BCH2_NO_SB_OPT, true, \ - NULL, "Enable discard/TRIM support") \ x(verbose, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ @@ -447,7 +450,7 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, false, \ NULL, "Reconstruct alloc btree") \ x(version_upgrade, u8, \ - OPT_FS|OPT_MOUNT, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_STR(bch2_version_upgrade_opts), \ BCH_SB_VERSION_UPGRADE, BCH_VERSION_UPGRADE_compatible, \ NULL, "Set superblock to latest version,\n" \ @@ -487,45 +490,56 @@ enum fsck_err_opts { BCH2_NO_SB_OPT, true, \ NULL, "Enable rebalance: 
disable for debugging, or to\n"\ "quiet the system when doing performance testing\n")\ + x(rebalance_on_ac_only, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_SB_REBALANCE_AC_ONLY, false, \ + NULL, "Enable rebalance while on mains power only\n") \ + x(auto_snapshot_deletion, u8, \ + OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH2_NO_SB_OPT, true, \ + NULL, "Enable automatic snapshot deletion: disable for debugging, or to\n"\ + "quiet the system when doing performance testing\n")\ x(no_data_io, u8, \ OPT_MOUNT, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, false, \ NULL, "Skip submit_bio() for data reads and writes, " \ "for performance testing purposes") \ - x(fs_size, u64, \ - OPT_DEVICE, \ + x(state, u64, \ + OPT_DEVICE|OPT_RUNTIME, \ + OPT_STR(bch2_member_states), \ + BCH_MEMBER_STATE, BCH_MEMBER_STATE_rw, \ + "state", "rw,ro,failed,spare") \ + x(bucket_size, u32, \ + OPT_DEVICE|OPT_HUMAN_READABLE|OPT_SB_FIELD_SECTORS, \ OPT_UINT(0, S64_MAX), \ - BCH2_NO_SB_OPT, 0, \ - "size", "Size of filesystem on device") \ - x(bucket, u32, \ - OPT_DEVICE, \ - OPT_UINT(0, S64_MAX), \ - BCH2_NO_SB_OPT, 0, \ + BCH_MEMBER_BUCKET_SIZE, 0, \ "size", "Specifies the bucket size; must be greater than the btree node size")\ x(durability, u8, \ - OPT_DEVICE|OPT_SB_FIELD_ONE_BIAS, \ + OPT_DEVICE|OPT_RUNTIME|OPT_SB_FIELD_ONE_BIAS, \ OPT_UINT(0, BCH_REPLICAS_MAX), \ - BCH2_NO_SB_OPT, 1, \ + BCH_MEMBER_DURABILITY, 1, \ "n", "Data written to this device will be considered\n"\ "to have already been replicated n times") \ x(data_allowed, u8, \ OPT_DEVICE, \ OPT_BITFIELD(__bch2_data_types), \ - BCH2_NO_SB_OPT, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\ + BCH_MEMBER_DATA_ALLOWED, BIT(BCH_DATA_journal)|BIT(BCH_DATA_btree)|BIT(BCH_DATA_user),\ "types", "Allowed data types for this device: journal, btree, and/or user")\ + x(discard, u8, \ + OPT_MOUNT|OPT_FS|OPT_DEVICE|OPT_RUNTIME, \ + OPT_BOOL(), \ + BCH_MEMBER_DISCARD, true, \ + NULL, "Enable discard/TRIM support") \ x(btree_node_prefetch, u8, \ OPT_FS|OPT_MOUNT|OPT_RUNTIME, \ OPT_BOOL(), \ BCH2_NO_SB_OPT, true, \ - NULL, "BTREE_ITER_prefetch casuse btree nodes to be\n"\ + NULL, "BTREE_ITER_prefetch causes btree nodes to be\n"\ " prefetched sequentially") -#define BCH_DEV_OPT_SETTERS() \ - x(discard, BCH_MEMBER_DISCARD) \ - x(durability, BCH_MEMBER_DURABILITY) \ - x(data_allowed, BCH_MEMBER_DATA_ALLOWED) - struct bch_opts { #define x(_name, _bits, ...) 
unsigned _name##_defined:1; BCH_OPTS() @@ -582,8 +596,6 @@ struct printbuf; struct bch_option { struct attribute attr; - u64 (*get_sb)(const struct bch_sb *); - void (*set_sb)(struct bch_sb *, u64); enum opt_type type; enum opt_flags flags; u64 min, max; @@ -595,6 +607,12 @@ struct bch_option { const char *hint; const char *help; + u64 (*get_sb)(const struct bch_sb *); + void (*set_sb)(struct bch_sb *, u64); + + u64 (*get_member)(const struct bch_member *); + void (*set_member)(struct bch_member *, u64); + }; extern const struct bch_option bch2_opt_table[]; @@ -603,12 +621,12 @@ bool bch2_opt_defined_by_id(const struct bch_opts *, enum bch_opt_id); u64 bch2_opt_get_by_id(const struct bch_opts *, enum bch_opt_id); void bch2_opt_set_by_id(struct bch_opts *, enum bch_opt_id, u64); -u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id); +u64 bch2_opt_from_sb(struct bch_sb *, enum bch_opt_id, int); int bch2_opts_from_sb(struct bch_opts *, struct bch_sb *); -void __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64); +bool __bch2_opt_set_sb(struct bch_sb *, int, const struct bch_option *, u64); struct bch_dev; -void bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64); +bool bch2_opt_set_sb(struct bch_fs *, struct bch_dev *, const struct bch_option *, u64); int bch2_opt_lookup(const char *); int bch2_opt_validate(const struct bch_option *, u64, struct printbuf *); @@ -625,12 +643,15 @@ void bch2_opts_to_text(struct printbuf *, struct bch_fs *, struct bch_sb *, unsigned, unsigned, unsigned); -int bch2_opt_check_may_set(struct bch_fs *, int, u64); -int bch2_opts_check_may_set(struct bch_fs *); +int bch2_opt_hook_pre_set(struct bch_fs *, struct bch_dev *, enum bch_opt_id, u64); +int bch2_opts_hooks_pre_set(struct bch_fs *); +void bch2_opt_hook_post_set(struct bch_fs *, struct bch_dev *, u64, + struct bch_opts *, enum bch_opt_id); + int bch2_parse_one_mount_opt(struct bch_fs *, struct bch_opts *, struct printbuf *, const char *, const char *); int bch2_parse_mount_opts(struct bch_fs *, struct bch_opts *, struct printbuf *, - char *); + char *, bool); /* inode opts: */ diff --git a/fs/bcachefs/printbuf.c b/fs/bcachefs/printbuf.c index 4cf5a2af1e6f..3302bbc78a09 100644 --- a/fs/bcachefs/printbuf.c +++ b/fs/bcachefs/printbuf.c @@ -277,6 +277,25 @@ void bch2_printbuf_indent_add(struct printbuf *buf, unsigned spaces) } /** + * bch2_printbuf_indent_add_nextline() - add to the current indent level for + * subsequent lines + * + * @buf: printbuf to control + * @spaces: number of spaces to add to the current indent level + * + * Subsequent lines - not the current line - will be indented by @spaces more + * spaces. 
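+ *
+ * A usage sketch (the two-space indent is only an example):
+ *
+ *	printbuf_indent_add_nextline(&buf, 2);
+ *	prt_printf(&buf, "error:\nmore info");
+ *
+ * leaves "error:" at the current indentation, while "more info" starts two
+ * spaces further in.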
+ */ +void bch2_printbuf_indent_add_nextline(struct printbuf *buf, unsigned spaces) +{ + if (WARN_ON_ONCE(buf->indent + spaces < buf->indent)) + spaces = 0; + + buf->indent += spaces; + buf->has_indent_or_tabstops = true; +} + +/** * bch2_printbuf_indent_sub() - subtract from the current indent level * * @buf: printbuf to control diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h index d0dd398baa2b..1ca476adbf6f 100644 --- a/fs/bcachefs/printbuf.h +++ b/fs/bcachefs/printbuf.h @@ -112,6 +112,7 @@ void bch2_printbuf_tabstop_pop(struct printbuf *); int bch2_printbuf_tabstop_push(struct printbuf *, unsigned); void bch2_printbuf_indent_add(struct printbuf *, unsigned); +void bch2_printbuf_indent_add_nextline(struct printbuf *, unsigned); void bch2_printbuf_indent_sub(struct printbuf *, unsigned); void bch2_prt_newline(struct printbuf *); diff --git a/fs/bcachefs/progress.c b/fs/bcachefs/progress.c new file mode 100644 index 000000000000..d09898566abe --- /dev/null +++ b/fs/bcachefs/progress.c @@ -0,0 +1,61 @@ +// SPDX-License-Identifier: GPL-2.0 +#include "bcachefs.h" +#include "bbpos.h" +#include "disk_accounting.h" +#include "progress.h" + +void bch2_progress_init(struct progress_indicator_state *s, + struct bch_fs *c, + u64 btree_id_mask) +{ + memset(s, 0, sizeof(*s)); + + s->next_print = jiffies + HZ * 10; + + for (unsigned i = 0; i < BTREE_ID_NR; i++) { + if (!(btree_id_mask & BIT_ULL(i))) + continue; + + struct disk_accounting_pos acc; + disk_accounting_key_init(acc, btree, .id = i); + + u64 v; + bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); + s->nodes_total += div64_ul(v, btree_sectors(c)); + } +} + +static inline bool progress_update_p(struct progress_indicator_state *s) +{ + bool ret = time_after_eq(jiffies, s->next_print); + + if (ret) + s->next_print = jiffies + HZ * 10; + return ret; +} + +void bch2_progress_update_iter(struct btree_trans *trans, + struct progress_indicator_state *s, + struct btree_iter *iter, + const char *msg) +{ + struct bch_fs *c = trans->c; + struct btree *b = path_l(btree_iter_path(trans, iter))->b; + + s->nodes_seen += b != s->last_node; + s->last_node = b; + + if (progress_update_p(s)) { + struct printbuf buf = PRINTBUF; + unsigned percent = s->nodes_total + ? div64_u64(s->nodes_seen * 100, s->nodes_total) + : 0; + + prt_printf(&buf, "%s: %d%%, done %llu/%llu nodes, at ", + msg, percent, s->nodes_seen, s->nodes_total); + bch2_bbpos_to_text(&buf, BBPOS(iter->btree_id, iter->pos)); + + bch_info(c, "%s", buf.buf); + printbuf_exit(&buf); + } +} diff --git a/fs/bcachefs/progress.h b/fs/bcachefs/progress.h new file mode 100644 index 000000000000..23fb1811f943 --- /dev/null +++ b/fs/bcachefs/progress.h @@ -0,0 +1,29 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_PROGRESS_H +#define _BCACHEFS_PROGRESS_H + +/* + * Lame progress indicators + * + * We don't like to use these because they print to the dmesg console, which is + * spammy - we much prefer to be wired up to a userspace program (e.g. via + * thread_with_file) and have it print the progress indicator. + * + * But some code is old and doesn't support that, or runs in a context where + * that's not yet practical (mount).
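+ *
+ * Intended usage, as a sketch (the btree mask and message are
+ * illustrative):
+ *
+ *	struct progress_indicator_state progress;
+ *	bch2_progress_init(&progress, c, BIT_ULL(BTREE_ID_extents));
+ *
+ * and then, from the btree iteration loop:
+ *
+ *	bch2_progress_update_iter(trans, &progress, &iter, "check_extents");
+ *
+ * which prints "check_extents: N%, done x/y nodes, at ..." at most once
+ * every ten seconds.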
+ */ + +struct progress_indicator_state { + unsigned long next_print; + u64 nodes_seen; + u64 nodes_total; + struct btree *last_node; +}; + +void bch2_progress_init(struct progress_indicator_state *, struct bch_fs *, u64); +void bch2_progress_update_iter(struct btree_trans *, + struct progress_indicator_state *, + struct btree_iter *, + const char *); + +#endif /* _BCACHEFS_PROGRESS_H */ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 8b857fc33244..3d4755d73af7 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -516,7 +516,7 @@ static int bch2_fs_quota_read_inode(struct btree_trans *trans, bch2_quota_acct(c, bch_qid(&u), Q_INO, 1, KEY_TYPE_QUOTA_NOCHECK); advance: - bch2_btree_iter_set_pos(iter, bpos_nosnap_successor(iter->pos)); + bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(iter->pos)); return 0; } diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index d0a1f5cd5c2b..de1ec9e0caa0 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -26,9 +26,8 @@ /* bch_extent_rebalance: */ -static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +static const struct bch_extent_rebalance *bch2_bkey_ptrs_rebalance_opts(struct bkey_ptrs_c ptrs) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); const union bch_extent_entry *entry; bkey_extent_entry_for_each(ptrs, entry) @@ -38,6 +37,11 @@ static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s return NULL; } +static const struct bch_extent_rebalance *bch2_bkey_rebalance_opts(struct bkey_s_c k) +{ + return bch2_bkey_ptrs_rebalance_opts(bch2_bkey_ptrs_c(k)); +} + static inline unsigned bch2_bkey_ptrs_need_compress(struct bch_fs *c, struct bch_io_opts *opts, struct bkey_s_c k, @@ -76,11 +80,13 @@ static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c, unsigned ptr_bit = 1; unsigned rewrite_ptrs = 0; + rcu_read_lock(); bkey_for_each_ptr(ptrs, ptr) { if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) rewrite_ptrs |= ptr_bit; ptr_bit <<= 1; } + rcu_read_unlock(); return rewrite_ptrs; } @@ -91,17 +97,24 @@ static unsigned bch2_bkey_ptrs_need_rebalance(struct bch_fs *c, { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) + return 0; + return bch2_bkey_ptrs_need_compress(c, opts, k, ptrs) | bch2_bkey_ptrs_need_move(c, opts, ptrs); } u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) { - const struct bch_extent_rebalance *opts = bch2_bkey_rebalance_opts(k); + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + + const struct bch_extent_rebalance *opts = bch2_bkey_ptrs_rebalance_opts(ptrs); if (!opts) return 0; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + if (bch2_bkey_extent_ptrs_flags(ptrs) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) + return 0; + const union bch_extent_entry *entry; struct extent_ptr_decoded p; u64 sectors = 0; @@ -121,10 +134,14 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) } } incompressible: - if (opts->background_target) + if (opts->background_target) { + rcu_read_lock(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) - if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) + if (!p.ptr.cached && + !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) sectors += p.crc.compressed_size; + rcu_read_unlock(); + } return sectors; } @@ -228,7 +245,7 @@ int bch2_set_rebalance_needs_scan_trans(struct btree_trans *trans, u64 inum) 
bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), BTREE_ITER_intent); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) goto err; @@ -257,7 +274,7 @@ int bch2_set_rebalance_needs_scan(struct bch_fs *c, u64 inum) int ret = bch2_trans_commit_do(c, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, bch2_set_rebalance_needs_scan_trans(trans, inum)); - rebalance_wakeup(c); + bch2_rebalance_wakeup(c); return ret; } @@ -276,7 +293,7 @@ static int bch2_clear_rebalance_needs_scan(struct btree_trans *trans, u64 inum, bch2_trans_iter_init(trans, &iter, BTREE_ID_rebalance_work, SPOS(inum, REBALANCE_WORK_SCAN_OFFSET, U32_MAX), BTREE_ITER_intent); - k = bch2_btree_iter_peek_slot(&iter); + k = bch2_btree_iter_peek_slot(trans, &iter); ret = bkey_err(k); if (ret) goto err; @@ -296,7 +313,7 @@ static struct bkey_s_c next_rebalance_entry(struct btree_trans *trans, struct btree_iter *work_iter) { return !kthread_should_stop() - ? bch2_btree_iter_peek(work_iter) + ? bch2_btree_iter_peek(trans, work_iter) : bkey_s_c_null; } @@ -304,7 +321,7 @@ static int bch2_bkey_clear_needs_rebalance(struct btree_trans *trans, struct btree_iter *iter, struct bkey_s_c k) { - if (!bch2_bkey_rebalance_opts(k)) + if (k.k->type == KEY_TYPE_reflink_v || !bch2_bkey_rebalance_opts(k)) return 0; struct bkey_i *n = bch2_bkey_make_mut(trans, iter, &k, 0); @@ -330,7 +347,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, work_pos.inode ? BTREE_ID_extents : BTREE_ID_reflink, work_pos, BTREE_ITER_all_snapshots); - struct bkey_s_c k = bch2_btree_iter_peek_slot(extent_iter); + struct bkey_s_c k = bch2_btree_iter_peek_slot(trans, extent_iter); if (bkey_err(k)) return k; @@ -341,7 +358,7 @@ static struct bkey_s_c next_rebalance_extent(struct btree_trans *trans, memset(data_opts, 0, sizeof(*data_opts)); data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); data_opts->target = io_opts->background_target; - data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; + data_opts->write_flags |= BCH_WRITE_only_specified_devs; if (!data_opts->rewrite_ptrs) { /* @@ -442,22 +459,11 @@ out: return ret; } -static bool rebalance_pred(struct bch_fs *c, void *arg, - struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) -{ - data_opts->rewrite_ptrs = bch2_bkey_ptrs_need_rebalance(c, io_opts, k); - data_opts->target = io_opts->background_target; - data_opts->write_flags |= BCH_WRITE_ONLY_SPECIFIED_DEVS; - return data_opts->rewrite_ptrs != 0; -} - static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) { struct btree_trans *trans = ctxt->trans; + struct bch_fs *c = trans->c; struct bch_fs_rebalance *r = &trans->c->rebalance; - int ret; bch2_move_stats_init(&r->scan_stats, "rebalance_scan"); ctxt->stats = &r->scan_stats; @@ -472,11 +478,34 @@ static int do_rebalance_scan(struct moving_context *ctxt, u64 inum, u64 cookie) r->state = BCH_REBALANCE_scanning; - ret = __bch2_move_data(ctxt, r->scan_start, r->scan_end, rebalance_pred, NULL) ?: - commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_clear_rebalance_needs_scan(trans, inum, cookie)); + struct per_snapshot_io_opts snapshot_io_opts; + per_snapshot_io_opts_init(&snapshot_io_opts, c); + + int ret = for_each_btree_key_max(trans, iter, BTREE_ID_extents, + r->scan_start.pos, r->scan_end.pos, + BTREE_ITER_all_snapshots| + BTREE_ITER_not_extents| + BTREE_ITER_prefetch, k, ({ + ctxt->stats->pos 
= BBPOS(iter.btree_id, iter.pos); + struct bch_io_opts *io_opts = bch2_move_get_io_opts(trans, + &snapshot_io_opts, iter.pos, &iter, k); + PTR_ERR_OR_ZERO(io_opts); + })) ?: + commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + bch2_clear_rebalance_needs_scan(trans, inum, cookie)); + + per_snapshot_io_opts_exit(&snapshot_io_opts); bch2_move_stats_exit(&r->scan_stats, trans->c); + + /* + * Ensure that the rebalance_work entries we created are seen by the + * next iteration of do_rebalance(), so we don't end up stuck in + * rebalance_wait(): + */ + atomic64_inc(&r->scan_stats.sectors_seen); + bch2_btree_write_buffer_flush_sync(trans); + return ret; } @@ -501,12 +530,19 @@ static void rebalance_wait(struct bch_fs *c) bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); } +static bool bch2_rebalance_enabled(struct bch_fs *c) +{ + return c->opts.rebalance_enabled && + !(c->opts.rebalance_on_ac_only && + c->rebalance.on_battery); +} + static int do_rebalance(struct moving_context *ctxt) { struct btree_trans *trans = ctxt->trans; struct bch_fs *c = trans->c; struct bch_fs_rebalance *r = &c->rebalance; - struct btree_iter rebalance_work_iter, extent_iter = { NULL }; + struct btree_iter rebalance_work_iter, extent_iter = {}; struct bkey_s_c k; int ret = 0; @@ -520,9 +556,9 @@ static int do_rebalance(struct moving_context *ctxt) BTREE_ITER_all_snapshots); while (!bch2_move_ratelimit(ctxt)) { - if (!c->opts.rebalance_enabled) { + if (!bch2_rebalance_enabled(c)) { bch2_moving_ctxt_flush_all(ctxt); - kthread_wait_freezable(c->opts.rebalance_enabled || + kthread_wait_freezable(bch2_rebalance_enabled(c) || kthread_should_stop()); } @@ -547,7 +583,7 @@ static int do_rebalance(struct moving_context *ctxt) if (ret) break; - bch2_btree_iter_advance(&rebalance_work_iter); + bch2_btree_iter_advance(trans, &rebalance_work_iter); } bch2_trans_iter_exit(trans, &extent_iter); @@ -576,6 +612,13 @@ static int bch2_rebalance_thread(void *arg) set_freezable(); + /* + * Data move operations can't run until after check_snapshots has + * completed, and bch2_snapshot_is_ancestor() is available. 
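+ *
+ * The freezable wait below parks the thread until that pass has
+ * completed (or we're asked to stop), rather than polling for it.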
+ */ + kthread_wait_freezable(c->recovery.pass_done > BCH_RECOVERY_PASS_check_snapshots || + kthread_should_stop()); + bch2_moving_ctxt_init(&ctxt, c, NULL, &r->work_stats, writepoint_ptr(&c->rebalance_write_point), true); @@ -590,8 +633,20 @@ static int bch2_rebalance_thread(void *arg) void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) { + printbuf_tabstop_push(out, 32); + struct bch_fs_rebalance *r = &c->rebalance; + /* print pending work */ + struct disk_accounting_pos acc; + disk_accounting_key_init(acc, rebalance_work); + u64 v; + bch2_accounting_mem_read(c, disk_accounting_pos_to_bpos(&acc), &v, 1); + + prt_printf(out, "pending work:\t"); + prt_human_readable_u64(out, v << 9); + prt_printf(out, "\n\n"); + prt_str(out, bch2_rebalance_state_strs[r->state]); prt_newline(out); printbuf_indent_add(out, 2); @@ -600,15 +655,15 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) case BCH_REBALANCE_waiting: { u64 now = atomic64_read(&c->io_clock[WRITE].now); - prt_str(out, "io wait duration: "); + prt_printf(out, "io wait duration:\t"); bch2_prt_human_readable_s64(out, (r->wait_iotime_end - r->wait_iotime_start) << 9); prt_newline(out); - prt_str(out, "io wait remaining: "); + prt_printf(out, "io wait remaining:\t"); bch2_prt_human_readable_s64(out, (r->wait_iotime_end - now) << 9); prt_newline(out); - prt_str(out, "duration waited: "); + prt_printf(out, "duration waited:\t"); bch2_pr_time_units(out, ktime_get_real_ns() - r->wait_wallclock_start); prt_newline(out); break; @@ -621,6 +676,18 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) break; } prt_newline(out); + + rcu_read_lock(); + struct task_struct *t = rcu_dereference(c->rebalance.thread); + if (t) + get_task_struct(t); + rcu_read_unlock(); + + if (t) { + bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); + put_task_struct(t); + } + printbuf_indent_sub(out, 2); } @@ -635,7 +702,7 @@ void bch2_rebalance_stop(struct bch_fs *c) c->rebalance.thread = NULL; if (p) { - /* for sychronizing with rebalance_wakeup() */ + /* for synchronizing with bch2_rebalance_wakeup() */ synchronize_rcu(); kthread_stop(p); @@ -666,7 +733,156 @@ int bch2_rebalance_start(struct bch_fs *c) return 0; } -void bch2_fs_rebalance_init(struct bch_fs *c) +#ifdef CONFIG_POWER_SUPPLY +#include <linux/power_supply.h> + +static int bch2_rebalance_power_notifier(struct notifier_block *nb, + unsigned long event, void *data) +{ + struct bch_fs *c = container_of(nb, struct bch_fs, rebalance.power_notifier); + + c->rebalance.on_battery = !power_supply_is_system_supplied(); + bch2_rebalance_wakeup(c); + return NOTIFY_OK; +} +#endif + +void bch2_fs_rebalance_exit(struct bch_fs *c) +{ +#ifdef CONFIG_POWER_SUPPLY + power_supply_unreg_notifier(&c->rebalance.power_notifier); +#endif +} + +int bch2_fs_rebalance_init(struct bch_fs *c) { - bch2_pd_controller_init(&c->rebalance.pd); + struct bch_fs_rebalance *r = &c->rebalance; + + bch2_pd_controller_init(&r->pd); + +#ifdef CONFIG_POWER_SUPPLY + r->power_notifier.notifier_call = bch2_rebalance_power_notifier; + int ret = power_supply_reg_notifier(&r->power_notifier); + if (ret) + return ret; + + r->on_battery = !power_supply_is_system_supplied(); +#endif + return 0; +} + +static int check_rebalance_work_one(struct btree_trans *trans, + struct btree_iter *extent_iter, + struct btree_iter *rebalance_iter, + struct bkey_buf *last_flushed) +{ + struct bch_fs *c = trans->c; + struct bkey_s_c extent_k, rebalance_k; + struct printbuf buf = PRINTBUF; + + int ret =
bkey_err(extent_k = bch2_btree_iter_peek(trans, extent_iter)) ?: + bkey_err(rebalance_k = bch2_btree_iter_peek(trans, rebalance_iter)); + if (ret) + return ret; + + if (!extent_k.k && + extent_iter->btree_id == BTREE_ID_reflink && + (!rebalance_k.k || + rebalance_k.k->p.inode >= BCACHEFS_ROOT_INO)) { + bch2_trans_iter_exit(trans, extent_iter); + bch2_trans_iter_init(trans, extent_iter, + BTREE_ID_extents, POS_MIN, + BTREE_ITER_prefetch| + BTREE_ITER_all_snapshots); + return -BCH_ERR_transaction_restart_nested; + } + + if (!extent_k.k && !rebalance_k.k) + return 1; + + int cmp = bpos_cmp(extent_k.k ? extent_k.k->p : SPOS_MAX, + rebalance_k.k ? rebalance_k.k->p : SPOS_MAX); + + struct bkey deleted; + bkey_init(&deleted); + + if (cmp < 0) { + deleted.p = extent_k.k->p; + rebalance_k.k = &deleted; + } else if (cmp > 0) { + deleted.p = rebalance_k.k->p; + extent_k.k = &deleted; + } + + bool should_have_rebalance = + bch2_bkey_sectors_need_rebalance(c, extent_k) != 0; + bool have_rebalance = rebalance_k.k->type == KEY_TYPE_set; + + if (should_have_rebalance != have_rebalance) { + ret = bch2_btree_write_buffer_maybe_flush(trans, extent_k, last_flushed); + if (ret) + return ret; + + bch2_bkey_val_to_text(&buf, c, extent_k); + } + + if (fsck_err_on(!should_have_rebalance && have_rebalance, + trans, rebalance_work_incorrectly_set, + "rebalance work incorrectly set\n%s", buf.buf)) { + ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, + extent_k.k->p, false); + if (ret) + goto err; + } + + if (fsck_err_on(should_have_rebalance && !have_rebalance, + trans, rebalance_work_incorrectly_unset, + "rebalance work incorrectly unset\n%s", buf.buf)) { + ret = bch2_btree_bit_mod_buffered(trans, BTREE_ID_rebalance_work, + extent_k.k->p, true); + if (ret) + goto err; + } + + if (cmp <= 0) + bch2_btree_iter_advance(trans, extent_iter); + if (cmp >= 0) + bch2_btree_iter_advance(trans, rebalance_iter); +err: +fsck_err: + printbuf_exit(&buf); + return ret; +} + +int bch2_check_rebalance_work(struct bch_fs *c) +{ + struct btree_trans *trans = bch2_trans_get(c); + struct btree_iter rebalance_iter, extent_iter; + int ret = 0; + + bch2_trans_iter_init(trans, &extent_iter, + BTREE_ID_reflink, POS_MIN, + BTREE_ITER_prefetch); + bch2_trans_iter_init(trans, &rebalance_iter, + BTREE_ID_rebalance_work, POS_MIN, + BTREE_ITER_prefetch); + + struct bkey_buf last_flushed; + bch2_bkey_buf_init(&last_flushed); + bkey_init(&last_flushed.k->k); + + while (!ret) { + bch2_trans_begin(trans); + + ret = check_rebalance_work_one(trans, &extent_iter, &rebalance_iter, &last_flushed); + + if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) + ret = 0; + } + + bch2_bkey_buf_exit(&last_flushed, c); + bch2_trans_iter_exit(trans, &extent_iter); + bch2_trans_iter_exit(trans, &rebalance_iter); + bch2_trans_put(trans); + return ret < 0 ? 
ret : 0; } diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 62a3859d3823..5d9214fe1a22 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -37,7 +37,7 @@ int bch2_set_rebalance_needs_scan_trans(struct btree_trans *, u64); int bch2_set_rebalance_needs_scan(struct bch_fs *, u64 inum); int bch2_set_fs_needs_rebalance(struct bch_fs *); -static inline void rebalance_wakeup(struct bch_fs *c) +static inline void bch2_rebalance_wakeup(struct bch_fs *c) { struct task_struct *p; @@ -52,6 +52,10 @@ void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *); void bch2_rebalance_stop(struct bch_fs *); int bch2_rebalance_start(struct bch_fs *); -void bch2_fs_rebalance_init(struct bch_fs *); + +void bch2_fs_rebalance_exit(struct bch_fs *); +int bch2_fs_rebalance_init(struct bch_fs *); + +int bch2_check_rebalance_work(struct bch_fs *); #endif /* _BCACHEFS_REBALANCE_H */ diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h index fe5098c17dfc..33d77286f1d5 100644 --- a/fs/bcachefs/rebalance_types.h +++ b/fs/bcachefs/rebalance_types.h @@ -30,6 +30,11 @@ struct bch_fs_rebalance { struct bbpos scan_start; struct bbpos scan_end; struct bch_move_stats scan_stats; + + bool on_battery; +#ifdef CONFIG_POWER_SUPPLY + struct notifier_block power_notifier; +#endif }; #endif /* _BCACHEFS_REBALANCE_TYPES_H */ diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 71c786cdb192..4fca57575565 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -13,12 +13,13 @@ #include "disk_accounting.h" #include "errcode.h" #include "error.h" -#include "fs-common.h" #include "journal_io.h" #include "journal_reclaim.h" #include "journal_seq_blacklist.h" #include "logged_ops.h" #include "move.h" +#include "movinggc.h" +#include "namei.h" #include "quota.h" #include "rebalance.h" #include "recovery.h" @@ -32,8 +33,9 @@ #include <linux/sort.h> #include <linux/stat.h> - -int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) +int bch2_btree_lost_data(struct bch_fs *c, + struct printbuf *msg, + enum btree_id btree) { u64 b = BIT_ULL(btree); int ret = 0; @@ -42,32 +44,32 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); if (!(c->sb.btrees_lost_data & b)) { - struct printbuf buf = PRINTBUF; - bch2_btree_id_to_text(&buf, btree); - bch_err(c, "flagging btree %s lost data", buf.buf); - printbuf_exit(&buf); + prt_printf(msg, "flagging btree "); + bch2_btree_id_to_text(msg, btree); + prt_printf(msg, " lost data\n"); + ext->btrees_lost_data |= cpu_to_le64(b); } /* Once we have runtime self healing for topology errors we won't need this: */ - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_topology) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_topology, 0) ?: ret; /* Btree node accounting will be off: */ __set_bit_le64(BCH_FSCK_ERR_accounting_mismatch, ext->errors_silent); - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret; #ifdef CONFIG_BCACHEFS_DEBUG /* * These are much more minor, and don't need to be corrected right away, * but in debug mode we want the next fsck run to be clean: */ - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_lrus) ?: ret; - ret = 
bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_backpointers_to_extents) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_lrus, 0) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_backpointers_to_extents, 0) ?: ret; #endif switch (btree) { case BTREE_ID_alloc: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; __set_bit_le64(BCH_FSCK_ERR_alloc_key_data_type_wrong, ext->errors_silent); __set_bit_le64(BCH_FSCK_ERR_alloc_key_gen_wrong, ext->errors_silent); @@ -77,26 +79,30 @@ int bch2_btree_lost_data(struct bch_fs *c, enum btree_id btree) __set_bit_le64(BCH_FSCK_ERR_alloc_key_stripe_redundancy_wrong, ext->errors_silent); goto out; case BTREE_ID_backpointers: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_btree_backpointers) ?: ret; - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_extents_to_backpointers) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_btree_backpointers, 0) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_extents_to_backpointers, 0) ?: ret; goto out; case BTREE_ID_need_discard: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; goto out; case BTREE_ID_freespace: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; goto out; case BTREE_ID_bucket_gens: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; goto out; case BTREE_ID_lru: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_alloc_info) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_alloc_info, 0) ?: ret; goto out; case BTREE_ID_accounting: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_check_allocations) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_check_allocations, 0) ?: ret; + goto out; + case BTREE_ID_snapshots: + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_reconstruct_snapshots, 0) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret; goto out; default: - ret = bch2_run_explicit_recovery_pass_persistent_locked(c, BCH_RECOVERY_PASS_scan_for_btree_nodes) ?: ret; + ret = __bch2_run_explicit_recovery_pass(c, msg, BCH_RECOVERY_PASS_scan_for_btree_nodes, 0) ?: ret; goto out; } out: @@ -113,11 +119,8 @@ static void kill_btree(struct bch_fs *c, enum btree_id btree) } /* for -o reconstruct_alloc: */ -static void bch2_reconstruct_alloc(struct bch_fs *c) +void bch2_reconstruct_alloc(struct bch_fs *c) { - bch2_journal_log_msg(c, "dropping alloc info"); - bch_info(c, "dropping and reconstructing all alloc info"); - mutex_lock(&c->sb_lock); struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); @@ -159,6 +162,8 @@ static void bch2_reconstruct_alloc(struct bch_fs *c) 
c->opts.recovery_passes |= bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info)); + bch2_write_super(c); mutex_unlock(&c->sb_lock); @@ -198,7 +203,7 @@ static int bch2_journal_replay_accounting_key(struct btree_trans *trans, bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, k->level, BTREE_ITER_intent); - int ret = bch2_btree_iter_traverse(&iter); + int ret = bch2_btree_iter_traverse(trans, &iter); if (ret) goto out; @@ -261,7 +266,7 @@ static int bch2_journal_replay_key(struct btree_trans *trans, bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, k->level, iter_flags); - ret = bch2_btree_iter_traverse(&iter); + ret = bch2_btree_iter_traverse(trans, &iter); if (ret) goto out; @@ -270,7 +275,7 @@ static int bch2_journal_replay_key(struct btree_trans *trans, bch2_trans_iter_exit(trans, &iter); bch2_trans_node_iter_init(trans, &iter, k->btree_id, k->k->k.p, BTREE_MAX_DEPTH, 0, iter_flags); - ret = bch2_btree_iter_traverse(&iter) ?: + ret = bch2_btree_iter_traverse(trans, &iter) ?: bch2_btree_increase_depth(trans, iter.path, 0) ?: -BCH_ERR_transaction_restart_nested; goto out; @@ -281,7 +286,12 @@ static int bch2_journal_replay_key(struct btree_trans *trans, goto out; if (k->k->k.type == KEY_TYPE_accounting) { - ret = bch2_trans_update_buffered(trans, BTREE_ID_accounting, k->k); + struct bkey_i *n = bch2_trans_subbuf_alloc(trans, &trans->accounting, k->k->k.u64s); + ret = PTR_ERR_OR_ZERO(n); + if (ret) + goto out; + + bkey_copy(n, k->k); goto out; } @@ -389,9 +399,9 @@ int bch2_journal_replay(struct bch_fs *c) * Now, replay any remaining keys in the order in which they appear in * the journal, unpinning those journal entries as we go: */ - sort(keys_sorted.data, keys_sorted.nr, - sizeof(keys_sorted.data[0]), - journal_sort_seq_cmp, NULL); + sort_nonatomic(keys_sorted.data, keys_sorted.nr, + sizeof(keys_sorted.data[0]), + journal_sort_seq_cmp, NULL); darray_for_each(keys_sorted, kp) { cond_resched(); @@ -429,7 +439,7 @@ int bch2_journal_replay(struct bch_fs *c) trans = NULL; if (!c->opts.retain_recovery_info && - c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) + c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) bch2_journal_keys_put_initial(c); replay_now_at(j, j->replay_journal_seq_end); @@ -584,9 +594,6 @@ static int read_btree_roots(struct bch_fs *c) buf.buf, bch2_err_str(ret))) { if (btree_id_is_alloc(i)) r->error = 0; - - ret = bch2_btree_lost_data(c, i); - BUG_ON(ret); } } @@ -666,7 +673,7 @@ static bool check_version_upgrade(struct bch_fs *c) bch2_recovery_passes_from_stable(le64_to_cpu(passes))); } - bch_info(c, "%s", buf.buf); + bch_notice(c, "%s", buf.buf); printbuf_exit(&buf); ret = true; @@ -682,7 +689,7 @@ static bool check_version_upgrade(struct bch_fs *c) bch2_version_to_text(&buf, c->sb.version_incompat_allowed); prt_newline(&buf); - bch_info(c, "%s", buf.buf); + bch_notice(c, "%s", buf.buf); printbuf_exit(&buf); ret = true; @@ -789,11 +796,11 @@ int bch2_fs_recovery(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); - if (c->opts.fsck) - set_bit(BCH_FS_fsck_running, &c->flags); if (c->sb.clean) set_bit(BCH_FS_clean_recovery, &c->flags); - set_bit(BCH_FS_recovery_running, &c->flags); + if (c->opts.fsck) + set_bit(BCH_FS_in_fsck, &c->flags); + set_bit(BCH_FS_in_recovery, &c->flags); ret = bch2_blacklist_table_initialize(c); if (ret) { @@ -888,8 +895,37 @@ use_clean: if (ret) goto 
err; - if (c->opts.reconstruct_alloc) + ret = bch2_fs_resize_on_mount(c); + if (ret) { + up_write(&c->state_lock); + goto err; + } + + if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { + bch_info(c, "filesystem is an unresized image file, mounting ro"); + c->opts.read_only = true; + } + + if (!c->opts.read_only && + (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) { + bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); + bch2_reconstruct_alloc(c); + } else if (c->opts.reconstruct_alloc) { + bch2_journal_log_msg(c, "dropping alloc info"); + bch_info(c, "dropping and reconstructing all alloc info"); + + bch2_reconstruct_alloc(c); + } + + if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) { + /* We can't go RW to fix errors without alloc info */ + if (c->opts.fix_errors == FSCK_FIX_yes || + c->opts.fix_errors == FSCK_FIX_ask) + c->opts.fix_errors = FSCK_FIX_no; + if (c->opts.errors == BCH_ON_ERROR_fix_safe) + c->opts.errors = BCH_ON_ERROR_continue; + } /* * After an unclean shutdown, skip then next few journal sequence @@ -899,7 +935,7 @@ use_clean: * journal sequence numbers: */ if (!c->sb.clean) - journal_seq += 8; + journal_seq += JOURNAL_BUF_NR * 4; if (blacklist_seq != journal_seq) { ret = bch2_journal_log_msg(c, "blacklisting entries %llu-%llu", @@ -932,8 +968,10 @@ use_clean: set_bit(BCH_FS_btree_running, &c->flags); ret = bch2_sb_set_upgrade_extra(c); + if (ret) + goto err; - ret = bch2_run_recovery_passes(c); + ret = bch2_run_recovery_passes(c, 0); if (ret) goto err; @@ -944,8 +982,7 @@ use_clean: * multithreaded use: */ set_bit(BCH_FS_may_go_rw, &c->flags); - clear_bit(BCH_FS_fsck_running, &c->flags); - clear_bit(BCH_FS_recovery_running, &c->flags); + clear_bit(BCH_FS_in_fsck, &c->flags); /* in case we don't run journal replay, i.e. 
norecovery mode */ set_bit(BCH_FS_accounting_replay_done, &c->flags); @@ -968,9 +1005,8 @@ use_clean: bch_info(c, "Fixed errors, running fsck a second time to verify fs is clean"); clear_bit(BCH_FS_errors_fixed, &c->flags); - c->curr_recovery_pass = BCH_RECOVERY_PASS_check_alloc_info; - - ret = bch2_run_recovery_passes(c); + ret = bch2_run_recovery_passes(c, + BCH_RECOVERY_PASS_check_alloc_info); if (ret) goto err; @@ -1014,7 +1050,7 @@ use_clean: if (c->opts.fsck && !test_bit(BCH_FS_error, &c->flags) && - c->recovery_pass_done == BCH_RECOVERY_PASS_NR - 1 && + c->recovery.pass_done == BCH_RECOVERY_PASS_NR - 1 && ext->btrees_lost_data) { ext->btrees_lost_data = 0; write_sb = true; @@ -1075,8 +1111,17 @@ out: return ret; err: fsck_err: - bch2_fs_emergency_read_only(c); - goto out; + { + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "error in recovery: %s", bch2_err_str(ret)); + bch2_fs_emergency_read_only2(c, &buf); + + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + } + return ret; } int bch2_fs_initialize(struct bch_fs *c) @@ -1125,14 +1170,17 @@ int bch2_fs_initialize(struct bch_fs *c) * journal_res_get() will crash if called before this has * set up the journal.pin FIFO and journal.cur pointer: */ - bch2_fs_journal_start(&c->journal, 1); - set_bit(BCH_FS_accounting_replay_done, &c->flags); - bch2_journal_set_replay_done(&c->journal); + ret = bch2_fs_journal_start(&c->journal, 1); + if (ret) + goto err; ret = bch2_fs_read_write_early(c); if (ret) goto err; + set_bit(BCH_FS_accounting_replay_done, &c->flags); + bch2_journal_set_replay_done(&c->journal); + for_each_member_device(c, ca) { ret = bch2_dev_usage_init(ca, false); if (ret) { @@ -1189,7 +1237,10 @@ int bch2_fs_initialize(struct bch_fs *c) if (ret) goto err; - c->recovery_pass_done = BCH_RECOVERY_PASS_NR - 1; + c->recovery.pass_done = BCH_RECOVERY_PASS_NR - 1; + + bch2_copygc_wakeup(c); + bch2_rebalance_wakeup(c); if (enabled_qtypes(c)) { ret = bch2_fs_quota_read(c); @@ -1209,7 +1260,7 @@ int bch2_fs_initialize(struct bch_fs *c) bch2_write_super(c); mutex_unlock(&c->sb_lock); - c->curr_recovery_pass = BCH_RECOVERY_PASS_NR; + c->recovery.curr_pass = BCH_RECOVERY_PASS_NR; return 0; err: bch_err_fn(c, ret); diff --git a/fs/bcachefs/recovery.h b/fs/bcachefs/recovery.h index b0d55754b21b..c023f52fc2d6 100644 --- a/fs/bcachefs/recovery.h +++ b/fs/bcachefs/recovery.h @@ -2,7 +2,8 @@ #ifndef _BCACHEFS_RECOVERY_H #define _BCACHEFS_RECOVERY_H -int bch2_btree_lost_data(struct bch_fs *, enum btree_id); +int bch2_btree_lost_data(struct bch_fs *, struct printbuf *, enum btree_id); +void bch2_reconstruct_alloc(struct bch_fs *); int bch2_journal_replay(struct bch_fs *); diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index 0b3c951c32da..dabb29b08ad0 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -12,6 +12,7 @@ #include "journal.h" #include "lru.h" #include "logged_ops.h" +#include "movinggc.h" #include "rebalance.h" #include "recovery.h" #include "recovery_passes.h" @@ -27,6 +28,145 @@ const char * const bch2_recovery_passes[] = { NULL }; +static const u8 passes_to_stable_map[] = { +#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, + BCH_RECOVERY_PASSES() +#undef x +}; + +static const u8 passes_from_stable_map[] = { +#define x(n, id, ...) 
[BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, + BCH_RECOVERY_PASSES() +#undef x +}; + +static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass) +{ + return passes_to_stable_map[pass]; +} + +u64 bch2_recovery_passes_to_stable(u64 v) +{ + u64 ret = 0; + for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++) + if (v & BIT_ULL(i)) + ret |= BIT_ULL(passes_to_stable_map[i]); + return ret; +} + +static enum bch_recovery_pass bch2_recovery_pass_from_stable(enum bch_recovery_pass_stable pass) +{ + return pass < ARRAY_SIZE(passes_from_stable_map) + ? passes_from_stable_map[pass] + : 0; +} + +u64 bch2_recovery_passes_from_stable(u64 v) +{ + u64 ret = 0; + for (unsigned i = 0; i < ARRAY_SIZE(passes_from_stable_map); i++) + if (v & BIT_ULL(i)) + ret |= BIT_ULL(passes_from_stable_map[i]); + return ret; +} + +static int bch2_sb_recovery_passes_validate(struct bch_sb *sb, struct bch_sb_field *f, + enum bch_validate_flags flags, struct printbuf *err) +{ + return 0; +} + +static void bch2_sb_recovery_passes_to_text(struct printbuf *out, + struct bch_sb *sb, + struct bch_sb_field *f) +{ + struct bch_sb_field_recovery_passes *r = + field_to_type(f, recovery_passes); + unsigned nr = recovery_passes_nr_entries(r); + + if (out->nr_tabstops < 1) + printbuf_tabstop_push(out, 32); + if (out->nr_tabstops < 2) + printbuf_tabstop_push(out, 16); + + prt_printf(out, "Pass\tLast run\tLast runtime\n"); + + for (struct recovery_pass_entry *i = r->start; i < r->start + nr; i++) { + if (!i->last_run) + continue; + + unsigned idx = i - r->start; + + prt_printf(out, "%s\t", bch2_recovery_passes[bch2_recovery_pass_from_stable(idx)]); + + bch2_prt_datetime(out, le64_to_cpu(i->last_run)); + prt_tab(out); + + bch2_pr_time_units(out, le32_to_cpu(i->last_runtime) * NSEC_PER_SEC); + prt_newline(out); + } +} + +static void bch2_sb_recovery_pass_complete(struct bch_fs *c, + enum bch_recovery_pass pass, + s64 start_time) +{ + enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); + s64 end_time = ktime_get_real_seconds(); + + mutex_lock(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + __clear_bit_le64(stable, ext->recovery_passes_required); + + struct bch_sb_field_recovery_passes *r = + bch2_sb_field_get(c->disk_sb.sb, recovery_passes); + + if (stable >= recovery_passes_nr_entries(r)) { + unsigned u64s = struct_size(r, start, stable + 1) / sizeof(u64); + + r = bch2_sb_field_resize(&c->disk_sb, recovery_passes, u64s); + if (!r) { + bch_err(c, "error creating recovery_passes sb section"); + goto out; + } + } + + r->start[stable].last_run = cpu_to_le64(end_time); + r->start[stable].last_runtime = cpu_to_le32(max(0, end_time - start_time)); +out: + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} + +static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass) +{ + enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); + bool ret = false; + + lockdep_assert_held(&c->sb_lock); + + struct bch_sb_field_recovery_passes *r = + bch2_sb_field_get(c->disk_sb.sb, recovery_passes); + + if (stable < recovery_passes_nr_entries(r)) { + struct recovery_pass_entry *i = r->start + stable; + + /* + * Ratelimit if the last runtime was more than 1% of the time + * since we last ran + */ + ret = (u64) le32_to_cpu(i->last_runtime) * 100 > + ktime_get_real_seconds() - le64_to_cpu(i->last_run); + } + + return ret; +} + +const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes = { + 
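+	/* validation is currently a no-op; the section is informational only */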
.validate = bch2_sb_recovery_passes_validate, + .to_text = bch2_sb_recovery_passes_to_text +}; + /* Fake recovery pass, so that scan_for_btree_nodes isn't 0: */ static int bch2_recovery_pass_empty(struct bch_fs *c) { @@ -46,11 +186,36 @@ static int bch2_set_may_go_rw(struct bch_fs *c) set_bit(BCH_FS_may_go_rw, &c->flags); - if (keys->nr || !c->opts.read_only || c->opts.fsck || !c->sb.clean || c->opts.recovery_passes) + if (keys->nr || + !c->opts.read_only || + !c->sb.clean || + c->opts.recovery_passes || + (c->opts.fsck && !(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)))) { + if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) { + bch_info(c, "mounting a filesystem with no alloc info read-write; will recreate"); + bch2_reconstruct_alloc(c); + } + return bch2_fs_read_write_early(c); + } return 0; } +/* + * Make sure root inode is readable while we're still in recovery and can rewind + * for repair: + */ +static int bch2_lookup_root_inode(struct bch_fs *c) +{ + subvol_inum inum = BCACHEFS_ROOT_SUBVOL_INUM; + struct bch_inode_unpacked inode_u; + struct bch_subvolume subvol; + + return bch2_trans_do(c, + bch2_subvolume_get(trans, inum.subvol, true, &subvol) ?: + bch2_inode_find_by_inum_trans(trans, inum, &inode_u)); +} + struct recovery_pass_fn { int (*fn)(struct bch_fs *); unsigned when; @@ -62,255 +227,351 @@ static struct recovery_pass_fn recovery_pass_fns[] = { #undef x }; -static const u8 passes_to_stable_map[] = { -#define x(n, id, ...) [BCH_RECOVERY_PASS_##n] = BCH_RECOVERY_PASS_STABLE_##n, - BCH_RECOVERY_PASSES() -#undef x -}; +static u64 bch2_recovery_passes_match(unsigned flags) +{ + u64 ret = 0; -static enum bch_recovery_pass_stable bch2_recovery_pass_to_stable(enum bch_recovery_pass pass) + for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) + if (recovery_pass_fns[i].when & flags) + ret |= BIT_ULL(i); + return ret; +} + +u64 bch2_fsck_recovery_passes(void) { - return passes_to_stable_map[pass]; + return bch2_recovery_passes_match(PASS_FSCK); } -u64 bch2_recovery_passes_to_stable(u64 v) +static void bch2_run_async_recovery_passes(struct bch_fs *c) { - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(passes_to_stable_map); i++) - if (v & BIT_ULL(i)) - ret |= BIT_ULL(passes_to_stable_map[i]); - return ret; + if (!down_trylock(&c->recovery.run_lock)) + return; + + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_async_recovery_passes)) + goto unlock; + + if (queue_work(system_long_wq, &c->recovery.work)) + return; + + enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes); +unlock: + up(&c->recovery.run_lock); } -u64 bch2_recovery_passes_from_stable(u64 v) +static bool recovery_pass_needs_set(struct bch_fs *c, + enum bch_recovery_pass pass, + enum bch_run_recovery_pass_flags *flags) { - static const u8 map[] = { -#define x(n, id, ...) 
[BCH_RECOVERY_PASS_STABLE_##n] = BCH_RECOVERY_PASS_##n, - BCH_RECOVERY_PASSES() -#undef x - }; + struct bch_fs_recovery *r = &c->recovery; + bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); + bool persistent = !in_recovery || !(*flags & RUN_RECOVERY_PASS_nopersistent); - u64 ret = 0; - for (unsigned i = 0; i < ARRAY_SIZE(map); i++) - if (v & BIT_ULL(i)) - ret |= BIT_ULL(map[i]); - return ret; + if ((*flags & RUN_RECOVERY_PASS_ratelimit) && + !bch2_recovery_pass_want_ratelimit(c, pass)) + *flags &= ~RUN_RECOVERY_PASS_ratelimit; + + /* + * If RUN_RECOVERY_PASS_nopersistent is set, we don't want to do + * anything if the pass has already run: it means we need a prior pass + * to run before we can continue repairing; we don't expect that pass to + * fix the damage we encountered. + * + * Otherwise, we run run_explicit_recovery_pass when we find damage, so + * it should run again even if it's already run: + */ + + if (persistent + ? !(c->sb.recovery_passes_required & BIT_ULL(pass)) + : !((r->passes_to_run|r->passes_complete) & BIT_ULL(pass))) + return true; + + if (!(*flags & RUN_RECOVERY_PASS_ratelimit) && + (r->passes_ratelimiting & BIT_ULL(pass))) + return true; + + return false; } /* * For when we need to rewind recovery passes and run a pass we skipped: */ -static int __bch2_run_explicit_recovery_pass(struct bch_fs *c, - enum bch_recovery_pass pass) +int __bch2_run_explicit_recovery_pass(struct bch_fs *c, + struct printbuf *out, + enum bch_recovery_pass pass, + enum bch_run_recovery_pass_flags flags) { - if (c->curr_recovery_pass == ARRAY_SIZE(recovery_pass_fns)) - return -BCH_ERR_not_in_recovery; + struct bch_fs_recovery *r = &c->recovery; + int ret = 0; - if (c->recovery_passes_complete & BIT_ULL(pass)) - return 0; + lockdep_assert_held(&c->sb_lock); - bool print = !(c->opts.recovery_passes & BIT_ULL(pass)); + bch2_printbuf_make_room(out, 1024); + out->atomic++; - if (pass < BCH_RECOVERY_PASS_set_may_go_rw && - c->curr_recovery_pass >= BCH_RECOVERY_PASS_set_may_go_rw) { - if (print) - bch_info(c, "need recovery pass %s (%u), but already rw", - bch2_recovery_passes[pass], pass); - return -BCH_ERR_cannot_rewind_recovery; - } + unsigned long lockflags; + spin_lock_irqsave(&r->lock, lockflags); - if (print) - bch_info(c, "running explicit recovery pass %s (%u), currently at %s (%u)", - bch2_recovery_passes[pass], pass, - bch2_recovery_passes[c->curr_recovery_pass], c->curr_recovery_pass); + if (!recovery_pass_needs_set(c, pass, &flags)) + goto out; - c->opts.recovery_passes |= BIT_ULL(pass); + bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); + bool rewind = in_recovery && r->curr_pass > pass; + bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit; - if (c->curr_recovery_pass > pass) { - c->next_recovery_pass = pass; - c->recovery_passes_complete &= (1ULL << pass) >> 1; - return -BCH_ERR_restart_recovery; - } else { - return 0; + if (!(in_recovery && (flags & RUN_RECOVERY_PASS_nopersistent))) { + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); } -} -int bch2_run_explicit_recovery_pass(struct bch_fs *c, - enum bch_recovery_pass pass) -{ - unsigned long flags; - spin_lock_irqsave(&c->recovery_pass_lock, flags); - int ret = __bch2_run_explicit_recovery_pass(c, pass); - spin_unlock_irqrestore(&c->recovery_pass_lock, flags); - return ret; -} + if (pass < BCH_RECOVERY_PASS_set_may_go_rw && + (!in_recovery || r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw)) { +
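+		/*
+		 * Passes ordered before set_may_go_rw must run before the
+		 * filesystem goes read-write; once rw, recovery can no longer
+		 * rewind to them:
+		 */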
prt_printf(out, "need recovery pass %s (%u), but already rw\n", + bch2_recovery_passes[pass], pass); + ret = -BCH_ERR_cannot_rewind_recovery; + goto out; + } -int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *c, - enum bch_recovery_pass pass) -{ - lockdep_assert_held(&c->sb_lock); + if (ratelimit) + r->passes_ratelimiting |= BIT_ULL(pass); + else + r->passes_ratelimiting &= ~BIT_ULL(pass); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - __set_bit_le64(bch2_recovery_pass_to_stable(pass), ext->recovery_passes_required); + if (in_recovery && !ratelimit) { + prt_printf(out, "running recovery pass %s (%u), currently at %s (%u)%s\n", + bch2_recovery_passes[pass], pass, + bch2_recovery_passes[r->curr_pass], r->curr_pass, + rewind ? " - rewinding" : ""); - return bch2_run_explicit_recovery_pass(c, pass); -} + r->passes_to_run |= BIT_ULL(pass); -int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *c, - enum bch_recovery_pass pass) -{ - enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass); - - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + if (rewind) { + r->next_pass = pass; + r->passes_complete &= (1ULL << pass) >> 1; + ret = -BCH_ERR_restart_recovery; + } + } else { + prt_printf(out, "scheduling recovery pass %s (%u)%s\n", + bch2_recovery_passes[pass], pass, + ratelimit ? " - ratelimiting" : ""); - if (!test_bit_le64(s, ext->recovery_passes_required)) { - __set_bit_le64(s, ext->recovery_passes_required); - bch2_write_super(c); + struct recovery_pass_fn *p = recovery_pass_fns + pass; + if (p->when & PASS_ONLINE) + bch2_run_async_recovery_passes(c); } - mutex_unlock(&c->sb_lock); - - return bch2_run_explicit_recovery_pass(c, pass); +out: + spin_unlock_irqrestore(&r->lock, lockflags); + --out->atomic; + return ret; } -static void bch2_clear_recovery_pass_required(struct bch_fs *c, - enum bch_recovery_pass pass) +int bch2_run_explicit_recovery_pass(struct bch_fs *c, + struct printbuf *out, + enum bch_recovery_pass pass, + enum bch_run_recovery_pass_flags flags) { - enum bch_recovery_pass_stable s = bch2_recovery_pass_to_stable(pass); + int ret = 0; - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + scoped_guard(mutex, &c->sb_lock) { + if (!recovery_pass_needs_set(c, pass, &flags)) + return 0; - if (test_bit_le64(s, ext->recovery_passes_required)) { - __clear_bit_le64(s, ext->recovery_passes_required); + ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); bch2_write_super(c); } - mutex_unlock(&c->sb_lock); -} -u64 bch2_fsck_recovery_passes(void) -{ - u64 ret = 0; - - for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) - if (recovery_pass_fns[i].when & PASS_FSCK) - ret |= BIT_ULL(i); return ret; } -static bool should_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) +int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { - struct recovery_pass_fn *p = recovery_pass_fns + pass; + enum bch_run_recovery_pass_flags flags = RUN_RECOVERY_PASS_nopersistent; - if (c->opts.recovery_passes_exclude & BIT_ULL(pass)) - return false; - if (c->opts.recovery_passes & BIT_ULL(pass)) - return true; - if ((p->when & PASS_FSCK) && c->opts.fsck) - return true; - if ((p->when & PASS_UNCLEAN) && !c->sb.clean) - return true; - if (p->when & PASS_ALWAYS) - return true; - return false; + if (!recovery_pass_needs_set(c, pass, &flags)) + return 0; + + struct printbuf buf = PRINTBUF; + 
bch2_log_msg_start(c, &buf); + + mutex_lock(&c->sb_lock); + int ret = __bch2_run_explicit_recovery_pass(c, &buf, pass, + RUN_RECOVERY_PASS_nopersistent); + mutex_unlock(&c->sb_lock); + + bch2_print_str(c, KERN_NOTICE, buf.buf); + printbuf_exit(&buf); + return ret; } static int bch2_run_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { + struct bch_fs_recovery *r = &c->recovery; struct recovery_pass_fn *p = recovery_pass_fns + pass; - int ret; if (!(p->when & PASS_SILENT)) bch2_print(c, KERN_INFO bch2_log_msg(c, "%s..."), bch2_recovery_passes[pass]); - ret = p->fn(c); - if (ret) + + s64 start_time = ktime_get_real_seconds(); + int ret = p->fn(c); + + r->passes_to_run &= ~BIT_ULL(pass); + + if (ret) { + r->passes_failing |= BIT_ULL(pass); return ret; + } + + r->passes_failing = 0; + + if (!test_bit(BCH_FS_error, &c->flags)) + bch2_sb_recovery_pass_complete(c, pass, start_time); + if (!(p->when & PASS_SILENT)) bch2_print(c, KERN_CONT " done\n"); return 0; } -int bch2_run_online_recovery_passes(struct bch_fs *c) +static int __bch2_run_recovery_passes(struct bch_fs *c, u64 orig_passes_to_run, + bool online) { + struct bch_fs_recovery *r = &c->recovery; int ret = 0; - down_read(&c->state_lock); + spin_lock_irq(&r->lock); - for (unsigned i = 0; i < ARRAY_SIZE(recovery_pass_fns); i++) { - struct recovery_pass_fn *p = recovery_pass_fns + i; + if (online) + orig_passes_to_run &= bch2_recovery_passes_match(PASS_ONLINE); - if (!(p->when & PASS_ONLINE)) - continue; + if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) + orig_passes_to_run &= ~bch2_recovery_passes_match(PASS_ALLOC); - ret = bch2_run_recovery_pass(c, i); - if (bch2_err_matches(ret, BCH_ERR_restart_recovery)) { - i = c->curr_recovery_pass; - continue; + /* + * A failed recovery pass will be retried after another pass succeeds - + * but not this iteration. + * + * This is because some passes depend on repair done by other passes: we + * may want to retry, but we don't want to loop on failing passes. 
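+	 *
+	 * E.g.: if check_extents fails, it's masked out of this run (just
+	 * below) via passes_failing; once a later pass succeeds,
+	 * passes_failing is cleared in bch2_run_recovery_pass() and
+	 * check_extents becomes eligible to be retried.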
+ */ + + orig_passes_to_run &= ~r->passes_failing; + + r->passes_to_run = orig_passes_to_run; + + while (r->passes_to_run) { + unsigned prev_done = r->pass_done; + unsigned pass = __ffs64(r->passes_to_run); + r->curr_pass = pass; + r->next_pass = r->curr_pass + 1; + r->passes_to_run &= ~BIT_ULL(pass); + + spin_unlock_irq(&r->lock); + + int ret2 = bch2_run_recovery_pass(c, pass) ?: + bch2_journal_flush(&c->journal); + + spin_lock_irq(&r->lock); + + if (r->next_pass < r->curr_pass) { + /* Rewind: */ + r->passes_to_run |= orig_passes_to_run & (~0ULL << r->next_pass); + } else if (!ret2) { + r->pass_done = max(r->pass_done, pass); + r->passes_complete |= BIT_ULL(pass); + } else { + ret = ret2; } - if (ret) + + if (ret && !online) break; + + if (prev_done <= BCH_RECOVERY_PASS_check_snapshots && + r->pass_done > BCH_RECOVERY_PASS_check_snapshots) { + bch2_copygc_wakeup(c); + bch2_rebalance_wakeup(c); + } } - up_read(&c->state_lock); + clear_bit(BCH_FS_in_recovery, &c->flags); + spin_unlock_irq(&r->lock); return ret; } -int bch2_run_recovery_passes(struct bch_fs *c) +static void bch2_async_recovery_passes_work(struct work_struct *work) { - int ret = 0; + struct bch_fs *c = container_of(work, struct bch_fs, recovery.work); + struct bch_fs_recovery *r = &c->recovery; + + __bch2_run_recovery_passes(c, + c->sb.recovery_passes_required & ~r->passes_ratelimiting, + true); + + up(&r->run_lock); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_async_recovery_passes); +} + +int bch2_run_online_recovery_passes(struct bch_fs *c, u64 passes) +{ + return __bch2_run_recovery_passes(c, c->sb.recovery_passes_required|passes, true); +} + +int bch2_run_recovery_passes(struct bch_fs *c, enum bch_recovery_pass from) +{ + u64 passes = + bch2_recovery_passes_match(PASS_ALWAYS) | + (!c->sb.clean ? bch2_recovery_passes_match(PASS_UNCLEAN) : 0) | + (c->opts.fsck ? bch2_recovery_passes_match(PASS_FSCK) : 0) | + c->opts.recovery_passes | + c->sb.recovery_passes_required; + + if (c->opts.recovery_pass_last) + passes &= BIT_ULL(c->opts.recovery_pass_last + 1) - 1; /* * We can't allow set_may_go_rw to be excluded; that would cause us to * use the journal replay keys for updates where it's not expected. 
*/ c->opts.recovery_passes_exclude &= ~BCH_RECOVERY_PASS_set_may_go_rw; + passes &= ~c->opts.recovery_passes_exclude; - while (c->curr_recovery_pass < ARRAY_SIZE(recovery_pass_fns) && !ret) { - c->next_recovery_pass = c->curr_recovery_pass + 1; + passes &= ~(BIT_ULL(from) - 1); - spin_lock_irq(&c->recovery_pass_lock); - unsigned pass = c->curr_recovery_pass; - - if (c->opts.recovery_pass_last && - c->curr_recovery_pass > c->opts.recovery_pass_last) { - spin_unlock_irq(&c->recovery_pass_lock); - break; - } + down(&c->recovery.run_lock); + int ret = __bch2_run_recovery_passes(c, passes, false); + up(&c->recovery.run_lock); - if (!should_run_recovery_pass(c, pass)) { - c->curr_recovery_pass++; - c->recovery_pass_done = max(c->recovery_pass_done, pass); - spin_unlock_irq(&c->recovery_pass_lock); - continue; - } - spin_unlock_irq(&c->recovery_pass_lock); + return ret; +} - ret = bch2_run_recovery_pass(c, pass) ?: - bch2_journal_flush(&c->journal); +static void prt_passes(struct printbuf *out, const char *msg, u64 passes) +{ + prt_printf(out, "%s:\t", msg); + prt_bitflags(out, bch2_recovery_passes, passes); + prt_newline(out); +} - if (!ret && !test_bit(BCH_FS_error, &c->flags)) - bch2_clear_recovery_pass_required(c, pass); - - spin_lock_irq(&c->recovery_pass_lock); - if (c->next_recovery_pass < c->curr_recovery_pass) { - /* - * bch2_run_explicit_recovery_pass() was called: we - * can't always catch -BCH_ERR_restart_recovery because - * it may have been called from another thread (btree - * node read completion) - */ - ret = 0; - c->recovery_passes_complete &= ~(~0ULL << c->curr_recovery_pass); - } else { - c->recovery_passes_complete |= BIT_ULL(pass); - c->recovery_pass_done = max(c->recovery_pass_done, pass); - } - c->curr_recovery_pass = c->next_recovery_pass; - spin_unlock_irq(&c->recovery_pass_lock); +void bch2_recovery_pass_status_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct bch_fs_recovery *r = &c->recovery; + + printbuf_tabstop_push(out, 32); + prt_passes(out, "Scheduled passes", c->sb.recovery_passes_required); + prt_passes(out, "Scheduled online passes", c->sb.recovery_passes_required & + bch2_recovery_passes_match(PASS_ONLINE)); + prt_passes(out, "Complete passes", r->passes_complete); + prt_passes(out, "Failing passes", r->passes_failing); + + if (r->curr_pass) { + prt_printf(out, "Current pass:\t%s\n", bch2_recovery_passes[r->curr_pass]); + prt_passes(out, "Current passes", r->passes_to_run); } +} - return ret; +void bch2_fs_recovery_passes_init(struct bch_fs *c) +{ + spin_lock_init(&c->recovery.lock); + sema_init(&c->recovery.run_lock, 1); + + INIT_WORK(&c->recovery.work, bch2_async_recovery_passes_work); } diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h index 7d7339c8fa29..dc0d2014ff9b 100644 --- a/fs/bcachefs/recovery_passes.h +++ b/fs/bcachefs/recovery_passes.h @@ -3,16 +3,32 @@ extern const char * const bch2_recovery_passes[]; +extern const struct bch_sb_field_ops bch_sb_field_ops_recovery_passes; + u64 bch2_recovery_passes_to_stable(u64 v); u64 bch2_recovery_passes_from_stable(u64 v); u64 bch2_fsck_recovery_passes(void); -int bch2_run_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); -int bch2_run_explicit_recovery_pass_persistent_locked(struct bch_fs *, enum bch_recovery_pass); -int bch2_run_explicit_recovery_pass_persistent(struct bch_fs *, enum bch_recovery_pass); +enum bch_run_recovery_pass_flags { + RUN_RECOVERY_PASS_nopersistent = BIT(0), + RUN_RECOVERY_PASS_ratelimit = BIT(1), +}; + +int 
bch2_run_print_explicit_recovery_pass(struct bch_fs *, enum bch_recovery_pass); + +int __bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *, + enum bch_recovery_pass, + enum bch_run_recovery_pass_flags); +int bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *, + enum bch_recovery_pass, + enum bch_run_recovery_pass_flags); + +int bch2_run_online_recovery_passes(struct bch_fs *, u64); +int bch2_run_recovery_passes(struct bch_fs *, enum bch_recovery_pass); + +void bch2_recovery_pass_status_to_text(struct printbuf *, struct bch_fs *); -int bch2_run_online_recovery_passes(struct bch_fs *); -int bch2_run_recovery_passes(struct bch_fs *); +void bch2_fs_recovery_passes_init(struct bch_fs *); #endif /* _BCACHEFS_RECOVERY_PASSES_H */ diff --git a/fs/bcachefs/recovery_passes_format.h b/fs/bcachefs/recovery_passes_format.h new file mode 100644 index 000000000000..c434eafbca19 --- /dev/null +++ b/fs/bcachefs/recovery_passes_format.h @@ -0,0 +1,104 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_RECOVERY_PASSES_FORMAT_H +#define _BCACHEFS_RECOVERY_PASSES_FORMAT_H + +#define PASS_SILENT BIT(0) +#define PASS_FSCK BIT(1) +#define PASS_UNCLEAN BIT(2) +#define PASS_ALWAYS BIT(3) +#define PASS_ONLINE BIT(4) +#define PASS_ALLOC BIT(5) +#define PASS_FSCK_ALLOC (PASS_FSCK|PASS_ALLOC) + +#ifdef CONFIG_BCACHEFS_DEBUG +#define PASS_FSCK_DEBUG BIT(1) +#else +#define PASS_FSCK_DEBUG 0 +#endif + +/* + * Passes may be reordered, but the second field is a persistent identifier and + * must never change: + */ +#define BCH_RECOVERY_PASSES() \ + x(recovery_pass_empty, 41, PASS_SILENT) \ + x(scan_for_btree_nodes, 37, 0) \ + x(check_topology, 4, 0) \ + x(accounting_read, 39, PASS_ALWAYS) \ + x(alloc_read, 0, PASS_ALWAYS) \ + x(stripes_read, 1, 0) \ + x(initialize_subvolumes, 2, 0) \ + x(snapshots_read, 3, PASS_ALWAYS) \ + x(check_allocations, 5, PASS_FSCK_ALLOC) \ + x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \ + x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT|PASS_ALLOC) \ + x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ + x(journal_replay, 9, PASS_ALWAYS) \ + x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK_ALLOC) \ + x(check_lrus, 11, PASS_ONLINE|PASS_FSCK_ALLOC) \ + x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK_ALLOC) \ + x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \ + x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK_ALLOC) \ + x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK_ALLOC) \ + x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ + x(bucket_gens_init, 17, 0) \ + x(reconstruct_snapshots, 38, 0) \ + x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ + x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ + x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ + x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ + x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ + x(fs_upgrade_for_subvolumes, 22, 0) \ + x(check_inodes, 24, PASS_FSCK) \ + x(check_extents, 25, PASS_FSCK) \ + x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \ + x(check_dirents, 27, PASS_FSCK) \ + x(check_xattrs, 28, PASS_FSCK) \ + x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ + x(check_unreachable_inodes, 40, PASS_FSCK) \ + x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ + x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ + x(check_nlinks, 31, PASS_FSCK) \ + x(check_rebalance_work, 43, PASS_ONLINE|PASS_FSCK) \ + x(resume_logged_ops, 23, PASS_ALWAYS) \ + x(delete_dead_inodes, 32, PASS_ALWAYS) \ + x(fix_reflink_p, 33, 0) \ + 
x(set_fs_needs_rebalance, 34, 0) \ + x(lookup_root_inode, 42, PASS_ALWAYS|PASS_SILENT) + +/* We normally enumerate recovery passes in the order we run them: */ +enum bch_recovery_pass { +#define x(n, id, when) BCH_RECOVERY_PASS_##n, + BCH_RECOVERY_PASSES() +#undef x + BCH_RECOVERY_PASS_NR +}; + +/* But we also need stable identifiers that can be used in the superblock */ +enum bch_recovery_pass_stable { +#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id, + BCH_RECOVERY_PASSES() +#undef x +}; + +struct recovery_pass_entry { + __le64 last_run; + __le32 last_runtime; + __le32 flags; +}; + +struct bch_sb_field_recovery_passes { + struct bch_sb_field field; + struct recovery_pass_entry start[]; +}; + +static inline unsigned +recovery_passes_nr_entries(struct bch_sb_field_recovery_passes *r) +{ + return r + ? ((vstruct_end(&r->field) - (void *) &r->start[0]) / + sizeof(struct recovery_pass_entry)) + : 0; +} + +#endif /* _BCACHEFS_RECOVERY_PASSES_FORMAT_H */ diff --git a/fs/bcachefs/recovery_passes_types.h b/fs/bcachefs/recovery_passes_types.h index 418557960ed6..aa9526938cc3 100644 --- a/fs/bcachefs/recovery_passes_types.h +++ b/fs/bcachefs/recovery_passes_types.h @@ -2,79 +2,26 @@ #ifndef _BCACHEFS_RECOVERY_PASSES_TYPES_H #define _BCACHEFS_RECOVERY_PASSES_TYPES_H -#define PASS_SILENT BIT(0) -#define PASS_FSCK BIT(1) -#define PASS_UNCLEAN BIT(2) -#define PASS_ALWAYS BIT(3) -#define PASS_ONLINE BIT(4) - -#ifdef CONFIG_BCACHEFS_DEBUG -#define PASS_FSCK_DEBUG BIT(1) -#else -#define PASS_FSCK_DEBUG 0 -#endif - -/* - * Passes may be reordered, but the second field is a persistent identifier and - * must never change: - */ -#define BCH_RECOVERY_PASSES() \ - x(recovery_pass_empty, 41, PASS_SILENT) \ - x(scan_for_btree_nodes, 37, 0) \ - x(check_topology, 4, 0) \ - x(accounting_read, 39, PASS_ALWAYS) \ - x(alloc_read, 0, PASS_ALWAYS) \ - x(stripes_read, 1, PASS_ALWAYS) \ - x(initialize_subvolumes, 2, 0) \ - x(snapshots_read, 3, PASS_ALWAYS) \ - x(check_allocations, 5, PASS_FSCK) \ - x(trans_mark_dev_sbs, 6, PASS_ALWAYS|PASS_SILENT) \ - x(fs_journal_alloc, 7, PASS_ALWAYS|PASS_SILENT) \ - x(set_may_go_rw, 8, PASS_ALWAYS|PASS_SILENT) \ - x(journal_replay, 9, PASS_ALWAYS) \ - x(check_alloc_info, 10, PASS_ONLINE|PASS_FSCK) \ - x(check_lrus, 11, PASS_ONLINE|PASS_FSCK) \ - x(check_btree_backpointers, 12, PASS_ONLINE|PASS_FSCK) \ - x(check_backpointers_to_extents, 13, PASS_ONLINE|PASS_FSCK_DEBUG) \ - x(check_extents_to_backpointers, 14, PASS_ONLINE|PASS_FSCK) \ - x(check_alloc_to_lru_refs, 15, PASS_ONLINE|PASS_FSCK) \ - x(fs_freespace_init, 16, PASS_ALWAYS|PASS_SILENT) \ - x(bucket_gens_init, 17, 0) \ - x(reconstruct_snapshots, 38, 0) \ - x(check_snapshot_trees, 18, PASS_ONLINE|PASS_FSCK) \ - x(check_snapshots, 19, PASS_ONLINE|PASS_FSCK) \ - x(check_subvols, 20, PASS_ONLINE|PASS_FSCK) \ - x(check_subvol_children, 35, PASS_ONLINE|PASS_FSCK) \ - x(delete_dead_snapshots, 21, PASS_ONLINE|PASS_FSCK) \ - x(fs_upgrade_for_subvolumes, 22, 0) \ - x(check_inodes, 24, PASS_FSCK) \ - x(check_extents, 25, PASS_FSCK) \ - x(check_indirect_extents, 26, PASS_ONLINE|PASS_FSCK) \ - x(check_dirents, 27, PASS_FSCK) \ - x(check_xattrs, 28, PASS_FSCK) \ - x(check_root, 29, PASS_ONLINE|PASS_FSCK) \ - x(check_unreachable_inodes, 40, PASS_FSCK) \ - x(check_subvolume_structure, 36, PASS_ONLINE|PASS_FSCK) \ - x(check_directory_structure, 30, PASS_ONLINE|PASS_FSCK) \ - x(check_nlinks, 31, PASS_FSCK) \ - x(resume_logged_ops, 23, PASS_ALWAYS) \ - x(delete_dead_inodes, 32, PASS_ALWAYS) \ - x(fix_reflink_p, 33, 0) \ - 
x(set_fs_needs_rebalance, 34, 0) - -/* We normally enumerate recovery passes in the order we run them: */ -enum bch_recovery_pass { -#define x(n, id, when) BCH_RECOVERY_PASS_##n, - BCH_RECOVERY_PASSES() -#undef x - BCH_RECOVERY_PASS_NR -}; - -/* But we also need stable identifiers that can be used in the superblock */ -enum bch_recovery_pass_stable { -#define x(n, id, when) BCH_RECOVERY_PASS_STABLE_##n = id, - BCH_RECOVERY_PASSES() -#undef x +struct bch_fs_recovery { + /* + * Two different uses: + * "Has this fsck pass run yet?" - i.e. should this type of error + * trigger an emergency read-only? + * And, in certain situations fsck will rewind to an earlier pass: used + * for signaling to the toplevel code which pass we want to run now. + */ + enum bch_recovery_pass curr_pass; + enum bch_recovery_pass next_pass; + /* never rewinds version of curr_pass */ + enum bch_recovery_pass pass_done; + u64 passes_to_run; + /* bitmask of recovery passes that we actually ran */ + u64 passes_complete; + u64 passes_failing; + u64 passes_ratelimiting; + spinlock_t lock; + struct semaphore run_lock; + struct work_struct work; }; #endif /* _BCACHEFS_RECOVERY_PASSES_TYPES_H */ diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 441e648f28b5..3a13dbcab6ba 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -3,6 +3,7 @@ #include "bkey_buf.h" #include "btree_update.h" #include "buckets.h" +#include "enumerated_ref.h" #include "error.h" #include "extents.h" #include "inode.h" @@ -185,12 +186,21 @@ static int bch2_indirect_extent_missing_error(struct btree_trans *trans, BUG_ON(missing_start < refd_start); BUG_ON(missing_end > refd_end); - if (fsck_err(trans, reflink_p_to_missing_reflink_v, - "pointer to missing indirect extent\n" - " %s\n" - " missing range %llu-%llu", - (bch2_bkey_val_to_text(&buf, c, p.s_c), buf.buf), - missing_start, missing_end)) { + struct bpos missing_pos = bkey_start_pos(p.k); + missing_pos.offset += missing_start - live_start; + + prt_printf(&buf, "pointer to missing indirect extent in "); + ret = bch2_inum_snap_offset_err_msg_trans(trans, &buf, missing_pos); + if (ret) + goto err; + + prt_printf(&buf, "-%llu\n", (missing_pos.offset + (missing_end - missing_start)) << 9); + bch2_bkey_val_to_text(&buf, c, p.s_c); + + prt_printf(&buf, "\nmissing reflink btree range %llu-%llu", + missing_start, missing_end); + + if (fsck_err(trans, reflink_p_to_missing_reflink_v, "%s", buf.buf)) { struct bkey_i_reflink_p *new = bch2_bkey_make_mut_noupdate_typed(trans, p.s_c, reflink_p); ret = PTR_ERR_OR_ZERO(new); if (ret) @@ -314,10 +324,10 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, __le64 *refcount = bkey_refcount(bkey_i_to_s(new)); if (!*refcount && (flags & BTREE_TRIGGER_overwrite)) { bch2_bkey_val_to_text(&buf, c, p.s_c); - prt_printf(&buf, "\n "); + prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, k); log_fsck_err(trans, reflink_refcount_underflow, - "indirect extent refcount underflow while marking\n %s", + "indirect extent refcount underflow while marking\n%s", buf.buf); goto next; } @@ -486,7 +496,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, bool reflink_p_may_update_opts_field) { struct bch_fs *c = trans->c; - struct btree_iter reflink_iter = { NULL }; + struct btree_iter reflink_iter = {}; struct bkey_s_c k; struct bkey_i *r_v; struct bkey_i_reflink_p *r_p; @@ -498,7 +508,7 @@ static int bch2_make_extent_indirect(struct btree_trans *trans, bch2_trans_iter_init(trans, &reflink_iter, BTREE_ID_reflink, POS_MAX, BTREE_ITER_intent); -
k = bch2_btree_iter_peek_prev(&reflink_iter); + k = bch2_btree_iter_peek_prev(trans, &reflink_iter); ret = bkey_err(k); if (ret) goto err; @@ -560,12 +570,13 @@ err: return ret; } -static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) +static struct bkey_s_c get_next_src(struct btree_trans *trans, + struct btree_iter *iter, struct bpos end) { struct bkey_s_c k; int ret; - for_each_btree_key_max_continue_norestart(*iter, end, 0, k, ret) { + for_each_btree_key_max_continue_norestart(trans, *iter, end, 0, k, ret) { if (bkey_extent_is_unwritten(k)) continue; @@ -574,7 +585,7 @@ static struct bkey_s_c get_next_src(struct btree_iter *iter, struct bpos end) } if (bkey_ge(iter->pos, end)) - bch2_btree_iter_set_pos(iter, end); + bch2_btree_iter_set_pos(trans, iter, end); return ret ? bkey_s_c_err(ret) : bkey_s_c_null; } @@ -597,10 +608,10 @@ s64 bch2_remap_range(struct bch_fs *c, u64 dst_done = 0; u32 dst_snapshot, src_snapshot; bool reflink_p_may_update_opts_field = - bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts); + !bch2_request_incompat_feature(c, bcachefs_metadata_version_reflink_p_may_update_opts); int ret = 0, ret2 = 0; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_reflink)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_reflink)) return -BCH_ERR_erofs_no_writes; bch2_check_set_feature(c, BCH_FEATURE_reflink); @@ -638,27 +649,27 @@ s64 bch2_remap_range(struct bch_fs *c, if (ret) continue; - bch2_btree_iter_set_snapshot(&src_iter, src_snapshot); + bch2_btree_iter_set_snapshot(trans, &src_iter, src_snapshot); ret = bch2_subvolume_get_snapshot(trans, dst_inum.subvol, &dst_snapshot); if (ret) continue; - bch2_btree_iter_set_snapshot(&dst_iter, dst_snapshot); + bch2_btree_iter_set_snapshot(trans, &dst_iter, dst_snapshot); if (dst_inum.inum < src_inum.inum) { /* Avoid some lock cycle transaction restarts */ - ret = bch2_btree_iter_traverse(&dst_iter); + ret = bch2_btree_iter_traverse(trans, &dst_iter); if (ret) continue; } dst_done = dst_iter.pos.offset - dst_start.offset; src_want = POS(src_start.inode, src_start.offset + dst_done); - bch2_btree_iter_set_pos(&src_iter, src_want); + bch2_btree_iter_set_pos(trans, &src_iter, src_want); - src_k = get_next_src(&src_iter, src_end); + src_k = get_next_src(trans, &src_iter, src_end); ret = bkey_err(src_k); if (ret) continue; @@ -729,7 +740,7 @@ s64 bch2_remap_range(struct bch_fs *c, do { struct bch_inode_unpacked inode_u; - struct btree_iter inode_iter = { NULL }; + struct btree_iter inode_iter = {}; bch2_trans_begin(trans); @@ -751,7 +762,7 @@ err: bch2_bkey_buf_exit(&new_src, c); bch2_bkey_buf_exit(&new_dst, c); - bch2_write_ref_put(c, BCH_WRITE_REF_reflink); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_reflink); return dst_done ?: ret ?: ret2; } @@ -786,8 +797,8 @@ static int bch2_gc_write_reflink_key(struct btree_trans *trans, if (fsck_err_on(r->refcount != le64_to_cpu(*refcount), trans, reflink_v_refcount_wrong, "reflink key has wrong refcount:\n" - " %s\n" - " should be %u", + "%s\n" + "should be %u", (bch2_bkey_val_to_text(&buf, c, k), buf.buf), r->refcount)) { struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); diff --git a/fs/bcachefs/sb-counters.c b/fs/bcachefs/sb-counters.c index 6992e7469112..2b4b8445d418 100644 --- a/fs/bcachefs/sb-counters.c +++ b/fs/bcachefs/sb-counters.c @@ -5,7 +5,13 @@ /* BCH_SB_FIELD_counters */ -static const char * const bch2_counter_names[] = { +static const u8 counters_to_stable_map[] = { +#define x(n, id, ...) 
[BCH_COUNTER_##n] = BCH_COUNTER_STABLE_##n, + BCH_PERSISTENT_COUNTERS() +#undef x +}; + +const char * const bch2_counter_names[] = { #define x(t, n, ...) (#t), BCH_PERSISTENT_COUNTERS() #undef x @@ -18,13 +24,13 @@ static size_t bch2_sb_counter_nr_entries(struct bch_sb_field_counters *ctrs) return 0; return (__le64 *) vstruct_end(&ctrs->field) - &ctrs->d[0]; -}; +} static int bch2_sb_counters_validate(struct bch_sb *sb, struct bch_sb_field *f, enum bch_validate_flags flags, struct printbuf *err) { return 0; -}; +} static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field *f) @@ -32,50 +38,56 @@ static void bch2_sb_counters_to_text(struct printbuf *out, struct bch_sb *sb, struct bch_sb_field_counters *ctrs = field_to_type(f, counters); unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - for (unsigned i = 0; i < nr; i++) - prt_printf(out, "%s \t%llu\n", - i < BCH_COUNTER_NR ? bch2_counter_names[i] : "(unknown)", - le64_to_cpu(ctrs->d[i])); -}; + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { + unsigned stable = counters_to_stable_map[i]; + if (stable < nr) + prt_printf(out, "%s \t%llu\n", + bch2_counter_names[i], + le64_to_cpu(ctrs->d[stable])); + } +} int bch2_sb_counters_to_cpu(struct bch_fs *c) { struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); - unsigned int i; unsigned int nr = bch2_sb_counter_nr_entries(ctrs); - u64 val = 0; - for (i = 0; i < BCH_COUNTER_NR; i++) + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) c->counters_on_mount[i] = 0; - for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) { - val = le64_to_cpu(ctrs->d[i]); - percpu_u64_set(&c->counters[i], val); - c->counters_on_mount[i] = val; + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { + unsigned stable = counters_to_stable_map[i]; + if (stable < nr) { + u64 v = le64_to_cpu(ctrs->d[stable]); + percpu_u64_set(&c->counters[i], v); + c->counters_on_mount[i] = v; + } } + return 0; -}; +} int bch2_sb_counters_from_cpu(struct bch_fs *c) { struct bch_sb_field_counters *ctrs = bch2_sb_field_get(c->disk_sb.sb, counters); struct bch_sb_field_counters *ret; - unsigned int i; unsigned int nr = bch2_sb_counter_nr_entries(ctrs); if (nr < BCH_COUNTER_NR) { ret = bch2_sb_field_resize(&c->disk_sb, counters, - sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); - + sizeof(*ctrs) / sizeof(u64) + BCH_COUNTER_NR); if (ret) { ctrs = ret; nr = bch2_sb_counter_nr_entries(ctrs); } } + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { + unsigned stable = counters_to_stable_map[i]; + if (stable < nr) + ctrs->d[stable] = cpu_to_le64(percpu_u64_get(&c->counters[i])); + } - for (i = 0; i < min_t(unsigned int, nr, BCH_COUNTER_NR); i++) - ctrs->d[i] = cpu_to_le64(percpu_u64_get(&c->counters[i])); return 0; } @@ -97,3 +109,39 @@ const struct bch_sb_field_ops bch_sb_field_ops_counters = { .validate = bch2_sb_counters_validate, .to_text = bch2_sb_counters_to_text, }; + +#ifndef NO_BCACHEFS_CHARDEV +long bch2_ioctl_query_counters(struct bch_fs *c, + struct bch_ioctl_query_counters __user *user_arg) +{ + struct bch_ioctl_query_counters arg; + int ret = copy_from_user_errcode(&arg, user_arg, sizeof(arg)); + if (ret) + return ret; + + if ((arg.flags & ~BCH_IOCTL_QUERY_COUNTERS_MOUNT) || + arg.pad) + return -EINVAL; + + arg.nr = min(arg.nr, BCH_COUNTER_NR); + ret = put_user(arg.nr, &user_arg->nr); + if (ret) + return ret; + + for (unsigned i = 0; i < BCH_COUNTER_NR; i++) { + unsigned stable = counters_to_stable_map[i]; + + if (stable < arg.nr) { + u64 v = !(arg.flags & 
BCH_IOCTL_QUERY_COUNTERS_MOUNT) + ? percpu_u64_get(&c->counters[i]) + : c->counters_on_mount[i]; + + ret = put_user(v, &user_arg->d[stable]); + if (ret) + return ret; + } + } + + return 0; +} +#endif diff --git a/fs/bcachefs/sb-counters.h b/fs/bcachefs/sb-counters.h index 81f8aec9fcb1..a4329ad8dd1b 100644 --- a/fs/bcachefs/sb-counters.h +++ b/fs/bcachefs/sb-counters.h @@ -11,6 +11,10 @@ int bch2_sb_counters_from_cpu(struct bch_fs *); void bch2_fs_counters_exit(struct bch_fs *); int bch2_fs_counters_init(struct bch_fs *); +extern const char * const bch2_counter_names[]; extern const struct bch_sb_field_ops bch_sb_field_ops_counters; +long bch2_ioctl_query_counters(struct bch_fs *, + struct bch_ioctl_query_counters __user *); + #endif // _BCACHEFS_SB_COUNTERS_H diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h index fdcf598f08b1..7c0c9c842b4e 100644 --- a/fs/bcachefs/sb-counters_format.h +++ b/fs/bcachefs/sb-counters_format.h @@ -9,10 +9,26 @@ enum counters_flags { #define BCH_PERSISTENT_COUNTERS() \ x(io_read, 0, TYPE_SECTORS) \ + x(io_read_inline, 80, TYPE_SECTORS) \ + x(io_read_hole, 81, TYPE_SECTORS) \ + x(io_read_promote, 30, TYPE_COUNTER) \ + x(io_read_bounce, 31, TYPE_COUNTER) \ + x(io_read_split, 33, TYPE_COUNTER) \ + x(io_read_reuse_race, 34, TYPE_COUNTER) \ + x(io_read_retry, 32, TYPE_COUNTER) \ + x(io_read_fail_and_poison, 82, TYPE_COUNTER) \ x(io_write, 1, TYPE_SECTORS) \ x(io_move, 2, TYPE_SECTORS) \ + x(io_move_read, 35, TYPE_SECTORS) \ + x(io_move_write, 36, TYPE_SECTORS) \ + x(io_move_finish, 37, TYPE_SECTORS) \ + x(io_move_fail, 38, TYPE_COUNTER) \ + x(io_move_write_fail, 82, TYPE_COUNTER) \ + x(io_move_start_fail, 39, TYPE_COUNTER) \ + x(io_move_created_rebalance, 83, TYPE_COUNTER) \ x(bucket_invalidate, 3, TYPE_COUNTER) \ x(bucket_discard, 4, TYPE_COUNTER) \ + x(bucket_discard_fast, 79, TYPE_COUNTER) \ x(bucket_alloc, 5, TYPE_COUNTER) \ x(bucket_alloc_fail, 6, TYPE_COUNTER) \ x(btree_cache_scan, 7, TYPE_COUNTER) \ @@ -38,16 +54,6 @@ enum counters_flags { x(journal_reclaim_finish, 27, TYPE_COUNTER) \ x(journal_reclaim_start, 28, TYPE_COUNTER) \ x(journal_write, 29, TYPE_COUNTER) \ - x(read_promote, 30, TYPE_COUNTER) \ - x(read_bounce, 31, TYPE_COUNTER) \ - x(read_split, 33, TYPE_COUNTER) \ - x(read_retry, 32, TYPE_COUNTER) \ - x(read_reuse_race, 34, TYPE_COUNTER) \ - x(move_extent_read, 35, TYPE_SECTORS) \ - x(move_extent_write, 36, TYPE_SECTORS) \ - x(move_extent_finish, 37, TYPE_SECTORS) \ - x(move_extent_fail, 38, TYPE_COUNTER) \ - x(move_extent_start_fail, 39, TYPE_COUNTER) \ x(copygc, 40, TYPE_COUNTER) \ x(copygc_wait, 41, TYPE_COUNTER) \ x(gc_gens_end, 42, TYPE_COUNTER) \ @@ -95,6 +101,13 @@ enum bch_persistent_counters { BCH_COUNTER_NR }; +enum bch_persistent_counters_stable { +#define x(t, n, ...) BCH_COUNTER_STABLE_##t = n, + BCH_PERSISTENT_COUNTERS() +#undef x + BCH_COUNTER_STABLE_NR +}; + struct bch_sb_field_counters { struct bch_sb_field field; __le64 d[]; diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 35e07bc8fbd3..861fce1630f0 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -20,6 +20,10 @@ * x(version, recovery_passes, errors...) 
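 * e.g. the backpointers entry below,
 *	x(backpointers, RECOVERY_PASS_ALL_FSCK)
 * schedules a full fsck for upgrades that cross the backpointers version.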
*/ #define UPGRADE_TABLE() \ + x(snapshot_2, \ + RECOVERY_PASS_ALL_FSCK, \ + BCH_FSCK_ERR_subvol_root_wrong_bi_subvol, \ + BCH_FSCK_ERR_subvol_not_master_and_not_snapshot) \ x(backpointers, \ RECOVERY_PASS_ALL_FSCK) \ x(inode_v3, \ @@ -91,9 +95,16 @@ BCH_FSCK_ERR_accounting_mismatch, \ BCH_FSCK_ERR_accounting_key_replicas_nr_devs_0, \ BCH_FSCK_ERR_accounting_key_junk_at_end) \ - x(directory_size, \ - BIT_ULL(BCH_RECOVERY_PASS_check_dirents), \ - BCH_FSCK_ERR_directory_size_mismatch) \ + x(cached_backpointers, \ + BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ + BCH_FSCK_ERR_ptr_to_missing_backpointer) \ + x(stripe_backpointers, \ + BIT_ULL(BCH_RECOVERY_PASS_check_extents_to_backpointers),\ + BCH_FSCK_ERR_ptr_to_missing_backpointer) \ + x(inode_has_case_insensitive, \ + BIT_ULL(BCH_RECOVERY_PASS_check_inodes), \ + BCH_FSCK_ERR_inode_has_case_insensitive_not_set, \ + BCH_FSCK_ERR_inode_parent_has_case_insensitive_not_set) #define DOWNGRADE_TABLE() \ x(bucket_stripe_sectors, \ @@ -367,6 +378,9 @@ int bch2_sb_downgrade_update(struct bch_fs *c) if (BCH_VERSION_MAJOR(src->version) != BCH_VERSION_MAJOR(le16_to_cpu(c->disk_sb.sb->version))) continue; + if (src->version < c->sb.version_incompat) + continue; + struct bch_sb_field_downgrade_entry *dst; unsigned bytes = sizeof(*dst) + sizeof(dst->errors[0]) * src->nr_errors; diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index b86ec013d7d7..0bfb151da9cf 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -5,8 +5,7 @@ enum bch_fsck_flags { FSCK_CAN_FIX = 1 << 0, FSCK_CAN_IGNORE = 1 << 1, - FSCK_NO_RATELIMIT = 1 << 2, - FSCK_AUTOFIX = 1 << 3, + FSCK_AUTOFIX = 1 << 2, }; #define BCH_SB_ERRS() \ @@ -47,7 +46,7 @@ enum bch_fsck_flags { x(btree_node_unsupported_version, 34, 0) \ x(btree_node_bset_older_than_sb_min, 35, 0) \ x(btree_node_bset_newer_than_sb, 36, 0) \ - x(btree_node_data_missing, 37, 0) \ + x(btree_node_data_missing, 37, FSCK_AUTOFIX) \ x(btree_node_bset_after_end, 38, 0) \ x(btree_node_replicas_sectors_written_mismatch, 39, 0) \ x(btree_node_replicas_data_mismatch, 40, 0) \ @@ -179,6 +178,7 @@ enum bch_fsck_flags { x(ptr_crc_redundant, 160, 0) \ x(ptr_crc_nonce_mismatch, 162, 0) \ x(ptr_stripe_redundant, 163, 0) \ + x(extent_flags_not_at_start, 306, 0) \ x(reservation_key_nr_replicas_invalid, 164, 0) \ x(reflink_v_refcount_wrong, 165, FSCK_AUTOFIX) \ x(reflink_v_pos_bad, 292, 0) \ @@ -205,10 +205,11 @@ enum bch_fsck_flags { x(snapshot_bad_depth, 184, 0) \ x(snapshot_bad_skiplist, 185, 0) \ x(subvol_pos_bad, 186, 0) \ - x(subvol_not_master_and_not_snapshot, 187, 0) \ + x(subvol_not_master_and_not_snapshot, 187, FSCK_AUTOFIX) \ x(subvol_to_missing_root, 188, 0) \ - x(subvol_root_wrong_bi_subvol, 189, 0) \ + x(subvol_root_wrong_bi_subvol, 189, FSCK_AUTOFIX) \ x(bkey_in_missing_snapshot, 190, 0) \ + x(bkey_in_deleted_snapshot, 315, FSCK_AUTOFIX) \ x(inode_pos_inode_nonzero, 191, 0) \ x(inode_pos_blockdev_range, 192, 0) \ x(inode_alloc_cursor_inode_bad, 301, 0) \ @@ -216,6 +217,7 @@ enum bch_fsck_flags { x(inode_str_hash_invalid, 194, 0) \ x(inode_v3_fields_start_bad, 195, 0) \ x(inode_snapshot_mismatch, 196, 0) \ + x(snapshot_key_missing_inode_snapshot, 314, 0) \ x(inode_unlinked_but_clean, 197, 0) \ x(inode_unlinked_but_nlink_nonzero, 198, 0) \ x(inode_unlinked_and_not_open, 281, 0) \ @@ -236,6 +238,11 @@ enum bch_fsck_flags { x(inode_has_child_snapshots_wrong, 287, 0) \ x(inode_unreachable, 210, FSCK_AUTOFIX) \ x(inode_journal_seq_in_future, 299, FSCK_AUTOFIX) \ + 
x(inode_i_sectors_underflow, 312, FSCK_AUTOFIX) \ + x(inode_has_case_insensitive_not_set, 316, FSCK_AUTOFIX) \ + x(inode_parent_has_case_insensitive_not_set, 317, FSCK_AUTOFIX) \ + x(vfs_inode_i_blocks_underflow, 311, FSCK_AUTOFIX) \ + x(vfs_inode_i_blocks_not_zero_at_truncate, 313, FSCK_AUTOFIX) \ x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ @@ -259,6 +266,7 @@ enum bch_fsck_flags { x(dirent_to_overwritten_inode, 302, 0) \ x(dirent_to_missing_subvol, 230, 0) \ x(dirent_to_itself, 231, 0) \ + x(dirent_casefold_mismatch, 318, FSCK_AUTOFIX) \ x(quota_type_invalid, 232, 0) \ x(xattr_val_size_too_small, 233, 0) \ x(xattr_val_size_too_big, 234, 0) \ @@ -290,14 +298,15 @@ enum bch_fsck_flags { x(btree_node_bkey_bad_u64s, 260, 0) \ x(btree_node_topology_empty_interior_node, 261, 0) \ x(btree_ptr_v2_min_key_bad, 262, 0) \ - x(btree_root_unreadable_and_scan_found_nothing, 263, 0) \ - x(snapshot_node_missing, 264, 0) \ + x(btree_root_unreadable_and_scan_found_nothing, 263, FSCK_AUTOFIX) \ + x(snapshot_node_missing, 264, FSCK_AUTOFIX) \ x(dup_backpointer_to_bad_csum_extent, 265, 0) \ x(btree_bitmap_not_marked, 266, FSCK_AUTOFIX) \ x(sb_clean_entry_overrun, 267, 0) \ x(btree_ptr_v2_written_0, 268, 0) \ x(subvol_snapshot_bad, 269, 0) \ x(subvol_inode_bad, 270, 0) \ + x(subvol_missing, 308, FSCK_AUTOFIX) \ x(alloc_key_stripe_sectors_wrong, 271, FSCK_AUTOFIX) \ x(accounting_mismatch, 272, FSCK_AUTOFIX) \ x(accounting_replicas_not_marked, 273, 0) \ @@ -310,11 +319,16 @@ enum bch_fsck_flags { x(accounting_key_replicas_nr_required_bad, 279, FSCK_AUTOFIX) \ x(accounting_key_replicas_devs_unsorted, 280, FSCK_AUTOFIX) \ x(accounting_key_version_0, 282, FSCK_AUTOFIX) \ + x(accounting_key_nr_counters_wrong, 307, FSCK_AUTOFIX) \ x(logged_op_but_clean, 283, FSCK_AUTOFIX) \ x(compression_opt_not_marked_in_sb, 295, FSCK_AUTOFIX) \ x(compression_type_not_marked_in_sb, 296, FSCK_AUTOFIX) \ x(directory_size_mismatch, 303, FSCK_AUTOFIX) \ - x(MAX, 304, 0) + x(dirent_cf_name_too_big, 304, 0) \ + x(dirent_stray_data_after_cf_name, 305, 0) \ + x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \ + x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \ + x(MAX, 319, 0) enum bch_sb_error_id { #define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 116131f95815..3398906660a5 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -5,19 +5,41 @@ #include "disk_groups.h" #include "error.h" #include "opts.h" +#include "recovery_passes.h" #include "replicas.h" #include "sb-members.h" #include "super-io.h" -void bch2_dev_missing(struct bch_fs *c, unsigned dev) +int bch2_dev_missing_bkey(struct bch_fs *c, struct bkey_s_c k, unsigned dev) +{ + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "pointer to nonexistent device %u in key\n", dev); + bch2_bkey_val_to_text(&buf, c, k); + + bool print = bch2_count_fsck_err(c, ptr_to_invalid_device, &buf); + + int ret = bch2_run_explicit_recovery_pass(c, &buf, + BCH_RECOVERY_PASS_check_allocations, 0); + + if (print) + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + return ret; +} + +void bch2_dev_missing_atomic(struct bch_fs *c, unsigned dev) { if (dev != BCH_SB_MEMBER_INVALID) bch2_fs_inconsistent(c, "pointer to nonexistent device %u", dev); } -void bch2_dev_bucket_missing(struct bch_fs *c, struct bpos bucket) +void bch2_dev_bucket_missing(struct bch_dev *ca, u64 bucket) { - bch2_fs_inconsistent(c, "pointer to nonexistent bucket %llu:%llu", bucket.inode, bucket.offset); + bch2_fs_inconsistent(ca->fs, + "pointer to nonexistent bucket %llu on device %s (valid range %u-%llu)", + bucket, ca->name, ca->mi.first_bucket, ca->mi.nbuckets); } #define x(t, n, ...) [n] = #t, @@ -117,6 +139,11 @@ int bch2_sb_members_cpy_v2_v1(struct bch_sb_handle *disk_sb) struct bch_sb_field_members_v1 *mi1; struct bch_sb_field_members_v2 *mi2; + if (BCH_SB_VERSION_INCOMPAT(disk_sb->sb) > bcachefs_metadata_version_extent_flags) { + bch2_sb_field_resize(disk_sb, members_v1, 0); + return 0; + } + mi1 = bch2_sb_field_resize(disk_sb, members_v1, DIV_ROUND_UP(sizeof(*mi1) + BCH_MEMBER_V1_BYTES * disk_sb->sb->nr_devices, sizeof(u64))); @@ -168,6 +195,12 @@ static int validate_member(struct printbuf *err, return -BCH_ERR_invalid_sb_members; } + if (BCH_MEMBER_FREESPACE_INITIALIZED(&m) && + sb->features[0] & cpu_to_le64(BIT_ULL(BCH_FEATURE_no_alloc_info))) { + prt_printf(err, "device %u: freespace initialized but fs has no alloc info", i); + return -BCH_ERR_invalid_sb_members; + } + return 0; } @@ -189,17 +222,11 @@ static void member_to_text(struct printbuf *out, printbuf_indent_add(out, 2); prt_printf(out, "Label:\t"); - if (BCH_MEMBER_GROUP(&m)) { - unsigned idx = BCH_MEMBER_GROUP(&m) - 1; - - if (idx < disk_groups_nr(gi)) - prt_printf(out, "%s (%u)", - gi->entries[idx].label, idx); - else - prt_printf(out, "(bad disk labels section)"); - } else { + if (BCH_MEMBER_GROUP(&m)) + bch2_disk_path_to_text_sb(out, sb, + BCH_MEMBER_GROUP(&m) - 1); + else prt_printf(out, "(none)"); - } prt_newline(out); prt_printf(out, "UUID:\t"); @@ -266,6 +293,7 @@ static void member_to_text(struct printbuf *out, prt_printf(out, "Discard:\t%llu\n", BCH_MEMBER_DISCARD(&m)); prt_printf(out, "Freespace initialized:\t%llu\n", BCH_MEMBER_FREESPACE_INITIALIZED(&m)); + prt_printf(out, "Resize on mount:\t%llu\n", BCH_MEMBER_RESIZE_ON_MOUNT(&m)); printbuf_indent_sub(out, 2); } @@ -491,6 +519,7 @@ int bch2_sb_member_alloc(struct bch_fs *c) unsigned u64s; int best = -1; u64 best_last_mount = 0; + unsigned nr_deleted = 0; if (dev_idx < BCH_SB_MEMBERS_MAX) goto have_slot; @@ -501,7 +530,10 @@ int bch2_sb_member_alloc(struct bch_fs *c) continue; struct bch_member m = bch2_sb_member_get(c->disk_sb.sb, dev_idx); - 
if (bch2_member_alive(&m)) + + nr_deleted += uuid_equal(&m.uuid, &BCH_SB_MEMBER_DELETED_UUID); + + if (!bch2_is_zero(&m.uuid, sizeof(m.uuid))) continue; u64 last_mount = le64_to_cpu(m.last_mount); @@ -515,6 +547,10 @@ int bch2_sb_member_alloc(struct bch_fs *c) goto have_slot; } + if (nr_deleted) + bch_err(c, "unable to allocate new member, but have %u deleted: run fsck", + nr_deleted); + return -BCH_ERR_ENOSPC_sb_members; have_slot: nr_devices = max_t(unsigned, dev_idx + 1, c->sb.nr_devices); @@ -530,3 +566,22 @@ have_slot: c->disk_sb.sb->nr_devices = nr_devices; return dev_idx; } + +void bch2_sb_members_clean_deleted(struct bch_fs *c) +{ + mutex_lock(&c->sb_lock); + bool write_sb = false; + + for (unsigned i = 0; i < c->sb.nr_devices; i++) { + struct bch_member *m = bch2_members_v2_get_mut(c->disk_sb.sb, i); + + if (uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID)) { + memset(&m->uuid, 0, sizeof(m->uuid)); + write_sb = true; + } + } + + if (write_sb) + bch2_write_super(c); + mutex_unlock(&c->sb_lock); +} diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 762083b564ee..6bd9b86aee5b 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -4,6 +4,7 @@ #include "darray.h" #include "bkey_types.h" +#include "enumerated_ref.h" extern char * const bch2_member_error_strs[]; @@ -20,10 +21,22 @@ struct bch_member bch2_sb_member_get(struct bch_sb *sb, int i); static inline bool bch2_dev_is_online(struct bch_dev *ca) { - return !percpu_ref_is_zero(&ca->io_ref); + return !enumerated_ref_is_zero(&ca->io_ref[READ]); } -static inline bool bch2_dev_is_readable(struct bch_dev *ca) +static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned); + +static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev) +{ + rcu_read_lock(); + struct bch_dev *ca = bch2_dev_rcu(c, dev); + bool ret = ca && bch2_dev_is_online(ca); + rcu_read_unlock(); + + return ret; +} + +static inline bool bch2_dev_is_healthy(struct bch_dev *ca) { return bch2_dev_is_online(ca) && ca->mi.state != BCH_MEMBER_STATE_failed; @@ -92,6 +105,12 @@ static inline struct bch_dev *__bch2_next_dev(struct bch_fs *c, struct bch_dev * for (struct bch_dev *_ca = NULL; \ (_ca = __bch2_next_dev((_c), _ca, (_mask)));) +#define for_each_online_member_rcu(_c, _ca) \ + for_each_member_device_rcu(_c, _ca, &(_c)->online_devs) + +#define for_each_rw_member_rcu(_c, _ca) \ + for_each_member_device_rcu(_c, _ca, &(_c)->rw_devs[BCH_DATA_free]) + static inline void bch2_dev_get(struct bch_dev *ca) { #ifdef CONFIG_BCACHEFS_DEBUG @@ -144,33 +163,34 @@ static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, struct bch_dev *ca, - unsigned state_mask) + unsigned state_mask, + int rw, unsigned ref_idx) { rcu_read_lock(); if (ca) - percpu_ref_put(&ca->io_ref); + enumerated_ref_put(&ca->io_ref[rw], ref_idx); while ((ca = __bch2_next_dev(c, ca, NULL)) && (!((1 << ca->mi.state) & state_mask) || - !percpu_ref_tryget(&ca->io_ref))) + !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx))) ; rcu_read_unlock(); return ca; } -#define __for_each_online_member(_c, _ca, state_mask) \ +#define __for_each_online_member(_c, _ca, state_mask, rw, ref_idx) \ for (struct bch_dev *_ca = NULL; \ - (_ca = bch2_get_next_online_dev(_c, _ca, state_mask));) + (_ca = bch2_get_next_online_dev(_c, _ca, state_mask, rw, ref_idx));) -#define for_each_online_member(c, ca) \ - __for_each_online_member(c, ca, ~0) +#define for_each_online_member(c, ca, ref_idx) \ 
+ __for_each_online_member(c, ca, ~0, READ, ref_idx) -#define for_each_rw_member(c, ca) \ - __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw)) +#define for_each_rw_member(c, ca, ref_idx) \ + __for_each_online_member(c, ca, BIT(BCH_MEMBER_STATE_rw), WRITE, ref_idx) -#define for_each_readable_member(c, ca) \ - __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro)) +#define for_each_readable_member(c, ca, ref_idx) \ + __for_each_online_member(c, ca, BIT( BCH_MEMBER_STATE_rw)|BIT(BCH_MEMBER_STATE_ro), READ, ref_idx) static inline bool bch2_dev_exists(const struct bch_fs *c, unsigned dev) { @@ -205,13 +225,15 @@ static inline struct bch_dev *bch2_dev_rcu_noerror(struct bch_fs *c, unsigned de : NULL; } -void bch2_dev_missing(struct bch_fs *, unsigned); +int bch2_dev_missing_bkey(struct bch_fs *, struct bkey_s_c, unsigned); + +void bch2_dev_missing_atomic(struct bch_fs *, unsigned); static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev) { struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev); if (unlikely(!ca)) - bch2_dev_missing(c, dev); + bch2_dev_missing_atomic(c, dev); return ca; } @@ -229,27 +251,30 @@ static inline struct bch_dev *bch2_dev_tryget(struct bch_fs *c, unsigned dev) { struct bch_dev *ca = bch2_dev_tryget_noerror(c, dev); if (unlikely(!ca)) - bch2_dev_missing(c, dev); + bch2_dev_missing_atomic(c, dev); return ca; } static inline struct bch_dev *bch2_dev_bucket_tryget_noerror(struct bch_fs *c, struct bpos bucket) { struct bch_dev *ca = bch2_dev_tryget_noerror(c, bucket.inode); - if (ca && !bucket_valid(ca, bucket.offset)) { + if (ca && unlikely(!bucket_valid(ca, bucket.offset))) { bch2_dev_put(ca); ca = NULL; } return ca; } -void bch2_dev_bucket_missing(struct bch_fs *, struct bpos); +void bch2_dev_bucket_missing(struct bch_dev *, u64); static inline struct bch_dev *bch2_dev_bucket_tryget(struct bch_fs *c, struct bpos bucket) { - struct bch_dev *ca = bch2_dev_bucket_tryget_noerror(c, bucket); - if (!ca) - bch2_dev_bucket_missing(c, bucket); + struct bch_dev *ca = bch2_dev_tryget(c, bucket.inode); + if (ca && unlikely(!bucket_valid(ca, bucket.offset))) { + bch2_dev_bucket_missing(ca, bucket.offset); + bch2_dev_put(ca); + ca = NULL; + } return ca; } @@ -269,11 +294,14 @@ static inline struct bch_dev *bch2_dev_iterate(struct bch_fs *c, struct bch_dev return bch2_dev_tryget(c, dev_idx); } -static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, int rw) +static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, + int rw, unsigned ref_idx) { + might_sleep(); + rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu(c, dev); - if (ca && !percpu_ref_tryget(&ca->io_ref)) + if (ca && !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx)) ca = NULL; rcu_read_unlock(); @@ -283,27 +311,17 @@ static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, return ca; if (ca) - percpu_ref_put(&ca->io_ref); + enumerated_ref_put(&ca->io_ref[rw], ref_idx); return NULL; } -/* XXX kill, move to struct bch_fs */ -static inline struct bch_devs_mask bch2_online_devs(struct bch_fs *c) -{ - struct bch_devs_mask devs; - - memset(&devs, 0, sizeof(devs)); - for_each_online_member(c, ca) - __set_bit(ca->dev_idx, devs.d); - return devs; -} - extern const struct bch_sb_field_ops bch_sb_field_ops_members_v1; extern const struct bch_sb_field_ops bch_sb_field_ops_members_v2; static inline bool bch2_member_alive(struct bch_member *m) { - return !bch2_is_zero(&m->uuid, sizeof(m->uuid)); + return 
!bch2_is_zero(&m->uuid, sizeof(m->uuid)) && + !uuid_equal(&m->uuid, &BCH_SB_MEMBER_DELETED_UUID); } static inline bool bch2_member_exists(struct bch_sb *sb, unsigned dev) @@ -333,6 +351,7 @@ static inline struct bch_member_cpu bch2_mi_to_cpu(struct bch_member *mi) ? BCH_MEMBER_DURABILITY(mi) - 1 : 1, .freespace_initialized = BCH_MEMBER_FREESPACE_INITIALIZED(mi), + .resize_on_mount = BCH_MEMBER_RESIZE_ON_MOUNT(mi), .valid = bch2_member_alive(mi), .btree_bitmap_shift = mi->btree_bitmap_shift, .btree_allocated_bitmap = le64_to_cpu(mi->btree_allocated_bitmap), @@ -363,5 +382,6 @@ bool bch2_dev_btree_bitmap_marked(struct bch_fs *, struct bkey_s_c); void bch2_dev_btree_bitmap_mark(struct bch_fs *, struct bkey_s_c); int bch2_sb_member_alloc(struct bch_fs *); +void bch2_sb_members_clean_deleted(struct bch_fs *); #endif /* _BCACHEFS_SB_MEMBERS_H */ diff --git a/fs/bcachefs/sb-members_format.h b/fs/bcachefs/sb-members_format.h index 2adf1221a440..fb72ad730518 100644 --- a/fs/bcachefs/sb-members_format.h +++ b/fs/bcachefs/sb-members_format.h @@ -13,6 +13,10 @@ */ #define BCH_SB_MEMBER_INVALID 255 +#define BCH_SB_MEMBER_DELETED_UUID \ + UUID_INIT(0xffffffff, 0xffff, 0xffff, \ + 0xd9, 0x6a, 0x60, 0xcf, 0x80, 0x3d, 0xf7, 0xef) + #define BCH_MIN_NR_NBUCKETS (1 << 6) #define BCH_IOPS_MEASUREMENTS() \ @@ -79,6 +83,7 @@ struct bch_member { #define BCH_MEMBER_V1_BYTES 56 +LE16_BITMASK(BCH_MEMBER_BUCKET_SIZE, struct bch_member, bucket_size, 0, 16) LE64_BITMASK(BCH_MEMBER_STATE, struct bch_member, flags, 0, 4) /* 4-14 unused, was TIER, HAS_(META)DATA, REPLACEMENT */ LE64_BITMASK(BCH_MEMBER_DISCARD, struct bch_member, flags, 14, 15) @@ -87,6 +92,8 @@ LE64_BITMASK(BCH_MEMBER_GROUP, struct bch_member, flags, 20, 28) LE64_BITMASK(BCH_MEMBER_DURABILITY, struct bch_member, flags, 28, 30) LE64_BITMASK(BCH_MEMBER_FREESPACE_INITIALIZED, struct bch_member, flags, 30, 31) +LE64_BITMASK(BCH_MEMBER_RESIZE_ON_MOUNT, + struct bch_member, flags, 31, 32) #if 0 LE64_BITMASK(BCH_MEMBER_NR_READ_ERRORS, struct bch_member, flags[1], 0, 20); diff --git a/fs/bcachefs/sb-members_types.h b/fs/bcachefs/sb-members_types.h index c0eda888fe39..d6443e186872 100644 --- a/fs/bcachefs/sb-members_types.h +++ b/fs/bcachefs/sb-members_types.h @@ -13,6 +13,7 @@ struct bch_member_cpu { u8 data_allowed; u8 durability; u8 freespace_initialized; + u8 resize_on_mount; u8 valid; u8 btree_bitmap_shift; u64 btree_allocated_bitmap; diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 7e7c66a1e1a6..7c403427fbdb 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -850,7 +850,8 @@ void six_lock_exit(struct six_lock *lock) EXPORT_SYMBOL_GPL(six_lock_exit); void __six_lock_init(struct six_lock *lock, const char *name, - struct lock_class_key *key, enum six_lock_init_flags flags) + struct lock_class_key *key, enum six_lock_init_flags flags, + gfp_t gfp) { atomic_set(&lock->state, 0); raw_spin_lock_init(&lock->wait_lock); @@ -873,7 +874,7 @@ void __six_lock_init(struct six_lock *lock, const char *name, * failure if they wish by checking lock->readers, but generally * will not want to treat it as an error. 
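	 *
	 * E.g., with the new gfp parameter (a hypothetical caller, for
	 * illustration):
	 *
	 *	six_lock_init(&lock, SIX_LOCK_INIT_PCPU, GFP_NOFS);
	 *	if (!lock.readers)
	 *		pr_debug("percpu reader mode unavailable");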
*/ - lock->readers = alloc_percpu(unsigned); + lock->readers = alloc_percpu_gfp(unsigned, gfp); } #endif } diff --git a/fs/bcachefs/six.h b/fs/bcachefs/six.h index c142e06b7a3a..59b851cf8bac 100644 --- a/fs/bcachefs/six.h +++ b/fs/bcachefs/six.h @@ -164,18 +164,19 @@ enum six_lock_init_flags { }; void __six_lock_init(struct six_lock *lock, const char *name, - struct lock_class_key *key, enum six_lock_init_flags flags); + struct lock_class_key *key, enum six_lock_init_flags flags, + gfp_t gfp); /** * six_lock_init - initialize a six lock * @lock: lock to initialize * @flags: optional flags, i.e. SIX_LOCK_INIT_PCPU */ -#define six_lock_init(lock, flags) \ +#define six_lock_init(lock, flags, gfp) \ do { \ static struct lock_class_key __key; \ \ - __six_lock_init((lock), #lock, &__key, flags); \ + __six_lock_init((lock), #lock, &__key, flags, gfp); \ } while (0) /** diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index c54091a28909..00d62d1190ef 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -1,11 +1,13 @@ // SPDX-License-Identifier: GPL-2.0 #include "bcachefs.h" +#include "bbpos.h" #include "bkey_buf.h" #include "btree_cache.h" #include "btree_key_cache.h" #include "btree_update.h" #include "buckets.h" +#include "enumerated_ref.h" #include "errcode.h" #include "error.h" #include "fs.h" @@ -141,13 +143,14 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) rcu_read_lock(); struct snapshot_table *t = rcu_dereference(c->snapshots); - if (unlikely(c->recovery_pass_done < BCH_RECOVERY_PASS_check_snapshots)) { + if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots)) { ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor); goto out; } - while (id && id < ancestor - IS_ANCESTOR_BITMAP) - id = get_ancestor_below(t, id, ancestor); + if (likely(ancestor >= IS_ANCESTOR_BITMAP)) + while (id && id < ancestor - IS_ANCESTOR_BITMAP) + id = get_ancestor_below(t, id, ancestor); ret = id && id < ancestor ? test_ancestor_bitmap(t, id, ancestor) @@ -208,9 +211,14 @@ void bch2_snapshot_to_text(struct printbuf *out, struct bch_fs *c, { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); - prt_printf(out, "is_subvol %llu deleted %llu parent %10u children %10u %10u subvol %u tree %u", - BCH_SNAPSHOT_SUBVOL(s.v), - BCH_SNAPSHOT_DELETED(s.v), + if (BCH_SNAPSHOT_SUBVOL(s.v)) + prt_str(out, "subvol "); + if (BCH_SNAPSHOT_WILL_DELETE(s.v)) + prt_str(out, "will_delete "); + if (BCH_SNAPSHOT_DELETED(s.v)) + prt_str(out, "deleted "); + + prt_printf(out, "parent %10u children %10u %10u subvol %u tree %u", le32_to_cpu(s.v->parent), le32_to_cpu(s.v->children[0]), le32_to_cpu(s.v->children[1]), @@ -280,6 +288,16 @@ fsck_err: return ret; } +static int bch2_snapshot_table_make_room(struct bch_fs *c, u32 id) +{ + mutex_lock(&c->snapshot_table_lock); + int ret = snapshot_t_mut(c, id) + ? 0 + : -BCH_ERR_ENOMEM_mark_snapshot; + mutex_unlock(&c->snapshot_table_lock); + return ret; +} + static int __bch2_mark_snapshot(struct btree_trans *trans, enum btree_id btree, unsigned level, struct bkey_s_c old, struct bkey_s_c new, @@ -301,7 +319,9 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, if (new.k->type == KEY_TYPE_snapshot) { struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(new); - t->live = true; + t->state = !BCH_SNAPSHOT_DELETED(s.v) + ? 
SNAPSHOT_ID_live + : SNAPSHOT_ID_deleted; t->parent = le32_to_cpu(s.v->parent); t->children[0] = le32_to_cpu(s.v->children[0]); t->children[1] = le32_to_cpu(s.v->children[1]); @@ -326,9 +346,9 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, parent - id - 1 < IS_ANCESTOR_BITMAP) __set_bit(parent - id - 1, t->is_ancestor); - if (BCH_SNAPSHOT_DELETED(s.v)) { + if (BCH_SNAPSHOT_WILL_DELETE(s.v)) { set_bit(BCH_FS_need_delete_dead_snapshots, &c->flags); - if (c->curr_recovery_pass > BCH_RECOVERY_PASS_delete_dead_snapshots) + if (c->recovery.pass_done > BCH_RECOVERY_PASS_delete_dead_snapshots) bch2_delete_dead_snapshots_async(c); } } else { @@ -389,22 +409,31 @@ static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) return 0; } -static u32 bch2_snapshot_tree_oldest_subvol(struct bch_fs *c, u32 snapshot_root) +u32 bch2_snapshot_oldest_subvol(struct bch_fs *c, u32 snapshot_root, + snapshot_id_list *skip) { - u32 id = snapshot_root; - u32 subvol = 0, s; - + u32 id, subvol = 0, s; +retry: + id = snapshot_root; rcu_read_lock(); - while (id) { - s = snapshot_t(c, id)->subvol; - - if (s && (!subvol || s < subvol)) - subvol = s; + while (id && bch2_snapshot_exists(c, id)) { + if (!(skip && snapshot_list_has_id(skip, id))) { + s = snapshot_t(c, id)->subvol; + if (s && (!subvol || s < subvol)) + subvol = s; + } id = bch2_snapshot_tree_next(c, id); + if (id == snapshot_root) + break; } rcu_read_unlock(); + if (!subvol && skip) { + skip = NULL; + goto retry; + } + return subvol; } @@ -436,7 +465,7 @@ static int bch2_snapshot_tree_master_subvol(struct btree_trans *trans, if (!ret && !found) { struct bkey_i_subvolume *u; - *subvol_id = bch2_snapshot_tree_oldest_subvol(c, snapshot_root); + *subvol_id = bch2_snapshot_oldest_subvol(c, snapshot_root, NULL); u = bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_subvolumes, POS(0, *subvol_id), @@ -484,7 +513,7 @@ static int check_snapshot_tree(struct btree_trans *trans, root_id != bch2_snapshot_root(c, root_id) || st.k->p.offset != le32_to_cpu(s.tree), trans, snapshot_tree_to_missing_snapshot, - "snapshot tree points to missing/incorrect snapshot:\n %s", + "snapshot tree points to missing/incorrect snapshot:\n%s", (bch2_bkey_val_to_text(&buf, c, st.s_c), prt_newline(&buf), ret @@ -504,19 +533,19 @@ static int check_snapshot_tree(struct btree_trans *trans, if (fsck_err_on(ret, trans, snapshot_tree_to_missing_subvol, - "snapshot tree points to missing subvolume:\n %s", + "snapshot tree points to missing subvolume:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || fsck_err_on(!bch2_snapshot_is_ancestor(c, le32_to_cpu(subvol.snapshot), root_id), trans, snapshot_tree_to_wrong_subvol, - "snapshot tree points to subvolume that does not point to snapshot in this tree:\n %s", + "snapshot tree points to subvolume that does not point to snapshot in this tree:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf)) || fsck_err_on(BCH_SUBVOLUME_SNAP(&subvol), trans, snapshot_tree_to_snapshot_subvol, - "snapshot tree points to snapshot subvolume:\n %s", + "snapshot tree points to snapshot subvolume:\n%s", (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, st.s_c), buf.buf))) { struct bkey_i_snapshot_tree *u; @@ -653,7 +682,7 @@ static int snapshot_tree_ptr_repair(struct btree_trans *trans, u = bch2_bkey_make_mut_typed(trans, &root_iter, &root.s_c, 0, snapshot); ret = PTR_ERR_OR_ZERO(u) ?: bch2_snapshot_tree_create(trans, root_id, - bch2_snapshot_tree_oldest_subvol(c, root_id), + 
bch2_snapshot_oldest_subvol(c, root_id, NULL), &tree_id); if (ret) goto err; @@ -698,6 +727,9 @@ static int check_snapshot(struct btree_trans *trans, memset(&s, 0, sizeof(s)); memcpy(&s, k.v, min(sizeof(s), bkey_val_bytes(k.k))); + if (BCH_SNAPSHOT_DELETED(&s)) + return 0; + id = le32_to_cpu(s.parent); if (id) { ret = bch2_snapshot_lookup(trans, id, &v); @@ -735,7 +767,7 @@ static int check_snapshot(struct btree_trans *trans, } bool should_have_subvol = BCH_SNAPSHOT_SUBVOL(&s) && - !BCH_SNAPSHOT_DELETED(&s); + !BCH_SNAPSHOT_WILL_DELETE(&s); if (should_have_subvol) { id = le32_to_cpu(s.subvol); @@ -755,7 +787,7 @@ static int check_snapshot(struct btree_trans *trans, } else { if (fsck_err_on(s.subvol, trans, snapshot_should_not_have_subvol, - "snapshot should not point to subvol:\n %s", + "snapshot should not point to subvol:\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ret = PTR_ERR_OR_ZERO(u); @@ -773,7 +805,7 @@ static int check_snapshot(struct btree_trans *trans, if (fsck_err_on(!ret, trans, snapshot_to_bad_snapshot_tree, - "snapshot points to missing/incorrect tree:\n %s", + "snapshot points to missing/incorrect tree:\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { ret = snapshot_tree_ptr_repair(trans, iter, k, &s); if (ret) @@ -785,7 +817,7 @@ static int check_snapshot(struct btree_trans *trans, if (fsck_err_on(le32_to_cpu(s.depth) != real_depth, trans, snapshot_bad_depth, - "snapshot with incorrect depth field, should be %u:\n %s", + "snapshot with incorrect depth field, should be %u:\n%s", real_depth, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ret = PTR_ERR_OR_ZERO(u); @@ -802,7 +834,7 @@ static int check_snapshot(struct btree_trans *trans, if (fsck_err_on(!ret, trans, snapshot_bad_skiplist, - "snapshot with bad skiplist field:\n %s", + "snapshot with bad skiplist field:\n%s", (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { u = bch2_bkey_make_mut_typed(trans, iter, &k, 0, snapshot); ret = PTR_ERR_OR_ZERO(u); @@ -842,9 +874,6 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) { struct bch_fs *c = trans->c; - if (bch2_snapshot_exists(c, id)) - return 0; - /* Do we need to reconstruct the snapshot_tree entry as well? 
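 * (bch2_snapshot_table_make_room() below preallocates the in-memory table
 * slot before the btree insert, so marking the reconstructed key at commit
 * time cannot fail with -BCH_ERR_ENOMEM_mark_snapshot.)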
*/ struct btree_iter iter; struct bkey_s_c k; @@ -889,9 +918,8 @@ static int check_snapshot_exists(struct btree_trans *trans, u32 id) } bch2_trans_iter_exit(trans, &iter); - return bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0) ?: - bch2_mark_snapshot(trans, BTREE_ID_snapshots, 0, - bkey_s_c_null, bkey_i_to_s(&snapshot->k_i), 0); + return bch2_snapshot_table_make_room(c, id) ?: + bch2_btree_insert_trans(trans, BTREE_ID_snapshots, &snapshot->k_i, 0); } /* Figure out which snapshot nodes belong in the same tree: */ @@ -989,7 +1017,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) snapshot_id_list_to_text(&buf, t); darray_for_each(*t, id) { - if (fsck_err_on(!bch2_snapshot_exists(c, *id), + if (fsck_err_on(bch2_snapshot_id_state(c, *id) == SNAPSHOT_ID_empty, trans, snapshot_node_missing, "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { if (t->nr > 1) { @@ -1014,22 +1042,38 @@ err: return ret; } -int bch2_check_key_has_snapshot(struct btree_trans *trans, - struct btree_iter *iter, - struct bkey_s_c k) +int __bch2_check_key_has_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) { struct bch_fs *c = trans->c; struct printbuf buf = PRINTBUF; int ret = 0; + enum snapshot_id_state state = bch2_snapshot_id_state(c, k.k->p.snapshot); + + /* Snapshot was definitively deleted, this error is marked autofix */ + if (fsck_err_on(state == SNAPSHOT_ID_deleted, + trans, bkey_in_deleted_snapshot, + "key in deleted snapshot %s, delete?", + (bch2_btree_id_to_text(&buf, iter->btree_id), + prt_char(&buf, ' '), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) + ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node) ?: 1; - if (fsck_err_on(!bch2_snapshot_exists(c, k.k->p.snapshot), + /* + * Snapshot missing: we should have caught this with btree_lost_data and + * kicked off reconstruct_snapshots, so if we end up here we have no + * idea what happened: + */ + if (fsck_err_on(state == SNAPSHOT_ID_empty, trans, bkey_in_missing_snapshot, "key in missing snapshot %s, delete?", (bch2_btree_id_to_text(&buf, iter->btree_id), prt_char(&buf, ' '), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node) ?: 1; + BTREE_UPDATE_internal_snapshot_node) ?: 1; fsck_err: printbuf_exit(&buf); return ret; @@ -1053,10 +1097,10 @@ int bch2_snapshot_node_set_deleted(struct btree_trans *trans, u32 id) } /* already deleted? 
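 * (with the split flags, BCH_SNAPSHOT_WILL_DELETE means queued for deletion
 * while keys may still reference the node; BCH_SNAPSHOT_DELETED means the
 * node was emptied, leaving only a tombstone key when the
 * snapshot_deletion_v2 feature is in use)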
*/ - if (BCH_SNAPSHOT_DELETED(&s->v)) + if (BCH_SNAPSHOT_WILL_DELETE(&s->v)) goto err; - SET_BCH_SNAPSHOT_DELETED(&s->v, true); + SET_BCH_SNAPSHOT_WILL_DELETE(&s->v, true); SET_BCH_SNAPSHOT_SUBVOL(&s->v, false); s->v.subvol = 0; err: @@ -1073,27 +1117,28 @@ static inline void normalize_snapshot_child_pointers(struct bch_snapshot *s) static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) { struct bch_fs *c = trans->c; - struct btree_iter iter, p_iter = (struct btree_iter) { NULL }; - struct btree_iter c_iter = (struct btree_iter) { NULL }; - struct btree_iter tree_iter = (struct btree_iter) { NULL }; - struct bkey_s_c_snapshot s; + struct btree_iter iter, p_iter = {}; + struct btree_iter c_iter = {}; + struct btree_iter tree_iter = {}; u32 parent_id, child_id; unsigned i; int ret = 0; - s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), - BTREE_ITER_intent, snapshot); - ret = bkey_err(s); + struct bkey_i_snapshot *s = + bch2_bkey_get_mut_typed(trans, &iter, BTREE_ID_snapshots, POS(0, id), + BTREE_ITER_intent, snapshot); + ret = PTR_ERR_OR_ZERO(s); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, "missing snapshot %u", id); if (ret) goto err; - BUG_ON(s.v->children[1]); + BUG_ON(BCH_SNAPSHOT_DELETED(&s->v)); + BUG_ON(s->v.children[1]); - parent_id = le32_to_cpu(s.v->parent); - child_id = le32_to_cpu(s.v->children[0]); + parent_id = le32_to_cpu(s->v.parent); + child_id = le32_to_cpu(s->v.children[0]); if (parent_id) { struct bkey_i_snapshot *parent; @@ -1151,24 +1196,38 @@ static int bch2_snapshot_node_delete(struct btree_trans *trans, u32 id) */ struct bkey_i_snapshot_tree *s_t; - BUG_ON(s.v->children[1]); + BUG_ON(s->v.children[1]); s_t = bch2_bkey_get_mut_typed(trans, &tree_iter, - BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s.v->tree)), + BTREE_ID_snapshot_trees, POS(0, le32_to_cpu(s->v.tree)), 0, snapshot_tree); ret = PTR_ERR_OR_ZERO(s_t); if (ret) goto err; - if (s.v->children[0]) { - s_t->v.root_snapshot = s.v->children[0]; + if (s->v.children[0]) { + s_t->v.root_snapshot = s->v.children[0]; } else { s_t->k.type = KEY_TYPE_deleted; set_bkey_val_u64s(&s_t->k, 0); } } - ret = bch2_btree_delete_at(trans, &iter, 0); + if (!bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2)) { + SET_BCH_SNAPSHOT_DELETED(&s->v, true); + s->v.parent = 0; + s->v.children[0] = 0; + s->v.children[1] = 0; + s->v.subvol = 0; + s->v.tree = 0; + s->v.depth = 0; + s->v.skip[0] = 0; + s->v.skip[1] = 0; + s->v.skip[2] = 0; + } else { + s->k.type = KEY_TYPE_deleted; + set_bkey_val_u64s(&s->k, 0); + } err: bch2_trans_iter_exit(trans, &tree_iter); bch2_trans_iter_exit(trans, &p_iter); @@ -1192,13 +1251,13 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, bch2_trans_iter_init(trans, &iter, BTREE_ID_snapshots, POS_MIN, BTREE_ITER_intent); - k = bch2_btree_iter_peek(&iter); + k = bch2_btree_iter_peek(trans, &iter); ret = bkey_err(k); if (ret) goto err; for (i = 0; i < nr_snapids; i++) { - k = bch2_btree_iter_prev_slot(&iter); + k = bch2_btree_iter_prev_slot(trans, &iter); ret = bkey_err(k); if (ret) goto err; @@ -1338,12 +1397,6 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, * that key to snapshot leaf nodes, where we can mutate it */ -struct snapshot_interior_delete { - u32 id; - u32 live_child; -}; -typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; - static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id) { darray_for_each(*l, i) @@ -1377,28 +1430,34 @@ 
static unsigned __live_child(struct snapshot_table *t, u32 id, return 0; } -static unsigned live_child(struct bch_fs *c, u32 id, - snapshot_id_list *delete_leaves, - interior_delete_list *delete_interior) +static unsigned live_child(struct bch_fs *c, u32 id) { + struct snapshot_delete *d = &c->snapshot_delete; + rcu_read_lock(); u32 ret = __live_child(rcu_dereference(c->snapshots), id, - delete_leaves, delete_interior); + &d->delete_leaves, &d->delete_interior); rcu_read_unlock(); return ret; } +static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id) +{ + return snapshot_list_has_id(&d->delete_leaves, id) || + interior_delete_has_id(&d->delete_interior, id) != 0; +} + static int delete_dead_snapshots_process_key(struct btree_trans *trans, struct btree_iter *iter, - struct bkey_s_c k, - snapshot_id_list *delete_leaves, - interior_delete_list *delete_interior) + struct bkey_s_c k) { - if (snapshot_list_has_id(delete_leaves, k.k->p.snapshot)) + struct snapshot_delete *d = &trans->c->snapshot_delete; + + if (snapshot_list_has_id(&d->delete_leaves, k.k->p.snapshot)) return bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node); - u32 live_child = interior_delete_has_id(delete_interior, k.k->p.snapshot); + u32 live_child = interior_delete_has_id(&d->delete_interior, k.k->p.snapshot); if (live_child) { struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); int ret = PTR_ERR_OR_ZERO(new); @@ -1429,49 +1488,208 @@ static int delete_dead_snapshots_process_key(struct btree_trans *trans, return 0; } +static bool skip_unrelated_snapshot_tree(struct btree_trans *trans, struct btree_iter *iter, u64 *prev_inum) +{ + struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; + + u64 inum = iter->btree_id != BTREE_ID_inodes + ? 
iter->pos.inode + : iter->pos.offset; + + if (*prev_inum == inum) + return false; + + *prev_inum = inum; + + bool ret = !snapshot_list_has_id(&d->deleting_from_trees, + bch2_snapshot_tree(c, iter->pos.snapshot)); + if (unlikely(ret)) { + struct bpos pos = iter->pos; + pos.snapshot = 0; + if (iter->btree_id != BTREE_ID_inodes) + pos.offset = U64_MAX; + bch2_btree_iter_set_pos(trans, iter, bpos_nosnap_successor(pos)); + } + + return ret; +} + +static int delete_dead_snapshot_keys_v1(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; + + for (d->pos.btree = 0; d->pos.btree < BTREE_ID_NR; d->pos.btree++) { + struct disk_reservation res = { 0 }; + u64 prev_inum = 0; + + d->pos.pos = POS_MIN; + + if (!btree_type_has_snapshots(d->pos.btree)) + continue; + + int ret = for_each_btree_key_commit(trans, iter, + d->pos.btree, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + d->pos.pos = iter.pos; + + if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) + continue; + + delete_dead_snapshots_process_key(trans, &iter, k); + })); + + bch2_disk_reservation_put(c, &res); + + if (ret) + return ret; + } + + return 0; +} + +static int delete_dead_snapshot_keys_range(struct btree_trans *trans, enum btree_id btree, + struct bpos start, struct bpos end) +{ + struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; + struct disk_reservation res = { 0 }; + + d->pos.btree = btree; + d->pos.pos = POS_MIN; + + int ret = for_each_btree_key_max_commit(trans, iter, + btree, start, end, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + d->pos.pos = iter.pos; + delete_dead_snapshots_process_key(trans, &iter, k); + })); + + bch2_disk_reservation_put(c, &res); + return ret; +} + +static int delete_dead_snapshot_keys_v2(struct btree_trans *trans) +{ + struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; + struct disk_reservation res = { 0 }; + u64 prev_inum = 0; + int ret = 0; + + struct btree_iter iter; + bch2_trans_iter_init(trans, &iter, BTREE_ID_inodes, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots); + + while (1) { + struct bkey_s_c k; + ret = lockrestart_do(trans, + bkey_err(k = bch2_btree_iter_peek(trans, &iter))); + if (ret) + break; + + if (!k.k) + break; + + d->pos.btree = iter.btree_id; + d->pos.pos = iter.pos; + + if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) + continue; + + if (snapshot_id_dying(d, k.k->p.snapshot)) { + struct bpos start = POS(k.k->p.offset, 0); + struct bpos end = POS(k.k->p.offset, U64_MAX); + + ret = delete_dead_snapshot_keys_range(trans, BTREE_ID_extents, start, end) ?: + delete_dead_snapshot_keys_range(trans, BTREE_ID_dirents, start, end) ?: + delete_dead_snapshot_keys_range(trans, BTREE_ID_xattrs, start, end); + if (ret) + break; + + bch2_btree_iter_set_pos(trans, &iter, POS(0, k.k->p.offset + 1)); + } else { + bch2_btree_iter_advance(trans, &iter); + } + } + bch2_trans_iter_exit(trans, &iter); + + if (ret) + goto err; + + prev_inum = 0; + ret = for_each_btree_key_commit(trans, iter, + BTREE_ID_inodes, POS_MIN, + BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, + &res, NULL, BCH_TRANS_COMMIT_no_enospc, ({ + d->pos.btree = iter.btree_id; + d->pos.pos = iter.pos; + + if (skip_unrelated_snapshot_tree(trans, &iter, &prev_inum)) + continue; + + delete_dead_snapshots_process_key(trans, &iter, k); + })); +err: + bch2_disk_reservation_put(c, &res); + 
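+	/*
+	 * Two passes: the inode scan above collected inode numbers with
+	 * versions in dying snapshots and deleted their extent/dirent/xattr
+	 * ranges by inum; the second walk then processed the inode keys
+	 * themselves.
+	 */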
return ret; +} + /* * For a given snapshot, if it doesn't have a subvolume that points to it, and * it doesn't have child snapshot nodes - it's now redundant and we can mark it * as deleted. */ -static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k, - snapshot_id_list *delete_leaves, - interior_delete_list *delete_interior) +static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s_c k) { if (k.k->type != KEY_TYPE_snapshot) return 0; struct bch_fs *c = trans->c; + struct snapshot_delete *d = &c->snapshot_delete; struct bkey_s_c_snapshot s = bkey_s_c_to_snapshot(k); unsigned live_children = 0; + int ret = 0; if (BCH_SNAPSHOT_SUBVOL(s.v)) return 0; + if (BCH_SNAPSHOT_DELETED(s.v)) + return 0; + + mutex_lock(&d->progress_lock); for (unsigned i = 0; i < 2; i++) { u32 child = le32_to_cpu(s.v->children[i]); live_children += child && - !snapshot_list_has_id(delete_leaves, child); + !snapshot_list_has_id(&d->delete_leaves, child); } + u32 tree = bch2_snapshot_tree(c, s.k->p.offset); + if (live_children == 0) { - return snapshot_list_add(c, delete_leaves, s.k->p.offset); + ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?: + snapshot_list_add(c, &d->delete_leaves, s.k->p.offset); } else if (live_children == 1) { - struct snapshot_interior_delete d = { + struct snapshot_interior_delete n = { .id = s.k->p.offset, - .live_child = live_child(c, s.k->p.offset, delete_leaves, delete_interior), + .live_child = live_child(c, s.k->p.offset), }; - if (!d.live_child) { - bch_err(c, "error finding live child of snapshot %u", d.id); - return -EINVAL; + if (!n.live_child) { + bch_err(c, "error finding live child of snapshot %u", n.id); + ret = -EINVAL; + } else { + ret = snapshot_list_add_nodup(c, &d->deleting_from_trees, tree) ?: + darray_push(&d->delete_interior, n); } - - return darray_push(delete_interior, d); - } else { - return 0; } + mutex_unlock(&d->progress_lock); + + return ret; } static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, @@ -1500,6 +1718,9 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, struct bkey_i_snapshot *s; int ret; + if (!bch2_snapshot_exists(c, k.k->p.offset)) + return 0; + if (k.k->type != KEY_TYPE_snapshot) return 0; @@ -1547,39 +1768,56 @@ static int bch2_fix_child_of_deleted_snapshot(struct btree_trans *trans, return bch2_trans_update(trans, iter, &s->k_i, 0); } -int bch2_delete_dead_snapshots(struct bch_fs *c) +static void bch2_snapshot_delete_nodes_to_text(struct printbuf *out, struct snapshot_delete *d) { - if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) + prt_printf(out, "deleting from trees"); + darray_for_each(d->deleting_from_trees, i) + prt_printf(out, " %u", *i); + + prt_printf(out, "deleting leaves"); + darray_for_each(d->delete_leaves, i) + prt_printf(out, " %u", *i); + prt_newline(out); + + prt_printf(out, "interior"); + darray_for_each(d->delete_interior, i) + prt_printf(out, " %u->%u", i->id, i->live_child); + prt_newline(out); +} + +int __bch2_delete_dead_snapshots(struct bch_fs *c) +{ + struct snapshot_delete *d = &c->snapshot_delete; + int ret = 0; + + if (!mutex_trylock(&d->lock)) return 0; + if (!test_and_clear_bit(BCH_FS_need_delete_dead_snapshots, &c->flags)) + goto out_unlock; + struct btree_trans *trans = bch2_trans_get(c); - snapshot_id_list delete_leaves = {}; - interior_delete_list delete_interior = {}; - int ret = 0; /* * For every snapshot node: If we have no live children and it's not * pointed to by a 
subvolume, delete it: */ + d->running = true; + d->pos = BBPOS_MIN; + ret = for_each_btree_key(trans, iter, BTREE_ID_snapshots, POS_MIN, 0, k, - check_should_delete_snapshot(trans, k, &delete_leaves, &delete_interior)); + check_should_delete_snapshot(trans, k)); if (!bch2_err_matches(ret, EROFS)) bch_err_msg(c, ret, "walking snapshots"); if (ret) goto err; - if (!delete_leaves.nr && !delete_interior.nr) + if (!d->delete_leaves.nr && !d->delete_interior.nr) goto err; { struct printbuf buf = PRINTBUF; - prt_printf(&buf, "deleting leaves"); - darray_for_each(delete_leaves, i) - prt_printf(&buf, " %u", *i); - - prt_printf(&buf, " interior"); - darray_for_each(delete_interior, i) - prt_printf(&buf, " %u->%u", i->id, i->live_child); + bch2_snapshot_delete_nodes_to_text(&buf, d); ret = commit_do(trans, NULL, NULL, 0, bch2_trans_log_msg(trans, &buf)); printbuf_exit(&buf); @@ -1587,29 +1825,15 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) goto err; } - for (unsigned btree = 0; btree < BTREE_ID_NR; btree++) { - struct disk_reservation res = { 0 }; - - if (!btree_type_has_snapshots(btree)) - continue; - - ret = for_each_btree_key_commit(trans, iter, - btree, POS_MIN, - BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - &res, NULL, BCH_TRANS_COMMIT_no_enospc, - delete_dead_snapshots_process_key(trans, &iter, k, - &delete_leaves, - &delete_interior)); - - bch2_disk_reservation_put(c, &res); - - if (!bch2_err_matches(ret, EROFS)) - bch_err_msg(c, ret, "deleting keys from dying snapshots"); - if (ret) - goto err; - } + ret = !bch2_request_incompat_feature(c, bcachefs_metadata_version_snapshot_deletion_v2) + ? delete_dead_snapshot_keys_v2(trans) + : delete_dead_snapshot_keys_v1(trans); + if (!bch2_err_matches(ret, EROFS)) + bch_err_msg(c, ret, "deleting keys from dying snapshots"); + if (ret) + goto err; - darray_for_each(delete_leaves, i) { + darray_for_each(d->delete_leaves, i) { ret = commit_do(trans, NULL, NULL, 0, bch2_snapshot_node_delete(trans, *i)); if (!bch2_err_matches(ret, EROFS)) @@ -1626,11 +1850,11 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) ret = for_each_btree_key_commit(trans, iter, BTREE_ID_snapshots, POS_MIN, BTREE_ITER_intent, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, - bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &delete_interior)); + bch2_fix_child_of_deleted_snapshot(trans, &iter, k, &d->delete_interior)); if (ret) goto err; - darray_for_each(delete_interior, i) { + darray_for_each(d->delete_interior, i) { ret = commit_do(trans, NULL, NULL, 0, bch2_snapshot_node_delete(trans, i->id)); if (!bch2_err_matches(ret, EROFS)) @@ -1639,33 +1863,66 @@ int bch2_delete_dead_snapshots(struct bch_fs *c) goto err; } err: - darray_exit(&delete_interior); - darray_exit(&delete_leaves); + mutex_lock(&d->progress_lock); + darray_exit(&d->deleting_from_trees); + darray_exit(&d->delete_interior); + darray_exit(&d->delete_leaves); + d->running = false; + mutex_unlock(&d->progress_lock); bch2_trans_put(trans); +out_unlock: + mutex_unlock(&d->lock); if (!bch2_err_matches(ret, EROFS)) bch_err_fn(c, ret); return ret; } +int bch2_delete_dead_snapshots(struct bch_fs *c) +{ + if (!c->opts.auto_snapshot_deletion) + return 0; + + return __bch2_delete_dead_snapshots(c); +} + void bch2_delete_dead_snapshots_work(struct work_struct *work) { - struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete_work); + struct bch_fs *c = container_of(work, struct bch_fs, snapshot_delete.work); set_worker_desc("bcachefs-delete-dead-snapshots/%s", c->name); bch2_delete_dead_snapshots(c); - 
bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots); } void bch2_delete_dead_snapshots_async(struct bch_fs *c) { - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_delete_dead_snapshots)) + if (!c->opts.auto_snapshot_deletion) + return; + + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_delete_dead_snapshots)) return; BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); - if (!queue_work(c->write_ref_wq, &c->snapshot_delete_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_delete_dead_snapshots); + if (!queue_work(c->write_ref_wq, &c->snapshot_delete.work)) + enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots); +} + +void bch2_snapshot_delete_status_to_text(struct printbuf *out, struct bch_fs *c) +{ + struct snapshot_delete *d = &c->snapshot_delete; + + if (!d->running) { + prt_str(out, "(not running)"); + return; + } + + mutex_lock(&d->progress_lock); + bch2_snapshot_delete_nodes_to_text(out, d); + + bch2_bbpos_to_text(out, d->pos); + mutex_unlock(&d->progress_lock); } int __bch2_key_has_snapshot_overwrites(struct btree_trans *trans, @@ -1706,7 +1963,7 @@ static int bch2_check_snapshot_needs_deletion(struct btree_trans *trans, struct return 0; struct bkey_s_c_snapshot snap = bkey_s_c_to_snapshot(k); - if (BCH_SNAPSHOT_DELETED(snap.v) || + if (BCH_SNAPSHOT_WILL_DELETE(snap.v) || interior_snapshot_needs_delete(snap)) set_bit(BCH_FS_need_delete_dead_snapshots, &trans->c->flags); @@ -1735,10 +1992,6 @@ int bch2_snapshots_read(struct bch_fs *c) BUG_ON(!test_bit(BCH_FS_new_fs, &c->flags) && test_bit(BCH_FS_may_go_rw, &c->flags)); - if (bch2_err_matches(ret, EIO) || - (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots))) - ret = bch2_run_explicit_recovery_pass_persistent(c, BCH_RECOVERY_PASS_reconstruct_snapshots); - return ret; } @@ -1746,3 +1999,11 @@ void bch2_fs_snapshots_exit(struct bch_fs *c) { kvfree(rcu_dereference_protected(c->snapshots, true)); } + +void bch2_fs_snapshots_init_early(struct bch_fs *c) +{ + INIT_WORK(&c->snapshot_delete.work, bch2_delete_dead_snapshots_work); + mutex_init(&c->snapshot_delete.lock); + mutex_init(&c->snapshot_delete.progress_lock); + mutex_init(&c->snapshots_unlinked_lock); +} diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 00373cf32e7b..382a171f5413 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -105,6 +105,7 @@ static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) return id; } +u32 bch2_snapshot_oldest_subvol(struct bch_fs *, u32, snapshot_id_list *); u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32); static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) @@ -119,21 +120,26 @@ static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) return id; } -static inline bool __bch2_snapshot_exists(struct bch_fs *c, u32 id) +static inline enum snapshot_id_state __bch2_snapshot_id_state(struct bch_fs *c, u32 id) { const struct snapshot_t *s = snapshot_t(c, id); - return s ? s->live : 0; + return s ? 
s->state : SNAPSHOT_ID_empty; } -static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) +static inline enum snapshot_id_state bch2_snapshot_id_state(struct bch_fs *c, u32 id) { rcu_read_lock(); - bool ret = __bch2_snapshot_exists(c, id); + enum snapshot_id_state ret = __bch2_snapshot_id_state(c, id); rcu_read_unlock(); return ret; } +static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) +{ + return bch2_snapshot_id_state(c, id) == SNAPSHOT_ID_live; +} + static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) { rcu_read_lock(); @@ -240,10 +246,19 @@ int bch2_snapshot_node_create(struct btree_trans *, u32, int bch2_check_snapshot_trees(struct bch_fs *); int bch2_check_snapshots(struct bch_fs *); int bch2_reconstruct_snapshots(struct bch_fs *); -int bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c); + +int __bch2_check_key_has_snapshot(struct btree_trans *, struct btree_iter *, struct bkey_s_c); + +static inline int bch2_check_key_has_snapshot(struct btree_trans *trans, + struct btree_iter *iter, + struct bkey_s_c k) +{ + return likely(bch2_snapshot_exists(trans->c, k.k->p.snapshot)) + ? 0 + : __bch2_check_key_has_snapshot(trans, iter, k); +} int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); -void bch2_delete_dead_snapshots_work(struct work_struct *); int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos); @@ -258,7 +273,14 @@ static inline int bch2_key_has_snapshot_overwrites(struct btree_trans *trans, return __bch2_key_has_snapshot_overwrites(trans, id, pos); } +int __bch2_delete_dead_snapshots(struct bch_fs *); +int bch2_delete_dead_snapshots(struct bch_fs *); +void bch2_delete_dead_snapshots_work(struct work_struct *); +void bch2_delete_dead_snapshots_async(struct bch_fs *); +void bch2_snapshot_delete_status_to_text(struct printbuf *, struct bch_fs *); + int bch2_snapshots_read(struct bch_fs *); void bch2_fs_snapshots_exit(struct bch_fs *); +void bch2_fs_snapshots_init_early(struct bch_fs *); #endif /* _BCACHEFS_SNAPSHOT_H */ diff --git a/fs/bcachefs/snapshot_format.h b/fs/bcachefs/snapshot_format.h index aabcd3a74cd9..9bccae1f3590 100644 --- a/fs/bcachefs/snapshot_format.h +++ b/fs/bcachefs/snapshot_format.h @@ -15,10 +15,10 @@ struct bch_snapshot { bch_le128 btime; }; -LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 0, 1) - +LE32_BITMASK(BCH_SNAPSHOT_WILL_DELETE, struct bch_snapshot, flags, 0, 1) /* True if a subvolume points to this snapshot node: */ LE32_BITMASK(BCH_SNAPSHOT_SUBVOL, struct bch_snapshot, flags, 1, 2) +LE32_BITMASK(BCH_SNAPSHOT_DELETED, struct bch_snapshot, flags, 2, 3) /* * Snapshot trees: diff --git a/fs/bcachefs/snapshot_types.h b/fs/bcachefs/snapshot_types.h new file mode 100644 index 000000000000..0ab698f13e5c --- /dev/null +++ b/fs/bcachefs/snapshot_types.h @@ -0,0 +1,57 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _BCACHEFS_SNAPSHOT_TYPES_H +#define _BCACHEFS_SNAPSHOT_TYPES_H + +#include "bbpos_types.h" +#include "darray.h" +#include "subvolume_types.h" + +typedef DARRAY(u32) snapshot_id_list; + +#define IS_ANCESTOR_BITMAP 128 + +struct snapshot_t { + enum snapshot_id_state { + SNAPSHOT_ID_empty, + SNAPSHOT_ID_live, + SNAPSHOT_ID_deleted, + } state; + u32 parent; + u32 skip[3]; + u32 depth; + u32 children[2]; + u32 subvol; /* Nonzero only if a subvolume points to this node: */ + u32 tree; + unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; +}; + +struct snapshot_table { + struct rcu_head rcu; + size_t nr; 
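+	/* allocated length of s[] */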
+#ifndef RUST_BINDGEN + DECLARE_FLEX_ARRAY(struct snapshot_t, s); +#else + struct snapshot_t s[0]; +#endif +}; + +struct snapshot_interior_delete { + u32 id; + u32 live_child; +}; +typedef DARRAY(struct snapshot_interior_delete) interior_delete_list; + +struct snapshot_delete { + struct mutex lock; + struct work_struct work; + + struct mutex progress_lock; + snapshot_id_list deleting_from_trees; + snapshot_id_list delete_leaves; + interior_delete_list delete_interior; + + bool running; + struct bbpos pos; +}; + +#endif /* _BCACHEFS_SNAPSHOT_TYPES_H */ diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c index d78451c2a0c6..0cbf5508a32c 100644 --- a/fs/bcachefs/str_hash.c +++ b/fs/bcachefs/str_hash.c @@ -50,7 +50,7 @@ static noinline int fsck_rename_dirent(struct btree_trans *trans, for (unsigned i = 0; i < 1000; i++) { unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", old_name.len, old_name.name, i); - unsigned u64s = BKEY_U64s + dirent_val_u64s(len); + unsigned u64s = BKEY_U64s + dirent_val_u64s(len, 0); if (u64s > U8_MAX) return -EINVAL; @@ -101,17 +101,25 @@ static noinline int hash_pick_winner(struct btree_trans *trans, } } -static int repair_inode_hash_info(struct btree_trans *trans, - struct bch_inode_unpacked *snapshot_root) +/* + * str_hash lookups across snapshots break in wild ways if hash_info in + * different snapshot versions doesn't match - so if we find one mismatch, check + * them all + */ +int bch2_repair_inode_hash_info(struct btree_trans *trans, + struct bch_inode_unpacked *snapshot_root) { + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k; + struct printbuf buf = PRINTBUF; + bool need_commit = false; int ret = 0; - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, - SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot - 1), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != snapshot_root->bi_inum) + for_each_btree_key_norestart(trans, iter, BTREE_ID_inodes, + POS(0, snapshot_root->bi_inum), + BTREE_ITER_all_snapshots, k, ret) { + if (bpos_ge(k.k->p, SPOS(0, snapshot_root->bi_inum, snapshot_root->bi_snapshot))) break; if (!bkey_is_inode(k.k)) continue; @@ -121,19 +129,72 @@ static int repair_inode_hash_info(struct btree_trans *trans, if (ret) break; - if (fsck_err_on(inode.bi_hash_seed != snapshot_root->bi_hash_seed || - INODE_STR_HASH(&inode) != INODE_STR_HASH(snapshot_root), - trans, inode_snapshot_mismatch, - "inode hash info in different snapshots don't match")) { + if (inode.bi_hash_seed == snapshot_root->bi_hash_seed && + INODE_STR_HASH(&inode) == INODE_STR_HASH(snapshot_root)) { +#ifdef CONFIG_BCACHEFS_DEBUG + struct bch_hash_info hash1 = bch2_hash_info_init(c, snapshot_root); + struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); + + BUG_ON(hash1.type != hash2.type || + memcmp(&hash1.siphash_key, + &hash2.siphash_key, + sizeof(hash1.siphash_key))); +#endif + continue; + } + + printbuf_reset(&buf); + prt_printf(&buf, "inode %llu hash info in snapshots %u %u don't match\n", + snapshot_root->bi_inum, + inode.bi_snapshot, + snapshot_root->bi_snapshot); + + bch2_prt_str_hash_type(&buf, INODE_STR_HASH(&inode)); + prt_printf(&buf, " %llx\n", inode.bi_hash_seed); + + bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root)); + prt_printf(&buf, " %llx", snapshot_root->bi_hash_seed); + + if (fsck_err(trans, inode_snapshot_mismatch, "%s", buf.buf)) { inode.bi_hash_seed = snapshot_root->bi_hash_seed; SET_INODE_STR_HASH(&inode, INODE_STR_HASH(snapshot_root)); - ret = 
__bch2_fsck_write_inode(trans, &inode) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; - break; + + ret = __bch2_fsck_write_inode(trans, &inode); + if (ret) + break; + need_commit = true; } } + + if (ret) + goto err; + + if (!need_commit) { + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "inode %llu hash info mismatch with root, but mismatch not found\n", + snapshot_root->bi_inum); + + prt_printf(&buf, "root snapshot %u ", snapshot_root->bi_snapshot); + bch2_prt_str_hash_type(&buf, INODE_STR_HASH(snapshot_root)); + prt_printf(&buf, " %llx\n", snapshot_root->bi_hash_seed); +#if 0 + prt_printf(&buf, "vs snapshot %u ", hash_info->inum_snapshot); + bch2_prt_str_hash_type(&buf, hash_info->type); + prt_printf(&buf, " %llx %llx", hash_info->siphash_key.k0, hash_info->siphash_key.k1); +#endif + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + ret = -BCH_ERR_fsck_repair_unimplemented; + goto err; + } + + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_nested; +err: fsck_err: + printbuf_exit(&buf); bch2_trans_iter_exit(trans, &iter); return ret; } @@ -145,46 +206,18 @@ fsck_err: static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans, u64 inum, struct bch_hash_info *hash_info) { - struct bch_fs *c = trans->c; - struct btree_iter iter; - struct bkey_s_c k; - int ret = 0; - - for_each_btree_key_reverse_norestart(trans, iter, BTREE_ID_inodes, SPOS(0, inum, U32_MAX), - BTREE_ITER_all_snapshots, k, ret) { - if (k.k->p.offset != inum) - break; - if (bkey_is_inode(k.k)) - goto found; - } - bch_err(c, "%s(): inum %llu not found", __func__, inum); - ret = -BCH_ERR_fsck_repair_unimplemented; - goto err; -found:; - struct bch_inode_unpacked inode; - ret = bch2_inode_unpack(k, &inode); + struct bch_inode_unpacked snapshot_root; + int ret = bch2_inode_find_snapshot_root(trans, inum, &snapshot_root); if (ret) - goto err; + return ret; + + struct bch_hash_info hash_root = bch2_hash_info_init(trans->c, &snapshot_root); + if (hash_info->type != hash_root.type || + memcmp(&hash_info->siphash_key, + &hash_root.siphash_key, + sizeof(hash_root.siphash_key))) + ret = bch2_repair_inode_hash_info(trans, &snapshot_root); - struct bch_hash_info hash2 = bch2_hash_info_init(c, &inode); - if (hash_info->type != hash2.type || - memcmp(&hash_info->siphash_key, &hash2.siphash_key, sizeof(hash2.siphash_key))) { - ret = repair_inode_hash_info(trans, &inode); - if (!ret) { - bch_err(c, "inode hash info mismatch with root, but mismatch not found\n" - "%u %llx %llx\n" - "%u %llx %llx", - hash_info->type, - hash_info->siphash_key.k0, - hash_info->siphash_key.k1, - hash2.type, - hash2.siphash_key.k0, - hash2.siphash_key.k1); - ret = -BCH_ERR_fsck_repair_unimplemented; - } - } -err: - bch2_trans_iter_exit(trans, &iter); return ret; } @@ -195,7 +228,7 @@ int __bch2_str_hash_check_key(struct btree_trans *trans, struct btree_iter *k_iter, struct bkey_s_c hash_k) { struct bch_fs *c = trans->c; - struct btree_iter iter = { NULL }; + struct btree_iter iter = {}; struct printbuf buf = PRINTBUF; struct bkey_s_c k; int ret = 0; @@ -232,7 +265,7 @@ bad_hash: goto out; if (fsck_err(trans, hash_table_key_wrong_offset, - "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n %s", + "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, 
hash_k.k->p.offset, hash, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 55a4ac7bf220..6762b3627e1b 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -12,7 +12,6 @@ #include "super.h" #include <linux/crc32c.h> -#include <crypto/hash.h> #include <crypto/sha2.h> static inline enum bch_str_hash_type @@ -33,7 +32,9 @@ bch2_str_hash_opt_to_type(struct bch_fs *c, enum bch_str_hash_opts opt) } struct bch_hash_info { + u32 inum_snapshot; u8 type; + struct unicode_map *cf_encoding; /* * For crc32 or crc64 string hashes the first key value of * the siphash_key (k0) is used as the key. @@ -44,20 +45,20 @@ struct bch_hash_info { static inline struct bch_hash_info bch2_hash_info_init(struct bch_fs *c, const struct bch_inode_unpacked *bi) { - /* XXX ick */ struct bch_hash_info info = { - .type = INODE_STR_HASH(bi), - .siphash_key = { .k0 = bi->bi_hash_seed } + .inum_snapshot = bi->bi_snapshot, + .type = INODE_STR_HASH(bi), +#ifdef CONFIG_UNICODE + .cf_encoding = bch2_inode_casefold(c, bi) ? c->cf_encoding : NULL, +#endif + .siphash_key = { .k0 = bi->bi_hash_seed } }; if (unlikely(info.type == BCH_STR_HASH_siphash_old)) { - SHASH_DESC_ON_STACK(desc, c->sha256); u8 digest[SHA256_DIGEST_SIZE]; - desc->tfm = c->sha256; - - crypto_shash_digest(desc, (void *) &bi->bi_hash_seed, - sizeof(bi->bi_hash_seed), digest); + sha256((const u8 *)&bi->bi_hash_seed, + sizeof(bi->bi_hash_seed), digest); memcpy(&info.siphash_key, digest, sizeof(info.siphash_key)); } @@ -231,11 +232,11 @@ int bch2_hash_needs_whiteout(struct btree_trans *trans, struct bkey_s_c k; int ret; - bch2_trans_copy_iter(&iter, start); + bch2_trans_copy_iter(trans, &iter, start); - bch2_btree_iter_advance(&iter); + bch2_btree_iter_advance(trans, &iter); - for_each_btree_key_continue_norestart(iter, BTREE_ITER_slots, k, ret) { + for_each_btree_key_continue_norestart(trans, iter, BTREE_ITER_slots, k, ret) { if (k.k->type != desc.key_type && k.k->type != KEY_TYPE_hash_whiteout) break; @@ -280,7 +281,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, } if (!slot.path && !(flags & STR_HASH_must_replace)) - bch2_trans_copy_iter(&slot, iter); + bch2_trans_copy_iter(trans, &slot, iter); if (k.k->type != KEY_TYPE_hash_whiteout) goto not_found; @@ -393,6 +394,8 @@ int bch2_hash_delete(struct btree_trans *trans, return ret; } +int bch2_repair_inode_hash_info(struct btree_trans *, struct bch_inode_unpacked *); + struct snapshots_seen; int __bch2_str_hash_check_key(struct btree_trans *, struct snapshots_seen *, diff --git a/fs/bcachefs/subvolume.c b/fs/bcachefs/subvolume.c index b7b96283c316..35c9f86a73c1 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -3,9 +3,11 @@ #include "bcachefs.h" #include "btree_key_cache.h" #include "btree_update.h" +#include "enumerated_ref.h" #include "errcode.h" #include "error.h" #include "fs.h" +#include "recovery_passes.h" #include "snapshot.h" #include "subvolume.h" @@ -13,6 +15,22 @@ static int bch2_subvolume_delete(struct btree_trans *, u32); +static int bch2_subvolume_missing(struct bch_fs *c, u32 subvolid) +{ + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "missing subvolume %u", subvolid); + bool print = bch2_count_fsck_err(c, subvol_missing, &buf); + + int ret = bch2_run_explicit_recovery_pass(c, &buf, + BCH_RECOVERY_PASS_check_inodes, 0); + if (print) + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + return 
ret; +} + static struct bpos subvolume_children_pos(struct bkey_s_c k) { if (k.k->type != KEY_TYPE_subvolume) @@ -44,8 +62,8 @@ static int check_subvol(struct btree_trans *trans, ret = bch2_snapshot_lookup(trans, snapid, &snapshot); if (bch2_err_matches(ret, ENOENT)) - bch_err(c, "subvolume %llu points to nonexistent snapshot %u", - k.k->p.offset, snapid); + return bch2_run_print_explicit_recovery_pass(c, + BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret; if (ret) return ret; @@ -275,7 +293,7 @@ int bch2_subvol_has_children(struct btree_trans *trans, u32 subvol) struct btree_iter iter; bch2_trans_iter_init(trans, &iter, BTREE_ID_subvolume_children, POS(subvol, 0), 0); - struct bkey_s_c k = bch2_btree_iter_peek(&iter); + struct bkey_s_c k = bch2_btree_iter_peek(trans, &iter); bch2_trans_iter_exit(trans, &iter); return bkey_err(k) ?: k.k && k.k->p.inode == subvol @@ -291,9 +309,8 @@ bch2_subvolume_get_inlined(struct btree_trans *trans, unsigned subvol, int ret = bch2_bkey_get_val_typed(trans, BTREE_ID_subvolumes, POS(0, subvol), BTREE_ITER_cached| BTREE_ITER_with_updates, subvolume, s); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT) && - inconsistent_if_not_found, - trans->c, "missing subvolume %u", subvol); + if (bch2_err_matches(ret, ENOENT) && inconsistent_if_not_found) + ret = bch2_subvolume_missing(trans->c, subvol) ?: ret; return ret; } @@ -343,8 +360,8 @@ int __bch2_subvolume_get_snapshot(struct btree_trans *trans, u32 subvolid, subvolume); ret = bkey_err(subvol); - bch2_fs_inconsistent_on(warn && bch2_err_matches(ret, ENOENT), trans->c, - "missing subvolume %u", subvolid); + if (bch2_err_matches(ret, ENOENT)) + ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret; if (likely(!ret)) *snapid = le32_to_cpu(subvol.v->snapshot); @@ -417,8 +434,8 @@ static int __bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) BTREE_ITER_cached|BTREE_ITER_intent, subvolume); int ret = bkey_err(subvol); - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, - "missing subvolume %u", subvolid); + if (bch2_err_matches(ret, ENOENT)) + ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret; if (ret) goto err; @@ -478,13 +495,11 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor { struct bch_fs *c = container_of(work, struct bch_fs, snapshot_wait_for_pagecache_and_delete_work); - snapshot_id_list s; - u32 *id; int ret = 0; while (!ret) { mutex_lock(&c->snapshots_unlinked_lock); - s = c->snapshots_unlinked; + snapshot_id_list s = c->snapshots_unlinked; darray_init(&c->snapshots_unlinked); mutex_unlock(&c->snapshots_unlinked_lock); @@ -493,7 +508,7 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor bch2_evict_subvolume_inodes(c, &s); - for (id = s.data; id < s.data + s.nr; id++) { + darray_for_each(s, id) { ret = bch2_trans_run(c, bch2_subvolume_delete(trans, *id)); bch_err_msg(c, ret, "deleting subvolume %u", *id); if (ret) @@ -503,7 +518,7 @@ static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *wor darray_exit(&s); } - bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache); } struct subvolume_unlink_hook { @@ -526,11 +541,11 @@ static int bch2_subvolume_wait_for_pagecache_and_delete_hook(struct btree_trans if (ret) return ret; - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_snapshot_delete_pagecache)) + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache)) return -EROFS; if 
(!queue_work(c->write_ref_wq, &c->snapshot_wait_for_pagecache_and_delete_work)) - bch2_write_ref_put(c, BCH_WRITE_REF_snapshot_delete_pagecache); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_snapshot_delete_pagecache); return 0; } @@ -554,13 +569,13 @@ int bch2_subvolume_unlink(struct btree_trans *trans, u32 subvolid) BTREE_ID_subvolumes, POS(0, subvolid), BTREE_ITER_cached, subvolume); ret = PTR_ERR_OR_ZERO(n); - if (unlikely(ret)) { - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), trans->c, - "missing subvolume %u", subvolid); + if (bch2_err_matches(ret, ENOENT)) + ret = bch2_subvolume_missing(trans->c, subvolid) ?: ret; + if (unlikely(ret)) return ret; - } SET_BCH_SUBVOLUME_UNLINKED(&n->v, true); + n->v.fs_path_parent = 0; bch2_trans_iter_exit(trans, &iter); return ret; } @@ -573,7 +588,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, bool ro) { struct bch_fs *c = trans->c; - struct btree_iter dst_iter, src_iter = (struct btree_iter) { NULL }; + struct btree_iter dst_iter, src_iter = {}; struct bkey_i_subvolume *new_subvol = NULL; struct bkey_i_subvolume *src_subvol = NULL; u32 parent = 0, new_nodes[2], snapshot_subvols[2]; @@ -596,11 +611,10 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, BTREE_ID_subvolumes, POS(0, src_subvolid), BTREE_ITER_cached, subvolume); ret = PTR_ERR_OR_ZERO(src_subvol); - if (unlikely(ret)) { - bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, - "subvolume %u not found", src_subvolid); + if (bch2_err_matches(ret, ENOENT)) + ret = bch2_subvolume_missing(trans->c, src_subvolid) ?: ret; + if (unlikely(ret)) goto err; - } parent = le32_to_cpu(src_subvol->v.snapshot); } @@ -714,11 +728,8 @@ int bch2_fs_upgrade_for_subvolumes(struct bch_fs *c) return ret; } -int bch2_fs_subvolumes_init(struct bch_fs *c) +void bch2_fs_subvolumes_init_early(struct bch_fs *c) { - INIT_WORK(&c->snapshot_delete_work, bch2_delete_dead_snapshots_work); INIT_WORK(&c->snapshot_wait_for_pagecache_and_delete_work, bch2_subvolume_wait_for_pagecache_and_delete); - mutex_init(&c->snapshots_unlinked_lock); - return 0; } diff --git a/fs/bcachefs/subvolume.h b/fs/bcachefs/subvolume.h index 910f6196700e..075f55e25c70 100644 --- a/fs/bcachefs/subvolume.h +++ b/fs/bcachefs/subvolume.h @@ -33,16 +33,16 @@ int bch2_subvol_is_ro_trans(struct btree_trans *, u32); int bch2_subvol_is_ro(struct bch_fs *, u32); static inline struct bkey_s_c -bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos end, - u32 subvolid, unsigned flags) +bch2_btree_iter_peek_in_subvolume_max_type(struct btree_trans *trans, struct btree_iter *iter, + struct bpos end, u32 subvolid, unsigned flags) { u32 snapshot; - int ret = bch2_subvolume_get_snapshot(iter->trans, subvolid, &snapshot); + int ret = bch2_subvolume_get_snapshot(trans, subvolid, &snapshot); if (ret) return bkey_s_c_err(ret); - bch2_btree_iter_set_snapshot(iter, snapshot); - return bch2_btree_iter_peek_max_type(iter, end, flags); + bch2_btree_iter_set_snapshot(trans, iter, snapshot); + return bch2_btree_iter_peek_max_type(trans, iter, end, flags); } #define for_each_btree_key_in_subvolume_max_continue(_trans, _iter, \ @@ -53,14 +53,14 @@ bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos \ do { \ _ret3 = lockrestart_do(_trans, ({ \ - (_k) = bch2_btree_iter_peek_in_subvolume_max_type(&(_iter), \ + (_k) = bch2_btree_iter_peek_in_subvolume_max_type(trans, &(_iter),\ _end, _subvolid, (_flags)); \ if (!(_k).k) \ break; \ \ bkey_err(_k) ?: (_do); \ })); \ - } while 
(!_ret3 && bch2_btree_iter_advance(&(_iter))); \ + } while (!_ret3 && bch2_btree_iter_advance(_trans, &(_iter))); \ \ bch2_trans_iter_exit((_trans), &(_iter)); \ _ret3; \ @@ -77,15 +77,12 @@ bch2_btree_iter_peek_in_subvolume_max_type(struct btree_iter *iter, struct bpos _end, _subvolid, _flags, _k, _do); \ }) -int bch2_delete_dead_snapshots(struct bch_fs *); -void bch2_delete_dead_snapshots_async(struct bch_fs *); - int bch2_subvolume_unlink(struct btree_trans *, u32); int bch2_subvolume_create(struct btree_trans *, u64, u32, u32, u32 *, u32 *, bool); int bch2_initialize_subvolumes(struct bch_fs *); int bch2_fs_upgrade_for_subvolumes(struct bch_fs *); -int bch2_fs_subvolumes_init(struct bch_fs *); +void bch2_fs_subvolumes_init_early(struct bch_fs *); #endif /* _BCACHEFS_SUBVOLUME_H */ diff --git a/fs/bcachefs/subvolume_types.h b/fs/bcachefs/subvolume_types.h index 1549d6daf7af..9d634b906dcd 100644 --- a/fs/bcachefs/subvolume_types.h +++ b/fs/bcachefs/subvolume_types.h @@ -2,33 +2,6 @@ #ifndef _BCACHEFS_SUBVOLUME_TYPES_H #define _BCACHEFS_SUBVOLUME_TYPES_H -#include "darray.h" - -typedef DARRAY(u32) snapshot_id_list; - -#define IS_ANCESTOR_BITMAP 128 - -struct snapshot_t { - bool live; - u32 parent; - u32 skip[3]; - u32 depth; - u32 children[2]; - u32 subvol; /* Nonzero only if a subvolume points to this node: */ - u32 tree; - unsigned long is_ancestor[BITS_TO_LONGS(IS_ANCESTOR_BITMAP)]; -}; - -struct snapshot_table { - struct rcu_head rcu; - size_t nr; -#ifndef RUST_BINDGEN - DECLARE_FLEX_ARRAY(struct snapshot_t, s); -#else - struct snapshot_t s[0]; -#endif -}; - typedef struct { /* we can't have padding in this struct: */ u64 subvol; diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 8037ccbacf6a..6687b9235d3c 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -25,9 +25,6 @@ #include <linux/sort.h> #include <linux/string_choices.h> -static const struct blk_holder_ops bch2_sb_handle_bdev_ops = { -}; - struct bch2_metadata_version { u16 version; const char *name; @@ -69,14 +66,39 @@ enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_meta return v; } -void bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version) +int bch2_set_version_incompat(struct bch_fs *c, enum bcachefs_metadata_version version) { + int ret = ((c->sb.features & BIT_ULL(BCH_FEATURE_incompat_version_field)) && + version <= c->sb.version_incompat_allowed) + ? 
0 + : -BCH_ERR_may_not_use_incompat_feature; + mutex_lock(&c->sb_lock); - SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb, - max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version)); - c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_FEATURE_incompat_version_field); - bch2_write_super(c); + if (!ret) { + SET_BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb, + max(BCH_SB_VERSION_INCOMPAT(c->disk_sb.sb), version)); + bch2_write_super(c); + } else { + darray_for_each(c->incompat_versions_requested, i) + if (version == *i) + goto out; + + darray_push(&c->incompat_versions_requested, version); + struct printbuf buf = PRINTBUF; + prt_str(&buf, "requested incompat feature "); + bch2_version_to_text(&buf, version); + prt_str(&buf, " currently not enabled, allowed up to "); + bch2_version_to_text(&buf, c->sb.version_incompat_allowed); + prt_printf(&buf, "\n set version_upgrade=incompat to enable"); + + bch_notice(c, "%s", buf.buf); + printbuf_exit(&buf); + } + +out: mutex_unlock(&c->sb_lock); + + return ret; } const char * const bch2_sb_fields[] = { @@ -239,11 +261,11 @@ struct bch_sb_field *bch2_sb_field_resize_id(struct bch_sb_handle *sb, /* XXX: we're not checking that offline devices have enough space */ - for_each_online_member(c, ca) { + for_each_online_member(c, ca, BCH_DEV_READ_REF_sb_field_resize) { struct bch_sb_handle *dev_sb = &ca->disk_sb; if (bch2_sb_realloc(dev_sb, le32_to_cpu(dev_sb->sb->u64s) + d)) { - percpu_ref_put(&ca->io_ref); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_sb_field_resize); return NULL; } } @@ -360,39 +382,40 @@ static int bch2_sb_compatible(struct bch_sb *sb, struct printbuf *out) return 0; } -static int bch2_sb_validate(struct bch_sb_handle *disk_sb, - enum bch_validate_flags flags, struct printbuf *out) +int bch2_sb_validate(struct bch_sb *sb, u64 read_offset, + enum bch_validate_flags flags, struct printbuf *out) { - struct bch_sb *sb = disk_sb->sb; - struct bch_sb_field_members_v1 *mi; enum bch_opt_id opt_id; - u16 block_size; int ret; ret = bch2_sb_compatible(sb, out); if (ret) return ret; - if (sb->features[1] || - (le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR))) { - prt_printf(out, "Filesystem has incompatible features"); + u64 incompat = le64_to_cpu(sb->features[0]) & (~0ULL << BCH_FEATURE_NR); + unsigned incompat_bit = 0; + if (incompat) + incompat_bit = __ffs64(incompat); + else if (sb->features[1]) + incompat_bit = 64 + __ffs64(le64_to_cpu(sb->features[1])); + + if (incompat_bit) { + prt_printf(out, "Filesystem has incompatible feature bit %u, highest supported %s (%u)", + incompat_bit, + bch2_sb_features[BCH_FEATURE_NR - 1], + BCH_FEATURE_NR - 1); return -BCH_ERR_invalid_sb_features; } if (BCH_VERSION_MAJOR(le16_to_cpu(sb->version)) > BCH_VERSION_MAJOR(bcachefs_metadata_version_current) || BCH_SB_VERSION_INCOMPAT(sb) > bcachefs_metadata_version_current) { - prt_printf(out, "Filesystem has incompatible version"); + prt_str(out, "Filesystem has incompatible version "); + bch2_version_to_text(out, le16_to_cpu(sb->version)); + prt_str(out, ", current version "); + bch2_version_to_text(out, bcachefs_metadata_version_current); return -BCH_ERR_invalid_sb_features; } - block_size = le16_to_cpu(sb->block_size); - - if (block_size > PAGE_SECTORS) { - prt_printf(out, "Block size too big (got %u, max %u)", - block_size, PAGE_SECTORS); - return -BCH_ERR_invalid_sb_block_size; - } - if (bch2_is_zero(sb->user_uuid.b, sizeof(sb->user_uuid))) { prt_printf(out, "Bad user UUID (got zeroes)"); return -BCH_ERR_invalid_sb_uuid; @@ -403,6 +426,13 @@ static int bch2_sb_validate(struct 
bch_sb_handle *disk_sb, return -BCH_ERR_invalid_sb_uuid; } + if (!(flags & BCH_VALIDATE_write) && + le64_to_cpu(sb->offset) != read_offset) { + prt_printf(out, "Bad sb offset (got %llu, read from %llu)", + le64_to_cpu(sb->offset), read_offset); + return -BCH_ERR_invalid_sb_offset; + } + if (!sb->nr_devices || sb->nr_devices > BCH_SB_MEMBERS_MAX) { prt_printf(out, "Bad number of member devices %u (max %u)", @@ -438,6 +468,9 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(sb, BCH_SB_VERSION_INCOMPAT(sb)); } + if (sb->nr_devices > 1) + SET_BCH_SB_MULTI_DEVICE(sb, true); + if (!flags) { /* * Been seeing a bug where these are getting inexplicably @@ -458,6 +491,13 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_disk_accounting_v2) SET_BCH_SB_PROMOTE_WHOLE_EXTENTS(sb, true); + + if (!BCH_SB_WRITE_ERROR_TIMEOUT(sb)) + SET_BCH_SB_WRITE_ERROR_TIMEOUT(sb, 30); + + if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_extent_flags && + !BCH_SB_CSUM_ERR_RETRY_NR(sb)) + SET_BCH_SB_CSUM_ERR_RETRY_NR(sb, 3); } #ifdef __KERNEL__ @@ -468,8 +508,8 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, for (opt_id = 0; opt_id < bch2_opts_nr; opt_id++) { const struct bch_option *opt = bch2_opt_table + opt_id; - if (opt->get_sb != BCH2_NO_SB_OPT) { - u64 v = bch2_opt_from_sb(sb, opt_id); + if (opt->get_sb) { + u64 v = bch2_opt_from_sb(sb, opt_id, -1); prt_printf(out, "Invalid option "); ret = bch2_opt_validate(opt, v, out); @@ -499,14 +539,17 @@ static int bch2_sb_validate(struct bch_sb_handle *disk_sb, } } + struct bch_sb_field *mi = + bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v2) ?: + bch2_sb_field_get_id(sb, BCH_SB_FIELD_members_v1); + /* members must be validated first: */ - mi = bch2_sb_field_get(sb, members_v1); if (!mi) { prt_printf(out, "Invalid superblock: member info area missing"); return -BCH_ERR_invalid_sb_members_missing; } - ret = bch2_sb_field_validate(sb, &mi->field, flags, out); + ret = bch2_sb_field_validate(sb, mi, flags, out); if (ret) return ret; @@ -575,11 +618,15 @@ static void bch2_sb_update(struct bch_fs *c) c->sb.features = le64_to_cpu(src->features[0]); c->sb.compat = le64_to_cpu(src->compat[0]); + c->sb.multi_device = BCH_SB_MULTI_DEVICE(src); memset(c->sb.errors_silent, 0, sizeof(c->sb.errors_silent)); struct bch_sb_field_ext *ext = bch2_sb_field_get(src, ext); if (ext) { + c->sb.recovery_passes_required = + bch2_recovery_passes_from_stable(le64_to_cpu(ext->recovery_passes_required[0])); + le_bitvector_to_cpu(c->sb.errors_silent, (void *) ext->errors_silent, sizeof(c->sb.errors_silent) * 8); c->sb.btrees_lost_data = le64_to_cpu(ext->btrees_lost_data); @@ -749,7 +796,7 @@ retry: memset(sb, 0, sizeof(*sb)); sb->mode = BLK_OPEN_READ; sb->have_bio = true; - sb->holder = kmalloc(1, GFP_KERNEL); + sb->holder = kzalloc(sizeof(*sb->holder), GFP_KERNEL); if (!sb->holder) return -ENOMEM; @@ -875,7 +922,7 @@ got_super: sb->have_layout = true; - ret = bch2_sb_validate(sb, 0, &err); + ret = bch2_sb_validate(sb->sb, offset, 0, &err); if (ret) { bch2_print_opts(opts, KERN_ERR "bcachefs (%s): error validating superblock: %s\n", path, err.buf); @@ -912,19 +959,19 @@ static void write_super_endio(struct bio *bio) { struct bch_dev *ca = bio->bi_private; + bch2_account_io_success_fail(ca, bio_data_dir(bio), !bio->bi_status); + /* XXX: return errors directly */ - if (bch2_dev_io_err_on(bio->bi_status, ca, - bio_data_dir(bio) - ? 
BCH_MEMBER_ERROR_write - : BCH_MEMBER_ERROR_read, - "superblock %s error: %s", + if (bio->bi_status) { + bch_err_dev_ratelimited(ca, "superblock %s error: %s", str_write_read(bio_data_dir(bio)), - bch2_blk_status_to_str(bio->bi_status))) + bch2_blk_status_to_str(bio->bi_status)); ca->sb_write_error = 1; + } closure_put(&ca->fs->sb_write); - percpu_ref_put(&ca->io_ref); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); } static void read_back_super(struct bch_fs *c, struct bch_dev *ca) @@ -942,7 +989,7 @@ static void read_back_super(struct bch_fs *c, struct bch_dev *ca) this_cpu_add(ca->io_done->sectors[READ][BCH_DATA_sb], bio_sectors(bio)); - percpu_ref_get(&ca->io_ref); + enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); closure_bio_submit(bio, &c->sb_write); } @@ -968,7 +1015,7 @@ static void write_one_super(struct bch_fs *c, struct bch_dev *ca, unsigned idx) this_cpu_add(ca->io_done->sectors[WRITE][BCH_DATA_sb], bio_sectors(bio)); - percpu_ref_get(&ca->io_ref); + enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); closure_bio_submit(bio, &c->sb_write); } @@ -985,7 +1032,7 @@ int bch2_write_super(struct bch_fs *c) trace_and_count(c, write_super, c, _RET_IP_); - if (c->opts.very_degraded) + if (c->opts.degraded == BCH_DEGRADED_very) degraded_flags |= BCH_FORCE_IF_LOST; lockdep_assert_held(&c->sb_lock); @@ -993,13 +1040,20 @@ int bch2_write_super(struct bch_fs *c) closure_init_stack(cl); memset(&sb_written, 0, sizeof(sb_written)); - for_each_online_member(c, ca) { + /* + * Note: we do writes to RO devices here, and we might want to change + * that in the future. + * + * For now, we expect to be able to call write_super() when we're not + * yet RW: + */ + for_each_online_member(c, ca, BCH_DEV_READ_REF_write_super) { ret = darray_push(&online_devices, ca); if (bch2_fs_fatal_err_on(ret, c, "%s: error allocating online devices", __func__)) { - percpu_ref_put(&ca->io_ref); + enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); goto out; } - percpu_ref_get(&ca->io_ref); + enumerated_ref_get(&ca->io_ref[READ], BCH_DEV_READ_REF_write_super); } /* Make sure we're using the new magic numbers: */ @@ -1032,7 +1086,7 @@ int bch2_write_super(struct bch_fs *c) darray_for_each(online_devices, ca) { printbuf_reset(&err); - ret = bch2_sb_validate(&(*ca)->disk_sb, BCH_VALIDATE_write, &err); + ret = bch2_sb_validate((*ca)->disk_sb.sb, 0, BCH_VALIDATE_write, &err); if (ret) { bch2_fs_inconsistent(c, "sb invalid before write: %s", err.buf); goto out; @@ -1058,7 +1112,8 @@ int bch2_write_super(struct bch_fs *c) prt_str(&buf, ")"); bch2_fs_fatal_error(c, ": %s", buf.buf); printbuf_exit(&buf); - return -BCH_ERR_sb_not_downgraded; + ret = -BCH_ERR_sb_not_downgraded; + goto out; } darray_for_each(online_devices, ca) { @@ -1160,12 +1215,12 @@ int bch2_write_super(struct bch_fs *c) !can_mount_with_written), c, ": Unable to write superblock to sufficient devices (from %ps)", (void *) _RET_IP_)) - ret = -1; + ret = -BCH_ERR_erofs_sb_err; out: /* Make new options visible after they're persistent: */ bch2_sb_update(c); darray_for_each(online_devices, ca) - percpu_ref_put(&(*ca)->io_ref); + enumerated_ref_put(&(*ca)->io_ref[READ], BCH_DEV_READ_REF_write_super); darray_exit(&online_devices); printbuf_exit(&err); return ret; @@ -1217,11 +1272,37 @@ void bch2_sb_upgrade(struct bch_fs *c, unsigned new_version, bool incompat) bch2_sb_field_resize(&c->disk_sb, downgrade, 0); c->disk_sb.sb->version = cpu_to_le16(new_version); - c->disk_sb.sb->features[0] |= 
cpu_to_le64(BCH_SB_FEATURES_ALL); - if (incompat) + if (incompat) { + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), new_version)); + } +} + +void bch2_sb_upgrade_incompat(struct bch_fs *c) +{ + mutex_lock(&c->sb_lock); + if (c->sb.version == c->sb.version_incompat_allowed) + goto unlock; + + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "Now allowing incompatible features up to "); + bch2_version_to_text(&buf, c->sb.version); + prt_str(&buf, ", previously allowed up to "); + bch2_version_to_text(&buf, c->sb.version_incompat_allowed); + prt_newline(&buf); + + bch_notice(c, "%s", buf.buf); + printbuf_exit(&buf); + + c->disk_sb.sb->features[0] |= cpu_to_le64(BCH_SB_FEATURES_ALL); + SET_BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb, + max(BCH_SB_VERSION_INCOMPAT_ALLOWED(c->disk_sb.sb), c->sb.version)); + bch2_write_super(c); +unlock: + mutex_unlock(&c->sb_lock); } static int bch2_sb_ext_validate(struct bch_sb *sb, struct bch_sb_field *f, @@ -1451,8 +1532,8 @@ void bch2_sb_to_text(struct printbuf *out, struct bch_sb *sb, for (id = 0; id < bch2_opts_nr; id++) { const struct bch_option *opt = bch2_opt_table + id; - if (opt->get_sb != BCH2_NO_SB_OPT) { - u64 v = bch2_opt_from_sb(sb, id); + if (opt->get_sb) { + u64 v = bch2_opt_from_sb(sb, id, -1); prt_printf(out, "%s:\t", opt->attr.name); bch2_opt_to_text(out, NULL, sb, opt, v, diff --git a/fs/bcachefs/super-io.h b/fs/bcachefs/super-io.h index f1ab4f943720..a3b7a90f2533 100644 --- a/fs/bcachefs/super-io.h +++ b/fs/bcachefs/super-io.h @@ -21,17 +21,14 @@ static inline bool bch2_version_compatible(u16 version) void bch2_version_to_text(struct printbuf *, enum bcachefs_metadata_version); enum bcachefs_metadata_version bch2_latest_compatible_version(enum bcachefs_metadata_version); -void bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version); +int bch2_set_version_incompat(struct bch_fs *, enum bcachefs_metadata_version); -static inline bool bch2_request_incompat_feature(struct bch_fs *c, - enum bcachefs_metadata_version version) +static inline int bch2_request_incompat_feature(struct bch_fs *c, + enum bcachefs_metadata_version version) { - if (unlikely(version > c->sb.version_incompat)) { - if (version > c->sb.version_incompat_allowed) - return false; - bch2_set_version_incompat(c, version); - } - return true; + return likely(version <= c->sb.version_incompat) + ? 
0 + : bch2_set_version_incompat(c, version); } static inline size_t bch2_sb_field_bytes(struct bch_sb_field *f) @@ -95,6 +92,8 @@ int bch2_sb_from_fs(struct bch_fs *, struct bch_dev *); void bch2_free_super(struct bch_sb_handle *); int bch2_sb_realloc(struct bch_sb_handle *, unsigned); +int bch2_sb_validate(struct bch_sb *, u64, enum bch_validate_flags, struct printbuf *); + int bch2_read_super(const char *, struct bch_opts *, struct bch_sb_handle *); int bch2_read_super_silent(const char *, struct bch_opts *, struct bch_sb_handle *); int bch2_write_super(struct bch_fs *); @@ -108,6 +107,7 @@ static inline void bch2_check_set_feature(struct bch_fs *c, unsigned feat) bool bch2_check_version_downgrade(struct bch_fs *); void bch2_sb_upgrade(struct bch_fs *, unsigned, bool); +void bch2_sb_upgrade_incompat(struct bch_fs *); void __bch2_sb_field_to_text(struct printbuf *, struct bch_sb *, struct bch_sb_field *); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 6d97d412fed9..11579b74c640 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -10,6 +10,8 @@ #include "bcachefs.h" #include "alloc_background.h" #include "alloc_foreground.h" +#include "async_objs.h" +#include "backpointers.h" #include "bkey_sort.h" #include "btree_cache.h" #include "btree_gc.h" @@ -28,6 +30,7 @@ #include "disk_accounting.h" #include "disk_groups.h" #include "ec.h" +#include "enumerated_ref.h" #include "errcode.h" #include "error.h" #include "fs.h" @@ -48,6 +51,7 @@ #include "quota.h" #include "rebalance.h" #include "recovery.h" +#include "recovery_passes.h" #include "replicas.h" #include "sb-clean.h" #include "sb-counters.h" @@ -70,26 +74,37 @@ #include <linux/percpu.h> #include <linux/random.h> #include <linux/sysfs.h> -#include <crypto/hash.h> MODULE_LICENSE("GPL"); MODULE_AUTHOR("Kent Overstreet <kent.overstreet@gmail.com>"); MODULE_DESCRIPTION("bcachefs filesystem"); -MODULE_SOFTDEP("pre: crc32c"); -MODULE_SOFTDEP("pre: crc64"); -MODULE_SOFTDEP("pre: sha256"); -MODULE_SOFTDEP("pre: chacha20"); -MODULE_SOFTDEP("pre: poly1305"); -MODULE_SOFTDEP("pre: xxhash"); -const char * const bch2_fs_flag_strs[] = { +typedef DARRAY(struct bch_sb_handle) bch_sb_handles; + #define x(n) #n, +const char * const bch2_fs_flag_strs[] = { BCH_FS_FLAGS() -#undef x NULL }; -void bch2_print_str(struct bch_fs *c, const char *str) +const char * const bch2_write_refs[] = { + BCH_WRITE_REFS() + NULL +}; + +const char * const bch2_dev_read_refs[] = { + BCH_DEV_READ_REFS() + NULL +}; + +const char * const bch2_dev_write_refs[] = { + BCH_DEV_WRITE_REFS() + NULL +}; +#undef x + +static void __bch2_print_str(struct bch_fs *c, const char *prefix, + const char *str, bool nonblocking) { #ifdef __KERNEL__ struct stdio_redirect *stdio = bch2_fs_stdio_redirect(c); @@ -99,7 +114,17 @@ void bch2_print_str(struct bch_fs *c, const char *str) return; } #endif - bch2_print_string_as_lines(KERN_ERR, str); + bch2_print_string_as_lines(KERN_ERR, str, nonblocking); +} + +void bch2_print_str(struct bch_fs *c, const char *prefix, const char *str) +{ + __bch2_print_str(c, prefix, str, false); +} + +void bch2_print_str_nonblocking(struct bch_fs *c, const char *prefix, const char *str) +{ + __bch2_print_str(c, prefix, str, true); } __printf(2, 0) @@ -188,7 +213,9 @@ static void bch2_dev_unlink(struct bch_dev *); static void bch2_dev_free(struct bch_dev *); static int bch2_dev_alloc(struct bch_fs *, unsigned); static int bch2_dev_sysfs_online(struct bch_fs *, struct bch_dev *); +static void bch2_dev_io_ref_stop(struct bch_dev *, int); static void 
__bch2_dev_read_only(struct bch_fs *, struct bch_dev *); +static int bch2_fs_init_rw(struct bch_fs *); struct bch_fs *bch2_dev_to_fs(dev_t dev) { @@ -297,19 +324,19 @@ static void __bch2_fs_read_only(struct bch_fs *c) /* * After stopping journal: */ - for_each_member_device(c, ca) + for_each_member_device(c, ca) { + bch2_dev_io_ref_stop(ca, WRITE); bch2_dev_allocator_remove(c, ca); + } } -#ifndef BCH_WRITE_REF_DEBUG -static void bch2_writes_disabled(struct percpu_ref *writes) +static void bch2_writes_disabled(struct enumerated_ref *writes) { struct bch_fs *c = container_of(writes, struct bch_fs, writes); set_bit(BCH_FS_write_disable_complete, &c->flags); wake_up(&bch2_read_only_wait); } -#endif void bch2_fs_read_only(struct bch_fs *c) { @@ -327,12 +354,7 @@ void bch2_fs_read_only(struct bch_fs *c) * writes will return -EROFS: */ set_bit(BCH_FS_going_ro, &c->flags); -#ifndef BCH_WRITE_REF_DEBUG - percpu_ref_kill(&c->writes); -#else - for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) - bch2_write_ref_put(c, i); -#endif + enumerated_ref_stop_async(&c->writes); /* * If we're not doing an emergency shutdown, we want to wait on @@ -370,7 +392,7 @@ void bch2_fs_read_only(struct bch_fs *c) !test_bit(BCH_FS_emergency_ro, &c->flags) && test_bit(BCH_FS_started, &c->flags) && test_bit(BCH_FS_clean_shutdown, &c->flags) && - c->recovery_pass_done >= BCH_RECOVERY_PASS_journal_replay) { + c->recovery.pass_done >= BCH_RECOVERY_PASS_journal_replay) { BUG_ON(c->journal.last_empty_seq != journal_cur_seq(&c->journal)); BUG_ON(atomic_long_read(&c->btree_cache.nr_dirty)); BUG_ON(atomic_long_read(&c->btree_key_cache.nr_dirty)); @@ -381,6 +403,11 @@ void bch2_fs_read_only(struct bch_fs *c) bch_verbose(c, "marking filesystem clean"); bch2_fs_mark_clean(c); } else { + /* Make sure error counts/counters are persisted */ + mutex_lock(&c->sb_lock); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + bch_verbose(c, "done going read-only, filesystem not clean"); } } @@ -411,41 +438,39 @@ bool bch2_fs_emergency_read_only(struct bch_fs *c) return ret; } -bool bch2_fs_emergency_read_only_locked(struct bch_fs *c) +static bool __bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out, + bool locked) { bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); - bch2_journal_halt_locked(&c->journal); + if (!locked) + bch2_journal_halt(&c->journal); + else + bch2_journal_halt_locked(&c->journal); bch2_fs_read_only_async(c); - wake_up(&bch2_read_only_wait); + + if (ret) + prt_printf(out, "emergency read only at seq %llu\n", + journal_cur_seq(&c->journal)); + return ret; } -static int bch2_fs_read_write_late(struct bch_fs *c) +bool bch2_fs_emergency_read_only2(struct bch_fs *c, struct printbuf *out) { - int ret; + return __bch2_fs_emergency_read_only2(c, out, false); +} - /* - * Data move operations can't run until after check_snapshots has - * completed, and bch2_snapshot_is_ancestor() is available. 
- * - * Ideally we'd start copygc/rebalance earlier instead of waiting for - * all of recovery/fsck to complete: - */ - ret = bch2_copygc_start(c); - if (ret) { - bch_err(c, "error starting copygc thread"); - return ret; - } +bool bch2_fs_emergency_read_only_locked(struct bch_fs *c) +{ + bool ret = !test_and_set_bit(BCH_FS_emergency_ro, &c->flags); - ret = bch2_rebalance_start(c); - if (ret) { - bch_err(c, "error starting rebalance thread"); - return ret; - } + bch2_journal_halt_locked(&c->journal); + bch2_fs_read_only_async(c); - return 0; + wake_up(&bch2_read_only_wait); + return ret; } static int __bch2_fs_read_write(struct bch_fs *c, bool early) @@ -454,59 +479,79 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); + if (WARN_ON(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) + return -BCH_ERR_erofs_no_alloc_info; + if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) { bch_err(c, "cannot go rw, unfixed btree errors"); return -BCH_ERR_erofs_unfixed_errors; } + if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { + bch_err(c, "cannot go rw, filesystem is an unresized image file"); + return -BCH_ERR_erofs_filesystem_full; + } + if (test_bit(BCH_FS_rw, &c->flags)) return 0; bch_info(c, "going read-write"); - ret = bch2_sb_members_v2_init(c); + ret = bch2_fs_init_rw(c); if (ret) goto err; - ret = bch2_fs_mark_dirty(c); + ret = bch2_sb_members_v2_init(c); if (ret) goto err; clear_bit(BCH_FS_clean_shutdown, &c->flags); + rcu_read_lock(); + for_each_online_member_rcu(c, ca) + if (ca->mi.state == BCH_MEMBER_STATE_rw) { + bch2_dev_allocator_add(c, ca); + enumerated_ref_start(&ca->io_ref[WRITE]); + } + rcu_read_unlock(); + + bch2_recalc_capacity(c); + /* * First journal write must be a flush write: after a clean shutdown we * don't read the journal, so the first journal write may end up * overwriting whatever was there previously, and there must always be * at least one non-flush write in the journal or recovery will fail: */ + spin_lock(&c->journal.lock); set_bit(JOURNAL_need_flush_write, &c->journal.flags); set_bit(JOURNAL_running, &c->journal.flags); + bch2_journal_space_available(&c->journal); + spin_unlock(&c->journal.lock); - for_each_rw_member(c, ca) - bch2_dev_allocator_add(c, ca); - bch2_recalc_capacity(c); + ret = bch2_fs_mark_dirty(c); + if (ret) + goto err; + + ret = bch2_journal_reclaim_start(&c->journal); + if (ret) + goto err; set_bit(BCH_FS_rw, &c->flags); set_bit(BCH_FS_was_rw, &c->flags); -#ifndef BCH_WRITE_REF_DEBUG - percpu_ref_reinit(&c->writes); -#else - for (unsigned i = 0; i < BCH_WRITE_REF_NR; i++) { - BUG_ON(atomic_long_read(&c->writes[i])); - atomic_long_inc(&c->writes[i]); - } -#endif + enumerated_ref_start(&c->writes); - ret = bch2_journal_reclaim_start(&c->journal); - if (ret) + ret = bch2_copygc_start(c); + if (ret) { + bch_err_msg(c, ret, "error starting copygc thread"); goto err; + } - if (!early) { - ret = bch2_fs_read_write_late(c); - if (ret) - goto err; + ret = bch2_rebalance_start(c); + if (ret) { + bch_err_msg(c, ret, "error starting rebalance thread"); + goto err; } bch2_do_discards(c); @@ -531,14 +576,19 @@ int bch2_fs_read_write(struct bch_fs *c) if (c->opts.nochanges) return -BCH_ERR_erofs_nochanges; + if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) + return -BCH_ERR_erofs_no_alloc_info; + return __bch2_fs_read_write(c, false); } int bch2_fs_read_write_early(struct bch_fs *c) { - lockdep_assert_held(&c->state_lock); + down_write(&c->state_lock); + int ret = 
__bch2_fs_read_write(c, true); + up_write(&c->state_lock); - return __bch2_fs_read_write(c, true); + return ret; } /* Filesystem startup/shutdown: */ @@ -548,37 +598,44 @@ static void __bch2_fs_free(struct bch_fs *c) for (unsigned i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_exit(&c->times[i]); +#ifdef CONFIG_UNICODE + utf8_unload(c->cf_encoding); +#endif + bch2_find_btree_nodes_exit(&c->found_btree_nodes); bch2_free_pending_node_rewrites(c); - bch2_fs_accounting_exit(c); - bch2_fs_sb_errors_exit(c); - bch2_fs_counters_exit(c); + bch2_free_fsck_errs(c); + bch2_fs_vfs_exit(c); bch2_fs_snapshots_exit(c); + bch2_fs_sb_errors_exit(c); + bch2_fs_replicas_exit(c); + bch2_fs_rebalance_exit(c); bch2_fs_quota_exit(c); + bch2_fs_nocow_locking_exit(c); + bch2_fs_journal_exit(&c->journal); bch2_fs_fs_io_direct_exit(c); bch2_fs_fs_io_buffered_exit(c); bch2_fs_fsio_exit(c); - bch2_fs_vfs_exit(c); - bch2_fs_ec_exit(c); - bch2_fs_encryption_exit(c); - bch2_fs_nocow_locking_exit(c); bch2_fs_io_write_exit(c); bch2_fs_io_read_exit(c); + bch2_fs_encryption_exit(c); + bch2_fs_ec_exit(c); + bch2_fs_counters_exit(c); + bch2_fs_compress_exit(c); + bch2_io_clock_exit(&c->io_clock[WRITE]); + bch2_io_clock_exit(&c->io_clock[READ]); bch2_fs_buckets_waiting_for_journal_exit(c); - bch2_fs_btree_interior_update_exit(c); + bch2_fs_btree_write_buffer_exit(c); bch2_fs_btree_key_cache_exit(&c->btree_key_cache); - bch2_fs_btree_cache_exit(c); bch2_fs_btree_iter_exit(c); - bch2_fs_replicas_exit(c); - bch2_fs_journal_exit(&c->journal); - bch2_io_clock_exit(&c->io_clock[WRITE]); - bch2_io_clock_exit(&c->io_clock[READ]); - bch2_fs_compress_exit(c); - bch2_fs_btree_gc_exit(c); + bch2_fs_btree_interior_update_exit(c); + bch2_fs_btree_cache_exit(c); + bch2_fs_accounting_exit(c); + bch2_fs_async_obj_exit(c); bch2_journal_keys_put_initial(c); bch2_find_btree_nodes_exit(&c->found_btree_nodes); + BUG_ON(atomic_read(&c->journal_keys.ref)); - bch2_fs_btree_write_buffer_exit(c); percpu_free_rwsem(&c->mark_lock); if (c->online_reserved) { u64 v = percpu_u64_get(c->online_reserved); @@ -586,6 +643,7 @@ static void __bch2_fs_free(struct bch_fs *c) free_percpu(c->online_reserved); } + darray_exit(&c->incompat_versions_requested); darray_exit(&c->btree_roots_extra); free_percpu(c->pcpu); free_percpu(c->usage); @@ -593,9 +651,7 @@ static void __bch2_fs_free(struct bch_fs *c) mempool_exit(&c->btree_bounce_pool); bioset_exit(&c->btree_bio); mempool_exit(&c->fill_iter); -#ifndef BCH_WRITE_REF_DEBUG - percpu_ref_exit(&c->writes); -#endif + enumerated_ref_exit(&c->writes); kfree(rcu_dereference_protected(c->disk_groups, 1)); kfree(c->journal_seq_blacklist_table); @@ -607,8 +663,8 @@ static void __bch2_fs_free(struct bch_fs *c) destroy_workqueue(c->btree_read_complete_wq); if (c->copygc_wq) destroy_workqueue(c->copygc_wq); - if (c->btree_io_complete_wq) - destroy_workqueue(c->btree_io_complete_wq); + if (c->btree_write_complete_wq) + destroy_workqueue(c->btree_write_complete_wq); if (c->btree_update_wq) destroy_workqueue(c->btree_update_wq); @@ -634,6 +690,12 @@ void __bch2_fs_stop(struct bch_fs *c) bch2_fs_read_only(c); up_write(&c->state_lock); + for (unsigned i = 0; i < c->sb.nr_devices; i++) { + struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); + if (ca) + bch2_dev_io_ref_stop(ca, READ); + } + for_each_member_device(c, ca) bch2_dev_unlink(ca); @@ -662,8 +724,6 @@ void __bch2_fs_stop(struct bch_fs *c) void bch2_fs_free(struct bch_fs *c) { - unsigned i; - mutex_lock(&bch_fs_list_lock); list_del(&c->list); 
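/*
 * Editor's aside, not part of the patch: the enumerated_ref counters that
 * replace percpu_ref throughout this series (enumerated_ref_get/put/tryget,
 * enumerated_ref_stop_async() above, enumerated_ref_exit() below) keep one
 * counter per *named* user, so a reference leaked across shutdown can be
 * reported by name via enumerated_ref_to_text() instead of as an anonymous
 * nonzero total. A minimal userspace sketch of the idea using C11 atomics;
 * every name below is hypothetical, not the kernel's enumerated_ref API:
 */
#include <stdatomic.h>
#include <stdio.h>

enum example_ref { EX_REF_write_super, EX_REF_sb_field_resize, EX_REF_NR };

static const char * const example_ref_names[] = { "write_super", "sb_field_resize" };

struct example_enumerated_ref {
	atomic_long refs[EX_REF_NR];	/* one counter per enumerated user */
};

static void example_ref_get(struct example_enumerated_ref *r, enum example_ref i)
{
	atomic_fetch_add(&r->refs[i], 1);
}

static void example_ref_put(struct example_enumerated_ref *r, enum example_ref i)
{
	atomic_fetch_sub(&r->refs[i], 1);
}

/* On teardown, report exactly which user still holds a reference: */
static void example_ref_to_text(struct example_enumerated_ref *r)
{
	for (int i = 0; i < EX_REF_NR; i++)
		printf("%s\t%ld\n", example_ref_names[i],
		       atomic_load(&r->refs[i]));
}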
mutex_unlock(&bch_fs_list_lock); @@ -671,11 +731,12 @@ void bch2_fs_free(struct bch_fs *c) closure_sync(&c->cl); closure_debug_destroy(&c->cl); - for (i = 0; i < c->sb.nr_devices; i++) { + for (unsigned i = 0; i < c->sb.nr_devices; i++) { struct bch_dev *ca = rcu_dereference_protected(c->devs[i], true); if (ca) { EBUG_ON(atomic_long_read(&ca->ref) != 1); + bch2_dev_io_ref_stop(ca, READ); bch2_free_super(&ca->disk_sb); bch2_dev_free(ca); } @@ -698,9 +759,10 @@ static int bch2_fs_online(struct bch_fs *c) lockdep_assert_held(&bch_fs_list_lock); - if (__bch2_uuid_to_fs(c->sb.uuid)) { + if (c->sb.multi_device && + __bch2_uuid_to_fs(c->sb.uuid)) { bch_err(c, "filesystem UUID already open"); - return -EINVAL; + return -BCH_ERR_filesystem_uuid_already_open; } ret = bch2_fs_chardev_init(c); @@ -711,14 +773,16 @@ static int bch2_fs_online(struct bch_fs *c) bch2_fs_debug_init(c); - ret = kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) ?: + ret = (c->sb.multi_device + ? kobject_add(&c->kobj, NULL, "%pU", c->sb.user_uuid.b) + : kobject_add(&c->kobj, NULL, "%s", c->name)) ?: kobject_add(&c->internal, &c->kobj, "internal") ?: kobject_add(&c->opts_dir, &c->kobj, "options") ?: #ifndef CONFIG_BCACHEFS_NO_LATENCY_ACCT kobject_add(&c->time_stats, &c->kobj, "time_stats") ?: #endif kobject_add(&c->counters_kobj, &c->kobj, "counters") ?: - bch2_opts_create_sysfs_files(&c->opts_dir); + bch2_opts_create_sysfs_files(&c->opts_dir, OPT_FS); if (ret) { bch_err(c, "error creating sysfs objects"); return ret; @@ -742,7 +806,37 @@ err: return ret; } -static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) +static int bch2_fs_init_rw(struct bch_fs *c) +{ + if (test_bit(BCH_FS_rw_init_done, &c->flags)) + return 0; + + if (!(c->btree_update_wq = alloc_workqueue("bcachefs", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || + !(c->btree_write_complete_wq = alloc_workqueue("bcachefs_btree_write_complete", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || + !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || + !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit", + WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || + !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", + WQ_FREEZABLE, 0))) + return -BCH_ERR_ENOMEM_fs_other_alloc; + + int ret = bch2_fs_btree_interior_update_init(c) ?: + bch2_fs_btree_write_buffer_init(c) ?: + bch2_fs_fs_io_buffered_init(c) ?: + bch2_fs_io_write_init(c) ?: + bch2_fs_journal_init(&c->journal); + if (ret) + return ret; + + set_bit(BCH_FS_rw_init_done, &c->flags); + return 0; +} + +static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, + bch_sb_handles *sbs) { struct bch_fs *c; struct printbuf name = PRINTBUF; @@ -755,7 +849,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto out; } - c->stdio = (void *)(unsigned long) opts.stdio; + c->stdio = (void *)(unsigned long) opts->stdio; __module_get(THIS_MODULE); @@ -779,24 +873,29 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) refcount_set(&c->ro_ref, 1); init_waitqueue_head(&c->ro_ref_wait); - spin_lock_init(&c->recovery_pass_lock); - sema_init(&c->online_fsck_mutex, 1); for (i = 0; i < BCH_TIME_STAT_NR; i++) bch2_time_stats_init(&c->times[i]); - bch2_fs_copygc_init(c); - bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); - bch2_fs_btree_iter_init_early(c); - bch2_fs_btree_interior_update_init_early(c); - bch2_fs_journal_keys_init(c); 
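/*
 * Editor's aside: a sketch of the printbuf pattern used by the name printbuf
 * declared above in bch2_fs_alloc() and consumed below. PRINTBUF declares an
 * empty heap-growing string builder; the prt_*() helpers append and may
 * allocate; allocation failure is latched in .allocation_failure rather than
 * returned from each call; printbuf_exit() frees the buffer on every path.
 * The helper name is hypothetical, the calls are the ones visible in this
 * hunk:
 */
static int example_set_fs_name(struct bch_fs *c)
{
	struct printbuf name = PRINTBUF;

	prt_str(&name, "example");	/* may allocate; failure is latched */

	int ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0;
	if (!ret)
		strscpy(c->name, name.buf, sizeof(c->name));

	printbuf_exit(&name);		/* always free, even on error */
	return ret;
}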
bch2_fs_allocator_background_init(c); bch2_fs_allocator_foreground_init(c); - bch2_fs_rebalance_init(c); - bch2_fs_quota_init(c); + bch2_fs_btree_cache_init_early(&c->btree_cache); + bch2_fs_btree_gc_init_early(c); + bch2_fs_btree_interior_update_init_early(c); + bch2_fs_btree_iter_init_early(c); + bch2_fs_btree_key_cache_init_early(&c->btree_key_cache); + bch2_fs_btree_write_buffer_init_early(c); + bch2_fs_copygc_init(c); bch2_fs_ec_init_early(c); + bch2_fs_journal_init_early(&c->journal); + bch2_fs_journal_keys_init(c); bch2_fs_move_init(c); + bch2_fs_nocow_locking_init_early(c); + bch2_fs_quota_init(c); + bch2_fs_recovery_passes_init(c); bch2_fs_sb_errors_init_early(c); + bch2_fs_snapshots_init_early(c); + bch2_fs_subvolumes_init_early(c); INIT_LIST_HEAD(&c->list); @@ -822,8 +921,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) c->journal.noflush_write_time = &c->times[BCH_TIME_journal_noflush_write]; c->journal.flush_seq_time = &c->times[BCH_TIME_journal_flush_seq]; - bch2_fs_btree_cache_init_early(&c->btree_cache); - mutex_init(&c->sectors_available_lock); ret = percpu_init_rwsem(&c->mark_lock); @@ -837,14 +934,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (ret) goto err; - pr_uuid(&name, c->sb.user_uuid.b); - ret = name.allocation_failure ? -BCH_ERR_ENOMEM_fs_name_alloc : 0; - if (ret) - goto err; - - strscpy(c->name, name.buf, sizeof(c->name)); - printbuf_exit(&name); - /* Compat: */ if (le16_to_cpu(sb->version) <= bcachefs_metadata_version_inode_v2 && !BCH_SB_JOURNAL_FLUSH_DELAY(sb)) @@ -859,7 +948,14 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) if (ret) goto err; - bch2_opts_apply(&c->opts, opts); + bch2_opts_apply(&c->opts, *opts); + + if (!IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && + c->opts.block_size > PAGE_SIZE) { + bch_err(c, "cannot mount bs > ps filesystem without CONFIG_TRANSPARENT_HUGEPAGE"); + ret = -EINVAL; + goto err; + } c->btree_key_cache_btrees |= 1U << BTREE_ID_alloc; if (c->opts.inodes_use_key_cache) @@ -875,26 +971,26 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto err; } + if (c->sb.multi_device) + pr_uuid(&name, c->sb.user_uuid.b); + else + prt_bdevname(&name, sbs->data[0].bdev); + + ret = name.allocation_failure ? 
-BCH_ERR_ENOMEM_fs_name_alloc : 0; + if (ret) + goto err; + + strscpy(c->name, name.buf, sizeof(c->name)); + printbuf_exit(&name); + iter_size = sizeof(struct sort_iter) + (btree_blocks(c) + 1) * 2 * sizeof(struct sort_iter_set); - if (!(c->btree_update_wq = alloc_workqueue("bcachefs", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_UNBOUND, 512)) || - !(c->btree_io_complete_wq = alloc_workqueue("bcachefs_btree_io", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || - !(c->copygc_wq = alloc_workqueue("bcachefs_copygc", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM|WQ_CPU_INTENSIVE, 1)) || - !(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", + if (!(c->btree_read_complete_wq = alloc_workqueue("bcachefs_btree_read_complete", WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 512)) || - !(c->btree_write_submit_wq = alloc_workqueue("bcachefs_btree_write_sumit", - WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || - !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", - WQ_FREEZABLE, 0)) || -#ifndef BCH_WRITE_REF_DEBUG - percpu_ref_init(&c->writes, bch2_writes_disabled, - PERCPU_REF_INIT_DEAD, GFP_KERNEL) || -#endif + enumerated_ref_init(&c->writes, BCH_WRITE_REF_NR, + bch2_writes_disabled) || mempool_init_kmalloc_pool(&c->fill_iter, 1, iter_size) || bioset_init(&c->btree_bio, 1, max(offsetof(struct btree_read_bio, bio), @@ -910,32 +1006,50 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts opts) goto err; } - ret = bch2_fs_counters_init(c) ?: - bch2_fs_sb_errors_init(c) ?: - bch2_io_clock_init(&c->io_clock[READ]) ?: - bch2_io_clock_init(&c->io_clock[WRITE]) ?: - bch2_fs_journal_init(&c->journal) ?: - bch2_fs_btree_iter_init(c) ?: + ret = + bch2_fs_async_obj_init(c) ?: bch2_fs_btree_cache_init(c) ?: + bch2_fs_btree_iter_init(c) ?: bch2_fs_btree_key_cache_init(&c->btree_key_cache) ?: - bch2_fs_btree_interior_update_init(c) ?: - bch2_fs_btree_gc_init(c) ?: bch2_fs_buckets_waiting_for_journal_init(c) ?: - bch2_fs_btree_write_buffer_init(c) ?: - bch2_fs_subvolumes_init(c) ?: - bch2_fs_io_read_init(c) ?: - bch2_fs_io_write_init(c) ?: - bch2_fs_nocow_locking_init(c) ?: - bch2_fs_encryption_init(c) ?: + bch2_io_clock_init(&c->io_clock[READ]) ?: + bch2_io_clock_init(&c->io_clock[WRITE]) ?: bch2_fs_compress_init(c) ?: + bch2_fs_counters_init(c) ?: bch2_fs_ec_init(c) ?: - bch2_fs_vfs_init(c) ?: + bch2_fs_encryption_init(c) ?: bch2_fs_fsio_init(c) ?: - bch2_fs_fs_io_buffered_init(c) ?: - bch2_fs_fs_io_direct_init(c); + bch2_fs_fs_io_direct_init(c) ?: + bch2_fs_io_read_init(c) ?: + bch2_fs_rebalance_init(c) ?: + bch2_fs_sb_errors_init(c) ?: + bch2_fs_vfs_init(c); if (ret) goto err; +#ifdef CONFIG_UNICODE + /* Default encoding until we can potentially have more as an option. */ + c->cf_encoding = utf8_load(BCH_FS_DEFAULT_UTF8_ENCODING); + if (IS_ERR(c->cf_encoding)) { + printk(KERN_ERR "Cannot load UTF-8 encoding for filesystem. 
Version: %u.%u.%u", + unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); + ret = -EINVAL; + goto err; + } + bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u", + unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); +#else + if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) { + printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n"); + ret = -EINVAL; + goto err; + } +#endif + for (i = 0; i < c->sb.nr_devices; i++) { if (!bch2_member_exists(c->disk_sb.sb, i)) continue; @@ -975,12 +1089,6 @@ static void print_mount_opts(struct bch_fs *c) prt_str(&p, "starting version "); bch2_version_to_text(&p, c->sb.version); - if (c->opts.read_only) { - prt_str(&p, " opts="); - first = false; - prt_printf(&p, "ro"); - } - for (i = 0; i < bch2_opts_nr; i++) { const struct bch_option *opt = &bch2_opt_table[i]; u64 v = bch2_opt_get_by_id(&c->opts, i); @@ -996,45 +1104,102 @@ static void print_mount_opts(struct bch_fs *c) bch2_opt_to_text(&p, c, c->disk_sb.sb, opt, v, OPT_SHOW_MOUNT_STYLE); } + if (c->sb.version_incompat_allowed != c->sb.version) { + prt_printf(&p, "\n allowing incompatible features above "); + bch2_version_to_text(&p, c->sb.version_incompat_allowed); + } + + if (c->opts.verbose) { + prt_printf(&p, "\n features: "); + prt_bitflags(&p, bch2_sb_features, c->sb.features); + } + bch_info(c, "%s", p.buf); printbuf_exit(&p); } +static bool bch2_fs_may_start(struct bch_fs *c) +{ + struct bch_dev *ca; + unsigned flags = 0; + + switch (c->opts.degraded) { + case BCH_DEGRADED_very: + flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; + break; + case BCH_DEGRADED_yes: + flags |= BCH_FORCE_IF_DEGRADED; + break; + default: + mutex_lock(&c->sb_lock); + for (unsigned i = 0; i < c->disk_sb.sb->nr_devices; i++) { + if (!bch2_member_exists(c->disk_sb.sb, i)) + continue; + + ca = bch2_dev_locked(c, i); + + if (!bch2_dev_is_online(ca) && + (ca->mi.state == BCH_MEMBER_STATE_rw || + ca->mi.state == BCH_MEMBER_STATE_ro)) { + mutex_unlock(&c->sb_lock); + return false; + } + } + mutex_unlock(&c->sb_lock); + break; + } + + return bch2_have_enough_devs(c, c->online_devs, flags, true); +} + int bch2_fs_start(struct bch_fs *c) { time64_t now = ktime_get_real_seconds(); - int ret; + int ret = 0; print_mount_opts(c); + if (!bch2_fs_may_start(c)) + return -BCH_ERR_insufficient_devices_to_start; + down_write(&c->state_lock); + mutex_lock(&c->sb_lock); BUG_ON(test_bit(BCH_FS_started, &c->flags)); - mutex_lock(&c->sb_lock); + if (!bch2_sb_field_get_minsize(&c->disk_sb, ext, + sizeof(struct bch_sb_field_ext) / sizeof(u64))) { + mutex_unlock(&c->sb_lock); + up_write(&c->state_lock); + ret = -BCH_ERR_ENOSPC_sb; + goto err; + } ret = bch2_sb_members_v2_init(c); if (ret) { mutex_unlock(&c->sb_lock); + up_write(&c->state_lock); goto err; } - for_each_online_member(c, ca) - bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = cpu_to_le64(now); + rcu_read_lock(); + for_each_online_member_rcu(c, ca) + bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = + cpu_to_le64(now); + rcu_read_unlock(); - struct bch_sb_field_ext *ext = - bch2_sb_field_get_minsize(&c->disk_sb, ext, sizeof(*ext) / sizeof(u64)); + /* + * Dno't write superblock yet: recovery might have to downgrade + */ mutex_unlock(&c->sb_lock); - if (!ext) { - bch_err(c, "insufficient space in superblock for sb_field_ext"); - 
ret = -BCH_ERR_ENOSPC_sb; - goto err; - } - - for_each_rw_member(c, ca) - bch2_dev_allocator_add(c, ca); + rcu_read_lock(); + for_each_online_member_rcu(c, ca) + if (ca->mi.state == BCH_MEMBER_STATE_rw) + bch2_dev_allocator_add(c, ca); + rcu_read_unlock(); bch2_recalc_capacity(c); + up_write(&c->state_lock); c->recovery_task = current; ret = BCH_SB_INITIALIZED(c->disk_sb.sb) @@ -1045,35 +1210,30 @@ int bch2_fs_start(struct bch_fs *c) if (ret) goto err; - ret = bch2_opts_check_may_set(c); + ret = bch2_opts_hooks_pre_set(c); if (ret) goto err; if (bch2_fs_init_fault("fs_start")) { - bch_err(c, "fs_start fault injected"); - ret = -EINVAL; + ret = -BCH_ERR_injected_fs_start; goto err; } set_bit(BCH_FS_started, &c->flags); + wake_up(&c->ro_ref_wait); - if (c->opts.read_only) { + down_write(&c->state_lock); + if (c->opts.read_only) bch2_fs_read_only(c); - } else { - ret = !test_bit(BCH_FS_rw, &c->flags) - ? bch2_fs_read_write(c) - : bch2_fs_read_write_late(c); - if (ret) - goto err; - } + else if (!test_bit(BCH_FS_rw, &c->flags)) + ret = bch2_fs_read_write(c); + up_write(&c->state_lock); - ret = 0; err: if (ret) bch_err_msg(c, ret, "starting filesystem"); else bch_verbose(c, "done starting filesystem"); - up_write(&c->state_lock); return ret; } @@ -1182,6 +1342,18 @@ static int bch2_dev_in_fs(struct bch_sb_handle *fs, /* Device startup/shutdown: */ +static void bch2_dev_io_ref_stop(struct bch_dev *ca, int rw) +{ + if (rw == READ) + clear_bit(ca->dev_idx, ca->fs->online_devs.d); + + if (!enumerated_ref_is_zero(&ca->io_ref[rw])) + enumerated_ref_stop(&ca->io_ref[rw], + rw == READ + ? bch2_dev_read_refs + : bch2_dev_write_refs); +} + static void bch2_dev_release(struct kobject *kobj) { struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); @@ -1191,6 +1363,9 @@ static void bch2_dev_release(struct kobject *kobj) static void bch2_dev_free(struct bch_dev *ca) { + WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE])); + WARN_ON(!enumerated_ref_is_zero(&ca->io_ref[READ])); + cancel_work_sync(&ca->io_error_work); bch2_dev_unlink(ca); @@ -1198,6 +1373,9 @@ static void bch2_dev_free(struct bch_dev *ca) if (ca->kobj.state_in_sysfs) kobject_del(&ca->kobj); + bch2_bucket_bitmap_free(&ca->bucket_backpointer_mismatch); + bch2_bucket_bitmap_free(&ca->bucket_backpointer_empty); + bch2_free_super(&ca->disk_sb); bch2_dev_allocator_background_exit(ca); bch2_dev_journal_exit(ca); @@ -1209,7 +1387,8 @@ static void bch2_dev_free(struct bch_dev *ca) bch2_time_stats_quantiles_exit(&ca->io_latency[WRITE]); bch2_time_stats_quantiles_exit(&ca->io_latency[READ]); - percpu_ref_exit(&ca->io_ref); + enumerated_ref_exit(&ca->io_ref[WRITE]); + enumerated_ref_exit(&ca->io_ref[READ]); #ifndef CONFIG_BCACHEFS_DEBUG percpu_ref_exit(&ca->ref); #endif @@ -1221,14 +1400,12 @@ static void __bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca) lockdep_assert_held(&c->state_lock); - if (percpu_ref_is_zero(&ca->io_ref)) + if (enumerated_ref_is_zero(&ca->io_ref[READ])) return; __bch2_dev_read_only(c, ca); - reinit_completion(&ca->io_ref_completion); - percpu_ref_kill(&ca->io_ref); - wait_for_completion(&ca->io_ref_completion); + bch2_dev_io_ref_stop(ca, READ); bch2_dev_unlink(ca); @@ -1245,13 +1422,6 @@ static void bch2_dev_ref_complete(struct percpu_ref *ref) } #endif -static void bch2_dev_io_ref_complete(struct percpu_ref *ref) -{ - struct bch_dev *ca = container_of(ref, struct bch_dev, io_ref); - - complete(&ca->io_ref_completion); -} - static void bch2_dev_unlink(struct bch_dev *ca) { struct kobject *b; @@ -1280,8 +1450,8 @@ 
static int bch2_dev_sysfs_online(struct bch_fs *c, struct bch_dev *ca) return 0; if (!ca->kobj.state_in_sysfs) { - ret = kobject_add(&ca->kobj, &c->kobj, - "dev-%u", ca->dev_idx); + ret = kobject_add(&ca->kobj, &c->kobj, "dev-%u", ca->dev_idx) ?: + bch2_opts_create_sysfs_files(&ca->kobj, OPT_DEVICE); if (ret) return ret; } @@ -1313,7 +1483,6 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, kobject_init(&ca->kobj, &bch2_dev_ktype); init_completion(&ca->ref_completion); - init_completion(&ca->io_ref_completion); INIT_WORK(&ca->io_error_work, bch2_io_error_work); @@ -1337,10 +1506,13 @@ static struct bch_dev *__bch2_dev_alloc(struct bch_fs *c, atomic_long_set(&ca->ref, 1); #endif + mutex_init(&ca->bucket_backpointer_mismatch.lock); + mutex_init(&ca->bucket_backpointer_empty.lock); + bch2_dev_allocator_background_init(ca); - if (percpu_ref_init(&ca->io_ref, bch2_dev_io_ref_complete, - PERCPU_REF_INIT_DEAD, GFP_KERNEL) || + if (enumerated_ref_init(&ca->io_ref[READ], BCH_DEV_READ_REF_NR, NULL) || + enumerated_ref_init(&ca->io_ref[WRITE], BCH_DEV_WRITE_REF_NR, NULL) || !(ca->sb_read_scratch = kmalloc(BCH_SB_READ_SCRATCH_BUF_SIZE, GFP_KERNEL)) || bch2_dev_buckets_alloc(c, ca) || !(ca->io_done = alloc_percpu(*ca->io_done))) @@ -1357,7 +1529,9 @@ static void bch2_dev_attach(struct bch_fs *c, struct bch_dev *ca, { ca->dev_idx = dev_idx; __set_bit(ca->dev_idx, ca->self.d); - scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); + + if (!ca->name[0]) + scnprintf(ca->name, sizeof(ca->name), "dev-%u", dev_idx); ca->fs = c; rcu_assign_pointer(c->devs[ca->dev_idx], ca); @@ -1402,19 +1576,32 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) return -BCH_ERR_device_size_too_small; } - BUG_ON(!percpu_ref_is_zero(&ca->io_ref)); + BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[READ])); + BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[WRITE])); ret = bch2_dev_journal_init(ca, sb->sb); if (ret) return ret; + struct printbuf name = PRINTBUF; + prt_bdevname(&name, sb->bdev); + strscpy(ca->name, name.buf, sizeof(ca->name)); + printbuf_exit(&name); + /* Commit: */ ca->disk_sb = *sb; memset(sb, 0, sizeof(*sb)); + /* + * Stash pointer to the filesystem for blk_holder_ops - note that once + * attached to a filesystem, we will always close the block device + * before tearing down the filesystem object. + */ + ca->disk_sb.holder->c = ca->fs; + ca->dev = ca->disk_sb.bdev->bd_dev; - percpu_ref_reinit(&ca->io_ref); + enumerated_ref_start(&ca->io_ref[READ]); return 0; } @@ -1438,18 +1625,11 @@ static int bch2_dev_attach_bdev(struct bch_fs *c, struct bch_sb_handle *sb) if (ret) return ret; - bch2_dev_sysfs_online(c, ca); - - struct printbuf name = PRINTBUF; - prt_bdevname(&name, ca->disk_sb.bdev); + set_bit(ca->dev_idx, c->online_devs.d); - if (c->sb.nr_devices == 1) - strscpy(c->name, name.buf, sizeof(c->name)); - strscpy(ca->name, name.buf, sizeof(ca->name)); - - printbuf_exit(&name); + bch2_dev_sysfs_online(c, ca); - rebalance_wakeup(c); + bch2_rebalance_wakeup(c); return 0; } @@ -1499,7 +1679,7 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, return true; /* do we have enough devices to read from? 
*/ - new_online_devs = bch2_online_devs(c); + new_online_devs = c->online_devs; __clear_bit(ca->dev_idx, new_online_devs.d); return bch2_have_enough_devs(c, new_online_devs, flags, false); @@ -1508,42 +1688,10 @@ bool bch2_dev_state_allowed(struct bch_fs *c, struct bch_dev *ca, } } -static bool bch2_fs_may_start(struct bch_fs *c) -{ - struct bch_dev *ca; - unsigned i, flags = 0; - - if (c->opts.very_degraded) - flags |= BCH_FORCE_IF_DEGRADED|BCH_FORCE_IF_LOST; - - if (c->opts.degraded) - flags |= BCH_FORCE_IF_DEGRADED; - - if (!c->opts.degraded && - !c->opts.very_degraded) { - mutex_lock(&c->sb_lock); - - for (i = 0; i < c->disk_sb.sb->nr_devices; i++) { - if (!bch2_member_exists(c->disk_sb.sb, i)) - continue; - - ca = bch2_dev_locked(c, i); - - if (!bch2_dev_is_online(ca) && - (ca->mi.state == BCH_MEMBER_STATE_rw || - ca->mi.state == BCH_MEMBER_STATE_ro)) { - mutex_unlock(&c->sb_lock); - return false; - } - } - mutex_unlock(&c->sb_lock); - } - - return bch2_have_enough_devs(c, bch2_online_devs(c), flags, true); -} - static void __bch2_dev_read_only(struct bch_fs *c, struct bch_dev *ca) { + bch2_dev_io_ref_stop(ca, WRITE); + /* * The allocator thread itself allocates btree nodes, so stop it first: */ @@ -1560,6 +1708,10 @@ static void __bch2_dev_read_write(struct bch_fs *c, struct bch_dev *ca) bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); + + if (enumerated_ref_is_zero(&ca->io_ref[WRITE])) + enumerated_ref_start(&ca->io_ref[WRITE]); + bch2_dev_do_discards(ca); } @@ -1589,7 +1741,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, if (new_state == BCH_MEMBER_STATE_rw) __bch2_dev_read_write(c, ca); - rebalance_wakeup(c); + bch2_rebalance_wakeup(c); return ret; } @@ -1612,6 +1764,8 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) { struct bch_member *m; unsigned dev_idx = ca->dev_idx, data; + bool fast_device_removal = !bch2_request_incompat_feature(c, + bcachefs_metadata_version_fast_device_removal); int ret; down_write(&c->state_lock); @@ -1630,11 +1784,25 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) __bch2_dev_read_only(c, ca); - ret = bch2_dev_data_drop(c, ca->dev_idx, flags); - bch_err_msg(ca, ret, "bch2_dev_data_drop()"); + ret = fast_device_removal + ? 
bch2_dev_data_drop_by_backpointers(c, ca->dev_idx, flags) + : (bch2_dev_data_drop(c, ca->dev_idx, flags) ?: + bch2_dev_remove_stripes(c, ca->dev_idx, flags)); if (ret) goto err; + /* Check if device still has data before blowing away alloc info */ + struct bch_dev_usage usage = bch2_dev_usage_read(ca); + for (unsigned i = 0; i < BCH_DATA_NR; i++) + if (!data_type_is_empty(i) && + !data_type_is_hidden(i) && + usage.buckets[i]) { + bch_err(ca, "Remove failed: still has data (%s, %llu buckets)", + __bch2_data_types[i], usage.buckets[i]); + ret = -EBUSY; + goto err; + } + ret = bch2_dev_remove_alloc(c, ca); bch_err_msg(ca, ret, "bch2_dev_remove_alloc()"); if (ret) @@ -1698,7 +1866,11 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) */ mutex_lock(&c->sb_lock); m = bch2_members_v2_get_mut(c->disk_sb.sb, dev_idx); - memset(&m->uuid, 0, sizeof(m->uuid)); + + if (fast_device_removal) + m->uuid = BCH_SB_MEMBER_DELETED_UUID; + else + memset(&m->uuid, 0, sizeof(m->uuid)); bch2_write_super(c); @@ -1706,8 +1878,9 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) up_write(&c->state_lock); return 0; err: - if (ca->mi.state == BCH_MEMBER_STATE_rw && - !percpu_ref_is_zero(&ca->io_ref)) + if (test_bit(BCH_FS_rw, &c->flags) && + ca->mi.state == BCH_MEMBER_STATE_rw && + !enumerated_ref_is_zero(&ca->io_ref[READ])) __bch2_dev_read_write(c, ca); up_write(&c->state_lock); return ret; @@ -1717,11 +1890,11 @@ err: int bch2_dev_add(struct bch_fs *c, const char *path) { struct bch_opts opts = bch2_opts_empty(); - struct bch_sb_handle sb; + struct bch_sb_handle sb = {}; struct bch_dev *ca = NULL; struct printbuf errbuf = PRINTBUF; struct printbuf label = PRINTBUF; - int ret; + int ret = 0; ret = bch2_read_super(path, &opts, &sb); bch_err_msg(c, ret, "reading super"); @@ -1738,6 +1911,20 @@ int bch2_dev_add(struct bch_fs *c, const char *path) } } + if (list_empty(&c->list)) { + mutex_lock(&bch_fs_list_lock); + if (__bch2_uuid_to_fs(c->sb.uuid)) + ret = -BCH_ERR_filesystem_uuid_already_open; + else + list_add(&c->list, &bch_fs_list); + mutex_unlock(&bch_fs_list_lock); + + if (ret) { + bch_err(c, "filesystem UUID already open"); + goto err; + } + } + ret = bch2_dev_may_add(sb.sb, c); if (ret) goto err; @@ -1754,6 +1941,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) down_write(&c->state_lock); mutex_lock(&c->sb_lock); + SET_BCH_SB_MULTI_DEVICE(c->disk_sb.sb, true); ret = bch2_sb_from_fs(c, ca); bch_err_msg(c, ret, "setting up new superblock"); @@ -1769,6 +1957,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) goto err_unlock; } unsigned dev_idx = ret; + ret = 0; /* success: */ @@ -1788,30 +1977,36 @@ int bch2_dev_add(struct bch_fs *c, const char *path) bch2_write_super(c); mutex_unlock(&c->sb_lock); - ret = bch2_dev_usage_init(ca, false); - if (ret) - goto err_late; + if (test_bit(BCH_FS_started, &c->flags)) { + ret = bch2_dev_usage_init(ca, false); + if (ret) + goto err_late; - ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); - bch_err_msg(ca, ret, "marking new superblock"); - if (ret) - goto err_late; + ret = bch2_trans_mark_dev_sb(c, ca, BTREE_TRIGGER_transactional); + bch_err_msg(ca, ret, "marking new superblock"); + if (ret) + goto err_late; - ret = bch2_fs_freespace_init(c); - bch_err_msg(ca, ret, "initializing free space"); - if (ret) - goto err_late; + ret = bch2_fs_freespace_init(c); + bch_err_msg(ca, ret, "initializing free space"); + if (ret) + goto err_late; - if (ca->mi.state == BCH_MEMBER_STATE_rw) - __bch2_dev_read_write(c, ca); 
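/*
 * Editor's aside: the fast_device_removal logic above shows the caller
 * pattern for bch2_request_incompat_feature() now that it returns an int:
 * 0 means the feature may be used (and the request is persisted in the
 * superblock), -BCH_ERR_may_not_use_incompat_feature means the caller must
 * take the compatible fallback. Sketch only, mirroring the calls above;
 * the wrapper name is hypothetical:
 */
static int example_remove_data(struct bch_fs *c, unsigned dev_idx, int flags)
{
	/* zero return: feature granted and recorded on disk */
	bool fast = !bch2_request_incompat_feature(c,
			bcachefs_metadata_version_fast_device_removal);

	return fast
		? bch2_dev_data_drop_by_backpointers(c, dev_idx, flags)
		: (bch2_dev_data_drop(c, dev_idx, flags) ?:
		   bch2_dev_remove_stripes(c, dev_idx, flags));
}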
+ if (ca->mi.state == BCH_MEMBER_STATE_rw) + __bch2_dev_read_write(c, ca); - ret = bch2_dev_journal_alloc(ca, false); - bch_err_msg(c, ret, "allocating journal"); - if (ret) - goto err_late; + ret = bch2_dev_journal_alloc(ca, false); + bch_err_msg(c, ret, "allocating journal"); + if (ret) + goto err_late; + } up_write(&c->state_lock); - return 0; +out: + printbuf_exit(&label); + printbuf_exit(&errbuf); + bch_err_fn(c, ret); + return ret; err_unlock: mutex_unlock(&c->sb_lock); @@ -1820,10 +2015,7 @@ err: if (ca) bch2_dev_free(ca); bch2_free_super(&sb); - printbuf_exit(&label); - printbuf_exit(&errbuf); - bch_err_fn(c, ret); - return ret; + goto out; err_late: up_write(&c->state_lock); ca = NULL; @@ -1918,6 +2110,18 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) return 0; } +static int __bch2_dev_resize_alloc(struct bch_dev *ca, u64 old_nbuckets, u64 new_nbuckets) +{ + struct bch_fs *c = ca->fs; + u64 v[3] = { new_nbuckets - old_nbuckets, 0, 0 }; + + return bch2_trans_commit_do(ca->fs, NULL, NULL, 0, + bch2_disk_accounting_mod2(trans, false, v, dev_data_type, + .dev = ca->dev_idx, + .data_type = BCH_DATA_free)) ?: + bch2_dev_freespace_init(c, ca, old_nbuckets, new_nbuckets); +} + int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) { struct bch_member *m; @@ -1965,16 +2169,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) mutex_unlock(&c->sb_lock); if (ca->mi.freespace_initialized) { - struct disk_accounting_pos acc = { - .type = BCH_DISK_ACCOUNTING_dev_data_type, - .dev_data_type.dev = ca->dev_idx, - .dev_data_type.data_type = BCH_DATA_free, - }; - u64 v[3] = { nbuckets - old_nbuckets, 0, 0 }; - - ret = bch2_trans_commit_do(ca->fs, NULL, NULL, 0, - bch2_disk_accounting_mod(trans, &acc, v, ARRAY_SIZE(v), false)) ?: - bch2_dev_freespace_init(c, ca, old_nbuckets, nbuckets); + ret = __bch2_dev_resize_alloc(ca, old_nbuckets, nbuckets); if (ret) goto err; } @@ -1985,6 +2180,49 @@ err: return ret; } +int bch2_fs_resize_on_mount(struct bch_fs *c) +{ + for_each_online_member(c, ca, BCH_DEV_READ_REF_fs_resize_on_mount) { + u64 old_nbuckets = ca->mi.nbuckets; + u64 new_nbuckets = div64_u64(get_capacity(ca->disk_sb.bdev->bd_disk), + ca->mi.bucket_size); + + if (ca->mi.resize_on_mount && + new_nbuckets > ca->mi.nbuckets) { + bch_info(ca, "resizing to size %llu", new_nbuckets * ca->mi.bucket_size); + int ret = bch2_dev_buckets_resize(c, ca, new_nbuckets); + bch_err_fn(ca, ret); + if (ret) { + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_fs_resize_on_mount); + up_write(&c->state_lock); + return ret; + } + + mutex_lock(&c->sb_lock); + struct bch_member *m = + bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx); + m->nbuckets = cpu_to_le64(new_nbuckets); + SET_BCH_MEMBER_RESIZE_ON_MOUNT(m, false); + + c->disk_sb.sb->features[0] &= ~cpu_to_le64(BIT_ULL(BCH_FEATURE_small_image)); + bch2_write_super(c); + mutex_unlock(&c->sb_lock); + + if (ca->mi.freespace_initialized) { + ret = __bch2_dev_resize_alloc(ca, old_nbuckets, new_nbuckets); + if (ret) { + enumerated_ref_put(&ca->io_ref[READ], + BCH_DEV_READ_REF_fs_resize_on_mount); + up_write(&c->state_lock); + return ret; + } + } + } + } + return 0; +} + /* return with ref on ca->ref: */ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) { @@ -1997,6 +2235,114 @@ struct bch_dev *bch2_dev_lookup(struct bch_fs *c, const char *name) return ERR_PTR(-BCH_ERR_ENOENT_dev_not_found); } +/* blk_holder_ops: */ + +static struct bch_fs *bdev_get_fs(struct block_device *bdev) + 
__releases(&bdev->bd_holder_lock) +{ + struct bch_sb_handle_holder *holder = bdev->bd_holder; + struct bch_fs *c = holder->c; + + if (c && !bch2_ro_ref_tryget(c)) + c = NULL; + + mutex_unlock(&bdev->bd_holder_lock); + + if (c) + wait_event(c->ro_ref_wait, test_bit(BCH_FS_started, &c->flags)); + return c; +} + +/* returns with ref on ca->ref */ +static struct bch_dev *bdev_to_bch_dev(struct bch_fs *c, struct block_device *bdev) +{ + for_each_member_device(c, ca) + if (ca->disk_sb.bdev == bdev) + return ca; + return NULL; +} + +static void bch2_fs_bdev_mark_dead(struct block_device *bdev, bool surprise) +{ + struct bch_fs *c = bdev_get_fs(bdev); + if (!c) + return; + + struct super_block *sb = c->vfs_sb; + if (sb) { + /* + * Not necessary, c->ro_ref guards against the filesystem being + * unmounted - we only take this to avoid a warning in + * sync_filesystem: + */ + down_read(&sb->s_umount); + } + + down_write(&c->state_lock); + struct bch_dev *ca = bdev_to_bch_dev(c, bdev); + if (!ca) + goto unlock; + + bool dev = bch2_dev_state_allowed(c, ca, + BCH_MEMBER_STATE_failed, + BCH_FORCE_IF_DEGRADED); + + if (!dev && sb) { + if (!surprise) + sync_filesystem(sb); + shrink_dcache_sb(sb); + evict_inodes(sb); + } + + struct printbuf buf = PRINTBUF; + __bch2_log_msg_start(ca->name, &buf); + + prt_printf(&buf, "offline from block layer"); + + if (dev) { + __bch2_dev_offline(c, ca); + } else { + bch2_journal_flush(&c->journal); + bch2_fs_emergency_read_only2(c, &buf); + } + + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + + bch2_dev_put(ca); +unlock: + if (sb) + up_read(&sb->s_umount); + up_write(&c->state_lock); + bch2_ro_ref_put(c); +} + +static void bch2_fs_bdev_sync(struct block_device *bdev) +{ + struct bch_fs *c = bdev_get_fs(bdev); + if (!c) + return; + + struct super_block *sb = c->vfs_sb; + if (sb) { + /* + * Not necessary, c->ro_ref guards against the filesystem being + * unmounted - we only take this to avoid a warning in + * sync_filesystem: + */ + down_read(&sb->s_umount); + sync_filesystem(sb); + up_read(&sb->s_umount); + } + + bch2_ro_ref_put(c); +} + +const struct blk_holder_ops bch2_sb_handle_bdev_ops = { + .mark_dead = bch2_fs_bdev_mark_dead, + .sync = bch2_fs_bdev_sync, +}; + /* Filesystem open: */ static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) @@ -2005,10 +2351,10 @@ static inline int sb_cmp(struct bch_sb *l, struct bch_sb *r) cmp_int(le64_to_cpu(l->write_time), le64_to_cpu(r->write_time)); } -struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, - struct bch_opts opts) +struct bch_fs *bch2_fs_open(darray_const_str *devices, + struct bch_opts *opts) { - DARRAY(struct bch_sb_handle) sbs = { 0 }; + bch_sb_handles sbs = {}; struct bch_fs *c = NULL; struct bch_sb_handle *best = NULL; struct printbuf errbuf = PRINTBUF; @@ -2017,26 +2363,26 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, if (!try_module_get(THIS_MODULE)) return ERR_PTR(-ENODEV); - if (!nr_devices) { + if (!devices->nr) { ret = -EINVAL; goto err; } - ret = darray_make_room(&sbs, nr_devices); + ret = darray_make_room(&sbs, devices->nr); if (ret) goto err; - for (unsigned i = 0; i < nr_devices; i++) { + darray_for_each(*devices, i) { struct bch_sb_handle sb = { NULL }; - ret = bch2_read_super(devices[i], &opts, &sb); + ret = bch2_read_super(*i, opts, &sb); if (ret) goto err; BUG_ON(darray_push(&sbs, sb)); } - if (opts.nochanges && !opts.read_only) { + if (opts->nochanges && !opts->read_only) { ret = -BCH_ERR_erofs_nochanges; goto err_print; } @@ 
-2046,7 +2392,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, best = sb; darray_for_each_reverse(sbs, sb) { - ret = bch2_dev_in_fs(best, sb, &opts); + ret = bch2_dev_in_fs(best, sb, opts); if (ret == -BCH_ERR_device_has_been_removed || ret == -BCH_ERR_device_splitbrain) { @@ -2061,7 +2407,7 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, goto err_print; } - c = bch2_fs_alloc(best->sb, opts); + c = bch2_fs_alloc(best->sb, opts, &sbs); ret = PTR_ERR_OR_ZERO(c); if (ret) goto err; @@ -2076,11 +2422,6 @@ struct bch_fs *bch2_fs_open(char * const *devices, unsigned nr_devices, } up_write(&c->state_lock); - if (!bch2_fs_may_start(c)) { - ret = -BCH_ERR_insufficient_devices_to_start; - goto err_print; - } - if (!c->opts.nostart) { ret = bch2_fs_start(c); if (ret) @@ -2095,7 +2436,7 @@ out: return c; err_print: pr_err("bch_fs_open err opening %s: %s", - devices[0], bch2_err_str(ret)); + devices->data[0], bch2_err_str(ret)); err: if (!IS_ERR_OR_NULL(c)) bch2_fs_stop(c); @@ -2132,16 +2473,52 @@ err: return -ENOMEM; } -#define BCH_DEBUG_PARAM(name, description) \ - bool bch2_##name; \ - module_param_named(name, bch2_##name, bool, 0644); \ +#define BCH_DEBUG_PARAM(name, description) DEFINE_STATIC_KEY_FALSE(bch2_##name); +BCH_DEBUG_PARAMS_ALL() +#undef BCH_DEBUG_PARAM + +static int bch2_param_set_static_key_t(const char *val, const struct kernel_param *kp) +{ + /* Match bool exactly, by re-using it. */ + struct static_key *key = kp->arg; + struct kernel_param boolkp = *kp; + bool v; + int ret; + + boolkp.arg = &v; + + ret = param_set_bool(val, &boolkp); + if (ret) + return ret; + if (v) + static_key_enable(key); + else + static_key_disable(key); + return 0; +} + +static int bch2_param_get_static_key_t(char *buffer, const struct kernel_param *kp) +{ + struct static_key *key = kp->arg; + return sprintf(buffer, "%c\n", static_key_enabled(key) ? 
'Y' : 'N'); +} + +static const struct kernel_param_ops bch2_param_ops_static_key_t = { + .flags = KERNEL_PARAM_OPS_FL_NOARG, + .set = bch2_param_set_static_key_t, + .get = bch2_param_get_static_key_t, +}; + +#define BCH_DEBUG_PARAM(name, description) \ + module_param_cb(name, &bch2_param_ops_static_key_t, &bch2_##name.key, 0644);\ + __MODULE_PARM_TYPE(name, "static_key_t"); \ MODULE_PARM_DESC(name, description); BCH_DEBUG_PARAMS() #undef BCH_DEBUG_PARAM __maybe_unused static unsigned bch2_metadata_version = bcachefs_metadata_version_current; -module_param_named(version, bch2_metadata_version, uint, 0400); +module_param_named(version, bch2_metadata_version, uint, 0444); module_exit(bcachefs_exit); module_init(bcachefs_init); diff --git a/fs/bcachefs/super.h b/fs/bcachefs/super.h index 04f8287eff5c..dc52f06cb2b9 100644 --- a/fs/bcachefs/super.h +++ b/fs/bcachefs/super.h @@ -9,6 +9,9 @@ #include <linux/math64.h> extern const char * const bch2_fs_flag_strs[]; +extern const char * const bch2_write_refs[]; +extern const char * const bch2_dev_read_refs[]; +extern const char * const bch2_dev_write_refs[]; struct bch_fs *bch2_dev_to_fs(dev_t); struct bch_fs *bch2_uuid_to_fs(__uuid_t); @@ -29,17 +32,23 @@ int bch2_dev_resize(struct bch_fs *, struct bch_dev *, u64); struct bch_dev *bch2_dev_lookup(struct bch_fs *, const char *); bool bch2_fs_emergency_read_only(struct bch_fs *); +bool bch2_fs_emergency_read_only2(struct bch_fs *, struct printbuf *); + bool bch2_fs_emergency_read_only_locked(struct bch_fs *); void bch2_fs_read_only(struct bch_fs *); int bch2_fs_read_write(struct bch_fs *); int bch2_fs_read_write_early(struct bch_fs *); +int bch2_fs_resize_on_mount(struct bch_fs *); + void __bch2_fs_stop(struct bch_fs *); void bch2_fs_free(struct bch_fs *); void bch2_fs_stop(struct bch_fs *); int bch2_fs_start(struct bch_fs *); -struct bch_fs *bch2_fs_open(char * const *, unsigned, struct bch_opts); +struct bch_fs *bch2_fs_open(darray_const_str *, struct bch_opts *); + +extern const struct blk_holder_ops bch2_sb_handle_bdev_ops; #endif /* _BCACHEFS_SUPER_H */ diff --git a/fs/bcachefs/super_types.h b/fs/bcachefs/super_types.h index 368a63d938cf..3a899f799d1d 100644 --- a/fs/bcachefs/super_types.h +++ b/fs/bcachefs/super_types.h @@ -2,13 +2,19 @@ #ifndef _BCACHEFS_SUPER_TYPES_H #define _BCACHEFS_SUPER_TYPES_H +struct bch_fs; + +struct bch_sb_handle_holder { + struct bch_fs *c; +}; + struct bch_sb_handle { struct bch_sb *sb; struct file *s_bdev_file; struct block_device *bdev; char *sb_name; struct bio *bio; - void *holder; + struct bch_sb_handle_holder *holder; size_t buffer_size; blk_mode_t mode; unsigned have_layout:1; diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index a7eb1f511484..1a55196d69f1 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -25,6 +25,7 @@ #include "disk_accounting.h" #include "disk_groups.h" #include "ec.h" +#include "enumerated_ref.h" #include "inode.h" #include "journal.h" #include "journal_reclaim.h" @@ -34,6 +35,7 @@ #include "nocow_locking.h" #include "opts.h" #include "rebalance.h" +#include "recovery_passes.h" #include "replicas.h" #include "super-io.h" #include "tests.h" @@ -145,16 +147,17 @@ write_attribute(trigger_journal_flush); write_attribute(trigger_journal_writes); write_attribute(trigger_btree_cache_shrink); write_attribute(trigger_btree_key_cache_shrink); +write_attribute(trigger_btree_updates); write_attribute(trigger_freelist_wakeup); +write_attribute(trigger_recalc_capacity); +write_attribute(trigger_delete_dead_snapshots); 
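/*
 * Editor's aside: with the param ops above, each BCH_DEBUG_PARAM() is still
 * toggled like a bool through /sys/module/bcachefs/parameters/<name>, but a
 * check site now compiles to a patched branch with no memory load while the
 * param is off. A check site then looks like this sketch; the param and
 * helper names are hypothetical:
 */
DEFINE_STATIC_KEY_FALSE(bch2_example_debug_check);

static void example_expensive_verify(struct bch_fs *c);	/* hypothetical slow path */

static inline void example_check_site(struct bch_fs *c)
{
	if (static_branch_unlikely(&bch2_example_debug_check))
		example_expensive_verify(c);
}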
read_attribute(gc_gens_pos); read_attribute(uuid); read_attribute(minor); read_attribute(flags); -read_attribute(bucket_size); read_attribute(first_bucket); read_attribute(nbuckets); -rw_attribute(durability); read_attribute(io_done); read_attribute(io_errors); write_attribute(io_errors_reset); @@ -173,31 +176,13 @@ read_attribute(journal_debug); read_attribute(btree_cache); read_attribute(btree_key_cache); read_attribute(btree_reserve_cache); -read_attribute(stripes_heap); read_attribute(open_buckets); read_attribute(open_buckets_partial); -read_attribute(write_points); read_attribute(nocow_lock_table); -#ifdef BCH_WRITE_REF_DEBUG +read_attribute(read_refs); read_attribute(write_refs); -static const char * const bch2_write_refs[] = { -#define x(n) #n, - BCH_WRITE_REFS() -#undef x - NULL -}; - -static void bch2_write_refs_to_text(struct printbuf *out, struct bch_fs *c) -{ - bch2_printbuf_tabstop_push(out, 24); - - for (unsigned i = 0; i < ARRAY_SIZE(c->writes); i++) - prt_printf(out, "%s\t%li\n", bch2_write_refs[i], atomic_long_read(&c->writes[i])); -} -#endif - read_attribute(internal_uuid); read_attribute(disk_groups); @@ -209,14 +194,14 @@ read_attribute(usage_base); BCH_PERSISTENT_COUNTERS() #undef x -rw_attribute(discard); -read_attribute(state); rw_attribute(label); read_attribute(copy_gc_wait); sysfs_pd_controller_attribute(rebalance); read_attribute(rebalance_status); +read_attribute(snapshot_delete_status); +read_attribute(recovery_status); read_attribute(new_stripes); @@ -262,10 +247,8 @@ static int bch2_compression_stats_to_text(struct printbuf *out, struct bch_fs *c prt_printf(out, "type\tcompressed\runcompressed\raverage extent size\r\n"); for (unsigned i = 1; i < BCH_COMPRESSION_TYPE_NR; i++) { - struct disk_accounting_pos a = { - .type = BCH_DISK_ACCOUNTING_compression, - .compression.type = i, - }; + struct disk_accounting_pos a; + disk_accounting_key_init(a, compression, .type = i); struct bpos p = disk_accounting_pos_to_bpos(&a); u64 v[3]; bch2_accounting_mem_read(c, p, v, ARRAY_SIZE(v)); @@ -341,6 +324,12 @@ SHOW(bch2_fs) if (attr == &sysfs_rebalance_status) bch2_rebalance_status_to_text(out, c); + if (attr == &sysfs_snapshot_delete_status) + bch2_snapshot_delete_status_to_text(out, c); + + if (attr == &sysfs_recovery_status) + bch2_recovery_pass_status_to_text(out, c); + /* Debugging: */ if (attr == &sysfs_journal_debug) @@ -355,18 +344,12 @@ SHOW(bch2_fs) if (attr == &sysfs_btree_reserve_cache) bch2_btree_reserve_cache_to_text(out, c); - if (attr == &sysfs_stripes_heap) - bch2_stripes_heap_to_text(out, c); - if (attr == &sysfs_open_buckets) bch2_open_buckets_to_text(out, c, NULL); if (attr == &sysfs_open_buckets_partial) bch2_open_buckets_partial_to_text(out, c); - if (attr == &sysfs_write_points) - bch2_write_points_to_text(out, c); - if (attr == &sysfs_compression_stats) bch2_compression_stats_to_text(out, c); @@ -382,10 +365,8 @@ SHOW(bch2_fs) if (attr == &sysfs_moving_ctxts) bch2_fs_moving_ctxts_to_text(out, c); -#ifdef BCH_WRITE_REF_DEBUG if (attr == &sysfs_write_refs) - bch2_write_refs_to_text(out, c); -#endif + enumerated_ref_to_text(out, &c->writes, bch2_write_refs); if (attr == &sysfs_nocow_lock_table) bch2_nocow_locks_to_text(out, &c->nocow_locks); @@ -415,7 +396,10 @@ STORE(bch2_fs) /* Debugging: */ - if (!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs)) + if (attr == &sysfs_trigger_btree_updates) + queue_work(c->btree_interior_update_worker, &c->btree_interior_update_work); + + if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs)) return -EROFS; if 
(attr == &sysfs_trigger_btree_cache_shrink) { @@ -455,6 +439,15 @@ STORE(bch2_fs) if (attr == &sysfs_trigger_freelist_wakeup) closure_wake_up(&c->freelist_wait); + if (attr == &sysfs_trigger_recalc_capacity) { + down_read(&c->state_lock); + bch2_recalc_capacity(c); + up_read(&c->state_lock); + } + + if (attr == &sysfs_trigger_delete_dead_snapshots) + __bch2_delete_dead_snapshots(c); + #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -475,7 +468,7 @@ STORE(bch2_fs) size = ret; } #endif - bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs); return size; } SYSFS_OPS(bch2_fs); @@ -486,6 +479,8 @@ struct attribute *bch2_fs_files[] = { &sysfs_btree_write_stats, &sysfs_rebalance_status, + &sysfs_snapshot_delete_status, + &sysfs_recovery_status, &sysfs_compression_stats, @@ -566,13 +561,9 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_btree_key_cache, &sysfs_btree_reserve_cache, &sysfs_new_stripes, - &sysfs_stripes_heap, &sysfs_open_buckets, &sysfs_open_buckets_partial, - &sysfs_write_points, -#ifdef BCH_WRITE_REF_DEBUG &sysfs_write_refs, -#endif &sysfs_nocow_lock_table, &sysfs_io_timers_read, &sysfs_io_timers_write, @@ -584,7 +575,10 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_journal_writes, &sysfs_trigger_btree_cache_shrink, &sysfs_trigger_btree_key_cache_shrink, + &sysfs_trigger_btree_updates, &sysfs_trigger_freelist_wakeup, + &sysfs_trigger_recalc_capacity, + &sysfs_trigger_delete_dead_snapshots, &sysfs_gc_gens_pos, @@ -604,87 +598,115 @@ struct attribute *bch2_fs_internal_files[] = { /* options */ -SHOW(bch2_fs_opts_dir) +static ssize_t sysfs_opt_show(struct bch_fs *c, + struct bch_dev *ca, + enum bch_opt_id id, + struct printbuf *out) { - struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); - const struct bch_option *opt = container_of(attr, struct bch_option, attr); - int id = opt - bch2_opt_table; - u64 v = bch2_opt_get_by_id(&c->opts, id); + const struct bch_option *opt = bch2_opt_table + id; + u64 v; + + if (opt->flags & OPT_FS) { + v = bch2_opt_get_by_id(&c->opts, id); + } else if ((opt->flags & OPT_DEVICE) && opt->get_member) { + v = bch2_opt_from_sb(c->disk_sb.sb, id, ca->dev_idx); + } else { + return -EINVAL; + } bch2_opt_to_text(out, c, c->disk_sb.sb, opt, v, OPT_SHOW_FULL_LIST); prt_char(out, '\n'); - return 0; } -STORE(bch2_fs_opts_dir) +static ssize_t sysfs_opt_store(struct bch_fs *c, + struct bch_dev *ca, + enum bch_opt_id id, + const char *buf, size_t size) { - struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); - const struct bch_option *opt = container_of(attr, struct bch_option, attr); - int ret, id = opt - bch2_opt_table; - char *tmp; - u64 v; + const struct bch_option *opt = bch2_opt_table + id; + int ret = 0; /* * We don't need to take c->writes for correctness, but it eliminates an * unsightly error message in the dmesg log when we're RO: */ - if (unlikely(!bch2_write_ref_tryget(c, BCH_WRITE_REF_sysfs))) + if (unlikely(!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_sysfs))) return -EROFS; - tmp = kstrdup(buf, GFP_KERNEL); + char *tmp = kstrdup(buf, GFP_KERNEL); if (!tmp) { ret = -ENOMEM; goto err; } - ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL); + u64 v; + ret = bch2_opt_parse(c, opt, strim(tmp), &v, NULL) ?: + bch2_opt_hook_pre_set(c, ca, id, v); kfree(tmp); if (ret < 0) goto err; - ret = bch2_opt_check_may_set(c, id, v); - if (ret < 0) - goto err; - - bch2_opt_set_sb(c, NULL, opt, v); - 
bch2_opt_set_by_id(&c->opts, id, v); - - if (v && - (id == Opt_background_target || - id == Opt_background_compression || - (id == Opt_compression && !c->opts.background_compression))) - bch2_set_rebalance_needs_scan(c, 0); + bool is_sb = opt->get_sb || opt->get_member; + bool changed = false; + + if (is_sb) { + changed = bch2_opt_set_sb(c, ca, opt, v); + } else if (!ca) { + changed = bch2_opt_get_by_id(&c->opts, id) != v; + } else { + /* device options that aren't superblock options aren't + * supported */ + BUG(); + } - if (v && id == Opt_rebalance_enabled) - rebalance_wakeup(c); + if (!ca) + bch2_opt_set_by_id(&c->opts, id, v); - if (v && id == Opt_copygc_enabled && - c->copygc_thread) - wake_up_process(c->copygc_thread); + if (changed) + bch2_opt_hook_post_set(c, ca, 0, &c->opts, id); ret = size; err: - bch2_write_ref_put(c, BCH_WRITE_REF_sysfs); + enumerated_ref_put(&c->writes, BCH_WRITE_REF_sysfs); return ret; } + +SHOW(bch2_fs_opts_dir) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); + int id = bch2_opt_lookup(attr->name); + if (id < 0) + return 0; + + return sysfs_opt_show(c, NULL, id, out); +} + +STORE(bch2_fs_opts_dir) +{ + struct bch_fs *c = container_of(kobj, struct bch_fs, opts_dir); + int id = bch2_opt_lookup(attr->name); + if (id < 0) + return 0; + + return sysfs_opt_store(c, NULL, id, buf, size); +} SYSFS_OPS(bch2_fs_opts_dir); struct attribute *bch2_fs_opts_dir_files[] = { NULL }; -int bch2_opts_create_sysfs_files(struct kobject *kobj) +int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type) { - const struct bch_option *i; - int ret; - - for (i = bch2_opt_table; + for (const struct bch_option *i = bch2_opt_table; i < bch2_opt_table + bch2_opts_nr; i++) { - if (!(i->flags & OPT_FS)) + if (i->flags & OPT_HIDDEN) + continue; + if (!(i->flags & type)) continue; - ret = sysfs_create_file(kobj, &i->attr); + int ret = sysfs_create_file(kobj, &i->attr); if (ret) return ret; } @@ -755,11 +777,8 @@ SHOW(bch2_dev) sysfs_printf(uuid, "%pU\n", ca->uuid.b); - sysfs_print(bucket_size, bucket_bytes(ca)); sysfs_print(first_bucket, ca->mi.first_bucket); sysfs_print(nbuckets, ca->mi.nbuckets); - sysfs_print(durability, ca->mi.durability); - sysfs_print(discard, ca->mi.discard); if (attr == &sysfs_label) { if (ca->mi.group) @@ -772,11 +791,6 @@ SHOW(bch2_dev) prt_char(out, '\n'); } - if (attr == &sysfs_state) { - prt_string_option(out, bch2_member_states, ca->mi.state); - prt_char(out, '\n'); - } - if (attr == &sysfs_io_done) dev_io_done_to_text(out, ca); @@ -802,6 +816,16 @@ SHOW(bch2_dev) if (attr == &sysfs_open_buckets) bch2_open_buckets_to_text(out, c, ca); + int opt_id = bch2_opt_lookup(attr->name); + if (opt_id >= 0) + return sysfs_opt_show(c, ca, opt_id, out); + + if (attr == &sysfs_read_refs) + enumerated_ref_to_text(out, &ca->io_ref[READ], bch2_dev_read_refs); + + if (attr == &sysfs_write_refs) + enumerated_ref_to_text(out, &ca->io_ref[WRITE], bch2_dev_write_refs); + return 0; } @@ -810,18 +834,6 @@ STORE(bch2_dev) struct bch_dev *ca = container_of(kobj, struct bch_dev, kobj); struct bch_fs *c = ca->fs; - if (attr == &sysfs_discard) { - bool v = strtoul_or_return(buf); - - bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_discard, v); - } - - if (attr == &sysfs_durability) { - u64 v = strtoul_or_return(buf); - - bch2_opt_set_sb(c, ca, bch2_opt_table + Opt_durability, v); - } - if (attr == &sysfs_label) { char *tmp; int ret; @@ -839,20 +851,20 @@ STORE(bch2_dev) if (attr == &sysfs_io_errors_reset) bch2_dev_errors_reset(ca); + int opt_id = 
bch2_opt_lookup(attr->name); + if (opt_id >= 0) + return sysfs_opt_store(c, ca, opt_id, buf, size); + return size; } SYSFS_OPS(bch2_dev); struct attribute *bch2_dev_files[] = { &sysfs_uuid, - &sysfs_bucket_size, &sysfs_first_bucket, &sysfs_nbuckets, - &sysfs_durability, /* settings: */ - &sysfs_discard, - &sysfs_state, &sysfs_label, &sysfs_has_data, @@ -869,6 +881,9 @@ struct attribute *bch2_dev_files[] = { /* debug: */ &sysfs_alloc_debug, &sysfs_open_buckets, + + &sysfs_read_refs, + &sysfs_write_refs, NULL }; diff --git a/fs/bcachefs/sysfs.h b/fs/bcachefs/sysfs.h index 222cd5062702..303e0433c702 100644 --- a/fs/bcachefs/sysfs.h +++ b/fs/bcachefs/sysfs.h @@ -23,7 +23,7 @@ extern const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; extern const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; extern const struct sysfs_ops bch2_dev_sysfs_ops; -int bch2_opts_create_sysfs_files(struct kobject *); +int bch2_opts_create_sysfs_files(struct kobject *, unsigned); #else @@ -41,7 +41,8 @@ static const struct sysfs_ops bch2_fs_opts_dir_sysfs_ops; static const struct sysfs_ops bch2_fs_time_stats_sysfs_ops; static const struct sysfs_ops bch2_dev_sysfs_ops; -static inline int bch2_opts_create_sysfs_files(struct kobject *kobj) { return 0; } +static inline int bch2_opts_create_sysfs_files(struct kobject *kobj, unsigned type) +{ return 0; } #endif /* NO_BCACHEFS_SYSFS */ diff --git a/fs/bcachefs/tests.c b/fs/bcachefs/tests.c index 6c6469814637..782a05fe7656 100644 --- a/fs/bcachefs/tests.c +++ b/fs/bcachefs/tests.c @@ -43,7 +43,7 @@ static int test_delete(struct bch_fs *c, u64 nr) BTREE_ITER_intent); ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(&iter) ?: + bch2_btree_iter_traverse(trans, &iter) ?: bch2_trans_update(trans, &iter, &k.k_i, 0)); bch_err_msg(c, ret, "update error"); if (ret) @@ -51,7 +51,7 @@ static int test_delete(struct bch_fs *c, u64 nr) pr_info("deleting once"); ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(&iter) ?: + bch2_btree_iter_traverse(trans, &iter) ?: bch2_btree_delete_at(trans, &iter, 0)); bch_err_msg(c, ret, "delete error (first)"); if (ret) @@ -59,7 +59,7 @@ static int test_delete(struct bch_fs *c, u64 nr) pr_info("deleting twice"); ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(&iter) ?: + bch2_btree_iter_traverse(trans, &iter) ?: bch2_btree_delete_at(trans, &iter, 0)); bch_err_msg(c, ret, "delete error (second)"); if (ret) @@ -84,7 +84,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) BTREE_ITER_intent); ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(&iter) ?: + bch2_btree_iter_traverse(trans, &iter) ?: bch2_trans_update(trans, &iter, &k.k_i, 0)); bch_err_msg(c, ret, "update error"); if (ret) @@ -94,7 +94,7 @@ static int test_delete_written(struct bch_fs *c, u64 nr) bch2_journal_flush_all_pins(&c->journal); ret = commit_do(trans, NULL, NULL, 0, - bch2_btree_iter_traverse(&iter) ?: + bch2_btree_iter_traverse(trans, &iter) ?: bch2_btree_delete_at(trans, &iter, 0)); bch_err_msg(c, ret, "delete error"); if (ret) @@ -342,6 +342,8 @@ static int test_iterate_slots_extents(struct bch_fs *c, u64 nr) */ static int test_peek_end(struct bch_fs *c, u64 nr) { + delete_test_keys(c); + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; @@ -349,10 +351,10 @@ static int test_peek_end(struct bch_fs *c, u64 nr) bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, U32_MAX), 0); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); + 
lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); BUG_ON(k.k); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); BUG_ON(k.k); bch2_trans_iter_exit(trans, &iter); @@ -362,6 +364,8 @@ static int test_peek_end(struct bch_fs *c, u64 nr) static int test_peek_end_extents(struct bch_fs *c, u64 nr) { + delete_test_keys(c); + struct btree_trans *trans = bch2_trans_get(c); struct btree_iter iter; struct bkey_s_c k; @@ -369,10 +373,10 @@ static int test_peek_end_extents(struct bch_fs *c, u64 nr) bch2_trans_iter_init(trans, &iter, BTREE_ID_extents, SPOS(0, 0, U32_MAX), 0); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); BUG_ON(k.k); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); BUG_ON(k.k); bch2_trans_iter_exit(trans, &iter); @@ -488,7 +492,7 @@ static int test_snapshot_filter(struct bch_fs *c, u32 snapid_lo, u32 snapid_hi) trans = bch2_trans_get(c); bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, SPOS(0, 0, snapid_lo), 0); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)))); BUG_ON(k.k->p.snapshot != U32_MAX); @@ -602,9 +606,9 @@ static int rand_lookup(struct bch_fs *c, u64 nr) SPOS(0, 0, U32_MAX), 0); for (i = 0; i < nr; i++) { - bch2_btree_iter_set_pos(&iter, SPOS(0, test_rand(), U32_MAX)); + bch2_btree_iter_set_pos(trans, &iter, SPOS(0, test_rand(), U32_MAX)); - lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(&iter))); + lockrestart_do(trans, bkey_err(k = bch2_btree_iter_peek(trans, &iter))); ret = bkey_err(k); if (ret) break; @@ -623,9 +627,9 @@ static int rand_mixed_trans(struct btree_trans *trans, struct bkey_s_c k; int ret; - bch2_btree_iter_set_pos(iter, SPOS(0, pos, U32_MAX)); + bch2_btree_iter_set_pos(trans, iter, SPOS(0, pos, U32_MAX)); - k = bch2_btree_iter_peek(iter); + k = bch2_btree_iter_peek(trans, iter); ret = bkey_err(k); bch_err_msg(trans->c, ret, "lookup error"); if (ret) @@ -672,7 +676,7 @@ static int __do_delete(struct btree_trans *trans, struct bpos pos) bch2_trans_iter_init(trans, &iter, BTREE_ID_xattrs, pos, BTREE_ITER_intent); - k = bch2_btree_iter_peek_max(&iter, POS(0, U64_MAX)); + k = bch2_btree_iter_peek_max(trans, &iter, POS(0, U64_MAX)); ret = bkey_err(k); if (ret) goto err; diff --git a/fs/bcachefs/thread_with_file.c b/fs/bcachefs/thread_with_file.c index dea73bc1cb51..314a24d15d4e 100644 --- a/fs/bcachefs/thread_with_file.c +++ b/fs/bcachefs/thread_with_file.c @@ -455,8 +455,10 @@ ssize_t bch2_stdio_redirect_vprintf(struct stdio_redirect *stdio, bool nonblocki struct stdio_buf *buf = &stdio->output; unsigned long flags; ssize_t ret; - again: + if (stdio->done) + return -EPIPE; + spin_lock_irqsave(&buf->lock, flags); ret = bch2_darray_vprintf(&buf->buf, GFP_NOWAIT, fmt, args); spin_unlock_irqrestore(&buf->lock, flags); diff --git a/fs/bcachefs/time_stats.c b/fs/bcachefs/time_stats.c index 3fe82757f93a..2c34fe4be912 100644 --- a/fs/bcachefs/time_stats.c +++ b/fs/bcachefs/time_stats.c @@ -10,6 +10,9 @@ #include "eytzinger.h" #include "time_stats.h" +/* disable automatic switching to 
percpu mode */ +#define TIME_STATS_NONPCPU ((unsigned long) 1) + static const struct time_unit time_units[] = { { "ns", 1 }, { "us", NSEC_PER_USEC }, @@ -123,11 +126,12 @@ void __bch2_time_stats_update(struct bch2_time_stats *stats, u64 start, u64 end) { unsigned long flags; - if (!stats->buffer) { + if ((unsigned long) stats->buffer <= TIME_STATS_NONPCPU) { spin_lock_irqsave(&stats->lock, flags); time_stats_update_one(stats, start, end); - if (mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 && + if (!stats->buffer && + mean_and_variance_weighted_get_mean(stats->freq_stats_weighted, TIME_STATS_MV_WEIGHT) < 32 && stats->duration_stats.n > 1024) stats->buffer = alloc_percpu_gfp(struct time_stat_buffer, @@ -157,7 +161,7 @@ void bch2_time_stats_reset(struct bch2_time_stats *stats) unsigned offset = offsetof(struct bch2_time_stats, min_duration); memset((void *) stats + offset, 0, sizeof(*stats) - offset); - if (stats->buffer) { + if ((unsigned long) stats->buffer > TIME_STATS_NONPCPU) { int cpu; for_each_possible_cpu(cpu) per_cpu_ptr(stats->buffer, cpu)->nr = 0; @@ -167,7 +171,9 @@ void bch2_time_stats_reset(struct bch2_time_stats *stats) void bch2_time_stats_exit(struct bch2_time_stats *stats) { - free_percpu(stats->buffer); + if ((unsigned long) stats->buffer > TIME_STATS_NONPCPU) + free_percpu(stats->buffer); + stats->buffer = NULL; } void bch2_time_stats_init(struct bch2_time_stats *stats) @@ -177,3 +183,9 @@ void bch2_time_stats_init(struct bch2_time_stats *stats) stats->min_freq = U64_MAX; spin_lock_init(&stats->lock); } + +void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *stats) +{ + bch2_time_stats_init(stats); + stats->buffer = (struct time_stat_buffer __percpu *) TIME_STATS_NONPCPU; +} diff --git a/fs/bcachefs/time_stats.h b/fs/bcachefs/time_stats.h index dc6493f7bbab..eddb0985bab4 100644 --- a/fs/bcachefs/time_stats.h +++ b/fs/bcachefs/time_stats.h @@ -145,6 +145,7 @@ static inline bool track_event_change(struct bch2_time_stats *stats, bool v) void bch2_time_stats_reset(struct bch2_time_stats *); void bch2_time_stats_exit(struct bch2_time_stats *); void bch2_time_stats_init(struct bch2_time_stats *); +void bch2_time_stats_init_no_pcpu(struct bch2_time_stats *); static inline void bch2_time_stats_quantiles_exit(struct bch2_time_stats_quantiles *statq) { diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index c1b51009edf6..8cb5b40704fd 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -295,12 +295,12 @@ TRACE_EVENT(write_super, /* io.c: */ -DEFINE_EVENT(bio, read_promote, +DEFINE_EVENT(bio, io_read_promote, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); -TRACE_EVENT(read_nopromote, +TRACE_EVENT(io_read_nopromote, TP_PROTO(struct bch_fs *c, int ret), TP_ARGS(c, ret), @@ -319,26 +319,55 @@ TRACE_EVENT(read_nopromote, __entry->ret) ); -DEFINE_EVENT(bio, read_bounce, +DEFINE_EVENT(bio, io_read_bounce, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); -DEFINE_EVENT(bio, read_split, +DEFINE_EVENT(bio, io_read_split, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); -DEFINE_EVENT(bio, read_retry, +DEFINE_EVENT(bio, io_read_retry, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); -DEFINE_EVENT(bio, read_reuse_race, +DEFINE_EVENT(bio, io_read_reuse_race, TP_PROTO(struct bio *bio), TP_ARGS(bio) ); +DEFINE_EVENT(bio, io_read_fail_and_poison, + TP_PROTO(struct bio *bio), + TP_ARGS(bio) +); + +/* ec.c */ + +TRACE_EVENT(stripe_create, + TP_PROTO(struct bch_fs *c, u64 idx, int ret), + TP_ARGS(c, idx, ret), + + TP_STRUCT__entry( + __field(dev_t, dev ) + 
__field(u64, idx ) + __field(int, ret ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->idx = idx; + __entry->ret = ret; + ), + + TP_printk("%d,%d idx %llu ret %i", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->idx, + __entry->ret) +); + /* Journal */ DEFINE_EVENT(bch_fs, journal_full, @@ -797,53 +826,37 @@ TRACE_EVENT(bucket_invalidate, /* Moving IO */ -TRACE_EVENT(bucket_evacuate, - TP_PROTO(struct bch_fs *c, struct bpos *bucket), - TP_ARGS(c, bucket), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u32, dev_idx ) - __field(u64, bucket ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->dev_idx = bucket->inode; - __entry->bucket = bucket->offset; - ), - - TP_printk("%d:%d %u:%llu", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->dev_idx, __entry->bucket) +DEFINE_EVENT(fs_str, io_move, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, move_extent, +DEFINE_EVENT(fs_str, io_move_read, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, move_extent_read, +DEFINE_EVENT(fs_str, io_move_write, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, move_extent_write, +DEFINE_EVENT(fs_str, io_move_finish, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, move_extent_finish, +DEFINE_EVENT(fs_str, io_move_fail, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, move_extent_fail, +DEFINE_EVENT(fs_str, io_move_write_fail, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, move_extent_start_fail, +DEFINE_EVENT(fs_str, io_move_start_fail, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); @@ -881,37 +894,6 @@ TRACE_EVENT(move_data, __entry->sectors_raced) ); -TRACE_EVENT(evacuate_bucket, - TP_PROTO(struct bch_fs *c, struct bpos *bucket, - unsigned sectors, unsigned bucket_size, - int ret), - TP_ARGS(c, bucket, sectors, bucket_size, ret), - - TP_STRUCT__entry( - __field(dev_t, dev ) - __field(u64, member ) - __field(u64, bucket ) - __field(u32, sectors ) - __field(u32, bucket_size ) - __field(int, ret ) - ), - - TP_fast_assign( - __entry->dev = c->dev; - __entry->member = bucket->inode; - __entry->bucket = bucket->offset; - __entry->sectors = sectors; - __entry->bucket_size = bucket_size; - __entry->ret = ret; - ), - - TP_printk("%d,%d %llu:%llu sectors %u/%u ret %i", - MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->member, __entry->bucket, - __entry->sectors, __entry->bucket_size, - __entry->ret) -); - TRACE_EVENT(copygc, TP_PROTO(struct bch_fs *c, u64 buckets, @@ -1145,51 +1127,9 @@ DEFINE_EVENT(transaction_restart_iter, trans_restart_btree_node_split, TP_ARGS(trans, caller_ip, path) ); -TRACE_EVENT(trans_restart_upgrade, - TP_PROTO(struct btree_trans *trans, - unsigned long caller_ip, - struct btree_path *path, - unsigned old_locks_want, - unsigned new_locks_want, - struct get_locks_fail *f), - TP_ARGS(trans, caller_ip, path, old_locks_want, new_locks_want, f), - - TP_STRUCT__entry( - __array(char, trans_fn, 32 ) - __field(unsigned long, caller_ip ) - __field(u8, btree_id ) - __field(u8, old_locks_want ) - __field(u8, new_locks_want ) - __field(u8, level ) - __field(u32, path_seq ) - __field(u32, node_seq ) - TRACE_BPOS_entries(pos) - ), - - TP_fast_assign( - strscpy(__entry->trans_fn, trans->fn, sizeof(__entry->trans_fn)); - __entry->caller_ip = caller_ip; - __entry->btree_id = path->btree_id; - __entry->old_locks_want = 
old_locks_want; - __entry->new_locks_want = new_locks_want; - __entry->level = f->l; - __entry->path_seq = path->l[f->l].lock_seq; - __entry->node_seq = IS_ERR_OR_NULL(f->b) ? 0 : f->b->c.lock.seq; - TRACE_BPOS_assign(pos, path->pos) - ), - - TP_printk("%s %pS btree %s pos %llu:%llu:%u locks_want %u -> %u level %u path seq %u node seq %u", - __entry->trans_fn, - (void *) __entry->caller_ip, - bch2_btree_id_str(__entry->btree_id), - __entry->pos_inode, - __entry->pos_offset, - __entry->pos_snapshot, - __entry->old_locks_want, - __entry->new_locks_want, - __entry->level, - __entry->path_seq, - __entry->node_seq) +DEFINE_EVENT(fs_str, trans_restart_upgrade, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); DEFINE_EVENT(trans_str, trans_restart_relock, @@ -1491,6 +1431,11 @@ DEFINE_EVENT(fs_str, data_update, TP_ARGS(c, str) ); +DEFINE_EVENT(fs_str, io_move_created_rebalance, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); + TRACE_EVENT(error_downcast, TP_PROTO(int bch_err, int std_err, unsigned long ip), TP_ARGS(bch_err, std_err, ip), diff --git a/fs/bcachefs/util.c b/fs/bcachefs/util.c index e0a876cbaa6b..dc3817f545fa 100644 --- a/fs/bcachefs/util.c +++ b/fs/bcachefs/util.c @@ -252,8 +252,18 @@ void bch2_prt_u64_base2(struct printbuf *out, u64 v) bch2_prt_u64_base2_nbits(out, v, fls64(v) ?: 1); } -static void __bch2_print_string_as_lines(const char *prefix, const char *lines, - bool nonblocking) +static bool string_is_spaces(const char *str) +{ + while (*str) { + if (*str != ' ') + return false; + str++; + } + return true; +} + +void bch2_print_string_as_lines(const char *prefix, const char *lines, + bool nonblocking) { bool locked = false; const char *p; @@ -270,8 +280,11 @@ static void __bch2_print_string_as_lines(const char *prefix, const char *lines, locked = console_trylock(); } - while (1) { + while (*lines) { p = strchrnul(lines, '\n'); + if (!*p && string_is_spaces(lines)) + break; + printk("%s%.*s\n", prefix, (int) (p - lines), lines); if (!*p) break; @@ -281,16 +294,6 @@ static void __bch2_print_string_as_lines(const char *prefix, const char *lines, console_unlock(); } -void bch2_print_string_as_lines(const char *prefix, const char *lines) -{ - return __bch2_print_string_as_lines(prefix, lines, false); -} - -void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines) -{ - return __bch2_print_string_as_lines(prefix, lines, true); -} - int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *task, unsigned skipnr, gfp_t gfp) { @@ -473,10 +476,10 @@ void bch2_time_stats_to_text(struct printbuf *out, struct bch2_time_stats *stats u64 last_q = 0; prt_printf(out, "quantiles (%s):\t", u->name); - eytzinger0_for_each(i, NR_QUANTILES) { - bool is_last = eytzinger0_next(i, NR_QUANTILES) == -1; + eytzinger0_for_each(j, NR_QUANTILES) { + bool is_last = eytzinger0_next(j, NR_QUANTILES) == -1; - u64 q = max(quantiles->entries[i].m, last_q); + u64 q = max(quantiles->entries[j].m, last_q); prt_printf(out, "%llu ", div64_u64(q, u->nsecs)); if (is_last) prt_newline(out); @@ -653,19 +656,25 @@ int bch2_bio_alloc_pages(struct bio *bio, size_t size, gfp_t gfp_mask) return 0; } -size_t bch2_rand_range(size_t max) +u64 bch2_get_random_u64_below(u64 ceil) { - size_t rand; - - if (!max) - return 0; - - do { - rand = get_random_long(); - rand &= roundup_pow_of_two(max) - 1; - } while (rand >= max); + if (ceil <= U32_MAX) + return __get_random_u32_below(ceil); + + /* this is the same (clever) algorithm as in __get_random_u32_below() */ + 
u64 rand = get_random_u64(); + u64 mult = ceil * rand; + + if (unlikely(mult < ceil)) { + u64 bound; + div64_u64_rem(-ceil, ceil, &bound); + while (unlikely(mult < bound)) { + rand = get_random_u64(); + mult = ceil * rand; + } + } - return rand; + return mul_u64_u64_shr(ceil, rand, 64); } void memcpy_to_bio(struct bio *dst, struct bvec_iter dst_iter, const void *src) @@ -698,12 +707,43 @@ void memcpy_from_bio(void *dst, struct bio *src, struct bvec_iter src_iter) } } +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_corrupt_bio(struct bio *bio) +{ + struct bvec_iter iter; + struct bio_vec bv; + unsigned offset = get_random_u32_below(bio->bi_iter.bi_size / sizeof(u64)); + + bio_for_each_segment(bv, bio, iter) { + unsigned u64s = bv.bv_len / sizeof(u64); + + if (offset < u64s) { + u64 *segment = bvec_kmap_local(&bv); + segment[offset] = get_random_u64(); + kunmap_local(segment); + return; + } + offset -= u64s; + } +} +#endif + +void bch2_bio_to_text(struct printbuf *out, struct bio *bio) +{ + prt_printf(out, "bi_remaining:\t%u\n", + atomic_read(&bio->__bi_remaining)); + prt_printf(out, "bi_end_io:\t%ps\n", + bio->bi_end_io); + prt_printf(out, "bi_status:\t%u\n", + bio->bi_status); +} + #if 0 void eytzinger1_test(void) { - unsigned inorder, eytz, size; + unsigned inorder, size; - pr_info("1 based eytzinger test:"); + pr_info("1 based eytzinger test:\n"); for (size = 2; size < 65536; @@ -711,13 +751,7 @@ void eytzinger1_test(void) unsigned extra = eytzinger1_extra(size); if (!(size % 4096)) - pr_info("tree size %u", size); - - BUG_ON(eytzinger1_prev(0, size) != eytzinger1_last(size)); - BUG_ON(eytzinger1_next(0, size) != eytzinger1_first(size)); - - BUG_ON(eytzinger1_prev(eytzinger1_first(size), size) != 0); - BUG_ON(eytzinger1_next(eytzinger1_last(size), size) != 0); + pr_info("tree size %u\n", size); inorder = 1; eytzinger1_for_each(eytz, size) { @@ -728,15 +762,16 @@ void eytzinger1_test(void) inorder++; } + BUG_ON(inorder - 1 != size); } } void eytzinger0_test(void) { - unsigned inorder, eytz, size; + unsigned inorder, size; - pr_info("0 based eytzinger test:"); + pr_info("0 based eytzinger test:\n"); for (size = 1; size < 65536; @@ -744,13 +779,7 @@ void eytzinger0_test(void) unsigned extra = eytzinger0_extra(size); if (!(size % 4096)) - pr_info("tree size %u", size); - - BUG_ON(eytzinger0_prev(-1, size) != eytzinger0_last(size)); - BUG_ON(eytzinger0_next(-1, size) != eytzinger0_first(size)); - - BUG_ON(eytzinger0_prev(eytzinger0_first(size), size) != -1); - BUG_ON(eytzinger0_next(eytzinger0_last(size), size) != -1); + pr_info("tree size %u\n", size); inorder = 0; eytzinger0_for_each(eytz, size) { @@ -761,54 +790,191 @@ void eytzinger0_test(void) inorder++; } + BUG_ON(inorder != size); + + inorder = size - 1; + eytzinger0_for_each_prev(eytz, size) { + BUG_ON(eytz != eytzinger0_first(size) && + eytzinger0_next(eytzinger0_prev(eytz, size), size) != eytz); + + inorder--; + } + BUG_ON(inorder != -1); } } -static inline int cmp_u16(const void *_l, const void *_r, size_t size) +static inline int cmp_u16(const void *_l, const void *_r) { const u16 *l = _l, *r = _r; - return (*l > *r) - (*r - *l); + return (*l > *r) - (*r > *l); } -static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) +static void eytzinger0_find_test_le(u16 *test_array, unsigned nr, u16 search) { - int i, c1 = -1, c2 = -1; - ssize_t r; + int r, s; + bool bad; r = eytzinger0_find_le(test_array, nr, sizeof(test_array[0]), cmp_u16, &search); - if (r >= 0) - c1 = test_array[r]; - - for (i = 0; i < nr; i++) - if 
(test_array[i] <= search && test_array[i] > c2) - c2 = test_array[i]; - - if (c1 != c2) { - eytzinger0_for_each(i, nr) - pr_info("[%3u] = %12u", i, test_array[i]); - pr_info("find_le(%2u) -> [%2zi] = %2i should be %2i", - i, r, c1, c2); + if (r >= 0) { + if (test_array[r] > search) { + bad = true; + } else { + s = eytzinger0_next(r, nr); + bad = s >= 0 && test_array[s] <= search; + } + } else { + s = eytzinger0_last(nr); + bad = s >= 0 && test_array[s] <= search; + } + + if (bad) { + s = -1; + eytzinger0_for_each_prev(j, nr) { + if (test_array[j] <= search) { + s = j; + break; + } + } + + eytzinger0_for_each(j, nr) + pr_info("[%3u] = %12u\n", j, test_array[j]); + pr_info("find_le(%12u) = %3i should be %3i\n", + search, r, s); + BUG(); + } +} + +static void eytzinger0_find_test_gt(u16 *test_array, unsigned nr, u16 search) +{ + int r, s; + bool bad; + + r = eytzinger0_find_gt(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); + if (r >= 0) { + if (test_array[r] <= search) { + bad = true; + } else { + s = eytzinger0_prev(r, nr); + bad = s >= 0 && test_array[s] > search; + } + } else { + s = eytzinger0_first(nr); + bad = s >= 0 && test_array[s] > search; + } + + if (bad) { + s = -1; + eytzinger0_for_each(j, nr) { + if (test_array[j] > search) { + s = j; + break; + } + } + + eytzinger0_for_each(j, nr) + pr_info("[%3u] = %12u\n", j, test_array[j]); + pr_info("find_gt(%12u) = %3i should be %3i\n", + search, r, s); + BUG(); } } +static void eytzinger0_find_test_ge(u16 *test_array, unsigned nr, u16 search) +{ + int r, s; + bool bad; + + r = eytzinger0_find_ge(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); + if (r >= 0) { + if (test_array[r] < search) { + bad = true; + } else { + s = eytzinger0_prev(r, nr); + bad = s >= 0 && test_array[s] >= search; + } + } else { + s = eytzinger0_first(nr); + bad = s >= 0 && test_array[s] >= search; + } + + if (bad) { + s = -1; + eytzinger0_for_each(j, nr) { + if (test_array[j] >= search) { + s = j; + break; + } + } + + eytzinger0_for_each(j, nr) + pr_info("[%3u] = %12u\n", j, test_array[j]); + pr_info("find_ge(%12u) = %3i should be %3i\n", + search, r, s); + BUG(); + } +} + +static void eytzinger0_find_test_eq(u16 *test_array, unsigned nr, u16 search) +{ + unsigned r; + int s; + bool bad; + + r = eytzinger0_find(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); + + if (r < nr) { + bad = test_array[r] != search; + } else { + s = eytzinger0_find_le(test_array, nr, + sizeof(test_array[0]), + cmp_u16, &search); + bad = s >= 0 && test_array[s] == search; + } + + if (bad) { + eytzinger0_for_each(j, nr) + pr_info("[%3u] = %12u\n", j, test_array[j]); + pr_info("find(%12u) = %3i is incorrect\n", + search, r); + BUG(); + } +} + +static void eytzinger0_find_test_val(u16 *test_array, unsigned nr, u16 search) +{ + eytzinger0_find_test_le(test_array, nr, search); + eytzinger0_find_test_gt(test_array, nr, search); + eytzinger0_find_test_ge(test_array, nr, search); + eytzinger0_find_test_eq(test_array, nr, search); +} + void eytzinger0_find_test(void) { unsigned i, nr, allocated = 1 << 12; u16 *test_array = kmalloc_array(allocated, sizeof(test_array[0]), GFP_KERNEL); for (nr = 1; nr < allocated; nr++) { - pr_info("testing %u elems", nr); + u16 prev = 0; + + pr_info("testing %u elems\n", nr); get_random_bytes(test_array, nr * sizeof(test_array[0])); eytzinger0_sort(test_array, nr, sizeof(test_array[0]), cmp_u16, NULL); /* verify array is sorted correctly: */ - eytzinger0_for_each(i, nr) - BUG_ON(i != eytzinger0_last(nr) && - test_array[i] > 
test_array[eytzinger0_next(i, nr)]); + eytzinger0_for_each(j, nr) { + BUG_ON(test_array[j] < prev); + prev = test_array[j]; + } for (i = 0; i < U16_MAX; i += 1 << 12) eytzinger0_find_test_val(test_array, nr, i); @@ -850,14 +1016,14 @@ u64 *bch2_acc_percpu_u64s(u64 __percpu *p, unsigned nr) return ret; } -void bch2_darray_str_exit(darray_str *d) +void bch2_darray_str_exit(darray_const_str *d) { darray_for_each(*d, i) kfree(*i); darray_exit(d); } -int bch2_split_devs(const char *_dev_name, darray_str *ret) +int bch2_split_devs(const char *_dev_name, darray_const_str *ret) { darray_init(ret); diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index e7c3541b38f3..25cf61ebd40c 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -14,6 +14,7 @@ #include <linux/log2.h> #include <linux/percpu.h> #include <linux/preempt.h> +#include <linux/random.h> #include <linux/ratelimit.h> #include <linux/slab.h> #include <linux/vmalloc.h> @@ -55,15 +56,16 @@ static inline size_t buf_pages(void *p, size_t len) PAGE_SIZE); } -static inline void *bch2_kvmalloc(size_t n, gfp_t flags) +static inline void *bch2_kvmalloc_noprof(size_t n, gfp_t flags) { void *p = unlikely(n >= INT_MAX) - ? vmalloc(n) - : kvmalloc(n, flags & ~__GFP_ZERO); + ? vmalloc_noprof(n) + : kvmalloc_noprof(n, flags & ~__GFP_ZERO); if (p && (flags & __GFP_ZERO)) memset(p, 0, n); return p; } +#define bch2_kvmalloc(...) alloc_hooks(bch2_kvmalloc_noprof(__VA_ARGS__)) #define init_heap(heap, _size, gfp) \ ({ \ @@ -94,6 +96,7 @@ do { \ #define printbuf_tabstop_push(_buf, _n) bch2_printbuf_tabstop_push(_buf, _n) #define printbuf_indent_add(_out, _n) bch2_printbuf_indent_add(_out, _n) +#define printbuf_indent_add_nextline(_out, _n) bch2_printbuf_indent_add_nextline(_out, _n) #define printbuf_indent_sub(_out, _n) bch2_printbuf_indent_sub(_out, _n) #define prt_newline(_out) bch2_prt_newline(_out) @@ -210,8 +213,7 @@ u64 bch2_read_flag_list(const char *, const char * const[]); void bch2_prt_u64_base2_nbits(struct printbuf *, u64, unsigned); void bch2_prt_u64_base2(struct printbuf *, u64); -void bch2_print_string_as_lines(const char *prefix, const char *lines); -void bch2_print_string_as_lines_nonblocking(const char *prefix, const char *lines); +void bch2_print_string_as_lines(const char *, const char *, bool); typedef DARRAY(unsigned long) bch_stacktrace; int bch2_save_backtrace(bch_stacktrace *stack, struct task_struct *, unsigned, gfp_t); @@ -401,11 +403,25 @@ do { \ _ret; \ }) -size_t bch2_rand_range(size_t); +u64 bch2_get_random_u64_below(u64); void memcpy_to_bio(struct bio *, struct bvec_iter, const void *); void memcpy_from_bio(void *, struct bio *, struct bvec_iter); +#ifdef CONFIG_BCACHEFS_DEBUG +void bch2_corrupt_bio(struct bio *); + +static inline void bch2_maybe_corrupt_bio(struct bio *bio, unsigned ratio) +{ + if (ratio && !get_random_u32_below(ratio)) + bch2_corrupt_bio(bio); +} +#else +#define bch2_maybe_corrupt_bio(...) 
do {} while (0) +#endif + +void bch2_bio_to_text(struct printbuf *, struct bio *); + static inline void memcpy_u64s_small(void *dst, const void *src, unsigned u64s) { @@ -419,7 +435,7 @@ static inline void memcpy_u64s_small(void *dst, const void *src, static inline void __memcpy_u64s(void *dst, const void *src, unsigned u64s) { -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN) long d0, d1, d2; asm volatile("rep ; movsq" @@ -496,7 +512,7 @@ static inline void __memmove_u64s_up(void *_dst, const void *_src, u64 *dst = (u64 *) _dst + u64s - 1; u64 *src = (u64 *) _src + u64s - 1; -#ifdef CONFIG_X86_64 +#if defined(CONFIG_X86_64) && !defined(CONFIG_KMSAN) long d0, d1, d2; asm volatile("std ;\n" @@ -609,7 +625,7 @@ do { \ #define per_cpu_sum(_p) \ ({ \ - typeof(*_p) _ret = 0; \ + TYPEOF_UNQUAL(*_p) _ret = 0; \ \ int cpu; \ for_each_possible_cpu(cpu) \ @@ -675,8 +691,8 @@ static inline bool qstr_eq(const struct qstr l, const struct qstr r) return l.len == r.len && !memcmp(l.name, r.name, l.len); } -void bch2_darray_str_exit(darray_str *); -int bch2_split_devs(const char *, darray_str *); +void bch2_darray_str_exit(darray_const_str *); +int bch2_split_devs(const char *, darray_const_str *); #ifdef __KERNEL__ @@ -726,4 +742,42 @@ static inline void memcpy_swab(void *_dst, void *_src, size_t len) *--dst = *src++; } +#define set_flags(_map, _in, _out) \ +do { \ + unsigned _i; \ + \ + for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ + if ((_in) & (1 << _i)) \ + (_out) |= _map[_i]; \ + else \ + (_out) &= ~_map[_i]; \ +} while (0) + +#define map_flags(_map, _in) \ +({ \ + unsigned _out = 0; \ + \ + set_flags(_map, _in, _out); \ + _out; \ +}) + +#define map_flags_rev(_map, _in) \ +({ \ + unsigned _i, _out = 0; \ + \ + for (_i = 0; _i < ARRAY_SIZE(_map); _i++) \ + if ((_in) & _map[_i]) { \ + (_out) |= 1 << _i; \ + (_in) &= ~_map[_i]; \ + } \ + (_out); \ +}) + +#define map_defined(_map) \ +({ \ + unsigned _in = ~0; \ + \ + map_flags_rev(_map, _in); \ +}) + #endif /* _BCACHEFS_UTIL_H */ diff --git a/fs/bcachefs/xattr.c b/fs/bcachefs/xattr.c index aed7c6984173..627f153798c6 100644 --- a/fs/bcachefs/xattr.c +++ b/fs/bcachefs/xattr.c @@ -38,7 +38,7 @@ static u64 xattr_hash_bkey(const struct bch_hash_info *info, struct bkey_s_c k) struct bkey_s_c_xattr x = bkey_s_c_to_xattr(k); return bch2_xattr_hash(info, - &X_SEARCH(x.v->x_type, x.v->x_name, x.v->x_name_len)); + &X_SEARCH(x.v->x_type, x.v->x_name_and_value, x.v->x_name_len)); } static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) @@ -48,7 +48,7 @@ static bool xattr_cmp_key(struct bkey_s_c _l, const void *_r) return l.v->x_type != r->type || l.v->x_name_len != r->name.len || - memcmp(l.v->x_name, r->name.name, r->name.len); + memcmp(l.v->x_name_and_value, r->name.name, r->name.len); } static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) @@ -58,7 +58,7 @@ static bool xattr_cmp_bkey(struct bkey_s_c _l, struct bkey_s_c _r) return l.v->x_type != r.v->x_type || l.v->x_name_len != r.v->x_name_len || - memcmp(l.v->x_name, r.v->x_name, r.v->x_name_len); + memcmp(l.v->x_name_and_value, r.v->x_name_and_value, r.v->x_name_len); } const struct bch_hash_desc bch2_xattr_hash_desc = { @@ -96,7 +96,7 @@ int bch2_xattr_validate(struct bch_fs *c, struct bkey_s_c k, c, xattr_invalid_type, "invalid type (%u)", xattr.v->x_type); - bkey_fsck_err_on(memchr(xattr.v->x_name, '\0', xattr.v->x_name_len), + bkey_fsck_err_on(memchr(xattr.v->x_name_and_value, '\0', xattr.v->x_name_len), c, xattr_name_invalid_chars, "xattr name has invalid 
characters"); fsck_err: @@ -120,13 +120,13 @@ void bch2_xattr_to_text(struct printbuf *out, struct bch_fs *c, unsigned name_len = xattr.v->x_name_len; unsigned val_len = le16_to_cpu(xattr.v->x_val_len); unsigned max_name_val_bytes = bkey_val_bytes(xattr.k) - - offsetof(struct bch_xattr, x_name); + offsetof(struct bch_xattr, x_name_and_value); val_len = min_t(int, val_len, max_name_val_bytes - name_len); name_len = min(name_len, max_name_val_bytes); prt_printf(out, "%.*s:%.*s", - name_len, xattr.v->x_name, + name_len, xattr.v->x_name_and_value, val_len, (char *) xattr_val(xattr.v)); if (xattr.v->x_type == KEY_TYPE_XATTR_INDEX_POSIX_ACL_ACCESS || @@ -168,7 +168,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, int type, int flags) { struct bch_fs *c = trans->c; - struct btree_iter inode_iter = { NULL }; + struct btree_iter inode_iter = {}; int ret; ret = bch2_subvol_is_ro_trans(trans, inum.subvol) ?: @@ -176,6 +176,11 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, if (ret) return ret; + /* + * Besides the ctime update, extents, dirents and xattrs updates require + * that an inode update also happens - to ensure that if a key exists in + * one of those btrees with a given snapshot ID an inode is also present + */ inode_u->bi_ctime = bch2_current_time(c); ret = bch2_inode_write(trans, &inode_iter, inode_u); @@ -202,7 +207,7 @@ int bch2_xattr_set(struct btree_trans *trans, subvol_inum inum, xattr->v.x_type = type; xattr->v.x_name_len = namelen; xattr->v.x_val_len = cpu_to_le16(size); - memcpy(xattr->v.x_name, name, namelen); + memcpy(xattr->v.x_name_and_value, name, namelen); memcpy(xattr_val(&xattr->v), value, size); ret = bch2_hash_set(trans, bch2_xattr_hash_desc, hash_info, @@ -270,7 +275,7 @@ static int bch2_xattr_emit(struct dentry *dentry, if (!prefix) return 0; - return __bch2_xattr_emit(prefix, xattr->x_name, xattr->x_name_len, buf); + return __bch2_xattr_emit(prefix, xattr->x_name_and_value, xattr->x_name_len, buf); } static int bch2_xattr_list_bcachefs(struct bch_fs *c, @@ -473,6 +478,12 @@ static int inode_opt_set_fn(struct btree_trans *trans, { struct inode_opt_set *s = p; + if (s->id == Inode_opt_casefold) { + int ret = bch2_inode_set_casefold(trans, inode_inum(inode), bi, s->v); + if (ret) + return ret; + } + if (s->defined) bi->bi_fields_set |= 1U << s->id; else @@ -523,7 +534,7 @@ static int bch2_xattr_bcachefs_set(const struct xattr_handler *handler, if (ret < 0) goto err_class_exit; - ret = bch2_opt_check_may_set(c, opt_id, v); + ret = bch2_opt_hook_pre_set(c, NULL, opt_id, v); if (ret < 0) goto err_class_exit; diff --git a/fs/bcachefs/xattr.h b/fs/bcachefs/xattr.h index 132fbbd15a66..1139bf345f70 100644 --- a/fs/bcachefs/xattr.h +++ b/fs/bcachefs/xattr.h @@ -18,12 +18,12 @@ void bch2_xattr_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); static inline unsigned xattr_val_u64s(unsigned name_len, unsigned val_len) { - return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name) + + return DIV_ROUND_UP(offsetof(struct bch_xattr, x_name_and_value) + name_len + val_len, sizeof(u64)); } #define xattr_val(_xattr) \ - ((void *) (_xattr)->x_name + (_xattr)->x_name_len) + ((void *) (_xattr)->x_name_and_value + (_xattr)->x_name_len) struct xattr_search_key { u8 type; diff --git a/fs/bcachefs/xattr_format.h b/fs/bcachefs/xattr_format.h index c7916011ef34..4121b78d9a92 100644 --- a/fs/bcachefs/xattr_format.h +++ b/fs/bcachefs/xattr_format.h @@ -13,7 +13,13 @@ struct bch_xattr { __u8 x_type; __u8 x_name_len; __le16 x_val_len; - __u8 x_name[] 
__counted_by(x_name_len); + /* + * x_name contains the name and value counted by + * x_name_len + x_val_len. The introduction of + * __counted_by(x_name_len) previously caused a false positive + * detection of an out of bounds write. + */ + __u8 x_name_and_value[]; } __packed __aligned(8); #endif /* _BCACHEFS_XATTR_FORMAT_H */ diff --git a/fs/bfs/inode.c b/fs/bfs/inode.c index db81570c9637..1d41ce477df5 100644 --- a/fs/bfs/inode.c +++ b/fs/bfs/inode.c @@ -17,6 +17,7 @@ #include <linux/writeback.h> #include <linux/uio.h> #include <linux/uaccess.h> +#include <linux/fs_context.h> #include "bfs.h" MODULE_AUTHOR("Tigran Aivazian <aivazian.tigran@gmail.com>"); @@ -305,7 +306,7 @@ void bfs_dump_imap(const char *prefix, struct super_block *s) #endif } -static int bfs_fill_super(struct super_block *s, void *data, int silent) +static int bfs_fill_super(struct super_block *s, struct fs_context *fc) { struct buffer_head *bh, *sbh; struct bfs_super_block *bfs_sb; @@ -314,6 +315,7 @@ static int bfs_fill_super(struct super_block *s, void *data, int silent) struct bfs_sb_info *info; int ret = -EINVAL; unsigned long i_sblock, i_eblock, i_eoff, s_size; + int silent = fc->sb_flags & SB_SILENT; info = kzalloc(sizeof(*info), GFP_KERNEL); if (!info) @@ -446,18 +448,28 @@ out: return ret; } -static struct dentry *bfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int bfs_get_tree(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, bfs_fill_super); + return get_tree_bdev(fc, bfs_fill_super); +} + +static const struct fs_context_operations bfs_context_ops = { + .get_tree = bfs_get_tree, +}; + +static int bfs_init_fs_context(struct fs_context *fc) +{ + fc->ops = &bfs_context_ops; + + return 0; } static struct file_system_type bfs_fs_type = { - .owner = THIS_MODULE, - .name = "bfs", - .mount = bfs_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .owner = THIS_MODULE, + .name = "bfs", + .init_fs_context = bfs_init_fs_context, + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("bfs"); diff --git a/fs/binfmt_elf.c b/fs/binfmt_elf.c index 8054f44d39cf..a43363d593e5 100644 --- a/fs/binfmt_elf.c +++ b/fs/binfmt_elf.c @@ -68,12 +68,6 @@ static int load_elf_binary(struct linux_binprm *bprm); -#ifdef CONFIG_USELIB -static int load_elf_library(struct file *); -#else -#define load_elf_library NULL -#endif - /* * If we don't support core dumping, then supply a NULL so we * don't even try. 
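/*
 * Editor's sketch (hedged, not part of this patch) of the new mount-API
 * shape bfs adopts above: the legacy .mount + fill_super(void *data, int
 * silent) pair becomes an init_fs_context hook whose get_tree calls
 * get_tree_bdev() with a fill_super callback taking an fs_context.  All
 * "demo_" names are illustrative.
 */
#include <linux/fs.h>
#include <linux/fs_context.h>
#include <linux/module.h>
#include <linux/printk.h>

static int demo_fill_super(struct super_block *sb, struct fs_context *fc)
{
	int silent = fc->sb_flags & SB_SILENT;	/* replaces the old 'silent' argument */

	/* ... read the on-disk superblock, set sb->s_op and sb->s_root ... */
	if (!silent)
		pr_err("demo: no valid filesystem found\n");
	return -EINVAL;
}

static int demo_get_tree(struct fs_context *fc)
{
	return get_tree_bdev(fc, demo_fill_super);
}

static const struct fs_context_operations demo_context_ops = {
	.get_tree	= demo_get_tree,
};

static int demo_init_fs_context(struct fs_context *fc)
{
	fc->ops = &demo_context_ops;
	return 0;
}

static struct file_system_type demo_fs_type = {
	.owner		= THIS_MODULE,
	.name		= "demo",
	.init_fs_context = demo_init_fs_context,
	.kill_sb	= kill_block_super,
	.fs_flags	= FS_REQUIRES_DEV,
};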
@@ -101,7 +95,6 @@ static int elf_core_dump(struct coredump_params *cprm); static struct linux_binfmt elf_format = { .module = THIS_MODULE, .load_binary = load_elf_binary, - .load_shlib = load_elf_library, #ifdef CONFIG_COREDUMP .core_dump = elf_core_dump, .min_coredump = ELF_EXEC_PAGESIZE, @@ -762,8 +755,7 @@ static int parse_elf_property(const char *data, size_t *off, size_t datasz, } #define NOTE_DATA_SZ SZ_1K -#define GNU_PROPERTY_TYPE_0_NAME "GNU" -#define NOTE_NAME_SZ (sizeof(GNU_PROPERTY_TYPE_0_NAME)) +#define NOTE_NAME_SZ (sizeof(NN_GNU_PROPERTY_TYPE_0)) static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, struct arch_elf_state *arch) @@ -800,7 +792,7 @@ static int parse_elf_properties(struct file *f, const struct elf_phdr *phdr, if (note.nhdr.n_type != NT_GNU_PROPERTY_TYPE_0 || note.nhdr.n_namesz != NOTE_NAME_SZ || strncmp(note.data + sizeof(note.nhdr), - GNU_PROPERTY_TYPE_0_NAME, n - sizeof(note.nhdr))) + NN_GNU_PROPERTY_TYPE_0, n - sizeof(note.nhdr))) return -ENOEXEC; off = round_up(sizeof(note.nhdr) + NOTE_NAME_SZ, @@ -831,6 +823,7 @@ static int load_elf_binary(struct linux_binprm *bprm) struct elf_phdr *elf_ppnt, *elf_phdata, *interp_elf_phdata = NULL; struct elf_phdr *elf_property_phdata = NULL; unsigned long elf_brk; + bool brk_moved = false; int retval, i; unsigned long elf_entry; unsigned long e_entry; @@ -1098,15 +1091,19 @@ out_free_interp: /* Calculate any requested alignment. */ alignment = maximum_alignment(elf_phdata, elf_ex->e_phnum); - /* - * There are effectively two types of ET_DYN - * binaries: programs (i.e. PIE: ET_DYN with PT_INTERP) - * and loaders (ET_DYN without PT_INTERP, since they - * _are_ the ELF interpreter). The loaders must - * be loaded away from programs since the program - * may otherwise collide with the loader (especially - * for ET_EXEC which does not have a randomized - * position). For example to handle invocations of + /** + * DOC: PIE handling + * + * There are effectively two types of ET_DYN ELF + * binaries: programs (i.e. PIE: ET_DYN with + * PT_INTERP) and loaders (i.e. static PIE: ET_DYN + * without PT_INTERP, usually the ELF interpreter + * itself). Loaders must be loaded away from programs + * since the program may otherwise collide with the + * loader (especially for ET_EXEC which does not have + * a randomized position). + * + * For example, to handle invocations of * "./ld.so someprog" to test out a new version of * the loader, the subsequent program that the * loader loads must avoid the loader itself, so @@ -1119,6 +1116,9 @@ out_free_interp: * ELF_ET_DYN_BASE and loaders are loaded into the * independently randomized mmap region (0 load_bias * without MAP_FIXED nor MAP_FIXED_NOREPLACE). + * + * See below for "brk" handling details, which is + * also affected by program vs loader and ASLR. */ if (interpreter) { /* On ET_DYN with PT_INTERP, we do the ASLR. */ @@ -1235,8 +1235,6 @@ out_free_interp: start_data += load_bias; end_data += load_bias; - current->mm->start_brk = current->mm->brk = ELF_PAGEALIGN(elf_brk); - if (interpreter) { elf_entry = load_elf_interp(interp_elf_ex, interpreter, @@ -1292,27 +1290,44 @@ out_free_interp: mm->end_data = end_data; mm->start_stack = bprm->p; - if ((current->flags & PF_RANDOMIZE) && (snapshot_randomize_va_space > 1)) { + /** + * DOC: "brk" handling + * + * For architectures with ELF randomization, when executing a + * loader directly (i.e. 
static PIE: ET_DYN without PT_INTERP), + * move the brk area out of the mmap region and into the unused + * ELF_ET_DYN_BASE region. Since "brk" grows up it may collide + * early with the stack growing down or other regions being put + * into the mmap region by the kernel (e.g. vdso). + * + * In the CONFIG_COMPAT_BRK case, though, everything is turned + * off because we're not allowed to move the brk at all. + */ + if (!IS_ENABLED(CONFIG_COMPAT_BRK) && + IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && + elf_ex->e_type == ET_DYN && !interpreter) { + elf_brk = ELF_ET_DYN_BASE; + /* This counts as moving the brk, so let brk(2) know. */ + brk_moved = true; + } + mm->start_brk = mm->brk = ELF_PAGEALIGN(elf_brk); + + if ((current->flags & PF_RANDOMIZE) && snapshot_randomize_va_space > 1) { /* - * For architectures with ELF randomization, when executing - * a loader directly (i.e. no interpreter listed in ELF - * headers), move the brk area out of the mmap region - * (since it grows up, and may collide early with the stack - * growing down), and into the unused ELF_ET_DYN_BASE region. + * If we didn't move the brk to ELF_ET_DYN_BASE (above), + * leave a gap between .bss and brk. */ - if (IS_ENABLED(CONFIG_ARCH_HAS_ELF_RANDOMIZE) && - elf_ex->e_type == ET_DYN && !interpreter) { - mm->brk = mm->start_brk = ELF_ET_DYN_BASE; - } else { - /* Otherwise leave a gap between .bss and brk. */ + if (!brk_moved) mm->brk = mm->start_brk = mm->brk + PAGE_SIZE; - } mm->brk = mm->start_brk = arch_randomize_brk(mm); + brk_moved = true; + } + #ifdef compat_brk_randomized + if (brk_moved) current->brk_randomized = 1; #endif - } if (current->personality & MMAP_PAGE_ZERO) { /* Why this, you ask??? Well SVr4 maps page 0 as read-only, @@ -1362,75 +1377,6 @@ out_free_ph: goto out; } -#ifdef CONFIG_USELIB -/* This is really simpleminded and specialized - we are loading an - a.out library that is given an ELF header. */ -static int load_elf_library(struct file *file) -{ - struct elf_phdr *elf_phdata; - struct elf_phdr *eppnt; - int retval, error, i, j; - struct elfhdr elf_ex; - - error = -ENOEXEC; - retval = elf_read(file, &elf_ex, sizeof(elf_ex), 0); - if (retval < 0) - goto out; - - if (memcmp(elf_ex.e_ident, ELFMAG, SELFMAG) != 0) - goto out; - - /* First of all, some simple consistency checks */ - if (elf_ex.e_type != ET_EXEC || elf_ex.e_phnum > 2 || - !elf_check_arch(&elf_ex) || !file->f_op->mmap) - goto out; - if (elf_check_fdpic(&elf_ex)) - goto out; - - /* Now read in all of the header information */ - - j = sizeof(struct elf_phdr) * elf_ex.e_phnum; - /* j < ELF_MIN_ALIGN because elf_ex.e_phnum <= 2 */ - - error = -ENOMEM; - elf_phdata = kmalloc(j, GFP_KERNEL); - if (!elf_phdata) - goto out; - - eppnt = elf_phdata; - error = -ENOEXEC; - retval = elf_read(file, eppnt, j, elf_ex.e_phoff); - if (retval < 0) - goto out_free_ph; - - for (j = 0, i = 0; i<elf_ex.e_phnum; i++) - if ((eppnt + i)->p_type == PT_LOAD) - j++; - if (j != 1) - goto out_free_ph; - - while (eppnt->p_type != PT_LOAD) - eppnt++; - - /* Now use mmap to map the library into memory. 
*/ - error = elf_load(file, ELF_PAGESTART(eppnt->p_vaddr), - eppnt, - PROT_READ | PROT_WRITE | PROT_EXEC, - MAP_FIXED_NOREPLACE | MAP_PRIVATE, - 0); - - if (error != ELF_PAGESTART(eppnt->p_vaddr)) - goto out_free_ph; - - error = 0; - -out_free_ph: - kfree(elf_phdata); -out: - return error; -} -#endif /* #ifdef CONFIG_USELIB */ - #ifdef CONFIG_ELF_CORE /* * ELF core dumper @@ -1603,14 +1549,14 @@ static void fill_auxv_note(struct memelfnote *note, struct mm_struct *mm) do i += 2; while (auxv[i - 2] != AT_NULL); - fill_note(note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv); + fill_note(note, NN_AUXV, NT_AUXV, i * sizeof(elf_addr_t), auxv); } static void fill_siginfo_note(struct memelfnote *note, user_siginfo_t *csigdata, const kernel_siginfo_t *siginfo) { copy_siginfo_to_external(csigdata, siginfo); - fill_note(note, "CORE", NT_SIGINFO, sizeof(*csigdata), csigdata); + fill_note(note, NN_SIGINFO, NT_SIGINFO, sizeof(*csigdata), csigdata); } /* @@ -1706,7 +1652,7 @@ static int fill_files_note(struct memelfnote *note, struct coredump_params *cprm } size = name_curpos - (char *)data; - fill_note(note, "CORE", NT_FILE, size, data); + fill_note(note, NN_FILE, NT_FILE, size, data); return 0; } @@ -1767,7 +1713,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, regset_get(t->task, &view->regsets[0], sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg); - fill_note(&t->notes[0], "CORE", NT_PRSTATUS, + fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, PRSTATUS_SIZE, &t->prstatus); info->size += notesize(&t->notes[0]); @@ -1801,7 +1747,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, if (is_fpreg) SET_PR_FPVALID(&t->prstatus); - fill_note(&t->notes[note_iter], is_fpreg ? "CORE" : "LINUX", + fill_note(&t->notes[note_iter], is_fpreg ? 
NN_PRFPREG : "LINUX", note_type, ret, data); info->size += notesize(&t->notes[note_iter]); @@ -1821,7 +1767,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, fill_prstatus(&t->prstatus.common, p, signr); elf_core_copy_task_regs(p, &t->prstatus.pr_reg); - fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), + fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, sizeof(t->prstatus), &(t->prstatus)); info->size += notesize(&t->notes[0]); @@ -1832,7 +1778,7 @@ static int fill_thread_core_info(struct elf_thread_core_info *t, } t->prstatus.pr_fpvalid = 1; - fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(*fpu), fpu); + fill_note(&t->notes[1], NN_PRFPREG, NT_PRFPREG, sizeof(*fpu), fpu); info->size += notesize(&t->notes[1]); return 1; @@ -1852,7 +1798,7 @@ static int fill_note_info(struct elfhdr *elf, int phdrs, psinfo = kmalloc(sizeof(*psinfo), GFP_KERNEL); if (!psinfo) return 0; - fill_note(&info->psinfo, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); + fill_note(&info->psinfo, NN_PRPSINFO, NT_PRPSINFO, sizeof(*psinfo), psinfo); #ifdef CORE_DUMP_USE_REGSET view = task_user_regset_view(dump_task); diff --git a/fs/binfmt_elf_fdpic.c b/fs/binfmt_elf_fdpic.c index c13ee8180b17..9133f3827f90 100644 --- a/fs/binfmt_elf_fdpic.c +++ b/fs/binfmt_elf_fdpic.c @@ -1024,7 +1024,7 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, /* deal with each load segment separately */ phdr = params->phdrs; for (loop = 0; loop < params->hdr.e_phnum; loop++, phdr++) { - unsigned long maddr, disp, excess, excess1; + unsigned long maddr, disp, excess; int prot = 0, flags; if (phdr->p_type != PT_LOAD) @@ -1120,9 +1120,10 @@ static int elf_fdpic_map_file_by_direct_mmap(struct elf_fdpic_params *params, * extant in the file */ excess = phdr->p_memsz - phdr->p_filesz; - excess1 = PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK); #ifdef CONFIG_MMU + unsigned long excess1 + = PAGE_SIZE - ((maddr + phdr->p_filesz) & ~PAGE_MASK); if (excess > excess1) { unsigned long xaddr = maddr + phdr->p_filesz + excess1; unsigned long xmaddr; @@ -1397,7 +1398,7 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_ regset_get(p, &view->regsets[0], sizeof(t->prstatus.pr_reg), &t->prstatus.pr_reg); - fill_note(&t->notes[0], "CORE", NT_PRSTATUS, sizeof(t->prstatus), + fill_note(&t->notes[0], NN_PRSTATUS, NT_PRSTATUS, sizeof(t->prstatus), &t->prstatus); t->num_notes++; *sz += notesize(&t->notes[0]); @@ -1415,7 +1416,7 @@ static struct elf_thread_status *elf_dump_thread_status(long signr, struct task_ } if (t->prstatus.pr_fpvalid) { - fill_note(&t->notes[1], "CORE", NT_PRFPREG, sizeof(t->fpu), + fill_note(&t->notes[1], NN_PRFPREG, NT_PRFPREG, sizeof(t->fpu), &t->fpu); t->num_notes++; *sz += notesize(&t->notes[1]); @@ -1530,7 +1531,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) */ fill_psinfo(psinfo, current->group_leader, current->mm); - fill_note(&psinfo_note, "CORE", NT_PRPSINFO, sizeof(*psinfo), psinfo); + fill_note(&psinfo_note, NN_PRPSINFO, NT_PRPSINFO, sizeof(*psinfo), psinfo); thread_status_size += notesize(&psinfo_note); auxv = (elf_addr_t *) current->mm->saved_auxv; @@ -1538,7 +1539,7 @@ static int elf_fdpic_core_dump(struct coredump_params *cprm) do i += 2; while (auxv[i - 2] != AT_NULL); - fill_note(&auxv_note, "CORE", NT_AUXV, i * sizeof(elf_addr_t), auxv); + fill_note(&auxv_note, NN_AUXV, NT_AUXV, i * sizeof(elf_addr_t), auxv); thread_status_size += notesize(&auxv_note); offset = sizeof(*elf); /* ELF header */ diff --git 
a/fs/binfmt_misc.c b/fs/binfmt_misc.c index 5a7ebd160724..432fbf4fc334 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -842,7 +842,7 @@ static ssize_t bm_register_write(struct file *file, const char __user *buffer, } inode_lock(d_inode(root)); - dentry = lookup_one_len(e->name, root, strlen(e->name)); + dentry = lookup_noperm(&QSTR(e->name), root); err = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out; diff --git a/fs/bpf_fs_kfuncs.c b/fs/bpf_fs_kfuncs.c index 3fe9f59ef867..08412532db1b 100644 --- a/fs/bpf_fs_kfuncs.c +++ b/fs/bpf_fs_kfuncs.c @@ -2,10 +2,12 @@ /* Copyright (c) 2024 Google LLC. */ #include <linux/bpf.h> +#include <linux/bpf_lsm.h> #include <linux/btf.h> #include <linux/btf_ids.h> #include <linux/dcache.h> #include <linux/fs.h> +#include <linux/fsnotify.h> #include <linux/file.h> #include <linux/mm.h> #include <linux/xattr.h> @@ -93,6 +95,24 @@ __bpf_kfunc int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz) return len; } +static bool match_security_bpf_prefix(const char *name__str) +{ + return !strncmp(name__str, XATTR_NAME_BPF_LSM, XATTR_NAME_BPF_LSM_LEN); +} + +static int bpf_xattr_read_permission(const char *name, struct inode *inode) +{ + if (WARN_ON(!inode)) + return -EINVAL; + + /* Allow reading xattrs with the user. and security.bpf. prefixes */ + if (strncmp(name, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN) && + !match_security_bpf_prefix(name)) + return -EPERM; + + return inode_permission(&nop_mnt_idmap, inode, MAY_READ); +} + /** * bpf_get_dentry_xattr - get xattr of a dentry * @dentry: dentry to get xattr from @@ -101,9 +121,10 @@ __bpf_kfunc int bpf_path_d_path(struct path *path, char *buf, size_t buf__sz) * * Get xattr *name__str* of *dentry* and store the output in *value_ptr*. * - * For security reasons, only *name__str* with prefix "user." is allowed. + * For security reasons, only *name__str* with prefixes "user." or + * "security.bpf." are allowed. * - * Return: 0 on success, a negative value on error. + * Return: length of the xattr value on success, a negative value on error. */ __bpf_kfunc int bpf_get_dentry_xattr(struct dentry *dentry, const char *name__str, struct bpf_dynptr *value_p) @@ -114,18 +135,12 @@ __bpf_kfunc int bpf_get_dentry_xattr(struct dentry *dentry, const char *name__st void *value; int ret; - if (WARN_ON(!inode)) - return -EINVAL; - - if (strncmp(name__str, XATTR_USER_PREFIX, XATTR_USER_PREFIX_LEN)) - return -EPERM; - value_len = __bpf_dynptr_size(value_ptr); value = __bpf_dynptr_data_rw(value_ptr, value_len); if (!value) return -EINVAL; - ret = inode_permission(&nop_mnt_idmap, inode, MAY_READ); + ret = bpf_xattr_read_permission(name__str, inode); if (ret) return ret; return __vfs_getxattr(dentry, inode, name__str, value, value_len); @@ -139,9 +154,10 @@ __bpf_kfunc int bpf_get_dentry_xattr(struct dentry *dentry, const char *name__st * * Get xattr *name__str* of *file* and store the output in *value_ptr*. * - * For security reasons, only *name__str* with prefix "user." is allowed. + * For security reasons, only *name__str* with prefixes "user." or + * "security.bpf." are allowed. * - * Return: 0 on success, a negative value on error. + * Return: length of the xattr value on success, a negative value on error. 
*/ __bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str, struct bpf_dynptr *value_p) @@ -154,6 +170,160 @@ __bpf_kfunc int bpf_get_file_xattr(struct file *file, const char *name__str, __bpf_kfunc_end_defs(); +static int bpf_xattr_write_permission(const char *name, struct inode *inode) +{ + if (WARN_ON(!inode)) + return -EINVAL; + + /* Only allow setting and removing security.bpf. xattrs */ + if (!match_security_bpf_prefix(name)) + return -EPERM; + + return inode_permission(&nop_mnt_idmap, inode, MAY_WRITE); +} + +/** + * bpf_set_dentry_xattr_locked - set an xattr of a dentry + * @dentry: dentry to set the xattr on + * @name__str: name of the xattr + * @value_p: xattr value + * @flags: flags to pass into filesystem operations + * + * Set xattr *name__str* of *dentry* to the value in *value_p*. + * + * For security reasons, only *name__str* with prefix "security.bpf." + * is allowed. + * + * The caller already locked dentry->d_inode. + * + * Return: 0 on success, a negative value on error. + */ +int bpf_set_dentry_xattr_locked(struct dentry *dentry, const char *name__str, + const struct bpf_dynptr *value_p, int flags) +{ + struct bpf_dynptr_kern *value_ptr = (struct bpf_dynptr_kern *)value_p; + struct inode *inode = d_inode(dentry); + const void *value; + u32 value_len; + int ret; + + value_len = __bpf_dynptr_size(value_ptr); + value = __bpf_dynptr_data(value_ptr, value_len); + if (!value) + return -EINVAL; + + ret = bpf_xattr_write_permission(name__str, inode); + if (ret) + return ret; + + ret = __vfs_setxattr(&nop_mnt_idmap, dentry, inode, name__str, + value, value_len, flags); + if (!ret) { + fsnotify_xattr(dentry); + + /* This xattr is set by BPF LSM, so we do not call + * security_inode_post_setxattr. Otherwise, we would + * risk deadlocks by calling back to the same kfunc. + * + * This is the same as security_inode_setsecurity(). + */ + } + return ret; +} + +/** + * bpf_remove_dentry_xattr_locked - remove an xattr of a dentry + * @dentry: dentry to remove the xattr from + * @name__str: name of the xattr + * + * Remove xattr *name__str* of *dentry*. + * + * For security reasons, only *name__str* with prefix "security.bpf." + * is allowed. + * + * The caller already locked dentry->d_inode. + * + * Return: 0 on success, a negative value on error. + */ +int bpf_remove_dentry_xattr_locked(struct dentry *dentry, const char *name__str) +{ + struct inode *inode = d_inode(dentry); + int ret; + + ret = bpf_xattr_write_permission(name__str, inode); + if (ret) + return ret; + + ret = __vfs_removexattr(&nop_mnt_idmap, dentry, name__str); + if (!ret) { + fsnotify_xattr(dentry); + + /* This xattr is removed by BPF LSM, so we do not call + * security_inode_post_removexattr. Otherwise, we would + * risk deadlocks by calling back to the same kfunc. + */ + } + return ret; +} + +__bpf_kfunc_start_defs(); + +/** + * bpf_set_dentry_xattr - set an xattr of a dentry + * @dentry: dentry to set the xattr on + * @name__str: name of the xattr + * @value_p: xattr value + * @flags: flags to pass into filesystem operations + * + * Set xattr *name__str* of *dentry* to the value in *value_p*. + * + * For security reasons, only *name__str* with prefix "security.bpf." + * is allowed. + * + * The caller has not locked dentry->d_inode. + * + * Return: 0 on success, a negative value on error. 
+ */ +__bpf_kfunc int bpf_set_dentry_xattr(struct dentry *dentry, const char *name__str, + const struct bpf_dynptr *value_p, int flags) +{ + struct inode *inode = d_inode(dentry); + int ret; + + inode_lock(inode); + ret = bpf_set_dentry_xattr_locked(dentry, name__str, value_p, flags); + inode_unlock(inode); + return ret; +} + +/** + * bpf_remove_dentry_xattr - remove an xattr of a dentry + * @dentry: dentry to remove the xattr from + * @name__str: name of the xattr + * + * Remove xattr *name__str* of *dentry*. + * + * For security reasons, only *name__str* with prefix "security.bpf." + * is allowed. + * + * The caller has not locked dentry->d_inode. + * + * Return: 0 on success, a negative value on error. + */ +__bpf_kfunc int bpf_remove_dentry_xattr(struct dentry *dentry, const char *name__str) +{ + struct inode *inode = d_inode(dentry); + int ret; + + inode_lock(inode); + ret = bpf_remove_dentry_xattr_locked(dentry, name__str); + inode_unlock(inode); + return ret; +} + +__bpf_kfunc_end_defs(); + BTF_KFUNCS_START(bpf_fs_kfunc_set_ids) BTF_ID_FLAGS(func, bpf_get_task_exe_file, KF_ACQUIRE | KF_TRUSTED_ARGS | KF_RET_NULL) @@ -161,6 +331,8 @@ BTF_ID_FLAGS(func, bpf_put_file, KF_RELEASE) BTF_ID_FLAGS(func, bpf_path_d_path, KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_get_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) BTF_ID_FLAGS(func, bpf_get_file_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_set_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) +BTF_ID_FLAGS(func, bpf_remove_dentry_xattr, KF_SLEEPABLE | KF_TRUSTED_ARGS) BTF_KFUNCS_END(bpf_fs_kfunc_set_ids) static int bpf_fs_kfuncs_filter(const struct bpf_prog *prog, u32 kfunc_id) @@ -171,6 +343,37 @@ static int bpf_fs_kfuncs_filter(const struct bpf_prog *prog, u32 kfunc_id) return -EACCES; } +/* The bpf_[set|remove]_dentry_xattr.* kfuncs have KF_TRUSTED_ARGS and + * KF_SLEEPABLE, so they are only available to sleepable hooks with + * dentry arguments. + * + * Setting and removing an xattr requires an exclusive lock on + * dentry->d_inode. Some hooks are called with d_inode already locked, + * while others are not. Therefore, we need different kfuncs for + * different hooks. Specifically, hooks in the following list + * (d_inode_locked_hooks) should call + * bpf_[set|remove]_dentry_xattr_locked; while other hooks should call + * bpf_[set|remove]_dentry_xattr. 
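+ * + * Editorial illustration (not part of this patch): a program attached + * to lsm.s/inode_setxattr runs while vfs_setxattr() already holds + * inode_lock(d_inode(dentry)), so taking the lock again in the unlocked + * variant would self-deadlock; hooks on this list therefore get the + * _locked variants, which assume the lock is held.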
+ */ +BTF_SET_START(d_inode_locked_hooks) +BTF_ID(func, bpf_lsm_inode_post_removexattr) +BTF_ID(func, bpf_lsm_inode_post_setattr) +BTF_ID(func, bpf_lsm_inode_post_setxattr) +BTF_ID(func, bpf_lsm_inode_removexattr) +BTF_ID(func, bpf_lsm_inode_rmdir) +BTF_ID(func, bpf_lsm_inode_setattr) +BTF_ID(func, bpf_lsm_inode_setxattr) +BTF_ID(func, bpf_lsm_inode_unlink) +#ifdef CONFIG_SECURITY_PATH +BTF_ID(func, bpf_lsm_path_unlink) +BTF_ID(func, bpf_lsm_path_rmdir) +#endif /* CONFIG_SECURITY_PATH */ +BTF_SET_END(d_inode_locked_hooks) + +bool bpf_lsm_has_d_inode_locked(const struct bpf_prog *prog) +{ + return btf_id_set_contains(&d_inode_locked_hooks, prog->aux->attach_btf_id); +} + static const struct btf_kfunc_id_set bpf_fs_kfunc_set = { .owner = THIS_MODULE, .set = &bpf_fs_kfunc_set_ids, diff --git a/fs/btrfs/Kconfig b/fs/btrfs/Kconfig index fa8515598341..c352f3ae0385 100644 --- a/fs/btrfs/Kconfig +++ b/fs/btrfs/Kconfig @@ -3,9 +3,9 @@ config BTRFS_FS tristate "Btrfs filesystem support" select BLK_CGROUP_PUNT_BIO + select CRC32 select CRYPTO select CRYPTO_CRC32C - select LIBCRC32C select CRYPTO_XXHASH select CRYPTO_SHA256 select CRYPTO_BLAKE2B @@ -52,10 +52,10 @@ config BTRFS_FS_RUN_SANITY_TESTS bool "Btrfs will run sanity tests upon loading" depends on BTRFS_FS help - This will run some basic sanity tests on the free space cache - code to make sure it is acting as it should. These are mostly - regression tests and are only really interesting to btrfs - developers. + This will run sanity tests for core functionality like free space, + extent maps, extent io, extent buffers, inodes, qgroups and others, + at module load time. These are mostly regression tests and are only + interesting to developers. If unsure, say N. @@ -63,9 +63,12 @@ config BTRFS_DEBUG bool "Btrfs debugging support" depends on BTRFS_FS help - Enable run-time debugging support for the btrfs filesystem. This may - enable additional and expensive checks with negative impact on - performance, or export extra information via sysfs. + Enable run-time debugging support for the btrfs filesystem. + + Additional potentially expensive checks, debugging functionality or + sysfs exported information is enabled, like leak checks of internal + objects, optional forced space fragmentation and /sys/fs/btrfs/debug. + This has a negative impact on performance. If unsure, say N. @@ -73,8 +76,10 @@ config BTRFS_ASSERT bool "Btrfs assert support" depends on BTRFS_FS help - Enable run-time assertion checking. This will result in panics if - any of the assertions trip. This is meant for btrfs developers only. + Enable run-time assertion checking. Additional safety checks are + done, simple enough not to affect performance but verifying invariants + and assumptions the code needs to run properly. This may result in + panics, and is meant for developers but can be enabled in general. If unsure, say N. 
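[Editorial aside: a minimal sketch of a caller for the bpf_set_dentry_xattr() kfunc added in fs/bpf_fs_kfuncs.c above. This program is hypothetical and not part of the diff; it assumes libbpf's usual headers, that the inode_setattr LSM hook has the signature shown, and that the companion verifier change (outside fs/) rewrites the call to the _locked variant for hooks in d_inode_locked_hooks.]

#include "vmlinux.h"
#include <bpf/bpf_helpers.h>
#include <bpf/bpf_tracing.h>

char LICENSE[] SEC("license") = "GPL";

/* Kfunc added in fs/bpf_fs_kfuncs.c above. */
extern int bpf_set_dentry_xattr(struct dentry *dentry, const char *name__str,
				const struct bpf_dynptr *value_p, int flags) __ksym;

static char tag[] = "tagged";

/* inode_setattr is in d_inode_locked_hooks, so d_inode is already locked
 * here; the verifier is expected to resolve this call to
 * bpf_set_dentry_xattr_locked(). */
SEC("lsm.s/inode_setattr")
int BPF_PROG(tag_on_setattr, struct mnt_idmap *idmap, struct dentry *dentry,
	     struct iattr *attr)
{
	struct bpf_dynptr value;

	bpf_dynptr_from_mem(tag, sizeof(tag), 0, &value);
	/* Only "security.bpf."-prefixed names pass bpf_xattr_write_permission(). */
	bpf_set_dentry_xattr(dentry, "security.bpf.tag", &value, 0);
	return 0;
}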
@@ -89,7 +94,14 @@ config BTRFS_EXPERIMENTAL Current list: - - extent map shrinker - performance problems with too frequent shrinks + - COW fixup worker warning - last warning before removing the + functionality catching out-of-band page + dirtying, not necessary since 5.8 + + - RAID mirror read policy - additional read policies for balancing + reading from redundant block group + profiles (currently: pid, round-robin, + fixed devid) - send stream protocol v3 - fs-verity support diff --git a/fs/btrfs/accessors.h b/fs/btrfs/accessors.h index 7a7e0ef69973..15ea6348800b 100644 --- a/fs/btrfs/accessors.h +++ b/fs/btrfs/accessors.h @@ -12,6 +12,7 @@ #include <linux/string.h> #include <linux/mm.h> #include <uapi/linux/btrfs_tree.h> +#include "extent_io.h" struct extent_buffer; diff --git a/fs/btrfs/acl.h b/fs/btrfs/acl.h index 48b9ddae4a46..0458cd51ed48 100644 --- a/fs/btrfs/acl.h +++ b/fs/btrfs/acl.h @@ -3,6 +3,8 @@ #ifndef BTRFS_ACL_H #define BTRFS_ACL_H +#include <linux/types.h> + struct posix_acl; struct inode; struct btrfs_trans_handle; diff --git a/fs/btrfs/async-thread.c b/fs/btrfs/async-thread.c index a4c51600a408..6c6f3bb58f4e 100644 --- a/fs/btrfs/async-thread.c +++ b/fs/btrfs/async-thread.c @@ -168,7 +168,7 @@ static inline void thresh_exec_hook(struct btrfs_workqueue *wq) { int new_current_active; long pending; - int need_change = 0; + bool need_change = false; if (wq->thresh == NO_THRESHOLD) return; @@ -196,15 +196,14 @@ static inline void thresh_exec_hook(struct btrfs_workqueue *wq) new_current_active--; new_current_active = clamp_val(new_current_active, 1, wq->limit_active); if (new_current_active != wq->current_active) { - need_change = 1; + need_change = true; wq->current_active = new_current_active; } out: spin_unlock(&wq->thres_lock); - if (need_change) { + if (need_change) workqueue_set_max_active(wq->normal_wq, wq->current_active); - } } static void run_ordered_work(struct btrfs_workqueue *wq, @@ -220,8 +219,7 @@ static void run_ordered_work(struct btrfs_workqueue *wq, spin_lock_irqsave(lock, flags); if (list_empty(list)) break; - work = list_entry(list->next, struct btrfs_work, - ordered_list); + work = list_first_entry(list, struct btrfs_work, ordered_list); if (!test_bit(WORK_DONE_BIT, &work->flags)) break; /* @@ -296,7 +294,7 @@ static void btrfs_work_helper(struct work_struct *normal_work) struct btrfs_work *work = container_of(normal_work, struct btrfs_work, normal_work); struct btrfs_workqueue *wq = work->wq; - int need_order = 0; + bool need_order = false; /* * We should not touch things inside work in the following cases: @@ -307,7 +305,7 @@ static void btrfs_work_helper(struct work_struct *normal_work) * So we save the needed things here. 
*/ if (work->ordered_func) - need_order = 1; + need_order = true; trace_btrfs_work_sched(work); thresh_exec_hook(wq); diff --git a/fs/btrfs/backref.c b/fs/btrfs/backref.c index 3d3923cfc357..ed497f5f8d1b 100644 --- a/fs/btrfs/backref.c +++ b/fs/btrfs/backref.c @@ -1399,11 +1399,11 @@ static int find_parent_nodes(struct btrfs_backref_walk_ctx *ctx, ASSERT(ctx->roots == NULL); key.objectid = ctx->bytenr; - key.offset = (u64)-1; if (btrfs_fs_incompat(ctx->fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; path = btrfs_alloc_path(); if (!path) @@ -2206,11 +2206,11 @@ int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical, struct btrfs_extent_item *ei; struct btrfs_key key; + key.objectid = logical; if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; - key.objectid = logical; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); @@ -2877,7 +2877,7 @@ int btrfs_backref_iter_start(struct btrfs_backref_iter *iter, u64 bytenr) goto release; } if (path->slots[0] == 0) { - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN(); ret = -EUCLEAN; goto release; } @@ -3134,8 +3134,8 @@ void btrfs_backref_cleanup_node(struct btrfs_backref_cache *cache, return; while (!list_empty(&node->upper)) { - edge = list_entry(node->upper.next, struct btrfs_backref_edge, - list[LOWER]); + edge = list_first_entry(&node->upper, struct btrfs_backref_edge, + list[LOWER]); list_del(&edge->list[LOWER]); list_del(&edge->list[UPPER]); btrfs_backref_free_edge(cache, edge); @@ -3473,8 +3473,8 @@ int btrfs_backref_add_tree_node(struct btrfs_trans_handle *trans, * type BTRFS_TREE_BLOCK_REF_KEY */ ASSERT(list_is_singular(&cur->upper)); - edge = list_entry(cur->upper.next, struct btrfs_backref_edge, - list[LOWER]); + edge = list_first_entry(&cur->upper, struct btrfs_backref_edge, + list[LOWER]); ASSERT(list_empty(&edge->list[UPPER])); exist = edge->node[UPPER]; /* @@ -3617,7 +3617,7 @@ int btrfs_backref_finish_upper_links(struct btrfs_backref_cache *cache, /* Sanity check, we shouldn't have any unchecked nodes */ if (!upper->checked) { - ASSERT(0); + DEBUG_WARN("we should not have any unchecked nodes"); return -EUCLEAN; } diff --git a/fs/btrfs/backref.h b/fs/btrfs/backref.h index 74e614031274..953637115956 100644 --- a/fs/btrfs/backref.h +++ b/fs/btrfs/backref.h @@ -423,8 +423,8 @@ struct btrfs_backref_node *btrfs_backref_alloc_node( struct btrfs_backref_edge *btrfs_backref_alloc_edge( struct btrfs_backref_cache *cache); -#define LINK_LOWER (1 << 0) -#define LINK_UPPER (1 << 1) +#define LINK_LOWER (1U << 0) +#define LINK_UPPER (1U << 1) void btrfs_backref_link_edge(struct btrfs_backref_edge *edge, struct btrfs_backref_node *lower, diff --git a/fs/btrfs/bio.c b/fs/btrfs/bio.c index bc2555c44a12..f7d8958b7327 100644 --- a/fs/btrfs/bio.c +++ b/fs/btrfs/bio.c @@ -97,33 +97,17 @@ static struct btrfs_bio *btrfs_split_bio(struct btrfs_fs_info *fs_info, return bbio; } -/* Free a bio that was never submitted to the underlying device. 
*/ -static void btrfs_cleanup_bio(struct btrfs_bio *bbio) -{ - if (bbio_has_ordered_extent(bbio)) - btrfs_put_ordered_extent(bbio->ordered); - bio_put(&bbio->bio); -} - -static void __btrfs_bio_end_io(struct btrfs_bio *bbio) -{ - if (bbio_has_ordered_extent(bbio)) { - struct btrfs_ordered_extent *ordered = bbio->ordered; - - bbio->end_io(bbio); - btrfs_put_ordered_extent(ordered); - } else { - bbio->end_io(bbio); - } -} - void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) { bbio->bio.bi_status = status; if (bbio->bio.bi_pool == &btrfs_clone_bioset) { struct btrfs_bio *orig_bbio = bbio->private; - btrfs_cleanup_bio(bbio); + /* Free bio that was never submitted to the underlying device. */ + if (bbio_has_ordered_extent(bbio)) + btrfs_put_ordered_extent(bbio->ordered); + bio_put(&bbio->bio); + bbio = orig_bbio; } @@ -138,7 +122,15 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status) /* Load split bio's error which might be set above. */ if (status == BLK_STS_OK) bbio->bio.bi_status = READ_ONCE(bbio->status); - __btrfs_bio_end_io(bbio); + + if (bbio_has_ordered_extent(bbio)) { + struct btrfs_ordered_extent *ordered = bbio->ordered; + + bbio->end_io(bbio); + btrfs_put_ordered_extent(ordered); + } else { + bbio->end_io(bbio); + } } } @@ -200,7 +192,7 @@ static void btrfs_end_repair_bio(struct btrfs_bio *repair_bbio, btrfs_repair_io_failure(fs_info, btrfs_ino(inode), repair_bbio->file_offset, fs_info->sectorsize, repair_bbio->saved_iter.bi_sector << SECTOR_SHIFT, - page_folio(bv->bv_page), bv->bv_offset, mirror); + bvec_phys(bv), mirror); } while (mirror != fbio->bbio->mirror_num); done: @@ -520,7 +512,7 @@ static void btrfs_submit_bio(struct bio *bio, struct btrfs_io_context *bioc, } } -static blk_status_t btrfs_bio_csum(struct btrfs_bio *bbio) +static int btrfs_bio_csum(struct btrfs_bio *bbio) { if (bbio->bio.bi_opf & REQ_META) return btree_csum_one_bio(bbio); @@ -551,11 +543,11 @@ static void run_one_async_start(struct btrfs_work *work) { struct async_submit_bio *async = container_of(work, struct async_submit_bio, work); - blk_status_t ret; + int ret; ret = btrfs_bio_csum(async->bbio); if (ret) - async->bbio->bio.bi_status = ret; + async->bbio->bio.bi_status = errno_to_blk_status(ret); } /* @@ -581,7 +573,7 @@ static void run_one_async_done(struct btrfs_work *work, bool do_free) /* If an error occurred we just want to clean up the bio and move on. 
*/ if (bio->bi_status) { - btrfs_bio_end_io(async->bbio, async->bbio->bio.bi_status); + btrfs_bio_end_io(async->bbio, bio->bi_status); return; } @@ -682,8 +674,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) bool use_append = btrfs_use_zone_append(bbio); struct btrfs_io_context *bioc = NULL; struct btrfs_io_stripe smap; - blk_status_t ret; - int error; + blk_status_t status; + int ret; if (!bbio->inode || btrfs_is_data_reloc_root(inode->root)) smap.rst_search_commit_root = true; @@ -691,10 +683,10 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) smap.rst_search_commit_root = false; btrfs_bio_counter_inc_blocked(fs_info); - error = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, - &bioc, &smap, &mirror_num); - if (error) { - ret = errno_to_blk_status(error); + ret = btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length, + &bioc, &smap, &mirror_num); + if (ret) { + status = errno_to_blk_status(ret); btrfs_bio_counter_dec(fs_info); goto end_bbio; } @@ -708,7 +700,7 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) split = btrfs_split_bio(fs_info, bbio, map_length); if (IS_ERR(split)) { - ret = errno_to_blk_status(PTR_ERR(split)); + status = errno_to_blk_status(PTR_ERR(split)); btrfs_bio_counter_dec(fs_info); goto end_bbio; } @@ -723,7 +715,8 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) if (bio_op(bio) == REQ_OP_READ && is_data_bbio(bbio)) { bbio->saved_iter = bio->bi_iter; ret = btrfs_lookup_bio_sums(bbio); - if (ret) + status = errno_to_blk_status(ret); + if (status) goto fail; } @@ -756,13 +749,15 @@ static bool btrfs_submit_chunk(struct btrfs_bio *bbio, int mirror_num) goto done; ret = btrfs_bio_csum(bbio); - if (ret) + status = errno_to_blk_status(ret); + if (status) goto fail; } else if (use_append || (btrfs_is_zoned(fs_info) && inode && inode->flags & BTRFS_INODE_NODATASUM)) { ret = btrfs_alloc_dummy_sum(bbio); - if (ret) + status = errno_to_blk_status(ret); + if (status) goto fail; } } @@ -783,10 +778,10 @@ fail: ASSERT(bbio->bio.bi_pool == &btrfs_clone_bioset); ASSERT(remaining); - btrfs_bio_end_io(remaining, ret); + btrfs_bio_end_io(remaining, status); } end_bbio: - btrfs_bio_end_io(bbio, ret); + btrfs_bio_end_io(bbio, status); /* Do not submit another chunk */ return true; } @@ -811,8 +806,7 @@ void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num) * freeing the bio. */ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct folio *folio, - unsigned int folio_offset, int mirror_num) + u64 length, u64 logical, phys_addr_t paddr, int mirror_num) { struct btrfs_io_stripe smap = { 0 }; struct bio_vec bvec; @@ -843,8 +837,7 @@ int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, bio_init(&bio, smap.dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC); bio.bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT; - ret = bio_add_folio(&bio, folio, length, folio_offset); - ASSERT(ret); + __bio_add_page(&bio, phys_to_page(paddr), length, offset_in_page(paddr)); ret = submit_bio_wait(&bio); if (ret) { /* try to remap that extent elsewhere? 
*/ @@ -908,22 +901,18 @@ int __init btrfs_bioset_init(void) return -ENOMEM; if (bioset_init(&btrfs_clone_bioset, BIO_POOL_SIZE, offsetof(struct btrfs_bio, bio), 0)) - goto out_free_bioset; + goto out; if (bioset_init(&btrfs_repair_bioset, BIO_POOL_SIZE, offsetof(struct btrfs_bio, bio), BIOSET_NEED_BVECS)) - goto out_free_clone_bioset; + goto out; if (mempool_init_kmalloc_pool(&btrfs_failed_bio_pool, BIO_POOL_SIZE, sizeof(struct btrfs_failed_bio))) - goto out_free_repair_bioset; + goto out; return 0; -out_free_repair_bioset: - bioset_exit(&btrfs_repair_bioset); -out_free_clone_bioset: - bioset_exit(&btrfs_clone_bioset); -out_free_bioset: - bioset_exit(&btrfs_bioset); +out: + btrfs_bioset_exit(); return -ENOMEM; } diff --git a/fs/btrfs/bio.h b/fs/btrfs/bio.h index e2fe16074ad6..dc2eb43b7097 100644 --- a/fs/btrfs/bio.h +++ b/fs/btrfs/bio.h @@ -110,7 +110,6 @@ void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status); void btrfs_submit_bbio(struct btrfs_bio *bbio, int mirror_num); void btrfs_submit_repair_write(struct btrfs_bio *bbio, int mirror_num, bool dev_replace); int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start, - u64 length, u64 logical, struct folio *folio, - unsigned int folio_offset, int mirror_num); + u64 length, u64 logical, phys_addr_t paddr, int mirror_num); #endif diff --git a/fs/btrfs/block-group.c b/fs/btrfs/block-group.c index c0a8f7d92acc..5b0cb04b2b93 100644 --- a/fs/btrfs/block-group.c +++ b/fs/btrfs/block-group.c @@ -191,21 +191,21 @@ static int btrfs_bg_start_cmp(const struct rb_node *new, /* * This adds the block group to the fs_info rb tree for the block group cache */ -static int btrfs_add_block_group_cache(struct btrfs_fs_info *info, - struct btrfs_block_group *block_group) +static int btrfs_add_block_group_cache(struct btrfs_block_group *block_group) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct rb_node *exist; int ret = 0; ASSERT(block_group->length != 0); - write_lock(&info->block_group_cache_lock); + write_lock(&fs_info->block_group_cache_lock); exist = rb_find_add_cached(&block_group->cache_node, - &info->block_group_cache_tree, btrfs_bg_start_cmp); + &fs_info->block_group_cache_tree, btrfs_bg_start_cmp); if (exist) ret = -EEXIST; - write_unlock(&info->block_group_cache_lock); + write_unlock(&fs_info->block_group_cache_lock); return ret; } @@ -525,10 +525,9 @@ int btrfs_add_new_free_space(struct btrfs_block_group *block_group, u64 start, *total_added_ret = 0; while (start < end) { - if (!find_first_extent_bit(&info->excluded_extents, start, - &extent_start, &extent_end, - EXTENT_DIRTY | EXTENT_UPTODATE, - NULL)) + if (!btrfs_find_first_extent_bit(&info->excluded_extents, start, + &extent_start, &extent_end, + EXTENT_DIRTY, NULL)) break; if (extent_start <= start) { @@ -584,7 +583,7 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ struct btrfs_root *extent_root; u64 search_offset; u64 search_end = block_group->start + block_group->length; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key search_key; int ret = 0; @@ -626,7 +625,6 @@ static int sample_block_group_extent_item(struct btrfs_caching_control *caching_ lockdep_assert_held(&caching_ctl->mutex); lockdep_assert_held_read(&fs_info->commit_root_sem); - btrfs_free_path(path); return ret; } @@ -702,7 +700,7 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) struct btrfs_block_group *block_group = caching_ctl->block_group; struct btrfs_fs_info *fs_info = 
block_group->fs_info; struct btrfs_root *extent_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key key; u64 total_found = 0; @@ -738,8 +736,8 @@ static int load_extent_tree_free(struct btrfs_caching_control *caching_ctl) path->reada = READA_FORWARD; key.objectid = last; - key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; next: ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); @@ -785,8 +783,8 @@ next: if (key.objectid < last) { key.objectid = last; - key.offset = 0; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = 0; btrfs_release_path(path); goto next; } @@ -829,14 +827,13 @@ next: block_group->start + block_group->length, NULL); out: - btrfs_free_path(path); return ret; } static inline void btrfs_free_excluded_extents(const struct btrfs_block_group *bg) { - clear_extent_bits(&bg->fs_info->excluded_extents, bg->start, - bg->start + bg->length - 1, EXTENT_UPTODATE); + btrfs_clear_extent_bits(&bg->fs_info->excluded_extents, bg->start, + bg->start + bg->length - 1, EXTENT_DIRTY); } static noinline void caching_thread(struct btrfs_work *work) @@ -1421,9 +1418,8 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans, int ret; spin_lock(&fs_info->trans_lock); - if (trans->transaction->list.prev != &fs_info->trans_list) { - prev_trans = list_last_entry(&trans->transaction->list, - struct btrfs_transaction, list); + if (!list_is_first(&trans->transaction->list, &fs_info->trans_list)) { + prev_trans = list_prev_entry(trans->transaction, list); refcount_inc(&prev_trans->use_count); } spin_unlock(&fs_info->trans_lock); @@ -1440,14 +1436,14 @@ static bool clean_pinned_extents(struct btrfs_trans_handle *trans, */ mutex_lock(&fs_info->unused_bg_unpin_mutex); if (prev_trans) { - ret = clear_extent_bits(&prev_trans->pinned_extents, start, end, - EXTENT_DIRTY); + ret = btrfs_clear_extent_bits(&prev_trans->pinned_extents, start, end, + EXTENT_DIRTY); if (ret) goto out; } - ret = clear_extent_bits(&trans->transaction->pinned_extents, start, end, - EXTENT_DIRTY); + ret = btrfs_clear_extent_bits(&trans->transaction->pinned_extents, start, end, + EXTENT_DIRTY); out: mutex_unlock(&fs_info->unused_bg_unpin_mutex); if (prev_trans) @@ -1457,6 +1453,32 @@ out: } /* + * Link the block_group to a list via bg_list. + * + * @bg: The block_group to link to the list. + * @list: The list to link it to. + * + * Use this rather than list_add_tail() directly to ensure proper respect + * to locking and refcounting. + * + * Returns: true if the bg was linked with a refcount bump and false otherwise. + */ +static bool btrfs_link_bg_list(struct btrfs_block_group *bg, struct list_head *list) +{ + struct btrfs_fs_info *fs_info = bg->fs_info; + bool added = false; + + spin_lock(&fs_info->unused_bgs_lock); + if (list_empty(&bg->bg_list)) { + btrfs_get_block_group(bg); + list_add_tail(&bg->bg_list, list); + added = true; + } + spin_unlock(&fs_info->unused_bgs_lock); + return added; +} + +/* * Process the unused_bgs list and remove any that don't have any allocated * space inside of them. */ @@ -1571,8 +1593,7 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info) * drop under the "next" label for the * fs_info->unused_bgs list. 
*/ - btrfs_get_block_group(block_group); - list_add_tail(&block_group->bg_list, &retry_list); + btrfs_link_bg_list(block_group, &retry_list); trace_btrfs_skip_unused_block_group(block_group); spin_unlock(&block_group->lock); @@ -1823,7 +1844,8 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) list_sort(NULL, &fs_info->reclaim_bgs, reclaim_bgs_cmp); while (!list_empty(&fs_info->reclaim_bgs)) { u64 zone_unusable; - u64 reclaimed; + u64 used; + u64 reserved; int ret = 0; bg = list_first_entry(&fs_info->reclaim_bgs, @@ -1887,6 +1909,17 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) up_write(&space_info->groups_sem); goto next; } + + /* + * Cache the zone_unusable value before turning the block group + * to read only. As soon as the block group is read only its + * zone_unusable value gets moved to the block group's read-only + * bytes and isn't available for calculations anymore. We also + * cache it before unlocking the block group, to prevent races + * (reports from KCSAN and such tools) with tasks updating it. + */ + zone_unusable = bg->zone_unusable; + spin_unlock(&bg->lock); spin_unlock(&space_info->lock); @@ -1903,31 +1936,47 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) goto next; } - /* - * Cache the zone_unusable value before turning the block group - * to read only. As soon as the blog group is read only it's - * zone_unusable value gets moved to the block group's read-only - * bytes and isn't available for calculations anymore. - */ - zone_unusable = bg->zone_unusable; ret = inc_block_group_ro(bg, 0); up_write(&space_info->groups_sem); if (ret < 0) goto next; + /* + * The amount of bytes reclaimed corresponds to the sum of the + * "used" and "reserved" counters. We have set the block group + * to RO above, which prevents reservations from happening but + * we may have existing reservations for which allocation has + * not yet been done - btrfs_update_block_group() was not yet + * called, which is where we will transfer a reserved extent's + * size from the "reserved" counter to the "used" counter - this + * happens when running delayed references. When we relocate the + * chunk below, relocation first flushes delalloc, waits for + * ordered extent completion (which is where we create delayed + * references for data extents) and commits the current + * transaction (which runs delayed references), and only after + * it does the actual work to move extents out of the block + * group. So the reported amount of reclaimed bytes is + * effectively the sum of the 'used' and 'reserved' counters. 
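+ * + * Editorial example (hypothetical numbers, not from this patch): if the + * block group has used == 6G and reserved == 1G at this point, a + * successful relocation is accounted below as 7G reclaimed, since the + * reserved 1G only migrates to "used" while the relocation transaction + * runs its delayed references.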
+ */ + spin_lock(&bg->lock); + used = bg->used; + reserved = bg->reserved; + spin_unlock(&bg->lock); + btrfs_info(fs_info, - "reclaiming chunk %llu with %llu%% used %llu%% unusable", + "reclaiming chunk %llu with %llu%% used %llu%% reserved %llu%% unusable", bg->start, - div64_u64(bg->used * 100, bg->length), + div64_u64(used * 100, bg->length), + div64_u64(reserved * 100, bg->length), div64_u64(zone_unusable * 100, bg->length)); trace_btrfs_reclaim_block_group(bg); - reclaimed = bg->used; ret = btrfs_relocate_chunk(fs_info, bg->start); if (ret) { btrfs_dec_block_group_ro(bg); btrfs_err(fs_info, "error relocating chunk %llu", bg->start); - reclaimed = 0; + used = 0; + reserved = 0; spin_lock(&space_info->lock); space_info->reclaim_errors++; if (READ_ONCE(space_info->periodic_reclaim)) @@ -1936,24 +1985,13 @@ void btrfs_reclaim_bgs_work(struct work_struct *work) } spin_lock(&space_info->lock); space_info->reclaim_count++; - space_info->reclaim_bytes += reclaimed; + space_info->reclaim_bytes += used; + space_info->reclaim_bytes += reserved; spin_unlock(&space_info->lock); next: - if (ret && !READ_ONCE(space_info->periodic_reclaim)) { - /* Refcount held by the reclaim_bgs list after splice. */ - spin_lock(&fs_info->unused_bgs_lock); - /* - * This block group might be added to the unused list - * during the above process. Move it back to the - * reclaim list otherwise. - */ - if (list_empty(&bg->bg_list)) { - btrfs_get_block_group(bg); - list_add_tail(&bg->bg_list, &retry_list); - } - spin_unlock(&fs_info->unused_bgs_lock); - } + if (ret && !READ_ONCE(space_info->periodic_reclaim)) + btrfs_link_bg_list(bg, &retry_list); btrfs_put_block_group(bg); mutex_unlock(&fs_info->reclaim_bgs_lock); @@ -1993,13 +2031,8 @@ void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg) { struct btrfs_fs_info *fs_info = bg->fs_info; - spin_lock(&fs_info->unused_bgs_lock); - if (list_empty(&bg->bg_list)) { - btrfs_get_block_group(bg); + if (btrfs_link_bg_list(bg, &fs_info->reclaim_bgs)) trace_btrfs_add_reclaim_block_group(bg); - list_add_tail(&bg->bg_list, &fs_info->reclaim_bgs); - } - spin_unlock(&fs_info->unused_bgs_lock); } static int read_bg_from_eb(struct btrfs_fs_info *fs_info, const struct btrfs_key *key, @@ -2182,9 +2215,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) if (cache->start < BTRFS_SUPER_INFO_OFFSET) { stripe_len = BTRFS_SUPER_INFO_OFFSET - cache->start; cache->bytes_super += stripe_len; - ret = set_extent_bit(&fs_info->excluded_extents, cache->start, - cache->start + stripe_len - 1, - EXTENT_UPTODATE, NULL); + ret = btrfs_set_extent_bit(&fs_info->excluded_extents, cache->start, + cache->start + stripe_len - 1, + EXTENT_DIRTY, NULL); if (ret) return ret; } @@ -2210,9 +2243,9 @@ static int exclude_super_stripes(struct btrfs_block_group *cache) cache->start + cache->length - logical[nr]); cache->bytes_super += len; - ret = set_extent_bit(&fs_info->excluded_extents, logical[nr], - logical[nr] + len - 1, - EXTENT_UPTODATE, NULL); + ret = btrfs_set_extent_bit(&fs_info->excluded_extents, + logical[nr], logical[nr] + len - 1, + EXTENT_DIRTY, NULL); if (ret) { kfree(logical); return ret; @@ -2337,6 +2370,7 @@ static int read_one_block_group(struct btrfs_fs_info *info, cache->commit_used = cache->used; cache->flags = btrfs_stack_block_group_flags(bgi); cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi); + cache->space_info = btrfs_find_space_info(info, cache->flags); set_free_space_tree_thresholds(cache); @@ -2410,11 +2444,12 @@ static int 
read_one_block_group(struct btrfs_fs_info *info, goto error; } - ret = btrfs_add_block_group_cache(info, cache); + ret = btrfs_add_block_group_cache(cache); if (ret) { btrfs_remove_free_space_cache(cache); goto error; } + trace_btrfs_add_block_group(info, cache, 0); btrfs_add_bg_to_space_info(info, cache); @@ -2459,7 +2494,8 @@ static int fill_dummy_bgs(struct btrfs_fs_info *fs_info) bg->cached = BTRFS_CACHE_FINISHED; bg->used = map->chunk_len; bg->flags = map->type; - ret = btrfs_add_block_group_cache(fs_info, bg); + bg->space_info = btrfs_find_space_info(fs_info, bg->flags); + ret = btrfs_add_block_group_cache(bg); /* * We may have some valid block group cache added already, in * that case we skip to the next one. @@ -2509,8 +2545,8 @@ int btrfs_read_block_groups(struct btrfs_fs_info *info) return fill_dummy_bgs(info); key.objectid = 0; - key.offset = 0; key.type = BTRFS_BLOCK_GROUP_ITEM_KEY; + key.offset = 0; path = btrfs_alloc_path(); if (!path) return -ENOMEM; @@ -2641,7 +2677,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, { struct btrfs_fs_info *fs_info = device->fs_info; struct btrfs_root *root = fs_info->dev_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dev_extent *extent; struct extent_buffer *leaf; struct btrfs_key key; @@ -2658,7 +2694,7 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, key.offset = start; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(*extent)); if (ret) - goto out; + return ret; leaf = path->nodes[0]; extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent); @@ -2666,10 +2702,8 @@ static int insert_dev_extent(struct btrfs_trans_handle *trans, btrfs_set_dev_extent_chunk_objectid(leaf, extent, BTRFS_FIRST_CHUNK_TREE_OBJECTID); btrfs_set_dev_extent_chunk_offset(leaf, extent, chunk_offset); - btrfs_set_dev_extent_length(leaf, extent, num_bytes); -out: - btrfs_free_path(path); + return ret; } @@ -2771,8 +2805,12 @@ void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans) /* Already aborted the transaction if it failed. */ next: btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); + + spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); clear_bit(BLOCK_GROUP_FLAG_NEW, &block_group->runtime_flags); + btrfs_put_block_group(block_group); + spin_unlock(&fs_info->unused_bgs_lock); /* * If the block group is still unused, add it to the list of @@ -2830,8 +2868,8 @@ static u64 calculate_global_root_id(const struct btrfs_fs_info *fs_info, u64 off } struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, - u64 type, - u64 chunk_offset, u64 size) + struct btrfs_space_info *space_info, + u64 type, u64 chunk_offset, u64 size) { struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *cache; @@ -2885,10 +2923,10 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran * assigned to our block group. We want our bg to be added to the rbtree * with its ->space_info set. 
*/ - cache->space_info = btrfs_find_space_info(fs_info, cache->flags); + cache->space_info = space_info; ASSERT(cache->space_info); - ret = btrfs_add_block_group_cache(fs_info, cache); + ret = btrfs_add_block_group_cache(cache); if (ret) { btrfs_remove_free_space_cache(cache); btrfs_put_block_group(cache); @@ -2910,7 +2948,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran } #endif - list_add_tail(&cache->bg_list, &trans->new_bgs); + btrfs_link_bg_list(cache, &trans->new_bgs); btrfs_inc_delayed_refs_rsv_bg_inserts(fs_info); set_avail_alloc_bits(fs_info, type); @@ -2930,6 +2968,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, bool do_chunk_alloc) { struct btrfs_fs_info *fs_info = cache->fs_info; + struct btrfs_space_info *space_info = cache->space_info; struct btrfs_trans_handle *trans; struct btrfs_root *root = btrfs_block_group_root(fs_info); u64 alloc_flags; @@ -2982,7 +3021,7 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, */ alloc_flags = btrfs_get_alloc_profile(fs_info, cache->flags); if (alloc_flags != cache->flags) { - ret = btrfs_chunk_alloc(trans, alloc_flags, + ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE); /* * ENOSPC is allowed here, we may have enough space @@ -3010,15 +3049,15 @@ int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, (cache->flags & BTRFS_BLOCK_GROUP_SYSTEM)) goto unlock_out; - alloc_flags = btrfs_get_alloc_profile(fs_info, cache->space_info->flags); - ret = btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); + alloc_flags = btrfs_get_alloc_profile(fs_info, space_info->flags); + ret = btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE); if (ret < 0) goto out; /* * We have allocated a new chunk. We also need to activate that chunk to * grant metadata tickets for zoned filesystem. 
*/ - ret = btrfs_zoned_activate_one_bg(fs_info, cache->space_info, true); + ret = btrfs_zoned_activate_one_bg(fs_info, space_info, true); if (ret < 0) goto out; @@ -3306,7 +3345,7 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *cache, *tmp; struct btrfs_transaction *cur_trans = trans->transaction; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); if (list_empty(&cur_trans->dirty_bgs) || !btrfs_test_opt(fs_info, SPACE_CACHE)) @@ -3323,7 +3362,6 @@ int btrfs_setup_space_cache(struct btrfs_trans_handle *trans) cache_save_setup(cache, trans, path); } - btrfs_free_path(path); return 0; } @@ -3346,7 +3384,7 @@ int btrfs_start_dirty_block_groups(struct btrfs_trans_handle *trans) struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; int should_put; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); LIST_HEAD(dirty); struct list_head *io = &cur_trans->io_bgs; int loops = 0; @@ -3501,7 +3539,6 @@ out: btrfs_cleanup_dirty_bgs(cur_trans, fs_info); } - btrfs_free_path(path); return ret; } @@ -3512,7 +3549,7 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) struct btrfs_transaction *cur_trans = trans->transaction; int ret = 0; int should_put; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct list_head *io = &cur_trans->io_bgs; path = btrfs_alloc_path(); @@ -3624,7 +3661,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans) btrfs_put_block_group(cache); } - btrfs_free_path(path); return ret; } @@ -3703,8 +3739,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&space_info->lock); - set_extent_bit(&trans->transaction->pinned_extents, bytenr, - bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); + btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr, + bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); } spin_lock(&trans->transaction->dirty_bgs_lock); @@ -3793,17 +3829,17 @@ out: /* * Update the block_group and space info counters. * - * @cache: The cache we are manipulating - * @num_bytes: The number of bytes in question - * @delalloc: The blocks are allocated for the delalloc write + * @cache: The cache we are manipulating. + * @num_bytes: The number of bytes in question. + * @is_delalloc: Whether the blocks are allocated for a delalloc write. * * This is called by somebody who is freeing space that was never actually used * on disk. For example if you reserve some space for a new leaf in transaction * A and before transaction A commits you free that leaf, you call this with * reserve set to 0 in order to clear the reservation. 
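* * Editorial example (illustrative): dropping a 16K metadata reservation * made for a leaf that was freed before the transaction committed: * btrfs_free_reserved_bytes(cache, 16384, false); * With the new bool argument, false says the bytes were not a delalloc * reservation, so cache->delalloc_bytes is left untouched.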
*/ -void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, - u64 num_bytes, int delalloc) +void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, + bool is_delalloc) { struct btrfs_space_info *space_info = cache->space_info; @@ -3817,7 +3853,7 @@ void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, space_info->bytes_reserved -= num_bytes; space_info->max_extent_size = 0; - if (delalloc) + if (is_delalloc) cache->delalloc_bytes -= num_bytes; spin_unlock(&cache->lock); @@ -3836,14 +3872,14 @@ static void force_metadata_allocation(struct btrfs_fs_info *info) } } -static int should_alloc_chunk(const struct btrfs_fs_info *fs_info, - const struct btrfs_space_info *sinfo, int force) +static bool should_alloc_chunk(const struct btrfs_fs_info *fs_info, + const struct btrfs_space_info *sinfo, int force) { u64 bytes_used = btrfs_space_info_used(sinfo, false); u64 thresh; if (force == CHUNK_ALLOC_FORCE) - return 1; + return true; /* * in limited mode, we want to have some free space up to @@ -3854,22 +3890,31 @@ static int should_alloc_chunk(const struct btrfs_fs_info *fs_info, thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1)); if (sinfo->total_bytes - bytes_used < thresh) - return 1; + return true; } if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80)) - return 0; - return 1; + return false; + return true; } int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type) { u64 alloc_flags = btrfs_get_alloc_profile(trans->fs_info, type); + struct btrfs_space_info *space_info; - return btrfs_chunk_alloc(trans, alloc_flags, CHUNK_ALLOC_FORCE); + space_info = btrfs_find_space_info(trans->fs_info, type); + if (!space_info) { + DEBUG_WARN(); + return -EINVAL; + } + + return btrfs_chunk_alloc(trans, space_info, alloc_flags, CHUNK_ALLOC_FORCE); } -static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags) +static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_space_info *space_info, + u64 flags) { struct btrfs_block_group *bg; int ret; @@ -3882,7 +3927,7 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans */ check_system_chunk(trans, flags); - bg = btrfs_create_chunk(trans, flags); + bg = btrfs_create_chunk(trans, space_info, flags); if (IS_ERR(bg)) { ret = PTR_ERR(bg); goto out; @@ -3930,8 +3975,16 @@ static struct btrfs_block_group *do_chunk_alloc(struct btrfs_trans_handle *trans if (ret == -ENOSPC) { const u64 sys_flags = btrfs_system_alloc_profile(trans->fs_info); struct btrfs_block_group *sys_bg; + struct btrfs_space_info *sys_space_info; + + sys_space_info = btrfs_find_space_info(trans->fs_info, sys_flags); + if (!sys_space_info) { + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } - sys_bg = btrfs_create_chunk(trans, sys_flags); + sys_bg = btrfs_create_chunk(trans, sys_space_info, sys_flags); if (IS_ERR(sys_bg)) { ret = PTR_ERR(sys_bg); btrfs_abort_transaction(trans, ret); @@ -4062,6 +4115,8 @@ out: * * This function, btrfs_chunk_alloc(), belongs to phase 1. * + * @space_info: specify which space_info the new chunk should belong to. + * * If @force is CHUNK_ALLOC_FORCE: * - return 1 if it successfully allocates a chunk, * - return errors including -ENOSPC otherwise. @@ -4070,11 +4125,11 @@ out: * - return 1 if it successfully allocates a chunk, * - return errors including -ENOSPC otherwise. 
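* * Editorial note: after this change a caller resolves the space_info * first, mirroring btrfs_force_chunk_alloc() above: * space_info = btrfs_find_space_info(fs_info, flags); * ret = btrfs_chunk_alloc(trans, space_info, flags, CHUNK_ALLOC_FORCE);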
*/ -int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, +int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_space_info *space_info, u64 flags, enum btrfs_chunk_alloc_enum force) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_space_info *space_info; struct btrfs_block_group *ret_bg; bool wait_for_alloc = false; bool should_alloc = false; @@ -4113,9 +4168,6 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, if (flags & BTRFS_BLOCK_GROUP_SYSTEM) return -ENOSPC; - space_info = btrfs_find_space_info(fs_info, flags); - ASSERT(space_info); - do { spin_lock(&space_info->lock); if (force < space_info->force_alloc) @@ -4176,7 +4228,7 @@ int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, force_metadata_allocation(fs_info); } - ret_bg = do_chunk_alloc(trans, flags); + ret_bg = do_chunk_alloc(trans, space_info, flags); trans->allocating_chunk = false; if (IS_ERR(ret_bg)) { @@ -4252,6 +4304,10 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans, if (left < bytes) { u64 flags = btrfs_system_alloc_profile(fs_info); struct btrfs_block_group *bg; + struct btrfs_space_info *space_info; + + space_info = btrfs_find_space_info(fs_info, flags); + ASSERT(space_info); /* * Ignore failure to create system chunk. We might end up not @@ -4259,7 +4315,7 @@ static void reserve_chunk_space(struct btrfs_trans_handle *trans, * the paths we visit in the chunk tree (they were already COWed * or created in the current transaction for example). */ - bg = btrfs_create_chunk(trans, flags); + bg = btrfs_create_chunk(trans, space_info, flags); if (IS_ERR(bg)) { ret = PTR_ERR(bg); } else { @@ -4367,6 +4423,43 @@ void btrfs_put_block_group_cache(struct btrfs_fs_info *info) } } +static void check_removing_space_info(struct btrfs_space_info *space_info) +{ + struct btrfs_fs_info *info = space_info->fs_info; + + if (space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY) { + /* This is a top space_info, proceed with its children first. */ + for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) { + if (space_info->sub_group[i]) { + check_removing_space_info(space_info->sub_group[i]); + kfree(space_info->sub_group[i]); + space_info->sub_group[i] = NULL; + } + } + } + + /* + * Do not hide this behind enospc_debug, this is actually important and + * indicates a real bug if this happens. + */ + if (WARN_ON(space_info->bytes_pinned > 0 || space_info->bytes_may_use > 0)) + btrfs_dump_space_info(info, space_info, 0, 0); + + /* + * If there was a failure to cleanup a log tree, very likely due to an + * IO failure on a writeback attempt of one or more of its extent + * buffers, we could not do proper (and cheap) unaccounting of their + * reserved space, so don't warn on bytes_reserved > 0 in that case. 
+ */ + if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || + !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { + if (WARN_ON(space_info->bytes_reserved > 0)) + btrfs_dump_space_info(info, space_info, 0, 0); + } + + WARN_ON(space_info->reclaim_size > 0); +} + /* * Must be called only after stopping all workers, since we could have block * group caching kthreads running, and therefore they could race with us if we @@ -4392,8 +4485,8 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) write_lock(&info->block_group_cache_lock); while (!list_empty(&info->caching_block_groups)) { - caching_ctl = list_entry(info->caching_block_groups.next, - struct btrfs_caching_control, list); + caching_ctl = list_first_entry(&info->caching_block_groups, + struct btrfs_caching_control, list); list_del(&caching_ctl->list); btrfs_put_caching_control(caching_ctl); } @@ -4464,32 +4557,10 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info) btrfs_release_global_block_rsv(info); while (!list_empty(&info->space_info)) { - space_info = list_entry(info->space_info.next, - struct btrfs_space_info, - list); - - /* - * Do not hide this behind enospc_debug, this is actually - * important and indicates a real bug if this happens. - */ - if (WARN_ON(space_info->bytes_pinned > 0 || - space_info->bytes_may_use > 0)) - btrfs_dump_space_info(info, space_info, 0, 0); - - /* - * If there was a failure to cleanup a log tree, very likely due - * to an IO failure on a writeback attempt of one or more of its - * extent buffers, we could not do proper (and cheap) unaccounting - * of their reserved space, so don't warn on bytes_reserved > 0 in - * that case. - */ - if (!(space_info->flags & BTRFS_BLOCK_GROUP_METADATA) || - !BTRFS_FS_LOG_CLEANUP_ERROR(info)) { - if (WARN_ON(space_info->bytes_reserved > 0)) - btrfs_dump_space_info(info, space_info, 0, 0); - } + space_info = list_first_entry(&info->space_info, + struct btrfs_space_info, list); - WARN_ON(space_info->reclaim_size > 0); + check_removing_space_info(space_info); list_del(&space_info->list); btrfs_sysfs_remove_space_info(space_info); } diff --git a/fs/btrfs/block-group.h b/fs/btrfs/block-group.h index 36937eeab9b8..9de356bcb411 100644 --- a/fs/btrfs/block-group.h +++ b/fs/btrfs/block-group.h @@ -326,8 +326,8 @@ void btrfs_reclaim_bgs(struct btrfs_fs_info *fs_info); void btrfs_mark_bg_to_reclaim(struct btrfs_block_group *bg); int btrfs_read_block_groups(struct btrfs_fs_info *info); struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *trans, - u64 type, - u64 chunk_offset, u64 size); + struct btrfs_space_info *space_info, + u64 type, u64 chunk_offset, u64 size); void btrfs_create_pending_block_groups(struct btrfs_trans_handle *trans); int btrfs_inc_block_group_ro(struct btrfs_block_group *cache, bool do_chunk_alloc); @@ -340,9 +340,10 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans, int btrfs_add_reserved_bytes(struct btrfs_block_group *cache, u64 ram_bytes, u64 num_bytes, int delalloc, bool force_wrong_size_class); -void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, - u64 num_bytes, int delalloc); -int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, u64 flags, +void btrfs_free_reserved_bytes(struct btrfs_block_group *cache, u64 num_bytes, + bool is_delalloc); +int btrfs_chunk_alloc(struct btrfs_trans_handle *trans, + struct btrfs_space_info *space_info, u64 flags, enum btrfs_chunk_alloc_enum force); int btrfs_force_chunk_alloc(struct btrfs_trans_handle *trans, u64 type); void check_system_chunk(struct btrfs_trans_handle 
*trans, const u64 type); diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index 3f3608299c0b..5ad6de738aee 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -418,6 +418,9 @@ void btrfs_init_root_block_rsv(struct btrfs_root *root) case BTRFS_CHUNK_TREE_OBJECTID: root->block_rsv = &fs_info->chunk_block_rsv; break; + case BTRFS_TREE_LOG_OBJECTID: + root->block_rsv = &fs_info->treelog_rsv; + break; default: root->block_rsv = NULL; break; @@ -438,6 +441,14 @@ void btrfs_init_global_block_rsv(struct btrfs_fs_info *fs_info) fs_info->delayed_block_rsv.space_info = space_info; fs_info->delayed_refs_rsv.space_info = space_info; + /* The treelog_rsv uses a dedicated space_info in zoned mode. */ + if (!btrfs_is_zoned(fs_info)) { + fs_info->treelog_rsv.space_info = space_info; + } else { + ASSERT(space_info->sub_group[0]->subgroup_id == BTRFS_SUB_GROUP_TREELOG); + fs_info->treelog_rsv.space_info = space_info->sub_group[0]; + } + btrfs_update_global_block_rsv(fs_info); } diff --git a/fs/btrfs/block-rsv.h b/fs/btrfs/block-rsv.h index d12b1fac5c74..79ae9d05cd91 100644 --- a/fs/btrfs/block-rsv.h +++ b/fs/btrfs/block-rsv.h @@ -24,6 +24,7 @@ enum btrfs_rsv_type { BTRFS_BLOCK_RSV_CHUNK, BTRFS_BLOCK_RSV_DELOPS, BTRFS_BLOCK_RSV_DELREFS, + BTRFS_BLOCK_RSV_TREELOG, BTRFS_BLOCK_RSV_EMPTY, BTRFS_BLOCK_RSV_TEMP, }; diff --git a/fs/btrfs/btrfs_inode.h b/fs/btrfs/btrfs_inode.h index b2fa33911c28..a79fa0726f1d 100644 --- a/fs/btrfs/btrfs_inode.h +++ b/fs/btrfs/btrfs_inode.h @@ -145,6 +145,7 @@ struct btrfs_inode { * different from prop_compress and takes precedence if set. */ u8 defrag_compress; + s8 defrag_compress_level; /* * Lock for counters and all fields used to determine if the inode is in @@ -516,15 +517,23 @@ static inline void btrfs_assert_inode_locked(struct btrfs_inode *inode) lockdep_assert_held(&inode->vfs_inode.i_rwsem); } +static inline void btrfs_update_inode_mapping_flags(struct btrfs_inode *inode) +{ + if (inode->flags & BTRFS_INODE_NODATASUM) + mapping_clear_stable_writes(inode->vfs_inode.i_mapping); + else + mapping_set_stable_writes(inode->vfs_inode.i_mapping); +} + /* Array of bytes with variable length, hexadecimal format 0x1234 */ #define CSUM_FMT "0x%*phN" #define CSUM_FMT_VALUE(size, bytes) size, bytes -int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, - u32 pgoff, u8 *csum, const u8 * const csum_expected); +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum, + const u8 * const csum_expected); bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u32 bio_offset, struct bio_vec *bv); -noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, +noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, bool nowait); @@ -538,8 +547,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, struct btrfs_inode *parent_inode, struct btrfs_inode *inode, const struct fscrypt_str *name, int add_backref, u64 index); int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry); -int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, - int front); +int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end); int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context); int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr, @@ -584,9 +592,9 @@ void btrfs_free_inode(struct inode *inode); int btrfs_drop_inode(struct inode *inode); int 
__init btrfs_init_cachep(void); void __cold btrfs_destroy_cachep(void); -struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, - struct btrfs_path *path); -struct inode *btrfs_iget(u64 ino, struct btrfs_root *root); +struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, + struct btrfs_path *path); +struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root); struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct folio *folio, u64 start, u64 len); int btrfs_update_inode(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/compression.c b/fs/btrfs/compression.c index 0c4d486c3048..48d07939fee4 100644 --- a/fs/btrfs/compression.c +++ b/fs/btrfs/compression.c @@ -285,12 +285,12 @@ static noinline void end_compressed_writeback(const struct compressed_bio *cb) unsigned long index = cb->start >> PAGE_SHIFT; unsigned long end_index = (cb->start + cb->len - 1) >> PAGE_SHIFT; struct folio_batch fbatch; - const int error = blk_status_to_errno(cb->bbio.bio.bi_status); int i; int ret; - if (error) - mapping_set_error(inode->i_mapping, error); + ret = blk_status_to_errno(cb->bbio.bio.bi_status); + if (ret) + mapping_set_error(inode->i_mapping, ret); folio_batch_init(&fbatch); while (index <= end_index) { @@ -499,9 +499,9 @@ static noinline int add_ra_bio_pages(struct inode *inode, } page_end = (pg_index << PAGE_SHIFT) + folio_size(folio) - 1; - lock_extent(tree, cur, page_end, NULL); + btrfs_lock_extent(tree, cur, page_end, NULL); read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, cur, page_end + 1 - cur); + em = btrfs_lookup_extent_mapping(em_tree, cur, page_end + 1 - cur); read_unlock(&em_tree->lock); /* @@ -510,20 +510,20 @@ static noinline int add_ra_bio_pages(struct inode *inode, * to this compressed extent on disk. 
*/ if (!em || cur < em->start || - (cur + fs_info->sectorsize > extent_map_end(em)) || - (extent_map_block_start(em) >> SECTOR_SHIFT) != + (cur + fs_info->sectorsize > btrfs_extent_map_end(em)) || + (btrfs_extent_map_block_start(em) >> SECTOR_SHIFT) != orig_bio->bi_iter.bi_sector) { - free_extent_map(em); - unlock_extent(tree, cur, page_end, NULL); + btrfs_free_extent_map(em); + btrfs_unlock_extent(tree, cur, page_end, NULL); folio_unlock(folio); folio_put(folio); break; } add_size = min(em->start + em->len, page_end + 1) - cur; - free_extent_map(em); - unlock_extent(tree, cur, page_end, NULL); + btrfs_free_extent_map(em); + btrfs_unlock_extent(tree, cur, page_end, NULL); - if (folio->index == end_index) { + if (folio_contains(folio, end_index)) { size_t zero_offset = offset_in_folio(folio, isize); if (zero_offset) { @@ -576,19 +576,19 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) struct extent_map *em; unsigned long pflags; int memstall = 0; - blk_status_t ret; - int ret2; + blk_status_t status; + int ret; /* we need the actual starting offset of this extent in the file */ read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize); + em = btrfs_lookup_extent_mapping(em_tree, file_offset, fs_info->sectorsize); read_unlock(&em_tree->lock); if (!em) { - ret = BLK_STS_IOERR; + status = BLK_STS_IOERR; goto out; } - ASSERT(extent_map_is_compressed(em)); + ASSERT(btrfs_extent_map_is_compressed(em)); compressed_len = em->disk_num_bytes; cb = alloc_compressed_bio(inode, file_offset, REQ_OP_READ, @@ -600,21 +600,21 @@ void btrfs_submit_compressed_read(struct btrfs_bio *bbio) cb->len = bbio->bio.bi_iter.bi_size; cb->compressed_len = compressed_len; - cb->compress_type = extent_map_compression(em); + cb->compress_type = btrfs_extent_map_compression(em); cb->orig_bbio = bbio; - free_extent_map(em); + btrfs_free_extent_map(em); cb->nr_folios = DIV_ROUND_UP(compressed_len, PAGE_SIZE); - cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct page *), GFP_NOFS); + cb->compressed_folios = kcalloc(cb->nr_folios, sizeof(struct folio *), GFP_NOFS); if (!cb->compressed_folios) { - ret = BLK_STS_RESOURCE; + status = BLK_STS_RESOURCE; goto out_free_bio; } - ret2 = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios); - if (ret2) { - ret = BLK_STS_RESOURCE; + ret = btrfs_alloc_folio_array(cb->nr_folios, cb->compressed_folios); + if (ret) { + status = BLK_STS_RESOURCE; goto out_free_compressed_pages; } @@ -637,7 +637,7 @@ out_free_compressed_pages: out_free_bio: bio_put(&cb->bbio.bio); out: - btrfs_bio_end_io(bbio, ret); + btrfs_bio_end_io(bbio, status); } /* @@ -740,7 +740,7 @@ static const struct btrfs_compress_op * const btrfs_compress_op[] = { &btrfs_zstd_compress, }; -static struct list_head *alloc_workspace(int type, unsigned int level) +static struct list_head *alloc_workspace(int type, int level) { switch (type) { case BTRFS_COMPRESS_NONE: return alloc_heuristic_ws(); @@ -818,7 +818,7 @@ static void btrfs_cleanup_workspace_manager(int type) * Preallocation makes a forward progress guarantee and we do not return * errors.
*/ -struct list_head *btrfs_get_workspace(int type, unsigned int level) +struct list_head *btrfs_get_workspace(int type, int level) { struct workspace_manager *wsm; struct list_head *workspace; @@ -968,18 +968,28 @@ static void put_workspace(int type, struct list_head *ws) * Adjust @level according to the limits of the compression algorithm or * fallback to default */ -static unsigned int btrfs_compress_set_level(int type, unsigned level) +static int btrfs_compress_set_level(unsigned int type, int level) { const struct btrfs_compress_op *ops = btrfs_compress_op[type]; if (level == 0) level = ops->default_level; else - level = min(level, ops->max_level); + level = min(max(level, ops->min_level), ops->max_level); return level; } +/* + * Check whether the @level is within the valid range for the given type. + */ +bool btrfs_compress_level_valid(unsigned int type, int level) +{ + const struct btrfs_compress_op *ops = btrfs_compress_op[type]; + + return ops->min_level <= level && level <= ops->max_level; +} + /* Wrapper around find_get_page(), with extra error message. */ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, struct folio **in_folio_ret) @@ -1023,12 +1033,10 @@ int btrfs_compress_filemap_get_folio(struct address_space *mapping, u64 start, * @total_out is an in/out parameter, must be set to the input length and will * be also used to return the total number of compressed bytes */ -int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping, +int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) { - int type = btrfs_compress_type(type_level); - int level = btrfs_compress_level(type_level); const unsigned long orig_len = *total_out; struct list_head *workspace; int ret; @@ -1130,6 +1138,22 @@ void __cold btrfs_exit_compress(void) } /* + * The bvec is a single page bvec from a bio that contains folios from a filemap. + * + * Since the folio may be a large one, and if the bv_page is not a head page of + * a large folio, then page->index is unreliable. + * + * Thus we need this helper to grab the proper file offset. + */ +static u64 file_offset_from_bvec(const struct bio_vec *bvec) +{ + const struct page *page = bvec->bv_page; + const struct folio *folio = page_folio(page); + + return (page_pgoff(folio, page) << PAGE_SHIFT) + bvec->bv_offset; +} + +/* * Copy decompressed data from working buffer to pages. * * @buf: The decompressed data buffer @@ -1174,13 +1198,14 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len, u32 copy_start; /* Offset inside the full decompressed extent */ u32 bvec_offset; + void *kaddr; bvec = bio_iter_iovec(orig_bio, orig_bio->bi_iter); /* * cb->start may underflow, but subtracting that value can still * give us correct offset inside the full decompressed extent. */ - bvec_offset = page_offset(bvec.bv_page) + bvec.bv_offset - cb->start; + bvec_offset = file_offset_from_bvec(&bvec) - cb->start; /* Haven't reached the bvec range, exit */ if (decompressed + buf_len <= bvec_offset) @@ -1196,10 +1221,12 @@ int btrfs_decompress_buf2page(const char *buf, u32 buf_len, * @buf + @buf_len. 
*/ ASSERT(copy_start - decompressed < buf_len); - memcpy_to_page(bvec.bv_page, bvec.bv_offset, - buf + copy_start - decompressed, copy_len); - cur_offset += copy_len; + kaddr = bvec_kmap_local(&bvec); + memcpy(kaddr, buf + copy_start - decompressed, copy_len); + kunmap_local(kaddr); + + cur_offset += copy_len; bio_advance(orig_bio, copy_len); /* Finished the bio */ if (!orig_bio->bi_iter.bi_size) @@ -1590,18 +1617,19 @@ out: /* * Convert the compression suffix (eg. after "zlib" starting with ":") to - * level, unrecognized string will set the default level + * a level; an unrecognized string will set the default level. Negative level + * numbers are allowed. */ -unsigned int btrfs_compress_str2level(unsigned int type, const char *str) +int btrfs_compress_str2level(unsigned int type, const char *str) { - unsigned int level = 0; + int level = 0; int ret; if (!type) return 0; if (str[0] == ':') { - ret = kstrtouint(str + 1, 10, &level); + ret = kstrtoint(str + 1, 10, &level); if (ret) level = 0; } diff --git a/fs/btrfs/compression.h b/fs/btrfs/compression.h index 954034086d0d..d34c4341eaf4 100644 --- a/fs/btrfs/compression.h +++ b/fs/btrfs/compression.h @@ -11,7 +11,9 @@ #include <linux/list.h> #include <linux/workqueue.h> #include <linux/wait.h> +#include <linux/pagemap.h> #include "bio.h" +#include "messages.h" struct address_space; struct page; @@ -72,28 +74,22 @@ struct compressed_bio { struct btrfs_bio bbio; }; -static inline unsigned int btrfs_compress_type(unsigned int type_level) -{ - return (type_level & 0xF); -} - -static inline unsigned int btrfs_compress_level(unsigned int type_level) -{ - return ((type_level & 0xF0) >> 4); -} - /* @range_end must be exclusive. */ -static inline u32 btrfs_calc_input_length(u64 range_end, u64 cur) +static inline u32 btrfs_calc_input_length(struct folio *folio, u64 range_end, u64 cur) { - u64 page_end = round_down(cur, PAGE_SIZE) + PAGE_SIZE; + const u64 folio_end = folio_pos(folio) + folio_size(folio); - return min(range_end, page_end) - cur; + /* @cur must be inside the folio.
*/ + ASSERT(folio_pos(folio) <= cur); + ASSERT(cur < folio_end); + return min(range_end, folio_end) - cur; } int __init btrfs_init_compress(void); void __cold btrfs_exit_compress(void); -int btrfs_compress_folios(unsigned int type_level, struct address_space *mapping, +bool btrfs_compress_level_valid(unsigned int type, int level); +int btrfs_compress_folios(unsigned int type, int level, struct address_space *mapping, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out); int btrfs_decompress(int type, const u8 *data_in, struct folio *dest_folio, @@ -107,7 +103,7 @@ void btrfs_submit_compressed_write(struct btrfs_ordered_extent *ordered, bool writeback); void btrfs_submit_compressed_read(struct btrfs_bio *bbio); -unsigned int btrfs_compress_str2level(unsigned int type, const char *str); +int btrfs_compress_str2level(unsigned int type, const char *str); struct folio *btrfs_alloc_compr_folio(void); void btrfs_free_compr_folio(struct folio *folio); @@ -131,14 +127,15 @@ struct workspace_manager { wait_queue_head_t ws_wait; }; -struct list_head *btrfs_get_workspace(int type, unsigned int level); +struct list_head *btrfs_get_workspace(int type, int level); void btrfs_put_workspace(int type, struct list_head *ws); struct btrfs_compress_op { struct workspace_manager *workspace_manager; /* Maximum level supported by the compression algorithm */ - unsigned int max_level; - unsigned int default_level; + int min_level; + int max_level; + int default_level; }; /* The heuristic workspaces are managed via the 0th workspace manager */ @@ -187,9 +184,9 @@ int zstd_decompress(struct list_head *ws, const u8 *data_in, size_t destlen); void zstd_init_workspace_manager(void); void zstd_cleanup_workspace_manager(void); -struct list_head *zstd_alloc_workspace(unsigned int level); +struct list_head *zstd_alloc_workspace(int level); void zstd_free_workspace(struct list_head *ws); -struct list_head *zstd_get_workspace(unsigned int level); +struct list_head *zstd_get_workspace(int level); void zstd_put_workspace(struct list_head *ws); #endif diff --git a/fs/btrfs/ctree.c b/fs/btrfs/ctree.c index 3dc5a35dd19b..a2e7979372cc 100644 --- a/fs/btrfs/ctree.c +++ b/fs/btrfs/ctree.c @@ -4306,7 +4306,7 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, u32 data_size) { int ret = 0; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; unsigned long ptr; @@ -4320,7 +4320,6 @@ int btrfs_insert_item(struct btrfs_trans_handle *trans, struct btrfs_root *root, write_extent_buffer(leaf, data, ptr, data_size); btrfs_mark_buffer_dirty(trans, leaf); } - btrfs_free_path(path); return ret; } @@ -4608,7 +4607,6 @@ int btrfs_search_forward(struct btrfs_root *root, struct btrfs_key *min_key, u64 min_trans) { struct extent_buffer *cur; - struct btrfs_key found_key; int slot; int sret; u32 nritems; @@ -4644,7 +4642,8 @@ again: goto find_next_key; ret = 0; path->slots[level] = slot; - btrfs_item_key_to_cpu(cur, &found_key, slot); + /* Save our key for returning back. 
*/ + btrfs_item_key_to_cpu(cur, min_key, slot); goto out; } if (sret && slot > 0) @@ -4668,8 +4667,8 @@ find_next_key: * we didn't find a candidate key in this node, walk forward * and find another one */ + path->slots[level] = slot; if (slot >= nritems) { - path->slots[level] = slot; sret = btrfs_find_next_key(root, path, min_key, level, min_trans); if (sret == 0) { @@ -4679,11 +4678,10 @@ find_next_key: goto out; } } - /* save our key for returning back */ - btrfs_node_key_to_cpu(cur, &found_key, slot); - path->slots[level] = slot; if (level == path->lowest_level) { ret = 0; + /* Save our key for returning back. */ + btrfs_node_key_to_cpu(cur, min_key, slot); goto out; } cur = btrfs_read_node_slot(cur, slot); @@ -4700,10 +4698,8 @@ find_next_key: } out: path->keep_locks = keep_locks; - if (ret == 0) { + if (ret == 0) btrfs_unlock_up_safe(path, path->lowest_level + 1); - memcpy(min_key, &found_key, sizeof(found_key)); - } return ret; } diff --git a/fs/btrfs/ctree.h b/fs/btrfs/ctree.h index 1096a80a64e7..71fa42ca04fe 100644 --- a/fs/btrfs/ctree.h +++ b/fs/btrfs/ctree.h @@ -6,7 +6,7 @@ #ifndef BTRFS_CTREE_H #define BTRFS_CTREE_H -#include "linux/cleanup.h" +#include <linux/cleanup.h> #include <linux/spinlock.h> #include <linux/rbtree.h> #include <linux/mutex.h> @@ -61,7 +61,6 @@ struct btrfs_path { /* if there is real range locking, this locks field will change */ u8 locks[BTRFS_MAX_LEVEL]; u8 reada; - /* keep some upper locks as we walk down */ u8 lowest_level; /* @@ -69,6 +68,7 @@ struct btrfs_path { * and to force calls to keep space in the nodes */ unsigned int search_for_split:1; + /* Keep some upper locks as we walk down. */ unsigned int keep_locks:1; unsigned int skip_locking:1; unsigned int search_commit_root:1; diff --git a/fs/btrfs/defrag.c b/fs/btrfs/defrag.c index 968dae953948..1831618579cb 100644 --- a/fs/btrfs/defrag.c +++ b/fs/btrfs/defrag.c @@ -105,15 +105,15 @@ static int btrfs_insert_inode_defrag(struct btrfs_inode *inode, return 0; } -static inline int need_auto_defrag(struct btrfs_fs_info *fs_info) +static inline bool need_auto_defrag(struct btrfs_fs_info *fs_info) { if (!btrfs_test_opt(fs_info, AUTO_DEFRAG)) - return 0; + return false; if (btrfs_fs_closing(fs_info)) - return 0; + return false; - return 1; + return true; } /* @@ -191,10 +191,7 @@ static struct inode_defrag *btrfs_pick_defrag_inode( if (parent && compare_inode_defrag(&tmp, entry) > 0) { parent = rb_next(parent); - if (parent) - entry = rb_entry(parent, struct inode_defrag, rb_node); - else - entry = NULL; + entry = rb_entry_safe(parent, struct inode_defrag, rb_node); } out: if (entry) @@ -225,7 +222,7 @@ static int btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info, struct file_ra_state *ra) { struct btrfs_root *inode_root; - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_ioctl_defrag_range_args range; int ret = 0; u64 cur = 0; @@ -250,24 +247,24 @@ again: goto cleanup; } - if (cur >= i_size_read(inode)) { - iput(inode); + if (cur >= i_size_read(&inode->vfs_inode)) { + iput(&inode->vfs_inode); goto cleanup; } /* Do a chunk of defrag */ - clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags); + clear_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags); memset(&range, 0, sizeof(range)); range.len = (u64)-1; range.start = cur; range.extent_thresh = defrag->extent_thresh; - file_ra_state_init(ra, inode->i_mapping); + file_ra_state_init(ra, inode->vfs_inode.i_mapping); sb_start_write(fs_info->sb); ret = btrfs_defrag_file(inode, ra, &range, defrag->transid, - BTRFS_DEFRAG_BATCH); + 
BTRFS_DEFRAG_BATCH); sb_end_write(fs_info->sb); - iput(inode); + iput(&inode->vfs_inode); if (ret < 0) goto cleanup; @@ -624,7 +621,7 @@ static struct extent_map *defrag_get_extent(struct btrfs_inode *inode, u64 ino = btrfs_ino(inode); int ret; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { ret = -ENOMEM; goto err; @@ -734,12 +731,12 @@ next: not_found: btrfs_release_path(&path); - free_extent_map(em); + btrfs_free_extent_map(em); return NULL; err: btrfs_release_path(&path); - free_extent_map(em); + btrfs_free_extent_map(em); return ERR_PTR(ret); } @@ -756,7 +753,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, * full extent lock. */ read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, sectorsize); + em = btrfs_lookup_extent_mapping(em_tree, start, sectorsize); read_unlock(&em_tree->lock); /* @@ -769,7 +766,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, * file extent items in the inode's subvolume tree). */ if (em && (em->flags & EXTENT_FLAG_MERGED)) { - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; } @@ -779,10 +776,10 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, /* Get the big lock and read metadata off disk. */ if (!locked) - lock_extent(io_tree, start, end, &cached); + btrfs_lock_extent(io_tree, start, end, &cached); em = defrag_get_extent(BTRFS_I(inode), start, newer_than); if (!locked) - unlock_extent(io_tree, start, end, &cached); + btrfs_unlock_extent(io_tree, start, end, &cached); if (IS_ERR(em)) return NULL; @@ -794,7 +791,7 @@ static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start, static u32 get_extent_max_capacity(const struct btrfs_fs_info *fs_info, const struct extent_map *em) { - if (extent_map_is_compressed(em)) + if (btrfs_extent_map_is_compressed(em)) return BTRFS_MAX_COMPRESSED; return fs_info->max_extent_size; } @@ -837,7 +834,7 @@ static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em, ret = true; out: - free_extent_map(next); + btrfs_free_extent_map(next); return ret; } @@ -857,13 +854,14 @@ static struct folio *defrag_prepare_one_folio(struct btrfs_inode *inode, pgoff_t { struct address_space *mapping = inode->vfs_inode.i_mapping; gfp_t mask = btrfs_alloc_write_mask(mapping); - u64 page_start = (u64)index << PAGE_SHIFT; - u64 page_end = page_start + PAGE_SIZE - 1; + u64 folio_start; + u64 folio_end; struct extent_state *cached_state = NULL; struct folio *folio; int ret; again: + /* TODO: Add order fgp order flags when large folios are fully enabled. */ folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); if (IS_ERR(folio)) @@ -871,13 +869,16 @@ again: /* * Since we can defragment files opened read-only, we can encounter - * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We - * can't do I/O using huge pages yet, so return an error for now. + * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). + * + * The IO for such large folios is not fully tested, thus return + * an error to reject such folios unless it's an experimental build. + * * Filesystem transparent huge pages are typically only used for * executables that explicitly enable them, so this isn't very * restrictive. 
*/ - if (folio_test_large(folio)) { + if (!IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && folio_test_large(folio)) { folio_unlock(folio); folio_put(folio); return ERR_PTR(-ETXTBSY); @@ -890,14 +891,15 @@ again: return ERR_PTR(ret); } + folio_start = folio_pos(folio); + folio_end = folio_pos(folio) + folio_size(folio) - 1; /* Wait for any existing ordered extent in the range */ while (1) { struct btrfs_ordered_extent *ordered; - lock_extent(&inode->io_tree, page_start, page_end, &cached_state); - ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); - unlock_extent(&inode->io_tree, page_start, page_end, - &cached_state); + btrfs_lock_extent(&inode->io_tree, folio_start, folio_end, &cached_state); + ordered = btrfs_lookup_ordered_range(inode, folio_start, folio_size(folio)); + btrfs_unlock_extent(&inode->io_tree, folio_start, folio_end, &cached_state); if (!ordered) break; @@ -1027,8 +1029,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode, * very likely resulting in a larger extent after writeback is * triggered (except in a case of free space fragmentation). */ - if (test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1, - EXTENT_DELALLOC)) + if (btrfs_test_range_bit_exists(&inode->io_tree, cur, cur + range_len - 1, + EXTENT_DELALLOC)) goto next; /* @@ -1066,8 +1068,8 @@ static int defrag_collect_targets(struct btrfs_inode *inode, /* Empty target list, no way to merge with last entry */ if (list_empty(target_list)) goto next; - last = list_entry(target_list->prev, - struct defrag_target_range, list); + last = list_last_entry(target_list, + struct defrag_target_range, list); /* Not mergeable with last entry */ if (last->start + last->len != cur) goto next; @@ -1077,7 +1079,7 @@ static int defrag_collect_targets(struct btrfs_inode *inode, add: last_is_target = true; - range_len = min(extent_map_end(em), start + len) - cur; + range_len = min(btrfs_extent_map_end(em), start + len) - cur; /* * This one is a good target, check if it can be merged into * last range of the target list. 
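Both merge hunks here rely on the same contiguity rule: a candidate range may only be folded into the last entry of the target list when it starts exactly where that entry ends (last->start + last->len == cur), in which case the last entry is simply enlarged instead of allocating a new one. A minimal, standalone C sketch of that rule follows; the names (target_range, try_merge_target) are hypothetical and only the contiguity check mirrors defrag_collect_targets().

/* Standalone sketch: coalescing contiguous defrag target ranges. */
#include <stdint.h>
#include <stdio.h>

struct target_range {
	uint64_t start;
	uint64_t len;
};

/* Grow @last by [cur, cur + range_len) if contiguous; else signal append. */
static int try_merge_target(struct target_range *last, uint64_t cur,
			    uint64_t range_len)
{
	if (last && last->start + last->len == cur) {
		last->len += range_len;
		return 1;
	}
	return 0;
}

int main(void)
{
	struct target_range last = { .start = 4096, .len = 8192 };

	/* 12288 == 4096 + 8192, so the new range is contiguous and merges. */
	if (try_merge_target(&last, 12288, 4096))
		printf("merged: start=%llu len=%llu\n",
		       (unsigned long long)last.start,
		       (unsigned long long)last.len);
	return 0;
}

Merging on insertion like this keeps the target list sorted and avoids a separate coalescing pass before the ranges are marked for writeback.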
@@ -1085,8 +1087,8 @@ add: if (!list_empty(target_list)) { struct defrag_target_range *last; - last = list_entry(target_list->prev, - struct defrag_target_range, list); + last = list_last_entry(target_list, + struct defrag_target_range, list); ASSERT(last->start + last->len <= cur); if (last->start + last->len == cur) { /* Mergeable, enlarge the last entry */ @@ -1099,7 +1101,7 @@ add: /* Allocate new defrag_target_range */ new = kmalloc(sizeof(*new), GFP_NOFS); if (!new) { - free_extent_map(em); + btrfs_free_extent_map(em); ret = -ENOMEM; break; } @@ -1108,8 +1110,8 @@ add: list_add_tail(&new->list, target_list); next: - cur = extent_map_end(em); - free_extent_map(em); + cur = btrfs_extent_map_end(em); + btrfs_free_extent_map(em); } if (ret < 0) { struct defrag_target_range *entry; @@ -1162,27 +1164,31 @@ static int defrag_one_locked_target(struct btrfs_inode *inode, struct extent_changeset *data_reserved = NULL; const u64 start = target->start; const u64 len = target->len; - unsigned long last_index = (start + len - 1) >> PAGE_SHIFT; - unsigned long start_index = start >> PAGE_SHIFT; - unsigned long first_index = folios[0]->index; int ret = 0; - int i; - - ASSERT(last_index - first_index + 1 <= nr_pages); ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len); if (ret < 0) return ret; - clear_extent_bit(&inode->io_tree, start, start + len - 1, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, cached_state); - set_extent_bit(&inode->io_tree, start, start + len - 1, - EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state); - - /* Update the page status */ - for (i = start_index - first_index; i <= last_index - first_index; i++) { - folio_clear_checked(folios[i]); - btrfs_folio_clamp_set_dirty(fs_info, folios[i], start, len); + btrfs_clear_extent_bit(&inode->io_tree, start, start + len - 1, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, cached_state); + btrfs_set_extent_bit(&inode->io_tree, start, start + len - 1, + EXTENT_DELALLOC | EXTENT_DEFRAG, cached_state); + + /* + * Update the page status. + * Due to possible large folios, we have to check all folios one by one. 
+ */ + for (int i = 0; i < nr_pages && folios[i]; i++) { + struct folio *folio = folios[i]; + + if (!folio) + break; + if (start >= folio_pos(folio) + folio_size(folio) || + start + len <= folio_pos(folio)) + continue; + btrfs_folio_clamp_clear_checked(fs_info, folio, start, len); + btrfs_folio_clamp_set_dirty(fs_info, folio, start, len); } btrfs_delalloc_release_extents(inode, len); extent_changeset_free(data_reserved); @@ -1200,11 +1206,10 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, LIST_HEAD(target_list); struct folio **folios; const u32 sectorsize = inode->root->fs_info->sectorsize; - u64 last_index = (start + len - 1) >> PAGE_SHIFT; - u64 start_index = start >> PAGE_SHIFT; - unsigned int nr_pages = last_index - start_index + 1; + u64 cur = start; + const unsigned int nr_pages = ((start + len - 1) >> PAGE_SHIFT) - + (start >> PAGE_SHIFT) + 1; int ret = 0; - int i; ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE); ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize)); @@ -1214,21 +1219,25 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, return -ENOMEM; /* Prepare all pages */ - for (i = 0; i < nr_pages; i++) { - folios[i] = defrag_prepare_one_folio(inode, start_index + i); + for (int i = 0; cur < start + len && i < nr_pages; i++) { + folios[i] = defrag_prepare_one_folio(inode, cur >> PAGE_SHIFT); if (IS_ERR(folios[i])) { ret = PTR_ERR(folios[i]); - nr_pages = i; + folios[i] = NULL; goto free_folios; } + cur = folio_pos(folios[i]) + folio_size(folios[i]); } - for (i = 0; i < nr_pages; i++) + for (int i = 0; i < nr_pages; i++) { + if (!folios[i]) + break; folio_wait_writeback(folios[i]); + } + /* We should get at least one folio. */ + ASSERT(folios[0]); /* Lock the pages range */ - lock_extent(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, - &cached_state); + btrfs_lock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state); /* * Now we have a consistent view about the extent map, re-check * which range really needs to be defragged. @@ -1254,11 +1263,11 @@ static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len, kfree(entry); } unlock_extent: - unlock_extent(&inode->io_tree, start_index << PAGE_SHIFT, - (last_index << PAGE_SHIFT) + PAGE_SIZE - 1, - &cached_state); + btrfs_unlock_extent(&inode->io_tree, folio_pos(folios[0]), cur - 1, &cached_state); free_folios: - for (i = 0; i < nr_pages; i++) { + for (int i = 0; i < nr_pages; i++) { + if (!folios[i]) + break; folio_unlock(folios[i]); folio_put(folios[i]); } @@ -1352,17 +1361,18 @@ out: * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without * defragging all the range). 
*/ -int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, +int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; unsigned long sectors_defragged = 0; - u64 isize = i_size_read(inode); + u64 isize = i_size_read(&inode->vfs_inode); u64 cur; u64 last_byte; bool do_compress = (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS); int compress_type = BTRFS_COMPRESS_ZLIB; + int compress_level = 0; int ret = 0; u32 extent_thresh = range->extent_thresh; pgoff_t start_index; @@ -1376,10 +1386,21 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, return -EINVAL; if (do_compress) { - if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) - return -EINVAL; - if (range->compress_type) - compress_type = range->compress_type; + if (range->flags & BTRFS_DEFRAG_RANGE_COMPRESS_LEVEL) { + if (range->compress.type >= BTRFS_NR_COMPRESS_TYPES) + return -EINVAL; + if (range->compress.type) { + compress_type = range->compress.type; + compress_level = range->compress.level; + if (!btrfs_compress_level_valid(compress_type, compress_level)) + return -EINVAL; + } + } else { + if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES) + return -EINVAL; + if (range->compress_type) + compress_type = range->compress_type; + } } if (extent_thresh == 0) @@ -1402,8 +1423,8 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, * defrag range can be written sequentially. */ start_index = cur >> PAGE_SHIFT; - if (start_index < inode->i_mapping->writeback_index) - inode->i_mapping->writeback_index = start_index; + if (start_index < inode->vfs_inode.i_mapping->writeback_index) + inode->vfs_inode.i_mapping->writeback_index = start_index; while (cur < last_byte) { const unsigned long prev_sectors_defragged = sectors_defragged; @@ -1420,27 +1441,29 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1; cluster_end = min(cluster_end, last_byte); - btrfs_inode_lock(BTRFS_I(inode), 0); - if (IS_SWAPFILE(inode)) { + btrfs_inode_lock(inode, 0); + if (IS_SWAPFILE(&inode->vfs_inode)) { ret = -ETXTBSY; - btrfs_inode_unlock(BTRFS_I(inode), 0); + btrfs_inode_unlock(inode, 0); break; } - if (!(inode->i_sb->s_flags & SB_ACTIVE)) { - btrfs_inode_unlock(BTRFS_I(inode), 0); + if (!(inode->vfs_inode.i_sb->s_flags & SB_ACTIVE)) { + btrfs_inode_unlock(inode, 0); break; } - if (do_compress) - BTRFS_I(inode)->defrag_compress = compress_type; - ret = defrag_one_cluster(BTRFS_I(inode), ra, cur, + if (do_compress) { + inode->defrag_compress = compress_type; + inode->defrag_compress_level = compress_level; + } + ret = defrag_one_cluster(inode, ra, cur, cluster_end + 1 - cur, extent_thresh, newer_than, do_compress, &sectors_defragged, max_to_defrag, &last_scanned); if (sectors_defragged > prev_sectors_defragged) - balance_dirty_pages_ratelimited(inode->i_mapping); + balance_dirty_pages_ratelimited(inode->vfs_inode.i_mapping); - btrfs_inode_unlock(BTRFS_I(inode), 0); + btrfs_inode_unlock(inode, 0); if (ret < 0) break; cur = max(cluster_end + 1, last_scanned); @@ -1462,10 +1485,10 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, * need to be written back immediately.
*/ if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) { - filemap_flush(inode->i_mapping); + filemap_flush(inode->vfs_inode.i_mapping); if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, - &BTRFS_I(inode)->runtime_flags)) - filemap_flush(inode->i_mapping); + &inode->runtime_flags)) + filemap_flush(inode->vfs_inode.i_mapping); } if (range->compress_type == BTRFS_COMPRESS_LZO) btrfs_set_fs_incompat(fs_info, COMPRESS_LZO); @@ -1474,9 +1497,9 @@ int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, ret = sectors_defragged; } if (do_compress) { - btrfs_inode_lock(BTRFS_I(inode), 0); - BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE; - btrfs_inode_unlock(BTRFS_I(inode), 0); + btrfs_inode_lock(inode, 0); + inode->defrag_compress = BTRFS_COMPRESS_NONE; + btrfs_inode_unlock(inode, 0); } return ret; } diff --git a/fs/btrfs/defrag.h b/fs/btrfs/defrag.h index 6b7596c4f0dc..a7f917a38dbf 100644 --- a/fs/btrfs/defrag.h +++ b/fs/btrfs/defrag.h @@ -6,14 +6,14 @@ #include <linux/types.h> #include <linux/compiler_types.h> -struct inode; struct file_ra_state; +struct btrfs_inode; struct btrfs_fs_info; struct btrfs_root; struct btrfs_trans_handle; struct btrfs_ioctl_defrag_range_args; -int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra, +int btrfs_defrag_file(struct btrfs_inode *inode, struct file_ra_state *ra, struct btrfs_ioctl_defrag_range_args *range, u64 newer_than, unsigned long max_to_defrag); int __init btrfs_auto_defrag_init(void); diff --git a/fs/btrfs/delalloc-space.c b/fs/btrfs/delalloc-space.c index 88e900e5a43d..288e1776c02d 100644 --- a/fs/btrfs/delalloc-space.c +++ b/fs/btrfs/delalloc-space.c @@ -111,6 +111,18 @@ * making error handling and cleanup easier. */ +static inline struct btrfs_space_info *data_sinfo_for_inode(const struct btrfs_inode *inode) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + + if (btrfs_is_zoned(fs_info) && btrfs_is_data_reloc_root(inode->root)) { + ASSERT(fs_info->data_sinfo->sub_group[0]->subgroup_id == + BTRFS_SUB_GROUP_DATA_RELOC); + return fs_info->data_sinfo->sub_group[0]; + } + return fs_info->data_sinfo; +} + int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes) { struct btrfs_root *root = inode->root; @@ -123,7 +135,7 @@ int btrfs_alloc_data_chunk_ondemand(const struct btrfs_inode *inode, u64 bytes) if (btrfs_is_free_space_inode(inode)) flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE; - return btrfs_reserve_data_bytes(fs_info, bytes, flush); + return btrfs_reserve_data_bytes(data_sinfo_for_inode(inode), bytes, flush); } int btrfs_check_data_free_space(struct btrfs_inode *inode, @@ -144,14 +156,14 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode, else if (btrfs_is_free_space_inode(inode)) flush = BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE; - ret = btrfs_reserve_data_bytes(fs_info, len, flush); + ret = btrfs_reserve_data_bytes(data_sinfo_for_inode(inode), len, flush); if (ret < 0) return ret; /* Use new btrfs_qgroup_reserve_data to reserve precious data space. */ ret = btrfs_qgroup_reserve_data(inode, reserved, start, len); if (ret < 0) { - btrfs_free_reserved_data_space_noquota(fs_info, len); + btrfs_free_reserved_data_space_noquota(inode, len); extent_changeset_free(*reserved); *reserved = NULL; } else { @@ -168,15 +180,13 @@ int btrfs_check_data_free_space(struct btrfs_inode *inode, * which we can't sleep and is sure it won't affect qgroup reserved space. * Like clear_bit_hook(). 
*/ -void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, - u64 len) +void btrfs_free_reserved_data_space_noquota(struct btrfs_inode *inode, u64 len) { - struct btrfs_space_info *data_sinfo; + struct btrfs_fs_info *fs_info = inode->root->fs_info; ASSERT(IS_ALIGNED(len, fs_info->sectorsize)); - data_sinfo = fs_info->data_sinfo; - btrfs_space_info_free_bytes_may_use(data_sinfo, len); + btrfs_space_info_free_bytes_may_use(data_sinfo_for_inode(inode), len); } /* @@ -196,7 +206,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode, round_down(start, fs_info->sectorsize); start = round_down(start, fs_info->sectorsize); - btrfs_free_reserved_data_space_noquota(fs_info, len); + btrfs_free_reserved_data_space_noquota(inode, len); btrfs_qgroup_free_data(inode, reserved, start, len, NULL); } @@ -439,6 +449,29 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes) btrfs_inode_rsv_release(inode, true); } +/* Shrink a previously reserved extent to a new length. */ +void btrfs_delalloc_shrink_extents(struct btrfs_inode *inode, u64 reserved_len, u64 new_len) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + const u32 reserved_num_extents = count_max_extents(fs_info, reserved_len); + const u32 new_num_extents = count_max_extents(fs_info, new_len); + const int diff_num_extents = new_num_extents - reserved_num_extents; + + ASSERT(new_len <= reserved_len); + if (new_num_extents == reserved_num_extents) + return; + + spin_lock(&inode->lock); + btrfs_mod_outstanding_extents(inode, diff_num_extents); + btrfs_calculate_inode_block_rsv_size(fs_info, inode); + spin_unlock(&inode->lock); + + if (btrfs_is_testing(fs_info)) + return; + + btrfs_inode_rsv_release(inode, true); +} + /* * Reserve data and metadata space for delalloc * diff --git a/fs/btrfs/delalloc-space.h b/fs/btrfs/delalloc-space.h index 3f32953c0a80..6119c0d3f883 100644 --- a/fs/btrfs/delalloc-space.h +++ b/fs/btrfs/delalloc-space.h @@ -18,8 +18,7 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode, void btrfs_delalloc_release_space(struct btrfs_inode *inode, struct extent_changeset *reserved, u64 start, u64 len, bool qgroup_free); -void btrfs_free_reserved_data_space_noquota(struct btrfs_fs_info *fs_info, - u64 len); +void btrfs_free_reserved_data_space_noquota(struct btrfs_inode *inode, u64 len); void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes, bool qgroup_free); int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, @@ -27,5 +26,6 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode, int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes, u64 disk_num_bytes, bool noflush); void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes); +void btrfs_delalloc_shrink_extents(struct btrfs_inode *inode, u64 reserved_len, u64 new_len); #endif /* BTRFS_DELALLOC_SPACE_H */ diff --git a/fs/btrfs/delayed-inode.c b/fs/btrfs/delayed-inode.c index 0b4933c6a889..c7cc24a5dd5e 100644 --- a/fs/btrfs/delayed-inode.c +++ b/fs/btrfs/delayed-inode.c @@ -119,7 +119,12 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node( return NULL; } -/* Will return either the node or PTR_ERR(-ENOMEM) */ +/* + * Look up an existing delayed node associated with @btrfs_inode or create a new + * one and insert it into the delayed nodes of the root. + * + * Return the delayed node, or error pointer on failure.
+ */ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node( struct btrfs_inode *btrfs_inode) { @@ -211,17 +216,13 @@ static void btrfs_dequeue_delayed_node(struct btrfs_delayed_root *root, static struct btrfs_delayed_node *btrfs_first_delayed_node( struct btrfs_delayed_root *delayed_root) { - struct list_head *p; - struct btrfs_delayed_node *node = NULL; + struct btrfs_delayed_node *node; spin_lock(&delayed_root->lock); - if (list_empty(&delayed_root->node_list)) - goto out; - - p = delayed_root->node_list.next; - node = list_entry(p, struct btrfs_delayed_node, n_list); - refcount_inc(&node->refs); -out: + node = list_first_entry_or_null(&delayed_root->node_list, + struct btrfs_delayed_node, n_list); + if (node) + refcount_inc(&node->refs); spin_unlock(&delayed_root->lock); return node; @@ -293,18 +294,15 @@ static inline void btrfs_release_delayed_node(struct btrfs_delayed_node *node) static struct btrfs_delayed_node *btrfs_first_prepared_delayed_node( struct btrfs_delayed_root *delayed_root) { - struct list_head *p; - struct btrfs_delayed_node *node = NULL; + struct btrfs_delayed_node *node; spin_lock(&delayed_root->lock); - if (list_empty(&delayed_root->prepare_list)) - goto out; - - p = delayed_root->prepare_list.next; - list_del_init(p); - node = list_entry(p, struct btrfs_delayed_node, p_list); - refcount_inc(&node->refs); -out: + node = list_first_entry_or_null(&delayed_root->prepare_list, + struct btrfs_delayed_node, p_list); + if (node) { + list_del_init(&node->p_list); + refcount_inc(&node->refs); + } spin_unlock(&delayed_root->lock); return node; @@ -454,40 +452,25 @@ static void btrfs_release_delayed_item(struct btrfs_delayed_item *item) static struct btrfs_delayed_item *__btrfs_first_delayed_insertion_item( struct btrfs_delayed_node *delayed_node) { - struct rb_node *p; - struct btrfs_delayed_item *item = NULL; + struct rb_node *p = rb_first_cached(&delayed_node->ins_root); - p = rb_first_cached(&delayed_node->ins_root); - if (p) - item = rb_entry(p, struct btrfs_delayed_item, rb_node); - - return item; + return rb_entry_safe(p, struct btrfs_delayed_item, rb_node); } static struct btrfs_delayed_item *__btrfs_first_delayed_deletion_item( struct btrfs_delayed_node *delayed_node) { - struct rb_node *p; - struct btrfs_delayed_item *item = NULL; + struct rb_node *p = rb_first_cached(&delayed_node->del_root); - p = rb_first_cached(&delayed_node->del_root); - if (p) - item = rb_entry(p, struct btrfs_delayed_item, rb_node); - - return item; + return rb_entry_safe(p, struct btrfs_delayed_item, rb_node); } static struct btrfs_delayed_item *__btrfs_next_delayed_item( struct btrfs_delayed_item *item) { - struct rb_node *p; - struct btrfs_delayed_item *next = NULL; + struct rb_node *p = rb_next(&item->rb_node); - p = rb_next(&item->rb_node); - if (p) - next = rb_entry(p, struct btrfs_delayed_item, rb_node); - - return next; + return rb_entry_safe(p, struct btrfs_delayed_item, rb_node); } static int btrfs_delayed_item_reserve_metadata(struct btrfs_trans_handle *trans, @@ -1211,7 +1194,7 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { struct btrfs_delayed_node *delayed_node = btrfs_get_delayed_node(inode); - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_block_rsv *block_rsv; int ret; @@ -1238,7 +1221,6 @@ int btrfs_commit_inode_delayed_items(struct btrfs_trans_handle *trans, ret = __btrfs_commit_inode_delayed_items(trans, path, delayed_node); btrfs_release_delayed_node(delayed_node); - 
btrfs_free_path(path); trans->block_rsv = block_rsv; return ret; @@ -1398,17 +1380,17 @@ void btrfs_assert_delayed_root_empty(struct btrfs_fs_info *fs_info) WARN_ON(btrfs_first_delayed_node(fs_info->delayed_root)); } -static int could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) +static bool could_end_wait(struct btrfs_delayed_root *delayed_root, int seq) { int val = atomic_read(&delayed_root->items_seq); if (val < seq || val >= seq + BTRFS_DELAYED_BATCH) - return 1; + return true; if (atomic_read(&delayed_root->items) < BTRFS_DELAYED_BACKGROUND) - return 1; + return true; - return 0; + return false; } void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info) @@ -1817,53 +1799,53 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx, static void fill_stack_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode_item *inode_item, - struct inode *inode) + struct btrfs_inode *inode) { + struct inode *vfs_inode = &inode->vfs_inode; u64 flags; - btrfs_set_stack_inode_uid(inode_item, i_uid_read(inode)); - btrfs_set_stack_inode_gid(inode_item, i_gid_read(inode)); - btrfs_set_stack_inode_size(inode_item, BTRFS_I(inode)->disk_i_size); - btrfs_set_stack_inode_mode(inode_item, inode->i_mode); - btrfs_set_stack_inode_nlink(inode_item, inode->i_nlink); - btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(inode)); - btrfs_set_stack_inode_generation(inode_item, - BTRFS_I(inode)->generation); + btrfs_set_stack_inode_uid(inode_item, i_uid_read(vfs_inode)); + btrfs_set_stack_inode_gid(inode_item, i_gid_read(vfs_inode)); + btrfs_set_stack_inode_size(inode_item, inode->disk_i_size); + btrfs_set_stack_inode_mode(inode_item, vfs_inode->i_mode); + btrfs_set_stack_inode_nlink(inode_item, vfs_inode->i_nlink); + btrfs_set_stack_inode_nbytes(inode_item, inode_get_bytes(vfs_inode)); + btrfs_set_stack_inode_generation(inode_item, inode->generation); btrfs_set_stack_inode_sequence(inode_item, - inode_peek_iversion(inode)); + inode_peek_iversion(vfs_inode)); btrfs_set_stack_inode_transid(inode_item, trans->transid); - btrfs_set_stack_inode_rdev(inode_item, inode->i_rdev); - flags = btrfs_inode_combine_flags(BTRFS_I(inode)->flags, - BTRFS_I(inode)->ro_flags); + btrfs_set_stack_inode_rdev(inode_item, vfs_inode->i_rdev); + flags = btrfs_inode_combine_flags(inode->flags, inode->ro_flags); btrfs_set_stack_inode_flags(inode_item, flags); btrfs_set_stack_inode_block_group(inode_item, 0); btrfs_set_stack_timespec_sec(&inode_item->atime, - inode_get_atime_sec(inode)); + inode_get_atime_sec(vfs_inode)); btrfs_set_stack_timespec_nsec(&inode_item->atime, - inode_get_atime_nsec(inode)); + inode_get_atime_nsec(vfs_inode)); btrfs_set_stack_timespec_sec(&inode_item->mtime, - inode_get_mtime_sec(inode)); + inode_get_mtime_sec(vfs_inode)); btrfs_set_stack_timespec_nsec(&inode_item->mtime, - inode_get_mtime_nsec(inode)); + inode_get_mtime_nsec(vfs_inode)); btrfs_set_stack_timespec_sec(&inode_item->ctime, - inode_get_ctime_sec(inode)); + inode_get_ctime_sec(vfs_inode)); btrfs_set_stack_timespec_nsec(&inode_item->ctime, - inode_get_ctime_nsec(inode)); + inode_get_ctime_nsec(vfs_inode)); - btrfs_set_stack_timespec_sec(&inode_item->otime, BTRFS_I(inode)->i_otime_sec); - btrfs_set_stack_timespec_nsec(&inode_item->otime, BTRFS_I(inode)->i_otime_nsec); + btrfs_set_stack_timespec_sec(&inode_item->otime, inode->i_otime_sec); + btrfs_set_stack_timespec_nsec(&inode_item->otime, inode->i_otime_nsec); } -int btrfs_fill_inode(struct inode *inode, u32 *rdev) +int btrfs_fill_inode(struct btrfs_inode *inode, u32 
*rdev) { - struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info; + struct btrfs_fs_info *fs_info = inode->root->fs_info; struct btrfs_delayed_node *delayed_node; struct btrfs_inode_item *inode_item; + struct inode *vfs_inode = &inode->vfs_inode; - delayed_node = btrfs_get_delayed_node(BTRFS_I(inode)); + delayed_node = btrfs_get_delayed_node(inode); if (!delayed_node) return -ENOENT; @@ -1876,39 +1858,38 @@ int btrfs_fill_inode(struct inode *inode, u32 *rdev) inode_item = &delayed_node->inode_item; - i_uid_write(inode, btrfs_stack_inode_uid(inode_item)); - i_gid_write(inode, btrfs_stack_inode_gid(inode_item)); - btrfs_i_size_write(BTRFS_I(inode), btrfs_stack_inode_size(inode_item)); - btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, - round_up(i_size_read(inode), fs_info->sectorsize)); - inode->i_mode = btrfs_stack_inode_mode(inode_item); - set_nlink(inode, btrfs_stack_inode_nlink(inode_item)); - inode_set_bytes(inode, btrfs_stack_inode_nbytes(inode_item)); - BTRFS_I(inode)->generation = btrfs_stack_inode_generation(inode_item); - BTRFS_I(inode)->last_trans = btrfs_stack_inode_transid(inode_item); - - inode_set_iversion_queried(inode, - btrfs_stack_inode_sequence(inode_item)); - inode->i_rdev = 0; + i_uid_write(vfs_inode, btrfs_stack_inode_uid(inode_item)); + i_gid_write(vfs_inode, btrfs_stack_inode_gid(inode_item)); + btrfs_i_size_write(inode, btrfs_stack_inode_size(inode_item)); + btrfs_inode_set_file_extent_range(inode, 0, + round_up(i_size_read(vfs_inode), fs_info->sectorsize)); + vfs_inode->i_mode = btrfs_stack_inode_mode(inode_item); + set_nlink(vfs_inode, btrfs_stack_inode_nlink(inode_item)); + inode_set_bytes(vfs_inode, btrfs_stack_inode_nbytes(inode_item)); + inode->generation = btrfs_stack_inode_generation(inode_item); + inode->last_trans = btrfs_stack_inode_transid(inode_item); + + inode_set_iversion_queried(vfs_inode, btrfs_stack_inode_sequence(inode_item)); + vfs_inode->i_rdev = 0; *rdev = btrfs_stack_inode_rdev(inode_item); btrfs_inode_split_flags(btrfs_stack_inode_flags(inode_item), - &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); + &inode->flags, &inode->ro_flags); - inode_set_atime(inode, btrfs_stack_timespec_sec(&inode_item->atime), + inode_set_atime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->atime), btrfs_stack_timespec_nsec(&inode_item->atime)); - inode_set_mtime(inode, btrfs_stack_timespec_sec(&inode_item->mtime), + inode_set_mtime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->mtime), btrfs_stack_timespec_nsec(&inode_item->mtime)); - inode_set_ctime(inode, btrfs_stack_timespec_sec(&inode_item->ctime), + inode_set_ctime(vfs_inode, btrfs_stack_timespec_sec(&inode_item->ctime), btrfs_stack_timespec_nsec(&inode_item->ctime)); - BTRFS_I(inode)->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime); - BTRFS_I(inode)->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime); + inode->i_otime_sec = btrfs_stack_timespec_sec(&inode_item->otime); + inode->i_otime_nsec = btrfs_stack_timespec_nsec(&inode_item->otime); - inode->i_generation = BTRFS_I(inode)->generation; - if (S_ISDIR(inode->i_mode)) - BTRFS_I(inode)->index_cnt = (u64)-1; + vfs_inode->i_generation = inode->generation; + if (S_ISDIR(vfs_inode->i_mode)) + inode->index_cnt = (u64)-1; mutex_unlock(&delayed_node->mutex); btrfs_release_delayed_node(delayed_node); @@ -1928,8 +1909,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, mutex_lock(&delayed_node->mutex); if (test_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags)) { - fill_stack_inode_item(trans, 
&delayed_node->inode_item, - &inode->vfs_inode); + fill_stack_inode_item(trans, &delayed_node->inode_item, inode); goto release_node; } @@ -1937,7 +1917,7 @@ int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, if (ret) goto release_node; - fill_stack_inode_item(trans, &delayed_node->inode_item, &inode->vfs_inode); + fill_stack_inode_item(trans, &delayed_node->inode_item, inode); set_bit(BTRFS_DELAYED_NODE_INODE_DIRTY, &delayed_node->flags); delayed_node->count++; atomic_inc(&root->fs_info->delayed_root->items); diff --git a/fs/btrfs/delayed-inode.h b/fs/btrfs/delayed-inode.h index f4d9feac0d0e..c4b4ba122beb 100644 --- a/fs/btrfs/delayed-inode.h +++ b/fs/btrfs/delayed-inode.h @@ -133,7 +133,7 @@ int btrfs_commit_inode_delayed_inode(struct btrfs_inode *inode); int btrfs_delayed_update_inode(struct btrfs_trans_handle *trans, struct btrfs_inode *inode); -int btrfs_fill_inode(struct inode *inode, u32 *rdev); +int btrfs_fill_inode(struct btrfs_inode *inode, u32 *rdev); int btrfs_delayed_delete_inode_ref(struct btrfs_inode *inode); /* Used for drop dead root */ diff --git a/fs/btrfs/delayed-ref.c b/fs/btrfs/delayed-ref.c index 98c5b61dabe8..739c9e29aaa3 100644 --- a/fs/btrfs/delayed-ref.c +++ b/fs/btrfs/delayed-ref.c @@ -331,12 +331,9 @@ static struct btrfs_delayed_ref_node* tree_insert(struct rb_root_cached *root, struct btrfs_delayed_ref_node *ins) { struct rb_node *node = &ins->ref_node; - struct rb_node *exist; + struct rb_node *exist = rb_find_add_cached(node, root, cmp_refs_node); - exist = rb_find_add_cached(node, root, cmp_refs_node); - if (exist) - return rb_entry(exist, struct btrfs_delayed_ref_node, ref_node); - return NULL; + return rb_entry_safe(exist, struct btrfs_delayed_ref_node, ref_node); } static struct btrfs_delayed_ref_head *find_first_ref_head( @@ -1339,7 +1336,7 @@ int __init btrfs_delayed_ref_init(void) { btrfs_delayed_ref_head_cachep = KMEM_CACHE(btrfs_delayed_ref_head, 0); if (!btrfs_delayed_ref_head_cachep) - goto fail; + return -ENOMEM; btrfs_delayed_ref_node_cachep = KMEM_CACHE(btrfs_delayed_ref_node, 0); if (!btrfs_delayed_ref_node_cachep) diff --git a/fs/btrfs/delayed-ref.h b/fs/btrfs/delayed-ref.h index a35067cebb97..78cc23837610 100644 --- a/fs/btrfs/delayed-ref.h +++ b/fs/btrfs/delayed-ref.h @@ -14,6 +14,8 @@ #include <linux/spinlock.h> #include <linux/slab.h> #include <uapi/linux/btrfs_tree.h> +#include "fs.h" +#include "messages.h" struct btrfs_trans_handle; struct btrfs_fs_info; @@ -260,7 +262,6 @@ enum btrfs_ref_type { BTRFS_REF_NOT_SET, BTRFS_REF_DATA, BTRFS_REF_METADATA, - BTRFS_REF_LAST, } __packed; struct btrfs_ref { diff --git a/fs/btrfs/dev-replace.c b/fs/btrfs/dev-replace.c index f86fbea0b3de..2decb9fff445 100644 --- a/fs/btrfs/dev-replace.c +++ b/fs/btrfs/dev-replace.c @@ -76,7 +76,7 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) struct extent_buffer *eb; int slot; int ret = 0; - struct btrfs_path *path = NULL; + BTRFS_PATH_AUTO_FREE(path); int item_size; struct btrfs_dev_replace_item *ptr; u64 src_devid; @@ -85,10 +85,8 @@ int btrfs_init_dev_replace(struct btrfs_fs_info *fs_info) return 0; path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; key.objectid = 0; key.type = BTRFS_DEV_REPLACE_KEY; @@ -103,10 +101,8 @@ no_valid_dev_replace_entry_found: if (btrfs_find_device(fs_info->fs_devices, &args)) { btrfs_err(fs_info, "found replace target device without a valid replace item"); - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } - ret = 0; dev_replace->replace_state = 
BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED; dev_replace->cont_reading_from_srcdev_mode = @@ -123,7 +119,7 @@ no_valid_dev_replace_entry_found: dev_replace->tgtdev = NULL; dev_replace->is_valid = 0; dev_replace->item_needs_writeback = 0; - goto out; + return 0; } slot = path->slots[0]; eb = path->nodes[0]; @@ -226,8 +222,6 @@ no_valid_dev_replace_entry_found: break; } -out: - btrfs_free_path(path); return ret; } @@ -346,7 +340,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; int ret; struct btrfs_root *dev_root = fs_info->dev_root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *eb; struct btrfs_dev_replace_item *ptr; @@ -365,16 +359,15 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) key.offset = 0; path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; + ret = btrfs_search_slot(trans, dev_root, &key, path, -1, 1); if (ret < 0) { btrfs_warn(fs_info, "error %d while searching for dev_replace item!", ret); - goto out; + return ret; } if (ret == 0 && @@ -395,7 +388,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) btrfs_warn(fs_info, "delete too small dev_replace item failed %d!", ret); - goto out; + return ret; } ret = 1; } @@ -408,7 +401,7 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) if (ret < 0) { btrfs_warn(fs_info, "insert dev_replace item failed %d!", ret); - goto out; + return ret; } } @@ -440,8 +433,6 @@ int btrfs_run_dev_replace(struct btrfs_trans_handle *trans) dev_replace->cursor_right); dev_replace->item_needs_writeback = 0; up_write(&dev_replace->rwsem); -out: - btrfs_free_path(path); return ret; } @@ -646,7 +637,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info, break; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: - ASSERT(0); + DEBUG_WARN("unexpected STARTED or SUSPENDED dev-replace state"); ret = BTRFS_IOCTL_DEV_REPLACE_RESULT_ALREADY_STARTED; up_write(&dev_replace->rwsem); goto leave; @@ -803,17 +794,17 @@ static int btrfs_set_target_alloc_state(struct btrfs_device *srcdev, lockdep_assert_held(&srcdev->fs_info->chunk_mutex); - while (find_first_extent_bit(&srcdev->alloc_state, start, - &found_start, &found_end, - CHUNK_ALLOCATED, &cached_state)) { - ret = set_extent_bit(&tgtdev->alloc_state, found_start, - found_end, CHUNK_ALLOCATED, NULL); + while (btrfs_find_first_extent_bit(&srcdev->alloc_state, start, + &found_start, &found_end, + CHUNK_ALLOCATED, &cached_state)) { + ret = btrfs_set_extent_bit(&tgtdev->alloc_state, found_start, + found_end, CHUNK_ALLOCATED, NULL); if (ret) break; start = found_end + 1; } - free_extent_state(cached_state); + btrfs_free_extent_state(cached_state); return ret; } @@ -1274,16 +1265,16 @@ static int btrfs_dev_replace_kthread(void *data) return 0; } -int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) +bool __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) { if (!dev_replace->is_valid) - return 0; + return false; switch (dev_replace->replace_state) { case BTRFS_IOCTL_DEV_REPLACE_STATE_NEVER_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_FINISHED: case BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED: - return 0; + return false; case BTRFS_IOCTL_DEV_REPLACE_STATE_STARTED: case BTRFS_IOCTL_DEV_REPLACE_STATE_SUSPENDED: /* @@ -1298,7 +1289,7 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace) */ break; } -
return 1; + return true; } void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount) diff --git a/fs/btrfs/dev-replace.h b/fs/btrfs/dev-replace.h index 23e480efe5e6..b35cecf388f2 100644 --- a/fs/btrfs/dev-replace.h +++ b/fs/btrfs/dev-replace.h @@ -25,7 +25,7 @@ void btrfs_dev_replace_status(struct btrfs_fs_info *fs_info, int btrfs_dev_replace_cancel(struct btrfs_fs_info *fs_info); void btrfs_dev_replace_suspend_for_unmount(struct btrfs_fs_info *fs_info); int btrfs_resume_dev_replace_async(struct btrfs_fs_info *fs_info); -int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); +bool __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace); bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev, struct btrfs_block_group *cache, u64 physical); diff --git a/fs/btrfs/dir-item.c b/fs/btrfs/dir-item.c index ccf91de29f80..b29cc31a7c4a 100644 --- a/fs/btrfs/dir-item.c +++ b/fs/btrfs/dir-item.c @@ -236,7 +236,7 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, int data_size; struct extent_buffer *leaf; int slot; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) @@ -251,20 +251,17 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, if (IS_ERR(di)) { ret = PTR_ERR(di); /* Nothing found, we're safe */ - if (ret == -ENOENT) { - ret = 0; - goto out; - } + if (ret == -ENOENT) + return 0; if (ret < 0) - goto out; + return ret; } /* we found an item, look for our name in the item */ if (di) { /* our exact name was found */ - ret = -EEXIST; - goto out; + return -EEXIST; } /* See if there is room in the item to insert this name. */ @@ -273,14 +270,11 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir, slot = path->slots[0]; if (data_size + btrfs_item_size(leaf, slot) + sizeof(struct btrfs_item) > BTRFS_LEAF_DATA_SIZE(root->fs_info)) { - ret = -EOVERFLOW; - } else { - /* plenty of insertion room */ - ret = 0; + return -EOVERFLOW; } -out: - btrfs_free_path(path); - return ret; + + /* Plenty of insertion room. */ + return 0; } /* diff --git a/fs/btrfs/dir-item.h b/fs/btrfs/dir-item.h index 28d69970bc70..8462579a95f4 100644 --- a/fs/btrfs/dir-item.h +++ b/fs/btrfs/dir-item.h @@ -10,6 +10,7 @@ struct fscrypt_str; struct btrfs_fs_info; struct btrfs_key; struct btrfs_path; +struct btrfs_inode; struct btrfs_root; struct btrfs_trans_handle; diff --git a/fs/btrfs/direct-io.c b/fs/btrfs/direct-io.c index 8567af46e16f..fe9a4bd7e6e6 100644 --- a/fs/btrfs/direct-io.c +++ b/fs/btrfs/direct-io.c @@ -42,21 +42,21 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, /* Direct lock must be taken before the extent lock. 
*/ if (nowait) { - if (!try_lock_dio_extent(io_tree, lockstart, lockend, cached_state)) + if (!btrfs_try_lock_dio_extent(io_tree, lockstart, lockend, cached_state)) return -EAGAIN; } else { - lock_dio_extent(io_tree, lockstart, lockend, cached_state); + btrfs_lock_dio_extent(io_tree, lockstart, lockend, cached_state); } while (1) { if (nowait) { - if (!try_lock_extent(io_tree, lockstart, lockend, - cached_state)) { + if (!btrfs_try_lock_extent(io_tree, lockstart, lockend, + cached_state)) { ret = -EAGAIN; break; } } else { - lock_extent(io_tree, lockstart, lockend, cached_state); + btrfs_lock_extent(io_tree, lockstart, lockend, cached_state); } /* * We're concerned with the entire range that we're going to be @@ -78,7 +78,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, lockstart, lockend))) break; - unlock_extent(io_tree, lockstart, lockend, cached_state); + btrfs_unlock_extent(io_tree, lockstart, lockend, cached_state); if (ordered) { if (nowait) { @@ -131,7 +131,7 @@ static int lock_extent_direct(struct inode *inode, u64 lockstart, u64 lockend, } if (ret) - unlock_dio_extent(io_tree, lockstart, lockend, cached_state); + btrfs_unlock_dio_extent(io_tree, lockstart, lockend, cached_state); return ret; } @@ -151,11 +151,11 @@ static struct extent_map *btrfs_create_dio_extent(struct btrfs_inode *inode, } ordered = btrfs_alloc_ordered_extent(inode, start, file_extent, - (1 << type) | - (1 << BTRFS_ORDERED_DIRECT)); + (1U << type) | + (1U << BTRFS_ORDERED_DIRECT)); if (IS_ERR(ordered)) { if (em) { - free_extent_map(em); + btrfs_free_extent_map(em); btrfs_drop_extent_map_range(inode, start, start + file_extent->num_bytes - 1, false); } @@ -204,8 +204,7 @@ again: BTRFS_ORDERED_REGULAR); btrfs_dec_block_group_reservations(fs_info, ins.objectid); if (IS_ERR(em)) - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, - 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); return em; } @@ -246,9 +245,10 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, else type = BTRFS_ORDERED_NOCOW; len = min(len, em->len - (start - em->start)); - block_start = extent_map_block_start(em) + (start - em->start); + block_start = btrfs_extent_map_block_start(em) + (start - em->start); - if (can_nocow_extent(inode, start, &len, &file_extent, false) == 1) { + if (can_nocow_extent(BTRFS_I(inode), start, &len, &file_extent, + false) == 1) { bg = btrfs_inc_nocow_writers(fs_info, block_start); if (bg) can_nocow = true; @@ -264,7 +264,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, nowait); if (ret < 0) { /* Our caller expects us to free the input extent map. */ - free_extent_map(em); + btrfs_free_extent_map(em); *map = NULL; btrfs_dec_nocow_writers(bg); if (nowait && (ret == -ENOSPC || ret == -EDQUOT)) @@ -277,7 +277,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, &file_extent, type); btrfs_dec_nocow_writers(bg); if (type == BTRFS_ORDERED_PREALLOC) { - free_extent_map(em); + btrfs_free_extent_map(em); *map = em2; em = em2; } @@ -290,7 +290,7 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map, dio_data->nocow_done = true; } else { /* Our caller expects us to free the input extent map. 
*/ - free_extent_map(em); + btrfs_free_extent_map(em); *map = NULL; if (nowait) { @@ -439,8 +439,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, start, data_alloc_len, false); if (!ret) dio_data->data_space_reserved = true; - else if (ret && !(BTRFS_I(inode)->flags & - (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) + else if (!(BTRFS_I(inode)->flags & + (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC))) goto err; } @@ -473,8 +473,8 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, * to buffered IO. Don't blame me, this is the price we pay for using * the generic code. */ - if (extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) { - free_extent_map(em); + if (btrfs_extent_map_is_compressed(em) || em->disk_bytenr == EXTENT_MAP_INLINE) { + btrfs_free_extent_map(em); /* * If we are in a NOWAIT context, return -EAGAIN in order to * fallback to buffered IO. This is not only because we can @@ -515,7 +515,7 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, * after we have submitted bios for all the extents in the range. */ if ((flags & IOMAP_NOWAIT) && len < length) { - free_extent_map(em); + btrfs_free_extent_map(em); ret = -EAGAIN; goto unlock_err; } @@ -557,13 +557,13 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, iomap->addr = IOMAP_NULL_ADDR; iomap->type = IOMAP_HOLE; } else { - iomap->addr = extent_map_block_start(em) + (start - em->start); + iomap->addr = btrfs_extent_map_block_start(em) + (start - em->start); iomap->type = IOMAP_MAPPED; } iomap->offset = start; iomap->bdev = fs_info->fs_devices->latest_dev->bdev; iomap->length = len; - free_extent_map(em); + btrfs_free_extent_map(em); /* * Reads will hold the EXTENT_DIO_LOCKED bit until the io is completed, @@ -574,13 +574,13 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start, if (write) unlock_bits |= EXTENT_DIO_LOCKED; - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - unlock_bits, &cached_state); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + unlock_bits, &cached_state); /* We didn't use everything, unlock the dio extent for the remainder. */ if (!write && (start + len) < lockend) - unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len, - lockend, NULL); + btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, start + len, + lockend, NULL); return 0; @@ -590,8 +590,8 @@ unlock_err: * to update this, be explicit that we expect EXTENT_LOCKED and * EXTENT_DIO_LOCKED to be set here, and so that's what we're clearing. 
*/ - clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, - EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, lockstart, lockend, + EXTENT_LOCKED | EXTENT_DIO_LOCKED, &cached_state); err: if (dio_data->data_space_reserved) { btrfs_free_reserved_data_space(BTRFS_I(inode), @@ -614,8 +614,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, if (!write && (iomap->type == IOMAP_HOLE)) { /* If reading from a hole, unlock and return */ - unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1, NULL); + btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, NULL); return 0; } @@ -626,8 +626,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length, btrfs_finish_ordered_extent(dio_data->ordered, NULL, pos, length, false); else - unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, - pos + length - 1, NULL); + btrfs_unlock_dio_extent(&BTRFS_I(inode)->io_tree, pos, + pos + length - 1, NULL); ret = -ENOTBLK; } if (write) { @@ -659,8 +659,8 @@ static void btrfs_dio_end_io(struct btrfs_bio *bbio) dip->file_offset, dip->bytes, !bio->bi_status); } else { - unlock_dio_extent(&inode->io_tree, dip->file_offset, - dip->file_offset + dip->bytes - 1, NULL); + btrfs_unlock_dio_extent(&inode->io_tree, dip->file_offset, + dip->file_offset + dip->bytes - 1, NULL); } bbio->bio.bi_private = bbio->private; @@ -691,9 +691,9 @@ static int btrfs_extract_ordered_extent(struct btrfs_bio *bbio, * a pre-existing one. */ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags)) { - ret = split_extent_map(bbio->inode, bbio->file_offset, - ordered->num_bytes, len, - ordered->disk_bytenr); + ret = btrfs_split_extent_map(bbio->inode, bbio->file_offset, + ordered->num_bytes, len, + ordered->disk_bytenr); if (ret) return ret; } @@ -855,6 +855,22 @@ relock: btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); goto buffered; } + /* + * We can't control the folios being passed in, applications can write + * to them while a direct IO write is in progress. This means the + * content might change after we calculated the data checksum. + * Therefore we can end up storing a checksum that doesn't match the + * persisted data. + * + * To be extra safe and avoid false data checksum mismatch, if the + * inode requires data checksum, just fallback to buffered IO. + * For buffered IO we have full control of page cache and can ensure + * no one is modifying the content during writeback. + */ + if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)) { + btrfs_inode_unlock(BTRFS_I(inode), ilock_flags); + goto buffered; + } /* * The iov_iter can be mapped to the same file range we are writing to. 
diff --git a/fs/btrfs/direct-io.h b/fs/btrfs/direct-io.h index 3dc3ea926afe..df5d45ee6de7 100644 --- a/fs/btrfs/direct-io.h +++ b/fs/btrfs/direct-io.h @@ -5,6 +5,8 @@ #include <linux/types.h> +struct kiocb; + int __init btrfs_init_dio(void); void __cold btrfs_destroy_dio(void); diff --git a/fs/btrfs/discard.c b/fs/btrfs/discard.c index e815d165cccc..89fe85778115 100644 --- a/fs/btrfs/discard.c +++ b/fs/btrfs/discard.c @@ -94,8 +94,6 @@ static void __add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, struct btrfs_block_group *block_group) { lockdep_assert_held(&discard_ctl->lock); - if (!btrfs_run_discard_work(discard_ctl)) - return; if (list_empty(&block_group->discard_list) || block_group->discard_index == BTRFS_DISCARD_INDEX_UNUSED) { @@ -118,6 +116,9 @@ static void add_to_discard_list(struct btrfs_discard_ctl *discard_ctl, if (!btrfs_is_block_group_data_only(block_group)) return; + if (!btrfs_run_discard_work(discard_ctl)) + return; + spin_lock(&discard_ctl->lock); __add_to_discard_list(discard_ctl, block_group); spin_unlock(&discard_ctl->lock); @@ -167,13 +168,7 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl, block_group->discard_eligible_time = 0; queued = !list_empty(&block_group->discard_list); list_del_init(&block_group->discard_list); - /* - * If the block group is currently running in the discard workfn, we - * don't want to deref it, since it's still being used by the workfn. - * The workfn will notice this case and deref the block group when it is - * finished. - */ - if (queued && !running) + if (queued) btrfs_put_block_group(block_group); spin_unlock(&discard_ctl->lock); @@ -250,6 +245,20 @@ again: block_group->used != 0) { if (btrfs_is_block_group_data_only(block_group)) { __add_to_discard_list(discard_ctl, block_group); + /* + * The block group must have been moved to another + * discard list even if discard was disabled in + * the meantime or a transaction abort happened, + * otherwise we can end up in an infinite loop, + * always jumping into the 'again' label and + * keep getting this block group over and over + * in case there are no other block groups in + * the discard lists.
+ */ + ASSERT(block_group->discard_index != + BTRFS_DISCARD_INDEX_UNUSED, + "discard_index=%d", + block_group->discard_index); } else { list_del_init(&block_group->discard_list); btrfs_put_block_group(block_group); @@ -260,9 +269,10 @@ again: block_group->discard_cursor = block_group->start; block_group->discard_state = BTRFS_DISCARD_EXTENTS; } - discard_ctl->block_group = block_group; } if (block_group) { + btrfs_get_block_group(block_group); + discard_ctl->block_group = block_group; *discard_state = block_group->discard_state; *discard_index = block_group->discard_index; } @@ -493,9 +503,20 @@ static void btrfs_discard_workfn(struct work_struct *work) block_group = peek_discard_list(discard_ctl, &discard_state, &discard_index, now); - if (!block_group || !btrfs_run_discard_work(discard_ctl)) + if (!block_group) + return; + if (!btrfs_run_discard_work(discard_ctl)) { + spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); + discard_ctl->block_group = NULL; + spin_unlock(&discard_ctl->lock); return; + } if (now < block_group->discard_eligible_time) { + spin_lock(&discard_ctl->lock); + btrfs_put_block_group(block_group); + discard_ctl->block_group = NULL; + spin_unlock(&discard_ctl->lock); btrfs_discard_schedule_work(discard_ctl, false); return; } @@ -547,15 +568,7 @@ static void btrfs_discard_workfn(struct work_struct *work) spin_lock(&discard_ctl->lock); discard_ctl->prev_discard = trimmed; discard_ctl->prev_discard_time = now; - /* - * If the block group was removed from the discard list while it was - * running in this workfn, then we didn't deref it, since this function - * still owned that reference. But we set the discard_ctl->block_group - * back to NULL, so we can use that condition to know that now we need - * to deref the block_group. 
- */ - if (discard_ctl->block_group == NULL) - btrfs_put_block_group(block_group); + btrfs_put_block_group(block_group); discard_ctl->block_group = NULL; __btrfs_discard_schedule_work(discard_ctl, now, false); spin_unlock(&discard_ctl->lock); diff --git a/fs/btrfs/discard.h b/fs/btrfs/discard.h index dddb0f9101ba..2c5e85394092 100644 --- a/fs/btrfs/discard.h +++ b/fs/btrfs/discard.h @@ -3,6 +3,7 @@ #ifndef BTRFS_DISCARD_H #define BTRFS_DISCARD_H +#include <linux/types.h> #include <linux/sizes.h> struct btrfs_fs_info; diff --git a/fs/btrfs/disk-io.c b/fs/btrfs/disk-io.c index f09db62e61a1..1beb9458f622 100644 --- a/fs/btrfs/disk-io.c +++ b/fs/btrfs/disk-io.c @@ -182,22 +182,22 @@ static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num) { struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios = num_extent_folios(eb); int ret = 0; if (sb_rdonly(fs_info->sb)) return -EROFS; - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; u64 start = max_t(u64, eb->start, folio_pos(folio)); u64 end = min_t(u64, eb->start + eb->len, folio_pos(folio) + eb->folio_size); u32 len = end - start; + phys_addr_t paddr = PFN_PHYS(folio_pfn(folio)) + + offset_in_folio(folio, start); - ret = btrfs_repair_io_failure(fs_info, 0, start, len, - start, folio, offset_in_folio(folio, start), - mirror_num); + ret = btrfs_repair_io_failure(fs_info, 0, start, len, start, + paddr, mirror_num); if (ret) break; } @@ -225,7 +225,6 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, ASSERT(check); while (1) { - clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); ret = read_extent_buffer_pages(eb, mirror_num, check); if (!ret) break; @@ -257,7 +256,7 @@ int btrfs_read_extent_buffer(struct extent_buffer *eb, /* * Checksum a dirty tree block before IO. */ -blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) +int btree_csum_one_bio(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; struct btrfs_fs_info *fs_info = eb->fs_info; @@ -268,9 +267,9 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) /* Btree blocks are always contiguous on disk. 
*/ if (WARN_ON_ONCE(bbio->file_offset != eb->start)) - return BLK_STS_IOERR; + return -EIO; if (WARN_ON_ONCE(bbio->bio.bi_iter.bi_size != eb->len)) - return BLK_STS_IOERR; + return -EIO; /* * If an extent_buffer is marked as EXTENT_BUFFER_ZONED_ZEROOUT, don't @@ -279,14 +278,13 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) */ if (test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)) { memzero_extent_buffer(eb, 0, eb->len); - return BLK_STS_OK; + return 0; } if (WARN_ON_ONCE(found_start != eb->start)) - return BLK_STS_IOERR; - if (WARN_ON(!btrfs_folio_test_uptodate(fs_info, eb->folios[0], - eb->start, eb->len))) - return BLK_STS_IOERR; + return -EIO; + if (WARN_ON(!btrfs_meta_folio_test_uptodate(eb->folios[0], eb))) + return -EIO; ASSERT(memcmp_extent_buffer(eb, fs_info->fs_devices->metadata_uuid, offsetof(struct btrfs_header, fsid), @@ -314,7 +312,7 @@ blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio) goto error; } write_extent_buffer(eb, result, 0, fs_info->csum_size); - return BLK_STS_OK; + return 0; error: btrfs_print_tree(eb, 0); @@ -328,7 +326,7 @@ error: */ WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG) || btrfs_header_owner(eb) == BTRFS_TREE_LOG_OBJECTID); - return errno_to_blk_status(ret); + return ret; } static bool check_tree_block_fsid(struct extent_buffer *eb) @@ -454,15 +452,9 @@ int btrfs_validate_extent_buffer(struct extent_buffer *eb, goto out; } - /* - * If this is a leaf block and it is corrupt, set the corrupt bit so - * that we don't try and read the other copies of this block, just - * return -EIO. - */ - if (found_level == 0 && btrfs_check_leaf(eb)) { - set_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags); + /* If this is a leaf block and it is corrupt, just return -EIO. */ + if (found_level == 0 && btrfs_check_leaf(eb)) ret = -EIO; - } if (found_level > 0 && btrfs_check_node(eb)) ret = -EIO; @@ -643,11 +635,16 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, } -static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, - u64 objectid) +static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, + u64 objectid, gfp_t flags) { + struct btrfs_root *root; bool dummy = btrfs_is_testing(fs_info); + root = kzalloc(sizeof(*root), flags); + if (!root) + return NULL; + memset(&root->root_key, 0, sizeof(root->root_key)); memset(&root->root_item, 0, sizeof(root->root_item)); memset(&root->defrag_progress, 0, sizeof(root->defrag_progress)); @@ -700,10 +697,10 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, btrfs_set_root_last_log_commit(root, 0); root->anon_dev = 0; if (!dummy) { - extent_io_tree_init(fs_info, &root->dirty_log_pages, - IO_TREE_ROOT_DIRTY_LOG_PAGES); - extent_io_tree_init(fs_info, &root->log_csum_range, - IO_TREE_LOG_CSUM_RANGE); + btrfs_extent_io_tree_init(fs_info, &root->dirty_log_pages, + IO_TREE_ROOT_DIRTY_LOG_PAGES); + btrfs_extent_io_tree_init(fs_info, &root->log_csum_range, + IO_TREE_LOG_CSUM_RANGE); } spin_lock_init(&root->root_item_lock); @@ -714,14 +711,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info, list_add_tail(&root->leak_list, &fs_info->allocated_roots); spin_unlock(&fs_info->fs_roots_radix_lock); #endif -} -static struct btrfs_root *btrfs_alloc_root(struct btrfs_fs_info *fs_info, - u64 objectid, gfp_t flags) -{ - struct btrfs_root *root = kzalloc(sizeof(*root), flags); - if (root) - __setup_root(root, fs_info, objectid); return root; } @@ -1089,21 +1079,22 @@ struct btrfs_root *btrfs_read_tree_root(struct 
btrfs_root *tree_root, const struct btrfs_key *key) { struct btrfs_root *root; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) return ERR_PTR(-ENOMEM); root = read_tree_root_path(tree_root, path, key); - btrfs_free_path(path); return root; } /* - * Initialize subvolume root in-memory structure + * Initialize subvolume root in-memory structure. * * @anon_dev: anonymous device to attach to the root, if zero, allocate new + * + * In case of failure the caller is responsible to call btrfs_free_fs_root() */ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) { @@ -1127,7 +1118,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) if (!anon_dev) { ret = get_anon_bdev(&root->anon_dev); if (ret) - goto fail; + return ret; } else { root->anon_dev = anon_dev; } @@ -1137,7 +1128,7 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) ret = btrfs_init_root_free_objectid(root); if (ret) { mutex_unlock(&root->objectid_mutex); - goto fail; + return ret; } ASSERT(root->free_objectid <= BTRFS_LAST_FREE_OBJECTID); @@ -1145,9 +1136,6 @@ static int btrfs_init_fs_root(struct btrfs_root *root, dev_t anon_dev) mutex_unlock(&root->objectid_mutex); return 0; -fail: - /* The caller is responsible to call btrfs_free_fs_root */ - return ret; } static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info, @@ -1568,7 +1556,7 @@ static int transaction_kthread(void *arg) do { cannot_commit = false; - delay = msecs_to_jiffies(fs_info->commit_interval * 1000); + delay = secs_to_jiffies(fs_info->commit_interval); mutex_lock(&fs_info->transaction_kthread_mutex); spin_lock(&fs_info->trans_lock); @@ -1583,9 +1571,9 @@ static int transaction_kthread(void *arg) cur->state < TRANS_STATE_COMMIT_PREP && delta < fs_info->commit_interval) { spin_unlock(&fs_info->trans_lock); - delay -= msecs_to_jiffies((delta - 1) * 1000); + delay -= secs_to_jiffies(delta - 1); delay = min(delay, - msecs_to_jiffies(fs_info->commit_interval * 1000)); + secs_to_jiffies(fs_info->commit_interval)); goto sleep; } transid = cur->transid; @@ -1867,8 +1855,8 @@ void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info) int i; while (!list_empty(&fs_info->dead_roots)) { - gang[0] = list_entry(fs_info->dead_roots.next, - struct btrfs_root, root_list); + gang[0] = list_first_entry(&fs_info->dead_roots, + struct btrfs_root, root_list); list_del(&gang[0]->root_list); if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state)) @@ -1931,9 +1919,9 @@ static int btrfs_init_btree_inode(struct super_block *sb) inode->i_mapping->a_ops = &btree_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_NOFS); - extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, - IO_TREE_BTREE_INODE_IO); - extent_map_tree_init(&BTRFS_I(inode)->extent_tree); + btrfs_extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree, + IO_TREE_BTREE_INODE_IO); + btrfs_extent_map_tree_init(&BTRFS_I(inode)->extent_tree); BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root); set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); @@ -2006,7 +1994,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info) btrfs_alloc_ordered_workqueue(fs_info, "qgroup-rescan", ordered_flags); fs_info->discard_ctl.discard_workers = - alloc_ordered_workqueue("btrfs_discard", WQ_FREEZABLE); + alloc_ordered_workqueue("btrfs-discard", WQ_FREEZABLE); if (!(fs_info->workers && fs_info->delalloc_workers && fs_info->flush_workers && @@ -2200,8 +2188,8 @@ static int load_global_roots_objectid(struct 
btrfs_root *tree_root, static int load_global_roots(struct btrfs_root *tree_root) { - struct btrfs_path *path; - int ret = 0; + BTRFS_PATH_AUTO_FREE(path); + int ret; path = btrfs_alloc_path(); if (!path) @@ -2210,18 +2198,17 @@ static int load_global_roots(struct btrfs_root *tree_root) ret = load_global_roots_objectid(tree_root, path, BTRFS_EXTENT_TREE_OBJECTID, "extent"); if (ret) - goto out; + return ret; ret = load_global_roots_objectid(tree_root, path, BTRFS_CSUM_TREE_OBJECTID, "csum"); if (ret) - goto out; + return ret; if (!btrfs_fs_compat_ro(tree_root->fs_info, FREE_SPACE_TREE)) - goto out; + return ret; ret = load_global_roots_objectid(tree_root, path, BTRFS_FREE_SPACE_TREE_OBJECTID, "free space"); -out: - btrfs_free_path(path); + return ret; } @@ -2447,21 +2434,27 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, * Check sectorsize and nodesize first, other check will need it. * Check all possible sectorsize(4K, 8K, 16K, 32K, 64K) here. */ - if (!is_power_of_2(sectorsize) || sectorsize < 4096 || + if (!is_power_of_2(sectorsize) || sectorsize < BTRFS_MIN_BLOCKSIZE || sectorsize > BTRFS_MAX_METADATA_BLOCKSIZE) { btrfs_err(fs_info, "invalid sectorsize %llu", sectorsize); ret = -EINVAL; } /* - * We only support at most two sectorsizes: 4K and PAGE_SIZE. + * We only support at most 3 sectorsizes: 4K, PAGE_SIZE, MIN_BLOCKSIZE. + * + * For 4K page sized systems with non-debug builds, all 3 match (4K). + * For 4K page sized systems with debug builds, there are two block sizes + * supported (4K and 2K). * * We can support 16K sectorsize with 64K page size without problem, * but such sectorsize/pagesize combination doesn't make much sense. * 4K will be our future standard, PAGE_SIZE is supported from the very * beginning. */ - if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) { + if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && + sectorsize != PAGE_SIZE && + sectorsize != BTRFS_MIN_BLOCKSIZE)) { btrfs_err(fs_info, "sectorsize %llu not yet supported for page size %lu", sectorsize, PAGE_SIZE); @@ -2561,6 +2554,9 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, ret = -EINVAL; } + if (ret) + return ret; + ret = validate_sys_chunk_array(fs_info, sb); /* @@ -2765,10 +2761,21 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info) return ret; } +/* + * Lockdep gets confused between our buffer_tree which requires IRQ locking because + * we modify marks in the IRQ context, and our delayed inode xarray which doesn't + * have these requirements. Use a class key so lockdep doesn't get them mixed up. + */ +static struct lock_class_key buffer_xa_class; + void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) { INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC); - INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC); + + /* Use the same flags as mapping->i_pages.
*/ + xa_init_flags(&fs_info->buffer_tree, XA_FLAGS_LOCK_IRQ | XA_FLAGS_ACCOUNT); + lockdep_set_class(&fs_info->buffer_tree.xa_lock, &buffer_xa_class); + INIT_LIST_HEAD(&fs_info->trans_list); INIT_LIST_HEAD(&fs_info->dead_roots); INIT_LIST_HEAD(&fs_info->delayed_iputs); @@ -2780,7 +2787,6 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) spin_lock_init(&fs_info->delayed_iput_lock); spin_lock_init(&fs_info->defrag_inodes_lock); spin_lock_init(&fs_info->super_lock); - spin_lock_init(&fs_info->buffer_lock); spin_lock_init(&fs_info->unused_bgs_lock); spin_lock_init(&fs_info->treelog_bg_lock); spin_lock_init(&fs_info->zone_active_bgs_lock); @@ -2825,6 +2831,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) BTRFS_BLOCK_RSV_GLOBAL); btrfs_init_block_rsv(&fs_info->trans_block_rsv, BTRFS_BLOCK_RSV_TRANS); btrfs_init_block_rsv(&fs_info->chunk_block_rsv, BTRFS_BLOCK_RSV_CHUNK); + btrfs_init_block_rsv(&fs_info->treelog_rsv, BTRFS_BLOCK_RSV_TREELOG); btrfs_init_block_rsv(&fs_info->empty_block_rsv, BTRFS_BLOCK_RSV_EMPTY); btrfs_init_block_rsv(&fs_info->delayed_block_rsv, BTRFS_BLOCK_RSV_DELOPS); @@ -2858,8 +2865,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info) rwlock_init(&fs_info->block_group_cache_lock); fs_info->block_group_cache_tree = RB_ROOT_CACHED; - extent_io_tree_init(fs_info, &fs_info->excluded_extents, - IO_TREE_FS_EXCLUDED_EXTENTS); + btrfs_extent_io_tree_init(fs_info, &fs_info->excluded_extents, + IO_TREE_FS_EXCLUDED_EXTENTS); mutex_init(&fs_info->ordered_operations_mutex); mutex_init(&fs_info->tree_log_mutex); @@ -3311,7 +3318,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device /* * Read super block and check the signature bytes only */ - disk_super = btrfs_read_dev_super(fs_devices->latest_dev->bdev); + disk_super = btrfs_read_disk_super(fs_devices->latest_dev->bdev, 0, false); if (IS_ERR(disk_super)) { ret = PTR_ERR(disk_super); goto fail_alloc; @@ -3390,7 +3397,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device fs_info->nodesize = nodesize; fs_info->sectorsize = sectorsize; fs_info->sectorsize_bits = ilog2(sectorsize); - fs_info->sectors_per_page = (PAGE_SIZE >> fs_info->sectorsize_bits); fs_info->csums_per_leaf = BTRFS_MAX_ITEM_SIZE(fs_info) / fs_info->csum_size; fs_info->stripesize = stripesize; fs_info->fs_devices->fs_info = fs_info; @@ -3416,11 +3422,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device */ fs_info->max_inline = min_t(u64, fs_info->max_inline, fs_info->sectorsize); - if (sectorsize < PAGE_SIZE) - btrfs_warn(fs_info, - "read-write for sector size %u with page size %lu is experimental", - sectorsize, PAGE_SIZE); - ret = btrfs_init_workqueues(fs_info); if (ret) goto fail_sb_buffer; @@ -3712,85 +3713,6 @@ static void btrfs_end_super_write(struct bio *bio) bio_put(bio); } -struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, - int copy_num, bool drop_cache) -{ - struct btrfs_super_block *super; - struct page *page; - u64 bytenr, bytenr_orig; - struct address_space *mapping = bdev->bd_mapping; - int ret; - - bytenr_orig = btrfs_sb_offset(copy_num); - ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr); - if (ret == -ENOENT) - return ERR_PTR(-EINVAL); - else if (ret) - return ERR_PTR(ret); - - if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev)) - return ERR_PTR(-EINVAL); - - if (drop_cache) { - /* This should only be called with the primary sb. 
*/ - ASSERT(copy_num == 0); - - /* - * Drop the page of the primary superblock, so later read will - * always read from the device. - */ - invalidate_inode_pages2_range(mapping, - bytenr >> PAGE_SHIFT, - (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT); - } - - page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); - if (IS_ERR(page)) - return ERR_CAST(page); - - super = page_address(page); - if (btrfs_super_magic(super) != BTRFS_MAGIC) { - btrfs_release_disk_super(super); - return ERR_PTR(-ENODATA); - } - - if (btrfs_super_bytenr(super) != bytenr_orig) { - btrfs_release_disk_super(super); - return ERR_PTR(-EINVAL); - } - - return super; -} - - -struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev) -{ - struct btrfs_super_block *super, *latest = NULL; - int i; - u64 transid = 0; - - /* we would like to check all the supers, but that would make - * a btrfs mount succeed after a mkfs from a different FS. - * So, we need to add a special mount option to scan for - * later supers, using BTRFS_SUPER_MIRROR_MAX instead - */ - for (i = 0; i < 1; i++) { - super = btrfs_read_dev_one_super(bdev, i, false); - if (IS_ERR(super)) - continue; - - if (!latest || btrfs_super_generation(super) > transid) { - if (latest) - btrfs_release_disk_super(super); - - latest = super; - transid = btrfs_super_generation(super); - } - } - - return super; -} - /* * Write superblock @sb to the @device. Do not wait for completion, all the * folios we use for writing are locked. @@ -3830,8 +3752,8 @@ static int write_dev_supers(struct btrfs_device *device, continue; } else if (ret < 0) { btrfs_err(device->fs_info, - "couldn't get super block location for mirror %d", - i); + "couldn't get super block location for mirror %d error %d", + i, ret); atomic_inc(&device->sb_write_errors); continue; } @@ -3850,12 +3772,11 @@ static int write_dev_supers(struct btrfs_device *device, GFP_NOFS); if (IS_ERR(folio)) { btrfs_err(device->fs_info, - "couldn't get super block page for bytenr %llu", - bytenr); + "couldn't get super block page for bytenr %llu error %ld", + bytenr, PTR_ERR(folio)); atomic_inc(&device->sb_write_errors); continue; } - ASSERT(folio_order(folio) == 0); offset = offset_in_folio(folio, bytenr); disk_super = folio_address(folio) + offset; @@ -3928,7 +3849,6 @@ static int wait_dev_supers(struct btrfs_device *device, int max_mirrors) /* If the folio has been removed, then we know it completed. */ if (IS_ERR(folio)) continue; - ASSERT(folio_order(folio) == 0); /* Folio will be unlocked once the write completes. */ folio_wait_locked(folio); @@ -4248,8 +4168,9 @@ static void warn_about_uncommitted_trans(struct btrfs_fs_info *fs_info) u64 found_end; found = true; - while (find_first_extent_bit(&trans->dirty_pages, cur, - &found_start, &found_end, EXTENT_DIRTY, &cached)) { + while (btrfs_find_first_extent_bit(&trans->dirty_pages, cur, + &found_start, &found_end, + EXTENT_DIRTY, &cached)) { dirty_bytes += found_end + 1 - found_start; cur = found_end + 1; } @@ -4326,6 +4247,14 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_cleanup_defrag_inodes(fs_info); /* + * Handle the error fs first, as it will flush and wait for all ordered + * extents. This will generate delayed iputs, thus we want to handle + * it first. + */ + if (unlikely(BTRFS_FS_ERROR(fs_info))) + btrfs_error_commit_super(fs_info); + + /* * Wait for any fixup workers to complete. 
* If we don't wait for them here and they are still running by the time * we call kthread_stop() against the cleaner kthread further below, we @@ -4346,6 +4275,31 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_flush_workqueue(fs_info->delalloc_workers); /* + * We can have ordered extents getting their last reference dropped from + * the fs_info->workers queue because for async writes for data bios we + * queue a work for that queue, at btrfs_wq_submit_bio(), that runs + * run_one_async_done() which calls btrfs_bio_end_io() in case the bio + * has an error, and that later function can do the final + * btrfs_put_ordered_extent() on the ordered extent attached to the bio, + * which adds a delayed iput for the inode. So we must flush the queue + * so that we don't have delayed iputs after committing the current + * transaction below and stopping the cleaner and transaction kthreads. + */ + btrfs_flush_workqueue(fs_info->workers); + + /* + * When finishing a compressed write bio we schedule a work queue item + * to finish an ordered extent - btrfs_finish_compressed_write_work() + * calls btrfs_finish_ordered_extent() which in turn does a call to + * btrfs_queue_ordered_fn(), and that queues the ordered extent + * completion either in the endio_write_workers work queue or in the + * fs_info->endio_freespace_worker work queue. We flush those queues + * below, so before we flush them we must flush this queue for the + * workers of compressed writes. + */ + flush_workqueue(fs_info->compressed_write_workers); + + /* * After we parked the cleaner kthread, ordered extents may have * completed and created new delayed iputs. If one of the async reclaim * tasks is running and in the RUN_DELAYED_IPUTS flush state, then we @@ -4369,6 +4323,8 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) /* Ordered extents for free space inodes. */ btrfs_flush_workqueue(fs_info->endio_freespace_worker); btrfs_run_delayed_iputs(fs_info); + /* There should be no more workload to generate new delayed iputs.
*/ + set_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state); cancel_work_sync(&fs_info->async_reclaim_work); cancel_work_sync(&fs_info->async_data_reclaim_work); @@ -4403,9 +4359,6 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) btrfs_err(fs_info, "commit super ret %d", ret); } - if (BTRFS_FS_ERROR(fs_info)) - btrfs_error_commit_super(fs_info); - kthread_stop(fs_info->transaction_kthread); kthread_stop(fs_info->cleaner_kthread); @@ -4413,7 +4366,7 @@ void __cold close_ctree(struct btrfs_fs_info *fs_info) set_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags); if (btrfs_check_quota_leak(fs_info)) { - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN("qgroup reserved space leaked"); btrfs_err(fs_info, "qgroup reserved space leaked"); } @@ -4528,10 +4481,6 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info) /* cleanup FS via transaction */ btrfs_cleanup_transaction(fs_info); - mutex_lock(&fs_info->cleaner_mutex); - btrfs_run_delayed_iputs(fs_info); - mutex_unlock(&fs_info->cleaner_mutex); - down_write(&fs_info->cleanup_work_sem); up_write(&fs_info->cleanup_work_sem); } @@ -4674,9 +4623,9 @@ static void btrfs_destroy_marked_extents(struct btrfs_fs_info *fs_info, u64 start = 0; u64 end; - while (find_first_extent_bit(dirty_pages, start, &start, &end, - mark, NULL)) { - clear_extent_bits(dirty_pages, start, end, mark); + while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end, + mark, NULL)) { + btrfs_clear_extent_bits(dirty_pages, start, end, mark); while (start <= end) { eb = find_extent_buffer(fs_info, start); start += fs_info->nodesize; @@ -4709,14 +4658,14 @@ static void btrfs_destroy_pinned_extent(struct btrfs_fs_info *fs_info, * the same extent range. */ mutex_lock(&fs_info->unused_bg_unpin_mutex); - if (!find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, &cached_state)) { + if (!btrfs_find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, &cached_state)) { mutex_unlock(&fs_info->unused_bg_unpin_mutex); break; } - clear_extent_dirty(unpin, start, end, &cached_state); - free_extent_state(cached_state); + btrfs_clear_extent_dirty(unpin, start, end, &cached_state); + btrfs_free_extent_state(cached_state); btrfs_error_unpin_extent_range(fs_info, start, end); mutex_unlock(&fs_info->unused_bg_unpin_mutex); cond_resched(); @@ -4902,7 +4851,7 @@ static int btrfs_cleanup_transaction(struct btrfs_fs_info *fs_info) int btrfs_init_root_free_objectid(struct btrfs_root *root) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; struct extent_buffer *l; struct btrfs_key search_key; @@ -4918,14 +4867,13 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) search_key.offset = (u64)-1; ret = btrfs_search_slot(NULL, root, &search_key, path, 0, 0); if (ret < 0) - goto error; + return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range. 
*/ - ret = -EUCLEAN; - goto error; + return -EUCLEAN; } if (path->slots[0] > 0) { slot = path->slots[0] - 1; @@ -4936,10 +4884,8 @@ int btrfs_init_root_free_objectid(struct btrfs_root *root) } else { root->free_objectid = BTRFS_FIRST_FREE_OBJECTID; } - ret = 0; -error: - btrfs_free_path(path); - return ret; + + return 0; } int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid) diff --git a/fs/btrfs/disk-io.h b/fs/btrfs/disk-io.h index 587842991b24..864a55a96226 100644 --- a/fs/btrfs/disk-io.h +++ b/fs/btrfs/disk-io.h @@ -58,9 +58,6 @@ int btrfs_validate_super(const struct btrfs_fs_info *fs_info, const struct btrfs_super_block *sb, int mirror_num); int btrfs_check_features(struct btrfs_fs_info *fs_info, bool is_rw_mount); int write_all_supers(struct btrfs_fs_info *fs_info, int max_mirrors); -struct btrfs_super_block *btrfs_read_dev_super(struct block_device *bdev); -struct btrfs_super_block *btrfs_read_dev_one_super(struct block_device *bdev, - int copy_num, bool drop_cache); int btrfs_commit_super(struct btrfs_fs_info *fs_info); struct btrfs_root *btrfs_read_tree_root(struct btrfs_root *tree_root, const struct btrfs_key *key); @@ -114,7 +111,7 @@ int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid, int btrfs_read_extent_buffer(struct extent_buffer *buf, const struct btrfs_tree_parent_check *check); -blk_status_t btree_csum_one_bio(struct btrfs_bio *bbio); +int btree_csum_one_bio(struct btrfs_bio *bbio); int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans, struct btrfs_root *root); int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans, diff --git a/fs/btrfs/export.c b/fs/btrfs/export.c index e2b22bea348a..7fc8a3200b40 100644 --- a/fs/btrfs/export.c +++ b/fs/btrfs/export.c @@ -75,7 +75,7 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, { struct btrfs_fs_info *fs_info = btrfs_sb(sb); struct btrfs_root *root; - struct inode *inode; + struct btrfs_inode *inode; if (objectid < BTRFS_FIRST_FREE_OBJECTID) return ERR_PTR(-ESTALE); @@ -89,12 +89,12 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid, if (IS_ERR(inode)) return ERR_CAST(inode); - if (generation != 0 && generation != inode->i_generation) { - iput(inode); + if (generation != 0 && generation != inode->vfs_inode.i_generation) { + iput(&inode->vfs_inode); return ERR_PTR(-ESTALE); } - return d_obtain_alias(inode); + return d_obtain_alias(&inode->vfs_inode); } static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh, @@ -145,9 +145,10 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh, struct dentry *btrfs_get_parent(struct dentry *child) { - struct inode *dir = d_inode(child); - struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb); - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_inode *dir = BTRFS_I(d_inode(child)); + struct btrfs_inode *inode; + struct btrfs_root *root = dir->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_path *path; struct extent_buffer *leaf; struct btrfs_root_ref *ref; @@ -159,13 +160,13 @@ struct dentry *btrfs_get_parent(struct dentry *child) if (!path) return ERR_PTR(-ENOMEM); - if (btrfs_ino(BTRFS_I(dir)) == BTRFS_FIRST_FREE_OBJECTID) { + if (btrfs_ino(dir) == BTRFS_FIRST_FREE_OBJECTID) { key.objectid = btrfs_root_id(root); key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = (u64)-1; root = fs_info->tree_root; } else { - key.objectid = btrfs_ino(BTRFS_I(dir)); + key.objectid = btrfs_ino(dir); key.type = BTRFS_INODE_REF_KEY; key.offset = 
(u64)-1; } @@ -210,7 +211,11 @@ struct dentry *btrfs_get_parent(struct dentry *child) found_key.offset, 0); } - return d_obtain_alias(btrfs_iget(key.objectid, root)); + inode = btrfs_iget(key.objectid, root); + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return d_obtain_alias(&inode->vfs_inode); fail: btrfs_free_path(path); return ERR_PTR(ret); @@ -219,11 +224,11 @@ fail: static int btrfs_get_name(struct dentry *parent, char *name, struct dentry *child) { - struct inode *inode = d_inode(child); - struct inode *dir = d_inode(parent); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_path *path; - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_inode *inode = BTRFS_I(d_inode(child)); + struct btrfs_inode *dir = BTRFS_I(d_inode(parent)); + struct btrfs_root *root = dir->root; + struct btrfs_fs_info *fs_info = root->fs_info; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_inode_ref *iref; struct btrfs_root_ref *rref; struct extent_buffer *leaf; @@ -233,37 +238,34 @@ static int btrfs_get_name(struct dentry *parent, char *name, int ret; u64 ino; - if (!S_ISDIR(dir->i_mode)) + if (!S_ISDIR(dir->vfs_inode.i_mode)) return -EINVAL; - ino = btrfs_ino(BTRFS_I(inode)); + ino = btrfs_ino(inode); path = btrfs_alloc_path(); if (!path) return -ENOMEM; if (ino == BTRFS_FIRST_FREE_OBJECTID) { - key.objectid = btrfs_root_id(BTRFS_I(inode)->root); + key.objectid = btrfs_root_id(inode->root); key.type = BTRFS_ROOT_BACKREF_KEY; key.offset = (u64)-1; root = fs_info->tree_root; } else { key.objectid = ino; - key.offset = btrfs_ino(BTRFS_I(dir)); key.type = BTRFS_INODE_REF_KEY; + key.offset = btrfs_ino(dir); } ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { - btrfs_free_path(path); return ret; } else if (ret > 0) { - if (ino == BTRFS_FIRST_FREE_OBJECTID) { + if (ino == BTRFS_FIRST_FREE_OBJECTID) path->slots[0]--; - } else { - btrfs_free_path(path); + else return -ENOENT; - } } leaf = path->nodes[0]; @@ -280,7 +282,6 @@ static int btrfs_get_name(struct dentry *parent, char *name, } read_extent_buffer(leaf, name, name_ptr, name_len); - btrfs_free_path(path); /* * have to add the null termination to make sure that reconnect_path diff --git a/fs/btrfs/extent-io-tree.c b/fs/btrfs/extent-io-tree.c index 6d08c100b01d..b1b96eb5f64e 100644 --- a/fs/btrfs/extent-io-tree.c +++ b/fs/btrfs/extent-io-tree.c @@ -42,7 +42,7 @@ static inline void btrfs_extent_state_leak_debug_check(void) struct extent_state *state; while (!list_empty(&states)) { - state = list_entry(states.next, struct extent_state, leak_list); + state = list_first_entry(&states, struct extent_state, leak_list); pr_err("BTRFS: state leak: start %llu end %llu state %u in tree %d refs %d\n", state->start, state->end, state->state, extent_state_in_tree(state), @@ -59,13 +59,12 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, struct extent_io_tree *tree, u64 start, u64 end) { - const struct btrfs_inode *inode; + const struct btrfs_inode *inode = tree->inode; u64 isize; if (tree->owner != IO_TREE_INODE_IO) return; - inode = extent_io_tree_to_inode_const(tree); isize = i_size_read(&inode->vfs_inode); if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) { btrfs_debug_rl(inode->root->fs_info, @@ -80,25 +79,8 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller, #define btrfs_debug_check_extent_io_range(c, s, e) do {} while (0) #endif - -/* - * The only tree allowed to set the inode is IO_TREE_INODE_IO. 
- */ -static bool is_inode_io_tree(const struct extent_io_tree *tree) -{ - return tree->owner == IO_TREE_INODE_IO; -} - -/* Return the inode if it's valid for the given tree, otherwise NULL. */ -struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree) -{ - if (tree->owner == IO_TREE_INODE_IO) - return tree->inode; - return NULL; -} - /* Read-only access to the inode. */ -const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree) +const struct btrfs_inode *btrfs_extent_io_tree_to_inode(const struct extent_io_tree *tree) { if (tree->owner == IO_TREE_INODE_IO) return tree->inode; @@ -106,15 +88,15 @@ const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_t } /* For read-only access to fs_info. */ -const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree) +const struct btrfs_fs_info *btrfs_extent_io_tree_to_fs_info(const struct extent_io_tree *tree) { if (tree->owner == IO_TREE_INODE_IO) return tree->inode->root->fs_info; return tree->fs_info; } -void extent_io_tree_init(struct btrfs_fs_info *fs_info, - struct extent_io_tree *tree, unsigned int owner) +void btrfs_extent_io_tree_init(struct btrfs_fs_info *fs_info, + struct extent_io_tree *tree, unsigned int owner) { tree->state = RB_ROOT; spin_lock_init(&tree->lock); @@ -129,7 +111,7 @@ void extent_io_tree_init(struct btrfs_fs_info *fs_info, * aren't any waiters on any extent state record (EXTENT_LOCK_BITS are never * set on any extent state when calling this function). */ -void extent_io_tree_release(struct extent_io_tree *tree) +void btrfs_extent_io_tree_release(struct extent_io_tree *tree) { struct rb_root root; struct extent_state *state; @@ -148,7 +130,7 @@ void extent_io_tree_release(struct extent_io_tree *tree) * (see wait_extent_bit()). */ ASSERT(!waitqueue_active(&state->wq)); - free_extent_state(state); + btrfs_free_extent_state(state); cond_resched_lock(&tree->lock); } /* @@ -176,7 +158,7 @@ static struct extent_state *alloc_extent_state(gfp_t mask) btrfs_leak_debug_add_state(state); refcount_set(&state->refs, 1); init_waitqueue_head(&state->wq); - trace_alloc_extent_state(state, mask, _RET_IP_); + trace_btrfs_alloc_extent_state(state, mask, _RET_IP_); return state; } @@ -188,14 +170,14 @@ static struct extent_state *alloc_extent_state_atomic(struct extent_state *preal return prealloc; } -void free_extent_state(struct extent_state *state) +void btrfs_free_extent_state(struct extent_state *state) { if (!state) return; if (refcount_dec_and_test(&state->refs)) { WARN_ON(extent_state_in_tree(state)); btrfs_leak_debug_del_state(state); - trace_free_extent_state(state, _RET_IP_); + trace_btrfs_free_extent_state(state, _RET_IP_); kmem_cache_free(extent_state_cache, state); } } @@ -222,38 +204,34 @@ static inline struct extent_state *next_state(struct extent_state *state) { struct rb_node *next = rb_next(&state->rb_node); - if (next) - return rb_entry(next, struct extent_state, rb_node); - else - return NULL; + return rb_entry_safe(next, struct extent_state, rb_node); } static inline struct extent_state *prev_state(struct extent_state *state) { struct rb_node *next = rb_prev(&state->rb_node); - if (next) - return rb_entry(next, struct extent_state, rb_node); - else - return NULL; + return rb_entry_safe(next, struct extent_state, rb_node); } /* - * Search @tree for an entry that contains @offset. Such entry would have - * entry->start <= offset && entry->end >= offset. 
+ * Search @tree for an entry that contains @offset or if none exists for the + * first entry that starts and ends after that offset. * * @tree: the tree to search - * @offset: offset that should fall within an entry in @tree + * @offset: search offset * @node_ret: pointer where new node should be anchored (used when inserting an * entry in the tree) * @parent_ret: points to entry which would have been the parent of the entry, * containing @offset * - * Return a pointer to the entry that contains @offset byte address and don't change - * @node_ret and @parent_ret. + * Return a pointer to the entry that contains @offset byte address. + * + * If no such entry exists, return the first entry that starts and ends after + * @offset if one exists, otherwise NULL. * - * If no such entry exists, return pointer to entry that ends before @offset - * and fill parameters @node_ret and @parent_ret, ie. does not return NULL. + * If the returned entry starts at @offset, then @node_ret and @parent_ret + * aren't changed. */ static inline struct extent_state *tree_search_for_insert(struct extent_io_tree *tree, u64 offset, @@ -282,7 +260,11 @@ static inline struct extent_state *tree_search_for_insert(struct extent_io_tree if (parent_ret) *parent_ret = prev; - /* Search neighbors until we find the first one past the end */ + /* + * Return either the current entry if it contains offset (it ends after + * or at offset) or the first entry that starts and ends after offset if + * one exists, or NULL. + */ while (entry && offset > entry->end) entry = next_state(entry); @@ -346,12 +328,12 @@ static inline struct extent_state *tree_search(struct extent_io_tree *tree, u64 return tree_search_for_insert(tree, offset, NULL, NULL); } -static void extent_io_tree_panic(const struct extent_io_tree *tree, - const struct extent_state *state, - const char *opname, - int err) +static void __cold extent_io_tree_panic(const struct extent_io_tree *tree, + const struct extent_state *state, + const char *opname, + int err) { - btrfs_panic(extent_io_tree_to_fs_info(tree), err, + btrfs_panic(btrfs_extent_io_tree_to_fs_info(tree), err, "extent io tree error on %s state start %llu end %llu", opname, state->start, state->end); } @@ -362,13 +344,12 @@ static void merge_prev_state(struct extent_io_tree *tree, struct extent_state *s prev = prev_state(state); if (prev && prev->end == state->start - 1 && prev->state == state->state) { - if (is_inode_io_tree(tree)) - btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree), - state, prev); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_merge_delalloc_extent(tree->inode, state, prev); state->start = prev->start; rb_erase(&prev->rb_node, &tree->state); RB_CLEAR_NODE(&prev->rb_node); - free_extent_state(prev); + btrfs_free_extent_state(prev); } } @@ -378,13 +359,12 @@ static void merge_next_state(struct extent_io_tree *tree, struct extent_state *s next = next_state(state); if (next && next->start == state->end + 1 && next->state == state->state) { - if (is_inode_io_tree(tree)) - btrfs_merge_delalloc_extent(extent_io_tree_to_inode(tree), - state, next); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_merge_delalloc_extent(tree->inode, state, next); state->end = next->end; rb_erase(&next->rb_node, &tree->state); RB_CLEAR_NODE(&next->rb_node); - free_extent_state(next); + btrfs_free_extent_state(next); } } @@ -413,8 +393,8 @@ static void set_state_bits(struct extent_io_tree *tree, u32 bits_to_set = bits & ~EXTENT_CTLBITS; int ret; - if (is_inode_io_tree(tree)) - 
btrfs_set_delalloc_extent(extent_io_tree_to_inode(tree), state, bits); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_set_delalloc_extent(tree->inode, state, bits); ret = add_extent_changeset(state, bits_to_set, changeset, 1); BUG_ON(ret < 0); @@ -459,10 +439,9 @@ static struct extent_state *insert_state(struct extent_io_tree *tree, if (state->end < entry->start) { if (try_merge && end == entry->start && state->state == entry->state) { - if (is_inode_io_tree(tree)) - btrfs_merge_delalloc_extent( - extent_io_tree_to_inode(tree), - state, entry); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_merge_delalloc_extent(tree->inode, + state, entry); entry->start = state->start; merge_prev_state(tree, entry); state->state = 0; @@ -472,10 +451,9 @@ static struct extent_state *insert_state(struct extent_io_tree *tree, } else if (state->end > entry->end) { if (try_merge && entry->end == start && state->state == entry->state) { - if (is_inode_io_tree(tree)) - btrfs_merge_delalloc_extent( - extent_io_tree_to_inode(tree), - state, entry); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_merge_delalloc_extent(tree->inode, + state, entry); entry->end = state->end; merge_next_state(tree, entry); state->state = 0; @@ -527,9 +505,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, struct rb_node *parent = NULL; struct rb_node **node; - if (is_inode_io_tree(tree)) - btrfs_split_delalloc_extent(extent_io_tree_to_inode(tree), orig, - split); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_split_delalloc_extent(tree->inode, orig, split); prealloc->start = orig->start; prealloc->end = split - 1; @@ -549,7 +526,7 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, } else if (prealloc->end > entry->end) { node = &(*node)->rb_right; } else { - free_extent_state(prealloc); + btrfs_free_extent_state(prealloc); return -EEXIST; } } @@ -561,6 +538,18 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, } /* + * Use this during tree iteration to avoid doing next node searches when it's + * not needed (the current record ends at or after the target range's end). + */ +static inline struct extent_state *next_search_state(struct extent_state *state, u64 end) +{ + if (state->end < end) + return next_state(state); + + return NULL; +} + +/* * Utility function to clear some bits in an extent state struct. It will * optionally wake up anyone waiting on this state (wake == 1). 
* @@ -569,16 +558,15 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig, */ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, struct extent_state *state, - u32 bits, int wake, + u32 bits, int wake, u64 end, struct extent_changeset *changeset) { struct extent_state *next; u32 bits_to_clear = bits & ~EXTENT_CTLBITS; int ret; - if (is_inode_io_tree(tree)) - btrfs_clear_delalloc_extent(extent_io_tree_to_inode(tree), state, - bits); + if (tree->owner == IO_TREE_INODE_IO) + btrfs_clear_delalloc_extent(tree->inode, state, bits); ret = add_extent_changeset(state, bits_to_clear, changeset, 0); BUG_ON(ret < 0); @@ -586,17 +574,17 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree, if (wake) wake_up(&state->wq); if (state->state == 0) { - next = next_state(state); + next = next_search_state(state, end); if (extent_state_in_tree(state)) { rb_erase(&state->rb_node, &tree->state); RB_CLEAR_NODE(&state->rb_node); - free_extent_state(state); + btrfs_free_extent_state(state); } else { WARN_ON(1); } } else { merge_state(tree, state); - next = next_state(state); + next = next_search_state(state, end); } return next; } @@ -620,18 +608,18 @@ static void set_gfp_mask_from_bits(u32 *bits, gfp_t *mask) * * This takes the tree lock, and returns 0 on success and < 0 on error. */ -int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached_state, - struct extent_changeset *changeset) +int btrfs_clear_extent_bit_changeset(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state, + struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *cached; struct extent_state *prealloc = NULL; u64 last_end; - int err; - int clear = 0; - int wake; - int delete = (bits & EXTENT_CLEAR_ALL_BITS); + int ret = 0; + bool clear; + bool wake; + const bool delete = (bits & EXTENT_CLEAR_ALL_BITS); gfp_t mask; set_gfp_mask_from_bits(&bits, &mask); @@ -644,9 +632,8 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, if (bits & EXTENT_DELALLOC) bits |= EXTENT_NORESERVE; - wake = ((bits & EXTENT_LOCK_BITS) ? 1 : 0); - if (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)) - clear = 1; + wake = (bits & EXTENT_LOCK_BITS); + clear = (bits & (EXTENT_LOCK_BITS | EXTENT_BOUNDARY)); again: if (!prealloc) { /* @@ -676,7 +663,7 @@ again: goto hit_next; } if (clear) - free_extent_state(cached); + btrfs_free_extent_state(cached); } /* This search will find the extents that end after our range starts. */ @@ -691,7 +678,7 @@ hit_next: /* The state doesn't have the wanted bits, go ahead. */ if (!(state->state & bits)) { - state = next_state(state); + state = next_search_state(state, end); goto next; } @@ -714,18 +701,24 @@ hit_next: prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) goto search_again; - err = split_state(tree, state, prealloc, start); - if (err) - extent_io_tree_panic(tree, state, "split", err); - + ret = split_state(tree, state, prealloc, start); prealloc = NULL; - if (err) + if (ret) { + extent_io_tree_panic(tree, state, "split", ret); goto out; + } if (state->end <= end) { - state = clear_state_bit(tree, state, bits, wake, changeset); + state = clear_state_bit(tree, state, bits, wake, end, + changeset); goto next; } - goto search_again; + if (need_resched()) + goto search_again; + /* + * Fallthrough and try atomic extent state allocation if needed. 
+ * If it fails we'll jump to 'search_again' and retry the allocation + * in non-atomic mode and start the search again. + */ } /* * | ---- desired range ---- | * | state | * We need to split the extent, and clear the bit on the first half. */ if (state->start <= end && state->end > end) { prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) goto search_again; - err = split_state(tree, state, prealloc, end + 1); - if (err) - extent_io_tree_panic(tree, state, "split", err); + ret = split_state(tree, state, prealloc, end + 1); + if (ret) { + extent_io_tree_panic(tree, state, "split", ret); + prealloc = NULL; + goto out; + } if (wake) wake_up(&state->wq); - clear_state_bit(tree, prealloc, bits, wake, changeset); + clear_state_bit(tree, prealloc, bits, wake, end, changeset); prealloc = NULL; goto out; } - state = clear_state_bit(tree, state, bits, wake, changeset); + state = clear_state_bit(tree, state, bits, wake, end, changeset); next: - if (last_end == (u64)-1) + if (last_end >= end) goto out; start = last_end + 1; - if (start <= end && state && !need_resched()) + if (state && !need_resched()) goto hit_next; search_again: - if (start > end) - goto out; spin_unlock(&tree->lock); if (gfpflags_allow_blocking(mask)) cond_resched(); @@ -767,10 +761,9 @@ search_again: out: spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); + btrfs_free_extent_state(prealloc); - return 0; + return ret; } @@ -820,7 +813,7 @@ process_node: schedule(); spin_lock(&tree->lock); finish_wait(&state->wq, &wait); - free_extent_state(state); + btrfs_free_extent_state(state); goto again; } start = state->end + 1; @@ -838,7 +831,7 @@ out: if (cached_state && *cached_state) { state = *cached_state; *cached_state = NULL; - free_extent_state(state); + btrfs_free_extent_state(state); } spin_unlock(&tree->lock); } @@ -877,7 +870,7 @@ static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *t */ state = tree_search(tree, start); while (state) { - if (state->end >= start && (state->state & bits)) + if (state->state & bits) return state; state = next_state(state); } @@ -892,9 +885,9 @@ static struct extent_state *find_first_extent_bit_state(struct extent_io_tree *t * Return true if we find something, and update @start_ret and @end_ret. * Return false if we found nothing. */ -bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits, - struct extent_state **cached_state) +bool btrfs_find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits, + struct extent_state **cached_state) { struct extent_state *state; bool ret = false; @@ -914,13 +907,13 @@ bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, * again. If we haven't found any, clear as well since * it's now useless. */ - free_extent_state(*cached_state); + btrfs_free_extent_state(*cached_state); *cached_state = NULL; if (state) goto got_it; goto out; } - free_extent_state(*cached_state); + btrfs_free_extent_state(*cached_state); *cached_state = NULL; } @@ -952,14 +945,17 @@ out: * contiguous area for given bits. We will search to the first bit we find, and * then walk down the tree until we find a non-contiguous area. The area * returned will be the full contiguous area with the bits set. + * + * Returns true if we found a range with the given bits set, in which case + * @start_ret and @end_ret are updated, or false if no range was found.
*/ -int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits) +bool btrfs_find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits) { struct extent_state *state; - int ret = 1; + bool ret = false; - ASSERT(!btrfs_fs_incompat(extent_io_tree_to_fs_info(tree), NO_HOLES)); + ASSERT(!btrfs_fs_incompat(btrfs_extent_io_tree_to_fs_info(tree), NO_HOLES)); spin_lock(&tree->lock); state = find_first_extent_bit_state(tree, start, bits); @@ -971,7 +967,7 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, break; *end_ret = state->end; } - ret = 0; + ret = true; } spin_unlock(&tree->lock); return ret; @@ -1046,11 +1042,11 @@ out: * * [start, end] is inclusive. This takes the tree lock. */ -static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, u64 *failed_start, - struct extent_state **failed_state, - struct extent_state **cached_state, - struct extent_changeset *changeset) +static int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, u64 *failed_start, + struct extent_state **failed_state, + struct extent_state **cached_state, + struct extent_changeset *changeset) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -1129,12 +1125,11 @@ hit_next: set_state_bits(tree, state, bits, changeset); cache_state(state, cached_state); merge_state(tree, state); - if (last_end == (u64)-1) + if (last_end >= end) goto out; start = last_end + 1; state = next_state(state); - if (start < end && state && state->start == start && - !need_resched()) + if (state && state->start == start && !need_resched()) goto hit_next; goto search_again; } @@ -1186,12 +1181,11 @@ hit_next: set_state_bits(tree, state, bits, changeset); cache_state(state, cached_state); merge_state(tree, state); - if (last_end == (u64)-1) + if (last_end >= end) goto out; start = last_end + 1; state = next_state(state); - if (start < end && state && state->start == start && - !need_resched()) + if (state && state->start == start && !need_resched()) goto hit_next; } goto search_again; @@ -1204,14 +1198,8 @@ hit_next: * extent we found. */ if (state->start > start) { - u64 this_end; struct extent_state *inserted_state; - if (end < last_start) - this_end = end; - else - this_end = last_start - 1; - prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) goto search_again; @@ -1221,17 +1209,38 @@ hit_next: * extent. */ prealloc->start = start; - prealloc->end = this_end; + if (end < last_start) + prealloc->end = end; + else + prealloc->end = last_start - 1; + inserted_state = insert_state(tree, prealloc, bits, changeset); if (IS_ERR(inserted_state)) { ret = PTR_ERR(inserted_state); extent_io_tree_panic(tree, prealloc, "insert", ret); + goto out; } cache_state(inserted_state, cached_state); if (inserted_state == prealloc) prealloc = NULL; - start = this_end + 1; + start = inserted_state->end + 1; + + /* Beyond target range, stop. */ + if (start > end) + goto out; + + if (need_resched()) + goto search_again; + + state = next_search_state(inserted_state, end); + /* + * If there's a next state, whether contiguous or not, we don't + * need to unlock and start search again. If it's not contiguous + * we will end up here and try to allocate a prealloc state and insert.
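The rewritten clear/set paths above thread the range's @end down into clear_state_bit() and fetch the successor via next_search_state(), which is introduced earlier in this patch and not visible in these hunks. A minimal sketch of its assumed behavior, inferred from how the callers use it: a bounded next_state() that returns NULL once the current record already reaches the end of the search range, which is why the loops can drop their separate "start <= end" checks.

/* Sketch (assumption, the real helper is defined elsewhere in this patch). */
static struct extent_state *next_search_state(struct extent_state *state, u64 end)
{
	/* The current record already covers the search range's end: stop. */
	if (state->end >= end)
		return NULL;
	return next_state(state);
}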
+ */ + if (state) + goto hit_next; goto search_again; } /* @@ -1252,8 +1261,11 @@ hit_next: if (!prealloc) goto search_again; ret = split_state(tree, state, prealloc, end + 1); - if (ret) + if (ret) { extent_io_tree_panic(tree, state, "split", ret); + prealloc = NULL; + goto out; + } set_state_bits(tree, prealloc, bits, changeset); cache_state(prealloc, cached_state); @@ -1272,18 +1284,16 @@ search_again: out: spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); + btrfs_free_extent_state(prealloc); return ret; } -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached_state) +int btrfs_set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state) { - return __set_extent_bit(tree, start, end, bits, NULL, NULL, - cached_state, NULL); + return set_extent_bit(tree, start, end, bits, NULL, NULL, cached_state, NULL); } /* @@ -1304,9 +1314,9 @@ int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, * * All allocations are done with GFP_NOFS. */ -int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, u32 clear_bits, - struct extent_state **cached_state) +int btrfs_convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, u32 clear_bits, + struct extent_state **cached_state) { struct extent_state *state; struct extent_state *prealloc = NULL; @@ -1374,12 +1384,11 @@ hit_next: if (state->start == start && state->end <= end) { set_state_bits(tree, state, bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, clear_bits, 0, NULL); - if (last_end == (u64)-1) + state = clear_state_bit(tree, state, clear_bits, 0, end, NULL); + if (last_end >= end) goto out; start = last_end + 1; - if (start < end && state && state->start == start && - !need_resched()) + if (state && state->start == start && !need_resched()) goto hit_next; goto search_again; } @@ -1406,20 +1415,19 @@ hit_next: goto out; } ret = split_state(tree, state, prealloc, start); - if (ret) - extent_io_tree_panic(tree, state, "split", ret); prealloc = NULL; - if (ret) + if (ret) { + extent_io_tree_panic(tree, state, "split", ret); goto out; + } if (state->end <= end) { set_state_bits(tree, state, bits, NULL); cache_state(state, cached_state); - state = clear_state_bit(tree, state, clear_bits, 0, NULL); - if (last_end == (u64)-1) + state = clear_state_bit(tree, state, clear_bits, 0, end, NULL); + if (last_end >= end) goto out; start = last_end + 1; - if (start < end && state && state->start == start && - !need_resched()) + if (state && state->start == start && !need_resched()) goto hit_next; } goto search_again; @@ -1432,14 +1440,8 @@ hit_next: * extent we found. */ if (state->start > start) { - u64 this_end; struct extent_state *inserted_state; - if (end < last_start) - this_end = end; - else - this_end = last_start - 1; - prealloc = alloc_extent_state_atomic(prealloc); if (!prealloc) { ret = -ENOMEM; @@ -1451,16 +1453,37 @@ hit_next: * extent. 
*/ prealloc->start = start; - prealloc->end = this_end; + if (end < last_start) + prealloc->end = end; + else + prealloc->end = last_start - 1; + inserted_state = insert_state(tree, prealloc, bits, NULL); if (IS_ERR(inserted_state)) { ret = PTR_ERR(inserted_state); extent_io_tree_panic(tree, prealloc, "insert", ret); + goto out; } cache_state(inserted_state, cached_state); if (inserted_state == prealloc) prealloc = NULL; - start = this_end + 1; + start = inserted_state->end + 1; + + /* Beyond target range, stop. */ + if (start > end) + goto out; + + if (need_resched()) + goto search_again; + + state = next_search_state(inserted_state, end); + /* + * If there's a next state, whether contiguous or not, we don't + * need to unlock and start search again. If it's not contiguous + * we will end up here and try to allocate a prealloc state and insert. + */ + if (state) + goto hit_next; goto search_again; } /* @@ -1477,12 +1500,15 @@ hit_next: } ret = split_state(tree, state, prealloc, end + 1); - if (ret) + if (ret) { extent_io_tree_panic(tree, state, "split", ret); + prealloc = NULL; + goto out; + } set_state_bits(tree, prealloc, bits, NULL); cache_state(prealloc, cached_state); - clear_state_bit(tree, prealloc, clear_bits, 0, NULL); + clear_state_bit(tree, prealloc, clear_bits, 0, end, NULL); prealloc = NULL; goto out; } @@ -1497,8 +1523,7 @@ search_again: out: spin_unlock(&tree->lock); - if (prealloc) - free_extent_state(prealloc); + btrfs_free_extent_state(prealloc); return ret; } @@ -1518,8 +1543,8 @@ out: * spans (last_range_end, end of device]. In this case it's up to the caller to * trim @end_ret to the appropriate size. */ -void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits) +void btrfs_find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits) { struct extent_state *state; struct extent_state *prev = NULL, *next = NULL; @@ -1636,10 +1661,10 @@ out: * all given bits set. If the returned number of bytes is greater than zero * then @start is updated with the offset of the first byte with the bits set. */ -u64 count_range_bits(struct extent_io_tree *tree, - u64 *start, u64 search_end, u64 max_bytes, - u32 bits, int contig, - struct extent_state **cached_state) +u64 btrfs_count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, u64 max_bytes, + u32 bits, int contig, + struct extent_state **cached_state) { struct extent_state *state = NULL; struct extent_state *cached; @@ -1710,7 +1735,7 @@ search: } if (cached_state) { - free_extent_state(*cached_state); + btrfs_free_extent_state(*cached_state); *cached_state = state; if (state) refcount_inc(&state->refs); @@ -1724,16 +1749,16 @@ search: /* * Check if the single @bit exists in the given range. 
*/ -bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit) +bool btrfs_test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit) { - struct extent_state *state = NULL; + struct extent_state *state; bool bitset = false; ASSERT(is_power_of_2(bit)); spin_lock(&tree->lock); state = tree_search(tree, start); - while (state && start <= end) { + while (state) { if (state->start > end) break; @@ -1742,9 +1767,7 @@ bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 break; } - /* If state->end is (u64)-1, start will overflow to 0 */ - start = state->end + 1; - if (start > end || start == 0) + if (state->end >= end) break; state = next_state(state); } @@ -1752,16 +1775,51 @@ bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 return bitset; } +void btrfs_get_range_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 *bits, + struct extent_state **cached_state) +{ + struct extent_state *state; + + /* + * The cached state is currently mandatory and not used to start the + * search, only to cache the first state record found in the range. + */ + ASSERT(cached_state != NULL); + ASSERT(*cached_state == NULL); + + *bits = 0; + + spin_lock(&tree->lock); + state = tree_search(tree, start); + if (state && state->start < end) { + *cached_state = state; + refcount_inc(&state->refs); + } + while (state) { + if (state->start > end) + break; + + *bits |= state->state; + + if (state->end >= end) + break; + + state = next_state(state); + } + spin_unlock(&tree->lock); +} + /* * Check if the whole range [@start,@end) contains the single @bit set. */ -bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, - struct extent_state *cached) +bool btrfs_test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, + struct extent_state *cached) { - struct extent_state *state = NULL; + struct extent_state *state; bool bitset = true; ASSERT(is_power_of_2(bit)); + ASSERT(start < end); spin_lock(&tree->lock); if (cached && extent_state_in_tree(cached) && cached->start <= start && @@ -1769,30 +1827,22 @@ bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, state = cached; else state = tree_search(tree, start); - while (state && start <= end) { + while (state) { if (state->start > start) { bitset = false; break; } - if (state->start > end) - break; - if ((state->state & bit) == 0) { bitset = false; break; } - if (state->end == (u64)-1) + if (state->end >= end) break; - /* - * Last entry (if state->end is (u64)-1 and overflow happens), - * or next entry starts after the range. - */ + /* Next state must start where this one ends. 
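btrfs_get_range_bits() is added here without a caller in this hunk. A hypothetical usage sketch (the surrounding inode and range variables are assumptions, not part of this patch): collect the union of state bits over a range, then drop the reference the helper takes on the first record it caches.

u32 bits = 0;
struct extent_state *cached = NULL;

btrfs_get_range_bits(&inode->io_tree, start, end, &bits, &cached);
if (bits & EXTENT_DELALLOC) {
	/* At least one record in [start, end] has EXTENT_DELALLOC set. */
}
/* The helper took a reference on the first state record for us. */
btrfs_free_extent_state(cached);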
*/ start = state->end + 1; - if (start > end || start == 0) - break; state = next_state(state); } @@ -1804,8 +1854,8 @@ bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, } /* Wrappers around set/clear extent bit */ -int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset) +int btrfs_set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset) { /* * We don't support EXTENT_LOCK_BITS yet, as current changeset will @@ -1814,11 +1864,11 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ ASSERT(!(bits & EXTENT_LOCK_BITS)); - return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset); + return set_extent_bit(tree, start, end, bits, NULL, NULL, NULL, changeset); } -int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset) +int btrfs_clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset) { /* * Don't support EXTENT_LOCK_BITS case, same reason as @@ -1826,20 +1876,21 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, */ ASSERT(!(bits & EXTENT_LOCK_BITS)); - return __clear_extent_bit(tree, start, end, bits, NULL, changeset); + return btrfs_clear_extent_bit_changeset(tree, start, end, bits, NULL, changeset); } -bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - struct extent_state **cached) +bool btrfs_try_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached) { int err; u64 failed_start; - err = __set_extent_bit(tree, start, end, bits, &failed_start, - NULL, cached, NULL); + err = set_extent_bit(tree, start, end, bits, &failed_start, NULL, + cached, NULL); if (err == -EEXIST) { if (failed_start > start) - clear_extent_bit(tree, start, failed_start - 1, bits, cached); + btrfs_clear_extent_bit(tree, start, failed_start - 1, + bits, cached); return 0; } return 1; @@ -1849,35 +1900,54 @@ bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits * Either insert or lock state struct between start and end use mask to tell * us if waiting is desired. */ -int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - struct extent_state **cached_state) +int btrfs_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached_state) { struct extent_state *failed_state = NULL; int err; u64 failed_start; - err = __set_extent_bit(tree, start, end, bits, &failed_start, - &failed_state, cached_state, NULL); + err = set_extent_bit(tree, start, end, bits, &failed_start, + &failed_state, cached_state, NULL); while (err == -EEXIST) { if (failed_start != start) - clear_extent_bit(tree, start, failed_start - 1, - bits, cached_state); + btrfs_clear_extent_bit(tree, start, failed_start - 1, + bits, cached_state); wait_extent_bit(tree, failed_start, end, bits, &failed_state); - err = __set_extent_bit(tree, start, end, bits, - &failed_start, &failed_state, - cached_state, NULL); + err = set_extent_bit(tree, start, end, bits, &failed_start, + &failed_state, cached_state, NULL); } return err; } -void __cold extent_state_free_cachep(void) +/* + * Get the extent state that follows the given extent state. 
+ * This is meant to be used in a context where we know no other tasks can + * concurrently modify the tree. + */ +struct extent_state *btrfs_next_extent_state(struct extent_io_tree *tree, + struct extent_state *state) +{ + struct extent_state *next; + + spin_lock(&tree->lock); + ASSERT(extent_state_in_tree(state)); + next = next_state(state); + if (next) + refcount_inc(&next->refs); + spin_unlock(&tree->lock); + + return next; +} + +void __cold btrfs_extent_state_free_cachep(void) { btrfs_extent_state_leak_debug_check(); kmem_cache_destroy(extent_state_cache); } -int __init extent_state_init_cachep(void) +int __init btrfs_extent_state_init_cachep(void) { extent_state_cache = kmem_cache_create("btrfs_extent_state", sizeof(struct extent_state), 0, 0, diff --git a/fs/btrfs/extent-io-tree.h b/fs/btrfs/extent-io-tree.h index 6ffef1cd37c1..0a18ca9c59c3 100644 --- a/fs/btrfs/extent-io-tree.h +++ b/fs/btrfs/extent-io-tree.h @@ -17,7 +17,6 @@ struct btrfs_inode; /* Bits for the extent state */ enum { ENUM_BIT(EXTENT_DIRTY), - ENUM_BIT(EXTENT_UPTODATE), ENUM_BIT(EXTENT_LOCKED), ENUM_BIT(EXTENT_DIO_LOCKED), ENUM_BIT(EXTENT_NEW), @@ -39,6 +38,11 @@ enum { */ ENUM_BIT(EXTENT_DELALLOC_NEW), /* + * Mark that a range is being locked for finishing an ordered extent. + * Used together with EXTENT_LOCKED. + */ + ENUM_BIT(EXTENT_FINISHING_ORDERED), + /* * When an ordered extent successfully completes for a region marked as * a new delalloc range, use this flag when clearing a new delalloc * range to indicate that the VFS' inode number of bytes should be @@ -130,117 +134,116 @@ struct extent_state { #endif }; -struct btrfs_inode *extent_io_tree_to_inode(struct extent_io_tree *tree); -const struct btrfs_inode *extent_io_tree_to_inode_const(const struct extent_io_tree *tree); -const struct btrfs_fs_info *extent_io_tree_to_fs_info(const struct extent_io_tree *tree); +const struct btrfs_inode *btrfs_extent_io_tree_to_inode(const struct extent_io_tree *tree); +const struct btrfs_fs_info *btrfs_extent_io_tree_to_fs_info(const struct extent_io_tree *tree); -void extent_io_tree_init(struct btrfs_fs_info *fs_info, - struct extent_io_tree *tree, unsigned int owner); -void extent_io_tree_release(struct extent_io_tree *tree); -int __lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - struct extent_state **cached); -bool __try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, - struct extent_state **cached); +void btrfs_extent_io_tree_init(struct btrfs_fs_info *fs_info, + struct extent_io_tree *tree, unsigned int owner); +void btrfs_extent_io_tree_release(struct extent_io_tree *tree); +int btrfs_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 bits, + struct extent_state **cached); +bool btrfs_try_lock_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached); -static inline int lock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached) +static inline int btrfs_lock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached) { - return __lock_extent(tree, start, end, EXTENT_LOCKED, cached); + return btrfs_lock_extent_bits(tree, start, end, EXTENT_LOCKED, cached); } -static inline bool try_lock_extent(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) +static inline bool btrfs_try_lock_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) { - return __try_lock_extent(tree, start, end, 
EXTENT_LOCKED, cached); + return btrfs_try_lock_extent_bits(tree, start, end, EXTENT_LOCKED, cached); } -int __init extent_state_init_cachep(void); -void __cold extent_state_free_cachep(void); - -u64 count_range_bits(struct extent_io_tree *tree, - u64 *start, u64 search_end, - u64 max_bytes, u32 bits, int contig, - struct extent_state **cached_state); - -void free_extent_state(struct extent_state *state); -bool test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, - struct extent_state *cached_state); -bool test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit); -int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset); -int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached, - struct extent_changeset *changeset); - -static inline int clear_extent_bit(struct extent_io_tree *tree, u64 start, - u64 end, u32 bits, - struct extent_state **cached) -{ - return __clear_extent_bit(tree, start, end, bits, cached, NULL); -} +int __init btrfs_extent_state_init_cachep(void); +void __cold btrfs_extent_state_free_cachep(void); + +u64 btrfs_count_range_bits(struct extent_io_tree *tree, + u64 *start, u64 search_end, + u64 max_bytes, u32 bits, int contig, + struct extent_state **cached_state); -static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, - struct extent_state **cached) +void btrfs_free_extent_state(struct extent_state *state); +bool btrfs_test_range_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bit, + struct extent_state *cached_state); +bool btrfs_test_range_bit_exists(struct extent_io_tree *tree, u64 start, u64 end, u32 bit); +void btrfs_get_range_bits(struct extent_io_tree *tree, u64 start, u64 end, u32 *bits, + struct extent_state **cached_state); +int btrfs_clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset); +int btrfs_clear_extent_bit_changeset(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached, + struct extent_changeset *changeset); + +static inline int btrfs_clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 end, u32 bits, + struct extent_state **cached) { - return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached, NULL); + return btrfs_clear_extent_bit_changeset(tree, start, end, bits, cached, NULL); } -static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start, - u64 end, u32 bits) +static inline int btrfs_unlock_extent(struct extent_io_tree *tree, u64 start, u64 end, + struct extent_state **cached) { - return clear_extent_bit(tree, start, end, bits, NULL); + return btrfs_clear_extent_bit_changeset(tree, start, end, EXTENT_LOCKED, + cached, NULL); } -int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_changeset *changeset); -int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, struct extent_state **cached_state); - -static inline int clear_extent_uptodate(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached_state) +static inline int btrfs_clear_extent_bits(struct extent_io_tree *tree, u64 start, + u64 end, u32 bits) { - return __clear_extent_bit(tree, start, end, EXTENT_UPTODATE, - cached_state, NULL); + return btrfs_clear_extent_bit(tree, start, end, bits, NULL); } -static inline int clear_extent_dirty(struct extent_io_tree *tree, u64 
start, - u64 end, struct extent_state **cached) +int btrfs_set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_changeset *changeset); +int btrfs_set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, struct extent_state **cached_state); + +static inline int btrfs_clear_extent_dirty(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) { - return clear_extent_bit(tree, start, end, - EXTENT_DIRTY | EXTENT_DELALLOC | - EXTENT_DO_ACCOUNTING, cached); + return btrfs_clear_extent_bit(tree, start, end, + EXTENT_DIRTY | EXTENT_DELALLOC | + EXTENT_DO_ACCOUNTING, cached); } -int convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, - u32 bits, u32 clear_bits, - struct extent_state **cached_state); - -bool find_first_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits, - struct extent_state **cached_state); -void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits); -int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, - u64 *start_ret, u64 *end_ret, u32 bits); +int btrfs_convert_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, + u32 bits, u32 clear_bits, + struct extent_state **cached_state); + +bool btrfs_find_first_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits, + struct extent_state **cached_state); +void btrfs_find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits); +bool btrfs_find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start, + u64 *start_ret, u64 *end_ret, u32 bits); bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start, u64 *end, u64 max_bytes, struct extent_state **cached_state); -static inline int lock_dio_extent(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) +static inline int btrfs_lock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) { - return __lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached); + return btrfs_lock_extent_bits(tree, start, end, EXTENT_DIO_LOCKED, cached); } -static inline bool try_lock_dio_extent(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) +static inline bool btrfs_try_lock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) { - return __try_lock_extent(tree, start, end, EXTENT_DIO_LOCKED, cached); + return btrfs_try_lock_extent_bits(tree, start, end, EXTENT_DIO_LOCKED, cached); } -static inline int unlock_dio_extent(struct extent_io_tree *tree, u64 start, - u64 end, struct extent_state **cached) +static inline int btrfs_unlock_dio_extent(struct extent_io_tree *tree, u64 start, + u64 end, struct extent_state **cached) { - return __clear_extent_bit(tree, start, end, EXTENT_DIO_LOCKED, cached, NULL); + return btrfs_clear_extent_bit_changeset(tree, start, end, EXTENT_DIO_LOCKED, + cached, NULL); } +struct extent_state *btrfs_next_extent_state(struct extent_io_tree *tree, + struct extent_state *state); + #endif /* BTRFS_EXTENT_IO_TREE_H */ diff --git a/fs/btrfs/extent-tree.c b/fs/btrfs/extent-tree.c index 3014a1a23efd..cb6128778a83 100644 --- a/fs/btrfs/extent-tree.c +++ b/fs/btrfs/extent-tree.c @@ -70,20 +70,17 @@ static int block_group_bits(struct btrfs_block_group *cache, u64 bits) int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, 
u64 len) { struct btrfs_root *root = btrfs_extent_root(fs_info, start); - int ret; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); path = btrfs_alloc_path(); if (!path) return -ENOMEM; key.objectid = start; - key.offset = len; key.type = BTRFS_EXTENT_ITEM_KEY; - ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); - btrfs_free_path(path); - return ret; + key.offset = len; + return btrfs_search_slot(NULL, root, &key, path, 0, 0); } /* @@ -103,7 +100,7 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, struct btrfs_root *extent_root; struct btrfs_delayed_ref_head *head; struct btrfs_delayed_ref_root *delayed_refs; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; u64 num_refs; u64 extent_flags; @@ -125,16 +122,16 @@ int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans, search_again: key.objectid = bytenr; - key.offset = offset; if (metadata) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = offset; extent_root = btrfs_extent_root(fs_info, bytenr); ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) - goto out_free; + return ret; if (ret > 0 && key.type == BTRFS_METADATA_ITEM_KEY) { if (path->slots[0]) { @@ -159,7 +156,7 @@ search_again: "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); - goto out_free; + return ret; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); @@ -170,7 +167,7 @@ search_again: "unexpected zero reference count for extent item (%llu %u %llu)", key.objectid, key.type, key.offset); btrfs_abort_transaction(trans, ret); - goto out_free; + return ret; } extent_flags = btrfs_extent_flags(leaf, ei); owner = btrfs_get_extent_owner_root(fs_info, leaf, path->slots[0]); @@ -216,8 +213,7 @@ search_again: *flags = extent_flags; if (owning_root) *owning_root = owner; -out_free: - btrfs_free_path(path); + return ret; } @@ -413,15 +409,15 @@ static u64 hash_extent_data_ref_item(struct extent_buffer *leaf, btrfs_extent_data_ref_offset(leaf, ref)); } -static int match_extent_data_ref(struct extent_buffer *leaf, - struct btrfs_extent_data_ref *ref, - u64 root_objectid, u64 owner, u64 offset) +static bool match_extent_data_ref(struct extent_buffer *leaf, + struct btrfs_extent_data_ref *ref, + u64 root_objectid, u64 owner, u64 offset) { if (btrfs_extent_data_ref_root(leaf, ref) != root_objectid || btrfs_extent_data_ref_objectid(leaf, ref) != owner || btrfs_extent_data_ref_offset(leaf, ref) != offset) - return 0; - return 1; + return false; + return true; } static noinline int lookup_extent_data_ref(struct btrfs_trans_handle *trans, @@ -1487,7 +1483,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_delayed_ref_node *node, struct btrfs_delayed_extent_op *extent_op) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_extent_item *item; struct btrfs_key key; @@ -1508,7 +1504,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, node->parent, node->ref_root, owner, offset, refs_to_add, extent_op); if ((ret < 0 && ret != -EAGAIN) || !ret) - goto out; + return ret; /* * Ok we had -EAGAIN which means we didn't have space to insert and @@ -1533,8 +1529,7 @@ static int __btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, if (ret) btrfs_abort_transaction(trans, ret); -out: - btrfs_free_path(path); + return ret; } @@ -1631,7 +1626,7 @@ static int 
run_delayed_extent_op(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_root *root; struct btrfs_key key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_extent_item *ei; struct extent_buffer *leaf; u32 item_size; @@ -1662,7 +1657,7 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans, again: ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - goto out; + return ret; } else if (ret > 0) { if (metadata) { if (path->slots[0] > 0) { @@ -1679,8 +1674,8 @@ again: metadata = 0; key.objectid = head->bytenr; - key.offset = head->num_bytes; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = head->num_bytes; goto again; } } else { @@ -1688,7 +1683,7 @@ again: btrfs_err(fs_info, "missing extent item for extent %llu num_bytes %llu level %d", head->bytenr, head->num_bytes, head->level); - goto out; + return ret; } } @@ -1701,13 +1696,12 @@ again: "unexpected extent item size, has %u expect >= %zu", item_size, sizeof(*ei)); btrfs_abort_transaction(trans, ret); - goto out; + return ret; } ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_extent_item); __run_delayed_extent_op(extent_op, leaf, ei); -out: - btrfs_free_path(path); + return ret; } @@ -2012,7 +2006,12 @@ static noinline int __btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, delayed_refs = &trans->transaction->delayed_refs; if (min_bytes == 0) { - max_count = delayed_refs->num_heads_ready; + /* + * We may be subject to a harmless race if some task is + * concurrently adding or removing a delayed ref, so silence + * KCSAN and similar tools. + */ + max_count = data_race(delayed_refs->num_heads_ready); min_bytes = U64_MAX; } @@ -2348,8 +2347,8 @@ static noinline int check_committed_ref(struct btrfs_inode *inode, int ret; key.objectid = bytenr; - key.offset = (u64)-1; key.type = BTRFS_EXTENT_ITEM_KEY; + key.offset = (u64)-1; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); if (ret < 0) @@ -2604,8 +2603,8 @@ static int pin_down_extent(struct btrfs_trans_handle *trans, spin_unlock(&cache->lock); spin_unlock(&cache->space_info->lock); - set_extent_bit(&trans->transaction->pinned_extents, bytenr, - bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); + btrfs_set_extent_bit(&trans->transaction->pinned_extents, bytenr, + bytenr + num_bytes - 1, EXTENT_DIRTY, NULL); return 0; } @@ -2824,34 +2823,63 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; struct btrfs_block_group *block_group, *tmp; struct list_head *deleted_bgs; - struct extent_io_tree *unpin; + struct extent_io_tree *unpin = &trans->transaction->pinned_extents; + struct extent_state *cached_state = NULL; u64 start; u64 end; + int unpin_error = 0; int ret; - unpin = &trans->transaction->pinned_extents; - - while (!TRANS_ABORTED(trans)) { - struct extent_state *cached_state = NULL; + mutex_lock(&fs_info->unused_bg_unpin_mutex); + btrfs_find_first_extent_bit(unpin, 0, &start, &end, EXTENT_DIRTY, &cached_state); - mutex_lock(&fs_info->unused_bg_unpin_mutex); - if (!find_first_extent_bit(unpin, 0, &start, &end, - EXTENT_DIRTY, &cached_state)) { - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - break; - } + while (!TRANS_ABORTED(trans) && cached_state) { + struct extent_state *next_state; if (btrfs_test_opt(fs_info, DISCARD_SYNC)) ret = btrfs_discard_extent(fs_info, start, end + 1 - start, NULL); - clear_extent_dirty(unpin, start, end, &cached_state); + next_state = btrfs_next_extent_state(unpin, cached_state); + 
btrfs_clear_extent_dirty(unpin, start, end, &cached_state); ret = unpin_extent_range(fs_info, start, end, true); - BUG_ON(ret); - mutex_unlock(&fs_info->unused_bg_unpin_mutex); - free_extent_state(cached_state); - cond_resched(); + /* + * If we get an error unpinning an extent range, store the first + * error to return later after trying to unpin all ranges and do + * the sync discards. Our caller will abort the transaction + * (which already wrote new superblocks) and on the next mount + * the space will be available as it was pinned by in-memory + * only structures in this phase. + */ + if (ret) { + btrfs_err_rl(fs_info, +"failed to unpin extent range [%llu, %llu] when committing transaction %llu: %s (%d)", + start, end, trans->transid, + btrfs_decode_error(ret), ret); + if (!unpin_error) + unpin_error = ret; + } + + btrfs_free_extent_state(cached_state); + + if (need_resched()) { + btrfs_free_extent_state(next_state); + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + cond_resched(); + cached_state = NULL; + mutex_lock(&fs_info->unused_bg_unpin_mutex); + btrfs_find_first_extent_bit(unpin, 0, &start, &end, + EXTENT_DIRTY, &cached_state); + } else { + cached_state = next_state; + if (cached_state) { + start = cached_state->start; + end = cached_state->end; + } + } } + mutex_unlock(&fs_info->unused_bg_unpin_mutex); + btrfs_free_extent_state(cached_state); if (btrfs_test_opt(fs_info, DISCARD_ASYNC)) { btrfs_discard_calc_delay(&fs_info->discard_ctl); @@ -2865,16 +2893,20 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) */ deleted_bgs = &trans->transaction->deleted_bgs; list_for_each_entry_safe(block_group, tmp, deleted_bgs, bg_list) { - u64 trimmed = 0; - ret = -EROFS; if (!TRANS_ABORTED(trans)) - ret = btrfs_discard_extent(fs_info, - block_group->start, - block_group->length, - &trimmed); + ret = btrfs_discard_extent(fs_info, block_group->start, + block_group->length, NULL); + /* + * Not strictly necessary to lock, as the block_group should be + * read-only from btrfs_delete_unused_bgs(). + */ + ASSERT(block_group->ro); + spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); + spin_unlock(&fs_info->unused_bgs_lock); + btrfs_unfreeze_block_group(block_group); btrfs_put_block_group(block_group); @@ -2886,7 +2918,7 @@ int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans) } } - return 0; + return unpin_error; } /* @@ -3481,17 +3513,11 @@ int btrfs_free_tree_block(struct btrfs_trans_handle *trans, WARN_ON(test_bit(EXTENT_BUFFER_DIRTY, &buf->bflags)); btrfs_add_free_space(bg, buf->start, buf->len); - btrfs_free_reserved_bytes(bg, buf->len, 0); + btrfs_free_reserved_bytes(bg, buf->len, false); btrfs_put_block_group(bg); trace_btrfs_reserved_extent_free(fs_info, buf->start, buf->len); out: - - /* - * Deleting the buffer, clear the corrupt flag since it doesn't - * matter anymore. 
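Several of the extent-tree.c conversions above rely on BTRFS_PATH_AUTO_FREE(), which lets the error paths return directly without btrfs_free_path(). A minimal usage sketch with a hypothetical helper, assuming the macro declares a NULL-initialized path with a __free(btrfs_free_path)-style scope-based cleanup attribute (its definition is elsewhere in this series):

static int example_lookup(struct btrfs_root *root, const struct btrfs_key *key)
{
	BTRFS_PATH_AUTO_FREE(path);	/* struct btrfs_path *path = NULL, auto-freed */

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;
	/* No btrfs_free_path() on any exit path: cleanup runs at scope exit. */
	return btrfs_search_slot(NULL, root, key, path, 0, 0);
}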
- */ - clear_bit(EXTENT_BUFFER_CORRUPT, &buf->bflags); return 0; } @@ -4109,6 +4135,7 @@ static int can_allocate_chunk(struct btrfs_fs_info *fs_info, static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, struct btrfs_key *ins, struct find_free_extent_ctl *ffe_ctl, + struct btrfs_space_info *space_info, bool full_search) { struct btrfs_root *root = fs_info->chunk_root; @@ -4163,7 +4190,7 @@ static int find_free_extent_update_loop(struct btrfs_fs_info *fs_info, return ret; } - ret = btrfs_chunk_alloc(trans, ffe_ctl->flags, + ret = btrfs_chunk_alloc(trans, space_info, ffe_ctl->flags, CHUNK_ALLOC_FORCE_FOR_EXTENT); /* Do not bail out on ENOSPC since we can do more. */ @@ -4380,11 +4407,22 @@ static noinline int find_free_extent(struct btrfs_root *root, ins->objectid = 0; ins->offset = 0; - trace_find_free_extent(root, ffe_ctl); + trace_btrfs_find_free_extent(root, ffe_ctl); space_info = btrfs_find_space_info(fs_info, ffe_ctl->flags); + if (btrfs_is_zoned(fs_info) && space_info) { + /* Use dedicated sub-space_info for dedicated block group users. */ + if (ffe_ctl->for_data_reloc) { + space_info = space_info->sub_group[0]; + ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_DATA_RELOC); + } else if (ffe_ctl->for_treelog) { + space_info = space_info->sub_group[0]; + ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_TREELOG); + } + } if (!space_info) { - btrfs_err(fs_info, "No space info for %llu", ffe_ctl->flags); + btrfs_err(fs_info, "no space info for %llu, tree-log %d, relocation %d", + ffe_ctl->flags, ffe_ctl->for_treelog, ffe_ctl->for_data_reloc); return -ENOSPC; } @@ -4406,6 +4444,7 @@ static noinline int find_free_extent(struct btrfs_root *root, * picked out then we don't care that the block group is cached. */ if (block_group && block_group_bits(block_group, ffe_ctl->flags) && + block_group->space_info == space_info && block_group->cached != BTRFS_CACHE_NO) { down_read(&space_info->groups_sem); if (list_empty(&block_group->list) || @@ -4431,7 +4470,7 @@ static noinline int find_free_extent(struct btrfs_root *root, } } search: - trace_find_free_extent_search_loop(root, ffe_ctl); + trace_btrfs_find_free_extent_search_loop(root, ffe_ctl); ffe_ctl->have_caching_bg = false; if (ffe_ctl->index == btrfs_bg_flags_to_raid_index(ffe_ctl->flags) || ffe_ctl->index == 0) @@ -4483,7 +4522,7 @@ search: } have_block_group: - trace_find_free_extent_have_block_group(root, ffe_ctl, block_group); + trace_btrfs_find_free_extent_have_block_group(root, ffe_ctl, block_group); ffe_ctl->cached = btrfs_block_group_done(block_group); if (unlikely(!ffe_ctl->cached)) { ffe_ctl->have_caching_bg = true; @@ -4576,7 +4615,8 @@ loop: } up_read(&space_info->groups_sem); - ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, full_search); + ret = find_free_extent_update_loop(fs_info, ins, ffe_ctl, space_info, + full_search); if (ret > 0) goto search; @@ -4698,8 +4738,8 @@ again: return ret; } -int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len, int delalloc) +int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, + bool is_delalloc) { struct btrfs_block_group *cache; @@ -4711,7 +4751,7 @@ int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, } btrfs_add_free_space(cache, start, len); - btrfs_free_reserved_bytes(cache, len, delalloc); + btrfs_free_reserved_bytes(cache, len, is_delalloc); trace_btrfs_reserved_extent_free(fs_info, start, len); btrfs_put_block_group(cache); @@ -5069,17 +5109,17 @@ btrfs_init_new_buffer(struct 
btrfs_trans_handle *trans, struct btrfs_root *root, * EXTENT bit to differentiate dirty pages. */ if (buf->log_index == 0) - set_extent_bit(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, - EXTENT_DIRTY, NULL); + btrfs_set_extent_bit(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, + EXTENT_DIRTY, NULL); else - set_extent_bit(&root->dirty_log_pages, buf->start, - buf->start + buf->len - 1, - EXTENT_NEW, NULL); + btrfs_set_extent_bit(&root->dirty_log_pages, buf->start, + buf->start + buf->len - 1, + EXTENT_NEW, NULL); } else { buf->log_index = -1; - set_extent_bit(&trans->transaction->dirty_pages, buf->start, - buf->start + buf->len - 1, EXTENT_DIRTY, NULL); + btrfs_set_extent_bit(&trans->transaction->dirty_pages, buf->start, + buf->start + buf->len - 1, EXTENT_DIRTY, NULL); } /* this returns a buffer locked for blocking */ return buf; @@ -5185,7 +5225,7 @@ out_free_buf: btrfs_tree_unlock(buf); free_extent_buffer(buf); out_free_reserved: - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 0); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, false); out_unuse: btrfs_unuse_block_rsv(fs_info, block_rsv, blocksize); return ERR_PTR(ret); @@ -5465,7 +5505,7 @@ static int check_ref_exists(struct btrfs_trans_handle *trans, { struct btrfs_delayed_ref_root *delayed_refs; struct btrfs_delayed_ref_head *head; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_extent_inline_ref *iref; int ret; bool exists = false; @@ -5482,7 +5522,6 @@ again: * If we get 0 then we found our reference, return 1, else * return the error if it's not -ENOENT; */ - btrfs_free_path(path); return (ret < 0 ) ? ret : 1; } @@ -5517,7 +5556,6 @@ again: mutex_unlock(&head->mutex); out: spin_unlock(&delayed_refs->lock); - btrfs_free_path(path); return exists ? 
1 : 0; } @@ -6285,7 +6323,7 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, struct extent_buffer *parent) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct walk_control *wc; int level; int parent_level; @@ -6298,10 +6336,8 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, return -ENOMEM; wc = kzalloc(sizeof(*wc), GFP_NOFS); - if (!wc) { - btrfs_free_path(path); + if (!wc) return -ENOMEM; - } btrfs_assert_tree_write_locked(parent); parent_level = btrfs_header_level(parent); @@ -6338,7 +6374,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans, } kfree(wc); - btrfs_free_path(path); return ret; } @@ -6400,13 +6435,13 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) if (ret) break; - find_first_clear_extent_bit(&device->alloc_state, start, - &start, &end, - CHUNK_TRIMMED | CHUNK_ALLOCATED); + btrfs_find_first_clear_extent_bit(&device->alloc_state, start, + &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); /* Check if there are any CHUNK_* bits left */ if (start > device->total_bytes) { - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN(); btrfs_warn_in_rcu(fs_info, "ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu", start, end - start + 1, @@ -6439,8 +6474,8 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed) ret = btrfs_issue_discard(device->bdev, start, len, &bytes); if (!ret) - set_extent_bit(&device->alloc_state, start, - start + bytes - 1, CHUNK_TRIMMED, NULL); + btrfs_set_extent_bit(&device->alloc_state, start, + start + bytes - 1, CHUNK_TRIMMED, NULL); mutex_unlock(&fs_info->chunk_mutex); if (ret) diff --git a/fs/btrfs/extent-tree.h b/fs/btrfs/extent-tree.h index cfa52264f678..72914074c304 100644 --- a/fs/btrfs/extent-tree.h +++ b/fs/btrfs/extent-tree.h @@ -4,7 +4,6 @@ #define BTRFS_EXTENT_TREE_H #include <linux/types.h> -#include "misc.h" #include "block-group.h" #include "locking.h" @@ -150,8 +149,8 @@ int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref); u64 btrfs_get_extent_owner_root(struct btrfs_fs_info *fs_info, struct extent_buffer *leaf, int slot); -int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, - u64 start, u64 len, int delalloc); +int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len, + bool is_delalloc); int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, const struct extent_buffer *eb); int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index b2fae67f8fa3..849199768664 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -96,6 +96,8 @@ void btrfs_extent_buffer_leak_debug_check(struct btrfs_fs_info *fs_info) */ struct btrfs_bio_ctrl { struct btrfs_bio *bbio; + /* Last byte contained in bbio + 1 . 
*/ + loff_t next_file_offset; enum btrfs_compression_type compress_type; u32 len_to_oe_boundary; blk_opf_t opf; @@ -221,22 +223,17 @@ static void __process_folios_contig(struct address_space *mapping, } static noinline void unlock_delalloc_folio(const struct inode *inode, - const struct folio *locked_folio, + struct folio *locked_folio, u64 start, u64 end) { - unsigned long index = start >> PAGE_SHIFT; - unsigned long end_index = end >> PAGE_SHIFT; - ASSERT(locked_folio); - if (index == locked_folio->index && end_index == index) - return; __process_folios_contig(inode->i_mapping, locked_folio, start, end, PAGE_UNLOCK); } static noinline int lock_delalloc_folios(struct inode *inode, - const struct folio *locked_folio, + struct folio *locked_folio, u64 start, u64 end) { struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); @@ -246,9 +243,6 @@ static noinline int lock_delalloc_folios(struct inode *inode, u64 processed_end = start; struct folio_batch fbatch; - if (index == locked_folio->index && index == end_index) - return 0; - folio_batch_init(&fbatch); while (index <= end_index) { unsigned int found_folios, i; @@ -340,7 +334,7 @@ again: /* @delalloc_end can be -1, never go beyond @orig_end */ *end = min(delalloc_end, orig_end); - free_extent_state(cached_state); + btrfs_free_extent_state(cached_state); return false; } @@ -366,7 +360,7 @@ again: /* some of the folios are gone, lets avoid looping by * shortening the size of the delalloc range we're searching */ - free_extent_state(cached_state); + btrfs_free_extent_state(cached_state); cached_state = NULL; if (!loops) { max_bytes = PAGE_SIZE; @@ -379,13 +373,13 @@ again: } /* step three, lock the state bits for the whole range */ - lock_extent(tree, delalloc_start, delalloc_end, &cached_state); + btrfs_lock_extent(tree, delalloc_start, delalloc_end, &cached_state); /* then test to make sure it is all still delalloc */ - ret = test_range_bit(tree, delalloc_start, delalloc_end, - EXTENT_DELALLOC, cached_state); + ret = btrfs_test_range_bit(tree, delalloc_start, delalloc_end, + EXTENT_DELALLOC, cached_state); - unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); + btrfs_unlock_extent(tree, delalloc_start, delalloc_end, &cached_state); if (!ret) { unlock_delalloc_folio(inode, locked_folio, delalloc_start, delalloc_end); @@ -403,7 +397,7 @@ void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end, struct extent_state **cached, u32 clear_bits, unsigned long page_ops) { - clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached); + btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, cached); __process_folios_contig(inode->vfs_inode.i_mapping, locked_folio, start, end, page_ops); @@ -425,14 +419,14 @@ static void end_folio_read(struct folio *folio, bool uptodate, u64 start, u32 le struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); ASSERT(folio_pos(folio) <= start && - start + len <= folio_pos(folio) + PAGE_SIZE); + start + len <= folio_pos(folio) + folio_size(folio)); if (uptodate && btrfs_verify_folio(folio, start, len)) btrfs_folio_set_uptodate(fs_info, folio, start, len); else btrfs_folio_clear_uptodate(fs_info, folio, start, len); - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_is_subpage(fs_info, folio)) folio_unlock(folio); else btrfs_folio_end_lock(fs_info, folio, start, len); @@ -462,9 +456,6 @@ static void end_bbio_data_write(struct btrfs_bio *bbio) u64 start = folio_pos(folio) + fi.offset; u32 len = fi.length; - /* Only order 0 (single page) folios are allowed 
for data. */ - ASSERT(folio_order(folio) == 0); - /* Our read/write should always be sector aligned. */ if (!IS_ALIGNED(fi.offset, sectorsize)) btrfs_err(fs_info, @@ -488,11 +479,11 @@ static void end_bbio_data_write(struct btrfs_bio *bbio) static void begin_folio_read(struct btrfs_fs_info *fs_info, struct folio *folio) { ASSERT(folio_test_locked(folio)); - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_is_subpage(fs_info, folio)) return; ASSERT(folio_test_private(folio)); - btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), PAGE_SIZE); + btrfs_folio_set_lock(fs_info, folio, folio_pos(folio), folio_size(folio)); } /* @@ -512,43 +503,22 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) struct btrfs_fs_info *fs_info = bbio->fs_info; struct bio *bio = &bbio->bio; struct folio_iter fi; - const u32 sectorsize = fs_info->sectorsize; ASSERT(!bio_flagged(bio, BIO_CLONED)); bio_for_each_folio_all(fi, &bbio->bio) { bool uptodate = !bio->bi_status; struct folio *folio = fi.folio; struct inode *inode = folio->mapping->host; - u64 start; - u64 end; - u32 len; + u64 start = folio_pos(folio) + fi.offset; btrfs_debug(fs_info, "%s: bi_sector=%llu, err=%d, mirror=%u", __func__, bio->bi_iter.bi_sector, bio->bi_status, bbio->mirror_num); - /* - * We always issue full-sector reads, but if some block in a - * folio fails to read, blk_update_request() will advance - * bv_offset and adjust bv_len to compensate. Print a warning - * for unaligned offsets, and an error if they don't add up to - * a full sector. - */ - if (!IS_ALIGNED(fi.offset, sectorsize)) - btrfs_err(fs_info, - "partial page read in btrfs with offset %zu and length %zu", - fi.offset, fi.length); - else if (!IS_ALIGNED(fi.offset + fi.length, sectorsize)) - btrfs_info(fs_info, - "incomplete page read with offset %zu and length %zu", - fi.offset, fi.length); - - start = folio_pos(folio) + fi.offset; - end = start + fi.length - 1; - len = fi.length; if (likely(uptodate)) { + u64 end = start + fi.length - 1; loff_t i_size = i_size_read(inode); /* @@ -573,7 +543,7 @@ static void end_bbio_data_read(struct btrfs_bio *bbio) } /* Update page status and unlock. */ - end_folio_read(folio, uptodate, start, len); + end_folio_read(folio, uptodate, start, fi.length); } bio_put(bio); } @@ -664,13 +634,10 @@ static int alloc_eb_folio_array(struct extent_buffer *eb, bool nofail) } static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, - struct folio *folio, u64 disk_bytenr, - unsigned int pg_offset) + u64 disk_bytenr, loff_t file_offset) { struct bio *bio = &bio_ctrl->bbio->bio; - struct bio_vec *bvec = bio_last_bvec_all(bio); const sector_t sector = disk_bytenr >> SECTOR_SHIFT; - struct folio *bv_folio = page_folio(bvec->bv_page); if (bio_ctrl->compress_type != BTRFS_COMPRESS_NONE) { /* @@ -681,19 +648,11 @@ static bool btrfs_bio_is_contig(struct btrfs_bio_ctrl *bio_ctrl, } /* - * The contig check requires the following conditions to be met: - * - * 1) The folios are belonging to the same inode - * This is implied by the call chain. - * - * 2) The range has adjacent logical bytenr - * - * 3) The range has adjacent file offset - * This is required for the usage of btrfs_bio->file_offset. + * To merge into a bio both the disk sector and the logical offset in + * the file need to be contiguous. 
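A worked example of the check below, with hypothetical numbers: two 4K blocks at file offsets 1M and 1M + 4K, stored contiguously at disk bytenrs 8M and 8M + 4K.

/* After the first block is added to the bio: */
/*   bio_ctrl->next_file_offset == 1M + 4K, matching the second block's file_offset */
/*   bio_end_sector(bio) == (8M + 4K) >> SECTOR_SHIFT, matching its sector */
/* Both equalities hold, so the second block merges into the same bio; a hole */
/* in either the file range or the disk range breaks one of them and forces a new bio. */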
*/ - return bio_end_sector(bio) == sector && - folio_pos(bv_folio) + bvec->bv_offset + bvec->bv_len == - folio_pos(folio) + pg_offset; + return bio_ctrl->next_file_offset == file_offset && + bio_end_sector(bio) == sector; } static void alloc_new_bio(struct btrfs_inode *inode, @@ -711,6 +670,7 @@ static void alloc_new_bio(struct btrfs_inode *inode, bbio->file_offset = file_offset; bio_ctrl->bbio = bbio; bio_ctrl->len_to_oe_boundary = U32_MAX; + bio_ctrl->next_file_offset = file_offset; /* Limit data write bios to the ordered boundary. */ if (bio_ctrl->wbc) { @@ -752,22 +712,21 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, size_t size, unsigned long pg_offset) { struct btrfs_inode *inode = folio_to_inode(folio); + loff_t file_offset = folio_pos(folio) + pg_offset; - ASSERT(pg_offset + size <= PAGE_SIZE); + ASSERT(pg_offset + size <= folio_size(folio)); ASSERT(bio_ctrl->end_io_func); if (bio_ctrl->bbio && - !btrfs_bio_is_contig(bio_ctrl, folio, disk_bytenr, pg_offset)) + !btrfs_bio_is_contig(bio_ctrl, disk_bytenr, file_offset)) submit_one_bio(bio_ctrl); do { u32 len = size; /* Allocate new bio if needed */ - if (!bio_ctrl->bbio) { - alloc_new_bio(inode, bio_ctrl, disk_bytenr, - folio_pos(folio) + pg_offset); - } + if (!bio_ctrl->bbio) + alloc_new_bio(inode, bio_ctrl, disk_bytenr, file_offset); /* Cap to the current ordered extent boundary if there is one. */ if (len > bio_ctrl->len_to_oe_boundary) { @@ -781,14 +740,15 @@ static void submit_extent_folio(struct btrfs_bio_ctrl *bio_ctrl, submit_one_bio(bio_ctrl); continue; } + bio_ctrl->next_file_offset += len; if (bio_ctrl->wbc) - wbc_account_cgroup_owner(bio_ctrl->wbc, folio, - len); + wbc_account_cgroup_owner(bio_ctrl->wbc, folio, len); size -= len; pg_offset += len; disk_bytenr += len; + file_offset += len; /* * len_to_oe_boundary defaults to U32_MAX, which isn't folio or @@ -836,7 +796,7 @@ static int attach_extent_buffer_folio(struct extent_buffer *eb, if (folio->mapping) lockdep_assert_held(&folio->mapping->i_private_lock); - if (fs_info->nodesize >= PAGE_SIZE) { + if (!btrfs_meta_is_subpage(fs_info)) { if (!folio_test_private(folio)) folio_attach_private(folio, eb); else @@ -870,7 +830,7 @@ int set_folio_extent_mapped(struct folio *folio) fs_info = folio_to_fs_info(folio); - if (btrfs_is_subpage(fs_info, folio->mapping)) + if (btrfs_is_subpage(fs_info, folio)) return btrfs_attach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); folio_attach_private(folio, (void *)EXTENT_FOLIO_PRIVATE); @@ -887,8 +847,8 @@ void clear_folio_extent_mapped(struct folio *folio) return; fs_info = folio_to_fs_info(folio); - if (btrfs_is_subpage(fs_info, folio->mapping)) - return btrfs_detach_subpage(fs_info, folio); + if (btrfs_is_subpage(fs_info, folio)) + return btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_DATA); folio_detach_private(folio); } @@ -903,13 +863,13 @@ static struct extent_map *get_extent_map(struct btrfs_inode *inode, if (*em_cached) { em = *em_cached; - if (extent_map_in_tree(em) && start >= em->start && - start < extent_map_end(em)) { + if (btrfs_extent_map_in_tree(em) && start >= em->start && + start < btrfs_extent_map_end(em)) { refcount_inc(&em->refs); return em; } - free_extent_map(em); + btrfs_free_extent_map(em); *em_cached = NULL; } @@ -935,16 +895,12 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, struct inode *inode = folio->mapping->host; struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); u64 start = folio_pos(folio); - const u64 end = start + PAGE_SIZE - 1; - u64 cur 
= start; + const u64 end = start + folio_size(folio) - 1; u64 extent_offset; u64 last_byte = i_size_read(inode); - u64 block_start; struct extent_map *em; int ret = 0; - size_t pg_offset = 0; - size_t iosize; - size_t blocksize = fs_info->sectorsize; + const size_t blocksize = fs_info->sectorsize; ret = set_folio_extent_mapped(folio); if (ret < 0) { @@ -955,45 +911,49 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (folio_contains(folio, last_byte >> PAGE_SHIFT)) { size_t zero_offset = offset_in_folio(folio, last_byte); - if (zero_offset) { - iosize = folio_size(folio) - zero_offset; - folio_zero_range(folio, zero_offset, iosize); - } + if (zero_offset) + folio_zero_range(folio, zero_offset, + folio_size(folio) - zero_offset); } bio_ctrl->end_io_func = end_bbio_data_read; begin_folio_read(fs_info, folio); - while (cur <= end) { + for (u64 cur = start; cur <= end; cur += blocksize) { enum btrfs_compression_type compress_type = BTRFS_COMPRESS_NONE; + unsigned long pg_offset = offset_in_folio(folio, cur); bool force_bio_submit = false; u64 disk_bytenr; + u64 block_start; ASSERT(IS_ALIGNED(cur, fs_info->sectorsize)); if (cur >= last_byte) { - iosize = folio_size(folio) - pg_offset; - folio_zero_range(folio, pg_offset, iosize); - end_folio_read(folio, true, cur, iosize); + folio_zero_range(folio, pg_offset, end - cur + 1); + end_folio_read(folio, true, cur, end - cur + 1); break; } + if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { + end_folio_read(folio, true, cur, blocksize); + continue; + } em = get_extent_map(BTRFS_I(inode), folio, cur, end - cur + 1, em_cached); if (IS_ERR(em)) { end_folio_read(folio, false, cur, end + 1 - cur); return PTR_ERR(em); } extent_offset = cur - em->start; - BUG_ON(extent_map_end(em) <= cur); + BUG_ON(btrfs_extent_map_end(em) <= cur); BUG_ON(end < cur); - compress_type = extent_map_compression(em); + compress_type = btrfs_extent_map_compression(em); - iosize = min(extent_map_end(em) - cur, end - cur + 1); - iosize = ALIGN(iosize, blocksize); if (compress_type != BTRFS_COMPRESS_NONE) disk_bytenr = em->disk_bytenr; else - disk_bytenr = extent_map_block_start(em) + extent_offset; - block_start = extent_map_block_start(em); + disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset; + if (em->flags & EXTENT_FLAG_PREALLOC) block_start = EXTENT_MAP_HOLE; + else + block_start = btrfs_extent_map_block_start(em); /* * If we have a file range that points to a compressed extent @@ -1037,23 +997,18 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (prev_em_start) *prev_em_start = em->start; - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; /* we've found a hole, just zero and go on */ if (block_start == EXTENT_MAP_HOLE) { - folio_zero_range(folio, pg_offset, iosize); - - end_folio_read(folio, true, cur, iosize); - cur = cur + iosize; - pg_offset += iosize; + folio_zero_range(folio, pg_offset, blocksize); + end_folio_read(folio, true, cur, blocksize); continue; } /* the get_extent function already copied into the folio */ if (block_start == EXTENT_MAP_INLINE) { - end_folio_read(folio, true, cur, iosize); - cur = cur + iosize; - pg_offset += iosize; + end_folio_read(folio, true, cur, blocksize); continue; } @@ -1064,15 +1019,190 @@ static int btrfs_do_readpage(struct folio *folio, struct extent_map **em_cached, if (force_bio_submit) submit_one_bio(bio_ctrl); - submit_extent_folio(bio_ctrl, disk_bytenr, folio, iosize, + submit_extent_folio(bio_ctrl, disk_bytenr, folio, 
blocksize, pg_offset); - cur = cur + iosize; - pg_offset += iosize; } - return 0; } +/* + * Check if we can skip waiting for the @ordered extent covering the block at @fileoff. + * + * @fileoff: Both input and output. + * Input as the file offset where the check should start. + * Output as where the next check should start, + * if the function returns true. + * + * Return true if we can skip to @fileoff. The caller needs to check the new + * @fileoff value to make sure it covers the full range, before skipping the + * full OE. + * + * Return false if we must wait for the ordered extent. + */ +static bool can_skip_one_ordered_range(struct btrfs_inode *inode, + struct btrfs_ordered_extent *ordered, + u64 *fileoff) +{ + const struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct folio *folio; + const u32 blocksize = fs_info->sectorsize; + u64 cur = *fileoff; + bool ret; + + folio = filemap_get_folio(inode->vfs_inode.i_mapping, cur >> PAGE_SHIFT); + + /* + * We should have locked the folio(s) for range [start, end], thus + * there must be a folio and it must be locked. + */ + ASSERT(!IS_ERR(folio)); + ASSERT(folio_test_locked(folio)); + + /* + * There are several cases for the folio and OE combination: + * + * 1) Folio has no private flag + * The OE has all its IO done but not yet finished, and the folio got + * invalidated. + * + * Here we have to wait for the OE to finish, as it may contain the + * to-be-inserted data checksum. + * Without the data checksum inserted into the csum tree, read will + * just fail with missing csum. + */ + if (!folio_test_private(folio)) { + ret = false; + goto out; + } + + /* + * 2) The first block is DIRTY. + * + * This means the OE is created by some other folios whose file pos is + * before this one. And since we are holding the folio lock, the writeback + * of this folio cannot start. + * + * We must skip the whole OE, because it will never start until we + * finished our folio read and unlocked the folio. + */ + if (btrfs_folio_test_dirty(fs_info, folio, cur, blocksize)) { + u64 range_len = min(folio_pos(folio) + folio_size(folio), + ordered->file_offset + ordered->num_bytes) - cur; + + ret = true; + /* + * At least inside the folio, all the remaining blocks should + * also be dirty. + */ + ASSERT(btrfs_folio_test_dirty(fs_info, folio, cur, range_len)); + *fileoff = ordered->file_offset + ordered->num_bytes; + goto out; + } + + /* + * 3) The first block is uptodate. + * + * At least the first block can be skipped, but we are still not fully + * sure. E.g. if the OE has some other folios in the range that cannot + * be skipped. + * So we return true and update @fileoff to the OE/folio boundary. + */ + if (btrfs_folio_test_uptodate(fs_info, folio, cur, blocksize)) { + u64 range_len = min(folio_pos(folio) + folio_size(folio), + ordered->file_offset + ordered->num_bytes) - cur; + + /* + * The whole range to the OE end or folio boundary should also + * be uptodate. + */ + ASSERT(btrfs_folio_test_uptodate(fs_info, folio, cur, range_len)); + ret = true; + *fileoff = cur + range_len; + goto out; + } + + /* + * 4) The first block is not uptodate. + * + * This means the folio is invalidated after the writeback was finished, + * but by some other operations (e.g. block aligned buffered write) the + * folio is inserted into the filemap. + * Very much the same as case 1). 
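+ * + * To summarize the four cases (an illustrative recap of the logic above, not extra behavior): 1) no private -> must wait, @fileoff unchanged; 2) first block dirty -> skip, @fileoff = OE end; 3) first block uptodate -> skip, @fileoff = min(folio end, OE end); 4) otherwise -> must wait, @fileoff unchanged. 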
+ */ + ret = false; +out: + folio_put(folio); + return ret; +} + +static bool can_skip_ordered_extent(struct btrfs_inode *inode, + struct btrfs_ordered_extent *ordered, + u64 start, u64 end) +{ + const u64 range_end = min(end, ordered->file_offset + ordered->num_bytes - 1); + u64 cur = max(start, ordered->file_offset); + + while (cur < range_end) { + bool can_skip; + + can_skip = can_skip_one_ordered_range(inode, ordered, &cur); + if (!can_skip) + return false; + } + return true; +} + +/* + * Locking helper to make sure we get a stable view of extent maps for the + * involved range. + * + * This is for folio read paths (read and readahead), thus the involved range + * should have all the folios locked. + */ +static void lock_extents_for_read(struct btrfs_inode *inode, u64 start, u64 end, + struct extent_state **cached_state) +{ + u64 cur_pos; + + /* Caller must provide a valid @cached_state. */ + ASSERT(cached_state); + + /* The range must at least be page aligned, as all read paths are folio based. */ + ASSERT(IS_ALIGNED(start, PAGE_SIZE)); + ASSERT(IS_ALIGNED(end + 1, PAGE_SIZE)); + +again: + btrfs_lock_extent(&inode->io_tree, start, end, cached_state); + cur_pos = start; + while (cur_pos < end) { + struct btrfs_ordered_extent *ordered; + + ordered = btrfs_lookup_ordered_range(inode, cur_pos, + end - cur_pos + 1); + /* + * No ordered extents in the range, and since we hold the extent lock + * no one can modify the extent maps in the range, so we're safe to return. + */ + if (!ordered) + break; + + /* Check if we can skip waiting for the whole OE. */ + if (can_skip_ordered_extent(inode, ordered, start, end)) { + cur_pos = min(ordered->file_offset + ordered->num_bytes, + end + 1); + btrfs_put_ordered_extent(ordered); + continue; + } + + /* Now wait for the OE to finish. */ + btrfs_unlock_extent(&inode->io_tree, start, end, cached_state); + btrfs_start_ordered_extent_nowriteback(ordered, start, end + 1 - start); + btrfs_put_ordered_extent(ordered); + /* We have unlocked the whole range, so restart from the beginning. 
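+ * As a rough sketch, the whole helper boils down to (illustrative pseudo-code only): again: lock [start, end]; for each ordered extent OE in the range: if can_skip_ordered_extent(OE), advance past OE and keep scanning; otherwise unlock [start, end], wait for OE, and restart. 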
*/ + goto again; + } +} + int btrfs_read_folio(struct file *file, struct folio *folio) { struct btrfs_inode *inode = folio_to_inode(folio); @@ -1083,11 +1213,11 @@ int btrfs_read_folio(struct file *file, struct folio *folio) struct extent_map *em_cached = NULL; int ret; - btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); + lock_extents_for_read(inode, start, end, &cached_state); ret = btrfs_do_readpage(folio, &em_cached, &bio_ctrl, NULL); - unlock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); - free_extent_map(em_cached); + btrfs_free_extent_map(em_cached); /* * If btrfs_do_readpage() failed we will want to submit the assembled @@ -1105,7 +1235,7 @@ static void set_delalloc_bitmap(struct folio *folio, unsigned long *delalloc_bit unsigned int start_bit; unsigned int nbits; - ASSERT(start >= folio_start && start + len <= folio_start + PAGE_SIZE); + ASSERT(start >= folio_start && start + len <= folio_start + folio_size(folio)); start_bit = (start - folio_start) >> fs_info->sectorsize_bits; nbits = len >> fs_info->sectorsize_bits; ASSERT(bitmap_test_range_all_zero(delalloc_bitmap, start_bit, nbits)); @@ -1118,12 +1248,12 @@ static bool find_next_delalloc_bitmap(struct folio *folio, { struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); const u64 folio_start = folio_pos(folio); - const unsigned int bitmap_size = fs_info->sectors_per_page; + const unsigned int bitmap_size = btrfs_blocks_per_folio(fs_info, folio); unsigned int start_bit; unsigned int first_zero; unsigned int first_set; - ASSERT(start >= folio_start && start < folio_start + PAGE_SIZE); + ASSERT(start >= folio_start && start < folio_start + folio_size(folio)); start_bit = (start - folio_start) >> fs_info->sectorsize_bits; first_set = find_next_bit(delalloc_bitmap, bitmap_size, start_bit); @@ -1157,9 +1287,10 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, { struct btrfs_fs_info *fs_info = inode_to_fs_info(&inode->vfs_inode); struct writeback_control *wbc = bio_ctrl->wbc; - const bool is_subpage = btrfs_is_subpage(fs_info, folio->mapping); + const bool is_subpage = btrfs_is_subpage(fs_info, folio); const u64 page_start = folio_pos(folio); const u64 page_end = page_start + folio_size(folio) - 1; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); unsigned long delalloc_bitmap = 0; /* * Save the last found delalloc end. As the delalloc end can go beyond @@ -1184,14 +1315,14 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, int bit; /* Save the dirty bitmap as our submission bitmap will be a subset of it. 
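* Each bit in the bitmap stands for one block of the folio; the mapping used below is start = page_start + (bit << sectorsize_bits), so as a worked example (assuming a 4K sectorsize) bit 2 names the block at folio offset 8K. 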
*/ - if (btrfs_is_subpage(fs_info, inode->vfs_inode.i_mapping)) { - ASSERT(fs_info->sectors_per_page > 1); + if (btrfs_is_subpage(fs_info, folio)) { + ASSERT(blocks_per_folio > 1); btrfs_get_subpage_dirty_bitmap(fs_info, folio, &bio_ctrl->submit_bitmap); } else { bio_ctrl->submit_bitmap = 1; } - for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) { u64 start = page_start + (bit << fs_info->sectorsize_bits); btrfs_folio_set_lock(fs_info, folio, start, fs_info->sectorsize); @@ -1264,7 +1395,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, btrfs_root_id(inode->root), btrfs_ino(inode), folio_pos(folio), - fs_info->sectors_per_page, + blocks_per_folio, &bio_ctrl->submit_bitmap, found_start, found_len, ret); } else { @@ -1272,8 +1403,8 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, * We've hit an error during previous delalloc range, * have to cleanup the remaining locked ranges. */ - unlock_extent(&inode->io_tree, found_start, - found_start + found_len - 1, NULL); + btrfs_unlock_extent(&inode->io_tree, found_start, + found_start + found_len - 1, NULL); unlock_delalloc_folio(&inode->vfs_inode, folio, found_start, found_start + found_len - 1); @@ -1309,7 +1440,7 @@ static noinline_for_stack int writepage_delalloc(struct btrfs_inode *inode, unsigned int bitmap_size = min( (last_finished_delalloc_end - page_start) >> fs_info->sectorsize_bits, - fs_info->sectors_per_page); + blocks_per_folio); for_each_set_bit(bit, &bio_ctrl->submit_bitmap, bitmap_size) btrfs_mark_ordered_io_finished(inode, folio, @@ -1324,7 +1455,7 @@ out: delalloc_end = page_end; /* * delalloc_end is already one less than the total length, so - * we don't subtract one from PAGE_SIZE + * we don't subtract one from PAGE_SIZE. */ delalloc_to_write += DIV_ROUND_UP(delalloc_end + 1 - page_start, PAGE_SIZE); @@ -1333,7 +1464,7 @@ out: * If all ranges are submitted asynchronously, we just need to account * for them here. 
*/ - if (bitmap_empty(&bio_ctrl->submit_bitmap, fs_info->sectors_per_page)) { + if (bitmap_empty(&bio_ctrl->submit_bitmap, blocks_per_folio)) { wbc->nr_to_write -= delalloc_to_write; return 1; } @@ -1379,19 +1510,19 @@ static int submit_one_sector(struct btrfs_inode *inode, return PTR_ERR(em); extent_offset = filepos - em->start; - em_end = extent_map_end(em); + em_end = btrfs_extent_map_end(em); ASSERT(filepos <= em_end); ASSERT(IS_ALIGNED(em->start, sectorsize)); ASSERT(IS_ALIGNED(em->len, sectorsize)); - block_start = extent_map_block_start(em); - disk_bytenr = extent_map_block_start(em) + extent_offset; + block_start = btrfs_extent_map_block_start(em); + disk_bytenr = btrfs_extent_map_block_start(em) + extent_offset; - ASSERT(!extent_map_is_compressed(em)); + ASSERT(!btrfs_extent_map_is_compressed(em)); ASSERT(block_start != EXTENT_MAP_HOLE); ASSERT(block_start != EXTENT_MAP_INLINE); - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; /* @@ -1434,6 +1565,7 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, bool submitted_io = false; bool error = false; const u64 folio_start = folio_pos(folio); + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); u64 cur; int bit; int ret = 0; @@ -1442,21 +1574,23 @@ static noinline_for_stack int extent_writepage_io(struct btrfs_inode *inode, start + len <= folio_start + folio_size(folio)); ret = btrfs_writepage_cow_fixup(folio); - if (ret) { + if (ret == -EAGAIN) { /* Fixup worker will requeue */ folio_redirty_for_writepage(bio_ctrl->wbc, folio); folio_unlock(folio); return 1; } + if (ret < 0) + return ret; for (cur = start; cur < start + len; cur += fs_info->sectorsize) set_bit((cur - folio_start) >> fs_info->sectorsize_bits, &range_bitmap); bitmap_and(&bio_ctrl->submit_bitmap, &bio_ctrl->submit_bitmap, &range_bitmap, - fs_info->sectors_per_page); + blocks_per_folio); bio_ctrl->end_io_func = end_bbio_data_write; - for_each_set_bit(bit, &bio_ctrl->submit_bitmap, fs_info->sectors_per_page) { + for_each_set_bit(bit, &bio_ctrl->submit_bitmap, blocks_per_folio) { cur = folio_pos(folio) + (bit << fs_info->sectorsize_bits); if (cur >= i_size) { @@ -1530,6 +1664,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl size_t pg_offset; loff_t i_size = i_size_read(&inode->vfs_inode); unsigned long end_index = i_size >> PAGE_SHIFT; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); trace_extent_writepage(folio, &inode->vfs_inode, bio_ctrl->wbc); @@ -1543,7 +1678,7 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl return 0; } - if (folio->index == end_index) + if (folio_contains(folio, end_index)) folio_zero_range(folio, pg_offset, folio_size(folio) - pg_offset); /* @@ -1551,6 +1686,30 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl * The proper bitmap can only be initialized until writepage_delalloc(). */ bio_ctrl->submit_bitmap = (unsigned long)-1; + + /* + * If the page is dirty but without private set, it's marked dirty + * without informing the fs. + * Nowadays that is a bug, since the introduction of + * pin_user_pages*(). + * + * So here we check if the page has private set to rule out such a + * case. + * But we also have a long history of relying on the COW fixup, + * so here we only enable this check for experimental builds until + * we're sure it's safe. 
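+ * + * The invariant relied on here, as a sketch: every folio dirtied through the normal buffered write path has gone through set_folio_extent_mapped() first, so folio_test_dirty(folio) implies folio_test_private(folio); a dirty folio without private therefore means something dirtied it behind the filesystem's back. 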
+ */ + if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL) && + unlikely(!folio_test_private(folio))) { + WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + btrfs_err_rl(fs_info, + "root %lld ino %llu folio %llu is marked dirty without notifying the fs", + inode->root->root_key.objectid, + btrfs_ino(inode), folio_pos(folio)); + ret = -EUCLEAN; + goto done; + } + ret = set_folio_extent_mapped(folio); if (ret < 0) goto done; @@ -1562,14 +1721,14 @@ static int extent_writepage(struct folio *folio, struct btrfs_bio_ctrl *bio_ctrl goto done; ret = extent_writepage_io(inode, folio, folio_pos(folio), - PAGE_SIZE, bio_ctrl, i_size); + folio_size(folio), bio_ctrl, i_size); if (ret == 1) return 0; if (ret < 0) btrfs_err_rl(fs_info, "failed to submit blocks, root=%lld inode=%llu folio=%llu submit_bitmap=%*pbl: %d", btrfs_root_id(inode->root), btrfs_ino(inode), - folio_pos(folio), fs_info->sectors_per_page, + folio_pos(folio), blocks_per_folio, &bio_ctrl->submit_bitmap, ret); bio_ctrl->wbc->nr_to_write--; @@ -1615,8 +1774,18 @@ static noinline_for_stack bool lock_extent_buffer_for_io(struct extent_buffer *e */ spin_lock(&eb->refs_lock); if (test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) { + XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits); + unsigned long flags; + set_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); spin_unlock(&eb->refs_lock); + + xas_lock_irqsave(&xas, flags); + xas_load(&xas); + xas_set_mark(&xas, PAGECACHE_TAG_WRITEBACK); + xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); + xas_unlock_irqrestore(&xas, flags); + btrfs_set_header_flag(eb, BTRFS_HEADER_FLAG_WRITTEN); percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, @@ -1702,45 +1871,166 @@ static void set_btree_ioerr(struct extent_buffer *eb) } } +static void buffer_tree_set_mark(const struct extent_buffer *eb, xa_mark_t mark) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits); + unsigned long flags; + + xas_lock_irqsave(&xas, flags); + xas_load(&xas); + xas_set_mark(&xas, mark); + xas_unlock_irqrestore(&xas, flags); +} + +static void buffer_tree_clear_mark(const struct extent_buffer *eb, xa_mark_t mark) +{ + struct btrfs_fs_info *fs_info = eb->fs_info; + XA_STATE(xas, &fs_info->buffer_tree, eb->start >> fs_info->sectorsize_bits); + unsigned long flags; + + xas_lock_irqsave(&xas, flags); + xas_load(&xas); + xas_clear_mark(&xas, mark); + xas_unlock_irqrestore(&xas, flags); +} + +static void buffer_tree_tag_for_writeback(struct btrfs_fs_info *fs_info, + unsigned long start, unsigned long end) +{ + XA_STATE(xas, &fs_info->buffer_tree, start); + unsigned int tagged = 0; + void *eb; + + xas_lock_irq(&xas); + xas_for_each_marked(&xas, eb, end, PAGECACHE_TAG_DIRTY) { + xas_set_mark(&xas, PAGECACHE_TAG_TOWRITE); + if (++tagged % XA_CHECK_SCHED) + continue; + xas_pause(&xas); + xas_unlock_irq(&xas); + cond_resched(); + xas_lock_irq(&xas); + } + xas_unlock_irq(&xas); +} + +struct eb_batch { + unsigned int nr; + unsigned int cur; + struct extent_buffer *ebs[PAGEVEC_SIZE]; +}; + +static inline bool eb_batch_add(struct eb_batch *batch, struct extent_buffer *eb) +{ + batch->ebs[batch->nr++] = eb; + return (batch->nr < PAGEVEC_SIZE); +} + +static inline void eb_batch_init(struct eb_batch *batch) +{ + batch->nr = 0; + batch->cur = 0; +} + +static inline struct extent_buffer *eb_batch_next(struct eb_batch *batch) +{ + if (batch->cur >= batch->nr) + return NULL; + return batch->ebs[batch->cur++]; +} + +static inline void eb_batch_release(struct eb_batch 
*batch) +{ + for (unsigned int i = 0; i < batch->nr; i++) + free_extent_buffer(batch->ebs[i]); + eb_batch_init(batch); +} + +static inline struct extent_buffer *find_get_eb(struct xa_state *xas, unsigned long max, + xa_mark_t mark) +{ + struct extent_buffer *eb; + +retry: + eb = xas_find_marked(xas, max, mark); + + if (xas_retry(xas, eb)) + goto retry; + + if (!eb) + return NULL; + + if (!atomic_inc_not_zero(&eb->refs)) { + xas_reset(xas); + goto retry; + } + + if (unlikely(eb != xas_reload(xas))) { + free_extent_buffer(eb); + xas_reset(xas); + goto retry; + } + + return eb; +} + +static unsigned int buffer_tree_get_ebs_tag(struct btrfs_fs_info *fs_info, + unsigned long *start, + unsigned long end, xa_mark_t tag, + struct eb_batch *batch) +{ + XA_STATE(xas, &fs_info->buffer_tree, *start); + struct extent_buffer *eb; + + rcu_read_lock(); + while ((eb = find_get_eb(&xas, end, tag)) != NULL) { + if (!eb_batch_add(batch, eb)) { + *start = ((eb->start + eb->len) >> fs_info->sectorsize_bits); + goto out; + } + } + if (end == ULONG_MAX) + *start = ULONG_MAX; + else + *start = end + 1; +out: + rcu_read_unlock(); + + return batch->nr; +} + /* * The endio specific version which won't touch any unsafe spinlock in endio * context. */ static struct extent_buffer *find_extent_buffer_nolock( - const struct btrfs_fs_info *fs_info, u64 start) + struct btrfs_fs_info *fs_info, u64 start) { struct extent_buffer *eb; + unsigned long index = (start >> fs_info->sectorsize_bits); rcu_read_lock(); - eb = radix_tree_lookup(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits); - if (eb && atomic_inc_not_zero(&eb->refs)) { - rcu_read_unlock(); - return eb; - } + eb = xa_load(&fs_info->buffer_tree, index); + if (eb && !atomic_inc_not_zero(&eb->refs)) + eb = NULL; rcu_read_unlock(); - return NULL; + return eb; } static void end_bbio_meta_write(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; - struct btrfs_fs_info *fs_info = eb->fs_info; struct folio_iter fi; - u32 bio_offset = 0; if (bbio->bio.bi_status != BLK_STS_OK) set_btree_ioerr(eb); bio_for_each_folio_all(fi, &bbio->bio) { - u64 start = eb->start + bio_offset; - struct folio *folio = fi.folio; - u32 len = fi.length; - - btrfs_folio_clear_writeback(fs_info, folio, start, len); - bio_offset += len; + btrfs_meta_folio_clear_writeback(fi.folio, eb); } + buffer_tree_clear_mark(eb, PAGECACHE_TAG_WRITEBACK); clear_bit(EXTENT_BUFFER_WRITEBACK, &eb->bflags); smp_mb__after_atomic(); wake_up_bit(&eb->bflags, EXTENT_BUFFER_WRITEBACK); @@ -1792,199 +2082,56 @@ static noinline_for_stack void write_one_eb(struct extent_buffer *eb, wbc_init_bio(wbc, &bbio->bio); bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; - if (fs_info->nodesize < PAGE_SIZE) { - struct folio *folio = eb->folios[0]; - bool ret; + for (int i = 0; i < num_extent_folios(eb); i++) { + struct folio *folio = eb->folios[i]; + u64 range_start = max_t(u64, eb->start, folio_pos(folio)); + u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio), + eb->start + eb->len) - range_start; folio_lock(folio); - btrfs_subpage_set_writeback(fs_info, folio, eb->start, eb->len); - if (btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, - eb->len)) { - folio_clear_dirty_for_io(folio); - wbc->nr_to_write--; - } - ret = bio_add_folio(&bbio->bio, folio, eb->len, - eb->start - folio_pos(folio)); - ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio, eb->len); - folio_unlock(folio); - } else { - int num_folios = num_extent_folios(eb); - - for (int i = 0; i < 
num_folios; i++) { - struct folio *folio = eb->folios[i]; - bool ret; - - folio_lock(folio); - folio_clear_dirty_for_io(folio); - folio_start_writeback(folio); - ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); - ASSERT(ret); - wbc_account_cgroup_owner(wbc, folio, eb->folio_size); + btrfs_meta_folio_clear_dirty(folio, eb); + btrfs_meta_folio_set_writeback(folio, eb); + if (!folio_test_dirty(folio)) wbc->nr_to_write -= folio_nr_pages(folio); - folio_unlock(folio); - } + bio_add_folio_nofail(&bbio->bio, folio, range_len, + offset_in_folio(folio, range_start)); + wbc_account_cgroup_owner(wbc, folio, range_len); + folio_unlock(folio); } btrfs_submit_bbio(bbio, 0); } /* - * Submit one subpage btree page. - * - * The main difference to submit_eb_page() is: - * - Page locking - * For subpage, we don't rely on page locking at all. + * Wait for all eb writeback in the given range to finish. * - * - Flush write bio - * We only flush bio if we may be unable to fit current extent buffers into - * current bio. - * - * Return >=0 for the number of submitted extent buffers. - * Return <0 for fatal error. + * @fs_info: The fs_info for this file system. + * @start: The offset of the range to start waiting on writeback. + * @end: The end of the range, inclusive. This is meant to be used in + * conjunction with wait_marked_extents, so this will usually be + * the_next_eb->start - 1. */ -static int submit_eb_subpage(struct folio *folio, struct writeback_control *wbc) +void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, + u64 end) { - struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); - int submitted = 0; - u64 folio_start = folio_pos(folio); - int bit_start = 0; - int sectors_per_node = fs_info->nodesize >> fs_info->sectorsize_bits; - - /* Lock and write each dirty extent buffers in the range */ - while (bit_start < fs_info->sectors_per_page) { - struct btrfs_subpage *subpage = folio_get_private(folio); + struct eb_batch batch; + unsigned long start_index = (start >> fs_info->sectorsize_bits); + unsigned long end_index = (end >> fs_info->sectorsize_bits); + + eb_batch_init(&batch); + while (start_index <= end_index) { struct extent_buffer *eb; - unsigned long flags; - u64 start; + unsigned int nr_ebs; - /* - * Take private lock to ensure the subpage won't be detached - * in the meantime. - */ - spin_lock(&folio->mapping->i_private_lock); - if (!folio_test_private(folio)) { - spin_unlock(&folio->mapping->i_private_lock); + nr_ebs = buffer_tree_get_ebs_tag(fs_info, &start_index, end_index, + PAGECACHE_TAG_WRITEBACK, &batch); + if (!nr_ebs) break; - } - spin_lock_irqsave(&subpage->lock, flags); - if (!test_bit(bit_start + btrfs_bitmap_nr_dirty * fs_info->sectors_per_page, - subpage->bitmaps)) { - spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&folio->mapping->i_private_lock); - bit_start++; - continue; - } - - start = folio_start + bit_start * fs_info->sectorsize; - bit_start += sectors_per_node; - - /* - * Here we just want to grab the eb without touching extra - * spin locks, so call find_extent_buffer_nolock(). - */ - eb = find_extent_buffer_nolock(fs_info, start); - spin_unlock_irqrestore(&subpage->lock, flags); - spin_unlock(&folio->mapping->i_private_lock); - - /* - * The eb has already reached 0 refs thus find_extent_buffer() - * doesn't return it. We don't need to write back such eb - * anyway. 
- */ - if (!eb) - continue; - - if (lock_extent_buffer_for_io(eb, wbc)) { - write_one_eb(eb, wbc); - submitted++; - } - free_extent_buffer(eb); - } - return submitted; -} - -/* - * Submit all page(s) of one extent buffer. - * - * @page: the page of one extent buffer - * @eb_context: to determine if we need to submit this page, if current page - * belongs to this eb, we don't need to submit - * - * The caller should pass each page in their bytenr order, and here we use - * @eb_context to determine if we have submitted pages of one extent buffer. - * - * If we have, we just skip until we hit a new page that doesn't belong to - * current @eb_context. - * - * If not, we submit all the page(s) of the extent buffer. - * - * Return >0 if we have submitted the extent buffer successfully. - * Return 0 if we don't need to submit the page, as it's already submitted by - * previous call. - * Return <0 for fatal error. - */ -static int submit_eb_page(struct folio *folio, struct btrfs_eb_write_context *ctx) -{ - struct writeback_control *wbc = ctx->wbc; - struct address_space *mapping = folio->mapping; - struct extent_buffer *eb; - int ret; - - if (!folio_test_private(folio)) - return 0; - - if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) - return submit_eb_subpage(folio, wbc); - - spin_lock(&mapping->i_private_lock); - if (!folio_test_private(folio)) { - spin_unlock(&mapping->i_private_lock); - return 0; - } - - eb = folio_get_private(folio); - - /* - * Shouldn't happen and normally this would be a BUG_ON but no point - * crashing the machine for something we can survive anyway. - */ - if (WARN_ON(!eb)) { - spin_unlock(&mapping->i_private_lock); - return 0; - } - - if (eb == ctx->eb) { - spin_unlock(&mapping->i_private_lock); - return 0; - } - ret = atomic_inc_not_zero(&eb->refs); - spin_unlock(&mapping->i_private_lock); - if (!ret) - return 0; - - ctx->eb = eb; - ret = btrfs_check_meta_write_pointer(eb->fs_info, ctx); - if (ret) { - if (ret == -EBUSY) - ret = 0; - free_extent_buffer(eb); - return ret; - } - - if (!lock_extent_buffer_for_io(eb, wbc)) { - free_extent_buffer(eb); - return 0; - } - /* Implies write in zoned mode. */ - if (ctx->zoned_bg) { - /* Mark the last eb in the block group. */ - btrfs_schedule_zone_finish_bg(ctx->zoned_bg, eb); - ctx->zoned_bg->meta_write_pointer += eb->len; + while ((eb = eb_batch_next(&batch)) != NULL) + wait_on_extent_buffer_writeback(eb); + eb_batch_release(&batch); + cond_resched(); } - write_one_eb(eb, wbc); - free_extent_buffer(eb); - return 1; } int btree_write_cache_pages(struct address_space *mapping, @@ -1995,25 +2142,27 @@ int btree_write_cache_pages(struct address_space *mapping, int ret = 0; int done = 0; int nr_to_write_done = 0; - struct folio_batch fbatch; - unsigned int nr_folios; - pgoff_t index; - pgoff_t end; /* Inclusive */ + struct eb_batch batch; + unsigned int nr_ebs; + unsigned long index; + unsigned long end; int scanned = 0; xa_mark_t tag; - folio_batch_init(&fbatch); + eb_batch_init(&batch); if (wbc->range_cyclic) { - index = mapping->writeback_index; /* Start from prev offset */ + index = ((mapping->writeback_index << PAGE_SHIFT) >> fs_info->sectorsize_bits); end = -1; + /* * Start from the beginning does not need to cycle over the * range, mark it as scanned. 
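* Note the unit change versus the folio-based loop this replaces: @index and @end are buffer_tree slots in sectorsize units, hence the shifts above. A worked example, assuming 4K pages and 4K sectors: writeback_index 10 maps to slot 10, and a 16K node at bytenr 1M occupies the single slot 256 (the next node can then start at slot 260 or later). 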
*/ scanned = (index == 0); } else { - index = wbc->range_start >> PAGE_SHIFT; - end = wbc->range_end >> PAGE_SHIFT; + index = (wbc->range_start >> fs_info->sectorsize_bits); + end = (wbc->range_end >> fs_info->sectorsize_bits); + scanned = 1; } if (wbc->sync_mode == WB_SYNC_ALL) @@ -2023,31 +2172,39 @@ int btree_write_cache_pages(struct address_space *mapping, btrfs_zoned_meta_io_lock(fs_info); retry: if (wbc->sync_mode == WB_SYNC_ALL) - tag_pages_for_writeback(mapping, index, end); + buffer_tree_tag_for_writeback(fs_info, index, end); while (!done && !nr_to_write_done && (index <= end) && - (nr_folios = filemap_get_folios_tag(mapping, &index, end, - tag, &fbatch))) { - unsigned i; + (nr_ebs = buffer_tree_get_ebs_tag(fs_info, &index, end, tag, &batch))) { + struct extent_buffer *eb; - for (i = 0; i < nr_folios; i++) { - struct folio *folio = fbatch.folios[i]; + while ((eb = eb_batch_next(&batch)) != NULL) { + ctx.eb = eb; - ret = submit_eb_page(folio, &ctx); - if (ret == 0) + ret = btrfs_check_meta_write_pointer(eb->fs_info, &ctx); + if (ret) { + if (ret == -EBUSY) + ret = 0; + + if (ret) { + done = 1; + break; + } continue; - if (ret < 0) { - done = 1; - break; } - /* - * the filesystem may choose to bump up nr_to_write. - * We have to make sure to honor the new nr_to_write - * at any time - */ - nr_to_write_done = wbc->nr_to_write <= 0; + if (!lock_extent_buffer_for_io(eb, wbc)) + continue; + + /* Implies write in zoned mode. */ + if (ctx.zoned_bg) { + /* Mark the last eb in the block group. */ + btrfs_schedule_zone_finish_bg(ctx.zoned_bg, eb); + ctx.zoned_bg->meta_write_pointer += eb->len; + } + write_one_eb(eb, wbc); } - folio_batch_release(&fbatch); + nr_to_write_done = (wbc->nr_to_write <= 0); + eb_batch_release(&batch); cond_resched(); } if (!scanned && !done) { @@ -2192,10 +2349,8 @@ retry: done_index = folio_next_index(folio); /* * At this point we hold neither the i_pages lock nor - * the page lock: the page may be truncated or - * invalidated (changing page->mapping to NULL), - * or even swizzled back from swapper_space to - * tmpfs file mapping + * the folio lock: the folio may be truncated or + * invalidated (changing folio->mapping to NULL). */ if (!folio_trylock(folio)) { submit_write_bio(bio_ctrl, 0); @@ -2233,7 +2388,7 @@ retry: * regular submission. */ if (wbc->sync_mode != WB_SYNC_NONE || - btrfs_is_subpage(inode_to_fs_info(inode), mapping)) { + btrfs_is_subpage(inode_to_fs_info(inode), folio)) { if (folio_test_writeback(folio)) submit_write_bio(bio_ctrl, 0); folio_wait_writeback(folio); @@ -2314,8 +2469,8 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(end + 1, sectorsize)); while (cur <= end) { - u64 cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); - u32 cur_len = cur_end + 1 - cur; + u64 cur_end; + u32 cur_len; struct folio *folio; folio = filemap_get_folio(mapping, cur >> PAGE_SHIFT); @@ -2325,13 +2480,18 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f * code is just in case, but shouldn't actually be run. 
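* For the normal path below, the per-iteration bounds now come from the folio rather than a fixed PAGE_SIZE; a worked example, assuming a 16K folio at pos 64K and end = 96K - 1: cur_end = min(64K + 16K - 1, 96K - 1) = 80K - 1, so cur_len covers the whole folio in one pass. 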
*/ if (IS_ERR(folio)) { + cur_end = min(round_down(cur, PAGE_SIZE) + PAGE_SIZE - 1, end); + cur_len = cur_end + 1 - cur; btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL, cur, cur_len, false); mapping_set_error(mapping, PTR_ERR(folio)); - cur = cur_end + 1; + cur = cur_end; continue; } + cur_end = min_t(u64, folio_pos(folio) + folio_size(folio) - 1, end); + cur_len = cur_end + 1 - cur; + ASSERT(folio_test_locked(folio)); if (pages_dirty && folio != locked_folio) ASSERT(folio_test_dirty(folio)); @@ -2390,15 +2550,15 @@ void btrfs_readahead(struct readahead_control *rac) struct extent_map *em_cached = NULL; u64 prev_em_start = (u64)-1; - btrfs_lock_and_flush_ordered_range(inode, start, end, &cached_state); + lock_extents_for_read(inode, start, end, &cached_state); while ((folio = readahead_folio(rac)) != NULL) btrfs_do_readpage(folio, &em_cached, &bio_ctrl, &prev_em_start); - unlock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); if (em_cached) - free_extent_map(em_cached); + btrfs_free_extent_map(em_cached); submit_one_bio(&bio_ctrl); } @@ -2422,7 +2582,7 @@ int extent_invalidate_folio(struct extent_io_tree *tree, if (start > end) return 0; - lock_extent(tree, start, end, &cached_state); + btrfs_lock_extent(tree, start, end, &cached_state); folio_wait_writeback(folio); /* @@ -2430,46 +2590,54 @@ int extent_invalidate_folio(struct extent_io_tree *tree, * so here we only need to unlock the extent range to free any * existing extent state. */ - unlock_extent(tree, start, end, &cached_state); + btrfs_unlock_extent(tree, start, end, &cached_state); return 0; } /* - * a helper for release_folio, this tests for areas of the page that - * are locked or under IO and drops the related state bits if it is safe - * to drop the page. + * A helper for struct address_space_operations::release_folio, this tests for + * areas of the folio that are locked or under IO and drops the related state + * bits if it is safe to drop the folio. */ static bool try_release_extent_state(struct extent_io_tree *tree, struct folio *folio) { + struct extent_state *cached_state = NULL; u64 start = folio_pos(folio); - u64 end = start + PAGE_SIZE - 1; - bool ret; + u64 end = start + folio_size(folio) - 1; + u32 range_bits; + u32 clear_bits; + bool ret = false; + int ret2; - if (test_range_bit_exists(tree, start, end, EXTENT_LOCKED)) { - ret = false; - } else { - u32 clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | - EXTENT_DELALLOC_NEW | EXTENT_CTLBITS | - EXTENT_QGROUP_RESERVED); - int ret2; + btrfs_get_range_bits(tree, start, end, &range_bits, &cached_state); - /* - * At this point we can safely clear everything except the - * locked bit, the nodatasum bit and the delalloc new bit. - * The delalloc new bit will be cleared by ordered extent - * completion. - */ - ret2 = __clear_extent_bit(tree, start, end, clear_bits, NULL, NULL); + /* + * We can release the folio if it's locked only for ordered extent + * completion, since that doesn't require using the folio. + */ + if ((range_bits & EXTENT_LOCKED) && + !(range_bits & EXTENT_FINISHING_ORDERED)) + goto out; + + clear_bits = ~(EXTENT_LOCKED | EXTENT_NODATASUM | EXTENT_DELALLOC_NEW | + EXTENT_CTLBITS | EXTENT_QGROUP_RESERVED | + EXTENT_FINISHING_ORDERED); + /* + * At this point we can safely clear everything except the locked, + * nodatasum, delalloc new and finishing ordered bits. The delalloc new + * bit will be cleared by ordered extent completion. 
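+ * + * The release decision, summarized (illustrative): EXTENT_LOCKED clear -> release; EXTENT_LOCKED and EXTENT_FINISHING_ORDERED both set -> release; EXTENT_LOCKED set but EXTENT_FINISHING_ORDERED clear -> keep the folio. 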
+ */ + ret2 = btrfs_clear_extent_bit(tree, start, end, clear_bits, &cached_state); + /* + * If clear_extent_bit failed for enomem reasons, we can't allow the + * release to continue. + */ + if (ret2 == 0) + ret = true; +out: + btrfs_free_extent_state(cached_state); - /* if clear_extent_bit failed for enomem reasons, - * we can't allow the release to continue. - */ - if (ret2 < 0) - ret = false; - else - ret = true; - } return ret; } @@ -2481,7 +2649,7 @@ static bool try_release_extent_state(struct extent_io_tree *tree, bool try_release_extent_mapping(struct folio *folio, gfp_t mask) { u64 start = folio_pos(folio); - u64 end = start + PAGE_SIZE - 1; + u64 end = start + folio_size(folio) - 1; struct btrfs_inode *inode = folio_to_inode(folio); struct extent_io_tree *io_tree = &inode->io_tree; @@ -2492,18 +2660,19 @@ bool try_release_extent_mapping(struct folio *folio, gfp_t mask) struct extent_map *em; write_lock(&extent_tree->lock); - em = lookup_extent_mapping(extent_tree, start, len); + em = btrfs_lookup_extent_mapping(extent_tree, start, len); if (!em) { write_unlock(&extent_tree->lock); break; } if ((em->flags & EXTENT_FLAG_PINNED) || em->start != start) { write_unlock(&extent_tree->lock); - free_extent_map(em); + btrfs_free_extent_map(em); break; } - if (test_range_bit_exists(io_tree, em->start, - extent_map_end(em) - 1, EXTENT_LOCKED)) + if (btrfs_test_range_bit_exists(io_tree, em->start, + btrfs_extent_map_end(em) - 1, + EXTENT_LOCKED)) goto next; /* * If it's not in the list of modified extents, used by a fast @@ -2530,15 +2699,15 @@ remove_em: * fsync performance for workloads with a data size that exceeds * or is close to the system's memory). */ - remove_extent_mapping(inode, em); + btrfs_remove_extent_mapping(inode, em); /* Once for the inode's extent map tree. */ - free_extent_map(em); + btrfs_free_extent_map(em); next: - start = extent_map_end(em); + start = btrfs_extent_map_end(em); write_unlock(&extent_tree->lock); /* Once for us, for the lookup_extent_mapping() reference. */ - free_extent_map(em); + btrfs_free_extent_map(em); if (need_resched()) { /* @@ -2577,6 +2746,7 @@ static bool folio_range_has_eb(struct folio *folio) static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct folio *folio) { struct btrfs_fs_info *fs_info = eb->fs_info; + struct address_space *mapping = folio->mapping; const bool mapped = !test_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); /* @@ -2584,21 +2754,20 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * be done under the i_private_lock. */ if (mapped) - spin_lock(&folio->mapping->i_private_lock); + spin_lock(&mapping->i_private_lock); if (!folio_test_private(folio)) { if (mapped) - spin_unlock(&folio->mapping->i_private_lock); + spin_unlock(&mapping->i_private_lock); return; } - if (fs_info->nodesize >= PAGE_SIZE) { + if (!btrfs_meta_is_subpage(fs_info)) { /* - * We do this since we'll remove the pages after we've - * removed the eb from the radix tree, so we could race - * and have this page now attached to the new eb. So - * only clear folio if it's still connected to - * this eb. + * We do this since we'll remove the pages after we've removed + * the eb from the xarray, so we could race and have this page + * now attached to the new eb. So only clear folio if it's + * still connected to this eb. 
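+ * + * The race being guarded against, as a sketch: T1 removes eb A from the xarray and starts releasing A's folios; T2 allocates eb B at the same bytenr and attaches folio private = B; T1 must then see private != A and leave B's attachment alone. 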
*/ if (folio_test_private(folio) && folio_get_private(folio) == eb) { BUG_ON(test_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)); @@ -2608,7 +2777,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo folio_detach_private(folio); } if (mapped) - spin_unlock(&folio->mapping->i_private_lock); + spin_unlock(&mapping->i_private_lock); return; } @@ -2618,7 +2787,7 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * attached to one dummy eb, no sharing. */ if (!mapped) { - btrfs_detach_subpage(fs_info, folio); + btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA); return; } @@ -2629,9 +2798,9 @@ static void detach_extent_buffer_folio(const struct extent_buffer *eb, struct fo * page range and no unfinished IO. */ if (!folio_range_has_eb(folio)) - btrfs_detach_subpage(fs_info, folio); + btrfs_detach_subpage(fs_info, folio, BTRFS_SUBPAGE_METADATA); - spin_unlock(&folio->mapping->i_private_lock); + spin_unlock(&mapping->i_private_lock); } /* Release all folios attached to the extent buffer */ @@ -2646,9 +2815,6 @@ static void btrfs_release_extent_buffer_folios(const struct extent_buffer *eb) continue; detach_extent_buffer_folio(eb, folio); - - /* One for when we allocated the folio. */ - folio_put(folio); } } @@ -2662,15 +2828,14 @@ static inline void btrfs_release_extent_buffer(struct extent_buffer *eb) kmem_cache_free(extent_buffer_cache, eb); } -static struct extent_buffer * -__alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, - unsigned long len) +static struct extent_buffer *__alloc_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) { struct extent_buffer *eb = NULL; eb = kmem_cache_zalloc(extent_buffer_cache, GFP_NOFS|__GFP_NOFAIL); eb->start = start; - eb->len = len; + eb->len = fs_info->nodesize; eb->fs_info = fs_info; init_rwsem(&eb->lock); @@ -2679,18 +2844,36 @@ __alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, spin_lock_init(&eb->refs_lock); atomic_set(&eb->refs, 1); - ASSERT(len <= BTRFS_MAX_METADATA_BLOCKSIZE); + ASSERT(eb->len <= BTRFS_MAX_METADATA_BLOCKSIZE); return eb; } +/* + * For use in eb allocation error cleanup paths, as btrfs_release_extent_buffer() + * does not call folio_put(), and we need to set the folios to NULL so that + * btrfs_release_extent_buffer() will not detach them a second time. + */ +static void cleanup_extent_buffer_folios(struct extent_buffer *eb) +{ + const int num_folios = num_extent_folios(eb); + + /* We cannot use num_extent_folios() as the loop bound, as eb->folios changes. */ + for (int i = 0; i < num_folios; i++) { + ASSERT(eb->folios[i]); + detach_extent_buffer_folio(eb, eb->folios[i]); + folio_put(eb->folios[i]); + eb->folios[i] = NULL; + } +} + struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) { struct extent_buffer *new; - int num_folios = num_extent_folios(src); + int num_folios; int ret; - new = __alloc_extent_buffer(src->fs_info, src->start, src->len); + new = __alloc_extent_buffer(src->fs_info, src->start); if (new == NULL) return NULL; @@ -2702,78 +2885,78 @@ struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src) set_bit(EXTENT_BUFFER_UNMAPPED, &new->bflags); ret = alloc_eb_folio_array(new, false); - if (ret) { - btrfs_release_extent_buffer(new); - return NULL; - } + if (ret) + goto release_eb; + ASSERT(num_extent_folios(src) == num_extent_folios(new), + "%d != %d", num_extent_folios(src), num_extent_folios(new)); + /* Explicitly use the cached num_extent_folios() value from now on. 
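+ * num_extent_folios() derives its result from eb->folios[0], and the error paths below NULL out new->folios[] slots, so its return value would change mid-loop once cleanup starts. 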
*/ + num_folios = num_extent_folios(src); for (int i = 0; i < num_folios; i++) { struct folio *folio = new->folios[i]; ret = attach_extent_buffer_folio(new, folio, NULL); - if (ret < 0) { - btrfs_release_extent_buffer(new); - return NULL; - } + if (ret < 0) + goto cleanup_folios; WARN_ON(folio_test_dirty(folio)); } + for (int i = 0; i < num_folios; i++) + folio_put(new->folios[i]); + copy_extent_buffer_full(new, src); set_extent_buffer_uptodate(new); return new; + +cleanup_folios: + cleanup_extent_buffer_folios(new); +release_eb: + btrfs_release_extent_buffer(new); + return NULL; } -struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len) +struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, + u64 start) { struct extent_buffer *eb; - int num_folios = 0; int ret; - eb = __alloc_extent_buffer(fs_info, start, len); + eb = __alloc_extent_buffer(fs_info, start); if (!eb) return NULL; ret = alloc_eb_folio_array(eb, false); if (ret) - goto err; + goto release_eb; - num_folios = num_extent_folios(eb); - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { ret = attach_extent_buffer_folio(eb, eb->folios[i], NULL); if (ret < 0) - goto err; + goto cleanup_folios; } + for (int i = 0; i < num_extent_folios(eb); i++) + folio_put(eb->folios[i]); set_extent_buffer_uptodate(eb); btrfs_set_header_nritems(eb, 0); set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); return eb; -err: - for (int i = 0; i < num_folios; i++) { - if (eb->folios[i]) { - detach_extent_buffer_folio(eb, eb->folios[i]); - folio_put(eb->folios[i]); - } - } - kmem_cache_free(extent_buffer_cache, eb); - return NULL; -} -struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start) -{ - return __alloc_dummy_extent_buffer(fs_info, start, fs_info->nodesize); +cleanup_folios: + cleanup_extent_buffer_folios(eb); +release_eb: + btrfs_release_extent_buffer(eb); + return NULL; } static void check_buffer_tree_ref(struct extent_buffer *eb) { int refs; /* - * The TREE_REF bit is first set when the extent_buffer is added - * to the radix tree. It is also reset, if unset, when a new reference - * is created by find_extent_buffer. + * The TREE_REF bit is first set when the extent_buffer is added to the + * xarray. It is also reset, if unset, when a new reference is created + * by find_extent_buffer. * * It is only cleared in two cases: freeing the last non-tree * reference to the extent_buffer when its STALE bit is set or @@ -2785,13 +2968,12 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) * conditions between the calls to check_buffer_tree_ref in those * codepaths and clearing TREE_REF in try_release_extent_buffer. * - * The actual lifetime of the extent_buffer in the radix tree is - * adequately protected by the refcount, but the TREE_REF bit and - * its corresponding reference are not. To protect against this - * class of races, we call check_buffer_tree_ref from the codepaths - * which trigger io. Note that once io is initiated, TREE_REF can no - * longer be cleared, so that is the moment at which any such race is - * best fixed. + * The actual lifetime of the extent_buffer in the xarray is adequately + * protected by the refcount, but the TREE_REF bit and its corresponding + * reference are not. To protect against this class of races, we call + * check_buffer_tree_ref() from the code paths which trigger io. 
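+ * (In shorthand, the fast path below: refs >= 2 with TREE_REF set means both the xarray's reference and its bit are already in place, so nothing more needs to be taken.) 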
Note that + * once io is initiated, TREE_REF can no longer be cleared, so that is + * the moment at which any such race is best fixed. */ refs = atomic_read(&eb->refs); if (refs >= 2 && test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)) @@ -2805,11 +2987,9 @@ static void check_buffer_tree_ref(struct extent_buffer *eb) static void mark_extent_buffer_accessed(struct extent_buffer *eb) { - int num_folios= num_extent_folios(eb); - check_buffer_tree_ref(eb); - for (int i = 0; i < num_folios; i++) + for (int i = 0; i < num_extent_folios(eb); i++) folio_mark_accessed(eb->folios[i]); } @@ -2842,10 +3022,10 @@ struct extent_buffer *find_extent_buffer(struct btrfs_fs_info *fs_info, return eb; } -#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, u64 start) { +#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS struct extent_buffer *eb, *exists = NULL; int ret; @@ -2857,32 +3037,34 @@ struct extent_buffer *alloc_test_extent_buffer(struct btrfs_fs_info *fs_info, return ERR_PTR(-ENOMEM); eb->fs_info = fs_info; again: - ret = radix_tree_preload(GFP_NOFS); - if (ret) { - exists = ERR_PTR(ret); - goto free_eb; + xa_lock_irq(&fs_info->buffer_tree); + exists = __xa_cmpxchg(&fs_info->buffer_tree, start >> fs_info->sectorsize_bits, + NULL, eb, GFP_NOFS); + if (xa_is_err(exists)) { + ret = xa_err(exists); + xa_unlock_irq(&fs_info->buffer_tree); + btrfs_release_extent_buffer(eb); + return ERR_PTR(ret); } - spin_lock(&fs_info->buffer_lock); - ret = radix_tree_insert(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits, eb); - spin_unlock(&fs_info->buffer_lock); - radix_tree_preload_end(); - if (ret == -EEXIST) { - exists = find_extent_buffer(fs_info, start); - if (exists) - goto free_eb; - else + if (exists) { + if (!atomic_inc_not_zero(&exists->refs)) { + /* The extent buffer is being freed, retry. */ + xa_unlock_irq(&fs_info->buffer_tree); goto again; + } + xa_unlock_irq(&fs_info->buffer_tree); + btrfs_release_extent_buffer(eb); + return exists; } + xa_unlock_irq(&fs_info->buffer_tree); check_buffer_tree_ref(eb); - set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); return eb; -free_eb: - btrfs_release_extent_buffer(eb); - return exists; -} +#else + /* Stub to avoid linker error when compiled with optimizations turned off. */ + return NULL; #endif +} static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, struct folio *folio) @@ -2892,11 +3074,11 @@ static struct extent_buffer *grab_extent_buffer(struct btrfs_fs_info *fs_info, lockdep_assert_held(&folio->mapping->i_private_lock); /* - * For subpage case, we completely rely on radix tree to ensure we - * don't try to insert two ebs for the same bytenr. So here we always - * return NULL and just continue. + * For subpage case, we completely rely on xarray to ensure we don't try + * to insert two ebs for the same bytenr. So here we always return NULL + * and just continue. 
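+ * + * Deduplication instead happens at insert time; the pattern used by alloc_extent_buffer() below is roughly (illustrative): existing = __xa_cmpxchg(&fs_info->buffer_tree, index, NULL, eb, ...); if existing is non-NULL and atomic_inc_not_zero(&existing->refs) fails, retry (we raced with a free in progress); if it succeeds, use the existing eb (we lost the insertion race). 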
*/ - if (fs_info->nodesize < PAGE_SIZE) + if (btrfs_meta_is_subpage(fs_info)) return NULL; /* Page not yet attached to an extent buffer */ @@ -2928,10 +3110,9 @@ static bool check_eb_alignment(struct btrfs_fs_info *fs_info, u64 start) return true; } - if (fs_info->nodesize < PAGE_SIZE && - offset_in_page(start) + fs_info->nodesize > PAGE_SIZE) { + if (fs_info->nodesize < PAGE_SIZE && !IS_ALIGNED(start, fs_info->nodesize)) { btrfs_err(fs_info, - "tree block crosses page boundary, start %llu nodesize %u", + "tree block is not nodesize aligned, start %llu nodesize %u", start, fs_info->nodesize); return true; } @@ -2967,7 +3148,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, struct btrfs_fs_info *fs_info = eb->fs_info; struct address_space *mapping = fs_info->btree_inode->i_mapping; const unsigned long index = eb->start >> PAGE_SHIFT; - struct folio *existing_folio = NULL; + struct folio *existing_folio; int ret; ASSERT(found_eb_ret); @@ -2976,6 +3157,7 @@ static int attach_eb_folio_to_filemap(struct extent_buffer *eb, int i, ASSERT(eb->folios[i]); retry: + existing_folio = NULL; ret = filemap_add_folio(mapping, eb->folios[i], index + i, GFP_NOFS | __GFP_NOFAIL); if (!ret) @@ -2983,10 +3165,8 @@ retry: existing_folio = filemap_lock_folio(mapping, index + i); /* The page cache only exists for a very short time, just retry. */ - if (IS_ERR(existing_folio)) { - existing_folio = NULL; + if (IS_ERR(existing_folio)) goto retry; - } /* For now, we should only have single-page folios for btree inode. */ ASSERT(folio_nr_pages(existing_folio) == 1); @@ -2999,7 +3179,7 @@ retry: finish: spin_lock(&mapping->i_private_lock); - if (existing_folio && fs_info->nodesize < PAGE_SIZE) { + if (existing_folio && btrfs_meta_is_subpage(fs_info)) { /* We're going to reuse the existing page, can drop our folio now. */ __free_page(folio_page(eb->folios[i], 0)); eb->folios[i] = existing_folio; @@ -3027,7 +3207,7 @@ finish: /* * To inform we have an extra eb under allocation, so that * detach_extent_buffer_page() won't release the folio private when the - * eb hasn't been inserted into radix tree yet. + * eb hasn't been inserted into the xarray yet. * * The ref will be decreased when the eb releases the page, in * detach_extent_buffer_page(). Thus needs no special handling in the @@ -3041,8 +3221,6 @@ finish: struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level) { - unsigned long len = fs_info->nodesize; - int num_folios; int attached = 0; struct extent_buffer *eb; struct extent_buffer *existing_eb = NULL; @@ -3070,7 +3248,7 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, if (eb) return eb; - eb = __alloc_extent_buffer(fs_info, start, len); + eb = __alloc_extent_buffer(fs_info, start); if (!eb) return ERR_PTR(-ENOMEM); @@ -3090,8 +3268,8 @@ struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, * The memory will be freed by attach_extent_buffer_page() or freed * manually if we exit earlier. */ - if (fs_info->nodesize < PAGE_SIZE) { - prealloc = btrfs_alloc_subpage(fs_info, BTRFS_SUBPAGE_METADATA); + if (btrfs_meta_is_subpage(fs_info)) { + prealloc = btrfs_alloc_subpage(fs_info, PAGE_SIZE, BTRFS_SUBPAGE_METADATA); if (IS_ERR(prealloc)) { ret = PTR_ERR(prealloc); goto out; @@ -3106,9 +3284,8 @@ reallocate: goto out; } - num_folios = num_extent_folios(eb); /* Attach all pages to the filemap. 
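* Each iteration may race with a concurrent allocator of the same eb: attach_eb_folio_to_filemap() bounces between filemap_add_folio() and filemap_lock_folio() until one side wins, and may hand back an already-inserted eb via @found_eb_ret (handled as existing_eb below). 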
*/ - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio; ret = attach_eb_folio_to_filemap(eb, i, prealloc, &existing_eb); @@ -3137,7 +3314,7 @@ reallocate: * using 0-order folios. */ if (unlikely(ret == -EAGAIN)) { - ASSERT(0); + DEBUG_WARN("folio order mismatch between new eb and filemap"); goto reallocate; } attached++; @@ -3148,7 +3325,7 @@ reallocate: * and free the allocated page. */ folio = eb->folios[i]; - WARN_ON(btrfs_folio_test_dirty(fs_info, folio, eb->start, eb->len)); + WARN_ON(btrfs_meta_folio_test_dirty(folio, eb)); /* * Check if the current page is physically contiguous with previous eb @@ -3159,15 +3336,14 @@ reallocate: if (i && folio_page(eb->folios[i - 1], 0) + 1 != folio_page(folio, 0)) page_contig = false; - if (!btrfs_folio_test_uptodate(fs_info, folio, eb->start, eb->len)) + if (!btrfs_meta_folio_test_uptodate(folio, eb)) uptodate = 0; /* * We can't unlock the pages just yet since the extent buffer - * hasn't been properly inserted in the radix tree, this - * opens a race with btree_release_folio which can free a page - * while we are still filling in all pages for the buffer and - * we could crash. + * hasn't been properly inserted into the xarray, this opens a + * race with btree_release_folio() which can free a page while we + * are still filling in all pages for the buffer and we could crash. */ } if (uptodate) @@ -3176,34 +3352,42 @@ reallocate: if (page_contig) eb->addr = folio_address(eb->folios[0]) + offset_in_page(eb->start); again: - ret = radix_tree_preload(GFP_NOFS); - if (ret) + xa_lock_irq(&fs_info->buffer_tree); + existing_eb = __xa_cmpxchg(&fs_info->buffer_tree, + start >> fs_info->sectorsize_bits, NULL, eb, + GFP_NOFS); + if (xa_is_err(existing_eb)) { + ret = xa_err(existing_eb); + xa_unlock_irq(&fs_info->buffer_tree); goto out; - - spin_lock(&fs_info->buffer_lock); - ret = radix_tree_insert(&fs_info->buffer_radix, - start >> fs_info->sectorsize_bits, eb); - spin_unlock(&fs_info->buffer_lock); - radix_tree_preload_end(); - if (ret == -EEXIST) { - ret = 0; - existing_eb = find_extent_buffer(fs_info, start); - if (existing_eb) - goto out; - else + } + if (existing_eb) { + if (!atomic_inc_not_zero(&existing_eb->refs)) { + xa_unlock_irq(&fs_info->buffer_tree); goto again; + } + xa_unlock_irq(&fs_info->buffer_tree); + goto out; } + xa_unlock_irq(&fs_info->buffer_tree); + /* add one reference for the tree */ check_buffer_tree_ref(eb); - set_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags); /* * Now it's safe to unlock the pages because any calls to * btree_release_folio will correctly detect that a page belongs to a * live buffer and won't free them prematurely. */ - for (int i = 0; i < num_folios; i++) + for (int i = 0; i < num_extent_folios(eb); i++) { folio_unlock(eb->folios[i]); + /* + * A folio that has been added to an address_space mapping + * should not continue holding the refcount from its original + * allocation indefinitely. + */ + folio_put(eb->folios[i]); + } return eb; out: @@ -3217,26 +3401,22 @@ out: * want that to grab this eb, as we're getting ready to free it. So we * have to detach it first and then unlock it. * - * We have to drop our reference and NULL it out here because in the - * subpage case detaching does a btrfs_folio_dec_eb_refs() for our eb. - * Below when we call btrfs_release_extent_buffer() we will call - * detach_extent_buffer_folio() on our remaining pages in the !subpage - * case. 
If we left eb->folios[i] populated in the subpage case we'd - * double put our reference and be super sad. + * Note: the bound is num_extent_pages() as we need to go through all slots. */ - for (int i = 0; i < attached; i++) { - ASSERT(eb->folios[i]); - detach_extent_buffer_folio(eb, eb->folios[i]); - folio_unlock(eb->folios[i]); - folio_put(eb->folios[i]); + for (int i = 0; i < num_extent_pages(eb); i++) { + struct folio *folio = eb->folios[i]; + + if (i < attached) { + ASSERT(folio); + detach_extent_buffer_folio(eb, folio); + folio_unlock(folio); + } else if (!folio) { + continue; + } + + folio_put(folio); eb->folios[i] = NULL; } - /* - * Now all pages of that extent buffer is unmapped, set UNMAPPED flag, - * so it can be cleaned up without utilizing page->mapping. - */ - set_bit(EXTENT_BUFFER_UNMAPPED, &eb->bflags); - btrfs_release_extent_buffer(eb); if (ret < 0) return ERR_PTR(ret); @@ -3259,18 +3439,27 @@ static int release_extent_buffer(struct extent_buffer *eb) WARN_ON(atomic_read(&eb->refs) == 0); if (atomic_dec_and_test(&eb->refs)) { - if (test_and_clear_bit(EXTENT_BUFFER_IN_TREE, &eb->bflags)) { - struct btrfs_fs_info *fs_info = eb->fs_info; + struct btrfs_fs_info *fs_info = eb->fs_info; - spin_unlock(&eb->refs_lock); + spin_unlock(&eb->refs_lock); - spin_lock(&fs_info->buffer_lock); - radix_tree_delete(&fs_info->buffer_radix, - eb->start >> fs_info->sectorsize_bits); - spin_unlock(&fs_info->buffer_lock); - } else { - spin_unlock(&eb->refs_lock); - } + /* + * We're erasing; theoretically there will be no allocations, so + * just use GFP_ATOMIC. + * + * We use cmpxchg instead of erase because we do not know if + * this eb is actually in the tree or not; we could be cleaning + * up an eb that we allocated but never inserted into the tree. + * Thus use cmpxchg to remove it from the tree if it is there, + * or leave the other entry if this isn't in the tree. + * + * The documentation says that putting a NULL value is the same + * as erase as long as XA_FLAGS_ALLOC is not set, which it isn't + * in this case. + */ + xa_cmpxchg_irq(&fs_info->buffer_tree, + eb->start >> fs_info->sectorsize_bits, eb, NULL, + GFP_ATOMIC); btrfs_leak_debug_del_eb(eb); /* Should be safe to release folios at this point. */ @@ -3333,38 +3522,21 @@ void free_extent_buffer_stale(struct extent_buffer *eb) release_extent_buffer(eb); } -static void btree_clear_folio_dirty(struct folio *folio) +static void btree_clear_folio_dirty_tag(struct folio *folio) { - ASSERT(folio_test_dirty(folio)); + ASSERT(!folio_test_dirty(folio)); ASSERT(folio_test_locked(folio)); - folio_clear_dirty_for_io(folio); xa_lock_irq(&folio->mapping->i_pages); if (!folio_test_dirty(folio)) - __xa_clear_mark(&folio->mapping->i_pages, - folio_index(folio), PAGECACHE_TAG_DIRTY); + __xa_clear_mark(&folio->mapping->i_pages, folio->index, + PAGECACHE_TAG_DIRTY); xa_unlock_irq(&folio->mapping->i_pages); } -static void clear_subpage_extent_buffer_dirty(const struct extent_buffer *eb) -{ - struct btrfs_fs_info *fs_info = eb->fs_info; - struct folio *folio = eb->folios[0]; - bool last; - - /* btree_clear_folio_dirty() needs page locked. 
*/ - folio_lock(folio); - last = btrfs_subpage_clear_and_test_dirty(fs_info, folio, eb->start, eb->len); - if (last) - btree_clear_folio_dirty(folio); - folio_unlock(folio); - WARN_ON(atomic_read(&eb->refs) == 0); -} - void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, struct extent_buffer *eb) { struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios; btrfs_assert_tree_write_locked(eb); @@ -3388,20 +3560,20 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, if (!test_and_clear_bit(EXTENT_BUFFER_DIRTY, &eb->bflags)) return; + buffer_tree_clear_mark(eb, PAGECACHE_TAG_DIRTY); percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, -eb->len, fs_info->dirty_metadata_batch); - if (eb->fs_info->nodesize < PAGE_SIZE) - return clear_subpage_extent_buffer_dirty(eb); - - num_folios = num_extent_folios(eb); - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; + bool last; if (!folio_test_dirty(folio)) continue; folio_lock(folio); - btree_clear_folio_dirty(folio); + last = btrfs_meta_folio_clear_and_test_dirty(folio, eb); + if (last) + btree_clear_folio_dirty_tag(folio); folio_unlock(folio); } WARN_ON(atomic_read(&eb->refs) == 0); @@ -3409,37 +3581,35 @@ void btrfs_clear_buffer_dirty(struct btrfs_trans_handle *trans, void set_extent_buffer_dirty(struct extent_buffer *eb) { - int num_folios; bool was_dirty; check_buffer_tree_ref(eb); was_dirty = test_and_set_bit(EXTENT_BUFFER_DIRTY, &eb->bflags); - num_folios = num_extent_folios(eb); WARN_ON(atomic_read(&eb->refs) == 0); WARN_ON(!test_bit(EXTENT_BUFFER_TREE_REF, &eb->bflags)); WARN_ON(test_bit(EXTENT_BUFFER_ZONED_ZEROOUT, &eb->bflags)); if (!was_dirty) { - bool subpage = eb->fs_info->nodesize < PAGE_SIZE; + bool subpage = btrfs_meta_is_subpage(eb->fs_info); /* * For subpage case, we can have other extent buffers in the - * same page, and in clear_subpage_extent_buffer_dirty() we + * same page, and in clear_extent_buffer_dirty() we * have to clear page dirty without subpage lock held. * This can cause race where our page gets dirty cleared after * we just set it. * - * Thankfully, clear_subpage_extent_buffer_dirty() has locked + * Thankfully, clear_extent_buffer_dirty() has locked * its page for other reasons, we can use page lock to prevent * the above race. */ if (subpage) folio_lock(eb->folios[0]); - for (int i = 0; i < num_folios; i++) - btrfs_folio_set_dirty(eb->fs_info, eb->folios[i], - eb->start, eb->len); + for (int i = 0; i < num_extent_folios(eb); i++) + btrfs_meta_folio_set_dirty(eb->folios[i], eb); + buffer_tree_set_mark(eb, PAGECACHE_TAG_DIRTY); if (subpage) folio_unlock(eb->folios[0]); percpu_counter_add_batch(&eb->fs_info->dirty_metadata_bytes, @@ -3447,54 +3617,31 @@ void set_extent_buffer_dirty(struct extent_buffer *eb) eb->fs_info->dirty_metadata_batch); } #ifdef CONFIG_BTRFS_DEBUG - for (int i = 0; i < num_folios; i++) + for (int i = 0; i < num_extent_folios(eb); i++) ASSERT(folio_test_dirty(eb->folios[i])); #endif } void clear_extent_buffer_uptodate(struct extent_buffer *eb) { - struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios = num_extent_folios(eb); clear_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - for (int i = 0; i < num_folios; i++) { + for (int i = 0; i < num_extent_folios(eb); i++) { struct folio *folio = eb->folios[i]; if (!folio) continue; - /* - * This is special handling for metadata subpage, as regular - * btrfs_is_subpage() can not handle cloned/dummy metadata. 
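The btrfs_meta_folio_*() calls above fold the old `nodesize < PAGE_SIZE` branching into one helper per operation. The helpers themselves are outside this excerpt; a plausible shape, shown only as an assumption, is:

    /* Assumed shape of the dispatch (the real helpers live elsewhere in
     * this series): whole-folio flag for regular metadata, per-range
     * subpage bitmap otherwise. */
    static inline bool btrfs_meta_folio_test_uptodate(struct folio *folio,
                                                      const struct extent_buffer *eb)
    {
            if (!btrfs_meta_is_subpage(eb->fs_info))
                    return folio_test_uptodate(folio);
            return btrfs_subpage_test_uptodate(eb->fs_info, folio,
                                               eb->start, eb->len);
    }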
- */ - if (fs_info->nodesize >= PAGE_SIZE) - folio_clear_uptodate(folio); - else - btrfs_subpage_clear_uptodate(fs_info, folio, - eb->start, eb->len); + btrfs_meta_folio_clear_uptodate(folio, eb); } } void set_extent_buffer_uptodate(struct extent_buffer *eb) { - struct btrfs_fs_info *fs_info = eb->fs_info; - int num_folios = num_extent_folios(eb); set_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags); - for (int i = 0; i < num_folios; i++) { - struct folio *folio = eb->folios[i]; - - /* - * This is special handling for metadata subpage, as regular - * btrfs_is_subpage() can not handle cloned/dummy metadata. - */ - if (fs_info->nodesize >= PAGE_SIZE) - folio_mark_uptodate(folio); - else - btrfs_subpage_set_uptodate(fs_info, folio, - eb->start, eb->len); - } + for (int i = 0; i < num_extent_folios(eb); i++) + btrfs_meta_folio_set_uptodate(eb->folios[i], eb); } static void clear_extent_buffer_reading(struct extent_buffer *eb) @@ -3507,10 +3654,7 @@ static void clear_extent_buffer_reading(struct extent_buffer *eb) static void end_bbio_meta_read(struct btrfs_bio *bbio) { struct extent_buffer *eb = bbio->private; - struct btrfs_fs_info *fs_info = eb->fs_info; bool uptodate = !bbio->bio.bi_status; - struct folio_iter fi; - u32 bio_offset = 0; /* * If the extent buffer is marked UPTODATE before the read operation @@ -3525,25 +3669,10 @@ static void end_bbio_meta_read(struct btrfs_bio *bbio) btrfs_validate_extent_buffer(eb, &bbio->parent_check) < 0) uptodate = false; - if (uptodate) { + if (uptodate) set_extent_buffer_uptodate(eb); - } else { + else clear_extent_buffer_uptodate(eb); - set_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); - } - - bio_for_each_folio_all(fi, &bbio->bio) { - struct folio *folio = fi.folio; - u64 start = eb->start + bio_offset; - u32 len = fi.length; - - if (uptodate) - btrfs_folio_set_uptodate(fs_info, folio, start, len); - else - btrfs_folio_clear_uptodate(fs_info, folio, start, len); - - bio_offset += len; - } clear_extent_buffer_reading(eb); free_extent_buffer(eb); @@ -3555,7 +3684,6 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, const struct btrfs_tree_parent_check *check) { struct btrfs_bio *bbio; - bool ret; if (test_bit(EXTENT_BUFFER_UPTODATE, &eb->bflags)) return 0; @@ -3583,7 +3711,6 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, return 0; } - clear_bit(EXTENT_BUFFER_READ_ERR, &eb->bflags); eb->read_mirror = 0; check_buffer_tree_ref(eb); atomic_inc(&eb->refs); @@ -3595,19 +3722,14 @@ int read_extent_buffer_pages_nowait(struct extent_buffer *eb, int mirror_num, bbio->inode = BTRFS_I(eb->fs_info->btree_inode); bbio->file_offset = eb->start; memcpy(&bbio->parent_check, check, sizeof(*check)); - if (eb->fs_info->nodesize < PAGE_SIZE) { - ret = bio_add_folio(&bbio->bio, eb->folios[0], eb->len, - eb->start - folio_pos(eb->folios[0])); - ASSERT(ret); - } else { - int num_folios = num_extent_folios(eb); - - for (int i = 0; i < num_folios; i++) { - struct folio *folio = eb->folios[i]; + for (int i = 0; i < num_extent_folios(eb); i++) { + struct folio *folio = eb->folios[i]; + u64 range_start = max_t(u64, eb->start, folio_pos(folio)); + u32 range_len = min_t(u64, folio_pos(folio) + folio_size(folio), + eb->start + eb->len) - range_start; - ret = bio_add_folio(&bbio->bio, folio, eb->folio_size, 0); - ASSERT(ret); - } + bio_add_folio_nofail(&bbio->bio, folio, range_len, + offset_in_folio(folio, range_start)); } btrfs_submit_bbio(bbio, mirror_num); return 0; @@ -3634,7 +3756,7 @@ static bool report_eb_range(const struct 
extent_buffer *eb, unsigned long start, btrfs_warn(eb->fs_info, "access to eb bytenr %llu len %u out of range start %lu len %lu", eb->start, eb->len, start, len); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN(); return true; } @@ -3796,7 +3918,7 @@ static void assert_eb_folio_uptodate(const struct extent_buffer *eb, int i) if (test_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) return; - if (fs_info->nodesize < PAGE_SIZE) { + if (btrfs_meta_is_subpage(fs_info)) { folio = eb->folios[0]; ASSERT(i == 0); if (WARN_ON(!btrfs_subpage_test_uptodate(fs_info, folio, @@ -4170,71 +4292,17 @@ void memmove_extent_buffer(const struct extent_buffer *dst, } } -#define GANG_LOOKUP_SIZE 16 -static struct extent_buffer *get_next_extent_buffer( - const struct btrfs_fs_info *fs_info, struct folio *folio, u64 bytenr) -{ - struct extent_buffer *gang[GANG_LOOKUP_SIZE]; - struct extent_buffer *found = NULL; - u64 folio_start = folio_pos(folio); - u64 cur = folio_start; - - ASSERT(in_range(bytenr, folio_start, PAGE_SIZE)); - lockdep_assert_held(&fs_info->buffer_lock); - - while (cur < folio_start + PAGE_SIZE) { - int ret; - int i; - - ret = radix_tree_gang_lookup(&fs_info->buffer_radix, - (void **)gang, cur >> fs_info->sectorsize_bits, - min_t(unsigned int, GANG_LOOKUP_SIZE, - PAGE_SIZE / fs_info->nodesize)); - if (ret == 0) - goto out; - for (i = 0; i < ret; i++) { - /* Already beyond page end */ - if (gang[i]->start >= folio_start + PAGE_SIZE) - goto out; - /* Found one */ - if (gang[i]->start >= bytenr) { - found = gang[i]; - goto out; - } - } - cur = gang[ret - 1]->start + gang[ret - 1]->len; - } -out: - return found; -} - static int try_release_subpage_extent_buffer(struct folio *folio) { struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); - u64 cur = folio_pos(folio); - const u64 end = cur + PAGE_SIZE; + struct extent_buffer *eb; + unsigned long start = (folio_pos(folio) >> fs_info->sectorsize_bits); + unsigned long index = start; + unsigned long end = index + (PAGE_SIZE >> fs_info->sectorsize_bits) - 1; int ret; - while (cur < end) { - struct extent_buffer *eb = NULL; - - /* - * Unlike try_release_extent_buffer() which uses folio private - * to grab buffer, for subpage case we rely on radix tree, thus - * we need to ensure radix tree consistency. - * - * We also want an atomic snapshot of the radix tree, thus go - * with spinlock rather than RCU. - */ - spin_lock(&fs_info->buffer_lock); - eb = get_next_extent_buffer(fs_info, folio, cur); - if (!eb) { - /* No more eb in the page range after or at cur */ - spin_unlock(&fs_info->buffer_lock); - break; - } - cur = eb->start + eb->len; - + xa_lock_irq(&fs_info->buffer_tree); + xa_for_each_range(&fs_info->buffer_tree, index, eb, start, end) { /* * The same as try_release_extent_buffer(), to ensure the eb * won't disappear out from under us. @@ -4242,10 +4310,9 @@ static int try_release_subpage_extent_buffer(struct folio *folio) spin_lock(&eb->refs_lock); if (atomic_read(&eb->refs) != 1 || extent_buffer_under_io(eb)) { spin_unlock(&eb->refs_lock); - spin_unlock(&fs_info->buffer_lock); - break; + continue; } - spin_unlock(&fs_info->buffer_lock); + xa_unlock_irq(&fs_info->buffer_tree); /* * If tree ref isn't set then we know the ref on this eb is a @@ -4263,7 +4330,10 @@ static int try_release_subpage_extent_buffer(struct folio *folio) * release_extent_buffer() will release the refs_lock. 
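The index range above follows from buffer_tree's sector granularity: each slot covers one sector, so a folio spans (PAGE_SIZE >> sectorsize_bits) consecutive slots. For example:

    /* Assuming 4K sectorsize (sectorsize_bits = 12) on a 64K-page arch: */
    unsigned long start = folio_pos(folio) >> 12;      /* 1 MiB folio -> 256 */
    unsigned long end   = start + (SZ_64K >> 12) - 1;  /* 16 slots    -> 271 */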
*/ release_extent_buffer(eb); + xa_lock_irq(&fs_info->buffer_tree); } + xa_unlock_irq(&fs_info->buffer_tree); + /* * Finally to check if we have cleared folio private, as if we have * released all ebs in the page, the folio private should be cleared now. @@ -4282,7 +4352,7 @@ int try_release_extent_buffer(struct folio *folio) { struct extent_buffer *eb; - if (folio_to_fs_info(folio)->nodesize < PAGE_SIZE) + if (btrfs_meta_is_subpage(folio_to_fs_info(folio))) return try_release_subpage_extent_buffer(folio); /* diff --git a/fs/btrfs/extent_io.h b/fs/btrfs/extent_io.h index 6c5328bfabc2..e36e8d6a00bc 100644 --- a/fs/btrfs/extent_io.h +++ b/fs/btrfs/extent_io.h @@ -38,16 +38,10 @@ struct btrfs_tree_parent_check; enum { EXTENT_BUFFER_UPTODATE, EXTENT_BUFFER_DIRTY, - EXTENT_BUFFER_CORRUPT, - /* this got triggered by readahead */ - EXTENT_BUFFER_READAHEAD, EXTENT_BUFFER_TREE_REF, EXTENT_BUFFER_STALE, EXTENT_BUFFER_WRITEBACK, - /* read IO error */ - EXTENT_BUFFER_READ_ERR, EXTENT_BUFFER_UNMAPPED, - EXTENT_BUFFER_IN_TREE, /* write IO error */ EXTENT_BUFFER_WRITE_ERR, /* Indicate the extent buffer is written zeroed out (for zoned) */ @@ -79,7 +73,7 @@ enum { * single word in a bitmap may straddle two pages in the extent buffer. */ #define BIT_BYTE(nr) ((nr) / BITS_PER_BYTE) -#define BYTE_MASK ((1 << BITS_PER_BYTE) - 1) +#define BYTE_MASK ((1U << BITS_PER_BYTE) - 1) #define BITMAP_FIRST_BYTE_MASK(start) \ ((BYTE_MASK << ((start) & (BITS_PER_BYTE - 1))) & BYTE_MASK) #define BITMAP_LAST_BYTE_MASK(nbits) \ @@ -246,14 +240,13 @@ void extent_write_locked_range(struct inode *inode, const struct folio *locked_f int btrfs_writepages(struct address_space *mapping, struct writeback_control *wbc); int btree_write_cache_pages(struct address_space *mapping, struct writeback_control *wbc); +void btrfs_btree_wait_writeback_range(struct btrfs_fs_info *fs_info, u64 start, u64 end); void btrfs_readahead(struct readahead_control *rac); int set_folio_extent_mapped(struct folio *folio); void clear_folio_extent_mapped(struct folio *folio); struct extent_buffer *alloc_extent_buffer(struct btrfs_fs_info *fs_info, u64 start, u64 owner_root, int level); -struct extent_buffer *__alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, - u64 start, unsigned long len); struct extent_buffer *alloc_dummy_extent_buffer(struct btrfs_fs_info *fs_info, u64 start); struct extent_buffer *btrfs_clone_extent_buffer(const struct extent_buffer *src); @@ -276,7 +269,8 @@ void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr, u64 owner_root, u64 gen, int level); void btrfs_readahead_node_child(struct extent_buffer *node, int slot); -static inline int num_extent_pages(const struct extent_buffer *eb) +/* Note: this can be used in for loops without caching the value in a variable. */ +static inline int __pure num_extent_pages(const struct extent_buffer *eb) { /* * For sectorsize == PAGE_SIZE case, since nodesize is always aligned to @@ -294,9 +288,13 @@ static inline int num_extent_pages(const struct extent_buffer *eb) * As we can have either one large folio covering the whole eb * (either nodesize <= PAGE_SIZE, or high order folio), or multiple * single-paged folios. + * + * Note: this can be used in for loops without caching the value in a variable. 
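The __pure annotation added above declares that the function's result depends only on its arguments and readable memory, with no side effects, which is what makes the uncached use in loop bounds cheap. Assuming a GCC/Clang build, the compiler may treat:

    for (int i = 0; i < num_extent_folios(eb); i++)
            folio_unlock(eb->folios[i]);

    /* ...as if the bound had been hoisted by hand: */
    const int n = num_extent_folios(eb);
    for (int i = 0; i < n; i++)
            folio_unlock(eb->folios[i]);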
*/ -static inline int num_extent_folios(const struct extent_buffer *eb) +static inline int __pure num_extent_folios(const struct extent_buffer *eb) { + if (!eb->folios[0]) + return 0; if (folio_order(eb->folios[0])) return 1; return num_extent_pages(eb); diff --git a/fs/btrfs/extent_map.c b/fs/btrfs/extent_map.c index 67ce85ff0ae2..02bfdb976e40 100644 --- a/fs/btrfs/extent_map.c +++ b/fs/btrfs/extent_map.c @@ -13,7 +13,7 @@ static struct kmem_cache *extent_map_cache; -int __init extent_map_init(void) +int __init btrfs_extent_map_init(void) { extent_map_cache = kmem_cache_create("btrfs_extent_map", sizeof(struct extent_map), 0, 0, NULL); @@ -22,7 +22,7 @@ int __init extent_map_init(void) return 0; } -void __cold extent_map_exit(void) +void __cold btrfs_extent_map_exit(void) { kmem_cache_destroy(extent_map_cache); } @@ -31,7 +31,7 @@ void __cold extent_map_exit(void) * Initialize the extent tree @tree. Should be called for each new inode or * other user of the extent_map interface. */ -void extent_map_tree_init(struct extent_map_tree *tree) +void btrfs_extent_map_tree_init(struct extent_map_tree *tree) { tree->root = RB_ROOT; INIT_LIST_HEAD(&tree->modified_extents); @@ -42,7 +42,7 @@ void extent_map_tree_init(struct extent_map_tree *tree) * Allocate a new extent_map structure. The new structure is returned with a * reference count of one and needs to be freed using free_extent_map() */ -struct extent_map *alloc_extent_map(void) +struct extent_map *btrfs_alloc_extent_map(void) { struct extent_map *em; em = kmem_cache_zalloc(extent_map_cache, GFP_NOFS); @@ -58,12 +58,12 @@ struct extent_map *alloc_extent_map(void) * Drop the reference out on @em by one and free the structure if the reference * count hits zero. */ -void free_extent_map(struct extent_map *em) +void btrfs_free_extent_map(struct extent_map *em) { if (!em) return; if (refcount_dec_and_test(&em->refs)) { - WARN_ON(extent_map_in_tree(em)); + WARN_ON(btrfs_extent_map_in_tree(em)); WARN_ON(!list_empty(&em->list)); kmem_cache_free(extent_map_cache, em); } @@ -102,19 +102,19 @@ static int tree_insert(struct rb_root *root, struct extent_map *em) if (em->start < entry->start) p = &(*p)->rb_left; - else if (em->start >= extent_map_end(entry)) + else if (em->start >= btrfs_extent_map_end(entry)) p = &(*p)->rb_right; else return -EEXIST; } orig_parent = parent; - while (parent && em->start >= extent_map_end(entry)) { + while (parent && em->start >= btrfs_extent_map_end(entry)) { parent = rb_next(parent); entry = rb_entry(parent, struct extent_map, rb_node); } if (parent) - if (end > entry->start && em->start < extent_map_end(entry)) + if (end > entry->start && em->start < btrfs_extent_map_end(entry)) return -EEXIST; parent = orig_parent; @@ -124,7 +124,7 @@ static int tree_insert(struct rb_root *root, struct extent_map *em) entry = rb_entry(parent, struct extent_map, rb_node); } if (parent) - if (end > entry->start && em->start < extent_map_end(entry)) + if (end > entry->start && em->start < btrfs_extent_map_end(entry)) return -EEXIST; rb_link_node(&em->rb_node, orig_parent, p); @@ -136,8 +136,8 @@ static int tree_insert(struct rb_root *root, struct extent_map *em) * Search through the tree for an extent_map with a given offset. 
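tree_insert() above is the standard kernel rbtree descend-and-link pattern with an overlap check in the comparator; distilled (the extra neighbor re-checks in the function are omitted here):

    struct rb_node **p = &root->rb_node;
    struct rb_node *parent = NULL;

    while (*p) {
            struct extent_map *entry;

            parent = *p;
            entry = rb_entry(parent, struct extent_map, rb_node);
            if (em->start < entry->start)
                    p = &(*p)->rb_left;
            else if (em->start >= btrfs_extent_map_end(entry))
                    p = &(*p)->rb_right;
            else
                    return -EEXIST;  /* new range overlaps @entry */
    }
    rb_link_node(&em->rb_node, parent, p);
    rb_insert_color(&em->rb_node, root);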
If it can't * be found, try to find some neighboring extents */ -static struct rb_node *__tree_search(struct rb_root *root, u64 offset, - struct rb_node **prev_or_next_ret) +static struct rb_node *tree_search(struct rb_root *root, u64 offset, + struct rb_node **prev_or_next_ret) { struct rb_node *n = root->rb_node; struct rb_node *prev = NULL; @@ -154,14 +154,14 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, if (offset < entry->start) n = n->rb_left; - else if (offset >= extent_map_end(entry)) + else if (offset >= btrfs_extent_map_end(entry)) n = n->rb_right; else return n; } orig_prev = prev; - while (prev && offset >= extent_map_end(prev_entry)) { + while (prev && offset >= btrfs_extent_map_end(prev_entry)) { prev = rb_next(prev); prev_entry = rb_entry(prev, struct extent_map, rb_node); } @@ -188,14 +188,14 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset, static inline u64 extent_map_block_len(const struct extent_map *em) { - if (extent_map_is_compressed(em)) + if (btrfs_extent_map_is_compressed(em)) return em->disk_num_bytes; return em->len; } static inline u64 extent_map_block_end(const struct extent_map *em) { - const u64 block_start = extent_map_block_start(em); + const u64 block_start = btrfs_extent_map_block_start(em); const u64 block_end = block_start + extent_map_block_len(em); if (block_end < block_start) @@ -210,7 +210,7 @@ static bool can_merge_extent_map(const struct extent_map *em) return false; /* Don't merge compressed extents, we need to know their actual size. */ - if (extent_map_is_compressed(em)) + if (btrfs_extent_map_is_compressed(em)) return false; if (em->flags & EXTENT_FLAG_LOGGING) @@ -230,7 +230,7 @@ static bool can_merge_extent_map(const struct extent_map *em) /* Check to see if two extent_map structs are adjacent and safe to merge. */ static bool mergeable_maps(const struct extent_map *prev, const struct extent_map *next) { - if (extent_map_end(prev) != next->start) + if (btrfs_extent_map_end(prev) != next->start) return false; /* @@ -242,7 +242,7 @@ static bool mergeable_maps(const struct extent_map *prev, const struct extent_ma return false; if (next->disk_bytenr < EXTENT_MAP_LAST_BYTE - 1) - return extent_map_block_start(next) == extent_map_block_end(prev); + return btrfs_extent_map_block_start(next) == extent_map_block_end(prev); /* HOLES and INLINE extents. */ return next->disk_bytenr == prev->disk_bytenr; @@ -270,8 +270,8 @@ static void merge_ondisk_extents(const struct extent_map *prev, const struct ext u64 new_offset; /* @prev and @next should not be compressed. */ - ASSERT(!extent_map_is_compressed(prev)); - ASSERT(!extent_map_is_compressed(next)); + ASSERT(!btrfs_extent_map_is_compressed(prev)); + ASSERT(!btrfs_extent_map_is_compressed(next)); /* * There are two different cases where @prev and @next can be merged. 
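mergeable_maps() above requires contiguity on both axes for regular extents; a distilled version of that test, under a hypothetical name:

    /* Two regular (non-compressed) maps may merge only if the logical
     * ranges abut and the on-disk ranges abut as well: */
    static bool maps_abut(const struct extent_map *prev,
                          const struct extent_map *next)
    {
            return btrfs_extent_map_end(prev) == next->start &&
                   btrfs_extent_map_block_start(next) == extent_map_block_end(prev);
    }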
@@ -327,9 +327,9 @@ static void validate_extent_map(struct btrfs_fs_info *fs_info, struct extent_map if (em->offset + em->len > em->ram_bytes) dump_extent_map(fs_info, "ram_bytes too small", em); if (em->offset + em->len > em->disk_num_bytes && - !extent_map_is_compressed(em)) + !btrfs_extent_map_is_compressed(em)) dump_extent_map(fs_info, "disk_num_bytes too small", em); - if (!extent_map_is_compressed(em) && + if (!btrfs_extent_map_is_compressed(em) && em->ram_bytes != em->disk_num_bytes) dump_extent_map(fs_info, "ram_bytes mismatch with disk_num_bytes for non-compressed em", @@ -361,8 +361,8 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) if (em->start != 0) { rb = rb_prev(&em->rb_node); - if (rb) - merge = rb_entry(rb, struct extent_map, rb_node); + merge = rb_entry_safe(rb, struct extent_map, rb_node); + if (rb && can_merge_extent_map(merge) && mergeable_maps(merge, em)) { em->start = merge->start; em->len += merge->len; @@ -374,13 +374,13 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) validate_extent_map(fs_info, em); remove_em(inode, merge); - free_extent_map(merge); + btrfs_free_extent_map(merge); } } rb = rb_next(&em->rb_node); - if (rb) - merge = rb_entry(rb, struct extent_map, rb_node); + merge = rb_entry_safe(rb, struct extent_map, rb_node); + if (rb && can_merge_extent_map(merge) && mergeable_maps(em, merge)) { em->len += merge->len; if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) @@ -389,7 +389,7 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) em->generation = max(em->generation, merge->generation); em->flags |= EXTENT_FLAG_MERGED; remove_em(inode, merge); - free_extent_map(merge); + btrfs_free_extent_map(merge); } } @@ -409,7 +409,7 @@ static void try_merge_map(struct btrfs_inode *inode, struct extent_map *em) * -ENOENT when the extent is not found in the tree * -EUCLEAN if the found extent does not match the expected start */ -int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) +int btrfs_unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct extent_map_tree *tree = &inode->extent_tree; @@ -417,7 +417,7 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) struct extent_map *em; write_lock(&tree->lock); - em = lookup_extent_mapping(tree, start, len); + em = btrfs_lookup_extent_mapping(tree, start, len); if (WARN_ON(!em)) { btrfs_warn(fs_info, @@ -444,17 +444,17 @@ int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen) out: write_unlock(&tree->lock); - free_extent_map(em); + btrfs_free_extent_map(em); return ret; } -void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em) +void btrfs_clear_em_logging(struct btrfs_inode *inode, struct extent_map *em) { lockdep_assert_held_write(&inode->extent_tree.lock); em->flags &= ~EXTENT_FLAG_LOGGING; - if (extent_map_in_tree(em)) + if (btrfs_extent_map_in_tree(em)) try_merge_map(inode, em); } @@ -508,16 +508,15 @@ static int add_extent_mapping(struct btrfs_inode *inode, return 0; } -static struct extent_map * -__lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len, int strict) +static struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len, int strict) { struct extent_map *em; struct rb_node *rb_node; struct rb_node *prev_or_next = NULL; u64 end = range_end(start, len); - rb_node = __tree_search(&tree->root, start, 
&prev_or_next); + rb_node = tree_search(&tree->root, start, &prev_or_next); if (!rb_node) { if (prev_or_next) rb_node = prev_or_next; @@ -527,7 +526,7 @@ __lookup_extent_mapping(struct extent_map_tree *tree, em = rb_entry(rb_node, struct extent_map, rb_node); - if (strict && !(end > em->start && start < extent_map_end(em))) + if (strict && !(end > em->start && start < btrfs_extent_map_end(em))) return NULL; refcount_inc(&em->refs); @@ -546,10 +545,10 @@ __lookup_extent_mapping(struct extent_map_tree *tree, * intersect, so check the object returned carefully to make sure that no * additional lookups are needed. */ -struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len) +struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) { - return __lookup_extent_mapping(tree, start, len, 1); + return lookup_extent_mapping(tree, start, len, 1); } /* @@ -564,10 +563,10 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, * * If one can't be found, any nearby extent may be returned */ -struct extent_map *search_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len) +struct extent_map *btrfs_search_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len) { - return __lookup_extent_mapping(tree, start, len, 0); + return lookup_extent_mapping(tree, start, len, 0); } /* @@ -579,7 +578,7 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree, * Remove @em from the extent tree of @inode. No reference counts are dropped, * and no checks are done to see if the range is in use. */ -void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em) +void btrfs_remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em) { struct extent_map_tree *tree = &inode->extent_tree; @@ -605,7 +604,7 @@ static void replace_extent_mapping(struct btrfs_inode *inode, validate_extent_map(fs_info, new); WARN_ON(cur->flags & EXTENT_FLAG_PINNED); - ASSERT(extent_map_in_tree(cur)); + ASSERT(btrfs_extent_map_in_tree(cur)); if (!(cur->flags & EXTENT_FLAG_LOGGING)) list_del_init(&cur->list); rb_replace_node(&cur->rb_node, &new->rb_node, &tree->root); @@ -651,7 +650,7 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode, u64 end; u64 start_diff; - if (map_start < em->start || map_start >= extent_map_end(em)) + if (map_start < em->start || map_start >= btrfs_extent_map_end(em)) return -EINVAL; if (existing->start > map_start) { @@ -662,10 +661,10 @@ static noinline int merge_extent_mapping(struct btrfs_inode *inode, next = next_extent_map(prev); } - start = prev ? extent_map_end(prev) : em->start; + start = prev ? btrfs_extent_map_end(prev) : em->start; start = max_t(u64, start, em->start); - end = next ? next->start : extent_map_end(em); - end = min_t(u64, end, extent_map_end(em)); + end = next ? next->start : btrfs_extent_map_end(em); + end = min_t(u64, end, btrfs_extent_map_end(em)); start_diff = start - em->start; em->start = start; em->len = end - start; @@ -716,7 +715,7 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode, if (ret == -EEXIST) { struct extent_map *existing; - existing = search_extent_mapping(&inode->extent_tree, start, len); + existing = btrfs_search_extent_mapping(&inode->extent_tree, start, len); trace_btrfs_handle_em_exist(fs_info, existing, em, start, len); @@ -725,8 +724,8 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode, * extent causing the -EEXIST. 
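The strict/non-strict split above is just the half-open interval intersection test. For example, a strict lookup of [8K, 12K) whose nearest neighbor covers [0K, 4K) fails the check (8K < 4K is false) and returns NULL, while a non-strict search hands back that neighbor:

    if (strict && !(end > em->start && start < btrfs_extent_map_end(em)))
            return NULL;  /* nearest map found, but ranges do not intersect */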
*/ if (start >= existing->start && - start < extent_map_end(existing)) { - free_extent_map(em); + start < btrfs_extent_map_end(existing)) { + btrfs_free_extent_map(em); *em_in = existing; ret = 0; } else { @@ -739,14 +738,14 @@ int btrfs_add_extent_mapping(struct btrfs_inode *inode, */ ret = merge_extent_mapping(inode, existing, em, start); if (WARN_ON(ret)) { - free_extent_map(em); + btrfs_free_extent_map(em); *em_in = NULL; btrfs_warn(fs_info, "extent map merge error existing [%llu, %llu) with em [%llu, %llu) start %llu", - existing->start, extent_map_end(existing), + existing->start, btrfs_extent_map_end(existing), orig_start, orig_start + orig_len, start); } - free_extent_map(existing); + btrfs_free_extent_map(existing); } } @@ -772,8 +771,8 @@ static void drop_all_extent_maps_fast(struct btrfs_inode *inode) em = rb_entry(node, struct extent_map, rb_node); em->flags &= ~(EXTENT_FLAG_PINNED | EXTENT_FLAG_LOGGING); - remove_extent_mapping(inode, em); - free_extent_map(em); + btrfs_remove_extent_mapping(inode, em); + btrfs_free_extent_map(em); if (cond_resched_rwlock_write(&tree->lock)) node = rb_first(&tree->root); @@ -826,15 +825,15 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, * range ends after our range (and they might be the same extent map), * because we need to split those two extent maps at the boundaries. */ - split = alloc_extent_map(); - split2 = alloc_extent_map(); + split = btrfs_alloc_extent_map(); + split2 = btrfs_alloc_extent_map(); write_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); + em = btrfs_lookup_extent_mapping(em_tree, start, len); while (em) { /* extent_map_end() returns exclusive value (last byte + 1). */ - const u64 em_end = extent_map_end(em); + const u64 em_end = btrfs_extent_map_end(em); struct extent_map *next_em = NULL; u64 gen; unsigned long flags; @@ -898,7 +897,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, split->generation = gen; split->flags = flags; replace_extent_mapping(inode, em, split, modified); - free_extent_map(split); + btrfs_free_extent_map(split); split = split2; split2 = NULL; } @@ -925,7 +924,7 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, split->ram_bytes = split->len; } - if (extent_map_in_tree(em)) { + if (btrfs_extent_map_in_tree(em)) { replace_extent_mapping(inode, em, split, modified); } else { int ret; @@ -936,11 +935,11 @@ void btrfs_drop_extent_map_range(struct btrfs_inode *inode, u64 start, u64 end, if (WARN_ON(ret != 0) && modified) btrfs_set_inode_full_sync(inode); } - free_extent_map(split); + btrfs_free_extent_map(split); split = NULL; } remove_em: - if (extent_map_in_tree(em)) { + if (btrfs_extent_map_in_tree(em)) { /* * If the extent map is still in the tree it means that * either of the following is true: @@ -965,25 +964,25 @@ remove_em: ASSERT(!split); btrfs_set_inode_full_sync(inode); } - remove_extent_mapping(inode, em); + btrfs_remove_extent_mapping(inode, em); } /* * Once for the tree reference (we replaced or removed the * extent map from the tree). */ - free_extent_map(em); + btrfs_free_extent_map(em); next: /* Once for us (for our lookup reference). 
*/ - free_extent_map(em); + btrfs_free_extent_map(em); em = next_em; } write_unlock(&em_tree->lock); - free_extent_map(split); - free_extent_map(split2); + btrfs_free_extent_map(split); + btrfs_free_extent_map(split2); } /* @@ -1007,7 +1006,7 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode, struct extent_map_tree *tree = &inode->extent_tree; int ret; - ASSERT(!extent_map_in_tree(new_em)); + ASSERT(!btrfs_extent_map_in_tree(new_em)); /* * The caller has locked an appropriate file range in the inode's io @@ -1033,8 +1032,8 @@ int btrfs_replace_extent_map_range(struct btrfs_inode *inode, * * This function is used when an ordered_extent needs to be split. */ -int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, - u64 new_logical) +int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, + u64 new_logical) { struct extent_map_tree *em_tree = &inode->extent_tree; struct extent_map *em; @@ -1046,25 +1045,25 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, ASSERT(pre != 0); ASSERT(pre < len); - split_pre = alloc_extent_map(); + split_pre = btrfs_alloc_extent_map(); if (!split_pre) return -ENOMEM; - split_mid = alloc_extent_map(); + split_mid = btrfs_alloc_extent_map(); if (!split_mid) { ret = -ENOMEM; goto out_free_pre; } - lock_extent(&inode->io_tree, start, start + len - 1, NULL); + btrfs_lock_extent(&inode->io_tree, start, start + len - 1, NULL); write_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); + em = btrfs_lookup_extent_mapping(em_tree, start, len); if (!em) { ret = -EIO; goto out_unlock; } ASSERT(em->len == len); - ASSERT(!extent_map_is_compressed(em)); + ASSERT(!btrfs_extent_map_is_compressed(em)); ASSERT(em->disk_bytenr < EXTENT_MAP_LAST_BYTE); ASSERT(em->flags & EXTENT_FLAG_PINNED); ASSERT(!(em->flags & EXTENT_FLAG_LOGGING)); @@ -1093,7 +1092,7 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, /* Insert the middle extent_map. */ split_mid->start = em->start + pre; split_mid->len = em->len - pre; - split_mid->disk_bytenr = extent_map_block_start(em) + pre; + split_mid->disk_bytenr = btrfs_extent_map_block_start(em) + pre; split_mid->disk_num_bytes = split_mid->len; split_mid->offset = 0; split_mid->ram_bytes = split_mid->len; @@ -1102,16 +1101,16 @@ int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, add_extent_mapping(inode, split_mid, 1); /* Once for us */ - free_extent_map(em); + btrfs_free_extent_map(em); /* Once for the tree */ - free_extent_map(em); + btrfs_free_extent_map(em); out_unlock: write_unlock(&em_tree->lock); - unlock_extent(&inode->io_tree, start, start + len - 1, NULL); - free_extent_map(split_mid); + btrfs_unlock_extent(&inode->io_tree, start, start + len - 1, NULL); + btrfs_free_extent_map(split_mid); out_free_pre: - free_extent_map(split_pre); + btrfs_free_extent_map(split_pre); return ret; } @@ -1128,6 +1127,8 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c long nr_dropped = 0; struct rb_node *node; + lockdep_assert_held_write(&tree->lock); + /* * Take the mmap lock so that we serialize with the inode logging phase * of fsync because we may need to set the full sync flag on the inode, @@ -1139,28 +1140,12 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c * to find new extents, which may not be there yet because ordered * extents haven't completed yet. * - * We also do a try lock because otherwise we could deadlock. 
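The paired frees above follow the extent map reference discipline: a lookup takes one reference and the tree itself holds another, so removing a map from the tree obliges the caller to drop both:

    em = btrfs_lookup_extent_mapping(em_tree, start, len); /* +1 for us */
    /* ... */
    btrfs_remove_extent_mapping(inode, em); /* tree stops pointing at it */
    btrfs_free_extent_map(em);              /* drop the tree's reference */
    btrfs_free_extent_map(em);              /* drop our lookup reference */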
This is - * because the shrinker for this filesystem may be invoked while we are - * in a path that is holding the mmap lock in write mode. For example in - * a reflink operation while COWing an extent buffer, when allocating - * pages for a new extent buffer and under memory pressure, the shrinker - * may be invoked, and therefore we would deadlock by attempting to read - * lock the mmap lock while we are holding already a write lock on it. + * We also do a try lock because we don't want to block for too long and + * we are holding the extent map tree's lock in write mode. */ if (!down_read_trylock(&inode->i_mmap_lock)) return 0; - /* - * We want to be fast so if the lock is busy we don't want to spend time - * waiting for it - either some task is about to do IO for the inode or - * we may have another task shrinking extent maps, here in this code, so - * skip this inode. - */ - if (!write_trylock(&tree->lock)) { - up_read(&inode->i_mmap_lock); - return 0; - } - node = rb_first(&tree->root); while (node) { struct rb_node *next = rb_next(node); @@ -1182,10 +1167,10 @@ static long btrfs_scan_inode(struct btrfs_inode *inode, struct btrfs_em_shrink_c if (!list_empty(&em->list) && em->generation >= cur_fs_gen) btrfs_set_inode_full_sync(inode); - remove_extent_mapping(inode, em); + btrfs_remove_extent_mapping(inode, em); trace_btrfs_extent_map_shrinker_remove_em(inode, em); /* Drop the reference for the tree. */ - free_extent_map(em); + btrfs_free_extent_map(em); nr_dropped++; next: if (ctx->scanned >= ctx->nr_to_scan) @@ -1201,12 +1186,61 @@ next: break; node = next; } - write_unlock(&tree->lock); up_read(&inode->i_mmap_lock); return nr_dropped; } +static struct btrfs_inode *find_first_inode_to_shrink(struct btrfs_root *root, + u64 min_ino) +{ + struct btrfs_inode *inode; + unsigned long from = min_ino; + + xa_lock(&root->inodes); + while (true) { + struct extent_map_tree *tree; + + inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT); + if (!inode) + break; + + tree = &inode->extent_tree; + + /* + * We want to be fast so if the lock is busy we don't want to + * spend time waiting for it (some task is about to do IO for + * the inode). + */ + if (!write_trylock(&tree->lock)) + goto next; + + /* + * Skip inode if it doesn't have loaded extent maps, so we avoid + * getting a reference and doing an iput later. This includes + * cases like files that were opened for things like stat(2), or + * files with all extent maps previously released through the + * release folio callback (btrfs_release_folio()) or released in + * a previous run, or directories which never have extent maps. 
+ */ + if (RB_EMPTY_ROOT(&tree->root)) { + write_unlock(&tree->lock); + goto next; + } + + if (igrab(&inode->vfs_inode)) + break; + + write_unlock(&tree->lock); +next: + from = btrfs_ino(inode) + 1; + cond_resched_lock(&root->inodes.xa_lock); + } + xa_unlock(&root->inodes); + + return inode; +} + static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx *ctx) { struct btrfs_fs_info *fs_info = root->fs_info; @@ -1214,21 +1248,21 @@ static long btrfs_scan_root(struct btrfs_root *root, struct btrfs_em_shrink_ctx long nr_dropped = 0; u64 min_ino = fs_info->em_shrinker_last_ino + 1; - inode = btrfs_find_first_inode(root, min_ino); + inode = find_first_inode_to_shrink(root, min_ino); while (inode) { nr_dropped += btrfs_scan_inode(inode, ctx); + write_unlock(&inode->extent_tree.lock); min_ino = btrfs_ino(inode) + 1; fs_info->em_shrinker_last_ino = btrfs_ino(inode); - btrfs_add_delayed_iput(inode); + iput(&inode->vfs_inode); - if (ctx->scanned >= ctx->nr_to_scan || - btrfs_fs_closing(inode->root->fs_info)) + if (ctx->scanned >= ctx->nr_to_scan || btrfs_fs_closing(fs_info)) break; cond_resched(); - inode = btrfs_find_first_inode(root, min_ino); + inode = find_first_inode_to_shrink(root, min_ino); } if (inode) { diff --git a/fs/btrfs/extent_map.h b/fs/btrfs/extent_map.h index cd123b266b64..d4b81ee4d97b 100644 --- a/fs/btrfs/extent_map.h +++ b/fs/btrfs/extent_map.h @@ -108,8 +108,8 @@ struct extent_map_tree { struct btrfs_inode; -static inline void extent_map_set_compression(struct extent_map *em, - enum btrfs_compression_type type) +static inline void btrfs_extent_map_set_compression(struct extent_map *em, + enum btrfs_compression_type type) { if (type == BTRFS_COMPRESS_ZLIB) em->flags |= EXTENT_FLAG_COMPRESS_ZLIB; @@ -119,7 +119,8 @@ static inline void extent_map_set_compression(struct extent_map *em, em->flags |= EXTENT_FLAG_COMPRESS_ZSTD; } -static inline enum btrfs_compression_type extent_map_compression(const struct extent_map *em) +static inline enum btrfs_compression_type btrfs_extent_map_compression( + const struct extent_map *em) { if (em->flags & EXTENT_FLAG_COMPRESS_ZLIB) return BTRFS_COMPRESS_ZLIB; @@ -137,50 +138,50 @@ static inline enum btrfs_compression_type extent_map_compression(const struct ex * More efficient way to determine if extent is compressed, instead of using * 'extent_map_compression() != BTRFS_COMPRESS_NONE'. 
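find_first_inode_to_shrink() above walks root->inodes under the xarray lock, preferring to skip a busy inode over blocking, and yields the lock between steps. The pattern, distilled (the empty-tree skip is omitted):

    xa_lock(&root->inodes);
    while ((inode = xa_find(&root->inodes, &from, ULONG_MAX, XA_PRESENT))) {
            if (write_trylock(&inode->extent_tree.lock)) {
                    if (igrab(&inode->vfs_inode))
                            break;  /* scan this one, tree lock held */
                    write_unlock(&inode->extent_tree.lock);
            }
            from = btrfs_ino(inode) + 1;  /* skip busy or dying inodes */
            cond_resched_lock(&root->inodes.xa_lock);
    }
    xa_unlock(&root->inodes);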
*/ -static inline bool extent_map_is_compressed(const struct extent_map *em) +static inline bool btrfs_extent_map_is_compressed(const struct extent_map *em) { return (em->flags & (EXTENT_FLAG_COMPRESS_ZLIB | EXTENT_FLAG_COMPRESS_LZO | EXTENT_FLAG_COMPRESS_ZSTD)) != 0; } -static inline int extent_map_in_tree(const struct extent_map *em) +static inline int btrfs_extent_map_in_tree(const struct extent_map *em) { return !RB_EMPTY_NODE(&em->rb_node); } -static inline u64 extent_map_block_start(const struct extent_map *em) +static inline u64 btrfs_extent_map_block_start(const struct extent_map *em) { if (em->disk_bytenr < EXTENT_MAP_LAST_BYTE) { - if (extent_map_is_compressed(em)) + if (btrfs_extent_map_is_compressed(em)) return em->disk_bytenr; return em->disk_bytenr + em->offset; } return em->disk_bytenr; } -static inline u64 extent_map_end(const struct extent_map *em) +static inline u64 btrfs_extent_map_end(const struct extent_map *em) { if (em->start + em->len < em->start) return (u64)-1; return em->start + em->len; } -void extent_map_tree_init(struct extent_map_tree *tree); -struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len); -void remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em); -int split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, - u64 new_logical); - -struct extent_map *alloc_extent_map(void); -void free_extent_map(struct extent_map *em); -int __init extent_map_init(void); -void __cold extent_map_exit(void); -int unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen); -void clear_em_logging(struct btrfs_inode *inode, struct extent_map *em); -struct extent_map *search_extent_mapping(struct extent_map_tree *tree, - u64 start, u64 len); +void btrfs_extent_map_tree_init(struct extent_map_tree *tree); +struct extent_map *btrfs_lookup_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); +void btrfs_remove_extent_mapping(struct btrfs_inode *inode, struct extent_map *em); +int btrfs_split_extent_map(struct btrfs_inode *inode, u64 start, u64 len, u64 pre, + u64 new_logical); + +struct extent_map *btrfs_alloc_extent_map(void); +void btrfs_free_extent_map(struct extent_map *em); +int __init btrfs_extent_map_init(void); +void __cold btrfs_extent_map_exit(void); +int btrfs_unpin_extent_cache(struct btrfs_inode *inode, u64 start, u64 len, u64 gen); +void btrfs_clear_em_logging(struct btrfs_inode *inode, struct extent_map *em); +struct extent_map *btrfs_search_extent_mapping(struct extent_map_tree *tree, + u64 start, u64 len); int btrfs_add_extent_mapping(struct btrfs_inode *inode, struct extent_map **em_in, u64 start, u64 len); void btrfs_drop_extent_map_range(struct btrfs_inode *inode, diff --git a/fs/btrfs/fiemap.c b/fs/btrfs/fiemap.c index b80c07ad8c5e..43bf0979fd53 100644 --- a/fs/btrfs/fiemap.c +++ b/fs/btrfs/fiemap.c @@ -634,7 +634,7 @@ static int extent_fiemap(struct btrfs_inode *inode, const u64 ino = btrfs_ino(inode); struct extent_state *cached_state = NULL; struct extent_state *delalloc_cached_state = NULL; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct fiemap_cache cache = { 0 }; struct btrfs_backref_share_check_ctx *backref_ctx; u64 last_extent_end = 0; @@ -661,7 +661,7 @@ restart: range_end = round_up(start + len, sectorsize); prev_extent_end = range_start; - lock_extent(&inode->io_tree, range_start, range_end, &cached_state); + btrfs_lock_extent(&inode->io_tree, range_start, range_end, &cached_state); ret = fiemap_find_last_extent_offset(inode, 
path, &last_extent_end); if (ret < 0) @@ -841,7 +841,7 @@ check_eof_delalloc: } out_unlock: - unlock_extent(&inode->io_tree, range_start, range_end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, range_start, range_end, &cached_state); if (ret == BTRFS_FIEMAP_FLUSH_CACHE) { btrfs_release_path(path); @@ -871,10 +871,9 @@ out_unlock: ret = emit_last_fiemap_cache(fieinfo, &cache); out: - free_extent_state(delalloc_cached_state); + btrfs_free_extent_state(delalloc_cached_state); kfree(cache.entries); btrfs_free_backref_share_ctx(backref_ctx); - btrfs_free_path(path); return ret; } diff --git a/fs/btrfs/file-item.c b/fs/btrfs/file-item.c index d04a3b47b1fb..54d523d4f421 100644 --- a/fs/btrfs/file-item.c +++ b/fs/btrfs/file-item.c @@ -46,7 +46,7 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size) { u64 start, end, i_size; - int ret; + bool found; spin_lock(&inode->lock); i_size = new_i_size ?: i_size_read(&inode->vfs_inode); @@ -55,9 +55,9 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz goto out_unlock; } - ret = find_contiguous_extent_bit(inode->file_extent_tree, 0, &start, - &end, EXTENT_DIRTY); - if (!ret && start == 0) + found = btrfs_find_contiguous_extent_bit(inode->file_extent_tree, 0, &start, + &end, EXTENT_DIRTY); + if (found && start == 0) i_size = min(i_size, end + 1); else i_size = 0; @@ -91,8 +91,8 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize)); - return set_extent_bit(inode->file_extent_tree, start, start + len - 1, - EXTENT_DIRTY, NULL); + return btrfs_set_extent_bit(inode->file_extent_tree, start, start + len - 1, + EXTENT_DIRTY, NULL); } /* @@ -121,8 +121,8 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start, ASSERT(IS_ALIGNED(start + len, inode->root->fs_info->sectorsize) || len == (u64)-1); - return clear_extent_bit(inode->file_extent_tree, start, - start + len - 1, EXTENT_DIRTY, NULL); + return btrfs_clear_extent_bit(inode->file_extent_tree, start, + start + len - 1, EXTENT_DIRTY, NULL); } static size_t bytes_to_csum_size(const struct btrfs_fs_info *fs_info, u32 bytes) @@ -163,20 +163,21 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, int ret = 0; struct btrfs_file_extent_item *item; struct btrfs_key file_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; path = btrfs_alloc_path(); if (!path) return -ENOMEM; + file_key.objectid = objectid; - file_key.offset = pos; file_key.type = BTRFS_EXTENT_DATA_KEY; + file_key.offset = pos; ret = btrfs_insert_empty_item(trans, root, path, &file_key, sizeof(*item)); if (ret < 0) - goto out; + return ret; leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -190,8 +191,7 @@ int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, btrfs_set_file_extent_compression(leaf, item, 0); btrfs_set_file_extent_encryption(leaf, item, 0); btrfs_set_file_extent_other_encoding(leaf, item, 0); -out: - btrfs_free_path(path); + return ret; } @@ -212,8 +212,8 @@ btrfs_lookup_csum(struct btrfs_trans_handle *trans, int csums_in_item; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - file_key.offset = bytenr; file_key.type = BTRFS_EXTENT_CSUM_KEY; + file_key.offset = bytenr; ret = btrfs_search_slot(trans, root, &file_key, path, 0, cow); if (ret < 0) goto fail; @@ -259,8 +259,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans, int cow = 
mod != 0; file_key.objectid = objectid; - file_key.offset = offset; file_key.type = BTRFS_EXTENT_DATA_KEY; + file_key.offset = offset; return btrfs_search_slot(trans, root, &file_key, path, ins_len, cow); } @@ -336,23 +336,23 @@ out: * * Return: BLK_STS_RESOURCE if allocating memory fails, BLK_STS_OK otherwise. */ -blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) +int btrfs_lookup_bio_sums(struct btrfs_bio *bbio) { struct btrfs_inode *inode = bbio->inode; struct btrfs_fs_info *fs_info = inode->root->fs_info; struct bio *bio = &bbio->bio; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); const u32 sectorsize = fs_info->sectorsize; const u32 csum_size = fs_info->csum_size; u32 orig_len = bio->bi_iter.bi_size; u64 orig_disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT; const unsigned int nblocks = orig_len >> fs_info->sectorsize_bits; - blk_status_t ret = BLK_STS_OK; + int ret = 0; u32 bio_offset = 0; if ((inode->flags & BTRFS_INODE_NODATASUM) || test_bit(BTRFS_FS_STATE_NO_DATA_CSUMS, &fs_info->fs_state)) - return BLK_STS_OK; + return 0; /* * This function is only called for read bio. @@ -369,14 +369,12 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) ASSERT(bio_op(bio) == REQ_OP_READ); path = btrfs_alloc_path(); if (!path) - return BLK_STS_RESOURCE; + return -ENOMEM; if (nblocks * csum_size > BTRFS_BIO_INLINE_CSUM_SIZE) { bbio->csum = kmalloc_array(nblocks, csum_size, GFP_NOFS); - if (!bbio->csum) { - btrfs_free_path(path); - return BLK_STS_RESOURCE; - } + if (!bbio->csum) + return -ENOMEM; } else { bbio->csum = bbio->csum_inline; } @@ -408,7 +406,7 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) count = search_csum_tree(fs_info, path, cur_disk_bytenr, orig_len - bio_offset, csum_dst); if (count < 0) { - ret = errno_to_blk_status(count); + ret = count; if (bbio->csum != bbio->csum_inline) kfree(bbio->csum); bbio->csum = NULL; @@ -432,9 +430,9 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) if (btrfs_root_id(inode->root) == BTRFS_DATA_RELOC_TREE_OBJECTID) { u64 file_offset = bbio->file_offset + bio_offset; - set_extent_bit(&inode->io_tree, file_offset, - file_offset + sectorsize - 1, - EXTENT_NODATASUM, NULL); + btrfs_set_extent_bit(&inode->io_tree, file_offset, + file_offset + sectorsize - 1, + EXTENT_NODATASUM, NULL); } else { btrfs_warn_rl(fs_info, "csum hole found for disk bytenr range [%llu, %llu)", @@ -444,7 +442,6 @@ blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio) bio_offset += count * sectorsize; } - btrfs_free_path(path); return ret; } @@ -484,8 +481,8 @@ int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end, path->nowait = nowait; key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key.offset = start; key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = start; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -738,7 +735,7 @@ fail: /* * Calculate checksums of the data contained inside a bio. */ -blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) +int btrfs_csum_one_bio(struct btrfs_bio *bbio) { struct btrfs_ordered_extent *ordered = bbio->ordered; struct btrfs_inode *inode = bbio->inode; @@ -760,7 +757,7 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) memalloc_nofs_restore(nofs_flag); if (!sums) - return BLK_STS_RESOURCE; + return -ENOMEM; sums->len = bio->bi_iter.bi_size; INIT_LIST_HEAD(&sums->list); @@ -797,11 +794,11 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio) * record the updated logical address on Zone Append completion. 
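The BTRFS_PATH_AUTO_FREE() conversions above are why the btrfs_free_path() calls and several out: labels disappear in these hunks. Assuming it is built on the kernel's scope-based cleanup helpers (linux/cleanup.h), the definition would be roughly:

    /* Assumed definition; the path is freed when it goes out of scope: */
    DEFINE_FREE(btrfs_free_path, struct btrfs_path *, btrfs_free_path(_T))
    #define BTRFS_PATH_AUTO_FREE(path_name) \
            struct btrfs_path *path_name __free(btrfs_free_path) = NULL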
* Allocate just the structure with an empty sums array here for that case. */ -blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio) +int btrfs_alloc_dummy_sum(struct btrfs_bio *bbio) { bbio->sums = kmalloc(sizeof(*bbio->sums), GFP_NOFS); if (!bbio->sums) - return BLK_STS_RESOURCE; + return -ENOMEM; bbio->sums->len = bbio->bio.bi_iter.bi_size; bbio->sums->logical = bbio->bio.bi_iter.bi_sector << SECTOR_SHIFT; btrfs_add_ordered_sum(bbio->ordered, bbio->sums); @@ -874,7 +871,7 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len) { struct btrfs_fs_info *fs_info = trans->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; u64 end_byte = bytenr + len; u64 csum_end; @@ -892,8 +889,8 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, while (1) { key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - key.offset = end_byte - 1; key.type = BTRFS_EXTENT_CSUM_KEY; + key.offset = end_byte - 1; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { @@ -1010,7 +1007,6 @@ int btrfs_del_csums(struct btrfs_trans_handle *trans, } btrfs_release_path(path); } - btrfs_free_path(path); return ret; } @@ -1052,7 +1048,7 @@ int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_key file_key; struct btrfs_key found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_csum_item *item; struct btrfs_csum_item *item_end; struct extent_buffer *leaf = NULL; @@ -1074,8 +1070,8 @@ again: found_next = 0; bytenr = sums->logical + total_bytes; file_key.objectid = BTRFS_EXTENT_CSUM_OBJECTID; - file_key.offset = bytenr; file_key.type = BTRFS_EXTENT_CSUM_KEY; + file_key.offset = bytenr; item = btrfs_lookup_csum(trans, root, path, bytenr, 1); if (!IS_ERR(item)) { @@ -1263,7 +1259,6 @@ found: goto again; } out: - btrfs_free_path(path); return ret; } @@ -1301,7 +1296,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, em->disk_num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi); em->offset = btrfs_file_extent_offset(leaf, fi); if (compress_type != BTRFS_COMPRESS_NONE) { - extent_map_set_compression(em, compress_type); + btrfs_extent_map_set_compression(em, compress_type); } else { /* * Older kernels can create regular non-hole data @@ -1321,7 +1316,7 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode, em->start = 0; em->len = fs_info->sectorsize; em->offset = 0; - extent_map_set_compression(em, compress_type); + btrfs_extent_map_set_compression(em, compress_type); } else { btrfs_err(fs_info, "unknown file extent item type %d, inode %llu, offset %llu, " diff --git a/fs/btrfs/file-item.h b/fs/btrfs/file-item.h index 0e13661a71f3..63216c43676d 100644 --- a/fs/btrfs/file-item.h +++ b/fs/btrfs/file-item.h @@ -3,8 +3,10 @@ #ifndef BTRFS_FILE_ITEM_H #define BTRFS_FILE_ITEM_H +#include <linux/blk_types.h> #include <linux/list.h> #include <uapi/linux/btrfs_tree.h> +#include "ctree.h" #include "accessors.h" struct extent_map; @@ -51,7 +53,7 @@ static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize) int btrfs_del_csums(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 bytenr, u64 len); -blk_status_t btrfs_lookup_bio_sums(struct btrfs_bio *bbio); +int btrfs_lookup_bio_sums(struct btrfs_bio *bbio); int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans, struct btrfs_root *root, u64 objectid, u64 pos, u64 num_bytes); @@ -62,8 +64,8 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle 
*trans, int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans, struct btrfs_root *root, struct btrfs_ordered_sum *sums); -blk_status_t btrfs_csum_one_bio(struct btrfs_bio *bbio); -blk_status_t btrfs_alloc_dummy_sum(struct btrfs_bio *bbio); +int btrfs_csum_one_bio(struct btrfs_bio *bbio); +int btrfs_alloc_dummy_sum(struct btrfs_bio *bbio); int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end, struct list_head *list, int search_commit, bool nowait); diff --git a/fs/btrfs/file.c b/fs/btrfs/file.c index ed3c0d6546c5..8ce6f45f45e0 100644 --- a/fs/btrfs/file.c +++ b/fs/btrfs/file.c @@ -98,9 +98,9 @@ int btrfs_dirty_folio(struct btrfs_inode *inode, struct folio *folio, loff_t pos * The pages may have already been dirty, clear out old accounting so * we can set things up properly */ - clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - cached); + btrfs_clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + cached); ret = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block, extra_bits, cached); @@ -508,20 +508,19 @@ out: return ret; } -static int extent_mergeable(struct extent_buffer *leaf, int slot, - u64 objectid, u64 bytenr, u64 orig_offset, - u64 *start, u64 *end) +static bool extent_mergeable(struct extent_buffer *leaf, int slot, u64 objectid, + u64 bytenr, u64 orig_offset, u64 *start, u64 *end) { struct btrfs_file_extent_item *fi; struct btrfs_key key; u64 extent_end; if (slot < 0 || slot >= btrfs_header_nritems(leaf)) - return 0; + return false; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY) - return 0; + return false; fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG || @@ -530,15 +529,15 @@ static int extent_mergeable(struct extent_buffer *leaf, int slot, btrfs_file_extent_compression(leaf, fi) || btrfs_file_extent_encryption(leaf, fi) || btrfs_file_extent_other_encoding(leaf, fi)) - return 0; + return false; extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi); if ((*start && *start != key.offset) || (*end && *end != extent_end)) - return 0; + return false; *start = key.offset; *end = extent_end; - return 1; + return true; } /* @@ -553,7 +552,7 @@ int btrfs_mark_extent_written(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; struct extent_buffer *leaf; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_file_extent_item *fi; struct btrfs_ref ref = { 0 }; struct btrfs_key key; @@ -791,7 +790,6 @@ again: } } out: - btrfs_free_path(path); return ret; } @@ -800,18 +798,18 @@ out: * On success return a locked folio and 0 */ static int prepare_uptodate_folio(struct inode *inode, struct folio *folio, u64 pos, - u64 len, bool force_uptodate) + u64 len) { u64 clamp_start = max_t(u64, pos, folio_pos(folio)); u64 clamp_end = min_t(u64, pos + len, folio_pos(folio) + folio_size(folio)); + const u32 blocksize = inode_to_fs_info(inode)->sectorsize; int ret = 0; if (folio_test_uptodate(folio)) return 0; - if (!force_uptodate && - IS_ALIGNED(clamp_start, PAGE_SIZE) && - IS_ALIGNED(clamp_end, PAGE_SIZE)) + if (IS_ALIGNED(clamp_start, blocksize) && + IS_ALIGNED(clamp_end, blocksize)) return 0; ret = btrfs_read_folio(NULL, folio); @@ -857,33 +855,27 @@ static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait) */ static noinline int 
prepare_one_folio(struct inode *inode, struct folio **folio_ret, loff_t pos, size_t write_bytes, - bool force_uptodate, bool nowait) + bool nowait) { unsigned long index = pos >> PAGE_SHIFT; gfp_t mask = get_prepare_gfp_flags(inode, nowait); - fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN); + fgf_t fgp_flags = (nowait ? FGP_WRITEBEGIN | FGP_NOWAIT : FGP_WRITEBEGIN) | + fgf_set_order(write_bytes); struct folio *folio; int ret = 0; again: folio = __filemap_get_folio(inode->i_mapping, index, fgp_flags, mask); - if (IS_ERR(folio)) { - if (nowait) - ret = -EAGAIN; - else - ret = PTR_ERR(folio); - return ret; - } - folio_wait_writeback(folio); - /* Only support page sized folio yet. */ - ASSERT(folio_order(folio) == 0); + if (IS_ERR(folio)) + return PTR_ERR(folio); + ret = set_folio_extent_mapped(folio); if (ret < 0) { folio_unlock(folio); folio_put(folio); return ret; } - ret = prepare_uptodate_folio(inode, folio, pos, write_bytes, force_uptodate); + ret = prepare_uptodate_folio(inode, folio, pos, write_bytes); if (ret) { /* The folio is already unlocked. */ folio_put(folio); @@ -924,14 +916,15 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, struct btrfs_ordered_extent *ordered; if (nowait) { - if (!try_lock_extent(&inode->io_tree, start_pos, last_pos, - cached_state)) { + if (!btrfs_try_lock_extent(&inode->io_tree, start_pos, + last_pos, cached_state)) { folio_unlock(folio); folio_put(folio); return -EAGAIN; } } else { - lock_extent(&inode->io_tree, start_pos, last_pos, cached_state); + btrfs_lock_extent(&inode->io_tree, start_pos, last_pos, + cached_state); } ordered = btrfs_lookup_ordered_range(inode, start_pos, @@ -939,8 +932,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct folio *folio, if (ordered && ordered->file_offset + ordered->num_bytes > start_pos && ordered->file_offset <= last_pos) { - unlock_extent(&inode->io_tree, start_pos, last_pos, - cached_state); + btrfs_unlock_extent(&inode->io_tree, start_pos, last_pos, + cached_state); folio_unlock(folio); folio_put(folio); btrfs_start_ordered_extent(ordered); @@ -1014,14 +1007,13 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos, btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, &cached_state); } - ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes, - NULL, nowait); + ret = can_nocow_extent(inode, lockstart, &num_bytes, NULL, nowait); if (ret <= 0) btrfs_drew_write_unlock(&root->snapshot_lock); else *write_bytes = min_t(size_t, *write_bytes , num_bytes - pos + lockstart); - unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); return ret; } @@ -1078,234 +1070,306 @@ int btrfs_write_check(struct kiocb *iocb, size_t count) return 0; } -ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *i) +static void release_space(struct btrfs_inode *inode, struct extent_changeset *data_reserved, + u64 start, u64 len, bool only_release_metadata) { - struct file *file = iocb->ki_filp; - loff_t pos; - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct extent_changeset *data_reserved = NULL; - u64 release_bytes = 0; - u64 lockstart; - u64 lockend; - size_t num_written = 0; - ssize_t ret; - loff_t old_isize = i_size_read(inode); - unsigned int ilock_flags = 0; - const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); - unsigned int bdp_flags = (nowait ? 
BDP_ASYNC : 0); - bool only_release_metadata = false; + if (len == 0) + return; - if (nowait) - ilock_flags |= BTRFS_ILOCK_TRY; + if (only_release_metadata) { + btrfs_check_nocow_unlock(inode); + btrfs_delalloc_release_metadata(inode, len, true); + } else { + const struct btrfs_fs_info *fs_info = inode->root->fs_info; - ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); - if (ret < 0) - return ret; + btrfs_delalloc_release_space(inode, data_reserved, + round_down(start, fs_info->sectorsize), + len, true); + } +} - ret = generic_write_checks(iocb, i); - if (ret <= 0) - goto out; +/* + * Reserve data and metadata space for this buffered write range. + * + * Return >0 for the number of bytes reserved, which is always block aligned. + * Return <0 for error. + */ +static ssize_t reserve_space(struct btrfs_inode *inode, + struct extent_changeset **data_reserved, + u64 start, size_t *len, bool nowait, + bool *only_release_metadata) +{ + const struct btrfs_fs_info *fs_info = inode->root->fs_info; + const unsigned int block_offset = (start & (fs_info->sectorsize - 1)); + size_t reserve_bytes; + int ret; - ret = btrfs_write_check(iocb, ret); - if (ret < 0) - goto out; + ret = btrfs_check_data_free_space(inode, data_reserved, start, *len, nowait); + if (ret < 0) { + int can_nocow; - pos = iocb->ki_pos; - while (iov_iter_count(i) > 0) { - struct extent_state *cached_state = NULL; - size_t offset = offset_in_page(pos); - size_t sector_offset; - size_t write_bytes = min(iov_iter_count(i), PAGE_SIZE - offset); - size_t reserve_bytes; - size_t copied; - size_t dirty_sectors; - size_t num_sectors; - struct folio *folio = NULL; - int extents_locked; - bool force_page_uptodate = false; + if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) + return -EAGAIN; /* - * Fault pages before locking them in prepare_one_folio() - * to avoid recursive lock + * If we don't have to COW at the offset, reserve metadata only. + * write_bytes may get smaller than requested here. */ - if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) { - ret = -EFAULT; - break; - } + can_nocow = btrfs_check_nocow_lock(inode, start, len, nowait); + if (can_nocow < 0) + ret = can_nocow; + if (can_nocow > 0) + ret = 0; + if (ret) + return ret; + *only_release_metadata = true; + } - only_release_metadata = false; - sector_offset = pos & (fs_info->sectorsize - 1); + reserve_bytes = round_up(*len + block_offset, fs_info->sectorsize); + WARN_ON(reserve_bytes == 0); + ret = btrfs_delalloc_reserve_metadata(inode, reserve_bytes, + reserve_bytes, nowait); + if (ret) { + if (!*only_release_metadata) + btrfs_free_reserved_data_space(inode, *data_reserved, + start, *len); + else + btrfs_check_nocow_unlock(inode); - extent_changeset_release(data_reserved); - ret = btrfs_check_data_free_space(BTRFS_I(inode), - &data_reserved, pos, - write_bytes, nowait); - if (ret < 0) { - int can_nocow; + if (nowait && ret == -ENOSPC) + ret = -EAGAIN; + return ret; + } + return reserve_bytes; +} - if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) { - ret = -EAGAIN; - break; - } +/* Shrink the reserved data and metadata space from @reserved_len to @new_len. */ +static void shrink_reserved_space(struct btrfs_inode *inode, + struct extent_changeset *data_reserved, + u64 reserved_start, u64 reserved_len, + u64 new_len, bool only_release_metadata) +{ + const u64 diff = reserved_len - new_len; - /* - * If we don't have to COW at the offset, reserve - * metadata only. write_bytes may get smaller than - * requested here. 
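
The release_space()/reserve_space() pair above replaces the open-coded reservation logic in the buffered write path; as reserve_space()'s comment states, the returned reservation is always block aligned, covering whole filesystem blocks around the byte range being written. A minimal userspace sketch of that alignment arithmetic, with stand-in round_down()/round_up() macros mirroring the kernel helpers (valid only for power-of-two block sizes):

#include <stdint.h>
#include <stdio.h>

/* Stand-ins for the kernel's round helpers; 'a' must be a power of two. */
#define round_down(x, a) ((x) & ~((uint64_t)(a) - 1))
#define round_up(x, a)   (((x) + (a) - 1) & ~((uint64_t)(a) - 1))

int main(void)
{
	const uint64_t sectorsize = 4096;
	const uint64_t start = 6000;	/* unaligned write offset */
	const uint64_t len = 3000;	/* unaligned write length */

	/* Same math as reserve_space()/release_space(): whole blocks only. */
	uint64_t block_offset = start & (sectorsize - 1);
	uint64_t reserved_start = round_down(start, sectorsize);
	uint64_t reserved_len = round_up(len + block_offset, sectorsize);

	printf("write [%llu, %llu) -> reserve [%llu, %llu)\n",
	       (unsigned long long)start, (unsigned long long)(start + len),
	       (unsigned long long)reserved_start,
	       (unsigned long long)(reserved_start + reserved_len));
	return 0;
}

Here a 3000-byte write at offset 6000 reserves the two 4K blocks [4096, 12288) that fully contain it.
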
- */ - can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos, - &write_bytes, nowait); - if (can_nocow < 0) - ret = can_nocow; - if (can_nocow > 0) - ret = 0; - if (ret) - break; - only_release_metadata = true; - } + ASSERT(new_len <= reserved_len); + btrfs_delalloc_shrink_extents(inode, reserved_len, new_len); + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, diff, true); + else + btrfs_delalloc_release_space(inode, data_reserved, + reserved_start + new_len, diff, true); +} - reserve_bytes = round_up(write_bytes + sector_offset, - fs_info->sectorsize); - WARN_ON(reserve_bytes == 0); - ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode), - reserve_bytes, - reserve_bytes, nowait); - if (ret) { - if (!only_release_metadata) - btrfs_free_reserved_data_space(BTRFS_I(inode), - data_reserved, pos, - write_bytes); - else - btrfs_check_nocow_unlock(BTRFS_I(inode)); +/* Calculate the maximum amount of bytes we can write into one folio. */ +static size_t calc_write_bytes(const struct btrfs_inode *inode, + const struct iov_iter *iter, u64 start) +{ + const size_t max_folio_size = mapping_max_folio_size(inode->vfs_inode.i_mapping); - if (nowait && ret == -ENOSPC) - ret = -EAGAIN; - break; - } + return min(max_folio_size - (start & (max_folio_size - 1)), + iov_iter_count(iter)); +} + +/* + * Do the heavy-lifting work to copy one range into one folio of the page cache. + * + * Return > 0 in case we copied all bytes or just some of them. + * Return 0 if no bytes were copied, in which case the caller should retry. + * Return <0 on error. + */ +static int copy_one_range(struct btrfs_inode *inode, struct iov_iter *iter, + struct extent_changeset **data_reserved, u64 start, + bool nowait) +{ + struct btrfs_fs_info *fs_info = inode->root->fs_info; + struct extent_state *cached_state = NULL; + size_t write_bytes = calc_write_bytes(inode, iter, start); + size_t copied; + const u64 reserved_start = round_down(start, fs_info->sectorsize); + u64 reserved_len; + struct folio *folio = NULL; + int extents_locked; + u64 lockstart; + u64 lockend; + bool only_release_metadata = false; + const unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0); + int ret; + + /* + * Fault all pages before locking them in prepare_one_folio() to avoid + * recursive lock. + */ + if (unlikely(fault_in_iov_iter_readable(iter, write_bytes))) + return -EFAULT; + extent_changeset_release(*data_reserved); + ret = reserve_space(inode, data_reserved, start, &write_bytes, nowait, + &only_release_metadata); + if (ret < 0) + return ret; + reserved_len = ret; + /* Write range must be inside the reserved range. 
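
calc_write_bytes() above clamps a single copy so it never crosses the boundary of the folio that will back it: the remaining iterator bytes are limited to the distance from the write offset to the end of the largest possible folio. A small self-contained model of that computation (names mirror the kernel function but this is an illustration, not the real code):

#include <stdint.h>
#include <stdio.h>

/* Model of calc_write_bytes(): never copy across a folio boundary. */
static uint64_t calc_write_bytes(uint64_t max_folio_size, uint64_t start,
				 uint64_t remaining)
{
	uint64_t to_boundary = max_folio_size - (start & (max_folio_size - 1));

	return to_boundary < remaining ? to_boundary : remaining;
}

int main(void)
{
	/*
	 * 64K max folio size, write starting 5K into a folio, 100K left in
	 * the iterator: only the 59K up to the folio boundary is attempted
	 * in this pass; the outer write loop handles the rest.
	 */
	printf("%llu\n", (unsigned long long)
	       calc_write_bytes(64 * 1024, 5 * 1024, 100 * 1024));
	return 0;
}
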
*/ + ASSERT(reserved_start <= start); + ASSERT(start + write_bytes <= reserved_start + reserved_len); - release_bytes = reserve_bytes; again: - ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags); - if (ret) { - btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); - break; - } + ret = balance_dirty_pages_ratelimited_flags(inode->vfs_inode.i_mapping, + bdp_flags); + if (ret) { + btrfs_delalloc_release_extents(inode, reserved_len); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + return ret; + } - ret = prepare_one_folio(inode, &folio, pos, write_bytes, - force_page_uptodate, false); - if (ret) { - btrfs_delalloc_release_extents(BTRFS_I(inode), - reserve_bytes); - break; - } + ret = prepare_one_folio(&inode->vfs_inode, &folio, start, write_bytes, false); + if (ret) { + btrfs_delalloc_release_extents(inode, reserved_len); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + return ret; + } - extents_locked = lock_and_cleanup_extent_if_need(BTRFS_I(inode), - folio, pos, write_bytes, &lockstart, - &lockend, nowait, &cached_state); - if (extents_locked < 0) { - if (!nowait && extents_locked == -EAGAIN) - goto again; + /* + * The reserved range goes beyond the current folio, shrink the reserved + * space to the folio boundary. + */ + if (reserved_start + reserved_len > folio_pos(folio) + folio_size(folio)) { + const u64 last_block = folio_pos(folio) + folio_size(folio); + + shrink_reserved_space(inode, *data_reserved, reserved_start, + reserved_len, last_block - reserved_start, + only_release_metadata); + write_bytes = last_block - start; + reserved_len = last_block - reserved_start; + } + + extents_locked = lock_and_cleanup_extent_if_need(inode, folio, start, + write_bytes, &lockstart, + &lockend, nowait, + &cached_state); + if (extents_locked < 0) { + if (!nowait && extents_locked == -EAGAIN) + goto again; - btrfs_delalloc_release_extents(BTRFS_I(inode), - reserve_bytes); - ret = extents_locked; - break; - } + btrfs_delalloc_release_extents(inode, reserved_len); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + ret = extents_locked; + return ret; + } + + copied = copy_folio_from_iter_atomic(folio, offset_in_folio(folio, start), + write_bytes, iter); + flush_dcache_folio(folio); - copied = copy_folio_from_iter_atomic(folio, - offset_in_folio(folio, pos), write_bytes, i); - flush_dcache_folio(folio); + if (unlikely(copied < write_bytes)) { + u64 last_block; /* - * If we get a partial write, we can end up with partially - * uptodate page. Although if sector size < page size we can - * handle it, but if it's not sector aligned it can cause - * a lot of complexity, so make sure they don't happen by - * forcing retry this copy. + * The original write range doesn't need an uptodate folio as + * the range is block aligned. But now a short copy happened. + * We cannot handle it without an uptodate folio. + * + * So just revert the range and we will retry. */ - if (unlikely(copied < write_bytes)) { - if (!folio_test_uptodate(folio)) { - iov_iter_revert(i, copied); - copied = 0; - } + if (!folio_test_uptodate(folio)) { + iov_iter_revert(iter, copied); + copied = 0; } - num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes); - dirty_sectors = round_up(copied + sector_offset, - fs_info->sectorsize); - dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors); - + /* No copied bytes, unlock, release reserved space and exit. 
*/ if (copied == 0) { - force_page_uptodate = true; - dirty_sectors = 0; - } else { - force_page_uptodate = false; + if (extents_locked) + btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, + &cached_state); + else + btrfs_free_extent_state(cached_state); + btrfs_delalloc_release_extents(inode, reserved_len); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + btrfs_drop_folio(fs_info, folio, start, copied); + return 0; } - if (num_sectors > dirty_sectors) { - /* release everything except the sectors we dirtied */ - release_bytes -= dirty_sectors << fs_info->sectorsize_bits; - if (only_release_metadata) { - btrfs_delalloc_release_metadata(BTRFS_I(inode), - release_bytes, true); - } else { - u64 release_start = round_up(pos + copied, - fs_info->sectorsize); - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, release_start, - release_bytes, true); - } - } + /* Release the reserved space beyond the last block. */ + last_block = round_up(start + copied, fs_info->sectorsize); - release_bytes = round_up(copied + sector_offset, - fs_info->sectorsize); + shrink_reserved_space(inode, *data_reserved, reserved_start, + reserved_len, last_block - reserved_start, + only_release_metadata); + reserved_len = last_block - reserved_start; + } - ret = btrfs_dirty_folio(BTRFS_I(inode), folio, pos, copied, - &cached_state, only_release_metadata); + ret = btrfs_dirty_folio(inode, folio, start, copied, &cached_state, + only_release_metadata); + /* + * If we have not locked the extent range, because the range's start + * offset is >= i_size, we might still have a non-NULL cached extent + * state, acquired while marking the extent range as delalloc through + * btrfs_dirty_page(). Therefore free any possible cached extent state + * to avoid a memory leak. + */ + if (extents_locked) + btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + else + btrfs_free_extent_state(cached_state); - /* - * If we have not locked the extent range, because the range's - * start offset is >= i_size, we might still have a non-NULL - * cached extent state, acquired while marking the extent range - * as delalloc through btrfs_dirty_page(). Therefore free any - * possible cached extent state to avoid a memory leak. 
- */ - if (extents_locked) - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state); - else - free_extent_state(cached_state); + btrfs_delalloc_release_extents(inode, reserved_len); + if (ret) { + btrfs_drop_folio(fs_info, folio, start, copied); + release_space(inode, *data_reserved, reserved_start, reserved_len, + only_release_metadata); + return ret; + } + if (only_release_metadata) + btrfs_check_nocow_unlock(inode); - btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes); - if (ret) { - btrfs_drop_folio(fs_info, folio, pos, copied); - break; - } + btrfs_drop_folio(fs_info, folio, start, copied); + return copied; +} - release_bytes = 0; - if (only_release_metadata) - btrfs_check_nocow_unlock(BTRFS_I(inode)); +ssize_t btrfs_buffered_write(struct kiocb *iocb, struct iov_iter *iter) +{ + struct file *file = iocb->ki_filp; + loff_t pos; + struct inode *inode = file_inode(file); + struct extent_changeset *data_reserved = NULL; + size_t num_written = 0; + ssize_t ret; + loff_t old_isize; + unsigned int ilock_flags = 0; + const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); - btrfs_drop_folio(fs_info, folio, pos, copied); + if (nowait) + ilock_flags |= BTRFS_ILOCK_TRY; - cond_resched(); + ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags); + if (ret < 0) + return ret; - pos += copied; - num_written += copied; - } + /* + * We can only trust the isize with inode lock held, or it can race with + * other buffered writes and cause incorrect call of + * pagecache_isize_extended() to overwrite existing data. + */ + old_isize = i_size_read(inode); - if (release_bytes) { - if (only_release_metadata) { - btrfs_check_nocow_unlock(BTRFS_I(inode)); - btrfs_delalloc_release_metadata(BTRFS_I(inode), - release_bytes, true); - } else { - btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, - round_down(pos, fs_info->sectorsize), - release_bytes, true); - } + ret = generic_write_checks(iocb, iter); + if (ret <= 0) + goto out; + + ret = btrfs_write_check(iocb, ret); + if (ret < 0) + goto out; + + pos = iocb->ki_pos; + while (iov_iter_count(iter) > 0) { + ret = copy_one_range(BTRFS_I(inode), iter, &data_reserved, pos, nowait); + if (ret < 0) + break; + pos += ret; + num_written += ret; + cond_resched(); } extent_changeset_free(data_reserved); @@ -1400,7 +1464,7 @@ int btrfs_release_file(struct inode *inode, struct file *filp) if (private) { kfree(private->filldir_buf); - free_extent_state(private->llseek_cached_state); + btrfs_free_extent_state(private->llseek_cached_state); kfree(private); filp->private_data = NULL; } @@ -1776,17 +1840,14 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) struct extent_changeset *data_reserved = NULL; unsigned long zero_start; loff_t size; - vm_fault_t ret; - int ret2; - int reserved = 0; + size_t fsize = folio_size(folio); + int ret; u64 reserved_space; u64 page_start; u64 page_end; u64 end; - ASSERT(folio_order(folio) == 0); - - reserved_space = PAGE_SIZE; + reserved_space = fsize; sb_start_pagefault(inode->i_sb); page_start = folio_pos(folio); @@ -1801,21 +1862,14 @@ static vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf) * end up waiting indefinitely to get a lock on the page currently * being processed by btrfs_page_mkwrite() function. 
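
With the refactor above, btrfs_buffered_write() reduces to a loop around copy_one_range(), whose contract is: return the number of bytes copied, zero when the caller should retry the same offset (e.g. after a short copy against a non-uptodate folio), or a negative error. A toy driver illustrating that contract; all names here are illustrative stand-ins, not the kernel functions:

#include <stdio.h>
#include <string.h>
#include <sys/types.h>

/* Toy model of copy_one_range(): >0 copied, 0 retry, <0 hard error. */
static ssize_t copy_one_range_model(size_t remaining, int *fault_once)
{
	if (*fault_once) {		/* simulate one transient short copy */
		*fault_once = 0;
		return 0;		/* caller retries the same offset */
	}
	return remaining > 8 ? 8 : (ssize_t)remaining;
}

int main(void)
{
	const char *data = "hello buffered write";
	size_t pos = 0, len = strlen(data);
	int fault_once = 1;

	while (pos < len) {
		ssize_t ret = copy_one_range_model(len - pos, &fault_once);

		if (ret < 0)
			return 1;
		pos += (size_t)ret;	/* ret == 0 simply loops again */
	}
	printf("copied %zu bytes\n", pos);
	return 0;
}
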
*/ - ret2 = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, - page_start, reserved_space); - if (!ret2) { - ret2 = file_update_time(vmf->vma->vm_file); - reserved = 1; - } - if (ret2) { - ret = vmf_error(ret2); - if (reserved) - goto out; + ret = btrfs_delalloc_reserve_space(BTRFS_I(inode), &data_reserved, + page_start, reserved_space); + if (ret < 0) goto out_noreserve; - } - /* Make the VM retry the fault. */ - ret = VM_FAULT_NOPAGE; + ret = file_update_time(vmf->vma->vm_file); + if (ret < 0) + goto out; again: down_read(&BTRFS_I(inode)->i_mmap_lock); folio_lock(folio); @@ -1828,11 +1882,10 @@ again: } folio_wait_writeback(folio); - lock_extent(io_tree, page_start, page_end, &cached_state); - ret2 = set_folio_extent_mapped(folio); - if (ret2 < 0) { - ret = vmf_error(ret2); - unlock_extent(io_tree, page_start, page_end, &cached_state); + btrfs_lock_extent(io_tree, page_start, page_end, &cached_state); + ret = set_folio_extent_mapped(folio); + if (ret < 0) { + btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); goto out_unlock; } @@ -1840,9 +1893,9 @@ again: * We can't set the delalloc bits if there are pending ordered * extents. Drop our locks and wait for them to finish. */ - ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, PAGE_SIZE); + ordered = btrfs_lookup_ordered_range(BTRFS_I(inode), page_start, fsize); if (ordered) { - unlock_extent(io_tree, page_start, page_end, &cached_state); + btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); folio_unlock(folio); up_read(&BTRFS_I(inode)->i_mmap_lock); btrfs_start_ordered_extent(ordered); @@ -1850,13 +1903,13 @@ again: goto again; } - if (folio->index == ((size - 1) >> PAGE_SHIFT)) { + if (folio_contains(folio, (size - 1) >> PAGE_SHIFT)) { reserved_space = round_up(size - page_start, fs_info->sectorsize); - if (reserved_space < PAGE_SIZE) { + if (reserved_space < fsize) { end = page_start + reserved_space - 1; btrfs_delalloc_release_space(BTRFS_I(inode), - data_reserved, page_start, - PAGE_SIZE - reserved_space, true); + data_reserved, end + 1, + fsize - reserved_space, true); } } @@ -1867,15 +1920,14 @@ again: * clear any delalloc bits within this page range since we have to * reserve data&meta space before lock_page() (see above comments). 
*/ - clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, &cached_state); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, page_start, end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, &cached_state); - ret2 = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, + ret = btrfs_set_extent_delalloc(BTRFS_I(inode), page_start, end, 0, &cached_state); - if (ret2) { - unlock_extent(io_tree, page_start, page_end, &cached_state); - ret = VM_FAULT_SIGBUS; + if (ret < 0) { + btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); goto out_unlock; } @@ -1883,21 +1935,21 @@ again: if (page_start + folio_size(folio) > size) zero_start = offset_in_folio(folio, size); else - zero_start = PAGE_SIZE; + zero_start = fsize; - if (zero_start != PAGE_SIZE) + if (zero_start != fsize) folio_zero_range(folio, zero_start, folio_size(folio) - zero_start); - btrfs_folio_clear_checked(fs_info, folio, page_start, PAGE_SIZE); + btrfs_folio_clear_checked(fs_info, folio, page_start, fsize); btrfs_folio_set_dirty(fs_info, folio, page_start, end + 1 - page_start); btrfs_folio_set_uptodate(fs_info, folio, page_start, end + 1 - page_start); btrfs_set_inode_last_sub_trans(BTRFS_I(inode)); - unlock_extent(io_tree, page_start, page_end, &cached_state); + btrfs_unlock_extent(io_tree, page_start, page_end, &cached_state); up_read(&BTRFS_I(inode)->i_mmap_lock); - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), fsize); sb_end_pagefault(inode->i_sb); extent_changeset_free(data_reserved); return VM_FAULT_LOCKED; @@ -1906,13 +1958,18 @@ out_unlock: folio_unlock(folio); up_read(&BTRFS_I(inode)->i_mmap_lock); out: - btrfs_delalloc_release_extents(BTRFS_I(inode), PAGE_SIZE); + btrfs_delalloc_release_extents(BTRFS_I(inode), fsize); btrfs_delalloc_release_space(BTRFS_I(inode), data_reserved, page_start, - reserved_space, (ret != 0)); + reserved_space, true); + extent_changeset_free(data_reserved); out_noreserve: sb_end_pagefault(inode->i_sb); - extent_changeset_free(data_reserved); - return ret; + + if (ret < 0) + return vmf_error(ret); + + /* Make the VM retry the fault. 
*/ + return VM_FAULT_NOPAGE; } static const struct vm_operations_struct btrfs_file_vm_ops = { @@ -1934,33 +1991,33 @@ static int btrfs_file_mmap(struct file *filp, struct vm_area_struct *vma) return 0; } -static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf, - int slot, u64 start, u64 end) +static bool hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf, + int slot, u64 start, u64 end) { struct btrfs_file_extent_item *fi; struct btrfs_key key; if (slot < 0 || slot >= btrfs_header_nritems(leaf)) - return 0; + return false; btrfs_item_key_to_cpu(leaf, &key, slot); if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) - return 0; + return false; fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item); if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG) - return 0; + return false; if (btrfs_file_extent_disk_bytenr(leaf, fi)) - return 0; + return false; if (key.offset == end) - return 1; + return true; if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start) - return 1; - return 0; + return true; + return false; } static int fill_holes(struct btrfs_trans_handle *trans, @@ -2034,7 +2091,7 @@ static int fill_holes(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); - hole_em = alloc_extent_map(); + hole_em = btrfs_alloc_extent_map(); if (!hole_em) { btrfs_drop_extent_map_range(inode, offset, end - 1, false); btrfs_set_inode_full_sync(inode); @@ -2048,7 +2105,7 @@ out: hole_em->generation = trans->transid; ret = btrfs_replace_extent_map_range(inode, hole_em, true); - free_extent_map(hole_em); + btrfs_free_extent_map(hole_em); if (ret) btrfs_set_inode_full_sync(inode); } @@ -2081,15 +2138,33 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len) 0 : *start + *len - em->start - em->len; *start = em->start + em->len; } - free_extent_map(em); + btrfs_free_extent_map(em); return ret; } -static void btrfs_punch_hole_lock_range(struct inode *inode, - const u64 lockstart, - const u64 lockend, - struct extent_state **cached_state) +/* + * Check if there is no folio in the range. + * + * We cannot utilize filemap_range_has_page() in a filemap with large folios + * as we can hit the following false positive: + * + * start end + * | | + * |//|//|//|//| | | | | | | | |//|//| + * \ / \ / + * Folio A Folio B + * + * That large folio A and B cover the start and end indexes. + * In that case filemap_range_has_page() will always return true, but the above + * case is fine for btrfs_punch_hole_lock_range() usage. + * + * So here we only ensure that no other folios is in the range, excluding the + * head/tail large folio. + */ +static bool check_range_has_page(struct inode *inode, u64 start, u64 end) { + struct folio_batch fbatch; + bool ret = false; /* * For subpage case, if the range is not at page boundary, we could * have pages at the leading/tailing part of the range. @@ -2097,15 +2172,48 @@ static void btrfs_punch_hole_lock_range(struct inode *inode, * will always return true. * So here we need to do extra page alignment for * filemap_range_has_page(). + * + * And do not decrease page_lockend right now, as it can be 0. 
*/ - const u64 page_lockstart = round_up(lockstart, PAGE_SIZE); - const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1; + const u64 page_lockstart = round_up(start, PAGE_SIZE); + const u64 page_lockend = round_down(end + 1, PAGE_SIZE); + const pgoff_t start_index = page_lockstart >> PAGE_SHIFT; + const pgoff_t end_index = (page_lockend - 1) >> PAGE_SHIFT; + pgoff_t tmp = start_index; + int found_folios; + + /* The same page or adjacent pages. */ + if (page_lockend <= page_lockstart) + return false; + + folio_batch_init(&fbatch); + found_folios = filemap_get_folios(inode->i_mapping, &tmp, end_index, &fbatch); + for (int i = 0; i < found_folios; i++) { + struct folio *folio = fbatch.folios[i]; + + /* A large folio begins before the start. Not a target. */ + if (folio->index < start_index) + continue; + /* A large folio extends beyond the end. Not a target. */ + if (folio->index + folio_nr_pages(folio) > end_index) + continue; + /* A folio doesn't cover the head/tail index. Found a target. */ + ret = true; + break; + } + folio_batch_release(&fbatch); + return ret; +} +static void btrfs_punch_hole_lock_range(struct inode *inode, + const u64 lockstart, const u64 lockend, + struct extent_state **cached_state) +{ while (1) { truncate_pagecache_range(inode, lockstart, lockend); - lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - cached_state); + btrfs_lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state); /* * We can't have ordered extents in the range, nor dirty/writeback * pages, because we have locked the inode's VFS lock in exclusive @@ -2116,12 +2224,11 @@ static void btrfs_punch_hole_lock_range(struct inode *inode, * locking the range check if we have pages in the range, and if * we do, unlock the range and retry. */ - if (!filemap_range_has_page(inode->i_mapping, page_lockstart, - page_lockend)) + if (!check_range_has_page(inode, lockstart, lockend)) break; - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + cached_state); } btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend); @@ -2494,7 +2601,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) u64 lockend; u64 tail_start; u64 tail_len; - u64 orig_start = offset; + const u64 orig_start = offset; + const u64 orig_end = offset + len - 1; int ret = 0; bool same_block; u64 ino_size; @@ -2526,18 +2634,14 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset)) == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)); /* - * We needn't truncate any block which is beyond the end of the file - * because we are sure there is no data there. - */ - /* * Only do this if we are in the same block and we aren't doing the * entire block. 
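
check_range_has_page() above works around the false positive that filemap_range_has_page() would report when a large folio merely overlaps the head or tail of the punched range, as the diagram in its comment shows. A userspace toy mirroring the filtering on an array of (index, page-count) pairs; the structure and names are invented for illustration:

#include <stdbool.h>
#include <stdio.h>

struct toy_folio {
	unsigned long index;	/* first page index */
	unsigned long nr;	/* pages covered */
};

/*
 * A folio only counts as blocking the range if it lies fully inside
 * [start_index, end_index]; large head/tail folios that merely overlap
 * the edges are skipped, matching check_range_has_page()'s intent.
 */
static bool range_has_page(const struct toy_folio *folios, int n,
			   unsigned long start_index, unsigned long end_index)
{
	for (int i = 0; i < n; i++) {
		if (folios[i].index < start_index)
			continue;	/* begins before the range */
		if (folios[i].index + folios[i].nr - 1 > end_index)
			continue;	/* extends beyond the range */
		return true;
	}
	return false;
}

int main(void)
{
	/* Two 4-page folios at indexes 0 and 12; range covers pages 2..13. */
	struct toy_folio cache[] = { { 0, 4 }, { 12, 4 } };

	/* filemap_range_has_page() would say "busy" here; this says "clear". */
	printf("%s\n", range_has_page(cache, 2, 2, 13) ? "busy" : "clear");
	return 0;
}
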
*/ if (same_block && len < fs_info->sectorsize) { if (offset < ino_size) { truncated_block = true; - ret = btrfs_truncate_block(BTRFS_I(inode), offset, len, - 0); + ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, + orig_start, orig_end); } else { ret = 0; } @@ -2547,7 +2651,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) /* zero back part of the first block */ if (offset < ino_size) { truncated_block = true; - ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), offset, orig_start, orig_end); if (ret) { btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); return ret; @@ -2584,8 +2688,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) if (tail_start + tail_len < ino_size) { truncated_block = true; ret = btrfs_truncate_block(BTRFS_I(inode), - tail_start + tail_len, - 0, 1); + tail_start + tail_len - 1, + orig_start, orig_end); if (ret) goto out_only_mutex; } @@ -2619,8 +2723,8 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len) btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); out: - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); out_only_mutex: if (!updated_inode && truncated_block && !ret) { /* @@ -2738,7 +2842,7 @@ static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode, else ret = RANGE_BOUNDARY_WRITTEN_EXTENT; - free_extent_map(em); + btrfs_free_extent_map(em); return ret; } @@ -2753,6 +2857,8 @@ static int btrfs_zero_range(struct inode *inode, int ret; u64 alloc_hint = 0; const u64 sectorsize = fs_info->sectorsize; + const u64 orig_start = offset; + const u64 orig_end = offset + len - 1; u64 alloc_start = round_down(offset, sectorsize); u64 alloc_end = round_up(offset + len, sectorsize); u64 bytes_to_reserve = 0; @@ -2782,7 +2888,7 @@ static int btrfs_zero_range(struct inode *inode, * do nothing except updating the inode's i_size if * needed. 
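
The btrfs_truncate_block() calls above now take the position of the affected block plus the original punch range, rather than the old front/back flags. Conceptually the hole-punch edges behave as before: partial head and tail blocks are zeroed rather than dropped, and only the whole blocks in between are actually punched. A toy computation of those three sub-ranges, with illustrative values and a power-of-two block size assumed:

#include <stdint.h>
#include <stdio.h>

#define round_down(x, a) ((x) & ~((uint64_t)(a) - 1))

int main(void)
{
	const uint64_t blocksize = 4096;
	uint64_t hole_start = 5000, hole_end = 14000;	/* [start, end) */

	uint64_t head_block = round_down(hole_start, blocksize);
	uint64_t tail_block = round_down(hole_end, blocksize);

	printf("zero head block bytes [%llu, %llu)\n",
	       (unsigned long long)hole_start,
	       (unsigned long long)(head_block + blocksize));
	printf("zero tail block bytes [%llu, %llu)\n",
	       (unsigned long long)tail_block,
	       (unsigned long long)hole_end);
	printf("punch whole blocks  [%llu, %llu)\n",
	       (unsigned long long)(head_block + blocksize),
	       (unsigned long long)tail_block);
	return 0;
}
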
*/ - free_extent_map(em); + btrfs_free_extent_map(em); ret = btrfs_fallocate_update_isize(inode, offset + len, mode); goto out; @@ -2795,9 +2901,9 @@ static int btrfs_zero_range(struct inode *inode, ASSERT(IS_ALIGNED(alloc_start, sectorsize)); len = offset + len - alloc_start; offset = alloc_start; - alloc_hint = extent_map_block_start(em) + em->len; + alloc_hint = btrfs_extent_map_block_start(em) + em->len; } - free_extent_map(em); + btrfs_free_extent_map(em); if (BTRFS_BYTES_TO_BLKS(fs_info, offset) == BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) { @@ -2808,22 +2914,22 @@ static int btrfs_zero_range(struct inode *inode, } if (em->flags & EXTENT_FLAG_PREALLOC) { - free_extent_map(em); + btrfs_free_extent_map(em); ret = btrfs_fallocate_update_isize(inode, offset + len, mode); goto out; } if (len < sectorsize && em->disk_bytenr != EXTENT_MAP_HOLE) { - free_extent_map(em); - ret = btrfs_truncate_block(BTRFS_I(inode), offset, len, - 0); + btrfs_free_extent_map(em); + ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, + orig_start, orig_end); if (!ret) ret = btrfs_fallocate_update_isize(inode, offset + len, mode); return ret; } - free_extent_map(em); + btrfs_free_extent_map(em); alloc_start = round_down(offset, sectorsize); alloc_end = alloc_start + sectorsize; goto reserve_space; @@ -2847,7 +2953,8 @@ static int btrfs_zero_range(struct inode *inode, alloc_start = round_down(offset, sectorsize); ret = 0; } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { - ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), offset, + orig_start, orig_end); if (ret) goto out; } else { @@ -2864,8 +2971,8 @@ static int btrfs_zero_range(struct inode *inode, alloc_end = round_up(offset + len, sectorsize); ret = 0; } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) { - ret = btrfs_truncate_block(BTRFS_I(inode), offset + len, - 0, 1); + ret = btrfs_truncate_block(BTRFS_I(inode), offset + len - 1, + orig_start, orig_end); if (ret) goto out; } else { @@ -2890,16 +2997,16 @@ reserve_space: ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, alloc_start, bytes_to_reserve); if (ret) { - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, - lockend, &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, + lockend, &cached_state); goto out; } ret = btrfs_prealloc_file_range(inode, mode, alloc_start, alloc_end - alloc_start, fs_info->sectorsize, offset + len, &alloc_hint); - unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, - &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend, + &cached_state); /* btrfs_prealloc_file_range releases reserved space on error */ if (ret) { space_reserved = false; @@ -2985,7 +3092,8 @@ static long btrfs_fallocate(struct file *file, int mode, * need to zero out the end of the block if i_size lands in the * middle of a block. 
*/ - ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0); + ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, + inode->i_size, (u64)-1); if (ret) goto out; } @@ -3010,8 +3118,8 @@ static long btrfs_fallocate(struct file *file, int mode, } locked_end = alloc_end - 1; - lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state); + btrfs_lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state); btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end); @@ -3023,8 +3131,8 @@ static long btrfs_fallocate(struct file *file, int mode, ret = PTR_ERR(em); break; } - last_byte = min(extent_map_end(em), alloc_end); - actual_end = min_t(u64, extent_map_end(em), offset + len); + last_byte = min(btrfs_extent_map_end(em), alloc_end); + actual_end = min_t(u64, btrfs_extent_map_end(em), offset + len); last_byte = ALIGN(last_byte, blocksize); if (em->disk_bytenr == EXTENT_MAP_HOLE || (cur_offset >= inode->i_size && @@ -3033,19 +3141,19 @@ static long btrfs_fallocate(struct file *file, int mode, ret = add_falloc_range(&reserve_list, cur_offset, range_len); if (ret < 0) { - free_extent_map(em); + btrfs_free_extent_map(em); break; } ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved, cur_offset, range_len); if (ret < 0) { - free_extent_map(em); + btrfs_free_extent_map(em); break; } qgroup_reserved += range_len; data_space_needed += range_len; } - free_extent_map(em); + btrfs_free_extent_map(em); cur_offset = last_byte; } @@ -3099,8 +3207,8 @@ static long btrfs_fallocate(struct file *file, int mode, */ ret = btrfs_fallocate_update_isize(inode, actual_end, mode); out_unlock: - unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, - &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end, + &cached_state); out: btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP); extent_changeset_free(data_reserved); @@ -3134,10 +3242,10 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end if (inode->delalloc_bytes > 0) { spin_unlock(&inode->lock); *delalloc_start_ret = start; - delalloc_len = count_range_bits(&inode->io_tree, - delalloc_start_ret, end, - len, EXTENT_DELALLOC, 1, - cached_state); + delalloc_len = btrfs_count_range_bits(&inode->io_tree, + delalloc_start_ret, end, + len, EXTENT_DELALLOC, 1, + cached_state); } else { spin_unlock(&inode->lock); } @@ -3446,7 +3554,7 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) last_extent_end = lockstart; - lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_lock_extent(&inode->io_tree, lockstart, lockend, &cached_state); ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) { @@ -3592,7 +3700,7 @@ static loff_t find_desired_extent(struct file *file, loff_t offset, int whence) } out: - unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); + btrfs_unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state); btrfs_free_path(path); if (ret < 0) diff --git a/fs/btrfs/file.h b/fs/btrfs/file.h index de89e644be29..d7df81388cbe 100644 --- a/fs/btrfs/file.h +++ b/fs/btrfs/file.h @@ -9,6 +9,8 @@ struct file; struct extent_state; struct kiocb; struct iov_iter; +struct inode; +struct folio; struct page; struct btrfs_ioctl_encoded_io_args; struct btrfs_drop_extents_args; diff --git a/fs/btrfs/free-space-cache.c b/fs/btrfs/free-space-cache.c index d42b6f882f57..4b34ea1f01c2 100644 --- a/fs/btrfs/free-space-cache.c +++ b/fs/btrfs/free-space-cache.c 
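
The free-space-cache.c hunks that follow convert __lookup_free_space_inode() to work with struct btrfs_inode internally and propagate lookup failures with ERR_CAST(). For reference, a userspace model of the kernel's pointer-encoded errno scheme; these are simplified stand-ins, not the real include/linux/err.h implementations:

#include <errno.h>
#include <stdio.h>

static inline void *ERR_PTR(long err) { return (void *)err; }
static inline long PTR_ERR(const void *p) { return (long)p; }
static inline int IS_ERR(const void *p)
{
	return (unsigned long)p >= (unsigned long)-4095;
}
#define ERR_CAST(p) ((void *)(p))	/* type-changing pass-through */

struct toy_inode { int unused; };

static struct toy_inode *lookup_inode(int fail)
{
	static struct toy_inode ino;

	return fail ? ERR_PTR(-ENOENT) : &ino;
}

/* Same shape as the converted __lookup_free_space_inode() tail. */
static void *lookup_wrapper(int fail)
{
	struct toy_inode *inode = lookup_inode(fail);

	if (IS_ERR(inode))
		return ERR_CAST(inode);
	return inode;
}

int main(void)
{
	void *p = lookup_wrapper(1);

	if (IS_ERR(p))
		printf("lookup failed: %ld\n", PTR_ERR(p));
	return 0;
}
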
@@ -88,13 +88,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, struct btrfs_disk_key disk_key; struct btrfs_free_space_header *header; struct extent_buffer *leaf; - struct inode *inode = NULL; + struct btrfs_inode *inode; unsigned nofs_flag; int ret; key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -120,13 +120,13 @@ static struct inode *__lookup_free_space_inode(struct btrfs_root *root, btrfs_release_path(path); memalloc_nofs_restore(nofs_flag); if (IS_ERR(inode)) - return inode; + return ERR_CAST(inode); - mapping_set_gfp_mask(inode->i_mapping, - mapping_gfp_constraint(inode->i_mapping, + mapping_set_gfp_mask(inode->vfs_inode.i_mapping, + mapping_gfp_constraint(inode->vfs_inode.i_mapping, ~(__GFP_FS | __GFP_HIGHMEM))); - return inode; + return &inode->vfs_inode; } struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group, @@ -201,8 +201,8 @@ static int __create_free_space_inode(struct btrfs_root *root, btrfs_release_path(path); key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_insert_empty_item(trans, root, path, &key, sizeof(struct btrfs_free_space_header)); if (ret < 0) { @@ -244,7 +244,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, struct inode *inode, struct btrfs_block_group *block_group) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int ret = 0; @@ -257,12 +257,12 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) { if (PTR_ERR(inode) != -ENOENT) ret = PTR_ERR(inode); - goto out; + return ret; } ret = btrfs_orphan_add(trans, BTRFS_I(inode)); if (ret) { btrfs_add_delayed_iput(BTRFS_I(inode)); - goto out; + return ret; } clear_nlink(inode); /* One for the block groups ref */ @@ -285,12 +285,9 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans, if (ret) { if (ret > 0) ret = 0; - goto out; + return ret; } - ret = btrfs_del_item(trans, trans->fs_info->tree_root, path); -out: - btrfs_free_path(path); - return ret; + return btrfs_del_item(trans, trans->fs_info->tree_root, path); } int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, @@ -311,8 +308,9 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, bool locked = false; if (block_group) { - struct btrfs_path *path = btrfs_alloc_path(); + BTRFS_PATH_AUTO_FREE(path); + path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; goto fail; @@ -333,13 +331,12 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, spin_lock(&block_group->lock); block_group->disk_cache_state = BTRFS_DC_CLEAR; spin_unlock(&block_group->lock); - btrfs_free_path(path); } btrfs_i_size_write(inode, 0); truncate_pagecache(vfs_inode, 0); - lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); + btrfs_lock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); btrfs_drop_extent_map_range(inode, 0, (u64)-1, false); /* @@ -351,7 +348,7 @@ int btrfs_truncate_free_space_cache(struct btrfs_trans_handle *trans, inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); btrfs_inode_safe_disk_i_size_write(inode, control.last_size); - unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); + btrfs_unlock_extent(&inode->io_tree, 0, (u64)-1, &cached_state); if (ret) goto fail; @@ -447,7 +444,7 @@ static void io_ctl_drop_pages(struct btrfs_io_ctl *io_ctl) static int 
io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) { - struct page *page; + struct folio *folio; struct inode *inode = io_ctl->inode; gfp_t mask = btrfs_alloc_write_mask(inode->i_mapping); int i; @@ -455,31 +452,33 @@ static int io_ctl_prepare_pages(struct btrfs_io_ctl *io_ctl, bool uptodate) for (i = 0; i < io_ctl->num_pages; i++) { int ret; - page = find_or_create_page(inode->i_mapping, i, mask); - if (!page) { + folio = __filemap_get_folio(inode->i_mapping, i, + FGP_LOCK | FGP_ACCESSED | FGP_CREAT, + mask); + if (IS_ERR(folio)) { io_ctl_drop_pages(io_ctl); - return -ENOMEM; + return PTR_ERR(folio); } - ret = set_folio_extent_mapped(page_folio(page)); + ret = set_folio_extent_mapped(folio); if (ret < 0) { - unlock_page(page); - put_page(page); + folio_unlock(folio); + folio_put(folio); io_ctl_drop_pages(io_ctl); return ret; } - io_ctl->pages[i] = page; - if (uptodate && !PageUptodate(page)) { - btrfs_read_folio(NULL, page_folio(page)); - lock_page(page); - if (page->mapping != inode->i_mapping) { + io_ctl->pages[i] = &folio->page; + if (uptodate && !folio_test_uptodate(folio)) { + btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping != inode->i_mapping) { btrfs_err(BTRFS_I(inode)->root->fs_info, "free space cache page truncated"); io_ctl_drop_pages(io_ctl); return -EIO; } - if (!PageUptodate(page)) { + if (!folio_test_uptodate(folio)) { btrfs_err(BTRFS_I(inode)->root->fs_info, "error reading free space cache"); io_ctl_drop_pages(io_ctl); @@ -753,8 +752,8 @@ static int __load_free_space_cache(struct btrfs_root *root, struct inode *inode, return 0; key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) @@ -1081,9 +1080,8 @@ int write_cache_extent_entries(struct btrfs_io_ctl *io_ctl, /* Get the cluster for this block_group if it exists */ if (block_group && !list_empty(&block_group->cluster_list)) { - cluster = list_entry(block_group->cluster_list.next, - struct btrfs_free_cluster, - block_group_list); + cluster = list_first_entry(&block_group->cluster_list, + struct btrfs_free_cluster, block_group_list); } if (!node && cluster) { @@ -1156,13 +1154,13 @@ update_cache_item(struct btrfs_trans_handle *trans, int ret; key.objectid = BTRFS_FREE_SPACE_OBJECTID; - key.offset = offset; key.type = 0; + key.offset = offset; ret = btrfs_search_slot(trans, root, &key, path, 0, 1); if (ret < 0) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DELALLOC, NULL); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, + EXTENT_DELALLOC, NULL); goto fail; } leaf = path->nodes[0]; @@ -1173,9 +1171,9 @@ update_cache_item(struct btrfs_trans_handle *trans, btrfs_item_key_to_cpu(leaf, &found_key, path->slots[0]); if (found_key.objectid != BTRFS_FREE_SPACE_OBJECTID || found_key.offset != offset) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, - inode->i_size - 1, EXTENT_DELALLOC, - NULL); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, + inode->i_size - 1, EXTENT_DELALLOC, + NULL); btrfs_release_path(path); goto fail; } @@ -1220,9 +1218,9 @@ static noinline_for_stack int write_pinned_extent_entries( start = block_group->start; while (start < block_group->start + block_group->length) { - if (!find_first_extent_bit(unpin, start, - &extent_start, &extent_end, - EXTENT_DIRTY, NULL)) + if (!btrfs_find_first_extent_bit(unpin, start, + &extent_start, &extent_end, + EXTENT_DIRTY, NULL)) return 0; /* This pinned extent is 
out of our range */ @@ -1268,8 +1266,8 @@ static int flush_dirty_cache(struct inode *inode) ret = btrfs_wait_ordered_range(BTRFS_I(inode), 0, (u64)-1); if (ret) - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, - EXTENT_DELALLOC, NULL); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, inode->i_size - 1, + EXTENT_DELALLOC, NULL); return ret; } @@ -1289,8 +1287,8 @@ cleanup_write_cache_enospc(struct inode *inode, struct extent_state **cached_state) { io_ctl_drop_pages(io_ctl); - unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + cached_state); } static int __btrfs_wait_cache_io(struct btrfs_root *root, @@ -1415,8 +1413,8 @@ static int __btrfs_write_out_cache(struct inode *inode, if (ret) goto out_unlock; - lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - &cached_state); + btrfs_lock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + &cached_state); io_ctl_set_generation(io_ctl, trans->transid); @@ -1476,8 +1474,8 @@ static int __btrfs_write_out_cache(struct inode *inode, io_ctl_drop_pages(io_ctl); io_ctl_free(io_ctl); - unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, - &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, 0, i_size_read(inode) - 1, + &cached_state); /* * at this point the pages are under IO and we're happy, @@ -2343,9 +2341,8 @@ again: struct rb_node *node; struct btrfs_free_space *entry; - cluster = list_entry(block_group->cluster_list.next, - struct btrfs_free_cluster, - block_group_list); + cluster = list_first_entry(&block_group->cluster_list, + struct btrfs_free_cluster, block_group_list); spin_lock(&cluster->lock); node = rb_first(&cluster->root); if (!node) { diff --git a/fs/btrfs/free-space-tree.c b/fs/btrfs/free-space-tree.c index cae540ec15ed..0c573d46639a 100644 --- a/fs/btrfs/free-space-tree.c +++ b/fs/btrfs/free-space-tree.c @@ -117,7 +117,7 @@ struct btrfs_free_space_info *search_free_space_info( if (ret != 0) { btrfs_warn(fs_info, "missing free space info for %llu", block_group->start); - ASSERT(0); + DEBUG_WARN(); return ERR_PTR(-ENOENT); } @@ -141,12 +141,12 @@ static int btrfs_search_prev_slot(struct btrfs_trans_handle *trans, return ret; if (ret == 0) { - ASSERT(0); + DEBUG_WARN(); return -EIO; } if (p->slots[0] == 0) { - ASSERT(0); + DEBUG_WARN("no previous slot found"); return -EIO; } p->slots[0]--; @@ -223,6 +223,7 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, bitmap = alloc_bitmap(bitmap_size); if (!bitmap) { ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out; } @@ -235,8 +236,10 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } leaf = path->nodes[0]; nr = 0; @@ -271,14 +274,17 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } btrfs_release_path(path); } info = search_free_space_info(trans, block_group, path, 1); if (IS_ERR(info)) { ret = PTR_ERR(info); + btrfs_abort_transaction(trans, ret); goto out; } leaf = path->nodes[0]; @@ -293,8 +299,8 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, 
extent_count, expected_extent_count); - ASSERT(0); ret = -EIO; + btrfs_abort_transaction(trans, ret); goto out; } @@ -315,8 +321,10 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, ret = btrfs_insert_empty_item(trans, root, path, &key, data_size); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } leaf = path->nodes[0]; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); @@ -331,8 +339,6 @@ int convert_free_space_to_bitmaps(struct btrfs_trans_handle *trans, ret = 0; out: kvfree(bitmap); - if (ret) - btrfs_abort_transaction(trans, ret); return ret; } @@ -358,6 +364,7 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, bitmap = alloc_bitmap(bitmap_size); if (!bitmap) { ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out; } @@ -370,8 +377,10 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, while (!done) { ret = btrfs_search_prev_slot(trans, root, &key, path, -1, 1); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } leaf = path->nodes[0]; nr = 0; @@ -412,14 +421,17 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, } ret = btrfs_del_items(trans, root, path, path->slots[0], nr); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } btrfs_release_path(path); } info = search_free_space_info(trans, block_group, path, 1); if (IS_ERR(info)) { ret = PTR_ERR(info); + btrfs_abort_transaction(trans, ret); goto out; } leaf = path->nodes[0]; @@ -441,8 +453,10 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, key.offset = (end_bit - start_bit) * block_group->fs_info->sectorsize; ret = btrfs_insert_empty_item(trans, root, path, &key, 0); - if (ret) + if (ret) { + btrfs_abort_transaction(trans, ret); goto out; + } btrfs_release_path(path); extent_count++; @@ -455,16 +469,14 @@ int convert_free_space_to_extents(struct btrfs_trans_handle *trans, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, expected_extent_count); - ASSERT(0); ret = -EIO; + btrfs_abort_transaction(trans, ret); goto out; } ret = 0; out: kvfree(bitmap); - if (ret) - btrfs_abort_transaction(trans, ret); return ret; } @@ -838,13 +850,15 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out; } block_group = btrfs_lookup_block_group(trans->fs_info, start); if (!block_group) { - ASSERT(0); + DEBUG_WARN("no block group found for start=%llu", start); ret = -ENOENT; + btrfs_abort_transaction(trans, ret); goto out; } @@ -852,12 +866,12 @@ int remove_from_free_space_tree(struct btrfs_trans_handle *trans, ret = __remove_from_free_space_tree(trans, block_group, path, start, size); mutex_unlock(&block_group->free_space_lock); + if (ret) + btrfs_abort_transaction(trans, ret); btrfs_put_block_group(block_group); out: btrfs_free_path(path); - if (ret) - btrfs_abort_transaction(trans, ret); return ret; } @@ -1031,25 +1045,27 @@ int add_to_free_space_tree(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) { ret = -ENOMEM; + btrfs_abort_transaction(trans, ret); goto out; } block_group = btrfs_lookup_block_group(trans->fs_info, start); if (!block_group) { - ASSERT(0); + DEBUG_WARN("no block group found for start=%llu", start); ret = -ENOENT; + btrfs_abort_transaction(trans, ret); goto out; } mutex_lock(&block_group->free_space_lock); ret = __add_to_free_space_tree(trans, block_group, path, 
start, size); mutex_unlock(&block_group->free_space_lock); + if (ret) + btrfs_abort_transaction(trans, ret); btrfs_put_block_group(block_group); out: btrfs_free_path(path); - if (ret) - btrfs_abort_transaction(trans, ret); return ret; } @@ -1062,7 +1078,8 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_block_group *block_group) { struct btrfs_root *extent_root; - struct btrfs_path *path, *path2; + BTRFS_PATH_AUTO_FREE(path); + BTRFS_PATH_AUTO_FREE(path2); struct btrfs_key key; u64 start, end; int ret; @@ -1070,17 +1087,16 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, path = btrfs_alloc_path(); if (!path) return -ENOMEM; - path->reada = READA_FORWARD; path2 = btrfs_alloc_path(); - if (!path2) { - btrfs_free_path(path); + if (!path2) return -ENOMEM; - } + + path->reada = READA_FORWARD; ret = add_new_free_space_info(trans, block_group, path2); if (ret) - goto out; + return ret; mutex_lock(&block_group->free_space_lock); @@ -1146,9 +1162,7 @@ static int populate_free_space_tree(struct btrfs_trans_handle *trans, ret = 0; out_locked: mutex_unlock(&block_group->free_space_lock); -out: - btrfs_free_path(path2); - btrfs_free_path(path); + return ret; } @@ -1217,7 +1231,7 @@ out_clear: static int clear_free_space_tree(struct btrfs_trans_handle *trans, struct btrfs_root *root) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; int nr; int ret; @@ -1233,7 +1247,7 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans, while (1) { ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) - goto out; + return ret; nr = btrfs_header_nritems(path->nodes[0]); if (!nr) @@ -1242,15 +1256,12 @@ static int clear_free_space_tree(struct btrfs_trans_handle *trans, path->slots[0] = 0; ret = btrfs_del_items(trans, root, path, 0, nr); if (ret) - goto out; + return ret; btrfs_release_path(path); } - ret = 0; -out: - btrfs_free_path(path); - return ret; + return 0; } int btrfs_delete_free_space_tree(struct btrfs_fs_info *fs_info) @@ -1560,7 +1571,7 @@ static int load_free_space_bitmaps(struct btrfs_caching_control *caching_ctl, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, expected_extent_count); - ASSERT(0); + DEBUG_WARN(); ret = -EIO; goto out; } @@ -1624,7 +1635,7 @@ static int load_free_space_extents(struct btrfs_caching_control *caching_ctl, "incorrect extent count for %llu; counted %u, expected %u", block_group->start, extent_count, expected_extent_count); - ASSERT(0); + DEBUG_WARN(); ret = -EIO; goto out; } @@ -1638,9 +1649,8 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) { struct btrfs_block_group *block_group; struct btrfs_free_space_info *info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); u32 extent_count, flags; - int ret; block_group = caching_ctl->block_group; @@ -1657,10 +1667,9 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) path->reada = READA_FORWARD; info = search_free_space_info(NULL, block_group, path, 0); - if (IS_ERR(info)) { - ret = PTR_ERR(info); - goto out; - } + if (IS_ERR(info)) + return PTR_ERR(info); + extent_count = btrfs_free_space_extent_count(path->nodes[0], info); flags = btrfs_free_space_flags(path->nodes[0], info); @@ -1670,11 +1679,7 @@ int load_free_space_tree(struct btrfs_caching_control *caching_ctl) * there. 
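
A recurring change in the free-space-tree hunks above is moving btrfs_abort_transaction() from a shared out: label to each individual failure site, so the location recorded by the abort points at the statement that actually failed. A compact illustration of the pattern with a toy abort macro (names invented):

#include <stdio.h>

/* Toy abort: the real btrfs_abort_transaction() also logs its call site. */
#define abort_transaction(ret) \
	fprintf(stderr, "abort at %s:%d (ret=%d)\n", __func__, __LINE__, (ret))

static int do_step(int fail)
{
	return fail ? -5 : 0;	/* -EIO */
}

/* Abort at each failure site instead of once behind a shared out: label. */
static int convert(int fail_second)
{
	int ret;

	ret = do_step(0);
	if (ret) {
		abort_transaction(ret);
		return ret;
	}
	ret = do_step(fail_second);
	if (ret) {
		abort_transaction(ret);	/* this line identifies the step */
		return ret;
	}
	return 0;
}

int main(void)
{
	return convert(1) == -5 ? 0 : 1;
}
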
*/ if (flags & BTRFS_FREE_SPACE_USING_BITMAPS) - ret = load_free_space_bitmaps(caching_ctl, path, extent_count); + return load_free_space_bitmaps(caching_ctl, path, extent_count); else - ret = load_free_space_extents(caching_ctl, path, extent_count); - -out: - btrfs_free_path(path); - return ret; + return load_free_space_extents(caching_ctl, path, extent_count); } diff --git a/fs/btrfs/fs.c b/fs/btrfs/fs.c index 09cfb43580cb..b2bb86f8d7cf 100644 --- a/fs/btrfs/fs.c +++ b/fs/btrfs/fs.c @@ -1,7 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 #include "messages.h" -#include "ctree.h" #include "fs.h" #include "accessors.h" #include "volumes.h" diff --git a/fs/btrfs/fs.h b/fs/btrfs/fs.h index b572d6b9730b..4394de12a767 100644 --- a/fs/btrfs/fs.h +++ b/fs/btrfs/fs.h @@ -47,6 +47,18 @@ struct btrfs_subpage_info; struct btrfs_stripe_hash_table; struct btrfs_space_info; +/* + * Minimum data and metadata block size. + * + * Normally it's 4K, but for testing subpage block size on 4K page systems, we + * allow DEBUG builds to accept 2K page size. + */ +#ifdef CONFIG_BTRFS_DEBUG +#define BTRFS_MIN_BLOCKSIZE (SZ_2K) +#else +#define BTRFS_MIN_BLOCKSIZE (SZ_4K) +#endif + #define BTRFS_MAX_EXTENT_SIZE SZ_128M #define BTRFS_OLDEST_GENERATION 0ULL @@ -105,6 +117,9 @@ enum { /* Indicates there was an error cleaning up a log tree. */ BTRFS_FS_STATE_LOG_CLEANUP_ERROR, + /* No more delayed iput can be queued. */ + BTRFS_FS_STATE_NO_DELAYED_IPUT, + BTRFS_FS_STATE_COUNT }; @@ -285,6 +300,7 @@ enum { #define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL #define BTRFS_DEFAULT_COMMIT_INTERVAL (30) +#define BTRFS_WARNING_COMMIT_INTERVAL (300) #define BTRFS_DEFAULT_MAX_INLINE (2048) struct btrfs_dev_replace { @@ -456,6 +472,8 @@ struct btrfs_fs_info { struct btrfs_block_rsv delayed_block_rsv; /* Block reservation for delayed refs */ struct btrfs_block_rsv delayed_refs_rsv; + /* Block reservation for treelog tree */ + struct btrfs_block_rsv treelog_rsv; struct btrfs_block_rsv empty_block_rsv; @@ -485,8 +503,8 @@ struct btrfs_fs_info { u64 last_trans_log_full_commit; unsigned long long mount_opt; - unsigned long compress_type:4; - unsigned int compress_level; + int compress_type; + int compress_level; u32 commit_interval; /* * It is a suggestive number, the read side is safe even it gets a @@ -709,7 +727,6 @@ struct btrfs_fs_info { * running. 
*/ refcount_t scrub_workers_refcnt; - u32 sectors_per_page; struct workqueue_struct *scrub_workers; struct btrfs_discard_ctl discard_ctl; @@ -762,10 +779,8 @@ struct btrfs_fs_info { struct btrfs_delayed_root *delayed_root; - /* Extent buffer radix tree */ - spinlock_t buffer_lock; /* Entries are eb->start / sectorsize */ - struct radix_tree_root buffer_radix; + struct xarray buffer_tree; /* Next backup root to be overwritten */ int backup_root_index; @@ -981,6 +996,12 @@ static inline u32 count_max_extents(const struct btrfs_fs_info *fs_info, u64 siz return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size); } +static inline unsigned int btrfs_blocks_per_folio(const struct btrfs_fs_info *fs_info, + const struct folio *folio) +{ + return folio_size(folio) >> fs_info->sectorsize_bits; +} + bool btrfs_exclop_start(struct btrfs_fs_info *fs_info, enum btrfs_exclusive_operation type); bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info, diff --git a/fs/btrfs/inode-item.c b/fs/btrfs/inode-item.c index 448aa1a682d6..a61c3540d67b 100644 --- a/fs/btrfs/inode-item.c +++ b/fs/btrfs/inode-item.c @@ -109,7 +109,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, u64 inode_objectid, u64 ref_objectid, u64 *index) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct btrfs_inode_extref *extref; struct extent_buffer *leaf; @@ -129,9 +129,9 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) - ret = -ENOENT; + return -ENOENT; if (ret < 0) - goto out; + return ret; /* * Sanity check - did we find the right item for this name? @@ -142,8 +142,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, ref_objectid, name); if (!extref) { btrfs_abort_transaction(trans, -ENOENT); - ret = -ENOENT; - goto out; + return -ENOENT; } leaf = path->nodes[0]; @@ -152,12 +151,8 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, *index = btrfs_inode_extref_index(leaf, extref); if (del_len == item_size) { - /* - * Common case only one ref in the item, remove the - * whole item. - */ - ret = btrfs_del_item(trans, root, path); - goto out; + /* Common case only one ref in the item, remove the whole item. 
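
btrfs_del_inode_extref() above is one of many functions converted to BTRFS_PATH_AUTO_FREE(), which relies on the compiler's cleanup attribute to free the path on every exit, removing the goto out; btrfs_free_path() boilerplate. A userspace sketch of the same pattern (GCC/Clang extension; all names here are hypothetical):

#include <stdio.h>
#include <stdlib.h>

struct toy_path { int unused; };

static void free_toy_path(struct toy_path **p)
{
	free(*p);	/* free(NULL) is a no-op, like btrfs_free_path() */
}

/* Cleanup runs whenever 'name' goes out of scope, on every return path. */
#define TOY_PATH_AUTO_FREE(name) \
	struct toy_path *name __attribute__((cleanup(free_toy_path))) = NULL

static int search(int fail)
{
	TOY_PATH_AUTO_FREE(path);

	path = malloc(sizeof(*path));
	if (!path)
		return -12;	/* -ENOMEM */
	if (fail)
		return -2;	/* early return still frees the path */
	return 0;
}

int main(void)
{
	return search(1) == -2 ? 0 : 1;
}
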
*/ + return btrfs_del_item(trans, root, path); } ptr = (unsigned long)extref; @@ -168,9 +163,6 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans, btrfs_truncate_item(trans, path, item_size - del_len, 1); -out: - btrfs_free_path(path); - return ret; } @@ -191,8 +183,8 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans, int del_len = name->len + sizeof(*ref); key.objectid = inode_objectid; - key.offset = ref_objectid; key.type = BTRFS_INODE_REF_KEY; + key.offset = ref_objectid; path = btrfs_alloc_path(); if (!path) @@ -260,7 +252,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, int ret; int ins_len = name->len + sizeof(*extref); unsigned long ptr; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_key key; struct extent_buffer *leaf; @@ -279,13 +271,13 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, path->slots[0], ref_objectid, name)) - goto out; + return ret; btrfs_extend_item(trans, path, ins_len); ret = 0; } if (ret < 0) - goto out; + return ret; leaf = path->nodes[0]; ptr = (unsigned long)btrfs_item_ptr(leaf, path->slots[0], char); @@ -298,9 +290,8 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans, ptr = (unsigned long)&extref->name; write_extent_buffer(path->nodes[0], name->name, ptr, name->len); -out: - btrfs_free_path(path); - return ret; + + return 0; } /* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */ @@ -317,8 +308,8 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans, int ins_len = name->len + sizeof(*ref); key.objectid = inode_objectid; - key.offset = ref_objectid; key.type = BTRFS_INODE_REF_KEY; + key.offset = ref_objectid; path = btrfs_alloc_path(); if (!path) @@ -493,8 +484,8 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans, path->reada = READA_BACK; key.objectid = control->ino; - key.offset = (u64)-1; key.type = (u8)-1; + key.offset = (u64)-1; search_again: /* diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c index a9322601ab5c..c0c778243bf1 100644 --- a/fs/btrfs/inode.c +++ b/fs/btrfs/inode.c @@ -489,8 +489,8 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans, size_t datasize; key.objectid = btrfs_ino(inode); - key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; datasize = btrfs_file_extent_calc_inline_size(cur_size); ret = btrfs_insert_empty_item(trans, root, path, &key, @@ -566,23 +566,14 @@ static bool can_cow_file_range_inline(struct btrfs_inode *inode, if (offset != 0) return false; - /* - * Due to the page size limit, for subpage we can only trigger the - * writeback for the dirty sectors of page, that means data writeback - * is doing more writeback than what we want. - * - * This is especially unexpected for some call sites like fallocate, - * where we only increase i_size after everything is done. - * This means we can trigger inline extent even if we didn't want to. - * So here we skip inline extent creation completely. - */ - if (fs_info->sectorsize != PAGE_SIZE) - return false; - /* Inline extents are limited to sectorsize. */ if (size > fs_info->sectorsize) return false; + /* We do not allow a non-compressed extent to be as large as block size. */ + if (data_len >= fs_info->sectorsize) + return false; + /* We cannot exceed the maximum inline data size. */ if (data_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) return false; @@ -672,7 +663,7 @@ out: * And at reserve time, it's always aligned to page size, so * just free one page here. 
*/ - btrfs_qgroup_free_data(inode, NULL, 0, PAGE_SIZE, NULL); + btrfs_qgroup_free_data(inode, NULL, 0, fs_info->sectorsize, NULL); btrfs_free_path(path); btrfs_end_transaction(trans); return ret; @@ -695,12 +686,12 @@ static noinline int cow_file_range_inline(struct btrfs_inode *inode, if (!can_cow_file_range_inline(inode, offset, size, compressed_size)) return 1; - lock_extent(&inode->io_tree, offset, end, &cached); + btrfs_lock_extent(&inode->io_tree, offset, end, &cached); ret = __cow_file_range_inline(inode, size, compressed_size, compress_type, compressed_folio, update_i_size); if (ret > 0) { - unlock_extent(&inode->io_tree, offset, end, &cached); + btrfs_unlock_extent(&inode->io_tree, offset, end, &cached); return ret; } @@ -786,26 +777,9 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start, struct btrfs_fs_info *fs_info = inode->root->fs_info; if (!btrfs_inode_can_compress(inode)) { - WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), - KERN_ERR "BTRFS: unexpected compression for ino %llu\n", - btrfs_ino(inode)); + DEBUG_WARN("BTRFS: unexpected compression for ino %llu", btrfs_ino(inode)); return 0; } - /* - * Only enable sector perfect compression for experimental builds. - * - * This is a big feature change for subpage cases, and can hit - * different corner cases, so only limit this feature for - * experimental build for now. - * - * ETA for moving this out of experimental builds is 6.15. - */ - if (fs_info->sectorsize < PAGE_SIZE && - !IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) { - if (!PAGE_ALIGNED(start) || - !PAGE_ALIGNED(end + 1)) - return 0; - } /* force compress */ if (btrfs_test_opt(fs_info, FORCE_COMPRESS)) @@ -832,7 +806,7 @@ static inline void inode_should_defrag(struct btrfs_inode *inode, btrfs_add_inode_defrag(inode, small_write); } -static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 end) +static int extent_range_clear_dirty_for_io(struct btrfs_inode *inode, u64 start, u64 end) { unsigned long end_index = end >> PAGE_SHIFT; struct folio *folio; @@ -840,13 +814,13 @@ static int extent_range_clear_dirty_for_io(struct inode *inode, u64 start, u64 e for (unsigned long index = start >> PAGE_SHIFT; index <= end_index; index++) { - folio = filemap_get_folio(inode->i_mapping, index); + folio = filemap_get_folio(inode->vfs_inode.i_mapping, index); if (IS_ERR(folio)) { if (!ret) ret = PTR_ERR(folio); continue; } - btrfs_folio_clamp_clear_dirty(inode_to_fs_info(inode), folio, start, + btrfs_folio_clamp_clear_dirty(inode->root->fs_info, folio, start, end + 1 - start); folio_put(folio); } @@ -886,6 +860,7 @@ static void compress_file_range(struct btrfs_work *work) unsigned int poff; int i; int compress_type = fs_info->compress_type; + int compress_level = fs_info->compress_level; inode_should_defrag(inode, start, end, end - start + 1, SZ_16K); @@ -894,7 +869,7 @@ static void compress_file_range(struct btrfs_work *work) * Otherwise applications with the file mmap'd can wander in and change * the page contents while we are compressing them. */ - ret = extent_range_clear_dirty_for_io(&inode->vfs_inode, start, end); + ret = extent_range_clear_dirty_for_io(inode, start, end); /* * All the folios should have been locked thus no failure. 
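An aside on the hunk above: the rewritten can_cow_file_range_inline() drops the old subpage bail-out (sectorsize != PAGE_SIZE) in favour of explicit size checks. A minimal userspace sketch of that eligibility logic, with MAX_INLINE_DATA_SIZE as a hypothetical stand-in for BTRFS_MAX_INLINE_DATA_SIZE(fs_info); illustrative only, not the kernel implementation:

    #include <stdbool.h>
    #include <stdint.h>
    #include <stdio.h>

    /* Hypothetical stand-in for BTRFS_MAX_INLINE_DATA_SIZE(fs_info). */
    #define MAX_INLINE_DATA_SIZE 2048u

    /*
     * Mirrors the checks above: inline data must start at file offset 0,
     * the uncompressed size is limited to one fs block, and the on-disk
     * length may not be as large as the block size.
     */
    static bool can_inline(uint64_t offset, uint64_t size, uint32_t data_len,
                           uint32_t sectorsize)
    {
            if (offset != 0)
                    return false;
            if (size > sectorsize)
                    return false;
            if (data_len >= sectorsize)
                    return false;
            if (data_len > MAX_INLINE_DATA_SIZE)
                    return false;
            return true;
    }

    int main(void)
    {
            /* A block-sized extent is rejected, a smaller one accepted. */
            printf("%d\n", can_inline(0, 4096, 4096, 4096)); /* 0 */
            printf("%d\n", can_inline(0, 4000, 1000, 4096)); /* 1 */
            return 0;
    }

Since data_len is the on-disk length, a compressed extent that still occupies a whole block is rejected by the same check.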
@@ -968,13 +943,15 @@ again: goto cleanup_and_bail_uncompressed; } - if (inode->defrag_compress) + if (inode->defrag_compress) { compress_type = inode->defrag_compress; - else if (inode->prop_compress) + compress_level = inode->defrag_compress_level; + } else if (inode->prop_compress) { compress_type = inode->prop_compress; + } /* Compression level is applied here. */ - ret = btrfs_compress_folios(compress_type | (fs_info->compress_level << 4), + ret = btrfs_compress_folios(compress_type, compress_level, mapping, start, folios, &nr_folios, &total_in, &total_compressed); if (ret) @@ -1090,7 +1067,6 @@ static void submit_uncompressed_range(struct btrfs_inode *inode, &wbc, false); wbc_detach_inode(&wbc); if (ret < 0) { - btrfs_cleanup_ordered_extents(inode, start, end - start + 1); if (locked_folio) btrfs_folio_end_lock(inode->root->fs_info, locked_folio, start, async_extent->ram_size); @@ -1116,6 +1092,7 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, struct extent_state *cached = NULL; struct extent_map *em; int ret = 0; + bool free_pages = false; u64 start = async_extent->start; u64 end = async_extent->start + async_extent->ram_size - 1; @@ -1136,7 +1113,10 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, } if (async_extent->compress_type == BTRFS_COMPRESS_NONE) { + ASSERT(!async_extent->folios); + ASSERT(async_extent->nr_folios == 0); submit_uncompressed_range(inode, async_extent, locked_folio); + free_pages = true; goto done; } @@ -1152,10 +1132,11 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, * fall back to uncompressed. */ submit_uncompressed_range(inode, async_extent, locked_folio); + free_pages = true; goto done; } - lock_extent(io_tree, start, end, &cached); + btrfs_lock_extent(io_tree, start, end, &cached); /* Here we're doing allocation and writeback of the compressed pages */ file_extent.disk_bytenr = ins.objectid; @@ -1170,10 +1151,10 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, ret = PTR_ERR(em); goto out_free_reserve; } - free_extent_map(em); + btrfs_free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, - 1 << BTRFS_ORDERED_COMPRESSED); + 1U << BTRFS_ORDERED_COMPRESSED); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); @@ -1193,12 +1174,14 @@ static void submit_one_async_extent(struct async_chunk *async_chunk, done: if (async_chunk->blkcg_css) kthread_associate_blkcg(NULL); + if (free_pages) + free_async_extent_pages(async_extent); kfree(async_extent); return; out_free_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); mapping_set_error(inode->vfs_inode.i_mapping, -EIO); extent_clear_unlock_delalloc(inode, start, end, NULL, &cached, @@ -1225,7 +1208,7 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, u64 alloc_hint = 0; read_lock(&em_tree->lock); - em = search_extent_mapping(em_tree, start, num_bytes); + em = btrfs_search_extent_mapping(em_tree, start, num_bytes); if (em) { /* * if block start isn't an actual block number then find the @@ -1233,15 +1216,15 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * block is also bogus then just don't worry about it. 
*/ if (em->disk_bytenr >= EXTENT_MAP_LAST_BYTE) { - free_extent_map(em); - em = search_extent_mapping(em_tree, 0, 0); + btrfs_free_extent_map(em); + em = btrfs_search_extent_mapping(em_tree, 0, 0); if (em && em->disk_bytenr < EXTENT_MAP_LAST_BYTE) - alloc_hint = extent_map_block_start(em); + alloc_hint = btrfs_extent_map_block_start(em); if (em) - free_extent_map(em); + btrfs_free_extent_map(em); } else { - alloc_hint = extent_map_block_start(em); - free_extent_map(em); + alloc_hint = btrfs_extent_map_block_start(em); + btrfs_free_extent_map(em); } } read_unlock(&em_tree->lock); @@ -1272,10 +1255,7 @@ u64 btrfs_get_extent_allocation_hint(struct btrfs_inode *inode, u64 start, * - Else all pages except for @locked_folio are unlocked. * * When a failure happens in the second or later iteration of the - * while-loop, the ordered extents created in previous iterations are kept - * intact. So, the caller must clean them up by calling - * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for - * example. + * while-loop, the ordered extents created in previous iterations are cleaned up. */ static noinline int cow_file_range(struct btrfs_inode *inode, struct folio *locked_folio, u64 start, @@ -1382,8 +1362,13 @@ static noinline int cow_file_range(struct btrfs_inode *inode, continue; } if (done_offset) { - *done_offset = start - 1; - return 0; + /* + * Move @end to the end of the processed range, + * and exit the loop to unlock the processed extents. + */ + end = start - 1; + ret = 0; + break; } ret = -ENOSPC; } @@ -1402,24 +1387,24 @@ static noinline int cow_file_range(struct btrfs_inode *inode, * Locked range will be released either during error clean up or * after the whole range is finished. */ - lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, - &cached); + btrfs_lock_extent(&inode->io_tree, start, start + cur_alloc_size - 1, + &cached); em = btrfs_create_io_em(inode, start, &file_extent, BTRFS_ORDERED_REGULAR); if (IS_ERR(em)) { - unlock_extent(&inode->io_tree, start, - start + cur_alloc_size - 1, &cached); + btrfs_unlock_extent(&inode->io_tree, start, + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(em); goto out_reserve; } - free_extent_map(em); + btrfs_free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, - 1 << BTRFS_ORDERED_REGULAR); + 1U << BTRFS_ORDERED_REGULAR); if (IS_ERR(ordered)) { - unlock_extent(&inode->io_tree, start, - start + cur_alloc_size - 1, &cached); + btrfs_unlock_extent(&inode->io_tree, start, + start + cur_alloc_size - 1, &cached); ret = PTR_ERR(ordered); goto out_drop_extent_cache; } @@ -1474,7 +1459,7 @@ out_drop_extent_cache: btrfs_drop_extent_map_range(inode, start, start + cur_alloc_size - 1, false); out_reserve: btrfs_dec_block_group_reservations(fs_info, ins.objectid); - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); out_unlock: /* * Now, we have three regions to clean up: @@ -1487,11 +1472,9 @@ out_unlock: /* * For the range (1). We have already instantiated the ordered extents - * for this region. They are cleaned up by - * btrfs_cleanup_ordered_extents() in e.g, - * btrfs_run_delalloc_range(). + * for this region, thus we need to cleanup those ordered extents. * EXTENT_DELALLOC_NEW | EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV - * are also handled by the cleanup function. + * are also handled by the ordered extents cleanup. 
* * So here we only clear EXTENT_LOCKED and EXTENT_DELALLOC flag, and * finish the writeback of the involved folios, which will be never submitted. @@ -1502,6 +1485,8 @@ out_unlock: if (!locked_folio) mapping_set_error(inode->vfs_inode.i_mapping, ret); + + btrfs_cleanup_ordered_extents(inode, orig_start, start - orig_start); extent_clear_unlock_delalloc(inode, orig_start, start - 1, locked_folio, NULL, clear_bits, page_ops); } @@ -1583,8 +1568,8 @@ static noinline void submit_compressed_extents(struct btrfs_work *work, bool do_ PAGE_SHIFT; while (!list_empty(&async_chunk->extents)) { - async_extent = list_entry(async_chunk->extents.next, - struct async_extent, list); + async_extent = list_first_entry(&async_chunk->extents, + struct async_extent, list); list_del(&async_extent->list); submit_one_async_extent(async_chunk, async_extent, &alloc_hint); } @@ -1754,9 +1739,9 @@ static int fallback_to_cow(struct btrfs_inode *inode, * group that contains that extent to RO mode and therefore force COW * when starting writeback. */ - lock_extent(io_tree, start, end, &cached_state); - count = count_range_bits(io_tree, &range_start, end, range_bytes, - EXTENT_NORESERVE, 0, NULL); + btrfs_lock_extent(io_tree, start, end, &cached_state); + count = btrfs_count_range_bits(io_tree, &range_start, end, range_bytes, + EXTENT_NORESERVE, 0, NULL); if (count > 0 || is_space_ino || is_reloc_ino) { u64 bytes = count; struct btrfs_fs_info *fs_info = inode->root->fs_info; @@ -1770,10 +1755,9 @@ static int fallback_to_cow(struct btrfs_inode *inode, spin_unlock(&sinfo->lock); if (count > 0) - clear_extent_bit(io_tree, start, end, EXTENT_NORESERVE, - NULL); + btrfs_clear_extent_bits(io_tree, start, end, EXTENT_NORESERVE); } - unlock_extent(io_tree, start, end, &cached_state); + btrfs_unlock_extent(io_tree, start, end, &cached_state); /* * Don't try to create inline extents, as a mix of inline extent that @@ -1971,6 +1955,65 @@ static void cleanup_dirty_folios(struct btrfs_inode *inode, mapping_set_error(mapping, error); } +static int nocow_one_range(struct btrfs_inode *inode, struct folio *locked_folio, + struct extent_state **cached, + struct can_nocow_file_extent_args *nocow_args, + u64 file_pos, bool is_prealloc) +{ + struct btrfs_ordered_extent *ordered; + u64 len = nocow_args->file_extent.num_bytes; + u64 end = file_pos + len - 1; + int ret = 0; + + btrfs_lock_extent(&inode->io_tree, file_pos, end, cached); + + if (is_prealloc) { + struct extent_map *em; + + em = btrfs_create_io_em(inode, file_pos, &nocow_args->file_extent, + BTRFS_ORDERED_PREALLOC); + if (IS_ERR(em)) { + btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached); + return PTR_ERR(em); + } + btrfs_free_extent_map(em); + } + + ordered = btrfs_alloc_ordered_extent(inode, file_pos, &nocow_args->file_extent, + is_prealloc + ? (1U << BTRFS_ORDERED_PREALLOC) + : (1U << BTRFS_ORDERED_NOCOW)); + if (IS_ERR(ordered)) { + if (is_prealloc) + btrfs_drop_extent_map_range(inode, file_pos, end, false); + btrfs_unlock_extent(&inode->io_tree, file_pos, end, cached); + return PTR_ERR(ordered); + } + + if (btrfs_is_data_reloc_root(inode->root)) + /* + * Errors are handled later, as we must prevent + * extent_clear_unlock_delalloc() in error handler from freeing + * metadata of the created ordered extent. 
+ */ + ret = btrfs_reloc_clone_csums(ordered); + btrfs_put_ordered_extent(ordered); + + extent_clear_unlock_delalloc(inode, file_pos, end, locked_folio, cached, + EXTENT_LOCKED | EXTENT_DELALLOC | + EXTENT_CLEAR_DATA_RESV, + PAGE_UNLOCK | PAGE_SET_ORDERED); + /* + * On error, we need to cleanup the ordered extents we created. + * + * We do not clear the folio Dirty flags because they are set and + * cleared by the caller. + */ + if (ret < 0) + btrfs_cleanup_ordered_extents(inode, file_pos, end); + return ret; +} + /* * when nowcow writeback call back. This checks for snapshots or COW copies * of the extents that exist in the file, and COWs the file as required. @@ -2015,15 +2058,12 @@ static noinline int run_delalloc_nocow(struct btrfs_inode *inode, while (cur_offset <= end) { struct btrfs_block_group *nocow_bg = NULL; - struct btrfs_ordered_extent *ordered; struct btrfs_key found_key; struct btrfs_file_extent_item *fi; struct extent_buffer *leaf; struct extent_state *cached_state = NULL; u64 extent_end; - u64 nocow_end; int extent_type; - bool is_prealloc; ret = btrfs_lookup_file_extent(NULL, root, path, ino, cur_offset, 0); @@ -2078,12 +2118,13 @@ next_slot: /* * If the found extent starts after requested offset, then - * adjust extent_end to be right before this extent begins + * adjust cur_offset to be right before this extent begins. */ if (found_key.offset > cur_offset) { - extent_end = found_key.offset; - extent_type = 0; - goto must_cow; + if (cow_start == (u64)-1) + cow_start = cur_offset; + cur_offset = found_key.offset; + goto next_slot; } /* @@ -2149,75 +2190,21 @@ must_cow: if (cow_start != (u64)-1) { ret = fallback_to_cow(inode, locked_folio, cow_start, found_key.offset - 1); - cow_start = (u64)-1; if (ret) { cow_end = found_key.offset - 1; btrfs_dec_nocow_writers(nocow_bg); goto error; } + cow_start = (u64)-1; } - nocow_end = cur_offset + nocow_args.file_extent.num_bytes - 1; - lock_extent(&inode->io_tree, cur_offset, nocow_end, &cached_state); - - is_prealloc = extent_type == BTRFS_FILE_EXTENT_PREALLOC; - if (is_prealloc) { - struct extent_map *em; - - em = btrfs_create_io_em(inode, cur_offset, - &nocow_args.file_extent, - BTRFS_ORDERED_PREALLOC); - if (IS_ERR(em)) { - unlock_extent(&inode->io_tree, cur_offset, - nocow_end, &cached_state); - btrfs_dec_nocow_writers(nocow_bg); - ret = PTR_ERR(em); - goto error; - } - free_extent_map(em); - } - - ordered = btrfs_alloc_ordered_extent(inode, cur_offset, - &nocow_args.file_extent, - is_prealloc - ? (1 << BTRFS_ORDERED_PREALLOC) - : (1 << BTRFS_ORDERED_NOCOW)); + ret = nocow_one_range(inode, locked_folio, &cached_state, + &nocow_args, cur_offset, + extent_type == BTRFS_FILE_EXTENT_PREALLOC); btrfs_dec_nocow_writers(nocow_bg); - if (IS_ERR(ordered)) { - if (is_prealloc) { - btrfs_drop_extent_map_range(inode, cur_offset, - nocow_end, false); - } - unlock_extent(&inode->io_tree, cur_offset, - nocow_end, &cached_state); - ret = PTR_ERR(ordered); + if (ret < 0) goto error; - } - - if (btrfs_is_data_reloc_root(root)) - /* - * Error handled later, as we must prevent - * extent_clear_unlock_delalloc() in error handler - * from freeing metadata of created ordered extent. 
- */ - ret = btrfs_reloc_clone_csums(ordered); - btrfs_put_ordered_extent(ordered); - - extent_clear_unlock_delalloc(inode, cur_offset, nocow_end, - locked_folio, &cached_state, - EXTENT_LOCKED | EXTENT_DELALLOC | - EXTENT_CLEAR_DATA_RESV, - PAGE_UNLOCK | PAGE_SET_ORDERED); - cur_offset = extent_end; - - /* - * btrfs_reloc_clone_csums() error, now we're OK to call error - * handler, as metadata for created ordered extent will only - * be freed by btrfs_finish_ordered_io(). - */ - if (ret) - goto error; } btrfs_release_path(path); @@ -2226,11 +2213,11 @@ must_cow: if (cow_start != (u64)-1) { ret = fallback_to_cow(inode, locked_folio, cow_start, end); - cow_start = (u64)-1; if (ret) { cow_end = end; goto error; } + cow_start = (u64)-1; } btrfs_free_path(path); @@ -2244,27 +2231,44 @@ error: * start cur_offset end * |/////////////| | * + * In this case, cow_start should be (u64)-1. + * * For range [start, cur_offset) the folios are already unlocked (except * @locked_folio), EXTENT_DELALLOC already removed. - * Only need to clear the dirty flag as they will never be submitted. - * Ordered extent and extent maps are handled by - * btrfs_mark_ordered_io_finished() inside run_delalloc_range(). + * Need to clear the dirty flags and finish the ordered extents. + * + * 2) Failed with error before calling fallback_to_cow() + * + * start cow_start end + * |/////////////| | + * + * In this case, only @cow_start is set, @cur_offset is between + * [cow_start, end) + * + * It's mostly the same as case 1), just replace @cur_offset with + * @cow_start. * - * 2) Failed with error from fallback_to_cow() - * start cur_offset cow_end end + * 3) Failed with error from fallback_to_cow() + * + * start cow_start cow_end end * |/////////////|-----------| | * - * For range [start, cur_offset) it's the same as case 1). - * But for range [cur_offset, cow_end), the folios have dirty flag - * cleared and unlocked, EXTENT_DEALLLOC cleared by cow_file_range(). + * In this case, both @cow_start and @cow_end are set. * - * Thus we should not call extent_clear_unlock_delalloc() on range - * [cur_offset, cow_end), as the folios are already unlocked. + * For range [start, cow_start) it's the same as case 1). + * But for range [cow_start, cow_end), all the cleanup is handled by + * cow_file_range(), we should not touch anything in that range. * - * So clear the folio dirty flags for [start, cur_offset) first. + * So for all above cases, if @cow_start is set, cleanup ordered extents + * for range [start, @cow_start), otherwise cleanup range [start, @cur_offset). 
*/ - if (cur_offset > start) + if (cow_start != (u64)-1) + cur_offset = cow_start; + + if (cur_offset > start) { + btrfs_cleanup_ordered_extents(inode, start, cur_offset - start); cleanup_dirty_folios(inode, locked_folio, start, cur_offset - 1, ret); + } /* * If an error happened while a COW region is outstanding, cur_offset @@ -2281,7 +2285,7 @@ error: if (cur_offset < end) { struct extent_state *cached = NULL; - lock_extent(&inode->io_tree, cur_offset, end, &cached); + btrfs_lock_extent(&inode->io_tree, cur_offset, end, &cached); extent_clear_unlock_delalloc(inode, cur_offset, end, locked_folio, &cached, EXTENT_LOCKED | EXTENT_DELALLOC | @@ -2303,7 +2307,7 @@ static bool should_nocow(struct btrfs_inode *inode, u64 start, u64 end) { if (inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)) { if (inode->defrag_bytes && - test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG)) + btrfs_test_range_bit_exists(&inode->io_tree, start, end, EXTENT_DEFRAG)) return false; return true; } @@ -2329,7 +2333,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol if (should_nocow(inode, start, end)) { ret = run_delalloc_nocow(inode, locked_folio, start, end); - goto out; + return ret; } if (btrfs_inode_can_compress(inode) && @@ -2343,10 +2347,6 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct folio *locked_fol else ret = cow_file_range(inode, locked_folio, start, end, NULL, false, false); - -out: - if (ret < 0) - btrfs_cleanup_ordered_extents(inode, start, end - start + 1); return ret; } @@ -2596,7 +2596,7 @@ void btrfs_clear_delalloc_extent(struct btrfs_inode *inode, !btrfs_is_free_space_inode(inode) && !(state->state & EXTENT_NORESERVE) && (bits & EXTENT_CLEAR_DATA_RESV)) - btrfs_free_reserved_data_space_noquota(fs_info, len); + btrfs_free_reserved_data_space_noquota(inode, len); percpu_counter_add_batch(&fs_info->delalloc_bytes, -len, fs_info->delalloc_batch); @@ -2680,12 +2680,12 @@ static int btrfs_find_new_delalloc_bytes(struct btrfs_inode *inode, if (em_len > search_len) em_len = search_len; - ret = set_extent_bit(&inode->io_tree, search_start, - search_start + em_len - 1, - EXTENT_DELALLOC_NEW, cached_state); + ret = btrfs_set_extent_bit(&inode->io_tree, search_start, + search_start + em_len - 1, + EXTENT_DELALLOC_NEW, cached_state); next: - search_start = extent_map_end(em); - free_extent_map(em); + search_start = btrfs_extent_map_end(em); + btrfs_free_extent_map(em); if (ret) return ret; } @@ -2715,8 +2715,8 @@ int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end, return ret; } - return set_extent_bit(&inode->io_tree, start, end, - EXTENT_DELALLOC | extra_bits, cached_state); + return btrfs_set_extent_bit(&inode->io_tree, start, end, + EXTENT_DELALLOC | extra_bits, cached_state); } /* see btrfs_writepage_start_hook for details on why this is required */ @@ -2791,7 +2791,7 @@ again: if (ret) goto out_page; - lock_extent(&inode->io_tree, page_start, page_end, &cached_state); + btrfs_lock_extent(&inode->io_tree, page_start, page_end, &cached_state); /* already ordered? 
We're done */ if (folio_test_ordered(folio)) @@ -2799,8 +2799,8 @@ again: ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE); if (ordered) { - unlock_extent(&inode->io_tree, page_start, page_end, - &cached_state); + btrfs_unlock_extent(&inode->io_tree, page_start, page_end, + &cached_state); folio_unlock(folio); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); @@ -2826,7 +2826,7 @@ out_reserved: if (free_delalloc_space) btrfs_delalloc_release_space(inode, data_reserved, page_start, PAGE_SIZE, true); - unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, page_start, page_end, &cached_state); out_page: if (ret) { /* @@ -2873,6 +2873,21 @@ int btrfs_writepage_cow_fixup(struct folio *folio) return 0; /* + * For experimental build, we error out instead of EAGAIN. + * + * We should not hit such out-of-band dirty folios anymore. + */ + if (IS_ENABLED(CONFIG_BTRFS_EXPERIMENTAL)) { + DEBUG_WARN(); + btrfs_err_rl(fs_info, + "root %lld ino %llu folio %llu is marked dirty without notifying the fs", + BTRFS_I(inode)->root->root_key.objectid, + btrfs_ino(BTRFS_I(inode)), + folio_pos(folio)); + return -EUCLEAN; + } + + /* * folio_checked is set below when we create a fixup worker for this * folio, don't try to create another one if we're already * folio_test_checked. @@ -2891,7 +2906,7 @@ int btrfs_writepage_cow_fixup(struct folio *folio) * We are already holding a reference to this inode from * write_cache_pages. We need to hold it because the space reservation * takes place outside of the folio lock, and we can't trust - * page->mapping outside of the folio lock. + * folio->mapping outside of the folio lock. */ ihold(inode); btrfs_folio_set_checked(fs_info, folio, folio_pos(folio), folio_size(folio)); @@ -2912,7 +2927,7 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, { struct btrfs_root *root = inode->root; const u64 sectorsize = root->fs_info->sectorsize; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key ins; u64 disk_num_bytes = btrfs_stack_file_extent_disk_num_bytes(stack_fi); @@ -2947,8 +2962,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, if (!drop_args.extent_inserted) { ins.objectid = btrfs_ino(inode); - ins.offset = file_pos; ins.type = BTRFS_EXTENT_DATA_KEY; + ins.offset = file_pos; ret = btrfs_insert_empty_item(trans, root, path, &ins, sizeof(*stack_fi)); @@ -2983,8 +2998,8 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, btrfs_update_inode_bytes(inode, num_bytes, drop_args.bytes_found); ins.objectid = disk_bytenr; - ins.offset = disk_num_bytes; ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = disk_num_bytes; ret = btrfs_inode_set_file_extent_range(inode, file_pos, ram_bytes); if (ret) @@ -2994,8 +3009,6 @@ static int insert_reserved_file_extent(struct btrfs_trans_handle *trans, file_pos - offset, qgroup_reserved, &ins); out: - btrfs_free_path(path); - return ret; } @@ -3111,8 +3124,10 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) * depending on their current state). 
*/ if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags)) { - clear_bits |= EXTENT_LOCKED; - lock_extent(io_tree, start, end, &cached_state); + clear_bits |= EXTENT_LOCKED | EXTENT_FINISHING_ORDERED; + btrfs_lock_extent_bits(io_tree, start, end, + EXTENT_LOCKED | EXTENT_FINISHING_ORDERED, + &cached_state); } if (freespace_inode) @@ -3176,8 +3191,8 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } - ret = unpin_extent_cache(inode, ordered_extent->file_offset, - ordered_extent->num_bytes, trans->transid); + ret = btrfs_unpin_extent_cache(inode, ordered_extent->file_offset, + ordered_extent->num_bytes, trans->transid); if (ret < 0) { btrfs_abort_transaction(trans, ret); goto out; @@ -3196,9 +3211,9 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) */ if ((clear_bits & EXTENT_DELALLOC_NEW) && !test_bit(BTRFS_ORDERED_TRUNCATED, &ordered_extent->flags)) - clear_extent_bit(&inode->io_tree, start, end, - EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, - &cached_state); + btrfs_clear_extent_bit(&inode->io_tree, start, end, + EXTENT_DELALLOC_NEW | EXTENT_ADD_INODE_BYTES, + &cached_state); btrfs_inode_safe_disk_i_size_write(inode, 0); ret = btrfs_update_inode_fallback(trans, inode); @@ -3207,15 +3222,13 @@ int btrfs_finish_one_ordered(struct btrfs_ordered_extent *ordered_extent) goto out; } out: - clear_extent_bit(&inode->io_tree, start, end, clear_bits, - &cached_state); + btrfs_clear_extent_bit(&inode->io_tree, start, end, clear_bits, + &cached_state); if (trans) btrfs_end_transaction(trans); if (ret || truncated) { - u64 unwritten_start = start; - /* * If we failed to finish this ordered extent for any reason we * need to make sure BTRFS_ORDERED_IOERR is set on the ordered @@ -3227,10 +3240,6 @@ out: if (ret) btrfs_mark_ordered_extent_error(ordered_extent); - if (truncated) - unwritten_start += logical_len; - clear_extent_uptodate(io_tree, unwritten_start, end, NULL); - /* * Drop extent maps for the part of the extent we didn't write. * @@ -3245,9 +3254,15 @@ out: * we don't mess with the extent map tree in the NOCOW case, but * for now simply skip this if we are the free space inode. */ - if (!btrfs_is_free_space_inode(inode)) + if (!btrfs_is_free_space_inode(inode)) { + u64 unwritten_start = start; + + if (truncated) + unwritten_start += logical_len; + btrfs_drop_extent_map_range(inode, unwritten_start, end, false); + } /* * If the ordered extent had an IOERR or something else went @@ -3274,7 +3289,7 @@ out: NULL); btrfs_free_reserved_extent(fs_info, ordered_extent->disk_bytenr, - ordered_extent->disk_num_bytes, 1); + ordered_extent->disk_num_bytes, true); /* * Actually free the qgroup rsv which was released when * the ordered extent was created. @@ -3311,20 +3326,16 @@ int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered) /* * Verify the checksum for a single sector without any extra action that depend * on the type of I/O. + * + * @kaddr must be a properly kmapped address. 
*/ -int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page, - u32 pgoff, u8 *csum, const u8 * const csum_expected) +int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, void *kaddr, u8 *csum, + const u8 * const csum_expected) { SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); - char *kaddr; - - ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE); shash->tfm = fs_info->csum_shash; - - kaddr = kmap_local_page(page) + pgoff; crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum); - kunmap_local(kaddr); if (memcmp(csum, csum_expected, fs_info->csum_size)) return -EIO; @@ -3353,6 +3364,7 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, u64 end = file_offset + bv->bv_len - 1; u8 *csum_expected; u8 csum[BTRFS_CSUM_SIZE]; + void *kaddr; ASSERT(bv->bv_len == fs_info->sectorsize); @@ -3360,19 +3372,22 @@ bool btrfs_data_csum_ok(struct btrfs_bio *bbio, struct btrfs_device *dev, return true; if (btrfs_is_data_reloc_root(inode->root) && - test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, - NULL)) { + btrfs_test_range_bit(&inode->io_tree, file_offset, end, EXTENT_NODATASUM, + NULL)) { /* Skip the range without csum for data reloc inode */ - clear_extent_bits(&inode->io_tree, file_offset, end, - EXTENT_NODATASUM); + btrfs_clear_extent_bits(&inode->io_tree, file_offset, end, + EXTENT_NODATASUM); return true; } csum_expected = bbio->csum + (bio_offset >> fs_info->sectorsize_bits) * fs_info->csum_size; - if (btrfs_check_sector_csum(fs_info, bv->bv_page, bv->bv_offset, csum, - csum_expected)) + kaddr = bvec_kmap_local(bv); + if (btrfs_check_sector_csum(fs_info, kaddr, csum, csum_expected)) { + kunmap_local(kaddr); goto zeroit; + } + kunmap_local(kaddr); return true; zeroit: @@ -3402,6 +3417,7 @@ void btrfs_add_delayed_iput(struct btrfs_inode *inode) if (atomic_add_unless(&inode->vfs_inode.i_count, -1, 1)) return; + WARN_ON_ONCE(test_bit(BTRFS_FS_STATE_NO_DELAYED_IPUT, &fs_info->fs_state)); atomic_inc(&fs_info->nr_delayed_iputs); /* * Need to be irq safe here because we can be called from either an irq @@ -3518,11 +3534,10 @@ static int btrfs_orphan_del(struct btrfs_trans_handle *trans, int btrfs_orphan_cleanup(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key key, found_key; struct btrfs_trans_handle *trans; - struct inode *inode; u64 last_objectid = 0; int ret = 0, nr_unlink = 0; @@ -3541,6 +3556,8 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) key.offset = (u64)-1; while (1) { + struct btrfs_inode *inode; + ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) goto out; @@ -3664,10 +3681,10 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) * deleted but wasn't. The inode number may have been reused, * but either way, we can delete the orphan item. 
*/ - if (!inode || inode->i_nlink) { + if (!inode || inode->vfs_inode.i_nlink) { if (inode) { - ret = btrfs_drop_verity_items(BTRFS_I(inode)); - iput(inode); + ret = btrfs_drop_verity_items(inode); + iput(&inode->vfs_inode); inode = NULL; if (ret) goto out; @@ -3690,7 +3707,7 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) nr_unlink++; /* this will do delete_inode and everything for us */ - iput(inode); + iput(&inode->vfs_inode); } /* release the path since we're done with it */ btrfs_release_path(path); @@ -3707,19 +3724,22 @@ int btrfs_orphan_cleanup(struct btrfs_root *root) out: if (ret) btrfs_err(fs_info, "could not do orphan cleanup %d", ret); - btrfs_free_path(path); return ret; } /* - * very simple check to peek ahead in the leaf looking for xattrs. If we - * don't find any xattrs, we know there can't be any acls. + * Look ahead in the leaf for xattrs. If we don't find any then we know there + * can't be any ACLs. + * + * @leaf: the eb leaf where to search + * @slot: the slot the inode is in + * @objectid: the objectid of the inode * - * slot is the slot the inode is in, objectid is the objectid of the inode + * Return true if there is xattr/ACL, false otherwise. */ -static noinline int acls_after_inode_item(struct extent_buffer *leaf, - int slot, u64 objectid, - int *first_xattr_slot) +static noinline bool acls_after_inode_item(struct extent_buffer *leaf, + int slot, u64 objectid, + int *first_xattr_slot) { u32 nritems = btrfs_header_nritems(leaf); struct btrfs_key found_key; @@ -3739,45 +3759,50 @@ static noinline int acls_after_inode_item(struct extent_buffer *leaf, while (slot < nritems) { btrfs_item_key_to_cpu(leaf, &found_key, slot); - /* we found a different objectid, there must not be acls */ + /* We found a different objectid, there must be no ACLs. */ if (found_key.objectid != objectid) - return 0; + return false; - /* we found an xattr, assume we've got an acl */ + /* We found an xattr, assume we've got an ACL. */ if (found_key.type == BTRFS_XATTR_ITEM_KEY) { if (*first_xattr_slot == -1) *first_xattr_slot = slot; if (found_key.offset == xattr_access || found_key.offset == xattr_default) - return 1; + return true; } /* - * we found a key greater than an xattr key, there can't - * be any acls later on + * We found a key greater than an xattr key, there can't be any + * ACLs later on. */ if (found_key.type > BTRFS_XATTR_ITEM_KEY) - return 0; + return false; slot++; scanned++; /* - * it goes inode, inode backrefs, xattrs, extents, - * so if there are a ton of hard links to an inode there can - * be a lot of backrefs. Don't waste time searching too hard, - * this is just an optimization + * The item order goes like: + * - inode + * - inode backrefs + * - xattrs + * - extents, + * + * so if there are lots of hard links to an inode there can be + * a lot of backrefs. Don't waste time searching too hard, + * this is just an optimization. */ if (scanned >= 8) break; } - /* we hit the end of the leaf before we found an xattr or - * something larger than an xattr. We have to assume the inode - * has acls + /* + * We hit the end of the leaf before we found an xattr or something + * larger than an xattr. We have to assume the inode has ACLs. 
*/ if (*first_xattr_slot == -1) *first_xattr_slot = slot; - return 1; + return true; } static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) @@ -3797,7 +3822,8 @@ static int btrfs_init_file_extent_tree(struct btrfs_inode *inode) if (!inode->file_extent_tree) return -ENOMEM; - extent_io_tree_init(fs_info, inode->file_extent_tree, IO_TREE_INODE_FILE_EXTENT); + btrfs_extent_io_tree_init(fs_info, inode->file_extent_tree, + IO_TREE_INODE_FILE_EXTENT); /* Lockdep class is set only for the file extent tree. */ lockdep_set_class(&inode->file_extent_tree->lock, &file_extent_tree_class); @@ -3840,12 +3866,13 @@ static int btrfs_add_inode_to_root(struct btrfs_inode *inode, bool prealloc) * * On failure clean up the inode. */ -static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) +static int btrfs_read_locked_inode(struct btrfs_inode *inode, struct btrfs_path *path) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct extent_buffer *leaf; struct btrfs_inode_item *inode_item; - struct btrfs_root *root = BTRFS_I(inode)->root; + struct inode *vfs_inode = &inode->vfs_inode; struct btrfs_key location; unsigned long ptr; int maybe_acls; @@ -3854,7 +3881,7 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) bool filled = false; int first_xattr_slot; - ret = btrfs_init_file_extent_tree(BTRFS_I(inode)); + ret = btrfs_init_file_extent_tree(inode); if (ret) goto out; @@ -3864,7 +3891,7 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) ASSERT(path); - btrfs_get_inode_key(BTRFS_I(inode), &location); + btrfs_get_inode_key(inode, &location); ret = btrfs_lookup_inode(NULL, root, path, &location, 0); if (ret) { @@ -3884,41 +3911,41 @@ static int btrfs_read_locked_inode(struct inode *inode, struct btrfs_path *path) inode_item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_inode_item); - inode->i_mode = btrfs_inode_mode(leaf, inode_item); - set_nlink(inode, btrfs_inode_nlink(leaf, inode_item)); - i_uid_write(inode, btrfs_inode_uid(leaf, inode_item)); - i_gid_write(inode, btrfs_inode_gid(leaf, inode_item)); - btrfs_i_size_write(BTRFS_I(inode), btrfs_inode_size(leaf, inode_item)); - btrfs_inode_set_file_extent_range(BTRFS_I(inode), 0, - round_up(i_size_read(inode), fs_info->sectorsize)); - - inode_set_atime(inode, btrfs_timespec_sec(leaf, &inode_item->atime), + vfs_inode->i_mode = btrfs_inode_mode(leaf, inode_item); + set_nlink(vfs_inode, btrfs_inode_nlink(leaf, inode_item)); + i_uid_write(vfs_inode, btrfs_inode_uid(leaf, inode_item)); + i_gid_write(vfs_inode, btrfs_inode_gid(leaf, inode_item)); + btrfs_i_size_write(inode, btrfs_inode_size(leaf, inode_item)); + btrfs_inode_set_file_extent_range(inode, 0, + round_up(i_size_read(vfs_inode), fs_info->sectorsize)); + + inode_set_atime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->atime), btrfs_timespec_nsec(leaf, &inode_item->atime)); - inode_set_mtime(inode, btrfs_timespec_sec(leaf, &inode_item->mtime), + inode_set_mtime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->mtime), btrfs_timespec_nsec(leaf, &inode_item->mtime)); - inode_set_ctime(inode, btrfs_timespec_sec(leaf, &inode_item->ctime), + inode_set_ctime(vfs_inode, btrfs_timespec_sec(leaf, &inode_item->ctime), btrfs_timespec_nsec(leaf, &inode_item->ctime)); - BTRFS_I(inode)->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); - BTRFS_I(inode)->i_otime_nsec = btrfs_timespec_nsec(leaf, 
&inode_item->otime); + inode->i_otime_sec = btrfs_timespec_sec(leaf, &inode_item->otime); + inode->i_otime_nsec = btrfs_timespec_nsec(leaf, &inode_item->otime); - inode_set_bytes(inode, btrfs_inode_nbytes(leaf, inode_item)); - BTRFS_I(inode)->generation = btrfs_inode_generation(leaf, inode_item); - BTRFS_I(inode)->last_trans = btrfs_inode_transid(leaf, inode_item); + inode_set_bytes(vfs_inode, btrfs_inode_nbytes(leaf, inode_item)); + inode->generation = btrfs_inode_generation(leaf, inode_item); + inode->last_trans = btrfs_inode_transid(leaf, inode_item); - inode_set_iversion_queried(inode, - btrfs_inode_sequence(leaf, inode_item)); - inode->i_generation = BTRFS_I(inode)->generation; - inode->i_rdev = 0; + inode_set_iversion_queried(vfs_inode, btrfs_inode_sequence(leaf, inode_item)); + vfs_inode->i_generation = inode->generation; + vfs_inode->i_rdev = 0; rdev = btrfs_inode_rdev(leaf, inode_item); - if (S_ISDIR(inode->i_mode)) - BTRFS_I(inode)->index_cnt = (u64)-1; + if (S_ISDIR(vfs_inode->i_mode)) + inode->index_cnt = (u64)-1; btrfs_inode_split_flags(btrfs_inode_flags(leaf, inode_item), - &BTRFS_I(inode)->flags, &BTRFS_I(inode)->ro_flags); + &inode->flags, &inode->ro_flags); + btrfs_update_inode_mapping_flags(inode); cache_index: /* @@ -3930,9 +3957,8 @@ cache_index: * This is required for both inode re-read from disk and delayed inode * in the delayed_nodes xarray. */ - if (BTRFS_I(inode)->last_trans == btrfs_get_fs_generation(fs_info)) - set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, - &BTRFS_I(inode)->runtime_flags); + if (inode->last_trans == btrfs_get_fs_generation(fs_info)) + set_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags); /* * We don't persist the id of the transaction where an unlink operation @@ -3961,7 +3987,7 @@ cache_index: * transaction commits on fsync if our inode is a directory, or if our * inode is not a directory, logging its parent unnecessarily. */ - BTRFS_I(inode)->last_unlink_trans = BTRFS_I(inode)->last_trans; + inode->last_unlink_trans = inode->last_trans; /* * Same logic as for last_unlink_trans. We don't persist the generation @@ -3969,15 +3995,15 @@ cache_index: * operation, so after eviction and reloading the inode we must be * pessimistic and assume the last transaction that modified the inode. 
*/ - BTRFS_I(inode)->last_reflink_trans = BTRFS_I(inode)->last_trans; + inode->last_reflink_trans = inode->last_trans; path->slots[0]++; - if (inode->i_nlink != 1 || + if (vfs_inode->i_nlink != 1 || path->slots[0] >= btrfs_header_nritems(leaf)) goto cache_acl; btrfs_item_key_to_cpu(leaf, &location, path->slots[0]); - if (location.objectid != btrfs_ino(BTRFS_I(inode))) + if (location.objectid != btrfs_ino(inode)) goto cache_acl; ptr = btrfs_item_ptr_offset(leaf, path->slots[0]); @@ -3985,13 +4011,12 @@ cache_index: struct btrfs_inode_ref *ref; ref = (struct btrfs_inode_ref *)ptr; - BTRFS_I(inode)->dir_index = btrfs_inode_ref_index(leaf, ref); + inode->dir_index = btrfs_inode_ref_index(leaf, ref); } else if (location.type == BTRFS_INODE_EXTREF_KEY) { struct btrfs_inode_extref *extref; extref = (struct btrfs_inode_extref *)ptr; - BTRFS_I(inode)->dir_index = btrfs_inode_extref_index(leaf, - extref); + inode->dir_index = btrfs_inode_extref_index(leaf, extref); } cache_acl: /* @@ -3999,50 +4024,49 @@ cache_acl: * any xattrs or acls */ maybe_acls = acls_after_inode_item(leaf, path->slots[0], - btrfs_ino(BTRFS_I(inode)), &first_xattr_slot); + btrfs_ino(inode), &first_xattr_slot); if (first_xattr_slot != -1) { path->slots[0] = first_xattr_slot; ret = btrfs_load_inode_props(inode, path); if (ret) btrfs_err(fs_info, "error loading props for ino %llu (root %llu): %d", - btrfs_ino(BTRFS_I(inode)), - btrfs_root_id(root), ret); + btrfs_ino(inode), btrfs_root_id(root), ret); } if (!maybe_acls) - cache_no_acl(inode); + cache_no_acl(vfs_inode); - switch (inode->i_mode & S_IFMT) { + switch (vfs_inode->i_mode & S_IFMT) { case S_IFREG: - inode->i_mapping->a_ops = &btrfs_aops; - inode->i_fop = &btrfs_file_operations; - inode->i_op = &btrfs_file_inode_operations; + vfs_inode->i_mapping->a_ops = &btrfs_aops; + vfs_inode->i_fop = &btrfs_file_operations; + vfs_inode->i_op = &btrfs_file_inode_operations; break; case S_IFDIR: - inode->i_fop = &btrfs_dir_file_operations; - inode->i_op = &btrfs_dir_inode_operations; + vfs_inode->i_fop = &btrfs_dir_file_operations; + vfs_inode->i_op = &btrfs_dir_inode_operations; break; case S_IFLNK: - inode->i_op = &btrfs_symlink_inode_operations; - inode_nohighmem(inode); - inode->i_mapping->a_ops = &btrfs_aops; + vfs_inode->i_op = &btrfs_symlink_inode_operations; + inode_nohighmem(vfs_inode); + vfs_inode->i_mapping->a_ops = &btrfs_aops; break; default: - inode->i_op = &btrfs_special_inode_operations; - init_special_inode(inode, inode->i_mode, rdev); + vfs_inode->i_op = &btrfs_special_inode_operations; + init_special_inode(vfs_inode, vfs_inode->i_mode, rdev); break; } btrfs_sync_inode_flags_to_i_flags(inode); - ret = btrfs_add_inode_to_root(BTRFS_I(inode), true); + ret = btrfs_add_inode_to_root(inode, true); if (ret) goto out; return 0; out: - iget_failed(inode); + iget_failed(vfs_inode); return ret; } @@ -4102,7 +4126,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, struct btrfs_inode *inode) { struct btrfs_inode_item *inode_item; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_key key; int ret; @@ -4116,7 +4140,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, if (ret) { if (ret > 0) ret = -ENOENT; - goto failed; + return ret; } leaf = path->nodes[0]; @@ -4125,10 +4149,7 @@ static noinline int btrfs_update_inode_item(struct btrfs_trans_handle *trans, fill_inode_item(trans, leaf, inode_item, &inode->vfs_inode); btrfs_set_inode_last_trans(trans, inode); - ret = 0; 
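A pattern worth calling out here: many conversions in this section (btrfs_del_inode_extref(), btrfs_insert_inode_extref(), btrfs_update_inode_item() in the next hunk, may_destroy_subvol(), ...) swap a manually freed struct btrfs_path for BTRFS_PATH_AUTO_FREE(path), letting early returns replace the goto-out labels. A minimal sketch of the same scope-based cleanup idea in plain C using the GCC/Clang cleanup attribute; free_path() and PATH_AUTO_FREE() are illustrative names, not the kernel macros:

    #include <stdlib.h>

    struct path { int dummy; };  /* stand-in for struct btrfs_path */

    /* Called automatically when the annotated variable leaves scope. */
    static void free_path(struct path **p)
    {
            free(*p);
    }

    /* Declare a path pointer that frees itself on every exit path. */
    #define PATH_AUTO_FREE(name) \
            struct path *name __attribute__((cleanup(free_path))) = NULL

    static int do_work(int fail)
    {
            PATH_AUTO_FREE(path);

            path = calloc(1, sizeof(*path));
            if (!path)
                    return -1;      /* would be -ENOMEM in the kernel */

            if (fail)
                    return -2;      /* early return, no 'out:' label needed */

            return 0;               /* path is freed here as well */
    }

    int main(void)
    {
            return do_work(0);
    }

free(NULL) is a no-op, which is why returning before the allocation succeeds is safe; the kernel helper relies on the same property of btrfs_free_path().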
-failed: - btrfs_free_path(path); - return ret; + return 0; } /* @@ -4462,7 +4483,7 @@ out: static noinline int may_destroy_subvol(struct btrfs_root *root) { struct btrfs_fs_info *fs_info = root->fs_info; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_dir_item *di; struct btrfs_key key; struct fscrypt_str name = FSTR_INIT("default", 7); @@ -4484,7 +4505,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) btrfs_err(fs_info, "deleting default subvolume %llu is not allowed", key.objectid); - goto out; + return ret; } btrfs_release_path(path); } @@ -4495,14 +4516,13 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; if (ret == 0) { /* * Key with offset -1 found, there would have to exist a root * with such id, but this is out of valid range. */ - ret = -EUCLEAN; - goto out; + return -EUCLEAN; } ret = 0; @@ -4512,8 +4532,7 @@ static noinline int may_destroy_subvol(struct btrfs_root *root) if (key.objectid == btrfs_root_id(root) && key.type == BTRFS_ROOT_REF_KEY) ret = -ENOTEMPTY; } -out: - btrfs_free_path(path); + return ret; } @@ -4756,20 +4775,80 @@ out_notrans: return ret; } +static bool is_inside_block(u64 bytenr, u64 blockstart, u32 blocksize) +{ + ASSERT(IS_ALIGNED(blockstart, blocksize), "blockstart=%llu blocksize=%u", + blockstart, blocksize); + + if (blockstart <= bytenr && bytenr <= blockstart + blocksize - 1) + return true; + return false; +} + +static int truncate_block_zero_beyond_eof(struct btrfs_inode *inode, u64 start) +{ + const pgoff_t index = (start >> PAGE_SHIFT); + struct address_space *mapping = inode->vfs_inode.i_mapping; + struct folio *folio; + u64 zero_start; + u64 zero_end; + int ret = 0; + +again: + folio = filemap_lock_folio(mapping, index); + /* No folio present. */ + if (IS_ERR(folio)) + return 0; + + if (!folio_test_uptodate(folio)) { + ret = btrfs_read_folio(NULL, folio); + folio_lock(folio); + if (folio->mapping != mapping) { + folio_unlock(folio); + folio_put(folio); + goto again; + } + if (!folio_test_uptodate(folio)) { + ret = -EIO; + goto out_unlock; + } + } + folio_wait_writeback(folio); + + /* + * We do not need to lock extents nor wait for OE, as it's already + * beyond EOF. + */ + + zero_start = max_t(u64, folio_pos(folio), start); + zero_end = folio_pos(folio) + folio_size(folio) - 1; + folio_zero_range(folio, zero_start - folio_pos(folio), + zero_end - zero_start + 1); + +out_unlock: + folio_unlock(folio); + folio_put(folio); + return ret; +} + /* - * Read, zero a chunk and write a block. + * Handle the truncation of a fs block. + * + * @inode - inode that we're zeroing + * @offset - the file offset of the block to truncate + * The value must be inside [@start, @end], and the function will do + * extra checks if the block that covers @offset needs to be zeroed. + * @start - the start file offset of the range we want to zero + * @end - the end (inclusive) file offset of the range we want to zero. * - * @inode - inode that we're zeroing - * @from - the offset to start zeroing - * @len - the length to zero, 0 to zero the entire range respective to the - * offset - * @front - zero up to the offset instead of from the offset on + * If the range is not block aligned, read out the folio that covers @offset, + * and if needed zero blocks that are inside the folio and covered by [@start, @end). + * If @start or @end + 1 lands inside a block, that block will be marked dirty + * for writeback. 
* - * This will find the block for the "from" offset and cow the block and zero the - * part we want to zero. This is used with truncate and hole punching. + * This is utilized by hole punch, zero range, file expansion. */ -int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, - int front) +int btrfs_truncate_block(struct btrfs_inode *inode, u64 offset, u64 start, u64 end) { struct btrfs_fs_info *fs_info = inode->root->fs_info; struct address_space *mapping = inode->vfs_inode.i_mapping; @@ -4779,20 +4858,56 @@ int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len, struct extent_changeset *data_reserved = NULL; bool only_release_metadata = false; u32 blocksize = fs_info->sectorsize; - pgoff_t index = from >> PAGE_SHIFT; - unsigned offset = from & (blocksize - 1); + pgoff_t index = (offset >> PAGE_SHIFT); struct folio *folio; gfp_t mask = btrfs_alloc_write_mask(mapping); size_t write_bytes = blocksize; int ret = 0; + const bool in_head_block = is_inside_block(offset, round_down(start, blocksize), + blocksize); + const bool in_tail_block = is_inside_block(offset, round_down(end, blocksize), + blocksize); + bool need_truncate_head = false; + bool need_truncate_tail = false; + u64 zero_start; + u64 zero_end; u64 block_start; u64 block_end; - if (IS_ALIGNED(offset, blocksize) && - (!len || IS_ALIGNED(len, blocksize))) + /* @offset should be inside the range. */ + ASSERT(start <= offset && offset <= end, "offset=%llu start=%llu end=%llu", + offset, start, end); + + /* The range is aligned at both ends. */ + if (IS_ALIGNED(start, blocksize) && IS_ALIGNED(end + 1, blocksize)) { + /* + * For block size < page size case, we may have polluted blocks + * beyond EOF. So we also need to zero them out. + */ + if (end == (u64)-1 && blocksize < PAGE_SIZE) + ret = truncate_block_zero_beyond_eof(inode, start); + goto out; + } + + /* + * @offset may not be inside the head nor tail block. In that case we + * don't need to do anything. + */ + if (!in_head_block && !in_tail_block) goto out; - block_start = round_down(from, blocksize); + /* + * Skip the truncation if the range in the target block is already aligned. + * The seemingly complex check will also handle the same block case. 
+ */ + if (in_head_block && !IS_ALIGNED(start, blocksize)) + need_truncate_head = true; + if (in_tail_block && !IS_ALIGNED(end + 1, blocksize)) + need_truncate_tail = true; + if (!need_truncate_head && !need_truncate_tail) + goto out; + + block_start = round_down(offset, blocksize); block_end = block_start + blocksize - 1; ret = btrfs_check_data_free_space(inode, &data_reserved, block_start, @@ -4816,10 +4931,13 @@ again: folio = __filemap_get_folio(mapping, index, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, mask); if (IS_ERR(folio)) { - btrfs_delalloc_release_space(inode, data_reserved, block_start, - blocksize, true); + if (only_release_metadata) + btrfs_delalloc_release_metadata(inode, blocksize, true); + else + btrfs_delalloc_release_space(inode, data_reserved, + block_start, blocksize, true); btrfs_delalloc_release_extents(inode, blocksize); - ret = -ENOMEM; + ret = PTR_ERR(folio); goto out; } @@ -4849,11 +4967,11 @@ again: folio_wait_writeback(folio); - lock_extent(io_tree, block_start, block_end, &cached_state); + btrfs_lock_extent(io_tree, block_start, block_end, &cached_state); ordered = btrfs_lookup_ordered_extent(inode, block_start); if (ordered) { - unlock_extent(io_tree, block_start, block_end, &cached_state); + btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); folio_unlock(folio); folio_put(folio); btrfs_start_ordered_extent(ordered); @@ -4861,37 +4979,46 @@ again: goto again; } - clear_extent_bit(&inode->io_tree, block_start, block_end, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - &cached_state); + btrfs_clear_extent_bit(&inode->io_tree, block_start, block_end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, + &cached_state); ret = btrfs_set_extent_delalloc(inode, block_start, block_end, 0, &cached_state); if (ret) { - unlock_extent(io_tree, block_start, block_end, &cached_state); + btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); goto out_unlock; } - if (offset != blocksize) { - if (!len) - len = blocksize - offset; - if (front) - folio_zero_range(folio, block_start - folio_pos(folio), - offset); - else - folio_zero_range(folio, - (block_start - folio_pos(folio)) + offset, - len); + if (end == (u64)-1) { + /* + * We're truncating beyond EOF, the remaining blocks normally are + * already holes thus no need to zero again, but it's possible for + * fs block size < page size cases to have memory mapped writes + * to pollute ranges beyond EOF. + * + * In that case although such polluted blocks beyond EOF will + * not reach disk, it still affects our page caches. 
+ */ + zero_start = max_t(u64, folio_pos(folio), start); + zero_end = min_t(u64, folio_pos(folio) + folio_size(folio) - 1, + end); + } else { + zero_start = max_t(u64, block_start, start); + zero_end = min_t(u64, block_end, end); } + folio_zero_range(folio, zero_start - folio_pos(folio), + zero_end - zero_start + 1); + btrfs_folio_clear_checked(fs_info, folio, block_start, block_end + 1 - block_start); btrfs_folio_set_dirty(fs_info, folio, block_start, block_end + 1 - block_start); - unlock_extent(io_tree, block_start, block_end, &cached_state); + btrfs_unlock_extent(io_tree, block_start, block_end, &cached_state); if (only_release_metadata) - set_extent_bit(&inode->io_tree, block_start, block_end, - EXTENT_NORESERVE, NULL); + btrfs_set_extent_bit(&inode->io_tree, block_start, block_end, + EXTENT_NORESERVE, NULL); out_unlock: if (ret) { @@ -4984,7 +5111,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) * rest of the block before we expand the i_size, otherwise we could * expose stale data. */ - ret = btrfs_truncate_block(inode, oldsize, 0, 0); + ret = btrfs_truncate_block(inode, oldsize, oldsize, -1); if (ret) return ret; @@ -5001,7 +5128,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) em = NULL; break; } - last_byte = min(extent_map_end(em), block_end); + last_byte = min(btrfs_extent_map_end(em), block_end); last_byte = ALIGN(last_byte, fs_info->sectorsize); hole_size = last_byte - cur_offset; @@ -5017,7 +5144,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) if (ret) break; - hole_em = alloc_extent_map(); + hole_em = btrfs_alloc_extent_map(); if (!hole_em) { btrfs_drop_extent_map_range(inode, cur_offset, cur_offset + hole_size - 1, @@ -5034,7 +5161,7 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) hole_em->generation = btrfs_get_fs_generation(fs_info); ret = btrfs_replace_extent_map_range(inode, hole_em, true); - free_extent_map(hole_em); + btrfs_free_extent_map(hole_em); } else { ret = btrfs_inode_set_file_extent_range(inode, cur_offset, hole_size); @@ -5042,14 +5169,14 @@ int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size) break; } next: - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; cur_offset = last_byte; if (cur_offset >= block_end) break; } - free_extent_map(em); - unlock_extent(io_tree, hole_start, block_end - 1, &cached_state); + btrfs_free_extent_map(em); + btrfs_unlock_extent(io_tree, hole_start, block_end - 1, &cached_state); return ret; } @@ -5233,7 +5360,7 @@ static void evict_inode_truncate_pages(struct inode *inode) state_flags = state->state; spin_unlock(&io_tree->lock); - lock_extent(io_tree, start, end, &cached_state); + btrfs_lock_extent(io_tree, start, end, &cached_state); /* * If still has DELALLOC flag, the extent didn't reach disk, @@ -5247,9 +5374,9 @@ static void evict_inode_truncate_pages(struct inode *inode) btrfs_qgroup_free_data(BTRFS_I(inode), NULL, start, end - start + 1, NULL); - clear_extent_bit(io_tree, start, end, - EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, - &cached_state); + btrfs_clear_extent_bit(io_tree, start, end, + EXTENT_CLEAR_ALL_BITS | EXTENT_DO_ACCOUNTING, + &cached_state); cond_resched(); spin_lock(&io_tree->lock); @@ -5435,7 +5562,7 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, struct btrfs_key *location, u8 *type) { struct btrfs_dir_item *di; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *root = 
dir->root; int ret = 0; struct fscrypt_name fname; @@ -5446,7 +5573,7 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, ret = fscrypt_setup_filename(&dir->vfs_inode, &dentry->d_name, 1, &fname); if (ret < 0) - goto out; + return ret; /* * fscrypt_setup_filename() should never return a positive value, but * gcc on sparc/parisc thinks it can, so assert that doesn't happen. @@ -5475,7 +5602,6 @@ static int btrfs_inode_by_name(struct btrfs_inode *dir, struct dentry *dentry, *type = btrfs_dir_ftype(path->nodes[0], di); out: fscrypt_free_filename(&fname); - btrfs_free_path(path); return ret; } @@ -5490,7 +5616,7 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, struct btrfs_key *location, struct btrfs_root **sub_root) { - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct btrfs_root *new_root; struct btrfs_root_ref *ref; struct extent_buffer *leaf; @@ -5546,7 +5672,6 @@ static int fixup_tree_root_location(struct btrfs_fs_info *fs_info, location->offset = 0; err = 0; out: - btrfs_free_path(path); fscrypt_free_filename(&fname); return err; } @@ -5597,7 +5722,7 @@ static int btrfs_find_actor(struct inode *inode, void *opaque) args->root == BTRFS_I(inode)->root; } -static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) +static struct btrfs_inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) { struct inode *inode; struct btrfs_iget_args args; @@ -5609,40 +5734,42 @@ static struct inode *btrfs_iget_locked(u64 ino, struct btrfs_root *root) inode = iget5_locked_rcu(root->fs_info->sb, hashval, btrfs_find_actor, btrfs_init_locked_inode, (void *)&args); - return inode; + if (!inode) + return NULL; + return BTRFS_I(inode); } /* * Get an inode object given its inode number and corresponding root. Path is * preallocated to prevent recursing back to iget through allocator. */ -struct inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, - struct btrfs_path *path) +struct btrfs_inode *btrfs_iget_path(u64 ino, struct btrfs_root *root, + struct btrfs_path *path) { - struct inode *inode; + struct btrfs_inode *inode; int ret; inode = btrfs_iget_locked(ino, root); if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode->vfs_inode.i_state & I_NEW)) return inode; ret = btrfs_read_locked_inode(inode, path); if (ret) return ERR_PTR(ret); - unlock_new_inode(inode); + unlock_new_inode(&inode->vfs_inode); return inode; } /* * Get an inode object given its inode number and corresponding root. 
*/ -struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) +struct btrfs_inode *btrfs_iget(u64 ino, struct btrfs_root *root) { - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_path *path; int ret; @@ -5650,55 +5777,60 @@ struct inode *btrfs_iget(u64 ino, struct btrfs_root *root) if (!inode) return ERR_PTR(-ENOMEM); - if (!(inode->i_state & I_NEW)) + if (!(inode->vfs_inode.i_state & I_NEW)) return inode; path = btrfs_alloc_path(); - if (!path) + if (!path) { + iget_failed(&inode->vfs_inode); return ERR_PTR(-ENOMEM); + } ret = btrfs_read_locked_inode(inode, path); btrfs_free_path(path); if (ret) return ERR_PTR(ret); - unlock_new_inode(inode); + unlock_new_inode(&inode->vfs_inode); return inode; } -static struct inode *new_simple_dir(struct inode *dir, - struct btrfs_key *key, - struct btrfs_root *root) +static struct btrfs_inode *new_simple_dir(struct inode *dir, + struct btrfs_key *key, + struct btrfs_root *root) { struct timespec64 ts; - struct inode *inode = new_inode(dir->i_sb); + struct inode *vfs_inode; + struct btrfs_inode *inode; - if (!inode) + vfs_inode = new_inode(dir->i_sb); + if (!vfs_inode) return ERR_PTR(-ENOMEM); - BTRFS_I(inode)->root = btrfs_grab_root(root); - BTRFS_I(inode)->ref_root_id = key->objectid; - set_bit(BTRFS_INODE_ROOT_STUB, &BTRFS_I(inode)->runtime_flags); - set_bit(BTRFS_INODE_DUMMY, &BTRFS_I(inode)->runtime_flags); + inode = BTRFS_I(vfs_inode); + inode->root = btrfs_grab_root(root); + inode->ref_root_id = key->objectid; + set_bit(BTRFS_INODE_ROOT_STUB, &inode->runtime_flags); + set_bit(BTRFS_INODE_DUMMY, &inode->runtime_flags); - btrfs_set_inode_number(BTRFS_I(inode), BTRFS_EMPTY_SUBVOL_DIR_OBJECTID); + btrfs_set_inode_number(inode, BTRFS_EMPTY_SUBVOL_DIR_OBJECTID); /* * We only need lookup, the rest is read-only and there's no inode * associated with the dentry */ - inode->i_op = &simple_dir_inode_operations; - inode->i_opflags &= ~IOP_XATTR; - inode->i_fop = &simple_dir_operations; - inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; + vfs_inode->i_op = &simple_dir_inode_operations; + vfs_inode->i_opflags &= ~IOP_XATTR; + vfs_inode->i_fop = &simple_dir_operations; + vfs_inode->i_mode = S_IFDIR | S_IRUGO | S_IWUSR | S_IXUGO; - ts = inode_set_ctime_current(inode); - inode_set_mtime_to_ts(inode, ts); - inode_set_atime_to_ts(inode, inode_get_atime(dir)); - BTRFS_I(inode)->i_otime_sec = ts.tv_sec; - BTRFS_I(inode)->i_otime_nsec = ts.tv_nsec; + ts = inode_set_ctime_current(vfs_inode); + inode_set_mtime_to_ts(vfs_inode, ts); + inode_set_atime_to_ts(vfs_inode, inode_get_atime(dir)); + inode->i_otime_sec = ts.tv_sec; + inode->i_otime_nsec = ts.tv_nsec; - inode->i_uid = dir->i_uid; - inode->i_gid = dir->i_gid; + vfs_inode->i_uid = dir->i_uid; + vfs_inode->i_gid = dir->i_gid; return inode; } @@ -5712,15 +5844,15 @@ static_assert(BTRFS_FT_FIFO == FT_FIFO); static_assert(BTRFS_FT_SOCK == FT_SOCK); static_assert(BTRFS_FT_SYMLINK == FT_SYMLINK); -static inline u8 btrfs_inode_type(struct inode *inode) +static inline u8 btrfs_inode_type(const struct btrfs_inode *inode) { - return fs_umode_to_ftype(inode->i_mode); + return fs_umode_to_ftype(inode->vfs_inode.i_mode); } struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) { struct btrfs_fs_info *fs_info = inode_to_fs_info(dir); - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_root *root = BTRFS_I(dir)->root; struct btrfs_root *sub_root = root; struct btrfs_key location = { 0 }; @@ -5737,18 +5869,18 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct 
dentry *dentry) if (location.type == BTRFS_INODE_ITEM_KEY) { inode = btrfs_iget(location.objectid, root); if (IS_ERR(inode)) - return inode; + return ERR_CAST(inode); /* Do extra check against inode mode with di_type */ if (btrfs_inode_type(inode) != di_type) { btrfs_crit(fs_info, "inode mode mismatch with dir: inode mode=0%o btrfs type=%u dir type=%u", - inode->i_mode, btrfs_inode_type(inode), + inode->vfs_inode.i_mode, btrfs_inode_type(inode), di_type); - iput(inode); + iput(&inode->vfs_inode); return ERR_PTR(-EUCLEAN); } - return inode; + return &inode->vfs_inode; } ret = fixup_tree_root_location(fs_info, BTRFS_I(dir), dentry, @@ -5763,19 +5895,22 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry) btrfs_put_root(sub_root); if (IS_ERR(inode)) - return inode; + return ERR_CAST(inode); down_read(&fs_info->cleanup_work_sem); - if (!sb_rdonly(inode->i_sb)) + if (!sb_rdonly(inode->vfs_inode.i_sb)) ret = btrfs_orphan_cleanup(sub_root); up_read(&fs_info->cleanup_work_sem); if (ret) { - iput(inode); + iput(&inode->vfs_inode); inode = ERR_PTR(ret); } } - return inode; + if (IS_ERR(inode)) + return ERR_CAST(inode); + + return &inode->vfs_inode; } static int btrfs_dentry_delete(const struct dentry *dentry) @@ -5815,7 +5950,7 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) { struct btrfs_root *root = inode->root; struct btrfs_key key, found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; int ret; @@ -5829,15 +5964,14 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); if (ret < 0) - goto out; + return ret; /* FIXME: we should be able to handle this */ if (ret == 0) - goto out; - ret = 0; + return ret; if (path->slots[0] == 0) { inode->index_cnt = BTRFS_DIR_START_INDEX; - goto out; + return 0; } path->slots[0]--; @@ -5848,13 +5982,12 @@ static int btrfs_set_inode_index_count(struct btrfs_inode *inode) if (found_key.objectid != btrfs_ino(inode) || found_key.type != BTRFS_DIR_INDEX_KEY) { inode->index_cnt = BTRFS_DIR_START_INDEX; - goto out; + return 0; } inode->index_cnt = found_key.offset + 1; -out: - btrfs_free_path(path); - return ret; + + return 0; } static int btrfs_get_dir_last_index(struct btrfs_inode *dir, u64 *index) @@ -5957,7 +6090,7 @@ static int btrfs_real_readdir(struct file *file, struct dir_context *ctx) struct btrfs_dir_item *di; struct btrfs_key key; struct btrfs_key found_key; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); void *addr; LIST_HEAD(ins_list); LIST_HEAD(del_list); @@ -6070,7 +6203,6 @@ nopos: err: if (put) btrfs_readdir_put_delayed_items(BTRFS_I(inode), &ins_list, &del_list); - btrfs_free_path(path); return ret; } @@ -6248,7 +6380,7 @@ static void btrfs_inherit_iflags(struct btrfs_inode *inode, struct btrfs_inode * inode->flags |= BTRFS_INODE_NODATASUM; } - btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + btrfs_sync_inode_flags_to_i_flags(inode); } int btrfs_create_new_inode(struct btrfs_trans_handle *trans, @@ -6334,6 +6466,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (btrfs_test_opt(fs_info, NODATACOW)) BTRFS_I(inode)->flags |= BTRFS_INODE_NODATACOW | BTRFS_INODE_NODATASUM; + btrfs_update_inode_mapping_flags(BTRFS_I(inode)); } ret = btrfs_insert_inode_locked(inode); @@ -6427,7 +6560,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, path = NULL; if (args->subvol) { - struct inode *parent; + struct btrfs_inode *parent; /* * Subvolumes inherit properties 
from their parent subvolume, @@ -6437,11 +6570,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans, if (IS_ERR(parent)) { ret = PTR_ERR(parent); } else { - ret = btrfs_inode_inherit_props(trans, inode, parent); - iput(parent); + ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode), + parent); + iput(&parent->vfs_inode); } } else { - ret = btrfs_inode_inherit_props(trans, inode, dir); + ret = btrfs_inode_inherit_props(trans, BTRFS_I(inode), + BTRFS_I(dir)); } if (ret) { btrfs_err(fs_info, @@ -6539,7 +6674,7 @@ int btrfs_add_link(struct btrfs_trans_handle *trans, return ret; ret = btrfs_insert_dir_item(trans, name, parent_inode, &key, - btrfs_inode_type(&inode->vfs_inode), index); + btrfs_inode_type(inode), index); if (ret == -EEXIST || ret == -EOVERFLOW) goto fail_dir_item; else if (ret) { @@ -6739,18 +6874,18 @@ fail: return err; } -static int btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *btrfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; inode = new_inode(dir->i_sb); if (!inode) - return -ENOMEM; + return ERR_PTR(-ENOMEM); inode_init_owner(idmap, inode, dir, S_IFDIR | mode); inode->i_op = &btrfs_dir_inode_operations; inode->i_fop = &btrfs_dir_file_operations; - return btrfs_create_common(dir, dentry, inode); + return ERR_PTR(btrfs_create_common(dir, dentry, inode)); } static noinline int uncompress_inline(struct btrfs_path *path, @@ -6759,6 +6894,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, { int ret; struct extent_buffer *leaf = path->nodes[0]; + const u32 blocksize = leaf->fs_info->sectorsize; char *tmp; size_t max_size; unsigned long inline_size; @@ -6775,7 +6911,7 @@ static noinline int uncompress_inline(struct btrfs_path *path, read_extent_buffer(leaf, tmp, ptr, inline_size); - max_size = min_t(unsigned long, PAGE_SIZE, max_size); + max_size = min_t(unsigned long, blocksize, max_size); ret = btrfs_decompress(compress_type, tmp, folio, 0, inline_size, max_size); @@ -6787,14 +6923,15 @@ static noinline int uncompress_inline(struct btrfs_path *path, * cover that region here. 
*/ - if (max_size < PAGE_SIZE) - folio_zero_range(folio, max_size, PAGE_SIZE - max_size); + if (max_size < blocksize) + folio_zero_range(folio, max_size, blocksize - max_size); kfree(tmp); return ret; } static int read_inline_extent(struct btrfs_path *path, struct folio *folio) { + const u32 blocksize = path->nodes[0]->fs_info->sectorsize; struct btrfs_file_extent_item *fi; void *kaddr; size_t copy_size; @@ -6809,14 +6946,14 @@ static int read_inline_extent(struct btrfs_path *path, struct folio *folio) if (btrfs_file_extent_compression(path->nodes[0], fi) != BTRFS_COMPRESS_NONE) return uncompress_inline(path, folio, fi); - copy_size = min_t(u64, PAGE_SIZE, + copy_size = min_t(u64, blocksize, btrfs_file_extent_ram_bytes(path->nodes[0], fi)); kaddr = kmap_local_folio(folio, 0); read_extent_buffer(path->nodes[0], kaddr, btrfs_file_extent_inline_start(fi), copy_size); kunmap_local(kaddr); - if (copy_size < PAGE_SIZE) - folio_zero_range(folio, copy_size, PAGE_SIZE - copy_size); + if (copy_size < blocksize) + folio_zero_range(folio, copy_size, blocksize - copy_size); return 0; } @@ -6855,18 +6992,18 @@ struct extent_map *btrfs_get_extent(struct btrfs_inode *inode, struct extent_map_tree *em_tree = &inode->extent_tree; read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, start, len); + em = btrfs_lookup_extent_mapping(em_tree, start, len); read_unlock(&em_tree->lock); if (em) { if (em->start > start || em->start + em->len <= start) - free_extent_map(em); + btrfs_free_extent_map(em); else if (em->disk_bytenr == EXTENT_MAP_INLINE && folio) - free_extent_map(em); + btrfs_free_extent_map(em); else goto out; } - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { ret = -ENOMEM; goto out; @@ -7003,7 +7140,7 @@ not_found: insert: ret = 0; btrfs_release_path(path); - if (em->start > start || extent_map_end(em) <= start) { + if (em->start > start || btrfs_extent_map_end(em) <= start) { btrfs_err(fs_info, "bad extent! em: [%llu %llu] passed [%llu %llu]", em->start, em->len, start, len); @@ -7020,7 +7157,7 @@ out: trace_btrfs_get_extent(root, inode, em); if (ret) { - free_extent_map(em); + btrfs_free_extent_map(em); return ERR_PTR(ret); } return em; @@ -7057,17 +7194,17 @@ static bool btrfs_extent_readonly(struct btrfs_fs_info *fs_info, u64 bytenr) * NOTE: This only checks the file extents, caller is responsible to wait for * any ordered extents. 
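A hedged summary of the return convention, read from the hunks below rather than from any authoritative documentation: can_nocow_extent() returns 1 when the range may be written NOCOW (with *len and file_extent filled in), 0 when it must be COWed — lookup misses and errors from can_nocow_file_extent() are deliberately folded into "must COW" — and a negative errno such as -EAGAIN when delalloc is still pending on a PREALLOC extent. A caller would look roughly like this, where cow_fallback is a hypothetical label:

        ret = can_nocow_extent(inode, offset, &len, &file_extent, nowait);
        if (ret < 0)
                return ret;             /* hard failure, e.g. -EAGAIN */
        if (ret == 0)
                goto cow_fallback;      /* do a regular COW write */
        /* ret == 1: NOCOW is safe for *len bytes at offset */
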
*/ -noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, +noinline int can_nocow_extent(struct btrfs_inode *inode, u64 offset, u64 *len, struct btrfs_file_extent *file_extent, bool nowait) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct can_nocow_file_extent_args nocow_args = { 0 }; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); int ret; struct extent_buffer *leaf; - struct btrfs_root *root = BTRFS_I(inode)->root; - struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree; + struct extent_io_tree *io_tree = &inode->io_tree; struct btrfs_file_extent_item *fi; struct btrfs_key key; int found_type; @@ -7077,35 +7214,34 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, return -ENOMEM; path->nowait = nowait; - ret = btrfs_lookup_file_extent(NULL, root, path, - btrfs_ino(BTRFS_I(inode)), offset, 0); + ret = btrfs_lookup_file_extent(NULL, root, path, btrfs_ino(inode), + offset, 0); if (ret < 0) - goto out; + return ret; if (ret == 1) { if (path->slots[0] == 0) { - /* can't find the item, must cow */ - ret = 0; - goto out; + /* Can't find the item, must COW. */ + return 0; } path->slots[0]--; } ret = 0; leaf = path->nodes[0]; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); - if (key.objectid != btrfs_ino(BTRFS_I(inode)) || + if (key.objectid != btrfs_ino(inode) || key.type != BTRFS_EXTENT_DATA_KEY) { - /* not our file or wrong item type, must cow */ - goto out; + /* Not our file or wrong item type, must COW. */ + return 0; } if (key.offset > offset) { - /* Wrong offset, must cow */ - goto out; + /* Wrong offset, must COW. */ + return 0; } if (btrfs_file_extent_end(path) <= offset) - goto out; + return 0; fi = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); found_type = btrfs_file_extent_type(leaf, fi); @@ -7114,43 +7250,38 @@ noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len, nocow_args.end = offset + *len - 1; nocow_args.free_path = true; - ret = can_nocow_file_extent(path, &key, BTRFS_I(inode), &nocow_args); + ret = can_nocow_file_extent(path, &key, inode, &nocow_args); /* can_nocow_file_extent() has freed the path. */ path = NULL; if (ret != 1) { /* Treat errors as not being able to NOCOW. 
*/ - ret = 0; - goto out; + return 0; } - ret = 0; if (btrfs_extent_readonly(fs_info, nocow_args.file_extent.disk_bytenr + nocow_args.file_extent.offset)) - goto out; + return 0; - if (!(BTRFS_I(inode)->flags & BTRFS_INODE_NODATACOW) && + if (!(inode->flags & BTRFS_INODE_NODATACOW) && found_type == BTRFS_FILE_EXTENT_PREALLOC) { u64 range_end; range_end = round_up(offset + nocow_args.file_extent.num_bytes, root->fs_info->sectorsize) - 1; - ret = test_range_bit_exists(io_tree, offset, range_end, EXTENT_DELALLOC); - if (ret) { - ret = -EAGAIN; - goto out; - } + ret = btrfs_test_range_bit_exists(io_tree, offset, range_end, + EXTENT_DELALLOC); + if (ret) + return -EAGAIN; } if (file_extent) memcpy(file_extent, &nocow_args.file_extent, sizeof(*file_extent)); *len = nocow_args.file_extent.num_bytes; - ret = 1; -out: - btrfs_free_path(path); - return ret; + + return 1; } /* The callers of this must take lock_extent() */ @@ -7198,7 +7329,7 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, break; } - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) return ERR_PTR(-ENOMEM); @@ -7211,15 +7342,15 @@ struct extent_map *btrfs_create_io_em(struct btrfs_inode *inode, u64 start, em->offset = file_extent->offset; em->flags |= EXTENT_FLAG_PINNED; if (type == BTRFS_ORDERED_COMPRESSED) - extent_map_set_compression(em, file_extent->compression); + btrfs_extent_map_set_compression(em, file_extent->compression); ret = btrfs_replace_extent_map_range(inode, em, true); if (ret) { - free_extent_map(em); + btrfs_free_extent_map(em); return ERR_PTR(ret); } - /* em got 2 refs now, callers needs to do free_extent_map once. */ + /* em got 2 refs now, callers need to call btrfs_free_extent_map() once. */ return em; } @@ -7235,7 +7366,7 @@ static void wait_subpage_spinlock(struct folio *folio) struct btrfs_fs_info *fs_info = folio_to_fs_info(folio); struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_is_subpage(fs_info, folio)) return; ASSERT(folio_test_private(folio) && folio_get_private(folio)); @@ -7259,7 +7390,7 @@ static void wait_subpage_spinlock(struct folio *folio) static int btrfs_launder_folio(struct folio *folio) { return btrfs_qgroup_free_data(folio_to_inode(folio), NULL, folio_pos(folio), - PAGE_SIZE, NULL); + folio_size(folio), NULL); } static bool __btrfs_release_folio(struct folio *folio, gfp_t gfp_flags) @@ -7346,7 +7477,7 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, } if (!inode_evicting) - lock_extent(tree, page_start, page_end, &cached_state); + btrfs_lock_extent(tree, page_start, page_end, &cached_state); cur = page_start; while (cur < page_end) { @@ -7402,10 +7533,10 @@ static void btrfs_invalidate_folio(struct folio *folio, size_t offset, * btrfs_finish_ordered_io(). */ if (!inode_evicting) - clear_extent_bit(tree, cur, range_end, - EXTENT_DELALLOC | - EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | - EXTENT_DEFRAG, &cached_state); + btrfs_clear_extent_bit(tree, cur, range_end, + EXTENT_DELALLOC | + EXTENT_LOCKED | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG, &cached_state); spin_lock_irq(&inode->ordered_tree_lock); set_bit(BTRFS_ORDERED_TRUNCATED, &ordered->flags); @@ -7447,12 +7578,11 @@ next: * Since the IO will never happen for this page. 
*/ btrfs_qgroup_free_data(inode, NULL, cur, range_end + 1 - cur, NULL); - if (!inode_evicting) { - clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | - EXTENT_DELALLOC | EXTENT_UPTODATE | - EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG | - extra_flags, &cached_state); - } + if (!inode_evicting) + btrfs_clear_extent_bit(tree, cur, range_end, EXTENT_LOCKED | + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | + EXTENT_DEFRAG | extra_flags, + &cached_state); cur = range_end + 1; } /* @@ -7556,7 +7686,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) const u64 lock_start = ALIGN_DOWN(new_size, fs_info->sectorsize); control.new_size = new_size; - lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); + btrfs_lock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); /* * We want to drop from the next block forward in case this new * size is not block aligned since we will be keeping the last @@ -7571,7 +7701,7 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) inode_sub_bytes(&inode->vfs_inode, control.sub_bytes); btrfs_inode_safe_disk_i_size_write(inode, control.last_size); - unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); + btrfs_unlock_extent(&inode->io_tree, lock_start, (u64)-1, &cached_state); trans->block_rsv = &fs_info->trans_block_rsv; if (ret != -ENOSPC && ret != -EAGAIN) @@ -7615,7 +7745,8 @@ static int btrfs_truncate(struct btrfs_inode *inode, bool skip_writeback) btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); - ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, 0, 0); + ret = btrfs_truncate_block(inode, inode->vfs_inode.i_size, + inode->vfs_inode.i_size, (u64)-1); if (ret) goto out; trans = btrfs_start_transaction(root, 1); @@ -7727,10 +7858,10 @@ struct inode *btrfs_alloc_inode(struct super_block *sb) ei->i_otime_nsec = 0; inode = &ei->vfs_inode; - extent_map_tree_init(&ei->extent_tree); + btrfs_extent_map_tree_init(&ei->extent_tree); /* This io tree sets the valid inode. 
*/ - extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); + btrfs_extent_io_tree_init(fs_info, &ei->io_tree, IO_TREE_INODE_IO); ei->io_tree.inode = ei; ei->file_extent_tree = NULL; @@ -8494,8 +8625,6 @@ static int start_delalloc_inodes(struct btrfs_root *root, struct writeback_control *wbc, bool snapshot, bool in_reclaim_context) { - struct btrfs_inode *binode; - struct inode *inode; struct btrfs_delalloc_work *work, *next; LIST_HEAD(works); LIST_HEAD(splice); @@ -8506,30 +8635,30 @@ static int start_delalloc_inodes(struct btrfs_root *root, spin_lock(&root->delalloc_lock); list_splice_init(&root->delalloc_inodes, &splice); while (!list_empty(&splice)) { - binode = list_entry(splice.next, struct btrfs_inode, - delalloc_inodes); + struct btrfs_inode *inode; + struct inode *tmp_inode; + + inode = list_first_entry(&splice, struct btrfs_inode, delalloc_inodes); - list_move_tail(&binode->delalloc_inodes, - &root->delalloc_inodes); + list_move_tail(&inode->delalloc_inodes, &root->delalloc_inodes); if (in_reclaim_context && - test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &binode->runtime_flags)) + test_bit(BTRFS_INODE_NO_DELALLOC_FLUSH, &inode->runtime_flags)) continue; - inode = igrab(&binode->vfs_inode); - if (!inode) { + tmp_inode = igrab(&inode->vfs_inode); + if (!tmp_inode) { cond_resched_lock(&root->delalloc_lock); continue; } spin_unlock(&root->delalloc_lock); if (snapshot) - set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, - &binode->runtime_flags); + set_bit(BTRFS_INODE_SNAPSHOT_FLUSH, &inode->runtime_flags); if (full_flush) { - work = btrfs_alloc_delalloc_work(inode); + work = btrfs_alloc_delalloc_work(&inode->vfs_inode); if (!work) { - iput(inode); + iput(&inode->vfs_inode); ret = -ENOMEM; goto out; } @@ -8537,8 +8666,8 @@ static int start_delalloc_inodes(struct btrfs_root *root, btrfs_queue_work(root->fs_info->flush_workers, &work->work); } else { - ret = filemap_fdatawrite_wbc(inode->i_mapping, wbc); - btrfs_add_delayed_iput(BTRFS_I(inode)); + ret = filemap_fdatawrite_wbc(inode->vfs_inode.i_mapping, wbc); + btrfs_add_delayed_iput(inode); if (ret || wbc->nr_to_write <= 0) goto out; } @@ -8655,7 +8784,12 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, struct extent_buffer *leaf; name_len = strlen(symname); - if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info)) + /* + * Symlinks utilize uncompressed inline extent data, which should not + * reach block size. 
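A worked illustration of why the check below needs both limits, with hedged example numbers: BTRFS_MAX_INLINE_DATA_SIZE() is bounded by the leaf item size, so on a common 16K-nodesize filesystem it is far larger than a 4K sector, and the new name_len >= fs_info->sectorsize test becomes the binding one:

        /* Assuming sectorsize == 4096: a 4095-byte target fits, 4096 does not. */

Only the two-condition check itself comes from the patch; the sizes here are typical mkfs defaults, not values asserted by this series.
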
+ */ + if (name_len > BTRFS_MAX_INLINE_DATA_SIZE(fs_info) || + name_len >= fs_info->sectorsize) return -ENAMETOOLONG; inode = new_inode(dir->i_sb); @@ -8694,8 +8828,8 @@ static int btrfs_symlink(struct mnt_idmap *idmap, struct inode *dir, goto out; } key.objectid = btrfs_ino(BTRFS_I(inode)); - key.offset = 0; key.type = BTRFS_EXTENT_DATA_KEY; + key.offset = 0; datasize = btrfs_file_extent_calc_inline_size(name_len); err = btrfs_insert_empty_item(trans, root, path, &key, datasize); @@ -8868,11 +9002,11 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, if (IS_ERR(trans)) { ret = PTR_ERR(trans); btrfs_free_reserved_extent(fs_info, ins.objectid, - ins.offset, 0); + ins.offset, false); break; } - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { btrfs_drop_extent_map_range(BTRFS_I(inode), cur_offset, cur_offset + ins.offset - 1, false); @@ -8890,7 +9024,7 @@ static int __btrfs_prealloc_file_range(struct inode *inode, int mode, em->generation = trans->transid; ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, true); - free_extent_map(em); + btrfs_free_extent_map(em); next: num_bytes -= ins.offset; cur_offset += ins.offset; @@ -9062,7 +9196,7 @@ static ssize_t btrfs_encoded_read_inline( struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; struct extent_io_tree *io_tree = &inode->io_tree; - struct btrfs_path *path; + BTRFS_PATH_AUTO_FREE(path); struct extent_buffer *leaf; struct btrfs_file_extent_item *item; u64 ram_bytes; @@ -9072,10 +9206,8 @@ static ssize_t btrfs_encoded_read_inline( const bool nowait = (iocb->ki_flags & IOCB_NOWAIT); path = btrfs_alloc_path(); - if (!path) { - ret = -ENOMEM; - goto out; - } + if (!path) + return -ENOMEM; path->nowait = nowait; @@ -9084,9 +9216,9 @@ static ssize_t btrfs_encoded_read_inline( if (ret) { if (ret > 0) { /* The extent item disappeared? 
*/ - ret = -EIO; + return -EIO; } - goto out; + return ret; } leaf = path->nodes[0]; item = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -9099,17 +9231,16 @@ static ssize_t btrfs_encoded_read_inline( ret = btrfs_encoded_io_compression_from_extent(fs_info, btrfs_file_extent_compression(leaf, item)); if (ret < 0) - goto out; + return ret; encoded->compression = ret; if (encoded->compression) { size_t inline_size; inline_size = btrfs_file_extent_inline_item_len(leaf, path->slots[0]); - if (inline_size > count) { - ret = -ENOBUFS; - goto out; - } + if (inline_size > count) + return -ENOBUFS; + count = inline_size; encoded->unencoded_len = ram_bytes; encoded->unencoded_offset = iocb->ki_pos - extent_start; @@ -9121,13 +9252,12 @@ static ssize_t btrfs_encoded_read_inline( } tmp = kmalloc(count, GFP_NOFS); - if (!tmp) { - ret = -ENOMEM; - goto out; - } + if (!tmp) + return -ENOMEM; + read_extent_buffer(leaf, tmp, ptr, count); btrfs_release_path(path); - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; @@ -9135,13 +9265,12 @@ static ssize_t btrfs_encoded_read_inline( if (ret != count) ret = -EFAULT; kfree(tmp); -out: - btrfs_free_path(path); + return ret; } struct btrfs_encoded_read_private { - struct completion done; + struct completion *sync_reads; void *uring_ctx; refcount_t pending_refs; blk_status_t status; @@ -9153,11 +9282,10 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) if (bbio->bio.bi_status) { /* - * The memory barrier implied by the atomic_dec_return() here - * pairs with the memory barrier implied by the - * atomic_dec_return() or io_wait_event() in - * btrfs_encoded_read_regular_fill_pages() to ensure that this - * write is observed before the load of status in + * The memory barrier implied by the refcount_dec_and_test() here + * pairs with the memory barrier implied by the refcount_dec_and_test() + * in btrfs_encoded_read_regular_fill_pages() to ensure that + * this write is observed before the load of status in * btrfs_encoded_read_regular_fill_pages(). */ WRITE_ONCE(priv->status, bbio->bio.bi_status); @@ -9169,7 +9297,7 @@ static void btrfs_encoded_read_endio(struct btrfs_bio *bbio) btrfs_uring_read_extent_endio(priv->uring_ctx, err); kfree(priv); } else { - complete(&priv->done); + complete(priv->sync_reads); } } bio_put(&bbio->bio); @@ -9180,16 +9308,26 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, struct page **pages, void *uring_ctx) { struct btrfs_fs_info *fs_info = inode->root->fs_info; - struct btrfs_encoded_read_private *priv; + struct btrfs_encoded_read_private *priv, sync_priv; + struct completion sync_reads; unsigned long i = 0; struct btrfs_bio *bbio; int ret; - priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS); - if (!priv) - return -ENOMEM; + /* + * Fast path for synchronous reads, which complete within this call; an + * io_uring read needs its private context to live longer. 
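The reason an on-stack context is safe for the synchronous case shows up later in this hunk: the function only returns after every bio has completed, along the lines of

        if (!refcount_dec_and_test(&priv->pending_refs))
                wait_for_completion_io(&sync_reads);
        /* all endio handlers have run; the stack frame may unwind */

whereas the io_uring path returns -EIOCBQUEUED with bios still in flight, so its context is heap-allocated and freed in btrfs_encoded_read_endio() instead (a reading of the code below, not a documented guarantee).
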
+ */ + if (uring_ctx) { + priv = kmalloc(sizeof(struct btrfs_encoded_read_private), GFP_NOFS); + if (!priv) + return -ENOMEM; + } else { + priv = &sync_priv; + init_completion(&sync_reads); + priv->sync_reads = &sync_reads; + } - init_completion(&priv->done); refcount_set(&priv->pending_refs, 1); priv->status = 0; priv->uring_ctx = uring_ctx; @@ -9232,11 +9370,9 @@ int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode, return -EIOCBQUEUED; } else { if (!refcount_dec_and_test(&priv->pending_refs)) - wait_for_completion_io(&priv->done); + wait_for_completion_io(&sync_reads); /* See btrfs_encoded_read_endio() for ordering. */ - ret = blk_status_to_errno(READ_ONCE(priv->status)); - kfree(priv); - return ret; + return blk_status_to_errno(READ_ONCE(priv->status)); } } @@ -9269,7 +9405,7 @@ ssize_t btrfs_encoded_read_regular(struct kiocb *iocb, struct iov_iter *iter, if (ret) goto out; - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); *unlocked = true; @@ -9346,7 +9482,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, goto out_unlock_inode; } - if (!try_lock_extent(io_tree, start, lockend, cached_state)) { + if (!btrfs_try_lock_extent(io_tree, start, lockend, cached_state)) { ret = -EAGAIN; goto out_unlock_inode; } @@ -9355,7 +9491,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, lockend - start + 1); if (ordered) { btrfs_put_ordered_extent(ordered); - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); ret = -EAGAIN; goto out_unlock_inode; } @@ -9368,13 +9504,13 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, if (ret) goto out_unlock_inode; - lock_extent(io_tree, start, lockend, cached_state); + btrfs_lock_extent(io_tree, start, lockend, cached_state); ordered = btrfs_lookup_ordered_range(inode, start, lockend - start + 1); if (!ordered) break; btrfs_put_ordered_extent(ordered); - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); cond_resched(); } } @@ -9392,7 +9528,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, * For inline extents we get everything we need out of the * extent item. */ - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; ret = btrfs_encoded_read_inline(iocb, iter, start, lockend, cached_state, extent_start, @@ -9404,7 +9540,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, * We only want to return up to EOF even if the extent extends beyond * that. 
*/ - encoded->len = min_t(u64, extent_map_end(em), + encoded->len = min_t(u64, btrfs_extent_map_end(em), inode->vfs_inode.i_size) - iocb->ki_pos; if (em->disk_bytenr == EXTENT_MAP_HOLE || (em->flags & EXTENT_FLAG_PREALLOC)) { @@ -9412,7 +9548,7 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, count = min_t(u64, count, encoded->len); encoded->len = count; encoded->unencoded_len = count; - } else if (extent_map_is_compressed(em)) { + } else if (btrfs_extent_map_is_compressed(em)) { *disk_bytenr = em->disk_bytenr; /* * Bail if the buffer isn't large enough to return the whole @@ -9427,12 +9563,12 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, encoded->unencoded_len = em->ram_bytes; encoded->unencoded_offset = iocb->ki_pos - (em->start - em->offset); ret = btrfs_encoded_io_compression_from_extent(fs_info, - extent_map_compression(em)); + btrfs_extent_map_compression(em)); if (ret < 0) goto out_em; encoded->compression = ret; } else { - *disk_bytenr = extent_map_block_start(em) + (start - em->start); + *disk_bytenr = btrfs_extent_map_block_start(em) + (start - em->start); if (encoded->len > count) encoded->len = count; /* @@ -9445,11 +9581,11 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, encoded->unencoded_len = count; *disk_io_size = ALIGN(*disk_io_size, fs_info->sectorsize); } - free_extent_map(em); + btrfs_free_extent_map(em); em = NULL; if (*disk_bytenr == EXTENT_MAP_HOLE) { - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); unlocked = true; ret = iov_iter_zero(count, iter); @@ -9461,11 +9597,11 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter, } out_em: - free_extent_map(em); + btrfs_free_extent_map(em); out_unlock_extent: /* Leave inode and extent locked if we need to do a read. 
*/ if (!unlocked && ret != -EIOCBQUEUED) - unlock_extent(io_tree, start, lockend, cached_state); + btrfs_unlock_extent(io_tree, start, lockend, cached_state); out_unlock_inode: if (!unlocked && ret != -EIOCBQUEUED) btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); @@ -9612,14 +9748,14 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, end >> PAGE_SHIFT); if (ret) goto out_folios; - lock_extent(io_tree, start, end, &cached_state); + btrfs_lock_extent(io_tree, start, end, &cached_state); ordered = btrfs_lookup_ordered_range(inode, start, num_bytes); if (!ordered && !filemap_range_has_page(inode->vfs_inode.i_mapping, start, end)) break; if (ordered) btrfs_put_ordered_extent(ordered); - unlock_extent(io_tree, start, end, &cached_state); + btrfs_unlock_extent(io_tree, start, end, &cached_state); cond_resched(); } @@ -9669,11 +9805,11 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, ret = PTR_ERR(em); goto out_free_reserved; } - free_extent_map(em); + btrfs_free_extent_map(em); ordered = btrfs_alloc_ordered_extent(inode, start, &file_extent, - (1 << BTRFS_ORDERED_ENCODED) | - (1 << BTRFS_ORDERED_COMPRESSED)); + (1U << BTRFS_ORDERED_ENCODED) | + (1U << BTRFS_ORDERED_COMPRESSED)); if (IS_ERR(ordered)) { btrfs_drop_extent_map_range(inode, start, end, false); ret = PTR_ERR(ordered); @@ -9684,7 +9820,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, if (start + encoded->len > inode->vfs_inode.i_size) i_size_write(&inode->vfs_inode, start + encoded->len); - unlock_extent(io_tree, start, end, &cached_state); + btrfs_unlock_extent(io_tree, start, end, &cached_state); btrfs_delalloc_release_extents(inode, num_bytes); @@ -9694,7 +9830,7 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from, out_free_reserved: btrfs_dec_block_group_reservations(fs_info, ins.objectid); - btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1); + btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, true); out_delalloc_release: btrfs_delalloc_release_extents(inode, num_bytes); btrfs_delalloc_release_metadata(inode, disk_num_bytes, ret < 0); @@ -9707,9 +9843,9 @@ out_free_data_space: * bytes_may_use. */ if (!extent_reserved) - btrfs_free_reserved_data_space_noquota(fs_info, disk_num_bytes); + btrfs_free_reserved_data_space_noquota(inode, disk_num_bytes); out_unlock: - unlock_extent(io_tree, start, end, &cached_state); + btrfs_unlock_extent(io_tree, start, end, &cached_state); out_folios: for (i = 0; i < nr_folios; i++) { if (folios[i]) @@ -9974,7 +10110,7 @@ static int btrfs_swap_activate(struct swap_info_struct *sis, struct file *file, isize = ALIGN_DOWN(inode->i_size, fs_info->sectorsize); - lock_extent(io_tree, 0, isize - 1, &cached_state); + btrfs_lock_extent(io_tree, 0, isize - 1, &cached_state); while (prev_extent_end < isize) { struct btrfs_key key; struct extent_buffer *leaf; @@ -10152,7 +10288,7 @@ out: if (!IS_ERR_OR_NULL(map)) btrfs_free_chunk_map(map); - unlock_extent(io_tree, 0, isize - 1, &cached_state); + btrfs_unlock_extent(io_tree, 0, isize - 1, &cached_state); if (ret) btrfs_swap_deactivate(file); diff --git a/fs/btrfs/ioctl.c b/fs/btrfs/ioctl.c index 6c18bad53cd3..913acef3f0a9 100644 --- a/fs/btrfs/ioctl.c +++ b/fs/btrfs/ioctl.c @@ -118,8 +118,8 @@ struct btrfs_ioctl_encoded_io_args_32 { #endif /* Mask out flags that are inappropriate for the given type of inode. 
*/ -static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, - unsigned int flags) +static unsigned int btrfs_mask_fsflags_for_type(const struct inode *inode, + unsigned int flags) { if (S_ISDIR(inode->i_mode)) return flags; @@ -133,11 +133,11 @@ static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode, * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS * ioctl. */ -static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode) +static unsigned int btrfs_inode_flags_to_fsflags(const struct btrfs_inode *inode) { unsigned int iflags = 0; - u32 flags = binode->flags; - u32 ro_flags = binode->ro_flags; + u32 flags = inode->flags; + u32 ro_flags = inode->ro_flags; if (flags & BTRFS_INODE_SYNC) iflags |= FS_SYNC_FL; @@ -167,25 +167,24 @@ static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode) /* * Update inode->i_flags based on the btrfs internal flags. */ -void btrfs_sync_inode_flags_to_i_flags(struct inode *inode) +void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode) { - struct btrfs_inode *binode = BTRFS_I(inode); unsigned int new_fl = 0; - if (binode->flags & BTRFS_INODE_SYNC) + if (inode->flags & BTRFS_INODE_SYNC) new_fl |= S_SYNC; - if (binode->flags & BTRFS_INODE_IMMUTABLE) + if (inode->flags & BTRFS_INODE_IMMUTABLE) new_fl |= S_IMMUTABLE; - if (binode->flags & BTRFS_INODE_APPEND) + if (inode->flags & BTRFS_INODE_APPEND) new_fl |= S_APPEND; - if (binode->flags & BTRFS_INODE_NOATIME) + if (inode->flags & BTRFS_INODE_NOATIME) new_fl |= S_NOATIME; - if (binode->flags & BTRFS_INODE_DIRSYNC) + if (inode->flags & BTRFS_INODE_DIRSYNC) new_fl |= S_DIRSYNC; - if (binode->ro_flags & BTRFS_INODE_RO_VERITY) + if (inode->ro_flags & BTRFS_INODE_RO_VERITY) new_fl |= S_VERITY; - set_mask_bits(&inode->i_flags, + set_mask_bits(&inode->vfs_inode.i_flags, S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC | S_VERITY, new_fl); } @@ -219,7 +218,7 @@ static int check_fsflags(unsigned int old_flags, unsigned int flags) return 0; } -static int check_fsflags_compatible(struct btrfs_fs_info *fs_info, +static int check_fsflags_compatible(const struct btrfs_fs_info *fs_info, unsigned int flags) { if (btrfs_is_zoned(fs_info) && (flags & FS_NOCOW_FL)) @@ -248,24 +247,23 @@ static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_ */ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) { - struct btrfs_inode *binode = BTRFS_I(d_inode(dentry)); + const struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); - fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode)); + fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(inode)); return 0; } int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa) { - struct inode *inode = d_inode(dentry); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_inode *binode = BTRFS_I(inode); - struct btrfs_root *root = binode->root; + struct btrfs_inode *inode = BTRFS_I(d_inode(dentry)); + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; struct btrfs_trans_handle *trans; unsigned int fsflags, old_fsflags; int ret; const char *comp = NULL; - u32 binode_flags; + u32 inode_flags; if (btrfs_root_readonly(root)) return -EROFS; @@ -273,8 +271,8 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, if (fileattr_has_fsx(fa)) return -EOPNOTSUPP; - fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags); - old_fsflags = btrfs_inode_flags_to_fsflags(binode); + fsflags = 
btrfs_mask_fsflags_for_type(&inode->vfs_inode, fa->flags); + old_fsflags = btrfs_inode_flags_to_fsflags(inode); ret = check_fsflags(old_fsflags, fsflags); if (ret) return ret; @@ -283,27 +281,27 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, if (ret) return ret; - binode_flags = binode->flags; + inode_flags = inode->flags; if (fsflags & FS_SYNC_FL) - binode_flags |= BTRFS_INODE_SYNC; + inode_flags |= BTRFS_INODE_SYNC; else - binode_flags &= ~BTRFS_INODE_SYNC; + inode_flags &= ~BTRFS_INODE_SYNC; if (fsflags & FS_IMMUTABLE_FL) - binode_flags |= BTRFS_INODE_IMMUTABLE; + inode_flags |= BTRFS_INODE_IMMUTABLE; else - binode_flags &= ~BTRFS_INODE_IMMUTABLE; + inode_flags &= ~BTRFS_INODE_IMMUTABLE; if (fsflags & FS_APPEND_FL) - binode_flags |= BTRFS_INODE_APPEND; + inode_flags |= BTRFS_INODE_APPEND; else - binode_flags &= ~BTRFS_INODE_APPEND; + inode_flags &= ~BTRFS_INODE_APPEND; if (fsflags & FS_NODUMP_FL) - binode_flags |= BTRFS_INODE_NODUMP; + inode_flags |= BTRFS_INODE_NODUMP; else - binode_flags &= ~BTRFS_INODE_NODUMP; + inode_flags &= ~BTRFS_INODE_NODUMP; if (fsflags & FS_NOATIME_FL) - binode_flags |= BTRFS_INODE_NOATIME; + inode_flags |= BTRFS_INODE_NOATIME; else - binode_flags &= ~BTRFS_INODE_NOATIME; + inode_flags &= ~BTRFS_INODE_NOATIME; /* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */ if (!fa->flags_valid) { @@ -315,32 +313,32 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, } if (fsflags & FS_DIRSYNC_FL) - binode_flags |= BTRFS_INODE_DIRSYNC; + inode_flags |= BTRFS_INODE_DIRSYNC; else - binode_flags &= ~BTRFS_INODE_DIRSYNC; + inode_flags &= ~BTRFS_INODE_DIRSYNC; if (fsflags & FS_NOCOW_FL) { - if (S_ISREG(inode->i_mode)) { + if (S_ISREG(inode->vfs_inode.i_mode)) { /* * It's safe to turn csums off here, no extents exist. * Otherwise we want the flag to reflect the real COW * status of the file and will not set it. */ - if (inode->i_size == 0) - binode_flags |= BTRFS_INODE_NODATACOW | - BTRFS_INODE_NODATASUM; + if (inode->vfs_inode.i_size == 0) + inode_flags |= BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM; } else { - binode_flags |= BTRFS_INODE_NODATACOW; + inode_flags |= BTRFS_INODE_NODATACOW; } } else { /* * Revert back under same assumptions as above */ - if (S_ISREG(inode->i_mode)) { - if (inode->i_size == 0) - binode_flags &= ~(BTRFS_INODE_NODATACOW | - BTRFS_INODE_NODATASUM); + if (S_ISREG(inode->vfs_inode.i_mode)) { + if (inode->vfs_inode.i_size == 0) + inode_flags &= ~(BTRFS_INODE_NODATACOW | + BTRFS_INODE_NODATASUM); } else { - binode_flags &= ~BTRFS_INODE_NODATACOW; + inode_flags &= ~BTRFS_INODE_NODATACOW; } } @@ -350,21 +348,21 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, * things smaller. 
*/ if (fsflags & FS_NOCOMP_FL) { - binode_flags &= ~BTRFS_INODE_COMPRESS; - binode_flags |= BTRFS_INODE_NOCOMPRESS; + inode_flags &= ~BTRFS_INODE_COMPRESS; + inode_flags |= BTRFS_INODE_NOCOMPRESS; } else if (fsflags & FS_COMPR_FL) { - if (IS_SWAPFILE(inode)) + if (IS_SWAPFILE(&inode->vfs_inode)) return -ETXTBSY; - binode_flags |= BTRFS_INODE_COMPRESS; - binode_flags &= ~BTRFS_INODE_NOCOMPRESS; + inode_flags |= BTRFS_INODE_COMPRESS; + inode_flags &= ~BTRFS_INODE_NOCOMPRESS; comp = btrfs_compress_type2str(fs_info->compress_type); if (!comp || comp[0] == 0) comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB); } else { - binode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); + inode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS); } /* @@ -376,15 +374,14 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, return PTR_ERR(trans); if (comp) { - ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression", + ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp, strlen(comp), 0); if (ret) { btrfs_abort_transaction(trans, ret); goto out_end_trans; } } else { - ret = btrfs_set_prop(trans, BTRFS_I(inode), "btrfs.compression", - NULL, 0, 0); + ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, 0, 0); if (ret && ret != -ENODATA) { btrfs_abort_transaction(trans, ret); goto out_end_trans; @@ -392,18 +389,19 @@ int btrfs_fileattr_set(struct mnt_idmap *idmap, } update_flags: - binode->flags = binode_flags; + inode->flags = inode_flags; + btrfs_update_inode_mapping_flags(inode); btrfs_sync_inode_flags_to_i_flags(inode); - inode_inc_iversion(inode); - inode_set_ctime_current(inode); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + inode_inc_iversion(&inode->vfs_inode); + inode_set_ctime_current(&inode->vfs_inode); + ret = btrfs_update_inode(trans, inode); out_end_trans: btrfs_end_transaction(trans); return ret; } -static int btrfs_ioctl_getversion(struct inode *inode, int __user *arg) +static int btrfs_ioctl_getversion(const struct inode *inode, int __user *arg) { return put_user(inode->i_generation, arg); } @@ -475,7 +473,7 @@ static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info, * Calculate the number of transaction items to reserve for creating a subvolume * or snapshot, not including the inode, directory entries, or parent directory. 
*/ -static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit) +static unsigned int create_subvol_num_items(const struct btrfs_qgroup_inherit *inherit) { /* * 1 to add root block @@ -617,8 +615,8 @@ static noinline int create_subvol(struct mnt_idmap *idmap, btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID); key.objectid = objectid; - key.offset = 0; key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = 0; ret = btrfs_insert_root(trans, fs_info->tree_root, &key, root_item); if (ret) { @@ -878,7 +876,7 @@ static int btrfs_may_delete(struct mnt_idmap *idmap, /* copy of may_create in fs/namei.c() */ static inline int btrfs_may_create(struct mnt_idmap *idmap, - struct inode *dir, struct dentry *child) + struct inode *dir, const struct dentry *child) { if (d_really_is_positive(child)) return -EEXIST; @@ -911,7 +909,7 @@ static noinline int btrfs_mksubvol(const struct path *parent, if (error == -EINTR) return error; - dentry = lookup_one(idmap, name, parent->dentry, namelen); + dentry = lookup_one(idmap, &QSTR_LEN(name, namelen), parent->dentry); error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_unlock; @@ -1033,17 +1031,14 @@ static noinline int btrfs_ioctl_resize(struct file *file, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); - struct inode *inode = file_inode(file); - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_root *root = BTRFS_I(file_inode(file))->root; + struct btrfs_fs_info *fs_info = root->fs_info; u64 new_size; u64 old_size; u64 devid = 1; - struct btrfs_root *root = BTRFS_I(inode)->root; struct btrfs_ioctl_vol_args *vol_args; - struct btrfs_trans_handle *trans; struct btrfs_device *device = NULL; char *sizestr; - char *retptr; char *devstr = NULL; int ret = 0; int mod = 0; @@ -1111,6 +1106,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, if (!strcmp(sizestr, "max")) new_size = bdev_nr_bytes(device->bdev); else { + char *retptr; + if (sizestr[0] == '-') { mod = -1; sizestr++; @@ -1158,6 +1155,8 @@ static noinline int btrfs_ioctl_resize(struct file *file, new_size = round_down(new_size, fs_info->sectorsize); if (new_size > old_size) { + struct btrfs_trans_handle *trans; + trans = btrfs_start_transaction(root, 0); if (IS_ERR(trans)) { ret = PTR_ERR(trans); @@ -1336,15 +1335,15 @@ free_args: return ret; } -static noinline int btrfs_ioctl_subvol_getflags(struct inode *inode, +static noinline int btrfs_ioctl_subvol_getflags(struct btrfs_inode *inode, void __user *arg) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; u64 flags = 0; - if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) + if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID) return -EINVAL; down_read(&fs_info->subvol_sem); @@ -1447,8 +1446,8 @@ out: return ret; } -static noinline int key_in_sk(struct btrfs_key *key, - struct btrfs_ioctl_search_key *sk) +static noinline bool key_in_sk(const struct btrfs_key *key, + const struct btrfs_ioctl_search_key *sk) { struct btrfs_key test; int ret; @@ -1459,7 +1458,7 @@ static noinline int key_in_sk(struct btrfs_key *key, ret = btrfs_comp_cpu_keys(key, &test); if (ret < 0) - return 0; + return false; test.objectid = sk->max_objectid; test.type = sk->max_type; @@ -1467,13 +1466,13 @@ static noinline int key_in_sk(struct btrfs_key *key, ret = btrfs_comp_cpu_keys(key, &test); if (ret > 0) - return 0; - return 1; + return false; + return true; } static 
noinline int copy_to_sk(struct btrfs_path *path, struct btrfs_key *key, - struct btrfs_ioctl_search_key *sk, + const struct btrfs_ioctl_search_key *sk, u64 *buf_size, char __user *ubuf, unsigned long *sk_offset, @@ -1530,8 +1529,8 @@ static noinline int copy_to_sk(struct btrfs_path *path, } sh.objectid = key->objectid; - sh.offset = key->offset; sh.type = key->type; + sh.offset = key->offset; sh.len = item_len; sh.transid = found_transid; @@ -1604,13 +1603,12 @@ out: return ret; } -static noinline int search_ioctl(struct inode *inode, +static noinline int search_ioctl(struct btrfs_root *root, struct btrfs_ioctl_search_key *sk, u64 *buf_size, char __user *ubuf) { - struct btrfs_fs_info *info = inode_to_fs_info(inode); - struct btrfs_root *root; + struct btrfs_fs_info *info = root->fs_info; struct btrfs_key key; struct btrfs_path *path; int ret; @@ -1627,9 +1625,10 @@ static noinline int search_ioctl(struct inode *inode, return -ENOMEM; if (sk->tree_id == 0) { - /* search the root of the inode that was passed */ - root = btrfs_grab_root(BTRFS_I(inode)->root); + /* Search the root that we got passed. */ + root = btrfs_grab_root(root); } else { + /* Look up the root from the arguments. */ root = btrfs_get_fs_root(info, sk->tree_id, true); if (IS_ERR(root)) { btrfs_free_path(path); @@ -1642,21 +1641,19 @@ static noinline int search_ioctl(struct inode *inode, key.offset = sk->min_offset; while (1) { - ret = -EFAULT; /* * Ensure that the whole user buffer is faulted in at sub-page * granularity, otherwise the loop may live-lock. */ - if (fault_in_subpage_writeable(ubuf + sk_offset, - *buf_size - sk_offset)) + if (fault_in_subpage_writeable(ubuf + sk_offset, *buf_size - sk_offset)) { + ret = -EFAULT; break; + } ret = btrfs_search_forward(root, &key, path, sk->min_transid); - if (ret != 0) { - if (ret > 0) - ret = 0; - goto err; - } + if (ret) + break; + ret = copy_to_sk(path, &key, sk, buf_size, ubuf, &sk_offset, &num_found); btrfs_release_path(path); @@ -1664,16 +1661,17 @@ static noinline int search_ioctl(struct inode *inode, break; } + /* Normalize return values from btrfs_search_forward() and copy_to_sk(). 
*/ if (ret > 0) ret = 0; -err: + sk->nr_items = num_found; btrfs_put_root(root); btrfs_free_path(path); return ret; } -static noinline int btrfs_ioctl_tree_search(struct inode *inode, +static noinline int btrfs_ioctl_tree_search(struct btrfs_root *root, void __user *argp) { struct btrfs_ioctl_search_args __user *uargs = argp; @@ -1689,7 +1687,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode, buf_size = sizeof(uargs->buf); - ret = search_ioctl(inode, &sk, &buf_size, uargs->buf); + ret = search_ioctl(root, &sk, &buf_size, uargs->buf); /* * In the origin implementation an overflow is handled by returning a @@ -1703,7 +1701,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode, return ret; } -static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, +static noinline int btrfs_ioctl_tree_search_v2(struct btrfs_root *root, void __user *argp) { struct btrfs_ioctl_search_args_v2 __user *uarg = argp; @@ -1725,7 +1723,7 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode, if (buf_size > buf_limit) buf_size = buf_limit; - ret = search_ioctl(inode, &args.key, &buf_size, + ret = search_ioctl(root, &args.key, &buf_size, (char __user *)(&uarg->buf[0])); if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key))) ret = -EFAULT; @@ -1833,7 +1831,6 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, struct btrfs_path *path; struct btrfs_key key, key2; struct extent_buffer *leaf; - struct inode *temp_inode; char *ptr; int slot; int len; @@ -1861,6 +1858,8 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, key.type = BTRFS_INODE_REF_KEY; key.offset = (u64)-1; while (1) { + struct btrfs_inode *temp_inode; + ret = btrfs_search_backwards(root, &key, path); if (ret < 0) goto out_put; @@ -1915,9 +1914,9 @@ static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap, ret = PTR_ERR(temp_inode); goto out_put; } - ret = inode_permission(idmap, temp_inode, + ret = inode_permission(idmap, &temp_inode->vfs_inode, MAY_READ | MAY_EXEC); - iput(temp_inode); + iput(&temp_inode->vfs_inode); if (ret) { ret = -EACCES; goto out_put; @@ -2289,7 +2288,6 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL; struct mnt_idmap *idmap = file_mnt_idmap(file); char *subvol_name, *subvol_name_ptr = NULL; - int subvol_namelen; int ret = 0; bool destroy_parent = false; @@ -2412,10 +2410,8 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, goto out; } - subvol_namelen = strlen(subvol_name); - if (strchr(subvol_name, '/') || - strncmp(subvol_name, "..", subvol_namelen) == 0) { + strcmp(subvol_name, "..") == 0) { ret = -EINVAL; goto free_subvol_name; } @@ -2428,7 +2424,7 @@ static noinline int btrfs_ioctl_snap_destroy(struct file *file, ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT); if (ret == -EINTR) goto free_subvol_name; - dentry = lookup_one(idmap, subvol_name, parent, subvol_namelen); + dentry = lookup_one(idmap, &QSTR(subvol_name), parent); if (IS_ERR(dentry)) { ret = PTR_ERR(dentry); goto out_unlock_dir; @@ -2571,7 +2567,7 @@ static int btrfs_ioctl_defrag(struct file *file, void __user *argp) /* the rest are all set to zero by kzalloc */ range.len = (u64)-1; } - ret = btrfs_defrag_file(file_inode(file), &file->f_ra, + ret = btrfs_defrag_file(BTRFS_I(file_inode(file)), &file->f_ra, &range, BTRFS_OLDEST_GENERATION, 0); if (ret > 0) ret = 0; @@ -2763,7 +2759,7 @@ out_free: return ret; } -static long btrfs_ioctl_fs_info(struct 
btrfs_fs_info *fs_info, +static long btrfs_ioctl_fs_info(const struct btrfs_fs_info *fs_info, void __user *arg) { struct btrfs_ioctl_fs_info_args *fi_args; @@ -2817,7 +2813,7 @@ static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info, return ret; } -static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info, +static long btrfs_ioctl_dev_info(const struct btrfs_fs_info *fs_info, void __user *arg) { BTRFS_DEV_LOOKUP_ARGS(args); @@ -4248,7 +4244,7 @@ static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info, return 0; } -static int check_feature_bits(struct btrfs_fs_info *fs_info, +static int check_feature_bits(const struct btrfs_fs_info *fs_info, enum btrfs_feature_set set, u64 change_mask, u64 flags, u64 supported_flags, u64 safe_set, u64 safe_clear) @@ -4384,7 +4380,7 @@ out_drop_write: return ret; } -static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool compat) +static int _btrfs_ioctl_send(struct btrfs_root *root, void __user *argp, bool compat) { struct btrfs_ioctl_send_args *arg; int ret; @@ -4415,7 +4411,7 @@ static int _btrfs_ioctl_send(struct btrfs_inode *inode, void __user *argp, bool if (IS_ERR(arg)) return PTR_ERR(arg); } - ret = btrfs_ioctl_send(inode, arg); + ret = btrfs_ioctl_send(root, arg); kfree(arg); return ret; } @@ -4511,7 +4507,7 @@ static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp, args.compression, &unlocked); if (!unlocked) { - unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); } } @@ -4700,7 +4696,7 @@ static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int iss ret = priv->count; out: - unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); + btrfs_unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); io_uring_cmd_done(cmd, ret, 0, issue_flags); @@ -4789,7 +4785,7 @@ static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter, return -EIOCBQUEUED; out_fail: - unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); kfree(priv); return ret; @@ -4903,6 +4899,8 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue ret = btrfs_encoded_read(&kiocb, &data->iter, &data->args, &cached_state, &disk_bytenr, &disk_io_size); + if (ret == -EAGAIN) + goto out_acct; if (ret < 0 && ret != -EIOCBQUEUED) goto out_free; @@ -4912,7 +4910,7 @@ static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue (const char *)&data->args + copy_end_kernel, sizeof(data->args) - copy_end_kernel)) { if (ret == -EIOCBQUEUED) { - unlock_extent(io_tree, start, lockend, &cached_state); + btrfs_unlock_extent(io_tree, start, lockend, &cached_state); btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED); } ret = -EFAULT; @@ -5242,7 +5240,7 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_SNAP_DESTROY_V2: return btrfs_ioctl_snap_destroy(file, argp, true); case BTRFS_IOC_SUBVOL_GETFLAGS: - return btrfs_ioctl_subvol_getflags(inode, argp); + return btrfs_ioctl_subvol_getflags(BTRFS_I(inode), argp); case BTRFS_IOC_SUBVOL_SETFLAGS: return btrfs_ioctl_subvol_setflags(file, argp); case BTRFS_IOC_DEFAULT_SUBVOL: @@ -5264,9 +5262,9 @@ long btrfs_ioctl(struct file *file, unsigned int case BTRFS_IOC_DEV_INFO: return btrfs_ioctl_dev_info(fs_info, argp); 
case BTRFS_IOC_TREE_SEARCH: - return btrfs_ioctl_tree_search(inode, argp); + return btrfs_ioctl_tree_search(root, argp); case BTRFS_IOC_TREE_SEARCH_V2: - return btrfs_ioctl_tree_search_v2(inode, argp); + return btrfs_ioctl_tree_search_v2(root, argp); case BTRFS_IOC_INO_LOOKUP: return btrfs_ioctl_ino_lookup(root, argp); case BTRFS_IOC_INO_PATHS: @@ -5314,10 +5312,10 @@ long btrfs_ioctl(struct file *file, unsigned int return btrfs_ioctl_set_received_subvol_32(file, argp); #endif case BTRFS_IOC_SEND: - return _btrfs_ioctl_send(BTRFS_I(inode), argp, false); + return _btrfs_ioctl_send(root, argp, false); #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT) case BTRFS_IOC_SEND_32: - return _btrfs_ioctl_send(BTRFS_I(inode), argp, true); + return _btrfs_ioctl_send(root, argp, true); #endif case BTRFS_IOC_GET_DEV_STATS: return btrfs_ioctl_get_dev_stats(fs_info, argp); diff --git a/fs/btrfs/ioctl.h b/fs/btrfs/ioctl.h index ce915fcda43b..e08ea446cf48 100644 --- a/fs/btrfs/ioctl.h +++ b/fs/btrfs/ioctl.h @@ -9,6 +9,8 @@ struct file; struct dentry; struct mnt_idmap; struct fileattr; +struct io_uring_cmd; +struct btrfs_inode; struct btrfs_fs_info; struct btrfs_ioctl_balance_args; @@ -18,7 +20,7 @@ int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); int btrfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, struct fileattr *fa); int btrfs_ioctl_get_supported_features(void __user *arg); -void btrfs_sync_inode_flags_to_i_flags(struct inode *inode); +void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode); void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info, struct btrfs_ioctl_balance_args *bargs); int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); diff --git a/fs/btrfs/locking.c b/fs/btrfs/locking.c index 9a7a7b723305..a3e6d9616e60 100644 --- a/fs/btrfs/locking.c +++ b/fs/btrfs/locking.c @@ -9,7 +9,6 @@ #include <linux/page-flags.h> #include <asm/bug.h> #include <trace/events/btrfs.h> -#include "misc.h" #include "ctree.h" #include "extent_io.h" #include "locking.h" @@ -150,15 +149,15 @@ void btrfs_tree_read_lock_nested(struct extent_buffer *eb, enum btrfs_lock_nesti /* * Try-lock for read. 
* - * Return 1 if the rwlock has been taken, 0 otherwise + * Return true if the rwlock has been taken, false otherwise */ -int btrfs_try_tree_read_lock(struct extent_buffer *eb) +bool btrfs_try_tree_read_lock(struct extent_buffer *eb) { if (down_read_trylock(&eb->lock)) { trace_btrfs_try_tree_read_lock(eb); - return 1; + return true; } - return 0; + return false; } /* diff --git a/fs/btrfs/locking.h b/fs/btrfs/locking.h index c69e57ff804b..af29df98ac14 100644 --- a/fs/btrfs/locking.h +++ b/fs/btrfs/locking.h @@ -189,7 +189,7 @@ static inline void btrfs_tree_read_lock(struct extent_buffer *eb) } void btrfs_tree_read_unlock(struct extent_buffer *eb); -int btrfs_try_tree_read_lock(struct extent_buffer *eb); +bool btrfs_try_tree_read_lock(struct extent_buffer *eb); struct extent_buffer *btrfs_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_read_lock_root_node(struct btrfs_root *root); struct extent_buffer *btrfs_try_read_lock_root_node(struct btrfs_root *root); diff --git a/fs/btrfs/lzo.c b/fs/btrfs/lzo.c index a45bc11f8665..d403641889ca 100644 --- a/fs/btrfs/lzo.c +++ b/fs/btrfs/lzo.c @@ -252,9 +252,8 @@ int lzo_compress_folios(struct list_head *ws, struct address_space *mapping, /* Compress at most one sector of data each time */ in_len = min_t(u32, start + len - cur_in, sectorsize - sector_off); ASSERT(in_len); - data_in = kmap_local_folio(folio_in, 0); - ret = lzo1x_1_compress(data_in + - offset_in_page(cur_in), in_len, + data_in = kmap_local_folio(folio_in, offset_in_folio(folio_in, cur_in)); + ret = lzo1x_1_compress(data_in, in_len, workspace->cbuf, &out_len, workspace->mem); kunmap_local(data_in); diff --git a/fs/btrfs/messages.h b/fs/btrfs/messages.h index 08a9272399d2..6abf81bb00c2 100644 --- a/fs/btrfs/messages.h +++ b/fs/btrfs/messages.h @@ -4,6 +4,7 @@ #define BTRFS_MESSAGES_H #include <linux/types.h> +#include <linux/types.h> #include <linux/printk.h> #include <linux/bug.h> @@ -170,15 +171,83 @@ do { \ #ifdef CONFIG_BTRFS_ASSERT -#define btrfs_assertfail(expr, file, line) ({ \ - pr_err("assertion failed: %s, in %s:%d\n", (expr), (file), (line)); \ - BUG(); \ -}) +__printf(1, 2) +static inline void verify_assert_printk_format(const char *fmt, ...) { + /* Stub to verify the assertion format string. */ +} + +/* Take the first token if any. */ +#define __FIRST_ARG(_, ...) _ +/* + * Skip the first token and return the rest, if it's empty the comma is dropped. + * As ##__VA_ARGS__ cannot be at the beginning of the macro the __VA_OPT__ is needed + * and supported since GCC 8 and Clang 12. + */ +#define __REST_ARGS(_, ... ) __VA_OPT__(,) __VA_ARGS__ + +#if defined(CONFIG_CC_IS_CLANG) || GCC_VERSION >= 80000 +/* + * Assertion with optional printk() format. 
+ * + * Accepted syntax: + * ASSERT(condition); + * ASSERT(condition, "string"); + * ASSERT(condition, "variable=%d", variable); + * + * How it works: + * - if there's no format string, ""[0] evaluates at compile time to 0 and the + * true branch is executed + * - any non-empty format string with the "" prefix evaluates to != 0 at + * compile time and the false branch is executed + * - stringified condition is printed as %s so we don't accidentally mix format + * strings (the % operator) + * - there can be only one printk() call, so the format strings and arguments are + * spliced together: + * DEFAULT_FMT [USER_FMT], DEFAULT_ARGS [, USER_ARGS] + * - comma between DEFAULT_ARGS and USER_ARGS is handled by preprocessor + * (requires __VA_OPT__ support) + * - otherwise we could use __VA_OPT__(,) __VA_ARGS__ for the 2nd+ argument of args. + */ +#define ASSERT(cond, args...) \ +do { \ + verify_assert_printk_format("check the format string" args); \ + if (!likely(cond)) { \ + if (("" __FIRST_ARG(args) [0]) == 0) { \ + pr_err("assertion failed: %s :: %ld, in %s:%d\n", \ + #cond, (long)(cond), __FILE__, __LINE__); \ + } else { \ + pr_err("assertion failed: %s :: %ld, in %s:%d (" __FIRST_ARG(args) ")\n", \ + #cond, (long)(cond), __FILE__, __LINE__ __REST_ARGS(args)); \ + } \ + BUG(); \ + } \ +} while (0) + +#else + +/* For GCC < 8.x only the simple output. */ + +#define ASSERT(cond, args...) \ +do { \ + verify_assert_printk_format("check the format string" args); \ + if (!likely(cond)) { \ + pr_err("assertion failed: %s :: %ld, in %s:%d\n", \ + #cond, (long)(cond), __FILE__, __LINE__); \ + BUG(); \ + } \ +} while (0) + +#endif + +#else +#define ASSERT(cond, args...) (void)(cond) +#endif -#define ASSERT(expr) \ - (likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__)) +#ifdef CONFIG_BTRFS_DEBUG +/* Verbose warning only under debug build. */ +#define DEBUG_WARN(args...) WARN(1, KERN_ERR args) #else -#define ASSERT(expr) (void)(expr) +#define DEBUG_WARN(...) do {} while (0) #endif __printf(5, 6) diff --git a/fs/btrfs/ordered-data.c b/fs/btrfs/ordered-data.c index 4aca7475fd82..9212ce110cde 100644 --- a/fs/btrfs/ordered-data.c +++ b/fs/btrfs/ordered-data.c @@ -153,25 +153,30 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( struct btrfs_ordered_extent *entry; int ret; u64 qgroup_rsv = 0; + const bool is_nocow = (flags & + ((1U << BTRFS_ORDERED_NOCOW) | (1U << BTRFS_ORDERED_PREALLOC))); - if (flags & - ((1 << BTRFS_ORDERED_NOCOW) | (1 << BTRFS_ORDERED_PREALLOC))) { - /* For nocow write, we can release the qgroup rsv right now */ + /* + * For a NOCOW write we can free the qgroup reserve right now. For a COW + * one we transfer the reserved space from the inode's iotree into the + * ordered extent by calling btrfs_qgroup_release_data() and tracking + * the qgroup reserved amount in the ordered extent, so that later after + * completing the ordered extent, when running the data delayed ref it + * creates, we free the reserved data with btrfs_qgroup_free_refroot(). + */ + if (is_nocow) ret = btrfs_qgroup_free_data(inode, NULL, file_offset, num_bytes, &qgroup_rsv); - if (ret < 0) - return ERR_PTR(ret); - } else { - /* - * The ordered extent has reserved qgroup space, release now - * and pass the reserved number for qgroup_record to free.
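An aside on the ASSERT() rework above, before the ordered-data.c hunk continues: the choice between the plain and the formatted report is made entirely at compile time by the ""-prefix trick the comment describes. A minimal user-space sketch of the same mechanism, not taken from the patch (MY_ASSERT and verify_fmt are hypothetical names; __VA_OPT__ requires GCC 8+ or Clang 12+):

/* cc -Wall demo.c && ./a.out */
#include <stdio.h>
#include <stdlib.h>

__attribute__((format(printf, 1, 2)))
static void verify_fmt(const char *fmt, ...) { (void)fmt; } /* compile-time format check only */

#define FIRST_ARG(_, ...) _
#define REST_ARGS(_, ...) __VA_OPT__(,) __VA_ARGS__

#define MY_ASSERT(cond, args...)                                              \
do {                                                                          \
        verify_fmt("check the format string" args);                           \
        if (!(cond)) {                                                        \
                /* ""[0] == 0 selects the short report when no format given */\
                if (("" FIRST_ARG(args) [0]) == 0)                            \
                        fprintf(stderr, "assertion failed: %s\n", #cond);     \
                else                                                          \
                        fprintf(stderr,                                       \
                                "assertion failed: %s (" FIRST_ARG(args) ")\n", \
                                #cond REST_ARGS(args));                       \
                abort();                                                      \
        }                                                                     \
} while (0)

int main(void)
{
        int nr = 3;

        MY_ASSERT(nr > 0);              /* no format: short message */
        MY_ASSERT(nr > 5, "nr=%d", nr); /* format spliced into one fprintf() */
        return 0;
}

The kernel macro builds its pr_err() strings the same way: the user format is concatenated into the default one, so a single printk() call carries both the stringified condition and the optional details.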
- */ + else ret = btrfs_qgroup_release_data(inode, file_offset, num_bytes, &qgroup_rsv); - if (ret < 0) - return ERR_PTR(ret); - } + + if (ret < 0) + return ERR_PTR(ret); + entry = kmem_cache_zalloc(btrfs_ordered_extent_cache, GFP_NOFS); - if (!entry) - return ERR_PTR(-ENOMEM); + if (!entry) { + entry = ERR_PTR(-ENOMEM); + goto out; + } entry->file_offset = file_offset; entry->num_bytes = num_bytes; @@ -180,7 +185,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( entry->disk_num_bytes = disk_num_bytes; entry->offset = offset; entry->bytes_left = num_bytes; - entry->inode = BTRFS_I(igrab(&inode->vfs_inode)); + if (WARN_ON_ONCE(!igrab(&inode->vfs_inode))) { + kmem_cache_free(btrfs_ordered_extent_cache, entry); + entry = ERR_PTR(-ESTALE); + goto out; + } + entry->inode = inode; entry->compress_type = compress_type; entry->truncated_len = (u64)-1; entry->qgroup_rsv = qgroup_rsv; @@ -203,6 +213,12 @@ static struct btrfs_ordered_extent *alloc_ordered_extent( btrfs_mod_outstanding_extents(inode, 1); spin_unlock(&inode->lock); +out: + if (IS_ERR(entry) && !is_nocow) + btrfs_qgroup_free_refroot(inode->root->fs_info, + btrfs_root_id(inode->root), + qgroup_rsv, BTRFS_QGROUP_RSV_DATA); + return entry; } @@ -253,7 +269,7 @@ static void insert_ordered_extent(struct btrfs_ordered_extent *entry) * @disk_bytenr: Offset of extent on disk. * @disk_num_bytes: Size of extent on disk. * @offset: Offset into unencoded data where file data starts. - * @flags: Flags specifying type of extent (1 << BTRFS_ORDERED_*). + * @flags: Flags specifying type of extent (1U << BTRFS_ORDERED_*). * @compress_type: Compression algorithm used for data. * * Most of these parameters correspond to &struct btrfs_file_extent_item. The @@ -607,23 +623,18 @@ out: */ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry) { - struct list_head *cur; - struct btrfs_ordered_sum *sum; - trace_btrfs_ordered_extent_put(entry->inode, entry); if (refcount_dec_and_test(&entry->refs)) { + struct btrfs_ordered_sum *sum; + struct btrfs_ordered_sum *tmp; + ASSERT(list_empty(&entry->root_extent_list)); ASSERT(list_empty(&entry->log_list)); ASSERT(RB_EMPTY_NODE(&entry->rb_node)); - if (entry->inode) - btrfs_add_delayed_iput(entry->inode); - while (!list_empty(&entry->list)) { - cur = entry->list.next; - sum = list_entry(cur, struct btrfs_ordered_sum, list); - list_del(&sum->list); + btrfs_add_delayed_iput(entry->inode); + list_for_each_entry_safe(sum, tmp, &entry->list, list) kvfree(sum); - } kmem_cache_free(btrfs_ordered_extent_cache, entry); } } @@ -842,10 +853,12 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr, /* * Start IO and wait for a given ordered extent to finish. * - * Wait on page writeback for all the pages in the extent and the IO completion - * code to insert metadata into the btree corresponding to the extent. + * Wait on page writeback for all the pages in the extent but not in + * [@nowriteback_start, @nowriteback_start + @nowriteback_len) and the + * IO completion code to insert metadata into the btree corresponding to the extent. 
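The hunk that follows implements that skip-range: everything in the extent outside [@nowriteback_start, @nowriteback_start + @nowriteback_len) is still flushed, in up to two pieces. A stand-alone model of the range arithmetic, not from the patch (user-space, hypothetical names; flush_range() stands in for filemap_fdatawrite_range()):

#include <inttypes.h>
#include <stdint.h>
#include <stdio.h>

static void flush_range(uint64_t from, uint64_t to)
{
        printf("flush [%" PRIu64 ", %" PRIu64 "]\n", from, to);
}

static void start_ordered_nowriteback(uint64_t start, uint64_t end,
                                      uint64_t skip_start, uint32_t skip_len)
{
        if (skip_len == 0) {
                flush_range(start, end);        /* whole extent, as before */
                return;
        }
        if (start < skip_start)                 /* piece before the skipped range */
                flush_range(start, skip_start - 1);
        if (skip_start + skip_len < end)        /* piece after the skipped range */
                flush_range(skip_start + skip_len, end);
}

int main(void)
{
        /* 1 MiB extent, skipping 128 KiB starting at 512 KiB. */
        start_ordered_nowriteback(0, (1 << 20) - 1, 512 << 10, 128 << 10);
        return 0;
}

Note the boundaries mirror the kernel code exactly: the leading piece ends at skip_start - 1 and the trailing piece starts at skip_start + skip_len, because the skipped range is half-open while [start, end] is inclusive.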
*/ -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) +void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry, + u64 nowriteback_start, u32 nowriteback_len) { u64 start = entry->file_offset; u64 end = start + entry->num_bytes - 1; @@ -865,8 +878,19 @@ void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) * start IO on any dirty ones so the wait doesn't stall waiting * for the flusher thread to find them */ - if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) - filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); + if (!test_bit(BTRFS_ORDERED_DIRECT, &entry->flags)) { + if (!nowriteback_len) { + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, end); + } else { + if (start < nowriteback_start) + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, start, + nowriteback_start - 1); + if (nowriteback_start + nowriteback_len < end) + filemap_fdatawrite_range(inode->vfs_inode.i_mapping, + nowriteback_start + nowriteback_len, + end); + } + } if (!freespace_inode) btrfs_might_wait_for_event(inode->root->fs_info, btrfs_ordered_extent); @@ -1160,7 +1184,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, cachedp = cached_state; while (1) { - lock_extent(&inode->io_tree, start, end, cachedp); + btrfs_lock_extent(&inode->io_tree, start, end, cachedp); ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); if (!ordered) { @@ -1173,7 +1197,7 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start, refcount_dec(&cache->refs); break; } - unlock_extent(&inode->io_tree, start, end, cachedp); + btrfs_unlock_extent(&inode->io_tree, start, end, cachedp); btrfs_start_ordered_extent(ordered); btrfs_put_ordered_extent(ordered); } @@ -1191,7 +1215,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, { struct btrfs_ordered_extent *ordered; - if (!try_lock_extent(&inode->io_tree, start, end, cached_state)) + if (!btrfs_try_lock_extent(&inode->io_tree, start, end, cached_state)) return false; ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1); @@ -1199,7 +1223,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end, return true; btrfs_put_ordered_extent(ordered); - unlock_extent(&inode->io_tree, start, end, cached_state); + btrfs_unlock_extent(&inode->io_tree, start, end, cached_state); return false; } diff --git a/fs/btrfs/ordered-data.h b/fs/btrfs/ordered-data.h index 4e152736d06c..1e6b0b182b29 100644 --- a/fs/btrfs/ordered-data.h +++ b/fs/btrfs/ordered-data.h @@ -17,6 +17,7 @@ struct inode; struct page; struct extent_state; +struct btrfs_block_group; struct btrfs_inode; struct btrfs_root; struct btrfs_fs_info; @@ -191,7 +192,13 @@ void btrfs_add_ordered_sum(struct btrfs_ordered_extent *entry, struct btrfs_ordered_sum *sum); struct btrfs_ordered_extent *btrfs_lookup_ordered_extent(struct btrfs_inode *inode, u64 file_offset); -void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry); +void btrfs_start_ordered_extent_nowriteback(struct btrfs_ordered_extent *entry, + u64 nowriteback_start, u32 nowriteback_len); +static inline void btrfs_start_ordered_extent(struct btrfs_ordered_extent *entry) +{ + return btrfs_start_ordered_extent_nowriteback(entry, 0, 0); +} + int btrfs_wait_ordered_range(struct btrfs_inode *inode, u64 start, u64 len); struct btrfs_ordered_extent * btrfs_lookup_first_ordered_extent(struct btrfs_inode *inode, u64 file_offset); diff --git a/fs/btrfs/print-tree.h 
b/fs/btrfs/print-tree.h index 8504bf1702c7..d0e620bf5f5a 100644 --- a/fs/btrfs/print-tree.h +++ b/fs/btrfs/print-tree.h @@ -6,6 +6,8 @@ #ifndef BTRFS_PRINT_TREE_H #define BTRFS_PRINT_TREE_H +#include <linux/types.h> + /* Buffer size to contain tree name and possibly additional data (offset) */ #define BTRFS_ROOT_NAME_BUF_LEN 48 diff --git a/fs/btrfs/props.c b/fs/btrfs/props.c index b8fa34e16abb..adc956432d2f 100644 --- a/fs/btrfs/props.c +++ b/fs/btrfs/props.c @@ -26,8 +26,8 @@ struct prop_handler { const char *xattr_name; int (*validate)(const struct btrfs_inode *inode, const char *value, size_t len); - int (*apply)(struct inode *inode, const char *value, size_t len); - const char *(*extract)(const struct inode *inode); + int (*apply)(struct btrfs_inode *inode, const char *value, size_t len); + const char *(*extract)(const struct btrfs_inode *inode); bool (*ignore)(const struct btrfs_inode *inode); int inheritable; }; @@ -121,7 +121,7 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, if (ret) return ret; - ret = handler->apply(&inode->vfs_inode, NULL, 0); + ret = handler->apply(inode, NULL, 0); ASSERT(ret == 0); return ret; @@ -131,7 +131,7 @@ int btrfs_set_prop(struct btrfs_trans_handle *trans, struct btrfs_inode *inode, value_len, flags); if (ret) return ret; - ret = handler->apply(&inode->vfs_inode, value, value_len); + ret = handler->apply(inode, value, value_len); if (ret) { btrfs_setxattr(trans, &inode->vfs_inode, handler->xattr_name, NULL, 0, flags); @@ -263,7 +263,7 @@ static void inode_prop_iterator(void *ctx, struct btrfs_root *root = BTRFS_I(inode)->root; int ret; - ret = handler->apply(inode, value, len); + ret = handler->apply(BTRFS_I(inode), value, len); if (unlikely(ret)) btrfs_warn(root->fs_info, "error applying prop %s to ino %llu (root %llu): %d", @@ -273,12 +273,13 @@ static void inode_prop_iterator(void *ctx, set_bit(BTRFS_INODE_HAS_PROPS, &BTRFS_I(inode)->runtime_flags); } -int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path) +int btrfs_load_inode_props(struct btrfs_inode *inode, struct btrfs_path *path) { - struct btrfs_root *root = BTRFS_I(inode)->root; - u64 ino = btrfs_ino(BTRFS_I(inode)); + struct btrfs_root *root = inode->root; + u64 ino = btrfs_ino(inode); - return iterate_object_props(root, path, ino, inode_prop_iterator, inode); + return iterate_object_props(root, path, ino, inode_prop_iterator, + &inode->vfs_inode); } static int prop_compression_validate(const struct btrfs_inode *inode, @@ -300,26 +301,26 @@ static int prop_compression_validate(const struct btrfs_inode *inode, return -EINVAL; } -static int prop_compression_apply(struct inode *inode, const char *value, +static int prop_compression_apply(struct btrfs_inode *inode, const char *value, size_t len) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(inode); + struct btrfs_fs_info *fs_info = inode->root->fs_info; int type; /* Reset to defaults */ if (len == 0) { - BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; + inode->flags &= ~BTRFS_INODE_COMPRESS; + inode->flags &= ~BTRFS_INODE_NOCOMPRESS; + inode->prop_compress = BTRFS_COMPRESS_NONE; return 0; } /* Set NOCOMPRESS flag */ if ((len == 2 && strncmp("no", value, 2) == 0) || (len == 4 && strncmp("none", value, 4) == 0)) { - BTRFS_I(inode)->flags |= BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->flags &= ~BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->prop_compress = BTRFS_COMPRESS_NONE; + inode->flags 
|= BTRFS_INODE_NOCOMPRESS; + inode->flags &= ~BTRFS_INODE_COMPRESS; + inode->prop_compress = BTRFS_COMPRESS_NONE; return 0; } @@ -336,9 +337,9 @@ static int prop_compression_apply(struct inode *inode, const char *value, return -EINVAL; } - BTRFS_I(inode)->flags &= ~BTRFS_INODE_NOCOMPRESS; - BTRFS_I(inode)->flags |= BTRFS_INODE_COMPRESS; - BTRFS_I(inode)->prop_compress = type; + inode->flags &= ~BTRFS_INODE_NOCOMPRESS; + inode->flags |= BTRFS_INODE_COMPRESS; + inode->prop_compress = type; return 0; } @@ -359,13 +360,13 @@ static bool prop_compression_ignore(const struct btrfs_inode *inode) return false; } -static const char *prop_compression_extract(const struct inode *inode) +static const char *prop_compression_extract(const struct btrfs_inode *inode) { - switch (BTRFS_I(inode)->prop_compress) { + switch (inode->prop_compress) { case BTRFS_COMPRESS_ZLIB: case BTRFS_COMPRESS_LZO: case BTRFS_COMPRESS_ZSTD: - return btrfs_compress_type2str(BTRFS_I(inode)->prop_compress); + return btrfs_compress_type2str(inode->prop_compress); default: break; } @@ -385,16 +386,16 @@ static struct prop_handler prop_handlers[] = { }; int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, - struct inode *inode, const struct inode *parent) + struct btrfs_inode *inode, + const struct btrfs_inode *parent) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; int ret; int i; bool need_reserve = false; - if (!test_bit(BTRFS_INODE_HAS_PROPS, - &BTRFS_I(parent)->runtime_flags)) + if (!test_bit(BTRFS_INODE_HAS_PROPS, &parent->runtime_flags)) return 0; for (i = 0; i < ARRAY_SIZE(prop_handlers); i++) { @@ -405,7 +406,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, if (!h->inheritable) continue; - if (h->ignore(BTRFS_I(inode))) + if (h->ignore(inode)) continue; value = h->extract(parent); @@ -416,7 +417,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, * This is not strictly necessary as the property should be * valid, but in case it isn't, don't propagate it further. 
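For context on prop_compression_apply() above: the property is driven from user space through the btrfs.compression extended attribute, which is the existing interface rather than something introduced by this patch. A small sketch of setting it with setxattr(2), assuming a file on a mounted btrfs:

/* cc -o setcomp setcomp.c && ./setcomp /mnt/file zstd */
#include <stdio.h>
#include <string.h>
#include <sys/xattr.h>

int main(int argc, char **argv)
{
        if (argc != 3) {
                fprintf(stderr, "usage: %s <file-on-btrfs> <zlib|lzo|zstd|no|none>\n", argv[0]);
                return 1;
        }
        /* The value is the bare algorithm name; no trailing NUL is stored. */
        if (setxattr(argv[1], "btrfs.compression", argv[2], strlen(argv[2]), 0) != 0) {
                perror("setxattr");
                return 1;
        }
        return 0;
}

The values "no" and "none" map to the BTRFS_INODE_NOCOMPRESS branch shown in the hunk above, while zlib/lzo/zstd set BTRFS_INODE_COMPRESS and prop_compress.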
*/ - ret = h->validate(BTRFS_I(inode), value, strlen(value)); + ret = h->validate(inode, value, strlen(value)); if (ret) continue; @@ -436,16 +437,15 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, return ret; } - ret = btrfs_setxattr(trans, inode, h->xattr_name, value, + ret = btrfs_setxattr(trans, &inode->vfs_inode, h->xattr_name, value, strlen(value), 0); if (!ret) { ret = h->apply(inode, value, strlen(value)); if (ret) - btrfs_setxattr(trans, inode, h->xattr_name, + btrfs_setxattr(trans, &inode->vfs_inode, h->xattr_name, NULL, 0, 0); else - set_bit(BTRFS_INODE_HAS_PROPS, - &BTRFS_I(inode)->runtime_flags); + set_bit(BTRFS_INODE_HAS_PROPS, &inode->runtime_flags); } if (need_reserve) { diff --git a/fs/btrfs/props.h b/fs/btrfs/props.h index 63546d0a9444..15d9a025c923 100644 --- a/fs/btrfs/props.h +++ b/fs/btrfs/props.h @@ -6,9 +6,9 @@ #ifndef BTRFS_PROPS_H #define BTRFS_PROPS_H +#include <linux/types.h> #include <linux/compiler_types.h> -struct inode; struct btrfs_inode; struct btrfs_path; struct btrfs_trans_handle; @@ -22,10 +22,10 @@ int btrfs_validate_prop(const struct btrfs_inode *inode, const char *name, const char *value, size_t value_len); bool btrfs_ignore_prop(const struct btrfs_inode *inode, const char *name); -int btrfs_load_inode_props(struct inode *inode, struct btrfs_path *path); +int btrfs_load_inode_props(struct btrfs_inode *inode, struct btrfs_path *path); int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans, - struct inode *inode, - const struct inode *dir); + struct btrfs_inode *inode, + const struct btrfs_inode *dir); #endif diff --git a/fs/btrfs/qgroup.c b/fs/btrfs/qgroup.c index f9d3766c809b..b3176edbde82 100644 --- a/fs/btrfs/qgroup.c +++ b/fs/btrfs/qgroup.c @@ -83,7 +83,7 @@ static void qgroup_rsv_add(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup, u64 num_bytes, enum btrfs_qgroup_rsv_type type) { - trace_qgroup_update_reserve(fs_info, qgroup, num_bytes, type); + trace_btrfs_qgroup_update_reserve(fs_info, qgroup, num_bytes, type); qgroup->rsv.values[type] += num_bytes; } @@ -91,7 +91,7 @@ static void qgroup_rsv_release(struct btrfs_fs_info *fs_info, struct btrfs_qgroup *qgroup, u64 num_bytes, enum btrfs_qgroup_rsv_type type) { - trace_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type); + trace_btrfs_qgroup_update_reserve(fs_info, qgroup, -(s64)num_bytes, type); if (qgroup->rsv.values[type] >= num_bytes) { qgroup->rsv.values[type] -= num_bytes; return; @@ -956,8 +956,8 @@ static int btrfs_clean_quota_tree(struct btrfs_trans_handle *trans, return -ENOMEM; key.objectid = 0; - key.offset = 0; key.type = 0; + key.offset = 0; while (1) { ret = btrfs_search_slot(trans, root, &key, path, -1, 1); @@ -1823,7 +1823,7 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) if (qgroup->rsv.values[BTRFS_QGROUP_RSV_DATA] || qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PREALLOC] || qgroup->rsv.values[BTRFS_QGROUP_RSV_META_PERTRANS]) { - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN(); btrfs_warn_rl(fs_info, "to be deleted qgroup %u/%llu has non-zero numbers, data %llu meta prealloc %llu meta pertrans %llu", btrfs_qgroup_level(qgroup->qgroupid), @@ -1843,7 +1843,7 @@ int btrfs_remove_qgroup(struct btrfs_trans_handle *trans, u64 qgroupid) !(fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT)) { if (qgroup->rfer || qgroup->excl || qgroup->rfer_cmpr || qgroup->excl_cmpr) { - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN(); btrfs_warn_rl(fs_info, "to be deleted qgroup %u/%llu has non-zero 
numbers, rfer %llu rfer_cmpr %llu excl %llu excl_cmpr %llu", btrfs_qgroup_level(qgroup->qgroupid), @@ -2837,8 +2837,8 @@ static void qgroup_update_counters(struct btrfs_fs_info *fs_info, cur_old_count = btrfs_qgroup_get_old_refcnt(qg, seq); cur_new_count = btrfs_qgroup_get_new_refcnt(qg, seq); - trace_qgroup_update_counters(fs_info, qg, cur_old_count, - cur_new_count); + trace_btrfs_qgroup_update_counters(fs_info, qg, cur_old_count, + cur_new_count); /* Rfer update part */ if (cur_old_count == 0 && cur_new_count > 0) { @@ -3100,8 +3100,7 @@ cleanup: kfree(record); } - trace_qgroup_num_dirty_extents(fs_info, trans->transid, - num_dirty_extents); + trace_btrfs_qgroup_num_dirty_extents(fs_info, trans->transid, num_dirty_extents); return ret; } @@ -4129,8 +4128,8 @@ static int qgroup_unreserve_range(struct btrfs_inode *inode, * Now the entry is in [start, start + len), revert the * EXTENT_QGROUP_RESERVED bit. */ - clear_ret = clear_extent_bits(&inode->io_tree, entry_start, - entry_end, EXTENT_QGROUP_RESERVED); + clear_ret = btrfs_clear_extent_bits(&inode->io_tree, entry_start, + entry_end, EXTENT_QGROUP_RESERVED); if (!ret && clear_ret < 0) ret = clear_ret; @@ -4232,8 +4231,9 @@ static int qgroup_reserve_data(struct btrfs_inode *inode, reserved = *reserved_ret; /* Record already reserved space */ orig_reserved = reserved->bytes_changed; - ret = set_record_extent_bits(&inode->io_tree, start, - start + len -1, EXTENT_QGROUP_RESERVED, reserved); + ret = btrfs_set_record_extent_bits(&inode->io_tree, start, + start + len - 1, EXTENT_QGROUP_RESERVED, + reserved); /* Newly reserved space */ to_reserve = reserved->bytes_changed - orig_reserved; @@ -4326,9 +4326,10 @@ static int qgroup_free_reserved_data(struct btrfs_inode *inode, * EXTENT_QGROUP_RESERVED, we won't double free. * So no need to rush.
*/ - ret = clear_record_extent_bits(&inode->io_tree, free_start, - free_start + free_len - 1, - EXTENT_QGROUP_RESERVED, &changeset); + ret = btrfs_clear_record_extent_bits(&inode->io_tree, free_start, + free_start + free_len - 1, + EXTENT_QGROUP_RESERVED, + &changeset); if (ret < 0) goto out; freed += changeset.bytes_changed; @@ -4352,9 +4353,9 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, int ret; if (btrfs_qgroup_mode(inode->root->fs_info) == BTRFS_QGROUP_MODE_DISABLED) { - return clear_record_extent_bits(&inode->io_tree, start, - start + len - 1, - EXTENT_QGROUP_RESERVED, NULL); + return btrfs_clear_record_extent_bits(&inode->io_tree, start, + start + len - 1, + EXTENT_QGROUP_RESERVED, NULL); } /* In release case, we shouldn't have @reserved */ @@ -4362,8 +4363,8 @@ static int __btrfs_qgroup_release_data(struct btrfs_inode *inode, if (free && reserved) return qgroup_free_reserved_data(inode, reserved, start, len, released); extent_changeset_init(&changeset); - ret = clear_record_extent_bits(&inode->io_tree, start, start + len -1, - EXTENT_QGROUP_RESERVED, &changeset); + ret = btrfs_clear_record_extent_bits(&inode->io_tree, start, start + len - 1, + EXTENT_QGROUP_RESERVED, &changeset); if (ret < 0) goto out; @@ -4472,7 +4473,7 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes, return 0; BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); - trace_qgroup_meta_reserve(root, (s64)num_bytes, type); + trace_btrfs_qgroup_meta_reserve(root, (s64)num_bytes, type); ret = qgroup_reserve(root, num_bytes, enforce, type); if (ret < 0) return ret; @@ -4517,7 +4518,7 @@ void btrfs_qgroup_free_meta_all_pertrans(struct btrfs_root *root) return; /* TODO: Update trace point to handle such free */ - trace_qgroup_meta_free_all_pertrans(root); + trace_btrfs_qgroup_meta_free_all_pertrans(root); /* Special value -1 means to free all reserved space */ btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), (u64)-1, BTRFS_QGROUP_RSV_META_PERTRANS); @@ -4539,7 +4540,7 @@ void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes, */ num_bytes = sub_root_meta_rsv(root, num_bytes, type); BUG_ON(num_bytes != round_down(num_bytes, fs_info->nodesize)); - trace_qgroup_meta_reserve(root, -(s64)num_bytes, type); + trace_btrfs_qgroup_meta_reserve(root, -(s64)num_bytes, type); btrfs_qgroup_free_refroot(fs_info, btrfs_root_id(root), num_bytes, type); } @@ -4593,7 +4594,7 @@ void btrfs_qgroup_convert_reserved_meta(struct btrfs_root *root, int num_bytes) /* Same as btrfs_qgroup_free_meta_prealloc() */ num_bytes = sub_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PREALLOC); - trace_qgroup_meta_convert(root, num_bytes); + trace_btrfs_qgroup_meta_convert(root, num_bytes); qgroup_convert_meta(fs_info, btrfs_root_id(root), num_bytes); if (!sb_rdonly(fs_info->sb)) add_root_meta_rsv(root, num_bytes, BTRFS_QGROUP_RSV_META_PERTRANS); @@ -4611,8 +4612,8 @@ void btrfs_qgroup_check_reserved_leak(struct btrfs_inode *inode) int ret; extent_changeset_init(&changeset); - ret = clear_record_extent_bits(&inode->io_tree, 0, (u64)-1, - EXTENT_QGROUP_RESERVED, &changeset); + ret = btrfs_clear_record_extent_bits(&inode->io_tree, 0, (u64)-1, + EXTENT_QGROUP_RESERVED, &changeset); WARN_ON(ret < 0); if (WARN_ON(changeset.bytes_changed)) { @@ -4766,7 +4767,7 @@ int btrfs_qgroup_add_swapped_blocks(struct btrfs_root *subvol_root, * Marking qgroup inconsistent should be enough * for end users. 
*/ - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN("duplicated but mismatched entry found"); ret = -EEXIST; } kfree(block); diff --git a/fs/btrfs/qgroup.h b/fs/btrfs/qgroup.h index e233cc79af18..a979fd59a4da 100644 --- a/fs/btrfs/qgroup.h +++ b/fs/btrfs/qgroup.h @@ -22,6 +22,9 @@ struct btrfs_ioctl_quota_ctl_args; struct btrfs_trans_handle; struct btrfs_delayed_ref_root; struct btrfs_inode; +struct btrfs_transaction; +struct btrfs_block_group; +struct btrfs_qgroup_swapped_blocks; /* * Btrfs qgroup overview diff --git a/fs/btrfs/raid-stripe-tree.h b/fs/btrfs/raid-stripe-tree.h index 541836421778..69942ad43140 100644 --- a/fs/btrfs/raid-stripe-tree.h +++ b/fs/btrfs/raid-stripe-tree.h @@ -9,6 +9,7 @@ #include <linux/types.h> #include <uapi/linux/btrfs_tree.h> #include "fs.h" +#include "accessors.h" #define BTRFS_RST_SUPP_BLOCK_GROUP_MASK (BTRFS_BLOCK_GROUP_DUP | \ BTRFS_BLOCK_GROUP_RAID1_MASK | \ diff --git a/fs/btrfs/raid56.c b/fs/btrfs/raid56.c index cdd373c27784..3ff2bedfb3a4 100644 --- a/fs/btrfs/raid56.c +++ b/fs/btrfs/raid56.c @@ -134,14 +134,17 @@ struct btrfs_stripe_hash_table { }; /* - * A bvec like structure to present a sector inside a page. - * - * Unlike bvec we don't need bvlen, as it's fixed to sectorsize. + * A structure to represent a sector inside a page; the length is fixed to + * the sectorsize. */ struct sector_ptr { - struct page *page; - unsigned int pgoff:24; - unsigned int uptodate:8; + /* + * Blocks from the bio list can still be highmem. + * So here we use a physical address to represent a page and the offset inside it. + */ + phys_addr_t paddr; + bool has_paddr; + bool uptodate; }; static void rmw_rbio_work(struct work_struct *work); @@ -200,8 +203,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) struct btrfs_stripe_hash_table *x; struct btrfs_stripe_hash *cur; struct btrfs_stripe_hash *h; - int num_entries = 1 << BTRFS_STRIPE_HASH_TABLE_BITS; - int i; + unsigned int num_entries = 1U << BTRFS_STRIPE_HASH_TABLE_BITS; if (info->stripe_hash_table) return 0; @@ -222,7 +224,7 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) h = table->table; - for (i = 0; i < num_entries; i++) { + for (unsigned int i = 0; i < num_entries; i++) { cur = h + i; INIT_LIST_HEAD(&cur->hash_list); spin_lock_init(&cur->lock); @@ -233,6 +235,14 @@ int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info) return 0; } +static void memcpy_sectors(const struct sector_ptr *dst, + const struct sector_ptr *src, u32 blocksize) +{ + memcpy_page(phys_to_page(dst->paddr), offset_in_page(dst->paddr), + phys_to_page(src->paddr), offset_in_page(src->paddr), + blocksize); +} + /* * caching an rbio means to copy anything from the * bio_sectors array into the stripe_pages array.
We @@ -253,7 +263,7 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) for (i = 0; i < rbio->nr_sectors; i++) { /* Some range not covered by bio (partial write), skip it */ - if (!rbio->bio_sectors[i].page) { + if (!rbio->bio_sectors[i].has_paddr) { /* * Even if the sector is not covered by bio, if it is * a data sector it should still be uptodate as it is @@ -264,12 +274,8 @@ static void cache_rbio_pages(struct btrfs_raid_bio *rbio) continue; } - ASSERT(rbio->stripe_sectors[i].page); - memcpy_page(rbio->stripe_sectors[i].page, - rbio->stripe_sectors[i].pgoff, - rbio->bio_sectors[i].page, - rbio->bio_sectors[i].pgoff, - rbio->bioc->fs_info->sectorsize); + memcpy_sectors(&rbio->stripe_sectors[i], &rbio->bio_sectors[i], + rbio->bioc->fs_info->sectorsize); rbio->stripe_sectors[i].uptodate = 1; } set_bit(RBIO_CACHE_READY_BIT, &rbio->flags); @@ -326,8 +332,13 @@ static void index_stripe_sectors(struct btrfs_raid_bio *rbio) int page_index = offset >> PAGE_SHIFT; ASSERT(page_index < rbio->nr_pages); - rbio->stripe_sectors[i].page = rbio->stripe_pages[page_index]; - rbio->stripe_sectors[i].pgoff = offset_in_page(offset); + if (!rbio->stripe_pages[page_index]) + continue; + + rbio->stripe_sectors[i].has_paddr = true; + rbio->stripe_sectors[i].paddr = + page_to_phys(rbio->stripe_pages[page_index]) + + offset_in_page(offset); } } @@ -507,9 +518,8 @@ static void btrfs_clear_rbio_cache(struct btrfs_fs_info *info) spin_lock(&table->cache_lock); while (!list_empty(&table->stripe_cache)) { - rbio = list_entry(table->stripe_cache.next, - struct btrfs_raid_bio, - stripe_cache); + rbio = list_first_entry(&table->stripe_cache, + struct btrfs_raid_bio, stripe_cache); __remove_rbio_from_cache(rbio); } spin_unlock(&table->cache_lock); @@ -567,9 +577,9 @@ static void cache_rbio(struct btrfs_raid_bio *rbio) if (table->cache_size > RBIO_CACHE_SIZE) { struct btrfs_raid_bio *found; - found = list_entry(table->stripe_cache.prev, - struct btrfs_raid_bio, - stripe_cache); + found = list_last_entry(&table->stripe_cache, + struct btrfs_raid_bio, + stripe_cache); if (found != rbio) __remove_rbio_from_cache(found); @@ -882,14 +892,14 @@ done_nolock: remove_rbio_from_cache(rbio); } -static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) +static void rbio_endio_bio_list(struct bio *cur, blk_status_t status) { struct bio *next; while (cur) { next = cur->bi_next; cur->bi_next = NULL; - cur->bi_status = err; + cur->bi_status = status; bio_endio(cur); cur = next; } @@ -899,7 +909,7 @@ static void rbio_endio_bio_list(struct bio *cur, blk_status_t err) * this frees the rbio and runs through all the bios in the * bio_list and calls end_io on them */ -static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) +static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t status) { struct bio *cur = bio_list_get(&rbio->bio_list); struct bio *extra; @@ -928,9 +938,9 @@ static void rbio_orig_end_io(struct btrfs_raid_bio *rbio, blk_status_t err) extra = bio_list_get(&rbio->bio_list); free_raid_bio(rbio); - rbio_endio_bio_list(cur, err); + rbio_endio_bio_list(cur, status); if (extra) - rbio_endio_bio_list(extra, err); + rbio_endio_bio_list(extra, status); } /* @@ -962,9 +972,9 @@ static struct sector_ptr *sector_in_rbio(struct btrfs_raid_bio *rbio, spin_lock(&rbio->bio_list_lock); sector = &rbio->bio_sectors[index]; - if (sector->page || bio_list_only) { + if (sector->has_paddr || bio_list_only) { /* Don't return sector without a valid page pointer */ - if (!sector->page) + if 
(!sector->has_paddr) sector = NULL; spin_unlock(&rbio->bio_list_lock); return sector; @@ -1142,7 +1152,7 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, rbio, stripe_nr); ASSERT_RBIO_SECTOR(sector_nr >= 0 && sector_nr < rbio->stripe_nsectors, rbio, sector_nr); - ASSERT(sector->page); + ASSERT(sector->has_paddr); stripe = &rbio->bioc->stripes[stripe_nr]; disk_start = stripe->physical + sector_nr * sectorsize; @@ -1173,8 +1183,8 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, */ if (last_end == disk_start && !last->bi_status && last->bi_bdev == stripe->dev->bdev) { - ret = bio_add_page(last, sector->page, sectorsize, - sector->pgoff); + ret = bio_add_page(last, phys_to_page(sector->paddr), + sectorsize, offset_in_page(sector->paddr)); if (ret == sectorsize) return 0; } @@ -1187,7 +1197,8 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, bio->bi_iter.bi_sector = disk_start >> SECTOR_SHIFT; bio->bi_private = rbio; - __bio_add_page(bio, sector->page, sectorsize, sector->pgoff); + __bio_add_page(bio, phys_to_page(sector->paddr), sectorsize, + offset_in_page(sector->paddr)); bio_list_add(bio_list, bio); return 0; } @@ -1195,23 +1206,20 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio, static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio) { const u32 sectorsize = rbio->bioc->fs_info->sectorsize; - struct bio_vec bvec; - struct bvec_iter iter; + const u32 sectorsize_bits = rbio->bioc->fs_info->sectorsize_bits; + struct bvec_iter iter = bio->bi_iter; u32 offset = (bio->bi_iter.bi_sector << SECTOR_SHIFT) - rbio->bioc->full_stripe_logical; - bio_for_each_segment(bvec, bio, iter) { - u32 bvec_offset; - - for (bvec_offset = 0; bvec_offset < bvec.bv_len; - bvec_offset += sectorsize, offset += sectorsize) { - int index = offset / sectorsize; - struct sector_ptr *sector = &rbio->bio_sectors[index]; + while (iter.bi_size) { + unsigned int index = (offset >> sectorsize_bits); + struct sector_ptr *sector = &rbio->bio_sectors[index]; + struct bio_vec bv = bio_iter_iovec(bio, iter); - sector->page = bvec.bv_page; - sector->pgoff = bvec.bv_offset + bvec_offset; - ASSERT(sector->pgoff < PAGE_SIZE); - } + sector->has_paddr = true; + sector->paddr = bvec_phys(&bv); + bio_advance_iter_single(bio, &iter, sectorsize); + offset += sectorsize; } } @@ -1289,6 +1297,15 @@ static void assert_rbio(struct btrfs_raid_bio *rbio) ASSERT_RBIO(rbio->nr_data < rbio->real_stripes, rbio); } +static inline void *kmap_local_sector(const struct sector_ptr *sector) +{ + /* The sector pointer must have a page mapped to it. */ + ASSERT(sector->has_paddr); + + return kmap_local_page(phys_to_page(sector->paddr)) + + offset_in_page(sector->paddr); +} + /* Generate PQ for one vertical stripe. 
*/ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) { @@ -1301,14 +1318,13 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) /* First collect one sector from each data stripe */ for (stripe = 0; stripe < rbio->nr_data; stripe++) { sector = sector_in_rbio(rbio, stripe, sectornr, 0); - pointers[stripe] = kmap_local_page(sector->page) + - sector->pgoff; + pointers[stripe] = kmap_local_sector(sector); } /* Then add the parity stripe */ sector = rbio_pstripe_sector(rbio, sectornr); sector->uptodate = 1; - pointers[stripe++] = kmap_local_page(sector->page) + sector->pgoff; + pointers[stripe++] = kmap_local_sector(sector); if (has_qstripe) { /* @@ -1317,8 +1333,7 @@ static void generate_pq_vertical(struct btrfs_raid_bio *rbio, int sectornr) */ sector = rbio_qstripe_sector(rbio, sectornr); sector->uptodate = 1; - pointers[stripe++] = kmap_local_page(sector->page) + - sector->pgoff; + pointers[stripe++] = kmap_local_sector(sector); assert_rbio(rbio); raid6_call.gen_syndrome(rbio->real_stripes, sectorsize, @@ -1477,15 +1492,14 @@ static void set_rbio_range_error(struct btrfs_raid_bio *rbio, struct bio *bio) * stripe_pages[], thus we need to locate the sector. */ static struct sector_ptr *find_stripe_sector(struct btrfs_raid_bio *rbio, - struct page *page, - unsigned int pgoff) + phys_addr_t paddr) { int i; for (i = 0; i < rbio->nr_sectors; i++) { struct sector_ptr *sector = &rbio->stripe_sectors[i]; - if (sector->page == page && sector->pgoff == pgoff) + if (sector->has_paddr && sector->paddr == paddr) return sector; } return NULL; @@ -1505,11 +1519,10 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) bio_for_each_segment_all(bvec, bio, iter_all) { struct sector_ptr *sector; - int pgoff; + phys_addr_t paddr = bvec_phys(bvec); - for (pgoff = bvec->bv_offset; pgoff - bvec->bv_offset < bvec->bv_len; - pgoff += sectorsize) { - sector = find_stripe_sector(rbio, bvec->bv_page, pgoff); + for (u32 off = 0; off < bvec->bv_len; off += sectorsize) { + sector = find_stripe_sector(rbio, paddr + off); ASSERT(sector); if (sector) sector->uptodate = 1; @@ -1519,17 +1532,14 @@ static void set_bio_pages_uptodate(struct btrfs_raid_bio *rbio, struct bio *bio) static int get_bio_sector_nr(struct btrfs_raid_bio *rbio, struct bio *bio) { - struct bio_vec *bv = bio_first_bvec_all(bio); + phys_addr_t bvec_paddr = bvec_phys(bio_first_bvec_all(bio)); int i; for (i = 0; i < rbio->nr_sectors; i++) { - struct sector_ptr *sector; - - sector = &rbio->stripe_sectors[i]; - if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) + if (rbio->stripe_sectors[i].paddr == bvec_paddr) break; - sector = &rbio->bio_sectors[i]; - if (sector->page == bv->bv_page && sector->pgoff == bv->bv_offset) + if (rbio->bio_sectors[i].has_paddr && + rbio->bio_sectors[i].paddr == bvec_paddr) break; } ASSERT(i < rbio->nr_sectors); @@ -1575,11 +1585,11 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, return; bio_for_each_segment_all(bvec, bio, iter_all) { - int bv_offset; + void *kaddr; - for (bv_offset = bvec->bv_offset; - bv_offset < bvec->bv_offset + bvec->bv_len; - bv_offset += fs_info->sectorsize, total_sector_nr++) { + kaddr = bvec_kmap_local(bvec); + for (u32 off = 0; off < bvec->bv_len; + off += fs_info->sectorsize, total_sector_nr++) { u8 csum_buf[BTRFS_CSUM_SIZE]; u8 *expected_csum = rbio->csum_buf + total_sector_nr * fs_info->csum_size; @@ -1589,11 +1599,12 @@ static void verify_bio_data_sectors(struct btrfs_raid_bio *rbio, if 
(!test_bit(total_sector_nr, rbio->csum_bitmap)) continue; - ret = btrfs_check_sector_csum(fs_info, bvec->bv_page, - bv_offset, csum_buf, expected_csum); + ret = btrfs_check_sector_csum(fs_info, kaddr + off, + csum_buf, expected_csum); if (ret < 0) set_bit(total_sector_nr, rbio->error_bitmap); } + kunmap_local(kaddr); } } @@ -1689,8 +1700,8 @@ static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule) list_sort(NULL, &plug->rbio_list, plug_cmp); while (!list_empty(&plug->rbio_list)) { - cur = list_entry(plug->rbio_list.next, - struct btrfs_raid_bio, plug_list); + cur = list_first_entry(&plug->rbio_list, + struct btrfs_raid_bio, plug_list); list_del_init(&cur->plug_list); if (rbio_is_full(cur)) { @@ -1791,6 +1802,7 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, struct sector_ptr *sector; u8 csum_buf[BTRFS_CSUM_SIZE]; u8 *csum_expected; + void *kaddr; int ret; if (!rbio->csum_bitmap || !rbio->csum_buf) @@ -1809,13 +1821,12 @@ static int verify_one_sector(struct btrfs_raid_bio *rbio, sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); } - ASSERT(sector->page); - csum_expected = rbio->csum_buf + (stripe_nr * rbio->stripe_nsectors + sector_nr) * fs_info->csum_size; - ret = btrfs_check_sector_csum(fs_info, sector->page, sector->pgoff, - csum_buf, csum_expected); + kaddr = kmap_local_sector(sector); + ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, csum_expected); + kunmap_local(kaddr); return ret; } @@ -1872,9 +1883,7 @@ static int recover_vertical(struct btrfs_raid_bio *rbio, int sector_nr, } else { sector = rbio_stripe_sector(rbio, stripe_nr, sector_nr); } - ASSERT(sector->page); - pointers[stripe_nr] = kmap_local_page(sector->page) + - sector->pgoff; + pointers[stripe_nr] = kmap_local_sector(sector); unmap_array[stripe_nr] = pointers[stripe_nr]; } @@ -2282,9 +2291,8 @@ static int rmw_read_wait_recover(struct btrfs_raid_bio *rbio) static void raid_wait_write_end_io(struct bio *bio) { struct btrfs_raid_bio *rbio = bio->bi_private; - blk_status_t err = bio->bi_status; - if (err) + if (bio->bi_status) rbio_update_error_bitmap(rbio, bio); bio_put(bio); if (atomic_dec_and_test(&rbio->stripes_pending)) @@ -2326,7 +2334,7 @@ static bool need_read_stripe_sectors(struct btrfs_raid_bio *rbio) * thus this rbio can not be cached one, as cached one must * have all its data sectors present and uptodate. 
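To see why the single paddr in the new sector_ptr is enough, note that the page and the in-page offset are both recoverable from it by masking, which is what the phys_to_page() and offset_in_page() calls in the hunks above rely on. A user-space model of that round trip, not from the patch (PAGE_SIZE assumed 4 KiB; names hypothetical):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SHIFT 12
#define PAGE_SIZE  (1UL << PAGE_SHIFT)
#define PAGE_MASK  (~(PAGE_SIZE - 1))

struct sector_ptr {
        uintptr_t paddr;   /* page base | offset inside the page */
        bool has_paddr;
        bool uptodate;
};

static uintptr_t page_base(uintptr_t paddr)
{
        return paddr & PAGE_MASK;      /* models phys_to_page() */
}

static uintptr_t offset_in_page(uintptr_t paddr)
{
        return paddr & ~PAGE_MASK;     /* models offset_in_page() */
}

int main(void)
{
        /* Pretend the sector lives 0x400 bytes into the 8th page. */
        uintptr_t page = 7UL << PAGE_SHIFT;
        struct sector_ptr sec = { .paddr = page + 0x400, .has_paddr = true, .uptodate = true };

        assert(page_base(sec.paddr) == page);
        assert(offset_in_page(sec.paddr) == 0x400);
        printf("page 0x%lx + 0x%lx\n", (unsigned long)page_base(sec.paddr),
               (unsigned long)offset_in_page(sec.paddr));
        return 0;
}

This is also why find_stripe_sector() and get_bio_sector_nr() can now compare one paddr value instead of a {page, pgoff} pair.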
*/ - if (!sector->page || !sector->uptodate) + if (!sector->has_paddr || !sector->uptodate) return true; } return false; @@ -2516,6 +2524,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) int stripe; int sectornr; bool has_qstripe; + struct page *page; struct sector_ptr p_sector = { 0 }; struct sector_ptr q_sector = { 0 }; struct bio_list bio_list; @@ -2547,29 +2556,33 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) */ clear_bit(RBIO_CACHE_READY_BIT, &rbio->flags); - p_sector.page = alloc_page(GFP_NOFS); - if (!p_sector.page) + page = alloc_page(GFP_NOFS); + if (!page) return -ENOMEM; - p_sector.pgoff = 0; + p_sector.has_paddr = true; + p_sector.paddr = page_to_phys(page); p_sector.uptodate = 1; + page = NULL; if (has_qstripe) { /* RAID6, allocate and map temp space for the Q stripe */ - q_sector.page = alloc_page(GFP_NOFS); - if (!q_sector.page) { - __free_page(p_sector.page); - p_sector.page = NULL; + page = alloc_page(GFP_NOFS); + if (!page) { + __free_page(phys_to_page(p_sector.paddr)); + p_sector.has_paddr = false; return -ENOMEM; } - q_sector.pgoff = 0; + q_sector.has_paddr = true; + q_sector.paddr = page_to_phys(page); q_sector.uptodate = 1; - pointers[rbio->real_stripes - 1] = kmap_local_page(q_sector.page); + page = NULL; + pointers[rbio->real_stripes - 1] = kmap_local_sector(&q_sector); } bitmap_clear(rbio->error_bitmap, 0, rbio->nr_sectors); /* Map the parity stripe just once */ - pointers[nr_data] = kmap_local_page(p_sector.page); + pointers[nr_data] = kmap_local_sector(&p_sector); for_each_set_bit(sectornr, &rbio->dbitmap, rbio->stripe_nsectors) { struct sector_ptr *sector; @@ -2578,8 +2591,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) /* first collect one page from each data stripe */ for (stripe = 0; stripe < nr_data; stripe++) { sector = sector_in_rbio(rbio, stripe, sectornr, 0); - pointers[stripe] = kmap_local_page(sector->page) + - sector->pgoff; + pointers[stripe] = kmap_local_sector(sector); } if (has_qstripe) { @@ -2595,7 +2607,7 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) /* Check scrubbing parity and repair it */ sector = rbio_stripe_sector(rbio, rbio->scrubp, sectornr); - parity = kmap_local_page(sector->page) + sector->pgoff; + parity = kmap_local_sector(sector); if (memcmp(parity, pointers[rbio->scrubp], sectorsize) != 0) memcpy(parity, pointers[rbio->scrubp], sectorsize); else @@ -2608,12 +2620,11 @@ static int finish_parity_scrub(struct btrfs_raid_bio *rbio) } kunmap_local(pointers[nr_data]); - __free_page(p_sector.page); - p_sector.page = NULL; - if (q_sector.page) { - kunmap_local(pointers[rbio->real_stripes - 1]); - __free_page(q_sector.page); - q_sector.page = NULL; + __free_page(phys_to_page(p_sector.paddr)); + p_sector.has_paddr = false; + if (q_sector.has_paddr) { + __free_page(phys_to_page(q_sector.paddr)); + q_sector.has_paddr = false; } /* diff --git a/fs/btrfs/reflink.c b/fs/btrfs/reflink.c index f0824c948cb7..62161beca559 100644 --- a/fs/btrfs/reflink.c +++ b/fs/btrfs/reflink.c @@ -87,7 +87,7 @@ static int copy_inline_to_page(struct btrfs_inode *inode, FGP_LOCK | FGP_ACCESSED | FGP_CREAT, btrfs_alloc_write_mask(mapping)); if (IS_ERR(folio)) { - ret = -ENOMEM; + ret = PTR_ERR(folio); goto out_unlock; } @@ -95,9 +95,8 @@ static int copy_inline_to_page(struct btrfs_inode *inode, if (ret < 0) goto out_unlock; - clear_extent_bit(&inode->io_tree, file_offset, range_end, - EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG, - NULL); + btrfs_clear_extent_bits(&inode->io_tree, 
file_offset, range_end, + EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG); ret = btrfs_set_extent_delalloc(inode, file_offset, range_end, 0, NULL); if (ret) goto out_unlock; @@ -165,7 +164,7 @@ out: * the source inode to destination inode when possible. When not possible we * copy the inline extent's data into the respective page of the inode. */ -static int clone_copy_inline_extent(struct inode *dst, +static int clone_copy_inline_extent(struct btrfs_inode *inode, struct btrfs_path *path, struct btrfs_key *new_key, const u64 drop_start, @@ -175,8 +174,8 @@ static int clone_copy_inline_extent(struct inode *dst, char *inline_data, struct btrfs_trans_handle **trans_out) { - struct btrfs_fs_info *fs_info = inode_to_fs_info(dst); - struct btrfs_root *root = BTRFS_I(dst)->root; + struct btrfs_root *root = inode->root; + struct btrfs_fs_info *fs_info = root->fs_info; const u64 aligned_end = ALIGN(new_key->offset + datal, fs_info->sectorsize); struct btrfs_trans_handle *trans = NULL; @@ -185,12 +184,12 @@ static int clone_copy_inline_extent(struct inode *dst, struct btrfs_key key; if (new_key->offset > 0) { - ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + ret = copy_inline_to_page(inode, new_key->offset, inline_data, size, datal, comp_type); goto out; } - key.objectid = btrfs_ino(BTRFS_I(dst)); + key.objectid = btrfs_ino(inode); key.type = BTRFS_EXTENT_DATA_KEY; key.offset = 0; ret = btrfs_search_slot(NULL, root, &key, path, 0, 0); @@ -205,7 +204,7 @@ static int clone_copy_inline_extent(struct inode *dst, goto copy_inline_extent; } btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]); - if (key.objectid == btrfs_ino(BTRFS_I(dst)) && + if (key.objectid == btrfs_ino(inode) && key.type == BTRFS_EXTENT_DATA_KEY) { /* * There's an implicit hole at file offset 0, copy the @@ -214,7 +213,7 @@ static int clone_copy_inline_extent(struct inode *dst, ASSERT(key.offset > 0); goto copy_to_page; } - } else if (i_size_read(dst) <= datal) { + } else if (i_size_read(&inode->vfs_inode) <= datal) { struct btrfs_file_extent_item *ei; ei = btrfs_item_ptr(path->nodes[0], path->slots[0], @@ -236,7 +235,7 @@ copy_inline_extent: * We have no extent items, or we have an extent at offset 0 which may * or may not be inlined. All these cases are dealt the same way. 
*/ - if (i_size_read(dst) > datal) { + if (i_size_read(&inode->vfs_inode) > datal) { /* * At the destination offset 0 we have either a hole, a regular * extent or an inline extent larger than the one we want to @@ -270,7 +269,7 @@ copy_inline_extent: drop_args.start = drop_start; drop_args.end = aligned_end; drop_args.drop_cache = true; - ret = btrfs_drop_extents(trans, root, BTRFS_I(dst), &drop_args); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) goto out; ret = btrfs_insert_empty_item(trans, root, path, new_key, size); @@ -281,9 +280,9 @@ copy_inline_extent: btrfs_item_ptr_offset(path->nodes[0], path->slots[0]), size); - btrfs_update_inode_bytes(BTRFS_I(dst), datal, drop_args.bytes_found); - btrfs_set_inode_full_sync(BTRFS_I(dst)); - ret = btrfs_inode_set_file_extent_range(BTRFS_I(dst), 0, aligned_end); + btrfs_update_inode_bytes(inode, datal, drop_args.bytes_found); + btrfs_set_inode_full_sync(inode); + ret = btrfs_inode_set_file_extent_range(inode, 0, aligned_end); out: if (!ret && !trans) { /* @@ -318,7 +317,7 @@ copy_to_page: */ btrfs_release_path(path); - ret = copy_inline_to_page(BTRFS_I(dst), new_key->offset, + ret = copy_inline_to_page(inode, new_key->offset, inline_data, size, datal, comp_type); goto out; } @@ -526,7 +525,7 @@ process_slot: goto out; } - ret = clone_copy_inline_extent(inode, path, &new_key, + ret = clone_copy_inline_extent(BTRFS_I(inode), path, &new_key, drop_start, datal, size, comp, buf, &trans); if (ret) @@ -617,26 +616,26 @@ out: return ret; } -static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2) +static void btrfs_double_mmap_lock(struct btrfs_inode *inode1, struct btrfs_inode *inode2) { if (inode1 < inode2) swap(inode1, inode2); - down_write(&BTRFS_I(inode1)->i_mmap_lock); - down_write_nested(&BTRFS_I(inode2)->i_mmap_lock, SINGLE_DEPTH_NESTING); + down_write(&inode1->i_mmap_lock); + down_write_nested(&inode2->i_mmap_lock, SINGLE_DEPTH_NESTING); } -static void btrfs_double_mmap_unlock(struct inode *inode1, struct inode *inode2) +static void btrfs_double_mmap_unlock(struct btrfs_inode *inode1, struct btrfs_inode *inode2) { - up_write(&BTRFS_I(inode1)->i_mmap_lock); - up_write(&BTRFS_I(inode2)->i_mmap_lock); + up_write(&inode1->i_mmap_lock); + up_write(&inode2->i_mmap_lock); } -static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, - struct inode *dst, u64 dst_loff) +static int btrfs_extent_same_range(struct btrfs_inode *src, u64 loff, u64 len, + struct btrfs_inode *dst, u64 dst_loff) { const u64 end = dst_loff + len - 1; struct extent_state *cached_state = NULL; - struct btrfs_fs_info *fs_info = BTRFS_I(src)->root->fs_info; + struct btrfs_fs_info *fs_info = src->root->fs_info; const u64 bs = fs_info->sectorsize; int ret; @@ -646,9 +645,10 @@ static int btrfs_extent_same_range(struct inode *src, u64 loff, u64 len, * because we have already locked the inode's i_mmap_lock in exclusive * mode.
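The btrfs_double_mmap_lock() pattern above is the classic deadlock-avoidance trick: acquire the pair in one global (address) order no matter which order the caller passed them in. A user-space model with pthreads, not from the patch (hypothetical names; rwlocks stand in for the inode i_mmap_lock, and lockdep nesting is not modeled):

/* cc demo.c -lpthread && ./a.out */
#include <pthread.h>
#include <stdio.h>

struct xinode {
        pthread_rwlock_t i_mmap_lock;
};

static void double_mmap_lock(struct xinode *a, struct xinode *b)
{
        /* Mirror "if (inode1 < inode2) swap(...)": lock the higher address first. */
        if (a < b) {
                struct xinode *t = a;
                a = b;
                b = t;
        }
        pthread_rwlock_wrlock(&a->i_mmap_lock);
        pthread_rwlock_wrlock(&b->i_mmap_lock);  /* the "nested" acquisition */
}

static void double_mmap_unlock(struct xinode *a, struct xinode *b)
{
        pthread_rwlock_unlock(&a->i_mmap_lock);
        pthread_rwlock_unlock(&b->i_mmap_lock);
}

int main(void)
{
        struct xinode x = { .i_mmap_lock = PTHREAD_RWLOCK_INITIALIZER };
        struct xinode y = { .i_mmap_lock = PTHREAD_RWLOCK_INITIALIZER };

        /* Either argument order acquires the locks in the same global order. */
        double_mmap_lock(&x, &y);
        double_mmap_unlock(&x, &y);
        double_mmap_lock(&y, &x);
        double_mmap_unlock(&y, &x);
        puts("ok");
        return 0;
}

Two tasks reflinking between the same two inodes therefore always contend on the same lock first, so neither can hold one i_mmap_lock while waiting for the other.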
*/ - lock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state); - ret = btrfs_clone(src, dst, loff, len, ALIGN(len, bs), dst_loff, 1); - unlock_extent(&BTRFS_I(dst)->io_tree, dst_loff, end, &cached_state); + btrfs_lock_extent(&dst->io_tree, dst_loff, end, &cached_state); + ret = btrfs_clone(&src->vfs_inode, &dst->vfs_inode, loff, len, + ALIGN(len, bs), dst_loff, 1); + btrfs_unlock_extent(&dst->io_tree, dst_loff, end, &cached_state); btrfs_btree_balance_dirty(fs_info); @@ -678,8 +678,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, chunk_count = div_u64(olen, BTRFS_MAX_DEDUPE_LEN); for (i = 0; i < chunk_count; i++) { - ret = btrfs_extent_same_range(src, loff, BTRFS_MAX_DEDUPE_LEN, - dst, dst_loff); + ret = btrfs_extent_same_range(BTRFS_I(src), loff, BTRFS_MAX_DEDUPE_LEN, + BTRFS_I(dst), dst_loff); if (ret) goto out; @@ -688,7 +688,8 @@ static int btrfs_extent_same(struct inode *src, u64 loff, u64 olen, } if (tail_len > 0) - ret = btrfs_extent_same_range(src, loff, tail_len, dst, dst_loff); + ret = btrfs_extent_same_range(BTRFS_I(src), loff, tail_len, + BTRFS_I(dst), dst_loff); out: spin_lock(&root_dst->root_item_lock); root_dst->dedupe_in_progress--; @@ -747,9 +748,9 @@ static noinline int btrfs_clone_files(struct file *file, struct file *file_src, * mode. */ end = destoff + len - 1; - lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); + btrfs_lock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); ret = btrfs_clone(src, inode, off, olen, len, destoff, 0); - unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, destoff, end, &cached_state); /* * We may have copied an inline extent into a page of the destination @@ -775,24 +776,24 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t *len, unsigned int remap_flags) { - struct inode *inode_in = file_inode(file_in); - struct inode *inode_out = file_inode(file_out); - u64 bs = BTRFS_I(inode_out)->root->fs_info->sectorsize; + struct btrfs_inode *inode_in = BTRFS_I(file_inode(file_in)); + struct btrfs_inode *inode_out = BTRFS_I(file_inode(file_out)); + u64 bs = inode_out->root->fs_info->sectorsize; u64 wb_len; int ret; if (!(remap_flags & REMAP_FILE_DEDUP)) { - struct btrfs_root *root_out = BTRFS_I(inode_out)->root; + struct btrfs_root *root_out = inode_out->root; if (btrfs_root_readonly(root_out)) return -EROFS; - ASSERT(inode_in->i_sb == inode_out->i_sb); + ASSERT(inode_in->vfs_inode.i_sb == inode_out->vfs_inode.i_sb); } /* Don't make the dst file partly checksummed */ - if ((BTRFS_I(inode_in)->flags & BTRFS_INODE_NODATASUM) != - (BTRFS_I(inode_out)->flags & BTRFS_INODE_NODATASUM)) { + if ((inode_in->flags & BTRFS_INODE_NODATASUM) != + (inode_out->flags & BTRFS_INODE_NODATASUM)) { return -EINVAL; } @@ -811,7 +812,7 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, * to complete so that new file extent items are in the fs tree. */ if (*len == 0 && !(remap_flags & REMAP_FILE_DEDUP)) - wb_len = ALIGN(inode_in->i_size, bs) - ALIGN_DOWN(pos_in, bs); + wb_len = ALIGN(inode_in->vfs_inode.i_size, bs) - ALIGN_DOWN(pos_in, bs); else wb_len = ALIGN(*len, bs); @@ -832,16 +833,14 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in, * Also we don't need to check ASYNC_EXTENT, as async extent will be * CoWed anyway, not affecting nocow part. 
*/ - ret = filemap_flush(inode_in->i_mapping); + ret = filemap_flush(inode_in->vfs_inode.i_mapping); if (ret < 0) return ret; - ret = btrfs_wait_ordered_range(BTRFS_I(inode_in), ALIGN_DOWN(pos_in, bs), - wb_len); + ret = btrfs_wait_ordered_range(inode_in, ALIGN_DOWN(pos_in, bs), wb_len); if (ret < 0) return ret; - ret = btrfs_wait_ordered_range(BTRFS_I(inode_out), ALIGN_DOWN(pos_out, bs), - wb_len); + ret = btrfs_wait_ordered_range(inode_out, ALIGN_DOWN(pos_out, bs), wb_len); if (ret < 0) return ret; @@ -863,8 +862,8 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, struct file *dst_file, loff_t destoff, loff_t len, unsigned int remap_flags) { - struct inode *src_inode = file_inode(src_file); - struct inode *dst_inode = file_inode(dst_file); + struct btrfs_inode *src_inode = BTRFS_I(file_inode(src_file)); + struct btrfs_inode *dst_inode = BTRFS_I(file_inode(dst_file)); bool same_inode = dst_inode == src_inode; int ret; @@ -872,9 +871,9 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, return -EINVAL; if (same_inode) { - btrfs_inode_lock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); + btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP); } else { - lock_two_nondirectories(src_inode, dst_inode); + lock_two_nondirectories(&src_inode->vfs_inode, &dst_inode->vfs_inode); btrfs_double_mmap_lock(src_inode, dst_inode); } @@ -884,16 +883,18 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off, goto out_unlock; if (remap_flags & REMAP_FILE_DEDUP) - ret = btrfs_extent_same(src_inode, off, len, dst_inode, destoff); + ret = btrfs_extent_same(&src_inode->vfs_inode, off, len, + &dst_inode->vfs_inode, destoff); else ret = btrfs_clone_files(dst_file, src_file, off, len, destoff); out_unlock: if (same_inode) { - btrfs_inode_unlock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP); + btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP); } else { btrfs_double_mmap_unlock(src_inode, dst_inode); - unlock_two_nondirectories(src_inode, dst_inode); + unlock_two_nondirectories(&src_inode->vfs_inode, + &dst_inode->vfs_inode); } /* diff --git a/fs/btrfs/relocation.c b/fs/btrfs/relocation.c index af0969b70b53..02086191630d 100644 --- a/fs/btrfs/relocation.c +++ b/fs/btrfs/relocation.c @@ -178,8 +178,9 @@ static void mark_block_processed(struct reloc_control *rc, in_range(node->bytenr, rc->block_group->start, rc->block_group->length)) { blocksize = rc->extent_root->fs_info->nodesize; - set_extent_bit(&rc->processed_blocks, node->bytenr, - node->bytenr + blocksize - 1, EXTENT_DIRTY, NULL); + btrfs_set_extent_bit(&rc->processed_blocks, node->bytenr, + node->bytenr + blocksize - 1, EXTENT_DIRTY, + NULL); } node->processed = 1; } @@ -195,8 +196,8 @@ static struct btrfs_backref_node *walk_up_backref( int idx = *index; while (!list_empty(&node->upper)) { - edge = list_entry(node->upper.next, - struct btrfs_backref_edge, list[LOWER]); + edge = list_first_entry(&node->upper, struct btrfs_backref_edge, + list[LOWER]); edges[idx++] = edge; node = edge->node[UPPER]; } @@ -222,8 +223,8 @@ static struct btrfs_backref_node *walk_down_backref( idx--; continue; } - edge = list_entry(edge->list[LOWER].next, - struct btrfs_backref_edge, list[LOWER]); + edge = list_first_entry(&edge->list[LOWER], struct btrfs_backref_edge, + list[LOWER]); edges[idx - 1] = edge; *index = idx; return edge->node[UPPER]; @@ -347,8 +348,8 @@ static bool handle_useless_nodes(struct reloc_control *rc, struct btrfs_backref_edge *edge; struct btrfs_backref_node *lower; - edge = list_entry(cur->lower.next, - struct btrfs_backref_edge, 
list[UPPER]); + edge = list_first_entry(&cur->lower, struct btrfs_backref_edge, + list[UPPER]); list_del(&edge->list[UPPER]); list_del(&edge->list[LOWER]); lower = edge->node[LOWER]; @@ -910,16 +911,16 @@ int replace_file_extents(struct btrfs_trans_handle *trans, /* Take mmap lock to serialize with reflinks. */ if (!down_read_trylock(&inode->i_mmap_lock)) continue; - ret = try_lock_extent(&inode->io_tree, key.offset, - end, &cached_state); + ret = btrfs_try_lock_extent(&inode->io_tree, key.offset, + end, &cached_state); if (!ret) { up_read(&inode->i_mmap_lock); continue; } btrfs_drop_extent_map_range(inode, key.offset, end, true); - unlock_extent(&inode->io_tree, key.offset, end, - &cached_state); + btrfs_unlock_extent(&inode->io_tree, key.offset, end, + &cached_state); up_read(&inode->i_mmap_lock); } } @@ -1378,9 +1379,9 @@ static int invalidate_extent_cache(struct btrfs_root *root, } /* the lock_extent waits for read_folio to complete */ - lock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_lock_extent(&inode->io_tree, start, end, &cached_state); btrfs_drop_extent_map_range(inode, start, end, true); - unlock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); } return 0; } @@ -1697,8 +1698,8 @@ again: rc->merge_reloc_tree = true; while (!list_empty(&rc->reloc_roots)) { - reloc_root = list_entry(rc->reloc_roots.next, - struct btrfs_root, root_list); + reloc_root = list_first_entry(&rc->reloc_roots, + struct btrfs_root, root_list); list_del_init(&reloc_root->root_list); root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, @@ -1813,8 +1814,7 @@ again: while (!list_empty(&reloc_roots)) { found = 1; - reloc_root = list_entry(reloc_roots.next, - struct btrfs_root, root_list); + reloc_root = list_first_entry(&reloc_roots, struct btrfs_root, root_list); root = btrfs_get_fs_root(fs_info, reloc_root->root_key.offset, false); @@ -1930,11 +1930,11 @@ static int record_reloc_root_in_trans(struct btrfs_trans_handle *trans, * reloc root without a corresponding root this could return ENOENT. 
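The list_entry() to list_first_entry() conversions scattered through relocation.c are behavior-preserving: in include/linux/list.h, list_first_entry(head, type, member) is defined as list_entry((head)->next, type, member), so both spellings return the first element. The new form just states the intent directly:

        /* Before: open-coded access to the first element. */
        edge = list_entry(node->upper.next, struct btrfs_backref_edge, list[LOWER]);

        /* After: identical result, explicit about taking the first entry. */
        edge = list_first_entry(&node->upper, struct btrfs_backref_edge, list[LOWER]);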
*/ if (IS_ERR(root)) { - ASSERT(0); + DEBUG_WARN("error %ld reading root for reloc root", PTR_ERR(root)); return PTR_ERR(root); } if (root->reloc_root != reloc_root) { - ASSERT(0); + DEBUG_WARN("unexpected reloc root found"); btrfs_err(fs_info, "root %llu has two reloc roots associated with it", reloc_root->root_key.offset); @@ -2109,8 +2109,8 @@ static noinline_for_stack u64 calcu_metadata_size(struct reloc_control *rc, if (list_empty(&next->upper)) break; - edge = list_entry(next->upper.next, - struct btrfs_backref_edge, list[LOWER]); + edge = list_first_entry(&next->upper, struct btrfs_backref_edge, + list[LOWER]); edges[index++] = edge; next = edge->node[UPPER]; } @@ -2356,8 +2356,8 @@ static int finish_pending_nodes(struct btrfs_trans_handle *trans, for (level = 0; level < BTRFS_MAX_LEVEL; level++) { while (!list_empty(&cache->pending[level])) { - node = list_entry(cache->pending[level].next, - struct btrfs_backref_node, list); + node = list_first_entry(&cache->pending[level], + struct btrfs_backref_node, list); list_move_tail(&node->list, &list); BUG_ON(!node->pending); @@ -2395,8 +2395,8 @@ static void update_processed_blocks(struct reloc_control *rc, if (list_empty(&next->upper)) break; - edge = list_entry(next->upper.next, - struct btrfs_backref_edge, list[LOWER]); + edge = list_first_entry(&next->upper, struct btrfs_backref_edge, + list[LOWER]); edges[index++] = edge; next = edge->node[UPPER]; } @@ -2408,8 +2408,8 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc) { u32 blocksize = rc->extent_root->fs_info->nodesize; - if (test_range_bit(&rc->processed_blocks, bytenr, - bytenr + blocksize - 1, EXTENT_DIRTY, NULL)) + if (btrfs_test_range_bit(&rc->processed_blocks, bytenr, + bytenr + blocksize - 1, EXTENT_DIRTY, NULL)) return 1; return 0; } @@ -2706,9 +2706,6 @@ static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control if (ret < 0) return ret; - clear_extent_bits(&inode->io_tree, i_size, - round_up(i_size, PAGE_SIZE) - 1, - EXTENT_UPTODATE); folio = filemap_lock_folio(mapping, i_size >> PAGE_SHIFT); /* * If page is freed we don't need to do anything then, as we @@ -2738,21 +2735,21 @@ static noinline_for_stack int prealloc_file_extent_cluster(struct reloc_control else end = cluster->end - offset; - lock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_lock_extent(&inode->io_tree, start, end, &cached_state); num_bytes = end + 1 - start; ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start, num_bytes, num_bytes, end + 1, &alloc_hint); cur_offset = end + 1; - unlock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); if (ret) break; } btrfs_inode_unlock(inode, 0); if (cur_offset < prealloc_end) - btrfs_free_reserved_data_space_noquota(inode->root->fs_info, - prealloc_end + 1 - cur_offset); + btrfs_free_reserved_data_space_noquota(inode, + prealloc_end + 1 - cur_offset); return ret; } @@ -2766,7 +2763,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_contr u64 end = rc->cluster.end - offset; int ret = 0; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) return -ENOMEM; @@ -2777,10 +2774,10 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct reloc_contr em->ram_bytes = em->len; em->flags |= EXTENT_FLAG_PINNED; - lock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_lock_extent(&inode->io_tree, start, end, &cached_state); ret = btrfs_replace_extent_map_range(inode, em, false); - 
unlock_extent(&inode->io_tree, start, end, &cached_state); - free_extent_map(em); + btrfs_unlock_extent(&inode->io_tree, start, end, &cached_state); + btrfs_free_extent_map(em); return ret; } @@ -2902,15 +2899,15 @@ again: goto release_folio; /* Mark the range delalloc and dirty for later writeback */ - lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, - &cached_state); + btrfs_lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, + clamped_end, &cached_state); ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start, clamped_end, 0, &cached_state); if (ret) { - clear_extent_bit(&BTRFS_I(inode)->io_tree, - clamped_start, clamped_end, - EXTENT_LOCKED | EXTENT_BOUNDARY, - &cached_state); + btrfs_clear_extent_bit(&BTRFS_I(inode)->io_tree, + clamped_start, clamped_end, + EXTENT_LOCKED | EXTENT_BOUNDARY, + &cached_state); btrfs_delalloc_release_metadata(BTRFS_I(inode), clamped_len, true); btrfs_delalloc_release_extents(BTRFS_I(inode), @@ -2932,12 +2929,12 @@ again: u64 boundary_end = boundary_start + fs_info->sectorsize - 1; - set_extent_bit(&BTRFS_I(inode)->io_tree, - boundary_start, boundary_end, - EXTENT_BOUNDARY, NULL); + btrfs_set_extent_bit(&BTRFS_I(inode)->io_tree, + boundary_start, boundary_end, + EXTENT_BOUNDARY, NULL); } - unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, - &cached_state); + btrfs_unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, + &cached_state); btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len); cur += clamped_len; @@ -3239,21 +3236,23 @@ out: return ret; } -static int delete_block_group_cache(struct btrfs_fs_info *fs_info, - struct btrfs_block_group *block_group, +static int delete_block_group_cache(struct btrfs_block_group *block_group, struct inode *inode, u64 ino) { + struct btrfs_fs_info *fs_info = block_group->fs_info; struct btrfs_root *root = fs_info->tree_root; struct btrfs_trans_handle *trans; + struct btrfs_inode *btrfs_inode; int ret = 0; if (inode) goto truncate; - inode = btrfs_iget(ino, root); - if (IS_ERR(inode)) + btrfs_inode = btrfs_iget(ino, root); + if (IS_ERR(btrfs_inode)) return -ENOENT; + inode = &btrfs_inode->vfs_inode; truncate: ret = btrfs_check_trunc_cache_free_space(fs_info, @@ -3313,8 +3312,7 @@ static int delete_v1_space_cache(struct extent_buffer *leaf, } if (!found) return -ENOENT; - ret = delete_block_group_cache(leaf->fs_info, block_group, NULL, - space_cache_ino); + ret = delete_block_group_cache(block_group, NULL, space_cache_ino); return ret; } @@ -3434,9 +3432,9 @@ next: goto next; } - block_found = find_first_extent_bit(&rc->processed_blocks, - key.objectid, &start, &end, - EXTENT_DIRTY, NULL); + block_found = btrfs_find_first_extent_bit(&rc->processed_blocks, + key.objectid, &start, &end, + EXTENT_DIRTY, NULL); if (block_found && start <= key.objectid) { btrfs_release_path(path); @@ -3645,7 +3643,7 @@ restart: } btrfs_release_path(path); - clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); + btrfs_clear_extent_bits(&rc->processed_blocks, 0, (u64)-1, EXTENT_DIRTY); if (trans) { btrfs_end_transaction_throttle(trans); @@ -3761,10 +3759,10 @@ out: * the inode is in data relocation tree and its link count is 0 */ static noinline_for_stack struct inode *create_reloc_inode( - struct btrfs_fs_info *fs_info, const struct btrfs_block_group *group) { - struct inode *inode = NULL; + struct btrfs_fs_info *fs_info = group->fs_info; + struct btrfs_inode *inode = NULL; struct btrfs_trans_handle *trans; struct btrfs_root *root; u64 objectid; @@ -3792,18 +3790,19 
@@ static noinline_for_stack struct inode *create_reloc_inode( inode = NULL; goto out; } - BTRFS_I(inode)->reloc_block_group_start = group->start; + inode->reloc_block_group_start = group->start; - ret = btrfs_orphan_add(trans, BTRFS_I(inode)); + ret = btrfs_orphan_add(trans, inode); out: btrfs_put_root(root); btrfs_end_transaction(trans); btrfs_btree_balance_dirty(fs_info); if (ret) { - iput(inode); - inode = ERR_PTR(ret); + if (inode) + iput(&inode->vfs_inode); + return ERR_PTR(ret); } - return inode; + return &inode->vfs_inode; } /* @@ -3860,7 +3859,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info) btrfs_backref_init_cache(fs_info, &rc->backref_cache, true); rc->reloc_root_tree.rb_root = RB_ROOT; spin_lock_init(&rc->reloc_root_tree.lock); - extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); + btrfs_extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS); return rc; } @@ -3977,7 +3976,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) btrfs_free_path(path); if (!IS_ERR(inode)) - ret = delete_block_group_cache(fs_info, rc->block_group, inode, 0); + ret = delete_block_group_cache(rc->block_group, inode, 0); else ret = PTR_ERR(inode); @@ -3986,7 +3985,7 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start) goto out; } - rc->data_inode = create_reloc_inode(fs_info, rc->block_group); + rc->data_inode = create_reloc_inode(rc->block_group); if (IS_ERR(rc->data_inode)) { err = PTR_ERR(rc->data_inode); rc->data_inode = NULL; @@ -4183,8 +4182,7 @@ int btrfs_recover_relocation(struct btrfs_fs_info *fs_info) rc->merge_reloc_tree = true; while (!list_empty(&reloc_roots)) { - reloc_root = list_entry(reloc_roots.next, - struct btrfs_root, root_list); + reloc_root = list_first_entry(&reloc_roots, struct btrfs_root, root_list); list_del(&reloc_root->root_list); if (btrfs_root_refs(&reloc_root->root_item) == 0) { @@ -4277,7 +4275,7 @@ int btrfs_reloc_clone_csums(struct btrfs_ordered_extent *ordered) while (!list_empty(&list)) { struct btrfs_ordered_sum *sums = - list_entry(list.next, struct btrfs_ordered_sum, list); + list_first_entry(&list, struct btrfs_ordered_sum, list); list_del_init(&sums->list); diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c index 531312efee8d..ce36fafc771e 100644 --- a/fs/btrfs/scrub.c +++ b/fs/btrfs/scrub.c @@ -66,8 +66,6 @@ struct scrub_ctx; /* Represent one sector and its needed info to verify the content. */ struct scrub_sector_verification { - bool is_metadata; - union { /* * Csum pointer for data csum verification. Should point to a @@ -100,6 +98,38 @@ enum scrub_stripe_flags { SCRUB_STRIPE_FLAG_NO_REPORT, }; +/* + * We have multiple bitmaps for one scrub_stripe. + * However each bitmap has at most (BTRFS_STRIPE_LEN / blocksize) bits, + * which is normally 16, and much smaller than BITS_PER_LONG (32 or 64). + * + * So to reduce memory usage for each scrub_stripe, we pack those bitmaps + * into a larger one. + * + * This enum records where each sub-bitmap is inside the larger one. + * Each sub-bitmap starts at bit scrub_bitmap_nr_##name * nr_sectors. + */ +enum { + /* Which blocks are covered by extent items. */ + scrub_bitmap_nr_has_extent = 0, + + /* Which blocks are metadata. */ + scrub_bitmap_nr_is_metadata, + + /* + * Which blocks have errors, including IO, csum, and metadata + * errors. + * This sub-bitmap is the OR result of the next few error-related + * sub-bitmaps.
+ */ + scrub_bitmap_nr_error, + scrub_bitmap_nr_io_error, + scrub_bitmap_nr_csum_error, + scrub_bitmap_nr_meta_error, + scrub_bitmap_nr_meta_gen_error, + scrub_bitmap_nr_last, +}; + #define SCRUB_STRIPE_PAGES (BTRFS_STRIPE_LEN / PAGE_SIZE) /* @@ -138,36 +168,15 @@ struct scrub_stripe { */ unsigned long state; - /* Indicate which sectors are covered by extent items. */ - unsigned long extent_sector_bitmap; - - /* - * The errors hit during the initial read of the stripe. - * - * Would be utilized for error reporting and repair. - * - * The remaining init_nr_* records the number of errors hit, only used - * by error reporting. - */ - unsigned long init_error_bitmap; - unsigned int init_nr_io_errors; - unsigned int init_nr_csum_errors; - unsigned int init_nr_meta_errors; + /* The large bitmap contains all the sub-bitmaps. */ + unsigned long bitmaps[BITS_TO_LONGS(scrub_bitmap_nr_last * + (BTRFS_STRIPE_LEN / BTRFS_MIN_BLOCKSIZE))]; /* - * The following error bitmaps are all for the current status. - * Every time we submit a new read, these bitmaps may be updated. - * - * error_bitmap = io_error_bitmap | csum_error_bitmap | meta_error_bitmap; - * - * IO and csum errors can happen for both metadata and data. + * For writeback (repair or replace) error reporting. + * This one is protected by a spinlock, thus can not be packed into + * the larger bitmap. */ - unsigned long error_bitmap; - unsigned long io_error_bitmap; - unsigned long csum_error_bitmap; - unsigned long meta_error_bitmap; - - /* For writeback (repair or replace) error reporting. */ unsigned long write_error_bitmap; /* Writeback can be concurrent, thus we need to protect the bitmap. */ @@ -219,6 +228,90 @@ struct scrub_ctx { refcount_t refs; }; +#define scrub_calc_start_bit(stripe, name, block_nr) \ +({ \ + unsigned int __start_bit; \ + \ + ASSERT(block_nr < stripe->nr_sectors, \ + "nr_sectors=%u block_nr=%u", stripe->nr_sectors, block_nr); \ + __start_bit = scrub_bitmap_nr_##name * stripe->nr_sectors + block_nr; \ + __start_bit; \ +}) + +#define IMPLEMENT_SCRUB_BITMAP_OPS(name) \ +static inline void scrub_bitmap_set_##name(struct scrub_stripe *stripe, \ + unsigned int block_nr, \ + unsigned int nr_blocks) \ +{ \ + const unsigned int start_bit = scrub_calc_start_bit(stripe, \ + name, block_nr); \ + \ + bitmap_set(stripe->bitmaps, start_bit, nr_blocks); \ +} \ +static inline void scrub_bitmap_clear_##name(struct scrub_stripe *stripe, \ + unsigned int block_nr, \ + unsigned int nr_blocks) \ +{ \ + const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \ + block_nr); \ + \ + bitmap_clear(stripe->bitmaps, start_bit, nr_blocks); \ +} \ +static inline bool scrub_bitmap_test_bit_##name(struct scrub_stripe *stripe, \ + unsigned int block_nr) \ +{ \ + const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \ + block_nr); \ + \ + return test_bit(start_bit, stripe->bitmaps); \ +} \ +static inline void scrub_bitmap_set_bit_##name(struct scrub_stripe *stripe, \ + unsigned int block_nr) \ +{ \ + const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \ + block_nr); \ + \ + set_bit(start_bit, stripe->bitmaps); \ +} \ +static inline void scrub_bitmap_clear_bit_##name(struct scrub_stripe *stripe, \ + unsigned int block_nr) \ +{ \ + const unsigned int start_bit = scrub_calc_start_bit(stripe, name, \ + block_nr); \ + \ + clear_bit(start_bit, stripe->bitmaps); \ +} \ +static inline unsigned long scrub_bitmap_read_##name(struct scrub_stripe *stripe) \ +{ \ + const unsigned int nr_blocks = stripe->nr_sectors; \ + \ + 
ASSERT(nr_blocks > 0 && nr_blocks <= BITS_PER_LONG, \ + "nr_blocks=%u BITS_PER_LONG=%u", \ + nr_blocks, BITS_PER_LONG); \ + \ + return bitmap_read(stripe->bitmaps, nr_blocks * scrub_bitmap_nr_##name, \ + stripe->nr_sectors); \ +} \ +static inline bool scrub_bitmap_empty_##name(struct scrub_stripe *stripe) \ +{ \ + unsigned long bitmap = scrub_bitmap_read_##name(stripe); \ + \ + return bitmap_empty(&bitmap, stripe->nr_sectors); \ +} \ +static inline unsigned int scrub_bitmap_weight_##name(struct scrub_stripe *stripe) \ +{ \ + unsigned long bitmap = scrub_bitmap_read_##name(stripe); \ + \ + return bitmap_weight(&bitmap, stripe->nr_sectors); \ +} +IMPLEMENT_SCRUB_BITMAP_OPS(has_extent); +IMPLEMENT_SCRUB_BITMAP_OPS(is_metadata); +IMPLEMENT_SCRUB_BITMAP_OPS(error); +IMPLEMENT_SCRUB_BITMAP_OPS(io_error); +IMPLEMENT_SCRUB_BITMAP_OPS(csum_error); +IMPLEMENT_SCRUB_BITMAP_OPS(meta_error); +IMPLEMENT_SCRUB_BITMAP_OPS(meta_gen_error); + struct scrub_warning { struct btrfs_path *path; u64 extent_item_size; @@ -228,6 +321,19 @@ struct scrub_warning { struct btrfs_device *dev; }; +struct scrub_error_records { + /* + * Bitmap recording which blocks hit errors (IO/csum/...) during the + * initial read. + */ + unsigned long init_error_bitmap; + + unsigned int nr_io_errors; + unsigned int nr_csum_errors; + unsigned int nr_meta_errors; + unsigned int nr_meta_gen_errors; +}; + static void release_scrub_stripe(struct scrub_stripe *stripe) { if (!stripe) @@ -579,20 +685,15 @@ static int fill_writer_pointer_gap(struct scrub_ctx *sctx, u64 physical) return ret; } -static struct page *scrub_stripe_get_page(struct scrub_stripe *stripe, int sector_nr) +static void *scrub_stripe_get_kaddr(struct scrub_stripe *stripe, int sector_nr) { - struct btrfs_fs_info *fs_info = stripe->bg->fs_info; - int page_index = (sector_nr << fs_info->sectorsize_bits) >> PAGE_SHIFT; + u32 offset = (sector_nr << stripe->bg->fs_info->sectorsize_bits); + const struct page *page = stripe->pages[offset >> PAGE_SHIFT]; - return stripe->pages[page_index]; -} - -static unsigned int scrub_stripe_get_page_offset(struct scrub_stripe *stripe, - int sector_nr) -{ - struct btrfs_fs_info *fs_info = stripe->bg->fs_info; - - return offset_in_page(sector_nr << fs_info->sectorsize_bits); + /* stripe->pages[] is allocated by us and no highmem is allowed. */ + ASSERT(page); + ASSERT(!PageHighMem(page)); + return page_address(page) + offset_in_page(offset); } static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr) @@ -600,24 +701,22 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr struct btrfs_fs_info *fs_info = stripe->bg->fs_info; const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; const u64 logical = stripe->logical + (sector_nr << fs_info->sectorsize_bits); - const struct page *first_page = scrub_stripe_get_page(stripe, sector_nr); - const unsigned int first_off = scrub_stripe_get_page_offset(stripe, sector_nr); + void *first_kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); + struct btrfs_header *header = first_kaddr; SHASH_DESC_ON_STACK(shash, fs_info->csum_shash); u8 on_disk_csum[BTRFS_CSUM_SIZE]; u8 calculated_csum[BTRFS_CSUM_SIZE]; - struct btrfs_header *header; /* * Here we don't have a good way to attach the pages (and subpages) * to a dummy extent buffer, thus we have to directly grab the members * from pages. 
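With the packed layout introduced above, each sub-bitmap is just a window of nr_sectors bits at a fixed offset, so the bit for block block_nr of sub-bitmap name lives at scrub_bitmap_nr_##name * nr_sectors + block_nr. A small worked sketch, assuming the common 64K-stripe / 4K-block case where nr_sectors is 16 (the numbers are illustrative, not from a real stripe):

        const unsigned int nr_sectors = 16;     /* BTRFS_STRIPE_LEN / 4K blocks */

        /* csum_error (enum value 4) bit of block 5 sits at 4 * 16 + 5 = bit 69. */
        set_bit(scrub_bitmap_nr_csum_error * nr_sectors + 5, stripe->bitmaps);

        /* Read the whole csum_error sub-bitmap back as one word. */
        unsigned long csum_errors = bitmap_read(stripe->bitmaps,
                        scrub_bitmap_nr_csum_error * nr_sectors, nr_sectors);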
*/ - header = (struct btrfs_header *)(page_address(first_page) + first_off); memcpy(on_disk_csum, header->csum, fs_info->csum_size); if (logical != btrfs_stack_header_bytenr(header)) { - bitmap_set(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); - bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad bytenr, has %llu want %llu", logical, stripe->mirror_num, @@ -626,8 +725,8 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr } if (memcmp(header->fsid, fs_info->fs_devices->metadata_uuid, BTRFS_FSID_SIZE) != 0) { - bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); - bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad fsid, has %pU want %pU", logical, stripe->mirror_num, @@ -636,8 +735,8 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr } if (memcmp(header->chunk_tree_uuid, fs_info->chunk_tree_uuid, BTRFS_UUID_SIZE) != 0) { - bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); - bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad chunk tree uuid, has %pU want %pU", logical, stripe->mirror_num, @@ -648,21 +747,18 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr /* Now check tree block csum. 
*/ shash->tfm = fs_info->csum_shash; crypto_shash_init(shash); - crypto_shash_update(shash, page_address(first_page) + first_off + - BTRFS_CSUM_SIZE, fs_info->sectorsize - BTRFS_CSUM_SIZE); + crypto_shash_update(shash, first_kaddr + BTRFS_CSUM_SIZE, + fs_info->sectorsize - BTRFS_CSUM_SIZE); for (int i = sector_nr + 1; i < sector_nr + sectors_per_tree; i++) { - struct page *page = scrub_stripe_get_page(stripe, i); - unsigned int page_off = scrub_stripe_get_page_offset(stripe, i); - - crypto_shash_update(shash, page_address(page) + page_off, + crypto_shash_update(shash, scrub_stripe_get_kaddr(stripe, i), fs_info->sectorsize); } crypto_shash_final(shash, calculated_csum); if (memcmp(calculated_csum, on_disk_csum, fs_info->csum_size) != 0) { - bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); - bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + scrub_bitmap_set_meta_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad csum, has " CSUM_FMT " want " CSUM_FMT, logical, stripe->mirror_num, @@ -672,8 +768,8 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr } if (stripe->sectors[sector_nr].generation != btrfs_stack_header_generation(header)) { - bitmap_set(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); - bitmap_set(&stripe->error_bitmap, sector_nr, sectors_per_tree); + scrub_bitmap_set_meta_gen_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_set_error(stripe, sector_nr, sectors_per_tree); btrfs_warn_rl(fs_info, "tree block %llu mirror %u has bad generation, has %llu want %llu", logical, stripe->mirror_num, @@ -681,9 +777,10 @@ static void scrub_verify_one_metadata(struct scrub_stripe *stripe, int sector_nr stripe->sectors[sector_nr].generation); return; } - bitmap_clear(&stripe->error_bitmap, sector_nr, sectors_per_tree); - bitmap_clear(&stripe->csum_error_bitmap, sector_nr, sectors_per_tree); - bitmap_clear(&stripe->meta_error_bitmap, sector_nr, sectors_per_tree); + scrub_bitmap_clear_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_clear_csum_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_clear_meta_error(stripe, sector_nr, sectors_per_tree); + scrub_bitmap_clear_meta_gen_error(stripe, sector_nr, sectors_per_tree); } static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) @@ -691,23 +788,22 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct scrub_sector_verification *sector = &stripe->sectors[sector_nr]; const u32 sectors_per_tree = fs_info->nodesize >> fs_info->sectorsize_bits; - struct page *page = scrub_stripe_get_page(stripe, sector_nr); - unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr); + void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); u8 csum_buf[BTRFS_CSUM_SIZE]; int ret; ASSERT(sector_nr >= 0 && sector_nr < stripe->nr_sectors); /* Sector not utilized, skip it. */ - if (!test_bit(sector_nr, &stripe->extent_sector_bitmap)) + if (!scrub_bitmap_test_bit_has_extent(stripe, sector_nr)) return; /* IO error, no need to check. */ - if (test_bit(sector_nr, &stripe->io_error_bitmap)) + if (scrub_bitmap_test_bit_io_error(stripe, sector_nr)) return; /* Metadata, verify the full tree block. */ - if (sector->is_metadata) { + if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) { /* * Check if the tree block crosses the stripe boundary. 
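The checksum pass above hashes everything except the csum field itself: it starts at offset BTRFS_CSUM_SIZE into the first block and then feeds each following block of the tree node whole. A condensed sketch of that crypto_shash pattern (assuming an already-allocated fs_info->csum_shash tfm and a kaddr per block, as in the hunk above):

        SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
        u8 result[BTRFS_CSUM_SIZE];

        shash->tfm = fs_info->csum_shash;
        crypto_shash_init(shash);
        /* First block: skip the on-disk csum bytes at the front. */
        crypto_shash_update(shash, kaddr + BTRFS_CSUM_SIZE,
                            fs_info->sectorsize - BTRFS_CSUM_SIZE);
        /* ... one crypto_shash_update() per remaining block of the node ... */
        crypto_shash_final(shash, result);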
If * crossed the boundary, we cannot verify it but only give a @@ -733,17 +829,17 @@ static void scrub_verify_one_sector(struct scrub_stripe *stripe, int sector_nr) * cases without csum, we have no other choice but to trust it. */ if (!sector->csum) { - clear_bit(sector_nr, &stripe->error_bitmap); + scrub_bitmap_clear_bit_error(stripe, sector_nr); return; } - ret = btrfs_check_sector_csum(fs_info, page, pgoff, csum_buf, sector->csum); + ret = btrfs_check_sector_csum(fs_info, kaddr, csum_buf, sector->csum); if (ret < 0) { - set_bit(sector_nr, &stripe->csum_error_bitmap); - set_bit(sector_nr, &stripe->error_bitmap); + scrub_bitmap_set_bit_csum_error(stripe, sector_nr); + scrub_bitmap_set_bit_error(stripe, sector_nr); } else { - clear_bit(sector_nr, &stripe->csum_error_bitmap); - clear_bit(sector_nr, &stripe->error_bitmap); + scrub_bitmap_clear_bit_csum_error(stripe, sector_nr); + scrub_bitmap_clear_bit_error(stripe, sector_nr); } } @@ -756,7 +852,7 @@ static void scrub_verify_one_stripe(struct scrub_stripe *stripe, unsigned long b for_each_set_bit(sector_nr, &bitmap, stripe->nr_sectors) { scrub_verify_one_sector(stripe, sector_nr); - if (stripe->sectors[sector_nr].is_metadata) + if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) sector_nr += sectors_per_tree - 1; } } @@ -766,8 +862,7 @@ static int calc_sector_number(struct scrub_stripe *stripe, struct bio_vec *first int i; for (i = 0; i < stripe->nr_sectors; i++) { - if (scrub_stripe_get_page(stripe, i) == first_bvec->bv_page && - scrub_stripe_get_page_offset(stripe, i) == first_bvec->bv_offset) + if (scrub_stripe_get_kaddr(stripe, i) == bvec_virt(first_bvec)) break; } ASSERT(i < stripe->nr_sectors); @@ -795,13 +890,13 @@ static void scrub_repair_read_endio(struct btrfs_bio *bbio) bio_size += bvec->bv_len; if (bbio->bio.bi_status) { - bitmap_set(&stripe->io_error_bitmap, sector_nr, - bio_size >> fs_info->sectorsize_bits); - bitmap_set(&stripe->error_bitmap, sector_nr, - bio_size >> fs_info->sectorsize_bits); + scrub_bitmap_set_io_error(stripe, sector_nr, + bio_size >> fs_info->sectorsize_bits); + scrub_bitmap_set_error(stripe, sector_nr, + bio_size >> fs_info->sectorsize_bits); } else { - bitmap_clear(&stripe->io_error_bitmap, sector_nr, - bio_size >> fs_info->sectorsize_bits); + scrub_bitmap_clear_io_error(stripe, sector_nr, + bio_size >> fs_info->sectorsize_bits); } bio_put(&bbio->bio); if (atomic_dec_and_test(&stripe->pending_io)) @@ -814,27 +909,39 @@ static int calc_next_mirror(int mirror, int num_copies) return (mirror + 1 > num_copies) ? 1 : mirror + 1; } +static void scrub_bio_add_sector(struct btrfs_bio *bbio, struct scrub_stripe *stripe, + int sector_nr) +{ + void *kaddr = scrub_stripe_get_kaddr(stripe, sector_nr); + int ret; + + ret = bio_add_page(&bbio->bio, virt_to_page(kaddr), bbio->fs_info->sectorsize, + offset_in_page(kaddr)); + /* + * Caller should ensure the bbio has enough size. + * And we cannot use __bio_add_page(), which doesn't do any merge. + * + * Meanwhile for scrub_submit_initial_read() we fully rely on the merge + * to create the minimal amount of bio vectors, for fs block size < page + * size cases. 
+ */ + ASSERT(ret == bbio->fs_info->sectorsize); +} + static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, int mirror, int blocksize, bool wait) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; - const unsigned long old_error_bitmap = stripe->error_bitmap; + const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe); int i; ASSERT(stripe->mirror_num >= 1); ASSERT(atomic_read(&stripe->pending_io) == 0); for_each_set_bit(i, &old_error_bitmap, stripe->nr_sectors) { - struct page *page; - int pgoff; - int ret; - - page = scrub_stripe_get_page(stripe, i); - pgoff = scrub_stripe_get_page_offset(stripe, i); - /* The current sector cannot be merged, submit the bio. */ - if (bbio && ((i > 0 && !test_bit(i - 1, &stripe->error_bitmap)) || + if (bbio && ((i > 0 && !test_bit(i - 1, &old_error_bitmap)) || bbio->bio.bi_iter.bi_size >= blocksize)) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); @@ -851,8 +958,7 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, (i << fs_info->sectorsize_bits)) >> SECTOR_SHIFT; } - ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); - ASSERT(ret == fs_info->sectorsize); + scrub_bio_add_sector(bbio, stripe, i); } if (bbio) { ASSERT(bbio->bio.bi_iter.bi_size); @@ -864,12 +970,15 @@ static void scrub_stripe_submit_repair_read(struct scrub_stripe *stripe, } static void scrub_stripe_report_errors(struct scrub_ctx *sctx, - struct scrub_stripe *stripe) + struct scrub_stripe *stripe, + const struct scrub_error_records *errors) { static DEFINE_RATELIMIT_STATE(rs, DEFAULT_RATELIMIT_INTERVAL, DEFAULT_RATELIMIT_BURST); struct btrfs_fs_info *fs_info = sctx->fs_info; struct btrfs_device *dev = NULL; + const unsigned long extent_bitmap = scrub_bitmap_read_has_extent(stripe); + const unsigned long error_bitmap = scrub_bitmap_read_error(stripe); u64 physical = 0; int nr_data_sectors = 0; int nr_meta_sectors = 0; @@ -886,7 +995,7 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, * Although our scrub_stripe infrastructure is mostly based on btrfs_submit_bio(), * which has no need for dev/physical, error reporting still needs dev and physical. */ - if (!bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) { + if (!bitmap_empty(&errors->init_error_bitmap, stripe->nr_sectors)) { u64 mapped_len = fs_info->sectorsize; struct btrfs_io_context *bioc = NULL; int stripe_index = stripe->mirror_num - 1; @@ -909,10 +1018,10 @@ static void scrub_stripe_report_errors(struct scrub_ctx *sctx, } skip: - for_each_set_bit(sector_nr, &stripe->extent_sector_bitmap, stripe->nr_sectors) { + for_each_set_bit(sector_nr, &extent_bitmap, stripe->nr_sectors) { bool repaired = false; - if (stripe->sectors[sector_nr].is_metadata) { + if (scrub_bitmap_test_bit_is_metadata(stripe, sector_nr)) { nr_meta_sectors++; } else { nr_data_sectors++; @@ -920,14 +1029,14 @@ skip: nr_nodatacsum_sectors++; } - if (test_bit(sector_nr, &stripe->init_error_bitmap) && - !test_bit(sector_nr, &stripe->error_bitmap)) { + if (test_bit(sector_nr, &errors->init_error_bitmap) && + !test_bit(sector_nr, &error_bitmap)) { nr_repaired_sectors++; repaired = true; } /* Good sector from the beginning, nothing needs to be done.
*/ - if (!test_bit(sector_nr, &stripe->init_error_bitmap)) + if (!test_bit(sector_nr, &errors->init_error_bitmap)) continue; /* @@ -960,31 +1069,46 @@ skip: stripe->logical, stripe->mirror_num); } - if (test_bit(sector_nr, &stripe->io_error_bitmap)) + if (scrub_bitmap_test_bit_io_error(stripe, sector_nr)) if (__ratelimit(&rs) && dev) scrub_print_common_warning("i/o error", dev, false, stripe->logical, physical); - if (test_bit(sector_nr, &stripe->csum_error_bitmap)) + if (scrub_bitmap_test_bit_csum_error(stripe, sector_nr)) if (__ratelimit(&rs) && dev) scrub_print_common_warning("checksum error", dev, false, stripe->logical, physical); - if (test_bit(sector_nr, &stripe->meta_error_bitmap)) + if (scrub_bitmap_test_bit_meta_error(stripe, sector_nr)) if (__ratelimit(&rs) && dev) scrub_print_common_warning("header error", dev, false, stripe->logical, physical); + if (scrub_bitmap_test_bit_meta_gen_error(stripe, sector_nr)) + if (__ratelimit(&rs) && dev) + scrub_print_common_warning("generation error", dev, false, + stripe->logical, physical); } + /* Update the device stats. */ + for (int i = 0; i < errors->nr_io_errors; i++) + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_READ_ERRS); + for (int i = 0; i < errors->nr_csum_errors; i++) + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_CORRUPTION_ERRS); + /* Generation mismatch errors are counted once per metadata tree block, not per sector. */ + for (int i = 0; i < errors->nr_meta_gen_errors; + i += (fs_info->nodesize >> fs_info->sectorsize_bits)) + btrfs_dev_stat_inc_and_print(stripe->dev, BTRFS_DEV_STAT_GENERATION_ERRS); + spin_lock(&sctx->stat_lock); sctx->stat.data_extents_scrubbed += stripe->nr_data_extents; sctx->stat.tree_extents_scrubbed += stripe->nr_meta_extents; sctx->stat.data_bytes_scrubbed += nr_data_sectors << fs_info->sectorsize_bits; sctx->stat.tree_bytes_scrubbed += nr_meta_sectors << fs_info->sectorsize_bits; sctx->stat.no_csum += nr_nodatacsum_sectors; - sctx->stat.read_errors += stripe->init_nr_io_errors; - sctx->stat.csum_errors += stripe->init_nr_csum_errors; - sctx->stat.verify_errors += stripe->init_nr_meta_errors; + sctx->stat.read_errors += errors->nr_io_errors; + sctx->stat.csum_errors += errors->nr_csum_errors; + sctx->stat.verify_errors += errors->nr_meta_errors + + errors->nr_meta_gen_errors; sctx->stat.uncorrectable_errors += - bitmap_weight(&stripe->error_bitmap, stripe->nr_sectors); + bitmap_weight(&error_bitmap, stripe->nr_sectors); sctx->stat.corrected_errors += nr_repaired_sectors; spin_unlock(&sctx->stat_lock); } @@ -1010,26 +1134,26 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) struct scrub_stripe *stripe = container_of(work, struct scrub_stripe, work); struct scrub_ctx *sctx = stripe->sctx; struct btrfs_fs_info *fs_info = sctx->fs_info; + struct scrub_error_records errors = { 0 }; int num_copies = btrfs_num_copies(fs_info, stripe->bg->start, stripe->bg->length); unsigned long repaired; + unsigned long error; int mirror; int i; ASSERT(stripe->mirror_num > 0); wait_scrub_stripe_io(stripe); - scrub_verify_one_stripe(stripe, stripe->extent_sector_bitmap); + scrub_verify_one_stripe(stripe, scrub_bitmap_read_has_extent(stripe)); /* Save the initial failed bitmap for later repair and error reporting.
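Because generation-mismatch errors are recorded per sector while the device statistic should count tree blocks, the stats loop above advances by nodesize >> sectorsize_bits per increment. A worked example with assumed sizes of 16K nodes and 4K sectors (numbers are illustrative):

        unsigned int sectors_per_tree = 16384 >> 12;   /* = 4 sectors per tree block */
        unsigned int nr_meta_gen_errors = 8;           /* 8 bad sectors were recorded */
        unsigned int increments = 0;

        for (unsigned int i = 0; i < nr_meta_gen_errors; i += sectors_per_tree)
                increments++;                          /* ends at 2: one per tree block */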
*/ - stripe->init_error_bitmap = stripe->error_bitmap; - stripe->init_nr_io_errors = bitmap_weight(&stripe->io_error_bitmap, - stripe->nr_sectors); - stripe->init_nr_csum_errors = bitmap_weight(&stripe->csum_error_bitmap, - stripe->nr_sectors); - stripe->init_nr_meta_errors = bitmap_weight(&stripe->meta_error_bitmap, - stripe->nr_sectors); - - if (bitmap_empty(&stripe->init_error_bitmap, stripe->nr_sectors)) + errors.init_error_bitmap = scrub_bitmap_read_error(stripe); + errors.nr_io_errors = scrub_bitmap_weight_io_error(stripe); + errors.nr_csum_errors = scrub_bitmap_weight_csum_error(stripe); + errors.nr_meta_errors = scrub_bitmap_weight_meta_error(stripe); + errors.nr_meta_gen_errors = scrub_bitmap_weight_meta_gen_error(stripe); + + if (bitmap_empty(&errors.init_error_bitmap, stripe->nr_sectors)) goto out; /* @@ -1041,13 +1165,13 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) for (mirror = calc_next_mirror(stripe->mirror_num, num_copies); mirror != stripe->mirror_num; mirror = calc_next_mirror(mirror, num_copies)) { - const unsigned long old_error_bitmap = stripe->error_bitmap; + const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe); scrub_stripe_submit_repair_read(stripe, mirror, BTRFS_STRIPE_LEN, false); wait_scrub_stripe_io(stripe); scrub_verify_one_stripe(stripe, old_error_bitmap); - if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) + if (scrub_bitmap_empty_error(stripe)) goto out; } @@ -1065,21 +1189,22 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) for (i = 0, mirror = stripe->mirror_num; i < num_copies; i++, mirror = calc_next_mirror(mirror, num_copies)) { - const unsigned long old_error_bitmap = stripe->error_bitmap; + const unsigned long old_error_bitmap = scrub_bitmap_read_error(stripe); scrub_stripe_submit_repair_read(stripe, mirror, fs_info->sectorsize, true); wait_scrub_stripe_io(stripe); scrub_verify_one_stripe(stripe, old_error_bitmap); - if (bitmap_empty(&stripe->error_bitmap, stripe->nr_sectors)) + if (scrub_bitmap_empty_error(stripe)) goto out; } out: + error = scrub_bitmap_read_error(stripe); /* * Submit the repaired sectors. For zoned case, we cannot do repair * in-place, but queue the bg to be relocated. 
*/ - bitmap_andnot(&repaired, &stripe->init_error_bitmap, &stripe->error_bitmap, + bitmap_andnot(&repaired, &errors.init_error_bitmap, &error, stripe->nr_sectors); if (!sctx->readonly && !bitmap_empty(&repaired, stripe->nr_sectors)) { if (btrfs_is_zoned(fs_info)) { @@ -1090,7 +1215,7 @@ out: } } - scrub_stripe_report_errors(sctx, stripe); + scrub_stripe_report_errors(sctx, stripe, &errors); set_bit(SCRUB_STRIPE_FLAG_REPAIR_DONE, &stripe->state); wake_up(&stripe->repair_wait); } @@ -1110,10 +1235,10 @@ static void scrub_read_endio(struct btrfs_bio *bbio) num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits; if (bbio->bio.bi_status) { - bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors); - bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors); + scrub_bitmap_set_io_error(stripe, sector_nr, num_sectors); + scrub_bitmap_set_error(stripe, sector_nr, num_sectors); } else { - bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors); + scrub_bitmap_clear_io_error(stripe, sector_nr, num_sectors); } bio_put(&bbio->bio); if (atomic_dec_and_test(&stripe->pending_io)) { @@ -1142,6 +1267,9 @@ static void scrub_write_endio(struct btrfs_bio *bbio) bitmap_set(&stripe->write_error_bitmap, sector_nr, bio_size >> fs_info->sectorsize_bits); spin_unlock_irqrestore(&stripe->write_error_lock, flags); + for (int i = 0; i < (bio_size >> fs_info->sectorsize_bits); i++) + btrfs_dev_stat_inc_and_print(stripe->dev, + BTRFS_DEV_STAT_WRITE_ERRS); } bio_put(&bbio->bio); @@ -1199,12 +1327,8 @@ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str int sector_nr; for_each_set_bit(sector_nr, &write_bitmap, stripe->nr_sectors) { - struct page *page = scrub_stripe_get_page(stripe, sector_nr); - unsigned int pgoff = scrub_stripe_get_page_offset(stripe, sector_nr); - int ret; - /* We should only writeback sectors covered by an extent. */ - ASSERT(test_bit(sector_nr, &stripe->extent_sector_bitmap)); + ASSERT(scrub_bitmap_test_bit_has_extent(stripe, sector_nr)); /* Cannot merge with previous sector, submit the current one. 
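The repaired set in the worker above is computed as init_error & ~error: a sector only counts as repaired if it failed the initial read and no longer fails after the retry passes. A toy illustration of that bitmap_andnot() step (bit values invented for the example):

        unsigned long init_error = 0b1100;   /* sectors 2 and 3 failed at first  */
        unsigned long still_bad  = 0b0100;   /* sector 2 remains bad after retry */
        unsigned long repaired;

        bitmap_andnot(&repaired, &init_error, &still_bad, 4);
        /* repaired == 0b1000: only sector 3 was actually fixed. */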
*/ if (bbio && sector_nr && !test_bit(sector_nr - 1, &write_bitmap)) { @@ -1218,8 +1342,7 @@ static void scrub_write_sectors(struct scrub_ctx *sctx, struct scrub_stripe *str (sector_nr << fs_info->sectorsize_bits)) >> SECTOR_SHIFT; } - ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); - ASSERT(ret == fs_info->sectorsize); + scrub_bio_add_sector(bbio, stripe, sector_nr); } if (bbio) scrub_submit_write_bio(sctx, stripe, bbio, dev_replace); @@ -1380,11 +1503,11 @@ static int find_first_extent_item(struct btrfs_root *extent_root, if (path->nodes[0]) goto search_forward; + key.objectid = search_start; if (btrfs_fs_incompat(fs_info, SKINNY_METADATA)) key.type = BTRFS_METADATA_ITEM_KEY; else key.type = BTRFS_EXTENT_ITEM_KEY; - key.objectid = search_start; key.offset = (u64)-1; ret = btrfs_search_slot(NULL, extent_root, &key, path, 0, 0); @@ -1493,9 +1616,9 @@ static void fill_one_extent_info(struct btrfs_fs_info *fs_info, struct scrub_sector_verification *sector = &stripe->sectors[nr_sector]; - set_bit(nr_sector, &stripe->extent_sector_bitmap); + scrub_bitmap_set_bit_has_extent(stripe, nr_sector); if (extent_flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) { - sector->is_metadata = true; + scrub_bitmap_set_bit_is_metadata(stripe, nr_sector); sector->generation = extent_gen; } } @@ -1503,15 +1626,8 @@ static void fill_one_extent_info(struct btrfs_fs_info *fs_info, static void scrub_stripe_reset_bitmaps(struct scrub_stripe *stripe) { - stripe->extent_sector_bitmap = 0; - stripe->init_error_bitmap = 0; - stripe->init_nr_io_errors = 0; - stripe->init_nr_csum_errors = 0; - stripe->init_nr_meta_errors = 0; - stripe->error_bitmap = 0; - stripe->io_error_bitmap = 0; - stripe->csum_error_bitmap = 0; - stripe->meta_error_bitmap = 0; + ASSERT(stripe->nr_sectors); + bitmap_zero(stripe->bitmaps, scrub_bitmap_nr_last * stripe->nr_sectors); } /* @@ -1541,8 +1657,8 @@ static int scrub_find_fill_first_stripe(struct btrfs_block_group *bg, u64 extent_gen; int ret; - if (unlikely(!extent_root)) { - btrfs_err(fs_info, "no valid extent root for scrub"); + if (unlikely(!extent_root || !csum_root)) { + btrfs_err(fs_info, "no valid extent or csum root for scrub"); return -EUCLEAN; } memset(stripe->sectors, 0, sizeof(struct scrub_sector_verification) * @@ -1646,7 +1762,6 @@ static void scrub_reset_stripe(struct scrub_stripe *stripe) stripe->state = 0; for (int i = 0; i < stripe->nr_sectors; i++) { - stripe->sectors[i].is_metadata = false; stripe->sectors[i].csum = NULL; stripe->sectors[i].generation = 0; } @@ -1665,24 +1780,21 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) struct btrfs_fs_info *fs_info = stripe->bg->fs_info; struct btrfs_bio *bbio = NULL; unsigned int nr_sectors = stripe_length(stripe) >> fs_info->sectorsize_bits; + const unsigned long has_extent = scrub_bitmap_read_has_extent(stripe); u64 stripe_len = BTRFS_STRIPE_LEN; int mirror = stripe->mirror_num; int i; atomic_inc(&stripe->pending_io); - for_each_set_bit(i, &stripe->extent_sector_bitmap, stripe->nr_sectors) { - struct page *page = scrub_stripe_get_page(stripe, i); - unsigned int pgoff = scrub_stripe_get_page_offset(stripe, i); - + for_each_set_bit(i, &has_extent, stripe->nr_sectors) { /* We're beyond the chunk boundary, no need to read anymore. */ if (i >= nr_sectors) break; /* The current sector cannot be merged, submit the bio. 
*/ if (bbio && - ((i > 0 && - !test_bit(i - 1, &stripe->extent_sector_bitmap)) || + ((i > 0 && !test_bit(i - 1, &has_extent)) || bbio->bio.bi_iter.bi_size >= stripe_len)) { ASSERT(bbio->bio.bi_iter.bi_size); atomic_inc(&stripe->pending_io); @@ -1716,8 +1828,8 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) * the extent tree, then it's a preallocated * extent and not an error. */ - set_bit(i, &stripe->io_error_bitmap); - set_bit(i, &stripe->error_bitmap); + scrub_bitmap_set_bit_io_error(stripe, i); + scrub_bitmap_set_bit_error(stripe, i); } continue; } @@ -1727,7 +1839,7 @@ static void scrub_submit_extent_sector_read(struct scrub_stripe *stripe) bbio->bio.bi_iter.bi_sector = logical >> SECTOR_SHIFT; } - __bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); + scrub_bio_add_sector(bbio, stripe, i); } if (bbio) { @@ -1765,15 +1877,8 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; /* Read the whole range inside the chunk boundary. */ - for (unsigned int cur = 0; cur < nr_sectors; cur++) { - struct page *page = scrub_stripe_get_page(stripe, cur); - unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur); - int ret; - - ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); - /* We should have allocated enough bio vectors. */ - ASSERT(ret == fs_info->sectorsize); - } + for (unsigned int cur = 0; cur < nr_sectors; cur++) + scrub_bio_add_sector(bbio, stripe, cur); atomic_inc(&stripe->pending_io); /* @@ -1794,10 +1899,11 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, static bool stripe_has_metadata_error(struct scrub_stripe *stripe) { + const unsigned long error = scrub_bitmap_read_error(stripe); int i; - for_each_set_bit(i, &stripe->error_bitmap, stripe->nr_sectors) { - if (stripe->sectors[i].is_metadata) { + for_each_set_bit(i, &error, stripe->nr_sectors) { + if (scrub_bitmap_test_bit_is_metadata(stripe, i)) { struct btrfs_fs_info *fs_info = stripe->bg->fs_info; btrfs_err(fs_info, @@ -1872,13 +1978,16 @@ static int flush_scrub_stripes(struct scrub_ctx *sctx) } for (int i = 0; i < nr_stripes; i++) { unsigned long good; + unsigned long has_extent; + unsigned long error; stripe = &sctx->stripes[i]; ASSERT(stripe->dev == fs_info->dev_replace.srcdev); - bitmap_andnot(&good, &stripe->extent_sector_bitmap, - &stripe->error_bitmap, stripe->nr_sectors); + has_extent = scrub_bitmap_read_has_extent(stripe); + error = scrub_bitmap_read_error(stripe); + bitmap_andnot(&good, &has_extent, &error, stripe->nr_sectors); scrub_write_sectors(sctx, stripe, good, true); } } @@ -2012,7 +2121,7 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, /* Check if all data stripes are empty. */ for (int i = 0; i < data_stripes; i++) { stripe = &sctx->raid56_data_stripes[i]; - if (!bitmap_empty(&stripe->extent_sector_bitmap, stripe->nr_sectors)) { + if (!scrub_bitmap_empty_has_extent(stripe)) { all_empty = false; break; } @@ -2044,15 +2153,18 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, */ for (int i = 0; i < data_stripes; i++) { unsigned long error; + unsigned long has_extent; stripe = &sctx->raid56_data_stripes[i]; + error = scrub_bitmap_read_error(stripe); + has_extent = scrub_bitmap_read_has_extent(stripe); + /* * We should only check the errors where there is an extent. * As we may hit an empty data stripe while it's missing. 
*/ - bitmap_and(&error, &stripe->error_bitmap, - &stripe->extent_sector_bitmap, stripe->nr_sectors); + bitmap_and(&error, &error, &has_extent, stripe->nr_sectors); if (!bitmap_empty(&error, stripe->nr_sectors)) { btrfs_err(fs_info, "unrepaired sectors detected, full stripe %llu data stripe %u errors %*pbl", @@ -2061,8 +2173,8 @@ static int scrub_raid56_parity_stripe(struct scrub_ctx *sctx, ret = -EIO; goto out; } - bitmap_or(&extent_bitmap, &extent_bitmap, - &stripe->extent_sector_bitmap, stripe->nr_sectors); + bitmap_or(&extent_bitmap, &extent_bitmap, &has_extent, + stripe->nr_sectors); } /* Now we can check and regenerate the P/Q stripe. */ @@ -2497,8 +2609,8 @@ int scrub_enumerate_chunks(struct scrub_ctx *sctx, path->skip_locking = 1; key.objectid = scrub_dev->devid; - key.offset = 0ull; key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = 0ull; while (1) { u64 dev_extent_len; @@ -2770,17 +2882,11 @@ static int scrub_one_super(struct scrub_ctx *sctx, struct btrfs_device *dev, struct page *page, u64 physical, u64 generation) { struct btrfs_fs_info *fs_info = sctx->fs_info; - struct bio_vec bvec; - struct bio bio; struct btrfs_super_block *sb = page_address(page); int ret; - bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_READ); - bio.bi_iter.bi_sector = physical >> SECTOR_SHIFT; - __bio_add_page(&bio, page, BTRFS_SUPER_INFO_SIZE, 0); - ret = submit_bio_wait(&bio); - bio_uninit(&bio); - + ret = bdev_rw_virt(dev->bdev, physical >> SECTOR_SHIFT, sb, + BTRFS_SUPER_INFO_SIZE, REQ_OP_READ); if (ret < 0) return ret; ret = btrfs_check_super_csum(fs_info, sb); diff --git a/fs/btrfs/send.c b/fs/btrfs/send.c index f437138fefbc..2891ec4056c6 100644 --- a/fs/btrfs/send.c +++ b/fs/btrfs/send.c @@ -16,7 +16,6 @@ #include <linux/compat.h> #include <linux/crc32c.h> #include <linux/fsverity.h> - #include "send.h" #include "ctree.h" #include "backref.h" @@ -178,6 +177,7 @@ struct send_ctx { u64 cur_inode_rdev; u64 cur_inode_last_extent; u64 cur_inode_next_write_offset; + struct fs_path cur_inode_path; bool cur_inode_new; bool cur_inode_new_gen; bool cur_inode_deleted; @@ -383,11 +383,11 @@ static void inconsistent_snapshot_error(struct send_ctx *sctx, result_string = "updated"; break; case BTRFS_COMPARE_TREE_SAME: - ASSERT(0); + DEBUG_WARN("no change between trees"); result_string = "unchanged"; break; default: - ASSERT(0); + DEBUG_WARN("unexpected comparison result %d", result); result_string = "unexpected"; } @@ -425,15 +425,21 @@ static int need_send_hole(struct send_ctx *sctx) static void fs_path_reset(struct fs_path *p) { - if (p->reversed) { + if (p->reversed) p->start = p->buf + p->buf_len - 1; - p->end = p->start; - *p->start = 0; - } else { + else p->start = p->buf; - p->end = p->start; - *p->start = 0; - } + + p->end = p->start; + *p->start = 0; +} + +static void init_path(struct fs_path *p) +{ + p->reversed = 0; + p->buf = p->inline_buf; + p->buf_len = FS_PATH_INLINE_SIZE; + fs_path_reset(p); } static struct fs_path *fs_path_alloc(void) @@ -443,10 +449,7 @@ static struct fs_path *fs_path_alloc(void) p = kmalloc(sizeof(*p), GFP_KERNEL); if (!p) return NULL; - p->reversed = 0; - p->buf = p->inline_buf; - p->buf_len = FS_PATH_INLINE_SIZE; - fs_path_reset(p); + init_path(p); return p; } @@ -471,7 +474,7 @@ static void fs_path_free(struct fs_path *p) kfree(p); } -static int fs_path_len(struct fs_path *p) +static inline int fs_path_len(const struct fs_path *p) { return p->end - p->start; } @@ -487,12 +490,10 @@ static int fs_path_ensure_buf(struct fs_path *p, int len) if (p->buf_len >= len) return 0; - if (len 
> PATH_MAX) { - WARN_ON(1); - return -ENOMEM; - } + if (WARN_ON(len > PATH_MAX)) + return -ENAMETOOLONG; - path_len = p->end - p->start; + path_len = fs_path_len(p); old_buf_len = p->buf_len; /* @@ -533,12 +534,12 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len, int ret; int new_len; - new_len = p->end - p->start + name_len; + new_len = fs_path_len(p) + name_len; if (p->start != p->end) new_len++; ret = fs_path_ensure_buf(p, new_len); if (ret < 0) - goto out; + return ret; if (p->reversed) { if (p->start != p->end) @@ -553,8 +554,7 @@ static int fs_path_prepare_for_add(struct fs_path *p, int name_len, *p->end = 0; } -out: - return ret; + return 0; } static int fs_path_add(struct fs_path *p, const char *name, int name_len) @@ -564,25 +564,15 @@ static int fs_path_add(struct fs_path *p, const char *name, int name_len) ret = fs_path_prepare_for_add(p, name_len, &prepared); if (ret < 0) - goto out; + return ret; memcpy(prepared, name, name_len); -out: - return ret; + return 0; } -static int fs_path_add_path(struct fs_path *p, struct fs_path *p2) +static inline int fs_path_add_path(struct fs_path *p, const struct fs_path *p2) { - int ret; - char *prepared; - - ret = fs_path_prepare_for_add(p, p2->end - p2->start, &prepared); - if (ret < 0) - goto out; - memcpy(prepared, p2->start, p2->end - p2->start); - -out: - return ret; + return fs_path_add(p, p2->start, fs_path_len(p2)); } static int fs_path_add_from_extent_buffer(struct fs_path *p, @@ -594,12 +584,11 @@ static int fs_path_add_from_extent_buffer(struct fs_path *p, ret = fs_path_prepare_for_add(p, len, &prepared); if (ret < 0) - goto out; + return ret; read_extent_buffer(eb, prepared, off, len); -out: - return ret; + return 0; } static int fs_path_copy(struct fs_path *p, struct fs_path *from) @@ -619,13 +608,21 @@ static void fs_path_unreverse(struct fs_path *p) return; tmp = p->start; - len = p->end - p->start; + len = fs_path_len(p); p->start = p->buf; p->end = p->start + len; memmove(p->start, tmp, len + 1); p->reversed = 0; } +static inline bool is_current_inode_path(const struct send_ctx *sctx, + const struct fs_path *path) +{ + const struct fs_path *cur = &sctx->cur_inode_path; + + return (strncmp(path->start, cur->start, fs_path_len(cur)) == 0); +} + static struct btrfs_path *alloc_path_for_send(void) { struct btrfs_path *path; @@ -740,7 +737,7 @@ static int tlv_put_btrfs_timespec(struct send_ctx *sctx, u16 attr, #define TLV_PUT_PATH(sctx, attrtype, p) \ do { \ ret = tlv_put_string(sctx, attrtype, p->start, \ - p->end - p->start); \ + fs_path_len((p))); \ if (ret < 0) \ goto tlv_put_failure; \ } while(0) @@ -819,14 +816,11 @@ static int send_cmd(struct send_ctx *sctx) static int send_rename(struct send_ctx *sctx, struct fs_path *from, struct fs_path *to) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret; - btrfs_debug(fs_info, "send_rename %s -> %s", from->start, to->start); - ret = begin_cmd(sctx, BTRFS_SEND_C_RENAME); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, from); TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_TO, to); @@ -834,7 +828,6 @@ static int send_rename(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -844,14 +837,11 @@ out: static int send_link(struct send_ctx *sctx, struct fs_path *path, struct fs_path *lnk) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret; - btrfs_debug(fs_info, "send_link %s -> %s", path->start, lnk->start); - ret = begin_cmd(sctx, BTRFS_SEND_C_LINK); if (ret < 0) - goto out; + 
return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH_LINK, lnk); @@ -859,7 +849,6 @@ static int send_link(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -868,21 +857,17 @@ out: */ static int send_unlink(struct send_ctx *sctx, struct fs_path *path) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret; - btrfs_debug(fs_info, "send_unlink %s", path->start); - ret = begin_cmd(sctx, BTRFS_SEND_C_UNLINK); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -891,21 +876,17 @@ out: */ static int send_rmdir(struct send_ctx *sctx, struct fs_path *path) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret; - btrfs_debug(fs_info, "send_rmdir %s", path->start); - ret = begin_cmd(sctx, BTRFS_SEND_C_RMDIR); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -1580,7 +1561,6 @@ static int find_extent_clone(struct send_ctx *sctx, struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret; int extent_type; - u64 logical; u64 disk_byte; u64 num_bytes; struct btrfs_file_extent_item *fi; @@ -1611,7 +1591,6 @@ static int find_extent_clone(struct send_ctx *sctx, compressed = btrfs_file_extent_compression(eb, fi); num_bytes = btrfs_file_extent_num_bytes(eb, fi); - logical = disk_byte + btrfs_file_extent_offset(eb, fi); /* * Setup the clone roots. @@ -1693,14 +1672,8 @@ static int find_extent_clone(struct send_ctx *sctx, } up_read(&fs_info->commit_root_sem); - btrfs_debug(fs_info, - "find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu", - data_offset, ino, num_bytes, logical); - - if (!backref_ctx.found) { - btrfs_debug(fs_info, "no clones found"); + if (!backref_ctx.found) return -ENOENT; - } cur_clone_root = NULL; for (i = 0; i < sctx->clone_roots_cnt; i++) { @@ -1897,7 +1870,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, ret = get_inode_info(sctx->send_root, ino, &info); if (ret < 0 && ret != -ENOENT) - goto out; + return ret; left_ret = (info.nlink == 0) ? -ENOENT : ret; left_gen = info.gen; if (send_gen) @@ -1908,7 +1881,7 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, } else { ret = get_inode_info(sctx->parent_root, ino, &info); if (ret < 0 && ret != -ENOENT) - goto out; + return ret; right_ret = (info.nlink == 0) ? 
-ENOENT : ret; right_gen = info.gen; if (parent_gen) @@ -1953,7 +1926,6 @@ static int get_cur_inode_state(struct send_ctx *sctx, u64 ino, u64 gen, ret = -ENOENT; } -out: return ret; } @@ -1967,17 +1939,14 @@ static int is_inode_existent(struct send_ctx *sctx, u64 ino, u64 gen, ret = get_cur_inode_state(sctx, ino, gen, send_gen, parent_gen); if (ret < 0) - goto out; + return ret; if (ret == inode_state_no_change || ret == inode_state_did_create || ret == inode_state_will_delete) - ret = 1; - else - ret = 0; + return 1; -out: - return ret; + return 0; } /* @@ -2326,9 +2295,8 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, *parent_gen = nce->parent_gen; ret = fs_path_add(dest, nce->name, nce->name_len); if (ret < 0) - goto out; - ret = nce->ret; - goto out; + return ret; + return nce->ret; } } @@ -2339,12 +2307,12 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, */ ret = is_inode_existent(sctx, ino, gen, NULL, NULL); if (ret < 0) - goto out; + return ret; if (!ret) { ret = gen_unique_name(sctx, ino, gen, dest); if (ret < 0) - goto out; + return ret; ret = 1; goto out_cache; } @@ -2360,21 +2328,21 @@ static int __get_cur_name_and_parent(struct send_ctx *sctx, ret = get_first_ref(sctx->parent_root, ino, parent_ino, parent_gen, dest); if (ret < 0) - goto out; + return ret; /* * Check if the ref was overwritten by an inode's ref that was processed * earlier. If yes, treat as orphan and return 1. */ ret = did_overwrite_ref(sctx, *parent_ino, *parent_gen, ino, gen, - dest->start, dest->end - dest->start); + dest->start, fs_path_len(dest)); if (ret < 0) - goto out; + return ret; if (ret) { fs_path_reset(dest); ret = gen_unique_name(sctx, ino, gen, dest); if (ret < 0) - goto out; + return ret; ret = 1; } @@ -2383,10 +2351,8 @@ out_cache: * Store the result of the lookup in the name cache. 
*/ nce = kmalloc(sizeof(*nce) + fs_path_len(dest), GFP_KERNEL); - if (!nce) { - ret = -ENOMEM; - goto out; - } + if (!nce) + return -ENOMEM; nce->entry.key = ino; nce->entry.gen = gen; @@ -2404,10 +2370,9 @@ out_cache: nce_ret = btrfs_lru_cache_store(&sctx->name_cache, &nce->entry, GFP_KERNEL); if (nce_ret < 0) { kfree(nce); - ret = nce_ret; + return nce_ret; } -out: return ret; } @@ -2444,6 +2409,14 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, u64 parent_inode = 0; u64 parent_gen = 0; int stop = 0; + const bool is_cur_inode = (ino == sctx->cur_ino && gen == sctx->cur_inode_gen); + + if (is_cur_inode && fs_path_len(&sctx->cur_inode_path) > 0) { + if (dest != &sctx->cur_inode_path) + return fs_path_copy(dest, &sctx->cur_inode_path); + + return 0; + } name = fs_path_alloc(); if (!name) { @@ -2495,8 +2468,12 @@ static int get_cur_path(struct send_ctx *sctx, u64 ino, u64 gen, out: fs_path_free(name); - if (!ret) + if (!ret) { fs_path_unreverse(dest); + if (is_cur_inode && dest != &sctx->cur_inode_path) + ret = fs_path_copy(&sctx->cur_inode_path, dest); + } + return ret; } @@ -2591,25 +2568,60 @@ out: return ret; } +static struct fs_path *get_cur_inode_path(struct send_ctx *sctx) +{ + if (fs_path_len(&sctx->cur_inode_path) == 0) { + int ret; + + ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, + &sctx->cur_inode_path); + if (ret < 0) + return ERR_PTR(ret); + } + + return &sctx->cur_inode_path; +} + +static struct fs_path *get_path_for_command(struct send_ctx *sctx, u64 ino, u64 gen) +{ + struct fs_path *path; + int ret; + + if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen) + return get_cur_inode_path(sctx); + + path = fs_path_alloc(); + if (!path) + return ERR_PTR(-ENOMEM); + + ret = get_cur_path(sctx, ino, gen, path); + if (ret < 0) { + fs_path_free(path); + return ERR_PTR(ret); + } + + return path; +} + +static void free_path_for_command(const struct send_ctx *sctx, struct fs_path *path) +{ + if (path != &sctx->cur_inode_path) + fs_path_free(path); +} + static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; - btrfs_debug(fs_info, "send_truncate %llu size=%llu", ino, size); - - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_TRUNCATE); if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_SIZE, size); @@ -2617,29 +2629,23 @@ static int send_truncate(struct send_ctx *sctx, u64 ino, u64 gen, u64 size) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); return ret; } static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; - btrfs_debug(fs_info, "send_chmod %llu mode=%llu", ino, mode); - - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_CHMOD); if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_MODE, mode & 07777); @@ -2647,32 +2653,26 @@ static int send_chmod(struct send_ctx *sctx, u64 ino, u64 gen, u64 mode) tlv_put_failure: out: - fs_path_free(p); + 
free_path_for_command(sctx, p); return ret; } static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; if (sctx->proto < 2) return 0; - btrfs_debug(fs_info, "send_fileattr %llu fileattr=%llu", ino, fileattr); - - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_FILEATTR); if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILEATTR, fileattr); @@ -2680,30 +2680,23 @@ static int send_fileattr(struct send_ctx *sctx, u64 ino, u64 gen, u64 fileattr) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); return ret; } static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; - btrfs_debug(fs_info, "send_chown %llu uid=%llu, gid=%llu", - ino, uid, gid); - - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_CHOWN); if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_UID, uid); TLV_PUT_U64(sctx, BTRFS_SEND_A_GID, gid); @@ -2712,13 +2705,12 @@ static int send_chown(struct send_ctx *sctx, u64 ino, u64 gen, u64 uid, u64 gid) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); return ret; } static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p = NULL; struct btrfs_inode_item *ii; @@ -2727,11 +2719,9 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) struct btrfs_key key; int slot; - btrfs_debug(fs_info, "send_utimes %llu", ino); - - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_path_for_command(sctx, ino, gen); + if (IS_ERR(p)) + return PTR_ERR(p); path = alloc_path_for_send(); if (!path) { @@ -2756,9 +2746,6 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) if (ret < 0) goto out; - ret = get_cur_path(sctx, ino, gen, p); - if (ret < 0) - goto out; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_ATIME, eb, &ii->atime); TLV_PUT_BTRFS_TIMESPEC(sctx, BTRFS_SEND_A_MTIME, eb, &ii->mtime); @@ -2770,7 +2757,7 @@ static int send_utimes(struct send_ctx *sctx, u64 ino, u64 gen) tlv_put_failure: out: - fs_path_free(p); + free_path_for_command(sctx, p); btrfs_free_path(path); return ret; } @@ -2838,7 +2825,6 @@ static int trim_dir_utimes_cache(struct send_ctx *sctx) */ static int send_create_inode(struct send_ctx *sctx, u64 ino) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; int cmd; @@ -2847,8 +2833,6 @@ static int send_create_inode(struct send_ctx *sctx, u64 ino) u64 mode; u64 rdev; - btrfs_debug(fs_info, "send_create_inode %llu", ino); - p = fs_path_alloc(); if (!p) return -ENOMEM; @@ -3075,7 +3059,7 @@ static void __free_recorded_refs(struct list_head *head) struct recorded_ref *cur; while (!list_empty(head)) { - cur = list_entry(head->next, struct recorded_ref, list); + cur = list_first_entry(head, struct recorded_ref, list); recorded_ref_free(cur); } } @@ 
-3106,6 +3090,11 @@ static int orphanize_inode(struct send_ctx *sctx, u64 ino, u64 gen, goto out; ret = send_rename(sctx, path, orphan); + if (ret < 0) + goto out; + + if (ino == sctx->cur_ino && gen == sctx->cur_inode_gen) + ret = fs_path_copy(&sctx->cur_inode_path, orphan); out: fs_path_free(orphan); @@ -4158,6 +4147,23 @@ out: return ret; } +static int rename_current_inode(struct send_ctx *sctx, + struct fs_path *current_path, + struct fs_path *new_path) +{ + int ret; + + ret = send_rename(sctx, current_path, new_path); + if (ret < 0) + return ret; + + ret = fs_path_copy(&sctx->cur_inode_path, new_path); + if (ret < 0) + return ret; + + return fs_path_copy(current_path, new_path); +} + /* * This does all the move/link/unlink/rmdir magic. */ @@ -4172,15 +4178,13 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) u64 ow_inode = 0; u64 ow_gen; u64 ow_mode; - int did_overwrite = 0; - int is_orphan = 0; u64 last_dir_ino_rm = 0; + bool did_overwrite = false; + bool is_orphan = false; bool can_rename = true; bool orphanized_dir = false; bool orphanized_ancestor = false; - btrfs_debug(fs_info, "process_recorded_refs %llu", sctx->cur_ino); - /* * This should never happen as the root dir always has the same ref * which is always '..' @@ -4216,14 +4220,14 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (ret < 0) goto out; if (ret) - did_overwrite = 1; + did_overwrite = true; } if (sctx->cur_inode_new || did_overwrite) { ret = gen_unique_name(sctx, sctx->cur_ino, sctx->cur_inode_gen, valid_path); if (ret < 0) goto out; - is_orphan = 1; + is_orphan = true; } else { ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, valid_path); @@ -4348,6 +4352,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) if (ret > 0) { orphanized_ancestor = true; fs_path_reset(valid_path); + fs_path_reset(&sctx->cur_inode_path); ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, valid_path); @@ -4443,13 +4448,10 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * it depending on the inode mode. */ if (is_orphan && can_rename) { - ret = send_rename(sctx, valid_path, cur->full_path); - if (ret < 0) - goto out; - is_orphan = 0; - ret = fs_path_copy(valid_path, cur->full_path); + ret = rename_current_inode(sctx, valid_path, cur->full_path); if (ret < 0) goto out; + is_orphan = false; } else if (can_rename) { if (S_ISDIR(sctx->cur_inode_mode)) { /* @@ -4457,10 +4459,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) * dirs, we always have one new and one deleted * ref. The deleted ref is ignored later. */ - ret = send_rename(sctx, valid_path, - cur->full_path); - if (!ret) - ret = fs_path_copy(valid_path, + ret = rename_current_inode(sctx, valid_path, cur->full_path); if (ret < 0) goto out; @@ -4507,7 +4506,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) sctx->cur_inode_gen, valid_path); if (ret < 0) goto out; - is_orphan = 1; + is_orphan = true; } list_for_each_entry(cur, &sctx->deleted_refs, list) { @@ -4520,8 +4519,7 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) /* * We have a moved dir. 
Add the old parent to check_dirs */ - cur = list_entry(sctx->deleted_refs.next, struct recorded_ref, - list); + cur = list_first_entry(&sctx->deleted_refs, struct recorded_ref, list); ret = dup_ref(cur, &check_dirs); if (ret < 0) goto out; @@ -4553,6 +4551,8 @@ static int process_recorded_refs(struct send_ctx *sctx, int *pending_move) ret = send_unlink(sctx, cur->full_path); if (ret < 0) goto out; + if (is_current_inode_path(sctx, cur->full_path)) + fs_path_reset(&sctx->cur_inode_path); } ret = dup_ref(cur, &check_dirs); if (ret < 0) @@ -4701,7 +4701,7 @@ out: static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { - int ret = 0; + int ret; struct send_ctx *sctx = ctx; struct rb_node *node = NULL; struct recorded_ref data; @@ -4710,7 +4710,7 @@ static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) ret = get_inode_gen(sctx->send_root, dir, &dir_gen); if (ret < 0) - goto out; + return ret; data.dir = dir; data.dir_gen = dir_gen; @@ -4724,13 +4724,13 @@ static int record_new_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) &sctx->new_refs, name, dir, dir_gen, sctx); } -out: + return ret; } static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx) { - int ret = 0; + int ret; struct send_ctx *sctx = ctx; struct rb_node *node = NULL; struct recorded_ref data; @@ -4739,7 +4739,7 @@ static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx ret = get_inode_gen(sctx->parent_root, dir, &dir_gen); if (ret < 0) - goto out; + return ret; data.dir = dir; data.dir_gen = dir_gen; @@ -4753,7 +4753,7 @@ static int record_deleted_ref_if_needed(u64 dir, struct fs_path *name, void *ctx &sctx->deleted_refs, name, dir, dir_gen, sctx); } -out: + return ret; } @@ -4764,11 +4764,9 @@ static int record_new_ref(struct send_ctx *sctx) ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, 0, record_new_ref_if_needed, sctx); if (ret < 0) - goto out; - ret = 0; + return ret; -out: - return ret; + return 0; } static int record_deleted_ref(struct send_ctx *sctx) @@ -4779,29 +4777,25 @@ static int record_deleted_ref(struct send_ctx *sctx) sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx); if (ret < 0) - goto out; - ret = 0; + return ret; -out: - return ret; + return 0; } static int record_changed_ref(struct send_ctx *sctx) { - int ret = 0; + int ret; ret = iterate_inode_ref(sctx->send_root, sctx->left_path, sctx->cmp_key, 0, record_new_ref_if_needed, sctx); if (ret < 0) - goto out; + return ret; ret = iterate_inode_ref(sctx->parent_root, sctx->right_path, sctx->cmp_key, 0, record_deleted_ref_if_needed, sctx); if (ret < 0) - goto out; - ret = 0; + return ret; -out: - return ret; + return 0; } /* @@ -4869,15 +4863,19 @@ out: } static int send_set_xattr(struct send_ctx *sctx, - struct fs_path *path, const char *name, int name_len, const char *data, int data_len) { - int ret = 0; + struct fs_path *path; + int ret; + + path = get_cur_inode_path(sctx); + if (IS_ERR(path)) + return PTR_ERR(path); ret = begin_cmd(sctx, BTRFS_SEND_C_SET_XATTR); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); @@ -4886,7 +4884,6 @@ static int send_set_xattr(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -4894,11 +4891,11 @@ static int send_remove_xattr(struct send_ctx *sctx, struct fs_path *path, const char *name, int name_len) { - int ret = 0; + int ret; ret = begin_cmd(sctx, 
BTRFS_SEND_C_REMOVE_XATTR); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_STRING(sctx, BTRFS_SEND_A_XATTR_NAME, name, name_len); @@ -4906,7 +4903,6 @@ static int send_remove_xattr(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } @@ -4914,19 +4910,13 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, const char *data, int data_len, void *ctx) { - int ret; struct send_ctx *sctx = ctx; - struct fs_path *p; struct posix_acl_xattr_header dummy_acl; /* Capabilities are emitted by finish_inode_if_needed */ if (!strncmp(name, XATTR_NAME_CAPS, name_len)) return 0; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - /* * This hack is needed because empty acls are stored as zero byte * data in xattrs. Problem with that is, that receiving these zero byte @@ -4943,48 +4933,27 @@ static int __process_new_xattr(int num, struct btrfs_key *di_key, } } - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; - - ret = send_set_xattr(sctx, p, name, name_len, data, data_len); - -out: - fs_path_free(p); - return ret; + return send_set_xattr(sctx, name, name_len, data, data_len); } static int __process_deleted_xattr(int num, struct btrfs_key *di_key, const char *name, int name_len, const char *data, int data_len, void *ctx) { - int ret; struct send_ctx *sctx = ctx; struct fs_path *p; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) + return PTR_ERR(p); - ret = send_remove_xattr(sctx, p, name, name_len); - -out: - fs_path_free(p); - return ret; + return send_remove_xattr(sctx, p, name, name_len); } static int process_new_xattr(struct send_ctx *sctx) { - int ret = 0; - - ret = iterate_dir_item(sctx->send_root, sctx->left_path, - __process_new_xattr, sctx); - - return ret; + return iterate_dir_item(sctx->send_root, sctx->left_path, + __process_new_xattr, sctx); } static int process_deleted_xattr(struct send_ctx *sctx) @@ -5100,17 +5069,15 @@ static int __process_changed_deleted_xattr(int num, struct btrfs_key *di_key, static int process_changed_xattr(struct send_ctx *sctx) { - int ret = 0; + int ret; ret = iterate_dir_item(sctx->send_root, sctx->left_path, __process_changed_new_xattr, sctx); if (ret < 0) - goto out; - ret = iterate_dir_item(sctx->parent_root, sctx->right_path, - __process_changed_deleted_xattr, sctx); + return ret; -out: - return ret; + return iterate_dir_item(sctx->parent_root, sctx->right_path, + __process_changed_deleted_xattr, sctx); } static int process_all_new_xattrs(struct send_ctx *sctx) @@ -5157,7 +5124,7 @@ static int send_verity(struct send_ctx *sctx, struct fs_path *path, ret = begin_cmd(sctx, BTRFS_SEND_C_ENABLE_VERITY); if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, path); TLV_PUT_U8(sctx, BTRFS_SEND_A_VERITY_ALGORITHM, @@ -5172,21 +5139,20 @@ static int send_verity(struct send_ctx *sctx, struct fs_path *path, ret = send_cmd(sctx); tlv_put_failure: -out: return ret; } static int process_verity(struct send_ctx *sctx) { int ret = 0; - struct inode *inode; + struct btrfs_inode *inode; struct fs_path *p; inode = btrfs_iget(sctx->cur_ino, sctx->send_root); if (IS_ERR(inode)) return PTR_ERR(inode); - ret = btrfs_get_verity_descriptor(inode, NULL, 0); + ret = btrfs_get_verity_descriptor(&inode->vfs_inode, NULL, 0); if (ret < 0) goto iput; @@ -5203,27 
+5169,19 @@ static int process_verity(struct send_ctx *sctx) } } - ret = btrfs_get_verity_descriptor(inode, sctx->verity_descriptor, ret); + ret = btrfs_get_verity_descriptor(&inode->vfs_inode, sctx->verity_descriptor, ret); if (ret < 0) goto iput; - p = fs_path_alloc(); - if (!p) { - ret = -ENOMEM; + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) { + ret = PTR_ERR(p); goto iput; } - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto free_path; ret = send_verity(sctx, p, sctx->verity_descriptor); - if (ret < 0) - goto free_path; - -free_path: - fs_path_free(p); iput: - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -5263,10 +5221,9 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; - struct folio *folio; - pgoff_t index = offset >> PAGE_SHIFT; - pgoff_t last_index; - unsigned pg_offset = offset_in_page(offset); + u64 cur = offset; + const u64 end = offset + len; + const pgoff_t last_index = ((end - 1) >> PAGE_SHIFT); struct address_space *mapping = sctx->cur_inode->i_mapping; int ret; @@ -5274,13 +5231,12 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len) if (ret) return ret; - last_index = (offset + len - 1) >> PAGE_SHIFT; - - while (index <= last_index) { - unsigned cur_len = min_t(unsigned, len, - PAGE_SIZE - pg_offset); + while (cur < end) { + pgoff_t index = (cur >> PAGE_SHIFT); + unsigned int cur_len; + unsigned int pg_offset; + struct folio *folio; -again: folio = filemap_lock_folio(mapping, index); if (IS_ERR(folio)) { page_cache_sync_readahead(mapping, @@ -5293,8 +5249,8 @@ again: break; } } - - WARN_ON(folio_order(folio)); + pg_offset = offset_in_folio(folio, cur); + cur_len = min_t(unsigned int, end - cur, folio_size(folio) - pg_offset); if (folio_test_readahead(folio)) page_cache_async_readahead(mapping, &sctx->ra, NULL, folio, @@ -5316,7 +5272,7 @@ again: if (folio->mapping != mapping) { folio_unlock(folio); folio_put(folio); - goto again; + continue; } } @@ -5324,9 +5280,7 @@ again: pg_offset, cur_len); folio_unlock(folio); folio_put(folio); - index++; - pg_offset = 0; - len -= cur_len; + cur += cur_len; sctx->send_size += cur_len; } @@ -5339,35 +5293,26 @@ again: */ static int send_write(struct send_ctx *sctx, u64 offset, u32 len) { - struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; int ret = 0; struct fs_path *p; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - - btrfs_debug(fs_info, "send_write offset=%llu, len=%d", offset, len); + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_WRITE); if (ret < 0) - goto out; - - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); ret = put_file_data(sctx, offset, len); if (ret < 0) - goto out; + return ret; ret = send_cmd(sctx); tlv_put_failure: -out: - fs_path_free(p); return ret; } @@ -5380,12 +5325,12 @@ static int send_clone(struct send_ctx *sctx, { int ret = 0; struct fs_path *p; + struct fs_path *cur_inode_path; u64 gen; - btrfs_debug(sctx->send_root->fs_info, - "send_clone offset=%llu, len=%d, clone_root=%llu, clone_inode=%llu, clone_offset=%llu", - offset, len, btrfs_root_id(clone_root->root), - clone_root->ino, clone_root->offset); + cur_inode_path = get_cur_inode_path(sctx); + if (IS_ERR(cur_inode_path)) + return PTR_ERR(cur_inode_path); 
p = fs_path_alloc(); if (!p) @@ -5395,13 +5340,9 @@ static int send_clone(struct send_ctx *sctx, if (ret < 0) goto out; - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; - TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); TLV_PUT_U64(sctx, BTRFS_SEND_A_CLONE_LEN, len); - TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); + TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, cur_inode_path); if (clone_root->root == sctx->send_root) { ret = get_inode_gen(sctx->send_root, clone_root->ino, &gen); @@ -5452,17 +5393,13 @@ static int send_update_extent(struct send_ctx *sctx, int ret = 0; struct fs_path *p; - p = fs_path_alloc(); - if (!p) - return -ENOMEM; + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) + return PTR_ERR(p); ret = begin_cmd(sctx, BTRFS_SEND_C_UPDATE_EXTENT); if (ret < 0) - goto out; - - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto out; + return ret; TLV_PUT_PATH(sctx, BTRFS_SEND_A_PATH, p); TLV_PUT_U64(sctx, BTRFS_SEND_A_FILE_OFFSET, offset); @@ -5471,8 +5408,6 @@ static int send_update_extent(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: - fs_path_free(p); return ret; } @@ -5501,12 +5436,10 @@ static int send_hole(struct send_ctx *sctx, u64 end) if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA) return send_update_extent(sctx, offset, end - offset); - p = fs_path_alloc(); - if (!p) - return -ENOMEM; - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, p); - if (ret < 0) - goto tlv_put_failure; + p = get_cur_inode_path(sctx); + if (IS_ERR(p)) + return PTR_ERR(p); + while (offset < end) { u64 len = min(end - offset, read_size); @@ -5527,7 +5460,6 @@ static int send_hole(struct send_ctx *sctx, u64 end) } sctx->cur_inode_next_write_offset = offset; tlv_put_failure: - fs_path_free(p); return ret; } @@ -5535,9 +5467,7 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, struct btrfs_path *path, u64 offset, u64 len) { - struct btrfs_root *root = sctx->send_root; - struct btrfs_fs_info *fs_info = root->fs_info; - struct inode *inode; + struct btrfs_fs_info *fs_info = sctx->send_root->fs_info; struct fs_path *fspath; struct extent_buffer *leaf = path->nodes[0]; struct btrfs_key key; @@ -5546,23 +5476,13 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, size_t inline_size; int ret; - inode = btrfs_iget(sctx->cur_ino, root); - if (IS_ERR(inode)) - return PTR_ERR(inode); - - fspath = fs_path_alloc(); - if (!fspath) { - ret = -ENOMEM; - goto out; - } + fspath = get_cur_inode_path(sctx); + if (IS_ERR(fspath)) + return PTR_ERR(fspath); ret = begin_cmd(sctx, BTRFS_SEND_C_ENCODED_WRITE); if (ret < 0) - goto out; - - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); - if (ret < 0) - goto out; + return ret; btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); @@ -5578,12 +5498,12 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, ret = btrfs_encoded_io_compression_from_extent(fs_info, btrfs_file_extent_compression(leaf, ei)); if (ret < 0) - goto out; + return ret; TLV_PUT_U32(sctx, BTRFS_SEND_A_COMPRESSION, ret); ret = put_data_header(sctx, inline_size); if (ret < 0) - goto out; + return ret; read_extent_buffer(leaf, sctx->send_buf + sctx->send_size, btrfs_file_extent_inline_start(ei), inline_size); sctx->send_size += inline_size; @@ -5591,9 +5511,6 @@ static int send_encoded_inline_extent(struct send_ctx *sctx, ret = send_cmd(sctx); tlv_put_failure: -out: - 
fs_path_free(fspath); - iput(inode); return ret; } @@ -5602,7 +5519,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, { struct btrfs_root *root = sctx->send_root; struct btrfs_fs_info *fs_info = root->fs_info; - struct inode *inode; + struct btrfs_inode *inode; struct fs_path *fspath; struct extent_buffer *leaf = path->nodes[0]; struct btrfs_key key; @@ -5617,9 +5534,9 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, if (IS_ERR(inode)) return PTR_ERR(inode); - fspath = fs_path_alloc(); - if (!fspath) { - ret = -ENOMEM; + fspath = get_cur_inode_path(sctx); + if (IS_ERR(fspath)) { + ret = PTR_ERR(fspath); goto out; } @@ -5627,10 +5544,6 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, if (ret < 0) goto out; - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); - if (ret < 0) - goto out; - btrfs_item_key_to_cpu(leaf, &key, path->slots[0]); ei = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_file_extent_item); disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, ei); @@ -5672,7 +5585,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, * Note that send_buf is a mapping of send_buf_pages, so this is really * reading into send_buf. */ - ret = btrfs_encoded_read_regular_fill_pages(BTRFS_I(inode), + ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr, disk_num_bytes, sctx->send_buf_pages + (data_offset >> PAGE_SHIFT), @@ -5698,8 +5611,7 @@ static int send_encoded_extent(struct send_ctx *sctx, struct btrfs_path *path, tlv_put_failure: out: - fs_path_free(fspath); - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -5741,15 +5653,14 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, } if (sctx->cur_inode == NULL) { + struct btrfs_inode *btrfs_inode; struct btrfs_root *root = sctx->send_root; - sctx->cur_inode = btrfs_iget(sctx->cur_ino, root); - if (IS_ERR(sctx->cur_inode)) { - int err = PTR_ERR(sctx->cur_inode); + btrfs_inode = btrfs_iget(sctx->cur_ino, root); + if (IS_ERR(btrfs_inode)) + return PTR_ERR(btrfs_inode); - sctx->cur_inode = NULL; - return err; - } + sctx->cur_inode = &btrfs_inode->vfs_inode; memset(&sctx->ra, 0, sizeof(struct file_ra_state)); file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping); @@ -5828,7 +5739,6 @@ static int send_extent_data(struct send_ctx *sctx, struct btrfs_path *path, */ static int send_capabilities(struct send_ctx *sctx) { - struct fs_path *fspath = NULL; struct btrfs_path *path; struct btrfs_dir_item *di; struct extent_buffer *leaf; @@ -5854,25 +5764,19 @@ static int send_capabilities(struct send_ctx *sctx) leaf = path->nodes[0]; buf_len = btrfs_dir_data_len(leaf, di); - fspath = fs_path_alloc(); buf = kmalloc(buf_len, GFP_KERNEL); - if (!fspath || !buf) { + if (!buf) { ret = -ENOMEM; goto out; } - ret = get_cur_path(sctx, sctx->cur_ino, sctx->cur_inode_gen, fspath); - if (ret < 0) - goto out; - data_ptr = (unsigned long)(di + 1) + btrfs_dir_name_len(leaf, di); read_extent_buffer(leaf, buf, data_ptr, buf_len); - ret = send_set_xattr(sctx, fspath, XATTR_NAME_CAPS, + ret = send_set_xattr(sctx, XATTR_NAME_CAPS, strlen(XATTR_NAME_CAPS), buf, buf_len); out: kfree(buf); - fs_path_free(fspath); btrfs_free_path(path); return ret; } @@ -6898,6 +6802,7 @@ static int changed_inode(struct send_ctx *sctx, sctx->cur_inode_last_extent = (u64)-1; sctx->cur_inode_next_write_offset = 0; sctx->ignore_cur_inode = false; + fs_path_reset(&sctx->cur_inode_path); /* * Set send_progress 
to current inode. This will tell all get_cur_xxx @@ -8107,10 +8012,9 @@ static void dedupe_in_progress_warn(const struct btrfs_root *root) btrfs_root_id(root), root->dedupe_in_progress); } -long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg) +long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_send_args *arg) { int ret = 0; - struct btrfs_root *send_root = inode->root; struct btrfs_fs_info *fs_info = send_root->fs_info; struct btrfs_root *clone_root; struct send_ctx *sctx = NULL; @@ -8173,6 +8077,7 @@ long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_a goto out; } + init_path(&sctx->cur_inode_path); INIT_LIST_HEAD(&sctx->new_refs); INIT_LIST_HEAD(&sctx->deleted_refs); @@ -8449,6 +8354,9 @@ out: btrfs_lru_cache_clear(&sctx->dir_created_cache); btrfs_lru_cache_clear(&sctx->dir_utimes_cache); + if (sctx->cur_inode_path.buf != sctx->cur_inode_path.inline_buf) + kfree(sctx->cur_inode_path.buf); + kfree(sctx); } diff --git a/fs/btrfs/send.h b/fs/btrfs/send.h index 9309886c5ea1..652bb28f63d4 100644 --- a/fs/btrfs/send.h +++ b/fs/btrfs/send.h @@ -11,7 +11,7 @@ #include <linux/sizes.h> #include <linux/align.h> -struct btrfs_inode; +struct btrfs_root; struct btrfs_ioctl_send_args; #define BTRFS_SEND_STREAM_MAGIC "btrfs-stream" @@ -182,6 +182,6 @@ enum { __BTRFS_SEND_A_MAX = 35, }; -long btrfs_ioctl_send(struct btrfs_inode *inode, const struct btrfs_ioctl_send_args *arg); +long btrfs_ioctl_send(struct btrfs_root *send_root, const struct btrfs_ioctl_send_args *arg); #endif diff --git a/fs/btrfs/space-info.c b/fs/btrfs/space-info.c index a341d087567a..d9087aa81b21 100644 --- a/fs/btrfs/space-info.c +++ b/fs/btrfs/space-info.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0 -#include "linux/spinlock.h" +#include <linux/spinlock.h> #include <linux/minmax.h> #include "misc.h" #include "ctree.h" @@ -50,11 +50,11 @@ * num_bytes we want to reserve. 
* * ->reserve - * space_info->bytes_may_reserve += num_bytes + * space_info->bytes_may_use += num_bytes * * ->extent allocation * Call btrfs_add_reserved_bytes() which does - * space_info->bytes_may_reserve -= num_bytes + * space_info->bytes_may_use -= num_bytes * space_info->bytes_reserved += extent_bytes * * ->insert reference @@ -234,19 +234,11 @@ void btrfs_update_space_info_chunk_size(struct btrfs_space_info *space_info, WRITE_ONCE(space_info->chunk_size, chunk_size); } -static int create_space_info(struct btrfs_fs_info *info, u64 flags) +static void init_space_info(struct btrfs_fs_info *info, + struct btrfs_space_info *space_info, u64 flags) { - - struct btrfs_space_info *space_info; - int i; - int ret; - - space_info = kzalloc(sizeof(*space_info), GFP_NOFS); - if (!space_info) - return -ENOMEM; - space_info->fs_info = info; - for (i = 0; i < BTRFS_NR_RAID_TYPES; i++) + for (int i = 0; i < BTRFS_NR_RAID_TYPES; i++) INIT_LIST_HEAD(&space_info->block_groups[i]); init_rwsem(&space_info->groups_sem); spin_lock_init(&space_info->lock); @@ -257,9 +249,64 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags) INIT_LIST_HEAD(&space_info->priority_tickets); space_info->clamp = 1; btrfs_update_space_info_chunk_size(space_info, calc_chunk_size(info, flags)); + space_info->subgroup_id = BTRFS_SUB_GROUP_PRIMARY; if (btrfs_is_zoned(info)) space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH; +} + +static int create_space_info_sub_group(struct btrfs_space_info *parent, u64 flags, + enum btrfs_space_info_sub_group id, int index) +{ + struct btrfs_fs_info *fs_info = parent->fs_info; + struct btrfs_space_info *sub_group; + int ret; + + ASSERT(parent->subgroup_id == BTRFS_SUB_GROUP_PRIMARY); + ASSERT(id != BTRFS_SUB_GROUP_PRIMARY); + + sub_group = kzalloc(sizeof(*sub_group), GFP_NOFS); + if (!sub_group) + return -ENOMEM; + + init_space_info(fs_info, sub_group, flags); + parent->sub_group[index] = sub_group; + sub_group->parent = parent; + sub_group->subgroup_id = id; + + ret = btrfs_sysfs_add_space_info_type(fs_info, sub_group); + if (ret) { + kfree(sub_group); + parent->sub_group[index] = NULL; + } + return ret; +} + +static int create_space_info(struct btrfs_fs_info *info, u64 flags) +{ + + struct btrfs_space_info *space_info; + int ret = 0; + + space_info = kzalloc(sizeof(*space_info), GFP_NOFS); + if (!space_info) + return -ENOMEM; + + init_space_info(info, space_info, flags); + + if (btrfs_is_zoned(info)) { + if (flags & BTRFS_BLOCK_GROUP_DATA) + ret = create_space_info_sub_group(space_info, flags, + BTRFS_SUB_GROUP_DATA_RELOC, + 0); + else if (flags & BTRFS_BLOCK_GROUP_METADATA) + ret = create_space_info_sub_group(space_info, flags, + BTRFS_SUB_GROUP_TREELOG, + 0); + + if (ret) + return ret; + } ret = btrfs_sysfs_add_space_info_type(info, space_info); if (ret) @@ -312,31 +359,29 @@ out: void btrfs_add_bg_to_space_info(struct btrfs_fs_info *info, struct btrfs_block_group *block_group) { - struct btrfs_space_info *found; + struct btrfs_space_info *space_info = block_group->space_info; int factor, index; factor = btrfs_bg_type_to_factor(block_group->flags); - found = btrfs_find_space_info(info, block_group->flags); - ASSERT(found); - spin_lock(&found->lock); - found->total_bytes += block_group->length; - found->disk_total += block_group->length * factor; - found->bytes_used += block_group->used; - found->disk_used += block_group->used * factor; - found->bytes_readonly += block_group->bytes_super; - btrfs_space_info_update_bytes_zone_unusable(found, 
block_group->zone_unusable); + spin_lock(&space_info->lock); + space_info->total_bytes += block_group->length; + space_info->disk_total += block_group->length * factor; + space_info->bytes_used += block_group->used; + space_info->disk_used += block_group->used * factor; + space_info->bytes_readonly += block_group->bytes_super; + btrfs_space_info_update_bytes_zone_unusable(space_info, block_group->zone_unusable); if (block_group->length > 0) - found->full = 0; - btrfs_try_granting_tickets(info, found); - spin_unlock(&found->lock); + space_info->full = 0; + btrfs_try_granting_tickets(info, space_info); + spin_unlock(&space_info->lock); - block_group->space_info = found; + block_group->space_info = space_info; index = btrfs_bg_flags_to_raid_index(block_group->flags); - down_write(&found->groups_sem); - list_add_tail(&block_group->list, &found->block_groups[index]); - up_write(&found->groups_sem); + down_write(&space_info->groups_sem); + list_add_tail(&block_group->list, &space_info->block_groups[index]); + up_write(&space_info->groups_sem); } struct btrfs_space_info *btrfs_find_space_info(struct btrfs_fs_info *info, @@ -556,8 +601,9 @@ static void __btrfs_dump_space_info(const struct btrfs_fs_info *fs_info, lockdep_assert_held(&info->lock); /* The free space could be negative in case of overcommit */ - btrfs_info(fs_info, "space_info %s has %lld free, is %sfull", - flag_str, + btrfs_info(fs_info, + "space_info %s (sub-group id %d) has %lld free, is %sfull", + flag_str, info->subgroup_id, (s64)(info->total_bytes - btrfs_space_info_used(info, true)), info->full ? "" : "not "); btrfs_info(fs_info, @@ -812,7 +858,7 @@ static void flush_space(struct btrfs_fs_info *fs_info, ret = PTR_ERR(trans); break; } - ret = btrfs_chunk_alloc(trans, + ret = btrfs_chunk_alloc(trans, space_info, btrfs_get_alloc_profile(fs_info, space_info->flags), (state == ALLOC_CHUNK) ? CHUNK_ALLOC_NO_FORCE : CHUNK_ALLOC_FORCE); @@ -1083,23 +1129,15 @@ static bool maybe_fail_all_tickets(struct btrfs_fs_info *fs_info, return (tickets_id != space_info->tickets_id); } -/* - * This is for normal flushers, we can wait all goddamned day if we want to. We - * will loop and continuously try to flush as long as we are making progress. - * We count progress as clearing off tickets each time we have to loop. - */ -static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +static void do_async_reclaim_metadata_space(struct btrfs_space_info *space_info) { - struct btrfs_fs_info *fs_info; - struct btrfs_space_info *space_info; + struct btrfs_fs_info *fs_info = space_info->fs_info; u64 to_reclaim; enum btrfs_flush_state flush_state; int commit_cycles = 0; u64 last_tickets_id; enum btrfs_flush_state final_state; - fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); - space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); if (btrfs_is_zoned(fs_info)) final_state = RESET_ZONES; else @@ -1174,6 +1212,25 @@ static void btrfs_async_reclaim_metadata_space(struct work_struct *work) } /* + * This is for normal flushers, it can wait as much time as needed. We will + * loop and continuously try to flush as long as we are making progress. We + * count progress as clearing off tickets each time we have to loop. 
+ */ +static void btrfs_async_reclaim_metadata_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + + fs_info = container_of(work, struct btrfs_fs_info, async_reclaim_work); + space_info = btrfs_find_space_info(fs_info, BTRFS_BLOCK_GROUP_METADATA); + do_async_reclaim_metadata_space(space_info); + for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) { + if (space_info->sub_group[i]) + do_async_reclaim_metadata_space(space_info->sub_group[i]); + } +} + +/* * This handles pre-flushing of metadata space before we get to the point that * we need to start blocking threads on tickets. The logic here is different * from the other flush paths because it doesn't rely on tickets to tell us how @@ -1318,16 +1375,12 @@ static const enum btrfs_flush_state data_flush_states[] = { ALLOC_CHUNK_FORCE, }; -static void btrfs_async_reclaim_data_space(struct work_struct *work) +static void do_async_reclaim_data_space(struct btrfs_space_info *space_info) { - struct btrfs_fs_info *fs_info; - struct btrfs_space_info *space_info; + struct btrfs_fs_info *fs_info = space_info->fs_info; u64 last_tickets_id; enum btrfs_flush_state flush_state = 0; - fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work); - space_info = fs_info->data_sinfo; - spin_lock(&space_info->lock); if (list_empty(&space_info->tickets)) { space_info->flush = 0; @@ -1395,6 +1448,19 @@ aborted_fs: spin_unlock(&space_info->lock); } +static void btrfs_async_reclaim_data_space(struct work_struct *work) +{ + struct btrfs_fs_info *fs_info; + struct btrfs_space_info *space_info; + + fs_info = container_of(work, struct btrfs_fs_info, async_data_reclaim_work); + space_info = fs_info->data_sinfo; + do_async_reclaim_data_space(space_info); + for (int i = 0; i < BTRFS_SPACE_INFO_SUB_GROUP_MAX; i++) + if (space_info->sub_group[i]) + do_async_reclaim_data_space(space_info->sub_group[i]); +} + void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info) { INIT_WORK(&fs_info->async_reclaim_work, btrfs_async_reclaim_metadata_space); @@ -1836,10 +1902,10 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info, * This will reserve bytes from the data space info. If there is not enough * space then we will attempt to flush space as specified by flush. 
*/ -int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, +int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush) { - struct btrfs_space_info *data_sinfo = fs_info->data_sinfo; + struct btrfs_fs_info *fs_info = space_info->fs_info; int ret; ASSERT(flush == BTRFS_RESERVE_FLUSH_DATA || @@ -1847,12 +1913,12 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, flush == BTRFS_RESERVE_NO_FLUSH); ASSERT(!current->journal_info || flush != BTRFS_RESERVE_FLUSH_DATA); - ret = __reserve_bytes(fs_info, data_sinfo, bytes, flush); + ret = __reserve_bytes(fs_info, space_info, bytes, flush); if (ret == -ENOSPC) { trace_btrfs_space_reservation(fs_info, "space_info:enospc", - data_sinfo->flags, bytes, 1); + space_info->flags, bytes, 1); if (btrfs_test_opt(fs_info, ENOSPC_DEBUG)) - btrfs_dump_space_info(fs_info, data_sinfo, bytes, 0); + btrfs_dump_space_info(fs_info, space_info, bytes, 0); } return ret; } diff --git a/fs/btrfs/space-info.h b/fs/btrfs/space-info.h index a96efdb5e681..92b7f5e2b850 100644 --- a/fs/btrfs/space-info.h +++ b/fs/btrfs/space-info.h @@ -98,8 +98,18 @@ enum btrfs_flush_state { RESET_ZONES = 12, }; +enum btrfs_space_info_sub_group { + BTRFS_SUB_GROUP_PRIMARY, + BTRFS_SUB_GROUP_DATA_RELOC, + BTRFS_SUB_GROUP_TREELOG, +}; + +#define BTRFS_SPACE_INFO_SUB_GROUP_MAX 1 struct btrfs_space_info { struct btrfs_fs_info *fs_info; + struct btrfs_space_info *parent; + struct btrfs_space_info *sub_group[BTRFS_SPACE_INFO_SUB_GROUP_MAX]; + int subgroup_id; spinlock_t lock; u64 total_bytes; /* total bytes in the space, @@ -288,7 +298,7 @@ static inline void btrfs_space_info_free_bytes_may_use( btrfs_try_granting_tickets(space_info->fs_info, space_info); spin_unlock(&space_info->lock); } -int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes, +int btrfs_reserve_data_bytes(struct btrfs_space_info *space_info, u64 bytes, enum btrfs_reserve_flush_enum flush); void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info); void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info); diff --git a/fs/btrfs/subpage.c b/fs/btrfs/subpage.c index 722acf768396..d4f019233493 100644 --- a/fs/btrfs/subpage.c +++ b/fs/btrfs/subpage.c @@ -2,12 +2,11 @@ #include <linux/slab.h> #include "messages.h" -#include "ctree.h" #include "subpage.h" #include "btrfs_inode.h" /* - * Subpage (sectorsize < PAGE_SIZE) support overview: + * Subpage (block size < folio size) support overview: * * Limitations: * @@ -64,35 +63,15 @@ * This means a slightly higher tree locking latency. */ -#if PAGE_SIZE > SZ_4K -bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping) -{ - if (fs_info->sectorsize >= PAGE_SIZE) - return false; - - /* - * Only data pages (either through DIO or compression) can have no - * mapping. And if page->mapping->host is data inode, it's subpage. - * As we have ruled our sectorsize >= PAGE_SIZE case already. - */ - if (!mapping || !mapping->host || is_data_inode(BTRFS_I(mapping->host))) - return true; - - /* - * Now the only remaining case is metadata, which we only go subpage - * routine if nodesize < PAGE_SIZE. - */ - if (fs_info->nodesize < PAGE_SIZE) - return true; - return false; -} -#endif - int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, enum btrfs_subpage_type type) { struct btrfs_subpage *subpage; + /* For metadata we don't support large folio yet. 
*/ + if (type == BTRFS_SUBPAGE_METADATA) + ASSERT(!folio_test_large(folio)); + /* * We have cases like a dummy extent buffer page, which is not mapped * and doesn't need to be locked. @@ -101,10 +80,14 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, ASSERT(folio_test_locked(folio)); /* Either not subpage, or the folio already has private attached. */ - if (!btrfs_is_subpage(fs_info, folio->mapping) || folio_test_private(folio)) + if (folio_test_private(folio)) + return 0; + if (type == BTRFS_SUBPAGE_METADATA && !btrfs_meta_is_subpage(fs_info)) + return 0; + if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio)) return 0; - subpage = btrfs_alloc_subpage(fs_info, type); + subpage = btrfs_alloc_subpage(fs_info, folio_size(folio), type); if (IS_ERR(subpage)) return PTR_ERR(subpage); @@ -112,12 +95,17 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, return 0; } -void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio) +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, + enum btrfs_subpage_type type) { struct btrfs_subpage *subpage; /* Either not subpage, or the folio already has private attached. */ - if (!btrfs_is_subpage(fs_info, folio->mapping) || !folio_test_private(folio)) + if (!folio_test_private(folio)) + return; + if (type == BTRFS_SUBPAGE_METADATA && !btrfs_meta_is_subpage(fs_info)) + return; + if (type == BTRFS_SUBPAGE_DATA && !btrfs_is_subpage(fs_info, folio)) return; subpage = folio_detach_private(folio); @@ -126,15 +114,16 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *fol } struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, - enum btrfs_subpage_type type) + size_t fsize, enum btrfs_subpage_type type) { struct btrfs_subpage *ret; unsigned int real_size; - ASSERT(fs_info->sectorsize < PAGE_SIZE); + ASSERT(fs_info->sectorsize < fsize); real_size = struct_size(ret, bitmaps, - BITS_TO_LONGS(btrfs_bitmap_nr_max * fs_info->sectors_per_page)); + BITS_TO_LONGS(btrfs_bitmap_nr_max * + (fsize >> fs_info->sectorsize_bits))); ret = kzalloc(real_size, GFP_NOFS); if (!ret) return ERR_PTR(-ENOMEM); @@ -165,7 +154,7 @@ void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio * { struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_meta_is_subpage(fs_info)) return; ASSERT(folio_test_private(folio) && folio->mapping); @@ -179,7 +168,7 @@ void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio * { struct btrfs_subpage *subpage; - if (!btrfs_is_subpage(fs_info, folio->mapping)) + if (!btrfs_meta_is_subpage(fs_info)) return; ASSERT(folio_test_private(folio) && folio->mapping); @@ -193,9 +182,6 @@ void btrfs_folio_dec_eb_refs(const struct btrfs_fs_info *fs_info, struct folio * static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { - /* For subpage support, the folio must be single page. 
*/ - ASSERT(folio_order(folio) == 0); - /* Basic checks */ ASSERT(folio_test_private(folio) && folio_get_private(folio)); ASSERT(IS_ALIGNED(start, fs_info->sectorsize) && @@ -206,16 +192,18 @@ static void btrfs_subpage_assert(const struct btrfs_fs_info *fs_info, */ if (folio->mapping) ASSERT(folio_pos(folio) <= start && - start + len <= folio_pos(folio) + PAGE_SIZE); + start + len <= folio_pos(folio) + folio_size(folio)); } #define subpage_calc_start_bit(fs_info, folio, name, start, len) \ ({ \ - unsigned int __start_bit; \ + unsigned int __start_bit; \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ \ btrfs_subpage_assert(fs_info, folio, start, len); \ - __start_bit = offset_in_page(start) >> fs_info->sectorsize_bits; \ - __start_bit += fs_info->sectors_per_page * btrfs_bitmap_nr_##name; \ + __start_bit = offset_in_folio(folio, start) >> fs_info->sectorsize_bits; \ + __start_bit += blocks_per_folio * btrfs_bitmap_nr_##name; \ __start_bit; \ }) @@ -233,7 +221,7 @@ static void btrfs_subpage_clamp_range(struct folio *folio, u64 *start, u32 *len) if (folio_pos(folio) >= orig_start + orig_len) *len = 0; else - *len = min_t(u64, folio_pos(folio) + PAGE_SIZE, + *len = min_t(u64, folio_pos(folio) + folio_size(folio), orig_start + orig_len) - *start; } @@ -296,7 +284,7 @@ void btrfs_folio_end_lock(const struct btrfs_fs_info *fs_info, ASSERT(folio_test_locked(folio)); - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) { + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio)) { folio_unlock(folio); return; } @@ -323,13 +311,14 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long bitmap) { struct btrfs_subpage *subpage = folio_get_private(folio); - const int start_bit = fs_info->sectors_per_page * btrfs_bitmap_nr_locked; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); + const int start_bit = blocks_per_folio * btrfs_bitmap_nr_locked; unsigned long flags; bool last = false; int cleared = 0; int bit; - if (!btrfs_is_subpage(fs_info, folio->mapping)) { + if (!btrfs_is_subpage(fs_info, folio)) { folio_unlock(folio); return; } @@ -341,7 +330,7 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, } spin_lock_irqsave(&subpage->lock, flags); - for_each_set_bit(bit, &bitmap, fs_info->sectors_per_page) { + for_each_set_bit(bit, &bitmap, blocks_per_folio) { if (test_and_clear_bit(bit + start_bit, subpage->bitmaps)) cleared++; } @@ -352,15 +341,27 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, folio_unlock(folio); } -#define subpage_test_bitmap_all_set(fs_info, subpage, name) \ +#define subpage_test_bitmap_all_set(fs_info, folio, name) \ +({ \ + struct btrfs_subpage *subpage = folio_get_private(folio); \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ + \ bitmap_test_range_all_set(subpage->bitmaps, \ - fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ - fs_info->sectors_per_page) + blocks_per_folio * btrfs_bitmap_nr_##name, \ + blocks_per_folio); \ +}) -#define subpage_test_bitmap_all_zero(fs_info, subpage, name) \ +#define subpage_test_bitmap_all_zero(fs_info, folio, name) \ +({ \ + struct btrfs_subpage *subpage = folio_get_private(folio); \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ + \ bitmap_test_range_all_zero(subpage->bitmaps, \ - fs_info->sectors_per_page * btrfs_bitmap_nr_##name, \ - fs_info->sectors_per_page) + 
blocks_per_folio * btrfs_bitmap_nr_##name, \ + blocks_per_folio); \ +}) void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) @@ -372,7 +373,7 @@ void btrfs_subpage_set_uptodate(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_set(fs_info, subpage, uptodate)) + if (subpage_test_bitmap_all_set(fs_info, folio, uptodate)) folio_mark_uptodate(folio); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -426,7 +427,7 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_zero(fs_info, subpage, dirty)) + if (subpage_test_bitmap_all_zero(fs_info, folio, dirty)) last = true; spin_unlock_irqrestore(&subpage->lock, flags); return last; @@ -467,7 +468,7 @@ void btrfs_subpage_clear_writeback(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_zero(fs_info, subpage, writeback)) { + if (subpage_test_bitmap_all_zero(fs_info, folio, writeback)) { ASSERT(folio_test_writeback(folio)); folio_end_writeback(folio); } @@ -498,7 +499,7 @@ void btrfs_subpage_clear_ordered(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_clear(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_zero(fs_info, subpage, ordered)) + if (subpage_test_bitmap_all_zero(fs_info, folio, ordered)) folio_clear_ordered(folio); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -513,7 +514,7 @@ void btrfs_subpage_set_checked(const struct btrfs_fs_info *fs_info, spin_lock_irqsave(&subpage->lock, flags); bitmap_set(subpage->bitmaps, start_bit, len >> fs_info->sectorsize_bits); - if (subpage_test_bitmap_all_set(fs_info, subpage, checked)) + if (subpage_test_bitmap_all_set(fs_info, folio, checked)) folio_set_checked(folio); spin_unlock_irqrestore(&subpage->lock, flags); } @@ -569,7 +570,7 @@ void btrfs_folio_set_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) { \ + !btrfs_is_subpage(fs_info, folio)) { \ folio_set_func(folio); \ return; \ } \ @@ -579,7 +580,7 @@ void btrfs_folio_clear_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) { \ + !btrfs_is_subpage(fs_info, folio)) { \ folio_clear_func(folio); \ return; \ } \ @@ -589,7 +590,7 @@ bool btrfs_folio_test_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) \ + !btrfs_is_subpage(fs_info, folio)) \ return folio_test_func(folio); \ return btrfs_subpage_test_##name(fs_info, folio, start, len); \ } \ @@ -597,7 +598,7 @@ void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) { \ + !btrfs_is_subpage(fs_info, folio)) { \ folio_set_func(folio); \ return; \ } \ @@ -608,7 +609,7 @@ void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ struct 
folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) { \ + !btrfs_is_subpage(fs_info, folio)) { \ folio_clear_func(folio); \ return; \ } \ @@ -619,10 +620,32 @@ bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len) \ { \ if (unlikely(!fs_info) || \ - !btrfs_is_subpage(fs_info, folio->mapping)) \ + !btrfs_is_subpage(fs_info, folio)) \ return folio_test_func(folio); \ btrfs_subpage_clamp_range(folio, &start, &len); \ return btrfs_subpage_test_##name(fs_info, folio, start, len); \ +} \ +void btrfs_meta_folio_set_##name(struct folio *folio, const struct extent_buffer *eb) \ +{ \ + if (!btrfs_meta_is_subpage(eb->fs_info)) { \ + folio_set_func(folio); \ + return; \ + } \ + btrfs_subpage_set_##name(eb->fs_info, folio, eb->start, eb->len); \ +} \ +void btrfs_meta_folio_clear_##name(struct folio *folio, const struct extent_buffer *eb) \ +{ \ + if (!btrfs_meta_is_subpage(eb->fs_info)) { \ + folio_clear_func(folio); \ + return; \ + } \ + btrfs_subpage_clear_##name(eb->fs_info, folio, eb->start, eb->len); \ +} \ +bool btrfs_meta_folio_test_##name(struct folio *folio, const struct extent_buffer *eb) \ +{ \ + if (!btrfs_meta_is_subpage(eb->fs_info)) \ + return folio_test_func(folio); \ + return btrfs_subpage_test_##name(eb->fs_info, folio, eb->start, eb->len); \ } IMPLEMENT_BTRFS_PAGE_OPS(uptodate, folio_mark_uptodate, folio_clear_uptodate, folio_test_uptodate); @@ -635,26 +658,29 @@ IMPLEMENT_BTRFS_PAGE_OPS(ordered, folio_set_ordered, folio_clear_ordered, IMPLEMENT_BTRFS_PAGE_OPS(checked, folio_set_checked, folio_clear_checked, folio_test_checked); -#define GET_SUBPAGE_BITMAP(subpage, fs_info, name, dst) \ +#define GET_SUBPAGE_BITMAP(fs_info, folio, name, dst) \ { \ - const int sectors_per_page = fs_info->sectors_per_page; \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ + const struct btrfs_subpage *subpage = folio_get_private(folio); \ \ - ASSERT(sectors_per_page < BITS_PER_LONG); \ + ASSERT(blocks_per_folio <= BITS_PER_LONG); \ *dst = bitmap_read(subpage->bitmaps, \ - sectors_per_page * btrfs_bitmap_nr_##name, \ - sectors_per_page); \ + blocks_per_folio * btrfs_bitmap_nr_##name, \ + blocks_per_folio); \ } #define SUBPAGE_DUMP_BITMAP(fs_info, folio, name, start, len) \ { \ - const struct btrfs_subpage *subpage = folio_get_private(folio); \ unsigned long bitmap; \ + const unsigned int blocks_per_folio = \ + btrfs_blocks_per_folio(fs_info, folio); \ \ - GET_SUBPAGE_BITMAP(subpage, fs_info, name, &bitmap); \ + GET_SUBPAGE_BITMAP(fs_info, folio, name, &bitmap); \ btrfs_warn(fs_info, \ "dumpping bitmap start=%llu len=%u folio=%llu " #name "_bitmap=%*pbl", \ start, len, folio_pos(folio), \ - fs_info->sectors_per_page, &bitmap); \ + blocks_per_folio, &bitmap); \ } /* @@ -672,7 +698,7 @@ void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, if (!IS_ENABLED(CONFIG_BTRFS_ASSERT)) return; - if (!btrfs_is_subpage(fs_info, folio->mapping)) { + if (!btrfs_is_subpage(fs_info, folio)) { ASSERT(!folio_test_dirty(folio)); return; } @@ -707,7 +733,7 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, int ret; ASSERT(folio_test_locked(folio)); - if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio->mapping)) + if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, folio)) return; subpage = folio_get_private(folio); @@ -721,15 +747,37 @@ void btrfs_folio_set_lock(const struct btrfs_fs_info *fs_info, } 
bitmap_set(subpage->bitmaps, start_bit, nbits); ret = atomic_add_return(nbits, &subpage->nr_locked); - ASSERT(ret <= fs_info->sectors_per_page); + ASSERT(ret <= btrfs_blocks_per_folio(fs_info, folio)); spin_unlock_irqrestore(&subpage->lock, flags); } +/* + * Clear the dirty flag for the folio. + * + * If the affected folio is no longer dirty, return true. Otherwise return false. + */ +bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct extent_buffer *eb) +{ + bool last; + + if (!btrfs_meta_is_subpage(eb->fs_info)) { + folio_clear_dirty_for_io(folio); + return true; + } + + last = btrfs_subpage_clear_and_test_dirty(eb->fs_info, folio, eb->start, eb->len); + if (last) { + folio_clear_dirty_for_io(folio); + return true; + } + return false; +} + void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len) { struct btrfs_subpage *subpage; - const u32 sectors_per_page = fs_info->sectors_per_page; + const unsigned int blocks_per_folio = btrfs_blocks_per_folio(fs_info, folio); unsigned long uptodate_bitmap; unsigned long dirty_bitmap; unsigned long writeback_bitmap; @@ -739,28 +787,28 @@ void __cold btrfs_subpage_dump_bitmap(const struct btrfs_fs_info *fs_info, unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); - ASSERT(sectors_per_page > 1); + ASSERT(blocks_per_folio > 1); subpage = folio_get_private(folio); spin_lock_irqsave(&subpage->lock, flags); - GET_SUBPAGE_BITMAP(subpage, fs_info, uptodate, &uptodate_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, &dirty_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, writeback, &writeback_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, ordered, &ordered_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, checked, &checked_bitmap); - GET_SUBPAGE_BITMAP(subpage, fs_info, locked, &locked_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, uptodate, &uptodate_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, dirty, &dirty_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, writeback, &writeback_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, ordered, &ordered_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, checked, &checked_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, locked, &locked_bitmap); spin_unlock_irqrestore(&subpage->lock, flags); dump_page(folio_page(folio, 0), "btrfs subpage dump"); btrfs_warn(fs_info, "start=%llu len=%u page=%llu, bitmaps uptodate=%*pbl dirty=%*pbl locked=%*pbl writeback=%*pbl ordered=%*pbl checked=%*pbl", start, len, folio_pos(folio), - sectors_per_page, &uptodate_bitmap, - sectors_per_page, &dirty_bitmap, - sectors_per_page, &locked_bitmap, - sectors_per_page, &writeback_bitmap, - sectors_per_page, &ordered_bitmap, - sectors_per_page, &checked_bitmap); + blocks_per_folio, &uptodate_bitmap, + blocks_per_folio, &dirty_bitmap, + blocks_per_folio, &locked_bitmap, + blocks_per_folio, &writeback_bitmap, + blocks_per_folio, &ordered_bitmap, + blocks_per_folio, &checked_bitmap); } void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, @@ -771,10 +819,10 @@ void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, unsigned long flags; ASSERT(folio_test_private(folio) && folio_get_private(folio)); - ASSERT(fs_info->sectors_per_page > 1); + ASSERT(btrfs_blocks_per_folio(fs_info, folio) > 1); subpage = folio_get_private(folio); spin_lock_irqsave(&subpage->lock, flags); - GET_SUBPAGE_BITMAP(subpage, fs_info, dirty, ret_bitmap); + GET_SUBPAGE_BITMAP(fs_info, folio, dirty, ret_bitmap); spin_unlock_irqrestore(&subpage->lock, 
flags); } diff --git a/fs/btrfs/subpage.h b/fs/btrfs/subpage.h index 44fff1f4eac4..3042c5ea840a 100644 --- a/fs/btrfs/subpage.h +++ b/fs/btrfs/subpage.h @@ -6,10 +6,11 @@ #include <linux/spinlock.h> #include <linux/atomic.h> #include <linux/sizes.h> +#include "btrfs_inode.h" +#include "fs.h" struct address_space; struct folio; -struct btrfs_fs_info; /* * Extra info for subpage bitmap. @@ -69,23 +70,49 @@ enum btrfs_subpage_type { BTRFS_SUBPAGE_DATA, }; -#if PAGE_SIZE > SZ_4K -bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct address_space *mapping); +#if PAGE_SIZE > BTRFS_MIN_BLOCKSIZE +/* + * Subpage support for metadata is more complex, as we can have dummy extent + * buffers, where folios have no mapping to determine the owning inode. + * + * Thankfully we only need to check if node size is smaller than page size. + * Even with larger folio support, we will only allocate a folio as large as + * node size. + * Thus if nodesize < PAGE_SIZE, we know metadata needs the subpage routines. + */ +static inline bool btrfs_meta_is_subpage(const struct btrfs_fs_info *fs_info) +{ + return fs_info->nodesize < PAGE_SIZE; +} +static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, + struct folio *folio) +{ + if (folio->mapping && folio->mapping->host) + ASSERT(is_data_inode(BTRFS_I(folio->mapping->host))); + return fs_info->sectorsize < folio_size(folio); +} #else +static inline bool btrfs_meta_is_subpage(const struct btrfs_fs_info *fs_info) +{ + return false; +} static inline bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, - struct address_space *mapping) + struct folio *folio) { + if (folio->mapping && folio->mapping->host) + ASSERT(is_data_inode(BTRFS_I(folio->mapping->host))); return false; } #endif int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, enum btrfs_subpage_type type); -void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio); +void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info, struct folio *folio, + enum btrfs_subpage_type type); /* Allocate additional data where page represents more than one sector */ struct btrfs_subpage *btrfs_alloc_subpage(const struct btrfs_fs_info *fs_info, - enum btrfs_subpage_type type); + size_t fsize, enum btrfs_subpage_type type); void btrfs_free_subpage(struct btrfs_subpage *subpage); void btrfs_folio_inc_eb_refs(const struct btrfs_fs_info *fs_info, struct folio *folio); @@ -110,6 +137,13 @@ void btrfs_folio_end_lock_bitmap(const struct btrfs_fs_info *fs_info, * btrfs_folio_clamp_*() are similar to btrfs_folio_*(), except the range doesn't * need to be inside the page. Those functions will truncate the range * automatically. + * + * Both btrfs_folio_*() and btrfs_folio_clamp_*() are for data folios. + * + * For metadata, one should use btrfs_meta_folio_*() helpers instead, and there + * is no clamp version for metadata helpers, as we either go subpage + * (nodesize < PAGE_SIZE) or go regular folio helpers (nodesize >= PAGE_SIZE, + * and our folio is never larger than nodesize).
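+ *
+ * A minimal usage sketch (hypothetical caller; assumes the eb is backed by a
+ * single folio covering the whole eb range):
+ *
+ *   struct folio *folio = eb->folios[0];
+ *
+ *   btrfs_meta_folio_set_uptodate(folio, eb);
+ *   if (btrfs_meta_folio_test_uptodate(folio, eb))
+ *       ; /* the whole eb range is now uptodate */
+ *
+ * With nodesize >= PAGE_SIZE these wrappers fall back to the plain folio
+ * helpers, otherwise only the eb's blocks in the subpage bitmap are touched.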
*/ #define DECLARE_BTRFS_SUBPAGE_OPS(name) \ void btrfs_subpage_set_##name(const struct btrfs_fs_info *fs_info, \ @@ -129,7 +163,10 @@ void btrfs_folio_clamp_set_##name(const struct btrfs_fs_info *fs_info, \ void btrfs_folio_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \ struct folio *folio, u64 start, u32 len); \ bool btrfs_folio_clamp_test_##name(const struct btrfs_fs_info *fs_info, \ - struct folio *folio, u64 start, u32 len); + struct folio *folio, u64 start, u32 len); \ +void btrfs_meta_folio_set_##name(struct folio *folio, const struct extent_buffer *eb); \ +void btrfs_meta_folio_clear_##name(struct folio *folio, const struct extent_buffer *eb); \ +bool btrfs_meta_folio_test_##name(struct folio *folio, const struct extent_buffer *eb); DECLARE_BTRFS_SUBPAGE_OPS(uptodate); DECLARE_BTRFS_SUBPAGE_OPS(dirty); @@ -155,6 +192,7 @@ bool btrfs_subpage_clear_and_test_dirty(const struct btrfs_fs_info *fs_info, void btrfs_folio_assert_not_dirty(const struct btrfs_fs_info *fs_info, struct folio *folio, u64 start, u32 len); +bool btrfs_meta_folio_clear_and_test_dirty(struct folio *folio, const struct extent_buffer *eb); void btrfs_get_subpage_dirty_bitmap(struct btrfs_fs_info *fs_info, struct folio *folio, unsigned long *ret_bitmap); diff --git a/fs/btrfs/super.c b/fs/btrfs/super.c index dc4fee519ca6..a0c65adce1ab 100644 --- a/fs/btrfs/super.c +++ b/fs/btrfs/super.c @@ -84,7 +84,7 @@ struct btrfs_fs_context { u32 thread_pool_size; unsigned long long mount_opt; unsigned long compress_type:4; - unsigned int compress_level; + int compress_level; refcount_t refs; }; @@ -125,7 +125,6 @@ enum { /* Rescue options */ Opt_rescue, Opt_usebackuproot, - Opt_nologreplay, /* Debugging options */ Opt_enospc_debug, @@ -246,8 +245,6 @@ static const struct fs_parameter_spec btrfs_fs_parameters[] = { /* Rescue options. */ fsparam_enum("rescue", Opt_rescue, btrfs_parameter_rescue), - /* Deprecated, with alias rescue=nologreplay */ - __fsparam(NULL, "nologreplay", Opt_nologreplay, fs_param_deprecated, NULL), /* Deprecated, with alias rescue=usebackuproot */ __fsparam(NULL, "usebackuproot", Opt_usebackuproot, fs_param_deprecated, NULL), /* For compatibility only, alias for "rescue=nologreplay". 
*/ @@ -449,11 +446,6 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) else btrfs_clear_opt(ctx->mount_opt, NOTREELOG); break; - case Opt_nologreplay: - btrfs_warn(NULL, - "'nologreplay' is deprecated, use 'rescue=nologreplay' instead"); - btrfs_set_opt(ctx->mount_opt, NOLOGREPLAY); - break; case Opt_norecovery: btrfs_info(NULL, "'norecovery' is for compatibility only, recommended to use 'rescue=nologreplay'"); @@ -569,6 +561,10 @@ static int btrfs_parse_param(struct fs_context *fc, struct fs_parameter *param) break; case Opt_commit_interval: ctx->commit_interval = result.uint_32; + if (ctx->commit_interval > BTRFS_WARNING_COMMIT_INTERVAL) { + btrfs_warn(NULL, "excessive commit interval %u, use with care", + ctx->commit_interval); + } if (ctx->commit_interval == 0) ctx->commit_interval = BTRFS_DEFAULT_COMMIT_INTERVAL; break; @@ -947,7 +943,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec static int btrfs_fill_super(struct super_block *sb, struct btrfs_fs_devices *fs_devices) { - struct inode *inode; + struct btrfs_inode *inode; struct btrfs_fs_info *fs_info = btrfs_sb(sb); int err; @@ -982,7 +978,7 @@ static int btrfs_fill_super(struct super_block *sb, goto fail_close; } - sb->s_root = d_make_root(inode); + sb->s_root = d_make_root(&inode->vfs_inode); if (!sb->s_root) { err = -ENOMEM; goto fail_close; @@ -1139,8 +1135,7 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) subvol_name = btrfs_get_subvol_name_from_objectid(info, btrfs_root_id(BTRFS_I(d_inode(dentry))->root)); if (!IS_ERR(subvol_name)) { - seq_puts(seq, ",subvol="); - seq_escape(seq, subvol_name, " \t\n\\"); + seq_show_option(seq, "subvol", subvol_name); kfree(subvol_name); } return 0; @@ -1149,11 +1144,11 @@ static int btrfs_show_options(struct seq_file *seq, struct dentry *dentry) /* * subvolumes are identified by ino 256 */ -static inline int is_subvolume_inode(struct inode *inode) +static inline bool is_subvolume_inode(struct inode *inode) { if (inode && inode->i_ino == BTRFS_FIRST_FREE_OBJECTID) - return 1; - return 0; + return true; + return false; } static struct dentry *mount_subvol(const char *subvol_name, u64 subvol_objectid, @@ -2293,7 +2288,7 @@ static int check_dev_super(struct btrfs_device *dev) return 0; /* Only need to check the primary super block. 
*/ - sb = btrfs_read_dev_one_super(dev->bdev, 0, true); + sb = btrfs_read_disk_super(dev->bdev, 0, true); if (IS_ERR(sb)) return PTR_ERR(sb); @@ -2526,8 +2521,8 @@ static const struct init_sequence mod_init_seq[] = { .init_func = btrfs_free_space_init, .exit_func = btrfs_free_space_exit, }, { - .init_func = extent_state_init_cachep, - .exit_func = extent_state_free_cachep, + .init_func = btrfs_extent_state_init_cachep, + .exit_func = btrfs_extent_state_free_cachep, }, { .init_func = extent_buffer_init_cachep, .exit_func = extent_buffer_free_cachep, @@ -2535,8 +2530,8 @@ static const struct init_sequence mod_init_seq[] = { .init_func = btrfs_bioset_init, .exit_func = btrfs_bioset_exit, }, { - .init_func = extent_map_init, - .exit_func = extent_map_exit, + .init_func = btrfs_extent_map_init, + .exit_func = btrfs_extent_map_exit, #ifdef CONFIG_BTRFS_EXPERIMENTAL }, { .init_func = btrfs_read_policy_init, diff --git a/fs/btrfs/sysfs.c b/fs/btrfs/sysfs.c index 53b846d99ece..5d93d9dd2c12 100644 --- a/fs/btrfs/sysfs.c +++ b/fs/btrfs/sysfs.c @@ -411,7 +411,8 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj, { ssize_t ret = 0; - /* An artificial limit to only support 4K and PAGE_SIZE */ + if (BTRFS_MIN_BLOCKSIZE != SZ_4K && BTRFS_MIN_BLOCKSIZE != PAGE_SIZE) + ret += sysfs_emit_at(buf, ret, "%u ", BTRFS_MIN_BLOCKSIZE); if (PAGE_SIZE > SZ_4K) ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K); ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE); @@ -1330,29 +1331,30 @@ MODULE_PARM_DESC(read_policy, int btrfs_read_policy_to_enum(const char *str, s64 *value_ret) { - char param[32] = { 0 }; + char param[32]; char __maybe_unused *value_str; if (!str || strlen(str) == 0) return 0; - strncpy(param, str, sizeof(param) - 1); + strscpy(param, str); #ifdef CONFIG_BTRFS_EXPERIMENTAL /* Separate value from input in policy:value format. */ value_str = strchr(param, ':'); if (value_str) { - int ret; + char *retptr; *value_str = 0; value_str++; if (!value_ret) return -EINVAL; + + *value_ret = memparse(value_str, &retptr); + /* There could be trailing characters after the value.
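+ * e.g. memparse("64k", &retptr) yields 65536 and leaves retptr past the
+ * suffix, while an input of "64kq" leaves retptr at 'q' and is rejected
+ * by the checks below.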
*/ + retptr = skip_spaces(retptr); + if (*retptr != 0 || *value_ret <= 0) return -EINVAL; - if (*value_ret < 0) - return -ERANGE; } #endif @@ -1928,16 +1930,35 @@ void btrfs_sysfs_remove_space_info(struct btrfs_space_info *space_info) kobject_put(&space_info->kobj); } -static const char *alloc_name(u64 flags) +static const char *alloc_name(struct btrfs_space_info *space_info) { + u64 flags = space_info->flags; + switch (flags) { case BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_DATA: return "mixed"; case BTRFS_BLOCK_GROUP_METADATA: - return "metadata"; + switch (space_info->subgroup_id) { + case BTRFS_SUB_GROUP_PRIMARY: + return "metadata"; + case BTRFS_SUB_GROUP_TREELOG: + return "metadata-treelog"; + default: + WARN_ON_ONCE(1); + return "metadata (unknown sub-group)"; + } case BTRFS_BLOCK_GROUP_DATA: - return "data"; + switch (space_info->subgroup_id) { + case BTRFS_SUB_GROUP_PRIMARY: + return "data"; + case BTRFS_SUB_GROUP_DATA_RELOC: + return "data-reloc"; + default: + WARN_ON_ONCE(1); + return "data (unknown sub-group)"; + } case BTRFS_BLOCK_GROUP_SYSTEM: + ASSERT(space_info->subgroup_id == BTRFS_SUB_GROUP_PRIMARY); return "system"; default: WARN_ON(1); @@ -1956,7 +1977,7 @@ int btrfs_sysfs_add_space_info_type(struct btrfs_fs_info *fs_info, ret = kobject_init_and_add(&space_info->kobj, &space_info_ktype, fs_info->space_info_kobj, "%s", - alloc_name(space_info->flags)); + alloc_name(space_info)); if (ret) { kobject_put(&space_info->kobj); return ret; diff --git a/fs/btrfs/sysfs.h b/fs/btrfs/sysfs.h index 3fc5c6f90dc4..0f94ae923210 100644 --- a/fs/btrfs/sysfs.h +++ b/fs/btrfs/sysfs.h @@ -7,6 +7,7 @@ #include <linux/compiler_types.h> #include <linux/kobject.h> +struct block_device; struct btrfs_fs_info; struct btrfs_device; struct btrfs_fs_devices; diff --git a/fs/btrfs/tests/btrfs-tests.c b/fs/btrfs/tests/btrfs-tests.c index 5eff8d7d2360..b576897d71cc 100644 --- a/fs/btrfs/tests/btrfs-tests.c +++ b/fs/btrfs/tests/btrfs-tests.c @@ -102,7 +102,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info) if (!dev) return ERR_PTR(-ENOMEM); - extent_io_tree_init(fs_info, &dev->alloc_state, 0); + btrfs_extent_io_tree_init(fs_info, &dev->alloc_state, 0); INIT_LIST_HEAD(&dev->dev_list); list_add(&dev->dev_list, &fs_info->fs_devices->devices); @@ -111,7 +111,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info) static void btrfs_free_dummy_device(struct btrfs_device *dev) { - extent_io_tree_release(&dev->alloc_state); + btrfs_extent_io_tree_release(&dev->alloc_state); kfree(dev); } @@ -157,9 +157,9 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize) void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) { - struct radix_tree_iter iter; - void **slot; struct btrfs_device *dev, *tmp; + struct extent_buffer *eb; + unsigned long index; if (!fs_info) return; @@ -169,25 +169,13 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info) test_mnt->mnt_sb->s_fs_info = NULL; - spin_lock(&fs_info->buffer_lock); - radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) { - struct extent_buffer *eb; - - eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock); - if (!eb) - continue; - /* Shouldn't happen but that kind of thinking creates CVE's */ - if (radix_tree_exception(eb)) { - if (radix_tree_deref_retry(eb)) - slot = radix_tree_iter_retry(&iter); - continue; - } - slot = radix_tree_iter_resume(slot, &iter); - spin_unlock(&fs_info->buffer_lock); - free_extent_buffer_stale(eb); - 
spin_lock(&fs_info->buffer_lock); + xa_lock_irq(&fs_info->buffer_tree); + xa_for_each(&fs_info->buffer_tree, index, eb) { + xa_unlock_irq(&fs_info->buffer_tree); + free_extent_buffer(eb); + xa_lock_irq(&fs_info->buffer_tree); } - spin_unlock(&fs_info->buffer_lock); + xa_unlock_irq(&fs_info->buffer_tree); btrfs_mapping_tree_free(fs_info); list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices, diff --git a/fs/btrfs/tests/delayed-refs-tests.c b/fs/btrfs/tests/delayed-refs-tests.c index 6558508c2ddf..265370e79a54 100644 --- a/fs/btrfs/tests/delayed-refs-tests.c +++ b/fs/btrfs/tests/delayed-refs-tests.c @@ -1009,6 +1009,7 @@ int btrfs_test_delayed_refs(u32 sectorsize, u32 nodesize) if (!ret) ret = select_delayed_refs_test(&trans); + kfree(transaction); out_free_fs_info: btrfs_free_dummy_fs_info(fs_info); return ret; diff --git a/fs/btrfs/tests/extent-io-tests.c b/fs/btrfs/tests/extent-io-tests.c index 0a2dbfaaf49e..00da54f0164c 100644 --- a/fs/btrfs/tests/extent-io-tests.c +++ b/fs/btrfs/tests/extent-io-tests.c @@ -14,9 +14,9 @@ #include "../disk-io.h" #include "../btrfs_inode.h" -#define PROCESS_UNLOCK (1 << 0) -#define PROCESS_RELEASE (1 << 1) -#define PROCESS_TEST_LOCKED (1 << 2) +#define PROCESS_UNLOCK (1U << 0) +#define PROCESS_RELEASE (1U << 1) +#define PROCESS_TEST_LOCKED (1U << 2) static noinline int process_page_range(struct inode *inode, u64 start, u64 end, unsigned long flags) @@ -74,7 +74,6 @@ static void extent_flag_to_str(const struct extent_state *state, char *dest) dest[0] = 0; PRINT_ONE_FLAG(state, dest, cur, DIRTY); - PRINT_ONE_FLAG(state, dest, cur, UPTODATE); PRINT_ONE_FLAG(state, dest, cur, LOCKED); PRINT_ONE_FLAG(state, dest, cur, NEW); PRINT_ONE_FLAG(state, dest, cur, DELALLOC); @@ -150,7 +149,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) * Passing NULL as we don't have fs_info but tracepoints are not used * at this point */ - extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST); + btrfs_extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST); /* * First go through and create and mark all of our pages dirty, we pin @@ -177,7 +176,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) * |--- delalloc ---| * |--- search ---| */ - set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL); + btrfs_set_extent_bit(tmp, 0, sectorsize - 1, EXTENT_DELALLOC, NULL); start = 0; end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, @@ -191,7 +190,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) sectorsize - 1, start, end); goto out_bits; } - unlock_extent(tmp, start, end, NULL); + btrfs_unlock_extent(tmp, start, end, NULL); unlock_page(locked_page); put_page(locked_page); @@ -208,7 +207,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) test_err("couldn't find the locked page"); goto out_bits; } - set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL); + btrfs_set_extent_bit(tmp, sectorsize, max_bytes - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, @@ -227,7 +226,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) test_err("there were unlocked pages in the range"); goto out_bits; } - unlock_extent(tmp, start, end, NULL); + btrfs_unlock_extent(tmp, start, end, NULL); /* locked_page was unlocked above */ put_page(locked_page); @@ -263,7 +262,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) * * We are re-using our test_start from 
above since it works out well. */ - set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL); + btrfs_set_extent_bit(tmp, max_bytes, total_dirty - 1, EXTENT_DELALLOC, NULL); start = test_start; end = start + PAGE_SIZE - 1; found = find_lock_delalloc_range(inode, page_folio(locked_page), &start, @@ -282,7 +281,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) test_err("pages in range were not all locked"); goto out_bits; } - unlock_extent(tmp, start, end, NULL); + btrfs_unlock_extent(tmp, start, end, NULL); /* * Now to test where we run into a page that is no longer dirty in the @@ -327,7 +326,7 @@ static int test_find_delalloc(u32 sectorsize, u32 nodesize) out_bits: if (ret) dump_extent_io_tree(tmp); - clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1); + btrfs_clear_extent_bits(tmp, 0, total_dirty - 1, (unsigned)-1); out: if (locked_page) put_page(locked_page); @@ -525,7 +524,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) goto out; } - eb = __alloc_dummy_extent_buffer(fs_info, 0, nodesize); + eb = alloc_dummy_extent_buffer(fs_info, 0); if (!eb) { test_std_err(TEST_ALLOC_ROOT); ret = -ENOMEM; @@ -542,7 +541,7 @@ static int test_eb_bitmaps(u32 sectorsize, u32 nodesize) * Test again for case where the tree block is sectorsize aligned but * not nodesize aligned. */ - eb = __alloc_dummy_extent_buffer(fs_info, sectorsize, nodesize); + eb = alloc_dummy_extent_buffer(fs_info, sectorsize); if (!eb) { test_std_err(TEST_ALLOC_ROOT); ret = -ENOMEM; @@ -565,10 +564,10 @@ static int test_find_first_clear_extent_bit(void) test_msg("running find_first_clear_extent_bit test"); - extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST); + btrfs_extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST); /* Test correct handling of empty tree */ - find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED); + btrfs_find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED); if (start != 0 || end != -1) { test_err( "error getting a range from completely empty tree: start %llu end %llu", @@ -579,11 +578,11 @@ static int test_find_first_clear_extent_bit(void) * Set 1M-4M alloc/discard and 32M-64M thus leaving a hole between * 4M-32M */ - set_extent_bit(&tree, SZ_1M, SZ_4M - 1, - CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); + btrfs_set_extent_bit(&tree, SZ_1M, SZ_4M - 1, + CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); - find_first_clear_extent_bit(&tree, SZ_512K, &start, &end, - CHUNK_TRIMMED | CHUNK_ALLOCATED); + btrfs_find_first_clear_extent_bit(&tree, SZ_512K, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); if (start != 0 || end != SZ_1M - 1) { test_err("error finding beginning range: start %llu end %llu", @@ -592,14 +591,14 @@ static int test_find_first_clear_extent_bit(void) } /* Now add 32M-64M so that we have a hole between 4M-32M */ - set_extent_bit(&tree, SZ_32M, SZ_64M - 1, - CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); + btrfs_set_extent_bit(&tree, SZ_32M, SZ_64M - 1, + CHUNK_TRIMMED | CHUNK_ALLOCATED, NULL); /* * Request first hole starting at 12M, we should get 4M-32M */ - find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end, - CHUNK_TRIMMED | CHUNK_ALLOCATED); + btrfs_find_first_clear_extent_bit(&tree, 12 * SZ_1M, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); if (start != SZ_4M || end != SZ_32M - 1) { test_err("error finding trimmed range: start %llu end %llu", @@ -611,8 +610,8 @@ static int test_find_first_clear_extent_bit(void) * Search in the middle of allocated range, should get the next one * available, which happens to be unallocated -> 4M-32M 
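 *
 * Tree state at this point, from the two ranges set above:
 *
 *   0    1M         4M                   32M        64M
 *        |== set ==|-------- hole --------|== set ==|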
*/ - find_first_clear_extent_bit(&tree, SZ_2M, &start, &end, - CHUNK_TRIMMED | CHUNK_ALLOCATED); + btrfs_find_first_clear_extent_bit(&tree, SZ_2M, &start, &end, + CHUNK_TRIMMED | CHUNK_ALLOCATED); if (start != SZ_4M || end != SZ_32M - 1) { test_err("error finding next unalloc range: start %llu end %llu", @@ -624,9 +623,9 @@ static int test_find_first_clear_extent_bit(void) * Set 64M-72M with CHUNK_ALLOC flag, then search for CHUNK_TRIMMED flag * being unset in this range, we should get the entry in range 64M-72M */ - set_extent_bit(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED, NULL); - find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end, - CHUNK_TRIMMED); + btrfs_set_extent_bit(&tree, SZ_64M, SZ_64M + SZ_8M - 1, CHUNK_ALLOCATED, NULL); + btrfs_find_first_clear_extent_bit(&tree, SZ_64M + SZ_1M, &start, &end, + CHUNK_TRIMMED); if (start != SZ_64M || end != SZ_64M + SZ_8M - 1) { test_err("error finding exact range: start %llu end %llu", @@ -634,8 +633,8 @@ static int test_find_first_clear_extent_bit(void) goto out; } - find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end, - CHUNK_TRIMMED); + btrfs_find_first_clear_extent_bit(&tree, SZ_64M - SZ_8M, &start, &end, + CHUNK_TRIMMED); /* * Search in the middle of set range whose immediate neighbour doesn't @@ -651,7 +650,7 @@ static int test_find_first_clear_extent_bit(void) * Search beyond any known range, shall return after last known range * and end should be -1 */ - find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED); + btrfs_find_first_clear_extent_bit(&tree, -1, &start, &end, CHUNK_TRIMMED); if (start != SZ_64M + SZ_8M || end != -1) { test_err( "error handling beyond end of range search: start %llu end %llu", @@ -663,7 +662,7 @@ static int test_find_first_clear_extent_bit(void) out: if (ret) dump_extent_io_tree(&tree); - clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED); + btrfs_clear_extent_bits(&tree, 0, (u64)-1, CHUNK_TRIMMED | CHUNK_ALLOCATED); return ret; } @@ -730,7 +729,7 @@ static int test_eb_mem_ops(u32 sectorsize, u32 nodesize) goto out; } - eb = __alloc_dummy_extent_buffer(fs_info, SZ_1M, nodesize); + eb = alloc_dummy_extent_buffer(fs_info, SZ_1M); if (!eb) { test_std_err(TEST_ALLOC_EXTENT_BUFFER); ret = -ENOMEM; diff --git a/fs/btrfs/tests/extent-map-tests.c b/fs/btrfs/tests/extent-map-tests.c index 56e61ac1cc64..3a86534c116f 100644 --- a/fs/btrfs/tests/extent-map-tests.c +++ b/fs/btrfs/tests/extent-map-tests.c @@ -22,7 +22,7 @@ static int free_extent_map_tree(struct btrfs_inode *inode) while (!RB_EMPTY_ROOT(&em_tree->root)) { node = rb_first(&em_tree->root); em = rb_entry(node, struct extent_map, rb_node); - remove_extent_mapping(inode, em); + btrfs_remove_extent_mapping(inode, em); #ifdef CONFIG_BTRFS_DEBUG if (refcount_read(&em->refs) != 1) { @@ -36,7 +36,7 @@ static int free_extent_map_tree(struct btrfs_inode *inode) refcount_set(&em->refs, 1); } #endif - free_extent_map(em); + btrfs_free_extent_map(em); } write_unlock(&em_tree->lock); @@ -68,7 +68,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) int ret; int ret2; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -87,10 +87,10 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_err("cannot add extent range [0, 16K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); /* Add [16K, 20K) following [0, 16K) */ - em = alloc_extent_map(); + em = 
btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -109,9 +109,9 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_err("cannot add extent range [16K, 20K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -137,7 +137,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) ret = -ENOENT; goto out; } - if (em->start != 0 || extent_map_end(em) != SZ_16K || + if (em->start != 0 || btrfs_extent_map_end(em) != SZ_16K || em->disk_bytenr != 0 || em->disk_num_bytes != SZ_16K) { test_err( "case1 [%llu %llu]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu", @@ -145,7 +145,7 @@ static int test_case_1(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) em->disk_bytenr, em->disk_num_bytes); ret = -EINVAL; } - free_extent_map(em); + btrfs_free_extent_map(em); out: ret2 = free_extent_map_tree(inode); if (ret == 0) @@ -167,7 +167,7 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) int ret; int ret2; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -186,10 +186,10 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_err("cannot add extent range [0, 1K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); /* Add [4K, 8K) following [0, 1K) */ - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -208,9 +208,9 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_err("cannot add extent range [4K, 8K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -235,14 +235,14 @@ static int test_case_2(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) ret = -ENOENT; goto out; } - if (em->start != 0 || extent_map_end(em) != SZ_1K || + if (em->start != 0 || btrfs_extent_map_end(em) != SZ_1K || em->disk_bytenr != EXTENT_MAP_INLINE) { test_err( "case2 [0 1K]: ret %d return a wrong em (start %llu len %llu disk_bytenr %llu", ret, em->start, em->len, em->disk_bytenr); ret = -EINVAL; } - free_extent_map(em); + btrfs_free_extent_map(em); out: ret2 = free_extent_map_tree(inode); if (ret == 0) @@ -260,7 +260,7 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, int ret; int ret2; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -279,9 +279,9 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, test_err("cannot add extent range [4K, 8K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -312,15 +312,15 @@ static int __test_case_3(struct btrfs_fs_info *fs_info, * Since bytes within em are contiguous, em->block_start is identical to * em->start. 
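 * e.g. for the [4K, 8K) mapping added above, block_start must also be 4K,
 * which is what the check below verifies.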
*/ - if (start < em->start || start + len > extent_map_end(em) || - em->start != extent_map_block_start(em)) { + if (start < em->start || start + len > btrfs_extent_map_end(em) || + em->start != btrfs_extent_map_block_start(em)) { test_err( "case3 [%llu %llu): ret %d em (start %llu len %llu disk_bytenr %llu block_len %llu)", start, start + len, ret, em->start, em->len, em->disk_bytenr, em->disk_num_bytes); ret = -EINVAL; } - free_extent_map(em); + btrfs_free_extent_map(em); out: ret2 = free_extent_map_tree(inode); if (ret == 0) @@ -369,7 +369,7 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, int ret; int ret2; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -388,9 +388,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, test_err("cannot add extent range [0, 8K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -410,9 +410,9 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, test_err("cannot add extent range [8K, 32K)"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -438,14 +438,14 @@ static int __test_case_4(struct btrfs_fs_info *fs_info, ret = -ENOENT; goto out; } - if (start < em->start || start + len > extent_map_end(em)) { + if (start < em->start || start + len > btrfs_extent_map_end(em)) { test_err( "case4 [%llu %llu): ret %d, added wrong em (start %llu len %llu disk_bytenr %llu disk_num_bytes %llu)", start, start + len, ret, em->start, em->len, em->disk_bytenr, em->disk_num_bytes); ret = -EINVAL; } - free_extent_map(em); + btrfs_free_extent_map(em); out: ret2 = free_extent_map_tree(inode); if (ret == 0) @@ -498,7 +498,7 @@ static int add_compressed_extent(struct btrfs_inode *inode, struct extent_map *em; int ret; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -513,7 +513,7 @@ static int add_compressed_extent(struct btrfs_inode *inode, write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); - free_extent_map(em); + btrfs_free_extent_map(em); if (ret < 0) { test_err("cannot add extent map [%llu, %llu)", start, start + len); return ret; @@ -719,7 +719,7 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) if (ret) goto out; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -751,7 +751,7 @@ static int test_case_6(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) } ret = 0; out: - free_extent_map(em); + btrfs_free_extent_map(em); ret2 = free_extent_map_tree(inode); if (ret == 0) ret = ret2; @@ -773,7 +773,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_msg("Running btrfs_drop_extent_cache with pinned"); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -793,9 +793,9 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_err("couldn't add extent map"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { 
test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -815,7 +815,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) test_err("couldn't add extent map"); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); /* * Drop [0, 36K) This should skip the [0, 4K) extent and then split the @@ -826,7 +826,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) /* Make sure our extent maps look sane. */ ret = -EINVAL; - em = lookup_extent_mapping(em_tree, 0, SZ_16K); + em = btrfs_lookup_extent_mapping(em_tree, 0, SZ_16K); if (!em) { test_err("didn't find an em at 0 as expected"); goto out; @@ -842,10 +842,10 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, SZ_16K, SZ_16K); + em = btrfs_lookup_extent_mapping(em_tree, SZ_16K, SZ_16K); read_unlock(&em_tree->lock); if (em) { test_err("found an em when we weren't expecting one"); @@ -853,7 +853,7 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) } read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, SZ_32K, SZ_16K); + em = btrfs_lookup_extent_mapping(em_tree, SZ_32K, SZ_16K); read_unlock(&em_tree->lock); if (!em) { test_err("didn't find an em at 32K as expected"); @@ -870,16 +870,16 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) goto out; } - if (extent_map_block_start(em) != SZ_32K + SZ_4K) { + if (btrfs_extent_map_block_start(em) != SZ_32K + SZ_4K) { test_err("em->block_start is %llu, expected 36K", - extent_map_block_start(em)); + btrfs_extent_map_block_start(em)); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); read_lock(&em_tree->lock); - em = lookup_extent_mapping(em_tree, 48 * SZ_1K, (u64)-1); + em = btrfs_lookup_extent_mapping(em_tree, 48 * SZ_1K, (u64)-1); read_unlock(&em_tree->lock); if (em) { test_err("found an unexpected em above 48K"); @@ -888,9 +888,9 @@ static int test_case_7(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) ret = 0; out: - free_extent_map(em); + btrfs_free_extent_map(em); /* Unpin our extent to prevent warning when removing it below. 
*/ - ret2 = unpin_extent_cache(inode, 0, SZ_16K, 0); + ret2 = btrfs_unpin_extent_cache(inode, 0, SZ_16K, 0); if (ret == 0) ret = ret2; ret2 = free_extent_map_tree(inode); @@ -913,7 +913,7 @@ static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) int ret; int ret2; - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); return -ENOMEM; @@ -928,13 +928,13 @@ static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(inode, &em, em->start, em->len); write_unlock(&em_tree->lock); - free_extent_map(em); + btrfs_free_extent_map(em); if (ret < 0) { test_err("couldn't add extent map for range [120K, 128K)"); goto out; } - em = alloc_extent_map(); + em = btrfs_alloc_extent_map(); if (!em) { test_std_err(TEST_ALLOC_EXTENT_MAP); ret = -ENOMEM; @@ -967,7 +967,7 @@ static int test_case_8(struct btrfs_fs_info *fs_info, struct btrfs_inode *inode) write_lock(&em_tree->lock); ret = btrfs_add_extent_mapping(inode, &em, SZ_1K * 140, SZ_4K); write_unlock(&em_tree->lock); - free_extent_map(em); + btrfs_free_extent_map(em); if (ret < 0) { test_err("couldn't add extent map for range [108K, 144K)"); goto out; @@ -1045,6 +1045,7 @@ static int test_rmap_block(struct btrfs_fs_info *fs_info, ret = btrfs_add_chunk_map(fs_info, map); if (ret) { test_err("error adding chunk map to mapping tree"); + btrfs_free_chunk_map(map); goto out_free; } diff --git a/fs/btrfs/tests/inode-tests.c b/fs/btrfs/tests/inode-tests.c index 3ea3bc2225fe..a29d2c02c2c8 100644 --- a/fs/btrfs/tests/inode-tests.c +++ b/fs/btrfs/tests/inode-tests.c @@ -268,7 +268,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("expected a hole, got %llu", em->disk_bytenr); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); btrfs_drop_extent_map_range(BTRFS_I(inode), 0, (u64)-1, false); /* @@ -314,7 +314,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) * this? 
*/ offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -336,7 +336,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* Regular extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -363,7 +363,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* The next 3 are split extents */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -389,10 +389,10 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - disk_bytenr = extent_map_block_start(em); + disk_bytenr = btrfs_extent_map_block_start(em); orig_start = em->start; offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -414,7 +414,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -441,13 +441,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } disk_bytenr += (em->start - orig_start); - if (extent_map_block_start(em) != disk_bytenr) { + if (btrfs_extent_map_block_start(em) != disk_bytenr) { test_err("wrong block start, want %llu, have %llu", - disk_bytenr, extent_map_block_start(em)); + disk_bytenr, btrfs_extent_map_block_start(em)); goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* Prealloc extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -475,7 +475,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* The next 3 are a half written prealloc extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -502,10 +502,10 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - disk_bytenr = extent_map_block_start(em); + disk_bytenr = btrfs_extent_map_block_start(em); orig_start = em->start; offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -531,13 +531,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em->start - orig_start, em->offset); goto out; } - if (extent_map_block_start(em) != disk_bytenr + em->offset) { + if (btrfs_extent_map_block_start(em) != disk_bytenr + em->offset) { test_err("unexpected block start, wanted %llu, have %llu", - disk_bytenr + em->offset, extent_map_block_start(em)); + disk_bytenr + em->offset, btrfs_extent_map_block_start(em)); goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -564,13 +564,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em->start, em->offset, orig_start); goto out; } - if 
(extent_map_block_start(em) != disk_bytenr + em->offset) { + if (btrfs_extent_map_block_start(em) != disk_bytenr + em->offset) { test_err("unexpected block start, wanted %llu, have %llu", - disk_bytenr + em->offset, extent_map_block_start(em)); + disk_bytenr + em->offset, btrfs_extent_map_block_start(em)); goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* Now for the compressed extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -597,13 +597,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { + if (btrfs_extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { test_err("unexpected compress type, wanted %d, got %d", - BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); + BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em)); goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* Split compressed extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); @@ -630,15 +630,15 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) test_err("wrong offset, want 0, have %llu", em->offset); goto out; } - if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { + if (btrfs_extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { test_err("unexpected compress type, wanted %d, got %d", - BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); + BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em)); goto out; } - disk_bytenr = extent_map_block_start(em); + disk_bytenr = btrfs_extent_map_block_start(em); orig_start = em->start; offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -664,16 +664,16 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (extent_map_block_start(em) != disk_bytenr) { + if (btrfs_extent_map_block_start(em) != disk_bytenr) { test_err("block start does not match, want %llu got %llu", - disk_bytenr, extent_map_block_start(em)); + disk_bytenr, btrfs_extent_map_block_start(em)); goto out; } if (em->start != offset || em->len != 2 * sectorsize) { @@ -692,13 +692,13 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) em->start, em->offset, orig_start); goto out; } - if (extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { + if (btrfs_extent_map_compression(em) != BTRFS_COMPRESS_ZLIB) { test_err("unexpected compress type, wanted %d, got %d", - BTRFS_COMPRESS_ZLIB, extent_map_compression(em)); + BTRFS_COMPRESS_ZLIB, btrfs_extent_map_compression(em)); goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); /* A hole between regular extents but no hole extent */ em = btrfs_get_extent(BTRFS_I(inode), NULL, offset + 6, sectorsize); @@ -725,7 +725,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, SZ_4M); if (IS_ERR(em)) { @@ -757,7 +757,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 
nodesize) goto out; } offset = em->start + em->len; - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, offset, sectorsize); if (IS_ERR(em)) { @@ -785,7 +785,7 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize) ret = 0; out: if (!IS_ERR(em)) - free_extent_map(em); + btrfs_free_extent_map(em); iput(inode); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); @@ -858,15 +858,16 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) em->flags); goto out; } - free_extent_map(em); + btrfs_free_extent_map(em); em = btrfs_get_extent(BTRFS_I(inode), NULL, sectorsize, 2 * sectorsize); if (IS_ERR(em)) { test_err("got an error when we shouldn't have"); goto out; } - if (extent_map_block_start(em) != sectorsize) { - test_err("expected a real extent, got %llu", extent_map_block_start(em)); + if (btrfs_extent_map_block_start(em) != sectorsize) { + test_err("expected a real extent, got %llu", + btrfs_extent_map_block_start(em)); goto out; } if (em->start != sectorsize || em->len != sectorsize) { @@ -883,7 +884,7 @@ static int test_hole_first(u32 sectorsize, u32 nodesize) ret = 0; out: if (!IS_ERR(em)) - free_extent_map(em); + btrfs_free_extent_map(em); iput(inode); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); @@ -949,11 +950,10 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* [BTRFS_MAX_EXTENT_SIZE/2][sectorsize HOLE][the rest] */ - ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, - BTRFS_MAX_EXTENT_SIZE >> 1, - (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, NULL); + ret = btrfs_clear_extent_bits(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE >> 1, + (BTRFS_MAX_EXTENT_SIZE >> 1) + sectorsize - 1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1017,11 +1017,10 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* [BTRFS_MAX_EXTENT_SIZE+4k][4K HOLE][BTRFS_MAX_EXTENT_SIZE+4k] */ - ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, - BTRFS_MAX_EXTENT_SIZE + sectorsize, - BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, NULL); + ret = btrfs_clear_extent_bits(&BTRFS_I(inode)->io_tree, + BTRFS_MAX_EXTENT_SIZE + sectorsize, + BTRFS_MAX_EXTENT_SIZE + 2 * sectorsize - 1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1052,9 +1051,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) } /* Empty */ - ret = clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, NULL); + ret = btrfs_clear_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW); if (ret) { test_err("clear_extent_bit returned %d", ret); goto out; @@ -1068,9 +1066,8 @@ static int test_extent_accounting(u32 sectorsize, u32 nodesize) ret = 0; out: if (ret) - clear_extent_bit(&BTRFS_I(inode)->io_tree, 0, (u64)-1, - EXTENT_DELALLOC | EXTENT_DELALLOC_NEW | - EXTENT_UPTODATE, NULL); + btrfs_clear_extent_bits(&BTRFS_I(inode)->io_tree, 0, (u64)-1, + EXTENT_DELALLOC | EXTENT_DELALLOC_NEW); iput(inode); btrfs_free_dummy_root(root); btrfs_free_dummy_fs_info(fs_info); diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index aca83a98b75a..b96195d6480f 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -160,7 +160,13 @@ void 
btrfs_put_transaction(struct btrfs_transaction *transaction) cache = list_first_entry(&transaction->deleted_bgs, struct btrfs_block_group, bg_list); + /* + * Not strictly necessary to lock, as no other task will be using a + * block_group on the deleted_bgs list during a transaction abort. + */ + spin_lock(&transaction->fs_info->unused_bgs_lock); list_del_init(&cache->bg_list); + spin_unlock(&transaction->fs_info->unused_bgs_lock); btrfs_unfreeze_block_group(cache); btrfs_put_block_group(cache); } @@ -191,7 +197,7 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans) list_del_init(&root->dirty_list); free_extent_buffer(root->commit_root); root->commit_root = btrfs_root_node(root); - extent_io_tree_release(&root->dirty_log_pages); + btrfs_extent_io_tree_release(&root->dirty_log_pages); btrfs_qgroup_clean_swapped_blocks(root); } @@ -377,10 +383,10 @@ loop: INIT_LIST_HEAD(&cur_trans->deleted_bgs); spin_lock_init(&cur_trans->dropped_roots_lock); list_add_tail(&cur_trans->list, &fs_info->trans_list); - extent_io_tree_init(fs_info, &cur_trans->dirty_pages, - IO_TREE_TRANS_DIRTY_PAGES); - extent_io_tree_init(fs_info, &cur_trans->pinned_extents, - IO_TREE_FS_PINNED_EXTENTS); + btrfs_extent_io_tree_init(fs_info, &cur_trans->dirty_pages, + IO_TREE_TRANS_DIRTY_PAGES); + btrfs_extent_io_tree_init(fs_info, &cur_trans->pinned_extents, + IO_TREE_FS_PINNED_EXTENTS); btrfs_set_fs_generation(fs_info, fs_info->generation + 1); cur_trans->transid = fs_info->generation; fs_info->running_transaction = cur_trans; @@ -532,15 +538,15 @@ static void wait_current_trans(struct btrfs_fs_info *fs_info) } } -static int may_wait_transaction(struct btrfs_fs_info *fs_info, int type) +static bool may_wait_transaction(struct btrfs_fs_info *fs_info, int type) { if (test_bit(BTRFS_FS_LOG_RECOVERING, &fs_info->flags)) - return 0; + return false; if (type == TRANS_START) - return 1; + return true; - return 0; + return false; } static inline bool need_reserve_reloc_root(struct btrfs_root *root) @@ -755,9 +761,10 @@ got_it: * value here. */ if (do_chunk_alloc && num_bytes) { - u64 flags = h->block_rsv->space_info->flags; + struct btrfs_space_info *space_info = h->block_rsv->space_info; + u64 flags = space_info->flags; - btrfs_chunk_alloc(h, btrfs_get_alloc_profile(fs_info, flags), + btrfs_chunk_alloc(h, space_info, btrfs_get_alloc_profile(fs_info, flags), CHUNK_ALLOC_NO_FORCE); } @@ -1122,13 +1129,13 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, u64 start = 0; u64 end; - while (find_first_extent_bit(dirty_pages, start, &start, &end, - mark, &cached_state)) { + while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end, + mark, &cached_state)) { bool wait_writeback = false; - ret = convert_extent_bit(dirty_pages, start, end, - EXTENT_NEED_WAIT, - mark, &cached_state); + ret = btrfs_convert_extent_bit(dirty_pages, start, end, + EXTENT_NEED_WAIT, + mark, &cached_state); /* * convert_extent_bit can return -ENOMEM, which is most of the * time a temporary error. 
So when it happens, ignore the error @@ -1149,8 +1156,8 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, if (!ret) ret = filemap_fdatawrite_range(mapping, start, end); if (!ret && wait_writeback) - ret = filemap_fdatawait_range(mapping, start, end); - free_extent_state(cached_state); + btrfs_btree_wait_writeback_range(fs_info, start, end); + btrfs_free_extent_state(cached_state); if (ret) break; cached_state = NULL; @@ -1169,14 +1176,13 @@ int btrfs_write_marked_extents(struct btrfs_fs_info *fs_info, static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, struct extent_io_tree *dirty_pages) { - struct address_space *mapping = fs_info->btree_inode->i_mapping; struct extent_state *cached_state = NULL; u64 start = 0; u64 end; int ret = 0; - while (find_first_extent_bit(dirty_pages, start, &start, &end, - EXTENT_NEED_WAIT, &cached_state)) { + while (btrfs_find_first_extent_bit(dirty_pages, start, &start, &end, + EXTENT_NEED_WAIT, &cached_state)) { /* * Ignore -ENOMEM errors returned by clear_extent_bit(). * When committing the transaction, we'll remove any entries @@ -1185,13 +1191,13 @@ static int __btrfs_wait_marked_extents(struct btrfs_fs_info *fs_info, * concurrently - we do it only at transaction commit time when * it's safe to do it (through extent_io_tree_release()). */ - ret = clear_extent_bit(dirty_pages, start, end, - EXTENT_NEED_WAIT, &cached_state); + ret = btrfs_clear_extent_bit(dirty_pages, start, end, + EXTENT_NEED_WAIT, &cached_state); if (ret == -ENOMEM) ret = 0; if (!ret) - ret = filemap_fdatawait_range(mapping, start, end); - free_extent_state(cached_state); + btrfs_btree_wait_writeback_range(fs_info, start, end); + btrfs_free_extent_state(cached_state); if (ret) break; cached_state = NULL; @@ -1259,7 +1265,7 @@ static int btrfs_write_and_wait_transaction(struct btrfs_trans_handle *trans) blk_finish_plug(&plug); ret2 = btrfs_wait_extents(fs_info, dirty_pages); - extent_io_tree_release(&trans->transaction->dirty_pages); + btrfs_extent_io_tree_release(&trans->transaction->dirty_pages); if (ret) return ret; @@ -1321,7 +1327,6 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; struct list_head *dirty_bgs = &trans->transaction->dirty_bgs; struct list_head *io_bgs = &trans->transaction->io_bgs; - struct list_head *next; struct extent_buffer *eb; int ret; @@ -1357,13 +1362,13 @@ static noinline int commit_cowonly_roots(struct btrfs_trans_handle *trans) again: while (!list_empty(&fs_info->dirty_cowonly_roots)) { struct btrfs_root *root; - next = fs_info->dirty_cowonly_roots.next; - list_del_init(next); - root = list_entry(next, struct btrfs_root, dirty_list); + + root = list_first_entry(&fs_info->dirty_cowonly_roots, + struct btrfs_root, dirty_list); clear_bit(BTRFS_ROOT_DIRTY, &root->state); + list_move_tail(&root->dirty_list, + &trans->transaction->switch_commits); - list_add_tail(&root->dirty_list, - &trans->transaction->switch_commits); ret = update_cowonly_root(trans, root); if (ret) return ret; @@ -1635,7 +1640,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, struct btrfs_root *root = pending->root; struct btrfs_root *parent_root; struct btrfs_block_rsv *rsv; - struct inode *parent_inode = &pending->dir->vfs_inode; + struct btrfs_inode *parent_inode = pending->dir; struct btrfs_path *path; struct btrfs_dir_item *dir_item; struct extent_buffer *tmp; @@ -1661,7 +1666,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle 
*trans, * filesystem. */ nofs_flags = memalloc_nofs_save(); - pending->error = fscrypt_setup_filename(parent_inode, + pending->error = fscrypt_setup_filename(&parent_inode->vfs_inode, &pending->dentry->d_name, 0, &fname); memalloc_nofs_restore(nofs_flags); @@ -1690,8 +1695,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, } key.objectid = objectid; - key.offset = (u64)-1; key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; rsv = trans->block_rsv; trans->block_rsv = &pending->block_rsv; @@ -1699,16 +1704,16 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, trace_btrfs_space_reservation(fs_info, "transaction", trans->transid, trans->bytes_reserved, 1); - parent_root = BTRFS_I(parent_inode)->root; + parent_root = parent_inode->root; ret = record_root_in_trans(trans, parent_root, 0); if (ret) goto fail; - cur_time = current_time(parent_inode); + cur_time = current_time(&parent_inode->vfs_inode); /* * insert the directory item */ - ret = btrfs_set_inode_index(BTRFS_I(parent_inode), &index); + ret = btrfs_set_inode_index(parent_inode, &index); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -1716,7 +1721,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, /* check if there is a file/dir which has the same name. */ dir_item = btrfs_lookup_dir_item(NULL, parent_root, path, - btrfs_ino(BTRFS_I(parent_inode)), + btrfs_ino(parent_inode), &fname.disk_name, 0); if (dir_item != NULL && !IS_ERR(dir_item)) { pending->error = -EEXIST; @@ -1817,7 +1822,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, */ ret = btrfs_add_root_ref(trans, objectid, btrfs_root_id(parent_root), - btrfs_ino(BTRFS_I(parent_inode)), index, + btrfs_ino(parent_inode), index, &fname.disk_name); if (ret) { btrfs_abort_transaction(trans, ret); @@ -1855,18 +1860,18 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans, goto fail; ret = btrfs_insert_dir_item(trans, &fname.disk_name, - BTRFS_I(parent_inode), &key, BTRFS_FT_DIR, + parent_inode, &key, BTRFS_FT_DIR, index); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; } - btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size + + btrfs_i_size_write(parent_inode, parent_inode->vfs_inode.i_size + fname.disk_name.len * 2); - inode_set_mtime_to_ts(parent_inode, - inode_set_ctime_current(parent_inode)); - ret = btrfs_update_inode_fallback(trans, BTRFS_I(parent_inode)); + inode_set_mtime_to_ts(&parent_inode->vfs_inode, + inode_set_ctime_current(&parent_inode->vfs_inode)); + ret = btrfs_update_inode_fallback(trans, parent_inode); if (ret) { btrfs_abort_transaction(trans, ret); goto fail; @@ -2096,7 +2101,14 @@ static void btrfs_cleanup_pending_block_groups(struct btrfs_trans_handle *trans) list_for_each_entry_safe(block_group, tmp, &trans->new_bgs, bg_list) { btrfs_dec_delayed_refs_rsv_bg_inserts(fs_info); + /* + * Not strictly necessary to lock, as no other task will be using a + * block_group on the new_bgs list during a transaction abort. 
+ */ + spin_lock(&fs_info->unused_bgs_lock); list_del_init(&block_group->bg_list); + btrfs_put_block_group(block_group); + spin_unlock(&fs_info->unused_bgs_lock); } } @@ -2258,14 +2270,13 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wake_up(&fs_info->transaction_blocked_wait); btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_PREP); - if (cur_trans->list.prev != &fs_info->trans_list) { + if (!list_is_first(&cur_trans->list, &fs_info->trans_list)) { enum btrfs_trans_state want_state = TRANS_STATE_COMPLETED; if (trans->in_fsync) want_state = TRANS_STATE_SUPER_COMMITTED; - prev_trans = list_entry(cur_trans->list.prev, - struct btrfs_transaction, list); + prev_trans = list_prev_entry(cur_trans, list); if (prev_trans->state < want_state) { refcount_inc(&prev_trans->use_count); spin_unlock(&fs_info->trans_lock); @@ -2542,7 +2553,9 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans) wake_up(&cur_trans->commit_wait); btrfs_trans_state_lockdep_release(fs_info, BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED); - btrfs_finish_extent_commit(trans); + ret = btrfs_finish_extent_commit(trans); + if (ret) + goto scrub_continue; if (test_bit(BTRFS_TRANS_HAVE_FREE_BGS, &cur_trans->flags)) btrfs_clear_space_info_full(fs_info); diff --git a/fs/btrfs/tree-checker.c b/fs/btrfs/tree-checker.c index 43979891f7c8..8f4703b488b7 100644 --- a/fs/btrfs/tree-checker.c +++ b/fs/btrfs/tree-checker.c @@ -1571,7 +1571,7 @@ static int check_extent_item(struct extent_buffer *leaf, inline_type); return -EUCLEAN; } - if (inline_type < last_type) { + if (unlikely(inline_type < last_type)) { extent_err(leaf, slot, "inline ref out-of-order: has type %u, prev type %u", inline_type, last_type); @@ -1580,7 +1580,7 @@ static int check_extent_item(struct extent_buffer *leaf, /* Type changed, allow the sequence to start from U64_MAX again.
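 *
 * Illustration with hypothetical values: "type A seq 0x50, type A seq
 * 0x20, type B seq 0x90" is well ordered, since seq may only decrease
 * within one type and resets on a type change; a following "type B seq
 * 0x95" would trip the check below.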
*/ if (inline_type > last_type) last_seq = U64_MAX; - if (seq > last_seq) { + if (unlikely(seq > last_seq)) { extent_err(leaf, slot, "inline ref out-of-order: has type %u offset %llu seq 0x%llx, prev type %u seq 0x%llx", inline_type, inline_offset, seq, @@ -1929,7 +1929,7 @@ static enum btrfs_tree_block_status check_leaf_item(struct extent_buffer *leaf, break; } - if (ret) + if (unlikely(ret)) return BTRFS_TREE_BLOCK_INVALID_ITEM; return BTRFS_TREE_BLOCK_CLEAN; } @@ -2229,13 +2229,12 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int ret; found_level = btrfs_header_level(eb); - if (found_level != check->level) { - WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), - KERN_ERR "BTRFS: tree level check failed\n"); + if (unlikely(found_level != check->level)) { + DEBUG_WARN(); btrfs_err(fs_info, "tree level mismatch detected, bytenr=%llu level expected=%u has=%u", eb->start, check->level, found_level); - return -EIO; + return -EUCLEAN; } if (!check->has_first_key) @@ -2251,11 +2250,11 @@ int btrfs_verify_level_key(struct extent_buffer *eb, return 0; /* We have @first_key, so this @eb must have at least one item */ - if (btrfs_header_nritems(eb) == 0) { + if (unlikely(btrfs_header_nritems(eb) == 0)) { btrfs_err(fs_info, "invalid tree nritems, bytenr=%llu nritems=0 expect >0", eb->start); - WARN_ON(IS_ENABLED(CONFIG_BTRFS_DEBUG)); + DEBUG_WARN(); return -EUCLEAN; } @@ -2263,11 +2262,10 @@ int btrfs_verify_level_key(struct extent_buffer *eb, btrfs_node_key_to_cpu(eb, &found_key, 0); else btrfs_item_key_to_cpu(eb, &found_key, 0); - ret = btrfs_comp_cpu_keys(&check->first_key, &found_key); - if (ret) { - WARN(IS_ENABLED(CONFIG_BTRFS_DEBUG), - KERN_ERR "BTRFS: tree first key check failed\n"); + ret = btrfs_comp_cpu_keys(&check->first_key, &found_key); + if (unlikely(ret)) { + DEBUG_WARN(); btrfs_err(fs_info, "tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)", eb->start, check->transid, check->first_key.objectid, diff --git a/fs/btrfs/tree-log.c b/fs/btrfs/tree-log.c index 955d1677e865..97e933113b82 100644 --- a/fs/btrfs/tree-log.c +++ b/fs/btrfs/tree-log.c @@ -138,10 +138,10 @@ static void wait_log_commit(struct btrfs_root *root, int transid); * and once to do all the other items. */ -static struct inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root) +static struct btrfs_inode *btrfs_iget_logging(u64 objectid, struct btrfs_root *root) { unsigned int nofs_flag; - struct inode *inode; + struct btrfs_inode *inode; /* * We're holding a transaction handle whether we are logging or @@ -376,12 +376,12 @@ static int process_one_buffer(struct btrfs_root *log, } /* - * Item overwrite used by replay and tree logging. eb, slot and key all refer - * to the src data we are copying out. + * Item overwrite used by log replay. The given eb, slot and key all refer to + * the source data we are copying out. * - * root is the tree we are copying into, and path is a scratch - * path for use in this function (it should be released on entry and - * will be released on exit). + * The given root is for the tree we are copying into, and path is a scratch + * path for use in this function (it should be released on entry and will be + * released on exit). * * If the key is already in the destination tree the existing item is * overwritten. If the existing item isn't big enough, it is extended. 
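The overwrite_item() hunks that follow cache path->nodes[0] and path->slots[0] in dst_eb/dst_slot and drop one of the two temporary buffers: instead of copying both the source and destination items into scratch memory and comparing them with memcmp(), only the source item is copied and then compared in place against the destination extent buffer with memcmp_extent_buffer(). A condensed sketch of the before and after shape (illustrative only, not the literal kernel code; allocation-failure handling and the surrounding replay logic are trimmed):

/* Before: copy both items out, compare in ordinary memory. */
dst_copy = kmalloc(item_size, GFP_NOFS);
src_copy = kmalloc(item_size, GFP_NOFS);
read_extent_buffer(eb, src_copy, src_ptr, item_size);
read_extent_buffer(dst_eb, dst_copy, dst_ptr, item_size);
ret = memcmp(dst_copy, src_copy, item_size);
kfree(dst_copy);
kfree(src_copy);

/* After: one temporary buffer, compare directly against the extent buffer. */
src_copy = kmalloc(item_size, GFP_NOFS);
read_extent_buffer(eb, src_copy, src_ptr, item_size);
ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size);
kfree(src_copy);

This saves one allocation and one copy per replayed item while keeping the same result: ret == 0 when the two items are identical.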
@@ -401,6 +401,8 @@ static int overwrite_item(struct btrfs_trans_handle *trans, int save_old_i_size = 0; unsigned long src_ptr; unsigned long dst_ptr; + struct extent_buffer *dst_eb; + int dst_slot; bool inode_item = key->type == BTRFS_INODE_ITEM_KEY; /* @@ -420,11 +422,13 @@ static int overwrite_item(struct btrfs_trans_handle *trans, if (ret < 0) return ret; + dst_eb = path->nodes[0]; + dst_slot = path->slots[0]; + if (ret == 0) { char *src_copy; - char *dst_copy; - u32 dst_size = btrfs_item_size(path->nodes[0], - path->slots[0]); + const u32 dst_size = btrfs_item_size(dst_eb, dst_slot); + if (dst_size != item_size) goto insert; @@ -432,23 +436,16 @@ static int overwrite_item(struct btrfs_trans_handle *trans, btrfs_release_path(path); return 0; } - dst_copy = kmalloc(item_size, GFP_NOFS); src_copy = kmalloc(item_size, GFP_NOFS); - if (!dst_copy || !src_copy) { + if (!src_copy) { btrfs_release_path(path); - kfree(dst_copy); - kfree(src_copy); return -ENOMEM; } read_extent_buffer(eb, src_copy, src_ptr, item_size); + dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot); + ret = memcmp_extent_buffer(dst_eb, src_copy, dst_ptr, item_size); - dst_ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); - read_extent_buffer(path->nodes[0], dst_copy, dst_ptr, - item_size); - ret = memcmp(dst_copy, src_copy, item_size); - - kfree(dst_copy); kfree(src_copy); /* * they have the same contents, just return, this saves @@ -470,9 +467,9 @@ static int overwrite_item(struct btrfs_trans_handle *trans, u64 nbytes; u32 mode; - item = btrfs_item_ptr(path->nodes[0], path->slots[0], + item = btrfs_item_ptr(dst_eb, dst_slot, struct btrfs_inode_item); - nbytes = btrfs_inode_nbytes(path->nodes[0], item); + nbytes = btrfs_inode_nbytes(dst_eb, item); item = btrfs_item_ptr(eb, slot, struct btrfs_inode_item); btrfs_set_inode_nbytes(eb, item, nbytes); @@ -514,11 +511,13 @@ insert: key, item_size); path->skip_release_on_error = 0; + dst_eb = path->nodes[0]; + dst_slot = path->slots[0]; + /* make sure any existing item is the correct size */ if (ret == -EEXIST || ret == -EOVERFLOW) { - u32 found_size; - found_size = btrfs_item_size(path->nodes[0], - path->slots[0]); + const u32 found_size = btrfs_item_size(dst_eb, dst_slot); + if (found_size > item_size) btrfs_truncate_item(trans, path, item_size, 1); else if (found_size < item_size) @@ -526,8 +525,7 @@ insert: } else if (ret) { return ret; } - dst_ptr = btrfs_item_ptr_offset(path->nodes[0], - path->slots[0]); + dst_ptr = btrfs_item_ptr_offset(dst_eb, dst_slot); /* don't overwrite an existing inode if the generation number * was logged as zero. 
This is done when the tree logging code @@ -546,7 +544,6 @@ insert: dst_item = (struct btrfs_inode_item *)dst_ptr; if (btrfs_inode_generation(eb, src_item) == 0) { - struct extent_buffer *dst_eb = path->nodes[0]; const u64 ino_size = btrfs_inode_size(eb, src_item); /* @@ -564,30 +561,28 @@ insert: } if (S_ISDIR(btrfs_inode_mode(eb, src_item)) && - S_ISDIR(btrfs_inode_mode(path->nodes[0], dst_item))) { + S_ISDIR(btrfs_inode_mode(dst_eb, dst_item))) { save_old_i_size = 1; - saved_i_size = btrfs_inode_size(path->nodes[0], - dst_item); + saved_i_size = btrfs_inode_size(dst_eb, dst_item); } } - copy_extent_buffer(path->nodes[0], eb, dst_ptr, - src_ptr, item_size); + copy_extent_buffer(dst_eb, eb, dst_ptr, src_ptr, item_size); if (save_old_i_size) { struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; - btrfs_set_inode_size(path->nodes[0], dst_item, saved_i_size); + btrfs_set_inode_size(dst_eb, dst_item, saved_i_size); } /* make sure the generation is filled in */ if (key->type == BTRFS_INODE_ITEM_KEY) { struct btrfs_inode_item *dst_item; + dst_item = (struct btrfs_inode_item *)dst_ptr; - if (btrfs_inode_generation(path->nodes[0], dst_item) == 0) { - btrfs_set_inode_generation(path->nodes[0], dst_item, - trans->transid); - } + if (btrfs_inode_generation(dst_eb, dst_item) == 0) + btrfs_set_inode_generation(dst_eb, dst_item, trans->transid); } no_copy: btrfs_release_path(path); @@ -613,14 +608,14 @@ static int read_alloc_one_name(struct extent_buffer *eb, void *start, int len, * simple helper to read an inode off the disk from a given root * This can only be called for subvolume roots and not for the log */ -static noinline struct inode *read_one_inode(struct btrfs_root *root, - u64 objectid) +static noinline struct btrfs_inode *read_one_inode(struct btrfs_root *root, + u64 objectid) { - struct inode *inode; + struct btrfs_inode *inode; inode = btrfs_iget_logging(objectid, root); if (IS_ERR(inode)) - inode = NULL; + return NULL; return inode; } @@ -649,7 +644,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, u64 start = key->offset; u64 nbytes = 0; struct btrfs_file_extent_item *item; - struct inode *inode = NULL; + struct btrfs_inode *inode = NULL; unsigned long size; int ret = 0; @@ -688,31 +683,23 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, * file. This must be done before the btrfs_drop_extents run * so we don't try to drop this extent. 
*/ - ret = btrfs_lookup_file_extent(trans, root, path, - btrfs_ino(BTRFS_I(inode)), start, 0); + ret = btrfs_lookup_file_extent(trans, root, path, btrfs_ino(inode), start, 0); if (ret == 0 && (found_type == BTRFS_FILE_EXTENT_REG || found_type == BTRFS_FILE_EXTENT_PREALLOC)) { - struct btrfs_file_extent_item cmp1; - struct btrfs_file_extent_item cmp2; - struct btrfs_file_extent_item *existing; - struct extent_buffer *leaf; - - leaf = path->nodes[0]; - existing = btrfs_item_ptr(leaf, path->slots[0], - struct btrfs_file_extent_item); + struct btrfs_file_extent_item existing; + unsigned long ptr; - read_extent_buffer(eb, &cmp1, (unsigned long)item, - sizeof(cmp1)); - read_extent_buffer(leaf, &cmp2, (unsigned long)existing, - sizeof(cmp2)); + ptr = btrfs_item_ptr_offset(path->nodes[0], path->slots[0]); + read_extent_buffer(path->nodes[0], &existing, ptr, sizeof(existing)); /* * we already have a pointer to this exact extent, * we don't have to do anything */ - if (memcmp(&cmp1, &cmp2, sizeof(cmp1)) == 0) { + if (memcmp_extent_buffer(eb, &existing, (unsigned long)item, + sizeof(existing)) == 0) { btrfs_release_path(path); goto out; } @@ -723,7 +710,7 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, drop_args.start = start; drop_args.end = extent_end; drop_args.drop_cache = true; - ret = btrfs_drop_extents(trans, root, BTRFS_I(inode), &drop_args); + ret = btrfs_drop_extents(trans, root, inode, &drop_args); if (ret) goto out; @@ -747,8 +734,8 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, (unsigned long)item, sizeof(*item)); ins.objectid = btrfs_file_extent_disk_bytenr(eb, item); - ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); ins.type = BTRFS_EXTENT_ITEM_KEY; + ins.offset = btrfs_file_extent_disk_num_bytes(eb, item); offset = key->offset - btrfs_file_extent_offset(eb, item); /* @@ -873,9 +860,9 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, struct btrfs_ordered_sum *sums; struct btrfs_root *csum_root; - sums = list_entry(ordered_sums.next, - struct btrfs_ordered_sum, - list); + sums = list_first_entry(&ordered_sums, + struct btrfs_ordered_sum, + list); csum_root = btrfs_csum_root(fs_info, sums->logical); if (!ret) @@ -901,16 +888,15 @@ static noinline int replay_one_extent(struct btrfs_trans_handle *trans, goto out; } - ret = btrfs_inode_set_file_extent_range(BTRFS_I(inode), start, - extent_end - start); + ret = btrfs_inode_set_file_extent_range(inode, start, extent_end - start); if (ret) goto out; update_inode: - btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + btrfs_update_inode_bytes(inode, nbytes, drop_args.bytes_found); + ret = btrfs_update_inode(trans, inode); out: - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -947,7 +933,7 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, struct btrfs_dir_item *di) { struct btrfs_root *root = dir->root; - struct inode *inode; + struct btrfs_inode *inode; struct fscrypt_str name; struct extent_buffer *leaf; struct btrfs_key location; @@ -972,10 +958,10 @@ static noinline int drop_one_dir_item(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = unlink_inode_for_log_replay(trans, dir, BTRFS_I(inode), &name); + ret = unlink_inode_for_log_replay(trans, dir, inode, &name); out: kfree(name.name); - iput(inode); + iput(&inode->vfs_inode); return ret; } @@ -1148,7 +1134,7 @@ again: u32 item_size; u32 cur_offset = 0; unsigned long base; - struct inode 
*victim_parent; + struct btrfs_inode *victim_parent; leaf = path->nodes[0]; @@ -1188,10 +1174,10 @@ again: btrfs_release_path(path); ret = unlink_inode_for_log_replay(trans, - BTRFS_I(victim_parent), + victim_parent, inode, &victim_name); } - iput(victim_parent); + iput(&victim_parent->vfs_inode); kfree(victim_name.name); if (ret) return ret; @@ -1325,7 +1311,7 @@ again: ret = !!btrfs_find_name_in_backref(log_eb, log_slot, &name); if (!ret) { - struct inode *dir; + struct btrfs_inode *dir; btrfs_release_path(path); dir = read_one_inode(root, parent_id); @@ -1334,10 +1320,9 @@ again: kfree(name.name); goto out; } - ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), - inode, &name); + ret = unlink_inode_for_log_replay(trans, dir, inode, &name); kfree(name.name); - iput(dir); + iput(&dir->vfs_inode); if (ret) goto out; goto again; @@ -1369,8 +1354,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, struct extent_buffer *eb, int slot, struct btrfs_key *key) { - struct inode *dir = NULL; - struct inode *inode = NULL; + struct btrfs_inode *dir = NULL; + struct btrfs_inode *inode = NULL; unsigned long ref_ptr; unsigned long ref_end; struct fscrypt_str name = { 0 }; @@ -1435,8 +1420,8 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, if (ret) goto out; - ret = inode_in_dir(root, path, btrfs_ino(BTRFS_I(dir)), - btrfs_ino(BTRFS_I(inode)), ref_index, &name); + ret = inode_in_dir(root, path, btrfs_ino(dir), btrfs_ino(inode), + ref_index, &name); if (ret < 0) { goto out; } else if (ret == 0) { @@ -1447,8 +1432,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * overwrite any existing back reference, and we don't * want to create dangling pointers in the directory. */ - ret = __add_inode_ref(trans, root, path, log, - BTRFS_I(dir), BTRFS_I(inode), + ret = __add_inode_ref(trans, root, path, log, dir, inode, inode_objectid, parent_objectid, ref_index, &name); if (ret) { @@ -1458,12 +1442,11 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, } /* insert our name */ - ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), - &name, 0, ref_index); + ret = btrfs_add_link(trans, dir, inode, &name, 0, ref_index); if (ret) goto out; - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + ret = btrfs_update_inode(trans, inode); if (ret) goto out; } @@ -1473,7 +1456,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, kfree(name.name); name.name = NULL; if (log_ref_ver) { - iput(dir); + iput(&dir->vfs_inode); dir = NULL; } } @@ -1486,8 +1469,7 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, * dir index entries exist for a name but there is no inode reference * item with the same name. */ - ret = unlink_old_inode_refs(trans, root, path, BTRFS_I(inode), eb, slot, - key); + ret = unlink_old_inode_refs(trans, root, path, inode, eb, slot, key); if (ret) goto out; @@ -1496,8 +1478,10 @@ static noinline int add_inode_ref(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); kfree(name.name); - iput(dir); - iput(inode); + if (dir) + iput(&dir->vfs_inode); + if (inode) + iput(&inode->vfs_inode); return ret; } @@ -1611,25 +1595,25 @@ process_slot: * will free the inode. 
*/ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, - struct inode *inode) + struct btrfs_inode *inode) { - struct btrfs_root *root = BTRFS_I(inode)->root; + struct btrfs_root *root = inode->root; struct btrfs_path *path; int ret; u64 nlink = 0; - u64 ino = btrfs_ino(BTRFS_I(inode)); + const u64 ino = btrfs_ino(inode); path = btrfs_alloc_path(); if (!path) return -ENOMEM; - ret = count_inode_refs(BTRFS_I(inode), path); + ret = count_inode_refs(inode, path); if (ret < 0) goto out; nlink = ret; - ret = count_inode_extrefs(BTRFS_I(inode), path); + ret = count_inode_extrefs(inode, path); if (ret < 0) goto out; @@ -1637,17 +1621,17 @@ static noinline int fixup_inode_link_count(struct btrfs_trans_handle *trans, ret = 0; - if (nlink != inode->i_nlink) { - set_nlink(inode, nlink); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + if (nlink != inode->vfs_inode.i_nlink) { + set_nlink(&inode->vfs_inode, nlink); + ret = btrfs_update_inode(trans, inode); if (ret) goto out; } - if (S_ISDIR(inode->i_mode)) - BTRFS_I(inode)->index_cnt = (u64)-1; + if (S_ISDIR(inode->vfs_inode.i_mode)) + inode->index_cnt = (u64)-1; - if (inode->i_nlink == 0) { - if (S_ISDIR(inode->i_mode)) { + if (inode->vfs_inode.i_nlink == 0) { + if (S_ISDIR(inode->vfs_inode.i_mode)) { ret = replay_dir_deletes(trans, root, NULL, path, ino, 1); if (ret) @@ -1669,12 +1653,13 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, { int ret; struct btrfs_key key; - struct inode *inode; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = (u64)-1; while (1) { + struct btrfs_inode *inode; + ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) break; @@ -1703,7 +1688,7 @@ static noinline int fixup_inode_link_counts(struct btrfs_trans_handle *trans, } ret = fixup_inode_link_count(trans, inode); - iput(inode); + iput(&inode->vfs_inode); if (ret) break; @@ -1731,12 +1716,14 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, { struct btrfs_key key; int ret = 0; - struct inode *inode; + struct btrfs_inode *inode; + struct inode *vfs_inode; inode = read_one_inode(root, objectid); if (!inode) return -EIO; + vfs_inode = &inode->vfs_inode; key.objectid = BTRFS_TREE_LOG_FIXUP_OBJECTID; key.type = BTRFS_ORPHAN_ITEM_KEY; key.offset = objectid; @@ -1745,15 +1732,15 @@ static noinline int link_to_fixup_dir(struct btrfs_trans_handle *trans, btrfs_release_path(path); if (ret == 0) { - if (!inode->i_nlink) - set_nlink(inode, 1); + if (!vfs_inode->i_nlink) + set_nlink(vfs_inode, 1); else - inc_nlink(inode); - ret = btrfs_update_inode(trans, BTRFS_I(inode)); + inc_nlink(vfs_inode); + ret = btrfs_update_inode(trans, inode); } else if (ret == -EEXIST) { ret = 0; } - iput(inode); + iput(vfs_inode); return ret; } @@ -1769,8 +1756,8 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, const struct fscrypt_str *name, struct btrfs_key *location) { - struct inode *inode; - struct inode *dir; + struct btrfs_inode *inode; + struct btrfs_inode *dir; int ret; inode = read_one_inode(root, location->objectid); @@ -1779,17 +1766,16 @@ static noinline int insert_one_name(struct btrfs_trans_handle *trans, dir = read_one_inode(root, dirid); if (!dir) { - iput(inode); + iput(&inode->vfs_inode); return -EIO; } - ret = btrfs_add_link(trans, BTRFS_I(dir), BTRFS_I(inode), name, - 1, index); + ret = btrfs_add_link(trans, dir, inode, name, 1, index); /* FIXME, put inode into FIXUP list */ - iput(inode); - iput(dir); + 
iput(&inode->vfs_inode); + iput(&dir->vfs_inode); return ret; } @@ -1851,7 +1837,7 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, bool index_dst_matches = false; struct btrfs_key log_key; struct btrfs_key search_key; - struct inode *dir; + struct btrfs_inode *dir; u8 log_flags; bool exists; int ret; @@ -1881,9 +1867,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = PTR_ERR(dir_dst_di); goto out; } else if (dir_dst_di) { - ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, - dir_dst_di, &log_key, - log_flags, exists); + ret = delete_conflicting_dir_entry(trans, dir, path, dir_dst_di, + &log_key, log_flags, exists); if (ret < 0) goto out; dir_dst_matches = (ret == 1); @@ -1898,9 +1883,8 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, ret = PTR_ERR(index_dst_di); goto out; } else if (index_dst_di) { - ret = delete_conflicting_dir_entry(trans, BTRFS_I(dir), path, - index_dst_di, &log_key, - log_flags, exists); + ret = delete_conflicting_dir_entry(trans, dir, path, index_dst_di, + &log_key, log_flags, exists); if (ret < 0) goto out; index_dst_matches = (ret == 1); @@ -1955,11 +1939,11 @@ static noinline int replay_one_name(struct btrfs_trans_handle *trans, out: if (!ret && update_size) { - btrfs_i_size_write(BTRFS_I(dir), dir->i_size + name.len * 2); - ret = btrfs_update_inode(trans, BTRFS_I(dir)); + btrfs_i_size_write(dir, dir->vfs_inode.i_size + name.len * 2); + ret = btrfs_update_inode(trans, dir); } kfree(name.name); - iput(dir); + iput(&dir->vfs_inode); if (!ret && name_added) ret = 1; return ret; @@ -2116,16 +2100,16 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, struct btrfs_root *log, struct btrfs_path *path, struct btrfs_path *log_path, - struct inode *dir, + struct btrfs_inode *dir, struct btrfs_key *dir_key) { - struct btrfs_root *root = BTRFS_I(dir)->root; + struct btrfs_root *root = dir->root; int ret; struct extent_buffer *eb; int slot; struct btrfs_dir_item *di; struct fscrypt_str name = { 0 }; - struct inode *inode = NULL; + struct btrfs_inode *inode = NULL; struct btrfs_key location; /* @@ -2172,9 +2156,8 @@ static noinline int check_item_in_log(struct btrfs_trans_handle *trans, if (ret) goto out; - inc_nlink(inode); - ret = unlink_inode_for_log_replay(trans, BTRFS_I(dir), BTRFS_I(inode), - &name); + inc_nlink(&inode->vfs_inode); + ret = unlink_inode_for_log_replay(trans, dir, inode, &name); /* * Unlike dir item keys, dir index keys can only have one name (entry) in * them, as there are no key collisions since each key has a unique offset @@ -2184,7 +2167,8 @@ out: btrfs_release_path(path); btrfs_release_path(log_path); kfree(name.name); - iput(inode); + if (inode) + iput(&inode->vfs_inode); return ret; } @@ -2308,7 +2292,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, struct btrfs_key dir_key; struct btrfs_key found_key; struct btrfs_path *log_path; - struct inode *dir; + struct btrfs_inode *dir; dir_key.objectid = dirid; dir_key.type = BTRFS_DIR_INDEX_KEY; @@ -2385,7 +2369,7 @@ static noinline int replay_dir_deletes(struct btrfs_trans_handle *trans, out: btrfs_release_path(path); btrfs_free_path(log_path); - iput(dir); + iput(&dir->vfs_inode); return ret; } @@ -2479,7 +2463,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, */ if (S_ISREG(mode)) { struct btrfs_drop_extents_args drop_args = { 0 }; - struct inode *inode; + struct btrfs_inode *inode; u64 from; inode = read_one_inode(root, 
key.objectid); @@ -2487,22 +2471,20 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb, ret = -EIO; break; } - from = ALIGN(i_size_read(inode), + from = ALIGN(i_size_read(&inode->vfs_inode), root->fs_info->sectorsize); drop_args.start = from; drop_args.end = (u64)-1; drop_args.drop_cache = true; - ret = btrfs_drop_extents(wc->trans, root, - BTRFS_I(inode), + ret = btrfs_drop_extents(wc->trans, root, inode, &drop_args); if (!ret) { - inode_sub_bytes(inode, + inode_sub_bytes(&inode->vfs_inode, drop_args.bytes_found); /* Update the inode's nbytes. */ - ret = btrfs_update_inode(wc->trans, - BTRFS_I(inode)); + ret = btrfs_update_inode(wc->trans, inode); } - iput(inode); + iput(&inode->vfs_inode); if (ret) break; } @@ -3269,8 +3251,8 @@ static void free_log_tree(struct btrfs_trans_handle *trans, } } - extent_io_tree_release(&log->dirty_log_pages); - extent_io_tree_release(&log->log_csum_range); + btrfs_extent_io_tree_release(&log->dirty_log_pages); + btrfs_extent_io_tree_release(&log->log_csum_range); btrfs_put_root(log); } @@ -3560,8 +3542,8 @@ static noinline int insert_dir_log_key(struct btrfs_trans_handle *trans, struct btrfs_dir_log_item *item; key.objectid = dirid; - key.offset = first_offset; key.type = BTRFS_DIR_LOG_INDEX_KEY; + key.offset = first_offset; ret = btrfs_insert_empty_item(trans, log, path, &key, sizeof(*item)); /* * -EEXIST is fine and can happen sporadically when we are logging a @@ -4318,8 +4300,8 @@ static int log_csums(struct btrfs_trans_handle *trans, * file which happens to refer to the same extent as well. Such races * can leave checksum items in the log with overlapping ranges. */ - ret = lock_extent(&log_root->log_csum_range, sums->logical, lock_end, - &cached_state); + ret = btrfs_lock_extent(&log_root->log_csum_range, sums->logical, lock_end, + &cached_state); if (ret) return ret; /* @@ -4335,8 +4317,8 @@ static int log_csums(struct btrfs_trans_handle *trans, if (!ret) ret = btrfs_csum_file_blocks(trans, log_root, sums); - unlock_extent(&log_root->log_csum_range, sums->logical, lock_end, - &cached_state); + btrfs_unlock_extent(&log_root->log_csum_range, sums->logical, lock_end, + &cached_state); return ret; } @@ -4666,7 +4648,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, return 0; /* If we're compressed we have to save the entire range of csums. */ - if (extent_map_is_compressed(em)) { + if (btrfs_extent_map_is_compressed(em)) { csum_offset = 0; csum_len = em->disk_num_bytes; } else { @@ -4675,7 +4657,7 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, } /* block start is already adjusted for the file extent offset. 
*/ - block_start = extent_map_block_start(em); + block_start = btrfs_extent_map_block_start(em); csum_root = btrfs_csum_root(trans->fs_info, block_start); ret = btrfs_lookup_csums_list(csum_root, block_start + csum_offset, block_start + csum_offset + csum_len - 1, @@ -4685,9 +4667,9 @@ static int log_extent_csums(struct btrfs_trans_handle *trans, ret = 0; while (!list_empty(&ordered_sums)) { - struct btrfs_ordered_sum *sums = list_entry(ordered_sums.next, - struct btrfs_ordered_sum, - list); + struct btrfs_ordered_sum *sums = list_first_entry(&ordered_sums, + struct btrfs_ordered_sum, + list); if (!ret) ret = log_csums(trans, inode, log_root, sums); list_del(&sums->list); @@ -4710,7 +4692,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, struct btrfs_key key; enum btrfs_compression_type compress_type; u64 extent_offset = em->offset; - u64 block_start = extent_map_block_start(em); + u64 block_start = btrfs_extent_map_block_start(em); u64 block_len; int ret; @@ -4721,7 +4703,7 @@ static int log_one_extent(struct btrfs_trans_handle *trans, btrfs_set_stack_file_extent_type(&fi, BTRFS_FILE_EXTENT_REG); block_len = em->disk_num_bytes; - compress_type = extent_map_compression(em); + compress_type = btrfs_extent_map_compression(em); if (compress_type != BTRFS_COMPRESS_NONE) { btrfs_set_stack_file_extent_disk_bytenr(&fi, block_start); btrfs_set_stack_file_extent_disk_num_bytes(&fi, block_len); @@ -4965,7 +4947,7 @@ static int btrfs_log_changed_extents(struct btrfs_trans_handle *trans, list_sort(NULL, &extents, extent_cmp); process: while (!list_empty(&extents)) { - em = list_entry(extents.next, struct extent_map, list); + em = list_first_entry(&extents, struct extent_map, list); list_del_init(&em->list); @@ -4974,8 +4956,8 @@ process: * private list. 
*/ if (ret) { - clear_em_logging(inode, em); - free_extent_map(em); + btrfs_clear_em_logging(inode, em); + btrfs_free_extent_map(em); continue; } @@ -4983,8 +4965,8 @@ process: ret = log_one_extent(trans, inode, em, path, ctx); write_lock(&tree->lock); - clear_em_logging(inode, em); - free_extent_map(em); + btrfs_clear_em_logging(inode, em); + btrfs_free_extent_map(em); } WARN_ON(!list_empty(&extents)); write_unlock(&tree->lock); @@ -5481,7 +5463,6 @@ static int log_new_dir_dentries(struct btrfs_trans_handle *trans, ihold(&curr_inode->vfs_inode); while (true) { - struct inode *vfs_inode; struct btrfs_key key; struct btrfs_key found_key; u64 next_index; @@ -5497,7 +5478,7 @@ again: struct extent_buffer *leaf = path->nodes[0]; struct btrfs_dir_item *di; struct btrfs_key di_key; - struct inode *di_inode; + struct btrfs_inode *di_inode; int log_mode = LOG_INODE_EXISTS; int type; @@ -5524,17 +5505,16 @@ again: goto out; } - if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + if (!need_log_inode(trans, di_inode)) { + btrfs_add_delayed_iput(di_inode); break; } ctx->log_new_dentries = false; if (type == BTRFS_FT_DIR) log_mode = LOG_INODE_ALL; - ret = btrfs_log_inode(trans, BTRFS_I(di_inode), - log_mode, ctx); - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + ret = btrfs_log_inode(trans, di_inode, log_mode, ctx); + btrfs_add_delayed_iput(di_inode); if (ret) goto out; if (ctx->log_new_dentries) { @@ -5576,14 +5556,13 @@ again: kfree(dir_elem); btrfs_add_delayed_iput(curr_inode); - curr_inode = NULL; - vfs_inode = btrfs_iget_logging(ino, root); - if (IS_ERR(vfs_inode)) { - ret = PTR_ERR(vfs_inode); + curr_inode = btrfs_iget_logging(ino, root); + if (IS_ERR(curr_inode)) { + ret = PTR_ERR(curr_inode); + curr_inode = NULL; break; } - curr_inode = BTRFS_I(vfs_inode); } out: btrfs_free_path(path); @@ -5661,7 +5640,7 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, struct btrfs_log_ctx *ctx) { struct btrfs_ino_list *ino_elem; - struct inode *inode; + struct btrfs_inode *inode; /* * It's rare to have a lot of conflicting inodes, in practice it is not @@ -5752,12 +5731,12 @@ static int add_conflicting_inode(struct btrfs_trans_handle *trans, * inode in LOG_INODE_EXISTS mode and rename operations update the log, * so that the log ends up with the new name and without the old name. */ - if (!need_log_inode(trans, BTRFS_I(inode))) { - btrfs_add_delayed_iput(BTRFS_I(inode)); + if (!need_log_inode(trans, inode)) { + btrfs_add_delayed_iput(inode); return 0; } - btrfs_add_delayed_iput(BTRFS_I(inode)); + btrfs_add_delayed_iput(inode); ino_elem = kmalloc(sizeof(*ino_elem), GFP_NOFS); if (!ino_elem) @@ -5793,7 +5772,7 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, */ while (!list_empty(&ctx->conflict_inodes)) { struct btrfs_ino_list *curr; - struct inode *inode; + struct btrfs_inode *inode; u64 ino; u64 parent; @@ -5829,9 +5808,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * dir index key range logged for the directory. So we * must make sure the deletion is recorded. */ - ret = btrfs_log_inode(trans, BTRFS_I(inode), - LOG_INODE_ALL, ctx); - btrfs_add_delayed_iput(BTRFS_I(inode)); + ret = btrfs_log_inode(trans, inode, LOG_INODE_ALL, ctx); + btrfs_add_delayed_iput(inode); if (ret) break; continue; @@ -5847,8 +5825,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * it again because if some other task logged the inode after * that, we can avoid doing it again. 
*/ - if (!need_log_inode(trans, BTRFS_I(inode))) { - btrfs_add_delayed_iput(BTRFS_I(inode)); + if (!need_log_inode(trans, inode)) { + btrfs_add_delayed_iput(inode); continue; } @@ -5859,8 +5837,8 @@ static int log_conflicting_inodes(struct btrfs_trans_handle *trans, * well because during a rename we pin the log and update the * log with the new name before we unpin it. */ - ret = btrfs_log_inode(trans, BTRFS_I(inode), LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(BTRFS_I(inode)); + ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx); + btrfs_add_delayed_iput(inode); if (ret) break; } @@ -6351,7 +6329,7 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, list_for_each_entry(item, delayed_ins_list, log_list) { struct btrfs_dir_item *dir_item; - struct inode *di_inode; + struct btrfs_inode *di_inode; struct btrfs_key key; int log_mode = LOG_INODE_EXISTS; @@ -6367,8 +6345,8 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, break; } - if (!need_log_inode(trans, BTRFS_I(di_inode))) { - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + if (!need_log_inode(trans, di_inode)) { + btrfs_add_delayed_iput(di_inode); continue; } @@ -6376,12 +6354,12 @@ static int log_new_delayed_dentries(struct btrfs_trans_handle *trans, log_mode = LOG_INODE_ALL; ctx->log_new_dentries = false; - ret = btrfs_log_inode(trans, BTRFS_I(di_inode), log_mode, ctx); + ret = btrfs_log_inode(trans, di_inode, log_mode, ctx); if (!ret && ctx->log_new_dentries) - ret = log_new_dir_dentries(trans, BTRFS_I(di_inode), ctx); + ret = log_new_dir_dentries(trans, di_inode, ctx); - btrfs_add_delayed_iput(BTRFS_I(di_inode)); + btrfs_add_delayed_iput(di_inode); if (ret) break; @@ -6605,6 +6583,19 @@ static int btrfs_log_inode(struct btrfs_trans_handle *trans, btrfs_log_get_delayed_items(inode, &delayed_ins_list, &delayed_del_list); + /* + * If we are fsyncing a file with 0 hard links, then commit the delayed + * inode because the last inode ref (or extref) item may still be in the + * subvolume tree and if we log it the file will still exist after a log + * replay. So commit the delayed inode to delete that last ref and we + * skip logging it. 
+ */ + if (inode->vfs_inode.i_nlink == 0) { + ret = btrfs_commit_inode_delayed_inode(inode); + if (ret) + goto out_unlock; + } + ret = copy_inode_items_to_log(trans, inode, &min_key, &max_key, path, dst_path, logged_isize, inode_only, ctx, @@ -6789,7 +6780,7 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, ptr = btrfs_item_ptr_offset(leaf, slot); while (cur_offset < item_size) { struct btrfs_key inode_key; - struct inode *dir_inode; + struct btrfs_inode *dir_inode; inode_key.type = BTRFS_INODE_ITEM_KEY; inode_key.offset = 0; @@ -6838,18 +6829,16 @@ static int btrfs_log_all_parents(struct btrfs_trans_handle *trans, goto out; } - if (!need_log_inode(trans, BTRFS_I(dir_inode))) { - btrfs_add_delayed_iput(BTRFS_I(dir_inode)); + if (!need_log_inode(trans, dir_inode)) { + btrfs_add_delayed_iput(dir_inode); continue; } ctx->log_new_dentries = false; - ret = btrfs_log_inode(trans, BTRFS_I(dir_inode), - LOG_INODE_ALL, ctx); + ret = btrfs_log_inode(trans, dir_inode, LOG_INODE_ALL, ctx); if (!ret && ctx->log_new_dentries) - ret = log_new_dir_dentries(trans, - BTRFS_I(dir_inode), ctx); - btrfs_add_delayed_iput(BTRFS_I(dir_inode)); + ret = log_new_dir_dentries(trans, dir_inode, ctx); + btrfs_add_delayed_iput(dir_inode); if (ret) goto out; } @@ -6874,7 +6863,7 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, struct extent_buffer *leaf; int slot; struct btrfs_key search_key; - struct inode *inode; + struct btrfs_inode *inode; u64 ino; int ret = 0; @@ -6889,11 +6878,10 @@ static int log_new_ancestors(struct btrfs_trans_handle *trans, if (IS_ERR(inode)) return PTR_ERR(inode); - if (BTRFS_I(inode)->generation >= trans->transid && - need_log_inode(trans, BTRFS_I(inode))) - ret = btrfs_log_inode(trans, BTRFS_I(inode), - LOG_INODE_EXISTS, ctx); - btrfs_add_delayed_iput(BTRFS_I(inode)); + if (inode->generation >= trans->transid && + need_log_inode(trans, inode)) + ret = btrfs_log_inode(trans, inode, LOG_INODE_EXISTS, ctx); + btrfs_add_delayed_iput(inode); if (ret) return ret; @@ -7061,42 +7049,29 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, struct btrfs_root *root = inode->root; struct btrfs_fs_info *fs_info = root->fs_info; int ret = 0; - bool log_dentries = false; + bool log_dentries; - if (btrfs_test_opt(fs_info, NOTREELOG)) { - ret = BTRFS_LOG_FORCE_COMMIT; - goto end_no_trans; - } + if (btrfs_test_opt(fs_info, NOTREELOG)) + return BTRFS_LOG_FORCE_COMMIT; - if (btrfs_root_refs(&root->root_item) == 0) { - ret = BTRFS_LOG_FORCE_COMMIT; - goto end_no_trans; - } + if (btrfs_root_refs(&root->root_item) == 0) + return BTRFS_LOG_FORCE_COMMIT; /* * If we're logging an inode from a subvolume created in the current * transaction we must force a commit since the root is not persisted. */ - if (btrfs_root_generation(&root->root_item) == trans->transid) { - ret = BTRFS_LOG_FORCE_COMMIT; - goto end_no_trans; - } + if (btrfs_root_generation(&root->root_item) == trans->transid) + return BTRFS_LOG_FORCE_COMMIT; - /* - * Skip already logged inodes or inodes corresponding to tmpfiles - * (since logging them is pointless, a link count of 0 means they - * will never be accessible). - */ - if ((btrfs_inode_in_log(inode, trans->transid) && - list_empty(&ctx->ordered_extents)) || - inode->vfs_inode.i_nlink == 0) { - ret = BTRFS_NO_LOG_SYNC; - goto end_no_trans; - } + /* Skip already logged inodes without new extents.
*/ + if (btrfs_inode_in_log(inode, trans->transid) && + list_empty(&ctx->ordered_extents)) + return BTRFS_NO_LOG_SYNC; ret = start_log_trans(trans, root, ctx); if (ret) - goto end_no_trans; + return ret; ret = btrfs_log_inode(trans, inode, inode_only, ctx); if (ret) @@ -7115,8 +7090,11 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, goto end_trans; } - if (S_ISDIR(inode->vfs_inode.i_mode) && ctx->log_new_dentries) - log_dentries = true; + /* + * Track if we need to log dentries because ctx->log_new_dentries can + * be modified in the call chains below. + */ + log_dentries = ctx->log_new_dentries; /* * On unlink we must make sure all our current and old parent directory @@ -7171,8 +7149,6 @@ static int btrfs_log_inode_parent(struct btrfs_trans_handle *trans, if (log_dentries) ret = log_new_dir_dentries(trans, inode, ctx); - else - ret = 0; end_trans: if (ret < 0) { btrfs_set_log_full_commit(trans); @@ -7182,7 +7158,7 @@ end_trans: if (ret) btrfs_remove_log_ctx(root, ctx); btrfs_end_log_trans(root); -end_no_trans: + return ret; } @@ -7247,8 +7223,8 @@ int btrfs_recover_log_trees(struct btrfs_root *log_root_tree) again: key.objectid = BTRFS_TREE_LOG_OBJECTID; - key.offset = (u64)-1; key.type = BTRFS_ROOT_ITEM_KEY; + key.offset = (u64)-1; while (1) { ret = btrfs_search_slot(NULL, log_root_tree, &key, path, 0, 0); diff --git a/fs/btrfs/verity.c b/fs/btrfs/verity.c index e97ad824ae16..b7a96a005487 100644 --- a/fs/btrfs/verity.c +++ b/fs/btrfs/verity.c @@ -485,7 +485,7 @@ static int rollback_verity(struct btrfs_inode *inode) goto out; } inode->ro_flags &= ~BTRFS_INODE_RO_VERITY; - btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + btrfs_sync_inode_flags_to_i_flags(inode); ret = btrfs_update_inode(trans, inode); if (ret) { btrfs_abort_transaction(trans, ret); @@ -552,7 +552,7 @@ static int finish_verity(struct btrfs_inode *inode, const void *desc, goto out; } inode->ro_flags |= BTRFS_INODE_RO_VERITY; - btrfs_sync_inode_flags_to_i_flags(&inode->vfs_inode); + btrfs_sync_inode_flags_to_i_flags(inode); ret = btrfs_update_inode(trans, inode); if (ret) goto end_trans; diff --git a/fs/btrfs/volumes.c b/fs/btrfs/volumes.c index 0a0776489055..89835071cfea 100644 --- a/fs/btrfs/volumes.c +++ b/fs/btrfs/volumes.c @@ -404,7 +404,7 @@ static void btrfs_free_device(struct btrfs_device *device) { WARN_ON(!list_empty(&device->post_commit_list)); rcu_string_free(device->name); - extent_io_tree_release(&device->alloc_state); + btrfs_extent_io_tree_release(&device->alloc_state); btrfs_destroy_dev_zone_info(device); kfree(device); } @@ -415,8 +415,8 @@ static void free_fs_devices(struct btrfs_fs_devices *fs_devices) WARN_ON(fs_devices->opened); while (!list_empty(&fs_devices->devices)) { - device = list_entry(fs_devices->devices.next, - struct btrfs_device, dev_list); + device = list_first_entry(&fs_devices->devices, + struct btrfs_device, dev_list); list_del(&device->dev_list); btrfs_free_device(device); } @@ -428,8 +428,8 @@ void __exit btrfs_cleanup_fs_uuids(void) struct btrfs_fs_devices *fs_devices; while (!list_empty(&fs_uuids)) { - fs_devices = list_entry(fs_uuids.next, - struct btrfs_fs_devices, fs_list); + fs_devices = list_first_entry(&fs_uuids, struct btrfs_fs_devices, + fs_list); list_del(&fs_devices->fs_list); free_fs_devices(fs_devices); } @@ -493,7 +493,7 @@ btrfs_get_bdev_and_sb(const char *device_path, blk_mode_t flags, void *holder, } } invalidate_bdev(bdev); - *disk_super = btrfs_read_dev_super(bdev); + *disk_super = btrfs_read_disk_super(bdev, 0, false); if 
(IS_ERR(*disk_super)) { ret = PTR_ERR(*disk_super); fput(*bdev_file); @@ -733,82 +733,6 @@ const u8 *btrfs_sb_fsid_ptr(const struct btrfs_super_block *sb) return has_metadata_uuid ? sb->metadata_uuid : sb->fsid; } -/* - * We can have very weird soft links passed in. - * One example is "/proc/self/fd/<fd>", which can be a soft link to - * a block device. - * - * But it's never a good idea to use those weird names. - * Here we check if the path (not following symlinks) is a good one inside - * "/dev/". - */ -static bool is_good_dev_path(const char *dev_path) -{ - struct path path = { .mnt = NULL, .dentry = NULL }; - char *path_buf = NULL; - char *resolved_path; - bool is_good = false; - int ret; - - if (!dev_path) - goto out; - - path_buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!path_buf) - goto out; - - /* - * Do not follow soft link, just check if the original path is inside - * "/dev/". - */ - ret = kern_path(dev_path, 0, &path); - if (ret) - goto out; - resolved_path = d_path(&path, path_buf, PATH_MAX); - if (IS_ERR(resolved_path)) - goto out; - if (strncmp(resolved_path, "/dev/", strlen("/dev/"))) - goto out; - is_good = true; -out: - kfree(path_buf); - path_put(&path); - return is_good; -} - -static int get_canonical_dev_path(const char *dev_path, char *canonical) -{ - struct path path = { .mnt = NULL, .dentry = NULL }; - char *path_buf = NULL; - char *resolved_path; - int ret; - - if (!dev_path) { - ret = -EINVAL; - goto out; - } - - path_buf = kmalloc(PATH_MAX, GFP_KERNEL); - if (!path_buf) { - ret = -ENOMEM; - goto out; - } - - ret = kern_path(dev_path, LOOKUP_FOLLOW, &path); - if (ret) - goto out; - resolved_path = d_path(&path, path_buf, PATH_MAX); - if (IS_ERR(resolved_path)) { - ret = PTR_ERR(resolved_path); - goto out; - } - ret = strscpy(canonical, resolved_path, PATH_MAX); -out: - kfree(path_buf); - path_put(&path); - return ret; -} - static bool is_same_device(struct btrfs_device *device, const char *new_path) { struct path old = { .mnt = NULL, .dentry = NULL }; @@ -1225,7 +1149,7 @@ static void btrfs_close_one_device(struct btrfs_device *device) device->fs_info = NULL; atomic_set(&device->dev_stats_ccnt, 0); - extent_io_tree_release(&device->alloc_state); + btrfs_extent_io_tree_release(&device->alloc_state); /* * Reset the flush error record. 
We might have a transient flush error @@ -1401,48 +1325,58 @@ void btrfs_release_disk_super(struct btrfs_super_block *super) put_page(page); } -static struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, - u64 bytenr, u64 bytenr_orig) +struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, + int copy_num, bool drop_cache) { - struct btrfs_super_block *disk_super; + struct btrfs_super_block *super; struct page *page; - void *p; - pgoff_t index; + u64 bytenr, bytenr_orig; + struct address_space *mapping = bdev->bd_mapping; + int ret; - /* make sure our super fits in the device */ - if (bytenr + PAGE_SIZE >= bdev_nr_bytes(bdev)) - return ERR_PTR(-EINVAL); + bytenr_orig = btrfs_sb_offset(copy_num); + ret = btrfs_sb_log_location_bdev(bdev, copy_num, READ, &bytenr); + if (ret < 0) { + if (ret == -ENOENT) + ret = -EINVAL; + return ERR_PTR(ret); + } - /* make sure our super fits in the page */ - if (sizeof(*disk_super) > PAGE_SIZE) + if (bytenr + BTRFS_SUPER_INFO_SIZE >= bdev_nr_bytes(bdev)) return ERR_PTR(-EINVAL); - /* make sure our super doesn't straddle pages on disk */ - index = bytenr >> PAGE_SHIFT; - if ((bytenr + sizeof(*disk_super) - 1) >> PAGE_SHIFT != index) - return ERR_PTR(-EINVAL); + if (drop_cache) { + /* This should only be called with the primary sb. */ + ASSERT(copy_num == 0); - /* pull in the page with our super */ - page = read_cache_page_gfp(bdev->bd_mapping, index, GFP_KERNEL); + /* + * Drop the page of the primary superblock, so later read will + * always read from the device. + */ + invalidate_inode_pages2_range(mapping, bytenr >> PAGE_SHIFT, + (bytenr + BTRFS_SUPER_INFO_SIZE) >> PAGE_SHIFT); + } + page = read_cache_page_gfp(mapping, bytenr >> PAGE_SHIFT, GFP_NOFS); if (IS_ERR(page)) return ERR_CAST(page); - p = page_address(page); - - /* align our pointer to the offset of the super block */ - disk_super = p + offset_in_page(bytenr); - - if (btrfs_super_bytenr(disk_super) != bytenr_orig || - btrfs_super_magic(disk_super) != BTRFS_MAGIC) { - btrfs_release_disk_super(p); + super = page_address(page); + if (btrfs_super_magic(super) != BTRFS_MAGIC || + btrfs_super_bytenr(super) != bytenr_orig) { + btrfs_release_disk_super(super); return ERR_PTR(-EINVAL); } - if (disk_super->label[0] && disk_super->label[BTRFS_LABEL_SIZE - 1]) - disk_super->label[BTRFS_LABEL_SIZE - 1] = 0; + /* + * Make sure the last byte of label is properly NUL terminated. We use + * '%s' to print the label, if not properly NUL terminated we can access + * beyond the label.
+ */ + if (super->label[0] && super->label[BTRFS_LABEL_SIZE - 1]) + super->label[BTRFS_LABEL_SIZE - 1] = 0; - return disk_super; + return super; } int btrfs_forget_devices(dev_t devt) @@ -1513,23 +1447,10 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, bool new_device_added = false; struct btrfs_device *device = NULL; struct file *bdev_file; - char *canonical_path = NULL; - u64 bytenr; dev_t devt; - int ret; lockdep_assert_held(&uuid_mutex); - if (!is_good_dev_path(path)) { - canonical_path = kmalloc(PATH_MAX, GFP_KERNEL); - if (canonical_path) { - ret = get_canonical_dev_path(path, canonical_path); - if (ret < 0) { - kfree(canonical_path); - canonical_path = NULL; - } - } - } /* * Avoid an exclusive open here, as the systemd-udev may initiate the * device scan which may race with the user's mount or mkfs command, @@ -1544,20 +1465,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, if (IS_ERR(bdev_file)) return ERR_CAST(bdev_file); - /* - * We would like to check all the super blocks, but doing so would - * allow a mount to succeed after a mkfs from a different filesystem. - * Currently, recovery from a bad primary btrfs superblock is done - * using the userspace command 'btrfs check --super'. - */ - ret = btrfs_sb_log_location_bdev(file_bdev(bdev_file), 0, READ, &bytenr); - if (ret) { - device = ERR_PTR(ret); - goto error_bdev_put; - } - - disk_super = btrfs_read_disk_super(file_bdev(bdev_file), bytenr, - btrfs_sb_offset(0)); + disk_super = btrfs_read_disk_super(file_bdev(bdev_file), 0, false); if (IS_ERR(disk_super)) { device = ERR_CAST(disk_super); goto error_bdev_put; @@ -1574,8 +1482,7 @@ struct btrfs_device *btrfs_scan_one_device(const char *path, blk_mode_t flags, goto free_disk_super; } - device = device_list_add(canonical_path ? : path, disk_super, - &new_device_added); + device = device_list_add(path, disk_super, &new_device_added); if (!IS_ERR(device) && new_device_added) btrfs_free_stale_devices(device->devt, device); @@ -1584,7 +1491,6 @@ free_disk_super: error_bdev_put: fput(bdev_file); - kfree(canonical_path); return device; } @@ -1600,9 +1506,9 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start, lockdep_assert_held(&device->fs_info->chunk_mutex); - if (find_first_extent_bit(&device->alloc_state, *start, - &physical_start, &physical_end, - CHUNK_ALLOCATED, NULL)) { + if (btrfs_find_first_extent_bit(&device->alloc_state, *start, + &physical_start, &physical_end, + CHUNK_ALLOCATED, NULL)) { if (in_range(physical_start, *start, len) || in_range(*start, physical_start, @@ -1617,6 +1523,9 @@ static bool contains_pending_extent(struct btrfs_device *device, u64 *start, static u64 dev_extent_search_start(struct btrfs_device *device) { switch (device->fs_devices->chunk_alloc_policy) { + default: + btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy); + fallthrough; case BTRFS_CHUNK_ALLOC_REGULAR: return BTRFS_DEVICE_RANGE_RESERVED; case BTRFS_CHUNK_ALLOC_ZONED: @@ -1626,8 +1535,6 @@ static u64 dev_extent_search_start(struct btrfs_device *device) * for superblock logging. 
*/ return 0; - default: - BUG(); } } @@ -1640,7 +1547,8 @@ static bool dev_extent_hole_check_zoned(struct btrfs_device *device, int ret; bool changed = false; - ASSERT(IS_ALIGNED(*hole_start, zone_size)); + ASSERT(IS_ALIGNED(*hole_start, zone_size), + "hole_start=%llu zone_size=%llu", *hole_start, zone_size); while (*hole_size > 0) { pos = btrfs_find_allocatable_zones(device, *hole_start, @@ -1706,6 +1614,9 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, } switch (device->fs_devices->chunk_alloc_policy) { + default: + btrfs_warn_unknown_chunk_allocation(device->fs_devices->chunk_alloc_policy); + fallthrough; case BTRFS_CHUNK_ALLOC_REGULAR: /* No extra check */ break; @@ -1720,8 +1631,6 @@ static bool dev_extent_hole_check(struct btrfs_device *device, u64 *hole_start, continue; } break; - default: - BUG(); } break; @@ -1798,8 +1707,8 @@ again: path->skip_locking = 1; key.objectid = device->devid; - key.offset = search_start; key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = search_start; ret = btrfs_search_backwards(root, &key, path); if (ret < 0) @@ -1891,7 +1800,9 @@ next: else ret = 0; - ASSERT(max_hole_start + max_hole_size <= search_end); + ASSERT(max_hole_start + max_hole_size <= search_end, + "max_hole_start=%llu max_hole_size=%llu search_end=%llu", + max_hole_start, max_hole_size, search_end); out: btrfs_free_path(path); *start = max_hole_start; @@ -1918,8 +1829,8 @@ static int btrfs_free_dev_extent(struct btrfs_trans_handle *trans, return -ENOMEM; key.objectid = device->devid; - key.offset = start; key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = start; again: ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret > 0) { @@ -2204,7 +2115,7 @@ static u64 btrfs_num_devices(struct btrfs_fs_info *fs_info) down_read(&fs_info->dev_replace.rwsem); if (btrfs_dev_replace_is_ongoing(&fs_info->dev_replace)) { - ASSERT(num_devices > 1); + ASSERT(num_devices > 1, "num_devices=%llu", num_devices); num_devices--; } up_read(&fs_info->dev_replace.rwsem); @@ -2220,7 +2131,7 @@ static void btrfs_scratch_superblock(struct btrfs_fs_info *fs_info, const u64 bytenr = btrfs_sb_offset(copy_num); int ret; - disk_super = btrfs_read_disk_super(bdev, bytenr, bytenr); + disk_super = btrfs_read_disk_super(bdev, copy_num, false); if (IS_ERR(disk_super)) return; @@ -2408,7 +2319,7 @@ int btrfs_rm_device(struct btrfs_fs_info *fs_info, */ if (cur_devices->num_devices == 0) { list_del_init(&cur_devices->seed_list); - ASSERT(cur_devices->opened == 1); + ASSERT(cur_devices->opened == 1, "opened=%d", cur_devices->opened); cur_devices->opened--; free_fs_devices(cur_devices); } @@ -2721,8 +2632,8 @@ static int btrfs_finish_sprout(struct btrfs_trans_handle *trans) return -ENOMEM; key.objectid = BTRFS_DEV_ITEMS_OBJECTID; - key.offset = 0; key.type = BTRFS_DEV_ITEM_KEY; + key.offset = 0; while (1) { btrfs_reserve_chunk_metadata(trans, false); @@ -3119,8 +3030,8 @@ static int btrfs_free_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) return -ENOMEM; key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - key.offset = chunk_offset; key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = chunk_offset; ret = btrfs_search_slot(trans, root, &key, path, -1, 1); if (ret < 0) @@ -3338,7 +3249,8 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) * user having built with ASSERT enabled, so if ASSERT doesn't * do anything we still error out. 
*/ - ASSERT(0); + DEBUG_WARN("error %ld reading chunk map at offset %llu", + PTR_ERR(map), chunk_offset); return PTR_ERR(map); } @@ -3419,8 +3331,16 @@ int btrfs_remove_chunk(struct btrfs_trans_handle *trans, u64 chunk_offset) if (ret == -ENOSPC) { const u64 sys_flags = btrfs_system_alloc_profile(fs_info); struct btrfs_block_group *sys_bg; + struct btrfs_space_info *space_info; - sys_bg = btrfs_create_chunk(trans, sys_flags); + space_info = btrfs_find_space_info(fs_info, sys_flags); + if (!space_info) { + ret = -EINVAL; + btrfs_abort_transaction(trans, ret); + goto out; + } + + sys_bg = btrfs_create_chunk(trans, space_info, sys_flags); if (IS_ERR(sys_bg)) { ret = PTR_ERR(sys_bg); btrfs_abort_transaction(trans, ret); @@ -3577,8 +3497,8 @@ static int btrfs_relocate_sys_chunks(struct btrfs_fs_info *fs_info) again: key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = (u64)-1; while (1) { mutex_lock(&fs_info->reclaim_bgs_lock); @@ -3880,26 +3800,25 @@ static void reset_balance_state(struct btrfs_fs_info *fs_info) * Balance filters. Return 1 if chunk should be filtered out * (should not be balanced). */ -static int chunk_profiles_filter(u64 chunk_type, - struct btrfs_balance_args *bargs) +static bool chunk_profiles_filter(u64 chunk_type, struct btrfs_balance_args *bargs) { chunk_type = chunk_to_extended(chunk_type) & BTRFS_EXTENDED_PROFILE_MASK; if (bargs->profiles & chunk_type) - return 0; + return false; - return 1; + return true; } -static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, - struct btrfs_balance_args *bargs) +static bool chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, + struct btrfs_balance_args *bargs) { struct btrfs_block_group *cache; u64 chunk_used; u64 user_thresh_min; u64 user_thresh_max; - int ret = 1; + bool ret = true; cache = btrfs_lookup_block_group(fs_info, chunk_offset); chunk_used = cache->used; @@ -3917,18 +3836,18 @@ static int chunk_usage_range_filter(struct btrfs_fs_info *fs_info, u64 chunk_off user_thresh_max = mult_perc(cache->length, bargs->usage_max); if (user_thresh_min <= chunk_used && chunk_used < user_thresh_max) - ret = 0; + ret = false; btrfs_put_block_group(cache); return ret; } -static int chunk_usage_filter(struct btrfs_fs_info *fs_info, - u64 chunk_offset, struct btrfs_balance_args *bargs) +static bool chunk_usage_filter(struct btrfs_fs_info *fs_info, u64 chunk_offset, + struct btrfs_balance_args *bargs) { struct btrfs_block_group *cache; u64 chunk_used, user_thresh; - int ret = 1; + bool ret = true; cache = btrfs_lookup_block_group(fs_info, chunk_offset); chunk_used = cache->used; @@ -3941,15 +3860,14 @@ static int chunk_usage_filter(struct btrfs_fs_info *fs_info, user_thresh = mult_perc(cache->length, bargs->usage); if (chunk_used < user_thresh) - ret = 0; + ret = false; btrfs_put_block_group(cache); return ret; } -static int chunk_devid_filter(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, - struct btrfs_balance_args *bargs) +static bool chunk_devid_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) { struct btrfs_stripe *stripe; int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); @@ -3958,10 +3876,10 @@ static int chunk_devid_filter(struct extent_buffer *leaf, for (i = 0; i < num_stripes; i++) { stripe = btrfs_stripe_nr(chunk, i); if (btrfs_stripe_devid(leaf, stripe) == bargs->devid) - return 0; + return false; } - return 1; + return true; } static u64
calc_data_stripes(u64 type, int num_stripes) @@ -3974,9 +3892,8 @@ static u64 calc_data_stripes(u64 type, int num_stripes) } /* [pstart, pend) */ -static int chunk_drange_filter(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, - struct btrfs_balance_args *bargs) +static bool chunk_drange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) { struct btrfs_stripe *stripe; int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); @@ -3987,7 +3904,7 @@ static int chunk_drange_filter(struct extent_buffer *leaf, int i; if (!(bargs->flags & BTRFS_BALANCE_ARGS_DEVID)) - return 0; + return false; type = btrfs_chunk_type(leaf, chunk); factor = calc_data_stripes(type, num_stripes); @@ -4003,56 +3920,53 @@ static int chunk_drange_filter(struct extent_buffer *leaf, if (stripe_offset < bargs->pend && stripe_offset + stripe_length > bargs->pstart) - return 0; + return false; } - return 1; + return true; } /* [vstart, vend) */ -static int chunk_vrange_filter(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, - u64 chunk_offset, - struct btrfs_balance_args *bargs) +static bool chunk_vrange_filter(struct extent_buffer *leaf, struct btrfs_chunk *chunk, + u64 chunk_offset, struct btrfs_balance_args *bargs) { if (chunk_offset < bargs->vend && chunk_offset + btrfs_chunk_length(leaf, chunk) > bargs->vstart) /* at least part of the chunk is inside this vrange */ - return 0; + return false; - return 1; + return true; } -static int chunk_stripes_range_filter(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, - struct btrfs_balance_args *bargs) +static bool chunk_stripes_range_filter(struct extent_buffer *leaf, + struct btrfs_chunk *chunk, + struct btrfs_balance_args *bargs) { int num_stripes = btrfs_chunk_num_stripes(leaf, chunk); if (bargs->stripes_min <= num_stripes && num_stripes <= bargs->stripes_max) - return 0; + return false; - return 1; + return true; } -static int chunk_soft_convert_filter(u64 chunk_type, - struct btrfs_balance_args *bargs) +static bool chunk_soft_convert_filter(u64 chunk_type, struct btrfs_balance_args *bargs) { if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT)) - return 0; + return false; chunk_type = chunk_to_extended(chunk_type) & BTRFS_EXTENDED_PROFILE_MASK; if (bargs->target == chunk_type) - return 1; + return true; - return 0; + return false; } -static int should_balance_chunk(struct extent_buffer *leaf, - struct btrfs_chunk *chunk, u64 chunk_offset) +static bool should_balance_chunk(struct extent_buffer *leaf, struct btrfs_chunk *chunk, + u64 chunk_offset) { struct btrfs_fs_info *fs_info = leaf->fs_info; struct btrfs_balance_control *bctl = fs_info->balance_ctl; @@ -4062,7 +3976,7 @@ static int should_balance_chunk(struct extent_buffer *leaf, /* type filter */ if (!((chunk_type & BTRFS_BLOCK_GROUP_TYPE_MASK) & (bctl->flags & BTRFS_BALANCE_TYPE_MASK))) { - return 0; + return false; } if (chunk_type & BTRFS_BLOCK_GROUP_DATA) @@ -4075,46 +3989,46 @@ static int should_balance_chunk(struct extent_buffer *leaf, /* profiles filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_PROFILES) && chunk_profiles_filter(chunk_type, bargs)) { - return 0; + return false; } /* usage filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE) && chunk_usage_filter(fs_info, chunk_offset, bargs)) { - return 0; + return false; } else if ((bargs->flags & BTRFS_BALANCE_ARGS_USAGE_RANGE) && chunk_usage_range_filter(fs_info, chunk_offset, bargs)) { - return 0; + return false; } /* devid filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_DEVID) && 
chunk_devid_filter(leaf, chunk, bargs)) { - return 0; + return false; } /* drange filter, makes sense only with devid filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_DRANGE) && chunk_drange_filter(leaf, chunk, bargs)) { - return 0; + return false; } /* vrange filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_VRANGE) && chunk_vrange_filter(leaf, chunk, chunk_offset, bargs)) { - return 0; + return false; } /* stripes filter */ if ((bargs->flags & BTRFS_BALANCE_ARGS_STRIPES_RANGE) && chunk_stripes_range_filter(leaf, chunk, bargs)) { - return 0; + return false; } /* soft profile changing mode */ if ((bargs->flags & BTRFS_BALANCE_ARGS_SOFT) && chunk_soft_convert_filter(chunk_type, bargs)) { - return 0; + return false; } /* @@ -4122,7 +4036,7 @@ static int should_balance_chunk(struct extent_buffer *leaf, */ if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT)) { if (bargs->limit == 0) - return 0; + return false; else bargs->limit--; } else if ((bargs->flags & BTRFS_BALANCE_ARGS_LIMIT_RANGE)) { @@ -4132,12 +4046,12 @@ static int should_balance_chunk(struct extent_buffer *leaf, * about the count of all chunks that satisfy the filters. */ if (bargs->limit_max == 0) - return 0; + return false; else bargs->limit_max--; } - return 1; + return true; } static int __btrfs_balance(struct btrfs_fs_info *fs_info) @@ -4184,8 +4098,8 @@ again: bctl->sys.limit = limit_sys; } key.objectid = BTRFS_FIRST_CHUNK_TREE_OBJECTID; - key.offset = (u64)-1; key.type = BTRFS_CHUNK_ITEM_KEY; + key.offset = (u64)-1; while (1) { if ((!counting && atomic_read(&fs_info->balance_pause_req)) || @@ -4752,7 +4666,8 @@ int btrfs_resume_balance_async(struct btrfs_fs_info *fs_info) } spin_lock(&fs_info->super_lock); - ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED); + ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED, + "exclusive_operation=%d", fs_info->exclusive_operation); fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE; spin_unlock(&fs_info->super_lock); /* @@ -5001,8 +4916,8 @@ int btrfs_shrink_device(struct btrfs_device *device, u64 new_size) again: key.objectid = device->devid; - key.offset = (u64)-1; key.type = BTRFS_DEV_EXTENT_KEY; + key.offset = (u64)-1; do { mutex_lock(&fs_info->reclaim_bgs_lock); @@ -5088,8 +5003,8 @@ again: mutex_lock(&fs_info->chunk_mutex); /* Clear all state bits beyond the shrunk device size */ - clear_extent_bits(&device->alloc_state, new_size, (u64)-1, - CHUNK_STATE_MASK); + btrfs_clear_extent_bits(&device->alloc_state, new_size, (u64)-1, + CHUNK_STATE_MASK); btrfs_device_set_disk_total_bytes(device, new_size); if (list_empty(&device->post_commit_list)) @@ -5216,6 +5131,8 @@ struct alloc_chunk_ctl { u64 stripe_size; u64 chunk_size; int ndevs; + /* Space_info the block group is going to belong. 
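 * (That is, the space_info the new block group will be accounted to:
 * callers resolve it once with btrfs_find_space_info() and thread it
 * through btrfs_create_chunk() down to create_chunk() and
 * btrfs_make_block_group(), as the surrounding hunks show, rather than
 * re-deriving it from the chunk type flags at each level.)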
*/ + struct btrfs_space_info *space_info; }; static void init_alloc_chunk_ctl_policy_regular( @@ -5289,14 +5206,15 @@ static void init_alloc_chunk_ctl(struct btrfs_fs_devices *fs_devices, ctl->ndevs = 0; switch (fs_devices->chunk_alloc_policy) { + default: + btrfs_warn_unknown_chunk_allocation(fs_devices->chunk_alloc_policy); + fallthrough; case BTRFS_CHUNK_ALLOC_REGULAR: init_alloc_chunk_ctl_policy_regular(fs_devices, ctl); break; case BTRFS_CHUNK_ALLOC_ZONED: init_alloc_chunk_ctl_policy_zoned(fs_devices, ctl); break; - default: - BUG(); } } @@ -5435,7 +5353,9 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, * It should hold because: * dev_extent_min == dev_extent_want == zone_size * dev_stripes */ - ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min); + ASSERT(devices_info[ctl->ndevs - 1].max_avail == ctl->dev_extent_min, + "ndevs=%d max_avail=%llu dev_extent_min=%llu", ctl->ndevs, + devices_info[ctl->ndevs - 1].max_avail, ctl->dev_extent_min); ctl->stripe_size = zone_size; ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; @@ -5448,7 +5368,9 @@ static int decide_stripe_size_zoned(struct alloc_chunk_ctl *ctl, ctl->dev_stripes); ctl->num_stripes = ctl->ndevs * ctl->dev_stripes; data_stripes = (ctl->num_stripes - ctl->nparity) / ctl->ncopies; - ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size); + ASSERT(ctl->stripe_size * data_stripes <= ctl->max_chunk_size, + "stripe_size=%llu data_stripes=%d max_chunk_size=%llu", + ctl->stripe_size, data_stripes, ctl->max_chunk_size); } ctl->chunk_size = ctl->stripe_size * data_stripes; @@ -5481,12 +5403,13 @@ static int decide_stripe_size(struct btrfs_fs_devices *fs_devices, ctl->ndevs = min(ctl->ndevs, ctl->devs_max); switch (fs_devices->chunk_alloc_policy) { + default: + btrfs_warn_unknown_chunk_allocation(fs_devices->chunk_alloc_policy); + fallthrough; case BTRFS_CHUNK_ALLOC_REGULAR: return decide_stripe_size_regular(ctl, devices_info); case BTRFS_CHUNK_ALLOC_ZONED: return decide_stripe_size_zoned(ctl, devices_info); - default: - BUG(); } } @@ -5496,9 +5419,9 @@ static void chunk_map_device_set_bits(struct btrfs_chunk_map *map, unsigned int struct btrfs_io_stripe *stripe = &map->stripes[i]; struct btrfs_device *device = stripe->dev; - set_extent_bit(&device->alloc_state, stripe->physical, - stripe->physical + map->stripe_size - 1, - bits | EXTENT_NOWAIT, NULL); + btrfs_set_extent_bit(&device->alloc_state, stripe->physical, + stripe->physical + map->stripe_size - 1, + bits | EXTENT_NOWAIT, NULL); } } @@ -5508,10 +5431,9 @@ static void chunk_map_device_clear_bits(struct btrfs_chunk_map *map, unsigned in struct btrfs_io_stripe *stripe = &map->stripes[i]; struct btrfs_device *device = stripe->dev; - __clear_extent_bit(&device->alloc_state, stripe->physical, - stripe->physical + map->stripe_size - 1, - bits | EXTENT_NOWAIT, - NULL, NULL); + btrfs_clear_extent_bits(&device->alloc_state, stripe->physical, + stripe->physical + map->stripe_size - 1, + bits | EXTENT_NOWAIT); } } @@ -5618,7 +5540,8 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, return ERR_PTR(ret); } - block_group = btrfs_make_block_group(trans, type, start, ctl->chunk_size); + block_group = btrfs_make_block_group(trans, ctl->space_info, type, start, + ctl->chunk_size); if (IS_ERR(block_group)) { btrfs_remove_chunk_map(info, map); return block_group; @@ -5644,7 +5567,8 @@ static struct btrfs_block_group *create_chunk(struct btrfs_trans_handle *trans, } struct btrfs_block_group *btrfs_create_chunk(struct 
btrfs_trans_handle *trans, - u64 type) + struct btrfs_space_info *space_info, + u64 type) { struct btrfs_fs_info *info = trans->fs_info; struct btrfs_fs_devices *fs_devices = info->fs_devices; @@ -5656,7 +5580,7 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, lockdep_assert_held(&info->chunk_mutex); if (!alloc_profile_is_valid(type, 0)) { - ASSERT(0); + DEBUG_WARN("invalid alloc profile for type %llu", type); return ERR_PTR(-EINVAL); } @@ -5668,12 +5592,13 @@ struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, if (!(type & BTRFS_BLOCK_GROUP_TYPE_MASK)) { btrfs_err(info, "invalid chunk type 0x%llx requested", type); - ASSERT(0); + DEBUG_WARN(); return ERR_PTR(-EINVAL); } ctl.start = find_next_chunk(info); ctl.type = type; + ctl.space_info = space_info; init_alloc_chunk_ctl(fs_devices, &ctl); devices_info = kcalloc(fs_devices->rw_devices, sizeof(*devices_info), @@ -5817,7 +5742,9 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) struct btrfs_fs_info *fs_info = trans->fs_info; u64 alloc_profile; struct btrfs_block_group *meta_bg; + struct btrfs_space_info *meta_space_info; struct btrfs_block_group *sys_bg; + struct btrfs_space_info *sys_space_info; /* * When adding a new device for sprouting, the seed device is read-only @@ -5841,12 +5768,22 @@ static noinline int init_first_rw_device(struct btrfs_trans_handle *trans) */ alloc_profile = btrfs_metadata_alloc_profile(fs_info); - meta_bg = btrfs_create_chunk(trans, alloc_profile); + meta_space_info = btrfs_find_space_info(fs_info, alloc_profile); + if (!meta_space_info) { + DEBUG_WARN(); + return -EINVAL; + } + meta_bg = btrfs_create_chunk(trans, meta_space_info, alloc_profile); if (IS_ERR(meta_bg)) return PTR_ERR(meta_bg); alloc_profile = btrfs_system_alloc_profile(fs_info); - sys_bg = btrfs_create_chunk(trans, alloc_profile); + sys_space_info = btrfs_find_space_info(fs_info, alloc_profile); + if (!sys_space_info) { + DEBUG_WARN(); + return -EINVAL; + } + sys_bg = btrfs_create_chunk(trans, sys_space_info, alloc_profile); if (IS_ERR(sys_bg)) return PTR_ERR(sys_bg); @@ -6046,7 +5983,7 @@ static int btrfs_read_rr(const struct btrfs_chunk_map *map, int first, int num_s static int find_live_mirror(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map, int first, - int dev_replace_is_ongoing) + bool dev_replace_is_ongoing) { const enum btrfs_read_policy policy = READ_ONCE(fs_info->fs_devices->read_policy); int i; @@ -6055,8 +5992,8 @@ static int find_live_mirror(struct btrfs_fs_info *fs_info, int tolerance; struct btrfs_device *srcdev; - ASSERT((map->type & - (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10))); + ASSERT((map->type & (BTRFS_BLOCK_GROUP_RAID1_MASK | BTRFS_BLOCK_GROUP_RAID10)), + "type=%llu", map->type); if (map->type & BTRFS_BLOCK_GROUP_RAID10) num_stripes = map->sub_stripes; @@ -6357,7 +6294,7 @@ static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc, } /* We can only have at most 2 extra nr_stripes (for DUP). */ - ASSERT(nr_extra_stripes <= 2); + ASSERT(nr_extra_stripes <= 2, "nr_extra_stripes=%d", nr_extra_stripes); /* * For GET_READ_MIRRORS, we can only return at most 1 extra stripe for * replace. @@ -6368,7 +6305,8 @@ static void handle_ops_on_dev_replace(struct btrfs_io_context *bioc, struct btrfs_io_stripe *second = &bioc->stripes[num_stripes + 1]; /* Only DUP can have two extra stripes. 
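 * A note on the ASSERT() changes throughout this file: the series
 * converts bare ASSERT(cond) calls to the variant that also takes a
 * printf-style message, so a failed assertion reports the offending
 * values. The asserted conditions themselves are unchanged, e.g.:
 *
 *    ASSERT(nr_extra_stripes <= 2);                          (old)
 *    ASSERT(nr_extra_stripes <= 2,
 *           "nr_extra_stripes=%d", nr_extra_stripes);        (new)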
*/ - ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP); + ASSERT(bioc->map_type & BTRFS_BLOCK_GROUP_DUP, + "map_type=%llu", bioc->map_type); /* * Swap the last stripe stripes and reduce @nr_extra_stripes. @@ -6395,7 +6333,8 @@ static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset, */ io_geom->stripe_offset = offset & BTRFS_STRIPE_LEN_MASK; io_geom->stripe_nr = offset >> BTRFS_STRIPE_LEN_SHIFT; - ASSERT(io_geom->stripe_offset < U32_MAX); + ASSERT(io_geom->stripe_offset < U32_MAX, + "stripe_offset=%llu", io_geom->stripe_offset); if (map->type & BTRFS_BLOCK_GROUP_RAID56_MASK) { unsigned long full_stripe_len = @@ -6413,8 +6352,12 @@ static u64 btrfs_max_io_len(struct btrfs_chunk_map *map, u64 offset, io_geom->raid56_full_stripe_start = btrfs_stripe_nr_to_offset( rounddown(io_geom->stripe_nr, nr_data_stripes(map))); - ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset); - ASSERT(io_geom->raid56_full_stripe_start <= offset); + ASSERT(io_geom->raid56_full_stripe_start + full_stripe_len > offset, + "raid56_full_stripe_start=%llu full_stripe_len=%lu offset=%llu", + io_geom->raid56_full_stripe_start, full_stripe_len, offset); + ASSERT(io_geom->raid56_full_stripe_start <= offset, + "raid56_full_stripe_start=%llu offset=%llu", + io_geom->raid56_full_stripe_start, offset); /* * For writes to RAID56, allow to write a full stripe set, but * no straddling of stripe sets. @@ -6580,7 +6523,7 @@ static void map_blocks_raid56_read(struct btrfs_chunk_map *map, { int data_stripes = nr_data_stripes(map); - ASSERT(io_geom->mirror_num <= 1); + ASSERT(io_geom->mirror_num <= 1, "mirror_num=%d", io_geom->mirror_num); /* Just grab the data stripe directly. */ io_geom->stripe_index = io_geom->stripe_nr % data_stripes; io_geom->stripe_nr /= data_stripes; @@ -6648,7 +6591,7 @@ int btrfs_map_block(struct btrfs_fs_info *fs_info, enum btrfs_map_op op, int num_copies; struct btrfs_io_context *bioc = NULL; struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; - int dev_replace_is_ongoing = 0; + bool dev_replace_is_ongoing = false; u16 num_alloc_stripes; u64 max_len; @@ -6953,7 +6896,7 @@ struct btrfs_device *btrfs_alloc_device(struct btrfs_fs_info *fs_info, atomic_set(&dev->dev_stats_ccnt, 0); btrfs_device_data_ordered_init(dev); - extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE); + btrfs_extent_io_tree_init(fs_info, &dev->alloc_state, IO_TREE_DEVICE_ALLOC_STATE); if (devid) tmp = *devid; @@ -7155,6 +7098,7 @@ static int read_one_chunk(struct btrfs_key *key, struct extent_buffer *leaf, btrfs_err(fs_info, "failed to add chunk map, start=%llu len=%llu: %d", map->start, map->chunk_len, ret); + btrfs_free_chunk_map(map); } return ret; @@ -7200,8 +7144,12 @@ static struct btrfs_fs_devices *open_seed_devices(struct btrfs_fs_info *fs_info, fs_devices = find_fsid(fsid, NULL); if (!fs_devices) { - if (!btrfs_test_opt(fs_info, DEGRADED)) + if (!btrfs_test_opt(fs_info, DEGRADED)) { + btrfs_err(fs_info, + "failed to find fsid %pU when attempting to open seed devices", + fsid); return ERR_PTR(-ENOENT); + } fs_devices = alloc_fs_devices(fsid); if (IS_ERR(fs_devices)) @@ -7534,8 +7482,8 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info) * item - BTRFS_FIRST_CHUNK_TREE_OBJECTID). 
*/ key.objectid = BTRFS_DEV_ITEMS_OBJECTID; - key.offset = 0; key.type = 0; + key.offset = 0; btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) { struct extent_buffer *node = path->nodes[1]; @@ -7920,7 +7868,7 @@ void btrfs_commit_device_sizes(struct btrfs_transaction *trans) { struct btrfs_device *curr, *next; - ASSERT(trans->state == TRANS_STATE_COMMIT_DOING); + ASSERT(trans->state == TRANS_STATE_COMMIT_DOING, "state=%d" , trans->state); if (list_empty(&trans->dev_update_list)) return; @@ -8289,7 +8237,7 @@ static void map_raid56_repair_block(struct btrfs_io_context *bioc, logical < stripe_start + BTRFS_STRIPE_LEN) break; } - ASSERT(i < data_stripes); + ASSERT(i < data_stripes, "i=%d data_stripes=%d", i, data_stripes); smap->dev = bioc->stripes[i].dev; smap->physical = bioc->stripes[i].physical + ((logical - bioc->full_stripe_logical) & @@ -8318,7 +8266,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, int mirror_ret = mirror_num; int ret; - ASSERT(mirror_num > 0); + ASSERT(mirror_num > 0, "mirror_num=%d", mirror_num); ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical, &map_length, &bioc, smap, &mirror_ret); @@ -8326,7 +8274,7 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, return ret; /* The map range should not cross stripe boundary. */ - ASSERT(map_length >= length); + ASSERT(map_length >= length, "map_length=%llu length=%u", map_length, length); /* Already mapped to single stripe. */ if (!bioc) @@ -8338,7 +8286,8 @@ int btrfs_map_repair_block(struct btrfs_fs_info *fs_info, goto out; } - ASSERT(mirror_num <= bioc->num_stripes); + ASSERT(mirror_num <= bioc->num_stripes, + "mirror_num=%d num_stripes=%d", mirror_num, bioc->num_stripes); smap->dev = bioc->stripes[mirror_num - 1].dev; smap->physical = bioc->stripes[mirror_num - 1].physical; out: diff --git a/fs/btrfs/volumes.h b/fs/btrfs/volumes.h index 120f65e21eeb..137cc232f58e 100644 --- a/fs/btrfs/volumes.h +++ b/fs/btrfs/volumes.h @@ -7,6 +7,7 @@ #define BTRFS_VOLUMES_H #include <linux/blk_types.h> +#include <linux/blkdev.h> #include <linux/sizes.h> #include <linux/atomic.h> #include <linux/sort.h> @@ -18,14 +19,17 @@ #include <linux/completion.h> #include <linux/rbtree.h> #include <uapi/linux/btrfs.h> +#include <uapi/linux/btrfs_tree.h> #include "messages.h" #include "rcu-string.h" +#include "extent-io-tree.h" struct block_device; struct bdev_handle; struct btrfs_fs_info; struct btrfs_block_group; struct btrfs_trans_handle; +struct btrfs_transaction; struct btrfs_zoned_device_info; #define BTRFS_MAX_DATA_CHUNK_SIZE (10ULL * SZ_1G) @@ -469,7 +473,6 @@ struct btrfs_io_stripe { struct btrfs_device *dev; /* Block mapping. */ u64 physical; - u64 length; bool rst_search_commit_root; /* For the endio handler. 
*/ struct btrfs_io_context *bioc; @@ -711,7 +714,8 @@ struct btrfs_discard_stripe *btrfs_map_discard(struct btrfs_fs_info *fs_info, int btrfs_read_sys_array(struct btrfs_fs_info *fs_info); int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info); struct btrfs_block_group *btrfs_create_chunk(struct btrfs_trans_handle *trans, - u64 type); + struct btrfs_space_info *space_info, + u64 type); void btrfs_mapping_tree_free(struct btrfs_fs_info *fs_info); int btrfs_open_devices(struct btrfs_fs_devices *fs_devices, blk_mode_t flags, void *holder); @@ -782,6 +786,8 @@ struct btrfs_chunk_map *btrfs_find_chunk_map_nolock(struct btrfs_fs_info *fs_inf struct btrfs_chunk_map *btrfs_get_chunk_map(struct btrfs_fs_info *fs_info, u64 logical, u64 length); void btrfs_remove_chunk_map(struct btrfs_fs_info *fs_info, struct btrfs_chunk_map *map); +struct btrfs_super_block *btrfs_read_disk_super(struct block_device *bdev, + int copy_num, bool drop_cache); void btrfs_release_disk_super(struct btrfs_super_block *super); static inline void btrfs_dev_stat_inc(struct btrfs_device *dev, @@ -843,6 +849,11 @@ static inline const char *btrfs_dev_name(const struct btrfs_device *device) return rcu_str_deref(device->name); } +static inline void btrfs_warn_unknown_chunk_allocation(enum btrfs_chunk_allocation_policy pol) +{ + WARN_ONCE(1, "unknown allocation policy %d, fallback to regular", pol); +} + void btrfs_commit_device_sizes(struct btrfs_transaction *trans); struct list_head * __attribute_const__ btrfs_get_fs_uuids(void); diff --git a/fs/btrfs/xattr.h b/fs/btrfs/xattr.h index 8dc4cf49f6f0..0ce10e4ec836 100644 --- a/fs/btrfs/xattr.h +++ b/fs/btrfs/xattr.h @@ -6,6 +6,8 @@ #ifndef BTRFS_XATTR_H #define BTRFS_XATTR_H +#include <linux/types.h> + struct dentry; struct inode; struct qstr; diff --git a/fs/btrfs/zlib.c b/fs/btrfs/zlib.c index c9e92c6941ec..5292cd341f70 100644 --- a/fs/btrfs/zlib.c +++ b/fs/btrfs/zlib.c @@ -94,6 +94,45 @@ fail: return ERR_PTR(-ENOMEM); } +/* + * Helper for S390x with hardware zlib compression support. + * + * That hardware acceleration requires a buffer size larger than a single page + * to get ideal performance, thus we need to do the memory copy rather than + * use the page cache directly as input buffer. + */ +static int copy_data_into_buffer(struct address_space *mapping, + struct workspace *workspace, u64 filepos, + unsigned long length) +{ + u64 cur = filepos; + + /* It's only for hardware accelerated zlib code. 
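 * (The loop below walks the range folio by folio:
 * btrfs_compress_filemap_get_folio() pins the folio covering @cur,
 * kmap_local_folio() maps it at the in-folio offset, and each memcpy()
 * is bounded by the bytes remaining in the folio and in the request.
 * This replaces the page-at-a-time copy_page() loop that
 * zlib_compress_folios() used to open-code, which assumed PAGE_SIZE
 * folios.)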
*/ + ASSERT(zlib_deflate_dfltcc_enabled()); + + while (cur < filepos + length) { + struct folio *folio; + void *data_in; + unsigned int offset; + unsigned long copy_length; + int ret; + + ret = btrfs_compress_filemap_get_folio(mapping, cur, &folio); + if (ret < 0) + return ret; + + offset = offset_in_folio(folio, cur); + copy_length = min(folio_size(folio) - offset, + filepos + length - cur); + + data_in = kmap_local_folio(folio, offset); + memcpy(workspace->buf + cur - filepos, data_in, copy_length); + kunmap_local(data_in); + cur += copy_length; + } + return 0; +} + int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, u64 start, struct folio **folios, unsigned long *out_folios, unsigned long *total_in, unsigned long *total_out) @@ -105,8 +144,6 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, int nr_folios = 0; struct folio *in_folio = NULL; struct folio *out_folio = NULL; - unsigned long bytes_left; - unsigned int in_buf_folios; unsigned long len = *total_out; unsigned long nr_dest_folios = *out_folios; const unsigned long max_out = nr_dest_folios * PAGE_SIZE; @@ -150,36 +187,22 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, * the workspace buffer if required. */ if (workspace->strm.avail_in == 0) { - bytes_left = len - workspace->strm.total_in; - in_buf_folios = min(DIV_ROUND_UP(bytes_left, PAGE_SIZE), - workspace->buf_size / PAGE_SIZE); - if (in_buf_folios > 1) { - int i; - - /* S390 hardware acceleration path, not subpage. */ - ASSERT(!btrfs_is_subpage( - inode_to_fs_info(mapping->host), - mapping)); - for (i = 0; i < in_buf_folios; i++) { - if (data_in) { - kunmap_local(data_in); - folio_put(in_folio); - data_in = NULL; - } - ret = btrfs_compress_filemap_get_folio(mapping, - start, &in_folio); - if (ret < 0) - goto out; - data_in = kmap_local_folio(in_folio, 0); - copy_page(workspace->buf + i * PAGE_SIZE, - data_in); - start += PAGE_SIZE; - } + unsigned long bytes_left = len - workspace->strm.total_in; + unsigned int copy_length = min(bytes_left, workspace->buf_size); + + /* + * This can only happen when hardware zlib compression is + * enabled. + */ + if (copy_length > PAGE_SIZE) { + ret = copy_data_into_buffer(mapping, workspace, + start, copy_length); + if (ret < 0) + goto out; + start += copy_length; workspace->strm.next_in = workspace->buf; - workspace->strm.avail_in = min(bytes_left, - in_buf_folios << PAGE_SHIFT); + workspace->strm.avail_in = copy_length; } else { - unsigned int pg_off; unsigned int cur_len; if (data_in) { @@ -191,9 +214,9 @@ int zlib_compress_folios(struct list_head *ws, struct address_space *mapping, start, &in_folio); if (ret < 0) goto out; - pg_off = offset_in_page(start); - cur_len = btrfs_calc_input_length(orig_end, start); - data_in = kmap_local_folio(in_folio, pg_off); + cur_len = btrfs_calc_input_length(in_folio, orig_end, start); + data_in = kmap_local_folio(in_folio, + offset_in_folio(in_folio, start)); start += cur_len; workspace->strm.next_in = data_in; workspace->strm.avail_in = cur_len; @@ -463,6 +486,7 @@ out: const struct btrfs_compress_op btrfs_zlib_compress = { .workspace_manager = &wsm, + .min_level = 1, .max_level = 9, .default_level = BTRFS_ZLIB_DEFAULT_LEVEL, }; diff --git a/fs/btrfs/zoned.c b/fs/btrfs/zoned.c index 73e0aa9fc08a..b5b0156d5b95 100644 --- a/fs/btrfs/zoned.c +++ b/fs/btrfs/zoned.c @@ -989,7 +989,7 @@ int btrfs_advance_sb_log(struct btrfs_device *device, int mirror) } /* All the zones are FULL. Should not reach here. 
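 * A note on the ASSERT(0) -> DEBUG_WARN() conversions here and in
 * volumes.c above: ASSERT(0) takes down assert-enabled builds, while
 * DEBUG_WARN() only emits a warning and lets the error path return
 * (-EIO below, -EINVAL in the volumes.c callers). A minimal sketch of
 * the assumed helper shape -- an assumption for illustration, not
 * quoted from this diff:
 *
 *    #ifdef CONFIG_BTRFS_DEBUG
 *    #define DEBUG_WARN(args...)    WARN(1, args)
 *    #else
 *    #define DEBUG_WARN(args...)    do {} while (0)
 *    #endif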
*/ - ASSERT(0); + DEBUG_WARN("unexpected state, all zones full"); return -EIO; } @@ -1277,7 +1277,7 @@ struct zone_info { static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, struct zone_info *info, unsigned long *active, - struct btrfs_chunk_map *map) + struct btrfs_chunk_map *map, bool new) { struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace; struct btrfs_device *device; @@ -1307,6 +1307,8 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, return 0; } + ASSERT(!new || btrfs_dev_is_empty_zone(device, info->physical)); + /* This zone will be used for allocation, so mark this zone non-empty. */ btrfs_dev_clear_zone_empty(device, info->physical); @@ -1319,6 +1321,18 @@ static int btrfs_load_zone_info(struct btrfs_fs_info *fs_info, int zone_idx, * to determine the allocation offset within the zone. */ WARN_ON(!IS_ALIGNED(info->physical, fs_info->zone_size)); + + if (new) { + sector_t capacity; + + capacity = bdev_zone_capacity(device->bdev, info->physical >> SECTOR_SHIFT); + up_read(&dev_replace->rwsem); + info->alloc_offset = 0; + info->capacity = capacity << SECTOR_SHIFT; + + return 0; + } + nofs_flag = memalloc_nofs_save(); ret = btrfs_get_dev_zone(device, info->physical, &zone); memalloc_nofs_restore(nofs_flag); @@ -1588,7 +1602,7 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) } for (i = 0; i < map->num_stripes; i++) { - ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map); + ret = btrfs_load_zone_info(fs_info, i, &zone_info[i], active, map, new); if (ret) goto out; @@ -1659,7 +1673,6 @@ int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new) * stripe. */ cache->alloc_offset = cache->zone_capacity; - ret = 0; } out: @@ -1784,12 +1797,12 @@ static void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered, ordered->disk_bytenr = logical; write_lock(&em_tree->lock); - em = search_extent_mapping(em_tree, ordered->file_offset, - ordered->num_bytes); + em = btrfs_search_extent_mapping(em_tree, ordered->file_offset, + ordered->num_bytes); /* The em should be a new COW extent, thus it should not have an offset. 
*/ ASSERT(em->offset == 0); em->disk_bytenr = logical; - free_extent_map(em); + btrfs_free_extent_map(em); write_unlock(&em_tree->lock); } @@ -1799,8 +1812,8 @@ static bool btrfs_zoned_split_ordered(struct btrfs_ordered_extent *ordered, struct btrfs_ordered_extent *new; if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered->flags) && - split_extent_map(ordered->inode, ordered->file_offset, - ordered->num_bytes, len, logical)) + btrfs_split_extent_map(ordered->inode, ordered->file_offset, + ordered->num_bytes, len, logical)) return false; new = btrfs_split_ordered_extent(ordered, len); @@ -2111,6 +2124,9 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group) physical = map->stripes[i].physical; zinfo = device->zone_info; + if (!device->bdev) + continue; + if (zinfo->max_active_zones == 0) continue; @@ -2155,27 +2171,15 @@ static void wait_eb_writebacks(struct btrfs_block_group *block_group) { struct btrfs_fs_info *fs_info = block_group->fs_info; const u64 end = block_group->start + block_group->length; - struct radix_tree_iter iter; struct extent_buffer *eb; - void __rcu **slot; + unsigned long index, start = (block_group->start >> fs_info->sectorsize_bits); rcu_read_lock(); - radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, - block_group->start >> fs_info->sectorsize_bits) { - eb = radix_tree_deref_slot(slot); - if (!eb) - continue; - if (radix_tree_deref_retry(eb)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - + xa_for_each_start(&fs_info->buffer_tree, index, eb, start) { if (eb->start < block_group->start) continue; if (eb->start >= end) break; - - slot = radix_tree_iter_resume(slot, &iter); rcu_read_unlock(); wait_on_extent_buffer_writeback(eb); rcu_read_lock(); @@ -2272,6 +2276,9 @@ static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_writ struct btrfs_zoned_device_info *zinfo = device->zone_info; unsigned int nofs_flags; + if (!device->bdev) + continue; + if (zinfo->max_active_zones == 0) continue; @@ -2325,6 +2332,9 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags) if (!btrfs_is_zoned(fs_info)) return true; + if (test_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags)) + return false; + /* Check if there is a device with active zones left */ mutex_lock(&fs_info->chunk_mutex); spin_lock(&fs_info->zone_active_bgs_lock); diff --git a/fs/btrfs/zstd.c b/fs/btrfs/zstd.c index 5232b56d5892..4a796a049b5a 100644 --- a/fs/btrfs/zstd.c +++ b/fs/btrfs/zstd.c @@ -24,13 +24,14 @@ #include "super.h" #define ZSTD_BTRFS_MAX_WINDOWLOG 17 -#define ZSTD_BTRFS_MAX_INPUT (1 << ZSTD_BTRFS_MAX_WINDOWLOG) +#define ZSTD_BTRFS_MAX_INPUT (1U << ZSTD_BTRFS_MAX_WINDOWLOG) #define ZSTD_BTRFS_DEFAULT_LEVEL 3 +#define ZSTD_BTRFS_MIN_LEVEL -15 #define ZSTD_BTRFS_MAX_LEVEL 15 /* 307s to avoid pathologically clashing with transaction commit */ #define ZSTD_BTRFS_RECLAIM_JIFFIES (307 * HZ) -static zstd_parameters zstd_get_btrfs_parameters(unsigned int level, +static zstd_parameters zstd_get_btrfs_parameters(int level, size_t src_len) { zstd_parameters params = zstd_get_params(level, src_len); @@ -45,13 +46,14 @@ struct workspace { void *mem; size_t size; char *buf; - unsigned int level; - unsigned int req_level; + int level; + int req_level; unsigned long last_used; /* jiffies */ struct list_head list; struct list_head lru_list; zstd_in_buffer in_buf; zstd_out_buffer out_buf; + zstd_parameters params; }; /* @@ -93,8 +95,10 @@ static inline struct workspace *list_to_workspace(struct list_head *list) return container_of(list, struct workspace, 
list); } -void zstd_free_workspace(struct list_head *ws); -struct list_head *zstd_alloc_workspace(unsigned int level); +static inline int clip_level(int level) +{ + return max(0, level - 1); +} /* * Timer callback to free unused workspaces. @@ -123,7 +127,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) list_for_each_prev_safe(pos, next, &wsm.lru_list) { struct workspace *victim = container_of(pos, struct workspace, lru_list); - unsigned int level; + int level; if (time_after(victim->last_used, reclaim_threshold)) break; @@ -137,8 +141,8 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) list_del(&victim->list); zstd_free_workspace(&victim->list); - if (list_empty(&wsm.idle_ws[level - 1])) - clear_bit(level - 1, &wsm.active_map); + if (list_empty(&wsm.idle_ws[level])) + clear_bit(level, &wsm.active_map); } @@ -160,9 +164,11 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer) static void zstd_calc_ws_mem_sizes(void) { size_t max_size = 0; - unsigned int level; + int level; - for (level = 1; level <= ZSTD_BTRFS_MAX_LEVEL; level++) { + for (level = ZSTD_BTRFS_MIN_LEVEL; level <= ZSTD_BTRFS_MAX_LEVEL; level++) { + if (level == 0) + continue; zstd_parameters params = zstd_get_btrfs_parameters(level, ZSTD_BTRFS_MAX_INPUT); size_t level_size = @@ -171,7 +177,8 @@ static void zstd_calc_ws_mem_sizes(void) zstd_dstream_workspace_bound(ZSTD_BTRFS_MAX_INPUT)); max_size = max_t(size_t, max_size, level_size); - zstd_ws_mem_sizes[level - 1] = max_size; + /* Use level 1 workspace size for all the fast mode negative levels. */ + zstd_ws_mem_sizes[clip_level(level)] = max_size; } } @@ -218,7 +225,7 @@ void zstd_cleanup_workspace_manager(void) } spin_unlock_bh(&wsm.lock); - del_timer_sync(&wsm.timer); + timer_delete_sync(&wsm.timer); } /* @@ -233,11 +240,11 @@ void zstd_cleanup_workspace_manager(void) * offer the opportunity to reclaim the workspace in favor of allocating an * appropriately sized one in the future. */ -static struct list_head *zstd_find_workspace(unsigned int level) +static struct list_head *zstd_find_workspace(int level) { struct list_head *ws; struct workspace *workspace; - int i = level - 1; + int i = clip_level(level); spin_lock_bh(&wsm.lock); for_each_set_bit_from(i, &wsm.active_map, ZSTD_BTRFS_MAX_LEVEL) { @@ -247,7 +254,7 @@ static struct list_head *zstd_find_workspace(unsigned int level) list_del_init(ws); /* keep its place if it's a lower level using this */ workspace->req_level = level; - if (level == workspace->level) + if (clip_level(level) == workspace->level) list_del(&workspace->lru_list); if (list_empty(&wsm.idle_ws[i])) clear_bit(i, &wsm.active_map); @@ -270,7 +277,7 @@ static struct list_head *zstd_find_workspace(unsigned int level) * attempt to allocate a new workspace. If we fail to allocate one due to * memory pressure, go to sleep waiting for the max level workspace to free up. 
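 * With ZSTD_BTRFS_MIN_LEVEL == -15 the compression level became a
 * signed int throughout this file, and clip_level() above maps a
 * requested level onto its workspace slot:
 *
 *    clip_level(level) = max(0, level - 1)
 *
 * so level 1 and all the negative "fast mode" levels share slot 0 and
 * hence the level-1 workspace size, while workspace->level stores the
 * clipped slot and workspace->req_level the level actually requested.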
*/ -struct list_head *zstd_get_workspace(unsigned int level) +struct list_head *zstd_get_workspace(int level) { struct list_head *ws; unsigned int nofs_flag; @@ -319,7 +326,7 @@ void zstd_put_workspace(struct list_head *ws) spin_lock_bh(&wsm.lock); /* A node is only taken off the lru if we are the corresponding level */ - if (workspace->req_level == workspace->level) { + if (clip_level(workspace->req_level) == workspace->level) { /* Hide a max level workspace from reclaim */ if (list_empty(&wsm.idle_ws[ZSTD_BTRFS_MAX_LEVEL - 1])) { INIT_LIST_HEAD(&workspace->lru_list); @@ -332,13 +339,13 @@ void zstd_put_workspace(struct list_head *ws) } } - set_bit(workspace->level - 1, &wsm.active_map); - list_add(&workspace->list, &wsm.idle_ws[workspace->level - 1]); + set_bit(workspace->level, &wsm.active_map); + list_add(&workspace->list, &wsm.idle_ws[workspace->level]); workspace->req_level = 0; spin_unlock_bh(&wsm.lock); - if (workspace->level == ZSTD_BTRFS_MAX_LEVEL) + if (workspace->level == clip_level(ZSTD_BTRFS_MAX_LEVEL)) cond_wake_up(&wsm.wait); } @@ -351,7 +358,7 @@ void zstd_free_workspace(struct list_head *ws) kfree(workspace); } -struct list_head *zstd_alloc_workspace(unsigned int level) +struct list_head *zstd_alloc_workspace(int level) { struct workspace *workspace; @@ -359,8 +366,9 @@ struct list_head *zstd_alloc_workspace(unsigned int level) if (!workspace) return ERR_PTR(-ENOMEM); - workspace->size = zstd_ws_mem_sizes[level - 1]; - workspace->level = level; + /* Use level 1 workspace size for all the fast mode negative levels. */ + workspace->size = zstd_ws_mem_sizes[clip_level(level)]; + workspace->level = clip_level(level); workspace->req_level = level; workspace->last_used = jiffies; workspace->mem = kvmalloc(workspace->size, GFP_KERNEL | __GFP_NOWARN); @@ -393,17 +401,15 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, const unsigned long nr_dest_folios = *out_folios; const u64 orig_end = start + len; unsigned long max_out = nr_dest_folios * PAGE_SIZE; - unsigned int pg_off; unsigned int cur_len; - zstd_parameters params = zstd_get_btrfs_parameters(workspace->req_level, - len); + workspace->params = zstd_get_btrfs_parameters(workspace->req_level, len); *out_folios = 0; *total_out = 0; *total_in = 0; /* Initialize the stream */ - stream = zstd_init_cstream(¶ms, len, workspace->mem, + stream = zstd_init_cstream(&workspace->params, len, workspace->mem, workspace->size); if (unlikely(!stream)) { struct btrfs_inode *inode = BTRFS_I(mapping->host); @@ -420,9 +426,8 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; - pg_off = offset_in_page(start); - cur_len = btrfs_calc_input_length(orig_end, start); - workspace->in_buf.src = kmap_local_folio(in_folio, pg_off); + cur_len = btrfs_calc_input_length(in_folio, orig_end, start); + workspace->in_buf.src = kmap_local_folio(in_folio, offset_in_folio(in_folio, start)); workspace->in_buf.pos = 0; workspace->in_buf.size = cur_len; @@ -506,9 +511,9 @@ int zstd_compress_folios(struct list_head *ws, struct address_space *mapping, ret = btrfs_compress_filemap_get_folio(mapping, start, &in_folio); if (ret < 0) goto out; - pg_off = offset_in_page(start); - cur_len = btrfs_calc_input_length(orig_end, start); - workspace->in_buf.src = kmap_local_folio(in_folio, pg_off); + cur_len = btrfs_calc_input_length(in_folio, orig_end, start); + workspace->in_buf.src = kmap_local_folio(in_folio, + 
offset_in_folio(in_folio, start)); workspace->in_buf.pos = 0; workspace->in_buf.size = cur_len; } @@ -717,6 +722,7 @@ finish: const struct btrfs_compress_op btrfs_zstd_compress = { /* ZSTD uses own workspace manager */ .workspace_manager = NULL, + .min_level = ZSTD_BTRFS_MIN_LEVEL, .max_level = ZSTD_BTRFS_MAX_LEVEL, .default_level = ZSTD_BTRFS_DEFAULT_LEVEL, }; diff --git a/fs/buffer.c b/fs/buffer.c index cc8452f60251..8cf4a1dc481e 100644 --- a/fs/buffer.c +++ b/fs/buffer.c @@ -176,18 +176,8 @@ void end_buffer_write_sync(struct buffer_head *bh, int uptodate) } EXPORT_SYMBOL(end_buffer_write_sync); -/* - * Various filesystems appear to want __find_get_block to be non-blocking. - * But it's the page lock which protects the buffers. To get around this, - * we get exclusion from try_to_free_buffers with the blockdev mapping's - * i_private_lock. - * - * Hack idea: for the blockdev mapping, i_private_lock contention - * may be quite high. This code could TryLock the page, and if that - * succeeds, there is no need to take i_private_lock. - */ static struct buffer_head * -__find_get_block_slow(struct block_device *bdev, sector_t block) +__find_get_block_slow(struct block_device *bdev, sector_t block, bool atomic) { struct address_space *bd_mapping = bdev->bd_mapping; const int blkbits = bd_mapping->host->i_blkbits; @@ -204,10 +194,28 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) if (IS_ERR(folio)) goto out; - spin_lock(&bd_mapping->i_private_lock); + /* + * Folio lock protects the buffers. Callers that cannot block + * will fallback to serializing vs try_to_free_buffers() via + * the i_private_lock. + */ + if (atomic) + spin_lock(&bd_mapping->i_private_lock); + else + folio_lock(folio); + head = folio_buffers(folio); if (!head) goto out_unlock; + /* + * Upon a noref migration, the folio lock serializes here; + * otherwise bail. 
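 * (In other words: atomic callers still rely on i_private_lock for
 * exclusion against try_to_free_buffers() and must give up when
 * BH_Migrate signals a noref migration in flight, while sleepable
 * callers hold the folio lock, which the migration path also takes,
 * so they wait the migration out and should never see the bit set --
 * hence the WARN_ON(!atomic) below.)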
+ */ + if (test_bit_acquire(BH_Migrate, &head->b_state)) { + WARN_ON(!atomic); + goto out_unlock; + } + bh = head; do { if (!buffer_mapped(bh)) @@ -236,7 +244,10 @@ __find_get_block_slow(struct block_device *bdev, sector_t block) 1 << blkbits); } out_unlock: - spin_unlock(&bd_mapping->i_private_lock); + if (atomic) + spin_unlock(&bd_mapping->i_private_lock); + else + folio_unlock(folio); folio_put(folio); out: return ret; @@ -286,7 +297,6 @@ static void end_buffer_async_read(struct buffer_head *bh, int uptodate) still_busy: spin_unlock_irqrestore(&first->b_uptodate_lock, flags); - return; } struct postprocess_bh_ctx { @@ -411,7 +421,6 @@ static void end_buffer_async_write(struct buffer_head *bh, int uptodate) still_busy: spin_unlock_irqrestore(&first->b_uptodate_lock, flags); - return; } /* @@ -656,7 +665,9 @@ EXPORT_SYMBOL(generic_buffers_fsync); void write_boundary_block(struct block_device *bdev, sector_t bblock, unsigned blocksize) { - struct buffer_head *bh = __find_get_block(bdev, bblock + 1, blocksize); + struct buffer_head *bh; + + bh = __find_get_block_nonatomic(bdev, bblock + 1, blocksize); if (bh) { if (buffer_dirty(bh)) write_dirty_buffer(bh, 0); @@ -1109,6 +1120,8 @@ static struct buffer_head * __getblk_slow(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp) { + bool blocking = gfpflags_allow_blocking(gfp); + /* Size must be multiple of hard sectorsize */ if (unlikely(size & (bdev_logical_block_size(bdev)-1) || (size < 512 || size > PAGE_SIZE))) { @@ -1124,12 +1137,15 @@ __getblk_slow(struct block_device *bdev, sector_t block, for (;;) { struct buffer_head *bh; - bh = __find_get_block(bdev, block, size); - if (bh) - return bh; - if (!grow_buffers(bdev, block, size, gfp)) return NULL; + + if (blocking) + bh = __find_get_block_nonatomic(bdev, block, size); + else + bh = __find_get_block(bdev, block, size); + if (bh) + return bh; } } @@ -1207,10 +1223,8 @@ void mark_buffer_write_io_error(struct buffer_head *bh) /* FIXME: do we need to set this in both places? */ if (bh->b_folio && bh->b_folio->mapping) mapping_set_error(bh->b_folio->mapping, -EIO); - if (bh->b_assoc_map) { + if (bh->b_assoc_map) mapping_set_error(bh->b_assoc_map, -EIO); - errseq_set(&bh->b_assoc_map->host->i_sb->s_wb_err, -EIO); - } } EXPORT_SYMBOL(mark_buffer_write_io_error); @@ -1386,16 +1400,18 @@ lookup_bh_lru(struct block_device *bdev, sector_t block, unsigned size) /* * Perform a pagecache lookup for the matching buffer. If it's there, refresh * it in the LRU and mark it as accessed. If it is not present then return - * NULL + * NULL. Atomic context callers may also return NULL if the buffer is being + * migrated; similarly the page is not marked accessed either. 
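 * The common helper below is wrapped twice: __find_get_block() keeps
 * the historical non-blocking behaviour for atomic contexts, and
 * __find_get_block_nonatomic() may sleep on the folio lock. Callers
 * with a gfp mask at hand pick between them the way bdev_getblk()
 * does further down:
 *
 *    if (gfpflags_allow_blocking(gfp))
 *        bh = __find_get_block_nonatomic(bdev, block, size);
 *    else
 *        bh = __find_get_block(bdev, block, size);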
*/ -struct buffer_head * -__find_get_block(struct block_device *bdev, sector_t block, unsigned size) +static struct buffer_head * +find_get_block_common(struct block_device *bdev, sector_t block, + unsigned size, bool atomic) { struct buffer_head *bh = lookup_bh_lru(bdev, block, size); if (bh == NULL) { /* __find_get_block_slow will mark the page accessed */ - bh = __find_get_block_slow(bdev, block); + bh = __find_get_block_slow(bdev, block, atomic); if (bh) bh_lru_install(bh); } else @@ -1403,8 +1419,23 @@ __find_get_block(struct block_device *bdev, sector_t block, unsigned size) return bh; } + +struct buffer_head * +__find_get_block(struct block_device *bdev, sector_t block, unsigned size) +{ + return find_get_block_common(bdev, block, size, true); +} EXPORT_SYMBOL(__find_get_block); +/* same as __find_get_block() but allows sleeping contexts */ +struct buffer_head * +__find_get_block_nonatomic(struct block_device *bdev, sector_t block, + unsigned size) +{ + return find_get_block_common(bdev, block, size, false); +} +EXPORT_SYMBOL(__find_get_block_nonatomic); + /** * bdev_getblk - Get a buffer_head in a block device's buffer cache. * @bdev: The block device. @@ -1422,7 +1453,12 @@ EXPORT_SYMBOL(__find_get_block); struct buffer_head *bdev_getblk(struct block_device *bdev, sector_t block, unsigned size, gfp_t gfp) { - struct buffer_head *bh = __find_get_block(bdev, block, size); + struct buffer_head *bh; + + if (gfpflags_allow_blocking(gfp)) + bh = __find_get_block_nonatomic(bdev, block, size); + else + bh = __find_get_block(bdev, block, size); might_alloc(gfp); if (bh) @@ -1578,8 +1614,8 @@ static void discard_buffer(struct buffer_head * bh) bh->b_bdev = NULL; b_state = READ_ONCE(bh->b_state); do { - } while (!try_cmpxchg(&bh->b_state, &b_state, - b_state & ~BUFFER_FLAGS_DISCARD)); + } while (!try_cmpxchg_relaxed(&bh->b_state, &b_state, + b_state & ~BUFFER_FLAGS_DISCARD)); unlock_buffer(bh); } @@ -1644,7 +1680,6 @@ void block_invalidate_folio(struct folio *folio, size_t offset, size_t length) filemap_release_folio(folio, 0); out: folio_clear_mappedtodisk(folio); - return; } EXPORT_SYMBOL(block_invalidate_folio); @@ -2166,7 +2201,7 @@ int __block_write_begin(struct folio *folio, loff_t pos, unsigned len, } EXPORT_SYMBOL(__block_write_begin); -static void __block_commit_write(struct folio *folio, size_t from, size_t to) +void block_commit_write(struct folio *folio, size_t from, size_t to) { size_t block_start, block_end; bool partial = false; @@ -2204,6 +2239,7 @@ static void __block_commit_write(struct folio *folio, size_t from, size_t to) if (!partial) folio_mark_uptodate(folio); } +EXPORT_SYMBOL(block_commit_write); /* * block_write_begin takes care of the basic task of block allocation and @@ -2262,7 +2298,7 @@ int block_write_end(struct file *file, struct address_space *mapping, flush_dcache_folio(folio); /* This could be a short (even 0-length) commit */ - __block_commit_write(folio, start, start + copied); + block_commit_write(folio, start, start + copied); return copied; } @@ -2361,9 +2397,8 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) { struct inode *inode = folio->mapping->host; sector_t iblock, lblock; - struct buffer_head *bh, *head, *arr[MAX_BUF_PER_PAGE]; + struct buffer_head *bh, *head, *prev = NULL; size_t blocksize; - int nr, i; int fully_mapped = 1; bool page_error = false; loff_t limit = i_size_read(inode); @@ -2372,16 +2407,12 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) if (IS_ENABLED(CONFIG_FS_VERITY) && 
IS_VERITY(inode)) limit = inode->i_sb->s_maxbytes; - VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - head = folio_create_buffers(folio, inode, 0); blocksize = head->b_size; iblock = div_u64(folio_pos(folio), blocksize); lblock = div_u64(limit + blocksize - 1, blocksize); bh = head; - nr = 0; - i = 0; do { if (buffer_uptodate(bh)) @@ -2398,7 +2429,7 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) page_error = true; } if (!buffer_mapped(bh)) { - folio_zero_range(folio, i * blocksize, + folio_zero_range(folio, bh_offset(bh), blocksize); if (!err) set_buffer_uptodate(bh); @@ -2411,40 +2442,33 @@ int block_read_full_folio(struct folio *folio, get_block_t *get_block) if (buffer_uptodate(bh)) continue; } - arr[nr++] = bh; - } while (i++, iblock++, (bh = bh->b_this_page) != head); - - if (fully_mapped) - folio_set_mappedtodisk(folio); - - if (!nr) { - /* - * All buffers are uptodate or get_block() returned an - * error when trying to map them - we can finish the read. - */ - folio_end_read(folio, !page_error); - return 0; - } - /* Stage two: lock the buffers */ - for (i = 0; i < nr; i++) { - bh = arr[i]; lock_buffer(bh); + if (buffer_uptodate(bh)) { + unlock_buffer(bh); + continue; + } + mark_buffer_async_read(bh); - } + if (prev) + submit_bh(REQ_OP_READ, prev); + prev = bh; + } while (iblock++, (bh = bh->b_this_page) != head); + + if (fully_mapped) + folio_set_mappedtodisk(folio); /* - * Stage 3: start the IO. Check for uptodateness - * inside the buffer lock in case another process reading - * the underlying blockdev brought it uptodate (the sct fix). + * All buffers are uptodate or get_block() returned an error + * when trying to map them - we must finish the read because + * end_buffer_async_read() will never be called on any buffer + * in this folio. */ - for (i = 0; i < nr; i++) { - bh = arr[i]; - if (buffer_uptodate(bh)) - end_buffer_async_read(bh, 1); - else - submit_bh(REQ_OP_READ, bh); - } + if (prev) + submit_bh(REQ_OP_READ, prev); + else + folio_end_read(folio, !page_error); + return 0; } EXPORT_SYMBOL(block_read_full_folio); @@ -2578,13 +2602,6 @@ int cont_write_begin(struct file *file, struct address_space *mapping, } EXPORT_SYMBOL(cont_write_begin); -void block_commit_write(struct page *page, unsigned from, unsigned to) -{ - struct folio *folio = page_folio(page); - __block_commit_write(folio, from, to); -} -EXPORT_SYMBOL(block_commit_write); - /* * block_page_mkwrite() is not allowed to change the file size as it gets * called from a page fault handler when a page is first dirtied. Hence we must @@ -2630,7 +2647,7 @@ int block_page_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf, if (unlikely(ret)) goto out_unlock; - __block_commit_write(folio, 0, end); + block_commit_write(folio, 0, end); folio_mark_dirty(folio); folio_wait_stable(folio); @@ -2713,7 +2730,7 @@ unlock: EXPORT_SYMBOL(block_truncate_page); /* - * The generic ->writepage function for buffer-backed address_spaces + * The generic write folio function for buffer-backed address_spaces */ int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, void *get_block) @@ -2733,7 +2750,7 @@ int block_write_full_folio(struct folio *folio, struct writeback_control *wbc, /* * The folio straddles i_size. It must be zeroed out on each and every - * writepage invocation because it may be mmapped. "A file is mapped + * writeback invocation because it may be mmapped. "A file is mapped * in multiples of the page size. 
For a file that is not a multiple of * the page size, the remaining memory is zeroed when mapped, and * writes to that region are not written out to the file." diff --git a/fs/cachefiles/internal.h b/fs/cachefiles/internal.h index 38c236e38cef..b62cd3e9a18e 100644 --- a/fs/cachefiles/internal.h +++ b/fs/cachefiles/internal.h @@ -71,7 +71,6 @@ struct cachefiles_object { int debug_id; spinlock_t lock; refcount_t ref; - u8 d_name_len; /* Length of filename */ enum cachefiles_content content_info:8; /* Info about content presence */ unsigned long flags; #define CACHEFILES_OBJECT_USING_TMPFILE 0 /* Have an unlinked tmpfile */ diff --git a/fs/cachefiles/key.c b/fs/cachefiles/key.c index bf935e25bdbe..aae86af48ed5 100644 --- a/fs/cachefiles/key.c +++ b/fs/cachefiles/key.c @@ -8,7 +8,7 @@ #include <linux/slab.h> #include "internal.h" -static const char cachefiles_charmap[64] = +static const char cachefiles_charmap[64] __nonstring = "0123456789" /* 0 - 9 */ "abcdefghijklmnopqrstuvwxyz" /* 10 - 35 */ "ABCDEFGHIJKLMNOPQRSTUVWXYZ" /* 36 - 61 */ @@ -132,7 +132,6 @@ bool cachefiles_cook_key(struct cachefiles_object *object) success: name[len] = 0; object->d_name = name; - object->d_name_len = len; _leave(" = %s", object->d_name); return true; } diff --git a/fs/cachefiles/namei.c b/fs/cachefiles/namei.c index 7cf59713f0f7..aecfc5c37b49 100644 --- a/fs/cachefiles/namei.c +++ b/fs/cachefiles/namei.c @@ -98,7 +98,7 @@ struct dentry *cachefiles_get_directory(struct cachefiles_cache *cache, retry: ret = cachefiles_inject_read_error(); if (ret == 0) - subdir = lookup_one_len(dirname, dir, strlen(dirname)); + subdir = lookup_one(&nop_mnt_idmap, &QSTR(dirname), dir); else subdir = ERR_PTR(ret); trace_cachefiles_lookup(NULL, dir, subdir); @@ -130,16 +130,18 @@ retry: goto mkdir_error; ret = cachefiles_inject_write_error(); if (ret == 0) - ret = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700); - if (ret < 0) { + subdir = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), subdir, 0700); + else + subdir = ERR_PTR(ret); + if (IS_ERR(subdir)) { trace_cachefiles_vfs_error(NULL, d_inode(dir), ret, cachefiles_trace_mkdir_error); goto mkdir_error; } trace_cachefiles_mkdir(dir, subdir); - if (unlikely(d_unhashed(subdir))) { - cachefiles_put_directory(subdir); + if (unlikely(d_unhashed(subdir) || d_is_negative(subdir))) { + dput(subdir); goto retry; } ASSERT(d_backing_inode(subdir)); @@ -195,7 +197,8 @@ mark_error: mkdir_error: inode_unlock(d_inode(dir)); - dput(subdir); + if (!IS_ERR(subdir)) + dput(subdir); pr_err("mkdir %s failed with error %d\n", dirname, ret); return ERR_PTR(ret); @@ -335,7 +338,7 @@ try_again: return -EIO; } - grave = lookup_one_len(nbuffer, cache->graveyard, strlen(nbuffer)); + grave = lookup_one(&nop_mnt_idmap, &QSTR(nbuffer), cache->graveyard); if (IS_ERR(grave)) { unlock_rename(cache->graveyard, dir); trace_cachefiles_vfs_error(object, d_inode(cache->graveyard), @@ -627,8 +630,8 @@ bool cachefiles_look_up_object(struct cachefiles_object *object) /* Look up path "cache/vol/fanout/file". 
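 * A note on the lookup conversions in this file: the (name, len) based
 * lookup_one_len() / lookup_positive_unlocked() calls are replaced by
 * the idmap-aware lookup_one() / lookup_one_positive_unlocked()
 * helpers taking &QSTR(object->d_name). The qstr carries the length
 * itself -- assuming the usual dcache.h definition, roughly
 *
 *    #define QSTR(n) (struct qstr)QSTR_INIT(n, strlen(n))
 *
 * -- which is why the cached d_name_len field could be dropped from
 * struct cachefiles_object above.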
*/ ret = cachefiles_inject_read_error(); if (ret == 0) - dentry = lookup_positive_unlocked(object->d_name, fan, - object->d_name_len); + dentry = lookup_one_positive_unlocked(&nop_mnt_idmap, + &QSTR(object->d_name), fan); else dentry = ERR_PTR(ret); trace_cachefiles_lookup(object, fan, dentry); @@ -680,7 +683,7 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, inode_lock_nested(d_inode(fan), I_MUTEX_PARENT); ret = cachefiles_inject_read_error(); if (ret == 0) - dentry = lookup_one_len(object->d_name, fan, object->d_name_len); + dentry = lookup_one(&nop_mnt_idmap, &QSTR(object->d_name), fan); else dentry = ERR_PTR(ret); if (IS_ERR(dentry)) { @@ -699,7 +702,7 @@ bool cachefiles_commit_tmpfile(struct cachefiles_cache *cache, dput(dentry); ret = cachefiles_inject_read_error(); if (ret == 0) - dentry = lookup_one_len(object->d_name, fan, object->d_name_len); + dentry = lookup_one(&nop_mnt_idmap, &QSTR(object->d_name), fan); else dentry = ERR_PTR(ret); if (IS_ERR(dentry)) { @@ -748,7 +751,7 @@ static struct dentry *cachefiles_lookup_for_cull(struct cachefiles_cache *cache, inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); - victim = lookup_one_len(filename, dir, strlen(filename)); + victim = lookup_one(&nop_mnt_idmap, &QSTR(filename), dir); if (IS_ERR(victim)) goto lookup_error; if (d_is_negative(victim)) diff --git a/fs/cachefiles/ondemand.c b/fs/cachefiles/ondemand.c index fe3de9ad57bf..d9bc67176128 100644 --- a/fs/cachefiles/ondemand.c +++ b/fs/cachefiles/ondemand.c @@ -317,8 +317,9 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req, goto err_free_id; } - anon_file->file = anon_inode_getfile("[cachefiles]", - &cachefiles_ondemand_fd_fops, object, O_WRONLY); + anon_file->file = anon_inode_getfile_fmode("[cachefiles]", + &cachefiles_ondemand_fd_fops, object, + O_WRONLY, FMODE_PWRITE | FMODE_LSEEK); if (IS_ERR(anon_file->file)) { ret = PTR_ERR(anon_file->file); goto err_put_fd; @@ -333,8 +334,6 @@ static int cachefiles_ondemand_get_fd(struct cachefiles_req *req, goto err_put_file; } - anon_file->file->f_mode |= FMODE_PWRITE | FMODE_LSEEK; - load = (void *)req->msg.data; load->fd = anon_file->fd; object->ondemand->ondemand_id = object_id; diff --git a/fs/ceph/Kconfig b/fs/ceph/Kconfig index 7249d70e1a43..3e7def3d31c1 100644 --- a/fs/ceph/Kconfig +++ b/fs/ceph/Kconfig @@ -3,7 +3,7 @@ config CEPH_FS tristate "Ceph distributed file system" depends on INET select CEPH_LIB - select LIBCRC32C + select CRC32 select CRYPTO_AES select CRYPTO select NETFS_SUPPORT diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index f5224a566b69..29be367905a1 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -82,6 +82,7 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) { struct inode *inode = mapping->host; struct ceph_client *cl = ceph_inode_to_client(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); struct ceph_inode_info *ci; struct ceph_snap_context *snapc; @@ -92,6 +93,8 @@ static bool ceph_dirty_folio(struct address_space *mapping, struct folio *folio) return false; } + atomic64_inc(&mdsc->dirty_folios); + ci = ceph_inode(inode); /* dirty the head */ @@ -568,7 +571,36 @@ struct ceph_writeback_ctl u64 truncate_size; u32 truncate_seq; bool size_stable; + bool head_snapc; + struct ceph_snap_context *snapc; + struct ceph_snap_context *last_snapc; + + bool done; + bool should_loop; + bool range_whole; + pgoff_t start_index; + pgoff_t index; + pgoff_t end; + xa_mark_t tag; + + pgoff_t strip_unit_end; + unsigned int wsize; + unsigned int 
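/*
 * Note: struct ceph_writeback_ctl grows from a few size/truncate
 * fields into the full writeback state -- iteration cursors
 * (start_index, index, end, tag), the folio batch, and the page-array
 * bookkeeping that ceph_writepages_start() previously kept in local
 * variables; ceph_init_writeback_ctl() below resets it all per pass.
 */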
nr_folios; + unsigned int max_pages; + unsigned int locked_pages; + + int op_idx; + int num_ops; + u64 offset; + u64 len; + + struct folio_batch fbatch; + unsigned int processed_in_fbatch; + + bool from_pool; + struct page **pages; + struct page **data_pages; }; /* @@ -666,22 +698,23 @@ static u64 get_writepages_data_length(struct inode *inode, } /* - * Write a single page, but leave the page locked. + * Write a folio, but leave it locked. * * If we get a write error, mark the mapping for error, but still adjust the - * dirty page accounting (i.e., page is no longer dirty). + * dirty page accounting (i.e., folio is no longer dirty). */ -static int writepage_nounlock(struct page *page, struct writeback_control *wbc) +static int write_folio_nounlock(struct folio *folio, + struct writeback_control *wbc) { - struct folio *folio = page_folio(page); - struct inode *inode = page->mapping->host; + struct page *page = &folio->page; + struct inode *inode = folio->mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_client *cl = fsc->client; struct ceph_snap_context *snapc, *oldest; - loff_t page_off = page_offset(page); + loff_t page_off = folio_pos(folio); int err; - loff_t len = thp_size(page); + loff_t len = folio_size(folio); loff_t wlen; struct ceph_writeback_ctl ceph_wbc; struct ceph_osd_client *osdc = &fsc->client->osdc; @@ -689,27 +722,27 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) bool caching = ceph_is_cache_enabled(inode); struct page *bounce_page = NULL; - doutc(cl, "%llx.%llx page %p idx %lu\n", ceph_vinop(inode), page, - page->index); + doutc(cl, "%llx.%llx folio %p idx %lu\n", ceph_vinop(inode), folio, + folio->index); if (ceph_inode_is_shutdown(inode)) return -EIO; /* verify this is a writeable snap context */ - snapc = page_snap_context(page); + snapc = page_snap_context(&folio->page); if (!snapc) { - doutc(cl, "%llx.%llx page %p not dirty?\n", ceph_vinop(inode), - page); + doutc(cl, "%llx.%llx folio %p not dirty?\n", ceph_vinop(inode), + folio); return 0; } oldest = get_oldest_context(inode, &ceph_wbc, snapc); if (snapc->seq > oldest->seq) { - doutc(cl, "%llx.%llx page %p snapc %p not writeable - noop\n", - ceph_vinop(inode), page, snapc); + doutc(cl, "%llx.%llx folio %p snapc %p not writeable - noop\n", + ceph_vinop(inode), folio, snapc); /* we should only noop if called by kswapd */ WARN_ON(!(current->flags & PF_MEMALLOC)); ceph_put_snap_context(oldest); - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); return 0; } ceph_put_snap_context(oldest); @@ -726,8 +759,8 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) len = ceph_wbc.i_size - page_off; wlen = IS_ENCRYPTED(inode) ? 
round_up(len, CEPH_FSCRYPT_BLOCK_SIZE) : len; - doutc(cl, "%llx.%llx page %p index %lu on %llu~%llu snapc %p seq %lld\n", - ceph_vinop(inode), page, page->index, page_off, wlen, snapc, + doutc(cl, "%llx.%llx folio %p index %lu on %llu~%llu snapc %p seq %lld\n", + ceph_vinop(inode), folio, folio->index, page_off, wlen, snapc, snapc->seq); if (atomic_long_inc_return(&fsc->writeback_count) > @@ -740,32 +773,32 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) ceph_wbc.truncate_seq, ceph_wbc.truncate_size, true); if (IS_ERR(req)) { - redirty_page_for_writepage(wbc, page); + folio_redirty_for_writepage(wbc, folio); return PTR_ERR(req); } if (wlen < len) len = wlen; - set_page_writeback(page); + folio_start_writeback(folio); if (caching) - ceph_set_page_fscache(page); + ceph_set_page_fscache(&folio->page); ceph_fscache_write_to_cache(inode, page_off, len, caching); if (IS_ENCRYPTED(inode)) { - bounce_page = fscrypt_encrypt_pagecache_blocks(page, + bounce_page = fscrypt_encrypt_pagecache_blocks(folio, CEPH_FSCRYPT_BLOCK_SIZE, 0, GFP_NOFS); if (IS_ERR(bounce_page)) { - redirty_page_for_writepage(wbc, page); - end_page_writeback(page); + folio_redirty_for_writepage(wbc, folio); + folio_end_writeback(folio); ceph_osdc_put_request(req); return PTR_ERR(bounce_page); } } /* it may be a short write due to an object boundary */ - WARN_ON_ONCE(len > thp_size(page)); + WARN_ON_ONCE(len > folio_size(folio)); osd_req_op_extent_osd_data_pages(req, 0, bounce_page ? &bounce_page : &page, wlen, 0, false, false); @@ -791,25 +824,25 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) if (err == -ERESTARTSYS) { /* killed by SIGKILL */ doutc(cl, "%llx.%llx interrupted page %p\n", - ceph_vinop(inode), page); - redirty_page_for_writepage(wbc, page); - end_page_writeback(page); + ceph_vinop(inode), folio); + folio_redirty_for_writepage(wbc, folio); + folio_end_writeback(folio); return err; } if (err == -EBLOCKLISTED) fsc->blocklisted = true; - doutc(cl, "%llx.%llx setting page/mapping error %d %p\n", - ceph_vinop(inode), err, page); + doutc(cl, "%llx.%llx setting mapping error %d %p\n", + ceph_vinop(inode), err, folio); mapping_set_error(&inode->i_data, err); wbc->pages_skipped++; } else { doutc(cl, "%llx.%llx cleaned page %p\n", - ceph_vinop(inode), page); + ceph_vinop(inode), folio); err = 0; /* vfs expects us to return 0 */ } - oldest = detach_page_private(page); + oldest = folio_detach_private(folio); WARN_ON_ONCE(oldest != snapc); - end_page_writeback(page); + folio_end_writeback(folio); ceph_put_wrbuffer_cap_refs(ci, 1, snapc); ceph_put_snap_context(snapc); /* page's reference */ @@ -820,32 +853,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) return err; } -static int ceph_writepage(struct page *page, struct writeback_control *wbc) -{ - int err; - struct inode *inode = page->mapping->host; - BUG_ON(!inode); - ihold(inode); - - if (wbc->sync_mode == WB_SYNC_NONE && - ceph_inode_to_fs_client(inode)->write_congested) { - redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; - } - - folio_wait_private_2(page_folio(page)); /* [DEPRECATED] */ - - err = writepage_nounlock(page, wbc); - if (err == -ERESTARTSYS) { - /* direct memory reclaimer was killed by SIGKILL. return 0 - * to prevent caller from setting mapping/page error */ - err = 0; - } - unlock_page(page); - iput(inode); - return err; -} - /* * async writeback completion handler. 
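 * (The ceph_writepage() wrapper removed above was the single-page
 * ->writepage entry point; with it gone, dirty ceph folios apparently
 * go out only via the writepages path, with write_folio_nounlock() as
 * the folio-based replacement for writepage_nounlock().)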
* @@ -865,6 +872,7 @@ static void writepages_finish(struct ceph_osd_request *req) struct ceph_snap_context *snapc = req->r_snapc; struct address_space *mapping = inode->i_mapping; struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(inode->i_sb); unsigned int len = 0; bool remove_page; @@ -920,6 +928,12 @@ static void writepages_finish(struct ceph_osd_request *req) ceph_put_snap_context(detach_page_private(page)); end_page_writeback(page); + + if (atomic64_dec_return(&mdsc->dirty_folios) <= 0) { + wake_up_all(&mdsc->flush_end_wq); + WARN_ON(atomic64_read(&mdsc->dirty_folios) < 0); + } + doutc(cl, "unlocking %p\n", page); if (remove_page) @@ -949,36 +963,13 @@ static void writepages_finish(struct ceph_osd_request *req) ceph_dec_osd_stopping_blocker(fsc->mdsc); } -/* - * initiate async writeback - */ -static int ceph_writepages_start(struct address_space *mapping, - struct writeback_control *wbc) +static inline +bool is_forced_umount(struct address_space *mapping) { struct inode *inode = mapping->host; struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); struct ceph_client *cl = fsc->client; - struct ceph_vino vino = ceph_vino(inode); - pgoff_t index, start_index, end = -1; - struct ceph_snap_context *snapc = NULL, *last_snapc = NULL, *pgsnapc; - struct folio_batch fbatch; - int rc = 0; - unsigned int wsize = i_blocksize(inode); - struct ceph_osd_request *req = NULL; - struct ceph_writeback_ctl ceph_wbc; - bool should_loop, range_whole = false; - bool done = false; - bool caching = ceph_is_cache_enabled(inode); - xa_mark_t tag; - - if (wbc->sync_mode == WB_SYNC_NONE && - fsc->write_congested) - return 0; - - doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode), - wbc->sync_mode == WB_SYNC_NONE ? "NONE" : - (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); if (ceph_inode_is_shutdown(inode)) { if (ci->i_wrbuffer_ref > 0) { @@ -987,388 +978,733 @@ static int ceph_writepages_start(struct address_space *mapping, ceph_vinop(inode), ceph_ino(inode)); } mapping_set_error(mapping, -EIO); - return -EIO; /* we're in a forced umount, don't write! */ + return true; } + + return false; +} + +static inline +unsigned int ceph_define_write_size(struct address_space *mapping) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + unsigned int wsize = i_blocksize(inode); + if (fsc->mount_options->wsize < wsize) wsize = fsc->mount_options->wsize; - folio_batch_init(&fbatch); + return wsize; +} + +static inline +void ceph_folio_batch_init(struct ceph_writeback_ctl *ceph_wbc) +{ + folio_batch_init(&ceph_wbc->fbatch); + ceph_wbc->processed_in_fbatch = 0; +} + +static inline +void ceph_folio_batch_reinit(struct ceph_writeback_ctl *ceph_wbc) +{ + folio_batch_release(&ceph_wbc->fbatch); + ceph_folio_batch_init(ceph_wbc); +} + +static inline +void ceph_init_writeback_ctl(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc) +{ + ceph_wbc->snapc = NULL; + ceph_wbc->last_snapc = NULL; + + ceph_wbc->strip_unit_end = 0; + ceph_wbc->wsize = ceph_define_write_size(mapping); - start_index = wbc->range_cyclic ? mapping->writeback_index : 0; - index = start_index; + ceph_wbc->nr_folios = 0; + ceph_wbc->max_pages = 0; + ceph_wbc->locked_pages = 0; + + ceph_wbc->done = false; + ceph_wbc->should_loop = false; + ceph_wbc->range_whole = false; + + ceph_wbc->start_index = wbc->range_cyclic ? 
mapping->writeback_index : 0; + ceph_wbc->index = ceph_wbc->start_index; + ceph_wbc->end = -1; if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) { - tag = PAGECACHE_TAG_TOWRITE; + ceph_wbc->tag = PAGECACHE_TAG_TOWRITE; } else { - tag = PAGECACHE_TAG_DIRTY; + ceph_wbc->tag = PAGECACHE_TAG_DIRTY; } -retry: + + ceph_wbc->op_idx = -1; + ceph_wbc->num_ops = 0; + ceph_wbc->offset = 0; + ceph_wbc->len = 0; + ceph_wbc->from_pool = false; + + ceph_folio_batch_init(ceph_wbc); + + ceph_wbc->pages = NULL; + ceph_wbc->data_pages = NULL; +} + +static inline +int ceph_define_writeback_range(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + /* find oldest snap context with dirty data */ - snapc = get_oldest_context(inode, &ceph_wbc, NULL); - if (!snapc) { + ceph_wbc->snapc = get_oldest_context(inode, ceph_wbc, NULL); + if (!ceph_wbc->snapc) { /* hmm, why does writepages get called when there is no dirty data? */ doutc(cl, " no snap context with dirty data?\n"); - goto out; + return -ENODATA; } - doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n", snapc, - snapc->seq, snapc->num_snaps); - should_loop = false; - if (ceph_wbc.head_snapc && snapc != last_snapc) { + doutc(cl, " oldest snapc is %p seq %lld (%d snaps)\n", + ceph_wbc->snapc, ceph_wbc->snapc->seq, + ceph_wbc->snapc->num_snaps); + + ceph_wbc->should_loop = false; + + if (ceph_wbc->head_snapc && ceph_wbc->snapc != ceph_wbc->last_snapc) { /* where to start/end? */ if (wbc->range_cyclic) { - index = start_index; - end = -1; - if (index > 0) - should_loop = true; - doutc(cl, " cyclic, start at %lu\n", index); + ceph_wbc->index = ceph_wbc->start_index; + ceph_wbc->end = -1; + if (ceph_wbc->index > 0) + ceph_wbc->should_loop = true; + doutc(cl, " cyclic, start at %lu\n", ceph_wbc->index); } else { - index = wbc->range_start >> PAGE_SHIFT; - end = wbc->range_end >> PAGE_SHIFT; + ceph_wbc->index = wbc->range_start >> PAGE_SHIFT; + ceph_wbc->end = wbc->range_end >> PAGE_SHIFT; if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX) - range_whole = true; - doutc(cl, " not cyclic, %lu to %lu\n", index, end); + ceph_wbc->range_whole = true; + doutc(cl, " not cyclic, %lu to %lu\n", + ceph_wbc->index, ceph_wbc->end); } - } else if (!ceph_wbc.head_snapc) { + } else if (!ceph_wbc->head_snapc) { /* Do not respect wbc->range_{start,end}. Dirty pages * in that range can be associated with newer snapc. 
* They are not writeable until we write all dirty pages * associated with 'snapc' get written */ - if (index > 0) - should_loop = true; + if (ceph_wbc->index > 0) + ceph_wbc->should_loop = true; doutc(cl, " non-head snapc, range whole\n"); } - if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) - tag_pages_for_writeback(mapping, index, end); + ceph_put_snap_context(ceph_wbc->last_snapc); + ceph_wbc->last_snapc = ceph_wbc->snapc; - ceph_put_snap_context(last_snapc); - last_snapc = snapc; + return 0; +} - while (!done && index <= end) { - int num_ops = 0, op_idx; - unsigned i, nr_folios, max_pages, locked_pages = 0; - struct page **pages = NULL, **data_pages; - struct page *page; - pgoff_t strip_unit_end = 0; - u64 offset = 0, len = 0; - bool from_pool = false; +static inline +bool has_writeback_done(struct ceph_writeback_ctl *ceph_wbc) +{ + return ceph_wbc->done && ceph_wbc->index > ceph_wbc->end; +} - max_pages = wsize >> PAGE_SHIFT; +static inline +bool can_next_page_be_processed(struct ceph_writeback_ctl *ceph_wbc, + unsigned index) +{ + return index < ceph_wbc->nr_folios && + ceph_wbc->locked_pages < ceph_wbc->max_pages; +} -get_more_pages: - nr_folios = filemap_get_folios_tag(mapping, &index, - end, tag, &fbatch); - doutc(cl, "pagevec_lookup_range_tag got %d\n", nr_folios); - if (!nr_folios && !locked_pages) - break; - for (i = 0; i < nr_folios && locked_pages < max_pages; i++) { - struct folio *folio = fbatch.folios[i]; +static +int ceph_check_page_before_write(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc, + struct folio *folio) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct ceph_snap_context *pgsnapc; - page = &folio->page; - doutc(cl, "? 
%p idx %lu\n", page, page->index); - if (locked_pages == 0) - lock_page(page); /* first page */ - else if (!trylock_page(page)) - break; + /* only dirty folios, or our accounting breaks */ + if (unlikely(!folio_test_dirty(folio) || folio->mapping != mapping)) { + doutc(cl, "!dirty or !mapping %p\n", folio); + return -ENODATA; + } - /* only dirty pages, or our accounting breaks */ - if (unlikely(!PageDirty(page)) || - unlikely(page->mapping != mapping)) { - doutc(cl, "!dirty or !mapping %p\n", page); - unlock_page(page); - continue; - } - /* only if matching snap context */ - pgsnapc = page_snap_context(page); - if (pgsnapc != snapc) { - doutc(cl, "page snapc %p %lld != oldest %p %lld\n", - pgsnapc, pgsnapc->seq, snapc, snapc->seq); - if (!should_loop && - !ceph_wbc.head_snapc && - wbc->sync_mode != WB_SYNC_NONE) - should_loop = true; - unlock_page(page); - continue; + /* only if matching snap context */ + pgsnapc = page_snap_context(&folio->page); + if (pgsnapc != ceph_wbc->snapc) { + doutc(cl, "folio snapc %p %lld != oldest %p %lld\n", + pgsnapc, pgsnapc->seq, + ceph_wbc->snapc, ceph_wbc->snapc->seq); + + if (!ceph_wbc->should_loop && !ceph_wbc->head_snapc && + wbc->sync_mode != WB_SYNC_NONE) + ceph_wbc->should_loop = true; + + return -ENODATA; + } + + if (folio_pos(folio) >= ceph_wbc->i_size) { + doutc(cl, "folio at %lu beyond eof %llu\n", + folio->index, ceph_wbc->i_size); + + if ((ceph_wbc->size_stable || + folio_pos(folio) >= i_size_read(inode)) && + folio_clear_dirty_for_io(folio)) + folio_invalidate(folio, 0, folio_size(folio)); + + return -ENODATA; + } + + if (ceph_wbc->strip_unit_end && + (folio->index > ceph_wbc->strip_unit_end)) { + doutc(cl, "end of strip unit %p\n", folio); + return -E2BIG; + } + + return 0; +} + +static inline +void __ceph_allocate_page_array(struct ceph_writeback_ctl *ceph_wbc, + unsigned int max_pages) +{ + ceph_wbc->pages = kmalloc_array(max_pages, + sizeof(*ceph_wbc->pages), + GFP_NOFS); + if (!ceph_wbc->pages) { + ceph_wbc->from_pool = true; + ceph_wbc->pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); + BUG_ON(!ceph_wbc->pages); + } +} + +static inline +void ceph_allocate_page_array(struct address_space *mapping, + struct ceph_writeback_ctl *ceph_wbc, + struct folio *folio) +{ + struct inode *inode = mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + u64 objnum; + u64 objoff; + u32 xlen; + + /* prepare async write request */ + ceph_wbc->offset = (u64)folio_pos(folio); + ceph_calc_file_object_mapping(&ci->i_layout, + ceph_wbc->offset, ceph_wbc->wsize, + &objnum, &objoff, &xlen); + + ceph_wbc->num_ops = 1; + ceph_wbc->strip_unit_end = folio->index + ((xlen - 1) >> PAGE_SHIFT); + + BUG_ON(ceph_wbc->pages); + ceph_wbc->max_pages = calc_pages_for(0, (u64)xlen); + __ceph_allocate_page_array(ceph_wbc, ceph_wbc->max_pages); + + ceph_wbc->len = 0; +} + +static inline +bool is_folio_index_contiguous(const struct ceph_writeback_ctl *ceph_wbc, + const struct folio *folio) +{ + return folio->index == (ceph_wbc->offset + ceph_wbc->len) >> PAGE_SHIFT; +} + +static inline +bool is_num_ops_too_big(struct ceph_writeback_ctl *ceph_wbc) +{ + return ceph_wbc->num_ops >= + (ceph_wbc->from_pool ? 
CEPH_OSD_SLAB_OPS : CEPH_OSD_MAX_OPS); +} + +static inline +bool is_write_congestion_happened(struct ceph_fs_client *fsc) +{ + return atomic_long_inc_return(&fsc->writeback_count) > + CONGESTION_ON_THRESH(fsc->mount_options->congestion_kb); +} + +static inline int move_dirty_folio_in_page_array(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc, struct folio *folio) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct page **pages = ceph_wbc->pages; + unsigned int index = ceph_wbc->locked_pages; + gfp_t gfp_flags = ceph_wbc->locked_pages ? GFP_NOWAIT : GFP_NOFS; + + if (IS_ENCRYPTED(inode)) { + pages[index] = fscrypt_encrypt_pagecache_blocks(folio, + PAGE_SIZE, + 0, + gfp_flags); + if (IS_ERR(pages[index])) { + if (PTR_ERR(pages[index]) == -EINVAL) { + pr_err_client(cl, "inode->i_blkbits=%hhu\n", + inode->i_blkbits); } - if (page_offset(page) >= ceph_wbc.i_size) { - doutc(cl, "folio at %lu beyond eof %llu\n", - folio->index, ceph_wbc.i_size); - if ((ceph_wbc.size_stable || - folio_pos(folio) >= i_size_read(inode)) && - folio_clear_dirty_for_io(folio)) - folio_invalidate(folio, 0, - folio_size(folio)); + + /* better not fail on first page! */ + BUG_ON(ceph_wbc->locked_pages == 0); + + pages[index] = NULL; + return PTR_ERR(pages[index]); + } + } else { + pages[index] = &folio->page; + } + + ceph_wbc->locked_pages++; + + return 0; +} + +static +int ceph_process_folio_batch(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct folio *folio = NULL; + unsigned i; + int rc = 0; + + for (i = 0; can_next_page_be_processed(ceph_wbc, i); i++) { + folio = ceph_wbc->fbatch.folios[i]; + + if (!folio) + continue; + + doutc(cl, "? %p idx %lu, folio_test_writeback %#x, " + "folio_test_dirty %#x, folio_test_locked %#x\n", + folio, folio->index, folio_test_writeback(folio), + folio_test_dirty(folio), + folio_test_locked(folio)); + + if (folio_test_writeback(folio) || + folio_test_private_2(folio) /* [DEPRECATED] */) { + doutc(cl, "waiting on writeback %p\n", folio); + folio_wait_writeback(folio); + folio_wait_private_2(folio); /* [DEPRECATED] */ + continue; + } + + if (ceph_wbc->locked_pages == 0) + folio_lock(folio); + else if (!folio_trylock(folio)) + break; + + rc = ceph_check_page_before_write(mapping, wbc, + ceph_wbc, folio); + if (rc == -ENODATA) { + rc = 0; + folio_unlock(folio); + ceph_wbc->fbatch.folios[i] = NULL; + continue; + } else if (rc == -E2BIG) { + rc = 0; + folio_unlock(folio); + ceph_wbc->fbatch.folios[i] = NULL; + break; + } + + if (!folio_clear_dirty_for_io(folio)) { + doutc(cl, "%p !folio_clear_dirty_for_io\n", folio); + folio_unlock(folio); + ceph_wbc->fbatch.folios[i] = NULL; + continue; + } + + /* + * We have something to write. 
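/*
 * [Editor's sketch, not from the patch: the allocation strategy behind
 * __ceph_allocate_page_array() above, shown in isolation. Writeback must
 * not fail under memory pressure, so a kmalloc_array() miss falls back to
 * a pre-sized mempool; the caller records "from_pool" so the array is
 * later released to the right place. alloc_page_array_example() is an
 * illustrative name only.]
 */
static struct page **alloc_page_array_example(mempool_t *pool,
                                              unsigned int max_pages,
                                              bool *from_pool)
{
        struct page **pages;

        pages = kmalloc_array(max_pages, sizeof(*pages), GFP_NOFS);
        if (pages) {
                *from_pool = false;
                return pages;
        }
        /* with GFP_NOFS, mempool_alloc() may sleep but cannot fail */
        *from_pool = true;
        return mempool_alloc(pool, GFP_NOFS);
}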
If this is + * the first locked page this time through, + * calculate max possible write size and + * allocate a page array + */ + if (ceph_wbc->locked_pages == 0) { + ceph_allocate_page_array(mapping, ceph_wbc, folio); + } else if (!is_folio_index_contiguous(ceph_wbc, folio)) { + if (is_num_ops_too_big(ceph_wbc)) { + folio_redirty_for_writepage(wbc, folio); folio_unlock(folio); - continue; - } - if (strip_unit_end && (page->index > strip_unit_end)) { - doutc(cl, "end of strip unit %p\n", page); - unlock_page(page); break; } - if (folio_test_writeback(folio) || - folio_test_private_2(folio) /* [DEPRECATED] */) { - if (wbc->sync_mode == WB_SYNC_NONE) { - doutc(cl, "%p under writeback\n", folio); - folio_unlock(folio); - continue; - } - doutc(cl, "waiting on writeback %p\n", folio); - folio_wait_writeback(folio); - folio_wait_private_2(folio); /* [DEPRECATED] */ - } - if (!clear_page_dirty_for_io(page)) { - doutc(cl, "%p !clear_page_dirty_for_io\n", page); - unlock_page(page); - continue; - } + ceph_wbc->num_ops++; + ceph_wbc->offset = (u64)folio_pos(folio); + ceph_wbc->len = 0; + } - /* - * We have something to write. If this is - * the first locked page this time through, - * calculate max possinle write size and - * allocate a page array - */ - if (locked_pages == 0) { - u64 objnum; - u64 objoff; - u32 xlen; - - /* prepare async write request */ - offset = (u64)page_offset(page); - ceph_calc_file_object_mapping(&ci->i_layout, - offset, wsize, - &objnum, &objoff, - &xlen); - len = xlen; - - num_ops = 1; - strip_unit_end = page->index + - ((len - 1) >> PAGE_SHIFT); - - BUG_ON(pages); - max_pages = calc_pages_for(0, (u64)len); - pages = kmalloc_array(max_pages, - sizeof(*pages), - GFP_NOFS); - if (!pages) { - from_pool = true; - pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); - BUG_ON(!pages); - } - - len = 0; - } else if (page->index != - (offset + len) >> PAGE_SHIFT) { - if (num_ops >= (from_pool ? CEPH_OSD_SLAB_OPS : - CEPH_OSD_MAX_OPS)) { - redirty_page_for_writepage(wbc, page); - unlock_page(page); - break; - } - - num_ops++; - offset = (u64)page_offset(page); - len = 0; - } + /* note position of first page in fbatch */ + doutc(cl, "%llx.%llx will write folio %p idx %lu\n", + ceph_vinop(inode), folio, folio->index); - /* note position of first page in fbatch */ - doutc(cl, "%llx.%llx will write page %p idx %lu\n", - ceph_vinop(inode), page, page->index); - - if (atomic_long_inc_return(&fsc->writeback_count) > - CONGESTION_ON_THRESH( - fsc->mount_options->congestion_kb)) - fsc->write_congested = true; - - if (IS_ENCRYPTED(inode)) { - pages[locked_pages] = - fscrypt_encrypt_pagecache_blocks(page, - PAGE_SIZE, 0, - locked_pages ? GFP_NOWAIT : GFP_NOFS); - if (IS_ERR(pages[locked_pages])) { - if (PTR_ERR(pages[locked_pages]) == -EINVAL) - pr_err_client(cl, - "inode->i_blkbits=%hhu\n", - inode->i_blkbits); - /* better not fail on first page! */ - BUG_ON(locked_pages == 0); - pages[locked_pages] = NULL; - redirty_page_for_writepage(wbc, page); - unlock_page(page); - break; - } - ++locked_pages; - } else { - pages[locked_pages++] = page; - } + fsc->write_congested = is_write_congestion_happened(fsc); - fbatch.folios[i] = NULL; - len += thp_size(page); + rc = move_dirty_folio_in_page_array(mapping, wbc, ceph_wbc, + folio); + if (rc) { + folio_redirty_for_writepage(wbc, folio); + folio_unlock(folio); + break; } - /* did we get anything? 
*/ - if (!locked_pages) - goto release_folios; - if (i) { - unsigned j, n = 0; - /* shift unused page to beginning of fbatch */ - for (j = 0; j < nr_folios; j++) { - if (!fbatch.folios[j]) - continue; - if (n < j) - fbatch.folios[n] = fbatch.folios[j]; - n++; - } - fbatch.nr = n; + ceph_wbc->fbatch.folios[i] = NULL; + ceph_wbc->len += folio_size(folio); + } - if (nr_folios && i == nr_folios && - locked_pages < max_pages) { - doutc(cl, "reached end fbatch, trying for more\n"); - folio_batch_release(&fbatch); - goto get_more_pages; - } + ceph_wbc->processed_in_fbatch = i; + + return rc; +} + +static inline +void ceph_shift_unused_folios_left(struct folio_batch *fbatch) +{ + unsigned j, n = 0; + + /* shift unused page to beginning of fbatch */ + for (j = 0; j < folio_batch_count(fbatch); j++) { + if (!fbatch->folios[j]) + continue; + + if (n < j) { + fbatch->folios[n] = fbatch->folios[j]; } + n++; + } + + fbatch->nr = n; +} + +static +int ceph_submit_write(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc) +{ + struct inode *inode = mapping->host; + struct ceph_inode_info *ci = ceph_inode(inode); + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct ceph_vino vino = ceph_vino(inode); + struct ceph_osd_request *req = NULL; + struct page *page = NULL; + bool caching = ceph_is_cache_enabled(inode); + u64 offset; + u64 len; + unsigned i; + new_request: - offset = ceph_fscrypt_page_offset(pages[0]); - len = wsize; + offset = ceph_fscrypt_page_offset(ceph_wbc->pages[0]); + len = ceph_wbc->wsize; + req = ceph_osdc_new_request(&fsc->client->osdc, + &ci->i_layout, vino, + offset, &len, 0, ceph_wbc->num_ops, + CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, + ceph_wbc->snapc, ceph_wbc->truncate_seq, + ceph_wbc->truncate_size, false); + if (IS_ERR(req)) { req = ceph_osdc_new_request(&fsc->client->osdc, - &ci->i_layout, vino, - offset, &len, 0, num_ops, - CEPH_OSD_OP_WRITE, CEPH_OSD_FLAG_WRITE, - snapc, ceph_wbc.truncate_seq, - ceph_wbc.truncate_size, false); - if (IS_ERR(req)) { - req = ceph_osdc_new_request(&fsc->client->osdc, - &ci->i_layout, vino, - offset, &len, 0, - min(num_ops, - CEPH_OSD_SLAB_OPS), - CEPH_OSD_OP_WRITE, - CEPH_OSD_FLAG_WRITE, - snapc, ceph_wbc.truncate_seq, - ceph_wbc.truncate_size, true); - BUG_ON(IS_ERR(req)); + &ci->i_layout, vino, + offset, &len, 0, + min(ceph_wbc->num_ops, + CEPH_OSD_SLAB_OPS), + CEPH_OSD_OP_WRITE, + CEPH_OSD_FLAG_WRITE, + ceph_wbc->snapc, + ceph_wbc->truncate_seq, + ceph_wbc->truncate_size, + true); + BUG_ON(IS_ERR(req)); + } + + page = ceph_wbc->pages[ceph_wbc->locked_pages - 1]; + BUG_ON(len < ceph_fscrypt_page_offset(page) + thp_size(page) - offset); + + if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { + for (i = 0; i < folio_batch_count(&ceph_wbc->fbatch); i++) { + struct folio *folio = ceph_wbc->fbatch.folios[i]; + + if (!folio) + continue; + + page = &folio->page; + redirty_page_for_writepage(wbc, page); + unlock_page(page); } - BUG_ON(len < ceph_fscrypt_page_offset(pages[locked_pages - 1]) + - thp_size(pages[locked_pages - 1]) - offset); - if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { - rc = -EIO; - goto release_folios; + for (i = 0; i < ceph_wbc->locked_pages; i++) { + page = ceph_fscrypt_pagecache_page(ceph_wbc->pages[i]); + + if (!page) + continue; + + redirty_page_for_writepage(wbc, page); + unlock_page(page); } - req->r_callback = writepages_finish; - req->r_inode = inode; - - /* Format the osd request message and submit the write */ - len = 
0; - data_pages = pages; - op_idx = 0; - for (i = 0; i < locked_pages; i++) { - struct page *page = ceph_fscrypt_pagecache_page(pages[i]); - - u64 cur_offset = page_offset(page); - /* - * Discontinuity in page range? Ceph can handle that by just passing - * multiple extents in the write op. - */ - if (offset + len != cur_offset) { - /* If it's full, stop here */ - if (op_idx + 1 == req->r_num_ops) - break; - - /* Kick off an fscache write with what we have so far. */ - ceph_fscache_write_to_cache(inode, offset, len, caching); - - /* Start a new extent */ - osd_req_op_extent_dup_last(req, op_idx, - cur_offset - offset); - doutc(cl, "got pages at %llu~%llu\n", offset, - len); - osd_req_op_extent_osd_data_pages(req, op_idx, - data_pages, len, 0, - from_pool, false); - osd_req_op_extent_update(req, op_idx, len); - - len = 0; - offset = cur_offset; - data_pages = pages + i; - op_idx++; - } - set_page_writeback(page); - if (caching) - ceph_set_page_fscache(page); - len += thp_size(page); + ceph_osdc_put_request(req); + return -EIO; + } + + req->r_callback = writepages_finish; + req->r_inode = inode; + + /* Format the osd request message and submit the write */ + len = 0; + ceph_wbc->data_pages = ceph_wbc->pages; + ceph_wbc->op_idx = 0; + for (i = 0; i < ceph_wbc->locked_pages; i++) { + u64 cur_offset; + + page = ceph_fscrypt_pagecache_page(ceph_wbc->pages[i]); + cur_offset = page_offset(page); + + /* + * Discontinuity in page range? Ceph can handle that by just passing + * multiple extents in the write op. + */ + if (offset + len != cur_offset) { + /* If it's full, stop here */ + if (ceph_wbc->op_idx + 1 == req->r_num_ops) + break; + + /* Kick off an fscache write with what we have so far. */ + ceph_fscache_write_to_cache(inode, offset, len, caching); + + /* Start a new extent */ + osd_req_op_extent_dup_last(req, ceph_wbc->op_idx, + cur_offset - offset); + + doutc(cl, "got pages at %llu~%llu\n", offset, len); + + osd_req_op_extent_osd_data_pages(req, ceph_wbc->op_idx, + ceph_wbc->data_pages, + len, 0, + ceph_wbc->from_pool, + false); + osd_req_op_extent_update(req, ceph_wbc->op_idx, len); + + len = 0; + offset = cur_offset; + ceph_wbc->data_pages = ceph_wbc->pages + i; + ceph_wbc->op_idx++; } - ceph_fscache_write_to_cache(inode, offset, len, caching); - - if (ceph_wbc.size_stable) { - len = min(len, ceph_wbc.i_size - offset); - } else if (i == locked_pages) { - /* writepages_finish() clears writeback pages - * according to the data length, so make sure - * data length covers all locked pages */ - u64 min_len = len + 1 - thp_size(page); - len = get_writepages_data_length(inode, pages[i - 1], - offset); - len = max(len, min_len); + + set_page_writeback(page); + + if (caching) + ceph_set_page_fscache(page); + + len += thp_size(page); + } + + ceph_fscache_write_to_cache(inode, offset, len, caching); + + if (ceph_wbc->size_stable) { + len = min(len, ceph_wbc->i_size - offset); + } else if (i == ceph_wbc->locked_pages) { + /* writepages_finish() clears writeback pages + * according to the data length, so make sure + * data length covers all locked pages */ + u64 min_len = len + 1 - thp_size(page); + len = get_writepages_data_length(inode, + ceph_wbc->pages[i - 1], + offset); + len = max(len, min_len); + } + + if (IS_ENCRYPTED(inode)) + len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE); + + doutc(cl, "got pages at %llu~%llu\n", offset, len); + + if (IS_ENCRYPTED(inode) && + ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK)) { + pr_warn_client(cl, + "bad encrypted write offset=%lld len=%llu\n", + offset, len); + } 
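/*
 * [Editor's note: the round_up() and the pr_warn_client() above exist
 * because fscrypt encrypts in fixed CEPH_FSCRYPT_BLOCK_SIZE units, so an
 * OSD write against an encrypted inode must start and end on a crypto
 * block boundary. A minimal predicate expressing the same check:]
 */
static bool encrypted_extent_is_aligned(u64 offset, u64 len)
{
        /* ~CEPH_FSCRYPT_BLOCK_MASK selects the in-block offset bits */
        return ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK) == 0;
}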
+ + osd_req_op_extent_osd_data_pages(req, ceph_wbc->op_idx, + ceph_wbc->data_pages, len, + 0, ceph_wbc->from_pool, false); + osd_req_op_extent_update(req, ceph_wbc->op_idx, len); + + BUG_ON(ceph_wbc->op_idx + 1 != req->r_num_ops); + + ceph_wbc->from_pool = false; + if (i < ceph_wbc->locked_pages) { + BUG_ON(ceph_wbc->num_ops <= req->r_num_ops); + ceph_wbc->num_ops -= req->r_num_ops; + ceph_wbc->locked_pages -= i; + + /* allocate new pages array for next request */ + ceph_wbc->data_pages = ceph_wbc->pages; + __ceph_allocate_page_array(ceph_wbc, ceph_wbc->locked_pages); + memcpy(ceph_wbc->pages, ceph_wbc->data_pages + i, + ceph_wbc->locked_pages * sizeof(*ceph_wbc->pages)); + memset(ceph_wbc->data_pages + i, 0, + ceph_wbc->locked_pages * sizeof(*ceph_wbc->pages)); + } else { + BUG_ON(ceph_wbc->num_ops != req->r_num_ops); + /* request message now owns the pages array */ + ceph_wbc->pages = NULL; + } + + req->r_mtime = inode_get_mtime(inode); + ceph_osdc_start_request(&fsc->client->osdc, req); + req = NULL; + + wbc->nr_to_write -= i; + if (ceph_wbc->pages) + goto new_request; + + return 0; +} + +static +void ceph_wait_until_current_writes_complete(struct address_space *mapping, + struct writeback_control *wbc, + struct ceph_writeback_ctl *ceph_wbc) +{ + struct page *page; + unsigned i, nr; + + if (wbc->sync_mode != WB_SYNC_NONE && + ceph_wbc->start_index == 0 && /* all dirty pages were checked */ + !ceph_wbc->head_snapc) { + ceph_wbc->index = 0; + + while ((ceph_wbc->index <= ceph_wbc->end) && + (nr = filemap_get_folios_tag(mapping, + &ceph_wbc->index, + (pgoff_t)-1, + PAGECACHE_TAG_WRITEBACK, + &ceph_wbc->fbatch))) { + for (i = 0; i < nr; i++) { + page = &ceph_wbc->fbatch.folios[i]->page; + if (page_snap_context(page) != ceph_wbc->snapc) + continue; + wait_on_page_writeback(page); + } + + folio_batch_release(&ceph_wbc->fbatch); + cond_resched(); } - if (IS_ENCRYPTED(inode)) - len = round_up(len, CEPH_FSCRYPT_BLOCK_SIZE); + } +} + +/* + * initiate async writeback + */ +static int ceph_writepages_start(struct address_space *mapping, + struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct ceph_fs_client *fsc = ceph_inode_to_fs_client(inode); + struct ceph_client *cl = fsc->client; + struct ceph_writeback_ctl ceph_wbc; + int rc = 0; - doutc(cl, "got pages at %llu~%llu\n", offset, len); + if (wbc->sync_mode == WB_SYNC_NONE && fsc->write_congested) + return 0; - if (IS_ENCRYPTED(inode) && - ((offset | len) & ~CEPH_FSCRYPT_BLOCK_MASK)) - pr_warn_client(cl, - "bad encrypted write offset=%lld len=%llu\n", - offset, len); - - osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len, - 0, from_pool, false); - osd_req_op_extent_update(req, op_idx, len); - - BUG_ON(op_idx + 1 != req->r_num_ops); - - from_pool = false; - if (i < locked_pages) { - BUG_ON(num_ops <= req->r_num_ops); - num_ops -= req->r_num_ops; - locked_pages -= i; - - /* allocate new pages array for next request */ - data_pages = pages; - pages = kmalloc_array(locked_pages, sizeof(*pages), - GFP_NOFS); - if (!pages) { - from_pool = true; - pages = mempool_alloc(ceph_wb_pagevec_pool, GFP_NOFS); - BUG_ON(!pages); + doutc(cl, "%llx.%llx (mode=%s)\n", ceph_vinop(inode), + wbc->sync_mode == WB_SYNC_NONE ? "NONE" : + (wbc->sync_mode == WB_SYNC_ALL ? "ALL" : "HOLD")); + + if (is_forced_umount(mapping)) { + /* we're in a forced umount, don't write! 
*/ + return -EIO; + } + + ceph_init_writeback_ctl(mapping, wbc, &ceph_wbc); + + if (!ceph_inc_osd_stopping_blocker(fsc->mdsc)) { + rc = -EIO; + goto out; + } + +retry: + rc = ceph_define_writeback_range(mapping, wbc, &ceph_wbc); + if (rc == -ENODATA) { + /* hmm, why does writepages get called when there + is no dirty data? */ + rc = 0; + goto dec_osd_stopping_blocker; + } + + if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages) + tag_pages_for_writeback(mapping, ceph_wbc.index, ceph_wbc.end); + + while (!has_writeback_done(&ceph_wbc)) { + ceph_wbc.locked_pages = 0; + ceph_wbc.max_pages = ceph_wbc.wsize >> PAGE_SHIFT; + +get_more_pages: + ceph_folio_batch_reinit(&ceph_wbc); + + ceph_wbc.nr_folios = filemap_get_folios_tag(mapping, + &ceph_wbc.index, + ceph_wbc.end, + ceph_wbc.tag, + &ceph_wbc.fbatch); + doutc(cl, "pagevec_lookup_range_tag for tag %#x got %d\n", + ceph_wbc.tag, ceph_wbc.nr_folios); + + if (!ceph_wbc.nr_folios && !ceph_wbc.locked_pages) + break; + +process_folio_batch: + rc = ceph_process_folio_batch(mapping, wbc, &ceph_wbc); + if (rc) + goto release_folios; + + /* did we get anything? */ + if (!ceph_wbc.locked_pages) + goto release_folios; + + if (ceph_wbc.processed_in_fbatch) { + ceph_shift_unused_folios_left(&ceph_wbc.fbatch); + + if (folio_batch_count(&ceph_wbc.fbatch) == 0 && + ceph_wbc.locked_pages < ceph_wbc.max_pages) { + doutc(cl, "reached end fbatch, trying for more\n"); + goto get_more_pages; } - memcpy(pages, data_pages + i, - locked_pages * sizeof(*pages)); - memset(data_pages + i, 0, - locked_pages * sizeof(*pages)); - } else { - BUG_ON(num_ops != req->r_num_ops); - index = pages[i - 1]->index + 1; - /* request message now owns the pages array */ - pages = NULL; } - req->r_mtime = inode_get_mtime(inode); - ceph_osdc_start_request(&fsc->client->osdc, req); - req = NULL; + rc = ceph_submit_write(mapping, wbc, &ceph_wbc); + if (rc) + goto release_folios; + + ceph_wbc.locked_pages = 0; + ceph_wbc.strip_unit_end = 0; - wbc->nr_to_write -= i; - if (pages) - goto new_request; + if (folio_batch_count(&ceph_wbc.fbatch) > 0) { + ceph_wbc.nr_folios = + folio_batch_count(&ceph_wbc.fbatch); + goto process_folio_batch; + } /* * We stop writing back only if we are not doing @@ -1377,61 +1713,44 @@ new_request: * we tagged for writeback prior to entering this loop. */ if (wbc->nr_to_write <= 0 && wbc->sync_mode == WB_SYNC_NONE) - done = true; + ceph_wbc.done = true; release_folios: doutc(cl, "folio_batch release on %d folios (%p)\n", - (int)fbatch.nr, fbatch.nr ? fbatch.folios[0] : NULL); - folio_batch_release(&fbatch); + (int)ceph_wbc.fbatch.nr, + ceph_wbc.fbatch.nr ? 
ceph_wbc.fbatch.folios[0] : NULL); + folio_batch_release(&ceph_wbc.fbatch); } - if (should_loop && !done) { + if (ceph_wbc.should_loop && !ceph_wbc.done) { /* more to do; loop back to beginning of file */ doutc(cl, "looping back to beginning of file\n"); - end = start_index - 1; /* OK even when start_index == 0 */ + /* OK even when start_index == 0 */ + ceph_wbc.end = ceph_wbc.start_index - 1; /* to write dirty pages associated with next snapc, * we need to wait until current writes complete */ - if (wbc->sync_mode != WB_SYNC_NONE && - start_index == 0 && /* all dirty pages were checked */ - !ceph_wbc.head_snapc) { - struct page *page; - unsigned i, nr; - index = 0; - while ((index <= end) && - (nr = filemap_get_folios_tag(mapping, &index, - (pgoff_t)-1, - PAGECACHE_TAG_WRITEBACK, - &fbatch))) { - for (i = 0; i < nr; i++) { - page = &fbatch.folios[i]->page; - if (page_snap_context(page) != snapc) - continue; - wait_on_page_writeback(page); - } - folio_batch_release(&fbatch); - cond_resched(); - } - } + ceph_wait_until_current_writes_complete(mapping, wbc, &ceph_wbc); - start_index = 0; - index = 0; + ceph_wbc.start_index = 0; + ceph_wbc.index = 0; goto retry; } - if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0)) - mapping->writeback_index = index; + if (wbc->range_cyclic || (ceph_wbc.range_whole && wbc->nr_to_write > 0)) + mapping->writeback_index = ceph_wbc.index; + +dec_osd_stopping_blocker: + ceph_dec_osd_stopping_blocker(fsc->mdsc); out: - ceph_osdc_put_request(req); - ceph_put_snap_context(last_snapc); + ceph_put_snap_context(ceph_wbc.last_snapc); doutc(cl, "%llx.%llx dend - startone, rc = %d\n", ceph_vinop(inode), rc); + return rc; } - - /* * See if a given @snapc is either writeable, or already written. */ @@ -1447,56 +1766,56 @@ static int context_is_writeable_or_written(struct inode *inode, /** * ceph_find_incompatible - find an incompatible context and return it - * @page: page being dirtied + * @folio: folio being dirtied * - * We are only allowed to write into/dirty a page if the page is + * We are only allowed to write into/dirty a folio if the folio is * clean, or already dirty within the same snap context. Returns a * conflicting context if there is one, NULL if there isn't, or a * negative error code on other errors. * - * Must be called with page lock held. + * Must be called with folio lock held. */ static struct ceph_snap_context * -ceph_find_incompatible(struct page *page) +ceph_find_incompatible(struct folio *folio) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct ceph_client *cl = ceph_inode_to_client(inode); struct ceph_inode_info *ci = ceph_inode(inode); if (ceph_inode_is_shutdown(inode)) { - doutc(cl, " %llx.%llx page %p is shutdown\n", - ceph_vinop(inode), page); + doutc(cl, " %llx.%llx folio %p is shutdown\n", + ceph_vinop(inode), folio); return ERR_PTR(-ESTALE); } for (;;) { struct ceph_snap_context *snapc, *oldest; - wait_on_page_writeback(page); + folio_wait_writeback(folio); - snapc = page_snap_context(page); + snapc = page_snap_context(&folio->page); if (!snapc || snapc == ci->i_head_snapc) break; /* - * this page is already dirty in another (older) snap + * this folio is already dirty in another (older) snap * context! is it writeable now? 
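/*
 * [Editor's note: the snap-context ordering rule used throughout this
 * file, reduced to a predicate. Only the oldest snap context with dirty
 * data may be written; anything with a newer sequence number must wait,
 * which is exactly the seq comparison ceph_find_incompatible() performs
 * just below.]
 */
static bool snapc_is_writeable_now(const struct ceph_snap_context *snapc,
                                   const struct ceph_snap_context *oldest)
{
        return snapc->seq <= oldest->seq;
}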
*/ oldest = get_oldest_context(inode, NULL, NULL); if (snapc->seq > oldest->seq) { /* not writeable -- return it for the caller to deal with */ ceph_put_snap_context(oldest); - doutc(cl, " %llx.%llx page %p snapc %p not current or oldest\n", - ceph_vinop(inode), page, snapc); + doutc(cl, " %llx.%llx folio %p snapc %p not current or oldest\n", + ceph_vinop(inode), folio, snapc); return ceph_get_snap_context(snapc); } ceph_put_snap_context(oldest); - /* yay, writeable, do it now (without dropping page lock) */ - doutc(cl, " %llx.%llx page %p snapc %p not current, but oldest\n", - ceph_vinop(inode), page, snapc); - if (clear_page_dirty_for_io(page)) { - int r = writepage_nounlock(page, NULL); + /* yay, writeable, do it now (without dropping folio lock) */ + doutc(cl, " %llx.%llx folio %p snapc %p not current, but oldest\n", + ceph_vinop(inode), folio, snapc); + if (folio_clear_dirty_for_io(folio)) { + int r = write_folio_nounlock(folio, NULL); if (r < 0) return ERR_PTR(r); } @@ -1511,7 +1830,7 @@ static int ceph_netfs_check_write_begin(struct file *file, loff_t pos, unsigned struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_snap_context *snapc; - snapc = ceph_find_incompatible(folio_page(*foliop, 0)); + snapc = ceph_find_incompatible(*foliop); if (snapc) { int r; @@ -1594,7 +1913,6 @@ out: const struct address_space_operations ceph_aops = { .read_folio = netfs_read_folio, .readahead = netfs_readahead, - .writepage = ceph_writepage, .writepages = ceph_writepages_start, .write_begin = ceph_write_begin, .write_end = ceph_write_end, @@ -1602,6 +1920,7 @@ const struct address_space_operations ceph_aops = { .invalidate_folio = ceph_invalidate_folio, .release_folio = netfs_release_folio, .direct_IO = noop_direct_IO, + .migrate_folio = filemap_migrate_folio, }; static void ceph_block_sigs(sigset_t *oldset) @@ -1718,8 +2037,8 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_file_info *fi = vma->vm_file->private_data; struct ceph_cap_flush *prealloc_cf; - struct page *page = vmf->page; - loff_t off = page_offset(page); + struct folio *folio = page_folio(vmf->page); + loff_t off = folio_pos(folio); loff_t size = i_size_read(inode); size_t len; int want, got, err; @@ -1736,10 +2055,10 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) sb_start_pagefault(inode->i_sb); ceph_block_sigs(&oldset); - if (off + thp_size(page) <= size) - len = thp_size(page); + if (off + folio_size(folio) <= size) + len = folio_size(folio); else - len = offset_in_thp(page, size); + len = offset_in_folio(folio, size); doutc(cl, "%llx.%llx %llu~%zd getting caps i_size %llu\n", ceph_vinop(inode), off, len, size); @@ -1756,30 +2075,30 @@ static vm_fault_t ceph_page_mkwrite(struct vm_fault *vmf) doutc(cl, "%llx.%llx %llu~%zd got cap refs on %s\n", ceph_vinop(inode), off, len, ceph_cap_string(got)); - /* Update time before taking page lock */ + /* Update time before taking folio lock */ file_update_time(vma->vm_file); inode_inc_iversion_raw(inode); do { struct ceph_snap_context *snapc; - lock_page(page); + folio_lock(folio); - if (page_mkwrite_check_truncate(page, inode) < 0) { - unlock_page(page); + if (folio_mkwrite_check_truncate(folio, inode) < 0) { + folio_unlock(folio); ret = VM_FAULT_NOPAGE; break; } - snapc = ceph_find_incompatible(page); + snapc = ceph_find_incompatible(folio); if (!snapc) { - /* success. we'll keep the page locked. */ - set_page_dirty(page); + /* success. we'll keep the folio locked. 
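/*
 * [Editor's sketch: the generic shape of a folio-based ->page_mkwrite,
 * i.e. ceph_page_mkwrite() above stripped of the cap and snap-context
 * handling. Returning VM_FAULT_LOCKED with the folio still locked and
 * dirty is the contract the VM expects.]
 */
static vm_fault_t example_page_mkwrite(struct vm_fault *vmf)
{
        struct folio *folio = page_folio(vmf->page);
        struct inode *inode = file_inode(vmf->vma->vm_file);

        folio_lock(folio);
        if (folio_mkwrite_check_truncate(folio, inode) < 0) {
                folio_unlock(folio);
                return VM_FAULT_NOPAGE;   /* raced with truncate */
        }
        folio_mark_dirty(folio);
        return VM_FAULT_LOCKED;           /* VM unlocks after PTE update */
}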
*/ + folio_mark_dirty(folio); ret = VM_FAULT_LOCKED; break; } - unlock_page(page); + folio_unlock(folio); if (IS_ERR(snapc)) { ret = VM_FAULT_SIGBUS; diff --git a/fs/ceph/dir.c b/fs/ceph/dir.c index 62e99e65250d..a321aa6d0ed2 100644 --- a/fs/ceph/dir.c +++ b/fs/ceph/dir.c @@ -141,17 +141,18 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx, if (ptr_pos >= i_size_read(dir)) return NULL; - if (!cache_ctl->page || ptr_pgoff != cache_ctl->page->index) { + if (!cache_ctl->folio || ptr_pgoff != cache_ctl->folio->index) { ceph_readdir_cache_release(cache_ctl); - cache_ctl->page = find_lock_page(&dir->i_data, ptr_pgoff); - if (!cache_ctl->page) { - doutc(cl, " page %lu not found\n", ptr_pgoff); + cache_ctl->folio = filemap_lock_folio(&dir->i_data, ptr_pgoff); + if (IS_ERR(cache_ctl->folio)) { + cache_ctl->folio = NULL; + doutc(cl, " folio %lu not found\n", ptr_pgoff); return ERR_PTR(-EAGAIN); } /* reading/filling the cache are serialized by - i_rwsem, no need to use page lock */ - unlock_page(cache_ctl->page); - cache_ctl->dentries = kmap(cache_ctl->page); + i_rwsem, no need to use folio lock */ + folio_unlock(cache_ctl->folio); + cache_ctl->dentries = kmap_local_folio(cache_ctl->folio, 0); } cache_ctl->index = idx & idx_mask; @@ -1092,19 +1093,20 @@ out: return err; } -static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct ceph_mds_client *mdsc = ceph_sb_to_mdsc(dir->i_sb); struct ceph_client *cl = mdsc->fsc->client; struct ceph_mds_request *req; struct ceph_acl_sec_ctx as_ctx = {}; + struct dentry *ret; int err; int op; err = ceph_wait_on_conflict_unlink(dentry); if (err) - return err; + return ERR_PTR(err); if (ceph_snap(dir) == CEPH_SNAPDIR) { /* mkdir .snap/foo is a MKSNAP */ @@ -1116,32 +1118,32 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, ceph_vinop(dir), dentry, dentry, mode); op = CEPH_MDS_OP_MKDIR; } else { - err = -EROFS; + ret = ERR_PTR(-EROFS); goto out; } if (op == CEPH_MDS_OP_MKDIR && ceph_quota_is_max_files_exceeded(dir)) { - err = -EDQUOT; + ret = ERR_PTR(-EDQUOT); goto out; } if ((op == CEPH_MDS_OP_MKSNAP) && IS_ENCRYPTED(dir) && !fscrypt_has_encryption_key(dir)) { - err = -ENOKEY; + ret = ERR_PTR(-ENOKEY); goto out; } req = ceph_mdsc_create_request(mdsc, op, USE_AUTH_MDS); if (IS_ERR(req)) { - err = PTR_ERR(req); + ret = ERR_CAST(req); goto out; } mode |= S_IFDIR; req->r_new_inode = ceph_new_inode(dir, dentry, &mode, &as_ctx); if (IS_ERR(req->r_new_inode)) { - err = PTR_ERR(req->r_new_inode); + ret = ERR_CAST(req->r_new_inode); req->r_new_inode = NULL; goto out_req; } @@ -1165,15 +1167,22 @@ static int ceph_mkdir(struct mnt_idmap *idmap, struct inode *dir, !req->r_reply_info.head->is_target && !req->r_reply_info.head->is_dentry) err = ceph_handle_notrace_create(dir, dentry); + ret = ERR_PTR(err); out_req: + if (!IS_ERR(ret) && req->r_dentry != dentry) + /* Some other dentry was spliced in */ + ret = dget(req->r_dentry); ceph_mdsc_put_request(req); out: - if (!err) + if (!IS_ERR(ret)) { + if (ret) + dentry = ret; ceph_init_inode_acls(d_inode(dentry), &as_ctx); - else + } else { d_drop(dentry); + } ceph_release_acl_sec_ctx(&as_ctx); - return err; + return ret; } static int ceph_link(struct dentry *old_dentry, struct inode *dir, diff --git a/fs/ceph/inode.c b/fs/ceph/inode.c index 7dd6c2275085..06cd2963e41e 100644 --- a/fs/ceph/inode.c +++ b/fs/ceph/inode.c @@ -1845,10 +1845,9 
@@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl) { - if (ctl->page) { - kunmap(ctl->page); - put_page(ctl->page); - ctl->page = NULL; + if (ctl->folio) { + folio_release_kmap(ctl->folio, ctl->dentries); + ctl->folio = NULL; } } @@ -1862,20 +1861,26 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn, unsigned idx = ctl->index % nsize; pgoff_t pgoff = ctl->index / nsize; - if (!ctl->page || pgoff != ctl->page->index) { + if (!ctl->folio || pgoff != ctl->folio->index) { ceph_readdir_cache_release(ctl); + fgf_t fgf = FGP_LOCK; + if (idx == 0) - ctl->page = grab_cache_page(&dir->i_data, pgoff); - else - ctl->page = find_lock_page(&dir->i_data, pgoff); - if (!ctl->page) { + fgf |= FGP_ACCESSED | FGP_CREAT; + + ctl->folio = __filemap_get_folio(&dir->i_data, pgoff, + fgf, mapping_gfp_mask(&dir->i_data)); + if (IS_ERR(ctl->folio)) { + int err = PTR_ERR(ctl->folio); + + ctl->folio = NULL; ctl->index = -1; - return idx == 0 ? -ENOMEM : 0; + return idx == 0 ? err : 0; } /* reading/filling the cache are serialized by - * i_rwsem, no need to use page lock */ - unlock_page(ctl->page); - ctl->dentries = kmap(ctl->page); + * i_rwsem, no need to use folio lock */ + folio_unlock(ctl->folio); + ctl->dentries = kmap_local_folio(ctl->folio, 0); if (idx == 0) memset(ctl->dentries, 0, PAGE_SIZE); } @@ -2362,7 +2367,7 @@ static int fill_fscrypt_truncate(struct inode *inode, /* Try to writeback the dirty pagecaches */ if (issued & (CEPH_CAP_FILE_BUFFER)) { - loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SHIFT - 1; + loff_t lend = orig_pos + CEPH_FSCRYPT_BLOCK_SIZE - 1; ret = filemap_write_and_wait_range(inode->i_mapping, orig_pos, lend); diff --git a/fs/ceph/mds_client.c b/fs/ceph/mds_client.c index 54b3421501e9..230e0c3f341f 100644 --- a/fs/ceph/mds_client.c +++ b/fs/ceph/mds_client.c @@ -5489,6 +5489,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) spin_lock_init(&mdsc->stopping_lock); atomic_set(&mdsc->stopping_blockers, 0); init_completion(&mdsc->stopping_waiter); + atomic64_set(&mdsc->dirty_folios, 0); + init_waitqueue_head(&mdsc->flush_end_wq); init_waitqueue_head(&mdsc->session_close_wq); INIT_LIST_HEAD(&mdsc->waiting_for_map); mdsc->quotarealms_inodes = RB_ROOT; diff --git a/fs/ceph/mds_client.h b/fs/ceph/mds_client.h index 7c9fee9e80d4..3e2a6fa7c19a 100644 --- a/fs/ceph/mds_client.h +++ b/fs/ceph/mds_client.h @@ -458,6 +458,9 @@ struct ceph_mds_client { atomic_t stopping_blockers; struct completion stopping_waiter; + atomic64_t dirty_folios; + wait_queue_head_t flush_end_wq; + atomic64_t quotarealms_count; /* # realms with quota */ /* * We keep a list of inodes we don't see in the mountpoint but that we diff --git a/fs/ceph/super.c b/fs/ceph/super.c index 4344e1f11806..f3951253e393 100644 --- a/fs/ceph/super.c +++ b/fs/ceph/super.c @@ -1563,6 +1563,17 @@ static void ceph_kill_sb(struct super_block *s) */ sync_filesystem(s); + if (atomic64_read(&mdsc->dirty_folios) > 0) { + wait_queue_head_t *wq = &mdsc->flush_end_wq; + long timeleft = wait_event_killable_timeout(*wq, + atomic64_read(&mdsc->dirty_folios) <= 0, + fsc->client->options->mount_timeout); + if (!timeleft) /* timed out */ + pr_warn_client(cl, "umount timed out, %ld\n", timeleft); + else if (timeleft < 0) /* killed */ + pr_warn_client(cl, "umount was killed, %ld\n", timeleft); + } + spin_lock(&mdsc->stopping_lock); mdsc->stopping = CEPH_MDSC_STOPPING_FLUSHING; wait = !!atomic_read(&mdsc->stopping_blockers); diff --git 
a/fs/ceph/super.h b/fs/ceph/super.h index 7fa1e7be50e4..bb0db0cc8003 100644 --- a/fs/ceph/super.h +++ b/fs/ceph/super.h @@ -903,7 +903,7 @@ ceph_find_rw_context(struct ceph_file_info *cf) } struct ceph_readdir_cache_control { - struct page *page; + struct folio *folio; struct dentry **dentries; int index; }; diff --git a/fs/coda/dir.c b/fs/coda/dir.c index a3e2dfeedfbf..ab69d8f0cec2 100644 --- a/fs/coda/dir.c +++ b/fs/coda/dir.c @@ -166,8 +166,8 @@ err_out: return error; } -static int coda_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *de, umode_t mode) +static struct dentry *coda_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *de, umode_t mode) { struct inode *inode; struct coda_vattr attrs; @@ -177,14 +177,14 @@ static int coda_mkdir(struct mnt_idmap *idmap, struct inode *dir, struct CodaFid newfid; if (is_root_inode(dir) && coda_iscontrol(name, len)) - return -EPERM; + return ERR_PTR(-EPERM); attrs.va_mode = mode; - error = venus_mkdir(dir->i_sb, coda_i2f(dir), + error = venus_mkdir(dir->i_sb, coda_i2f(dir), name, len, &newfid, &attrs); if (error) goto err_out; - + inode = coda_iget(dir->i_sb, &newfid, &attrs); if (IS_ERR(inode)) { error = PTR_ERR(inode); @@ -195,10 +195,10 @@ static int coda_mkdir(struct mnt_idmap *idmap, struct inode *dir, coda_dir_inc_nlink(dir); coda_dir_update_mtime(dir); d_instantiate(de, inode); - return 0; + return NULL; err_out: d_drop(de); - return error; + return ERR_PTR(error); } /* try to make de an entry in dir_inodde linked to source_de */ diff --git a/fs/configfs/dir.c b/fs/configfs/dir.c index 7d10278db30d..ebf32822e29b 100644 --- a/fs/configfs/dir.c +++ b/fs/configfs/dir.c @@ -619,7 +619,7 @@ static int populate_attrs(struct config_item *item) break; } } - if (t->ct_bin_attrs) { + if (!error && t->ct_bin_attrs) { for (i = 0; (bin_attr = t->ct_bin_attrs[i]) != NULL; i++) { if (ops && ops->is_bin_visible && !ops->is_bin_visible(item, bin_attr, i)) continue; @@ -970,7 +970,7 @@ static void configfs_dump_one(struct configfs_dirent *sd, int level) { pr_info("%*s\"%s\":\n", level, " ", configfs_get_name(sd)); -#define type_print(_type) if (sd->s_type & _type) pr_info("%*s %s\n", level, " ", #_type); +#define type_print(_type) if (sd->s_type & _type) pr_info("%*s %s\n", level, " ", #_type) type_print(CONFIGFS_ROOT); type_print(CONFIGFS_DIR); type_print(CONFIGFS_ITEM_ATTR); @@ -1280,8 +1280,8 @@ out_root_unlock: } EXPORT_SYMBOL(configfs_depend_item_unlocked); -static int configfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *configfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int ret = 0; int module_got = 0; @@ -1461,7 +1461,7 @@ out_put: put_fragment(frag); out: - return ret; + return ERR_PTR(ret); } static int configfs_rmdir(struct inode *dir, struct dentry *dentry) diff --git a/fs/configfs/item.c b/fs/configfs/item.c index 254170a82aa3..c378b5cbf87d 100644 --- a/fs/configfs/item.c +++ b/fs/configfs/item.c @@ -66,7 +66,7 @@ int config_item_set_name(struct config_item *item, const char *fmt, ...) name = kvasprintf(GFP_KERNEL, fmt, args); va_end(args); if (!name) - return -EFAULT; + return -ENOMEM; } /* Free the old name, if necessary. 
*/ diff --git a/fs/coredump.c b/fs/coredump.c index 591700e1b2ce..f217ebf2b3b6 100644 --- a/fs/coredump.c +++ b/fs/coredump.c @@ -43,6 +43,14 @@ #include <linux/timekeeping.h> #include <linux/sysctl.h> #include <linux/elf.h> +#include <linux/pidfs.h> +#include <linux/net.h> +#include <linux/socket.h> +#include <net/af_unix.h> +#include <net/net_namespace.h> +#include <net/sock.h> +#include <uapi/linux/pidfd.h> +#include <uapi/linux/un.h> #include <linux/uaccess.h> #include <asm/mmu_context.h> @@ -60,16 +68,30 @@ static void free_vma_snapshot(struct coredump_params *cprm); #define CORE_FILE_NOTE_SIZE_DEFAULT (4*1024*1024) /* Define a reasonable max cap */ #define CORE_FILE_NOTE_SIZE_MAX (16*1024*1024) +/* + * File descriptor number for the pidfd for the thread-group leader of + * the coredumping task installed into the usermode helper's file + * descriptor table. + */ +#define COREDUMP_PIDFD_NUMBER 3 static int core_uses_pid; static unsigned int core_pipe_limit; +static unsigned int core_sort_vma; static char core_pattern[CORENAME_MAX_SIZE] = "core"; static int core_name_size = CORENAME_MAX_SIZE; unsigned int core_file_note_size_limit = CORE_FILE_NOTE_SIZE_DEFAULT; +enum coredump_type_t { + COREDUMP_FILE = 1, + COREDUMP_PIPE = 2, + COREDUMP_SOCK = 3, +}; + struct core_name { char *corename; int used, size; + enum coredump_type_t core_type; }; static int expand_corename(struct core_name *cn, int size) @@ -209,18 +231,24 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, { const struct cred *cred = current_cred(); const char *pat_ptr = core_pattern; - int ispipe = (*pat_ptr == '|'); bool was_space = false; int pid_in_pattern = 0; int err = 0; cn->used = 0; cn->corename = NULL; + if (*pat_ptr == '|') + cn->core_type = COREDUMP_PIPE; + else if (*pat_ptr == '@') + cn->core_type = COREDUMP_SOCK; + else + cn->core_type = COREDUMP_FILE; if (expand_corename(cn, core_name_size)) return -ENOMEM; cn->corename[0] = '\0'; - if (ispipe) { + switch (cn->core_type) { + case COREDUMP_PIPE: { int argvs = sizeof(core_pattern) / 2; (*argv) = kmalloc_array(argvs, sizeof(**argv), GFP_KERNEL); if (!(*argv)) @@ -229,6 +257,45 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, ++pat_ptr; if (!(*pat_ptr)) return -ENOMEM; + break; + } + case COREDUMP_SOCK: { + /* skip the @ */ + pat_ptr++; + if (!(*pat_ptr)) + return -ENOMEM; + + err = cn_printf(cn, "%s", pat_ptr); + if (err) + return err; + + /* Require absolute paths. */ + if (cn->corename[0] != '/') + return -EINVAL; + + /* + * Ensure we can uses spaces to indicate additional + * parameters in the future. + */ + if (strchr(cn->corename, ' ')) { + coredump_report_failure("Coredump socket may not %s contain spaces", cn->corename); + return -EINVAL; + } + + /* + * Currently no need to parse any other options. + * Relevant information can be retrieved from the peer + * pidfd retrievable via SO_PEERPIDFD by the receiver or + * via /proc/<pid>, using the SO_PEERPIDFD to guard + * against pid recycling when opening /proc/<pid>. 
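/*
 * [Editor's sketch, userspace side, not part of the patch: a minimal
 * listener for the "@/..." coredump socket described above. The socket
 * path and output handling are hypothetical, error checking is trimmed,
 * and SO_PEERPIDFD requires a kernel and libc recent enough to define it.]
 */
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>

int main(void)
{
        struct sockaddr_un addr = { .sun_family = AF_UNIX };
        int srv = socket(AF_UNIX, SOCK_STREAM, 0);

        strcpy(addr.sun_path, "/run/coredump.sock"); /* hypothetical path */
        bind(srv, (struct sockaddr *)&addr, sizeof(addr));
        listen(srv, 1);
        for (;;) {
                int conn = accept(srv, NULL, NULL), peer_pidfd;
                socklen_t optlen = sizeof(peer_pidfd);
                char buf[65536];

                /* identify the crashing task without pid-recycling races */
                getsockopt(conn, SOL_SOCKET, SO_PEERPIDFD,
                           &peer_pidfd, &optlen);
                while (read(conn, buf, sizeof(buf)) > 0)
                        ;       /* persist the core image somewhere */
                close(conn);    /* EOF tells the kernel we are done */
                close(peer_pidfd);
        }
}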
+ */ + return 0; + } + case COREDUMP_FILE: + break; + default: + WARN_ON_ONCE(true); + return -EINVAL; } /* Repeat as long as we have more pattern to process and more output @@ -238,7 +305,7 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, * Split on spaces before doing template expansion so that * %e and %E don't get split if they have spaces in them */ - if (ispipe) { + if (cn->core_type == COREDUMP_PIPE) { if (isspace(*pat_ptr)) { if (cn->used != 0) was_space = true; @@ -338,6 +405,27 @@ static int format_corename(struct core_name *cn, struct coredump_params *cprm, case 'C': err = cn_printf(cn, "%d", cprm->cpu); break; + /* pidfd number */ + case 'F': { + /* + * Installing a pidfd only makes sense if + * we actually spawn a usermode helper. + */ + if (cn->core_type != COREDUMP_PIPE) + break; + + /* + * Note that we'll install a pidfd for the + * thread-group leader. We know that task + * linkage hasn't been removed yet and even if + * this @current isn't the actual thread-group + * leader we know that the thread-group leader + * cannot be reaped until @current has exited. + */ + cprm->pid = task_tgid(current); + err = cn_printf(cn, "%d", COREDUMP_PIDFD_NUMBER); + break; + } default: break; } @@ -354,12 +442,10 @@ out: * If core_pattern does not include a %p (as is the default) * and core_uses_pid is set, then .%pid will be appended to * the filename. Do not do this for piped commands. */ - if (!ispipe && !pid_in_pattern && core_uses_pid) { - err = cn_printf(cn, ".%d", task_tgid_vnr(current)); - if (err) - return err; - } - return ispipe; + if (cn->core_type == COREDUMP_FILE && !pid_in_pattern && core_uses_pid) + return cn_printf(cn, ".%d", task_tgid_vnr(current)); + + return 0; } static int zap_process(struct signal_struct *signal, int exit_code) @@ -492,7 +578,7 @@ static void wait_for_dump_helpers(struct file *file) } /* - * umh_pipe_setup + * umh_coredump_setup * helper function to customize the process used * to collect the core in userspace. Specifically * it sets up a pipe and installs it as fd 0 (stdin) @@ -502,11 +588,34 @@ static void wait_for_dump_helpers(struct file *file) * is a special value that we use to trap recursive * core dumps */ -static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) +static int umh_coredump_setup(struct subprocess_info *info, struct cred *new) { struct file *files[2]; struct coredump_params *cp = (struct coredump_params *)info->data; - int err = create_pipe_files(files, 0); + int err; + + if (cp->pid) { + struct file *pidfs_file __free(fput) = NULL; + + pidfs_file = pidfs_alloc_file(cp->pid, 0); + if (IS_ERR(pidfs_file)) + return PTR_ERR(pidfs_file); + + pidfs_coredump(cp); + + /* + * Usermode helpers are childen of either + * system_unbound_wq or of kthreadd. So we know that + * we're starting off with a clean file descriptor + * table. So we should always be able to use + * COREDUMP_PIDFD_NUMBER as our file descriptor value. 
+ */ + err = replace_fd(COREDUMP_PIDFD_NUMBER, pidfs_file, 0); + if (err < 0) + return err; + } + + err = create_pipe_files(files, 0); if (err) return err; @@ -514,10 +623,13 @@ static int umh_pipe_setup(struct subprocess_info *info, struct cred *new) err = replace_fd(0, files[0], 0); fput(files[0]); + if (err < 0) + return err; + /* and disallow core files too */ current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1}; - return err; + return 0; } void do_coredump(const kernel_siginfo_t *siginfo) @@ -529,7 +641,6 @@ void do_coredump(const kernel_siginfo_t *siginfo) const struct cred *old_cred; struct cred *cred; int retval = 0; - int ispipe; size_t *argv = NULL; int argc = 0; /* require nonrelative corefile path and be extra careful */ @@ -578,70 +689,14 @@ void do_coredump(const kernel_siginfo_t *siginfo) old_cred = override_creds(cred); - ispipe = format_corename(&cn, &cprm, &argv, &argc); - - if (ispipe) { - int argi; - int dump_count; - char **helper_argv; - struct subprocess_info *sub_info; - - if (ispipe < 0) { - coredump_report_failure("format_corename failed, aborting core"); - goto fail_unlock; - } - - if (cprm.limit == 1) { - /* See umh_pipe_setup() which sets RLIMIT_CORE = 1. - * - * Normally core limits are irrelevant to pipes, since - * we're not writing to the file system, but we use - * cprm.limit of 1 here as a special value, this is a - * consistent way to catch recursive crashes. - * We can still crash if the core_pattern binary sets - * RLIM_CORE = !1, but it runs as root, and can do - * lots of stupid things. - * - * Note that we use task_tgid_vnr here to grab the pid - * of the process group leader. That way we get the - * right pid if a thread in a multi-threaded - * core_pattern process dies. - */ - coredump_report_failure("RLIMIT_CORE is set to 1, aborting core"); - goto fail_unlock; - } - cprm.limit = RLIM_INFINITY; - - dump_count = atomic_inc_return(&core_dump_count); - if (core_pipe_limit && (core_pipe_limit < dump_count)) { - coredump_report_failure("over core_pipe_limit, skipping core dump"); - goto fail_dropcount; - } - - helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv), - GFP_KERNEL); - if (!helper_argv) { - coredump_report_failure("%s failed to allocate memory", __func__); - goto fail_dropcount; - } - for (argi = 0; argi < argc; argi++) - helper_argv[argi] = cn.corename + argv[argi]; - helper_argv[argi] = NULL; - - retval = -ENOMEM; - sub_info = call_usermodehelper_setup(helper_argv[0], - helper_argv, NULL, GFP_KERNEL, - umh_pipe_setup, NULL, &cprm); - if (sub_info) - retval = call_usermodehelper_exec(sub_info, - UMH_WAIT_EXEC); + retval = format_corename(&cn, &cprm, &argv, &argc); + if (retval < 0) { + coredump_report_failure("format_corename failed, aborting core"); + goto fail_unlock; + } - kfree(helper_argv); - if (retval) { - coredump_report_failure("|%s pipe failed", cn.corename); - goto close_fail; - } - } else { + switch (cn.core_type) { + case COREDUMP_FILE: { struct mnt_idmap *idmap; struct inode *inode; int open_flags = O_CREAT | O_WRONLY | O_NOFOLLOW | @@ -735,6 +790,143 @@ void do_coredump(const kernel_siginfo_t *siginfo) if (do_truncate(idmap, cprm.file->f_path.dentry, 0, 0, cprm.file)) goto close_fail; + break; + } + case COREDUMP_PIPE: { + int argi; + int dump_count; + char **helper_argv; + struct subprocess_info *sub_info; + + if (cprm.limit == 1) { + /* See umh_coredump_setup() which sets RLIMIT_CORE = 1. 
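/*
 * [Editor's sketch, usermode-helper side, assumptions flagged: with a
 * "|/path/helper %F" core_pattern, the kernel hands the helper the core
 * stream on fd 0 and, per the hunk above, a pidfd for the thread-group
 * leader on fd 3 (COREDUMP_PIDFD_NUMBER). The output path is made up.]
 */
#include <stdio.h>
#include <unistd.h>

#define COREDUMP_PIDFD 3        /* mirrors COREDUMP_PIDFD_NUMBER */

int main(void)
{
        char buf[4096];
        ssize_t n;
        FILE *out = fopen("/tmp/core.example", "w"); /* hypothetical sink */

        if (!out)
                return 1;
        /* fd 0 is the pipe the kernel writes the core image into */
        while ((n = read(0, buf, sizeof(buf))) > 0)
                fwrite(buf, 1, (size_t)n, out);
        fclose(out);
        /*
         * fd 3 stays valid even if the crashing pid is recycled, so the
         * helper can still query the dumped process through it (e.g. via
         * the PIDFD_GET_INFO ioctl on kernels that support it).
         */
        close(COREDUMP_PIDFD);
        return 0;
}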
+ * + * Normally core limits are irrelevant to pipes, since + * we're not writing to the file system, but we use + * cprm.limit of 1 here as a special value, this is a + * consistent way to catch recursive crashes. + * We can still crash if the core_pattern binary sets + * RLIM_CORE = !1, but it runs as root, and can do + * lots of stupid things. + * + * Note that we use task_tgid_vnr here to grab the pid + * of the process group leader. That way we get the + * right pid if a thread in a multi-threaded + * core_pattern process dies. + */ + coredump_report_failure("RLIMIT_CORE is set to 1, aborting core"); + goto fail_unlock; + } + cprm.limit = RLIM_INFINITY; + + dump_count = atomic_inc_return(&core_dump_count); + if (core_pipe_limit && (core_pipe_limit < dump_count)) { + coredump_report_failure("over core_pipe_limit, skipping core dump"); + goto fail_dropcount; + } + + helper_argv = kmalloc_array(argc + 1, sizeof(*helper_argv), + GFP_KERNEL); + if (!helper_argv) { + coredump_report_failure("%s failed to allocate memory", __func__); + goto fail_dropcount; + } + for (argi = 0; argi < argc; argi++) + helper_argv[argi] = cn.corename + argv[argi]; + helper_argv[argi] = NULL; + + retval = -ENOMEM; + sub_info = call_usermodehelper_setup(helper_argv[0], + helper_argv, NULL, GFP_KERNEL, + umh_coredump_setup, NULL, &cprm); + if (sub_info) + retval = call_usermodehelper_exec(sub_info, + UMH_WAIT_EXEC); + + kfree(helper_argv); + if (retval) { + coredump_report_failure("|%s pipe failed", cn.corename); + goto close_fail; + } + break; + } + case COREDUMP_SOCK: { +#ifdef CONFIG_UNIX + struct file *file __free(fput) = NULL; + struct sockaddr_un addr = { + .sun_family = AF_UNIX, + }; + ssize_t addr_len; + struct socket *socket; + + addr_len = strscpy(addr.sun_path, cn.corename); + if (addr_len < 0) + goto close_fail; + addr_len += offsetof(struct sockaddr_un, sun_path) + 1; + + /* + * It is possible that the userspace process which is + * supposed to handle the coredump and is listening on + * the AF_UNIX socket coredumps. Userspace should just + * mark itself non dumpable. + */ + + retval = sock_create_kern(&init_net, AF_UNIX, SOCK_STREAM, 0, &socket); + if (retval < 0) + goto close_fail; + + file = sock_alloc_file(socket, 0, NULL); + if (IS_ERR(file)) + goto close_fail; + + /* + * Set the thread-group leader pid which is used for the + * peer credentials during connect() below. Then + * immediately register it in pidfs... + */ + cprm.pid = task_tgid(current); + retval = pidfs_register_pid(cprm.pid); + if (retval) + goto close_fail; + + /* + * ... and set the coredump information so userspace + * has it available after connect()... + */ + pidfs_coredump(&cprm); + + retval = kernel_connect(socket, (struct sockaddr *)(&addr), + addr_len, O_NONBLOCK | SOCK_COREDUMP); + + /* + * ... Make sure to only put our reference after connect() took + * its own reference keeping the pidfs entry alive ... + */ + pidfs_put_pid(cprm.pid); + + if (retval) { + if (retval == -EAGAIN) + coredump_report_failure("Coredump socket %s receive queue full", addr.sun_path); + else + coredump_report_failure("Coredump socket connection %s failed %d", addr.sun_path, retval); + goto close_fail; + } + + /* ... and validate that @sk_peer_pid matches @cprm.pid. 
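To make the COREDUMP_SOCK flow concrete, here is a minimal sketch of a userspace coredump server for core_pattern = "@/run/coredump.sock" (socket path hypothetical). It assumes a kernel with this series; the SO_PEERPIDFD lookup is optional, available on newer kernels, and simply hands the server a pidfd for the crashing task, matching the pidfs registration done above:

    #include <stdio.h>
    #include <string.h>
    #include <sys/socket.h>
    #include <sys/un.h>
    #include <unistd.h>

    int main(void)
    {
    	struct sockaddr_un addr = { .sun_family = AF_UNIX };
    	int srv = socket(AF_UNIX, SOCK_STREAM, 0);

    	if (srv < 0)
    		return 1;
    	strncpy(addr.sun_path, "/run/coredump.sock", sizeof(addr.sun_path) - 1);
    	unlink(addr.sun_path);
    	if (bind(srv, (struct sockaddr *)&addr, sizeof(addr)) < 0 ||
    	    listen(srv, 8) < 0)	/* backlog bounds queued dumps; cf. -EAGAIN above */
    		return 1;

    	for (;;) {
    		int con = accept(srv, NULL, NULL);
    		char buf[65536];
    		ssize_t n;

    		if (con < 0)
    			continue;
    #ifdef SO_PEERPIDFD
    		int pidfd = -1;
    		socklen_t len = sizeof(pidfd);

    		/* Optional: a pidfd for the crashing task, via its pidfs entry. */
    		getsockopt(con, SOL_SOCKET, SO_PEERPIDFD, &pidfd, &len);
    		if (pidfd >= 0)
    			close(pidfd);
    #endif
    		/* Stream the core; EOF arrives after the kernel's SHUT_WR. */
    		while ((n = read(con, buf, sizeof(buf))) > 0)
    			;	/* write it somewhere durable */
    		close(con);	/* unblocks the dumper when core_pipe_limit is set */
    	}
    }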
+		 */
+		if (WARN_ON_ONCE(unix_peer(socket->sk)->sk_peer_pid != cprm.pid))
+			goto close_fail;
+
+		cprm.limit = RLIM_INFINITY;
+		cprm.file = no_free_ptr(file);
+#else
+		coredump_report_failure("Core dump socket support %s disabled", cn.corename);
+		goto close_fail;
+#endif
+		break;
+	}
+	default:
+		WARN_ON_ONCE(true);
+		goto close_fail;
 	}
 
 	/* get us an unshared descriptor table; almost always a no-op */
@@ -769,13 +961,49 @@ void do_coredump(const kernel_siginfo_t *siginfo)
 		file_end_write(cprm.file);
 		free_vma_snapshot(&cprm);
 	}
-	if (ispipe && core_pipe_limit)
-		wait_for_dump_helpers(cprm.file);
+
+#ifdef CONFIG_UNIX
+	/* Let userspace know we're done processing the coredump. */
+	if (sock_from_file(cprm.file))
+		kernel_sock_shutdown(sock_from_file(cprm.file), SHUT_WR);
+#endif
+
+	/*
+	 * When core_pipe_limit is set we wait for the coredump server
+	 * or usermodehelper to finish before exiting so it can e.g.,
+	 * inspect /proc/<pid>.
+	 */
+	if (core_pipe_limit) {
+		switch (cn.core_type) {
+		case COREDUMP_PIPE:
+			wait_for_dump_helpers(cprm.file);
+			break;
+#ifdef CONFIG_UNIX
+		case COREDUMP_SOCK: {
+			ssize_t n;
+
+			/*
+			 * We use a simple read to wait for the coredump
+			 * processing to finish. Either the socket is
+			 * closed or we get sent unexpected data. In
+			 * both cases, we're done.
+			 */
+			n = __kernel_read(cprm.file, &(char){ 0 }, 1, NULL);
+			if (n != 0)
+				coredump_report_failure("Unexpected data on coredump socket");
+			break;
+		}
+#endif
+		default:
+			break;
+		}
+	}
+
 close_fail:
 	if (cprm.file)
 		filp_close(cprm.file, NULL);
 fail_dropcount:
-	if (ispipe)
+	if (cn.core_type == COREDUMP_PIPE)
 		atomic_dec(&core_dump_count);
 fail_unlock:
 	kfree(argv);
@@ -798,10 +1026,9 @@ static int __dump_emit(struct coredump_params *cprm, const void *addr, int nr)
 	struct file *file = cprm->file;
 	loff_t pos = file->f_pos;
 	ssize_t n;
+
 	if (cprm->written + nr > cprm->limit)
 		return 0;
 
-	if (dump_interrupted())
-		return 0;
 	n = __kernel_write(file, addr, nr, &pos);
@@ -818,20 +1045,21 @@ static int __dump_skip(struct coredump_params *cprm, size_t nr)
 {
 	static char zeroes[PAGE_SIZE];
 	struct file *file = cprm->file;
+
 	if (file->f_mode & FMODE_LSEEK) {
-		if (dump_interrupted() ||
-		    vfs_llseek(file, nr, SEEK_CUR) < 0)
+		if (dump_interrupted() || vfs_llseek(file, nr, SEEK_CUR) < 0)
 			return 0;
 		cprm->pos += nr;
 		return 1;
-	} else {
-		while (nr > PAGE_SIZE) {
-			if (!__dump_emit(cprm, zeroes, PAGE_SIZE))
-				return 0;
-			nr -= PAGE_SIZE;
-		}
-		return __dump_emit(cprm, zeroes, nr);
 	}
+
+	while (nr > PAGE_SIZE) {
+		if (!__dump_emit(cprm, zeroes, PAGE_SIZE))
+			return 0;
+		nr -= PAGE_SIZE;
+	}
+
+	return __dump_emit(cprm, zeroes, nr);
 }
 
 int dump_emit(struct coredump_params *cprm, const void *addr, int nr)
@@ -925,14 +1153,23 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
 {
 	unsigned long addr;
 	struct page *dump_page;
+	int locked, ret;
 
 	dump_page = dump_page_alloc();
 	if (!dump_page)
 		return 0;
 
+	ret = 0;
+	locked = 0;
 	for (addr = start; addr < start + len; addr += PAGE_SIZE) {
 		struct page *page;
 
+		if (!locked) {
+			if (mmap_read_lock_killable(current->mm))
+				goto out;
+			locked = 1;
+		}
+
 		/*
 		 * To avoid having to allocate page tables for virtual address
 		 * ranges that have never been used yet, and also to make it
@@ -940,21 +1177,38 @@ int dump_user_range(struct coredump_params *cprm, unsigned long start,
 		 * NULL when encountering an empty page table entry that would
 		 * otherwise have been filled with the zero page.
*/ - page = get_dump_page(addr); + page = get_dump_page(addr, &locked); if (page) { + if (locked) { + mmap_read_unlock(current->mm); + locked = 0; + } int stop = !dump_emit_page(cprm, dump_page_copy(page, dump_page)); put_page(page); - if (stop) { - dump_page_free(dump_page); - return 0; - } + if (stop) + goto out; } else { dump_skip(cprm, PAGE_SIZE); } + + if (dump_interrupted()) + goto out; + + if (!need_resched()) + continue; + if (locked) { + mmap_read_unlock(current->mm); + locked = 0; + } cond_resched(); } + ret = 1; +out: + if (locked) + mmap_read_unlock(current->mm); + dump_page_free(dump_page); - return 1; + return ret; } #endif @@ -974,7 +1228,7 @@ EXPORT_SYMBOL(dump_align); void validate_coredump_safety(void) { if (suid_dumpable == SUID_DUMP_ROOT && - core_pattern[0] != '/' && core_pattern[0] != '|') { + core_pattern[0] != '/' && core_pattern[0] != '|' && core_pattern[0] != '@') { coredump_report_failure("Unsafe core_pattern used with fs.suid_dumpable=2: " "pipe handler or fully qualified core dump path required. " @@ -982,18 +1236,55 @@ void validate_coredump_safety(void) } } +static inline bool check_coredump_socket(void) +{ + if (core_pattern[0] != '@') + return true; + + /* + * Coredump socket must be located in the initial mount + * namespace. Don't give the impression that anything else is + * supported right now. + */ + if (current->nsproxy->mnt_ns != init_task.nsproxy->mnt_ns) + return false; + + /* Must be an absolute path. */ + if (*(core_pattern + 1) != '/') + return false; + + return true; +} + static int proc_dostring_coredump(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - int error = proc_dostring(table, write, buffer, lenp, ppos); + int error; + ssize_t retval; + char old_core_pattern[CORENAME_MAX_SIZE]; + + retval = strscpy(old_core_pattern, core_pattern, CORENAME_MAX_SIZE); + + error = proc_dostring(table, write, buffer, lenp, ppos); + if (error) + return error; + if (!check_coredump_socket()) { + strscpy(core_pattern, old_core_pattern, retval + 1); + return -EINVAL; + } - if (!error) - validate_coredump_safety(); + validate_coredump_safety(); return error; } static const unsigned int core_file_note_size_min = CORE_FILE_NOTE_SIZE_DEFAULT; static const unsigned int core_file_note_size_max = CORE_FILE_NOTE_SIZE_MAX; +static char core_modes[] = { + "file\npipe" +#ifdef CONFIG_UNIX + "\nsocket" +#endif +}; static const struct ctl_table coredump_sysctls[] = { { @@ -1015,7 +1306,9 @@ static const struct ctl_table coredump_sysctls[] = { .data = &core_pipe_limit, .maxlen = sizeof(unsigned int), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_INT_MAX, }, { .procname = "core_file_note_size_limit", @@ -1026,6 +1319,22 @@ static const struct ctl_table coredump_sysctls[] = { .extra1 = (unsigned int *)&core_file_note_size_min, .extra2 = (unsigned int *)&core_file_note_size_max, }, + { + .procname = "core_sort_vma", + .data = &core_sort_vma, + .maxlen = sizeof(int), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = SYSCTL_ONE, + }, + { + .procname = "core_modes", + .data = core_modes, + .maxlen = sizeof(core_modes) - 1, + .mode = 0444, + .proc_handler = proc_dostring, + }, }; static int __init init_fs_coredump_sysctls(void) @@ -1256,8 +1565,9 @@ static bool dump_vma_snapshot(struct coredump_params *cprm) cprm->vma_data_size += m->dump_size; } - sort(cprm->vma_meta, cprm->vma_count, sizeof(*cprm->vma_meta), - 
cmp_vma_size, NULL); + if (core_sort_vma) + sort(cprm->vma_meta, cprm->vma_count, sizeof(*cprm->vma_meta), + cmp_vma_size, NULL); return true; } diff --git a/fs/crypto/Kconfig b/fs/crypto/Kconfig index 5aff5934baa1..b5dfb0aa405a 100644 --- a/fs/crypto/Kconfig +++ b/fs/crypto/Kconfig @@ -3,6 +3,7 @@ config FS_ENCRYPTION bool "FS Encryption (Per-file encryption)" select CRYPTO select CRYPTO_HASH + select CRYPTO_HKDF select CRYPTO_SKCIPHER select CRYPTO_LIB_SHA256 select KEYS @@ -24,20 +25,16 @@ config FS_ENCRYPTION # # Also note that this option only pulls in the generic implementations of the # algorithms, not any per-architecture optimized implementations. It is -# strongly recommended to enable optimized implementations too. It is safe to -# disable these generic implementations if corresponding optimized -# implementations will always be available too; for this reason, these are soft -# dependencies ('imply' rather than 'select'). Only disable these generic -# implementations if you're sure they will never be needed, though. +# strongly recommended to enable optimized implementations too. config FS_ENCRYPTION_ALGS tristate - imply CRYPTO_AES - imply CRYPTO_CBC - imply CRYPTO_CTS - imply CRYPTO_ECB - imply CRYPTO_HMAC - imply CRYPTO_SHA512 - imply CRYPTO_XTS + select CRYPTO_AES + select CRYPTO_CBC + select CRYPTO_CTS + select CRYPTO_ECB + select CRYPTO_HMAC + select CRYPTO_SHA512 + select CRYPTO_XTS config FS_ENCRYPTION_INLINE_CRYPT bool "Enable fscrypt to use inline crypto" diff --git a/fs/crypto/crypto.c b/fs/crypto/crypto.c index 328470d40dec..b74b5937e695 100644 --- a/fs/crypto/crypto.c +++ b/fs/crypto/crypto.c @@ -153,8 +153,8 @@ int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci, } /** - * fscrypt_encrypt_pagecache_blocks() - Encrypt data from a pagecache page - * @page: the locked pagecache page containing the data to encrypt + * fscrypt_encrypt_pagecache_blocks() - Encrypt data from a pagecache folio + * @folio: the locked pagecache folio containing the data to encrypt * @len: size of the data to encrypt, in bytes * @offs: offset within @page of the data to encrypt, in bytes * @gfp_flags: memory allocation flags; see details below @@ -177,23 +177,21 @@ int fscrypt_crypt_data_unit(const struct fscrypt_inode_info *ci, * * Return: the new encrypted bounce page on success; an ERR_PTR() on failure */ -struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, - unsigned int len, - unsigned int offs, - gfp_t gfp_flags) - +struct page *fscrypt_encrypt_pagecache_blocks(struct folio *folio, + size_t len, size_t offs, gfp_t gfp_flags) { - const struct inode *inode = page->mapping->host; + const struct inode *inode = folio->mapping->host; const struct fscrypt_inode_info *ci = inode->i_crypt_info; const unsigned int du_bits = ci->ci_data_unit_bits; const unsigned int du_size = 1U << du_bits; struct page *ciphertext_page; - u64 index = ((u64)page->index << (PAGE_SHIFT - du_bits)) + + u64 index = ((u64)folio->index << (PAGE_SHIFT - du_bits)) + (offs >> du_bits); unsigned int i; int err; - if (WARN_ON_ONCE(!PageLocked(page))) + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); + if (WARN_ON_ONCE(!folio_test_locked(folio))) return ERR_PTR(-EINVAL); if (WARN_ON_ONCE(len <= 0 || !IS_ALIGNED(len | offs, du_size))) @@ -205,7 +203,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, for (i = offs; i < offs + len; i += du_size, index++) { err = fscrypt_crypt_data_unit(ci, FS_ENCRYPT, index, - page, ciphertext_page, + &folio->page, ciphertext_page, du_size, i, 
gfp_flags); if (err) { fscrypt_free_bounce_page(ciphertext_page); @@ -213,7 +211,7 @@ struct page *fscrypt_encrypt_pagecache_blocks(struct page *page, } } SetPagePrivate(ciphertext_page); - set_page_private(ciphertext_page, (unsigned long)page); + set_page_private(ciphertext_page, (unsigned long)folio); return ciphertext_page; } EXPORT_SYMBOL(fscrypt_encrypt_pagecache_blocks); diff --git a/fs/crypto/fscrypt_private.h b/fs/crypto/fscrypt_private.h index 8371e4e1f596..c1d92074b65c 100644 --- a/fs/crypto/fscrypt_private.h +++ b/fs/crypto/fscrypt_private.h @@ -12,6 +12,7 @@ #define _FSCRYPT_PRIVATE_H #include <linux/fscrypt.h> +#include <linux/minmax.h> #include <linux/siphash.h> #include <crypto/hash.h> #include <linux/blk-crypto.h> @@ -27,6 +28,23 @@ */ #define FSCRYPT_MIN_KEY_SIZE 16 +/* Maximum size of a raw fscrypt master key */ +#define FSCRYPT_MAX_RAW_KEY_SIZE 64 + +/* Maximum size of a hardware-wrapped fscrypt master key */ +#define FSCRYPT_MAX_HW_WRAPPED_KEY_SIZE BLK_CRYPTO_MAX_HW_WRAPPED_KEY_SIZE + +/* Maximum size of an fscrypt master key across both key types */ +#define FSCRYPT_MAX_ANY_KEY_SIZE \ + MAX(FSCRYPT_MAX_RAW_KEY_SIZE, FSCRYPT_MAX_HW_WRAPPED_KEY_SIZE) + +/* + * FSCRYPT_MAX_KEY_SIZE is defined in the UAPI header, but the addition of + * hardware-wrapped keys has made it misleading as it's only for raw keys. + * Don't use it in kernel code; use one of the above constants instead. + */ +#undef FSCRYPT_MAX_KEY_SIZE + #define FSCRYPT_CONTEXT_V1 1 #define FSCRYPT_CONTEXT_V2 2 @@ -360,13 +378,15 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, * outputs are unique and cryptographically isolated, i.e. knowledge of one * output doesn't reveal another. */ -#define HKDF_CONTEXT_KEY_IDENTIFIER 1 /* info=<empty> */ +#define HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY 1 /* info=<empty> */ #define HKDF_CONTEXT_PER_FILE_ENC_KEY 2 /* info=file_nonce */ #define HKDF_CONTEXT_DIRECT_KEY 3 /* info=mode_num */ #define HKDF_CONTEXT_IV_INO_LBLK_64_KEY 4 /* info=mode_num||fs_uuid */ #define HKDF_CONTEXT_DIRHASH_KEY 5 /* info=file_nonce */ #define HKDF_CONTEXT_IV_INO_LBLK_32_KEY 6 /* info=mode_num||fs_uuid */ #define HKDF_CONTEXT_INODE_HASH_KEY 7 /* info=<empty> */ +#define HKDF_CONTEXT_KEY_IDENTIFIER_FOR_HW_WRAPPED_KEY \ + 8 /* info=<empty> */ int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, const u8 *info, unsigned int infolen, @@ -376,7 +396,8 @@ void fscrypt_destroy_hkdf(struct fscrypt_hkdf *hkdf); /* inline_crypt.c */ #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT -int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci); +int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci, + bool is_hw_wrapped_key); static inline bool fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci) @@ -385,12 +406,17 @@ fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci) } int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, - const u8 *raw_key, + const u8 *key_bytes, size_t key_size, + bool is_hw_wrapped, const struct fscrypt_inode_info *ci); void fscrypt_destroy_inline_crypt_key(struct super_block *sb, struct fscrypt_prepared_key *prep_key); +int fscrypt_derive_sw_secret(struct super_block *sb, + const u8 *wrapped_key, size_t wrapped_key_size, + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]); + /* * Check whether the crypto transform or blk-crypto key has been allocated in * @prep_key, depending on which encryption implementation the file will use. 
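As background for the two key-identifier contexts defined above: an fscrypt v2 key identifier is the first 16 bytes of HKDF-SHA512 output, derived with an all-zero salt and the info string "fscrypt\0" followed by the context byte. The sketch below reproduces this in userspace with OpenSSL (1.1.1 or later); it is illustrative only. It uses context 1 (HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY); per the hunk above, hardware-wrapped keys instead use context 8 over the hardware-derived software secret, which userspace normally cannot compute:

    /* Build with: gcc id.c -lcrypto */
    #include <openssl/evp.h>
    #include <openssl/kdf.h>
    #include <stdio.h>

    int main(void)
    {
    	unsigned char master_key[64] = { 0 };	/* the raw fscrypt master key */
    	unsigned char salt[64] = { 0 };		/* unsalted HKDF-Extract */
    	unsigned char info[9] = "fscrypt";	/* "fscrypt\0" + context byte */
    	unsigned char id[16];
    	size_t idlen = sizeof(id);

    	info[8] = 1;	/* HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY */

    	EVP_PKEY_CTX *ctx = EVP_PKEY_CTX_new_id(EVP_PKEY_HKDF, NULL);
    	EVP_PKEY_derive_init(ctx);
    	EVP_PKEY_CTX_set_hkdf_md(ctx, EVP_sha512());
    	EVP_PKEY_CTX_set1_hkdf_salt(ctx, salt, sizeof(salt));
    	EVP_PKEY_CTX_set1_hkdf_key(ctx, master_key, sizeof(master_key));
    	EVP_PKEY_CTX_add1_hkdf_info(ctx, info, sizeof(info));
    	EVP_PKEY_derive(ctx, id, &idlen);
    	EVP_PKEY_CTX_free(ctx);

    	for (size_t i = 0; i < sizeof(id); i++)
    		printf("%02x", id[i]);
    	printf("\n");
    	return 0;
    }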
@@ -414,7 +440,8 @@ fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, #else /* CONFIG_FS_ENCRYPTION_INLINE_CRYPT */ -static inline int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci) +static inline int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci, + bool is_hw_wrapped_key) { return 0; } @@ -427,7 +454,8 @@ fscrypt_using_inline_encryption(const struct fscrypt_inode_info *ci) static inline int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, - const u8 *raw_key, + const u8 *key_bytes, size_t key_size, + bool is_hw_wrapped, const struct fscrypt_inode_info *ci) { WARN_ON_ONCE(1); @@ -440,6 +468,15 @@ fscrypt_destroy_inline_crypt_key(struct super_block *sb, { } +static inline int +fscrypt_derive_sw_secret(struct super_block *sb, + const u8 *wrapped_key, size_t wrapped_key_size, + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]) +{ + fscrypt_warn(NULL, "kernel doesn't support hardware-wrapped keys"); + return -EOPNOTSUPP; +} + static inline bool fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, const struct fscrypt_inode_info *ci) @@ -456,20 +493,38 @@ fscrypt_is_key_prepared(struct fscrypt_prepared_key *prep_key, struct fscrypt_master_key_secret { /* - * For v2 policy keys: HKDF context keyed by this master key. - * For v1 policy keys: not set (hkdf.hmac_tfm == NULL). + * The KDF with which subkeys of this key can be derived. + * + * For v1 policy keys, this isn't applicable and won't be set. + * Otherwise, this KDF will be keyed by this master key if + * ->is_hw_wrapped=false, or by the "software secret" that hardware + * derived from this master key if ->is_hw_wrapped=true. */ struct fscrypt_hkdf hkdf; /* - * Size of the raw key in bytes. This remains set even if ->raw was + * True if this key is a hardware-wrapped key; false if this key is a + * raw key (i.e. a "software key"). For v1 policy keys this will always + * be false, as v1 policy support is a legacy feature which doesn't + * support newer functionality such as hardware-wrapped keys. + */ + bool is_hw_wrapped; + + /* + * Size of the key in bytes. This remains set even if ->bytes was * zeroized due to no longer being needed. I.e. we still remember the * size of the key even if we don't need to remember the key itself. */ u32 size; - /* For v1 policy keys: the raw key. Wiped for v2 policy keys. */ - u8 raw[FSCRYPT_MAX_KEY_SIZE]; + /* + * The bytes of the key, when still needed. This can be either a raw + * key or a hardware-wrapped key, as indicated by ->is_hw_wrapped. In + * the case of a raw, v2 policy key, there is no need to remember the + * actual key separately from ->hkdf so this field will be zeroized as + * soon as ->hkdf is initialized. + */ + u8 bytes[FSCRYPT_MAX_ANY_KEY_SIZE]; } __randomize_layout; diff --git a/fs/crypto/hkdf.c b/fs/crypto/hkdf.c index 5a384dad2c72..0f3028adc9c7 100644 --- a/fs/crypto/hkdf.c +++ b/fs/crypto/hkdf.c @@ -1,16 +1,15 @@ // SPDX-License-Identifier: GPL-2.0 /* - * Implementation of HKDF ("HMAC-based Extract-and-Expand Key Derivation - * Function"), aka RFC 5869. See also the original paper (Krawczyk 2010): - * "Cryptographic Extraction and Key Derivation: The HKDF Scheme". - * - * This is used to derive keys from the fscrypt master keys. + * This is used to derive keys from the fscrypt master keys (or from the + * "software secrets" which hardware derives from the fscrypt master keys, in + * the case that the fscrypt master keys are hardware-wrapped keys). 
* * Copyright 2019 Google LLC */ #include <crypto/hash.h> #include <crypto/sha2.h> +#include <crypto/hkdf.h> #include "fscrypt_private.h" @@ -44,20 +43,6 @@ * there's no way to persist a random salt per master key from kernel mode. */ -/* HKDF-Extract (RFC 5869 section 2.2), unsalted */ -static int hkdf_extract(struct crypto_shash *hmac_tfm, const u8 *ikm, - unsigned int ikmlen, u8 prk[HKDF_HASHLEN]) -{ - static const u8 default_salt[HKDF_HASHLEN]; - int err; - - err = crypto_shash_setkey(hmac_tfm, default_salt, HKDF_HASHLEN); - if (err) - return err; - - return crypto_shash_tfm_digest(hmac_tfm, ikm, ikmlen, prk); -} - /* * Compute HKDF-Extract using the given master key as the input keying material, * and prepare an HMAC transform object keyed by the resulting pseudorandom key. @@ -69,6 +54,7 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, unsigned int master_key_size) { struct crypto_shash *hmac_tfm; + static const u8 default_salt[HKDF_HASHLEN]; u8 prk[HKDF_HASHLEN]; int err; @@ -84,7 +70,8 @@ int fscrypt_init_hkdf(struct fscrypt_hkdf *hkdf, const u8 *master_key, goto err_free_tfm; } - err = hkdf_extract(hmac_tfm, master_key, master_key_size, prk); + err = hkdf_extract(hmac_tfm, master_key, master_key_size, + default_salt, HKDF_HASHLEN, prk); if (err) goto err_free_tfm; @@ -118,61 +105,21 @@ int fscrypt_hkdf_expand(const struct fscrypt_hkdf *hkdf, u8 context, u8 *okm, unsigned int okmlen) { SHASH_DESC_ON_STACK(desc, hkdf->hmac_tfm); - u8 prefix[9]; - unsigned int i; + u8 *full_info; int err; - const u8 *prev = NULL; - u8 counter = 1; - u8 tmp[HKDF_HASHLEN]; - - if (WARN_ON_ONCE(okmlen > 255 * HKDF_HASHLEN)) - return -EINVAL; + full_info = kzalloc(infolen + 9, GFP_KERNEL); + if (!full_info) + return -ENOMEM; desc->tfm = hkdf->hmac_tfm; - memcpy(prefix, "fscrypt\0", 8); - prefix[8] = context; - - for (i = 0; i < okmlen; i += HKDF_HASHLEN) { - - err = crypto_shash_init(desc); - if (err) - goto out; - - if (prev) { - err = crypto_shash_update(desc, prev, HKDF_HASHLEN); - if (err) - goto out; - } - - err = crypto_shash_update(desc, prefix, sizeof(prefix)); - if (err) - goto out; - - err = crypto_shash_update(desc, info, infolen); - if (err) - goto out; - - BUILD_BUG_ON(sizeof(counter) != 1); - if (okmlen - i < HKDF_HASHLEN) { - err = crypto_shash_finup(desc, &counter, 1, tmp); - if (err) - goto out; - memcpy(&okm[i], tmp, okmlen - i); - memzero_explicit(tmp, sizeof(tmp)); - } else { - err = crypto_shash_finup(desc, &counter, 1, &okm[i]); - if (err) - goto out; - } - counter++; - prev = &okm[i]; - } - err = 0; -out: - if (unlikely(err)) - memzero_explicit(okm, okmlen); /* so caller doesn't need to */ - shash_desc_zero(desc); + memcpy(full_info, "fscrypt\0", 8); + full_info[8] = context; + memcpy(full_info + 9, info, infolen); + + err = hkdf_expand(hkdf->hmac_tfm, full_info, infolen + 9, + okm, okmlen); + kfree_sensitive(full_info); return err; } diff --git a/fs/crypto/inline_crypt.c b/fs/crypto/inline_crypt.c index 40de69860dcf..1d008c440cb6 100644 --- a/fs/crypto/inline_crypt.c +++ b/fs/crypto/inline_crypt.c @@ -89,7 +89,8 @@ static void fscrypt_log_blk_crypto_impl(struct fscrypt_mode *mode, } /* Enable inline encryption for this file if supported. 
*/ -int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci) +int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci, + bool is_hw_wrapped_key) { const struct inode *inode = ci->ci_inode; struct super_block *sb = inode->i_sb; @@ -130,6 +131,8 @@ int fscrypt_select_encryption_impl(struct fscrypt_inode_info *ci) crypto_cfg.crypto_mode = ci->ci_mode->blk_crypto_mode; crypto_cfg.data_unit_size = 1U << ci->ci_data_unit_bits; crypto_cfg.dun_bytes = fscrypt_get_dun_bytes(ci); + crypto_cfg.key_type = is_hw_wrapped_key ? + BLK_CRYPTO_KEY_TYPE_HW_WRAPPED : BLK_CRYPTO_KEY_TYPE_RAW; devs = fscrypt_get_devices(sb, &num_devs); if (IS_ERR(devs)) @@ -150,12 +153,15 @@ out_free_devs: } int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, - const u8 *raw_key, + const u8 *key_bytes, size_t key_size, + bool is_hw_wrapped, const struct fscrypt_inode_info *ci) { const struct inode *inode = ci->ci_inode; struct super_block *sb = inode->i_sb; enum blk_crypto_mode_num crypto_mode = ci->ci_mode->blk_crypto_mode; + enum blk_crypto_key_type key_type = is_hw_wrapped ? + BLK_CRYPTO_KEY_TYPE_HW_WRAPPED : BLK_CRYPTO_KEY_TYPE_RAW; struct blk_crypto_key *blk_key; struct block_device **devs; unsigned int num_devs; @@ -166,8 +172,8 @@ int fscrypt_prepare_inline_crypt_key(struct fscrypt_prepared_key *prep_key, if (!blk_key) return -ENOMEM; - err = blk_crypto_init_key(blk_key, raw_key, crypto_mode, - fscrypt_get_dun_bytes(ci), + err = blk_crypto_init_key(blk_key, key_bytes, key_size, key_type, + crypto_mode, fscrypt_get_dun_bytes(ci), 1U << ci->ci_data_unit_bits); if (err) { fscrypt_err(inode, "error %d initializing blk-crypto key", err); @@ -226,6 +232,34 @@ void fscrypt_destroy_inline_crypt_key(struct super_block *sb, kfree_sensitive(blk_key); } +/* + * Ask the inline encryption hardware to derive the software secret from a + * hardware-wrapped key. Returns -EOPNOTSUPP if hardware-wrapped keys aren't + * supported on this filesystem or hardware. + */ +int fscrypt_derive_sw_secret(struct super_block *sb, + const u8 *wrapped_key, size_t wrapped_key_size, + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]) +{ + int err; + + /* The filesystem must be mounted with -o inlinecrypt. */ + if (!(sb->s_flags & SB_INLINECRYPT)) { + fscrypt_warn(NULL, + "%s: filesystem not mounted with inlinecrypt\n", + sb->s_id); + return -EOPNOTSUPP; + } + + err = blk_crypto_derive_sw_secret(sb->s_bdev, wrapped_key, + wrapped_key_size, sw_secret); + if (err == -EOPNOTSUPP) + fscrypt_warn(NULL, + "%s: block device doesn't support hardware-wrapped keys\n", + sb->s_id); + return err; +} + bool __fscrypt_inode_uses_inline_crypto(const struct inode *inode) { return inode->i_crypt_info->ci_inlinecrypt; diff --git a/fs/crypto/keyring.c b/fs/crypto/keyring.c index 787e9c8938ba..ace369f13068 100644 --- a/fs/crypto/keyring.c +++ b/fs/crypto/keyring.c @@ -149,11 +149,11 @@ static int fscrypt_user_key_instantiate(struct key *key, struct key_preparsed_payload *prep) { /* - * We just charge FSCRYPT_MAX_KEY_SIZE bytes to the user's key quota for - * each key, regardless of the exact key size. The amount of memory + * We just charge FSCRYPT_MAX_RAW_KEY_SIZE bytes to the user's key quota + * for each key, regardless of the exact key size. The amount of memory * actually used is greater than the size of the raw key anyway. 
*/ - return key_payload_reserve(key, FSCRYPT_MAX_KEY_SIZE); + return key_payload_reserve(key, FSCRYPT_MAX_RAW_KEY_SIZE); } static void fscrypt_user_key_describe(const struct key *key, struct seq_file *m) @@ -558,20 +558,45 @@ static int add_master_key(struct super_block *sb, int err; if (key_spec->type == FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) { - err = fscrypt_init_hkdf(&secret->hkdf, secret->raw, - secret->size); - if (err) - return err; + u8 sw_secret[BLK_CRYPTO_SW_SECRET_SIZE]; + u8 *kdf_key = secret->bytes; + unsigned int kdf_key_size = secret->size; + u8 keyid_kdf_ctx = HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY; /* - * Now that the HKDF context is initialized, the raw key is no - * longer needed. + * For raw keys, the fscrypt master key is used directly as the + * fscrypt KDF key. For hardware-wrapped keys, we have to pass + * the master key to the hardware to derive the KDF key, which + * is then only used to derive non-file-contents subkeys. + */ + if (secret->is_hw_wrapped) { + err = fscrypt_derive_sw_secret(sb, secret->bytes, + secret->size, sw_secret); + if (err) + return err; + kdf_key = sw_secret; + kdf_key_size = sizeof(sw_secret); + /* + * To avoid weird behavior if someone manages to + * determine sw_secret and add it as a raw key, ensure + * that hardware-wrapped keys and raw keys will have + * different key identifiers by deriving their key + * identifiers using different KDF contexts. + */ + keyid_kdf_ctx = + HKDF_CONTEXT_KEY_IDENTIFIER_FOR_HW_WRAPPED_KEY; + } + err = fscrypt_init_hkdf(&secret->hkdf, kdf_key, kdf_key_size); + /* + * Now that the KDF context is initialized, the raw KDF key is + * no longer needed. */ - memzero_explicit(secret->raw, secret->size); + memzero_explicit(kdf_key, kdf_key_size); + if (err) + return err; /* Calculate the key identifier */ - err = fscrypt_hkdf_expand(&secret->hkdf, - HKDF_CONTEXT_KEY_IDENTIFIER, NULL, 0, + err = fscrypt_hkdf_expand(&secret->hkdf, keyid_kdf_ctx, NULL, 0, key_spec->u.identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); if (err) @@ -580,19 +605,36 @@ static int add_master_key(struct super_block *sb, return do_add_master_key(sb, secret, key_spec); } +/* + * Validate the size of an fscrypt master key being added. Note that this is + * just an initial check, as we don't know which ciphers will be used yet. + * There is a stricter size check later when the key is actually used by a file. + */ +static inline bool fscrypt_valid_key_size(size_t size, u32 add_key_flags) +{ + u32 max_size = (add_key_flags & FSCRYPT_ADD_KEY_FLAG_HW_WRAPPED) ? 
+ FSCRYPT_MAX_HW_WRAPPED_KEY_SIZE : + FSCRYPT_MAX_RAW_KEY_SIZE; + + return size >= FSCRYPT_MIN_KEY_SIZE && size <= max_size; +} + static int fscrypt_provisioning_key_preparse(struct key_preparsed_payload *prep) { const struct fscrypt_provisioning_key_payload *payload = prep->data; - if (prep->datalen < sizeof(*payload) + FSCRYPT_MIN_KEY_SIZE || - prep->datalen > sizeof(*payload) + FSCRYPT_MAX_KEY_SIZE) + if (prep->datalen < sizeof(*payload)) + return -EINVAL; + + if (!fscrypt_valid_key_size(prep->datalen - sizeof(*payload), + payload->flags)) return -EINVAL; if (payload->type != FSCRYPT_KEY_SPEC_TYPE_DESCRIPTOR && payload->type != FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) return -EINVAL; - if (payload->__reserved) + if (payload->flags & ~FSCRYPT_ADD_KEY_FLAG_HW_WRAPPED) return -EINVAL; prep->payload.data[0] = kmemdup(payload, prep->datalen, GFP_KERNEL); @@ -636,21 +678,21 @@ static struct key_type key_type_fscrypt_provisioning = { }; /* - * Retrieve the raw key from the Linux keyring key specified by 'key_id', and - * store it into 'secret'. + * Retrieve the key from the Linux keyring key specified by 'key_id', and store + * it into 'secret'. * - * The key must be of type "fscrypt-provisioning" and must have the field - * fscrypt_provisioning_key_payload::type set to 'type', indicating that it's - * only usable with fscrypt with the particular KDF version identified by - * 'type'. We don't use the "logon" key type because there's no way to - * completely restrict the use of such keys; they can be used by any kernel API - * that accepts "logon" keys and doesn't require a specific service prefix. + * The key must be of type "fscrypt-provisioning" and must have the 'type' and + * 'flags' field of the payload set to the given values, indicating that the key + * is intended for use for the specified purpose. We don't use the "logon" key + * type because there's no way to completely restrict the use of such keys; they + * can be used by any kernel API that accepts "logon" keys and doesn't require a + * specific service prefix. * * The ability to specify the key via Linux keyring key is intended for cases * where userspace needs to re-add keys after the filesystem is unmounted and - * re-mounted. Most users should just provide the raw key directly instead. + * re-mounted. Most users should just provide the key directly instead. */ -static int get_keyring_key(u32 key_id, u32 type, +static int get_keyring_key(u32 key_id, u32 type, u32 flags, struct fscrypt_master_key_secret *secret) { key_ref_t ref; @@ -667,12 +709,16 @@ static int get_keyring_key(u32 key_id, u32 type, goto bad_key; payload = key->payload.data[0]; - /* Don't allow fscrypt v1 keys to be used as v2 keys and vice versa. */ - if (payload->type != type) + /* + * Don't allow fscrypt v1 keys to be used as v2 keys and vice versa. + * Similarly, don't allow hardware-wrapped keys to be used as + * non-hardware-wrapped keys and vice versa. 
+ */ + if (payload->type != type || payload->flags != flags) goto bad_key; secret->size = key->datalen - sizeof(*payload); - memcpy(secret->raw, payload->raw, secret->size); + memcpy(secret->bytes, payload->raw, secret->size); err = 0; goto out_put; @@ -734,19 +780,28 @@ int fscrypt_ioctl_add_key(struct file *filp, void __user *_uarg) return -EACCES; memset(&secret, 0, sizeof(secret)); + + if (arg.flags) { + if (arg.flags & ~FSCRYPT_ADD_KEY_FLAG_HW_WRAPPED) + return -EINVAL; + if (arg.key_spec.type != FSCRYPT_KEY_SPEC_TYPE_IDENTIFIER) + return -EINVAL; + secret.is_hw_wrapped = true; + } + if (arg.key_id) { if (arg.raw_size != 0) return -EINVAL; - err = get_keyring_key(arg.key_id, arg.key_spec.type, &secret); + err = get_keyring_key(arg.key_id, arg.key_spec.type, arg.flags, + &secret); if (err) goto out_wipe_secret; } else { - if (arg.raw_size < FSCRYPT_MIN_KEY_SIZE || - arg.raw_size > FSCRYPT_MAX_KEY_SIZE) + if (!fscrypt_valid_key_size(arg.raw_size, arg.flags)) return -EINVAL; secret.size = arg.raw_size; err = -EFAULT; - if (copy_from_user(secret.raw, uarg->raw, secret.size)) + if (copy_from_user(secret.bytes, uarg->raw, secret.size)) goto out_wipe_secret; } @@ -770,13 +825,13 @@ EXPORT_SYMBOL_GPL(fscrypt_ioctl_add_key); static void fscrypt_get_test_dummy_secret(struct fscrypt_master_key_secret *secret) { - static u8 test_key[FSCRYPT_MAX_KEY_SIZE]; + static u8 test_key[FSCRYPT_MAX_RAW_KEY_SIZE]; - get_random_once(test_key, FSCRYPT_MAX_KEY_SIZE); + get_random_once(test_key, sizeof(test_key)); memset(secret, 0, sizeof(*secret)); - secret->size = FSCRYPT_MAX_KEY_SIZE; - memcpy(secret->raw, test_key, FSCRYPT_MAX_KEY_SIZE); + secret->size = sizeof(test_key); + memcpy(secret->bytes, test_key, sizeof(test_key)); } int fscrypt_get_test_dummy_key_identifier( @@ -787,10 +842,11 @@ int fscrypt_get_test_dummy_key_identifier( fscrypt_get_test_dummy_secret(&secret); - err = fscrypt_init_hkdf(&secret.hkdf, secret.raw, secret.size); + err = fscrypt_init_hkdf(&secret.hkdf, secret.bytes, secret.size); if (err) goto out; - err = fscrypt_hkdf_expand(&secret.hkdf, HKDF_CONTEXT_KEY_IDENTIFIER, + err = fscrypt_hkdf_expand(&secret.hkdf, + HKDF_CONTEXT_KEY_IDENTIFIER_FOR_RAW_KEY, NULL, 0, key_identifier, FSCRYPT_KEY_IDENTIFIER_SIZE); out: diff --git a/fs/crypto/keysetup.c b/fs/crypto/keysetup.c index b4fe01ea4bd4..0d71843af946 100644 --- a/fs/crypto/keysetup.c +++ b/fs/crypto/keysetup.c @@ -153,7 +153,9 @@ int fscrypt_prepare_key(struct fscrypt_prepared_key *prep_key, struct crypto_skcipher *tfm; if (fscrypt_using_inline_encryption(ci)) - return fscrypt_prepare_inline_crypt_key(prep_key, raw_key, ci); + return fscrypt_prepare_inline_crypt_key(prep_key, raw_key, + ci->ci_mode->keysize, + false, ci); tfm = fscrypt_allocate_skcipher(ci->ci_mode, raw_key, ci->ci_inode); if (IS_ERR(tfm)) @@ -195,14 +197,29 @@ static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci, struct fscrypt_mode *mode = ci->ci_mode; const u8 mode_num = mode - fscrypt_modes; struct fscrypt_prepared_key *prep_key; - u8 mode_key[FSCRYPT_MAX_KEY_SIZE]; + u8 mode_key[FSCRYPT_MAX_RAW_KEY_SIZE]; u8 hkdf_info[sizeof(mode_num) + sizeof(sb->s_uuid)]; unsigned int hkdf_infolen = 0; + bool use_hw_wrapped_key = false; int err; if (WARN_ON_ONCE(mode_num > FSCRYPT_MODE_MAX)) return -EINVAL; + if (mk->mk_secret.is_hw_wrapped && S_ISREG(inode->i_mode)) { + /* Using a hardware-wrapped key for file contents encryption */ + if (!fscrypt_using_inline_encryption(ci)) { + if (sb->s_flags & SB_INLINECRYPT) + fscrypt_warn(ci->ci_inode, + "Hardware-wrapped key 
required, but no suitable inline encryption capabilities are available"); + else + fscrypt_warn(ci->ci_inode, + "Hardware-wrapped keys require inline encryption (-o inlinecrypt)"); + return -EINVAL; + } + use_hw_wrapped_key = true; + } + prep_key = &keys[mode_num]; if (fscrypt_is_key_prepared(prep_key, ci)) { ci->ci_enc_key = *prep_key; @@ -214,6 +231,16 @@ static int setup_per_mode_enc_key(struct fscrypt_inode_info *ci, if (fscrypt_is_key_prepared(prep_key, ci)) goto done_unlock; + if (use_hw_wrapped_key) { + err = fscrypt_prepare_inline_crypt_key(prep_key, + mk->mk_secret.bytes, + mk->mk_secret.size, true, + ci); + if (err) + goto out_unlock; + goto done_unlock; + } + BUILD_BUG_ON(sizeof(mode_num) != 1); BUILD_BUG_ON(sizeof(sb->s_uuid) != 16); BUILD_BUG_ON(sizeof(hkdf_info) != 17); @@ -336,6 +363,14 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci, { int err; + if (mk->mk_secret.is_hw_wrapped && + !(ci->ci_policy.v2.flags & (FSCRYPT_POLICY_FLAG_IV_INO_LBLK_64 | + FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32))) { + fscrypt_warn(ci->ci_inode, + "Hardware-wrapped keys are only supported with IV_INO_LBLK policies"); + return -EINVAL; + } + if (ci->ci_policy.v2.flags & FSCRYPT_POLICY_FLAG_DIRECT_KEY) { /* * DIRECT_KEY: instead of deriving per-file encryption keys, the @@ -362,7 +397,7 @@ static int fscrypt_setup_v2_file_key(struct fscrypt_inode_info *ci, FSCRYPT_POLICY_FLAG_IV_INO_LBLK_32) { err = fscrypt_setup_iv_ino_lblk_32_key(ci, mk); } else { - u8 derived_key[FSCRYPT_MAX_KEY_SIZE]; + u8 derived_key[FSCRYPT_MAX_RAW_KEY_SIZE]; err = fscrypt_hkdf_expand(&mk->mk_secret.hkdf, HKDF_CONTEXT_PER_FILE_ENC_KEY, @@ -445,10 +480,6 @@ static int setup_file_encryption_key(struct fscrypt_inode_info *ci, struct fscrypt_master_key *mk; int err; - err = fscrypt_select_encryption_impl(ci); - if (err) - return err; - err = fscrypt_policy_to_key_spec(&ci->ci_policy, &mk_spec); if (err) return err; @@ -476,6 +507,10 @@ static int setup_file_encryption_key(struct fscrypt_inode_info *ci, if (ci->ci_policy.version != FSCRYPT_POLICY_V1) return -ENOKEY; + err = fscrypt_select_encryption_impl(ci, false); + if (err) + return err; + /* * As a legacy fallback for v1 policies, search for the key in * the current task's subscribed keyrings too. Don't move this @@ -497,9 +532,21 @@ static int setup_file_encryption_key(struct fscrypt_inode_info *ci, goto out_release_key; } + err = fscrypt_select_encryption_impl(ci, mk->mk_secret.is_hw_wrapped); + if (err) + goto out_release_key; + switch (ci->ci_policy.version) { case FSCRYPT_POLICY_V1: - err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.raw); + if (WARN_ON_ONCE(mk->mk_secret.is_hw_wrapped)) { + /* + * This should never happen, as adding a v1 policy key + * that is hardware-wrapped isn't allowed. 
+			 */
+			err = -EINVAL;
+			goto out_release_key;
+		}
+		err = fscrypt_setup_v1_file_key(ci, mk->mk_secret.bytes);
 		break;
 	case FSCRYPT_POLICY_V2:
 		err = fscrypt_setup_v2_file_key(ci, mk, need_dirhash_key);
diff --git a/fs/crypto/keysetup_v1.c b/fs/crypto/keysetup_v1.c
index cf3b58ec32cc..b70521c55132 100644
--- a/fs/crypto/keysetup_v1.c
+++ b/fs/crypto/keysetup_v1.c
@@ -118,7 +118,7 @@ find_and_lock_process_key(const char *prefix,
 	payload = (const struct fscrypt_key *)ukp->data;
 
 	if (ukp->datalen != sizeof(struct fscrypt_key) ||
-	    payload->size < 1 || payload->size > FSCRYPT_MAX_KEY_SIZE) {
+	    payload->size < 1 || payload->size > sizeof(payload->raw)) {
 		fscrypt_warn(NULL,
 			     "key with description '%s' has invalid payload",
 			     key->description);
@@ -149,7 +149,7 @@ struct fscrypt_direct_key {
 	const struct fscrypt_mode	*dk_mode;
 	struct fscrypt_prepared_key	dk_key;
 	u8				dk_descriptor[FSCRYPT_KEY_DESCRIPTOR_SIZE];
-	u8				dk_raw[FSCRYPT_MAX_KEY_SIZE];
+	u8				dk_raw[FSCRYPT_MAX_RAW_KEY_SIZE];
 };
 
 static void free_direct_key(struct fscrypt_direct_key *dk)
diff --git a/fs/dax.c b/fs/dax.c
--- a/fs/dax.c
+++ b/fs/dax.c
@@ -71,6 +71,11 @@ static unsigned long dax_to_pfn(void *entry)
 	return xa_to_value(entry) >> DAX_SHIFT;
 }
 
+static struct folio *dax_to_folio(void *entry)
+{
+	return page_folio(pfn_to_page(dax_to_pfn(entry)));
+}
+
 static void *dax_make_entry(pfn_t pfn, unsigned long flags)
 {
 	return xa_mk_value(flags | (pfn_t_to_pfn(pfn) << DAX_SHIFT));
 }
@@ -206,7 +211,7 @@ static void dax_wake_entry(struct xa_state *xas, void *entry,
 *
 * Must be called with the i_pages lock held.
 */
-static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
+static void *get_next_unlocked_entry(struct xa_state *xas, unsigned int order)
 {
 	void *entry;
 	struct wait_exceptional_entry_queue ewait;
@@ -236,6 +241,37 @@ static void *get_unlocked_entry(struct xa_state *xas, unsigned int order)
 }
 
 /*
+ * Wait for the given entry to become unlocked. Caller must hold the i_pages
+ * lock and call either put_unlocked_entry() if it did not lock the entry or
+ * dax_unlock_entry() if it did. Returns an unlocked entry if still present.
+ */
+static void *wait_entry_unlocked_exclusive(struct xa_state *xas, void *entry)
+{
+	struct wait_exceptional_entry_queue ewait;
+	wait_queue_head_t *wq;
+
+	init_wait(&ewait.wait);
+	ewait.wait.func = wake_exceptional_entry_func;
+
+	while (unlikely(dax_is_locked(entry))) {
+		wq = dax_entry_waitqueue(xas, entry, &ewait.key);
+		prepare_to_wait_exclusive(wq, &ewait.wait,
+					  TASK_UNINTERRUPTIBLE);
+		xas_pause(xas);
+		xas_unlock_irq(xas);
+		schedule();
+		finish_wait(wq, &ewait.wait);
+		xas_lock_irq(xas);
+		entry = xas_load(xas);
+	}
+
+	if (xa_is_internal(entry))
+		return NULL;
+
+	return entry;
+}
+
+/*
 * The only thing keeping the address space around is the i_pages lock
 * (it's cycled in clear_inode() after removing the entries from i_pages)
 * After we call xas_unlock_irq(), we cannot touch xas->xa.
@@ -250,7 +286,7 @@ static void wait_entry_unlocked(struct xa_state *xas, void *entry)
 	wq = dax_entry_waitqueue(xas, entry, &ewait.key);
 
 	/*
-	 * Unlike get_unlocked_entry() there is no guarantee that this
+	 * Unlike get_next_unlocked_entry() there is no guarantee that this
 	 * path ever successfully retrieves an unlocked entry before an
 	 * inode dies. Perform a non-exclusive wait in case this path
 	 * never successfully performs its own wake up.
@@ -307,109 +343,157 @@ static unsigned long dax_entry_size(void *entry) return PAGE_SIZE; } -static unsigned long dax_end_pfn(void *entry) +/* + * A DAX folio is considered shared if it has no mapping set and ->share (which + * shares the ->index field) is non-zero. Note this may return false even if the + * page is shared between multiple files but has not yet actually been mapped + * into multiple address spaces. + */ +static inline bool dax_folio_is_shared(struct folio *folio) { - return dax_to_pfn(entry) + dax_entry_size(entry) / PAGE_SIZE; + return !folio->mapping && folio->share; } /* - * Iterate through all mapped pfns represented by an entry, i.e. skip - * 'empty' and 'zero' entries. + * When it is called by dax_insert_entry(), the shared flag will indicate + * whether this entry is shared by multiple files. If the page has not + * previously been associated with any mappings the ->mapping and ->index + * fields will be set. If it has already been associated with a mapping + * the mapping will be cleared and the share count set. It's then up to + * reverse map users like memory_failure() to call back into the filesystem to + * recover ->mapping and ->index information. For example by implementing + * dax_holder_operations. */ -#define for_each_mapped_pfn(entry, pfn) \ - for (pfn = dax_to_pfn(entry); \ - pfn < dax_end_pfn(entry); pfn++) - -static inline bool dax_page_is_shared(struct page *page) +static void dax_folio_make_shared(struct folio *folio) { - return page->mapping == PAGE_MAPPING_DAX_SHARED; + /* + * folio is not currently shared so mark it as shared by clearing + * folio->mapping. + */ + folio->mapping = NULL; + + /* + * folio has previously been mapped into one address space so set the + * share count. + */ + folio->share = 1; } -/* - * Set the page->mapping with PAGE_MAPPING_DAX_SHARED flag, increase the - * refcount. - */ -static inline void dax_page_share_get(struct page *page) +static inline unsigned long dax_folio_put(struct folio *folio) { - if (page->mapping != PAGE_MAPPING_DAX_SHARED) { + unsigned long ref; + int order, i; + + if (!dax_folio_is_shared(folio)) + ref = 0; + else + ref = --folio->share; + + if (ref) + return ref; + + folio->mapping = NULL; + order = folio_order(folio); + if (!order) + return 0; + folio_reset_order(folio); + + for (i = 0; i < (1UL << order); i++) { + struct dev_pagemap *pgmap = page_pgmap(&folio->page); + struct page *page = folio_page(folio, i); + struct folio *new_folio = (struct folio *)page; + + ClearPageHead(page); + clear_compound_head(page); + + new_folio->mapping = NULL; /* - * Reset the index if the page was already mapped - * regularly before. + * Reset pgmap which was over-written by + * prep_compound_page(). */ - if (page->mapping) - page->share = 1; - page->mapping = PAGE_MAPPING_DAX_SHARED; + new_folio->pgmap = pgmap; + new_folio->share = 0; + WARN_ON_ONCE(folio_ref_count(new_folio)); } - page->share++; + + return ref; } -static inline unsigned long dax_page_share_put(struct page *page) +static void dax_folio_init(void *entry) { - return --page->share; + struct folio *folio = dax_to_folio(entry); + int order = dax_entry_order(entry); + + /* + * Folio should have been split back to order-0 pages in + * dax_folio_put() when they were removed from their + * final mapping. 
+ */ + WARN_ON_ONCE(folio_order(folio)); + + if (order > 0) { + prep_compound_page(&folio->page, order); + if (order > 1) + INIT_LIST_HEAD(&folio->_deferred_list); + WARN_ON_ONCE(folio_ref_count(folio)); + } } -/* - * When it is called in dax_insert_entry(), the shared flag will indicate that - * whether this entry is shared by multiple files. If so, set the page->mapping - * PAGE_MAPPING_DAX_SHARED, and use page->share as refcount. - */ static void dax_associate_entry(void *entry, struct address_space *mapping, - struct vm_area_struct *vma, unsigned long address, bool shared) + struct vm_area_struct *vma, + unsigned long address, bool shared) { - unsigned long size = dax_entry_size(entry), pfn, index; - int i = 0; + unsigned long size = dax_entry_size(entry), index; + struct folio *folio = dax_to_folio(entry); + + if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) + return; if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) return; index = linear_page_index(vma, address & ~(size - 1)); - for_each_mapped_pfn(entry, pfn) { - struct page *page = pfn_to_page(pfn); + if (shared && (folio->mapping || dax_folio_is_shared(folio))) { + if (folio->mapping) + dax_folio_make_shared(folio); - if (shared) { - dax_page_share_get(page); - } else { - WARN_ON_ONCE(page->mapping); - page->mapping = mapping; - page->index = index + i++; - } + WARN_ON_ONCE(!folio->share); + WARN_ON_ONCE(dax_entry_order(entry) != folio_order(folio)); + folio->share++; + } else { + WARN_ON_ONCE(folio->mapping); + dax_folio_init(entry); + folio = dax_to_folio(entry); + folio->mapping = mapping; + folio->index = index; } } static void dax_disassociate_entry(void *entry, struct address_space *mapping, - bool trunc) + bool trunc) { - unsigned long pfn; + struct folio *folio = dax_to_folio(entry); if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) return; - for_each_mapped_pfn(entry, pfn) { - struct page *page = pfn_to_page(pfn); - - WARN_ON_ONCE(trunc && page_ref_count(page) > 1); - if (dax_page_is_shared(page)) { - /* keep the shared flag if this page is still shared */ - if (dax_page_share_put(page) > 0) - continue; - } else - WARN_ON_ONCE(page->mapping && page->mapping != mapping); - page->mapping = NULL; - page->index = 0; - } + if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) + return; + + dax_folio_put(folio); } static struct page *dax_busy_page(void *entry) { - unsigned long pfn; + struct folio *folio = dax_to_folio(entry); - for_each_mapped_pfn(entry, pfn) { - struct page *page = pfn_to_page(pfn); + if (dax_is_zero_entry(entry) || dax_is_empty_entry(entry)) + return NULL; - if (page_ref_count(page) > 1) - return page; - } - return NULL; + if (folio_ref_count(folio) - folio_mapcount(folio)) + return &folio->page; + else + return NULL; } /** @@ -580,7 +664,7 @@ static void *grab_mapping_entry(struct xa_state *xas, retry: pmd_downgrade = false; xas_lock_irq(xas); - entry = get_unlocked_entry(xas, order); + entry = get_next_unlocked_entry(xas, order); if (entry) { if (dax_is_conflict(entry)) @@ -690,7 +774,7 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping, if (IS_ENABLED(CONFIG_FS_DAX_LIMITED)) return NULL; - if (!dax_mapping(mapping) || !mapping_mapped(mapping)) + if (!dax_mapping(mapping)) return NULL; /* If end == LLONG_MAX, all pages from start to till end of file */ @@ -716,8 +800,7 @@ struct page *dax_layout_busy_page_range(struct address_space *mapping, xas_for_each(&xas, entry, end_idx) { if (WARN_ON_ONCE(!xa_is_value(entry))) continue; - if (unlikely(dax_is_locked(entry))) - entry = 
get_unlocked_entry(&xas, 0); + entry = wait_entry_unlocked_exclusive(&xas, entry); if (entry) page = dax_busy_page(entry); put_unlocked_entry(&xas, entry, WAKE_NEXT); @@ -743,14 +826,14 @@ struct page *dax_layout_busy_page(struct address_space *mapping) EXPORT_SYMBOL_GPL(dax_layout_busy_page); static int __dax_invalidate_entry(struct address_space *mapping, - pgoff_t index, bool trunc) + pgoff_t index, bool trunc) { XA_STATE(xas, &mapping->i_pages, index); int ret = 0; void *entry; xas_lock_irq(&xas); - entry = get_unlocked_entry(&xas, 0); + entry = get_next_unlocked_entry(&xas, 0); if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) goto out; if (!trunc && @@ -776,7 +859,9 @@ static int __dax_clear_dirty_range(struct address_space *mapping, xas_lock_irq(&xas); xas_for_each(&xas, entry, end) { - entry = get_unlocked_entry(&xas, 0); + entry = wait_entry_unlocked_exclusive(&xas, entry); + if (!entry) + continue; xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY); xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE); put_unlocked_entry(&xas, entry, WAKE_NEXT); @@ -813,6 +898,107 @@ int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index) return ret; } +void dax_delete_mapping_range(struct address_space *mapping, + loff_t start, loff_t end) +{ + void *entry; + pgoff_t start_idx = start >> PAGE_SHIFT; + pgoff_t end_idx; + XA_STATE(xas, &mapping->i_pages, start_idx); + + /* If end == LLONG_MAX, all pages from start to till end of file */ + if (end == LLONG_MAX) + end_idx = ULONG_MAX; + else + end_idx = end >> PAGE_SHIFT; + + xas_lock_irq(&xas); + xas_for_each(&xas, entry, end_idx) { + if (!xa_is_value(entry)) + continue; + entry = wait_entry_unlocked_exclusive(&xas, entry); + if (!entry) + continue; + dax_disassociate_entry(entry, mapping, true); + xas_store(&xas, NULL); + mapping->nrpages -= 1UL << dax_entry_order(entry); + put_unlocked_entry(&xas, entry, WAKE_ALL); + } + xas_unlock_irq(&xas); +} +EXPORT_SYMBOL_GPL(dax_delete_mapping_range); + +static int wait_page_idle(struct page *page, + void (cb)(struct inode *), + struct inode *inode) +{ + return ___wait_var_event(page, dax_page_is_idle(page), + TASK_INTERRUPTIBLE, 0, 0, cb(inode)); +} + +static void wait_page_idle_uninterruptible(struct page *page, + struct inode *inode) +{ + ___wait_var_event(page, dax_page_is_idle(page), + TASK_UNINTERRUPTIBLE, 0, 0, schedule()); +} + +/* + * Unmaps the inode and waits for any DMA to complete prior to deleting the + * DAX mapping entries for the range. 
+ * + * For NOWAIT behavior, pass @cb as NULL to early-exit on first found + * busy page + */ +int dax_break_layout(struct inode *inode, loff_t start, loff_t end, + void (cb)(struct inode *)) +{ + struct page *page; + int error = 0; + + if (!dax_mapping(inode->i_mapping)) + return 0; + + do { + page = dax_layout_busy_page_range(inode->i_mapping, start, end); + if (!page) + break; + if (!cb) { + error = -ERESTARTSYS; + break; + } + + error = wait_page_idle(page, cb, inode); + } while (error == 0); + + if (!page) + dax_delete_mapping_range(inode->i_mapping, start, end); + + return error; +} +EXPORT_SYMBOL_GPL(dax_break_layout); + +void dax_break_layout_final(struct inode *inode) +{ + struct page *page; + + if (!dax_mapping(inode->i_mapping)) + return; + + do { + page = dax_layout_busy_page_range(inode->i_mapping, 0, + LLONG_MAX); + if (!page) + break; + + wait_page_idle_uninterruptible(page, inode); + } while (true); + + if (!page) + dax_delete_mapping_range(inode->i_mapping, 0, LLONG_MAX); +} +EXPORT_SYMBOL_GPL(dax_break_layout_final); + /* * Invalidate DAX entry if it is clean. */ @@ -895,8 +1081,9 @@ static void *dax_insert_entry(struct xa_state *xas, struct vm_fault *vmf, void *old; dax_disassociate_entry(entry, mapping, false); - dax_associate_entry(new_entry, mapping, vmf->vma, vmf->address, - shared); + dax_associate_entry(new_entry, mapping, vmf->vma, + vmf->address, shared); + /* * Only swap our new entry into the page cache if the current * entry is a zero page or an empty entry. If a normal PTE or @@ -940,7 +1127,7 @@ static int dax_writeback_one(struct xa_state *xas, struct dax_device *dax_dev, if (unlikely(dax_is_locked(entry))) { void *old_entry = entry; - entry = get_unlocked_entry(xas, 0); + entry = get_next_unlocked_entry(xas, 0); /* Entry got punched out / reallocated? 
*/ if (!entry || WARN_ON_ONCE(!xa_is_value(entry))) @@ -1084,9 +1271,7 @@ static int dax_iomap_direct_access(const struct iomap *iomap, loff_t pos, goto out; if (pfn_t_to_pfn(*pfnp) & (PHYS_PFN(size)-1)) goto out; - /* For larger pages we need devmap */ - if (length > 1 && !pfn_t_devmap(*pfnp)) - goto out; + rc = 0; out_check_addr: @@ -1193,7 +1378,7 @@ static vm_fault_t dax_load_hole(struct xa_state *xas, struct vm_fault *vmf, *entry = dax_insert_entry(xas, vmf, iter, *entry, pfn, DAX_ZERO_PAGE); - ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); + ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), false); trace_dax_load_hole(inode, vmf, ret); return ret; } @@ -1258,7 +1443,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, } #endif /* CONFIG_FS_DAX_PMD */ -static s64 dax_unshare_iter(struct iomap_iter *iter) +static int dax_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); @@ -1266,11 +1451,11 @@ static s64 dax_unshare_iter(struct iomap_iter *iter) u64 copy_len = iomap_length(iter); u32 mod; int id = 0; - s64 ret = 0; + s64 ret; void *daddr = NULL, *saddr = NULL; if (!iomap_want_unshare_iter(iter)) - return iomap_length(iter); + return iomap_iter_advance_full(iter); /* * Extend the file range to be aligned to fsblock/pagesize, because @@ -1300,14 +1485,14 @@ static s64 dax_unshare_iter(struct iomap_iter *iter) if (ret < 0) goto out_unlock; - if (copy_mc_to_kernel(daddr, saddr, copy_len) == 0) - ret = iomap_length(iter); - else + if (copy_mc_to_kernel(daddr, saddr, copy_len) != 0) ret = -EIO; out_unlock: dax_read_unlock(id); - return dax_mem2blk_err(ret); + if (ret < 0) + return dax_mem2blk_err(ret); + return iomap_iter_advance_full(iter); } int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, @@ -1326,7 +1511,7 @@ int dax_file_unshare(struct inode *inode, loff_t pos, loff_t len, iter.len = min(len, size - pos); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = dax_unshare_iter(&iter); + iter.status = dax_unshare_iter(&iter); return ret; } EXPORT_SYMBOL_GPL(dax_file_unshare); @@ -1354,17 +1539,16 @@ static int dax_memzero(struct iomap_iter *iter, loff_t pos, size_t size) return ret; } -static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero) +static int dax_zero_iter(struct iomap_iter *iter, bool *did_zero) { const struct iomap *iomap = &iter->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iter); - loff_t pos = iter->pos; u64 length = iomap_length(iter); - s64 written = 0; + int ret; /* already zeroed? we're done. 
*/ if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) - return length; + return iomap_iter_advance(iter, &length); /* * invalidate the pages whose sharing state is to be changed @@ -1372,33 +1556,35 @@ static s64 dax_zero_iter(struct iomap_iter *iter, bool *did_zero) */ if (iomap->flags & IOMAP_F_SHARED) invalidate_inode_pages2_range(iter->inode->i_mapping, - pos >> PAGE_SHIFT, - (pos + length - 1) >> PAGE_SHIFT); + iter->pos >> PAGE_SHIFT, + (iter->pos + length - 1) >> PAGE_SHIFT); do { + loff_t pos = iter->pos; unsigned offset = offset_in_page(pos); - unsigned size = min_t(u64, PAGE_SIZE - offset, length); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); - long rc; int id; + length = min_t(u64, PAGE_SIZE - offset, length); + id = dax_read_lock(); - if (IS_ALIGNED(pos, PAGE_SIZE) && size == PAGE_SIZE) - rc = dax_zero_page_range(iomap->dax_dev, pgoff, 1); + if (IS_ALIGNED(pos, PAGE_SIZE) && length == PAGE_SIZE) + ret = dax_zero_page_range(iomap->dax_dev, pgoff, 1); else - rc = dax_memzero(iter, pos, size); + ret = dax_memzero(iter, pos, length); dax_read_unlock(id); - if (rc < 0) - return rc; - pos += size; - length -= size; - written += size; + if (ret < 0) + return ret; + + ret = iomap_iter_advance(iter, &length); + if (ret) + return ret; } while (length > 0); if (did_zero) *did_zero = true; - return written; + return ret; } int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, @@ -1413,7 +1599,7 @@ int dax_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, int ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = dax_zero_iter(&iter, did_zero); + iter.status = dax_zero_iter(&iter, did_zero); return ret; } EXPORT_SYMBOL_GPL(dax_zero_range); @@ -1431,8 +1617,7 @@ int dax_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, } EXPORT_SYMBOL_GPL(dax_truncate_page); -static loff_t dax_iomap_iter(const struct iomap_iter *iomi, - struct iov_iter *iter) +static int dax_iomap_iter(struct iomap_iter *iomi, struct iov_iter *iter) { const struct iomap *iomap = &iomi->iomap; const struct iomap *srcmap = iomap_iter_srcmap(iomi); @@ -1451,8 +1636,10 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, if (pos >= end) return 0; - if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) - return iov_iter_zero(min(length, end - pos), iter); + if (iomap->type == IOMAP_HOLE || iomap->type == IOMAP_UNWRITTEN) { + done = iov_iter_zero(min(length, end - pos), iter); + return iomap_iter_advance(iomi, &done); + } } /* @@ -1485,7 +1672,7 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, } id = dax_read_lock(); - while (pos < end) { + while ((pos = iomi->pos) < end) { unsigned offset = pos & (PAGE_SIZE - 1); const size_t size = ALIGN(length + offset, PAGE_SIZE); pgoff_t pgoff = dax_iomap_pgoff(iomap, pos); @@ -1535,18 +1722,16 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi, xfer = dax_copy_to_iter(dax_dev, pgoff, kaddr, map_len, iter); - pos += xfer; - length -= xfer; - done += xfer; - - if (xfer == 0) + length = xfer; + ret = iomap_iter_advance(iomi, &length); + if (!ret && xfer == 0) ret = -EFAULT; if (xfer < map_len) break; } dax_read_unlock(id); - return done ? 
done : ret; + return ret; } /** @@ -1586,7 +1771,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, iomi.flags |= IOMAP_NOWAIT; while ((ret = iomap_iter(&iomi, ops)) > 0) - iomi.processed = dax_iomap_iter(&iomi, iter); + iomi.status = dax_iomap_iter(&iomi, iter); done = iomi.pos - iocb->ki_pos; iocb->ki_pos = iomi.pos; @@ -1664,7 +1849,8 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, loff_t pos = (loff_t)xas->xa_index << PAGE_SHIFT; bool write = iter->flags & IOMAP_WRITE; unsigned long entry_flags = pmd ? DAX_PMD : 0; - int err = 0; + struct folio *folio; + int ret, err = 0; pfn_t pfn; void *kaddr; @@ -1696,17 +1882,19 @@ static vm_fault_t dax_fault_iter(struct vm_fault *vmf, return dax_fault_return(err); } + folio = dax_to_folio(*entry); if (dax_fault_is_synchronous(iter, vmf->vma)) return dax_fault_synchronous_pfnp(pfnp, pfn); - /* insert PMD pfn */ + folio_ref_inc(folio); if (pmd) - return vmf_insert_pfn_pmd(vmf, pfn, write); + ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn_t_to_pfn(pfn)), + write); + else + ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), write); + folio_put(folio); - /* insert PTE pfn */ - if (write) - return vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); - return vmf_insert_mixed(vmf->vma, vmf->address, pfn); + return ret; } static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, @@ -1757,7 +1945,7 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, while ((error = iomap_iter(&iter, ops)) > 0) { if (WARN_ON_ONCE(iomap_length(&iter) < PAGE_SIZE)) { - iter.processed = -EIO; /* fs corruption? */ + iter.status = -EIO; /* fs corruption? */ continue; } @@ -1769,8 +1957,10 @@ static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, ret |= VM_FAULT_MAJOR; } - if (!(ret & VM_FAULT_ERROR)) - iter.processed = PAGE_SIZE; + if (!(ret & VM_FAULT_ERROR)) { + u64 length = PAGE_SIZE; + iter.status = iomap_iter_advance(&iter, &length); + } } if (iomap_errp) @@ -1883,8 +2073,10 @@ static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, continue; /* actually breaks out of the loop */ ret = dax_fault_iter(vmf, &iter, pfnp, &xas, &entry, true); - if (ret != VM_FAULT_FALLBACK) - iter.processed = PMD_SIZE; + if (ret != VM_FAULT_FALLBACK) { + u64 length = PMD_SIZE; + iter.status = iomap_iter_advance(&iter, &length); + } } unlock_entry: @@ -1945,11 +2137,12 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; XA_STATE_ORDER(xas, &mapping->i_pages, vmf->pgoff, order); + struct folio *folio; void *entry; vm_fault_t ret; xas_lock_irq(&xas); - entry = get_unlocked_entry(&xas, order); + entry = get_next_unlocked_entry(&xas, order); /* Did we race with someone splitting entry or so? 
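Note the reference handling in dax_fault_iter() above: instead of inserting raw pfns with vmf_insert_mixed*(), DAX now inserts the backing folio and pins it for the duration of the insertion. Condensed from the hunk (entry locking and the synchronous-fault path omitted):

	struct folio *folio = dax_to_folio(*entry);

	folio_ref_inc(folio);	/* pin the folio across the insertion */
	if (pmd)
		ret = vmf_insert_folio_pmd(vmf, pfn_folio(pfn_t_to_pfn(pfn)), write);
	else
		ret = vmf_insert_page_mkwrite(vmf, pfn_t_to_page(pfn), write);
	folio_put(folio);	/* drop the temporary pin */
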
*/ if (!entry || dax_is_conflict(entry) || (order == 0 && !dax_is_pte_entry(entry))) { @@ -1962,14 +2155,17 @@ dax_insert_pfn_mkwrite(struct vm_fault *vmf, pfn_t pfn, unsigned int order) xas_set_mark(&xas, PAGECACHE_TAG_DIRTY); dax_lock_entry(&xas, entry); xas_unlock_irq(&xas); + folio = pfn_folio(pfn_t_to_pfn(pfn)); + folio_ref_inc(folio); if (order == 0) - ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); + ret = vmf_insert_page_mkwrite(vmf, &folio->page, true); #ifdef CONFIG_FS_DAX_PMD else if (order == PMD_ORDER) - ret = vmf_insert_pfn_pmd(vmf, pfn, FAULT_FLAG_WRITE); + ret = vmf_insert_folio_pmd(vmf, folio, FAULT_FLAG_WRITE); #endif else ret = VM_FAULT_FALLBACK; + folio_put(folio); dax_unlock_entry(&xas, entry); trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret); return ret; @@ -1999,12 +2195,13 @@ vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, unsigned int order, } EXPORT_SYMBOL_GPL(dax_finish_sync_fault); -static loff_t dax_range_compare_iter(struct iomap_iter *it_src, +static int dax_range_compare_iter(struct iomap_iter *it_src, struct iomap_iter *it_dest, u64 len, bool *same) { const struct iomap *smap = &it_src->iomap; const struct iomap *dmap = &it_dest->iomap; loff_t pos1 = it_src->pos, pos2 = it_dest->pos; + u64 dest_len; void *saddr, *daddr; int id, ret; @@ -2012,7 +2209,7 @@ static loff_t dax_range_compare_iter(struct iomap_iter *it_src, if (smap->type == IOMAP_HOLE && dmap->type == IOMAP_HOLE) { *same = true; - return len; + goto advance; } if (smap->type == IOMAP_HOLE || dmap->type == IOMAP_HOLE) { @@ -2035,7 +2232,13 @@ static loff_t dax_range_compare_iter(struct iomap_iter *it_src, if (!*same) len = 0; dax_read_unlock(id); - return len; + +advance: + dest_len = len; + ret = iomap_iter_advance(it_src, &len); + if (!ret) + ret = iomap_iter_advance(it_dest, &dest_len); + return ret; out_unlock: dax_read_unlock(id); @@ -2058,15 +2261,15 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff, .len = len, .flags = IOMAP_DAX, }; - int ret, compared = 0; + int ret, status; while ((ret = iomap_iter(&src_iter, ops)) > 0 && (ret = iomap_iter(&dst_iter, ops)) > 0) { - compared = dax_range_compare_iter(&src_iter, &dst_iter, + status = dax_range_compare_iter(&src_iter, &dst_iter, min(src_iter.len, dst_iter.len), same); - if (compared < 0) + if (status < 0) return ret; - src_iter.processed = dst_iter.processed = compared; + src_iter.status = dst_iter.status = status; } return ret; } diff --git a/fs/dcache.c b/fs/dcache.c index e3634916ffb9..03d58b2d4fa3 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -73,8 +73,14 @@ * If no ancestor relationship: * arbitrary, since it's serialized on rename_lock */ -int sysctl_vfs_cache_pressure __read_mostly = 100; -EXPORT_SYMBOL_GPL(sysctl_vfs_cache_pressure); +static int sysctl_vfs_cache_pressure __read_mostly = 100; +static int sysctl_vfs_cache_pressure_denom __read_mostly = 100; + +unsigned long vfs_pressure_ratio(unsigned long val) +{ + return mult_frac(val, sysctl_vfs_cache_pressure, sysctl_vfs_cache_pressure_denom); +} +EXPORT_SYMBOL_GPL(vfs_pressure_ratio); __cacheline_aligned_in_smp DEFINE_SEQLOCK(rename_lock); @@ -211,8 +217,28 @@ static const struct ctl_table fs_dcache_sysctls[] = { }, }; +static const struct ctl_table vm_dcache_sysctls[] = { + { + .procname = "vfs_cache_pressure", + .data = &sysctl_vfs_cache_pressure, + .maxlen = sizeof(sysctl_vfs_cache_pressure), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ZERO, + }, + { + .procname = "vfs_cache_pressure_denom", + .data = 
&sysctl_vfs_cache_pressure_denom, + .maxlen = sizeof(sysctl_vfs_cache_pressure_denom), + .mode = 0644, + .proc_handler = proc_dointvec_minmax, + .extra1 = SYSCTL_ONE_HUNDRED, + }, +}; + static int __init init_fs_dcache_sysctls(void) { + register_sysctl_init("vm", vm_dcache_sysctls); register_sysctl_init("fs", fs_dcache_sysctls); return 0; } @@ -2395,7 +2421,6 @@ struct dentry *d_hash_and_lookup(struct dentry *dir, struct qstr *name) } return d_lookup(dir, name); } -EXPORT_SYMBOL(d_hash_and_lookup); /* * When a file is deleted, we have two options: @@ -2480,7 +2505,8 @@ static inline void end_dir_add(struct inode *dir, unsigned int n, { smp_store_release(&dir->i_dir_seq, n + 2); preempt_enable_nested(); - wake_up_all(d_wait); + if (wq_has_sleeper(d_wait)) + wake_up_all(d_wait); } static void d_wait_lookup(struct dentry *dentry) @@ -2687,52 +2713,6 @@ void d_add(struct dentry *entry, struct inode *inode) } EXPORT_SYMBOL(d_add); -/** - * d_exact_alias - find and hash an exact unhashed alias - * @entry: dentry to add - * @inode: The inode to go with this dentry - * - * If an unhashed dentry with the same name/parent and desired - * inode already exists, hash and return it. Otherwise, return - * NULL. - * - * Parent directory should be locked. - */ -struct dentry *d_exact_alias(struct dentry *entry, struct inode *inode) -{ - struct dentry *alias; - unsigned int hash = entry->d_name.hash; - - spin_lock(&inode->i_lock); - hlist_for_each_entry(alias, &inode->i_dentry, d_u.d_alias) { - /* - * Don't need alias->d_lock here, because aliases with - * d_parent == entry->d_parent are not subject to name or - * parent changes, because the parent inode i_mutex is held. - */ - if (alias->d_name.hash != hash) - continue; - if (alias->d_parent != entry->d_parent) - continue; - if (!d_same_name(alias, entry->d_parent, &entry->d_name)) - continue; - spin_lock(&alias->d_lock); - if (!d_unhashed(alias)) { - spin_unlock(&alias->d_lock); - alias = NULL; - } else { - dget_dlock(alias); - __d_rehash(alias); - spin_unlock(&alias->d_lock); - } - spin_unlock(&inode->i_lock); - return alias; - } - spin_unlock(&inode->i_lock); - return NULL; -} -EXPORT_SYMBOL(d_exact_alias); - static void swap_names(struct dentry *dentry, struct dentry *target) { if (unlikely(dname_external(target))) { diff --git a/fs/debugfs/inode.c b/fs/debugfs/inode.c index 75715d8877ee..30c4944e1862 100644 --- a/fs/debugfs/inode.c +++ b/fs/debugfs/inode.c @@ -346,7 +346,7 @@ struct dentry *debugfs_lookup(const char *name, struct dentry *parent) if (!parent) parent = debugfs_mount->mnt_root; - dentry = lookup_positive_unlocked(name, parent, strlen(name)); + dentry = lookup_noperm_positive_unlocked(&QSTR(name), parent); if (IS_ERR(dentry)) return NULL; return dentry; @@ -388,7 +388,7 @@ static struct dentry *start_creating(const char *name, struct dentry *parent) if (unlikely(IS_DEADDIR(d_inode(parent)))) dentry = ERR_PTR(-ENOENT); else - dentry = lookup_one_len(name, parent, strlen(name)); + dentry = lookup_noperm(&QSTR(name), parent); if (!IS_ERR(dentry) && d_really_is_positive(dentry)) { if (d_is_dir(dentry)) pr_err("Directory '%s' with parent '%s' already present!\n", @@ -872,7 +872,7 @@ int __printf(2, 3) debugfs_change_name(struct dentry *dentry, const char *fmt, . 
} if (strcmp(old_name.name.name, new_name) == 0) goto out; - target = lookup_one_len(new_name, parent, strlen(new_name)); + target = lookup_noperm(&QSTR(new_name), parent); if (IS_ERR(target)) { error = PTR_ERR(target); goto out; diff --git a/fs/devpts/inode.c b/fs/devpts/inode.c index 1096ff8562fa..9c20d78e41f6 100644 --- a/fs/devpts/inode.c +++ b/fs/devpts/inode.c @@ -12,6 +12,8 @@ #include <linux/module.h> #include <linux/init.h> #include <linux/fs.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/sched.h> #include <linux/namei.h> #include <linux/slab.h> @@ -21,7 +23,6 @@ #include <linux/magic.h> #include <linux/idr.h> #include <linux/devpts_fs.h> -#include <linux/parser.h> #include <linux/fsnotify.h> #include <linux/seq_file.h> @@ -87,14 +88,14 @@ enum { Opt_err }; -static const match_table_t tokens = { - {Opt_uid, "uid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_mode, "mode=%o"}, - {Opt_ptmxmode, "ptmxmode=%o"}, - {Opt_newinstance, "newinstance"}, - {Opt_max, "max=%d"}, - {Opt_err, NULL} +static const struct fs_parameter_spec devpts_param_specs[] = { + fsparam_gid ("gid", Opt_gid), + fsparam_s32 ("max", Opt_max), + fsparam_u32oct ("mode", Opt_mode), + fsparam_flag ("newinstance", Opt_newinstance), + fsparam_u32oct ("ptmxmode", Opt_ptmxmode), + fsparam_uid ("uid", Opt_uid), + {} }; struct pts_fs_info { @@ -214,93 +215,48 @@ void devpts_release(struct pts_fs_info *fsi) deactivate_super(fsi->sb); } -#define PARSE_MOUNT 0 -#define PARSE_REMOUNT 1 - /* - * parse_mount_options(): - * Set @opts to mount options specified in @data. If an option is not - * specified in @data, set it to its default value. - * - * Note: @data may be NULL (in which case all options are set to default). + * devpts_parse_param - Parse mount parameters */ -static int parse_mount_options(char *data, int op, struct pts_mount_opts *opts) +static int devpts_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - kuid_t uid; - kgid_t gid; - - opts->setuid = 0; - opts->setgid = 0; - opts->uid = GLOBAL_ROOT_UID; - opts->gid = GLOBAL_ROOT_GID; - opts->mode = DEVPTS_DEFAULT_MODE; - opts->ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; - opts->max = NR_UNIX98_PTY_MAX; - - /* Only allow instances mounted from the initial mount - * namespace to tap the reserve pool of ptys. 
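The fs_parameter_spec table above declares the keys that devpts_parse_param() below receives pre-parsed (uid/gid already validated, octal modes already converted) instead of tokenizing mount-option strings itself. For context, a hypothetical userspace sketch of how such parameters reach the parser through the new mount API; fsopen(2), fsconfig(2), fsmount(2) and move_mount(2) are real syscalls, but the option values, the /tmp/pts target and the raw syscall() spelling (for libcs without wrappers) are illustrative only:

#include <fcntl.h>		/* AT_FDCWD */
#include <linux/mount.h>	/* FSOPEN_CLOEXEC, FSCONFIG_*, FSMOUNT_CLOEXEC, MOVE_MOUNT_F_EMPTY_PATH */
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
	int fsfd, mfd;

	fsfd = syscall(SYS_fsopen, "devpts", FSOPEN_CLOEXEC);
	if (fsfd < 0) { perror("fsopen"); return 1; }

	/* each key/value pair lands in devpts_parse_param() via fs_parse() */
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FLAG, "newinstance", NULL, 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "mode", "0620", 0);
	syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "max", "256", 0);
	if (syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0) < 0) {
		perror("fsconfig"); return 1;
	}

	mfd = syscall(SYS_fsmount, fsfd, FSMOUNT_CLOEXEC, 0);
	if (mfd < 0) { perror("fsmount"); return 1; }

	/* attach it; /tmp/pts must exist and CAP_SYS_ADMIN is required */
	if (syscall(SYS_move_mount, mfd, "", AT_FDCWD, "/tmp/pts",
		    MOVE_MOUNT_F_EMPTY_PATH) < 0) {
		perror("move_mount"); return 1;
	}
	return 0;
}
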
- */ - if (op == PARSE_MOUNT) - opts->reserve = - (current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns); - - while ((p = strsep(&data, ",")) != NULL) { - substring_t args[MAX_OPT_ARGS]; - int token; - int option; - - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_uid: - if (match_int(&args[0], &option)) - return -EINVAL; - uid = make_kuid(current_user_ns(), option); - if (!uid_valid(uid)) - return -EINVAL; - opts->uid = uid; - opts->setuid = 1; - break; - case Opt_gid: - if (match_int(&args[0], &option)) - return -EINVAL; - gid = make_kgid(current_user_ns(), option); - if (!gid_valid(gid)) - return -EINVAL; - opts->gid = gid; - opts->setgid = 1; - break; - case Opt_mode: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->mode = option & S_IALLUGO; - break; - case Opt_ptmxmode: - if (match_octal(&args[0], &option)) - return -EINVAL; - opts->ptmxmode = option & S_IALLUGO; - break; - case Opt_newinstance: - break; - case Opt_max: - if (match_int(&args[0], &option) || - option < 0 || option > NR_UNIX98_PTY_MAX) - return -EINVAL; - opts->max = option; - break; - default: - pr_err("called with bogus options\n"); - return -EINVAL; - } + struct pts_fs_info *fsi = fc->s_fs_info; + struct pts_mount_opts *opts = &fsi->mount_opts; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, devpts_param_specs, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_uid: + opts->uid = result.uid; + opts->setuid = 1; + break; + case Opt_gid: + opts->gid = result.gid; + opts->setgid = 1; + break; + case Opt_mode: + opts->mode = result.uint_32 & S_IALLUGO; + break; + case Opt_ptmxmode: + opts->ptmxmode = result.uint_32 & S_IALLUGO; + break; + case Opt_newinstance: + break; + case Opt_max: + if (result.uint_32 > NR_UNIX98_PTY_MAX) + return invalf(fc, "max out of range"); + opts->max = result.uint_32; + break; } return 0; } -static int mknod_ptmx(struct super_block *sb) +static int mknod_ptmx(struct super_block *sb, struct fs_context *fc) { int mode; int rc = -ENOMEM; @@ -362,13 +318,23 @@ static void update_ptmx_mode(struct pts_fs_info *fsi) } } -static int devpts_remount(struct super_block *sb, int *flags, char *data) +static int devpts_reconfigure(struct fs_context *fc) { - int err; - struct pts_fs_info *fsi = DEVPTS_SB(sb); - struct pts_mount_opts *opts = &fsi->mount_opts; + struct pts_fs_info *fsi = DEVPTS_SB(fc->root->d_sb); + struct pts_fs_info *new = fc->s_fs_info; - err = parse_mount_options(data, PARSE_REMOUNT, opts); + /* Apply the revised options. We don't want to change ->reserve. + * Ideally, we'd update each option conditionally on it having been + * explicitly changed, but the default is to reset everything so that + * would break UAPI... 
+ */ + fsi->mount_opts.setuid = new->mount_opts.setuid; + fsi->mount_opts.setgid = new->mount_opts.setgid; + fsi->mount_opts.uid = new->mount_opts.uid; + fsi->mount_opts.gid = new->mount_opts.gid; + fsi->mount_opts.mode = new->mount_opts.mode; + fsi->mount_opts.ptmxmode = new->mount_opts.ptmxmode; + fsi->mount_opts.max = new->mount_opts.max; /* * parse_mount_options() restores options to default values @@ -378,7 +344,7 @@ static int devpts_remount(struct super_block *sb, int *flags, char *data) */ update_ptmx_mode(fsi); - return err; + return 0; } static int devpts_show_options(struct seq_file *seq, struct dentry *root) @@ -402,31 +368,13 @@ static int devpts_show_options(struct seq_file *seq, struct dentry *root) static const struct super_operations devpts_sops = { .statfs = simple_statfs, - .remount_fs = devpts_remount, .show_options = devpts_show_options, }; -static void *new_pts_fs_info(struct super_block *sb) -{ - struct pts_fs_info *fsi; - - fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL); - if (!fsi) - return NULL; - - ida_init(&fsi->allocated_ptys); - fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE; - fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; - fsi->sb = sb; - - return fsi; -} - -static int -devpts_fill_super(struct super_block *s, void *data, int silent) +static int devpts_fill_super(struct super_block *s, struct fs_context *fc) { + struct pts_fs_info *fsi = DEVPTS_SB(s); struct inode *inode; - int error; s->s_iflags &= ~SB_I_NODEV; s->s_blocksize = 1024; @@ -435,20 +383,11 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_op = &devpts_sops; s->s_d_op = &simple_dentry_operations; s->s_time_gran = 1; + fsi->sb = s; - error = -ENOMEM; - s->s_fs_info = new_pts_fs_info(s); - if (!s->s_fs_info) - goto fail; - - error = parse_mount_options(data, PARSE_MOUNT, &DEVPTS_SB(s)->mount_opts); - if (error) - goto fail; - - error = -ENOMEM; inode = new_inode(s); if (!inode) - goto fail; + return -ENOMEM; inode->i_ino = 1; simple_inode_init_ts(inode); inode->i_mode = S_IFDIR | S_IRUGO | S_IXUGO | S_IWUSR; @@ -459,31 +398,60 @@ devpts_fill_super(struct super_block *s, void *data, int silent) s->s_root = d_make_root(inode); if (!s->s_root) { pr_err("get root dentry failed\n"); - goto fail; + return -ENOMEM; } - error = mknod_ptmx(s); - if (error) - goto fail_dput; - - return 0; -fail_dput: - dput(s->s_root); - s->s_root = NULL; -fail: - return error; + return mknod_ptmx(s, fc); } /* - * devpts_mount() + * devpts_get_tree() * * Mount a new (private) instance of devpts. PTYs created in this * instance are independent of the PTYs in other devpts instances. */ -static struct dentry *devpts_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int devpts_get_tree(struct fs_context *fc) +{ + return get_tree_nodev(fc, devpts_fill_super); +} + +static void devpts_free_fc(struct fs_context *fc) +{ + kfree(fc->s_fs_info); +} + +static const struct fs_context_operations devpts_context_ops = { + .free = devpts_free_fc, + .parse_param = devpts_parse_param, + .get_tree = devpts_get_tree, + .reconfigure = devpts_reconfigure, +}; + +/* + * Set up the filesystem mount context. 
+ */ +static int devpts_init_fs_context(struct fs_context *fc) { - return mount_nodev(fs_type, flags, data, devpts_fill_super); + struct pts_fs_info *fsi; + + fsi = kzalloc(sizeof(struct pts_fs_info), GFP_KERNEL); + if (!fsi) + return -ENOMEM; + + ida_init(&fsi->allocated_ptys); + fsi->mount_opts.uid = GLOBAL_ROOT_UID; + fsi->mount_opts.gid = GLOBAL_ROOT_GID; + fsi->mount_opts.mode = DEVPTS_DEFAULT_MODE; + fsi->mount_opts.ptmxmode = DEVPTS_DEFAULT_PTMX_MODE; + fsi->mount_opts.max = NR_UNIX98_PTY_MAX; + + if (fc->purpose == FS_CONTEXT_FOR_MOUNT && + current->nsproxy->mnt_ns == init_task.nsproxy->mnt_ns) + fsi->mount_opts.reserve = true; + + fc->s_fs_info = fsi; + fc->ops = &devpts_context_ops; + return 0; } static void devpts_kill_sb(struct super_block *sb) @@ -498,7 +466,8 @@ static void devpts_kill_sb(struct super_block *sb) static struct file_system_type devpts_fs_type = { .name = "devpts", - .mount = devpts_mount, + .init_fs_context = devpts_init_fs_context, + .parameters = devpts_param_specs, .kill_sb = devpts_kill_sb, .fs_flags = FS_USERNS_MOUNT, }; diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index f82a4952769d..b46165df5a91 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -3,7 +3,6 @@ menuconfig DLM tristate "Distributed Lock Manager (DLM)" depends on INET depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) - select IP_SCTP help A general purpose distributed lock manager for kernel or userspace applications. diff --git a/fs/dlm/config.c b/fs/dlm/config.c index cf9ba6fd7a28..a23fd524a6ee 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -197,6 +197,9 @@ static int dlm_check_protocol_and_dlm_running(unsigned int x) break; case 1: /* SCTP */ + if (!IS_ENABLED(CONFIG_IP_SCTP)) + return -EOPNOTSUPP; + break; default: return -EINVAL; diff --git a/fs/dlm/config.h b/fs/dlm/config.h index e48c4f9686d3..13a3d0b26194 100644 --- a/fs/dlm/config.h +++ b/fs/dlm/config.h @@ -23,7 +23,7 @@ struct dlm_config_node { extern const struct rhashtable_params dlm_rhash_rsb_params; -#define DLM_MAX_ADDR_COUNT 3 +#define DLM_MAX_ADDR_COUNT 8 #define DLM_PROTO_TCP 0 #define DLM_PROTO_SCTP 1 diff --git a/fs/dlm/lock.c b/fs/dlm/lock.c index c8ff88f1cdcf..e01d5f29f4d2 100644 --- a/fs/dlm/lock.c +++ b/fs/dlm/lock.c @@ -741,6 +741,7 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len, read_lock_bh(&ls->ls_rsbtbl_lock); if (!rsb_flag(r, RSB_HASHED)) { read_unlock_bh(&ls->ls_rsbtbl_lock); + error = -EBADR; goto do_new; } @@ -784,6 +785,7 @@ static int find_rsb_dir(struct dlm_ls *ls, const void *name, int len, } } else { write_unlock_bh(&ls->ls_rsbtbl_lock); + error = -EBADR; goto do_new; } diff --git a/fs/dlm/lockspace.c b/fs/dlm/lockspace.c index 8afac6e2dff0..1929327ffbe1 100644 --- a/fs/dlm/lockspace.c +++ b/fs/dlm/lockspace.c @@ -576,7 +576,7 @@ static int new_lockspace(const char *name, const char *cluster, lockspace to start running (via sysfs) in dlm_ls_start(). */ error = do_uevent(ls, 1); - if (error) + if (error < 0) goto out_recoverd; /* wait until recovery is successful or failed */ diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index d28141829c05..e4373bce1bc2 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -160,6 +160,7 @@ struct dlm_proto_ops { bool try_new_addr; const char *name; int proto; + int how; void (*sockopts)(struct socket *sock); int (*bind)(struct socket *sock); @@ -533,7 +534,7 @@ static void lowcomms_state_change(struct sock *sk) /* SCTP layer is not calling sk_data_ready when the connection * is done, so we catch the signal through here. 
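One detail worth calling out in the lowcomms hunk that follows: sk->sk_shutdown is a bit field, and RCV_SHUTDOWN can be set together with SEND_SHUTDOWN once both directions are shut down, so the old equality test silently stopped matching in that combined state. The fix is the usual mask test:

	/* old: only fires when exactly RCV_SHUTDOWN is set */
	if (sk->sk_shutdown == RCV_SHUTDOWN)
		lowcomms_data_ready(sk);

	/* new: also fires when sk_shutdown == (RCV_SHUTDOWN | SEND_SHUTDOWN) */
	if (sk->sk_shutdown & RCV_SHUTDOWN)
		lowcomms_data_ready(sk);
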
*/ - if (sk->sk_shutdown == RCV_SHUTDOWN) + if (sk->sk_shutdown & RCV_SHUTDOWN) lowcomms_data_ready(sk); } @@ -810,7 +811,7 @@ static void shutdown_connection(struct connection *con, bool and_other) return; } - ret = kernel_sock_shutdown(con->sock, SHUT_WR); + ret = kernel_sock_shutdown(con->sock, dlm_proto_ops->how); up_read(&con->sock_lock); if (ret) { log_print("Connection %p failed to shutdown: %d will force close", @@ -1826,8 +1827,8 @@ static int dlm_tcp_listen_validate(void) { /* We don't support multi-homed hosts */ if (dlm_local_count > 1) { - log_print("TCP protocol can't handle multi-homed hosts, try SCTP"); - return -EINVAL; + log_print("Detect multi-homed hosts but use only the first IP address."); + log_print("Try SCTP, if you want to enable multi-link."); } return 0; @@ -1858,6 +1859,7 @@ static int dlm_tcp_listen_bind(struct socket *sock) static const struct dlm_proto_ops dlm_tcp_ops = { .name = "TCP", .proto = IPPROTO_TCP, + .how = SHUT_WR, .sockopts = dlm_tcp_sockopts, .bind = dlm_tcp_bind, .listen_validate = dlm_tcp_listen_validate, @@ -1896,6 +1898,7 @@ static void dlm_sctp_sockopts(struct socket *sock) static const struct dlm_proto_ops dlm_sctp_ops = { .name = "SCTP", .proto = IPPROTO_SCTP, + .how = SHUT_RDWR, .try_new_addr = true, .sockopts = dlm_sctp_sockopts, .bind = dlm_sctp_bind, diff --git a/fs/drop_caches.c b/fs/drop_caches.c index d45ef541d848..019a8b4eaaf9 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -14,7 +14,7 @@ #include "internal.h" /* A global variable is a bit ugly, but it keeps the code simple */ -int sysctl_drop_caches; +static int sysctl_drop_caches; static void drop_pagecache_sb(struct super_block *sb, void *unused) { @@ -48,7 +48,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) iput(toput_inode); } -int drop_caches_sysctl_handler(const struct ctl_table *table, int write, +static int drop_caches_sysctl_handler(const struct ctl_table *table, int write, void *buffer, size_t *length, loff_t *ppos) { int ret; @@ -77,3 +77,22 @@ int drop_caches_sysctl_handler(const struct ctl_table *table, int write, } return 0; } + +static const struct ctl_table drop_caches_table[] = { + { + .procname = "drop_caches", + .data = &sysctl_drop_caches, + .maxlen = sizeof(int), + .mode = 0200, + .proc_handler = drop_caches_sysctl_handler, + .extra1 = SYSCTL_ONE, + .extra2 = SYSCTL_FOUR, + }, +}; + +static int __init init_vm_drop_caches_sysctls(void) +{ + register_sysctl_init("vm", drop_caches_table); + return 0; +} +fs_initcall(init_vm_drop_caches_sysctls); diff --git a/fs/ecryptfs/inode.c b/fs/ecryptfs/inode.c index a9819ddb1ab8..493d7f194956 100644 --- a/fs/ecryptfs/inode.c +++ b/fs/ecryptfs/inode.c @@ -394,8 +394,8 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, char *encrypted_and_encoded_name = NULL; struct ecryptfs_mount_crypt_stat *mount_crypt_stat; struct dentry *lower_dir_dentry, *lower_dentry; - const char *name = ecryptfs_dentry->d_name.name; - size_t len = ecryptfs_dentry->d_name.len; + struct qstr qname = QSTR_INIT(ecryptfs_dentry->d_name.name, + ecryptfs_dentry->d_name.len); struct dentry *res; int rc = 0; @@ -404,23 +404,25 @@ static struct dentry *ecryptfs_lookup(struct inode *ecryptfs_dir_inode, mount_crypt_stat = &ecryptfs_superblock_to_private( ecryptfs_dentry->d_sb)->mount_crypt_stat; if (mount_crypt_stat->flags & ECRYPTFS_GLOBAL_ENCRYPT_FILENAMES) { + size_t len = qname.len; rc = ecryptfs_encrypt_and_encode_filename( &encrypted_and_encoded_name, &len, - mount_crypt_stat, name, len); + 
mount_crypt_stat, qname.name, len); if (rc) { printk(KERN_ERR "%s: Error attempting to encrypt and encode " "filename; rc = [%d]\n", __func__, rc); return ERR_PTR(rc); } - name = encrypted_and_encoded_name; + qname.name = encrypted_and_encoded_name; + qname.len = len; } - lower_dentry = lookup_one_len_unlocked(name, lower_dir_dentry, len); + lower_dentry = lookup_noperm_unlocked(&qname, lower_dir_dentry); if (IS_ERR(lower_dentry)) { - ecryptfs_printk(KERN_DEBUG, "%s: lookup_one_len() returned " + ecryptfs_printk(KERN_DEBUG, "%s: lookup_noperm() returned " "[%ld] on lower_dentry = [%s]\n", __func__, PTR_ERR(lower_dentry), - name); + qname.name); res = ERR_CAST(lower_dentry); } else { res = ecryptfs_lookup_interpose(ecryptfs_dentry, lower_dentry); @@ -503,18 +505,24 @@ out_lock: return rc; } -static int ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *ecryptfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int rc; struct dentry *lower_dentry; struct inode *lower_dir; rc = lock_parent(dentry, &lower_dentry, &lower_dir); - if (!rc) - rc = vfs_mkdir(&nop_mnt_idmap, lower_dir, - lower_dentry, mode); - if (rc || d_really_is_negative(lower_dentry)) + if (rc) + goto out; + + lower_dentry = vfs_mkdir(&nop_mnt_idmap, lower_dir, + lower_dentry, mode); + rc = PTR_ERR(lower_dentry); + if (IS_ERR(lower_dentry)) + goto out; + rc = 0; + if (d_unhashed(lower_dentry)) goto out; rc = ecryptfs_interpose(lower_dentry, dentry, dir->i_sb); if (rc) @@ -526,7 +534,7 @@ out: inode_unlock(lower_dir); if (d_really_is_negative(dentry)) d_drop(dentry); - return rc; + return ERR_PTR(rc); } static int ecryptfs_rmdir(struct inode *dir, struct dentry *dentry) diff --git a/fs/ecryptfs/super.c b/fs/ecryptfs/super.c index 0b1c878317ab..e7b7f426fecf 100644 --- a/fs/ecryptfs/super.c +++ b/fs/ecryptfs/super.c @@ -172,7 +172,6 @@ const struct super_operations ecryptfs_sops = { .destroy_inode = ecryptfs_destroy_inode, .free_inode = ecryptfs_free_inode, .statfs = ecryptfs_statfs, - .remount_fs = NULL, .evict_inode = ecryptfs_evict_inode, .show_options = ecryptfs_show_options }; diff --git a/fs/efivarfs/internal.h b/fs/efivarfs/internal.h index ac6a1dd0a6a5..f913b6824289 100644 --- a/fs/efivarfs/internal.h +++ b/fs/efivarfs/internal.h @@ -17,7 +17,6 @@ struct efivarfs_fs_info { struct efivarfs_mount_opts mount_opts; struct super_block *sb; struct notifier_block nb; - struct notifier_block pm_nb; }; struct efi_variable { diff --git a/fs/efivarfs/super.c b/fs/efivarfs/super.c index 09fcf731e65d..c900d98bf494 100644 --- a/fs/efivarfs/super.c +++ b/fs/efivarfs/super.c @@ -18,8 +18,10 @@ #include <linux/statfs.h> #include <linux/notifier.h> #include <linux/printk.h> +#include <linux/namei.h> #include "internal.h" +#include "../internal.h" static int efivarfs_ops_notifier(struct notifier_block *nb, unsigned long event, void *data) @@ -119,12 +121,18 @@ static int efivarfs_statfs(struct dentry *dentry, struct kstatfs *buf) return 0; } + +static int efivarfs_freeze_fs(struct super_block *sb); +static int efivarfs_unfreeze_fs(struct super_block *sb); + static const struct super_operations efivarfs_ops = { .statfs = efivarfs_statfs, .drop_inode = generic_delete_inode, .alloc_inode = efivarfs_alloc_inode, .free_inode = efivarfs_free_inode, .show_options = efivarfs_show_options, + .freeze_fs = efivarfs_freeze_fs, + .unfreeze_fs = efivarfs_unfreeze_fs, }; /* @@ -204,7 +212,6 @@ bool efivarfs_variable_is_present(efi_char16_t 
*variable_name, char *name = efivar_get_utf8name(variable_name, vendor); struct super_block *sb = data; struct dentry *dentry; - struct qstr qstr; if (!name) /* @@ -217,9 +224,7 @@ bool efivarfs_variable_is_present(efi_char16_t *variable_name, */ return true; - qstr.name = name; - qstr.len = strlen(name); - dentry = d_hash_and_lookup(sb->s_root, &qstr); + dentry = try_lookup_noperm(&QSTR(name), sb->s_root); kfree(name); if (!IS_ERR_OR_NULL(dentry)) dput(dentry); @@ -391,55 +396,12 @@ static const struct fs_context_operations efivarfs_context_ops = { .reconfigure = efivarfs_reconfigure, }; -struct efivarfs_ctx { - struct dir_context ctx; - struct super_block *sb; - struct dentry *dentry; -}; - -static bool efivarfs_actor(struct dir_context *ctx, const char *name, int len, - loff_t offset, u64 ino, unsigned mode) -{ - unsigned long size; - struct efivarfs_ctx *ectx = container_of(ctx, struct efivarfs_ctx, ctx); - struct qstr qstr = { .name = name, .len = len }; - struct dentry *dentry = d_hash_and_lookup(ectx->sb->s_root, &qstr); - struct inode *inode; - struct efivar_entry *entry; - int err; - - if (IS_ERR_OR_NULL(dentry)) - return true; - - inode = d_inode(dentry); - entry = efivar_entry(inode); - - err = efivar_entry_size(entry, &size); - size += sizeof(__u32); /* attributes */ - if (err) - size = 0; - - inode_lock(inode); - i_size_write(inode, size); - inode_unlock(inode); - - if (!size) { - ectx->dentry = dentry; - return false; - } - - dput(dentry); - - return true; -} - static int efivarfs_check_missing(efi_char16_t *name16, efi_guid_t vendor, unsigned long name_size, void *data) { char *name; struct super_block *sb = data; struct dentry *dentry; - struct qstr qstr; int err; if (guid_equal(&vendor, &LINUX_EFI_RANDOM_SEED_TABLE_GUID)) @@ -449,9 +411,7 @@ static int efivarfs_check_missing(efi_char16_t *name16, efi_guid_t vendor, if (!name) return -ENOMEM; - qstr.name = name; - qstr.len = strlen(name); - dentry = d_hash_and_lookup(sb->s_root, &qstr); + dentry = try_lookup_noperm(&QSTR(name), sb->s_root); if (IS_ERR(dentry)) { err = PTR_ERR(dentry); goto out; @@ -472,65 +432,59 @@ static int efivarfs_check_missing(efi_char16_t *name16, efi_guid_t vendor, return err; } -static int efivarfs_pm_notify(struct notifier_block *nb, unsigned long action, - void *ptr) -{ - struct efivarfs_fs_info *sfi = container_of(nb, struct efivarfs_fs_info, - pm_nb); - struct path path = { .mnt = NULL, .dentry = sfi->sb->s_root, }; - struct efivarfs_ctx ectx = { - .ctx = { - .actor = efivarfs_actor, - }, - .sb = sfi->sb, - }; - struct file *file; - static bool rescan_done = true; - - if (action == PM_HIBERNATION_PREPARE) { - rescan_done = false; - return NOTIFY_OK; - } else if (action != PM_POST_HIBERNATION) { - return NOTIFY_DONE; - } +static struct file_system_type efivarfs_type; - if (rescan_done) - return NOTIFY_DONE; - - pr_info("efivarfs: resyncing variable state\n"); - - /* O_NOATIME is required to prevent oops on NULL mnt */ - file = kernel_file_open(&path, O_RDONLY | O_DIRECTORY | O_NOATIME, - current_cred()); - if (IS_ERR(file)) - return NOTIFY_DONE; +static int efivarfs_freeze_fs(struct super_block *sb) +{ + /* Nothing for us to do. */ + return 0; +} - rescan_done = true; +static int efivarfs_unfreeze_fs(struct super_block *sb) +{ + struct dentry *child = NULL; /* - * First loop over the directory and verify each entry exists, - * removing it if it doesn't + * Unconditionally resync the variable state on a thaw request. 
+ * Given the size of efivarfs it really doesn't matter to simply + * iterate through all of the entries and resync. Freeze/thaw + * requests are rare enough for that to not matter and the + * number of entries is pretty low too. So we really don't care. */ - file->f_pos = 2; /* skip . and .. */ - do { - ectx.dentry = NULL; - iterate_dir(file, &ectx.ctx); - if (ectx.dentry) { - pr_info("efivarfs: removing variable %pd\n", - ectx.dentry); - simple_recursive_removal(ectx.dentry, NULL); - dput(ectx.dentry); + pr_info("efivarfs: resyncing variable state\n"); + for (;;) { + int err; + unsigned long size = 0; + struct inode *inode; + struct efivar_entry *entry; + + child = find_next_child(sb->s_root, child); + if (!child) + break; + + inode = d_inode(child); + entry = efivar_entry(inode); + + err = efivar_entry_size(entry, &size); + if (err) + size = 0; + else + size += sizeof(__u32); + + inode_lock(inode); + i_size_write(inode, size); + inode_unlock(inode); + + /* The variable doesn't exist anymore, delete it. */ + if (!size) { + pr_info("efivarfs: removing variable %pd\n", child); + simple_recursive_removal(child, NULL); } - } while (ectx.dentry); - fput(file); - - /* - * then loop over variables, creating them if there's no matching - * dentry - */ - efivar_init(efivarfs_check_missing, sfi->sb, false); + } - return NOTIFY_OK; + efivar_init(efivarfs_check_missing, sb, false); + pr_info("efivarfs: finished resyncing variable state\n"); + return 0; } static int efivarfs_init_fs_context(struct fs_context *fc) @@ -550,10 +504,6 @@ static int efivarfs_init_fs_context(struct fs_context *fc) fc->s_fs_info = sfi; fc->ops = &efivarfs_context_ops; - sfi->pm_nb.notifier_call = efivarfs_pm_notify; - sfi->pm_nb.priority = 0; - register_pm_notifier(&sfi->pm_nb); - return 0; } @@ -563,7 +513,6 @@ static void efivarfs_kill_sb(struct super_block *sb) blocking_notifier_chain_unregister(&efivar_ops_nh, &sfi->nb); kill_litter_super(sb); - unregister_pm_notifier(&sfi->pm_nb); kfree(sfi); } diff --git a/fs/erofs/Kconfig b/fs/erofs/Kconfig index 6ea60661fa55..6beeb7063871 100644 --- a/fs/erofs/Kconfig +++ b/fs/erofs/Kconfig @@ -3,8 +3,8 @@ config EROFS_FS tristate "EROFS filesystem support" depends on BLOCK + select CRC32 select FS_IOMAP - select LIBCRC32C help EROFS (Enhanced Read-Only File System) is a lightweight read-only file system with modern designs (e.g. no buffer heads, inline @@ -13,12 +13,12 @@ config EROFS_FS smartphones with Android OS, LiveCDs and high-density hosts with numerous containers; - It also provides fixed-sized output compression support in order to - improve storage density as well as keep relatively higher compression - ratios and implements in-place decompression to reuse the file page - for compressed data temporarily with proper strategies, which is - quite useful to ensure guaranteed end-to-end runtime decompression - performance under extremely memory pressure without extra cost. + It also provides transparent compression and deduplication support to + improve storage density and maintain relatively high compression + ratios, and it implements in-place decompression to temporarily reuse + page cache for compressed data using proper strategies, which is + quite useful for ensuring guaranteed end-to-end runtime decompression + performance under extreme memory pressure without extra cost. See the documentation at <file:Documentation/filesystems/erofs.rst> and the web pages at <https://erofs.docs.kernel.org> for more details. 
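Several hunks in this region (debugfs, ecryptfs, efivarfs) perform the same directory-lookup conversion, so the mapping is worth recording once. The correspondences below are read off the conversions shown above; the "State" name in the usage line is purely hypothetical:

	/*
	 * lookup_one_len()           -> lookup_noperm()	(parent held locked)
	 * lookup_one_len_unlocked()  -> lookup_noperm_unlocked()
	 * lookup_positive_unlocked() -> lookup_noperm_positive_unlocked()
	 * d_hash_and_lookup()        -> try_lookup_noperm()
	 *
	 * All take a struct qstr now; QSTR(name) wraps a NUL-terminated
	 * string, replacing the old name/strlen(name) argument pairs.
	 */
	struct dentry *child = try_lookup_noperm(&QSTR("State"), parent);

	if (!IS_ERR_OR_NULL(child))
		dput(child);
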
@@ -97,7 +97,7 @@ config EROFS_FS_ZIP select LZ4_DECOMPRESS default y help - Enable fixed-sized output compression for EROFS. + Enable transparent compression support for EROFS file systems. If you don't want to enable compression feature, say N. @@ -144,6 +144,20 @@ config EROFS_FS_ZIP_ZSTD If unsure, say N. +config EROFS_FS_ZIP_ACCEL + bool "EROFS hardware decompression support" + depends on EROFS_FS_ZIP + help + Saying Y here includes hardware accelerator support for reading + EROFS file systems containing compressed data. It gives better + decompression speed than the software-implemented decompression, and + it costs lower CPU overhead. + + Hardware accelerator support is an experimental feature for now and + file systems are still readable without selecting this option. + + If unsure, say N. + config EROFS_FS_ONDEMAND bool "EROFS fscache-based on-demand read support (deprecated)" depends on EROFS_FS diff --git a/fs/erofs/Makefile b/fs/erofs/Makefile index 4331d53c7109..549abc424763 100644 --- a/fs/erofs/Makefile +++ b/fs/erofs/Makefile @@ -7,5 +7,6 @@ erofs-$(CONFIG_EROFS_FS_ZIP) += decompressor.o zmap.o zdata.o zutil.o erofs-$(CONFIG_EROFS_FS_ZIP_LZMA) += decompressor_lzma.o erofs-$(CONFIG_EROFS_FS_ZIP_DEFLATE) += decompressor_deflate.o erofs-$(CONFIG_EROFS_FS_ZIP_ZSTD) += decompressor_zstd.o +erofs-$(CONFIG_EROFS_FS_ZIP_ACCEL) += decompressor_crypto.o erofs-$(CONFIG_EROFS_FS_BACKED_BY_FILE) += fileio.o erofs-$(CONFIG_EROFS_FS_ONDEMAND) += fscache.o diff --git a/fs/erofs/compress.h b/fs/erofs/compress.h index 65ff39401020..510e922c5193 100644 --- a/fs/erofs/compress.h +++ b/fs/erofs/compress.h @@ -11,6 +11,7 @@ struct z_erofs_decompress_req { struct super_block *sb; struct page **in, **out; + unsigned int inpages, outpages; unsigned short pageofs_in, pageofs_out; unsigned int inputsize, outputsize; @@ -59,7 +60,6 @@ extern const struct z_erofs_decompressor *z_erofs_decomp[]; struct z_erofs_stream_dctx { struct z_erofs_decompress_req *rq; - unsigned int inpages, outpages; /* # of {en,de}coded pages */ int no, ni; /* the current {en,de}coded page # */ unsigned int avail_out; /* remaining bytes in the decoded buffer */ @@ -76,4 +76,14 @@ int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, unsigned int padbufsize); int __init z_erofs_init_decompressor(void); void z_erofs_exit_decompressor(void); +int z_erofs_crypto_decompress(struct z_erofs_decompress_req *rq, + struct page **pgpl); +int z_erofs_crypto_enable_engine(const char *name, int len); +#ifdef CONFIG_EROFS_FS_ZIP_ACCEL +void z_erofs_crypto_disable_all_engines(void); +int z_erofs_crypto_show_engines(char *buf, int size, char sep); +#else +static inline void z_erofs_crypto_disable_all_engines(void) {} +static inline int z_erofs_crypto_show_engines(char *buf, int size, char sep) { return 0; } +#endif #endif diff --git a/fs/erofs/data.c b/fs/erofs/data.c index 0cd6b5c4df98..6a329c329f43 100644 --- a/fs/erofs/data.c +++ b/fs/erofs/data.c @@ -25,10 +25,9 @@ void erofs_put_metabuf(struct erofs_buf *buf) buf->page = NULL; } -void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, - enum erofs_kmap_type type) +void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap) { - pgoff_t index = offset >> PAGE_SHIFT; + pgoff_t index = (buf->off + offset) >> PAGE_SHIFT; struct folio *folio = NULL; if (buf->page) { @@ -43,10 +42,10 @@ void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, return folio; } buf->page = folio_file_page(folio, index); - if (!buf->base && type == EROFS_KMAP) - 
buf->base = kmap_local_page(buf->page); - if (type == EROFS_NO_KMAP) + if (!need_kmap) return NULL; + if (!buf->base) + buf->base = kmap_local_page(buf->page); return buf->base + (offset & ~PAGE_MASK); } @@ -55,6 +54,7 @@ void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb) struct erofs_sb_info *sbi = EROFS_SB(sb); buf->file = NULL; + buf->off = sbi->dif0.fsoff; if (erofs_is_fileio_mode(sbi)) { buf->file = sbi->dif0.file; /* some fs like FUSE needs it */ buf->mapping = buf->file->f_mapping; @@ -65,64 +65,47 @@ void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb) } void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, - erofs_off_t offset, enum erofs_kmap_type type) + erofs_off_t offset, bool need_kmap) { erofs_init_metabuf(buf, sb); - return erofs_bread(buf, offset, type); -} - -static int erofs_map_blocks_flatmode(struct inode *inode, - struct erofs_map_blocks *map) -{ - struct erofs_inode *vi = EROFS_I(inode); - struct super_block *sb = inode->i_sb; - bool tailendpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE); - erofs_blk_t lastblk = erofs_iblks(inode) - tailendpacking; - - map->m_flags = EROFS_MAP_MAPPED; /* no hole in flat inodes */ - if (map->m_la < erofs_pos(sb, lastblk)) { - map->m_pa = erofs_pos(sb, vi->raw_blkaddr) + map->m_la; - map->m_plen = erofs_pos(sb, lastblk) - map->m_la; - } else { - DBG_BUGON(!tailendpacking); - map->m_pa = erofs_iloc(inode) + vi->inode_isize + - vi->xattr_isize + erofs_blkoff(sb, map->m_la); - map->m_plen = inode->i_size - map->m_la; - - /* inline data should be located in the same meta block */ - if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) { - erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; - } - map->m_flags |= EROFS_MAP_META; - } - return 0; + return erofs_bread(buf, offset, need_kmap); } int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) { + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct super_block *sb = inode->i_sb; + unsigned int unit, blksz = sb->s_blocksize; struct erofs_inode *vi = EROFS_I(inode); struct erofs_inode_chunk_index *idx; - struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - u64 chunknr; - unsigned int unit; + erofs_blk_t startblk, addrmask; + bool tailpacking; erofs_off_t pos; - void *kaddr; + u64 chunknr; int err = 0; trace_erofs_map_blocks_enter(inode, map, 0); map->m_deviceid = 0; - if (map->m_la >= inode->i_size) { - /* leave out-of-bound access unmapped */ - map->m_flags = 0; - map->m_plen = map->m_llen; + map->m_flags = 0; + if (map->m_la >= inode->i_size) goto out; - } if (vi->datalayout != EROFS_INODE_CHUNK_BASED) { - err = erofs_map_blocks_flatmode(inode, map); + tailpacking = (vi->datalayout == EROFS_INODE_FLAT_INLINE); + if (!tailpacking && vi->startblk == EROFS_NULL_ADDR) + goto out; + pos = erofs_pos(sb, erofs_iblks(inode) - tailpacking); + + map->m_flags = EROFS_MAP_MAPPED; + if (map->m_la < pos) { + map->m_pa = erofs_pos(sb, vi->startblk) + map->m_la; + map->m_llen = pos - map->m_la; + } else { + map->m_pa = erofs_iloc(inode) + vi->inode_isize + + vi->xattr_isize + erofs_blkoff(sb, map->m_la); + map->m_llen = inode->i_size - map->m_la; + map->m_flags |= EROFS_MAP_META; + } goto out; } @@ -135,45 +118,44 @@ int erofs_map_blocks(struct inode *inode, struct erofs_map_blocks *map) pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, unit) + unit * chunknr; - kaddr = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP); - if (IS_ERR(kaddr)) { - err = 
PTR_ERR(kaddr); + idx = erofs_read_metabuf(&buf, sb, pos, true); + if (IS_ERR(idx)) { + err = PTR_ERR(idx); goto out; } map->m_la = chunknr << vi->chunkbits; - map->m_plen = min_t(erofs_off_t, 1UL << vi->chunkbits, - round_up(inode->i_size - map->m_la, sb->s_blocksize)); - - /* handle block map */ - if (!(vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES)) { - __le32 *blkaddr = kaddr; - - if (le32_to_cpu(*blkaddr) == EROFS_NULL_ADDR) { - map->m_flags = 0; - } else { - map->m_pa = erofs_pos(sb, le32_to_cpu(*blkaddr)); + map->m_llen = min_t(erofs_off_t, 1UL << vi->chunkbits, + round_up(inode->i_size - map->m_la, blksz)); + if (vi->chunkformat & EROFS_CHUNK_FORMAT_INDEXES) { + addrmask = (vi->chunkformat & EROFS_CHUNK_FORMAT_48BIT) ? + BIT_ULL(48) - 1 : BIT_ULL(32) - 1; + startblk = (((u64)le16_to_cpu(idx->startblk_hi) << 32) | + le32_to_cpu(idx->startblk_lo)) & addrmask; + if ((startblk ^ EROFS_NULL_ADDR) & addrmask) { + map->m_deviceid = le16_to_cpu(idx->device_id) & + EROFS_SB(sb)->device_id_mask; + map->m_pa = erofs_pos(sb, startblk); + map->m_flags = EROFS_MAP_MAPPED; + } + } else { + startblk = le32_to_cpu(*(__le32 *)idx); + if (startblk != (u32)EROFS_NULL_ADDR) { + map->m_pa = erofs_pos(sb, startblk); map->m_flags = EROFS_MAP_MAPPED; } - goto out_unlock; - } - /* parse chunk indexes */ - idx = kaddr; - switch (le32_to_cpu(idx->blkaddr)) { - case EROFS_NULL_ADDR: - map->m_flags = 0; - break; - default: - map->m_deviceid = le16_to_cpu(idx->device_id) & - EROFS_SB(sb)->device_id_mask; - map->m_pa = erofs_pos(sb, le32_to_cpu(idx->blkaddr)); - map->m_flags = EROFS_MAP_MAPPED; - break; } -out_unlock: erofs_put_metabuf(&buf); out: - if (!err) - map->m_llen = map->m_plen; + if (!err) { + map->m_plen = map->m_llen; + /* inline data should be located in the same meta block */ + if ((map->m_flags & EROFS_MAP_META) && + erofs_blkoff(sb, map->m_pa) + map->m_plen > blksz) { + erofs_err(sb, "inline data across blocks @ nid %llu", vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + } trace_erofs_map_blocks_exit(inode, map, 0, err); return err; } @@ -192,7 +174,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) { struct erofs_dev_context *devs = EROFS_SB(sb)->devs; struct erofs_device_info *dif; - erofs_off_t startoff, length; + erofs_off_t startoff; int id; erofs_fill_from_devinfo(map, sb, &EROFS_SB(sb)->dif0); @@ -205,7 +187,7 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) return -ENODEV; } if (devs->flatdev) { - map->m_pa += erofs_pos(sb, dif->mapped_blkaddr); + map->m_pa += erofs_pos(sb, dif->uniaddr); up_read(&devs->rwsem); return 0; } @@ -214,13 +196,12 @@ int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *map) } else if (devs->extra_devices && !devs->flatdev) { down_read(&devs->rwsem); idr_for_each_entry(&devs->tree, dif, id) { - if (!dif->mapped_blkaddr) + if (!dif->uniaddr) continue; - startoff = erofs_pos(sb, dif->mapped_blkaddr); - length = erofs_pos(sb, dif->blocks); + startoff = erofs_pos(sb, dif->uniaddr); if (map->m_pa >= startoff && - map->m_pa < startoff + length) { + map->m_pa < startoff + erofs_pos(sb, dif->blocks)) { map->m_pa -= startoff; erofs_fill_from_devinfo(map, sb, dif); break; @@ -312,14 +293,14 @@ static int erofs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, struct erofs_buf buf = __EROFS_BUF_INITIALIZER; iomap->type = IOMAP_INLINE; - ptr = erofs_read_metabuf(&buf, sb, mdev.m_pa, EROFS_KMAP); + ptr = erofs_read_metabuf(&buf, sb, mdev.m_pa, true); if (IS_ERR(ptr)) return PTR_ERR(ptr); 
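The chunk-index path above is where the new 48-bit layout gets decoded: startblk_hi contributes the upper 16 bits, and addrmask selects the 32-bit or 48-bit null-address convention depending on EROFS_CHUNK_FORMAT_48BIT. A self-contained sketch of the arithmetic, with field names following the hunk (the test value in main() is arbitrary):

#include <stdint.h>
#include <stdio.h>

#define BIT_ULL(n)	(1ULL << (n))

/* decode a unified start block from an on-disk chunk index */
static uint64_t decode_startblk(uint16_t startblk_hi, uint32_t startblk_lo,
				int fmt_48bit)
{
	uint64_t addrmask = fmt_48bit ? BIT_ULL(48) - 1 : BIT_ULL(32) - 1;

	return (((uint64_t)startblk_hi << 32) | startblk_lo) & addrmask;
}

int main(void)
{
	/* prints 180000000: bit 32 comes from startblk_hi */
	printf("%llx\n",
	       (unsigned long long)decode_startblk(0x0001, 0x80000000, 1));
	return 0;
}
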
iomap->inline_data = ptr; iomap->private = buf.base; } else { iomap->type = IOMAP_MAPPED; - iomap->addr = mdev.m_pa; + iomap->addr = mdev.m_dif->fsoff + mdev.m_pa; if (flags & IOMAP_DAX) iomap->addr += mdev.m_dif->dax_part_off; } diff --git a/fs/erofs/decompressor.c b/fs/erofs/decompressor.c index 2b123b070a42..bf62e2836b60 100644 --- a/fs/erofs/decompressor.c +++ b/fs/erofs/decompressor.c @@ -9,14 +9,6 @@ #define LZ4_MAX_DISTANCE_PAGES (DIV_ROUND_UP(LZ4_DISTANCE_MAX, PAGE_SIZE) + 1) -struct z_erofs_lz4_decompress_ctx { - struct z_erofs_decompress_req *rq; - /* # of encoded, decoded pages */ - unsigned int inpages, outpages; - /* decoded block total length (used for in-place decompression) */ - unsigned int oend; -}; - static int z_erofs_load_lz4_config(struct super_block *sb, struct erofs_super_block *dsb, void *data, int size) { @@ -55,10 +47,9 @@ static int z_erofs_load_lz4_config(struct super_block *sb, * Fill all gaps with bounce pages if it's a sparse page list. Also check if * all physical pages are consecutive, which can be seen for moderate CR. */ -static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, +static int z_erofs_lz4_prepare_dstpages(struct z_erofs_decompress_req *rq, struct page **pagepool) { - struct z_erofs_decompress_req *rq = ctx->rq; struct page *availables[LZ4_MAX_DISTANCE_PAGES] = { NULL }; unsigned long bounced[DIV_ROUND_UP(LZ4_MAX_DISTANCE_PAGES, BITS_PER_LONG)] = { 0 }; @@ -68,7 +59,7 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, unsigned int i, j, top; top = 0; - for (i = j = 0; i < ctx->outpages; ++i, ++j) { + for (i = j = 0; i < rq->outpages; ++i, ++j) { struct page *const page = rq->out[i]; struct page *victim; @@ -114,36 +105,36 @@ static int z_erofs_lz4_prepare_dstpages(struct z_erofs_lz4_decompress_ctx *ctx, return kaddr ? 
1 : 0; } -static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, +static void *z_erofs_lz4_handle_overlap(struct z_erofs_decompress_req *rq, void *inpage, void *out, unsigned int *inputmargin, int *maptype, bool may_inplace) { - struct z_erofs_decompress_req *rq = ctx->rq; - unsigned int omargin, total, i; + unsigned int oend, omargin, total, i; struct page **in; void *src, *tmp; if (rq->inplace_io) { - omargin = PAGE_ALIGN(ctx->oend) - ctx->oend; + oend = rq->pageofs_out + rq->outputsize; + omargin = PAGE_ALIGN(oend) - oend; if (rq->partial_decoding || !may_inplace || omargin < LZ4_DECOMPRESS_INPLACE_MARGIN(rq->inputsize)) goto docopy; - for (i = 0; i < ctx->inpages; ++i) - if (rq->out[ctx->outpages - ctx->inpages + i] != + for (i = 0; i < rq->inpages; ++i) + if (rq->out[rq->outpages - rq->inpages + i] != rq->in[i]) goto docopy; kunmap_local(inpage); *maptype = 3; - return out + ((ctx->outpages - ctx->inpages) << PAGE_SHIFT); + return out + ((rq->outpages - rq->inpages) << PAGE_SHIFT); } - if (ctx->inpages <= 1) { + if (rq->inpages <= 1) { *maptype = 0; return inpage; } kunmap_local(inpage); - src = erofs_vm_map_ram(rq->in, ctx->inpages); + src = erofs_vm_map_ram(rq->in, rq->inpages); if (!src) return ERR_PTR(-ENOMEM); *maptype = 1; @@ -152,7 +143,7 @@ static void *z_erofs_lz4_handle_overlap(struct z_erofs_lz4_decompress_ctx *ctx, docopy: /* Or copy compressed data which can be overlapped to per-CPU buffer */ in = rq->in; - src = z_erofs_get_gbuf(ctx->inpages); + src = z_erofs_get_gbuf(rq->inpages); if (!src) { DBG_BUGON(1); kunmap_local(inpage); @@ -197,10 +188,8 @@ int z_erofs_fixup_insize(struct z_erofs_decompress_req *rq, const char *padbuf, return 0; } -static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, - u8 *dst) +static int z_erofs_lz4_decompress_mem(struct z_erofs_decompress_req *rq, u8 *dst) { - struct z_erofs_decompress_req *rq = ctx->rq; bool support_0padding = false, may_inplace = false; unsigned int inputmargin; u8 *out, *headpage, *src; @@ -224,7 +213,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, } inputmargin = rq->pageofs_in; - src = z_erofs_lz4_handle_overlap(ctx, headpage, dst, &inputmargin, + src = z_erofs_lz4_handle_overlap(rq, headpage, dst, &inputmargin, &maptype, may_inplace); if (IS_ERR(src)) return PTR_ERR(src); @@ -251,7 +240,7 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, if (maptype == 0) { kunmap_local(headpage); } else if (maptype == 1) { - vm_unmap_ram(src, ctx->inpages); + vm_unmap_ram(src, rq->inpages); } else if (maptype == 2) { z_erofs_put_gbuf(src); } else if (maptype != 3) { @@ -264,54 +253,42 @@ static int z_erofs_lz4_decompress_mem(struct z_erofs_lz4_decompress_ctx *ctx, static int z_erofs_lz4_decompress(struct z_erofs_decompress_req *rq, struct page **pagepool) { - struct z_erofs_lz4_decompress_ctx ctx; unsigned int dst_maptype; void *dst; int ret; - ctx.rq = rq; - ctx.oend = rq->pageofs_out + rq->outputsize; - ctx.outpages = PAGE_ALIGN(ctx.oend) >> PAGE_SHIFT; - ctx.inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT; - /* one optimized fast path only for non bigpcluster cases yet */ - if (ctx.inpages == 1 && ctx.outpages == 1 && !rq->inplace_io) { + if (rq->inpages == 1 && rq->outpages == 1 && !rq->inplace_io) { DBG_BUGON(!*rq->out); dst = kmap_local_page(*rq->out); dst_maptype = 0; - goto dstmap_out; - } - - /* general decoding path which can be used for all cases */ - ret = z_erofs_lz4_prepare_dstpages(&ctx, pagepool); - if 
(ret < 0) { - return ret; - } else if (ret > 0) { - dst = page_address(*rq->out); - dst_maptype = 1; } else { - dst = erofs_vm_map_ram(rq->out, ctx.outpages); - if (!dst) - return -ENOMEM; - dst_maptype = 2; + /* general decoding path which can be used for all cases */ + ret = z_erofs_lz4_prepare_dstpages(rq, pagepool); + if (ret < 0) + return ret; + if (ret > 0) { + dst = page_address(*rq->out); + dst_maptype = 1; + } else { + dst = erofs_vm_map_ram(rq->out, rq->outpages); + if (!dst) + return -ENOMEM; + dst_maptype = 2; + } } - -dstmap_out: - ret = z_erofs_lz4_decompress_mem(&ctx, dst); + ret = z_erofs_lz4_decompress_mem(rq, dst); if (!dst_maptype) kunmap_local(dst); else if (dst_maptype == 2) - vm_unmap_ram(dst, ctx.outpages); + vm_unmap_ram(dst, rq->outpages); return ret; } static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, struct page **pagepool) { - const unsigned int nrpages_in = - PAGE_ALIGN(rq->pageofs_in + rq->inputsize) >> PAGE_SHIFT; - const unsigned int nrpages_out = - PAGE_ALIGN(rq->pageofs_out + rq->outputsize) >> PAGE_SHIFT; + const unsigned int nrpages_in = rq->inpages, nrpages_out = rq->outpages; const unsigned int bs = rq->sb->s_blocksize; unsigned int cur = 0, ni = 0, no, pi, po, insz, cnt; u8 *kin; @@ -336,7 +313,7 @@ static int z_erofs_transform_plain(struct z_erofs_decompress_req *rq, rq->outputsize -= cur; } - for (; rq->outputsize; rq->pageofs_in = 0, cur += PAGE_SIZE, ni++) { + for (; rq->outputsize; rq->pageofs_in = 0, cur += insz, ni++) { insz = min(PAGE_SIZE - rq->pageofs_in, rq->outputsize); rq->outputsize -= insz; if (!rq->in[ni]) @@ -373,7 +350,7 @@ int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst, unsigned int j; if (!dctx->avail_out) { - if (++dctx->no >= dctx->outpages || !rq->outputsize) { + if (++dctx->no >= rq->outpages || !rq->outputsize) { erofs_err(sb, "insufficient space for decompressed data"); return -EFSCORRUPTED; } @@ -401,7 +378,7 @@ int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst, } if (dctx->inbuf_pos == dctx->inbuf_sz && rq->inputsize) { - if (++dctx->ni >= dctx->inpages) { + if (++dctx->ni >= rq->inpages) { erofs_err(sb, "invalid compressed data"); return -EFSCORRUPTED; } @@ -434,7 +411,7 @@ int z_erofs_stream_switch_bufs(struct z_erofs_stream_dctx *dctx, void **dst, dctx->bounced = true; } - for (j = dctx->ni + 1; j < dctx->inpages; ++j) { + for (j = dctx->ni + 1; j < rq->inpages; ++j) { if (rq->out[dctx->no] != rq->in[j]) continue; tmppage = erofs_allocpage(pgpl, rq->gfp); diff --git a/fs/erofs/decompressor_crypto.c b/fs/erofs/decompressor_crypto.c new file mode 100644 index 000000000000..97b77ab64432 --- /dev/null +++ b/fs/erofs/decompressor_crypto.c @@ -0,0 +1,181 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +#include <linux/scatterlist.h> +#include <crypto/acompress.h> +#include "compress.h" + +static int __z_erofs_crypto_decompress(struct z_erofs_decompress_req *rq, + struct crypto_acomp *tfm) +{ + struct sg_table st_src, st_dst; + struct acomp_req *req; + struct crypto_wait wait; + u8 *headpage; + int ret; + + headpage = kmap_local_page(*rq->in); + ret = z_erofs_fixup_insize(rq, headpage + rq->pageofs_in, + min_t(unsigned int, rq->inputsize, + rq->sb->s_blocksize - rq->pageofs_in)); + kunmap_local(headpage); + if (ret) + return ret; + + req = acomp_request_alloc(tfm); + if (!req) + return -ENOMEM; + + ret = sg_alloc_table_from_pages_segment(&st_src, rq->in, rq->inpages, + rq->pageofs_in, rq->inputsize, UINT_MAX, GFP_KERNEL); + if (ret < 0) + goto 
failed_src_alloc; + + ret = sg_alloc_table_from_pages_segment(&st_dst, rq->out, rq->outpages, + rq->pageofs_out, rq->outputsize, UINT_MAX, GFP_KERNEL); + if (ret < 0) + goto failed_dst_alloc; + + acomp_request_set_params(req, st_src.sgl, + st_dst.sgl, rq->inputsize, rq->outputsize); + + crypto_init_wait(&wait); + acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &wait); + + ret = crypto_wait_req(crypto_acomp_decompress(req), &wait); + if (ret) { + erofs_err(rq->sb, "failed to decompress %d in[%u, %u] out[%u]", + ret, rq->inputsize, rq->pageofs_in, rq->outputsize); + ret = -EIO; + } + + sg_free_table(&st_dst); +failed_dst_alloc: + sg_free_table(&st_src); +failed_src_alloc: + acomp_request_free(req); + return ret; +} + +struct z_erofs_crypto_engine { + char *crypto_name; + struct crypto_acomp *tfm; +}; + +struct z_erofs_crypto_engine *z_erofs_crypto[Z_EROFS_COMPRESSION_MAX] = { + [Z_EROFS_COMPRESSION_LZ4] = (struct z_erofs_crypto_engine[]) { + {}, + }, + [Z_EROFS_COMPRESSION_LZMA] = (struct z_erofs_crypto_engine[]) { + {}, + }, + [Z_EROFS_COMPRESSION_DEFLATE] = (struct z_erofs_crypto_engine[]) { + { .crypto_name = "qat_deflate", }, + {}, + }, + [Z_EROFS_COMPRESSION_ZSTD] = (struct z_erofs_crypto_engine[]) { + {}, + }, +}; +static DECLARE_RWSEM(z_erofs_crypto_rwsem); + +static struct crypto_acomp *z_erofs_crypto_get_engine(int alg) +{ + struct z_erofs_crypto_engine *e; + + for (e = z_erofs_crypto[alg]; e->crypto_name; ++e) + if (e->tfm) + return e->tfm; + return NULL; +} + +int z_erofs_crypto_decompress(struct z_erofs_decompress_req *rq, + struct page **pgpl) +{ + struct crypto_acomp *tfm; + int i, err; + + down_read(&z_erofs_crypto_rwsem); + tfm = z_erofs_crypto_get_engine(rq->alg); + if (!tfm) { + err = -EOPNOTSUPP; + goto out; + } + + for (i = 0; i < rq->outpages; i++) { + struct page *const page = rq->out[i]; + struct page *victim; + + if (!page) { + victim = __erofs_allocpage(pgpl, rq->gfp, true); + if (!victim) { + err = -ENOMEM; + goto out; + } + set_page_private(victim, Z_EROFS_SHORTLIVED_PAGE); + rq->out[i] = victim; + } + } + err = __z_erofs_crypto_decompress(rq, tfm); +out: + up_read(&z_erofs_crypto_rwsem); + return err; +} + +int z_erofs_crypto_enable_engine(const char *name, int len) +{ + struct z_erofs_crypto_engine *e; + struct crypto_acomp *tfm; + int alg; + + down_write(&z_erofs_crypto_rwsem); + for (alg = 0; alg < Z_EROFS_COMPRESSION_MAX; ++alg) { + for (e = z_erofs_crypto[alg]; e->crypto_name; ++e) { + if (!strncmp(name, e->crypto_name, len)) { + if (e->tfm) + break; + tfm = crypto_alloc_acomp(e->crypto_name, 0, 0); + if (IS_ERR(tfm)) { + up_write(&z_erofs_crypto_rwsem); + return -EOPNOTSUPP; + } + e->tfm = tfm; + break; + } + } + } + up_write(&z_erofs_crypto_rwsem); + return 0; +} + +void z_erofs_crypto_disable_all_engines(void) +{ + struct z_erofs_crypto_engine *e; + int alg; + + down_write(&z_erofs_crypto_rwsem); + for (alg = 0; alg < Z_EROFS_COMPRESSION_MAX; ++alg) { + for (e = z_erofs_crypto[alg]; e->crypto_name; ++e) { + if (!e->tfm) + continue; + crypto_free_acomp(e->tfm); + e->tfm = NULL; + } + } + up_write(&z_erofs_crypto_rwsem); +} + +int z_erofs_crypto_show_engines(char *buf, int size, char sep) +{ + struct z_erofs_crypto_engine *e; + int alg, len = 0; + + for (alg = 0; alg < Z_EROFS_COMPRESSION_MAX; ++alg) { + for (e = z_erofs_crypto[alg]; e->crypto_name; ++e) { + if (!e->tfm) + continue; + len += scnprintf(buf + len, size - len, "%s%c", + e->crypto_name, sep); + } + } + return len; +} diff --git a/fs/erofs/decompressor_deflate.c 
b/fs/erofs/decompressor_deflate.c index 5070d2fcc737..6909b2d529c7 100644 --- a/fs/erofs/decompressor_deflate.c +++ b/fs/erofs/decompressor_deflate.c @@ -97,17 +97,11 @@ failed: return -ENOMEM; } -static int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, - struct page **pgpl) +static int __z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, + struct page **pgpl) { struct super_block *sb = rq->sb; - struct z_erofs_stream_dctx dctx = { - .rq = rq, - .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT, - .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) - >> PAGE_SHIFT, - .no = -1, .ni = 0, - }; + struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 }; struct z_erofs_deflate *strm; int zerr, err; @@ -184,6 +178,22 @@ failed_zinit: return err; } +static int z_erofs_deflate_decompress(struct z_erofs_decompress_req *rq, + struct page **pgpl) +{ +#ifdef CONFIG_EROFS_FS_ZIP_ACCEL + int err; + + if (!rq->partial_decoding) { + err = z_erofs_crypto_decompress(rq, pgpl); + if (err != -EOPNOTSUPP) + return err; + + } +#endif + return __z_erofs_deflate_decompress(rq, pgpl); +} + const struct z_erofs_decompressor z_erofs_deflate_decomp = { .config = z_erofs_load_deflate_config, .decompress = z_erofs_deflate_decompress, diff --git a/fs/erofs/decompressor_lzma.c b/fs/erofs/decompressor_lzma.c index 40666815046f..832cffb83a66 100644 --- a/fs/erofs/decompressor_lzma.c +++ b/fs/erofs/decompressor_lzma.c @@ -150,13 +150,7 @@ static int z_erofs_lzma_decompress(struct z_erofs_decompress_req *rq, struct page **pgpl) { struct super_block *sb = rq->sb; - struct z_erofs_stream_dctx dctx = { - .rq = rq, - .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT, - .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) - >> PAGE_SHIFT, - .no = -1, .ni = 0, - }; + struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 }; struct xz_buf buf = {}; struct z_erofs_lzma *strm; enum xz_ret xz_err; diff --git a/fs/erofs/decompressor_zstd.c b/fs/erofs/decompressor_zstd.c index 7e177304967e..b4bfe14229f9 100644 --- a/fs/erofs/decompressor_zstd.c +++ b/fs/erofs/decompressor_zstd.c @@ -139,13 +139,7 @@ static int z_erofs_zstd_decompress(struct z_erofs_decompress_req *rq, struct page **pgpl) { struct super_block *sb = rq->sb; - struct z_erofs_stream_dctx dctx = { - .rq = rq, - .inpages = PAGE_ALIGN(rq->inputsize) >> PAGE_SHIFT, - .outpages = PAGE_ALIGN(rq->pageofs_out + rq->outputsize) - >> PAGE_SHIFT, - .no = -1, .ni = 0, - }; + struct z_erofs_stream_dctx dctx = { .rq = rq, .no = -1, .ni = 0 }; zstd_in_buffer in_buf = { NULL, 0, 0 }; zstd_out_buffer out_buf = { NULL, 0, 0 }; struct z_erofs_zstd *strm; diff --git a/fs/erofs/dir.c b/fs/erofs/dir.c index c3b90abdee37..2fae209d0274 100644 --- a/fs/erofs/dir.c +++ b/fs/erofs/dir.c @@ -58,9 +58,9 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) struct erofs_dirent *de; unsigned int nameoff, maxsize; - de = erofs_bread(&buf, dbstart, EROFS_KMAP); + de = erofs_bread(&buf, dbstart, true); if (IS_ERR(de)) { - erofs_err(sb, "fail to readdir of logical block %u of nid %llu", + erofs_err(sb, "failed to readdir of logical block %llu of nid %llu", erofs_blknr(sb, dbstart), EROFS_I(dir)->nid); err = PTR_ERR(de); break; @@ -90,6 +90,11 @@ static int erofs_readdir(struct file *f, struct dir_context *ctx) ofs = 0; } erofs_put_metabuf(&buf); + if (EROFS_I(dir)->dot_omitted && ctx->pos == dir->i_size) { + if (!dir_emit_dot(f, ctx)) + return 0; + ++ctx->pos; + } return err < 0 ? 
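/*
 * Editor's note (sketch, not from the patch): z_erofs_deflate_decompress()
 * above shows the accelerator-first convention -- the crypto path returns
 * -EOPNOTSUPP when no engine is bound, and only that value falls through
 * to the software decoder; genuine failures propagate as-is. The same
 * shape in miniature, with hypothetical hw_decode()/sw_decode() helpers:
 *
 *	static int decode(struct req *r)
 *	{
 *		int err = hw_decode(r);		// -EOPNOTSUPP if unbound
 *
 *		if (err != -EOPNOTSUPP)
 *			return err;		// success or a hard error
 *		return sw_decode(r);		// portable software fallback
 *	}
 */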
err : 0; } diff --git a/fs/erofs/erofs_fs.h b/fs/erofs/erofs_fs.h index 199395ed1c1f..767fb4acdc93 100644 --- a/fs/erofs/erofs_fs.h +++ b/fs/erofs/erofs_fs.h @@ -30,25 +30,19 @@ #define EROFS_FEATURE_INCOMPAT_FRAGMENTS 0x00000020 #define EROFS_FEATURE_INCOMPAT_DEDUPE 0x00000020 #define EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES 0x00000040 +#define EROFS_FEATURE_INCOMPAT_48BIT 0x00000080 #define EROFS_ALL_FEATURE_INCOMPAT \ - (EROFS_FEATURE_INCOMPAT_ZERO_PADDING | \ - EROFS_FEATURE_INCOMPAT_COMPR_CFGS | \ - EROFS_FEATURE_INCOMPAT_BIG_PCLUSTER | \ - EROFS_FEATURE_INCOMPAT_CHUNKED_FILE | \ - EROFS_FEATURE_INCOMPAT_DEVICE_TABLE | \ - EROFS_FEATURE_INCOMPAT_COMPR_HEAD2 | \ - EROFS_FEATURE_INCOMPAT_ZTAILPACKING | \ - EROFS_FEATURE_INCOMPAT_FRAGMENTS | \ - EROFS_FEATURE_INCOMPAT_DEDUPE | \ - EROFS_FEATURE_INCOMPAT_XATTR_PREFIXES) + ((EROFS_FEATURE_INCOMPAT_48BIT << 1) - 1) #define EROFS_SB_EXTSLOT_SIZE 16 struct erofs_deviceslot { u8 tag[64]; /* digest(sha256), etc. */ - __le32 blocks; /* total fs blocks of this device */ - __le32 mapped_blkaddr; /* map starting at mapped_blkaddr */ - u8 reserved[56]; + __le32 blocks_lo; /* total blocks count of this device */ + __le32 uniaddr_lo; /* unified starting block of this device */ + __le32 blocks_hi; /* total blocks count MSB */ + __le16 uniaddr_hi; /* unified starting block MSB */ + u8 reserved[50]; }; #define EROFS_DEVT_SLOT_SIZE sizeof(struct erofs_deviceslot) @@ -59,13 +53,14 @@ struct erofs_super_block { __le32 feature_compat; __u8 blkszbits; /* filesystem block size in bit shift */ __u8 sb_extslots; /* superblock size = 128 + sb_extslots * 16 */ - - __le16 root_nid; /* nid of root directory */ + union { + __le16 rootnid_2b; /* nid of root directory */ + __le16 blocks_hi; /* (48BIT on) blocks count MSB */ + } __packed rb; __le64 inos; /* total valid ino # (== f_files - f_favail) */ - - __le64 build_time; /* compact inode time derivation */ - __le32 build_time_nsec; /* compact inode time derivation in ns scale */ - __le32 blocks; /* used for statfs */ + __le64 epoch; /* base seconds used for compact inodes */ + __le32 fixed_nsec; /* fixed nanoseconds for compact inodes */ + __le32 blocks_lo; /* blocks count LSB */ __le32 meta_blkaddr; /* start block address of metadata area */ __le32 xattr_blkaddr; /* start block address of shared xattr area */ __u8 uuid[16]; /* 128-bit uuid for volume */ @@ -84,7 +79,10 @@ struct erofs_super_block { __le32 xattr_prefix_start; /* start of long xattr prefixes */ __le64 packed_nid; /* nid of the special packed inode */ __u8 xattr_filter_reserved; /* reserved for xattr name filter */ - __u8 reserved2[23]; + __u8 reserved[3]; + __le32 build_time; /* seconds added to epoch for mkfs time */ + __le64 rootnid_8b; /* (48BIT on) nid of root directory */ + __u8 reserved2[8]; }; /* @@ -115,19 +113,19 @@ static inline bool erofs_inode_is_data_compressed(unsigned int datamode) #define EROFS_I_VERSION_MASK 0x01 #define EROFS_I_DATALAYOUT_MASK 0x07 -#define EROFS_I_VERSION_BIT 0 -#define EROFS_I_DATALAYOUT_BIT 1 -#define EROFS_I_ALL_BIT 4 - -#define EROFS_I_ALL ((1 << EROFS_I_ALL_BIT) - 1) +#define EROFS_I_VERSION_BIT 0 +#define EROFS_I_DATALAYOUT_BIT 1 +#define EROFS_I_NLINK_1_BIT 4 /* non-directory compact inodes only */ +#define EROFS_I_DOT_OMITTED_BIT 4 /* (directories) omit the `.` dirent */ +#define EROFS_I_ALL ((1 << (EROFS_I_NLINK_1_BIT + 1)) - 1) /* indicate chunk blkbits, thus 'chunksize = blocksize << chunk blkbits' */ #define EROFS_CHUNK_FORMAT_BLKBITS_MASK 0x001F -/* with chunk indexes or just a 4-byte blkaddr array */ +/* 
with chunk indexes or just a 4-byte block array */ #define EROFS_CHUNK_FORMAT_INDEXES 0x0020 +#define EROFS_CHUNK_FORMAT_48BIT 0x0040 -#define EROFS_CHUNK_FORMAT_ALL \ - (EROFS_CHUNK_FORMAT_BLKBITS_MASK | EROFS_CHUNK_FORMAT_INDEXES) +#define EROFS_CHUNK_FORMAT_ALL ((EROFS_CHUNK_FORMAT_48BIT << 1) - 1) /* 32-byte on-disk inode */ #define EROFS_INODE_LAYOUT_COMPACT 0 @@ -140,45 +138,40 @@ struct erofs_inode_chunk_info { }; union erofs_inode_i_u { - /* total compressed blocks for compressed inodes */ - __le32 compressed_blocks; - - /* block address for uncompressed flat inodes */ - __le32 raw_blkaddr; - - /* for device files, used to indicate old/new device # */ - __le32 rdev; - - /* for chunk-based files, it contains the summary info */ + __le32 blocks_lo; /* total blocks count (if compressed inodes) */ + __le32 startblk_lo; /* starting block number (if flat inodes) */ + __le32 rdev; /* device ID (if special inodes) */ struct erofs_inode_chunk_info c; }; +union erofs_inode_i_nb { + __le16 nlink; /* if EROFS_I_NLINK_1_BIT is unset */ + __le16 blocks_hi; /* total blocks count MSB */ + __le16 startblk_hi; /* starting block number MSB */ +} __packed; + /* 32-byte reduced form of an ondisk inode */ struct erofs_inode_compact { __le16 i_format; /* inode format hints */ - -/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ __le16 i_xattr_icount; __le16 i_mode; - __le16 i_nlink; + union erofs_inode_i_nb i_nb; __le32 i_size; - __le32 i_reserved; + __le32 i_mtime; union erofs_inode_i_u i_u; __le32 i_ino; /* only used for 32-bit stat compatibility */ __le16 i_uid; __le16 i_gid; - __le32 i_reserved2; + __le32 i_reserved; }; /* 64-byte complete form of an ondisk inode */ struct erofs_inode_extended { __le16 i_format; /* inode format hints */ - -/* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ __le16 i_xattr_icount; __le16 i_mode; - __le16 i_reserved; + union erofs_inode_i_nb i_nb; __le64 i_size; union erofs_inode_i_u i_u; @@ -248,6 +241,7 @@ static inline unsigned int erofs_xattr_ibody_size(__le16 i_xattr_icount) if (!i_xattr_icount) return 0; + /* 1 header + n-1 * 4 bytes inline xattr to keep continuity */ return sizeof(struct erofs_xattr_ibody_header) + sizeof(__u32) * (le16_to_cpu(i_xattr_icount) - 1); } @@ -266,11 +260,11 @@ static inline unsigned int erofs_xattr_entry_size(struct erofs_xattr_entry *e) /* 4-byte block address array */ #define EROFS_BLOCK_MAP_ENTRY_SIZE sizeof(__le32) -/* 8-byte inode chunk indexes */ +/* 8-byte inode chunk index */ struct erofs_inode_chunk_index { - __le16 advise; /* always 0, don't care for now */ + __le16 startblk_hi; /* starting block number MSB */ __le16 device_id; /* back-end storage id (with bits masked) */ - __le32 blkaddr; /* start block address of this inode chunk */ + __le32 startblk_lo; /* starting block number of this chunk */ }; /* dirent sorts in alphabet order, thus we can do binary search */ @@ -337,21 +331,20 @@ struct z_erofs_zstd_cfgs { #define Z_EROFS_ZSTD_MAX_DICT_SIZE Z_EROFS_PCLUSTER_MAX_SIZE /* - * bit 0 : COMPACTED_2B indexes (0 - off; 1 - on) - * e.g. for 4k logical cluster size, 4B if compacted 2B is off; - * (4B) + 2B + (4B) if compacted 2B is on. 
- * bit 1 : HEAD1 big pcluster (0 - off; 1 - on) - * bit 2 : HEAD2 big pcluster (0 - off; 1 - on) - * bit 3 : tailpacking inline pcluster (0 - off; 1 - on) - * bit 4 : interlaced plain pcluster (0 - off; 1 - on) - * bit 5 : fragment pcluster (0 - off; 1 - on) + * Enable COMPACTED_2B for EROFS_INODE_COMPRESSED_COMPACT inodes: + * 4B (disabled) vs 4B+2B+4B (enabled) */ #define Z_EROFS_ADVISE_COMPACTED_2B 0x0001 +/* Enable extent metadata for EROFS_INODE_COMPRESSED_FULL inodes */ +#define Z_EROFS_ADVISE_EXTENTS 0x0001 #define Z_EROFS_ADVISE_BIG_PCLUSTER_1 0x0002 #define Z_EROFS_ADVISE_BIG_PCLUSTER_2 0x0004 #define Z_EROFS_ADVISE_INLINE_PCLUSTER 0x0008 #define Z_EROFS_ADVISE_INTERLACED_PCLUSTER 0x0010 #define Z_EROFS_ADVISE_FRAGMENT_PCLUSTER 0x0020 +/* Indicate the record size for each extent if extent metadata is used */ +#define Z_EROFS_ADVISE_EXTRECSZ_BIT 1 +#define Z_EROFS_ADVISE_EXTRECSZ_MASK 0x3 #define Z_EROFS_FRAGMENT_INODE_BIT 7 struct z_erofs_map_header { @@ -363,45 +356,24 @@ struct z_erofs_map_header { /* indicates the encoded size of tailpacking data */ __le16 h_idata_size; }; + __le32 h_extents_lo; /* extent count LSB */ }; __le16 h_advise; - /* - * bit 0-3 : algorithm type of head 1 (logical cluster type 01); - * bit 4-7 : algorithm type of head 2 (logical cluster type 11). - */ - __u8 h_algorithmtype; - /* - * bit 0-2 : logical cluster bits - 12, e.g. 0 for 4096; - * bit 3-6 : reserved; - * bit 7 : move the whole file into packed inode or not. - */ - __u8 h_clusterbits; + union { + struct { + /* algorithm type (bit 0-3: HEAD1; bit 4-7: HEAD2) */ + __u8 h_algorithmtype; + /* + * bit 0-3 : logical cluster bits - blkszbits + * bit 4-6 : reserved + * bit 7 : pack the whole file into packed inode + */ + __u8 h_clusterbits; + } __packed; + __le16 h_extents_hi; /* extent count MSB */ + } __packed; }; -/* - * On-disk logical cluster type: - * 0 - literal (uncompressed) lcluster - * 1,3 - compressed lcluster (for HEAD lclusters) - * 2 - compressed lcluster (for NONHEAD lclusters) - * - * In detail, - * 0 - literal (uncompressed) lcluster, - * di_advise = 0 - * di_clusterofs = the literal data offset of the lcluster - * di_blkaddr = the blkaddr of the literal pcluster - * - * 1,3 - compressed lcluster (for HEAD lclusters) - * di_advise = 1 or 3 - * di_clusterofs = the decompressed data offset of the lcluster - * di_blkaddr = the blkaddr of the compressed pcluster - * - * 2 - compressed lcluster (for NONHEAD lclusters) - * di_advise = 2 - * di_clusterofs = - * the decompressed data offset in its own HEAD lcluster - * di_u.delta[0] = distance to this HEAD lcluster - * di_u.delta[1] = distance to the next HEAD lcluster - */ enum { Z_EROFS_LCLUSTER_TYPE_PLAIN = 0, Z_EROFS_LCLUSTER_TYPE_HEAD1 = 1, @@ -415,11 +387,7 @@ enum { /* (noncompact only, HEAD) This pcluster refers to partial decompressed data */ #define Z_EROFS_LI_PARTIAL_REF (1 << 15) -/* - * D0_CBLKCNT will be marked _only_ at the 1st non-head lcluster to store the - * compressed block count of a compressed extent (in logical clusters, aka. - * block count of a pcluster). 
- */ +/* Set on 1st non-head lcluster to store compressed block count (in blocks) */ #define Z_EROFS_LI_D0_CBLKCNT (1 << 11) struct z_erofs_lcluster_index { @@ -428,19 +396,36 @@ struct z_erofs_lcluster_index { __le16 di_clusterofs; union { - /* for the HEAD lclusters */ - __le32 blkaddr; + __le32 blkaddr; /* for the HEAD lclusters */ /* - * for the NONHEAD lclusters * [0] - distance to its HEAD lcluster * [1] - distance to the next HEAD lcluster */ - __le16 delta[2]; + __le16 delta[2]; /* for the NONHEAD lclusters */ } di_u; }; -#define Z_EROFS_FULL_INDEX_ALIGN(end) \ (ALIGN(end, 8) + sizeof(struct z_erofs_map_header) + 8) +#define Z_EROFS_MAP_HEADER_END(end) \ (ALIGN(end, 8) + sizeof(struct z_erofs_map_header)) +#define Z_EROFS_FULL_INDEX_START(end) (Z_EROFS_MAP_HEADER_END(end) + 8) + +#define Z_EROFS_EXTENT_PLEN_PARTIAL BIT(27) +#define Z_EROFS_EXTENT_PLEN_FMT_BIT 28 +#define Z_EROFS_EXTENT_PLEN_MASK ((Z_EROFS_PCLUSTER_MAX_SIZE << 1) - 1) +struct z_erofs_extent { __le32 plen; /* encoded length */ __le32 pstart_lo; /* physical offset */ __le32 pstart_hi; /* physical offset MSB */ __le32 lstart_lo; /* logical offset */ __le32 lstart_hi; /* logical offset MSB (>= 4GiB inodes) */ __u8 reserved[12]; /* for future use */ }; + +static inline int z_erofs_extent_recsize(unsigned int advise) { return 4 << ((advise >> Z_EROFS_ADVISE_EXTRECSZ_BIT) & Z_EROFS_ADVISE_EXTRECSZ_MASK); } /* check the EROFS on-disk layout strictly at compile time */ static inline void erofs_check_ondisk_layout_definitions(void) diff --git a/fs/erofs/fileio.c b/fs/erofs/fileio.c index 0ffd1c63beeb..7d81f504bff0 100644 --- a/fs/erofs/fileio.c +++ b/fs/erofs/fileio.c @@ -32,6 +32,8 @@ static void erofs_fileio_ki_complete(struct kiocb *iocb, long ret) ret = 0; } if (rq->bio.bi_end_io) { + if (ret < 0 && !rq->bio.bi_status) + rq->bio.bi_status = errno_to_blk_status(ret); rq->bio.bi_end_io(&rq->bio); } else { bio_for_each_folio_all(fi, &rq->bio) { @@ -112,7 +114,7 @@ static int erofs_fileio_scan_folio(struct erofs_fileio *io, struct folio *folio) void *src; src = erofs_read_metabuf(&buf, inode->i_sb, - map->m_pa + ofs, EROFS_KMAP); + map->m_pa + ofs, true); if (IS_ERR(src)) { err = PTR_ERR(src); break; @@ -145,13 +147,14 @@ io_retry: if (err) break; io->rq = erofs_fileio_rq_alloc(&io->dev); - io->rq->bio.bi_iter.bi_sector = io->dev.m_pa >> 9; + io->rq->bio.bi_iter.bi_sector = + (io->dev.m_dif->fsoff + io->dev.m_pa) >> 9; attached = 0; } - if (!attached++) - erofs_onlinefolio_split(folio); if (!bio_add_folio(&io->rq->bio, folio, len, cur)) goto io_retry; + if (!attached++) + erofs_onlinefolio_split(folio); io->dev.m_pa += len; } cur += len; @@ -178,7 +181,7 @@ static void erofs_fileio_readahead(struct readahead_control *rac) struct folio *folio; int err; - trace_erofs_readpages(inode, readahead_index(rac), + trace_erofs_readahead(inode, readahead_index(rac), readahead_count(rac), true); while ((folio = readahead_folio(rac))) { err = erofs_fileio_scan_folio(&io, folio); diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index ce3d8737df85..9c9129bca346 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -276,7 +276,7 @@ static int erofs_fscache_data_read_slice(struct erofs_fscache_rq *req) size_t size = map.m_llen; void *src; - src = erofs_read_metabuf(&buf, sb, map.m_pa, EROFS_KMAP); + src = erofs_read_metabuf(&buf, sb, map.m_pa, true); if (IS_ERR(src)) return PTR_ERR(src); diff --git a/fs/erofs/inode.c b/fs/erofs/inode.c index d4b89407822a..a0ae0b4f7b01 100644 --- a/fs/erofs/inode.c +++ 
b/fs/erofs/inode.c @@ -27,29 +27,27 @@ static int erofs_fill_symlink(struct inode *inode, void *kaddr, static int erofs_read_inode(struct inode *inode) { struct super_block *sb = inode->i_sb; + erofs_blk_t blkaddr = erofs_blknr(sb, erofs_iloc(inode)); + unsigned int ofs = erofs_blkoff(sb, erofs_iloc(inode)); + struct erofs_buf buf = __EROFS_BUF_INITIALIZER; struct erofs_sb_info *sbi = EROFS_SB(sb); + erofs_blk_t addrmask = BIT_ULL(48) - 1; struct erofs_inode *vi = EROFS_I(inode); - const erofs_off_t inode_loc = erofs_iloc(inode); - erofs_blk_t blkaddr, nblks = 0; - void *kaddr; + struct erofs_inode_extended *die, copied; struct erofs_inode_compact *dic; - struct erofs_inode_extended *die, *copied = NULL; - union erofs_inode_i_u iu; - struct erofs_buf buf = __EROFS_BUF_INITIALIZER; - unsigned int ifmt, ofs; + unsigned int ifmt; + void *ptr; int err = 0; - blkaddr = erofs_blknr(sb, inode_loc); - ofs = erofs_blkoff(sb, inode_loc); - - kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), EROFS_KMAP); - if (IS_ERR(kaddr)) { - erofs_err(sb, "failed to get inode (nid: %llu) page, err %ld", - vi->nid, PTR_ERR(kaddr)); - return PTR_ERR(kaddr); + ptr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr), true); + if (IS_ERR(ptr)) { + err = PTR_ERR(ptr); + erofs_err(sb, "failed to get inode (nid: %llu) page, err %d", + vi->nid, err); + goto err_out; } - dic = kaddr + ofs; + dic = ptr + ofs; ifmt = le16_to_cpu(dic->i_format); if (ifmt & ~EROFS_I_ALL) { erofs_err(sb, "unsupported i_format %u of nid %llu", @@ -73,40 +71,34 @@ static int erofs_read_inode(struct inode *inode) if (ofs + vi->inode_isize <= sb->s_blocksize) { ofs += vi->inode_isize; die = (struct erofs_inode_extended *)dic; + copied.i_u = die->i_u; + copied.i_nb = die->i_nb; } else { const unsigned int gotten = sb->s_blocksize - ofs; - copied = kmalloc(vi->inode_isize, GFP_KERNEL); - if (!copied) { - err = -ENOMEM; + memcpy(&copied, dic, gotten); + ptr = erofs_read_metabuf(&buf, sb, + erofs_pos(sb, blkaddr + 1), true); + if (IS_ERR(ptr)) { + err = PTR_ERR(ptr); + erofs_err(sb, "failed to get inode payload block (nid: %llu), err %d", + vi->nid, err); goto err_out; } - memcpy(copied, dic, gotten); - kaddr = erofs_read_metabuf(&buf, sb, erofs_pos(sb, blkaddr + 1), - EROFS_KMAP); - if (IS_ERR(kaddr)) { - erofs_err(sb, "failed to get inode payload block (nid: %llu), err %ld", - vi->nid, PTR_ERR(kaddr)); - kfree(copied); - return PTR_ERR(kaddr); - } ofs = vi->inode_isize - gotten; - memcpy((u8 *)copied + gotten, kaddr, ofs); - die = copied; + memcpy((u8 *)&copied + gotten, ptr, ofs); + die = &copied; } vi->xattr_isize = erofs_xattr_ibody_size(die->i_xattr_icount); inode->i_mode = le16_to_cpu(die->i_mode); - iu = die->i_u; i_uid_write(inode, le32_to_cpu(die->i_uid)); i_gid_write(inode, le32_to_cpu(die->i_gid)); set_nlink(inode, le32_to_cpu(die->i_nlink)); - /* each extended inode has its own timestamp */ - inode_set_ctime(inode, le64_to_cpu(die->i_mtime), + inode_set_mtime(inode, le64_to_cpu(die->i_mtime), le32_to_cpu(die->i_mtime_nsec)); inode->i_size = le64_to_cpu(die->i_size); - kfree(copied); break; case EROFS_INODE_LAYOUT_COMPACT: vi->inode_isize = sizeof(struct erofs_inode_compact); @@ -114,12 +106,20 @@ static int erofs_read_inode(struct inode *inode) vi->xattr_isize = erofs_xattr_ibody_size(dic->i_xattr_icount); inode->i_mode = le16_to_cpu(dic->i_mode); - iu = dic->i_u; + copied.i_u = dic->i_u; i_uid_write(inode, le16_to_cpu(dic->i_uid)); i_gid_write(inode, le16_to_cpu(dic->i_gid)); - set_nlink(inode, le16_to_cpu(dic->i_nlink)); - 
/* use build time for compact inodes */ - inode_set_ctime(inode, sbi->build_time, sbi->build_time_nsec); + if (!S_ISDIR(inode->i_mode) && + ((ifmt >> EROFS_I_NLINK_1_BIT) & 1)) { + set_nlink(inode, 1); + copied.i_nb = dic->i_nb; + } else { + set_nlink(inode, le16_to_cpu(dic->i_nb.nlink)); + copied.i_nb.startblk_hi = 0; + addrmask = BIT_ULL(32) - 1; + } + inode_set_mtime(inode, sbi->epoch + le32_to_cpu(dic->i_mtime), + sbi->fixed_nsec); inode->i_size = le32_to_cpu(dic->i_size); break; @@ -136,19 +136,26 @@ static int erofs_read_inode(struct inode *inode) goto err_out; } switch (inode->i_mode & S_IFMT) { - case S_IFREG: case S_IFDIR: + vi->dot_omitted = (ifmt >> EROFS_I_DOT_OMITTED_BIT) & 1; + fallthrough; + case S_IFREG: case S_IFLNK: - vi->raw_blkaddr = le32_to_cpu(iu.raw_blkaddr); + vi->startblk = le32_to_cpu(copied.i_u.startblk_lo) | + ((u64)le16_to_cpu(copied.i_nb.startblk_hi) << 32); + if (vi->datalayout == EROFS_INODE_FLAT_PLAIN && + !((vi->startblk ^ EROFS_NULL_ADDR) & addrmask)) + vi->startblk = EROFS_NULL_ADDR; + if(S_ISLNK(inode->i_mode)) { - err = erofs_fill_symlink(inode, kaddr, ofs); + err = erofs_fill_symlink(inode, ptr, ofs); if (err) goto err_out; } break; case S_IFCHR: case S_IFBLK: - inode->i_rdev = new_decode_dev(le32_to_cpu(iu.rdev)); + inode->i_rdev = new_decode_dev(le32_to_cpu(copied.i_u.rdev)); break; case S_IFIFO: case S_IFSOCK: @@ -161,12 +168,15 @@ static int erofs_read_inode(struct inode *inode) goto err_out; } - /* total blocks for compressed files */ - if (erofs_inode_is_data_compressed(vi->datalayout)) { - nblks = le32_to_cpu(iu.compressed_blocks); - } else if (vi->datalayout == EROFS_INODE_CHUNK_BASED) { + if (erofs_inode_is_data_compressed(vi->datalayout)) + inode->i_blocks = le32_to_cpu(copied.i_u.blocks_lo) << + (sb->s_blocksize_bits - 9); + else + inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9; + + if (vi->datalayout == EROFS_INODE_CHUNK_BASED) { /* fill chunked inode summary info */ - vi->chunkformat = le16_to_cpu(iu.c.format); + vi->chunkformat = le16_to_cpu(copied.i_u.c.format); if (vi->chunkformat & ~EROFS_CHUNK_FORMAT_ALL) { erofs_err(sb, "unsupported chunk format %x of nid %llu", vi->chunkformat, vi->nid); @@ -176,22 +186,15 @@ static int erofs_read_inode(struct inode *inode) vi->chunkbits = sb->s_blocksize_bits + (vi->chunkformat & EROFS_CHUNK_FORMAT_BLKBITS_MASK); } - inode_set_mtime_to_ts(inode, - inode_set_atime_to_ts(inode, inode_get_ctime(inode))); + inode_set_atime_to_ts(inode, + inode_set_ctime_to_ts(inode, inode_get_mtime(inode))); inode->i_flags &= ~S_DAX; if (test_opt(&sbi->opt, DAX_ALWAYS) && S_ISREG(inode->i_mode) && (vi->datalayout == EROFS_INODE_FLAT_PLAIN || vi->datalayout == EROFS_INODE_CHUNK_BASED)) inode->i_flags |= S_DAX; - - if (!nblks) - /* measure inode.i_blocks as generic filesystems */ - inode->i_blocks = round_up(inode->i_size, sb->s_blocksize) >> 9; - else - inode->i_blocks = nblks << (sb->s_blocksize_bits - 9); err_out: - DBG_BUGON(err); erofs_put_metabuf(&buf); return err; } @@ -202,13 +205,10 @@ static int erofs_fill_inode(struct inode *inode) int err; trace_erofs_fill_inode(inode); - - /* read inode base data from disk */ err = erofs_read_inode(inode); if (err) return err; - /* setup the new inode */ switch (inode->i_mode & S_IFMT) { case S_IFREG: inode->i_op = &erofs_generic_iops; @@ -229,15 +229,10 @@ static int erofs_fill_inode(struct inode *inode) inode->i_op = &erofs_symlink_iops; inode_nohighmem(inode); break; - case S_IFCHR: - case S_IFBLK: - case S_IFIFO: - case S_IFSOCK: + default: inode->i_op 
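/*
 * Editor's note (worked example, not part of the patch): under the 48-bit
 * layout a start block splits into a 32-bit low word and a 16-bit high
 * word, recombined as in erofs_read_inode() above:
 *
 *	u64 startblk = le32_to_cpu(lo) | ((u64)le16_to_cpu(hi) << 32);
 *
 * e.g. lo = 0x89abcdef and hi = 0x4567 yield block 0x456789abcdef.
 * Compact inodes that reuse i_nb as nlink force the high word to zero and
 * narrow addrmask to 32 bits, so a legacy all-ones FLAT_PLAIN blkaddr
 * still compares equal to EROFS_NULL_ADDR under the mask.
 */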
= &erofs_generic_iops; init_special_inode(inode, inode->i_mode, inode->i_rdev); return 0; - default: - return -EFSCORRUPTED; } mapping_set_large_folios(inode->i_mapping); diff --git a/fs/erofs/internal.h b/fs/erofs/internal.h index 686d835eb533..a32c03a80c70 100644 --- a/fs/erofs/internal.h +++ b/fs/erofs/internal.h @@ -37,18 +37,17 @@ __printf(2, 3) void _erofs_printk(struct super_block *sb, const char *fmt, ...); typedef u64 erofs_nid_t; typedef u64 erofs_off_t; -/* data type for filesystem-wide blocks number */ -typedef u32 erofs_blk_t; +typedef u64 erofs_blk_t; struct erofs_device_info { char *path; struct erofs_fscache *fscache; struct file *file; struct dax_device *dax_dev; - u64 dax_part_off; + u64 fsoff, dax_part_off; - u32 blocks; - u32 mapped_blkaddr; + erofs_blk_t blocks; + erofs_blk_t uniaddr; }; enum { @@ -143,8 +142,8 @@ struct erofs_sb_info { unsigned char blkszbits; /* filesystem block size in bit shift */ u32 sb_size; /* total superblock size */ - u32 build_time_nsec; - u64 build_time; + u32 fixed_nsec; + s64 epoch; /* what we really care is nid, rather than ino.. */ erofs_nid_t root_nid; @@ -152,8 +151,6 @@ struct erofs_sb_info { /* used for statfs, f_files - f_favail */ u64 inos; - u8 uuid[16]; /* 128-bit uuid for volume */ - u8 volume_name[16]; /* volume name */ u32 feature_compat; u32 feature_incompat; @@ -199,21 +196,17 @@ enum { EROFS_ZIP_CACHE_READAROUND }; -enum erofs_kmap_type { - EROFS_NO_KMAP, /* don't map the buffer */ - EROFS_KMAP, /* use kmap_local_page() to map the buffer */ -}; - struct erofs_buf { struct address_space *mapping; struct file *file; + u64 off; struct page *page; void *base; }; #define __EROFS_BUF_INITIALIZER ((struct erofs_buf){ .page = NULL }) -#define erofs_blknr(sb, addr) ((erofs_blk_t)((addr) >> (sb)->s_blocksize_bits)) -#define erofs_blkoff(sb, addr) ((addr) & ((sb)->s_blocksize - 1)) +#define erofs_blknr(sb, pos) ((erofs_blk_t)((pos) >> (sb)->s_blocksize_bits)) +#define erofs_blkoff(sb, pos) ((pos) & ((sb)->s_blocksize - 1)) #define erofs_pos(sb, blk) ((erofs_off_t)(blk) << (sb)->s_blocksize_bits) #define erofs_iblks(i) (round_up((i)->i_size, i_blocksize(i)) >> (i)->i_blkbits) @@ -233,6 +226,7 @@ EROFS_FEATURE_FUNCS(ztailpacking, incompat, INCOMPAT_ZTAILPACKING) EROFS_FEATURE_FUNCS(fragments, incompat, INCOMPAT_FRAGMENTS) EROFS_FEATURE_FUNCS(dedupe, incompat, INCOMPAT_DEDUPE) EROFS_FEATURE_FUNCS(xattr_prefixes, incompat, INCOMPAT_XATTR_PREFIXES) +EROFS_FEATURE_FUNCS(48bit, incompat, INCOMPAT_48BIT) EROFS_FEATURE_FUNCS(sb_chksum, compat, COMPAT_SB_CHKSUM) EROFS_FEATURE_FUNCS(xattr_filter, compat, COMPAT_XATTR_FILTER) @@ -252,6 +246,7 @@ struct erofs_inode { unsigned char datalayout; unsigned char inode_isize; + bool dot_omitted; unsigned int xattr_isize; unsigned int xattr_name_filter; @@ -259,7 +254,7 @@ struct erofs_inode { unsigned int *xattr_shared_xattrs; union { - erofs_blk_t raw_blkaddr; + erofs_blk_t startblk; struct { unsigned short chunkformat; unsigned char chunkbits; @@ -268,15 +263,13 @@ struct erofs_inode { struct { unsigned short z_advise; unsigned char z_algorithmtype[2]; - unsigned char z_logical_clusterbits; - unsigned long z_tailextent_headlcn; + unsigned char z_lclusterbits; union { - struct { - erofs_off_t z_idataoff; - unsigned short z_idata_size; - }; - erofs_off_t z_fragmentoff; + u64 z_tailextent_headlcn; + u64 z_extents; }; + erofs_off_t z_fragmentoff; + unsigned short z_idata_size; }; #endif /* CONFIG_EROFS_FS_ZIP */ }; @@ -387,11 +380,10 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf 
*buf, erofs_off_t *offset, int *lengthp); void erofs_unmap_metabuf(struct erofs_buf *buf); void erofs_put_metabuf(struct erofs_buf *buf); -void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, - enum erofs_kmap_type type); +void *erofs_bread(struct erofs_buf *buf, erofs_off_t offset, bool need_kmap); void erofs_init_metabuf(struct erofs_buf *buf, struct super_block *sb); void *erofs_read_metabuf(struct erofs_buf *buf, struct super_block *sb, - erofs_off_t offset, enum erofs_kmap_type type); + erofs_off_t offset, bool need_kmap); int erofs_map_dev(struct super_block *sb, struct erofs_map_dev *dev); int erofs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, u64 start, u64 len); @@ -448,6 +440,7 @@ int __init erofs_init_shrinker(void); void erofs_exit_shrinker(void); int __init z_erofs_init_subsystem(void); void z_erofs_exit_subsystem(void); +int z_erofs_init_super(struct super_block *sb); unsigned long z_erofs_shrink_scan(struct erofs_sb_info *sbi, unsigned long nr_shrink); int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, @@ -457,7 +450,6 @@ void z_erofs_put_gbuf(void *ptr); int z_erofs_gbuf_growsize(unsigned int nrpages); int __init z_erofs_gbuf_init(void); void z_erofs_gbuf_exit(void); -int erofs_init_managed_cache(struct super_block *sb); int z_erofs_parse_cfgs(struct super_block *sb, struct erofs_super_block *dsb); #else static inline void erofs_shrinker_register(struct super_block *sb) {} @@ -466,7 +458,7 @@ static inline int erofs_init_shrinker(void) { return 0; } static inline void erofs_exit_shrinker(void) {} static inline int z_erofs_init_subsystem(void) { return 0; } static inline void z_erofs_exit_subsystem(void) {} -static inline int erofs_init_managed_cache(struct super_block *sb) { return 0; } +static inline int z_erofs_init_super(struct super_block *sb) { return 0; } #endif /* !CONFIG_EROFS_FS_ZIP */ #ifdef CONFIG_EROFS_FS_BACKED_BY_FILE diff --git a/fs/erofs/namei.c b/fs/erofs/namei.c index c94d0c1608a8..f7cf4f41af28 100644 --- a/fs/erofs/namei.c +++ b/fs/erofs/namei.c @@ -100,7 +100,7 @@ static void *erofs_find_target_block(struct erofs_buf *target, struct erofs_dirent *de; buf.mapping = dir->i_mapping; - de = erofs_bread(&buf, erofs_pos(dir->i_sb, mid), EROFS_KMAP); + de = erofs_bread(&buf, erofs_pos(dir->i_sb, mid), true); if (!IS_ERR(de)) { const int nameoff = nameoff_from_disk(de->nameoff, bsz); const int ndirents = nameoff / sizeof(*de); diff --git a/fs/erofs/super.c b/fs/erofs/super.c index 827b62665649..e1e9f06e8342 100644 --- a/fs/erofs/super.c +++ b/fs/erofs/super.c @@ -94,7 +94,7 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, int len, i, cnt; *offset = round_up(*offset, 4); - ptr = erofs_bread(buf, *offset, EROFS_KMAP); + ptr = erofs_bread(buf, *offset, true); if (IS_ERR(ptr)) return ptr; @@ -110,7 +110,7 @@ void *erofs_read_metadata(struct super_block *sb, struct erofs_buf *buf, for (i = 0; i < len; i += cnt) { cnt = min_t(int, sb->s_blocksize - erofs_blkoff(sb, *offset), len - i); - ptr = erofs_bread(buf, *offset, EROFS_KMAP); + ptr = erofs_bread(buf, *offset, true); if (IS_ERR(ptr)) { kfree(buffer); return ptr; @@ -141,7 +141,7 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, struct erofs_deviceslot *dis; struct file *file; - dis = erofs_read_metabuf(buf, sb, *pos, EROFS_KMAP); + dis = erofs_read_metabuf(buf, sb, *pos, true); if (IS_ERR(dis)) return PTR_ERR(dis); @@ -165,8 +165,11 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block 
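/*
 * Editor's note (usage sketch, not part of the patch): with the
 * erofs_kmap_type enum gone, metabuf readers now pass a plain bool --
 * true to get a kmapped pointer, false to merely pin the page (as the
 * inline pcluster path in zdata.c does). A hypothetical caller:
 *
 *	struct erofs_buf buf = __EROFS_BUF_INITIALIZER;
 *	void *ptr = erofs_read_metabuf(&buf, sb, pos, true);
 *
 *	if (IS_ERR(ptr))
 *		return PTR_ERR(ptr);
 *	// ... read metadata through ptr ...
 *	erofs_put_metabuf(&buf);
 */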
*sb, filp_open(dif->path, O_RDONLY | O_LARGEFILE, 0) : bdev_file_open_by_path(dif->path, BLK_OPEN_READ, sb->s_type, NULL); - if (IS_ERR(file)) + if (IS_ERR(file)) { + if (file == ERR_PTR(-ENOTBLK)) + return -EINVAL; return PTR_ERR(file); + } if (!erofs_is_fileio_mode(sbi)) { dif->dax_dev = fs_dax_get_by_bdev(file_bdev(file), @@ -178,8 +181,8 @@ static int erofs_init_device(struct erofs_buf *buf, struct super_block *sb, dif->file = file; } - dif->blocks = le32_to_cpu(dis->blocks); - dif->mapped_blkaddr = le32_to_cpu(dis->mapped_blkaddr); + dif->blocks = le32_to_cpu(dis->blocks_lo); + dif->uniaddr = le32_to_cpu(dis->uniaddr_lo); sbi->total_blocks += dif->blocks; *pos += EROFS_DEVT_SLOT_SIZE; return 0; @@ -255,7 +258,7 @@ static int erofs_read_superblock(struct super_block *sb) void *data; int ret; - data = erofs_read_metabuf(&buf, sb, 0, EROFS_KMAP); + data = erofs_read_metabuf(&buf, sb, 0, true); if (IS_ERR(data)) { erofs_err(sb, "cannot read erofs superblock"); return PTR_ERR(data); @@ -268,7 +271,7 @@ static int erofs_read_superblock(struct super_block *sb) goto out; } - sbi->blkszbits = dsb->blkszbits; + sbi->blkszbits = dsb->blkszbits; if (sbi->blkszbits < 9 || sbi->blkszbits > PAGE_SHIFT) { erofs_err(sb, "blkszbits %u isn't supported", sbi->blkszbits); goto out; @@ -299,7 +302,7 @@ static int erofs_read_superblock(struct super_block *sb) sbi->sb_size); goto out; } - sbi->dif0.blocks = le32_to_cpu(dsb->blocks); + sbi->dif0.blocks = le32_to_cpu(dsb->blocks_lo); sbi->meta_blkaddr = le32_to_cpu(dsb->meta_blkaddr); #ifdef CONFIG_EROFS_FS_XATTR sbi->xattr_blkaddr = le32_to_cpu(dsb->xattr_blkaddr); @@ -308,23 +311,20 @@ static int erofs_read_superblock(struct super_block *sb) sbi->xattr_filter_reserved = dsb->xattr_filter_reserved; #endif sbi->islotbits = ilog2(sizeof(struct erofs_inode_compact)); - sbi->root_nid = le16_to_cpu(dsb->root_nid); + if (erofs_sb_has_48bit(sbi) && dsb->rootnid_8b) { + sbi->root_nid = le64_to_cpu(dsb->rootnid_8b); + sbi->dif0.blocks = (sbi->dif0.blocks << 32) | + le16_to_cpu(dsb->rb.blocks_hi); + } else { + sbi->root_nid = le16_to_cpu(dsb->rb.rootnid_2b); + } sbi->packed_nid = le64_to_cpu(dsb->packed_nid); sbi->inos = le64_to_cpu(dsb->inos); - sbi->build_time = le64_to_cpu(dsb->build_time); - sbi->build_time_nsec = le32_to_cpu(dsb->build_time_nsec); - + sbi->epoch = (s64)le64_to_cpu(dsb->epoch); + sbi->fixed_nsec = le32_to_cpu(dsb->fixed_nsec); super_set_uuid(sb, (void *)dsb->uuid, sizeof(dsb->uuid)); - ret = strscpy(sbi->volume_name, dsb->volume_name, - sizeof(dsb->volume_name)); - if (ret < 0) { /* -E2BIG */ - erofs_err(sb, "bad volume name without NIL terminator"); - ret = -EFSCORRUPTED; - goto out; - } - /* parse on-disk compression configurations */ ret = z_erofs_parse_cfgs(sb, dsb); if (ret < 0) @@ -333,6 +333,8 @@ static int erofs_read_superblock(struct super_block *sb) /* handle multiple devices */ ret = erofs_scan_devices(sb, dsb); + if (erofs_sb_has_48bit(sbi)) + erofs_info(sb, "EXPERIMENTAL 48-bit layout support in use. Use at your own risk!"); if (erofs_is_fscache_mode(sb)) erofs_info(sb, "[deprecated] fscache-based on-demand read feature in use. 
Use at your own risk!"); out: @@ -357,8 +359,7 @@ static void erofs_default_options(struct erofs_sb_info *sbi) enum { Opt_user_xattr, Opt_acl, Opt_cache_strategy, Opt_dax, Opt_dax_enum, - Opt_device, Opt_fsid, Opt_domain_id, Opt_directio, - Opt_err + Opt_device, Opt_fsid, Opt_domain_id, Opt_directio, Opt_fsoffset, }; static const struct constant_table erofs_param_cache_strategy[] = { @@ -385,6 +386,7 @@ static const struct fs_parameter_spec erofs_fs_parameters[] = { fsparam_string("fsid", Opt_fsid), fsparam_string("domain_id", Opt_domain_id), fsparam_flag_no("directio", Opt_directio), + fsparam_u64("fsoffset", Opt_fsoffset), {} }; @@ -508,28 +510,59 @@ static int erofs_fc_parse_param(struct fs_context *fc, errorfc(fc, "%s option not supported", erofs_fs_parameters[opt].name); #endif break; + case Opt_fsoffset: + sbi->dif0.fsoff = result.uint_64; + break; } return 0; } -static struct inode *erofs_nfs_get_inode(struct super_block *sb, - u64 ino, u32 generation) +static int erofs_encode_fh(struct inode *inode, u32 *fh, int *max_len, + struct inode *parent) { - return erofs_iget(sb, ino); + erofs_nid_t nid = EROFS_I(inode)->nid; + int len = parent ? 6 : 3; + + if (*max_len < len) { + *max_len = len; + return FILEID_INVALID; + } + + fh[0] = (u32)(nid >> 32); + fh[1] = (u32)(nid & 0xffffffff); + fh[2] = inode->i_generation; + + if (parent) { + nid = EROFS_I(parent)->nid; + + fh[3] = (u32)(nid >> 32); + fh[4] = (u32)(nid & 0xffffffff); + fh[5] = parent->i_generation; + } + + *max_len = len; + return parent ? FILEID_INO64_GEN_PARENT : FILEID_INO64_GEN; } static struct dentry *erofs_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { - return generic_fh_to_dentry(sb, fid, fh_len, fh_type, - erofs_nfs_get_inode); + if ((fh_type != FILEID_INO64_GEN && + fh_type != FILEID_INO64_GEN_PARENT) || fh_len < 3) + return NULL; + + return d_obtain_alias(erofs_iget(sb, + ((u64)fid->raw[0] << 32) | fid->raw[1])); } static struct dentry *erofs_fh_to_parent(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { - return generic_fh_to_parent(sb, fid, fh_len, fh_type, - erofs_nfs_get_inode); + if (fh_type != FILEID_INO64_GEN_PARENT || fh_len < 6) + return NULL; + + return d_obtain_alias(erofs_iget(sb, + ((u64)fid->raw[3] << 32) | fid->raw[4])); } static struct dentry *erofs_get_parent(struct dentry *child) @@ -545,7 +578,7 @@ static struct dentry *erofs_get_parent(struct dentry *child) } static const struct export_operations erofs_export_ops = { - .encode_fh = generic_encode_ino32_fh, + .encode_fh = erofs_encode_fh, .fh_to_dentry = erofs_fh_to_dentry, .fh_to_parent = erofs_fh_to_parent, .get_parent = erofs_get_parent, @@ -620,6 +653,14 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) } } + if (sbi->dif0.fsoff) { + if (sbi->dif0.fsoff & (sb->s_blocksize - 1)) + return invalfc(fc, "fsoffset %llu is not aligned to block size %lu", + sbi->dif0.fsoff, sb->s_blocksize); + if (erofs_is_fscache_mode(sb)) + return invalfc(fc, "cannot use fsoffset in fscache mode"); + } + if (test_opt(&sbi->opt, DAX_ALWAYS)) { if (!sbi->dif0.dax_dev) { errorfc(fc, "DAX unsupported by block device. 
Turning off DAX."); @@ -639,9 +680,16 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) else sb->s_flags &= ~SB_POSIXACL; -#ifdef CONFIG_EROFS_FS_ZIP - xa_init(&sbi->managed_pslots); -#endif + err = z_erofs_init_super(sb); + if (err) + return err; + + if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) { + inode = erofs_iget(sb, sbi->packed_nid); + if (IS_ERR(inode)) + return PTR_ERR(inode); + sbi->packed_inode = inode; + } inode = erofs_iget(sb, sbi->root_nid); if (IS_ERR(inode)) @@ -653,24 +701,11 @@ static int erofs_fc_fill_super(struct super_block *sb, struct fs_context *fc) iput(inode); return -EINVAL; } - sb->s_root = d_make_root(inode); if (!sb->s_root) return -ENOMEM; erofs_shrinker_register(sb); - if (erofs_sb_has_fragments(sbi) && sbi->packed_nid) { - sbi->packed_inode = erofs_iget(sb, sbi->packed_nid); - if (IS_ERR(sbi->packed_inode)) { - err = PTR_ERR(sbi->packed_inode); - sbi->packed_inode = NULL; - return err; - } - } - err = erofs_init_managed_cache(sb); - if (err) - return err; - err = erofs_xattr_prefixes_init(sb); if (err) return err; @@ -806,6 +841,16 @@ static int erofs_init_fs_context(struct fs_context *fc) return 0; } +static void erofs_drop_internal_inodes(struct erofs_sb_info *sbi) +{ + iput(sbi->packed_inode); + sbi->packed_inode = NULL; +#ifdef CONFIG_EROFS_FS_ZIP + iput(sbi->managed_cache); + sbi->managed_cache = NULL; +#endif +} + static void erofs_kill_sb(struct super_block *sb) { struct erofs_sb_info *sbi = EROFS_SB(sb); @@ -815,6 +860,7 @@ static void erofs_kill_sb(struct super_block *sb) kill_anon_super(sb); else kill_block_super(sb); + erofs_drop_internal_inodes(sbi); fs_put_dax(sbi->dif0.dax_dev, NULL); erofs_fscache_unregister_fs(sb); erofs_sb_free(sbi); @@ -825,17 +871,10 @@ static void erofs_put_super(struct super_block *sb) { struct erofs_sb_info *const sbi = EROFS_SB(sb); - DBG_BUGON(!sbi); - erofs_unregister_sysfs(sb); erofs_shrinker_unregister(sb); erofs_xattr_prefixes_cleanup(sb); -#ifdef CONFIG_EROFS_FS_ZIP - iput(sbi->managed_cache); - sbi->managed_cache = NULL; -#endif - iput(sbi->packed_inode); - sbi->packed_inode = NULL; + erofs_drop_internal_inodes(sbi); erofs_free_dev_context(sbi->devs); sbi->devs = NULL; erofs_fscache_unregister_fs(sb); @@ -951,6 +990,8 @@ static int erofs_show_options(struct seq_file *seq, struct dentry *root) if (sbi->domain_id) seq_printf(seq, ",domain_id=%s", sbi->domain_id); #endif + if (sbi->dif0.fsoff) + seq_printf(seq, ",fsoffset=%llu", sbi->dif0.fsoff); return 0; } diff --git a/fs/erofs/sysfs.c b/fs/erofs/sysfs.c index 19d586273b70..eed8797a193f 100644 --- a/fs/erofs/sysfs.c +++ b/fs/erofs/sysfs.c @@ -7,12 +7,14 @@ #include <linux/kobject.h> #include "internal.h" +#include "compress.h" enum { attr_feature, attr_drop_caches, attr_pointer_ui, attr_pointer_bool, + attr_accel, }; enum { @@ -60,14 +62,25 @@ static struct erofs_attr erofs_attr_##_name = { \ EROFS_ATTR_RW_UI(sync_decompress, erofs_mount_opts); EROFS_ATTR_FUNC(drop_caches, 0200); #endif +#ifdef CONFIG_EROFS_FS_ZIP_ACCEL +EROFS_ATTR_FUNC(accel, 0644); +#endif -static struct attribute *erofs_attrs[] = { +static struct attribute *erofs_sb_attrs[] = { #ifdef CONFIG_EROFS_FS_ZIP ATTR_LIST(sync_decompress), ATTR_LIST(drop_caches), #endif NULL, }; +ATTRIBUTE_GROUPS(erofs_sb); + +static struct attribute *erofs_attrs[] = { +#ifdef CONFIG_EROFS_FS_ZIP_ACCEL + ATTR_LIST(accel), +#endif + NULL, +}; ATTRIBUTE_GROUPS(erofs); /* Features this copy of erofs supports */ @@ -81,6 +94,7 @@ EROFS_ATTR_FEATURE(sb_chksum); 
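/*
 * Editor's note (hedged usage sketch): the "accel" attribute declared
 * above takes a newline-separated list of engine names (its store handler
 * follows below); writing "qat_deflate" binds the QAT deflate engine.
 * A user-space caller might look like this -- the /sys/fs/erofs/accel
 * path is inferred from the kobject layout in this file, so verify it on
 * a live system:
 */
#if 0	/* user-space illustration, not kernel code */
#include <fcntl.h>
#include <unistd.h>

static int erofs_enable_accel(const char *engine, size_t len)
{
	int fd = open("/sys/fs/erofs/accel", O_WRONLY);
	ssize_t n = fd >= 0 ? write(fd, engine, len) : -1;

	if (fd >= 0)
		close(fd);
	return n == (ssize_t)len ? 0 : -1;
}
#endif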
EROFS_ATTR_FEATURE(ztailpacking); EROFS_ATTR_FEATURE(fragments); EROFS_ATTR_FEATURE(dedupe); +EROFS_ATTR_FEATURE(48bit); static struct attribute *erofs_feat_attrs[] = { ATTR_LIST(zero_padding), @@ -93,6 +107,7 @@ static struct attribute *erofs_feat_attrs[] = { ATTR_LIST(ztailpacking), ATTR_LIST(fragments), ATTR_LIST(dedupe), + ATTR_LIST(48bit), NULL, }; ATTRIBUTE_GROUPS(erofs_feat); @@ -126,12 +141,14 @@ static ssize_t erofs_attr_show(struct kobject *kobj, if (!ptr) return 0; return sysfs_emit(buf, "%d\n", *(bool *)ptr); + case attr_accel: + return z_erofs_crypto_show_engines(buf, PAGE_SIZE, '\n'); } return 0; } static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr, - const char *buf, size_t len) + const char *buf, size_t len) { struct erofs_sb_info *sbi = container_of(kobj, struct erofs_sb_info, s_kobj); @@ -180,6 +197,19 @@ static ssize_t erofs_attr_store(struct kobject *kobj, struct attribute *attr, invalidate_mapping_pages(MNGD_MAPPING(sbi), 0, -1); return len; #endif +#ifdef CONFIG_EROFS_FS_ZIP_ACCEL + case attr_accel: + buf = skip_spaces(buf); + z_erofs_crypto_disable_all_engines(); + while (*buf) { + t = strcspn(buf, "\n"); + ret = z_erofs_crypto_enable_engine(buf, t); + if (ret < 0) + return ret; + buf += buf[t] != '\0' ? t + 1 : t; + } + return len; +#endif } return 0; } @@ -197,12 +227,13 @@ static const struct sysfs_ops erofs_attr_ops = { }; static const struct kobj_type erofs_sb_ktype = { - .default_groups = erofs_groups, + .default_groups = erofs_sb_groups, .sysfs_ops = &erofs_attr_ops, .release = erofs_sb_release, }; static const struct kobj_type erofs_ktype = { + .default_groups = erofs_groups, .sysfs_ops = &erofs_attr_ops, }; @@ -246,6 +277,12 @@ void erofs_unregister_sysfs(struct super_block *sb) } } +void erofs_exit_sysfs(void) +{ + kobject_put(&erofs_feat); + kset_unregister(&erofs_root); +} + int __init erofs_init_sysfs(void) { int ret; @@ -253,24 +290,12 @@ int __init erofs_init_sysfs(void) kobject_set_name(&erofs_root.kobj, "erofs"); erofs_root.kobj.parent = fs_kobj; ret = kset_register(&erofs_root); - if (ret) - goto root_err; - - ret = kobject_init_and_add(&erofs_feat, &erofs_feat_ktype, - NULL, "features"); - if (ret) - goto feat_err; - return ret; - -feat_err: - kobject_put(&erofs_feat); - kset_unregister(&erofs_root); -root_err: + if (!ret) { + ret = kobject_init_and_add(&erofs_feat, &erofs_feat_ktype, + NULL, "features"); + if (!ret) + return 0; + erofs_exit_sysfs(); + } return ret; } - -void erofs_exit_sysfs(void) -{ - kobject_put(&erofs_feat); - kset_unregister(&erofs_root); -} diff --git a/fs/erofs/xattr.c b/fs/erofs/xattr.c index df2777e05661..9cf84717a92e 100644 --- a/fs/erofs/xattr.c +++ b/fs/erofs/xattr.c @@ -81,7 +81,7 @@ static int erofs_init_inode_xattrs(struct inode *inode) it.pos = erofs_iloc(inode) + vi->inode_isize; /* read in shared xattr array (non-atomic, see kmalloc below) */ - it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP); + it.kaddr = erofs_bread(&it.buf, it.pos, true); if (IS_ERR(it.kaddr)) { ret = PTR_ERR(it.kaddr); goto out_unlock; @@ -102,7 +102,7 @@ static int erofs_init_inode_xattrs(struct inode *inode) it.pos += sizeof(struct erofs_xattr_ibody_header); for (i = 0; i < vi->xattr_shared_count; ++i) { - it.kaddr = erofs_bread(&it.buf, it.pos, EROFS_KMAP); + it.kaddr = erofs_bread(&it.buf, it.pos, true); if (IS_ERR(it.kaddr)) { kfree(vi->xattr_shared_xattrs); vi->xattr_shared_xattrs = NULL; @@ -183,7 +183,7 @@ static int erofs_xattr_copy_to_buffer(struct erofs_xattr_iter *it, void *src; for (processed = 0; 
processed < len; processed += slice) { - it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP); + it->kaddr = erofs_bread(&it->buf, it->pos, true); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); @@ -286,7 +286,7 @@ static int erofs_getxattr_foreach(struct erofs_xattr_iter *it) /* 2. handle xattr name */ for (processed = 0; processed < entry.e_name_len; processed += slice) { - it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP); + it->kaddr = erofs_bread(&it->buf, it->pos, true); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); @@ -330,7 +330,7 @@ static int erofs_xattr_iter_inline(struct erofs_xattr_iter *it, it->pos = erofs_iloc(inode) + vi->inode_isize + xattr_header_sz; while (remaining) { - it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP); + it->kaddr = erofs_bread(&it->buf, it->pos, true); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); @@ -367,7 +367,7 @@ static int erofs_xattr_iter_shared(struct erofs_xattr_iter *it, for (i = 0; i < vi->xattr_shared_count; ++i) { it->pos = erofs_pos(sb, sbi->xattr_blkaddr) + vi->xattr_shared_xattrs[i] * sizeof(__le32); - it->kaddr = erofs_bread(&it->buf, it->pos, EROFS_KMAP); + it->kaddr = erofs_bread(&it->buf, it->pos, true); if (IS_ERR(it->kaddr)) return PTR_ERR(it->kaddr); diff --git a/fs/erofs/zdata.c b/fs/erofs/zdata.c index d771e06db738..fe8071844724 100644 --- a/fs/erofs/zdata.c +++ b/fs/erofs/zdata.c @@ -44,8 +44,8 @@ struct z_erofs_pcluster { /* A: point to next chained pcluster or TAILs */ struct z_erofs_pcluster *next; - /* I: start block address of this pcluster */ - erofs_off_t index; + /* I: start physical position of this pcluster */ + erofs_off_t pos; /* L: the maximum decompression size of this round */ unsigned int length; @@ -73,12 +73,12 @@ struct z_erofs_pcluster { /* I: compression algorithm format */ unsigned char algorithmformat; + /* I: whether compressed data is in-lined or not */ + bool from_meta; + /* L: whether partial decompression or not */ bool partial; - /* L: indicate several pageofs_outs or not */ - bool multibases; - /* L: whether extra buffer allocations are best-effort */ bool besteffort; @@ -102,14 +102,9 @@ struct z_erofs_decompressqueue { bool eio, sync; }; -static inline bool z_erofs_is_inline_pcluster(struct z_erofs_pcluster *pcl) -{ - return !pcl->index; -} - static inline unsigned int z_erofs_pclusterpages(struct z_erofs_pcluster *pcl) { - return PAGE_ALIGN(pcl->pclustersize) >> PAGE_SHIFT; + return PAGE_ALIGN(pcl->pageofs_in + pcl->pclustersize) >> PAGE_SHIFT; } static bool erofs_folio_is_managed(struct erofs_sb_info *sbi, struct folio *fo) @@ -133,7 +128,7 @@ struct z_erofs_pcluster_slab { static struct z_erofs_pcluster_slab pcluster_pool[] __read_mostly = { _PCLP(1), _PCLP(4), _PCLP(16), _PCLP(64), _PCLP(128), - _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES) + _PCLP(Z_EROFS_PCLUSTER_MAX_PAGES + 1) }; struct z_erofs_bvec_iter { @@ -267,7 +262,6 @@ static struct z_erofs_pcluster *z_erofs_alloc_pcluster(unsigned int size) pcl = kmem_cache_zalloc(pcs->slab, GFP_KERNEL); if (!pcl) return ERR_PTR(-ENOMEM); - pcl->pclustersize = size; return pcl; } return ERR_PTR(-EINVAL); @@ -294,6 +288,7 @@ static struct workqueue_struct *z_erofs_workqueue __read_mostly; #ifdef CONFIG_EROFS_FS_PCPU_KTHREAD static struct kthread_worker __rcu **z_erofs_pcpu_workers; +static atomic_t erofs_percpu_workers_initialized = ATOMIC_INIT(0); static void erofs_destroy_percpu_workers(void) { @@ -339,12 +334,8 @@ static int erofs_init_percpu_workers(void) } return 0; } -#else -static inline void erofs_destroy_percpu_workers(void) {} 
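/*
 * Editor's note (illustrative sketch, not part of the patch): the
 * erofs_percpu_workers_initialized flag above makes first-mount setup
 * idempotent -- atomic_xchg() tests and sets in one step, so exactly one
 * caller performs the work and teardown runs at most once; this is the
 * idiom z_erofs_init_pcpu_workers() below relies on. Generic shape, with
 * hypothetical do_init()/do_teardown() helpers:
 */
#if 0	/* standalone illustration, not kernel code */
static int do_init(void);		/* hypothetical */
static void do_teardown(void);		/* hypothetical */
static atomic_t once = ATOMIC_INIT(0);

static int init_once(void)
{
	int err;

	if (atomic_xchg(&once, 1))	/* somebody already won the race */
		return 0;
	err = do_init();
	if (err)
		atomic_set(&once, 0);	/* allow a later retry */
	return err;
}

static void teardown_once(void)
{
	if (!atomic_xchg(&once, 0))	/* never initialized, nothing to do */
		return;
	do_teardown();
}
#endif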
-static inline int erofs_init_percpu_workers(void) { return 0; } -#endif -#if defined(CONFIG_HOTPLUG_CPU) && defined(CONFIG_EROFS_FS_PCPU_KTHREAD) +#ifdef CONFIG_HOTPLUG_CPU static DEFINE_SPINLOCK(z_erofs_pcpu_worker_lock); static enum cpuhp_state erofs_cpuhp_state; @@ -401,17 +392,56 @@ static void erofs_cpu_hotplug_destroy(void) if (erofs_cpuhp_state) cpuhp_remove_state_nocalls(erofs_cpuhp_state); } -#else /* !CONFIG_HOTPLUG_CPU || !CONFIG_EROFS_FS_PCPU_KTHREAD */ +#else /* !CONFIG_HOTPLUG_CPU */ static inline int erofs_cpu_hotplug_init(void) { return 0; } static inline void erofs_cpu_hotplug_destroy(void) {} -#endif +#endif/* CONFIG_HOTPLUG_CPU */ +static int z_erofs_init_pcpu_workers(struct super_block *sb) +{ + int err; -void z_erofs_exit_subsystem(void) + if (atomic_xchg(&erofs_percpu_workers_initialized, 1)) + return 0; + + err = erofs_init_percpu_workers(); + if (err) { + erofs_err(sb, "per-cpu workers: failed to allocate."); + goto err_init_percpu_workers; + } + + err = erofs_cpu_hotplug_init(); + if (err < 0) { + erofs_err(sb, "per-cpu workers: failed CPU hotplug init."); + goto err_cpuhp_init; + } + erofs_info(sb, "initialized per-cpu workers successfully."); + return err; + +err_cpuhp_init: + erofs_destroy_percpu_workers(); +err_init_percpu_workers: + atomic_set(&erofs_percpu_workers_initialized, 0); + return err; +} + +static void z_erofs_destroy_pcpu_workers(void) { + if (!atomic_xchg(&erofs_percpu_workers_initialized, 0)) + return; erofs_cpu_hotplug_destroy(); erofs_destroy_percpu_workers(); +} +#else /* !CONFIG_EROFS_FS_PCPU_KTHREAD */ +static inline int z_erofs_init_pcpu_workers(struct super_block *sb) { return 0; } +static inline void z_erofs_destroy_pcpu_workers(void) {} +#endif/* CONFIG_EROFS_FS_PCPU_KTHREAD */ + +void z_erofs_exit_subsystem(void) +{ + z_erofs_destroy_pcpu_workers(); destroy_workqueue(z_erofs_workqueue); z_erofs_destroy_pcluster_pool(); + z_erofs_crypto_disable_all_engines(); z_erofs_exit_decompressor(); } @@ -433,19 +463,8 @@ int __init z_erofs_init_subsystem(void) goto err_workqueue_init; } - err = erofs_init_percpu_workers(); - if (err) - goto err_pcpu_worker; - - err = erofs_cpu_hotplug_init(); - if (err < 0) - goto err_cpuhp_init; return err; -err_cpuhp_init: - erofs_destroy_percpu_workers(); -err_pcpu_worker: - destroy_workqueue(z_erofs_workqueue); err_workqueue_init: z_erofs_destroy_pcluster_pool(); err_pcluster_pool: @@ -516,6 +535,7 @@ static void z_erofs_bind_cache(struct z_erofs_frontend *fe) struct z_erofs_pcluster *pcl = fe->pcl; unsigned int pclusterpages = z_erofs_pclusterpages(pcl); bool shouldalloc = z_erofs_should_alloc_cache(fe); + pgoff_t poff = pcl->pos >> PAGE_SHIFT; bool may_bypass = true; /* Optimistic allocation, as in-place I/O can be used as a fallback */ gfp_t gfp = (mapping_gfp_mask(mc) & ~__GFP_DIRECT_RECLAIM) | @@ -532,7 +552,7 @@ static void z_erofs_bind_cache(struct z_erofs_frontend *fe) if (READ_ONCE(pcl->compressed_bvecs[i].page)) continue; - folio = filemap_get_folio(mc, pcl->index + i); + folio = filemap_get_folio(mc, poff + i); if (IS_ERR(folio)) { may_bypass = false; if (!shouldalloc) @@ -575,7 +595,7 @@ static int erofs_try_to_free_all_cached_folios(struct erofs_sb_info *sbi, struct folio *folio; int i; - DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); + DBG_BUGON(pcl->from_meta); /* Each cached folio contains one page unless bs > ps is supported */ for (i = 0; i < pclusterpages; ++i) { if (pcl->compressed_bvecs[i].page) { @@ -607,7 +627,7 @@ static bool z_erofs_cache_release_folio(struct folio *folio, gfp_t gfp) ret 
= false; spin_lock(&pcl->lockref.lock); if (pcl->lockref.count <= 0) { - DBG_BUGON(z_erofs_is_inline_pcluster(pcl)); + DBG_BUGON(pcl->from_meta); for (; bvec < end; ++bvec) { if (bvec->page && page_folio(bvec->page) == folio) { bvec->page = NULL; @@ -644,18 +664,24 @@ static const struct address_space_operations z_erofs_cache_aops = { .invalidate_folio = z_erofs_cache_invalidate_folio, }; -int erofs_init_managed_cache(struct super_block *sb) +int z_erofs_init_super(struct super_block *sb) { - struct inode *const inode = new_inode(sb); + struct inode *inode; + int err; + err = z_erofs_init_pcpu_workers(sb); + if (err) + return err; + + inode = new_inode(sb); if (!inode) return -ENOMEM; - set_nlink(inode, 1); inode->i_size = OFFSET_MAX; inode->i_mapping->a_ops = &z_erofs_cache_aops; mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); EROFS_SB(sb)->managed_cache = inode; + xa_init(&EROFS_SB(sb)->managed_pslots); return 0; } @@ -667,16 +693,20 @@ static int z_erofs_attach_page(struct z_erofs_frontend *fe, int ret; if (exclusive) { - /* give priority for inplaceio to use file pages first */ - spin_lock(&pcl->lockref.lock); - while (fe->icur > 0) { - if (pcl->compressed_bvecs[--fe->icur].page) - continue; - pcl->compressed_bvecs[fe->icur] = *bvec; + /* Inplace I/O is limited to one page for uncompressed data */ + if (pcl->algorithmformat < Z_EROFS_COMPRESSION_MAX || + fe->icur <= 1) { + /* Try to prioritize inplace I/O here */ + spin_lock(&pcl->lockref.lock); + while (fe->icur > 0) { + if (pcl->compressed_bvecs[--fe->icur].page) + continue; + pcl->compressed_bvecs[fe->icur] = *bvec; + spin_unlock(&pcl->lockref.lock); + return 0; + } spin_unlock(&pcl->lockref.lock); - return 0; } - spin_unlock(&pcl->lockref.lock); /* otherwise, check if it can be used as a bvpage */ if (fe->mode >= Z_EROFS_PCLUSTER_FOLLOWED && @@ -711,27 +741,25 @@ static int z_erofs_register_pcluster(struct z_erofs_frontend *fe) struct erofs_map_blocks *map = &fe->map; struct super_block *sb = fe->inode->i_sb; struct erofs_sb_info *sbi = EROFS_SB(sb); - bool ztailpacking = map->m_flags & EROFS_MAP_META; struct z_erofs_pcluster *pcl, *pre; + unsigned int pageofs_in; int err; - if (!(map->m_flags & EROFS_MAP_ENCODED) || - (!ztailpacking && !erofs_blknr(sb, map->m_pa))) { - DBG_BUGON(1); - return -EFSCORRUPTED; - } - - /* no available pcluster, let's allocate one */ - pcl = z_erofs_alloc_pcluster(map->m_plen); + pageofs_in = erofs_blkoff(sb, map->m_pa); + pcl = z_erofs_alloc_pcluster(pageofs_in + map->m_plen); if (IS_ERR(pcl)) return PTR_ERR(pcl); lockref_init(&pcl->lockref); /* one ref for this request */ pcl->algorithmformat = map->m_algorithmformat; + pcl->pclustersize = map->m_plen; pcl->length = 0; pcl->partial = true; pcl->next = fe->head; + pcl->pos = map->m_pa; + pcl->pageofs_in = pageofs_in; pcl->pageofs_out = map->m_la & ~PAGE_MASK; + pcl->from_meta = map->m_flags & EROFS_MAP_META; fe->mode = Z_EROFS_PCLUSTER_FOLLOWED; /* @@ -741,13 +769,10 @@ static int z_erofs_register_pcluster(struct z_erofs_frontend *fe) mutex_init(&pcl->lock); DBG_BUGON(!mutex_trylock(&pcl->lock)); - if (ztailpacking) { - pcl->index = 0; /* which indicates ztailpacking */ - } else { - pcl->index = erofs_blknr(sb, map->m_pa); + if (!pcl->from_meta) { while (1) { xa_lock(&sbi->managed_pslots); - pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->index, + pre = __xa_cmpxchg(&sbi->managed_pslots, pcl->pos, NULL, pcl, GFP_KERNEL); if (!pre || xa_is_err(pre) || z_erofs_get_pcluster(pre)) { xa_unlock(&sbi->managed_pslots); @@ -779,7 +804,6 @@ static int 
z_erofs_pcluster_begin(struct z_erofs_frontend *fe) { struct erofs_map_blocks *map = &fe->map; struct super_block *sb = fe->inode->i_sb; - erofs_blk_t blknr = erofs_blknr(sb, map->m_pa); struct z_erofs_pcluster *pcl = NULL; int ret; @@ -790,9 +814,9 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe) if (!(map->m_flags & EROFS_MAP_META)) { while (1) { rcu_read_lock(); - pcl = xa_load(&EROFS_SB(sb)->managed_pslots, blknr); + pcl = xa_load(&EROFS_SB(sb)->managed_pslots, map->m_pa); if (!pcl || z_erofs_get_pcluster(pcl)) { - DBG_BUGON(pcl && blknr != pcl->index); + DBG_BUGON(pcl && map->m_pa != pcl->pos); rcu_read_unlock(); break; } @@ -826,13 +850,13 @@ static int z_erofs_pcluster_begin(struct z_erofs_frontend *fe) z_erofs_bvec_iter_begin(&fe->biter, &fe->pcl->bvset, Z_EROFS_INLINE_BVECS, fe->pcl->vcnt); - if (!z_erofs_is_inline_pcluster(fe->pcl)) { + if (!fe->pcl->from_meta) { /* bind cache first when cached decompression is preferred */ z_erofs_bind_cache(fe); } else { void *mptr; - mptr = erofs_read_metabuf(&map->buf, sb, map->m_pa, EROFS_NO_KMAP); + mptr = erofs_read_metabuf(&map->buf, sb, map->m_pa, false); if (IS_ERR(mptr)) { ret = PTR_ERR(mptr); erofs_err(sb, "failed to get inline data %d", ret); @@ -871,7 +895,7 @@ static bool __erofs_try_to_release_pcluster(struct erofs_sb_info *sbi, * It's impossible to fail after the pcluster is freezed, but in order * to avoid some race conditions, add a DBG_BUGON to observe this. */ - DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->index) != pcl); + DBG_BUGON(__xa_erase(&sbi->managed_pslots, pcl->pos) != pcl); lockref_mark_dead(&pcl->lockref); return true; @@ -967,7 +991,7 @@ static int z_erofs_read_fragment(struct super_block *sb, struct folio *folio, buf.mapping = packed_inode->i_mapping; for (; cur < end; cur += cnt, pos += cnt) { cnt = min(end - cur, sb->s_blocksize - erofs_blkoff(sb, pos)); - src = erofs_bread(&buf, pos, EROFS_KMAP); + src = erofs_bread(&buf, pos, true); if (IS_ERR(src)) { erofs_put_metabuf(&buf); return PTR_ERR(src); @@ -1050,8 +1074,6 @@ static int z_erofs_scan_folio(struct z_erofs_frontend *f, break; erofs_onlinefolio_split(folio); - if (f->pcl->pageofs_out != (map->m_la & ~PAGE_MASK)) - f->pcl->multibases = true; if (f->pcl->length < offset + end - map->m_la) { f->pcl->length = offset + end - map->m_la; f->pcl->pageofs_out = map->m_la & ~PAGE_MASK; @@ -1097,7 +1119,6 @@ struct z_erofs_backend { struct page *onstack_pages[Z_EROFS_ONSTACK_PAGES]; struct super_block *sb; struct z_erofs_pcluster *pcl; - /* pages with the longest decompressed length for deduplication */ struct page **decompressed_pages; /* pages to keep the compressed data */ @@ -1106,6 +1127,8 @@ struct z_erofs_backend { struct list_head decompressed_secondary_bvecs; struct page **pagepool; unsigned int onstack_used, nr_pages; + /* indicate if temporary copies should be preserved for later use */ + bool keepxcpy; }; struct z_erofs_bvec_item { @@ -1116,18 +1139,20 @@ struct z_erofs_bvec_item { static void z_erofs_do_decompressed_bvec(struct z_erofs_backend *be, struct z_erofs_bvec *bvec) { + int poff = bvec->offset + be->pcl->pageofs_out; struct z_erofs_bvec_item *item; - unsigned int pgnr; - - if (!((bvec->offset + be->pcl->pageofs_out) & ~PAGE_MASK) && - (bvec->end == PAGE_SIZE || - bvec->offset + bvec->end == be->pcl->length)) { - pgnr = (bvec->offset + be->pcl->pageofs_out) >> PAGE_SHIFT; - DBG_BUGON(pgnr >= be->nr_pages); - if (!be->decompressed_pages[pgnr]) { - be->decompressed_pages[pgnr] = bvec->page; + struct page **page; + + if 
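/*
 * Editor's note (sketch, not from the patch): managed_pslots is now keyed
 * by the pcluster's byte position (pcl->pos) instead of its block number,
 * which stays unique even when pclusters start at sub-block offsets. The
 * registration above uses the standard xarray insert-or-reuse idiom:
 *
 *	xa_lock(&xa);
 *	old = __xa_cmpxchg(&xa, key, NULL, new, GFP_KERNEL);
 *	xa_unlock(&xa);
 *	// old == NULL: 'new' was installed; xa_is_err(old): allocation
 *	// failure; otherwise take a reference on 'old' and reuse/retry
 */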
(!(poff & ~PAGE_MASK) && (bvec->end == PAGE_SIZE || + bvec->offset + bvec->end == be->pcl->length)) { + DBG_BUGON((poff >> PAGE_SHIFT) >= be->nr_pages); + page = be->decompressed_pages + (poff >> PAGE_SHIFT); + if (!*page) { + *page = bvec->page; return; } + } else { + be->keepxcpy = true; } /* (cold path) one pcluster is requested multiple times */ @@ -1221,7 +1246,7 @@ static int z_erofs_parse_in_bvecs(struct z_erofs_backend *be, bool *overlapped) } be->compressed_pages[i] = page; - if (z_erofs_is_inline_pcluster(pcl) || + if (pcl->from_meta || erofs_folio_is_managed(EROFS_SB(be->sb), page_folio(page))) { if (!PageUptodate(page)) err = -EIO; @@ -1284,6 +1309,8 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err) .sb = be->sb, .in = be->compressed_pages, .out = be->decompressed_pages, + .inpages = pclusterpages, + .outpages = be->nr_pages, .pageofs_in = pcl->pageofs_in, .pageofs_out = pcl->pageofs_out, .inputsize = pcl->pclustersize, @@ -1291,13 +1318,13 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err) .alg = pcl->algorithmformat, .inplace_io = overlapped, .partial_decoding = pcl->partial, - .fillgaps = pcl->multibases, + .fillgaps = be->keepxcpy, .gfp = pcl->besteffort ? GFP_KERNEL : GFP_NOWAIT | __GFP_NORETRY }, be->pagepool); /* must handle all compressed pages before actual file pages */ - if (z_erofs_is_inline_pcluster(pcl)) { + if (pcl->from_meta) { page = pcl->compressed_bvecs[0].page; WRITE_ONCE(pcl->compressed_bvecs[0].page, NULL); put_page(page); @@ -1348,7 +1375,6 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err) pcl->length = 0; pcl->partial = true; - pcl->multibases = false; pcl->besteffort = false; pcl->bvset.nextpage = NULL; pcl->vcnt = 0; @@ -1357,7 +1383,7 @@ static int z_erofs_decompress_pcluster(struct z_erofs_backend *be, int err) WRITE_ONCE(pcl->next, NULL); mutex_unlock(&pcl->lock); - if (z_erofs_is_inline_pcluster(pcl)) + if (pcl->from_meta) z_erofs_free_pcluster(pcl); else z_erofs_put_pcluster(sbi, pcl, try_free); @@ -1538,7 +1564,7 @@ out_allocfolio: folio = page_folio(page); out_tocache: if (!tocache || bs != PAGE_SIZE || - filemap_add_folio(mc, folio, pcl->index + nr, gfp)) { + filemap_add_folio(mc, folio, (pcl->pos >> PAGE_SHIFT) + nr, gfp)) { /* turn into a temporary shortlived folio (1 ref) */ folio->private = (void *)Z_EROFS_SHORTLIVED_PAGE; return; @@ -1655,19 +1681,20 @@ static void z_erofs_submit_queue(struct z_erofs_frontend *f, pcl = next; next = READ_ONCE(pcl->next); - if (z_erofs_is_inline_pcluster(pcl)) { + if (pcl->from_meta) { z_erofs_move_to_bypass_queue(pcl, next, qtail); continue; } /* no device id here, thus it will always succeed */ mdev = (struct erofs_map_dev) { - .m_pa = erofs_pos(sb, pcl->index), + .m_pa = round_down(pcl->pos, sb->s_blocksize), }; (void)erofs_map_dev(sb, &mdev); cur = mdev.m_pa; - end = cur + pcl->pclustersize; + end = round_up(cur + pcl->pageofs_in + pcl->pclustersize, + sb->s_blocksize); do { bvec.bv_page = NULL; if (bio && (cur != last_pa || @@ -1711,7 +1738,8 @@ drain_io: bio = bio_alloc(mdev.m_bdev, BIO_MAX_VECS, REQ_OP_READ, GFP_NOIO); bio->bi_end_io = z_erofs_endio; - bio->bi_iter.bi_sector = cur >> 9; + bio->bi_iter.bi_sector = + (mdev.m_dif->fsoff + cur) >> 9; bio->bi_private = q[JQ_SUBMIT]; if (readahead) bio->bi_opf |= REQ_RAHEAD; @@ -1859,13 +1887,12 @@ static void z_erofs_readahead(struct readahead_control *rac) { struct inode *const inode = rac->mapping->host; Z_EROFS_DEFINE_FRONTEND(f, inode, readahead_pos(rac)); - struct 
folio *head = NULL, *folio; unsigned int nrpages = readahead_count(rac); + struct folio *head = NULL, *folio; int err; + trace_erofs_readahead(inode, readahead_index(rac), nrpages, false); z_erofs_pcluster_readmore(&f, rac, true); - nrpages = readahead_count(rac); - trace_erofs_readpages(inode, readahead_index(rac), nrpages, false); while ((folio = readahead_folio(rac))) { folio->private = head; head = folio; diff --git a/fs/erofs/zmap.c b/fs/erofs/zmap.c index 689437e99a5a..14ea47f954f5 100644 --- a/fs/erofs/zmap.c +++ b/fs/erofs/zmap.c @@ -25,13 +25,13 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); - const erofs_off_t pos = Z_EROFS_FULL_INDEX_ALIGN(erofs_iloc(inode) + + const erofs_off_t pos = Z_EROFS_FULL_INDEX_START(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize) + lcn * sizeof(struct z_erofs_lcluster_index); struct z_erofs_lcluster_index *di; unsigned int advise; - di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, EROFS_KMAP); + di = erofs_read_metabuf(&m->map->buf, inode->i_sb, pos, true); if (IS_ERR(di)) return PTR_ERR(di); m->lcn = lcn; @@ -40,7 +40,7 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, advise = le16_to_cpu(di->di_advise); m->type = advise & Z_EROFS_LI_LCLUSTER_TYPE_MASK; if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { - m->clusterofs = 1 << vi->z_logical_clusterbits; + m->clusterofs = 1 << vi->z_lclusterbits; m->delta[0] = le16_to_cpu(di->di_u.delta[0]); if (m->delta[0] & Z_EROFS_LI_D0_CBLKCNT) { if (!(vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | @@ -55,7 +55,7 @@ static int z_erofs_load_full_lcluster(struct z_erofs_maprecorder *m, } else { m->partialref = !!(advise & Z_EROFS_LI_PARTIAL_REF); m->clusterofs = le16_to_cpu(di->di_clusterofs); - if (m->clusterofs >= 1 << vi->z_logical_clusterbits) { + if (m->clusterofs >= 1 << vi->z_lclusterbits) { DBG_BUGON(1); return -EFSCORRUPTED; } @@ -102,9 +102,9 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, { struct inode *const inode = m->inode; struct erofs_inode *const vi = EROFS_I(inode); - const erofs_off_t ebase = sizeof(struct z_erofs_map_header) + - ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); - const unsigned int lclusterbits = vi->z_logical_clusterbits; + const erofs_off_t ebase = Z_EROFS_MAP_HEADER_END(erofs_iloc(inode) + + vi->inode_isize + vi->xattr_isize); + const unsigned int lclusterbits = vi->z_lclusterbits; const unsigned int totalidx = erofs_iblks(inode); unsigned int compacted_4b_initial, compacted_2b, amortizedshift; unsigned int vcnt, lo, lobits, encodebits, nblk, bytes; @@ -146,7 +146,7 @@ static int z_erofs_load_compact_lcluster(struct z_erofs_maprecorder *m, else return -EOPNOTSUPP; - in = erofs_read_metabuf(&m->map->buf, m->inode->i_sb, pos, EROFS_KMAP); + in = erofs_read_metabuf(&m->map->buf, m->inode->i_sb, pos, true); if (IS_ERR(in)) return PTR_ERR(in); @@ -255,7 +255,7 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, { struct super_block *sb = m->inode->i_sb; struct erofs_inode *const vi = EROFS_I(m->inode); - const unsigned int lclusterbits = vi->z_logical_clusterbits; + const unsigned int lclusterbits = vi->z_lclusterbits; while (m->lcn >= lookback_distance) { unsigned long lcn = m->lcn - lookback_distance; @@ -265,26 +265,22 @@ static int z_erofs_extent_lookback(struct z_erofs_maprecorder *m, if (err) return err; - switch (m->type) { - case Z_EROFS_LCLUSTER_TYPE_NONHEAD: + if 
(m->type >= Z_EROFS_LCLUSTER_TYPE_MAX) { + erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu", + m->type, lcn, vi->nid); + DBG_BUGON(1); + return -EOPNOTSUPP; + } else if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { lookback_distance = m->delta[0]; if (!lookback_distance) - goto err_bogus; + break; continue; - case Z_EROFS_LCLUSTER_TYPE_PLAIN: - case Z_EROFS_LCLUSTER_TYPE_HEAD1: - case Z_EROFS_LCLUSTER_TYPE_HEAD2: + } else { m->headtype = m->type; m->map->m_la = (lcn << lclusterbits) | m->clusterofs; return 0; - default: - erofs_err(sb, "unknown type %u @ lcn %lu of nid %llu", - m->type, lcn, vi->nid); - DBG_BUGON(1); - return -EOPNOTSUPP; } } -err_bogus: erofs_err(sb, "bogus lookback distance %u @ lcn %lu of nid %llu", lookback_distance, m->lcn, vi->nid); DBG_BUGON(1); @@ -308,7 +304,7 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, if ((m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD1 && !bigpcl1) || ((m->headtype == Z_EROFS_LCLUSTER_TYPE_PLAIN || m->headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2) && !bigpcl2) || - (lcn << vi->z_logical_clusterbits) >= inode->i_size) + (lcn << vi->z_lclusterbits) >= inode->i_size) m->compressedblks = 1; if (m->compressedblks) @@ -329,35 +325,28 @@ static int z_erofs_get_extent_compressedlen(struct z_erofs_maprecorder *m, DBG_BUGON(lcn == initial_lcn && m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD); - switch (m->type) { - case Z_EROFS_LCLUSTER_TYPE_PLAIN: - case Z_EROFS_LCLUSTER_TYPE_HEAD1: - case Z_EROFS_LCLUSTER_TYPE_HEAD2: + if (m->type == Z_EROFS_LCLUSTER_TYPE_NONHEAD) { + if (m->delta[0] != 1) { + erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + if (m->compressedblks) + goto out; + } else if (m->type < Z_EROFS_LCLUSTER_TYPE_MAX) { /* * if the 1st NONHEAD lcluster is actually PLAIN or HEAD type * rather than CBLKCNT, it's a 1 block-sized pcluster. 
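/*
 * [editorial sketch] The rework above replaces the old switch on m->type with
 * plain range checks. A minimal standalone model of the same backward walk,
 * using hypothetical simplified types rather than the kernel structures:
 */
enum lctype { LC_PLAIN, LC_HEAD1, LC_HEAD2, LC_NONHEAD, LC_TYPE_MAX };

struct lcluster {
	enum lctype type;
	unsigned int delta0;	/* NONHEAD: distance back towards the head */
};

/*
 * Walk backwards from lcn until a head-type lcluster is found.  Returns the
 * head lcn, or -1 on a corrupted index (unknown type or a bogus zero lookback
 * distance), mirroring the error paths above.
 */
static long lookback_sketch(const struct lcluster *lc, long lcn, unsigned int dist)
{
	while (lcn >= (long)dist) {
		lcn -= dist;
		if (lc[lcn].type >= LC_TYPE_MAX)
			return -1;		/* unknown type */
		if (lc[lcn].type == LC_NONHEAD) {
			dist = lc[lcn].delta0;
			if (!dist)
				return -1;	/* bogus lookback distance */
			continue;
		}
		return lcn;	/* PLAIN/HEAD1/HEAD2: found the head */
	}
	return -1;	/* walked past lcluster 0 */
}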
*/ m->compressedblks = 1; - break; - case Z_EROFS_LCLUSTER_TYPE_NONHEAD: - if (m->delta[0] != 1) - goto err_bonus_cblkcnt; - if (m->compressedblks) - break; - fallthrough; - default: - erofs_err(sb, "cannot find CBLKCNT @ lcn %lu of nid %llu", lcn, - vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; + goto out; } + erofs_err(sb, "cannot find CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; out: m->map->m_plen = erofs_pos(sb, m->compressedblks); return 0; -err_bonus_cblkcnt: - erofs_err(sb, "bogus CBLKCNT @ lcn %lu of nid %llu", lcn, vi->nid); - DBG_BUGON(1); - return -EFSCORRUPTED; } static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) @@ -365,7 +354,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) struct inode *inode = m->inode; struct erofs_inode *vi = EROFS_I(inode); struct erofs_map_blocks *map = m->map; - unsigned int lclusterbits = vi->z_logical_clusterbits; + unsigned int lclusterbits = vi->z_lclusterbits; u64 lcn = m->lcn, headlcn = map->m_la >> lclusterbits; int err; @@ -386,9 +375,7 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) m->delta[1] = 1; DBG_BUGON(1); } - } else if (m->type == Z_EROFS_LCLUSTER_TYPE_PLAIN || - m->type == Z_EROFS_LCLUSTER_TYPE_HEAD1 || - m->type == Z_EROFS_LCLUSTER_TYPE_HEAD2) { + } else if (m->type < Z_EROFS_LCLUSTER_TYPE_MAX) { if (lcn != headlcn) break; /* ends at the next HEAD lcluster */ m->delta[1] = 1; @@ -404,23 +391,32 @@ static int z_erofs_get_extent_decompressedlen(struct z_erofs_maprecorder *m) return 0; } -static int z_erofs_do_map_blocks(struct inode *inode, +static int z_erofs_map_blocks_fo(struct inode *inode, struct erofs_map_blocks *map, int flags) { - struct erofs_inode *const vi = EROFS_I(inode); - bool ztailpacking = vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER; + struct erofs_inode *vi = EROFS_I(inode); + struct super_block *sb = inode->i_sb; bool fragment = vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER; + bool ztailpacking = vi->z_idata_size; + unsigned int lclusterbits = vi->z_lclusterbits; struct z_erofs_maprecorder m = { .inode = inode, .map = map, }; int err = 0; - unsigned int lclusterbits, endoff, afmt; + unsigned int endoff, afmt; unsigned long initial_lcn; unsigned long long ofs, end; - lclusterbits = vi->z_logical_clusterbits; ofs = flags & EROFS_GET_BLOCKS_FINDTAIL ?
inode->i_size - 1 : map->m_la; + if (fragment && !(flags & EROFS_GET_BLOCKS_FINDTAIL) && + !vi->z_tailextent_headlcn) { + map->m_la = 0; + map->m_llen = inode->i_size; + map->m_flags = EROFS_MAP_MAPPED | + EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT; + return 0; + } initial_lcn = ofs >> lclusterbits; endoff = ofs & ((1 << lclusterbits) - 1); @@ -428,9 +424,8 @@ static int z_erofs_do_map_blocks(struct inode *inode, if (err) goto unmap_out; - if (ztailpacking && (flags & EROFS_GET_BLOCKS_FINDTAIL)) - vi->z_idataoff = m.nextpackoff; - + if ((flags & EROFS_GET_BLOCKS_FINDTAIL) && ztailpacking) + vi->z_fragmentoff = m.nextpackoff; map->m_flags = EROFS_MAP_MAPPED | EROFS_MAP_ENCODED; end = (m.lcn + 1ULL) << lclusterbits; @@ -452,8 +447,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, } /* m.lcn should be >= 1 if endoff < m.clusterofs */ if (!m.lcn) { - erofs_err(inode->i_sb, - "invalid logical cluster 0 at nid %llu", + erofs_err(sb, "invalid logical cluster 0 at nid %llu", vi->nid); err = -EFSCORRUPTED; goto unmap_out; @@ -469,8 +463,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, goto unmap_out; break; default: - erofs_err(inode->i_sb, - "unknown type %u @ offset %llu of nid %llu", + erofs_err(sb, "unknown type %u @ offset %llu of nid %llu", m.type, ofs, vi->nid); err = -EOPNOTSUPP; goto unmap_out; @@ -487,12 +480,18 @@ static int z_erofs_do_map_blocks(struct inode *inode, } if (ztailpacking && m.lcn == vi->z_tailextent_headlcn) { map->m_flags |= EROFS_MAP_META; - map->m_pa = vi->z_idataoff; + map->m_pa = vi->z_fragmentoff; map->m_plen = vi->z_idata_size; + if (erofs_blkoff(sb, map->m_pa) + map->m_plen > sb->s_blocksize) { + erofs_err(sb, "invalid tail-packing pclustersize %llu", + map->m_plen); + err = -EFSCORRUPTED; + goto unmap_out; + } } else if (fragment && m.lcn == vi->z_tailextent_headlcn) { map->m_flags |= EROFS_MAP_FRAGMENT; } else { - map->m_pa = erofs_pos(inode->i_sb, m.pblk); + map->m_pa = erofs_pos(sb, m.pblk); err = z_erofs_get_extent_compressedlen(&m, initial_lcn); if (err) goto unmap_out; @@ -511,7 +510,7 @@ static int z_erofs_do_map_blocks(struct inode *inode, afmt = m.headtype == Z_EROFS_LCLUSTER_TYPE_HEAD2 ? 
vi->z_algorithmtype[1] : vi->z_algorithmtype[0]; if (!(EROFS_I_SB(inode)->available_compr_algs & (1 << afmt))) { - erofs_err(inode->i_sb, "inconsistent algorithmtype %u for nid %llu", + erofs_err(sb, "inconsistent algorithmtype %u for nid %llu", afmt, vi->nid); err = -EFSCORRUPTED; goto unmap_out; @@ -535,6 +534,116 @@ unmap_out: return err; } +static int z_erofs_map_blocks_ext(struct inode *inode, + struct erofs_map_blocks *map, int flags) +{ + struct erofs_inode *vi = EROFS_I(inode); + struct super_block *sb = inode->i_sb; + bool interlaced = vi->z_advise & Z_EROFS_ADVISE_INTERLACED_PCLUSTER; + unsigned int recsz = z_erofs_extent_recsize(vi->z_advise); + erofs_off_t pos = round_up(Z_EROFS_MAP_HEADER_END(erofs_iloc(inode) + + vi->inode_isize + vi->xattr_isize), recsz); + erofs_off_t lend = inode->i_size; + erofs_off_t l, r, mid, pa, la, lstart; + struct z_erofs_extent *ext; + unsigned int fmt; + bool last; + + map->m_flags = 0; + if (recsz <= offsetof(struct z_erofs_extent, pstart_hi)) { + if (recsz <= offsetof(struct z_erofs_extent, pstart_lo)) { + ext = erofs_read_metabuf(&map->buf, sb, pos, true); + if (IS_ERR(ext)) + return PTR_ERR(ext); + pa = le64_to_cpu(*(__le64 *)ext); + pos += sizeof(__le64); + lstart = 0; + } else { + lstart = round_down(map->m_la, 1 << vi->z_lclusterbits); + pos += (lstart >> vi->z_lclusterbits) * recsz; + pa = EROFS_NULL_ADDR; + } + + for (; lstart <= map->m_la; lstart += 1 << vi->z_lclusterbits) { + ext = erofs_read_metabuf(&map->buf, sb, pos, true); + if (IS_ERR(ext)) + return PTR_ERR(ext); + map->m_plen = le32_to_cpu(ext->plen); + if (pa != EROFS_NULL_ADDR) { + map->m_pa = pa; + pa += map->m_plen & Z_EROFS_EXTENT_PLEN_MASK; + } else { + map->m_pa = le32_to_cpu(ext->pstart_lo); + } + pos += recsz; + } + last = (lstart >= round_up(lend, 1 << vi->z_lclusterbits)); + lend = min(lstart, lend); + lstart -= 1 << vi->z_lclusterbits; + } else { + lstart = lend; + for (l = 0, r = vi->z_extents; l < r; ) { + mid = l + (r - l) / 2; + ext = erofs_read_metabuf(&map->buf, sb, + pos + mid * recsz, true); + if (IS_ERR(ext)) + return PTR_ERR(ext); + + la = le32_to_cpu(ext->lstart_lo); + pa = le32_to_cpu(ext->pstart_lo) | + (u64)le32_to_cpu(ext->pstart_hi) << 32; + if (recsz > offsetof(struct z_erofs_extent, lstart_hi)) + la |= (u64)le32_to_cpu(ext->lstart_hi) << 32; + + if (la > map->m_la) { + r = mid; + lend = la; + } else { + l = mid + 1; + if (map->m_la == la) + r = min(l + 1, r); + lstart = la; + map->m_plen = le32_to_cpu(ext->plen); + map->m_pa = pa; + } + } + last = (l >= vi->z_extents); + } + + if (lstart < lend) { + map->m_la = lstart; + if (last && (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) { + map->m_flags |= EROFS_MAP_MAPPED | EROFS_MAP_FRAGMENT; + vi->z_fragmentoff = map->m_plen; + if (recsz > offsetof(struct z_erofs_extent, pstart_lo)) + vi->z_fragmentoff |= map->m_pa << 32; + } else if (map->m_plen) { + map->m_flags |= EROFS_MAP_MAPPED | + EROFS_MAP_FULL_MAPPED | EROFS_MAP_ENCODED; + fmt = map->m_plen >> Z_EROFS_EXTENT_PLEN_FMT_BIT; + if (fmt) + map->m_algorithmformat = fmt - 1; + else if (interlaced && !erofs_blkoff(sb, map->m_pa)) + map->m_algorithmformat = + Z_EROFS_COMPRESSION_INTERLACED; + else + map->m_algorithmformat = + Z_EROFS_COMPRESSION_SHIFTED; + if (map->m_plen & Z_EROFS_EXTENT_PLEN_PARTIAL) + map->m_flags |= EROFS_MAP_PARTIAL_REF; + map->m_plen &= Z_EROFS_EXTENT_PLEN_MASK; + } + } + map->m_llen = lend - map->m_la; + if (!last && map->m_llen < sb->s_blocksize) { + erofs_err(sb, "extent too small %llu @ offset %llu of nid %llu", + 
map->m_llen, map->m_la, vi->nid); + DBG_BUGON(1); + return -EFSCORRUPTED; + } + return 0; +} + static int z_erofs_fill_inode_lazy(struct inode *inode) { struct erofs_inode *const vi = EROFS_I(inode); @@ -561,7 +670,7 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) goto out_unlock; pos = ALIGN(erofs_iloc(inode) + vi->inode_isize + vi->xattr_isize, 8); - h = erofs_read_metabuf(&buf, sb, pos, EROFS_KMAP); + h = erofs_read_metabuf(&buf, sb, pos, true); if (IS_ERR(h)) { err = PTR_ERR(h); goto out_unlock; @@ -578,8 +687,20 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) goto done; } vi->z_advise = le16_to_cpu(h->h_advise); + vi->z_lclusterbits = sb->s_blocksize_bits + (h->h_clusterbits & 15); + if (vi->datalayout == EROFS_INODE_COMPRESSED_FULL && + (vi->z_advise & Z_EROFS_ADVISE_EXTENTS)) { + vi->z_extents = le32_to_cpu(h->h_extents_lo) | + ((u64)le16_to_cpu(h->h_extents_hi) << 32); + goto done; + } + vi->z_algorithmtype[0] = h->h_algorithmtype & 15; vi->z_algorithmtype[1] = h->h_algorithmtype >> 4; + if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) + vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff); + else if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) + vi->z_idata_size = le16_to_cpu(h->h_idata_size); headnr = 0; if (vi->z_algorithmtype[0] >= Z_EROFS_COMPRESSION_MAX || @@ -590,7 +711,6 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) goto out_put_metabuf; } - vi->z_logical_clusterbits = sb->s_blocksize_bits + (h->h_clusterbits & 7); if (!erofs_sb_has_big_pcluster(EROFS_SB(sb)) && vi->z_advise & (Z_EROFS_ADVISE_BIG_PCLUSTER_1 | Z_EROFS_ADVISE_BIG_PCLUSTER_2)) { @@ -608,34 +728,13 @@ static int z_erofs_fill_inode_lazy(struct inode *inode) goto out_put_metabuf; } - if (vi->z_advise & Z_EROFS_ADVISE_INLINE_PCLUSTER) { + if (vi->z_idata_size || + (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER)) { struct erofs_map_blocks map = { .buf = __EROFS_BUF_INITIALIZER }; - vi->z_idata_size = le16_to_cpu(h->h_idata_size); - err = z_erofs_do_map_blocks(inode, &map, - EROFS_GET_BLOCKS_FINDTAIL); - erofs_put_metabuf(&map.buf); - - if (!map.m_plen || - erofs_blkoff(sb, map.m_pa) + map.m_plen > sb->s_blocksize) { - erofs_err(sb, "invalid tail-packing pclustersize %llu", - map.m_plen); - err = -EFSCORRUPTED; - } - if (err < 0) - goto out_put_metabuf; - } - - if (vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER && - !(h->h_clusterbits >> Z_EROFS_FRAGMENT_INODE_BIT)) { - struct erofs_map_blocks map = { - .buf = __EROFS_BUF_INITIALIZER - }; - - vi->z_fragmentoff = le32_to_cpu(h->h_fragmentoff); - err = z_erofs_do_map_blocks(inode, &map, + err = z_erofs_map_blocks_fo(inode, &map, EROFS_GET_BLOCKS_FINDTAIL); erofs_put_metabuf(&map.buf); if (err < 0) @@ -666,15 +765,11 @@ int z_erofs_map_blocks_iter(struct inode *inode, struct erofs_map_blocks *map, } else { err = z_erofs_fill_inode_lazy(inode); if (!err) { - if ((vi->z_advise & Z_EROFS_ADVISE_FRAGMENT_PCLUSTER) && - !vi->z_tailextent_headlcn) { - map->m_la = 0; - map->m_llen = inode->i_size; - map->m_flags = EROFS_MAP_MAPPED | - EROFS_MAP_FULL_MAPPED | EROFS_MAP_FRAGMENT; - } else { - err = z_erofs_do_map_blocks(inode, map, flags); - } + if (vi->datalayout == EROFS_INODE_COMPRESSED_FULL && + (vi->z_advise & Z_EROFS_ADVISE_EXTENTS)) + err = z_erofs_map_blocks_ext(inode, map, flags); + else + err = z_erofs_map_blocks_fo(inode, map, flags); } if (!err && (map->m_flags & EROFS_MAP_ENCODED) && unlikely(map->m_plen > Z_EROFS_PCLUSTER_MAX_SIZE || diff --git a/fs/eventfd.c b/fs/eventfd.c index 76129bfcd663..af42b2c7d235 100644 --- 
a/fs/eventfd.c +++ b/fs/eventfd.c @@ -406,14 +406,13 @@ static int do_eventfd(unsigned int count, int flags) if (fd < 0) goto err; - file = anon_inode_getfile("[eventfd]", &eventfd_fops, ctx, flags); + file = anon_inode_getfile_fmode("[eventfd]", &eventfd_fops, + ctx, flags, FMODE_NOWAIT); if (IS_ERR(file)) { put_unused_fd(fd); fd = PTR_ERR(file); goto err; } - - file->f_mode |= FMODE_NOWAIT; fd_install(fd, file); return fd; err: diff --git a/fs/eventpoll.c b/fs/eventpoll.c index 7c0980db77b3..d4dbffdedd08 100644 --- a/fs/eventpoll.c +++ b/fs/eventpoll.c @@ -438,7 +438,7 @@ static bool ep_busy_loop_end(void *p, unsigned long start_time) * * we must do our busy polling with irqs enabled */ -static bool ep_busy_loop(struct eventpoll *ep, int nonblock) +static bool ep_busy_loop(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); u16 budget = READ_ONCE(ep->busy_poll_budget); @@ -447,8 +447,8 @@ static bool ep_busy_loop(struct eventpoll *ep, int nonblock) if (!budget) budget = BUSY_POLL_BUDGET; - if (napi_id >= MIN_NAPI_ID && ep_busy_loop_on(ep)) { - napi_busy_loop(napi_id, nonblock ? NULL : ep_busy_loop_end, + if (napi_id_valid(napi_id) && ep_busy_loop_on(ep)) { + napi_busy_loop(napi_id, ep_busy_loop_end, ep, prefer_busy_poll, budget); if (ep_events_available(ep)) return true; @@ -492,7 +492,7 @@ static inline void ep_set_busy_poll_napi_id(struct epitem *epi) * or * Nothing to do if we already have this ID */ - if (napi_id < MIN_NAPI_ID || napi_id == ep->napi_id) + if (!napi_id_valid(napi_id) || napi_id == ep->napi_id) return; /* record NAPI ID for use in next busy poll */ @@ -546,7 +546,7 @@ static void ep_suspend_napi_irqs(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); - if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll)) + if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll)) napi_suspend_irqs(napi_id); } @@ -554,13 +554,13 @@ static void ep_resume_napi_irqs(struct eventpoll *ep) { unsigned int napi_id = READ_ONCE(ep->napi_id); - if (napi_id >= MIN_NAPI_ID && READ_ONCE(ep->prefer_busy_poll)) + if (napi_id_valid(napi_id) && READ_ONCE(ep->prefer_busy_poll)) napi_resume_irqs(napi_id); } #else -static inline bool ep_busy_loop(struct eventpoll *ep, int nonblock) +static inline bool ep_busy_loop(struct eventpoll *ep) { return false; } @@ -1980,6 +1980,30 @@ static int ep_autoremove_wake_function(struct wait_queue_entry *wq_entry, return ret; } +static int ep_try_send_events(struct eventpoll *ep, + struct epoll_event __user *events, int maxevents) +{ + int res; + + /* + * Try to transfer events to user space. In case we get 0 events and + * there's still timeout left over, we go trying again in search of + * more luck. + */ + res = ep_send_events(ep, events, maxevents); + if (res > 0) + ep_suspend_napi_irqs(ep); + return res; +} + +static int ep_schedule_timeout(ktime_t *to) +{ + if (to) + return ktime_after(*to, ktime_get()); + else + return 1; +} + /** * ep_poll - Retrieves ready events, and delivers them to the caller-supplied * event buffer. @@ -2031,23 +2055,15 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, while (1) { if (eavail) { - /* - * Try to transfer events to user space. In case we get - * 0 events and there's still timeout left over, we go - * trying again in search of more luck. 
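/*
 * [editorial sketch] The eventpoll hunks above swap the open-coded
 * "napi_id >= MIN_NAPI_ID" comparisons for napi_id_valid().  Assuming the
 * helper is simply a named range predicate (a sketch, not a quote of the
 * networking headers), the refactor amounts to:
 */
static inline bool napi_id_valid_sketch(unsigned int napi_id)
{
	return napi_id >= MIN_NAPI_ID;	/* MIN_NAPI_ID as defined by net/busy_poll.h */
}
/*
 * Centralizing the comparison makes the call sites self-describing and keeps
 * the valid-ID range in one place if the numbering scheme ever changes.
 */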
- */ - res = ep_send_events(ep, events, maxevents); - if (res) { - if (res > 0) - ep_suspend_napi_irqs(ep); + res = ep_try_send_events(ep, events, maxevents); + if (res) return res; - } } if (timed_out) return 0; - eavail = ep_busy_loop(ep, timed_out); + eavail = ep_busy_loop(ep); if (eavail) continue; @@ -2096,8 +2112,9 @@ static int ep_poll(struct eventpoll *ep, struct epoll_event __user *events, write_unlock_irq(&ep->lock); if (!eavail) - timed_out = !schedule_hrtimeout_range(to, slack, - HRTIMER_MODE_ABS); + timed_out = !ep_schedule_timeout(to) || + !schedule_hrtimeout_range(to, slack, + HRTIMER_MODE_ABS); __set_current_state(TASK_RUNNING); /* @@ -2445,6 +2462,47 @@ SYSCALL_DEFINE4(epoll_ctl, int, epfd, int, op, int, fd, return do_epoll_ctl(epfd, op, fd, &epds, false); } +static int ep_check_params(struct file *file, struct epoll_event __user *evs, + int maxevents) +{ + /* The maximum number of events must be greater than zero */ + if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) + return -EINVAL; + + /* Verify that the area passed by the user is writeable */ + if (!access_ok(evs, maxevents * sizeof(struct epoll_event))) + return -EFAULT; + + /* + * We have to check that the file structure underneath the fd + * the user passed to us _is_ an eventpoll file. + */ + if (!is_file_epoll(file)) + return -EINVAL; + + return 0; +} + +int epoll_sendevents(struct file *file, struct epoll_event __user *events, + int maxevents) +{ + struct eventpoll *ep; + int ret; + + ret = ep_check_params(file, events, maxevents); + if (unlikely(ret)) + return ret; + + ep = file->private_data; + /* + * Racy call, but that's ok - it should get retried based on + * poll readiness anyway. + */ + if (ep_events_available(ep)) + return ep_try_send_events(ep, events, maxevents); + return 0; +} + /* * Implement the event wait interface for the eventpoll file. It is the kernel * part of the user space epoll_wait(2). @@ -2453,26 +2511,16 @@ static int do_epoll_wait(int epfd, struct epoll_event __user *events, int maxevents, struct timespec64 *to) { struct eventpoll *ep; - - /* The maximum number of events must be greater than zero */ - if (maxevents <= 0 || maxevents > EP_MAX_EVENTS) - return -EINVAL; - - /* Verify that the area passed by the user is writeable */ - if (!access_ok(events, maxevents * sizeof(struct epoll_event))) - return -EFAULT; + int ret; /* Get the "struct file *" for the eventpoll file */ CLASS(fd, f)(epfd); if (fd_empty(f)) return -EBADF; - /* - * We have to check that the file structure underneath the fd - * the user passed to us _is_ an eventpoll file. - */ - if (!is_file_epoll(fd_file(f))) - return -EINVAL; + ret = ep_check_params(fd_file(f), events, maxevents); + if (unlikely(ret)) + return ret; /* * At this point it is safe to assume that the "private_data" contains diff --git a/fs/exec.c b/fs/exec.c index 506cd411f4ac..cfbb2b9ee3c9 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -115,66 +115,6 @@ bool path_noexec(const struct path *path) (path->mnt->mnt_sb->s_iflags & SB_I_NOEXEC); } -#ifdef CONFIG_USELIB -/* - * Note that a shared library must be both readable and executable due to - * security reasons. - * - * Also note that we take the address to load from the file itself.
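/*
 * [editorial sketch] ep_check_params() above factors the argument checks out
 * of do_epoll_wait() so that the new non-blocking epoll_sendevents() entry
 * point can reuse them.  The shape of the pattern, reduced to standalone C
 * with stub types (the eventpoll internals and access_ok() are assumed away):
 */
#include <errno.h>
#include <limits.h>
#include <stddef.h>

struct event { unsigned int mask; unsigned long long data; };
#define MAX_EVENTS ((int)(INT_MAX / sizeof(struct event)))

static int events_ready;	/* stand-in for ep_events_available() */

static int send_events_stub(struct event *evs, int maxevents)
{
	(void)evs;
	(void)maxevents;
	return 1;	/* pretend one event was delivered */
}

static int check_params(const struct event *evs, int maxevents)
{
	if (maxevents <= 0 || maxevents > MAX_EVENTS)
		return -EINVAL;
	if (!evs)	/* stand-in for the kernel's access_ok() check */
		return -EFAULT;
	return 0;
}

/* Validate first, then transfer only if something is ready right now. */
static int try_sendevents(struct event *evs, int maxevents)
{
	int ret = check_params(evs, maxevents);

	if (ret)
		return ret;
	if (!events_ready)	/* racy by design: callers retry on poll readiness */
		return 0;
	return send_events_stub(evs, maxevents);
}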
- */ -SYSCALL_DEFINE1(uselib, const char __user *, library) -{ - struct linux_binfmt *fmt; - struct file *file; - struct filename *tmp = getname(library); - int error = PTR_ERR(tmp); - static const struct open_flags uselib_flags = { - .open_flag = O_LARGEFILE | O_RDONLY, - .acc_mode = MAY_READ | MAY_EXEC, - .intent = LOOKUP_OPEN, - .lookup_flags = LOOKUP_FOLLOW, - }; - - if (IS_ERR(tmp)) - goto out; - - file = do_filp_open(AT_FDCWD, tmp, &uselib_flags); - putname(tmp); - error = PTR_ERR(file); - if (IS_ERR(file)) - goto out; - - /* - * Check do_open_execat() for an explanation. - */ - error = -EACCES; - if (WARN_ON_ONCE(!S_ISREG(file_inode(file)->i_mode)) || - path_noexec(&file->f_path)) - goto exit; - - error = -ENOEXEC; - - read_lock(&binfmt_lock); - list_for_each_entry(fmt, &formats, lh) { - if (!fmt->load_shlib) - continue; - if (!try_module_get(fmt->module)) - continue; - read_unlock(&binfmt_lock); - error = fmt->load_shlib(file); - read_lock(&binfmt_lock); - put_binfmt(fmt); - if (error != -ENOEXEC) - break; - } - read_unlock(&binfmt_lock); -exit: - fput(file); -out: - return error; -} -#endif /* #ifdef CONFIG_USELIB */ - #ifdef CONFIG_MMU /* * The nascent bprm->mm is not visible until exec_mmap() but it can @@ -755,8 +695,6 @@ int setup_arg_pages(struct linux_binprm *bprm, mm->arg_start = bprm->p; #endif - if (bprm->loader) - bprm->loader -= stack_shift; bprm->exec -= stack_shift; if (mmap_write_lock_killable(mm)) @@ -1229,13 +1167,12 @@ int begin_new_exec(struct linux_binprm * bprm) */ bprm->point_of_no_return = true; - /* - * Make this the only thread in the thread group. - */ + /* Make this the only thread in the thread group */ retval = de_thread(me); if (retval) goto out; - + /* see the comment in check_unsafe_exec() */ + current->fs->in_exec = 0; /* * Cancel any io_uring activity across execve */ @@ -1497,6 +1434,8 @@ static void free_bprm(struct linux_binprm *bprm) } free_arg_pages(bprm); if (bprm->cred) { + /* in case exec fails before de_thread() succeeds */ + current->fs->in_exec = 0; mutex_unlock(¤t->signal->cred_guard_mutex); abort_creds(bprm->cred); } @@ -1618,6 +1557,10 @@ static void check_unsafe_exec(struct linux_binprm *bprm) * suid exec because the differently privileged task * will be able to manipulate the current directory, etc. * It would be nice to force an unshare instead... + * + * Otherwise we set fs->in_exec = 1 to deny clone(CLONE_FS) + * from another sub-thread until de_thread() succeeds, this + * state is protected by cred_guard_mutex we hold. 
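/*
 * [editorial sketch] The exec.c changes above narrow the fs->in_exec window:
 * the flag is now cleared right after de_thread() succeeds, or in free_bprm()
 * if the exec fails earlier.  For context, the consumer side looks roughly
 * like the following, modeled loosely on copy_fs() in kernel/fork.c
 * (paraphrased from memory, not a verbatim quote): a CLONE_FS clone is
 * refused while a sibling thread is mid-exec.
 */
static int copy_fs_sketch(unsigned long clone_flags, struct fs_struct *fs)
{
	if (clone_flags & CLONE_FS) {
		spin_lock(&fs->lock);
		if (fs->in_exec) {	/* suid exec in flight: sharing is unsafe */
			spin_unlock(&fs->lock);
			return -EAGAIN;
		}
		fs->users++;
		spin_unlock(&fs->lock);
		return 0;
	}
	/* !CLONE_FS: the child gets its own copy of fs_struct (elided) */
	return 0;
}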
*/ n_fs = 1; spin_lock(&p->fs->lock); @@ -1861,10 +1804,9 @@ static int bprm_execve(struct linux_binprm *bprm) goto out; sched_mm_cid_after_execve(current); + rseq_execve(current); /* execve succeeded */ - current->fs->in_exec = 0; current->in_execve = 0; - rseq_execve(current); user_events_execve(current); acct_update_integrals(current); task_numa_free(current, false); @@ -1881,7 +1823,7 @@ out: force_fatal_sig(SIGSEGV); sched_mm_cid_after_execve(current); - current->fs->in_exec = 0; + rseq_set_notify_resume(current); current->in_execve = 0; return retval; diff --git a/fs/exfat/balloc.c b/fs/exfat/balloc.c index ce9be95c9172..cc01556c9d9b 100644 --- a/fs/exfat/balloc.c +++ b/fs/exfat/balloc.c @@ -141,36 +141,28 @@ int exfat_set_bitmap(struct inode *inode, unsigned int clu, bool sync) return 0; } -void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync) +int exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync) { int i, b; unsigned int ent_idx; struct super_block *sb = inode->i_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); - struct exfat_mount_options *opts = &sbi->options; if (!is_valid_cluster(sbi, clu)) - return; + return -EIO; ent_idx = CLUSTER_TO_BITMAP_ENT(clu); i = BITMAP_OFFSET_SECTOR_INDEX(sb, ent_idx); b = BITMAP_OFFSET_BIT_IN_SECTOR(sb, ent_idx); - clear_bit_le(b, sbi->vol_amap[i]->b_data); - exfat_update_bh(sbi->vol_amap[i], sync); + if (!test_bit_le(b, sbi->vol_amap[i]->b_data)) + return -EIO; - if (opts->discard) { - int ret_discard; + clear_bit_le(b, sbi->vol_amap[i]->b_data); - ret_discard = sb_issue_discard(sb, - exfat_cluster_to_sector(sbi, clu), - (1 << sbi->sect_per_clus_bits), GFP_NOFS, 0); + exfat_update_bh(sbi->vol_amap[i], sync); - if (ret_discard == -EOPNOTSUPP) { - exfat_err(sb, "discard not supported by device, disabling"); - opts->discard = 0; - } - } + return 0; } /* diff --git a/fs/exfat/exfat_fs.h b/fs/exfat/exfat_fs.h index 78be6964a8a0..f8ead4d47ef0 100644 --- a/fs/exfat/exfat_fs.h +++ b/fs/exfat/exfat_fs.h @@ -14,8 +14,6 @@ #define EXFAT_ROOT_INO 1 -#define EXFAT_CLUSTERS_UNTRACKED (~0u) - /* * exfat error flags */ @@ -456,7 +454,7 @@ int exfat_count_num_clusters(struct super_block *sb, int exfat_load_bitmap(struct super_block *sb); void exfat_free_bitmap(struct exfat_sb_info *sbi); int exfat_set_bitmap(struct inode *inode, unsigned int clu, bool sync); -void exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync); +int exfat_clear_bitmap(struct inode *inode, unsigned int clu, bool sync); unsigned int exfat_find_free_bitmap(struct super_block *sb, unsigned int clu); int exfat_count_used_clusters(struct super_block *sb, unsigned int *ret_count); int exfat_trim_fs(struct inode *inode, struct fstrim_range *range); diff --git a/fs/exfat/fatent.c b/fs/exfat/fatent.c index 9e5492ac409b..23065f948ae7 100644 --- a/fs/exfat/fatent.c +++ b/fs/exfat/fatent.c @@ -144,6 +144,20 @@ int exfat_chain_cont_cluster(struct super_block *sb, unsigned int chain, return 0; } +static inline void exfat_discard_cluster(struct super_block *sb, + unsigned int clu, unsigned int num_clusters) +{ + int ret; + struct exfat_sb_info *sbi = EXFAT_SB(sb); + + ret = sb_issue_discard(sb, exfat_cluster_to_sector(sbi, clu), + sbi->sect_per_clus * num_clusters, GFP_NOFS, 0); + if (ret == -EOPNOTSUPP) { + exfat_err(sb, "discard not supported by device, disabling"); + sbi->options.discard = 0; + } +} + /* This function must be called with bitmap_lock held */ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain) { @@ -175,6 
+189,7 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain BITMAP_OFFSET_SECTOR_INDEX(sb, CLUSTER_TO_BITMAP_ENT(clu)); if (p_chain->flags == ALLOC_NO_FAT_CHAIN) { + int err; unsigned int last_cluster = p_chain->dir + p_chain->size - 1; do { bool sync = false; @@ -189,11 +204,18 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain cur_cmap_i = next_cmap_i; } - exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode))); + err = exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode))); + if (err) + break; clu++; num_clusters++; } while (num_clusters < p_chain->size); + + if (sbi->options.discard) + exfat_discard_cluster(sb, p_chain->dir, p_chain->size); } else { + unsigned int nr_clu = 1; + do { bool sync = false; unsigned int n_clu = clu; @@ -210,12 +232,23 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain cur_cmap_i = next_cmap_i; } - exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode))); + if (exfat_clear_bitmap(inode, clu, (sync && IS_DIRSYNC(inode)))) + break; + + if (sbi->options.discard) { + if (n_clu == clu + 1) + nr_clu++; + else { + exfat_discard_cluster(sb, clu - nr_clu + 1, nr_clu); + nr_clu = 1; + } + } + clu = n_clu; num_clusters++; if (err) - goto dec_used_clus; + break; if (num_clusters >= sbi->num_clusters - EXFAT_FIRST_CLUSTER) { /* @@ -229,7 +262,6 @@ static int __exfat_free_cluster(struct inode *inode, struct exfat_chain *p_chain } while (clu != EXFAT_EOF_CLUSTER); } -dec_used_clus: sbi->used_clusters -= num_clusters; return 0; } @@ -262,7 +294,7 @@ int exfat_find_last_cluster(struct super_block *sb, struct exfat_chain *p_chain, clu = next; if (exfat_ent_get(sb, clu, &next)) return -EIO; - } while (next != EXFAT_EOF_CLUSTER); + } while (next != EXFAT_EOF_CLUSTER && count <= p_chain->size); if (p_chain->size != count) { exfat_fs_error(sb, diff --git a/fs/exfat/file.c b/fs/exfat/file.c index 05b51e721783..841a5b18e3df 100644 --- a/fs/exfat/file.c +++ b/fs/exfat/file.c @@ -582,12 +582,15 @@ static ssize_t exfat_file_write_iter(struct kiocb *iocb, struct iov_iter *iter) loff_t pos = iocb->ki_pos; loff_t valid_size; + if (unlikely(exfat_forced_shutdown(inode->i_sb))) + return -EIO; + inode_lock(inode); valid_size = ei->valid_size; ret = generic_write_checks(iocb, iter); - if (ret < 0) + if (ret <= 0) goto unlock; if (iocb->ki_flags & IOCB_DIRECT) { @@ -635,6 +638,16 @@ unlock: return ret; } +static ssize_t exfat_file_read_iter(struct kiocb *iocb, struct iov_iter *iter) +{ + struct inode *inode = file_inode(iocb->ki_filp); + + if (unlikely(exfat_forced_shutdown(inode->i_sb))) + return -EIO; + + return generic_file_read_iter(iocb, iter); +} + static vm_fault_t exfat_page_mkwrite(struct vm_fault *vmf) { int err; @@ -672,14 +685,26 @@ static const struct vm_operations_struct exfat_file_vm_ops = { static int exfat_file_mmap(struct file *file, struct vm_area_struct *vma) { + if (unlikely(exfat_forced_shutdown(file_inode(file)->i_sb))) + return -EIO; + file_accessed(file); vma->vm_ops = &exfat_file_vm_ops; return 0; } +static ssize_t exfat_splice_read(struct file *in, loff_t *ppos, + struct pipe_inode_info *pipe, size_t len, unsigned int flags) +{ + if (unlikely(exfat_forced_shutdown(file_inode(in)->i_sb))) + return -EIO; + + return filemap_splice_read(in, ppos, pipe, len, flags); +} + const struct file_operations exfat_file_operations = { .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, + .read_iter = exfat_file_read_iter, .write_iter = 
exfat_file_write_iter, .unlocked_ioctl = exfat_ioctl, #ifdef CONFIG_COMPAT @@ -687,7 +712,7 @@ const struct file_operations exfat_file_operations = { #endif .mmap = exfat_file_mmap, .fsync = exfat_file_fsync, - .splice_read = filemap_splice_read, + .splice_read = exfat_splice_read, .splice_write = iter_file_splice_write, }; diff --git a/fs/exfat/inode.c b/fs/exfat/inode.c index 96952d4acb50..b22c02d6000f 100644 --- a/fs/exfat/inode.c +++ b/fs/exfat/inode.c @@ -274,9 +274,11 @@ static int exfat_get_block(struct inode *inode, sector_t iblock, sector_t last_block; sector_t phys = 0; sector_t valid_blks; + loff_t i_size; mutex_lock(&sbi->s_lock); - last_block = EXFAT_B_TO_BLK_ROUND_UP(i_size_read(inode), sb); + i_size = i_size_read(inode); + last_block = EXFAT_B_TO_BLK_ROUND_UP(i_size, sb); if (iblock >= last_block && !create) goto done; @@ -305,77 +307,99 @@ static int exfat_get_block(struct inode *inode, sector_t iblock, if (buffer_delay(bh_result)) clear_buffer_delay(bh_result); - if (create) { + /* + * In most cases, we just need to set bh_result to mapped, unmapped + * or new status as follows: + * 1. i_size == valid_size + * 2. write case (create == 1) + * 3. direct_read (!bh_result->b_folio) + * -> the unwritten part will be zeroed in exfat_direct_IO() + * + * Otherwise, in the case of buffered read, it is necessary to take + * care of the last nested block if valid_size is not equal to i_size. + */ + if (i_size == ei->valid_size || create || !bh_result->b_folio) valid_blks = EXFAT_B_TO_BLK_ROUND_UP(ei->valid_size, sb); + else + valid_blks = EXFAT_B_TO_BLK(ei->valid_size, sb); - if (iblock + max_blocks < valid_blks) { - /* The range has been written, map it */ - goto done; - } else if (iblock < valid_blks) { - /* - * The range has been partially written, - * map the written part. - */ - max_blocks = valid_blks - iblock; - goto done; - } + /* The range has been fully written, map it */ + if (iblock + max_blocks < valid_blks) + goto done; - /* The area has not been written, map and mark as new. */ - set_buffer_new(bh_result); + /* The range has been partially written, map the written part */ + if (iblock < valid_blks) { + max_blocks = valid_blks - iblock; + goto done; + } + /* The area has not been written, map and mark as new for create case */ + if (create) { + set_buffer_new(bh_result); ei->valid_size = EXFAT_BLK_TO_B(iblock + max_blocks, sb); mark_inode_dirty(inode); - } else { - valid_blks = EXFAT_B_TO_BLK(ei->valid_size, sb); + goto done; + } - if (iblock + max_blocks < valid_blks) { - /* The range has been written, map it */ + /* + * The area has just one block partially written. + * In that case, we should read and fill the unwritten part of + * a block with zero. + */ + if (bh_result->b_folio && iblock == valid_blks && + (ei->valid_size & (sb->s_blocksize - 1))) { + loff_t size, pos; + void *addr; + + max_blocks = 1; + + /* + * No buffer_head is allocated. + * (1) bmap: It's enough to set blocknr without I/O. + * (2) read: The unwritten part should be filled with zero. + * If a folio does not have any buffers, + * let's return -EAGAIN to fall back to + * block_read_full_folio() for per-bh IO. + */ + if (!folio_buffers(bh_result->b_folio)) { + err = -EAGAIN; goto done; - } else if (iblock < valid_blks) { - /* - * The area has been partially written, - * map the written part.
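/*
 * [editorial sketch] The buffered-read branch above reads the single
 * partially-valid block and then clears everything past valid_size.  The
 * arithmetic is the subtle part, so here it is isolated as a hypothetical
 * helper in plain C:
 */
#include <string.h>

/*
 * blk:        one block of data already read from disk
 * blksize:    block size in bytes
 * blk_pos:    byte offset of this block within the file
 * valid_size: number of bytes of the file actually written so far
 */
static void zero_block_tail(unsigned char *blk, size_t blksize,
			    unsigned long long blk_pos,
			    unsigned long long valid_size)
{
	unsigned long long valid;

	if (valid_size <= blk_pos)
		return;	/* no valid bytes here; the caller zeroes the whole block */
	valid = valid_size - blk_pos;
	if (valid < blksize)
		memset(blk + valid, 0, (size_t)(blksize - valid));
}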
- */ - max_blocks = valid_blks - iblock; + } + + pos = EXFAT_BLK_TO_B(iblock, sb); + size = ei->valid_size - pos; + addr = folio_address(bh_result->b_folio) + + offset_in_folio(bh_result->b_folio, pos); + + /* Check if bh->b_data points to proper addr in folio */ + if (bh_result->b_data != addr) { + exfat_fs_error_ratelimit(sb, + "b_data(%p) != folio_addr(%p)", + bh_result->b_data, addr); + err = -EINVAL; goto done; - } else if (iblock == valid_blks && - (ei->valid_size & (sb->s_blocksize - 1))) { - /* - * The block has been partially written, - * zero the unwritten part and map the block. - */ - loff_t size, off, pos; - - max_blocks = 1; - - /* - * For direct read, the unwritten part will be zeroed in - * exfat_direct_IO() - */ - if (!bh_result->b_folio) - goto done; - - pos = EXFAT_BLK_TO_B(iblock, sb); - size = ei->valid_size - pos; - off = pos & (PAGE_SIZE - 1); - - folio_set_bh(bh_result, bh_result->b_folio, off); - err = bh_read(bh_result, 0); - if (err < 0) - goto unlock_ret; - - folio_zero_segment(bh_result->b_folio, off + size, - off + sb->s_blocksize); - } else { - /* - * The range has not been written, clear the mapped flag - * to only zero the cache and do not read from disk. - */ - clear_buffer_mapped(bh_result); } + + /* Read a block */ + err = bh_read(bh_result, 0); + if (err < 0) + goto done; + + /* Zero unwritten part of a block */ + memset(bh_result->b_data + size, 0, bh_result->b_size - size); + err = 0; + goto done; } + + /* + * The area has not been written, clear mapped for read/bmap cases. + * If so, it will be filled with zero without reading from disk. + */ + clear_buffer_mapped(bh_result); done: bh_result->b_size = EXFAT_BLK_TO_B(max_blocks, sb); + if (err < 0) + clear_buffer_mapped(bh_result); unlock_ret: mutex_unlock(&sbi->s_lock); return err; diff --git a/fs/exfat/namei.c b/fs/exfat/namei.c index 691dd77b6ab5..fede0283d6e2 100644 --- a/fs/exfat/namei.c +++ b/fs/exfat/namei.c @@ -232,7 +232,7 @@ static int exfat_search_empty_slot(struct super_block *sb, dentry = 0; } - while (dentry + num_entries < total_entries && + while (dentry + num_entries <= total_entries && clu.dir != EXFAT_EOF_CLUSTER) { i = dentry & (dentries_per_clu - 1); @@ -646,6 +646,11 @@ static int exfat_find(struct inode *dir, struct qstr *qname, info->valid_size = le64_to_cpu(ep2->dentry.stream.valid_size); info->size = le64_to_cpu(ep2->dentry.stream.size); + if (unlikely(EXFAT_B_TO_CLU_ROUND_UP(info->size, sbi) > sbi->used_clusters)) { + exfat_fs_error(sb, "data size is invalid(%lld)", info->size); + return -EIO; + } + info->start_clu = le32_to_cpu(ep2->dentry.stream.start_clu); if (!is_valid_cluster(sbi, info->start_clu) && info->size) { exfat_warn(sb, "start_clu is invalid cluster(0x%x)", @@ -835,8 +840,8 @@ unlock: return err; } -static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; struct inode *inode; @@ -846,7 +851,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, loff_t size = i_size_read(dir); if (unlikely(exfat_forced_shutdown(sb))) - return -EIO; + return ERR_PTR(-EIO); mutex_lock(&EXFAT_SB(sb)->s_lock); exfat_set_volume_dirty(sb); @@ -877,7 +882,7 @@ static int exfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, unlock: mutex_unlock(&EXFAT_SB(sb)->s_lock); - return err; + return ERR_PTR(err); } static int exfat_check_dir_empty(struct super_block *sb, 
diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c index d47896a89596..1729bf42eb51 100644 --- a/fs/exfat/nls.c +++ b/fs/exfat/nls.c @@ -801,4 +801,5 @@ load_default: void exfat_free_upcase_table(struct exfat_sb_info *sbi) { kvfree(sbi->vol_utbl); + sbi->vol_utbl = NULL; } diff --git a/fs/exfat/super.c b/fs/exfat/super.c index bd57844414aa..7ed858937d45 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -36,46 +36,18 @@ static void exfat_put_super(struct super_block *sb) struct exfat_sb_info *sbi = EXFAT_SB(sb); mutex_lock(&sbi->s_lock); + exfat_clear_volume_dirty(sb); exfat_free_bitmap(sbi); brelse(sbi->boot_bh); mutex_unlock(&sbi->s_lock); } -static int exfat_sync_fs(struct super_block *sb, int wait) -{ - struct exfat_sb_info *sbi = EXFAT_SB(sb); - int err = 0; - - if (unlikely(exfat_forced_shutdown(sb))) - return 0; - - if (!wait) - return 0; - - /* If there are some dirty buffers in the bdev inode */ - mutex_lock(&sbi->s_lock); - sync_blockdev(sb->s_bdev); - if (exfat_clear_volume_dirty(sb)) - err = -EIO; - mutex_unlock(&sbi->s_lock); - return err; -} - static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; struct exfat_sb_info *sbi = EXFAT_SB(sb); unsigned long long id = huge_encode_dev(sb->s_bdev->bd_dev); - if (sbi->used_clusters == EXFAT_CLUSTERS_UNTRACKED) { - mutex_lock(&sbi->s_lock); - if (exfat_count_used_clusters(sb, &sbi->used_clusters)) { - mutex_unlock(&sbi->s_lock); - return -EIO; - } - mutex_unlock(&sbi->s_lock); - } - buf->f_type = sb->s_magic; buf->f_bsize = sbi->cluster_size; buf->f_blocks = sbi->num_clusters - 2; /* clu 0 & 1 */ @@ -228,7 +200,6 @@ static const struct super_operations exfat_sops = { .write_inode = exfat_write_inode, .evict_inode = exfat_evict_inode, .put_super = exfat_put_super, - .sync_fs = exfat_sync_fs, .statfs = exfat_statfs, .show_options = exfat_show_options, .shutdown = exfat_shutdown, @@ -531,7 +502,6 @@ static int exfat_read_boot_sector(struct super_block *sb) sbi->vol_flags = le16_to_cpu(p_boot->vol_flags); sbi->vol_flags_persistent = sbi->vol_flags & (VOLUME_DIRTY | MEDIA_FAILURE); sbi->clu_srch_ptr = EXFAT_FIRST_CLUSTER; - sbi->used_clusters = EXFAT_CLUSTERS_UNTRACKED; /* check consistencies */ if ((u64)sbi->num_FAT_sectors << p_boot->sect_size_bits < @@ -761,10 +731,14 @@ static void exfat_free(struct fs_context *fc) static int exfat_reconfigure(struct fs_context *fc) { + struct super_block *sb = fc->root->d_sb; fc->sb_flags |= SB_NODIRATIME; - /* volume flag will be updated in exfat_sync_fs */ - sync_filesystem(fc->root->d_sb); + sync_filesystem(sb); + mutex_lock(&EXFAT_SB(sb)->s_lock); + exfat_clear_volume_dirty(sb); + mutex_unlock(&EXFAT_SB(sb)->s_lock); + return 0; } diff --git a/fs/exportfs/expfs.c b/fs/exportfs/expfs.c index 0c899cfba578..cdefea17986a 100644 --- a/fs/exportfs/expfs.c +++ b/fs/exportfs/expfs.c @@ -126,10 +126,8 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, int err; parent = ERR_PTR(-EACCES); - inode_lock(dentry->d_inode); if (mnt->mnt_sb->s_export_op->get_parent) parent = mnt->mnt_sb->s_export_op->get_parent(dentry); - inode_unlock(dentry->d_inode); if (IS_ERR(parent)) { dprintk("get_parent of %lu failed, err %ld\n", @@ -145,7 +143,7 @@ static struct dentry *reconnect_one(struct vfsmount *mnt, if (err) goto out_err; dprintk("%s: found name: %s\n", __func__, nbuf); - tmp = lookup_one_unlocked(mnt_idmap(mnt), nbuf, parent, strlen(nbuf)); + tmp = lookup_one_unlocked(mnt_idmap(mnt), &QSTR(nbuf), parent); if (IS_ERR(tmp)) { dprintk("lookup failed: %ld\n", 
PTR_ERR(tmp)); err = PTR_ERR(tmp); @@ -286,6 +284,7 @@ static int get_name(const struct path *path, char *name, struct dentry *child) }; struct getdents_callback buffer = { .ctx.actor = filldir_one, + .ctx.count = INT_MAX, .name = name, }; @@ -551,8 +550,7 @@ exportfs_decode_fh_raw(struct vfsmount *mnt, struct fid *fid, int fh_len, } inode_lock(target_dir->d_inode); - nresult = lookup_one(mnt_idmap(mnt), nbuf, - target_dir, strlen(nbuf)); + nresult = lookup_one(mnt_idmap(mnt), &QSTR(nbuf), target_dir); if (!IS_ERR(nresult)) { if (unlikely(nresult->d_inode != result->d_inode)) { dput(nresult); @@ -610,4 +608,5 @@ struct dentry *exportfs_decode_fh(struct vfsmount *mnt, struct fid *fid, } EXPORT_SYMBOL_GPL(exportfs_decode_fh); +MODULE_DESCRIPTION("Code mapping from inodes to file handles"); MODULE_LICENSE("GPL"); diff --git a/fs/ext2/ext2.h b/fs/ext2/ext2.h index f38bdd46e4f7..4025f875252a 100644 --- a/fs/ext2/ext2.h +++ b/fs/ext2/ext2.h @@ -368,6 +368,7 @@ struct ext2_inode { #define EXT2_MOUNT_ERRORS_CONT 0x000010 /* Continue on errors */ #define EXT2_MOUNT_ERRORS_RO 0x000020 /* Remount fs ro on errors */ #define EXT2_MOUNT_ERRORS_PANIC 0x000040 /* Panic on errors */ +#define EXT2_MOUNT_ERRORS_MASK 0x000070 #define EXT2_MOUNT_MINIX_DF 0x000080 /* Mimics the Minix statfs */ #define EXT2_MOUNT_NOBH 0x000100 /* No buffer_heads */ #define EXT2_MOUNT_NO_UID32 0x000200 /* Disable 32-bit UIDs */ diff --git a/fs/ext2/namei.c b/fs/ext2/namei.c index 8346ab9534c1..bde617a66cec 100644 --- a/fs/ext2/namei.c +++ b/fs/ext2/namei.c @@ -225,15 +225,16 @@ static int ext2_link (struct dentry * old_dentry, struct inode * dir, return err; } -static int ext2_mkdir(struct mnt_idmap * idmap, - struct inode * dir, struct dentry * dentry, umode_t mode) +static struct dentry *ext2_mkdir(struct mnt_idmap * idmap, + struct inode * dir, struct dentry * dentry, + umode_t mode) { struct inode * inode; int err; err = dquot_initialize(dir); if (err) - return err; + return ERR_PTR(err); inode_inc_link_count(dir); @@ -258,7 +259,7 @@ static int ext2_mkdir(struct mnt_idmap * idmap, d_instantiate_new(dentry, inode); out: - return err; + return ERR_PTR(err); out_fail: inode_dec_link_count(inode); diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 37f7ce56adce..28ff47ec4be6 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -23,7 +23,8 @@ #include <linux/slab.h> #include <linux/init.h> #include <linux/blkdev.h> -#include <linux/parser.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/random.h> #include <linux/buffer_head.h> #include <linux/exportfs.h> @@ -40,7 +41,6 @@ #include "acl.h" static void ext2_write_super(struct super_block *sb); -static int ext2_remount (struct super_block * sb, int * flags, char * data); static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf); static int ext2_sync_fs(struct super_block *sb, int wait); static int ext2_freeze(struct super_block *sb); @@ -81,6 +81,33 @@ void ext2_error(struct super_block *sb, const char *function, } } +static void ext2_msg_fc(struct fs_context *fc, const char *prefix, + const char *fmt, ...) 
+{ + struct va_format vaf; + va_list args; + const char *s_id; + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + s_id = fc->root->d_sb->s_id; + } else { + /* get last path component of source */ + s_id = strrchr(fc->source, '/'); + if (s_id) + s_id++; + else + s_id = fc->source; + } + va_start(args, fmt); + + vaf.fmt = fmt; + vaf.va = &args; + + printk("%sEXT2-fs (%s): %pV\n", prefix, s_id, &vaf); + + va_end(args); +} + void ext2_msg(struct super_block *sb, const char *prefix, const char *fmt, ...) { @@ -346,7 +373,6 @@ static const struct super_operations ext2_sops = { .freeze_fs = ext2_freeze, .unfreeze_fs = ext2_unfreeze, .statfs = ext2_statfs, - .remount_fs = ext2_remount, .show_options = ext2_show_options, #ifdef CONFIG_QUOTA .quota_read = ext2_quota_read, @@ -402,230 +428,217 @@ static const struct export_operations ext2_export_ops = { .get_parent = ext2_get_parent, }; -static unsigned long get_sb_block(void **data) -{ - unsigned long sb_block; - char *options = (char *) *data; - - if (!options || strncmp(options, "sb=", 3) != 0) - return 1; /* Default location */ - options += 3; - sb_block = simple_strtoul(options, &options, 0); - if (*options && *options != ',') { - printk("EXT2-fs: Invalid sb specification: %s\n", - (char *) *data); - return 1; - } - if (*options == ',') - options++; - *data = (void *) options; - return sb_block; -} - enum { - Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, - Opt_resgid, Opt_resuid, Opt_sb, Opt_err_cont, Opt_err_panic, - Opt_err_ro, Opt_nouid32, Opt_debug, - Opt_oldalloc, Opt_orlov, Opt_nobh, Opt_user_xattr, Opt_nouser_xattr, - Opt_acl, Opt_noacl, Opt_xip, Opt_dax, Opt_ignore, Opt_err, Opt_quota, - Opt_usrquota, Opt_grpquota, Opt_reservation, Opt_noreservation + Opt_bsd_df, Opt_minix_df, Opt_grpid, Opt_nogrpid, Opt_resgid, Opt_resuid, + Opt_sb, Opt_errors, Opt_nouid32, Opt_debug, Opt_oldalloc, Opt_orlov, + Opt_nobh, Opt_user_xattr, Opt_acl, Opt_xip, Opt_dax, Opt_ignore, + Opt_quota, Opt_usrquota, Opt_grpquota, Opt_reservation, }; -static const match_table_t tokens = { - {Opt_bsd_df, "bsddf"}, - {Opt_minix_df, "minixdf"}, - {Opt_grpid, "grpid"}, - {Opt_grpid, "bsdgroups"}, - {Opt_nogrpid, "nogrpid"}, - {Opt_nogrpid, "sysvgroups"}, - {Opt_resgid, "resgid=%u"}, - {Opt_resuid, "resuid=%u"}, - {Opt_sb, "sb=%u"}, - {Opt_err_cont, "errors=continue"}, - {Opt_err_panic, "errors=panic"}, - {Opt_err_ro, "errors=remount-ro"}, - {Opt_nouid32, "nouid32"}, - {Opt_debug, "debug"}, - {Opt_oldalloc, "oldalloc"}, - {Opt_orlov, "orlov"}, - {Opt_nobh, "nobh"}, - {Opt_user_xattr, "user_xattr"}, - {Opt_nouser_xattr, "nouser_xattr"}, - {Opt_acl, "acl"}, - {Opt_noacl, "noacl"}, - {Opt_xip, "xip"}, - {Opt_dax, "dax"}, - {Opt_grpquota, "grpquota"}, - {Opt_ignore, "noquota"}, - {Opt_quota, "quota"}, - {Opt_usrquota, "usrquota"}, - {Opt_reservation, "reservation"}, - {Opt_noreservation, "noreservation"}, - {Opt_err, NULL} +static const struct constant_table ext2_param_errors[] = { + {"continue", EXT2_MOUNT_ERRORS_CONT}, + {"panic", EXT2_MOUNT_ERRORS_PANIC}, + {"remount-ro", EXT2_MOUNT_ERRORS_RO}, + {} +}; + +static const struct fs_parameter_spec ext2_param_spec[] = { + fsparam_flag ("bsddf", Opt_bsd_df), + fsparam_flag ("minixdf", Opt_minix_df), + fsparam_flag ("grpid", Opt_grpid), + fsparam_flag ("bsdgroups", Opt_grpid), + fsparam_flag ("nogrpid", Opt_nogrpid), + fsparam_flag ("sysvgroups", Opt_nogrpid), + fsparam_gid ("resgid", Opt_resgid), + fsparam_uid ("resuid", Opt_resuid), + fsparam_u32 ("sb", Opt_sb), + fsparam_enum ("errors", Opt_errors, ext2_param_errors), + 
fsparam_flag ("nouid32", Opt_nouid32), + fsparam_flag ("debug", Opt_debug), + fsparam_flag ("oldalloc", Opt_oldalloc), + fsparam_flag ("orlov", Opt_orlov), + fsparam_flag ("nobh", Opt_nobh), + fsparam_flag_no ("user_xattr", Opt_user_xattr), + fsparam_flag_no ("acl", Opt_acl), + fsparam_flag ("xip", Opt_xip), + fsparam_flag ("dax", Opt_dax), + fsparam_flag ("grpquota", Opt_grpquota), + fsparam_flag ("noquota", Opt_ignore), + fsparam_flag ("quota", Opt_quota), + fsparam_flag ("usrquota", Opt_usrquota), + fsparam_flag_no ("reservation", Opt_reservation), + {} +}; + +#define EXT2_SPEC_s_resuid (1 << 0) +#define EXT2_SPEC_s_resgid (1 << 1) + +struct ext2_fs_context { + unsigned long vals_s_flags; /* Bits to set in s_flags */ + unsigned long mask_s_flags; /* Bits changed in s_flags */ + unsigned int vals_s_mount_opt; + unsigned int mask_s_mount_opt; + kuid_t s_resuid; + kgid_t s_resgid; + unsigned long s_sb_block; + unsigned int spec; + }; -static int parse_options(char *options, struct super_block *sb, - struct ext2_mount_options *opts) +static inline void ctx_set_mount_opt(struct ext2_fs_context *ctx, + unsigned long flag) { - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - kuid_t uid; - kgid_t gid; - - if (!options) - return 1; - - while ((p = strsep (&options, ",")) != NULL) { - int token; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_bsd_df: - clear_opt (opts->s_mount_opt, MINIX_DF); - break; - case Opt_minix_df: - set_opt (opts->s_mount_opt, MINIX_DF); - break; - case Opt_grpid: - set_opt (opts->s_mount_opt, GRPID); - break; - case Opt_nogrpid: - clear_opt (opts->s_mount_opt, GRPID); - break; - case Opt_resuid: - if (match_int(&args[0], &option)) - return 0; - uid = make_kuid(current_user_ns(), option); - if (!uid_valid(uid)) { - ext2_msg(sb, KERN_ERR, "Invalid uid value %d", option); - return 0; - - } - opts->s_resuid = uid; - break; - case Opt_resgid: - if (match_int(&args[0], &option)) - return 0; - gid = make_kgid(current_user_ns(), option); - if (!gid_valid(gid)) { - ext2_msg(sb, KERN_ERR, "Invalid gid value %d", option); - return 0; - } - opts->s_resgid = gid; - break; - case Opt_sb: - /* handled by get_sb_block() instead of here */ - /* *sb_block = match_int(&args[0]); */ - break; - case Opt_err_panic: - clear_opt (opts->s_mount_opt, ERRORS_CONT); - clear_opt (opts->s_mount_opt, ERRORS_RO); - set_opt (opts->s_mount_opt, ERRORS_PANIC); - break; - case Opt_err_ro: - clear_opt (opts->s_mount_opt, ERRORS_CONT); - clear_opt (opts->s_mount_opt, ERRORS_PANIC); - set_opt (opts->s_mount_opt, ERRORS_RO); - break; - case Opt_err_cont: - clear_opt (opts->s_mount_opt, ERRORS_RO); - clear_opt (opts->s_mount_opt, ERRORS_PANIC); - set_opt (opts->s_mount_opt, ERRORS_CONT); - break; - case Opt_nouid32: - set_opt (opts->s_mount_opt, NO_UID32); - break; - case Opt_debug: - set_opt (opts->s_mount_opt, DEBUG); - break; - case Opt_oldalloc: - set_opt (opts->s_mount_opt, OLDALLOC); - break; - case Opt_orlov: - clear_opt (opts->s_mount_opt, OLDALLOC); - break; - case Opt_nobh: - ext2_msg(sb, KERN_INFO, - "nobh option not supported"); - break; + ctx->mask_s_mount_opt |= flag; + ctx->vals_s_mount_opt |= flag; +} + +static inline void ctx_clear_mount_opt(struct ext2_fs_context *ctx, + unsigned long flag) +{ + ctx->mask_s_mount_opt |= flag; + ctx->vals_s_mount_opt &= ~flag; +} + +static inline unsigned long +ctx_test_mount_opt(struct ext2_fs_context *ctx, unsigned long flag) +{ + return (ctx->vals_s_mount_opt & flag); +} + +static inline bool 
+ctx_parsed_mount_opt(struct ext2_fs_context *ctx, unsigned long flag) +{ + return (ctx->mask_s_mount_opt & flag); +} + +static void ext2_free_fc(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + +static int ext2_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct ext2_fs_context *ctx = fc->fs_private; + int opt; + struct fs_parse_result result; + + opt = fs_parse(fc, ext2_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_bsd_df: + ctx_clear_mount_opt(ctx, EXT2_MOUNT_MINIX_DF); + break; + case Opt_minix_df: + ctx_set_mount_opt(ctx, EXT2_MOUNT_MINIX_DF); + break; + case Opt_grpid: + ctx_set_mount_opt(ctx, EXT2_MOUNT_GRPID); + break; + case Opt_nogrpid: + ctx_clear_mount_opt(ctx, EXT2_MOUNT_GRPID); + break; + case Opt_resuid: + ctx->s_resuid = result.uid; + ctx->spec |= EXT2_SPEC_s_resuid; + break; + case Opt_resgid: + ctx->s_resgid = result.gid; + ctx->spec |= EXT2_SPEC_s_resgid; + break; + case Opt_sb: + /* Note that this is silently ignored on remount */ + ctx->s_sb_block = result.uint_32; + break; + case Opt_errors: + ctx_clear_mount_opt(ctx, EXT2_MOUNT_ERRORS_MASK); + ctx_set_mount_opt(ctx, result.uint_32); + break; + case Opt_nouid32: + ctx_set_mount_opt(ctx, EXT2_MOUNT_NO_UID32); + break; + case Opt_debug: + ctx_set_mount_opt(ctx, EXT2_MOUNT_DEBUG); + break; + case Opt_oldalloc: + ctx_set_mount_opt(ctx, EXT2_MOUNT_OLDALLOC); + break; + case Opt_orlov: + ctx_clear_mount_opt(ctx, EXT2_MOUNT_OLDALLOC); + break; + case Opt_nobh: + ext2_msg_fc(fc, KERN_INFO, "nobh option not supported\n"); + break; #ifdef CONFIG_EXT2_FS_XATTR - case Opt_user_xattr: - set_opt (opts->s_mount_opt, XATTR_USER); - break; - case Opt_nouser_xattr: - clear_opt (opts->s_mount_opt, XATTR_USER); - break; + case Opt_user_xattr: + if (!result.negated) + ctx_set_mount_opt(ctx, EXT2_MOUNT_XATTR_USER); + else + ctx_clear_mount_opt(ctx, EXT2_MOUNT_XATTR_USER); + break; #else - case Opt_user_xattr: - case Opt_nouser_xattr: - ext2_msg(sb, KERN_INFO, "(no)user_xattr options" - "not supported"); - break; + case Opt_user_xattr: + ext2_msg_fc(fc, KERN_INFO, "(no)user_xattr options not supported"); + break; #endif #ifdef CONFIG_EXT2_FS_POSIX_ACL - case Opt_acl: - set_opt(opts->s_mount_opt, POSIX_ACL); - break; - case Opt_noacl: - clear_opt(opts->s_mount_opt, POSIX_ACL); - break; + case Opt_acl: + if (!result.negated) + ctx_set_mount_opt(ctx, EXT2_MOUNT_POSIX_ACL); + else + ctx_clear_mount_opt(ctx, EXT2_MOUNT_POSIX_ACL); + break; #else - case Opt_acl: - case Opt_noacl: - ext2_msg(sb, KERN_INFO, - "(no)acl options not supported"); - break; + case Opt_acl: + ext2_msg_fc(fc, KERN_INFO, "(no)acl options not supported"); + break; #endif - case Opt_xip: - ext2_msg(sb, KERN_INFO, "use dax instead of xip"); - set_opt(opts->s_mount_opt, XIP); - fallthrough; - case Opt_dax: + case Opt_xip: + ext2_msg_fc(fc, KERN_INFO, "use dax instead of xip"); + ctx_set_mount_opt(ctx, EXT2_MOUNT_XIP); + fallthrough; + case Opt_dax: #ifdef CONFIG_FS_DAX - ext2_msg(sb, KERN_WARNING, - "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); - set_opt(opts->s_mount_opt, DAX); + ext2_msg_fc(fc, KERN_WARNING, + "DAX enabled. 
Warning: EXPERIMENTAL, use at your own risk"); + ctx_set_mount_opt(ctx, EXT2_MOUNT_DAX); #else - ext2_msg(sb, KERN_INFO, "dax option not supported"); + ext2_msg_fc(fc, KERN_INFO, "dax option not supported"); #endif - break; + break; #if defined(CONFIG_QUOTA) - case Opt_quota: - case Opt_usrquota: - set_opt(opts->s_mount_opt, USRQUOTA); - break; - - case Opt_grpquota: - set_opt(opts->s_mount_opt, GRPQUOTA); - break; + case Opt_quota: + case Opt_usrquota: + ctx_set_mount_opt(ctx, EXT2_MOUNT_USRQUOTA); + break; + + case Opt_grpquota: + ctx_set_mount_opt(ctx, EXT2_MOUNT_GRPQUOTA); + break; #else - case Opt_quota: - case Opt_usrquota: - case Opt_grpquota: - ext2_msg(sb, KERN_INFO, - "quota operations not supported"); - break; + case Opt_quota: + case Opt_usrquota: + case Opt_grpquota: + ext2_msg_fc(fc, KERN_INFO, "quota operations not supported"); + break; #endif - - case Opt_reservation: - set_opt(opts->s_mount_opt, RESERVATION); - ext2_msg(sb, KERN_INFO, "reservations ON"); - break; - case Opt_noreservation: - clear_opt(opts->s_mount_opt, RESERVATION); - ext2_msg(sb, KERN_INFO, "reservations OFF"); - break; - case Opt_ignore: - break; - default: - return 0; + case Opt_reservation: + if (!result.negated) { + ctx_set_mount_opt(ctx, EXT2_MOUNT_RESERVATION); + ext2_msg_fc(fc, KERN_INFO, "reservations ON"); + } else { + ctx_clear_mount_opt(ctx, EXT2_MOUNT_RESERVATION); + ext2_msg_fc(fc, KERN_INFO, "reservations OFF"); } + break; + case Opt_ignore: + break; + default: + return -EINVAL; } - return 1; + return 0; } static int ext2_setup_super (struct super_block * sb, @@ -801,24 +814,83 @@ static unsigned long descriptor_loc(struct super_block *sb, return ext2_group_first_block_no(sb, bg) + ext2_bg_has_super(sb, bg); } -static int ext2_fill_super(struct super_block *sb, void *data, int silent) +/* + * Set all mount options either from defaults on disk, or from parsed + * options. Parsed/specified options override on-disk defaults. 
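The ext2_fs_context introduced above records not just the requested value of each option bit (vals_s_mount_opt) but also which bits the user actually touched (mask_s_mount_opt), which is what lets the ext2_set_options() helper that follows fall back to on-disk defaults only for options the user never mentioned. A minimal stand-alone sketch of that tri-state pattern; all names here are illustrative, not kernel API:

#include <stdbool.h>
#include <stdio.h>

#define OPT_GRPID  (1u << 0)
#define OPT_DEBUG  (1u << 1)

struct fs_ctx {
        unsigned int vals;      /* requested value of each option bit */
        unsigned int mask;      /* which bits the user actually set */
};

static void ctx_set(struct fs_ctx *c, unsigned int f)    { c->mask |= f; c->vals |= f; }
static void ctx_clear(struct fs_ctx *c, unsigned int f)  { c->mask |= f; c->vals &= ~f; }
static bool ctx_parsed(struct fs_ctx *c, unsigned int f) { return c->mask & f; }

int main(void)
{
        struct fs_ctx c = {0};
        unsigned int disk_defaults = OPT_DEBUG;        /* pretend on-disk default */
        unsigned int sb_opts;

        ctx_clear(&c, OPT_GRPID);      /* user passed "nogrpid" */

        /* Mount time: start from what was parsed... */
        sb_opts = c.vals;
        /* ...and apply the on-disk default only for untouched bits. */
        if (!ctx_parsed(&c, OPT_DEBUG) && (disk_defaults & OPT_DEBUG))
                sb_opts |= OPT_DEBUG;

        printf("grpid=%d debug=%d\n", !!(sb_opts & OPT_GRPID), !!(sb_opts & OPT_DEBUG));
        return 0;
}

Without the separate mask an explicit "nogrpid" would be indistinguishable from "option never given", and a superblock default could silently override the user.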
+ */ +static void ext2_set_options(struct fs_context *fc, struct ext2_sb_info *sbi) +{ + struct ext2_fs_context *ctx = fc->fs_private; + struct ext2_super_block *es = sbi->s_es; + unsigned long def_mount_opts = le32_to_cpu(es->s_default_mount_opts); + + /* Copy parsed mount options to sbi */ + sbi->s_mount_opt = ctx->vals_s_mount_opt; + + /* Use in-superblock defaults only if not specified during parsing */ + if (!ctx_parsed_mount_opt(ctx, EXT2_MOUNT_DEBUG) && + def_mount_opts & EXT2_DEFM_DEBUG) + set_opt(sbi->s_mount_opt, DEBUG); + + if (!ctx_parsed_mount_opt(ctx, EXT2_MOUNT_GRPID) && + def_mount_opts & EXT2_DEFM_BSDGROUPS) + set_opt(sbi->s_mount_opt, GRPID); + + if (!ctx_parsed_mount_opt(ctx, EXT2_MOUNT_NO_UID32) && + def_mount_opts & EXT2_DEFM_UID16) + set_opt(sbi->s_mount_opt, NO_UID32); + +#ifdef CONFIG_EXT2_FS_XATTR + if (!ctx_parsed_mount_opt(ctx, EXT2_MOUNT_XATTR_USER) && + def_mount_opts & EXT2_DEFM_XATTR_USER) + set_opt(sbi->s_mount_opt, XATTR_USER); +#endif +#ifdef CONFIG_EXT2_FS_POSIX_ACL + if (!ctx_parsed_mount_opt(ctx, EXT2_MOUNT_POSIX_ACL) && + def_mount_opts & EXT2_DEFM_ACL) + set_opt(sbi->s_mount_opt, POSIX_ACL); +#endif + + if (!ctx_parsed_mount_opt(ctx, EXT2_MOUNT_ERRORS_MASK)) { + if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_PANIC) + set_opt(sbi->s_mount_opt, ERRORS_PANIC); + else if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_CONTINUE) + set_opt(sbi->s_mount_opt, ERRORS_CONT); + else + set_opt(sbi->s_mount_opt, ERRORS_RO); + } + + if (ctx->spec & EXT2_SPEC_s_resuid) + sbi->s_resuid = ctx->s_resuid; + else + sbi->s_resuid = make_kuid(&init_user_ns, + le16_to_cpu(es->s_def_resuid)); + + if (ctx->spec & EXT2_SPEC_s_resgid) + sbi->s_resgid = ctx->s_resgid; + else + sbi->s_resgid = make_kgid(&init_user_ns, + le16_to_cpu(es->s_def_resgid)); +} + +static int ext2_fill_super(struct super_block *sb, struct fs_context *fc) { + struct ext2_fs_context *ctx = fc->fs_private; + int silent = fc->sb_flags & SB_SILENT; struct buffer_head * bh; struct ext2_sb_info * sbi; struct ext2_super_block * es; struct inode *root; unsigned long block; - unsigned long sb_block = get_sb_block(&data); + unsigned long sb_block = ctx->s_sb_block; unsigned long logic_sb_block; unsigned long offset = 0; - unsigned long def_mount_opts; long ret = -ENOMEM; int blocksize = BLOCK_SIZE; int db_count; int i, j; __le32 features; int err; - struct ext2_mount_options opts; sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); if (!sbi) @@ -877,42 +949,7 @@ static int ext2_fill_super(struct super_block *sb, void *data, int silent) if (sb->s_magic != EXT2_SUPER_MAGIC) goto cantfind_ext2; - opts.s_mount_opt = 0; - /* Set defaults before we parse the mount options */ - def_mount_opts = le32_to_cpu(es->s_default_mount_opts); - if (def_mount_opts & EXT2_DEFM_DEBUG) - set_opt(opts.s_mount_opt, DEBUG); - if (def_mount_opts & EXT2_DEFM_BSDGROUPS) - set_opt(opts.s_mount_opt, GRPID); - if (def_mount_opts & EXT2_DEFM_UID16) - set_opt(opts.s_mount_opt, NO_UID32); -#ifdef CONFIG_EXT2_FS_XATTR - if (def_mount_opts & EXT2_DEFM_XATTR_USER) - set_opt(opts.s_mount_opt, XATTR_USER); -#endif -#ifdef CONFIG_EXT2_FS_POSIX_ACL - if (def_mount_opts & EXT2_DEFM_ACL) - set_opt(opts.s_mount_opt, POSIX_ACL); -#endif - - if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_PANIC) - set_opt(opts.s_mount_opt, ERRORS_PANIC); - else if (le16_to_cpu(sbi->s_es->s_errors) == EXT2_ERRORS_CONTINUE) - set_opt(opts.s_mount_opt, ERRORS_CONT); - else - set_opt(opts.s_mount_opt, ERRORS_RO); - - opts.s_resuid = make_kuid(&init_user_ns, 
le16_to_cpu(es->s_def_resuid)); - opts.s_resgid = make_kgid(&init_user_ns, le16_to_cpu(es->s_def_resgid)); - - set_opt(opts.s_mount_opt, RESERVATION); - - if (!parse_options((char *) data, sb, &opts)) - goto failed_mount; - - sbi->s_mount_opt = opts.s_mount_opt; - sbi->s_resuid = opts.s_resuid; - sbi->s_resgid = opts.s_resgid; + ext2_set_options(fc, sbi); sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sb, POSIX_ACL) ? SB_POSIXACL : 0); @@ -1324,23 +1361,21 @@ static void ext2_write_super(struct super_block *sb) ext2_sync_fs(sb, 1); } -static int ext2_remount (struct super_block * sb, int * flags, char * data) +static int ext2_reconfigure(struct fs_context *fc) { + struct ext2_fs_context *ctx = fc->fs_private; + struct super_block *sb = fc->root->d_sb; struct ext2_sb_info * sbi = EXT2_SB(sb); struct ext2_super_block * es; struct ext2_mount_options new_opts; + int flags = fc->sb_flags; int err; sync_filesystem(sb); - spin_lock(&sbi->s_lock); - new_opts.s_mount_opt = sbi->s_mount_opt; - new_opts.s_resuid = sbi->s_resuid; - new_opts.s_resgid = sbi->s_resgid; - spin_unlock(&sbi->s_lock); - - if (!parse_options(data, sb, &new_opts)) - return -EINVAL; + new_opts.s_mount_opt = ctx->vals_s_mount_opt; + new_opts.s_resuid = ctx->s_resuid; + new_opts.s_resgid = ctx->s_resgid; spin_lock(&sbi->s_lock); es = sbi->s_es; @@ -1349,9 +1384,9 @@ static int ext2_remount (struct super_block * sb, int * flags, char * data) "dax flag with busy inodes while remounting"); new_opts.s_mount_opt ^= EXT2_MOUNT_DAX; } - if ((bool)(*flags & SB_RDONLY) == sb_rdonly(sb)) + if ((bool)(flags & SB_RDONLY) == sb_rdonly(sb)) goto out_set; - if (*flags & SB_RDONLY) { + if (flags & SB_RDONLY) { if (le16_to_cpu(es->s_state) & EXT2_VALID_FS || !(sbi->s_mount_state & EXT2_VALID_FS)) goto out_set; @@ -1470,10 +1505,9 @@ static int ext2_statfs (struct dentry * dentry, struct kstatfs * buf) return 0; } -static struct dentry *ext2_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int ext2_get_tree(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, ext2_fill_super); + return get_tree_bdev(fc, ext2_fill_super); } #ifdef CONFIG_QUOTA @@ -1556,7 +1590,7 @@ static ssize_t ext2_quota_write(struct super_block *sb, int type, } lock_buffer(bh); memcpy(bh->b_data+offset, data, tocopy); - flush_dcache_page(bh->b_page); + flush_dcache_folio(bh->b_folio); set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); @@ -1624,12 +1658,49 @@ out: #endif +static const struct fs_context_operations ext2_context_ops = { + .parse_param = ext2_parse_param, + .get_tree = ext2_get_tree, + .reconfigure = ext2_reconfigure, + .free = ext2_free_fc, +}; + +static int ext2_init_fs_context(struct fs_context *fc) +{ + struct ext2_fs_context *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + struct super_block *sb = fc->root->d_sb; + struct ext2_sb_info *sbi = EXT2_SB(sb); + + spin_lock(&sbi->s_lock); + ctx->vals_s_mount_opt = sbi->s_mount_opt; + ctx->vals_s_flags = sb->s_flags; + ctx->s_resuid = sbi->s_resuid; + ctx->s_resgid = sbi->s_resgid; + spin_unlock(&sbi->s_lock); + } else { + ctx->s_sb_block = 1; + ctx_set_mount_opt(ctx, EXT2_MOUNT_RESERVATION); + } + + fc->fs_private = ctx; + fc->ops = &ext2_context_ops; + + return 0; +} + static struct file_system_type ext2_fs_type = { .owner = THIS_MODULE, .name = "ext2", - .mount = ext2_mount, .kill_sb = kill_block_super, .fs_flags = FS_REQUIRES_DEV, + 
.init_fs_context = ext2_init_fs_context, + .parameters = ext2_param_spec, }; MODULE_ALIAS_FS("ext2"); diff --git a/fs/ext4/balloc.c b/fs/ext4/balloc.c index 8042ad873808..c48fd36b2d74 100644 --- a/fs/ext4/balloc.c +++ b/fs/ext4/balloc.c @@ -649,8 +649,8 @@ static int ext4_has_free_clusters(struct ext4_sb_info *sbi, /* Hm, nope. Are (enough) root reserved clusters available? */ if (uid_eq(sbi->s_resuid, current_fsuid()) || (!gid_eq(sbi->s_resgid, GLOBAL_ROOT_GID) && in_group_p(sbi->s_resgid)) || - capable(CAP_SYS_RESOURCE) || - (flags & EXT4_MB_USE_ROOT_BLOCKS)) { + (flags & EXT4_MB_USE_ROOT_BLOCKS) || + capable(CAP_SYS_RESOURCE)) { if (free_clusters >= (nclusters + dirty_clusters + resv_clusters)) diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index 2a135075468d..87760fabdd2e 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -25,12 +25,12 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); int sz; - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 1; sz = EXT4_INODES_PER_GROUP(sb) >> 3; provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); - calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) { hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi); provided |= (hi << 16); @@ -48,11 +48,11 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); int sz; - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return; sz = EXT4_INODES_PER_GROUP(sb) >> 3; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16); @@ -67,11 +67,11 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, struct ext4_sb_info *sbi = EXT4_SB(sb); int sz = EXT4_CLUSTERS_PER_GROUP(sb) / 8; - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 1; provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); - calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) { hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi); provided |= (hi << 16); @@ -89,10 +89,10 @@ void ext4_block_bitmap_csum_set(struct super_block *sb, __u32 csum; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16); diff --git a/fs/ext4/block_validity.c b/fs/ext4/block_validity.c index 87ee3a17bd29..e8c5525afc67 100644 --- a/fs/ext4/block_validity.c +++ b/fs/ext4/block_validity.c @@ -351,10 +351,9 @@ int ext4_check_blockref(const char *function, unsigned int line, { __le32 *bref = p; unsigned int blk; + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; - if (ext4_has_feature_journal(inode->i_sb) && - (inode->i_ino == - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) + if 
(journal && inode == journal->j_inode) return 0; while (bref < p+max) { diff --git a/fs/ext4/dir.c b/fs/ext4/dir.c index 02d47a64e8d1..d4164c507a90 100644 --- a/fs/ext4/dir.c +++ b/fs/ext4/dir.c @@ -86,7 +86,7 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, dir->i_sb->s_blocksize); const int next_offset = ((char *) de - buf) + rlen; bool fake = is_fake_dir_entry(de); - bool has_csum = ext4_has_metadata_csum(dir->i_sb); + bool has_csum = ext4_has_feature_metadata_csum(dir->i_sb); if (unlikely(rlen < ext4_dir_rec_len(1, fake ? NULL : dir))) error_msg = "rec_len is smaller than minimal"; @@ -104,6 +104,9 @@ int __ext4_check_dir_entry(const char *function, unsigned int line, else if (unlikely(le32_to_cpu(de->inode) > le32_to_cpu(EXT4_SB(dir->i_sb)->s_es->s_inodes_count))) error_msg = "inode out of bounds"; + else if (unlikely(next_offset == size && de->name_len == 1 && + de->name[0] == '.')) + error_msg = "'.' directory cannot be the last in data block"; else return 0; @@ -145,7 +148,7 @@ static int ext4_readdir(struct file *file, struct dir_context *ctx) return err; /* Can we just clear INDEX flag to ignore htree information? */ - if (!ext4_has_metadata_csum(sb)) { + if (!ext4_has_feature_metadata_csum(sb)) { /* * We don't set the inode dirty flag since it's not * critical that it gets flushed back to the disk. diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 4e7de7eaa374..18373de980f2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -256,9 +256,19 @@ struct ext4_allocation_request { #define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten) #define EXT4_MAP_BOUNDARY BIT(BH_Boundary) #define EXT4_MAP_DELAYED BIT(BH_Delay) +/* + * This is for use in ext4_map_query_blocks() for a special case where we can + * have a physically and logically contiguous blocks split across two leaf + * nodes instead of a single extent. This is required in case of atomic writes + * to know whether the returned extent is last in leaf. If yes, then lookup for + * next in leaf block in ext4_map_query_blocks_next_in_leaf(). + * - This is never going to be added to any buffer head state. + * - We use the next available bit after BH_BITMAP_UPTODATE. 
+ */ +#define EXT4_MAP_QUERY_LAST_IN_LEAF BIT(BH_BITMAP_UPTODATE + 1) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ - EXT4_MAP_DELAYED) + EXT4_MAP_DELAYED | EXT4_MAP_QUERY_LAST_IN_LEAF) struct ext4_map_blocks { ext4_fsblk_t m_pblk; @@ -278,7 +288,10 @@ struct ext4_system_blocks { /* * Flags for ext4_io_end->flags */ -#define EXT4_IO_END_UNWRITTEN 0x0001 +#define EXT4_IO_END_UNWRITTEN 0x0001 +#define EXT4_IO_END_FAILED 0x0002 + +#define EXT4_IO_END_DEFER_COMPLETION (EXT4_IO_END_UNWRITTEN | EXT4_IO_END_FAILED) struct ext4_io_end_vec { struct list_head list; /* list of io_end_vec */ @@ -367,6 +380,8 @@ struct ext4_io_submit { #define EXT4_MAX_BLOCKS(size, offset, blkbits) \ ((EXT4_BLOCK_ALIGN(size + offset, blkbits) >> blkbits) - (offset >> \ blkbits)) +#define EXT4_B_TO_LBLK(inode, offset) \ + (round_up((offset), i_blocksize(inode)) >> (inode)->i_blkbits) /* Translate a block number to a cluster number */ #define EXT4_B2C(sbi, blk) ((blk) >> (sbi)->s_cluster_bits) @@ -701,9 +716,6 @@ enum { #define EXT4_GET_BLOCKS_CONVERT 0x0010 #define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) - /* Convert extent to initialized after IO complete */ -#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ - EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) /* Eventual metadata allocation (due to growing extent tree) * should not fail, so try to use reserved blocks for that.*/ #define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 @@ -715,11 +727,23 @@ enum { #define EXT4_GET_BLOCKS_ZERO 0x0200 #define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ EXT4_GET_BLOCKS_ZERO) - /* Caller will submit data before dropping transaction handle. This - * allows jbd2 to avoid submitting data before commit. */ + /* Caller is in the context of data submission, such as writeback, + * fsync, etc. Especially, in the generic writeback path, caller will + * submit data before dropping transaction handle. This allows jbd2 + * to avoid submitting data before commit. */ #define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 + /* Convert extent to initialized after IO complete */ +#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT |\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |\ + EXT4_GET_BLOCKS_IO_SUBMIT) /* Caller is in the atomic contex, find extent if it has been cached */ #define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800 +/* + * Atomic write caller needs this to query in the slow path of mixed mapping + * case, when a contiguous extent can be split across two adjacent leaf nodes. + * Look EXT4_MAP_QUERY_LAST_IN_LEAF. + */ +#define EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF 0x1000 /* * The bit position of these flags must not overlap with any of the @@ -733,6 +757,13 @@ enum { #define EXT4_EX_NOCACHE 0x40000000 #define EXT4_EX_FORCE_CACHE 0x20000000 #define EXT4_EX_NOFAIL 0x10000000 +/* + * ext4_map_query_blocks() uses this filter mask to filter the flags needed to + * pass while lookup/querying of on disk extent tree. + */ +#define EXT4_EX_QUERY_FILTER (EXT4_EX_NOCACHE | EXT4_EX_FORCE_CACHE |\ + EXT4_EX_NOFAIL |\ + EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) /* * Flags used by ext4_free_blocks @@ -1056,15 +1087,16 @@ struct ext4_inode_info { /* End of lblk range that needs to be committed in this fast commit */ ext4_lblk_t i_fc_lblk_len; - /* Number of ongoing updates on this inode */ - atomic_t i_fc_updates; - atomic_t i_unwritten; /* Nr. 
of inflight conversions pending */ + spinlock_t i_raw_lock; /* protects updates to the raw inode */ /* Fast commit wait queue for this inode */ wait_queue_head_t i_fc_wait; - /* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */ - struct mutex i_fc_lock; + /* + * Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len + * and inode's EXT4_FC_STATE_COMMITTING state bit. + */ + spinlock_t i_fc_lock; /* * i_disksize keeps track of what the inode size is ON DISK, not @@ -1097,8 +1129,6 @@ struct ext4_inode_info { struct inode vfs_inode; struct jbd2_inode *jinode; - spinlock_t i_raw_lock; /* protects updates to the raw inode */ - /* * File creation time. Its function is same as that of * struct timespec64 i_{a,c,m}time in the generic inode. @@ -1141,6 +1171,7 @@ struct ext4_inode_info { /* quota space reservation, managed internally by quota code */ qsize_t i_reserved_quota; #endif + spinlock_t i_block_reservation_lock; /* Lock protecting lists below */ spinlock_t i_completed_io_lock; @@ -1151,8 +1182,6 @@ struct ext4_inode_info { struct list_head i_rsv_conversion_list; struct work_struct i_rsv_conversion_work; - spinlock_t i_block_reservation_lock; - /* * Transactions that contain inode's metadata needed to complete * fsync and fdatasync, respectively. @@ -1606,6 +1635,8 @@ struct ext4_sb_info { unsigned int s_mb_prefetch; unsigned int s_mb_prefetch_limit; unsigned int s_mb_best_avail_max_trim_order; + unsigned int s_sb_update_sec; + unsigned int s_sb_update_kb; /* stats for buddy allocator */ atomic_t s_bal_reqs; /* number of reqs with len > 1 */ @@ -1749,7 +1780,7 @@ struct ext4_sb_info { * following fields: * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh. */ - spinlock_t s_fc_lock; + struct mutex s_fc_lock; struct buffer_head *s_fc_bh; struct ext4_fc_stats s_fc_stats; tid_t s_fc_ineligible_tid; @@ -1821,7 +1852,8 @@ static inline int ext4_valid_inum(struct super_block *sb, unsigned long ino) */ enum { EXT4_MF_MNTDIR_SAMPLED, - EXT4_MF_FC_INELIGIBLE /* Fast commit ineligible */ + EXT4_MF_FC_INELIGIBLE, /* Fast commit ineligible */ + EXT4_MF_JOURNAL_DESTROY /* Journal is in process of destroying */ }; static inline void ext4_set_mount_flag(struct super_block *sb, int bit) @@ -1907,6 +1939,7 @@ enum { EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */ + EXT4_STATE_FC_FLUSHING_DATA, /* Fast commit flushing data */ EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */ }; @@ -2232,15 +2265,32 @@ extern int ext4_feature_set_ok(struct super_block *sb, int readonly); /* * Superblock flags */ -#define EXT4_FLAGS_RESIZING 0 -#define EXT4_FLAGS_SHUTDOWN 1 -#define EXT4_FLAGS_BDEV_IS_DAX 2 +enum { + EXT4_FLAGS_RESIZING, /* Avoid superblock update and resize race */ + EXT4_FLAGS_SHUTDOWN, /* Prevent access to the file system */ + EXT4_FLAGS_BDEV_IS_DAX, /* Current block device support DAX */ + EXT4_FLAGS_EMERGENCY_RO,/* Emergency read-only due to fs errors */ +}; static inline int ext4_forced_shutdown(struct super_block *sb) { return test_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); } +static inline int ext4_emergency_ro(struct super_block *sb) +{ + return test_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags); +} + +static inline int ext4_emergency_state(struct super_block *sb) +{ + if (unlikely(ext4_forced_shutdown(sb))) + return -EIO; + if (unlikely(ext4_emergency_ro(sb))) + return -EROFS; + return 0; +} + /* * Default values 
for user and/or group using reserved blocks */ @@ -2272,10 +2322,19 @@ static inline int ext4_forced_shutdown(struct super_block *sb) #define EXT4_DEFM_NODELALLOC 0x0800 /* - * Default journal batch times + * Default journal batch times and ioprio. */ #define EXT4_DEF_MIN_BATCH_TIME 0 #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ +#define EXT4_DEF_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) + + +/* + * Default values for superblock update + */ +#define EXT4_DEF_SB_UPDATE_INTERVAL_SEC (3600) /* seconds (1 hour) */ +#define EXT4_DEF_SB_UPDATE_INTERVAL_KB (16384) /* kilobytes (16MB) */ + /* * Minimum number of groups in a flexgroup before we separate out @@ -2457,8 +2516,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) #define DX_HASH_SIPHASH 6 #define DX_HASH_LAST DX_HASH_SIPHASH -static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, - const void *address, unsigned int length) +static inline u32 ext4_chksum(u32 crc, const void *address, unsigned int length) { return crc32c(crc, address, length); } @@ -2810,8 +2868,7 @@ extern int ext4_htree_store_dirent(struct file *dir_file, __u32 hash, struct ext4_dir_entry_2 *dirent, struct fscrypt_str *ent_name); extern void ext4_htree_free_dir_info(struct dir_private_info *p); -extern int ext4_find_dest_de(struct inode *dir, struct inode *inode, - struct buffer_head *bh, +extern int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de); @@ -2893,8 +2950,6 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode, void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); void ext4_fc_track_inode(handle_t *handle, struct inode *inode); void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle); -void ext4_fc_start_update(struct inode *inode); -void ext4_fc_stop_update(struct inode *inode); void ext4_fc_del(struct inode *inode); bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block); void ext4_fc_replay_cleanup(struct super_block *sb); @@ -2944,6 +2999,7 @@ static inline bool ext4_mb_cr_expensive(enum criteria cr) void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei); int ext4_inode_is_fast_symlink(struct inode *inode); +void ext4_check_map_extents_env(struct inode *inode); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, @@ -2964,6 +3020,7 @@ int ext4_walk_page_buffers(handle_t *handle, struct buffer_head *bh)); int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh); +bool ext4_should_enable_large_folio(struct inode *inode); #define FALL_BACK_TO_NONDELALLOC 1 #define CONVERT_INLINE_DATA 2 @@ -3001,6 +3058,8 @@ extern int ext4_inode_attach_jinode(struct inode *inode); extern int ext4_can_truncate(struct inode *inode); extern int ext4_truncate(struct inode *); extern int ext4_break_layouts(struct inode *); +extern int ext4_truncate_page_cache_block_range(struct inode *inode, + loff_t start, loff_t end); extern int ext4_punch_hole(struct file *file, loff_t offset, loff_t length); extern void ext4_set_inode_flags(struct inode *, bool init); extern int ext4_alloc_da_blocks(struct inode *inode); @@ -3008,6 +3067,8 @@ extern void ext4_set_aops(struct inode *inode); extern int 
ext4_writepage_trans_blocks(struct inode *); extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); @@ -3019,6 +3080,17 @@ extern void ext4_da_update_reserve_space(struct inode *inode, extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len); +static inline bool is_special_ino(struct super_block *sb, unsigned long ino) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + return (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || + ino == le32_to_cpu(es->s_usr_quota_inum) || + ino == le32_to_cpu(es->s_grp_quota_inum) || + ino == le32_to_cpu(es->s_prj_quota_inum) || + ino == le32_to_cpu(es->s_orphan_file_inum); +} + /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); @@ -3088,8 +3160,7 @@ extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wa extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block); extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); -extern __le32 ext4_superblock_csum(struct super_block *sb, - struct ext4_super_block *es); +extern __le32 ext4_superblock_csum(struct ext4_super_block *es); extern void ext4_superblock_csum_set(struct super_block *sb); extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); @@ -3259,14 +3330,10 @@ extern void ext4_group_desc_csum_set(struct super_block *sb, __u32 group, extern int ext4_register_li_request(struct super_block *sb, ext4_group_t first_not_zeroed); -static inline int ext4_has_metadata_csum(struct super_block *sb) -{ - return ext4_has_feature_metadata_csum(sb); -} - static inline int ext4_has_group_desc_csum(struct super_block *sb) { - return ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb); + return ext4_has_feature_gdt_csum(sb) || + ext4_has_feature_metadata_csum(sb); } #define ext4_read_incompat_64bit_val(es, name) \ @@ -3351,6 +3418,13 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) return 1 << sbi->s_log_groups_per_flex; } +static inline loff_t ext4_get_maxbytes(struct inode *inode) +{ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return inode->i_sb->s_maxbytes; + return EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; +} + #define ext4_std_error(sb, errno) \ do { \ if ((errno)) \ @@ -3546,11 +3620,11 @@ extern int ext4_try_to_write_inline_data(struct address_space *mapping, struct folio **foliop); int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, unsigned copied, struct folio *folio); -extern int ext4_da_write_inline_data_begin(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - struct folio **foliop, - void **fsdata); +extern int ext4_generic_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + struct folio **foliop, + void **fsdata, bool da); extern int ext4_try_add_inline_entry(handle_t *handle, struct ext4_filename *fname, struct inode *dir, struct inode *inode); @@ -3683,6 +3757,8 @@ extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); 
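The ext4_emergency_state() helper above folds the two failure modes into distinct errnos (-EIO for a full shutdown, -EROFS for emergency read-only), so callers can refuse work early with the right error. A hypothetical user-space model of that gating; the enum and do_write() are invented for illustration:

#include <errno.h>
#include <stdio.h>

enum fs_state { FS_OK, FS_SHUTDOWN, FS_EMERGENCY_RO };

static int emergency_state(enum fs_state s)
{
        if (s == FS_SHUTDOWN)
                return -EIO;    /* no access at all */
        if (s == FS_EMERGENCY_RO)
                return -EROFS;  /* reads fine, writes refused */
        return 0;
}

static int do_write(enum fs_state s)
{
        int err = emergency_state(s);

        if (err)
                return err;     /* refuse before touching any state */
        /* ... normal write path ... */
        return 0;
}

int main(void)
{
        printf("shutdown: %d, ro: %d, ok: %d\n",
               do_write(FS_SHUTDOWN), do_write(FS_EMERGENCY_RO), do_write(FS_OK));
        return 0;
}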
extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len); +extern int ext4_convert_unwritten_extents_atomic(handle_t *handle, + struct inode *inode, loff_t offset, ssize_t len); extern int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, @@ -3785,34 +3861,19 @@ static inline void set_bitmap_uptodate(struct buffer_head *bh) set_bit(BH_BITMAP_UPTODATE, &(bh)->b_state); } -/* For ioend & aio unwritten conversion wait queues */ -#define EXT4_WQ_HASH_SZ 37 -#define ext4_ioend_wq(v) (&ext4__ioend_wq[((unsigned long)(v)) %\ - EXT4_WQ_HASH_SZ]) -extern wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; - extern int ext4_resize_begin(struct super_block *sb); extern int ext4_resize_end(struct super_block *sb, bool update_backups); -static inline void ext4_set_io_unwritten_flag(struct inode *inode, - struct ext4_io_end *io_end) +static inline void ext4_set_io_unwritten_flag(struct ext4_io_end *io_end) { - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) { + if (!(io_end->flag & EXT4_IO_END_UNWRITTEN)) io_end->flag |= EXT4_IO_END_UNWRITTEN; - atomic_inc(&EXT4_I(inode)->i_unwritten); - } } static inline void ext4_clear_io_unwritten_flag(ext4_io_end_t *io_end) { - struct inode *inode = io_end->inode; - - if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + if (io_end->flag & EXT4_IO_END_UNWRITTEN) io_end->flag &= ~EXT4_IO_END_UNWRITTEN; - /* Wake up anyone waiting on unwritten extent conversion */ - if (atomic_dec_and_test(&EXT4_I(inode)->i_unwritten)) - wake_up_all(ext4_ioend_wq(inode)); - } } extern const struct iomap_ops ext4_iomap_ops; @@ -3835,7 +3896,9 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) static inline bool ext4_inode_can_atomic_write(struct inode *inode) { - return S_ISREG(inode->i_mode) && EXT4_SB(inode->i_sb)->s_awu_min > 0; + return S_ISREG(inode->i_mode) && + ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && + EXT4_SB(inode->i_sb)->s_awu_min > 0; } extern int ext4_block_write_begin(handle_t *handle, struct folio *folio, diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index da4a82456383..b3e9b7bd7978 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -16,7 +16,8 @@ int ext4_inode_journal_mode(struct inode *inode) ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && - !test_opt(inode->i_sb, DELALLOC))) { + !test_opt(inode->i_sb, DELALLOC) && + !mapping_large_folio_support(inode->i_mapping))) { /* We do not support data journalling for encrypted data */ if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ @@ -63,12 +64,14 @@ static void ext4_put_nojournal(handle_t *handle) */ static int ext4_journal_check_start(struct super_block *sb) { + int ret; journal_t *journal; might_sleep(); - if (unlikely(ext4_forced_shutdown(sb))) - return -EIO; + ret = ext4_emergency_state(sb); + if (unlikely(ret)) + return ret; if (WARN_ON_ONCE(sb_rdonly(sb))) return -EROFS; @@ -244,7 +247,8 @@ int __ext4_journal_get_write_access(const char *where, unsigned int line, } } else ext4_check_bdev_write_error(sb); - if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) + if (trigger_type == EXT4_JTR_NONE || + !ext4_has_feature_metadata_csum(sb)) return 0; BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); jbd2_journal_set_triggers(bh, @@ -331,7 +335,8 @@ int 
__ext4_journal_get_create_access(const char *where, unsigned int line, err); return err; } - if (trigger_type == EXT4_JTR_NONE || !ext4_has_metadata_csum(sb)) + if (trigger_type == EXT4_JTR_NONE || + !ext4_has_feature_metadata_csum(sb)) return 0; BUG_ON(trigger_type >= EXT4_JOURNAL_TRIGGER_COUNT); jbd2_journal_set_triggers(bh, diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 0c77697d5e90..63d17c5201b5 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -122,90 +122,6 @@ #define EXT4_HT_EXT_CONVERT 11 #define EXT4_HT_MAX 12 -/** - * struct ext4_journal_cb_entry - Base structure for callback information. - * - * This struct is a 'seed' structure for a using with your own callback - * structs. If you are using callbacks you must allocate one of these - * or another struct of your own definition which has this struct - * as it's first element and pass it to ext4_journal_callback_add(). - */ -struct ext4_journal_cb_entry { - /* list information for other callbacks attached to the same handle */ - struct list_head jce_list; - - /* Function to call with this callback structure */ - void (*jce_func)(struct super_block *sb, - struct ext4_journal_cb_entry *jce, int error); - - /* user data goes here */ -}; - -/** - * ext4_journal_callback_add: add a function to call after transaction commit - * @handle: active journal transaction handle to register callback on - * @func: callback function to call after the transaction has committed: - * @sb: superblock of current filesystem for transaction - * @jce: returned journal callback data - * @rc: journal state at commit (0 = transaction committed properly) - * @jce: journal callback data (internal and function private data struct) - * - * The registered function will be called in the context of the journal thread - * after the transaction for which the handle was created has completed. - * - * No locks are held when the callback function is called, so it is safe to - * call blocking functions from within the callback, but the callback should - * not block or run for too long, or the filesystem will be blocked waiting for - * the next transaction to commit. No journaling functions can be used, or - * there is a risk of deadlock. - * - * There is no guaranteed calling order of multiple registered callbacks on - * the same transaction. 
- */ -static inline void _ext4_journal_callback_add(handle_t *handle, - struct ext4_journal_cb_entry *jce) -{ - /* Add the jce to transaction's private list */ - list_add_tail(&jce->jce_list, &handle->h_transaction->t_private_list); -} - -static inline void ext4_journal_callback_add(handle_t *handle, - void (*func)(struct super_block *sb, - struct ext4_journal_cb_entry *jce, - int rc), - struct ext4_journal_cb_entry *jce) -{ - struct ext4_sb_info *sbi = - EXT4_SB(handle->h_transaction->t_journal->j_private); - - /* Add the jce to transaction's private list */ - jce->jce_func = func; - spin_lock(&sbi->s_md_lock); - _ext4_journal_callback_add(handle, jce); - spin_unlock(&sbi->s_md_lock); -} - - -/** - * ext4_journal_callback_del: delete a registered callback - * @handle: active journal transaction handle on which callback was registered - * @jce: registered journal callback entry to unregister - * Return true if object was successfully removed - */ -static inline bool ext4_journal_callback_try_del(handle_t *handle, - struct ext4_journal_cb_entry *jce) -{ - bool deleted; - struct ext4_sb_info *sbi = - EXT4_SB(handle->h_transaction->t_journal->j_private); - - spin_lock(&sbi->s_md_lock); - deleted = !list_empty(&jce->jce_list); - list_del_init(&jce->jce_list); - spin_unlock(&sbi->s_md_lock); - return deleted; -} - int ext4_mark_iloc_dirty(handle_t *handle, struct inode *inode, @@ -403,10 +319,10 @@ static inline int ext4_journal_ensure_credits(handle_t *handle, int credits, revoke_creds, 0); } -static inline int ext4_journal_blocks_per_page(struct inode *inode) +static inline int ext4_journal_blocks_per_folio(struct inode *inode) { if (EXT4_JOURNAL(inode) != NULL) - return jbd2_journal_blocks_per_page(inode); + return jbd2_journal_blocks_per_folio(inode); return 0; } @@ -513,4 +429,33 @@ static inline int ext4_should_dioread_nolock(struct inode *inode) return 1; } +/* + * Pass journal explicitly as it may not be cached in the sbi->s_journal in some + * cases + */ +static inline int ext4_journal_destroy(struct ext4_sb_info *sbi, journal_t *journal) +{ + int err = 0; + + /* + * At this point only two things can be operating on the journal. + * JBD2 thread performing transaction commit and s_sb_upd_work + * issuing sb update through the journal. Once we set + * EXT4_JOURNAL_DESTROY, new ext4_handle_error() calls will not + * queue s_sb_upd_work and ext4_force_commit() makes sure any + * ext4_handle_error() calls from the running transaction commit are + * finished. Hence no new s_sb_upd_work can be queued after we + * flush it here. 
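A stand-alone model of the teardown ordering this comment describes: raise the flag first so no new work can be queued, drain what is already queued second, and only then destroy. The names and single-threaded driver are illustrative, not the kernel's workqueue API:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_bool destroying;
static bool work_queued;

static void queue_sb_update(void)
{
        /* Error paths check the flag first, as ext4_handle_error() does. */
        if (atomic_load(&destroying))
                return;         /* too late: teardown owns the journal now */
        work_queued = true;
}

static void flush_sb_update(void)
{
        if (work_queued) {
                work_queued = false;    /* run the pending update */
                printf("flushed pending sb update\n");
        }
}

int main(void)
{
        queue_sb_update();               /* racing error path, before teardown */
        atomic_store(&destroying, true); /* 1: refuse new work */
        flush_sb_update();               /* 2: drain what already made it in */
        printf("journal destroyed\n");   /* 3: now safe to free the journal */
        return 0;
}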
+ */ + ext4_set_mount_flag(sbi->s_sb, EXT4_MF_JOURNAL_DESTROY); + + ext4_force_commit(sbi->s_sb); + flush_work(&sbi->s_sb_upd_work); + + err = jbd2_journal_destroy(journal); + sbi->s_journal = NULL; + + return err; +} + #endif /* _EXT4_JBD2_H */ diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c index a07a98a4b97a..b543a46fc809 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -50,10 +50,9 @@ static __le32 ext4_extent_block_csum(struct inode *inode, struct ext4_extent_header *eh) { struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh, + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)eh, EXT4_EXTENT_TAIL_OFFSET(eh)); return cpu_to_le32(csum); } @@ -63,7 +62,7 @@ static int ext4_extent_block_csum_verify(struct inode *inode, { struct ext4_extent_tail *et; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return 1; et = find_ext4_extent_tail(eh); @@ -77,7 +76,7 @@ static void ext4_extent_block_csum_set(struct inode *inode, { struct ext4_extent_tail *et; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; et = find_ext4_extent_tail(eh); @@ -611,6 +610,8 @@ int ext4_ext_precache(struct inode *inode) if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return 0; /* not an extent-mapped inode */ + ext4_check_map_extents_env(inode); + down_read(&ei->i_data_sem); depth = ext_depth(inode); @@ -1530,7 +1531,7 @@ static int ext4_ext_search_left(struct inode *inode, static int ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t *logical, ext4_fsblk_t *phys, - struct ext4_extent *ret_ex) + struct ext4_extent *ret_ex, int flags) { struct buffer_head *bh = NULL; struct ext4_extent_header *eh; @@ -1604,7 +1605,8 @@ got_index: ix++; while (++depth < path->p_depth) { /* subtract from p_depth to get proper eh_depth */ - bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0); + bh = read_extent_tree_block(inode, ix, path->p_depth - depth, + flags); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); @@ -1612,7 +1614,7 @@ got_index: put_bh(bh); } - bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0); + bh = read_extent_tree_block(inode, ix, path->p_depth - depth, flags); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); @@ -2396,18 +2398,20 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, int ext4_ext_index_trans_blocks(struct inode *inode, int extents) { int index; - int depth; /* If we are converting the inline data, only one is needed here. */ if (ext4_has_inline_data(inode)) return 1; - depth = ext_depth(inode); - + /* + * Extent tree can change between the time we estimate credits and + * the time we actually modify the tree. Assume the worst case. 
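The rewritten ext4_ext_index_trans_blocks() below sizes credits for the deepest possible tree rather than the current depth, since the tree can grow between estimation and modification. A small worked example of that arithmetic; the depth constant is a stand-in for EXT4_MAX_EXTENT_DEPTH and 340 approximates how many extent entries fit in a 4k block:

#include <stdio.h>

#define DIV_ROUND_UP(n, d)      (((n) + (d) - 1) / (d))
#define MAX_EXTENT_DEPTH        5       /* stand-in for EXT4_MAX_EXTENT_DEPTH */

static int index_trans_blocks(int extents, int extents_per_block)
{
        if (extents <= 1)
                return MAX_EXTENT_DEPTH * 2 + extents;
        return MAX_EXTENT_DEPTH * 3 +
               DIV_ROUND_UP(extents, extents_per_block);
}

int main(void)
{
        /* 1 extent: 5*2 + 1 = 11; 1000 extents: 5*3 + ceil(1000/340) = 18 */
        printf("1 extent    : %d credits\n", index_trans_blocks(1, 340));
        printf("1000 extents: %d credits\n", index_trans_blocks(1000, 340));
        return 0;
}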
+ */ if (extents <= 1) - index = depth * 2; + index = (EXT4_MAX_EXTENT_DEPTH * 2) + extents; else - index = depth * 3; + index = (EXT4_MAX_EXTENT_DEPTH * 3) + + DIV_ROUND_UP(extents, ext4_ext_space_block(inode, 0)); return index; } @@ -2821,6 +2825,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, struct partial_cluster partial; handle_t *handle; int i = 0, err = 0; + int flags = EXT4_EX_NOCACHE | EXT4_EX_NOFAIL; partial.pclu = 0; partial.lblk = 0; @@ -2851,8 +2856,7 @@ again: ext4_fsblk_t pblk; /* find extent for or closest extent to this block */ - path = ext4_find_extent(inode, end, NULL, - EXT4_EX_NOCACHE | EXT4_EX_NOFAIL); + path = ext4_find_extent(inode, end, NULL, flags); if (IS_ERR(path)) { ext4_journal_stop(handle); return PTR_ERR(path); @@ -2918,7 +2922,7 @@ again: */ lblk = ex_end + 1; err = ext4_ext_search_right(inode, path, &lblk, &pblk, - NULL); + NULL, flags); if (err < 0) goto out; if (pblk) { @@ -2994,8 +2998,7 @@ again: i + 1, ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); bh = read_extent_tree_block(inode, path[i].p_idx, - depth - i - 1, - EXT4_EX_NOCACHE); + depth - i - 1, flags); if (IS_ERR(bh)) { /* should we reset i_size? */ err = PTR_ERR(bh); @@ -4202,7 +4205,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* find extent for this block */ - path = ext4_find_extent(inode, map->m_lblk, NULL, 0); + path = ext4_find_extent(inode, map->m_lblk, NULL, flags); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -4314,7 +4317,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (err) goto out; ar.lright = map->m_lblk; - err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); + err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, + &ex2, flags); if (err < 0) goto out; @@ -4433,6 +4437,20 @@ got_allocated_blocks: allocated = map->m_len; ext4_ext_show_leaf(inode, path); out: + /* + * We never use EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF with CREATE flag. + * So we know that the depth used here is correct, since there was no + * block allocation done if EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF is set. + * If tomorrow we start using this QUERY flag with CREATE, then we will + * need to re-calculate the depth as it might have changed due to block + * allocation. + */ + if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) { + WARN_ON_ONCE(flags & EXT4_GET_BLOCKS_CREATE); + if (!err && ex && (ex == EXT_LAST_EXTENT(path[depth].p_hdr))) + map->m_flags |= EXT4_MAP_QUERY_LAST_IN_LEAF; + } + ext4_free_ext_path(path); trace_ext4_ext_map_blocks_exit(inode, flags, map, @@ -4568,131 +4586,65 @@ static long ext4_zero_range(struct file *file, loff_t offset, loff_t len, int mode) { struct inode *inode = file_inode(file); - struct address_space *mapping = file->f_mapping; handle_t *handle = NULL; - unsigned int max_blocks; loff_t new_size = 0; - int ret = 0; - int flags; - int credits; - int partial_begin, partial_end; - loff_t start, end; - ext4_lblk_t lblk; + loff_t end = offset + len; + ext4_lblk_t start_lblk, end_lblk; + unsigned int blocksize = i_blocksize(inode); unsigned int blkbits = inode->i_blkbits; + int ret, flags, credits; trace_ext4_zero_range(inode, offset, len, mode); + WARN_ON_ONCE(!inode_is_locked(inode)); - /* - * Round up offset. 
This is not fallocate, we need to zero out - * blocks, so convert interior block aligned part of the range to - * unwritten and possibly manually zero out unaligned parts of the - * range. Here, start and partial_begin are inclusive, end and - * partial_end are exclusive. - */ - start = round_up(offset, 1 << blkbits); - end = round_down((offset + len), 1 << blkbits); - - if (start < offset || end > offset + len) - return -EINVAL; - partial_begin = offset & ((1 << blkbits) - 1); - partial_end = (offset + len) & ((1 << blkbits) - 1); - - lblk = start >> blkbits; - max_blocks = (end >> blkbits); - if (max_blocks < lblk) - max_blocks = 0; - else - max_blocks -= lblk; - - inode_lock(inode); - - /* - * Indirect files do not support unwritten extents - */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { - ret = -EOPNOTSUPP; - goto out_mutex; - } + /* Indirect files do not support unwritten extents */ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) + return -EOPNOTSUPP; if (!(mode & FALLOC_FL_KEEP_SIZE) && - (offset + len > inode->i_size || - offset + len > EXT4_I(inode)->i_disksize)) { - new_size = offset + len; + (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) { + new_size = end; ret = inode_newsize_ok(inode, new_size); if (ret) - goto out_mutex; + return ret; } flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; - - /* Wait all existing dio workers, newcomers will block on i_rwsem */ - inode_dio_wait(inode); - - ret = file_modified(file); - if (ret) - goto out_mutex; - /* Preallocate the range including the unaligned edges */ - if (partial_begin || partial_end) { - ret = ext4_alloc_file_blocks(file, - round_down(offset, 1 << blkbits) >> blkbits, - (round_up((offset + len), 1 << blkbits) - - round_down(offset, 1 << blkbits)) >> blkbits, - new_size, flags); - if (ret) - goto out_mutex; + if (!IS_ALIGNED(offset | end, blocksize)) { + ext4_lblk_t alloc_lblk = offset >> blkbits; + ext4_lblk_t len_lblk = EXT4_MAX_BLOCKS(len, offset, blkbits); + ret = ext4_alloc_file_blocks(file, alloc_lblk, len_lblk, + new_size, flags); + if (ret) + return ret; } - /* Zero range excluding the unaligned edges */ - if (max_blocks > 0) { - flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | - EXT4_EX_NOCACHE); - - /* - * Prevent page faults from reinstantiating pages we have - * released from page cache. - */ - filemap_invalidate_lock(mapping); - - ret = ext4_break_layouts(inode); - if (ret) { - filemap_invalidate_unlock(mapping); - goto out_mutex; - } - - ret = ext4_update_disksize_before_punch(inode, offset, len); - if (ret) { - filemap_invalidate_unlock(mapping); - goto out_mutex; - } - - /* - * For journalled data we need to write (and checkpoint) pages - * before discarding page cache to avoid inconsitent data on - * disk in case of crash before zeroing trans is committed. 
- */ - if (ext4_should_journal_data(inode)) { - ret = filemap_write_and_wait_range(mapping, start, - end - 1); - if (ret) { - filemap_invalidate_unlock(mapping); - goto out_mutex; - } - } + ret = ext4_update_disksize_before_punch(inode, offset, len); + if (ret) + return ret; - /* Now release the pages and zero block aligned part of pages */ - truncate_pagecache_range(inode, start, end - 1); - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + /* Now release the pages and zero block aligned part of pages */ + ret = ext4_truncate_page_cache_block_range(inode, offset, end); + if (ret) + return ret; - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, - flags); - filemap_invalidate_unlock(mapping); + /* Zero range excluding the unaligned edges */ + start_lblk = EXT4_B_TO_LBLK(inode, offset); + end_lblk = end >> blkbits; + if (end_lblk > start_lblk) { + ext4_lblk_t zero_blks = end_lblk - start_lblk; + + flags |= (EXT4_GET_BLOCKS_CONVERT_UNWRITTEN | EXT4_EX_NOCACHE); + ret = ext4_alloc_file_blocks(file, start_lblk, zero_blks, + new_size, flags); if (ret) - goto out_mutex; + return ret; } - if (!partial_begin && !partial_end) - goto out_mutex; + /* Finish zeroing out if it doesn't contain partial block */ + if (IS_ALIGNED(offset | end, blocksize)) + return ret; /* * In worst case we have to writeout two nonadjacent unwritten @@ -4705,27 +4657,69 @@ static long ext4_zero_range(struct file *file, loff_t offset, if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(inode->i_sb, ret); - goto out_mutex; + return ret; } - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); + /* Zero out partial block at the edges of the range */ + ret = ext4_zero_partial_blocks(handle, inode, offset, len); + if (ret) + goto out_handle; + if (new_size) ext4_update_inode_size(inode, new_size); ret = ext4_mark_inode_dirty(handle, inode); if (unlikely(ret)) goto out_handle; - /* Zero out partial block at the edges of the range */ - ret = ext4_zero_partial_blocks(handle, inode, offset, len); - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); + ext4_update_inode_fsync_trans(handle, inode, 1); if (file->f_flags & O_SYNC) ext4_handle_sync(handle); out_handle: ext4_journal_stop(handle); -out_mutex: - inode_unlock(inode); + return ret; +} + +static long ext4_do_fallocate(struct file *file, loff_t offset, + loff_t len, int mode) +{ + struct inode *inode = file_inode(file); + loff_t end = offset + len; + loff_t new_size = 0; + ext4_lblk_t start_lblk, len_lblk; + int ret; + + trace_ext4_fallocate_enter(inode, offset, len, mode); + WARN_ON_ONCE(!inode_is_locked(inode)); + + start_lblk = offset >> inode->i_blkbits; + len_lblk = EXT4_MAX_BLOCKS(len, offset, inode->i_blkbits); + + /* We only support preallocation for extent-based files only. 
*/ + if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { + ret = -EOPNOTSUPP; + goto out; + } + + if (!(mode & FALLOC_FL_KEEP_SIZE) && + (end > inode->i_size || end > EXT4_I(inode)->i_disksize)) { + new_size = end; + ret = inode_newsize_ok(inode, new_size); + if (ret) + goto out; + } + + ret = ext4_alloc_file_blocks(file, start_lblk, len_lblk, new_size, + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT); + if (ret) + goto out; + + if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { + ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, + EXT4_I(inode)->i_sync_tid); + } +out: + trace_ext4_fallocate_exit(inode, offset, len_lblk, ret); return ret; } @@ -4739,12 +4733,8 @@ out_mutex: long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - loff_t new_size = 0; - unsigned int max_blocks; - int ret = 0; - int flags; - ext4_lblk_t lblk; - unsigned int blkbits = inode->i_blkbits; + struct address_space *mapping = file->f_mapping; + int ret; /* * Encrypted inodes can't handle collapse range or insert @@ -4764,74 +4754,135 @@ long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len) inode_lock(inode); ret = ext4_convert_inline_data(inode); - inode_unlock(inode); if (ret) - goto exit; + goto out_inode_lock; - if (mode & FALLOC_FL_PUNCH_HOLE) { - ret = ext4_punch_hole(file, offset, len); - goto exit; - } + /* Wait all existing dio workers, newcomers will block on i_rwsem */ + inode_dio_wait(inode); - if (mode & FALLOC_FL_COLLAPSE_RANGE) { - ret = ext4_collapse_range(file, offset, len); - goto exit; - } + ret = file_modified(file); + if (ret) + goto out_inode_lock; - if (mode & FALLOC_FL_INSERT_RANGE) { - ret = ext4_insert_range(file, offset, len); - goto exit; + if ((mode & FALLOC_FL_MODE_MASK) == FALLOC_FL_ALLOCATE_RANGE) { + ret = ext4_do_fallocate(file, offset, len, mode); + goto out_inode_lock; } - if (mode & FALLOC_FL_ZERO_RANGE) { + /* + * Follow-up operations will drop page cache, hold invalidate lock + * to prevent page faults from reinstantiating pages we have + * released from page cache. + */ + filemap_invalidate_lock(mapping); + + ret = ext4_break_layouts(inode); + if (ret) + goto out_invalidate_lock; + + if (mode & FALLOC_FL_PUNCH_HOLE) + ret = ext4_punch_hole(file, offset, len); + else if (mode & FALLOC_FL_COLLAPSE_RANGE) + ret = ext4_collapse_range(file, offset, len); + else if (mode & FALLOC_FL_INSERT_RANGE) + ret = ext4_insert_range(file, offset, len); + else if (mode & FALLOC_FL_ZERO_RANGE) ret = ext4_zero_range(file, offset, len, mode); - goto exit; - } - trace_ext4_fallocate_enter(inode, offset, len, mode); - lblk = offset >> blkbits; + else + ret = -EOPNOTSUPP; + +out_invalidate_lock: + filemap_invalidate_unlock(mapping); +out_inode_lock: + inode_unlock(inode); + return ret; +} +/* + * This function converts a range of blocks to written extents. The caller of + * this function will pass the start offset and the size. All unwritten extents + * within this range will be converted to written extents. + * + * This function is called from the direct IO end_io callback function for + * atomic writes, to convert the unwritten extents after IO is completed. + * + * Note that the requirement for atomic writes is that all conversion should + * happen atomically in a single fs journal transaction. We mainly allocate + * unwritten extents either over a hole or over a pre-existing unwritten extent range in + * ext4_map_blocks_atomic_write(). 
The only case where we can have multiple + unwritten extents in a range [offset, offset+len) is when there is a split + unwritten extent between two leaf nodes which was cached in the extent status + cache during ext4_iomap_alloc(). That will allow + ext4_map_blocks_atomic_write() to return the unwritten extent range without going + into the slow path. That means we might need a loop for conversion of this + unwritten extent split across leaf blocks within a single journal transaction. + A split extent across leaf nodes is a rare case, but let's still handle it + to meet the requirements of multi-fsblock atomic writes. + + Returns 0 on success. + */ +int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len) +{ + unsigned int max_blocks; + int ret = 0, ret2 = 0, ret3 = 0; + struct ext4_map_blocks map; + unsigned int blkbits = inode->i_blkbits; + unsigned int credits = 0; + int flags = EXT4_GET_BLOCKS_IO_CONVERT_EXT | EXT4_EX_NOCACHE; + + map.m_lblk = offset >> blkbits; max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); - flags = EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT; - inode_lock(inode); + if (!handle) { + /* + * TODO: An optimization can be added later by having an extent + * status flag e.g. EXTENT_STATUS_SPLIT_LEAF. If we query that + * it can tell if the extent in the cache is a split extent. + * But for now let's assume pextents as 2 always. + */ + credits = ext4_meta_trans_blocks(inode, max_blocks, 2); + } - /* - * We only support preallocation for extent-based files only - */ - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) { - ret = -EOPNOTSUPP; - goto out; + if (credits) { + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + return ret; + } } - if (!(mode & FALLOC_FL_KEEP_SIZE) && - (offset + len > inode->i_size || - offset + len > EXT4_I(inode)->i_disksize)) { - new_size = offset + len; - ret = inode_newsize_ok(inode, new_size); - if (ret) - goto out; + while (ret >= 0 && ret < max_blocks) { + map.m_lblk += ret; + map.m_len = (max_blocks -= ret); + ret = ext4_map_blocks(handle, inode, &map, flags); + if (ret != max_blocks) + ext4_msg(inode->i_sb, KERN_INFO, + "inode #%lu: block %u: len %u: " + "split block mapping found for atomic write, " + "ret = %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); + if (ret <= 0) + break; } - /* Wait all existing dio workers, newcomers will block on i_rwsem */ - inode_dio_wait(inode); + ret2 = ext4_mark_inode_dirty(handle, inode); - ret = file_modified(file); - if (ret) - goto out; + if (credits) { + ret3 = ext4_journal_stop(handle); + if (unlikely(ret3)) + ret2 = ret3; + } - ret = ext4_alloc_file_blocks(file, lblk, max_blocks, new_size, flags); - if (ret) - goto out; + if (ret <= 0 || ret2) + ext4_warning(inode->i_sb, + "inode #%lu: block %u: len %u: " + "returned %d or %d", + inode->i_ino, map.m_lblk, + map.m_len, ret, ret2); - if (file->f_flags & O_SYNC && EXT4_SB(inode->i_sb)->s_journal) { - ret = ext4_fc_commit(EXT4_SB(inode->i_sb)->s_journal, - EXT4_I(inode)->i_sync_tid); - } -out: - inode_unlock(inode); - trace_ext4_fallocate_exit(inode, offset, max_blocks, ret); -exit: - return ret; + return ret > 0 ? ret2 : ret; } /* @@ -4873,8 +4924,14 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, break; } } + /* + * Do not cache any unrelated extents, as it does not hold the + * i_rwsem or invalidate_lock, which could corrupt the extent + * status tree. 
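The conversion loop in ext4_convert_unwritten_extents_atomic() above advances by however many blocks each ext4_map_blocks() call actually handled, so a split extent simply costs one more iteration inside the same transaction. A minimal stand-alone sketch of that pattern; map_range() is a made-up stand-in that converts up to a pretend leaf boundary:

#include <stdio.h>

struct map { unsigned int lblk, len; };

/* Pretend the extent tree has a leaf boundary at block 8. */
static int map_range(struct map *m)
{
        unsigned int leaf_end = (m->lblk < 8) ? 8 : 16;
        unsigned int n = leaf_end - m->lblk;

        return (int)(n < m->len ? n : m->len);  /* blocks converted */
}

int main(void)
{
        struct map m = { .lblk = 4, .len = 0 };
        unsigned int max_blocks = 10;
        int ret = 0;

        while (ret >= 0 && ret < (int)max_blocks) {
                m.lblk += ret;                  /* skip what was just done */
                m.len = (max_blocks -= ret);    /* shrink the remainder */
                ret = map_range(&m);
                printf("converted %d block(s) at %u\n", ret, m.lblk);
                if (ret <= 0)
                        break;
        }
        return 0;
}

For the range [4, 14) this prints two iterations, 4 blocks then 6, mirroring how a split at a leaf boundary is absorbed without leaving the transaction.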
+ */ ret = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_IO_CONVERT_EXT); + EXT4_GET_BLOCKS_IO_CONVERT_EXT | + EXT4_EX_NOCACHE); if (ret <= 0) ext4_warning(inode->i_sb, "inode #%lu: block %u: len %u: " @@ -4985,12 +5042,7 @@ static const struct iomap_ops ext4_iomap_xattr_ops = { static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len) { - u64 maxbytes; - - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - maxbytes = inode->i_sb->s_maxbytes; - else - maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; + u64 maxbytes = ext4_get_maxbytes(inode); if (*len == 0) return -EINVAL; @@ -5010,10 +5062,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, { int error = 0; + inode_lock_shared(inode); if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { error = ext4_ext_precache(inode); if (error) - return error; + goto unlock; fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } @@ -5024,15 +5077,19 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, */ error = ext4_fiemap_check_ranges(inode, start, &len); if (error) - return error; + goto unlock; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; - return iomap_fiemap(inode, fieinfo, start, len, - &ext4_iomap_xattr_ops); + error = iomap_fiemap(inode, fieinfo, start, len, + &ext4_iomap_xattr_ops); + } else { + error = iomap_fiemap(inode, fieinfo, start, len, + &ext4_iomap_report_ops); } - - return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops); +unlock: + inode_unlock_shared(inode); + return error; } int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -5053,7 +5110,9 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, } if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + inode_lock_shared(inode); error = ext4_ext_precache(inode); + inode_unlock_shared(inode); if (error) return error; fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; @@ -5332,109 +5391,74 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; struct address_space *mapping = inode->i_mapping; - ext4_lblk_t punch_start, punch_stop; + loff_t end = offset + len; + ext4_lblk_t start_lblk, end_lblk; handle_t *handle; unsigned int credits; - loff_t new_size, ioffset; + loff_t start, new_size; int ret; - /* - * We need to test this early because xfstests assumes that a - * collapse range of (0, 1) will return EOPNOTSUPP if the file - * system does not support collapse range. - */ + trace_ext4_collapse_range(inode, offset, len); + WARN_ON_ONCE(!inode_is_locked(inode)); + + /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; - /* Collapse range works only on fs cluster size aligned regions. 
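With ext4_fiemap() now holding inode_lock_shared() across the whole report, a FIEMAP query sees a stable extent tree for its duration. For reference, a self-contained caller using the standard FS_IOC_FIEMAP interface (nothing ext4-specific; extent count chosen arbitrarily):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/fiemap.h>
#include <linux/fs.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(int argc, char **argv)
{
	struct fiemap *fm;
	unsigned int n = 32;
	int fd;

	if (argc < 2) { fprintf(stderr, "usage: %s <file>\n", argv[0]); return 1; }
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) { perror("open"); return 1; }

	fm = calloc(1, sizeof(*fm) + n * sizeof(struct fiemap_extent));
	if (!fm) return 1;
	fm->fm_start = 0;
	fm->fm_length = FIEMAP_MAX_OFFSET;
	fm->fm_flags = FIEMAP_FLAG_SYNC;	/* flush dirty data first */
	fm->fm_extent_count = n;

	if (ioctl(fd, FS_IOC_FIEMAP, fm)) { perror("FS_IOC_FIEMAP"); return 1; }

	for (unsigned int i = 0; i < fm->fm_mapped_extents; i++) {
		struct fiemap_extent *fe = &fm->fm_extents[i];
		printf("logical %llu physical %llu len %llu flags 0x%x\n",
		       (unsigned long long)fe->fe_logical,
		       (unsigned long long)fe->fe_physical,
		       (unsigned long long)fe->fe_length, fe->fe_flags);
	}
	free(fm);
	close(fd);
	return 0;
}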
*/ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; - - trace_ext4_collapse_range(inode, offset, len); - - punch_start = offset >> EXT4_BLOCK_SIZE_BITS(sb); - punch_stop = (offset + len) >> EXT4_BLOCK_SIZE_BITS(sb); - - inode_lock(inode); /* * There is no need to overlap collapse range with EOF, in which case * it is effectively a truncate operation */ - if (offset + len >= inode->i_size) { - ret = -EINVAL; - goto out_mutex; - } - - /* Currently just for extent based files */ - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - ret = -EOPNOTSUPP; - goto out_mutex; - } - - /* Wait for existing dio to complete */ - inode_dio_wait(inode); - - ret = file_modified(file); - if (ret) - goto out_mutex; - - /* - * Prevent page faults from reinstantiating pages we have released from - * page cache. - */ - filemap_invalidate_lock(mapping); - - ret = ext4_break_layouts(inode); - if (ret) - goto out_mmap; + if (end >= inode->i_size) + return -EINVAL; /* + * Write tail of the last page before removed range and data that + * will be shifted since they will get removed from the page cache + * below. We are also protected from pages becoming dirty by + * i_rwsem and invalidate_lock. * Need to round down offset to be aligned with page size boundary * for page size > block size. */ - ioffset = round_down(offset, PAGE_SIZE); - /* - * Write tail of the last page before removed range since it will get - * removed from the page cache below. - */ - ret = filemap_write_and_wait_range(mapping, ioffset, offset); + start = round_down(offset, PAGE_SIZE); + ret = filemap_write_and_wait_range(mapping, start, offset); + if (!ret) + ret = filemap_write_and_wait_range(mapping, end, LLONG_MAX); if (ret) - goto out_mmap; - /* - * Write data that will be shifted to preserve them when discarding - * page cache below. We are also protected from pages becoming dirty - * by i_rwsem and invalidate_lock. 
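The collapse path writes back the partial first page and everything past the removed range, drops the page cache, then shifts extents left. At the syscall boundary the contract is: aligned offset/len, a range strictly before EOF, and a file that shrinks by len. A short demo of those semantics (assumes a 4 KiB block size on the test filesystem; path invented):

#define _GNU_SOURCE
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <string.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	const off_t blk = 4096;
	char buf[4096];
	struct stat st;
	int fd = open("/tmp/collapse-demo", O_RDWR | O_CREAT | O_TRUNC, 0600);

	if (fd < 0) { perror("open"); return 1; }
	for (int i = 0; i < 256; i++) {
		memset(buf, 'A' + (i % 26), sizeof(buf));
		pwrite(fd, buf, sizeof(buf), i * blk);
	}

	/* Remove blocks [16, 32); data beyond shifts left, i_size shrinks */
	if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 16 * blk, 16 * blk))
		perror("collapse");
	fstat(fd, &st);
	printf("new size: %lld\n", (long long)st.st_size);

	/* Collapsing up to or past EOF must fail: that is a truncate, not a collapse */
	if (fallocate(fd, FALLOC_FL_COLLAPSE_RANGE, 0, st.st_size))
		perror("collapse to EOF (expected EINVAL)");
	close(fd);
	return 0;
}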
- */ - ret = filemap_write_and_wait_range(mapping, offset + len, - LLONG_MAX); - if (ret) - goto out_mmap; - truncate_pagecache(inode, ioffset); + return ret; + + truncate_pagecache(inode, start); credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_mmap; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); + start_lblk = offset >> inode->i_blkbits; + end_lblk = (offset + len) >> inode->i_blkbits; + + ext4_check_map_extents_env(inode); + down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); - ext4_es_remove_extent(inode, punch_start, EXT_MAX_BLOCKS - punch_start); + ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk); - ret = ext4_ext_remove_space(inode, punch_start, punch_stop - 1); + ret = ext4_ext_remove_space(inode, start_lblk, end_lblk - 1); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; + goto out_handle; } ext4_discard_preallocations(inode); - ret = ext4_ext_shift_extents(inode, handle, punch_stop, - punch_stop - punch_start, SHIFT_LEFT); + ret = ext4_ext_shift_extents(inode, handle, end_lblk, + end_lblk - start_lblk, SHIFT_LEFT); if (ret) { up_write(&EXT4_I(inode)->i_data_sem); - goto out_stop; + goto out_handle; } new_size = inode->i_size - len; @@ -5442,18 +5466,16 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) EXT4_I(inode)->i_disksize = new_size; up_write(&EXT4_I(inode)->i_data_sem); - if (IS_SYNC(inode)) - ext4_handle_sync(handle); - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_mark_inode_dirty(handle, inode); + if (ret) + goto out_handle; + ext4_update_inode_fsync_trans(handle, inode, 1); + if (IS_SYNC(inode)) + ext4_handle_sync(handle); -out_stop: +out_handle: ext4_journal_stop(handle); -out_mmap: - filemap_invalidate_unlock(mapping); -out_mutex: - inode_unlock(inode); return ret; } @@ -5473,100 +5495,65 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) handle_t *handle; struct ext4_ext_path *path; struct ext4_extent *extent; - ext4_lblk_t offset_lblk, len_lblk, ee_start_lblk = 0; + ext4_lblk_t start_lblk, len_lblk, ee_start_lblk = 0; unsigned int credits, ee_len; - int ret = 0, depth, split_flag = 0; - loff_t ioffset; + int ret, depth, split_flag = 0; + loff_t start; - /* - * We need to test this early because xfstests assumes that an - * insert range of (0, 1) will return EOPNOTSUPP if the file - * system does not support insert range. - */ + trace_ext4_insert_range(inode, offset, len); + WARN_ON_ONCE(!inode_is_locked(inode)); + + /* Currently just for extent based files */ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return -EOPNOTSUPP; - /* Insert range works only on fs cluster size aligned regions. 
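ext4_insert_range() now performs all of its argument validation before touching any state: cluster alignment, offset strictly below i_size, and the s_maxbytes overflow check. The errno for each rejected case is observable from userspace (paths and sizes are arbitrary for the demo):

#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <linux/falloc.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	int fd = open("/tmp/insert-demo", O_RDWR | O_CREAT | O_TRUNC, 0600); /* arbitrary path */

	if (fd < 0 || ftruncate(fd, 1 << 20)) { perror("setup"); return 1; }

	/* offset at or beyond i_size: EINVAL, mirroring the check above */
	errno = 0;
	fallocate(fd, FALLOC_FL_INSERT_RANGE, 2 << 20, 4096);
	printf("insert past EOF: %s\n", strerror(errno));

	/* unaligned offset/len: EINVAL (cluster-size alignment required) */
	errno = 0;
	fallocate(fd, FALLOC_FL_INSERT_RANGE, 123, 4096);
	printf("unaligned insert: %s\n", strerror(errno));

	/* a valid, block-aligned insert shifts data right and grows i_size */
	if (fallocate(fd, FALLOC_FL_INSERT_RANGE, 4096, 4096))
		perror("aligned insert");
	close(fd);
	return 0;
}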
*/ if (!IS_ALIGNED(offset | len, EXT4_CLUSTER_SIZE(sb))) return -EINVAL; - - trace_ext4_insert_range(inode, offset, len); - - offset_lblk = offset >> EXT4_BLOCK_SIZE_BITS(sb); - len_lblk = len >> EXT4_BLOCK_SIZE_BITS(sb); - - inode_lock(inode); - /* Currently just for extent based files */ - if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) { - ret = -EOPNOTSUPP; - goto out_mutex; - } - - /* Check whether the maximum file size would be exceeded */ - if (len > inode->i_sb->s_maxbytes - inode->i_size) { - ret = -EFBIG; - goto out_mutex; - } - /* Offset must be less than i_size */ - if (offset >= inode->i_size) { - ret = -EINVAL; - goto out_mutex; - } - - /* Wait for existing dio to complete */ - inode_dio_wait(inode); - - ret = file_modified(file); - if (ret) - goto out_mutex; + if (offset >= inode->i_size) + return -EINVAL; + /* Check whether the maximum file size would be exceeded */ + if (len > inode->i_sb->s_maxbytes - inode->i_size) + return -EFBIG; /* - * Prevent page faults from reinstantiating pages we have released from - * page cache. + * Write out all dirty pages. Need to round down to align start offset + * to page size boundary for page size > block size. */ - filemap_invalidate_lock(mapping); - - ret = ext4_break_layouts(inode); + start = round_down(offset, PAGE_SIZE); + ret = filemap_write_and_wait_range(mapping, start, LLONG_MAX); if (ret) - goto out_mmap; + return ret; - /* - * Need to round down to align start offset to page size boundary - * for page size > block size. - */ - ioffset = round_down(offset, PAGE_SIZE); - /* Write out all dirty pages */ - ret = filemap_write_and_wait_range(inode->i_mapping, ioffset, - LLONG_MAX); - if (ret) - goto out_mmap; - truncate_pagecache(inode, ioffset); + truncate_pagecache(inode, start); credits = ext4_writepage_trans_blocks(inode); handle = ext4_journal_start(inode, EXT4_HT_TRUNCATE, credits); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out_mmap; - } + if (IS_ERR(handle)) + return PTR_ERR(handle); + ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_FALLOC_RANGE, handle); /* Expand file to avoid data loss if there is error while shifting */ inode->i_size += len; EXT4_I(inode)->i_disksize += len; - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); ret = ext4_mark_inode_dirty(handle, inode); if (ret) - goto out_stop; + goto out_handle; + + start_lblk = offset >> inode->i_blkbits; + len_lblk = len >> inode->i_blkbits; + + ext4_check_map_extents_env(inode); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); - path = ext4_find_extent(inode, offset_lblk, NULL, 0); + path = ext4_find_extent(inode, start_lblk, NULL, 0); if (IS_ERR(path)) { up_write(&EXT4_I(inode)->i_data_sem); ret = PTR_ERR(path); - goto out_stop; + goto out_handle; } depth = ext_depth(inode); @@ -5576,16 +5563,16 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) ee_len = ext4_ext_get_actual_len(extent); /* - * If offset_lblk is not the starting block of extent, split - * the extent @offset_lblk + * If start_lblk is not the starting block of extent, split + * the extent @start_lblk */ - if ((offset_lblk > ee_start_lblk) && - (offset_lblk < (ee_start_lblk + ee_len))) { + if ((start_lblk > ee_start_lblk) && + (start_lblk < (ee_start_lblk + ee_len))) { if (ext4_ext_is_unwritten(extent)) split_flag = EXT4_EXT_MARK_UNWRIT1 | EXT4_EXT_MARK_UNWRIT2; path = ext4_split_extent_at(handle, inode, path, - offset_lblk, split_flag, + start_lblk, split_flag, EXT4_EX_NOCACHE | EXT4_GET_BLOCKS_PRE_IO | 
EXT4_GET_BLOCKS_METADATA_NOFAIL); @@ -5594,32 +5581,29 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) if (IS_ERR(path)) { up_write(&EXT4_I(inode)->i_data_sem); ret = PTR_ERR(path); - goto out_stop; + goto out_handle; } } ext4_free_ext_path(path); - ext4_es_remove_extent(inode, offset_lblk, EXT_MAX_BLOCKS - offset_lblk); + ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk); /* - * if offset_lblk lies in a hole which is at start of file, use + * if start_lblk lies in a hole which is at start of file, use * ee_start_lblk to shift extents */ ret = ext4_ext_shift_extents(inode, handle, - max(ee_start_lblk, offset_lblk), len_lblk, SHIFT_RIGHT); - + max(ee_start_lblk, start_lblk), len_lblk, SHIFT_RIGHT); up_write(&EXT4_I(inode)->i_data_sem); + if (ret) + goto out_handle; + + ext4_update_inode_fsync_trans(handle, inode, 1); if (IS_SYNC(inode)) ext4_handle_sync(handle); - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); -out_stop: +out_handle: ext4_journal_stop(handle); -out_mmap: - filemap_invalidate_unlock(mapping); -out_mutex: - inode_unlock(inode); return ret; } diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index ae29832aab1e..31dc0496f8d0 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -120,9 +120,40 @@ * memory. Hence, we will reclaim written/unwritten/hole extents from * the tree under a heavy memory pressure. * + * ========================================================================== + * 3. Assurance of Ext4 extent status tree consistency + * + * When mapping blocks, Ext4 queries the extent status tree first and should + * always trust that the extent status tree is consistent and up to date. + * Therefore, it is important to adhere to the following rules when creating, + * modifying and removing extents. + * + * 1. Besides fastcommit replay, when Ext4 creates or queries block mappings, + * the extent information should always be processed through the extent + * status tree instead of being organized manually through the on-disk + * extent tree. + * + * 2. When updating the extent tree, Ext4 should acquire the i_data_sem + * exclusively and update the extent status tree atomically. If the extents + * to be modified are large enough to exceed the range that a single + * i_data_sem can process (as ext4_datasem_ensure_credits() may drop + * i_data_sem to restart a transaction), it must (e.g. as ext4_punch_hole() + * does): + * + * a) Hold the i_rwsem and invalidate_lock exclusively. This ensures + * exclusion against page faults, as well as reads and writes that may + * concurrently modify the extent status tree. + * b) Evict all page cache in the affected range and recommend rebuilding + * or dropping the extent status tree after modifying the on-disk + * extent tree. This ensures exclusion against concurrent writebacks + * that do not hold those locks but only hold a folio lock. + * + * 3. Based on the rules above, when querying block mappings, Ext4 should at + * least hold the i_rwsem or invalidate_lock or folio lock(s) for the + * specified querying range. * * ========================================================================== - * 3. Performance analysis + * 4. Performance analysis * * -- overhead * 1. There is a cache extent for write access, so if writes are @@ -134,7 +165,7 @@ * * * ========================================================================== - * 4. TODO list + * 5.
TODO list * * -- Refactor delayed space reservation * @@ -1551,7 +1582,6 @@ retry: ext4_es_print_tree(inode); ext4_da_release_space(inode, reserved); - return; } static int __es_shrink(struct ext4_sb_info *sbi, int nr_to_scan, diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index da4263a14a20..42bee1d4f9f9 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -12,6 +12,7 @@ #include "ext4_extents.h" #include "mballoc.h" +#include <linux/lockdep.h> /* * Ext4 Fast Commits * ----------------- @@ -49,19 +50,27 @@ * that need to be committed during a fast commit in another in memory queue of * inodes. During the commit operation, we commit in the following order: * - * [1] Lock inodes for any further data updates by setting COMMITTING state - * [2] Submit data buffers of all the inodes - * [3] Wait for [2] to complete - * [4] Commit all the directory entry updates in the fast commit space - * [5] Commit all the changed inode structures - * [6] Write tail tag (this tag ensures the atomicity, please read the following + * [1] Prepare all the inodes to write out their data by setting + * "EXT4_STATE_FC_FLUSHING_DATA". This ensures that the inode cannot be + * deleted while it is being flushed. + * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA" + * state. + * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that + * all the existing handles finish and no new handles can start. + * [4] Mark all the fast commit eligible inodes as undergoing fast commit + * by setting "EXT4_STATE_FC_COMMITTING" state. + * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows + * starting of new handles. If new handles try to start an update on + * any of the inodes that are being committed, ext4_fc_track_inode() + * will block until those inodes have finished the fast commit. + * [6] Commit all the directory entry updates in the fast commit space. + * [7] Commit all the changed inodes in the fast commit space and clear + * "EXT4_STATE_FC_COMMITTING" for these inodes. + * [8] Write tail tag (this tag ensures the atomicity, please read the following * section for more details). - * [7] Wait for [4], [5] and [6] to complete. * - * All the inode updates must call ext4_fc_start_update() before starting an - * update. If such an ongoing update is present, fast commit waits for it to - * complete. The completion of such an update is marked by - * ext4_fc_stop_update(). + * All the inode updates must be enclosed within jbd2_journal_start() + * and jbd2_journal_stop(), similar to JBD2 journaling. * * Fast Commit Ineligibility * ------------------------- @@ -142,6 +151,13 @@ * similarly. Thus, by converting a non-idempotent procedure into a series of * idempotent outcomes, fast commits ensured idempotence during the replay. * + * Locking + * ------- + * sbi->s_fc_lock protects the fast commit inode queue and the fast commit + * dentry queue. ei->i_fc_lock protects the fast commit related info in a given + * inode. Most of the code avoids acquiring both the locks, but if one must do + * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock. + * * TODOs * ----- * @@ -156,13 +172,12 @@ * fast commit recovery even if that area is invalidated by later full * commits. * - * 1) Fast commit's commit path locks the entire file system during fast - * commit. This has significant performance penalty. Instead of that, we - * should use ext4_fc_start/stop_update functions to start inode level - * updates from ext4_journal_start/stop.
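Steps [3] to [5] of the new ordering rely on jbd2_journal_lock_updates() acting as a drain barrier: once it returns, no handle is running, so the COMMITTING flags are set against a quiesced set of inodes. A toy pthread model of that barrier, with handles as readers and the commit thread as a writer (illustrative only, not kernel code; compile with -pthread):

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

/* Toy model: handles take the journal lock as readers; the commit thread
 * takes it as a writer, which both drains all running handles and blocks
 * new ones, the effect of jbd2_journal_lock_updates(). */
static pthread_rwlock_t journal_lock = PTHREAD_RWLOCK_INITIALIZER;

static void *handle_thread(void *arg)
{
	(void)arg;
	pthread_rwlock_rdlock(&journal_lock);	/* jbd2_journal_start() */
	usleep(1000);				/* ... modify metadata ... */
	pthread_rwlock_unlock(&journal_lock);	/* jbd2_journal_stop() */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, handle_thread, NULL);

	pthread_rwlock_wrlock(&journal_lock);	/* [3] lock: drain handles */
	puts("quiesced: mark inodes FC_COMMITTING");	/* [4] */
	pthread_rwlock_unlock(&journal_lock);	/* [5] unlock: new handles run */

	pthread_join(t, NULL);
	return 0;
}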
Once we do that we can drop file - * system locking during commit path. + * 1) Handle more ineligible cases. * - * 2) Handle more ineligible cases. + * 2) Change ext4_fc_commit() to lookup logical to physical mapping using extent + * status tree. This would get rid of the need to call ext4_fc_track_inode() + * before acquiring i_data_sem. To do that we would need to ensure that + * modified extents from the extent status tree are not evicted from memory. */ #include <trace/events/ext4.h> @@ -201,32 +216,6 @@ void ext4_fc_init_inode(struct inode *inode) INIT_LIST_HEAD(&ei->i_fc_list); INIT_LIST_HEAD(&ei->i_fc_dilist); init_waitqueue_head(&ei->i_fc_wait); - atomic_set(&ei->i_fc_updates, 0); -} - -/* This function must be called with sbi->s_fc_lock held. */ -static void ext4_fc_wait_committing_inode(struct inode *inode) -__releases(&EXT4_SB(inode->i_sb)->s_fc_lock) -{ - wait_queue_head_t *wq; - struct ext4_inode_info *ei = EXT4_I(inode); - -#if (BITS_PER_LONG < 64) - DEFINE_WAIT_BIT(wait, &ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); -#else - DEFINE_WAIT_BIT(wait, &ei->i_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_flags, - EXT4_STATE_FC_COMMITTING); -#endif - lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock); - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); - schedule(); - finish_wait(wq, &wait.wq_entry); } static bool ext4_fc_disabled(struct super_block *sb) @@ -236,48 +225,6 @@ static bool ext4_fc_disabled(struct super_block *sb) } /* - * Inform Ext4's fast about start of an inode update - * - * This function is called by the high level call VFS callbacks before - * performing any inode update. This function blocks if there's an ongoing - * fast commit on the inode in question. - */ -void ext4_fc_start_update(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - - if (ext4_fc_disabled(inode->i_sb)) - return; - -restart: - spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); - if (list_empty(&ei->i_fc_list)) - goto out; - - if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { - ext4_fc_wait_committing_inode(inode); - goto restart; - } -out: - atomic_inc(&ei->i_fc_updates); - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); -} - -/* - * Stop inode update and wake up waiting fast commits if any. - */ -void ext4_fc_stop_update(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - - if (ext4_fc_disabled(inode->i_sb)) - return; - - if (atomic_dec_and_test(&ei->i_fc_updates)) - wake_up_all(&ei->i_fc_wait); -} - -/* * Remove inode from fast commit list. If the inode is being committed * we wait until inode commit is done. */ @@ -286,31 +233,62 @@ void ext4_fc_del(struct inode *inode) struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_fc_dentry_update *fc_dentry; + wait_queue_head_t *wq; if (ext4_fc_disabled(inode->i_sb)) return; -restart: - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); return; } - if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { - ext4_fc_wait_committing_inode(inode); - goto restart; + /* + * Since ext4_fc_del is called from ext4_evict_inode while having a + * handle open, there is no need for us to wait here even if a fast + * commit is going on. 
That is because, if this inode is being + * committed, ext4_mark_inode_dirty would have waited for inode commit + * operation to finish before we come here. So, by the time we come + * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So, + * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode + * here. + * + * We may come here without any handles open in the "no_delete" case of + * ext4_evict_inode as well. However, if that happens, we first mark the + * file system as fast commit ineligible anyway. So, even in that case, + * it is okay to remove the inode from the fc list. + */ + WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING) + && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)); + while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { +#if (BITS_PER_LONG < 64) + DEFINE_WAIT_BIT(wait, &ei->i_state_flags, + EXT4_STATE_FC_FLUSHING_DATA); + wq = bit_waitqueue(&ei->i_state_flags, + EXT4_STATE_FC_FLUSHING_DATA); +#else + DEFINE_WAIT_BIT(wait, &ei->i_flags, + EXT4_STATE_FC_FLUSHING_DATA); + wq = bit_waitqueue(&ei->i_flags, + EXT4_STATE_FC_FLUSHING_DATA); +#endif + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { + mutex_unlock(&sbi->s_fc_lock); + schedule(); + mutex_lock(&sbi->s_fc_lock); + } + finish_wait(wq, &wait.wq_entry); } - - if (!list_empty(&ei->i_fc_list)) - list_del_init(&ei->i_fc_list); + list_del_init(&ei->i_fc_list); /* * Since this inode is getting removed, let's also remove all FC * dentry create references, since it is not needed to log it anyways. */ if (list_empty(&ei->i_fc_dilist)) { - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); return; } @@ -320,12 +298,10 @@ restart: list_del_init(&fc_dentry->fcd_dilist); WARN_ON(!list_empty(&ei->i_fc_dilist)); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); - - return; } /* @@ -353,12 +329,12 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl has_transaction = false; read_unlock(&sbi->s_journal->j_state_lock); } - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid))) sbi->s_fc_ineligible_tid = tid; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } @@ -385,7 +361,7 @@ static int ext4_fc_track_template( int ret; tid = handle->h_transaction->t_tid; - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); if (tid == ei->i_sync_tid) { update = true; } else { @@ -393,19 +369,18 @@ static int ext4_fc_track_template( ei->i_sync_tid = tid; } ret = __fc_track_fn(handle, inode, args, update); - mutex_unlock(&ei->i_fc_lock); - + spin_unlock(&ei->i_fc_lock); if (!enqueue) return ret; - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? 
&sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); return ret; } @@ -428,19 +403,19 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - mutex_unlock(&ei->i_fc_lock); + spin_unlock(&ei->i_fc_lock); if (IS_ENCRYPTED(dir)) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, handle); - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); return -EOPNOTSUPP; } node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle); - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -449,7 +424,8 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, node->fcd_ino = inode->i_ino; take_dentry_name_snapshot(&node->fcd_name, dentry); INIT_LIST_HEAD(&node->fcd_dilist); - spin_lock(&sbi->s_fc_lock); + INIT_LIST_HEAD(&node->fcd_list); + mutex_lock(&sbi->s_fc_lock); if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) list_add_tail(&node->fcd_list, @@ -470,8 +446,8 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, WARN_ON(!list_empty(&ei->i_fc_dilist)); list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); } - spin_unlock(&sbi->s_fc_lock); - mutex_lock(&ei->i_fc_lock); + mutex_unlock(&sbi->s_fc_lock); + spin_lock(&ei->i_fc_lock); return 0; } @@ -571,6 +547,8 @@ static int __track_inode(handle_t *handle, struct inode *inode, void *arg, void ext4_fc_track_inode(handle_t *handle, struct inode *inode) { + struct ext4_inode_info *ei = EXT4_I(inode); + wait_queue_head_t *wq; int ret; if (S_ISDIR(inode->i_mode)) @@ -588,6 +566,35 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode) if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; + /* + * If we come here, we may sleep while waiting for the inode to + * commit. We shouldn't be holding i_data_sem when we go to sleep since + * the commit path needs to grab the lock while committing the inode. + */ + lockdep_assert_not_held(&ei->i_data_sem); + + while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { +#if (BITS_PER_LONG < 64) + DEFINE_WAIT_BIT(wait, &ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); +#else + DEFINE_WAIT_BIT(wait, &ei->i_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_flags, + EXT4_STATE_FC_COMMITTING); +#endif + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) + schedule(); + finish_wait(wq, &wait.wq_entry); + } + + /* + * From this point on, this inode will not be committed either + * by fast or full commit as long as the handle is open. 
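ext4_fc_track_inode() open-codes a sleep loop on a bit waitqueue: register as a waiter, re-check the flag, and only then sleep, so a wakeup that arrives between the check and the sleep cannot be lost. The same discipline in userspace terms, using a condition variable (a sketch, not kernel code; compile with -pthread):

#include <pthread.h>
#include <stdbool.h>
#include <stdio.h>

/* Check the flag only under the lock and re-check after every wakeup, so
 * clearing the flag and broadcasting (the wake_up_bit() analog) can never
 * be missed. */
static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_cond_t cond = PTHREAD_COND_INITIALIZER;
static bool committing = true;

static void *commit_thread(void *arg)
{
	(void)arg;
	pthread_mutex_lock(&lock);
	committing = false;		/* clear EXT4_STATE_FC_COMMITTING */
	pthread_mutex_unlock(&lock);
	pthread_cond_broadcast(&cond);	/* wake_up_bit() analog */
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, commit_thread, NULL);

	pthread_mutex_lock(&lock);
	while (committing)		/* re-check, as the kernel loop does */
		pthread_cond_wait(&cond, &lock);
	pthread_mutex_unlock(&lock);

	puts("inode commit finished; handle may modify it");
	return pthread_join(t, NULL);
}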
+ */ ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); trace_ext4_fc_track_inode(handle, inode, ret); } @@ -727,7 +734,7 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) tl.fc_len = cpu_to_le16(remaining); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining); - *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize); + *crc = ext4_chksum(*crc, sbi->s_fc_bh->b_data, bsize); ext4_fc_submit_bh(sb, false); @@ -774,7 +781,7 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid)); dst += sizeof(tail.fc_tid); - crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data, + crc = ext4_chksum(crc, sbi->s_fc_bh->b_data, dst - (u8 *)sbi->s_fc_bh->b_data); tail.fc_crc = cpu_to_le32(crc); memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc)); @@ -893,15 +900,15 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) struct ext4_extent *ex; int ret; - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); if (ei->i_fc_lblk_len == 0) { - mutex_unlock(&ei->i_fc_lock); + spin_unlock(&ei->i_fc_lock); return 0; } old_blk_size = ei->i_fc_lblk_start; new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; ei->i_fc_lblk_len = 0; - mutex_unlock(&ei->i_fc_lock); + spin_unlock(&ei->i_fc_lock); cur_lblk_off = old_blk_size; ext4_debug("will try writing %d to %d for inode %ld\n", @@ -910,7 +917,9 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) while (cur_lblk_off <= new_blk_size) { map.m_lblk = cur_lblk_off; map.m_len = new_blk_size - cur_lblk_off + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); + ret = ext4_map_blocks(NULL, inode, &map, + EXT4_GET_BLOCKS_IO_SUBMIT | + EXT4_EX_NOCACHE); if (ret < 0) return -ECANCELED; @@ -954,69 +963,31 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) } -/* Submit data for all the fast commit inodes */ -static int ext4_fc_submit_inode_data_all(journal_t *journal) +/* Flushes data of all the inodes in the commit queue. 
*/ +static int ext4_fc_flush_data(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; int ret = 0; - spin_lock(&sbi->s_fc_lock); list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { - ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); - while (atomic_read(&ei->i_fc_updates)) { - DEFINE_WAIT(wait); - - prepare_to_wait(&ei->i_fc_wait, &wait, - TASK_UNINTERRUPTIBLE); - if (atomic_read(&ei->i_fc_updates)) { - spin_unlock(&sbi->s_fc_lock); - schedule(); - spin_lock(&sbi->s_fc_lock); - } - finish_wait(&ei->i_fc_wait, &wait); - } - spin_unlock(&sbi->s_fc_lock); ret = jbd2_submit_inode_data(journal, ei->jinode); if (ret) return ret; - spin_lock(&sbi->s_fc_lock); } - spin_unlock(&sbi->s_fc_lock); - - return ret; -} - -/* Wait for completion of data for all the fast commit inodes */ -static int ext4_fc_wait_inode_data_all(journal_t *journal) -{ - struct super_block *sb = journal->j_private; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_inode_info *pos, *n; - int ret = 0; - - spin_lock(&sbi->s_fc_lock); - list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { - if (!ext4_test_inode_state(&pos->vfs_inode, - EXT4_STATE_FC_COMMITTING)) - continue; - spin_unlock(&sbi->s_fc_lock); - ret = jbd2_wait_inode_data(journal, pos->jinode); + list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ret = jbd2_wait_inode_data(journal, ei->jinode); if (ret) return ret; - spin_lock(&sbi->s_fc_lock); } - spin_unlock(&sbi->s_fc_lock); return 0; } /* Commit all the directory entry updates */ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) -__acquires(&sbi->s_fc_lock) -__releases(&sbi->s_fc_lock) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1030,26 +1001,22 @@ __releases(&sbi->s_fc_lock) list_for_each_entry_safe(fc_dentry, fc_dentry_n, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { - spin_unlock(&sbi->s_fc_lock); - if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { - ret = -ENOSPC; - goto lock_and_exit; - } - spin_lock(&sbi->s_fc_lock); + if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) + return -ENOSPC; continue; } /* * With fcd_dilist we need not loop in sbi->s_fc_q to get the - * corresponding inode pointer + * corresponding inode. Also, the corresponding inode could have been + * deleted, in which case, we don't need to do anything. */ - WARN_ON(list_empty(&fc_dentry->fcd_dilist)); + if (list_empty(&fc_dentry->fcd_dilist)) + continue; ei = list_first_entry(&fc_dentry->fcd_dilist, struct ext4_inode_info, i_fc_dilist); inode = &ei->vfs_inode; WARN_ON(inode->i_ino != fc_dentry->fcd_ino); - spin_unlock(&sbi->s_fc_lock); - /* * We first write the inode and then the create dirent. 
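ext4_fc_flush_data() makes two passes over the queue: submit every inode's data first, then wait for each, so the writeback overlaps instead of serializing. The same submit-all-then-reap-all shape with POSIX AIO (file name and sizes invented for the demo; link with -lrt on older glibc):

#include <aio.h>
#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#define NBUF 4

int main(void)
{
	static char bufs[NBUF][4096];
	struct aiocb cbs[NBUF];
	const struct aiocb *list[NBUF];
	int fd = open("/tmp/aio-demo", O_WRONLY | O_CREAT | O_TRUNC, 0600);

	if (fd < 0) { perror("open"); return 1; }

	for (int i = 0; i < NBUF; i++) {		/* pass 1: submit */
		memset(&cbs[i], 0, sizeof(cbs[i]));
		memset(bufs[i], 'a' + i, sizeof(bufs[i]));
		cbs[i].aio_fildes = fd;
		cbs[i].aio_buf = bufs[i];
		cbs[i].aio_nbytes = sizeof(bufs[i]);
		cbs[i].aio_offset = (off_t)i * sizeof(bufs[i]);
		if (aio_write(&cbs[i])) { perror("aio_write"); return 1; }
		list[i] = &cbs[i];
	}

	for (int i = 0; i < NBUF; i++) {		/* pass 2: wait */
		while (aio_error(&cbs[i]) == EINPROGRESS)
			aio_suspend(list, NBUF, NULL);
		if (aio_return(&cbs[i]) < 0)
			perror("aio_return");
	}
	close(fd);
	return 0;
}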
This * allows the recovery code to create an unnamed inode first @@ -1059,23 +1026,14 @@ __releases(&sbi->s_fc_lock) */ ret = ext4_fc_write_inode(inode, crc); if (ret) - goto lock_and_exit; - + return ret; ret = ext4_fc_write_inode_data(inode, crc); if (ret) - goto lock_and_exit; - - if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { - ret = -ENOSPC; - goto lock_and_exit; - } - - spin_lock(&sbi->s_fc_lock); + return ret; + if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) + return -ENOSPC; } return 0; -lock_and_exit: - spin_lock(&sbi->s_fc_lock); - return ret; } static int ext4_fc_perform_commit(journal_t *journal) @@ -1089,26 +1047,81 @@ static int ext4_fc_perform_commit(journal_t *journal) int ret = 0; u32 crc = 0; - ret = ext4_fc_submit_inode_data_all(journal); - if (ret) - return ret; + /* + * Step 1: Mark all inodes on s_fc_q[MAIN] with + * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being + * freed until the data flush is over. + */ + mutex_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_set_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_FLUSHING_DATA); + } + mutex_unlock(&sbi->s_fc_lock); + + /* Step 2: Flush data for all the eligible inodes. */ + ret = ext4_fc_flush_data(journal); - ret = ext4_fc_wait_inode_data_all(journal); + /* + * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning + * any error from step 2. This ensures that waiters waiting on + * EXT4_STATE_FC_FLUSHING_DATA can resume. + */ + mutex_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_clear_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_FLUSHING_DATA); +#if (BITS_PER_LONG < 64) + wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA); +#else + wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA); +#endif + } + + /* + * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before + * the waiter checks the bit. Pairs with implicit barrier in + * prepare_to_wait() in ext4_fc_del(). + */ + smp_mb(); + mutex_unlock(&sbi->s_fc_lock); + + /* + * If we encountered error in Step 2, return it now after clearing + * EXT4_STATE_FC_FLUSHING_DATA bit. + */ if (ret) return ret; + + /* Step 4: Mark all inodes as being committed. */ + jbd2_journal_lock_updates(journal); /* - * If file system device is different from journal device, issue a cache - * flush before we start writing fast commit blocks. + * The journal is now locked. No more handles can start and all the + * previous handles are now drained. We now mark the inodes on the + * commit queue as being committed. + */ + mutex_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_set_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_COMMITTING); + } + mutex_unlock(&sbi->s_fc_lock); + jbd2_journal_unlock_updates(journal); + + /* + * Step 5: If file system device is different from journal device, + * issue a cache flush before we start writing fast commit blocks. */ if (journal->j_fs_dev != journal->j_dev) blkdev_issue_flush(journal->j_fs_dev); blk_start_plug(&plug); + /* Step 6: Write fast commit blocks to disk. */ if (sbi->s_fc_bytes == 0) { /* - * Add a head tag only if this is the first fast commit - * in this TID. + * Step 6.1: Add a head tag only if this is the first fast + * commit in this TID. 
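The fast commit area is a sequence of tag-length-value records ending in a tail whose checksum covers every byte before it, which is what makes the commit atomic: a torn write fails the tail CRC and replay stops at the previous valid tail. A toy model of that layout (plain CRC-32 stands in for ext4's per-filesystem seeded crc32c; tag numbers are invented):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct fc_tl { uint16_t tag; uint16_t len; };

static uint32_t crc32_sw(uint32_t crc, const void *buf, size_t n)
{
	const uint8_t *p = buf;

	crc = ~crc;
	while (n--) {
		crc ^= *p++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0xedb88320 & -(crc & 1));
	}
	return ~crc;
}

static size_t add_tlv(uint8_t *dst, size_t off, uint16_t tag,
		      const void *val, uint16_t len)
{
	struct fc_tl tl = { tag, len };

	memcpy(dst + off, &tl, sizeof(tl));
	memcpy(dst + off + sizeof(tl), val, len);
	return off + sizeof(tl) + len;
}

int main(void)
{
	uint8_t area[256];
	uint32_t ino = 12, crc;
	size_t off = 0;

	off = add_tlv(area, off, 1 /* "inode" */, &ino, sizeof(ino));
	crc = crc32_sw(0, area, off);		/* tail protects all prior bytes */
	off = add_tlv(area, off, 2 /* "tail" */, &crc, sizeof(crc));
	printf("%zu bytes, tail crc 0x%08x\n", off, crc);
	return 0;
}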
*/ head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); head.fc_tid = cpu_to_le32( @@ -1120,32 +1133,30 @@ static int ext4_fc_perform_commit(journal_t *journal) } } - spin_lock(&sbi->s_fc_lock); + /* Step 6.2: Now write all the dentry updates. */ + mutex_lock(&sbi->s_fc_lock); ret = ext4_fc_commit_dentry_updates(journal, &crc); - if (ret) { - spin_unlock(&sbi->s_fc_lock); + if (ret) goto out; - } + /* Step 6.3: Now write all the changed inodes to disk. */ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { inode = &iter->vfs_inode; if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) continue; - spin_unlock(&sbi->s_fc_lock); ret = ext4_fc_write_inode_data(inode, &crc); if (ret) goto out; ret = ext4_fc_write_inode(inode, &crc); if (ret) goto out; - spin_lock(&sbi->s_fc_lock); } - spin_unlock(&sbi->s_fc_lock); - + /* Step 6.4: Finally write tail tag to conclude this fast commit. */ ret = ext4_fc_write_tail(sb, crc); out: + mutex_unlock(&sbi->s_fc_lock); blk_finish_plug(&plug); return ret; } @@ -1191,6 +1202,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) int subtid = atomic_read(&sbi->s_fc_subtid); int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; ktime_t start_time, commit_time; + int old_ioprio, journal_ioprio; if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return jbd2_complete_transaction(journal, commit_tid); @@ -1198,6 +1210,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) trace_ext4_fc_commit_start(sb, commit_tid); start_time = ktime_get(); + old_ioprio = get_current_ioprio(); restart_fc: ret = jbd2_fc_begin_commit(journal, commit_tid); @@ -1228,6 +1241,15 @@ restart_fc: goto fallback; } + /* + * Now that we know that this thread is going to do a fast commit, + * elevate the priority to match that of the journal thread. 
+ */ + if (journal->j_task->io_context) + journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; + else + journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; + set_task_ioprio(current, journal_ioprio); fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; ret = ext4_fc_perform_commit(journal); if (ret < 0) { @@ -1242,6 +1264,7 @@ restart_fc: } atomic_inc(&sbi->s_fc_subtid); ret = jbd2_fc_end_commit(journal); + set_task_ioprio(current, old_ioprio); /* * weight the commit time higher than the average time so we * don't react too strongly to vast changes in the commit time @@ -1251,6 +1274,7 @@ restart_fc: return ret; fallback: + set_task_ioprio(current, old_ioprio); ret = jbd2_fc_end_commit_fallback(journal); ext4_fc_update_stats(sb, status, 0, 0, commit_tid); return ret; @@ -1264,7 +1288,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_inode_info *iter, *iter_n; + struct ext4_inode_info *ei; struct ext4_fc_dentry_update *fc_dentry; if (full && sbi->s_fc_bh) @@ -1273,14 +1297,16 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) trace_ext4_fc_cleanup(journal, full, tid); jbd2_fc_release_bufs(journal); - spin_lock(&sbi->s_fc_lock); - list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN], - i_fc_list) { - list_del_init(&iter->i_fc_list); - ext4_clear_inode_state(&iter->vfs_inode, + mutex_lock(&sbi->s_fc_lock); + while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) { + ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN], + struct ext4_inode_info, + i_fc_list); + list_del_init(&ei->i_fc_list); + ext4_clear_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); - if (tid_geq(tid, iter->i_sync_tid)) { - ext4_fc_reset_inode(&iter->vfs_inode); + if (tid_geq(tid, ei->i_sync_tid)) { + ext4_fc_reset_inode(&ei->vfs_inode); } else if (full) { /* * We are called after a full commit, inode has been @@ -1291,15 +1317,19 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) * time in that case (and tid doesn't increase so * tid check above isn't reliable). */ - list_add_tail(&EXT4_I(&iter->vfs_inode)->i_fc_list, + list_add_tail(&ei->i_fc_list, &sbi->s_fc_q[FC_Q_STAGING]); } - /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ + /* + * Make sure clearing of EXT4_STATE_FC_COMMITTING is + * visible before we send the wakeup. Pairs with implicit + * barrier in prepare_to_wait() in ext4_fc_track_inode(). 
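The ioprio adjustment above makes the committing task run at the jbd2 thread's own priority (EXT4_DEF_JOURNAL_IOPRIO is, to the author's understanding, a best-effort class value), so a fast commit issued from an fsync() caller is not starved by that caller's lower I/O priority. The userspace equivalent is the ioprio_set syscall; a sketch with the class constants open-coded from linux/ioprio.h:

#define _GNU_SOURCE
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

#define IOPRIO_CLASS_BE		2
#define IOPRIO_CLASS_SHIFT	13
#define IOPRIO_PRIO_VALUE(class, data)	(((class) << IOPRIO_CLASS_SHIFT) | (data))
#define IOPRIO_WHO_PROCESS	1

int main(void)
{
	int old = syscall(SYS_ioprio_get, IOPRIO_WHO_PROCESS, 0);

	/* bump to best-effort level 0 for the priority-sensitive I/O, then restore */
	if (syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0,
		    IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 0)))
		perror("ioprio_set");
	/* ... do the I/O here ... */
	syscall(SYS_ioprio_set, IOPRIO_WHO_PROCESS, 0, old);
	printf("restored ioprio 0x%x\n", old);
	return 0;
}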
+ */ smp_mb(); #if (BITS_PER_LONG < 64) - wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); + wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING); #else - wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); + wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING); #endif } @@ -1309,11 +1339,9 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) fcd_list); list_del_init(&fc_dentry->fcd_list); list_del_init(&fc_dentry->fcd_dilist); - spin_unlock(&sbi->s_fc_lock); release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); - spin_lock(&sbi->s_fc_lock); } list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], @@ -1328,7 +1356,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) if (full) sbi->s_fc_bytes = 0; - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); trace_ext4_fc_stats(sb); } @@ -2105,13 +2133,13 @@ static int ext4_fc_replay_scan(journal_t *journal, case EXT4_FC_TAG_INODE: case EXT4_FC_TAG_PAD: state->fc_cur_tag++; - state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, + state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; case EXT4_FC_TAG_TAIL: state->fc_cur_tag++; memcpy(&tail, val, sizeof(tail)); - state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, + state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + offsetof(struct ext4_fc_tail, fc_crc)); @@ -2138,7 +2166,7 @@ static int ext4_fc_replay_scan(journal_t *journal, break; } state->fc_cur_tag++; - state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, + state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; default: diff --git a/fs/ext4/file.c b/fs/ext4/file.c index a5205149adba..21df81347147 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -377,7 +377,12 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, loff_t pos = iocb->ki_pos; struct inode *inode = file_inode(iocb->ki_filp); - if (!error && size && flags & IOMAP_DIO_UNWRITTEN) + + if (!error && size && (flags & IOMAP_DIO_UNWRITTEN) && + (iocb->ki_flags & IOCB_ATOMIC)) + error = ext4_convert_unwritten_extents_atomic(NULL, inode, pos, + size); + else if (!error && size && flags & IOMAP_DIO_UNWRITTEN) error = ext4_convert_unwritten_extents(NULL, inode, pos, size); if (error) return error; @@ -688,10 +693,12 @@ out: static ssize_t ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) { + int ret; struct inode *inode = file_inode(iocb->ki_filp); - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; #ifdef CONFIG_FS_DAX if (IS_DAX(inode)) @@ -700,7 +707,6 @@ ext4_file_write_iter(struct kiocb *iocb, struct iov_iter *from) if (iocb->ki_flags & IOCB_ATOMIC) { size_t len = iov_iter_count(from); - int ret; if (len < EXT4_SB(inode->i_sb)->s_awu_min || len > EXT4_SB(inode->i_sb)->s_awu_max) @@ -756,9 +762,6 @@ retry: return VM_FAULT_SIGBUS; } } else { - result = filemap_fsnotify_fault(vmf); - if (unlikely(result)) - return result; filemap_invalidate_lock_shared(mapping); } result = dax_iomap_fault(vmf, order, &pfn, &error, &ext4_iomap_ops); @@ -803,11 +806,16 @@ static const struct vm_operations_struct ext4_file_vm_ops = { static int ext4_file_mmap(struct file *file, struct vm_area_struct *vma) { + int ret; struct inode *inode = file->f_mapping->host; struct dax_device *dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; 
+ if (file->f_mode & FMODE_WRITE) + ret = ext4_emergency_state(inode->i_sb); + else + ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0; + if (unlikely(ret)) + return ret; /* * We don't support synchronous mappings for non-DAX files and @@ -838,7 +846,8 @@ static int ext4_sample_last_mounted(struct super_block *sb, if (likely(ext4_test_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED))) return 0; - if (sb_rdonly(sb) || !sb_start_intwrite_trylock(sb)) + if (ext4_emergency_state(sb) || sb_rdonly(sb) || + !sb_start_intwrite_trylock(sb)) return 0; ext4_set_mount_flag(sb, EXT4_MF_MNTDIR_SAMPLED); @@ -881,8 +890,12 @@ static int ext4_file_open(struct inode *inode, struct file *filp) { int ret; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + if (filp->f_mode & FMODE_WRITE) + ret = ext4_emergency_state(inode->i_sb); + else + ret = ext4_forced_shutdown(inode->i_sb) ? -EIO : 0; + if (unlikely(ret)) + return ret; ret = ext4_sample_last_mounted(inode->i_sb, filp->f_path.mnt); if (ret) @@ -921,12 +934,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp) loff_t ext4_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; - loff_t maxbytes; - - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; - else - maxbytes = inode->i_sb->s_maxbytes; + loff_t maxbytes = ext4_get_maxbytes(inode); switch (whence) { default: diff --git a/fs/ext4/fsync.c b/fs/ext4/fsync.c index b40d3b29f7e5..e476c6de3074 100644 --- a/fs/ext4/fsync.c +++ b/fs/ext4/fsync.c @@ -132,20 +132,16 @@ int ext4_sync_file(struct file *file, loff_t start, loff_t end, int datasync) bool needs_barrier = false; struct inode *inode = file->f_mapping->host; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; ASSERT(ext4_journal_current_handle() == NULL); trace_ext4_sync_file_enter(file, datasync); - if (sb_rdonly(inode->i_sb)) { - /* Make sure that we read updated s_ext4_flags value */ - smp_rmb(); - if (ext4_forced_shutdown(inode->i_sb)) - ret = -EROFS; + if (sb_rdonly(inode->i_sb)) goto out; - } if (!EXT4_SB(inode->i_sb)->s_journal) { ret = ext4_fsync_nojournal(file, start, end, datasync, diff --git a/fs/ext4/hash.c b/fs/ext4/hash.c index deabe29da7fb..33cd5b6b02d5 100644 --- a/fs/ext4/hash.c +++ b/fs/ext4/hash.c @@ -302,7 +302,7 @@ int ext4fs_dirhash(const struct inode *dir, const char *name, int len, if (len && IS_CASEFOLDED(dir) && (!IS_ENCRYPTED(dir) || fscrypt_has_encryption_key(dir))) { - buff = kzalloc(sizeof(char) * PATH_MAX, GFP_KERNEL); + buff = kzalloc(PATH_MAX, GFP_KERNEL); if (!buff) return -ENOMEM; diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index 21d228073d79..79aa3df8d019 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -691,7 +691,8 @@ static int recently_deleted(struct super_block *sb, ext4_group_t group, int ino) if (!bh || !buffer_uptodate(bh)) /* * If the block is not in the buffer cache, then it - * must have been written out. + * must have been written out, or, most unlikely, is + * being migrated - false failure should be OK here. 
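ext4_emergency_state() gates writers harder than readers: an open or mmap for read is still allowed on a merely shut-down filesystem, while writers get the error back immediately. The shutdown state itself is entered via EXT4_IOC_SHUTDOWN; a sketch of triggering it (the ioctl number is open-coded from fs/ext4/ext4.h and shares XFS_IOC_GOINGDOWN's value, to the author's understanding; only run this against a scratch filesystem, since it really does shut the fs down until remount):

#include <fcntl.h>
#include <linux/types.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>

#define EXT4_IOC_SHUTDOWN	_IOR('X', 125, __u32)
#define EXT4_GOING_FLAGS_DEFAULT	0x0	/* freeze and flush */

int main(int argc, char **argv)
{
	__u32 flags = EXT4_GOING_FLAGS_DEFAULT;
	int fd;

	if (argc < 2) { fprintf(stderr, "usage: %s <mountpoint>\n", argv[0]); return 1; }
	fd = open(argv[1], O_RDONLY);
	if (fd < 0) { perror("open"); return 1; }
	if (ioctl(fd, EXT4_IOC_SHUTDOWN, &flags))
		perror("EXT4_IOC_SHUTDOWN");
	close(fd);
	return 0;
}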
*/ goto out; @@ -951,8 +952,9 @@ struct inode *__ext4_new_inode(struct mnt_idmap *idmap, sb = dir->i_sb; sbi = EXT4_SB(sb); - if (unlikely(ext4_forced_shutdown(sb))) - return ERR_PTR(-EIO); + ret2 = ext4_emergency_state(sb); + if (unlikely(ret2)) + return ERR_PTR(ret2); ngroups = ext4_get_groups_count(sb); trace_ext4_request_inode(dir, mode); @@ -1282,14 +1284,13 @@ got: inode->i_generation = get_random_u32(); /* Precompute checksum seed for inode metadata */ - if (ext4_has_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb)) { __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); - ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, - sizeof(gen)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ @@ -1298,7 +1299,7 @@ got: ei->i_extra_isize = sbi->s_want_extra_isize; ei->i_inline_off = 0; if (ext4_has_feature_inline_data(sb) && - (!(ei->i_flags & EXT4_DAX_FL) || S_ISDIR(mode))) + (!(ei->i_flags & (EXT4_DAX_FL|EXT4_EA_INODE_FL)) || S_ISDIR(mode))) ext4_set_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA); ret = inode; err = dquot_alloc_inode(inode); @@ -1334,6 +1335,9 @@ got: } } + if (ext4_should_enable_large_folio(inode)) + mapping_set_large_folios(inode->i_mapping); + ext4_update_inode_fsync_trans(handle, inode, 1); err = ext4_mark_inode_dirty(handle, inode); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 3536ca7e4fcc..a1bbcdf40824 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -20,6 +20,11 @@ #define EXT4_INLINE_DOTDOT_OFFSET 2 #define EXT4_INLINE_DOTDOT_SIZE 4 + +static int ext4_da_convert_inline_data_to_extent(struct address_space *mapping, + struct inode *inode, + void **fsdata); + static int ext4_get_inline_size(struct inode *inode) { if (EXT4_I(inode)->i_inline_off) @@ -228,7 +233,7 @@ static void ext4_write_inline_data(struct inode *inode, struct ext4_iloc *iloc, struct ext4_inode *raw_inode; int cp_len = 0; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) + if (unlikely(ext4_emergency_state(inode->i_sb))) return; BUG_ON(!EXT4_I(inode)->i_inline_off); @@ -392,7 +397,7 @@ out: } static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, - unsigned int len) + loff_t len) { int ret, size, no_expand; struct ext4_inode_info *ei = EXT4_I(inode); @@ -596,6 +601,7 @@ retry: goto out; } + ext4_fc_track_inode(handle, inode); ret = ext4_destroy_inline_data_nolock(handle, inode); if (ret) goto out; @@ -637,7 +643,7 @@ retry: goto retry; if (folio) - block_commit_write(&folio->page, from, to); + block_commit_write(folio, from, to); out: if (folio) { folio_unlock(folio); @@ -653,91 +659,109 @@ out_nofolio: } /* - * Try to write data in the inode. - * If the inode has inline data, check whether the new write can be - * in the inode also. If not, create the page the handle, move the data - * to the page make it update and let the later codes create extent for it. + * Prepare the write for the inline data. + * If the data can be written into the inode, we just read + * the page and make it uptodate, and start the journal. + * Otherwise read the page, make it dirty so that it can be + * handled in writepages (the i_disksize update is left to the + * normal ext4_da_write_end).
*/ -int ext4_try_to_write_inline_data(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - struct folio **foliop) +int ext4_generic_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + struct folio **foliop, + void **fsdata, bool da) { int ret; handle_t *handle; struct folio *folio; struct ext4_iloc iloc; - - if (pos + len > ext4_get_max_inline_size(inode)) - goto convert; + int retries = 0; ret = ext4_get_inode_loc(inode, &iloc); if (ret) return ret; - /* - * The possible write could happen in the inode, - * so try to reserve the space in inode first. - */ +retry_journal: handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); if (IS_ERR(handle)) { ret = PTR_ERR(handle); - handle = NULL; - goto out; + goto out_release_bh; } ret = ext4_prepare_inline_data(handle, inode, pos + len); if (ret && ret != -ENOSPC) - goto out; + goto out_stop_journal; - /* We don't have space in inline inode, so convert it to extent. */ if (ret == -ENOSPC) { ext4_journal_stop(handle); - brelse(iloc.bh); - goto convert; - } + if (!da) { + brelse(iloc.bh); + /* Retry inside */ + return ext4_convert_inline_data_to_extent(mapping, inode); + } - ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, - EXT4_JTR_NONE); - if (ret) - goto out; + ret = ext4_da_convert_inline_data_to_extent(mapping, inode, fsdata); + if (ret == -ENOSPC && + ext4_should_retry_alloc(inode->i_sb, &retries)) + goto retry_journal; + goto out_release_bh; + } folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) { ret = PTR_ERR(folio); - goto out; + goto out_stop_journal; } - *foliop = folio; down_read(&EXT4_I(inode)->xattr_sem); + /* Someone else had converted it to extent */ if (!ext4_has_inline_data(inode)) { ret = 0; - folio_unlock(folio); - folio_put(folio); - goto out_up_read; + goto out_release_folio; } if (!folio_test_uptodate(folio)) { ret = ext4_read_inline_folio(inode, folio); - if (ret < 0) { - folio_unlock(folio); - folio_put(folio); - goto out_up_read; - } + if (ret < 0) + goto out_release_folio; } - ret = 1; - handle = NULL; -out_up_read: + ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, EXT4_JTR_NONE); + if (ret) + goto out_release_folio; + *foliop = folio; up_read(&EXT4_I(inode)->xattr_sem); -out: - if (handle && (ret != 1)) - ext4_journal_stop(handle); + brelse(iloc.bh); + return 1; + +out_release_folio: + up_read(&EXT4_I(inode)->xattr_sem); + folio_unlock(folio); + folio_put(folio); +out_stop_journal: + ext4_journal_stop(handle); +out_release_bh: brelse(iloc.bh); return ret; -convert: - return ext4_convert_inline_data_to_extent(mapping, inode); +} + +/* + * Try to write data in the inode. + * If the inode has inline data, check whether the new write can be + * in the inode also. If not, create the page and the handle, move the data + * to the page, make it uptodate, and let the later code create an extent for it. + */ +int ext4_try_to_write_inline_data(struct address_space *mapping, + struct inode *inode, + loff_t pos, unsigned len, + struct folio **foliop) +{ + if (pos + len > ext4_get_max_inline_size(inode)) + return ext4_convert_inline_data_to_extent(mapping, inode); + return ext4_generic_write_inline_data(mapping, inode, pos, len, + foliop, NULL, false); } int ext4_write_inline_data_end(struct inode *inode, loff_t pos, unsigned len, @@ -881,94 +905,6 @@ out: return ret; } -/* - * Prepare the write for the inline data.
- * If the data can be written into the inode, we just read - * the page and make it uptodate, and start the journal. - * Otherwise read the page, makes it dirty so that it can be - * handle in writepages(the i_disksize update is left to the - * normal ext4_da_write_end). - */ -int ext4_da_write_inline_data_begin(struct address_space *mapping, - struct inode *inode, - loff_t pos, unsigned len, - struct folio **foliop, - void **fsdata) -{ - int ret; - handle_t *handle; - struct folio *folio; - struct ext4_iloc iloc; - int retries = 0; - - ret = ext4_get_inode_loc(inode, &iloc); - if (ret) - return ret; - -retry_journal: - handle = ext4_journal_start(inode, EXT4_HT_INODE, 1); - if (IS_ERR(handle)) { - ret = PTR_ERR(handle); - goto out; - } - - ret = ext4_prepare_inline_data(handle, inode, pos + len); - if (ret && ret != -ENOSPC) - goto out_journal; - - if (ret == -ENOSPC) { - ext4_journal_stop(handle); - ret = ext4_da_convert_inline_data_to_extent(mapping, - inode, - fsdata); - if (ret == -ENOSPC && - ext4_should_retry_alloc(inode->i_sb, &retries)) - goto retry_journal; - goto out; - } - - /* - * We cannot recurse into the filesystem as the transaction - * is already started. - */ - folio = __filemap_get_folio(mapping, 0, FGP_WRITEBEGIN | FGP_NOFS, - mapping_gfp_mask(mapping)); - if (IS_ERR(folio)) { - ret = PTR_ERR(folio); - goto out_journal; - } - - down_read(&EXT4_I(inode)->xattr_sem); - if (!ext4_has_inline_data(inode)) { - ret = 0; - goto out_release_page; - } - - if (!folio_test_uptodate(folio)) { - ret = ext4_read_inline_folio(inode, folio); - if (ret < 0) - goto out_release_page; - } - ret = ext4_journal_get_write_access(handle, inode->i_sb, iloc.bh, - EXT4_JTR_NONE); - if (ret) - goto out_release_page; - - up_read(&EXT4_I(inode)->xattr_sem); - *foliop = folio; - brelse(iloc.bh); - return 1; -out_release_page: - up_read(&EXT4_I(inode)->xattr_sem); - folio_unlock(folio); - folio_put(folio); -out_journal: - ext4_journal_stop(handle); -out: - brelse(iloc.bh); - return ret; -} - #ifdef INLINE_DIR_DEBUG void ext4_show_inline_dir(struct inode *dir, struct buffer_head *bh, void *inline_start, int inline_size) @@ -1012,7 +948,7 @@ static int ext4_add_dirent_to_inline(handle_t *handle, int err; struct ext4_dir_entry_2 *de; - err = ext4_find_dest_de(dir, inode, iloc->bh, inline_start, + err = ext4_find_dest_de(dir, iloc->bh, inline_start, inline_size, fname, &de); if (err) return err; @@ -1146,7 +1082,7 @@ static int ext4_finish_convert_inline_dir(handle_t *handle, memcpy((void *)de, buf + EXT4_INLINE_DOTDOT_SIZE, inline_size - EXT4_INLINE_DOTDOT_SIZE); - if (ext4_has_metadata_csum(inode->i_sb)) + if (ext4_has_feature_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); inode->i_size = inode->i_sb->s_blocksize; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index 7c54ae5fcbd4..be9a4cba35fd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -31,6 +31,7 @@ #include <linux/writeback.h> #include <linux/pagevec.h> #include <linux/mpage.h> +#include <linux/rmap.h> #include <linux/namei.h> #include <linux/uio.h> #include <linux/bio.h> @@ -57,29 +58,27 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __u16 dummy_csum = 0; int offset = offsetof(struct ext4_inode, i_checksum_lo); unsigned int csum_size = sizeof(dummy_csum); - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset); - csum = 
ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)raw, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + csum = ext4_chksum(csum, (__u8 *)raw + offset, EXT4_GOOD_OLD_INODE_SIZE - offset); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { offset = offsetof(struct ext4_inode, i_checksum_hi); - csum = ext4_chksum(sbi, csum, (__u8 *)raw + - EXT4_GOOD_OLD_INODE_SIZE, + csum = ext4_chksum(csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE, offset - EXT4_GOOD_OLD_INODE_SIZE); if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; } - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + csum = ext4_chksum(csum, (__u8 *)raw + offset, EXT4_INODE_SIZE(inode->i_sb) - offset); } @@ -93,7 +92,7 @@ static int ext4_inode_csum_verify(struct inode *inode, struct ext4_inode *raw, if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || - !ext4_has_metadata_csum(inode->i_sb)) + !ext4_has_feature_metadata_csum(inode->i_sb)) return 1; provided = le16_to_cpu(raw->i_checksum_lo); @@ -114,7 +113,7 @@ void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, if (EXT4_SB(inode->i_sb)->s_es->s_creator_os != cpu_to_le32(EXT4_OS_LINUX) || - !ext4_has_metadata_csum(inode->i_sb)) + !ext4_has_feature_metadata_csum(inode->i_sb)) return; csum = ext4_inode_csum(inode, raw, ei); @@ -141,9 +140,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, new_size); } -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, - int pextents); - /* * Test whether an inode is a fast symlink. * A fast symlink has its symlink data stored in ext4_inode_info->i_data. @@ -181,6 +177,8 @@ void ext4_evict_inode(struct inode *inode) trace_ext4_evict_inode(inode); + dax_break_layout_final(inode); + if (EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL) ext4_evict_ea_inode(inode); if (inode->i_nlink) { @@ -383,10 +381,11 @@ static int __check_block_validity(struct inode *inode, const char *func, unsigned int line, struct ext4_map_blocks *map) { - if (ext4_has_feature_journal(inode->i_sb) && - (inode->i_ino == - le32_to_cpu(EXT4_SB(inode->i_sb)->s_es->s_journal_inum))) + journal_t *journal = EXT4_SB(inode->i_sb)->s_journal; + + if (journal && inode == journal->j_inode) return 0; + if (!ext4_inode_block_valid(inode, map->m_pblk, map->m_len)) { ext4_error_inode(inode, func, line, map->m_pblk, "lblock %lu mapped to illegal pblock %llu " @@ -412,6 +411,32 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, return ret; } +/* + * For generic regular files, when updating the extent tree, Ext4 should + * hold the i_rwsem and invalidate_lock exclusively. This ensures + * exclusion against concurrent page faults, as well as reads and writes. 
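+ * For example, punch hole and truncate assert this rule below via
+ * ext4_check_map_extents_env(), while buffered writeback only holds
+ * folio locks and therefore must pass EXT4_EX_NOCACHE (see the
+ * EXT4_GET_BLOCKS_IO_SUBMIT handling in ext4_map_blocks()).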
+ */ +#ifdef CONFIG_EXT4_DEBUG +void ext4_check_map_extents_env(struct inode *inode) +{ + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return; + + if (!S_ISREG(inode->i_mode) || + IS_NOQUOTA(inode) || IS_VERITY(inode) || + is_special_ino(inode->i_sb, inode->i_ino) || + (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || + ext4_verity_in_progress(inode)) + return; + + WARN_ON_ONCE(!inode_is_locked(inode) && + !rwsem_is_locked(&inode->i_mapping->invalidate_lock)); +} +#else +void ext4_check_map_extents_env(struct inode *inode) {} +#endif + #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) @@ -458,16 +483,73 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, } #endif /* ES_AGGRESSIVE_TEST */ +static int ext4_map_query_blocks_next_in_leaf(handle_t *handle, + struct inode *inode, struct ext4_map_blocks *map, + unsigned int orig_mlen) +{ + struct ext4_map_blocks map2; + unsigned int status, status2; + int retval; + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + + WARN_ON_ONCE(!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF)); + WARN_ON_ONCE(orig_mlen <= map->m_len); + + /* Prepare map2 for lookup in next leaf block */ + map2.m_lblk = map->m_lblk + map->m_len; + map2.m_len = orig_mlen - map->m_len; + map2.m_flags = 0; + retval = ext4_ext_map_blocks(handle, inode, &map2, 0); + + if (retval <= 0) { + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + return map->m_len; + } + + if (unlikely(retval != map2.m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map2.m_len); + WARN_ON(1); + } + + status2 = map2.m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + + /* + * If map2 is contiguous with map, then let's insert it as a single + * extent in es cache and return the combined length of both the maps. + */ + if (map->m_pblk + map->m_len == map2.m_pblk && + status == status2) { + ext4_es_insert_extent(inode, map->m_lblk, + map->m_len + map2.m_len, map->m_pblk, + status, false); + map->m_len += map2.m_len; + } else { + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + } + + return map->m_len; +} + static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map) + struct ext4_map_blocks *map, int flags) { unsigned int status; int retval; + unsigned int orig_mlen = map->m_len; + flags &= EXT4_EX_QUERY_FILTER; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - retval = ext4_ext_map_blocks(handle, inode, map, 0); + retval = ext4_ext_map_blocks(handle, inode, map, flags); else - retval = ext4_ind_map_blocks(handle, inode, map, 0); + retval = ext4_ind_map_blocks(handle, inode, map, flags); if (retval <= 0) return retval; @@ -480,11 +562,22 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, WARN_ON(1); } - status = map->m_flags & EXT4_MAP_UNWRITTEN ? - EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status, false); - return retval; + /* + * No need to query next in leaf: + * - if returned extent is not last in leaf or + * - if the last in leaf is the full requested range + */ + if (!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) || + map->m_len == orig_mlen) { + status = map->m_flags & EXT4_MAP_UNWRITTEN ? 
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + return retval; + } + + return ext4_map_query_blocks_next_in_leaf(handle, inode, map, + orig_mlen); } static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, @@ -598,6 +691,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, struct extent_status es; int retval; int ret = 0; + unsigned int orig_mlen = map->m_len; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; @@ -618,6 +712,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) return -EFSCORRUPTED; + /* + * Callers from the context of data submission are the only exceptions + * for regular files that do not hold the i_rwsem or invalidate_lock. + * However, caching unrelated ranges is not permitted. + */ + if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) + WARN_ON_ONCE(!(flags & EXT4_EX_NOCACHE)); + else + ext4_check_map_extents_env(inode); + /* Lookup extent status tree firstly */ if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) && ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { @@ -649,7 +753,12 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, ext4_map_blocks_es_recheck(handle, inode, map, &orig_map, flags); #endif - goto found; + if (!(flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) || + orig_mlen == map->m_len) + goto found; + + if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) + map->m_len = orig_mlen; } /* * In the query cache no-wait mode, nothing we can do more if we @@ -663,7 +772,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); - retval = ext4_map_query_blocks(handle, inode, map); + retval = ext4_map_query_blocks(handle, inode, map, flags); up_read((&EXT4_I(inode)->i_data_sem)); found: @@ -692,6 +801,8 @@ found: if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) return retval; + + ext4_fc_track_inode(handle, inode); /* * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take @@ -751,7 +862,7 @@ static void ext4_update_bh_state(struct buffer_head *bh, unsigned long flags) flags &= EXT4_MAP_FLAGS; /* Dummy buffer_head? Set non-atomically. 
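 * A dummy bh is not attached to any folio, so it is private to the
 * caller and its state can be updated without atomic operations.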
*/ - if (!bh->b_page) { + if (!bh->b_folio) { bh->b_state = (bh->b_state & ~EXT4_MAP_FLAGS) | flags; return; } @@ -1005,7 +1116,12 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, */ static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) { - folio_mark_dirty(bh->b_folio); + struct folio *folio = bh->b_folio; + struct inode *inode = folio->mapping->host; + + /* only regular files have a_ops */ + if (S_ISREG(inode->i_mode)) + folio_mark_dirty(folio); return ext4_handle_dirty_metadata(handle, NULL, bh); } @@ -1023,7 +1139,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block) { - unsigned from = pos & (PAGE_SIZE - 1); + unsigned int from = offset_in_folio(folio, pos); unsigned to = from + len; struct inode *inode = folio->mapping->host; unsigned block_start, block_end; @@ -1037,8 +1153,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, bool should_journal_data = ext4_should_journal_data(inode); BUG_ON(!folio_test_locked(folio)); - BUG_ON(from > PAGE_SIZE); - BUG_ON(to > PAGE_SIZE); + BUG_ON(to > folio_size(folio)); BUG_ON(from > to); head = folio_buffers(folio); @@ -1148,9 +1263,11 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, struct folio *folio; pgoff_t index; unsigned from, to; + fgf_t fgp = FGP_WRITEBEGIN; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; trace_ext4_write_begin(inode, pos, len); /* @@ -1159,8 +1276,6 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, */ needed_blocks = ext4_writepage_trans_blocks(inode) + 1; index = pos >> PAGE_SHIFT; - from = pos & (PAGE_SIZE - 1); - to = from + len; if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, @@ -1179,10 +1294,18 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, * the folio (if needed) without using GFP_NOFS. */ retry_grab: - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, - mapping_gfp_mask(mapping)); + fgp |= fgf_set_order(len); + folio = __filemap_get_folio(mapping, index, fgp, + mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); + + if (pos + len > folio_pos(folio) + folio_size(folio)) + len = folio_pos(folio) + folio_size(folio) - pos; + + from = offset_in_folio(folio, pos); + to = from + len; + /* * The same as page allocation, we prealloc buffer heads before * starting the handle. @@ -1760,6 +1883,8 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map) ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, (unsigned long) map->m_lblk); + ext4_check_map_extents_env(inode); + /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { map->m_len = min_t(unsigned int, map->m_len, @@ -1800,7 +1925,7 @@ found: if (ext4_has_inline_data(inode)) retval = 0; else - retval = ext4_map_query_blocks(NULL, inode, map); + retval = ext4_map_query_blocks(NULL, inode, map, 0); up_read(&EXT4_I(inode)->i_data_sem); if (retval) return retval < 0 ? retval : 0; @@ -1823,7 +1948,7 @@ add_delayed: goto found; } } else if (!ext4_has_inline_data(inode)) { - retval = ext4_map_query_blocks(NULL, inode, map); + retval = ext4_map_query_blocks(NULL, inode, map, 0); if (retval) { up_write(&EXT4_I(inode)->i_data_sem); return retval < 0 ? 
retval : 0; @@ -1931,7 +2056,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) len = size & (len - 1); err = ext4_bio_write_folio(&mpd->io_submit, folio, len); if (!err) - mpd->wbc->nr_to_write--; + mpd->wbc->nr_to_write -= folio_nr_pages(folio); return err; } @@ -2154,7 +2279,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) start = mpd->map.m_lblk >> bpp_bits; end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; - lblk = start << bpp_bits; pblock = mpd->map.m_pblk; folio_batch_init(&fbatch); @@ -2165,6 +2289,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; + lblk = folio->index << bpp_bits; err = mpage_process_folio(mpd, folio, &lblk, &pblock, &map_bh); /* @@ -2207,11 +2332,15 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) * previously reserved. However we must not fail because we're in * writeback and there is nothing we can do about it so it might result * in data loss. So use reserved blocks to allocate metadata if - * possible. + * possible. In addition, do not cache any unrelated extents, as it + * only holds the folio lock but does not hold the i_rwsem or + * invalidate_lock, which could corrupt the extent status tree. */ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL | - EXT4_GET_BLOCKS_IO_SUBMIT; + EXT4_GET_BLOCKS_IO_SUBMIT | + EXT4_EX_NOCACHE; + dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; @@ -2225,7 +2354,7 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) mpd->io_submit.io_end->handle = handle->h_rsv_handle; handle->h_rsv_handle = NULL; } - ext4_set_io_unwritten_flag(inode, mpd->io_submit.io_end); + ext4_set_io_unwritten_flag(mpd->io_submit.io_end); } BUG_ON(map->m_len == 0); @@ -2273,7 +2402,7 @@ static int mpage_map_and_submit_extent(handle_t *handle, if (err < 0) { struct super_block *sb = inode->i_sb; - if (ext4_forced_shutdown(sb)) + if (ext4_emergency_state(sb)) goto invalidate_dirty_pages; /* * Let the upper layers retry transient errors. @@ -2350,7 +2479,7 @@ update_disksize: */ static int ext4_da_writepages_trans_blocks(struct inode *inode) { - int bpp = ext4_journal_blocks_per_page(inode); + int bpp = ext4_journal_blocks_per_folio(inode); return ext4_meta_trans_blocks(inode, MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); @@ -2386,7 +2515,7 @@ static int mpage_journal_page_buffers(handle_t *handle, size_t len = folio_size(folio); folio_clear_checked(folio); - mpd->wbc->nr_to_write--; + mpd->wbc->nr_to_write -= folio_nr_pages(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode)) @@ -2428,7 +2557,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) ext4_lblk_t lblk; struct buffer_head *head; handle_t *handle = NULL; - int bpp = ext4_journal_blocks_per_page(mpd->inode); + int bpp = ext4_journal_blocks_per_folio(mpd->inode); if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; @@ -2599,10 +2728,9 @@ static int ext4_do_writepages(struct mpage_da_data *mpd) * *never* be called, so if that ever happens, we would want * the stack trace.
*/ - if (unlikely(ext4_forced_shutdown(mapping->host->i_sb))) { - ret = -EROFS; + ret = ext4_emergency_state(mapping->host->i_sb); + if (unlikely(ret)) goto out_writepages; - } /* * If we have inline data and arrive here, it means that @@ -2817,8 +2945,9 @@ static int ext4_writepages(struct address_space *mapping, int ret; int alloc_ctx; - if (unlikely(ext4_forced_shutdown(sb))) - return -EIO; + ret = ext4_emergency_state(sb); + if (unlikely(ret)) + return ret; alloc_ctx = ext4_writepages_down_read(sb); ret = ext4_do_writepages(&mpd); @@ -2858,8 +2987,9 @@ static int ext4_dax_writepages(struct address_space *mapping, struct inode *inode = mapping->host; int alloc_ctx; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; alloc_ctx = ext4_writepages_down_read(inode->i_sb); trace_ext4_writepages(inode, wbc); @@ -2914,9 +3044,11 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, struct folio *folio; pgoff_t index; struct inode *inode = mapping->host; + fgf_t fgp = FGP_WRITEBEGIN; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + ret = ext4_emergency_state(inode->i_sb); + if (unlikely(ret)) + return ret; index = pos >> PAGE_SHIFT; @@ -2929,8 +3061,8 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, trace_ext4_da_write_begin(inode, pos, len); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { - ret = ext4_da_write_inline_data_begin(mapping, inode, pos, len, - foliop, fsdata); + ret = ext4_generic_write_inline_data(mapping, inode, pos, len, + foliop, fsdata, true); if (ret < 0) return ret; if (ret == 1) @@ -2938,11 +3070,15 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, } retry: - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, - mapping_gfp_mask(mapping)); + fgp |= fgf_set_order(len); + folio = __filemap_get_folio(mapping, index, fgp, + mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); + if (pos + len > folio_pos(folio) + folio_size(folio)) + len = folio_pos(folio) + folio_size(folio) - pos; + ret = ext4_block_write_begin(NULL, folio, pos, len, ext4_da_get_block_prep); if (ret < 0) { @@ -3031,7 +3167,7 @@ static int ext4_da_do_write_end(struct address_space *mapping, unsigned long end; i_size_write(inode, new_i_size); - end = (new_i_size - 1) & (PAGE_SIZE - 1); + end = offset_in_folio(folio, new_i_size - 1); if (copied && ext4_da_should_update_i_disksize(folio, end)) { ext4_update_i_disksize(inode, new_i_size); disksize_changed = true; @@ -3290,6 +3426,10 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, if (map->m_flags & EXT4_MAP_NEW) iomap->flags |= IOMAP_F_NEW; + /* HW-offload atomics are always used */ + if (flags & IOMAP_ATOMIC) + iomap->flags |= IOMAP_F_ATOMIC_BIO; + if (flags & IOMAP_DAX) iomap->dax_dev = EXT4_SB(inode->i_sb)->s_daxdev; else @@ -3329,12 +3469,149 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, } } +static int ext4_map_blocks_atomic_write_slow(handle_t *handle, + struct inode *inode, struct ext4_map_blocks *map) +{ + ext4_lblk_t m_lblk = map->m_lblk; + unsigned int m_len = map->m_len; + unsigned int mapped_len = 0, m_flags = 0; + ext4_fsblk_t next_pblk; + bool check_next_pblk = false; + int ret = 0; + + WARN_ON_ONCE(!ext4_has_feature_bigalloc(inode->i_sb)); + + /* + * This is a slow path in case of mixed mapping. 
We use + * EXT4_GET_BLOCKS_CREATE_ZERO flag here to make sure we get a single + * contiguous mapping. This will ensure any unwritten or hole + * regions within the requested range are zeroed out and we return + * a single contiguous mapped extent. + */ + m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; + + do { + ret = ext4_map_blocks(handle, inode, map, m_flags); + if (ret < 0 && ret != -ENOSPC) + goto out_err; + /* + * This should never happen, but let's return an error code to + * avoid an infinite loop in here. + */ + if (ret == 0) { + ret = -EFSCORRUPTED; + ext4_warning_inode(inode, + "ext4_map_blocks() couldn't allocate blocks m_flags: 0x%x, ret:%d", + m_flags, ret); + goto out_err; + } + /* + * With bigalloc we should never get ENOSPC nor discontiguous + * physical extents. + */ + if ((check_next_pblk && next_pblk != map->m_pblk) || + ret == -ENOSPC) { + ext4_warning_inode(inode, + "Non-contiguous allocation detected: expected %llu, got %llu, " + "or ext4_map_blocks() returned out of space ret: %d", + next_pblk, map->m_pblk, ret); + ret = -EFSCORRUPTED; + goto out_err; + } + next_pblk = map->m_pblk + map->m_len; + check_next_pblk = true; + + mapped_len += map->m_len; + map->m_lblk += map->m_len; + map->m_len = m_len - mapped_len; + } while (mapped_len < m_len); + + /* + * We might have done some work in the above loop, so we need to query + * the start of the physical extent, based on the original m_lblk and + * m_len. Let's also ensure we were able to allocate the required range + * for the mixed mapping case. + */ + map->m_lblk = m_lblk; + map->m_len = m_len; + map->m_flags = 0; + + ret = ext4_map_blocks(handle, inode, map, + EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF); + if (ret != m_len) { + ext4_warning_inode(inode, + "allocation failed for atomic write request m_lblk:%u, m_len:%u, ret:%d\n", + m_lblk, m_len, ret); + ret = -EINVAL; + } + return ret; + +out_err: + /* reset map before returning an error */ + map->m_lblk = m_lblk; + map->m_len = m_len; + map->m_flags = 0; + return ret; +} + +/* + * ext4_map_blocks_atomic: Helper routine to ensure the entire requested + * range in @map [lblk, lblk + len) is one single contiguous extent with no + * mixed mappings. + * + * We first use m_flags passed to us by our caller (ext4_iomap_alloc()). + * We only call EXT4_GET_BLOCKS_CREATE_ZERO in the slow path, when the + * underlying physical extent for the requested range does not have a single + * contiguous mapping type i.e. (Hole, Mapped, or Unwritten) throughout. + * In that case we will loop over the requested range to allocate and zero out + * the unwritten extents / holes in between, to get a single mapped extent from + * [m_lblk, m_lblk + m_len). Note that this is only possible because we know + * this can be called only on a bigalloc enabled filesystem where the + * underlying cluster is already allocated. This avoids allocating discontiguous + * extents in the slow path due to multiple calls to ext4_map_blocks(). + * The slow path is mostly a non-performance-critical path, so it should be ok + * to loop using ext4_map_blocks() with appropriate flags to allocate & zero + * the underlying short holes/unwritten extents within the requested range.
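+ *
+ * Returns the number of blocks mapped (m_len on success) or a negative
+ * error code. *force_commit is set when the slow path had to allocate
+ * or zero out blocks, in which case the caller must commit the
+ * transaction before submitting the data I/O.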
+ */ +static int ext4_map_blocks_atomic_write(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int m_flags, + bool *force_commit) +{ + ext4_lblk_t m_lblk = map->m_lblk; + unsigned int m_len = map->m_len; + int ret = 0; + + WARN_ON_ONCE(m_len > 1 && !ext4_has_feature_bigalloc(inode->i_sb)); + + ret = ext4_map_blocks(handle, inode, map, m_flags); + if (ret < 0 || ret == m_len) + goto out; + /* + * This is a mixed mapping case where we were not able to allocate + * a single contiguous extent. In that case let's reset requested + * mapping and call the slow path. + */ + map->m_lblk = m_lblk; + map->m_len = m_len; + map->m_flags = 0; + + /* + * slow path means we have mixed mapping, that means we will need + * to force txn commit. + */ + *force_commit = true; + return ext4_map_blocks_atomic_write_slow(handle, inode, map); +out: + return ret; +} + static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, unsigned int flags) { handle_t *handle; u8 blkbits = inode->i_blkbits; int ret, dio_credits, m_flags = 0, retries = 0; + bool force_commit = false; /* * Trim the mapping request to the maximum value that we can map at @@ -3342,7 +3619,30 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, */ if (map->m_len > DIO_MAX_BLOCKS) map->m_len = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); + + /* + * journal credits estimation for atomic writes. We call + * ext4_map_blocks(), to find if there could be a mixed mapping. If yes, + * then let's assume the no. of pextents required can be m_len i.e. + * every alternate block can be unwritten and hole. + */ + if (flags & IOMAP_ATOMIC) { + unsigned int orig_mlen = map->m_len; + + ret = ext4_map_blocks(NULL, inode, map, 0); + if (ret < 0) + return ret; + if (map->m_len < orig_mlen) { + map->m_len = orig_mlen; + dio_credits = ext4_meta_trans_blocks(inode, orig_mlen, + map->m_len); + } else { + dio_credits = ext4_chunk_trans_blocks(inode, + map->m_len); + } + } else { + dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); + } retry: /* @@ -3373,7 +3673,11 @@ retry: else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; - ret = ext4_map_blocks(handle, inode, map, m_flags); + if (flags & IOMAP_ATOMIC) + ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags, + &force_commit); + else + ret = ext4_map_blocks(handle, inode, map, m_flags); /* * We cannot fill holes in indirect tree based inodes as that could @@ -3387,6 +3691,22 @@ retry: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; + /* + * Force commit the current transaction if the allocation spans a mixed + * mapping range. This ensures any pending metadata updates (like + * unwritten to written extents conversion) in this range are in + * consistent state with the file data blocks, before performing the + * actual write I/O. If the commit fails, the whole I/O must be aborted + * to prevent any possible torn writes. 
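+ * (The commit is issued from ext4_iomap_begin() context, so it is
+ * guaranteed to complete before iomap submits any data bios for this
+ * write.)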
+ */ + if (ret > 0 && force_commit) { + int ret2; + + ret2 = ext4_force_commit(inode->i_sb); + if (ret2) + return ret2; + } + return ret; } @@ -3397,6 +3717,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, int ret; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; + unsigned int orig_mlen; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; @@ -3410,6 +3731,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; + orig_mlen = map.m_len; if (flags & IOMAP_WRITE) { /* @@ -3420,11 +3742,23 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, */ if (offset + length <= i_size_read(inode)) { ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED)) - goto out; + /* + * For atomic writes the entire requested length should + * be mapped. + */ + if (map.m_flags & EXT4_MAP_MAPPED) { + if ((!(flags & IOMAP_ATOMIC) && ret > 0) || + (flags & IOMAP_ATOMIC && ret >= orig_mlen)) + goto out; + } + map.m_len = orig_mlen; } ret = ext4_iomap_alloc(inode, &map, flags); } else { + /* + * This can be called for overwrites path from + * ext4_iomap_overwrite_begin(). + */ ret = ext4_map_blocks(NULL, inode, &map, 0); } @@ -3438,6 +3772,16 @@ out: */ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); + /* + * Before returning to iomap, let's ensure the allocated mapping + * covers the entire requested length for atomic writes. + */ + if (flags & IOMAP_ATOMIC) { + if (map.m_len < (length >> blkbits)) { + WARN_ON_ONCE(1); + return -EINVAL; + } + } ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; @@ -3679,9 +4023,7 @@ void ext4_set_aops(struct inode *inode) static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { - ext4_fsblk_t index = from >> PAGE_SHIFT; - unsigned offset = from & (PAGE_SIZE-1); - unsigned blocksize, pos; + unsigned int offset, blocksize, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; @@ -3696,13 +4038,14 @@ static int __ext4_block_zero_page_range(handle_t *handle, blocksize = inode->i_sb->s_blocksize; - iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); + iblock = folio->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); bh = folio_buffers(folio); if (!bh) bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ + offset = offset_in_folio(folio, from); pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; @@ -3902,6 +4245,68 @@ int ext4_update_disksize_before_punch(struct inode *inode, loff_t offset, return ret; } +static inline void ext4_truncate_folio(struct inode *inode, + loff_t start, loff_t end) +{ + unsigned long blocksize = i_blocksize(inode); + struct folio *folio; + + /* Nothing to be done if no complete block needs to be truncated. 
*/ + if (round_up(start, blocksize) >= round_down(end, blocksize)) + return; + + folio = filemap_lock_folio(inode->i_mapping, start >> PAGE_SHIFT); + if (IS_ERR(folio)) + return; + + if (folio_mkclean(folio)) + folio_mark_dirty(folio); + folio_unlock(folio); + folio_put(folio); +} + +int ext4_truncate_page_cache_block_range(struct inode *inode, + loff_t start, loff_t end) +{ + unsigned long blocksize = i_blocksize(inode); + int ret; + + /* + * For journalled data we need to write (and checkpoint) pages + * before discarding page cache to avoid inconsistent data on disk + * in case of a crash before the transaction freeing blocks or + * converting unwritten extents is committed. + */ + if (ext4_should_journal_data(inode)) { + ret = filemap_write_and_wait_range(inode->i_mapping, start, + end - 1); + if (ret) + return ret; + goto truncate_pagecache; + } + + /* + * If the block size is less than the page size, the file's mapped + * blocks within one page could be freed or converted to unwritten. + * So it's necessary to remove writable userspace mappings, and then + * ext4_page_mkwrite() can be called during subsequent write access + * to these partial folios. + */ + if (!IS_ALIGNED(start | end, PAGE_SIZE) && + blocksize < PAGE_SIZE && start < inode->i_size) { + loff_t page_boundary = round_up(start, PAGE_SIZE); + + ext4_truncate_folio(inode, start, min(page_boundary, end)); + if (end > page_boundary) + ext4_truncate_folio(inode, + round_down(end, PAGE_SIZE), end); + } + +truncate_pagecache: + truncate_pagecache_range(inode, start, end - 1); + return 0; +} + static void ext4_wait_dax_page(struct inode *inode) { filemap_invalidate_unlock(inode->i_mapping); @@ -3911,24 +4316,10 @@ static void ext4_wait_dax_page(struct inode *inode) int ext4_break_layouts(struct inode *inode) { - struct page *page; - int error; - if (WARN_ON_ONCE(!rwsem_is_locked(&inode->i_mapping->invalidate_lock))) return -EINVAL; - do { - page = dax_layout_busy_page(inode->i_mapping); - if (!page) - return 0; - - error = ___wait_var_event(&page->_refcount, - atomic_read(&page->_refcount) == 1, - TASK_INTERRUPTIBLE, 0, 0, - ext4_wait_dax_page(inode)); - } while (error == 0); - - return error; + return dax_break_layout_inode(inode, ext4_wait_dax_page); } /* @@ -3946,91 +4337,56 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) { struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; - ext4_lblk_t first_block, stop_block; - struct address_space *mapping = inode->i_mapping; - loff_t first_block_offset, last_block_offset, max_length; - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); + ext4_lblk_t start_lblk, end_lblk; + loff_t max_end = sb->s_maxbytes; + loff_t end = offset + length; handle_t *handle; unsigned int credits; - int ret = 0, ret2 = 0; + int ret; trace_ext4_punch_hole(inode, offset, length, 0); + WARN_ON_ONCE(!inode_is_locked(inode)); /* - * Write out all dirty pages to avoid race conditions - * Then release them. + * For indirect-block based inodes, make sure that the hole ends + * within one block before the maximum file size.
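+ * (The hole is clamped to max_end = s_bitmap_maxbytes - blocksize,
+ * e.g. s_bitmap_maxbytes - 4096 on a 4k-block filesystem.)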
*/ - if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY)) { - ret = filemap_write_and_wait_range(mapping, offset, - offset + length - 1); - if (ret) - return ret; - } - - inode_lock(inode); + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize; /* No need to punch hole beyond i_size */ - if (offset >= inode->i_size) - goto out_mutex; + if (offset >= inode->i_size || offset >= max_end) + return 0; /* - * If the hole extends beyond i_size, set the hole - * to end after the page that contains i_size + * If the hole extends beyond i_size, set the hole to end after + * the page that contains i_size. */ - if (offset + length > inode->i_size) { - length = inode->i_size + - PAGE_SIZE - (inode->i_size & (PAGE_SIZE - 1)) - - offset; - } + if (end > inode->i_size) + end = round_up(inode->i_size, PAGE_SIZE); + if (end > max_end) + end = max_end; + length = end - offset; /* - * For punch hole the length + offset needs to be within one block - * before last range. Adjust the length if it goes beyond that limit. + * Attach jinode to inode for jbd2 if we do any zeroing of partial + * block. */ - max_length = sbi->s_bitmap_maxbytes - inode->i_sb->s_blocksize; - if (offset + length > max_length) - length = max_length - offset; - - if (offset & (sb->s_blocksize - 1) || - (offset + length) & (sb->s_blocksize - 1)) { - /* - * Attach jinode to inode for jbd2 if we do any zeroing of - * partial block - */ + if (!IS_ALIGNED(offset | end, sb->s_blocksize)) { ret = ext4_inode_attach_jinode(inode); if (ret < 0) - goto out_mutex; - + return ret; } - /* Wait all existing dio workers, newcomers will block on i_rwsem */ - inode_dio_wait(inode); - ret = file_modified(file); + ret = ext4_update_disksize_before_punch(inode, offset, length); if (ret) - goto out_mutex; - - /* - * Prevent page faults from reinstantiating pages we have released from - * page cache. 
- */ - filemap_invalidate_lock(mapping); - - ret = ext4_break_layouts(inode); - if (ret) - goto out_dio; - - first_block_offset = round_up(offset, sb->s_blocksize); - last_block_offset = round_down((offset + length), sb->s_blocksize) - 1; + return ret; /* Now release the pages and zero block aligned part of pages*/ - if (last_block_offset > first_block_offset) { - ret = ext4_update_disksize_before_punch(inode, offset, length); - if (ret) - goto out_dio; - truncate_pagecache_range(inode, first_block_offset, - last_block_offset); - } + ret = ext4_truncate_page_cache_block_range(inode, offset, end); + if (ret) + return ret; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) credits = ext4_writepage_trans_blocks(inode); @@ -4040,54 +4396,53 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) if (IS_ERR(handle)) { ret = PTR_ERR(handle); ext4_std_error(sb, ret); - goto out_dio; + return ret; } - ret = ext4_zero_partial_blocks(handle, inode, offset, - length); + ret = ext4_zero_partial_blocks(handle, inode, offset, length); if (ret) - goto out_stop; - - first_block = (offset + sb->s_blocksize - 1) >> - EXT4_BLOCK_SIZE_BITS(sb); - stop_block = (offset + length) >> EXT4_BLOCK_SIZE_BITS(sb); + goto out_handle; /* If there are blocks to remove, do it */ - if (stop_block > first_block) { - ext4_lblk_t hole_len = stop_block - first_block; + start_lblk = EXT4_B_TO_LBLK(inode, offset); + end_lblk = end >> inode->i_blkbits; + + if (end_lblk > start_lblk) { + ext4_lblk_t hole_len = end_lblk - start_lblk; + ext4_fc_track_inode(handle, inode); + ext4_check_map_extents_env(inode); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); - ext4_es_remove_extent(inode, first_block, hole_len); + ext4_es_remove_extent(inode, start_lblk, hole_len); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - ret = ext4_ext_remove_space(inode, first_block, - stop_block - 1); + ret = ext4_ext_remove_space(inode, start_lblk, + end_lblk - 1); else - ret = ext4_ind_remove_space(handle, inode, first_block, - stop_block); + ret = ext4_ind_remove_space(handle, inode, start_lblk, + end_lblk); + if (ret) { + up_write(&EXT4_I(inode)->i_data_sem); + goto out_handle; + } - ext4_es_insert_extent(inode, first_block, hole_len, ~0, + ext4_es_insert_extent(inode, start_lblk, hole_len, ~0, EXTENT_STATUS_HOLE, 0); up_write(&EXT4_I(inode)->i_data_sem); } - ext4_fc_track_range(handle, inode, first_block, stop_block); + ext4_fc_track_range(handle, inode, start_lblk, end_lblk); + + ret = ext4_mark_inode_dirty(handle, inode); + if (unlikely(ret)) + goto out_handle; + + ext4_update_inode_fsync_trans(handle, inode, 1); if (IS_SYNC(inode)) ext4_handle_sync(handle); - - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); - ret2 = ext4_mark_inode_dirty(handle, inode); - if (unlikely(ret2)) - ret = ret2; - if (ret >= 0) - ext4_update_inode_fsync_trans(handle, inode, 1); -out_stop: +out_handle: ext4_journal_stop(handle); -out_dio: - filemap_invalidate_unlock(mapping); -out_mutex: - inode_unlock(inode); return ret; } @@ -4209,8 +4564,10 @@ int ext4_truncate(struct inode *inode) if (err) goto out_stop; - down_write(&EXT4_I(inode)->i_data_sem); + ext4_fc_track_inode(handle, inode); + ext4_check_map_extents_env(inode); + down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) @@ -4674,6 +5031,11 @@ static inline int ext4_iget_extra_inode(struct inode *inode, *magic == cpu_to_le32(EXT4_XATTR_MAGIC)) { int err; + err = 
xattr_check_inode(inode, IHDR(inode, raw_inode), + ITAIL(inode, raw_inode)); + if (err) + return err; + ext4_set_inode_state(inode, EXT4_STATE_XATTR); err = ext4_find_inline_data_nolock(inode); if (!err && ext4_has_inline_data(inode)) @@ -4705,22 +5067,60 @@ static inline void ext4_inode_set_iversion_queried(struct inode *inode, u64 val) inode_set_iversion_queried(inode, val); } -static const char *check_igot_inode(struct inode *inode, ext4_iget_flags flags) - +static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, + const char *function, unsigned int line) { + const char *err_str; + if (flags & EXT4_IGET_EA_INODE) { - if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) - return "missing EA_INODE flag"; + if (!(EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + err_str = "missing EA_INODE flag"; + goto error; + } if (ext4_test_inode_state(inode, EXT4_STATE_XATTR) || - EXT4_I(inode)->i_file_acl) - return "ea_inode with extended attributes"; + EXT4_I(inode)->i_file_acl) { + err_str = "ea_inode with extended attributes"; + goto error; + } } else { - if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) - return "unexpected EA_INODE flag"; + if ((EXT4_I(inode)->i_flags & EXT4_EA_INODE_FL)) { + /* + * open_by_handle_at() could provide an old inode number + * that has since been reused for an ea_inode; this does + * not indicate filesystem corruption + */ + if (flags & EXT4_IGET_HANDLE) + return -ESTALE; + err_str = "unexpected EA_INODE flag"; + goto error; + } + } + if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) { + err_str = "unexpected bad inode w/o EXT4_IGET_BAD"; + goto error; } - if (is_bad_inode(inode) && !(flags & EXT4_IGET_BAD)) - return "unexpected bad inode w/o EXT4_IGET_BAD"; - return NULL; + return 0; + +error: + ext4_error_inode(inode, function, line, 0, "%s", err_str); + return -EFSCORRUPTED; +} + +bool ext4_should_enable_large_folio(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (!S_ISREG(inode->i_mode)) + return false; + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) + return false; + if (ext4_has_feature_verity(sb)) + return false; + if (ext4_has_feature_encrypt(sb)) + return false; + + return true; } struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, @@ -4732,7 +5132,6 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, struct ext4_inode_info *ei; struct ext4_super_block *es = EXT4_SB(sb)->s_es; struct inode *inode; - const char *err_str; journal_t *journal = EXT4_SB(sb)->s_journal; long ret; loff_t size; @@ -4741,12 +5140,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, gid_t i_gid; projid_t i_projid; - if ((!(flags & EXT4_IGET_SPECIAL) && - ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || - ino == le32_to_cpu(es->s_usr_quota_inum) || - ino == le32_to_cpu(es->s_grp_quota_inum) || - ino == le32_to_cpu(es->s_prj_quota_inum) || - ino == le32_to_cpu(es->s_orphan_file_inum))) || + if ((!(flags & EXT4_IGET_SPECIAL) && is_special_ino(sb, ino)) || (ino < EXT4_ROOT_INO) || (ino > le32_to_cpu(es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) @@ -4761,10 +5155,10 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, if (!inode) return ERR_PTR(-ENOMEM); if (!(inode->i_state & I_NEW)) { - if ((err_str = check_igot_inode(inode, flags)) != NULL) { - ext4_error_inode(inode, function, line, 0, err_str); + ret = check_igot_inode(inode, flags, function, line); + if (ret) { iput(inode); - return 
ERR_PTR(-EFSCORRUPTED); + return ERR_PTR(ret); } return inode; } @@ -4800,15 +5194,14 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ei->i_extra_isize = 0; /* Precompute checksum seed for inode metadata */ - if (ext4_has_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb)) { struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = raw_inode->i_generation; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); - ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, - sizeof(gen)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } if ((!ext4_inode_csum_verify(inode, raw_inode, ei) || @@ -4876,7 +5269,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(sb, raw_inode); - if ((size = i_size_read(inode)) < 0) { + size = i_size_read(inode); + if (size < 0 || size > ext4_get_maxbytes(inode)) { ext4_error_inode(inode, function, line, 0, "iget: bad i_size value: %lld", size); ret = -EFSCORRUPTED; @@ -4887,7 +5281,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, * we'd normally treat htree data as empty space. But with metadata * checksumming that corrupts checksums so forbid that. */ - if (!ext4_has_feature_dir_index(sb) && ext4_has_metadata_csum(sb) && + if (!ext4_has_feature_dir_index(sb) && + ext4_has_feature_metadata_csum(sb) && ext4_test_inode_flag(inode, EXT4_INODE_INDEX)) { ext4_error_inode(inode, function, line, 0, "iget: Dir with htree data on filesystem without dir_index feature."); @@ -5007,8 +5402,16 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, inode->i_op = &ext4_encrypted_symlink_inode_operations; } else if (ext4_inode_is_fast_symlink(inode)) { inode->i_op = &ext4_fast_symlink_inode_operations; - nd_terminate_link(ei->i_data, inode->i_size, - sizeof(ei->i_data) - 1); + if (inode->i_size == 0 || + inode->i_size >= sizeof(ei->i_data) || + strnlen((char *)ei->i_data, inode->i_size + 1) != + inode->i_size) { + ext4_error_inode(inode, function, line, 0, + "invalid fast symlink length %llu", + (unsigned long long)inode->i_size); + ret = -EFSCORRUPTED; + goto bad_inode; + } inode_set_cached_link(inode, (char *)ei->i_data, inode->i_size); } else { @@ -5037,13 +5440,24 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ret = -EFSCORRUPTED; goto bad_inode; } - if ((err_str = check_igot_inode(inode, flags)) != NULL) { - ext4_error_inode(inode, function, line, 0, err_str); - ret = -EFSCORRUPTED; - goto bad_inode; - } + if (ext4_should_enable_large_folio(inode)) + mapping_set_large_folios(inode->i_mapping); + ret = check_igot_inode(inode, flags, function, line); + /* + * -ESTALE here means there is nothing inherently wrong with the inode, + * it's just not an inode we can return for an fhandle lookup. 
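+ * Returning -ESTALE lets the fhandle lookup path report a stale file
+ * handle to userspace instead of (wrongly) reporting corruption.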
+ */ + if (ret == -ESTALE) { + brelse(iloc.bh); + unlock_new_inode(inode); + iput(inode); + return ERR_PTR(-ESTALE); + } + if (ret) + goto bad_inode; brelse(iloc.bh); + unlock_new_inode(inode); return inode; @@ -5228,8 +5642,9 @@ int ext4_write_inode(struct inode *inode, struct writeback_control *wbc) if (WARN_ON_ONCE(current->flags & PF_MEMALLOC)) return 0; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + err = ext4_emergency_state(inode->i_sb); + if (unlikely(err)) + return err; if (EXT4_SB(inode->i_sb)->s_journal) { if (ext4_journal_current_handle()) { @@ -5351,8 +5766,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, const unsigned int ia_valid = attr->ia_valid; bool inc_ivers = true; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + error = ext4_emergency_state(inode->i_sb); + if (unlikely(error)) + return error; if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; @@ -5464,7 +5880,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, oldsize & (inode->i_sb->s_blocksize - 1)) { error = ext4_inode_attach_jinode(inode); if (error) - goto err_out; + goto out_mmap_sem; } handle = ext4_journal_start(inode, EXT4_HT_INODE, 3); @@ -5505,9 +5921,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, down_write(&EXT4_I(inode)->i_data_sem); old_disksize = EXT4_I(inode)->i_disksize; EXT4_I(inode)->i_disksize = attr->ia_size; - rc = ext4_mark_inode_dirty(handle, inode); - if (!error) - error = rc; + /* * We have to update i_size under i_data_sem together * with i_disksize to avoid races with writeback code @@ -5518,6 +5932,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, else EXT4_I(inode)->i_disksize = old_disksize; up_write(&EXT4_I(inode)->i_data_sem); + rc = ext4_mark_inode_dirty(handle, inode); + if (!error) + error = rc; ext4_journal_stop(handle); if (error) goto out_mmap_sem; @@ -5633,7 +6050,7 @@ int ext4_getattr(struct mnt_idmap *idmap, const struct path *path, awu_max = sbi->s_awu_max; } - generic_fill_statx_atomic_writes(stat, awu_min, awu_max); + generic_fill_statx_atomic_writes(stat, awu_min, awu_max, 0); } flags = ei->i_flags & EXT4_FL_USER_VISIBLE; @@ -5714,8 +6131,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks, * * Also account for superblock, inode, quota and xattr blocks */ -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, - int pextents) +int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; @@ -5723,18 +6139,16 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int ret; /* - * How many index blocks need to touch to map @lblocks logical blocks - * to @pextents physical extents? + * How many index and leaf blocks need to be touched to map @lblocks + * logical blocks to @pextents physical extents?
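+ * Each such metadata block may sit in a different block group, so
+ * conservatively assume one block bitmap and one group descriptor per
+ * touched block; the counts below are then clamped to the number of
+ * groups and descriptor blocks actually on disk.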
*/ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); - ret = idxblocks; - /* * Now let's see how many group bitmaps and group descriptors need * to account */ - groups = idxblocks + pextents; + groups = idxblocks; gdpblocks = groups; if (groups > ngroups) groups = ngroups; @@ -5742,7 +6156,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; /* bitmaps and block group descriptor blocks */ - ret += groups + gdpblocks; + ret = idxblocks + groups + gdpblocks; /* Blocks for super block, inode, quota and xattr blocks */ ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); @@ -5762,7 +6176,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, */ int ext4_writepage_trans_blocks(struct inode *inode) { - int bpp = ext4_journal_blocks_per_page(inode); + int bpp = ext4_journal_blocks_per_folio(inode); int ret; ret = ext4_meta_trans_blocks(inode, bpp, bpp); @@ -5796,9 +6210,10 @@ int ext4_mark_iloc_dirty(handle_t *handle, { int err = 0; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) { + err = ext4_emergency_state(inode->i_sb); + if (unlikely(err)) { put_bh(iloc->bh); - return -EIO; + return err; } ext4_fc_track_inode(handle, inode); @@ -5822,8 +6237,9 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, { int err; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) - return -EIO; + err = ext4_emergency_state(inode->i_sb); + if (unlikely(err)) + return err; err = ext4_get_inode_loc(inode, iloc); if (!err) { @@ -5834,6 +6250,7 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, brelse(iloc->bh); iloc->bh = NULL; } + ext4_fc_track_inode(handle, inode); } ext4_std_error(inode->i_sb, err); return err; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index 7b9ce71c1c81..5668a17458ae 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -142,16 +142,16 @@ static int ext4_update_backup_sb(struct super_block *sb, es = (struct ext4_super_block *) (bh->b_data + offset); lock_buffer(bh); - if (ext4_has_metadata_csum(sb) && - es->s_checksum != ext4_superblock_csum(sb, es)) { + if (ext4_has_feature_metadata_csum(sb) && + es->s_checksum != ext4_superblock_csum(es)) { ext4_msg(sb, KERN_ERR, "Invalid checksum for backup " "superblock %llu", sb_block); unlock_buffer(bh); goto out_bh; } func(es, arg); - if (ext4_has_metadata_csum(sb)) - es->s_checksum = ext4_superblock_csum(sb, es); + if (ext4_has_feature_metadata_csum(sb)) + es->s_checksum = ext4_superblock_csum(es); set_buffer_uptodate(bh); unlock_buffer(bh); @@ -351,11 +351,11 @@ void ext4_reset_inode_seed(struct inode *inode) __le32 gen = cpu_to_le32(inode->i_generation); __u32 csum; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); - ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } /* @@ -1205,7 +1205,8 @@ static int ext4_ioctl_setuuid(struct file *filp, * If any checksums (group descriptors or metadata) are being used * then the checksum seed feature is required to change the UUID. 
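 * (With the csum_seed feature the seed is stored in the superblock, so
 * checksums stay valid across a UUID change; stable_inodes implies the
 * UUID may be baked into encryption IVs and so must never change.)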
*/ - if (((ext4_has_feature_gdt_csum(sb) || ext4_has_metadata_csum(sb)) + if (((ext4_has_feature_gdt_csum(sb) || + ext4_has_feature_metadata_csum(sb)) && !ext4_has_feature_csum_seed(sb)) || ext4_has_feature_stable_inodes(sb)) return -EOPNOTSUPP; @@ -1253,7 +1254,7 @@ static long __ext4_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) if (!inode_owner_or_capable(idmap, inode)) return -EPERM; - if (ext4_has_metadata_csum(inode->i_sb)) { + if (ext4_has_feature_metadata_csum(inode->i_sb)) { ext4_warning(sb, "Setting inode version is not " "supported with metadata_csum enabled."); return -ENOTTY; @@ -1504,8 +1505,14 @@ resizefs_out: return 0; } case EXT4_IOC_PRECACHE_EXTENTS: - return ext4_ext_precache(inode); + { + int ret; + inode_lock_shared(inode); + ret = ext4_ext_precache(inode); + inode_unlock_shared(inode); + return ret; + } case FS_IOC_SET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; @@ -1705,7 +1712,7 @@ int ext4_update_overhead(struct super_block *sb, bool force) { struct ext4_sb_info *sbi = EXT4_SB(sb); - if (sb_rdonly(sb)) + if (ext4_emergency_state(sb) || sb_rdonly(sb)) return 0; if (!force && (sbi->s_overhead == 0 || diff --git a/fs/ext4/mballoc-test.c b/fs/ext4/mballoc-test.c index bb2a223b207c..d634c12f1984 100644 --- a/fs/ext4/mballoc-test.c +++ b/fs/ext4/mballoc-test.c @@ -796,6 +796,7 @@ static void test_mb_mark_used(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy); grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp); ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); KUNIT_ASSERT_EQ(test, ret, 0); @@ -860,6 +861,7 @@ static void test_mb_free_blocks(struct kunit *test) KUNIT_ASSERT_NOT_ERR_OR_NULL(test, buddy); grp = kunit_kzalloc(test, offsetof(struct ext4_group_info, bb_counters[MB_NUM_ORDERS(sb)]), GFP_KERNEL); + KUNIT_ASSERT_NOT_ERR_OR_NULL(test, grp); ret = ext4_mb_load_buddy(sb, TEST_GOAL_GROUP, &e4b); KUNIT_ASSERT_EQ(test, ret, 0); diff --git a/fs/ext4/mballoc.c b/fs/ext4/mballoc.c index b25a27c86696..1e98c5be4e0a 100644 --- a/fs/ext4/mballoc.c +++ b/fs/ext4/mballoc.c @@ -187,7 +187,7 @@ * /sys/fs/ext4/<partition>/mb_min_to_scan * /sys/fs/ext4/<partition>/mb_max_to_scan * /sys/fs/ext4/<partition>/mb_order2_req - * /sys/fs/ext4/<partition>/mb_linear_limit + * /sys/fs/ext4/<partition>/mb_max_linear_groups * * The regular allocator uses buddy scan only if the request len is power of * 2 blocks and the order of allocation is >= sbi->s_mb_order2_reqs. The @@ -209,7 +209,7 @@ * get traversed linearly. That may result in subsequent allocations being not * close to each other. And so, the underlying device may get filled up in a * non-linear fashion. While that may not matter on non-rotational devices, for - * rotational devices that may result in higher seek times. "mb_linear_limit" + * rotational devices that may result in higher seek times. "mb_max_linear_groups" * tells mballoc how many groups mballoc should search linearly before * performing consulting above data structures for more efficient lookups. 
For * non rotational devices, this value defaults to 0 and for rotational devices @@ -3037,10 +3037,8 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) unsigned char blocksize_bits = min_t(unsigned char, sb->s_blocksize_bits, EXT4_MAX_BLOCK_LOG_SIZE); - struct sg { - struct ext4_group_info info; - ext4_grpblk_t counters[EXT4_MAX_BLOCK_LOG_SIZE + 2]; - } sg; + DEFINE_RAW_FLEX(struct ext4_group_info, sg, bb_counters, + EXT4_MAX_BLOCK_LOG_SIZE + 2); group--; if (group == 0) @@ -3048,7 +3046,7 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) " 2^0 2^1 2^2 2^3 2^4 2^5 2^6 " " 2^7 2^8 2^9 2^10 2^11 2^12 2^13 ]\n"); - i = (blocksize_bits + 2) * sizeof(sg.info.bb_counters[0]) + + i = (blocksize_bits + 2) * sizeof(sg->bb_counters[0]) + sizeof(struct ext4_group_info); grinfo = ext4_get_group_info(sb, group); @@ -3068,14 +3066,14 @@ static int ext4_mb_seq_groups_show(struct seq_file *seq, void *v) * We care only about free space counters in the group info and * these are safe to access even after the buddy has been unloaded */ - memcpy(&sg, grinfo, i); - seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg.info.bb_free, - sg.info.bb_fragments, sg.info.bb_first_free); + memcpy(sg, grinfo, i); + seq_printf(seq, "#%-5u: %-5u %-5u %-5u [", group, sg->bb_free, + sg->bb_fragments, sg->bb_first_free); for (i = 0; i <= 13; i++) seq_printf(seq, " %-5u", i <= blocksize_bits + 1 ? - sg.info.bb_counters[i] : 0); + sg->bb_counters[i] : 0); seq_puts(seq, " ]"); - if (EXT4_MB_GRP_BBITMAP_CORRUPT(&sg.info)) + if (EXT4_MB_GRP_BBITMAP_CORRUPT(sg)) seq_puts(seq, " Block bitmap corrupted!"); seq_putc(seq, '\n'); return 0; @@ -5653,7 +5651,7 @@ static inline void ext4_mb_show_pa(struct super_block *sb) { ext4_group_t i, ngroups; - if (ext4_forced_shutdown(sb)) + if (ext4_emergency_state(sb)) return; ngroups = ext4_get_groups_count(sb); @@ -5687,7 +5685,7 @@ static void ext4_mb_show_ac(struct ext4_allocation_context *ac) { struct super_block *sb = ac->ac_sb; - if (ext4_forced_shutdown(sb)) + if (ext4_emergency_state(sb)) return; mb_debug(sb, "Can't allocate:" @@ -6644,7 +6642,8 @@ void ext4_free_blocks(handle_t *handle, struct inode *inode, for (i = 0; i < count; i++) { cond_resched(); if (is_metadata) - bh = sb_find_get_block(inode->i_sb, block + i); + bh = sb_find_get_block_nonatomic(inode->i_sb, + block + i); ext4_forget(handle, is_metadata, inode, bh, block + i); } } diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index d64c04ed061a..51661570cf3b 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -14,14 +14,14 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) int offset = offsetof(struct mmp_struct, mmp_checksum); __u32 csum; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset); + csum = ext4_chksum(sbi->s_csum_seed, (char *)mmp, offset); return cpu_to_le32(csum); } static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) { - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 1; return mmp->mmp_checksum == ext4_mmp_csum(sb, mmp); @@ -29,7 +29,7 @@ static int ext4_mmp_csum_verify(struct super_block *sb, struct mmp_struct *mmp) static void ext4_mmp_csum_set(struct super_block *sb, struct mmp_struct *mmp) { - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return; mmp->mmp_checksum = ext4_mmp_csum(sb, mmp); @@ -162,7 +162,7 @@ static int kmmpd(void *data) memcpy(mmp->mmp_nodename, init_utsname()->nodename, sizeof(mmp->mmp_nodename)); - while (!kthread_should_stop() 
&& !ext4_forced_shutdown(sb)) { + while (!kthread_should_stop() && !ext4_emergency_state(sb)) { if (!ext4_has_feature_mmp(sb)) { ext4_warning(sb, "kmmpd being stopped since MMP feature" " has been disabled."); diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 898443e98efc..1f8493a56e8f 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -269,7 +269,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, unsigned int tmp_data_size, data_size, replaced_size; int i, err2, jblocks, retries = 0; int replaced_count = 0; - int from = data_offset_in_page << orig_inode->i_blkbits; + int from; int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; struct super_block *sb = orig_inode->i_sb; struct buffer_head *bh = NULL; @@ -323,11 +323,6 @@ again: * hold page's lock, if it is still the case data copy is not * necessary, just swap data blocks between orig and donor. */ - - VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]); - VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]); - VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]); - if (unwritten) { ext4_double_down_write_data_sem(orig_inode, donor_inode); /* If any of extents in range became initialized we have to @@ -360,6 +355,8 @@ again: goto unlock_folios; } data_copy: + from = offset_in_folio(folio[0], + orig_blk_offset << orig_inode->i_blkbits); *err = mext_page_mkuptodate(folio[0], from, from + replaced_size); if (*err) goto unlock_folios; @@ -390,7 +387,7 @@ data_copy: if (!bh) bh = create_empty_buffers(folio[0], 1 << orig_inode->i_blkbits, 0); - for (i = 0; i < data_offset_in_page; i++) + for (i = 0; i < from >> orig_inode->i_blkbits; i++) bh = bh->b_this_page; for (i = 0; i < block_len_in_page; i++) { *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); @@ -399,7 +396,7 @@ data_copy: bh = bh->b_this_page; } - block_commit_write(&folio[0]->page, from, from + replaced_size); + block_commit_write(folio[0], from, from + replaced_size); /* Even in case of data=writeback it is reasonable to pin * inode to transaction, to prevent unexpected data loss */ diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index 536d56d15072..a178ac229489 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -176,7 +176,7 @@ static struct buffer_head *__ext4_read_dirblock(struct inode *inode, brelse(bh); return ERR_PTR(-EFSCORRUPTED); } - if (!ext4_has_metadata_csum(inode->i_sb) || + if (!ext4_has_feature_metadata_csum(inode->i_sb) || buffer_verified(bh)) return bh; @@ -291,36 +291,6 @@ struct dx_tail { __le32 dt_checksum; /* crc32c(uuid+inum+dirblock) */ }; -static inline ext4_lblk_t dx_get_block(struct dx_entry *entry); -static void dx_set_block(struct dx_entry *entry, ext4_lblk_t value); -static inline unsigned dx_get_hash(struct dx_entry *entry); -static void dx_set_hash(struct dx_entry *entry, unsigned value); -static unsigned dx_get_count(struct dx_entry *entries); -static unsigned dx_get_limit(struct dx_entry *entries); -static void dx_set_count(struct dx_entry *entries, unsigned value); -static void dx_set_limit(struct dx_entry *entries, unsigned value); -static unsigned dx_root_limit(struct inode *dir, unsigned infosize); -static unsigned dx_node_limit(struct inode *dir); -static struct dx_frame *dx_probe(struct ext4_filename *fname, - struct inode *dir, - struct dx_hash_info *hinfo, - struct dx_frame *frame); -static void dx_release(struct dx_frame *frames); -static int dx_make_map(struct inode *dir, struct buffer_head *bh, - struct dx_hash_info *hinfo, - struct 
dx_map_entry *map_tail); -static void dx_sort_map(struct dx_map_entry *map, unsigned count); -static struct ext4_dir_entry_2 *dx_move_dirents(struct inode *dir, char *from, - char *to, struct dx_map_entry *offsets, - int count, unsigned int blocksize); -static struct ext4_dir_entry_2 *dx_pack_dirents(struct inode *dir, char *base, - unsigned int blocksize); -static void dx_insert_block(struct dx_frame *frame, - u32 hash, ext4_lblk_t block); -static int ext4_htree_next_block(struct inode *dir, __u32 hash, - struct dx_frame *frame, - struct dx_frame *frames, - __u32 *start_hash); static struct buffer_head * ext4_dx_find_entry(struct inode *dir, struct ext4_filename *fname, struct ext4_dir_entry_2 **res_dir); @@ -376,11 +346,10 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, static __le32 ext4_dirblock_csum(struct inode *inode, void *dirent, int size) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size); return cpu_to_le32(csum); } @@ -398,7 +367,7 @@ int ext4_dirblock_csum_verify(struct inode *inode, struct buffer_head *bh) { struct ext4_dir_entry_tail *t; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return 1; t = get_dirent_tail(inode, bh); @@ -419,7 +388,7 @@ static void ext4_dirblock_csum_set(struct inode *inode, { struct ext4_dir_entry_tail *t; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; t = get_dirent_tail(inode, bh); @@ -472,7 +441,6 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode, static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, int count_offset, int count, struct dx_tail *t) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; int size; @@ -480,9 +448,9 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, int offset = offsetof(struct dx_tail, dt_checksum); size = count_offset + (count * sizeof(struct dx_entry)); - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); - csum = ext4_chksum(sbi, csum, (__u8 *)t, offset); - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size); + csum = ext4_chksum(csum, (__u8 *)t, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); return cpu_to_le32(csum); } @@ -494,7 +462,7 @@ static int ext4_dx_csum_verify(struct inode *inode, struct dx_tail *t; int count_offset, limit, count; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return 1; c = get_dx_countlimit(inode, dirent, &count_offset); @@ -523,7 +491,7 @@ static void ext4_dx_csum_set(struct inode *inode, struct ext4_dir_entry *dirent) struct dx_tail *t; int count_offset, limit, count; - if (!ext4_has_metadata_csum(inode->i_sb)) + if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; c = get_dx_countlimit(inode, dirent, &count_offset); @@ -612,7 +580,7 @@ static inline unsigned dx_root_limit(struct inode *dir, unsigned infosize) ext4_dir_rec_len(1, NULL) - ext4_dir_rec_len(2, NULL) - infosize; - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } @@ -622,7 +590,7 @@ 
static inline unsigned dx_node_limit(struct inode *dir) unsigned int entry_space = dir->i_sb->s_blocksize - ext4_dir_rec_len(0, dir); - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) entry_space -= sizeof(struct dx_tail); return entry_space / sizeof(struct dx_entry); } @@ -1076,7 +1044,7 @@ static int htree_dirblock_to_tree(struct file *dir_file, struct ext4_dir_entry_2 *de, *top; int err = 0, count = 0; struct fscrypt_str fname_crypto_str = FSTR_INIT(NULL, 0), tmp_str; - int csum = ext4_has_metadata_csum(dir->i_sb); + int csum = ext4_has_feature_metadata_csum(dir->i_sb); dxtrace(printk(KERN_INFO "In htree dirblock_to_tree: block %lu\n", (unsigned long)block)); @@ -1320,7 +1288,7 @@ static int dx_make_map(struct inode *dir, struct buffer_head *bh, struct dx_hash_info h = *hinfo; int blocksize = EXT4_BLOCK_SIZE(dir->i_sb); - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) buflen -= sizeof(struct ext4_dir_entry_tail); while ((char *) de < base + buflen) { @@ -1462,7 +1430,8 @@ static bool ext4_match(struct inode *parent, * sure cf_name was properly initialized before * considering the calculated hash. */ - if (IS_ENCRYPTED(parent) && fname->cf_name.name && + if (sb_no_casefold_compat_fallback(parent->i_sb) && + IS_ENCRYPTED(parent) && fname->cf_name.name && (fname->hinfo.hash != EXT4_DIRENT_HASH(de) || fname->hinfo.minor_hash != EXT4_DIRENT_MINOR_HASH(de))) return false; @@ -1595,10 +1564,15 @@ static struct buffer_head *__ext4_find_entry(struct inode *dir, * return. Otherwise, fall back to doing a search the * old fashioned way. */ - if (!IS_ERR(ret) || PTR_ERR(ret) != ERR_BAD_DX_DIR) + if (IS_ERR(ret) && PTR_ERR(ret) == ERR_BAD_DX_DIR) + dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " + "falling back\n")); + else if (!sb_no_casefold_compat_fallback(dir->i_sb) && + *res_dir == NULL && IS_CASEFOLDED(dir)) + dxtrace(printk(KERN_DEBUG "ext4_find_entry: casefold " + "failed, falling back\n")); + else goto cleanup_and_exit; - dxtrace(printk(KERN_DEBUG "ext4_find_entry: dx failed, " - "falling back\n")); ret = NULL; } nblocks = dir->i_size >> EXT4_BLOCK_SIZE_BITS(sb); @@ -1945,7 +1919,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, int csum_size = 0; int err = 0, i; - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); bh2 = ext4_append(handle, dir, &newblock); @@ -1995,7 +1969,7 @@ static struct ext4_dir_entry_2 *do_split(handle_t *handle, struct inode *dir, * split it in half by count; each resulting block will have at least * half the space free. 
*/ - if (i > 0) + if (i >= 0) split = count - move; else split = count/2; @@ -2060,8 +2034,7 @@ out: return ERR_PTR(err); } -int ext4_find_dest_de(struct inode *dir, struct inode *inode, - struct buffer_head *bh, +int ext4_find_dest_de(struct inode *dir, struct buffer_head *bh, void *buf, int buf_size, struct ext4_filename *fname, struct ext4_dir_entry_2 **dest_de) @@ -2143,11 +2116,11 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname, int csum_size = 0; int err, err2; - if (ext4_has_metadata_csum(inode->i_sb)) + if (ext4_has_feature_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); if (!de) { - err = ext4_find_dest_de(dir, inode, bh, bh->b_data, + err = ext4_find_dest_de(dir, bh, bh->b_data, blocksize - csum_size, fname, &de); if (err) return err; @@ -2252,7 +2225,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname, struct fake_dirent *fde; int csum_size = 0; - if (ext4_has_metadata_csum(inode->i_sb)) + if (ext4_has_feature_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); blocksize = dir->i_sb->s_blocksize; @@ -2396,7 +2369,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, ext4_lblk_t block, blocks; int csum_size = 0; - if (ext4_has_metadata_csum(inode->i_sb)) + if (ext4_has_feature_metadata_csum(inode->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); sb = dir->i_sb; @@ -2427,7 +2400,7 @@ static int ext4_add_entry(handle_t *handle, struct dentry *dentry, if (!retval || (retval != ERR_BAD_DX_DIR)) goto out; /* Can we just ignore htree data? */ - if (ext4_has_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb)) { EXT4_ERROR_INODE(dir, "Directory has corrupted htree index."); retval = -EFSCORRUPTED; @@ -2577,8 +2550,10 @@ again: BUFFER_TRACE(frame->bh, "get_write_access"); err = ext4_journal_get_write_access(handle, sb, frame->bh, EXT4_JTR_NONE); - if (err) + if (err) { + brelse(bh2); goto journal_error; + } if (!add_level) { unsigned icount1 = icount/2, icount2 = icount - icount1; unsigned hash2 = dx_get_hash(entries + icount1); @@ -2589,8 +2564,10 @@ again: err = ext4_journal_get_write_access(handle, sb, (frame - 1)->bh, EXT4_JTR_NONE); - if (err) + if (err) { + brelse(bh2); goto journal_error; + } memcpy((char *) entries2, (char *) (entries + icount1), icount2 * sizeof(struct dx_entry)); @@ -2609,8 +2586,10 @@ again: dxtrace(dx_show_index("node", ((struct dx_node *) bh2->b_data)->entries)); err = ext4_handle_dirty_dx_node(handle, dir, bh2); - if (err) + if (err) { + brelse(bh2); goto journal_error; + } brelse (bh2); err = ext4_handle_dirty_dx_node(handle, dir, (frame - 1)->bh); @@ -2635,8 +2614,10 @@ again: "Creating %d level index...\n", dxroot->info.indirect_levels)); err = ext4_handle_dirty_dx_node(handle, dir, frame->bh); - if (err) + if (err) { + brelse(bh2); goto journal_error; + } err = ext4_handle_dirty_dx_node(handle, dir, bh2); brelse(bh2); restart = 1; @@ -2733,7 +2714,7 @@ static int ext4_delete_entry(handle_t *handle, return err; } - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); BUFFER_TRACE(bh, "get_write_access"); @@ -2973,7 +2954,7 @@ int ext4_init_new_dir(handle_t *handle, struct inode *dir, int csum_size = 0; int err; - if (ext4_has_metadata_csum(dir->i_sb)) + if (ext4_has_feature_metadata_csum(dir->i_sb)) csum_size = sizeof(struct ext4_dir_entry_tail); if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { @@ -3004,19 +2985,19 @@ 
out: return err; } -static int ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *ext4_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { handle_t *handle; struct inode *inode; int err, err2 = 0, credits, retries = 0; if (EXT4_DIR_LINK_MAX(dir)) - return -EMLINK; + return ERR_PTR(-EMLINK); err = dquot_initialize(dir); if (err) - return err; + return ERR_PTR(err); credits = (EXT4_DATA_TRANS_BLOCKS(dir->i_sb) + EXT4_INDEX_EXTRA_TRANS_BLOCKS + 3); @@ -3066,7 +3047,7 @@ out_stop: out_retry: if (err == -ENOSPC && ext4_should_retry_alloc(dir->i_sb, &retries)) goto retry; - return err; + return ERR_PTR(err); } /* @@ -3151,8 +3132,9 @@ static int ext4_rmdir(struct inode *dir, struct dentry *dentry) struct ext4_dir_entry_2 *de; handle_t *handle = NULL; - if (unlikely(ext4_forced_shutdown(dir->i_sb))) - return -EIO; + retval = ext4_emergency_state(dir->i_sb); + if (unlikely(retval)) + return retval; /* Initialize quotas before so that eventual writes go in * separate transaction */ @@ -3309,8 +3291,9 @@ static int ext4_unlink(struct inode *dir, struct dentry *dentry) { int retval; - if (unlikely(ext4_forced_shutdown(dir->i_sb))) - return -EIO; + retval = ext4_emergency_state(dir->i_sb); + if (unlikely(retval)) + return retval; trace_ext4_unlink_enter(dir, dentry); /* @@ -3376,8 +3359,9 @@ static int ext4_symlink(struct mnt_idmap *idmap, struct inode *dir, struct fscrypt_str disk_link; int retries = 0; - if (unlikely(ext4_forced_shutdown(dir->i_sb))) - return -EIO; + err = ext4_emergency_state(dir->i_sb); + if (unlikely(err)) + return err; err = fscrypt_prepare_symlink(dir, symname, len, dir->i_sb->s_blocksize, &disk_link); @@ -4199,8 +4183,9 @@ static int ext4_rename2(struct mnt_idmap *idmap, { int err; - if (unlikely(ext4_forced_shutdown(old_dir->i_sb))) - return -EIO; + err = ext4_emergency_state(old_dir->i_sb); + if (unlikely(err)) + return err; if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT)) return -EINVAL; diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index e5b47dda3317..7c7f792ad6ab 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -537,13 +537,13 @@ static int ext4_orphan_file_block_csum_verify(struct super_block *sb, struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 1; ot = ext4_orphan_block_tail(sb, bh); - calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, - (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); - calculated = ext4_chksum(EXT4_SB(sb), calculated, (__u8 *)bh->b_data, + calculated = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr, + sizeof(dsk_block_nr)); + calculated = ext4_chksum(calculated, (__u8 *)bh->b_data, inodes_per_ob * sizeof(__u32)); return le32_to_cpu(ot->ob_checksum) == calculated; } @@ -560,10 +560,9 @@ void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers, struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); - csum = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, - (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); - csum = ext4_chksum(EXT4_SB(sb), csum, (__u8 *)data, - inodes_per_ob * sizeof(__u32)); + csum = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr, + sizeof(dsk_block_nr)); + csum = ext4_chksum(csum, (__u8 *)data, inodes_per_ob * sizeof(__u32)); ot = ext4_orphan_block_tail(sb, bh); ot->ob_checksum = cpu_to_le32(csum); } diff --git a/fs/ext4/page-io.c 
b/fs/ext4/page-io.c index 69b8a7221a2b..179e54f3a3b6 100644 --- a/fs/ext4/page-io.c +++ b/fs/ext4/page-io.c @@ -164,7 +164,8 @@ static void ext4_release_io_end(ext4_io_end_t *io_end) } /* - * Check a range of space and convert unwritten extents to written. Note that + * On successful IO, check a range of space and convert unwritten extents to + * written. On IO failure, check if journal abort is needed. Note that * we are protected from truncate touching same part of extent tree by the * fact that truncate code waits for all DIO to finish (thus exclusion from * direct IO is achieved) and also waits for PageWriteback bits. Thus we @@ -175,20 +176,36 @@ static int ext4_end_io_end(ext4_io_end_t *io_end) { struct inode *inode = io_end->inode; handle_t *handle = io_end->handle; + struct super_block *sb = inode->i_sb; int ret = 0; ext4_debug("ext4_end_io_nolock: io_end 0x%p from inode %lu,list->next 0x%p," "list->prev 0x%p\n", io_end, inode->i_ino, io_end->list.next, io_end->list.prev); - io_end->handle = NULL; /* Following call will use up the handle */ - ret = ext4_convert_unwritten_io_end_vec(handle, io_end); - if (ret < 0 && !ext4_forced_shutdown(inode->i_sb)) { - ext4_msg(inode->i_sb, KERN_EMERG, + /* + * Do not convert the unwritten extents if data writeback fails, + * or stale data may be exposed. + */ + io_end->handle = NULL; /* Following call will use up the handle */ + if (unlikely(io_end->flag & EXT4_IO_END_FAILED)) { + ret = -EIO; + if (handle) + jbd2_journal_free_reserved(handle); + + if (test_opt(sb, DATA_ERR_ABORT)) + jbd2_journal_abort(EXT4_SB(sb)->s_journal, ret); + } else { + ret = ext4_convert_unwritten_io_end_vec(handle, io_end); + } + if (ret < 0 && !ext4_emergency_state(sb) && + io_end->flag & EXT4_IO_END_UNWRITTEN) { + ext4_msg(sb, KERN_EMERG, "failed to convert unwritten extents to written " "extents -- potential data loss! " "(inode %lu, error %d)", inode->i_ino, ret); } + ext4_clear_io_unwritten_flag(io_end); ext4_release_io_end(io_end); return ret; @@ -217,6 +234,16 @@ static void dump_completed_IO(struct inode *inode, struct list_head *head) #endif } +static bool ext4_io_end_defer_completion(ext4_io_end_t *io_end) +{ + if (io_end->flag & EXT4_IO_END_UNWRITTEN) + return true; + if (test_opt(io_end->inode->i_sb, DATA_ERR_ABORT) && + io_end->flag & EXT4_IO_END_FAILED) + return true; + return false; +} + /* Add the io_end to per-inode completed end_io list. */ static void ext4_add_complete_io(ext4_io_end_t *io_end) { @@ -225,9 +252,11 @@ static void ext4_add_complete_io(ext4_io_end_t *io_end) struct workqueue_struct *wq; unsigned long flags; - /* Only reserved conversions from writeback should enter here */ - WARN_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); - WARN_ON(!io_end->handle && sbi->s_journal); + /* Only reserved conversions or pending IO errors will enter here. 
*/ + WARN_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION)); + WARN_ON(io_end->flag & EXT4_IO_END_UNWRITTEN && + !io_end->handle && sbi->s_journal); + spin_lock_irqsave(&ei->i_completed_io_lock, flags); wq = sbi->rsv_conversion_wq; if (list_empty(&ei->i_rsv_conversion_list)) @@ -252,7 +281,7 @@ static int ext4_do_flush_completed_IO(struct inode *inode, while (!list_empty(&unwritten)) { io_end = list_entry(unwritten.next, ext4_io_end_t, list); - BUG_ON(!(io_end->flag & EXT4_IO_END_UNWRITTEN)); + BUG_ON(!(io_end->flag & EXT4_IO_END_DEFER_COMPLETION)); list_del_init(&io_end->list); err = ext4_end_io_end(io_end); @@ -263,7 +292,8 @@ static int ext4_do_flush_completed_IO(struct inode *inode, } /* - * work on completed IO, to convert unwritten extents to extents + * Used to convert unwritten extents to written extents upon IO completion, + * or used to abort the journal upon IO errors. */ void ext4_end_io_rsv_work(struct work_struct *work) { @@ -288,29 +318,25 @@ ext4_io_end_t *ext4_init_io_end(struct inode *inode, gfp_t flags) void ext4_put_io_end_defer(ext4_io_end_t *io_end) { if (refcount_dec_and_test(&io_end->count)) { - if (!(io_end->flag & EXT4_IO_END_UNWRITTEN) || - list_empty(&io_end->list_vec)) { - ext4_release_io_end(io_end); + if (io_end->flag & EXT4_IO_END_FAILED || + (io_end->flag & EXT4_IO_END_UNWRITTEN && + !list_empty(&io_end->list_vec))) { + ext4_add_complete_io(io_end); return; } - ext4_add_complete_io(io_end); + ext4_release_io_end(io_end); } } int ext4_put_io_end(ext4_io_end_t *io_end) { - int err = 0; - if (refcount_dec_and_test(&io_end->count)) { - if (io_end->flag & EXT4_IO_END_UNWRITTEN) { - err = ext4_convert_unwritten_io_end_vec(io_end->handle, - io_end); - io_end->handle = NULL; - ext4_clear_io_unwritten_flag(io_end); - } + if (ext4_io_end_defer_completion(io_end)) + return ext4_end_io_end(io_end); + ext4_release_io_end(io_end); } - return err; + return 0; } ext4_io_end_t *ext4_get_io_end(ext4_io_end_t *io_end) @@ -344,11 +370,12 @@ static void ext4_end_bio(struct bio *bio) bio->bi_status, inode->i_ino, (unsigned long long) bi_sector >> (inode->i_blkbits - 9)); + io_end->flag |= EXT4_IO_END_FAILED; mapping_set_error(inode->i_mapping, blk_status_to_errno(bio->bi_status)); } - if (io_end->flag & EXT4_IO_END_UNWRITTEN) { + if (ext4_io_end_defer_completion(io_end)) { /* * Link bio into list hanging from io_end. We have to do it * atomically as bio completions can be racing against each @@ -522,7 +549,7 @@ int ext4_bio_write_folio(struct ext4_io_submit *io, struct folio *folio, if (io->io_bio) gfp_flags = GFP_NOWAIT | __GFP_NOWARN; retry_encrypt: - bounce_page = fscrypt_encrypt_pagecache_blocks(&folio->page, + bounce_page = fscrypt_encrypt_pagecache_blocks(folio, enc_bytes, 0, gfp_flags); if (IS_ERR(bounce_page)) { ret = PTR_ERR(bounce_page); diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 5d3a9dc9a32d..f329daf6e5c7 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -227,24 +227,30 @@ int ext4_mpage_readpages(struct inode *inode, int length; unsigned relative_block = 0; struct ext4_map_blocks map; - unsigned int nr_pages = rac ? readahead_count(rac) : 1; + unsigned int nr_pages, folio_pages; map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; map.m_flags = 0; - for (; nr_pages; nr_pages--) { + nr_pages = rac ? 
readahead_count(rac) : folio_nr_pages(folio); + for (; nr_pages; nr_pages -= folio_pages) { int fully_mapped = 1; - unsigned first_hole = blocks_per_page; + unsigned int first_hole; + unsigned int blocks_per_folio; if (rac) folio = readahead_folio(rac); + + folio_pages = folio_nr_pages(folio); prefetchw(&folio->flags); if (folio_buffers(folio)) goto confused; + blocks_per_folio = folio_size(folio) >> blkbits; + first_hole = blocks_per_folio; block_in_file = next_block = (sector_t)folio->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; @@ -270,7 +276,7 @@ int ext4_mpage_readpages(struct inode *inode, map.m_flags &= ~EXT4_MAP_MAPPED; break; } - if (page_block == blocks_per_page) + if (page_block == blocks_per_folio) break; page_block++; block_in_file++; @@ -281,7 +287,7 @@ int ext4_mpage_readpages(struct inode *inode, * Then do more ext4_map_blocks() calls until we are * done with this folio. */ - while (page_block < blocks_per_page) { + while (page_block < blocks_per_folio) { if (block_in_file < last_block) { map.m_lblk = block_in_file; map.m_len = last_block - block_in_file; @@ -296,13 +302,13 @@ int ext4_mpage_readpages(struct inode *inode, } if ((map.m_flags & EXT4_MAP_MAPPED) == 0) { fully_mapped = 0; - if (first_hole == blocks_per_page) + if (first_hole == blocks_per_folio) first_hole = page_block; page_block++; block_in_file++; continue; } - if (first_hole != blocks_per_page) + if (first_hole != blocks_per_folio) goto confused; /* hole -> non-hole */ /* Contiguous blocks? */ @@ -315,13 +321,13 @@ int ext4_mpage_readpages(struct inode *inode, /* needed? */ map.m_flags &= ~EXT4_MAP_MAPPED; break; - } else if (page_block == blocks_per_page) + } else if (page_block == blocks_per_folio) break; page_block++; block_in_file++; } } - if (first_hole != blocks_per_page) { + if (first_hole != blocks_per_folio) { folio_zero_segment(folio, first_hole << blkbits, folio_size(folio)); if (first_hole == 0) { @@ -367,11 +373,11 @@ int ext4_mpage_readpages(struct inode *inode, if (((map.m_flags & EXT4_MAP_BOUNDARY) && (relative_block == map.m_len)) || - (first_hole != blocks_per_page)) { + (first_hole != blocks_per_folio)) { submit_bio(bio); bio = NULL; } else - last_block_in_bio = first_block + blocks_per_page - 1; + last_block_in_bio = first_block + blocks_per_folio - 1; continue; confused: if (bio) { diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index 72f77f78ae8d..050f26168d97 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1118,8 +1118,8 @@ static inline void ext4_set_block_group_nr(struct super_block *sb, char *data, struct ext4_super_block *es = (struct ext4_super_block *) data; es->s_block_group_nr = cpu_to_le16(group); - if (ext4_has_metadata_csum(sb)) - es->s_checksum = ext4_superblock_csum(sb, es); + if (ext4_has_feature_metadata_csum(sb)) + es->s_checksum = ext4_superblock_csum(es); } /* @@ -1315,7 +1315,7 @@ static int ext4_set_bitmap_checksums(struct super_block *sb, { struct buffer_head *bh; - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 0; bh = ext4_get_bitmap(sb, group_data->inode_bitmap); diff --git a/fs/ext4/super.c b/fs/ext4/super.c index a50e5c31b937..a7f80ca01174 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -79,7 +79,6 @@ static int ext4_unfreeze(struct super_block *sb); static int ext4_freeze(struct super_block *sb); static inline int ext2_feature_set_ok(struct super_block *sb); static inline int ext3_feature_set_ok(struct super_block *sb); -static void 
ext4_destroy_lazyinit_thread(void); static void ext4_unregister_li_request(struct super_block *sb); static void ext4_clear_request_list(void); static struct inode *ext4_get_journal_inode(struct super_block *sb, @@ -287,14 +286,12 @@ static int ext4_verify_csum_type(struct super_block *sb, return es->s_checksum_type == EXT4_CRC32C_CHKSUM; } -__le32 ext4_superblock_csum(struct super_block *sb, - struct ext4_super_block *es) +__le32 ext4_superblock_csum(struct ext4_super_block *es) { - struct ext4_sb_info *sbi = EXT4_SB(sb); int offset = offsetof(struct ext4_super_block, s_checksum); __u32 csum; - csum = ext4_chksum(sbi, ~0, (char *)es, offset); + csum = ext4_chksum(~0, (char *)es, offset); return cpu_to_le32(csum); } @@ -302,20 +299,20 @@ __le32 ext4_superblock_csum(struct super_block *sb, static int ext4_superblock_csum_verify(struct super_block *sb, struct ext4_super_block *es) { - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return 1; - return es->s_checksum == ext4_superblock_csum(sb, es); + return es->s_checksum == ext4_superblock_csum(es); } void ext4_superblock_csum_set(struct super_block *sb) { struct ext4_super_block *es = EXT4_SB(sb)->s_es; - if (!ext4_has_metadata_csum(sb)) + if (!ext4_has_feature_metadata_csum(sb)) return; - es->s_checksum = ext4_superblock_csum(sb, es); + es->s_checksum = ext4_superblock_csum(es); } ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, @@ -448,9 +445,6 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) #define ext4_get_tstamp(es, tstamp) \ __ext4_get_tstamp(&(es)->tstamp, &(es)->tstamp ## _hi) -#define EXT4_SB_REFRESH_INTERVAL_SEC (3600) /* seconds (1 hour) */ -#define EXT4_SB_REFRESH_INTERVAL_KB (16384) /* kilobytes (16MB) */ - /* * The ext4_maybe_update_superblock() function checks and updates the * superblock if needed. @@ -458,8 +452,10 @@ static time64_t __ext4_get_tstamp(__le32 *lo, __u8 *hi) * This function is designed to update the on-disk superblock only under * certain conditions to prevent excessive disk writes and unnecessary * waking of the disk from sleep. The superblock will be updated if: - * 1. More than an hour has passed since the last superblock update, and - * 2. More than 16MB have been written since the last superblock update. + * 1. More than sbi->s_sb_update_sec (def: 1 hour) has passed since the last + * superblock update + * 2. More than sbi->s_sb_update_kb (def: 16MB) kbs have been written since the + * last superblock update. 
* * @sb: The superblock */ @@ -473,14 +469,15 @@ static void ext4_maybe_update_superblock(struct super_block *sb) __u64 lifetime_write_kbytes; __u64 diff_size; - if (sb_rdonly(sb) || !(sb->s_flags & SB_ACTIVE) || - !journal || (journal->j_flags & JBD2_UNMOUNT)) + if (ext4_emergency_state(sb) || sb_rdonly(sb) || + !(sb->s_flags & SB_ACTIVE) || !journal || + journal->j_flags & JBD2_UNMOUNT) return; now = ktime_get_real_seconds(); last_update = ext4_get_tstamp(es, s_wtime); - if (likely(now - last_update < EXT4_SB_REFRESH_INTERVAL_SEC)) + if (likely(now - last_update < sbi->s_sb_update_sec)) return; lifetime_write_kbytes = sbi->s_kbytes_written + @@ -495,49 +492,23 @@ static void ext4_maybe_update_superblock(struct super_block *sb) */ diff_size = lifetime_write_kbytes - le64_to_cpu(es->s_kbytes_written); - if (diff_size > EXT4_SB_REFRESH_INTERVAL_KB) + if (diff_size > sbi->s_sb_update_kb) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); } static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) { struct super_block *sb = journal->j_private; - struct ext4_sb_info *sbi = EXT4_SB(sb); - int error = is_journal_aborted(journal); - struct ext4_journal_cb_entry *jce; BUG_ON(txn->t_state == T_FINISHED); ext4_process_freed_data(sb, txn->t_tid); ext4_maybe_update_superblock(sb); - - spin_lock(&sbi->s_md_lock); - while (!list_empty(&txn->t_private_list)) { - jce = list_entry(txn->t_private_list.next, - struct ext4_journal_cb_entry, jce_list); - list_del_init(&jce->jce_list); - spin_unlock(&sbi->s_md_lock); - jce->jce_func(sb, jce, error); - spin_lock(&sbi->s_md_lock); - } - spin_unlock(&sbi->s_md_lock); } -/* - * This writepage callback for write_cache_pages() - * takes care of a few cases after page cleaning. - * - * write_cache_pages() already checks for dirty pages - * and calls clear_page_dirty_for_io(), which we want, - * to write protect the pages. - * - * However, we may have to redirty a page (see below.) - */ -static int ext4_journalled_writepage_callback(struct folio *folio, - struct writeback_control *wbc, - void *data) +static bool ext4_journalled_writepage_needs_redirty(struct jbd2_inode *jinode, + struct folio *folio) { - transaction_t *transaction = (transaction_t *) data; struct buffer_head *bh, *head; struct journal_head *jh; @@ -558,15 +529,12 @@ static int ext4_journalled_writepage_callback(struct folio *folio, */ jh = bh2jh(bh); if (buffer_dirty(bh) || - (jh && (jh->b_transaction != transaction || - jh->b_next_transaction))) { - folio_redirty_for_writepage(wbc, folio); - goto out; - } + (jh && (jh->b_transaction != jinode->i_transaction || + jh->b_next_transaction))) + return true; } while ((bh = bh->b_this_page) != head); -out: - return AOP_WRITEPAGE_ACTIVATE; + return false; } static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) @@ -578,10 +546,23 @@ static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) .range_start = jinode->i_dirty_start, .range_end = jinode->i_dirty_end, }; + struct folio *folio = NULL; + int error; + + /* + * writeback_iter() already checks for dirty pages and calls + * folio_clear_dirty_for_io(), which we want to write protect the + * folios. + * + * However, we may have to redirty a folio sometimes. 
+ */ + while ((folio = writeback_iter(mapping, &wbc, folio, &error))) { + if (ext4_journalled_writepage_needs_redirty(jinode, folio)) + folio_redirty_for_writepage(&wbc, folio); + folio_unlock(folio); + } - return write_cache_pages(mapping, &wbc, - ext4_journalled_writepage_callback, - jinode->i_transaction); + return error; } static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) @@ -707,11 +688,8 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, if (test_opt(sb, WARN_ON_ERROR)) WARN_ON_ONCE(1); - if (!continue_fs && !sb_rdonly(sb)) { - set_bit(EXT4_FLAGS_SHUTDOWN, &EXT4_SB(sb)->s_ext4_flags); - if (journal) - jbd2_journal_abort(journal, -EIO); - } + if (!continue_fs && !ext4_emergency_ro(sb) && journal) + jbd2_journal_abort(journal, -EIO); if (!bdev_read_only(sb->s_bdev)) { save_error_info(sb, error, ino, block, func, line); @@ -719,9 +697,13 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, * In case the fs should keep running, we need to writeout * superblock through the journal. Due to lock ordering * constraints, it may not be safe to do it right here so we - * defer superblock flushing to a workqueue. + * defer superblock flushing to a workqueue. We just need to be + * careful when the journal is already shutting down. If we get + * here in that case, just update the sb directly as the last + * transaction won't commit anyway. */ - if (continue_fs && journal) + if (continue_fs && journal && + !ext4_test_mount_flag(sb, EXT4_MF_JOURNAL_DESTROY)) schedule_work(&EXT4_SB(sb)->s_sb_upd_work); else ext4_commit_super(sb); @@ -737,17 +719,17 @@ static void ext4_handle_error(struct super_block *sb, bool force_ro, int error, sb->s_id); } - if (sb_rdonly(sb) || continue_fs) + if (ext4_emergency_ro(sb) || continue_fs) return; ext4_msg(sb, KERN_CRIT, "Remounting filesystem read-only"); /* - * EXT4_FLAGS_SHUTDOWN was set which stops all filesystem - * modifications. We don't set SB_RDONLY because that requires - * sb->s_umount semaphore and setting it without proper remount - * procedure is confusing code such as freeze_super() leading to - * deadlocks and other problems. + * We don't set SB_RDONLY because that requires sb->s_umount + * semaphore and setting it without proper remount procedure is + * confusing code such as freeze_super() leading to deadlocks + * and other problems. */ + set_bit(EXT4_FLAGS_EMERGENCY_RO, &EXT4_SB(sb)->s_ext4_flags); } static void update_super_work(struct work_struct *work) @@ -765,7 +747,8 @@ static void update_super_work(struct work_struct *work) * We use directly jbd2 functions here to avoid recursing back into * ext4 error handling code during handling of previous errors. 
*/ - if (!sb_rdonly(sbi->s_sb) && journal) { + if (!ext4_emergency_state(sbi->s_sb) && + !sb_rdonly(sbi->s_sb) && journal) { struct buffer_head *sbh = sbi->s_sbh; bool call_notify_err = false; @@ -819,7 +802,7 @@ void __ext4_error(struct super_block *sb, const char *function, struct va_format vaf; va_list args; - if (unlikely(ext4_forced_shutdown(sb))) + if (unlikely(ext4_emergency_state(sb))) return; trace_ext4_error(sb, function, line); @@ -844,7 +827,7 @@ void __ext4_error_inode(struct inode *inode, const char *function, va_list args; struct va_format vaf; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) + if (unlikely(ext4_emergency_state(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); @@ -879,7 +862,7 @@ void __ext4_error_file(struct file *file, const char *function, struct inode *inode = file_inode(file); char pathname[80], *path; - if (unlikely(ext4_forced_shutdown(inode->i_sb))) + if (unlikely(ext4_emergency_state(inode->i_sb))) return; trace_ext4_error(inode->i_sb, function, line); @@ -959,7 +942,7 @@ void __ext4_std_error(struct super_block *sb, const char *function, char nbuf[16]; const char *errstr; - if (unlikely(ext4_forced_shutdown(sb))) + if (unlikely(ext4_emergency_state(sb))) return; /* Special case: if the error is EROFS, and we're not already @@ -1053,7 +1036,7 @@ __acquires(bitlock) struct va_format vaf; va_list args; - if (unlikely(ext4_forced_shutdown(sb))) + if (unlikely(ext4_emergency_state(sb))) return; trace_ext4_error(sb, function, line); @@ -1306,18 +1289,17 @@ static void ext4_put_super(struct super_block *sb) ext4_unregister_li_request(sb); ext4_quotas_off(sb, EXT4_MAXQUOTAS); - flush_work(&sbi->s_sb_upd_work); destroy_workqueue(sbi->rsv_conversion_wq); ext4_release_orphan_info(sb); if (sbi->s_journal) { aborted = is_journal_aborted(sbi->s_journal); - err = jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; + err = ext4_journal_destroy(sbi, sbi->s_journal); if ((err < 0) && !aborted) { ext4_abort(sb, -err, "Couldn't clean up the journal"); } - } + } else + flush_work(&sbi->s_sb_upd_work); ext4_es_unregister_shrinker(sbi); timer_shutdown_sync(&sbi->s_err_report); @@ -1325,13 +1307,14 @@ static void ext4_put_super(struct super_block *sb) ext4_mb_release(sb); ext4_ext_release(sb); - if (!sb_rdonly(sb) && !aborted) { - ext4_clear_feature_journal_needs_recovery(sb); - ext4_clear_feature_orphan_present(sb); - es->s_state = cpu_to_le16(sbi->s_mount_state); - } - if (!sb_rdonly(sb)) + if (!ext4_emergency_state(sb) && !sb_rdonly(sb)) { + if (!aborted) { + ext4_clear_feature_journal_needs_recovery(sb); + ext4_clear_feature_orphan_present(sb); + es->s_state = cpu_to_le16(sbi->s_mount_state); + } ext4_commit_super(sb); + } ext4_group_desc_free(sbi); ext4_flex_groups_free(sbi); @@ -1426,10 +1409,9 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) spin_lock_init(&ei->i_completed_io_lock); ei->i_sync_tid = 0; ei->i_datasync_tid = 0; - atomic_set(&ei->i_unwritten, 0); INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); ext4_fc_init_inode(&ei->vfs_inode); - mutex_init(&ei->i_fc_lock); + spin_lock_init(&ei->i_fc_lock); return &ei->vfs_inode; } @@ -1823,7 +1805,6 @@ static const struct fs_parameter_spec ext4_param_specs[] = { {} }; -#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) #define MOPT_SET 0x0001 #define MOPT_CLEAR 0x0002 @@ -2785,6 +2766,13 @@ static int ext4_check_opt_consistency(struct fs_context *fc, } if (is_remount) { + if (!sbi->s_journal && + ctx_test_mount_opt(ctx, 
EXT4_MOUNT_DATA_ERR_ABORT)) { + ext4_msg(NULL, KERN_WARNING, + "Remounting fs w/o journal so ignoring data_err option"); + ctx_clear_mount_opt(ctx, EXT4_MOUNT_DATA_ERR_ABORT); + } + if (ctx_test_mount_opt(ctx, EXT4_MOUNT_DAX_ALWAYS) && (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA)) { ext4_msg(NULL, KERN_ERR, "can't mount with " @@ -3038,6 +3026,12 @@ static int _ext4_show_options(struct seq_file *seq, struct super_block *sb, if (nodefs && !test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS)) SEQ_OPTS_PUTS("prefetch_block_bitmaps"); + if (ext4_emergency_ro(sb)) + SEQ_OPTS_PUTS("emergency_ro"); + + if (ext4_forced_shutdown(sb)) + SEQ_OPTS_PUTS("shutdown"); + ext4_show_quota_options(seq, sb); return 0; } @@ -3205,19 +3199,19 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, __le32 le_group = cpu_to_le32(block_group); struct ext4_sb_info *sbi = EXT4_SB(sb); - if (ext4_has_metadata_csum(sbi->s_sb)) { + if (ext4_has_feature_metadata_csum(sbi->s_sb)) { /* Use new metadata_csum algorithm */ __u32 csum32; __u16 dummy_csum = 0; - csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, + csum32 = ext4_chksum(sbi->s_csum_seed, (__u8 *)&le_group, sizeof(le_group)); - csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset); - csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum, + csum32 = ext4_chksum(csum32, (__u8 *)gdp, offset); + csum32 = ext4_chksum(csum32, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); if (offset < sbi->s_desc_size) - csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset, + csum32 = ext4_chksum(csum32, (__u8 *)gdp + offset, sbi->s_desc_size - offset); crc = csum32 & 0xFFFF; @@ -3693,7 +3687,8 @@ static int ext4_run_li_request(struct ext4_li_request *elr) if (group >= elr->lr_next_group) { ret = 1; if (elr->lr_first_not_zeroed != ngroups && - !sb_rdonly(sb) && test_opt(sb, INIT_INODE_TABLE)) { + !ext4_emergency_state(sb) && !sb_rdonly(sb) && + test_opt(sb, INIT_INODE_TABLE)) { elr->lr_next_group = elr->lr_first_not_zeroed; elr->lr_mode = EXT4_LI_MODE_ITABLE; ret = 0; @@ -3998,7 +3993,7 @@ int ext4_register_li_request(struct super_block *sb, goto out; } - if (sb_rdonly(sb) || + if (ext4_emergency_state(sb) || sb_rdonly(sb) || (test_opt(sb, NO_PREFETCH_BLOCK_BITMAPS) && (first_not_zeroed == ngroups || !test_opt(sb, INIT_INODE_TABLE)))) goto out; @@ -4061,7 +4056,7 @@ static int set_journal_csum_feature_set(struct super_block *sb) int compat, incompat; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (ext4_has_metadata_csum(sb)) { + if (ext4_has_feature_metadata_csum(sb)) { /* journal checksum v3 */ compat = 0; incompat = JBD2_FEATURE_INCOMPAT_CSUM_V3; @@ -4349,7 +4344,7 @@ static void ext4_set_def_opts(struct super_block *sb, if (ext4_has_feature_fast_commit(sb)) set_opt2(sb, JOURNAL_FAST_COMMIT); /* don't forget to enable journal_csum when metadata_csum is enabled. */ - if (ext4_has_metadata_csum(sb)) + if (ext4_has_feature_metadata_csum(sb)) set_opt(sb, JOURNAL_CHECKSUM); if ((def_mount_opts & EXT4_DEFM_JMODE) == EXT4_DEFM_JMODE_DATA) @@ -4441,13 +4436,16 @@ static int ext4_handle_clustersize(struct super_block *sb) /* * ext4_atomic_write_init: Initializes filesystem min & max atomic write units. + * With non-bigalloc filesystem awu will be based upon filesystem blocksize + * & bdev awu units. + * With bigalloc it will be based upon bigalloc cluster size & bdev awu units. 
* @sb: super block - * TODO: Later add support for bigalloc */ static void ext4_atomic_write_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct block_device *bdev = sb->s_bdev; + unsigned int clustersize = EXT4_CLUSTER_SIZE(sb); if (!bdev_can_atomic_write(bdev)) return; @@ -4457,7 +4455,7 @@ static void ext4_atomic_write_init(struct super_block *sb) sbi->s_awu_min = max(sb->s_blocksize, bdev_atomic_write_unit_min_bytes(bdev)); - sbi->s_awu_max = min(sb->s_blocksize, + sbi->s_awu_max = min(clustersize, bdev_atomic_write_unit_max_bytes(bdev)); if (sbi->s_awu_min && sbi->s_awu_max && sbi->s_awu_min <= sbi->s_awu_max) { @@ -4482,7 +4480,7 @@ static void ext4_fast_commit_init(struct super_block *sb) sbi->s_fc_bytes = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); sbi->s_fc_ineligible_tid = 0; - spin_lock_init(&sbi->s_fc_lock); + mutex_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; sbi->s_fc_replay_state.fc_regions_size = 0; @@ -4642,8 +4640,9 @@ static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_blo /* Precompute checksum seed for all metadata */ if (ext4_has_feature_csum_seed(sb)) sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); - else if (ext4_has_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) - sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, + else if (ext4_has_feature_metadata_csum(sb) || + ext4_has_feature_ea_inode(sb)) + sbi->s_csum_seed = ext4_chksum(~0, es->s_uuid, sizeof(es->s_uuid)); return 0; } @@ -4973,10 +4972,7 @@ static int ext4_load_and_init_journal(struct super_block *sb, return 0; out: - /* flush s_sb_upd_work before destroying the journal. */ - flush_work(&sbi->s_sb_upd_work); - jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; + ext4_journal_destroy(sbi, sbi->s_journal); return -EINVAL; } @@ -5013,6 +5009,24 @@ static int ext4_check_journal_data_mode(struct super_block *sb) return 0; } +static const char *ext4_has_journal_option(struct super_block *sb) +{ + struct ext4_sb_info *sbi = EXT4_SB(sb); + + if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) + return "journal_async_commit"; + if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) + return "journal_checksum"; + if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) + return "commit="; + if (EXT4_MOUNT_DATA_FLAGS & + (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) + return "data="; + if (test_opt(sb, DATA_ERR_ABORT)) + return "data_err=abort"; + return NULL; +} + static int ext4_load_super(struct super_block *sb, ext4_fsblk_t *lsb, int silent) { @@ -5239,7 +5253,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) /* Set defaults for the variables that will be set during parsing */ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) - ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sectors_written_start = @@ -5263,6 +5277,8 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) sbi->s_commit_interval = JBD2_DEFAULT_MAX_COMMIT_AGE * HZ; sbi->s_min_batch_time = EXT4_DEF_MIN_BATCH_TIME; sbi->s_max_batch_time = EXT4_DEF_MAX_BATCH_TIME; + sbi->s_sb_update_kb = EXT4_DEF_SB_UPDATE_INTERVAL_KB; + sbi->s_sb_update_sec = EXT4_DEF_SB_UPDATE_INTERVAL_SEC; /* * set default s_li_wait_mult for lazyinit, for the case there is @@ -5404,30 +5420,17 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) "suppressed and 
not mounted read-only"); goto failed_mount3a; } else { + const char *journal_option; + /* Nojournal mode, all journal mount options are illegal */ - if (test_opt(sb, JOURNAL_ASYNC_COMMIT)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "journal_async_commit, fs mounted w/o journal"); + journal_option = ext4_has_journal_option(sb); + if (journal_option != NULL) { + ext4_msg(sb, KERN_ERR, + "can't mount with %s, fs mounted w/o journal", + journal_option); goto failed_mount3a; } - if (test_opt2(sb, EXPLICIT_JOURNAL_CHECKSUM)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "journal_checksum, fs mounted w/o journal"); - goto failed_mount3a; - } - if (sbi->s_commit_interval != JBD2_DEFAULT_MAX_COMMIT_AGE*HZ) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "commit=%lu, fs mounted w/o journal", - sbi->s_commit_interval / HZ); - goto failed_mount3a; - } - if (EXT4_MOUNT_DATA_FLAGS & - (sbi->s_mount_opt ^ sbi->s_def_mount_opt)) { - ext4_msg(sb, KERN_ERR, "can't mount with " - "data=, fs mounted w/o journal"); - goto failed_mount3a; - } sbi->s_def_mount_opt &= ~EXT4_MOUNT_JOURNAL_CHECKSUM; clear_opt(sb, JOURNAL_CHECKSUM); clear_opt(sb, DATA_FLAGS); @@ -5616,9 +5619,11 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) goto failed_mount9; } - if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) + if (test_opt(sb, DISCARD) && !bdev_max_discard_sectors(sb->s_bdev)) { ext4_msg(sb, KERN_WARNING, "mounting with \"discard\" option, but the device does not support discard"); + clear_opt(sb, DISCARD); + } if (es->s_error_count) mod_timer(&sbi->s_err_report, jiffies + 300*HZ); /* 5 minutes */ @@ -5665,10 +5670,7 @@ failed_mount_wq: sbi->s_ea_block_cache = NULL; if (sbi->s_journal) { - /* flush s_sb_upd_work before journal destroy. */ - flush_work(&sbi->s_sb_upd_work); - jbd2_journal_destroy(sbi->s_journal); - sbi->s_journal = NULL; + ext4_journal_destroy(sbi, sbi->s_journal); } failed_mount3a: ext4_es_unregister_shrinker(sbi); @@ -5676,7 +5678,7 @@ failed_mount3: /* flush s_sb_upd_work before sbi destroy */ flush_work(&sbi->s_sb_upd_work); ext4_stop_mmpd(sbi); - del_timer_sync(&sbi->s_err_report); + timer_delete_sync(&sbi->s_err_report); ext4_group_desc_free(sbi); failed_mount: #if IS_ENABLED(CONFIG_UNICODE) @@ -5773,10 +5775,6 @@ static void ext4_init_journal_params(struct super_block *sb, journal_t *journal) journal->j_flags |= JBD2_BARRIER; else journal->j_flags &= ~JBD2_BARRIER; - if (test_opt(sb, DATA_ERR_ABORT)) - journal->j_flags |= JBD2_ABORT_ON_SYNCDATA_ERR; - else - journal->j_flags &= ~JBD2_ABORT_ON_SYNCDATA_ERR; /* * Always enable journal cycle record option, letting the journal * records log transactions continuously between each mount. 
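
[Editor's note on the hunk above] The no-journal validation in __ext4_fill_super() collapses four near-identical "can't mount with X, fs mounted w/o journal" branches into a single helper, ext4_has_journal_option(), which returns the name of the first journal-only option that was set so the caller prints one uniform message. A minimal user-space sketch of the same pattern; the option names and check order mirror the hunk, while struct mount_opts and its fields are hypothetical stand-ins for the sbi state the kernel consults:

#include <stdio.h>

/* Hypothetical stand-in for the mount state the real helper reads. */
struct mount_opts {
        int journal_async_commit;
        int explicit_journal_checksum;
        unsigned long commit_interval;  /* 0 means "left at default" */
        unsigned int data_mode_changed; /* non-default data= mode */
        int data_err_abort;
};

/* Return the first journal-only option that was set, or NULL. */
static const char *has_journal_option(const struct mount_opts *o)
{
        if (o->journal_async_commit)
                return "journal_async_commit";
        if (o->explicit_journal_checksum)
                return "journal_checksum";
        if (o->commit_interval)
                return "commit=";
        if (o->data_mode_changed)
                return "data=";
        if (o->data_err_abort)
                return "data_err=abort";
        return NULL;
}

int main(void)
{
        struct mount_opts o = { .commit_interval = 5 };
        const char *opt = has_journal_option(&o);

        if (opt)
                printf("can't mount with %s, fs mounted w/o journal\n", opt);
        return 0;
}

Returning a string instead of branching per option keeps the error path in one place, which is why the hunk can delete four ext4_msg() calls.
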
@@ -5916,7 +5914,7 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb, if ((le32_to_cpu(es->s_feature_ro_compat) & EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && - es->s_checksum != ext4_superblock_csum(sb, es)) { + es->s_checksum != ext4_superblock_csum(es)) { ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock"); errno = -EFSCORRUPTED; goto out_bh; @@ -5973,7 +5971,7 @@ static journal_t *ext4_open_dev_journal(struct super_block *sb, return journal; out_journal: - jbd2_journal_destroy(journal); + ext4_journal_destroy(EXT4_SB(sb), journal); out_bdev: bdev_fput(bdev_file); return ERR_PTR(errno); @@ -6090,8 +6088,7 @@ static int ext4_load_journal(struct super_block *sb, EXT4_SB(sb)->s_journal = journal; err = ext4_clear_journal_err(sb, es); if (err) { - EXT4_SB(sb)->s_journal = NULL; - jbd2_journal_destroy(journal); + ext4_journal_destroy(EXT4_SB(sb), journal); return err; } @@ -6109,7 +6106,7 @@ static int ext4_load_journal(struct super_block *sb, return 0; err_out: - jbd2_journal_destroy(journal); + ext4_journal_destroy(EXT4_SB(sb), journal); return err; } @@ -6336,8 +6333,9 @@ static int ext4_sync_fs(struct super_block *sb, int wait) bool needs_barrier = false; struct ext4_sb_info *sbi = EXT4_SB(sb); - if (unlikely(ext4_forced_shutdown(sb))) - return -EIO; + ret = ext4_emergency_state(sb); + if (unlikely(ret)) + return ret; trace_ext4_sync_fs(sb, wait); flush_workqueue(sbi->rsv_conversion_wq); @@ -6419,7 +6417,7 @@ out: */ static int ext4_unfreeze(struct super_block *sb) { - if (ext4_forced_shutdown(sb)) + if (ext4_emergency_state(sb)) return 0; if (EXT4_SB(sb)->s_journal) { @@ -6495,7 +6493,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) ctx->journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; else - ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; } @@ -6575,7 +6573,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) flush_work(&sbi->s_sb_upd_work); if ((bool)(fc->sb_flags & SB_RDONLY) != sb_rdonly(sb)) { - if (ext4_forced_shutdown(sb)) { + if (ext4_emergency_state(sb)) { err = -EROFS; goto restore_opts; } @@ -6780,6 +6778,7 @@ static int ext4_reconfigure(struct fs_context *fc) { struct super_block *sb = fc->root->d_sb; int ret; + bool old_ro = sb_rdonly(sb); fc->s_fs_info = EXT4_SB(sb); @@ -6791,9 +6790,9 @@ static int ext4_reconfigure(struct fs_context *fc) if (ret < 0) return ret; - ext4_msg(sb, KERN_INFO, "re-mounted %pU %s. Quota mode: %s.", - &sb->s_uuid, sb_rdonly(sb) ? "ro" : "r/w", - ext4_quota_mode(sb)); + ext4_msg(sb, KERN_INFO, "re-mounted %pU%s.", + &sb->s_uuid, + (old_ro != sb_rdonly(sb)) ? (sb_rdonly(sb) ? " ro" : " r/w") : ""); return 0; } @@ -6817,22 +6816,29 @@ static int ext4_statfs_project(struct super_block *sb, dquot->dq_dqb.dqb_bhardlimit); limit >>= sb->s_blocksize_bits; - if (limit && buf->f_blocks > limit) { + if (limit) { + uint64_t remaining = 0; + curblock = (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; - buf->f_blocks = limit; - buf->f_bfree = buf->f_bavail = - (buf->f_blocks > curblock) ? 
- (buf->f_blocks - curblock) : 0; + if (limit > curblock) + remaining = limit - curblock; + + buf->f_blocks = min(buf->f_blocks, limit); + buf->f_bfree = min(buf->f_bfree, remaining); + buf->f_bavail = min(buf->f_bavail, remaining); } limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, dquot->dq_dqb.dqb_ihardlimit); - if (limit && buf->f_files > limit) { - buf->f_files = limit; - buf->f_ffree = - (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? - (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + if (limit) { + uint64_t remaining = 0; + + if (limit > dquot->dq_dqb.dqb_curinodes) + remaining = limit - dquot->dq_dqb.dqb_curinodes; + + buf->f_files = min(buf->f_files, limit); + buf->f_ffree = min(buf->f_ffree, remaining); } spin_unlock(&dquot->dq_dqb_lock); @@ -6935,12 +6941,25 @@ static int ext4_release_dquot(struct dquot *dquot) { int ret, err; handle_t *handle; + bool freeze_protected = false; + + /* + * Trying to sb_start_intwrite() in a running transaction + * can result in a deadlock. Further, running transactions + * are already protected from freezing. + */ + if (!ext4_journal_current_handle()) { + sb_start_intwrite(dquot->dq_sb); + freeze_protected = true; + } handle = ext4_journal_start(dquot_to_inode(dquot), EXT4_HT_QUOTA, EXT4_QUOTA_DEL_BLOCKS(dquot->dq_sb)); if (IS_ERR(handle)) { /* Release dquot anyway to avoid endless cycle in dqput() */ dquot_release(dquot); + if (freeze_protected) + sb_end_intwrite(dquot->dq_sb); return PTR_ERR(handle); } ret = dquot_release(dquot); @@ -6951,6 +6970,10 @@ static int ext4_release_dquot(struct dquot *dquot) err = ext4_journal_stop(handle); if (!ret) ret = err; + + if (freeze_protected) + sb_end_intwrite(dquot->dq_sb); + return ret; } @@ -7288,7 +7311,7 @@ static ssize_t ext4_quota_write(struct super_block *sb, int type, } lock_buffer(bh); memcpy(bh->b_data+offset, data, len); - flush_dcache_page(bh->b_page); + flush_dcache_folio(bh->b_folio); unlock_buffer(bh); err = ext4_handle_dirty_metadata(handle, NULL, bh); brelse(bh); @@ -7381,12 +7404,9 @@ static struct file_system_type ext4_fs_type = { }; MODULE_ALIAS_FS("ext4"); -/* Shared across all ext4 file systems */ -wait_queue_head_t ext4__ioend_wq[EXT4_WQ_HASH_SZ]; - static int __init ext4_init_fs(void) { - int i, err; + int err; ratelimit_state_init(&ext4_mount_msg_ratelimit, 30 * HZ, 64); ext4_li_info = NULL; @@ -7394,9 +7414,6 @@ static int __init ext4_init_fs(void) /* Build-time check for flags consistency */ ext4_check_flag_values(); - for (i = 0; i < EXT4_WQ_HASH_SZ; i++) - init_waitqueue_head(&ext4__ioend_wq[i]); - err = ext4_init_es(); if (err) return err; diff --git a/fs/ext4/sysfs.c b/fs/ext4/sysfs.c index ddb54608ca2e..987bd00f916a 100644 --- a/fs/ext4/sysfs.c +++ b/fs/ext4/sysfs.c @@ -254,6 +254,8 @@ EXT4_ATTR(journal_task, 0444, journal_task); EXT4_RW_ATTR_SBI_UI(mb_prefetch, s_mb_prefetch); EXT4_RW_ATTR_SBI_UI(mb_prefetch_limit, s_mb_prefetch_limit); EXT4_RW_ATTR_SBI_UL(last_trim_minblks, s_last_trim_minblks); +EXT4_RW_ATTR_SBI_UI(sb_update_sec, s_sb_update_sec); +EXT4_RW_ATTR_SBI_UI(sb_update_kb, s_sb_update_kb); static unsigned int old_bump_val = 128; EXT4_ATTR_PTR(max_writeback_mb_bump, 0444, pointer_ui, &old_bump_val); @@ -305,6 +307,8 @@ static struct attribute *ext4_attrs[] = { ATTR_LIST(mb_prefetch), ATTR_LIST(mb_prefetch_limit), ATTR_LIST(last_trim_minblks), + ATTR_LIST(sb_update_sec), + ATTR_LIST(sb_update_kb), NULL, }; ATTRIBUTE_GROUPS(ext4); diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 7647e9f6e190..8d15acbacc20 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ 
-139,12 +139,12 @@ static __le32 ext4_xattr_block_csum(struct inode *inode, __u32 dummy_csum = 0; int offset = offsetof(struct ext4_xattr_header, h_checksum); - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr, + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); - csum = ext4_chksum(sbi, csum, (__u8 *)hdr, offset); - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); + csum = ext4_chksum(csum, (__u8 *)hdr, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); - csum = ext4_chksum(sbi, csum, (__u8 *)hdr + offset, + csum = ext4_chksum(csum, (__u8 *)hdr + offset, EXT4_BLOCK_SIZE(inode->i_sb) - offset); return cpu_to_le32(csum); @@ -156,7 +156,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode, struct ext4_xattr_header *hdr = BHDR(bh); int ret = 1; - if (ext4_has_metadata_csum(inode->i_sb)) { + if (ext4_has_feature_metadata_csum(inode->i_sb)) { lock_buffer(bh); ret = (hdr->h_checksum == ext4_xattr_block_csum(inode, bh->b_blocknr, hdr)); @@ -168,7 +168,7 @@ static int ext4_xattr_block_csum_verify(struct inode *inode, static void ext4_xattr_block_csum_set(struct inode *inode, struct buffer_head *bh) { - if (ext4_has_metadata_csum(inode->i_sb)) + if (ext4_has_feature_metadata_csum(inode->i_sb)) BHDR(bh)->h_checksum = ext4_xattr_block_csum(inode, bh->b_blocknr, BHDR(bh)); } @@ -308,7 +308,7 @@ __ext4_xattr_check_block(struct inode *inode, struct buffer_head *bh, __ext4_xattr_check_block((inode), (bh), __func__, __LINE__) -static inline int +int __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, void *end, const char *function, unsigned int line) { @@ -316,9 +316,6 @@ __xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, function, line); } -#define xattr_check_inode(inode, header, end) \ - __xattr_check_inode((inode), (header), (end), __func__, __LINE__) - static int xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry, void *end, int name_index, const char *name, int sorted) @@ -351,7 +348,7 @@ xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry, static u32 ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) { - return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size); + return ext4_chksum(sbi->s_csum_seed, buffer, size); } static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) @@ -649,10 +646,7 @@ ext4_xattr_ibody_get(struct inode *inode, int name_index, const char *name, return error; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); - end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - error = xattr_check_inode(inode, header, end); - if (error) - goto cleanup; + end = ITAIL(inode, raw_inode); entry = IFIRST(header); error = xattr_find_entry(inode, &entry, end, name_index, name, 0); if (error) @@ -783,7 +777,6 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) struct ext4_xattr_ibody_header *header; struct ext4_inode *raw_inode; struct ext4_iloc iloc; - void *end; int error; if (!ext4_test_inode_state(inode, EXT4_STATE_XATTR)) @@ -793,14 +786,9 @@ ext4_xattr_ibody_list(struct dentry *dentry, char *buffer, size_t buffer_size) return error; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); - end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - error = xattr_check_inode(inode, header, end); - if (error) - goto cleanup; error = 
ext4_xattr_list_entries(dentry, IFIRST(header), buffer, buffer_size); -cleanup: brelse(iloc.bh); return error; } @@ -868,7 +856,6 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) struct ext4_xattr_ibody_header *header; struct ext4_xattr_entry *entry; qsize_t ea_inode_refs = 0; - void *end; int ret; lockdep_assert_held_read(&EXT4_I(inode)->xattr_sem); @@ -879,10 +866,6 @@ int ext4_get_inode_usage(struct inode *inode, qsize_t *usage) goto out; raw_inode = ext4_raw_inode(&iloc); header = IHDR(inode, raw_inode); - end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; - ret = xattr_check_inode(inode, header, end); - if (ret) - goto out; for (entry = IFIRST(header); !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) @@ -1176,15 +1159,24 @@ ext4_xattr_inode_dec_ref_all(handle_t *handle, struct inode *parent, { struct inode *ea_inode; struct ext4_xattr_entry *entry; + struct ext4_iloc iloc; bool dirty = false; unsigned int ea_ino; int err; int credits; + void *end; + + if (block_csum) + end = (void *)bh->b_data + bh->b_size; + else { + ext4_get_inode_loc(parent, &iloc); + end = (void *)ext4_raw_inode(&iloc) + EXT4_SB(parent->i_sb)->s_inode_size; + } /* One credit for dec ref on ea_inode, one for orphan list addition, */ credits = 2 + extra_credits; - for (entry = first; !IS_LAST_ENTRY(entry); + for (entry = first; (void *)entry < end && !IS_LAST_ENTRY(entry); entry = EXT4_XATTR_NEXT(entry)) { if (!entry->e_value_inum) continue; @@ -2235,11 +2227,8 @@ int ext4_xattr_ibody_find(struct inode *inode, struct ext4_xattr_info *i, header = IHDR(inode, raw_inode); is->s.base = is->s.first = IFIRST(header); is->s.here = is->s.first; - is->s.end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + is->s.end = ITAIL(inode, raw_inode); if (ext4_test_inode_state(inode, EXT4_STATE_XATTR)) { - error = xattr_check_inode(inode, header, is->s.end); - if (error) - return error; /* Find the named attribute. 
*/ error = xattr_find_entry(inode, &is->s.here, is->s.end, i->name_index, i->name, 0); @@ -2786,14 +2775,10 @@ retry: */ base = IFIRST(header); - end = (void *)raw_inode + EXT4_SB(inode->i_sb)->s_inode_size; + end = ITAIL(inode, raw_inode); min_offs = end - base; total_ino = sizeof(struct ext4_xattr_ibody_header) + sizeof(u32); - error = xattr_check_inode(inode, header, end); - if (error) - goto cleanup; - ifree = ext4_xattr_free_space(base, &min_offs, base, &total_ino); if (ifree >= isize_diff) goto shift; diff --git a/fs/ext4/xattr.h b/fs/ext4/xattr.h index b25c2d7b5f99..1fedf44d4fb6 100644 --- a/fs/ext4/xattr.h +++ b/fs/ext4/xattr.h @@ -67,6 +67,9 @@ struct ext4_xattr_entry { ((void *)raw_inode + \ EXT4_GOOD_OLD_INODE_SIZE + \ EXT4_I(inode)->i_extra_isize)) +#define ITAIL(inode, raw_inode) \ + ((void *)(raw_inode) + \ + EXT4_SB((inode)->i_sb)->s_inode_size) #define IFIRST(hdr) ((struct ext4_xattr_entry *)((hdr)+1)) /* @@ -206,6 +209,13 @@ extern int ext4_xattr_ibody_set(handle_t *handle, struct inode *inode, extern struct mb_cache *ext4_xattr_create_cache(void); extern void ext4_xattr_destroy_cache(struct mb_cache *); +extern int +__xattr_check_inode(struct inode *inode, struct ext4_xattr_ibody_header *header, + void *end, const char *function, unsigned int line); + +#define xattr_check_inode(inode, header, end) \ + __xattr_check_inode((inode), (header), (end), __func__, __LINE__) + #ifdef CONFIG_EXT4_FS_SECURITY extern int ext4_init_security(handle_t *handle, struct inode *inode, struct inode *dir, const struct qstr *qstr); diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index efda9a022981..cf77987d0698 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -21,7 +21,7 @@ #include "iostat.h" #include <trace/events/f2fs.h> -#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) +#define DEFAULT_CHECKPOINT_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_RT, 3)) static struct kmem_cache *ino_entry_slab; struct kmem_cache *f2fs_inode_entry_slab; @@ -58,7 +58,7 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, bool is_meta) { struct address_space *mapping = META_MAPPING(sbi); - struct page *page; + struct folio *folio; struct f2fs_io_info fio = { .sbi = sbi, .type = META, @@ -74,37 +74,37 @@ static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, if (unlikely(!is_meta)) fio.op_flags &= ~REQ_META; repeat: - page = f2fs_grab_cache_page(mapping, index, false); - if (!page) { + folio = f2fs_grab_cache_folio(mapping, index, false); + if (IS_ERR(folio)) { cond_resched(); goto repeat; } - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) goto out; - fio.page = page; + fio.page = &folio->page; err = f2fs_submit_page_bio(&fio); if (err) { - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return ERR_PTR(err); } f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, F2FS_BLKSIZE); - lock_page(page); - if (unlikely(page->mapping != mapping)) { - f2fs_put_page(page, 1); + folio_lock(folio); + if (unlikely(folio->mapping != mapping)) { + f2fs_folio_put(folio, true); goto repeat; } - if (unlikely(!PageUptodate(page))) { - f2fs_handle_page_eio(sbi, page_folio(page), META); - f2fs_put_page(page, 1); + if (unlikely(!folio_test_uptodate(folio))) { + f2fs_handle_page_eio(sbi, folio, META); + f2fs_folio_put(folio, true); return ERR_PTR(-EIO); } out: - return page; + return &folio->page; } struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) @@ -381,12 +381,6 @@ redirty_out: return AOP_WRITEPAGE_ACTIVATE; } 
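With f2fs's meta mapping converted to folios above, the next hunk also retires the single-page ->writepage entry point: meta writeback is now reachable only through ->writepages. Anything that needs one meta page flushed can use the generic filemap API instead; a minimal sketch using standard mm helpers (the wrapper name is hypothetical, not part of this patch):

	/* Flush one page worth of a mapping through ->writepages. */
	static int flush_one_page(struct address_space *mapping, pgoff_t index)
	{
		loff_t pos = (loff_t)index << PAGE_SHIFT;

		return filemap_fdatawrite_range(mapping, pos, pos + PAGE_SIZE - 1);
	}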
-static int f2fs_write_meta_page(struct page *page, - struct writeback_control *wbc) -{ - return __f2fs_write_meta_page(page, wbc, FS_META_IO); -} - static int f2fs_write_meta_pages(struct address_space *mapping, struct writeback_control *wbc) { @@ -507,7 +501,6 @@ static bool f2fs_dirty_meta_folio(struct address_space *mapping, } const struct address_space_operations f2fs_meta_aops = { - .writepage = f2fs_write_meta_page, .writepages = f2fs_write_meta_pages, .dirty_folio = f2fs_dirty_meta_folio, .invalidate_folio = f2fs_invalidate_folio, @@ -1237,7 +1230,7 @@ static int block_operations(struct f2fs_sb_info *sbi) retry_flush_quotas: f2fs_lock_all(sbi); if (__need_flush_quota(sbi)) { - int locked; + bool need_lock = sbi->umount_lock_holder != current; if (++cnt > DEFAULT_RETRY_QUOTA_FLUSH_COUNT) { set_sbi_flag(sbi, SBI_QUOTA_SKIP_FLUSH); @@ -1246,11 +1239,13 @@ retry_flush_quotas: } f2fs_unlock_all(sbi); - /* only failed during mount/umount/freeze/quotactl */ - locked = down_read_trylock(&sbi->sb->s_umount); - f2fs_quota_sync(sbi->sb, -1); - if (locked) + /* don't grab s_umount lock during mount/umount/remount/freeze/quotactl */ + if (!need_lock) { + f2fs_do_quota_sync(sbi->sb, -1); + } else if (down_read_trylock(&sbi->sb->s_umount)) { + f2fs_do_quota_sync(sbi->sb, -1); up_read(&sbi->sb->s_umount); + } cond_resched(); goto retry_flush_quotas; } @@ -1344,21 +1339,13 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); unsigned long flags; - if (cpc->reason & CP_UMOUNT) { - if (le32_to_cpu(ckpt->cp_pack_total_block_count) + - NM_I(sbi)->nat_bits_blocks > BLKS_PER_SEG(sbi)) { - clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); - f2fs_notice(sbi, "Disable nat_bits due to no space"); - } else if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG) && - f2fs_nat_bitmap_enabled(sbi)) { - f2fs_enable_nat_bits(sbi); - set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); - f2fs_notice(sbi, "Rebuild and enable nat_bits"); - } - } - spin_lock_irqsave(&sbi->cp_lock, flags); + if ((cpc->reason & CP_UMOUNT) && + le32_to_cpu(ckpt->cp_pack_total_block_count) > + sbi->blocks_per_seg - NM_I(sbi)->nat_bits_blocks) + disable_nat_bits(sbi, false); + if (cpc->reason & CP_TRIMMED) __set_ckpt_flags(ckpt, CP_TRIMMED_FLAG); else @@ -1541,8 +1528,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) start_blk = __start_cp_next_addr(sbi); /* write nat bits */ - if ((cpc->reason & CP_UMOUNT) && - is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) { + if (enabled_nat_bits(sbi, cpc)) { __u64 cp_ver = cur_cp_version(ckpt); block_t blk; @@ -1867,7 +1853,8 @@ int f2fs_issue_checkpoint(struct f2fs_sb_info *sbi) struct cp_control cpc; cpc.reason = __get_cp_reason(sbi); - if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC) { + if (!test_opt(sbi, MERGE_CHECKPOINT) || cpc.reason != CP_SYNC || + sbi->umount_lock_holder == current) { int ret; f2fs_down_write(&sbi->gc_lock); diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 985690d81a82..9b94810675c1 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -1150,6 +1150,7 @@ retry: f2fs_compress_ctx_add_page(cc, page_folio(page)); if (!PageUptodate(page)) { + f2fs_handle_page_eio(sbi, page_folio(page), DATA); release_and_retry: f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i + 1); diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index de4da6d9cd93..54f89f0ee69b 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -319,8 +319,7 @@ static void f2fs_read_end_io(struct bio *bio) static void 
f2fs_write_end_io(struct bio *bio) { struct f2fs_sb_info *sbi; - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + struct folio_iter fi; iostat_update_and_unbind_ctx(bio); sbi = bio->bi_private; @@ -328,34 +327,41 @@ static void f2fs_write_end_io(struct bio *bio) if (time_to_inject(sbi, FAULT_WRITE_IO)) bio->bi_status = BLK_STS_IOERR; - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - enum count_type type = WB_DATA_TYPE(page, false); + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; + enum count_type type; + + if (fscrypt_is_bounce_folio(folio)) { + struct folio *io_folio = folio; - fscrypt_finalize_bounce_page(&page); + folio = fscrypt_pagecache_folio(io_folio); + fscrypt_free_bounce_page(&io_folio->page); + } #ifdef CONFIG_F2FS_FS_COMPRESSION - if (f2fs_is_compressed_page(page)) { - f2fs_compress_write_end_io(bio, page); + if (f2fs_is_compressed_page(&folio->page)) { + f2fs_compress_write_end_io(bio, &folio->page); continue; } #endif + type = WB_DATA_TYPE(&folio->page, false); + if (unlikely(bio->bi_status)) { - mapping_set_error(page->mapping, -EIO); + mapping_set_error(folio->mapping, -EIO); if (type == F2FS_WB_CP_DATA) f2fs_stop_checkpoint(sbi, true, STOP_CP_REASON_WRITE_FAIL); } - f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) && - page_folio(page)->index != nid_of_node(page)); + f2fs_bug_on(sbi, folio->mapping == NODE_MAPPING(sbi) && + folio->index != nid_of_node(&folio->page)); dec_page_count(sbi, type); - if (f2fs_in_warm_node_list(sbi, page)) - f2fs_del_fsync_node_entry(sbi, page); - clear_page_private_gcing(page); - end_page_writeback(page); + if (f2fs_in_warm_node_list(sbi, folio)) + f2fs_del_fsync_node_entry(sbi, &folio->page); + clear_page_private_gcing(&folio->page); + folio_end_writeback(folio); } if (!get_pages(sbi, F2FS_WB_CP_DATA) && wq_has_sleeper(&sbi->cp_wait)) @@ -413,6 +419,7 @@ int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr) static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) { unsigned int temp_mask = GENMASK(NR_TEMP_TYPE - 1, 0); + struct folio *fio_folio = page_folio(fio->page); unsigned int fua_flag, meta_flag, io_flag; blk_opf_t op_flags = 0; @@ -438,6 +445,11 @@ static blk_opf_t f2fs_io_flags(struct f2fs_io_info *fio) op_flags |= REQ_META; if (BIT(fio->temp) & fua_flag) op_flags |= REQ_FUA; + + if (fio->type == DATA && + F2FS_I(fio_folio->mapping->host)->ioprio_hint == F2FS_IOPRIO_WRITE) + op_flags |= REQ_PRIO; + return op_flags; } @@ -876,6 +888,7 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) struct bio *bio = *fio->bio; struct page *page = fio->encrypted_page ? fio->encrypted_page : fio->page; + struct folio *folio = page_folio(fio->page); if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr, __is_meta_io(fio) ? 
META_GENERIC : DATA_GENERIC)) @@ -889,8 +902,8 @@ int f2fs_merge_page_bio(struct f2fs_io_info *fio) alloc_new: if (!bio) { bio = __bio_alloc(fio, BIO_MAX_VECS); - f2fs_set_bio_crypt_ctx(bio, fio->page->mapping->host, - page_folio(fio->page)->index, fio, GFP_NOIO); + f2fs_set_bio_crypt_ctx(bio, folio->mapping->host, + folio->index, fio, GFP_NOIO); add_bio_entry(fio->sbi, bio, page, fio->temp); } else { @@ -899,8 +912,7 @@ alloc_new: } if (fio->io_wbc) - wbc_account_cgroup_owner(fio->io_wbc, page_folio(fio->page), - PAGE_SIZE); + wbc_account_cgroup_owner(fio->io_wbc, folio, folio_size(folio)); inc_page_count(fio->sbi, WB_DATA_TYPE(page, false)); @@ -1041,8 +1053,6 @@ static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr, bio = bio_alloc_bioset(bdev, bio_max_segs(nr_pages), REQ_OP_READ | op_flag, for_write ? GFP_NOIO : GFP_KERNEL, &f2fs_bioset); - if (!bio) - return ERR_PTR(-ENOMEM); bio->bi_iter.bi_sector = sector; f2fs_set_bio_crypt_ctx(bio, inode, first_idx, NULL, GFP_NOFS); bio->bi_end_io = f2fs_read_end_io; @@ -1193,18 +1203,17 @@ int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) return err; } -struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - blk_opf_t op_flags, bool for_write, - pgoff_t *next_pgofs) +struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index, + blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; struct dnode_of_data dn; - struct page *page; + struct folio *folio; int err; - page = f2fs_grab_cache_page(mapping, index, for_write); - if (!page) - return ERR_PTR(-ENOMEM); + folio = f2fs_grab_cache_folio(mapping, index, for_write); + if (IS_ERR(folio)) + return folio; if (f2fs_lookup_read_extent_cache_block(inode, index, &dn.data_blkaddr)) { @@ -1239,9 +1248,9 @@ struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, goto put_err; } got_it: - if (PageUptodate(page)) { - unlock_page(page); - return page; + if (folio_test_uptodate(folio)) { + folio_unlock(folio); + return folio; } /* @@ -1252,48 +1261,51 @@ got_it: * f2fs_init_inode_metadata. 
*/ if (dn.data_blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_SIZE); - if (!PageUptodate(page)) - SetPageUptodate(page); - unlock_page(page); - return page; + folio_zero_segment(folio, 0, folio_size(folio)); + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + folio_unlock(folio); + return folio; } - err = f2fs_submit_page_read(inode, page_folio(page), dn.data_blkaddr, + err = f2fs_submit_page_read(inode, folio, dn.data_blkaddr, op_flags, for_write); if (err) goto put_err; - return page; + return folio; put_err: - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return ERR_PTR(err); } -struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, +struct folio *f2fs_find_data_folio(struct inode *inode, pgoff_t index, pgoff_t *next_pgofs) { struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; - page = find_get_page_flags(mapping, index, FGP_ACCESSED); - if (page && PageUptodate(page)) - return page; - f2fs_put_page(page, 0); + folio = __filemap_get_folio(mapping, index, FGP_ACCESSED, 0); + if (IS_ERR(folio)) + goto read; + if (folio_test_uptodate(folio)) + return folio; + f2fs_folio_put(folio, false); - page = f2fs_get_read_data_page(inode, index, 0, false, next_pgofs); - if (IS_ERR(page)) - return page; +read: + folio = f2fs_get_read_data_folio(inode, index, 0, false, next_pgofs); + if (IS_ERR(folio)) + return folio; - if (PageUptodate(page)) - return page; + if (folio_test_uptodate(folio)) + return folio; - wait_on_page_locked(page); - if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 0); + folio_wait_locked(folio); + if (unlikely(!folio_test_uptodate(folio))) { + f2fs_folio_put(folio, false); return ERR_PTR(-EIO); } - return page; + return folio; } /* @@ -1301,23 +1313,23 @@ struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, * Because, the callers, functions in dir.c and GC, should be able to know * whether this page exists or not. 
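 * A missing page is reported as ERR_PTR(-ENOENT), which those callers
 * can tell apart from a real I/O failure.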
*/ -struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, +struct folio *f2fs_get_lock_data_folio(struct inode *inode, pgoff_t index, bool for_write) { struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; - page = f2fs_get_read_data_page(inode, index, 0, for_write, NULL); - if (IS_ERR(page)) - return page; + folio = f2fs_get_read_data_folio(inode, index, 0, for_write, NULL); + if (IS_ERR(folio)) + return folio; /* wait for read completion */ - lock_page(page); - if (unlikely(page->mapping != mapping || !PageUptodate(page))) { - f2fs_put_page(page, 1); + folio_lock(folio); + if (unlikely(folio->mapping != mapping || !folio_test_uptodate(folio))) { + f2fs_folio_put(folio, true); return ERR_PTR(-EIO); } - return page; + return folio; } /* @@ -2178,6 +2190,12 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, int i; int ret = 0; + if (unlikely(f2fs_cp_error(sbi))) { + ret = -EIO; + from_dnode = false; + goto out_put_dnode; + } + f2fs_bug_on(sbi, f2fs_cluster_is_empty(cc)); last_block_in_file = F2FS_BYTES_TO_BLK(f2fs_readpage_limit(inode) + @@ -2221,10 +2239,6 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, if (ret) goto out; - if (unlikely(f2fs_cp_error(sbi))) { - ret = -EIO; - goto out_put_dnode; - } f2fs_bug_on(sbi, dn.data_blkaddr != COMPRESS_ADDR); skip_reading_dnode: @@ -2500,7 +2514,7 @@ int f2fs_encrypt_one_page(struct f2fs_io_info *fio) return 0; retry_encrypt: - fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page, + fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(page_folio(page), PAGE_SIZE, 0, gfp_flags); if (IS_ERR(fio->encrypted_page)) { /* flush pending IOs and wait for a while in the ENOMEM case */ @@ -2921,29 +2935,6 @@ redirty_out: return err; } -static int f2fs_write_data_page(struct page *page, - struct writeback_control *wbc) -{ - struct folio *folio = page_folio(page); -#ifdef CONFIG_F2FS_FS_COMPRESSION - struct inode *inode = folio->mapping->host; - - if (unlikely(f2fs_cp_error(F2FS_I_SB(inode)))) - goto out; - - if (f2fs_compressed_file(inode)) { - if (f2fs_is_compressed_cluster(inode, folio->index)) { - folio_redirty_for_writepage(wbc, folio); - return AOP_WRITEPAGE_ACTIVATE; - } - } -out: -#endif - - return f2fs_write_single_data_page(folio, NULL, NULL, NULL, - wbc, FS_DATA_IO, 0, true); -} - /* * This function was copied from write_cache_pages from mm/page-writeback.c. 
* The major change is making write step of cold data page separately from @@ -3266,10 +3257,6 @@ static int __f2fs_write_data_pages(struct address_space *mapping, int ret; bool locked = false; - /* deal with chardevs and other special file */ - if (!mapping->a_ops->writepage) - return 0; - /* skip writing if there is no dirty page in this inode */ if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE) return 0; @@ -3390,7 +3377,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, restart: /* check inline_data */ - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto unlock_out; @@ -3453,7 +3440,7 @@ static int __find_data_block(struct inode *inode, pgoff_t index, struct page *ipage; int err = 0; - ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -3483,7 +3470,7 @@ static int __reserve_data_block(struct inode *inode, pgoff_t index, f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto unlock_out; @@ -4101,7 +4088,6 @@ static void f2fs_swap_deactivate(struct file *file) const struct address_space_operations f2fs_dblock_aops = { .read_folio = f2fs_read_data_folio, .readahead = f2fs_readahead, - .writepage = f2fs_write_data_page, .writepages = f2fs_write_data_pages, .write_begin = f2fs_write_begin, .write_end = f2fs_write_end, @@ -4195,7 +4181,13 @@ static int f2fs_iomap_begin(struct inode *inode, loff_t offset, loff_t length, map.m_next_pgofs = &next_pgofs; map.m_seg_type = f2fs_rw_hint_to_seg_type(F2FS_I_SB(inode), inode->i_write_hint); - if (flags & IOMAP_WRITE) + + /* + * If the blocks being overwritten are already allocated, + * f2fs_map_lock and f2fs_balance_fs are not necessary. 
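+ * f2fs_overwrite_io() reports true only when every block in the range is
+ * already mapped on disk, so pure overwrites skip both of them.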
+ */ + if ((flags & IOMAP_WRITE) && + !f2fs_overwrite_io(inode, offset, length)) map.m_may_create = true; err = f2fs_map_blocks(inode, &map, F2FS_GET_BLOCK_DIO); diff --git a/fs/f2fs/debug.c b/fs/f2fs/debug.c index 468828288a4a..16c2dfb4f595 100644 --- a/fs/f2fs/debug.c +++ b/fs/f2fs/debug.c @@ -164,6 +164,7 @@ static void update_general_status(struct f2fs_sb_info *sbi) si->ndirty_imeta = get_pages(sbi, F2FS_DIRTY_IMETA); si->ndirty_dirs = sbi->ndirty_inode[DIR_INODE]; si->ndirty_files = sbi->ndirty_inode[FILE_INODE]; + si->ndonate_files = sbi->donate_files; si->nquota_files = sbi->nquota_files; si->ndirty_all = sbi->ndirty_inode[DIRTY_META]; si->aw_cnt = atomic_read(&sbi->atomic_files); @@ -501,6 +502,8 @@ static int stat_show(struct seq_file *s, void *v) si->compr_inode, si->compr_blocks); seq_printf(s, " - Swapfile Inode: %u\n", si->swapfile_inode); + seq_printf(s, " - Donate Inode: %u\n", + si->ndonate_files); seq_printf(s, " - Orphan/Append/Update Inode: %u, %u, %u\n", si->orphans, si->append, si->update); seq_printf(s, "\nMain area: %d segs, %d secs %d zones\n", diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 54dd52de7269..5a63ff0df03b 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -551,7 +551,7 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, goto put_error; } } else { - page = f2fs_get_node_page(F2FS_I_SB(dir), inode->i_ino); + page = f2fs_get_inode_page(F2FS_I_SB(dir), inode->i_ino); if (IS_ERR(page)) return page; } diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index 1afa7be16e7d..f1576dc6ec67 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -62,6 +62,7 @@ enum { FAULT_BLKADDR_VALIDITY, FAULT_BLKADDR_CONSISTENCE, FAULT_NO_SEGMENT, + FAULT_INCONSISTENT_FOOTER, FAULT_MAX, }; @@ -114,6 +115,13 @@ extern const char *f2fs_fault_name[FAULT_MAX]; #define F2FS_MOUNT_GC_MERGE 0x02000000 #define F2FS_MOUNT_COMPRESS_CACHE 0x04000000 #define F2FS_MOUNT_AGE_EXTENT_CACHE 0x08000000 +#define F2FS_MOUNT_NAT_BITS 0x10000000 +#define F2FS_MOUNT_INLINECRYPT 0x20000000 +/* + * Some f2fs environments expect to be able to pass the "lazytime" option + * string rather than using the MS_LAZYTIME flag, so this must remain. 
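+ * (e.g. parsing "-o lazytime" from the option string sets this flag).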
+ */ +#define F2FS_MOUNT_LAZYTIME 0x40000000 #define F2FS_OPTION(sbi) ((sbi)->mount_opt) #define clear_opt(sbi, option) (F2FS_OPTION(sbi).opt &= ~F2FS_MOUNT_##option) @@ -830,6 +838,7 @@ struct f2fs_inode_info { /* Use below internally in f2fs */ unsigned long flags[BITS_TO_LONGS(FI_MAX)]; /* use to pass per-file flags */ + unsigned int ioprio_hint; /* hint for IO priority */ struct f2fs_rwsem i_sem; /* protect fi info */ atomic_t dirty_pages; /* # of dirty pages */ f2fs_hash_t chash; /* hash value of given file name */ @@ -849,6 +858,11 @@ #endif struct list_head dirty_list; /* dirty list for dirs and files */ struct list_head gdirty_list; /* linked in global dirty list */ + + /* linked in global inode list for cache donation */ + struct list_head gdonate_list; + pgoff_t donate_start, donate_end; /* inclusive */ + struct task_struct *atomic_write_task; /* store atomic write task */ struct extent_tree *extent_tree[NR_EXTENT_CACHES]; /* cached extent_tree entry */ @@ -1273,6 +1287,7 @@ enum inode_type { DIR_INODE, /* for dirty dir inode */ FILE_INODE, /* for dirty regular/symlink inode */ DIRTY_META, /* for all dirtied inode metadata */ + DONATE_INODE, /* for all inodes donating pages */ NR_INODE_TYPE, }; @@ -1628,6 +1643,9 @@ struct f2fs_sb_info { unsigned int warm_data_age_threshold; unsigned int last_age_weight; + /* control donate caches */ + unsigned int donate_files; + /* basic filesystem units */ unsigned int log_sectors_per_block; /* log2 sectors per block */ unsigned int log_blocksize; /* log2 block size */ @@ -1659,6 +1677,7 @@ unsigned int nquota_files; /* # of quota sysfile */ struct f2fs_rwsem quota_sem; /* blocking cp for flags */ + struct task_struct *umount_lock_holder; /* s_umount lock holder */ /* # of pages, see count_type */ atomic_t nr_pages[NR_COUNT_TYPE]; @@ -1800,6 +1819,9 @@ u64 committed_atomic_block; u64 revoked_atomic_block; + /* carve out reserved_blocks from total blocks */ + bool carve_out; + #ifdef CONFIG_F2FS_FS_COMPRESSION struct kmem_cache *page_array_slab; /* page array entry */ unsigned int page_array_slab_size; /* default page array slab size */ @@ -2015,7 +2037,7 @@ static inline struct f2fs_checkpoint *F2FS_CKPT(struct f2fs_sb_info *sbi) { return (struct f2fs_checkpoint *)(sbi->ckpt); } -static inline struct f2fs_node *F2FS_NODE(struct page *page) +static inline struct f2fs_node *F2FS_NODE(const struct page *page) { return (struct f2fs_node *)page_address(page); } @@ -2219,6 +2241,36 @@ static inline void f2fs_up_write(struct f2fs_rwsem *sem) { #endif } +static inline void disable_nat_bits(struct f2fs_sb_info *sbi, bool lock) +{ + unsigned long flags; + unsigned char *nat_bits; + + /* + * Re-enabling nat_bits would require asking fsck.f2fs to rebuild them + * via set_sbi_flag(sbi, SBI_NEED_FSCK), but that is costly, so rely on + * a regular fsck or an unclean shutdown instead. + */ + + if (lock) + spin_lock_irqsave(&sbi->cp_lock, flags); + __clear_ckpt_flags(F2FS_CKPT(sbi), CP_NAT_BITS_FLAG); + nat_bits = NM_I(sbi)->nat_bits; + NM_I(sbi)->nat_bits = NULL; + if (lock) + spin_unlock_irqrestore(&sbi->cp_lock, flags); + + kvfree(nat_bits); +} + +static inline bool enabled_nat_bits(struct f2fs_sb_info *sbi, + struct cp_control *cpc) +{ + bool set = is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG); + + return (cpc) ? 
(cpc->reason & CP_UMOUNT) && set : set; +} + static inline void f2fs_lock_op(struct f2fs_sb_info *sbi) { f2fs_down_read(&sbi->cp_rwsem); @@ -2765,33 +2817,46 @@ static inline s64 valid_inode_count(struct f2fs_sb_info *sbi) return percpu_counter_sum_positive(&sbi->total_valid_inode_count); } -static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, - pgoff_t index, bool for_write) +static inline struct folio *f2fs_grab_cache_folio(struct address_space *mapping, + pgoff_t index, bool for_write) { - struct page *page; + struct folio *folio; unsigned int flags; if (IS_ENABLED(CONFIG_F2FS_FAULT_INJECTION)) { + fgf_t fgf_flags; + if (!for_write) - page = find_get_page_flags(mapping, index, - FGP_LOCK | FGP_ACCESSED); + fgf_flags = FGP_LOCK | FGP_ACCESSED; else - page = find_lock_page(mapping, index); - if (page) - return page; + fgf_flags = FGP_LOCK; + folio = __filemap_get_folio(mapping, index, fgf_flags, 0); + if (!IS_ERR(folio)) + return folio; if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_ALLOC)) - return NULL; + return ERR_PTR(-ENOMEM); } if (!for_write) - return grab_cache_page(mapping, index); + return filemap_grab_folio(mapping, index); flags = memalloc_nofs_save(); - page = grab_cache_page_write_begin(mapping, index); + folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, + mapping_gfp_mask(mapping)); memalloc_nofs_restore(flags); - return page; + return folio; +} + +static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, + pgoff_t index, bool for_write) +{ + struct folio *folio = f2fs_grab_cache_folio(mapping, index, for_write); + + if (IS_ERR(folio)) + return NULL; + return &folio->page; } static inline struct page *f2fs_pagecache_get_page( @@ -2804,16 +2869,23 @@ static inline struct page *f2fs_pagecache_get_page( return pagecache_get_page(mapping, index, fgp_flags, gfp_mask); } -static inline void f2fs_put_page(struct page *page, int unlock) +static inline void f2fs_folio_put(struct folio *folio, bool unlock) { - if (!page) + if (!folio) return; if (unlock) { - f2fs_bug_on(F2FS_P_SB(page), !PageLocked(page)); - unlock_page(page); + f2fs_bug_on(F2FS_F_SB(folio), !folio_test_locked(folio)); + folio_unlock(folio); } - put_page(page); + folio_put(folio); +} + +static inline void f2fs_put_page(struct page *page, int unlock) +{ + if (!page) + return; + f2fs_folio_put(page_folio(page), unlock); } static inline void f2fs_put_dnode(struct dnode_of_data *dn) @@ -3624,7 +3696,7 @@ int f2fs_inode_dirtied(struct inode *inode, bool sync); void f2fs_inode_synced(struct inode *inode); int f2fs_dquot_initialize(struct inode *inode); int f2fs_enable_quota_files(struct f2fs_sb_info *sbi, bool rdonly); -int f2fs_quota_sync(struct super_block *sb, int type); +int f2fs_do_quota_sync(struct super_block *sb, int type); loff_t max_file_blocks(struct inode *inode); void f2fs_quota_off_umount(struct super_block *sb); void f2fs_save_errors(struct f2fs_sb_info *sbi, unsigned char flag); @@ -3647,7 +3719,8 @@ struct node_info; int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type); -bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page); +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, + const struct folio *folio); void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi); void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page); void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi); @@ -3662,12 +3735,14 @@ int 
f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from); int f2fs_truncate_xattr_node(struct inode *inode); int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, unsigned int seq_id); -bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi); int f2fs_remove_inode_page(struct inode *inode); struct page *f2fs_new_inode_page(struct inode *inode); struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs); void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); +struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino); +struct page *f2fs_get_inode_page(struct f2fs_sb_info *sbi, pgoff_t ino); +struct page *f2fs_get_xnode_page(struct f2fs_sb_info *sbi, pgoff_t xnid); struct page *f2fs_get_node_page_ra(struct page *parent, int start); int f2fs_move_node_page(struct page *node_page, int gc_type); void f2fs_flush_inline_data(struct f2fs_sb_info *sbi); @@ -3687,7 +3762,6 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page); int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, unsigned int segno, struct f2fs_summary_block *sum); -void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi); int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc); int f2fs_build_node_manager(struct f2fs_sb_info *sbi); void f2fs_destroy_node_manager(struct f2fs_sb_info *sbi); @@ -3758,8 +3832,10 @@ int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, struct f2fs_io_info *fio); void f2fs_update_device_state(struct f2fs_sb_info *sbi, nid_t ino, block_t blkaddr, unsigned int blkcnt); -void f2fs_wait_on_page_writeback(struct page *page, - enum page_type type, bool ordered, bool locked); +void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type, + bool ordered, bool locked); +#define f2fs_wait_on_page_writeback(page, type, ordered, locked) \ + f2fs_folio_wait_writeback(page_folio(page), type, ordered, locked) void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr); void f2fs_wait_on_block_writeback_range(struct inode *inode, block_t blkaddr, block_t len); @@ -3871,11 +3947,11 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count); int f2fs_reserve_new_block(struct dnode_of_data *dn); int f2fs_get_block_locked(struct dnode_of_data *dn, pgoff_t index); int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index); -struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index, - blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs); -struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index, - pgoff_t *next_pgofs); -struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index, +struct folio *f2fs_get_read_data_folio(struct inode *inode, pgoff_t index, + blk_opf_t op_flags, bool for_write, pgoff_t *next_pgofs); +struct folio *f2fs_find_data_folio(struct inode *inode, pgoff_t index, + pgoff_t *next_pgofs); +struct folio *f2fs_get_lock_data_folio(struct inode *inode, pgoff_t index, bool for_write); struct page *f2fs_get_new_data_page(struct inode *inode, struct page *ipage, pgoff_t index, bool new_i_size); @@ -3902,6 +3978,22 @@ int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi); void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); extern const struct iomap_ops f2fs_iomap_ops; +static inline struct page *f2fs_find_data_page(struct inode *inode, + pgoff_t index, pgoff_t *next_pgofs) +{ + 
struct folio *folio = f2fs_find_data_folio(inode, index, next_pgofs); + + return &folio->page; +} + +static inline struct page *f2fs_get_lock_data_page(struct inode *inode, + pgoff_t index, bool for_write) +{ + struct folio *folio = f2fs_get_lock_data_folio(inode, index, for_write); + + return &folio->page; +} + /* * gc.c */ @@ -3966,7 +4058,8 @@ struct f2fs_stat_info { unsigned long long allocated_data_blocks; int ndirty_node, ndirty_dent, ndirty_meta, ndirty_imeta; int ndirty_data, ndirty_qdata; - unsigned int ndirty_dirs, ndirty_files, nquota_files, ndirty_all; + unsigned int ndirty_dirs, ndirty_files, ndirty_all; + unsigned int nquota_files, ndonate_files; int nats, dirty_nats, sits, dirty_sits; int free_nids, avail_nids, alloc_nids; int total_count, utilization; @@ -4231,6 +4324,8 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, struct shrink_control *sc); unsigned long f2fs_shrink_scan(struct shrinker *shrink, struct shrink_control *sc); +unsigned int f2fs_donate_files(void); +void f2fs_reclaim_caches(unsigned int reclaim_caches_kb); void f2fs_join_shrinker(struct f2fs_sb_info *sbi); void f2fs_leave_shrinker(struct f2fs_sb_info *sbi); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index f92a9fba9991..abbcbb5865a3 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -707,31 +707,33 @@ static int truncate_partial_data_page(struct inode *inode, u64 from, loff_t offset = from & (PAGE_SIZE - 1); pgoff_t index = from >> PAGE_SHIFT; struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; if (!offset && !cache_only) return 0; if (cache_only) { - page = find_lock_page(mapping, index); - if (page && PageUptodate(page)) + folio = filemap_lock_folio(mapping, index); + if (IS_ERR(folio)) + return 0; + if (folio_test_uptodate(folio)) goto truncate_out; - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return 0; } - page = f2fs_get_lock_data_page(inode, index, true); - if (IS_ERR(page)) - return PTR_ERR(page) == -ENOENT ? 0 : PTR_ERR(page); + folio = f2fs_get_lock_data_folio(inode, index, true); + if (IS_ERR(folio)) + return PTR_ERR(folio) == -ENOENT ? 0 : PTR_ERR(folio); truncate_out: - f2fs_wait_on_page_writeback(page, DATA, true, true); - zero_user(page, offset, PAGE_SIZE - offset); + f2fs_folio_wait_writeback(folio, DATA, true, true); + folio_zero_segment(folio, offset, folio_size(folio)); /* An encrypted inode should have a key and truncate the last page. */ f2fs_bug_on(F2FS_I_SB(inode), cache_only && IS_ENCRYPTED(inode)); if (!cache_only) - set_page_dirty(page); - f2fs_put_page(page, 1); + folio_mark_dirty(folio); + f2fs_folio_put(folio, true); return 0; } @@ -759,7 +761,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) if (lock) f2fs_lock_op(sbi); - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto out; @@ -1834,18 +1836,32 @@ static int f2fs_expand_inode_data(struct inode *inode, loff_t offset, map.m_len = sec_blks; next_alloc: + f2fs_down_write(&sbi->pin_sem); + + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) { + if (has_not_enough_free_secs(sbi, 0, 0)) { + f2fs_up_write(&sbi->pin_sem); + err = -ENOSPC; + f2fs_warn_ratelimited(sbi, + "ino:%lu, start:%lu, end:%lu, need to trigger GC to " + "reclaim enough free segment when checkpoint is enabled", + inode->i_ino, pg_start, pg_end); + goto out_err; + } + } + if (has_not_enough_free_secs(sbi, 0, f2fs_sb_has_blkzoned(sbi) ? 
ZONED_PIN_SEC_REQUIRED_COUNT : GET_SEC_FROM_SEG(sbi, overprovision_segments(sbi)))) { f2fs_down_write(&sbi->gc_lock); stat_inc_gc_call_count(sbi, FOREGROUND); err = f2fs_gc(sbi, &gc_control); - if (err && err != -ENODATA) + if (err && err != -ENODATA) { + f2fs_up_write(&sbi->pin_sem); goto out_err; + } } - f2fs_down_write(&sbi->pin_sem); - err = f2fs_allocate_pinning_section(sbi); if (err) { f2fs_up_write(&sbi->pin_sem); @@ -2448,6 +2464,52 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) return ret; } +static void f2fs_keep_noreuse_range(struct inode *inode, + loff_t offset, loff_t len) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + u64 max_bytes = F2FS_BLK_TO_BYTES(max_file_blocks(inode)); + u64 start, end; + + if (!S_ISREG(inode->i_mode)) + return; + + if (offset >= max_bytes || len > max_bytes || + (offset + len) > max_bytes) + return; + + start = offset >> PAGE_SHIFT; + end = DIV_ROUND_UP(offset + len, PAGE_SIZE); + + inode_lock(inode); + if (f2fs_is_atomic_file(inode)) { + inode_unlock(inode); + return; + } + + spin_lock(&sbi->inode_lock[DONATE_INODE]); + /* let's remove the range, if len = 0 */ + if (!len) { + if (!list_empty(&F2FS_I(inode)->gdonate_list)) { + list_del_init(&F2FS_I(inode)->gdonate_list); + sbi->donate_files--; + } + } else { + if (list_empty(&F2FS_I(inode)->gdonate_list)) { + list_add_tail(&F2FS_I(inode)->gdonate_list, + &sbi->inode_list[DONATE_INODE]); + sbi->donate_files++; + } else { + list_move_tail(&F2FS_I(inode)->gdonate_list, + &sbi->inode_list[DONATE_INODE]); + } + F2FS_I(inode)->donate_start = start; + F2FS_I(inode)->donate_end = end - 1; + } + spin_unlock(&sbi->inode_lock[DONATE_INODE]); + inode_unlock(inode); +} + static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) { struct inode *inode = file_inode(filp); @@ -3446,6 +3508,23 @@ static int f2fs_ioc_get_dev_alias_file(struct file *filp, unsigned long arg) (u32 __user *)arg); } +static int f2fs_ioc_io_prio(struct file *filp, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + __u32 level; + + if (get_user(level, (__u32 __user *)arg)) + return -EFAULT; + + if (!S_ISREG(inode->i_mode) || level >= F2FS_IOPRIO_MAX) + return -EINVAL; + + inode_lock(inode); + F2FS_I(inode)->ioprio_hint = level; + inode_unlock(inode); + return 0; +} + int f2fs_precache_extents(struct inode *inode) { struct f2fs_inode_info *fi = F2FS_I(inode); @@ -4547,6 +4626,8 @@ static long __f2fs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) return f2fs_ioc_compress_file(filp); case F2FS_IOC_GET_DEV_ALIAS_FILE: return f2fs_ioc_get_dev_alias_file(filp, arg); + case F2FS_IOC_IO_PRIO: + return f2fs_ioc_io_prio(filp, arg); default: return -ENOTTY; } @@ -5147,12 +5228,16 @@ static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len, } err = generic_fadvise(filp, offset, len, advice); - if (!err && advice == POSIX_FADV_DONTNEED && - test_opt(F2FS_I_SB(inode), COMPRESS_CACHE) && - f2fs_compressed_file(inode)) - f2fs_invalidate_compress_pages(F2FS_I_SB(inode), inode->i_ino); + if (err) + return err; - return err; + if (advice == POSIX_FADV_DONTNEED && + (test_opt(F2FS_I_SB(inode), COMPRESS_CACHE) && + f2fs_compressed_file(inode))) + f2fs_invalidate_compress_pages(F2FS_I_SB(inode), inode->i_ino); + else if (advice == POSIX_FADV_NOREUSE) + f2fs_keep_noreuse_range(inode, offset, len); + return 0; } #ifdef CONFIG_COMPAT @@ -5261,6 +5346,7 @@ long f2fs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg) case F2FS_IOC_DECOMPRESS_FILE: case 
F2FS_IOC_COMPRESS_FILE: case F2FS_IOC_GET_DEV_ALIAS_FILE: + case F2FS_IOC_IO_PRIO: break; default: return -ENOIOCTLCMD; diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index faf9fa1c804d..dd0ba0532e01 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1449,14 +1449,14 @@ out: } static int move_data_page(struct inode *inode, block_t bidx, int gc_type, - unsigned int segno, int off) + unsigned int segno, int off) { - struct page *page; + struct folio *folio; int err = 0; - page = f2fs_get_lock_data_page(inode, bidx, true); - if (IS_ERR(page)) - return PTR_ERR(page); + folio = f2fs_get_lock_data_folio(inode, bidx, true); + if (IS_ERR(folio)) + return PTR_ERR(folio); if (!check_valid_map(F2FS_I_SB(inode), segno, off)) { err = -ENOENT; @@ -1468,12 +1468,12 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, goto out; if (gc_type == BG_GC) { - if (folio_test_writeback(page_folio(page))) { + if (folio_test_writeback(folio)) { err = -EAGAIN; goto out; } - set_page_dirty(page); - set_page_private_gcing(page); + folio_mark_dirty(folio); + set_page_private_gcing(&folio->page); } else { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(inode), @@ -1483,37 +1483,37 @@ static int move_data_page(struct inode *inode, block_t bidx, int gc_type, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC, .old_blkaddr = NULL_ADDR, - .page = page, + .page = &folio->page, .encrypted_page = NULL, .need_lock = LOCK_REQ, .io_type = FS_GC_DATA_IO, }; - bool is_dirty = PageDirty(page); + bool is_dirty = folio_test_dirty(folio); retry: - f2fs_wait_on_page_writeback(page, DATA, true, true); + f2fs_folio_wait_writeback(folio, DATA, true, true); - set_page_dirty(page); - if (clear_page_dirty_for_io(page)) { + folio_mark_dirty(folio); + if (folio_clear_dirty_for_io(folio)) { inode_dec_dirty_pages(inode); f2fs_remove_dirty_inode(inode); } - set_page_private_gcing(page); + set_page_private_gcing(&folio->page); err = f2fs_do_write_data_page(&fio); if (err) { - clear_page_private_gcing(page); + clear_page_private_gcing(&folio->page); if (err == -ENOMEM) { memalloc_retry_wait(GFP_NOFS); goto retry; } if (is_dirty) - set_page_dirty(page); + folio_mark_dirty(folio); } } out: - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return err; } @@ -1542,7 +1542,6 @@ next_step: entry = sum; for (off = 0; off < usable_blks_in_seg; off++, entry++) { - struct page *data_page; struct inode *inode; struct node_info dni; /* dnode info for the data */ unsigned int ofs_in_node, nofs; @@ -1585,6 +1584,7 @@ next_step: ofs_in_node = le16_to_cpu(entry->ofs_in_node); if (phase == 3) { + struct folio *data_folio; int err; inode = f2fs_iget(sb, dni.ino); @@ -1635,15 +1635,15 @@ next_step: continue; } - data_page = f2fs_get_read_data_page(inode, start_bidx, + data_folio = f2fs_get_read_data_folio(inode, start_bidx, REQ_RAHEAD, true, NULL); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - if (IS_ERR(data_page)) { + if (IS_ERR(data_folio)) { iput(inode); continue; } - f2fs_put_page(data_page, 0); + f2fs_folio_put(data_folio, false); add_gc_inode(gc_list, inode); continue; } @@ -2271,12 +2271,12 @@ out_drop_write: if (err) return err; - err = freeze_super(sbi->sb, FREEZE_HOLDER_USERSPACE); + err = freeze_super(sbi->sb, FREEZE_HOLDER_KERNEL, NULL); if (err) return err; if (f2fs_readonly(sbi->sb)) { - err = thaw_super(sbi->sb, FREEZE_HOLDER_USERSPACE); + err = thaw_super(sbi->sb, FREEZE_HOLDER_KERNEL, NULL); if (err) return err; return -EROFS; @@ -2333,6 +2333,6 @@ recover_out: out_err: f2fs_up_write(&sbi->cp_global_sem); f2fs_up_write(&sbi->gc_lock); 
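In the hunk below, the matching thaw switches holder type as well. freeze_super() and thaw_super() now take FREEZE_HOLDER_KERNEL plus a freeze-owner cookie (NULL here), so the resize path brackets its work like this (condensed from the calls shown in this diff; error handling on the readonly branch elided):

	err = freeze_super(sbi->sb, FREEZE_HOLDER_KERNEL, NULL);
	if (err)
		return err;
	/* ... resize work under gc_lock and cp_global_sem ... */
	thaw_super(sbi->sb, FREEZE_HOLDER_KERNEL, NULL);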
- thaw_super(sbi->sb, FREEZE_HOLDER_USERSPACE); + thaw_super(sbi->sb, FREEZE_HOLDER_KERNEL, NULL); return err; } diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index 3e3c35d4c98b..ad92e9008781 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -119,7 +119,7 @@ int f2fs_read_inline_data(struct inode *inode, struct folio *folio) { struct page *ipage; - ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) { folio_unlock(folio); return PTR_ERR(ipage); @@ -237,7 +237,7 @@ int f2fs_convert_inline_inode(struct inode *inode) f2fs_lock_op(sbi); - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto out; @@ -265,7 +265,7 @@ int f2fs_write_inline_data(struct inode *inode, struct folio *folio) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct page *ipage; - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -312,7 +312,7 @@ int f2fs_recover_inline_data(struct inode *inode, struct page *npage) if (f2fs_has_inline_data(inode) && ri && (ri->i_inline & F2FS_INLINE_DATA)) { process_inline: - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -331,7 +331,7 @@ process_inline: } if (f2fs_has_inline_data(inode)) { - ipage = f2fs_get_node_page(sbi, inode->i_ino); + ipage = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); f2fs_truncate_inline_inode(inode, ipage, 0); @@ -361,7 +361,7 @@ struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, struct page *ipage; void *inline_dentry; - ipage = f2fs_get_node_page(sbi, dir->i_ino); + ipage = f2fs_get_inode_page(sbi, dir->i_ino); if (IS_ERR(ipage)) { *res_page = ipage; return NULL; @@ -609,7 +609,7 @@ int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry) if (err) goto out; - ipage = f2fs_get_node_page(sbi, dir->i_ino); + ipage = f2fs_get_inode_page(sbi, dir->i_ino); if (IS_ERR(ipage)) { err = PTR_ERR(ipage); goto out_fname; @@ -644,7 +644,7 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, struct page *page = NULL; int err = 0; - ipage = f2fs_get_node_page(sbi, dir->i_ino); + ipage = f2fs_get_inode_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -734,7 +734,7 @@ bool f2fs_empty_inline_dir(struct inode *dir) void *inline_dentry; struct f2fs_dentry_ptr d; - ipage = f2fs_get_node_page(sbi, dir->i_ino); + ipage = f2fs_get_inode_page(sbi, dir->i_ino); if (IS_ERR(ipage)) return false; @@ -765,7 +765,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (ctx->pos == d.max) return 0; - ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -797,7 +797,7 @@ int f2fs_inline_data_fiemap(struct inode *inode, struct page *ipage; int err = 0; - ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 3dd25f64d6f1..83f862578fc8 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -34,10 +34,8 @@ void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync) if (f2fs_inode_dirtied(inode, sync)) return; - if 
(f2fs_is_atomic_file(inode)) { - set_inode_flag(inode, FI_ATOMIC_DIRTIED); + if (f2fs_is_atomic_file(inode)) return; - } mark_inode_dirty_sync(inode); } @@ -410,7 +408,7 @@ static int do_read_inode(struct inode *inode) if (f2fs_check_nid_range(sbi, inode->i_ino)) return -EINVAL; - node_page = f2fs_get_node_page(sbi, inode->i_ino); + node_page = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(node_page)) return PTR_ERR(node_page); @@ -757,7 +755,7 @@ void f2fs_update_inode_page(struct inode *inode) struct page *node_page; int count = 0; retry: - node_page = f2fs_get_node_page(sbi, inode->i_ino); + node_page = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(node_page)) { int err = PTR_ERR(node_page); @@ -765,8 +763,12 @@ retry: if (err == -ENOENT) return; + if (err == -EFSCORRUPTED) + goto stop_checkpoint; + if (err == -ENOMEM || ++count <= DEFAULT_RETRY_IO_COUNT) goto retry; +stop_checkpoint: f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_UPDATE_INODE); return; } @@ -789,6 +791,13 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) !is_inode_flag_set(inode, FI_DIRTY_INODE)) return 0; + /* + * no need to update inode page, ultimately f2fs_evict_inode() will + * clear dirty status of inode. + */ + if (f2fs_cp_error(sbi)) + return -EIO; + if (!f2fs_is_checkpoint_ready(sbi)) { f2fs_mark_inode_dirty_sync(inode, true); return -ENOSPC; @@ -804,6 +813,19 @@ int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) return 0; } +static void f2fs_remove_donate_inode(struct inode *inode) +{ + struct f2fs_sb_info *sbi = F2FS_I_SB(inode); + + if (list_empty(&F2FS_I(inode)->gdonate_list)) + return; + + spin_lock(&sbi->inode_lock[DONATE_INODE]); + list_del_init(&F2FS_I(inode)->gdonate_list); + sbi->donate_files--; + spin_unlock(&sbi->inode_lock[DONATE_INODE]); +} + /* * Called at the last iput() if i_nlink is zero */ @@ -838,6 +860,7 @@ void f2fs_evict_inode(struct inode *inode) f2fs_bug_on(sbi, get_dirty_pages(inode)); f2fs_remove_dirty_inode(inode); + f2fs_remove_donate_inode(inode); if (!IS_DEVICE_ALIASING(inode)) f2fs_destroy_extent_tree(inode); diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index a278c7da8177..8f8b9b843bdf 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -502,6 +502,14 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, goto out; } + if (inode->i_nlink == 0) { + f2fs_warn(F2FS_I_SB(inode), "%s: inode (ino=%lx) has zero i_nlink", + __func__, inode->i_ino); + err = -EFSCORRUPTED; + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + goto out_iput; + } + if (IS_ENCRYPTED(dir) && (S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode)) && !fscrypt_has_permitted_context(dir, inode)) { @@ -684,23 +692,23 @@ out_free_encrypted_link: return err; } -static int f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode; int err; if (unlikely(f2fs_cp_error(sbi))) - return -EIO; + return ERR_PTR(-EIO); err = f2fs_dquot_initialize(dir); if (err) - return err; + return ERR_PTR(err); inode = f2fs_new_inode(idmap, dir, S_IFDIR | mode, NULL); if (IS_ERR(inode)) - return PTR_ERR(inode); + return ERR_CAST(inode); inode->i_op = &f2fs_dir_inode_operations; inode->i_fop = &f2fs_dir_operations; @@ -722,12 +730,12 @@ static int f2fs_mkdir(struct mnt_idmap *idmap, struct inode *dir, f2fs_sync_fs(sbi->sb, 1); f2fs_balance_fs(sbi, 
true); - return 0; + return NULL; out_fail: clear_inode_flag(inode, FI_INC_LINK); f2fs_handle_failed_inode(inode); - return err; + return ERR_PTR(err); } static int f2fs_rmdir(struct inode *dir, struct dentry *dentry) diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index f88392fc4ba9..5f15c224bf78 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -310,10 +310,10 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, start, nr); } -bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct page *page) +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, const struct folio *folio) { - return NODE_MAPPING(sbi) == page->mapping && - IS_DNODE(page) && is_cold_node(page); + return NODE_MAPPING(sbi) == folio->mapping && + IS_DNODE(&folio->page) && is_cold_node(&folio->page); } void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi) @@ -778,7 +778,7 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) npage[0] = dn->inode_page; if (!npage[0]) { - npage[0] = f2fs_get_node_page(sbi, nids[0]); + npage[0] = f2fs_get_inode_page(sbi, nids[0]); if (IS_ERR(npage[0])) return PTR_ERR(npage[0]); } @@ -1130,26 +1130,33 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) unsigned int nofs = 0; struct f2fs_inode *ri; struct dnode_of_data dn; - struct page *page; + struct folio *folio; trace_f2fs_truncate_inode_blocks_enter(inode, from); level = get_node_path(inode, from, offset, noffset); - if (level < 0) { + if (level <= 0) { + if (!level) { + level = -EFSCORRUPTED; + f2fs_err(sbi, "%s: inode ino=%lx has corrupted node block, from:%lu addrs:%u", + __func__, inode->i_ino, + from, ADDRS_PER_INODE(inode)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + } trace_f2fs_truncate_inode_blocks_exit(inode, level); return level; } - page = f2fs_get_node_page(sbi, inode->i_ino); - if (IS_ERR(page)) { - trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(page)); - return PTR_ERR(page); + folio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(folio)) { + trace_f2fs_truncate_inode_blocks_exit(inode, PTR_ERR(folio)); + return PTR_ERR(folio); } - set_new_dnode(&dn, inode, page, NULL, 0); - unlock_page(page); + set_new_dnode(&dn, inode, &folio->page, NULL, 0); + folio_unlock(folio); - ri = F2FS_INODE(page); + ri = F2FS_INODE(&folio->page); switch (level) { case 0: case 1: @@ -1178,7 +1185,7 @@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) skip_partial: while (cont) { - dn.nid = get_nid(page, offset[0], true); + dn.nid = get_nid(&folio->page, offset[0], true); switch (offset[0]) { case NODE_DIR1_BLOCK: case NODE_DIR2_BLOCK: @@ -1199,7 +1206,7 @@ skip_partial: BUG(); } if (err == -ENOENT) { - set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK); + set_sbi_flag(F2FS_F_SB(folio), SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_INVALID_BLKADDR); f2fs_err_ratelimited(sbi, "truncate node fail, ino:%lu, nid:%u, " @@ -1210,18 +1217,18 @@ skip_partial: } if (err < 0) goto fail; - if (offset[1] == 0 && get_nid(page, offset[0], true)) { - lock_page(page); - BUG_ON(page->mapping != NODE_MAPPING(sbi)); - set_nid(page, offset[0], 0, true); - unlock_page(page); + if (offset[1] == 0 && get_nid(&folio->page, offset[0], true)) { + folio_lock(folio); + BUG_ON(folio->mapping != NODE_MAPPING(sbi)); + set_nid(&folio->page, offset[0], 0, true); + folio_unlock(folio); } offset[1] = 0; offset[0]++; nofs += err; } fail: - f2fs_put_page(page, 0); + f2fs_folio_put(folio, false); trace_f2fs_truncate_inode_blocks_exit(inode, err); return err > 0 ? 
0 : err; } @@ -1238,7 +1245,7 @@ int f2fs_truncate_xattr_node(struct inode *inode) if (!nid) return 0; - npage = f2fs_get_node_page(sbi, nid); + npage = f2fs_get_xnode_page(sbi, nid); if (IS_ERR(npage)) return PTR_ERR(npage); @@ -1449,10 +1456,32 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) f2fs_put_page(apage, err ? 1 : 0); } -static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, - struct page *parent, int start) +static int sanity_check_node_footer(struct f2fs_sb_info *sbi, + struct page *page, pgoff_t nid, + enum node_type ntype) { - struct page *page; + if (unlikely(nid != nid_of_node(page) || + (ntype == NODE_TYPE_INODE && !IS_INODE(page)) || + (ntype == NODE_TYPE_XATTR && + !f2fs_has_xattr_block(ofs_of_node(page))) || + time_to_inject(sbi, FAULT_INCONSISTENT_FOOTER))) { + f2fs_warn(sbi, "inconsistent node block, node_type:%d, nid:%lu, " + "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", + ntype, nid, nid_of_node(page), ino_of_node(page), + ofs_of_node(page), cpver_of_node(page), + next_blkaddr_of_node(page)); + set_sbi_flag(sbi, SBI_NEED_FSCK); + f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); + return -EFSCORRUPTED; + } + return 0; +} + +static struct folio *__get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid, + struct page *parent, int start, + enum node_type ntype) +{ + struct folio *folio; int err; if (!nid) @@ -1460,11 +1489,11 @@ static struct page *__get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid, if (f2fs_check_nid_range(sbi, nid)) return ERR_PTR(-EINVAL); repeat: - page = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); - if (!page) - return ERR_PTR(-ENOMEM); + folio = f2fs_grab_cache_folio(NODE_MAPPING(sbi), nid, false); + if (IS_ERR(folio)) + return folio; - err = read_node_page(page, 0); + err = read_node_page(&folio->page, 0); if (err < 0) { goto out_put_err; } else if (err == LOCKED_PAGE) { @@ -1475,54 +1504,72 @@ repeat: if (parent) f2fs_ra_node_pages(parent, start + 1, MAX_RA_NODE); - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { - f2fs_put_page(page, 1); + if (unlikely(folio->mapping != NODE_MAPPING(sbi))) { + f2fs_folio_put(folio, true); goto repeat; } - if (unlikely(!PageUptodate(page))) { + if (unlikely(!folio_test_uptodate(folio))) { err = -EIO; goto out_err; } - if (!f2fs_inode_chksum_verify(sbi, page)) { + if (!f2fs_inode_chksum_verify(sbi, &folio->page)) { err = -EFSBADCRC; goto out_err; } page_hit: - if (likely(nid == nid_of_node(page))) - return page; - - f2fs_warn(sbi, "inconsistent node block, nid:%lu, node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", - nid, nid_of_node(page), ino_of_node(page), - ofs_of_node(page), cpver_of_node(page), - next_blkaddr_of_node(page)); - set_sbi_flag(sbi, SBI_NEED_FSCK); - f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); - err = -EFSCORRUPTED; + err = sanity_check_node_footer(sbi, &folio->page, nid, ntype); + if (!err) + return folio; out_err: - ClearPageUptodate(page); + folio_clear_uptodate(folio); out_put_err: /* ENOENT comes from read_node_page which is not an error. 
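 * It only means the node block does not exist, so no EIO state is
 * latched on the mapping.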
*/ if (err != -ENOENT) - f2fs_handle_page_eio(sbi, page_folio(page), NODE); - f2fs_put_page(page, 1); + f2fs_handle_page_eio(sbi, folio, NODE); + f2fs_folio_put(folio, true); return ERR_PTR(err); } struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) { - return __get_node_page(sbi, nid, NULL, 0); + struct folio *folio = __get_node_folio(sbi, nid, NULL, 0, + NODE_TYPE_REGULAR); + + return &folio->page; +} + +struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino) +{ + return __get_node_folio(sbi, ino, NULL, 0, NODE_TYPE_INODE); +} + +struct page *f2fs_get_inode_page(struct f2fs_sb_info *sbi, pgoff_t ino) +{ + struct folio *folio = f2fs_get_inode_folio(sbi, ino); + + return &folio->page; +} + +struct page *f2fs_get_xnode_page(struct f2fs_sb_info *sbi, pgoff_t xnid) +{ + struct folio *folio = __get_node_folio(sbi, xnid, NULL, 0, + NODE_TYPE_XATTR); + + return &folio->page; } struct page *f2fs_get_node_page_ra(struct page *parent, int start) { struct f2fs_sb_info *sbi = F2FS_P_SB(parent); nid_t nid = get_nid(parent, start, false); + struct folio *folio = __get_node_folio(sbi, nid, parent, start, + NODE_TYPE_REGULAR); - return __get_node_page(sbi, nid, parent, start); + return &folio->page; } static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) @@ -1561,11 +1608,11 @@ iput_out: iput(inode); } -static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) +static struct folio *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) { pgoff_t index; struct folio_batch fbatch; - struct page *last_page = NULL; + struct folio *last_folio = NULL; int nr_folios; folio_batch_init(&fbatch); @@ -1577,45 +1624,45 @@ static struct page *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) int i; for (i = 0; i < nr_folios; i++) { - struct page *page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; if (unlikely(f2fs_cp_error(sbi))) { - f2fs_put_page(last_page, 0); + f2fs_folio_put(last_folio, false); folio_batch_release(&fbatch); return ERR_PTR(-EIO); } - if (!IS_DNODE(page) || !is_cold_node(page)) + if (!IS_DNODE(&folio->page) || !is_cold_node(&folio->page)) continue; - if (ino_of_node(page) != ino) + if (ino_of_node(&folio->page) != ino) continue; - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { + if (unlikely(folio->mapping != NODE_MAPPING(sbi))) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (ino_of_node(page) != ino) + if (ino_of_node(&folio->page) != ino) goto continue_unlock; - if (!PageDirty(page)) { + if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } - if (last_page) - f2fs_put_page(last_page, 0); + if (last_folio) + f2fs_folio_put(last_folio, false); - get_page(page); - last_page = page; - unlock_page(page); + folio_get(folio); + last_folio = folio; + folio_unlock(folio); } folio_batch_release(&fbatch); cond_resched(); } - return last_page; + return last_folio; } static int __write_node_page(struct page *page, bool atomic, bool *submitted, @@ -1694,7 +1741,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, fio.op_flags |= REQ_PREFLUSH | REQ_FUA; /* should add to global list before clearing PAGECACHE status */ - if (f2fs_in_warm_node_list(sbi, page)) { + if (f2fs_in_warm_node_list(sbi, folio)) { seq = f2fs_add_fsync_node_entry(sbi, page); if (seq_id) *seq_id = seq; @@ -1769,13 +1816,6 @@ release_page: return err; } -static int f2fs_write_node_page(struct page *page, - struct 
writeback_control *wbc) -{ - return __write_node_page(page, false, NULL, wbc, false, - FS_NODE_IO, NULL); -} - int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic, unsigned int *seq_id) @@ -1783,16 +1823,16 @@ int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, pgoff_t index; struct folio_batch fbatch; int ret = 0; - struct page *last_page = NULL; + struct folio *last_folio = NULL; bool marked = false; nid_t ino = inode->i_ino; int nr_folios; int nwritten = 0; if (atomic) { - last_page = last_fsync_dnode(sbi, ino); - if (IS_ERR_OR_NULL(last_page)) - return PTR_ERR_OR_ZERO(last_page); + last_folio = last_fsync_dnode(sbi, ino); + if (IS_ERR_OR_NULL(last_folio)) + return PTR_ERR_OR_ZERO(last_folio); } retry: folio_batch_init(&fbatch); @@ -1804,73 +1844,73 @@ retry: int i; for (i = 0; i < nr_folios; i++) { - struct page *page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; bool submitted = false; if (unlikely(f2fs_cp_error(sbi))) { - f2fs_put_page(last_page, 0); + f2fs_folio_put(last_folio, false); folio_batch_release(&fbatch); ret = -EIO; goto out; } - if (!IS_DNODE(page) || !is_cold_node(page)) + if (!IS_DNODE(&folio->page) || !is_cold_node(&folio->page)) continue; - if (ino_of_node(page) != ino) + if (ino_of_node(&folio->page) != ino) continue; - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { + if (unlikely(folio->mapping != NODE_MAPPING(sbi))) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (ino_of_node(page) != ino) + if (ino_of_node(&folio->page) != ino) goto continue_unlock; - if (!PageDirty(page) && page != last_page) { + if (!folio_test_dirty(folio) && folio != last_folio) { /* someone wrote it for us */ goto continue_unlock; } - f2fs_wait_on_page_writeback(page, NODE, true, true); + f2fs_folio_wait_writeback(folio, NODE, true, true); - set_fsync_mark(page, 0); - set_dentry_mark(page, 0); + set_fsync_mark(&folio->page, 0); + set_dentry_mark(&folio->page, 0); - if (!atomic || page == last_page) { - set_fsync_mark(page, 1); + if (!atomic || folio == last_folio) { + set_fsync_mark(&folio->page, 1); percpu_counter_inc(&sbi->rf_node_block_count); - if (IS_INODE(page)) { + if (IS_INODE(&folio->page)) { if (is_inode_flag_set(inode, FI_DIRTY_INODE)) - f2fs_update_inode(inode, page); - set_dentry_mark(page, + f2fs_update_inode(inode, &folio->page); + set_dentry_mark(&folio->page, f2fs_need_dentry_mark(sbi, ino)); } /* may be written by other thread */ - if (!PageDirty(page)) - set_page_dirty(page); + if (!folio_test_dirty(folio)) + folio_mark_dirty(folio); } - if (!clear_page_dirty_for_io(page)) + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; - ret = __write_node_page(page, atomic && - page == last_page, + ret = __write_node_page(&folio->page, atomic && + folio == last_folio, &submitted, wbc, true, FS_NODE_IO, seq_id); if (ret) { - unlock_page(page); - f2fs_put_page(last_page, 0); + folio_unlock(folio); + f2fs_folio_put(last_folio, false); break; } else if (submitted) { nwritten++; } - if (page == last_page) { - f2fs_put_page(page, 0); + if (folio == last_folio) { + f2fs_folio_put(folio, false); marked = true; break; } @@ -1883,11 +1923,11 @@ continue_unlock: } if (!ret && atomic && !marked) { f2fs_debug(sbi, "Retry to write fsync mark: ino=%u, idx=%lx", - ino, page_folio(last_page)->index); - lock_page(last_page); - f2fs_wait_on_page_writeback(last_page, NODE, true, true); - 
set_page_dirty(last_page); - unlock_page(last_page); + ino, last_folio->index); + folio_lock(last_folio); + f2fs_folio_wait_writeback(last_folio, NODE, true, true); + folio_mark_dirty(last_folio); + folio_unlock(last_folio); goto retry; } out: @@ -1920,18 +1960,18 @@ static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data) return 1; } -static bool flush_dirty_inode(struct page *page) +static bool flush_dirty_inode(struct folio *folio) { - struct f2fs_sb_info *sbi = F2FS_P_SB(page); + struct f2fs_sb_info *sbi = F2FS_F_SB(folio); struct inode *inode; - nid_t ino = ino_of_node(page); + nid_t ino = ino_of_node(&folio->page); inode = find_inode_nowait(sbi->sb, ino, f2fs_match_ino, NULL); if (!inode) return false; - f2fs_update_inode(inode, page); - unlock_page(page); + f2fs_update_inode(inode, &folio->page); + folio_unlock(folio); iput(inode); return true; @@ -1951,32 +1991,27 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi) int i; for (i = 0; i < nr_folios; i++) { - struct page *page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; - if (!IS_INODE(page)) + if (!IS_INODE(&folio->page)) continue; - lock_page(page); - - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { -continue_unlock: - unlock_page(page); - continue; - } + folio_lock(folio); - if (!PageDirty(page)) { - /* someone wrote it for us */ - goto continue_unlock; - } + if (unlikely(folio->mapping != NODE_MAPPING(sbi))) + goto unlock; + if (!folio_test_dirty(folio)) + goto unlock; /* flush inline_data, if it's async context. */ - if (page_private_inline(page)) { - clear_page_private_inline(page); - unlock_page(page); - flush_inline_data(sbi, ino_of_node(page)); + if (page_private_inline(&folio->page)) { + clear_page_private_inline(&folio->page); + folio_unlock(folio); + flush_inline_data(sbi, ino_of_node(&folio->page)); continue; } - unlock_page(page); +unlock: + folio_unlock(folio); } folio_batch_release(&fbatch); cond_resched(); @@ -2005,7 +2040,7 @@ next_step: int i; for (i = 0; i < nr_folios; i++) { - struct page *page = &fbatch.folios[i]->page; + struct folio *folio = fbatch.folios[i]; bool submitted = false; /* give a priority to WB_SYNC threads */ @@ -2021,27 +2056,27 @@ next_step: * 1. dentry dnodes * 2. 
file dnodes */ - if (step == 0 && IS_DNODE(page)) + if (step == 0 && IS_DNODE(&folio->page)) continue; - if (step == 1 && (!IS_DNODE(page) || - is_cold_node(page))) + if (step == 1 && (!IS_DNODE(&folio->page) || + is_cold_node(&folio->page))) continue; - if (step == 2 && (!IS_DNODE(page) || - !is_cold_node(page))) + if (step == 2 && (!IS_DNODE(&folio->page) || + !is_cold_node(&folio->page))) continue; lock_node: if (wbc->sync_mode == WB_SYNC_ALL) - lock_page(page); - else if (!trylock_page(page)) + folio_lock(folio); + else if (!folio_trylock(folio)) continue; - if (unlikely(page->mapping != NODE_MAPPING(sbi))) { + if (unlikely(folio->mapping != NODE_MAPPING(sbi))) { continue_unlock: - unlock_page(page); + folio_unlock(folio); continue; } - if (!PageDirty(page)) { + if (!folio_test_dirty(folio)) { /* someone wrote it for us */ goto continue_unlock; } @@ -2051,29 +2086,29 @@ continue_unlock: goto write_node; /* flush inline_data */ - if (page_private_inline(page)) { - clear_page_private_inline(page); - unlock_page(page); - flush_inline_data(sbi, ino_of_node(page)); + if (page_private_inline(&folio->page)) { + clear_page_private_inline(&folio->page); + folio_unlock(folio); + flush_inline_data(sbi, ino_of_node(&folio->page)); goto lock_node; } /* flush dirty inode */ - if (IS_INODE(page) && flush_dirty_inode(page)) + if (IS_INODE(&folio->page) && flush_dirty_inode(folio)) goto lock_node; write_node: - f2fs_wait_on_page_writeback(page, NODE, true, true); + f2fs_folio_wait_writeback(folio, NODE, true, true); - if (!clear_page_dirty_for_io(page)) + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; - set_fsync_mark(page, 0); - set_dentry_mark(page, 0); + set_fsync_mark(&folio->page, 0); + set_dentry_mark(&folio->page, 0); - ret = __write_node_page(page, false, &submitted, + ret = __write_node_page(&folio->page, false, &submitted, wbc, do_balance, io_type, NULL); if (ret) - unlock_page(page); + folio_unlock(folio); else if (submitted) nwritten++; @@ -2207,7 +2242,6 @@ static bool f2fs_dirty_node_folio(struct address_space *mapping, * Structure of the f2fs node operations */ const struct address_space_operations f2fs_node_aops = { - .writepage = f2fs_write_node_page, .writepages = f2fs_write_node_pages, .dirty_folio = f2fs_dirty_node_folio, .invalidate_folio = f2fs_invalidate_folio, @@ -2269,24 +2303,6 @@ static void __move_free_nid(struct f2fs_sb_info *sbi, struct free_nid *i, } } -bool f2fs_nat_bitmap_enabled(struct f2fs_sb_info *sbi) -{ - struct f2fs_nm_info *nm_i = NM_I(sbi); - unsigned int i; - bool ret = true; - - f2fs_down_read(&nm_i->nat_tree_lock); - for (i = 0; i < nm_i->nat_blocks; i++) { - if (!test_bit_le(i, nm_i->nat_block_bitmap)) { - ret = false; - break; - } - } - f2fs_up_read(&nm_i->nat_tree_lock); - - return ret; -} - static void update_free_nid_bitmap(struct f2fs_sb_info *sbi, nid_t nid, bool set, bool build) { @@ -2717,7 +2733,7 @@ int f2fs_recover_inline_xattr(struct inode *inode, struct page *page) struct page *ipage; struct f2fs_inode *ri; - ipage = f2fs_get_node_page(F2FS_I_SB(inode), inode->i_ino); + ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); if (IS_ERR(ipage)) return PTR_ERR(ipage); @@ -2965,23 +2981,7 @@ add_out: list_add_tail(&nes->set_list, head); } -static void __update_nat_bits(struct f2fs_nm_info *nm_i, unsigned int nat_ofs, - unsigned int valid) -{ - if (valid == 0) { - __set_bit_le(nat_ofs, nm_i->empty_nat_bits); - __clear_bit_le(nat_ofs, nm_i->full_nat_bits); - return; - } - - __clear_bit_le(nat_ofs, nm_i->empty_nat_bits); - if (valid 
== NAT_ENTRY_PER_BLOCK) - __set_bit_le(nat_ofs, nm_i->full_nat_bits); - else - __clear_bit_le(nat_ofs, nm_i->full_nat_bits); -} - -static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, +static void __update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, struct page *page) { struct f2fs_nm_info *nm_i = NM_I(sbi); @@ -2990,7 +2990,7 @@ static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, int valid = 0; int i = 0; - if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) + if (!enabled_nat_bits(sbi, NULL)) return; if (nat_index == 0) { @@ -3001,36 +3001,17 @@ static void update_nat_bits(struct f2fs_sb_info *sbi, nid_t start_nid, if (le32_to_cpu(nat_blk->entries[i].block_addr) != NULL_ADDR) valid++; } - - __update_nat_bits(nm_i, nat_index, valid); -} - -void f2fs_enable_nat_bits(struct f2fs_sb_info *sbi) -{ - struct f2fs_nm_info *nm_i = NM_I(sbi); - unsigned int nat_ofs; - - f2fs_down_read(&nm_i->nat_tree_lock); - - for (nat_ofs = 0; nat_ofs < nm_i->nat_blocks; nat_ofs++) { - unsigned int valid = 0, nid_ofs = 0; - - /* handle nid zero due to it should never be used */ - if (unlikely(nat_ofs == 0)) { - valid = 1; - nid_ofs = 1; - } - - for (; nid_ofs < NAT_ENTRY_PER_BLOCK; nid_ofs++) { - if (!test_bit_le(nid_ofs, - nm_i->free_nid_bitmap[nat_ofs])) - valid++; - } - - __update_nat_bits(nm_i, nat_ofs, valid); + if (valid == 0) { + __set_bit_le(nat_index, nm_i->empty_nat_bits); + __clear_bit_le(nat_index, nm_i->full_nat_bits); + return; } - f2fs_up_read(&nm_i->nat_tree_lock); + __clear_bit_le(nat_index, nm_i->empty_nat_bits); + if (valid == NAT_ENTRY_PER_BLOCK) + __set_bit_le(nat_index, nm_i->full_nat_bits); + else + __clear_bit_le(nat_index, nm_i->full_nat_bits); } static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, @@ -3049,7 +3030,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, * #1, flush nat entries to journal in current hot data summary block. * #2, flush nat entries to nat page. */ - if ((cpc->reason & CP_UMOUNT) || + if (enabled_nat_bits(sbi, cpc) || !__has_cursum_space(journal, set->entry_cnt, NAT_JOURNAL)) to_journal = false; @@ -3096,7 +3077,7 @@ static int __flush_nat_entry_set(struct f2fs_sb_info *sbi, if (to_journal) { up_write(&curseg->journal_rwsem); } else { - update_nat_bits(sbi, start_nid, page); + __update_nat_bits(sbi, start_nid, page); f2fs_put_page(page, 1); } @@ -3127,7 +3108,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * during unmount, let's flush nat_bits before checking * nat_cnt[DIRTY_NAT]. */ - if (cpc->reason & CP_UMOUNT) { + if (enabled_nat_bits(sbi, cpc)) { f2fs_down_write(&nm_i->nat_tree_lock); remove_nats_in_journal(sbi); f2fs_up_write(&nm_i->nat_tree_lock); @@ -3143,7 +3124,7 @@ int f2fs_flush_nat_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * entries, remove all entries from journal and merge them * into nat entry set. 
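__update_nat_bits() above keeps two per-NAT-block summary bitmaps consistent: the empty bit is set when no entry in the block is allocated, the full bit when every entry is, and a partially used block clears both. A self-contained model of that bookkeeping, with local bit helpers and an illustrative entries-per-block constant standing in for the kernel's:

    #define ENTRIES_PER_BLOCK 455   /* illustrative, not the on-disk value */
    #define BITS_PER_WORD (8 * sizeof(unsigned long))

    static void set_bit(unsigned int n, unsigned long *map)
    { map[n / BITS_PER_WORD] |= 1UL << (n % BITS_PER_WORD); }
    static void clear_bit(unsigned int n, unsigned long *map)
    { map[n / BITS_PER_WORD] &= ~(1UL << (n % BITS_PER_WORD)); }

    static void update_bits(unsigned long *full, unsigned long *empty,
                            unsigned int idx, unsigned int valid)
    {
            if (valid == 0) {
                    set_bit(idx, empty);
                    clear_bit(idx, full);
                    return;
            }
            clear_bit(idx, empty);
            if (valid == ENTRIES_PER_BLOCK)
                    set_bit(idx, full);
            else
                    clear_bit(idx, full);
    }
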
*/ - if (cpc->reason & CP_UMOUNT || + if (enabled_nat_bits(sbi, cpc) || !__has_cursum_space(journal, nm_i->nat_cnt[DIRTY_NAT], NAT_JOURNAL)) remove_nats_in_journal(sbi); @@ -3180,18 +3161,15 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) __u64 cp_ver = cur_cp_version(ckpt); block_t nat_bits_addr; + if (!enabled_nat_bits(sbi, NULL)) + return 0; + nm_i->nat_bits_blocks = F2FS_BLK_ALIGN((nat_bits_bytes << 1) + 8); nm_i->nat_bits = f2fs_kvzalloc(sbi, F2FS_BLK_TO_BYTES(nm_i->nat_bits_blocks), GFP_KERNEL); if (!nm_i->nat_bits) return -ENOMEM; - nm_i->full_nat_bits = nm_i->nat_bits + 8; - nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; - - if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) - return 0; - nat_bits_addr = __start_cp_addr(sbi) + BLKS_PER_SEG(sbi) - nm_i->nat_bits_blocks; for (i = 0; i < nm_i->nat_bits_blocks; i++) { @@ -3208,12 +3186,13 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) cp_ver |= (cur_cp_crc(ckpt) << 32); if (cpu_to_le64(cp_ver) != *(__le64 *)nm_i->nat_bits) { - clear_ckpt_flags(sbi, CP_NAT_BITS_FLAG); - f2fs_notice(sbi, "Disable nat_bits due to incorrect cp_ver (%llu, %llu)", - cp_ver, le64_to_cpu(*(__le64 *)nm_i->nat_bits)); + disable_nat_bits(sbi, true); return 0; } + nm_i->full_nat_bits = nm_i->nat_bits + 8; + nm_i->empty_nat_bits = nm_i->full_nat_bits + nat_bits_bytes; + f2fs_notice(sbi, "Found nat_bits in checkpoint"); return 0; } @@ -3224,7 +3203,7 @@ static inline void load_free_nid_bitmap(struct f2fs_sb_info *sbi) unsigned int i = 0; nid_t nid, last_nid; - if (!is_set_ckpt_flags(sbi, CP_NAT_BITS_FLAG)) + if (!enabled_nat_bits(sbi, NULL)) return; for (i = 0; i < nm_i->nat_blocks; i++) { @@ -3296,6 +3275,9 @@ static int init_node_manager(struct f2fs_sb_info *sbi) if (!nm_i->nat_bitmap) return -ENOMEM; + if (!test_opt(sbi, NAT_BITS)) + disable_nat_bits(sbi, true); + err = __get_nat_bitmaps(sbi); if (err) return err; diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 6aea13024ac1..103a437e6425 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -52,6 +52,13 @@ enum { IS_PREALLOC, /* nat entry is preallocated */ }; +/* For node type in __get_node_folio() */ +enum node_type { + NODE_TYPE_REGULAR, + NODE_TYPE_INODE, + NODE_TYPE_XATTR, +}; + /* * For node information */ @@ -248,7 +255,7 @@ static inline nid_t nid_of_node(struct page *node_page) return le32_to_cpu(rn->footer.nid); } -static inline unsigned int ofs_of_node(struct page *node_page) +static inline unsigned int ofs_of_node(const struct page *node_page) { struct f2fs_node *rn = F2FS_NODE(node_page); unsigned flag = le32_to_cpu(rn->footer.flag); @@ -342,7 +349,7 @@ static inline bool is_recoverable_dnode(struct page *page) * `- indirect node ((6 + 2N) + (N - 1)(N + 1)) * `- direct node */ -static inline bool IS_DNODE(struct page *node_page) +static inline bool IS_DNODE(const struct page *node_page) { unsigned int ofs = ofs_of_node(node_page); @@ -389,7 +396,7 @@ static inline nid_t get_nid(struct page *p, int off, bool i) * - Mark cold data pages in page cache */ -static inline int is_node(struct page *page, int type) +static inline int is_node(const struct page *page, int type) { struct f2fs_node *rn = F2FS_NODE(page); return le32_to_cpu(rn->footer.flag) & BIT(type); diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index c282e8a0a2ec..396ef71f41e3 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -2096,7 +2096,9 @@ static bool add_discard_addrs(struct f2fs_sb_info *sbi, struct cp_control *cpc, return false; if (!force) { - if (!f2fs_realtime_discard_enable(sbi) || 
!se->valid_blocks || + if (!f2fs_realtime_discard_enable(sbi) || + (!se->valid_blocks && + !IS_CURSEG(sbi, cpc->trim_start)) || SM_I(sbi)->dcc_info->nr_discards >= SM_I(sbi)->dcc_info->max_discards) return false; @@ -2320,10 +2322,9 @@ static int create_discard_cmd_control(struct f2fs_sb_info *sbi) dcc->discard_granularity = DEFAULT_DISCARD_GRANULARITY; dcc->max_ordered_discard = DEFAULT_MAX_ORDERED_DISCARD_GRANULARITY; dcc->discard_io_aware = DPOLICY_IO_AWARE_ENABLE; - if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT) + if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SEGMENT || + F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) dcc->discard_granularity = BLKS_PER_SEG(sbi); - else if (F2FS_OPTION(sbi).discard_unit == DISCARD_UNIT_SECTION) - dcc->discard_granularity = BLKS_PER_SEC(sbi); INIT_LIST_HEAD(&dcc->entry_list); for (i = 0; i < MAX_PLIST_NUM; i++) @@ -2806,7 +2807,7 @@ find_other_zone: MAIN_SECS(sbi)); if (secno >= MAIN_SECS(sbi)) { ret = -ENOSPC; - f2fs_bug_on(sbi, 1); + f2fs_bug_on(sbi, !pinning); goto out_unlock; } } @@ -2848,7 +2849,7 @@ got_it: out_unlock: spin_unlock(&free_i->segmap_lock); - if (ret == -ENOSPC) + if (ret == -ENOSPC && !pinning) f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_NO_SEGMENT); return ret; } @@ -2921,6 +2922,13 @@ static unsigned int __get_next_segno(struct f2fs_sb_info *sbi, int type) return curseg->segno; } +static void reset_curseg_fields(struct curseg_info *curseg) +{ + curseg->inited = false; + curseg->segno = NULL_SEGNO; + curseg->next_segno = 0; +} + /* * Allocate a current working segment. * This function always allocates a free segment in LFS manner. @@ -2939,7 +2947,7 @@ static int new_curseg(struct f2fs_sb_info *sbi, int type, bool new_sec) ret = get_new_segment(sbi, &segno, new_sec, pinning); if (ret) { if (ret == -ENOSPC) - curseg->segno = NULL_SEGNO; + reset_curseg_fields(curseg); return ret; } @@ -3710,13 +3718,6 @@ static void f2fs_randomize_chunk(struct f2fs_sb_info *sbi, get_random_u32_inclusive(1, sbi->max_fragment_hole); } -static void reset_curseg_fields(struct curseg_info *curseg) -{ - curseg->inited = false; - curseg->segno = NULL_SEGNO; - curseg->next_segno = 0; -} - int f2fs_allocate_data_block(struct f2fs_sb_info *sbi, struct page *page, block_t old_blkaddr, block_t *new_blkaddr, struct f2fs_summary *sum, int type, @@ -3902,6 +3903,7 @@ static int log_type_to_seg_type(enum log_type type) static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) { + struct folio *folio = page_folio(fio->page); enum log_type type = __get_segment_type(fio); int seg_type = log_type_to_seg_type(type); bool keep_order = (f2fs_lfs_mode(fio->sbi) && @@ -3912,10 +3914,10 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) if (f2fs_allocate_data_block(fio->sbi, fio->page, fio->old_blkaddr, &fio->new_blkaddr, sum, type, fio)) { - if (fscrypt_inode_uses_fs_layer_crypto(fio->page->mapping->host)) + if (fscrypt_inode_uses_fs_layer_crypto(folio->mapping->host)) fscrypt_finalize_bounce_page(&fio->encrypted_page); - end_page_writeback(fio->page); - if (f2fs_in_warm_node_list(fio->sbi, fio->page)) + folio_end_writeback(folio); + if (f2fs_in_warm_node_list(fio->sbi, folio)) f2fs_del_fsync_node_entry(fio->sbi, fio->page); goto out; } @@ -4154,22 +4156,21 @@ void f2fs_replace_block(struct f2fs_sb_info *sbi, struct dnode_of_data *dn, f2fs_update_data_blkaddr(dn, new_addr); } -void f2fs_wait_on_page_writeback(struct page *page, - enum page_type type, bool ordered, bool locked) +void 
f2fs_folio_wait_writeback(struct folio *folio, enum page_type type, + bool ordered, bool locked) { - if (folio_test_writeback(page_folio(page))) { - struct f2fs_sb_info *sbi = F2FS_P_SB(page); + if (folio_test_writeback(folio)) { + struct f2fs_sb_info *sbi = F2FS_F_SB(folio); /* submit cached LFS IO */ - f2fs_submit_merged_write_cond(sbi, NULL, page, 0, type); + f2fs_submit_merged_write_cond(sbi, NULL, &folio->page, 0, type); /* submit cached IPU IO */ - f2fs_submit_merged_ipu_write(sbi, NULL, page); + f2fs_submit_merged_ipu_write(sbi, NULL, &folio->page); if (ordered) { - wait_on_page_writeback(page); - f2fs_bug_on(sbi, locked && - folio_test_writeback(page_folio(page))); + folio_wait_writeback(folio); + f2fs_bug_on(sbi, locked && folio_test_writeback(folio)); } else { - wait_for_stable_page(page); + folio_wait_stable(folio); } } } diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 943be4f1d6d2..0465dc00b349 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -559,13 +559,16 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, unsigned int node_blocks, unsigned int data_blocks, unsigned int dent_blocks) { - unsigned int segno, left_blocks, blocks; int i; /* check current data/node sections in the worst case. */ for (i = CURSEG_HOT_DATA; i < NR_PERSISTENT_LOG; i++) { segno = CURSEG_I(sbi, i)->segno; + + if (unlikely(segno == NULL_SEGNO)) + return false; + left_blocks = CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true); @@ -576,6 +579,10 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, /* check current data section for dentry blocks. */ segno = CURSEG_I(sbi, CURSEG_HOT_DATA)->segno; + + if (unlikely(segno == NULL_SEGNO)) + return false; + left_blocks = CAP_BLKS_PER_SEC(sbi) - get_ckpt_valid_blocks(sbi, segno, true); if (dent_blocks > left_blocks) diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 83d6fb97dcae..9c8d3aee89af 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -73,7 +73,7 @@ unsigned long f2fs_shrink_count(struct shrinker *shrink, mutex_unlock(&sbi->umount_mutex); } spin_unlock(&f2fs_list_lock); - return count; + return count ?: SHRINK_EMPTY; } unsigned long f2fs_shrink_scan(struct shrinker *shrink, @@ -130,6 +130,96 @@ unsigned long f2fs_shrink_scan(struct shrinker *shrink, return freed; } +unsigned int f2fs_donate_files(void) +{ + struct f2fs_sb_info *sbi; + struct list_head *p; + unsigned int donate_files = 0; + + spin_lock(&f2fs_list_lock); + p = f2fs_list.next; + while (p != &f2fs_list) { + sbi = list_entry(p, struct f2fs_sb_info, s_list); + + /* stop f2fs_put_super */ + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + spin_unlock(&f2fs_list_lock); + + donate_files += sbi->donate_files; + + spin_lock(&f2fs_list_lock); + p = p->next; + mutex_unlock(&sbi->umount_mutex); + } + spin_unlock(&f2fs_list_lock); + + return donate_files; +} + +static unsigned int do_reclaim_caches(struct f2fs_sb_info *sbi, + unsigned int reclaim_caches_kb) +{ + struct inode *inode; + struct f2fs_inode_info *fi; + unsigned int nfiles = sbi->donate_files; + pgoff_t npages = reclaim_caches_kb >> (PAGE_SHIFT - 10); + + while (npages && nfiles--) { + pgoff_t len; + + spin_lock(&sbi->inode_lock[DONATE_INODE]); + if (list_empty(&sbi->inode_list[DONATE_INODE])) { + spin_unlock(&sbi->inode_lock[DONATE_INODE]); + break; + } + fi = list_first_entry(&sbi->inode_list[DONATE_INODE], + struct f2fs_inode_info, gdonate_list); + list_move_tail(&fi->gdonate_list, &sbi->inode_list[DONATE_INODE]); + inode = 
igrab(&fi->vfs_inode); + spin_unlock(&sbi->inode_lock[DONATE_INODE]); + + if (!inode) + continue; + + len = fi->donate_end - fi->donate_start + 1; + npages = npages < len ? 0 : npages - len; + invalidate_inode_pages2_range(inode->i_mapping, + fi->donate_start, fi->donate_end); + iput(inode); + cond_resched(); + } + return npages << (PAGE_SHIFT - 10); +} + +void f2fs_reclaim_caches(unsigned int reclaim_caches_kb) +{ + struct f2fs_sb_info *sbi; + struct list_head *p; + + spin_lock(&f2fs_list_lock); + p = f2fs_list.next; + while (p != &f2fs_list && reclaim_caches_kb) { + sbi = list_entry(p, struct f2fs_sb_info, s_list); + + /* stop f2fs_put_super */ + if (!mutex_trylock(&sbi->umount_mutex)) { + p = p->next; + continue; + } + spin_unlock(&f2fs_list_lock); + + reclaim_caches_kb = do_reclaim_caches(sbi, reclaim_caches_kb); + + spin_lock(&f2fs_list_lock); + p = p->next; + mutex_unlock(&sbi->umount_mutex); + } + spin_unlock(&f2fs_list_lock); +} + void f2fs_join_shrinker(struct f2fs_sb_info *sbi) { spin_lock(&f2fs_list_lock); diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index 19b67828ae32..f087b2b71c89 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -63,6 +63,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_BLKADDR_VALIDITY] = "invalid blkaddr", [FAULT_BLKADDR_CONSISTENCE] = "inconsistent blkaddr", [FAULT_NO_SEGMENT] = "no free segment", + [FAULT_INCONSISTENT_FOOTER] = "inconsistent footer", }; int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate, @@ -190,6 +191,7 @@ enum { Opt_memory_mode, Opt_age_extent_cache, Opt_errors, + Opt_nat_bits, Opt_err, }; @@ -269,6 +271,7 @@ static match_table_t f2fs_tokens = { {Opt_memory_mode, "memory=%s"}, {Opt_age_extent_cache, "age_extent_cache"}, {Opt_errors, "errors=%s"}, + {Opt_nat_bits, "nat_bits"}, {Opt_err, NULL}, }; @@ -383,10 +386,10 @@ static void init_once(void *foo) #ifdef CONFIG_QUOTA static const char * const quotatypes[] = INITQFNAMES; #define QTYPE2NAME(t) (quotatypes[t]) -static int f2fs_set_qf_name(struct super_block *sb, int qtype, +static int f2fs_set_qf_name(struct f2fs_sb_info *sbi, int qtype, substring_t *args) { - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct super_block *sb = sbi->sb; char *qname; int ret = -EINVAL; @@ -424,9 +427,9 @@ errout: return ret; } -static int f2fs_clear_qf_name(struct super_block *sb, int qtype) +static int f2fs_clear_qf_name(struct f2fs_sb_info *sbi, int qtype) { - struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct super_block *sb = sbi->sb; if (sb_any_quota_loaded(sb) && F2FS_OPTION(sbi).s_qf_names[qtype]) { f2fs_err(sbi, "Cannot change journaled quota options when quota turned on"); @@ -483,12 +486,11 @@ static int f2fs_check_quota_options(struct f2fs_sb_info *sbi) } #endif -static int f2fs_set_test_dummy_encryption(struct super_block *sb, +static int f2fs_set_test_dummy_encryption(struct f2fs_sb_info *sbi, const char *opt, const substring_t *arg, bool is_remount) { - struct f2fs_sb_info *sbi = F2FS_SB(sb); struct fs_parameter param = { .type = fs_value_is_string, .string = arg->from ? 
arg->from : "", @@ -671,9 +673,8 @@ static int f2fs_set_zstd_level(struct f2fs_sb_info *sbi, const char *str) #endif #endif -static int parse_options(struct super_block *sb, char *options, bool is_remount) +static int parse_options(struct f2fs_sb_info *sbi, char *options, bool is_remount) { - struct f2fs_sb_info *sbi = F2FS_SB(sb); substring_t args[MAX_OPT_ARGS]; #ifdef CONFIG_F2FS_FS_COMPRESSION unsigned char (*ext)[F2FS_EXTENSION_LEN]; @@ -687,7 +688,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) int ret; if (!options) - goto default_check; + return 0; while ((p = strsep(&options, ",")) != NULL) { int token; @@ -728,10 +729,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) set_opt(sbi, DISABLE_ROLL_FORWARD); break; case Opt_norecovery: - /* this option mounts f2fs with ro */ + /* requires ro mount, checked in f2fs_default_check */ set_opt(sbi, NORECOVERY); - if (!f2fs_readonly(sb)) - return -EINVAL; break; case Opt_discard: if (!f2fs_hw_support_discard(sbi)) { @@ -772,16 +771,11 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; #else case Opt_user_xattr: - f2fs_info(sbi, "user_xattr options not supported"); - break; case Opt_nouser_xattr: - f2fs_info(sbi, "nouser_xattr options not supported"); - break; case Opt_inline_xattr: - f2fs_info(sbi, "inline_xattr options not supported"); - break; case Opt_noinline_xattr: - f2fs_info(sbi, "noinline_xattr options not supported"); + case Opt_inline_xattr_size: + f2fs_info(sbi, "xattr options not supported"); break; #endif #ifdef CONFIG_F2FS_FS_POSIX_ACL @@ -793,10 +787,8 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; #else case Opt_acl: - f2fs_info(sbi, "acl options not supported"); - break; case Opt_noacl: - f2fs_info(sbi, "noacl options not supported"); + f2fs_info(sbi, "acl options not supported"); break; #endif case Opt_active_logs: @@ -838,7 +830,7 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) set_opt(sbi, READ_EXTENT_CACHE); break; case Opt_noextent_cache: - if (F2FS_HAS_FEATURE(sbi, F2FS_FEATURE_DEVICE_ALIAS)) { + if (f2fs_sb_has_device_alias(sbi)) { f2fs_err(sbi, "device aliasing requires extent cache"); return -EINVAL; } @@ -919,18 +911,15 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) break; #else case Opt_fault_injection: - f2fs_info(sbi, "fault_injection options not supported"); - break; - case Opt_fault_type: - f2fs_info(sbi, "fault_type options not supported"); + f2fs_info(sbi, "fault injection options not supported"); break; #endif case Opt_lazytime: - sb->s_flags |= SB_LAZYTIME; + set_opt(sbi, LAZYTIME); break; case Opt_nolazytime: - sb->s_flags &= ~SB_LAZYTIME; + clear_opt(sbi, LAZYTIME); break; #ifdef CONFIG_QUOTA case Opt_quota: @@ -944,32 +933,32 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) set_opt(sbi, PRJQUOTA); break; case Opt_usrjquota: - ret = f2fs_set_qf_name(sb, USRQUOTA, &args[0]); + ret = f2fs_set_qf_name(sbi, USRQUOTA, &args[0]); if (ret) return ret; break; case Opt_grpjquota: - ret = f2fs_set_qf_name(sb, GRPQUOTA, &args[0]); + ret = f2fs_set_qf_name(sbi, GRPQUOTA, &args[0]); if (ret) return ret; break; case Opt_prjjquota: - ret = f2fs_set_qf_name(sb, PRJQUOTA, &args[0]); + ret = f2fs_set_qf_name(sbi, PRJQUOTA, &args[0]); if (ret) return ret; break; case Opt_offusrjquota: - ret = f2fs_clear_qf_name(sb, USRQUOTA); + ret = f2fs_clear_qf_name(sbi, 
USRQUOTA); if (ret) return ret; break; case Opt_offgrpjquota: - ret = f2fs_clear_qf_name(sb, GRPQUOTA); + ret = f2fs_clear_qf_name(sbi, GRPQUOTA); if (ret) return ret; break; case Opt_offprjjquota: - ret = f2fs_clear_qf_name(sb, PRJQUOTA); + ret = f2fs_clear_qf_name(sbi, PRJQUOTA); if (ret) return ret; break; @@ -1039,14 +1028,14 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) kfree(name); break; case Opt_test_dummy_encryption: - ret = f2fs_set_test_dummy_encryption(sb, p, &args[0], + ret = f2fs_set_test_dummy_encryption(sbi, p, &args[0], is_remount); if (ret) return ret; break; case Opt_inlinecrypt: #ifdef CONFIG_FS_ENCRYPTION_INLINE_CRYPT - sb->s_flags |= SB_INLINECRYPT; + set_opt(sbi, INLINECRYPT); #else f2fs_info(sbi, "inline encryption not supported"); #endif @@ -1322,13 +1311,20 @@ static int parse_options(struct super_block *sb, char *options, bool is_remount) } kfree(name); break; + case Opt_nat_bits: + set_opt(sbi, NAT_BITS); + break; default: f2fs_err(sbi, "Unrecognized mount option \"%s\" or missing value", p); return -EINVAL; } } -default_check: + return 0; +} + +static int f2fs_default_check(struct f2fs_sb_info *sbi) +{ #ifdef CONFIG_QUOTA if (f2fs_check_quota_options(sbi)) return -EINVAL; @@ -1418,6 +1414,12 @@ default_check: f2fs_err(sbi, "Allow to mount readonly mode only"); return -EROFS; } + + if (test_opt(sbi, NORECOVERY) && !f2fs_readonly(sbi->sb)) { + f2fs_err(sbi, "norecovery requires readonly mount"); + return -EINVAL; + } + return 0; } @@ -1441,6 +1443,7 @@ static struct inode *f2fs_alloc_inode(struct super_block *sb) spin_lock_init(&fi->i_size_lock); INIT_LIST_HEAD(&fi->dirty_list); INIT_LIST_HEAD(&fi->gdirty_list); + INIT_LIST_HEAD(&fi->gdonate_list); init_f2fs_rwsem(&fi->i_gc_rwsem[READ]); init_f2fs_rwsem(&fi->i_gc_rwsem[WRITE]); init_f2fs_rwsem(&fi->i_xattr_sem); @@ -1527,6 +1530,10 @@ int f2fs_inode_dirtied(struct inode *inode, bool sync) inc_page_count(sbi, F2FS_DIRTY_IMETA); } spin_unlock(&sbi->inode_lock[DIRTY_META]); + + if (!ret && f2fs_is_atomic_file(inode)) + set_inode_flag(inode, FI_ATOMIC_DIRTIED); + return ret; } @@ -1737,22 +1744,28 @@ int f2fs_sync_fs(struct super_block *sb, int sync) static int f2fs_freeze(struct super_block *sb) { + struct f2fs_sb_info *sbi = F2FS_SB(sb); + if (f2fs_readonly(sb)) return 0; /* IO error happened before */ - if (unlikely(f2fs_cp_error(F2FS_SB(sb)))) + if (unlikely(f2fs_cp_error(sbi))) return -EIO; /* must be clean, since sync_filesystem() was already called */ - if (is_sbi_flag_set(F2FS_SB(sb), SBI_IS_DIRTY)) + if (is_sbi_flag_set(sbi, SBI_IS_DIRTY)) return -EINVAL; + sbi->umount_lock_holder = current; + /* Let's flush checkpoints and stop the thread. 
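f2fs_freeze() above brackets the checkpoint flush with sbi->umount_lock_holder = current, the same marker the quota, remount, and mount paths set later in this patch. The point of such a holder marker is that code reached underneath can ask whether it is already running inside the guarded region instead of blindly re-taking a lock. A simplified pthread model of the idea (not the kernel's implementation, and the exact way f2fs consumes the marker is not shown in this hunk):

    #include <pthread.h>
    #include <stdbool.h>

    struct sb_info {
            pthread_mutex_t umount_mutex;
            pthread_t holder;       /* stand-in for umount_lock_holder */
            bool have_holder;
    };

    static void enter_umount_path(struct sb_info *sbi)
    {
            pthread_mutex_lock(&sbi->umount_mutex);
            sbi->holder = pthread_self();
            sbi->have_holder = true;
    }

    static void leave_umount_path(struct sb_info *sbi)
    {
            sbi->have_holder = false;
            pthread_mutex_unlock(&sbi->umount_mutex);
    }

    /* Nested code: am I already inside the umount/freeze critical path? */
    static bool under_umount_path(struct sb_info *sbi)
    {
            return sbi->have_holder &&
                   pthread_equal(sbi->holder, pthread_self());
    }
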
*/ - f2fs_flush_ckpt_thread(F2FS_SB(sb)); + f2fs_flush_ckpt_thread(sbi); + + sbi->umount_lock_holder = NULL; /* to avoid deadlock on f2fs_evict_inode->SB_FREEZE_FS */ - set_sbi_flag(F2FS_SB(sb), SBI_IS_FREEZING); + set_sbi_flag(sbi, SBI_IS_FREEZING); return 0; } @@ -1836,7 +1849,8 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_blocks = total_count - start_count; spin_lock(&sbi->stat_lock); - + if (sbi->carve_out) + buf->f_blocks -= sbi->current_reserved_blocks; user_block_count = sbi->user_block_count; total_valid_node_count = valid_node_count(sbi); avail_node_count = sbi->total_node_count - F2FS_RESERVED_NODE_NUM; @@ -2128,6 +2142,9 @@ static int f2fs_show_options(struct seq_file *seq, struct dentry *root) else if (F2FS_OPTION(sbi).errors == MOUNT_ERRORS_PANIC) seq_printf(seq, ",errors=%s", "panic"); + if (test_opt(sbi, NAT_BITS)) + seq_puts(seq, ",nat_bits"); + return 0; } @@ -2175,8 +2192,8 @@ static void default_options(struct f2fs_sb_info *sbi, bool remount) set_opt(sbi, INLINE_DATA); set_opt(sbi, INLINE_DENTRY); set_opt(sbi, MERGE_CHECKPOINT); + set_opt(sbi, LAZYTIME); F2FS_OPTION(sbi).unusable_cap = 0; - sbi->sb->s_flags |= SB_LAZYTIME; if (!f2fs_is_readonly(sbi)) set_opt(sbi, FLUSH_MERGE); if (f2fs_sb_has_blkzoned(sbi)) @@ -2318,6 +2335,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) bool no_discard = !test_opt(sbi, DISCARD); bool no_compress_cache = !test_opt(sbi, COMPRESS_CACHE); bool block_unit_discard = f2fs_block_unit_discard(sbi); + bool no_nat_bits = !test_opt(sbi, NAT_BITS); #ifdef CONFIG_QUOTA int i, j; #endif @@ -2329,6 +2347,8 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) org_mount_opt = sbi->mount_opt; old_sb_flags = sb->s_flags; + sbi->umount_lock_holder = current; + #ifdef CONFIG_QUOTA org_mount_opt.s_jquota_fmt = F2FS_OPTION(sbi).s_jquota_fmt; for (i = 0; i < MAXQUOTAS; i++) { @@ -2359,7 +2379,7 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) default_options(sbi, true); /* parse mount options */ - err = parse_options(sb, data, true); + err = parse_options(sbi, data, true); if (err) goto restore_opts; @@ -2374,6 +2394,10 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) } #endif + err = f2fs_default_check(sbi); + if (err) + goto restore_opts; + /* flush outstanding errors before changing fs state */ flush_work(&sbi->s_error_work); @@ -2444,6 +2468,12 @@ static int f2fs_remount(struct super_block *sb, int *flags, char *data) goto restore_opts; } + if (no_nat_bits == !!test_opt(sbi, NAT_BITS)) { + err = -EINVAL; + f2fs_warn(sbi, "switch nat_bits option is not allowed"); + goto restore_opts; + } + if ((*flags & SB_RDONLY) && test_opt(sbi, DISABLE_CHECKPOINT)) { err = -EINVAL; f2fs_warn(sbi, "disabling checkpoint not compatible with read-only"); @@ -2552,6 +2582,8 @@ skip: limit_reserve_root(sbi); *flags = (*flags & ~SB_LAZYTIME) | (sb->s_flags & SB_LAZYTIME); + + sbi->umount_lock_holder = NULL; return 0; restore_checkpoint: if (need_enable_checkpoint) { @@ -2592,6 +2624,8 @@ restore_opts: #endif sbi->mount_opt = org_mount_opt; sb->s_flags = old_sb_flags; + + sbi->umount_lock_holder = NULL; return err; } @@ -2908,7 +2942,7 @@ out: return ret; } -int f2fs_quota_sync(struct super_block *sb, int type) +int f2fs_do_quota_sync(struct super_block *sb, int type) { struct f2fs_sb_info *sbi = F2FS_SB(sb); struct quota_info *dqopt = sb_dqopt(sb); @@ -2956,11 +2990,21 @@ int f2fs_quota_sync(struct super_block *sb, int type) return ret; } +static 
int f2fs_quota_sync(struct super_block *sb, int type) +{ + int ret; + + F2FS_SB(sb)->umount_lock_holder = current; + ret = f2fs_do_quota_sync(sb, type); + F2FS_SB(sb)->umount_lock_holder = NULL; + return ret; +} + static int f2fs_quota_on(struct super_block *sb, int type, int format_id, const struct path *path) { struct inode *inode; - int err; + int err = 0; /* if quota sysfile exists, deny enabling quota with specific file */ if (f2fs_sb_has_quota_ino(F2FS_SB(sb))) { @@ -2971,31 +3015,34 @@ static int f2fs_quota_on(struct super_block *sb, int type, int format_id, if (path->dentry->d_sb != sb) return -EXDEV; - err = f2fs_quota_sync(sb, type); + F2FS_SB(sb)->umount_lock_holder = current; + + err = f2fs_do_quota_sync(sb, type); if (err) - return err; + goto out; inode = d_inode(path->dentry); err = filemap_fdatawrite(inode->i_mapping); if (err) - return err; + goto out; err = filemap_fdatawait(inode->i_mapping); if (err) - return err; + goto out; err = dquot_quota_on(sb, type, format_id, path); if (err) - return err; + goto out; inode_lock(inode); F2FS_I(inode)->i_flags |= F2FS_QUOTA_DEFAULT_FL; f2fs_set_inode_flags(inode); inode_unlock(inode); f2fs_mark_inode_dirty_sync(inode, false); - - return 0; +out: + F2FS_SB(sb)->umount_lock_holder = NULL; + return err; } static int __f2fs_quota_off(struct super_block *sb, int type) @@ -3006,7 +3053,7 @@ static int __f2fs_quota_off(struct super_block *sb, int type) if (!inode || !igrab(inode)) return dquot_quota_off(sb, type); - err = f2fs_quota_sync(sb, type); + err = f2fs_do_quota_sync(sb, type); if (err) goto out_put; @@ -3029,6 +3076,8 @@ static int f2fs_quota_off(struct super_block *sb, int type) struct f2fs_sb_info *sbi = F2FS_SB(sb); int err; + F2FS_SB(sb)->umount_lock_holder = current; + err = __f2fs_quota_off(sb, type); /* @@ -3038,6 +3087,9 @@ static int f2fs_quota_off(struct super_block *sb, int type) */ if (is_journalled_quota(sbi)) set_sbi_flag(sbi, SBI_QUOTA_NEED_REPAIR); + + F2FS_SB(sb)->umount_lock_holder = NULL; + return err; } @@ -3170,7 +3222,7 @@ int f2fs_dquot_initialize(struct inode *inode) return 0; } -int f2fs_quota_sync(struct super_block *sb, int type) +int f2fs_do_quota_sync(struct super_block *sb, int type) { return 0; } @@ -4220,6 +4272,8 @@ void f2fs_handle_critical_error(struct f2fs_sb_info *sbi, unsigned char reason) if (shutdown) set_sbi_flag(sbi, SBI_IS_SHUTDOWN); + else + dump_stack(); /* * Continue filesystem operators if errors=continue. Should not set @@ -4495,7 +4549,11 @@ try_onemore: goto free_sb_buf; } - err = parse_options(sb, options, false); + err = parse_options(sbi, options, false); + if (err) + goto free_options; + + err = f2fs_default_check(sbi); if (err) goto free_options; @@ -4533,6 +4591,14 @@ try_onemore: sb->s_time_gran = 1; sb->s_flags = (sb->s_flags & ~SB_POSIXACL) | (test_opt(sbi, POSIX_ACL) ? 
SB_POSIXACL : 0); + if (test_opt(sbi, INLINECRYPT)) + sb->s_flags |= SB_INLINECRYPT; + + if (test_opt(sbi, LAZYTIME)) + sb->s_flags |= SB_LAZYTIME; + else + sb->s_flags &= ~SB_LAZYTIME; + super_set_uuid(sb, (void *) raw_super->uuid, sizeof(raw_super->uuid)); super_set_sysfs_name_bdev(sb); sb->s_iflags |= SB_I_CGROUPWB; @@ -4703,6 +4769,7 @@ try_onemore: if (err) goto free_compress_inode; + sbi->umount_lock_holder = current; #ifdef CONFIG_QUOTA /* Enable quota usage during mount */ if (f2fs_sb_has_quota_ino(sbi) && !f2fs_readonly(sb)) { @@ -4718,8 +4785,10 @@ try_onemore: if (err) goto free_meta; - if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG))) + if (unlikely(is_set_ckpt_flags(sbi, CP_DISABLED_FLAG))) { + skip_recovery = true; goto reset_checkpoint; + } /* recover fsynced data */ if (!test_opt(sbi, DISABLE_ROLL_FORWARD) && @@ -4769,10 +4838,10 @@ try_onemore: } } +reset_checkpoint: #ifdef CONFIG_QUOTA f2fs_recover_quota_end(sbi, quota_enabled); #endif -reset_checkpoint: /* * If the f2fs is not readonly and fsync data recovery succeeds, * write pointer consistency of cursegs and other zones are already @@ -4829,6 +4898,8 @@ reset_checkpoint: f2fs_update_time(sbi, CP_TIME); f2fs_update_time(sbi, REQ_TIME); clear_sbi_flag(sbi, SBI_CP_DISABLED_QUICK); + + sbi->umount_lock_holder = NULL; return 0; sync_free_meta: @@ -4931,6 +5002,8 @@ static void kill_f2fs_super(struct super_block *sb) struct f2fs_sb_info *sbi = F2FS_SB(sb); if (sb->s_root) { + sbi->umount_lock_holder = current; + set_sbi_flag(sbi, SBI_IS_CLOSE); f2fs_stop_gc_thread(sbi); f2fs_stop_discard_thread(sbi); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index d15c68b28952..c69161366467 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -61,6 +61,12 @@ struct f2fs_attr { int id; }; +struct f2fs_base_attr { + struct attribute attr; + ssize_t (*show)(struct f2fs_base_attr *a, char *buf); + ssize_t (*store)(struct f2fs_base_attr *a, const char *buf, size_t len); +}; + static ssize_t f2fs_sbi_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf); @@ -862,6 +868,25 @@ static void f2fs_sb_release(struct kobject *kobj) complete(&sbi->s_kobj_unregister); } +static ssize_t f2fs_base_attr_show(struct kobject *kobj, + struct attribute *attr, char *buf) +{ + struct f2fs_base_attr *a = container_of(attr, + struct f2fs_base_attr, attr); + + return a->show ? a->show(a, buf) : 0; +} + +static ssize_t f2fs_base_attr_store(struct kobject *kobj, + struct attribute *attr, + const char *buf, size_t len) +{ + struct f2fs_base_attr *a = container_of(attr, + struct f2fs_base_attr, attr); + + return a->store ? a->store(a, buf, len) : 0; +} + /* * Note that there are three feature list entries: * 1) /sys/fs/f2fs/features @@ -880,18 +905,50 @@ static void f2fs_sb_release(struct kobject *kobj) * please add new on-disk feature in this list only. * - ref. 
F2FS_SB_FEATURE_RO_ATTR() */ -static ssize_t f2fs_feature_show(struct f2fs_attr *a, - struct f2fs_sb_info *sbi, char *buf) +static ssize_t f2fs_feature_show(struct f2fs_base_attr *a, char *buf) { return sysfs_emit(buf, "supported\n"); } #define F2FS_FEATURE_RO_ATTR(_name) \ -static struct f2fs_attr f2fs_attr_##_name = { \ +static struct f2fs_base_attr f2fs_base_attr_##_name = { \ .attr = {.name = __stringify(_name), .mode = 0444 }, \ .show = f2fs_feature_show, \ } +static ssize_t f2fs_tune_show(struct f2fs_base_attr *a, char *buf) +{ + unsigned int res = 0; + + if (!strcmp(a->attr.name, "reclaim_caches_kb")) + res = f2fs_donate_files(); + + return sysfs_emit(buf, "%u\n", res); +} + +static ssize_t f2fs_tune_store(struct f2fs_base_attr *a, + const char *buf, size_t count) +{ + unsigned long t; + int ret; + + ret = kstrtoul(skip_spaces(buf), 0, &t); + if (ret) + return ret; + + if (!strcmp(a->attr.name, "reclaim_caches_kb")) + f2fs_reclaim_caches(t); + + return count; +} + +#define F2FS_TUNE_RW_ATTR(_name) \ +static struct f2fs_base_attr f2fs_base_attr_##_name = { \ + .attr = {.name = __stringify(_name), .mode = 0644 }, \ + .show = f2fs_tune_show, \ + .store = f2fs_tune_store, \ +} + static ssize_t f2fs_sb_feature_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -1065,6 +1122,7 @@ F2FS_SBI_GENERAL_RW_ATTR(max_read_extent_count); F2FS_SBI_GENERAL_RO_ATTR(unusable_blocks_per_sec); F2FS_SBI_GENERAL_RW_ATTR(blkzone_alloc_policy); #endif +F2FS_SBI_GENERAL_RW_ATTR(carve_out); /* STAT_INFO ATTR */ #ifdef CONFIG_F2FS_STAT_FS @@ -1252,41 +1310,43 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(warm_data_age_threshold), ATTR_LIST(last_age_weight), ATTR_LIST(max_read_extent_count), + ATTR_LIST(carve_out), NULL, }; ATTRIBUTE_GROUPS(f2fs); +#define BASE_ATTR_LIST(name) (&f2fs_base_attr_##name.attr) static struct attribute *f2fs_feat_attrs[] = { #ifdef CONFIG_FS_ENCRYPTION - ATTR_LIST(encryption), - ATTR_LIST(test_dummy_encryption_v2), + BASE_ATTR_LIST(encryption), + BASE_ATTR_LIST(test_dummy_encryption_v2), #if IS_ENABLED(CONFIG_UNICODE) - ATTR_LIST(encrypted_casefold), + BASE_ATTR_LIST(encrypted_casefold), #endif #endif /* CONFIG_FS_ENCRYPTION */ #ifdef CONFIG_BLK_DEV_ZONED - ATTR_LIST(block_zoned), + BASE_ATTR_LIST(block_zoned), #endif - ATTR_LIST(atomic_write), - ATTR_LIST(extra_attr), - ATTR_LIST(project_quota), - ATTR_LIST(inode_checksum), - ATTR_LIST(flexible_inline_xattr), - ATTR_LIST(quota_ino), - ATTR_LIST(inode_crtime), - ATTR_LIST(lost_found), + BASE_ATTR_LIST(atomic_write), + BASE_ATTR_LIST(extra_attr), + BASE_ATTR_LIST(project_quota), + BASE_ATTR_LIST(inode_checksum), + BASE_ATTR_LIST(flexible_inline_xattr), + BASE_ATTR_LIST(quota_ino), + BASE_ATTR_LIST(inode_crtime), + BASE_ATTR_LIST(lost_found), #ifdef CONFIG_FS_VERITY - ATTR_LIST(verity), + BASE_ATTR_LIST(verity), #endif - ATTR_LIST(sb_checksum), + BASE_ATTR_LIST(sb_checksum), #if IS_ENABLED(CONFIG_UNICODE) - ATTR_LIST(casefold), + BASE_ATTR_LIST(casefold), #endif - ATTR_LIST(readonly), + BASE_ATTR_LIST(readonly), #ifdef CONFIG_F2FS_FS_COMPRESSION - ATTR_LIST(compression), + BASE_ATTR_LIST(compression), #endif - ATTR_LIST(pin_file), + BASE_ATTR_LIST(pin_file), NULL, }; ATTRIBUTE_GROUPS(f2fs_feat); @@ -1343,6 +1403,14 @@ static struct attribute *f2fs_sb_feat_attrs[] = { }; ATTRIBUTE_GROUPS(f2fs_sb_feat); +F2FS_TUNE_RW_ATTR(reclaim_caches_kb); + +static struct attribute *f2fs_tune_attrs[] = { + BASE_ATTR_LIST(reclaim_caches_kb), + NULL, +}; +ATTRIBUTE_GROUPS(f2fs_tune); + static const struct sysfs_ops f2fs_attr_ops 
= { .show = f2fs_attr_show, .store = f2fs_attr_store, @@ -1362,15 +1430,34 @@ static struct kset f2fs_kset = { .kobj = {.ktype = &f2fs_ktype}, }; +static const struct sysfs_ops f2fs_feat_attr_ops = { + .show = f2fs_base_attr_show, + .store = f2fs_base_attr_store, +}; + static const struct kobj_type f2fs_feat_ktype = { .default_groups = f2fs_feat_groups, - .sysfs_ops = &f2fs_attr_ops, + .sysfs_ops = &f2fs_feat_attr_ops, }; static struct kobject f2fs_feat = { .kset = &f2fs_kset, }; +static const struct sysfs_ops f2fs_tune_attr_ops = { + .show = f2fs_base_attr_show, + .store = f2fs_base_attr_store, +}; + +static const struct kobj_type f2fs_tune_ktype = { + .default_groups = f2fs_tune_groups, + .sysfs_ops = &f2fs_tune_attr_ops, +}; + +static struct kobject f2fs_tune = { + .kset = &f2fs_kset, +}; + static ssize_t f2fs_stat_attr_show(struct kobject *kobj, struct attribute *attr, char *buf) { @@ -1607,6 +1694,11 @@ int __init f2fs_init_sysfs(void) if (ret) goto put_kobject; + ret = kobject_init_and_add(&f2fs_tune, &f2fs_tune_ktype, + NULL, "tuning"); + if (ret) + goto put_kobject; + f2fs_proc_root = proc_mkdir("fs/f2fs", NULL); if (!f2fs_proc_root) { ret = -ENOMEM; @@ -1614,7 +1706,9 @@ int __init f2fs_init_sysfs(void) } return 0; + put_kobject: + kobject_put(&f2fs_tune); kobject_put(&f2fs_feat); kset_unregister(&f2fs_kset); return ret; @@ -1622,6 +1716,7 @@ put_kobject: void f2fs_exit_sysfs(void) { + kobject_put(&f2fs_tune); kobject_put(&f2fs_feat); kset_unregister(&f2fs_kset); remove_proc_entry("fs/f2fs", NULL); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index 3f3874943679..c691b35618ad 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -282,7 +282,7 @@ static int read_inline_xattr(struct inode *inode, struct page *ipage, if (ipage) { inline_addr = inline_xattr_addr(inode, ipage); } else { - page = f2fs_get_node_page(sbi, inode->i_ino); + page = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(page)) return PTR_ERR(page); @@ -303,7 +303,7 @@ static int read_xattr_block(struct inode *inode, void *txattr_addr) void *xattr_addr; /* The inode already has an extended attribute block. 
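The xattr.c hunks here are the call-site half of the node.c change earlier in this patch: readers that know they are fetching the inode block or an xattr node now call f2fs_get_inode_page()/f2fs_get_xnode_page(), so the footer check can verify the node kind rather than just the nid. The wrappers reduce to something like this sketch (opaque stand-in types, function names as in the patch minus the f2fs_ prefix):

    struct f2fs_sb_info;
    struct page;
    enum node_type { NODE_TYPE_REGULAR, NODE_TYPE_INODE, NODE_TYPE_XATTR };

    /* Core lookup takes the expected kind; callers pick a typed wrapper. */
    struct page *__get_node(struct f2fs_sb_info *sbi, unsigned long nid,
                            enum node_type ntype);

    static inline struct page *get_inode_page(struct f2fs_sb_info *sbi,
                                              unsigned long ino)
    { return __get_node(sbi, ino, NODE_TYPE_INODE); }

    static inline struct page *get_xnode_page(struct f2fs_sb_info *sbi,
                                              unsigned long xnid)
    { return __get_node(sbi, xnid, NODE_TYPE_XATTR); }
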
*/ - xpage = f2fs_get_node_page(sbi, xnid); + xpage = f2fs_get_xnode_page(sbi, xnid); if (IS_ERR(xpage)) return PTR_ERR(xpage); @@ -449,7 +449,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, if (ipage) { inline_addr = inline_xattr_addr(inode, ipage); } else { - in_page = f2fs_get_node_page(sbi, inode->i_ino); + in_page = f2fs_get_inode_page(sbi, inode->i_ino); if (IS_ERR(in_page)) { f2fs_alloc_nid_failed(sbi, new_nid); return PTR_ERR(in_page); @@ -475,7 +475,7 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, /* write to xattr node block */ if (F2FS_I(inode)->i_xattr_nid) { - xpage = f2fs_get_node_page(sbi, F2FS_I(inode)->i_xattr_nid); + xpage = f2fs_get_xnode_page(sbi, F2FS_I(inode)->i_xattr_nid); if (IS_ERR(xpage)) { err = PTR_ERR(xpage); f2fs_alloc_nid_failed(sbi, new_nid); diff --git a/fs/fat/namei_msdos.c b/fs/fat/namei_msdos.c index f06f6ba643cc..23e9b9371ec3 100644 --- a/fs/fat/namei_msdos.c +++ b/fs/fat/namei_msdos.c @@ -339,8 +339,8 @@ out: } /***** Make a directory */ -static int msdos_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *msdos_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; struct fat_slot_info sinfo; @@ -389,13 +389,13 @@ static int msdos_mkdir(struct mnt_idmap *idmap, struct inode *dir, mutex_unlock(&MSDOS_SB(sb)->s_lock); fat_flush_inodes(sb, dir, inode); - return 0; + return NULL; out_free: fat_free_clusters(dir, cluster); out: mutex_unlock(&MSDOS_SB(sb)->s_lock); - return err; + return ERR_PTR(err); } /***** Unlink a file */ diff --git a/fs/fat/namei_vfat.c b/fs/fat/namei_vfat.c index 926c26e90ef8..dd910edd2404 100644 --- a/fs/fat/namei_vfat.c +++ b/fs/fat/namei_vfat.c @@ -841,8 +841,8 @@ out: return err; } -static int vfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *vfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct super_block *sb = dir->i_sb; struct inode *inode; @@ -877,13 +877,13 @@ static int vfat_mkdir(struct mnt_idmap *idmap, struct inode *dir, d_instantiate(dentry, inode); mutex_unlock(&MSDOS_SB(sb)->s_lock); - return 0; + return NULL; out_free: fat_free_clusters(dir, cluster); out: mutex_unlock(&MSDOS_SB(sb)->s_lock); - return err; + return ERR_PTR(err); } static int vfat_get_dotdot_de(struct inode *inode, struct buffer_head **bh, diff --git a/fs/file.c b/fs/file.c index d868cdb95d1e..3a3146664cf3 100644 --- a/fs/file.c +++ b/fs/file.c @@ -26,6 +26,28 @@ #include "internal.h" +static noinline bool __file_ref_put_badval(file_ref_t *ref, unsigned long cnt) +{ + /* + * If the reference count was already in the dead zone, then this + * put() operation is imbalanced. Warn, put the reference count back to + * DEAD and tell the caller to not deconstruct the object. + */ + if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) { + atomic_long_set(&ref->refcnt, FILE_REF_DEAD); + return false; + } + + /* + * This is a put() operation on a saturated refcount. Restore the + * mean saturation value and tell the caller to not deconstruct the + * object. 
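The comment block above, kept verbatim as the code moves into __file_ref_put_badval(), describes two distinct bad states: an over-released count is pinned to DEAD so the imbalance can never free the object, and a saturated count is pinned back to a mid-saturation value so further gets and puts cannot wrap it. A compilable model with illustrative zone constants (the kernel's actual FILE_REF_* layout differs):

    #include <limits.h>
    #include <stdatomic.h>
    #include <stdbool.h>

    /* Illustrative zones on an unsigned counter: ordinary counts are
     * small; anything above MAXREF is saturated; RELEASED/DEAD mark a
     * poisoned object. */
    #define REF_MAXREF      (ULONG_MAX / 4)
    #define REF_SATURATED   (ULONG_MAX / 2)
    #define REF_RELEASED    (ULONG_MAX - 2)
    #define REF_DEAD        (ULONG_MAX - 1)

    static bool ref_put_badval(atomic_ulong *ref, unsigned long cnt)
    {
            if (cnt >= REF_RELEASED) {
                    atomic_store(ref, REF_DEAD); /* imbalanced put */
                    return false;                /* never free here */
            }
            if (cnt > REF_MAXREF)
                    atomic_store(ref, REF_SATURATED);
            return false;
    }
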
+ */ + if (cnt > FILE_REF_MAXREF) + atomic_long_set(&ref->refcnt, FILE_REF_SATURATED); + return false; +} + /** * __file_ref_put - Slowpath of file_ref_put() * @ref: Pointer to the reference count @@ -67,24 +89,7 @@ bool __file_ref_put(file_ref_t *ref, unsigned long cnt) return true; } - /* - * If the reference count was already in the dead zone, then this - * put() operation is imbalanced. Warn, put the reference count back to - * DEAD and tell the caller to not deconstruct the object. - */ - if (WARN_ONCE(cnt >= FILE_REF_RELEASED, "imbalanced put on file reference count")) { - atomic_long_set(&ref->refcnt, FILE_REF_DEAD); - return false; - } - - /* - * This is a put() operation on a saturated refcount. Restore the - * mean saturation value and tell the caller to not deconstruct the - * object. - */ - if (cnt > FILE_REF_MAXREF) - atomic_long_set(&ref->refcnt, FILE_REF_SATURATED); - return false; + return __file_ref_put_badval(ref, cnt); } EXPORT_SYMBOL_GPL(__file_ref_put); @@ -418,17 +423,25 @@ struct files_struct *dup_fd(struct files_struct *oldf, struct fd_range *punch_ho old_fds = old_fdt->fd; new_fds = new_fdt->fd; + /* + * We may be racing against fd allocation from other threads using this + * files_struct, despite holding ->file_lock. + * + * alloc_fd() might have already claimed a slot, while fd_install() + * did not populate it yet. Note the latter operates locklessly, so + * the file can show up as we are walking the array below. + * + * At the same time we know no files will disappear as all other + * operations take the lock. + * + * Instead of trying to placate userspace racing with itself, we + * ref the file if we see it and mark the fd slot as unused otherwise. + */ for (i = open_files; i != 0; i--) { - struct file *f = *old_fds++; + struct file *f = rcu_dereference_raw(*old_fds++); if (f) { get_file(f); } else { - /* - * The fd may be claimed in the fd bitmap but not yet - * instantiated in the files array if a sibling thread - * is partway through open(). So make sure that this - * fd is available to the new process. - */ __clear_open_fd(open_files - i, new_fdt); } rcu_assign_pointer(*new_fds++, f); @@ -577,6 +590,7 @@ repeat: __set_open_fd(fd, fdt, flags & O_CLOEXEC); error = fd; + VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); out: spin_unlock(&files->file_lock); @@ -612,22 +626,14 @@ void put_unused_fd(unsigned int fd) EXPORT_SYMBOL(put_unused_fd); -/* - * Install a file pointer in the fd array. - * - * The VFS is full of places where we drop the files lock between - * setting the open_fds bitmap and installing the file in the file - * array. At any such point, we are vulnerable to a dup2() race - * installing a file in the array before us. We need to detect this and - * fput() the struct file we are about to overwrite in this case. - * - * It should never happen - if we allow dup2() do it, _really_ bad things - * will follow. +/** + * fd_install - install a file pointer in the fd array + * @fd: file descriptor to install the file in + * @file: the file to install * * This consumes the "file" refcount, so callers should treat it * as if they had called fput(file). 
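The rewritten kerneldoc above pares the old dup2() digression down to the one contract callers need: fd_install() consumes the file reference. The usual pairing it implies, sketched for a hypothetical in-kernel caller (the helper name is made up; the APIs are the real ones):

    #include <linux/file.h>
    #include <linux/fcntl.h>

    /* Reserve a slot, publish the file last; if no slot is available,
     * drop the reference we were handed. */
    static int publish_file_example(struct file *file)
    {
            int fd = get_unused_fd_flags(O_CLOEXEC);

            if (fd < 0) {
                    fput(file);
                    return fd;
            }
            fd_install(fd, file);   /* consumes the reference */
            return fd;
    }
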
*/ - void fd_install(unsigned int fd, struct file *file) { struct files_struct *files = current->files; @@ -642,7 +648,7 @@ void fd_install(unsigned int fd, struct file *file) rcu_read_unlock_sched(); spin_lock(&files->file_lock); fdt = files_fdtable(files); - WARN_ON(fdt->fd[fd] != NULL); + VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); rcu_assign_pointer(fdt->fd[fd], file); spin_unlock(&files->file_lock); return; @@ -650,7 +656,7 @@ void fd_install(unsigned int fd, struct file *file) /* coupled with smp_wmb() in expand_fdtable() */ smp_rmb(); fdt = rcu_dereference_sched(files->fdt); - BUG_ON(fdt->fd[fd] != NULL); + VFS_BUG_ON(rcu_access_pointer(fdt->fd[fd]) != NULL); rcu_assign_pointer(fdt->fd[fd], file); rcu_read_unlock_sched(); } @@ -679,7 +685,7 @@ struct file *file_close_fd_locked(struct files_struct *files, unsigned fd) return NULL; fd = array_index_nospec(fd, fdt->max_fds); - file = fdt->fd[fd]; + file = rcu_dereference_raw(fdt->fd[fd]); if (file) { rcu_assign_pointer(fdt->fd[fd], NULL); __put_unused_fd(files, fd); @@ -1178,8 +1184,23 @@ struct fd fdget_raw(unsigned int fd) */ static inline bool file_needs_f_pos_lock(struct file *file) { - return (file->f_mode & FMODE_ATOMIC_POS) && - (file_count(file) > 1 || file->f_op->iterate_shared); + if (!(file->f_mode & FMODE_ATOMIC_POS)) + return false; + if (__file_ref_read_raw(&file->f_ref) != FILE_REF_ONEREF) + return true; + if (file->f_op->iterate_shared) + return true; + return false; +} + +bool file_seek_cur_needs_f_lock(struct file *file) +{ + if (!(file->f_mode & FMODE_ATOMIC_POS) && !file->f_op->iterate_shared) + return false; + + VFS_WARN_ON_ONCE((file_count(file) > 1) && + !mutex_is_locked(&file->f_pos_lock)); + return true; } struct fd fdget_pos(unsigned int fd) @@ -1187,7 +1208,7 @@ struct fd fdget_pos(unsigned int fd) struct fd f = fdget(fd); struct file *file = fd_file(f); - if (file && file_needs_f_pos_lock(file)) { + if (likely(file) && file_needs_f_pos_lock(file)) { f.word |= FDPUT_POS_UNLOCK; mutex_lock(&file->f_pos_lock); } @@ -1230,14 +1251,34 @@ __releases(&files->file_lock) struct fdtable *fdt; /* - * We need to detect attempts to do dup2() over allocated but still - * not finished descriptor. + * dup2() is expected to close the file installed in the target fd slot + * (if any). However, userspace hand-picking a fd may be racing against + * its own threads which happened to allocate it in open() et al but did + * not populate it yet. + * + * Broadly speaking we may be racing against the following: + * fd = get_unused_fd_flags(); // fd slot reserved, ->fd[fd] == NULL + * file = hard_work_goes_here(); + * fd_install(fd, file); // only now ->fd[fd] == file + * + * It is an invariant that a successfully allocated fd has a NULL entry + * in the array until the matching fd_install(). + * + * If we fit the window, we have the fd to populate, yet no target file + * to close. Trying to ignore it and install our new file would violate + * the invariant and make fd_install() overwrite our file. + * + * Things can be done(tm) to handle this. However, the issue does not + * concern legitimate programs and we only need to make sure the kernel + * does not trip over it. + * + * The simplest way out is to return an error if we find ourselves here. * * POSIX is silent on the issue, we return -EBUSY. 
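The expanded comment above is visible from userspace: the dup2(2) man page documents a Linux-only EBUSY for exactly this race with open(). A program that insists on hand-picking descriptor numbers across threads can treat it as transient:

    #include <errno.h>
    #include <unistd.h>

    /* EBUSY here means another thread reserved newfd but has not yet
     * reached fd_install(); retrying resolves the window. */
    static int dup2_retry(int oldfd, int newfd)
    {
            int r;

            do {
                    r = dup2(oldfd, newfd);
            } while (r < 0 && errno == EBUSY);
            return r;
    }
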
*/ fdt = files_fdtable(files); fd = array_index_nospec(fd, fdt->max_fds); - tofree = fdt->fd[fd]; + tofree = rcu_dereference_raw(fdt->fd[fd]); if (!tofree && fd_is_open(fd, fdt)) goto Ebusy; get_file(file); diff --git a/fs/file_table.c b/fs/file_table.c index 5c00dc38558d..138114d64307 100644 --- a/fs/file_table.c +++ b/fs/file_table.c @@ -102,7 +102,7 @@ EXPORT_SYMBOL_GPL(get_max_files); static int proc_nr_files(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { - files_stat.nr_files = get_nr_files(); + files_stat.nr_files = percpu_counter_sum_positive(&nr_files); return proc_doulongvec_minmax(table, write, buffer, lenp, ppos); } @@ -221,7 +221,8 @@ struct file *alloc_empty_file(int flags, const struct cred *cred) /* * Privileged users can go above max_files */ - if (get_nr_files() >= files_stat.max_files && !capable(CAP_SYS_ADMIN)) { + if (unlikely(get_nr_files() >= files_stat.max_files) && + !capable(CAP_SYS_ADMIN)) { /* * percpu_counters are inaccurate. Do an expensive check before * we go and fail. @@ -511,31 +512,37 @@ void flush_delayed_fput(void) } EXPORT_SYMBOL_GPL(flush_delayed_fput); -void fput(struct file *file) +static void __fput_deferred(struct file *file) { - if (file_ref_put(&file->f_ref)) { - struct task_struct *task = current; + struct task_struct *task = current; + + if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) { + file_free(file); + return; + } - if (unlikely(!(file->f_mode & (FMODE_BACKING | FMODE_OPENED)))) { - file_free(file); + if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { + init_task_work(&file->f_task_work, ____fput); + if (!task_work_add(task, &file->f_task_work, TWA_RESUME)) return; - } - if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) { - init_task_work(&file->f_task_work, ____fput); - if (!task_work_add(task, &file->f_task_work, TWA_RESUME)) - return; - /* - * After this task has run exit_task_work(), - * task_work_add() will fail. Fall through to delayed - * fput to avoid leaking *file. - */ - } - - if (llist_add(&file->f_llist, &delayed_fput_list)) - schedule_delayed_work(&delayed_fput_work, 1); + /* + * After this task has run exit_task_work(), + * task_work_add() will fail. Fall through to delayed + * fput to avoid leaking *file. + */ } + + if (llist_add(&file->f_llist, &delayed_fput_list)) + schedule_delayed_work(&delayed_fput_work, 1); } +void fput(struct file *file) +{ + if (unlikely(file_ref_put(&file->f_ref))) + __fput_deferred(file); +} +EXPORT_SYMBOL(fput); + /* * synchronous analog of fput(); for kernel threads that might be needed * in some umount() (and thus can't use flush_delayed_fput() without @@ -549,10 +556,32 @@ void __fput_sync(struct file *file) if (file_ref_put(&file->f_ref)) __fput(file); } - -EXPORT_SYMBOL(fput); EXPORT_SYMBOL(__fput_sync); +/* + * Equivalent to __fput_sync(), but optimized for being called with the last + * reference. + * + * See file_ref_put_close() for details. + */ +void fput_close_sync(struct file *file) +{ + if (likely(file_ref_put_close(&file->f_ref))) + __fput(file); +} + +/* + * Equivalent to fput(), but optimized for being called with the last + * reference. + * + * See file_ref_put_close() for details. 
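fput_close() and fput_close_sync() above exist because close(2) almost always drops the last reference, so file_ref_put_close() can try the one-to-zero transition first and only fall back to the generic decrement. A rough userspace model of that bias (the real file_ref_t encodes its counts differently, with ONEREF at zero):

    #include <stdatomic.h>
    #include <stdbool.h>

    /* Returns true when the caller held the last reference. */
    static bool ref_put_close(atomic_long *ref)
    {
            long one = 1;

            /* Fast path: expect 1 -> 0, a single cmpxchg. */
            if (atomic_compare_exchange_strong(ref, &one, 0))
                    return true;
            /* Slow path: ordinary decrement. */
            return atomic_fetch_sub(ref, 1) == 1;
    }
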
+ */ +void fput_close(struct file *file) +{ + if (file_ref_put_close(&file->f_ref)) + __fput_deferred(file); +} + void __init files_init(void) { struct kmem_cache_args args = { diff --git a/fs/filesystems.c b/fs/filesystems.c index 58b9067b2391..95e5256821a5 100644 --- a/fs/filesystems.c +++ b/fs/filesystems.c @@ -156,15 +156,19 @@ static int fs_index(const char __user * __name) static int fs_name(unsigned int index, char __user * buf) { struct file_system_type * tmp; - int len, res; + int len, res = -EINVAL; read_lock(&file_systems_lock); - for (tmp = file_systems; tmp; tmp = tmp->next, index--) - if (index <= 0 && try_module_get(tmp->owner)) + for (tmp = file_systems; tmp; tmp = tmp->next, index--) { + if (index == 0) { + if (try_module_get(tmp->owner)) + res = 0; break; + } + } read_unlock(&file_systems_lock); - if (!tmp) - return -EINVAL; + if (res) + return res; /* OK, we got the reference, so we can safely block */ len = strlen(tmp->name) + 1; diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3cd99e2dc6ac..cc57367fb641 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -65,7 +65,7 @@ struct wb_writeback_work { * timestamps written to disk after 12 hours, but in the worst case a * few inodes might not their timestamps updated for 24 hours. */ -unsigned int dirtytime_expire_interval = 12 * 60 * 60; +static unsigned int dirtytime_expire_interval = 12 * 60 * 60; static inline struct inode *wb_inode(struct list_head *head) { @@ -2435,14 +2435,7 @@ static void wakeup_dirtytime_writeback(struct work_struct *w) schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); } -static int __init start_dirtytime_writeback(void) -{ - schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); - return 0; -} -__initcall(start_dirtytime_writeback); - -int dirtytime_interval_handler(const struct ctl_table *table, int write, +static int dirtytime_interval_handler(const struct ctl_table *table, int write, void *buffer, size_t *lenp, loff_t *ppos) { int ret; @@ -2453,6 +2446,25 @@ int dirtytime_interval_handler(const struct ctl_table *table, int write, return ret; } +static const struct ctl_table vm_fs_writeback_table[] = { + { + .procname = "dirtytime_expire_seconds", + .data = &dirtytime_expire_interval, + .maxlen = sizeof(dirtytime_expire_interval), + .mode = 0644, + .proc_handler = dirtytime_interval_handler, + .extra1 = SYSCTL_ZERO, + }, +}; + +static int __init start_dirtytime_writeback(void) +{ + schedule_delayed_work(&dirtytime_work, dirtytime_expire_interval * HZ); + register_sysctl_init("vm", vm_fs_writeback_table); + return 0; +} +__initcall(start_dirtytime_writeback); + /** * __mark_inode_dirty - internal function to mark an inode dirty * diff --git a/fs/fs_context.c b/fs/fs_context.c index 582d33e81117..666e61753aed 100644 --- a/fs/fs_context.c +++ b/fs/fs_context.c @@ -222,7 +222,7 @@ int vfs_parse_monolithic_sep(struct fs_context *fc, void *data, char *value = strchr(key, '='); if (value) { - if (value == key) + if (unlikely(value == key)) continue; *value++ = 0; v_len = strlen(value); @@ -449,6 +449,10 @@ void logfc(struct fc_log *log, const char *prefix, char level, const char *fmt, printk(KERN_ERR "%s%s%pV\n", prefix ? prefix : "", prefix ? ": " : "", &vaf); break; + case 'i': + printk(KERN_INFO "%s%s%pV\n", prefix ? prefix : "", + prefix ? ": " : "", &vaf); + break; default: printk(KERN_NOTICE "%s%s%pV\n", prefix ? prefix : "", prefix ? 
": " : "", &vaf); diff --git a/fs/fs_parser.c b/fs/fs_parser.c index e635a81e17d9..c092a9f79e32 100644 --- a/fs/fs_parser.c +++ b/fs/fs_parser.c @@ -380,58 +380,9 @@ EXPORT_SYMBOL(fs_param_is_path); #ifdef CONFIG_VALIDATE_FS_PARSER /** - * validate_constant_table - Validate a constant table - * @tbl: The constant table to validate. - * @tbl_size: The size of the table. - * @low: The lowest permissible value. - * @high: The highest permissible value. - * @special: One special permissible value outside of the range. - */ -bool validate_constant_table(const struct constant_table *tbl, size_t tbl_size, - int low, int high, int special) -{ - size_t i; - bool good = true; - - if (tbl_size == 0) { - pr_warn("VALIDATE C-TBL: Empty\n"); - return true; - } - - for (i = 0; i < tbl_size; i++) { - if (!tbl[i].name) { - pr_err("VALIDATE C-TBL[%zu]: Null\n", i); - good = false; - } else if (i > 0 && tbl[i - 1].name) { - int c = strcmp(tbl[i-1].name, tbl[i].name); - - if (c == 0) { - pr_err("VALIDATE C-TBL[%zu]: Duplicate %s\n", - i, tbl[i].name); - good = false; - } - if (c > 0) { - pr_err("VALIDATE C-TBL[%zu]: Missorted %s>=%s\n", - i, tbl[i-1].name, tbl[i].name); - good = false; - } - } - - if (tbl[i].value != special && - (tbl[i].value < low || tbl[i].value > high)) { - pr_err("VALIDATE C-TBL[%zu]: %s->%d const out of range (%d-%d)\n", - i, tbl[i].name, tbl[i].value, low, high); - good = false; - } - } - - return good; -} - -/** - * fs_validate_description - Validate a parameter description - * @name: The parameter name to search for. - * @desc: The parameter description to validate. + * fs_validate_description - Validate a parameter specification array + * @name: Owner name of the parameter specification array + * @desc: The parameter specification array to validate. */ bool fs_validate_description(const char *name, const struct fs_parameter_spec *desc) diff --git a/fs/fsopen.c b/fs/fsopen.c index 094a7f510edf..1aaf4cb2afb2 100644 --- a/fs/fsopen.c +++ b/fs/fsopen.c @@ -453,7 +453,7 @@ SYSCALL_DEFINE5(fsconfig, case FSCONFIG_SET_FD: param.type = fs_value_is_file; ret = -EBADF; - param.file = fget(aux); + param.file = fget_raw(aux); if (!param.file) goto out_key; param.dirfd = aux; diff --git a/fs/fuse/dax.c b/fs/fuse/dax.c index 0b6ee6dd1fd6..0502bf3cdf6a 100644 --- a/fs/fuse/dax.c +++ b/fs/fuse/dax.c @@ -666,36 +666,12 @@ static void fuse_wait_dax_page(struct inode *inode) filemap_invalidate_lock(inode->i_mapping); } -/* Should be called with mapping->invalidate_lock held exclusively */ -static int __fuse_dax_break_layouts(struct inode *inode, bool *retry, - loff_t start, loff_t end) -{ - struct page *page; - - page = dax_layout_busy_page_range(inode->i_mapping, start, end); - if (!page) - return 0; - - *retry = true; - return ___wait_var_event(&page->_refcount, - atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, - 0, 0, fuse_wait_dax_page(inode)); -} - -/* dmap_end == 0 leads to unmapping of whole file */ +/* Should be called with mapping->invalidate_lock held exclusively. 
*/ int fuse_dax_break_layouts(struct inode *inode, u64 dmap_start, u64 dmap_end) { - bool retry; - int ret; - - do { - retry = false; - ret = __fuse_dax_break_layouts(inode, &retry, dmap_start, - dmap_end); - } while (ret == 0 && retry); - - return ret; + return dax_break_layout(inode, dmap_start, dmap_end, + fuse_wait_dax_page); } ssize_t fuse_dax_read_iter(struct kiocb *iocb, struct iov_iter *to) diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 5b5f789b37eb..6dcbaa218b7a 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -32,6 +32,100 @@ MODULE_ALIAS("devname:fuse"); static struct kmem_cache *fuse_req_cachep; +const unsigned long fuse_timeout_timer_freq = + secs_to_jiffies(FUSE_TIMEOUT_TIMER_FREQ); + +bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list) +{ + struct fuse_req *req; + + req = list_first_entry_or_null(list, struct fuse_req, list); + if (!req) + return false; + return time_is_before_jiffies(req->create_time + fc->timeout.req_timeout); +} + +bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing) +{ + int i; + + for (i = 0; i < FUSE_PQ_HASH_SIZE; i++) + if (fuse_request_expired(fc, &processing[i])) + return true; + + return false; +} + +/* + * Check if any requests aren't being completed by the time the request timeout + * elapses. To do so, we: + * - check the fiq pending list + * - check the bg queue + * - check the fpq io and processing lists + * + * To make this fast, we only check against the head request on each list since + * these are generally queued in order of creation time (eg newer requests get + * queued to the tail). We might miss a few edge cases (eg requests transitioning + * between lists, re-sent requests at the head of the pending list having a + * later creation time than other requests on that list, etc.) but that is fine + * since if the request never gets fulfilled, it will eventually be caught. 
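+ *
+ * Worked example (illustrative numbers): with a 60s request timeout and
+ * the 15s FUSE_TIMEOUT_TIMER_FREQ check interval, an abandoned request
+ * is noticed between roughly 60s and 75s after creation, i.e. the check
+ * frequency only adds a bounded amount of detection latency.
+ *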
+ */ +void fuse_check_timeout(struct work_struct *work) +{ + struct delayed_work *dwork = to_delayed_work(work); + struct fuse_conn *fc = container_of(dwork, struct fuse_conn, + timeout.work); + struct fuse_iqueue *fiq = &fc->iq; + struct fuse_dev *fud; + struct fuse_pqueue *fpq; + bool expired = false; + + if (!atomic_read(&fc->num_waiting)) + goto out; + + spin_lock(&fiq->lock); + expired = fuse_request_expired(fc, &fiq->pending); + spin_unlock(&fiq->lock); + if (expired) + goto abort_conn; + + spin_lock(&fc->bg_lock); + expired = fuse_request_expired(fc, &fc->bg_queue); + spin_unlock(&fc->bg_lock); + if (expired) + goto abort_conn; + + spin_lock(&fc->lock); + if (!fc->connected) { + spin_unlock(&fc->lock); + return; + } + list_for_each_entry(fud, &fc->devices, entry) { + fpq = &fud->pq; + spin_lock(&fpq->lock); + if (fuse_request_expired(fc, &fpq->io) || + fuse_fpq_processing_expired(fc, fpq->processing)) { + spin_unlock(&fpq->lock); + spin_unlock(&fc->lock); + goto abort_conn; + } + + spin_unlock(&fpq->lock); + } + spin_unlock(&fc->lock); + + if (fuse_uring_request_expired(fc)) + goto abort_conn; + +out: + queue_delayed_work(system_wq, &fc->timeout.work, + fuse_timeout_timer_freq); + return; + +abort_conn: + fuse_abort_conn(fc); +} + static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req) { INIT_LIST_HEAD(&req->list); @@ -40,6 +134,7 @@ static void fuse_request_init(struct fuse_mount *fm, struct fuse_req *req) refcount_set(&req->count, 1); __set_bit(FR_PENDING, &req->flags); req->fm = fm; + req->create_time = jiffies; } static struct fuse_req *fuse_request_alloc(struct fuse_mount *fm, gfp_t flags) @@ -77,7 +172,7 @@ void fuse_set_initialized(struct fuse_conn *fc) static bool fuse_block_alloc(struct fuse_conn *fc, bool for_background) { return !fc->initialized || (for_background && fc->blocked) || - (fc->io_uring && !fuse_uring_ready(fc)); + (fc->io_uring && fc->connected && !fuse_uring_ready(fc)); } static void fuse_drop_waiting(struct fuse_conn *fc) @@ -407,6 +502,24 @@ static int queue_interrupt(struct fuse_req *req) return 0; } +bool fuse_remove_pending_req(struct fuse_req *req, spinlock_t *lock) +{ + spin_lock(lock); + if (test_bit(FR_PENDING, &req->flags)) { + /* + * FR_PENDING does not get cleared as the request will end + * up in destruction anyway. + */ + list_del(&req->list); + spin_unlock(lock); + __fuse_put_request(req); + req->out.h.error = -EINTR; + return true; + } + spin_unlock(lock); + return false; +} + static void request_wait_answer(struct fuse_req *req) { struct fuse_conn *fc = req->fm->fc; @@ -428,22 +541,20 @@ static void request_wait_answer(struct fuse_req *req) } if (!test_bit(FR_FORCE, &req->flags)) { + bool removed; + /* Only fatal signals may interrupt this */ err = wait_event_killable(req->waitq, test_bit(FR_FINISHED, &req->flags)); if (!err) return; - spin_lock(&fiq->lock); - /* Request is not yet in userspace, bail out */ - if (test_bit(FR_PENDING, &req->flags)) { - list_del(&req->list); - spin_unlock(&fiq->lock); - __fuse_put_request(req); - req->out.h.error = -EINTR; + if (test_bit(FR_URING, &req->flags)) + removed = fuse_uring_remove_pending_req(req); + else + removed = fuse_remove_pending_req(req, &fiq->lock); + if (removed) return; - } - spin_unlock(&fiq->lock); } /* @@ -838,6 +949,12 @@ static int fuse_check_folio(struct folio *folio) return 0; } +/* + * Attempt to steal a page from the splice() pipe and move it into the + * pagecache. If successful, the pointer in @pagep will be updated. 
The + * folio that was originally in @pagep will lose a reference and the new + * folio returned in @pagep will carry a reference. + */ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) { int err; @@ -1451,7 +1568,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, if (ret < 0) goto out; - if (pipe_occupancy(pipe->head, pipe->tail) + cs.nr_segs > pipe->max_usage) { + if (pipe_buf_usage(pipe) + cs.nr_segs > pipe->max_usage) { ret = -EIO; goto out; } @@ -1527,14 +1644,10 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_inval_entry_out outarg; - int err = -ENOMEM; - char *buf; + int err; + char *buf = NULL; struct qstr name; - buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); - if (!buf) - goto err; - err = -EINVAL; if (size < sizeof(outarg)) goto err; @@ -1544,13 +1657,18 @@ static int fuse_notify_inval_entry(struct fuse_conn *fc, unsigned int size, goto err; err = -ENAMETOOLONG; - if (outarg.namelen > FUSE_NAME_MAX) + if (outarg.namelen > fc->name_max) goto err; err = -EINVAL; if (size != sizeof(outarg) + outarg.namelen + 1) goto err; + err = -ENOMEM; + buf = kzalloc(outarg.namelen + 1, GFP_KERNEL); + if (!buf) + goto err; + name.name = buf; name.len = outarg.namelen; err = fuse_copy_one(cs, buf, outarg.namelen + 1); @@ -1575,14 +1693,10 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, struct fuse_copy_state *cs) { struct fuse_notify_delete_out outarg; - int err = -ENOMEM; - char *buf; + int err; + char *buf = NULL; struct qstr name; - buf = kzalloc(FUSE_NAME_MAX + 1, GFP_KERNEL); - if (!buf) - goto err; - err = -EINVAL; if (size < sizeof(outarg)) goto err; @@ -1592,13 +1706,18 @@ static int fuse_notify_delete(struct fuse_conn *fc, unsigned int size, goto err; err = -ENAMETOOLONG; - if (outarg.namelen > FUSE_NAME_MAX) + if (outarg.namelen > fc->name_max) goto err; err = -EINVAL; if (size != sizeof(outarg) + outarg.namelen + 1) goto err; + err = -ENOMEM; + buf = kzalloc(outarg.namelen + 1, GFP_KERNEL); + if (!buf) + goto err; + name.name = buf; name.len = outarg.namelen; err = fuse_copy_one(cs, buf, outarg.namelen + 1); @@ -2101,7 +2220,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, struct file *out, loff_t *ppos, size_t len, unsigned int flags) { - unsigned int head, tail, mask, count; + unsigned int head, tail, count; unsigned nbuf; unsigned idx; struct pipe_buffer *bufs; @@ -2118,8 +2237,7 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, head = pipe->head; tail = pipe->tail; - mask = pipe->ring_size - 1; - count = head - tail; + count = pipe_occupancy(head, tail); bufs = kvmalloc_array(count, sizeof(struct pipe_buffer), GFP_KERNEL); if (!bufs) { @@ -2129,8 +2247,8 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, nbuf = 0; rem = 0; - for (idx = tail; idx != head && rem < len; idx++) - rem += pipe->bufs[idx & mask].len; + for (idx = tail; !pipe_empty(head, idx) && rem < len; idx++) + rem += pipe_buf(pipe, idx)->len; ret = -EINVAL; if (rem < len) @@ -2141,10 +2259,10 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, struct pipe_buffer *ibuf; struct pipe_buffer *obuf; - if (WARN_ON(nbuf >= count || tail == head)) + if (WARN_ON(nbuf >= count || pipe_empty(head, tail))) goto out_free; - ibuf = &pipe->bufs[tail & mask]; + ibuf = pipe_buf(pipe, tail); obuf = &bufs[nbuf]; if (rem >= ibuf->len) { @@ -2270,6 +2388,9 @@ void fuse_abort_conn(struct fuse_conn *fc) 
LIST_HEAD(to_end); unsigned int i; + if (fc->timeout.req_timeout) + cancel_delayed_work(&fc->timeout.work); + /* Background queuing checks fc->connected under bg_lock */ spin_lock(&fc->bg_lock); fc->connected = 0; diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index ebd2931b4f2a..accdce2977c5 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -140,6 +140,33 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring) } } +bool fuse_uring_request_expired(struct fuse_conn *fc) +{ + struct fuse_ring *ring = fc->ring; + struct fuse_ring_queue *queue; + int qid; + + if (!ring) + return false; + + for (qid = 0; qid < ring->nr_queues; qid++) { + queue = READ_ONCE(ring->queues[qid]); + if (!queue) + continue; + + spin_lock(&queue->lock); + if (fuse_request_expired(fc, &queue->fuse_req_queue) || + fuse_request_expired(fc, &queue->fuse_req_bg_queue) || + fuse_fpq_processing_expired(fc, queue->fpq.processing)) { + spin_unlock(&queue->lock); + return true; + } + spin_unlock(&queue->lock); + } + + return false; +} + void fuse_uring_destruct(struct fuse_conn *fc) { struct fuse_ring *ring = fc->ring; @@ -208,11 +235,10 @@ static struct fuse_ring *fuse_uring_create(struct fuse_conn *fc) init_waitqueue_head(&ring->stop_waitq); - fc->ring = ring; ring->nr_queues = nr_queues; ring->fc = fc; ring->max_payload_sz = max_payload_size; - atomic_set(&ring->queue_refs, 0); + smp_store_release(&fc->ring, ring); spin_unlock(&fc->lock); return ring; @@ -726,8 +752,6 @@ static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent, struct fuse_req *req) { struct fuse_ring_queue *queue = ent->queue; - struct fuse_conn *fc = req->fm->fc; - struct fuse_iqueue *fiq = &fc->iq; lockdep_assert_held(&queue->lock); @@ -737,9 +761,7 @@ static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent, ent->state); } - spin_lock(&fiq->lock); clear_bit(FR_PENDING, &req->flags); - spin_unlock(&fiq->lock); ent->fuse_req = req; ent->state = FRRS_FUSE_REQ; list_move(&ent->list, &queue->ent_w_req_queue); @@ -1041,7 +1063,7 @@ static int fuse_uring_register(struct io_uring_cmd *cmd, unsigned int issue_flags, struct fuse_conn *fc) { const struct fuse_uring_cmd_req *cmd_req = io_uring_sqe_cmd(cmd->sqe); - struct fuse_ring *ring = fc->ring; + struct fuse_ring *ring = smp_load_acquire(&fc->ring); struct fuse_ring_queue *queue; struct fuse_ring_ent *ent; int err; @@ -1238,6 +1260,8 @@ void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req) if (unlikely(queue->stopped)) goto err_unlock; + set_bit(FR_URING, &req->flags); + req->ring_queue = queue; ent = list_first_entry_or_null(&queue->ent_avail_queue, struct fuse_ring_ent, list); if (ent) @@ -1276,6 +1300,8 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) return false; } + set_bit(FR_URING, &req->flags); + req->ring_queue = queue; list_add_tail(&req->list, &queue->fuse_req_bg_queue); ent = list_first_entry_or_null(&queue->ent_avail_queue, @@ -1306,6 +1332,13 @@ bool fuse_uring_queue_bq_req(struct fuse_req *req) return true; } +bool fuse_uring_remove_pending_req(struct fuse_req *req) +{ + struct fuse_ring_queue *queue = req->ring_queue; + + return fuse_remove_pending_req(req, &queue->lock); +} + static const struct fuse_iqueue_ops fuse_io_uring_ops = { /* should be send over io-uring as enhancement */ .send_forget = fuse_dev_queue_forget, diff --git a/fs/fuse/dev_uring_i.h b/fs/fuse/dev_uring_i.h index 2102b3d0c1ae..51a563922ce1 100644 --- a/fs/fuse/dev_uring_i.h +++ b/fs/fuse/dev_uring_i.h @@ -142,6 +142,8 @@ void 
fuse_uring_abort_end_requests(struct fuse_ring *ring); int fuse_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags); void fuse_uring_queue_fuse_req(struct fuse_iqueue *fiq, struct fuse_req *req); bool fuse_uring_queue_bq_req(struct fuse_req *req); +bool fuse_uring_remove_pending_req(struct fuse_req *req); +bool fuse_uring_request_expired(struct fuse_conn *fc); static inline void fuse_uring_abort(struct fuse_conn *fc) { @@ -172,12 +174,6 @@ static inline bool fuse_uring_ready(struct fuse_conn *fc) #else /* CONFIG_FUSE_IO_URING */ -struct fuse_ring; - -static inline void fuse_uring_create(struct fuse_conn *fc) -{ -} - static inline void fuse_uring_destruct(struct fuse_conn *fc) { } @@ -200,6 +196,16 @@ static inline bool fuse_uring_ready(struct fuse_conn *fc) return false; } +static inline bool fuse_uring_remove_pending_req(struct fuse_req *req) +{ + return false; +} + +static inline bool fuse_uring_request_expired(struct fuse_conn *fc) +{ + return false; +} + #endif /* CONFIG_FUSE_IO_URING */ #endif /* _FS_FUSE_DEV_URING_I_H */ diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 198862b086ff..33b82529cb6e 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -370,7 +370,7 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name *inode = NULL; err = -ENAMETOOLONG; - if (name->len > FUSE_NAME_MAX) + if (name->len > fm->fc->name_max) goto out; @@ -781,9 +781,9 @@ no_open: /* * Code shared between mknod, mkdir, symlink and link */ -static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, - struct fuse_args *args, struct inode *dir, - struct dentry *entry, umode_t mode) +static struct dentry *create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, + struct fuse_args *args, struct inode *dir, + struct dentry *entry, umode_t mode) { struct fuse_entry_out outarg; struct inode *inode; @@ -792,11 +792,11 @@ static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, struct fuse_forget_link *forget; if (fuse_is_bad(dir)) - return -EIO; + return ERR_PTR(-EIO); forget = fuse_alloc_forget(); if (!forget) - return -ENOMEM; + return ERR_PTR(-ENOMEM); memset(&outarg, 0, sizeof(outarg)); args->nodeid = get_node_id(dir); @@ -826,29 +826,43 @@ static int create_new_entry(struct mnt_idmap *idmap, struct fuse_mount *fm, &outarg.attr, ATTR_TIMEOUT(&outarg), 0, 0); if (!inode) { fuse_queue_forget(fm->fc, forget, outarg.nodeid, 1); - return -ENOMEM; + return ERR_PTR(-ENOMEM); } kfree(forget); d_drop(entry); d = d_splice_alias(inode, entry); if (IS_ERR(d)) - return PTR_ERR(d); + return d; - if (d) { + if (d) fuse_change_entry_timeout(d, &outarg); - dput(d); - } else { + else fuse_change_entry_timeout(entry, &outarg); - } fuse_dir_changed(dir); - return 0; + return d; out_put_forget_req: if (err == -EEXIST) fuse_invalidate_entry(entry); kfree(forget); - return err; + return ERR_PTR(err); +} + +static int create_new_nondir(struct mnt_idmap *idmap, struct fuse_mount *fm, + struct fuse_args *args, struct inode *dir, + struct dentry *entry, umode_t mode) +{ + /* + * Note that when creating anything other than a directory we + * can be sure create_new_entry() will NOT return an alternate + * dentry as d_splice_alias() only returns an alternate dentry + * for directories. So we don't need to check for that case + * when passing back the result. 
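+ *
+ * Spelling out the conversion used below: for non-directories
+ * create_new_entry() therefore returns either NULL or an ERR_PTR(),
+ * and since PTR_ERR(NULL) == 0, a single PTR_ERR() maps "no alternate
+ * dentry" to the 0 (success) return value this helper wants.
+ *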
+ */ + WARN_ON_ONCE(S_ISDIR(mode)); + + return PTR_ERR(create_new_entry(idmap, fm, args, dir, entry, mode)); } static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir, @@ -871,7 +885,7 @@ static int fuse_mknod(struct mnt_idmap *idmap, struct inode *dir, args.in_args[0].value = &inarg; args.in_args[1].size = entry->d_name.len + 1; args.in_args[1].value = entry->d_name.name; - return create_new_entry(idmap, fm, &args, dir, entry, mode); + return create_new_nondir(idmap, fm, &args, dir, entry, mode); } static int fuse_create(struct mnt_idmap *idmap, struct inode *dir, @@ -898,8 +912,8 @@ static int fuse_tmpfile(struct mnt_idmap *idmap, struct inode *dir, return err; } -static int fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *entry, umode_t mode) +static struct dentry *fuse_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *entry, umode_t mode) { struct fuse_mkdir_in inarg; struct fuse_mount *fm = get_fuse_mount(dir); @@ -934,7 +948,7 @@ static int fuse_symlink(struct mnt_idmap *idmap, struct inode *dir, args.in_args[1].value = entry->d_name.name; args.in_args[2].size = len; args.in_args[2].value = link; - return create_new_entry(idmap, fm, &args, dir, entry, S_IFLNK); + return create_new_nondir(idmap, fm, &args, dir, entry, S_IFLNK); } void fuse_flush_time_update(struct inode *inode) @@ -1123,6 +1137,9 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, struct fuse_mount *fm = get_fuse_mount(inode); FUSE_ARGS(args); + if (fm->fc->no_link) + goto out; + memset(&inarg, 0, sizeof(inarg)); inarg.oldnodeid = get_node_id(inode); args.opcode = FUSE_LINK; @@ -1131,12 +1148,18 @@ static int fuse_link(struct dentry *entry, struct inode *newdir, args.in_args[0].value = &inarg; args.in_args[1].size = newent->d_name.len + 1; args.in_args[1].value = newent->d_name.name; - err = create_new_entry(&invalid_mnt_idmap, fm, &args, newdir, newent, inode->i_mode); + err = create_new_nondir(&invalid_mnt_idmap, fm, &args, newdir, newent, inode->i_mode); if (!err) fuse_update_ctime_in_cache(inode); else if (err == -EINTR) fuse_invalidate_attr(inode); + if (err == -ENOSYS) + fm->fc->no_link = 1; +out: + if (fm->fc->no_link) + return -EPERM; + return err; } @@ -1636,7 +1659,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, goto out_err; if (fc->cache_symlinks) - return page_get_link(dentry, inode, callback); + return page_get_link_raw(dentry, inode, callback); err = -ECHILD; if (!dentry) @@ -1653,7 +1676,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, goto out_err; } - set_delayed_call(callback, page_put_link, &folio->page); + set_delayed_call(callback, page_put_link, folio); return folio_address(folio); @@ -1940,7 +1963,7 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (FUSE_IS_DAX(inode) && is_truncate) { filemap_invalidate_lock(mapping); fault_blocked = true; - err = fuse_dax_break_layouts(inode, 0, 0); + err = fuse_dax_break_layouts(inode, 0, -1); if (err) { filemap_invalidate_unlock(mapping); return err; diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 7d92a5479998..754378dd9f71 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -253,7 +253,7 @@ static int fuse_open(struct inode *inode, struct file *file) if (dax_truncate) { filemap_invalidate_lock(inode->i_mapping); - err = fuse_dax_break_layouts(inode, 0, 0); + err = fuse_dax_break_layouts(inode, 0, -1); if (err) goto out_inode_unlock; } @@ -955,8 +955,10 @@ static void fuse_readpages_end(struct 
fuse_mount *fm, struct fuse_args *args, fuse_invalidate_atime(inode); } - for (i = 0; i < ap->num_folios; i++) + for (i = 0; i < ap->num_folios; i++) { folio_end_read(ap->folios[i], !err); + folio_put(ap->folios[i]); + } if (ia->ff) fuse_file_put(ia->ff, false); @@ -1048,7 +1050,14 @@ static void fuse_readahead(struct readahead_control *rac) ap = &ia->ap; while (ap->num_folios < cur_pages) { - folio = readahead_folio(rac); + /* + * This returns a folio with a ref held on it. + * The ref needs to be held until the request is + * completed, since the splice case (see + * fuse_try_move_page()) drops the ref after it's + * replaced in the page cache. + */ + folio = __readahead_folio(rac); ap->folios[ap->num_folios] = folio; ap->descs[ap->num_folios].length = folio_size(folio); ap->num_folios++; @@ -3196,7 +3205,7 @@ static long fuse_file_fallocate(struct file *file, int mode, loff_t offset, inode_lock(inode); if (block_faults) { filemap_invalidate_lock(inode->i_mapping); - err = fuse_dax_break_layouts(inode, 0, 0); + err = fuse_dax_break_layouts(inode, 0, -1); if (err) goto out; } diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index 3b2bfe1248d3..b3c2e32254ba 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -61,6 +61,10 @@ int fuse_copy_out_args(struct fuse_copy_state *cs, struct fuse_args *args, void fuse_dev_queue_forget(struct fuse_iqueue *fiq, struct fuse_forget_link *forget); void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req); +bool fuse_remove_pending_req(struct fuse_req *req, spinlock_t *lock); + +bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list); +bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing); #endif diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index fee96fe7887b..d56d4fd956db 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -38,14 +38,34 @@ /** Bias for fi->writectr, meaning new writepages must not be sent */ #define FUSE_NOWRITE INT_MIN -/** It could be as large as PATH_MAX, but would that have any uses? */ -#define FUSE_NAME_MAX 1024 +/** Maximum length of a filename, not including terminating null */ + +/* maximum, small enough for FUSE_MIN_READ_BUFFER*/ +#define FUSE_NAME_LOW_MAX 1024 +/* maximum, but needs a request buffer > FUSE_MIN_READ_BUFFER */ +#define FUSE_NAME_MAX (PATH_MAX - 1) /** Number of dentries for each connection in the control filesystem */ #define FUSE_CTL_NUM_DENTRIES 5 +/* Frequency (in seconds) of request timeout checks, if opted into */ +#define FUSE_TIMEOUT_TIMER_FREQ 15 + +/** Frequency (in jiffies) of request timeout checks, if opted into */ +extern const unsigned long fuse_timeout_timer_freq; + /** Maximum of max_pages received in init_out */ extern unsigned int fuse_max_pages_limit; +/* + * Default timeout (in seconds) for the server to reply to a request + * before the connection is aborted, if no timeout was specified on mount. + */ +extern unsigned int fuse_default_req_timeout; +/* + * Max timeout (in seconds) for the server to reply to a request before + * the connection is aborted. 
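+ *
+ * How the knobs combine, condensed from init_server_timeout() below
+ * (server_requested stands in for the timeout from FUSE_REQUEST_TIMEOUT
+ * init data, and at least one of the three values is assumed non-zero,
+ * else no timer runs at all):
+ *
+ *	timeout = server_requested ?: fuse_default_req_timeout;
+ *	if (fuse_max_req_timeout)
+ *		timeout = timeout ? min(fuse_max_req_timeout, timeout)
+ *				  : fuse_max_req_timeout;
+ *	timeout = max(FUSE_TIMEOUT_TIMER_FREQ, timeout);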
+ */ +extern unsigned int fuse_max_req_timeout; /** List of active connections */ extern struct list_head fuse_conn_list; @@ -378,6 +398,7 @@ struct fuse_io_priv { * FR_FINISHED: request is finished * FR_PRIVATE: request is on private list * FR_ASYNC: request is asynchronous + * FR_URING: request is handled through fuse-io-uring */ enum fuse_req_flag { FR_ISREPLY, @@ -392,6 +413,7 @@ enum fuse_req_flag { FR_FINISHED, FR_PRIVATE, FR_ASYNC, + FR_URING, }; /** @@ -441,7 +463,10 @@ struct fuse_req { #ifdef CONFIG_FUSE_IO_URING void *ring_entry; + void *ring_queue; #endif + /** When (in jiffies) the request was created */ + unsigned long create_time; }; struct fuse_iqueue; @@ -867,6 +892,9 @@ struct fuse_conn { /* Use pages instead of pointer for kernel I/O */ unsigned int use_pages_for_kvec_io:1; + /* Is link not implemented by fs? */ + unsigned int no_link:1; + /* Use io_uring for communication */ unsigned int io_uring; @@ -900,6 +928,9 @@ struct fuse_conn { /** Version counter for evict inode */ atomic64_t evict_ctr; + /* maximum file name length */ + u32 name_max; + /** Called on final put */ void (*release)(struct fuse_conn *); @@ -935,6 +966,15 @@ struct fuse_conn { /** uring connection information*/ struct fuse_ring *ring; #endif + + /** Only used if the connection opts into request timeouts */ + struct { + /* Worker for checking if any requests have timed out */ + struct delayed_work work; + + /* Request timeout (in jiffies). 0 = no timeout */ + unsigned int req_timeout; + } timeout; }; /* @@ -1216,6 +1256,9 @@ void fuse_request_end(struct fuse_req *req); void fuse_abort_conn(struct fuse_conn *fc); void fuse_wait_aborted(struct fuse_conn *fc); +/* Check if any requests timed out */ +void fuse_check_timeout(struct work_struct *work); + /** * Invalidate inode attributes */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index e9db2cb8c150..fd48e8d37f2e 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -37,6 +37,9 @@ DEFINE_MUTEX(fuse_mutex); static int set_global_limit(const char *val, const struct kernel_param *kp); unsigned int fuse_max_pages_limit = 256; +/* default is no timeout */ +unsigned int fuse_default_req_timeout; +unsigned int fuse_max_req_timeout; unsigned max_user_bgreq; module_param_call(max_user_bgreq, set_global_limit, param_get_uint, @@ -979,6 +982,8 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, fc->user_ns = get_user_ns(user_ns); fc->max_pages = FUSE_DEFAULT_MAX_PAGES_PER_REQ; fc->max_pages_limit = fuse_max_pages_limit; + fc->name_max = FUSE_NAME_LOW_MAX; + fc->timeout.req_timeout = 0; if (IS_ENABLED(CONFIG_FUSE_PASSTHROUGH)) fuse_backing_files_init(fc); @@ -1007,6 +1012,8 @@ void fuse_conn_put(struct fuse_conn *fc) if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_conn_free(fc); + if (fc->timeout.req_timeout) + cancel_delayed_work_sync(&fc->timeout.work); if (fiq->ops->release) fiq->ops->release(fiq); put_pid_ns(fc->pid_ns); @@ -1257,6 +1264,34 @@ static void process_init_limits(struct fuse_conn *fc, struct fuse_init_out *arg) spin_unlock(&fc->bg_lock); } +static void set_request_timeout(struct fuse_conn *fc, unsigned int timeout) +{ + fc->timeout.req_timeout = secs_to_jiffies(timeout); + INIT_DELAYED_WORK(&fc->timeout.work, fuse_check_timeout); + queue_delayed_work(system_wq, &fc->timeout.work, + fuse_timeout_timer_freq); +} + +static void init_server_timeout(struct fuse_conn *fc, unsigned int timeout) +{ + if (!timeout && !fuse_max_req_timeout && !fuse_default_req_timeout) + return; + + if (!timeout) + timeout = fuse_default_req_timeout; + + if 
(fuse_max_req_timeout) { + if (timeout) + timeout = min(fuse_max_req_timeout, timeout); + else + timeout = fuse_max_req_timeout; + } + + timeout = max(FUSE_TIMEOUT_TIMER_FREQ, timeout); + + set_request_timeout(fc, timeout); +} + struct fuse_init_args { struct fuse_args args; struct fuse_init_in in; @@ -1275,6 +1310,7 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, ok = false; else { unsigned long ra_pages; + unsigned int timeout = 0; process_init_limits(fc, arg); @@ -1338,6 +1374,13 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, fc->max_pages = min_t(unsigned int, fc->max_pages_limit, max_t(unsigned int, arg->max_pages, 1)); + + /* + * PATH_MAX file names might need two pages for + * ops like rename + */ + if (fc->max_pages > 1) + fc->name_max = FUSE_NAME_MAX; } if (IS_ENABLED(CONFIG_FUSE_DAX)) { if (flags & FUSE_MAP_ALIGNMENT && @@ -1392,12 +1435,17 @@ static void process_init_reply(struct fuse_mount *fm, struct fuse_args *args, } if (flags & FUSE_OVER_IO_URING && fuse_uring_enabled()) fc->io_uring = 1; + + if (flags & FUSE_REQUEST_TIMEOUT) + timeout = arg->request_timeout; } else { ra_pages = fc->max_read / PAGE_SIZE; fc->no_lock = 1; fc->no_flock = 1; } + init_server_timeout(fc, timeout); + fm->sb->s_bdi->ra_pages = min(fm->sb->s_bdi->ra_pages, ra_pages); fc->minor = arg->minor; @@ -1439,7 +1487,8 @@ void fuse_send_init(struct fuse_mount *fm) FUSE_HANDLE_KILLPRIV_V2 | FUSE_SETXATTR_EXT | FUSE_INIT_EXT | FUSE_SECURITY_CTX | FUSE_CREATE_SUPP_GROUP | FUSE_HAS_EXPIRE_ONLY | FUSE_DIRECT_IO_ALLOW_MMAP | - FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP; + FUSE_NO_EXPORT_SUPPORT | FUSE_HAS_RESEND | FUSE_ALLOW_IDMAP | + FUSE_REQUEST_TIMEOUT; #ifdef CONFIG_FUSE_DAX if (fm->fc->dax) flags |= FUSE_MAP_ALIGNMENT; diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index 17ce9636a2b1..edcd6f18a8a8 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -120,7 +120,7 @@ static bool fuse_emit(struct file *file, struct dir_context *ctx, fuse_add_dirent_to_cache(file, dirent, ctx->pos); return dir_emit(ctx, dirent->name, dirent->namelen, dirent->ino, - dirent->type); + dirent->type | FILLDIR_FLAG_NOINTR); } static int parse_dirfile(char *buf, size_t nbytes, struct file *file, @@ -419,7 +419,7 @@ static enum fuse_parse_result fuse_parse_cache(struct fuse_file *ff, if (ff->readdir.pos == ctx->pos) { res = FOUND_SOME; if (!dir_emit(ctx, dirent->name, dirent->namelen, - dirent->ino, dirent->type)) + dirent->ino, dirent->type | FILLDIR_FLAG_NOINTR)) return FOUND_ALL; ctx->pos = dirent->off; } diff --git a/fs/fuse/sysctl.c b/fs/fuse/sysctl.c index 63fb1e5bee30..e2d921abcb88 100644 --- a/fs/fuse/sysctl.c +++ b/fs/fuse/sysctl.c @@ -13,6 +13,12 @@ static struct ctl_table_header *fuse_table_header; /* Bound by fuse_init_out max_pages, which is a u16 */ static unsigned int sysctl_fuse_max_pages_limit = 65535; +/* + * fuse_init_out request timeouts are u16. + * This goes up to ~18 hours, which is plenty for a timeout. 
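+ * (65535 s / 3600 s/h ~= 18.2 h.)
+ *
+ * Illustrative administrator usage of the two knobs added below; the
+ * values are hypothetical and the /proc path is assumed from the
+ * existing fs/fuse sysctl table:
+ *
+ *	echo 60  > /proc/sys/fs/fuse/default_request_timeout
+ *	echo 600 > /proc/sys/fs/fuse/max_request_timeout
+ *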
+ */ +static unsigned int sysctl_fuse_req_timeout_limit = 65535; + static const struct ctl_table fuse_sysctl_table[] = { { .procname = "max_pages_limit", @@ -23,6 +29,24 @@ static const struct ctl_table fuse_sysctl_table[] = { .extra1 = SYSCTL_ONE, .extra2 = &sysctl_fuse_max_pages_limit, }, + { + .procname = "default_request_timeout", + .data = &fuse_default_req_timeout, + .maxlen = sizeof(fuse_default_req_timeout), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &sysctl_fuse_req_timeout_limit, + }, + { + .procname = "max_request_timeout", + .data = &fuse_max_req_timeout, + .maxlen = sizeof(fuse_max_req_timeout), + .mode = 0644, + .proc_handler = proc_douintvec_minmax, + .extra1 = SYSCTL_ZERO, + .extra2 = &sysctl_fuse_req_timeout_limit, + }, }; int fuse_sysctl_register(void) diff --git a/fs/fuse/virtio_fs.c b/fs/fuse/virtio_fs.c index 82afe78ec542..53c2626e90e7 100644 --- a/fs/fuse/virtio_fs.c +++ b/fs/fuse/virtio_fs.c @@ -1017,8 +1017,7 @@ static long virtio_fs_direct_access(struct dax_device *dax_dev, pgoff_t pgoff, if (kaddr) *kaddr = fs->window_kaddr + offset; if (pfn) - *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, - PFN_DEV | PFN_MAP); + *pfn = phys_to_pfn_t(fs->window_phys_addr + offset, 0); return nr_pages > max_nr_pages ? max_nr_pages : nr_pages; } @@ -1670,6 +1669,9 @@ static int virtio_fs_get_tree(struct fs_context *fsc) unsigned int virtqueue_size; int err = -EIO; + if (!fsc->source) + return invalf(fsc, "No source specified"); + /* This gets a reference on virtio_fs object. This ptr gets installed * in fc->iq->priv. Once fuse_conn is going away, it calls ->put() * to drop the reference to this object. diff --git a/fs/gfs2/Kconfig b/fs/gfs2/Kconfig index be7f87a8e11a..7bd231d16d4a 100644 --- a/fs/gfs2/Kconfig +++ b/fs/gfs2/Kconfig @@ -4,7 +4,6 @@ config GFS2_FS select BUFFER_HEAD select FS_POSIX_ACL select CRC32 - select LIBCRC32C select QUOTACTL select FS_IOMAP help diff --git a/fs/gfs2/aops.c b/fs/gfs2/aops.c index 68fc8af14700..14f204cd5a82 100644 --- a/fs/gfs2/aops.c +++ b/fs/gfs2/aops.c @@ -37,27 +37,6 @@ #include "aops.h" -void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio, - size_t from, size_t len) -{ - struct buffer_head *head = folio_buffers(folio); - unsigned int bsize = head->b_size; - struct buffer_head *bh; - size_t to = from + len; - size_t start, end; - - for (bh = head, start = 0; bh != head || !start; - bh = bh->b_this_page, start = end) { - end = start + bsize; - if (end <= from) - continue; - if (start >= to) - break; - set_buffer_uptodate(bh); - gfs2_trans_add_data(ip->i_gl, bh); - } -} - /** * gfs2_get_block_noalloc - Fills in a buffer head with details about a block * @inode: The inode @@ -133,12 +112,43 @@ static int __gfs2_jdata_write_folio(struct folio *folio, inode->i_sb->s_blocksize, BIT(BH_Dirty)|BIT(BH_Uptodate)); } - gfs2_trans_add_databufs(ip, folio, 0, folio_size(folio)); + gfs2_trans_add_databufs(ip->i_gl, folio, 0, folio_size(folio)); } return gfs2_write_jdata_folio(folio, wbc); } /** + * gfs2_jdata_writeback - Write jdata folios to the log + * @mapping: The mapping to write + * @wbc: The writeback control + * + * Returns: errno + */ +int gfs2_jdata_writeback(struct address_space *mapping, struct writeback_control *wbc) +{ + struct inode *inode = mapping->host; + struct gfs2_inode *ip = GFS2_I(inode); + struct gfs2_sbd *sdp = GFS2_SB(mapping->host); + struct folio *folio = NULL; + int error; + + BUG_ON(current->journal_info); + if (gfs2_assert_withdraw(sdp, 
ip->i_gl->gl_state == LM_ST_EXCLUSIVE)) + return 0; + + while ((folio = writeback_iter(mapping, wbc, folio, &error))) { + if (folio_test_checked(folio)) { + folio_redirty_for_writepage(wbc, folio); + folio_unlock(folio); + continue; + } + error = __gfs2_jdata_write_folio(folio, wbc); + } + + return error; +} + +/** * gfs2_writepages - Write a bunch of dirty pages back to disk * @mapping: The mapping to write * @wbc: Write-back control @@ -228,24 +238,16 @@ continue_unlock: ret = __gfs2_jdata_write_folio(folio, wbc); if (unlikely(ret)) { - if (ret == AOP_WRITEPAGE_ACTIVATE) { - folio_unlock(folio); - ret = 0; - } else { - - /* - * done_index is set past this page, - * so media errors will not choke - * background writeout for the entire - * file. This has consequences for - * range_cyclic semantics (ie. it may - * not be suitable for data integrity - * writeout). - */ - *done_index = folio_next_index(folio); - ret = 1; - break; - } + /* + * done_index is set past this page, so media errors + * will not choke background writeout for the entire + * file. This has consequences for range_cyclic + * semantics (ie. it may not be suitable for data + * integrity writeout). + */ + *done_index = folio_next_index(folio); + ret = 1; + break; } /* @@ -540,7 +542,7 @@ out: gfs2_trans_end(sdp); } -static bool jdata_dirty_folio(struct address_space *mapping, +static bool gfs2_jdata_dirty_folio(struct address_space *mapping, struct folio *folio) { if (current->journal_info) @@ -722,7 +724,7 @@ static const struct address_space_operations gfs2_jdata_aops = { .writepages = gfs2_jdata_writepages, .read_folio = gfs2_read_folio, .readahead = gfs2_readahead, - .dirty_folio = jdata_dirty_folio, + .dirty_folio = gfs2_jdata_dirty_folio, .bmap = gfs2_bmap, .migrate_folio = buffer_migrate_folio, .invalidate_folio = gfs2_invalidate_folio, diff --git a/fs/gfs2/aops.h b/fs/gfs2/aops.h index a10c4334d248..bf002522a782 100644 --- a/fs/gfs2/aops.h +++ b/fs/gfs2/aops.h @@ -9,7 +9,6 @@ #include "incore.h" void adjust_fs_space(struct inode *inode); -void gfs2_trans_add_databufs(struct gfs2_inode *ip, struct folio *folio, - size_t from, size_t len); +int gfs2_jdata_writeback(struct address_space *mapping, struct writeback_control *wbc); #endif /* __AOPS_DOT_H__ */ diff --git a/fs/gfs2/bmap.c b/fs/gfs2/bmap.c index 1795c4e8dbf6..7703d0471139 100644 --- a/fs/gfs2/bmap.c +++ b/fs/gfs2/bmap.c @@ -988,7 +988,8 @@ static void gfs2_iomap_put_folio(struct inode *inode, loff_t pos, struct gfs2_sbd *sdp = GFS2_SB(inode); if (!gfs2_is_stuffed(ip)) - gfs2_trans_add_databufs(ip, folio, offset_in_folio(folio, pos), + gfs2_trans_add_databufs(ip->i_gl, folio, + offset_in_folio(folio, pos), copied); folio_unlock(folio); @@ -1296,11 +1297,14 @@ int gfs2_alloc_extent(struct inode *inode, u64 lblock, u64 *dblock, * uses iomap write to perform its actions, which begin their own transactions * (iomap_begin, get_folio, etc.) 
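 *
 * Example of the EOF clamping added below (illustrative numbers): with
 * i_size == 4096, zeroing from == 1024 for length == 8192 is trimmed to
 * 3072 bytes, and a range starting at or beyond EOF returns 0 without
 * entering iomap at all.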
*/ -static int gfs2_block_zero_range(struct inode *inode, loff_t from, - unsigned int length) +static int gfs2_block_zero_range(struct inode *inode, loff_t from, loff_t length) { BUG_ON(current->journal_info); - return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops); + if (from >= inode->i_size) + return 0; + length = min(length, inode->i_size - from); + return iomap_zero_range(inode, from, length, NULL, &gfs2_iomap_ops, + NULL); } #define GFS2_JTRUNC_REVOKES 8192 diff --git a/fs/gfs2/file.c b/fs/gfs2/file.c index c9bb3be21d2b..fd1147aa3891 100644 --- a/fs/gfs2/file.c +++ b/fs/gfs2/file.c @@ -820,7 +820,7 @@ static ssize_t gfs2_file_direct_read(struct kiocb *iocb, struct iov_iter *to, /* * In this function, we disable page faults when we're holding the * inode glock while doing I/O. If a page fault occurs, we indicate - * that the inode glock may be dropped, fault in the pages manually, + * that the inode glock should be dropped, fault in the pages manually, * and retry. * * Unlike generic_file_read_iter, for reads, iomap_dio_rw can trigger @@ -885,7 +885,7 @@ static ssize_t gfs2_file_direct_write(struct kiocb *iocb, struct iov_iter *from, /* * In this function, we disable page faults when we're holding the * inode glock while doing I/O. If a page fault occurs, we indicate - * that the inode glock may be dropped, fault in the pages manually, + * that the inode glock should be dropped, fault in the pages manually, * and retry. * * For writes, iomap_dio_rw only triggers manual page faults, so we @@ -957,7 +957,7 @@ static ssize_t gfs2_file_read_iter(struct kiocb *iocb, struct iov_iter *to) /* * In this function, we disable page faults when we're holding the * inode glock while doing I/O. If a page fault occurs, we indicate - * that the inode glock may be dropped, fault in the pages manually, + * that the inode glock should be dropped, fault in the pages manually, * and retry. */ @@ -1024,7 +1024,7 @@ static ssize_t gfs2_file_buffered_write(struct kiocb *iocb, /* * In this function, we disable page faults when we're holding the * inode glock while doing I/O. If a page fault occurs, we indicate - * that the inode glock may be dropped, fault in the pages manually, + * that the inode glock should be dropped, fault in the pages manually, * and retry. 
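 *
 * Condensed shape of that pattern, shared by the four read and write
 * paths in this file (do_the_io() and fault_in() stand in for the
 * per-function I/O and fault-in calls):
 *
 *	gfs2_holder_allow_demote(gh);	// page faults may drop the glock
 *	ret = do_the_io(...);
 *	gfs2_holder_disallow_demote(gh);
 *	if (!gfs2_holder_queued(gh)) {	// it was indeed dropped
 *		fault_in(...);		// fault pages in manually
 *		goto retry;		// and take the glock again
 *	}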
*/ diff --git a/fs/gfs2/glock.c b/fs/gfs2/glock.c index 65c07aa95718..ba25b884169e 100644 --- a/fs/gfs2/glock.c +++ b/fs/gfs2/glock.c @@ -607,14 +607,19 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) if (gh && (ret & LM_OUT_CANCELED)) gfs2_holder_wake(gh); if (gh && !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)) { - /* move to back of queue and try next entry */ if (ret & LM_OUT_CANCELED) { - list_move_tail(&gh->gh_list, &gl->gl_holders); + list_del_init(&gh->gh_list); + trace_gfs2_glock_queue(gh, 0); + gl->gl_target = gl->gl_state; gh = find_first_waiter(gl); - gl->gl_target = gh->gh_state; - if (do_promote(gl)) - goto out; - goto retry; + if (gh) { + gl->gl_target = gh->gh_state; + if (do_promote(gl)) + goto out; + do_xmote(gl, gh, gl->gl_target); + return; + } + goto out; } /* Some error or failed "try lock" - report it */ if ((ret & LM_OUT_ERROR) || @@ -627,7 +632,6 @@ static void finish_xmote(struct gfs2_glock *gl, unsigned int ret) switch(state) { /* Unlocked due to conversion deadlock, try again */ case LM_ST_UNLOCKED: -retry: do_xmote(gl, gh, gl->gl_target); break; /* Conversion fails, unlock and try again */ @@ -661,7 +665,8 @@ retry: do_promote(gl); } out: - clear_bit(GLF_LOCK, &gl->gl_flags); + if (!test_bit(GLF_CANCELING, &gl->gl_flags)) + clear_bit(GLF_LOCK, &gl->gl_flags); } static bool is_system_glock(struct gfs2_glock *gl) @@ -807,6 +812,7 @@ skip_inval: } if (ls->ls_ops->lm_lock) { + set_bit(GLF_PENDING_REPLY, &gl->gl_flags); spin_unlock(&gl->gl_lockref.lock); ret = ls->ls_ops->lm_lock(gl, target, lck_flags); spin_lock(&gl->gl_lockref.lock); @@ -825,6 +831,7 @@ skip_inval: /* The operation will be completed asynchronously. */ return; } + clear_bit(GLF_PENDING_REPLY, &gl->gl_flags); } /* Complete the operation now. */ @@ -843,12 +850,13 @@ static void run_queue(struct gfs2_glock *gl, const int nonblock) __releases(&gl->gl_lockref.lock) __acquires(&gl->gl_lockref.lock) { - struct gfs2_holder *gh = NULL; + struct gfs2_holder *gh; if (test_bit(GLF_LOCK, &gl->gl_flags)) return; set_bit(GLF_LOCK, &gl->gl_flags); + /* While a demote is in progress, the GLF_LOCK flag must be set. 
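+	 * (GLF_LOCK acts as the "state machine busy" bit: whoever sets it
+	 * owns state transitions on this glock until clearing it, so a
+	 * demote still in progress implies the bit is already held and
+	 * run_queue(), having just acquired it, must not see
+	 * GLF_DEMOTE_IN_PROGRESS.)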
*/ GLOCK_BUG_ON(gl, test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags)); if (test_bit(GLF_DEMOTE, &gl->gl_flags) && @@ -860,18 +868,22 @@ __acquires(&gl->gl_lockref.lock) set_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags); GLOCK_BUG_ON(gl, gl->gl_demote_state == LM_ST_EXCLUSIVE); gl->gl_target = gl->gl_demote_state; + do_xmote(gl, NULL, gl->gl_target); + return; } else { if (test_bit(GLF_DEMOTE, &gl->gl_flags)) gfs2_demote_wake(gl); if (do_promote(gl)) goto out_unlock; gh = find_first_waiter(gl); + if (!gh) + goto out_unlock; gl->gl_target = gh->gh_state; if (!(gh->gh_flags & (LM_FLAG_TRY | LM_FLAG_TRY_1CB))) do_error(gl, 0); /* Fail queued try locks */ + do_xmote(gl, gh, gl->gl_target); + return; } - do_xmote(gl, gh, gl->gl_target); - return; out_sched: clear_bit(GLF_LOCK, &gl->gl_flags); @@ -898,12 +910,8 @@ void glock_set_object(struct gfs2_glock *gl, void *object) prev_object = gl->gl_object; gl->gl_object = object; spin_unlock(&gl->gl_lockref.lock); - if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == NULL)) { - pr_warn("glock=%u/%llx\n", - gl->gl_name.ln_type, - (unsigned long long)gl->gl_name.ln_number); + if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == NULL)) gfs2_dump_glock(NULL, gl, true); - } } /** @@ -919,12 +927,8 @@ void glock_clear_object(struct gfs2_glock *gl, void *object) prev_object = gl->gl_object; gl->gl_object = NULL; spin_unlock(&gl->gl_lockref.lock); - if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == object)) { - pr_warn("glock=%u/%llx\n", - gl->gl_name.ln_type, - (unsigned long long)gl->gl_name.ln_number); + if (gfs2_assert_warn(gl->gl_name.ln_sbd, prev_object == object)) gfs2_dump_glock(NULL, gl, true); - } } void gfs2_inode_remember_delete(struct gfs2_glock *gl, u64 generation) @@ -959,6 +963,25 @@ static void gfs2_glock_poke(struct gfs2_glock *gl) gfs2_holder_uninit(&gh); } +static struct gfs2_inode *gfs2_grab_existing_inode(struct gfs2_glock *gl) +{ + struct gfs2_inode *ip; + + spin_lock(&gl->gl_lockref.lock); + ip = gl->gl_object; + if (ip && !igrab(&ip->i_inode)) + ip = NULL; + spin_unlock(&gl->gl_lockref.lock); + if (ip) { + wait_on_inode(&ip->i_inode); + if (is_bad_inode(&ip->i_inode)) { + iput(&ip->i_inode); + ip = NULL; + } + } + return ip; +} + static void gfs2_try_evict(struct gfs2_glock *gl) { struct gfs2_inode *ip; @@ -976,32 +999,15 @@ static void gfs2_try_evict(struct gfs2_glock *gl) * happened below. (Verification is triggered by the call to * gfs2_queue_verify_delete() in gfs2_evict_inode().) */ - spin_lock(&gl->gl_lockref.lock); - ip = gl->gl_object; - if (ip && !igrab(&ip->i_inode)) - ip = NULL; - spin_unlock(&gl->gl_lockref.lock); - if (ip) { - wait_on_inode(&ip->i_inode); - if (is_bad_inode(&ip->i_inode)) { - iput(&ip->i_inode); - ip = NULL; - } - } + ip = gfs2_grab_existing_inode(gl); if (ip) { - set_bit(GIF_DEFER_DELETE, &ip->i_flags); + set_bit(GLF_DEFER_DELETE, &gl->gl_flags); d_prune_aliases(&ip->i_inode); iput(&ip->i_inode); + clear_bit(GLF_DEFER_DELETE, &gl->gl_flags); /* If the inode was evicted, gl->gl_object will now be NULL. 
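		 * Re-grabbing the inode through gfs2_grab_existing_inode()
		 * (factored out above) distinguishes the two outcomes: NULL
		 * means the eviction went through, while a live inode means
		 * it is still in use and only gets its glock poked.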
*/ - spin_lock(&gl->gl_lockref.lock); - ip = gl->gl_object; - if (ip) { - clear_bit(GIF_DEFER_DELETE, &ip->i_flags); - if (!igrab(&ip->i_inode)) - ip = NULL; - } - spin_unlock(&gl->gl_lockref.lock); + ip = gfs2_grab_existing_inode(gl); if (ip) { gfs2_glock_poke(ip->i_gl); iput(&ip->i_inode); @@ -1160,7 +1166,6 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, const struct gfs2_glock_operations *glops, int create, struct gfs2_glock **glp) { - struct super_block *s = sdp->sd_vfs; struct lm_lockname name = { .ln_number = number, .ln_type = glops->go_type, .ln_sbd = sdp }; @@ -1223,7 +1228,7 @@ int gfs2_glock_get(struct gfs2_sbd *sdp, u64 number, mapping = gfs2_glock2aspace(gl); if (mapping) { mapping->a_ops = &gfs2_meta_aops; - mapping->host = s->s_bdev->bd_mapping->host; + mapping->host = sdp->sd_inode; mapping->flags = 0; mapping_set_gfp_mask(mapping, GFP_NOFS); mapping->i_private_data = NULL; @@ -1462,9 +1467,7 @@ static inline bool pid_is_meaningful(const struct gfs2_holder *gh) { if (!(gh->gh_flags & GL_NOPID)) return true; - if (gh->gh_state == LM_ST_UNLOCKED) - return true; - return false; + return !test_bit(HIF_HOLDER, &gh->gh_iflags); } /** @@ -1483,7 +1486,6 @@ __acquires(&gl->gl_lockref.lock) { struct gfs2_glock *gl = gh->gh_gl; struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct list_head *insert_pt = NULL; struct gfs2_holder *gh2; int try_futile = 0; @@ -1519,21 +1521,11 @@ fail: gfs2_holder_wake(gh); return; } - if (test_bit(HIF_HOLDER, &gh2->gh_iflags)) - continue; } trace_gfs2_glock_queue(gh, 1); gfs2_glstats_inc(gl, GFS2_LKS_QCOUNT); gfs2_sbstats_inc(gl, GFS2_LKS_QCOUNT); - if (likely(insert_pt == NULL)) { - list_add_tail(&gh->gh_list, &gl->gl_holders); - return; - } - list_add_tail(&gh->gh_list, insert_pt); - spin_unlock(&gl->gl_lockref.lock); - if (sdp->sd_lockstruct.ls_ops->lm_cancel) - sdp->sd_lockstruct.ls_ops->lm_cancel(gl); - spin_lock(&gl->gl_lockref.lock); + list_add_tail(&gh->gh_list, &gl->gl_holders); return; trap_recursive: @@ -1673,11 +1665,19 @@ void gfs2_glock_dq(struct gfs2_holder *gh) } if (list_is_first(&gh->gh_list, &gl->gl_holders) && - !test_bit(HIF_HOLDER, &gh->gh_iflags)) { + !test_bit(HIF_HOLDER, &gh->gh_iflags) && + test_bit(GLF_LOCK, &gl->gl_flags) && + !test_bit(GLF_DEMOTE_IN_PROGRESS, &gl->gl_flags) && + !test_bit(GLF_CANCELING, &gl->gl_flags)) { + set_bit(GLF_CANCELING, &gl->gl_flags); spin_unlock(&gl->gl_lockref.lock); gl->gl_name.ln_sbd->sd_lockstruct.ls_ops->lm_cancel(gl); wait_on_bit(&gh->gh_iflags, HIF_WAIT, TASK_UNINTERRUPTIBLE); spin_lock(&gl->gl_lockref.lock); + clear_bit(GLF_CANCELING, &gl->gl_flags); + clear_bit(GLF_LOCK, &gl->gl_flags); + if (!gfs2_holder_queued(gh)) + goto out; } /* @@ -1923,6 +1923,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret) struct lm_lockstruct *ls = &gl->gl_name.ln_sbd->sd_lockstruct; spin_lock(&gl->gl_lockref.lock); + clear_bit(GLF_PENDING_REPLY, &gl->gl_flags); gl->gl_reply = ret; if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) { @@ -2323,6 +2324,8 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) *p++ = 'f'; if (test_bit(GLF_INVALIDATE_IN_PROGRESS, gflags)) *p++ = 'i'; + if (test_bit(GLF_PENDING_REPLY, gflags)) + *p++ = 'R'; if (test_bit(GLF_HAVE_REPLY, gflags)) *p++ = 'r'; if (test_bit(GLF_INITIAL, gflags)) @@ -2347,6 +2350,10 @@ static const char *gflags2str(char *buf, const struct gfs2_glock *gl) *p++ = 'e'; if (test_bit(GLF_VERIFY_DELETE, gflags)) *p++ = 'E'; + if (test_bit(GLF_DEFER_DELETE, gflags)) + *p++ = 's'; + if (test_bit(GLF_CANCELING, 
gflags)) + *p++ = 'C'; *p = 0; return buf; } diff --git a/fs/gfs2/glops.c b/fs/gfs2/glops.c index eb4714f299ef..cebd66b22694 100644 --- a/fs/gfs2/glops.c +++ b/fs/gfs2/glops.c @@ -168,7 +168,7 @@ void gfs2_ail_flush(struct gfs2_glock *gl, bool fsync) static int gfs2_rgrp_metasync(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct address_space *metamapping = &sdp->sd_aspace; + struct address_space *metamapping = gfs2_aspace(sdp); struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); const unsigned bsize = sdp->sd_sb.sb_bsize; loff_t start = (rgd->rd_addr * bsize) & PAGE_MASK; @@ -225,7 +225,7 @@ static int rgrp_go_sync(struct gfs2_glock *gl) static void rgrp_go_inval(struct gfs2_glock *gl, int flags) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; - struct address_space *mapping = &sdp->sd_aspace; + struct address_space *mapping = gfs2_aspace(sdp); struct gfs2_rgrpd *rgd = gfs2_glock2rgrp(gl); const unsigned bsize = sdp->sd_sb.sb_bsize; loff_t start, end; @@ -601,14 +601,13 @@ static int freeze_go_xmote_bh(struct gfs2_glock *gl) if (test_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags)) { j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); - error = gfs2_find_jhead(sdp->sd_jdesc, &head, false); + error = gfs2_find_jhead(sdp->sd_jdesc, &head); if (gfs2_assert_withdraw_delayed(sdp, !error)) return error; if (gfs2_assert_withdraw_delayed(sdp, head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) return -EIO; - sdp->sd_log_sequence = head.lh_sequence + 1; - gfs2_log_pointers_init(sdp, head.lh_blkno); + gfs2_log_pointers_init(sdp, &head); } return 0; } diff --git a/fs/gfs2/incore.h b/fs/gfs2/incore.h index 4e19cce3d906..0a41c4e76b32 100644 --- a/fs/gfs2/incore.h +++ b/fs/gfs2/incore.h @@ -330,6 +330,9 @@ enum { GLF_UNLOCKED = 16, /* Wait for glock to be unlocked */ GLF_TRY_TO_EVICT = 17, /* iopen glocks only */ GLF_VERIFY_DELETE = 18, /* iopen glocks only */ + GLF_PENDING_REPLY = 19, + GLF_DEFER_DELETE = 20, /* iopen glocks only */ + GLF_CANCELING = 21, }; struct gfs2_glock { @@ -376,7 +379,6 @@ enum { GIF_SW_PAGED = 3, GIF_FREE_VFS_INODE = 5, GIF_GLOP_PENDING = 6, - GIF_DEFER_DELETE = 7, }; struct gfs2_inode { @@ -793,7 +795,7 @@ struct gfs2_sbd { /* Log stuff */ - struct address_space sd_aspace; + struct inode *sd_inode; spinlock_t sd_log_lock; @@ -849,6 +851,13 @@ struct gfs2_sbd { unsigned long sd_glock_dqs_held; }; +#define GFS2_BAD_INO 1 + +static inline struct address_space *gfs2_aspace(struct gfs2_sbd *sdp) +{ + return sdp->sd_inode->i_mapping; +} + static inline void gfs2_glstats_inc(struct gfs2_glock *gl, int which) { gl->gl_stats.stats[which]++; diff --git a/fs/gfs2/inode.c b/fs/gfs2/inode.c index 6fbbaaad1cd0..187d789a8f1e 100644 --- a/fs/gfs2/inode.c +++ b/fs/gfs2/inode.c @@ -439,6 +439,74 @@ out: return error; } +static void gfs2_final_release_pages(struct gfs2_inode *ip) +{ + struct inode *inode = &ip->i_inode; + struct gfs2_glock *gl = ip->i_gl; + + if (unlikely(!gl)) { + /* This can only happen during incomplete inode creation. 
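+		 * (Namely the fail_dealloc_inode error path added to
+		 * gfs2_create_inode() by this patch, which sets
+		 * GIF_ALLOC_FAILED and may tear the inode down before i_gl
+		 * was ever created; hence the assertion below.)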
*/ + BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)); + return; + } + + truncate_inode_pages(gfs2_glock2aspace(gl), 0); + truncate_inode_pages(&inode->i_data, 0); + + if (atomic_read(&gl->gl_revokes) == 0) { + clear_bit(GLF_LFLUSH, &gl->gl_flags); + clear_bit(GLF_DIRTY, &gl->gl_flags); + } +} + +int gfs2_dinode_dealloc(struct gfs2_inode *ip) +{ + struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); + struct gfs2_rgrpd *rgd; + struct gfs2_holder gh; + int error; + + if (gfs2_get_inode_blocks(&ip->i_inode) != 1) { + gfs2_consist_inode(ip); + return -EIO; + } + + gfs2_rindex_update(sdp); + + error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); + if (error) + return error; + + rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr, 1); + if (!rgd) { + gfs2_consist_inode(ip); + error = -EIO; + goto out_qs; + } + + error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, + LM_FLAG_NODE_SCOPE, &gh); + if (error) + goto out_qs; + + error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, + sdp->sd_jdesc->jd_blocks); + if (error) + goto out_rg_gunlock; + + gfs2_free_di(rgd, ip); + + gfs2_final_release_pages(ip); + + gfs2_trans_end(sdp); + +out_rg_gunlock: + gfs2_glock_dq_uninit(&gh); +out_qs: + gfs2_quota_unhold(ip); + return error; +} + static void gfs2_init_dir(struct buffer_head *dibh, const struct gfs2_inode *parent) { @@ -629,10 +697,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, struct gfs2_inode *dip = GFS2_I(dir), *ip; struct gfs2_sbd *sdp = GFS2_SB(&dip->i_inode); struct gfs2_glock *io_gl; - int error; + int error, dealloc_error; u32 aflags = 0; unsigned blocks = 1; struct gfs2_diradd da = { .bh = NULL, .save_loc = 1, }; + bool xattr_initialized = false; if (!name->len || name->len > GFS2_FNAMESIZE) return -ENAMETOOLONG; @@ -659,7 +728,8 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, if (!IS_ERR(inode)) { if (S_ISDIR(inode->i_mode)) { iput(inode); - inode = ERR_PTR(-EISDIR); + inode = NULL; + error = -EISDIR; goto fail_gunlock; } d_instantiate(dentry, inode); @@ -744,11 +814,11 @@ static int gfs2_create_inode(struct inode *dir, struct dentry *dentry, error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_inode_glops, CREATE, &ip->i_gl); if (error) - goto fail_free_inode; + goto fail_dealloc_inode; error = gfs2_glock_get(sdp, ip->i_no_addr, &gfs2_iopen_glops, CREATE, &io_gl); if (error) - goto fail_free_inode; + goto fail_dealloc_inode; gfs2_cancel_delete_work(io_gl); io_gl->gl_no_formal_ino = ip->i_no_formal_ino; @@ -767,13 +837,16 @@ retry: error = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, &gh); if (error) goto fail_gunlock3; + clear_bit(GLF_INSTANTIATE_NEEDED, &ip->i_gl->gl_flags); error = gfs2_trans_begin(sdp, blocks, 0); if (error) goto fail_gunlock3; - if (blocks > 1) + if (blocks > 1) { gfs2_init_xattr(ip); + xattr_initialized = true; + } init_dinode(dip, ip, symname); gfs2_trans_end(sdp); @@ -828,6 +901,18 @@ fail_gunlock3: gfs2_glock_dq_uninit(&ip->i_iopen_gh); fail_gunlock2: gfs2_glock_put(io_gl); +fail_dealloc_inode: + set_bit(GIF_ALLOC_FAILED, &ip->i_flags); + dealloc_error = 0; + if (ip->i_eattr) + dealloc_error = gfs2_ea_dealloc(ip, xattr_initialized); + clear_nlink(inode); + mark_inode_dirty(inode); + if (!dealloc_error) + dealloc_error = gfs2_dinode_dealloc(ip); + if (dealloc_error) + fs_warn(sdp, "%s: %d\n", __func__, dealloc_error); + ip->i_no_addr = 0; fail_free_inode: if (ip->i_gl) { gfs2_glock_put(ip->i_gl); @@ -842,10 +927,6 @@ fail_gunlock: gfs2_dir_no_add(&da); gfs2_glock_dq_uninit(&d_gh); if 
(!IS_ERR_OR_NULL(inode)) { - set_bit(GIF_ALLOC_FAILED, &ip->i_flags); - clear_nlink(inode); - if (ip->i_no_addr) - mark_inode_dirty(inode); if (inode->i_state & I_NEW) iget_failed(inode); else @@ -1248,14 +1329,15 @@ static int gfs2_symlink(struct mnt_idmap *idmap, struct inode *dir, * @dentry: The dentry of the new directory * @mode: The mode of the new directory * - * Returns: errno + * Returns: the dentry, or ERR_PTR(errno) */ -static int gfs2_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *gfs2_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { unsigned dsize = gfs2_max_stuffed_size(GFS2_I(dir)); - return gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0); + + return ERR_PTR(gfs2_create_inode(dir, dentry, NULL, S_IFDIR | mode, 0, NULL, dsize, 0)); } /** diff --git a/fs/gfs2/inode.h b/fs/gfs2/inode.h index 9e5e1622d50a..eafe123617e6 100644 --- a/fs/gfs2/inode.h +++ b/fs/gfs2/inode.h @@ -92,6 +92,7 @@ struct inode *gfs2_inode_lookup(struct super_block *sb, unsigned type, struct inode *gfs2_lookup_by_inum(struct gfs2_sbd *sdp, u64 no_addr, u64 no_formal_ino, unsigned int blktype); +int gfs2_dinode_dealloc(struct gfs2_inode *ip); struct inode *gfs2_lookupi(struct inode *dir, const struct qstr *name, int is_root); diff --git a/fs/gfs2/lock_dlm.c b/fs/gfs2/lock_dlm.c index 58aeeae7ed8c..7cb9d216d8bb 100644 --- a/fs/gfs2/lock_dlm.c +++ b/fs/gfs2/lock_dlm.c @@ -328,6 +328,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl) { struct gfs2_sbd *sdp = gl->gl_name.ln_sbd; struct lm_lockstruct *ls = &sdp->sd_lockstruct; + uint32_t flags = 0; int error; BUG_ON(!__lockref_is_dead(&gl->gl_lockref)); @@ -352,7 +353,7 @@ static void gdlm_put_lock(struct gfs2_glock *gl) * When the lockspace is released, all remaining glocks will be * unlocked automatically. This is more efficient than unlocking them * individually, but when the lock is held in DLM_LOCK_EX or - * DLM_LOCK_PW mode, the lock value block (LVB) will be lost. + * DLM_LOCK_PW mode, the lock value block (LVB) would be lost. */ if (test_bit(SDF_SKIP_DLM_UNLOCK, &sdp->sd_flags) && @@ -361,8 +362,11 @@ static void gdlm_put_lock(struct gfs2_glock *gl) return; } + if (gl->gl_lksb.sb_lvbptr) + flags |= DLM_LKF_VALBLK; + again: - error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, DLM_LKF_VALBLK, + error = dlm_unlock(ls->ls_dlm, gl->gl_lksb.sb_lkid, flags, NULL, gl); if (error == -EBUSY) { msleep(20); @@ -996,14 +1000,15 @@ locks_done: if (sdp->sd_args.ar_spectator) { fs_info(sdp, "Recovery is required. 
Waiting for a " "non-spectator to mount.\n"); + spin_unlock(&ls->ls_recover_spin); msleep_interruptible(1000); } else { fs_info(sdp, "control_mount wait1 block %u start %u " "mount %u lvb %u flags %lx\n", block_gen, start_gen, mount_gen, lvb_gen, ls->ls_recover_flags); + spin_unlock(&ls->ls_recover_spin); } - spin_unlock(&ls->ls_recover_spin); goto restart; } diff --git a/fs/gfs2/log.c b/fs/gfs2/log.c index f9c5089783d2..115c4ac457e9 100644 --- a/fs/gfs2/log.c +++ b/fs/gfs2/log.c @@ -31,6 +31,7 @@ #include "dir.h" #include "trace_gfs2.h" #include "trans.h" +#include "aops.h" static void gfs2_log_shutdown(struct gfs2_sbd *sdp); @@ -131,7 +132,11 @@ __acquires(&sdp->sd_ail_lock) if (!mapping) continue; spin_unlock(&sdp->sd_ail_lock); - ret = mapping->a_ops->writepages(mapping, wbc); + BUG_ON(GFS2_SB(mapping->host) != sdp); + if (gfs2_is_jdata(GFS2_I(mapping->host))) + ret = gfs2_jdata_writeback(mapping, wbc); + else + ret = mapping->a_ops->writepages(mapping, wbc); if (need_resched()) { blk_finish_plug(plug); cond_resched(); diff --git a/fs/gfs2/log.h b/fs/gfs2/log.h index c27b05099c1e..fc30ebdad83a 100644 --- a/fs/gfs2/log.h +++ b/fs/gfs2/log.h @@ -44,17 +44,6 @@ __releases(&sdp->sd_log_lock) spin_unlock(&sdp->sd_log_lock); } -static inline void gfs2_log_pointers_init(struct gfs2_sbd *sdp, - unsigned int value) -{ - if (++value == sdp->sd_jdesc->jd_blocks) { - value = 0; - } - sdp->sd_log_tail = value; - sdp->sd_log_flush_tail = value; - sdp->sd_log_head = value; -} - static inline void gfs2_ordered_add_inode(struct gfs2_inode *ip) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); diff --git a/fs/gfs2/lops.c b/fs/gfs2/lops.c index 314ec2a70167..9c8c305a75c4 100644 --- a/fs/gfs2/lops.c +++ b/fs/gfs2/lops.c @@ -157,7 +157,9 @@ u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lblock) /** * gfs2_end_log_write_bh - end log write of pagecache data with buffers * @sdp: The superblock - * @bvec: The bio_vec + * @folio: The folio + * @offset: The first byte within the folio that completed + * @size: The number of bytes that completed * @error: The i/o status * * This finds the relevant buffers and unlocks them and sets the @@ -166,17 +168,13 @@ u64 gfs2_log_bmap(struct gfs2_jdesc *jd, unsigned int lblock) * that is pinned in the pagecache. 
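[Editor's note: the control_mount() change just above moves the spin_unlock() so that ls_recover_spin is dropped on both branches before the thread sleeps or loops. Kernel spinlocks must never be held across a sleep; the following is a rough userspace analogy with a pthread mutex that shows the same "release on every path before waiting" shape:]

#include <pthread.h>
#include <stdio.h>
#include <unistd.h>

static pthread_mutex_t recover_lock = PTHREAD_MUTEX_INITIALIZER;
static int recovery_needed = 1;

static void *recoverer(void *arg)
{
	usleep(10000);
	pthread_mutex_lock(&recover_lock);
	recovery_needed = 0;
	pthread_mutex_unlock(&recover_lock);
	return NULL;
}

int main(void)
{
	pthread_t t;

	pthread_create(&t, NULL, recoverer, NULL);
	for (;;) {
		pthread_mutex_lock(&recover_lock);
		if (!recovery_needed) {
			pthread_mutex_unlock(&recover_lock);
			break;
		}
		/* drop the lock on this path too, then wait and retry */
		pthread_mutex_unlock(&recover_lock);
		usleep(1000);
	}
	pthread_join(t, NULL);
	printf("recovery done\n");
	return 0;
}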
*/ -static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, - struct bio_vec *bvec, - blk_status_t error) +static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, struct folio *folio, + size_t offset, size_t size, blk_status_t error) { struct buffer_head *bh, *next; - struct page *page = bvec->bv_page; - unsigned size; - bh = page_buffers(page); - size = bvec->bv_len; - while (bh_offset(bh) < bvec->bv_offset) + bh = folio_buffers(folio); + while (bh_offset(bh) < offset) bh = bh->b_this_page; do { if (error) @@ -186,7 +184,7 @@ static void gfs2_end_log_write_bh(struct gfs2_sbd *sdp, size -= bh->b_size; brelse(bh); bh = next; - } while(bh && size); + } while (bh && size); } /** @@ -203,13 +201,14 @@ static void gfs2_end_log_write(struct bio *bio) { struct gfs2_sbd *sdp = bio->bi_private; struct bio_vec *bvec; - struct page *page; struct bvec_iter_all iter_all; if (bio->bi_status) { - if (!cmpxchg(&sdp->sd_log_error, 0, (int)bio->bi_status)) + int err = blk_status_to_errno(bio->bi_status); + + if (!cmpxchg(&sdp->sd_log_error, 0, err)) fs_err(sdp, "Error %d writing to journal, jid=%u\n", - bio->bi_status, sdp->sd_jdesc->jd_jid); + err, sdp->sd_jdesc->jd_jid); gfs2_withdraw_delayed(sdp); /* prevent more writes to the journal */ clear_bit(SDF_JOURNAL_LIVE, &sdp->sd_flags); @@ -217,9 +216,12 @@ static void gfs2_end_log_write(struct bio *bio) } bio_for_each_segment_all(bvec, bio, iter_all) { - page = bvec->bv_page; - if (page_has_buffers(page)) - gfs2_end_log_write_bh(sdp, bvec, bio->bi_status); + struct page *page = bvec->bv_page; + struct folio *folio = page_folio(page); + + if (folio && folio_buffers(folio)) + gfs2_end_log_write_bh(sdp, folio, bvec->bv_offset, + bvec->bv_len, bio->bi_status); else mempool_free(page, gfs2_page_pool); } @@ -359,8 +361,8 @@ static void gfs2_log_write_bh(struct gfs2_sbd *sdp, struct buffer_head *bh) dblock = gfs2_log_bmap(sdp->sd_jdesc, sdp->sd_log_flush_head); gfs2_log_incr_head(sdp); - gfs2_log_write(sdp, sdp->sd_jdesc, bh->b_page, bh->b_size, - bh_offset(bh), dblock); + gfs2_log_write(sdp, sdp->sd_jdesc, folio_page(bh->b_folio, 0), + bh->b_size, bh_offset(bh), dblock); } /** @@ -406,17 +408,16 @@ static void gfs2_end_log_read(struct bio *bio) } /** - * gfs2_jhead_pg_srch - Look for the journal head in a given page. + * gfs2_jhead_folio_search - Look for the journal head in a given page. * @jd: The journal descriptor * @head: The journal head to start from - * @page: The page to look in + * @folio: The folio to look in * * Returns: 1 if found, 0 otherwise. */ - -static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, - struct gfs2_log_header_host *head, - struct page *page) +static bool gfs2_jhead_folio_search(struct gfs2_jdesc *jd, + struct gfs2_log_header_host *head, + struct folio *folio) { struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct gfs2_log_header_host lh; @@ -424,7 +425,8 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, unsigned int offset; bool ret = false; - kaddr = kmap_local_page(page); + VM_BUG_ON_FOLIO(folio_test_large(folio), folio); + kaddr = kmap_local_folio(folio, 0); for (offset = 0; offset < PAGE_SIZE; offset += sdp->sd_sb.sb_bsize) { if (!__get_log_header(sdp, kaddr + offset, 0, &lh)) { if (lh.lh_sequence >= head->lh_sequence) @@ -449,7 +451,7 @@ static bool gfs2_jhead_pg_srch(struct gfs2_jdesc *jd, * Find the folio with 'index' in the journal's mapping. Search the folio for * the journal head if requested (cleanup == false). Release refs on the * folio so the page cache can reclaim it. 
We grabbed a - * reference on this folio twice, first when we did a grab_cache_page() + * reference on this folio twice, first when we did a filemap_grab_folio() * to obtain the folio to add it to the bio and second when we do a * filemap_get_folio() here to get the folio to wait on while I/O on it is being * completed. @@ -472,9 +474,9 @@ static void gfs2_jhead_process_page(struct gfs2_jdesc *jd, unsigned long index, *done = true; if (!*done) - *done = gfs2_jhead_pg_srch(jd, head, &folio->page); + *done = gfs2_jhead_folio_search(jd, head, folio); - /* filemap_get_folio() and the earlier grab_cache_page() */ + /* filemap_get_folio() and the earlier filemap_grab_folio() */ folio_put_refs(folio, 2); } @@ -494,15 +496,13 @@ static struct bio *gfs2_chain_bio(struct bio *prev, unsigned int nr_iovecs) * gfs2_find_jhead - find the head of a log * @jd: The journal descriptor * @head: The log descriptor for the head of the log is returned here - * @keep_cache: If set inode pages will not be truncated * * Do a search of a journal by reading it in large chunks using bios and find * the valid log entry with the highest sequence number. (i.e. the log head) * * Returns: 0 on success, errno otherwise */ -int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, - bool keep_cache) +int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) { struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); struct address_space *mapping = jd->jd_inode->i_mapping; @@ -512,9 +512,9 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, unsigned int shift = PAGE_SHIFT - bsize_shift; unsigned int max_blocks = 2 * 1024 * 1024 >> bsize_shift; struct gfs2_journal_extent *je; - int sz, ret = 0; + int ret = 0; struct bio *bio = NULL; - struct page *page = NULL; + struct folio *folio = NULL; bool done = false; errseq_t since; @@ -527,10 +527,11 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, u64 dblock = je->dblock; for (; block < je->lblock + je->blocks; block++, dblock++) { - if (!page) { - page = grab_cache_page(mapping, block >> shift); - if (!page) { - ret = -ENOMEM; + if (!folio) { + folio = filemap_grab_folio(mapping, + block >> shift); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); done = true; goto out; } @@ -541,8 +542,7 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, sector_t sector = dblock << sdp->sd_fsb2bb_shift; if (bio_end_sector(bio) == sector) { - sz = bio_add_page(bio, page, bsize, off); - if (sz == bsize) + if (bio_add_folio(bio, folio, bsize, off)) goto block_added; } if (off) { @@ -562,12 +562,12 @@ int gfs2_find_jhead(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head, bio = gfs2_log_alloc_bio(sdp, dblock, gfs2_end_log_read); bio->bi_opf = REQ_OP_READ; add_block_to_new_bio: - sz = bio_add_page(bio, page, bsize, off); - BUG_ON(sz != bsize); + if (!bio_add_folio(bio, folio, bsize, off)) + BUG(); block_added: off += bsize; - if (off == PAGE_SIZE) - page = NULL; + if (off == folio_size(folio)) + folio = NULL; if (blocks_submitted <= blocks_read + max_blocks) { /* Keep at least one bio in flight */ continue; @@ -591,8 +591,7 @@ out: if (!ret) ret = filemap_check_wb_err(mapping, since); - if (!keep_cache) - truncate_inode_pages(mapping, 0); + truncate_inode_pages(mapping, 0); return ret; } @@ -615,15 +614,13 @@ static struct page *gfs2_get_log_desc(struct gfs2_sbd *sdp, u32 ld_type, static void gfs2_check_magic(struct buffer_head *bh) { - void *kaddr; __be32 *ptr; 
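[Editor's note: gfs2_end_log_write() above now converts bi_status with blk_status_to_errno() before recording it, and the cmpxchg() keeps only the first error seen across many concurrent bio completions. The same idiom in portable C11 atomics, as a sketch:]

#include <stdatomic.h>
#include <stdio.h>

static _Atomic int log_error;	/* 0 means "no error yet" */

static void record_first_error(int err)
{
	int expected = 0;

	/* succeeds only if no error has been stored yet */
	if (atomic_compare_exchange_strong(&log_error, &expected, err))
		fprintf(stderr, "error %d writing to journal\n", err);
}

int main(void)
{
	record_first_error(-5);		/* -EIO: recorded and reported */
	record_first_error(-28);	/* -ENOSPC: silently ignored */
	printf("first error: %d\n", log_error);
	return 0;
}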
clear_buffer_escaped(bh); - kaddr = kmap_local_page(bh->b_page); - ptr = kaddr + bh_offset(bh); + ptr = kmap_local_folio(bh->b_folio, bh_offset(bh)); if (*ptr == cpu_to_be32(GFS2_MAGIC)) set_buffer_escaped(bh); - kunmap_local(kaddr); + kunmap_local(ptr); } static int blocknr_cmp(void *priv, const struct list_head *a, diff --git a/fs/gfs2/lops.h b/fs/gfs2/lops.h index 07890c7b145d..be740bf33666 100644 --- a/fs/gfs2/lops.h +++ b/fs/gfs2/lops.h @@ -20,7 +20,7 @@ void gfs2_log_write(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, void gfs2_log_submit_bio(struct bio **biop, blk_opf_t opf); void gfs2_pin(struct gfs2_sbd *sdp, struct buffer_head *bh); int gfs2_find_jhead(struct gfs2_jdesc *jd, - struct gfs2_log_header_host *head, bool keep_cache); + struct gfs2_log_header_host *head); void gfs2_drain_revokes(struct gfs2_sbd *sdp); static inline unsigned int buf_limit(struct gfs2_sbd *sdp) diff --git a/fs/gfs2/meta_io.c b/fs/gfs2/meta_io.c index fea3efcc2f93..9dc8885c95d0 100644 --- a/fs/gfs2/meta_io.c +++ b/fs/gfs2/meta_io.c @@ -132,7 +132,7 @@ struct buffer_head *gfs2_getbuf(struct gfs2_glock *gl, u64 blkno, int create) unsigned int bufnum; if (mapping == NULL) - mapping = &sdp->sd_aspace; + mapping = gfs2_aspace(sdp); shift = PAGE_SHIFT - sdp->sd_sb.sb_bsize_shift; index = blkno >> shift; /* convert block to page */ @@ -198,15 +198,14 @@ struct buffer_head *gfs2_meta_new(struct gfs2_glock *gl, u64 blkno) static void gfs2_meta_read_endio(struct bio *bio) { - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + struct folio_iter fi; - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *page = bvec->bv_page; - struct buffer_head *bh = page_buffers(page); - unsigned int len = bvec->bv_len; + bio_for_each_folio_all(fi, bio) { + struct folio *folio = fi.folio; + struct buffer_head *bh = folio_buffers(folio); + size_t len = fi.length; - while (bh_offset(bh) < bvec->bv_offset) + while (bh_offset(bh) < fi.offset) bh = bh->b_this_page; do { struct buffer_head *next = bh->b_this_page; @@ -232,7 +231,7 @@ static void gfs2_submit_bhs(blk_opf_t opf, struct buffer_head *bhs[], int num) bio->bi_iter.bi_sector = bh->b_blocknr * (bh->b_size >> 9); while (num > 0) { bh = *bhs; - if (!bio_add_page(bio, bh->b_page, bh->b_size, bh_offset(bh))) { + if (!bio_add_folio(bio, bh->b_folio, bh->b_size, bh_offset(bh))) { BUG_ON(bio->bi_iter.bi_size == 0); break; } diff --git a/fs/gfs2/meta_io.h b/fs/gfs2/meta_io.h index 831d988c2ceb..b7c8a6684d02 100644 --- a/fs/gfs2/meta_io.h +++ b/fs/gfs2/meta_io.h @@ -44,9 +44,7 @@ static inline struct gfs2_sbd *gfs2_mapping2sbd(struct address_space *mapping) struct gfs2_glock_aspace *gla = container_of(mapping, struct gfs2_glock_aspace, mapping); return gla->glock.gl_name.ln_sbd; - } else if (mapping->a_ops == &gfs2_rgrp_aops) - return container_of(mapping, struct gfs2_sbd, sd_aspace); - else + } else return inode->i_sb->s_fs_info; } diff --git a/fs/gfs2/ops_fstype.c b/fs/gfs2/ops_fstype.c index e83d293c3614..653f0ff4b057 100644 --- a/fs/gfs2/ops_fstype.c +++ b/fs/gfs2/ops_fstype.c @@ -64,15 +64,13 @@ static void gfs2_tune_init(struct gfs2_tune *gt) void free_sbd(struct gfs2_sbd *sdp) { - if (sdp->sd_lkstats) - free_percpu(sdp->sd_lkstats); + free_percpu(sdp->sd_lkstats); kfree(sdp); } static struct gfs2_sbd *init_sbd(struct super_block *sb) { struct gfs2_sbd *sdp; - struct address_space *mapping; sdp = kzalloc(sizeof(struct gfs2_sbd), GFP_KERNEL); if (!sdp) @@ -109,16 +107,6 @@ static struct gfs2_sbd *init_sbd(struct super_block *sb) INIT_LIST_HEAD(&sdp->sd_sc_inodes_list); 
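[Editor's note: the folio-based end_io conversions in this hunk — gfs2_end_log_write_bh() and gfs2_meta_read_endio() — share one walk: step through the page's circular list of fixed-size buffer heads until bh_offset() reaches the completed range, then handle len bytes' worth of buffers. A self-contained sketch of that walk, with the kernel structures mocked up:]

#include <stddef.h>
#include <stdio.h>

#define BH_SIZE 512
#define NR_BH   8	/* a 4096-byte page of 512-byte buffers */

struct bh {
	size_t offset;		/* byte offset within the page */
	struct bh *next;	/* circular, like b_this_page */
};

int main(void)
{
	struct bh bhs[NR_BH];
	struct bh *bh = bhs;
	size_t offset = 1536, size = 1024;	/* completed byte range */

	for (int i = 0; i < NR_BH; i++) {
		bhs[i].offset = i * BH_SIZE;
		bhs[i].next = &bhs[(i + 1) % NR_BH];
	}

	/* skip buffers that precede the completed range... */
	while (bh->offset < offset)
		bh = bh->next;
	/* ...then complete one buffer per BH_SIZE bytes */
	do {
		printf("completing buffer at offset %zu\n", bh->offset);
		size -= BH_SIZE;
		bh = bh->next;
	} while (bh != bhs && size);

	return 0;
}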
- mapping = &sdp->sd_aspace; - - address_space_init_once(mapping); - mapping->a_ops = &gfs2_rgrp_aops; - mapping->host = sb->s_bdev->bd_mapping->host; - mapping->flags = 0; - mapping_set_gfp_mask(mapping, GFP_NOFS); - mapping->i_private_data = NULL; - mapping->writeback_index = 0; - spin_lock_init(&sdp->sd_log_lock); atomic_set(&sdp->sd_log_pinned, 0); INIT_LIST_HEAD(&sdp->sd_log_revokes); @@ -226,28 +214,22 @@ static void gfs2_sb_in(struct gfs2_sbd *sdp, const struct gfs2_sb *str) static int gfs2_read_super(struct gfs2_sbd *sdp, sector_t sector, int silent) { - struct super_block *sb = sdp->sd_vfs; - struct page *page; - struct bio_vec bvec; - struct bio bio; + struct gfs2_sb *sb; int err; - page = alloc_page(GFP_KERNEL); - if (unlikely(!page)) + sb = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (unlikely(!sb)) return -ENOMEM; - - bio_init(&bio, sb->s_bdev, &bvec, 1, REQ_OP_READ | REQ_META); - bio.bi_iter.bi_sector = sector * (sb->s_blocksize >> 9); - __bio_add_page(&bio, page, PAGE_SIZE, 0); - - err = submit_bio_wait(&bio); + err = bdev_rw_virt(sdp->sd_vfs->s_bdev, + sector * (sdp->sd_vfs->s_blocksize >> 9), sb, PAGE_SIZE, + REQ_OP_READ | REQ_META); if (err) { pr_warn("error %d reading superblock\n", err); - __free_page(page); + kfree(sb); return err; } - gfs2_sb_in(sdp, page_address(page)); - __free_page(page); + gfs2_sb_in(sdp, sb); + kfree(sb); return gfs2_check_sb(sdp, silent); } @@ -500,7 +482,9 @@ static int init_sb(struct gfs2_sbd *sdp, int silent) sdp->sd_sb.sb_bsize, (unsigned int)PAGE_SIZE); goto out; } - sb_set_blocksize(sb, sdp->sd_sb.sb_bsize); + ret = -EINVAL; + if (!sb_set_blocksize(sb, sdp->sd_sb.sb_bsize)) + goto out; /* Get the root inode */ no_addr = sdp->sd_sb.sb_root_dir.no_addr; @@ -1135,6 +1119,7 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) int silent = fc->sb_flags & SB_SILENT; struct gfs2_sbd *sdp; struct gfs2_holder mount_gh; + struct address_space *mapping; int error; sdp = init_sbd(sb); @@ -1156,6 +1141,7 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_flags |= SB_NOSEC; sb->s_magic = GFS2_MAGIC; sb->s_op = &gfs2_super_ops; + sb->s_d_op = &gfs2_dops; sb->s_export_op = &gfs2_export_ops; sb->s_qcop = &gfs2_quotactl_ops; @@ -1167,6 +1153,9 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) /* Set up the buffer cache and fill in some fake block size values to allow us to read-in the on-disk superblock. 
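[Editor's note: gfs2_read_super() above stops building a single-page bio by hand and instead reads into a kmalloc'ed buffer through the bdev_rw_virt() helper. The userspace shape of the same simplification — one positioned read into a heap buffer rather than hand-rolled I/O plumbing — as a loose analogy only:]

#include <fcntl.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>

#define SB_SIZE     4096
#define SECTOR_SIZE 512

int main(int argc, char **argv)
{
	const char *dev = argc > 1 ? argv[1] : "/dev/zero";
	off_t sector = 128;	/* superblock location, in 512-byte units */
	char *buf = malloc(SB_SIZE);
	int fd = open(dev, O_RDONLY);

	if (!buf || fd < 0) {
		perror("setup");
		return 1;
	}
	if (pread(fd, buf, SB_SIZE, (off_t)sector * SECTOR_SIZE) != SB_SIZE) {
		fprintf(stderr, "short read of superblock\n");
		return 1;
	}
	printf("read %d superblock bytes\n", SB_SIZE);
	close(fd);
	free(buf);
	return 0;
}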
*/ sdp->sd_sb.sb_bsize = sb_min_blocksize(sb, 512); + error = -EINVAL; + if (!sdp->sd_sb.sb_bsize) + goto fail_free; sdp->sd_sb.sb_bsize_shift = sb->s_blocksize_bits; sdp->sd_fsb2bb_shift = sdp->sd_sb.sb_bsize_shift - 9; sdp->sd_fsb2bb = BIT(sdp->sd_fsb2bb_shift); @@ -1181,9 +1170,21 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) sdp->sd_tune.gt_statfs_quantum = 30; } + /* Set up an address space for metadata writes */ + sdp->sd_inode = new_inode(sb); + error = -ENOMEM; + if (!sdp->sd_inode) + goto fail_free; + sdp->sd_inode->i_ino = GFS2_BAD_INO; + sdp->sd_inode->i_size = OFFSET_MAX; + + mapping = gfs2_aspace(sdp); + mapping->a_ops = &gfs2_rgrp_aops; + mapping_set_gfp_mask(mapping, GFP_NOFS); + error = init_names(sdp, silent); if (error) - goto fail_free; + goto fail_iput; snprintf(sdp->sd_fsname, sizeof(sdp->sd_fsname), "%s", sdp->sd_table_name); @@ -1192,7 +1193,7 @@ static int gfs2_fill_super(struct super_block *sb, struct fs_context *fc) WQ_MEM_RECLAIM | WQ_HIGHPRI | WQ_FREEZABLE, 0, sdp->sd_fsname); if (!sdp->sd_glock_wq) - goto fail_free; + goto fail_iput; sdp->sd_delete_wq = alloc_workqueue("gfs2-delete/%s", WQ_MEM_RECLAIM | WQ_FREEZABLE, 0, sdp->sd_fsname); @@ -1309,6 +1310,8 @@ fail_delete_wq: fail_glock_wq: if (sdp->sd_glock_wq) destroy_workqueue(sdp->sd_glock_wq); +fail_iput: + iput(sdp->sd_inode); fail_free: free_sbd(sdp); sb->s_fs_info = NULL; diff --git a/fs/gfs2/recovery.c b/fs/gfs2/recovery.c index f4fe7039f725..24250478b085 100644 --- a/fs/gfs2/recovery.c +++ b/fs/gfs2/recovery.c @@ -118,6 +118,7 @@ void gfs2_revoke_clean(struct gfs2_jdesc *jd) int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh, unsigned int blkno, struct gfs2_log_header_host *head) { + const u32 zero = 0; u32 hash, crc; if (lh->lh_header.mh_magic != cpu_to_be32(GFS2_MAGIC) || @@ -126,7 +127,7 @@ int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh, return 1; hash = crc32(~0, lh, LH_V1_SIZE - 4); - hash = ~crc32_le_shift(hash, 4); /* assume lh_hash is zero */ + hash = ~crc32(hash, &zero, 4); /* assume lh_hash is zero */ if (be32_to_cpu(lh->lh_hash) != hash) return 1; @@ -263,16 +264,12 @@ static void clean_journal(struct gfs2_jdesc *jd, struct gfs2_log_header_host *head) { struct gfs2_sbd *sdp = GFS2_SB(jd->jd_inode); - u32 lblock = head->lh_blkno; - gfs2_replay_incr_blk(jd, &lblock); - gfs2_write_log_header(sdp, jd, head->lh_sequence + 1, 0, lblock, + gfs2_replay_incr_blk(jd, &head->lh_blkno); + head->lh_sequence++; + gfs2_write_log_header(sdp, jd, head->lh_sequence, 0, head->lh_blkno, GFS2_LOG_HEAD_UNMOUNT | GFS2_LOG_HEAD_RECOVERY, REQ_PREFLUSH | REQ_FUA | REQ_META | REQ_SYNC); - if (jd->jd_jid == sdp->sd_lockstruct.ls_jid) { - sdp->sd_log_flush_head = lblock; - gfs2_log_incr_head(sdp); - } } @@ -457,7 +454,7 @@ void gfs2_recover_func(struct work_struct *work) if (error) goto fail_gunlock_ji; - error = gfs2_find_jhead(jd, &head, true); + error = gfs2_find_jhead(jd, &head); if (error) goto fail_gunlock_ji; t_jhd = ktime_get(); @@ -533,6 +530,9 @@ void gfs2_recover_func(struct work_struct *work) ktime_ms_delta(t_rep, t_tlck)); } + if (jd->jd_jid == sdp->sd_lockstruct.ls_jid) + gfs2_log_pointers_init(sdp, &head); + gfs2_recovery_done(sdp, jd->jd_jid, LM_RD_SUCCESS); if (jlocked) { @@ -580,3 +580,13 @@ int gfs2_recover_journal(struct gfs2_jdesc *jd, bool wait) return wait ? 
jd->jd_recover_error : 0; } +void gfs2_log_pointers_init(struct gfs2_sbd *sdp, + struct gfs2_log_header_host *head) +{ + sdp->sd_log_sequence = head->lh_sequence + 1; + gfs2_replay_incr_blk(sdp->sd_jdesc, &head->lh_blkno); + sdp->sd_log_tail = head->lh_blkno; + sdp->sd_log_flush_head = head->lh_blkno; + sdp->sd_log_flush_tail = head->lh_blkno; + sdp->sd_log_head = head->lh_blkno; +} diff --git a/fs/gfs2/recovery.h b/fs/gfs2/recovery.h index 6a0fd42e1120..5a5ba72ecd75 100644 --- a/fs/gfs2/recovery.h +++ b/fs/gfs2/recovery.h @@ -29,6 +29,8 @@ void gfs2_recover_func(struct work_struct *work); int __get_log_header(struct gfs2_sbd *sdp, const struct gfs2_log_header *lh, unsigned int blkno, struct gfs2_log_header_host *head); +void gfs2_log_pointers_init(struct gfs2_sbd *sdp, + struct gfs2_log_header_host *head); #endif /* __RECOVERY_DOT_H__ */ diff --git a/fs/gfs2/super.c b/fs/gfs2/super.c index 92a3b6ddafdc..7c518c4ff638 100644 --- a/fs/gfs2/super.c +++ b/fs/gfs2/super.c @@ -134,28 +134,18 @@ int gfs2_make_fs_rw(struct gfs2_sbd *sdp) { struct gfs2_inode *ip = GFS2_I(sdp->sd_jdesc->jd_inode); struct gfs2_glock *j_gl = ip->i_gl; - struct gfs2_log_header_host head; int error; j_gl->gl_ops->go_inval(j_gl, DIO_METADATA); if (gfs2_withdrawing_or_withdrawn(sdp)) return -EIO; - error = gfs2_find_jhead(sdp->sd_jdesc, &head, false); - if (error) { - gfs2_consist(sdp); - return error; - } - - if (!(head.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { - gfs2_consist(sdp); + if (sdp->sd_log_sequence == 0) { + fs_err(sdp, "unknown status of our own journal jid %d", + sdp->sd_lockstruct.ls_jid); return -EIO; } - /* Initialize some head of the log stuff */ - sdp->sd_log_sequence = head.lh_sequence + 1; - gfs2_log_pointers_init(sdp, head.lh_blkno); - error = gfs2_quota_init(sdp); if (!error && gfs2_withdrawing_or_withdrawn(sdp)) error = -EIO; @@ -370,7 +360,7 @@ static int gfs2_lock_fs_check_clean(struct gfs2_sbd *sdp) error = gfs2_jdesc_check(jd); if (error) break; - error = gfs2_find_jhead(jd, &lh, false); + error = gfs2_find_jhead(jd, &lh); if (error) break; if (!(lh.lh_flags & GFS2_LOG_HEAD_UNMOUNT)) { @@ -648,7 +638,7 @@ restart: gfs2_jindex_free(sdp); /* Take apart glock structures and buffer lists */ gfs2_gl_hash_clear(sdp); - truncate_inode_pages_final(&sdp->sd_aspace); + iput(sdp->sd_inode); gfs2_delete_debugfs_file(sdp); gfs2_sys_fs_del(sdp); @@ -674,7 +664,7 @@ static int gfs2_sync_fs(struct super_block *sb, int wait) return sdp->sd_log_error; } -static int gfs2_do_thaw(struct gfs2_sbd *sdp) +static int gfs2_do_thaw(struct gfs2_sbd *sdp, enum freeze_holder who, const void *freeze_owner) { struct super_block *sb = sdp->sd_vfs; int error; @@ -682,7 +672,7 @@ static int gfs2_do_thaw(struct gfs2_sbd *sdp) error = gfs2_freeze_lock_shared(sdp); if (error) goto fail; - error = thaw_super(sb, FREEZE_HOLDER_USERSPACE); + error = thaw_super(sb, who, freeze_owner); if (!error) return 0; @@ -703,14 +693,14 @@ void gfs2_freeze_func(struct work_struct *work) if (test_bit(SDF_FROZEN, &sdp->sd_flags)) goto freeze_failed; - error = freeze_super(sb, FREEZE_HOLDER_USERSPACE); + error = freeze_super(sb, FREEZE_HOLDER_USERSPACE, NULL); if (error) goto freeze_failed; gfs2_freeze_unlock(sdp); set_bit(SDF_FROZEN, &sdp->sd_flags); - error = gfs2_do_thaw(sdp); + error = gfs2_do_thaw(sdp, FREEZE_HOLDER_USERSPACE, NULL); if (error) goto out; @@ -728,10 +718,13 @@ out: /** * gfs2_freeze_super - prevent further writes to the filesystem * @sb: the VFS structure for the filesystem + * @who: freeze flags + * @freeze_owner: owner of the freeze 
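[Editor's note: in __get_log_header() above, the crc32_le_shift() trick is replaced by feeding four literal zero bytes into the CRC, since the on-disk lh_hash field is defined to be zero while being checksummed. A userspace sketch of checksumming around such a field using zlib's crc32() — the seed and inversion conventions differ from the kernel's crc32(), but the structure is the point:]

/* build with: cc crc.c -lz */
#include <stddef.h>
#include <stdio.h>
#include <zlib.h>

struct log_header {
	char data[12];		/* everything before the hash field */
	unsigned int hash;	/* covered by the CRC as four zero bytes */
};

static unsigned long header_crc(const struct log_header *lh)
{
	static const unsigned char zero[4];
	unsigned long crc = crc32(0L, Z_NULL, 0);

	crc = crc32(crc, (const unsigned char *)lh,
		    offsetof(struct log_header, hash));
	/* stand in for the hash field without modifying the buffer */
	return crc32(crc, zero, sizeof(zero));
}

int main(void)
{
	struct log_header lh = { "jhead", 0xdeadbeefu };

	printf("crc = %08lx\n", header_crc(&lh));
	return 0;
}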
* */ -static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who) +static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who, + const void *freeze_owner) { struct gfs2_sbd *sdp = sb->s_fs_info; int error; @@ -744,7 +737,7 @@ static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who) } for (;;) { - error = freeze_super(sb, FREEZE_HOLDER_USERSPACE); + error = freeze_super(sb, who, freeze_owner); if (error) { fs_info(sdp, "GFS2: couldn't freeze filesystem: %d\n", error); @@ -758,7 +751,7 @@ static int gfs2_freeze_super(struct super_block *sb, enum freeze_holder who) break; } - error = gfs2_do_thaw(sdp); + error = gfs2_do_thaw(sdp, who, freeze_owner); if (error) goto out; @@ -796,10 +789,13 @@ static int gfs2_freeze_fs(struct super_block *sb) /** * gfs2_thaw_super - reallow writes to the filesystem * @sb: the VFS structure for the filesystem + * @who: freeze flags + * @freeze_owner: owner of the freeze * */ -static int gfs2_thaw_super(struct super_block *sb, enum freeze_holder who) +static int gfs2_thaw_super(struct super_block *sb, enum freeze_holder who, + const void *freeze_owner) { struct gfs2_sbd *sdp = sb->s_fs_info; int error; @@ -814,7 +810,7 @@ static int gfs2_thaw_super(struct super_block *sb, enum freeze_holder who) atomic_inc(&sb->s_active); gfs2_freeze_unlock(sdp); - error = gfs2_do_thaw(sdp); + error = gfs2_do_thaw(sdp, who, freeze_owner); if (!error) { clear_bit(SDF_FREEZE_INITIATOR, &sdp->sd_flags); @@ -1173,74 +1169,6 @@ static int gfs2_show_options(struct seq_file *s, struct dentry *root) return 0; } -static void gfs2_final_release_pages(struct gfs2_inode *ip) -{ - struct inode *inode = &ip->i_inode; - struct gfs2_glock *gl = ip->i_gl; - - if (unlikely(!gl)) { - /* This can only happen during incomplete inode creation. 
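[Editor's note: freeze_super()/thaw_super() now carry a freeze_owner cookie through gfs2, passed as NULL by the internal callers above. Presumably the point is that a thaw can be matched against the freeze that installed it; the toy below illustrates that owner-token idea only — it is entirely made up and is not the VFS implementation:]

#include <stdio.h>

static const void *freeze_owner_tok;
static int frozen;

static int freeze_fs(const void *owner)
{
	if (frozen)
		return -16;	/* -EBUSY */
	frozen = 1;
	freeze_owner_tok = owner;
	return 0;
}

static int thaw_fs(const void *owner)
{
	if (!frozen)
		return -22;	/* -EINVAL */
	if (freeze_owner_tok != owner)
		return -1;	/* -EPERM */
	frozen = 0;
	return 0;
}

int main(void)
{
	int a, b;	/* two distinct owner cookies */

	freeze_fs(&a);
	printf("thaw by wrong owner: %d\n", thaw_fs(&b));
	printf("thaw by owner: %d\n", thaw_fs(&a));
	return 0;
}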
*/ - BUG_ON(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags)); - return; - } - - truncate_inode_pages(gfs2_glock2aspace(gl), 0); - truncate_inode_pages(&inode->i_data, 0); - - if (atomic_read(&gl->gl_revokes) == 0) { - clear_bit(GLF_LFLUSH, &gl->gl_flags); - clear_bit(GLF_DIRTY, &gl->gl_flags); - } -} - -static int gfs2_dinode_dealloc(struct gfs2_inode *ip) -{ - struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); - struct gfs2_rgrpd *rgd; - struct gfs2_holder gh; - int error; - - if (gfs2_get_inode_blocks(&ip->i_inode) != 1) { - gfs2_consist_inode(ip); - return -EIO; - } - - gfs2_rindex_update(sdp); - - error = gfs2_quota_hold(ip, NO_UID_QUOTA_CHANGE, NO_GID_QUOTA_CHANGE); - if (error) - return error; - - rgd = gfs2_blk2rgrpd(sdp, ip->i_no_addr, 1); - if (!rgd) { - gfs2_consist_inode(ip); - error = -EIO; - goto out_qs; - } - - error = gfs2_glock_nq_init(rgd->rd_gl, LM_ST_EXCLUSIVE, - LM_FLAG_NODE_SCOPE, &gh); - if (error) - goto out_qs; - - error = gfs2_trans_begin(sdp, RES_RG_BIT + RES_STATFS + RES_QUOTA, - sdp->sd_jdesc->jd_blocks); - if (error) - goto out_rg_gunlock; - - gfs2_free_di(rgd, ip); - - gfs2_final_release_pages(ip); - - gfs2_trans_end(sdp); - -out_rg_gunlock: - gfs2_glock_dq_uninit(&gh); -out_qs: - gfs2_quota_unhold(ip); - return error; -} - /** * gfs2_glock_put_eventually * @gl: The glock to put @@ -1326,10 +1254,8 @@ static enum evict_behavior evict_should_delete(struct inode *inode, struct gfs2_sbd *sdp = sb->s_fs_info; int ret; - if (unlikely(test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) - goto should_delete; - - if (test_bit(GIF_DEFER_DELETE, &ip->i_flags)) + if (gfs2_holder_initialized(&ip->i_iopen_gh) && + test_bit(GLF_DEFER_DELETE, &ip->i_iopen_gh.gh_gl->gl_flags)) return EVICT_SHOULD_DEFER_DELETE; /* Deletes should never happen under memory pressure anymore. 
*/ @@ -1338,12 +1264,8 @@ static enum evict_behavior evict_should_delete(struct inode *inode, /* Must not read inode block until block type has been verified */ ret = gfs2_glock_nq_init(ip->i_gl, LM_ST_EXCLUSIVE, GL_SKIP, gh); - if (unlikely(ret)) { - glock_clear_object(ip->i_iopen_gh.gh_gl, ip); - ip->i_iopen_gh.gh_flags |= GL_NOCACHE; - gfs2_glock_dq_uninit(&ip->i_iopen_gh); - return EVICT_SHOULD_DEFER_DELETE; - } + if (unlikely(ret)) + return EVICT_SHOULD_SKIP_DELETE; if (gfs2_inode_already_deleted(ip->i_gl, ip->i_no_formal_ino)) return EVICT_SHOULD_SKIP_DELETE; @@ -1361,17 +1283,9 @@ static enum evict_behavior evict_should_delete(struct inode *inode, if (inode->i_nlink) return EVICT_SHOULD_SKIP_DELETE; -should_delete: if (gfs2_holder_initialized(&ip->i_iopen_gh) && - test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) { - enum evict_behavior behavior = - gfs2_upgrade_iopen_glock(inode); - - if (behavior != EVICT_SHOULD_DELETE) { - gfs2_holder_uninit(&ip->i_iopen_gh); - return behavior; - } - } + test_bit(HIF_HOLDER, &ip->i_iopen_gh.gh_iflags)) + return gfs2_upgrade_iopen_glock(inode); return EVICT_SHOULD_DELETE; } @@ -1392,7 +1306,7 @@ static int evict_unlinked_inode(struct inode *inode) } if (ip->i_eattr) { - ret = gfs2_ea_dealloc(ip); + ret = gfs2_ea_dealloc(ip, true); if (ret) goto out; } @@ -1509,7 +1423,7 @@ static void gfs2_evict_inode(struct inode *inode) gfs2_glock_put(io_gl); goto out; } - behavior = EVICT_SHOULD_DELETE; + behavior = EVICT_SHOULD_SKIP_DELETE; } if (behavior == EVICT_SHOULD_DELETE) ret = evict_unlinked_inode(inode); diff --git a/fs/gfs2/sys.c b/fs/gfs2/sys.c index ecc699f8d9fc..748125653d6c 100644 --- a/fs/gfs2/sys.c +++ b/fs/gfs2/sys.c @@ -174,10 +174,10 @@ static ssize_t freeze_store(struct gfs2_sbd *sdp, const char *buf, size_t len) switch (n) { case 0: - error = thaw_super(sdp->sd_vfs, FREEZE_HOLDER_USERSPACE); + error = thaw_super(sdp->sd_vfs, FREEZE_HOLDER_USERSPACE, NULL); break; case 1: - error = freeze_super(sdp->sd_vfs, FREEZE_HOLDER_USERSPACE); + error = freeze_super(sdp->sd_vfs, FREEZE_HOLDER_USERSPACE, NULL); break; default: return -EINVAL; diff --git a/fs/gfs2/trace_gfs2.h b/fs/gfs2/trace_gfs2.h index 8eae8d62a413..26036ffc3f33 100644 --- a/fs/gfs2/trace_gfs2.h +++ b/fs/gfs2/trace_gfs2.h @@ -53,12 +53,20 @@ {(1UL << GLF_DIRTY), "y" }, \ {(1UL << GLF_LFLUSH), "f" }, \ {(1UL << GLF_INVALIDATE_IN_PROGRESS), "i" }, \ + {(1UL << GLF_PENDING_REPLY), "R" }, \ {(1UL << GLF_HAVE_REPLY), "r" }, \ {(1UL << GLF_INITIAL), "a" }, \ {(1UL << GLF_HAVE_FROZEN_REPLY), "F" }, \ {(1UL << GLF_LRU), "L" }, \ {(1UL << GLF_OBJECT), "o" }, \ - {(1UL << GLF_BLOCKING), "b" }) + {(1UL << GLF_BLOCKING), "b" }, \ + {(1UL << GLF_UNLOCKED), "x" }, \ + {(1UL << GLF_INSTANTIATE_NEEDED), "n" }, \ + {(1UL << GLF_INSTANTIATE_IN_PROG), "N" }, \ + {(1UL << GLF_TRY_TO_EVICT), "e" }, \ + {(1UL << GLF_VERIFY_DELETE), "E" }, \ + {(1UL << GLF_DEFER_DELETE), "s" }, \ + {(1UL << GLF_CANCELING), "C" }) #ifndef NUMPTY #define NUMPTY diff --git a/fs/gfs2/trans.c b/fs/gfs2/trans.c index 192213c7359a..075f7e9abe47 100644 --- a/fs/gfs2/trans.c +++ b/fs/gfs2/trans.c @@ -226,6 +226,27 @@ out: unlock_buffer(bh); } +void gfs2_trans_add_databufs(struct gfs2_glock *gl, struct folio *folio, + size_t from, size_t len) +{ + struct buffer_head *head = folio_buffers(folio); + unsigned int bsize = head->b_size; + struct buffer_head *bh; + size_t to = from + len; + size_t start, end; + + for (bh = head, start = 0; bh != head || !start; + bh = bh->b_this_page, start = end) { + end = start + bsize; + if (end <= 
from) + continue; + if (start >= to) + break; + set_buffer_uptodate(bh); + gfs2_trans_add_data(gl, bh); + } +} + void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) { @@ -246,12 +267,12 @@ void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh) if (bd == NULL) { gfs2_log_unlock(sdp); unlock_buffer(bh); - lock_page(bh->b_page); + folio_lock(bh->b_folio); if (bh->b_private == NULL) bd = gfs2_alloc_bufdata(gl, bh); else bd = bh->b_private; - unlock_page(bh->b_page); + folio_unlock(bh->b_folio); lock_buffer(bh); gfs2_log_lock(sdp); } diff --git a/fs/gfs2/trans.h b/fs/gfs2/trans.h index f8ce5302280d..790c55f59e61 100644 --- a/fs/gfs2/trans.h +++ b/fs/gfs2/trans.h @@ -42,6 +42,8 @@ int gfs2_trans_begin(struct gfs2_sbd *sdp, unsigned int blocks, void gfs2_trans_end(struct gfs2_sbd *sdp); void gfs2_trans_add_data(struct gfs2_glock *gl, struct buffer_head *bh); +void gfs2_trans_add_databufs(struct gfs2_glock *gl, struct folio *folio, + size_t from, size_t len); void gfs2_trans_add_meta(struct gfs2_glock *gl, struct buffer_head *bh); void gfs2_trans_add_revoke(struct gfs2_sbd *sdp, struct gfs2_bufdata *bd); void gfs2_trans_remove_revoke(struct gfs2_sbd *sdp, u64 blkno, unsigned int len); diff --git a/fs/gfs2/util.c b/fs/gfs2/util.c index 13be8d1d228b..d5a1e63fa257 100644 --- a/fs/gfs2/util.c +++ b/fs/gfs2/util.c @@ -73,7 +73,7 @@ int check_journal_clean(struct gfs2_sbd *sdp, struct gfs2_jdesc *jd, "mount.\n"); goto out_unlock; } - error = gfs2_find_jhead(jd, &head, false); + error = gfs2_find_jhead(jd, &head); if (error) { if (verbose) fs_err(sdp, "Error parsing journal for spectator " diff --git a/fs/gfs2/xattr.c b/fs/gfs2/xattr.c index 17ae5070a90e..df9c93de94c7 100644 --- a/fs/gfs2/xattr.c +++ b/fs/gfs2/xattr.c @@ -1383,7 +1383,7 @@ out: return error; } -static int ea_dealloc_block(struct gfs2_inode *ip) +static int ea_dealloc_block(struct gfs2_inode *ip, bool initialized) { struct gfs2_sbd *sdp = GFS2_SB(&ip->i_inode); struct gfs2_rgrpd *rgd; @@ -1416,7 +1416,7 @@ static int ea_dealloc_block(struct gfs2_inode *ip) ip->i_eattr = 0; gfs2_add_inode_blocks(&ip->i_inode, -1); - if (likely(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) { + if (initialized) { error = gfs2_meta_inode_buffer(ip, &dibh); if (!error) { gfs2_trans_add_meta(ip->i_gl, dibh); @@ -1435,11 +1435,12 @@ out_gunlock: /** * gfs2_ea_dealloc - deallocate the extended attribute fork * @ip: the inode + * @initialized: xattrs have been initialized * * Returns: errno */ -int gfs2_ea_dealloc(struct gfs2_inode *ip) +int gfs2_ea_dealloc(struct gfs2_inode *ip, bool initialized) { int error; @@ -1451,7 +1452,7 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip) if (error) return error; - if (likely(!test_bit(GIF_ALLOC_FAILED, &ip->i_flags))) { + if (initialized) { error = ea_foreach(ip, ea_dealloc_unstuffed, NULL); if (error) goto out_quota; @@ -1463,7 +1464,7 @@ int gfs2_ea_dealloc(struct gfs2_inode *ip) } } - error = ea_dealloc_block(ip); + error = ea_dealloc_block(ip, initialized); out_quota: gfs2_quota_unhold(ip); diff --git a/fs/gfs2/xattr.h b/fs/gfs2/xattr.h index eb12eb7e37c1..3c9788e0e137 100644 --- a/fs/gfs2/xattr.h +++ b/fs/gfs2/xattr.h @@ -54,7 +54,7 @@ int __gfs2_xattr_set(struct inode *inode, const char *name, const void *value, size_t size, int flags, int type); ssize_t gfs2_listxattr(struct dentry *dentry, char *buffer, size_t size); -int gfs2_ea_dealloc(struct gfs2_inode *ip); +int gfs2_ea_dealloc(struct gfs2_inode *ip, bool initialized); /* Exported to acl.c */ diff --git a/fs/hfs/bnode.c 
b/fs/hfs/bnode.c index 6add6ebfef89..cb823a8a6ba9 100644 --- a/fs/hfs/bnode.c +++ b/fs/hfs/bnode.c @@ -67,6 +67,12 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off) else key_len = tree->max_key_len + 1; + if (key_len > sizeof(hfs_btree_key) || key_len < 1) { + memset(key, 0, sizeof(hfs_btree_key)); + pr_err("hfs: Invalid key length: %d\n", key_len); + return; + } + hfs_bnode_read(node, key, off, key_len); } diff --git a/fs/hfs/dir.c b/fs/hfs/dir.c index b75c26045df4..86a6b317b474 100644 --- a/fs/hfs/dir.c +++ b/fs/hfs/dir.c @@ -219,26 +219,26 @@ static int hfs_create(struct mnt_idmap *idmap, struct inode *dir, * in a directory, given the inode for the parent directory and the * name (and its length) of the new directory. */ -static int hfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *hfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; int res; inode = hfs_new_inode(dir, &dentry->d_name, S_IFDIR | mode); if (!inode) - return -ENOMEM; + return ERR_PTR(-ENOMEM); res = hfs_cat_create(inode->i_ino, dir, &dentry->d_name, inode); if (res) { clear_nlink(inode); hfs_delete_inode(inode); iput(inode); - return res; + return ERR_PTR(res); } d_instantiate(dentry, inode); mark_inode_dirty(inode); - return 0; + return NULL; } /* diff --git a/fs/hfsplus/bnode.c b/fs/hfsplus/bnode.c index 87974d5e6791..079ea80534f7 100644 --- a/fs/hfsplus/bnode.c +++ b/fs/hfsplus/bnode.c @@ -67,6 +67,12 @@ void hfs_bnode_read_key(struct hfs_bnode *node, void *key, int off) else key_len = tree->max_key_len + 2; + if (key_len > sizeof(hfsplus_btree_key) || key_len < 1) { + memset(key, 0, sizeof(hfsplus_btree_key)); + pr_err("hfsplus: Invalid key length: %d\n", key_len); + return; + } + hfs_bnode_read(node, key, off, key_len); } diff --git a/fs/hfsplus/dir.c b/fs/hfsplus/dir.c index f5c4b3e31a1c..876bbb80fb4d 100644 --- a/fs/hfsplus/dir.c +++ b/fs/hfsplus/dir.c @@ -523,10 +523,10 @@ static int hfsplus_create(struct mnt_idmap *idmap, struct inode *dir, return hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode, 0); } -static int hfsplus_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *hfsplus_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { - return hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0); + return ERR_PTR(hfsplus_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0)); } static int hfsplus_rename(struct mnt_idmap *idmap, diff --git a/fs/hfsplus/wrapper.c b/fs/hfsplus/wrapper.c index 74801911bc1c..30cf4fe78b3d 100644 --- a/fs/hfsplus/wrapper.c +++ b/fs/hfsplus/wrapper.c @@ -48,47 +48,19 @@ struct hfsplus_wd { int hfsplus_submit_bio(struct super_block *sb, sector_t sector, void *buf, void **data, blk_opf_t opf) { - const enum req_op op = opf & REQ_OP_MASK; - struct bio *bio; - int ret = 0; - u64 io_size; - loff_t start; - int offset; + u64 io_size = hfsplus_min_io_size(sb); + loff_t start = (loff_t)sector << HFSPLUS_SECTOR_SHIFT; + int offset = start & (io_size - 1); + + if ((opf & REQ_OP_MASK) != REQ_OP_WRITE && data) + *data = (u8 *)buf + offset; /* - * Align sector to hardware sector size and find offset. We - * assume that io_size is a power of two, which _should_ - * be true. + * Align sector to hardware sector size and find offset. We assume that + * io_size is a power of two, which _should_ be true. 
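[Editor's note: both bnode changes above clamp a length read from disk before using it to copy a key, zero-filling the destination when it is out of range. The shape of that defensive copy, as a small sketch with invented limits:]

#include <stdio.h>
#include <string.h>

#define MAX_KEY_LEN 64	/* capacity of the destination key buffer */

static int read_key(char dst[MAX_KEY_LEN],
		    const unsigned char *disk, size_t key_len)
{
	/* on-disk lengths are untrusted: validate before memcpy */
	if (key_len < 1 || key_len > MAX_KEY_LEN) {
		memset(dst, 0, MAX_KEY_LEN);
		fprintf(stderr, "invalid key length: %zu\n", key_len);
		return -1;
	}
	memcpy(dst, disk, key_len);
	return 0;
}

int main(void)
{
	unsigned char disk[8] = "keydata";
	char key[MAX_KEY_LEN];

	read_key(key, disk, sizeof(disk));	/* ok */
	read_key(key, disk, 4096);		/* rejected, key zeroed */
	return 0;
}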
*/ - io_size = hfsplus_min_io_size(sb); - start = (loff_t)sector << HFSPLUS_SECTOR_SHIFT; - offset = start & (io_size - 1); sector &= ~((io_size >> HFSPLUS_SECTOR_SHIFT) - 1); - - bio = bio_alloc(sb->s_bdev, 1, opf, GFP_NOIO); - bio->bi_iter.bi_sector = sector; - - if (op != REQ_OP_WRITE && data) - *data = (u8 *)buf + offset; - - while (io_size > 0) { - unsigned int page_offset = offset_in_page(buf); - unsigned int len = min_t(unsigned int, PAGE_SIZE - page_offset, - io_size); - - ret = bio_add_page(bio, virt_to_page(buf), len, page_offset); - if (ret != len) { - ret = -EIO; - goto out; - } - io_size -= len; - buf = (u8 *)buf + len; - } - - ret = submit_bio_wait(bio); -out: - bio_put(bio); - return ret < 0 ? ret : 0; + return bdev_rw_virt(sb->s_bdev, sector, buf, io_size, opf); } static int hfsplus_read_mdb(void *bufptr, struct hfsplus_wd *wd) diff --git a/fs/hostfs/hostfs.h b/fs/hostfs/hostfs.h index 8b39c15c408c..15b2f094d36e 100644 --- a/fs/hostfs/hostfs.h +++ b/fs/hostfs/hostfs.h @@ -60,7 +60,7 @@ struct hostfs_stat { unsigned int uid; unsigned int gid; unsigned long long size; - struct hostfs_timespec atime, mtime, ctime; + struct hostfs_timespec atime, mtime, ctime, btime; unsigned int blksize; unsigned long long blocks; struct { diff --git a/fs/hostfs/hostfs_kern.c b/fs/hostfs/hostfs_kern.c index e0741e468956..702c41317589 100644 --- a/fs/hostfs/hostfs_kern.c +++ b/fs/hostfs/hostfs_kern.c @@ -33,6 +33,7 @@ struct hostfs_inode_info { struct inode vfs_inode; struct mutex open_mutex; dev_t dev; + struct hostfs_timespec btime; }; static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode) @@ -547,6 +548,7 @@ static int hostfs_inode_set(struct inode *ino, void *data) } HOSTFS_I(ino)->dev = dev; + HOSTFS_I(ino)->btime = st->btime; ino->i_ino = st->ino; ino->i_mode = st->mode; return hostfs_inode_update(ino, st); @@ -557,7 +559,10 @@ static int hostfs_inode_test(struct inode *inode, void *data) const struct hostfs_stat *st = data; dev_t dev = MKDEV(st->dev.maj, st->dev.min); - return inode->i_ino == st->ino && HOSTFS_I(inode)->dev == dev; + return inode->i_ino == st->ino && HOSTFS_I(inode)->dev == dev && + (inode->i_mode & S_IFMT) == (st->mode & S_IFMT) && + HOSTFS_I(inode)->btime.tv_sec == st->btime.tv_sec && + HOSTFS_I(inode)->btime.tv_nsec == st->btime.tv_nsec; } static struct inode *hostfs_iget(struct super_block *sb, char *name) @@ -679,17 +684,25 @@ static int hostfs_symlink(struct mnt_idmap *idmap, struct inode *ino, return err; } -static int hostfs_mkdir(struct mnt_idmap *idmap, struct inode *ino, - struct dentry *dentry, umode_t mode) +static struct dentry *hostfs_mkdir(struct mnt_idmap *idmap, struct inode *ino, + struct dentry *dentry, umode_t mode) { + struct inode *inode; char *file; int err; if ((file = dentry_name(dentry)) == NULL) - return -ENOMEM; + return ERR_PTR(-ENOMEM); err = do_mkdir(file, mode); + if (err) { + dentry = ERR_PTR(err); + } else { + inode = hostfs_iget(dentry->d_sb, file); + d_drop(dentry); + dentry = d_splice_alias(inode, dentry); + } __putname(file); - return err; + return dentry; } static int hostfs_rmdir(struct inode *ino, struct dentry *dentry) diff --git a/fs/hostfs/hostfs_user.c b/fs/hostfs/hostfs_user.c index 97e9c40a9448..3bcd9f35e70b 100644 --- a/fs/hostfs/hostfs_user.c +++ b/fs/hostfs/hostfs_user.c @@ -18,39 +18,48 @@ #include "hostfs.h" #include <utime.h> -static void stat64_to_hostfs(const struct stat64 *buf, struct hostfs_stat *p) +static void statx_to_hostfs(const struct statx *buf, struct hostfs_stat *p) { - p->ino = 
buf->st_ino; - p->mode = buf->st_mode; - p->nlink = buf->st_nlink; - p->uid = buf->st_uid; - p->gid = buf->st_gid; - p->size = buf->st_size; - p->atime.tv_sec = buf->st_atime; - p->atime.tv_nsec = 0; - p->ctime.tv_sec = buf->st_ctime; - p->ctime.tv_nsec = 0; - p->mtime.tv_sec = buf->st_mtime; - p->mtime.tv_nsec = 0; - p->blksize = buf->st_blksize; - p->blocks = buf->st_blocks; - p->rdev.maj = os_major(buf->st_rdev); - p->rdev.min = os_minor(buf->st_rdev); - p->dev.maj = os_major(buf->st_dev); - p->dev.min = os_minor(buf->st_dev); + p->ino = buf->stx_ino; + p->mode = buf->stx_mode; + p->nlink = buf->stx_nlink; + p->uid = buf->stx_uid; + p->gid = buf->stx_gid; + p->size = buf->stx_size; + p->atime.tv_sec = buf->stx_atime.tv_sec; + p->atime.tv_nsec = buf->stx_atime.tv_nsec; + p->ctime.tv_sec = buf->stx_ctime.tv_sec; + p->ctime.tv_nsec = buf->stx_ctime.tv_nsec; + p->mtime.tv_sec = buf->stx_mtime.tv_sec; + p->mtime.tv_nsec = buf->stx_mtime.tv_nsec; + if (buf->stx_mask & STATX_BTIME) { + p->btime.tv_sec = buf->stx_btime.tv_sec; + p->btime.tv_nsec = buf->stx_btime.tv_nsec; + } else { + memset(&p->btime, 0, sizeof(p->btime)); + } + p->blksize = buf->stx_blksize; + p->blocks = buf->stx_blocks; + p->rdev.maj = buf->stx_rdev_major; + p->rdev.min = buf->stx_rdev_minor; + p->dev.maj = buf->stx_dev_major; + p->dev.min = buf->stx_dev_minor; } int stat_file(const char *path, struct hostfs_stat *p, int fd) { - struct stat64 buf; + struct statx buf; + int flags = AT_SYMLINK_NOFOLLOW; if (fd >= 0) { - if (fstat64(fd, &buf) < 0) - return -errno; - } else if (lstat64(path, &buf) < 0) { - return -errno; + flags |= AT_EMPTY_PATH; + path = ""; } - stat64_to_hostfs(&buf, p); + + if ((statx(fd, path, flags, STATX_BASIC_STATS | STATX_BTIME, &buf)) < 0) + return -errno; + + statx_to_hostfs(&buf, p); return 0; } diff --git a/fs/hpfs/namei.c b/fs/hpfs/namei.c index d0edf9ed33b6..e3cdc421dfba 100644 --- a/fs/hpfs/namei.c +++ b/fs/hpfs/namei.c @@ -19,8 +19,8 @@ static void hpfs_update_directory_times(struct inode *dir) hpfs_write_inode_nolock(dir); } -static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { const unsigned char *name = dentry->d_name.name; unsigned len = dentry->d_name.len; @@ -35,7 +35,7 @@ static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, int r; struct hpfs_dirent dee; int err; - if ((err = hpfs_chk_name(name, &len))) return err==-ENOENT ? -EINVAL : err; + if ((err = hpfs_chk_name(name, &len))) return ERR_PTR(err==-ENOENT ? -EINVAL : err); hpfs_lock(dir->i_sb); err = -ENOSPC; fnode = hpfs_alloc_fnode(dir->i_sb, hpfs_i(dir)->i_dno, &fno, &bh); @@ -112,7 +112,7 @@ static int hpfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, hpfs_update_directory_times(dir); d_instantiate(dentry, result); hpfs_unlock(dir->i_sb); - return 0; + return NULL; bail3: iput(result); bail2: @@ -123,7 +123,7 @@ bail1: hpfs_free_sectors(dir->i_sb, fno, 1); bail: hpfs_unlock(dir->i_sb); - return err; + return ERR_PTR(err); } static int hpfs_create(struct mnt_idmap *idmap, struct inode *dir, diff --git a/fs/hugetlbfs/inode.c b/fs/hugetlbfs/inode.c index 0fc179a59830..e4de5425838d 100644 --- a/fs/hugetlbfs/inode.c +++ b/fs/hugetlbfs/inode.c @@ -193,19 +193,21 @@ hugetlb_get_unmapped_area(struct file *file, unsigned long addr, } /* - * Someone wants to read @bytes from a HWPOISON hugetlb @page from @offset. 
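[Editor's note: stat_file() above switches hostfs from lstat64()/fstat64() to a single statx() call, requesting STATX_BTIME and zeroing btime when the host filesystem does not report it — the stx_mask test. The same probe as a standalone Linux program:]

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>

int main(int argc, char **argv)
{
	const char *path = argc > 1 ? argv[1] : "/";
	struct statx stx;

	if (statx(AT_FDCWD, path, AT_SYMLINK_NOFOLLOW,
		  STATX_BASIC_STATS | STATX_BTIME, &stx) < 0) {
		perror("statx");
		return 1;
	}
	if (stx.stx_mask & STATX_BTIME)
		printf("btime: %lld.%09u\n",
		       (long long)stx.stx_btime.tv_sec,
		       stx.stx_btime.tv_nsec);
	else
		printf("btime not reported; treating as 0\n");
	return 0;
}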
+ * Someone wants to read @bytes from a HWPOISON hugetlb @folio from @offset. * Returns the maximum number of bytes one can read without touching the 1st raw - * HWPOISON subpage. + * HWPOISON page. * * The implementation borrows the iteration logic from copy_page_to_iter*. */ -static size_t adjust_range_hwpoison(struct page *page, size_t offset, size_t bytes) +static size_t adjust_range_hwpoison(struct folio *folio, size_t offset, + size_t bytes) { + struct page *page; size_t n = 0; size_t res = 0; - /* First subpage to start the loop. */ - page = nth_page(page, offset / PAGE_SIZE); + /* First page to start the loop. */ + page = folio_page(folio, offset / PAGE_SIZE); offset %= PAGE_SIZE; while (1) { if (is_raw_hwpoison_page_in_hugepage(page)) @@ -278,10 +280,10 @@ static ssize_t hugetlbfs_read_iter(struct kiocb *iocb, struct iov_iter *to) else { /* * Adjust how many bytes safe to read without - * touching the 1st raw HWPOISON subpage after + * touching the 1st raw HWPOISON page after * offset. */ - want = adjust_range_hwpoison(&folio->page, offset, nr); + want = adjust_range_hwpoison(folio, offset, nr); if (want == 0) { folio_put(folio); retval = -EIO; @@ -338,8 +340,8 @@ static void hugetlb_delete_from_page_cache(struct folio *folio) * mutex for the page in the mapping. So, we can not race with page being * faulted into the vma. */ -static bool hugetlb_vma_maps_page(struct vm_area_struct *vma, - unsigned long addr, struct page *page) +static bool hugetlb_vma_maps_pfn(struct vm_area_struct *vma, + unsigned long addr, unsigned long pfn) { pte_t *ptep, pte; @@ -351,7 +353,7 @@ static bool hugetlb_vma_maps_page(struct vm_area_struct *vma, if (huge_pte_none(pte) || !pte_present(pte)) return false; - if (pte_page(pte) == page) + if (pte_pfn(pte) == pfn) return true; return false; @@ -396,7 +398,7 @@ static void hugetlb_unmap_file_folio(struct hstate *h, { struct rb_root_cached *root = &mapping->i_mmap; struct hugetlb_vma_lock *vma_lock; - struct page *page = &folio->page; + unsigned long pfn = folio_pfn(folio); struct vm_area_struct *vma; unsigned long v_start; unsigned long v_end; @@ -412,7 +414,7 @@ retry: v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); - if (!hugetlb_vma_maps_page(vma, v_start, page)) + if (!hugetlb_vma_maps_pfn(vma, v_start, pfn)) continue; if (!hugetlb_vma_trylock_write(vma)) { @@ -462,7 +464,7 @@ retry: */ v_start = vma_offset_start(vma, start); v_end = vma_offset_end(vma, end); - if (hugetlb_vma_maps_page(vma, v_start, page)) + if (hugetlb_vma_maps_pfn(vma, v_start, pfn)) unmap_hugepage_range(vma, v_start, v_end, NULL, ZAP_FLAG_DROP_MARKER); @@ -991,14 +993,14 @@ static int hugetlbfs_mknod(struct mnt_idmap *idmap, struct inode *dir, return 0; } -static int hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *hugetlbfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int retval = hugetlbfs_mknod(idmap, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); - return retval; + return ERR_PTR(retval); } static int hugetlbfs_create(struct mnt_idmap *idmap, diff --git a/fs/init.c b/fs/init.c index e9387b6c4f30..eef5124885e3 100644 --- a/fs/init.c +++ b/fs/init.c @@ -230,9 +230,12 @@ int __init init_mkdir(const char *pathname, umode_t mode) return PTR_ERR(dentry); mode = mode_strip_umask(d_inode(path.dentry), mode); error = security_path_mkdir(&path, dentry, mode); - if (!error) - error = vfs_mkdir(mnt_idmap(path.mnt), 
path.dentry->d_inode, + if (!error) { + dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode); + if (IS_ERR(dentry)) + error = PTR_ERR(dentry); + } done_path_create(&path, dentry); return error; } diff --git a/fs/inode.c b/fs/inode.c index 5587aabdaa5e..99318b157a9a 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -327,7 +327,17 @@ static void i_callback(struct rcu_head *head) free_inode_nonrcu(inode); } -static struct inode *alloc_inode(struct super_block *sb) +/** + * alloc_inode - obtain an inode + * @sb: superblock + * + * Allocates a new inode for given superblock. + * Inode wont be chained in superblock s_inodes list + * This means : + * - fs can't be unmount + * - quotas, fsnotify, writeback can't work + */ +struct inode *alloc_inode(struct super_block *sb) { const struct super_operations *ops = sb->s_op; struct inode *inode; @@ -613,18 +623,22 @@ static void inode_wait_for_lru_isolating(struct inode *inode) */ void inode_sb_list_add(struct inode *inode) { - spin_lock(&inode->i_sb->s_inode_list_lock); - list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); - spin_unlock(&inode->i_sb->s_inode_list_lock); + struct super_block *sb = inode->i_sb; + + spin_lock(&sb->s_inode_list_lock); + list_add(&inode->i_sb_list, &sb->s_inodes); + spin_unlock(&sb->s_inode_list_lock); } EXPORT_SYMBOL_GPL(inode_sb_list_add); static inline void inode_sb_list_del(struct inode *inode) { + struct super_block *sb = inode->i_sb; + if (!list_empty(&inode->i_sb_list)) { - spin_lock(&inode->i_sb->s_inode_list_lock); + spin_lock(&sb->s_inode_list_lock); list_del_init(&inode->i_sb_list); - spin_unlock(&inode->i_sb->s_inode_list_lock); + spin_unlock(&sb->s_inode_list_lock); } } @@ -806,23 +820,16 @@ static void evict(struct inode *inode) /* * Wake up waiters in __wait_on_freeing_inode(). * - * Lockless hash lookup may end up finding the inode before we removed - * it above, but only lock it *after* we are done with the wakeup below. - * In this case the potential waiter cannot safely block. + * It is an invariant that any thread we need to wake up is already + * accounted for before remove_inode_hash() acquires ->i_lock -- both + * sides take the lock and sleep is aborted if the inode is found + * unhashed. Thus either the sleeper wins and goes off CPU, or removal + * wins and the sleeper aborts after testing with the lock. * - * The inode being unhashed after the call to remove_inode_hash() is - * used as an indicator whether blocking on it is safe. + * This also means we don't need any fences for the call below. */ - spin_lock(&inode->i_lock); - /* - * Pairs with the barrier in prepare_to_wait_event() to make sure - * ___wait_var_event() either sees the bit cleared or - * waitqueue_active() check in wake_up_var() sees the waiter. - */ - smp_mb__after_spinlock(); inode_wake_up_bit(inode, __I_NEW); BUG_ON(inode->i_state != (I_FREEING | I_CLEAR)); - spin_unlock(&inode->i_lock); destroy_inode(inode); } @@ -900,46 +907,6 @@ again: } EXPORT_SYMBOL_GPL(evict_inodes); -/** - * invalidate_inodes - attempt to free all inodes on a superblock - * @sb: superblock to operate on - * - * Attempts to free all inodes (including dirty inodes) for a given superblock. 
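[Editor's note: this section converts ->mkdir implementations across many filesystems (gfs2, hfs, hfsplus, hostfs, hpfs, hugetlbfs) from returning an int to returning a struct dentry *, and callers such as init_mkdir() above now unpack failures with IS_ERR()/PTR_ERR(). A userspace re-creation of that encode-errno-in-a-pointer convention, for readers unfamiliar with it:]

#include <errno.h>
#include <stdio.h>

#define MAX_ERRNO 4095

static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct dentry { const char *name; };

/* new-style mkdir: NULL keeps the passed-in dentry, ERR_PTR() fails */
static struct dentry *toy_mkdir(struct dentry *dentry, int disk_full)
{
	if (disk_full)
		return ERR_PTR(-ENOSPC);
	return NULL;
}

int main(void)
{
	struct dentry d = { "subdir" };
	struct dentry *res = toy_mkdir(&d, 1);

	if (IS_ERR(res))
		printf("mkdir failed: %ld\n", PTR_ERR(res));
	else
		printf("created %s\n", d.name);
	return 0;
}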
- */ -void invalidate_inodes(struct super_block *sb) -{ - struct inode *inode, *next; - LIST_HEAD(dispose); - -again: - spin_lock(&sb->s_inode_list_lock); - list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { - spin_lock(&inode->i_lock); - if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { - spin_unlock(&inode->i_lock); - continue; - } - if (atomic_read(&inode->i_count)) { - spin_unlock(&inode->i_lock); - continue; - } - - inode->i_state |= I_FREEING; - inode_lru_list_del(inode); - spin_unlock(&inode->i_lock); - list_add(&inode->i_lru, &dispose); - if (need_resched()) { - spin_unlock(&sb->s_inode_list_lock); - cond_resched(); - dispose_list(&dispose); - goto again; - } - } - spin_unlock(&sb->s_inode_list_lock); - - dispose_list(&dispose); -} - /* * Isolate the inode from the LRU in preparation for freeing it. * @@ -1160,21 +1127,6 @@ unsigned int get_next_ino(void) EXPORT_SYMBOL(get_next_ino); /** - * new_inode_pseudo - obtain an inode - * @sb: superblock - * - * Allocates a new inode for given superblock. - * Inode wont be chained in superblock s_inodes list - * This means : - * - fs can't be unmount - * - quotas, fsnotify, writeback can't work - */ -struct inode *new_inode_pseudo(struct super_block *sb) -{ - return alloc_inode(sb); -} - -/** * new_inode - obtain an inode * @sb: superblock * @@ -1190,7 +1142,7 @@ struct inode *new_inode(struct super_block *sb) { struct inode *inode; - inode = new_inode_pseudo(sb); + inode = alloc_inode(sb); if (inode) inode_sb_list_add(inode); return inode; @@ -1348,8 +1300,8 @@ again: } if (set && unlikely(set(inode, data))) { - inode = NULL; - goto unlock; + spin_unlock(&inode_hash_lock); + return NULL; } /* @@ -1361,14 +1313,14 @@ again: hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); + spin_unlock(&inode_hash_lock); + /* * Add inode to the sb list if it's not already. It has I_NEW at this * point, so it should be safe to test i_sb_list locklessly. */ if (list_empty(&inode->i_sb_list)) inode_sb_list_add(inode); -unlock: - spin_unlock(&inode_hash_lock); return inode; } @@ -1497,8 +1449,8 @@ again: inode->i_state = I_NEW; hlist_add_head_rcu(&inode->i_hash, head); spin_unlock(&inode->i_lock); - inode_sb_list_add(inode); spin_unlock(&inode_hash_lock); + inode_sb_list_add(inode); /* Return the locked inode with I_NEW set, the * caller is responsible for filling in the contents @@ -2953,3 +2905,18 @@ umode_t mode_strip_sgid(struct mnt_idmap *idmap, return mode & ~S_ISGID; } EXPORT_SYMBOL(mode_strip_sgid); + +#ifdef CONFIG_DEBUG_VFS +/* + * Dump an inode. + * + * TODO: add a proper inode dumping routine, this is a stub to get debug off the + * ground. 
+ */ +void dump_inode(struct inode *inode, const char *reason) +{ + pr_warn("%s encountered for inode %px", reason, inode); +} + +EXPORT_SYMBOL(dump_inode); +#endif diff --git a/fs/internal.h b/fs/internal.h index e7f02ae1e098..393f6c5c24f6 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -66,6 +66,7 @@ int do_linkat(int olddfd, struct filename *old, int newdfd, int vfs_tmpfile(struct mnt_idmap *idmap, const struct path *parentpath, struct file *file, umode_t mode); +struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *); /* * namespace.c @@ -118,6 +119,9 @@ static inline void put_file_access(struct file *file) } } +void fput_close_sync(struct file *); +void fput_close(struct file *); + /* * super.c */ @@ -187,8 +191,8 @@ extern struct open_how build_open_how(int flags, umode_t mode); extern int build_open_flags(const struct open_how *how, struct open_flags *op); struct file *file_close_fd_locked(struct files_struct *files, unsigned fd); -long do_ftruncate(struct file *file, loff_t length, int small); -long do_sys_ftruncate(unsigned int fd, loff_t length, int small); +int do_ftruncate(struct file *file, loff_t length, int small); +int do_sys_ftruncate(unsigned int fd, loff_t length, int small); int chmod_common(const struct path *path, umode_t mode); int do_fchownat(int dfd, const char __user *filename, uid_t user, gid_t group, int flag); @@ -207,7 +211,6 @@ bool in_group_or_capable(struct mnt_idmap *idmap, * fs-writeback.c */ extern long get_nr_dirty_inodes(void); -void invalidate_inodes(struct super_block *sb); /* * dcache.c @@ -325,6 +328,7 @@ struct stashed_operations { int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, struct path *path); void stashed_dentry_prune(struct dentry *dentry); +struct dentry *stashed_dentry_get(struct dentry **stashed); /** * path_mounted - check whether path is mounted * @path: path to check @@ -338,3 +342,11 @@ static inline bool path_mounted(const struct path *path) return path->mnt->mnt_root == path->dentry; } void file_f_owner_release(struct file *file); +bool file_seek_cur_needs_f_lock(struct file *file); +int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map); +struct dentry *find_next_child(struct dentry *parent, struct dentry *prev); +int anon_inode_getattr(struct mnt_idmap *idmap, const struct path *path, + struct kstat *stat, u32 request_mask, + unsigned int query_flags); +int anon_inode_setattr(struct mnt_idmap *idmap, struct dentry *dentry, + struct iattr *attr); diff --git a/fs/ioctl.c b/fs/ioctl.c index 638a36be31c1..69107a245b4c 100644 --- a/fs/ioctl.c +++ b/fs/ioctl.c @@ -41,7 +41,7 @@ * * Returns 0 on success, -errno on error. 
*/ -long vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) +int vfs_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { int error = -ENOTTY; @@ -228,8 +228,8 @@ static int ioctl_fiemap(struct file *filp, struct fiemap __user *ufiemap) return error; } -static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, - u64 off, u64 olen, u64 destoff) +static int ioctl_file_clone(struct file *dst_file, unsigned long srcfd, + u64 off, u64 olen, u64 destoff) { CLASS(fd, src_file)(srcfd); loff_t cloned; @@ -248,8 +248,8 @@ static long ioctl_file_clone(struct file *dst_file, unsigned long srcfd, return ret; } -static long ioctl_file_clone_range(struct file *file, - struct file_clone_range __user *argp) +static int ioctl_file_clone_range(struct file *file, + struct file_clone_range __user *argp) { struct file_clone_range args; @@ -396,8 +396,8 @@ static int ioctl_fsfreeze(struct file *filp) /* Freeze */ if (sb->s_op->freeze_super) - return sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE); - return freeze_super(sb, FREEZE_HOLDER_USERSPACE); + return sb->s_op->freeze_super(sb, FREEZE_HOLDER_USERSPACE, NULL); + return freeze_super(sb, FREEZE_HOLDER_USERSPACE, NULL); } static int ioctl_fsthaw(struct file *filp) @@ -409,8 +409,8 @@ static int ioctl_fsthaw(struct file *filp) /* Thaw */ if (sb->s_op->thaw_super) - return sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE); - return thaw_super(sb, FREEZE_HOLDER_USERSPACE); + return sb->s_op->thaw_super(sb, FREEZE_HOLDER_USERSPACE, NULL); + return thaw_super(sb, FREEZE_HOLDER_USERSPACE, NULL); } static int ioctl_file_dedupe_range(struct file *file, @@ -821,7 +821,8 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, return ioctl_fioasync(fd, filp, argp); case FIOQSIZE: - if (S_ISDIR(inode->i_mode) || S_ISREG(inode->i_mode) || + if (S_ISDIR(inode->i_mode) || + (S_ISREG(inode->i_mode) && !IS_ANON_FILE(inode)) || S_ISLNK(inode->i_mode)) { loff_t res = inode_get_bytes(inode); return copy_to_user(argp, &res, sizeof(res)) ? @@ -856,7 +857,7 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, return ioctl_file_dedupe_range(filp, argp); case FIONREAD: - if (!S_ISREG(inode->i_mode)) + if (!S_ISREG(inode->i_mode) || IS_ANON_FILE(inode)) return vfs_ioctl(filp, cmd, arg); return put_user(i_size_read(inode) - filp->f_pos, @@ -881,7 +882,7 @@ static int do_vfs_ioctl(struct file *filp, unsigned int fd, return ioctl_get_fs_sysfs_path(filp, argp); default: - if (S_ISREG(inode->i_mode)) + if (S_ISREG(inode->i_mode) && !IS_ANON_FILE(inode)) return file_ioctl(filp, cmd, argp); break; } diff --git a/fs/iomap/Makefile b/fs/iomap/Makefile index 381d76c5c232..69e8ebb41302 100644 --- a/fs/iomap/Makefile +++ b/fs/iomap/Makefile @@ -12,6 +12,7 @@ iomap-y += trace.o \ iter.o iomap-$(CONFIG_BLOCK) += buffered-io.o \ direct-io.o \ + ioend.o \ fiemap.o \ seek.o iomap-$(CONFIG_SWAP) += swapfile.o diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index d303e6c8900c..233abf598f65 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -12,17 +12,15 @@ #include <linux/buffer_head.h> #include <linux/dax.h> #include <linux/writeback.h> -#include <linux/list_sort.h> #include <linux/swap.h> #include <linux/bio.h> #include <linux/sched/signal.h> #include <linux/migrate.h> +#include "internal.h" #include "trace.h" #include "../internal.h" -#define IOEND_BATCH_SIZE 4096 - /* * Structure allocated for each folio to track per-block uptodate, dirty state * and I/O completions. 
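[Aside: iomap_folio_state tracks two bits per filesystem block within a folio — uptodate bits first, then dirty bits offset by the per-folio block count. A simplified sketch of that layout, consistent with the ifs_* helpers this file already uses; the my_* names are illustrative and not part of the patch:]

/* Sketch: per-block state bitmap layout (illustrative, not from the patch). */
static bool my_block_is_uptodate(struct iomap_folio_state *ifs,
		unsigned int block)
{
	return test_bit(block, ifs->state);
}

static bool my_block_is_dirty(struct inode *inode, struct folio *folio,
		struct iomap_folio_state *ifs, unsigned int block)
{
	unsigned int blks = i_blocks_per_folio(inode, folio);

	/* dirty bits occupy the second half of state[] */
	return test_bit(block + blks, ifs->state);
}
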
@@ -40,8 +38,6 @@ struct iomap_folio_state { unsigned long state[]; }; -static struct bio_set iomap_ioend_bioset; - static inline bool ifs_is_fully_uptodate(struct folio *folio, struct iomap_folio_state *ifs) { @@ -263,7 +259,7 @@ static void iomap_adjust_read_range(struct inode *inode, struct folio *folio, } /* truncate len if we find any trailing uptodate block(s) */ - for ( ; i <= last; i++) { + while (++i <= last) { if (ifs_block_is_uptodate(ifs, i)) { plen -= (last - i + 1) * block_size; last = i - 1; @@ -366,20 +362,24 @@ static inline bool iomap_block_needs_zeroing(const struct iomap_iter *iter, pos >= i_size_read(iter->inode); } -static loff_t iomap_readpage_iter(const struct iomap_iter *iter, - struct iomap_readpage_ctx *ctx, loff_t offset) +static int iomap_readpage_iter(struct iomap_iter *iter, + struct iomap_readpage_ctx *ctx) { const struct iomap *iomap = &iter->iomap; - loff_t pos = iter->pos + offset; - loff_t length = iomap_length(iter) - offset; + loff_t pos = iter->pos; + loff_t length = iomap_length(iter); struct folio *folio = ctx->cur_folio; struct iomap_folio_state *ifs; - loff_t orig_pos = pos; size_t poff, plen; sector_t sector; + int ret; - if (iomap->type == IOMAP_INLINE) - return iomap_read_inline_data(iter, folio); + if (iomap->type == IOMAP_INLINE) { + ret = iomap_read_inline_data(iter, folio); + if (ret) + return ret; + return iomap_iter_advance(iter, &length); + } /* zero post-eof blocks as the page may be mapped */ ifs = ifs_alloc(iter->inode, folio, iter->flags); @@ -438,25 +438,22 @@ done: * we can skip trailing ones as they will be handled in the next * iteration. */ - return pos - orig_pos + plen; + length = pos - iter->pos + plen; + return iomap_iter_advance(iter, &length); } -static loff_t iomap_read_folio_iter(const struct iomap_iter *iter, +static int iomap_read_folio_iter(struct iomap_iter *iter, struct iomap_readpage_ctx *ctx) { - struct folio *folio = ctx->cur_folio; - size_t offset = offset_in_folio(folio, iter->pos); - loff_t length = min_t(loff_t, folio_size(folio) - offset, - iomap_length(iter)); - loff_t done, ret; - - for (done = 0; done < length; done += ret) { - ret = iomap_readpage_iter(iter, ctx, done); - if (ret <= 0) + int ret; + + while (iomap_length(iter)) { + ret = iomap_readpage_iter(iter, ctx); + if (ret) return ret; } - return done; + return 0; } int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) @@ -474,7 +471,7 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) trace_iomap_readpage(iter.inode, 1); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_read_folio_iter(&iter, &ctx); + iter.status = iomap_read_folio_iter(&iter, &ctx); if (ctx.bio) { submit_bio(ctx.bio); @@ -493,15 +490,14 @@ int iomap_read_folio(struct folio *folio, const struct iomap_ops *ops) } EXPORT_SYMBOL_GPL(iomap_read_folio); -static loff_t iomap_readahead_iter(const struct iomap_iter *iter, +static int iomap_readahead_iter(struct iomap_iter *iter, struct iomap_readpage_ctx *ctx) { - loff_t length = iomap_length(iter); - loff_t done, ret; + int ret; - for (done = 0; done < length; done += ret) { + while (iomap_length(iter)) { if (ctx->cur_folio && - offset_in_folio(ctx->cur_folio, iter->pos + done) == 0) { + offset_in_folio(ctx->cur_folio, iter->pos) == 0) { if (!ctx->cur_folio_in_bio) folio_unlock(ctx->cur_folio); ctx->cur_folio = NULL; @@ -510,12 +506,12 @@ static loff_t iomap_readahead_iter(const struct iomap_iter *iter, ctx->cur_folio = readahead_folio(ctx->rac); ctx->cur_folio_in_bio = false; } - 
ret = iomap_readpage_iter(iter, ctx, done); - if (ret <= 0) + ret = iomap_readpage_iter(iter, ctx); + if (ret) return ret; } - return done; + return 0; } /** @@ -547,7 +543,7 @@ void iomap_readahead(struct readahead_control *rac, const struct iomap_ops *ops) trace_iomap_readahead(rac->mapping->host, readahead_count(rac)); while (iomap_iter(&iter, ops) > 0) - iter.processed = iomap_readahead_iter(&iter, &ctx); + iter.status = iomap_readahead_iter(&iter, &ctx); if (ctx.bio) submit_bio(ctx.bio); @@ -603,6 +599,8 @@ struct folio *iomap_get_folio(struct iomap_iter *iter, loff_t pos, size_t len) if (iter->flags & IOMAP_NOWAIT) fgp |= FGP_NOWAIT; + if (iter->flags & IOMAP_DONTCACHE) + fgp |= FGP_DONTCACHE; fgp |= fgf_set_order(len); return __filemap_get_folio(iter->inode->i_mapping, pos >> PAGE_SHIFT, @@ -681,11 +679,12 @@ static int iomap_read_folio_sync(loff_t block_start, struct folio *folio, return submit_bio_wait(&bio); } -static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, - size_t len, struct folio *folio) +static int __iomap_write_begin(const struct iomap_iter *iter, size_t len, + struct folio *folio) { const struct iomap *srcmap = iomap_iter_srcmap(iter); struct iomap_folio_state *ifs; + loff_t pos = iter->pos; loff_t block_size = i_blocksize(iter->inode); loff_t block_start = round_down(pos, block_size); loff_t block_end = round_up(pos + len, block_size); @@ -743,10 +742,13 @@ static int __iomap_write_begin(const struct iomap_iter *iter, loff_t pos, return 0; } -static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos, - size_t len) +static struct folio *__iomap_get_folio(struct iomap_iter *iter, size_t len) { const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; + loff_t pos = iter->pos; + + if (!mapping_large_folio_support(iter->inode->i_mapping)) + len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos)); if (folio_ops && folio_ops->get_folio) return folio_ops->get_folio(iter, pos, len); @@ -754,10 +756,11 @@ static struct folio *__iomap_get_folio(struct iomap_iter *iter, loff_t pos, return iomap_get_folio(iter, pos, len); } -static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret, +static void __iomap_put_folio(struct iomap_iter *iter, size_t ret, struct folio *folio) { const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; + loff_t pos = iter->pos; if (folio_ops && folio_ops->put_folio) { folio_ops->put_folio(iter->inode, pos, ret, folio); @@ -767,6 +770,22 @@ static void __iomap_put_folio(struct iomap_iter *iter, loff_t pos, size_t ret, } } +/* trim pos and bytes to within a given folio */ +static loff_t iomap_trim_folio_range(struct iomap_iter *iter, + struct folio *folio, size_t *offset, u64 *bytes) +{ + loff_t pos = iter->pos; + size_t fsize = folio_size(folio); + + WARN_ON_ONCE(pos < folio_pos(folio)); + WARN_ON_ONCE(pos >= folio_pos(folio) + fsize); + + *offset = offset_in_folio(folio, pos); + *bytes = min(*bytes, fsize - *offset); + + return pos; +} + static int iomap_write_begin_inline(const struct iomap_iter *iter, struct folio *folio) { @@ -776,14 +795,22 @@ static int iomap_write_begin_inline(const struct iomap_iter *iter, return iomap_read_inline_data(iter, folio); } -static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, - size_t len, struct folio **foliop) +/* + * Grab and prepare a folio for write based on iter state. Returns the folio, + * offset, and length. Callers can optionally pass a max length *plen, + * otherwise init to zero. 
+ */ +static int iomap_write_begin(struct iomap_iter *iter, struct folio **foliop, + size_t *poffset, u64 *plen) { const struct iomap_folio_ops *folio_ops = iter->iomap.folio_ops; const struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t pos = iter->pos; + u64 len = min_t(u64, SIZE_MAX, iomap_length(iter)); struct folio *folio; int status = 0; + len = min_not_zero(len, *plen); BUG_ON(pos + len > iter->iomap.offset + iter->iomap.length); if (srcmap != &iter->iomap) BUG_ON(pos + len > srcmap->offset + srcmap->length); @@ -791,10 +818,7 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, if (fatal_signal_pending(current)) return -EINTR; - if (!mapping_large_folio_support(iter->inode->i_mapping)) - len = min_t(size_t, len, PAGE_SIZE - offset_in_page(pos)); - - folio = __iomap_get_folio(iter, pos, len); + folio = __iomap_get_folio(iter, len); if (IS_ERR(folio)) return PTR_ERR(folio); @@ -818,24 +842,24 @@ static int iomap_write_begin(struct iomap_iter *iter, loff_t pos, } } - if (pos + len > folio_pos(folio) + folio_size(folio)) - len = folio_pos(folio) + folio_size(folio) - pos; + pos = iomap_trim_folio_range(iter, folio, poffset, &len); if (srcmap->type == IOMAP_INLINE) status = iomap_write_begin_inline(iter, folio); else if (srcmap->flags & IOMAP_F_BUFFER_HEAD) status = __block_write_begin_int(folio, pos, len, NULL, srcmap); else - status = __iomap_write_begin(iter, pos, len, folio); + status = __iomap_write_begin(iter, len, folio); if (unlikely(status)) goto out_unlock; *foliop = folio; + *plen = len; return 0; out_unlock: - __iomap_put_folio(iter, pos, 0, folio); + __iomap_put_folio(iter, 0, folio); return status; } @@ -885,10 +909,11 @@ static void iomap_write_end_inline(const struct iomap_iter *iter, * Returns true if all copied bytes have been written to the pagecache, * otherwise return false. */ -static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, - size_t copied, struct folio *folio) +static bool iomap_write_end(struct iomap_iter *iter, size_t len, size_t copied, + struct folio *folio) { const struct iomap *srcmap = iomap_iter_srcmap(iter); + loff_t pos = iter->pos; if (srcmap->type == IOMAP_INLINE) { iomap_write_end_inline(iter, folio, pos, copied); @@ -907,12 +932,10 @@ static bool iomap_write_end(struct iomap_iter *iter, loff_t pos, size_t len, return __iomap_write_end(iter->inode, pos, len, copied, folio); } -static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) +static int iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) { - loff_t length = iomap_length(iter); - loff_t pos = iter->pos; ssize_t total_written = 0; - long status = 0; + int status = 0; struct address_space *mapping = iter->inode->i_mapping; size_t chunk = mapping_max_folio_size(mapping); unsigned int bdp_flags = (iter->flags & IOMAP_NOWAIT) ? 
BDP_ASYNC : 0; @@ -921,21 +944,22 @@ static loff_t iomap_write_iter(struct iomap_iter *iter, struct iov_iter *i) struct folio *folio; loff_t old_size; size_t offset; /* Offset into folio */ - size_t bytes; /* Bytes to write to folio */ + u64 bytes; /* Bytes to write to folio */ size_t copied; /* Bytes copied from user */ - size_t written; /* Bytes have been written */ + u64 written; /* Bytes have been written */ + loff_t pos; bytes = iov_iter_count(i); retry: - offset = pos & (chunk - 1); + offset = iter->pos & (chunk - 1); bytes = min(chunk - offset, bytes); status = balance_dirty_pages_ratelimited_flags(mapping, bdp_flags); if (unlikely(status)) break; - if (bytes > length) - bytes = length; + if (bytes > iomap_length(iter)) + bytes = iomap_length(iter); /* * Bring in the user page that we'll copy from _first_. @@ -952,23 +976,21 @@ retry: break; } - status = iomap_write_begin(iter, pos, bytes, &folio); + status = iomap_write_begin(iter, &folio, &offset, &bytes); if (unlikely(status)) { - iomap_write_failed(iter->inode, pos, bytes); + iomap_write_failed(iter->inode, iter->pos, bytes); break; } if (iter->iomap.flags & IOMAP_F_STALE) break; - offset = offset_in_folio(folio, pos); - if (bytes > folio_size(folio) - offset) - bytes = folio_size(folio) - offset; + pos = iter->pos; if (mapping_writably_mapped(mapping)) flush_dcache_folio(folio); copied = copy_folio_from_iter_atomic(folio, offset, bytes, i); - written = iomap_write_end(iter, pos, bytes, copied, folio) ? + written = iomap_write_end(iter, bytes, copied, folio) ? copied : 0; /* @@ -983,7 +1005,7 @@ retry: i_size_write(iter->inode, pos + written); iter->iomap.flags |= IOMAP_F_SIZE_CHANGED; } - __iomap_put_folio(iter, pos, written, folio); + __iomap_put_folio(iter, written, folio); if (old_size < pos) pagecache_isize_extended(iter->inode, old_size, pos); @@ -1006,17 +1028,12 @@ retry: goto retry; } } else { - pos += written; total_written += written; - length -= written; + iomap_iter_advance(iter, &written); } - } while (iov_iter_count(i) && length); + } while (iov_iter_count(i) && iomap_length(iter)); - if (status == -EAGAIN) { - iov_iter_revert(i, total_written); - return -EAGAIN; - } - return total_written ? total_written : status; + return total_written ? 
0 : status; } ssize_t @@ -1034,9 +1051,11 @@ iomap_file_buffered_write(struct kiocb *iocb, struct iov_iter *i, if (iocb->ki_flags & IOCB_NOWAIT) iter.flags |= IOMAP_NOWAIT; + if (iocb->ki_flags & IOCB_DONTCACHE) + iter.flags |= IOMAP_DONTCACHE; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_write_iter(&iter, i); + iter.status = iomap_write_iter(&iter, i); if (unlikely(iter.pos == iocb->ki_pos)) return ret; @@ -1270,48 +1289,42 @@ void iomap_write_delalloc_release(struct inode *inode, loff_t start_byte, } EXPORT_SYMBOL_GPL(iomap_write_delalloc_release); -static loff_t iomap_unshare_iter(struct iomap_iter *iter) +static int iomap_unshare_iter(struct iomap_iter *iter) { struct iomap *iomap = &iter->iomap; - loff_t pos = iter->pos; - loff_t length = iomap_length(iter); - loff_t written = 0; + u64 bytes = iomap_length(iter); + int status; if (!iomap_want_unshare_iter(iter)) - return length; + return iomap_iter_advance(iter, &bytes); do { struct folio *folio; - int status; size_t offset; - size_t bytes = min_t(u64, SIZE_MAX, length); bool ret; - status = iomap_write_begin(iter, pos, bytes, &folio); + bytes = min_t(u64, SIZE_MAX, bytes); + status = iomap_write_begin(iter, &folio, &offset, &bytes); if (unlikely(status)) return status; if (iomap->flags & IOMAP_F_STALE) break; - offset = offset_in_folio(folio, pos); - if (bytes > folio_size(folio) - offset) - bytes = folio_size(folio) - offset; - - ret = iomap_write_end(iter, pos, bytes, bytes, folio); - __iomap_put_folio(iter, pos, bytes, folio); + ret = iomap_write_end(iter, bytes, bytes, folio); + __iomap_put_folio(iter, bytes, folio); if (WARN_ON_ONCE(!ret)) return -EIO; cond_resched(); - pos += bytes; - written += bytes; - length -= bytes; - balance_dirty_pages_ratelimited(iter->inode->i_mapping); - } while (length > 0); - return written; + status = iomap_iter_advance(iter, &bytes); + if (status) + break; + } while (bytes > 0); + + return status; } int @@ -1331,7 +1344,7 @@ iomap_file_unshare(struct inode *inode, loff_t pos, loff_t len, iter.len = min(len, size - pos); while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_unshare_iter(&iter); + iter.status = iomap_unshare_iter(&iter); return ret; } EXPORT_SYMBOL_GPL(iomap_file_unshare); @@ -1350,20 +1363,18 @@ static inline int iomap_zero_iter_flush_and_stale(struct iomap_iter *i) return filemap_write_and_wait_range(mapping, i->pos, end); } -static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) +static int iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) { - loff_t pos = iter->pos; - loff_t length = iomap_length(iter); - loff_t written = 0; + u64 bytes = iomap_length(iter); + int status; do { struct folio *folio; - int status; size_t offset; - size_t bytes = min_t(u64, SIZE_MAX, length); bool ret; - status = iomap_write_begin(iter, pos, bytes, &folio); + bytes = min_t(u64, SIZE_MAX, bytes); + status = iomap_write_begin(iter, &folio, &offset, &bytes); if (status) return status; if (iter->iomap.flags & IOMAP_F_STALE) @@ -1371,37 +1382,35 @@ static loff_t iomap_zero_iter(struct iomap_iter *iter, bool *did_zero) /* warn about zeroing folios beyond eof that won't write back */ WARN_ON_ONCE(folio_pos(folio) > iter->inode->i_size); - offset = offset_in_folio(folio, pos); - if (bytes > folio_size(folio) - offset) - bytes = folio_size(folio) - offset; folio_zero_range(folio, offset, bytes); folio_mark_accessed(folio); - ret = iomap_write_end(iter, pos, bytes, bytes, folio); - __iomap_put_folio(iter, pos, bytes, folio); + ret = 
iomap_write_end(iter, bytes, bytes, folio); + __iomap_put_folio(iter, bytes, folio); if (WARN_ON_ONCE(!ret)) return -EIO; - pos += bytes; - length -= bytes; - written += bytes; - } while (length > 0); + status = iomap_iter_advance(iter, &bytes); + if (status) + break; + } while (bytes > 0); if (did_zero) *did_zero = true; - return written; + return status; } int iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, - const struct iomap_ops *ops) + const struct iomap_ops *ops, void *private) { struct iomap_iter iter = { .inode = inode, .pos = pos, .len = len, .flags = IOMAP_ZERO, + .private = private, }; struct address_space *mapping = inode->i_mapping; unsigned int blocksize = i_blocksize(inode); @@ -1424,7 +1433,7 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, filemap_range_needs_writeback(mapping, pos, pos + plen - 1)) { iter.len = plen; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_zero_iter(&iter, did_zero); + iter.status = iomap_zero_iter(&iter, did_zero); iter.len = len - (iter.pos - pos); if (ret || !iter.len) @@ -1443,17 +1452,19 @@ iomap_zero_range(struct inode *inode, loff_t pos, loff_t len, bool *did_zero, if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN) { - loff_t proc = iomap_length(&iter); + s64 status; if (range_dirty) { range_dirty = false; - proc = iomap_zero_iter_flush_and_stale(&iter); + status = iomap_zero_iter_flush_and_stale(&iter); + } else { + status = iomap_iter_advance_full(&iter); } - iter.processed = proc; + iter.status = status; continue; } - iter.processed = iomap_zero_iter(&iter, did_zero); + iter.status = iomap_zero_iter(&iter, did_zero); } return ret; } @@ -1461,7 +1472,7 @@ EXPORT_SYMBOL_GPL(iomap_zero_range); int iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, - const struct iomap_ops *ops) + const struct iomap_ops *ops, void *private) { unsigned int blocksize = i_blocksize(inode); unsigned int off = pos & (blocksize - 1); @@ -1469,11 +1480,12 @@ iomap_truncate_page(struct inode *inode, loff_t pos, bool *did_zero, /* Block boundary? 
Nothing to do */ if (!off) return 0; - return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops); + return iomap_zero_range(inode, pos, blocksize - off, did_zero, ops, + private); } EXPORT_SYMBOL_GPL(iomap_truncate_page); -static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter, +static int iomap_folio_mkwrite_iter(struct iomap_iter *iter, struct folio *folio) { loff_t length = iomap_length(iter); @@ -1484,20 +1496,22 @@ static loff_t iomap_folio_mkwrite_iter(struct iomap_iter *iter, &iter->iomap); if (ret) return ret; - block_commit_write(&folio->page, 0, length); + block_commit_write(folio, 0, length); } else { WARN_ON_ONCE(!folio_test_uptodate(folio)); folio_mark_dirty(folio); } - return length; + return iomap_iter_advance(iter, &length); } -vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) +vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops, + void *private) { struct iomap_iter iter = { .inode = file_inode(vmf->vma->vm_file), .flags = IOMAP_WRITE | IOMAP_FAULT, + .private = private, }; struct folio *folio = page_folio(vmf->page); ssize_t ret; @@ -1509,7 +1523,7 @@ vm_fault_t iomap_page_mkwrite(struct vm_fault *vmf, const struct iomap_ops *ops) iter.pos = folio_pos(folio); iter.len = ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_folio_mkwrite_iter(&iter, folio); + iter.status = iomap_folio_mkwrite_iter(&iter, folio); if (ret < 0) goto out_unlock; @@ -1538,16 +1552,15 @@ static void iomap_finish_folio_write(struct inode *inode, struct folio *folio, * state, release holds on bios, and finally free up memory. Do not use the * ioend after this. */ -static u32 -iomap_finish_ioend(struct iomap_ioend *ioend, int error) +u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend) { struct inode *inode = ioend->io_inode; struct bio *bio = &ioend->io_bio; struct folio_iter fi; u32 folio_count = 0; - if (error) { - mapping_set_error(inode->i_mapping, error); + if (ioend->io_error) { + mapping_set_error(inode->i_mapping, ioend->io_error); if (!bio_flagged(bio, BIO_QUIET)) { pr_err_ratelimited( "%s: writeback error on inode %lu, offset %lld, sector %llu", @@ -1566,116 +1579,16 @@ iomap_finish_ioend(struct iomap_ioend *ioend, int error) return folio_count; } -/* - * Ioend completion routine for merged bios. This can only be called from task - * contexts as merged ioends can be of unbound length. Hence we have to break up - * the writeback completions into manageable chunks to avoid long scheduler - * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get - * good batch processing throughput without creating adverse scheduler latency - * conditions. - */ -void -iomap_finish_ioends(struct iomap_ioend *ioend, int error) -{ - struct list_head tmp; - u32 completions; - - might_sleep(); - - list_replace_init(&ioend->io_list, &tmp); - completions = iomap_finish_ioend(ioend, error); - - while (!list_empty(&tmp)) { - if (completions > IOEND_BATCH_SIZE * 8) { - cond_resched(); - completions = 0; - } - ioend = list_first_entry(&tmp, struct iomap_ioend, io_list); - list_del_init(&ioend->io_list); - completions += iomap_finish_ioend(ioend, error); - } -} -EXPORT_SYMBOL_GPL(iomap_finish_ioends); - -/* - * We can merge two adjacent ioends if they have the same set of work to do. 
- */ -static bool -iomap_ioend_can_merge(struct iomap_ioend *ioend, struct iomap_ioend *next) -{ - if (ioend->io_bio.bi_status != next->io_bio.bi_status) - return false; - if (next->io_flags & IOMAP_F_BOUNDARY) - return false; - if ((ioend->io_flags & IOMAP_F_SHARED) ^ - (next->io_flags & IOMAP_F_SHARED)) - return false; - if ((ioend->io_type == IOMAP_UNWRITTEN) ^ - (next->io_type == IOMAP_UNWRITTEN)) - return false; - if (ioend->io_offset + ioend->io_size != next->io_offset) - return false; - /* - * Do not merge physically discontiguous ioends. The filesystem - * completion functions will have to iterate the physical - * discontiguities even if we merge the ioends at a logical level, so - * we don't gain anything by merging physical discontiguities here. - * - * We cannot use bio->bi_iter.bi_sector here as it is modified during - * submission so does not point to the start sector of the bio at - * completion. - */ - if (ioend->io_sector + (ioend->io_size >> 9) != next->io_sector) - return false; - return true; -} - -void -iomap_ioend_try_merge(struct iomap_ioend *ioend, struct list_head *more_ioends) -{ - struct iomap_ioend *next; - - INIT_LIST_HEAD(&ioend->io_list); - - while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend, - io_list))) { - if (!iomap_ioend_can_merge(ioend, next)) - break; - list_move_tail(&next->io_list, &ioend->io_list); - ioend->io_size += next->io_size; - } -} -EXPORT_SYMBOL_GPL(iomap_ioend_try_merge); - -static int -iomap_ioend_compare(void *priv, const struct list_head *a, - const struct list_head *b) -{ - struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list); - struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list); - - if (ia->io_offset < ib->io_offset) - return -1; - if (ia->io_offset > ib->io_offset) - return 1; - return 0; -} - -void -iomap_sort_ioends(struct list_head *ioend_list) -{ - list_sort(NULL, ioend_list, iomap_ioend_compare); -} -EXPORT_SYMBOL_GPL(iomap_sort_ioends); - static void iomap_writepage_end_bio(struct bio *bio) { - iomap_finish_ioend(iomap_ioend_from_bio(bio), - blk_status_to_errno(bio->bi_status)); + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); + + ioend->io_error = blk_status_to_errno(bio->bi_status); + iomap_finish_ioend_buffered(ioend); } /* - * Submit the final bio for an ioend. + * Submit an ioend. * * If @error is non-zero, it means that we have a situation where some part of * the submission process has failed after we've marked pages for writeback. @@ -1694,14 +1607,18 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) * failure happened so that the file system end I/O handler gets called * to clean up. 
*/ - if (wpc->ops->prepare_ioend) - error = wpc->ops->prepare_ioend(wpc->ioend, error); + if (wpc->ops->submit_ioend) { + error = wpc->ops->submit_ioend(wpc, error); + } else { + if (WARN_ON_ONCE(wpc->iomap.flags & IOMAP_F_ANON_WRITE)) + error = -EIO; + if (!error) + submit_bio(&wpc->ioend->io_bio); + } if (error) { wpc->ioend->io_bio.bi_status = errno_to_blk_status(error); bio_endio(&wpc->ioend->io_bio); - } else { - submit_bio(&wpc->ioend->io_bio); } wpc->ioend = NULL; @@ -1709,9 +1626,9 @@ static int iomap_submit_ioend(struct iomap_writepage_ctx *wpc, int error) } static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, - struct writeback_control *wbc, struct inode *inode, loff_t pos) + struct writeback_control *wbc, struct inode *inode, loff_t pos, + u16 ioend_flags) { - struct iomap_ioend *ioend; struct bio *bio; bio = bio_alloc_bioset(wpc->iomap.bdev, BIO_MAX_VECS, @@ -1719,36 +1636,24 @@ static struct iomap_ioend *iomap_alloc_ioend(struct iomap_writepage_ctx *wpc, GFP_NOFS, &iomap_ioend_bioset); bio->bi_iter.bi_sector = iomap_sector(&wpc->iomap, pos); bio->bi_end_io = iomap_writepage_end_bio; - wbc_init_bio(wbc, bio); bio->bi_write_hint = inode->i_write_hint; - - ioend = iomap_ioend_from_bio(bio); - INIT_LIST_HEAD(&ioend->io_list); - ioend->io_type = wpc->iomap.type; - ioend->io_flags = wpc->iomap.flags; - if (pos > wpc->iomap.offset) - wpc->iomap.flags &= ~IOMAP_F_BOUNDARY; - ioend->io_inode = inode; - ioend->io_size = 0; - ioend->io_offset = pos; - ioend->io_sector = bio->bi_iter.bi_sector; - + wbc_init_bio(wbc, bio); wpc->nr_folios = 0; - return ioend; + return iomap_init_ioend(inode, bio, pos, ioend_flags); } -static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos) +static bool iomap_can_add_to_ioend(struct iomap_writepage_ctx *wpc, loff_t pos, + u16 ioend_flags) { - if (wpc->iomap.offset == pos && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) + if (ioend_flags & IOMAP_IOEND_BOUNDARY) return false; - if ((wpc->iomap.flags & IOMAP_F_SHARED) != - (wpc->ioend->io_flags & IOMAP_F_SHARED)) - return false; - if (wpc->iomap.type != wpc->ioend->io_type) + if ((ioend_flags & IOMAP_IOEND_NOMERGE_FLAGS) != + (wpc->ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) return false; if (pos != wpc->ioend->io_offset + wpc->ioend->io_size) return false; - if (iomap_sector(&wpc->iomap, pos) != + if (!(wpc->iomap.flags & IOMAP_F_ANON_WRITE) && + iomap_sector(&wpc->iomap, pos) != bio_end_sector(&wpc->ioend->io_bio)) return false; /* @@ -1779,14 +1684,23 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, { struct iomap_folio_state *ifs = folio->private; size_t poff = offset_in_folio(folio, pos); + unsigned int ioend_flags = 0; int error; - if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos)) { + if (wpc->iomap.type == IOMAP_UNWRITTEN) + ioend_flags |= IOMAP_IOEND_UNWRITTEN; + if (wpc->iomap.flags & IOMAP_F_SHARED) + ioend_flags |= IOMAP_IOEND_SHARED; + if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) + ioend_flags |= IOMAP_IOEND_BOUNDARY; + + if (!wpc->ioend || !iomap_can_add_to_ioend(wpc, pos, ioend_flags)) { new_ioend: error = iomap_submit_ioend(wpc, 0); if (error) return error; - wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos); + wpc->ioend = iomap_alloc_ioend(wpc, wbc, inode, pos, + ioend_flags); } if (!bio_add_folio(&wpc->ioend->io_bio, folio, len, poff)) @@ -2062,11 +1976,3 @@ iomap_writepages(struct address_space *mapping, struct writeback_control *wbc, return iomap_submit_ioend(wpc, error); } 
EXPORT_SYMBOL_GPL(iomap_writepages); - -static int __init iomap_buffered_init(void) -{ - return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), - offsetof(struct iomap_ioend, io_bio), - BIOSET_NEED_BVECS); -} -fs_initcall(iomap_buffered_init); diff --git a/fs/iomap/direct-io.c b/fs/iomap/direct-io.c index b521eb15759e..844261a31156 100644 --- a/fs/iomap/direct-io.c +++ b/fs/iomap/direct-io.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (C) 2010 Red Hat, Inc. - * Copyright (c) 2016-2021 Christoph Hellwig. + * Copyright (c) 2016-2025 Christoph Hellwig. */ #include <linux/module.h> #include <linux/compiler.h> @@ -12,6 +12,7 @@ #include <linux/backing-dev.h> #include <linux/uio.h> #include <linux/task_io_accounting_ops.h> +#include "internal.h" #include "trace.h" #include "../internal.h" @@ -20,6 +21,7 @@ * Private flags for iomap_dio, must not overlap with the public ones in * iomap.h: */ +#define IOMAP_DIO_NO_INVALIDATE (1U << 25) #define IOMAP_DIO_CALLER_COMP (1U << 26) #define IOMAP_DIO_INLINE_COMP (1U << 27) #define IOMAP_DIO_WRITE_THROUGH (1U << 28) @@ -81,10 +83,12 @@ static void iomap_dio_submit_bio(const struct iomap_iter *iter, WRITE_ONCE(iocb->private, bio); } - if (dio->dops && dio->dops->submit_io) + if (dio->dops && dio->dops->submit_io) { dio->dops->submit_io(iter, bio, pos); - else + } else { + WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_ANON_WRITE); submit_bio(bio); + } } ssize_t iomap_dio_complete(struct iomap_dio *dio) @@ -117,7 +121,8 @@ ssize_t iomap_dio_complete(struct iomap_dio *dio) * ->end_io() when necessary, otherwise a racing buffer read would cache * zeros from unwritten extents. */ - if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE)) + if (!dio->error && dio->size && (dio->flags & IOMAP_DIO_WRITE) && + !(dio->flags & IOMAP_DIO_NO_INVALIDATE)) kiocb_invalidate_post_direct_write(iocb, dio->size); inode_dio_end(file_inode(iocb->ki_filp)); @@ -163,43 +168,31 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret) cmpxchg(&dio->error, 0, ret); } -void iomap_dio_bio_end_io(struct bio *bio) +/* + * Called when dio->ref reaches zero from an I/O completion. + */ +static void iomap_dio_done(struct iomap_dio *dio) { - struct iomap_dio *dio = bio->bi_private; - bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); struct kiocb *iocb = dio->iocb; - if (bio->bi_status) - iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); - if (!atomic_dec_and_test(&dio->ref)) - goto release_bio; - - /* - * Synchronous dio, task itself will handle any completion work - * that needs after IO. All we need to do is wake the task. - */ if (dio->wait_for_completion) { + /* + * Synchronous I/O, task itself will handle any completion work + * that needs after IO. All we need to do is wake the task. + */ struct task_struct *waiter = dio->submit.waiter; WRITE_ONCE(dio->submit.waiter, NULL); blk_wake_io_task(waiter); - goto release_bio; - } - - /* - * Flagged with IOMAP_DIO_INLINE_COMP, we can complete it inline - */ - if (dio->flags & IOMAP_DIO_INLINE_COMP) { + } else if (dio->flags & IOMAP_DIO_INLINE_COMP) { WRITE_ONCE(iocb->private, NULL); iomap_dio_complete_work(&dio->aio.work); - goto release_bio; - } - - /* - * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then schedule - * our completion that way to avoid an async punt to a workqueue. 
- */ - if (dio->flags & IOMAP_DIO_CALLER_COMP) { + } else if (dio->flags & IOMAP_DIO_CALLER_COMP) { + /* + * If this dio is flagged with IOMAP_DIO_CALLER_COMP, then + * schedule our completion that way to avoid an async punt to a + * workqueue. + */ /* only polled IO cares about private cleared */ iocb->private = dio; iocb->dio_complete = iomap_dio_deferred_complete; @@ -217,19 +210,31 @@ void iomap_dio_bio_end_io(struct bio *bio) * issuer. */ iocb->ki_complete(iocb, 0); - goto release_bio; + } else { + struct inode *inode = file_inode(iocb->ki_filp); + + /* + * Async DIO completion that requires filesystem level + * completion work gets punted to a work queue to complete as + * the operation may require more IO to be issued to finalise + * filesystem metadata changes or guarantee data integrity. + */ + INIT_WORK(&dio->aio.work, iomap_dio_complete_work); + queue_work(inode->i_sb->s_dio_done_wq, &dio->aio.work); } +} + +void iomap_dio_bio_end_io(struct bio *bio) +{ + struct iomap_dio *dio = bio->bi_private; + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + + if (bio->bi_status) + iomap_dio_set_error(dio, blk_status_to_errno(bio->bi_status)); + + if (atomic_dec_and_test(&dio->ref)) + iomap_dio_done(dio); - /* - * Async DIO completion that requires filesystem level completion work - * gets punted to a work queue to complete as the operation may require - * more IO to be issued to finalise filesystem metadata changes or - * guarantee data integrity. - */ - INIT_WORK(&dio->aio.work, iomap_dio_complete_work); - queue_work(file_inode(iocb->ki_filp)->i_sb->s_dio_done_wq, - &dio->aio.work); -release_bio: if (should_dirty) { bio_check_pages_dirty(bio); } else { @@ -239,6 +244,47 @@ release_bio: } EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io); +u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend) +{ + struct iomap_dio *dio = ioend->io_bio.bi_private; + bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY); + u32 vec_count = ioend->io_bio.bi_vcnt; + + if (ioend->io_error) + iomap_dio_set_error(dio, ioend->io_error); + + if (atomic_dec_and_test(&dio->ref)) { + /* + * Try to avoid another context switch for the completion given + * that we are already called from the ioend completion + * workqueue, but never invalidate pages from this thread to + * avoid deadlocks with buffered I/O completions. Tough luck if + * you hit the tiny race with someone dirtying the range now + * between this check and the actual completion. + */ + if (!dio->iocb->ki_filp->f_mapping->nrpages) { + dio->flags |= IOMAP_DIO_INLINE_COMP; + dio->flags |= IOMAP_DIO_NO_INVALIDATE; + } + dio->flags &= ~IOMAP_DIO_CALLER_COMP; + iomap_dio_done(dio); + } + + if (should_dirty) { + bio_check_pages_dirty(&ioend->io_bio); + } else { + bio_release_pages(&ioend->io_bio, false); + bio_put(&ioend->io_bio); + } + + /* + * Return the number of bvecs completed as even direct I/O completions + * do significant per-folio work and we'll still want to give up the + * CPU after a lot of completions. + */ + return vec_count; +} + static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, loff_t pos, unsigned len) { @@ -266,81 +312,85 @@ static int iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio, } /* - * Figure out the bio's operation flags from the dio request, the - * mapping, and whether or not we want FUA. Note that we can end up - * clearing the WRITE_THROUGH flag in the dio request. 
+ * Use a FUA write if we need datasync semantics and this is a pure data I/O + * that doesn't require any metadata updates (including after I/O completion + * such as unwritten extent conversion) and the underlying device either + * doesn't have a volatile write cache or supports FUA. + * This allows us to avoid cache flushes on I/O completion. */ -static inline blk_opf_t iomap_dio_bio_opflags(struct iomap_dio *dio, - const struct iomap *iomap, bool use_fua, bool atomic) +static inline bool iomap_dio_can_use_fua(const struct iomap *iomap, + struct iomap_dio *dio) { - blk_opf_t opflags = REQ_SYNC | REQ_IDLE; - - if (!(dio->flags & IOMAP_DIO_WRITE)) - return REQ_OP_READ; - - opflags |= REQ_OP_WRITE; - if (use_fua) - opflags |= REQ_FUA; - else - dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; - if (atomic) - opflags |= REQ_ATOMIC; - - return opflags; + if (iomap->flags & (IOMAP_F_SHARED | IOMAP_F_DIRTY)) + return false; + if (!(dio->flags & IOMAP_DIO_WRITE_THROUGH)) + return false; + return !bdev_write_cache(iomap->bdev) || bdev_fua(iomap->bdev); } -static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, - struct iomap_dio *dio) +static int iomap_dio_bio_iter(struct iomap_iter *iter, struct iomap_dio *dio) { const struct iomap *iomap = &iter->iomap; struct inode *inode = iter->inode; unsigned int fs_block_size = i_blocksize(inode), pad; const loff_t length = iomap_length(iter); - bool atomic = iter->flags & IOMAP_ATOMIC; loff_t pos = iter->pos; - blk_opf_t bio_opf; + blk_opf_t bio_opf = REQ_SYNC | REQ_IDLE; struct bio *bio; bool need_zeroout = false; - bool use_fua = false; int nr_pages, ret = 0; - size_t copied = 0; + u64 copied = 0; size_t orig_count; - if (atomic && length != fs_block_size) - return -EINVAL; - if ((pos | length) & (bdev_logical_block_size(iomap->bdev) - 1) || !bdev_iter_is_aligned(iomap->bdev, dio->submit.iter)) return -EINVAL; - if (iomap->type == IOMAP_UNWRITTEN) { - dio->flags |= IOMAP_DIO_UNWRITTEN; - need_zeroout = true; - } + if (dio->flags & IOMAP_DIO_WRITE) { + bio_opf |= REQ_OP_WRITE; - if (iomap->flags & IOMAP_F_SHARED) - dio->flags |= IOMAP_DIO_COW; + if (iomap->flags & IOMAP_F_ATOMIC_BIO) { + /* + * Ensure that the mapping covers the full write + * length, otherwise it won't be submitted as a single + * bio, which is required to use hardware atomics. + */ + if (length != iter->len) + return -EINVAL; + bio_opf |= REQ_ATOMIC; + } + + if (iomap->type == IOMAP_UNWRITTEN) { + dio->flags |= IOMAP_DIO_UNWRITTEN; + need_zeroout = true; + } + + if (iomap->flags & IOMAP_F_SHARED) + dio->flags |= IOMAP_DIO_COW; + + if (iomap->flags & IOMAP_F_NEW) { + need_zeroout = true; + } else if (iomap->type == IOMAP_MAPPED) { + if (iomap_dio_can_use_fua(iomap, dio)) + bio_opf |= REQ_FUA; + else + dio->flags &= ~IOMAP_DIO_WRITE_THROUGH; + } - if (iomap->flags & IOMAP_F_NEW) { - need_zeroout = true; - } else if (iomap->type == IOMAP_MAPPED) { /* - * Use a FUA write if we need datasync semantics, this is a pure - * data IO that doesn't require any metadata updates (including - * after IO completion such as unwritten extent conversion) and - * the underlying device either supports FUA or doesn't have - * a volatile write cache. This allows us to avoid cache flushes - * on IO completion. If we can't use writethrough and need to - * sync, disable in-task completions as dio completion will - * need to call generic_write_sync() which will do a blocking - * fsync / cache flush call. 
+ * We can only do deferred completion for pure overwrites that + * don't require additional I/O at completion time. + * + * This rules out writes that need zeroing or extent conversion, + * extend the file size, or issue metadata I/O or cache flushes + * during completion processing. */ - if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) && - (dio->flags & IOMAP_DIO_WRITE_THROUGH) && - (bdev_fua(iomap->bdev) || !bdev_write_cache(iomap->bdev))) - use_fua = true; - else if (dio->flags & IOMAP_DIO_NEED_SYNC) + if (need_zeroout || (pos >= i_size_read(inode)) || + ((dio->flags & IOMAP_DIO_NEED_SYNC) && + !(bio_opf & REQ_FUA))) dio->flags &= ~IOMAP_DIO_CALLER_COMP; + } else { + bio_opf |= REQ_OP_READ; } /* @@ -355,18 +405,6 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, goto out; /* - * We can only do deferred completion for pure overwrites that - * don't require additional IO at completion. This rules out - * writes that need zeroing or extent conversion, extend - * the file size, or issue journal IO or cache flushes - * during completion processing. - */ - if (need_zeroout || - ((dio->flags & IOMAP_DIO_NEED_SYNC) && !use_fua) || - ((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) - dio->flags &= ~IOMAP_DIO_CALLER_COMP; - - /* * The rules for polled IO completions follow the guidelines as the * ones we set for inline and deferred completions. If none of those * are available for this IO, clear the polled flag. @@ -383,8 +421,6 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, goto out; } - bio_opf = iomap_dio_bio_opflags(dio, iomap, use_fua, atomic); - nr_pages = bio_iov_vecs_to_alloc(dio->submit.iter, BIO_MAX_VECS); do { size_t n; @@ -416,9 +452,9 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, } n = bio->bi_iter.bi_size; - if (WARN_ON_ONCE(atomic && n != length)) { + if (WARN_ON_ONCE((bio_opf & REQ_ATOMIC) && n != length)) { /* - * This bio should have covered the complete length, + * An atomic write bio must cover the complete length, * which it doesn't, so error. We may need to zero out * the tail (complete FS block), similar to when * bio_iov_iter_get_pages() returns an error, above. 
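[Aside: the REQ_ATOMIC path above backs torn-write protection for direct I/O. From userspace such writes are requested with RWF_ATOMIC via pwritev2(); a minimal sketch, assuming the offset and length respect the atomic-write limits reported by statx() — the helper name is illustrative:]

#define _GNU_SOURCE
#include <sys/uio.h>
#include <fcntl.h>

#ifndef RWF_ATOMIC
#define RWF_ATOMIC	0x00000040	/* value per include/uapi/linux/fs.h */
#endif

/* Sketch: issue one write that must not be torn (typically on an O_DIRECT fd). */
static ssize_t atomic_write(int fd, const void *buf, size_t len, off_t off)
{
	struct iovec iov = { .iov_base = (void *)buf, .iov_len = len };

	return pwritev2(fd, &iov, 1, off, RWF_ATOMIC);
}
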
@@ -427,12 +463,10 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter, bio_put(bio); goto zero_tail; } - if (dio->flags & IOMAP_DIO_WRITE) { + if (dio->flags & IOMAP_DIO_WRITE) task_io_account_write(n); - } else { - if (dio->flags & IOMAP_DIO_DIRTY) - bio_set_pages_dirty(bio); - } + else if (dio->flags & IOMAP_DIO_DIRTY) + bio_set_pages_dirty(bio); dio->size += n; copied += n; @@ -467,30 +501,28 @@ out: /* Undo iter limitation to current extent */ iov_iter_reexpand(dio->submit.iter, orig_count - copied); if (copied) - return copied; + return iomap_iter_advance(iter, &copied); return ret; } -static loff_t iomap_dio_hole_iter(const struct iomap_iter *iter, - struct iomap_dio *dio) +static int iomap_dio_hole_iter(struct iomap_iter *iter, struct iomap_dio *dio) { loff_t length = iov_iter_zero(iomap_length(iter), dio->submit.iter); dio->size += length; if (!length) return -EFAULT; - return length; + return iomap_iter_advance(iter, &length); } -static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi, - struct iomap_dio *dio) +static int iomap_dio_inline_iter(struct iomap_iter *iomi, struct iomap_dio *dio) { const struct iomap *iomap = &iomi->iomap; struct iov_iter *iter = dio->submit.iter; void *inline_data = iomap_inline_data(iomap, iomi->pos); loff_t length = iomap_length(iomi); loff_t pos = iomi->pos; - size_t copied; + u64 copied; if (WARN_ON_ONCE(!iomap_inline_data_valid(iomap))) return -EIO; @@ -512,11 +544,10 @@ static loff_t iomap_dio_inline_iter(const struct iomap_iter *iomi, dio->size += copied; if (!copied) return -EFAULT; - return copied; + return iomap_iter_advance(iomi, &copied); } -static loff_t iomap_dio_iter(const struct iomap_iter *iter, - struct iomap_dio *dio) +static int iomap_dio_iter(struct iomap_iter *iter, struct iomap_dio *dio) { switch (iter->iomap.type) { case IOMAP_HOLE: @@ -610,9 +641,6 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, if (iocb->ki_flags & IOCB_NOWAIT) iomi.flags |= IOMAP_NOWAIT; - if (iocb->ki_flags & IOCB_ATOMIC) - iomi.flags |= IOMAP_ATOMIC; - if (iov_iter_rw(iter) == READ) { /* reads can always complete inline */ dio->flags |= IOMAP_DIO_INLINE_COMP; @@ -647,6 +675,9 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, iomi.flags |= IOMAP_OVERWRITE_ONLY; } + if (iocb->ki_flags & IOCB_ATOMIC) + iomi.flags |= IOMAP_ATOMIC; + /* for data sync or sync, we need sync completion processing */ if (iocb_is_dsync(iocb)) { dio->flags |= IOMAP_DIO_NEED_SYNC; @@ -700,7 +731,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter, blk_start_plug(&plug); while ((ret = iomap_iter(&iomi, ops)) > 0) { - iomi.processed = iomap_dio_iter(&iomi, dio); + iomi.status = iomap_dio_iter(&iomi, dio); /* * We can only poll for single bio I/Os. 
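[Aside: common to all of the conversions in this series, iteration bodies no longer return a byte count through iter.processed; they return a status and advance the iterator explicitly. A minimal sketch of the new convention — my_fs_iter is a hypothetical body, not from the patch:]

/* Sketch: an iteration body under the status/advance convention. */
static int my_fs_iter(struct iomap_iter *iter)
{
	u64 bytes = iomap_length(iter);		/* bytes left in this mapping */

	if (iter->iomap.type == IOMAP_HOLE)
		return iomap_iter_advance(iter, &bytes);	/* consume it all */

	/* ... operate on [iter->pos, iter->pos + bytes) ... */

	return iomap_iter_advance(iter, &bytes);	/* 0 or -errno */
}

/* Driving loops now store a status, not a byte count: */
while ((ret = iomap_iter(&iter, ops)) > 0)
	iter.status = my_fs_iter(&iter);
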
diff --git a/fs/iomap/fiemap.c b/fs/iomap/fiemap.c index 610ca6f1ec9b..80675c42e94e 100644 --- a/fs/iomap/fiemap.c +++ b/fs/iomap/fiemap.c @@ -39,24 +39,23 @@ static int iomap_to_fiemap(struct fiemap_extent_info *fi, iomap->length, flags); } -static loff_t iomap_fiemap_iter(const struct iomap_iter *iter, +static int iomap_fiemap_iter(struct iomap_iter *iter, struct fiemap_extent_info *fi, struct iomap *prev) { int ret; if (iter->iomap.type == IOMAP_HOLE) - return iomap_length(iter); + goto advance; ret = iomap_to_fiemap(fi, prev, 0); *prev = iter->iomap; - switch (ret) { - case 0: /* success */ - return iomap_length(iter); - case 1: /* extent array full */ - return 0; - default: /* error */ + if (ret < 0) return ret; - } + if (ret == 1) /* extent array full */ + return 0; + +advance: + return iomap_iter_advance_full(iter); } int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, @@ -78,7 +77,7 @@ int iomap_fiemap(struct inode *inode, struct fiemap_extent_info *fi, return ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_fiemap_iter(&iter, fi, &prev); + iter.status = iomap_fiemap_iter(&iter, fi, &prev); if (prev.type != IOMAP_HOLE) { ret = iomap_to_fiemap(fi, &prev, FIEMAP_EXTENT_LAST); @@ -114,7 +113,7 @@ iomap_bmap(struct address_space *mapping, sector_t bno, while ((ret = iomap_iter(&iter, ops)) > 0) { if (iter.iomap.type == IOMAP_MAPPED) bno = iomap_sector(&iter.iomap, iter.pos) >> blkshift; - /* leave iter.processed unset to abort loop */ + /* leave iter.status unset to abort loop */ } if (ret) return 0; diff --git a/fs/iomap/internal.h b/fs/iomap/internal.h new file mode 100644 index 000000000000..f6992a3bf66a --- /dev/null +++ b/fs/iomap/internal.h @@ -0,0 +1,10 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _IOMAP_INTERNAL_H +#define _IOMAP_INTERNAL_H 1 + +#define IOEND_BATCH_SIZE 4096 + +u32 iomap_finish_ioend_buffered(struct iomap_ioend *ioend); +u32 iomap_finish_ioend_direct(struct iomap_ioend *ioend); + +#endif /* _IOMAP_INTERNAL_H */ diff --git a/fs/iomap/ioend.c b/fs/iomap/ioend.c new file mode 100644 index 000000000000..18894ebba6db --- /dev/null +++ b/fs/iomap/ioend.c @@ -0,0 +1,216 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2024-2025 Christoph Hellwig. + */ +#include <linux/iomap.h> +#include <linux/list_sort.h> +#include "internal.h" + +struct bio_set iomap_ioend_bioset; +EXPORT_SYMBOL_GPL(iomap_ioend_bioset); + +struct iomap_ioend *iomap_init_ioend(struct inode *inode, + struct bio *bio, loff_t file_offset, u16 ioend_flags) +{ + struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); + + atomic_set(&ioend->io_remaining, 1); + ioend->io_error = 0; + ioend->io_parent = NULL; + INIT_LIST_HEAD(&ioend->io_list); + ioend->io_flags = ioend_flags; + ioend->io_inode = inode; + ioend->io_offset = file_offset; + ioend->io_size = bio->bi_iter.bi_size; + ioend->io_sector = bio->bi_iter.bi_sector; + ioend->io_private = NULL; + return ioend; +} +EXPORT_SYMBOL_GPL(iomap_init_ioend); + +static u32 iomap_finish_ioend(struct iomap_ioend *ioend, int error) +{ + if (ioend->io_parent) { + struct bio *bio = &ioend->io_bio; + + ioend = ioend->io_parent; + bio_put(bio); + } + + if (error) + cmpxchg(&ioend->io_error, 0, error); + + if (!atomic_dec_and_test(&ioend->io_remaining)) + return 0; + if (ioend->io_flags & IOMAP_IOEND_DIRECT) + return iomap_finish_ioend_direct(ioend); + return iomap_finish_ioend_buffered(ioend); +} + +/* + * Ioend completion routine for merged bios. 
This can only be called from task + * contexts as merged ioends can be of unbound length. Hence we have to break up + * the writeback completions into manageable chunks to avoid long scheduler + * holdoffs. We aim to keep scheduler holdoffs down below 10ms so that we get + * good batch processing throughput without creating adverse scheduler latency + * conditions. + */ +void iomap_finish_ioends(struct iomap_ioend *ioend, int error) +{ + struct list_head tmp; + u32 completions; + + might_sleep(); + + list_replace_init(&ioend->io_list, &tmp); + completions = iomap_finish_ioend(ioend, error); + + while (!list_empty(&tmp)) { + if (completions > IOEND_BATCH_SIZE * 8) { + cond_resched(); + completions = 0; + } + ioend = list_first_entry(&tmp, struct iomap_ioend, io_list); + list_del_init(&ioend->io_list); + completions += iomap_finish_ioend(ioend, error); + } +} +EXPORT_SYMBOL_GPL(iomap_finish_ioends); + +/* + * We can merge two adjacent ioends if they have the same set of work to do. + */ +static bool iomap_ioend_can_merge(struct iomap_ioend *ioend, + struct iomap_ioend *next) +{ + if (ioend->io_bio.bi_status != next->io_bio.bi_status) + return false; + if (next->io_flags & IOMAP_IOEND_BOUNDARY) + return false; + if ((ioend->io_flags & IOMAP_IOEND_NOMERGE_FLAGS) != + (next->io_flags & IOMAP_IOEND_NOMERGE_FLAGS)) + return false; + if (ioend->io_offset + ioend->io_size != next->io_offset) + return false; + /* + * Do not merge physically discontiguous ioends. The filesystem + * completion functions will have to iterate the physical + * discontiguities even if we merge the ioends at a logical level, so + * we don't gain anything by merging physical discontiguities here. + * + * We cannot use bio->bi_iter.bi_sector here as it is modified during + * submission so does not point to the start sector of the bio at + * completion. + */ + if (ioend->io_sector + (ioend->io_size >> SECTOR_SHIFT) != + next->io_sector) + return false; + return true; +} + +void iomap_ioend_try_merge(struct iomap_ioend *ioend, + struct list_head *more_ioends) +{ + struct iomap_ioend *next; + + INIT_LIST_HEAD(&ioend->io_list); + + while ((next = list_first_entry_or_null(more_ioends, struct iomap_ioend, + io_list))) { + if (!iomap_ioend_can_merge(ioend, next)) + break; + list_move_tail(&next->io_list, &ioend->io_list); + ioend->io_size += next->io_size; + } +} +EXPORT_SYMBOL_GPL(iomap_ioend_try_merge); + +static int iomap_ioend_compare(void *priv, const struct list_head *a, + const struct list_head *b) +{ + struct iomap_ioend *ia = container_of(a, struct iomap_ioend, io_list); + struct iomap_ioend *ib = container_of(b, struct iomap_ioend, io_list); + + if (ia->io_offset < ib->io_offset) + return -1; + if (ia->io_offset > ib->io_offset) + return 1; + return 0; +} + +void iomap_sort_ioends(struct list_head *ioend_list) +{ + list_sort(NULL, ioend_list, iomap_ioend_compare); +} +EXPORT_SYMBOL_GPL(iomap_sort_ioends); + +/* + * Split up to the first @max_len bytes from @ioend if the ioend covers more + * than @max_len bytes. + * + * If @is_append is set, the split will be based on the hardware limits for + * REQ_OP_ZONE_APPEND commands and can be less than @max_len if the hardware + * limits don't allow the entire @max_len length. + * + * The bio embedded into @ioend must be a REQ_OP_WRITE because the block layer + * does not allow splitting REQ_OP_ZONE_APPEND bios. The file systems has to + * switch the operation after this call, but before submitting the bio. 
+ */ +struct iomap_ioend *iomap_split_ioend(struct iomap_ioend *ioend, + unsigned int max_len, bool is_append) +{ + struct bio *bio = &ioend->io_bio; + struct iomap_ioend *split_ioend; + unsigned int nr_segs; + int sector_offset; + struct bio *split; + + if (is_append) { + struct queue_limits *lim = bdev_limits(bio->bi_bdev); + + max_len = min(max_len, + lim->max_zone_append_sectors << SECTOR_SHIFT); + + sector_offset = bio_split_rw_at(bio, lim, &nr_segs, max_len); + if (unlikely(sector_offset < 0)) + return ERR_PTR(sector_offset); + if (!sector_offset) + return NULL; + } else { + if (bio->bi_iter.bi_size <= max_len) + return NULL; + sector_offset = max_len >> SECTOR_SHIFT; + } + + /* ensure the split ioend is still block size aligned */ + sector_offset = ALIGN_DOWN(sector_offset << SECTOR_SHIFT, + i_blocksize(ioend->io_inode)) >> SECTOR_SHIFT; + + split = bio_split(bio, sector_offset, GFP_NOFS, &iomap_ioend_bioset); + if (IS_ERR(split)) + return ERR_CAST(split); + split->bi_private = bio->bi_private; + split->bi_end_io = bio->bi_end_io; + + split_ioend = iomap_init_ioend(ioend->io_inode, split, ioend->io_offset, + ioend->io_flags); + split_ioend->io_parent = ioend; + + atomic_inc(&ioend->io_remaining); + ioend->io_offset += split_ioend->io_size; + ioend->io_size -= split_ioend->io_size; + + split_ioend->io_sector = ioend->io_sector; + if (!is_append) + ioend->io_sector += (split_ioend->io_size >> SECTOR_SHIFT); + return split_ioend; +} +EXPORT_SYMBOL_GPL(iomap_split_ioend); + +static int __init iomap_ioend_init(void) +{ + return bioset_init(&iomap_ioend_bioset, 4 * (PAGE_SIZE / SECTOR_SIZE), + offsetof(struct iomap_ioend, io_bio), + BIOSET_NEED_BVECS); +} +fs_initcall(iomap_ioend_init); diff --git a/fs/iomap/iter.c b/fs/iomap/iter.c index 3790918646af..6ffc6a7b9ba5 100644 --- a/fs/iomap/iter.c +++ b/fs/iomap/iter.c @@ -7,40 +7,25 @@ #include <linux/iomap.h> #include "trace.h" -/* - * Advance to the next range we need to map. - * - * If the iomap is marked IOMAP_F_STALE, it means the existing map was not fully - * processed - it was aborted because the extent the iomap spanned may have been - * changed during the operation. In this case, the iteration behaviour is to - * remap the unprocessed range of the iter, and that means we may need to remap - * even when we've made no progress (i.e. iter->processed = 0). Hence the - * "finished iterating" case needs to distinguish between - * (processed = 0) meaning we are done and (processed = 0 && stale) meaning we - * need to remap the entire remaining range. - */ -static inline int iomap_iter_advance(struct iomap_iter *iter) +static inline void iomap_iter_reset_iomap(struct iomap_iter *iter) { - bool stale = iter->iomap.flags & IOMAP_F_STALE; - int ret = 1; - - /* handle the previous iteration (if any) */ - if (iter->iomap.length) { - if (iter->processed < 0) - return iter->processed; - if (WARN_ON_ONCE(iter->processed > iomap_length(iter))) - return -EIO; - iter->pos += iter->processed; - iter->len -= iter->processed; - if (!iter->len || (!iter->processed && !stale)) - ret = 0; - } - - /* clear the per iteration state */ - iter->processed = 0; + iter->status = 0; memset(&iter->iomap, 0, sizeof(iter->iomap)); memset(&iter->srcmap, 0, sizeof(iter->srcmap)); - return ret; +} + +/* + * Advance the current iterator position and output the length remaining for the + * current mapping. 
+ */ +int iomap_iter_advance(struct iomap_iter *iter, u64 *count) +{ + if (WARN_ON_ONCE(*count > iomap_length(iter))) + return -EIO; + iter->pos += *count; + iter->len -= *count; + *count = iomap_length(iter); + return 0; } static inline void iomap_iter_done(struct iomap_iter *iter) @@ -50,6 +35,8 @@ static inline void iomap_iter_done(struct iomap_iter *iter) WARN_ON_ONCE(iter->iomap.offset + iter->iomap.length <= iter->pos); WARN_ON_ONCE(iter->iomap.flags & IOMAP_F_STALE); + iter->iter_start_pos = iter->pos; + trace_iomap_iter_dstmap(iter->inode, &iter->iomap); if (iter->srcmap.type != IOMAP_HOLE) trace_iomap_iter_srcmap(iter->inode, &iter->srcmap); @@ -67,26 +54,58 @@ static inline void iomap_iter_done(struct iomap_iter *iter) * function must be called in a loop that continues as long it returns a * positive value. If 0 or a negative value is returned, the caller must not * return to the loop body. Within a loop body, there are two ways to break out - * of the loop body: leave @iter.processed unchanged, or set it to a negative + * of the loop body: leave @iter.status unchanged, or set it to a negative * errno. */ int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops) { + bool stale = iter->iomap.flags & IOMAP_F_STALE; + ssize_t advanced; + u64 olen; int ret; - if (iter->iomap.length && ops->iomap_end) { - ret = ops->iomap_end(iter->inode, iter->pos, iomap_length(iter), - iter->processed > 0 ? iter->processed : 0, - iter->flags, &iter->iomap); - if (ret < 0 && !iter->processed) + trace_iomap_iter(iter, ops, _RET_IP_); + + if (!iter->iomap.length) + goto begin; + + /* + * Calculate how far the iter was advanced and the original length bytes + * for ->iomap_end(). + */ + advanced = iter->pos - iter->iter_start_pos; + olen = iter->len + advanced; + + if (ops->iomap_end) { + ret = ops->iomap_end(iter->inode, iter->iter_start_pos, + iomap_length_trim(iter, iter->iter_start_pos, + olen), + advanced, iter->flags, &iter->iomap); + if (ret < 0 && !advanced) return ret; } - trace_iomap_iter(iter, ops, _RET_IP_); - ret = iomap_iter_advance(iter); + /* detect old return semantics where this would advance */ + if (WARN_ON_ONCE(iter->status > 0)) + iter->status = -EIO; + + /* + * Use iter->len to determine whether to continue onto the next mapping. + * Explicitly terminate on error status or if the current iter has not + * advanced at all (i.e. no work was done for some reason) unless the + * mapping has been marked stale and needs to be reprocessed. 
+ */ + if (iter->status < 0) + ret = iter->status; + else if (iter->len == 0 || (!advanced && !stale)) + ret = 0; + else + ret = 1; + iomap_iter_reset_iomap(iter); if (ret <= 0) return ret; +begin: ret = ops->iomap_begin(iter->inode, iter->pos, iter->len, iter->flags, &iter->iomap, &iter->srcmap); if (ret < 0) diff --git a/fs/iomap/seek.c b/fs/iomap/seek.c index a845c012b50c..04d7919636c1 100644 --- a/fs/iomap/seek.c +++ b/fs/iomap/seek.c @@ -10,7 +10,7 @@ #include <linux/pagemap.h> #include <linux/pagevec.h> -static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter, +static int iomap_seek_hole_iter(struct iomap_iter *iter, loff_t *hole_pos) { loff_t length = iomap_length(iter); @@ -20,13 +20,13 @@ static loff_t iomap_seek_hole_iter(const struct iomap_iter *iter, *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, iter->pos, iter->pos + length, SEEK_HOLE); if (*hole_pos == iter->pos + length) - return length; + return iomap_iter_advance(iter, &length); return 0; case IOMAP_HOLE: *hole_pos = iter->pos; return 0; default: - return length; + return iomap_iter_advance(iter, &length); } } @@ -47,7 +47,7 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops) iter.len = size - pos; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_seek_hole_iter(&iter, &pos); + iter.status = iomap_seek_hole_iter(&iter, &pos); if (ret < 0) return ret; if (iter.len) /* found hole before EOF */ @@ -56,19 +56,19 @@ iomap_seek_hole(struct inode *inode, loff_t pos, const struct iomap_ops *ops) } EXPORT_SYMBOL_GPL(iomap_seek_hole); -static loff_t iomap_seek_data_iter(const struct iomap_iter *iter, +static int iomap_seek_data_iter(struct iomap_iter *iter, loff_t *hole_pos) { loff_t length = iomap_length(iter); switch (iter->iomap.type) { case IOMAP_HOLE: - return length; + return iomap_iter_advance(iter, &length); case IOMAP_UNWRITTEN: *hole_pos = mapping_seek_hole_data(iter->inode->i_mapping, iter->pos, iter->pos + length, SEEK_DATA); if (*hole_pos < 0) - return length; + return iomap_iter_advance(iter, &length); return 0; default: *hole_pos = iter->pos; @@ -93,7 +93,7 @@ iomap_seek_data(struct inode *inode, loff_t pos, const struct iomap_ops *ops) iter.len = size - pos; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_seek_data_iter(&iter, &pos); + iter.status = iomap_seek_data_iter(&iter, &pos); if (ret < 0) return ret; if (iter.len) /* found data before EOF */ diff --git a/fs/iomap/swapfile.c b/fs/iomap/swapfile.c index b90d0eda9e51..c1a762c10ce4 100644 --- a/fs/iomap/swapfile.c +++ b/fs/iomap/swapfile.c @@ -94,7 +94,7 @@ static int iomap_swapfile_fail(struct iomap_swapfile_info *isi, const char *str) * swap only cares about contiguous page-aligned physical extents and makes no * distinction between written and unwritten extents. 
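
iomap_swapfile_iter() now ends with iomap_iter_advance_full(), but its real work is the accumulation it performs first: a mapping is only handed to iomap_swapfile_add_extent() once the next one stops being physically contiguous with the pending extent. A standalone sketch of that merge-or-flush pattern; the extent struct and add_extent() here are simplified stand-ins, not the real iomap types.

#include <stdint.h>
#include <stdio.h>

struct extent { uint64_t addr, length; }; /* physical start, byte length */

/* Stand-in for iomap_swapfile_add_extent(): here we just report it. */
static int add_extent(const struct extent *e)
{
	printf("swap extent: addr=%llu len=%llu\n",
	       (unsigned long long)e->addr, (unsigned long long)e->length);
	return 0;
}

/* Merge-or-flush: extend the pending extent while the next mapping is
 * physically contiguous, otherwise emit it and start a new one. */
static int accumulate(struct extent *pending, const struct extent *next)
{
	if (pending->length &&
	    pending->addr + pending->length == next->addr) {
		pending->length += next->length;
		return 0;
	}
	if (pending->length) {
		int err = add_extent(pending);
		if (err)
			return err;
	}
	*pending = *next;
	return 0;
}

int main(void)
{
	struct extent pending = { 0, 0 };
	const struct extent maps[] = {
		{ 4096, 4096 }, { 8192, 8192 },   /* contiguous: merged */
		{ 65536, 4096 },                  /* gap: flush previous */
	};

	for (unsigned i = 0; i < sizeof(maps) / sizeof(maps[0]); i++)
		accumulate(&pending, &maps[i]);
	if (pending.length)
		add_extent(&pending); /* final flush, as the activation path
				       * does after its loop */
	return 0;
}
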
*/ -static loff_t iomap_swapfile_iter(const struct iomap_iter *iter, +static int iomap_swapfile_iter(struct iomap_iter *iter, struct iomap *iomap, struct iomap_swapfile_info *isi) { switch (iomap->type) { @@ -132,7 +132,8 @@ static loff_t iomap_swapfile_iter(const struct iomap_iter *iter, return error; memcpy(&isi->iomap, iomap, sizeof(isi->iomap)); } - return iomap_length(iter); + + return iomap_iter_advance_full(iter); } /* @@ -166,7 +167,7 @@ int iomap_swapfile_activate(struct swap_info_struct *sis, return ret; while ((ret = iomap_iter(&iter, ops)) > 0) - iter.processed = iomap_swapfile_iter(&iter, &iter.iomap, &isi); + iter.status = iomap_swapfile_iter(&iter, &iter.iomap, &isi); if (ret < 0) return ret; diff --git a/fs/iomap/trace.h b/fs/iomap/trace.h index 4118a42cdab0..455cc6f90be0 100644 --- a/fs/iomap/trace.h +++ b/fs/iomap/trace.h @@ -99,7 +99,11 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued); { IOMAP_FAULT, "FAULT" }, \ { IOMAP_DIRECT, "DIRECT" }, \ { IOMAP_NOWAIT, "NOWAIT" }, \ - { IOMAP_ATOMIC, "ATOMIC" } + { IOMAP_OVERWRITE_ONLY, "OVERWRITE_ONLY" }, \ + { IOMAP_UNSHARE, "UNSHARE" }, \ + { IOMAP_DAX, "DAX" }, \ + { IOMAP_ATOMIC, "ATOMIC" }, \ + { IOMAP_DONTCACHE, "DONTCACHE" } #define IOMAP_F_FLAGS_STRINGS \ { IOMAP_F_NEW, "NEW" }, \ @@ -107,7 +111,14 @@ DEFINE_RANGE_EVENT(iomap_dio_rw_queued); { IOMAP_F_SHARED, "SHARED" }, \ { IOMAP_F_MERGED, "MERGED" }, \ { IOMAP_F_BUFFER_HEAD, "BH" }, \ - { IOMAP_F_SIZE_CHANGED, "SIZE_CHANGED" } + { IOMAP_F_XATTR, "XATTR" }, \ + { IOMAP_F_BOUNDARY, "BOUNDARY" }, \ + { IOMAP_F_ANON_WRITE, "ANON_WRITE" }, \ + { IOMAP_F_ATOMIC_BIO, "ATOMIC_BIO" }, \ + { IOMAP_F_PRIVATE, "PRIVATE" }, \ + { IOMAP_F_SIZE_CHANGED, "SIZE_CHANGED" }, \ + { IOMAP_F_STALE, "STALE" } + #define IOMAP_DIO_STRINGS \ {IOMAP_DIO_FORCE_WAIT, "DIO_FORCE_WAIT" }, \ @@ -138,7 +149,7 @@ DECLARE_EVENT_CLASS(iomap_class, __entry->bdev = iomap->bdev ? iomap->bdev->bd_dev : 0; ), TP_printk("dev %d:%d ino 0x%llx bdev %d:%d addr 0x%llx offset 0x%llx " - "length 0x%llx type %s flags %s", + "length 0x%llx type %s (0x%x) flags %s (0x%x)", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, MAJOR(__entry->bdev), MINOR(__entry->bdev), @@ -146,7 +157,9 @@ DECLARE_EVENT_CLASS(iomap_class, __entry->offset, __entry->length, __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS), - __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS)) + __entry->type, + __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS), + __entry->flags) ) #define DEFINE_IOMAP_EVENT(name) \ @@ -185,7 +198,7 @@ TRACE_EVENT(iomap_writepage_map, __entry->bdev = iomap->bdev ? 
iomap->bdev->bd_dev : 0; ), TP_printk("dev %d:%d ino 0x%llx bdev %d:%d pos 0x%llx dirty len 0x%llx " - "addr 0x%llx offset 0x%llx length 0x%llx type %s flags %s", + "addr 0x%llx offset 0x%llx length 0x%llx type %s (0x%x) flags %s (0x%x)", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, MAJOR(__entry->bdev), MINOR(__entry->bdev), @@ -195,7 +208,9 @@ TRACE_EVENT(iomap_writepage_map, __entry->offset, __entry->length, __print_symbolic(__entry->type, IOMAP_TYPE_STRINGS), - __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS)) + __entry->type, + __print_flags(__entry->flags, "|", IOMAP_F_FLAGS_STRINGS), + __entry->flags) ); TRACE_EVENT(iomap_iter, @@ -207,7 +222,7 @@ TRACE_EVENT(iomap_iter, __field(u64, ino) __field(loff_t, pos) __field(u64, length) - __field(s64, processed) + __field(int, status) __field(unsigned int, flags) __field(const void *, ops) __field(unsigned long, caller) @@ -217,17 +232,17 @@ TRACE_EVENT(iomap_iter, __entry->ino = iter->inode->i_ino; __entry->pos = iter->pos; __entry->length = iomap_length(iter); - __entry->processed = iter->processed; + __entry->status = iter->status; __entry->flags = iter->flags; __entry->ops = ops; __entry->caller = caller; ), - TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx processed %lld flags %s (0x%x) ops %ps caller %pS", + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx length 0x%llx status %d flags %s (0x%x) ops %ps caller %pS", MAJOR(__entry->dev), MINOR(__entry->dev), __entry->ino, __entry->pos, __entry->length, - __entry->processed, + __entry->status, __print_flags(__entry->flags, "|", IOMAP_FLAGS_STRINGS), __entry->flags, __entry->ops, diff --git a/fs/isofs/dir.c b/fs/isofs/dir.c index eb2f8273e6f1..09df40b612fb 100644 --- a/fs/isofs/dir.c +++ b/fs/isofs/dir.c @@ -147,7 +147,8 @@ static int do_isofs_readdir(struct inode *inode, struct file *file, de = tmpde; } /* Basic sanity check, whether name doesn't exceed dir entry */ - if (de_len < de->name_len[0] + + if (de_len < sizeof(struct iso_directory_record) || + de_len < de->name_len[0] + sizeof(struct iso_directory_record)) { printk(KERN_NOTICE "iso9660: Corrupted directory entry" " in block %lu of inode %lu\n", block, diff --git a/fs/isofs/export.c b/fs/isofs/export.c index 35768a63fb1d..421d247fae52 100644 --- a/fs/isofs/export.c +++ b/fs/isofs/export.c @@ -180,7 +180,7 @@ static struct dentry *isofs_fh_to_parent(struct super_block *sb, return NULL; return isofs_export_iget(sb, - fh_len > 2 ? ifid->parent_block : 0, + fh_len > 3 ? ifid->parent_block : 0, ifid->parent_offset, fh_len > 4 ? ifid->parent_generation : 0); } diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index e8e80761ac73..7203d2d2624d 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -57,8 +57,8 @@ static void journal_end_buffer_io_sync(struct buffer_head *bh, int uptodate) * So here, we have a buffer which has just come off the forget list. Look to * see if we can strip all buffers from the backing page. * - * Called under lock_journal(), and possibly under journal_datalist_lock. The - * caller provided us with a ref against the buffer, and we drop that here. + * Called under j_list_lock. The caller provided us with a ref against the + * buffer, and we drop that here. 
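
The isofs readdir fix above applies the standard rule for length-prefixed on-disk records: first check that the declared length covers the fixed header, and only then trust header fields such as name_len when validating the variable part. A standalone sketch of that ordering, using a simplified record layout rather than the real iso_directory_record:

#include <stdint.h>
#include <stddef.h>
#include <stdio.h>

struct record {            /* simplified: fixed header + inline name */
	uint8_t len;       /* total record length, attacker-controlled */
	uint8_t name_len;  /* length of name[] */
	char name[];
};

static int parse(const uint8_t *buf, size_t bufsize)
{
	const struct record *r = (const void *)buf;

	if (bufsize < sizeof(*r))
		return -1;               /* cannot even read the header */
	/* Step 1: the declared length must cover the fixed header,
	 * otherwise name_len itself is not trustworthy. */
	if (r->len < sizeof(*r) || r->len > bufsize)
		return -1;
	/* Step 2: only now is it safe to use name_len. */
	if (r->len < sizeof(*r) + r->name_len)
		return -1;
	printf("name: %.*s\n", r->name_len, r->name);
	return 0;
}

int main(void)
{
	uint8_t good[16] = { 6, 4, 'r', 'o', 'o', 't' };
	uint8_t bad[16]  = { 1, 200 };   /* len too small for the header */

	printf("good: %d\n", parse(good, sizeof(good)));
	printf("bad:  %d\n", parse(bad, sizeof(bad)));
	return 0;
}
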
*/ static void release_buffer_page(struct buffer_head *bh) { @@ -99,7 +99,7 @@ static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh) h->h_chksum_type = 0; h->h_chksum_size = 0; h->h_chksum[0] = 0; - csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); + csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize); h->h_chksum[0] = cpu_to_be32(csum); } @@ -330,8 +330,8 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, seq = cpu_to_be32(sequence); addr = kmap_local_folio(bh->b_folio, bh_offset(bh)); - csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); - csum32 = jbd2_chksum(j, csum32, addr, bh->b_size); + csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); + csum32 = jbd2_chksum(csum32, addr, bh->b_size); kunmap_local(addr); if (jbd2_has_feature_csum3(j)) @@ -738,10 +738,8 @@ start_journal_io: err = journal_finish_inode_data_buffers(journal, commit_transaction); if (err) { printk(KERN_WARNING - "JBD2: Detected IO errors while flushing file data " - "on %s\n", journal->j_devname); - if (journal->j_flags & JBD2_ABORT_ON_SYNCDATA_ERR) - jbd2_journal_abort(journal, err); + "JBD2: Detected IO errors %d while flushing file data on %s\n", + err, journal->j_devname); err = 0; } diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index d8084b31b361..6d5e76848733 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -83,7 +83,7 @@ EXPORT_SYMBOL(jbd2_log_wait_commit); EXPORT_SYMBOL(jbd2_journal_start_commit); EXPORT_SYMBOL(jbd2_journal_force_commit_nested); EXPORT_SYMBOL(jbd2_journal_wipe); -EXPORT_SYMBOL(jbd2_journal_blocks_per_page); +EXPORT_SYMBOL(jbd2_journal_blocks_per_folio); EXPORT_SYMBOL(jbd2_journal_invalidate_folio); EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); @@ -115,14 +115,14 @@ void __jbd2_debug(int level, const char *file, const char *func, #endif /* Checksumming functions */ -static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) +static __be32 jbd2_superblock_csum(journal_superblock_t *sb) { __u32 csum; __be32 old_csum; old_csum = sb->s_checksum; sb->s_checksum = 0; - csum = jbd2_chksum(j, ~0, (char *)sb, sizeof(journal_superblock_t)); + csum = jbd2_chksum(~0, (char *)sb, sizeof(journal_superblock_t)); sb->s_checksum = old_csum; return cpu_to_be32(csum); @@ -197,7 +197,7 @@ loop: if (journal->j_commit_sequence != journal->j_commit_request) { jbd2_debug(1, "OK, requests differ\n"); write_unlock(&journal->j_state_lock); - del_timer_sync(&journal->j_commit_timer); + timer_delete_sync(&journal->j_commit_timer); jbd2_journal_commit_transaction(journal); write_lock(&journal->j_state_lock); goto loop; @@ -246,7 +246,7 @@ loop: goto loop; end_loop: - del_timer_sync(&journal->j_commit_timer); + timer_delete_sync(&journal->j_commit_timer); journal->j_task = NULL; wake_up(&journal->j_wait_done_commit); jbd2_debug(1, "Journal thread exiting.\n"); @@ -603,7 +603,7 @@ int jbd2_journal_start_commit(journal_t *journal, tid_t *ptid) int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid) { int ret = 0; - transaction_t *commit_trans; + transaction_t *commit_trans, *running_trans; if (!(journal->j_flags & JBD2_BARRIER)) return 0; @@ -613,6 +613,16 @@ int jbd2_trans_will_send_data_barrier(journal_t *journal, tid_t tid) goto out; commit_trans = journal->j_committing_transaction; if (!commit_trans || commit_trans->t_tid != tid) { + running_trans = journal->j_running_transaction; + /* + * The query transaction hasn't started 
committing, + * it must still be running. + */ + if (WARN_ON_ONCE(!running_trans || + running_trans->t_tid != tid)) + goto out; + + running_trans->t_need_data_flush = 1; ret = 1; goto out; } @@ -718,7 +728,6 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid) } journal->j_flags |= JBD2_FAST_COMMIT_ONGOING; write_unlock(&journal->j_state_lock); - jbd2_journal_lock_updates(journal); return 0; } @@ -732,7 +741,6 @@ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback) { if (journal->j_fc_cleanup_callback) journal->j_fc_cleanup_callback(journal, 0, tid); - jbd2_journal_unlock_updates(journal); write_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING; if (fallback) @@ -947,7 +955,7 @@ int jbd2_journal_bmap(journal_t *journal, unsigned long blocknr, * descriptor blocks we do need to generate bona fide buffers. * * After the caller of jbd2_journal_get_descriptor_buffer() has finished modifying - * the buffer's contents they really should run flush_dcache_page(bh->b_page). + * the buffer's contents they really should run flush_dcache_folio(bh->b_folio). * But we don't bother doing that, so there will be coherency problems with * mmaps of blockdevs which hold live JBD-controlled filesystems. */ @@ -992,7 +1000,7 @@ void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh) tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - sizeof(struct jbd2_journal_block_tail)); tail->t_checksum = 0; - csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); + csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize); tail->t_checksum = cpu_to_be32(csum); } @@ -1361,7 +1369,7 @@ static int journal_check_superblock(journal_t *journal) return err; } - if (jbd2_journal_has_csum_v2or3_feature(journal) && + if (jbd2_journal_has_csum_v2or3(journal) && jbd2_has_feature_checksum(journal)) { /* Can't have checksum v1 and v2 on at the same time! 
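
With the journal_t argument gone, jbd2_chksum() is just crc32c with an explicit seed, and the block-tag checksum above chains two calls: first over the sequence number, then over the block, feeding the first result in as the seed of the second. A self-contained bitwise model of that chaining, assuming the reflected CRC-32C polynomial 0x82F63B78 with no final inversion; this shows how the seed threads through, it is not the optimized kernel implementation.

#include <stdint.h>
#include <stddef.h>
#include <string.h>
#include <stdio.h>

/* Bitwise reflected CRC-32C (Castagnoli). The seed argument lets calls
 * be chained in the style of jbd2_chksum(seed, buf, len). */
static uint32_t crc32c(uint32_t crc, const void *buf, size_t len)
{
	const uint8_t *p = buf;

	while (len--) {
		crc ^= *p++;
		for (int i = 0; i < 8; i++)
			crc = (crc >> 1) ^ (0x82F63B78 & -(crc & 1));
	}
	return crc;
}

int main(void)
{
	uint32_t seq = 42;           /* stands in for the commit sequence */
	uint8_t block[16] = { 0 };   /* stands in for a journal block */
	uint8_t both[20];

	/* Chained: checksum of (seq || block) computed in two calls. */
	uint32_t csum = crc32c(~0u, &seq, sizeof(seq));
	csum = crc32c(csum, block, sizeof(block));
	printf("chained=%08x\n", csum);

	/* The same bytes in one call give the same result; that property
	 * is what makes seed chaining work. */
	memcpy(both, &seq, sizeof(seq));
	memcpy(both + sizeof(seq), block, sizeof(block));
	printf("one-shot=%08x\n", crc32c(~0u, both, sizeof(both)));
	return 0;
}
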
*/ printk(KERN_ERR "JBD2: Can't enable checksumming v1 and v2/3 " @@ -1369,14 +1377,14 @@ static int journal_check_superblock(journal_t *journal) return err; } - if (jbd2_journal_has_csum_v2or3_feature(journal)) { + if (jbd2_journal_has_csum_v2or3(journal)) { if (sb->s_checksum_type != JBD2_CRC32C_CHKSUM) { printk(KERN_ERR "JBD2: Unknown checksum type\n"); return err; } /* Check superblock checksum */ - if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) { + if (sb->s_checksum != jbd2_superblock_csum(sb)) { printk(KERN_ERR "JBD2: journal checksum error\n"); err = -EFSBADCRC; return err; @@ -1482,7 +1490,7 @@ static int journal_load_superblock(journal_t *journal) journal->j_total_len = be32_to_cpu(sb->s_maxlen); /* Precompute checksum seed for all metadata */ if (jbd2_journal_has_csum_v2or3(journal)) - journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + journal->j_csum_seed = jbd2_chksum(~0, sb->s_uuid, sizeof(sb->s_uuid)); /* After journal features are set, we can compute transaction limits */ jbd2_journal_init_transaction_limits(journal); @@ -1811,7 +1819,7 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags) set_buffer_uptodate(bh); } if (jbd2_journal_has_csum_v2or3(journal)) - sb->s_checksum = jbd2_superblock_csum(journal, sb); + sb->s_checksum = jbd2_superblock_csum(sb); get_bh(bh); bh->b_end_io = end_buffer_write_sync; submit_bh(REQ_OP_WRITE | write_flags, bh); @@ -1869,7 +1877,6 @@ int jbd2_journal_update_sb_log_tail(journal_t *journal, tid_t tail_tid, /* Log is no longer empty */ write_lock(&journal->j_state_lock); - WARN_ON(!sb->s_sequence); journal->j_flags &= ~JBD2_FLUSHED; write_unlock(&journal->j_state_lock); @@ -1965,17 +1972,15 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) return err; } - if (block_start == ~0ULL) { - block_start = phys_block; - block_stop = block_start - 1; - } + if (block_start == ~0ULL) + block_stop = block_start = phys_block; /* * last block not contiguous with current block, * process last contiguous region and return to this block on * next loop */ - if (phys_block != block_stop + 1) { + if (phys_block != block_stop) { block--; } else { block_stop++; @@ -1994,11 +1999,10 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) */ byte_start = block_start * journal->j_blocksize; byte_stop = block_stop * journal->j_blocksize; - byte_count = (block_stop - block_start + 1) * - journal->j_blocksize; + byte_count = (block_stop - block_start) * journal->j_blocksize; truncate_inode_pages_range(journal->j_dev->bd_mapping, - byte_start, byte_stop); + byte_start, byte_stop - 1); if (flags & JBD2_JOURNAL_FLUSH_DISCARD) { err = blkdev_issue_discard(journal->j_dev, @@ -2013,7 +2017,7 @@ static int __jbd2_journal_erase(journal_t *journal, unsigned int flags) } if (unlikely(err != 0)) { - pr_err("JBD2: (error %d) unable to wipe journal at physical blocks %llu - %llu", + pr_err("JBD2: (error %d) unable to wipe journal at physical blocks [%llu, %llu)", err, block_start, block_stop); return err; } @@ -2332,7 +2336,7 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat, sb->s_checksum_type = JBD2_CRC32C_CHKSUM; sb->s_feature_compat &= ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); - journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + journal->j_csum_seed = jbd2_chksum(~0, sb->s_uuid, sizeof(sb->s_uuid)); } @@ -2651,9 +2655,10 @@ void jbd2_journal_ack_err(journal_t *journal) write_unlock(&journal->j_state_lock); } -int jbd2_journal_blocks_per_page(struct inode 
*inode) +int jbd2_journal_blocks_per_folio(struct inode *inode) { - return 1 << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); + return 1 << (PAGE_SHIFT + mapping_max_folio_order(inode->i_mapping) - + inode->i_sb->s_blocksize_bits); } /* diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index 9192be7c19d8..cac8c2cd4a92 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -39,7 +39,7 @@ struct recovery_info static int do_one_pass(journal_t *journal, struct recovery_info *info, enum passtype pass); -static int scan_revoke_records(journal_t *, struct buffer_head *, +static int scan_revoke_records(journal_t *, enum passtype, struct buffer_head *, tid_t, struct recovery_info *); #ifdef __KERNEL__ @@ -65,9 +65,8 @@ static void journal_brelse_array(struct buffer_head *b[], int n) */ #define MAXBUF 8 -static int do_readahead(journal_t *journal, unsigned int start) +static void do_readahead(journal_t *journal, unsigned int start) { - int err; unsigned int max, nbufs, next; unsigned long long blocknr; struct buffer_head *bh; @@ -85,7 +84,7 @@ static int do_readahead(journal_t *journal, unsigned int start) nbufs = 0; for (next = start; next < max; next++) { - err = jbd2_journal_bmap(journal, next, &blocknr); + int err = jbd2_journal_bmap(journal, next, &blocknr); if (err) { printk(KERN_ERR "JBD2: bad block at offset %u\n", @@ -94,10 +93,8 @@ static int do_readahead(journal_t *journal, unsigned int start) } bh = __getblk(journal->j_dev, blocknr, journal->j_blocksize); - if (!bh) { - err = -ENOMEM; + if (!bh) goto failed; - } if (!buffer_uptodate(bh) && !buffer_locked(bh)) { bufs[nbufs++] = bh; @@ -112,12 +109,10 @@ static int do_readahead(journal_t *journal, unsigned int start) if (nbufs) bh_readahead_batch(nbufs, bufs, 0); - err = 0; failed: if (nbufs) journal_brelse_array(bufs, nbufs); - return err; } #endif /* __KERNEL__ */ @@ -190,7 +185,7 @@ static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf) j->j_blocksize - sizeof(struct jbd2_journal_block_tail)); provided = tail->t_checksum; tail->t_checksum = 0; - calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); + calculated = jbd2_chksum(j->j_csum_seed, buf, j->j_blocksize); tail->t_checksum = provided; return provided == cpu_to_be32(calculated); @@ -287,19 +282,20 @@ static int fc_do_one_pass(journal_t *journal, int jbd2_journal_recover(journal_t *journal) { int err, err2; - journal_superblock_t * sb; - struct recovery_info info; memset(&info, 0, sizeof(info)); - sb = journal->j_superblock; /* * The journal superblock's s_start field (the current log head) * is always zero if, and only if, the journal was cleanly - * unmounted. + * unmounted. We use its in-memory version j_tail here because + * jbd2_journal_wipe() could have updated it without updating journal + * superblock. 
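
The __jbd2_journal_erase() rework earlier in this patch switches the contiguous-run tracking to half-open ranges: block_stop now points one past the last block of the run, so byte_count is (stop - start) * blocksize, the page-cache truncation ends at byte_stop - 1, and the error message prints [start, stop). A sketch of coalescing a block list into such runs; the block size is assumed and erase_range() stands in for the discard/zero-out step.

#include <stdint.h>
#include <stdio.h>

#define BLOCKSIZE 4096ULL

/* Stand-in for discarding/zeroing one contiguous physical run. */
static void erase_range(uint64_t start, uint64_t stop) /* half-open */
{
	uint64_t byte_start = start * BLOCKSIZE;
	uint64_t byte_count = (stop - start) * BLOCKSIZE;

	printf("erase blocks [%llu, %llu): %llu bytes at %llu\n",
	       (unsigned long long)start, (unsigned long long)stop,
	       (unsigned long long)byte_count,
	       (unsigned long long)byte_start);
}

int main(void)
{
	/* Physical blocks of consecutive log blocks; 108 breaks the run. */
	const uint64_t phys[] = { 100, 101, 102, 108, 109 };
	uint64_t start = phys[0], stop = phys[0] + 1;

	for (unsigned i = 1; i < sizeof(phys) / sizeof(phys[0]); i++) {
		if (phys[i] == stop) {    /* contiguous: extend the run */
			stop++;
			continue;
		}
		erase_range(start, stop); /* gap: flush and restart */
		start = phys[i];
		stop = phys[i] + 1;
	}
	erase_range(start, stop);
	return 0;
}
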
*/ - if (!sb->s_start) { + if (!journal->j_tail) { + journal_superblock_t *sb = journal->j_superblock; + jbd2_debug(1, "No recovery required, last transaction %d, head block %u\n", be32_to_cpu(sb->s_sequence), be32_to_cpu(sb->s_head)); journal->j_transaction_sequence = be32_to_cpu(sb->s_sequence) + 1; @@ -327,6 +323,12 @@ int jbd2_journal_recover(journal_t *journal) journal->j_transaction_sequence, journal->j_head); jbd2_journal_clear_revoke(journal); + /* Free revoke table allocated for replay */ + if (journal->j_revoke != journal->j_revoke_table[0] && + journal->j_revoke != journal->j_revoke_table[1]) { + jbd2_journal_destroy_revoke_table(journal->j_revoke); + journal->j_revoke = journal->j_revoke_table[1]; + } err2 = sync_blockdev(journal->j_fs_dev); if (!err) err = err2; @@ -438,7 +440,7 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) h = buf; provided = h->h_chksum[0]; h->h_chksum[0] = 0; - calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); + calculated = jbd2_chksum(j->j_csum_seed, buf, j->j_blocksize); h->h_chksum[0] = provided; return provided == cpu_to_be32(calculated); @@ -459,7 +461,7 @@ static bool jbd2_commit_block_csum_verify_partial(journal_t *j, void *buf) h = tmpbuf; provided = h->h_chksum[0]; h->h_chksum[0] = 0; - calculated = jbd2_chksum(j, j->j_csum_seed, tmpbuf, j->j_blocksize); + calculated = jbd2_chksum(j->j_csum_seed, tmpbuf, j->j_blocksize); kfree(tmpbuf); return provided == cpu_to_be32(calculated); @@ -476,8 +478,8 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, return 1; seq = cpu_to_be32(sequence); - csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); - csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); + csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); + csum32 = jbd2_chksum(csum32, buf, j->j_blocksize); if (jbd2_has_feature_csum3(j)) return tag3->t_checksum == cpu_to_be32(csum32); @@ -612,6 +614,31 @@ static int do_one_pass(journal_t *journal, first_commit_ID = next_commit_ID; if (pass == PASS_SCAN) info->start_transaction = first_commit_ID; + else if (pass == PASS_REVOKE) { + /* + * Would the default revoke table have too long hash chains + * during replay? + */ + if (info->nr_revokes > JOURNAL_REVOKE_DEFAULT_HASH * 16) { + unsigned int hash_size; + + /* + * Aim for average chain length of 8, limit at 1M + * entries to avoid problems with malicious + * filesystems. + */ + hash_size = min(roundup_pow_of_two(info->nr_revokes / 8), + 1U << 20); + journal->j_revoke = + jbd2_journal_init_revoke_table(hash_size); + if (!journal->j_revoke) { + printk(KERN_ERR + "JBD2: failed to allocate revoke table for replay with %u entries. " + "Journal replay may be slow.\n", hash_size); + journal->j_revoke = journal->j_revoke_table[1]; + } + } + } jbd2_debug(1, "Starting recovery pass %d\n", pass); @@ -852,6 +879,13 @@ chksum_ok: case JBD2_REVOKE_BLOCK: /* + * If we aren't in the SCAN or REVOKE pass, then we can + * just skip over this block. + */ + if (pass != PASS_REVOKE && pass != PASS_SCAN) + continue; + + /* * Check revoke block crc in pass_scan, if csum verify * failed, check commit block time later. */ @@ -863,12 +897,7 @@ chksum_ok: need_check_commit_time = true; } - /* If we aren't in the REVOKE pass, then we can - * just skip over this block. 
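
The PASS_REVOKE sizing above targets an average hash-chain length of eight and caps the table at 2^20 entries, so a corrupted or malicious journal cannot force an outsized allocation. The same arithmetic as a standalone helper; roundup_pow_of_two() is a portable stand-in for the kernel macro, and 256 is used as the default table size purely for illustration.

#include <stdio.h>

/* Portable stand-in for the kernel's roundup_pow_of_two(). */
static unsigned int roundup_pow_of_two(unsigned int n)
{
	unsigned int p = 1;

	while (p < n)
		p <<= 1;
	return p;
}

/* Aim for average chain length 8, clamp to [default_size, 1M] entries. */
static unsigned int revoke_hash_size(unsigned int nr_revokes,
				     unsigned int default_size)
{
	unsigned int hash_size;

	if (nr_revokes <= default_size * 16)
		return default_size;      /* default chains stay short */
	hash_size = roundup_pow_of_two(nr_revokes / 8);
	if (hash_size > (1u << 20))
		hash_size = 1u << 20;     /* cap against hostile images */
	return hash_size;
}

int main(void)
{
	printf("%u\n", revoke_hash_size(1000, 256));      /* 256 */
	printf("%u\n", revoke_hash_size(100000, 256));    /* 16384 */
	printf("%u\n", revoke_hash_size(400000000, 256)); /* 1048576, capped */
	return 0;
}
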
*/ - if (pass != PASS_REVOKE) - continue; - - err = scan_revoke_records(journal, bh, + err = scan_revoke_records(journal, pass, bh, next_commit_ID, info); if (err) goto failed; @@ -922,8 +951,9 @@ chksum_ok: /* Scan a revoke record, marking all blocks mentioned as revoked. */ -static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, - tid_t sequence, struct recovery_info *info) +static int scan_revoke_records(journal_t *journal, enum passtype pass, + struct buffer_head *bh, tid_t sequence, + struct recovery_info *info) { jbd2_journal_revoke_header_t *header; int offset, max; @@ -944,6 +974,11 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, if (jbd2_has_feature_64bit(journal)) record_len = 8; + if (pass == PASS_SCAN) { + info->nr_revokes += (max - offset) / record_len; + return 0; + } + while (offset + record_len <= max) { unsigned long long blocknr; int err; @@ -956,7 +991,6 @@ static int scan_revoke_records(journal_t *journal, struct buffer_head *bh, err = jbd2_journal_set_revoke(journal, blocknr, sequence); if (err) return err; - ++info->nr_revokes; } return 0; } diff --git a/fs/jbd2/revoke.c b/fs/jbd2/revoke.c index ce63d5fde9c3..1467f6790747 100644 --- a/fs/jbd2/revoke.c +++ b/fs/jbd2/revoke.c @@ -215,7 +215,7 @@ int __init jbd2_journal_init_revoke_table_cache(void) return 0; } -static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) +struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) { int shift = 0; int tmp = hash_size; @@ -231,7 +231,7 @@ static struct jbd2_revoke_table_s *jbd2_journal_init_revoke_table(int hash_size) table->hash_size = hash_size; table->hash_shift = shift; table->hash_table = - kmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL); + kvmalloc_array(hash_size, sizeof(struct list_head), GFP_KERNEL); if (!table->hash_table) { kmem_cache_free(jbd2_revoke_table_cache, table); table = NULL; @@ -245,7 +245,7 @@ out: return table; } -static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) +void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) { int i; struct list_head *hash_list; @@ -255,7 +255,7 @@ static void jbd2_journal_destroy_revoke_table(struct jbd2_revoke_table_s *table) J_ASSERT(list_empty(hash_list)); } - kfree(table->hash_table); + kvfree(table->hash_table); kmem_cache_free(jbd2_revoke_table_cache, table); } @@ -345,7 +345,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, bh = bh_in; if (!bh) { - bh = __find_get_block(bdev, blocknr, journal->j_blocksize); + bh = __find_get_block_nonatomic(bdev, blocknr, + journal->j_blocksize); if (bh) BUFFER_TRACE(bh, "found on hash"); } @@ -355,7 +356,8 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, /* If there is a different buffer_head lying around in * memory anywhere... */ - bh2 = __find_get_block(bdev, blocknr, journal->j_blocksize); + bh2 = __find_get_block_nonatomic(bdev, blocknr, + journal->j_blocksize); if (bh2) { /* ... and it has RevokeValid status... */ if (bh2 != bh && buffer_revokevalid(bh2)) @@ -420,12 +422,11 @@ int jbd2_journal_revoke(handle_t *handle, unsigned long long blocknr, * do not trust the Revoked bit on buffers unless RevokeValid is also * set. 
*/ -int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) +void jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) { struct jbd2_revoke_record_s *record; journal_t *journal = handle->h_transaction->t_journal; int need_cancel; - int did_revoke = 0; /* akpm: debug */ struct buffer_head *bh = jh2bh(jh); jbd2_debug(4, "journal_head %p, cancelling revoke\n", jh); @@ -450,7 +451,6 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) list_del(&record->hash); spin_unlock(&journal->j_revoke_lock); kmem_cache_free(jbd2_revoke_record_cache, record); - did_revoke = 1; } } @@ -466,18 +466,18 @@ int jbd2_journal_cancel_revoke(handle_t *handle, struct journal_head *jh) * state machine will get very upset later on. */ if (need_cancel) { struct buffer_head *bh2; - bh2 = __find_get_block(bh->b_bdev, bh->b_blocknr, bh->b_size); + bh2 = __find_get_block_nonatomic(bh->b_bdev, bh->b_blocknr, + bh->b_size); if (bh2) { if (bh2 != bh) clear_buffer_revoked(bh2); __brelse(bh2); } } - return did_revoke; } /* - * journal_clear_revoked_flag clears revoked flag of buffers in + * jbd2_clear_buffer_revoked_flags clears revoked flag of buffers in * revoke table to reflect there is no revoked buffers in the next * transaction which is going to be started. */ @@ -495,9 +495,9 @@ void jbd2_clear_buffer_revoked_flags(journal_t *journal) struct jbd2_revoke_record_s *record; struct buffer_head *bh; record = (struct jbd2_revoke_record_s *)list_entry; - bh = __find_get_block(journal->j_fs_dev, - record->blocknr, - journal->j_blocksize); + bh = __find_get_block_nonatomic(journal->j_fs_dev, + record->blocknr, + journal->j_blocksize); if (bh) { clear_buffer_revoked(bh); __brelse(bh); @@ -506,9 +506,9 @@ void jbd2_clear_buffer_revoked_flags(journal_t *journal) } } -/* journal_switch_revoke table select j_revoke for next transaction - * we do not want to suspend any processing until all revokes are - * written -bzzz +/* jbd2_journal_switch_revoke_table table select j_revoke for next + * transaction we do not want to suspend any processing until all + * revokes are written -bzzz */ void jbd2_journal_switch_revoke_table(journal_t *journal) { diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index 66513c18ca29..c7867139af69 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -92,7 +92,6 @@ static void jbd2_get_transaction(journal_t *journal, atomic_set(&transaction->t_outstanding_revokes, 0); atomic_set(&transaction->t_handle_count, 0); INIT_LIST_HEAD(&transaction->t_inode_list); - INIT_LIST_HEAD(&transaction->t_private_list); /* Set up the commit timer for the new transaction. */ journal->j_commit_timer.expires = round_jiffies_up(transaction->t_expires); @@ -114,12 +113,9 @@ static void jbd2_get_transaction(journal_t *journal, */ /* - * Update transaction's maximum wait time, if debugging is enabled. - * * t_max_wait is carefully updated here with use of atomic compare exchange. * Note that there could be multiplre threads trying to do this simultaneously * hence using cmpxchg to avoid any use of locks in this case. - * With this t_max_wait can be updated w/o enabling jbd2_journal_enable_debug. 
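
update_t_max_wait() keeps a lock-free running maximum: sample the current value, and only attempt the swap when the new wait time is larger, retrying if another CPU won the race in between. The same shape with C11 atomics; a model of the cmpxchg loop, not the kernel's code.

#include <stdatomic.h>
#include <stdio.h>

static _Atomic unsigned long t_max_wait;

/* Lock-free running maximum, same shape as the kernel's
 * "read; while (new > old) try cmpxchg" loop. */
static void update_max_wait(unsigned long ts)
{
	unsigned long old = atomic_load(&t_max_wait);

	while (ts > old) {
		/* On failure, old is reloaded with the current value,
		 * so the ts > old test is naturally re-evaluated. */
		if (atomic_compare_exchange_weak(&t_max_wait, &old, ts))
			break;
	}
}

int main(void)
{
	update_max_wait(10);
	update_max_wait(7);   /* smaller: leaves the maximum alone */
	update_max_wait(42);
	printf("max wait: %lu\n", atomic_load(&t_max_wait));
	return 0;
}
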
*/ static inline void update_t_max_wait(transaction_t *transaction, unsigned long ts) @@ -1513,7 +1509,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) jh->b_next_transaction == transaction); spin_unlock(&jh->b_state_lock); } - if (jh->b_modified == 1) { + if (data_race(jh->b_modified == 1)) { /* If it's in our transaction it must be in BJ_Metadata list. */ if (data_race(jh->b_transaction == transaction && jh->b_jlist != BJ_Metadata)) { @@ -1532,7 +1528,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) goto out; } - journal = transaction->t_journal; spin_lock(&jh->b_state_lock); if (is_handle_aborted(handle)) { @@ -1547,6 +1542,8 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) goto out_unlock_bh; } + journal = transaction->t_journal; + if (jh->b_modified == 0) { /* * This buffer's got modified and becoming part @@ -2079,21 +2076,6 @@ static void __jbd2_journal_unfile_buffer(struct journal_head *jh) jh->b_transaction = NULL; } -void jbd2_journal_unfile_buffer(journal_t *journal, struct journal_head *jh) -{ - struct buffer_head *bh = jh2bh(jh); - - /* Get reference so that buffer cannot be freed before we unlock it */ - get_bh(bh); - spin_lock(&jh->b_state_lock); - spin_lock(&journal->j_list_lock); - __jbd2_journal_unfile_buffer(jh); - spin_unlock(&journal->j_list_lock); - spin_unlock(&jh->b_state_lock); - jbd2_journal_put_journal_head(jh); - __brelse(bh); -} - /** * jbd2_journal_try_to_free_buffers() - try to free page buffers. * @journal: journal for operation @@ -2192,7 +2174,7 @@ static int __dispose_buffer(struct journal_head *jh, transaction_t *transaction) /* * We don't want to write the buffer anymore, clear the * bit so that we don't confuse checks in - * __journal_file_buffer + * __jbd2_journal_file_buffer */ clear_buffer_dirty(bh); __jbd2_journal_file_buffer(jh, transaction, BJ_Forget); diff --git a/fs/jffs2/dir.c b/fs/jffs2/dir.c index 2b2938970da3..dd91f725ded6 100644 --- a/fs/jffs2/dir.c +++ b/fs/jffs2/dir.c @@ -32,8 +32,8 @@ static int jffs2_link (struct dentry *,struct inode *,struct dentry *); static int jffs2_unlink (struct inode *,struct dentry *); static int jffs2_symlink (struct mnt_idmap *, struct inode *, struct dentry *, const char *); -static int jffs2_mkdir (struct mnt_idmap *, struct inode *,struct dentry *, - umode_t); +static struct dentry *jffs2_mkdir (struct mnt_idmap *, struct inode *,struct dentry *, + umode_t); static int jffs2_rmdir (struct inode *,struct dentry *); static int jffs2_mknod (struct mnt_idmap *, struct inode *,struct dentry *, umode_t,dev_t); @@ -446,8 +446,8 @@ static int jffs2_symlink (struct mnt_idmap *idmap, struct inode *dir_i, } -static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, - struct dentry *dentry, umode_t mode) +static struct dentry *jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, + struct dentry *dentry, umode_t mode) { struct jffs2_inode_info *f, *dir_f; struct jffs2_sb_info *c; @@ -464,7 +464,7 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, ri = jffs2_alloc_raw_inode(); if (!ri) - return -ENOMEM; + return ERR_PTR(-ENOMEM); c = JFFS2_SB_INFO(dir_i->i_sb); @@ -477,7 +477,7 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, if (ret) { jffs2_free_raw_inode(ri); - return ret; + return ERR_PTR(ret); } inode = jffs2_new_inode(dir_i, mode, ri); @@ -485,7 +485,7 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, if (IS_ERR(inode)) { 
jffs2_free_raw_inode(ri); jffs2_complete_reservation(c); - return PTR_ERR(inode); + return ERR_CAST(inode); } inode->i_op = &jffs2_dir_inode_operations; @@ -584,11 +584,11 @@ static int jffs2_mkdir (struct mnt_idmap *idmap, struct inode *dir_i, jffs2_complete_reservation(c); d_instantiate_new(dentry, inode); - return 0; + return NULL; fail: iget_failed(inode); - return ret; + return ERR_PTR(ret); } static int jffs2_rmdir (struct inode *dir_i, struct dentry *dentry) diff --git a/fs/jffs2/wbuf.c b/fs/jffs2/wbuf.c index 4061e0ba7010..bb815a002984 100644 --- a/fs/jffs2/wbuf.c +++ b/fs/jffs2/wbuf.c @@ -584,7 +584,7 @@ static int __jffs2_flush_wbuf(struct jffs2_sb_info *c, int pad) size_t retlen; /* Nothing to do if not write-buffering the flash. In particular, we shouldn't - del_timer() the timer we never initialised. */ + call timer_delete() on the timer we never initialised. */ if (!jffs2_is_writebuffered(c)) return 0; diff --git a/fs/jfs/inode.c b/fs/jfs/inode.c index 07cfdc440596..60fc92dee24d 100644 --- a/fs/jfs/inode.c +++ b/fs/jfs/inode.c @@ -369,7 +369,7 @@ void jfs_truncate_nolock(struct inode *ip, loff_t length) ASSERT(length >= 0); - if (test_cflag(COMMIT_Nolink, ip)) { + if (test_cflag(COMMIT_Nolink, ip) || isReadOnly(ip)) { xtTruncate(0, ip, length, COMMIT_WMAP); return; } diff --git a/fs/jfs/jfs_discard.c b/fs/jfs/jfs_discard.c index 5f4b305030ad..4b660296caf3 100644 --- a/fs/jfs/jfs_discard.c +++ b/fs/jfs/jfs_discard.c @@ -86,7 +86,8 @@ int jfs_ioc_trim(struct inode *ip, struct fstrim_range *range) down_read(&sb->s_umount); bmp = JFS_SBI(ip->i_sb)->bmap; - if (minlen > bmp->db_agsize || + if (bmp == NULL || + minlen > bmp->db_agsize || start >= bmp->db_mapsize || range->len < sb->s_blocksize) { up_read(&sb->s_umount); diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index f9009e4f9ffd..35e063c9f3a4 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -178,41 +178,30 @@ int dbMount(struct inode *ipbmap) dbmp_le = (struct dbmap_disk *) mp->data; bmp->db_mapsize = le64_to_cpu(dbmp_le->dn_mapsize); bmp->db_nfree = le64_to_cpu(dbmp_le->dn_nfree); - bmp->db_l2nbperpage = le32_to_cpu(dbmp_le->dn_l2nbperpage); - if (bmp->db_l2nbperpage > L2PSIZE - L2MINBLOCKSIZE || - bmp->db_l2nbperpage < 0) { - err = -EINVAL; - goto err_release_metapage; - } - bmp->db_numag = le32_to_cpu(dbmp_le->dn_numag); - if (!bmp->db_numag || bmp->db_numag > MAXAG) { - err = -EINVAL; - goto err_release_metapage; - } - bmp->db_maxlevel = le32_to_cpu(dbmp_le->dn_maxlevel); bmp->db_maxag = le32_to_cpu(dbmp_le->dn_maxag); bmp->db_agpref = le32_to_cpu(dbmp_le->dn_agpref); - if (bmp->db_maxag >= MAXAG || bmp->db_maxag < 0 || - bmp->db_agpref >= MAXAG || bmp->db_agpref < 0) { - err = -EINVAL; - goto err_release_metapage; - } - bmp->db_aglevel = le32_to_cpu(dbmp_le->dn_aglevel); bmp->db_agheight = le32_to_cpu(dbmp_le->dn_agheight); bmp->db_agwidth = le32_to_cpu(dbmp_le->dn_agwidth); bmp->db_agstart = le32_to_cpu(dbmp_le->dn_agstart); bmp->db_agl2size = le32_to_cpu(dbmp_le->dn_agl2size); - if (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG || - bmp->db_agl2size < 0) { - err = -EINVAL; - goto err_release_metapage; - } - if (((bmp->db_mapsize - 1) >> bmp->db_agl2size) > MAXAG) { + if ((bmp->db_l2nbperpage > L2PSIZE - L2MINBLOCKSIZE) || + (bmp->db_l2nbperpage < 0) || + !bmp->db_numag || (bmp->db_numag > MAXAG) || + (bmp->db_maxag >= MAXAG) || (bmp->db_maxag < 0) || + (bmp->db_agpref >= MAXAG) || (bmp->db_agpref < 0) || + (bmp->db_agheight < 0) || (bmp->db_agheight > (L2LPERCTL >> 1)) || + (bmp->db_agwidth < 1) || 
(bmp->db_agwidth > (LPERCTL / MAXAG)) || + (bmp->db_agwidth > (1 << (L2LPERCTL - (bmp->db_agheight << 1)))) || + (bmp->db_agstart < 0) || + (bmp->db_agstart > (CTLTREESIZE - 1 - bmp->db_agwidth * (MAXAG - 1))) || + (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG) || + (bmp->db_agl2size < 0) || + ((bmp->db_mapsize - 1) >> bmp->db_agl2size) > MAXAG) { err = -EINVAL; goto err_release_metapage; } @@ -3403,7 +3392,7 @@ int dbExtendFS(struct inode *ipbmap, s64 blkno, s64 nblocks) oldl2agsize = bmp->db_agl2size; bmp->db_agl2size = l2agsize; - bmp->db_agsize = 1 << l2agsize; + bmp->db_agsize = (s64)1 << l2agsize; /* compute new number of AG */ agno = bmp->db_numag; @@ -3666,8 +3655,8 @@ void dbFinalizeBmap(struct inode *ipbmap) * system size is not a multiple of the group size). */ inactfree = (inactags && ag_rem) ? - ((inactags - 1) << bmp->db_agl2size) + ag_rem - : inactags << bmp->db_agl2size; + (((s64)inactags - 1) << bmp->db_agl2size) + ag_rem + : ((s64)inactags << bmp->db_agl2size); /* determine how many free blocks are in the active * allocation groups plus the average number of free blocks diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 8f85177f284b..ab11849cf9cc 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -117,7 +117,8 @@ do { \ if (!(RC)) { \ if (((P)->header.nextindex > \ (((BN) == 0) ? DTROOTMAXSLOT : (P)->header.maxslot)) || \ - ((BN) && ((P)->header.maxslot > DTPAGEMAXSLOT))) { \ + ((BN) && (((P)->header.maxslot > DTPAGEMAXSLOT) || \ + ((P)->header.stblindex >= DTPAGEMAXSLOT)))) { \ BT_PUTPAGE(MP); \ jfs_error((IP)->i_sb, \ "DT_GETPAGE: dtree page corrupt\n"); \ @@ -2612,7 +2613,7 @@ void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot) * fsck.jfs should really fix this, but it currently does not. * Called from jfs_readdir when bad index is detected. 
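
The (s64) casts added in dbExtendFS() and dbFinalizeBmap() above fix a classic C trap: 1 << l2agsize and inactags << db_agl2size are evaluated in 32-bit int, so the result can overflow before the assignment widens it to 64 bits; the cast has to widen an operand, not the result. A demonstration, using unsigned arithmetic so the wraparound is well-defined and visible (with signed int, as in the original jfs code, the overflow is undefined behaviour):

#include <stdint.h>
#include <stdio.h>

int main(void)
{
	/* Block-count style value that already uses the top bits. */
	uint32_t nblocks = 3u << 30;

	/* Broken shape: shift in 32 bits, then widen the wrapped result. */
	uint64_t bad = (uint64_t)(nblocks << 2);

	/* Fixed shape, as in the patch: widen an operand first. */
	uint64_t good = (uint64_t)nblocks << 2;

	printf("bad=%llu good=%llu\n",
	       (unsigned long long)bad, (unsigned long long)good);
	return 0;
}
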
*/ -static void add_missing_indices(struct inode *inode, s64 bn) +static int add_missing_indices(struct inode *inode, s64 bn) { struct ldtentry *d; struct dt_lock *dtlck; @@ -2621,7 +2622,7 @@ static void add_missing_indices(struct inode *inode, s64 bn) struct lv *lv; struct metapage *mp; dtpage_t *p; - int rc; + int rc = 0; s8 *stbl; tid_t tid; struct tlock *tlck; @@ -2646,6 +2647,16 @@ static void add_missing_indices(struct inode *inode, s64 bn) stbl = DT_GETSTBL(p); for (i = 0; i < p->header.nextindex; i++) { + if (stbl[i] < 0) { + jfs_err("jfs: add_missing_indices: Invalid stbl[%d] = %d for inode %ld, block = %lld", + i, stbl[i], (long)inode->i_ino, (long long)bn); + rc = -EIO; + + DT_PUTPAGE(mp); + txAbort(tid, 0); + goto end; + } + d = (struct ldtentry *) &p->slot[stbl[i]]; index = le32_to_cpu(d->index); if ((index < 2) || (index >= JFS_IP(inode)->next_index)) { @@ -2663,6 +2674,7 @@ static void add_missing_indices(struct inode *inode, s64 bn) (void) txCommit(tid, 1, &inode, 0); end: txEnd(tid); + return rc; } /* @@ -3016,7 +3028,8 @@ skip_one: } if (fix_page) { - add_missing_indices(ip, bn); + if ((rc = add_missing_indices(ip, bn))) + goto out; page_fixed = 1; } diff --git a/fs/jfs/jfs_extent.c b/fs/jfs/jfs_extent.c index 63d21822d309..46529bcc8297 100644 --- a/fs/jfs/jfs_extent.c +++ b/fs/jfs/jfs_extent.c @@ -74,6 +74,11 @@ extAlloc(struct inode *ip, s64 xlen, s64 pno, xad_t * xp, bool abnr) int rc; int xflag; + if (isReadOnly(ip)) { + jfs_error(ip->i_sb, "read-only filesystem\n"); + return -EIO; + } + /* This blocks if we are low on resources */ txBeginAnon(ip->i_sb); @@ -253,6 +258,11 @@ int extRecord(struct inode *ip, xad_t * xp) { int rc; + if (isReadOnly(ip)) { + jfs_error(ip->i_sb, "read-only filesystem\n"); + return -EIO; + } + txBeginAnon(ip->i_sb); mutex_lock(&JFS_IP(ip)->commit_mutex); diff --git a/fs/jfs/jfs_imap.c b/fs/jfs/jfs_imap.c index a360b24ed320..ecb8e05b8b84 100644 --- a/fs/jfs/jfs_imap.c +++ b/fs/jfs/jfs_imap.c @@ -102,7 +102,7 @@ int diMount(struct inode *ipimap) * allocate/initialize the in-memory inode map control structure */ /* allocate the in-memory inode map control structure. 
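
The ->mkdir conversions running through this series (jffs2_mkdir above, jfs_mkdir and kernfs_iop_mkdir below) switch the return type from int to struct dentry *, so failures travel as ERR_PTR(err), a type change becomes ERR_CAST(), and NULL means the caller's dentry was instantiated as-is. The encoding works because the top 4095 values of the address space are never valid kernel pointers. A userspace model of the helpers and the calling convention:

#include <stdio.h>
#include <errno.h>

#define MAX_ERRNO 4095

/* Userspace model of the kernel's ERR_PTR()/IS_ERR()/PTR_ERR():
 * small negative errnos live in the top, never-mapped page range. */
static inline void *ERR_PTR(long error) { return (void *)error; }
static inline long PTR_ERR(const void *ptr) { return (long)ptr; }
static inline int IS_ERR(const void *ptr)
{
	return (unsigned long)ptr >= (unsigned long)-MAX_ERRNO;
}

struct dentry { const char *name; };

/* Models the new ->mkdir contract: NULL on success (the dentry passed
 * in was instantiated), ERR_PTR(-E...) on failure. */
static struct dentry *do_mkdir(struct dentry *dentry, int fail)
{
	if (fail)
		return ERR_PTR(-ENOSPC); /* error rides inside the pointer */
	/* ... create the directory and instantiate dentry ... */
	return NULL;
}

int main(void)
{
	struct dentry d = { .name = "subdir" };
	struct dentry *res = do_mkdir(&d, 0);

	if (IS_ERR(res))
		printf("mkdir %s: error %ld\n", d.name, PTR_ERR(res));
	else
		printf("mkdir %s: ok\n", d.name);

	res = do_mkdir(&d, 1);
	if (IS_ERR(res))
		printf("mkdir %s: error %ld\n", d.name, PTR_ERR(res));
	return 0;
}
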
*/ - imap = kmalloc(sizeof(struct inomap), GFP_KERNEL); + imap = kzalloc(sizeof(struct inomap), GFP_KERNEL); if (imap == NULL) return -ENOMEM; @@ -456,7 +456,7 @@ struct inode *diReadSpecial(struct super_block *sb, ino_t inum, int secondary) dp += inum % 8; /* 8 inodes per 4K page */ /* copy on-disk inode to in-memory inode */ - if ((copy_from_dinode(dp, ip)) != 0) { + if ((copy_from_dinode(dp, ip) != 0) || (ip->i_nlink == 0)) { /* handle bad return by returning NULL for ip */ set_nlink(ip, 1); /* Don't want iput() deleting it */ iput(ip); @@ -3029,14 +3029,23 @@ static void duplicateIXtree(struct super_block *sb, s64 blkno, * * RETURN VALUES: * 0 - success - * -ENOMEM - insufficient memory + * -EINVAL - unexpected inode type */ static int copy_from_dinode(struct dinode * dip, struct inode *ip) { struct jfs_inode_info *jfs_ip = JFS_IP(ip); struct jfs_sb_info *sbi = JFS_SBI(ip->i_sb); + int fileset = le32_to_cpu(dip->di_fileset); + + switch (fileset) { + case AGGR_RESERVED_I: case AGGREGATE_I: case BMAP_I: + case LOG_I: case BADBLOCK_I: case FILESYSTEM_I: + break; + default: + return -EINVAL; + } - jfs_ip->fileset = le32_to_cpu(dip->di_fileset); + jfs_ip->fileset = fileset; jfs_ip->mode2 = le32_to_cpu(dip->di_mode); jfs_set_inode_flags(ip); diff --git a/fs/jfs/namei.c b/fs/jfs/namei.c index fc8ede43afde..65a218eba8fa 100644 --- a/fs/jfs/namei.c +++ b/fs/jfs/namei.c @@ -187,13 +187,13 @@ static int jfs_create(struct mnt_idmap *idmap, struct inode *dip, * dentry - dentry of child directory * mode - create mode (rwxrwxrwx). * - * RETURN: Errors from subroutines + * RETURN: ERR_PTR() of errors from subroutines. * * note: * EACCES: user needs search+write permission on the parent directory */ -static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip, - struct dentry *dentry, umode_t mode) +static struct dentry *jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip, + struct dentry *dentry, umode_t mode) { int rc = 0; tid_t tid; /* transaction id */ @@ -308,7 +308,7 @@ static int jfs_mkdir(struct mnt_idmap *idmap, struct inode *dip, out1: jfs_info("jfs_mkdir: rc:%d", rc); - return rc; + return ERR_PTR(rc); } /* diff --git a/fs/jfs/super.c b/fs/jfs/super.c index 223d9ac59839..10368c188c5e 100644 --- a/fs/jfs/super.c +++ b/fs/jfs/super.c @@ -389,8 +389,8 @@ static int jfs_reconfigure(struct fs_context *fc) if (!ctx->newLVSize) { ctx->newLVSize = sb_bdev_nr_blocks(sb); - if (ctx->newLVSize == 0) - pr_err("JFS: Cannot determine volume size\n"); + if (ctx->newLVSize == 0) + pr_err("JFS: Cannot determine volume size\n"); } rc = jfs_extendfs(sb, ctx->newLVSize, 0); @@ -766,7 +766,7 @@ static ssize_t jfs_quota_write(struct super_block *sb, int type, } lock_buffer(bh); memcpy(bh->b_data+offset, data, tocopy); - flush_dcache_page(bh->b_page); + flush_dcache_folio(bh->b_folio); set_buffer_uptodate(bh); mark_buffer_dirty(bh); unlock_buffer(bh); diff --git a/fs/jfs/xattr.c b/fs/jfs/xattr.c index 24afbae87225..11d7f74d207b 100644 --- a/fs/jfs/xattr.c +++ b/fs/jfs/xattr.c @@ -559,11 +559,16 @@ static int ea_get(struct inode *inode, struct ea_buffer *ea_buf, int min_size) size_check: if (EALIST_SIZE(ea_buf->xattr) != ea_size) { - int size = clamp_t(int, ea_size, 0, EALIST_SIZE(ea_buf->xattr)); - - printk(KERN_ERR "ea_get: invalid extended attribute\n"); - print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, - ea_buf->xattr, size, 1); + if (unlikely(EALIST_SIZE(ea_buf->xattr) > INT_MAX)) { + printk(KERN_ERR "ea_get: extended attribute size too large: %u > INT_MAX\n", + EALIST_SIZE(ea_buf->xattr)); + } 
else { + int size = clamp_t(int, ea_size, 0, EALIST_SIZE(ea_buf->xattr)); + + printk(KERN_ERR "ea_get: invalid extended attribute\n"); + print_hex_dump(KERN_ERR, "", DUMP_PREFIX_ADDRESS, 16, 1, + ea_buf->xattr, size, 1); + } ea_release(inode, ea_buf); rc = -EIO; goto clean_up; diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index 5f0f8b95f44c..fc70d72c3fe8 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -17,7 +17,7 @@ #include "kernfs-internal.h" -static DEFINE_RWLOCK(kernfs_rename_lock); /* kn->parent and ->name */ +DEFINE_RWLOCK(kernfs_rename_lock); /* kn->parent and ->name */ /* * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to * call pr_cont() while holding rename_lock. Because sometimes pr_cont() @@ -51,22 +51,14 @@ static bool kernfs_lockdep(struct kernfs_node *kn) #endif } -static int kernfs_name_locked(struct kernfs_node *kn, char *buf, size_t buflen) -{ - if (!kn) - return strscpy(buf, "(null)", buflen); - - return strscpy(buf, kn->parent ? kn->name : "/", buflen); -} - /* kernfs_node_depth - compute depth from @from to @to */ static size_t kernfs_depth(struct kernfs_node *from, struct kernfs_node *to) { size_t depth = 0; - while (to->parent && to != from) { + while (rcu_dereference(to->__parent) && to != from) { depth++; - to = to->parent; + to = rcu_dereference(to->__parent); } return depth; } @@ -84,18 +76,18 @@ static struct kernfs_node *kernfs_common_ancestor(struct kernfs_node *a, db = kernfs_depth(rb->kn, b); while (da > db) { - a = a->parent; + a = rcu_dereference(a->__parent); da--; } while (db > da) { - b = b->parent; + b = rcu_dereference(b->__parent); db--; } /* worst case b and a will be the same at root */ while (b != a) { - b = b->parent; - a = a->parent; + b = rcu_dereference(b->__parent); + a = rcu_dereference(a->__parent); } return a; @@ -168,10 +160,13 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, /* Calculate how many bytes we need for the rest */ for (i = depth_to - 1; i >= 0; i--) { + const char *name; + for (kn = kn_to, j = 0; j < i; j++) - kn = kn->parent; + kn = rcu_dereference(kn->__parent); - len += scnprintf(buf + len, buflen - len, "/%s", kn->name); + name = rcu_dereference(kn->name); + len += scnprintf(buf + len, buflen - len, "/%s", name); } return len; @@ -195,13 +190,18 @@ static int kernfs_path_from_node_locked(struct kernfs_node *kn_to, */ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) { - unsigned long flags; - int ret; + struct kernfs_node *kn_parent; - read_lock_irqsave(&kernfs_rename_lock, flags); - ret = kernfs_name_locked(kn, buf, buflen); - read_unlock_irqrestore(&kernfs_rename_lock, flags); - return ret; + if (!kn) + return strscpy(buf, "(null)", buflen); + + guard(rcu)(); + /* + * KERNFS_ROOT_INVARIANT_PARENT is ignored here. The name is RCU freed and + * the parent is either existing or not. + */ + kn_parent = rcu_dereference(kn->__parent); + return strscpy(buf, kn_parent ? 
rcu_dereference(kn->name) : "/", buflen); } /** @@ -223,13 +223,17 @@ int kernfs_name(struct kernfs_node *kn, char *buf, size_t buflen) int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, char *buf, size_t buflen) { - unsigned long flags; - int ret; + struct kernfs_root *root; - read_lock_irqsave(&kernfs_rename_lock, flags); - ret = kernfs_path_from_node_locked(to, from, buf, buflen); - read_unlock_irqrestore(&kernfs_rename_lock, flags); - return ret; + guard(rcu)(); + if (to) { + root = kernfs_root(to); + if (!(root->flags & KERNFS_ROOT_INVARIANT_PARENT)) { + guard(read_lock_irqsave)(&kernfs_rename_lock); + return kernfs_path_from_node_locked(to, from, buf, buflen); + } + } + return kernfs_path_from_node_locked(to, from, buf, buflen); } EXPORT_SYMBOL_GPL(kernfs_path_from_node); @@ -295,7 +299,7 @@ struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) unsigned long flags; read_lock_irqsave(&kernfs_rename_lock, flags); - parent = kn->parent; + parent = kernfs_parent(kn); kernfs_get(parent); read_unlock_irqrestore(&kernfs_rename_lock, flags); @@ -336,13 +340,13 @@ static int kernfs_name_compare(unsigned int hash, const char *name, return -1; if (ns > kn->ns) return 1; - return strcmp(name, kn->name); + return strcmp(name, kernfs_rcu_name(kn)); } static int kernfs_sd_compare(const struct kernfs_node *left, const struct kernfs_node *right) { - return kernfs_name_compare(left->hash, left->name, left->ns, right); + return kernfs_name_compare(left->hash, kernfs_rcu_name(left), left->ns, right); } /** @@ -360,8 +364,12 @@ static int kernfs_sd_compare(const struct kernfs_node *left, */ static int kernfs_link_sibling(struct kernfs_node *kn) { - struct rb_node **node = &kn->parent->dir.children.rb_node; struct rb_node *parent = NULL; + struct kernfs_node *kn_parent; + struct rb_node **node; + + kn_parent = kernfs_parent(kn); + node = &kn_parent->dir.children.rb_node; while (*node) { struct kernfs_node *pos; @@ -380,13 +388,13 @@ static int kernfs_link_sibling(struct kernfs_node *kn) /* add new node and rebalance the tree */ rb_link_node(&kn->rb, parent, node); - rb_insert_color(&kn->rb, &kn->parent->dir.children); + rb_insert_color(&kn->rb, &kn_parent->dir.children); /* successfully added, account subdir number */ down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (kernfs_type(kn) == KERNFS_DIR) - kn->parent->dir.subdirs++; - kernfs_inc_rev(kn->parent); + kn_parent->dir.subdirs++; + kernfs_inc_rev(kn_parent); up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); return 0; @@ -407,16 +415,19 @@ static int kernfs_link_sibling(struct kernfs_node *kn) */ static bool kernfs_unlink_sibling(struct kernfs_node *kn) { + struct kernfs_node *kn_parent; + if (RB_EMPTY_NODE(&kn->rb)) return false; + kn_parent = kernfs_parent(kn); down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); if (kernfs_type(kn) == KERNFS_DIR) - kn->parent->dir.subdirs--; - kernfs_inc_rev(kn->parent); + kn_parent->dir.subdirs--; + kernfs_inc_rev(kn_parent); up_write(&kernfs_root(kn)->kernfs_iattr_rwsem); - rb_erase(&kn->rb, &kn->parent->dir.children); + rb_erase(&kn->rb, &kn_parent->dir.children); RB_CLEAR_NODE(&kn->rb); return true; } @@ -533,7 +544,8 @@ static void kernfs_free_rcu(struct rcu_head *rcu) { struct kernfs_node *kn = container_of(rcu, struct kernfs_node, rcu); - kfree_const(kn->name); + /* If the whole node goes away, then name can't be used outside */ + kfree_const(rcu_access_pointer(kn->name)); if (kn->iattr) { simple_xattrs_free(&kn->iattr->xattrs, NULL); @@ -562,11 +574,12 @@ void kernfs_put(struct 
kernfs_node *kn) * Moving/renaming is always done while holding reference. * kn->parent won't change beneath us. */ - parent = kn->parent; + parent = kernfs_parent(kn); WARN_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS, "kernfs_put: %s/%s: released with incorrect active_ref %d\n", - parent ? parent->name : "", kn->name, atomic_read(&kn->active)); + parent ? rcu_dereference(parent->name) : "", + rcu_dereference(kn->name), atomic_read(&kn->active)); if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); @@ -643,7 +656,7 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, atomic_set(&kn->active, KN_DEACTIVATED_BIAS); RB_CLEAR_NODE(&kn->rb); - kn->name = name; + rcu_assign_pointer(kn->name, name); kn->mode = mode; kn->flags = flags; @@ -701,7 +714,7 @@ struct kernfs_node *kernfs_new_node(struct kernfs_node *parent, name, mode, uid, gid, flags); if (kn) { kernfs_get(parent); - kn->parent = parent; + rcu_assign_pointer(kn->__parent, parent); } return kn; } @@ -769,18 +782,20 @@ err_unlock: */ int kernfs_add_one(struct kernfs_node *kn) { - struct kernfs_node *parent = kn->parent; - struct kernfs_root *root = kernfs_root(parent); + struct kernfs_root *root = kernfs_root(kn); struct kernfs_iattrs *ps_iattr; + struct kernfs_node *parent; bool has_ns; int ret; down_write(&root->kernfs_rwsem); + parent = kernfs_parent(kn); ret = -EINVAL; has_ns = kernfs_ns_enabled(parent); if (WARN(has_ns != (bool)kn->ns, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", - has_ns ? "required" : "invalid", parent->name, kn->name)) + has_ns ? "required" : "invalid", + kernfs_rcu_name(parent), kernfs_rcu_name(kn))) goto out_unlock; if (kernfs_type(parent) != KERNFS_DIR) @@ -790,7 +805,7 @@ int kernfs_add_one(struct kernfs_node *kn) if (parent->flags & (KERNFS_REMOVING | KERNFS_EMPTY_DIR)) goto out_unlock; - kn->hash = kernfs_name_hash(kn->name, kn->ns); + kn->hash = kernfs_name_hash(kernfs_rcu_name(kn), kn->ns); ret = kernfs_link_sibling(kn); if (ret) @@ -846,7 +861,7 @@ static struct kernfs_node *kernfs_find_ns(struct kernfs_node *parent, if (has_ns != (bool)ns) { WARN(1, KERN_WARNING "kernfs: ns %s in '%s' for '%s'\n", - has_ns ? "required" : "invalid", parent->name, name); + has_ns ? "required" : "invalid", kernfs_rcu_name(parent), name); return NULL; } @@ -949,6 +964,11 @@ struct kernfs_node *kernfs_walk_and_get_ns(struct kernfs_node *parent, return kn; } +unsigned int kernfs_root_flags(struct kernfs_node *kn) +{ + return kernfs_root(kn)->flags; +} + /** * kernfs_create_root - create a new kernfs hierarchy * @scops: optional syscall operations for the hierarchy @@ -1112,7 +1132,7 @@ struct kernfs_node *kernfs_create_empty_dir(struct kernfs_node *parent, static int kernfs_dop_revalidate(struct inode *dir, const struct qstr *name, struct dentry *dentry, unsigned int flags) { - struct kernfs_node *kn; + struct kernfs_node *kn, *parent; struct kernfs_root *root; if (flags & LOOKUP_RCU) @@ -1120,8 +1140,6 @@ static int kernfs_dop_revalidate(struct inode *dir, const struct qstr *name, /* Negative hashed dentry? */ if (d_really_is_negative(dentry)) { - struct kernfs_node *parent; - /* If the kernfs parent node has changed discard and * proceed to ->lookup. * @@ -1163,16 +1181,17 @@ static int kernfs_dop_revalidate(struct inode *dir, const struct qstr *name, if (!kernfs_active(kn)) goto out_bad; + parent = kernfs_parent(kn); /* The kernfs node has been moved? 
*/ - if (kernfs_dentry_node(dentry->d_parent) != kn->parent) + if (kernfs_dentry_node(dentry->d_parent) != parent) goto out_bad; /* The kernfs node has been renamed */ - if (strcmp(dentry->d_name.name, kn->name) != 0) + if (strcmp(dentry->d_name.name, kernfs_rcu_name(kn)) != 0) goto out_bad; /* The kernfs node has been moved to a different namespace */ - if (kn->parent && kernfs_ns_enabled(kn->parent) && + if (parent && kernfs_ns_enabled(parent) && kernfs_info(dentry->d_sb)->ns != kn->ns) goto out_bad; @@ -1230,24 +1249,24 @@ static struct dentry *kernfs_iop_lookup(struct inode *dir, return d_splice_alias(inode, dentry); } -static int kernfs_iop_mkdir(struct mnt_idmap *idmap, - struct inode *dir, struct dentry *dentry, - umode_t mode) +static struct dentry *kernfs_iop_mkdir(struct mnt_idmap *idmap, + struct inode *dir, struct dentry *dentry, + umode_t mode) { struct kernfs_node *parent = dir->i_private; struct kernfs_syscall_ops *scops = kernfs_root(parent)->syscall_ops; int ret; if (!scops || !scops->mkdir) - return -EPERM; + return ERR_PTR(-EPERM); if (!kernfs_get_active(parent)) - return -ENODEV; + return ERR_PTR(-ENODEV); ret = scops->mkdir(parent, dentry->d_name.name, mode); kernfs_put_active(parent); - return ret; + return ERR_PTR(ret); } static int kernfs_iop_rmdir(struct inode *dir, struct dentry *dentry) @@ -1365,7 +1384,7 @@ static struct kernfs_node *kernfs_next_descendant_post(struct kernfs_node *pos, return kernfs_leftmost_descendant(rb_to_kn(rbn)); /* no sibling left, visit parent */ - return pos->parent; + return kernfs_parent(pos); } static void kernfs_activate_one(struct kernfs_node *kn) @@ -1377,7 +1396,7 @@ static void kernfs_activate_one(struct kernfs_node *kn) if (kernfs_active(kn) || (kn->flags & (KERNFS_HIDDEN | KERNFS_REMOVING))) return; - WARN_ON_ONCE(kn->parent && RB_EMPTY_NODE(&kn->rb)); + WARN_ON_ONCE(rcu_access_pointer(kn->__parent) && RB_EMPTY_NODE(&kn->rb)); WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); atomic_sub(KN_DEACTIVATED_BIAS, &kn->active); @@ -1447,7 +1466,7 @@ void kernfs_show(struct kernfs_node *kn, bool show) static void __kernfs_remove(struct kernfs_node *kn) { - struct kernfs_node *pos; + struct kernfs_node *pos, *parent; /* Short-circuit if non-root @kn has already finished removal. */ if (!kn) @@ -1459,10 +1478,10 @@ static void __kernfs_remove(struct kernfs_node *kn) * This is for kernfs_remove_self() which plays with active ref * after removal. */ - if (kn->parent && RB_EMPTY_NODE(&kn->rb)) + if (kernfs_parent(kn) && RB_EMPTY_NODE(&kn->rb)) return; - pr_debug("kernfs %s: removing\n", kn->name); + pr_debug("kernfs %s: removing\n", kernfs_rcu_name(kn)); /* prevent new usage by marking all nodes removing and deactivating */ pos = NULL; @@ -1485,14 +1504,14 @@ static void __kernfs_remove(struct kernfs_node *kn) kernfs_get(pos); kernfs_drain(pos); - + parent = kernfs_parent(pos); /* * kernfs_unlink_sibling() succeeds once per node. Use it * to decide who's responsible for cleanups. */ - if (!pos->parent || kernfs_unlink_sibling(pos)) { + if (!parent || kernfs_unlink_sibling(pos)) { struct kernfs_iattrs *ps_iattr = - pos->parent ? pos->parent->iattr : NULL; + parent ? 
parent->iattr : NULL; /* update timestamps on the parent */ down_write(&kernfs_root(kn)->kernfs_iattr_rwsem); @@ -1718,11 +1737,11 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, { struct kernfs_node *old_parent; struct kernfs_root *root; - const char *old_name = NULL; + const char *old_name; int error; /* can't move or rename root */ - if (!kn->parent) + if (!rcu_access_pointer(kn->__parent)) return -EINVAL; root = kernfs_root(kn); @@ -1733,9 +1752,19 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, (new_parent->flags & KERNFS_EMPTY_DIR)) goto out; + old_parent = kernfs_parent(kn); + if (root->flags & KERNFS_ROOT_INVARIANT_PARENT) { + error = -EINVAL; + if (WARN_ON_ONCE(old_parent != new_parent)) + goto out; + } + error = 0; - if ((kn->parent == new_parent) && (kn->ns == new_ns) && - (strcmp(kn->name, new_name) == 0)) + old_name = kernfs_rcu_name(kn); + if (!new_name) + new_name = old_name; + if ((old_parent == new_parent) && (kn->ns == new_ns) && + (strcmp(old_name, new_name) == 0)) goto out; /* nothing to rename */ error = -EEXIST; @@ -1743,7 +1772,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, goto out; /* rename kernfs_node */ - if (strcmp(kn->name, new_name) != 0) { + if (strcmp(old_name, new_name) != 0) { error = -ENOMEM; new_name = kstrdup_const(new_name, GFP_KERNEL); if (!new_name) @@ -1756,27 +1785,32 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, * Move to the appropriate place in the appropriate directories rbtree. */ kernfs_unlink_sibling(kn); - kernfs_get(new_parent); - /* rename_lock protects ->parent and ->name accessors */ - write_lock_irq(&kernfs_rename_lock); + /* rename_lock protects ->parent accessors */ + if (old_parent != new_parent) { + kernfs_get(new_parent); + write_lock_irq(&kernfs_rename_lock); - old_parent = kn->parent; - kn->parent = new_parent; + rcu_assign_pointer(kn->__parent, new_parent); - kn->ns = new_ns; - if (new_name) { - old_name = kn->name; - kn->name = new_name; - } + kn->ns = new_ns; + if (new_name) + rcu_assign_pointer(kn->name, new_name); - write_unlock_irq(&kernfs_rename_lock); + write_unlock_irq(&kernfs_rename_lock); + kernfs_put(old_parent); + } else { + /* name assignment is RCU protected, parent is the same */ + kn->ns = new_ns; + if (new_name) + rcu_assign_pointer(kn->name, new_name); + } - kn->hash = kernfs_name_hash(kn->name, kn->ns); + kn->hash = kernfs_name_hash(new_name ?: old_name, kn->ns); kernfs_link_sibling(kn); - kernfs_put(old_parent); - kfree_const(old_name); + if (new_name && !is_kernel_rodata((unsigned long)old_name)) + kfree_rcu_mightsleep(old_name); error = 0; out: @@ -1795,7 +1829,8 @@ static struct kernfs_node *kernfs_dir_pos(const void *ns, { if (pos) { int valid = kernfs_active(pos) && - pos->parent == parent && hash == pos->hash; + rcu_access_pointer(pos->__parent) == parent && + hash == pos->hash; kernfs_put(pos); if (!valid) pos = NULL; @@ -1860,7 +1895,7 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) for (pos = kernfs_dir_pos(ns, parent, ctx->pos, pos); pos; pos = kernfs_dir_next_pos(ns, parent, ctx->pos, pos)) { - const char *name = pos->name; + const char *name = kernfs_rcu_name(pos); unsigned int type = fs_umode_to_dtype(pos->mode); int len = strlen(name); ino_t ino = kernfs_ino(pos); @@ -1869,10 +1904,10 @@ static int kernfs_fop_readdir(struct file *file, struct dir_context *ctx) file->private_data = pos; kernfs_get(pos); - up_read(&root->kernfs_rwsem); 
- if (!dir_emit(ctx, name, len, ino, type)) + if (!dir_emit(ctx, name, len, ino, type)) { + up_read(&root->kernfs_rwsem); return 0; - down_read(&root->kernfs_rwsem); + } } up_read(&root->kernfs_rwsem); file->private_data = NULL; diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 0eb320617d7b..66fe8fe41f06 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -911,9 +911,11 @@ repeat: /* kick fsnotify */ down_read(&root->kernfs_supers_rwsem); + down_read(&root->kernfs_rwsem); list_for_each_entry(info, &kernfs_root(kn)->supers, node) { struct kernfs_node *parent; struct inode *p_inode = NULL; + const char *kn_name; struct inode *inode; struct qstr name; @@ -927,7 +929,8 @@ repeat: if (!inode) continue; - name = QSTR(kn->name); + kn_name = kernfs_rcu_name(kn); + name = QSTR(kn_name); parent = kernfs_get_parent(kn); if (parent) { p_inode = ilookup(info->sb, kernfs_ino(parent)); @@ -947,6 +950,7 @@ repeat: iput(inode); } + up_read(&root->kernfs_rwsem); up_read(&root->kernfs_supers_rwsem); kernfs_put(kn); goto repeat; diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index b42ee6547cdc..40a2a9cd819d 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -19,6 +19,8 @@ #include <linux/kernfs.h> #include <linux/fs_context.h> +extern rwlock_t kernfs_rename_lock; + struct kernfs_iattrs { kuid_t ia_uid; kgid_t ia_gid; @@ -64,11 +66,14 @@ struct kernfs_root { * * Return: the kernfs_root @kn belongs to. */ -static inline struct kernfs_root *kernfs_root(struct kernfs_node *kn) +static inline struct kernfs_root *kernfs_root(const struct kernfs_node *kn) { + const struct kernfs_node *knp; /* if parent exists, it's always a dir; otherwise, @sd is a dir */ - if (kn->parent) - kn = kn->parent; + guard(rcu)(); + knp = rcu_dereference(kn->__parent); + if (knp) + kn = knp; return kn->dir.root; } @@ -97,6 +102,32 @@ struct kernfs_super_info { }; #define kernfs_info(SB) ((struct kernfs_super_info *)(SB->s_fs_info)) +static inline bool kernfs_root_is_locked(const struct kernfs_node *kn) +{ + return lockdep_is_held(&kernfs_root(kn)->kernfs_rwsem); +} + +static inline const char *kernfs_rcu_name(const struct kernfs_node *kn) +{ + return rcu_dereference_check(kn->name, kernfs_root_is_locked(kn)); +} + +static inline struct kernfs_node *kernfs_parent(const struct kernfs_node *kn) +{ + /* + * The kernfs_node::__parent remains valid within a RCU section. The kn + * can be reparented (and renamed) which changes the entry. This can be + * avoided by locking kernfs_root::kernfs_rwsem or kernfs_rename_lock. + * Both locks can be used to obtain a reference on __parent. Once the + * reference count reaches 0 then the node is about to be freed + * and can not be renamed (or become a different parent) anymore. + */ + return rcu_dereference_check(kn->__parent, + kernfs_root_is_locked(kn) || + lockdep_is_held(&kernfs_rename_lock) || + !atomic_read(&kn->count)); +} + static inline struct kernfs_node *kernfs_dentry_node(struct dentry *dentry) { if (d_really_is_negative(dentry)) diff --git a/fs/kernfs/mount.c b/fs/kernfs/mount.c index 1358c21837f1..c1719b5778a1 100644 --- a/fs/kernfs/mount.c +++ b/fs/kernfs/mount.c @@ -62,6 +62,21 @@ const struct super_operations kernfs_sops = { .show_options = kernfs_sop_show_options, .show_path = kernfs_sop_show_path, + + /* + * sysfs is built on top of kernfs and sysfs provides the power + * management infrastructure to support suspend/hibernate by + * writing to various files in /sys/power/. 
As filesystems may + * be automatically frozen during suspend/hibernate implementing + * freeze/thaw support for kernfs generically will cause + * deadlocks as the suspending/hibernation initiating task will + * hold a VFS lock that it will then wait upon to be released. + * If freeze/thaw for kernfs is needed talk to the VFS. + */ + .freeze_fs = NULL, + .unfreeze_fs = NULL, + .freeze_super = NULL, + .thaw_super = NULL, }; static int kernfs_encode_fh(struct inode *inode, __u32 *fh, int *max_len, @@ -145,8 +160,10 @@ static struct dentry *kernfs_fh_to_parent(struct super_block *sb, static struct dentry *kernfs_get_parent_dentry(struct dentry *child) { struct kernfs_node *kn = kernfs_dentry_node(child); + struct kernfs_root *root = kernfs_root(kn); - return d_obtain_alias(kernfs_get_inode(child->d_sb, kn->parent)); + guard(rwsem_read)(&root->kernfs_rwsem); + return d_obtain_alias(kernfs_get_inode(child->d_sb, kernfs_parent(kn))); } static const struct export_operations kernfs_export_ops = { @@ -186,10 +203,10 @@ static struct kernfs_node *find_next_ancestor(struct kernfs_node *child, return NULL; } - while (child->parent != parent) { - if (!child->parent) + while (kernfs_parent(child) != parent) { + child = kernfs_parent(child); + if (!child) return NULL; - child = child->parent; } return child; @@ -207,16 +224,27 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, { struct dentry *dentry; struct kernfs_node *knparent; + struct kernfs_root *root; BUG_ON(sb->s_op != &kernfs_sops); dentry = dget(sb->s_root); /* Check if this is the root kernfs_node */ - if (!kn->parent) + if (!rcu_access_pointer(kn->__parent)) return dentry; - knparent = find_next_ancestor(kn, NULL); + root = kernfs_root(kn); + /* + * As long as kn is valid, its parent can not vanish. This is cgroup's + * kn so it can't have its parent replaced. Therefore it is safe to use + * the ancestor node outside of the RCU or locked section. 
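The lockless ancestor walk here is only legal because the root guarantees parents never change. A hedged sketch of how a kernfs user would opt into that rule at root creation (my_scops and the exact flag combination are illustrative, not taken from this patch):

	struct kernfs_root *root;

	root = kernfs_create_root(&my_scops,
				  KERNFS_ROOT_CREATE_DEACTIVATED |
				  KERNFS_ROOT_INVARIANT_PARENT, NULL);
	if (IS_ERR(root))
		return PTR_ERR(root);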
+ */ + if (WARN_ON_ONCE(!(root->flags & KERNFS_ROOT_INVARIANT_PARENT))) + return ERR_PTR(-EINVAL); + scoped_guard(rcu) { + knparent = find_next_ancestor(kn, NULL); + } if (WARN_ON(!knparent)) { dput(dentry); return ERR_PTR(-EINVAL); @@ -225,17 +253,26 @@ struct dentry *kernfs_node_dentry(struct kernfs_node *kn, do { struct dentry *dtmp; struct kernfs_node *kntmp; + const char *name; if (kn == knparent) return dentry; - kntmp = find_next_ancestor(kn, knparent); - if (WARN_ON(!kntmp)) { + + scoped_guard(rwsem_read, &root->kernfs_rwsem) { + kntmp = find_next_ancestor(kn, knparent); + if (WARN_ON(!kntmp)) { + dput(dentry); + return ERR_PTR(-EINVAL); + } + name = kstrdup(kernfs_rcu_name(kntmp), GFP_KERNEL); + } + if (!name) { dput(dentry); - return ERR_PTR(-EINVAL); + return ERR_PTR(-ENOMEM); } - dtmp = lookup_positive_unlocked(kntmp->name, dentry, - strlen(kntmp->name)); + dtmp = lookup_noperm_positive_unlocked(&QSTR(name), dentry); dput(dentry); + kfree(name); if (IS_ERR(dtmp)) return dtmp; knparent = kntmp; diff --git a/fs/kernfs/symlink.c b/fs/kernfs/symlink.c index 45371a70caa7..0bd8a2143723 100644 --- a/fs/kernfs/symlink.c +++ b/fs/kernfs/symlink.c @@ -62,10 +62,10 @@ static int kernfs_get_target_path(struct kernfs_node *parent, /* go up to the root, stop at the base */ base = parent; - while (base->parent) { - kn = target->parent; - while (kn->parent && base != kn) - kn = kn->parent; + while (kernfs_parent(base)) { + kn = kernfs_parent(target); + while (kernfs_parent(kn) && base != kn) + kn = kernfs_parent(kn); if (base == kn) break; @@ -75,14 +75,14 @@ static int kernfs_get_target_path(struct kernfs_node *parent, strcpy(s, "../"); s += 3; - base = base->parent; + base = kernfs_parent(base); } /* determine end of target string for reverse fillup */ kn = target; - while (kn->parent && kn != base) { - len += strlen(kn->name) + 1; - kn = kn->parent; + while (kernfs_parent(kn) && kn != base) { + len += strlen(kernfs_rcu_name(kn)) + 1; + kn = kernfs_parent(kn); } /* check limits */ @@ -94,15 +94,16 @@ static int kernfs_get_target_path(struct kernfs_node *parent, /* reverse fillup of target string from target to base */ kn = target; - while (kn->parent && kn != base) { - int slen = strlen(kn->name); + while (kernfs_parent(kn) && kn != base) { + const char *name = kernfs_rcu_name(kn); + int slen = strlen(name); len -= slen; - memcpy(s + len, kn->name, slen); + memcpy(s + len, name, slen); if (len) s[--len] = '/'; - kn = kn->parent; + kn = kernfs_parent(kn); } return 0; @@ -111,12 +112,13 @@ static int kernfs_get_target_path(struct kernfs_node *parent, static int kernfs_getlink(struct inode *inode, char *path) { struct kernfs_node *kn = inode->i_private; - struct kernfs_node *parent = kn->parent; + struct kernfs_node *parent; struct kernfs_node *target = kn->symlink.target_kn; - struct kernfs_root *root = kernfs_root(parent); + struct kernfs_root *root = kernfs_root(kn); int error; down_read(&root->kernfs_rwsem); + parent = kernfs_parent(kn); error = kernfs_get_target_path(parent, target, path); up_read(&root->kernfs_rwsem); diff --git a/fs/libfs.c b/fs/libfs.c index 8444f5cc4064..9ea0ecc325a8 100644 --- a/fs/libfs.c +++ b/fs/libfs.c @@ -496,7 +496,7 @@ offset_dir_lookup(struct dentry *parent, loff_t offset) found = find_positive_dentry(parent, NULL, false); else { rcu_read_lock(); - child = mas_find(&mas, DIR_OFFSET_MAX); + child = mas_find_rev(&mas, DIR_OFFSET_MIN); found = find_positive_dentry(parent, child, false); rcu_read_unlock(); } @@ -583,7 +583,7 @@ const struct file_operations 
simple_offset_dir_operations = { .fsync = noop_fsync, }; -static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev) +struct dentry *find_next_child(struct dentry *parent, struct dentry *prev) { struct dentry *child = NULL, *d; @@ -603,6 +603,7 @@ static struct dentry *find_next_child(struct dentry *parent, struct dentry *prev dput(prev); return child; } +EXPORT_SYMBOL(find_next_child); void simple_recursive_removal(struct dentry *dentry, void (*callback)(struct dentry *)) @@ -1647,10 +1648,16 @@ struct inode *alloc_anon_inode(struct super_block *s) * that it already _is_ on the dirty list. */ inode->i_state = I_DIRTY; - inode->i_mode = S_IRUSR | S_IWUSR; + /* + * Historically anonymous inodes didn't have a type at all and + * userspace has come to rely on this. Internally they're just + * regular files but S_IFREG is masked off when reporting + * information to userspace. + */ + inode->i_mode = S_IFREG | S_IRUSR | S_IWUSR; inode->i_uid = current_fsuid(); inode->i_gid = current_fsgid(); - inode->i_flags |= S_PRIVATE; + inode->i_flags |= S_PRIVATE | S_ANON_INODE; simple_inode_init_ts(inode); return inode; } @@ -2113,7 +2120,7 @@ struct timespec64 simple_inode_init_ts(struct inode *inode) } EXPORT_SYMBOL(simple_inode_init_ts); -static inline struct dentry *get_stashed_dentry(struct dentry **stashed) +struct dentry *stashed_dentry_get(struct dentry **stashed) { struct dentry *dentry; @@ -2215,7 +2222,7 @@ int path_from_stashed(struct dentry **stashed, struct vfsmount *mnt, void *data, const struct stashed_operations *sops = mnt->mnt_sb->s_fs_info; /* See if dentry can be reused. */ - path->dentry = get_stashed_dentry(stashed); + path->dentry = stashed_dentry_get(stashed); if (path->dentry) { sops->put_data(data); goto out_path; diff --git a/fs/lockd/Makefile b/fs/lockd/Makefile index fe3e23dd29c3..51bbe22d21e3 100644 --- a/fs/lockd/Makefile +++ b/fs/lockd/Makefile @@ -8,6 +8,6 @@ ccflags-y += -I$(src) # needed for trace events obj-$(CONFIG_LOCKD) += lockd.o lockd-y := clntlock.o clntproc.o clntxdr.o host.o svc.o svclock.o \ - svcshare.o svcproc.o svcsubs.o mon.o trace.o xdr.o + svcshare.o svcproc.o svcsubs.o mon.o trace.o xdr.o netlink.o lockd-$(CONFIG_LOCKD_V4) += clnt4xdr.o xdr4.o svc4proc.o lockd-$(CONFIG_PROC_FS) += procfs.o diff --git a/fs/lockd/netlink.c b/fs/lockd/netlink.c new file mode 100644 index 000000000000..6e00b02cad90 --- /dev/null +++ b/fs/lockd/netlink.c @@ -0,0 +1,44 @@ +// SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/lockd.yaml */ +/* YNL-GEN kernel source */ + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include "netlink.h" + +#include <uapi/linux/lockd_netlink.h> + +/* LOCKD_CMD_SERVER_SET - do */ +static const struct nla_policy lockd_server_set_nl_policy[LOCKD_A_SERVER_UDP_PORT + 1] = { + [LOCKD_A_SERVER_GRACETIME] = { .type = NLA_U32, }, + [LOCKD_A_SERVER_TCP_PORT] = { .type = NLA_U16, }, + [LOCKD_A_SERVER_UDP_PORT] = { .type = NLA_U16, }, +}; + +/* Ops table for lockd */ +static const struct genl_split_ops lockd_nl_ops[] = { + { + .cmd = LOCKD_CMD_SERVER_SET, + .doit = lockd_nl_server_set_doit, + .policy = lockd_server_set_nl_policy, + .maxattr = LOCKD_A_SERVER_UDP_PORT, + .flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO, + }, + { + .cmd = LOCKD_CMD_SERVER_GET, + .doit = lockd_nl_server_get_doit, + .flags = GENL_CMD_CAP_DO, + }, +}; + +struct genl_family lockd_nl_family __ro_after_init = { + .name = LOCKD_FAMILY_NAME, + 
.version = LOCKD_FAMILY_VERSION, + .netnsok = true, + .parallel_ops = true, + .module = THIS_MODULE, + .split_ops = lockd_nl_ops, + .n_split_ops = ARRAY_SIZE(lockd_nl_ops), +}; diff --git a/fs/lockd/netlink.h b/fs/lockd/netlink.h new file mode 100644 index 000000000000..1920543a7955 --- /dev/null +++ b/fs/lockd/netlink.h @@ -0,0 +1,19 @@ +/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-3-Clause) */ +/* Do not edit directly, auto-generated from: */ +/* Documentation/netlink/specs/lockd.yaml */ +/* YNL-GEN kernel header */ + +#ifndef _LINUX_LOCKD_GEN_H +#define _LINUX_LOCKD_GEN_H + +#include <net/netlink.h> +#include <net/genetlink.h> + +#include <uapi/linux/lockd_netlink.h> + +int lockd_nl_server_set_doit(struct sk_buff *skb, struct genl_info *info); +int lockd_nl_server_get_doit(struct sk_buff *skb, struct genl_info *info); + +extern struct genl_family lockd_nl_family; + +#endif /* _LINUX_LOCKD_GEN_H */ diff --git a/fs/lockd/netns.h b/fs/lockd/netns.h index 17432c445fe6..88e8e2a97397 100644 --- a/fs/lockd/netns.h +++ b/fs/lockd/netns.h @@ -10,6 +10,9 @@ struct lockd_net { unsigned int nlmsvc_users; unsigned long next_gc; unsigned long nrhosts; + u32 gracetime; + u16 tcp_port; + u16 udp_port; struct delayed_work grace_period_end; struct lock_manager lockd_manager; diff --git a/fs/lockd/svc.c b/fs/lockd/svc.c index 2c8eedc6c2cc..e80262a51884 100644 --- a/fs/lockd/svc.c +++ b/fs/lockd/svc.c @@ -41,6 +41,7 @@ #include "netns.h" #include "procfs.h" +#include "netlink.h" #define NLMDBG_FACILITY NLMDBG_SVC #define LOCKD_BUFSIZE (1024 + NLMSVC_XDRSIZE) @@ -83,8 +84,14 @@ static const int nlm_port_min = 0, nlm_port_max = 65535; static struct ctl_table_header * nlm_sysctl_table; #endif -static unsigned long get_lockd_grace_period(void) +static unsigned long get_lockd_grace_period(struct net *net) { + struct lockd_net *ln = net_generic(net, lockd_net_id); + + /* Return the net-ns specific grace period, if there is one */ + if (ln->gracetime) + return ln->gracetime * HZ; + /* Note: nlm_timeout should always be nonzero */ if (nlm_grace_period) return roundup(nlm_grace_period, nlm_timeout) * HZ; @@ -103,7 +110,7 @@ static void grace_ender(struct work_struct *grace) static void set_grace_period(struct net *net) { - unsigned long grace_period = get_lockd_grace_period(); + unsigned long grace_period = get_lockd_grace_period(net); struct lockd_net *ln = net_generic(net, lockd_net_id); locks_start_grace(net, &ln->lockd_manager); @@ -166,15 +173,16 @@ static int create_lockd_listener(struct svc_serv *serv, const char *name, static int create_lockd_family(struct svc_serv *serv, struct net *net, const int family, const struct cred *cred) { + struct lockd_net *ln = net_generic(net, lockd_net_id); int err; - err = create_lockd_listener(serv, "udp", net, family, nlm_udpport, - cred); + err = create_lockd_listener(serv, "udp", net, family, + ln->udp_port ? ln->udp_port : nlm_udpport, cred); if (err < 0) return err; - return create_lockd_listener(serv, "tcp", net, family, nlm_tcpport, - cred); + return create_lockd_listener(serv, "tcp", net, family, + ln->tcp_port ? 
ln->tcp_port : nlm_tcpport, cred); } /* @@ -459,9 +467,10 @@ static const struct ctl_table nlm_sysctls[] = { { .procname = "nsm_local_state", .data = &nsm_local_state, - .maxlen = sizeof(int), + .maxlen = sizeof(nsm_local_state), .mode = 0644, - .proc_handler = proc_dointvec, + .proc_handler = proc_douintvec, + .extra1 = SYSCTL_ZERO, }, }; @@ -588,6 +597,10 @@ static int __init init_nlm(void) if (err) goto err_pernet; + err = genl_register_family(&lockd_nl_family); + if (err) + goto err_netlink; + err = lockd_create_procfs(); if (err) goto err_procfs; @@ -595,6 +608,8 @@ static int __init init_nlm(void) return 0; err_procfs: + genl_unregister_family(&lockd_nl_family); +err_netlink: unregister_pernet_subsys(&lockd_net_ops); err_pernet: #ifdef CONFIG_SYSCTL @@ -608,6 +623,7 @@ static void __exit exit_nlm(void) { /* FIXME: delete all NLM clients */ nlm_shutdown_hosts(); + genl_unregister_family(&lockd_nl_family); lockd_remove_procfs(); unregister_pernet_subsys(&lockd_net_ops); #ifdef CONFIG_SYSCTL @@ -710,3 +726,94 @@ static struct svc_program nlmsvc_program = { .pg_init_request = svc_generic_init_request, .pg_rpcbind_set = svc_generic_rpcbind_set, }; + +/** + * lockd_nl_server_set_doit - set the lockd server parameters via netlink + * @skb: reply buffer + * @info: netlink metadata and command arguments + * + * This updates the per-net values. When updating the values in the init_net + * namespace, also update the "legacy" global values. + * + * Return 0 on success or a negative errno. + */ +int lockd_nl_server_set_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct lockd_net *ln = net_generic(net, lockd_net_id); + const struct nlattr *attr; + + if (GENL_REQ_ATTR_CHECK(info, LOCKD_A_SERVER_GRACETIME)) + return -EINVAL; + + if (info->attrs[LOCKD_A_SERVER_GRACETIME] || + info->attrs[LOCKD_A_SERVER_TCP_PORT] || + info->attrs[LOCKD_A_SERVER_UDP_PORT]) { + attr = info->attrs[LOCKD_A_SERVER_GRACETIME]; + if (attr) { + u32 gracetime = nla_get_u32(attr); + + if (gracetime > nlm_grace_period_max) + return -EINVAL; + + ln->gracetime = gracetime; + + if (net == &init_net) + nlm_grace_period = gracetime; + } + + attr = info->attrs[LOCKD_A_SERVER_TCP_PORT]; + if (attr) { + ln->tcp_port = nla_get_u16(attr); + if (net == &init_net) + nlm_tcpport = ln->tcp_port; + } + + attr = info->attrs[LOCKD_A_SERVER_UDP_PORT]; + if (attr) { + ln->udp_port = nla_get_u16(attr); + if (net == &init_net) + nlm_udpport = ln->udp_port; + } + } + return 0; +} + +/** + * lockd_nl_server_get_doit - get lockd server parameters via netlink + * @skb: reply buffer + * @info: netlink metadata and command arguments + * + * Return 0 on success or a negative errno. 
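For completeness, a hedged userspace sketch (not part of this series) driving the new family with libnl-3; the LOCKD_* constants come from the generated uapi/linux/lockd_netlink.h, and since SERVER_SET carries GENL_ADMIN_PERM in the ops table above, CAP_NET_ADMIN is required:

	#include <netlink/netlink.h>
	#include <netlink/genl/genl.h>
	#include <netlink/genl/ctrl.h>
	#include <linux/lockd_netlink.h>

	static int lockd_set_grace(unsigned int seconds)
	{
		struct nl_sock *sk;
		struct nl_msg *msg = NULL;
		int fam, err = -1;

		sk = nl_socket_alloc();
		if (!sk)
			return -1;
		if (genl_connect(sk) < 0)
			goto out;
		fam = genl_ctrl_resolve(sk, LOCKD_FAMILY_NAME);
		if (fam < 0)
			goto out;
		msg = nlmsg_alloc();
		if (!msg)
			goto out;
		if (!genlmsg_put(msg, NL_AUTO_PORT, NL_AUTO_SEQ, fam, 0, 0,
				 LOCKD_CMD_SERVER_SET, LOCKD_FAMILY_VERSION) ||
		    nla_put_u32(msg, LOCKD_A_SERVER_GRACETIME, seconds) < 0)
			goto out;
		err = nl_send_auto(sk, msg);	/* kernel replies with an ACK */
		if (err >= 0)
			err = nl_wait_for_ack(sk);
	out:
		nlmsg_free(msg);		/* NULL-safe in libnl */
		nl_socket_free(sk);
		return err < 0 ? -1 : 0;
	}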
+ */ +int lockd_nl_server_get_doit(struct sk_buff *skb, struct genl_info *info) +{ + struct net *net = genl_info_net(info); + struct lockd_net *ln = net_generic(net, lockd_net_id); + void *hdr; + int err; + + skb = genlmsg_new(GENLMSG_DEFAULT_SIZE, GFP_KERNEL); + if (!skb) + return -ENOMEM; + + hdr = genlmsg_iput(skb, info); + if (!hdr) { + err = -EMSGSIZE; + goto err_free_msg; + } + + err = nla_put_u32(skb, LOCKD_A_SERVER_GRACETIME, ln->gracetime) || + nla_put_u16(skb, LOCKD_A_SERVER_TCP_PORT, ln->tcp_port) || + nla_put_u16(skb, LOCKD_A_SERVER_UDP_PORT, ln->udp_port); + if (err) + goto err_free_msg; + + genlmsg_end(skb, hdr); + + return genlmsg_reply(skb, info); +err_free_msg: + nlmsg_free(skb); + + return err; +} diff --git a/fs/minix/namei.c b/fs/minix/namei.c index 5d9c1406fe27..8938536d8d3c 100644 --- a/fs/minix/namei.c +++ b/fs/minix/namei.c @@ -104,15 +104,15 @@ static int minix_link(struct dentry * old_dentry, struct inode * dir, return add_nondir(dentry, inode); } -static int minix_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *minix_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode * inode; int err; inode = minix_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) - return PTR_ERR(inode); + return ERR_CAST(inode); inode_inc_link_count(dir); minix_set_inode(inode, 0); @@ -128,7 +128,7 @@ static int minix_mkdir(struct mnt_idmap *idmap, struct inode *dir, d_instantiate(dentry, inode); out: - return err; + return ERR_PTR(err); out_fail: inode_dec_link_count(inode); diff --git a/fs/mnt_idmapping.c b/fs/mnt_idmapping.c index 7b1df8cc2821..a37991fdb194 100644 --- a/fs/mnt_idmapping.c +++ b/fs/mnt_idmapping.c @@ -6,6 +6,7 @@ #include <linux/mnt_idmapping.h> #include <linux/slab.h> #include <linux/user_namespace.h> +#include <linux/seq_file.h> #include "internal.h" @@ -334,3 +335,53 @@ void mnt_idmap_put(struct mnt_idmap *idmap) free_mnt_idmap(idmap); } EXPORT_SYMBOL_GPL(mnt_idmap_put); + +int statmount_mnt_idmap(struct mnt_idmap *idmap, struct seq_file *seq, bool uid_map) +{ + struct uid_gid_map *map, *map_up; + u32 idx, nr_mappings; + + if (!is_valid_mnt_idmap(idmap)) + return 0; + + /* + * Idmappings are shown relative to the caller's idmapping. + * This is both the most intuitive and most useful solution. + */ + if (uid_map) { + map = &idmap->uid_map; + map_up = &current_user_ns()->uid_map; + } else { + map = &idmap->gid_map; + map_up = &current_user_ns()->gid_map; + } + + for (idx = 0, nr_mappings = 0; idx < map->nr_extents; idx++) { + uid_t lower; + struct uid_gid_extent *extent; + + if (map->nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) + extent = &map->extent[idx]; + else + extent = &map->forward[idx]; + + /* + * Verify that the whole range of the mapping can be + * resolved in the caller's idmapping. If it cannot be + * resolved skip the mapping. 
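A worked example for the resolution step just below, with illustrative values: the caller's user namespace has uid_map "0 100000 65536" and the mount's idmap carries a single extent { .first = 1000, .lower_first = 100000, .count = 1 }:

	lower = map_id_range_up(map_up, 100000, 1);	/* kuid 100000 -> uid 0 here */
	seq_printf(seq, "%u %u %u", 1000, 0, 1);	/* emits "1000 0 1" */
	seq->count++;	/* claim the trailing NUL: entries end up '\0'-separated */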
+ */ + lower = map_id_range_up(map_up, extent->lower_first, extent->count); + if (lower == (uid_t) -1) + continue; + + seq_printf(seq, "%u %u %u", extent->first, lower, extent->count); + + seq->count++; /* mappings are separated by \0 */ + if (seq_has_overflowed(seq)) + return -EAGAIN; + + nr_mappings++; + } + + return nr_mappings; +} diff --git a/fs/mount.h b/fs/mount.h index ffb613cdfeee..7aecf2a60472 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -5,6 +5,12 @@ #include <linux/ns_common.h> #include <linux/fs_pin.h> +extern struct list_head notify_list; + +typedef __u32 __bitwise mntns_flags_t; + +#define MNTNS_PROPAGATING ((__force mntns_flags_t)(1 << 0)) + struct mnt_namespace { struct ns_common ns; struct mount * root; @@ -20,12 +26,18 @@ struct mnt_namespace { wait_queue_head_t poll; struct rcu_head mnt_ns_rcu; }; + u64 seq_origin; /* Sequence number of origin mount namespace */ u64 event; +#ifdef CONFIG_FSNOTIFY + __u32 n_fsnotify_mask; + struct fsnotify_mark_connector __rcu *n_fsnotify_marks; +#endif unsigned int nr_mounts; /* # of mounts in the namespace */ unsigned int pending_mounts; struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */ struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */ refcount_t passive; /* number references not pinning @mounts */ + mntns_flags_t mntns_flags; } __randomize_layout; struct mnt_pcp { @@ -76,6 +88,8 @@ struct mount { #ifdef CONFIG_FSNOTIFY struct fsnotify_mark_connector __rcu *mnt_fsnotify_marks; __u32 mnt_fsnotify_mask; + struct list_head to_notify; /* need to queue notification */ + struct mnt_namespace *prev_ns; /* previous namespace (NULL if none) */ #endif int mnt_id; /* mount identifier, reused */ u64 mnt_id_unique; /* mount ID unique until reboot */ @@ -156,6 +170,11 @@ static inline bool mnt_ns_attached(const struct mount *mnt) return !RB_EMPTY_NODE(&mnt->mnt_node); } +static inline bool mnt_ns_empty(const struct mnt_namespace *ns) +{ + return RB_EMPTY_ROOT(&ns->mounts); +} + static inline void move_from_ns(struct mount *mnt, struct list_head *dt_list) { struct mnt_namespace *ns = mnt->mnt_ns; @@ -177,3 +196,21 @@ static inline struct mnt_namespace *to_mnt_ns(struct ns_common *ns) { return container_of(ns, struct mnt_namespace, ns); } + +#ifdef CONFIG_FSNOTIFY +static inline void mnt_notify_add(struct mount *m) +{ + /* Optimize the case where there are no watches */ + if ((m->mnt_ns && m->mnt_ns->n_fsnotify_marks) || + (m->prev_ns && m->prev_ns->n_fsnotify_marks)) + list_add_tail(&m->to_notify, ¬ify_list); + else + m->prev_ns = m->mnt_ns; +} +#else +static inline void mnt_notify_add(struct mount *m) +{ +} +#endif + +struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry); diff --git a/fs/mpage.c b/fs/mpage.c index 82aecf372743..c5fd821fd30e 100644 --- a/fs/mpage.c +++ b/fs/mpage.c @@ -107,7 +107,7 @@ static void map_buffer_to_folio(struct folio *folio, struct buffer_head *bh, * don't make any buffers if there is only one buffer on * the folio and the folio just needs to be set up to date */ - if (inode->i_blkbits == PAGE_SHIFT && + if (inode->i_blkbits == folio_shift(folio) && buffer_uptodate(bh)) { folio_mark_uptodate(folio); return; @@ -153,7 +153,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) struct folio *folio = args->folio; struct inode *inode = folio->mapping->host; const unsigned blkbits = inode->i_blkbits; - const unsigned blocks_per_page = PAGE_SIZE >> blkbits; + const unsigned blocks_per_folio = folio_size(folio) >> blkbits; const unsigned blocksize = 1 
<< blkbits; struct buffer_head *map_bh = &args->map_bh; sector_t block_in_file; @@ -161,7 +161,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) sector_t last_block_in_file; sector_t first_block; unsigned page_block; - unsigned first_hole = blocks_per_page; + unsigned first_hole = blocks_per_folio; struct block_device *bdev = NULL; int length; int fully_mapped = 1; @@ -170,9 +170,6 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) unsigned relative_block; gfp_t gfp = mapping_gfp_constraint(folio->mapping, GFP_KERNEL); - /* MAX_BUF_PER_PAGE, for example */ - VM_BUG_ON_FOLIO(folio_test_large(folio), folio); - if (args->is_readahead) { opf |= REQ_RAHEAD; gfp |= __GFP_NORETRY | __GFP_NOWARN; @@ -181,8 +178,8 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) if (folio_buffers(folio)) goto confused; - block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits); - last_block = block_in_file + args->nr_pages * blocks_per_page; + block_in_file = folio_pos(folio) >> blkbits; + last_block = block_in_file + ((args->nr_pages * PAGE_SIZE) >> blkbits); last_block_in_file = (i_size_read(inode) + blocksize - 1) >> blkbits; if (last_block > last_block_in_file) last_block = last_block_in_file; @@ -204,7 +201,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) clear_buffer_mapped(map_bh); break; } - if (page_block == blocks_per_page) + if (page_block == blocks_per_folio) break; page_block++; block_in_file++; @@ -216,7 +213,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) * Then do more get_blocks calls until we are done with this folio. */ map_bh->b_folio = folio; - while (page_block < blocks_per_page) { + while (page_block < blocks_per_folio) { map_bh->b_state = 0; map_bh->b_size = 0; @@ -229,7 +226,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) if (!buffer_mapped(map_bh)) { fully_mapped = 0; - if (first_hole == blocks_per_page) + if (first_hole == blocks_per_folio) first_hole = page_block; page_block++; block_in_file++; @@ -247,7 +244,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) goto confused; } - if (first_hole != blocks_per_page) + if (first_hole != blocks_per_folio) goto confused; /* hole -> non-hole */ /* Contiguous blocks? 
*/ @@ -260,7 +257,7 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) if (relative_block == nblocks) { clear_buffer_mapped(map_bh); break; - } else if (page_block == blocks_per_page) + } else if (page_block == blocks_per_folio) break; page_block++; block_in_file++; @@ -268,8 +265,8 @@ static struct bio *do_mpage_readpage(struct mpage_readpage_args *args) bdev = map_bh->b_bdev; } - if (first_hole != blocks_per_page) { - folio_zero_segment(folio, first_hole << blkbits, PAGE_SIZE); + if (first_hole != blocks_per_folio) { + folio_zero_segment(folio, first_hole << blkbits, folio_size(folio)); if (first_hole == 0) { folio_mark_uptodate(folio); folio_unlock(folio); @@ -303,10 +300,10 @@ alloc_new: relative_block = block_in_file - args->first_logical_block; nblocks = map_bh->b_size >> blkbits; if ((buffer_boundary(map_bh) && relative_block == nblocks) || - (first_hole != blocks_per_page)) + (first_hole != blocks_per_folio)) args->bio = mpage_bio_submit_read(args->bio); else - args->last_block_in_bio = first_block + blocks_per_page - 1; + args->last_block_in_bio = first_block + blocks_per_folio - 1; out: return args->bio; @@ -385,7 +382,7 @@ int mpage_read_folio(struct folio *folio, get_block_t get_block) { struct mpage_readpage_args args = { .folio = folio, - .nr_pages = 1, + .nr_pages = folio_nr_pages(folio), .get_block = get_block, }; @@ -448,20 +445,19 @@ static void clean_buffers(struct folio *folio, unsigned first_unmapped) try_to_free_buffers(folio); } -static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, - void *data) +static int mpage_write_folio(struct writeback_control *wbc, struct folio *folio, + struct mpage_data *mpd) { - struct mpage_data *mpd = data; struct bio *bio = mpd->bio; struct address_space *mapping = folio->mapping; struct inode *inode = mapping->host; const unsigned blkbits = inode->i_blkbits; - const unsigned blocks_per_page = PAGE_SIZE >> blkbits; + const unsigned blocks_per_folio = folio_size(folio) >> blkbits; sector_t last_block; sector_t block_in_file; sector_t first_block; unsigned page_block; - unsigned first_unmapped = blocks_per_page; + unsigned first_unmapped = blocks_per_folio; struct block_device *bdev = NULL; int boundary = 0; sector_t boundary_block = 0; @@ -486,12 +482,12 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, */ if (buffer_dirty(bh)) goto confused; - if (first_unmapped == blocks_per_page) + if (first_unmapped == blocks_per_folio) first_unmapped = page_block; continue; } - if (first_unmapped != blocks_per_page) + if (first_unmapped != blocks_per_folio) goto confused; /* hole -> non-hole */ if (!buffer_dirty(bh) || !buffer_uptodate(bh)) @@ -527,7 +523,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, * The page has no buffers: map it to disk */ BUG_ON(!folio_test_uptodate(folio)); - block_in_file = (sector_t)folio->index << (PAGE_SHIFT - blkbits); + block_in_file = folio_pos(folio) >> blkbits; /* * Whole page beyond EOF? Skip allocating blocks to avoid leaking * space. 
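Worked numbers for the folio_size()-based arithmetic used throughout this conversion, assuming a 64KiB folio on a filesystem with 4KiB blocks (blkbits == 12):

	unsigned blocks_per_folio = folio_size(folio) >> blkbits;	/* 65536 >> 12 == 16 */
	sector_t block_in_file = folio_pos(folio) >> blkbits;		/* byte offset -> fs block */

The old PAGE_SIZE-based math only ever described one page's worth of blocks, which is why do_mpage_readpage() previously had to VM_BUG_ON() large folios.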
@@ -536,7 +532,7 @@ static int __mpage_writepage(struct folio *folio, struct writeback_control *wbc, goto page_is_mapped; last_block = (i_size - 1) >> blkbits; map_bh.b_folio = folio; - for (page_block = 0; page_block < blocks_per_page; ) { + for (page_block = 0; page_block < blocks_per_folio; ) { map_bh.b_state = 0; map_bh.b_size = 1 << blkbits; @@ -618,14 +614,14 @@ alloc_new: BUG_ON(folio_test_writeback(folio)); folio_start_writeback(folio); folio_unlock(folio); - if (boundary || (first_unmapped != blocks_per_page)) { + if (boundary || (first_unmapped != blocks_per_folio)) { bio = mpage_bio_submit_write(bio); if (boundary_block) { write_boundary_block(boundary_bdev, boundary_block, 1 << blkbits); } } else { - mpd->last_block_in_bio = first_block + blocks_per_page - 1; + mpd->last_block_in_bio = first_block + blocks_per_folio - 1; } goto out; @@ -659,14 +655,16 @@ mpage_writepages(struct address_space *mapping, struct mpage_data mpd = { .get_block = get_block, }; + struct folio *folio = NULL; struct blk_plug plug; - int ret; + int error; blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, __mpage_writepage, &mpd); + while ((folio = writeback_iter(mapping, wbc, folio, &error))) + error = mpage_write_folio(wbc, folio, &mpd); if (mpd.bio) mpage_bio_submit_write(mpd.bio); blk_finish_plug(&plug); - return ret; + return error; } EXPORT_SYMBOL(mpage_writepages); diff --git a/fs/namei.c b/fs/namei.c index 3ab9440c5b93..4bb889fc980b 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -125,6 +125,13 @@ #define EMBEDDED_NAME_MAX (PATH_MAX - offsetof(struct filename, iname)) +static inline void initname(struct filename *name, const char __user *uptr) +{ + name->uptr = uptr; + name->aname = NULL; + atomic_set(&name->refcnt, 1); +} + struct filename * getname_flags(const char __user *filename, int flags) { @@ -203,10 +210,7 @@ getname_flags(const char __user *filename, int flags) return ERR_PTR(-ENAMETOOLONG); } } - - atomic_set(&result->refcnt, 1); - result->uptr = filename; - result->aname = NULL; + initname(result, filename); audit_getname(result); return result; } @@ -218,11 +222,6 @@ struct filename *getname_uflags(const char __user *filename, int uflags) return getname_flags(filename, flags); } -struct filename *getname(const char __user * filename) -{ - return getname_flags(filename, 0); -} - struct filename *__getname_maybe_null(const char __user *pathname) { struct filename *name; @@ -269,25 +268,27 @@ struct filename *getname_kernel(const char * filename) return ERR_PTR(-ENAMETOOLONG); } memcpy((char *)result->name, filename, len); - result->uptr = NULL; - result->aname = NULL; - atomic_set(&result->refcnt, 1); + initname(result, NULL); audit_getname(result); - return result; } EXPORT_SYMBOL(getname_kernel); void putname(struct filename *name) { + int refcnt; + if (IS_ERR_OR_NULL(name)) return; - if (WARN_ON_ONCE(!atomic_read(&name->refcnt))) - return; + refcnt = atomic_read(&name->refcnt); + if (refcnt != 1) { + if (WARN_ON_ONCE(!refcnt)) + return; - if (!atomic_dec_and_test(&name->refcnt)) - return; + if (!atomic_dec_and_test(&name->refcnt)) + return; + } if (name->name != name->iname) { __putname(name->name); @@ -570,14 +571,14 @@ int inode_permission(struct mnt_idmap *idmap, int retval; retval = sb_permission(inode->i_sb, inode, mask); - if (retval) + if (unlikely(retval)) return retval; if (unlikely(mask & MAY_WRITE)) { /* * Nobody gets write access to an immutable file. 
*/ - if (IS_IMMUTABLE(inode)) + if (unlikely(IS_IMMUTABLE(inode))) return -EPERM; /* @@ -585,16 +586,16 @@ int inode_permission(struct mnt_idmap *idmap, * written back improperly if their true value is unknown * to the vfs. */ - if (HAS_UNMAPPED_ID(idmap, inode)) + if (unlikely(HAS_UNMAPPED_ID(idmap, inode))) return -EACCES; } retval = do_inode_permission(idmap, inode, mask); - if (retval) + if (unlikely(retval)) return retval; retval = devcgroup_inode_permission(inode, mask); - if (retval) + if (unlikely(retval)) return retval; return security_inode_permission(inode, mask); @@ -1664,25 +1665,20 @@ static struct dentry *lookup_dcache(const struct qstr *name, return dentry; } -/* - * Parent directory has inode locked exclusive. This is one - * and only case when ->lookup() gets called on non in-lookup - * dentries - as the matter of fact, this only gets called - * when directory is guaranteed to have no in-lookup children - * at all. - */ -struct dentry *lookup_one_qstr_excl(const struct qstr *name, - struct dentry *base, - unsigned int flags) +static struct dentry *lookup_one_qstr_excl_raw(const struct qstr *name, + struct dentry *base, + unsigned int flags) { - struct dentry *dentry = lookup_dcache(name, base, flags); + struct dentry *dentry; struct dentry *old; - struct inode *dir = base->d_inode; + struct inode *dir; + dentry = lookup_dcache(name, base, flags); if (dentry) return dentry; /* Don't create child dentry for a dead directory. */ + dir = base->d_inode; if (unlikely(IS_DEADDIR(dir))) return ERR_PTR(-ENOENT); @@ -1697,6 +1693,34 @@ struct dentry *lookup_one_qstr_excl(const struct qstr *name, } return dentry; } + +/* + * Parent directory has inode locked exclusive. This is one + * and only case when ->lookup() gets called on non in-lookup + * dentries - as the matter of fact, this only gets called + * when directory is guaranteed to have no in-lookup children + * at all. + * Will return -ENOENT if name isn't found and LOOKUP_CREATE wasn't passed. + * Will return -EEXIST if name is found and LOOKUP_EXCL was passed. 
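A hedged caller sketch for the contract above: with LOOKUP_CREATE | LOOKUP_EXCL a successful return is guaranteed negative, which is what lets the open-coded d_is_positive()/d_is_negative() checks be deleted from filename_create() and do_renameat2() later in this patch:

	dentry = lookup_one_qstr_excl(&last, parent, LOOKUP_CREATE | LOOKUP_EXCL);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);	/* -ENOENT and -EEXIST already mapped */
	/* dentry is negative here; go ahead and create */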
+ */ +struct dentry *lookup_one_qstr_excl(const struct qstr *name, + struct dentry *base, unsigned int flags) +{ + struct dentry *dentry; + + dentry = lookup_one_qstr_excl_raw(name, base, flags); + if (IS_ERR(dentry)) + return dentry; + if (d_is_negative(dentry) && !(flags & LOOKUP_CREATE)) { + dput(dentry); + return ERR_PTR(-ENOENT); + } + if (d_is_positive(dentry) && (flags & LOOKUP_EXCL)) { + dput(dentry); + return ERR_PTR(-EEXIST); + } + return dentry; +} EXPORT_SYMBOL(lookup_one_qstr_excl); /** @@ -1891,13 +1915,13 @@ static const char *pick_link(struct nameidata *nd, struct path *link, unlikely(link->mnt->mnt_flags & MNT_NOSYMFOLLOW)) return ERR_PTR(-ELOOP); - if (!(nd->flags & LOOKUP_RCU)) { + if (unlikely(atime_needs_update(&last->link, inode))) { + if (nd->flags & LOOKUP_RCU) { + if (!try_to_unlazy(nd)) + return ERR_PTR(-ECHILD); + } touch_atime(&last->link); cond_resched(); - } else if (atime_needs_update(&last->link, inode)) { - if (!try_to_unlazy(nd)) - return ERR_PTR(-ECHILD); - touch_atime(&last->link); } error = security_inode_follow_link(link->dentry, inode, @@ -2410,9 +2434,12 @@ static int link_path_walk(const char *name, struct nameidata *nd) nd->flags |= LOOKUP_PARENT; if (IS_ERR(name)) return PTR_ERR(name); - while (*name=='/') - name++; - if (!*name) { + if (*name == '/') { + do { + name++; + } while (unlikely(*name == '/')); + } + if (unlikely(!*name)) { nd->dir_mode = 0; // short-circuit the 'hardening' idiocy return 0; } @@ -2425,7 +2452,7 @@ static int link_path_walk(const char *name, struct nameidata *nd) idmap = mnt_idmap(nd->path.mnt); err = may_lookup(idmap, nd); - if (err) + if (unlikely(err)) return err; nd->last.name = name; @@ -2728,23 +2755,48 @@ static int filename_parentat(int dfd, struct filename *name, /* does lookup, returns the object with parent locked */ static struct dentry *__kern_path_locked(int dfd, struct filename *name, struct path *path) { + struct path parent_path __free(path_put) = {}; struct dentry *d; struct qstr last; int type, error; - error = filename_parentat(dfd, name, 0, path, &last, &type); + error = filename_parentat(dfd, name, 0, &parent_path, &last, &type); if (error) return ERR_PTR(error); - if (unlikely(type != LAST_NORM)) { - path_put(path); + if (unlikely(type != LAST_NORM)) return ERR_PTR(-EINVAL); + inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); + d = lookup_one_qstr_excl(&last, parent_path.dentry, 0); + if (IS_ERR(d)) { + inode_unlock(parent_path.dentry->d_inode); + return d; } - inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); - d = lookup_one_qstr_excl(&last, path->dentry, 0); + path->dentry = no_free_ptr(parent_path.dentry); + path->mnt = no_free_ptr(parent_path.mnt); + return d; +} + +struct dentry *kern_path_locked_negative(const char *name, struct path *path) +{ + struct path parent_path __free(path_put) = {}; + struct filename *filename __free(putname) = getname_kernel(name); + struct dentry *d; + struct qstr last; + int type, error; + + error = filename_parentat(AT_FDCWD, filename, 0, &parent_path, &last, &type); + if (error) + return ERR_PTR(error); + if (unlikely(type != LAST_NORM)) + return ERR_PTR(-EINVAL); + inode_lock_nested(parent_path.dentry->d_inode, I_MUTEX_PARENT); + d = lookup_one_qstr_excl_raw(&last, parent_path.dentry, 0); if (IS_ERR(d)) { - inode_unlock(path->dentry->d_inode); - path_put(path); + inode_unlock(parent_path.dentry->d_inode); + return d; } + path->dentry = no_free_ptr(parent_path.dentry); + path->mnt = no_free_ptr(parent_path.mnt); return d; } @@ -2820,13 
+2872,12 @@ int vfs_path_lookup(struct dentry *dentry, struct vfsmount *mnt, } EXPORT_SYMBOL(vfs_path_lookup); -static int lookup_one_common(struct mnt_idmap *idmap, - const char *name, struct dentry *base, int len, - struct qstr *this) +static int lookup_noperm_common(struct qstr *qname, struct dentry *base) { - this->name = name; - this->len = len; - this->hash = full_name_hash(base, name, len); + const char *name = qname->name; + u32 len = qname->len; + + qname->hash = full_name_hash(base, name, len); if (!len) return -EACCES; @@ -2843,140 +2894,135 @@ static int lookup_one_common(struct mnt_idmap *idmap, * to use its own hash.. */ if (base->d_flags & DCACHE_OP_HASH) { - int err = base->d_op->d_hash(base, this); + int err = base->d_op->d_hash(base, qname); if (err < 0) return err; } + return 0; +} +static int lookup_one_common(struct mnt_idmap *idmap, + struct qstr *qname, struct dentry *base) +{ + int err; + err = lookup_noperm_common(qname, base); + if (err < 0) + return err; return inode_permission(idmap, base->d_inode, MAY_EXEC); } /** - * try_lookup_one_len - filesystem helper to lookup single pathname component - * @name: pathname component to lookup + * try_lookup_noperm - filesystem helper to lookup single pathname component + * @name: qstr storing pathname component to lookup * @base: base directory to lookup from - * @len: maximum length @len should be interpreted to * * Look up a dentry by name in the dcache, returning NULL if it does not * currently exist. The function does not try to create a dentry. * * Note that this routine is purely a helper for filesystem usage and should - * not be called by generic code. + * not be called by generic code. It does no permission checking. + * + * No locks need be held - only a counted reference to @base is needed. * - * The caller must hold base->i_mutex. */ -struct dentry *try_lookup_one_len(const char *name, struct dentry *base, int len) +struct dentry *try_lookup_noperm(struct qstr *name, struct dentry *base) { - struct qstr this; int err; - WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - - err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); + err = lookup_noperm_common(name, base); if (err) return ERR_PTR(err); - return lookup_dcache(&this, base, 0); + return lookup_dcache(name, base, 0); } -EXPORT_SYMBOL(try_lookup_one_len); +EXPORT_SYMBOL(try_lookup_noperm); /** - * lookup_one_len - filesystem helper to lookup single pathname component - * @name: pathname component to lookup + * lookup_noperm - filesystem helper to lookup single pathname component + * @name: qstr storing pathname component to lookup * @base: base directory to lookup from - * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should - * not be called by generic code. + * not be called by generic code. It does no permission checking. * * The caller must hold base->i_mutex. */ -struct dentry *lookup_one_len(const char *name, struct dentry *base, int len) +struct dentry *lookup_noperm(struct qstr *name, struct dentry *base) { struct dentry *dentry; - struct qstr this; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - err = lookup_one_common(&nop_mnt_idmap, name, base, len, &this); + err = lookup_noperm_common(name, base); if (err) return ERR_PTR(err); - dentry = lookup_dcache(&this, base, 0); - return dentry ? dentry : __lookup_slow(&this, base, 0); + dentry = lookup_dcache(name, base, 0); + return dentry ? 
dentry : __lookup_slow(name, base, 0); } -EXPORT_SYMBOL(lookup_one_len); +EXPORT_SYMBOL(lookup_noperm); /** - * lookup_one - filesystem helper to lookup single pathname component + * lookup_one - lookup single pathname component * @idmap: idmap of the mount the lookup is performed from - * @name: pathname component to lookup + * @name: qstr holding pathname component to lookup * @base: base directory to lookup from - * @len: maximum length @len should be interpreted to * - * Note that this routine is purely a helper for filesystem usage and should - * not be called by generic code. + * This can be used for in-kernel filesystem clients such as file servers. * * The caller must hold base->i_mutex. */ -struct dentry *lookup_one(struct mnt_idmap *idmap, const char *name, - struct dentry *base, int len) +struct dentry *lookup_one(struct mnt_idmap *idmap, struct qstr *name, + struct dentry *base) { struct dentry *dentry; - struct qstr this; int err; WARN_ON_ONCE(!inode_is_locked(base->d_inode)); - err = lookup_one_common(idmap, name, base, len, &this); + err = lookup_one_common(idmap, name, base); if (err) return ERR_PTR(err); - dentry = lookup_dcache(&this, base, 0); - return dentry ? dentry : __lookup_slow(&this, base, 0); + dentry = lookup_dcache(name, base, 0); + return dentry ? dentry : __lookup_slow(name, base, 0); } EXPORT_SYMBOL(lookup_one); /** - * lookup_one_unlocked - filesystem helper to lookup single pathname component + * lookup_one_unlocked - lookup single pathname component * @idmap: idmap of the mount the lookup is performed from - * @name: pathname component to lookup + * @name: qstr holding pathname component to lookup * @base: base directory to lookup from - * @len: maximum length @len should be interpreted to * - * Note that this routine is purely a helper for filesystem usage and should - * not be called by generic code. + * This can be used for in-kernel filesystem clients such as file servers. * - * Unlike lookup_one_len, it should be called without the parent - * i_mutex held, and will take the i_mutex itself if necessary. + * Unlike lookup_one, it should be called without the parent + * i_rwsem held, and will take the i_rwsem itself if necessary. */ -struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, - const char *name, struct dentry *base, - int len) +struct dentry *lookup_one_unlocked(struct mnt_idmap *idmap, struct qstr *name, + struct dentry *base) { - struct qstr this; int err; struct dentry *ret; - err = lookup_one_common(idmap, name, base, len, &this); + err = lookup_one_common(idmap, name, base); if (err) return ERR_PTR(err); - ret = lookup_dcache(&this, base, 0); + ret = lookup_dcache(name, base, 0); if (!ret) - ret = lookup_slow(&this, base, 0); + ret = lookup_slow(name, base, 0); return ret; } EXPORT_SYMBOL(lookup_one_unlocked); /** - * lookup_one_positive_unlocked - filesystem helper to lookup single - * pathname component + * lookup_one_positive_unlocked - lookup single pathname component * @idmap: idmap of the mount the lookup is performed from - * @name: pathname component to lookup + * @name: qstr holding pathname component to lookup * @base: base directory to lookup from - * @len: maximum length @len should be interpreted to * * This helper will yield ERR_PTR(-ENOENT) on negatives. The helper returns * known positive or ERR_PTR(). This is what most of the users want. 
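For callers, the rename plus the switch to qstr is mechanical; a hedged before/after sketch using the QSTR() helper the kernfs hunks above already rely on (note that lookup_noperm() also drops the MAY_EXEC check that lookup_one_len() performed with the nop idmap):

	/* before */
	dentry = lookup_one_len(name, base, strlen(name));
	/* after */
	dentry = lookup_noperm(&QSTR(name), base);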
@@ -2985,16 +3031,15 @@ EXPORT_SYMBOL(lookup_one_unlocked); * time, so callers of lookup_one_unlocked() need to be very careful; pinned * positives have ->d_inode stable, so this one avoids such problems. * - * Note that this routine is purely a helper for filesystem usage and should - * not be called by generic code. + * This can be used for in-kernel filesystem clients such as file servers. * - * The helper should be called without i_mutex held. + * The helper should be called without i_rwsem held. */ struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, - const char *name, - struct dentry *base, int len) + struct qstr *name, + struct dentry *base) { - struct dentry *ret = lookup_one_unlocked(idmap, name, base, len); + struct dentry *ret = lookup_one_unlocked(idmap, name, base); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { dput(ret); @@ -3005,38 +3050,48 @@ struct dentry *lookup_one_positive_unlocked(struct mnt_idmap *idmap, EXPORT_SYMBOL(lookup_one_positive_unlocked); /** - * lookup_one_len_unlocked - filesystem helper to lookup single pathname component + * lookup_noperm_unlocked - filesystem helper to lookup single pathname component * @name: pathname component to lookup * @base: base directory to lookup from - * @len: maximum length @len should be interpreted to * * Note that this routine is purely a helper for filesystem usage and should - * not be called by generic code. + * not be called by generic code. It does no permission checking. * - * Unlike lookup_one_len, it should be called without the parent - * i_mutex held, and will take the i_mutex itself if necessary. + * Unlike lookup_noperm, it should be called without the parent + * i_rwsem held, and will take the i_rwsem itself if necessary. */ -struct dentry *lookup_one_len_unlocked(const char *name, - struct dentry *base, int len) +struct dentry *lookup_noperm_unlocked(struct qstr *name, struct dentry *base) { - return lookup_one_unlocked(&nop_mnt_idmap, name, base, len); + struct dentry *ret; + + ret = try_lookup_noperm(name, base); + if (!ret) + ret = lookup_slow(name, base, 0); + return ret; } -EXPORT_SYMBOL(lookup_one_len_unlocked); +EXPORT_SYMBOL(lookup_noperm_unlocked); /* - * Like lookup_one_len_unlocked(), except that it yields ERR_PTR(-ENOENT) + * Like lookup_noperm_unlocked(), except that it yields ERR_PTR(-ENOENT) * on negatives. Returns known positive or ERR_PTR(); that's what * most of the users want. Note that pinned negative with unlocked parent - * _can_ become positive at any time, so callers of lookup_one_len_unlocked() + * _can_ become positive at any time, so callers of lookup_noperm_unlocked() * need to be very careful; pinned positives have ->d_inode stable, so * this one avoids such problems. 
*/ -struct dentry *lookup_positive_unlocked(const char *name, - struct dentry *base, int len) +struct dentry *lookup_noperm_positive_unlocked(struct qstr *name, + struct dentry *base) { - return lookup_one_positive_unlocked(&nop_mnt_idmap, name, base, len); + struct dentry *ret; + + ret = lookup_noperm_unlocked(name, base); + if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { + dput(ret); + ret = ERR_PTR(-ENOENT); + } + return ret; } -EXPORT_SYMBOL(lookup_positive_unlocked); +EXPORT_SYMBOL(lookup_noperm_positive_unlocked); #ifdef CONFIG_UNIX98_PTYS int path_pts(struct path *path) @@ -3415,6 +3470,8 @@ static int may_open(struct mnt_idmap *idmap, const struct path *path, if ((acc_mode & MAY_EXEC) && path_noexec(path)) return -EACCES; break; + default: + VFS_BUG_ON_INODE(1, inode); } error = inode_permission(idmap, inode, MAY_OPEN | acc_mode); @@ -3995,7 +4052,7 @@ static struct file *path_openat(struct nameidata *nd, WARN_ON(1); error = -EINVAL; } - fput(file); + fput_close(file); if (error == -EOPENSTALE) { if (flags & LOOKUP_RCU) error = -ECHILD; @@ -4078,27 +4135,13 @@ static struct dentry *filename_create(int dfd, struct filename *name, * '/', and a directory wasn't requested. */ if (last.name[last.len] && !want_dir) - create_flags = 0; + create_flags &= ~LOOKUP_CREATE; inode_lock_nested(path->dentry->d_inode, I_MUTEX_PARENT); dentry = lookup_one_qstr_excl(&last, path->dentry, reval_flag | create_flags); if (IS_ERR(dentry)) goto unlock; - error = -EEXIST; - if (d_is_positive(dentry)) - goto fail; - - /* - * Special case - lookup gave negative, but... we had foo/bar/ - * From the vfs_mknod() POV we just have a negative dentry - - * all is fine. Let's be bastards - you had / on the end, you've - * been asking for (non-existent) directory. -ENOENT for you. - */ - if (unlikely(!create_flags)) { - error = -ENOENT; - goto fail; - } if (unlikely(err2)) { error = err2; goto fail; @@ -4129,7 +4172,8 @@ EXPORT_SYMBOL(kern_path_create); void done_path_create(struct path *path, struct dentry *dentry) { - dput(dentry); + if (!IS_ERR(dentry)) + dput(dentry); inode_unlock(path->dentry->d_inode); mnt_drop_write(path->mnt); path_put(path); @@ -4275,7 +4319,7 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d } /** - * vfs_mkdir - create directory + * vfs_mkdir - create directory returning correct dentry if possible * @idmap: idmap of the mount the inode was found from * @dir: inode of the parent directory * @dentry: dentry of the child directory @@ -4288,32 +4332,51 @@ SYSCALL_DEFINE3(mknod, const char __user *, filename, umode_t, mode, unsigned, d * care to map the inode according to @idmap before checking permissions. * On non-idmapped mounts or if permission checking is to be performed on the * raw inode simply pass @nop_mnt_idmap. + * + * In the event that the filesystem does not use the *@dentry but leaves it + * negative or unhashes it and possibly splices a different one returning it, + * the original dentry is dput() and the alternate is returned. + * + * In case of an error the dentry is dput() and an ERR_PTR() is returned. 
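A hedged caller sketch for the new vfs_mkdir() contract spelled out above: the passed-in dentry is consumed on every path, so the caller continues only with the returned one, exactly as the mkdir syscall hunk below does:

	dentry = vfs_mkdir(idmap, dir, dentry, mode);
	if (IS_ERR(dentry))
		return PTR_ERR(dentry);
	/* may be the original dentry or one the filesystem spliced in */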
*/ -int vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +struct dentry *vfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int error; unsigned max_links = dir->i_sb->s_max_links; + struct dentry *de; error = may_create(idmap, dir, dentry); if (error) - return error; + goto err; + error = -EPERM; if (!dir->i_op->mkdir) - return -EPERM; + goto err; mode = vfs_prepare_mode(idmap, dir, mode, S_IRWXUGO | S_ISVTX, 0); error = security_inode_mkdir(dir, dentry, mode); if (error) - return error; + goto err; + error = -EMLINK; if (max_links && dir->i_nlink >= max_links) - return -EMLINK; + goto err; - error = dir->i_op->mkdir(idmap, dir, dentry, mode); - if (!error) - fsnotify_mkdir(dir, dentry); - return error; + de = dir->i_op->mkdir(idmap, dir, dentry, mode); + error = PTR_ERR(de); + if (IS_ERR(de)) + goto err; + if (de) { + dput(dentry); + dentry = de; + } + fsnotify_mkdir(dir, dentry); + return dentry; + +err: + dput(dentry); + return ERR_PTR(error); } EXPORT_SYMBOL(vfs_mkdir); @@ -4333,8 +4396,10 @@ retry: error = security_path_mkdir(&path, dentry, mode_strip_umask(path.dentry->d_inode, mode)); if (!error) { - error = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, + dentry = vfs_mkdir(mnt_idmap(path.mnt), path.dentry->d_inode, dentry, mode); + if (IS_ERR(dentry)) + error = PTR_ERR(dentry); } done_path_create(&path, dentry); if (retry_estale(error, lookup_flags)) { @@ -4445,10 +4510,6 @@ retry: error = PTR_ERR(dentry); if (IS_ERR(dentry)) goto exit3; - if (!dentry->d_inode) { - error = -ENOENT; - goto exit4; - } error = security_path_rmdir(&path, dentry); if (error) goto exit4; @@ -4579,7 +4640,7 @@ retry_deleg: if (!IS_ERR(dentry)) { /* Why not before? Because we want correct error value */ - if (last.name[last.len] || d_is_negative(dentry)) + if (last.name[last.len]) goto slashes; inode = dentry->d_inode; ihold(inode); @@ -4613,9 +4674,7 @@ exit1: return error; slashes: - if (d_is_negative(dentry)) - error = -ENOENT; - else if (d_is_dir(dentry)) + if (d_is_dir(dentry)) error = -EISDIR; else error = -ENOTDIR; @@ -5115,7 +5174,8 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd, struct qstr old_last, new_last; int old_type, new_type; struct inode *delegated_inode = NULL; - unsigned int lookup_flags = 0, target_flags = LOOKUP_RENAME_TARGET; + unsigned int lookup_flags = 0, target_flags = + LOOKUP_RENAME_TARGET | LOOKUP_CREATE; bool should_retry = false; int error = -EINVAL; @@ -5128,6 +5188,8 @@ int do_renameat2(int olddfd, struct filename *from, int newdfd, if (flags & RENAME_EXCHANGE) target_flags = 0; + if (flags & RENAME_NOREPLACE) + target_flags |= LOOKUP_EXCL; retry: error = filename_parentat(olddfd, from, lookup_flags, &old_path, @@ -5169,23 +5231,12 @@ retry_deleg: error = PTR_ERR(old_dentry); if (IS_ERR(old_dentry)) goto exit3; - /* source must exist */ - error = -ENOENT; - if (d_is_negative(old_dentry)) - goto exit4; new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry, lookup_flags | target_flags); error = PTR_ERR(new_dentry); if (IS_ERR(new_dentry)) goto exit4; - error = -EEXIST; - if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) - goto exit5; if (flags & RENAME_EXCHANGE) { - error = -ENOENT; - if (d_is_negative(new_dentry)) - goto exit5; - if (!d_is_dir(new_dentry)) { error = -ENOTDIR; if (new_last.name[new_last.len]) @@ -5356,38 +5407,76 @@ const char *vfs_get_link(struct dentry *dentry, struct delayed_call *done) EXPORT_SYMBOL(vfs_get_link); /* get the 
link contents into pagecache */ -const char *page_get_link(struct dentry *dentry, struct inode *inode, - struct delayed_call *callback) +static char *__page_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) { - char *kaddr; - struct page *page; + struct folio *folio; struct address_space *mapping = inode->i_mapping; if (!dentry) { - page = find_get_page(mapping, 0); - if (!page) + folio = filemap_get_folio(mapping, 0); + if (IS_ERR(folio)) return ERR_PTR(-ECHILD); - if (!PageUptodate(page)) { - put_page(page); + if (!folio_test_uptodate(folio)) { + folio_put(folio); return ERR_PTR(-ECHILD); } } else { - page = read_mapping_page(mapping, 0, NULL); - if (IS_ERR(page)) - return (char*)page; + folio = read_mapping_folio(mapping, 0, NULL); + if (IS_ERR(folio)) + return ERR_CAST(folio); } - set_delayed_call(callback, page_put_link, page); + set_delayed_call(callback, page_put_link, folio); BUG_ON(mapping_gfp_mask(mapping) & __GFP_HIGHMEM); - kaddr = page_address(page); - nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); - return kaddr; + return folio_address(folio); +} + +const char *page_get_link_raw(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) +{ + return __page_get_link(dentry, inode, callback); } +EXPORT_SYMBOL_GPL(page_get_link_raw); +/** + * page_get_link() - An implementation of the get_link inode_operation. + * @dentry: The directory entry which is the symlink. + * @inode: The inode for the symlink. + * @callback: Used to drop the reference to the symlink. + * + * Filesystems which store their symlinks in the page cache should use + * this to implement the get_link() member of their inode_operations. + * + * Return: A pointer to the NUL-terminated symlink. + */ +const char *page_get_link(struct dentry *dentry, struct inode *inode, + struct delayed_call *callback) +{ + char *kaddr = __page_get_link(dentry, inode, callback); + + if (!IS_ERR(kaddr)) + nd_terminate_link(kaddr, inode->i_size, PAGE_SIZE - 1); + return kaddr; +} EXPORT_SYMBOL(page_get_link); +/** + * page_put_link() - Drop the reference to the symlink. + * @arg: The folio which contains the symlink. + * + * This is used internally by page_get_link(). It is exported for use + * by filesystems which need to implement a variant of page_get_link() + * themselves. Despite the apparent symmetry, filesystems which use + * page_get_link() do not need to call page_put_link(). + * + * The argument, while it has a void pointer type, must be a pointer to + * the folio which was retrieved from the page cache. The delayed_call + * infrastructure is used to drop the reference count once the caller + * is done with the symlink. 
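As a sketch of how these helpers are consumed (illustrative; the ops table below is not part of this diff): a filesystem that keeps symlink bodies in the pagecache can point its inode_operations at page_get_link() and never call page_put_link() itself, because the delayed_call set up above drops the folio reference on the filesystem's behalf.

	// Hypothetical filesystem glue, assuming the usual pagecache-backed
	// symlink layout; only .get_link matters for this example.
	static const struct inode_operations example_symlink_iops = {
		.get_link	= page_get_link,
		.getattr	= simple_getattr,
	};
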
+ */ void page_put_link(void *arg) { - put_page(arg); + folio_put(arg); } EXPORT_SYMBOL(page_put_link); diff --git a/fs/namespace.c b/fs/namespace.c index 8f1000f9f3df..552ad7f4d18b 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -81,15 +81,23 @@ static HLIST_HEAD(unmounted); /* protected by namespace_sem */ static LIST_HEAD(ex_mountpoints); /* protected by namespace_sem */ static DEFINE_SEQLOCK(mnt_ns_tree_lock); +#ifdef CONFIG_FSNOTIFY +LIST_HEAD(notify_list); /* protected by namespace_sem */ +#endif static struct rb_root mnt_ns_tree = RB_ROOT; /* protected by mnt_ns_tree_lock */ static LIST_HEAD(mnt_ns_list); /* protected by mnt_ns_tree_lock */ +enum mount_kattr_flags_t { + MOUNT_KATTR_RECURSE = (1 << 0), + MOUNT_KATTR_IDMAP_REPLACE = (1 << 1), +}; + struct mount_kattr { unsigned int attr_set; unsigned int attr_clr; unsigned int propagation; unsigned int lookup_flags; - bool recurse; + enum mount_kattr_flags_t kflags; struct user_namespace *mnt_userns; struct mnt_idmap *mnt_idmap; }; @@ -163,6 +171,7 @@ static void mnt_ns_release(struct mnt_namespace *ns) { /* keep alive for {list,stat}mount() */ if (refcount_dec_and_test(&ns->passive)) { + fsnotify_mntns_delete(ns); put_user_ns(ns->user_ns); kfree(ns); } @@ -346,12 +355,13 @@ static struct mount *alloc_vfsmnt(const char *name) if (err) goto out_free_cache; - if (name) { + if (name) mnt->mnt_devname = kstrdup_const(name, GFP_KERNEL_ACCOUNT); - if (!mnt->mnt_devname) - goto out_free_id; - } + else + mnt->mnt_devname = "none"; + if (!mnt->mnt_devname) + goto out_free_id; #ifdef CONFIG_SMP mnt->mnt_pcp = alloc_percpu(struct mnt_pcp); @@ -778,15 +788,11 @@ int __legitimize_mnt(struct vfsmount *bastard, unsigned seq) return 0; mnt = real_mount(bastard); mnt_add_count(mnt, 1); - smp_mb(); // see mntput_no_expire() + smp_mb(); // see mntput_no_expire() and do_umount() if (likely(!read_seqretry(&mount_lock, seq))) return 0; - if (bastard->mnt_flags & MNT_SYNC_UMOUNT) { - mnt_add_count(mnt, -1); - return 1; - } lock_mount_hash(); - if (unlikely(bastard->mnt_flags & MNT_DOOMED)) { + if (unlikely(bastard->mnt_flags & (MNT_SYNC_UMOUNT | MNT_DOOMED))) { mnt_add_count(mnt, -1); unlock_mount_hash(); return 1; @@ -998,6 +1004,17 @@ static inline int check_mnt(struct mount *mnt) return mnt->mnt_ns == current->nsproxy->mnt_ns; } +static inline bool check_anonymous_mnt(struct mount *mnt) +{ + u64 seq; + + if (!is_anon_ns(mnt->mnt_ns)) + return false; + + seq = mnt->mnt_ns->seq_origin; + return !seq || (seq == current->nsproxy->mnt_ns->seq); +} + /* * vfsmount lock must be held for write */ @@ -1176,6 +1193,8 @@ static void mnt_add_to_ns(struct mnt_namespace *ns, struct mount *mnt) ns->mnt_first_node = &mnt->mnt_node; rb_link_node(&mnt->mnt_node, parent, link); rb_insert_color(&mnt->mnt_node, &ns->mounts); + + mnt_notify_add(mnt); } /* @@ -1246,7 +1265,7 @@ struct vfsmount *vfs_create_mount(struct fs_context *fc) if (!fc->root) return ERR_PTR(-EINVAL); - mnt = alloc_vfsmnt(fc->source ?: "none"); + mnt = alloc_vfsmnt(fc->source); if (!mnt) return ERR_PTR(-ENOMEM); @@ -1723,6 +1742,50 @@ int may_umount(struct vfsmount *mnt) EXPORT_SYMBOL(may_umount); +#ifdef CONFIG_FSNOTIFY +static void mnt_notify(struct mount *p) +{ + if (!p->prev_ns && p->mnt_ns) { + fsnotify_mnt_attach(p->mnt_ns, &p->mnt); + } else if (p->prev_ns && !p->mnt_ns) { + fsnotify_mnt_detach(p->prev_ns, &p->mnt); + } else if (p->prev_ns == p->mnt_ns) { + fsnotify_mnt_move(p->mnt_ns, &p->mnt); + } else { + fsnotify_mnt_detach(p->prev_ns, &p->mnt); + fsnotify_mnt_attach(p->mnt_ns, &p->mnt); + 
} + p->prev_ns = p->mnt_ns; +} + +static void notify_mnt_list(void) +{ + struct mount *m, *tmp; + /* + * Notify about mounts that were added/reparented/detached/remain + * connected after unmount. + */ + list_for_each_entry_safe(m, tmp, ¬ify_list, to_notify) { + mnt_notify(m); + list_del_init(&m->to_notify); + } +} + +static bool need_notify_mnt_list(void) +{ + return !list_empty(¬ify_list); +} +#else +static void notify_mnt_list(void) +{ +} + +static bool need_notify_mnt_list(void) +{ + return false; +} +#endif + static void namespace_unlock(void) { struct hlist_head head; @@ -1733,7 +1796,18 @@ static void namespace_unlock(void) hlist_move_list(&unmounted, &head); list_splice_init(&ex_mountpoints, &list); - up_write(&namespace_sem); + if (need_notify_mnt_list()) { + /* + * No point blocking out concurrent readers while notifications + * are sent. This will also allow statmount()/listmount() to run + * concurrently. + */ + downgrade_write(&namespace_sem); + notify_mnt_list(); + up_read(&namespace_sem); + } else { + up_write(&namespace_sem); + } shrink_dentry_list(&list); @@ -1753,6 +1827,8 @@ static inline void namespace_lock(void) down_write(&namespace_sem); } +DEFINE_GUARD(namespace_lock, struct rw_semaphore *, namespace_lock(), namespace_unlock()) + enum umount_tree_flags { UMOUNT_SYNC = 1, UMOUNT_PROPAGATE = 2, @@ -1846,6 +1922,19 @@ static void umount_tree(struct mount *mnt, enum umount_tree_flags how) change_mnt_propagation(p, MS_PRIVATE); if (disconnect) hlist_add_head(&p->mnt_umount, &unmounted); + + /* + * At this point p->mnt_ns is NULL, notification will be queued + * only if + * + * - p->prev_ns is non-NULL *and* + * - p->prev_ns->n_fsnotify_marks is non-NULL + * + * This will preclude queuing the mount if this is a cleanup + * after a failed copy_tree() or destruction of an anonymous + * namespace, etc. + */ + mnt_notify_add(p); } } @@ -1956,6 +2045,7 @@ static int do_umount(struct mount *mnt, int flags) umount_tree(mnt, UMOUNT_PROPAGATE); retval = 0; } else { + smp_mb(); // paired with __legitimize_mnt() shrink_submounts(mnt); retval = -EBUSY; if (!propagate_mount_busy(mnt, 2)) { @@ -2026,6 +2116,7 @@ static void warn_mandlock(void) static int can_umount(const struct path *path, int flags) { struct mount *mnt = real_mount(path->mnt); + struct super_block *sb = path->dentry->d_sb; if (!may_mount()) return -EPERM; @@ -2035,7 +2126,7 @@ static int can_umount(const struct path *path, int flags) return -EINVAL; if (mnt->mnt.mnt_flags & MNT_LOCKED) /* Check optimistically */ return -EINVAL; - if (flags & MNT_FORCE && !capable(CAP_SYS_ADMIN)) + if (flags & MNT_FORCE && !ns_capable(sb->s_user_ns, CAP_SYS_ADMIN)) return -EPERM; return 0; } @@ -2145,16 +2236,24 @@ struct mnt_namespace *get_sequential_mnt_ns(struct mnt_namespace *mntns, bool pr } } +struct mnt_namespace *mnt_ns_from_dentry(struct dentry *dentry) +{ + if (!is_mnt_ns_file(dentry)) + return NULL; + + return to_mnt_ns(get_proc_ns(dentry->d_inode)); +} + static bool mnt_ns_loop(struct dentry *dentry) { /* Could bind mounting the mount namespace inode cause a * mount namespace loop? 
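The namespace_unlock() change above is a standard rwsem downgrade: mutate the mount topology under the write lock, then demote to a read lock while notifications go out so that readers such as statmount()/listmount() can make progress. A minimal sketch of the pattern (the helper name is invented for illustration):

	// Assumes the caller did down_write(sem) earlier.
	static void example_unlock_and_notify(struct rw_semaphore *sem,
					      bool have_events)
	{
		if (have_events) {
			downgrade_write(sem);	// write lock -> read lock, atomically
			// ... emit notifications under the read lock ...
			up_read(sem);
		} else {
			up_write(sem);
		}
	}
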
*/ - struct mnt_namespace *mnt_ns; - if (!is_mnt_ns_file(dentry)) + struct mnt_namespace *mnt_ns = mnt_ns_from_dentry(dentry); + + if (!mnt_ns) return false; - mnt_ns = to_mnt_ns(get_proc_ns(dentry->d_inode)); return current->nsproxy->mnt_ns->seq >= mnt_ns->seq; } @@ -2246,22 +2345,75 @@ struct vfsmount *collect_mounts(const struct path *path) static void free_mnt_ns(struct mnt_namespace *); static struct mnt_namespace *alloc_mnt_ns(struct user_namespace *, bool); +static inline bool must_dissolve(struct mnt_namespace *mnt_ns) +{ + /* + * This mount belonged to an anonymous mount namespace + * but was moved to a non-anonymous mount namespace and + * then unmounted. + */ + if (unlikely(!mnt_ns)) + return false; + + /* + * This mount belongs to a non-anonymous mount namespace + * and we know that such a mount can never transition to + * an anonymous mount namespace again. + */ + if (!is_anon_ns(mnt_ns)) { + /* + * A detached mount either belongs to an anonymous mount + * namespace or a non-anonymous mount namespace. It + * should never belong to something purely internal. + */ + VFS_WARN_ON_ONCE(mnt_ns == MNT_NS_INTERNAL); + return false; + } + + return true; +} + void dissolve_on_fput(struct vfsmount *mnt) { struct mnt_namespace *ns; - namespace_lock(); - lock_mount_hash(); - ns = real_mount(mnt)->mnt_ns; - if (ns) { - if (is_anon_ns(ns)) - umount_tree(real_mount(mnt), UMOUNT_CONNECTED); - else - ns = NULL; + struct mount *m = real_mount(mnt); + + scoped_guard(rcu) { + if (!must_dissolve(READ_ONCE(m->mnt_ns))) + return; } - unlock_mount_hash(); - namespace_unlock(); - if (ns) - free_mnt_ns(ns); + + scoped_guard(namespace_lock, &namespace_sem) { + ns = m->mnt_ns; + if (!must_dissolve(ns)) + return; + + /* + * After must_dissolve() we know that this is a detached + * mount in an anonymous mount namespace. + * + * Now when mnt_has_parent() reports that this mount + * tree has a parent, we know that this anonymous mount + * tree has been moved to another anonymous mount + * namespace. + * + * So when closing this file we cannot unmount the mount + * tree. This will be done when the file referring to + * the root of the anonymous mount namespace will be + * closed (It could already be closed but it would sync + * on @namespace_sem and wait for us to finish.). + */ + if (mnt_has_parent(m)) + return; + + lock_mount_hash(); + umount_tree(m, UMOUNT_CONNECTED); + unlock_mount_hash(); + } + + /* Make sure we notice when we leak mounts. */ + VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); + free_mnt_ns(ns); } void drop_collected_mounts(struct vfsmount *mnt) @@ -2287,6 +2439,28 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry) return false; } +/* + * Check that there aren't references to earlier/same mount namespaces in the + * specified subtree. Such references can act as pins for mount namespaces + * that aren't checked by the mount-cycle checking code, thereby allowing + * cycles to be made. + */ +static bool check_for_nsfs_mounts(struct mount *subtree) +{ + struct mount *p; + bool ret = false; + + lock_mount_hash(); + for (p = subtree; p; p = next_mnt(p, subtree)) + if (mnt_ns_loop(p->mnt.mnt_root)) + goto out; + + ret = true; +out: + unlock_mount_hash(); + return ret; +} + /** * clone_private_mount - create a private clone of a path * @path: path to clone @@ -2295,6 +2469,8 @@ bool has_locked_children(struct mount *mnt, struct dentry *dentry) * will not be attached anywhere in the namespace and will be private (i.e. * changes to the originating mount won't be propagated into this). 
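The rewrite of this function in the hunk below replaces goto-based unlocking with a cleanup.h guard, so every early return drops namespace_sem automatically. A compact sketch of that idiom under the same assumption, a plain rwsem protecting some check (function and parameter names are illustrative):

	#include <linux/cleanup.h>
	#include <linux/rwsem.h>

	static int example_check(struct rw_semaphore *sem, bool ok)
	{
		guard(rwsem_read)(sem);	// up_read() runs on every return path

		if (!ok)
			return -EINVAL;	// no explicit unlock needed
		return 0;
	}
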
* + * This assumes caller has called or done the equivalent of may_mount(). + * * Release with mntput(). */ struct vfsmount *clone_private_mount(const struct path *path) @@ -2302,30 +2478,37 @@ struct vfsmount *clone_private_mount(const struct path *path) struct mount *old_mnt = real_mount(path->mnt); struct mount *new_mnt; - down_read(&namespace_sem); + guard(rwsem_read)(&namespace_sem); + if (IS_MNT_UNBINDABLE(old_mnt)) - goto invalid; + return ERR_PTR(-EINVAL); + + if (mnt_has_parent(old_mnt)) { + if (!check_mnt(old_mnt)) + return ERR_PTR(-EINVAL); + } else { + if (!is_mounted(&old_mnt->mnt)) + return ERR_PTR(-EINVAL); + + /* Make sure this isn't something purely kernel internal. */ + if (!is_anon_ns(old_mnt->mnt_ns)) + return ERR_PTR(-EINVAL); - if (!check_mnt(old_mnt)) - goto invalid; + /* Make sure we don't create mount namespace loops. */ + if (!check_for_nsfs_mounts(old_mnt)) + return ERR_PTR(-EINVAL); + } if (has_locked_children(old_mnt, path->dentry)) - goto invalid; + return ERR_PTR(-EINVAL); new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); - up_read(&namespace_sem); - if (IS_ERR(new_mnt)) - return ERR_CAST(new_mnt); + return ERR_PTR(-EINVAL); /* Longterm mount to be removed by kern_unmount*() */ new_mnt->mnt_ns = MNT_NS_INTERNAL; - return &new_mnt->mnt; - -invalid: - up_read(&namespace_sem); - return ERR_PTR(-EINVAL); } EXPORT_SYMBOL_GPL(clone_private_mount); @@ -2424,6 +2607,7 @@ int count_mounts(struct mnt_namespace *ns, struct mount *mnt) enum mnt_tree_flags_t { MNT_TREE_MOVE = BIT(0), MNT_TREE_BENEATH = BIT(1), + MNT_TREE_PROPAGATION = BIT(2), }; /** @@ -2547,6 +2731,7 @@ static int attach_recursive_mnt(struct mount *source_mnt, dest_mp = smp; unhash_mnt(source_mnt); attach_mnt(source_mnt, top_mnt, dest_mp, beneath); + mnt_notify_add(source_mnt); touch_mnt_namespace(source_mnt->mnt_ns); } else { if (source_mnt->mnt_ns) { @@ -2639,56 +2824,62 @@ static struct mountpoint *do_lock_mount(struct path *path, bool beneath) struct vfsmount *mnt = path->mnt; struct dentry *dentry; struct mountpoint *mp = ERR_PTR(-ENOENT); + struct path under = {}; for (;;) { - struct mount *m; + struct mount *m = real_mount(mnt); if (beneath) { - m = real_mount(mnt); + path_put(&under); read_seqlock_excl(&mount_lock); - dentry = dget(m->mnt_mountpoint); + under.mnt = mntget(&m->mnt_parent->mnt); + under.dentry = dget(m->mnt_mountpoint); read_sequnlock_excl(&mount_lock); + dentry = under.dentry; } else { dentry = path->dentry; } inode_lock(dentry->d_inode); - if (unlikely(cant_mount(dentry))) { - inode_unlock(dentry->d_inode); - goto out; - } - namespace_lock(); - if (beneath && (!is_mounted(mnt) || m->mnt_mountpoint != dentry)) { + if (unlikely(cant_mount(dentry) || !is_mounted(mnt))) + break; // not to be mounted on + + if (beneath && unlikely(m->mnt_mountpoint != dentry || + &m->mnt_parent->mnt != under.mnt)) { namespace_unlock(); inode_unlock(dentry->d_inode); - goto out; + continue; // got moved } mnt = lookup_mnt(path); - if (likely(!mnt)) + if (unlikely(mnt)) { + namespace_unlock(); + inode_unlock(dentry->d_inode); + path_put(path); + path->mnt = mnt; + path->dentry = dget(mnt->mnt_root); + continue; // got overmounted + } + mp = get_mountpoint(dentry); + if (IS_ERR(mp)) break; - - namespace_unlock(); - inode_unlock(dentry->d_inode); - if (beneath) - dput(dentry); - path_put(path); - path->mnt = mnt; - path->dentry = dget(mnt->mnt_root); - } - - mp = get_mountpoint(dentry); - if (IS_ERR(mp)) { - namespace_unlock(); - inode_unlock(dentry->d_inode); + if (beneath) { + /* + * @under 
duplicates the references that will stay + at least until namespace_unlock(), so the path_put() + * below is safe (and OK to do under namespace_lock - + * we are not dropping the final references here). + */ + path_put(&under); + } + return mp; } - -out: + namespace_unlock(); + inode_unlock(dentry->d_inode); if (beneath) - dput(dentry); - + path_put(&under); return mp; } @@ -2699,14 +2890,11 @@ static inline struct mountpoint *lock_mount(struct path *path) static void unlock_mount(struct mountpoint *where) { - struct dentry *dentry = where->m_dentry; - + inode_unlock(where->m_dentry->d_inode); read_seqlock_excl(&mount_lock); put_mountpoint(where); read_sequnlock_excl(&mount_lock); - namespace_unlock(); - inode_unlock(dentry->d_inode); } static int graft_tree(struct mount *mnt, struct mount *p, struct mountpoint *mp) @@ -2773,6 +2961,71 @@ static int do_change_type(struct path *path, int ms_flags) return err; } +/* may_copy_tree() - check if a mount tree can be copied + * @path: path to the mount tree to be copied + * + * This helper checks if the caller may copy the mount tree starting + * from @path->mnt. The caller may copy the mount tree under the + * following circumstances: + * + * (1) The caller is located in the mount namespace of the mount tree. + *     This also implies that the mount does not belong to an anonymous + *     mount namespace. + * (2) The caller tries to copy an nsfs mount referring to a mount + *     namespace, i.e., the caller is trying to copy a mount namespace + *     entry from nsfs. + * (3) The caller tries to copy a pidfs mount referring to a pidfd. + * (4) The caller is trying to copy a mount tree that belongs to an + *     anonymous mount namespace. + * + * For that to be safe, this helper enforces that the origin mount + * namespace the anonymous mount namespace was created from is the + * same as the caller's mount namespace by comparing the sequence + * numbers. + * + * This is not strictly necessary. The current semantics of the new + * mount api enforce that the caller must be located in the same + * mount namespace as the mount tree it interacts with. Using the + * origin sequence number preserves these semantics even for + * anonymous mount namespaces. However, one could envision extending + * the api to directly operate across mount namespaces if needed. + * + * The ownership of a non-anonymous mount namespace such as the + * caller's cannot change. + * => We know that the caller's mount namespace is stable. + * + * If the origin sequence number of the anonymous mount namespace is + * the same as the sequence number of the caller's mount namespace. + * => The owning namespaces are the same. + * + * ==> The earlier capability check on the owning namespace of the + *     caller's mount namespace ensures that the caller has the + *     ability to copy the mount tree. + * + * Returns true if the mount tree can be copied, false otherwise. 
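From userspace, the trees these checks gate are the detached copies produced by open_tree(OPEN_TREE_CLONE). A hedged sketch (on older libcs the constants come from <linux/mount.h> and <linux/fcntl.h> instead; the second call relies on the origin-sequence rule documented above):

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <sys/mount.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		// Detached copy of /mnt; it lives in an anonymous mount ns.
		int t1 = syscall(SYS_open_tree, AT_FDCWD, "/mnt",
				 OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC | AT_RECURSIVE);
		if (t1 < 0)
			return 1;

		// Copying from the detached tree is allowed because its
		// origin namespace is the caller's mount namespace.
		int t2 = syscall(SYS_open_tree, t1, "",
				 OPEN_TREE_CLONE | AT_EMPTY_PATH);
		return t2 < 0 ? 1 : 0;
	}
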
+ */ +static inline bool may_copy_tree(struct path *path) +{ + struct mount *mnt = real_mount(path->mnt); + const struct dentry_operations *d_op; + + if (check_mnt(mnt)) + return true; + + d_op = path->dentry->d_op; + if (d_op == &ns_dentry_operations) + return true; + + if (d_op == &pidfs_dentry_operations) + return true; + + if (!is_mounted(path->mnt)) + return false; + + return check_anonymous_mnt(mnt); +} + + static struct mount *__do_loopback(struct path *old_path, int recurse) { struct mount *mnt = ERR_PTR(-EINVAL), *old = real_mount(old_path->mnt); @@ -2780,13 +3033,8 @@ static struct mount *__do_loopback(struct path *old_path, int recurse) if (IS_MNT_UNBINDABLE(old)) return mnt; - if (!check_mnt(old)) { - const struct dentry_operations *d_op = old_path->dentry->d_op; - - if (d_op != &ns_dentry_operations && - d_op != &pidfs_dentry_operations) - return mnt; - } + if (!may_copy_tree(old_path)) + return mnt; if (!recurse && has_locked_children(old, old_path->dentry)) return mnt; @@ -2853,15 +3101,30 @@ out: static struct file *open_detached_copy(struct path *path, bool recursive) { - struct user_namespace *user_ns = current->nsproxy->mnt_ns->user_ns; - struct mnt_namespace *ns = alloc_mnt_ns(user_ns, true); + struct mnt_namespace *ns, *mnt_ns = current->nsproxy->mnt_ns, *src_mnt_ns; + struct user_namespace *user_ns = mnt_ns->user_ns; struct mount *mnt, *p; struct file *file; + ns = alloc_mnt_ns(user_ns, true); if (IS_ERR(ns)) return ERR_CAST(ns); namespace_lock(); + + /* + * Record the sequence number of the source mount namespace. + * This needs to hold namespace_sem to ensure that the mount + * doesn't get attached. + */ + if (is_mounted(path->mnt)) { + src_mnt_ns = real_mount(path->mnt)->mnt_ns; + if (is_anon_ns(src_mnt_ns)) + ns->seq_origin = src_mnt_ns->seq_origin; + else + ns->seq_origin = src_mnt_ns->seq; + } + mnt = __do_loopback(path, recursive); if (IS_ERR(mnt)) { namespace_unlock(); @@ -2889,24 +3152,22 @@ static struct file *open_detached_copy(struct path *path, bool recursive) return file; } -SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags) +static struct file *vfs_open_tree(int dfd, const char __user *filename, unsigned int flags) { - struct file *file; - struct path path; + int ret; + struct path path __free(path_put) = {}; int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; bool detached = flags & OPEN_TREE_CLONE; - int error; - int fd; BUILD_BUG_ON(OPEN_TREE_CLOEXEC != O_CLOEXEC); if (flags & ~(AT_EMPTY_PATH | AT_NO_AUTOMOUNT | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC)) - return -EINVAL; + return ERR_PTR(-EINVAL); if ((flags & (AT_RECURSIVE | OPEN_TREE_CLONE)) == AT_RECURSIVE) - return -EINVAL; + return ERR_PTR(-EINVAL); if (flags & AT_NO_AUTOMOUNT) lookup_flags &= ~LOOKUP_AUTOMOUNT; @@ -2916,27 +3177,32 @@ SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, fl lookup_flags |= LOOKUP_EMPTY; if (detached && !may_mount()) - return -EPERM; + return ERR_PTR(-EPERM); + + ret = user_path_at(dfd, filename, lookup_flags, &path); + if (unlikely(ret)) + return ERR_PTR(ret); + + if (detached) + return open_detached_copy(&path, flags & AT_RECURSIVE); + + return dentry_open(&path, O_PATH, current_cred()); +} + +SYSCALL_DEFINE3(open_tree, int, dfd, const char __user *, filename, unsigned, flags) +{ + int fd; + struct file *file __free(fput) = NULL; + + file = vfs_open_tree(dfd, filename, flags); + if (IS_ERR(file)) + return PTR_ERR(file); fd = get_unused_fd_flags(flags & O_CLOEXEC); if 
(fd < 0) return fd; - error = user_path_at(dfd, filename, lookup_flags, &path); - if (unlikely(error)) { - file = ERR_PTR(error); - } else { - if (detached) - file = open_detached_copy(&path, flags & AT_RECURSIVE); - else - file = dentry_open(&path, O_PATH, current_cred()); - path_put(&path); - } - if (IS_ERR(file)) { - put_unused_fd(fd); - return PTR_ERR(file); - } - fd_install(fd, file); + fd_install(fd, no_free_ptr(file)); return fd; } @@ -3123,28 +3389,6 @@ static inline int tree_contains_unbindable(struct mount *mnt) return 0; } -/* - * Check that there aren't references to earlier/same mount namespaces in the - * specified subtree.  Such references can act as pins for mount namespaces - * that aren't checked by the mount-cycle checking code, thereby allowing - * cycles to be made. - */ -static bool check_for_nsfs_mounts(struct mount *subtree) -{ - struct mount *p; - bool ret = false; - - lock_mount_hash(); - for (p = subtree; p; p = next_mnt(p, subtree)) - if (mnt_ns_loop(p->mnt.mnt_root)) - goto out; - - ret = true; -out: - unlock_mount_hash(); - return ret; -} - static int do_set_group(struct path *from_path, struct path *to_path) { struct mount *from, *to; @@ -3314,14 +3558,63 @@ static int can_move_mount_beneath(const struct path *from, * @mnt_from itself. This defeats the whole purpose of mounting * @mnt_from beneath @mnt_to. */ - if (propagation_would_overmount(parent_mnt_to, mnt_from, mp)) + if (check_mnt(mnt_from) && + propagation_would_overmount(parent_mnt_to, mnt_from, mp)) return -EINVAL; return 0; } -static int do_move_mount(struct path *old_path, struct path *new_path, - bool beneath) +/* may_use_mount() - check if a mount tree can be used + * @mnt: vfsmount to be used + * + * This helper checks if the caller may use the mount tree starting + * from @mnt. The caller may use the mount tree under the + * following circumstances: + * + * (1) The caller is located in the mount namespace of the mount tree. + *     This also implies that the mount does not belong to an anonymous + *     mount namespace. + * (2) The caller is trying to use a mount tree that belongs to an + *     anonymous mount namespace. + * + * For that to be safe, this helper enforces that the origin mount + * namespace the anonymous mount namespace was created from is the + * same as the caller's mount namespace by comparing the sequence + * numbers. + * + * The ownership of a non-anonymous mount namespace such as the + * caller's cannot change. + * => We know that the caller's mount namespace is stable. + * + * If the origin sequence number of the anonymous mount namespace is + * the same as the sequence number of the caller's mount namespace. + * => The owning namespaces are the same. + * + * ==> The earlier capability check on the owning namespace of the + *     caller's mount namespace ensures that the caller has the + *     ability to use the mount tree. + * + * Returns true if the mount tree can be used, false otherwise. + */ +static inline bool may_use_mount(struct mount *mnt) +{ + if (check_mnt(mnt)) + return true; + + /* + * Make sure that no one unmounted the target path or somehow + * managed to get their hands on something purely kernel + * internal. 
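The open_tree() rework at the top of this hunk leans on the same scope-based cleanup machinery: __free(fput) arms an automatic fput() that no_free_ptr() later disarms once ownership moves to the fd table. A minimal sketch, assuming a helper that consumes its file argument (the helper itself is invented for illustration):

	#include <linux/cleanup.h>
	#include <linux/file.h>

	static int example_install(struct file *f)
	{
		struct file *file __free(fput) = f;	// auto-fput on early return
		int fd;

		fd = get_unused_fd_flags(O_CLOEXEC);
		if (fd < 0)
			return fd;			// fput(file) runs here automatically

		fd_install(fd, no_free_ptr(file));	// transfer ownership, disarm cleanup
		return fd;
	}
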
+ */ + if (!is_mounted(&mnt->mnt)) + return false; + + return check_anonymous_mnt(mnt); +} + +static int do_move_mount(struct path *old_path, + struct path *new_path, enum mnt_tree_flags_t flags) { struct mnt_namespace *ns; struct mount *p; @@ -3329,8 +3622,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path, struct mount *parent; struct mountpoint *mp, *old_mp; int err; - bool attached; - enum mnt_tree_flags_t flags = 0; + bool attached, beneath = flags & MNT_TREE_BENEATH; mp = do_lock_mount(new_path, beneath); if (IS_ERR(mp)) @@ -3346,8 +3638,7 @@ static int do_move_mount(struct path *old_path, struct path *new_path, ns = old->mnt_ns; err = -EINVAL; - /* The mountpoint must be in our namespace. */ - if (!check_mnt(p)) + if (!may_use_mount(p)) goto out; /* The thing moved must be mounted... */ @@ -3358,6 +3649,32 @@ static int do_move_mount(struct path *old_path, struct path *new_path, if (!(attached ? check_mnt(old) : is_anon_ns(ns))) goto out; + if (is_anon_ns(ns)) { + /* + * Ending up with two files referring to the root of the + * same anonymous mount namespace would cause an error + * as this would mean trying to move the same mount + * twice into the mount tree which would be rejected + * later. But be explicit about it right here. + */ + if ((is_anon_ns(p->mnt_ns) && ns == p->mnt_ns)) + goto out; + + /* + * If this is an anonymous mount tree ensure that mount + * propagation can detect mounts that were just + * propagated to the target mount tree so we don't + * propagate onto them. + */ + ns->mntns_flags |= MNTNS_PROPAGATING; + } else if (is_anon_ns(p->mnt_ns)) { + /* + * Don't allow moving an attached mount tree to an + * anonymous mount tree. + */ + goto out; + } + if (old->mnt.mnt_flags & MNT_LOCKED) goto out; @@ -3406,12 +3723,17 @@ static int do_move_mount(struct path *old_path, struct path *new_path, if (attached) put_mountpoint(old_mp); out: + if (is_anon_ns(ns)) + ns->mntns_flags &= ~MNTNS_PROPAGATING; unlock_mount(mp); if (!err) { - if (attached) + if (attached) { mntput_no_expire(parent); - else + } else { + /* Make sure we notice when we leak mounts. */ + VFS_WARN_ON_ONCE(!mnt_ns_empty(ns)); free_mnt_ns(ns); + } } return err; } @@ -3428,7 +3750,7 @@ static int do_move_mount_old(struct path *path, const char *old_name) if (err) return err; - err = do_move_mount(&old_path, path, false); + err = do_move_mount(&old_path, path, 0); path_put(&old_path); return err; } @@ -4269,6 +4591,21 @@ err_unlock: return ret; } +static inline int vfs_move_mount(struct path *from_path, struct path *to_path, + enum mnt_tree_flags_t mflags) +{ + int ret; + + ret = security_move_mount(from_path, to_path); + if (ret) + return ret; + + if (mflags & MNT_TREE_PROPAGATION) + return do_set_group(from_path, to_path); + + return do_move_mount(from_path, to_path, mflags); +} + /* * Move a mount from one place to another. 
In combination with * fsopen()/fsmount() this is used to install a new mount and in combination @@ -4282,8 +4619,12 @@ SYSCALL_DEFINE5(move_mount, int, to_dfd, const char __user *, to_pathname, unsigned int, flags) { - struct path from_path, to_path; - unsigned int lflags; + struct path to_path __free(path_put) = {}; + struct path from_path __free(path_put) = {}; + struct filename *to_name __free(putname) = NULL; + struct filename *from_name __free(putname) = NULL; + unsigned int lflags, uflags; + enum mnt_tree_flags_t mflags = 0; int ret = 0; if (!may_mount()) @@ -4296,43 +4637,53 @@ SYSCALL_DEFINE5(move_mount, (MOVE_MOUNT_BENEATH | MOVE_MOUNT_SET_GROUP)) return -EINVAL; - /* If someone gives a pathname, they aren't permitted to move - * from an fd that requires unmount as we can't get at the flag - * to clear it afterwards. - */ + if (flags & MOVE_MOUNT_SET_GROUP) mflags |= MNT_TREE_PROPAGATION; + if (flags & MOVE_MOUNT_BENEATH) mflags |= MNT_TREE_BENEATH; + lflags = 0; if (flags & MOVE_MOUNT_F_SYMLINKS) lflags |= LOOKUP_FOLLOW; if (flags & MOVE_MOUNT_F_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; - if (flags & MOVE_MOUNT_F_EMPTY_PATH) lflags |= LOOKUP_EMPTY; - - ret = user_path_at(from_dfd, from_pathname, lflags, &from_path); - if (ret < 0) - return ret; + uflags = 0; + if (flags & MOVE_MOUNT_F_EMPTY_PATH) uflags = AT_EMPTY_PATH; + from_name = getname_maybe_null(from_pathname, uflags); + if (IS_ERR(from_name)) + return PTR_ERR(from_name); lflags = 0; if (flags & MOVE_MOUNT_T_SYMLINKS) lflags |= LOOKUP_FOLLOW; if (flags & MOVE_MOUNT_T_AUTOMOUNTS) lflags |= LOOKUP_AUTOMOUNT; - if (flags & MOVE_MOUNT_T_EMPTY_PATH) lflags |= LOOKUP_EMPTY; + uflags = 0; + if (flags & MOVE_MOUNT_T_EMPTY_PATH) uflags = AT_EMPTY_PATH; + to_name = getname_maybe_null(to_pathname, uflags); + if (IS_ERR(to_name)) + return PTR_ERR(to_name); + + if (!to_name && to_dfd >= 0) { + CLASS(fd_raw, f_to)(to_dfd); + if (fd_empty(f_to)) + return -EBADF; + + to_path = fd_file(f_to)->f_path; + path_get(&to_path); + } else { + ret = filename_lookup(to_dfd, to_name, lflags, &to_path, NULL); + if (ret) + return ret; + } - ret = user_path_at(to_dfd, to_pathname, lflags, &to_path); - if (ret < 0) - goto out_from; + if (!from_name && from_dfd >= 0) { + CLASS(fd_raw, f_from)(from_dfd); + if (fd_empty(f_from)) + return -EBADF; - ret = security_move_mount(&from_path, &to_path); - if (ret < 0) - goto out_to; + return vfs_move_mount(&fd_file(f_from)->f_path, &to_path, mflags); + } - if (flags & MOVE_MOUNT_SET_GROUP) - ret = do_set_group(&from_path, &to_path); - else - ret = do_move_mount(&from_path, &to_path, - (flags & MOVE_MOUNT_BENEATH)); + ret = filename_lookup(from_dfd, from_name, lflags, &from_path, NULL); + if (ret) + return ret; -out_to: - path_put(&to_path); -out_from: - path_put(&from_path); - return ret; + return vfs_move_mount(&from_path, &to_path, mflags); } /* @@ -4468,6 +4819,8 @@ SYSCALL_DEFINE2(pivot_root, const char __user *, new_root, list_del_init(&new_mnt->mnt_expire); put_mountpoint(root_mp); unlock_mount_hash(); + mnt_notify_add(root_mnt); + mnt_notify_add(new_mnt); chroot_fs_refs(&root, &new); error = 0; out4: @@ -4512,11 +4865,10 @@ static int can_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) return -EINVAL; /* - * Once a mount has been idmapped we don't allow it to change its - * mapping. It makes things simpler and callers can just create - * another bind-mount they can idmap if they want to. + * We only allow a mount to change its idmapping if it has + * never been accessible to userspace. 
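For reference, the operation being regulated here looks like this from userspace: take a detached tree (or an fd returned by open_tree_attr() further below) and attach an idmapping with mount_setattr(). A sketch with error handling trimmed; the struct and flags are from the uapi <linux/mount.h>:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/mount.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	static int idmap_tree(int tree_fd, int userns_fd)
	{
		struct mount_attr attr = {
			.attr_set  = MOUNT_ATTR_IDMAP,
			.userns_fd = userns_fd,	// fd to the target user namespace
		};

		return syscall(SYS_mount_setattr, tree_fd, "",
			       AT_EMPTY_PATH | AT_RECURSIVE, &attr, sizeof(attr));
	}
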
*/ - if (is_idmapped_mnt(m)) + if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE) && is_idmapped_mnt(m)) return -EPERM; /* The underlying filesystem doesn't support idmapped mounts yet. */ @@ -4576,7 +4928,7 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) break; } - if (!kattr->recurse) + if (!(kattr->kflags & MOUNT_KATTR_RECURSE)) return 0; } @@ -4606,18 +4958,16 @@ static int mount_setattr_prepare(struct mount_kattr *kattr, struct mount *mnt) static void do_idmap_mount(const struct mount_kattr *kattr, struct mount *mnt) { + struct mnt_idmap *old_idmap; + if (!kattr->mnt_idmap) return; - /* - * Pairs with smp_load_acquire() in mnt_idmap(). - * - * Since we only allow a mount to change the idmapping once and - * verified this in can_idmap_mount() we know that the mount has - * @nop_mnt_idmap attached to it. So there's no need to drop any - * references. - */ + old_idmap = mnt_idmap(&mnt->mnt); + + /* Pairs with smp_load_acquire() in mnt_idmap(). */ smp_store_release(&mnt->mnt.mnt_idmap, mnt_idmap_get(kattr->mnt_idmap)); + mnt_idmap_put(old_idmap); } static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) @@ -4637,7 +4987,7 @@ static void mount_setattr_commit(struct mount_kattr *kattr, struct mount *mnt) if (kattr->propagation) change_mnt_propagation(m, kattr->propagation); - if (!kattr->recurse) + if (!(kattr->kflags & MOUNT_KATTR_RECURSE)) break; } touch_mnt_namespace(mnt->mnt_ns); @@ -4667,7 +5017,7 @@ static int do_mount_setattr(struct path *path, struct mount_kattr *kattr) */ namespace_lock(); if (kattr->propagation == MS_SHARED) { - err = invent_group_ids(mnt, kattr->recurse); + err = invent_group_ids(mnt, kattr->kflags & MOUNT_KATTR_RECURSE); if (err) { namespace_unlock(); return err; @@ -4718,7 +5068,7 @@ out: } static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, - struct mount_kattr *kattr, unsigned int flags) + struct mount_kattr *kattr) { struct ns_common *ns; struct user_namespace *mnt_userns; @@ -4726,13 +5076,23 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, if (!((attr->attr_set | attr->attr_clr) & MOUNT_ATTR_IDMAP)) return 0; - /* - * We currently do not support clearing an idmapped mount. If this ever - * is a use-case we can revisit this but for now let's keep it simple - * and not allow it. - */ - if (attr->attr_clr & MOUNT_ATTR_IDMAP) - return -EINVAL; + if (attr->attr_clr & MOUNT_ATTR_IDMAP) { + /* + * We can only remove an idmapping if it's never been + * exposed to userspace. + */ + if (!(kattr->kflags & MOUNT_KATTR_IDMAP_REPLACE)) + return -EINVAL; + + /* + * Removal of idmappings is equivalent to setting + * nop_mnt_idmap. 
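The do_idmap_mount() change above is a classic release/acquire pointer swap: fully construct the new object, publish it with a release store, and let readers pair that with an acquire load, so they observe either the old pointer or a fully initialised new one, never a half-built object. A generic kernel-style sketch (all names invented for illustration):

	struct cfg { int a, b; };

	static struct cfg *current_cfg;

	// Writer: hands the old object back via *oldp so the caller can
	// drop its reference, mirroring mnt_idmap_put(old_idmap) above.
	static void publish_cfg(struct cfg *newc, struct cfg **oldp)
	{
		*oldp = current_cfg;
		smp_store_release(&current_cfg, newc);	// pairs with acquire below
	}

	// Reader
	static struct cfg *snapshot_cfg(void)
	{
		return smp_load_acquire(&current_cfg);
	}
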
+ */ + if (!(attr->attr_set & MOUNT_ATTR_IDMAP)) { + kattr->mnt_idmap = &nop_mnt_idmap; + return 0; + } + } if (attr->userns_fd > INT_MAX) return -EINVAL; @@ -4769,22 +5129,8 @@ static int build_mount_idmapped(const struct mount_attr *attr, size_t usize, } static int build_mount_kattr(const struct mount_attr *attr, size_t usize, - struct mount_kattr *kattr, unsigned int flags) + struct mount_kattr *kattr) { - unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; - - if (flags & AT_NO_AUTOMOUNT) - lookup_flags &= ~LOOKUP_AUTOMOUNT; - if (flags & AT_SYMLINK_NOFOLLOW) - lookup_flags &= ~LOOKUP_FOLLOW; - if (flags & AT_EMPTY_PATH) - lookup_flags |= LOOKUP_EMPTY; - - *kattr = (struct mount_kattr) { - .lookup_flags = lookup_flags, - .recurse = !!(flags & AT_RECURSIVE), - }; - if (attr->propagation & ~MOUNT_SETATTR_PROPAGATION_FLAGS) return -EINVAL; if (hweight32(attr->propagation & MOUNT_SETATTR_PROPAGATION_FLAGS) > 1) @@ -4832,35 +5178,28 @@ static int build_mount_kattr(const struct mount_attr *attr, size_t usize, return -EINVAL; } - return build_mount_idmapped(attr, usize, kattr, flags); + return build_mount_idmapped(attr, usize, kattr); } static void finish_mount_kattr(struct mount_kattr *kattr) { - put_user_ns(kattr->mnt_userns); - kattr->mnt_userns = NULL; + if (kattr->mnt_userns) { + put_user_ns(kattr->mnt_userns); + kattr->mnt_userns = NULL; + } if (kattr->mnt_idmap) mnt_idmap_put(kattr->mnt_idmap); } -SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, - unsigned int, flags, struct mount_attr __user *, uattr, - size_t, usize) +static int wants_mount_setattr(struct mount_attr __user *uattr, size_t usize, + struct mount_kattr *kattr) { - int err; - struct path target; + int ret; struct mount_attr attr; - struct mount_kattr kattr; BUILD_BUG_ON(sizeof(struct mount_attr) != MOUNT_ATTR_SIZE_VER0); - if (flags & ~(AT_EMPTY_PATH | - AT_RECURSIVE | - AT_SYMLINK_NOFOLLOW | - AT_NO_AUTOMOUNT)) - return -EINVAL; - if (unlikely(usize > PAGE_SIZE)) return -E2BIG; if (unlikely(usize < MOUNT_ATTR_SIZE_VER0)) @@ -4869,18 +5208,54 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, if (!may_mount()) return -EPERM; - err = copy_struct_from_user(&attr, sizeof(attr), uattr, usize); - if (err) - return err; + ret = copy_struct_from_user(&attr, sizeof(attr), uattr, usize); + if (ret) + return ret; /* Don't bother walking through the mounts if this is a nop. */ if (attr.attr_set == 0 && attr.attr_clr == 0 && attr.propagation == 0) - return 0; + return 0; /* Tell caller to not bother. 
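wants_mount_setattr() keeps the usual convention for extensible syscall structs: copy_struct_from_user() copies min(ksize, usize) bytes and fails with -E2BIG if any trailing user bytes are non-zero, which is what lets both older (smaller) and newer (larger) struct mount_attr layouts keep working. A condensed sketch of that contract (the helper name is illustrative):

	// Mirrors the size checks performed above.
	static int example_fetch_attr(struct mount_attr *attr,
				      const void __user *uattr, size_t usize)
	{
		if (usize > PAGE_SIZE)
			return -E2BIG;
		if (usize < MOUNT_ATTR_SIZE_VER0)
			return -EINVAL;

		return copy_struct_from_user(attr, sizeof(*attr), uattr, usize);
	}
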
*/ - err = build_mount_kattr(&attr, usize, &kattr, flags); - if (err) + ret = build_mount_kattr(&attr, usize, kattr); + if (ret < 0) + return ret; + + return 1; +} + +SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, + unsigned int, flags, struct mount_attr __user *, uattr, + size_t, usize) +{ + int err; + struct path target; + struct mount_kattr kattr; + unsigned int lookup_flags = LOOKUP_AUTOMOUNT | LOOKUP_FOLLOW; + + if (flags & ~(AT_EMPTY_PATH | + AT_RECURSIVE | + AT_SYMLINK_NOFOLLOW | + AT_NO_AUTOMOUNT)) + return -EINVAL; + + if (flags & AT_NO_AUTOMOUNT) + lookup_flags &= ~LOOKUP_AUTOMOUNT; + if (flags & AT_SYMLINK_NOFOLLOW) + lookup_flags &= ~LOOKUP_FOLLOW; + if (flags & AT_EMPTY_PATH) + lookup_flags |= LOOKUP_EMPTY; + + kattr = (struct mount_kattr) { + .lookup_flags = lookup_flags, + }; + + if (flags & AT_RECURSIVE) + kattr.kflags |= MOUNT_KATTR_RECURSE; + + err = wants_mount_setattr(uattr, usize, &kattr); + if (err <= 0) return err; err = user_path_at(dfd, path, kattr.lookup_flags, &target); @@ -4892,6 +5267,49 @@ SYSCALL_DEFINE5(mount_setattr, int, dfd, const char __user *, path, return err; } +SYSCALL_DEFINE5(open_tree_attr, int, dfd, const char __user *, filename, + unsigned, flags, struct mount_attr __user *, uattr, + size_t, usize) +{ + struct file __free(fput) *file = NULL; + int fd; + + if (!uattr && usize) + return -EINVAL; + + file = vfs_open_tree(dfd, filename, flags); + if (IS_ERR(file)) + return PTR_ERR(file); + + if (uattr) { + int ret; + struct mount_kattr kattr = {}; + + kattr.kflags = MOUNT_KATTR_IDMAP_REPLACE; + if (flags & AT_RECURSIVE) + kattr.kflags |= MOUNT_KATTR_RECURSE; + + ret = wants_mount_setattr(uattr, usize, &kattr); + if (ret < 0) + return ret; + + if (ret) { + ret = do_mount_setattr(&file->f_path, &kattr); + if (ret) + return ret; + + finish_mount_kattr(&kattr); + } + } + + fd = get_unused_fd_flags(flags & O_CLOEXEC); + if (fd < 0) + return fd; + + fd_install(fd, no_free_ptr(file)); + return fd; +} + int show_path(struct seq_file *m, struct dentry *root) { if (root->d_sb->s_op->show_path) @@ -4915,10 +5333,13 @@ struct kstatmount { struct statmount __user *buf; size_t bufsize; struct vfsmount *mnt; + struct mnt_idmap *idmap; u64 mask; struct path root; - struct statmount sm; struct seq_file seq; + + /* Must be last --ends in a flexible-array member. */ + struct statmount sm; }; static u64 mnt_to_attr_flags(struct vfsmount *mnt) @@ -5071,7 +5492,7 @@ static int statmount_sb_source(struct kstatmount *s, struct seq_file *seq) seq->buf[seq->count] = '\0'; seq->count = start; seq_commit(seq, string_unescape_inplace(seq->buf + start, UNESCAPE_OCTAL)); - } else if (r->mnt_devname) { + } else { seq_puts(seq, r->mnt_devname); } return 0; @@ -5184,6 +5605,46 @@ static int statmount_opt_sec_array(struct kstatmount *s, struct seq_file *seq) return 0; } +static inline int statmount_mnt_uidmap(struct kstatmount *s, struct seq_file *seq) +{ + int ret; + + ret = statmount_mnt_idmap(s->idmap, seq, true); + if (ret < 0) + return ret; + + s->sm.mnt_uidmap_num = ret; + /* + * Always raise STATMOUNT_MNT_UIDMAP even if there are no valid + * mappings. This allows userspace to distinguish between a + * non-idmapped mount and an idmapped mount where none of the + * individual mappings are valid in the caller's idmapping. 
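Seen from userspace, the new fields surface as counted runs of NUL-terminated strings (in the same layout as /proc/<pid>/uid_map lines) inside the statmount string buffer. A hedged sketch assuming the uapi names as of this series; a robust caller would retry with a bigger buffer on -EOVERFLOW, omitted here:

	#define _GNU_SOURCE
	#include <fcntl.h>
	#include <linux/mount.h>
	#include <stdio.h>
	#include <sys/stat.h>
	#include <sys/syscall.h>
	#include <unistd.h>

	int main(void)
	{
		static char buf[1 << 16];
		struct statmount *sm = (struct statmount *)buf;
		struct statx sx;
		struct mnt_id_req req = { .size = MNT_ID_REQ_SIZE_VER0 };

		if (statx(AT_FDCWD, "/", 0, STATX_MNT_ID_UNIQUE, &sx) < 0)
			return 1;
		req.mnt_id = sx.stx_mnt_id;
		req.param  = STATMOUNT_MNT_UIDMAP | STATMOUNT_MNT_GIDMAP;

		if (syscall(SYS_statmount, &req, sm, sizeof(buf), 0) < 0)
			return 1;

		// mnt_uidmap is an offset into sm->str; mnt_uidmap_num counts
		// the consecutive NUL-terminated mapping strings stored there.
		if (sm->mask & STATMOUNT_MNT_UIDMAP)
			printf("first uid mapping: %s\n", sm->str + sm->mnt_uidmap);
		return 0;
	}
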
+ */ + if (is_valid_mnt_idmap(s->idmap)) + s->sm.mask |= STATMOUNT_MNT_UIDMAP; + return 0; +} + +static inline int statmount_mnt_gidmap(struct kstatmount *s, struct seq_file *seq) +{ + int ret; + + ret = statmount_mnt_idmap(s->idmap, seq, false); + if (ret < 0) + return ret; + + s->sm.mnt_gidmap_num = ret; + /* + * Always raise STATMOUNT_MNT_GIDMAP even if there are no valid + * mappings. This allows userspace to distinguish between a + * non-idmapped mount and an idmapped mount where none of the + * individual mappings are valid in the caller's idmapping. + */ + if (is_valid_mnt_idmap(s->idmap)) + s->sm.mask |= STATMOUNT_MNT_GIDMAP; + return 0; +} + static int statmount_string(struct kstatmount *s, u64 flag) { int ret = 0; @@ -5231,6 +5692,14 @@ static int statmount_string(struct kstatmount *s, u64 flag) offp = &sm->sb_source; ret = statmount_sb_source(s, seq); break; + case STATMOUNT_MNT_UIDMAP: + sm->mnt_uidmap = start; + ret = statmount_mnt_uidmap(s, seq); + break; + case STATMOUNT_MNT_GIDMAP: + sm->mnt_gidmap = start; + ret = statmount_mnt_gidmap(s, seq); + break; default: WARN_ON_ONCE(true); return -EINVAL; @@ -5306,7 +5775,7 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root) * We have to find the first mount in our ns and use that, however it * may not exist, so handle that properly. */ - if (RB_EMPTY_ROOT(&ns->mounts)) + if (mnt_ns_empty(ns)) return -ENOENT; first = child = ns->root; @@ -5323,6 +5792,23 @@ static int grab_requested_root(struct mnt_namespace *ns, struct path *root) return 0; } +/* This must be updated whenever a new flag is added */ +#define STATMOUNT_SUPPORTED (STATMOUNT_SB_BASIC | \ + STATMOUNT_MNT_BASIC | \ + STATMOUNT_PROPAGATE_FROM | \ + STATMOUNT_MNT_ROOT | \ + STATMOUNT_MNT_POINT | \ + STATMOUNT_FS_TYPE | \ + STATMOUNT_MNT_NS_ID | \ + STATMOUNT_MNT_OPTS | \ + STATMOUNT_FS_SUBTYPE | \ + STATMOUNT_SB_SOURCE | \ + STATMOUNT_OPT_ARRAY | \ + STATMOUNT_OPT_SEC_ARRAY | \ + STATMOUNT_SUPPORTED_MASK | \ + STATMOUNT_MNT_UIDMAP | \ + STATMOUNT_MNT_GIDMAP) + static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, struct mnt_namespace *ns) { @@ -5331,7 +5817,7 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, int err; /* Has the namespace already been emptied? */ - if (mnt_ns_id && RB_EMPTY_ROOT(&ns->mounts)) + if (mnt_ns_id && mnt_ns_empty(ns)) return -ENOENT; s->mnt = lookup_mnt_in_ns(mnt_id, ns); @@ -5356,12 +5842,29 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, return err; s->root = root; - if (s->mask & STATMOUNT_SB_BASIC) - statmount_sb_basic(s); + /* + * Note that mount properties in mnt->mnt_flags, mnt->mnt_idmap + * can change concurrently as we only hold the read-side of the + * namespace semaphore and mount properties may change with only + * the mount lock held. + * + * We could sample the mount lock sequence counter to detect + * those changes and retry. But it's not worth it. Worst that + * happens is that the mnt->mnt_idmap pointer is already changed + * while mnt->mnt_flags isn't or vice versa. So what. + * + * Both mnt->mnt_flags and mnt->mnt_idmap are set and retrieved + * via READ_ONCE()/WRITE_ONCE() and guard against theoretical + * torn read/write. That's all we care about right now. 
+ */ + s->idmap = mnt_idmap(s->mnt); if (s->mask & STATMOUNT_MNT_BASIC) statmount_mnt_basic(s); + if (s->mask & STATMOUNT_SB_BASIC) + statmount_sb_basic(s); + if (s->mask & STATMOUNT_PROPAGATE_FROM) statmount_propagate_from(s); @@ -5389,12 +5892,26 @@ static int do_statmount(struct kstatmount *s, u64 mnt_id, u64 mnt_ns_id, if (!err && s->mask & STATMOUNT_SB_SOURCE) err = statmount_string(s, STATMOUNT_SB_SOURCE); + if (!err && s->mask & STATMOUNT_MNT_UIDMAP) + err = statmount_string(s, STATMOUNT_MNT_UIDMAP); + + if (!err && s->mask & STATMOUNT_MNT_GIDMAP) + err = statmount_string(s, STATMOUNT_MNT_GIDMAP); + if (!err && s->mask & STATMOUNT_MNT_NS_ID) statmount_mnt_ns_id(s, ns); + if (!err && s->mask & STATMOUNT_SUPPORTED_MASK) { + s->sm.mask |= STATMOUNT_SUPPORTED_MASK; + s->sm.supported_mask = STATMOUNT_SUPPORTED; + } + if (err) return err; + /* Are there bits in the return mask not present in STATMOUNT_SUPPORTED? */ + WARN_ON_ONCE(~STATMOUNT_SUPPORTED & s->sm.mask); + return 0; } @@ -5412,7 +5929,8 @@ static inline bool retry_statmount(const long ret, size_t *seq_size) #define STATMOUNT_STRING_REQ (STATMOUNT_MNT_ROOT | STATMOUNT_MNT_POINT | \ STATMOUNT_FS_TYPE | STATMOUNT_MNT_OPTS | \ STATMOUNT_FS_SUBTYPE | STATMOUNT_SB_SOURCE | \ - STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY) + STATMOUNT_OPT_ARRAY | STATMOUNT_OPT_SEC_ARRAY | \ + STATMOUNT_MNT_UIDMAP | STATMOUNT_MNT_GIDMAP) static int prepare_kstatmount(struct kstatmount *ks, struct mnt_id_req *kreq, struct statmount __user *buf, size_t bufsize, @@ -5658,6 +6176,10 @@ SYSCALL_DEFINE4(listmount, const struct mnt_id_req __user *, req, !ns_capable_noaudit(ns->user_ns, CAP_SYS_ADMIN)) return -ENOENT; + /* + * We only need to guard against mount topology changes as + * listmount() doesn't care about any mount properties. + */ scoped_guard(rwsem_read, &namespace_sem) ret = do_listmount(ns, kreq.mnt_id, last_mnt_id, kmnt_ids, nr_mnt_ids, (flags & LISTMOUNT_REVERSE)); diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c index 0bf3c2f5a710..5e3f0aeb51f3 100644 --- a/fs/netfs/direct_read.c +++ b/fs/netfs/direct_read.c @@ -125,9 +125,9 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) * Perform a read to an application buffer, bypassing the pagecache and the * local disk cache. 
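The signature change below is a plain narrowing fix: byte counts belong in ssize_t, since squeezing them through int truncates anything past INT_MAX and can turn a large success into a bogus negative value. A tiny userspace demonstration of the failure mode (the function is a stand-in, not netfs code):

	#include <limits.h>
	#include <stdio.h>
	#include <sys/types.h>

	static ssize_t fake_read(void)
	{
		return (ssize_t)INT_MAX + 1;	// e.g. a transfer just over 2 GiB
	}

	int main(void)
	{
		int narrowed = (int)fake_read();	// wraps negative on LP64
		ssize_t kept = fake_read();

		printf("narrowed=%d kept=%zd\n", narrowed, kept);
		return 0;
	}
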
*/ -static int netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync) +static ssize_t netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync) { - int ret; + ssize_t ret; _enter("R=%x %llx-%llx", rreq->debug_id, rreq->start, rreq->start + rreq->len - 1); @@ -155,7 +155,7 @@ static int netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync) else ret = -EIOCBQUEUED; out: - _leave(" = %d", ret); + _leave(" = %zd", ret); return ret; } diff --git a/fs/netfs/fscache_cache.c b/fs/netfs/fscache_cache.c index 9397ed39b0b4..8f70f8da064b 100644 --- a/fs/netfs/fscache_cache.c +++ b/fs/netfs/fscache_cache.c @@ -372,7 +372,7 @@ void fscache_withdraw_cache(struct fscache_cache *cache) EXPORT_SYMBOL(fscache_withdraw_cache); #ifdef CONFIG_PROC_FS -static const char fscache_cache_states[NR__FSCACHE_CACHE_STATE] = "-PAEW"; +static const char fscache_cache_states[NR__FSCACHE_CACHE_STATE] __nonstring = "-PAEW"; /* * Generate a list of caches in /proc/fs/fscache/caches diff --git a/fs/netfs/fscache_cookie.c b/fs/netfs/fscache_cookie.c index d4d4b3a8b106..3d56fc73435f 100644 --- a/fs/netfs/fscache_cookie.c +++ b/fs/netfs/fscache_cookie.c @@ -29,7 +29,7 @@ static LIST_HEAD(fscache_cookie_lru); static DEFINE_SPINLOCK(fscache_cookie_lru_lock); DEFINE_TIMER(fscache_cookie_lru_timer, fscache_cookie_lru_timed_out); static DECLARE_WORK(fscache_cookie_lru_work, fscache_cookie_lru_worker); -static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] = "-LCAIFUWRD"; +static const char fscache_cookie_states[FSCACHE_COOKIE_STATE__NR] __nonstring = "-LCAIFUWRD"; static unsigned int fscache_lru_cookie_timeout = 10 * HZ; void fscache_print_cookie(struct fscache_cookie *cookie, char prefix) diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 4e3e62040831..70ecc8f5f210 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -127,11 +127,13 @@ static int __init netfs_init(void) if (mempool_init_slab_pool(&netfs_subrequest_pool, 100, netfs_subrequest_slab) < 0) goto error_subreqpool; +#ifdef CONFIG_PROC_FS if (!proc_mkdir("fs/netfs", NULL)) goto error_proc; if (!proc_create_seq("fs/netfs/requests", S_IFREG | 0444, NULL, &netfs_requests_seq_ops)) goto error_procfile; +#endif #ifdef CONFIG_FSCACHE_STATS if (!proc_create_single("fs/netfs/stats", S_IFREG | 0444, NULL, netfs_stats_show)) @@ -144,9 +146,11 @@ static int __init netfs_init(void) return 0; error_fscache: +#ifdef CONFIG_PROC_FS error_procfile: remove_proc_subtree("fs/netfs", NULL); error_proc: +#endif mempool_exit(&netfs_subrequest_pool); error_subreqpool: kmem_cache_destroy(netfs_subrequest_slab); diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 636cc5a98ef5..23c75755ad4e 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -682,14 +682,16 @@ void netfs_wait_for_pause(struct netfs_io_request *rreq) trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); - subreq = list_first_entry_or_null(&stream->subrequests, - struct netfs_io_subrequest, rreq_link); - if (subreq && - (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) || - test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) { - __set_current_state(TASK_RUNNING); - netfs_read_collection(rreq); - continue; + if (!test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) { + subreq = list_first_entry_or_null(&stream->subrequests, + struct netfs_io_subrequest, rreq_link); + if (subreq && + (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) || + test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) { + 
 			__set_current_state(TASK_RUNNING);
+			netfs_read_collection(rreq);
+			continue;
+		}
 	}
 
 	if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags) ||
diff --git a/fs/netfs/rolling_buffer.c b/fs/netfs/rolling_buffer.c
index 75d97af14b4a..207b6a326651 100644
--- a/fs/netfs/rolling_buffer.c
+++ b/fs/netfs/rolling_buffer.c
@@ -146,10 +146,6 @@ ssize_t rolling_buffer_load_from_ra(struct rolling_buffer *roll,
 
 	/* Store the counter after setting the slot. */
 	smp_store_release(&roll->next_head_slot, to);
-
-	for (; ix < folioq_nr_slots(fq); ix++)
-		folioq_clear(fq, ix);
-
 	return size;
 }
 
diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c
index 294f67795f79..3fca59e6475d 100644
--- a/fs/netfs/write_collect.c
+++ b/fs/netfs/write_collect.c
@@ -400,7 +400,8 @@ void netfs_write_collection_worker(struct work_struct *work)
 	trace_netfs_rreq(wreq, netfs_rreq_trace_write_done);
 
 	if (wreq->io_streams[1].active &&
-	    wreq->io_streams[1].failed) {
+	    wreq->io_streams[1].failed &&
+	    ictx->ops->invalidate_cache) {
 		/* Cache write failure doesn't prevent writeback completion
 		 * unless we're in disconnected mode.
 		 */
diff --git a/fs/nfs/Kconfig b/fs/nfs/Kconfig
index d3f76101ad4b..07932ce9246c 100644
--- a/fs/nfs/Kconfig
+++ b/fs/nfs/Kconfig
@@ -2,6 +2,7 @@
 config NFS_FS
 	tristate "NFS client support"
 	depends on INET && FILE_LOCKING && MULTIUSER
+	select CRC32
 	select LOCKD
 	select SUNRPC
 	select NFS_COMMON
@@ -196,7 +197,6 @@ config NFS_USE_KERNEL_DNS
 config NFS_DEBUG
 	bool
 	depends on NFS_FS && SUNRPC_DEBUG
-	select CRC32
 	default y
 
 config NFS_DISABLE_UDP_SUPPORT
diff --git a/fs/nfs/client.c b/fs/nfs/client.c
index 3b0918ade53c..6d63b958c4bb 100644
--- a/fs/nfs/client.c
+++ b/fs/nfs/client.c
@@ -546,6 +546,8 @@ int nfs_create_rpc_client(struct nfs_client *clp,
 		args.flags |= RPC_CLNT_CREATE_NOPING;
 	if (test_bit(NFS_CS_REUSEPORT, &clp->cl_flags))
 		args.flags |= RPC_CLNT_CREATE_REUSEPORT;
+	if (test_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags))
+		args.flags |= RPC_CLNT_CREATE_NETUNREACH_FATAL;
 
 	if (!IS_ERR(clp->cl_rpcclient))
 		return 0;
@@ -709,6 +711,9 @@ static int nfs_init_server(struct nfs_server *server,
 	if (ctx->flags & NFS_MOUNT_NORESVPORT)
 		set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
 
+	if (ctx->flags & NFS_MOUNT_NETUNREACH_FATAL)
+		__set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init.init_flags);
+
 	/* Allocate or find a client reference we can use */
 	clp = nfs_get_client(&cl_init);
 	if (IS_ERR(clp))
@@ -1100,6 +1105,8 @@ struct nfs_server *nfs_create_server(struct fs_context *fc)
 		if (server->namelen == 0 || server->namelen > NFS2_MAXNAMLEN)
 			server->namelen = NFS2_MAXNAMLEN;
 	}
+	/* Linux 'subtree_check' borkenness mandates this setting */
+	server->fh_expire_type = NFS_FH_VOL_RENAME;
 
 	if (!(fattr->valid & NFS_ATTR_FATTR)) {
 		error = ctx->nfs_mod->rpc_ops->getattr(server, ctx->mntfh,
@@ -1195,6 +1202,10 @@ void nfs_clients_init(struct net *net)
 #if IS_ENABLED(CONFIG_NFS_V4)
 	idr_init(&nn->cb_ident_idr);
 #endif
+#if IS_ENABLED(CONFIG_NFS_V4_1)
+	INIT_LIST_HEAD(&nn->nfs4_data_server_cache);
+	spin_lock_init(&nn->nfs4_data_server_lock);
+#endif
 	spin_lock_init(&nn->nfs_client_lock);
 	nn->boot_time = ktime_get_real();
 	memset(&nn->rpcstats, 0, sizeof(nn->rpcstats));
@@ -1211,6 +1222,9 @@ void nfs_clients_exit(struct net *net)
 	nfs_cleanup_cb_ident_idr(net);
 	WARN_ON_ONCE(!list_empty(&nn->nfs_client_list));
 	WARN_ON_ONCE(!list_empty(&nn->nfs_volume_list));
+#if IS_ENABLED(CONFIG_NFS_V4_1)
+	WARN_ON_ONCE(!list_empty(&nn->nfs4_data_server_cache));
+#endif
 }
 
 #ifdef CONFIG_PROC_FS
diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c
index 035ba52742a5..8bdbc4dca89c 100644
--- a/fs/nfs/delegation.c
+++ b/fs/nfs/delegation.c
@@ -79,6 +79,7 @@ static void nfs_mark_return_delegation(struct nfs_server *server,
 				       struct nfs_delegation *delegation)
 {
 	set_bit(NFS_DELEGATION_RETURN, &delegation->flags);
+	set_bit(NFS4SERV_DELEGRETURN, &server->delegation_flags);
 	set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
 }
 
@@ -306,7 +307,8 @@ nfs_start_delegation_return_locked(struct nfs_inode *nfsi)
 	if (delegation == NULL)
 		goto out;
 	spin_lock(&delegation->lock);
-	if (!test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
+	if (delegation->inode &&
+	    !test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
 		clear_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags);
 		/* Refcount matched in nfs_end_delegation_return() */
 		ret = nfs_get_delegation(delegation);
@@ -330,14 +332,16 @@ nfs_start_delegation_return(struct nfs_inode *nfsi)
 }
 
 static void nfs_abort_delegation_return(struct nfs_delegation *delegation,
-					struct nfs_client *clp, int err)
+					struct nfs_server *server, int err)
 {
-
 	spin_lock(&delegation->lock);
 	clear_bit(NFS_DELEGATION_RETURNING, &delegation->flags);
 	if (err == -EAGAIN) {
 		set_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags);
-		set_bit(NFS4CLNT_DELEGRETURN_DELAYED, &clp->cl_state);
+		set_bit(NFS4SERV_DELEGRETURN_DELAYED,
+			&server->delegation_flags);
+		set_bit(NFS4CLNT_DELEGRETURN_DELAYED,
+			&server->nfs_client->cl_state);
 	}
 	spin_unlock(&delegation->lock);
 }
@@ -547,7 +551,7 @@ out:
 */
 static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation *delegation, int issync)
 {
-	struct nfs_client *clp = NFS_SERVER(inode)->nfs_client;
+	struct nfs_server *server = NFS_SERVER(inode);
 	unsigned int mode = O_WRONLY | O_RDWR;
 	int err = 0;
 
@@ -569,11 +573,11 @@ static int nfs_end_delegation_return(struct inode *inode, struct nfs_delegation
 		/*
		 * Guard against state recovery
		 */
-		err = nfs4_wait_clnt_recover(clp);
+		err = nfs4_wait_clnt_recover(server->nfs_client);
 	}
 
 	if (err) {
-		nfs_abort_delegation_return(delegation, clp, err);
+		nfs_abort_delegation_return(delegation, server, err);
 		goto out;
 	}
 
@@ -590,17 +594,6 @@ static bool nfs_delegation_need_return(struct nfs_delegation *delegation)
 
 	if (test_and_clear_bit(NFS_DELEGATION_RETURN, &delegation->flags))
 		ret = true;
-	else if (test_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags)) {
-		struct inode *inode;
-
-		spin_lock(&delegation->lock);
-		inode = delegation->inode;
-		if (inode && list_empty(&NFS_I(inode)->open_files))
-			ret = true;
-		spin_unlock(&delegation->lock);
-	}
-	if (ret)
-		clear_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
 	if (test_bit(NFS_DELEGATION_RETURNING, &delegation->flags) ||
 	    test_bit(NFS_DELEGATION_RETURN_DELAYED, &delegation->flags) ||
 	    test_bit(NFS_DELEGATION_REVOKED, &delegation->flags))
@@ -619,6 +612,9 @@ static int nfs_server_return_marked_delegations(struct nfs_server *server,
 	struct nfs_delegation *place_holder_deleg = NULL;
 	int err = 0;
 
+	if (!test_and_clear_bit(NFS4SERV_DELEGRETURN,
+				&server->delegation_flags))
+		return 0;
 restart:
 	/*
	 * To avoid quadratic looping we hold a reference
@@ -670,6 +666,7 @@ restart:
 		cond_resched();
 		if (!err)
 			goto restart;
+		set_bit(NFS4SERV_DELEGRETURN, &server->delegation_flags);
 		set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
 		goto out;
 	}
@@ -684,6 +681,9 @@ static bool nfs_server_clear_delayed_delegations(struct nfs_server *server)
 	struct nfs_delegation *d;
 	bool ret = false;
 
+	if (!test_and_clear_bit(NFS4SERV_DELEGRETURN_DELAYED,
+				&server->delegation_flags))
+		goto out;
 	list_for_each_entry_rcu (d, &server->delegations, super_list) {
 		if (!test_bit(NFS_DELEGATION_RETURN_DELAYED, &d->flags))
 			continue;
@@ -691,6 +691,7 @@ static bool nfs_server_clear_delayed_delegations(struct nfs_server *server)
 		clear_bit(NFS_DELEGATION_RETURN_DELAYED, &d->flags);
 		ret = true;
 	}
+out:
 	return ret;
 }
 
@@ -781,6 +782,43 @@ int nfs4_inode_return_delegation(struct inode *inode)
 }
 
 /**
+ * nfs4_inode_set_return_delegation_on_close - asynchronously return a delegation
+ * @inode: inode to process
+ *
+ * This routine is called to request that the delegation be returned as soon
+ * as the file is closed. If the file is already closed, the delegation is
+ * immediately returned.
+ */
+void nfs4_inode_set_return_delegation_on_close(struct inode *inode)
+{
+	struct nfs_delegation *delegation;
+	struct nfs_delegation *ret = NULL;
+
+	if (!inode)
+		return;
+	rcu_read_lock();
+	delegation = nfs4_get_valid_delegation(inode);
+	if (!delegation)
+		goto out;
+	spin_lock(&delegation->lock);
+	if (!delegation->inode)
+		goto out_unlock;
+	if (list_empty(&NFS_I(inode)->open_files) &&
+	    !test_and_set_bit(NFS_DELEGATION_RETURNING, &delegation->flags)) {
+		/* Refcount matched in nfs_end_delegation_return() */
+		ret = nfs_get_delegation(delegation);
+	} else
+		set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
+out_unlock:
+	spin_unlock(&delegation->lock);
+	if (ret)
+		nfs_clear_verifier_delegated(inode);
+out:
+	rcu_read_unlock();
+	nfs_end_delegation_return(inode, ret, 0);
+}
+
+/**
 * nfs4_inode_return_delegation_on_close - asynchronously return a delegation
 * @inode: inode to process
 *
@@ -841,11 +879,25 @@ int nfs4_inode_make_writeable(struct inode *inode)
 	return nfs4_inode_return_delegation(inode);
 }
 
-static void nfs_mark_return_if_closed_delegation(struct nfs_server *server,
-		struct nfs_delegation *delegation)
+static void
+nfs_mark_return_if_closed_delegation(struct nfs_server *server,
+				     struct nfs_delegation *delegation)
 {
-	set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
-	set_bit(NFS4CLNT_DELEGRETURN, &server->nfs_client->cl_state);
+	struct inode *inode;
+
+	if (test_bit(NFS_DELEGATION_RETURN, &delegation->flags) ||
+	    test_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags))
+		return;
+	spin_lock(&delegation->lock);
+	inode = delegation->inode;
+	if (!inode)
+		goto out;
+	if (list_empty(&NFS_I(inode)->open_files))
+		nfs_mark_return_delegation(server, delegation);
+	else
+		set_bit(NFS_DELEGATION_RETURN_IF_CLOSED, &delegation->flags);
out:
+	spin_unlock(&delegation->lock);
 }
 
 static bool nfs_server_mark_return_all_delegations(struct nfs_server *server)
@@ -1239,6 +1291,7 @@ static void nfs_mark_test_expired_delegation(struct nfs_server *server,
 		return;
 	clear_bit(NFS_DELEGATION_NEED_RECLAIM, &delegation->flags);
 	set_bit(NFS_DELEGATION_TEST_EXPIRED, &delegation->flags);
+	set_bit(NFS4SERV_DELEGATION_EXPIRED, &server->delegation_flags);
 	set_bit(NFS4CLNT_DELEGATION_EXPIRED, &server->nfs_client->cl_state);
 }
 
@@ -1317,6 +1370,9 @@ static int nfs_server_reap_expired_delegations(struct nfs_server *server,
 	nfs4_stateid stateid;
 	unsigned long gen = ++server->delegation_gen;
 
+	if (!test_and_clear_bit(NFS4SERV_DELEGATION_EXPIRED,
+				&server->delegation_flags))
+		return 0;
restart:
 	rcu_read_lock();
 	list_for_each_entry_rcu(delegation, &server->delegations, super_list) {
@@ -1346,6 +1402,9 @@ restart:
 			goto restart;
 		}
 		nfs_inode_mark_test_expired_delegation(server,inode);
+		set_bit(NFS4SERV_DELEGATION_EXPIRED, &server->delegation_flags);
+		set_bit(NFS4CLNT_DELEGATION_EXPIRED,
+			&server->nfs_client->cl_state);
 		iput(inode);
 		return -EAGAIN;
 	}
diff --git a/fs/nfs/delegation.h b/fs/nfs/delegation.h
index 71524d34ed20..8ff5ab9c5c25 100644
--- a/fs/nfs/delegation.h
+++ b/fs/nfs/delegation.h
@@ -49,6 +49,7 @@ void nfs_inode_reclaim_delegation(struct inode *inode, const struct cred *cred,
 		unsigned long pagemod_limit, u32 deleg_type);
 int nfs4_inode_return_delegation(struct inode *inode);
 void nfs4_inode_return_delegation_on_close(struct inode *inode);
+void nfs4_inode_set_return_delegation_on_close(struct inode *inode);
 int nfs_async_inode_return_delegation(struct inode *inode, const nfs4_stateid *stateid);
 void nfs_inode_evict_delegation(struct inode *inode);
 
diff --git a/fs/nfs/dir.c b/fs/nfs/dir.c
index 2b04038b0e40..d0e0b435a843 100644
--- a/fs/nfs/dir.c
+++ b/fs/nfs/dir.c
@@ -666,6 +666,8 @@ static bool nfs_use_readdirplus(struct inode *dir, struct dir_context *ctx,
 {
 	if (!nfs_server_capable(dir, NFS_CAP_READDIRPLUS))
 		return false;
+	if (NFS_SERVER(dir)->flags & NFS_MOUNT_FORCE_RDIRPLUS)
+		return true;
 	if (ctx->pos == 0 ||
 	    cache_hits + cache_misses > NFS_READDIR_CACHE_USAGE_THRESHOLD)
 		return true;
@@ -1532,7 +1534,8 @@ static int nfs_is_exclusive_create(struct inode *dir, unsigned int flags)
 {
 	if (NFS_PROTO(dir)->version == 2)
 		return 0;
-	return flags & LOOKUP_EXCL;
+	return (flags & (LOOKUP_CREATE | LOOKUP_EXCL)) ==
+		(LOOKUP_CREATE | LOOKUP_EXCL);
 }
 
 /*
@@ -2421,11 +2424,11 @@ EXPORT_SYMBOL_GPL(nfs_mknod);
 /*
 * See comments for nfs_proc_create regarding failed operations.
 */
-int nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
-	      struct dentry *dentry, umode_t mode)
+struct dentry *nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
+			 struct dentry *dentry, umode_t mode)
 {
 	struct iattr attr;
-	int error;
+	struct dentry *ret;
 
 	dfprintk(VFS, "NFS: mkdir(%s/%lu), %pd\n",
			dir->i_sb->s_id, dir->i_ino, dentry);
@@ -2434,14 +2437,9 @@ struct dentry *nfs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
 	attr.ia_mode = mode | S_IFDIR;
 
 	trace_nfs_mkdir_enter(dir, dentry);
-	error = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
-	trace_nfs_mkdir_exit(dir, dentry, error);
-	if (error != 0)
-		goto out_err;
-	return 0;
-out_err:
-	d_drop(dentry);
-	return error;
+	ret = NFS_PROTO(dir)->mkdir(dir, dentry, &attr);
+	trace_nfs_mkdir_exit(dir, dentry, PTR_ERR_OR_ZERO(ret));
+	return ret;
 }
 EXPORT_SYMBOL_GPL(nfs_mkdir);
 
@@ -2678,6 +2676,18 @@ nfs_unblock_rename(struct rpc_task *task, struct nfs_renamedata *data)
 	unblock_revalidate(new_dentry);
 }
 
+static bool nfs_rename_is_unsafe_cross_dir(struct dentry *old_dentry,
+					   struct dentry *new_dentry)
+{
+	struct nfs_server *server = NFS_SB(old_dentry->d_sb);
+
+	if (old_dentry->d_parent == new_dentry->d_parent)
+		return false;
+	if (server->fh_expire_type & NFS_FH_RENAME_UNSAFE)
+		return !(server->fh_expire_type & NFS_FH_NOEXPIRE_WITH_OPEN);
+	return true;
+}
+
 /*
 * RENAME
 * FIXME: Some nfsds, like the Linux user space nfsd, may generate a
@@ -2765,7 +2775,8 @@ int nfs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
 	}
 
-	if (S_ISREG(old_inode->i_mode))
+	if (S_ISREG(old_inode->i_mode) &&
+	    nfs_rename_is_unsafe_cross_dir(old_dentry, new_dentry))
 		nfs_sync_inode(old_inode);
 	task = nfs_async_rename(old_dir, new_dir, old_dentry, new_dentry,
 				must_unblock ? nfs_unblock_rename : NULL);
diff --git a/fs/nfs/direct.c b/fs/nfs/direct.c
index f45beea92d03..48d89716193a 100644
--- a/fs/nfs/direct.c
+++ b/fs/nfs/direct.c
@@ -56,6 +56,7 @@
 #include <linux/uaccess.h>
 #include <linux/atomic.h>
 
+#include "delegation.h"
 #include "internal.h"
 #include "iostat.h"
 #include "pnfs.h"
@@ -130,6 +131,20 @@ static void nfs_direct_truncate_request(struct nfs_direct_req *dreq,
 		dreq->count = req_start;
 }
 
+static void nfs_direct_file_adjust_size_locked(struct inode *inode,
+					       loff_t offset, size_t count)
+{
+	loff_t newsize = offset + (loff_t)count;
+	loff_t oldsize = i_size_read(inode);
+
+	if (newsize > oldsize) {
+		i_size_write(inode, newsize);
+		NFS_I(inode)->cache_validity &= ~NFS_INO_INVALID_SIZE;
+		trace_nfs_size_grow(inode, newsize);
+		nfs_inc_stats(inode, NFSIOS_EXTENDWRITE);
+	}
+}
+
 /**
 * nfs_swap_rw - NFS address space operation for swap I/O
 * @iocb: target I/O control block
@@ -272,6 +287,8 @@ static void nfs_direct_read_completion(struct nfs_pgio_header *hdr)
 	nfs_direct_count_bytes(dreq, hdr);
 	spin_unlock(&dreq->lock);
 
+	nfs_update_delegated_atime(dreq->inode);
+
 	while (!list_empty(&hdr->pages)) {
 		struct nfs_page *req = nfs_list_entry(hdr->pages.next);
 		struct page *page = req->wb_page;
@@ -740,7 +757,7 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 {
 	struct nfs_direct_req *dreq = hdr->dreq;
 	struct nfs_commit_info cinfo;
-	struct nfs_page *req = nfs_list_entry(hdr->pages.next);
+	struct inode *inode = dreq->inode;
 	int flags = NFS_ODIRECT_DONE;
 
 	trace_nfs_direct_write_completion(dreq);
@@ -762,7 +779,13 @@ static void nfs_direct_write_completion(struct nfs_pgio_header *hdr)
 	}
 	spin_unlock(&dreq->lock);
 
+	spin_lock(&inode->i_lock);
+	nfs_direct_file_adjust_size_locked(inode, dreq->io_start, dreq->count);
+	nfs_update_delegated_mtime_locked(dreq->inode);
+	spin_unlock(&inode->i_lock);
+
 	while (!list_empty(&hdr->pages)) {
+		struct nfs_page *req;
 
 		req = nfs_list_entry(hdr->pages.next);
 		nfs_list_remove_request(req);
diff --git a/fs/nfs/export.c b/fs/nfs/export.c
index be686b8e0c54..e9c233b6fd20 100644
--- a/fs/nfs/export.c
+++ b/fs/nfs/export.c
@@ -154,5 +154,6 @@ const struct export_operations nfs_export_ops = {
 		  EXPORT_OP_CLOSE_BEFORE_UNLINK |
 		  EXPORT_OP_REMOTE_FS |
 		  EXPORT_OP_NOATOMIC_ATTR |
-		  EXPORT_OP_FLUSH_ON_CLOSE,
+		  EXPORT_OP_FLUSH_ON_CLOSE |
+		  EXPORT_OP_NOLOCKS,
 };
diff --git a/fs/nfs/file.c b/fs/nfs/file.c
index 1bb646752e46..033feeab8c34 100644
--- a/fs/nfs/file.c
+++ b/fs/nfs/file.c
@@ -29,6 +29,7 @@
 #include <linux/pagemap.h>
 #include <linux/gfp.h>
 #include <linux/swap.h>
+#include <linux/compaction.h>
 
 #include <linux/uaccess.h>
 #include <linux/filelock.h>
@@ -457,7 +458,7 @@ static bool nfs_release_folio(struct folio *folio, gfp_t gfp)
 	/* If the private flag is set, then the folio is not freeable */
 	if (folio_test_private(folio)) {
 		if ((current_gfp_context(gfp) & GFP_KERNEL) != GFP_KERNEL ||
-		    current_is_kswapd())
+		    current_is_kswapd() || current_is_kcompactd())
 			return false;
 		if (nfs_wb_folio(folio->mapping->host, folio) < 0)
 			return false;
diff --git a/fs/nfs/filelayout/filelayoutdev.c b/fs/nfs/filelayout/filelayoutdev.c
index 4fa304fa5bc4..29d9234d5c08 100644
--- a/fs/nfs/filelayout/filelayoutdev.c
+++ b/fs/nfs/filelayout/filelayoutdev.c
@@ -76,6 +76,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
 	struct page *scratch;
 	struct list_head dsaddrs;
 	struct nfs4_pnfs_ds_addr *da;
+	struct net *net = server->nfs_client->cl_net;
 
 	/* set up xdr stream */
 	scratch = alloc_page(gfp_flags);
@@ -159,8 +160,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
 		mp_count = be32_to_cpup(p); /* multipath count */
 		for (j = 0; j < mp_count; j++) {
-			da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
-						    &stream, gfp_flags);
+			da = nfs4_decode_mp_ds_addr(net, &stream, gfp_flags);
 			if (da)
 				list_add_tail(&da->da_node, &dsaddrs);
 		}
@@ -170,7 +170,7 @@ nfs4_fl_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
 			goto out_err_free_deviceid;
 		}
 
-		dsaddr->ds_list[i] = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
+		dsaddr->ds_list[i] = nfs4_pnfs_ds_add(net, &dsaddrs, gfp_flags);
 		if (!dsaddr->ds_list[i])
 			goto out_err_drain_dsaddrs;
 		trace_fl_getdevinfo(server, &pdev->dev_id, dsaddr->ds_list[i]->ds_remotestr);
diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c
index 98b45b636be3..e6909cafab68 100644
--- a/fs/nfs/flexfilelayout/flexfilelayout.c
+++ b/fs/nfs/flexfilelayout/flexfilelayout.c
@@ -1154,10 +1154,14 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task,
 		rpc_wake_up(&tbl->slot_tbl_waitq);
 		goto reset;
 	/* RPC connection errors */
+	case -ENETDOWN:
+	case -ENETUNREACH:
+		if (test_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags))
+			return -NFS4ERR_FATAL_IOERROR;
+		fallthrough;
 	case -ECONNREFUSED:
 	case -EHOSTDOWN:
 	case -EHOSTUNREACH:
-	case -ENETUNREACH:
 	case -EIO:
 	case -ETIMEDOUT:
 	case -EPIPE:
@@ -1183,6 +1187,7 @@ reset:
 /* Retry all errors through either pNFS or MDS except for -EJUKEBOX */
 static int ff_layout_async_handle_error_v3(struct rpc_task *task,
+					   struct nfs_client *clp,
 					   struct pnfs_layout_segment *lseg,
 					   u32 idx)
 {
@@ -1200,6 +1205,11 @@ static int ff_layout_async_handle_error_v3(struct rpc_task *task,
 	case -EJUKEBOX:
 		nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY);
 		goto out_retry;
+	case -ENETDOWN:
+	case -ENETUNREACH:
+		if (test_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags))
+			return -NFS4ERR_FATAL_IOERROR;
+		fallthrough;
 	default:
 		dprintk("%s DS connection error %d\n", __func__,
			task->tk_status);
@@ -1234,7 +1244,7 @@ static int ff_layout_async_handle_error(struct rpc_task *task,
 
 	switch (vers) {
 	case 3:
-		return ff_layout_async_handle_error_v3(task, lseg, idx);
+		return ff_layout_async_handle_error_v3(task, clp, lseg, idx);
 	case 4:
 		return ff_layout_async_handle_error_v4(task, state, clp,
						       lseg, idx);
@@ -1264,6 +1274,7 @@ static void ff_layout_io_track_ds_error(struct pnfs_layout_segment *lseg,
 	case -ECONNRESET:
 	case -EHOSTDOWN:
 	case -EHOSTUNREACH:
+	case -ENETDOWN:
 	case -ENETUNREACH:
 	case -EADDRINUSE:
 	case -ENOBUFS:
@@ -1318,7 +1329,7 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
					    hdr->args.offset, hdr->args.count,
					    &hdr->res.op_status, OP_READ,
					    task->tk_status);
-		trace_ff_layout_read_error(hdr);
+		trace_ff_layout_read_error(hdr, task->tk_status);
 	}
 
 	err = ff_layout_async_handle_error(task, hdr->args.context->state,
@@ -1337,6 +1348,9 @@ static int ff_layout_read_done_cb(struct rpc_task *task,
 		return task->tk_status;
 	case -EAGAIN:
 		goto out_eagain;
+	case -NFS4ERR_FATAL_IOERROR:
+		task->tk_status = -EIO;
+		return 0;
 	}
 
 	return 0;
@@ -1488,7 +1502,7 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
					    hdr->args.offset, hdr->args.count,
					    &hdr->res.op_status, OP_WRITE,
					    task->tk_status);
-		trace_ff_layout_write_error(hdr);
+		trace_ff_layout_write_error(hdr, task->tk_status);
 	}
 
 	err = ff_layout_async_handle_error(task, hdr->args.context->state,
@@ -1507,6 +1521,9 @@ static int ff_layout_write_done_cb(struct rpc_task *task,
 		return task->tk_status;
 	case -EAGAIN:
 		return -EAGAIN;
+	case -NFS4ERR_FATAL_IOERROR:
+		task->tk_status = -EIO;
+		return 0;
 	}
 
 	if (hdr->res.verf->committed == NFS_FILE_SYNC ||
@@ -1534,7 +1551,7 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
					    data->args.offset, data->args.count,
					    &data->res.op_status, OP_COMMIT,
					    task->tk_status);
-		trace_ff_layout_commit_error(data);
+		trace_ff_layout_commit_error(data, task->tk_status);
 	}
 
 	err = ff_layout_async_handle_error(task, NULL, data->ds_clp,
@@ -1551,6 +1568,9 @@ static int ff_layout_commit_done_cb(struct rpc_task *task,
 	case -EAGAIN:
 		rpc_restart_call_prepare(task);
 		return -EAGAIN;
+	case -NFS4ERR_FATAL_IOERROR:
+		task->tk_status = -EIO;
+		return 0;
 	}
 
 	ff_layout_set_layoutcommit(data->inode, data->lseg, data->lwb);
diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
index e58bedfb1dcc..4a304cf17c4b 100644
--- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c
+++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c
@@ -49,6 +49,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
 	struct nfs4_pnfs_ds_addr *da;
 	struct nfs4_ff_layout_ds *new_ds = NULL;
 	struct nfs4_ff_ds_version *ds_versions = NULL;
+	struct net *net = server->nfs_client->cl_net;
 	u32 mp_count;
 	u32 version_count;
 	__be32 *p;
@@ -80,8 +81,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
 
 	for (i = 0; i < mp_count; i++) {
 		/* multipath ds */
-		da = nfs4_decode_mp_ds_addr(server->nfs_client->cl_net,
-					    &stream, gfp_flags);
+		da = nfs4_decode_mp_ds_addr(net, &stream, gfp_flags);
 		if (da)
 			list_add_tail(&da->da_node, &dsaddrs);
 	}
@@ -149,7 +149,7 @@ nfs4_ff_alloc_deviceid_node(struct nfs_server *server, struct pnfs_device *pdev,
 	new_ds->ds_versions = ds_versions;
 	new_ds->ds_versions_cnt = version_count;
 
-	new_ds->ds = nfs4_pnfs_ds_add(&dsaddrs, gfp_flags);
+	new_ds->ds = nfs4_pnfs_ds_add(net, &dsaddrs, gfp_flags);
 	if (!new_ds->ds)
 		goto out_err_drain_dsaddrs;
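The flexfiles hunks above fold ENETDOWN/ENETUNREACH into the DS error switches, but fail the I/O immediately when the client was created with NFS_CS_NETUNREACH_FATAL (set for mounts in a non-init network namespace, or explicitly via fatal_neterrors=ENETDOWN:ENETUNREACH). A hedged, standalone sketch of that classification (the kernel actually returns -NFS4ERR_FATAL_IOERROR and later maps it to -EIO):

#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

/* Illustrative only: should this data-server error fail the I/O
 * rather than be retried against another mirror or the MDS? */
static bool ds_error_is_fatal(int err, bool netunreach_fatal)
{
	switch (err) {
	case -ENETDOWN:
	case -ENETUNREACH:
		return netunreach_fatal;
	default:
		return false;	/* everything else takes the retry/reset path */
	}
}

int main(void)
{
	printf("%d\n", ds_error_is_fatal(-ENETUNREACH, true));  /* 1: fail fast */
	printf("%d\n", ds_error_is_fatal(-ENETUNREACH, false)); /* 0: retry */
	return 0;
}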
diff --git a/fs/nfs/fs_context.c b/fs/nfs/fs_context.c
index b069385eea17..13f71ca8c974 100644
--- a/fs/nfs/fs_context.c
+++ b/fs/nfs/fs_context.c
@@ -50,6 +50,7 @@ enum nfs_param {
 	Opt_clientaddr,
 	Opt_cto,
 	Opt_alignwrite,
+	Opt_fatal_neterrors,
 	Opt_fg,
 	Opt_fscache,
 	Opt_fscache_flag,
@@ -72,6 +73,8 @@ enum nfs_param {
 	Opt_posix,
 	Opt_proto,
 	Opt_rdirplus,
+	Opt_rdirplus_none,
+	Opt_rdirplus_force,
 	Opt_rdma,
 	Opt_resvport,
 	Opt_retrans,
@@ -96,6 +99,20 @@ enum nfs_param {
 };
 
 enum {
+	Opt_fatal_neterrors_default,
+	Opt_fatal_neterrors_enetunreach,
+	Opt_fatal_neterrors_none,
+};
+
+static const struct constant_table nfs_param_enums_fatal_neterrors[] = {
+	{ "default",		Opt_fatal_neterrors_default },
+	{ "ENETDOWN:ENETUNREACH", Opt_fatal_neterrors_enetunreach },
+	{ "ENETUNREACH:ENETDOWN", Opt_fatal_neterrors_enetunreach },
+	{ "none",		Opt_fatal_neterrors_none },
+	{}
+};
+
+enum {
 	Opt_local_lock_all,
 	Opt_local_lock_flock,
 	Opt_local_lock_none,
@@ -151,6 +168,8 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = {
 	fsparam_string("clientaddr",	Opt_clientaddr),
 	fsparam_flag_no("cto",		Opt_cto),
 	fsparam_flag_no("alignwrite",	Opt_alignwrite),
+	fsparam_enum("fatal_neterrors", Opt_fatal_neterrors,
+		     nfs_param_enums_fatal_neterrors),
 	fsparam_flag  ("fg",		Opt_fg),
 	fsparam_flag_no("fsc",		Opt_fscache_flag),
 	fsparam_string("fsc",		Opt_fscache),
@@ -174,7 +193,8 @@ static const struct fs_parameter_spec nfs_fs_parameters[] = {
 	fsparam_u32   ("port",		Opt_port),
 	fsparam_flag_no("posix",	Opt_posix),
 	fsparam_string("proto",		Opt_proto),
-	fsparam_flag_no("rdirplus",	Opt_rdirplus),
+	fsparam_flag_no("rdirplus",	Opt_rdirplus), // rdirplus|nordirplus
+	fsparam_string("rdirplus",	Opt_rdirplus), // rdirplus=...
 	fsparam_flag  ("rdma",		Opt_rdma),
 	fsparam_flag_no("resvport",	Opt_resvport),
 	fsparam_u32   ("retrans",	Opt_retrans),
@@ -288,6 +308,12 @@ static const struct constant_table nfs_xprtsec_policies[] = {
 	{}
 };
 
+static const struct constant_table nfs_rdirplus_tokens[] = {
+	{ "none",	Opt_rdirplus_none },
+	{ "force",	Opt_rdirplus_force },
+	{}
+};
+
 /*
 * Sanity-check a server address provided by the mount command.
 *
@@ -636,10 +662,25 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
 			ctx->flags &= ~NFS_MOUNT_NOACL;
 		break;
 	case Opt_rdirplus:
-		if (result.negated)
+		if (result.negated) {
+			ctx->flags &= ~NFS_MOUNT_FORCE_RDIRPLUS;
 			ctx->flags |= NFS_MOUNT_NORDIRPLUS;
-		else
-			ctx->flags &= ~NFS_MOUNT_NORDIRPLUS;
+		} else if (!param->string) {
+			ctx->flags &= ~(NFS_MOUNT_NORDIRPLUS | NFS_MOUNT_FORCE_RDIRPLUS);
+		} else {
+			switch (lookup_constant(nfs_rdirplus_tokens, param->string, -1)) {
+			case Opt_rdirplus_none:
+				ctx->flags &= ~NFS_MOUNT_FORCE_RDIRPLUS;
+				ctx->flags |= NFS_MOUNT_NORDIRPLUS;
+				break;
+			case Opt_rdirplus_force:
+				ctx->flags &= ~NFS_MOUNT_NORDIRPLUS;
+				ctx->flags |= NFS_MOUNT_FORCE_RDIRPLUS;
+				break;
+			default:
+				goto out_invalid_value;
+			}
+		}
 		break;
 	case Opt_sharecache:
 		if (result.negated)
@@ -872,6 +913,25 @@ static int nfs_fs_context_parse_param(struct fs_context *fc,
 			goto out_of_bounds;
 		ctx->nfs_server.max_connect = result.uint_32;
 		break;
+	case Opt_fatal_neterrors:
+		trace_nfs_mount_assign(param->key, param->string);
+		switch (result.uint_32) {
+		case Opt_fatal_neterrors_default:
+			if (fc->net_ns != &init_net)
+				ctx->flags |= NFS_MOUNT_NETUNREACH_FATAL;
+			else
+				ctx->flags &= ~NFS_MOUNT_NETUNREACH_FATAL;
+			break;
+		case Opt_fatal_neterrors_enetunreach:
+			ctx->flags |= NFS_MOUNT_NETUNREACH_FATAL;
+			break;
+		case Opt_fatal_neterrors_none:
+			ctx->flags &= ~NFS_MOUNT_NETUNREACH_FATAL;
+			break;
+		default:
+			goto out_invalid_value;
+		}
+		break;
 	case Opt_lookupcache:
 		trace_nfs_mount_assign(param->key, param->string);
 		switch (result.uint_32) {
@@ -1651,6 +1711,9 @@ static int nfs_init_fs_context(struct fs_context *fc)
 		ctx->xprtsec.cert_serial	= TLS_NO_CERT;
 		ctx->xprtsec.privkey_serial	= TLS_NO_PRIVKEY;
 
+		if (fc->net_ns != &init_net)
+			ctx->flags |= NFS_MOUNT_NETUNREACH_FATAL;
+
 		fc->s_iflags |= SB_I_STABLE_WRITES;
 	}
 	fc->fs_private = ctx;
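Note the trick in the parameter table above: "rdirplus" is registered twice, once as a flag pair (rdirplus/nordirplus) and once as a string parameter, so nordirplus, bare rdirplus, rdirplus=none and rdirplus=force all land on the same Opt_rdirplus and are disambiguated by result.negated and param->string. A simplified, self-contained model of that dispatch (not the kernel code; names are illustrative):

#include <stdbool.h>
#include <stdio.h>
#include <string.h>

enum { RDIRPLUS_DEFAULT, RDIRPLUS_NONE, RDIRPLUS_FORCE, RDIRPLUS_BAD };

static int parse_rdirplus(bool negated, const char *arg)
{
	if (negated)			/* "nordirplus" */
		return RDIRPLUS_NONE;
	if (!arg)			/* bare "rdirplus" */
		return RDIRPLUS_DEFAULT;
	if (!strcmp(arg, "none"))	/* "rdirplus=none" */
		return RDIRPLUS_NONE;
	if (!strcmp(arg, "force"))	/* "rdirplus=force" */
		return RDIRPLUS_FORCE;
	return RDIRPLUS_BAD;		/* -> out_invalid_value */
}

int main(void)
{
	printf("%d %d %d %d\n",
	       parse_rdirplus(true, NULL), parse_rdirplus(false, NULL),
	       parse_rdirplus(false, "none"), parse_rdirplus(false, "force"));
	return 0;
}

The force value pairs with the dir.c hunk earlier: NFS_MOUNT_FORCE_RDIRPLUS makes nfs_use_readdirplus() return true unconditionally instead of relying on the cache-usage heuristic.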
diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c
index 1aa67fca69b2..119e447758b9 100644
--- a/fs/nfs/inode.c
+++ b/fs/nfs/inode.c
@@ -74,6 +74,8 @@ nfs_fattr_to_ino_t(struct nfs_fattr *fattr)
 
 int nfs_wait_bit_killable(struct wait_bit_key *key, int mode)
 {
+	if (unlikely(nfs_current_task_exiting()))
+		return -EINTR;
 	schedule();
 	if (signal_pending_state(mode, current))
 		return -ERESTARTSYS;
diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h
index fae2c7ae4acc..6655e5f32ec6 100644
--- a/fs/nfs/internal.h
+++ b/fs/nfs/internal.h
@@ -400,8 +400,8 @@ struct dentry *nfs_lookup(struct inode *, struct dentry *, unsigned int);
 void nfs_d_prune_case_insensitive_aliases(struct inode *inode);
 int nfs_create(struct mnt_idmap *, struct inode *, struct dentry *,
	       umode_t, bool);
-int nfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *,
-	      umode_t);
+struct dentry *nfs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *,
+			 umode_t);
 int nfs_rmdir(struct inode *, struct dentry *);
 int nfs_unlink(struct inode *, struct dentry *);
 int nfs_symlink(struct mnt_idmap *, struct inode *, struct dentry *,
@@ -899,18 +899,16 @@ u64 nfs_timespec_to_change_attr(const struct timespec64 *ts)
 	return ((u64)ts->tv_sec << 30) + ts->tv_nsec;
 }
 
-#ifdef CONFIG_CRC32
 static inline u32 nfs_stateid_hash(const nfs4_stateid *stateid)
 {
 	return ~crc32_le(0xFFFFFFFF, &stateid->other[0],
			 NFS4_STATEID_OTHER_SIZE);
 }
-#else
-static inline u32 nfs_stateid_hash(nfs4_stateid *stateid)
+
+static inline bool nfs_current_task_exiting(void)
 {
-	return 0;
+	return (current->flags & PF_EXITING) != 0;
 }
-#endif
 
 static inline bool nfs_error_is_fatal(int err)
 {
diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c
index 5c21caeae075..4ec952f9f47d 100644
--- a/fs/nfs/localio.c
+++ b/fs/nfs/localio.c
@@ -278,6 +278,7 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
 		new = __nfs_local_open_fh(clp, cred, fh, nfl, mode);
 		if (IS_ERR(new))
 			return NULL;
+		rcu_read_lock();
 		/* try to swap in the pointer */
 		spin_lock(&clp->cl_uuid.lock);
 		nf = rcu_dereference_protected(*pnf, 1);
@@ -287,7 +288,6 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred,
 			rcu_assign_pointer(*pnf, nf);
 		}
 		spin_unlock(&clp->cl_uuid.lock);
-		rcu_read_lock();
 	}
 	nf = nfs_local_file_get(nf);
 	rcu_read_unlock();
diff --git a/fs/nfs/netns.h b/fs/nfs/netns.h
index a68b21603ea9..6ba3ea39e928 100644
--- a/fs/nfs/netns.h
+++ b/fs/nfs/netns.h
@@ -31,7 +31,11 @@ struct nfs_net {
 	unsigned short nfs_callback_tcpport;
 	unsigned short nfs_callback_tcpport6;
 	int cb_users[NFS4_MAX_MINOR_VERSION + 1];
-#endif
+#endif /* CONFIG_NFS_V4 */
+#if IS_ENABLED(CONFIG_NFS_V4_1)
+	struct list_head nfs4_data_server_cache;
+	spinlock_t nfs4_data_server_lock;
+#endif /* CONFIG_NFS_V4_1 */
 	struct nfs_netns_client *nfs_client;
 	spinlock_t nfs_client_lock;
 	ktime_t boot_time;
diff --git a/fs/nfs/nfs3acl.c b/fs/nfs/nfs3acl.c
index 18d8f6529f61..a126eb31f62f 100644
--- a/fs/nfs/nfs3acl.c
+++ b/fs/nfs/nfs3acl.c
@@ -104,7 +104,7 @@ struct posix_acl *nfs3_get_acl(struct inode *inode, int type, bool rcu)
 
 	switch (status) {
 		case 0:
-			status = nfs_refresh_inode(inode, res.fattr);
+			nfs_refresh_inode(inode, res.fattr);
 			break;
 		case -EPFNOSUPPORT:
 		case -EPROTONOSUPPORT:
diff --git a/fs/nfs/nfs3client.c b/fs/nfs/nfs3client.c
index b0c8a39c2bbd..0d7310c1ee0c 100644
--- a/fs/nfs/nfs3client.c
+++ b/fs/nfs/nfs3client.c
@@ -120,6 +120,8 @@ struct nfs_client *nfs3_set_ds_client(struct nfs_server *mds_srv,
 
 	if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
 		__set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+	if (test_bit(NFS_CS_NETUNREACH_FATAL, &mds_clp->cl_flags))
+		__set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init.init_flags);
 
 	__set_bit(NFS_CS_DS, &cl_init.init_flags);
 
diff --git a/fs/nfs/nfs3proc.c b/fs/nfs/nfs3proc.c
index 0c3bc98cd999..a4cb67573aa7 100644
--- a/fs/nfs/nfs3proc.c
+++ b/fs/nfs/nfs3proc.c
@@ -39,7 +39,7 @@ nfs3_rpc_wrapper(struct rpc_clnt *clnt, struct rpc_message *msg, int flags)
 		__set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
 		schedule_timeout(NFS_JUKEBOX_RETRY_TIME);
 		res = -ERESTARTSYS;
-	} while (!fatal_signal_pending(current));
+	} while (!fatal_signal_pending(current) && !nfs_current_task_exiting());
 	return res;
 }
 
@@ -578,13 +578,13 @@ out:
 	return status;
 }
 
-static int
+static struct dentry *
 nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
 {
 	struct posix_acl *default_acl, *acl;
 	struct nfs3_createdata *data;
-	struct dentry *d_alias;
-	int status = -ENOMEM;
+	struct dentry *ret = ERR_PTR(-ENOMEM);
 	int status;
 
 	dprintk("NFS call  mkdir %pd\n", dentry);
 
@@ -592,8 +593,9 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
 	if (data == NULL)
 		goto out;
 
-	status = posix_acl_create(dir, &sattr->ia_mode, &default_acl, &acl);
-	if (status)
+	ret = ERR_PTR(posix_acl_create(dir, &sattr->ia_mode,
+				       &default_acl, &acl));
+	if (IS_ERR(ret))
 		goto out;
 
 	data->msg.rpc_proc = &nfs3_procedures[NFS3PROC_MKDIR];
@@ -602,25 +603,27 @@ nfs3_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr)
 	data->arg.mkdir.len = dentry->d_name.len;
 	data->arg.mkdir.sattr = sattr;
 
-	d_alias = nfs3_do_create(dir, dentry, data);
-	status = PTR_ERR_OR_ZERO(d_alias);
+	ret = nfs3_do_create(dir, dentry, data);
 
-	if (status != 0)
+	if (IS_ERR(ret))
 		goto out_release_acls;
 
-	if (d_alias)
-		dentry = d_alias;
+	if (ret)
+		dentry = ret;
 
 	status = nfs3_proc_setacls(d_inode(dentry), acl, default_acl);
+	if (status) {
+		dput(ret);
+		ret = ERR_PTR(status);
+	}
 
-	dput(d_alias);
out_release_acls:
 	posix_acl_release(acl);
 	posix_acl_release(default_acl);
out:
 	nfs3_free_createdata(data);
-	dprintk("NFS reply mkdir: %d\n", status);
-	return status;
+	dprintk("NFS reply mkdir: %d\n", PTR_ERR_OR_ZERO(ret));
+	return ret;
 }
 
 static int
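The mkdir conversions above (and the matching ones in internal.h, proc.c and nfs4proc.c below) follow the VFS change in this series that makes ->mkdir() return a struct dentry * instead of an int: errors travel as ERR_PTR(err), NULL means "keep the dentry that was passed in", and a non-NULL pointer is an alternate dentry from nfs_add_or_obtain(). A self-contained illustration of the pointer-encoded error convention (userspace re-implementation of the kernel helpers, for clarity only):

#include <stdio.h>

/* Minimal userspace model of the kernel's ERR_PTR scheme: small
 * negative errnos are folded into the top of the address space. */
#define MAX_ERRNO 4095

static inline void *ERR_PTR(long err) { return (void *)err; }
static inline long PTR_ERR(const void *p) { return (long)p; }
static inline int IS_ERR(const void *p)
{
	return (unsigned long)p >= (unsigned long)-MAX_ERRNO;
}
static inline long PTR_ERR_OR_ZERO(const void *p)
{
	return IS_ERR(p) ? PTR_ERR(p) : 0;
}

int main(void)
{
	void *ok = "dentry", *err = ERR_PTR(-13 /* -EACCES */);

	printf("ok:  IS_ERR=%d status=%ld\n", IS_ERR(ok), PTR_ERR_OR_ZERO(ok));
	printf("err: IS_ERR=%d status=%ld\n", IS_ERR(err), PTR_ERR_OR_ZERO(err));
	return 0;
}

This is why trace_nfs_mkdir_exit() and the dprintk()s now report PTR_ERR_OR_ZERO(ret) rather than a plain int status.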
diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c
index 1924c4a2077b..5cf52ece96ac 100644
--- a/fs/nfs/nfs42proc.c
+++ b/fs/nfs/nfs42proc.c
@@ -21,6 +21,8 @@
 #define NFSDBG_FACILITY NFSDBG_PROC
 
 static int nfs42_do_offload_cancel_async(struct file *dst, nfs4_stateid *std);
+static int nfs42_proc_offload_status(struct file *file, nfs4_stateid *stateid,
+				     u64 *copied);
 
 static void nfs42_set_netaddr(struct file *filep, struct nfs42_netaddr *naddr)
 {
@@ -173,6 +175,20 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len)
 	return err;
 }
 
+static void nfs4_copy_dequeue_callback(struct nfs_server *dst_server,
+				       struct nfs_server *src_server,
+				       struct nfs4_copy_state *copy)
+{
+	spin_lock(&dst_server->nfs_client->cl_lock);
+	list_del_init(&copy->copies);
+	spin_unlock(&dst_server->nfs_client->cl_lock);
+	if (dst_server != src_server) {
+		spin_lock(&src_server->nfs_client->cl_lock);
+		list_del_init(&copy->src_copies);
+		spin_unlock(&src_server->nfs_client->cl_lock);
+	}
+}
+
 static int handle_async_copy(struct nfs42_copy_res *res,
			     struct nfs_server *dst_server,
			     struct nfs_server *src_server,
@@ -182,9 +198,12 @@ static int handle_async_copy(struct nfs42_copy_res *res,
			     bool *restart)
 {
 	struct nfs4_copy_state *copy, *tmp_copy = NULL, *iter;
-	int status = NFS4_OK;
 	struct nfs_open_context *dst_ctx = nfs_file_open_context(dst);
 	struct nfs_open_context *src_ctx = nfs_file_open_context(src);
+	struct nfs_client *clp = dst_server->nfs_client;
+	unsigned long timeout = 3 * HZ;
+	int status = NFS4_OK;
+	u64 copied;
 
 	copy = kzalloc(sizeof(struct nfs4_copy_state), GFP_KERNEL);
 	if (!copy)
@@ -222,15 +241,12 @@ static int handle_async_copy(struct nfs42_copy_res *res,
 		spin_unlock(&src_server->nfs_client->cl_lock);
 	}
 
-	status = wait_for_completion_interruptible(&copy->completion);
-	spin_lock(&dst_server->nfs_client->cl_lock);
-	list_del_init(&copy->copies);
-	spin_unlock(&dst_server->nfs_client->cl_lock);
-	if (dst_server != src_server) {
-		spin_lock(&src_server->nfs_client->cl_lock);
-		list_del_init(&copy->src_copies);
-		spin_unlock(&src_server->nfs_client->cl_lock);
-	}
+wait:
+	status = wait_for_completion_interruptible_timeout(&copy->completion,
							   timeout);
+	if (!status)
+		goto timeout;
+	nfs4_copy_dequeue_callback(dst_server, src_server, copy);
 	if (status == -ERESTARTSYS) {
 		goto out_cancel;
 	} else if (copy->flags || copy->error == NFS4ERR_PARTNER_NO_AUTH) {
@@ -240,6 +256,7 @@ static int handle_async_copy(struct nfs42_copy_res *res,
 	}
out:
 	res->write_res.count = copy->count;
+	/* Copy out the updated write verifier provided by CB_OFFLOAD. */
 	memcpy(&res->write_res.verifier, &copy->verf, sizeof(copy->verf));
 	status = -copy->error;
 
@@ -251,6 +268,39 @@ out_cancel:
 	if (!nfs42_files_from_same_server(src, dst))
 		nfs42_do_offload_cancel_async(src, src_stateid);
 	goto out_free;
+timeout:
+	timeout <<= 1;
+	if (timeout > (clp->cl_lease_time >> 1))
+		timeout = clp->cl_lease_time >> 1;
+	status = nfs42_proc_offload_status(dst, &copy->stateid, &copied);
+	if (status == -EINPROGRESS)
+		goto wait;
+	nfs4_copy_dequeue_callback(dst_server, src_server, copy);
+	switch (status) {
+	case 0:
+		/* The server recognized the copy stateid, so it hasn't
+		 * rebooted. Don't overwrite the verifier returned in the
+		 * COPY result. */
+		res->write_res.count = copied;
+		goto out_free;
+	case -EREMOTEIO:
+		/* COPY operation failed on the server. */
+		status = -EOPNOTSUPP;
+		res->write_res.count = copied;
+		goto out_free;
+	case -EBADF:
+		/* Server did not recognize the copy stateid. It has
+		 * probably restarted and lost the plot. */
+		res->write_res.count = 0;
+		status = -EOPNOTSUPP;
+		break;
+	case -EOPNOTSUPP:
+		/* RFC 7862 REQUIREs server to support OFFLOAD_STATUS when
+		 * it has signed up for an async COPY, so server is not
+		 * spec-compliant. */
+		res->write_res.count = 0;
+	}
+	goto out_free;
 }
 
 static int process_copy_commit(struct file *dst, loff_t pos_dst,
@@ -582,6 +632,108 @@ static int nfs42_do_offload_cancel_async(struct file *dst,
 	return status;
 }
 
+static int
+_nfs42_proc_offload_status(struct nfs_server *server, struct file *file,
+			   struct nfs42_offload_data *data)
+{
+	struct nfs_open_context *ctx = nfs_file_open_context(file);
+	struct rpc_message msg = {
+		.rpc_proc	= &nfs4_procedures[NFSPROC4_CLNT_OFFLOAD_STATUS],
+		.rpc_argp	= &data->args,
+		.rpc_resp	= &data->res,
+		.rpc_cred	= ctx->cred,
+	};
+	int status;
+
+	status = nfs4_call_sync(server->client, server, &msg,
+				&data->args.osa_seq_args,
+				&data->res.osr_seq_res, 1);
+	trace_nfs4_offload_status(&data->args, status);
+	switch (status) {
+	case 0:
+		break;
+
+	case -NFS4ERR_ADMIN_REVOKED:
+	case -NFS4ERR_BAD_STATEID:
+	case -NFS4ERR_OLD_STATEID:
+		/*
+		 * Server does not recognize the COPY stateid. CB_OFFLOAD
+		 * could have purged it, or server might have rebooted.
+		 * Since COPY stateids don't have an associated inode,
+		 * avoid triggering state recovery.
+		 */
+		status = -EBADF;
+		break;
+	case -NFS4ERR_NOTSUPP:
+	case -ENOTSUPP:
+	case -EOPNOTSUPP:
+		server->caps &= ~NFS_CAP_OFFLOAD_STATUS;
+		status = -EOPNOTSUPP;
+		break;
+	}
+
+	return status;
+}
+
+/**
+ * nfs42_proc_offload_status - Poll completion status of an async copy operation
+ * @dst: handle of file being copied into
+ * @stateid: copy stateid (from async COPY result)
+ * @copied: OUT: number of bytes copied so far
+ *
+ * Return values:
+ *   %0: Server returned an NFS4_OK completion status
+ *   %-EINPROGRESS: Server returned no completion status
+ *   %-EREMOTEIO: Server returned an error completion status
+ *   %-EBADF: Server did not recognize the copy stateid
+ *   %-EOPNOTSUPP: Server does not support OFFLOAD_STATUS
+ *   %-ERESTARTSYS: Wait interrupted by signal
+ *
+ * Other negative errnos indicate the client could not complete the
+ * request.
+ */
+static int
+nfs42_proc_offload_status(struct file *dst, nfs4_stateid *stateid, u64 *copied)
+{
+	struct inode *inode = file_inode(dst);
+	struct nfs_server *server = NFS_SERVER(inode);
+	struct nfs4_exception exception = {
+		.inode = inode,
+	};
+	struct nfs42_offload_data *data;
+	int status;
+
+	if (!(server->caps & NFS_CAP_OFFLOAD_STATUS))
+		return -EOPNOTSUPP;
+
+	data = kzalloc(sizeof(*data), GFP_KERNEL);
+	if (!data)
+		return -ENOMEM;
+	data->seq_server = server;
+	data->args.osa_src_fh = NFS_FH(inode);
+	memcpy(&data->args.osa_stateid, stateid,
+	       sizeof(data->args.osa_stateid));
+	exception.stateid = &data->args.osa_stateid;
+	do {
+		status = _nfs42_proc_offload_status(server, dst, data);
+		if (status == -EOPNOTSUPP)
+			goto out;
+		status = nfs4_handle_exception(server, status, &exception);
+	} while (exception.retry);
+	if (status)
+		goto out;
+
+	*copied = data->res.osr_count;
+	if (!data->res.complete_count)
+		status = -EINPROGRESS;
+	else if (data->res.osr_complete != NFS_OK)
+		status = -EREMOTEIO;
+
+out:
+	kfree(data);
+	return status;
+}
+
 static int _nfs42_proc_copy_notify(struct file *src, struct file *dst,
				   struct nfs42_copy_notify_args *args,
				   struct nfs42_copy_notify_res *res)
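handle_async_copy() above no longer blocks indefinitely waiting for CB_OFFLOAD: it waits 3 seconds, probes with OFFLOAD_STATUS on timeout, then doubles the wait each round while capping it at half the lease time (so a live server is always probed at least twice per lease). The backoff arithmetic, simulated standalone (HZ and the lease value here are illustrative):

#include <stdio.h>

int main(void)
{
	const unsigned long HZ = 1000;	/* illustrative tick rate */
	unsigned long lease = 90 * HZ;	/* cl_lease_time, example value */
	unsigned long timeout = 3 * HZ;	/* initial wait, as in the patch */

	for (int round = 0; round < 8; round++) {
		printf("round %d: wait %lu ticks\n", round, timeout);
		timeout <<= 1;
		if (timeout > (lease >> 1))	/* cap at half the lease */
			timeout = lease >> 1;
	}
	return 0;
}

Each timeout round re-checks the copy stateid on the server: -EINPROGRESS loops back to the wait, a recognized stateid preserves the verifier from the COPY reply, and an unrecognized stateid or missing OFFLOAD_STATUS support falls back to -EOPNOTSUPP so the caller can retry with a traditional copy.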
diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c
index 5072d7ea72e9..b1b663468249 100644
--- a/fs/nfs/nfs42xdr.c
+++ b/fs/nfs/nfs42xdr.c
@@ -35,6 +35,11 @@
 #define encode_offload_cancel_maxsz	(op_encode_hdr_maxsz + \
					 XDR_QUADLEN(NFS4_STATEID_SIZE))
 #define decode_offload_cancel_maxsz	(op_decode_hdr_maxsz)
+#define encode_offload_status_maxsz	(op_encode_hdr_maxsz + \
+					 XDR_QUADLEN(NFS4_STATEID_SIZE))
+#define decode_offload_status_maxsz	(op_decode_hdr_maxsz + \
+					 2 /* osr_count */ + \
+					 2 /* osr_complete */)
 #define encode_copy_notify_maxsz	(op_encode_hdr_maxsz + \
					 XDR_QUADLEN(NFS4_STATEID_SIZE) + \
					 1 + /* nl4_type */ \
@@ -143,6 +148,14 @@
					 decode_sequence_maxsz + \
					 decode_putfh_maxsz + \
					 decode_offload_cancel_maxsz)
+#define NFS4_enc_offload_status_sz	(compound_encode_hdr_maxsz + \
+					 encode_sequence_maxsz + \
+					 encode_putfh_maxsz + \
+					 encode_offload_status_maxsz)
+#define NFS4_dec_offload_status_sz	(compound_decode_hdr_maxsz + \
+					 decode_sequence_maxsz + \
+					 decode_putfh_maxsz + \
+					 decode_offload_status_maxsz)
 #define NFS4_enc_copy_notify_sz		(compound_encode_hdr_maxsz + \
					 encode_sequence_maxsz + \
					 encode_putfh_maxsz + \
@@ -345,6 +358,14 @@ static void encode_offload_cancel(struct xdr_stream *xdr,
 	encode_nfs4_stateid(xdr, &args->osa_stateid);
 }
 
+static void encode_offload_status(struct xdr_stream *xdr,
+				  const struct nfs42_offload_status_args *args,
+				  struct compound_hdr *hdr)
+{
+	encode_op_hdr(xdr, OP_OFFLOAD_STATUS, decode_offload_status_maxsz, hdr);
+	encode_nfs4_stateid(xdr, &args->osa_stateid);
+}
+
 static void encode_copy_notify(struct xdr_stream *xdr,
			       const struct nfs42_copy_notify_args *args,
			       struct compound_hdr *hdr)
@@ -570,6 +591,25 @@ static void nfs4_xdr_enc_offload_cancel(struct rpc_rqst *req,
 }
 
 /*
+ * Encode OFFLOAD_STATUS request
+ */
+static void nfs4_xdr_enc_offload_status(struct rpc_rqst *req,
+					struct xdr_stream *xdr,
+					const void *data)
+{
+	const struct nfs42_offload_status_args *args = data;
+	struct compound_hdr hdr = {
+		.minorversion = nfs4_xdr_minorversion(&args->osa_seq_args),
+	};
+
+	encode_compound_hdr(xdr, req, &hdr);
+	encode_sequence(xdr, &args->osa_seq_args, &hdr);
+	encode_putfh(xdr, args->osa_src_fh, &hdr);
+	encode_offload_status(xdr, args, &hdr);
+	encode_nops(&hdr);
+}
+
+/*
 * Encode COPY_NOTIFY request
 */
 static void nfs4_xdr_enc_copy_notify(struct rpc_rqst *req,
@@ -921,6 +961,26 @@ static int decode_offload_cancel(struct xdr_stream *xdr,
 	return decode_op_hdr(xdr, OP_OFFLOAD_CANCEL);
 }
 
+static int decode_offload_status(struct xdr_stream *xdr,
+				 struct nfs42_offload_status_res *res)
+{
+	ssize_t result;
+	int status;
+
+	status = decode_op_hdr(xdr, OP_OFFLOAD_STATUS);
+	if (status)
+		return status;
+	/* osr_count */
+	if (xdr_stream_decode_u64(xdr, &res->osr_count) < 0)
+		return -EIO;
+	/* osr_complete<1> */
+	result = xdr_stream_decode_uint32_array(xdr, &res->osr_complete, 1);
+	if (result < 0)
+		return -EIO;
+	res->complete_count = result;
+	return 0;
+}
+
 static int decode_copy_notify(struct xdr_stream *xdr,
			      struct nfs42_copy_notify_res *res)
 {
@@ -1371,6 +1431,32 @@ out:
 }
 
 /*
+ * Decode OFFLOAD_STATUS response
+ */
+static int nfs4_xdr_dec_offload_status(struct rpc_rqst *rqstp,
+				       struct xdr_stream *xdr,
+				       void *data)
+{
+	struct nfs42_offload_status_res *res = data;
+	struct compound_hdr hdr;
+	int status;
+
+	status = decode_compound_hdr(xdr, &hdr);
+	if (status)
+		goto out;
+	status = decode_sequence(xdr, &res->osr_seq_res, rqstp);
+	if (status)
+		goto out;
+	status = decode_putfh(xdr);
+	if (status)
+		goto out;
+	status = decode_offload_status(xdr, res);
+
+out:
+	return status;
+}
+
+/*
 * Decode COPY_NOTIFY response
 */
 static int nfs4_xdr_dec_copy_notify(struct rpc_rqst *rqstp,
				    struct xdr_stream *xdr,
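On the wire the operation is tiny: the request carries just the copy stateid after the op number, and the OFFLOAD_STATUS4resok body is a 64-bit osr_count followed by osr_complete<1>, an XDR array of at most one status word whose presence is what distinguishes "still running" from "complete" (hence res->complete_count above). A hedged sketch of a matching decoder over a raw big-endian buffer, not the kernel's xdr_stream API:

#include <arpa/inet.h>
#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Decode osr_count (u64) + osr_complete<1> from an XDR byte stream. */
static int decode_offload_status_body(const uint8_t *p, size_t len,
				      uint64_t *count, int *complete,
				      uint32_t *status)
{
	uint32_t hi, lo, n;

	if (len < 12)
		return -1;
	memcpy(&hi, p, 4); memcpy(&lo, p + 4, 4); memcpy(&n, p + 8, 4);
	*count = ((uint64_t)ntohl(hi) << 32) | ntohl(lo);
	n = ntohl(n);
	if (n > 1 || (n == 1 && len < 16))
		return -1;		/* array may hold 0 or 1 entries */
	*complete = n;
	if (n == 1) {
		memcpy(&lo, p + 12, 4);
		*status = ntohl(lo);	/* NFS4_OK or an NFS4ERR_* value */
	}
	return 0;
}

int main(void)
{
	/* 42 bytes copied, complete, status NFS4_OK (0). */
	uint8_t buf[16] = { 0,0,0,0, 0,0,0,42, 0,0,0,1, 0,0,0,0 };
	uint64_t count; int complete; uint32_t status = 0;

	if (!decode_offload_status_body(buf, sizeof(buf), &count, &complete, &status))
		printf("count=%llu complete=%d status=%u\n",
		       (unsigned long long)count, complete, status);
	return 0;
}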
diff --git a/fs/nfs/nfs4client.c b/fs/nfs/nfs4client.c
index 83378f69b35e..162c85a83a14 100644
--- a/fs/nfs/nfs4client.c
+++ b/fs/nfs/nfs4client.c
@@ -233,6 +233,8 @@ struct nfs_client *nfs4_alloc_client(const struct nfs_client_initdata *cl_init)
 		__set_bit(NFS_CS_NO_RETRANS_TIMEOUT, &clp->cl_flags);
 	if (test_bit(NFS_CS_PNFS, &cl_init->init_flags))
 		__set_bit(NFS_CS_PNFS, &clp->cl_flags);
+	if (test_bit(NFS_CS_NETUNREACH_FATAL, &cl_init->init_flags))
+		__set_bit(NFS_CS_NETUNREACH_FATAL, &clp->cl_flags);
 	/*
	 * Set up the connection to the server before we add add to the
	 * global list.
@@ -937,6 +939,9 @@ static int nfs4_set_client(struct nfs_server *server,
 		__set_bit(NFS_CS_TSM_POSSIBLE, &cl_init.init_flags);
 	server->port = rpc_get_port((struct sockaddr *)addr);
 
+	if (server->flags & NFS_MOUNT_NETUNREACH_FATAL)
+		__set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init.init_flags);
+
 	/* Allocate or find a client reference we can use */
 	clp = nfs_get_client(&cl_init);
 	if (IS_ERR(clp))
@@ -1011,6 +1016,8 @@ struct nfs_client *nfs4_set_ds_client(struct nfs_server *mds_srv,
 
 	if (mds_srv->flags & NFS_MOUNT_NORESVPORT)
 		__set_bit(NFS_CS_NORESVPORT, &cl_init.init_flags);
+	if (test_bit(NFS_CS_NETUNREACH_FATAL, &mds_clp->cl_flags))
+		__set_bit(NFS_CS_NETUNREACH_FATAL, &cl_init.init_flags);
 
 	__set_bit(NFS_CS_PNFS, &cl_init.init_flags);
 	cl_init.max_connect = NFS_MAX_TRANSPORTS;
diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c
index df9669d4ded7..b1d2122bd5a7 100644
--- a/fs/nfs/nfs4proc.c
+++ b/fs/nfs/nfs4proc.c
@@ -133,6 +133,7 @@ nfs4_label_init_security(struct inode *dir, struct dentry *dentry,
 	if (err)
 		return NULL;
 
+	label->lsmid = shim.id;
 	label->label = shim.context;
 	label->len = shim.len;
 	return label;
@@ -145,7 +146,7 @@ nfs4_label_release_security(struct nfs4_label *label)
 	if (label) {
 		shim.context = label->label;
 		shim.len = label->len;
-		shim.id = LSM_ID_UNDEF;
+		shim.id = label->lsmid;
 		security_release_secctx(&shim);
 	}
 }
@@ -194,6 +195,9 @@ static int nfs4_map_errors(int err)
 		return -EBUSY;
 	case -NFS4ERR_NOT_SAME:
 		return -ENOTSYNC;
+	case -ENETDOWN:
+	case -ENETUNREACH:
+		break;
 	default:
 		dprintk("%s could not handle NFSv4 error %d\n",
				__func__, -err);
@@ -442,6 +446,8 @@ static int nfs4_delay_killable(long *timeout)
 {
 	might_sleep();
 
+	if (unlikely(nfs_current_task_exiting()))
+		return -EINTR;
 	__set_current_state(TASK_KILLABLE|TASK_FREEZABLE_UNSAFE);
 	schedule_timeout(nfs4_update_delay(timeout));
 	if (!__fatal_signal_pending(current))
@@ -453,6 +459,8 @@ static int nfs4_delay_interruptible(long *timeout)
 {
 	might_sleep();
 
+	if (unlikely(nfs_current_task_exiting()))
+		return -EINTR;
 	__set_current_state(TASK_INTERRUPTIBLE|TASK_FREEZABLE_UNSAFE);
 	schedule_timeout(nfs4_update_delay(timeout));
 	if (!signal_pending(current))
@@ -663,6 +671,15 @@ nfs4_async_handle_exception(struct rpc_task *task, struct nfs_server *server,
 	struct nfs_client *clp = server->nfs_client;
 	int ret;
 
+	if ((task->tk_rpc_status == -ENETDOWN ||
+	     task->tk_rpc_status == -ENETUNREACH) &&
+	    task->tk_flags & RPC_TASK_NETUNREACH_FATAL) {
+		exception->delay = 0;
+		exception->recovering = 0;
+		exception->retry = 0;
+		return -EIO;
+	}
+
 	ret = nfs4_do_handle_exception(server, errorcode, exception);
 	if (exception->delay) {
 		int ret2 = nfs4_exception_should_retrans(server, exception);
@@ -1773,7 +1790,8 @@ static void nfs_set_open_stateid_locked(struct nfs4_state *state,
 		rcu_read_unlock();
 		trace_nfs4_open_stateid_update_wait(state->inode, stateid, 0);
 
-		if (!fatal_signal_pending(current)) {
+		if (!fatal_signal_pending(current) &&
+		    !nfs_current_task_exiting()) {
 			if (schedule_timeout(5*HZ) == 0)
 				status = -EAGAIN;
 			else
@@ -3153,9 +3171,7 @@ static int _nfs4_open_and_get_state(struct nfs4_opendata *opendata,
 	if (d_really_is_negative(dentry)) {
 		struct dentry *alias;
 		d_drop(dentry);
-		alias = d_exact_alias(dentry, state->inode);
-		if (!alias)
-			alias = d_splice_alias(igrab(state->inode), dentry);
+		alias = d_splice_alias(igrab(state->inode), dentry);
 		/* d_splice_alias() can't fail here - it's a non-directory */
 		if (alias) {
 			dput(ctx->dentry);
@@ -3577,7 +3593,7 @@ static bool nfs4_refresh_open_old_stateid(nfs4_stateid *dst,
 		write_sequnlock(&state->seqlock);
 		trace_nfs4_close_stateid_update_wait(state->inode, dst, 0);
 
-		if (fatal_signal_pending(current))
+		if (fatal_signal_pending(current) || nfs_current_task_exiting())
 			status = -EINTR;
 		else if (schedule_timeout(5*HZ) != 0)
@@ -3906,8 +3922,11 @@ nfs4_atomic_open(struct inode *dir, struct nfs_open_context *ctx,
 
 static void nfs4_close_context(struct nfs_open_context *ctx, int is_sync)
 {
+	struct dentry *dentry = ctx->dentry;
 	if (ctx->state == NULL)
 		return;
+	if (dentry->d_flags & DCACHE_NFSFS_RENAMED)
+		nfs4_inode_set_return_delegation_on_close(d_inode(dentry));
 	if (is_sync)
 		nfs4_close_sync(ctx->state, _nfs4_ctx_to_openmode(ctx));
 	else
@@ -5135,9 +5154,6 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
			      &data->arg.seq_args, &data->res.seq_res, 1);
 	if (status == 0) {
 		spin_lock(&dir->i_lock);
-		/* Creating a directory bumps nlink in the parent */
-		if (data->arg.ftype == NF4DIR)
-			nfs4_inc_nlink_locked(dir);
 		nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo,
					      data->res.fattr->time_start,
					      NFS_INO_INVALID_DATA);
@@ -5147,6 +5163,25 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_createdata *data)
 	return status;
 }
 
+static struct dentry *nfs4_do_mkdir(struct inode *dir, struct dentry *dentry,
+				    struct nfs4_createdata *data)
+{
+	int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg,
+				    &data->arg.seq_args, &data->res.seq_res, 1);
+
+	if (status)
+		return ERR_PTR(status);
+
+	spin_lock(&dir->i_lock);
+	/* Creating a directory bumps nlink in the parent */
+	nfs4_inc_nlink_locked(dir);
+	nfs4_update_changeattr_locked(dir, &data->res.dir_cinfo,
+				      data->res.fattr->time_start,
+				      NFS_INO_INVALID_DATA);
+	spin_unlock(&dir->i_lock);
+	return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr);
+}
+
 static void nfs4_free_createdata(struct nfs4_createdata *data)
 {
 	nfs4_label_free(data->fattr.label);
@@ -5203,32 +5238,34 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry,
 	return err;
 }
 
-static int _nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
-		struct iattr *sattr, struct nfs4_label *label)
+static struct dentry *_nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
+				       struct iattr *sattr,
+				       struct nfs4_label *label)
 {
 	struct nfs4_createdata *data;
-	int status = -ENOMEM;
+	struct dentry *ret = ERR_PTR(-ENOMEM);
 
 	data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR);
 	if (data == NULL)
 		goto out;
 
 	data->arg.label = label;
-	status = nfs4_do_create(dir, dentry, data);
+	ret = nfs4_do_mkdir(dir, dentry, data);
 
 	nfs4_free_createdata(data);
out:
-	return status;
+	return ret;
 }
 
-static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
-		struct iattr *sattr)
+static struct dentry *nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
+				      struct iattr *sattr)
 {
 	struct nfs_server *server = NFS_SERVER(dir);
 	struct nfs4_exception exception = {
 		.interruptible = true,
 	};
 	struct nfs4_label l, *label;
+	struct dentry *alias;
 	int err;
 
 	label = nfs4_label_init_security(dir, dentry, sattr, &l);
@@ -5236,14 +5273,15 @@ static int nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry,
 	if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK))
 		sattr->ia_mode &= ~current_umask();
 	do {
-		err = _nfs4_proc_mkdir(dir, dentry, sattr, label);
+		alias = _nfs4_proc_mkdir(dir, dentry, sattr, label);
+		err = PTR_ERR_OR_ZERO(alias);
 		trace_nfs4_mkdir(dir, &dentry->d_name, err);
 		err = nfs4_handle_exception(NFS_SERVER(dir), err,
				&exception);
 	} while (exception.retry);
 	nfs4_label_release_security(label);
 
-	return err;
+	return alias;
 }
 
 static int _nfs4_proc_readdir(struct nfs_readdir_arg *nr_arg,
@@ -6269,7 +6307,7 @@ static int _nfs4_get_security_label(struct inode *inode, void *buf,
					size_t buflen)
 {
 	struct nfs_server *server = NFS_SERVER(inode);
-	struct nfs4_label label = {0, 0, buflen, buf};
+	struct nfs4_label label = {0, 0, 0, buflen, buf};
 
 	u32 bitmask[3] = { 0, 0, FATTR4_WORD2_SECURITY_LABEL };
 	struct nfs_fattr fattr = {
@@ -6374,7 +6412,7 @@ static int nfs4_do_set_security_label(struct inode *inode,
 static int
 nfs4_set_security_label(struct inode *inode, const void *buf, size_t buflen)
 {
-	struct nfs4_label ilabel = {0, 0, buflen, (char *)buf };
+	struct nfs4_label ilabel = {0, 0, 0, buflen, (char *)buf };
 	struct nfs_fattr *fattr;
 	int status;
 
@@ -7045,10 +7083,18 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
 	struct nfs4_unlockdata *p;
 	struct nfs4_state *state = lsp->ls_state;
 	struct inode *inode = state->inode;
+	struct nfs_lock_context *l_ctx;
 
 	p = kzalloc(sizeof(*p), GFP_KERNEL);
 	if (p == NULL)
 		return NULL;
+	l_ctx = nfs_get_lock_context(ctx);
+	if (!IS_ERR(l_ctx)) {
+		p->l_ctx = l_ctx;
+	} else {
+		kfree(p);
+		return NULL;
+	}
 	p->arg.fh = NFS_FH(inode);
 	p->arg.fl = &p->fl;
 	p->arg.seqid = seqid;
@@ -7056,7 +7102,6 @@ static struct nfs4_unlockdata *nfs4_alloc_unlockdata(struct file_lock *fl,
 	p->lsp = lsp;
 	/* Ensure we don't close file until we're done freeing locks! */
 	p->ctx = get_nfs_open_context(ctx);
-	p->l_ctx = nfs_get_lock_context(ctx);
 	locks_init_lock(&p->fl);
 	locks_copy_lock(&p->fl, fl);
 	p->server = NFS_SERVER(inode);
@@ -9573,7 +9618,7 @@ static void nfs41_sequence_call_done(struct rpc_task *task, void *data)
 		return;
 
 	trace_nfs4_sequence(clp, task->tk_status);
-	if (task->tk_status < 0 && !task->tk_client->cl_shutdown) {
+	if (task->tk_status < 0 && clp->cl_cons_state >= 0) {
 		dprintk("%s ERROR %d\n", __func__, task->tk_status);
 		if (refcount_read(&clp->cl_count) == 1)
 			return;
@@ -10777,7 +10822,8 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = {
		| NFS_CAP_CLONE
		| NFS_CAP_LAYOUTERROR
		| NFS_CAP_READ_PLUS
-		| NFS_CAP_MOVEABLE,
+		| NFS_CAP_MOVEABLE
+		| NFS_CAP_OFFLOAD_STATUS,
 	.init_client = nfs41_init_client,
 	.shutdown_client = nfs41_shutdown_client,
 	.match_stateid = nfs41_match_stateid,
diff --git a/fs/nfs/nfs4session.h b/fs/nfs/nfs4session.h
index 351616c61df5..f9c291e2165c 100644
--- a/fs/nfs/nfs4session.h
+++ b/fs/nfs/nfs4session.h
@@ -148,16 +148,12 @@ static inline void nfs4_copy_sessionid(struct nfs4_sessionid *dst,
 	memcpy(dst->data, src->data, NFS4_MAX_SESSIONID_LEN);
 }
 
-#ifdef CONFIG_CRC32
 /*
 * nfs_session_id_hash - calculate the crc32 hash for the session id
 * @session - pointer to session
 */
 #define nfs_session_id_hash(sess_id) \
	(~crc32_le(0xFFFFFFFF, &(sess_id)->data[0], sizeof((sess_id)->data)))
-#else
-#define nfs_session_id_hash(session) (0)
-#endif
 #else /* defined(CONFIG_NFS_V4_1) */
 
 static inline int nfs4_init_session(struct nfs_client *clp)
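With CRC32 now selected unconditionally by NFS_FS (and LOCKD) in the Kconfig hunks earlier, the !CONFIG_CRC32 stub branches for nfs_stateid_hash() and nfs_session_id_hash() above could simply be deleted; both hashes are tracing/debugging aids, not protocol state. A standalone equivalent of ~crc32_le(0xFFFFFFFF, buf, len), using the reflected IEEE polynomial that crc32_le() implements (bitwise rather than table-driven, for brevity):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

/* Bitwise CRC32 (IEEE, reflected, poly 0xEDB88320), matching crc32_le(). */
static uint32_t crc32_le(uint32_t crc, const uint8_t *p, size_t len)
{
	while (len--) {
		crc ^= *p++;
		for (int k = 0; k < 8; k++)
			crc = (crc >> 1) ^ (0xEDB88320u & -(crc & 1));
	}
	return crc;
}

int main(void)
{
	uint8_t other[12];	/* NFS4_STATEID_OTHER_SIZE is 12 */

	memset(other, 0xab, sizeof(other));
	/* Same shape as nfs_stateid_hash()/nfs_session_id_hash(). */
	printf("hash=0x%08x\n", ~crc32_le(0xFFFFFFFF, other, sizeof(other)));
	return 0;
}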
diff --git a/fs/nfs/nfs4state.c b/fs/nfs/nfs4state.c
index 542cdf71229f..7612e977e80b 100644
--- a/fs/nfs/nfs4state.c
+++ b/fs/nfs/nfs4state.c
@@ -1198,7 +1198,7 @@ void nfs4_schedule_state_manager(struct nfs_client *clp)
 	struct rpc_clnt *clnt = clp->cl_rpcclient;
 	bool swapon = false;
 
-	if (clnt->cl_shutdown)
+	if (clp->cl_cons_state < 0)
 		return;
 
 	set_bit(NFS4CLNT_RUN_MANAGER, &clp->cl_state);
@@ -1403,7 +1403,7 @@ int nfs4_schedule_stateid_recovery(const struct nfs_server *server, struct nfs4_state *state)
 	dprintk("%s: scheduling stateid recovery for server %s\n", __func__,
			clp->cl_hostname);
 	nfs4_schedule_state_manager(clp);
-	return 0;
+	return clp->cl_cons_state < 0 ? clp->cl_cons_state : 0;
 }
 EXPORT_SYMBOL_GPL(nfs4_schedule_stateid_recovery);
 
@@ -2739,7 +2739,15 @@ out_error:
 	pr_warn_ratelimited("NFS: state manager%s%s failed on NFSv4 server %s"
			" with error %d\n", section_sep, section,
			clp->cl_hostname, -status);
-	ssleep(1);
+	switch (status) {
+	case -ENETDOWN:
+	case -ENETUNREACH:
+		nfs_mark_client_ready(clp, -EIO);
+		break;
+	default:
+		ssleep(1);
+		break;
+	}
out_drain:
 	memalloc_nofs_restore(memflags);
 	nfs4_end_drain_session(clp);
diff --git a/fs/nfs/nfs4trace.h b/fs/nfs/nfs4trace.h
index 22c973316f0b..deab4c0e21a0 100644
--- a/fs/nfs/nfs4trace.h
+++ b/fs/nfs/nfs4trace.h
@@ -2051,13 +2051,15 @@ TRACE_EVENT(fl_getdevinfo,
 
 DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,
		TP_PROTO(
-			const struct nfs_pgio_header *hdr
+			const struct nfs_pgio_header *hdr,
+			int error
		),
 
-		TP_ARGS(hdr),
+		TP_ARGS(hdr, error),
 
		TP_STRUCT__entry(
			__field(unsigned long, error)
+			__field(unsigned long, nfs_error)
			__field(dev_t, dev)
			__field(u32, fhandle)
			__field(u64, fileid)
@@ -2073,7 +2075,8 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,
		TP_fast_assign(
			const struct inode *inode = hdr->inode;
 
-			__entry->error = hdr->res.op_status;
+			__entry->error = -error;
+			__entry->nfs_error = hdr->res.op_status;
			__entry->fhandle = nfs_fhandle_hash(hdr->args.fh);
			__entry->fileid = NFS_FILEID(inode);
			__entry->dev = inode->i_sb->s_dev;
@@ -2088,7 +2091,8 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,
 
		TP_printk(
			"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
-			"offset=%llu count=%u stateid=%d:0x%08x dstaddr=%s",
+			"offset=%llu count=%u stateid=%d:0x%08x dstaddr=%s "
+			"nfs_error=%lu (%s)",
			-__entry->error,
			show_nfs4_status(__entry->error),
			MAJOR(__entry->dev), MINOR(__entry->dev),
@@ -2096,28 +2100,32 @@ DECLARE_EVENT_CLASS(nfs4_flexfiles_io_event,
			__entry->fhandle,
			__entry->offset, __entry->count,
			__entry->stateid_seq, __entry->stateid_hash,
-			__get_str(dstaddr)
+			__get_str(dstaddr), __entry->nfs_error,
+			show_nfs4_status(__entry->nfs_error)
		)
 );
 
 #define DEFINE_NFS4_FLEXFILES_IO_EVENT(name) \
	DEFINE_EVENT(nfs4_flexfiles_io_event, name, \
			TP_PROTO( \
-				const struct nfs_pgio_header *hdr \
+				const struct nfs_pgio_header *hdr, \
+				int error \
			), \
-			TP_ARGS(hdr))
+			TP_ARGS(hdr, error))
 DEFINE_NFS4_FLEXFILES_IO_EVENT(ff_layout_read_error);
 DEFINE_NFS4_FLEXFILES_IO_EVENT(ff_layout_write_error);
 
 TRACE_EVENT(ff_layout_commit_error,
		TP_PROTO(
-			const struct nfs_commit_data *data
+			const struct nfs_commit_data *data,
+			int error
		),
 
-		TP_ARGS(data),
+		TP_ARGS(data, error),
 
		TP_STRUCT__entry(
			__field(unsigned long, error)
+			__field(unsigned long, nfs_error)
			__field(dev_t, dev)
			__field(u32, fhandle)
			__field(u64, fileid)
@@ -2131,7 +2139,8 @@ TRACE_EVENT(ff_layout_commit_error,
		TP_fast_assign(
			const struct inode *inode = data->inode;
 
-			__entry->error = data->res.op_status;
+			__entry->error = -error;
+			__entry->nfs_error = data->res.op_status;
			__entry->fhandle = nfs_fhandle_hash(data->args.fh);
			__entry->fileid = NFS_FILEID(inode);
			__entry->dev = inode->i_sb->s_dev;
@@ -2142,14 +2151,15 @@ TRACE_EVENT(ff_layout_commit_error,
 
		TP_printk(
			"error=%ld (%s) fileid=%02x:%02x:%llu fhandle=0x%08x "
-			"offset=%llu count=%u dstaddr=%s",
+			"offset=%llu count=%u dstaddr=%s nfs_error=%lu (%s)",
			-__entry->error,
			show_nfs4_status(__entry->error),
			MAJOR(__entry->dev), MINOR(__entry->dev),
			(unsigned long long)__entry->fileid,
			__entry->fhandle,
			__entry->offset, __entry->count,
-			__get_str(dstaddr)
+			__get_str(dstaddr), __entry->nfs_error,
+			show_nfs4_status(__entry->nfs_error)
		)
 );
 
@@ -2608,7 +2618,7 @@ TRACE_EVENT(nfs4_copy_notify,
	)
 );
 
-TRACE_EVENT(nfs4_offload_cancel,
+DECLARE_EVENT_CLASS(nfs4_offload_class,
		TP_PROTO(
			const struct nfs42_offload_status_args *args,
			int error
@@ -2640,6 +2650,15 @@ DECLARE_EVENT_CLASS(nfs4_offload_class,
			__entry->stateid_seq, __entry->stateid_hash
		)
 );
+#define DEFINE_NFS4_OFFLOAD_EVENT(name) \
+	DEFINE_EVENT(nfs4_offload_class, name, \
+			TP_PROTO( \
+				const struct nfs42_offload_status_args *args, \
+				int error \
+			), \
+			TP_ARGS(args, error))
+DEFINE_NFS4_OFFLOAD_EVENT(nfs4_offload_cancel);
+DEFINE_NFS4_OFFLOAD_EVENT(nfs4_offload_status);
 
 DECLARE_EVENT_CLASS(nfs4_xattr_event,
		TP_PROTO(
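The trace changes above thread the RPC-level status into the flexfiles events, so each record now reports both error (the task status) and nfs_error (the NFS4 op status), and nfs4_offload_cancel is generalized into an event class that nfs4_offload_status reuses. Adding another event of the same shape costs one line, and call sites pass the status explicitly; a sketch of the pattern (mirroring the hunks above, not new API):

/* One line per additional event sharing the class: */
DEFINE_NFS4_OFFLOAD_EVENT(nfs4_offload_status);

/* ...and each call site supplies the RPC status alongside the args: */
trace_ff_layout_read_error(hdr, task->tk_status);
trace_nfs4_offload_status(&data->args, status);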
diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c
index e8ac3f615f93..55bef5fbfa47 100644
--- a/fs/nfs/nfs4xdr.c
+++ b/fs/nfs/nfs4xdr.c
@@ -82,9 +82,8 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
 * we currently use size 2 (u64) out of (NFS4_OPAQUE_LIMIT >> 2)
 */
 #define pagepad_maxsz		(1)
-#define open_owner_id_maxsz	(1 + 2 + 1 + 1 + 2)
-#define lock_owner_id_maxsz	(1 + 1 + 4)
-#define decode_lockowner_maxsz	(1 + XDR_QUADLEN(IDMAP_NAMESZ))
+#define open_owner_id_maxsz	(2 + 1 + 2 + 2)
+#define lock_owner_id_maxsz	(2 + 1 + 2)
 #define compound_encode_hdr_maxsz	(3 + (NFS4_MAXTAGLEN >> 2))
 #define compound_decode_hdr_maxsz	(3 + (NFS4_MAXTAGLEN >> 2))
 #define op_encode_hdr_maxsz	(1)
@@ -185,7 +184,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
 #define encode_claim_null_maxsz	(1 + nfs4_name_maxsz)
 #define encode_open_maxsz	(op_encode_hdr_maxsz + \
				 2 + encode_share_access_maxsz + 2 + \
-				 open_owner_id_maxsz + \
+				 1 + open_owner_id_maxsz + \
				 encode_opentype_maxsz + \
				 encode_claim_null_maxsz)
 #define decode_space_limit_maxsz	(3)
@@ -255,13 +254,14 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
 #define encode_link_maxsz	(op_encode_hdr_maxsz + \
				 nfs4_name_maxsz)
 #define decode_link_maxsz	(op_decode_hdr_maxsz + decode_change_info_maxsz)
-#define encode_lockowner_maxsz	(7)
+#define encode_lockowner_maxsz	(2 + 1 + lock_owner_id_maxsz)
+
 #define encode_lock_maxsz	(op_encode_hdr_maxsz + \
				 7 + \
				 1 + encode_stateid_maxsz + 1 + \
				 encode_lockowner_maxsz)
 #define decode_lock_denied_maxsz \
-				(8 + decode_lockowner_maxsz)
+				(2 + 2 + 1 + 2 + 1 + lock_owner_id_maxsz)
 #define decode_lock_maxsz	(op_decode_hdr_maxsz + \
				 decode_lock_denied_maxsz)
 #define encode_lockt_maxsz	(op_encode_hdr_maxsz + 5 + \
@@ -617,7 +617,7 @@ static int decode_layoutget(struct xdr_stream *xdr, struct rpc_rqst *req,
				 encode_lockowner_maxsz)
 #define NFS4_dec_release_lockowner_sz \
				(compound_decode_hdr_maxsz + \
-				 decode_lockowner_maxsz)
+				 decode_release_lockowner_maxsz)
 #define NFS4_enc_access_sz	(compound_encode_hdr_maxsz + \
				encode_sequence_maxsz + \
				encode_putfh_maxsz + \
@@ -1412,7 +1412,7 @@ static inline void encode_openhdr(struct xdr_stream *xdr, const struct nfs_openargs *arg)
 	__be32 *p;
 /*
 * opcode 4, seqid 4, share_access 4, share_deny 4, clientid 8, ownerlen 4,
 * owner 4 = 32
+ * owner 28
 */
 	encode_nfs4_seqid(xdr, arg->seqid);
 	encode_share_access(xdr, arg->share_access);
@@ -5077,7 +5077,7 @@ static int decode_link(struct xdr_stream *xdr, struct nfs4_change_info *cinfo)
 /*
 * We create the owner, so we know a proper owner.id length is 4.
 */
-static int decode_lock_denied (struct xdr_stream *xdr, struct file_lock *fl)
+static int decode_lock_denied(struct xdr_stream *xdr, struct file_lock *fl)
 {
 	uint64_t offset, length, clientid;
 	__be32 *p;
@@ -7702,6 +7702,7 @@ const struct rpc_procinfo nfs4_procedures[] = {
 	PROC42(CLONE,		enc_clone,		dec_clone),
 	PROC42(COPY,		enc_copy,		dec_copy),
 	PROC42(OFFLOAD_CANCEL,	enc_offload_cancel,	dec_offload_cancel),
+	PROC42(OFFLOAD_STATUS,	enc_offload_status,	dec_offload_status),
 	PROC42(COPY_NOTIFY,	enc_copy_notify,	dec_copy_notify),
 	PROC(LOOKUPP,		enc_lookupp,		dec_lookupp),
 	PROC42(LAYOUTERROR,	enc_layouterror,	dec_layouterror),
+ */ + if ((task->tk_rpc_status == -ENETDOWN || + task->tk_rpc_status == -ENETUNREACH) && + task->tk_flags & RPC_TASK_NETUNREACH_FATAL) { + *ret = 0; + (*respp)->lrs_present = 0; + retval = -EIO; + break; + } /* If the call was not sent, let caller handle it */ if (!RPC_WAS_SENT(task)) return 0; @@ -1695,6 +1711,7 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args, struct inode *inode = args->inode; const nfs4_stateid *res_stateid = NULL; struct nfs4_xdr_opaque_data *ld_private = args->ld_private; + LIST_HEAD(freeme); switch (ret) { case -NFS4ERR_BADSESSION: @@ -1703,9 +1720,9 @@ void pnfs_roc_release(struct nfs4_layoutreturn_args *args, case -NFS4ERR_NOMATCHING_LAYOUT: spin_lock(&inode->i_lock); pnfs_layoutreturn_retry_later_locked(lo, &args->stateid, - &args->range); - pnfs_clear_layoutreturn_waitbit(lo); + &args->range, &freeme); spin_unlock(&inode->i_lock); + pnfs_free_lseg_list(&freeme); break; case 0: if (res->lrs_present) diff --git a/fs/nfs/pnfs.h b/fs/nfs/pnfs.h index 30d2613e912b..91ff877185c8 100644 --- a/fs/nfs/pnfs.h +++ b/fs/nfs/pnfs.h @@ -60,6 +60,7 @@ struct nfs4_pnfs_ds { struct list_head ds_node; /* nfs4_pnfs_dev_hlist dev_dslist */ char *ds_remotestr; /* comma sep list of addrs */ struct list_head ds_addrs; + const struct net *ds_net; struct nfs_client *ds_clp; refcount_t ds_count; unsigned long ds_state; @@ -415,7 +416,8 @@ int pnfs_generic_commit_pagelist(struct inode *inode, int pnfs_generic_scan_commit_lists(struct nfs_commit_info *cinfo, int max); void pnfs_generic_write_commit_done(struct rpc_task *task, void *data); void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds); -struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(struct list_head *dsaddrs, +struct nfs4_pnfs_ds *nfs4_pnfs_ds_add(const struct net *net, + struct list_head *dsaddrs, gfp_t gfp_flags); void nfs4_pnfs_v3_ds_connect_unload(void); int nfs4_pnfs_ds_connect(struct nfs_server *mds_srv, struct nfs4_pnfs_ds *ds, diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index dbef837e871a..91ef486f40b9 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -16,6 +16,7 @@ #include "nfs4session.h" #include "internal.h" #include "pnfs.h" +#include "netns.h" #define NFSDBG_FACILITY NFSDBG_PNFS @@ -504,14 +505,14 @@ EXPORT_SYMBOL_GPL(pnfs_generic_commit_pagelist); /* * Data server cache * - * Data servers can be mapped to different device ids. - * nfs4_pnfs_ds reference counting + * Data servers can be mapped to different device ids, but should + * never be shared between net namespaces. + * + * nfs4_pnfs_ds reference counting: * - set to 1 on allocation * - incremented when a device id maps a data server already in the cache. * - decremented when deviceid is removed from the cache. */ -static DEFINE_SPINLOCK(nfs4_ds_cache_lock); -static LIST_HEAD(nfs4_data_server_cache); /* Debug routines */ static void @@ -604,11 +605,11 @@ _same_data_server_addrs_locked(const struct list_head *dsaddrs1, * Lookup DS by addresses. 
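Note on the data-server cache conversion in the pnfs_nfs.c hunks around this point: the file-scope nfs4_data_server_cache list and its spinlock become per-network-namespace fields in struct nfs_net. A minimal sketch of the assumed netns side follows; the field names match their uses in this diff, but netns.h and the init site are not shown here, so their exact placement (for example in nfs_net_init()) is an assumption:

	/* fs/nfs/netns.h (sketch) */
	struct nfs_net {
		/* ... existing fields ... */
		struct list_head nfs4_data_server_cache;
		spinlock_t nfs4_data_server_lock;
	};

	/* assumed init site, e.g. nfs_net_init() */
	INIT_LIST_HEAD(&nn->nfs4_data_server_cache);
	spin_lock_init(&nn->nfs4_data_server_lock);

This keeps nfs4_pnfs_ds objects from ever being shared across net namespaces, as the updated comment in pnfs_nfs.c states.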
nfs4_ds_cache_lock is held */ static struct nfs4_pnfs_ds * -_data_server_lookup_locked(const struct list_head *dsaddrs) +_data_server_lookup_locked(const struct nfs_net *nn, const struct list_head *dsaddrs) { struct nfs4_pnfs_ds *ds; - list_for_each_entry(ds, &nfs4_data_server_cache, ds_node) + list_for_each_entry(ds, &nn->nfs4_data_server_cache, ds_node) if (_same_data_server_addrs_locked(&ds->ds_addrs, dsaddrs)) return ds; return NULL; @@ -653,10 +654,11 @@ static void destroy_ds(struct nfs4_pnfs_ds *ds) void nfs4_pnfs_ds_put(struct nfs4_pnfs_ds *ds) { - if (refcount_dec_and_lock(&ds->ds_count, - &nfs4_ds_cache_lock)) { + struct nfs_net *nn = net_generic(ds->ds_net, nfs_net_id); + + if (refcount_dec_and_lock(&ds->ds_count, &nn->nfs4_data_server_lock)) { list_del_init(&ds->ds_node); - spin_unlock(&nfs4_ds_cache_lock); + spin_unlock(&nn->nfs4_data_server_lock); destroy_ds(ds); } } @@ -716,8 +718,9 @@ out_err: * uncached and return cached struct nfs4_pnfs_ds. */ struct nfs4_pnfs_ds * -nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) +nfs4_pnfs_ds_add(const struct net *net, struct list_head *dsaddrs, gfp_t gfp_flags) { + struct nfs_net *nn = net_generic(net, nfs_net_id); struct nfs4_pnfs_ds *tmp_ds, *ds = NULL; char *remotestr; @@ -733,16 +736,17 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) /* this is only used for debugging, so it's ok if its NULL */ remotestr = nfs4_pnfs_remotestr(dsaddrs, gfp_flags); - spin_lock(&nfs4_ds_cache_lock); - tmp_ds = _data_server_lookup_locked(dsaddrs); + spin_lock(&nn->nfs4_data_server_lock); + tmp_ds = _data_server_lookup_locked(nn, dsaddrs); if (tmp_ds == NULL) { INIT_LIST_HEAD(&ds->ds_addrs); list_splice_init(dsaddrs, &ds->ds_addrs); ds->ds_remotestr = remotestr; refcount_set(&ds->ds_count, 1); INIT_LIST_HEAD(&ds->ds_node); + ds->ds_net = net; ds->ds_clp = NULL; - list_add(&ds->ds_node, &nfs4_data_server_cache); + list_add(&ds->ds_node, &nn->nfs4_data_server_cache); dprintk("%s add new data server %s\n", __func__, ds->ds_remotestr); } else { @@ -754,7 +758,7 @@ nfs4_pnfs_ds_add(struct list_head *dsaddrs, gfp_t gfp_flags) refcount_read(&tmp_ds->ds_count)); ds = tmp_ds; } - spin_unlock(&nfs4_ds_cache_lock); + spin_unlock(&nn->nfs4_data_server_lock); out: return ds; } diff --git a/fs/nfs/proc.c b/fs/nfs/proc.c index 77920a2e3cef..63e71310b9f6 100644 --- a/fs/nfs/proc.c +++ b/fs/nfs/proc.c @@ -446,13 +446,14 @@ out: return status; } -static int +static struct dentry * nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) { struct nfs_createdata *data; struct rpc_message msg = { .rpc_proc = &nfs_procedures[NFSPROC_MKDIR], }; + struct dentry *alias = NULL; int status = -ENOMEM; dprintk("NFS call mkdir %pd\n", dentry); @@ -464,12 +465,15 @@ nfs_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr) status = rpc_call_sync(NFS_CLIENT(dir), &msg, 0); nfs_mark_for_revalidate(dir); - if (status == 0) - status = nfs_instantiate(dentry, data->res.fh, data->res.fattr); + if (status == 0) { + alias = nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr); + status = PTR_ERR_OR_ZERO(alias); + } else + alias = ERR_PTR(status); nfs_free_createdata(data); out: dprintk("NFS reply mkdir: %d\n", status); - return status; + return alias; } static int diff --git a/fs/nfs/super.c b/fs/nfs/super.c index aeb715b4a690..9eea9e62afc9 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -454,8 +454,12 @@ static void nfs_show_mount_options(struct seq_file *m, struct nfs_server *nfss, { NFS_MOUNT_NONLM, ",nolock", "" 
}, { NFS_MOUNT_NOACL, ",noacl", "" }, { NFS_MOUNT_NORDIRPLUS, ",nordirplus", "" }, + { NFS_MOUNT_FORCE_RDIRPLUS, ",rdirplus=force", "" }, { NFS_MOUNT_UNSHARED, ",nosharecache", "" }, { NFS_MOUNT_NORESVPORT, ",noresvport", "" }, + { NFS_MOUNT_NETUNREACH_FATAL, + ",fatal_neterrors=ENETDOWN:ENETUNREACH", + ",fatal_neterrors=none" }, { 0, NULL, NULL } }; const struct proc_nfs_info *nfs_infop; diff --git a/fs/nfs/symlink.c b/fs/nfs/symlink.c index 1c62a5a9f51d..58146e935402 100644 --- a/fs/nfs/symlink.c +++ b/fs/nfs/symlink.c @@ -40,31 +40,31 @@ static const char *nfs_get_link(struct dentry *dentry, struct inode *inode, struct delayed_call *done) { - struct page *page; + struct folio *folio; void *err; if (!dentry) { err = ERR_PTR(nfs_revalidate_mapping_rcu(inode)); if (err) return err; - page = find_get_page(inode->i_mapping, 0); - if (!page) + folio = filemap_get_folio(inode->i_mapping, 0); + if (IS_ERR(folio)) return ERR_PTR(-ECHILD); - if (!PageUptodate(page)) { - put_page(page); + if (!folio_test_uptodate(folio)) { + folio_put(folio); return ERR_PTR(-ECHILD); } } else { err = ERR_PTR(nfs_revalidate_mapping(inode, inode->i_mapping)); if (err) return err; - page = read_cache_page(&inode->i_data, 0, nfs_symlink_filler, + folio = read_cache_folio(&inode->i_data, 0, nfs_symlink_filler, NULL); - if (IS_ERR(page)) - return ERR_CAST(page); + if (IS_ERR(folio)) + return ERR_CAST(folio); } - set_delayed_call(done, page_put_link, page); - return page_address(page); + set_delayed_call(done, page_put_link, folio); + return folio_address(folio); } /* diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index 7b59a40d40c0..37cb2b776435 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -14,6 +14,7 @@ #include <linux/rcupdate.h> #include <linux/lockd/lockd.h> +#include "internal.h" #include "nfs4_fs.h" #include "netns.h" #include "sysfs.h" @@ -228,6 +229,25 @@ static void shutdown_client(struct rpc_clnt *clnt) rpc_cancel_tasks(clnt, -EIO, shutdown_match_client, NULL); } +/* + * Shut down the nfs_client only once all the superblocks + * have been shut down. 
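For context on the new rdirplus=force and fatal_neterrors entries above: each row of the proc_nfs_info table carries one string to print when the flag is set and another for when it is clear. The table is consumed by a loop of roughly this shape (abridged from nfs_show_mount_options(), which this diff does not otherwise modify):

	for (nfs_infop = nfs_info; nfs_infop->flag; nfs_infop++) {
		if (nfss->flags & nfs_infop->flag)
			seq_puts(m, nfs_infop->str);
		else
			seq_puts(m, nfs_infop->nostr);
	}

So a mount with NFS_MOUNT_NETUNREACH_FATAL set shows fatal_neterrors=ENETDOWN:ENETUNREACH in /proc/mounts, and one without it shows fatal_neterrors=none.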
+ */ +static void shutdown_nfs_client(struct nfs_client *clp) +{ + struct nfs_server *server; + rcu_read_lock(); + list_for_each_entry_rcu(server, &clp->cl_superblocks, client_link) { + if (!(server->flags & NFS_MOUNT_SHUTDOWN)) { + rcu_read_unlock(); + return; + } + } + rcu_read_unlock(); + nfs_mark_client_ready(clp, -EIO); + shutdown_client(clp->cl_rpcclient); +} + static ssize_t shutdown_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) @@ -259,7 +279,6 @@ shutdown_store(struct kobject *kobj, struct kobj_attribute *attr, server->flags |= NFS_MOUNT_SHUTDOWN; shutdown_client(server->client); - shutdown_client(server->nfs_client->cl_rpcclient); if (!IS_ERR(server->client_acl)) shutdown_client(server->client_acl); @@ -267,11 +286,44 @@ shutdown_store(struct kobject *kobj, struct kobj_attribute *attr, if (server->nlm_host) shutdown_client(server->nlm_host->h_rpcclnt); out: + shutdown_nfs_client(server->nfs_client); return count; } static struct kobj_attribute nfs_sysfs_attr_shutdown = __ATTR_RW(shutdown); +#if IS_ENABLED(CONFIG_NFS_V4_1) +static ssize_t +implid_domain_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct nfs_server *server = container_of(kobj, struct nfs_server, kobj); + struct nfs41_impl_id *impl_id = server->nfs_client->cl_implid; + + if (!impl_id || strlen(impl_id->domain) == 0) + return 0; //sysfs_emit(buf, ""); + return sysfs_emit(buf, "%s\n", impl_id->domain); +} + +static struct kobj_attribute nfs_sysfs_attr_implid_domain = __ATTR_RO(implid_domain); + + +static ssize_t +implid_name_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct nfs_server *server = container_of(kobj, struct nfs_server, kobj); + struct nfs41_impl_id *impl_id = server->nfs_client->cl_implid; + + if (!impl_id || strlen(impl_id->name) == 0) + return 0; //sysfs_emit(buf, ""); + return sysfs_emit(buf, "%s\n", impl_id->name); +} + +static struct kobj_attribute nfs_sysfs_attr_implid_name = __ATTR_RO(implid_name); + +#endif /* IS_ENABLED(CONFIG_NFS_V4_1) */ + #define RPC_CLIENT_NAME_SIZE 64 void nfs_sysfs_link_rpc_client(struct nfs_server *server, @@ -309,6 +361,32 @@ static struct kobj_type nfs_sb_ktype = { .child_ns_type = nfs_netns_object_child_ns_type, }; +#if IS_ENABLED(CONFIG_NFS_V4_1) +static void nfs_sysfs_add_nfsv41_server(struct nfs_server *server) +{ + int ret; + + if (!server->nfs_client->cl_implid) + return; + + ret = sysfs_create_file_ns(&server->kobj, &nfs_sysfs_attr_implid_domain.attr, + nfs_netns_server_namespace(&server->kobj)); + if (ret < 0) + pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n", + server->s_sysfs_id, ret); + + ret = sysfs_create_file_ns(&server->kobj, &nfs_sysfs_attr_implid_name.attr, + nfs_netns_server_namespace(&server->kobj)); + if (ret < 0) + pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n", + server->s_sysfs_id, ret); +} +#else /* CONFIG_NFS_V4_1 */ +static inline void nfs_sysfs_add_nfsv41_server(struct nfs_server *server) +{ +} +#endif /* CONFIG_NFS_V4_1 */ + void nfs_sysfs_add_server(struct nfs_server *server) { int ret; @@ -325,6 +403,8 @@ void nfs_sysfs_add_server(struct nfs_server *server) if (ret < 0) pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n", server->s_sysfs_id, ret); + + nfs_sysfs_add_nfsv41_server(server); } EXPORT_SYMBOL_GPL(nfs_sysfs_add_server); diff --git a/fs/nfs/unlink.c b/fs/nfs/unlink.c index bf77399696a7..b55467911648 100644 --- a/fs/nfs/unlink.c +++ b/fs/nfs/unlink.c @@ -464,18 +464,17 @@ nfs_sillyrename(struct inode *dir, 
struct dentry *dentry) sdentry = NULL; do { - int slen; dput(sdentry); sillycounter++; - slen = scnprintf(silly, sizeof(silly), - SILLYNAME_PREFIX "%0*llx%0*x", - SILLYNAME_FILEID_LEN, fileid, - SILLYNAME_COUNTER_LEN, sillycounter); + scnprintf(silly, sizeof(silly), + SILLYNAME_PREFIX "%0*llx%0*x", + SILLYNAME_FILEID_LEN, fileid, + SILLYNAME_COUNTER_LEN, sillycounter); dfprintk(VFS, "NFS: trying to rename %pd to %s\n", dentry, silly); - sdentry = lookup_one_len(silly, dentry->d_parent, slen); + sdentry = lookup_noperm(&QSTR(silly), dentry->d_parent); /* * N.B. Better to return EBUSY here ... it could be * dangerous to delete the file while it's in use. diff --git a/fs/nfs/write.c b/fs/nfs/write.c index aa3d8bea3ec0..23df8b214474 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -579,8 +579,10 @@ retry: while (!nfs_lock_request(head)) { ret = nfs_wait_on_request(head); - if (ret < 0) + if (ret < 0) { + nfs_release_request(head); return ERR_PTR(ret); + } } /* Ensure that nobody removed the request before we locked it */ diff --git a/fs/nfs_common/nfsacl.c b/fs/nfs_common/nfsacl.c index ea382b75b26c..e2eaac14fd8e 100644 --- a/fs/nfs_common/nfsacl.c +++ b/fs/nfs_common/nfsacl.c @@ -42,7 +42,7 @@ struct nfsacl_encode_desc { }; struct nfsacl_simple_acl { - struct posix_acl acl; + struct posix_acl_hdr acl; struct posix_acl_entry ace[4]; }; @@ -112,7 +112,8 @@ int nfsacl_encode(struct xdr_buf *buf, unsigned int base, struct inode *inode, xdr_encode_word(buf, base, entries)) return -EINVAL; if (encode_entries && acl && acl->a_count == 3) { - struct posix_acl *acl2 = &aclbuf.acl; + struct posix_acl *acl2 = + container_of(&aclbuf.acl, struct posix_acl, hdr); /* Avoid the use of posix_acl_alloc(). nfsacl_encode() is * invoked in contexts where a memory allocation failure is @@ -177,7 +178,8 @@ bool nfs_stream_encode_acl(struct xdr_stream *xdr, struct inode *inode, return false; if (encode_entries && acl && acl->a_count == 3) { - struct posix_acl *acl2 = &aclbuf.acl; + struct posix_acl *acl2 = + container_of(&aclbuf.acl, struct posix_acl, hdr); /* Avoid the use of posix_acl_alloc(). nfsacl_encode() is * invoked in contexts where a memory allocation failure is diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index c0bd1509ccd4..879e0b104d1c 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -4,6 +4,7 @@ config NFSD depends on INET depends on FILE_LOCKING depends on FSNOTIFY + select CRC32 select LOCKD select SUNRPC select EXPORTFS @@ -76,8 +77,8 @@ config NFSD_V4 select FS_POSIX_ACL select RPCSEC_GSS_KRB5 select CRYPTO + select CRYPTO_LIB_SHA256 select CRYPTO_MD5 - select CRYPTO_SHA256 select GRACE_PERIOD select NFS_V4_2_SSC_HELPER if NFS_V4_2 help @@ -172,6 +173,16 @@ config NFSD_LEGACY_CLIENT_TRACKING recoverydir, or spawn a process directly using a usermodehelper upcall. - These legacy client tracking methods have proven to be probelmatic + These legacy client tracking methods have proven to be problematic and will be removed in the future. Say Y here if you need support for them in the interim. + +config NFSD_V4_DELEG_TIMESTAMPS + bool "Support delegated timestamps" + depends on NFSD_V4 + default n + help + NFSD implements delegated timestamps according to + draft-ietf-nfsv4-delstid-08 "Extending the Opening of Files". This + is currently an experimental feature and is therefore left disabled + by default. 
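On the nfsacl.c change above: struct nfsacl_simple_acl now embeds a struct posix_acl_hdr rather than a full struct posix_acl, and the encoders recover the posix_acl pointer with container_of(). A sketch of the relationship this relies on; the real layout lives in include/linux/posix_acl.h, which is not part of this diff, so the member list here is illustrative:

	/* sketch of the assumed split in include/linux/posix_acl.h */
	struct posix_acl_hdr {
		refcount_t		a_refcount;
		unsigned int		a_count;
		struct rcu_head		a_rcu;
	};

	struct posix_acl {
		struct posix_acl_hdr	hdr;	/* hence container_of(..., hdr) */
		struct posix_acl_entry	a_entries[];
	};

Embedding only the header lets the on-stack buffer place its fixed ace[4] array immediately after it, standing in for the flexible a_entries[] tail without overlapping a flexible array member inside another struct.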
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 2f687619f65b..55744bb786c9 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -24,6 +24,7 @@ nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o nfsd-$(CONFIG_NFS_LOCALIO) += localio.o +nfsd-$(CONFIG_DEBUG_FS) += debugfs.o .PHONY: xdrgen diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c new file mode 100644 index 000000000000..84b0c8b559dc --- /dev/null +++ b/fs/nfsd/debugfs.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/debugfs.h> + +#include "nfsd.h" + +static struct dentry *nfsd_top_dir __read_mostly; + +/* + * /sys/kernel/debug/nfsd/disable-splice-read + * + * Contents: + * %0: NFS READ is allowed to use page splicing + * %1: NFS READ uses only iov iter read + * + * The default value of this setting is zero (page splicing is + * allowed). This setting takes immediate effect for all NFS + * versions, all exports, and in all NFSD net namespaces. + */ + +static int nfsd_dsr_get(void *data, u64 *val) +{ + *val = nfsd_disable_splice_read ? 1 : 0; + return 0; +} + +static int nfsd_dsr_set(void *data, u64 val) +{ + nfsd_disable_splice_read = (val > 0) ? true : false; + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n"); + +void nfsd_debugfs_exit(void) +{ + debugfs_remove_recursive(nfsd_top_dir); + nfsd_top_dir = NULL; +} + +void nfsd_debugfs_init(void) +{ + nfsd_top_dir = debugfs_create_dir("nfsd", NULL); + + debugfs_create_file("disable-splice-read", S_IWUSR | S_IRUGO, + nfsd_top_dir, NULL, &nfsd_dsr_fops); +} diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 0363720280d4..88ae410b4113 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1124,7 +1124,8 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp, test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) goto ok; } - goto denied; + if (!may_bypass_gss) + goto denied; ok: /* legacy gss-only clients are always OK: */ diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index fb9b1656a287..ab85e6a2454f 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -319,15 +319,14 @@ nfsd_file_check_writeback(struct nfsd_file *nf) mapping_tagged(mapping, PAGECACHE_TAG_WRITEBACK); } - -static bool nfsd_file_lru_add(struct nfsd_file *nf) +static void nfsd_file_lru_add(struct nfsd_file *nf) { - set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); - if (list_lru_add_obj(&nfsd_file_lru, &nf->nf_lru)) { + refcount_inc(&nf->nf_ref); + if (list_lru_add_obj(&nfsd_file_lru, &nf->nf_lru)) trace_nfsd_file_lru_add(nf); - return true; - } - return false; + else + WARN_ON(1); + nfsd_file_schedule_laundrette(); } static bool nfsd_file_lru_remove(struct nfsd_file *nf) @@ -363,30 +362,10 @@ nfsd_file_put(struct nfsd_file *nf) if (test_bit(NFSD_FILE_GC, &nf->nf_flags) && test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { - /* - * If this is the last reference (nf_ref == 1), then try to - * transfer it to the LRU. - */ - if (refcount_dec_not_one(&nf->nf_ref)) - return; - - /* Try to add it to the LRU. If that fails, decrement. */ - if (nfsd_file_lru_add(nf)) { - /* If it's still hashed, we're done */ - if (test_bit(NFSD_FILE_HASHED, &nf->nf_flags)) { - nfsd_file_schedule_laundrette(); - return; - } - - /* - * We're racing with unhashing, so try to remove it from - * the LRU. If removal fails, then someone else already - * has our reference. 
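The new fs/nfsd/debugfs.c above wires the global nfsd_disable_splice_read flag to /sys/kernel/debug/nfsd/disable-splice-read; writing 1 to that file forces every NFS READ onto the iov-iter path. The consumer of the flag is not part of this section; presumably the READ path checks it along these lines (a sketch only, with the call site in fs/nfsd/vfs.c assumed):

	/* sketch: assumed gate in the nfsd READ path */
	if (nfsd_disable_splice_read ||
	    !test_bit(RQ_SPLICE_OK, &rqstp->rq_flags))
		return nfsd_iter_read(rqstp, fhp, file, offset, count, 0, eof);
	return nfsd_splice_read(rqstp, fhp, file, offset, count, eof);

Because the flag is a bare bool toggled through debugfs, it takes effect immediately for all NFS versions, all exports, and all NFSD net namespaces, as the comment in debugfs.c notes.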
- */ - if (!nfsd_file_lru_remove(nf)) - return; - } + set_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); + set_bit(NFSD_FILE_RECENT, &nf->nf_flags); } + if (refcount_dec_and_test(&nf->nf_ref)) nfsd_file_free(nf); } @@ -530,13 +509,12 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru, } /* - * Put the reference held on behalf of the LRU. If it wasn't the last - * one, then just remove it from the LRU and ignore it. + * Put the reference held on behalf of the LRU if it is the last + * reference, else rotate. */ - if (!refcount_dec_and_test(&nf->nf_ref)) { + if (!refcount_dec_if_one(&nf->nf_ref)) { trace_nfsd_file_gc_in_use(nf); - list_lru_isolate(lru, &nf->nf_lru); - return LRU_REMOVED; + return LRU_ROTATE; } /* Refcount went to zero. Unhash it and queue it to the dispose list */ @@ -548,14 +526,54 @@ nfsd_file_lru_cb(struct list_head *item, struct list_lru_one *lru, return LRU_REMOVED; } +static enum lru_status +nfsd_file_gc_cb(struct list_head *item, struct list_lru_one *lru, + void *arg) +{ + struct nfsd_file *nf = list_entry(item, struct nfsd_file, nf_lru); + + if (test_and_clear_bit(NFSD_FILE_RECENT, &nf->nf_flags)) { + /* + * "REFERENCED" really means "should be at the end of the + * LRU". As we are putting it there we can clear the flag. + */ + clear_bit(NFSD_FILE_REFERENCED, &nf->nf_flags); + trace_nfsd_file_gc_aged(nf); + return LRU_ROTATE; + } + return nfsd_file_lru_cb(item, lru, arg); +} + +/* If the shrinker runs between calls to list_lru_walk_node() in + * nfsd_file_gc(), the "remaining" count will be wrong. This could + * result in premature freeing of some files. This may not matter much + * but is easy to fix with this spinlock which temporarily disables + * the shrinker. + */ +static DEFINE_SPINLOCK(nfsd_gc_lock); static void nfsd_file_gc(void) { + unsigned long ret = 0; LIST_HEAD(dispose); - unsigned long ret; + int nid; + + spin_lock(&nfsd_gc_lock); + for_each_node_state(nid, N_NORMAL_MEMORY) { + unsigned long remaining = list_lru_count_node(&nfsd_file_lru, nid); + + while (remaining > 0) { + unsigned long nr = min(remaining, NFSD_FILE_GC_BATCH); - ret = list_lru_walk(&nfsd_file_lru, nfsd_file_lru_cb, - &dispose, list_lru_count(&nfsd_file_lru)); + remaining -= nr; + ret += list_lru_walk_node(&nfsd_file_lru, nid, nfsd_file_gc_cb, + &dispose, &nr); + if (nr) + /* walk aborted early */ + remaining = 0; + } + } + spin_unlock(&nfsd_gc_lock); trace_nfsd_file_gc_removed(ret, list_lru_count(&nfsd_file_lru)); nfsd_file_dispose_list_delayed(&dispose); } @@ -563,9 +581,9 @@ nfsd_file_gc(void) static void nfsd_file_gc_worker(struct work_struct *work) { - nfsd_file_gc(); if (list_lru_count(&nfsd_file_lru)) - nfsd_file_schedule_laundrette(); + nfsd_file_gc(); + nfsd_file_schedule_laundrette(); } static unsigned long @@ -580,8 +598,12 @@ nfsd_file_lru_scan(struct shrinker *s, struct shrink_control *sc) LIST_HEAD(dispose); unsigned long ret; + if (!spin_trylock(&nfsd_gc_lock)) + return SHRINK_STOP; + ret = list_lru_shrink_walk(&nfsd_file_lru, sc, nfsd_file_lru_cb, &dispose); + spin_unlock(&nfsd_gc_lock); trace_nfsd_file_shrinker_removed(ret, list_lru_count(&nfsd_file_lru)); nfsd_file_dispose_list_delayed(&dispose); return ret; @@ -686,17 +708,12 @@ nfsd_file_close_inode(struct inode *inode) void nfsd_file_close_inode_sync(struct inode *inode) { - struct nfsd_file *nf; LIST_HEAD(dispose); trace_nfsd_file_close(inode); nfsd_file_queue_for_close(inode, &dispose); - while (!list_empty(&dispose)) { - nf = list_first_entry(&dispose, struct nfsd_file, nf_gc); -
list_del_init(&nf->nf_gc); - nfsd_file_free(nf); - } + nfsd_file_dispose_list(&dispose); } static int @@ -1058,16 +1075,8 @@ retry: nf = nfsd_file_lookup_locked(net, current_cred(), inode, need, want_gc); rcu_read_unlock(); - if (nf) { - /* - * If the nf is on the LRU then it holds an extra reference - * that must be put if it's removed. It had better not be - * the last one however, since we should hold another. - */ - if (nfsd_file_lru_remove(nf)) - refcount_dec(&nf->nf_ref); + if (nf) goto wait_for_construction; - } new = nfsd_file_alloc(net, inode, need, want_gc); if (!new) { @@ -1161,6 +1170,9 @@ open_file: */ if (status != nfs_ok || inode->i_nlink == 0) nfsd_file_unhash(nf); + else if (want_gc) + nfsd_file_lru_add(nf); + clear_and_wake_up_bit(NFSD_FILE_PENDING, &nf->nf_flags); if (status == nfs_ok) goto out; diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index d5db6b34ba30..5865f9c72712 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -4,6 +4,12 @@ #include <linux/fsnotify_backend.h> /* + * Limit the time that the list_lru_one lock is held during + * an LRU scan. + */ +#define NFSD_FILE_GC_BATCH (16UL) + +/* * This is the fsnotify_mark container that nfsd attaches to the files that it * is holding open. Note that we have a separate refcount here aside from the * one in the fsnotify_mark. We only want a single fsnotify_mark attached to @@ -38,6 +44,7 @@ struct nfsd_file { #define NFSD_FILE_PENDING (1) #define NFSD_FILE_REFERENCED (2) #define NFSD_FILE_GC (3) +#define NFSD_FILE_RECENT (4) unsigned long nf_flags; refcount_t nf_ref; unsigned char nf_may; diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index 372bdcf5e07a..a817d8485d21 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -14,6 +14,7 @@ #include "xdr3.h" #include "vfs.h" #include "filecache.h" +#include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -69,8 +70,7 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp) struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd3_attrstat *resp = rqstp->rq_resp; - dprintk("nfsd: GETATTR(3) %s\n", - SVCFH_fmt(&argp->fh)); + trace_nfsd_vfs_getattr(rqstp, &argp->fh); fh_copy(&resp->fh, &argp->fh); resp->status = fh_verify(rqstp, &resp->fh, 0, @@ -220,7 +220,6 @@ nfsd3_proc_write(struct svc_rqst *rqstp) struct nfsd3_writeargs *argp = rqstp->rq_argp; struct nfsd3_writeres *resp = rqstp->rq_resp; unsigned long cnt = argp->len; - unsigned int nvecs; dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n", SVCFH_fmt(&argp->fh), @@ -235,10 +234,8 @@ nfsd3_proc_write(struct svc_rqst *rqstp) fh_copy(&resp->fh, &argp->fh); resp->committed = argp->stable; - nvecs = svc_fill_write_vector(rqstp, &argp->payload); - resp->status = nfsd_write(rqstp, &resp->fh, argp->offset, - rqstp->rq_vec, nvecs, &cnt, + &argp->payload, &cnt, resp->committed, resp->verf); resp->count = cnt; resp->status = nfsd3_map_status(resp->status); @@ -266,6 +263,8 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 status; int host_err; + trace_nfsd_vfs_create(rqstp, fhp, S_IFREG, argp->name, argp->len); + if (isdotent(argp->name, argp->len)) return nfserr_exist; if (!(iap->ia_valid & ATTR_MODE)) @@ -284,7 +283,9 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, inode_lock_nested(inode, I_MUTEX_PARENT); - child = lookup_one_len(argp->name, parent, argp->len); + child = lookup_one(&nop_mnt_idmap, + &QSTR_LEN(argp->name, argp->len), + parent); if (IS_ERR(child)) { status = nfserrno(PTR_ERR(child)); goto out; @@ -380,11 +381,6 @@ nfsd3_proc_create(struct svc_rqst *rqstp) 
struct nfsd3_diropres *resp = rqstp->rq_resp; svc_fh *dirfhp, *newfhp; - dprintk("nfsd: CREATE(3) %s %.*s\n", - SVCFH_fmt(&argp->fh), - argp->len, - argp->name); - dirfhp = fh_copy(&resp->dirfh, &argp->fh); newfhp = fh_init(&resp->fh, NFS3_FHSIZE); @@ -405,11 +401,6 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp) .na_iattr = &argp->attrs, }; - dprintk("nfsd: MKDIR(3) %s %.*s\n", - SVCFH_fmt(&argp->fh), - argp->len, - argp->name); - argp->attrs.ia_valid &= ~ATTR_SIZE; fh_copy(&resp->dirfh, &argp->fh); fh_init(&resp->fh, NFS3_FHSIZE); @@ -445,11 +436,6 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp) goto out; } - dprintk("nfsd: SYMLINK(3) %s %.*s -> %.*s\n", - SVCFH_fmt(&argp->ffh), - argp->flen, argp->fname, - argp->tlen, argp->tname); - fh_copy(&resp->dirfh, &argp->ffh); fh_init(&resp->fh, NFS3_FHSIZE); resp->status = nfsd_symlink(rqstp, &resp->dirfh, argp->fname, @@ -474,11 +460,6 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp) int type; dev_t rdev = 0; - dprintk("nfsd: MKNOD(3) %s %.*s\n", - SVCFH_fmt(&argp->fh), - argp->len, - argp->name); - fh_copy(&resp->dirfh, &argp->fh); fh_init(&resp->fh, NFS3_FHSIZE); @@ -511,11 +492,6 @@ nfsd3_proc_remove(struct svc_rqst *rqstp) struct nfsd3_diropargs *argp = rqstp->rq_argp; struct nfsd3_attrstat *resp = rqstp->rq_resp; - dprintk("nfsd: REMOVE(3) %s %.*s\n", - SVCFH_fmt(&argp->fh), - argp->len, - argp->name); - /* Unlink. -S_IFDIR means file must not be a directory */ fh_copy(&resp->fh, &argp->fh); resp->status = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, @@ -533,11 +509,6 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp) struct nfsd3_diropargs *argp = rqstp->rq_argp; struct nfsd3_attrstat *resp = rqstp->rq_resp; - dprintk("nfsd: RMDIR(3) %s %.*s\n", - SVCFH_fmt(&argp->fh), - argp->len, - argp->name); - fh_copy(&resp->fh, &argp->fh); resp->status = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, argp->name, argp->len); @@ -551,15 +522,6 @@ nfsd3_proc_rename(struct svc_rqst *rqstp) struct nfsd3_renameargs *argp = rqstp->rq_argp; struct nfsd3_renameres *resp = rqstp->rq_resp; - dprintk("nfsd: RENAME(3) %s %.*s ->\n", - SVCFH_fmt(&argp->ffh), - argp->flen, - argp->fname); - dprintk("nfsd: -> %s %.*s\n", - SVCFH_fmt(&argp->tfh), - argp->tlen, - argp->tname); - fh_copy(&resp->ffh, &argp->ffh); fh_copy(&resp->tfh, &argp->tfh); resp->status = nfsd_rename(rqstp, &resp->ffh, argp->fname, argp->flen, @@ -574,13 +536,6 @@ nfsd3_proc_link(struct svc_rqst *rqstp) struct nfsd3_linkargs *argp = rqstp->rq_argp; struct nfsd3_linkres *resp = rqstp->rq_resp; - dprintk("nfsd: LINK(3) %s ->\n", - SVCFH_fmt(&argp->ffh)); - dprintk("nfsd: -> %s %.*s\n", - SVCFH_fmt(&argp->tfh), - argp->tlen, - argp->tname); - fh_copy(&resp->fh, &argp->ffh); fh_copy(&resp->tfh, &argp->tfh); resp->status = nfsd_link(rqstp, &resp->tfh, argp->tname, argp->tlen, @@ -619,9 +574,7 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) struct nfsd3_readdirres *resp = rqstp->rq_resp; loff_t offset; - dprintk("nfsd: READDIR(3) %s %d bytes at %d\n", - SVCFH_fmt(&argp->fh), - argp->count, (u32) argp->cookie); + trace_nfsd_vfs_readdir(rqstp, &argp->fh, argp->count, argp->cookie); nfsd3_init_dirlist_pages(rqstp, resp, argp->count); @@ -653,9 +606,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp) struct nfsd3_readdirres *resp = rqstp->rq_resp; loff_t offset; - dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n", - SVCFH_fmt(&argp->fh), - argp->count, (u32) argp->cookie); + trace_nfsd_vfs_readdir(rqstp, &argp->fh, argp->count, argp->cookie); nfsd3_init_dirlist_pages(rqstp, resp, argp->count); @@ -696,9 +647,6 @@ nfsd3_proc_fsstat(struct 
svc_rqst *rqstp) struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd3_fsstatres *resp = rqstp->rq_resp; - dprintk("nfsd: FSSTAT(3) %s\n", - SVCFH_fmt(&argp->fh)); - resp->status = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0); fh_put(&argp->fh); resp->status = nfsd3_map_status(resp->status); diff --git a/fs/nfsd/nfs3xdr.c b/fs/nfsd/nfs3xdr.c index a7a07470c1f8..ef4971d71ac4 100644 --- a/fs/nfsd/nfs3xdr.c +++ b/fs/nfsd/nfs3xdr.c @@ -1001,7 +1001,9 @@ compose_entry_fh(struct nfsd3_readdirres *cd, struct svc_fh *fhp, } else dchild = dget(dparent); } else - dchild = lookup_positive_unlocked(name, dparent, namlen); + dchild = lookup_one_positive_unlocked(&nop_mnt_idmap, + &QSTR_LEN(name, namlen), + dparent); if (IS_ERR(dchild)) return rv; if (d_mountpoint(dchild)) diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index 484077200c5d..ccb00aa93be0 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -46,8 +46,6 @@ #define NFSDDBG_FACILITY NFSDDBG_PROC -static void nfsd4_mark_cb_fault(struct nfs4_client *clp); - #define NFSPROC4_CB_NULL 0 #define NFSPROC4_CB_COMPOUND 1 @@ -101,15 +99,15 @@ static int decode_cb_fattr4(struct xdr_stream *xdr, uint32_t *bitmap, if (bitmap[0] & FATTR4_WORD0_CHANGE) if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_change) < 0) - return -NFSERR_BAD_XDR; + return -EIO; if (bitmap[0] & FATTR4_WORD0_SIZE) if (xdr_stream_decode_u64(xdr, &fattr->ncf_cb_fsize) < 0) - return -NFSERR_BAD_XDR; + return -EIO; if (bitmap[2] & FATTR4_WORD2_TIME_DELEG_ACCESS) { fattr4_time_deleg_access access; if (!xdrgen_decode_fattr4_time_deleg_access(xdr, &access)) - return -NFSERR_BAD_XDR; + return -EIO; fattr->ncf_cb_atime.tv_sec = access.seconds; fattr->ncf_cb_atime.tv_nsec = access.nseconds; @@ -118,7 +116,7 @@ static int decode_cb_fattr4(struct xdr_stream *xdr, uint32_t *bitmap, fattr4_time_deleg_modify modify; if (!xdrgen_decode_fattr4_time_deleg_modify(xdr, &modify)) - return -NFSERR_BAD_XDR; + return -EIO; fattr->ncf_cb_mtime.tv_sec = modify.seconds; fattr->ncf_cb_mtime.tv_nsec = modify.nseconds; @@ -419,6 +417,29 @@ static u32 highest_slotid(struct nfsd4_session *ses) return idx; } +static void +encode_referring_call4(struct xdr_stream *xdr, + const struct nfsd4_referring_call *rc) +{ + encode_uint32(xdr, rc->rc_sequenceid); + encode_uint32(xdr, rc->rc_slotid); +} + +static void +encode_referring_call_list4(struct xdr_stream *xdr, + const struct nfsd4_referring_call_list *rcl) +{ + struct nfsd4_referring_call *rc; + __be32 *p; + + p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN); + xdr_encode_opaque_fixed(p, rcl->rcl_sessionid.data, + NFS4_MAX_SESSIONID_LEN); + encode_uint32(xdr, rcl->__nr_referring_calls); + list_for_each_entry(rc, &rcl->rcl_referring_calls, __list) + encode_referring_call4(xdr, rc); +} + /* * CB_SEQUENCE4args * @@ -436,6 +457,7 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) { struct nfsd4_session *session = cb->cb_clp->cl_cb_session; + struct nfsd4_referring_call_list *rcl; __be32 *p; if (hdr->minorversion == 0) @@ -444,12 +466,16 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr, encode_nfs_cb_opnum4(xdr, OP_CB_SEQUENCE); encode_sessionid4(xdr, session); - p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4); + p = xdr_reserve_space(xdr, XDR_UNIT * 4); *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]); /* csa_sequenceid */ *p++ = cpu_to_be32(cb->cb_held_slot); /* csa_slotid */ *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */ *p++ = xdr_zero; /* 
csa_cachethis */ - xdr_encode_empty_array(p); /* csa_referring_call_lists */ + + /* csa_referring_call_lists */ + encode_uint32(xdr, cb->cb_nr_referring_call_list); + list_for_each_entry(rcl, &cb->cb_referring_call_list, __list) + encode_referring_call_list4(xdr, rcl); hdr->nops++; } @@ -682,15 +708,15 @@ static int nfs4_xdr_dec_cb_getattr(struct rpc_rqst *rqstp, if (unlikely(status || cb->cb_status)) return status; if (xdr_stream_decode_uint32_array(xdr, bitmap, 3) < 0) - return -NFSERR_BAD_XDR; + return -EIO; if (xdr_stream_decode_u32(xdr, &attrlen) < 0) - return -NFSERR_BAD_XDR; + return -EIO; maxlen = sizeof(ncf->ncf_cb_change) + sizeof(ncf->ncf_cb_fsize); if (bitmap[2] != 0) maxlen += (sizeof(ncf->ncf_cb_mtime.tv_sec) + sizeof(ncf->ncf_cb_mtime.tv_nsec)) * 2; if (attrlen > maxlen) - return -NFSERR_BAD_XDR; + return -EIO; status = decode_cb_fattr4(xdr, bitmap, ncf); return status; } @@ -1064,6 +1090,17 @@ static bool nfsd4_queue_cb(struct nfsd4_callback *cb) return queue_work(clp->cl_callback_wq, &cb->cb_work); } +static void nfsd4_requeue_cb(struct rpc_task *task, struct nfsd4_callback *cb) +{ + struct nfs4_client *clp = cb->cb_clp; + + if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) { + trace_nfsd_cb_restart(clp, cb); + task->tk_status = 0; + set_bit(NFSD4_CALLBACK_REQUEUE, &cb->cb_flags); + } +} + static void nfsd41_cb_inflight_begin(struct nfs4_client *clp) { atomic_inc(&clp->cl_cb_inflight); @@ -1301,15 +1338,112 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb) trace_nfsd_cb_destroy(clp, cb); nfsd41_cb_release_slot(cb); + if (test_bit(NFSD4_CALLBACK_WAKE, &cb->cb_flags)) + clear_and_wake_up_bit(NFSD4_CALLBACK_RUNNING, &cb->cb_flags); + else + clear_bit(NFSD4_CALLBACK_RUNNING, &cb->cb_flags); + if (cb->cb_ops && cb->cb_ops->release) cb->cb_ops->release(cb); nfsd41_cb_inflight_end(clp); } -/* - * TODO: cb_sequence should support referring call lists, cachethis, - * and mark callback channel down on communication errors. +/** + * nfsd41_cb_referring_call - add a referring call to a callback operation + * @cb: context of callback to add the rc to + * @sessionid: referring call's session ID + * @slotid: referring call's session slot index + * @seqno: referring call's slot sequence number + * + * Caller serializes access to @cb. + * + * NB: If memory allocation fails, the referring call is not added. 
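nfsd41_cb_referring_call() below maintains a two-level structure: one nfsd4_referring_call_list per referring session, each carrying (slot, sequence) pairs, which encode_referring_call_list4() above walks when emitting csa_referring_call_lists. The struct definitions live in fs/nfsd/state.h, outside this section; this sketch is inferred from the field uses visible here, so the types are assumptions:

	struct nfsd4_referring_call {
		struct list_head	__list;
		u32			rc_sequenceid;
		u32			rc_slotid;
	};

	struct nfsd4_referring_call_list {
		struct list_head	__list;
		struct nfs4_sessionid	rcl_sessionid;
		int			__nr_referring_calls;
		struct list_head	rcl_referring_calls;
	};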
+ */ +void nfsd41_cb_referring_call(struct nfsd4_callback *cb, + struct nfs4_sessionid *sessionid, + u32 slotid, u32 seqno) +{ + struct nfsd4_referring_call_list *rcl; + struct nfsd4_referring_call *rc; + bool found; + + might_sleep(); + + found = false; + list_for_each_entry(rcl, &cb->cb_referring_call_list, __list) { + if (!memcmp(rcl->rcl_sessionid.data, sessionid->data, + NFS4_MAX_SESSIONID_LEN)) { + found = true; + break; + } + } + if (!found) { + rcl = kmalloc(sizeof(*rcl), GFP_KERNEL); + if (!rcl) + return; + memcpy(rcl->rcl_sessionid.data, sessionid->data, + NFS4_MAX_SESSIONID_LEN); + rcl->__nr_referring_calls = 0; + INIT_LIST_HEAD(&rcl->rcl_referring_calls); + list_add(&rcl->__list, &cb->cb_referring_call_list); + cb->cb_nr_referring_call_list++; + } + + found = false; + list_for_each_entry(rc, &rcl->rcl_referring_calls, __list) { + if (rc->rc_sequenceid == seqno && rc->rc_slotid == slotid) { + found = true; + break; + } + } + if (!found) { + rc = kmalloc(sizeof(*rc), GFP_KERNEL); + if (!rc) + goto out; + rc->rc_sequenceid = seqno; + rc->rc_slotid = slotid; + rcl->__nr_referring_calls++; + list_add(&rc->__list, &rcl->rcl_referring_calls); + } + +out: + if (!rcl->__nr_referring_calls) { + cb->cb_nr_referring_call_list--; + kfree(rcl); + } +} + +/** + * nfsd41_cb_destroy_referring_call_list - release referring call info + * @cb: context of a callback that has completed + * + * Callers who allocate referring calls using nfsd41_cb_referring_call() must + * release those resources by calling nfsd41_cb_destroy_referring_call_list. + * + * Caller serializes access to @cb. */ +void nfsd41_cb_destroy_referring_call_list(struct nfsd4_callback *cb) +{ + struct nfsd4_referring_call_list *rcl; + struct nfsd4_referring_call *rc; + + while (!list_empty(&cb->cb_referring_call_list)) { + rcl = list_first_entry(&cb->cb_referring_call_list, + struct nfsd4_referring_call_list, + __list); + + while (!list_empty(&rcl->rcl_referring_calls)) { + rc = list_first_entry(&rcl->rcl_referring_calls, + struct nfsd4_referring_call, + __list); + list_del(&rc->__list); + kfree(rc); + } + list_del(&rcl->__list); + kfree(rcl); + } +} + static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = calldata; @@ -1328,30 +1462,14 @@ static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) rpc_call_start(task); } +/* Returns true if CB_COMPOUND processing should continue */ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback *cb) { - struct nfs4_client *clp = cb->cb_clp; - struct nfsd4_session *session = clp->cl_cb_session; - bool ret = true; - - if (!clp->cl_minorversion) { - /* - * If the backchannel connection was shut down while this - * task was queued, we need to resubmit it after setting up - * a new backchannel connection. - * - * Note that if we lost our callback connection permanently - * the submission code will error out, so we don't need to - * handle that case here. - */ - if (RPC_SIGNALLED(task)) - goto need_restart; - - return true; - } + struct nfsd4_session *session = cb->cb_clp->cl_cb_session; + bool ret = false; if (cb->cb_held_slot < 0) - goto need_restart; + goto requeue; /* This is the operation status code for CB_SEQUENCE */ trace_nfsd_cb_seq_status(task, cb); @@ -1365,11 +1483,16 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback * (sequence ID, cached reply) MUST NOT change. 
*/ ++session->se_cb_seq_nr[cb->cb_held_slot]; + ret = true; break; case -ESERVERFAULT: - ++session->se_cb_seq_nr[cb->cb_held_slot]; + /* + * Call succeeded, but the session, slot index, or slot + * sequence number in the response do not match the same + * in the server's call. The sequence information is thus + * untrustworthy. + */ nfsd4_mark_cb_fault(cb->cb_clp); - ret = false; break; case 1: /* @@ -1381,43 +1504,42 @@ static bool nfsd4_cb_sequence_done(struct rpc_task *task, struct nfsd4_callback fallthrough; case -NFS4ERR_BADSESSION: nfsd4_mark_cb_fault(cb->cb_clp); - ret = false; - goto need_restart; + goto requeue; case -NFS4ERR_DELAY: cb->cb_seq_status = 1; - if (!rpc_restart_call(task)) - goto out; - + if (RPC_SIGNALLED(task) || !rpc_restart_call(task)) + goto requeue; rpc_delay(task, 2 * HZ); return false; + case -NFS4ERR_SEQ_MISORDERED: case -NFS4ERR_BADSLOT: + /* + * A SEQ_MISORDERED or BADSLOT error means that the client and + * server are out of sync as to the backchannel parameters. Mark + * the backchannel faulty and restart the RPC, but leak the slot + * so that it's no longer used. + */ + nfsd4_mark_cb_fault(cb->cb_clp); + cb->cb_held_slot = -1; goto retry_nowait; - case -NFS4ERR_SEQ_MISORDERED: - if (session->se_cb_seq_nr[cb->cb_held_slot] != 1) { - session->se_cb_seq_nr[cb->cb_held_slot] = 1; - goto retry_nowait; - } - break; default: nfsd4_mark_cb_fault(cb->cb_clp); } trace_nfsd_cb_free_slot(task, cb); nfsd41_cb_release_slot(cb); - - if (RPC_SIGNALLED(task)) - goto need_restart; -out: return ret; retry_nowait: - if (rpc_restart_call_prepare(task)) - ret = false; - goto out; -need_restart: - if (!test_bit(NFSD4_CLIENT_CB_KILL, &clp->cl_flags)) { - trace_nfsd_cb_restart(clp, cb); - task->tk_status = 0; - cb->cb_need_restart = true; + /* + * RPC_SIGNALLED() means that the rpc_client is being torn down and + * (possibly) recreated. Requeue the call in that case. + */ + if (!RPC_SIGNALLED(task)) { + if (rpc_restart_call_prepare(task)) + return false; } +requeue: + nfsd41_cb_release_slot(cb); + nfsd4_requeue_cb(task, cb); return false; } @@ -1428,8 +1550,21 @@ static void nfsd4_cb_done(struct rpc_task *task, void *calldata) trace_nfsd_cb_rpc_done(clp); - if (!nfsd4_cb_sequence_done(task, cb)) + if (!clp->cl_minorversion) { + /* + * If the backchannel connection was shut down while this + * task was queued, we need to resubmit it after setting up + * a new backchannel connection. + * + * Note that if we lost our callback connection permanently + * the submission code will error out, so we don't need to + * handle that case here. 
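Several hunks in this series replace the old cb_need_restart bool with bits in a new cb_flags word (NFSD4_CALLBACK_REQUEUE, NFSD4_CALLBACK_WAKE, NFSD4_CALLBACK_RUNNING), and callers such as nfsd4_send_cb_offload() and nfsd4_lm_notify() switch to nfsd4_try_run_cb(). That helper is defined outside this section; judging by the open-coded test_and_set_bit pattern in the nfs4layouts.c hunk below, it is presumably close to this sketch (the exact body is an assumption):

	/* fs/nfsd/state.h (sketch) */
	static inline void nfsd4_try_run_cb(struct nfsd4_callback *cb)
	{
		/* queue the callback only if no instance is already running */
		if (!test_and_set_bit(NFSD4_CALLBACK_RUNNING, &cb->cb_flags))
			WARN_ON_ONCE(!nfsd4_run_cb(cb));
	}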
+ */ + if (RPC_SIGNALLED(task)) + nfsd4_requeue_cb(task, cb); + } else if (!nfsd4_cb_sequence_done(task, cb)) { return; + } if (cb->cb_status) { WARN_ONCE(task->tk_status, @@ -1462,7 +1597,7 @@ static void nfsd4_cb_release(void *calldata) trace_nfsd_cb_rpc_release(cb->cb_clp); - if (cb->cb_need_restart) + if (test_bit(NFSD4_CALLBACK_REQUEUE, &cb->cb_flags)) nfsd4_queue_cb(cb); else nfsd41_destroy_cb(cb); @@ -1575,7 +1710,7 @@ nfsd4_run_cb_work(struct work_struct *work) container_of(work, struct nfsd4_callback, cb_work); struct nfs4_client *clp = cb->cb_clp; struct rpc_clnt *clnt; - int flags; + int flags, ret; trace_nfsd_cb_start(clp); @@ -1601,16 +1736,19 @@ nfsd4_run_cb_work(struct work_struct *work) return; } - if (cb->cb_need_restart) { - cb->cb_need_restart = false; - } else { + if (!test_and_clear_bit(NFSD4_CALLBACK_REQUEUE, &cb->cb_flags)) { if (cb->cb_ops && cb->cb_ops->prepare) cb->cb_ops->prepare(cb); } + cb->cb_msg.rpc_cred = clp->cl_cb_cred; flags = clp->cl_minorversion ? RPC_TASK_NOCONNECT : RPC_TASK_SOFTCONN; - rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags, - cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb); + ret = rpc_call_async(clnt, &cb->cb_msg, RPC_TASK_SOFT | flags, + cb->cb_ops ? &nfsd4_cb_ops : &nfsd4_cb_probe_ops, cb); + if (ret != 0) { + set_bit(NFSD4_CALLBACK_REQUEUE, &cb->cb_flags); + nfsd4_queue_cb(cb); + } } void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, @@ -1620,11 +1758,13 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, cb->cb_msg.rpc_proc = &nfs4_cb_procedures[op]; cb->cb_msg.rpc_argp = cb; cb->cb_msg.rpc_resp = cb; + cb->cb_flags = 0; cb->cb_ops = ops; INIT_WORK(&cb->cb_work, nfsd4_run_cb_work); cb->cb_status = 0; - cb->cb_need_restart = false; cb->cb_held_slot = -1; + cb->cb_nr_referring_call_list = 0; + INIT_LIST_HEAD(&cb->cb_referring_call_list); } /** diff --git a/fs/nfsd/nfs4layouts.c b/fs/nfsd/nfs4layouts.c index fbfddd3c4c94..290271ac4245 100644 --- a/fs/nfsd/nfs4layouts.c +++ b/fs/nfsd/nfs4layouts.c @@ -344,9 +344,10 @@ nfsd4_recall_file_layout(struct nfs4_layout_stateid *ls) atomic_inc(&ls->ls_stid.sc_file->fi_lo_recalls); trace_nfsd_layout_recall(&ls->ls_stid.sc_stateid); - refcount_inc(&ls->ls_stid.sc_count); - nfsd4_run_cb(&ls->ls_recall); - + if (!test_and_set_bit(NFSD4_CALLBACK_RUNNING, &ls->ls_recall.cb_flags)) { + refcount_inc(&ls->ls_stid.sc_count); + nfsd4_run_cb(&ls->ls_recall); + } out_unlock: spin_unlock(&ls->ls_lock); } diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index f6e06c779d09..f13abbb13b38 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -266,7 +266,9 @@ nfsd4_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, inode_lock_nested(inode, I_MUTEX_PARENT); - child = lookup_one_len(open->op_fname, parent, open->op_fnamelen); + child = lookup_one(&nop_mnt_idmap, + &QSTR_LEN(open->op_fname, open->op_fnamelen), + parent); if (IS_ERR(child)) { status = nfserrno(PTR_ERR(child)); goto out; @@ -876,6 +878,8 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_getattr *getattr = &u->getattr; __be32 status; + trace_nfsd_vfs_getattr(rqstp, &cstate->current_fh); + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); if (status) return status; @@ -998,6 +1002,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, u64 cookie = readdir->rd_cookie; static const nfs4_verifier zeroverf; + trace_nfsd_vfs_readdir(rqstp, &cstate->current_fh, + readdir->rd_maxcount, readdir->rd_cookie); + /* no 
need to check permission - this will be done in nfsd_readdir() */ if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) @@ -1211,7 +1218,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd_file *nf = NULL; __be32 status = nfs_ok; unsigned long cnt; - int nvecs; if (write->wr_offset > (u64)OFFSET_MAX || write->wr_offset + write->wr_buflen > (u64)OFFSET_MAX) @@ -1226,13 +1232,9 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; write->wr_how_written = write->wr_stable_how; - - nvecs = svc_fill_write_vector(rqstp, &write->wr_payload); - WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec)); - status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf, - write->wr_offset, rqstp->rq_vec, nvecs, &cnt, - write->wr_how_written, + write->wr_offset, &write->wr_payload, + &cnt, write->wr_how_written, (__be32 *)write->wr_verifier.data); nfsd_file_put(nf); @@ -1379,8 +1381,11 @@ static void nfs4_put_copy(struct nfsd4_copy *copy) static void nfsd4_stop_copy(struct nfsd4_copy *copy) { trace_nfsd_copy_async_cancel(copy); - if (!test_and_set_bit(NFSD4_COPY_F_STOPPED, ©->cp_flags)) + if (!test_and_set_bit(NFSD4_COPY_F_STOPPED, ©->cp_flags)) { kthread_stop(copy->copy_task); + copy->nfserr = nfs_ok; + set_bit(NFSD4_COPY_F_COMPLETED, ©->cp_flags); + } nfs4_put_copy(copy); } @@ -1709,10 +1714,11 @@ static int nfsd4_cb_offload_done(struct nfsd4_callback *cb, switch (task->tk_status) { case -NFS4ERR_DELAY: if (cbo->co_retries--) { - rpc_delay(task, 1 * HZ); + rpc_delay(task, HZ / 5); return 0; } } + nfsd41_cb_destroy_referring_call_list(cb); return 1; } @@ -1845,9 +1851,12 @@ static void nfsd4_send_cb_offload(struct nfsd4_copy *copy) nfsd4_init_cb(&cbo->co_cb, copy->cp_clp, &nfsd4_cb_offload_ops, NFSPROC4_CLNT_CB_OFFLOAD); + nfsd41_cb_referring_call(&cbo->co_cb, &cbo->co_referring_sessionid, + cbo->co_referring_slotid, + cbo->co_referring_seqno); trace_nfsd_cb_offload(copy->cp_clp, &cbo->co_res.cb_stateid, &cbo->co_fh, copy->cp_count, copy->nfserr); - nfsd4_run_cb(&cbo->co_cb); + nfsd4_try_run_cb(&cbo->co_cb); } /** @@ -1961,6 +1970,11 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, memcpy(&result->cb_stateid, ©->cp_stateid.cs_stid, sizeof(result->cb_stateid)); dup_copy_fields(copy, async_copy); + memcpy(async_copy->cp_cb_offload.co_referring_sessionid.data, + cstate->session->se_sessionid.data, + NFS4_MAX_SESSIONID_LEN); + async_copy->cp_cb_offload.co_referring_slotid = cstate->slot->sl_index; + async_copy->cp_cb_offload.co_referring_seqno = cstate->slot->sl_seqid; async_copy->copy_task = kthread_create(nfsd4_do_async_copy, async_copy, "%s", "copy thread"); if (IS_ERR(async_copy->copy_task)) @@ -3766,7 +3780,8 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) struct nfs4_op_map *allow = &cstate->clp->cl_spo_must_allow; u32 opiter; - if (!cstate->minorversion) + if (rqstp->rq_procinfo != &nfsd_version4.vs_proc[NFSPROC4_COMPOUND] || + cstate->minorversion == 0) return false; if (cstate->spo_must_allowed) @@ -3832,7 +3847,7 @@ static const struct svc_procedure nfsd_procedures4[2] = { .pc_ressize = sizeof(struct nfsd4_compoundres), .pc_release = nfsd4_release_compoundargs, .pc_cachetype = RC_NOCACHE, - .pc_xdrressize = NFSD_BUFSIZE/4, + .pc_xdrressize = 3+NFSSVC_MAXBLKSIZE/4, .pc_name = "COMPOUND", }, }; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index 28f4d5311c40..82785db730d9 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -33,6 +33,7 @@ */ #include <crypto/hash.h> +#include 
<crypto/sha2.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/namei.h> @@ -218,7 +219,7 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) /* lock the parent */ inode_lock(d_inode(dir)); - dentry = lookup_one_len(dname, dir, HEXDIR_LEN-1); + dentry = lookup_one(&nop_mnt_idmap, &QSTR(dname), dir); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); goto out_unlock; @@ -233,9 +234,12 @@ nfsd4_create_clid_dir(struct nfs4_client *clp) * as well be forgiving and just succeed silently. */ goto out_put; - status = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU); + dentry = vfs_mkdir(&nop_mnt_idmap, d_inode(dir), dentry, S_IRWXU); + if (IS_ERR(dentry)) + status = PTR_ERR(dentry); out_put: - dput(dentry); + if (!status) + dput(dentry); out_unlock: inode_unlock(d_inode(dir)); if (status == 0) { @@ -313,7 +317,8 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) list_for_each_entry_safe(entry, tmp, &ctx.names, list) { if (!status) { struct dentry *dentry; - dentry = lookup_one_len(entry->name, dir, HEXDIR_LEN-1); + dentry = lookup_one(&nop_mnt_idmap, + &QSTR(entry->name), dir); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); break; @@ -336,16 +341,16 @@ nfsd4_list_rec_dir(recdir_func *f, struct nfsd_net *nn) } static int -nfsd4_unlink_clid_dir(char *name, int namlen, struct nfsd_net *nn) +nfsd4_unlink_clid_dir(char *name, struct nfsd_net *nn) { struct dentry *dir, *dentry; int status; - dprintk("NFSD: nfsd4_unlink_clid_dir. name %.*s\n", namlen, name); + dprintk("NFSD: nfsd4_unlink_clid_dir. name %s\n", name); dir = nn->rec_file->f_path.dentry; inode_lock_nested(d_inode(dir), I_MUTEX_PARENT); - dentry = lookup_one_len(name, dir, namlen); + dentry = lookup_one(&nop_mnt_idmap, &QSTR(name), dir); if (IS_ERR(dentry)) { status = PTR_ERR(dentry); goto out_unlock; @@ -405,7 +410,7 @@ nfsd4_remove_clid_dir(struct nfs4_client *clp) if (status < 0) goto out_drop_write; - status = nfsd4_unlink_clid_dir(dname, HEXDIR_LEN-1, nn); + status = nfsd4_unlink_clid_dir(dname, nn); nfs4_reset_creds(original_cred); if (status == 0) { vfs_fsync(nn->rec_file, 0); @@ -733,7 +738,6 @@ struct cld_net { spinlock_t cn_lock; struct list_head cn_list; unsigned int cn_xid; - struct crypto_shash *cn_tfm; #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING bool cn_has_legacy; #endif @@ -1059,8 +1063,6 @@ nfsd4_remove_cld_pipe(struct net *net) nfsd4_cld_unregister_net(net, cn->cn_pipe); rpc_destroy_pipe_data(cn->cn_pipe); - if (cn->cn_tfm) - crypto_free_shash(cn->cn_tfm); kfree(nn->cld_net); nn->cld_net = NULL; } @@ -1154,8 +1156,6 @@ nfsd4_cld_create_v2(struct nfs4_client *clp) struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct cld_net *cn = nn->cld_net; struct cld_msg_v2 *cmsg; - struct crypto_shash *tfm = cn->cn_tfm; - struct xdr_netobj cksum; char *principal = NULL; /* Don't upcall if it's already stored */ @@ -1178,22 +1178,9 @@ nfsd4_cld_create_v2(struct nfs4_client *clp) else if (clp->cl_cred.cr_principal) principal = clp->cl_cred.cr_principal; if (principal) { - cksum.len = crypto_shash_digestsize(tfm); - cksum.data = kmalloc(cksum.len, GFP_KERNEL); - if (cksum.data == NULL) { - ret = -ENOMEM; - goto out; - } - ret = crypto_shash_tfm_digest(tfm, principal, strlen(principal), - cksum.data); - if (ret) { - kfree(cksum.data); - goto out; - } - cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = cksum.len; - memcpy(cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data, - cksum.data, cksum.len); - kfree(cksum.data); + sha256(principal, strlen(principal), + cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data); + 
cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = SHA256_DIGEST_SIZE; } else cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = 0; @@ -1203,7 +1190,6 @@ nfsd4_cld_create_v2(struct nfs4_client *clp) set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } -out: free_cld_upcall(cup); out_err: if (ret) @@ -1342,12 +1328,11 @@ found: static int nfsd4_cld_check_v2(struct nfs4_client *clp) { - struct nfs4_client_reclaim *crp; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING struct cld_net *cn = nn->cld_net; - int status; - struct crypto_shash *tfm = cn->cn_tfm; - struct xdr_netobj cksum; +#endif + struct nfs4_client_reclaim *crp; char *principal = NULL; /* did we already find that this client is stable? */ @@ -1363,6 +1348,7 @@ nfsd4_cld_check_v2(struct nfs4_client *clp) if (cn->cn_has_legacy) { struct xdr_netobj name; char dname[HEXDIR_LEN]; + int status; status = nfs4_make_rec_clidname(dname, &clp->cl_name); if (status) @@ -1385,28 +1371,18 @@ nfsd4_cld_check_v2(struct nfs4_client *clp) return -ENOENT; found: if (crp->cr_princhash.len) { + u8 digest[SHA256_DIGEST_SIZE]; + if (clp->cl_cred.cr_raw_principal) principal = clp->cl_cred.cr_raw_principal; else if (clp->cl_cred.cr_principal) principal = clp->cl_cred.cr_principal; if (principal == NULL) return -ENOENT; - cksum.len = crypto_shash_digestsize(tfm); - cksum.data = kmalloc(cksum.len, GFP_KERNEL); - if (cksum.data == NULL) - return -ENOENT; - status = crypto_shash_tfm_digest(tfm, principal, - strlen(principal), cksum.data); - if (status) { - kfree(cksum.data); - return -ENOENT; - } - if (memcmp(crp->cr_princhash.data, cksum.data, - crp->cr_princhash.len)) { - kfree(cksum.data); + sha256(principal, strlen(principal), digest); + if (memcmp(crp->cr_princhash.data, digest, + crp->cr_princhash.len)) return -ENOENT; - } - kfree(cksum.data); } crp->cr_clp = clp; return 0; @@ -1586,7 +1562,6 @@ nfsd4_cld_tracking_init(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); bool running; int retries = 10; - struct crypto_shash *tfm; status = nfs4_cld_state_init(net); if (status) @@ -1611,12 +1586,6 @@ nfsd4_cld_tracking_init(struct net *net) status = -ETIMEDOUT; goto err_remove; } - tfm = crypto_alloc_shash("sha256", 0, 0); - if (IS_ERR(tfm)) { - status = PTR_ERR(tfm); - goto err_remove; - } - nn->cld_net->cn_tfm = tfm; status = nfsd4_cld_get_version(nn); if (status == -EOPNOTSUPP) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 153eeea2c7c9..d5694987f86f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -946,15 +946,6 @@ struct nfs4_stid *nfs4_alloc_stid(struct nfs4_client *cl, struct kmem_cache *sla spin_lock_init(&stid->sc_lock); INIT_LIST_HEAD(&stid->sc_cp_list); - /* - * It shouldn't be a problem to reuse an opaque stateid value. - * I don't think it is for 4.1. But with 4.0 I worry that, for - * example, a stray write retransmission could be accepted by - * the server when it should have been rejected. Therefore, - * adopt a trick from the sctp code to attempt to maximize the - * amount of time until an id is reused, by ensuring they always - * "increase" (mod INT_MAX): - */ return stid; out_free: kmem_cache_free(slab, stid); @@ -1050,6 +1041,12 @@ static struct nfs4_ol_stateid * nfs4_alloc_open_stateid(struct nfs4_client *clp) return openlockstateid(stid); } +/* + * As the sc_free callback of deleg, this may be called by nfs4_put_stid + * in nfsd_break_one_deleg. 
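The nfs4recover.c conversion above drops the long-lived crypto_shash transform (cn_tfm) in favor of the one-shot SHA-256 library call, matching the Kconfig move from select CRYPTO_SHA256 to select CRYPTO_LIB_SHA256 earlier in this section. The library interface relied on is roughly:

	/* <crypto/sha2.h>: synchronous, never fails, needs no allocation */
	void sha256(const u8 *data, unsigned int len, u8 *out);

That is what lets both the upcall path and the comparison path hash the principal directly into a stack buffer (u8 digest[SHA256_DIGEST_SIZE]) with no error handling and no kmalloc().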
+ * Considering nfsd_break_one_deleg is called with the flc->flc_lock held, + * this function mustn't ever sleep. + */ static void nfs4_free_deleg(struct nfs4_stid *stid) { struct nfs4_delegation *dp = delegstateid(stid); @@ -1378,7 +1375,8 @@ static void revoke_delegation(struct nfs4_delegation *dp) struct nfs4_client *clp = dp->dl_stid.sc_client; WARN_ON(!list_empty(&dp->dl_recall_lru)); - WARN_ON_ONCE(!(dp->dl_stid.sc_status & + WARN_ON_ONCE(dp->dl_stid.sc_client->cl_minorversion > 0 && + !(dp->dl_stid.sc_status & (SC_STATUS_REVOKED | SC_STATUS_ADMIN_REVOKED))); trace_nfsd_stid_revoke(&dp->dl_stid); @@ -1989,26 +1987,30 @@ reduce_session_slots(struct nfsd4_session *ses, int dec) return ret; } -/* - * We don't actually need to cache the rpc and session headers, so we - * can allocate a little less for each slot: - */ -static inline u32 slot_bytes(struct nfsd4_channel_attrs *ca) +static struct nfsd4_slot *nfsd4_alloc_slot(struct nfsd4_channel_attrs *fattrs, + int index, gfp_t gfp) { - u32 size; + struct nfsd4_slot *slot; + size_t size; - if (ca->maxresp_cached < NFSD_MIN_HDR_SEQ_SZ) - size = 0; - else - size = ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; - return size + sizeof(struct nfsd4_slot); + /* + * The RPC and NFS session headers are never saved in + * the slot reply cache buffer. + */ + size = fattrs->maxresp_cached < NFSD_MIN_HDR_SEQ_SZ ? + 0 : fattrs->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; + + slot = kzalloc(struct_size(slot, sl_data, size), gfp); + if (!slot) + return NULL; + slot->sl_index = index; + return slot; } static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, struct nfsd4_channel_attrs *battrs) { int numslots = fattrs->maxreqs; - int slotsize = slot_bytes(fattrs); struct nfsd4_session *new; struct nfsd4_slot *slot; int i; @@ -2017,14 +2019,14 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, if (!new) return NULL; xa_init(&new->se_slots); - /* allocate each struct nfsd4_slot and data cache in one piece */ - slot = kzalloc(slotsize, GFP_KERNEL); + + slot = nfsd4_alloc_slot(fattrs, 0, GFP_KERNEL); if (!slot || xa_is_err(xa_store(&new->se_slots, 0, slot, GFP_KERNEL))) goto out_free; for (i = 1; i < numslots; i++) { const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; - slot = kzalloc(slotsize, gfp); + slot = nfsd4_alloc_slot(fattrs, i, gfp); if (!slot) break; if (xa_is_err(xa_store(&new->se_slots, i, slot, gfp))) { @@ -3168,7 +3170,6 @@ nfsd4_cb_recall_any_release(struct nfsd4_callback *cb) { struct nfs4_client *clp = cb->cb_clp; - clear_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags); drop_client(clp); } @@ -3199,7 +3200,6 @@ nfsd4_cb_getattr_release(struct nfsd4_callback *cb) struct nfs4_delegation *dp = container_of(ncf, struct nfs4_delegation, dl_cb_fattr); - clear_and_wake_up_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags); nfs4_put_stid(&dp->dl_stid); } @@ -3220,11 +3220,15 @@ static void nfs4_cb_getattr(struct nfs4_cb_fattr *ncf) struct nfs4_delegation *dp = container_of(ncf, struct nfs4_delegation, dl_cb_fattr); - if (test_and_set_bit(CB_GETATTR_BUSY, &ncf->ncf_cb_flags)) + if (test_and_set_bit(NFSD4_CALLBACK_RUNNING, &ncf->ncf_getattr.cb_flags)) return; + /* set to proper status when nfsd4_cb_getattr_done runs */ ncf->ncf_cb_status = NFS4ERR_IO; + /* ensure that wake_bit is done when RUNNING is cleared */ + set_bit(NFSD4_CALLBACK_WAKE, &ncf->ncf_getattr.cb_flags); + refcount_inc(&dp->dl_stid.sc_count); nfsd4_run_cb(&ncf->ncf_getattr); } @@ -4402,7 +4406,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct 
nfsd4_compound_state *cstate, nfserr_rep_too_big; if (xdr_restrict_buflen(xdr, buflen - rqstp->rq_auth_slack)) goto out_put_session; - svc_reserve(rqstp, buflen); + svc_reserve_auth(rqstp, buflen); status = nfs_ok; /* Success! accept new slot seqid */ @@ -4438,8 +4442,8 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * spinlock, and only succeeds if there is * plenty of memory. */ - slot = kzalloc(slot_bytes(&session->se_fchannel), - GFP_NOWAIT); + slot = nfsd4_alloc_slot(&session->se_fchannel, s, + GFP_NOWAIT); prev_slot = xa_load(&session->se_slots, s); if (xa_is_value(prev_slot) && slot) { slot->sl_seqid = xa_to_value(prev_slot); @@ -4815,8 +4819,8 @@ out: static unsigned long nfsd4_state_shrinker_count(struct shrinker *shrink, struct shrink_control *sc) { - int count; struct nfsd_net *nn = shrink->private_data; + long count; count = atomic_read(&nn->nfsd_courtesy_clients); if (!count) @@ -5414,6 +5418,11 @@ static const struct nfsd4_callback_ops nfsd4_cb_recall_ops = { static void nfsd_break_one_deleg(struct nfs4_delegation *dp) { + bool queued; + + if (test_and_set_bit(NFSD4_CALLBACK_RUNNING, &dp->dl_recall.cb_flags)) + return; + /* * We're assuming the state code never drops its reference * without first removing the lease. Since we're in this lease @@ -5422,7 +5431,10 @@ static void nfsd_break_one_deleg(struct nfs4_delegation *dp) * we know it's safe to take a reference. */ refcount_inc(&dp->dl_stid.sc_count); - WARN_ON_ONCE(!nfsd4_run_cb(&dp->dl_recall)); + queued = nfsd4_run_cb(&dp->dl_recall); + WARN_ON_ONCE(!queued); + if (!queued) + refcount_dec(&dp->dl_stid.sc_count); } /* Called from break_lease() with flc_lock held. */ @@ -5948,11 +5960,23 @@ nfsd4_verify_setuid_write(struct nfsd4_open *open, struct nfsd_file *nf) return 0; } +#ifdef CONFIG_NFSD_V4_DELEG_TIMESTAMPS +static bool nfsd4_want_deleg_timestamps(const struct nfsd4_open *open) +{ + return open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS; +} +#else /* CONFIG_NFSD_V4_DELEG_TIMESTAMPS */ +static bool nfsd4_want_deleg_timestamps(const struct nfsd4_open *open) +{ + return false; +} +#endif /* CONFIG_NFSD_V4_DELEG_TIMESTAMPS */ + static struct nfs4_delegation * nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, struct svc_fh *parent) { - bool deleg_ts = open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS; + bool deleg_ts = nfsd4_want_deleg_timestamps(open); struct nfs4_client *clp = stp->st_stid.sc_client; struct nfs4_file *fp = stp->st_stid.sc_file; struct nfs4_clnt_odstate *odstate = stp->st_clnt_odstate; @@ -5999,6 +6023,15 @@ nfs4_set_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, if (!nf) return ERR_PTR(-EAGAIN); + /* + * File delegations and associated locks cannot be recovered if the + * export is from an NFS proxy server. 
+ */ + if (exportfs_cannot_lock(nf->nf_file->f_path.mnt->mnt_sb->s_export_op)) { + nfsd_file_put(nf); + return ERR_PTR(-EOPNOTSUPP); + } + spin_lock(&state_lock); spin_lock(&fp->fi_lock); if (nfs4_delegation_exists(clp, fp)) @@ -6151,8 +6184,8 @@ static void nfs4_open_delegation(struct nfsd4_open *open, struct nfs4_ol_stateid *stp, struct svc_fh *currentfh) { - bool deleg_ts = open->op_deleg_want & OPEN4_SHARE_ACCESS_WANT_DELEG_TIMESTAMPS; struct nfs4_openowner *oo = openowner(stp->st_stateowner); + bool deleg_ts = nfsd4_want_deleg_timestamps(open); struct nfs4_client *clp = stp->st_stid.sc_client; struct svc_fh *parent = NULL; struct nfs4_delegation *dp; @@ -6855,38 +6888,34 @@ deleg_reaper(struct nfsd_net *nn) { struct list_head *pos, *next; struct nfs4_client *clp; - LIST_HEAD(cblist); spin_lock(&nn->client_lock); list_for_each_safe(pos, next, &nn->client_lru) { clp = list_entry(pos, struct nfs4_client, cl_lru); - if (clp->cl_state != NFSD4_ACTIVE || - list_empty(&clp->cl_delegations) || - atomic_read(&clp->cl_delegs_in_recall) || - test_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags) || - (ktime_get_boottime_seconds() - - clp->cl_ra_time < 5)) { + + if (clp->cl_state != NFSD4_ACTIVE) + continue; + if (list_empty(&clp->cl_delegations)) + continue; + if (atomic_read(&clp->cl_delegs_in_recall)) + continue; + if (test_and_set_bit(NFSD4_CALLBACK_RUNNING, &clp->cl_ra->ra_cb.cb_flags)) + continue; + if (ktime_get_boottime_seconds() - clp->cl_ra_time < 5) + continue; + if (clp->cl_cb_state != NFSD4_CB_UP) continue; - } - list_add(&clp->cl_ra_cblist, &cblist); /* release in nfsd4_cb_recall_any_release */ kref_get(&clp->cl_nfsdfs.cl_ref); - set_bit(NFSD4_CLIENT_CB_RECALL_ANY, &clp->cl_flags); clp->cl_ra_time = ktime_get_boottime_seconds(); - } - spin_unlock(&nn->client_lock); - - while (!list_empty(&cblist)) { - clp = list_first_entry(&cblist, struct nfs4_client, - cl_ra_cblist); - list_del_init(&clp->cl_ra_cblist); clp->cl_ra->ra_keep = 0; clp->cl_ra->ra_bmval[0] = BIT(RCA4_TYPE_MASK_RDATA_DLG) | BIT(RCA4_TYPE_MASK_WDATA_DLG); trace_nfsd_cb_recall_any(clp->cl_ra); nfsd4_run_cb(&clp->cl_ra->ra_cb); } + spin_unlock(&nn->client_lock); } static void @@ -7051,7 +7080,7 @@ nfsd4_lookup_stateid(struct nfsd4_compound_state *cstate, */ statusmask |= SC_STATUS_REVOKED; - statusmask |= SC_STATUS_ADMIN_REVOKED; + statusmask |= SC_STATUS_ADMIN_REVOKED | SC_STATUS_FREEABLE; if (ZERO_STATEID(stateid) || ONE_STATEID(stateid) || CLOSE_STATEID(stateid)) @@ -7706,9 +7735,7 @@ nfsd4_delegreturn(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, if ((status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0))) return status; - status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, - SC_STATUS_REVOKED | SC_STATUS_FREEABLE, - &s, nn); + status = nfsd4_lookup_stateid(cstate, stateid, SC_TYPE_DELEG, SC_STATUS_REVOKED, &s, nn); if (status) goto out; dp = delegstateid(s); @@ -7816,7 +7843,7 @@ nfsd4_lm_notify(struct file_lock *fl) if (queue) { trace_nfsd_cb_notify_lock(lo, nbl); - nfsd4_run_cb(&nbl->nbl_cb); + nfsd4_try_run_cb(&nbl->nbl_cb); } } @@ -8134,6 +8161,10 @@ nfsd4_lock(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, status = fh_verify(rqstp, &cstate->current_fh, S_IFREG, 0); if (status != nfs_ok) return status; + if (exportfs_cannot_lock(cstate->current_fh.fh_dentry->d_sb->s_export_op)) { + status = nfserr_notsupp; + goto out; + } if (lock->lk_is_new) { if (nfsd4_has_session(cstate)) @@ -8473,6 +8504,11 @@ nfsd4_locku(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, 
status = nfserr_lock_range; goto put_stateid; } + if (exportfs_cannot_lock(nf->nf_file->f_path.mnt->mnt_sb->s_export_op)) { + status = nfserr_notsupp; + goto put_file; + } + file_lock = locks_alloc_lock(); if (!file_lock) { dprintk("NFSD: %s: unable to allocate lock!\n", __func__); @@ -9182,8 +9218,8 @@ nfsd4_deleg_getattr_conflict(struct svc_rqst *rqstp, struct dentry *dentry, nfs4_cb_getattr(&dp->dl_cb_fattr); spin_unlock(&ctx->flc_lock); - wait_on_bit_timeout(&ncf->ncf_cb_flags, CB_GETATTR_BUSY, - TASK_INTERRUPTIBLE, NFSD_CB_GETATTR_TIMEOUT); + wait_on_bit_timeout(&ncf->ncf_getattr.cb_flags, NFSD4_CALLBACK_RUNNING, + TASK_UNINTERRUPTIBLE, NFSD_CB_GETATTR_TIMEOUT); if (ncf->ncf_cb_status) { /* Recall delegation only if client didn't respond */ status = nfserrno(nfsd_open_break_lease(inode, NFSD_MAY_READ)); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index e67420729ecd..3afcdbed6e14 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2564,7 +2564,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) /* Sessions make the DRC unnecessary: */ if (argp->minorversion) cachethis = false; - svc_reserve(argp->rqstp, max_reply + readbytes); + svc_reserve_auth(argp->rqstp, max_reply + readbytes); argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE; argp->splice_ok = nfsd_read_splice_ok(argp->rqstp); @@ -3391,6 +3391,23 @@ static __be32 nfsd4_encode_fattr4_suppattr_exclcreat(struct xdr_stream *xdr, return nfsd4_encode_bitmap4(xdr, supp[0], supp[1], supp[2]); } +/* + * Copied from generic_remap_checks/generic_remap_file_range_prep. + * + * These generic functions use the file system's s_blocksize, but + * individual file systems aren't required to use + * generic_remap_file_range_prep. Until there is a mechanism for + * determining a particular file system's (or file's) clone block + * size, this is the best NFSD can do. 
+ */ +static __be32 nfsd4_encode_fattr4_clone_blksize(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + struct inode *inode = d_inode(args->dentry); + + return nfsd4_encode_uint32_t(xdr, inode->i_sb->s_blocksize); +} + #ifdef CONFIG_NFSD_V4_SECURITY_LABEL static __be32 nfsd4_encode_fattr4_sec_label(struct xdr_stream *xdr, const struct nfsd4_fattr_args *args) @@ -3545,7 +3562,7 @@ static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = { [FATTR4_MODE_SET_MASKED] = nfsd4_encode_fattr4__noop, [FATTR4_SUPPATTR_EXCLCREAT] = nfsd4_encode_fattr4_suppattr_exclcreat, [FATTR4_FS_CHARSET_CAP] = nfsd4_encode_fattr4__noop, - [FATTR4_CLONE_BLKSIZE] = nfsd4_encode_fattr4__noop, + [FATTR4_CLONE_BLKSIZE] = nfsd4_encode_fattr4_clone_blksize, [FATTR4_SPACE_FREED] = nfsd4_encode_fattr4__noop, [FATTR4_CHANGE_ATTR_TYPE] = nfsd4_encode_fattr4__noop, @@ -3812,7 +3829,9 @@ nfsd4_encode_entry4_fattr(struct nfsd4_readdir *cd, const char *name, __be32 nfserr; int ignore_crossmnt = 0; - dentry = lookup_positive_unlocked(name, cd->rd_fhp->fh_dentry, namlen); + dentry = lookup_one_positive_unlocked(&nop_mnt_idmap, + &QSTR_LEN(name, namlen), + cd->rd_fhp->fh_dentry); if (IS_ERR(dentry)) return nfserrno(PTR_ERR(dentry)); diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index ce2a71e4904c..3f3e9f6c4250 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -1917,6 +1917,7 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) struct svc_serv *serv; LIST_HEAD(permsocks); struct nfsd_net *nn; + bool delete = false; int err, rem; mutex_lock(&nfsd_mutex); @@ -1977,34 +1978,28 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) } } - /* For now, no removing old sockets while server is running */ - if (serv->sv_nrthreads && !list_empty(&permsocks)) { + /* + * If there are listener transports remaining on the permsocks list, + * it means we were asked to remove a listener. + */ + if (!list_empty(&permsocks)) { list_splice_init(&permsocks, &serv->sv_permsocks); - spin_unlock_bh(&serv->sv_lock); - err = -EBUSY; - goto out_unlock_mtx; + delete = true; } + spin_unlock_bh(&serv->sv_lock); - /* Close the remaining sockets on the permsocks list */ - while (!list_empty(&permsocks)) { - xprt = list_first_entry(&permsocks, struct svc_xprt, xpt_list); - list_move(&xprt->xpt_list, &serv->sv_permsocks); - - /* - * Newly-created sockets are born with the BUSY bit set. Clear - * it if there are no threads, since nothing can pick it up - * in that case. - */ - if (!serv->sv_nrthreads) - clear_bit(XPT_BUSY, &xprt->xpt_flags); - - set_bit(XPT_CLOSE, &xprt->xpt_flags); - spin_unlock_bh(&serv->sv_lock); - svc_xprt_close(xprt); - spin_lock_bh(&serv->sv_lock); + /* Do not remove listeners while there are active threads. */ + if (serv->sv_nrthreads) { + err = -EBUSY; + goto out_unlock_mtx; } - spin_unlock_bh(&serv->sv_lock); + /* + * Since we can't delete an arbitrary llist entry, destroy the + * remaining listeners and recreate the list. 
+ */ + if (delete) + svc_xprt_destroy_all(serv, net); /* walk list of addrs again, open any that still don't exist */ nlmsg_for_each_attr(attr, info->nlhdr, GENL_HDRLEN, rem) { @@ -2031,6 +2026,9 @@ int nfsd_nl_listener_set_doit(struct sk_buff *skb, struct genl_info *info) xprt = svc_find_listener(serv, xcl_name, net, sa); if (xprt) { + if (delete) + WARN_ONCE(1, "Transport type=%s already exists\n", + xcl_name); svc_xprt_put(xprt); continue; } @@ -2204,8 +2202,14 @@ static __net_init int nfsd_net_init(struct net *net) NFSD_STATS_COUNTERS_NUM); if (retval) goto out_repcache_error; + memset(&nn->nfsd_svcstats, 0, sizeof(nn->nfsd_svcstats)); nn->nfsd_svcstats.program = &nfsd_programs[0]; + if (!nfsd_proc_stat_init(net)) { + retval = -ENOMEM; + goto out_proc_error; + } + for (i = 0; i < sizeof(nn->nfsd_versions); i++) nn->nfsd_versions[i] = nfsd_support_version(i); for (i = 0; i < sizeof(nn->nfsd4_minorversions); i++) @@ -2215,13 +2219,14 @@ static __net_init int nfsd_net_init(struct net *net) nfsd4_init_leases_net(nn); get_random_bytes(&nn->siphash_key, sizeof(nn->siphash_key)); seqlock_init(&nn->writeverf_lock); - nfsd_proc_stat_init(net); #if IS_ENABLED(CONFIG_NFS_LOCALIO) spin_lock_init(&nn->local_clients_lock); INIT_LIST_HEAD(&nn->local_clients); #endif return 0; +out_proc_error: + percpu_counter_destroy_many(nn->counter, NFSD_STATS_COUNTERS_NUM); out_repcache_error: nfsd_idmap_shutdown(net); out_idmap_error: @@ -2276,6 +2281,8 @@ static int __init init_nfsd(void) { int retval; + nfsd_debugfs_init(); + retval = nfsd4_init_slabs(); if (retval) return retval; @@ -2286,12 +2293,9 @@ static int __init init_nfsd(void) if (retval) goto out_free_pnfs; nfsd_lockd_init(); /* lockd->nfsd callbacks */ - retval = create_proc_exports_entry(); - if (retval) - goto out_free_lockd; retval = register_pernet_subsys(&nfsd_net_ops); if (retval < 0) - goto out_free_exports; + goto out_free_lockd; retval = register_cld_notifier(); if (retval) goto out_free_subsys; @@ -2300,22 +2304,26 @@ static int __init init_nfsd(void) goto out_free_cld; retval = register_filesystem(&nfsd_fs_type); if (retval) - goto out_free_all; + goto out_free_nfsd4; retval = genl_register_family(&nfsd_nl_family); if (retval) + goto out_free_filesystem; + retval = create_proc_exports_entry(); + if (retval) goto out_free_all; nfsd_localio_ops_init(); return 0; out_free_all: + genl_unregister_family(&nfsd_nl_family); +out_free_filesystem: + unregister_filesystem(&nfsd_fs_type); +out_free_nfsd4: nfsd4_destroy_laundry_wq(); out_free_cld: unregister_cld_notifier(); out_free_subsys: unregister_pernet_subsys(&nfsd_net_ops); -out_free_exports: - remove_proc_entry("fs/nfs/exports", NULL); - remove_proc_entry("fs/nfs", NULL); out_free_lockd: nfsd_lockd_shutdown(); nfsd_drc_slab_free(); @@ -2323,22 +2331,24 @@ out_free_pnfs: nfsd4_exit_pnfs(); out_free_slabs: nfsd4_free_slabs(); + nfsd_debugfs_exit(); return retval; } static void __exit exit_nfsd(void) { + remove_proc_entry("fs/nfs/exports", NULL); + remove_proc_entry("fs/nfs", NULL); genl_unregister_family(&nfsd_nl_family); unregister_filesystem(&nfsd_fs_type); nfsd4_destroy_laundry_wq(); unregister_cld_notifier(); unregister_pernet_subsys(&nfsd_net_ops); nfsd_drc_slab_free(); - remove_proc_entry("fs/nfs/exports", NULL); - remove_proc_entry("fs/nfs", NULL); nfsd_lockd_shutdown(); nfsd4_free_slabs(); nfsd4_exit_pnfs(); + nfsd_debugfs_exit(); } MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index e2997f0ffbc5..1bfd0b4e9af7 100644 --- a/fs/nfsd/nfsd.h 
+++ b/fs/nfsd/nfsd.h @@ -44,24 +44,14 @@ bool nfsd_support_version(int vers); #include "stats.h" /* - * Maximum blocksizes supported by daemon under various circumstances. + * Default and maximum payload size (NFS READ or WRITE), in bytes. + * The default is historical, and the maximum is an implementation + * limit. */ -#define NFSSVC_MAXBLKSIZE RPCSVC_MAXPAYLOAD -/* NFSv2 is limited by the protocol specification, see RFC 1094 */ -#define NFSSVC_MAXBLKSIZE_V2 (8*1024) - - -/* - * Largest number of bytes we need to allocate for an NFS - * call or reply. Used to control buffer sizes. We use - * the length of v3 WRITE, READDIR and READDIR replies - * which are an RPC header, up to 26 XDR units of reply - * data, and some page data. - * - * Note that accuracy here doesn't matter too much as the - * size is rounded up to a page size when allocating space. - */ -#define NFSD_BUFSIZE ((RPC_MAX_HEADER_WITH_AUTH+26)*XDR_UNIT + NFSSVC_MAXBLKSIZE) +enum { + NFSSVC_DEFBLKSIZE = 1 * 1024 * 1024, + NFSSVC_MAXBLKSIZE = RPCSVC_MAXPAYLOAD, +}; struct readdir_cd { __be32 err; /* 0, nfserr, or nfserr_eof */ @@ -156,6 +146,16 @@ void nfsd_reset_versions(struct nfsd_net *nn); int nfsd_create_serv(struct net *net); void nfsd_destroy_serv(struct net *net); +#ifdef CONFIG_DEBUG_FS +void nfsd_debugfs_init(void); +void nfsd_debugfs_exit(void); +#else +static inline void nfsd_debugfs_init(void) {} +static inline void nfsd_debugfs_exit(void) {} +#endif + +extern bool nfsd_disable_splice_read __read_mostly; + extern int nfsd_max_blksize; static inline int nfsd_v4client(struct svc_rqst *rq) diff --git a/fs/nfsd/nfsfh.h b/fs/nfsd/nfsfh.h index 876152a91f12..5103c2f4d225 100644 --- a/fs/nfsd/nfsfh.h +++ b/fs/nfsd/nfsfh.h @@ -267,7 +267,6 @@ static inline bool fh_fsid_match(const struct knfsd_fh *fh1, return true; } -#ifdef CONFIG_CRC32 /** * knfsd_fh_hash - calculate the crc32 hash for the filehandle * @fh - pointer to filehandle @@ -279,12 +278,6 @@ static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh) { return ~crc32_le(0xFFFFFFFF, fh->fh_raw, fh->fh_size); } -#else -static inline u32 knfsd_fh_hash(const struct knfsd_fh *fh) -{ - return 0; -} -#endif /** * fh_clear_pre_post_attrs - Reset pre/post attributes diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 6dda081eb24c..c10fa8128a8a 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -10,6 +10,7 @@ #include "cache.h" #include "xdr.h" #include "vfs.h" +#include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -54,7 +55,7 @@ nfsd_proc_getattr(struct svc_rqst *rqstp) struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd_attrstat *resp = rqstp->rq_resp; - dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); + trace_nfsd_vfs_getattr(rqstp, &argp->fh); fh_copy(&resp->fh, &argp->fh); resp->status = fh_verify(rqstp, &resp->fh, 0, @@ -211,7 +212,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) SVCFH_fmt(&argp->fh), argp->count, argp->offset); - argp->count = min_t(u32, argp->count, NFSSVC_MAXBLKSIZE_V2); + argp->count = min_t(u32, argp->count, NFS_MAXDATA); argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen); resp->pages = rqstp->rq_next_page; @@ -250,17 +251,14 @@ nfsd_proc_write(struct svc_rqst *rqstp) struct nfsd_writeargs *argp = rqstp->rq_argp; struct nfsd_attrstat *resp = rqstp->rq_resp; unsigned long cnt = argp->len; - unsigned int nvecs; dprintk("nfsd: WRITE %s %u bytes at %d\n", SVCFH_fmt(&argp->fh), argp->len, argp->offset); - nvecs = svc_fill_write_vector(rqstp, &argp->payload); - - resp->status = nfsd_write(rqstp, fh_copy(&resp->fh, 
&argp->fh), - argp->offset, rqstp->rq_vec, nvecs, - &cnt, NFS_DATA_SYNC, NULL); + fh_copy(&resp->fh, &argp->fh); + resp->status = nfsd_write(rqstp, &resp->fh, argp->offset, + &argp->payload, &cnt, NFS_DATA_SYNC, NULL); if (resp->status == nfs_ok) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) @@ -292,9 +290,6 @@ nfsd_proc_create(struct svc_rqst *rqstp) int hosterr; dev_t rdev = 0, wanted = new_decode_dev(attr->ia_size); - dprintk("nfsd: CREATE %s %.*s\n", - SVCFH_fmt(dirfhp), argp->len, argp->name); - /* First verify the parent file handle */ resp->status = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_EXEC); if (resp->status != nfs_ok) @@ -312,7 +307,8 @@ nfsd_proc_create(struct svc_rqst *rqstp) } inode_lock_nested(dirfhp->fh_dentry->d_inode, I_MUTEX_PARENT); - dchild = lookup_one_len(argp->name, dirfhp->fh_dentry, argp->len); + dchild = lookup_one(&nop_mnt_idmap, &QSTR_LEN(argp->name, argp->len), + dirfhp->fh_dentry); if (IS_ERR(dchild)) { resp->status = nfserrno(PTR_ERR(dchild)); goto out_unlock; @@ -331,7 +327,7 @@ nfsd_proc_create(struct svc_rqst *rqstp) */ resp->status = nfserr_acces; if (!newfhp->fh_dentry) { - printk(KERN_WARNING + printk(KERN_WARNING "nfsd_proc_create: file handle not verified\n"); goto out_unlock; } @@ -445,9 +441,6 @@ nfsd_proc_remove(struct svc_rqst *rqstp) struct nfsd_diropargs *argp = rqstp->rq_argp; struct nfsd_stat *resp = rqstp->rq_resp; - dprintk("nfsd: REMOVE %s %.*s\n", SVCFH_fmt(&argp->fh), - argp->len, argp->name); - /* Unlink. -SIFDIR means file must not be a directory */ resp->status = nfsd_unlink(rqstp, &argp->fh, -S_IFDIR, argp->name, argp->len); @@ -462,11 +455,6 @@ nfsd_proc_rename(struct svc_rqst *rqstp) struct nfsd_renameargs *argp = rqstp->rq_argp; struct nfsd_stat *resp = rqstp->rq_resp; - dprintk("nfsd: RENAME %s %.*s -> \n", - SVCFH_fmt(&argp->ffh), argp->flen, argp->fname); - dprintk("nfsd: -> %s %.*s\n", - SVCFH_fmt(&argp->tfh), argp->tlen, argp->tname); - resp->status = nfsd_rename(rqstp, &argp->ffh, argp->fname, argp->flen, &argp->tfh, argp->tname, argp->tlen); fh_put(&argp->ffh); @@ -481,13 +469,6 @@ nfsd_proc_link(struct svc_rqst *rqstp) struct nfsd_linkargs *argp = rqstp->rq_argp; struct nfsd_stat *resp = rqstp->rq_resp; - dprintk("nfsd: LINK %s ->\n", - SVCFH_fmt(&argp->ffh)); - dprintk("nfsd: %s %.*s\n", - SVCFH_fmt(&argp->tfh), - argp->tlen, - argp->tname); - resp->status = nfsd_link(rqstp, &argp->tfh, argp->tname, argp->tlen, &argp->ffh); fh_put(&argp->ffh); @@ -519,10 +500,6 @@ nfsd_proc_symlink(struct svc_rqst *rqstp) goto out; } - dprintk("nfsd: SYMLINK %s %.*s -> %.*s\n", - SVCFH_fmt(&argp->ffh), argp->flen, argp->fname, - argp->tlen, argp->tname); - fh_init(&newfh, NFS_FHSIZE); resp->status = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen, argp->tname, &attrs, &newfh); @@ -548,8 +525,6 @@ nfsd_proc_mkdir(struct svc_rqst *rqstp) .na_iattr = &argp->attrs, }; - dprintk("nfsd: MKDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name); - if (resp->fh.fh_dentry) { printk(KERN_WARNING "nfsd_proc_mkdir: response already verified??\n"); @@ -578,8 +553,6 @@ nfsd_proc_rmdir(struct svc_rqst *rqstp) struct nfsd_diropargs *argp = rqstp->rq_argp; struct nfsd_stat *resp = rqstp->rq_resp; - dprintk("nfsd: RMDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name); - resp->status = nfsd_unlink(rqstp, &argp->fh, S_IFDIR, argp->name, argp->len); fh_put(&argp->fh); @@ -615,9 +588,7 @@ nfsd_proc_readdir(struct svc_rqst *rqstp) struct nfsd_readdirres *resp = rqstp->rq_resp; loff_t 
offset; - dprintk("nfsd: READDIR %s %d bytes at %d\n", - SVCFH_fmt(&argp->fh), - argp->count, argp->cookie); + trace_nfsd_vfs_readdir(rqstp, &argp->fh, argp->count, argp->cookie); nfsd_init_dirlist_pages(rqstp, resp, argp->count); @@ -642,8 +613,6 @@ nfsd_proc_statfs(struct svc_rqst *rqstp) struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd_statfsres *resp = rqstp->rq_resp; - dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh)); - resp->status = nfsd_statfs(rqstp, &argp->fh, &resp->stats, NFSD_MAY_BYPASS_GSS_ON_ROOT); fh_put(&argp->fh); @@ -739,7 +708,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_argzero = sizeof(struct nfsd_readargs), .pc_ressize = sizeof(struct nfsd_readres), .pc_cachetype = RC_NOCACHE, - .pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4, + .pc_xdrressize = ST+AT+1+NFS_MAXDATA/4, .pc_name = "READ", }, [NFSPROC_WRITECACHE] = { diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 9b3d6cff0e1e..82b0111ac469 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -396,13 +396,13 @@ static int nfsd_startup_net(struct net *net, const struct cred *cred) if (ret) goto out_filecache; +#ifdef CONFIG_NFSD_V4_2_INTER_SSC + nfsd4_ssc_init_umount_work(nn); +#endif ret = nfs4_state_start_net(net); if (ret) goto out_reply_cache; -#ifdef CONFIG_NFSD_V4_2_INTER_SSC - nfsd4_ssc_init_umount_work(nn); -#endif nn->nfsd_net_up = true; return 0; @@ -582,7 +582,7 @@ static int nfsd_get_default_max_blksize(void) */ target >>= 12; - ret = NFSSVC_MAXBLKSIZE; + ret = NFSSVC_DEFBLKSIZE; while (ret > target && ret >= 8*1024*2) ret /= 2; return ret; diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 5777f40c7353..fc262ceafca9 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -336,7 +336,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) /* opaque data */ if (xdr_stream_decode_u32(xdr, &args->len) < 0) return false; - if (args->len > NFSSVC_MAXBLKSIZE_V2) + if (args->len > NFS_MAXDATA) return false; return xdr_stream_subsegment(xdr, &args->payload, args->len); @@ -540,7 +540,7 @@ nfssvc_encode_statfsres(struct svc_rqst *rqstp, struct xdr_stream *xdr) p = xdr_reserve_space(xdr, XDR_UNIT * 5); if (!p) return false; - *p++ = cpu_to_be32(NFSSVC_MAXBLKSIZE_V2); + *p++ = cpu_to_be32(NFS_MAXDATA); *p++ = cpu_to_be32(stat->f_bsize); *p++ = cpu_to_be32(stat->f_blocks); *p++ = cpu_to_be32(stat->f_bfree); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 74d2d7b42676..1995bca158b8 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -64,15 +64,36 @@ typedef struct { refcount_t cs_count; } copy_stateid_t; +struct nfsd4_referring_call { + struct list_head __list; + + u32 rc_sequenceid; + u32 rc_slotid; +}; + +struct nfsd4_referring_call_list { + struct list_head __list; + + struct nfs4_sessionid rcl_sessionid; + int __nr_referring_calls; + struct list_head rcl_referring_calls; +}; + struct nfsd4_callback { struct nfs4_client *cb_clp; struct rpc_message cb_msg; +#define NFSD4_CALLBACK_RUNNING (0) +#define NFSD4_CALLBACK_WAKE (1) +#define NFSD4_CALLBACK_REQUEUE (2) + unsigned long cb_flags; const struct nfsd4_callback_ops *cb_ops; struct work_struct cb_work; int cb_seq_status; int cb_status; int cb_held_slot; - bool cb_need_restart; + + int cb_nr_referring_call_list; + struct list_head cb_referring_call_list; }; struct nfsd4_callback_ops { @@ -162,15 +183,11 @@ struct nfs4_cb_fattr { struct timespec64 ncf_cb_mtime; struct timespec64 ncf_cb_atime; - unsigned long ncf_cb_flags; bool ncf_file_modified; u64 ncf_initial_cinfo; u64 ncf_cur_fsize; }; -/* bits 
for ncf_cb_flags */ -#define CB_GETATTR_BUSY 0 - /* * Represents a delegation stateid. The nfs4_client holds references to these * and they are put when it is being destroyed or when the delegation is @@ -198,8 +215,8 @@ struct nfs4_delegation { struct list_head dl_perclnt; struct list_head dl_recall_lru; /* delegation recalled */ struct nfs4_clnt_odstate *dl_clnt_odstate; - u32 dl_type; time64_t dl_time; + u32 dl_type; /* For recall: */ int dl_retries; struct nfsd4_callback dl_recall; @@ -261,6 +278,7 @@ struct nfsd4_slot { u32 sl_seqid; __be32 sl_status; struct svc_cred sl_cred; + u32 sl_index; u32 sl_datalen; u16 sl_opcnt; u16 sl_generation; @@ -452,7 +470,6 @@ struct nfs4_client { #define NFSD4_CLIENT_UPCALL_LOCK (5) /* upcall serialization */ #define NFSD4_CLIENT_CB_FLAG_MASK (1 << NFSD4_CLIENT_CB_UPDATE | \ 1 << NFSD4_CLIENT_CB_KILL) -#define NFSD4_CLIENT_CB_RECALL_ANY (6) unsigned long cl_flags; struct workqueue_struct *cl_callback_wq; @@ -498,7 +515,6 @@ struct nfs4_client { struct nfsd4_cb_recall_any *cl_ra; time64_t cl_ra_time; - struct list_head cl_ra_cblist; }; /* struct nfs4_client_reset @@ -777,9 +793,20 @@ extern __be32 nfs4_check_open_reclaim(struct nfs4_client *); extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); +extern void nfsd41_cb_referring_call(struct nfsd4_callback *cb, + struct nfs4_sessionid *sessionid, + u32 slotid, u32 seqno); +extern void nfsd41_cb_destroy_referring_call_list(struct nfsd4_callback *cb); extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op); extern bool nfsd4_run_cb(struct nfsd4_callback *cb); + +static inline void nfsd4_try_run_cb(struct nfsd4_callback *cb) +{ + if (!test_and_set_bit(NFSD4_CALLBACK_RUNNING, &cb->cb_flags)) + WARN_ON_ONCE(!nfsd4_run_cb(cb)); +} + extern void nfsd4_shutdown_callback(struct nfs4_client *); extern void nfsd4_shutdown_copy(struct nfs4_client *clp); void nfsd4_async_copy_reaper(struct nfsd_net *nn); diff --git a/fs/nfsd/stats.c b/fs/nfsd/stats.c index bb22893f1157..f7eaf95e20fc 100644 --- a/fs/nfsd/stats.c +++ b/fs/nfsd/stats.c @@ -73,11 +73,11 @@ static int nfsd_show(struct seq_file *seq, void *v) DEFINE_PROC_SHOW_ATTRIBUTE(nfsd); -void nfsd_proc_stat_init(struct net *net) +struct proc_dir_entry *nfsd_proc_stat_init(struct net *net) { struct nfsd_net *nn = net_generic(net, nfsd_net_id); - svc_proc_register(net, &nn->nfsd_svcstats, &nfsd_proc_ops); + return svc_proc_register(net, &nn->nfsd_svcstats, &nfsd_proc_ops); } void nfsd_proc_stat_shutdown(struct net *net) diff --git a/fs/nfsd/stats.h b/fs/nfsd/stats.h index 04aacb6c36e2..e4efb0e4e56d 100644 --- a/fs/nfsd/stats.h +++ b/fs/nfsd/stats.h @@ -10,7 +10,7 @@ #include <uapi/linux/nfsd/stats.h> #include <linux/percpu_counter.h> -void nfsd_proc_stat_init(struct net *net); +struct proc_dir_entry *nfsd_proc_stat_init(struct net *net); void nfsd_proc_stat_shutdown(struct net *net); static inline void nfsd_stats_rc_hits_inc(struct nfsd_net *nn) diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index ad2c0c432d08..3c5505ef5e3a 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -11,6 +11,7 @@ #include <linux/tracepoint.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/xprt.h> +#include <trace/misc/fs.h> #include <trace/misc/nfs.h> #include <trace/misc/sunrpc.h> @@ -18,22 +19,40 @@ #include "nfsfh.h" #include "xdr4.h" -#define 
NFSD_TRACE_PROC_RES_FIELDS \ +#define NFSD_TRACE_PROC_CALL_FIELDS(r) \ + __field(unsigned int, netns_ino) \ + __field(u32, xid) \ + __sockaddr(server, (r)->rq_xprt->xpt_locallen) \ + __sockaddr(client, (r)->rq_xprt->xpt_remotelen) + +#define NFSD_TRACE_PROC_CALL_ASSIGNMENTS(r) \ + do { \ + struct svc_xprt *xprt = (r)->rq_xprt; \ + __entry->netns_ino = SVC_NET(r)->ns.inum; \ + __entry->xid = be32_to_cpu((r)->rq_xid); \ + __assign_sockaddr(server, &xprt->xpt_local, \ + xprt->xpt_locallen); \ + __assign_sockaddr(client, &xprt->xpt_remote, \ + xprt->xpt_remotelen); \ + } while (0) + +#define NFSD_TRACE_PROC_RES_FIELDS(r) \ __field(unsigned int, netns_ino) \ __field(u32, xid) \ __field(unsigned long, status) \ - __array(unsigned char, server, sizeof(struct sockaddr_in6)) \ - __array(unsigned char, client, sizeof(struct sockaddr_in6)) + __sockaddr(server, (r)->rq_xprt->xpt_locallen) \ + __sockaddr(client, (r)->rq_xprt->xpt_remotelen) -#define NFSD_TRACE_PROC_RES_ASSIGNMENTS(error) \ +#define NFSD_TRACE_PROC_RES_ASSIGNMENTS(r, error) \ do { \ - __entry->netns_ino = SVC_NET(rqstp)->ns.inum; \ - __entry->xid = be32_to_cpu(rqstp->rq_xid); \ + struct svc_xprt *xprt = (r)->rq_xprt; \ + __entry->netns_ino = SVC_NET(r)->ns.inum; \ + __entry->xid = be32_to_cpu((r)->rq_xid); \ __entry->status = be32_to_cpu(error); \ - memcpy(__entry->server, &rqstp->rq_xprt->xpt_local, \ - rqstp->rq_xprt->xpt_locallen); \ - memcpy(__entry->client, &rqstp->rq_xprt->xpt_remote, \ - rqstp->rq_xprt->xpt_remotelen); \ + __assign_sockaddr(server, &xprt->xpt_local, \ + xprt->xpt_locallen); \ + __assign_sockaddr(client, &xprt->xpt_remote, \ + xprt->xpt_remotelen); \ } while (0); DECLARE_EVENT_CLASS(nfsd_xdr_err_class, @@ -145,14 +164,14 @@ TRACE_EVENT(nfsd_compound_decode_err, ), TP_ARGS(rqstp, args_opcnt, resp_opcnt, opnum, status), TP_STRUCT__entry( - NFSD_TRACE_PROC_RES_FIELDS + NFSD_TRACE_PROC_RES_FIELDS(rqstp) __field(u32, args_opcnt) __field(u32, resp_opcnt) __field(u32, opnum) ), TP_fast_assign( - NFSD_TRACE_PROC_RES_ASSIGNMENTS(status) + NFSD_TRACE_PROC_RES_ASSIGNMENTS(rqstp, status) __entry->args_opcnt = args_opcnt; __entry->resp_opcnt = resp_opcnt; @@ -171,12 +190,12 @@ DECLARE_EVENT_CLASS(nfsd_compound_err_class, ), TP_ARGS(rqstp, opnum, status), TP_STRUCT__entry( - NFSD_TRACE_PROC_RES_FIELDS + NFSD_TRACE_PROC_RES_FIELDS(rqstp) __field(u32, opnum) ), TP_fast_assign( - NFSD_TRACE_PROC_RES_ASSIGNMENTS(status) + NFSD_TRACE_PROC_RES_ASSIGNMENTS(rqstp, status) __entry->opnum = opnum; ), @@ -451,6 +470,8 @@ DEFINE_NFSD_IO_EVENT(write_start); DEFINE_NFSD_IO_EVENT(write_opened); DEFINE_NFSD_IO_EVENT(write_io_done); DEFINE_NFSD_IO_EVENT(write_done); +DEFINE_NFSD_IO_EVENT(commit_start); +DEFINE_NFSD_IO_EVENT(commit_done); DECLARE_EVENT_CLASS(nfsd_err_class, TP_PROTO(struct svc_rqst *rqstp, @@ -803,6 +824,14 @@ DEFINE_EVENT(nfsd_cs_slot_class, nfsd_##name, \ DEFINE_CS_SLOT_EVENT(slot_seqid_conf); DEFINE_CS_SLOT_EVENT(slot_seqid_unconf); +#define show_nfs_slot_flags(val) \ + __print_flags(val, "|", \ + { NFSD4_SLOT_INUSE, "INUSE" }, \ + { NFSD4_SLOT_CACHETHIS, "CACHETHIS" }, \ + { NFSD4_SLOT_INITIALIZED, "INITIALIZED" }, \ + { NFSD4_SLOT_CACHED, "CACHED" }, \ + { NFSD4_SLOT_REUSED, "REUSED" }) + TRACE_EVENT(nfsd_slot_seqid_sequence, TP_PROTO( const struct nfs4_client *clp, @@ -813,10 +842,11 @@ TRACE_EVENT(nfsd_slot_seqid_sequence, TP_STRUCT__entry( __field(u32, seqid) __field(u32, slot_seqid) + __field(u32, slot_index) + __field(unsigned long, slot_flags) __field(u32, cl_boot) __field(u32, cl_id) __sockaddr(addr, 
clp->cl_cb_conn.cb_addrlen) - __field(bool, in_use) ), TP_fast_assign( __entry->cl_boot = clp->cl_clientid.cl_boot; @@ -825,11 +855,13 @@ TRACE_EVENT(nfsd_slot_seqid_sequence, clp->cl_cb_conn.cb_addrlen); __entry->seqid = seq->seqid; __entry->slot_seqid = slot->sl_seqid; + __entry->slot_index = seq->slotid; + __entry->slot_flags = slot->sl_flags; ), - TP_printk("addr=%pISpc client %08x:%08x seqid=%u slot_seqid=%u (%sin use)", + TP_printk("addr=%pISpc client %08x:%08x idx=%u seqid=%u slot_seqid=%u flags=%s", __get_sockaddr(addr), __entry->cl_boot, __entry->cl_id, - __entry->seqid, __entry->slot_seqid, - __entry->in_use ? "" : "not " + __entry->slot_index, __entry->seqid, __entry->slot_seqid, + show_nfs_slot_flags(__entry->slot_flags) ) ); @@ -1039,6 +1071,7 @@ DEFINE_CLID_EVENT(confirmed_r); { 1 << NFSD_FILE_HASHED, "HASHED" }, \ { 1 << NFSD_FILE_PENDING, "PENDING" }, \ { 1 << NFSD_FILE_REFERENCED, "REFERENCED" }, \ + { 1 << NFSD_FILE_RECENT, "RECENT" }, \ { 1 << NFSD_FILE_GC, "GC" }) DECLARE_EVENT_CLASS(nfsd_file_class, @@ -1317,6 +1350,7 @@ DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_lru_del_disposed); DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_in_use); DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_writeback); DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_referenced); +DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_aged); DEFINE_NFSD_FILE_GC_EVENT(nfsd_file_gc_disposed); DECLARE_EVENT_CLASS(nfsd_file_lruwalk_class, @@ -1346,6 +1380,7 @@ DEFINE_EVENT(nfsd_file_lruwalk_class, name, \ TP_ARGS(removed, remaining)) DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_gc_removed); +DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_gc_recent); DEFINE_NFSD_FILE_LRUWALK_EVENT(nfsd_file_shrinker_removed); TRACE_EVENT(nfsd_file_close, @@ -1602,7 +1637,7 @@ DECLARE_EVENT_CLASS(nfsd_cb_lifetime_class, __entry->cl_id = clp->cl_clientid.cl_id; __entry->cb = cb; __entry->opcode = cb->cb_ops ? cb->cb_ops->opcode : _CB_NULL; - __entry->need_restart = cb->cb_need_restart; + __entry->need_restart = test_bit(NFSD4_CALLBACK_REQUEUE, &cb->cb_flags); __assign_sockaddr(addr, &clp->cl_cb_conn.cb_addr, clp->cl_cb_conn.cb_addrlen) ), @@ -2321,6 +2356,259 @@ DEFINE_EVENT(nfsd_copy_async_done_class, \ DEFINE_COPY_ASYNC_DONE_EVENT(done); DEFINE_COPY_ASYNC_DONE_EVENT(cancel); +TRACE_EVENT(nfsd_vfs_setattr, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + const struct iattr *iap, + const struct timespec64 *guardtime + ), + TP_ARGS(rqstp, fhp, iap, guardtime), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + __field(s64, gtime_tv_sec) + __field(u32, gtime_tv_nsec) + __field(unsigned int, ia_valid) + __field(loff_t, ia_size) + __field(uid_t, ia_uid) + __field(gid_t, ia_gid) + __field(umode_t, ia_mode) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __entry->gtime_tv_sec = guardtime ? guardtime->tv_sec : 0; + __entry->gtime_tv_nsec = guardtime ? 
guardtime->tv_nsec : 0; + __entry->ia_valid = iap->ia_valid; + __entry->ia_size = iap->ia_size; + __entry->ia_uid = __kuid_val(iap->ia_uid); + __entry->ia_gid = __kgid_val(iap->ia_gid); + __entry->ia_mode = iap->ia_mode; + ), + TP_printk( + "xid=0x%08x fh_hash=0x%08x ia_valid=%s ia_size=%llu ia_mode=0%o ia_uid=%u ia_gid=%u guard_time=%lld.%u", + __entry->xid, __entry->fh_hash, show_ia_valid_flags(__entry->ia_valid), + __entry->ia_size, __entry->ia_mode, __entry->ia_uid, __entry->ia_gid, + __entry->gtime_tv_sec, __entry->gtime_tv_nsec + ) +) + +TRACE_EVENT(nfsd_vfs_lookup, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + const char *name, + unsigned int len + ), + TP_ARGS(rqstp, fhp, name, len), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + __string_len(name, name, len) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __assign_str(name); + ), + TP_printk("xid=0x%08x fh_hash=0x%08x name=%s", + __entry->xid, __entry->fh_hash, __get_str(name) + ) +); + +TRACE_EVENT(nfsd_vfs_create, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + umode_t type, + const char *name, + unsigned int len + ), + TP_ARGS(rqstp, fhp, type, name, len), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + __field(umode_t, type) + __string_len(name, name, len) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __entry->type = type; + __assign_str(name); + ), + TP_printk("xid=0x%08x fh_hash=0x%08x type=%s name=%s", + __entry->xid, __entry->fh_hash, + show_fs_file_type(__entry->type), __get_str(name) + ) +); + +TRACE_EVENT(nfsd_vfs_symlink, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + const char *name, + unsigned int namelen, + const char *target + ), + TP_ARGS(rqstp, fhp, name, namelen, target), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + __string_len(name, name, namelen) + __string(target, target) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __assign_str(name); + __assign_str(target); + ), + TP_printk("xid=0x%08x fh_hash=0x%08x name=%s target=%s", + __entry->xid, __entry->fh_hash, + __get_str(name), __get_str(target) + ) +); + +TRACE_EVENT(nfsd_vfs_link, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *sfhp, + const struct svc_fh *tfhp, + const char *name, + unsigned int namelen + ), + TP_ARGS(rqstp, sfhp, tfhp, name, namelen), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, sfh_hash) + __field(u32, tfh_hash) + __string_len(name, name, namelen) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->sfh_hash = knfsd_fh_hash(&sfhp->fh_handle); + __entry->tfh_hash = knfsd_fh_hash(&tfhp->fh_handle); + __assign_str(name); + ), + TP_printk("xid=0x%08x src_fh=0x%08x tgt_fh=0x%08x name=%s", + __entry->xid, __entry->sfh_hash, __entry->tfh_hash, + __get_str(name) + ) +); + +TRACE_EVENT(nfsd_vfs_unlink, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + const char *name, + unsigned int len + ), + TP_ARGS(rqstp, fhp, name, len), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + __string_len(name, name, len) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = 
knfsd_fh_hash(&fhp->fh_handle); + __assign_str(name); + ), + TP_printk("xid=0x%08x fh_hash=0x%08x name=%s", + __entry->xid, __entry->fh_hash, + __get_str(name) + ) +); + +TRACE_EVENT(nfsd_vfs_rename, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *sfhp, + const struct svc_fh *tfhp, + const char *source, + unsigned int sourcelen, + const char *target, + unsigned int targetlen + ), + TP_ARGS(rqstp, sfhp, tfhp, source, sourcelen, target, targetlen), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, sfh_hash) + __field(u32, tfh_hash) + __string_len(source, source, sourcelen) + __string_len(target, target, targetlen) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->sfh_hash = knfsd_fh_hash(&sfhp->fh_handle); + __entry->tfh_hash = knfsd_fh_hash(&tfhp->fh_handle); + __assign_str(source); + __assign_str(target); + ), + TP_printk("xid=0x%08x sfh_hash=0x%08x tfh_hash=0x%08x source=%s target=%s", + __entry->xid, __entry->sfh_hash, __entry->tfh_hash, + __get_str(source), __get_str(target) + ) +); + +TRACE_EVENT(nfsd_vfs_readdir, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + u32 count, + u64 offset + ), + TP_ARGS(rqstp, fhp, count, offset), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + __field(u32, count) + __field(u64, offset) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __entry->count = count; + __entry->offset = offset; + ), + TP_printk("xid=0x%08x fh_hash=0x%08x offset=%llu count=%u", + __entry->xid, __entry->fh_hash, + __entry->offset, __entry->count + ) +); + +DECLARE_EVENT_CLASS(nfsd_vfs_getattr_class, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp + ), + TP_ARGS(rqstp, fhp), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + ), + TP_printk("xid=0x%08x fh_hash=0x%08x", + __entry->xid, __entry->fh_hash + ) +); + +#define DEFINE_NFSD_VFS_GETATTR_EVENT(__name) \ +DEFINE_EVENT(nfsd_vfs_getattr_class, __name, \ + TP_PROTO( \ + const struct svc_rqst *rqstp, \ + const struct svc_fh *fhp \ + ), \ + TP_ARGS(rqstp, fhp)) + +DEFINE_NFSD_VFS_GETATTR_EVENT(nfsd_vfs_getattr); +DEFINE_NFSD_VFS_GETATTR_EVENT(nfsd_vfs_statfs); + #endif /* _NFSD_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 29cb7b812d71..cd689df2ca5d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -31,6 +31,7 @@ #include <linux/exportfs.h> #include <linux/writeback.h> #include <linux/security.h> +#include <linux/sunrpc/xdr.h> #include "xdr3.h" @@ -47,6 +48,8 @@ #define NFSDDBG_FACILITY NFSDDBG_FILEOP +bool nfsd_disable_splice_read __read_mostly; + /** * nfserrno - Map Linux errnos to NFS errnos * @errno: POSIX(-ish) error code to be mapped @@ -71,7 +74,6 @@ nfserrno (int errno) { nfserr_acces, -EACCES }, { nfserr_exist, -EEXIST }, { nfserr_xdev, -EXDEV }, - { nfserr_mlink, -EMLINK }, { nfserr_nodev, -ENODEV }, { nfserr_notdir, -ENOTDIR }, { nfserr_isdir, -EISDIR }, @@ -245,7 +247,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, struct dentry *dentry; int host_err; - dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); + trace_nfsd_vfs_lookup(rqstp, fhp, name, len); dparent = fhp->fh_dentry; exp = exp_get(fhp->fh_export); @@ -265,7 +267,8 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct 
svc_fh *fhp, goto out_nfserr; } } else { - dentry = lookup_one_len_unlocked(name, dparent, len); + dentry = lookup_one_unlocked(&nop_mnt_idmap, + &QSTR_LEN(name, len), dparent); host_err = PTR_ERR(dentry); if (IS_ERR(dentry)) goto out_nfserr; @@ -500,6 +503,8 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, bool size_change = (iap->ia_valid & ATTR_SIZE); int retries; + trace_nfsd_vfs_setattr(rqstp, fhp, iap, guardtime); + if (iap->ia_valid & ATTR_SIZE) { accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; ftype = S_IFREG; @@ -923,7 +928,7 @@ nfsd_open(struct svc_rqst *rqstp, struct svc_fh *fhp, umode_t type, * directories, but we never have and it doesn't seem to have * caused anyone a problem. If we were to change this, note * also that our filldir callbacks would need a variant of - * lookup_one_len that doesn't check permissions. + * lookup_one_positive_unlocked() that doesn't check permissions. */ if (type == S_IFREG) may_flags |= NFSD_MAY_OWNER_OVERRIDE; @@ -1082,23 +1087,23 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned long v, total; struct iov_iter iter; loff_t ppos = offset; - struct page *page; ssize_t host_err; + size_t len; v = 0; total = *count; while (total) { - page = *(rqstp->rq_next_page++); - rqstp->rq_vec[v].iov_base = page_address(page) + base; - rqstp->rq_vec[v].iov_len = min_t(size_t, total, PAGE_SIZE - base); - total -= rqstp->rq_vec[v].iov_len; + len = min_t(size_t, total, PAGE_SIZE - base); + bvec_set_page(&rqstp->rq_bvec[v], *(rqstp->rq_next_page++), + len, base); + total -= len; ++v; base = 0; } - WARN_ON_ONCE(v > ARRAY_SIZE(rqstp->rq_vec)); + WARN_ON_ONCE(v > rqstp->rq_maxpages); trace_nfsd_read_vector(rqstp, fhp, offset, *count); - iov_iter_kvec(&iter, ITER_DEST, rqstp->rq_vec, v, *count); + iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, *count); host_err = vfs_iter_read(file, &iter, &ppos, 0); return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } @@ -1140,11 +1145,27 @@ static int wait_for_concurrent_writes(struct file *file) return err; } +/** + * nfsd_vfs_write - write data to an already-open file + * @rqstp: RPC execution context + * @fhp: File handle of file to write into + * @nf: An open file matching @fhp + * @offset: Byte offset of start + * @payload: xdr_buf containing the write payload + * @cnt: IN: number of bytes to write, OUT: number of bytes actually written + * @stable: An NFS stable_how value + * @verf: NFS WRITE verifier + * + * Upon return, caller must invoke fh_put on @fhp. + * + * Return values: + * An nfsstat value in network byte order. 
+ */ __be32 -nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, - loff_t offset, struct kvec *vec, int vlen, - unsigned long *cnt, int stable, - __be32 *verf) +nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfsd_file *nf, loff_t offset, + const struct xdr_buf *payload, unsigned long *cnt, + int stable, __be32 *verf) { struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct file *file = nf->nf_file; @@ -1159,6 +1180,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, unsigned int pflags = current->flags; rwf_t flags = 0; bool restore_flags = false; + unsigned int nvecs; trace_nfsd_write_opened(rqstp, fhp, offset, *cnt); @@ -1186,7 +1208,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, if (stable && !fhp->fh_use_wgather) flags |= RWF_SYNC; - iov_iter_kvec(&iter, ITER_SOURCE, vec, vlen, *cnt); + nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload); + iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt); since = READ_ONCE(file->f_wb_err); if (verf) nfsd_copy_write_verifier(verf, nn); @@ -1237,6 +1260,8 @@ out_nfserr: */ bool nfsd_read_splice_ok(struct svc_rqst *rqstp) { + if (nfsd_disable_splice_read) + return false; switch (svc_auth_flavor(rqstp)) { case RPC_AUTH_GSS_KRB5I: case RPC_AUTH_GSS_KRB5P: @@ -1284,14 +1309,24 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, return err; } -/* - * Write data to a file. - * The stable flag requests synchronous writes. - * N.B. After this call fhp needs an fh_put +/** + * nfsd_write - open a file and write data to it + * @rqstp: RPC execution context + * @fhp: File handle of file to write into; nfsd_write() may modify it + * @offset: Byte offset of start + * @payload: xdr_buf containing the write payload + * @cnt: IN: number of bytes to write, OUT: number of bytes actually written + * @stable: An NFS stable_how value + * @verf: NFS WRITE verifier + * + * Upon return, caller must invoke fh_put on @fhp. + * + * Return values: + * An nfsstat value in network byte order. */ __be32 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, - struct kvec *vec, int vlen, unsigned long *cnt, int stable, + const struct xdr_buf *payload, unsigned long *cnt, int stable, __be32 *verf) { struct nfsd_file *nf; @@ -1303,8 +1338,8 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, if (err) goto out; - err = nfsd_vfs_write(rqstp, fhp, nf, offset, vec, - vlen, cnt, stable, verf); + err = nfsd_vfs_write(rqstp, fhp, nf, offset, payload, cnt, + stable, verf); nfsd_file_put(nf); out: trace_nfsd_write_done(rqstp, fhp, offset, *cnt); @@ -1340,6 +1375,8 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, loff_t start, end; struct nfsd_net *nn; + trace_nfsd_commit_start(rqstp, fhp, offset, count); + /* * Convert the client-provided (offset, count) range to a * (start, end) range. 
If the client-provided range falls @@ -1378,6 +1415,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, } else nfsd_copy_write_verifier(verf, nn); + trace_nfsd_commit_done(rqstp, fhp, offset, count); return err; } @@ -1461,7 +1499,7 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, struct inode *dirp; struct iattr *iap = attrs->na_iattr; __be32 err; - int host_err; + int host_err = 0; dentry = fhp->fh_dentry; dirp = d_inode(dentry); @@ -1488,28 +1526,15 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, nfsd_check_ignore_resizing(iap); break; case S_IFDIR: - host_err = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode); - if (!host_err && unlikely(d_unhashed(dchild))) { - struct dentry *d; - d = lookup_one_len(dchild->d_name.name, - dchild->d_parent, - dchild->d_name.len); - if (IS_ERR(d)) { - host_err = PTR_ERR(d); - break; - } - if (unlikely(d_is_negative(d))) { - dput(d); - err = nfserr_serverfault; - goto out; - } + dchild = vfs_mkdir(&nop_mnt_idmap, dirp, dchild, iap->ia_mode); + if (IS_ERR(dchild)) { + host_err = PTR_ERR(dchild); + } else if (d_is_negative(dchild)) { + err = nfserr_serverfault; + goto out; + } else if (unlikely(dchild != resfhp->fh_dentry)) { dput(resfhp->fh_dentry); - resfhp->fh_dentry = dget(d); - err = fh_update(resfhp); - dput(dchild); - dchild = d; - if (err) - goto out; + resfhp->fh_dentry = dget(dchild); } break; case S_IFCHR: @@ -1530,7 +1555,8 @@ nfsd_create_locked(struct svc_rqst *rqstp, struct svc_fh *fhp, err = nfsd_create_setattr(rqstp, fhp, resfhp, attrs); out: - dput(dchild); + if (!IS_ERR(dchild)) + dput(dchild); return err; out_nfserr: @@ -1553,6 +1579,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 err; int host_err; + trace_nfsd_vfs_create(rqstp, fhp, type, fname, flen); + if (isdotent(fname, flen)) return nfserr_exist; @@ -1567,7 +1595,7 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, return nfserrno(host_err); inode_lock_nested(dentry->d_inode, I_MUTEX_PARENT); - dchild = lookup_one_len(fname, dentry, flen); + dchild = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), dentry); host_err = PTR_ERR(dchild); if (IS_ERR(dchild)) { err = nfserrno(host_err); @@ -1653,6 +1681,8 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 err, cerr; int host_err; + trace_nfsd_vfs_symlink(rqstp, fhp, fname, flen, path); + err = nfserr_noent; if (!flen || path[0] == '\0') goto out; @@ -1672,7 +1702,7 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, dentry = fhp->fh_dentry; inode_lock_nested(dentry->d_inode, I_MUTEX_PARENT); - dnew = lookup_one_len(fname, dentry, flen); + dnew = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), dentry); if (IS_ERR(dnew)) { err = nfserrno(PTR_ERR(dnew)); inode_unlock(dentry->d_inode); @@ -1699,9 +1729,17 @@ out: return err; } -/* - * Create a hardlink - * N.B. After this call _both_ ffhp and tfhp need an fh_put +/** + * nfsd_link - create a link + * @rqstp: RPC transaction context + * @ffhp: the file handle of the directory where the new link is to be created + * @name: the filename of the new link + * @len: the length of @name in octets + * @tfhp: the file handle of an existing file object + * + * After this call _both_ ffhp and tfhp need an fh_put. + * + * Returns a generic NFS status code in network byte-order. 
*/ __be32 nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, @@ -1709,9 +1747,12 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, { struct dentry *ddir, *dnew, *dold; struct inode *dirp; + int type; __be32 err; int host_err; + trace_nfsd_vfs_link(rqstp, ffhp, tfhp, name, len); + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; @@ -1728,19 +1769,19 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, if (isdotent(name, len)) goto out; + err = nfs_ok; + type = d_inode(tfhp->fh_dentry)->i_mode & S_IFMT; host_err = fh_want_write(tfhp); - if (host_err) { - err = nfserrno(host_err); + if (host_err) goto out; - } ddir = ffhp->fh_dentry; dirp = d_inode(ddir); inode_lock_nested(dirp, I_MUTEX_PARENT); - dnew = lookup_one_len(name, ddir, len); + dnew = lookup_one(&nop_mnt_idmap, &QSTR_LEN(name, len), ddir); if (IS_ERR(dnew)) { - err = nfserrno(PTR_ERR(dnew)); + host_err = PTR_ERR(dnew); goto out_unlock; } @@ -1756,17 +1797,26 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, fh_fill_post_attrs(ffhp); inode_unlock(dirp); if (!host_err) { - err = nfserrno(commit_metadata(ffhp)); - if (!err) - err = nfserrno(commit_metadata(tfhp)); - } else { - err = nfserrno(host_err); + host_err = commit_metadata(ffhp); + if (!host_err) + host_err = commit_metadata(tfhp); } + dput(dnew); out_drop_write: fh_drop_write(tfhp); + if (host_err == -EBUSY) { + /* + * See RFC 8881 Section 18.9.4 para 1-2: NFSv4 LINK + * wants a status unique to the object type. + */ + if (type != S_IFDIR) + err = nfserr_file_open; + else + err = nfserr_acces; + } out: - return err; + return err != nfs_ok ? err : nfserrno(host_err); out_dput: dput(dnew); @@ -1795,9 +1845,19 @@ nfsd_has_cached_files(struct dentry *dentry) return ret; } -/* - * Rename a file - * N.B. After this call _both_ ffhp and tfhp need an fh_put +/** + * nfsd_rename - rename a directory entry + * @rqstp: RPC transaction context + * @ffhp: the file handle of parent directory containing the entry to be renamed + * @fname: the filename of directory entry to be renamed + * @flen: the length of @fname in octets + * @tfhp: the file handle of parent directory to contain the renamed entry + * @tname: the filename of the new entry + * @tlen: the length of @tname in octets + * + * After this call _both_ ffhp and tfhp need an fh_put. + * + * Returns a generic NFS status code in network byte-order. 
*/ __be32 nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, @@ -1805,10 +1865,13 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, { struct dentry *fdentry, *tdentry, *odentry, *ndentry, *trap; struct inode *fdir, *tdir; + int type = S_IFDIR; __be32 err; int host_err; bool close_cached = false; + trace_nfsd_vfs_rename(rqstp, ffhp, tfhp, fname, flen, tname, tlen); + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); if (err) goto out; @@ -1851,7 +1914,7 @@ retry: if (err != nfs_ok) goto out_unlock; - odentry = lookup_one_len(fname, fdentry, flen); + odentry = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), fdentry); host_err = PTR_ERR(odentry); if (IS_ERR(odentry)) goto out_nfserr; @@ -1862,11 +1925,14 @@ retry: host_err = -EINVAL; if (odentry == trap) goto out_dput_old; + type = d_inode(odentry)->i_mode & S_IFMT; - ndentry = lookup_one_len(tname, tdentry, tlen); + ndentry = lookup_one(&nop_mnt_idmap, &QSTR_LEN(tname, tlen), tdentry); host_err = PTR_ERR(ndentry); if (IS_ERR(ndentry)) goto out_dput_old; + if (d_inode(ndentry)) + type = d_inode(ndentry)->i_mode & S_IFMT; host_err = -ENOTEMPTY; if (ndentry == trap) goto out_dput_new; @@ -1904,7 +1970,18 @@ retry: out_dput_old: dput(odentry); out_nfserr: - err = nfserrno(host_err); + if (host_err == -EBUSY) { + /* + * See RFC 8881 Section 18.26.4 para 1-3: NFSv4 RENAME + * wants a status unique to the object type. + */ + if (type != S_IFDIR) + err = nfserr_file_open; + else + err = nfserr_acces; + } else { + err = nfserrno(host_err); + } if (!close_cached) { fh_fill_post_attrs(ffhp); @@ -1931,9 +2008,17 @@ out: return err; } -/* - * Unlink a file or directory - * N.B. After this call fhp needs an fh_put +/** + * nfsd_unlink - remove a directory entry + * @rqstp: RPC transaction context + * @fhp: the file handle of the parent directory to be modified + * @type: enforced file type of the object to be removed + * @fname: the name of directory entry to be removed + * @flen: length of @fname in octets + * + * After this call fhp needs an fh_put. + * + * Returns a generic NFS status code in network byte-order. */ __be32 nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, @@ -1945,6 +2030,8 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, __be32 err; int host_err; + trace_nfsd_vfs_unlink(rqstp, fhp, fname, flen); + err = nfserr_acces; if (!flen || isdotent(fname, flen)) goto out; @@ -1960,7 +2047,7 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, dirp = d_inode(dentry); inode_lock_nested(dirp, I_MUTEX_PARENT); - rdentry = lookup_one_len(fname, dentry, flen); + rdentry = lookup_one(&nop_mnt_idmap, &QSTR_LEN(fname, flen), dentry); host_err = PTR_ERR(rdentry); if (IS_ERR(rdentry)) goto out_unlock; @@ -2007,15 +2094,17 @@ out_drop_write: fh_drop_write(fhp); out_nfserr: if (host_err == -EBUSY) { - /* name is mounted-on. There is no perfect - * error status. + /* + * See RFC 8881 Section 18.25.4 para 4: NFSv4 REMOVE + * wants a status unique to the object type. */ - err = nfserr_file_open; - } else { - err = nfserrno(host_err); + if (type != S_IFDIR) + err = nfserr_file_open; + else + err = nfserr_acces; } out: - return err; + return err != nfs_ok ? 
err : nfserrno(host_err); out_unlock: inode_unlock(dirp); goto out_drop_write; @@ -2231,6 +2320,8 @@ nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, in { __be32 err; + trace_nfsd_vfs_statfs(rqstp, fhp); + err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access); if (!err) { struct path path = { diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index f9b09b842856..eff04959606f 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -128,13 +128,13 @@ bool nfsd_read_splice_ok(struct svc_rqst *rqstp); __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, unsigned long *count, u32 *eof); -__be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t, - struct kvec *, int, unsigned long *, - int stable, __be32 *verf); +__be32 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, + loff_t offset, const struct xdr_buf *payload, + unsigned long *cnt, int stable, __be32 *verf); __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, loff_t offset, - struct kvec *vec, int vlen, unsigned long *cnt, - int stable, __be32 *verf); + const struct xdr_buf *payload, + unsigned long *cnt, int stable, __be32 *verf); __be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, char *, int *); __be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index c26ba86dbdfd..aa2a356da784 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -676,6 +676,10 @@ struct nfsd4_cb_offload { __be32 co_nfserr; unsigned int co_retries; struct knfsd_fh co_fh; + + struct nfs4_sessionid co_referring_sessionid; + u32 co_referring_slotid; + u32 co_referring_seqno; }; struct nfsd4_copy { diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h index f1a315cd31b7..f4e29c0c701c 100644 --- a/fs/nfsd/xdr4cb.h +++ b/fs/nfsd/xdr4cb.h @@ -6,8 +6,11 @@ #define cb_compound_enc_hdr_sz 4 #define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2)) #define sessionid_sz (NFS4_MAX_SESSIONID_LEN >> 2) +#define enc_referring_call4_sz (1 + 1) +#define enc_referring_call_list4_sz (sessionid_sz + 1 + \ + enc_referring_call4_sz) #define cb_sequence_enc_sz (sessionid_sz + 4 + \ - 1 /* no referring calls list yet */) + enc_referring_call_list4_sz) #define cb_sequence_dec_sz (op_dec_sz + sessionid_sz + 4) #define op_enc_sz 1 diff --git a/fs/nilfs2/namei.c b/fs/nilfs2/namei.c index 953fbd5f0851..40f4b1a28705 100644 --- a/fs/nilfs2/namei.c +++ b/fs/nilfs2/namei.c @@ -218,8 +218,8 @@ static int nilfs_link(struct dentry *old_dentry, struct inode *dir, return err; } -static int nilfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *nilfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; struct nilfs_transaction_info ti; @@ -227,7 +227,7 @@ static int nilfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, err = nilfs_transaction_begin(dir->i_sb, &ti, 1); if (err) - return err; + return ERR_PTR(err); inc_nlink(dir); @@ -258,7 +258,7 @@ out: else nilfs_transaction_abort(dir->i_sb); - return err; + return ERR_PTR(err); out_fail: drop_nlink(inode); diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 3a202e51b360..83970d97840b 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2424,7 +2424,7 @@ static void nilfs_segctor_accept(struct nilfs_sc_info *sci) * the area protected by sc_state_lock. 
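The -EBUSY handling added to the LINK, RENAME and REMOVE paths above repeats the same per-object-type translation. A minimal sketch of how that mapping could be factored into one helper; this is hypothetical and not part of the diff, and it assumes only the nfserr_* status codes already used above:

/*
 * Hypothetical helper, not in this patch: RFC 8881 (Sections 18.9.4,
 * 18.25.4, 18.26.4) wants a busy (e.g. mounted-on) object reported with
 * a status that hints at its type: NFS4ERR_FILE_OPEN for non-directories,
 * NFS4ERR_ACCESS for directories.
 */
static __be32 nfsd_ebusy_status(int type)
{
	return (type == S_IFDIR) ? nfserr_acces : nfserr_file_open;
}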
*/ if (thread_is_alive) - del_timer_sync(&sci->sc_timer); + timer_delete_sync(&sci->sc_timer); } /** diff --git a/fs/nilfs2/the_nilfs.c b/fs/nilfs2/the_nilfs.c index cb01ea81724d..d0bcf744c553 100644 --- a/fs/nilfs2/the_nilfs.c +++ b/fs/nilfs2/the_nilfs.c @@ -705,8 +705,6 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb) int blocksize; int err; - down_write(&nilfs->ns_sem); - blocksize = sb_min_blocksize(sb, NILFS_MIN_BLOCK_SIZE); if (!blocksize) { nilfs_err(sb, "unable to set blocksize"); @@ -779,7 +777,6 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb) set_nilfs_init(nilfs); err = 0; out: - up_write(&nilfs->ns_sem); return err; failed_sbh: diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 95646f7c46ca..6d386080faf2 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -166,6 +166,8 @@ static bool fanotify_should_merge(struct fanotify_event *old, case FANOTIFY_EVENT_TYPE_FS_ERROR: return fanotify_error_event_equal(FANOTIFY_EE(old), FANOTIFY_EE(new)); + case FANOTIFY_EVENT_TYPE_MNT: + return false; default: WARN_ON_ONCE(1); } @@ -312,7 +314,10 @@ static u32 fanotify_group_event_mask(struct fsnotify_group *group, pr_debug("%s: report_mask=%x mask=%x data=%p data_type=%d\n", __func__, iter_info->report_mask, event_mask, data, data_type); - if (!fid_mode) { + if (FAN_GROUP_FLAG(group, FAN_REPORT_MNT)) { + if (data_type != FSNOTIFY_EVENT_MNT) + return 0; + } else if (!fid_mode) { /* Do we have path to open a file descriptor? */ if (!path) return 0; @@ -557,6 +562,20 @@ static struct fanotify_event *fanotify_alloc_path_event(const struct path *path, return &pevent->fae; } +static struct fanotify_event *fanotify_alloc_mnt_event(u64 mnt_id, gfp_t gfp) +{ + struct fanotify_mnt_event *pevent; + + pevent = kmem_cache_alloc(fanotify_mnt_event_cachep, gfp); + if (!pevent) + return NULL; + + pevent->fae.type = FANOTIFY_EVENT_TYPE_MNT; + pevent->mnt_id = mnt_id; + + return &pevent->fae; +} + static struct fanotify_event *fanotify_alloc_perm_event(const void *data, int data_type, gfp_t gfp) @@ -731,6 +750,7 @@ static struct fanotify_event *fanotify_alloc_event( fid_mode); struct inode *dirid = fanotify_dfid_inode(mask, data, data_type, dir); const struct path *path = fsnotify_data_path(data, data_type); + u64 mnt_id = fsnotify_data_mnt_id(data, data_type); struct mem_cgroup *old_memcg; struct dentry *moved = NULL; struct inode *child = NULL; @@ -826,8 +846,12 @@ static struct fanotify_event *fanotify_alloc_event( moved, &hash, gfp); } else if (fid_mode) { event = fanotify_alloc_fid_event(id, fsid, &hash, gfp); - } else { + } else if (path) { event = fanotify_alloc_path_event(path, &hash, gfp); + } else if (mnt_id) { + event = fanotify_alloc_mnt_event(mnt_id, gfp); + } else { + WARN_ON_ONCE(1); } if (!event) @@ -927,7 +951,7 @@ static int fanotify_handle_event(struct fsnotify_group *group, u32 mask, BUILD_BUG_ON(FAN_RENAME != FS_RENAME); BUILD_BUG_ON(FAN_PRE_ACCESS != FS_PRE_ACCESS); - BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 22); + BUILD_BUG_ON(HWEIGHT32(ALL_FANOTIFY_EVENT_BITS) != 24); mask = fanotify_group_event_mask(group, iter_info, &match_mask, mask, data, data_type, dir); @@ -1028,6 +1052,11 @@ static void fanotify_free_error_event(struct fsnotify_group *group, mempool_free(fee, &group->fanotify_data.error_events_pool); } +static void fanotify_free_mnt_event(struct fanotify_event *event) +{ + kmem_cache_free(fanotify_mnt_event_cachep, FANOTIFY_ME(event)); +} + static void 
fanotify_free_event(struct fsnotify_group *group, struct fsnotify_event *fsn_event) { @@ -1054,6 +1083,9 @@ static void fanotify_free_event(struct fsnotify_group *group, case FANOTIFY_EVENT_TYPE_FS_ERROR: fanotify_free_error_event(group, event); break; + case FANOTIFY_EVENT_TYPE_MNT: + fanotify_free_mnt_event(event); + break; default: WARN_ON_ONCE(1); } diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index c12cbc270539..b44e70e44be6 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -9,6 +9,7 @@ extern struct kmem_cache *fanotify_mark_cache; extern struct kmem_cache *fanotify_fid_event_cachep; extern struct kmem_cache *fanotify_path_event_cachep; extern struct kmem_cache *fanotify_perm_event_cachep; +extern struct kmem_cache *fanotify_mnt_event_cachep; /* Possible states of the permission event */ enum { @@ -244,6 +245,7 @@ enum fanotify_event_type { FANOTIFY_EVENT_TYPE_PATH_PERM, FANOTIFY_EVENT_TYPE_OVERFLOW, /* struct fanotify_event */ FANOTIFY_EVENT_TYPE_FS_ERROR, /* struct fanotify_error_event */ + FANOTIFY_EVENT_TYPE_MNT, __FANOTIFY_EVENT_TYPE_NUM }; @@ -409,12 +411,23 @@ struct fanotify_path_event { struct path path; }; +struct fanotify_mnt_event { + struct fanotify_event fae; + u64 mnt_id; +}; + static inline struct fanotify_path_event * FANOTIFY_PE(struct fanotify_event *event) { return container_of(event, struct fanotify_path_event, fae); } +static inline struct fanotify_mnt_event * +FANOTIFY_ME(struct fanotify_event *event) +{ + return container_of(event, struct fanotify_mnt_event, fae); +} + /* * Structure for permission fanotify events. It gets allocated and freed in * fanotify_handle_event() since we wait there for user response. When the @@ -466,6 +479,11 @@ static inline bool fanotify_is_error_event(u32 mask) return mask & FAN_FS_ERROR; } +static inline bool fanotify_is_mnt_event(u32 mask) +{ + return mask & (FAN_MNT_ATTACH | FAN_MNT_DETACH); +} + static inline const struct path *fanotify_event_path(struct fanotify_event *event) { if (event->type == FANOTIFY_EVENT_TYPE_PATH) diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index ba3e2d09eb44..87f861e9004f 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -113,6 +113,7 @@ struct kmem_cache *fanotify_mark_cache __ro_after_init; struct kmem_cache *fanotify_fid_event_cachep __ro_after_init; struct kmem_cache *fanotify_path_event_cachep __ro_after_init; struct kmem_cache *fanotify_perm_event_cachep __ro_after_init; +struct kmem_cache *fanotify_mnt_event_cachep __ro_after_init; #define FANOTIFY_EVENT_ALIGN 4 #define FANOTIFY_FID_INFO_HDR_LEN \ @@ -123,6 +124,8 @@ struct kmem_cache *fanotify_perm_event_cachep __ro_after_init; (sizeof(struct fanotify_event_info_error)) #define FANOTIFY_RANGE_INFO_LEN \ (sizeof(struct fanotify_event_info_range)) +#define FANOTIFY_MNT_INFO_LEN \ + (sizeof(struct fanotify_event_info_mnt)) static int fanotify_fid_info_len(int fh_len, int name_len) { @@ -178,6 +181,8 @@ static size_t fanotify_event_len(unsigned int info_mode, fh_len = fanotify_event_object_fh_len(event); event_len += fanotify_fid_info_len(fh_len, dot_len); } + if (fanotify_is_mnt_event(event->mask)) + event_len += FANOTIFY_MNT_INFO_LEN; if (info_mode & FAN_REPORT_PIDFD) event_len += FANOTIFY_PIDFD_INFO_LEN; @@ -405,6 +410,25 @@ static int process_access_response(struct fsnotify_group *group, return -ENOENT; } +static size_t copy_mnt_info_to_user(struct fanotify_event *event, + char __user *buf, int 
count) +{ + struct fanotify_event_info_mnt info = { }; + + info.hdr.info_type = FAN_EVENT_INFO_TYPE_MNT; + info.hdr.len = FANOTIFY_MNT_INFO_LEN; + + if (WARN_ON(count < info.hdr.len)) + return -EFAULT; + + info.mnt_id = FANOTIFY_ME(event)->mnt_id; + + if (copy_to_user(buf, &info, sizeof(info))) + return -EFAULT; + + return info.hdr.len; +} + static size_t copy_error_info_to_user(struct fanotify_event *event, char __user *buf, int count) { @@ -700,6 +724,15 @@ static int copy_info_records_to_user(struct fanotify_event *event, total_bytes += ret; } + if (fanotify_is_mnt_event(event->mask)) { + ret = copy_mnt_info_to_user(event, buf, count); + if (ret < 0) + return ret; + buf += ret; + count -= ret; + total_bytes += ret; + } + return total_bytes; } @@ -1508,6 +1541,14 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) if ((flags & FAN_REPORT_PIDFD) && (flags & FAN_REPORT_TID)) return -EINVAL; + /* Don't allow mixing mnt events with inode events for now */ + if (flags & FAN_REPORT_MNT) { + if (class != FAN_CLASS_NOTIF) + return -EINVAL; + if (flags & (FANOTIFY_FID_BITS | FAN_REPORT_FD_ERROR)) + return -EINVAL; + } + if (event_f_flags & ~FANOTIFY_INIT_ALL_EVENT_F_BITS) return -EINVAL; @@ -1767,7 +1808,6 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, int dfd, const char __user *pathname) { struct inode *inode = NULL; - struct vfsmount *mnt = NULL; struct fsnotify_group *group; struct path path; struct fan_fsid __fsid, *fsid = NULL; @@ -1776,7 +1816,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS; unsigned int ignore = flags & FANOTIFY_MARK_IGNORE_BITS; unsigned int obj_type, fid_mode; - void *obj; + void *obj = NULL; u32 umask = 0; int ret; @@ -1800,6 +1840,9 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, case FAN_MARK_FILESYSTEM: obj_type = FSNOTIFY_OBJ_TYPE_SB; break; + case FAN_MARK_MNTNS: + obj_type = FSNOTIFY_OBJ_TYPE_MNTNS; + break; default: return -EINVAL; } @@ -1847,6 +1890,19 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, return -EINVAL; group = fd_file(f)->private_data; + /* Only report mount events on mnt namespace */ + if (FAN_GROUP_FLAG(group, FAN_REPORT_MNT)) { + if (mask & ~FANOTIFY_MOUNT_EVENTS) + return -EINVAL; + if (mark_type != FAN_MARK_MNTNS) + return -EINVAL; + } else { + if (mask & FANOTIFY_MOUNT_EVENTS) + return -EINVAL; + if (mark_type == FAN_MARK_MNTNS) + return -EINVAL; + } + /* * An unprivileged user is not allowed to setup mount nor filesystem * marks. This also includes setting up such marks by a group that @@ -1888,7 +1944,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, * point. 
*/ fid_mode = FAN_GROUP_FLAG(group, FANOTIFY_FID_BITS); - if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_EVENT_FLAGS) && + if (mask & ~(FANOTIFY_FD_EVENTS|FANOTIFY_MOUNT_EVENTS|FANOTIFY_EVENT_FLAGS) && (!fid_mode || mark_type == FAN_MARK_MOUNT)) return -EINVAL; @@ -1905,12 +1961,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, return -EINVAL; if (mark_cmd == FAN_MARK_FLUSH) { - if (mark_type == FAN_MARK_MOUNT) - fsnotify_clear_vfsmount_marks_by_group(group); - else if (mark_type == FAN_MARK_FILESYSTEM) - fsnotify_clear_sb_marks_by_group(group); - else - fsnotify_clear_inode_marks_by_group(group); + fsnotify_clear_marks_by_group(group, obj_type); return 0; } @@ -1938,17 +1989,21 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, } /* inode held in place by reference to path; group by fget on fd */ - if (mark_type == FAN_MARK_INODE) { + if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) { inode = path.dentry->d_inode; obj = inode; - } else { - mnt = path.mnt; - if (mark_type == FAN_MARK_MOUNT) - obj = mnt; - else - obj = mnt->mnt_sb; + } else if (obj_type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { + obj = path.mnt; + } else if (obj_type == FSNOTIFY_OBJ_TYPE_SB) { + obj = path.mnt->mnt_sb; + } else if (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) { + obj = mnt_ns_from_dentry(path.dentry); } + ret = -EINVAL; + if (!obj) + goto path_put_and_out; + /* * If some other task has this inode open for write we should not add * an ignore mask, unless that ignore mask is supposed to survive @@ -1956,10 +2011,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, */ if (mark_cmd == FAN_MARK_ADD && (flags & FANOTIFY_MARK_IGNORE_BITS) && !(flags & FAN_MARK_IGNORED_SURV_MODIFY)) { - ret = mnt ? -EINVAL : -EISDIR; + ret = !inode ? 
-EINVAL : -EISDIR; /* FAN_MARK_IGNORE requires SURV_MODIFY for sb/mount/dir marks */ if (ignore == FAN_MARK_IGNORE && - (mnt || S_ISDIR(inode->i_mode))) + (!inode || S_ISDIR(inode->i_mode))) goto path_put_and_out; ret = 0; @@ -1968,7 +2023,7 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, } /* Mask out FAN_EVENT_ON_CHILD flag for sb/mount/non-dir marks */ - if (mnt || !S_ISDIR(inode->i_mode)) { + if (!inode || !S_ISDIR(inode->i_mode)) { mask &= ~FAN_EVENT_ON_CHILD; umask = FAN_EVENT_ON_CHILD; /* @@ -2042,7 +2097,7 @@ static int __init fanotify_user_setup(void) FANOTIFY_DEFAULT_MAX_USER_MARKS); BUILD_BUG_ON(FANOTIFY_INIT_FLAGS & FANOTIFY_INTERNAL_GROUP_FLAGS); - BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 13); + BUILD_BUG_ON(HWEIGHT32(FANOTIFY_INIT_FLAGS) != 14); BUILD_BUG_ON(HWEIGHT32(FANOTIFY_MARK_FLAGS) != 11); fanotify_mark_cache = KMEM_CACHE(fanotify_mark, @@ -2055,6 +2110,7 @@ static int __init fanotify_user_setup(void) fanotify_perm_event_cachep = KMEM_CACHE(fanotify_perm_event, SLAB_PANIC); } + fanotify_mnt_event_cachep = KMEM_CACHE(fanotify_mnt_event, SLAB_PANIC); fanotify_max_queued_events = FANOTIFY_DEFAULT_MAX_EVENTS; init_user_ns.ucount_max[UCOUNT_FANOTIFY_GROUPS] = diff --git a/fs/notify/fdinfo.c b/fs/notify/fdinfo.c index e933f9c65d90..1161eabf11ee 100644 --- a/fs/notify/fdinfo.c +++ b/fs/notify/fdinfo.c @@ -121,6 +121,11 @@ static void fanotify_fdinfo(struct seq_file *m, struct fsnotify_mark *mark) seq_printf(m, "fanotify sdev:%x mflags:%x mask:%x ignored_mask:%x\n", sb->s_dev, mflags, mark->mask, mark->ignore_mask); + } else if (mark->connector->type == FSNOTIFY_OBJ_TYPE_MNTNS) { + struct mnt_namespace *mnt_ns = fsnotify_conn_mntns(mark->connector); + + seq_printf(m, "fanotify mnt_ns:%u mflags:%x mask:%x ignored_mask:%x\n", + mnt_ns->ns.inum, mflags, mark->mask, mark->ignore_mask); } } diff --git a/fs/notify/fsnotify.c b/fs/notify/fsnotify.c index fae1b6d397ea..e2b4f17a48bb 100644 --- a/fs/notify/fsnotify.c +++ b/fs/notify/fsnotify.c @@ -28,6 +28,11 @@ void __fsnotify_vfsmount_delete(struct vfsmount *mnt) fsnotify_clear_marks_by_mount(mnt); } +void __fsnotify_mntns_delete(struct mnt_namespace *mntns) +{ + fsnotify_clear_marks_by_mntns(mntns); +} + /** * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. * @sb: superblock being unmounted. @@ -420,7 +425,7 @@ static int send_to_group(__u32 mask, const void *data, int data_type, file_name, cookie, iter_info); } -static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector **connp) +static struct fsnotify_mark *fsnotify_first_mark(struct fsnotify_mark_connector *const *connp) { struct fsnotify_mark_connector *conn; struct hlist_node *node = NULL; @@ -538,14 +543,15 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, { const struct path *path = fsnotify_data_path(data, data_type); struct super_block *sb = fsnotify_data_sb(data, data_type); - struct fsnotify_sb_info *sbinfo = fsnotify_sb_info(sb); + const struct fsnotify_mnt *mnt_data = fsnotify_data_mnt(data, data_type); + struct fsnotify_sb_info *sbinfo = sb ? 
fsnotify_sb_info(sb) : NULL; struct fsnotify_iter_info iter_info = {}; struct mount *mnt = NULL; struct inode *inode2 = NULL; struct dentry *moved; int inode2_type; int ret = 0; - __u32 test_mask, marks_mask; + __u32 test_mask, marks_mask = 0; if (path) mnt = real_mount(path->mnt); @@ -578,17 +584,20 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, if ((!sbinfo || !sbinfo->sb_marks) && (!mnt || !mnt->mnt_fsnotify_marks) && (!inode || !inode->i_fsnotify_marks) && - (!inode2 || !inode2->i_fsnotify_marks)) + (!inode2 || !inode2->i_fsnotify_marks) && + (!mnt_data || !mnt_data->ns->n_fsnotify_marks)) return 0; - marks_mask = READ_ONCE(sb->s_fsnotify_mask); + if (sb) + marks_mask |= READ_ONCE(sb->s_fsnotify_mask); if (mnt) marks_mask |= READ_ONCE(mnt->mnt_fsnotify_mask); if (inode) marks_mask |= READ_ONCE(inode->i_fsnotify_mask); if (inode2) marks_mask |= READ_ONCE(inode2->i_fsnotify_mask); - + if (mnt_data) + marks_mask |= READ_ONCE(mnt_data->ns->n_fsnotify_mask); /* * If this is a modify event we may need to clear some ignore masks. @@ -618,6 +627,10 @@ int fsnotify(__u32 mask, const void *data, int data_type, struct inode *dir, iter_info.marks[inode2_type] = fsnotify_first_mark(&inode2->i_fsnotify_marks); } + if (mnt_data) { + iter_info.marks[FSNOTIFY_ITER_TYPE_MNTNS] = + fsnotify_first_mark(&mnt_data->ns->n_fsnotify_marks); + } /* * We need to merge inode/vfsmount/sb mark lists so that e.g. inode mark @@ -708,11 +721,31 @@ void file_set_fsnotify_mode_from_watchers(struct file *file) } #endif +void fsnotify_mnt(__u32 mask, struct mnt_namespace *ns, struct vfsmount *mnt) +{ + struct fsnotify_mnt data = { + .ns = ns, + .mnt_id = real_mount(mnt)->mnt_id_unique, + }; + + if (WARN_ON_ONCE(!ns)) + return; + + /* + * This is an optimization as well as making sure fsnotify_init() has + * been called. + */ + if (!ns->n_fsnotify_marks) + return; + + fsnotify(mask, &data, FSNOTIFY_EVENT_MNT, NULL, NULL, NULL, 0); +} + static __init int fsnotify_init(void) { int ret; - BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 24); + BUILD_BUG_ON(HWEIGHT32(ALL_FSNOTIFY_BITS) != 26); ret = init_srcu_struct(&fsnotify_mark_srcu); if (ret) diff --git a/fs/notify/fsnotify.h b/fs/notify/fsnotify.h index 663759ed6fbc..5950c7a67f41 100644 --- a/fs/notify/fsnotify.h +++ b/fs/notify/fsnotify.h @@ -33,6 +33,12 @@ static inline struct super_block *fsnotify_conn_sb( return conn->obj; } +static inline struct mnt_namespace *fsnotify_conn_mntns( + struct fsnotify_mark_connector *conn) +{ + return conn->obj; +} + static inline struct super_block *fsnotify_object_sb(void *obj, enum fsnotify_obj_type obj_type) { @@ -89,6 +95,11 @@ static inline void fsnotify_clear_marks_by_sb(struct super_block *sb) fsnotify_destroy_marks(fsnotify_sb_marks(sb)); } +static inline void fsnotify_clear_marks_by_mntns(struct mnt_namespace *mntns) +{ + fsnotify_destroy_marks(&mntns->n_fsnotify_marks); +} + /* * update the dentry->d_flags of all of inode's children to indicate if inode cares * about events that happen to its children. 
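Taken together, the fanotify and fsnotify changes above add a new event class for mount attach/detach reported against a mount namespace. A minimal userspace sketch of a listener, assuming the UAPI introduced by this series (FAN_REPORT_MNT, FAN_MARK_MNTNS, FAN_MNT_ATTACH/FAN_MNT_DETACH, FAN_EVENT_INFO_TYPE_MNT, and struct fanotify_event_info_mnt carrying a 64-bit mnt_id as in copy_mnt_info_to_user() above) is exported via the fanotify headers:

/* Sketch only: assumes the new fanotify mount-event UAPI from this series. */
#include <fcntl.h>
#include <stdio.h>
#include <sys/fanotify.h>
#include <unistd.h>

int main(void)
{
	char buf[4096];
	struct fanotify_event_metadata *ev;
	ssize_t len;

	/* FAN_REPORT_MNT groups must be FAN_CLASS_NOTIF and fid-less. */
	int fd = fanotify_init(FAN_CLASS_NOTIF | FAN_REPORT_MNT, 0);
	if (fd < 0)
		return 1;

	/* Mount events are marked on a mount namespace (nsfs) object. */
	if (fanotify_mark(fd, FAN_MARK_ADD | FAN_MARK_MNTNS,
			  FAN_MNT_ATTACH | FAN_MNT_DETACH, AT_FDCWD,
			  "/proc/self/ns/mnt") < 0)
		return 1;

	while ((len = read(fd, buf, sizeof(buf))) > 0) {
		for (ev = (struct fanotify_event_metadata *)buf;
		     FAN_EVENT_OK(ev, len); ev = FAN_EVENT_NEXT(ev, len)) {
			/* The mount info record follows the metadata. */
			struct fanotify_event_info_mnt *info =
				(struct fanotify_event_info_mnt *)(ev + 1);

			if (info->hdr.info_type == FAN_EVENT_INFO_TYPE_MNT)
				printf("%s mnt_id=%llu\n",
				       (ev->mask & FAN_MNT_ATTACH) ?
				       "attach" : "detach",
				       (unsigned long long)info->mnt_id);
		}
	}
	close(fd);
	return 0;
}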
diff --git a/fs/notify/mark.c b/fs/notify/mark.c index 4981439e6209..798340db69d7 100644 --- a/fs/notify/mark.c +++ b/fs/notify/mark.c @@ -107,6 +107,8 @@ static fsnotify_connp_t *fsnotify_object_connp(void *obj, return &real_mount(obj)->mnt_fsnotify_marks; case FSNOTIFY_OBJ_TYPE_SB: return fsnotify_sb_marks(obj); + case FSNOTIFY_OBJ_TYPE_MNTNS: + return &((struct mnt_namespace *)obj)->n_fsnotify_marks; default: return NULL; } @@ -120,6 +122,8 @@ static __u32 *fsnotify_conn_mask_p(struct fsnotify_mark_connector *conn) return &fsnotify_conn_mount(conn)->mnt_fsnotify_mask; else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) return &fsnotify_conn_sb(conn)->s_fsnotify_mask; + else if (conn->type == FSNOTIFY_OBJ_TYPE_MNTNS) + return &fsnotify_conn_mntns(conn)->n_fsnotify_mask; return NULL; } @@ -346,12 +350,15 @@ static void *fsnotify_detach_connector_from_object( fsnotify_conn_mount(conn)->mnt_fsnotify_mask = 0; } else if (conn->type == FSNOTIFY_OBJ_TYPE_SB) { fsnotify_conn_sb(conn)->s_fsnotify_mask = 0; + } else if (conn->type == FSNOTIFY_OBJ_TYPE_MNTNS) { + fsnotify_conn_mntns(conn)->n_fsnotify_mask = 0; } rcu_assign_pointer(*connp, NULL); conn->obj = NULL; conn->type = FSNOTIFY_OBJ_TYPE_DETACHED; - fsnotify_update_sb_watchers(sb, conn); + if (sb) + fsnotify_update_sb_watchers(sb, conn); return inode; } @@ -724,7 +731,7 @@ static int fsnotify_add_mark_list(struct fsnotify_mark *mark, void *obj, * Attach the sb info before attaching a connector to any object on sb. * The sb info will remain attached as long as sb lives. */ - if (!fsnotify_sb_info(sb)) { + if (sb && !fsnotify_sb_info(sb)) { err = fsnotify_attach_info_to_sb(sb); if (err) return err; @@ -770,7 +777,8 @@ restart: /* mark should be the last entry. last is the current last entry */ hlist_add_behind_rcu(&mark->obj_list, &last->obj_list); added: - fsnotify_update_sb_watchers(sb, conn); + if (sb) + fsnotify_update_sb_watchers(sb, conn); /* * Since connector is attached to object using cmpxchg() we are * guaranteed that connector initialization is fully visible by anyone diff --git a/fs/nsfs.c b/fs/nsfs.c index 663f8656158d..59aa801347a7 100644 --- a/fs/nsfs.c +++ b/fs/nsfs.c @@ -37,7 +37,6 @@ static char *ns_dname(struct dentry *dentry, char *buffer, int buflen) } const struct dentry_operations ns_dentry_operations = { - .d_delete = always_delete_dentry, .d_dname = ns_dname, .d_prune = stashed_dentry_prune, }; @@ -152,19 +151,49 @@ static int copy_ns_info_to_user(const struct mnt_namespace *mnt_ns, return 0; } +static bool nsfs_ioctl_valid(unsigned int cmd) +{ + switch (cmd) { + case NS_GET_USERNS: + case NS_GET_PARENT: + case NS_GET_NSTYPE: + case NS_GET_OWNER_UID: + case NS_GET_MNTNS_ID: + case NS_GET_PID_FROM_PIDNS: + case NS_GET_TGID_FROM_PIDNS: + case NS_GET_PID_IN_PIDNS: + case NS_GET_TGID_IN_PIDNS: + return true; + } + + /* Extensible ioctls require some extra handling.
*/ + switch (_IOC_NR(cmd)) { + case _IOC_NR(NS_MNT_GET_INFO): + case _IOC_NR(NS_MNT_GET_NEXT): + case _IOC_NR(NS_MNT_GET_PREV): + return (_IOC_TYPE(cmd) == _IOC_TYPE(NS_MNT_GET_INFO)); + } + + return false; +} + static long ns_ioctl(struct file *filp, unsigned int ioctl, unsigned long arg) { struct user_namespace *user_ns; struct pid_namespace *pid_ns; struct task_struct *tsk; - struct ns_common *ns = get_proc_ns(file_inode(filp)); + struct ns_common *ns; struct mnt_namespace *mnt_ns; bool previous = false; uid_t __user *argp; uid_t uid; int ret; + if (!nsfs_ioctl_valid(ioctl)) + return -ENOIOCTLCMD; + + ns = get_proc_ns(file_inode(filp)); switch (ioctl) { case NS_GET_USERNS: return open_related_ns(ns, ns_get_owner); diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index af94e3737470..eced9013a881 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -2605,74 +2605,3 @@ int attr_force_nonresident(struct ntfs_inode *ni) return err; } - -/* - * Change the compression of data attribute - */ -int attr_set_compress(struct ntfs_inode *ni, bool compr) -{ - struct ATTRIB *attr; - struct mft_inode *mi; - - attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, &mi); - if (!attr) - return -ENOENT; - - if (is_attr_compressed(attr) == !!compr) { - /* Already required compressed state. */ - return 0; - } - - if (attr->non_res) { - u16 run_off; - u32 run_size; - char *run; - - if (attr->nres.data_size) { - /* - * There are rare cases when it possible to change - * compress state without big changes. - * TODO: Process these cases. - */ - return -EOPNOTSUPP; - } - - run_off = le16_to_cpu(attr->nres.run_off); - run_size = le32_to_cpu(attr->size) - run_off; - run = Add2Ptr(attr, run_off); - - if (!compr) { - /* remove field 'attr->nres.total_size'. */ - memmove(run - 8, run, run_size); - run_off -= 8; - } - - if (!mi_resize_attr(mi, attr, compr ? +8 : -8)) { - /* - * Ignore rare case when there are no 8 bytes in record with attr. - * TODO: split attribute. - */ - return -EOPNOTSUPP; - } - - if (compr) { - /* Make a gap for 'attr->nres.total_size'. */ - memmove(run + 8, run, run_size); - run_off += 8; - attr->nres.total_size = attr->nres.alloc_size; - } - attr->nres.run_off = cpu_to_le16(run_off); - } - - /* Update data attribute flags.
*/ - if (compr) { - attr->flags |= ATTR_FLAG_COMPRESSED; - attr->nres.c_unit = NTFS_LZNT_CUNIT; - } else { - attr->flags &= ~ATTR_FLAG_COMPRESSED; - attr->nres.c_unit = 0; - } - mi->dirty = true; - - return 0; -} diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 3f96a11804c9..34ed242e1063 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -50,72 +50,6 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg) } /* - * ntfs_fileattr_get - inode_operations::fileattr_get - */ -int ntfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) -{ - struct inode *inode = d_inode(dentry); - struct ntfs_inode *ni = ntfs_i(inode); - u32 flags = 0; - - if (inode->i_flags & S_IMMUTABLE) - flags |= FS_IMMUTABLE_FL; - - if (inode->i_flags & S_APPEND) - flags |= FS_APPEND_FL; - - if (is_compressed(ni)) - flags |= FS_COMPR_FL; - - if (is_encrypted(ni)) - flags |= FS_ENCRYPT_FL; - - fileattr_fill_flags(fa, flags); - - return 0; -} - -/* - * ntfs_fileattr_set - inode_operations::fileattr_set - */ -int ntfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, - struct fileattr *fa) -{ - struct inode *inode = d_inode(dentry); - struct ntfs_inode *ni = ntfs_i(inode); - u32 flags = fa->flags; - unsigned int new_fl = 0; - - if (fileattr_has_fsx(fa)) - return -EOPNOTSUPP; - - if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_COMPR_FL)) - return -EOPNOTSUPP; - - if (flags & FS_IMMUTABLE_FL) - new_fl |= S_IMMUTABLE; - - if (flags & FS_APPEND_FL) - new_fl |= S_APPEND; - - /* Allowed to change compression for empty files and for directories only. */ - if (!is_dedup(ni) && !is_encrypted(ni) && - (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { - /* Change compress state. */ - int err = ni_set_compress(inode, flags & FS_COMPR_FL); - if (err) - return err; - } - - inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND); - - inode_set_ctime_current(inode); - mark_inode_dirty(inode); - - return 0; -} - -/* * ntfs_ioctl - file_operations::unlocked_ioctl */ long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg) @@ -1228,21 +1162,22 @@ static ssize_t ntfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from) ssize_t ret; int err; - err = check_write_restriction(inode); - if (err) - return err; - - if (is_compressed(ni) && (iocb->ki_flags & IOCB_DIRECT)) { - ntfs_inode_warn(inode, "direct i/o + compressed not supported"); - return -EOPNOTSUPP; - } - if (!inode_trylock(inode)) { if (iocb->ki_flags & IOCB_NOWAIT) return -EAGAIN; inode_lock(inode); } + ret = check_write_restriction(inode); + if (ret) + goto out; + + if (is_compressed(ni) && (iocb->ki_flags & IOCB_DIRECT)) { + ntfs_inode_warn(inode, "direct i/o + compressed not supported"); + ret = -EOPNOTSUPP; + goto out; + } + ret = generic_write_checks(iocb, from); if (ret <= 0) goto out; @@ -1389,8 +1324,6 @@ const struct inode_operations ntfs_file_inode_operations = { .get_acl = ntfs_get_acl, .set_acl = ntfs_set_acl, .fiemap = ntfs_fiemap, - .fileattr_get = ntfs_fileattr_get, - .fileattr_set = ntfs_fileattr_set, }; const struct file_operations ntfs_file_operations = { diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index 5df6a0b5add9..756e1306fe6c 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -281,63 +281,6 @@ struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr, } /* - * ni_load_attr - Load attribute that contains given VCN. 
- */ -struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, - const __le16 *name, u8 name_len, CLST vcn, - struct mft_inode **pmi) -{ - struct ATTR_LIST_ENTRY *le; - struct ATTRIB *attr; - struct mft_inode *mi; - struct ATTR_LIST_ENTRY *next; - - if (!ni->attr_list.size) { - if (pmi) - *pmi = &ni->mi; - return mi_find_attr(ni, &ni->mi, NULL, type, name, name_len, - NULL); - } - - le = al_find_ex(ni, NULL, type, name, name_len, NULL); - if (!le) - return NULL; - - /* - * Unfortunately ATTR_LIST_ENTRY contains only start VCN. - * So to find the ATTRIB segment that contains 'vcn' we should - * enumerate some entries. - */ - if (vcn) { - for (;; le = next) { - next = al_find_ex(ni, le, type, name, name_len, NULL); - if (!next || le64_to_cpu(next->vcn) > vcn) - break; - } - } - - if (ni_load_mi(ni, le, &mi)) - return NULL; - - if (pmi) - *pmi = mi; - - attr = mi_find_attr(ni, mi, NULL, type, name, name_len, &le->id); - if (!attr) - return NULL; - - if (!attr->non_res) - return attr; - - if (le64_to_cpu(attr->nres.svcn) <= vcn && - vcn <= le64_to_cpu(attr->nres.evcn)) - return attr; - - _ntfs_bad_inode(&ni->vfs_inode); - return NULL; -} - -/* * ni_load_all_mi - Load all subrecords. */ int ni_load_all_mi(struct ntfs_inode *ni) @@ -3384,75 +3327,3 @@ out: return 0; } - -/* - * ni_set_compress - * - * Helper for 'ntfs_fileattr_set'. - * Changes compression for empty files and directories only. - */ -int ni_set_compress(struct inode *inode, bool compr) -{ - int err; - struct ntfs_inode *ni = ntfs_i(inode); - struct ATTR_STD_INFO *std; - const char *bad_inode; - - if (is_compressed(ni) == !!compr) - return 0; - - if (is_sparsed(ni)) { - /* sparse and compress not compatible. */ - return -EOPNOTSUPP; - } - - if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) { - /*Skip other inodes. (symlink,fifo,...) */ - return -EOPNOTSUPP; - } - - bad_inode = NULL; - - ni_lock(ni); - - std = ni_std(ni); - if (!std) { - bad_inode = "no std"; - goto out; - } - - if (S_ISREG(inode->i_mode)) { - err = attr_set_compress(ni, compr); - if (err) { - if (err == -ENOENT) { - /* Fix on the fly? */ - /* Each file must contain data attribute. */ - bad_inode = "no data attribute"; - } - goto out; - } - } - - ni->std_fa = std->fa; - if (compr) - std->fa |= FILE_ATTRIBUTE_COMPRESSED; - else - std->fa &= ~FILE_ATTRIBUTE_COMPRESSED; - - if (ni->std_fa != std->fa) { - ni->std_fa = std->fa; - ni->mi.dirty = true; - } - /* update duplicate information and directory entries in ni_write_inode.*/ - ni->ni_flags |= NI_FLAG_UPDATE_PARENT; - err = 0; - -out: - ni_unlock(ni); - if (bad_inode) { - ntfs_bad_inode(inode, bad_inode); - err = -EINVAL; - } - - return err; -} diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index d0d530f4e2b9..38934e6978ec 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -3091,16 +3091,16 @@ static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe, inode = ilookup(sbi->sb, rno); if (inode) { mi = &ntfs_i(inode)->mi; - } else if (op == InitializeFileRecordSegment) { - mi = kzalloc(sizeof(struct mft_inode), GFP_NOFS); - if (!mi) - return -ENOMEM; - err = mi_format_new(mi, sbi, rno, 0, false); - if (err) - goto out; } else { /* Read from disk. 
*/ err = mi_get(sbi, rno, &mi); + if (err && op == InitializeFileRecordSegment) { + mi = kzalloc(sizeof(struct mft_inode), + GFP_NOFS); + if (!mi) + return -ENOMEM; + err = mi_format_new(mi, sbi, rno, 0, false); + } if (err) return err; } @@ -3109,15 +3109,13 @@ static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe, if (op == DeallocateFileRecordSegment) goto skip_load_parent; - if (InitializeFileRecordSegment != op) { - if (rec->rhdr.sign == NTFS_BAAD_SIGNATURE) - goto dirty_vol; - if (!check_lsn(&rec->rhdr, rlsn)) - goto out; - if (!check_file_record(rec, NULL, sbi)) - goto dirty_vol; - attr = Add2Ptr(rec, roff); - } + if (rec->rhdr.sign == NTFS_BAAD_SIGNATURE) + goto dirty_vol; + if (!check_lsn(&rec->rhdr, rlsn)) + goto out; + if (!check_file_record(rec, NULL, sbi)) + goto dirty_vol; + attr = Add2Ptr(rec, roff); if (is_rec_base(rec) || InitializeFileRecordSegment == op) { rno_base = rno; @@ -3143,7 +3141,7 @@ static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe, if (inode) iput(inode); - else if (mi) + else mi_put(mi); inode = inode_parent; diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c index 938d351ebac7..df81f1f7330c 100644 --- a/fs/ntfs3/fsntfs.c +++ b/fs/ntfs3/fsntfs.c @@ -1035,34 +1035,6 @@ struct buffer_head *ntfs_bread(struct super_block *sb, sector_t block) return NULL; } -int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer) -{ - struct block_device *bdev = sb->s_bdev; - u32 blocksize = sb->s_blocksize; - u64 block = lbo >> sb->s_blocksize_bits; - u32 off = lbo & (blocksize - 1); - u32 op = blocksize - off; - - for (; bytes; block += 1, off = 0, op = blocksize) { - struct buffer_head *bh = __bread(bdev, block, blocksize); - - if (!bh) - return -EIO; - - if (op > bytes) - op = bytes; - - memcpy(buffer, bh->b_data + off, op); - - put_bh(bh); - - bytes -= op; - buffer = Add2Ptr(buffer, op); - } - - return 0; -} - int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes, const void *buf, int wait) { diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index 7eb9fae22f8d..1bf2a6593dec 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -618,7 +618,7 @@ static bool index_hdr_check(const struct INDEX_HDR *hdr, u32 bytes) u32 off = le32_to_cpu(hdr->de_off); if (!IS_ALIGNED(off, 8) || tot > bytes || end > tot || - off + sizeof(struct NTFS_DE) > end) { + size_add(off, sizeof(struct NTFS_DE)) > end) { /* incorrect index buffer. */ return false; } @@ -736,7 +736,7 @@ fill_table: if (end > total) return NULL; - if (off + sizeof(struct NTFS_DE) > end) + if (size_add(off, sizeof(struct NTFS_DE)) > end) return NULL; e = Add2Ptr(hdr, off); @@ -2182,6 +2182,10 @@ static int indx_get_entry_to_replace(struct ntfs_index *indx, e = hdr_first_de(&n->index->ihdr); fnd_push(fnd, n, e); + if (!e) { + err = -EINVAL; + goto out; + } if (!de_is_last(e)) { /* @@ -2203,6 +2207,10 @@ static int indx_get_entry_to_replace(struct ntfs_index *indx, n = fnd->nodes[level]; te = hdr_first_de(&n->index->ihdr); + if (!te) { + err = -EINVAL; + goto out; + } /* Copy the candidate entry into the replacement entry buffer. */ re = kmalloc(le16_to_cpu(te->size) + sizeof(u64), GFP_NOFS); if (!re) { diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index a1e11228dafd..0f0d27d4644a 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -805,6 +805,10 @@ static ssize_t ntfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) ret = 0; goto out; } + if (is_compressed(ni)) { + ret = 0; + goto out; + } ret = blockdev_direct_IO(iocb, inode, iter, wr ? 
ntfs_get_block_direct_IO_W : @@ -1025,46 +1029,6 @@ int ntfs_sync_inode(struct inode *inode) } /* - * writeback_inode - Helper function for ntfs_flush_inodes(). - * - * This writes both the inode and the file data blocks, waiting - * for in flight data blocks before the start of the call. It - * does not wait for any io started during the call. - */ -static int writeback_inode(struct inode *inode) -{ - int ret = sync_inode_metadata(inode, 0); - - if (!ret) - ret = filemap_fdatawrite(inode->i_mapping); - return ret; -} - -/* - * ntfs_flush_inodes - * - * Write data and metadata corresponding to i1 and i2. The io is - * started but we do not wait for any of it to finish. - * - * filemap_flush() is used for the block device, so if there is a dirty - * page for a block already in flight, we will not wait and start the - * io over again. - */ -int ntfs_flush_inodes(struct super_block *sb, struct inode *i1, - struct inode *i2) -{ - int ret = 0; - - if (i1) - ret = writeback_inode(i1); - if (!ret && i2) - ret = writeback_inode(i2); - if (!ret) - ret = filemap_flush(sb->s_bdev_file->f_mapping); - return ret; -} - -/* * Helper function to read file. */ int inode_read_data(struct inode *inode, void *data, size_t bytes) @@ -2108,5 +2072,6 @@ const struct address_space_operations ntfs_aops_cmpr = { .read_folio = ntfs_read_folio, .readahead = ntfs_readahead, .dirty_folio = block_dirty_folio, + .direct_IO = ntfs_direct_IO, }; // clang-format on diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index abf7e81584a9..b807744fc6a9 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -201,11 +201,11 @@ static int ntfs_symlink(struct mnt_idmap *idmap, struct inode *dir, /* * ntfs_mkdir- inode_operations::mkdir */ -static int ntfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *ntfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { - return ntfs_create_inode(idmap, dir, dentry, NULL, S_IFDIR | mode, 0, - NULL, 0, NULL); + return ERR_PTR(ntfs_create_inode(idmap, dir, dentry, NULL, S_IFDIR | mode, 0, + NULL, 0, NULL)); } /* @@ -507,8 +507,6 @@ const struct inode_operations ntfs_dir_inode_operations = { .getattr = ntfs_getattr, .listxattr = ntfs_listxattr, .fiemap = ntfs_fiemap, - .fileattr_get = ntfs_fileattr_get, - .fileattr_set = ntfs_fileattr_set, }; const struct inode_operations ntfs_special_inode_operations = { diff --git a/fs/ntfs3/ntfs.h b/fs/ntfs3/ntfs.h index 241f2ffdd920..1ff13b6f9613 100644 --- a/fs/ntfs3/ntfs.h +++ b/fs/ntfs3/ntfs.h @@ -717,7 +717,7 @@ static inline struct NTFS_DE *hdr_first_de(const struct INDEX_HDR *hdr) struct NTFS_DE *e; u16 esize; - if (de_off >= used || de_off + sizeof(struct NTFS_DE) > used ) + if (de_off >= used || size_add(de_off, sizeof(struct NTFS_DE)) > used) return NULL; e = Add2Ptr(hdr, de_off); diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index 382820464dee..36b8052660d5 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -454,7 +454,6 @@ int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes); int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes); int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size); int attr_force_nonresident(struct ntfs_inode *ni); -int attr_set_compress(struct ntfs_inode *ni, bool compr); /* Functions from attrlist.c */ void al_destroy(struct ntfs_inode *ni); @@ -497,9 +496,6 @@ extern const struct file_operations ntfs_dir_operations; extern const struct file_operations 
ntfs_legacy_dir_operations; /* Globals from file.c */ -int ntfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int ntfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, - struct fileattr *fa); int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, u32 flags); int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, @@ -530,9 +526,6 @@ struct ATTRIB *ni_find_attr(struct ntfs_inode *ni, struct ATTRIB *attr, struct ATTRIB *ni_enum_attr_ex(struct ntfs_inode *ni, struct ATTRIB *attr, struct ATTR_LIST_ENTRY **le, struct mft_inode **mi); -struct ATTRIB *ni_load_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, - const __le16 *name, u8 name_len, CLST vcn, - struct mft_inode **pmi); int ni_load_all_mi(struct ntfs_inode *ni); bool ni_add_subrecord(struct ntfs_inode *ni, CLST rno, struct mft_inode **mi); int ni_remove_attr(struct ntfs_inode *ni, enum ATTR_TYPE type, @@ -588,7 +581,6 @@ int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni, bool *is_bad); bool ni_is_dirty(struct inode *inode); -int ni_set_compress(struct inode *inode, bool compr); /* Globals from fslog.c */ bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes); @@ -619,7 +611,6 @@ enum NTFS_DIRTY_FLAGS { NTFS_DIRTY_ERROR = 2, }; int ntfs_set_state(struct ntfs_sb_info *sbi, enum NTFS_DIRTY_FLAGS dirty); -int ntfs_sb_read(struct super_block *sb, u64 lbo, size_t bytes, void *buffer); int ntfs_sb_write(struct super_block *sb, u64 lbo, size_t bytes, const void *buffer, int wait); int ntfs_sb_write_run(struct ntfs_sb_info *sbi, const struct runs_tree *run, @@ -717,8 +708,6 @@ int ntfs_write_end(struct file *file, struct address_space *mapping, loff_t pos, u32 len, u32 copied, struct folio *folio, void *fsdata); int ntfs3_write_inode(struct inode *inode, struct writeback_control *wbc); int ntfs_sync_inode(struct inode *inode); -int ntfs_flush_inodes(struct super_block *sb, struct inode *i1, - struct inode *i2); int inode_read_data(struct inode *inode, void *data, size_t bytes); int ntfs_create_inode(struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, const struct cpu_str *uni, diff --git a/fs/ntfs3/super.c b/fs/ntfs3/super.c index 6a0f6b0a3ab2..920a1ab47b63 100644 --- a/fs/ntfs3/super.c +++ b/fs/ntfs3/super.c @@ -555,6 +555,55 @@ static const struct proc_ops ntfs3_label_fops = { .proc_write = ntfs3_label_write, }; +static void ntfs_create_procdir(struct super_block *sb) +{ + struct proc_dir_entry *e; + + if (!proc_info_root) + return; + + e = proc_mkdir(sb->s_id, proc_info_root); + if (e) { + struct ntfs_sb_info *sbi = sb->s_fs_info; + + proc_create_data("volinfo", 0444, e, + &ntfs3_volinfo_fops, sb); + proc_create_data("label", 0644, e, + &ntfs3_label_fops, sb); + sbi->procdir = e; + } +} + +static void ntfs_remove_procdir(struct super_block *sb) +{ + struct ntfs_sb_info *sbi = sb->s_fs_info; + + if (!sbi->procdir) + return; + + remove_proc_entry("label", sbi->procdir); + remove_proc_entry("volinfo", sbi->procdir); + remove_proc_entry(sb->s_id, proc_info_root); + sbi->procdir = NULL; +} + +static void ntfs_create_proc_root(void) +{ + proc_info_root = proc_mkdir("fs/ntfs3", NULL); +} + +static void ntfs_remove_proc_root(void) +{ + if (proc_info_root) { + remove_proc_entry("fs/ntfs3", NULL); + proc_info_root = NULL; + } +} +#else +static void ntfs_create_procdir(struct super_block *sb) {} +static void ntfs_remove_procdir(struct super_block *sb) {} +static void ntfs_create_proc_root(void) {} +static void 
ntfs_remove_proc_root(void) {} #endif static struct kmem_cache *ntfs_inode_cachep; @@ -644,15 +693,7 @@ static void ntfs_put_super(struct super_block *sb) { struct ntfs_sb_info *sbi = sb->s_fs_info; -#ifdef CONFIG_PROC_FS - // Remove /proc/fs/ntfs3/.. - if (sbi->procdir) { - remove_proc_entry("label", sbi->procdir); - remove_proc_entry("volinfo", sbi->procdir); - remove_proc_entry(sb->s_id, proc_info_root); - sbi->procdir = NULL; - } -#endif + ntfs_remove_procdir(sb); /* Mark rw ntfs as clear, if possible. */ ntfs_set_state(sbi, NTFS_DIRTY_CLEAR); @@ -1590,20 +1631,7 @@ load_root: kfree(boot2); } -#ifdef CONFIG_PROC_FS - /* Create /proc/fs/ntfs3/.. */ - if (proc_info_root) { - struct proc_dir_entry *e = proc_mkdir(sb->s_id, proc_info_root); - static_assert((S_IRUGO | S_IWUSR) == 0644); - if (e) { - proc_create_data("volinfo", S_IRUGO, e, - &ntfs3_volinfo_fops, sb); - proc_create_data("label", S_IRUGO | S_IWUSR, e, - &ntfs3_label_fops, sb); - sbi->procdir = e; - } - } -#endif + ntfs_create_procdir(sb); if (is_legacy_ntfs(sb)) sb->s_flags |= SB_RDONLY; @@ -1853,14 +1881,11 @@ static int __init init_ntfs_fs(void) if (IS_ENABLED(CONFIG_NTFS3_LZX_XPRESS)) pr_info("ntfs3: Read-only LZX/Xpress compression included\n"); -#ifdef CONFIG_PROC_FS - /* Create "/proc/fs/ntfs3" */ - proc_info_root = proc_mkdir("fs/ntfs3", NULL); -#endif + ntfs_create_proc_root(); err = ntfs3_init_bitmap(); if (err) - return err; + goto out2; ntfs_inode_cachep = kmem_cache_create( "ntfs_inode_cache", sizeof(struct ntfs_inode), 0, @@ -1880,6 +1905,8 @@ out: kmem_cache_destroy(ntfs_inode_cachep); out1: ntfs3_exit_bitmap(); +out2: + ntfs_remove_proc_root(); return err; } @@ -1890,11 +1917,7 @@ static void __exit exit_ntfs_fs(void) unregister_filesystem(&ntfs_fs_type); unregister_as_ntfs_legacy(); ntfs3_exit_bitmap(); - -#ifdef CONFIG_PROC_FS - if (proc_info_root) - remove_proc_entry("fs/ntfs3", NULL); -#endif + ntfs_remove_proc_root(); } MODULE_LICENSE("GPL"); diff --git a/fs/ocfs2/alloc.c b/fs/ocfs2/alloc.c index 4414743b638e..821cb7874685 100644 --- a/fs/ocfs2/alloc.c +++ b/fs/ocfs2/alloc.c @@ -1803,6 +1803,14 @@ static int __ocfs2_find_path(struct ocfs2_caching_info *ci, el = root_el; while (el->l_tree_depth) { + if (unlikely(le16_to_cpu(el->l_tree_depth) >= OCFS2_MAX_PATH_DEPTH)) { + ocfs2_error(ocfs2_metadata_cache_get_super(ci), + "Owner %llu has invalid tree depth %u in extent list\n", + (unsigned long long)ocfs2_metadata_cache_owner(ci), + le16_to_cpu(el->l_tree_depth)); + ret = -EROFS; + goto out; + } if (le16_to_cpu(el->l_next_free_rec) == 0) { ocfs2_error(ocfs2_metadata_cache_get_super(ci), "Owner %llu has empty extent list at depth %u\n", @@ -6910,6 +6918,7 @@ static int ocfs2_grab_folios(struct inode *inode, loff_t start, loff_t end, if (IS_ERR(folios[numfolios])) { ret = PTR_ERR(folios[numfolios]); mlog_errno(ret); + folios[numfolios] = NULL; goto out; } diff --git a/fs/ocfs2/aops.c b/fs/ocfs2/aops.c index 5bbeb6fbb1ac..40b6bce12951 100644 --- a/fs/ocfs2/aops.c +++ b/fs/ocfs2/aops.c @@ -46,7 +46,6 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh = NULL; struct buffer_head *buffer_cache_bh = NULL; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - void *kaddr; trace_ocfs2_symlink_get_block( (unsigned long long)OCFS2_I(inode)->ip_blkno, @@ -91,17 +90,11 @@ static int ocfs2_symlink_get_block(struct inode *inode, sector_t iblock, * could've happened. Since we've got a reference on * the bh, even if it commits while we're doing the * copy, the data is still good. 
*/ - if (buffer_jbd(buffer_cache_bh) - && ocfs2_inode_is_new(inode)) { - kaddr = kmap_atomic(bh_result->b_page); - if (!kaddr) { - mlog(ML_ERROR, "couldn't kmap!\n"); - goto bail; - } - memcpy(kaddr + (bh_result->b_size * iblock), - buffer_cache_bh->b_data, - bh_result->b_size); - kunmap_atomic(kaddr); + if (buffer_jbd(buffer_cache_bh) && ocfs2_inode_is_new(inode)) { + memcpy_to_folio(bh_result->b_folio, + bh_result->b_size * iblock, + buffer_cache_bh->b_data, + bh_result->b_size); set_buffer_uptodate(bh_result); } brelse(buffer_cache_bh); @@ -920,7 +913,7 @@ static void ocfs2_write_failure(struct inode *inode, ocfs2_jbd2_inode_add_write(wc->w_handle, inode, user_pos, user_len); - block_commit_write(&folio->page, from, to); + block_commit_write(folio, from, to); } } } @@ -2012,7 +2005,7 @@ int ocfs2_write_end_nolock(struct address_space *mapping, loff_t pos, ocfs2_jbd2_inode_add_write(handle, inode, start_byte, length); } - block_commit_write(&folio->page, from, to); + block_commit_write(folio, from, to); } } diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index 0f46b22561d6..fce9beb214f0 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -724,7 +724,7 @@ static void o2net_shutdown_sc(struct work_struct *work) if (o2net_unregister_callbacks(sc->sc_sock->sk, sc)) { /* we shouldn't flush as we're in the thread, the * races with pending sc work structs are harmless */ - del_timer_sync(&sc->sc_idle_timeout); + timer_delete_sync(&sc->sc_idle_timeout); o2net_sc_cancel_delayed_work(sc, &sc->sc_keepalive_work); sc_put(sc); kernel_sock_shutdown(sc->sc_sock, SHUT_RDWR); diff --git a/fs/ocfs2/dlmfs/dlmfs.c b/fs/ocfs2/dlmfs/dlmfs.c index 2a7f36643895..5130ec44e5e1 100644 --- a/fs/ocfs2/dlmfs/dlmfs.c +++ b/fs/ocfs2/dlmfs/dlmfs.c @@ -402,10 +402,10 @@ static struct inode *dlmfs_get_inode(struct inode *parent, * File creation. Allocate an inode, and we're done.. */ /* SMP-safe */ -static int dlmfs_mkdir(struct mnt_idmap * idmap, - struct inode * dir, - struct dentry * dentry, - umode_t mode) +static struct dentry *dlmfs_mkdir(struct mnt_idmap * idmap, + struct inode * dir, + struct dentry * dentry, + umode_t mode) { int status; struct inode *inode = NULL; @@ -448,7 +448,7 @@ static int dlmfs_mkdir(struct mnt_idmap * idmap, bail: if (status < 0) iput(inode); - return status; + return ERR_PTR(status); } static int dlmfs_create(struct mnt_idmap *idmap, diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index e54f2c4b5a90..2056cf08ac1e 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -813,7 +813,7 @@ static int ocfs2_write_zero_page(struct inode *inode, u64 abs_from, /* must not update i_size! 
*/ - block_commit_write(&folio->page, block_start + 1, block_start + 1); + block_commit_write(folio, block_start + 1, block_start + 1); } /* diff --git a/fs/ocfs2/journal.c b/fs/ocfs2/journal.c index f1b4b3e611cb..e5f58ff2175f 100644 --- a/fs/ocfs2/journal.c +++ b/fs/ocfs2/journal.c @@ -174,7 +174,7 @@ int ocfs2_recovery_init(struct ocfs2_super *osb) struct ocfs2_recovery_map *rm; mutex_init(&osb->recovery_lock); - osb->disable_recovery = 0; + osb->recovery_state = OCFS2_REC_ENABLED; osb->recovery_thread_task = NULL; init_waitqueue_head(&osb->recovery_event); @@ -190,31 +190,53 @@ int ocfs2_recovery_init(struct ocfs2_super *osb) return 0; } -/* we can't grab the goofy sem lock from inside wait_event, so we use - * memory barriers to make sure that we'll see the null task before - * being woken up */ static int ocfs2_recovery_thread_running(struct ocfs2_super *osb) { - mb(); return osb->recovery_thread_task != NULL; } -void ocfs2_recovery_exit(struct ocfs2_super *osb) +static void ocfs2_recovery_disable(struct ocfs2_super *osb, + enum ocfs2_recovery_state state) { - struct ocfs2_recovery_map *rm; - - /* disable any new recovery threads and wait for any currently - * running ones to exit. Do this before setting the vol_state. */ mutex_lock(&osb->recovery_lock); - osb->disable_recovery = 1; + /* + * If recovery thread is not running, we can directly transition to + * final state. + */ + if (!ocfs2_recovery_thread_running(osb)) { + osb->recovery_state = state + 1; + goto out_lock; + } + osb->recovery_state = state; + /* Wait for recovery thread to acknowledge state transition */ + wait_event_cmd(osb->recovery_event, + !ocfs2_recovery_thread_running(osb) || + osb->recovery_state >= state + 1, + mutex_unlock(&osb->recovery_lock), + mutex_lock(&osb->recovery_lock)); +out_lock: mutex_unlock(&osb->recovery_lock); - wait_event(osb->recovery_event, !ocfs2_recovery_thread_running(osb)); - /* At this point, we know that no more recovery threads can be - * launched, so wait for any recovery completion work to - * complete. */ + /* + * At this point we know that no more recovery work can be queued so + * wait for any recovery completion work to complete. + */ if (osb->ocfs2_wq) flush_workqueue(osb->ocfs2_wq); +} + +void ocfs2_recovery_disable_quota(struct ocfs2_super *osb) +{ + ocfs2_recovery_disable(osb, OCFS2_REC_QUOTA_WANT_DISABLE); +} + +void ocfs2_recovery_exit(struct ocfs2_super *osb) +{ + struct ocfs2_recovery_map *rm; + + /* disable any new recovery threads and wait for any currently + * running ones to exit. Do this before setting the vol_state. */ + ocfs2_recovery_disable(osb, OCFS2_REC_WANT_DISABLE); /* * Now that recovery is shut down, and the osb is about to be @@ -1249,7 +1271,7 @@ static int ocfs2_force_read_journal(struct inode *inode) } for (i = 0; i < p_blocks; i++, p_blkno++) { - bh = __find_get_block(osb->sb->s_bdev, p_blkno, + bh = __find_get_block_nonatomic(osb->sb->s_bdev, p_blkno, osb->sb->s_blocksize); /* block not cached. 
*/ if (!bh) @@ -1472,6 +1494,18 @@ static int __ocfs2_recovery_thread(void *arg) } } restart: + if (quota_enabled) { + mutex_lock(&osb->recovery_lock); + /* Confirm that recovery thread will no longer recover quotas */ + if (osb->recovery_state == OCFS2_REC_QUOTA_WANT_DISABLE) { + osb->recovery_state = OCFS2_REC_QUOTA_DISABLED; + wake_up(&osb->recovery_event); + } + if (osb->recovery_state >= OCFS2_REC_QUOTA_DISABLED) + quota_enabled = 0; + mutex_unlock(&osb->recovery_lock); + } + status = ocfs2_super_lock(osb, 1); if (status < 0) { mlog_errno(status); @@ -1569,27 +1603,29 @@ bail: ocfs2_free_replay_slots(osb); osb->recovery_thread_task = NULL; - mb(); /* sync with ocfs2_recovery_thread_running */ + if (osb->recovery_state == OCFS2_REC_WANT_DISABLE) + osb->recovery_state = OCFS2_REC_DISABLED; wake_up(&osb->recovery_event); mutex_unlock(&osb->recovery_lock); - if (quota_enabled) - kfree(rm_quota); + kfree(rm_quota); return status; } void ocfs2_recovery_thread(struct ocfs2_super *osb, int node_num) { + int was_set = -1; + mutex_lock(&osb->recovery_lock); + if (osb->recovery_state < OCFS2_REC_WANT_DISABLE) + was_set = ocfs2_recovery_map_set(osb, node_num); trace_ocfs2_recovery_thread(node_num, osb->node_num, - osb->disable_recovery, osb->recovery_thread_task, - osb->disable_recovery ? - -1 : ocfs2_recovery_map_set(osb, node_num)); + osb->recovery_state, osb->recovery_thread_task, was_set); - if (osb->disable_recovery) + if (osb->recovery_state >= OCFS2_REC_WANT_DISABLE) goto out; if (osb->recovery_thread_task) diff --git a/fs/ocfs2/journal.h b/fs/ocfs2/journal.h index e3c3a35dc5e0..6397170f302f 100644 --- a/fs/ocfs2/journal.h +++ b/fs/ocfs2/journal.h @@ -148,6 +148,7 @@ void ocfs2_wait_for_recovery(struct ocfs2_super *osb); int ocfs2_recovery_init(struct ocfs2_super *osb); void ocfs2_recovery_exit(struct ocfs2_super *osb); +void ocfs2_recovery_disable_quota(struct ocfs2_super *osb); int ocfs2_compute_replay_slots(struct ocfs2_super *osb); void ocfs2_free_replay_slots(struct ocfs2_super *osb); diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 0ec63a1a94b8..99278c8f0e24 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -644,10 +644,10 @@ static int ocfs2_mknod_locked(struct ocfs2_super *osb, suballoc_loc, suballoc_bit); } -static int ocfs2_mkdir(struct mnt_idmap *idmap, - struct inode *dir, - struct dentry *dentry, - umode_t mode) +static struct dentry *ocfs2_mkdir(struct mnt_idmap *idmap, + struct inode *dir, + struct dentry *dentry, + umode_t mode) { int ret; @@ -657,7 +657,7 @@ static int ocfs2_mkdir(struct mnt_idmap *idmap, if (ret) mlog_errno(ret); - return ret; + return ERR_PTR(ret); } static int ocfs2_create(struct mnt_idmap *idmap, diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 51c52768132d..6aaa94c554c1 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -308,6 +308,21 @@ enum ocfs2_journal_trigger_type { void ocfs2_initialize_journal_triggers(struct super_block *sb, struct ocfs2_triggers triggers[]); +enum ocfs2_recovery_state { + OCFS2_REC_ENABLED = 0, + OCFS2_REC_QUOTA_WANT_DISABLE, + /* + * Must be OCFS2_REC_QUOTA_WANT_DISABLE + 1 for + * ocfs2_recovery_disable_quota() to work. 
+ */ + OCFS2_REC_QUOTA_DISABLED, + OCFS2_REC_WANT_DISABLE, + /* + * Must be OCFS2_REC_WANT_DISABLE + 1 for ocfs2_recovery_exit() to work + */ + OCFS2_REC_DISABLED, +}; + struct ocfs2_journal; struct ocfs2_slot_info; struct ocfs2_recovery_map; @@ -370,7 +385,7 @@ struct ocfs2_super struct ocfs2_recovery_map *recovery_map; struct ocfs2_replay_map *replay_map; struct task_struct *recovery_thread_task; - int disable_recovery; + enum ocfs2_recovery_state recovery_state; wait_queue_head_t checkpoint_event; struct ocfs2_journal *journal; unsigned long osb_commit_interval; diff --git a/fs/ocfs2/quota_global.c b/fs/ocfs2/quota_global.c index 15d9acd456ec..e85b1ccf81be 100644 --- a/fs/ocfs2/quota_global.c +++ b/fs/ocfs2/quota_global.c @@ -273,7 +273,7 @@ ssize_t ocfs2_quota_write(struct super_block *sb, int type, if (new) memset(bh->b_data, 0, sb->s_blocksize); memcpy(bh->b_data + offset, data, len); - flush_dcache_page(bh->b_page); + flush_dcache_folio(bh->b_folio); set_buffer_uptodate(bh); unlock_buffer(bh); ocfs2_set_buffer_uptodate(INODE_CACHE(gqinode), bh); diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index 2956d888c131..e272429da3db 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -453,8 +453,7 @@ out: /* Sync changes in local quota file into global quota file and * reinitialize local quota file. - * The function expects local quota file to be already locked and - * s_umount locked in shared mode. */ + * The function expects local quota file to be already locked. */ static int ocfs2_recover_local_quota_file(struct inode *lqinode, int type, struct ocfs2_quota_recovery *rec) @@ -588,7 +587,6 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, { unsigned int ino[OCFS2_MAXQUOTAS] = { LOCAL_USER_QUOTA_SYSTEM_INODE, LOCAL_GROUP_QUOTA_SYSTEM_INODE }; - struct super_block *sb = osb->sb; struct ocfs2_local_disk_dqinfo *ldinfo; struct buffer_head *bh; handle_t *handle; @@ -600,7 +598,6 @@ int ocfs2_finish_quota_recovery(struct ocfs2_super *osb, printk(KERN_NOTICE "ocfs2: Finishing quota recovery on device (%s) for " "slot %u\n", osb->dev_str, slot_num); - down_read(&sb->s_umount); for (type = 0; type < OCFS2_MAXQUOTAS; type++) { if (list_empty(&(rec->r_list[type]))) continue; @@ -677,7 +674,6 @@ out_put: break; } out: - up_read(&sb->s_umount); kfree(rec); return status; } @@ -843,8 +839,7 @@ static int ocfs2_local_free_info(struct super_block *sb, int type) ocfs2_release_local_quota_bitmaps(&oinfo->dqi_chunk); /* - * s_umount held in exclusive mode protects us against racing with - * recovery thread... + * ocfs2_dismount_volume() has already aborted quota recovery... 
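The recovery teardown above depends on the enum ordering called out in its comments: ocfs2_recovery_disable() acknowledges a *_WANT_DISABLE request by advancing to state + 1, so each *_DISABLED value must sit exactly one past its *_WANT_DISABLE counterpart. A compile-time guard sketch (hypothetical, not in the patch) that would pin that ordering down:

/* Hypothetical build-time checks for the "state + 1" handshake above. */
static_assert(OCFS2_REC_QUOTA_DISABLED == OCFS2_REC_QUOTA_WANT_DISABLE + 1,
	      "quota disable handshake advances by exactly one state");
static_assert(OCFS2_REC_DISABLED == OCFS2_REC_WANT_DISABLE + 1,
	      "recovery disable handshake advances by exactly one state");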
*/ if (oinfo->dqi_rec) { ocfs2_free_quota_recovery(oinfo->dqi_rec); diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index f7b483f0de2a..6ac4dcd54588 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -698,10 +698,12 @@ static int ocfs2_block_group_alloc(struct ocfs2_super *osb, bg_bh = ocfs2_block_group_alloc_contig(osb, handle, alloc_inode, ac, cl); - if (PTR_ERR(bg_bh) == -ENOSPC) + if (PTR_ERR(bg_bh) == -ENOSPC) { + ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG; bg_bh = ocfs2_block_group_alloc_discontig(handle, alloc_inode, ac, cl); + } if (IS_ERR(bg_bh)) { status = PTR_ERR(bg_bh); bg_bh = NULL; @@ -1794,6 +1796,7 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, { int status; u16 chain; + u32 contig_bits; u64 next_group; struct inode *alloc_inode = ac->ac_inode; struct buffer_head *group_bh = NULL; @@ -1819,10 +1822,21 @@ static int ocfs2_search_chain(struct ocfs2_alloc_context *ac, status = -ENOSPC; /* for now, the chain search is a bit simplistic. We just use * the 1st group with any empty bits. */ - while ((status = ac->ac_group_search(alloc_inode, group_bh, - bits_wanted, min_bits, - ac->ac_max_block, - res)) == -ENOSPC) { + while (1) { + if (ac->ac_which == OCFS2_AC_USE_MAIN_DISCONTIG) { + contig_bits = le16_to_cpu(bg->bg_contig_free_bits); + if (!contig_bits) + contig_bits = ocfs2_find_max_contig_free_bits(bg->bg_bitmap, + le16_to_cpu(bg->bg_bits), 0); + if (bits_wanted > contig_bits && contig_bits >= min_bits) + bits_wanted = contig_bits; + } + + status = ac->ac_group_search(alloc_inode, group_bh, + bits_wanted, min_bits, + ac->ac_max_block, res); + if (status != -ENOSPC) + break; if (!bg->bg_next_group) break; @@ -1982,6 +1996,7 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, victim = ocfs2_find_victim_chain(cl); ac->ac_chain = victim; +search: status = ocfs2_search_chain(ac, handle, bits_wanted, min_bits, res, &bits_left); if (!status) { @@ -2022,6 +2037,16 @@ static int ocfs2_claim_suballoc_bits(struct ocfs2_alloc_context *ac, } } + /* Chains can't supply the bits_wanted contiguous space. + * We should switch to using every single bit when allocating + * from the global bitmap. 
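The effect of the new OCFS2_AC_USE_MAIN_DISCONTIG branch is easiest to see in isolation: when a group cannot satisfy bits_wanted contiguously but its largest free run still meets min_bits, the request is clamped to that run and the remainder is claimed in later passes. A standalone, user-space illustration of just that clamping rule, where max_contig_free() is a hypothetical stand-in for ocfs2_find_max_contig_free_bits():

#include <stdio.h>

static unsigned max_contig_free(const unsigned char *bitmap, unsigned nbits)
{
        unsigned run = 0, best = 0;

        for (unsigned i = 0; i < nbits; i++) {
                if (bitmap[i / 8] & (1u << (i % 8)))    /* set == allocated */
                        run = 0;
                else if (++run > best)
                        best = run;
        }
        return best;
}

int main(void)
{
        /* 0 = free, 1 = used; longest free run here is 5 bits */
        unsigned char bg[] = { 0xA1, 0x07, 0x0F };
        unsigned bits_wanted = 12, min_bits = 1;
        unsigned contig = max_contig_free(bg, 24);

        if (bits_wanted > contig && contig >= min_bits)
                bits_wanted = contig;   /* take what the group can give */
        printf("contig=%u, clamped bits_wanted=%u\n", contig, bits_wanted);
        return 0;
}
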
*/ + if (i == le16_to_cpu(cl->cl_next_free_rec) && + status == -ENOSPC && ac->ac_which == OCFS2_AC_USE_MAIN) { + ac->ac_which = OCFS2_AC_USE_MAIN_DISCONTIG; + ac->ac_chain = victim; + goto search; + } + set_hint: if (status != -ENOSPC) { /* If the next search of this group is not likely to @@ -2365,7 +2390,8 @@ int __ocfs2_claim_clusters(handle_t *handle, BUG_ON(ac->ac_bits_given >= ac->ac_bits_wanted); BUG_ON(ac->ac_which != OCFS2_AC_USE_LOCAL - && ac->ac_which != OCFS2_AC_USE_MAIN); + && ac->ac_which != OCFS2_AC_USE_MAIN + && ac->ac_which != OCFS2_AC_USE_MAIN_DISCONTIG); if (ac->ac_which == OCFS2_AC_USE_LOCAL) { WARN_ON(min_clusters > 1); diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index b481b834857d..bcf2ed4a8631 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -29,6 +29,7 @@ struct ocfs2_alloc_context { #define OCFS2_AC_USE_MAIN 2 #define OCFS2_AC_USE_INODE 3 #define OCFS2_AC_USE_META 4 +#define OCFS2_AC_USE_MAIN_DISCONTIG 5 u32 ac_which; /* these are used by the chain search */ diff --git a/fs/ocfs2/super.c b/fs/ocfs2/super.c index 8bb5022f3082..3d2533950bae 100644 --- a/fs/ocfs2/super.c +++ b/fs/ocfs2/super.c @@ -1812,6 +1812,9 @@ static void ocfs2_dismount_volume(struct super_block *sb, int mnt_err) /* Orphan scan should be stopped as early as possible */ ocfs2_orphan_scan_stop(osb); + /* Stop quota recovery so that we can disable quotas */ + ocfs2_recovery_disable_quota(osb); + ocfs2_disable_quotas(osb); /* All dquots should be freed by now */ diff --git a/fs/omfs/dir.c b/fs/omfs/dir.c index 6bda275826d6..2ed541fccf33 100644 --- a/fs/omfs/dir.c +++ b/fs/omfs/dir.c @@ -279,10 +279,10 @@ out_free_inode: return err; } -static int omfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *omfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { - return omfs_add_node(dir, dentry, mode | S_IFDIR); + return ERR_PTR(omfs_add_node(dir, dentry, mode | S_IFDIR)); } static int omfs_create(struct mnt_idmap *idmap, struct inode *dir, diff --git a/fs/omfs/inode.c b/fs/omfs/inode.c index d6cd81163030..135c49c5d848 100644 --- a/fs/omfs/inode.c +++ b/fs/omfs/inode.c @@ -9,12 +9,13 @@ #include <linux/fs.h> #include <linux/vfs.h> #include <linux/cred.h> -#include <linux/parser.h> #include <linux/buffer_head.h> #include <linux/vmalloc.h> #include <linux/writeback.h> #include <linux/seq_file.h> #include <linux/crc-itu-t.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include "omfs.h" MODULE_AUTHOR("Bob Copeland <me@bobcopeland.com>"); @@ -384,79 +385,83 @@ nomem: return -ENOMEM; } +struct omfs_mount_options { + kuid_t s_uid; + kgid_t s_gid; + int s_dmask; + int s_fmask; +}; + enum { - Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask, Opt_err + Opt_uid, Opt_gid, Opt_umask, Opt_dmask, Opt_fmask, }; -static const match_table_t tokens = { - {Opt_uid, "uid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_umask, "umask=%o"}, - {Opt_dmask, "dmask=%o"}, - {Opt_fmask, "fmask=%o"}, - {Opt_err, NULL}, +static const struct fs_parameter_spec omfs_param_spec[] = { + fsparam_uid ("uid", Opt_uid), + fsparam_gid ("gid", Opt_gid), + fsparam_u32oct ("umask", Opt_umask), + fsparam_u32oct ("dmask", Opt_dmask), + fsparam_u32oct ("fmask", Opt_fmask), + {} }; -static int parse_options(char *options, struct omfs_sb_info *sbi) +static int +omfs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - - if (!options) - return 1; - - 
while ((p = strsep(&options, ",")) != NULL) { - int token; - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_uid: - if (match_int(&args[0], &option)) - return 0; - sbi->s_uid = make_kuid(current_user_ns(), option); - if (!uid_valid(sbi->s_uid)) - return 0; - break; - case Opt_gid: - if (match_int(&args[0], &option)) - return 0; - sbi->s_gid = make_kgid(current_user_ns(), option); - if (!gid_valid(sbi->s_gid)) - return 0; - break; - case Opt_umask: - if (match_octal(&args[0], &option)) - return 0; - sbi->s_fmask = sbi->s_dmask = option; - break; - case Opt_dmask: - if (match_octal(&args[0], &option)) - return 0; - sbi->s_dmask = option; - break; - case Opt_fmask: - if (match_octal(&args[0], &option)) - return 0; - sbi->s_fmask = option; - break; - default: - return 0; - } + struct omfs_mount_options *opts = fc->fs_private; + int token; + struct fs_parse_result result; + + /* All options are ignored on remount */ + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) + return 0; + + token = fs_parse(fc, omfs_param_spec, param, &result); + if (token < 0) + return token; + + switch (token) { + case Opt_uid: + opts->s_uid = result.uid; + break; + case Opt_gid: + opts->s_gid = result.gid; + break; + case Opt_umask: + opts->s_fmask = opts->s_dmask = result.uint_32; + break; + case Opt_dmask: + opts->s_dmask = result.uint_32; + break; + case Opt_fmask: + opts->s_fmask = result.uint_32; + break; + default: + return -EINVAL; } - return 1; + + return 0; } -static int omfs_fill_super(struct super_block *sb, void *data, int silent) +static void +omfs_set_options(struct omfs_sb_info *sbi, struct omfs_mount_options *opts) +{ + sbi->s_uid = opts->s_uid; + sbi->s_gid = opts->s_gid; + sbi->s_dmask = opts->s_dmask; + sbi->s_fmask = opts->s_fmask; +} + +static int omfs_fill_super(struct super_block *sb, struct fs_context *fc) { struct buffer_head *bh, *bh2; struct omfs_super_block *omfs_sb; struct omfs_root_block *omfs_rb; struct omfs_sb_info *sbi; struct inode *root; + struct omfs_mount_options *parsed_opts = fc->fs_private; int ret = -EINVAL; + int silent = fc->sb_flags & SB_SILENT; sbi = kzalloc(sizeof(struct omfs_sb_info), GFP_KERNEL); if (!sbi) @@ -464,12 +469,7 @@ static int omfs_fill_super(struct super_block *sb, void *data, int silent) sb->s_fs_info = sbi; - sbi->s_uid = current_uid(); - sbi->s_gid = current_gid(); - sbi->s_dmask = sbi->s_fmask = current_umask(); - - if (!parse_options((char *) data, sbi)) - goto end; + omfs_set_options(sbi, parsed_opts); sb->s_maxbytes = 0xffffffff; @@ -594,18 +594,50 @@ end: return ret; } -static struct dentry *omfs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int omfs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, omfs_fill_super); +} + +static void omfs_free_fc(struct fs_context *fc); + +static const struct fs_context_operations omfs_context_ops = { + .parse_param = omfs_parse_param, + .get_tree = omfs_get_tree, + .free = omfs_free_fc, +}; + +static int omfs_init_fs_context(struct fs_context *fc) +{ + struct omfs_mount_options *opts; + + opts = kzalloc(sizeof(*opts), GFP_KERNEL); + if (!opts) + return -ENOMEM; + + /* Set mount options defaults */ + opts->s_uid = current_uid(); + opts->s_gid = current_gid(); + opts->s_dmask = opts->s_fmask = current_umask(); + + fc->fs_private = opts; + fc->ops = &omfs_context_ops; + + return 0; +} + +static void omfs_free_fc(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, omfs_fill_super); + 
kfree(fc->fs_private); } static struct file_system_type omfs_fs_type = { - .owner = THIS_MODULE, - .name = "omfs", - .mount = omfs_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, + .owner = THIS_MODULE, + .name = "omfs", + .kill_sb = kill_block_super, + .fs_flags = FS_REQUIRES_DEV, + .init_fs_context = omfs_init_fs_context, + .parameters = omfs_param_spec, }; MODULE_ALIAS_FS("omfs"); diff --git a/fs/open.c b/fs/open.c index 1be20de9f283..7828234a7caa 100644 --- a/fs/open.c +++ b/fs/open.c @@ -60,18 +60,21 @@ int do_truncate(struct mnt_idmap *idmap, struct dentry *dentry, if (ret) newattrs.ia_valid |= ret | ATTR_FORCE; - inode_lock(dentry->d_inode); + ret = inode_lock_killable(dentry->d_inode); + if (ret) + return ret; + /* Note any delegations or leases have already been broken: */ ret = notify_change(idmap, dentry, &newattrs, NULL); inode_unlock(dentry->d_inode); return ret; } -long vfs_truncate(const struct path *path, loff_t length) +int vfs_truncate(const struct path *path, loff_t length) { struct mnt_idmap *idmap; struct inode *inode; - long error; + int error; inode = path->dentry->d_inode; @@ -123,7 +126,7 @@ mnt_drop_write_and_out: } EXPORT_SYMBOL_GPL(vfs_truncate); -long do_sys_truncate(const char __user *pathname, loff_t length) +int do_sys_truncate(const char __user *pathname, loff_t length) { unsigned int lookup_flags = LOOKUP_FOLLOW; struct path path; @@ -157,7 +160,7 @@ COMPAT_SYSCALL_DEFINE2(truncate, const char __user *, path, compat_off_t, length } #endif -long do_ftruncate(struct file *file, loff_t length, int small) +int do_ftruncate(struct file *file, loff_t length, int small) { struct inode *inode; struct dentry *dentry; @@ -196,7 +199,7 @@ long do_ftruncate(struct file *file, loff_t length, int small) return error; } -long do_sys_ftruncate(unsigned int fd, loff_t length, int small) +int do_sys_ftruncate(unsigned int fd, loff_t length, int small) { if (length < 0) return -EINVAL; @@ -251,7 +254,7 @@ COMPAT_SYSCALL_DEFINE3(ftruncate64, unsigned int, fd, int vfs_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { struct inode *inode = file_inode(file); - long ret; + int ret; loff_t sum; if (offset < 0 || len <= 0) @@ -460,7 +463,7 @@ static const struct cred *access_override_creds(void) return override_creds(override_cred); } -static long do_faccessat(int dfd, const char __user *filename, int mode, int flags) +static int do_faccessat(int dfd, const char __user *filename, int mode, int flags) { struct path path; struct inode *inode; @@ -635,7 +638,9 @@ int chmod_common(const struct path *path, umode_t mode) if (error) return error; retry_deleg: - inode_lock(inode); + error = inode_lock_killable(inode); + if (error) + goto out_mnt_unlock; error = security_path_chmod(path, mode); if (error) goto out_unlock; @@ -650,6 +655,7 @@ out_unlock: if (!error) goto retry_deleg; } +out_mnt_unlock: mnt_drop_write(path->mnt); return error; } @@ -769,7 +775,9 @@ retry_deleg: return -EINVAL; if ((group != (gid_t)-1) && !setattr_vfsgid(&newattrs, gid)) return -EINVAL; - inode_lock(inode); + error = inode_lock_killable(inode); + if (error) + return error; if (!S_ISDIR(inode->i_mode)) newattrs.ia_valid |= ATTR_KILL_SUID | ATTR_KILL_PRIV | setattr_should_drop_sgid(idmap, inode); @@ -1409,22 +1417,23 @@ struct file *file_open_root(const struct path *root, } EXPORT_SYMBOL(file_open_root); -static long do_sys_openat2(int dfd, const char __user *filename, - struct open_how *how) +static int do_sys_openat2(int dfd, const char __user *filename, + struct open_how 
*how) { struct open_flags op; - int fd = build_open_flags(how, &op); struct filename *tmp; + int err, fd; - if (fd) - return fd; + err = build_open_flags(how, &op); + if (unlikely(err)) + return err; tmp = getname(filename); if (IS_ERR(tmp)) return PTR_ERR(tmp); fd = get_unused_fd_flags(how->flags); - if (fd >= 0) { + if (likely(fd >= 0)) { struct file *f = do_filp_open(dfd, tmp, &op); if (IS_ERR(f)) { put_unused_fd(fd); @@ -1437,7 +1446,7 @@ static long do_sys_openat2(int dfd, const char __user *filename, return fd; } -long do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) +int do_sys_open(int dfd, const char __user *filename, int flags, umode_t mode) { struct open_how how = build_open_how(flags, mode); return do_sys_openat2(dfd, filename, &how); @@ -1551,7 +1560,7 @@ int filp_close(struct file *filp, fl_owner_t id) int retval; retval = filp_flush(filp, id); - fput(filp); + fput_close(filp); return retval; } @@ -1577,13 +1586,16 @@ SYSCALL_DEFINE1(close, unsigned int, fd) * We're returning to user space. Don't bother * with any delayed fput() cases. */ - __fput_sync(file); + fput_close_sync(file); + + if (likely(retval == 0)) + return 0; /* can't restart close syscall because file table entry was cleared */ - if (unlikely(retval == -ERESTARTSYS || - retval == -ERESTARTNOINTR || - retval == -ERESTARTNOHAND || - retval == -ERESTART_RESTARTBLOCK)) + if (retval == -ERESTARTSYS || + retval == -ERESTARTNOINTR || + retval == -ERESTARTNOHAND || + retval == -ERESTART_RESTARTBLOCK) retval = -EINTR; return retval; diff --git a/fs/orangefs/file.c b/fs/orangefs/file.c index d68372241b30..90c49c0de243 100644 --- a/fs/orangefs/file.c +++ b/fs/orangefs/file.c @@ -57,8 +57,8 @@ ssize_t wait_for_direct_io(enum ORANGEFS_io_type type, struct inode *inode, int buffer_index; ssize_t ret; size_t copy_amount; - int open_for_read; - int open_for_write; + bool open_for_read; + bool open_for_write; new_op = op_alloc(ORANGEFS_VFS_OP_FILE_IO); if (!new_op) diff --git a/fs/orangefs/inode.c b/fs/orangefs/inode.c index aae6d2b8767d..08a6f372a352 100644 --- a/fs/orangefs/inode.c +++ b/fs/orangefs/inode.c @@ -16,60 +16,50 @@ #include "orangefs-kernel.h" #include "orangefs-bufmap.h" -static int orangefs_writepage_locked(struct page *page, - struct writeback_control *wbc) +static int orangefs_writepage_locked(struct folio *folio, + struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = folio->mapping->host; struct orangefs_write_range *wr = NULL; struct iov_iter iter; struct bio_vec bv; - size_t len, wlen; + size_t wlen; ssize_t ret; - loff_t off; + loff_t len, off; - set_page_writeback(page); + folio_start_writeback(folio); len = i_size_read(inode); - if (PagePrivate(page)) { - wr = (struct orangefs_write_range *)page_private(page); - WARN_ON(wr->pos >= len); + if (folio->private) { + wr = folio->private; off = wr->pos; - if (off + wr->len > len) + if ((off + wr->len > len) && (off <= len)) wlen = len - off; else wlen = wr->len; + if (wlen == 0) + wlen = wr->len; } else { WARN_ON(1); - off = page_offset(page); - if (off + PAGE_SIZE > len) + off = folio_pos(folio); + wlen = folio_size(folio); + + if (wlen > len - off) wlen = len - off; - else - wlen = PAGE_SIZE; } - /* Should've been handled in orangefs_invalidate_folio. 
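The conversion above keeps orangefs's long-standing pattern of feeding page cache data to wait_for_direct_io() through a single-segment bio_vec iterator, just expressed in folio terms. The generic shape of that pattern, reduced to its essentials (write_folio_range() and the consuming step are placeholders):

static ssize_t write_folio_range(struct inode *inode, struct folio *folio,
                                 loff_t pos, size_t len)
{
        struct bio_vec bv;
        struct iov_iter iter;

        /* One bvec covering [pos, pos + len) inside the folio. */
        bvec_set_folio(&bv, folio, len, offset_in_folio(folio, pos));
        iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, len);

        /* A filesystem-specific helper would now consume @iter. */
        return len;
}
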
*/ - WARN_ON(off == len || off + wlen > len); WARN_ON(wlen == 0); - bvec_set_page(&bv, page, wlen, off % PAGE_SIZE); + bvec_set_folio(&bv, folio, wlen, offset_in_folio(folio, off)); iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, wlen); ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, wlen, len, wr, NULL, NULL); if (ret < 0) { - mapping_set_error(page->mapping, ret); + mapping_set_error(folio->mapping, ret); } else { ret = 0; } - kfree(detach_page_private(page)); - return ret; -} - -static int orangefs_writepage(struct page *page, struct writeback_control *wbc) -{ - int ret; - ret = orangefs_writepage_locked(page, wbc); - unlock_page(page); - end_page_writeback(page); + kfree(folio_detach_private(folio)); return ret; } @@ -79,33 +69,33 @@ struct orangefs_writepages { kuid_t uid; kgid_t gid; int maxpages; - int npages; - struct page **pages; + int nfolios; + struct address_space *mapping; + struct folio **folios; struct bio_vec *bv; }; static int orangefs_writepages_work(struct orangefs_writepages *ow, - struct writeback_control *wbc) + struct writeback_control *wbc) { - struct inode *inode = ow->pages[0]->mapping->host; + struct inode *inode = ow->mapping->host; struct orangefs_write_range *wrp, wr; struct iov_iter iter; ssize_t ret; - size_t len; - loff_t off; + size_t start; + loff_t len, off; int i; len = i_size_read(inode); - for (i = 0; i < ow->npages; i++) { - set_page_writeback(ow->pages[i]); - bvec_set_page(&ow->bv[i], ow->pages[i], - min(page_offset(ow->pages[i]) + PAGE_SIZE, - ow->off + ow->len) - - max(ow->off, page_offset(ow->pages[i])), - i == 0 ? ow->off - page_offset(ow->pages[i]) : 0); + start = offset_in_folio(ow->folios[0], ow->off); + for (i = 0; i < ow->nfolios; i++) { + folio_start_writeback(ow->folios[i]); + bvec_set_folio(&ow->bv[i], ow->folios[i], + folio_size(ow->folios[i]) - start, start); + start = 0; } - iov_iter_bvec(&iter, ITER_SOURCE, ow->bv, ow->npages, ow->len); + iov_iter_bvec(&iter, ITER_SOURCE, ow->bv, ow->nfolios, ow->len); WARN_ON(ow->off >= len); if (ow->off + ow->len > len) @@ -116,40 +106,24 @@ static int orangefs_writepages_work(struct orangefs_writepages *ow, wr.gid = ow->gid; ret = wait_for_direct_io(ORANGEFS_IO_WRITE, inode, &off, &iter, ow->len, 0, &wr, NULL, NULL); - if (ret < 0) { - for (i = 0; i < ow->npages; i++) { - mapping_set_error(ow->pages[i]->mapping, ret); - if (PagePrivate(ow->pages[i])) { - wrp = (struct orangefs_write_range *) - page_private(ow->pages[i]); - ClearPagePrivate(ow->pages[i]); - put_page(ow->pages[i]); - kfree(wrp); - } - end_page_writeback(ow->pages[i]); - unlock_page(ow->pages[i]); - } - } else { + if (ret < 0) + mapping_set_error(ow->mapping, ret); + else ret = 0; - for (i = 0; i < ow->npages; i++) { - if (PagePrivate(ow->pages[i])) { - wrp = (struct orangefs_write_range *) - page_private(ow->pages[i]); - ClearPagePrivate(ow->pages[i]); - put_page(ow->pages[i]); - kfree(wrp); - } - end_page_writeback(ow->pages[i]); - unlock_page(ow->pages[i]); - } + + for (i = 0; i < ow->nfolios; i++) { + wrp = folio_detach_private(ow->folios[i]); + kfree(wrp); + folio_end_writeback(ow->folios[i]); + folio_unlock(ow->folios[i]); } + return ret; } static int orangefs_writepages_callback(struct folio *folio, - struct writeback_control *wbc, void *data) + struct writeback_control *wbc, struct orangefs_writepages *ow) { - struct orangefs_writepages *ow = data; struct orangefs_write_range *wr = folio->private; int ret; @@ -162,41 +136,41 @@ static int orangefs_writepages_callback(struct folio *folio, } ret = -1; - if 
(ow->npages == 0) { + if (ow->nfolios == 0) { ow->off = wr->pos; ow->len = wr->len; ow->uid = wr->uid; ow->gid = wr->gid; - ow->pages[ow->npages++] = &folio->page; + ow->folios[ow->nfolios++] = folio; ret = 0; goto done; } if (!uid_eq(ow->uid, wr->uid) || !gid_eq(ow->gid, wr->gid)) { orangefs_writepages_work(ow, wbc); - ow->npages = 0; + ow->nfolios = 0; ret = -1; goto done; } if (ow->off + ow->len == wr->pos) { ow->len += wr->len; - ow->pages[ow->npages++] = &folio->page; + ow->folios[ow->nfolios++] = folio; ret = 0; goto done; } done: if (ret == -1) { - if (ow->npages) { + if (ow->nfolios) { orangefs_writepages_work(ow, wbc); - ow->npages = 0; + ow->nfolios = 0; } - ret = orangefs_writepage_locked(&folio->page, wbc); + ret = orangefs_writepage_locked(folio, wbc); mapping_set_error(folio->mapping, ret); folio_unlock(folio); folio_end_writeback(folio); } else { - if (ow->npages == ow->maxpages) { + if (ow->nfolios == ow->maxpages) { orangefs_writepages_work(ow, wbc); - ow->npages = 0; + ow->nfolios = 0; } } return ret; @@ -207,31 +181,35 @@ static int orangefs_writepages(struct address_space *mapping, { struct orangefs_writepages *ow; struct blk_plug plug; - int ret; + int error; + struct folio *folio = NULL; + ow = kzalloc(sizeof(struct orangefs_writepages), GFP_KERNEL); if (!ow) return -ENOMEM; ow->maxpages = orangefs_bufmap_size_query()/PAGE_SIZE; - ow->pages = kcalloc(ow->maxpages, sizeof(struct page *), GFP_KERNEL); - if (!ow->pages) { + ow->folios = kcalloc(ow->maxpages, sizeof(struct folio *), GFP_KERNEL); + if (!ow->folios) { kfree(ow); return -ENOMEM; } ow->bv = kcalloc(ow->maxpages, sizeof(struct bio_vec), GFP_KERNEL); if (!ow->bv) { - kfree(ow->pages); + kfree(ow->folios); kfree(ow); return -ENOMEM; } + ow->mapping = mapping; blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, orangefs_writepages_callback, ow); - if (ow->npages) - ret = orangefs_writepages_work(ow, wbc); + while ((folio = writeback_iter(mapping, wbc, folio, &error))) + error = orangefs_writepages_callback(folio, wbc, ow); + if (ow->nfolios) + error = orangefs_writepages_work(ow, wbc); blk_finish_plug(&plug); - kfree(ow->pages); + kfree(ow->folios); kfree(ow->bv); kfree(ow); - return ret; + return error; } static int orangefs_launder_folio(struct folio *); @@ -341,6 +319,8 @@ static int orangefs_write_begin(struct file *file, wr->len += len; goto okay; } else { + wr->pos = pos; + wr->len = len; ret = orangefs_launder_folio(folio); if (ret) return ret; @@ -484,7 +464,7 @@ static int orangefs_launder_folio(struct folio *folio) }; folio_wait_writeback(folio); if (folio_clear_dirty_for_io(folio)) { - r = orangefs_writepage_locked(&folio->page, &wbc); + r = orangefs_writepage_locked(folio, &wbc); folio_end_writeback(folio); } return r; @@ -606,7 +586,6 @@ out: /** ORANGEFS2 implementation of address space operations */ static const struct address_space_operations orangefs_address_operations = { - .writepage = orangefs_writepage, .readahead = orangefs_readahead, .read_folio = orangefs_read_folio, .writepages = orangefs_writepages, @@ -616,6 +595,7 @@ static const struct address_space_operations orangefs_address_operations = { .invalidate_folio = orangefs_invalidate_folio, .release_folio = orangefs_release_folio, .free_folio = orangefs_free_folio, + .migrate_folio = filemap_migrate_folio, .launder_folio = orangefs_launder_folio, .direct_IO = orangefs_direct_IO, }; diff --git a/fs/orangefs/namei.c b/fs/orangefs/namei.c index 200558ec72f0..82395fe2b956 100644 --- a/fs/orangefs/namei.c +++ b/fs/orangefs/namei.c 
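The orangefs_writepages() rewrite above also swaps the write_cache_pages() callback API for the newer writeback_iter() loop. The general shape of that idiom, for reference — writeout_one_folio() is a hypothetical per-folio helper that must unlock the folio when it is done:

static int example_writepages(struct address_space *mapping,
                              struct writeback_control *wbc)
{
        struct folio *folio = NULL;
        int error = 0;

        /*
         * writeback_iter() hands back locked, dirty-cleared folios and
         * keeps its position in @wbc; feeding @error back in lets it
         * decide whether to continue the walk.
         */
        while ((folio = writeback_iter(mapping, wbc, folio, &error)))
                error = writeout_one_folio(mapping, wbc, folio);

        return error;
}
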
@@ -300,8 +300,8 @@ out: return ret; } -static int orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct orangefs_inode_s *parent = ORANGEFS_I(dir); struct orangefs_kernel_op_s *new_op; @@ -312,7 +312,7 @@ static int orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir, new_op = op_alloc(ORANGEFS_VFS_OP_MKDIR); if (!new_op) - return -ENOMEM; + return ERR_PTR(-ENOMEM); new_op->upcall.req.mkdir.parent_refn = parent->refn; @@ -366,7 +366,7 @@ static int orangefs_mkdir(struct mnt_idmap *idmap, struct inode *dir, __orangefs_setattr(dir, &iattr); out: op_release(new_op); - return ret; + return ERR_PTR(ret); } static int orangefs_rename(struct mnt_idmap *idmap, diff --git a/fs/orangefs/orangefs-bufmap.c b/fs/orangefs/orangefs-bufmap.c index edcca4beb765..b562d3dbc76b 100644 --- a/fs/orangefs/orangefs-bufmap.c +++ b/fs/orangefs/orangefs-bufmap.c @@ -197,18 +197,6 @@ int orangefs_bufmap_size_query(void) return size; } -int orangefs_bufmap_shift_query(void) -{ - struct orangefs_bufmap *bufmap; - int shift = 0; - spin_lock(&orangefs_bufmap_lock); - bufmap = __orangefs_bufmap; - if (bufmap) - shift = bufmap->desc_shift; - spin_unlock(&orangefs_bufmap_lock); - return shift; -} - static DECLARE_WAIT_QUEUE_HEAD(bufmap_waitq); static DECLARE_WAIT_QUEUE_HEAD(readdir_waitq); @@ -532,16 +520,3 @@ int orangefs_bufmap_copy_to_iovec(struct iov_iter *iter, } return 0; } - -void orangefs_bufmap_page_fill(void *page_to, - int buffer_index, - int slot_index) -{ - struct orangefs_bufmap_desc *from; - void *page_from; - - from = &__orangefs_bufmap->desc_array[buffer_index]; - page_from = kmap_atomic(from->page_array[slot_index]); - memcpy(page_to, page_from, PAGE_SIZE); - kunmap_atomic(page_from); -} diff --git a/fs/orangefs/orangefs-bufmap.h b/fs/orangefs/orangefs-bufmap.h index 75b2d2833af1..4231175ccdb2 100644 --- a/fs/orangefs/orangefs-bufmap.h +++ b/fs/orangefs/orangefs-bufmap.h @@ -10,8 +10,6 @@ int orangefs_bufmap_size_query(void); -int orangefs_bufmap_shift_query(void); - int orangefs_bufmap_initialize(struct ORANGEFS_dev_map_desc *user_desc); void orangefs_bufmap_finalize(void); @@ -34,6 +32,5 @@ int orangefs_bufmap_copy_to_iovec(struct iov_iter *iter, int buffer_index, size_t size); -void orangefs_bufmap_page_fill(void *kaddr, int buffer_index, int slot_index); #endif /* __ORANGEFS_BUFMAP_H */ diff --git a/fs/orangefs/orangefs-debug.h b/fs/orangefs/orangefs-debug.h index 6e079d4230d0..d4463534cec6 100644 --- a/fs/orangefs/orangefs-debug.h +++ b/fs/orangefs/orangefs-debug.h @@ -43,47 +43,4 @@ #define GOSSIP_MAX_NR 16 #define GOSSIP_MAX_DEBUG (((__u64)1 << GOSSIP_MAX_NR) - 1) -/* a private internal type */ -struct __keyword_mask_s { - const char *keyword; - __u64 mask_val; -}; - -/* - * Map all kmod keywords to kmod debug masks here. Keep this - * structure "packed": - * - * "all" is always last... - * - * keyword mask_val index - * foo 1 0 - * bar 2 1 - * baz 4 2 - * qux 8 3 - * . . . 
- */ -static struct __keyword_mask_s s_kmod_keyword_mask_map[] = { - {"super", GOSSIP_SUPER_DEBUG}, - {"inode", GOSSIP_INODE_DEBUG}, - {"file", GOSSIP_FILE_DEBUG}, - {"dir", GOSSIP_DIR_DEBUG}, - {"utils", GOSSIP_UTILS_DEBUG}, - {"wait", GOSSIP_WAIT_DEBUG}, - {"acl", GOSSIP_ACL_DEBUG}, - {"dcache", GOSSIP_DCACHE_DEBUG}, - {"dev", GOSSIP_DEV_DEBUG}, - {"name", GOSSIP_NAME_DEBUG}, - {"bufmap", GOSSIP_BUFMAP_DEBUG}, - {"cache", GOSSIP_CACHE_DEBUG}, - {"debugfs", GOSSIP_DEBUGFS_DEBUG}, - {"xattr", GOSSIP_XATTR_DEBUG}, - {"init", GOSSIP_INIT_DEBUG}, - {"sysfs", GOSSIP_SYSFS_DEBUG}, - {"none", GOSSIP_NO_DEBUG}, - {"all", GOSSIP_MAX_DEBUG} -}; - -static const int num_kmod_keyword_mask_map = (int) - (ARRAY_SIZE(s_kmod_keyword_mask_map)); - #endif /* __ORANGEFS_DEBUG_H */ diff --git a/fs/orangefs/orangefs-debugfs.c b/fs/orangefs/orangefs-debugfs.c index f52073022fae..f7095c91660c 100644 --- a/fs/orangefs/orangefs-debugfs.c +++ b/fs/orangefs/orangefs-debugfs.c @@ -44,6 +44,49 @@ #include "protocol.h" #include "orangefs-kernel.h" +/* a private internal type */ +struct __keyword_mask_s { + const char *keyword; + __u64 mask_val; +}; + +/* + * Map all kmod keywords to kmod debug masks here. Keep this + * structure "packed": + * + * "all" is always last... + * + * keyword mask_val index + * foo 1 0 + * bar 2 1 + * baz 4 2 + * qux 8 3 + * . . . + */ +static struct __keyword_mask_s s_kmod_keyword_mask_map[] = { + {"super", GOSSIP_SUPER_DEBUG}, + {"inode", GOSSIP_INODE_DEBUG}, + {"file", GOSSIP_FILE_DEBUG}, + {"dir", GOSSIP_DIR_DEBUG}, + {"utils", GOSSIP_UTILS_DEBUG}, + {"wait", GOSSIP_WAIT_DEBUG}, + {"acl", GOSSIP_ACL_DEBUG}, + {"dcache", GOSSIP_DCACHE_DEBUG}, + {"dev", GOSSIP_DEV_DEBUG}, + {"name", GOSSIP_NAME_DEBUG}, + {"bufmap", GOSSIP_BUFMAP_DEBUG}, + {"cache", GOSSIP_CACHE_DEBUG}, + {"debugfs", GOSSIP_DEBUGFS_DEBUG}, + {"xattr", GOSSIP_XATTR_DEBUG}, + {"init", GOSSIP_INIT_DEBUG}, + {"sysfs", GOSSIP_SYSFS_DEBUG}, + {"none", GOSSIP_NO_DEBUG}, + {"all", GOSSIP_MAX_DEBUG} +}; + +static const int num_kmod_keyword_mask_map = (int) + (ARRAY_SIZE(s_kmod_keyword_mask_map)); + #define DEBUG_HELP_STRING_SIZE 4096 #define HELP_STRING_UNINITIALIZED \ "Client Debug Keywords are unknown until the first time\n" \ diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 3d4b883a7660..3e153c2f6b82 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -32,6 +32,8 @@ #include <linux/slab.h> #include <linux/types.h> #include <linux/fs.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/vmalloc.h> #include <linux/aio.h> @@ -328,11 +330,9 @@ void purge_waiting_ops(void); * defined in super.c */ extern uint64_t orangefs_features; +extern const struct fs_parameter_spec orangefs_fs_param_spec[]; -struct dentry *orangefs_mount(struct file_system_type *fst, - int flags, - const char *devname, - void *data); +int orangefs_init_fs_context(struct fs_context *fc); void orangefs_kill_sb(struct super_block *sb); int orangefs_remount(struct orangefs_sb_info_s *); diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c index 5ab741c60b7e..7ac16a4d2dc6 100644 --- a/fs/orangefs/orangefs-mod.c +++ b/fs/orangefs/orangefs-mod.c @@ -46,7 +46,8 @@ MODULE_PARM_DESC(hash_table_size, static struct file_system_type orangefs_fs_type = { .name = "pvfs2", - .mount = orangefs_mount, + .init_fs_context = orangefs_init_fs_context, + .parameters = orangefs_fs_param_spec, .kill_sb = orangefs_kill_sb, .owner = THIS_MODULE, }; diff --git a/fs/orangefs/super.c 
b/fs/orangefs/super.c index eba3e357192e..64ca9498f550 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -9,7 +9,6 @@ #include "orangefs-kernel.h" #include "orangefs-bufmap.h" -#include <linux/parser.h> #include <linux/hashtable.h> #include <linux/seq_file.h> @@ -22,18 +21,16 @@ LIST_HEAD(orangefs_superblocks); DEFINE_SPINLOCK(orangefs_superblocks_lock); enum { - Opt_intr, Opt_acl, + Opt_intr, Opt_local_lock, - - Opt_err }; -static const match_table_t tokens = { - { Opt_acl, "acl" }, - { Opt_intr, "intr" }, - { Opt_local_lock, "local_lock" }, - { Opt_err, NULL } +const struct fs_parameter_spec orangefs_fs_param_spec[] = { + fsparam_flag ("acl", Opt_acl), + fsparam_flag ("intr", Opt_intr), + fsparam_flag ("local_lock", Opt_local_lock), + {} }; uint64_t orangefs_features; @@ -51,48 +48,30 @@ static int orangefs_show_options(struct seq_file *m, struct dentry *root) return 0; } -static int parse_mount_options(struct super_block *sb, char *options, - int silent) +static int orangefs_parse_param(struct fs_context *fc, + struct fs_parameter *param) { - struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(sb); - substring_t args[MAX_OPT_ARGS]; - char *p; - - /* - * Force any potential flags that might be set from the mount - * to zero, ie, initialize to unset. - */ - sb->s_flags &= ~SB_POSIXACL; - orangefs_sb->flags &= ~ORANGEFS_OPT_INTR; - orangefs_sb->flags &= ~ORANGEFS_OPT_LOCAL_LOCK; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_acl: - sb->s_flags |= SB_POSIXACL; - break; - case Opt_intr: - orangefs_sb->flags |= ORANGEFS_OPT_INTR; - break; - case Opt_local_lock: - orangefs_sb->flags |= ORANGEFS_OPT_LOCAL_LOCK; - break; - default: - goto fail; - } + struct orangefs_sb_info_s *orangefs_sb = fc->s_fs_info; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, orangefs_fs_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_acl: + fc->sb_flags |= SB_POSIXACL; + break; + case Opt_intr: + orangefs_sb->flags |= ORANGEFS_OPT_INTR; + break; + case Opt_local_lock: + orangefs_sb->flags |= ORANGEFS_OPT_LOCAL_LOCK; + break; } return 0; -fail: - if (!silent) - gossip_err("Error: mount option [%s] is not supported.\n", p); - return -EINVAL; } static void orangefs_inode_cache_ctor(void *req) @@ -223,10 +202,20 @@ out_op_release: * Remount as initiated by VFS layer. We just need to reparse the mount * options, no need to signal pvfs2-client-core about it. 
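Converting omfs and orangefs to fs_context, as done here, is what lets the new mount API drive them. From user space that looks roughly like the following, assuming a libc that exposes the fsopen(2)-family syscall numbers; the server address is a made-up example and error handling is trimmed:

#include <fcntl.h>
#include <unistd.h>
#include <sys/syscall.h>
#include <linux/mount.h>

int mount_pvfs2(void)
{
        int fsfd, mntfd;

        fsfd = syscall(SYS_fsopen, "pvfs2", 0);
        if (fsfd < 0)
                return -1;
        /* "source" and "intr" end up in orangefs_parse_param() above. */
        syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_STRING, "source",
                "tcp://server:3334/orangefs", 0);
        syscall(SYS_fsconfig, fsfd, FSCONFIG_SET_FLAG, "intr", NULL, 0);
        syscall(SYS_fsconfig, fsfd, FSCONFIG_CMD_CREATE, NULL, NULL, 0);
        mntfd = syscall(SYS_fsmount, fsfd, 0, 0);
        syscall(SYS_move_mount, mntfd, "", AT_FDCWD, "/mnt",
                MOVE_MOUNT_F_EMPTY_PATH);
        close(mntfd);
        close(fsfd);
        return 0;
}
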
*/ -static int orangefs_remount_fs(struct super_block *sb, int *flags, char *data) +static int orangefs_reconfigure(struct fs_context *fc) { - gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_remount_fs: called\n"); - return parse_mount_options(sb, data, 1); + struct super_block *sb = fc->root->d_sb; + struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(sb); + struct orangefs_sb_info_s *revised = fc->s_fs_info; + unsigned int flags; + + flags = orangefs_sb->flags; + flags &= ~(ORANGEFS_OPT_INTR | ORANGEFS_OPT_LOCAL_LOCK); + flags |= revised->flags; + WRITE_ONCE(orangefs_sb->flags, flags); + + gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_reconfigure: called\n"); + return 0; } /* @@ -319,7 +308,6 @@ static const struct super_operations orangefs_s_ops = { .write_inode = orangefs_write_inode, .drop_inode = generic_delete_inode, .statfs = orangefs_statfs, - .remount_fs = orangefs_remount_fs, .show_options = orangefs_show_options, }; @@ -410,8 +398,8 @@ static int orangefs_unmount(int id, __s32 fs_id, const char *devname) } static int orangefs_fill_sb(struct super_block *sb, - struct orangefs_fs_mount_response *fs_mount, - void *data, int silent) + struct fs_context *fc, + struct orangefs_fs_mount_response *fs_mount) { int ret; struct inode *root; @@ -424,12 +412,6 @@ static int orangefs_fill_sb(struct super_block *sb, ORANGEFS_SB(sb)->fs_id = fs_mount->fs_id; ORANGEFS_SB(sb)->id = fs_mount->id; - if (data) { - ret = parse_mount_options(sb, data, silent); - if (ret) - return ret; - } - /* Hang the xattr handlers off the superblock */ sb->s_xattr = orangefs_xattr_handlers; sb->s_magic = ORANGEFS_SUPER_MAGIC; @@ -470,30 +452,24 @@ static int orangefs_fill_sb(struct super_block *sb, return 0; } -struct dentry *orangefs_mount(struct file_system_type *fst, - int flags, - const char *devname, - void *data) +static int orangefs_get_tree(struct fs_context *fc) { int ret; struct super_block *sb = ERR_PTR(-EINVAL); struct orangefs_kernel_op_s *new_op; - struct dentry *d = ERR_PTR(-EINVAL); + + if (!fc->source) + return invalf(fc, "Device name not specified.\n"); gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_mount: called with devname %s\n", - devname); - - if (!devname) { - gossip_err("ERROR: device name not specified.\n"); - return ERR_PTR(-EINVAL); - } + fc->source); new_op = op_alloc(ORANGEFS_VFS_OP_FS_MOUNT); if (!new_op) - return ERR_PTR(-ENOMEM); + return -ENOMEM; - strscpy(new_op->upcall.req.fs_mount.orangefs_config_server, devname); + strscpy(new_op->upcall.req.fs_mount.orangefs_config_server, fc->source); gossip_debug(GOSSIP_SUPER_DEBUG, "Attempting ORANGEFS Mount via host %s\n", @@ -511,37 +487,27 @@ struct dentry *orangefs_mount(struct file_system_type *fst, goto free_op; } - sb = sget(fst, NULL, set_anon_super, flags, NULL); + sb = sget_fc(fc, NULL, set_anon_super_fc); if (IS_ERR(sb)) { - d = ERR_CAST(sb); + ret = PTR_ERR(sb); orangefs_unmount(new_op->downcall.resp.fs_mount.id, - new_op->downcall.resp.fs_mount.fs_id, devname); - goto free_op; - } - - /* alloc and init our private orangefs sb info */ - sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL); - if (!ORANGEFS_SB(sb)) { - d = ERR_PTR(-ENOMEM); + new_op->downcall.resp.fs_mount.fs_id, + fc->source); goto free_op; } - ret = orangefs_fill_sb(sb, - &new_op->downcall.resp.fs_mount, data, - flags & SB_SILENT ? 
1 : 0); + /* init our private orangefs sb info */ + ret = orangefs_fill_sb(sb, fc, &new_op->downcall.resp.fs_mount); - if (ret) { - d = ERR_PTR(ret); + if (ret) goto free_sb_and_op; - } /* * on successful mount, store the devname and data * used */ - strscpy(ORANGEFS_SB(sb)->devname, devname); - + strscpy(ORANGEFS_SB(sb)->devname, fc->source); /* mount_pending must be cleared */ ORANGEFS_SB(sb)->mount_pending = 0; @@ -564,7 +530,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst, if (orangefs_userspace_version >= 20906) { new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES); if (!new_op) - return ERR_PTR(-ENOMEM); + return -ENOMEM; new_op->upcall.req.features.features = 0; ret = service_operation(new_op, "orangefs_features", 0); orangefs_features = new_op->downcall.resp.features.features; @@ -573,7 +539,8 @@ struct dentry *orangefs_mount(struct file_system_type *fst, orangefs_features = 0; } - return dget(sb->s_root); + fc->root = dget(sb->s_root); + return 0; free_sb_and_op: /* Will call orangefs_kill_sb with sb not in list. */ @@ -589,7 +556,43 @@ free_op: op_release(new_op); - return d; + return ret; +} + +static void orangefs_free_fc(struct fs_context *fc) +{ + kfree(fc->s_fs_info); +} + +static const struct fs_context_operations orangefs_context_ops = { + .free = orangefs_free_fc, + .parse_param = orangefs_parse_param, + .get_tree = orangefs_get_tree, + .reconfigure = orangefs_reconfigure, +}; + +/* + * Set up the filesystem mount context. + */ +int orangefs_init_fs_context(struct fs_context *fc) +{ + struct orangefs_sb_info_s *osi; + + osi = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL); + if (!osi) + return -ENOMEM; + + /* + * Force any potential flags that might be set from the mount + * to zero, ie, initialize to unset. + */ + fc->sb_flags_mask &= ~SB_POSIXACL; + osi->flags &= ~ORANGEFS_OPT_INTR; + osi->flags &= ~ORANGEFS_OPT_LOCAL_LOCK; + + fc->s_fs_info = osi; + fc->ops = &orangefs_context_ops; + return 0; } void orangefs_kill_sb(struct super_block *sb) diff --git a/fs/overlayfs/copy_up.c b/fs/overlayfs/copy_up.c index 0c28e5fa3407..d7310fcf3888 100644 --- a/fs/overlayfs/copy_up.c +++ b/fs/overlayfs/copy_up.c @@ -618,7 +618,6 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) err = PTR_ERR(upper); if (!IS_ERR(upper)) { err = ovl_do_link(ofs, ovl_dentry_upper(c->dentry), udir, upper); - dput(upper); if (!err) { /* Restore timestamps on parent (best effort) */ @@ -626,6 +625,7 @@ static int ovl_link_up(struct ovl_copy_up_ctx *c) ovl_dentry_set_upper_alias(c->dentry); ovl_dentry_update_reval(c->dentry, upper); } + dput(upper); } inode_unlock(udir); if (err) diff --git a/fs/overlayfs/dir.c b/fs/overlayfs/dir.c index c9993ff66fc2..fe493f3ed6b6 100644 --- a/fs/overlayfs/dir.c +++ b/fs/overlayfs/dir.c @@ -138,37 +138,6 @@ kill_whiteout: goto out; } -int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir, - struct dentry **newdentry, umode_t mode) -{ - int err; - struct dentry *d, *dentry = *newdentry; - - err = ovl_do_mkdir(ofs, dir, dentry, mode); - if (err) - return err; - - if (likely(!d_unhashed(dentry))) - return 0; - - /* - * vfs_mkdir() may succeed and leave the dentry passed - * to it unhashed and negative. If that happens, try to - * lookup a new hashed and positive dentry. 
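ovl_mkdir_real() can go away because vfs_mkdir() now owns the "mkdir left an unhashed dentry" problem itself and, like every ->mkdir conversion in this series, reports the result as a dentry pointer. The contract for an adopter, in minimal form (the foofs_* names are hypothetical):

static struct dentry *foofs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
                                  struct dentry *dentry, umode_t mode)
{
        int err = foofs_do_mkdir(dir, dentry, mode);    /* hypothetical */

        /*
         * ERR_PTR(0) is NULL: success, and the caller keeps using
         * @dentry. A filesystem may instead return a different,
         * positive dentry for the new directory.
         */
        return ERR_PTR(err);
}
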
- */ - d = ovl_lookup_upper(ofs, dentry->d_name.name, dentry->d_parent, - dentry->d_name.len); - if (IS_ERR(d)) { - pr_warn("failed lookup after mkdir (%pd2, err=%i).\n", - dentry, err); - return PTR_ERR(d); - } - dput(dentry); - *newdentry = d; - - return 0; -} - struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir, struct dentry *newdentry, struct ovl_cattr *attr) { @@ -191,7 +160,8 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir, case S_IFDIR: /* mkdir is special... */ - err = ovl_mkdir_real(ofs, dir, &newdentry, attr->mode); + newdentry = ovl_do_mkdir(ofs, dir, newdentry, attr->mode); + err = PTR_ERR_OR_ZERO(newdentry); break; case S_IFCHR: @@ -219,7 +189,8 @@ struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir, } out: if (err) { - dput(newdentry); + if (!IS_ERR(newdentry)) + dput(newdentry); return ERR_PTR(err); } return newdentry; @@ -282,7 +253,8 @@ static int ovl_instantiate(struct dentry *dentry, struct inode *inode, * XXX: if we ever use ovl_obtain_alias() to decode directory * file handles, need to use ovl_get_inode_locked() and * d_instantiate_new() here to prevent from creating two - * hashed directory inode aliases. + * hashed directory inode aliases. We then need to return + * the obtained alias to ovl_mkdir(). */ inode = ovl_get_inode(dentry->d_sb, &oip); if (IS_ERR(inode)) @@ -687,10 +659,10 @@ static int ovl_create(struct mnt_idmap *idmap, struct inode *dir, return ovl_create_object(dentry, (mode & 07777) | S_IFREG, 0, NULL); } -static int ovl_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *ovl_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { - return ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL); + return ERR_PTR(ovl_create_object(dentry, (mode & 07777) | S_IFDIR, 0, NULL)); } static int ovl_mknod(struct mnt_idmap *idmap, struct inode *dir, diff --git a/fs/overlayfs/export.c b/fs/overlayfs/export.c index 444aeeccb6da..83f80fdb1567 100644 --- a/fs/overlayfs/export.c +++ b/fs/overlayfs/export.c @@ -385,11 +385,9 @@ static struct dentry *ovl_lookup_real_one(struct dentry *connected, */ take_dentry_name_snapshot(&name, real); /* - * No idmap handling here: it's an internal lookup. Could skip - * permission checking altogether, but for now just use non-idmap - * transformed ids. + * No idmap handling here: it's an internal lookup. 
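lookup_one_len(), which took a bare name and length, is replaced across this series by qstr-based helpers; lookup_noperm() skips the permission check entirely, which is what the old comment was explaining away. The call-site pattern, assuming the caller already holds the parent's i_rwsem:

static struct dentry *find_child_locked(struct dentry *parent,
                                        const char *name, int len)
{
        /*
         * QSTR_LEN() builds a struct qstr on the fly; no permission
         * check is performed against @parent.
         */
        return lookup_noperm(&QSTR_LEN(name, len), parent);
}
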
*/ - this = lookup_one_len(name.name.name, connected, name.name.len); + this = lookup_noperm(&name.name, connected); release_dentry_name_snapshot(&name); err = PTR_ERR(this); if (IS_ERR(this)) { diff --git a/fs/overlayfs/namei.c b/fs/overlayfs/namei.c index be5c65d6f848..bf722daf19a9 100644 --- a/fs/overlayfs/namei.c +++ b/fs/overlayfs/namei.c @@ -205,8 +205,8 @@ static struct dentry *ovl_lookup_positive_unlocked(struct ovl_lookup_data *d, struct dentry *base, int len, bool drop_negative) { - struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->layer->mnt), name, - base, len); + struct dentry *ret = lookup_one_unlocked(mnt_idmap(d->layer->mnt), + &QSTR_LEN(name, len), base); if (!IS_ERR(ret) && d_flags_negative(smp_load_acquire(&ret->d_flags))) { if (drop_negative && ret->d_lockref.count == 1) { @@ -757,7 +757,7 @@ struct dentry *ovl_get_index_fh(struct ovl_fs *ofs, struct ovl_fh *fh) if (err) return ERR_PTR(err); - index = lookup_positive_unlocked(name.name, ofs->workdir, name.len); + index = lookup_noperm_positive_unlocked(&name, ofs->workdir); kfree(name.name); if (IS_ERR(index)) { if (PTR_ERR(index) == -ENOENT) @@ -789,8 +789,8 @@ struct dentry *ovl_lookup_index(struct ovl_fs *ofs, struct dentry *upper, if (err) return ERR_PTR(err); - index = lookup_one_positive_unlocked(ovl_upper_mnt_idmap(ofs), name.name, - ofs->workdir, name.len); + index = lookup_one_positive_unlocked(ovl_upper_mnt_idmap(ofs), &name, + ofs->workdir); if (IS_ERR(index)) { err = PTR_ERR(index); if (err == -ENOENT) { @@ -1371,7 +1371,7 @@ out: bool ovl_lower_positive(struct dentry *dentry) { struct ovl_entry *poe = OVL_E(dentry->d_parent); - const struct qstr *name = &dentry->d_name; + struct qstr *name = &dentry->d_name; const struct cred *old_cred; unsigned int i; bool positive = false; @@ -1396,7 +1396,7 @@ bool ovl_lower_positive(struct dentry *dentry) this = lookup_one_positive_unlocked( mnt_idmap(parentpath->layer->mnt), - name->name, parentpath->dentry, name->len); + name, parentpath->dentry); if (IS_ERR(this)) { switch (PTR_ERR(this)) { case -ENOENT: diff --git a/fs/overlayfs/overlayfs.h b/fs/overlayfs/overlayfs.h index 0021e2025020..8baaba0a3fe5 100644 --- a/fs/overlayfs/overlayfs.h +++ b/fs/overlayfs/overlayfs.h @@ -241,13 +241,14 @@ static inline int ovl_do_create(struct ovl_fs *ofs, return err; } -static inline int ovl_do_mkdir(struct ovl_fs *ofs, - struct inode *dir, struct dentry *dentry, - umode_t mode) +static inline struct dentry *ovl_do_mkdir(struct ovl_fs *ofs, + struct inode *dir, + struct dentry *dentry, + umode_t mode) { - int err = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode); - pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, err); - return err; + dentry = vfs_mkdir(ovl_upper_mnt_idmap(ofs), dir, dentry, mode); + pr_debug("mkdir(%pd2, 0%o) = %i\n", dentry, mode, PTR_ERR_OR_ZERO(dentry)); + return dentry; } static inline int ovl_do_mknod(struct ovl_fs *ofs, @@ -401,7 +402,7 @@ static inline struct dentry *ovl_lookup_upper(struct ovl_fs *ofs, const char *name, struct dentry *base, int len) { - return lookup_one(ovl_upper_mnt_idmap(ofs), name, base, len); + return lookup_one(ovl_upper_mnt_idmap(ofs), &QSTR_LEN(name, len), base); } static inline bool ovl_open_flags_need_copy_up(int flags) @@ -540,8 +541,6 @@ int ovl_set_metacopy_xattr(struct ovl_fs *ofs, struct dentry *d, bool ovl_is_metacopy_dentry(struct dentry *dentry); char *ovl_get_redirect_xattr(struct ovl_fs *ofs, const struct path *path, int padding); int ovl_ensure_verity_loaded(struct path *path); -int 
ovl_get_verity_xattr(struct ovl_fs *ofs, const struct path *path, - u8 *digest_buf, int *buf_length); int ovl_validate_verity(struct ovl_fs *ofs, struct path *metapath, struct path *datapath); @@ -838,8 +837,6 @@ struct ovl_cattr { #define OVL_CATTR(m) (&(struct ovl_cattr) { .mode = (m) }) -int ovl_mkdir_real(struct ovl_fs *ofs, struct inode *dir, - struct dentry **newdentry, umode_t mode); struct dentry *ovl_create_real(struct ovl_fs *ofs, struct inode *dir, struct dentry *newdentry, struct ovl_cattr *attr); diff --git a/fs/overlayfs/params.c b/fs/overlayfs/params.c index 1115c22deca0..6759f7d040c8 100644 --- a/fs/overlayfs/params.c +++ b/fs/overlayfs/params.c @@ -59,6 +59,7 @@ enum ovl_opt { Opt_metacopy, Opt_verity, Opt_volatile, + Opt_override_creds, }; static const struct constant_table ovl_parameter_bool[] = { @@ -155,6 +156,7 @@ const struct fs_parameter_spec ovl_parameter_spec[] = { fsparam_enum("metacopy", Opt_metacopy, ovl_parameter_bool), fsparam_enum("verity", Opt_verity, ovl_parameter_verity), fsparam_flag("volatile", Opt_volatile), + fsparam_flag_no("override_creds", Opt_override_creds), {} }; @@ -662,6 +664,29 @@ static int ovl_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_userxattr: config->userxattr = true; break; + case Opt_override_creds: { + const struct cred *cred = NULL; + + if (result.negated) { + swap(cred, ofs->creator_cred); + put_cred(cred); + break; + } + + if (!current_in_userns(fc->user_ns)) { + err = -EINVAL; + break; + } + + cred = prepare_creds(); + if (cred) + swap(cred, ofs->creator_cred); + else + err = -ENOMEM; + + put_cred(cred); + break; + } default: pr_err("unrecognized mount option \"%s\" or missing value\n", param->key); diff --git a/fs/overlayfs/readdir.c b/fs/overlayfs/readdir.c index 881ec5592da5..44e208da417c 100644 --- a/fs/overlayfs/readdir.c +++ b/fs/overlayfs/readdir.c @@ -271,7 +271,6 @@ static bool ovl_fill_merge(struct dir_context *ctx, const char *name, static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data *rdd) { int err; - struct ovl_cache_entry *p; struct dentry *dentry, *dir = path->dentry; const struct cred *old_cred; @@ -280,9 +279,11 @@ static int ovl_check_whiteouts(const struct path *path, struct ovl_readdir_data err = down_write_killable(&dir->d_inode->i_rwsem); if (!err) { while (rdd->first_maybe_whiteout) { - p = rdd->first_maybe_whiteout; + struct ovl_cache_entry *p = + rdd->first_maybe_whiteout; rdd->first_maybe_whiteout = p->next_maybe_whiteout; - dentry = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len); + dentry = lookup_one(mnt_idmap(path->mnt), + &QSTR_LEN(p->name, p->len), dir); if (!IS_ERR(dentry)) { p->is_whiteout = ovl_is_whiteout(dentry); dput(dentry); @@ -351,6 +352,7 @@ static int ovl_dir_read_merged(struct dentry *dentry, struct list_head *list, struct path realpath; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_merge, + .ctx.count = INT_MAX, .dentry = dentry, .list = list, .root = root, @@ -492,7 +494,7 @@ static int ovl_cache_update(const struct path *path, struct ovl_cache_entry *p, } } /* This checks also for xwhiteouts */ - this = lookup_one(mnt_idmap(path->mnt), p->name, dir, p->len); + this = lookup_one(mnt_idmap(path->mnt), &QSTR_LEN(p->name, p->len), dir); if (IS_ERR_OR_NULL(this) || !this->d_inode) { /* Mark a stale entry */ p->is_whiteout = true; @@ -571,6 +573,7 @@ static int ovl_dir_read_impure(const struct path *path, struct list_head *list, struct ovl_cache_entry *p, *n; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_plain, 
+ .ctx.count = INT_MAX, .list = list, .root = root, }; @@ -672,6 +675,7 @@ static bool ovl_fill_real(struct dir_context *ctx, const char *name, struct ovl_readdir_translate *rdt = container_of(ctx, struct ovl_readdir_translate, ctx); struct dir_context *orig_ctx = rdt->orig_ctx; + bool res; if (rdt->parent_ino && strcmp(name, "..") == 0) { ino = rdt->parent_ino; @@ -686,7 +690,10 @@ static bool ovl_fill_real(struct dir_context *ctx, const char *name, name, namelen, rdt->xinowarn); } - return orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type); + res = orig_ctx->actor(orig_ctx, name, namelen, offset, ino, d_type); + ctx->count = orig_ctx->count; + + return res; } static bool ovl_is_impure_dir(struct file *file) @@ -713,6 +720,7 @@ static int ovl_iterate_real(struct file *file, struct dir_context *ctx) const struct ovl_layer *lower_layer = ovl_layer_lower(dir); struct ovl_readdir_translate rdt = { .ctx.actor = ovl_fill_real, + .ctx.count = ctx->count, .orig_ctx = ctx, .xinobits = ovl_xino_bits(ofs), .xinowarn = ovl_xino_warn(ofs), @@ -1073,6 +1081,7 @@ int ovl_check_d_type_supported(const struct path *realpath) int err; struct ovl_readdir_data rdd = { .ctx.actor = ovl_check_d_type, + .ctx.count = INT_MAX, .d_type_supported = false, }; @@ -1094,6 +1103,7 @@ static int ovl_workdir_cleanup_recurse(struct ovl_fs *ofs, const struct path *pa struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_plain, + .ctx.count = INT_MAX, .list = &list, }; bool incompat = false; @@ -1178,6 +1188,7 @@ int ovl_indexdir_cleanup(struct ovl_fs *ofs) struct ovl_cache_entry *p; struct ovl_readdir_data rdd = { .ctx.actor = ovl_fill_plain, + .ctx.count = INT_MAX, .list = &list, }; diff --git a/fs/overlayfs/super.c b/fs/overlayfs/super.c index 86ae6f6da36b..e19940d649ca 100644 --- a/fs/overlayfs/super.c +++ b/fs/overlayfs/super.c @@ -327,9 +327,10 @@ retry: goto retry; } - err = ovl_mkdir_real(ofs, dir, &work, attr.ia_mode); - if (err) - goto out_dput; + work = ovl_do_mkdir(ofs, dir, work, attr.ia_mode); + err = PTR_ERR(work); + if (IS_ERR(work)) + goto out_err; /* Weird filesystem returning with hashed negative (kernfs)? 
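The scattered ".ctx.count = INT_MAX" additions above follow from struct dir_context growing a byte budget that getdents-style callers use to bound a batch; kernel-internal walks opt out by starting from INT_MAX. A minimal internal walk under that rule (count_ctx/count_actor are illustrative):

struct count_ctx {
        struct dir_context ctx;
        long entries;
};

static bool count_actor(struct dir_context *ctx, const char *name,
                        int namelen, loff_t offset, u64 ino,
                        unsigned int d_type)
{
        struct count_ctx *c = container_of(ctx, struct count_ctx, ctx);

        c->entries++;
        return true;            /* keep iterating */
}

static long count_dir_entries(struct file *dir)
{
        struct count_ctx c = {
                .ctx.actor = count_actor,
                .ctx.count = INT_MAX,   /* no byte budget internally */
        };
        int err = iterate_dir(dir, &c.ctx);

        return err ?: c.entries;
}
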
*/ err = -EINVAL; @@ -1137,6 +1138,11 @@ static struct ovl_entry *ovl_get_lowerstack(struct super_block *sb, return ERR_PTR(-EINVAL); } + if (ctx->nr == ctx->nr_data) { + pr_err("at least one non-data lowerdir is required\n"); + return ERR_PTR(-EINVAL); + } + err = -EINVAL; for (i = 0; i < ctx->nr; i++) { l = &ctx->lower[i]; @@ -1305,6 +1311,7 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) { struct ovl_fs *ofs = sb->s_fs_info; struct ovl_fs_context *ctx = fc->fs_private; + const struct cred *old_cred = NULL; struct dentry *root_dentry; struct ovl_entry *oe; struct ovl_layer *layers; @@ -1318,10 +1325,15 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_d_op = &ovl_dentry_operations; err = -ENOMEM; - ofs->creator_cred = cred = prepare_creds(); + if (!ofs->creator_cred) + ofs->creator_cred = cred = prepare_creds(); + else + cred = (struct cred *)ofs->creator_cred; if (!cred) goto out_err; + old_cred = ovl_override_creds(sb); + err = ovl_fs_params_verify(ctx, &ofs->config); if (err) goto out_err; @@ -1481,11 +1493,19 @@ int ovl_fill_super(struct super_block *sb, struct fs_context *fc) sb->s_root = root_dentry; + ovl_revert_creds(old_cred); return 0; out_free_oe: ovl_free_entry(oe); out_err: + /* + * Revert creds before calling ovl_free_fs() which will call + * put_cred() and put_cred() requires that the cred's that are + * put are not the caller's creds, i.e., current->cred. + */ + if (old_cred) + ovl_revert_creds(old_cred); ovl_free_fs(ofs); sb->s_fs_info = NULL; return err; diff --git a/fs/pidfs.c b/fs/pidfs.c index 63f9699ebac3..c1f0a067be40 100644 --- a/fs/pidfs.c +++ b/fs/pidfs.c @@ -20,10 +20,34 @@ #include <linux/time_namespace.h> #include <linux/utsname.h> #include <net/net_namespace.h> +#include <linux/coredump.h> #include "internal.h" #include "mount.h" +static struct kmem_cache *pidfs_cachep __ro_after_init; + +/* + * Stashes information that userspace needs to access even after the + * process has been reaped. + */ +struct pidfs_exit_info { + __u64 cgroupid; + __s32 exit_code; + __u32 coredump_mask; +}; + +struct pidfs_inode { + struct pidfs_exit_info __pei; + struct pidfs_exit_info *exit_info; + struct inode vfs_inode; +}; + +static inline struct pidfs_inode *pidfs_i(struct inode *inode) +{ + return container_of(inode, struct pidfs_inode, vfs_inode); +} + static struct rb_root pidfs_ino_tree = RB_ROOT; #if BITS_PER_LONG == 32 @@ -188,36 +212,64 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) static __poll_t pidfd_poll(struct file *file, struct poll_table_struct *pts) { struct pid *pid = pidfd_pid(file); - bool thread = file->f_flags & PIDFD_THREAD; struct task_struct *task; __poll_t poll_flags = 0; poll_wait(file, &pid->wait_pidfd, pts); /* - * Depending on PIDFD_THREAD, inform pollers when the thread - * or the whole thread-group exits. + * Don't wake waiters if the thread-group leader exited + * prematurely. They either get notified when the last subthread + * exits or not at all if one of the remaining subthreads execs + * and assumes the struct pid of the old thread-group leader. 
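In user-space terms, the pidfd_poll() hunk above means a pidfd for a thread-group leader no longer reports EPOLLIN while other threads are still running. Watching for that edge is as simple as the following (error handling trimmed):

#include <poll.h>
#include <unistd.h>
#include <sys/syscall.h>

int wait_for_exit(pid_t pid)
{
        int pidfd = syscall(SYS_pidfd_open, pid, 0);
        struct pollfd pfd = { .fd = pidfd, .events = POLLIN };

        /* Blocks until the whole thread group has exited. */
        poll(&pfd, 1, -1);
        close(pidfd);
        return 0;
}
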
*/ guard(rcu)(); task = pid_task(pid, PIDTYPE_PID); if (!task) poll_flags = EPOLLIN | EPOLLRDNORM | EPOLLHUP; - else if (task->exit_state && (thread || thread_group_empty(task))) + else if (task->exit_state && !delay_group_leader(task)) poll_flags = EPOLLIN | EPOLLRDNORM; return poll_flags; } -static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long arg) +static inline bool pid_in_current_pidns(const struct pid *pid) +{ + const struct pid_namespace *ns = task_active_pid_ns(current); + + if (ns->level <= pid->level) + return pid->numbers[ns->level].ns == ns; + + return false; +} + +static __u32 pidfs_coredump_mask(unsigned long mm_flags) +{ + switch (__get_dumpable(mm_flags)) { + case SUID_DUMP_USER: + return PIDFD_COREDUMP_USER; + case SUID_DUMP_ROOT: + return PIDFD_COREDUMP_ROOT; + case SUID_DUMP_DISABLE: + return PIDFD_COREDUMP_SKIP; + default: + WARN_ON_ONCE(true); + } + + return 0; +} + +static long pidfd_info(struct file *file, unsigned int cmd, unsigned long arg) { struct pidfd_info __user *uinfo = (struct pidfd_info __user *)arg; + struct inode *inode = file_inode(file); + struct pid *pid = pidfd_pid(file); size_t usize = _IOC_SIZE(cmd); struct pidfd_info kinfo = {}; + struct pidfs_exit_info *exit_info; struct user_namespace *user_ns; + struct task_struct *task; const struct cred *c; __u64 mask; -#ifdef CONFIG_CGROUPS - struct cgroup *cgrp; -#endif if (!uinfo) return -EINVAL; @@ -227,10 +279,53 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long if (copy_from_user(&mask, &uinfo->mask, sizeof(mask))) return -EFAULT; + /* + * Restrict information retrieval to tasks within the caller's pid + * namespace hierarchy. + */ + if (!pid_in_current_pidns(pid)) + return -ESRCH; + + if (mask & PIDFD_INFO_EXIT) { + exit_info = READ_ONCE(pidfs_i(inode)->exit_info); + if (exit_info) { + kinfo.mask |= PIDFD_INFO_EXIT; +#ifdef CONFIG_CGROUPS + kinfo.cgroupid = exit_info->cgroupid; + kinfo.mask |= PIDFD_INFO_CGROUPID; +#endif + kinfo.exit_code = exit_info->exit_code; + } + } + + if (mask & PIDFD_INFO_COREDUMP) { + kinfo.mask |= PIDFD_INFO_COREDUMP; + kinfo.coredump_mask = READ_ONCE(pidfs_i(inode)->__pei.coredump_mask); + } + + task = get_pid_task(pid, PIDTYPE_PID); + if (!task) { + /* + * If the task has already been reaped, only exit + * information is available + */ + if (!(mask & PIDFD_INFO_EXIT)) + return -ESRCH; + + goto copy_out; + } + c = get_task_cred(task); if (!c) return -ESRCH; + if (!(kinfo.mask & PIDFD_INFO_COREDUMP)) { + task_lock(task); + if (task->mm) + kinfo.coredump_mask = pidfs_coredump_mask(task->mm->flags); + task_unlock(task); + } + /* Unconditionally return identifiers and credentials, the rest only on request */ user_ns = current_user_ns(); @@ -246,11 +341,15 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long put_cred(c); #ifdef CONFIG_CGROUPS - rcu_read_lock(); - cgrp = task_dfl_cgroup(task); - kinfo.cgroupid = cgroup_id(cgrp); - kinfo.mask |= PIDFD_INFO_CGROUPID; - rcu_read_unlock(); + if (!kinfo.cgroupid) { + struct cgroup *cgrp; + + rcu_read_lock(); + cgrp = task_dfl_cgroup(task); + kinfo.cgroupid = cgroup_id(cgrp); + kinfo.mask |= PIDFD_INFO_CGROUPID; + rcu_read_unlock(); + } #endif /* @@ -270,16 +369,14 @@ static long pidfd_info(struct task_struct *task, unsigned int cmd, unsigned long if (kinfo.pid == 0 || kinfo.tgid == 0 || (kinfo.ppid == 0 && kinfo.pid != 1)) return -ESRCH; +copy_out: /* * If userspace and the kernel have the same struct size it can just * be copied. 
If userspace provides an older struct, only the bits that * userspace knows about will be copied. If userspace provides a new * struct, only the bits that the kernel knows about will be copied. */ - if (copy_to_user(uinfo, &kinfo, min(usize, sizeof(kinfo)))) - return -EFAULT; - - return 0; + return copy_struct_to_user(uinfo, usize, &kinfo, sizeof(kinfo), NULL); } static bool pidfs_ioctl_valid(unsigned int cmd) @@ -317,7 +414,6 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) { struct task_struct *task __free(put_task) = NULL; struct nsproxy *nsp __free(put_nsproxy) = NULL; - struct pid *pid = pidfd_pid(file); struct ns_common *ns_common = NULL; struct pid_namespace *pid_ns; @@ -332,13 +428,13 @@ static long pidfd_ioctl(struct file *file, unsigned int cmd, unsigned long arg) return put_user(file_inode(file)->i_generation, argp); } - task = get_pid_task(pid, PIDTYPE_PID); - if (!task) - return -ESRCH; - /* Extensible IOCTL that does not open namespace FDs, take a shortcut */ if (_IOC_NR(cmd) == _IOC_NR(PIDFD_GET_INFO)) - return pidfd_info(task, cmd, arg); + return pidfd_info(file, cmd, arg); + + task = get_pid_task(pidfd_pid(file), PIDTYPE_PID); + if (!task) + return -ESRCH; if (arg) return -EINVAL; @@ -450,6 +546,74 @@ struct pid *pidfd_pid(const struct file *file) return file_inode(file)->i_private; } +/* + * We're called from release_task(). We know there's at least one + * reference to struct pid being held that won't be released until the + * task has been reaped which cannot happen until we're out of + * release_task(). + * + * If this struct pid is referred to by a pidfd then + * stashed_dentry_get() will return the dentry and inode for that struct + * pid. Since we've taken a reference on it there's now an additional + * reference from the exit path on it. Which is fine. We're going to put + * it again in a second and we know that the pid is kept alive anyway. + * + * Worst case is that we've filled in the info and immediately free the + * dentry and inode afterwards since the pidfd has been closed. Since + * pidfs_exit() currently is placed after exit_task_work() we know that + * it cannot be us aka the exiting task holding a pidfd to ourselves. + */ +void pidfs_exit(struct task_struct *tsk) +{ + struct dentry *dentry; + + might_sleep(); + + dentry = stashed_dentry_get(&task_pid(tsk)->stashed); + if (dentry) { + struct inode *inode = d_inode(dentry); + struct pidfs_exit_info *exit_info = &pidfs_i(inode)->__pei; +#ifdef CONFIG_CGROUPS + struct cgroup *cgrp; + + rcu_read_lock(); + cgrp = task_dfl_cgroup(tsk); + exit_info->cgroupid = cgroup_id(cgrp); + rcu_read_unlock(); +#endif + exit_info->exit_code = tsk->exit_code; + + /* Ensure that PIDFD_GET_INFO sees either all or nothing. */ + smp_store_release(&pidfs_i(inode)->exit_info, &pidfs_i(inode)->__pei); + dput(dentry); + } +} + +#ifdef CONFIG_COREDUMP +void pidfs_coredump(const struct coredump_params *cprm) +{ + struct pid *pid = cprm->pid; + struct pidfs_exit_info *exit_info; + struct dentry *dentry; + struct inode *inode; + __u32 coredump_mask = 0; + + dentry = pid->stashed; + if (WARN_ON_ONCE(!dentry)) + return; + + inode = d_inode(dentry); + exit_info = &pidfs_i(inode)->__pei; + /* Note how we were coredumped. */ + coredump_mask = pidfs_coredump_mask(cprm->mm_flags); + /* Note that we actually did coredump. */ + coredump_mask |= PIDFD_COREDUMPED; + /* If coredumping is set to skip we should never end up here. 
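/*
 * A minimal userspace sketch (separate from the diff itself) of what the
 * reworked pidfd_info() above provides: exit information stays readable
 * through a pidfd even after the task has been reaped. It assumes a
 * kernel and uapi <linux/pidfd.h> carrying the PIDFD_GET_INFO ioctl and
 * the PIDFD_INFO_EXIT mask added here.
 */
#include <stdio.h>
#include <sys/ioctl.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include <unistd.h>
#include <linux/pidfd.h>

int main(void)
{
	pid_t pid = fork();

	if (pid == 0)
		_exit(42);

	/* Open the pidfd while the child is still alive. */
	int pidfd = (int)syscall(SYS_pidfd_open, pid, 0);

	/* Reap the child: the task is gone, the pidfs inode is not. */
	waitpid(pid, NULL, 0);

	struct pidfd_info info = { .mask = PIDFD_INFO_EXIT };

	if (pidfd >= 0 && ioctl(pidfd, PIDFD_GET_INFO, &info) == 0 &&
	    (info.mask & PIDFD_INFO_EXIT))
		printf("exit status: %d\n", WEXITSTATUS(info.exit_code));

	return 0;
}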
*/ + VFS_WARN_ON_ONCE(coredump_mask & PIDFD_COREDUMP_SKIP); + smp_store_release(&exit_info->coredump_mask, coredump_mask); +} +#endif + static struct vfsmount *pidfs_mnt __ro_after_init; /*
@@ -460,36 +624,14 @@ static struct vfsmount *pidfs_mnt __ro_after_init; static int pidfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, struct iattr *attr) { - return -EOPNOTSUPP; + return anon_inode_setattr(idmap, dentry, attr); } - -/* - * User space expects pidfs inodes to have no file type in st_mode. - * - * In particular, 'lsof' has this legacy logic: - * - * type = s->st_mode & S_IFMT; - * switch (type) { - * ... - * case 0: - * if (!strcmp(p, "anon_inode")) - * Lf->ntype = Ntype = N_ANON_INODE; - * - * to detect our old anon_inode logic. - * - * Rather than mess with our internal sane inode data, just fix it - * up here in getattr() by masking off the format bits. - */ static int pidfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, unsigned int query_flags) { - struct inode *inode = d_inode(path->dentry); - - generic_fillattr(&nop_mnt_idmap, request_mask, inode, stat); - stat->mode &= ~S_IFMT; - return 0; + return anon_inode_getattr(idmap, path, stat, request_mask, query_flags); } static const struct inode_operations pidfs_inode_operations = {
@@ -505,9 +647,30 @@ static void pidfs_evict_inode(struct inode *inode) put_pid(pid); } +static struct inode *pidfs_alloc_inode(struct super_block *sb) +{ + struct pidfs_inode *pi; + + pi = alloc_inode_sb(sb, pidfs_cachep, GFP_KERNEL); + if (!pi) + return NULL; + + memset(&pi->__pei, 0, sizeof(pi->__pei)); + pi->exit_info = NULL; + + return &pi->vfs_inode; +} + +static void pidfs_free_inode(struct inode *inode) +{ + kmem_cache_free(pidfs_cachep, pidfs_i(inode)); +} + static const struct super_operations pidfs_sops = { + .alloc_inode = pidfs_alloc_inode, .drop_inode = generic_delete_inode, .evict_inode = pidfs_evict_inode, + .free_inode = pidfs_free_inode, .statfs = simple_statfs, };
@@ -521,7 +684,6 @@ static char *pidfs_dname(struct dentry *dentry, char *buffer, int buflen) } const struct dentry_operations pidfs_dentry_operations = { - .d_delete = always_delete_dentry, .d_dname = pidfs_dname, .d_prune = stashed_dentry_prune, };
@@ -634,8 +796,53 @@ static int pidfs_export_permission(struct handle_to_path_ctx *ctx, return 0; } +static inline bool pidfs_pid_valid(struct pid *pid, const struct path *path, + unsigned int flags) +{ + enum pid_type type; + + if (flags & PIDFD_STALE) + return true; + + /* + * Make sure that if a pidfd is created, PIDFD_INFO_EXIT + * information will be available. So after an inode for the + * pidfd has been allocated, perform another check that the pid + * is still alive. If it is, exit information is available even + * if the task gets reaped before the pidfd is returned to + * userspace. The only exceptions are indicated by PIDFD_STALE: + * + * (1) The kernel is in the middle of task creation and thus no + * task linkage has been established yet. + * (2) The caller knows @pid has been registered in pidfs at a + * time when the task was still alive. + * + * In both cases exit information will have been reported. + */ + if (flags & PIDFD_THREAD) + type = PIDTYPE_PID; + else + type = PIDTYPE_TGID; + + /* + * Since pidfs_exit() is called before struct pid's task linkage + * is removed, the case where the task got reaped but a dentry + * was already attached to struct pid and exit information was + * recorded and published can be handled correctly.
+ */ + if (unlikely(!pid_has_task(pid, type))) { + struct inode *inode = d_inode(path->dentry); + return !!READ_ONCE(pidfs_i(inode)->exit_info); + } + + return true; +} + static struct file *pidfs_export_open(struct path *path, unsigned int oflags) { + if (!pidfs_pid_valid(d_inode(path->dentry)->i_private, path, oflags)) + return ERR_PTR(-ESRCH); + /* * Clear O_LARGEFILE as open_by_handle_at() forces it and raise * O_RDWR as pidfds always are. @@ -656,7 +863,7 @@ static int pidfs_init_inode(struct inode *inode, void *data) const struct pid *pid = data; inode->i_private = data; - inode->i_flags |= S_PRIVATE; + inode->i_flags |= S_PRIVATE | S_ANON_INODE; inode->i_mode |= S_IRWXU; inode->i_op = &pidfs_inode_operations; inode->i_fop = &pidfs_file_operations; @@ -699,22 +906,106 @@ static struct file_system_type pidfs_type = { struct file *pidfs_alloc_file(struct pid *pid, unsigned int flags) { - struct file *pidfd_file; - struct path path; + struct path path __free(path_put) = {}; int ret; + /* + * Ensure that PIDFD_STALE can be passed as a flag without + * overloading other uapi pidfd flags. + */ + BUILD_BUG_ON(PIDFD_STALE == PIDFD_THREAD); + BUILD_BUG_ON(PIDFD_STALE == PIDFD_NONBLOCK); + ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); if (ret < 0) return ERR_PTR(ret); + if (!pidfs_pid_valid(pid, &path, flags)) + return ERR_PTR(-ESRCH); + + flags &= ~PIDFD_STALE; + flags |= O_RDWR; pidfd_file = dentry_open(&path, flags, current_cred()); - path_put(&path); + /* Raise PIDFD_THREAD explicitly as do_dentry_open() strips it. */ + if (!IS_ERR(pidfd_file)) + pidfd_file->f_flags |= (flags & PIDFD_THREAD); + return pidfd_file; } +/** + * pidfs_register_pid - register a struct pid in pidfs + * @pid: pid to pin + * + * Register a struct pid in pidfs. Needs to be paired with + * pidfs_put_pid() to not risk leaking the pidfs dentry and inode. + * + * Return: On success zero, on error a negative error code is returned. + */ +int pidfs_register_pid(struct pid *pid) +{ + struct path path __free(path_put) = {}; + int ret; + + might_sleep(); + + if (!pid) + return 0; + + ret = path_from_stashed(&pid->stashed, pidfs_mnt, get_pid(pid), &path); + if (unlikely(ret)) + return ret; + /* Keep the dentry and only put the reference to the mount. */ + path.dentry = NULL; + return 0; +} + +/** + * pidfs_get_pid - pin a struct pid through pidfs + * @pid: pid to pin + * + * Similar to pidfs_register_pid() but only valid if the caller knows + * there's a reference to the @pid through a dentry already that can't + * go away. + */ +void pidfs_get_pid(struct pid *pid) +{ + if (!pid) + return; + WARN_ON_ONCE(!stashed_dentry_get(&pid->stashed)); +} + +/** + * pidfs_put_pid - drop a pidfs reference + * @pid: pid to drop + * + * Drop a reference to @pid via pidfs. This is only safe if the + * reference has been taken via pidfs_get_pid(). 
+ */ +void pidfs_put_pid(struct pid *pid) +{ + might_sleep(); + + if (!pid) + return; + VFS_WARN_ON_ONCE(!pid->stashed); + dput(pid->stashed); +} + +static void pidfs_inode_init_once(void *data) +{ + struct pidfs_inode *pi = data; + + inode_init_once(&pi->vfs_inode); +} + void __init pidfs_init(void) { + pidfs_cachep = kmem_cache_create("pidfs_cache", sizeof(struct pidfs_inode), 0, + (SLAB_HWCACHE_ALIGN | SLAB_RECLAIM_ACCOUNT | + SLAB_ACCOUNT | SLAB_PANIC), + pidfs_inode_init_once); pidfs_mnt = kern_mount(&pidfs_type); if (IS_ERR(pidfs_mnt)) panic("Failed to mount pidfs pseudo filesystem"); diff --git a/fs/pipe.c b/fs/pipe.c index ce1af7592780..da45edd68c41 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -112,20 +112,40 @@ void pipe_double_lock(struct pipe_inode_info *pipe1, pipe_lock(pipe2); } +static struct page *anon_pipe_get_page(struct pipe_inode_info *pipe) +{ + for (int i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) { + if (pipe->tmp_page[i]) { + struct page *page = pipe->tmp_page[i]; + pipe->tmp_page[i] = NULL; + return page; + } + } + + return alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); +} + +static void anon_pipe_put_page(struct pipe_inode_info *pipe, + struct page *page) +{ + if (page_count(page) == 1) { + for (int i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) { + if (!pipe->tmp_page[i]) { + pipe->tmp_page[i] = page; + return; + } + } + } + + put_page(page); +} + static void anon_pipe_buf_release(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { struct page *page = buf->page; - /* - * If nobody else uses this page, and we don't already have a - * temporary page, let's keep track of it as a one-deep - * allocation cache. (Otherwise just release our reference to it) - */ - if (page_count(page) == 1 && !pipe->tmp_page) - pipe->tmp_page = page; - else - put_page(page); + anon_pipe_put_page(pipe, page); } static bool anon_pipe_buf_try_steal(struct pipe_inode_info *pipe, @@ -210,11 +230,10 @@ static const struct pipe_buf_operations anon_pipe_buf_ops = { /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */ static inline bool pipe_readable(const struct pipe_inode_info *pipe) { - unsigned int head = READ_ONCE(pipe->head); - unsigned int tail = READ_ONCE(pipe->tail); + union pipe_index idx = { .head_tail = READ_ONCE(pipe->head_tail) }; unsigned int writers = READ_ONCE(pipe->writers); - return !pipe_empty(head, tail) || !writers; + return !pipe_empty(idx.head, idx.tail) || !writers; } static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe, @@ -248,7 +267,7 @@ static inline unsigned int pipe_update_tail(struct pipe_inode_info *pipe, } static ssize_t -pipe_read(struct kiocb *iocb, struct iov_iter *to) +anon_pipe_read(struct kiocb *iocb, struct iov_iter *to) { size_t total_len = iov_iter_count(to); struct file *filp = iocb->ki_filp; @@ -275,7 +294,6 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) /* Read ->head with a barrier vs post_one_notification() */ unsigned int head = smp_load_acquire(&pipe->head); unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; #ifdef CONFIG_WATCH_QUEUE if (pipe->note_loss) { @@ -302,7 +320,7 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) #endif if (!pipe_empty(head, tail)) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t chars = buf->len; size_t written; int error; @@ -360,29 +378,9 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) break; } mutex_unlock(&pipe->mutex); - /* * We only get here if we didn't actually read 
anything. * - * However, we could have seen (and removed) a zero-sized - * pipe buffer, and might have made space in the buffers - * that way. - * - * You can't make zero-sized pipe buffers by doing an empty - * write (not even in packet mode), but they can happen if - * the writer gets an EFAULT when trying to fill a buffer - * that already got allocated and inserted in the buffer - * array. - * - * So we still need to wake up any pending writers in the - * _very_ unlikely case that the pipe was full, but we got - * no data. - */ - if (unlikely(wake_writer)) - wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); - kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); - - /* * But because we didn't read anything, at this point we can * just return directly with -ERESTARTSYS if we're interrupted, * since we've done any required wakeups and there's no need @@ -391,11 +389,10 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (wait_event_interruptible_exclusive(pipe->rd_wait, pipe_readable(pipe)) < 0) return -ERESTARTSYS; - wake_writer = false; wake_next_reader = true; mutex_lock(&pipe->mutex); } - if (pipe_empty(pipe->head, pipe->tail)) + if (pipe_is_empty(pipe)) wake_next_reader = false; mutex_unlock(&pipe->mutex); @@ -404,8 +401,15 @@ pipe_read(struct kiocb *iocb, struct iov_iter *to) if (wake_next_reader) wake_up_interruptible_sync_poll(&pipe->rd_wait, EPOLLIN | EPOLLRDNORM); kill_fasync(&pipe->fasync_writers, SIGIO, POLL_OUT); + return ret; +} + +static ssize_t +fifo_pipe_read(struct kiocb *iocb, struct iov_iter *to) +{ + int ret = anon_pipe_read(iocb, to); if (ret > 0) - file_accessed(filp); + file_accessed(iocb->ki_filp); return ret; } @@ -417,16 +421,15 @@ static inline int is_packetized(struct file *file) /* Done while waiting without holding the pipe lock - thus the READ_ONCE() */ static inline bool pipe_writable(const struct pipe_inode_info *pipe) { - unsigned int head = READ_ONCE(pipe->head); - unsigned int tail = READ_ONCE(pipe->tail); + union pipe_index idx = { .head_tail = READ_ONCE(pipe->head_tail) }; unsigned int max_usage = READ_ONCE(pipe->max_usage); - return !pipe_full(head, tail, max_usage) || + return !pipe_full(idx.head, idx.tail, max_usage) || !READ_ONCE(pipe->readers); } static ssize_t -pipe_write(struct kiocb *iocb, struct iov_iter *from) +anon_pipe_write(struct kiocb *iocb, struct iov_iter *from) { struct file *filp = iocb->ki_filp; struct pipe_inode_info *pipe = filp->private_data; @@ -473,8 +476,7 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) was_empty = pipe_empty(head, pipe->tail); chars = total_len & (PAGE_SIZE-1); if (chars && !was_empty) { - unsigned int mask = pipe->ring_size - 1; - struct pipe_buffer *buf = &pipe->bufs[(head - 1) & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, head - 1); int offset = buf->offset + buf->len; if ((buf->flags & PIPE_BUF_FLAG_CAN_MERGE) && @@ -505,54 +507,44 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) head = pipe->head; if (!pipe_full(head, pipe->tail, pipe->max_usage)) { - unsigned int mask = pipe->ring_size - 1; struct pipe_buffer *buf; - struct page *page = pipe->tmp_page; + struct page *page; int copied; - if (!page) { - page = alloc_page(GFP_HIGHUSER | __GFP_ACCOUNT); - if (unlikely(!page)) { - ret = ret ? : -ENOMEM; - break; - } - pipe->tmp_page = page; + page = anon_pipe_get_page(pipe); + if (unlikely(!page)) { + if (!ret) + ret = -ENOMEM; + break; } - /* Allocate a slot in the ring in advance and attach an - * empty buffer. 
If we fault or otherwise fail to use - * it, either the reader will consume it or it'll still - * be there for the next write. - */ - pipe->head = head + 1; + copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); + if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { + anon_pipe_put_page(pipe, page); + if (!ret) + ret = -EFAULT; + break; + } + pipe->head = head + 1; /* Insert it into the buffer array */ - buf = &pipe->bufs[head & mask]; + buf = pipe_buf(pipe, head); buf->page = page; buf->ops = &anon_pipe_buf_ops; buf->offset = 0; - buf->len = 0; if (is_packetized(filp)) buf->flags = PIPE_BUF_FLAG_PACKET; else buf->flags = PIPE_BUF_FLAG_CAN_MERGE; - pipe->tmp_page = NULL; - copied = copy_page_from_iter(page, 0, PAGE_SIZE, from); - if (unlikely(copied < PAGE_SIZE && iov_iter_count(from))) { - if (!ret) - ret = -EFAULT; - break; - } - ret += copied; buf->len = copied; + ret += copied; if (!iov_iter_count(from)) break; - } - if (!pipe_full(head, pipe->tail, pipe->max_usage)) continue; + } /* Wait for buffer space to become available. */ if ((filp->f_flags & O_NONBLOCK) || @@ -579,11 +571,11 @@ pipe_write(struct kiocb *iocb, struct iov_iter *from) kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); wait_event_interruptible_exclusive(pipe->wr_wait, pipe_writable(pipe)); mutex_lock(&pipe->mutex); - was_empty = pipe_empty(pipe->head, pipe->tail); + was_empty = pipe_is_empty(pipe); wake_next_writer = true; } out: - if (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (pipe_is_full(pipe)) wake_next_writer = false; mutex_unlock(&pipe->mutex); @@ -604,11 +596,21 @@ out: kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN); if (wake_next_writer) wake_up_interruptible_sync_poll(&pipe->wr_wait, EPOLLOUT | EPOLLWRNORM); - if (ret > 0 && sb_start_write_trylock(file_inode(filp)->i_sb)) { - int err = file_update_time(filp); - if (err) - ret = err; - sb_end_write(file_inode(filp)->i_sb); + return ret; +} + +static ssize_t +fifo_pipe_write(struct kiocb *iocb, struct iov_iter *from) +{ + int ret = anon_pipe_write(iocb, from); + if (ret > 0) { + struct file *filp = iocb->ki_filp; + if (sb_start_write_trylock(file_inode(filp)->i_sb)) { + int err = file_update_time(filp); + if (err) + ret = err; + sb_end_write(file_inode(filp)->i_sb); + } } return ret; } @@ -616,7 +618,7 @@ out: static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct pipe_inode_info *pipe = filp->private_data; - unsigned int count, head, tail, mask; + unsigned int count, head, tail; switch (cmd) { case FIONREAD: @@ -624,10 +626,9 @@ static long pipe_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) count = 0; head = pipe->head; tail = pipe->tail; - mask = pipe->ring_size - 1; - while (tail != head) { - count += pipe->bufs[tail & mask].len; + while (!pipe_empty(head, tail)) { + count += pipe_buf(pipe, tail)->len; tail++; } mutex_unlock(&pipe->mutex); @@ -659,7 +660,7 @@ pipe_poll(struct file *filp, poll_table *wait) { __poll_t mask; struct pipe_inode_info *pipe = filp->private_data; - unsigned int head, tail; + union pipe_index idx; /* Epoll has some historical nasty semantics, this enables them */ WRITE_ONCE(pipe->poll_usage, true); @@ -680,19 +681,18 @@ pipe_poll(struct file *filp, poll_table *wait) * if something changes and you got it wrong, the poll * table entry will wake you up and fix it. 
*/ - head = READ_ONCE(pipe->head); - tail = READ_ONCE(pipe->tail); + idx.head_tail = READ_ONCE(pipe->head_tail); mask = 0; if (filp->f_mode & FMODE_READ) { - if (!pipe_empty(head, tail)) + if (!pipe_empty(idx.head, idx.tail)) mask |= EPOLLIN | EPOLLRDNORM; if (!pipe->writers && filp->f_pipe != pipe->w_counter) mask |= EPOLLHUP; } if (filp->f_mode & FMODE_WRITE) { - if (!pipe_full(head, tail, pipe->max_usage)) + if (!pipe_full(idx.head, idx.tail, pipe->max_usage)) mask |= EPOLLOUT | EPOLLWRNORM; /* * Most Unices do not set EPOLLERR for FIFOs but on Linux they @@ -857,8 +857,10 @@ void free_pipe_info(struct pipe_inode_info *pipe) if (pipe->watch_queue) put_watch_queue(pipe->watch_queue); #endif - if (pipe->tmp_page) - __free_page(pipe->tmp_page); + for (i = 0; i < ARRAY_SIZE(pipe->tmp_page); i++) { + if (pipe->tmp_page[i]) + __free_page(pipe->tmp_page[i]); + } kfree(pipe->bufs); kfree(pipe); } @@ -878,6 +880,8 @@ static const struct dentry_operations pipefs_dentry_operations = { .d_dname = pipefs_dname, }; +static const struct file_operations pipeanon_fops; + static struct inode * get_pipe_inode(void) { struct inode *inode = new_inode_pseudo(pipe_mnt->mnt_sb); @@ -895,7 +899,7 @@ static struct inode * get_pipe_inode(void) inode->i_pipe = pipe; pipe->files = 2; pipe->readers = pipe->writers = 1; - inode->i_fop = &pipefifo_fops; + inode->i_fop = &pipeanon_fops; /* * Mark the inode dirty from the very beginning, @@ -938,7 +942,7 @@ int create_pipe_files(struct file **res, int flags) f = alloc_file_pseudo(inode, pipe_mnt, "", O_WRONLY | (flags & (O_NONBLOCK | O_DIRECT)), - &pipefifo_fops); + &pipeanon_fops); if (IS_ERR(f)) { free_pipe_info(inode->i_pipe); iput(inode); @@ -949,7 +953,7 @@ int create_pipe_files(struct file **res, int flags) f->f_pipe = 0; res[0] = alloc_file_clone(f, O_RDONLY | (flags & O_NONBLOCK), - &pipefifo_fops); + &pipeanon_fops); if (IS_ERR(res[0])) { put_pipe_info(inode, inode->i_pipe); fput(f); @@ -1113,8 +1117,8 @@ static void wake_up_partner(struct pipe_inode_info *pipe) static int fifo_open(struct inode *inode, struct file *filp) { + bool is_pipe = inode->i_fop == &pipeanon_fops; struct pipe_inode_info *pipe; - bool is_pipe = inode->i_sb->s_magic == PIPEFS_MAGIC; int ret; filp->f_pipe = 0; @@ -1238,8 +1242,19 @@ err: const struct file_operations pipefifo_fops = { .open = fifo_open, - .read_iter = pipe_read, - .write_iter = pipe_write, + .read_iter = fifo_pipe_read, + .write_iter = fifo_pipe_write, + .poll = pipe_poll, + .unlocked_ioctl = pipe_ioctl, + .release = pipe_release, + .fasync = pipe_fasync, + .splice_write = iter_file_splice_write, +}; + +static const struct file_operations pipeanon_fops = { + .open = fifo_open, + .read_iter = anon_pipe_read, + .write_iter = anon_pipe_write, .poll = pipe_poll, .unlocked_ioctl = pipe_ioctl, .release = pipe_release, @@ -1275,6 +1290,10 @@ int pipe_resize_ring(struct pipe_inode_info *pipe, unsigned int nr_slots) struct pipe_buffer *bufs; unsigned int head, tail, mask, n; + /* nr_slots larger than limits of pipe->{head,tail} */ + if (unlikely(nr_slots > (pipe_index_t)-1u)) + return -EINVAL; + bufs = kcalloc(nr_slots, sizeof(*bufs), GFP_KERNEL_ACCOUNT | __GFP_NOWARN); if (unlikely(!bufs)) @@ -1394,7 +1413,9 @@ struct pipe_inode_info *get_pipe_info(struct file *file, bool for_splice) { struct pipe_inode_info *pipe = file->private_data; - if (file->f_op != &pipefifo_fops || !pipe) + if (!pipe) + return NULL; + if (file->f_op != &pipefifo_fops && file->f_op != &pipeanon_fops) return NULL; if (for_splice && pipe_has_watch_queue(pipe)) 
return NULL; diff --git a/fs/pnode.c b/fs/pnode.c index ef048f008bdd..fb77427df39e 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -228,9 +228,13 @@ static int propagate_one(struct mount *m, struct mountpoint *dest_mp) /* skip ones added by this propagate_mnt() */ if (IS_MNT_NEW(m)) return 0; - /* skip if mountpoint isn't covered by it */ + /* skip if mountpoint isn't visible in m */ if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) return 0; + /* skip if m is in the anon_ns we are emptying */ + if (m->mnt_ns->mntns_flags & MNTNS_PROPAGATING) + return 0; + if (peers(m, last_dest)) { type = CL_MAKE_SHARED; } else { @@ -380,9 +384,6 @@ bool propagation_would_overmount(const struct mount *from, if (!IS_MNT_SHARED(from)) return false; - if (IS_MNT_NEW(to)) - return false; - if (to->mnt.mnt_root != mp->m_dentry) return false; @@ -549,8 +550,10 @@ static void restore_mounts(struct list_head *to_restore) mp = parent->mnt_mp; parent = parent->mnt_parent; } - if (parent != mnt->mnt_parent) + if (parent != mnt->mnt_parent) { mnt_change_mountpoint(parent, mp, mnt); + mnt_notify_add(mnt); + } } } diff --git a/fs/pnode.h b/fs/pnode.h index 0b02a6393891..34b6247af01d 100644 --- a/fs/pnode.h +++ b/fs/pnode.h @@ -12,7 +12,7 @@ #define IS_MNT_SHARED(m) ((m)->mnt.mnt_flags & MNT_SHARED) #define IS_MNT_SLAVE(m) ((m)->mnt_master) -#define IS_MNT_NEW(m) (!(m)->mnt_ns || is_anon_ns((m)->mnt_ns)) +#define IS_MNT_NEW(m) (!(m)->mnt_ns) #define CLEAR_MNT_SHARED(m) ((m)->mnt.mnt_flags &= ~MNT_SHARED) #define IS_MNT_UNBINDABLE(m) ((m)->mnt.mnt_flags & MNT_UNBINDABLE) #define IS_MNT_MARKED(m) ((m)->mnt.mnt_flags & MNT_MARKED) diff --git a/fs/proc/base.c b/fs/proc/base.c index cd89e956c322..fe33a5843fbd 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -416,7 +416,7 @@ static const struct file_operations proc_pid_cmdline_ops = { #ifdef CONFIG_KALLSYMS /* * Provides a wchan file via kallsyms in a proper one-value-per-file format. - * Returns the resolved symbol. If that fails, simply return the address. + * Returns the resolved symbol to user space. 
*/ static int proc_pid_wchan(struct seq_file *m, struct pid_namespace *ns, struct pid *pid, struct task_struct *task) @@ -1489,7 +1489,6 @@ static const struct file_operations proc_fail_nth_operations = { #endif -#ifdef CONFIG_SCHED_DEBUG /* * Print out various scheduling related per-task fields: */ @@ -1539,8 +1538,6 @@ static const struct file_operations proc_pid_sched_operations = { .release = single_release, }; -#endif - #ifdef CONFIG_SCHED_AUTOGROUP /* * Print out autogroup related information: @@ -2124,7 +2121,7 @@ bool proc_fill_cache(struct file *file, struct dir_context *ctx, unsigned type = DT_UNKNOWN; ino_t ino = 1; - child = d_hash_and_lookup(dir, &qname); + child = try_lookup_noperm(&qname, dir); if (!child) { DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); child = d_alloc_parallel(dir, &qname, &wq); @@ -2497,11 +2494,9 @@ static const struct file_operations proc_map_files_operations = { #if defined(CONFIG_CHECKPOINT_RESTORE) && defined(CONFIG_POSIX_TIMERS) struct timers_private { - struct pid *pid; - struct task_struct *task; - struct sighand_struct *sighand; - struct pid_namespace *ns; - unsigned long flags; + struct pid *pid; + struct task_struct *task; + struct pid_namespace *ns; }; static void *timers_start(struct seq_file *m, loff_t *pos) @@ -2512,54 +2507,48 @@ static void *timers_start(struct seq_file *m, loff_t *pos) if (!tp->task) return ERR_PTR(-ESRCH); - tp->sighand = lock_task_sighand(tp->task, &tp->flags); - if (!tp->sighand) - return ERR_PTR(-ESRCH); - - return seq_hlist_start(&tp->task->signal->posix_timers, *pos); + rcu_read_lock(); + return seq_hlist_start_rcu(&tp->task->signal->posix_timers, *pos); } static void *timers_next(struct seq_file *m, void *v, loff_t *pos) { struct timers_private *tp = m->private; - return seq_hlist_next(v, &tp->task->signal->posix_timers, pos); + + return seq_hlist_next_rcu(v, &tp->task->signal->posix_timers, pos); } static void timers_stop(struct seq_file *m, void *v) { struct timers_private *tp = m->private; - if (tp->sighand) { - unlock_task_sighand(tp->task, &tp->flags); - tp->sighand = NULL; - } - if (tp->task) { put_task_struct(tp->task); tp->task = NULL; + rcu_read_unlock(); } } static int show_timer(struct seq_file *m, void *v) { - struct k_itimer *timer; - struct timers_private *tp = m->private; - int notify; static const char * const nstr[] = { - [SIGEV_SIGNAL] = "signal", - [SIGEV_NONE] = "none", - [SIGEV_THREAD] = "thread", + [SIGEV_SIGNAL] = "signal", + [SIGEV_NONE] = "none", + [SIGEV_THREAD] = "thread", }; - timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list); - notify = timer->it_sigev_notify; + struct k_itimer *timer = hlist_entry((struct hlist_node *)v, struct k_itimer, list); + struct timers_private *tp = m->private; + int notify = timer->it_sigev_notify; + + guard(spinlock_irq)(&timer->it_lock); + if (!posixtimer_valid(timer)) + return 0; seq_printf(m, "ID: %d\n", timer->it_id); - seq_printf(m, "signal: %d/%px\n", - timer->sigq.info.si_signo, + seq_printf(m, "signal: %d/%px\n", timer->sigq.info.si_signo, timer->sigq.info.si_value.sival_ptr); - seq_printf(m, "notify: %s/%s.%d\n", - nstr[notify & ~SIGEV_THREAD_ID], + seq_printf(m, "notify: %s/%s.%d\n", nstr[notify & ~SIGEV_THREAD_ID], (notify & SIGEV_THREAD_ID) ? 
"tid" : "pid", pid_nr_ns(timer->it_pid, tp->ns)); seq_printf(m, "ClockID: %d\n", timer->it_clock); @@ -3331,9 +3320,7 @@ static const struct pid_entry tgid_base_stuff[] = { ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), -#ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), -#endif #ifdef CONFIG_SCHED_AUTOGROUP REG("autogroup", S_IRUGO|S_IWUSR, proc_pid_sched_autogroup_operations), #endif @@ -3682,9 +3669,7 @@ static const struct pid_entry tid_base_stuff[] = { ONE("status", S_IRUGO, proc_pid_status), ONE("personality", S_IRUSR, proc_pid_personality), ONE("limits", S_IRUGO, proc_pid_limits), -#ifdef CONFIG_SCHED_DEBUG REG("sched", S_IRUGO|S_IWUSR, proc_pid_sched_operations), -#endif NOD("comm", S_IFREG|S_IRUGO|S_IWUSR, &proc_tid_comm_inode_operations, &proc_pid_set_comm_operations, {}), diff --git a/fs/proc/generic.c b/fs/proc/generic.c index 8ec90826a49e..a3e22803cddf 100644 --- a/fs/proc/generic.c +++ b/fs/proc/generic.c @@ -559,10 +559,16 @@ struct proc_dir_entry *proc_create_reg(const char *name, umode_t mode, return p; } -static inline void pde_set_flags(struct proc_dir_entry *pde) +static void pde_set_flags(struct proc_dir_entry *pde) { if (pde->proc_ops->proc_flags & PROC_ENTRY_PERMANENT) pde->flags |= PROC_ENTRY_PERMANENT; + if (pde->proc_ops->proc_read_iter) + pde->flags |= PROC_ENTRY_proc_read_iter; +#ifdef CONFIG_COMPAT + if (pde->proc_ops->proc_compat_ioctl) + pde->flags |= PROC_ENTRY_proc_compat_ioctl; +#endif } struct proc_dir_entry *proc_create_data(const char *name, umode_t mode, @@ -626,6 +632,7 @@ struct proc_dir_entry *proc_create_seq_private(const char *name, umode_t mode, p->proc_ops = &proc_seq_ops; p->seq_ops = ops; p->state_size = state_size; + pde_set_flags(p); return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_seq_private); @@ -656,6 +663,7 @@ struct proc_dir_entry *proc_create_single_data(const char *name, umode_t mode, return NULL; p->proc_ops = &proc_single_ops; p->single_show = show; + pde_set_flags(p); return proc_register(parent, p); } EXPORT_SYMBOL(proc_create_single_data); diff --git a/fs/proc/inode.c b/fs/proc/inode.c index 626ad7bd94f2..a3eb3b740f76 100644 --- a/fs/proc/inode.c +++ b/fs/proc/inode.c @@ -656,13 +656,13 @@ struct inode *proc_get_inode(struct super_block *sb, struct proc_dir_entry *de) if (S_ISREG(inode->i_mode)) { inode->i_op = de->proc_iops; - if (de->proc_ops->proc_read_iter) + if (pde_has_proc_read_iter(de)) inode->i_fop = &proc_iter_file_ops; else inode->i_fop = &proc_reg_file_ops; #ifdef CONFIG_COMPAT - if (de->proc_ops->proc_compat_ioctl) { - if (de->proc_ops->proc_read_iter) + if (pde_has_proc_compat_ioctl(de)) { + if (pde_has_proc_read_iter(de)) inode->i_fop = &proc_iter_file_ops_compat; else inode->i_fop = &proc_reg_file_ops_compat; diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 1695509370b8..96122e91c645 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -85,6 +85,20 @@ static inline void pde_make_permanent(struct proc_dir_entry *pde) pde->flags |= PROC_ENTRY_PERMANENT; } +static inline bool pde_has_proc_read_iter(const struct proc_dir_entry *pde) +{ + return pde->flags & PROC_ENTRY_proc_read_iter; +} + +static inline bool pde_has_proc_compat_ioctl(const struct proc_dir_entry *pde) +{ +#ifdef CONFIG_COMPAT + return pde->flags & PROC_ENTRY_proc_compat_ioctl; +#else + return false; +#endif +} + extern struct kmem_cache *proc_dir_entry_cache; void pde_free(struct proc_dir_entry *pde); @@ 
-143,6 +157,7 @@ unsigned name_to_int(const struct qstr *qstr); /* Worst case buffer size needed for holding an integer. */ #define PROC_NUMBUF 13 +#ifdef CONFIG_PAGE_MAPCOUNT /** * folio_precise_page_mapcount() - Number of mappings of this folio page. * @folio: The folio. @@ -173,7 +188,49 @@ static inline int folio_precise_page_mapcount(struct folio *folio, return mapcount; } +#else /* !CONFIG_PAGE_MAPCOUNT */ +static inline int folio_precise_page_mapcount(struct folio *folio, + struct page *page) +{ + BUILD_BUG(); +} +#endif /* CONFIG_PAGE_MAPCOUNT */ +/** + * folio_average_page_mapcount() - Average number of mappings per page in this + * folio + * @folio: The folio. + * + * The average number of user page table entries that reference each page in + * this folio as tracked via the RMAP: either referenced directly (PTE) or + * as part of a larger area that covers this page (e.g., PMD). + * + * The average is calculated by rounding to the nearest integer; however, + * to avoid duplicated code in current callers, the average is at least + * 1 if any page of the folio is mapped. + * + * Returns: The average number of mappings per page in this folio. + */ +static inline int folio_average_page_mapcount(struct folio *folio) +{ + int mapcount, entire_mapcount, avg; + + if (!folio_test_large(folio)) + return atomic_read(&folio->_mapcount) + 1; + + mapcount = folio_large_mapcount(folio); + if (unlikely(mapcount <= 0)) + return 0; + entire_mapcount = folio_entire_mapcount(folio); + if (mapcount <= entire_mapcount) + return entire_mapcount; + mapcount -= entire_mapcount; + + /* Round to closest integer ... */ + avg = ((unsigned int)mapcount + folio_large_nr_pages(folio) / 2) >> folio_large_order(folio); + /* ... but return at least 1. */ + return max_t(int, avg + entire_mapcount, 1); +} /* * array.c */ diff --git a/fs/proc/kcore.c b/fs/proc/kcore.c index 1cb33771bf9f..728630b10fdf 100644 --- a/fs/proc/kcore.c +++ b/fs/proc/kcore.c @@ -34,8 +34,6 @@ #include <asm/sections.h> #include "internal.h" -#define CORE_STR "CORE" - #ifndef ELF_CORE_EFLAGS #define ELF_CORE_EFLAGS 0 #endif @@ -122,7 +120,9 @@ static void update_kcore_size(void) kcore_phdrs_len = kcore_nphdr * sizeof(struct elf_phdr); kcore_notes_len = (4 * sizeof(struct elf_note) + - 3 * ALIGN(sizeof(CORE_STR), 4) + + ALIGN(sizeof(NN_PRSTATUS), 4) + + ALIGN(sizeof(NN_PRPSINFO), 4) + + ALIGN(sizeof(NN_TASKSTRUCT), 4) + VMCOREINFO_NOTE_NAME_BYTES + ALIGN(sizeof(struct elf_prstatus), 4) + ALIGN(sizeof(struct elf_prpsinfo), 4) + @@ -443,11 +443,11 @@ static ssize_t read_kcore_iter(struct kiocb *iocb, struct iov_iter *iter) goto out; } - append_kcore_note(notes, &i, CORE_STR, NT_PRSTATUS, &prstatus, + append_kcore_note(notes, &i, NN_PRSTATUS, NT_PRSTATUS, &prstatus, sizeof(prstatus)); - append_kcore_note(notes, &i, CORE_STR, NT_PRPSINFO, &prpsinfo, + append_kcore_note(notes, &i, NN_PRPSINFO, NT_PRPSINFO, &prpsinfo, sizeof(prpsinfo)); - append_kcore_note(notes, &i, CORE_STR, NT_TASKSTRUCT, current, + append_kcore_note(notes, &i, NN_TASKSTRUCT, NT_TASKSTRUCT, current, arch_task_struct_size); /* * vmcoreinfo_size is mostly constant after init time, but it diff --git a/fs/proc/meminfo.c b/fs/proc/meminfo.c index 8ba9b1472390..bc2bc60c36cc 100644 --- a/fs/proc/meminfo.c +++ b/fs/proc/meminfo.c @@ -120,8 +120,7 @@ static int meminfo_proc_show(struct seq_file *m, void *v) global_node_page_state(NR_SECONDARY_PAGETABLE)); show_val_kb(m, "NFS_Unstable: ", 0); - show_val_kb(m, "Bounce: ", - global_zone_page_state(NR_BOUNCE)); + show_val_kb(m, "Bounce: ", 0); 
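/*
 * An aside on the folio_average_page_mapcount() helper added to
 * fs/proc/internal.h above: a standalone userspace model of its
 * rounding arithmetic (names here are illustrative, not kernel API).
 */
#include <stdio.h>

static int avg_mapcount_model(int mapcount, int entire_mapcount, int order)
{
	int nr_pages = 1 << order;	/* folio_large_nr_pages() */
	int avg;

	if (mapcount <= entire_mapcount)
		return entire_mapcount;
	mapcount -= entire_mapcount;

	/* Round to closest integer ... */
	avg = ((unsigned int)mapcount + nr_pages / 2) >> order;
	/* ... but return at least 1, mirroring max_t(int, ..., 1). */
	return avg + entire_mapcount > 1 ? avg + entire_mapcount : 1;
}

int main(void)
{
	/*
	 * An order-2 folio (4 pages) with one entire (PMD-style) mapping
	 * plus 4 PTE mappings: large mapcount 5, entire mapcount 1, so
	 * (5 - 1 + 4/2) >> 2 = 1, plus the entire mapping -> 2 per page.
	 */
	printf("%d\n", avg_mapcount_model(5, 1, 2));
	return 0;
}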
show_val_kb(m, "WritebackTmp: ", global_node_page_state(NR_WRITEBACK_TEMP)); show_val_kb(m, "CommitLimit: ", vm_commit_limit()); @@ -162,6 +161,8 @@ static int meminfo_proc_show(struct seq_file *m, void *v) show_val_kb(m, "Unaccepted: ", global_zone_page_state(NR_UNACCEPTED)); #endif + show_val_kb(m, "Balloon: ", + global_node_page_state(NR_BALLOON_PAGES)); hugetlb_report_meminfo(m); diff --git a/fs/proc/page.c b/fs/proc/page.c index a55f5acefa97..23fc771100ae 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -67,9 +67,14 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, * memmaps that were actually initialized. */ page = pfn_to_online_page(pfn); - if (page) - mapcount = folio_precise_page_mapcount(page_folio(page), - page); + if (page) { + struct folio *folio = page_folio(page); + + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) + mapcount = folio_precise_page_mapcount(folio, page); + else + mapcount = folio_average_page_mapcount(folio); + } if (put_user(mapcount, out)) { ret = -EFAULT; diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index f02cd362309a..994cde10e3f4 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -707,6 +707,8 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, struct folio *folio = page_folio(page); int i, nr = compound ? compound_nr(page) : 1; unsigned long size = nr * PAGE_SIZE; + bool exclusive; + int mapcount; /* * First accumulate quantities that depend only on |size| and the type @@ -747,18 +749,29 @@ static void smaps_account(struct mem_size_stats *mss, struct page *page, dirty, locked, present); return; } + + if (IS_ENABLED(CONFIG_NO_PAGE_MAPCOUNT)) { + mapcount = folio_average_page_mapcount(folio); + exclusive = !folio_maybe_mapped_shared(folio); + } + /* * We obtain a snapshot of the mapcount. Without holding the folio lock * this snapshot can be slightly wrong as we cannot always read the * mapcount atomically. */ for (i = 0; i < nr; i++, page++) { - int mapcount = folio_precise_page_mapcount(folio, page); unsigned long pss = PAGE_SIZE << PSS_SHIFT; + + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) { + mapcount = folio_precise_page_mapcount(folio, page); + exclusive = mapcount < 2; + } + if (mapcount >= 2) pss /= mapcount; smaps_page_accumulate(mss, folio, PAGE_SIZE, pss, - dirty, locked, mapcount < 2); + dirty, locked, exclusive); } } @@ -1023,7 +1036,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, if (folio) { /* We treat non-present entries as "maybe shared". 
*/ - if (!present || folio_likely_mapped_shared(folio) || + if (!present || folio_maybe_mapped_shared(folio) || hugetlb_pmd_shared(pte)) mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); else @@ -1632,6 +1645,7 @@ struct pagemapread { #define PM_SOFT_DIRTY BIT_ULL(55) #define PM_MMAP_EXCLUSIVE BIT_ULL(56) #define PM_UFFD_WP BIT_ULL(57) +#define PM_GUARD_REGION BIT_ULL(58) #define PM_FILE BIT_ULL(61) #define PM_SWAP BIT_ULL(62) #define PM_PRESENT BIT_ULL(63) @@ -1651,6 +1665,13 @@ static int add_to_pagemap(pagemap_entry_t *pme, struct pagemapread *pm) return 0; } +static bool __folio_page_mapped_exclusively(struct folio *folio, struct page *page) +{ + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) + return folio_precise_page_mapcount(folio, page) == 1; + return !folio_maybe_mapped_shared(folio); +} + static int pagemap_pte_hole(unsigned long start, unsigned long end, __always_unused int depth, struct mm_walk *walk) { @@ -1732,6 +1753,8 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, page = pfn_swap_entry_to_page(entry); if (pte_marker_entry_uffd_wp(entry)) flags |= PM_UFFD_WP; + if (is_guard_swp_entry(entry)) + flags |= PM_GUARD_REGION; } if (page) { @@ -1739,7 +1762,7 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, if (!folio_test_anon(folio)) flags |= PM_FILE; if ((flags & PM_PRESENT) && - folio_precise_page_mapcount(folio, page) == 1) + __folio_page_mapped_exclusively(folio, page)) flags |= PM_MMAP_EXCLUSIVE; } if (vma->vm_flags & VM_SOFTDIRTY) @@ -1814,7 +1837,7 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, pagemap_entry_t pme; if (folio && (flags & PM_PRESENT) && - folio_precise_page_mapcount(folio, page + idx) == 1) + __folio_page_mapped_exclusively(folio, page)) cur_flags |= PM_MMAP_EXCLUSIVE; pme = make_pme(frame, cur_flags); @@ -1879,7 +1902,7 @@ static int pagemap_hugetlb_range(pte_t *ptep, unsigned long hmask, if (!folio_test_anon(folio)) flags |= PM_FILE; - if (!folio_likely_mapped_shared(folio) && + if (!folio_maybe_mapped_shared(folio) && !hugetlb_pmd_shared(ptep)) flags |= PM_MMAP_EXCLUSIVE; @@ -1931,7 +1954,8 @@ static const struct mm_walk_ops pagemap_ops = { * Bit 55 pte is soft-dirty (see Documentation/admin-guide/mm/soft-dirty.rst) * Bit 56 page exclusively mapped * Bit 57 pte is uffd-wp write-protected - * Bits 58-60 zero + * Bit 58 pte is a guard region + * Bits 59-60 zero * Bit 61 page is file-page or shared-anon * Bit 62 page swapped * Bit 63 page present @@ -2455,22 +2479,19 @@ static int pagemap_scan_pmd_entry(pmd_t *pmd, unsigned long start, spinlock_t *ptl; int ret; - arch_enter_lazy_mmu_mode(); - ret = pagemap_scan_thp_entry(pmd, start, end, walk); - if (ret != -ENOENT) { - arch_leave_lazy_mmu_mode(); + if (ret != -ENOENT) return ret; - } ret = 0; start_pte = pte = pte_offset_map_lock(vma->vm_mm, pmd, start, &ptl); if (!pte) { - arch_leave_lazy_mmu_mode(); walk->action = ACTION_AGAIN; return 0; } + arch_enter_lazy_mmu_mode(); + if ((p->arg.flags & PM_SCAN_WP_MATCHING) && !p->vec_out) { /* Fast path for performing exclusive WP */ for (addr = start; addr != end; pte++, addr += PAGE_SIZE) { @@ -2539,8 +2560,8 @@ flush_and_return: if (flush_end) flush_tlb_range(vma, start, addr); - pte_unmap_unlock(start_pte, ptl); arch_leave_lazy_mmu_mode(); + pte_unmap_unlock(start_pte, ptl); cond_resched(); return ret; @@ -2855,7 +2876,12 @@ static void gather_stats(struct page *page, struct numa_maps *md, int pte_dirty, unsigned long nr_pages) { struct folio *folio = page_folio(page); - int count = 
folio_precise_page_mapcount(folio, page); + int count; + + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) + count = folio_precise_page_mapcount(folio, page); + else + count = folio_average_page_mapcount(folio); md->pages += nr_pages; if (pte_dirty || folio_test_dirty(folio)) diff --git a/fs/proc_namespace.c b/fs/proc_namespace.c index e133b507ddf3..5c555db68aa2 100644 --- a/fs/proc_namespace.c +++ b/fs/proc_namespace.c @@ -111,7 +111,7 @@ static int show_vfsmnt(struct seq_file *m, struct vfsmount *mnt) if (err) goto out; } else { - mangle(m, r->mnt_devname ? r->mnt_devname : "none"); + mangle(m, r->mnt_devname); } seq_putc(m, ' '); /* mountpoints outside of chroot jail will give SEQ_SKIP on this */ @@ -177,7 +177,7 @@ static int show_mountinfo(struct seq_file *m, struct vfsmount *mnt) if (err) goto out; } else { - mangle(m, r->mnt_devname ? r->mnt_devname : "none"); + mangle(m, r->mnt_devname); } seq_puts(m, sb_rdonly(sb) ? " ro" : " rw"); err = show_sb_opts(m, sb); @@ -199,17 +199,13 @@ static int show_vfsstat(struct seq_file *m, struct vfsmount *mnt) int err; /* device */ + seq_puts(m, "device "); if (sb->s_op->show_devname) { - seq_puts(m, "device "); err = sb->s_op->show_devname(m, mnt_path.dentry); if (err) goto out; } else { - if (r->mnt_devname) { - seq_puts(m, "device "); - mangle(m, r->mnt_devname); - } else - seq_puts(m, "no device"); + mangle(m, r->mnt_devname); } /* mount point */ diff --git a/fs/pstore/inode.c b/fs/pstore/inode.c index 56815799ce79..bb3b769edc71 100644 --- a/fs/pstore/inode.c +++ b/fs/pstore/inode.c @@ -14,10 +14,10 @@ #include <linux/init.h> #include <linux/list.h> #include <linux/string.h> -#include <linux/mount.h> #include <linux/seq_file.h> #include <linux/ramfs.h> -#include <linux/parser.h> +#include <linux/fs_parser.h> +#include <linux/fs_context.h> #include <linux/sched.h> #include <linux/magic.h> #include <linux/pstore.h> @@ -226,37 +226,38 @@ static struct inode *pstore_get_inode(struct super_block *sb) } enum { - Opt_kmsg_bytes, Opt_err + Opt_kmsg_bytes }; -static const match_table_t tokens = { - {Opt_kmsg_bytes, "kmsg_bytes=%u"}, - {Opt_err, NULL} +static const struct fs_parameter_spec pstore_param_spec[] = { + fsparam_u32 ("kmsg_bytes", Opt_kmsg_bytes), + {} }; -static void parse_options(char *options) -{ - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - - if (!options) - return; +struct pstore_context { + unsigned int kmsg_bytes; +}; - while ((p = strsep(&options, ",")) != NULL) { - int token; +static int pstore_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct pstore_context *ctx = fc->fs_private; + struct fs_parse_result result; + int opt; - if (!*p) - continue; + opt = fs_parse(fc, pstore_param_spec, param, &result); + /* pstore has historically ignored invalid kmsg_bytes param */ + if (opt < 0) + return 0; - token = match_token(p, tokens, args); - switch (token) { - case Opt_kmsg_bytes: - if (!match_int(&args[0], &option)) - pstore_set_kmsg_bytes(option); - break; - } + switch (opt) { + case Opt_kmsg_bytes: + ctx->kmsg_bytes = result.uint_32; + break; + default: + return -EINVAL; } + + return 0; } /* @@ -265,14 +266,16 @@ static void parse_options(char *options) static int pstore_show_options(struct seq_file *m, struct dentry *root) { if (kmsg_bytes != CONFIG_PSTORE_DEFAULT_KMSG_BYTES) - seq_printf(m, ",kmsg_bytes=%lu", kmsg_bytes); + seq_printf(m, ",kmsg_bytes=%u", kmsg_bytes); return 0; } -static int pstore_remount(struct super_block *sb, int *flags, char *data) +static int pstore_reconfigure(struct fs_context *fc) 
{ - sync_filesystem(sb); - parse_options(data); + struct pstore_context *ctx = fc->fs_private; + + sync_filesystem(fc->root->d_sb); + pstore_set_kmsg_bytes(ctx->kmsg_bytes); return 0; } @@ -281,7 +284,6 @@ static const struct super_operations pstore_ops = { .statfs = simple_statfs, .drop_inode = generic_delete_inode, .evict_inode = pstore_evict_inode, - .remount_fs = pstore_remount, .show_options = pstore_show_options, }; @@ -406,8 +408,9 @@ void pstore_get_records(int quiet) inode_unlock(d_inode(root)); } -static int pstore_fill_super(struct super_block *sb, void *data, int silent) +static int pstore_fill_super(struct super_block *sb, struct fs_context *fc) { + struct pstore_context *ctx = fc->fs_private; struct inode *inode; sb->s_maxbytes = MAX_LFS_FILESIZE; @@ -417,7 +420,7 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent) sb->s_op = &pstore_ops; sb->s_time_gran = 1; - parse_options(data); + pstore_set_kmsg_bytes(ctx->kmsg_bytes); inode = pstore_get_inode(sb); if (inode) { @@ -438,12 +441,26 @@ static int pstore_fill_super(struct super_block *sb, void *data, int silent) return 0; } -static struct dentry *pstore_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int pstore_get_tree(struct fs_context *fc) +{ + if (fc->root) + return pstore_reconfigure(fc); + + return get_tree_single(fc, pstore_fill_super); +} + +static void pstore_free_fc(struct fs_context *fc) { - return mount_single(fs_type, flags, data, pstore_fill_super); + kfree(fc->fs_private); } +static const struct fs_context_operations pstore_context_ops = { + .parse_param = pstore_parse_param, + .get_tree = pstore_get_tree, + .reconfigure = pstore_reconfigure, + .free = pstore_free_fc, +}; + static void pstore_kill_sb(struct super_block *sb) { guard(mutex)(&pstore_sb_lock); @@ -456,11 +473,33 @@ static void pstore_kill_sb(struct super_block *sb) INIT_LIST_HEAD(&records_list); } +static int pstore_init_fs_context(struct fs_context *fc) +{ + struct pstore_context *ctx; + + ctx = kzalloc(sizeof(struct pstore_context), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + /* + * Global kmsg_bytes is initialized to default, and updated + * every time we (re)mount the single-sb filesystem with the + * option specified. 
+ */ + ctx->kmsg_bytes = kmsg_bytes; + + fc->fs_private = ctx; + fc->ops = &pstore_context_ops; + + return 0; +} + static struct file_system_type pstore_fs_type = { .owner = THIS_MODULE, .name = "pstore", - .mount = pstore_mount, .kill_sb = pstore_kill_sb, + .init_fs_context = pstore_init_fs_context, + .parameters = pstore_param_spec, }; int __init pstore_init_fs(void) diff --git a/fs/pstore/internal.h b/fs/pstore/internal.h index 801d6c0b170c..a0fc51196910 100644 --- a/fs/pstore/internal.h +++ b/fs/pstore/internal.h @@ -6,7 +6,7 @@ #include <linux/time.h> #include <linux/pstore.h> -extern unsigned long kmsg_bytes; +extern unsigned int kmsg_bytes; #ifdef CONFIG_PSTORE_FTRACE extern void pstore_register_ftrace(void); @@ -35,7 +35,7 @@ static inline void pstore_unregister_pmsg(void) {} extern struct pstore_info *psinfo; -extern void pstore_set_kmsg_bytes(int); +extern void pstore_set_kmsg_bytes(unsigned int bytes); extern void pstore_get_records(int); extern void pstore_get_backend_records(struct pstore_info *psi, struct dentry *root, int quiet); diff --git a/fs/pstore/platform.c b/fs/pstore/platform.c index f56b066ab80c..f8b9c9c73997 100644 --- a/fs/pstore/platform.c +++ b/fs/pstore/platform.c @@ -92,8 +92,8 @@ module_param(compress, charp, 0444); MODULE_PARM_DESC(compress, "compression to use"); /* How much of the kernel log to snapshot */ -unsigned long kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; -module_param(kmsg_bytes, ulong, 0444); +unsigned int kmsg_bytes = CONFIG_PSTORE_DEFAULT_KMSG_BYTES; +module_param(kmsg_bytes, uint, 0444); MODULE_PARM_DESC(kmsg_bytes, "amount of kernel log to snapshot (in bytes)"); static void *compress_workspace; @@ -107,9 +107,9 @@ static void *compress_workspace; static char *big_oops_buf; static size_t max_compressed_size; -void pstore_set_kmsg_bytes(int bytes) +void pstore_set_kmsg_bytes(unsigned int bytes) { - kmsg_bytes = bytes; + WRITE_ONCE(kmsg_bytes, bytes); } /* Tag each group of saved records with a sequence number */ @@ -278,6 +278,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, struct kmsg_dump_detail *detail) { struct kmsg_dump_iter iter; + unsigned int remaining = READ_ONCE(kmsg_bytes); unsigned long total = 0; const char *why; unsigned int part = 1; @@ -300,7 +301,7 @@ static void pstore_dump(struct kmsg_dumper *dumper, kmsg_dump_rewind(&iter); oopscount++; - while (total < kmsg_bytes) { + while (total < remaining) { char *dst; size_t dst_size; int header_size; @@ -562,7 +563,7 @@ void pstore_unregister(struct pstore_info *psi) pstore_unregister_kmsg(); /* Stop timer and make sure all work has finished. */ - del_timer_sync(&pstore_timer); + timer_delete_sync(&pstore_timer); flush_work(&pstore_work); /* Remove all backend records from filesystem tree. 
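/*
 * A sketch, separate from the diff: with the fs_context conversion
 * above, kmsg_bytes is an ordinary mount/remount option, so from C the
 * equivalent of 'mount -t pstore -o kmsg_bytes=65536 pstore
 * /sys/fs/pstore' is simply mount(2) with an option string (requires
 * CAP_SYS_ADMIN and a registered pstore backend).
 */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* The data argument is parsed via pstore_parse_param() above. */
	if (mount("pstore", "/sys/fs/pstore", "pstore", 0,
		  "kmsg_bytes=65536") != 0) {
		perror("mount");
		return 1;
	}
	return 0;
}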
*/ diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 825c5c2e0962..df4a9b348769 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -2560,7 +2560,7 @@ int dquot_quota_on_mount(struct super_block *sb, char *qf_name, struct dentry *dentry; int error; - dentry = lookup_positive_unlocked(qf_name, sb->s_root, strlen(qf_name)); + dentry = lookup_noperm_positive_unlocked(&QSTR(qf_name), sb->s_root); if (IS_ERR(dentry)) return PTR_ERR(dentry); diff --git a/fs/ramfs/inode.c b/fs/ramfs/inode.c index 8006faaaf0ec..775fa905fda0 100644 --- a/fs/ramfs/inode.c +++ b/fs/ramfs/inode.c @@ -119,13 +119,13 @@ out: return error; } -static int ramfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *ramfs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { int retval = ramfs_mknod(&nop_mnt_idmap, dir, dentry, mode | S_IFDIR, 0); if (!retval) inc_nlink(dir); - return retval; + return ERR_PTR(retval); } static int ramfs_create(struct mnt_idmap *idmap, struct inode *dir, diff --git a/fs/read_write.c b/fs/read_write.c index a6133241dfb8..0ef70e128c4a 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -169,11 +169,16 @@ generic_file_llseek_size(struct file *file, loff_t offset, int whence, if (whence == SEEK_CUR) { /* - * f_lock protects against read/modify/write race with - * other SEEK_CURs. Note that parallel writes and reads - * behave like SEEK_SET. + * If the file requires locking via f_pos_lock we know + * that mutual exclusion for SEEK_CUR on the same file + * is guaranteed. If the file isn't locked, we take + * f_lock to protect against f_pos races with other + * SEEK_CURs. */ - guard(spinlock)(&file->f_lock); + if (file_seek_cur_needs_f_lock(file)) { + guard(spinlock)(&file->f_lock); + return vfs_setpos(file, file->f_pos + offset, maxsize); + } return vfs_setpos(file, file->f_pos + offset, maxsize); } @@ -327,7 +332,9 @@ loff_t default_llseek(struct file *file, loff_t offset, int whence) struct inode *inode = file_inode(file); loff_t retval; - inode_lock(inode); + retval = inode_lock_killable(inode); + if (retval) + return retval; switch (whence) { case SEEK_END: offset += i_size_read(inode); diff --git a/fs/readdir.c b/fs/readdir.c index 0038efda417b..7764b8638978 100644 --- a/fs/readdir.c +++ b/fs/readdir.c @@ -222,6 +222,7 @@ SYSCALL_DEFINE3(old_readdir, unsigned int, fd, CLASS(fd_pos, f)(fd); struct readdir_callback buf = { .ctx.actor = fillonedir, + .ctx.count = 1, /* Hint to fs: just one entry. */ .dirent = dirent }; @@ -252,7 +253,6 @@ struct getdents_callback { struct dir_context ctx; struct linux_dirent __user * current_dir; int prev_reclen; - int count; int error; }; @@ -266,12 +266,16 @@ static bool filldir(struct dir_context *ctx, const char *name, int namlen, int reclen = ALIGN(offsetof(struct linux_dirent, d_name) + namlen + 2, sizeof(long)); int prev_reclen; + unsigned int flags = d_type; + + BUILD_BUG_ON(FILLDIR_FLAG_NOINTR & S_DT_MASK); + d_type &= S_DT_MASK; buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) return false; buf->error = -EINVAL; /* only used if we fail.. 
*/ - if (reclen > buf->count) + if (reclen > ctx->count) return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { @@ -279,7 +283,7 @@ static bool filldir(struct dir_context *ctx, const char *name, int namlen, return false; } prev_reclen = buf->prev_reclen; - if (prev_reclen && signal_pending(current)) + if (!(flags & FILLDIR_FLAG_NOINTR) && prev_reclen && signal_pending(current)) return false; dirent = buf->current_dir; prev = (void __user *) dirent - prev_reclen; @@ -296,7 +300,7 @@ static bool filldir(struct dir_context *ctx, const char *name, int namlen, buf->current_dir = (void __user *)dirent + reclen; buf->prev_reclen = reclen; - buf->count -= reclen; + ctx->count -= reclen; return true; efault_end: user_write_access_end(); @@ -311,7 +315,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, CLASS(fd_pos, f)(fd); struct getdents_callback buf = { .ctx.actor = filldir, - .count = count, + .ctx.count = count, .current_dir = dirent }; int error; @@ -329,7 +333,7 @@ SYSCALL_DEFINE3(getdents, unsigned int, fd, if (put_user(buf.ctx.pos, &lastdirent->d_off)) error = -EFAULT; else - error = count - buf.count; + error = count - buf.ctx.count; } return error; } @@ -338,7 +342,6 @@ struct getdents_callback64 { struct dir_context ctx; struct linux_dirent64 __user * current_dir; int prev_reclen; - int count; int error; }; @@ -351,15 +354,19 @@ static bool filldir64(struct dir_context *ctx, const char *name, int namlen, int reclen = ALIGN(offsetof(struct linux_dirent64, d_name) + namlen + 1, sizeof(u64)); int prev_reclen; + unsigned int flags = d_type; + + BUILD_BUG_ON(FILLDIR_FLAG_NOINTR & S_DT_MASK); + d_type &= S_DT_MASK; buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) return false; buf->error = -EINVAL; /* only used if we fail.. */ - if (reclen > buf->count) + if (reclen > ctx->count) return false; prev_reclen = buf->prev_reclen; - if (prev_reclen && signal_pending(current)) + if (!(flags & FILLDIR_FLAG_NOINTR) && prev_reclen && signal_pending(current)) return false; dirent = buf->current_dir; prev = (void __user *)dirent - prev_reclen; @@ -376,7 +383,7 @@ static bool filldir64(struct dir_context *ctx, const char *name, int namlen, buf->prev_reclen = reclen; buf->current_dir = (void __user *)dirent + reclen; - buf->count -= reclen; + ctx->count -= reclen; return true; efault_end: @@ -392,7 +399,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, CLASS(fd_pos, f)(fd); struct getdents_callback64 buf = { .ctx.actor = filldir64, - .count = count, + .ctx.count = count, .current_dir = dirent }; int error; @@ -411,7 +418,7 @@ SYSCALL_DEFINE3(getdents64, unsigned int, fd, if (put_user(d_off, &lastdirent->d_off)) error = -EFAULT; else - error = count - buf.count; + error = count - buf.ctx.count; } return error; } @@ -475,6 +482,7 @@ COMPAT_SYSCALL_DEFINE3(old_readdir, unsigned int, fd, CLASS(fd_pos, f)(fd); struct compat_readdir_callback buf = { .ctx.actor = compat_fillonedir, + .ctx.count = 1, /* Hint to fs: just one entry. 
*/ .dirent = dirent }; @@ -499,7 +507,6 @@ struct compat_getdents_callback { struct dir_context ctx; struct compat_linux_dirent __user *current_dir; int prev_reclen; - int count; int error; }; @@ -513,12 +520,16 @@ static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen int reclen = ALIGN(offsetof(struct compat_linux_dirent, d_name) + namlen + 2, sizeof(compat_long_t)); int prev_reclen; + unsigned int flags = d_type; + + BUILD_BUG_ON(FILLDIR_FLAG_NOINTR & S_DT_MASK); + d_type &= S_DT_MASK; buf->error = verify_dirent_name(name, namlen); if (unlikely(buf->error)) return false; buf->error = -EINVAL; /* only used if we fail.. */ - if (reclen > buf->count) + if (reclen > ctx->count) return false; d_ino = ino; if (sizeof(d_ino) < sizeof(ino) && d_ino != ino) { @@ -526,7 +537,7 @@ static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen return false; } prev_reclen = buf->prev_reclen; - if (prev_reclen && signal_pending(current)) + if (!(flags & FILLDIR_FLAG_NOINTR) && prev_reclen && signal_pending(current)) return false; dirent = buf->current_dir; prev = (void __user *) dirent - prev_reclen; @@ -542,7 +553,7 @@ static bool compat_filldir(struct dir_context *ctx, const char *name, int namlen buf->prev_reclen = reclen; buf->current_dir = (void __user *)dirent + reclen; - buf->count -= reclen; + ctx->count -= reclen; return true; efault_end: user_write_access_end(); @@ -557,8 +568,8 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, CLASS(fd_pos, f)(fd); struct compat_getdents_callback buf = { .ctx.actor = compat_filldir, + .ctx.count = count, .current_dir = dirent, - .count = count }; int error; @@ -575,7 +586,7 @@ COMPAT_SYSCALL_DEFINE3(getdents, unsigned int, fd, if (put_user(buf.ctx.pos, &lastdirent->d_off)) error = -EFAULT; else - error = count - buf.count; + error = count - buf.ctx.count; } return error; } diff --git a/fs/resctrl/Kconfig b/fs/resctrl/Kconfig new file mode 100644 index 000000000000..21671301bd8a --- /dev/null +++ b/fs/resctrl/Kconfig @@ -0,0 +1,39 @@ +config RESCTRL_FS + bool "CPU Resource Control Filesystem (resctrl)" + depends on ARCH_HAS_CPU_RESCTRL + select KERNFS + select PROC_CPU_RESCTRL if PROC_FS + help + Some architectures provide hardware facilities to group tasks and + monitor and control their usage of memory system resources such as + caches and memory bandwidth. Examples of such facilities include + Intel's Resource Director Technology (Intel(R) RDT) and AMD's + Platform Quality of Service (AMD QoS). + + If your system has the necessary support and you want to be able to + assign tasks to groups and manipulate the associated resource + monitors and controls from userspace, say Y here to get a mountable + 'resctrl' filesystem that lets you do just that. + + If nothing mounts or prods the 'resctrl' filesystem, resource + controls and monitors are left in a quiescent, permissive state. + + On architectures where this can be disabled independently, it is + safe to say N. + + See <file:Documentation/filesystems/resctrl.rst> for more information. + +config RESCTRL_FS_PSEUDO_LOCK + bool + depends on RESCTRL_FS + help + Software mechanism to pin data in a cache portion using + micro-architecture specific knowledge. + +config RESCTRL_RMID_DEPENDS_ON_CLOSID + bool + depends on RESCTRL_FS + help + Enabled by the architecture when the RMID values depend on the CLOSID. + This causes the CLOSID allocator to search for CLOSID with clean + RMID. 
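As a rough userspace sketch of how the filesystem configured above is consumed (the mount point, group name, and error handling are illustrative assumptions, not part of the patch): mount the filesystem, create a resource group, and move a task into it.

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/mount.h>
#include <sys/stat.h>
#include <unistd.h>

int main(void)
{
	/* Mount the resctrl filesystem; EBUSY means it is already mounted. */
	if (mount("resctrl", "/sys/fs/resctrl", "resctrl", 0, NULL) &&
	    errno != EBUSY) {
		perror("mount");
		return 1;
	}

	/* Each directory under the mount point is a new resource group. */
	if (mkdir("/sys/fs/resctrl/grp1", 0755) && errno != EEXIST) {
		perror("mkdir");
		return 1;
	}

	/* Writing a PID to "tasks" assigns that task to the group. */
	int fd = open("/sys/fs/resctrl/grp1/tasks", O_WRONLY);
	if (fd < 0) {
		perror("open");
		return 1;
	}
	dprintf(fd, "%d\n", getpid());
	close(fd);
	return 0;
}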
diff --git a/fs/resctrl/Makefile b/fs/resctrl/Makefile new file mode 100644 index 000000000000..e67f34d2236a --- /dev/null +++ b/fs/resctrl/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_RESCTRL_FS) += rdtgroup.o ctrlmondata.o monitor.o +obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o + +# To allow define_trace.h's recursive include: +CFLAGS_monitor.o = -I$(src) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c new file mode 100644 index 000000000000..6ed2dfd4dbbd --- /dev/null +++ b/fs/resctrl/ctrlmondata.c @@ -0,0 +1,661 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Resource Director Technology (RDT) + * - Cache Allocation code. + * + * Copyright (C) 2016 Intel Corporation + * + * Authors: + * Fenghua Yu <fenghua.yu@intel.com> + * Tony Luck <tony.luck@intel.com> + * + * More information about RDT can be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cpu.h> +#include <linux/kernfs.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/tick.h> + +#include "internal.h" + +struct rdt_parse_data { + struct rdtgroup *rdtgrp; + char *buf; +}; + +typedef int (ctrlval_parser_t)(struct rdt_parse_data *data, + struct resctrl_schema *s, + struct rdt_ctrl_domain *d); + +/* + * Check whether MBA bandwidth percentage value is correct. The value is + * checked against the minimum and max bandwidth values specified by the + * hardware. The allocated bandwidth percentage is rounded to the next + * control step available on the hardware. + */ +static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) +{ + int ret; + u32 bw; + + /* + * Only linear delay values are supported for current Intel SKUs. + */ + if (!r->membw.delay_linear && r->membw.arch_needs_linear) { + rdt_last_cmd_puts("No support for non-linear MB domains\n"); + return false; + } + + ret = kstrtou32(buf, 10, &bw); + if (ret) { + rdt_last_cmd_printf("Invalid MB value %s\n", buf); + return false; + } + + /* Nothing else to do if software controller is enabled. */ + if (is_mba_sc(r)) { + *data = bw; + return true; + } + + if (bw < r->membw.min_bw || bw > r->membw.max_bw) { + rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n", + bw, r->membw.min_bw, r->membw.max_bw); + return false; + } + + *data = roundup(bw, (unsigned long)r->membw.bw_gran); + return true; +} + +static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, + struct rdt_ctrl_domain *d) +{ + struct resctrl_staged_config *cfg; + u32 closid = data->rdtgrp->closid; + struct rdt_resource *r = s->res; + u32 bw_val; + + cfg = &d->staged_config[s->conf_type]; + if (cfg->have_new_ctrl) { + rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); + return -EINVAL; + } + + if (!bw_validate(data->buf, &bw_val, r)) + return -EINVAL; + + if (is_mba_sc(r)) { + d->mbps_val[closid] = bw_val; + return 0; + } + + cfg->new_ctrl = bw_val; + cfg->have_new_ctrl = true; + + return 0; +} + +/* + * Check whether a cache bit mask is valid. + * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID: + * - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1 + * - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1 + * + * Haswell does not support a non-contiguous 1s value and additionally + * requires at least two bits set. + * AMD allows non-contiguous bitmasks. 
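+ * + * Example (illustrative values): with cbm_len = 8, a mask of 0x3c (one run of four consecutive 1s) is accepted everywhere, while 0x33 (two runs of 1s) is only accepted when arch_has_sparse_bitmasks is set.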
+ */ +static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) +{ + u32 supported_bits = BIT_MASK(r->cache.cbm_len) - 1; + unsigned int cbm_len = r->cache.cbm_len; + unsigned long first_bit, zero_bit, val; + int ret; + + ret = kstrtoul(buf, 16, &val); + if (ret) { + rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf); + return false; + } + + if ((r->cache.min_cbm_bits > 0 && val == 0) || val > supported_bits) { + rdt_last_cmd_puts("Mask out of range\n"); + return false; + } + + first_bit = find_first_bit(&val, cbm_len); + zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); + + /* Are non-contiguous bitmasks allowed? */ + if (!r->cache.arch_has_sparse_bitmasks && + (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { + rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); + return false; + } + + if ((zero_bit - first_bit) < r->cache.min_cbm_bits) { + rdt_last_cmd_printf("Need at least %d bits in the mask\n", + r->cache.min_cbm_bits); + return false; + } + + *data = val; + return true; +} + +/* + * Read one cache bit mask (hex). Check that it is valid for the current + * resource type. + */ +static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, + struct rdt_ctrl_domain *d) +{ + struct rdtgroup *rdtgrp = data->rdtgrp; + struct resctrl_staged_config *cfg; + struct rdt_resource *r = s->res; + u32 cbm_val; + + cfg = &d->staged_config[s->conf_type]; + if (cfg->have_new_ctrl) { + rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); + return -EINVAL; + } + + /* + * Cannot set up more than one pseudo-locked region in a cache + * hierarchy. + */ + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && + rdtgroup_pseudo_locked_in_hierarchy(d)) { + rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n"); + return -EINVAL; + } + + if (!cbm_validate(data->buf, &cbm_val, r)) + return -EINVAL; + + if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || + rdtgrp->mode == RDT_MODE_SHAREABLE) && + rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) { + rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n"); + return -EINVAL; + } + + /* + * The CBM may not overlap with the CBM of another closid if + * either is exclusive. + */ + if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) { + rdt_last_cmd_puts("Overlaps with exclusive group\n"); + return -EINVAL; + } + + if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) { + if (rdtgrp->mode == RDT_MODE_EXCLUSIVE || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + rdt_last_cmd_puts("Overlaps with other group\n"); + return -EINVAL; + } + } + + cfg->new_ctrl = cbm_val; + cfg->have_new_ctrl = true; + + return 0; +} + +/* + * For each domain in this resource we expect to find a series of: + * id=mask + * separated by ";". The "id" is in decimal, and must match one of + * the "id"s for this resource. 
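+ * + * Example (illustrative values): for a cache resource with domains 0 and 1, the line "0=f;1=f0" stages mask 0xf for domain 0 and 0xf0 for domain 1.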
+ */ +static int parse_line(char *line, struct resctrl_schema *s, + struct rdtgroup *rdtgrp) +{ + enum resctrl_conf_type t = s->conf_type; + ctrlval_parser_t *parse_ctrlval = NULL; + struct resctrl_staged_config *cfg; + struct rdt_resource *r = s->res; + struct rdt_parse_data data; + struct rdt_ctrl_domain *d; + char *dom = NULL, *id; + unsigned long dom_id; + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + switch (r->schema_fmt) { + case RESCTRL_SCHEMA_BITMAP: + parse_ctrlval = &parse_cbm; + break; + case RESCTRL_SCHEMA_RANGE: + parse_ctrlval = &parse_bw; + break; + } + + if (WARN_ON_ONCE(!parse_ctrlval)) + return -EINVAL; + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && + (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { + rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); + return -EINVAL; + } + +next: + if (!line || line[0] == '\0') + return 0; + dom = strsep(&line, ";"); + id = strsep(&dom, "="); + if (!dom || kstrtoul(id, 10, &dom_id)) { + rdt_last_cmd_puts("Missing '=' or non-numeric domain\n"); + return -EINVAL; + } + dom = strim(dom); + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + if (d->hdr.id == dom_id) { + data.buf = dom; + data.rdtgrp = rdtgrp; + if (parse_ctrlval(&data, s, d)) + return -EINVAL; + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + cfg = &d->staged_config[t]; + /* + * In pseudo-locking setup mode and just + * parsed a valid CBM that should be + * pseudo-locked. Only one locked region per + * resource group and domain so just do + * the required initialization for single + * region and return. + */ + rdtgrp->plr->s = s; + rdtgrp->plr->d = d; + rdtgrp->plr->cbm = cfg->new_ctrl; + d->plr = rdtgrp->plr; + return 0; + } + goto next; + } + } + return -EINVAL; +} + +static int rdtgroup_parse_resource(char *resname, char *tok, + struct rdtgroup *rdtgrp) +{ + struct resctrl_schema *s; + + list_for_each_entry(s, &resctrl_schema_all, list) { + if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid) + return parse_line(tok, s, rdtgrp); + } + rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname); + return -EINVAL; +} + +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct resctrl_schema *s; + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + char *tok, *resname; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + rdt_last_cmd_clear(); + + /* + * No changes to pseudo-locked region allowed. It has to be removed + * and re-created instead. + */ + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + ret = -EINVAL; + rdt_last_cmd_puts("Resource group is pseudo-locked\n"); + goto out; + } + + rdt_staged_configs_clear(); + + while ((tok = strsep(&buf, "\n")) != NULL) { + resname = strim(strsep(&tok, ":")); + if (!tok) { + rdt_last_cmd_puts("Missing ':'\n"); + ret = -EINVAL; + goto out; + } + if (tok[0] == '\0') { + rdt_last_cmd_printf("Missing '%s' value\n", resname); + ret = -EINVAL; + goto out; + } + ret = rdtgroup_parse_resource(resname, tok, rdtgrp); + if (ret) + goto out; + } + + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; + + /* + * Writes to mba_sc resources update the software controller, + * not the control MSR. 
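+ * + * (parse_bw() staged the value in d->mbps_val[closid]; the mba_sc feedback loop driven from the MBM overflow handler applies it, so there is nothing to write to hardware here.)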
+ */ + if (is_mba_sc(r)) + continue; + + ret = resctrl_arch_update_domains(r, rdtgrp->closid); + if (ret) + goto out; + } + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + /* + * If pseudo-locking fails we keep the resource group in + * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service + * active and updated for just the domain the pseudo-locked + * region was requested for. + */ + ret = rdtgroup_pseudo_lock_create(rdtgrp); + } + +out: + rdt_staged_configs_clear(); + rdtgroup_kn_unlock(of->kn); + return ret ?: nbytes; +} + +static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) +{ + struct rdt_resource *r = schema->res; + struct rdt_ctrl_domain *dom; + bool sep = false; + u32 ctrl_val; + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + seq_printf(s, "%*s:", max_name_width, schema->name); + list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { + if (sep) + seq_puts(s, ";"); + + if (is_mba_sc(r)) + ctrl_val = dom->mbps_val[closid]; + else + ctrl_val = resctrl_arch_get_config(r, dom, closid, + schema->conf_type); + + seq_printf(s, schema->fmt_str, dom->hdr.id, ctrl_val); + sep = true; + } + seq_puts(s, "\n"); +} + +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct resctrl_schema *schema; + struct rdtgroup *rdtgrp; + int ret = 0; + u32 closid; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + list_for_each_entry(schema, &resctrl_schema_all, list) { + seq_printf(s, "%s:uninitialized\n", schema->name); + } + } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + if (!rdtgrp->plr->d) { + rdt_last_cmd_clear(); + rdt_last_cmd_puts("Cache domain offline\n"); + ret = -ENODEV; + } else { + seq_printf(s, "%s:%d=%x\n", + rdtgrp->plr->s->res->name, + rdtgrp->plr->d->hdr.id, + rdtgrp->plr->cbm); + } + } else { + closid = rdtgrp->closid; + list_for_each_entry(schema, &resctrl_schema_all, list) { + if (closid < schema->num_closid) + show_doms(s, schema, closid); + } + } + } else { + ret = -ENOENT; + } + rdtgroup_kn_unlock(of->kn); + return ret; +} + +static int smp_mon_event_count(void *arg) +{ + mon_event_count(arg); + + return 0; +} + +ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + rdt_last_cmd_clear(); + + if (!strcmp(buf, "mbm_local_bytes")) { + if (resctrl_arch_is_mbm_local_enabled()) + rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID; + else + ret = -EINVAL; + } else if (!strcmp(buf, "mbm_total_bytes")) { + if (resctrl_arch_is_mbm_total_enabled()) + rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID; + else + ret = -EINVAL; + } else { + ret = -EINVAL; + } + + if (ret) + rdt_last_cmd_printf("Unsupported event id '%s'\n", buf); + + rdtgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + +int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + if (rdtgrp) { + switch (rdtgrp->mba_mbps_event) { + case QOS_L3_MBM_LOCAL_EVENT_ID: + seq_puts(s, "mbm_local_bytes\n"); + break; + case QOS_L3_MBM_TOTAL_EVENT_ID: + seq_puts(s, 
"mbm_total_bytes\n"); + break; + default: + pr_warn_once("Bad event %d\n", rdtgrp->mba_mbps_event); + ret = -EINVAL; + break; + } + } else { + ret = -ENOENT; + } + + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, + struct list_head **pos) +{ + struct rdt_domain_hdr *d; + struct list_head *l; + + list_for_each(l, h) { + d = list_entry(l, struct rdt_domain_hdr, list); + /* When id is found, return its domain. */ + if (id == d->id) + return d; + /* Stop searching when finding id's position in sorted list. */ + if (id < d->id) + break; + } + + if (pos) + *pos = l; + + return NULL; +} + +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + cpumask_t *cpumask, int evtid, int first) +{ + int cpu; + + /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + /* + * Setup the parameters to pass to mon_event_count() to read the data. + */ + rr->rgrp = rdtgrp; + rr->evtid = evtid; + rr->r = r; + rr->d = d; + rr->first = first; + rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); + if (IS_ERR(rr->arch_mon_ctx)) { + rr->err = -EINVAL; + return; + } + + cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU); + + /* + * cpumask_any_housekeeping() prefers housekeeping CPUs, but + * are all the CPUs nohz_full? If yes, pick a CPU to IPI. + * MPAM's resctrl_arch_rmid_read() is unable to read the + * counters on some platforms if its called in IRQ context. + */ + if (tick_nohz_full_cpu(cpu)) + smp_call_function_any(cpumask, mon_event_count, rr, 1); + else + smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); + + resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); +} + +int rdtgroup_mondata_show(struct seq_file *m, void *arg) +{ + struct kernfs_open_file *of = m->private; + enum resctrl_res_level resid; + enum resctrl_event_id evtid; + struct rdt_domain_hdr *hdr; + struct rmid_read rr = {0}; + struct rdt_mon_domain *d; + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + struct mon_data *md; + int domid, ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto out; + } + + md = of->kn->priv; + if (WARN_ON_ONCE(!md)) { + ret = -EIO; + goto out; + } + + resid = md->rid; + domid = md->domid; + evtid = md->evtid; + r = resctrl_arch_get_resource(resid); + + if (md->sum) { + /* + * This file requires summing across all domains that share + * the L3 cache id that was provided in the "domid" field of the + * struct mon_data. Search all domains in the resource for + * one that matches this cache id. + */ + list_for_each_entry(d, &r->mon_domains, hdr.list) { + if (d->ci->id == domid) { + rr.ci = d->ci; + mon_event_read(&rr, r, NULL, rdtgrp, + &d->ci->shared_cpu_map, evtid, false); + goto checkresult; + } + } + ret = -ENOENT; + goto out; + } else { + /* + * This file provides data from a single domain. Search + * the resource to find the domain with "domid". 
+ */ + hdr = resctrl_find_domain(&r->mon_domains, domid, NULL); + if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) { + ret = -ENOENT; + goto out; + } + d = container_of(hdr, struct rdt_mon_domain, hdr); + mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false); + } + +checkresult: + + if (rr.err == -EIO) + seq_puts(m, "Error\n"); + else if (rr.err == -EINVAL) + seq_puts(m, "Unavailable\n"); + else + seq_printf(m, "%llu\n", rr.val); + +out: + rdtgroup_kn_unlock(of->kn); + return ret; +} diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h new file mode 100644 index 000000000000..9a8cf6f11151 --- /dev/null +++ b/fs/resctrl/internal.h @@ -0,0 +1,426 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FS_RESCTRL_INTERNAL_H +#define _FS_RESCTRL_INTERNAL_H + +#include <linux/resctrl.h> +#include <linux/kernfs.h> +#include <linux/fs_context.h> +#include <linux/tick.h> + +#define CQM_LIMBOCHECK_INTERVAL 1000 + +/** + * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that + * aren't marked nohz_full + * @mask: The mask to pick a CPU from. + * @exclude_cpu: The CPU to avoid picking. + * + * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping + * CPUs that don't use nohz_full, these are preferred. Pass + * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs. + * + * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available. + */ +static inline unsigned int +cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu) +{ + unsigned int cpu; + + /* Try to find a CPU that isn't nohz_full to use in preference */ + if (tick_nohz_full_enabled()) { + cpu = cpumask_any_andnot_but(mask, tick_nohz_full_mask, exclude_cpu); + if (cpu < nr_cpu_ids) + return cpu; + } + + return cpumask_any_but(mask, exclude_cpu); +} + +struct rdt_fs_context { + struct kernfs_fs_context kfc; + bool enable_cdpl2; + bool enable_cdpl3; + bool enable_mba_mbps; + bool enable_debug; +}; + +static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) +{ + struct kernfs_fs_context *kfc = fc->fs_private; + + return container_of(kfc, struct rdt_fs_context, kfc); +} + +/** + * struct mon_evt - Entry in the event list of a resource + * @evtid: event id + * @name: name of the event + * @configurable: true if the event is configurable + * @list: entry in &rdt_resource->evt_list + */ +struct mon_evt { + enum resctrl_event_id evtid; + char *name; + bool configurable; + struct list_head list; +}; + +/** + * struct mon_data - Monitoring details for each event file. + * @list: Member of the global @mon_data_kn_priv_list list. + * @rid: Resource id associated with the event file. + * @evtid: Event id associated with the event file. + * @sum: Set when event must be summed across multiple + * domains. + * @domid: When @sum is zero this is the domain to which + * the event file belongs. When @sum is one this + * is the id of the L3 cache that all domains to be + * summed share. + * + * Pointed to by the kernfs kn->priv field of monitoring event files. + * Readers and writers must hold rdtgroup_mutex. + */ +struct mon_data { + struct list_head list; + enum resctrl_res_level rid; + enum resctrl_event_id evtid; + int domid; + bool sum; +}; + +/** + * struct rmid_read - Data passed across smp_call*() to read event count. + * @rgrp: Resource group for which the counter is being read. If it is a parent + * resource group then its event count is summed with the count from all + * its child resource groups. 
+ * @r: Resource describing the properties of the event being read. + * @d: Domain that the counter should be read from. If NULL then sum all + * domains in @r sharing L3 @ci.id + * @evtid: Which monitor event to read. + * @first: Initialize MBM counter when true. + * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. + * @err: Error encountered when reading counter. + * @val: Returned value of event counter. If @rgrp is a parent resource group, + * @val includes the sum of event counts from its child resource groups. + * If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id, + * (summed across child resource groups if @rgrp is a parent resource group). + * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only). + */ +struct rmid_read { + struct rdtgroup *rgrp; + struct rdt_resource *r; + struct rdt_mon_domain *d; + enum resctrl_event_id evtid; + bool first; + struct cacheinfo *ci; + int err; + u64 val; + void *arch_mon_ctx; +}; + +extern struct list_head resctrl_schema_all; + +extern bool resctrl_mounted; + +enum rdt_group_type { + RDTCTRL_GROUP = 0, + RDTMON_GROUP, + RDT_NUM_GROUP, +}; + +/** + * enum rdtgrp_mode - Mode of a RDT resource group + * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations + * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed + * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking + * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations + * allowed AND the allocations are Cache Pseudo-Locked + * @RDT_NUM_MODES: Total number of modes + * + * The mode of a resource group enables control over the allowed overlap + * between allocations associated with different resource groups (classes + * of service). User is able to modify the mode of a resource group by + * writing to the "mode" resctrl file associated with the resource group. + * + * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by + * writing the appropriate text to the "mode" file. A resource group enters + * "pseudo-locked" mode after the schemata is written while the resource + * group is in "pseudo-locksetup" mode. + */ +enum rdtgrp_mode { + RDT_MODE_SHAREABLE = 0, + RDT_MODE_EXCLUSIVE, + RDT_MODE_PSEUDO_LOCKSETUP, + RDT_MODE_PSEUDO_LOCKED, + + /* Must be last */ + RDT_NUM_MODES, +}; + +/** + * struct mongroup - store mon group's data in resctrl fs. + * @mon_data_kn: kernfs node for the mon_data directory + * @parent: parent rdtgrp + * @crdtgrp_list: child rdtgroup node list + * @rmid: rmid for this rdtgroup + */ +struct mongroup { + struct kernfs_node *mon_data_kn; + struct rdtgroup *parent; + struct list_head crdtgrp_list; + u32 rmid; +}; + +/** + * struct rdtgroup - store rdtgroup's data in resctrl file system. 
+ * @kn: kernfs node + * @rdtgroup_list: linked list for all rdtgroups + * @closid: closid for this rdtgroup + * @cpu_mask: CPUs assigned to this rdtgroup + * @flags: status bits + * @waitcount: how many cpus expect to find this + * group when they acquire rdtgroup_mutex + * @type: indicates type of this rdtgroup - either + * monitor only or ctrl_mon group + * @mon: mongroup related data + * @mode: mode of resource group + * @mba_mbps_event: input monitoring event id when mba_sc is enabled + * @plr: pseudo-locked region + */ +struct rdtgroup { + struct kernfs_node *kn; + struct list_head rdtgroup_list; + u32 closid; + struct cpumask cpu_mask; + int flags; + atomic_t waitcount; + enum rdt_group_type type; + struct mongroup mon; + enum rdtgrp_mode mode; + enum resctrl_event_id mba_mbps_event; + struct pseudo_lock_region *plr; +}; + +/* rdtgroup.flags */ +#define RDT_DELETED 1 + +/* rftype.flags */ +#define RFTYPE_FLAGS_CPUS_LIST 1 + +/* + * Define the file type flags for base and info directories. + */ +#define RFTYPE_INFO BIT(0) + +#define RFTYPE_BASE BIT(1) + +#define RFTYPE_CTRL BIT(4) + +#define RFTYPE_MON BIT(5) + +#define RFTYPE_TOP BIT(6) + +#define RFTYPE_RES_CACHE BIT(8) + +#define RFTYPE_RES_MB BIT(9) + +#define RFTYPE_DEBUG BIT(10) + +#define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) + +#define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) + +#define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) + +#define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) + +#define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_MON) + +/* List of all resource groups */ +extern struct list_head rdt_all_groups; + +extern int max_name_width; + +/** + * struct rftype - describe each file in the resctrl file system + * @name: File name + * @mode: Access mode + * @kf_ops: File operations + * @flags: File specific RFTYPE_FLAGS_* flags + * @fflags: File specific RFTYPE_* flags + * @seq_show: Show content of the file + * @write: Write to the file + */ +struct rftype { + char *name; + umode_t mode; + const struct kernfs_ops *kf_ops; + unsigned long flags; + unsigned long fflags; + + int (*seq_show)(struct kernfs_open_file *of, + struct seq_file *sf, void *v); + /* + * write() is the generic write callback which maps directly to + * kernfs write operation and overrides all other operations. + * Maximum write size is determined by ->max_write_len. 
+ */ + ssize_t (*write)(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +}; + +/** + * struct mbm_state - status for each MBM counter in each domain + * @prev_bw_bytes: Previous bytes value read for bandwidth calculation + * @prev_bw: The most recent bandwidth in MBps + */ +struct mbm_state { + u64 prev_bw_bytes; + u32 prev_bw; +}; + +extern struct mutex rdtgroup_mutex; + +static inline const char *rdt_kn_name(const struct kernfs_node *kn) +{ + return rcu_dereference_check(kn->name, lockdep_is_held(&rdtgroup_mutex)); +} + +extern struct rdtgroup rdtgroup_default; + +extern struct dentry *debugfs_resctrl; + +extern enum resctrl_event_id mba_mbps_default_event; + +void rdt_last_cmd_clear(void); + +void rdt_last_cmd_puts(const char *s); + +__printf(1, 2) +void rdt_last_cmd_printf(const char *fmt, ...); + +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); + +void rdtgroup_kn_unlock(struct kernfs_node *kn); + +int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name); + +int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, + umode_t mask); + +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); + +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); + +ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); + +int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); + +bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, + unsigned long cbm, int closid, bool exclusive); + +unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain *d, + unsigned long cbm); + +enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); + +int rdtgroup_tasks_assigned(struct rdtgroup *r); + +int closids_supported(void); + +void closid_free(int closid); + +int alloc_rmid(u32 closid); + +void free_rmid(u32 closid, u32 rmid); + +void resctrl_mon_resource_exit(void); + +void mon_event_count(void *info); + +int rdtgroup_mondata_show(struct seq_file *m, void *arg); + +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + cpumask_t *cpumask, int evtid, int first); + +int resctrl_mon_resource_init(void); + +void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, + unsigned long delay_ms, + int exclude_cpu); + +void mbm_handle_overflow(struct work_struct *work); + +bool is_mba_sc(struct rdt_resource *r); + +void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, + int exclude_cpu); + +void cqm_handle_limbo(struct work_struct *work); + +bool has_busy_rmid(struct rdt_mon_domain *d); + +void __check_limbo(struct rdt_mon_domain *d, bool force_free); + +void resctrl_file_fflags_init(const char *config, unsigned long fflags); + +void rdt_staged_configs_clear(void); + +bool closid_allocated(unsigned int closid); + +int resctrl_find_cleanest_closid(void); + +#ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK +int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); + +int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); + +bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm); + +bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d); + +int rdt_pseudo_lock_init(void); + +void rdt_pseudo_lock_release(void); + +int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); + +void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); + 
+#else +static inline int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) +{ + return -EOPNOTSUPP; +} + +static inline int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) +{ + return -EOPNOTSUPP; +} + +static inline bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm) +{ + return false; +} + +static inline bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) +{ + return false; +} + +static inline int rdt_pseudo_lock_init(void) { return 0; } +static inline void rdt_pseudo_lock_release(void) { } +static inline int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) +{ + return -EOPNOTSUPP; +} + +static inline void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) { } +#endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */ + +#endif /* _FS_RESCTRL_INTERNAL_H */ diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c new file mode 100644 index 000000000000..bde2801289d3 --- /dev/null +++ b/fs/resctrl/monitor.c @@ -0,0 +1,929 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Resource Director Technology (RDT) + * - Monitoring code + * + * Copyright (C) 2017 Intel Corporation + * + * Author: + * Vikas Shivappa <vikas.shivappa@intel.com> + * + * This replaces the cqm.c based on perf but we reuse a lot of + * code and data structures originally from Peter Zijlstra and Matt Fleming. + * + * More information about RDT can be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#define pr_fmt(fmt) "resctrl: " fmt + +#include <linux/cpu.h> +#include <linux/resctrl.h> +#include <linux/sizes.h> +#include <linux/slab.h> + +#include "internal.h" + +#define CREATE_TRACE_POINTS + +#include "monitor_trace.h" + +/** + * struct rmid_entry - dirty tracking for all RMID. + * @closid: The CLOSID for this entry. + * @rmid: The RMID for this entry. + * @busy: The number of domains with cached data using this RMID. + * @list: Member of the rmid_free_lru list when busy == 0. + * + * Depending on the architecture the correct monitor is accessed using + * both @closid and @rmid, or @rmid only. + * + * Take the rdtgroup_mutex when accessing. + */ +struct rmid_entry { + u32 closid; + u32 rmid; + int busy; + struct list_head list; +}; + +/* + * @rmid_free_lru - A least recently used list of free RMIDs + * These RMIDs are guaranteed to have an occupancy less than the + * threshold occupancy + */ +static LIST_HEAD(rmid_free_lru); + +/* + * @closid_num_dirty_rmid The number of dirty RMID each CLOSID has. + * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined. + * Indexed by CLOSID. Protected by rdtgroup_mutex. + */ +static u32 *closid_num_dirty_rmid; + +/* + * @rmid_limbo_count - count of currently unused but (potentially) + * dirty RMIDs. + * This counts RMIDs that no one is currently using but that + * may have an occupancy value > resctrl_rmid_realloc_threshold. User can + * change the threshold occupancy value. + */ +static unsigned int rmid_limbo_count; + +/* + * @rmid_entry - The entry in the limbo and free lists. + */ +static struct rmid_entry *rmid_ptrs; + +/* + * This is the threshold cache occupancy in bytes at which we will consider an + * RMID available for re-allocation. + */ +unsigned int resctrl_rmid_realloc_threshold; + +/* + * This is the maximum value for the reallocation threshold, in bytes. + */ +unsigned int resctrl_rmid_realloc_limit; + +/* + * x86 and arm64 differ in their handling of monitoring. 
+ * x86's RMID are independent numbers, there is only one source of traffic + * with an RMID value of '1'. + * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of + * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID + * value is no longer unique. + * To account for this, resctrl uses an index. On x86 this is just the RMID, + * on arm64 it encodes the CLOSID and RMID. This gives a unique number. + * + * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code + * must accept an attempt to read every index. + */ +static inline struct rmid_entry *__rmid_entry(u32 idx) +{ + struct rmid_entry *entry; + u32 closid, rmid; + + entry = &rmid_ptrs[idx]; + resctrl_arch_rmid_idx_decode(idx, &closid, &rmid); + + WARN_ON_ONCE(entry->closid != closid); + WARN_ON_ONCE(entry->rmid != rmid); + + return entry; +} + +static void limbo_release_entry(struct rmid_entry *entry) +{ + lockdep_assert_held(&rdtgroup_mutex); + + rmid_limbo_count--; + list_add_tail(&entry->list, &rmid_free_lru); + + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + closid_num_dirty_rmid[entry->closid]--; +} + +/* + * Check the RMIDs that are marked as busy for this domain. If the + * reported LLC occupancy is below the threshold clear the busy bit and + * decrement the count. If the busy count gets to zero on an RMID, we + * free the RMID + */ +void __check_limbo(struct rdt_mon_domain *d, bool force_free) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + struct rmid_entry *entry; + u32 idx, cur_idx = 1; + void *arch_mon_ctx; + bool rmid_dirty; + u64 val = 0; + + arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); + if (IS_ERR(arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(arch_mon_ctx)); + return; + } + + /* + * Skip RMID 0 and start from RMID 1 and check all the RMIDs that + * are marked as busy for occupancy < threshold. If the occupancy + * is less than the threshold decrement the busy counter of the + * RMID and move it to the free list when the counter reaches 0. + */ + for (;;) { + idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx); + if (idx >= idx_limit) + break; + + entry = __rmid_entry(idx); + if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, + QOS_L3_OCCUP_EVENT_ID, &val, + arch_mon_ctx)) { + rmid_dirty = true; + } else { + rmid_dirty = (val >= resctrl_rmid_realloc_threshold); + + /* + * x86's CLOSID and RMID are independent numbers, so the entry's + * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the + * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't + * used to select the configuration. It is thus necessary to track both + * CLOSID and RMID because there may be dependencies between them + * on some architectures. 
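+ * + * (Illustrative: on arm64, mon_hw_id=1 under ctrl_hw_id=2 names a different monitor than mon_hw_id=1 under ctrl_hw_id=3, which is why the tracepoint below reports both IDs.)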
+ */ + trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val); + } + + if (force_free || !rmid_dirty) { + clear_bit(idx, d->rmid_busy_llc); + if (!--entry->busy) + limbo_release_entry(entry); + } + cur_idx = idx + 1; + } + + resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); +} + +bool has_busy_rmid(struct rdt_mon_domain *d) +{ + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + + return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit; +} + +static struct rmid_entry *resctrl_find_free_rmid(u32 closid) +{ + struct rmid_entry *itr; + u32 itr_idx, cmp_idx; + + if (list_empty(&rmid_free_lru)) + return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC); + + list_for_each_entry(itr, &rmid_free_lru, list) { + /* + * Get the index of this free RMID, and the index it would need + * to be if it were used with this CLOSID. + * If the CLOSID is irrelevant on this architecture, the two + * index values are always the same on every entry and thus the + * very first entry will be returned. + */ + itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid); + cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid); + + if (itr_idx == cmp_idx) + return itr; + } + + return ERR_PTR(-ENOSPC); +} + +/** + * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated + * RMID are clean, or the CLOSID that has + * the most clean RMID. + * + * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID + * may not be able to allocate clean RMID. To avoid this the allocator will + * choose the CLOSID with the most clean RMID. + * + * When the CLOSID and RMID are independent numbers, the first free CLOSID will + * be returned. + */ +int resctrl_find_cleanest_closid(void) +{ + u32 cleanest_closid = ~0; + int i = 0; + + lockdep_assert_held(&rdtgroup_mutex); + + if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + return -EIO; + + for (i = 0; i < closids_supported(); i++) { + int num_dirty; + + if (closid_allocated(i)) + continue; + + num_dirty = closid_num_dirty_rmid[i]; + if (num_dirty == 0) + return i; + + if (cleanest_closid == ~0) + cleanest_closid = i; + + if (num_dirty < closid_num_dirty_rmid[cleanest_closid]) + cleanest_closid = i; + } + + if (cleanest_closid == ~0) + return -ENOSPC; + + return cleanest_closid; +} + +/* + * For MPAM the RMID value is not unique, and has to be considered with + * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which + * allows all domains to be managed by a single free list. + * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler. + */ +int alloc_rmid(u32 closid) +{ + struct rmid_entry *entry; + + lockdep_assert_held(&rdtgroup_mutex); + + entry = resctrl_find_free_rmid(closid); + if (IS_ERR(entry)) + return PTR_ERR(entry); + + list_del(&entry->list); + return entry->rmid; +} + +static void add_rmid_to_limbo(struct rmid_entry *entry) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + struct rdt_mon_domain *d; + u32 idx; + + lockdep_assert_held(&rdtgroup_mutex); + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); + + entry->busy = 0; + list_for_each_entry(d, &r->mon_domains, hdr.list) { + /* + * For the first limbo RMID in the domain, + * set up the limbo worker. 
+ */ + if (!has_busy_rmid(d)) + cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL, + RESCTRL_PICK_ANY_CPU); + set_bit(idx, d->rmid_busy_llc); + entry->busy++; + } + + rmid_limbo_count++; + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + closid_num_dirty_rmid[entry->closid]++; +} + +void free_rmid(u32 closid, u32 rmid) +{ + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); + struct rmid_entry *entry; + + lockdep_assert_held(&rdtgroup_mutex); + + /* + * Do not allow the default rmid to be free'd. Comparing by index + * allows architectures that ignore the closid parameter to avoid an + * unnecessary check. + */ + if (!resctrl_arch_mon_capable() || + idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, + RESCTRL_RESERVED_RMID)) + return; + + entry = __rmid_entry(idx); + + if (resctrl_arch_is_llc_occupancy_enabled()) + add_rmid_to_limbo(entry); + else + list_add_tail(&entry->list, &rmid_free_lru); +} + +static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, + u32 rmid, enum resctrl_event_id evtid) +{ + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); + + switch (evtid) { + case QOS_L3_MBM_TOTAL_EVENT_ID: + return &d->mbm_total[idx]; + case QOS_L3_MBM_LOCAL_EVENT_ID: + return &d->mbm_local[idx]; + default: + return NULL; + } +} + +static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) +{ + int cpu = smp_processor_id(); + struct rdt_mon_domain *d; + struct mbm_state *m; + int err, ret; + u64 tval = 0; + + if (rr->first) { + resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); + m = get_mbm_state(rr->d, closid, rmid, rr->evtid); + if (m) + memset(m, 0, sizeof(struct mbm_state)); + return 0; + } + + if (rr->d) { + /* Reading a single domain, must be on a CPU in that domain. */ + if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) + return -EINVAL; + rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); + if (rr->err) + return rr->err; + + rr->val += tval; + + return 0; + } + + /* Summing domains that share a cache, must be on a CPU for that cache. */ + if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) + return -EINVAL; + + /* + * Legacy files must report the sum of an event across all + * domains that share the same L3 cache instance. + * Report success if a read from any domain succeeds, -EINVAL + * (translated to "Unavailable" for user space) if reading from + * all domains fail for any reason. + */ + ret = -EINVAL; + list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { + if (d->ci->id != rr->ci->id) + continue; + err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); + if (!err) { + rr->val += tval; + ret = 0; + } + } + + if (ret) + rr->err = ret; + + return ret; +} + +/* + * mbm_bw_count() - Update bw count from values previously read by + * __mon_event_count(). + * @closid: The closid used to identify the cached mbm_state. + * @rmid: The rmid used to identify the cached mbm_state. + * @rr: The struct rmid_read populated by __mon_event_count(). + * + * Supporting function to calculate the memory bandwidth + * and delta bandwidth in MBps. The chunks value previously read by + * __mon_event_count() is compared with the chunks value from the previous + * invocation. This must be called once per second to maintain values in MBps. 
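+ * + * (Illustrative: if two reads one second apart differ by SZ_1G bytes, prev_bw becomes 1024 MBps.)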
+ */ +static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) +{ + u64 cur_bw, bytes, cur_bytes; + struct mbm_state *m; + + m = get_mbm_state(rr->d, closid, rmid, rr->evtid); + if (WARN_ON_ONCE(!m)) + return; + + cur_bytes = rr->val; + bytes = cur_bytes - m->prev_bw_bytes; + m->prev_bw_bytes = cur_bytes; + + cur_bw = bytes / SZ_1M; + + m->prev_bw = cur_bw; +} + +/* + * This is scheduled by mon_event_read() to read the CQM/MBM counters + * on a domain. + */ +void mon_event_count(void *info) +{ + struct rdtgroup *rdtgrp, *entry; + struct rmid_read *rr = info; + struct list_head *head; + int ret; + + rdtgrp = rr->rgrp; + + ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); + + /* + * For Ctrl groups read data from child monitor groups and + * add them together. Count events which are read successfully. + * Discard the rmid_read's reporting errors. + */ + head = &rdtgrp->mon.crdtgrp_list; + + if (rdtgrp->type == RDTCTRL_GROUP) { + list_for_each_entry(entry, head, mon.crdtgrp_list) { + if (__mon_event_count(entry->closid, entry->mon.rmid, + rr) == 0) + ret = 0; + } + } + + /* + * __mon_event_count() calls for newly created monitor groups may + * report -EINVAL/Unavailable if the monitor hasn't seen any traffic. + * Discard error if any of the monitor event reads succeeded. + */ + if (ret == 0) + rr->err = 0; +} + +static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, + struct rdt_resource *r) +{ + struct rdt_ctrl_domain *d; + + lockdep_assert_cpus_held(); + + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + /* Find the domain that contains this CPU */ + if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) + return d; + } + + return NULL; +} + +/* + * Feedback loop for MBA software controller (mba_sc) + * + * mba_sc is a feedback loop where we periodically read MBM counters and + * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so + * that: + * + * current bandwidth(cur_bw) < user specified bandwidth(user_bw) + * + * This uses the MBM counters to measure the bandwidth and MBA throttle + * MSRs to control the bandwidth for a particular rdtgrp. It builds on the + * fact that resctrl rdtgroups have both monitoring and control. + * + * The frequency of the checks is 1s and we just tag along the MBM overflow + * timer. Having 1s interval makes the calculation of bandwidth simpler. + * + * Although MBA's goal is to restrict the bandwidth to a maximum, there may + * be a need to increase the bandwidth to avoid unnecessarily restricting + * the L2 <-> L3 traffic. + * + * Since MBA controls the L2 external bandwidth whereas MBM measures the + * L3 external bandwidth, the following sequence could lead to such a + * situation. + * + * Consider an rdtgroup which had high L3 <-> memory traffic in initial + * phases -> mba_sc kicks in and reduces bandwidth percentage values -> but + * after some time rdtgroup has mostly L2 <-> L3 traffic. + * + * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its + * throttle MSRs already have low percentage values. To avoid + * unnecessarily restricting such rdtgroups, we also increase the bandwidth. 
+ */ +static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) +{ + u32 closid, rmid, cur_msr_val, new_msr_val; + struct mbm_state *pmbm_data, *cmbm_data; + struct rdt_ctrl_domain *dom_mba; + enum resctrl_event_id evt_id; + struct rdt_resource *r_mba; + struct list_head *head; + struct rdtgroup *entry; + u32 cur_bw, user_bw; + + r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); + evt_id = rgrp->mba_mbps_event; + + closid = rgrp->closid; + rmid = rgrp->mon.rmid; + pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id); + if (WARN_ON_ONCE(!pmbm_data)) + return; + + dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba); + if (!dom_mba) { + pr_warn_once("Failure to get domain for MBA update\n"); + return; + } + + cur_bw = pmbm_data->prev_bw; + user_bw = dom_mba->mbps_val[closid]; + + /* MBA resource doesn't support CDP */ + cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); + + /* + * For Ctrl groups read data from child monitor groups. + */ + head = &rgrp->mon.crdtgrp_list; + list_for_each_entry(entry, head, mon.crdtgrp_list) { + cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id); + if (WARN_ON_ONCE(!cmbm_data)) + return; + cur_bw += cmbm_data->prev_bw; + } + + /* + * Scale up/down the bandwidth linearly for the ctrl group. The + * bandwidth step is the bandwidth granularity specified by the + * hardware. + * Always increase throttling if current bandwidth is above the + * target set by user. + * But avoid thrashing up and down on every poll by checking + * whether a decrease in throttling is likely to push the group + * back over target. E.g. if currently throttling to 30% of bandwidth + * on a system with 10% granularity steps, check whether moving to + * 40% would go past the limit by multiplying current bandwidth by + * "(30 + 10) / 30". + */ + if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) { + new_msr_val = cur_msr_val - r_mba->membw.bw_gran; + } else if (cur_msr_val < MAX_MBA_BW && + (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) { + new_msr_val = cur_msr_val + r_mba->membw.bw_gran; + } else { + return; + } + + resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); +} + +static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid, enum resctrl_event_id evtid) +{ + struct rmid_read rr = {0}; + + rr.r = r; + rr.d = d; + rr.evtid = evtid; + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + if (IS_ERR(rr.arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(rr.arch_mon_ctx)); + return; + } + + __mon_event_count(closid, rmid, &rr); + + /* + * If the software controller is enabled, compute the + * bandwidth for this event id. + */ + if (is_mba_sc(NULL)) + mbm_bw_count(closid, rmid, &rr); + + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); +} + +static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid) +{ + /* + * This is protected from concurrent reads from user as both + * the user and overflow handler hold the global mutex. + */ + if (resctrl_arch_is_mbm_total_enabled()) + mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID); + + if (resctrl_arch_is_mbm_local_enabled()) + mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID); +} + +/* + * Handler to scan the limbo list and move the RMIDs + * to the free list whose occupancy < threshold_occupancy. 
+ */ +void cqm_handle_limbo(struct work_struct *work) +{ + unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); + struct rdt_mon_domain *d; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + d = container_of(work, struct rdt_mon_domain, cqm_limbo.work); + + __check_limbo(d, false); + + if (has_busy_rmid(d)) { + d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, + RESCTRL_PICK_ANY_CPU); + schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo, + delay); + } + + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); +} + +/** + * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this + * domain. + * @dom: The domain the limbo handler should run for. + * @delay_ms: How far in the future the handler should run. + * @exclude_cpu: Which CPU the handler should not run on, + * RESCTRL_PICK_ANY_CPU to pick any CPU. + */ +void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, + int exclude_cpu) +{ + unsigned long delay = msecs_to_jiffies(delay_ms); + int cpu; + + cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); + dom->cqm_work_cpu = cpu; + + if (cpu < nr_cpu_ids) + schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); +} + +void mbm_handle_overflow(struct work_struct *work) +{ + unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); + struct rdtgroup *prgrp, *crgrp; + struct rdt_mon_domain *d; + struct list_head *head; + struct rdt_resource *r; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + /* + * If the filesystem has been unmounted this work no longer needs to + * run. + */ + if (!resctrl_mounted || !resctrl_arch_mon_capable()) + goto out_unlock; + + r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + d = container_of(work, struct rdt_mon_domain, mbm_over.work); + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) + mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); + + if (is_mba_sc(NULL)) + update_mba_bw(prgrp, d); + } + + /* + * Re-check for housekeeping CPUs. This allows the overflow handler to + * move off a nohz_full CPU quickly. + */ + d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, + RESCTRL_PICK_ANY_CPU); + schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); +} + +/** + * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this + * domain. + * @dom: The domain the overflow handler should run for. + * @delay_ms: How far in the future the handler should run. + * @exclude_cpu: Which CPU the handler should not run on, + * RESCTRL_PICK_ANY_CPU to pick any CPU. + */ +void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, + int exclude_cpu) +{ + unsigned long delay = msecs_to_jiffies(delay_ms); + int cpu; + + /* + * When a domain comes online there is no guarantee the filesystem is + * mounted. If not, there is no need to catch counter overflow. 
+ */ + if (!resctrl_mounted || !resctrl_arch_mon_capable()) + return; + cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); + dom->mbm_work_cpu = cpu; + + if (cpu < nr_cpu_ids) + schedule_delayed_work_on(cpu, &dom->mbm_over, delay); +} + +static int dom_data_init(struct rdt_resource *r) +{ + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + u32 num_closid = resctrl_arch_get_num_closid(r); + struct rmid_entry *entry = NULL; + int err = 0, i; + u32 idx; + + mutex_lock(&rdtgroup_mutex); + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + u32 *tmp; + + /* + * If the architecture hasn't provided a sanitised value here, + * this may result in larger arrays than necessary. Resctrl will + * use a smaller system wide value based on the resources in + * use. + */ + tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); + if (!tmp) { + err = -ENOMEM; + goto out_unlock; + } + + closid_num_dirty_rmid = tmp; + } + + rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); + if (!rmid_ptrs) { + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + kfree(closid_num_dirty_rmid); + closid_num_dirty_rmid = NULL; + } + err = -ENOMEM; + goto out_unlock; + } + + for (i = 0; i < idx_limit; i++) { + entry = &rmid_ptrs[i]; + INIT_LIST_HEAD(&entry->list); + + resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid); + list_add_tail(&entry->list, &rmid_free_lru); + } + + /* + * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and + * are always allocated. These are used for the rdtgroup_default + * control group, which will be set up later in resctrl_init(). + */ + idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, + RESCTRL_RESERVED_RMID); + entry = __rmid_entry(idx); + list_del(&entry->list); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return err; +} + +static void dom_data_exit(struct rdt_resource *r) +{ + mutex_lock(&rdtgroup_mutex); + + if (!r->mon_capable) + goto out_unlock; + + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + kfree(closid_num_dirty_rmid); + closid_num_dirty_rmid = NULL; + } + + kfree(rmid_ptrs); + rmid_ptrs = NULL; + +out_unlock: + mutex_unlock(&rdtgroup_mutex); +} + +static struct mon_evt llc_occupancy_event = { + .name = "llc_occupancy", + .evtid = QOS_L3_OCCUP_EVENT_ID, +}; + +static struct mon_evt mbm_total_event = { + .name = "mbm_total_bytes", + .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, +}; + +static struct mon_evt mbm_local_event = { + .name = "mbm_local_bytes", + .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, +}; + +/* + * Initialize the event list for the resource. + * + * Note that MBM events are also part of RDT_RESOURCE_L3 resource + * because as per the SDM the total and local memory bandwidth + * are enumerated as part of L3 monitoring. + */ +static void l3_mon_evt_init(struct rdt_resource *r) +{ + INIT_LIST_HEAD(&r->evt_list); + + if (resctrl_arch_is_llc_occupancy_enabled()) + list_add_tail(&llc_occupancy_event.list, &r->evt_list); + if (resctrl_arch_is_mbm_total_enabled()) + list_add_tail(&mbm_total_event.list, &r->evt_list); + if (resctrl_arch_is_mbm_local_enabled()) + list_add_tail(&mbm_local_event.list, &r->evt_list); +} + +/** + * resctrl_mon_resource_init() - Initialise global monitoring structures. + * + * Allocate and initialise global monitor resources that do not belong to a + * specific domain, i.e. the rmid_ptrs[] used for the limbo and free lists. + * Called once during boot after the struct rdt_resource's have been configured + * but before the filesystem is mounted. 
+ * Resctrl's cpuhp callbacks may be called before this point to bring a domain
+ * online.
+ *
+ * Returns 0 for success, or -ENOMEM.
+ */
+int resctrl_mon_resource_init(void)
+{
+	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+	int ret;
+
+	if (!r->mon_capable)
+		return 0;
+
+	ret = dom_data_init(r);
+	if (ret)
+		return ret;
+
+	l3_mon_evt_init(r);
+
+	if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) {
+		mbm_total_event.configurable = true;
+		resctrl_file_fflags_init("mbm_total_bytes_config",
+					 RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
+	}
+	if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) {
+		mbm_local_event.configurable = true;
+		resctrl_file_fflags_init("mbm_local_bytes_config",
+					 RFTYPE_MON_INFO | RFTYPE_RES_CACHE);
+	}
+
+	if (resctrl_arch_is_mbm_local_enabled())
+		mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID;
+	else if (resctrl_arch_is_mbm_total_enabled())
+		mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID;
+
+	return 0;
+}
+
+void resctrl_mon_resource_exit(void)
+{
+	struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3);
+
+	dom_data_exit(r);
+}
diff --git a/fs/resctrl/monitor_trace.h b/fs/resctrl/monitor_trace.h
new file mode 100644
index 000000000000..fdf49f22576a
--- /dev/null
+++ b/fs/resctrl/monitor_trace.h
@@ -0,0 +1,33 @@
+/* SPDX-License-Identifier: GPL-2.0 */
+#undef TRACE_SYSTEM
+#define TRACE_SYSTEM resctrl
+
+#if !defined(_FS_RESCTRL_MONITOR_TRACE_H) || defined(TRACE_HEADER_MULTI_READ)
+#define _FS_RESCTRL_MONITOR_TRACE_H
+
+#include <linux/tracepoint.h>
+
+TRACE_EVENT(mon_llc_occupancy_limbo,
+	    TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes),
+	    TP_ARGS(ctrl_hw_id, mon_hw_id, domain_id, llc_occupancy_bytes),
+	    TP_STRUCT__entry(__field(u32, ctrl_hw_id)
+			     __field(u32, mon_hw_id)
+			     __field(int, domain_id)
+			     __field(u64, llc_occupancy_bytes)),
+	    TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id;
+			   __entry->mon_hw_id = mon_hw_id;
+			   __entry->domain_id = domain_id;
+			   __entry->llc_occupancy_bytes = llc_occupancy_bytes;),
+	    TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu",
+		      __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id,
+		      __entry->llc_occupancy_bytes)
+	   );
+
+#endif /* _FS_RESCTRL_MONITOR_TRACE_H */
+
+#undef TRACE_INCLUDE_PATH
+#define TRACE_INCLUDE_PATH .
+
+#define TRACE_INCLUDE_FILE monitor_trace
+
+#include <trace/define_trace.h>
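The header above follows the usual define_trace.h pattern: exactly one .c file defines CREATE_TRACE_POINTS before including it, which instantiates the trace_mon_llc_occupancy_limbo() emitter. A sketch of typical usage; the wrapper function and its arguments are illustrative only, not part of the patch:

    #define CREATE_TRACE_POINTS
    #include "monitor_trace.h"

    static void report_limbo_occupancy(u32 ctrl_hw_id, u32 mon_hw_id,
                                       int domain_id, u64 bytes)
    {
            /* Compiles to a patched-out no-op until the event is enabled via
             * /sys/kernel/tracing/events/resctrl/mon_llc_occupancy_limbo/enable.
             */
            trace_mon_llc_occupancy_limbo(ctrl_hw_id, mon_hw_id, domain_id, bytes);
    }

diff --git a/fs/resctrl/pseudo_lock.c b/fs/resctrl/pseudo_lock.c
new file mode 100644
index 000000000000..ccc2f9213b4b
--- /dev/null
+++ b/fs/resctrl/pseudo_lock.c
@@ -0,0 +1,1105 @@
+// SPDX-License-Identifier: GPL-2.0
+/*
+ * Resource Director Technology (RDT)
+ *
+ * Pseudo-locking support built on top of Cache Allocation Technology (CAT)
+ *
+ * Copyright (C) 2018 Intel Corporation
+ *
+ * Author: Reinette Chatre <reinette.chatre@intel.com>
+ */
+
+#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
+
+#include <linux/cacheinfo.h>
+#include <linux/cpu.h>
+#include <linux/cpumask.h>
+#include <linux/debugfs.h>
+#include <linux/kthread.h>
+#include <linux/mman.h>
+#include <linux/pm_qos.h>
+#include <linux/resctrl.h>
+#include <linux/slab.h>
+#include <linux/uaccess.h>
+
+#include "internal.h"
+
+/*
+ * Major number assigned to and shared by all devices exposing
+ * pseudo-locked regions.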
+ */
+static unsigned int pseudo_lock_major;
+
+static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0);
+
+static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode)
+{
+	const struct rdtgroup *rdtgrp;
+
+	rdtgrp = dev_get_drvdata(dev);
+	if (mode)
+		*mode = 0600;
+	guard(mutex)(&rdtgroup_mutex);
+	return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdt_kn_name(rdtgrp->kn));
+}
+
+static const struct class pseudo_lock_class = {
+	.name = "pseudo_lock",
+	.devnode = pseudo_lock_devnode,
+};
+
+/**
+ * pseudo_lock_minor_get - Obtain available minor number
+ * @minor: Pointer to where new minor number will be stored
+ *
+ * A bitmask is used to track available minor numbers. Here the next free
+ * minor number is marked as unavailable and returned.
+ *
+ * Return: 0 on success, <0 on failure.
+ */
+static int pseudo_lock_minor_get(unsigned int *minor)
+{
+	unsigned long first_bit;
+
+	first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS);
+
+	if (first_bit == MINORBITS)
+		return -ENOSPC;
+
+	__clear_bit(first_bit, &pseudo_lock_minor_avail);
+	*minor = first_bit;
+
+	return 0;
+}
+
+/**
+ * pseudo_lock_minor_release - Return minor number to available
+ * @minor: The minor number made available
+ */
+static void pseudo_lock_minor_release(unsigned int minor)
+{
+	__set_bit(minor, &pseudo_lock_minor_avail);
+}
+
+/**
+ * region_find_by_minor - Locate a pseudo-lock region by inode minor number
+ * @minor: The minor number of the device representing pseudo-locked region
+ *
+ * When the character device is accessed we need to determine which
+ * pseudo-locked region it belongs to. This is done by matching the minor
+ * number of the device to the pseudo-locked region to which it belongs.
+ *
+ * Minor numbers are assigned at the time a pseudo-locked region is associated
+ * with a cache instance.
+ *
+ * Return: On success return pointer to resource group owning the pseudo-locked
+ * region, NULL on failure.
+ */
+static struct rdtgroup *region_find_by_minor(unsigned int minor)
+{
+	struct rdtgroup *rdtgrp, *rdtgrp_match = NULL;
+
+	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
+		if (rdtgrp->plr && rdtgrp->plr->minor == minor) {
+			rdtgrp_match = rdtgrp;
+			break;
+		}
+	}
+	return rdtgrp_match;
+}
+
+/**
+ * struct pseudo_lock_pm_req - A power management QoS request list entry
+ * @list:	Entry within the @pm_reqs list for a pseudo-locked region
+ * @req:	PM QoS request
+ */
+struct pseudo_lock_pm_req {
+	struct list_head list;
+	struct dev_pm_qos_request req;
+};
+
+static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr)
+{
+	struct pseudo_lock_pm_req *pm_req, *next;
+
+	list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) {
+		dev_pm_qos_remove_request(&pm_req->req);
+		list_del(&pm_req->list);
+		kfree(pm_req);
+	}
+}
+
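pseudo_lock_cstates_constrain(), whose kernel-doc follows, builds on the per-device PM QoS API whose teardown is shown in pseudo_lock_cstates_relax() above. A minimal sketch of that request/remove pairing for a single CPU; my_req and the function names are hypothetical, while the 30 us resume-latency bound is the value the patch itself uses:

    #include <linux/pm_qos.h>
    #include <linux/cpu.h>

    static struct dev_pm_qos_request my_req;

    static int forbid_deep_idle(int cpu)
    {
            /* Cap resume latency so cpuidle avoids deep C-states on this CPU. */
            return dev_pm_qos_add_request(get_cpu_device(cpu), &my_req,
                                          DEV_PM_QOS_RESUME_LATENCY, 30);
    }

    static void allow_deep_idle(void)
    {
            dev_pm_qos_remove_request(&my_req);
    }

+/**
+ * pseudo_lock_cstates_constrain - Restrict cores from entering C6
+ * @plr: Pseudo-locked region
+ *
+ * To prevent the cache from being affected by power management, entering
+ * C6 has to be avoided. This is accomplished by requesting a latency
+ * requirement lower than the lowest C6 exit latency of all supported
+ * platforms as found in the cpuidle state tables in the intel_idle driver.
+ * At this time it is possible to do so with a single latency requirement
+ * for all supported platforms.
+ *
+ * Since Goldmont is supported, which is affected by X86_BUG_MONITOR,
+ * the ACPI latencies need to be considered while keeping in mind that C2
+ * may be set to map to deeper sleep states.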
In this case the latency + * requirement needs to prevent entering C2 also. + * + * Return: 0 on success, <0 on failure + */ +static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) +{ + struct pseudo_lock_pm_req *pm_req; + int cpu; + int ret; + + for_each_cpu(cpu, &plr->d->hdr.cpu_mask) { + pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL); + if (!pm_req) { + rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n"); + ret = -ENOMEM; + goto out_err; + } + ret = dev_pm_qos_add_request(get_cpu_device(cpu), + &pm_req->req, + DEV_PM_QOS_RESUME_LATENCY, + 30); + if (ret < 0) { + rdt_last_cmd_printf("Failed to add latency req CPU%d\n", + cpu); + kfree(pm_req); + ret = -1; + goto out_err; + } + list_add(&pm_req->list, &plr->pm_reqs); + } + + return 0; + +out_err: + pseudo_lock_cstates_relax(plr); + return ret; +} + +/** + * pseudo_lock_region_clear - Reset pseudo-lock region data + * @plr: pseudo-lock region + * + * All content of the pseudo-locked region is reset - any memory allocated + * freed. + * + * Return: void + */ +static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) +{ + plr->size = 0; + plr->line_size = 0; + kfree(plr->kmem); + plr->kmem = NULL; + plr->s = NULL; + if (plr->d) + plr->d->plr = NULL; + plr->d = NULL; + plr->cbm = 0; + plr->debugfs_dir = NULL; +} + +/** + * pseudo_lock_region_init - Initialize pseudo-lock region information + * @plr: pseudo-lock region + * + * Called after user provided a schemata to be pseudo-locked. From the + * schemata the &struct pseudo_lock_region is on entry already initialized + * with the resource, domain, and capacity bitmask. Here the information + * required for pseudo-locking is deduced from this data and &struct + * pseudo_lock_region initialized further. This information includes: + * - size in bytes of the region to be pseudo-locked + * - cache line size to know the stride with which data needs to be accessed + * to be pseudo-locked + * - a cpu associated with the cache instance on which the pseudo-locking + * flow can be executed + * + * Return: 0 on success, <0 on failure. Descriptive error will be written + * to last_cmd_status buffer. + */ +static int pseudo_lock_region_init(struct pseudo_lock_region *plr) +{ + enum resctrl_scope scope = plr->s->res->ctrl_scope; + struct cacheinfo *ci; + int ret; + + if (WARN_ON_ONCE(scope != RESCTRL_L2_CACHE && scope != RESCTRL_L3_CACHE)) + return -ENODEV; + + /* Pick the first cpu we find that is associated with the cache. */ + plr->cpu = cpumask_first(&plr->d->hdr.cpu_mask); + + if (!cpu_online(plr->cpu)) { + rdt_last_cmd_printf("CPU %u associated with cache not online\n", + plr->cpu); + ret = -ENODEV; + goto out_region; + } + + ci = get_cpu_cacheinfo_level(plr->cpu, scope); + if (ci) { + plr->line_size = ci->coherency_line_size; + plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm); + return 0; + } + + ret = -1; + rdt_last_cmd_puts("Unable to determine cache line size\n"); +out_region: + pseudo_lock_region_clear(plr); + return ret; +} + +/** + * pseudo_lock_init - Initialize a pseudo-lock region + * @rdtgrp: resource group to which new pseudo-locked region will belong + * + * A pseudo-locked region is associated with a resource group. When this + * association is created the pseudo-locked region is initialized. The + * details of the pseudo-locked region are not known at this time so only + * allocation is done and association established. 
+ * + * Return: 0 on success, <0 on failure + */ +static int pseudo_lock_init(struct rdtgroup *rdtgrp) +{ + struct pseudo_lock_region *plr; + + plr = kzalloc(sizeof(*plr), GFP_KERNEL); + if (!plr) + return -ENOMEM; + + init_waitqueue_head(&plr->lock_thread_wq); + INIT_LIST_HEAD(&plr->pm_reqs); + rdtgrp->plr = plr; + return 0; +} + +/** + * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked + * @plr: pseudo-lock region + * + * Initialize the details required to set up the pseudo-locked region and + * allocate the contiguous memory that will be pseudo-locked to the cache. + * + * Return: 0 on success, <0 on failure. Descriptive error will be written + * to last_cmd_status buffer. + */ +static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr) +{ + int ret; + + ret = pseudo_lock_region_init(plr); + if (ret < 0) + return ret; + + /* + * We do not yet support contiguous regions larger than + * KMALLOC_MAX_SIZE. + */ + if (plr->size > KMALLOC_MAX_SIZE) { + rdt_last_cmd_puts("Requested region exceeds maximum size\n"); + ret = -E2BIG; + goto out_region; + } + + plr->kmem = kzalloc(plr->size, GFP_KERNEL); + if (!plr->kmem) { + rdt_last_cmd_puts("Unable to allocate memory\n"); + ret = -ENOMEM; + goto out_region; + } + + ret = 0; + goto out; +out_region: + pseudo_lock_region_clear(plr); +out: + return ret; +} + +/** + * pseudo_lock_free - Free a pseudo-locked region + * @rdtgrp: resource group to which pseudo-locked region belonged + * + * The pseudo-locked region's resources have already been released, or not + * yet created at this point. Now it can be freed and disassociated from the + * resource group. + * + * Return: void + */ +static void pseudo_lock_free(struct rdtgroup *rdtgrp) +{ + pseudo_lock_region_clear(rdtgrp->plr); + kfree(rdtgrp->plr); + rdtgrp->plr = NULL; +} + +/** + * rdtgroup_monitor_in_progress - Test if monitoring in progress + * @rdtgrp: resource group being queried + * + * Return: 1 if monitor groups have been created for this resource + * group, 0 otherwise. + */ +static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp) +{ + return !list_empty(&rdtgrp->mon.crdtgrp_list); +} + +/** + * rdtgroup_locksetup_user_restrict - Restrict user access to group + * @rdtgrp: resource group needing access restricted + * + * A resource group used for cache pseudo-locking cannot have cpus or tasks + * assigned to it. This is communicated to the user by restricting access + * to all the files that can be used to make such changes. + * + * Permissions restored with rdtgroup_locksetup_user_restore() + * + * Return: 0 on success, <0 on failure. If a failure occurs during the + * restriction of access an attempt will be made to restore permissions but + * the state of the mode of these files will be uncertain when a failure + * occurs. 
+ */ +static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) +{ + int ret; + + ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); + if (ret) + return ret; + + ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); + if (ret) + goto err_tasks; + + ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); + if (ret) + goto err_cpus; + + if (resctrl_arch_mon_capable()) { + ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); + if (ret) + goto err_cpus_list; + } + + ret = 0; + goto out; + +err_cpus_list: + rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); +err_cpus: + rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); +err_tasks: + rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); +out: + return ret; +} + +/** + * rdtgroup_locksetup_user_restore - Restore user access to group + * @rdtgrp: resource group needing access restored + * + * Restore all file access previously removed using + * rdtgroup_locksetup_user_restrict() + * + * Return: 0 on success, <0 on failure. If a failure occurs during the + * restoration of access an attempt will be made to restrict permissions + * again but the state of the mode of these files will be uncertain when + * a failure occurs. + */ +static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) +{ + int ret; + + ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); + if (ret) + return ret; + + ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); + if (ret) + goto err_tasks; + + ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); + if (ret) + goto err_cpus; + + if (resctrl_arch_mon_capable()) { + ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); + if (ret) + goto err_cpus_list; + } + + ret = 0; + goto out; + +err_cpus_list: + rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); +err_cpus: + rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); +err_tasks: + rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); +out: + return ret; +} + +/** + * rdtgroup_locksetup_enter - Resource group enters locksetup mode + * @rdtgrp: resource group requested to enter locksetup mode + * + * A resource group enters locksetup mode to reflect that it would be used + * to represent a pseudo-locked region and is in the process of being set + * up to do so. A resource group used for a pseudo-locked region would + * lose the closid associated with it so we cannot allow it to have any + * tasks or cpus assigned nor permit tasks or cpus to be assigned in the + * future. Monitoring of a pseudo-locked region is not allowed either. + * + * The above and more restrictions on a pseudo-locked region are checked + * for and enforced before the resource group enters the locksetup mode. + * + * Returns: 0 if the resource group successfully entered locksetup mode, <0 + * on failure. On failure the last_cmd_status buffer is updated with text to + * communicate details of failure to the user. + */ +int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) +{ + int ret; + + /* + * The default resource group can neither be removed nor lose the + * default closid associated with it. + */ + if (rdtgrp == &rdtgroup_default) { + rdt_last_cmd_puts("Cannot pseudo-lock default group\n"); + return -EINVAL; + } + + /* + * Cache Pseudo-locking not supported when CDP is enabled. + * + * Some things to consider if you would like to enable this + * support (using L3 CDP as example): + * - When CDP is enabled two separate resources are exposed, + * L3DATA and L3CODE, but they are actually on the same cache. 
+	 *   The implication for pseudo-locking is that if a
+	 *   pseudo-locked region is created on a domain of one
+	 *   resource (e.g. L3CODE), then a pseudo-locked region cannot
+	 *   be created on that same domain of the other resource
+	 *   (e.g. L3DATA). This is because the creation of a
+	 *   pseudo-locked region involves a call to wbinvd that will
+	 *   affect all cache allocations on a particular domain.
+	 * - Considering the previous, it may be possible to only
+	 *   expose one of the CDP resources to pseudo-locking and
+	 *   hide the other. For example, we could consider only
+	 *   exposing L3DATA: since the L3 cache is unified it is
+	 *   still possible to place instructions there and execute them.
+	 * - If only one region is exposed to pseudo-locking we should
+	 *   still keep in mind that availability of a portion of cache
+	 *   for pseudo-locking should take into account both resources.
+	 *   Similarly, if a pseudo-locked region is created in one
+	 *   resource, the portion of cache used by it should be made
+	 *   unavailable to all future allocations from both resources.
+	 */
+	if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) ||
+	    resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) {
+		rdt_last_cmd_puts("CDP enabled\n");
+		return -EINVAL;
+	}
+
+	/*
+	 * Not knowing the bits to disable prefetching implies that this
+	 * platform does not support Cache Pseudo-Locking.
+	 */
+	if (resctrl_arch_get_prefetch_disable_bits() == 0) {
+		rdt_last_cmd_puts("Pseudo-locking not supported\n");
+		return -EINVAL;
+	}
+
+	if (rdtgroup_monitor_in_progress(rdtgrp)) {
+		rdt_last_cmd_puts("Monitoring in progress\n");
+		return -EINVAL;
+	}
+
+	if (rdtgroup_tasks_assigned(rdtgrp)) {
+		rdt_last_cmd_puts("Tasks assigned to resource group\n");
+		return -EINVAL;
+	}
+
+	if (!cpumask_empty(&rdtgrp->cpu_mask)) {
+		rdt_last_cmd_puts("CPUs assigned to resource group\n");
+		return -EINVAL;
+	}
+
+	if (rdtgroup_locksetup_user_restrict(rdtgrp)) {
+		rdt_last_cmd_puts("Unable to modify resctrl permissions\n");
+		return -EIO;
+	}
+
+	ret = pseudo_lock_init(rdtgrp);
+	if (ret) {
+		rdt_last_cmd_puts("Unable to init pseudo-lock region\n");
+		goto out_release;
+	}
+
+	/*
+	 * If this system is capable of monitoring, an RMID would have been
+	 * allocated when the control group was created. It is no longer
+	 * needed once this group is used for pseudo-locking. This call is
+	 * safe on platforms not capable of monitoring.
+	 */
+	free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
+
+	ret = 0;
+	goto out;
+
+out_release:
+	rdtgroup_locksetup_user_restore(rdtgrp);
+out:
+	return ret;
+}
+
+/**
+ * rdtgroup_locksetup_exit - Resource group exits locksetup mode
+ * @rdtgrp: resource group
+ *
+ * When a resource group exits locksetup mode the earlier restrictions are
+ * lifted.
+ *
+ * Return: 0 on success, <0 on failure
+ */
+int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp)
+{
+	int ret;
+
+	if (resctrl_arch_mon_capable()) {
+		ret = alloc_rmid(rdtgrp->closid);
+		if (ret < 0) {
+			rdt_last_cmd_puts("Out of RMIDs\n");
+			return ret;
+		}
+		rdtgrp->mon.rmid = ret;
+	}
+
+	ret = rdtgroup_locksetup_user_restore(rdtgrp);
+	if (ret) {
+		free_rmid(rdtgrp->closid, rdtgrp->mon.rmid);
+		return ret;
+	}
+
+	pseudo_lock_free(rdtgrp);
+	return 0;
+}
+
+/**
+ * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked
+ * @d: RDT domain
+ * @cbm: CBM to test
+ *
+ * @d represents a cache instance and @cbm a capacity bitmask that is
+ * considered for it. Determine if @cbm overlaps with any existing
+ * pseudo-locked region on @d.
+ *
+ * @cbm is unsigned long, even if only 32 bits are used, to make the
+ * bitmap functions work correctly.
+ *
+ * Return: true if @cbm overlaps with pseudo-locked region on @d, false
+ * otherwise.
+ */
+bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm)
+{
+	unsigned int cbm_len;
+	unsigned long cbm_b;
+
+	if (d->plr) {
+		cbm_len = d->plr->s->res->cache.cbm_len;
+		cbm_b = d->plr->cbm;
+		if (bitmap_intersects(&cbm, &cbm_b, cbm_len))
+			return true;
+	}
+	return false;
+}
+
+/**
+ * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy
+ * @d: RDT domain under test
+ *
+ * The setup of a pseudo-locked region affects all cache instances within
+ * the hierarchy of the region. It is thus essential to know if any
+ * pseudo-locked regions exist within a cache hierarchy to prevent any
+ * attempts to create new pseudo-locked regions in the same hierarchy.
+ *
+ * Return: true if a pseudo-locked region exists in the hierarchy of @d or
+ *         if it is not possible to test due to a memory allocation failure,
+ *         false otherwise.
+ */
+bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d)
+{
+	struct rdt_ctrl_domain *d_i;
+	cpumask_var_t cpu_with_psl;
+	struct rdt_resource *r;
+	bool ret = false;
+
+	/* Walking r->domains, ensure it can't race with cpuhp */
+	lockdep_assert_cpus_held();
+
+	if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL))
+		return true;
+
+	/*
+	 * First determine which cpus have pseudo-locked regions
+	 * associated with them.
+	 */
+	for_each_alloc_capable_rdt_resource(r) {
+		list_for_each_entry(d_i, &r->ctrl_domains, hdr.list) {
+			if (d_i->plr)
+				cpumask_or(cpu_with_psl, cpu_with_psl,
+					   &d_i->hdr.cpu_mask);
+		}
+	}
+
+	/*
+	 * Next test if new pseudo-locked region would intersect with
+	 * existing region.
+	 */
+	if (cpumask_intersects(&d->hdr.cpu_mask, cpu_with_psl))
+		ret = true;
+
+	free_cpumask_var(cpu_with_psl);
+	return ret;
+}
+
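rdtgroup_cbm_overlaps_pseudo_locked() above reduces overlap detection to bitmap_intersects() on two capacity bitmasks. A self-contained sketch of the same test with illustrative values:

    #include <linux/bitmap.h>

    /* e.g. cbm_a = 0x3c0, cbm_b = 0x0f0, cbm_len = 12 -> true (bits 6-7 shared) */
    static bool cbms_overlap(unsigned long cbm_a, unsigned long cbm_b,
                             unsigned int cbm_len)
    {
            return bitmap_intersects(&cbm_a, &cbm_b, cbm_len);
    }

+/**
+ * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region
+ * @rdtgrp: Resource group to which the pseudo-locked region belongs.
+ * @sel: Selector of which measurement to perform on a pseudo-locked region.
+ *
+ * The measurement of latency to access a pseudo-locked region should be
+ * done from a cpu that is associated with that pseudo-locked region.
+ * Determine which cpu is associated with this region and start a thread on
+ * that cpu to perform the measurement, then wait for that thread to complete.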
+ *
+ * Return: 0 on success, <0 on failure
+ */
+static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel)
+{
+	struct pseudo_lock_region *plr = rdtgrp->plr;
+	struct task_struct *thread;
+	unsigned int cpu;
+	int ret = -1;
+
+	cpus_read_lock();
+	mutex_lock(&rdtgroup_mutex);
+
+	if (rdtgrp->flags & RDT_DELETED) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	if (!plr->d) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	plr->thread_done = 0;
+	cpu = cpumask_first(&plr->d->hdr.cpu_mask);
+	if (!cpu_online(cpu)) {
+		ret = -ENODEV;
+		goto out;
+	}
+
+	plr->cpu = cpu;
+
+	if (sel == 1)
+		thread = kthread_run_on_cpu(resctrl_arch_measure_cycles_lat_fn,
+					    plr, cpu, "pseudo_lock_measure/%u");
+	else if (sel == 2)
+		thread = kthread_run_on_cpu(resctrl_arch_measure_l2_residency,
+					    plr, cpu, "pseudo_lock_measure/%u");
+	else if (sel == 3)
+		thread = kthread_run_on_cpu(resctrl_arch_measure_l3_residency,
+					    plr, cpu, "pseudo_lock_measure/%u");
+	else
+		goto out;
+
+	if (IS_ERR(thread)) {
+		ret = PTR_ERR(thread);
+		goto out;
+	}
+
+	ret = wait_event_interruptible(plr->lock_thread_wq,
+				       plr->thread_done == 1);
+	if (ret < 0)
+		goto out;
+
+	ret = 0;
+
+out:
+	mutex_unlock(&rdtgroup_mutex);
+	cpus_read_unlock();
+	return ret;
+}
+
+static ssize_t pseudo_lock_measure_trigger(struct file *file,
+					   const char __user *user_buf,
+					   size_t count, loff_t *ppos)
+{
+	struct rdtgroup *rdtgrp = file->private_data;
+	size_t buf_size;
+	char buf[32];
+	int ret;
+	int sel;
+
+	buf_size = min(count, (sizeof(buf) - 1));
+	if (copy_from_user(buf, user_buf, buf_size))
+		return -EFAULT;
+
+	buf[buf_size] = '\0';
+	ret = kstrtoint(buf, 10, &sel);
+	if (ret == 0) {
+		if (sel != 1 && sel != 2 && sel != 3)
+			return -EINVAL;
+		ret = debugfs_file_get(file->f_path.dentry);
+		if (ret)
+			return ret;
+		ret = pseudo_lock_measure_cycles(rdtgrp, sel);
+		if (ret == 0)
+			ret = count;
+		debugfs_file_put(file->f_path.dentry);
+	}
+
+	return ret;
+}
+
+static const struct file_operations pseudo_measure_fops = {
+	.write = pseudo_lock_measure_trigger,
+	.open = simple_open,
+	.llseek = default_llseek,
+};
+
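The debugfs trigger above is driven by writing a selector to the pseudo_lock_measure file. A sketch of doing so from user space; the group name "p0" in the path is hypothetical:

    #include <fcntl.h>
    #include <unistd.h>

    static int trigger_latency_measure(void)
    {
            /* "1" = memory latency; "2"/"3" = L2/L3 residency counters,
             * matching what pseudo_lock_measure_trigger() parses. */
            int fd = open("/sys/kernel/debug/resctrl/p0/pseudo_lock_measure",
                          O_WRONLY);

            if (fd < 0)
                    return -1;
            if (write(fd, "1", 1) != 1) {
                    close(fd);
                    return -1;
            }
            return close(fd);
    }

+/**
+ * rdtgroup_pseudo_lock_create - Create a pseudo-locked region
+ * @rdtgrp: resource group to which pseudo-lock region belongs
+ *
+ * Called when a resource group in the pseudo-locksetup mode receives a
+ * valid schemata that should be pseudo-locked. Since the resource group is
+ * in pseudo-locksetup mode the &struct pseudo_lock_region has already been
+ * allocated and initialized with the essential information. If a failure
+ * occurs the resource group remains in the pseudo-locksetup mode with the
+ * &struct pseudo_lock_region associated with it, but cleared from all
+ * information and ready for the user to re-attempt pseudo-locking by
+ * writing the schemata again.
+ *
+ * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0
+ * on failure. Descriptive error will be written to last_cmd_status buffer.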
+ */ +int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) +{ + struct pseudo_lock_region *plr = rdtgrp->plr; + struct task_struct *thread; + unsigned int new_minor; + struct device *dev; + char *kn_name __free(kfree) = NULL; + int ret; + + ret = pseudo_lock_region_alloc(plr); + if (ret < 0) + return ret; + + ret = pseudo_lock_cstates_constrain(plr); + if (ret < 0) { + ret = -EINVAL; + goto out_region; + } + kn_name = kstrdup(rdt_kn_name(rdtgrp->kn), GFP_KERNEL); + if (!kn_name) { + ret = -ENOMEM; + goto out_cstates; + } + + plr->thread_done = 0; + + thread = kthread_run_on_cpu(resctrl_arch_pseudo_lock_fn, plr, + plr->cpu, "pseudo_lock/%u"); + if (IS_ERR(thread)) { + ret = PTR_ERR(thread); + rdt_last_cmd_printf("Locking thread returned error %d\n", ret); + goto out_cstates; + } + + ret = wait_event_interruptible(plr->lock_thread_wq, + plr->thread_done == 1); + if (ret < 0) { + /* + * If the thread does not get on the CPU for whatever + * reason and the process which sets up the region is + * interrupted then this will leave the thread in runnable + * state and once it gets on the CPU it will dereference + * the cleared, but not freed, plr struct resulting in an + * empty pseudo-locking loop. + */ + rdt_last_cmd_puts("Locking thread interrupted\n"); + goto out_cstates; + } + + ret = pseudo_lock_minor_get(&new_minor); + if (ret < 0) { + rdt_last_cmd_puts("Unable to obtain a new minor number\n"); + goto out_cstates; + } + + /* + * Unlock access but do not release the reference. The + * pseudo-locked region will still be here on return. + * + * The mutex has to be released temporarily to avoid a potential + * deadlock with the mm->mmap_lock which is obtained in the + * device_create() and debugfs_create_dir() callpath below as well as + * before the mmap() callback is called. + */ + mutex_unlock(&rdtgroup_mutex); + + if (!IS_ERR_OR_NULL(debugfs_resctrl)) { + plr->debugfs_dir = debugfs_create_dir(kn_name, debugfs_resctrl); + if (!IS_ERR_OR_NULL(plr->debugfs_dir)) + debugfs_create_file("pseudo_lock_measure", 0200, + plr->debugfs_dir, rdtgrp, + &pseudo_measure_fops); + } + + dev = device_create(&pseudo_lock_class, NULL, + MKDEV(pseudo_lock_major, new_minor), + rdtgrp, "%s", kn_name); + + mutex_lock(&rdtgroup_mutex); + + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + rdt_last_cmd_printf("Failed to create character device: %d\n", + ret); + goto out_debugfs; + } + + /* We released the mutex - check if group was removed while we did so */ + if (rdtgrp->flags & RDT_DELETED) { + ret = -ENODEV; + goto out_device; + } + + plr->minor = new_minor; + + rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED; + closid_free(rdtgrp->closid); + rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444); + rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444); + + ret = 0; + goto out; + +out_device: + device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); +out_debugfs: + debugfs_remove_recursive(plr->debugfs_dir); + pseudo_lock_minor_release(new_minor); +out_cstates: + pseudo_lock_cstates_relax(plr); +out_region: + pseudo_lock_region_clear(plr); +out: + return ret; +} + +/** + * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region + * @rdtgrp: resource group to which the pseudo-locked region belongs + * + * The removal of a pseudo-locked region can be initiated when the resource + * group is removed from user space via a "rmdir" from userspace or the + * unmount of the resctrl filesystem. 
On removal the resource group does + * not go back to pseudo-locksetup mode before it is removed, instead it is + * removed directly. There is thus asymmetry with the creation where the + * &struct pseudo_lock_region is removed here while it was not created in + * rdtgroup_pseudo_lock_create(). + * + * Return: void + */ +void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) +{ + struct pseudo_lock_region *plr = rdtgrp->plr; + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + /* + * Default group cannot be a pseudo-locked region so we can + * free closid here. + */ + closid_free(rdtgrp->closid); + goto free; + } + + pseudo_lock_cstates_relax(plr); + debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); + device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); + pseudo_lock_minor_release(plr->minor); + +free: + pseudo_lock_free(rdtgrp); +} + +static int pseudo_lock_dev_open(struct inode *inode, struct file *filp) +{ + struct rdtgroup *rdtgrp; + + mutex_lock(&rdtgroup_mutex); + + rdtgrp = region_find_by_minor(iminor(inode)); + if (!rdtgrp) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + + filp->private_data = rdtgrp; + atomic_inc(&rdtgrp->waitcount); + /* Perform a non-seekable open - llseek is not supported */ + filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); + + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) +{ + struct rdtgroup *rdtgrp; + + mutex_lock(&rdtgroup_mutex); + rdtgrp = filp->private_data; + WARN_ON(!rdtgrp); + if (!rdtgrp) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + filp->private_data = NULL; + atomic_dec(&rdtgrp->waitcount); + mutex_unlock(&rdtgroup_mutex); + return 0; +} + +static int pseudo_lock_dev_mremap(struct vm_area_struct *area) +{ + /* Not supported */ + return -EINVAL; +} + +static const struct vm_operations_struct pseudo_mmap_ops = { + .mremap = pseudo_lock_dev_mremap, +}; + +static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) +{ + unsigned long vsize = vma->vm_end - vma->vm_start; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT; + struct pseudo_lock_region *plr; + struct rdtgroup *rdtgrp; + unsigned long physical; + unsigned long psize; + + mutex_lock(&rdtgroup_mutex); + + rdtgrp = filp->private_data; + WARN_ON(!rdtgrp); + if (!rdtgrp) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + + plr = rdtgrp->plr; + + if (!plr->d) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + + /* + * Task is required to run with affinity to the cpus associated + * with the pseudo-locked region. If this is not the case the task + * may be scheduled elsewhere and invalidate entries in the + * pseudo-locked region. + */ + if (!cpumask_subset(current->cpus_ptr, &plr->d->hdr.cpu_mask)) { + mutex_unlock(&rdtgroup_mutex); + return -EINVAL; + } + + physical = __pa(plr->kmem) >> PAGE_SHIFT; + psize = plr->size - off; + + if (off > plr->size) { + mutex_unlock(&rdtgroup_mutex); + return -ENOSPC; + } + + /* + * Ensure changes are carried directly to the memory being mapped, + * do not allow copy-on-write mapping. 
+	 */
+	if (!(vma->vm_flags & VM_SHARED)) {
+		mutex_unlock(&rdtgroup_mutex);
+		return -EINVAL;
+	}
+
+	if (vsize > psize) {
+		mutex_unlock(&rdtgroup_mutex);
+		return -ENOSPC;
+	}
+
+	memset(plr->kmem + off, 0, vsize);
+
+	if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff,
+			    vsize, vma->vm_page_prot)) {
+		mutex_unlock(&rdtgroup_mutex);
+		return -EAGAIN;
+	}
+	vma->vm_ops = &pseudo_mmap_ops;
+	mutex_unlock(&rdtgroup_mutex);
+	return 0;
+}
+
+static const struct file_operations pseudo_lock_dev_fops = {
+	.owner =	THIS_MODULE,
+	.read =		NULL,
+	.write =	NULL,
+	.open =		pseudo_lock_dev_open,
+	.release =	pseudo_lock_dev_release,
+	.mmap =		pseudo_lock_dev_mmap,
+};
+
+int rdt_pseudo_lock_init(void)
+{
+	int ret;
+
+	ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops);
+	if (ret < 0)
+		return ret;
+
+	pseudo_lock_major = ret;
+
+	ret = class_register(&pseudo_lock_class);
+	if (ret) {
+		unregister_chrdev(pseudo_lock_major, "pseudo_lock");
+		return ret;
+	}
+
+	return 0;
+}
+
+void rdt_pseudo_lock_release(void)
+{
+	class_unregister(&pseudo_lock_class);
+	unregister_chrdev(pseudo_lock_major, "pseudo_lock");
+	pseudo_lock_major = 0;
+}
diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c
new file mode 100644
index 000000000000..cc37f58b47dd
--- /dev/null
+++ b/fs/resctrl/rdtgroup.c
@@ -0,0 +1,4353 @@
+// SPDX-License-Identifier: GPL-2.0-only
+/*
+ * User interface for Resource Allocation in Resource Director Technology (RDT)
+ *
+ * Copyright (C) 2016 Intel Corporation
+ *
+ * Author: Fenghua Yu <fenghua.yu@intel.com>
+ *
+ * More information about RDT can be found in the Intel (R) x86 Architecture
+ * Software Developer Manual.
+ */
+
+#define pr_fmt(fmt)	KBUILD_MODNAME ": " fmt
+
+#include <linux/cpu.h>
+#include <linux/debugfs.h>
+#include <linux/fs.h>
+#include <linux/fs_parser.h>
+#include <linux/sysfs.h>
+#include <linux/kernfs.h>
+#include <linux/resctrl.h>
+#include <linux/seq_buf.h>
+#include <linux/seq_file.h>
+#include <linux/sched/task.h>
+#include <linux/slab.h>
+#include <linux/user_namespace.h>
+
+#include <uapi/linux/magic.h>
+
+#include "internal.h"
+
+/* Mutex to protect rdtgroup access. */
+DEFINE_MUTEX(rdtgroup_mutex);
+
+static struct kernfs_root *rdt_root;
+
+struct rdtgroup rdtgroup_default;
+
+LIST_HEAD(rdt_all_groups);
+
+/* list of entries for the schemata file */
+LIST_HEAD(resctrl_schema_all);
+
+/*
+ * List of struct mon_data containing private data of event files for use by
+ * rdtgroup_mondata_show(). Protected by rdtgroup_mutex.
+ */
+static LIST_HEAD(mon_data_kn_priv_list);
+
+/* The filesystem can only be mounted once. */
+bool resctrl_mounted;
+
+/* Kernel fs node for "info" directory under root */
+static struct kernfs_node *kn_info;
+
+/* Kernel fs node for "mon_groups" directory under root */
+static struct kernfs_node *kn_mongrp;
+
+/* Kernel fs node for "mon_data" directory under root */
+static struct kernfs_node *kn_mondata;
+
+/*
+ * Used to store the max resource name width to display the schemata names in
+ * a tabular format.
+ */
+int max_name_width;
+
+static struct seq_buf last_cmd_status;
+
+static char last_cmd_status_buf[512];
+
+static int rdtgroup_setup_root(struct rdt_fs_context *ctx);
+
+static void rdtgroup_destroy_root(void);
+
+struct dentry *debugfs_resctrl;
+
+/*
+ * Memory bandwidth monitoring event to use for the default CTRL_MON group
+ * and each new CTRL_MON group created by the user.
Only relevant when + * the filesystem is mounted with the "mba_MBps" option so it does not + * matter that it remains uninitialized on systems that do not support + * the "mba_MBps" option. + */ +enum resctrl_event_id mba_mbps_default_event; + +static bool resctrl_debug; + +void rdt_last_cmd_clear(void) +{ + lockdep_assert_held(&rdtgroup_mutex); + seq_buf_clear(&last_cmd_status); +} + +void rdt_last_cmd_puts(const char *s) +{ + lockdep_assert_held(&rdtgroup_mutex); + seq_buf_puts(&last_cmd_status, s); +} + +void rdt_last_cmd_printf(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + lockdep_assert_held(&rdtgroup_mutex); + seq_buf_vprintf(&last_cmd_status, fmt, ap); + va_end(ap); +} + +void rdt_staged_configs_clear(void) +{ + struct rdt_ctrl_domain *dom; + struct rdt_resource *r; + + lockdep_assert_held(&rdtgroup_mutex); + + for_each_alloc_capable_rdt_resource(r) { + list_for_each_entry(dom, &r->ctrl_domains, hdr.list) + memset(dom->staged_config, 0, sizeof(dom->staged_config)); + } +} + +static bool resctrl_is_mbm_enabled(void) +{ + return (resctrl_arch_is_mbm_total_enabled() || + resctrl_arch_is_mbm_local_enabled()); +} + +static bool resctrl_is_mbm_event(int e) +{ + return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && + e <= QOS_L3_MBM_LOCAL_EVENT_ID); +} + +/* + * Trivial allocator for CLOSIDs. Use BITMAP APIs to manipulate a bitmap + * of free CLOSIDs. + * + * Using a global CLOSID across all resources has some advantages and + * some drawbacks: + * + We can simply set current's closid to assign a task to a resource + * group. + * + Context switch code can avoid extra memory references deciding which + * CLOSID to load into the PQR_ASSOC MSR + * - We give up some options in configuring resource groups across multi-socket + * systems. + * - Our choices on how to configure each resource become progressively more + * limited as the number of resources grows. 
+ */
+static unsigned long *closid_free_map;
+
+static int closid_free_map_len;
+
+int closids_supported(void)
+{
+	return closid_free_map_len;
+}
+
+static int closid_init(void)
+{
+	struct resctrl_schema *s;
+	u32 rdt_min_closid = ~0;
+
+	/* Monitor-only platforms still call closid_init() */
+	if (list_empty(&resctrl_schema_all))
+		return 0;
+
+	/* Compute rdt_min_closid across all resources */
+	list_for_each_entry(s, &resctrl_schema_all, list)
+		rdt_min_closid = min(rdt_min_closid, s->num_closid);
+
+	closid_free_map = bitmap_alloc(rdt_min_closid, GFP_KERNEL);
+	if (!closid_free_map)
+		return -ENOMEM;
+	bitmap_fill(closid_free_map, rdt_min_closid);
+
+	/* RESCTRL_RESERVED_CLOSID is always reserved for the default group */
+	__clear_bit(RESCTRL_RESERVED_CLOSID, closid_free_map);
+	closid_free_map_len = rdt_min_closid;
+
+	return 0;
+}
+
+static void closid_exit(void)
+{
+	bitmap_free(closid_free_map);
+	closid_free_map = NULL;
+}
+
+static int closid_alloc(void)
+{
+	int cleanest_closid;
+	u32 closid;
+
+	lockdep_assert_held(&rdtgroup_mutex);
+
+	if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) &&
+	    resctrl_arch_is_llc_occupancy_enabled()) {
+		cleanest_closid = resctrl_find_cleanest_closid();
+		if (cleanest_closid < 0)
+			return cleanest_closid;
+		closid = cleanest_closid;
+	} else {
+		closid = find_first_bit(closid_free_map, closid_free_map_len);
+		if (closid == closid_free_map_len)
+			return -ENOSPC;
+	}
+	__clear_bit(closid, closid_free_map);
+
+	return closid;
+}
+
+void closid_free(int closid)
+{
+	lockdep_assert_held(&rdtgroup_mutex);
+
+	__set_bit(closid, closid_free_map);
+}
+
+/**
+ * closid_allocated - test if provided closid is in use
+ * @closid: closid to be tested
+ *
+ * Return: true if @closid is currently associated with a resource group,
+ * false if @closid is free
+ */
+bool closid_allocated(unsigned int closid)
+{
+	lockdep_assert_held(&rdtgroup_mutex);
+
+	return !test_bit(closid, closid_free_map);
+}
+
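closid_alloc()/closid_free() above implement a classic free-ID bitmap: bit set means free, bit clear means in use, with ID 0 reserved for the default group. A self-contained model under those assumptions; NUM_IDS is hypothetical:

    #include <linux/bitmap.h>
    #include <linux/errno.h>

    #define NUM_IDS 16	/* hypothetical pool size, <= BITS_PER_LONG here */

    /* bit set = ID free; ID 0 stays reserved, as with RESCTRL_RESERVED_CLOSID */
    static unsigned long ids_free = GENMASK(NUM_IDS - 1, 1);

    static int id_alloc(void)
    {
            unsigned long id = find_first_bit(&ids_free, NUM_IDS);

            if (id == NUM_IDS)
                    return -ENOSPC;
            __clear_bit(id, &ids_free);
            return id;
    }

    static void id_free(int id)
    {
            __set_bit(id, &ids_free);
    }

+/**
+ * rdtgroup_mode_by_closid - Return mode of resource group with closid
+ * @closid: closid of the resource group
+ *
+ * Each resource group is associated with a @closid. Here the mode
+ * of a resource group can be queried by searching for it using its closid.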
+ *
+ * Return: mode as &enum rdtgrp_mode of resource group with closid @closid
+ */
+enum rdtgrp_mode rdtgroup_mode_by_closid(int closid)
+{
+	struct rdtgroup *rdtgrp;
+
+	list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) {
+		if (rdtgrp->closid == closid)
+			return rdtgrp->mode;
+	}
+
+	return RDT_NUM_MODES;
+}
+
+static const char * const rdt_mode_str[] = {
+	[RDT_MODE_SHAREABLE]		= "shareable",
+	[RDT_MODE_EXCLUSIVE]		= "exclusive",
+	[RDT_MODE_PSEUDO_LOCKSETUP]	= "pseudo-locksetup",
+	[RDT_MODE_PSEUDO_LOCKED]	= "pseudo-locked",
+};
+
+/**
+ * rdtgroup_mode_str - Return the string representation of mode
+ * @mode: the resource group mode as &enum rdtgroup_mode
+ *
+ * Return: string representation of valid mode, "unknown" otherwise
+ */
+static const char *rdtgroup_mode_str(enum rdtgrp_mode mode)
+{
+	if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES)
+		return "unknown";
+
+	return rdt_mode_str[mode];
+}
+
+/* set uid and gid of rdtgroup dirs and files to that of the creator */
+static int rdtgroup_kn_set_ugid(struct kernfs_node *kn)
+{
+	struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID,
+			       .ia_uid = current_fsuid(),
+			       .ia_gid = current_fsgid(), };
+
+	if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) &&
+	    gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID))
+		return 0;
+
+	return kernfs_setattr(kn, &iattr);
+}
+
+static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft)
+{
+	struct kernfs_node *kn;
+	int ret;
+
+	kn = __kernfs_create_file(parent_kn, rft->name, rft->mode,
+				  GLOBAL_ROOT_UID, GLOBAL_ROOT_GID,
+				  0, rft->kf_ops, rft, NULL, NULL);
+	if (IS_ERR(kn))
+		return PTR_ERR(kn);
+
+	ret = rdtgroup_kn_set_ugid(kn);
+	if (ret) {
+		kernfs_remove(kn);
+		return ret;
+	}
+
+	return 0;
+}
+
+static int rdtgroup_seqfile_show(struct seq_file *m, void *arg)
+{
+	struct kernfs_open_file *of = m->private;
+	struct rftype *rft = of->kn->priv;
+
+	if (rft->seq_show)
+		return rft->seq_show(of, m, arg);
+	return 0;
+}
+
+static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf,
+				   size_t nbytes, loff_t off)
+{
+	struct rftype *rft = of->kn->priv;
+
+	if (rft->write)
+		return rft->write(of, buf, nbytes, off);
+
+	return -EINVAL;
+}
+
+static const struct kernfs_ops rdtgroup_kf_single_ops = {
+	.atomic_write_len	= PAGE_SIZE,
+	.write			= rdtgroup_file_write,
+	.seq_show		= rdtgroup_seqfile_show,
+};
+
+static const struct kernfs_ops kf_mondata_ops = {
+	.atomic_write_len	= PAGE_SIZE,
+	.seq_show		= rdtgroup_mondata_show,
+};
+
+static bool is_cpu_list(struct kernfs_open_file *of)
+{
+	struct rftype *rft = of->kn->priv;
+
+	return rft->flags & RFTYPE_FLAGS_CPUS_LIST;
+}
+
+static int rdtgroup_cpus_show(struct kernfs_open_file *of,
+			      struct seq_file *s, void *v)
+{
+	struct rdtgroup *rdtgrp;
+	struct cpumask *mask;
+	int ret = 0;
+
+	rdtgrp = rdtgroup_kn_lock_live(of->kn);
+
+	if (rdtgrp) {
+		if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) {
+			if (!rdtgrp->plr->d) {
+				rdt_last_cmd_clear();
+				rdt_last_cmd_puts("Cache domain offline\n");
+				ret = -ENODEV;
+			} else {
+				mask = &rdtgrp->plr->d->hdr.cpu_mask;
+				seq_printf(s, is_cpu_list(of) ?
+					   "%*pbl\n" : "%*pb\n",
+					   cpumask_pr_args(mask));
+			}
+		} else {
+			seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n",
+				   cpumask_pr_args(&rdtgrp->cpu_mask));
+		}
+	} else {
+		ret = -ENOENT;
+	}
+	rdtgroup_kn_unlock(of->kn);
+
+	return ret;
+}
+
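rdtgroup_cpus_show() above picks between the two cpumask printf extensions. A small illustration of what each format produces (exact zero-padding and comma-grouping of the bitmask form depend on nr_cpu_ids):

    /*
     * For a group owning CPUs 0-2 and 8:
     *   "cpus"      file: "107"    (hex bitmask, %*pb)
     *   "cpus_list" file: "0-2,8"  (ranges, %*pbl)
     */
    static void show_both_formats(struct seq_file *s, const struct cpumask *mask)
    {
            seq_printf(s, "%*pb\n", cpumask_pr_args(mask));   /* bitmask  */
            seq_printf(s, "%*pbl\n", cpumask_pr_args(mask));  /* CPU list */
    }

+/*
+ * Update the PQR_ASSOC MSR on all cpus in @cpu_mask.
+ *
+ * Per task closids/rmids must have been set up before calling this function.
+ * @r may be NULL.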
+ */ +static void +update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) +{ + struct resctrl_cpu_defaults defaults, *p = NULL; + + if (r) { + defaults.closid = r->closid; + defaults.rmid = r->mon.rmid; + p = &defaults; + } + + on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_closid_rmid, p, 1); +} + +static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask) +{ + struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; + struct list_head *head; + + /* Check whether cpus belong to parent ctrl group */ + cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); + if (!cpumask_empty(tmpmask)) { + rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n"); + return -EINVAL; + } + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (!cpumask_empty(tmpmask)) { + /* Give any dropped cpus to parent rdtgroup */ + cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); + update_closid_rmid(tmpmask, prgrp); + } + + /* + * If we added cpus, remove them from previous group that owned them + * and update per-cpu rmid + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (!cpumask_empty(tmpmask)) { + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + if (crgrp == rdtgrp) + continue; + cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, + tmpmask); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + return 0; +} + +static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) +{ + struct rdtgroup *crgrp; + + cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); + /* update the child mon group masks as well*/ + list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) + cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); +} + +static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask, cpumask_var_t tmpmask1) +{ + struct rdtgroup *r, *crgrp; + struct list_head *head; + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (!cpumask_empty(tmpmask)) { + /* Can't drop from default group */ + if (rdtgrp == &rdtgroup_default) { + rdt_last_cmd_puts("Can't drop CPUs from default group\n"); + return -EINVAL; + } + + /* Give any dropped cpus to rdtgroup_default */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, tmpmask); + update_closid_rmid(tmpmask, &rdtgroup_default); + } + + /* + * If we added cpus, remove them from previous group and + * the prev group's child groups that owned them + * and update per-cpu closid/rmid. + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (!cpumask_empty(tmpmask)) { + list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { + if (r == rdtgrp) + continue; + cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); + if (!cpumask_empty(tmpmask1)) + cpumask_rdtgrp_clear(r, tmpmask1); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + /* + * Clear child mon group masks since there is a new parent mask + * now and update the rmid for the cpus the child lost. 
+ */ + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); + update_closid_rmid(tmpmask, rdtgrp); + cpumask_clear(&crgrp->cpu_mask); + } + + return 0; +} + +static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + cpumask_var_t tmpmask, newmask, tmpmask1; + struct rdtgroup *rdtgrp; + int ret; + + if (!buf) + return -EINVAL; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) { + free_cpumask_var(tmpmask); + return -ENOMEM; + } + if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { + free_cpumask_var(tmpmask); + free_cpumask_var(newmask); + return -ENOMEM; + } + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto unlock; + } + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = -EINVAL; + rdt_last_cmd_puts("Pseudo-locking in progress\n"); + goto unlock; + } + + if (is_cpu_list(of)) + ret = cpulist_parse(buf, newmask); + else + ret = cpumask_parse(buf, newmask); + + if (ret) { + rdt_last_cmd_puts("Bad CPU list/mask\n"); + goto unlock; + } + + /* check that user didn't specify any offline cpus */ + cpumask_andnot(tmpmask, newmask, cpu_online_mask); + if (!cpumask_empty(tmpmask)) { + ret = -EINVAL; + rdt_last_cmd_puts("Can only assign online CPUs\n"); + goto unlock; + } + + if (rdtgrp->type == RDTCTRL_GROUP) + ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); + else if (rdtgrp->type == RDTMON_GROUP) + ret = cpus_mon_write(rdtgrp, newmask, tmpmask); + else + ret = -EINVAL; + +unlock: + rdtgroup_kn_unlock(of->kn); + free_cpumask_var(tmpmask); + free_cpumask_var(newmask); + free_cpumask_var(tmpmask1); + + return ret ?: nbytes; +} + +/** + * rdtgroup_remove - the helper to remove resource group safely + * @rdtgrp: resource group to remove + * + * On resource group creation via a mkdir, an extra kernfs_node reference is + * taken to ensure that the rdtgroup structure remains accessible for the + * rdtgroup_kn_unlock() calls where it is removed. + * + * Drop the extra reference here, then free the rdtgroup structure. + * + * Return: void + */ +static void rdtgroup_remove(struct rdtgroup *rdtgrp) +{ + kernfs_put(rdtgrp->kn); + kfree(rdtgrp); +} + +static void _update_task_closid_rmid(void *task) +{ + /* + * If the task is still current on this CPU, update PQR_ASSOC MSR. + * Otherwise, the MSR is updated when the task is scheduled in. + */ + if (task == current) + resctrl_arch_sched_in(task); +} + +static void update_task_closid_rmid(struct task_struct *t) +{ + if (IS_ENABLED(CONFIG_SMP) && task_curr(t)) + smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1); + else + _update_task_closid_rmid(t); +} + +static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp) +{ + u32 closid, rmid = rdtgrp->mon.rmid; + + if (rdtgrp->type == RDTCTRL_GROUP) + closid = rdtgrp->closid; + else if (rdtgrp->type == RDTMON_GROUP) + closid = rdtgrp->mon.parent->closid; + else + return false; + + return resctrl_arch_match_closid(tsk, closid) && + resctrl_arch_match_rmid(tsk, closid, rmid); +} + +static int __rdtgroup_move_task(struct task_struct *tsk, + struct rdtgroup *rdtgrp) +{ + /* If the task is already in rdtgrp, no need to move the task. */ + if (task_in_rdtgroup(tsk, rdtgrp)) + return 0; + + /* + * Set the task's closid/rmid before the PQR_ASSOC MSR can be + * updated by them. 
+	 *
+	 * For ctrl_mon groups, move both closid and rmid.
+	 * For monitor groups, can move the tasks only from
+	 * their parent CTRL group.
+	 */
+	if (rdtgrp->type == RDTMON_GROUP &&
+	    !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) {
+		rdt_last_cmd_puts("Can't move task to different control group\n");
+		return -EINVAL;
+	}
+
+	if (rdtgrp->type == RDTMON_GROUP)
+		resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid,
+					     rdtgrp->mon.rmid);
+	else
+		resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid,
+					     rdtgrp->mon.rmid);
+
+	/*
+	 * Ensure the task's closid and rmid are written before determining
+	 * whether the task is current, since that check decides whether the
+	 * task needs to be interrupted. This pairs with the full barrier
+	 * between the rq->curr update and resctrl_arch_sched_in() during
+	 * context switch.
+	 */
+	smp_mb();
+
+	/*
+	 * By now, the task's closid and rmid are set. If the task is current
+	 * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource
+	 * group go into effect. If the task is not current, the MSR will be
+	 * updated when the task is scheduled in.
+	 */
+	update_task_closid_rmid(tsk);
+
+	return 0;
+}
+
+static bool is_closid_match(struct task_struct *t, struct rdtgroup *r)
+{
+	return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) &&
+		resctrl_arch_match_closid(t, r->closid));
+}
+
+static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r)
+{
+	return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) &&
+		resctrl_arch_match_rmid(t, r->mon.parent->closid,
+					r->mon.rmid));
+}
+
+/**
+ * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group
+ * @r: Resource group
+ *
+ * Return: 1 if tasks have been assigned to @r, 0 otherwise
+ */
+int rdtgroup_tasks_assigned(struct rdtgroup *r)
+{
+	struct task_struct *p, *t;
+	int ret = 0;
+
+	lockdep_assert_held(&rdtgroup_mutex);
+
+	rcu_read_lock();
+	for_each_process_thread(p, t) {
+		if (is_closid_match(t, r) || is_rmid_match(t, r)) {
+			ret = 1;
+			break;
+		}
+	}
+	rcu_read_unlock();
+
+	return ret;
+}
+
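The smp_mb() in __rdtgroup_move_task() above pairs with the scheduler's barrier so a moved task can never keep running with a stale CLOSID. A toy model of the two racing sides under that assumption; the struct and its fields are simplified stand-ins, not the real task layout:

    #include <linux/compiler.h>
    #include <asm/barrier.h>

    struct toy_task { int closid; int on_cpu; };

    static void mover(struct toy_task *t, int new_closid)
    {
            WRITE_ONCE(t->closid, new_closid);  /* (A)  publish new CLOSID */
            smp_mb();                           /* pairs with scheduler's barrier */
            if (READ_ONCE(t->on_cpu))           /* (B)  is the task running? */
                    ;                           /* then kick its CPU with an IPI */
    }

    static void scheduler(struct toy_task *t)
    {
            WRITE_ONCE(t->on_cpu, 1);           /* (A') task becomes current */
            smp_mb();                           /* pairs with mover's barrier */
            READ_ONCE(t->closid);               /* (B') load CLOSID into PQR_ASSOC */
    }

Because (A)/(B) and (A')/(B') are each separated by a full barrier, at least one side must observe the other's store: either the mover sees the task running and interrupts it, or the scheduler sees the new CLOSID when the task is switched in.

+static int rdtgroup_task_write_permission(struct task_struct *task,
+					  struct kernfs_open_file *of)
+{
+	const struct cred *tcred = get_task_cred(task);
+	const struct cred *cred = current_cred();
+	int ret = 0;
+
+	/*
+	 * Even if we're attaching all tasks in the thread group, we only
+	 * need to check permissions on one of them.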
+ */ + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && + !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) { + rdt_last_cmd_printf("No permission to move task %d\n", task->pid); + ret = -EPERM; + } + + put_cred(tcred); + return ret; +} + +static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, + struct kernfs_open_file *of) +{ + struct task_struct *tsk; + int ret; + + rcu_read_lock(); + if (pid) { + tsk = find_task_by_vpid(pid); + if (!tsk) { + rcu_read_unlock(); + rdt_last_cmd_printf("No task %d\n", pid); + return -ESRCH; + } + } else { + tsk = current; + } + + get_task_struct(tsk); + rcu_read_unlock(); + + ret = rdtgroup_task_write_permission(tsk, of); + if (!ret) + ret = __rdtgroup_move_task(tsk, rdtgrp); + + put_task_struct(tsk); + return ret; +} + +static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + char *pid_str; + int ret = 0; + pid_t pid; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + rdt_last_cmd_clear(); + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = -EINVAL; + rdt_last_cmd_puts("Pseudo-locking in progress\n"); + goto unlock; + } + + while (buf && buf[0] != '\0' && buf[0] != '\n') { + pid_str = strim(strsep(&buf, ",")); + + if (kstrtoint(pid_str, 0, &pid)) { + rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str); + ret = -EINVAL; + break; + } + + if (pid < 0) { + rdt_last_cmd_printf("Invalid pid %d\n", pid); + ret = -EINVAL; + break; + } + + ret = rdtgroup_move_task(pid, rdtgrp, of); + if (ret) { + rdt_last_cmd_printf("Error while processing task %d\n", pid); + break; + } + } + +unlock: + rdtgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + +static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) +{ + struct task_struct *p, *t; + pid_t pid; + + rcu_read_lock(); + for_each_process_thread(p, t) { + if (is_closid_match(t, r) || is_rmid_match(t, r)) { + pid = task_pid_vnr(t); + if (pid) + seq_printf(s, "%d\n", pid); + } + } + rcu_read_unlock(); +} + +static int rdtgroup_tasks_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + show_rdt_tasks(rdtgrp, s); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +static int rdtgroup_closid_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + seq_printf(s, "%u\n", rdtgrp->closid); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +static int rdtgroup_rmid_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + seq_printf(s, "%u\n", rdtgrp->mon.rmid); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +#ifdef CONFIG_PROC_CPU_RESCTRL +/* + * A task can only be part of one resctrl control group and of one monitor + * group which is associated to that control group. + * + * 1) res: + * mon: + * + * resctrl is not available. + * + * 2) res:/ + * mon: + * + * Task is part of the root resctrl control group, and it is not associated + * to any monitor group. + * + * 3) res:/ + * mon:mon0 + * + * Task is part of the root resctrl control group and monitor group mon0. 
+ * + * 4) res:group0 + * mon: + * + * Task is part of resctrl control group group0, and it is not associated + * to any monitor group. + * + * 5) res:group0 + * mon:mon1 + * + * Task is part of resctrl control group group0 and monitor group mon1. + */ +int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk) +{ + struct rdtgroup *rdtg; + int ret = 0; + + mutex_lock(&rdtgroup_mutex); + + /* Return empty if resctrl has not been mounted. */ + if (!resctrl_mounted) { + seq_puts(s, "res:\nmon:\n"); + goto unlock; + } + + list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) { + struct rdtgroup *crg; + + /* + * Task information is only relevant for shareable + * and exclusive groups. + */ + if (rdtg->mode != RDT_MODE_SHAREABLE && + rdtg->mode != RDT_MODE_EXCLUSIVE) + continue; + + if (!resctrl_arch_match_closid(tsk, rdtg->closid)) + continue; + + seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "", + rdt_kn_name(rdtg->kn)); + seq_puts(s, "mon:"); + list_for_each_entry(crg, &rdtg->mon.crdtgrp_list, + mon.crdtgrp_list) { + if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid, + crg->mon.rmid)) + continue; + seq_printf(s, "%s", rdt_kn_name(crg->kn)); + break; + } + seq_putc(s, '\n'); + goto unlock; + } + /* + * The above search should succeed. Otherwise return + * with an error. + */ + ret = -ENOENT; +unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} +#endif + +static int rdt_last_cmd_status_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + int len; + + mutex_lock(&rdtgroup_mutex); + len = seq_buf_used(&last_cmd_status); + if (len) + seq_printf(seq, "%.*s", len, last_cmd_status_buf); + else + seq_puts(seq, "ok\n"); + mutex_unlock(&rdtgroup_mutex); + return 0; +} + +static void *rdt_kn_parent_priv(struct kernfs_node *kn) +{ + /* + * The parent pointer is only valid within RCU section since it can be + * replaced. + */ + guard(rcu)(); + return rcu_dereference(kn->__parent)->priv; +} + +static int rdt_num_closids_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + + seq_printf(seq, "%u\n", s->num_closid); + return 0; +} + +static int rdt_default_ctrl_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r)); + return 0; +} + +static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + seq_printf(seq, "%u\n", r->cache.min_cbm_bits); + return 0; +} + +static int rdt_shareable_bits_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + seq_printf(seq, "%x\n", r->cache.shareable_bits); + return 0; +} + +/* + * rdt_bit_usage_show - Display current usage of resources + * + * A domain is a shared resource that can now be allocated differently. Here + * we display the current regions of the domain as an annotated bitmask. 
+ * For each domain of this resource its allocation bitmask + * is annotated as below to indicate the current usage of the corresponding bit: + * 0 - currently unused + * X - currently available for sharing and used by software and hardware + * H - currently used by hardware only but available for software use + * S - currently used and shareable by software only + * E - currently used exclusively by one resource group + * P - currently pseudo-locked by one resource group + */ +static int rdt_bit_usage_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + /* + * Use unsigned long even though only 32 bits are used to ensure + * test_bit() is used safely. + */ + unsigned long sw_shareable = 0, hw_shareable = 0; + unsigned long exclusive = 0, pseudo_locked = 0; + struct rdt_resource *r = s->res; + struct rdt_ctrl_domain *dom; + int i, hwb, swb, excl, psl; + enum rdtgrp_mode mode; + bool sep = false; + u32 ctrl_val; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + hw_shareable = r->cache.shareable_bits; + list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { + if (sep) + seq_putc(seq, ';'); + sw_shareable = 0; + exclusive = 0; + seq_printf(seq, "%d=", dom->hdr.id); + for (i = 0; i < closids_supported(); i++) { + if (!closid_allocated(i)) + continue; + ctrl_val = resctrl_arch_get_config(r, dom, i, + s->conf_type); + mode = rdtgroup_mode_by_closid(i); + switch (mode) { + case RDT_MODE_SHAREABLE: + sw_shareable |= ctrl_val; + break; + case RDT_MODE_EXCLUSIVE: + exclusive |= ctrl_val; + break; + case RDT_MODE_PSEUDO_LOCKSETUP: + /* + * RDT_MODE_PSEUDO_LOCKSETUP is possible + * here but not included since the CBM + * associated with this CLOSID in this mode + * is not initialized and no task or cpu can be + * assigned this CLOSID. + */ + break; + case RDT_MODE_PSEUDO_LOCKED: + case RDT_NUM_MODES: + WARN(1, + "invalid mode for closid %d\n", i); + break; + } + } + for (i = r->cache.cbm_len - 1; i >= 0; i--) { + pseudo_locked = dom->plr ? 
dom->plr->cbm : 0; + hwb = test_bit(i, &hw_shareable); + swb = test_bit(i, &sw_shareable); + excl = test_bit(i, &exclusive); + psl = test_bit(i, &pseudo_locked); + if (hwb && swb) + seq_putc(seq, 'X'); + else if (hwb && !swb) + seq_putc(seq, 'H'); + else if (!hwb && swb) + seq_putc(seq, 'S'); + else if (excl) + seq_putc(seq, 'E'); + else if (psl) + seq_putc(seq, 'P'); + else /* Unused bits remain */ + seq_putc(seq, '0'); + } + sep = true; + } + seq_putc(seq, '\n'); + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + return 0; +} + +static int rdt_min_bw_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + seq_printf(seq, "%u\n", r->membw.min_bw); + return 0; +} + +static int rdt_num_rmids_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + + seq_printf(seq, "%d\n", r->num_rmid); + + return 0; +} + +static int rdt_mon_features_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + struct mon_evt *mevt; + + list_for_each_entry(mevt, &r->evt_list, list) { + seq_printf(seq, "%s\n", mevt->name); + if (mevt->configurable) + seq_printf(seq, "%s_config\n", mevt->name); + } + + return 0; +} + +static int rdt_bw_gran_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + seq_printf(seq, "%u\n", r->membw.bw_gran); + return 0; +} + +static int rdt_delay_linear_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + seq_printf(seq, "%u\n", r->membw.delay_linear); + return 0; +} + +static int max_threshold_occ_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold); + + return 0; +} + +static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + switch (r->membw.throttle_mode) { + case THREAD_THROTTLE_PER_THREAD: + seq_puts(seq, "per-thread\n"); + return 0; + case THREAD_THROTTLE_MAX: + seq_puts(seq, "max\n"); + return 0; + case THREAD_THROTTLE_UNDEFINED: + seq_puts(seq, "undefined\n"); + return 0; + } + + WARN_ON_ONCE(1); + + return 0; +} + +static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + unsigned int bytes; + int ret; + + ret = kstrtouint(buf, 0, &bytes); + if (ret) + return ret; + + if (bytes > resctrl_rmid_realloc_limit) + return -EINVAL; + + resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes); + + return nbytes; +} + +/* + * rdtgroup_mode_show - Display mode of this resource group + */ +static int rdtgroup_mode_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode)); + + rdtgroup_kn_unlock(of->kn); + return 0; +} + +static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) +{ + switch (my_type) { + case CDP_CODE: + return CDP_DATA; + case CDP_DATA: + return CDP_CODE; + default: + case CDP_NONE: + return 
CDP_NONE; + } +} + +static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks); + + return 0; +} + +/** + * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other + * @r: Resource to which domain instance @d belongs. + * @d: The domain instance for which @closid is being tested. + * @cbm: Capacity bitmask being tested. + * @closid: Intended closid for @cbm. + * @type: CDP type of @r. + * @exclusive: Only check if overlaps with exclusive resource groups + * + * Checks if provided @cbm intended to be used for @closid on domain + * @d overlaps with any other closids or other hardware usage associated + * with this domain. If @exclusive is true then only overlaps with + * resource groups in exclusive mode will be considered. If @exclusive + * is false then overlaps with any resource group or hardware entities + * will be considered. + * + * @cbm is unsigned long, even if only 32 bits are used, to make the + * bitmap functions work correctly. + * + * Return: false if CBM does not overlap, true if it does. + */ +static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d, + unsigned long cbm, int closid, + enum resctrl_conf_type type, bool exclusive) +{ + enum rdtgrp_mode mode; + unsigned long ctrl_b; + int i; + + /* Check for any overlap with regions used by hardware directly */ + if (!exclusive) { + ctrl_b = r->cache.shareable_bits; + if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) + return true; + } + + /* Check for overlap with other resource groups */ + for (i = 0; i < closids_supported(); i++) { + ctrl_b = resctrl_arch_get_config(r, d, i, type); + mode = rdtgroup_mode_by_closid(i); + if (closid_allocated(i) && i != closid && + mode != RDT_MODE_PSEUDO_LOCKSETUP) { + if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) { + if (exclusive) { + if (mode == RDT_MODE_EXCLUSIVE) + return true; + continue; + } + return true; + } + } + } + + return false; +} + +/** + * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware + * @s: Schema for the resource to which domain instance @d belongs. + * @d: The domain instance for which @closid is being tested. + * @cbm: Capacity bitmask being tested. + * @closid: Intended closid for @cbm. + * @exclusive: Only check if overlaps with exclusive resource groups + * + * Resources that can be allocated using a CBM can use the CBM to control + * the overlap of these allocations. rdtgroup_cbm_overlaps() is the test + * for overlap. The overlap test is not limited to the specific resource + * for which the CBM is intended: when dealing with CDP resources that + * share the underlying hardware, the overlap check must also be performed + * on the peer CDP resource sharing that hardware. + * + * Refer to description of __rdtgroup_cbm_overlaps() for the details of the + * overlap test.
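The core of the overlap test above is a bitmap intersection over the low cbm_len bits. A standalone user-space model of that check (illustrative only; the kernel code uses bitmap_intersects() and per-CLOSID configuration lookups):

#include <stdbool.h>
#include <stdio.h>

/* Model of the core check: do two CBMs share any of the low cbm_len bits? */
static bool cbm_intersects(unsigned long a, unsigned long b, unsigned int cbm_len)
{
	unsigned long mask = (cbm_len >= sizeof(unsigned long) * 8) ?
			     ~0UL : (1UL << cbm_len) - 1;

	return (a & b & mask) != 0;
}

int main(void)
{
	/* 0x0f0 and 0x00f are disjoint; 0x0f0 and 0x180 share bit 7. */
	printf("%d\n", cbm_intersects(0x0f0, 0x00f, 12));	/* prints 0 */
	printf("%d\n", cbm_intersects(0x0f0, 0x180, 12));	/* prints 1 */
	return 0;
}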
+ * + * Return: true if CBM overlap detected, false if there is no overlap + */ +bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, + unsigned long cbm, int closid, bool exclusive) +{ + enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); + struct rdt_resource *r = s->res; + + if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type, + exclusive)) + return true; + + if (!resctrl_arch_get_cdp_enabled(r->rid)) + return false; + return __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive); +} + +/** + * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive + * @rdtgrp: Resource group identified through its closid. + * + * An exclusive resource group implies that there should be no sharing of + * its allocated resources. At the time this group is considered to be + * exclusive this test can determine if its current schemata supports this + * setting by testing for overlap with all other resource groups. + * + * Return: true if resource group can be exclusive, false if there is overlap + * with allocations of other resource groups and thus this resource group + * cannot be exclusive. + */ +static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) +{ + int closid = rdtgrp->closid; + struct rdt_ctrl_domain *d; + struct resctrl_schema *s; + struct rdt_resource *r; + bool has_cache = false; + u32 ctrl; + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; + if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) + continue; + has_cache = true; + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + ctrl = resctrl_arch_get_config(r, d, closid, + s->conf_type); + if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) { + rdt_last_cmd_puts("Schemata overlaps\n"); + return false; + } + } + } + + if (!has_cache) { + rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n"); + return false; + } + + return true; +} + +/* + * rdtgroup_mode_write - Modify the resource group's mode + */ +static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + enum rdtgrp_mode mode; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + rdt_last_cmd_clear(); + + mode = rdtgrp->mode; + + if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) || + (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) || + (!strcmp(buf, "pseudo-locksetup") && + mode == RDT_MODE_PSEUDO_LOCKSETUP) || + (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED)) + goto out; + + if (mode == RDT_MODE_PSEUDO_LOCKED) { + rdt_last_cmd_puts("Cannot change pseudo-locked group\n"); + ret = -EINVAL; + goto out; + } + + if (!strcmp(buf, "shareable")) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = rdtgroup_locksetup_exit(rdtgrp); + if (ret) + goto out; + } + rdtgrp->mode = RDT_MODE_SHAREABLE; + } else if (!strcmp(buf, "exclusive")) { + if (!rdtgroup_mode_test_exclusive(rdtgrp)) { + ret = -EINVAL; + goto out; + } + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = rdtgroup_locksetup_exit(rdtgrp); + if (ret) + goto out; + } + rdtgrp->mode = RDT_MODE_EXCLUSIVE; + } else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) && + !strcmp(buf, 
"pseudo-locksetup")) { + ret = rdtgroup_locksetup_enter(rdtgrp); + if (ret) + goto out; + rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP; + } else { + rdt_last_cmd_puts("Unknown or unsupported mode\n"); + ret = -EINVAL; + } + +out: + rdtgroup_kn_unlock(of->kn); + return ret ?: nbytes; +} + +/** + * rdtgroup_cbm_to_size - Translate CBM to size in bytes + * @r: RDT resource to which @d belongs. + * @d: RDT domain instance. + * @cbm: bitmask for which the size should be computed. + * + * The bitmask provided associated with the RDT domain instance @d will be + * translated into how many bytes it represents. The size in bytes is + * computed by first dividing the total cache size by the CBM length to + * determine how many bytes each bit in the bitmask represents. The result + * is multiplied with the number of bits set in the bitmask. + * + * @cbm is unsigned long, even if only 32 bits are used to make the + * bitmap functions work correctly. + */ +unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, + struct rdt_ctrl_domain *d, unsigned long cbm) +{ + unsigned int size = 0; + struct cacheinfo *ci; + int num_b; + + if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE)) + return size; + + num_b = bitmap_weight(&cbm, r->cache.cbm_len); + ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope); + if (ci) + size = ci->size / r->cache.cbm_len * num_b; + + return size; +} + +bool is_mba_sc(struct rdt_resource *r) +{ + if (!r) + r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); + + /* + * The software controller support is only applicable to MBA resource. + * Make sure to check for resource type. + */ + if (r->rid != RDT_RESOURCE_MBA) + return false; + + return r->membw.mba_sc; +} + +/* + * rdtgroup_size_show - Display size in bytes of allocated regions + * + * The "size" file mirrors the layout of the "schemata" file, printing the + * size in bytes of each region instead of the capacity bitmask. 
+ */ +static int rdtgroup_size_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct resctrl_schema *schema; + enum resctrl_conf_type type; + struct rdt_ctrl_domain *d; + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + unsigned int size; + int ret = 0; + u32 closid; + bool sep; + u32 ctrl; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + if (!rdtgrp->plr->d) { + rdt_last_cmd_clear(); + rdt_last_cmd_puts("Cache domain offline\n"); + ret = -ENODEV; + } else { + seq_printf(s, "%*s:", max_name_width, + rdtgrp->plr->s->name); + size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res, + rdtgrp->plr->d, + rdtgrp->plr->cbm); + seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size); + } + goto out; + } + + closid = rdtgrp->closid; + + list_for_each_entry(schema, &resctrl_schema_all, list) { + r = schema->res; + type = schema->conf_type; + sep = false; + seq_printf(s, "%*s:", max_name_width, schema->name); + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + if (sep) + seq_putc(s, ';'); + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + size = 0; + } else { + if (is_mba_sc(r)) + ctrl = d->mbps_val[closid]; + else + ctrl = resctrl_arch_get_config(r, d, + closid, + type); + if (r->rid == RDT_RESOURCE_MBA || + r->rid == RDT_RESOURCE_SMBA) + size = ctrl; + else + size = rdtgroup_cbm_to_size(r, d, ctrl); + } + seq_printf(s, "%d=%u", d->hdr.id, size); + sep = true; + } + seq_putc(s, '\n'); + } + +out: + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +static void mondata_config_read(struct resctrl_mon_config_info *mon_info) +{ + smp_call_function_any(&mon_info->d->hdr.cpu_mask, + resctrl_arch_mon_event_config_read, mon_info, 1); +} + +static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) +{ + struct resctrl_mon_config_info mon_info; + struct rdt_mon_domain *dom; + bool sep = false; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + list_for_each_entry(dom, &r->mon_domains, hdr.list) { + if (sep) + seq_puts(s, ";"); + + memset(&mon_info, 0, sizeof(struct resctrl_mon_config_info)); + mon_info.r = r; + mon_info.d = dom; + mon_info.evtid = evtid; + mondata_config_read(&mon_info); + + seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config); + sep = true; + } + seq_puts(s, "\n"); + + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return 0; +} + +static int mbm_total_bytes_config_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + + mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID); + + return 0; +} + +static int mbm_local_bytes_config_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + + mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID); + + return 0; +} + +static void mbm_config_write_domain(struct rdt_resource *r, + struct rdt_mon_domain *d, u32 evtid, u32 val) +{ + struct resctrl_mon_config_info mon_info = {0}; + + /* + * Read the current config value first. If both are the same then + * no need to write it again. + */ + mon_info.r = r; + mon_info.d = d; + mon_info.evtid = evtid; + mondata_config_read(&mon_info); + if (mon_info.mon_config == val) + return; + + mon_info.mon_config = val; + + /* + * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the + * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE + * are scoped at the domain level. 
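mbm_config_show() above emits one "<domain_id>=0x<config>" pair per domain, separated by ';'. A user-space sketch that parses this format (the info/L3_MON path assumes an L3 resource named "L3" and is illustrative, not part of this patch):

#include <stdio.h>
#include <string.h>

int main(void)
{
	char buf[256], *p, *tok, *save;
	FILE *f = fopen("/sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config", "r");

	if (!f || !fgets(buf, sizeof(buf), f)) {
		perror("mbm_total_bytes_config");
		return 1;
	}
	fclose(f);

	/* mbm_config_show() format, e.g. "0=0x7b;1=0x7b\n" */
	for (p = buf; (tok = strtok_r(p, ";\n", &save)); p = NULL) {
		unsigned long id, val;

		if (sscanf(tok, "%lu=%lx", &id, &val) == 2)
			printf("domain %lu: config 0x%02lx\n", id, val);
	}
	return 0;
}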
Writing any of these MSRs + * on one CPU is observed by all the CPUs in the domain. + */ + smp_call_function_any(&d->hdr.cpu_mask, resctrl_arch_mon_event_config_write, + &mon_info, 1); + + /* + * When an Event Configuration is changed, the bandwidth counters + * for all RMIDs and Events will be cleared by the hardware. The + * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for + * every RMID on the next read to any event for every RMID. + * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62) + * cleared while it is tracked by the hardware. Clear the + * mbm_local and mbm_total counts for all the RMIDs. + */ + resctrl_arch_reset_rmid_all(r, d); +} + +static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) +{ + char *dom_str = NULL, *id_str; + unsigned long dom_id, val; + struct rdt_mon_domain *d; + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + +next: + if (!tok || tok[0] == '\0') + return 0; + + /* Start processing the strings for each domain */ + dom_str = strim(strsep(&tok, ";")); + id_str = strsep(&dom_str, "="); + + if (!id_str || kstrtoul(id_str, 10, &dom_id)) { + rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n"); + return -EINVAL; + } + + if (!dom_str || kstrtoul(dom_str, 16, &val)) { + rdt_last_cmd_puts("Non-numeric event configuration value\n"); + return -EINVAL; + } + + /* Value from user cannot be more than the supported set of events */ + if ((val & r->mbm_cfg_mask) != val) { + rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", + r->mbm_cfg_mask); + return -EINVAL; + } + + list_for_each_entry(d, &r->mon_domains, hdr.list) { + if (d->hdr.id == dom_id) { + mbm_config_write_domain(r, d, evtid, val); + goto next; + } + } + + return -EINVAL; +} + +static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + int ret; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + buf[nbytes - 1] = '\0'; + + ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID); + + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret ?: nbytes; +} + +static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + int ret; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + buf[nbytes - 1] = '\0'; + + ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID); + + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret ?: nbytes; +} + +/* rdtgroup information files for one cache resource. 
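The corresponding write path accepts the same format, and as the handlers above enforce, the input must end with a newline. An illustrative writer (the value 0x30 and the domain ids are placeholders; valid bits are bounded by the resource's mbm_cfg_mask):

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

int main(void)
{
	const char *cfg = "0=0x30;1=0x30\n";	/* trailing '\n' is mandatory */
	int fd = open("/sys/fs/resctrl/info/L3_MON/mbm_total_bytes_config",
		      O_WRONLY);

	if (fd < 0 || write(fd, cfg, strlen(cfg)) < 0) {
		perror("mbm_total_bytes_config");
		return 1;
	}
	close(fd);
	return 0;
}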
*/ +static struct rftype res_common_files[] = { + { + .name = "last_cmd_status", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_last_cmd_status_show, + .fflags = RFTYPE_TOP_INFO, + }, + { + .name = "num_closids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_closids_show, + .fflags = RFTYPE_CTRL_INFO, + }, + { + .name = "mon_features", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_mon_features_show, + .fflags = RFTYPE_MON_INFO, + }, + { + .name = "num_rmids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_rmids_show, + .fflags = RFTYPE_MON_INFO, + }, + { + .name = "cbm_mask", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_default_ctrl_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "min_cbm_bits", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_min_cbm_bits_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "shareable_bits", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_shareable_bits_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "bit_usage", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_bit_usage_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "min_bandwidth", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_min_bw_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, + }, + { + .name = "bandwidth_gran", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_bw_gran_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, + }, + { + .name = "delay_linear", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_delay_linear_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, + }, + /* + * Platform specific which (if any) capabilities are provided by + * thread_throttle_mode. Defer "fflags" initialization to platform + * discovery. 
+ */ + { + .name = "thread_throttle_mode", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_thread_throttle_mode_show, + }, + { + .name = "max_threshold_occupancy", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = max_threshold_occ_write, + .seq_show = max_threshold_occ_show, + .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "mbm_total_bytes_config", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = mbm_total_bytes_config_show, + .write = mbm_total_bytes_config_write, + }, + { + .name = "mbm_local_bytes_config", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = mbm_local_bytes_config_show, + .write = mbm_local_bytes_config_write, + }, + { + .name = "cpus", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "cpus_list", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .flags = RFTYPE_FLAGS_CPUS_LIST, + .fflags = RFTYPE_BASE, + }, + { + .name = "tasks", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_tasks_write, + .seq_show = rdtgroup_tasks_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "mon_hw_id", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdtgroup_rmid_show, + .fflags = RFTYPE_MON_BASE | RFTYPE_DEBUG, + }, + { + .name = "schemata", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_schemata_write, + .seq_show = rdtgroup_schemata_show, + .fflags = RFTYPE_CTRL_BASE, + }, + { + .name = "mba_MBps_event", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_mba_mbps_event_write, + .seq_show = rdtgroup_mba_mbps_event_show, + }, + { + .name = "mode", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_mode_write, + .seq_show = rdtgroup_mode_show, + .fflags = RFTYPE_CTRL_BASE, + }, + { + .name = "size", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdtgroup_size_show, + .fflags = RFTYPE_CTRL_BASE, + }, + { + .name = "sparse_masks", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_has_sparse_bitmasks_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "ctrl_hw_id", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdtgroup_closid_show, + .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG, + }, +}; + +static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) +{ + struct rftype *rfts, *rft; + int ret, len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + lockdep_assert_held(&rdtgroup_mutex); + + if (resctrl_debug) + fflags |= RFTYPE_DEBUG; + + for (rft = rfts; rft < rfts + len; rft++) { + if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { + ret = rdtgroup_add_file(kn, rft); + if (ret) + goto error; + } + } + + return 0; +error: + pr_warn("Failed to add %s, err=%d\n", rft->name, ret); + while (--rft >= rfts) { + if ((fflags & rft->fflags) == rft->fflags) + kernfs_remove_by_name(kn, rft->name); + } + return ret; +} + +static struct rftype *rdtgroup_get_rftype_by_name(const char *name) +{ + struct rftype *rfts, *rft; + int len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + for (rft = rfts; rft < rfts + len; rft++) { + if (!strcmp(rft->name, name)) + return rft; + } + + return NULL; +} + +static void thread_throttle_mode_init(void) +{ + enum membw_throttle_mode throttle_mode = 
THREAD_THROTTLE_UNDEFINED; + struct rdt_resource *r_mba, *r_smba; + + r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); + if (r_mba->alloc_capable && + r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) + throttle_mode = r_mba->membw.throttle_mode; + + r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA); + if (r_smba->alloc_capable && + r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) + throttle_mode = r_smba->membw.throttle_mode; + + if (throttle_mode == THREAD_THROTTLE_UNDEFINED) + return; + + resctrl_file_fflags_init("thread_throttle_mode", + RFTYPE_CTRL_INFO | RFTYPE_RES_MB); +} + +void resctrl_file_fflags_init(const char *config, unsigned long fflags) +{ + struct rftype *rft; + + rft = rdtgroup_get_rftype_by_name(config); + if (rft) + rft->fflags = fflags; +} + +/** + * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file + * @r: The resource group with which the file is associated. + * @name: Name of the file + * + * The permissions of named resctrl file, directory, or link are modified + * to not allow read, write, or execute by any user. + * + * WARNING: This function is intended to communicate to the user that the + * resctrl file has been locked down - that it is not relevant to the + * particular state the system finds itself in. It should not be relied + * on to protect from user access because after the file's permissions + * are restricted the user can still change the permissions using chmod + * from the command line. + * + * Return: 0 on success, <0 on failure. + */ +int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name) +{ + struct iattr iattr = {.ia_valid = ATTR_MODE,}; + struct kernfs_node *kn; + int ret = 0; + + kn = kernfs_find_and_get_ns(r->kn, name, NULL); + if (!kn) + return -ENOENT; + + switch (kernfs_type(kn)) { + case KERNFS_DIR: + iattr.ia_mode = S_IFDIR; + break; + case KERNFS_FILE: + iattr.ia_mode = S_IFREG; + break; + case KERNFS_LINK: + iattr.ia_mode = S_IFLNK; + break; + } + + ret = kernfs_setattr(kn, &iattr); + kernfs_put(kn); + return ret; +} + +/** + * rdtgroup_kn_mode_restore - Restore user access to named resctrl file + * @r: The resource group with which the file is associated. + * @name: Name of the file + * @mask: Mask of permissions that should be restored + * + * Restore the permissions of the named file. If @name is a directory the + * permissions of its parent will be used. + * + * Return: 0 on success, <0 on failure. 
+ */ +int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, + umode_t mask) +{ + struct iattr iattr = {.ia_valid = ATTR_MODE,}; + struct kernfs_node *kn, *parent; + struct rftype *rfts, *rft; + int ret, len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + for (rft = rfts; rft < rfts + len; rft++) { + if (!strcmp(rft->name, name)) + iattr.ia_mode = rft->mode & mask; + } + + kn = kernfs_find_and_get_ns(r->kn, name, NULL); + if (!kn) + return -ENOENT; + + switch (kernfs_type(kn)) { + case KERNFS_DIR: + parent = kernfs_get_parent(kn); + if (parent) { + iattr.ia_mode |= parent->mode; + kernfs_put(parent); + } + iattr.ia_mode |= S_IFDIR; + break; + case KERNFS_FILE: + iattr.ia_mode |= S_IFREG; + break; + case KERNFS_LINK: + iattr.ia_mode |= S_IFLNK; + break; + } + + ret = kernfs_setattr(kn, &iattr); + kernfs_put(kn); + return ret; +} + +static int rdtgroup_mkdir_info_resdir(void *priv, char *name, + unsigned long fflags) +{ + struct kernfs_node *kn_subdir; + int ret; + + kn_subdir = kernfs_create_dir(kn_info, name, + kn_info->mode, priv); + if (IS_ERR(kn_subdir)) + return PTR_ERR(kn_subdir); + + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + return ret; + + ret = rdtgroup_add_files(kn_subdir, fflags); + if (!ret) + kernfs_activate(kn_subdir); + + return ret; +} + +static unsigned long fflags_from_resource(struct rdt_resource *r) +{ + switch (r->rid) { + case RDT_RESOURCE_L3: + case RDT_RESOURCE_L2: + return RFTYPE_RES_CACHE; + case RDT_RESOURCE_MBA: + case RDT_RESOURCE_SMBA: + return RFTYPE_RES_MB; + } + + return WARN_ON_ONCE(1); +} + +static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) +{ + struct resctrl_schema *s; + struct rdt_resource *r; + unsigned long fflags; + char name[32]; + int ret; + + /* create the directory */ + kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); + if (IS_ERR(kn_info)) + return PTR_ERR(kn_info); + + ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO); + if (ret) + goto out_destroy; + + /* loop over enabled controls, these are all alloc_capable */ + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; + fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO; + ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); + if (ret) + goto out_destroy; + } + + for_each_mon_capable_rdt_resource(r) { + fflags = fflags_from_resource(r) | RFTYPE_MON_INFO; + sprintf(name, "%s_MON", r->name); + ret = rdtgroup_mkdir_info_resdir(r, name, fflags); + if (ret) + goto out_destroy; + } + + ret = rdtgroup_kn_set_ugid(kn_info); + if (ret) + goto out_destroy; + + kernfs_activate(kn_info); + + return 0; + +out_destroy: + kernfs_remove(kn_info); + return ret; +} + +static int +mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, + char *name, struct kernfs_node **dest_kn) +{ + struct kernfs_node *kn; + int ret; + + /* create the directory */ + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + if (dest_kn) + *dest_kn = kn; + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + kernfs_activate(kn); + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +static inline bool is_mba_linear(void) +{ + return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear; +} + +static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d) +{ + u32 num_closid = resctrl_arch_get_num_closid(r); + int cpu = cpumask_any(&d->hdr.cpu_mask); + int i; + + d->mbps_val = kcalloc_node(num_closid, 
sizeof(*d->mbps_val), + GFP_KERNEL, cpu_to_node(cpu)); + if (!d->mbps_val) + return -ENOMEM; + + for (i = 0; i < num_closid; i++) + d->mbps_val[i] = MBA_MAX_MBPS; + + return 0; +} + +static void mba_sc_domain_destroy(struct rdt_resource *r, + struct rdt_ctrl_domain *d) +{ + kfree(d->mbps_val); + d->mbps_val = NULL; +} + +/* + * MBA software controller is supported only if + * MBM is supported and MBA is in linear scale, + * and the MBM monitor scope is the same as MBA + * control scope. + */ +static bool supports_mba_mbps(void) +{ + struct rdt_resource *rmbm = resctrl_arch_get_resource(RDT_RESOURCE_L3); + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); + + return (resctrl_is_mbm_enabled() && + r->alloc_capable && is_mba_linear() && + r->ctrl_scope == rmbm->mon_scope); +} + +/* + * Enable or disable the MBA software controller + * which helps user specify bandwidth in MBps. + */ +static int set_mba_sc(bool mba_sc) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); + u32 num_closid = resctrl_arch_get_num_closid(r); + struct rdt_ctrl_domain *d; + unsigned long fflags; + int i; + + if (!supports_mba_mbps() || mba_sc == is_mba_sc(r)) + return -EINVAL; + + r->membw.mba_sc = mba_sc; + + rdtgroup_default.mba_mbps_event = mba_mbps_default_event; + + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + for (i = 0; i < num_closid; i++) + d->mbps_val[i] = MBA_MAX_MBPS; + } + + fflags = mba_sc ? RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0; + resctrl_file_fflags_init("mba_MBps_event", fflags); + + return 0; +} + +/* + * We don't allow rdtgroup directories to be created anywhere + * except the root directory. Thus when looking for the rdtgroup + * structure for a kernfs node we are either looking at a directory, + * in which case the rdtgroup structure is pointed at by the "priv" + * field, otherwise we have a file, and need only look to the parent + * to find the rdtgroup. + */ +static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn) +{ + if (kernfs_type(kn) == KERNFS_DIR) { + /* + * All the resource directories use "kn->priv" + * to point to the "struct rdtgroup" for the + * resource. "info" and its subdirectories don't + * have rdtgroup structures, so return NULL here. + */ + if (kn == kn_info || + rcu_access_pointer(kn->__parent) == kn_info) + return NULL; + else + return kn->priv; + } else { + return rdt_kn_parent_priv(kn); + } +} + +static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn) +{ + atomic_inc(&rdtgrp->waitcount); + kernfs_break_active_protection(kn); +} + +static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn) +{ + if (atomic_dec_and_test(&rdtgrp->waitcount) && + (rdtgrp->flags & RDT_DELETED)) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) + rdtgroup_pseudo_lock_remove(rdtgrp); + kernfs_unbreak_active_protection(kn); + rdtgroup_remove(rdtgrp); + } else { + kernfs_unbreak_active_protection(kn); + } +} + +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) +{ + struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); + + if (!rdtgrp) + return NULL; + + rdtgroup_kn_get(rdtgrp, kn); + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + /* Was this group deleted while we waited? 
*/ + if (rdtgrp->flags & RDT_DELETED) + return NULL; + + return rdtgrp; +} + +void rdtgroup_kn_unlock(struct kernfs_node *kn) +{ + struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); + + if (!rdtgrp) + return; + + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + rdtgroup_kn_put(rdtgrp, kn); +} + +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **mon_data_kn); + +static void rdt_disable_ctx(void) +{ + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); + set_mba_sc(false); + + resctrl_debug = false; +} + +static int rdt_enable_ctx(struct rdt_fs_context *ctx) +{ + int ret = 0; + + if (ctx->enable_cdpl2) { + ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true); + if (ret) + goto out_done; + } + + if (ctx->enable_cdpl3) { + ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true); + if (ret) + goto out_cdpl2; + } + + if (ctx->enable_mba_mbps) { + ret = set_mba_sc(true); + if (ret) + goto out_cdpl3; + } + + if (ctx->enable_debug) + resctrl_debug = true; + + return 0; + +out_cdpl3: + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); +out_cdpl2: + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); +out_done: + return ret; +} + +static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type) +{ + struct resctrl_schema *s; + const char *suffix = ""; + int ret, cl; + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + s->res = r; + s->num_closid = resctrl_arch_get_num_closid(r); + if (resctrl_arch_get_cdp_enabled(r->rid)) + s->num_closid /= 2; + + s->conf_type = type; + switch (type) { + case CDP_CODE: + suffix = "CODE"; + break; + case CDP_DATA: + suffix = "DATA"; + break; + case CDP_NONE: + suffix = ""; + break; + } + + ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix); + if (ret >= sizeof(s->name)) { + kfree(s); + return -EINVAL; + } + + cl = strlen(s->name); + + /* + * If CDP is supported by this resource, but not enabled, + * include the suffix. This ensures the tabular format of the + * schemata file does not change between mounts of the filesystem. + */ + if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid)) + cl += 4; + + if (cl > max_name_width) + max_name_width = cl; + + switch (r->schema_fmt) { + case RESCTRL_SCHEMA_BITMAP: + s->fmt_str = "%d=%x"; + break; + case RESCTRL_SCHEMA_RANGE: + s->fmt_str = "%d=%u"; + break; + } + + if (WARN_ON_ONCE(!s->fmt_str)) { + kfree(s); + return -EINVAL; + } + + INIT_LIST_HEAD(&s->list); + list_add(&s->list, &resctrl_schema_all); + + return 0; +} + +static int schemata_list_create(void) +{ + struct rdt_resource *r; + int ret = 0; + + for_each_alloc_capable_rdt_resource(r) { + if (resctrl_arch_get_cdp_enabled(r->rid)) { + ret = schemata_list_add(r, CDP_CODE); + if (ret) + break; + + ret = schemata_list_add(r, CDP_DATA); + } else { + ret = schemata_list_add(r, CDP_NONE); + } + + if (ret) + break; + } + + return ret; +} + +static void schemata_list_destroy(void) +{ + struct resctrl_schema *s, *tmp; + + list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) { + list_del(&s->list); + kfree(s); + } +} + +static int rdt_get_tree(struct fs_context *fc) +{ + struct rdt_fs_context *ctx = rdt_fc2context(fc); + unsigned long flags = RFTYPE_CTRL_BASE; + struct rdt_mon_domain *dom; + struct rdt_resource *r; + int ret; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + /* + * resctrl file system can only be mounted once. 
+ */ + if (resctrl_mounted) { + ret = -EBUSY; + goto out; + } + + ret = rdtgroup_setup_root(ctx); + if (ret) + goto out; + + ret = rdt_enable_ctx(ctx); + if (ret) + goto out_root; + + ret = schemata_list_create(); + if (ret) { + schemata_list_destroy(); + goto out_ctx; + } + + ret = closid_init(); + if (ret) + goto out_schemata_free; + + if (resctrl_arch_mon_capable()) + flags |= RFTYPE_MON; + + ret = rdtgroup_add_files(rdtgroup_default.kn, flags); + if (ret) + goto out_closid_exit; + + kernfs_activate(rdtgroup_default.kn); + + ret = rdtgroup_create_info_dir(rdtgroup_default.kn); + if (ret < 0) + goto out_closid_exit; + + if (resctrl_arch_mon_capable()) { + ret = mongroup_create_dir(rdtgroup_default.kn, + &rdtgroup_default, "mon_groups", + &kn_mongrp); + if (ret < 0) + goto out_info; + + ret = mkdir_mondata_all(rdtgroup_default.kn, + &rdtgroup_default, &kn_mondata); + if (ret < 0) + goto out_mongrp; + rdtgroup_default.mon.mon_data_kn = kn_mondata; + } + + ret = rdt_pseudo_lock_init(); + if (ret) + goto out_mondata; + + ret = kernfs_get_tree(fc); + if (ret < 0) + goto out_psl; + + if (resctrl_arch_alloc_capable()) + resctrl_arch_enable_alloc(); + if (resctrl_arch_mon_capable()) + resctrl_arch_enable_mon(); + + if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable()) + resctrl_mounted = true; + + if (resctrl_is_mbm_enabled()) { + r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + list_for_each_entry(dom, &r->mon_domains, hdr.list) + mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL, + RESCTRL_PICK_ANY_CPU); + } + + goto out; + +out_psl: + rdt_pseudo_lock_release(); +out_mondata: + if (resctrl_arch_mon_capable()) + kernfs_remove(kn_mondata); +out_mongrp: + if (resctrl_arch_mon_capable()) + kernfs_remove(kn_mongrp); +out_info: + kernfs_remove(kn_info); +out_closid_exit: + closid_exit(); +out_schemata_free: + schemata_list_destroy(); +out_ctx: + rdt_disable_ctx(); +out_root: + rdtgroup_destroy_root(); +out: + rdt_last_cmd_clear(); + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + return ret; +} + +enum rdt_param { + Opt_cdp, + Opt_cdpl2, + Opt_mba_mbps, + Opt_debug, + nr__rdt_params +}; + +static const struct fs_parameter_spec rdt_fs_parameters[] = { + fsparam_flag("cdp", Opt_cdp), + fsparam_flag("cdpl2", Opt_cdpl2), + fsparam_flag("mba_MBps", Opt_mba_mbps), + fsparam_flag("debug", Opt_debug), + {} +}; + +static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct rdt_fs_context *ctx = rdt_fc2context(fc); + struct fs_parse_result result; + const char *msg; + int opt; + + opt = fs_parse(fc, rdt_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_cdp: + ctx->enable_cdpl3 = true; + return 0; + case Opt_cdpl2: + ctx->enable_cdpl2 = true; + return 0; + case Opt_mba_mbps: + msg = "mba_MBps requires MBM and linear scale MBA at L3 scope"; + if (!supports_mba_mbps()) + return invalfc(fc, msg); + ctx->enable_mba_mbps = true; + return 0; + case Opt_debug: + ctx->enable_debug = true; + return 0; + } + + return -EINVAL; +} + +static void rdt_fs_context_free(struct fs_context *fc) +{ + struct rdt_fs_context *ctx = rdt_fc2context(fc); + + kernfs_free_fs_context(fc); + kfree(ctx); +} + +static const struct fs_context_operations rdt_fs_context_ops = { + .free = rdt_fs_context_free, + .parse_param = rdt_parse_param, + .get_tree = rdt_get_tree, +}; + +static int rdt_init_fs_context(struct fs_context *fc) +{ + struct rdt_fs_context *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->kfc.magic 
= RDTGROUP_SUPER_MAGIC; + fc->fs_private = &ctx->kfc; + fc->ops = &rdt_fs_context_ops; + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(&init_user_ns); + fc->global = true; + return 0; +} + +/* + * Move tasks from one to the other group. If @from is NULL, then all tasks + * in the systems are moved unconditionally (used for teardown). + * + * If @mask is not NULL the cpus on which moved tasks are running are set + * in that mask so the update smp function call is restricted to affected + * cpus. + */ +static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, + struct cpumask *mask) +{ + struct task_struct *p, *t; + + read_lock(&tasklist_lock); + for_each_process_thread(p, t) { + if (!from || is_closid_match(t, from) || + is_rmid_match(t, from)) { + resctrl_arch_set_closid_rmid(t, to->closid, + to->mon.rmid); + + /* + * Order the closid/rmid stores above before the loads + * in task_curr(). This pairs with the full barrier + * between the rq->curr update and + * resctrl_arch_sched_in() during context switch. + */ + smp_mb(); + + /* + * If the task is on a CPU, set the CPU in the mask. + * The detection is inaccurate as tasks might move or + * schedule before the smp function call takes place. + * In such a case the function call is pointless, but + * there is no other side effect. + */ + if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t)) + cpumask_set_cpu(task_cpu(t), mask); + } + } + read_unlock(&tasklist_lock); +} + +static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) +{ + struct rdtgroup *sentry, *stmp; + struct list_head *head; + + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { + free_rmid(sentry->closid, sentry->mon.rmid); + list_del(&sentry->mon.crdtgrp_list); + + if (atomic_read(&sentry->waitcount) != 0) + sentry->flags = RDT_DELETED; + else + rdtgroup_remove(sentry); + } +} + +/* + * Forcibly remove all of subdirectories under root. + */ +static void rmdir_all_sub(void) +{ + struct rdtgroup *rdtgrp, *tmp; + + /* Move all tasks to the default resource group */ + rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); + + list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { + /* Free any child rmids */ + free_all_child_rdtgrp(rdtgrp); + + /* Remove each rdtgroup other than root */ + if (rdtgrp == &rdtgroup_default) + continue; + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) + rdtgroup_pseudo_lock_remove(rdtgrp); + + /* + * Give any CPUs back to the default group. We cannot copy + * cpu_online_mask because a CPU might have executed the + * offline callback already, but is still marked online. + */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); + + kernfs_remove(rdtgrp->kn); + list_del(&rdtgrp->rdtgroup_list); + + if (atomic_read(&rdtgrp->waitcount) != 0) + rdtgrp->flags = RDT_DELETED; + else + rdtgroup_remove(rdtgrp); + } + /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ + update_closid_rmid(cpu_online_mask, &rdtgroup_default); + + kernfs_remove(kn_info); + kernfs_remove(kn_mongrp); + kernfs_remove(kn_mondata); +} + +/** + * mon_get_kn_priv() - Get the mon_data priv data for this event. + * + * The same values are used across the mon_data directories of all control and + * monitor groups for the same event in the same domain. 
Keep a list of + * allocated structures and re-use an existing one with the same values for + * @rid, @domid, etc. + * + * @rid: The resource id for the event file being created. + * @domid: The domain id for the event file being created. + * @mevt: The type of event file being created. + * @do_sum: Whether SNC summing monitors are being created. + */ +static struct mon_data *mon_get_kn_priv(enum resctrl_res_level rid, int domid, + struct mon_evt *mevt, + bool do_sum) +{ + struct mon_data *priv; + + lockdep_assert_held(&rdtgroup_mutex); + + list_for_each_entry(priv, &mon_data_kn_priv_list, list) { + if (priv->rid == rid && priv->domid == domid && + priv->sum == do_sum && priv->evtid == mevt->evtid) + return priv; + } + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return NULL; + + priv->rid = rid; + priv->domid = domid; + priv->sum = do_sum; + priv->evtid = mevt->evtid; + list_add_tail(&priv->list, &mon_data_kn_priv_list); + + return priv; +} + +/** + * mon_put_kn_priv() - Free all allocated mon_data structures. + * + * Called when resctrl file system is unmounted. + */ +static void mon_put_kn_priv(void) +{ + struct mon_data *priv, *tmp; + + lockdep_assert_held(&rdtgroup_mutex); + + list_for_each_entry_safe(priv, tmp, &mon_data_kn_priv_list, list) { + list_del(&priv->list); + kfree(priv); + } +} + +static void resctrl_fs_teardown(void) +{ + lockdep_assert_held(&rdtgroup_mutex); + + /* Cleared by rdtgroup_destroy_root() */ + if (!rdtgroup_default.kn) + return; + + rmdir_all_sub(); + mon_put_kn_priv(); + rdt_pseudo_lock_release(); + rdtgroup_default.mode = RDT_MODE_SHAREABLE; + closid_exit(); + schemata_list_destroy(); + rdtgroup_destroy_root(); +} + +static void rdt_kill_sb(struct super_block *sb) +{ + struct rdt_resource *r; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_disable_ctx(); + + /* Put everything back to default values. */ + for_each_alloc_capable_rdt_resource(r) + resctrl_arch_reset_all_ctrls(r); + + resctrl_fs_teardown(); + if (resctrl_arch_alloc_capable()) + resctrl_arch_disable_alloc(); + if (resctrl_arch_mon_capable()) + resctrl_arch_disable_mon(); + resctrl_mounted = false; + kernfs_kill_sb(sb); + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); +} + +static struct file_system_type rdt_fs_type = { + .name = "resctrl", + .init_fs_context = rdt_init_fs_context, + .parameters = rdt_fs_parameters, + .kill_sb = rdt_kill_sb, +}; + +static int mon_addfile(struct kernfs_node *parent_kn, const char *name, + void *priv) +{ + struct kernfs_node *kn; + int ret = 0; + + kn = __kernfs_create_file(parent_kn, name, 0444, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, + &kf_mondata_ops, priv, NULL, NULL); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + kernfs_remove(kn); + return ret; + } + + return ret; +} + +static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname) +{ + struct kernfs_node *kn; + + kn = kernfs_find_and_get(pkn, name); + if (!kn) + return; + kernfs_put(kn); + + if (kn->dir.subdirs <= 1) + kernfs_remove(kn); + else + kernfs_remove_by_name(kn, subname); +} + +/* + * Remove all subdirectories of mon_data of ctrl_mon groups + * and monitor groups for the given domain. + * Remove files and directories containing "sum" of domain data + * when last domain being summed is removed. 
+ */ +static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_mon_domain *d) +{ + struct rdtgroup *prgrp, *crgrp; + char subname[32]; + bool snc_mode; + char name[32]; + + snc_mode = r->mon_scope == RESCTRL_L3_NODE; + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); + if (snc_mode) + sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id); + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname); + + list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) + mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname); + } +} + +static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, + struct rdt_resource *r, struct rdtgroup *prgrp, + bool do_sum) +{ + struct rmid_read rr = {0}; + struct mon_data *priv; + struct mon_evt *mevt; + int ret, domid; + + if (WARN_ON(list_empty(&r->evt_list))) + return -EPERM; + + list_for_each_entry(mevt, &r->evt_list, list) { + domid = do_sum ? d->ci->id : d->hdr.id; + priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum); + if (WARN_ON_ONCE(!priv)) + return -EINVAL; + + ret = mon_addfile(kn, mevt->name, priv); + if (ret) + return ret; + + if (!do_sum && resctrl_is_mbm_event(mevt->evtid)) + mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true); + } + + return 0; +} + +static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, + struct rdt_mon_domain *d, + struct rdt_resource *r, struct rdtgroup *prgrp) +{ + struct kernfs_node *kn, *ckn; + char name[32]; + bool snc_mode; + int ret = 0; + + lockdep_assert_held(&rdtgroup_mutex); + + snc_mode = r->mon_scope == RESCTRL_L3_NODE; + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); + kn = kernfs_find_and_get(parent_kn, name); + if (kn) { + /* + * rdtgroup_mutex will prevent this directory from being + * removed. No need to keep this hold. + */ + kernfs_put(kn); + } else { + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + ret = mon_add_all_files(kn, d, r, prgrp, snc_mode); + if (ret) + goto out_destroy; + } + + if (snc_mode) { + sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id); + ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp); + if (IS_ERR(ckn)) { + ret = -EINVAL; + goto out_destroy; + } + + ret = rdtgroup_kn_set_ugid(ckn); + if (ret) + goto out_destroy; + + ret = mon_add_all_files(ckn, d, r, prgrp, false); + if (ret) + goto out_destroy; + } + + kernfs_activate(kn); + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +/* + * Add all subdirectories of mon_data for "ctrl_mon" groups + * and "monitor" groups with given domain id. 
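The mon_data directories maintained above are where monitoring values are read. A user-space sketch reading LLC occupancy for the default group (the mon_L3_00 directory name is an example; actual domain ids vary by system, and the read fails if the count is unavailable):

#include <stdio.h>

int main(void)
{
	FILE *f = fopen("/sys/fs/resctrl/mon_data/mon_L3_00/llc_occupancy", "r");
	unsigned long long bytes;

	if (!f || fscanf(f, "%llu", &bytes) != 1) {
		perror("llc_occupancy");
		return 1;
	}
	fclose(f);
	printf("LLC occupancy: %llu bytes\n", bytes);
	return 0;
}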
+ */ +static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_mon_domain *d) +{ + struct kernfs_node *parent_kn; + struct rdtgroup *prgrp, *crgrp; + struct list_head *head; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + parent_kn = prgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, prgrp); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + parent_kn = crgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, crgrp); + } + } +} + +static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, + struct rdt_resource *r, + struct rdtgroup *prgrp) +{ + struct rdt_mon_domain *dom; + int ret; + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + list_for_each_entry(dom, &r->mon_domains, hdr.list) { + ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); + if (ret) + return ret; + } + + return 0; +} + +/* + * This creates a directory mon_data which contains the monitored data. + * + * mon_data has one directory for each domain which are named + * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data + * with L3 domain looks as below: + * ./mon_data: + * mon_L3_00 + * mon_L3_01 + * mon_L3_02 + * ... + * + * Each domain directory has one file per event: + * ./mon_L3_00/: + * llc_occupancy + * + */ +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **dest_kn) +{ + struct rdt_resource *r; + struct kernfs_node *kn; + int ret; + + /* + * Create the mon_data directory first. + */ + ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn); + if (ret) + return ret; + + if (dest_kn) + *dest_kn = kn; + + /* + * Create the subdirectories for each domain. Note that all events + * in a domain like L3 are grouped into a resource whose domain is L3 + */ + for_each_mon_capable_rdt_resource(r) { + ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); + if (ret) + goto out_destroy; + } + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +/** + * cbm_ensure_valid - Enforce validity on provided CBM + * @_val: Candidate CBM + * @r: RDT resource to which the CBM belongs + * + * The provided CBM represents all cache portions available for use. This + * may be represented by a bitmap that does not consist of contiguous ones + * and thus be an invalid CBM. + * Here the provided CBM is forced to be a valid CBM by only considering + * the first set of contiguous bits as valid and clearing all bits. + * The intention here is to provide a valid default CBM with which a new + * resource group is initialized. The user can follow this with a + * modification to the CBM if the default does not satisfy the + * requirements. + */ +static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) +{ + unsigned int cbm_len = r->cache.cbm_len; + unsigned long first_bit, zero_bit; + unsigned long val = _val; + + if (!val) + return 0; + + first_bit = find_first_bit(&val, cbm_len); + zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); + + /* Clear any remaining bits to ensure contiguous region */ + bitmap_clear(&val, zero_bit, cbm_len - zero_bit); + return (u32)val; +} + +/* + * Initialize cache resources per RDT domain + * + * Set the RDT domain up to start off with all usable allocations. That is, + * all shareable and unused bits. All-zero CBM is invalid. 
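A standalone model of cbm_ensure_valid() above, using plain bit scanning in place of the kernel bitmap helpers (GCC/Clang builtins assumed; illustrative only):

#include <stdio.h>

/* Keep only the first contiguous run of set bits, as cbm_ensure_valid() does. */
static unsigned int first_contiguous_run(unsigned int val, unsigned int cbm_len)
{
	unsigned int first_bit, zero_bit;

	if (!val)
		return 0;

	first_bit = __builtin_ctz(val);		/* find_first_bit() */
	for (zero_bit = first_bit; zero_bit < cbm_len; zero_bit++)
		if (!(val & (1U << zero_bit)))	/* find_next_zero_bit() */
			break;

	/* Clear all bits from zero_bit upwards. */
	return val & (unsigned int)((1ULL << zero_bit) - 1);
}

int main(void)
{
	/* 0x5f (101 1111b) -> 0x1f: the disjoint bit 6 is dropped. */
	printf("0x%x\n", first_contiguous_run(0x5f, 12));
	return 0;
}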
+ */ +static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s, + u32 closid) +{ + enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); + enum resctrl_conf_type t = s->conf_type; + struct resctrl_staged_config *cfg; + struct rdt_resource *r = s->res; + u32 used_b = 0, unused_b = 0; + unsigned long tmp_cbm; + enum rdtgrp_mode mode; + u32 peer_ctl, ctrl_val; + int i; + + cfg = &d->staged_config[t]; + cfg->have_new_ctrl = false; + cfg->new_ctrl = r->cache.shareable_bits; + used_b = r->cache.shareable_bits; + for (i = 0; i < closids_supported(); i++) { + if (closid_allocated(i) && i != closid) { + mode = rdtgroup_mode_by_closid(i); + if (mode == RDT_MODE_PSEUDO_LOCKSETUP) + /* + * ctrl values for locksetup aren't relevant + * until the schemata is written, and the mode + * becomes RDT_MODE_PSEUDO_LOCKED. + */ + continue; + /* + * If CDP is active include peer domain's + * usage to ensure there is no overlap + * with an exclusive group. + */ + if (resctrl_arch_get_cdp_enabled(r->rid)) + peer_ctl = resctrl_arch_get_config(r, d, i, + peer_type); + else + peer_ctl = 0; + ctrl_val = resctrl_arch_get_config(r, d, i, + s->conf_type); + used_b |= ctrl_val | peer_ctl; + if (mode == RDT_MODE_SHAREABLE) + cfg->new_ctrl |= ctrl_val | peer_ctl; + } + } + if (d->plr && d->plr->cbm > 0) + used_b |= d->plr->cbm; + unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); + unused_b &= BIT_MASK(r->cache.cbm_len) - 1; + cfg->new_ctrl |= unused_b; + /* + * Force the initial CBM to be valid, user can + * modify the CBM based on system availability. + */ + cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r); + /* + * Assign the u32 CBM to an unsigned long to ensure that + * bitmap_weight() does not access out-of-bound memory. + */ + tmp_cbm = cfg->new_ctrl; + if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { + rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id); + return -ENOSPC; + } + cfg->have_new_ctrl = true; + + return 0; +} + +/* + * Initialize cache resources with default values. + * + * A new RDT group is being created on an allocation capable (CAT) + * supporting system. Set this group up to start off with all usable + * allocations. + * + * If there are no more shareable bits available on any domain then + * the entire allocation will fail. + */ +static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) +{ + struct rdt_ctrl_domain *d; + int ret; + + list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) { + ret = __init_one_rdt_domain(d, s, closid); + if (ret < 0) + return ret; + } + + return 0; +} + +/* Initialize MBA resource with default values. */ +static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) +{ + struct resctrl_staged_config *cfg; + struct rdt_ctrl_domain *d; + + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + if (is_mba_sc(r)) { + d->mbps_val[closid] = MBA_MAX_MBPS; + continue; + } + + cfg = &d->staged_config[CDP_NONE]; + cfg->new_ctrl = resctrl_get_default_ctrl(r); + cfg->have_new_ctrl = true; + } +} + +/* Initialize the RDT group's allocations. 
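The used/unused bookkeeping in __init_one_rdt_domain() above can be checked with a small worked example (all values hypothetical):

#include <stdio.h>

int main(void)
{
	unsigned int cbm_len = 12;
	unsigned long shareable = 0x001;	/* bits used by hardware */
	unsigned long other_groups = 0x0f0;	/* bits owned by other closids */
	unsigned long used_b = shareable | other_groups;

	/* unused_b = used_b ^ (BIT_MASK(cbm_len) - 1), masked to cbm_len bits */
	unsigned long mask = (1UL << cbm_len) - 1;
	unsigned long unused_b = (used_b ^ mask) & mask;
	unsigned long new_ctrl = shareable | unused_b;

	/* 0xf0f here; cbm_ensure_valid() would then trim it to 0x00f. */
	printf("new_ctrl = 0x%lx\n", new_ctrl);
	return 0;
}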
+ +/* Initialize the RDT group's allocations. */ +static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) +{ + struct resctrl_schema *s; + struct rdt_resource *r; + int ret = 0; + + rdt_staged_configs_clear(); + + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; + if (r->rid == RDT_RESOURCE_MBA || + r->rid == RDT_RESOURCE_SMBA) { + rdtgroup_init_mba(r, rdtgrp->closid); + if (is_mba_sc(r)) + continue; + } else { + ret = rdtgroup_init_cat(s, rdtgrp->closid); + if (ret < 0) + goto out; + } + + ret = resctrl_arch_update_domains(r, rdtgrp->closid); + if (ret < 0) { + rdt_last_cmd_puts("Failed to initialize allocations\n"); + goto out; + } + } + + rdtgrp->mode = RDT_MODE_SHAREABLE; + +out: + rdt_staged_configs_clear(); + return ret; +} + +static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) +{ + int ret; + + if (!resctrl_arch_mon_capable()) + return 0; + + ret = alloc_rmid(rdtgrp->closid); + if (ret < 0) { + rdt_last_cmd_puts("Out of RMIDs\n"); + return ret; + } + rdtgrp->mon.rmid = ret; + + ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); + if (ret) { + rdt_last_cmd_puts("kernfs subdir error\n"); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); + return ret; + } + + return 0; +} + +static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) +{ + if (resctrl_arch_mon_capable()) + free_rmid(rgrp->closid, rgrp->mon.rmid); +} + +/* + * We allow creating mon groups only within a directory called "mon_groups", + * which is present in every ctrl_mon group. Check if this is a valid + * "mon_groups" directory. + * + * 1. The directory should be named "mon_groups". + * 2. The mon group itself should "not" be named "mon_groups". + * This ensures the "mon_groups" directory always has a ctrl_mon group + * as its parent. + */ +static bool is_mon_groups(struct kernfs_node *kn, const char *name) +{ + return (!strcmp(rdt_kn_name(kn), "mon_groups") && + strcmp(name, "mon_groups")); +}
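A quick sanity check of the is_mon_groups() rule above. This standalone sketch takes the parent directory's name directly rather than a kernfs node (an assumed simplification):

    #include <stdio.h>
    #include <string.h>
    #include <stdbool.h>

    /* parent must be named "mon_groups"; the new group itself must not be */
    static bool is_mon_groups(const char *parent_name, const char *name)
    {
        return !strcmp(parent_name, "mon_groups") && strcmp(name, "mon_groups");
    }

    int main(void)
    {
        printf("%d\n", is_mon_groups("mon_groups", "m1"));          /* 1: allowed */
        printf("%d\n", is_mon_groups("grp0", "m1"));                /* 0: wrong parent */
        printf("%d\n", is_mon_groups("mon_groups", "mon_groups"));  /* 0: reserved name */
        return 0;
    }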
+ +static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, + const char *name, umode_t mode, + enum rdt_group_type rtype, struct rdtgroup **r) +{ + struct rdtgroup *prdtgrp, *rdtgrp; + unsigned long files = 0; + struct kernfs_node *kn; + int ret; + + prdtgrp = rdtgroup_kn_lock_live(parent_kn); + if (!prdtgrp) { + ret = -ENODEV; + goto out_unlock; + } + + /* + * Check that the parent directory for a monitor group is a "mon_groups" + * directory. + */ + if (rtype == RDTMON_GROUP && !is_mon_groups(parent_kn, name)) { + ret = -EPERM; + goto out_unlock; + } + + if (rtype == RDTMON_GROUP && + (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) { + ret = -EINVAL; + rdt_last_cmd_puts("Pseudo-locking in progress\n"); + goto out_unlock; + } + + /* allocate the rdtgroup. */ + rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); + if (!rdtgrp) { + ret = -ENOSPC; + rdt_last_cmd_puts("Kernel out of memory\n"); + goto out_unlock; + } + *r = rdtgrp; + rdtgrp->mon.parent = prdtgrp; + rdtgrp->type = rtype; + INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); + + /* kernfs creates the directory for rdtgrp */ + kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); + rdt_last_cmd_puts("kernfs create error\n"); + goto out_free_rgrp; + } + rdtgrp->kn = kn; + + /* + * kernfs_remove() will drop the reference count on "kn" which + * will free it. But we still need it to stick around for the + * rdtgroup_kn_unlock(kn) call. Take one extra reference here, + * which will be dropped by kernfs_put() in rdtgroup_remove(). + */ + kernfs_get(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + rdt_last_cmd_puts("kernfs perm error\n"); + goto out_destroy; + } + + if (rtype == RDTCTRL_GROUP) { + files = RFTYPE_BASE | RFTYPE_CTRL; + if (resctrl_arch_mon_capable()) + files |= RFTYPE_MON; + } else { + files = RFTYPE_BASE | RFTYPE_MON; + } + + ret = rdtgroup_add_files(kn, files); + if (ret) { + rdt_last_cmd_puts("kernfs fill error\n"); + goto out_destroy; + } + + /* + * The caller unlocks the parent_kn upon success. + */ + return 0; + +out_destroy: + kernfs_put(rdtgrp->kn); + kernfs_remove(rdtgrp->kn); +out_free_rgrp: + kfree(rdtgrp); +out_unlock: + rdtgroup_kn_unlock(parent_kn); + return ret; +} + +static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) +{ + kernfs_remove(rgrp->kn); + rdtgroup_remove(rgrp); +} + +/* + * Create a monitor group under the "mon_groups" directory of a control + * and monitor group (ctrl_mon). This is a resource group + * to monitor a subset of tasks and cpus in its parent ctrl_mon group. + */ +static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, + const char *name, umode_t mode) +{ + struct rdtgroup *rdtgrp, *prgrp; + int ret; + + ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp); + if (ret) + return ret; + + prgrp = rdtgrp->mon.parent; + rdtgrp->closid = prgrp->closid; + + ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); + if (ret) { + mkdir_rdt_prepare_clean(rdtgrp); + goto out_unlock; + } + + kernfs_activate(rdtgrp->kn); + + /* + * Add the rdtgrp to the list of rdtgrps the parent + * ctrl_mon group has to track. + */ + list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); + +out_unlock: + rdtgroup_kn_unlock(parent_kn); + return ret; +} + +/* + * These are rdtgroups created under the root directory. They can be used + * to allocate and monitor resources. + */ +static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, + const char *name, umode_t mode) +{ + struct rdtgroup *rdtgrp; + struct kernfs_node *kn; + u32 closid; + int ret; + + ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp); + if (ret) + return ret; + + kn = rdtgrp->kn; + ret = closid_alloc(); + if (ret < 0) { + rdt_last_cmd_puts("Out of CLOSIDs\n"); + goto out_common_fail; + } + closid = ret; + ret = 0; + + rdtgrp->closid = closid; + + ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); + if (ret) + goto out_closid_free; + + kernfs_activate(rdtgrp->kn); + + ret = rdtgroup_init_alloc(rdtgrp); + if (ret < 0) + goto out_rmid_free; + + list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + + if (resctrl_arch_mon_capable()) { + /* + * Create an empty mon_groups directory to hold the subset + * of tasks and cpus to monitor. + */ + ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL); + if (ret) { + rdt_last_cmd_puts("kernfs subdir error\n"); + goto out_del_list; + } + if (is_mba_sc(NULL)) + rdtgrp->mba_mbps_event = mba_mbps_default_event; + } + + goto out_unlock; + +out_del_list: + list_del(&rdtgrp->rdtgroup_list); +out_rmid_free: + mkdir_rdt_prepare_rmid_free(rdtgrp); +out_closid_free: + closid_free(closid); +out_common_fail: + mkdir_rdt_prepare_clean(rdtgrp); +out_unlock: + rdtgroup_kn_unlock(parent_kn); + return ret; +} + +static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + /* Do not accept '\n' to avoid an unparsable situation.
*/ + if (strchr(name, '\n')) + return -EINVAL; + + /* + * If the parent directory is the root directory and RDT + * allocation is supported, add a control and monitoring + * subdirectory + */ + if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn) + return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); + + /* Else, attempt to add a monitoring subdirectory. */ + if (resctrl_arch_mon_capable()) + return rdtgroup_mkdir_mon(parent_kn, name, mode); + + return -EPERM; +} + +static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) +{ + struct rdtgroup *prdtgrp = rdtgrp->mon.parent; + u32 closid, rmid; + int cpu; + + /* Give any tasks back to the parent group */ + rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); + + /* + * Update per cpu closid/rmid of the moved CPUs first. + * Note: the closid will not change, but the arch code still needs it. + */ + closid = prdtgrp->closid; + rmid = prdtgrp->mon.rmid; + for_each_cpu(cpu, &rdtgrp->cpu_mask) + resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid); + + /* + * Update the MSR on moved CPUs and CPUs which have moved + * task running on them. + */ + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); + update_closid_rmid(tmpmask, NULL); + + rdtgrp->flags = RDT_DELETED; + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); + + /* + * Remove the rdtgrp from the parent ctrl_mon group's list + */ + WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); + list_del(&rdtgrp->mon.crdtgrp_list); + + kernfs_remove(rdtgrp->kn); + + return 0; +} + +static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp) +{ + rdtgrp->flags = RDT_DELETED; + list_del(&rdtgrp->rdtgroup_list); + + kernfs_remove(rdtgrp->kn); + return 0; +} + +static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) +{ + u32 closid, rmid; + int cpu; + + /* Give any tasks back to the default group */ + rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); + + /* Give any CPUs back to the default group */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + + /* Update per cpu closid and rmid of the moved CPUs first */ + closid = rdtgroup_default.closid; + rmid = rdtgroup_default.mon.rmid; + for_each_cpu(cpu, &rdtgrp->cpu_mask) + resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid); + + /* + * Update the MSR on moved CPUs and CPUs which have moved + * task running on them. + */ + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); + update_closid_rmid(tmpmask, NULL); + + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); + closid_free(rdtgrp->closid); + + rdtgroup_ctrl_remove(rdtgrp); + + /* + * Free all the child monitor group rmids. + */ + free_all_child_rdtgrp(rdtgrp); + + return 0; +} + +static struct kernfs_node *rdt_kn_parent(struct kernfs_node *kn) +{ + /* + * Valid within the RCU section it was obtained or while rdtgroup_mutex + * is held. + */ + return rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex)); +} + +static int rdtgroup_rmdir(struct kernfs_node *kn) +{ + struct kernfs_node *parent_kn; + struct rdtgroup *rdtgrp; + cpumask_var_t tmpmask; + int ret = 0; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + rdtgrp = rdtgroup_kn_lock_live(kn); + if (!rdtgrp) { + ret = -EPERM; + goto out; + } + parent_kn = rdt_kn_parent(kn); + + /* + * If the rdtgroup is a ctrl_mon group and parent directory + * is the root directory, remove the ctrl_mon group. + * + * If the rdtgroup is a mon group and parent directory + * is a valid "mon_groups" directory, remove the mon group. 
+ */ + if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn && + rdtgrp != &rdtgroup_default) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + ret = rdtgroup_ctrl_remove(rdtgrp); + } else { + ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask); + } + } else if (rdtgrp->type == RDTMON_GROUP && + is_mon_groups(parent_kn, rdt_kn_name(kn))) { + ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask); + } else { + ret = -EPERM; + } + +out: + rdtgroup_kn_unlock(kn); + free_cpumask_var(tmpmask); + return ret; +} + +/** + * mongrp_reparent() - replace parent CTRL_MON group of a MON group + * @rdtgrp: the MON group whose parent should be replaced + * @new_prdtgrp: replacement parent CTRL_MON group for @rdtgrp + * @cpus: cpumask provided by the caller for use during this call + * + * Replaces the parent CTRL_MON group for a MON group, resulting in all member + * tasks' CLOSID immediately changing to that of the new parent group. + * Monitoring data for the group is unaffected by this operation. + */ +static void mongrp_reparent(struct rdtgroup *rdtgrp, + struct rdtgroup *new_prdtgrp, + cpumask_var_t cpus) +{ + struct rdtgroup *prdtgrp = rdtgrp->mon.parent; + + WARN_ON(rdtgrp->type != RDTMON_GROUP); + WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP); + + /* Nothing to do when simply renaming a MON group. */ + if (prdtgrp == new_prdtgrp) + return; + + WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); + list_move_tail(&rdtgrp->mon.crdtgrp_list, + &new_prdtgrp->mon.crdtgrp_list); + + rdtgrp->mon.parent = new_prdtgrp; + rdtgrp->closid = new_prdtgrp->closid; + + /* Propagate updated closid to all tasks in this group. */ + rdt_move_group_tasks(rdtgrp, rdtgrp, cpus); + + update_closid_rmid(cpus, NULL); +} + +static int rdtgroup_rename(struct kernfs_node *kn, + struct kernfs_node *new_parent, const char *new_name) +{ + struct kernfs_node *kn_parent; + struct rdtgroup *new_prdtgrp; + struct rdtgroup *rdtgrp; + cpumask_var_t tmpmask; + int ret; + + rdtgrp = kernfs_to_rdtgroup(kn); + new_prdtgrp = kernfs_to_rdtgroup(new_parent); + if (!rdtgrp || !new_prdtgrp) + return -ENOENT; + + /* Release both kernfs active_refs before obtaining rdtgroup mutex. */ + rdtgroup_kn_get(rdtgrp, kn); + rdtgroup_kn_get(new_prdtgrp, new_parent); + + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + /* + * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if + * either kernfs_node is a file. + */ + if (kernfs_type(kn) != KERNFS_DIR || + kernfs_type(new_parent) != KERNFS_DIR) { + rdt_last_cmd_puts("Source and destination must be directories"); + ret = -EPERM; + goto out; + } + + if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) { + ret = -ENOENT; + goto out; + } + + kn_parent = rdt_kn_parent(kn); + if (rdtgrp->type != RDTMON_GROUP || !kn_parent || + !is_mon_groups(kn_parent, rdt_kn_name(kn))) { + rdt_last_cmd_puts("Source must be a MON group\n"); + ret = -EPERM; + goto out; + } + + if (!is_mon_groups(new_parent, new_name)) { + rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n"); + ret = -EPERM; + goto out; + } + + /* + * If the MON group is monitoring CPUs, the CPUs must be assigned to the + * current parent CTRL_MON group and therefore cannot be assigned to + * the new parent, making the move illegal. 
+ */ + if (!cpumask_empty(&rdtgrp->cpu_mask) && + rdtgrp->mon.parent != new_prdtgrp) { + rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n"); + ret = -EPERM; + goto out; + } + + /* + * Allocate the cpumask for use in mongrp_reparent() to avoid the + * possibility of failing to allocate it after kernfs_rename() has + * succeeded. + */ + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) { + ret = -ENOMEM; + goto out; + } + + /* + * Perform all input validation and allocations needed to ensure + * mongrp_reparent() will succeed before calling kernfs_rename(), + * otherwise it would be necessary to revert this call if + * mongrp_reparent() failed. + */ + ret = kernfs_rename(kn, new_parent, new_name); + if (!ret) + mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask); + + free_cpumask_var(tmpmask); + +out: + mutex_unlock(&rdtgroup_mutex); + rdtgroup_kn_put(rdtgrp, kn); + rdtgroup_kn_put(new_prdtgrp, new_parent); + return ret; +} + +static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) +{ + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) + seq_puts(seq, ",cdp"); + + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) + seq_puts(seq, ",cdpl2"); + + if (is_mba_sc(resctrl_arch_get_resource(RDT_RESOURCE_MBA))) + seq_puts(seq, ",mba_MBps"); + + if (resctrl_debug) + seq_puts(seq, ",debug"); + + return 0; +} + +static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { + .mkdir = rdtgroup_mkdir, + .rmdir = rdtgroup_rmdir, + .rename = rdtgroup_rename, + .show_options = rdtgroup_show_options, +}; + +static int rdtgroup_setup_root(struct rdt_fs_context *ctx) +{ + rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, + KERNFS_ROOT_CREATE_DEACTIVATED | + KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, + &rdtgroup_default); + if (IS_ERR(rdt_root)) + return PTR_ERR(rdt_root); + + ctx->kfc.root = rdt_root; + rdtgroup_default.kn = kernfs_root_to_node(rdt_root); + + return 0; +} + +static void rdtgroup_destroy_root(void) +{ + lockdep_assert_held(&rdtgroup_mutex); + + kernfs_destroy_root(rdt_root); + rdtgroup_default.kn = NULL; +} + +static void rdtgroup_setup_default(void) +{ + mutex_lock(&rdtgroup_mutex); + + rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID; + rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID; + rdtgroup_default.type = RDTCTRL_GROUP; + INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); + + list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); + + mutex_unlock(&rdtgroup_mutex); +} + +static void domain_destroy_mon_state(struct rdt_mon_domain *d) +{ + bitmap_free(d->rmid_busy_llc); + kfree(d->mbm_total); + kfree(d->mbm_local); +} + +void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) +{ + mutex_lock(&rdtgroup_mutex); + + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) + mba_sc_domain_destroy(r, d); + + mutex_unlock(&rdtgroup_mutex); +} + +void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + mutex_lock(&rdtgroup_mutex); + + /* + * If resctrl is mounted, remove all the + * per domain monitor data directories. + */ + if (resctrl_mounted && resctrl_arch_mon_capable()) + rmdir_mondata_subdir_allrdtgrp(r, d); + + if (resctrl_is_mbm_enabled()) + cancel_delayed_work(&d->mbm_over); + if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) { + /* + * When a package is going down, forcefully + * decrement rmid->ebusy. 
There is no way to know + * that the L3 was flushed and hence this may lead to + * incorrect counts in rare scenarios, but leaving + * the RMID as busy creates RMID leaks if the + * package never comes back. + */ + __check_limbo(d, true); + cancel_delayed_work(&d->cqm_limbo); + } + + domain_destroy_mon_state(d); + + mutex_unlock(&rdtgroup_mutex); +} + +/** + * domain_setup_mon_state() - Initialise domain monitoring structures. + * @r: The resource for the newly online domain. + * @d: The newly online domain. + * + * Allocate monitor resources that belong to this domain. + * Called when the first CPU of a domain comes online, regardless of whether + * the filesystem is mounted. + * During boot this may be called before global allocations have been made by + * resctrl_mon_resource_init(). + * + * Returns 0 for success, or -ENOMEM. + */ +static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + size_t tsize; + + if (resctrl_arch_is_llc_occupancy_enabled()) { + d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); + if (!d->rmid_busy_llc) + return -ENOMEM; + } + if (resctrl_arch_is_mbm_total_enabled()) { + tsize = sizeof(*d->mbm_total); + d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); + if (!d->mbm_total) { + bitmap_free(d->rmid_busy_llc); + return -ENOMEM; + } + } + if (resctrl_arch_is_mbm_local_enabled()) { + tsize = sizeof(*d->mbm_local); + d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); + if (!d->mbm_local) { + bitmap_free(d->rmid_busy_llc); + kfree(d->mbm_total); + return -ENOMEM; + } + } + + return 0; +} + +int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) +{ + int err = 0; + + mutex_lock(&rdtgroup_mutex); + + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) { + /* RDT_RESOURCE_MBA is never mon_capable */ + err = mba_sc_domain_allocate(r, d); + } + + mutex_unlock(&rdtgroup_mutex); + + return err; +} + +int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + int err; + + mutex_lock(&rdtgroup_mutex); + + err = domain_setup_mon_state(r, d); + if (err) + goto out_unlock; + + if (resctrl_is_mbm_enabled()) { + INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL, + RESCTRL_PICK_ANY_CPU); + } + + if (resctrl_arch_is_llc_occupancy_enabled()) + INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); + + /* + * If the filesystem is not mounted then only the default resource group + * exists. Creation of its directories is deferred until mount time + * by rdt_get_tree() calling mkdir_mondata_all(). + * If resctrl is mounted, add per domain monitor data directories. + */ + if (resctrl_mounted && resctrl_arch_mon_capable()) + mkdir_mondata_subdir_allrdtgrp(r, d); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return err; +}
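domain_setup_mon_state() above follows the usual partial-failure rule: an allocation that fails must free everything allocated before it. A user-space sketch of the same unwind shape (hypothetical struct, plain calloc/free in place of the kernel allocators):

    #include <stdlib.h>

    struct mon_state {        /* stand-in for the rdt_mon_domain fields */
        unsigned long *busy;
        long *total;
        long *local;
    };

    static int mon_state_alloc(struct mon_state *s, size_t n)
    {
        s->busy = calloc(n, sizeof(*s->busy));
        if (!s->busy)
            return -1;
        s->total = calloc(n, sizeof(*s->total));
        if (!s->total) {
            free(s->busy);        /* undo the first allocation */
            return -1;
        }
        s->local = calloc(n, sizeof(*s->local));
        if (!s->local) {
            free(s->busy);        /* undo both earlier allocations */
            free(s->total);
            return -1;
        }
        return 0;
    }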
+ +void resctrl_online_cpu(unsigned int cpu) +{ + mutex_lock(&rdtgroup_mutex); + /* The CPU is set in the default rdtgroup after online. */ + cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); + mutex_unlock(&rdtgroup_mutex); +} + +static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) +{ + struct rdtgroup *cr; + + list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { + if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) + break; + } +} + +static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, + struct rdt_resource *r) +{ + struct rdt_mon_domain *d; + + lockdep_assert_cpus_held(); + + list_for_each_entry(d, &r->mon_domains, hdr.list) { + /* Find the domain that contains this CPU */ + if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) + return d; + } + + return NULL; +} + +void resctrl_offline_cpu(unsigned int cpu) +{ + struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3); + struct rdt_mon_domain *d; + struct rdtgroup *rdtgrp; + + mutex_lock(&rdtgroup_mutex); + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { + clear_childcpus(rdtgrp, cpu); + break; + } + } + + if (!l3->mon_capable) + goto out_unlock; + + d = get_mon_domain_from_cpu(cpu, l3); + if (d) { + if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) { + cancel_delayed_work(&d->mbm_over); + mbm_setup_overflow_handler(d, 0, cpu); + } + if (resctrl_arch_is_llc_occupancy_enabled() && + cpu == d->cqm_work_cpu && has_busy_rmid(d)) { + cancel_delayed_work(&d->cqm_limbo); + cqm_setup_limbo_handler(d, 0, cpu); + } + } + +out_unlock: + mutex_unlock(&rdtgroup_mutex); +} + +/* + * resctrl_init - resctrl filesystem initialization + * + * Set up the resctrl filesystem: set up the root, create the mount point, + * register the resctrl filesystem, and initialize files under the root + * directory. + * + * Return: 0 on success or -errno + */ +int resctrl_init(void) +{ + int ret = 0; + + seq_buf_init(&last_cmd_status, last_cmd_status_buf, + sizeof(last_cmd_status_buf)); + + rdtgroup_setup_default(); + + thread_throttle_mode_init(); + + ret = resctrl_mon_resource_init(); + if (ret) + return ret; + + ret = sysfs_create_mount_point(fs_kobj, "resctrl"); + if (ret) { + resctrl_mon_resource_exit(); + return ret; + } + + ret = register_filesystem(&rdt_fs_type); + if (ret) + goto cleanup_mountpoint; + + /* + * Adding the resctrl debugfs directory here may not be ideal since + * it would let the resctrl debugfs directory appear on the debugfs + * filesystem before the resctrl filesystem is mounted. + * It may also be ok since that would enable debugging of RDT before + * resctrl is mounted. + * The reason why the debugfs directory is created here and not in + * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and + * during the debugfs directory creation also &sb->s_type->i_mutex_key + * (the lockdep class of inode->i_rwsem). Other filesystem + * interactions (e.g. SyS_getdents) have the lock ordering: + * &sb->s_type->i_mutex_key --> &mm->mmap_lock + * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex + * is taken, thus creating the dependency + * &mm->mmap_lock --> rdtgroup_mutex, which can cause + * deadlock considering the other two lock dependencies. + * By creating the debugfs directory here we avoid a dependency + * that may cause deadlock (even though file operations cannot + * occur until the filesystem is mounted; I do not know how to + * tell lockdep that).
+ */ + debugfs_resctrl = debugfs_create_dir("resctrl", NULL); + + return 0; + +cleanup_mountpoint: + sysfs_remove_mount_point(fs_kobj, "resctrl"); + resctrl_mon_resource_exit(); + + return ret; +} + +static bool resctrl_online_domains_exist(void) +{ + struct rdt_resource *r; + + /* + * Only walk capable resources to allow resctrl_arch_get_resource() + * to return dummy 'not capable' resources. + */ + for_each_alloc_capable_rdt_resource(r) { + if (!list_empty(&r->ctrl_domains)) + return true; + } + + for_each_mon_capable_rdt_resource(r) { + if (!list_empty(&r->mon_domains)) + return true; + } + + return false; +} + +/** + * resctrl_exit() - Remove the resctrl filesystem and free resources. + * + * Called by the architecture code in response to a fatal error. + * Removes resctrl files and structures from kernfs to prevent further + * configuration. + * + * When called by the architecture code, all CPUs and resctrl domains must be + * offline. This ensures the limbo and overflow handlers are not scheduled to + * run, meaning the data structures they access can be freed by + * resctrl_mon_resource_exit(). + * + * After resctrl_exit() returns, the architecture code should return an + * error from all resctrl_arch_ functions that can do this. + * resctrl_arch_get_resource() must continue to return struct rdt_resources + * with the correct rid field to ensure the filesystem can be unmounted. + */ +void resctrl_exit(void) +{ + cpus_read_lock(); + WARN_ON_ONCE(resctrl_online_domains_exist()); + + mutex_lock(&rdtgroup_mutex); + resctrl_fs_teardown(); + mutex_unlock(&rdtgroup_mutex); + + cpus_read_unlock(); + + debugfs_remove_recursive(debugfs_resctrl); + debugfs_resctrl = NULL; + unregister_filesystem(&rdt_fs_type); + + /* + * Do not remove the sysfs mount point added by resctrl_init() so that + * it can be used to umount resctrl. 
+ */ + + resctrl_mon_resource_exit(); +} diff --git a/fs/select.c b/fs/select.c index 7da531b1cf6b..9fb650d03d52 100644 --- a/fs/select.c +++ b/fs/select.c @@ -630,7 +630,7 @@ int core_sys_select(int n, fd_set __user *inp, fd_set __user *outp, long stack_fds[SELECT_STACK_ALLOC/sizeof(long)]; ret = -EINVAL; - if (n < 0) + if (unlikely(n < 0)) goto out_nofds; /* max_fds can increase, so grab it once to avoid race */ @@ -857,7 +857,7 @@ static inline __poll_t do_pollfd(struct pollfd *pollfd, poll_table *pwait, int fd = pollfd->fd; __poll_t mask, filter; - if (fd < 0) + if (unlikely(fd < 0)) return 0; CLASS(fd, f)(fd); diff --git a/fs/signalfd.c b/fs/signalfd.c index d1a5f43ce466..d469782f97f4 100644 --- a/fs/signalfd.c +++ b/fs/signalfd.c @@ -277,15 +277,14 @@ static int do_signalfd4(int ufd, sigset_t *mask, int flags) return ufd; } - file = anon_inode_getfile("[signalfd]", &signalfd_fops, ctx, - O_RDWR | (flags & O_NONBLOCK)); + file = anon_inode_getfile_fmode("[signalfd]", &signalfd_fops, + ctx, O_RDWR | (flags & O_NONBLOCK), + FMODE_NOWAIT); if (IS_ERR(file)) { put_unused_fd(ufd); kfree(ctx); return PTR_ERR(file); } - file->f_mode |= FMODE_NOWAIT; - fd_install(ufd, file); } else { CLASS(fd, f)(ufd); diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index fe738623cf1b..89d2dbbb742c 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -29,7 +29,6 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, { struct cached_fid *cfid; - spin_lock(&cfids->cfid_list_lock); list_for_each_entry(cfid, &cfids->entries, entry) { if (!strcmp(cfid->path, path)) { /* @@ -38,25 +37,20 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, * being deleted due to a lease break. */ if (!cfid->time || !cfid->has_lease) { - spin_unlock(&cfids->cfid_list_lock); return NULL; } kref_get(&cfid->refcount); - spin_unlock(&cfids->cfid_list_lock); return cfid; } } if (lookup_only) { - spin_unlock(&cfids->cfid_list_lock); return NULL; } if (cfids->num_entries >= max_cached_dirs) { - spin_unlock(&cfids->cfid_list_lock); return NULL; } cfid = init_cached_dir(path); if (cfid == NULL) { - spin_unlock(&cfids->cfid_list_lock); return NULL; } cfid->cfids = cfids; @@ -74,7 +68,6 @@ static struct cached_fid *find_or_create_cached_dir(struct cached_fids *cfids, */ cfid->has_lease = true; - spin_unlock(&cfids->cfid_list_lock); return cfid; } @@ -109,7 +102,8 @@ path_to_dentry(struct cifs_sb_info *cifs_sb, const char *path) while (*s && *s != sep) s++; - child = lookup_positive_unlocked(p, dentry, s - p); + child = lookup_noperm_positive_unlocked(&QSTR_LEN(p, s - p), + dentry); dput(dentry); dentry = child; } while (!IS_ERR(dentry)); @@ -187,8 +181,10 @@ replay_again: if (!utf16_path) return -ENOMEM; + spin_lock(&cfids->cfid_list_lock); cfid = find_or_create_cached_dir(cfids, path, lookup_only, tcon->max_cached_dirs); if (cfid == NULL) { + spin_unlock(&cfids->cfid_list_lock); kfree(utf16_path); return -ENOENT; } @@ -197,7 +193,6 @@ replay_again: * Otherwise, it is either a new entry or laundromat worker removed it * from @cfids->entries. Caller will put last reference if the latter. 
*/ - spin_lock(&cfids->cfid_list_lock); if (cfid->has_lease && cfid->time) { spin_unlock(&cfids->cfid_list_lock); *ret_cfid = cfid; @@ -207,7 +202,7 @@ replay_again: spin_unlock(&cfids->cfid_list_lock); /* - * Skip any prefix paths in @path as lookup_positive_unlocked() ends up + * Skip any prefix paths in @path as lookup_noperm_positive_unlocked() ends up * calling ->lookup() which already adds those through * build_path_from_dentry(). Also, do it earlier as we might reconnect * below when trying to send compounded request and then potentially diff --git a/fs/smb/client/cifs_fs_sb.h b/fs/smb/client/cifs_fs_sb.h index 651759192280..5e8d163cb5f8 100644 --- a/fs/smb/client/cifs_fs_sb.h +++ b/fs/smb/client/cifs_fs_sb.h @@ -49,6 +49,7 @@ struct cifs_sb_info { struct rb_root tlink_tree; + struct list_head tcon_sb_link; spinlock_t tlink_tree_lock; struct tcon_link *master_tlink; struct nls_table *local_nls; diff --git a/fs/smb/client/cifsacl.c b/fs/smb/client/cifsacl.c index 699a3f76d083..63b3b1290bed 100644 --- a/fs/smb/client/cifsacl.c +++ b/fs/smb/client/cifsacl.c @@ -763,7 +763,7 @@ static void parse_dacl(struct smb_acl *pdacl, char *end_of_acl, struct cifs_fattr *fattr, bool mode_from_special_sid) { int i; - int num_aces = 0; + u16 num_aces = 0; int acl_size; char *acl_base; struct smb_ace **ppace; @@ -778,14 +778,15 @@ static void parse_dacl(struct smb_acl *pdacl, char *end_of_acl, } /* validate that we do not go past end of acl */ - if (end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) { + if (end_of_acl < (char *)pdacl + sizeof(struct smb_acl) || + end_of_acl < (char *)pdacl + le16_to_cpu(pdacl->size)) { cifs_dbg(VFS, "ACL too small to parse DACL\n"); return; } cifs_dbg(NOISY, "DACL revision %d size %d num aces %d\n", le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size), - le32_to_cpu(pdacl->num_aces)); + le16_to_cpu(pdacl->num_aces)); /* reset rwx permissions for user/group/other. Also, if num_aces is 0 i.e. 
DACL has no ACEs, @@ -795,19 +796,38 @@ static void parse_dacl(struct smb_acl *pdacl, char *end_of_acl, acl_base = (char *)pdacl; acl_size = sizeof(struct smb_acl); - num_aces = le32_to_cpu(pdacl->num_aces); + num_aces = le16_to_cpu(pdacl->num_aces); if (num_aces > 0) { umode_t denied_mode = 0; - if (num_aces > ULONG_MAX / sizeof(struct smb_ace *)) + if (num_aces > (le16_to_cpu(pdacl->size) - sizeof(struct smb_acl)) / + (offsetof(struct smb_ace, sid) + + offsetof(struct smb_sid, sub_auth) + sizeof(__le16))) return; + ppace = kmalloc_array(num_aces, sizeof(struct smb_ace *), GFP_KERNEL); if (!ppace) return; for (i = 0; i < num_aces; ++i) { + if (end_of_acl - acl_base < acl_size) + break; + ppace[i] = (struct smb_ace *) (acl_base + acl_size); + acl_base = (char *)ppace[i]; + acl_size = offsetof(struct smb_ace, sid) + + offsetof(struct smb_sid, sub_auth); + + if (end_of_acl - acl_base < acl_size || + ppace[i]->sid.num_subauth == 0 || + ppace[i]->sid.num_subauth > SID_MAX_SUB_AUTHORITIES || + (end_of_acl - acl_base < + acl_size + sizeof(__le32) * ppace[i]->sid.num_subauth) || + (le16_to_cpu(ppace[i]->size) < + acl_size + sizeof(__le32) * ppace[i]->sid.num_subauth)) + break; + #ifdef CONFIG_CIFS_DEBUG2 dump_ace(ppace[i], end_of_acl); #endif @@ -851,7 +871,6 @@ static void parse_dacl(struct smb_acl *pdacl, char *end_of_acl, (void *)ppace[i], sizeof(struct smb_ace)); */ - acl_base = (char *)ppace[i]; acl_size = le16_to_cpu(ppace[i]->size); } @@ -937,12 +956,12 @@ unsigned int setup_special_user_owner_ACE(struct smb_ace *pntace) static void populate_new_aces(char *nacl_base, struct smb_sid *pownersid, struct smb_sid *pgrpsid, - __u64 *pnmode, u32 *pnum_aces, u16 *pnsize, + __u64 *pnmode, u16 *pnum_aces, u16 *pnsize, bool modefromsid, bool posix) { __u64 nmode; - u32 num_aces = 0; + u16 num_aces = 0; u16 nsize = 0; __u64 user_mode; __u64 group_mode; @@ -1050,7 +1069,7 @@ static __u16 replace_sids_and_copy_aces(struct smb_acl *pdacl, struct smb_acl *p u16 size = 0; struct smb_ace *pntace = NULL; char *acl_base = NULL; - u32 src_num_aces = 0; + u16 src_num_aces = 0; u16 nsize = 0; struct smb_ace *pnntace = NULL; char *nacl_base = NULL; @@ -1058,7 +1077,7 @@ static __u16 replace_sids_and_copy_aces(struct smb_acl *pdacl, struct smb_acl *p acl_base = (char *)pdacl; size = sizeof(struct smb_acl); - src_num_aces = le32_to_cpu(pdacl->num_aces); + src_num_aces = le16_to_cpu(pdacl->num_aces); nacl_base = (char *)pndacl; nsize = sizeof(struct smb_acl); @@ -1090,11 +1109,11 @@ static int set_chmod_dacl(struct smb_acl *pdacl, struct smb_acl *pndacl, u16 size = 0; struct smb_ace *pntace = NULL; char *acl_base = NULL; - u32 src_num_aces = 0; + u16 src_num_aces = 0; u16 nsize = 0; struct smb_ace *pnntace = NULL; char *nacl_base = NULL; - u32 num_aces = 0; + u16 num_aces = 0; bool new_aces_set = false; /* Assuming that pndacl and pnmode are never NULL */ @@ -1112,7 +1131,7 @@ static int set_chmod_dacl(struct smb_acl *pdacl, struct smb_acl *pndacl, acl_base = (char *)pdacl; size = sizeof(struct smb_acl); - src_num_aces = le32_to_cpu(pdacl->num_aces); + src_num_aces = le16_to_cpu(pdacl->num_aces); /* Retain old ACEs which we can retain */ for (i = 0; i < src_num_aces; ++i) { @@ -1158,7 +1177,7 @@ next_ace: } finalize_dacl: - pndacl->num_aces = cpu_to_le32(num_aces); + pndacl->num_aces = cpu_to_le16(num_aces); pndacl->size = cpu_to_le16(nsize); return 0; @@ -1293,7 +1312,7 @@ static int build_sec_desc(struct smb_ntsd *pntsd, struct smb_ntsd *pnntsd, dacloffset ? 
dacl_ptr->revision : cpu_to_le16(ACL_REVISION); ndacl_ptr->size = cpu_to_le16(0); - ndacl_ptr->num_aces = cpu_to_le32(0); + ndacl_ptr->num_aces = cpu_to_le16(0); rc = set_chmod_dacl(dacl_ptr, ndacl_ptr, owner_sid_ptr, group_sid_ptr, pnmode, mode_from_sid, posix); @@ -1546,7 +1565,7 @@ cifs_acl_to_fattr(struct cifs_sb_info *cifs_sb, struct cifs_fattr *fattr, int rc = 0; struct tcon_link *tlink = cifs_sb_tlink(cifs_sb); struct smb_version_operations *ops; - const u32 info = 0; + const u32 info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO; cifs_dbg(NOISY, "converting ACL to mode for %s\n", path); @@ -1600,7 +1619,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, struct tcon_link *tlink; struct smb_version_operations *ops; bool mode_from_sid, id_from_sid; - const u32 info = 0; + const u32 info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO; bool posix; tlink = cifs_sb_tlink(cifs_sb); @@ -1653,7 +1672,7 @@ id_mode_to_cifs_acl(struct inode *inode, const char *path, __u64 *pnmode, dacl_ptr = (struct smb_acl *)((char *)pntsd + dacloffset); if (mode_from_sid) nsecdesclen += - le32_to_cpu(dacl_ptr->num_aces) * sizeof(struct smb_ace); + le16_to_cpu(dacl_ptr->num_aces) * sizeof(struct smb_ace); else /* cifsacl */ nsecdesclen += le16_to_cpu(dacl_ptr->size); } diff --git a/fs/smb/client/cifsencrypt.c b/fs/smb/client/cifsencrypt.c index e69968e88fe7..35892df7335c 100644 --- a/fs/smb/client/cifsencrypt.c +++ b/fs/smb/client/cifsencrypt.c @@ -704,18 +704,12 @@ cifs_crypto_secmech_release(struct TCP_Server_Info *server) cifs_free_hash(&server->secmech.md5); cifs_free_hash(&server->secmech.sha512); - if (!SERVER_IS_CHAN(server)) { - if (server->secmech.enc) { - crypto_free_aead(server->secmech.enc); - server->secmech.enc = NULL; - } - - if (server->secmech.dec) { - crypto_free_aead(server->secmech.dec); - server->secmech.dec = NULL; - } - } else { + if (server->secmech.enc) { + crypto_free_aead(server->secmech.enc); server->secmech.enc = NULL; + } + if (server->secmech.dec) { + crypto_free_aead(server->secmech.dec); server->secmech.dec = NULL; } } diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index 6a3bd652d251..fb04e263611c 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -637,6 +637,10 @@ cifs_show_options(struct seq_file *s, struct dentry *root) cifs_sb->ctx->dir_mode); if (cifs_sb->ctx->iocharset) seq_printf(s, ",iocharset=%s", cifs_sb->ctx->iocharset); + if (tcon->ses->unicode == 0) + seq_puts(s, ",nounicode"); + else if (tcon->ses->unicode == 1) + seq_puts(s, ",unicode"); if (tcon->seal) seq_puts(s, ",seal"); else if (tcon->ses->server->ignore_signature) @@ -925,7 +929,8 @@ cifs_get_root(struct smb3_fs_context *ctx, struct super_block *sb) while (*s && *s != sep) s++; - child = lookup_positive_unlocked(p, dentry, s - p); + child = lookup_noperm_positive_unlocked(&QSTR_LEN(p, s - p), + dentry); dput(dentry); dentry = child; } while (!IS_ERR(dentry)); diff --git a/fs/smb/client/cifsfs.h b/fs/smb/client/cifsfs.h index 831fee962c4d..ca435a3841b8 100644 --- a/fs/smb/client/cifsfs.h +++ b/fs/smb/client/cifsfs.h @@ -59,8 +59,8 @@ extern int cifs_unlink(struct inode *dir, struct dentry *dentry); extern int cifs_hardlink(struct dentry *, struct inode *, struct dentry *); extern int cifs_mknod(struct mnt_idmap *, struct inode *, struct dentry *, umode_t, dev_t); -extern int cifs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, - umode_t); +extern struct dentry *cifs_mkdir(struct mnt_idmap *, struct inode *, struct dentry *, + umode_t); 
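Returning to the parse_dacl() hardening a little earlier: the new num_aces check bounds the count by the smallest ACE that could possibly fit in the DACL, i.e. the smb_ace header up to its sid member, plus the smb_sid header up to sub_auth, plus one sub-authority word. A worked example with illustrative sizes (the numbers here are assumptions; the kernel derives them with offsetof() on the real structures):

    #include <stdio.h>
    #include <stddef.h>

    int main(void)
    {
        size_t acl_hdr  = 8;         /* sizeof(struct smb_acl), illustrative   */
        size_t min_ace  = 4 + 8 + 2; /* ace header + sid header + one subauth  */
        size_t acl_size = 64;        /* le16_to_cpu(pdacl->size) from the wire */

        size_t max_aces = (acl_size - acl_hdr) / min_ace;
        printf("at most %zu ACEs fit\n", max_aces); /* 4: a larger num_aces is bogus */
        return 0;
    }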
extern int cifs_rmdir(struct inode *, struct dentry *); extern int cifs_rename2(struct mnt_idmap *, struct inode *, struct dentry *, struct inode *, struct dentry *, @@ -135,7 +135,6 @@ extern ssize_t cifs_file_copychunk_range(unsigned int xid, extern long cifs_ioctl(struct file *filep, unsigned int cmd, unsigned long arg); extern void cifs_setsize(struct inode *inode, loff_t offset); -extern int cifs_truncate_page(struct address_space *mapping, loff_t from); struct smb3_fs_context; extern struct dentry *cifs_smb3_do_mount(struct file_system_type *fs_type, @@ -146,6 +145,6 @@ extern const struct export_operations cifs_export_ops; #endif /* CONFIG_CIFS_NFSD_EXPORT */ /* when changing internal version - update following two lines at same time */ -#define SMB3_PRODUCT_BUILD 53 -#define CIFS_VERSION "2.53" +#define SMB3_PRODUCT_BUILD 54 +#define CIFS_VERSION "2.54" #endif /* _CIFSFS_H */ diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index cddeb2adbf4a..3b32116b0b49 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -625,10 +625,8 @@ struct smb_version_operations { bool (*is_status_io_timeout)(char *buf); /* Check for STATUS_NETWORK_NAME_DELETED */ bool (*is_network_name_deleted)(char *buf, struct TCP_Server_Info *srv); - int (*parse_reparse_point)(struct cifs_sb_info *cifs_sb, - const char *full_path, - struct kvec *rsp_iov, - struct cifs_open_info_data *data); + struct reparse_data_buffer * (*get_reparse_point_buffer)(const struct kvec *rsp_iov, + u32 *plen); int (*create_reparse_symlink)(const unsigned int xid, struct inode *inode, struct dentry *dentry, @@ -653,6 +651,7 @@ struct smb_version_values { unsigned int cap_unix; unsigned int cap_nt_find; unsigned int cap_large_files; + unsigned int cap_unicode; __u16 signing_enabled; __u16 signing_required; size_t create_lease_size; @@ -713,6 +712,8 @@ struct TCP_Server_Info { spinlock_t srv_lock; /* protect anything here that is not protected */ __u64 conn_id; /* connection identifier (useful for debugging) */ int srv_count; /* reference counter */ + int rfc1001_sessinit; /* whether to establish netbios session */ + bool with_rfc1001; /* if netbios session is used */ /* 15 character server name + 0x20 16th byte indicating type = srv */ char server_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; struct smb_version_operations *ops; @@ -1120,6 +1121,7 @@ struct cifs_ses { bool sign; /* is signing required? */ bool domainAuto:1; bool expired_pwd; /* track if access denied or expired pwd so can know if need to update */ + int unicode; unsigned int flags; __u16 session_flags; __u8 smb3signingkey[SMB3_SIGN_KEY_SIZE]; @@ -1319,7 +1321,8 @@ struct cifs_tcon { #endif struct list_head pending_opens; /* list of incomplete opens */ struct cached_fids *cfids; - /* BB add field for back pointer to sb struct(s)?
*/ + struct list_head cifs_sb_list; + spinlock_t sb_list_lock; #ifdef CONFIG_CIFS_DFS_UPCALL struct delayed_work dfs_cache_work; struct list_head dfs_ses_list; @@ -1716,6 +1719,7 @@ struct mid_q_entry { void *resp_buf; /* pointer to received SMB header */ unsigned int resp_buf_size; int mid_state; /* wish this were enum but can not pass to wait_event */ + int mid_rc; /* rc for MID_RC */ unsigned int mid_flags; __le16 command; /* smb command code */ unsigned int optype; /* operation type */ @@ -1878,6 +1882,7 @@ static inline bool is_replayable_error(int error) #define MID_RESPONSE_MALFORMED 0x10 #define MID_SHUTDOWN 0x20 #define MID_RESPONSE_READY 0x40 /* ready for other process handle the rsp */ +#define MID_RC 0x80 /* mid_rc contains custom rc */ /* Flags */ #define MID_WAIT_CANCELLED 1 /* Cancelled while waiting for response */ diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h index 48d0d6f439cf..1b79fe07476f 100644 --- a/fs/smb/client/cifspdu.h +++ b/fs/smb/client/cifspdu.h @@ -1266,10 +1266,9 @@ typedef struct smb_com_query_information_rsp { typedef struct smb_com_setattr_req { struct smb_hdr hdr; /* wct = 8 */ __le16 attr; - __le16 time_low; - __le16 time_high; + __le32 last_write_time; __le16 reserved[5]; /* must be zero */ - __u16 ByteCount; + __le16 ByteCount; __u8 BufferFormat; /* 4 = ASCII */ unsigned char fileName[]; } __attribute__((packed)) SETATTR_REQ; @@ -2256,6 +2255,8 @@ typedef struct { #define FILE_SUPPORTS_ENCRYPTION 0x00020000 #define FILE_SUPPORTS_OBJECT_IDS 0x00010000 #define FILE_VOLUME_IS_COMPRESSED 0x00008000 +#define FILE_SUPPORTS_POSIX_UNLINK_RENAME 0x00000400 +#define FILE_RETURNS_CLEANUP_RESULT_INFO 0x00000200 #define FILE_SUPPORTS_REMOTE_STORAGE 0x00000100 #define FILE_SUPPORTS_REPARSE_POINTS 0x00000080 #define FILE_SUPPORTS_SPARSE_FILES 0x00000040 diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index 81680001944d..ecf774a8f1ca 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -31,6 +31,9 @@ extern void cifs_small_buf_release(void *); extern void free_rsp_buf(int, void *); extern int smb_send(struct TCP_Server_Info *, struct smb_hdr *, unsigned int /* length */); +extern int smb_send_kvec(struct TCP_Server_Info *server, + struct msghdr *msg, + size_t *sent); extern unsigned int _get_xid(void); extern void _free_xid(unsigned int); #define get_xid() \ @@ -160,6 +163,8 @@ extern int cifs_get_writable_path(struct cifs_tcon *tcon, const char *name, extern struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *, bool); extern int cifs_get_readable_path(struct cifs_tcon *tcon, const char *name, struct cifsFileInfo **ret_file); +extern int cifs_get_hardlink_path(struct cifs_tcon *tcon, struct inode *inode, + struct file *file); extern unsigned int smbCalcSize(void *buf); extern int decode_negTokenInit(unsigned char *security_blob, int length, struct TCP_Server_Info *server); @@ -390,6 +395,10 @@ extern int CIFSSMBQFSUnixInfo(const unsigned int xid, struct cifs_tcon *tcon); extern int CIFSSMBQFSPosixInfo(const unsigned int xid, struct cifs_tcon *tcon, struct kstatfs *FSData); +extern int SMBSetInformation(const unsigned int xid, struct cifs_tcon *tcon, + const char *fileName, __le32 attributes, __le64 write_time, + const struct nls_table *nls_codepage, + struct cifs_sb_info *cifs_sb); extern int CIFSSMBSetPathInfo(const unsigned int xid, struct cifs_tcon *tcon, const char *fileName, const FILE_BASIC_INFO *data, const struct nls_table *nls_codepage, @@ -592,7 +601,6 @@ int cifs_async_readv(struct 
cifs_io_subrequest *rdata); int cifs_readv_receive(struct TCP_Server_Info *server, struct mid_q_entry *mid); void cifs_async_writev(struct cifs_io_subrequest *wdata); -void cifs_writev_complete(struct work_struct *work); int cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_sb_info *cifs_sb, const unsigned char *path, char *pbuf, diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index 3feaa0f68169..f55457b4b82e 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -114,19 +114,23 @@ again: mutex_lock(&ses->session_mutex); /* - * Recheck after acquire mutex. If another thread is negotiating - * and the server never sends an answer the socket will be closed - * and tcpStatus set to reconnect. + * Handle the case where a concurrent thread failed to negotiate or + * killed a channel. */ spin_lock(&server->srv_lock); - if (server->tcpStatus == CifsNeedReconnect) { + switch (server->tcpStatus) { + case CifsExiting: spin_unlock(&server->srv_lock); mutex_unlock(&ses->session_mutex); - - if (tcon->retry) - goto again; - rc = -EHOSTDOWN; - goto out; + return -EHOSTDOWN; + case CifsNeedReconnect: + spin_unlock(&server->srv_lock); + mutex_unlock(&ses->session_mutex); + if (!tcon->retry) + return -EHOSTDOWN; + goto again; + default: + break; } spin_unlock(&server->srv_lock); @@ -152,16 +156,20 @@ again: spin_unlock(&ses->ses_lock); rc = cifs_negotiate_protocol(0, ses, server); - if (!rc) { - rc = cifs_setup_session(0, ses, server, ses->local_nls); - if ((rc == -EACCES) || (rc == -EHOSTDOWN) || (rc == -EKEYREVOKED)) { - /* - * Try alternate password for next reconnect if an alternate - * password is available. - */ - if (ses->password2) - swap(ses->password2, ses->password); - } + if (rc) { + mutex_unlock(&ses->session_mutex); + if (!tcon->retry) + return -EHOSTDOWN; + goto again; + } + rc = cifs_setup_session(0, ses, server, ses->local_nls); + if ((rc == -EACCES) || (rc == -EHOSTDOWN) || (rc == -EKEYREVOKED)) { + /* + * Try alternate password for next reconnect if an alternate + * password is available. + */ + if (ses->password2) + swap(ses->password2, ses->password); } /* do we need to reconnect tcon? */ @@ -429,7 +437,10 @@ CIFSSMBNegotiate(const unsigned int xid, return rc; pSMB->hdr.Mid = get_next_mid(server); - pSMB->hdr.Flags2 |= (SMBFLG2_UNICODE | SMBFLG2_ERR_STATUS); + pSMB->hdr.Flags2 |= SMBFLG2_ERR_STATUS; + + if (ses->unicode != 0) + pSMB->hdr.Flags2 |= SMBFLG2_UNICODE; if (should_set_ext_sec_flag(ses->sectype)) { cifs_dbg(FYI, "Requesting extended security\n"); @@ -1030,15 +1041,31 @@ static __u16 convert_disposition(int disposition) static int access_flags_to_smbopen_mode(const int access_flags) { - int masked_flags = access_flags & (GENERIC_READ | GENERIC_WRITE); - - if (masked_flags == GENERIC_READ) - return SMBOPEN_READ; - else if (masked_flags == GENERIC_WRITE) + /* + * SYSTEM_SECURITY grants both read and write access to SACL, treat it as read/write. + * MAXIMUM_ALLOWED grants as much access as possible, so treat it as read/write too. + * SYNCHRONIZE by itself does not grant any specific access, so do not check its mask. + * If only the SYNCHRONIZE bit is specified then fall back to read access.
+ */ + bool with_write_flags = access_flags & (FILE_WRITE_DATA | FILE_APPEND_DATA | FILE_WRITE_EA | + FILE_DELETE_CHILD | FILE_WRITE_ATTRIBUTES | DELETE | + WRITE_DAC | WRITE_OWNER | SYSTEM_SECURITY | + MAXIMUM_ALLOWED | GENERIC_WRITE | GENERIC_ALL); + bool with_read_flags = access_flags & (FILE_READ_DATA | FILE_READ_EA | FILE_EXECUTE | + FILE_READ_ATTRIBUTES | READ_CONTROL | + SYSTEM_SECURITY | MAXIMUM_ALLOWED | GENERIC_ALL | + GENERIC_EXECUTE | GENERIC_READ); + bool with_execute_flags = access_flags & (FILE_EXECUTE | MAXIMUM_ALLOWED | GENERIC_ALL | + GENERIC_EXECUTE); + + if (with_write_flags && with_read_flags) + return SMBOPEN_READWRITE; + else if (with_write_flags) return SMBOPEN_WRITE; - - /* just go for read/write */ - return SMBOPEN_READWRITE; + else if (with_execute_flags) + return SMBOPEN_EXECUTE; + else + return SMBOPEN_READ; } int @@ -1338,7 +1365,8 @@ cifs_readv_callback(struct mid_q_entry *mid) rdata->credits.value = 0; rdata->subreq.error = rdata->result; rdata->subreq.transferred += rdata->got_bytes; - queue_work(cifsiod_wq, &rdata->subreq.work); + trace_netfs_sreq(&rdata->subreq, netfs_sreq_trace_io_progress); + netfs_read_subreq_terminated(&rdata->subreq); release_mid(mid); add_credits(server, &credits, 0); } @@ -2700,6 +2728,9 @@ int cifs_query_reparse_point(const unsigned int xid, if (cap_unix(tcon->ses)) return -EOPNOTSUPP; + if (!(le32_to_cpu(tcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS)) + return -EOPNOTSUPP; + oparms = (struct cifs_open_parms) { .tcon = tcon, .cifs_sb = cifs_sb, @@ -3391,8 +3422,7 @@ CIFSSMBGetCIFSACL(const unsigned int xid, struct cifs_tcon *tcon, __u16 fid, /* BB TEST with big acls that might need to be e.g. larger than 16K */ pSMB->MaxSetupCount = 0; pSMB->Fid = fid; /* file handle always le */ - pSMB->AclFlags = cpu_to_le32(CIFS_ACL_OWNER | CIFS_ACL_GROUP | - CIFS_ACL_DACL | info); + pSMB->AclFlags = cpu_to_le32(info); pSMB->ByteCount = cpu_to_le16(11); /* 3 bytes pad + 8 bytes parm */ inc_rfc1001_len(pSMB, 11); iov[0].iov_base = (char *)pSMB; @@ -5141,6 +5171,63 @@ CIFSSMBSetFileSize(const unsigned int xid, struct cifs_tcon *tcon, return rc; }
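The rewritten access_flags_to_smbopen_mode() above collapses an NT access mask into one of four SMBOPEN modes. A sketch of just the decision logic, with booleans standing in for the three flag tests (the real flag sets live in the SMB headers):

    #include <stdio.h>
    #include <stdbool.h>

    static const char *smbopen_mode(bool w, bool r, bool x)
    {
        if (w && r)
            return "SMBOPEN_READWRITE";
        if (w)
            return "SMBOPEN_WRITE";
        if (x)
            return "SMBOPEN_EXECUTE";
        return "SMBOPEN_READ";      /* includes the SYNCHRONIZE-only fallback */
    }

    int main(void)
    {
        printf("%s\n", smbopen_mode(true, true, false));   /* e.g. GENERIC_READ|GENERIC_WRITE */
        printf("%s\n", smbopen_mode(false, false, true));  /* e.g. FILE_EXECUTE only */
        printf("%s\n", smbopen_mode(false, false, false)); /* e.g. SYNCHRONIZE only */
        return 0;
    }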
+int +SMBSetInformation(const unsigned int xid, struct cifs_tcon *tcon, + const char *fileName, __le32 attributes, __le64 write_time, + const struct nls_table *nls_codepage, + struct cifs_sb_info *cifs_sb) +{ + SETATTR_REQ *pSMB; + SETATTR_RSP *pSMBr; + struct timespec64 ts; + int bytes_returned; + int name_len; + int rc; + + cifs_dbg(FYI, "In %s path %s\n", __func__, fileName); + +retry: + rc = smb_init(SMB_COM_SETATTR, 8, tcon, (void **) &pSMB, + (void **) &pSMBr); + if (rc) + return rc; + + if (pSMB->hdr.Flags2 & SMBFLG2_UNICODE) { + name_len = + cifsConvertToUTF16((__le16 *) pSMB->fileName, + fileName, PATH_MAX, nls_codepage, + cifs_remap(cifs_sb)); + name_len++; /* trailing null */ + name_len *= 2; + } else { + name_len = copy_path_name(pSMB->fileName, fileName); + } + /* Only a few attributes can be set by this command; others are not accepted by Win9x. */ + pSMB->attr = cpu_to_le16(le32_to_cpu(attributes) & + (ATTR_READONLY | ATTR_HIDDEN | ATTR_SYSTEM | ATTR_ARCHIVE)); + /* A zero write time value (in both NT and SETATTR formats) means it should not be changed. */ + if (le64_to_cpu(write_time) != 0) { + ts = cifs_NTtimeToUnix(write_time); + pSMB->last_write_time = cpu_to_le32(ts.tv_sec); + } + pSMB->BufferFormat = 0x04; + name_len++; /* account for buffer type byte */ + inc_rfc1001_len(pSMB, (__u16)name_len); + pSMB->ByteCount = cpu_to_le16(name_len); + + rc = SendReceive(xid, tcon->ses, (struct smb_hdr *) pSMB, + (struct smb_hdr *) pSMBr, &bytes_returned, 0); + if (rc) + cifs_dbg(FYI, "Send error in %s = %d\n", __func__, rc); + + cifs_buf_release(pSMB); + + if (rc == -EAGAIN) + goto retry; + + return rc; +} + /* Some legacy servers such as NT4 require that the file times be set on an open handle, rather than by pathname - this is awkward due to potential access conflicts on the open, but it is unavoidable for these diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index f917de020dd5..6bf04d9a5491 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -370,7 +370,7 @@ static bool cifs_tcp_ses_needs_reconnect(struct TCP_Server_Info *server, int num * */ static int __cifs_reconnect(struct TCP_Server_Info *server, - bool mark_smb_session) + bool mark_smb_session, bool once) { int rc = 0; @@ -398,6 +398,9 @@ static int __cifs_reconnect(struct TCP_Server_Info *server, if (rc) { cifs_server_unlock(server); cifs_dbg(FYI, "%s: reconnect error %d\n", __func__, rc); + /* If asked to reconnect only once, do not try more times */ + if (once) + break; msleep(3000); } else { atomic_inc(&tcpSesReconnectCount); @@ -563,19 +566,33 @@ static int reconnect_dfs_server(struct TCP_Server_Info *server) return rc; } -int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session) +static int +_cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session, bool once) { if (!server->leaf_fullpath) - return __cifs_reconnect(server, mark_smb_session); + return __cifs_reconnect(server, mark_smb_session, once); return reconnect_dfs_server(server); } #else -int cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session) +static int +_cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session, bool once) { - return __cifs_reconnect(server, mark_smb_session); + return __cifs_reconnect(server, mark_smb_session, once); } #endif +int +cifs_reconnect(struct TCP_Server_Info *server, bool mark_smb_session) +{ + return _cifs_reconnect(server, mark_smb_session, false); +} + +static int +cifs_reconnect_once(struct TCP_Server_Info *server) +{ + return _cifs_reconnect(server, true, true); +} + static void cifs_echo_request(struct work_struct *work) { @@ -802,26 +819,110 @@ is_smb_response(struct TCP_Server_Info *server, unsigned char type) /* Regular SMB response */ return true; case RFC1002_SESSION_KEEP_ALIVE: + /* + * An RFC 1002 session keep alive can be sent by the server only + * when we have established an RFC 1002 session. But Samba servers + * also send RFC 1002 session keep alives over port 445, on which + * no RFC 1002 session is established. + */ cifs_dbg(FYI, "RFC 1002 session keep alive\n"); break; case RFC1002_POSITIVE_SESSION_RESPONSE: - cifs_dbg(FYI, "RFC 1002 positive session response\n"); + /* + * An RFC 1002 positive session response cannot be returned + * for an SMB request. The RFC 1002 session response is handled + * exclusively in the ip_rfc1001_connect() function. + */ + cifs_server_dbg(VFS, "RFC 1002 positive session response (unexpected)\n"); + cifs_reconnect(server, true); break; case RFC1002_NEGATIVE_SESSION_RESPONSE: /* * We get this from Windows 98 instead of an error on - * SMB negprot response.
+ * SMB negprot response, when we have not established an + * RFC 1002 session (which means ip_rfc1001_connect() + * was skipped). Note that the same still happens with + * Windows Server 2022 when connecting via port 139. + * So for this case, when the mount option -o nonbsessinit + * was not specified, try to reconnect with establishing + * an RFC 1002 session. If the new socket establishment with + * an RFC 1002 session was successful then return to the + * mid's caller -EAGAIN, so it can retry the request. */ - cifs_dbg(FYI, "RFC 1002 negative session response\n"); - /* give server a second to clean up */ - msleep(1000); - /* - * Always try 445 first on reconnect since we get NACK - * on some if we ever connected to port 139 (the NACK - * is since we do not begin with RFC1001 session - * initialize frame). - */ - cifs_set_port((struct sockaddr *)&server->dstaddr, CIFS_PORT); + if (!cifs_rdma_enabled(server) && + server->tcpStatus == CifsInNegotiate && + !server->with_rfc1001 && + server->rfc1001_sessinit != 0) { + int rc, mid_rc; + struct mid_q_entry *mid, *nmid; + LIST_HEAD(dispose_list); + + cifs_dbg(FYI, "RFC 1002 negative session response during SMB Negotiate, retrying with NetBIOS session\n"); + + /* + * Before reconnect, delete all pending mids for this + * server, so reconnect would not signal a connection- + * aborted error to the mids' callbacks. Note that for this + * server there should be exactly one pending mid, + * corresponding to the SMB1/SMB2 Negotiate packet. + */ + spin_lock(&server->mid_lock); + list_for_each_entry_safe(mid, nmid, &server->pending_mid_q, qhead) { + kref_get(&mid->refcount); + list_move(&mid->qhead, &dispose_list); + mid->mid_flags |= MID_DELETED; + } + spin_unlock(&server->mid_lock); + + /* Now try to reconnect once with NetBIOS session. */ + server->with_rfc1001 = true; + rc = cifs_reconnect_once(server); + + /* + * If reconnect was successful then indicate -EAGAIN + * to the mid's caller. If reconnect failed with -EAGAIN + * then mask it as -EHOSTDOWN, so the mid's caller would + * know that it failed. + */ + if (rc == 0) + mid_rc = -EAGAIN; + else if (rc == -EAGAIN) + mid_rc = -EHOSTDOWN; + else + mid_rc = rc; + + /* + * After reconnect (either successful or unsuccessful) + * deliver the reconnect status to the mid's caller via the + * mid's callback. Use the MID_RC state, which indicates that + * the return code should be read from the mid_rc member. + */ + list_for_each_entry_safe(mid, nmid, &dispose_list, qhead) { + list_del_init(&mid->qhead); + mid->mid_rc = mid_rc; + mid->mid_state = MID_RC; + mid->callback(mid); + release_mid(mid); + } + + /* + * If reconnect failed then wait two seconds. In most + * cases we have been called from the mount context, and + * the failure delivered to the mid's callback will stop + * this receiver task thread and fail the mount process. + * So wait two seconds to prevent another reconnect + * in this task thread, which would be useless as the + * mount context is going to fail anyway. + */ + if (rc != 0) + msleep(2000); + } else { + cifs_server_dbg(VFS, "RFC 1002 negative session response (unexpected)\n"); + cifs_reconnect(server, true); + } + break; + case RFC1002_RETARGET_SESSION_RESPONSE: + cifs_server_dbg(VFS, "RFC 1002 retarget session response (unexpected)\n"); cifs_reconnect(server, true); break; default: @@ -972,13 +1073,9 @@ clean_demultiplex_info(struct TCP_Server_Info *server) msleep(125); if (cifs_rdma_enabled(server)) smbd_destroy(server); - if (server->ssocket) { sock_release(server->ssocket); server->ssocket = NULL; - - /* Release netns reference for the socket.
*/ - put_net(cifs_net_ns(server)); } if (!list_empty(&server->pending_mid_q)) { @@ -1026,7 +1123,6 @@ clean_demultiplex_info(struct TCP_Server_Info *server) */ } - /* Release netns reference for this server. */ put_net(cifs_net_ns(server)); kfree(server->leaf_fullpath); kfree(server->hostname); @@ -1672,10 +1768,9 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, tcp_ses->ops = ctx->ops; tcp_ses->vals = ctx->vals; - - /* Grab netns reference for this server. */ cifs_set_net_ns(tcp_ses, get_net(current->nsproxy->net_ns)); + tcp_ses->sign = ctx->sign; tcp_ses->conn_id = atomic_inc_return(&tcpSesNextId); tcp_ses->noblockcnt = ctx->rootfs; tcp_ses->noblocksnd = ctx->noblocksnd || ctx->rootfs; @@ -1699,6 +1794,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, ctx->source_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); memcpy(tcp_ses->server_RFC1001_name, ctx->target_rfc1001_name, RFC1001_NAME_LEN_WITH_NULL); + tcp_ses->rfc1001_sessinit = ctx->rfc1001_sessinit; + tcp_ses->with_rfc1001 = false; tcp_ses->session_estab = false; tcp_ses->sequence_number = 0; tcp_ses->channel_sequence_num = 0; /* only tracked for primary channel */ @@ -1729,12 +1826,8 @@ cifs_get_tcp_session(struct smb3_fs_context *ctx, */ tcp_ses->tcpStatus = CifsNew; ++tcp_ses->srv_count; + tcp_ses->echo_interval = ctx->echo_interval * HZ; - if (ctx->echo_interval >= SMB_ECHO_INTERVAL_MIN && - ctx->echo_interval <= SMB_ECHO_INTERVAL_MAX) - tcp_ses->echo_interval = ctx->echo_interval * HZ; - else - tcp_ses->echo_interval = SMB_ECHO_INTERVAL_DEFAULT * HZ; if (tcp_ses->rdma) { #ifndef CONFIG_CIFS_SMB_DIRECT cifs_dbg(VFS, "CONFIG_CIFS_SMB_DIRECT is not enabled\n"); @@ -1802,7 +1895,6 @@ smbd_connected: out_err_crypto_release: cifs_crypto_secmech_release(tcp_ses); - /* Release netns reference for this server. 
*/ put_net(cifs_net_ns(tcp_ses)); out_err: @@ -1811,10 +1903,8 @@ out_err: cifs_put_tcp_session(tcp_ses->primary_server, false); kfree(tcp_ses->hostname); kfree(tcp_ses->leaf_fullpath); - if (tcp_ses->ssocket) { + if (tcp_ses->ssocket) sock_release(tcp_ses->ssocket); - put_net(cifs_net_ns(tcp_ses)); - } kfree(tcp_ses); } return ERR_PTR(rc); @@ -1825,9 +1915,8 @@ static int match_session(struct cifs_ses *ses, struct smb3_fs_context *ctx, bool match_super) { - if (ctx->sectype != Unspecified && - ctx->sectype != ses->sectype) - return 0; + struct TCP_Server_Info *server = ses->server; + enum securityEnum ctx_sec, ses_sec; if (!match_super && ctx->dfs_root_ses != ses->dfs_root_ses) return 0; @@ -1839,11 +1928,20 @@ static int match_session(struct cifs_ses *ses, if (ses->chan_max < ctx->max_channels) return 0; - switch (ses->sectype) { + ctx_sec = server->ops->select_sectype(server, ctx->sectype); + ses_sec = server->ops->select_sectype(server, ses->sectype); + + if (ctx_sec != ses_sec) + return 0; + + switch (ctx_sec) { + case IAKerb: case Kerberos: if (!uid_eq(ctx->cred_uid, ses->cred_uid)) return 0; break; + case NTLMv2: + case RawNTLMSSP: default: /* NULL username means anonymous session */ if (ses->user_name == NULL) { @@ -2341,6 +2439,7 @@ retry_old_session: ses->cred_uid = ctx->cred_uid; ses->linux_uid = ctx->linux_uid; + ses->unicode = ctx->unicode; ses->sectype = ctx->sectype; ses->sign = ctx->sign; @@ -2446,6 +2545,8 @@ static int match_tcon(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) return 0; if (tcon->nodelete != ctx->nodelete) return 0; + if (tcon->posix_extensions != ctx->linux_ext) + return 0; return 1; } @@ -3018,6 +3119,44 @@ bind_socket(struct TCP_Server_Info *server) } static int +smb_recv_kvec(struct TCP_Server_Info *server, struct msghdr *msg, size_t *recv) +{ + int rc = 0; + int retries = 0; + int msg_flags = server->noblocksnd ? MSG_DONTWAIT : 0; + + *recv = 0; + + while (msg_data_left(msg)) { + rc = sock_recvmsg(server->ssocket, msg, msg_flags); + if (rc == -EAGAIN) { + retries++; + if (retries >= 14 || + (!server->noblocksnd && (retries > 2))) { + cifs_server_dbg(VFS, "sends on sock %p stuck for 15 seconds\n", + server->ssocket); + return -EAGAIN; + } + msleep(1 << retries); + continue; + } + + if (rc < 0) + return rc; + + if (rc == 0) { + cifs_dbg(FYI, "Received no data (TCP RST)\n"); + return -ECONNABORTED; + } + + /* recv was at least partially successful */ + *recv += rc; + retries = 0; /* in case we get ENOSPC on the next send */ + } + return 0; +} + +static int ip_rfc1001_connect(struct TCP_Server_Info *server) { int rc = 0; @@ -3027,8 +3166,12 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) * sessinit is sent but no second negprot */ struct rfc1002_session_packet req = {}; - struct smb_hdr *smb_buf = (struct smb_hdr *)&req; + struct rfc1002_session_packet resp = {}; + struct msghdr msg = {}; + struct kvec iov = {}; unsigned int len; + size_t sent; + size_t recv; req.trailer.session_req.called_len = sizeof(req.trailer.session_req.called_name); @@ -3057,19 +3200,119 @@ ip_rfc1001_connect(struct TCP_Server_Info *server) * As per rfc1002, @len must be the number of bytes that follows the * length field of a rfc1002 session request payload. 
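/*
 * Sketch of the RFC 1002 framing implemented just below, assuming the
 * struct rfc1002_session_packet layout from cifspdu.h (1-byte type, 1-byte
 * flags, big-endian 16-bit length, then the called/calling names). The
 * length field counts only the bytes that follow it, which is why the
 * header size is added to the total only after req.length has been set.
 * The helper name is hypothetical; the patch open-codes this sequence.
 */
static size_t rfc1002_frame_session_request(struct rfc1002_session_packet *req)
{
	size_t payload = sizeof(req->trailer.session_req);

	req->type = RFC1002_SESSION_REQUEST;
	req->flags = 0;
	req->length = cpu_to_be16(payload);

	/* bytes to hand to the socket: header plus payload */
	return payload + offsetof(struct rfc1002_session_packet,
				  trailer.session_req);
}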
*/ - len = sizeof(req) - offsetof(struct rfc1002_session_packet, trailer.session_req); + len = sizeof(req.trailer.session_req); + req.type = RFC1002_SESSION_REQUEST; + req.flags = 0; + req.length = cpu_to_be16(len); + len += offsetof(typeof(req), trailer.session_req); + iov.iov_base = &req; + iov.iov_len = len; + iov_iter_kvec(&msg.msg_iter, ITER_SOURCE, &iov, 1, len); + rc = smb_send_kvec(server, &msg, &sent); + if (rc < 0 || len != sent) + return (rc == -EINTR || rc == -EAGAIN) ? rc : -ECONNABORTED; - smb_buf->smb_buf_length = cpu_to_be32((RFC1002_SESSION_REQUEST << 24) | len); - rc = smb_send(server, smb_buf, len); /* * RFC1001 layer in at least one server requires very short break before * negprot presumably because not expecting negprot to follow so fast. - * This is a simple solution that works without complicating the code - * and causes no significant slowing down on mount for everyone else + * For example, DOS SMB servers cannot process negprot if it is received + * before the server has sent the response to the SESSION_REQUEST packet. + * So wait for the response, read it and parse it, as it can contain useful + * error information (e.g. the specified server name was incorrect). Even + * the latest Windows Server 2022 SMB1 server over port 139 sends an error + * if the server name in the SESSION_REQUEST packet was incorrect. Nowadays + * usage of port 139 is not common, so waiting for the reply here does not + * slow down mounting in the common case (over port 445). */ - usleep_range(1000, 2000); + len = offsetof(typeof(resp), trailer); + iov.iov_base = &resp; + iov.iov_len = len; + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, len); + rc = smb_recv_kvec(server, &msg, &recv); + if (rc < 0 || recv != len) + return (rc == -EINTR || rc == -EAGAIN) ? rc : -ECONNABORTED; + + switch (resp.type) { + case RFC1002_POSITIVE_SESSION_RESPONSE: + if (be16_to_cpu(resp.length) != 0) { + cifs_dbg(VFS, "RFC 1002 positive session response but with invalid non-zero length %u\n", + be16_to_cpu(resp.length)); + return -EIO; + } + cifs_dbg(FYI, "RFC 1002 positive session response\n"); + break; + case RFC1002_NEGATIVE_SESSION_RESPONSE: + /* Read RFC1002 response error code and convert it to errno in rc */ + len = sizeof(resp.trailer.neg_ses_resp_error_code); + iov.iov_base = &resp.trailer.neg_ses_resp_error_code; + iov.iov_len = len; + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, len); + if (be16_to_cpu(resp.length) == len && + smb_recv_kvec(server, &msg, &recv) == 0 && + recv == len) { + cifs_dbg(VFS, "RFC 1002 negative session response with error 0x%x\n", + resp.trailer.neg_ses_resp_error_code); + switch (resp.trailer.neg_ses_resp_error_code) { + case RFC1002_NOT_LISTENING_CALLED: + /* server does not listen for specified server name */ + fallthrough; + case RFC1002_NOT_PRESENT: + /* server name is incorrect */ + rc = -ENOENT; + cifs_dbg(VFS, "Server rejected NetBIOS servername %.15s\n", + server->server_RFC1001_name[0] ? + server->server_RFC1001_name : + DEFAULT_CIFS_CALLED_NAME); + cifs_dbg(VFS, "Specify correct NetBIOS servername in source path or with -o servern= option\n"); + break; + case RFC1002_NOT_LISTENING_CALLING: + /* client name was not accepted by server */ + rc = -EACCES; + cifs_dbg(VFS, "Server rejected NetBIOS clientname %.15s\n", + server->workstation_RFC1001_name[0] ? 
+ server->workstation_RFC1001_name : + "LINUX_CIFS_CLNT"); + cifs_dbg(VFS, "Specify correct NetBIOS clientname with -o netbiosname= option\n"); + break; + case RFC1002_INSUFFICIENT_RESOURCE: + /* remote server resource error */ + rc = -EREMOTEIO; + break; + case RFC1002_UNSPECIFIED_ERROR: + default: + /* other/unknown error */ + rc = -EIO; + break; + } + } else { + cifs_dbg(VFS, "RFC 1002 negative session response\n"); + rc = -EIO; + } + return rc; + case RFC1002_RETARGET_SESSION_RESPONSE: + cifs_dbg(VFS, "RFC 1002 retarget session response\n"); + if (be16_to_cpu(resp.length) == sizeof(resp.trailer.retarget_resp)) { + len = sizeof(resp.trailer.retarget_resp); + iov.iov_base = &resp.trailer.retarget_resp; + iov.iov_len = len; + iov_iter_kvec(&msg.msg_iter, ITER_DEST, &iov, 1, len); + if (smb_recv_kvec(server, &msg, &recv) == 0 && recv == len) { + cifs_dbg(VFS, "Server wants to redirect connection\n"); + cifs_dbg(VFS, "Remount with options -o ip=%pI4,port=%u\n", + &resp.trailer.retarget_resp.retarget_ip_addr, + be16_to_cpu(resp.trailer.retarget_resp.port)); + } + } + cifs_dbg(VFS, "Closing connection\n"); + /* FIXME: Should we automatically redirect to new retarget_resp server? */ + return -EMULTIHOP; + default: + cifs_dbg(VFS, "RFC 1002 unknown response type 0x%x\n", resp.type); + return -EIO; + } - return rc; + server->with_rfc1001 = true; + return 0; } static int @@ -3105,20 +3348,20 @@ generic_ip_connect(struct TCP_Server_Info *server) socket = server->ssocket; } else { struct net *net = cifs_net_ns(server); + struct sock *sk; - rc = sock_create_kern(net, sfamily, SOCK_STREAM, IPPROTO_TCP, &server->ssocket); + rc = __sock_create(net, sfamily, SOCK_STREAM, + IPPROTO_TCP, &server->ssocket, 1); if (rc < 0) { cifs_server_dbg(VFS, "Error %d creating socket\n", rc); return rc; } - /* - * Grab netns reference for the socket. - * - * It'll be released here, on error, or in clean_demultiplex_info() upon server - * teardown. - */ - get_net(net); + sk = server->ssocket->sk; + __netns_tracker_free(net, &sk->ns_tracker, false); + sk->sk_net_refcnt = 1; + get_net_track(net, &sk->ns_tracker, GFP_KERNEL); + sock_inuse_add(net, 1); /* BB other socket options to set KEEPALIVE, NODELAY? */ cifs_dbg(FYI, "Socket created\n"); @@ -3132,10 +3375,8 @@ generic_ip_connect(struct TCP_Server_Info *server) } rc = bind_socket(server); - if (rc < 0) { - put_net(cifs_net_ns(server)); + if (rc < 0) return rc; - } /* * Eventually check for other socket options to change from @@ -3172,17 +3413,22 @@ generic_ip_connect(struct TCP_Server_Info *server) if (rc < 0) { cifs_dbg(FYI, "Error %d connecting to server\n", rc); trace_smb3_connect_err(server->hostname, server->conn_id, &server->dstaddr, rc); - put_net(cifs_net_ns(server)); sock_release(socket); server->ssocket = NULL; return rc; } trace_smb3_connect_done(server->hostname, server->conn_id, &server->dstaddr); - if (sport == htons(RFC1001_PORT)) - rc = ip_rfc1001_connect(server); - if (rc < 0) - put_net(cifs_net_ns(server)); + /* + * Establish RFC1001 NetBIOS session when it was explicitly requested + * by mount option -o nbsessinit, or when connecting to default RFC1001 + * server port (139) and it was not explicitly disabled by mount option + * -o nonbsessinit. 
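/*
 * The error handling above maps the RFC 1002 negative session response
 * codes onto errno values; summarized as a table (hypothetical helper,
 * same mapping as the open-coded switch):
 */
static int rfc1002_neg_resp_to_errno(u8 error_code)
{
	switch (error_code) {
	case RFC1002_NOT_LISTENING_CALLED:	/* not listening on called name */
	case RFC1002_NOT_PRESENT:		/* called name not present */
		return -ENOENT;			/* wrong NetBIOS server name */
	case RFC1002_NOT_LISTENING_CALLING:	/* calling name rejected */
		return -EACCES;			/* client name not accepted */
	case RFC1002_INSUFFICIENT_RESOURCE:
		return -EREMOTEIO;		/* remote server resource error */
	case RFC1002_UNSPECIFIED_ERROR:
	default:
		return -EIO;			/* other/unknown error */
	}
}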
+ */ + if (server->with_rfc1001 || + server->rfc1001_sessinit == 1 || + (server->rfc1001_sessinit == -1 && sport == htons(RFC1001_PORT))) + rc = ip_rfc1001_connect(server); return rc; } @@ -3330,6 +3576,7 @@ int cifs_setup_cifs_sb(struct cifs_sb_info *cifs_sb) struct smb3_fs_context *ctx = cifs_sb->ctx; INIT_DELAYED_WORK(&cifs_sb->prune_tlinks, cifs_prune_tlinks); + INIT_LIST_HEAD(&cifs_sb->tcon_sb_link); spin_lock_init(&cifs_sb->tlink_tree_lock); cifs_sb->tlink_tree = RB_ROOT; @@ -3506,28 +3753,7 @@ int cifs_mount_get_tcon(struct cifs_mount_ctx *mnt_ctx) } } - /* - * Clamp the rsize/wsize mount arguments if they are too big for the server - * and set the rsize/wsize to the negotiated values if not passed in by - * the user on mount - */ - if ((cifs_sb->ctx->wsize == 0) || - (cifs_sb->ctx->wsize > server->ops->negotiate_wsize(tcon, ctx))) { - cifs_sb->ctx->wsize = - round_down(server->ops->negotiate_wsize(tcon, ctx), PAGE_SIZE); - /* - * in the very unlikely event that the server sent a max write size under PAGE_SIZE, - * (which would get rounded down to 0) then reset wsize to absolute minimum eg 4096 - */ - if (cifs_sb->ctx->wsize == 0) { - cifs_sb->ctx->wsize = PAGE_SIZE; - cifs_dbg(VFS, "wsize too small, reset to minimum ie PAGE_SIZE, usually 4096\n"); - } - } - if ((cifs_sb->ctx->rsize == 0) || - (cifs_sb->ctx->rsize > server->ops->negotiate_rsize(tcon, ctx))) - cifs_sb->ctx->rsize = server->ops->negotiate_rsize(tcon, ctx); - + cifs_negotiate_iosize(server, cifs_sb->ctx, tcon); /* * The cookie is initialized from volume info returned above. * Inside cifs_fscache_get_super_cookie it checks @@ -3562,6 +3788,10 @@ static int mount_setup_tlink(struct cifs_sb_info *cifs_sb, struct cifs_ses *ses, tlink_rb_insert(&cifs_sb->tlink_tree, tlink); spin_unlock(&cifs_sb->tlink_tree_lock); + spin_lock(&tcon->sb_list_lock); + list_add(&cifs_sb->tcon_sb_link, &tcon->cifs_sb_list); + spin_unlock(&tcon->sb_list_lock); + queue_delayed_work(cifsiod_wq, &cifs_sb->prune_tlinks, TLINK_IDLE_EXPIRE); return 0; @@ -3903,9 +4133,19 @@ cifs_umount(struct cifs_sb_info *cifs_sb) struct rb_root *root = &cifs_sb->tlink_tree; struct rb_node *node; struct tcon_link *tlink; + struct cifs_tcon *tcon = NULL; cancel_delayed_work_sync(&cifs_sb->prune_tlinks); + if (cifs_sb->master_tlink) { + tcon = cifs_sb->master_tlink->tl_tcon; + if (tcon) { + spin_lock(&tcon->sb_list_lock); + list_del_init(&cifs_sb->tcon_sb_link); + spin_unlock(&tcon->sb_list_lock); + } + } + spin_lock(&cifs_sb->tlink_tree_lock); while ((node = rb_first(root))) { tlink = rb_entry(node, struct tcon_link, tl_rbnode); @@ -3927,11 +4167,13 @@ int cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses, struct TCP_Server_Info *server) { + bool in_retry = false; int rc = 0; if (!server->ops->need_neg || !server->ops->negotiate) return -ENOSYS; +retry: /* only send once per connect */ spin_lock(&server->srv_lock); if (server->tcpStatus != CifsGood && @@ -3951,6 +4193,14 @@ cifs_negotiate_protocol(const unsigned int xid, struct cifs_ses *ses, spin_unlock(&server->srv_lock); rc = server->ops->negotiate(xid, ses, server); + if (rc == -EAGAIN) { + /* Allow one retry attempt */ + if (!in_retry) { + in_retry = true; + goto retry; + } + rc = -EHOSTDOWN; + } if (rc == 0) { spin_lock(&server->srv_lock); if (server->tcpStatus == CifsInNegotiate) @@ -3973,7 +4223,7 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, struct TCP_Server_Info *server, struct nls_table *nls_info) { - int rc = -ENOSYS; + int rc = 0; struct TCP_Server_Info *pserver = 
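/*
 * The condition at the top of this hunk can be read as a predicate over
 * the tri-state rfc1001_sessinit mount setting (-1 = autodetect by port,
 * 0 = -o nonbsessinit, 1 = -o nbsessinit); sketch with a hypothetical
 * helper name:
 */
static bool want_rfc1001_sessinit(const struct TCP_Server_Info *server,
				  __be16 sport)
{
	if (server->with_rfc1001)		/* session was used before; keep it */
		return true;
	if (server->rfc1001_sessinit == 1)	/* explicitly requested */
		return true;
	/* autodetect: only on the NetBIOS session service port */
	return server->rfc1001_sessinit == -1 && sport == htons(RFC1001_PORT);
}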
SERVER_IS_CHAN(server) ? server->primary_server : server; struct sockaddr_in6 *addr6 = (struct sockaddr_in6 *)&pserver->dstaddr; struct sockaddr_in *addr = (struct sockaddr_in *)&pserver->dstaddr; @@ -4025,6 +4275,26 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, if (!linuxExtEnabled) ses->capabilities &= (~server->vals->cap_unix); + /* + * Check if the server supports the specified encoding mode. + * A zero value in vals->cap_unicode indicates that the chosen + * protocol dialect does not support non-UNICODE mode. + */ + if (ses->unicode == 1 && server->vals->cap_unicode != 0 && + !(server->capabilities & server->vals->cap_unicode)) { + cifs_dbg(VFS, "Server does not support mounting in UNICODE mode\n"); + rc = -EOPNOTSUPP; + } else if (ses->unicode == 0 && server->vals->cap_unicode == 0) { + cifs_dbg(VFS, "Server does not support mounting in non-UNICODE mode\n"); + rc = -EOPNOTSUPP; + } else if (ses->unicode == 0) { + /* + * When UNICODE mode was explicitly disabled then + * do not announce the client UNICODE capability. + */ + ses->capabilities &= (~server->vals->cap_unicode); + } + if (ses->auth_key.response) { cifs_dbg(FYI, "Free previous auth_key.response = %p\n", ses->auth_key.response); @@ -4037,8 +4307,12 @@ cifs_setup_session(const unsigned int xid, struct cifs_ses *ses, cifs_dbg(FYI, "Security Mode: 0x%x Capabilities: 0x%x TimeAdjust: %d\n", server->sec_mode, server->capabilities, server->timeAdj); - if (server->ops->sess_setup) - rc = server->ops->sess_setup(xid, ses, server, nls_info); + if (!rc) { + if (server->ops->sess_setup) + rc = server->ops->sess_setup(xid, ses, server, nls_info); + else + rc = -ENOSYS; + } if (rc) { cifs_server_dbg(VFS, "Send error in SessSetup = %d\n", rc); @@ -4108,6 +4382,7 @@ cifs_construct_tcon(struct cifs_sb_info *cifs_sb, kuid_t fsuid) ctx->seal = master_tcon->seal; ctx->witness = master_tcon->use_witness; ctx->dfs_root_ses = master_tcon->ses->dfs_root_ses; + ctx->unicode = master_tcon->ses->unicode; rc = cifs_set_vol_auth(ctx, master_tcon->ses); if (rc) { diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 8582cf61242c..950aa4f912f5 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -160,10 +160,10 @@ static int cifs_prepare_read(struct netfs_io_subrequest *subreq) server = cifs_pick_channel(tlink_tcon(req->cfile->tlink)->ses); rdata->server = server; - if (cifs_sb->ctx->rsize == 0) - cifs_sb->ctx->rsize = - server->ops->negotiate_rsize(tlink_tcon(req->cfile->tlink), - cifs_sb->ctx); + if (cifs_sb->ctx->rsize == 0) { + cifs_negotiate_rsize(server, cifs_sb->ctx, + tlink_tcon(req->cfile->tlink)); + } rc = server->ops->wait_mtu_credits(server, cifs_sb->ctx->rsize, &size, &rdata->credits); @@ -388,7 +388,7 @@ cifs_mark_open_files_invalid(struct cifs_tcon *tcon) spin_unlock(&tcon->tc_lock); /* - * BB Add call to invalidate_inodes(sb) for all superblocks mounted + * BB Add call to evict_inodes(sb) for all superblocks mounted * to this tcon.
*/ } @@ -1007,6 +1007,11 @@ int cifs_open(struct inode *inode, struct file *file) } else { _cifsFileInfo_put(cfile, true, false); } + } else { + /* hard link on the deferred close file */ + rc = cifs_get_hardlink_path(tcon, inode, file); + if (rc) + cifs_close_deferred_file(CIFS_I(inode)); } if (server->oplocks) @@ -2071,6 +2076,29 @@ cifs_move_llist(struct list_head *source, struct list_head *dest) list_move(li, dest); } +int +cifs_get_hardlink_path(struct cifs_tcon *tcon, struct inode *inode, + struct file *file) +{ + struct cifsFileInfo *open_file = NULL; + struct cifsInodeInfo *cinode = CIFS_I(inode); + int rc = 0; + + spin_lock(&tcon->open_file_lock); + spin_lock(&cinode->open_file_lock); + + list_for_each_entry(open_file, &cinode->openFileList, flist) { + if (file->f_flags == open_file->f_flags) { + rc = -EINVAL; + break; + } + } + + spin_unlock(&cinode->open_file_lock); + spin_unlock(&tcon->open_file_lock); + return rc; +} + void cifs_free_llist(struct list_head *llist) { @@ -3082,7 +3110,7 @@ void cifs_oplock_break(struct work_struct *work) cinode->oplock = 0; } - if (inode && S_ISREG(inode->i_mode)) { + if (S_ISREG(inode->i_mode)) { if (CIFS_CACHE_READ(cinode)) break_lease(inode, O_RDONLY); else diff --git a/fs/smb/client/fs_context.c b/fs/smb/client/fs_context.c index e9b286d9a7ba..a634a34d4086 100644 --- a/fs/smb/client/fs_context.c +++ b/fs/smb/client/fs_context.c @@ -134,6 +134,8 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_flag("compress", Opt_compress), fsparam_flag("witness", Opt_witness), fsparam_flag_no("nativesocket", Opt_nativesocket), + fsparam_flag_no("unicode", Opt_unicode), + fsparam_flag_no("nbsessinit", Opt_nbsessinit), /* Mount options which take uid or gid */ fsparam_uid("backupuid", Opt_backupuid), @@ -171,6 +173,7 @@ const struct fs_parameter_spec smb3_fs_parameters[] = { fsparam_string("username", Opt_user), fsparam_string("pass", Opt_pass), fsparam_string("password", Opt_pass), + fsparam_string("pass2", Opt_pass2), fsparam_string("password2", Opt_pass2), fsparam_string("ip", Opt_ip), fsparam_string("addr", Opt_ip), @@ -962,6 +965,14 @@ static int smb3_verify_reconfigure_ctx(struct fs_context *fc, cifs_errorf(fc, "can not change iocharset during remount\n"); return -EINVAL; } + if (new_ctx->unicode != old_ctx->unicode) { + cifs_errorf(fc, "can not change unicode during remount\n"); + return -EINVAL; + } + if (new_ctx->rfc1001_sessinit != old_ctx->rfc1001_sessinit) { + cifs_errorf(fc, "can not change nbsessinit during remount\n"); + return -EINVAL; + } return 0; } @@ -1010,6 +1021,7 @@ static int smb3_reconfigure(struct fs_context *fc) struct dentry *root = fc->root; struct cifs_sb_info *cifs_sb = CIFS_SB(root->d_sb); struct cifs_ses *ses = cifs_sb_master_tcon(cifs_sb)->ses; + unsigned int rsize = ctx->rsize, wsize = ctx->wsize; char *new_password = NULL, *new_password2 = NULL; bool need_recon = false; int rc; @@ -1092,11 +1104,8 @@ static int smb3_reconfigure(struct fs_context *fc) STEAL_STRING(cifs_sb, ctx, iocharset); /* if rsize or wsize not passed in on remount, use previous values */ - if (ctx->rsize == 0) - ctx->rsize = cifs_sb->ctx->rsize; - if (ctx->wsize == 0) - ctx->wsize = cifs_sb->ctx->wsize; - + ctx->rsize = rsize ? CIFS_ALIGN_RSIZE(fc, rsize) : cifs_sb->ctx->rsize; + ctx->wsize = wsize ? 
CIFS_ALIGN_WSIZE(fc, wsize) : cifs_sb->ctx->wsize; smb3_cleanup_fs_context_contents(cifs_sb->ctx); rc = smb3_fs_context_dup(cifs_sb->ctx, ctx); @@ -1117,6 +1126,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, int i, opt; bool is_smb3 = !strcmp(fc->fs_type->name, "smb3"); bool skip_parsing = false; + char *hostname; cifs_dbg(FYI, "CIFS: parsing cifs mount option '%s'\n", param->key); @@ -1131,6 +1141,9 @@ } else if (!strcmp("user", param->key) || !strcmp("username", param->key)) { skip_parsing = true; opt = Opt_user; + } else if (!strcmp("pass2", param->key) || !strcmp("password2", param->key)) { + skip_parsing = true; + opt = Opt_pass2; } } @@ -1297,7 +1310,7 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, __func__); goto cifs_parse_mount_err; } - ctx->bsize = result.uint_32; + ctx->bsize = CIFS_ALIGN_BSIZE(fc, result.uint_32); ctx->got_bsize = true; break; case Opt_rasize: @@ -1321,40 +1334,31 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->rasize = result.uint_32; break; case Opt_rsize: - ctx->rsize = result.uint_32; + ctx->rsize = CIFS_ALIGN_RSIZE(fc, result.uint_32); ctx->got_rsize = true; + ctx->vol_rsize = ctx->rsize; break; case Opt_wsize: - ctx->wsize = result.uint_32; + ctx->wsize = CIFS_ALIGN_WSIZE(fc, result.uint_32); ctx->got_wsize = true; - if (ctx->wsize % PAGE_SIZE != 0) { - ctx->wsize = round_down(ctx->wsize, PAGE_SIZE); - if (ctx->wsize == 0) { - ctx->wsize = PAGE_SIZE; - cifs_dbg(VFS, "wsize too small, reset to minimum %ld\n", PAGE_SIZE); - } else { - cifs_dbg(VFS, - "wsize rounded down to %d to multiple of PAGE_SIZE %ld\n", - ctx->wsize, PAGE_SIZE); - } - } + ctx->vol_wsize = ctx->wsize; break; case Opt_acregmax: - ctx->acregmax = HZ * result.uint_32; - if (ctx->acregmax > CIFS_MAX_ACTIMEO) { + if (result.uint_32 > CIFS_MAX_ACTIMEO / HZ) { cifs_errorf(fc, "acregmax too large\n"); goto cifs_parse_mount_err; } + ctx->acregmax = HZ * result.uint_32; break; case Opt_acdirmax: - ctx->acdirmax = HZ * result.uint_32; - if (ctx->acdirmax > CIFS_MAX_ACTIMEO) { + if (result.uint_32 > CIFS_MAX_ACTIMEO / HZ) { cifs_errorf(fc, "acdirmax too large\n"); goto cifs_parse_mount_err; } + ctx->acdirmax = HZ * result.uint_32; break; case Opt_actimeo: - if (HZ * result.uint_32 > CIFS_MAX_ACTIMEO) { + if (result.uint_32 > CIFS_MAX_ACTIMEO / HZ) { cifs_errorf(fc, "timeout too large\n"); goto cifs_parse_mount_err; } @@ -1366,13 +1370,18 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->acdirmax = ctx->acregmax = HZ * result.uint_32; break; case Opt_closetimeo: - ctx->closetimeo = HZ * result.uint_32; - if (ctx->closetimeo > SMB3_MAX_DCLOSETIMEO) { + if (result.uint_32 > SMB3_MAX_DCLOSETIMEO / HZ) { cifs_errorf(fc, "closetimeo too large\n"); goto cifs_parse_mount_err; } + ctx->closetimeo = HZ * result.uint_32; break; case Opt_echo_interval: + if (result.uint_32 < SMB_ECHO_INTERVAL_MIN || + result.uint_32 > SMB_ECHO_INTERVAL_MAX) { + cifs_errorf(fc, "echo interval is out of bounds\n"); + goto cifs_parse_mount_err; + } ctx->echo_interval = result.uint_32; break; case Opt_snapshot: @@ -1439,6 +1448,16 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, cifs_errorf(fc, "OOM when copying UNC string\n"); goto cifs_parse_mount_err; } + hostname = extract_hostname(ctx->UNC); + if (IS_ERR(hostname)) { + cifs_errorf(fc, "Cannot extract hostname from UNC string\n"); + goto cifs_parse_mount_err; + } + /* last byte, type, is 0x20 for server type */ + 
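/*
 * The reordered bounds checks above avoid an arithmetic overflow: the old
 * code computed "HZ * result.uint_32" first, which can wrap for large
 * 32-bit inputs and then pass the comparison; dividing the limit instead
 * is always safe. Generic sketch of the pattern (hypothetical helper):
 */
static int checked_secs_to_jiffies(u32 secs, unsigned long max_jiffies,
				   unsigned long *out)
{
	if (secs > max_jiffies / HZ)
		return -ERANGE;		/* would overflow or exceed the cap */
	*out = (unsigned long)secs * HZ;
	return 0;
}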
memset(ctx->target_rfc1001_name, 0x20, RFC1001_NAME_LEN_WITH_NULL); + for (i = 0; i < RFC1001_NAME_LEN && hostname[i] != 0; i++) + ctx->target_rfc1001_name[i] = toupper(hostname[i]); + kfree(hostname); break; case Opt_user: kfree(ctx->username); @@ -1582,6 +1601,10 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, if (i == RFC1001_NAME_LEN && param->string[i] != 0) pr_warn("server netbiosname longer than 15 truncated\n"); break; + case Opt_nbsessinit: + ctx->rfc1001_sessinit = !result.negated; + cifs_dbg(FYI, "rfc1001_sessinit set to %d\n", ctx->rfc1001_sessinit); + break; case Opt_ver: /* version of mount userspace tools, not dialect */ /* If interface changes in mount.cifs bump to new ver */ @@ -1623,6 +1646,10 @@ static int smb3_fs_context_parse_param(struct fs_context *fc, ctx->witness = true; pr_warn_once("Witness protocol support is experimental\n"); break; + case Opt_unicode: + ctx->unicode = !result.negated; + cifs_dbg(FYI, "unicode set to %d\n", ctx->unicode); + break; case Opt_rootfs: #ifndef CONFIG_CIFS_ROOT cifs_dbg(VFS, "rootfs support requires CONFIG_CIFS_ROOT config option\n"); @@ -1865,13 +1892,16 @@ int smb3_init_fs_context(struct fs_context *fc) memset(ctx->source_rfc1001_name, 0x20, RFC1001_NAME_LEN); for (i = 0; i < strnlen(nodename, RFC1001_NAME_LEN); i++) ctx->source_rfc1001_name[i] = toupper(nodename[i]); - ctx->source_rfc1001_name[RFC1001_NAME_LEN] = 0; + /* * null target name indicates to use *SMBSERVR default called name * if we end up sending RFC1001 session initialize */ ctx->target_rfc1001_name[0] = 0; + + ctx->rfc1001_sessinit = -1; /* autodetect based on port number */ + ctx->cred_uid = current_uid(); ctx->linux_uid = current_uid(); ctx->linux_gid = current_gid(); @@ -1924,6 +1954,8 @@ int smb3_init_fs_context(struct fs_context *fc) ctx->symlink_type = CIFS_SYMLINK_TYPE_DEFAULT; ctx->nonativesocket = 0; + ctx->unicode = -1; /* autodetect, but prefer UNICODE mode */ + /* * short int override_uid = -1; * short int override_gid = -1; diff --git a/fs/smb/client/fs_context.h b/fs/smb/client/fs_context.h index 881bfc08667e..9e83302ce4b8 100644 --- a/fs/smb/client/fs_context.h +++ b/fs/smb/client/fs_context.h @@ -20,6 +20,21 @@ cifs_dbg(VFS, fmt, ## __VA_ARGS__); \ } while (0) +static inline size_t cifs_io_align(struct fs_context *fc, + const char *name, size_t size) +{ + if (!size || !IS_ALIGNED(size, PAGE_SIZE)) { + cifs_errorf(fc, "unaligned %s, making it a multiple of %lu bytes\n", + name, PAGE_SIZE); + size = umax(round_down(size, PAGE_SIZE), PAGE_SIZE); + } + return size; +} + +#define CIFS_ALIGN_WSIZE(_fc, _size) cifs_io_align(_fc, "wsize", _size) +#define CIFS_ALIGN_RSIZE(_fc, _size) cifs_io_align(_fc, "rsize", _size) +#define CIFS_ALIGN_BSIZE(_fc, _size) cifs_io_align(_fc, "bsize", _size) + enum smb_version { Smb_1 = 1, Smb_20, @@ -135,6 +150,7 @@ enum cifs_param { Opt_witness, Opt_is_upcall_target_mount, Opt_is_upcall_target_application, + Opt_unicode, /* Mount options which take numeric value */ Opt_backupuid, @@ -173,6 +189,7 @@ enum cifs_param { Opt_iocharset, Opt_netbiosname, Opt_servern, + Opt_nbsessinit, Opt_ver, Opt_vers, Opt_sec, @@ -215,6 +232,7 @@ struct smb3_fs_context { char *iocharset; /* local code page for mapping to and from Unicode */ char source_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* clnt nb name */ char target_rfc1001_name[RFC1001_NAME_LEN_WITH_NULL]; /* srvr nb name */ + int rfc1001_sessinit; kuid_t cred_uid; kuid_t linux_uid; kgid_t linux_gid; @@ -279,6 +297,9 @@ struct smb3_fs_context { bool use_client_guid:1; /* 
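/*
 * Sketch of the NetBIOS name encoding used when deriving the target name
 * from the UNC hostname above: the name is space-padded (0x20) to 15
 * upper-case characters, and the 16th byte is the NetBIOS suffix, which
 * stays 0x20 for the server service. Hypothetical helper mirroring the
 * open-coded loop:
 */
static void encode_netbios_server_name(char dst[RFC1001_NAME_LEN_WITH_NULL],
				       const char *hostname)
{
	int i;

	/* pad with spaces; the final byte (the type) also remains 0x20 */
	memset(dst, 0x20, RFC1001_NAME_LEN_WITH_NULL);
	for (i = 0; i < RFC1001_NAME_LEN && hostname[i]; i++)
		dst[i] = toupper(hostname[i]);
}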
reuse existing guid for multichannel */ u8 client_guid[SMB2_CLIENT_GUID_SIZE]; + /* User-specified original r/wsize value */ + unsigned int vol_rsize; + unsigned int vol_wsize; unsigned int bsize; unsigned int rasize; unsigned int rsize; @@ -306,6 +327,7 @@ struct smb3_fs_context { bool compress; /* enable SMB2 messages (READ/WRITE) de/compression */ bool rootfs:1; /* if it's a SMB root file system */ bool witness:1; /* use witness protocol */ + int unicode; char *leaf_fullpath; struct cifs_ses *dfs_root_ses; bool dfs_automount:1; /* set for dfs automount only */ @@ -354,4 +376,36 @@ static inline void cifs_mount_unlock(void) mutex_unlock(&cifs_mount_mutex); } +static inline void cifs_negotiate_rsize(struct TCP_Server_Info *server, + struct smb3_fs_context *ctx, + struct cifs_tcon *tcon) +{ + unsigned int size; + + size = umax(server->ops->negotiate_rsize(tcon, ctx), PAGE_SIZE); + if (ctx->rsize) + size = umax(umin(ctx->rsize, size), PAGE_SIZE); + ctx->rsize = round_down(size, PAGE_SIZE); +} + +static inline void cifs_negotiate_wsize(struct TCP_Server_Info *server, + struct smb3_fs_context *ctx, + struct cifs_tcon *tcon) +{ + unsigned int size; + + size = umax(server->ops->negotiate_wsize(tcon, ctx), PAGE_SIZE); + if (ctx->wsize) + size = umax(umin(ctx->wsize, size), PAGE_SIZE); + ctx->wsize = round_down(size, PAGE_SIZE); +} + +static inline void cifs_negotiate_iosize(struct TCP_Server_Info *server, + struct smb3_fs_context *ctx, + struct cifs_tcon *tcon) +{ + cifs_negotiate_rsize(server, ctx, tcon); + cifs_negotiate_wsize(server, ctx, tcon); +} + #endif diff --git a/fs/smb/client/inode.c b/fs/smb/client/inode.c index 616149c7f0a5..75be4b46bc6f 100644 --- a/fs/smb/client/inode.c +++ b/fs/smb/client/inode.c @@ -1203,18 +1203,17 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, goto out; } break; - case IO_REPARSE_TAG_MOUNT_POINT: - cifs_create_junction_fattr(fattr, sb); - rc = 0; - goto out; default: /* Check for cached reparse point data */ if (data->symlink_target || data->reparse.buf) { rc = 0; - } else if (iov && server->ops->parse_reparse_point) { - rc = server->ops->parse_reparse_point(cifs_sb, - full_path, - iov, data); + } else if (iov && server->ops->get_reparse_point_buffer) { + struct reparse_data_buffer *reparse_buf; + u32 reparse_len; + + reparse_buf = server->ops->get_reparse_point_buffer(iov, &reparse_len); + rc = parse_reparse_point(reparse_buf, reparse_len, + cifs_sb, full_path, data); /* * If the reparse point was not handled but it is the * name surrogate which points to directory, then treat @@ -1228,6 +1227,16 @@ static int reparse_info_to_fattr(struct cifs_open_info_data *data, cifs_create_junction_fattr(fattr, sb); goto out; } + /* + * If the reparse point is unsupported by the Linux SMB + * client then let the SMB server process it. So mask + * the -EOPNOTSUPP error code. This will allow the Linux + * SMB client to send the SMB OPEN request to the server. + * If the server does not support this reparse point + * either, then it will return an error when opening the + * path. 
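/*
 * Worked example of the clamp performed by the cifs_negotiate_rsize()/
 * cifs_negotiate_wsize() helpers above (values hypothetical): with
 * PAGE_SIZE = 4096, a server-negotiated size of 65536 and a user-requested
 * size of 70000, the result is round_down(umax(umin(70000, 65536), 4096),
 * 4096) = 65536. A user value below PAGE_SIZE is first raised to
 * PAGE_SIZE, so the rounded result can never collapse to zero.
 */
static unsigned int clamp_io_size(unsigned int requested,
				  unsigned int negotiated)
{
	unsigned int size = umax(negotiated, PAGE_SIZE);

	if (requested)
		size = umax(umin(requested, size), PAGE_SIZE);
	return round_down(size, PAGE_SIZE);
}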
+ */ + if (rc == -EOPNOTSUPP) + rc = 0; } if (data->reparse.tag == IO_REPARSE_TAG_SYMLINK && !rc) { @@ -2207,8 +2216,8 @@ posix_mkdir_get_info: } #endif /* CONFIG_CIFS_ALLOW_INSECURE_LEGACY */ -int cifs_mkdir(struct mnt_idmap *idmap, struct inode *inode, - struct dentry *direntry, umode_t mode) +struct dentry *cifs_mkdir(struct mnt_idmap *idmap, struct inode *inode, + struct dentry *direntry, umode_t mode) { int rc = 0; unsigned int xid; @@ -2224,10 +2233,10 @@ int cifs_mkdir(struct mnt_idmap *idmap, struct inode *inode, cifs_sb = CIFS_SB(inode->i_sb); if (unlikely(cifs_forced_shutdown(cifs_sb))) - return -EIO; + return ERR_PTR(-EIO); tlink = cifs_sb_tlink(cifs_sb); if (IS_ERR(tlink)) - return PTR_ERR(tlink); + return ERR_CAST(tlink); tcon = tlink_tcon(tlink); xid = get_xid(); @@ -2283,7 +2292,7 @@ mkdir_out: free_dentry_path(page); free_xid(xid); cifs_put_tlink(tlink); - return rc; + return ERR_PTR(rc); } int cifs_rmdir(struct inode *inode, struct dentry *direntry) @@ -2901,23 +2910,6 @@ int cifs_fiemap(struct inode *inode, struct fiemap_extent_info *fei, u64 start, return -EOPNOTSUPP; } -int cifs_truncate_page(struct address_space *mapping, loff_t from) -{ - pgoff_t index = from >> PAGE_SHIFT; - unsigned offset = from & (PAGE_SIZE - 1); - struct page *page; - int rc = 0; - - page = grab_cache_page(mapping, index); - if (!page) - return -ENOMEM; - - zero_user_segment(page, offset, PAGE_SIZE); - unlock_page(page); - put_page(page); - return rc; -} - void cifs_setsize(struct inode *inode, loff_t offset) { struct cifsInodeInfo *cifs_i = CIFS_I(inode); @@ -3012,8 +3004,6 @@ set_size_out: */ attrs->ia_ctime = attrs->ia_mtime = current_time(inode); attrs->ia_valid |= ATTR_CTIME | ATTR_MTIME; - - cifs_truncate_page(inode->i_mapping, inode->i_size); } return rc; diff --git a/fs/smb/client/link.c b/fs/smb/client/link.c index 6e6c09cc5ce7..769752ad2c5c 100644 --- a/fs/smb/client/link.c +++ b/fs/smb/client/link.c @@ -258,7 +258,7 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, struct cifs_open_parms oparms; struct cifs_io_parms io_parms = {0}; int buf_type = CIFS_NO_BUFFER; - FILE_ALL_INFO file_info; + struct cifs_open_info_data query_data; oparms = (struct cifs_open_parms) { .tcon = tcon, @@ -270,11 +270,11 @@ cifs_query_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, .fid = &fid, }; - rc = CIFS_open(xid, &oparms, &oplock, &file_info); + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, &query_data); if (rc) return rc; - if (file_info.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { + if (query_data.fi.EndOfFile != cpu_to_le64(CIFS_MF_SYMLINK_FILE_SIZE)) { rc = -ENOENT; /* it's not a symlink */ goto out; @@ -313,7 +313,7 @@ cifs_create_mf_symlink(unsigned int xid, struct cifs_tcon *tcon, .fid = &fid, }; - rc = CIFS_open(xid, &oparms, &oplock, NULL); + rc = tcon->ses->server->ops->open(xid, &oparms, &oplock, NULL); if (rc) return rc; @@ -643,7 +643,8 @@ cifs_symlink(struct mnt_idmap *idmap, struct inode *inode, case CIFS_SYMLINK_TYPE_NATIVE: case CIFS_SYMLINK_TYPE_NFS: case CIFS_SYMLINK_TYPE_WSL: - if (server->ops->create_reparse_symlink) { + if (server->ops->create_reparse_symlink && + (le32_to_cpu(pTcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS)) { rc = server->ops->create_reparse_symlink(xid, inode, direntry, pTcon, diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index b328dc5c7988..7b6ed9b23e71 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -137,8 +137,10 @@ tcon_info_alloc(bool dir_leases_enabled, enum 
smb3_tcon_ref_trace trace) spin_lock_init(&ret_buf->tc_lock); INIT_LIST_HEAD(&ret_buf->openFileList); INIT_LIST_HEAD(&ret_buf->tcon_list); + INIT_LIST_HEAD(&ret_buf->cifs_sb_list); spin_lock_init(&ret_buf->open_file_lock); spin_lock_init(&ret_buf->stat_lock); + spin_lock_init(&ret_buf->sb_list_lock); atomic_set(&ret_buf->num_local_opens, 0); atomic_set(&ret_buf->num_remote_opens, 0); ret_buf->stats_from_time = ktime_get_real_seconds(); diff --git a/fs/smb/client/readdir.c b/fs/smb/client/readdir.c index 50f96259d9ad..f9f11cbf89be 100644 --- a/fs/smb/client/readdir.c +++ b/fs/smb/client/readdir.c @@ -9,6 +9,7 @@ * */ #include <linux/fs.h> +#include <linux/namei.h> #include <linux/pagemap.h> #include <linux/slab.h> #include <linux/stat.h> @@ -78,7 +79,7 @@ cifs_prime_dcache(struct dentry *parent, struct qstr *name, cifs_dbg(FYI, "%s: for %s\n", __func__, name->name); - dentry = d_hash_and_lookup(parent, name); + dentry = try_lookup_noperm(name, parent); if (!dentry) { /* * If we know that the inode will need to be revalidated @@ -733,7 +734,10 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, else cifs_buf_release(cfile->srch_inf. ntwrk_buf_start); + /* Reset all pointers to the network buffer to prevent stale references */ cfile->srch_inf.ntwrk_buf_start = NULL; + cfile->srch_inf.srch_entries_start = NULL; + cfile->srch_inf.last_entry = NULL; } rc = initiate_cifs_search(xid, file, full_path); if (rc) { @@ -756,11 +760,11 @@ find_cifs_entry(const unsigned int xid, struct cifs_tcon *tcon, loff_t pos, rc = server->ops->query_dir_next(xid, tcon, &cfile->fid, search_flags, &cfile->srch_inf); + if (rc) + return -ENOENT; /* FindFirst/Next set last_entry to NULL on malformed reply */ if (cfile->srch_inf.last_entry) cifs_save_resume_key(cfile->srch_inf.last_entry, cfile); - if (rc) - return -ENOENT; } if (index_to_find < cfile->srch_inf.index_of_last_entry) { /* we found the buffer that contains the entry */ diff --git a/fs/smb/client/reparse.c b/fs/smb/client/reparse.c index 2b9e9885dc42..bb25e77c5540 100644 --- a/fs/smb/client/reparse.c +++ b/fs/smb/client/reparse.c @@ -542,12 +542,12 @@ static int wsl_set_reparse_buf(struct reparse_data_buffer **buf, kfree(symname_utf16); return -ENOMEM; } - /* Flag 0x02000000 is unknown, but all wsl symlinks have this value */ - symlink_buf->Flags = cpu_to_le32(0x02000000); - /* PathBuffer is in UTF-8 but without trailing null-term byte */ + /* Version field must be set to 2 (MS-FSCC 2.1.2.7) */ + symlink_buf->Version = cpu_to_le32(2); + /* Target for Version 2 is in UTF-8 but without trailing null-term byte */ symname_utf8_len = utf16s_to_utf8s((wchar_t *)symname_utf16, symname_utf16_len/2, UTF16_LITTLE_ENDIAN, - symlink_buf->PathBuffer, + symlink_buf->Target, symname_utf8_maxlen); *buf = (struct reparse_data_buffer *)symlink_buf; buf_len = sizeof(struct reparse_wsl_symlink_data_buffer) + symname_utf8_len; @@ -1016,29 +1016,36 @@ static int parse_reparse_wsl_symlink(struct reparse_wsl_symlink_data_buffer *buf struct cifs_open_info_data *data) { int len = le16_to_cpu(buf->ReparseDataLength); + int data_offset = offsetof(typeof(*buf), Target) - offsetof(typeof(*buf), Version); int symname_utf8_len; __le16 *symname_utf16; int symname_utf16_len; - if (len <= sizeof(buf->Flags)) { + if (len <= data_offset) { cifs_dbg(VFS, "srv returned malformed wsl symlink buffer\n"); return -EIO; } - /* PathBuffer is in UTF-8 but without trailing null-term byte */ - symname_utf8_len = len - sizeof(buf->Flags); + /* MS-FSCC 2.1.2.7 defines layout of the 
Target field only for Version 2. */ + if (le32_to_cpu(buf->Version) != 2) { + cifs_dbg(VFS, "srv returned unsupported wsl symlink version %u\n", le32_to_cpu(buf->Version)); + return -EIO; + } + + /* Target for Version 2 is in UTF-8 but without trailing null-term byte */ + symname_utf8_len = len - data_offset; /* * Check that buffer does not contain null byte * because Linux cannot process symlink with null byte. */ - if (strnlen(buf->PathBuffer, symname_utf8_len) != symname_utf8_len) { + if (strnlen(buf->Target, symname_utf8_len) != symname_utf8_len) { cifs_dbg(VFS, "srv returned null byte in wsl symlink target location\n"); return -EIO; } symname_utf16 = kzalloc(symname_utf8_len * 2, GFP_KERNEL); if (!symname_utf16) return -ENOMEM; - symname_utf16_len = utf8s_to_utf16s(buf->PathBuffer, symname_utf8_len, + symname_utf16_len = utf8s_to_utf16s(buf->Target, symname_utf8_len, UTF16_LITTLE_ENDIAN, (wchar_t *) symname_utf16, symname_utf8_len * 2); if (symname_utf16_len < 0) { @@ -1062,8 +1069,6 @@ int parse_reparse_point(struct reparse_data_buffer *buf, const char *full_path, struct cifs_open_info_data *data) { - struct cifs_tcon *tcon = cifs_sb_master_tcon(cifs_sb); - data->reparse.buf = buf; /* See MS-FSCC 2.1.2 */ @@ -1090,24 +1095,17 @@ int parse_reparse_point(struct reparse_data_buffer *buf, } return 0; default: - cifs_tcon_dbg(VFS | ONCE, "unhandled reparse tag: 0x%08x\n", - le32_to_cpu(buf->ReparseTag)); return -EOPNOTSUPP; } } -int smb2_parse_reparse_point(struct cifs_sb_info *cifs_sb, - const char *full_path, - struct kvec *rsp_iov, - struct cifs_open_info_data *data) +struct reparse_data_buffer *smb2_get_reparse_point_buffer(const struct kvec *rsp_iov, + u32 *plen) { - struct reparse_data_buffer *buf; struct smb2_ioctl_rsp *io = rsp_iov->iov_base; - u32 plen = le32_to_cpu(io->OutputCount); - - buf = (struct reparse_data_buffer *)((u8 *)io + - le32_to_cpu(io->OutputOffset)); - return parse_reparse_point(buf, plen, cifs_sb, full_path, data); + *plen = le32_to_cpu(io->OutputCount); + return (struct reparse_data_buffer *)((u8 *)io + + le32_to_cpu(io->OutputOffset)); } static bool wsl_to_fattr(struct cifs_open_info_data *data, @@ -1233,16 +1231,6 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, bool ok; switch (tag) { - case IO_REPARSE_TAG_INTERNAL: - if (!(fattr->cf_cifsattrs & ATTR_DIRECTORY)) - return false; - fallthrough; - case IO_REPARSE_TAG_DFS: - case IO_REPARSE_TAG_DFSR: - case IO_REPARSE_TAG_MOUNT_POINT: - /* See cifs_create_junction_fattr() */ - fattr->cf_mode = S_IFDIR | 0711; - break; case IO_REPARSE_TAG_LX_SYMLINK: case IO_REPARSE_TAG_LX_FIFO: case IO_REPARSE_TAG_AF_UNIX: @@ -1262,7 +1250,14 @@ bool cifs_reparse_point_to_fattr(struct cifs_sb_info *cifs_sb, fattr->cf_mode |= S_IFLNK; break; default: - return false; + if (!(fattr->cf_cifsattrs & ATTR_DIRECTORY)) + return false; + if (!IS_REPARSE_TAG_NAME_SURROGATE(tag) && + tag != IO_REPARSE_TAG_INTERNAL) + return false; + /* See cifs_create_junction_fattr() */ + fattr->cf_mode = S_IFDIR | 0711; + break; } fattr->cf_dtype = S_DT(fattr->cf_mode); diff --git a/fs/smb/client/reparse.h b/fs/smb/client/reparse.h index c0be5ab45a78..08de853b36a8 100644 --- a/fs/smb/client/reparse.h +++ b/fs/smb/client/reparse.h @@ -135,9 +135,6 @@ int smb2_create_reparse_symlink(const unsigned int xid, struct inode *inode, int smb2_mknod_reparse(unsigned int xid, struct inode *inode, struct dentry *dentry, struct cifs_tcon *tcon, const char *full_path, umode_t mode, dev_t dev); -int smb2_parse_reparse_point(struct cifs_sb_info 
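/*
 * Sketch of the MS-FSCC 2.1.2.7 length check used by the WSL symlink
 * parser above: ReparseDataLength counts the bytes starting at the Version
 * field, so the UTF-8 target length is whatever remains after the fixed
 * part of the header. Field names follow the patch; the helper itself is
 * hypothetical.
 */
static int wsl_symlink_target_len(const struct reparse_wsl_symlink_data_buffer *buf)
{
	int len = le16_to_cpu(buf->ReparseDataLength);
	int data_offset = offsetof(typeof(*buf), Target) -
			  offsetof(typeof(*buf), Version);

	if (len <= data_offset)
		return -EIO;		/* malformed: no room for a target */
	if (le32_to_cpu(buf->Version) != 2)
		return -EIO;		/* only Version 2 has a defined layout */
	return len - data_offset;	/* UTF-8 bytes, no trailing NUL */
}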
*cifs_sb, - const char *full_path, - struct kvec *rsp_iov, - struct cifs_open_info_data *data); +struct reparse_data_buffer *smb2_get_reparse_point_buffer(const struct kvec *rsp_iov, u32 *len); #endif /* _CIFS_REPARSE_H */ diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index faa80e7d54a6..b3fa9ee26912 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -242,7 +242,7 @@ int cifs_try_adding_channels(struct cifs_ses *ses) iface->num_channels++; iface->weight_fulfilled++; - cifs_dbg(VFS, "successfully opened new channel on iface:%pIS\n", + cifs_info("successfully opened new channel on iface:%pIS\n", &iface->sockaddr); break; } @@ -501,6 +501,7 @@ cifs_ses_add_channel(struct cifs_ses *ses, ctx->password = ses->password; ctx->sectype = ses->sectype; ctx->sign = ses->sign; + ctx->unicode = ses->unicode; /* UNC and paths */ /* XXX: Use ses->server->hostname? */ @@ -522,6 +523,13 @@ cifs_ses_add_channel(struct cifs_ses *ses, ctx->sockopt_tcp_nodelay = ses->server->tcp_nodelay; ctx->echo_interval = ses->server->echo_interval / HZ; ctx->max_credits = ses->server->max_credits; + ctx->min_offload = ses->server->min_offload; + ctx->compress = ses->server->compression.requested; + ctx->dfs_conn = ses->server->dfs_conn; + ctx->ignore_signature = ses->server->ignore_signature; + ctx->leaf_fullpath = ses->server->leaf_fullpath; + ctx->rootfs = ses->server->noblockcnt; + ctx->retrans = ses->server->retrans; /* * This will be used for encoding/decoding user/domain/pw @@ -672,6 +680,22 @@ unicode_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp) *pbcc_area = bcc_ptr; } +static void +ascii_oslm_strings(char **pbcc_area, const struct nls_table *nls_cp) +{ + char *bcc_ptr = *pbcc_area; + + strcpy(bcc_ptr, "Linux version "); + bcc_ptr += strlen("Linux version "); + strcpy(bcc_ptr, init_utsname()->release); + bcc_ptr += strlen(init_utsname()->release) + 1; + + strcpy(bcc_ptr, CIFS_NETWORK_OPSYS); + bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; + + *pbcc_area = bcc_ptr; +} + static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses, const struct nls_table *nls_cp) { @@ -696,6 +720,25 @@ static void unicode_domain_string(char **pbcc_area, struct cifs_ses *ses, *pbcc_area = bcc_ptr; } +static void ascii_domain_string(char **pbcc_area, struct cifs_ses *ses, + const struct nls_table *nls_cp) +{ + char *bcc_ptr = *pbcc_area; + int len; + + /* copy domain */ + if (ses->domainName != NULL) { + len = strscpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN); + if (WARN_ON_ONCE(len < 0)) + len = CIFS_MAX_DOMAINNAME_LEN - 1; + bcc_ptr += len; + } /* else we send a null domain name so server will default to its own domain */ + *bcc_ptr = 0; + bcc_ptr++; + + *pbcc_area = bcc_ptr; +} + static void unicode_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, const struct nls_table *nls_cp) { @@ -741,25 +784,10 @@ static void ascii_ssetup_strings(char **pbcc_area, struct cifs_ses *ses, *bcc_ptr = 0; bcc_ptr++; /* account for null termination */ - /* copy domain */ - if (ses->domainName != NULL) { - len = strscpy(bcc_ptr, ses->domainName, CIFS_MAX_DOMAINNAME_LEN); - if (WARN_ON_ONCE(len < 0)) - len = CIFS_MAX_DOMAINNAME_LEN - 1; - bcc_ptr += len; - } /* else we send a null domain name so server will default to its own domain */ - *bcc_ptr = 0; - bcc_ptr++; - /* BB check for overflow here */ - strcpy(bcc_ptr, "Linux version "); - bcc_ptr += strlen("Linux version "); - strcpy(bcc_ptr, init_utsname()->release); - bcc_ptr += strlen(init_utsname()->release) + 1; - - strcpy(bcc_ptr, 
CIFS_NETWORK_OPSYS); - bcc_ptr += strlen(CIFS_NETWORK_OPSYS) + 1; + ascii_domain_string(&bcc_ptr, ses, nls_cp); + ascii_oslm_strings(&bcc_ptr, nls_cp); *pbcc_area = bcc_ptr; } @@ -1562,7 +1590,7 @@ sess_auth_kerberos(struct sess_data *sess_data) sess_data->iov[1].iov_len = msg->secblob_len; pSMB->req.SecurityBlobLength = cpu_to_le16(sess_data->iov[1].iov_len); - if (ses->capabilities & CAP_UNICODE) { + if (pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) { /* unicode strings must be word aligned */ if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) { *bcc_ptr = 0; @@ -1571,8 +1599,8 @@ sess_auth_kerberos(struct sess_data *sess_data) unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp); unicode_domain_string(&bcc_ptr, ses, sess_data->nls_cp); } else { - /* BB: is this right? */ - ascii_ssetup_strings(&bcc_ptr, ses, sess_data->nls_cp); + ascii_oslm_strings(&bcc_ptr, sess_data->nls_cp); + ascii_domain_string(&bcc_ptr, ses, sess_data->nls_cp); } sess_data->iov[2].iov_len = (long) bcc_ptr - diff --git a/fs/smb/client/smb1ops.c b/fs/smb/client/smb1ops.c index d6e2fb669c40..b27a182629ec 100644 --- a/fs/smb/client/smb1ops.c +++ b/fs/smb/client/smb1ops.c @@ -14,6 +14,8 @@ #include "cifspdu.h" #include "cifs_unicode.h" #include "fs_context.h" +#include "nterr.h" +#include "smberr.h" /* * An NT cancel request header looks just like the original request except: @@ -426,26 +428,19 @@ cifs_negotiate(const unsigned int xid, { int rc; rc = CIFSSMBNegotiate(xid, ses, server); - if (rc == -EAGAIN) { - /* retry only once on 1st time connection */ - set_credits(server, 1); - rc = CIFSSMBNegotiate(xid, ses, server); - if (rc == -EAGAIN) - rc = -EHOSTDOWN; - } return rc; } static unsigned int -cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) +smb1_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) { __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); struct TCP_Server_Info *server = tcon->ses->server; unsigned int wsize; /* start with specified wsize, or default */ - if (ctx->wsize) - wsize = ctx->wsize; + if (ctx->got_wsize) + wsize = ctx->vol_wsize; else if (tcon->unix_ext && (unix_cap & CIFS_UNIX_LARGE_WRITE_CAP)) wsize = CIFS_DEFAULT_IOSIZE; else @@ -472,7 +467,7 @@ cifs_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) } static unsigned int -cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) +smb1_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) { __u64 unix_cap = le64_to_cpu(tcon->fsUnixInfo.Capability); struct TCP_Server_Info *server = tcon->ses->server; @@ -497,7 +492,7 @@ cifs_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) else defsize = server->maxBuf - sizeof(READ_RSP); - rsize = ctx->rsize ? ctx->rsize : defsize; + rsize = ctx->got_rsize ? ctx->vol_rsize : defsize; /* * no CAP_LARGE_READ_X? 
Then MS-CIFS states that we must limit this to @@ -548,24 +543,104 @@ static int cifs_query_path_info(const unsigned int xid, const char *full_path, struct cifs_open_info_data *data) { - int rc; + int rc = -EOPNOTSUPP; FILE_ALL_INFO fi = {}; + struct cifs_search_info search_info = {}; + bool non_unicode_wildcard = false; data->reparse_point = false; data->adjust_tz = false; - /* could do find first instead but this returns more info */ - rc = CIFSSMBQPathInfo(xid, tcon, full_path, &fi, 0 /* not legacy */, cifs_sb->local_nls, - cifs_remap(cifs_sb)); /* - * BB optimize code so we do not make the above call when server claims - * no NT SMB support and the above call failed at least once - set flag - * in tcon or mount. + * First try the CIFSSMBQPathInfo() function, which returns more info + * (NumberOfLinks) than the CIFSFindFirst() fallback function. + * Some servers like Win9x do not support SMB_QUERY_FILE_ALL_INFO over + * TRANS2_QUERY_PATH_INFORMATION, but support it with a filehandle over + * TRANS2_QUERY_FILE_INFORMATION (function CIFSSMBQFileInfo()). But the + * SMB Open command on non-NT servers works only for files; it does not + * work for directories. Moreover, the Win9x SMB server returns bogus + * data in the SMB_QUERY_FILE_ALL_INFO Attributes field. So for non-NT + * servers, do not even use the CIFSSMBQPathInfo() or CIFSSMBQFileInfo() + * functions. + */ + if (tcon->ses->capabilities & CAP_NT_SMBS) + rc = CIFSSMBQPathInfo(xid, tcon, full_path, &fi, 0 /* not legacy */, + cifs_sb->local_nls, cifs_remap(cifs_sb)); + + /* + * The non-UNICODE variant of the fallback functions below expands + * wildcards, so they cannot be used for querying paths with wildcard + * characters. + */ + if (rc && !(tcon->ses->capabilities & CAP_UNICODE) && strpbrk(full_path, "*?\"><")) + non_unicode_wildcard = true; + + /* + * Then fall back to CIFSFindFirst(), which also works with non-NT + * servers but does not provide NumberOfLinks. 
*/ - if ((rc == -EOPNOTSUPP) || (rc == -EINVAL)) { + if ((rc == -EOPNOTSUPP || rc == -EINVAL) && + !non_unicode_wildcard) { + if (!(tcon->ses->capabilities & tcon->ses->server->vals->cap_nt_find)) + search_info.info_level = SMB_FIND_FILE_INFO_STANDARD; + else + search_info.info_level = SMB_FIND_FILE_FULL_DIRECTORY_INFO; + rc = CIFSFindFirst(xid, tcon, full_path, cifs_sb, NULL, + CIFS_SEARCH_CLOSE_ALWAYS | CIFS_SEARCH_CLOSE_AT_END, + &search_info, false); + if (rc == 0) { + if (!(tcon->ses->capabilities & tcon->ses->server->vals->cap_nt_find)) { + FIND_FILE_STANDARD_INFO *di; + int offset = tcon->ses->server->timeAdj; + + di = (FIND_FILE_STANDARD_INFO *)search_info.srch_entries_start; + fi.CreationTime = cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm( + di->CreationDate, di->CreationTime, offset))); + fi.LastAccessTime = cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm( + di->LastAccessDate, di->LastAccessTime, offset))); + fi.LastWriteTime = cpu_to_le64(cifs_UnixTimeToNT(cnvrtDosUnixTm( + di->LastWriteDate, di->LastWriteTime, offset))); + fi.ChangeTime = fi.LastWriteTime; + fi.Attributes = cpu_to_le32(le16_to_cpu(di->Attributes)); + fi.AllocationSize = cpu_to_le64(le32_to_cpu(di->AllocationSize)); + fi.EndOfFile = cpu_to_le64(le32_to_cpu(di->DataSize)); + } else { + FILE_FULL_DIRECTORY_INFO *di; + + di = (FILE_FULL_DIRECTORY_INFO *)search_info.srch_entries_start; + fi.CreationTime = di->CreationTime; + fi.LastAccessTime = di->LastAccessTime; + fi.LastWriteTime = di->LastWriteTime; + fi.ChangeTime = di->ChangeTime; + fi.Attributes = di->ExtFileAttributes; + fi.AllocationSize = di->AllocationSize; + fi.EndOfFile = di->EndOfFile; + fi.EASize = di->EaSize; + } + fi.NumberOfLinks = cpu_to_le32(1); + fi.DeletePending = 0; + fi.Directory = !!(le32_to_cpu(fi.Attributes) & ATTR_DIRECTORY); + cifs_buf_release(search_info.ntwrk_buf_start); + } else if (!full_path[0]) { + /* + * CIFSFindFirst() does not work on the root path if the + * root path was exported on the server from the top-level + * path (drive letter). + */ + rc = -EOPNOTSUPP; + } + } + + /* + * If everything failed then fall back to the legacy SMB command + * SMB_COM_QUERY_INFORMATION, which works with all servers but + * provides only limited information. + */ + if ((rc == -EOPNOTSUPP || rc == -EINVAL) && !non_unicode_wildcard) { rc = SMBQueryInformation(xid, tcon, full_path, &fi, cifs_sb->local_nls, cifs_remap(cifs_sb)); data->adjust_tz = true; + } else if ((rc == -EOPNOTSUPP || rc == -EINVAL) && non_unicode_wildcard) { + /* A path with a non-UNICODE wildcard character cannot exist. */ + rc = -ENOENT; } if (!rc) { @@ -573,6 +648,42 @@ static int cifs_query_path_info(const unsigned int xid, data->reparse_point = le32_to_cpu(fi.Attributes) & ATTR_REPARSE; } +#ifdef CONFIG_CIFS_XATTR + /* + * For WSL CHR and BLK reparse points it is required to fetch + * the EA $LXDEV, which contains the major and minor device numbers. + */
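/*
 * In the FIND_FILE_STANDARD_INFO branch above, DOS date/time pairs are
 * converted to NT time using the server's timezone offset. Sketch of the
 * conversion for a single field (cnvrtDosUnixTm() and cifs_UnixTimeToNT()
 * are the helpers the patch calls; this wrapper name is hypothetical):
 */
static __le64 dos_to_nt_time(__le16 date, __le16 time, int tz_offset)
{
	struct timespec64 ts = cnvrtDosUnixTm(date, time, tz_offset);

	return cpu_to_le64(cifs_UnixTimeToNT(ts));
}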
+ if (!rc && data->reparse_point) { + struct smb2_file_full_ea_info *ea; + + ea = (struct smb2_file_full_ea_info *)data->wsl.eas; + rc = CIFSSMBQAllEAs(xid, tcon, full_path, SMB2_WSL_XATTR_DEV, + &ea->ea_data[SMB2_WSL_XATTR_NAME_LEN + 1], + SMB2_WSL_XATTR_DEV_SIZE, cifs_sb); + if (rc == SMB2_WSL_XATTR_DEV_SIZE) { + ea->next_entry_offset = cpu_to_le32(0); + ea->flags = 0; + ea->ea_name_length = SMB2_WSL_XATTR_NAME_LEN; + ea->ea_value_length = cpu_to_le16(SMB2_WSL_XATTR_DEV_SIZE); + memcpy(&ea->ea_data[0], SMB2_WSL_XATTR_DEV, SMB2_WSL_XATTR_NAME_LEN + 1); + data->wsl.eas_len = sizeof(*ea) + SMB2_WSL_XATTR_NAME_LEN + 1 + + SMB2_WSL_XATTR_DEV_SIZE; + rc = 0; + } else if (rc >= 0) { + /* It is an error if the EA $LXDEV has the wrong size. */ + rc = -EINVAL; + } else { + /* + * In all other cases ignore the error if fetching + * the EA $LXDEV failed. It is needed only for + * WSL CHR and BLK reparse points, and wsl_to_fattr() + * handles the case when the EA is missing. + */ + rc = 0; + } + } +#endif + return rc; } @@ -608,6 +719,13 @@ static int cifs_query_file_info(const unsigned int xid, struct cifs_tcon *tcon, int rc; FILE_ALL_INFO fi = {}; + /* + * CIFSSMBQFileInfo() for non-NT servers returns bogus data in the + * Attributes field. So do not use this command for non-NT servers. + */ + if (!(tcon->ses->capabilities & CAP_NT_SMBS)) + return -EOPNOTSUPP; + if (cfile->symlink_target) { data->symlink_target = kstrdup(cfile->symlink_target, GFP_KERNEL); if (!data->symlink_target) @@ -778,6 +896,9 @@ smb_set_file_info(struct inode *inode, const char *full_path, struct cifs_fid fid; struct cifs_open_parms oparms; struct cifsFileInfo *open_file; + FILE_BASIC_INFO new_buf; + struct cifs_open_info_data query_data; + __le64 write_time = buf->LastWriteTime; struct cifsInodeInfo *cinode = CIFS_I(inode); struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); struct tcon_link *tlink = NULL; @@ -785,20 +906,58 @@ smb_set_file_info(struct inode *inode, const char *full_path, /* if the file is already open for write, just use that fileid */ open_file = find_writable_file(cinode, FIND_WR_FSUID_ONLY); + if (open_file) { fid.netfid = open_file->fid.netfid; netpid = open_file->pid; tcon = tlink_tcon(open_file->tlink); - goto set_via_filehandle; + } else { + tlink = cifs_sb_tlink(cifs_sb); + if (IS_ERR(tlink)) { + rc = PTR_ERR(tlink); + tlink = NULL; + goto out; + } + tcon = tlink_tcon(tlink); } - tlink = cifs_sb_tlink(cifs_sb); - if (IS_ERR(tlink)) { - rc = PTR_ERR(tlink); - tlink = NULL; - goto out; + /* + * Non-NT servers interpret a zero time value in SMB_SET_FILE_BASIC_INFO + * over TRANS2_SET_FILE_INFORMATION as a valid time value. NT servers + * interpret a zero time value as "do not change the existing value on + * the server". The API of the ->set_file_info() callback expects that a + * zero time value has the NT meaning - do not change. Therefore if the + * server is non-NT and some time values in "buf" are zero, then fetch + * the missing time values. + */ + if (!(tcon->ses->capabilities & CAP_NT_SMBS) && + (!buf->CreationTime || !buf->LastAccessTime || + !buf->LastWriteTime || !buf->ChangeTime)) { + rc = cifs_query_path_info(xid, tcon, cifs_sb, full_path, &query_data); + if (rc) { + if (open_file) { + cifsFileInfo_put(open_file); + open_file = NULL; + } + goto out; + } + /* + * The original write_time from buf->LastWriteTime is preserved, + * as SMBSetInformation() interprets zero as "do not change". 
+ */ + new_buf = *buf; + buf = &new_buf; + if (!buf->CreationTime) + buf->CreationTime = query_data.fi.CreationTime; + if (!buf->LastAccessTime) + buf->LastAccessTime = query_data.fi.LastAccessTime; + if (!buf->LastWriteTime) + buf->LastWriteTime = query_data.fi.LastWriteTime; + if (!buf->ChangeTime) + buf->ChangeTime = query_data.fi.ChangeTime; } - tcon = tlink_tcon(tlink); + + if (open_file) + goto set_via_filehandle; rc = CIFSSMBSetPathInfo(xid, tcon, full_path, buf, cifs_sb->local_nls, cifs_sb); @@ -819,8 +978,45 @@ smb_set_file_info(struct inode *inode, const char *full_path, .fid = &fid, }; - cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for times not supported by this server\n"); - rc = CIFS_open(xid, &oparms, &oplock, NULL); + if (S_ISDIR(inode->i_mode) && !(tcon->ses->capabilities & CAP_NT_SMBS)) { + /* Opening a directory path is not possible on non-NT servers. */ + rc = -EOPNOTSUPP; + } else { + /* + * Use cifs_open_file() instead of CIFS_open(), as + * cifs_open_file() selects the correct function, which + * also works on non-NT servers. + */ + rc = cifs_open_file(xid, &oparms, &oplock, NULL); + /* + * Opening a path for writing on non-NT servers is not + * possible when the read-only attribute is already set. + * A non-NT server in this case returns -EACCES. For those + * servers the only possible way to clear the read-only + * bit is via the SMB_COM_SETATTR command. + */ + if (rc == -EACCES && + (cinode->cifsAttrs & ATTR_READONLY) && + le32_to_cpu(buf->Attributes) != 0 && /* 0 = do not change attrs */ + !(le32_to_cpu(buf->Attributes) & ATTR_READONLY) && + !(tcon->ses->capabilities & CAP_NT_SMBS)) + rc = -EOPNOTSUPP; + } + + /* Fall back to the SMB_COM_SETATTR command when absolutely needed. */ + if (rc == -EOPNOTSUPP) { + cifs_dbg(FYI, "calling SetInformation since SetPathInfo for attrs/times not supported by this server\n"); + rc = SMBSetInformation(xid, tcon, full_path, + buf->Attributes != 0 ? buf->Attributes : cpu_to_le32(cinode->cifsAttrs), + write_time, + cifs_sb->local_nls, cifs_sb); + if (rc == 0) + cinode->cifsAttrs = le32_to_cpu(buf->Attributes); + else + rc = -EACCES; + goto out; + } + if (rc != 0) { if (rc == -EIO) rc = -EINVAL; @@ -828,6 +1024,7 @@ smb_set_file_info(struct inode *inode, const char *full_path, } netpid = current->tgid; + cifs_dbg(FYI, "calling SetFileInfo since SetPathInfo for attrs/times not supported by this server\n"); set_via_filehandle: rc = CIFSSMBSetFileInfo(xid, tcon, buf, fid.netfid, netpid); @@ -838,6 +1035,21 @@ set_via_filehandle: CIFSSMBClose(xid, tcon, fid.netfid); else cifsFileInfo_put(open_file); + + /* + * Setting the read-only bit is not honored on non-NT servers when done + * via open semantics. So for setting it, use the SMB_COM_SETATTR + * command. This command works only after the file is closed, so use it + * only when the operation was called without a filehandle. 
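/*
 * Sketch of the "zero means do not change" fix-up applied above for non-NT
 * servers: every zero time field is replaced with the value previously
 * queried from the server, so a later TRANS2_SET_FILE_INFORMATION call
 * cannot accidentally store a zero timestamp. Hypothetical helper, same
 * logic as the open-coded version:
 */
static void fill_missing_times(FILE_BASIC_INFO *buf,
			       const FILE_BASIC_INFO *cur)
{
	if (!buf->CreationTime)
		buf->CreationTime = cur->CreationTime;
	if (!buf->LastAccessTime)
		buf->LastAccessTime = cur->LastAccessTime;
	if (!buf->LastWriteTime)
		buf->LastWriteTime = cur->LastWriteTime;
	if (!buf->ChangeTime)
		buf->ChangeTime = cur->ChangeTime;
}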
+ */ + if (open_file == NULL && + !(tcon->ses->capabilities & CAP_NT_SMBS) && + le32_to_cpu(buf->Attributes) & ATTR_READONLY) { + SMBSetInformation(xid, tcon, full_path, + buf->Attributes, + 0 /* do not change write time */, + cifs_sb->local_nls, cifs_sb); + } out: if (tlink != NULL) cifs_put_tlink(tlink); @@ -975,18 +1187,13 @@ static int cifs_query_symlink(const unsigned int xid, return rc; } -static int cifs_parse_reparse_point(struct cifs_sb_info *cifs_sb, - const char *full_path, - struct kvec *rsp_iov, - struct cifs_open_info_data *data) +static struct reparse_data_buffer *cifs_get_reparse_point_buffer(const struct kvec *rsp_iov, + u32 *plen) { - struct reparse_data_buffer *buf; TRANSACT_IOCTL_RSP *io = rsp_iov->iov_base; - u32 plen = le16_to_cpu(io->ByteCount); - - buf = (struct reparse_data_buffer *)((__u8 *)&io->hdr.Protocol + - le32_to_cpu(io->DataOffset)); - return parse_reparse_point(buf, plen, cifs_sb, full_path, data); + *plen = le16_to_cpu(io->ByteCount); + return (struct reparse_data_buffer *)((__u8 *)&io->hdr.Protocol + + le32_to_cpu(io->DataOffset)); } static bool @@ -1069,6 +1276,47 @@ cifs_make_node(unsigned int xid, struct inode *inode, full_path, mode, dev); } +static bool +cifs_is_network_name_deleted(char *buf, struct TCP_Server_Info *server) +{ + struct smb_hdr *shdr = (struct smb_hdr *)buf; + struct TCP_Server_Info *pserver; + struct cifs_ses *ses; + struct cifs_tcon *tcon; + + if (shdr->Flags2 & SMBFLG2_ERR_STATUS) { + if (shdr->Status.CifsError != cpu_to_le32(NT_STATUS_NETWORK_NAME_DELETED)) + return false; + } else { + if (shdr->Status.DosError.ErrorClass != ERRSRV || + shdr->Status.DosError.Error != cpu_to_le16(ERRinvtid)) + return false; + } + + /* If server is a channel, select the primary channel */ + pserver = SERVER_IS_CHAN(server) ? 
			server->primary_server : server;
+
+	spin_lock(&cifs_tcp_ses_lock);
+	list_for_each_entry(ses, &pserver->smb_ses_list, smb_ses_list) {
+		if (cifs_ses_exiting(ses))
+			continue;
+		list_for_each_entry(tcon, &ses->tcon_list, tcon_list) {
+			if (tcon->tid == shdr->Tid) {
+				spin_lock(&tcon->tc_lock);
+				tcon->need_reconnect = true;
+				spin_unlock(&tcon->tc_lock);
+				spin_unlock(&cifs_tcp_ses_lock);
+				pr_warn_once("Server share %s deleted.\n",
+					     tcon->tree_name);
+				return true;
+			}
+		}
+	}
+	spin_unlock(&cifs_tcp_ses_lock);
+
+	return false;
+}
+
 struct smb_version_operations smb1_operations = {
 	.send_cancel = send_nt_cancel,
 	.compare_fids = cifs_compare_fids,
@@ -1094,8 +1342,8 @@ struct smb_version_operations smb1_operations = {
 	.check_trans2 = cifs_check_trans2,
 	.need_neg = cifs_need_neg,
 	.negotiate = cifs_negotiate,
-	.negotiate_wsize = cifs_negotiate_wsize,
-	.negotiate_rsize = cifs_negotiate_rsize,
+	.negotiate_wsize = smb1_negotiate_wsize,
+	.negotiate_rsize = smb1_negotiate_rsize,
 	.sess_setup = CIFS_SessSetup,
 	.logoff = CIFSSMBLogoff,
 	.tree_connect = CIFSTCon,
@@ -1121,7 +1369,7 @@
 	.rename = CIFSSMBRename,
 	.create_hardlink = CIFSCreateHardLink,
 	.query_symlink = cifs_query_symlink,
-	.parse_reparse_point = cifs_parse_reparse_point,
+	.get_reparse_point_buffer = cifs_get_reparse_point_buffer,
 	.open = cifs_open_file,
 	.set_fid = cifs_set_fid,
 	.close = cifs_close_file,
@@ -1153,6 +1401,7 @@
 	.get_acl_by_fid = get_cifs_acl_by_fid,
 	.set_acl = set_cifs_acl,
 	.make_node = cifs_make_node,
+	.is_network_name_deleted = cifs_is_network_name_deleted,
 };
 
 struct smb_version_values smb1_values = {
@@ -1170,6 +1419,7 @@ struct smb_version_values smb1_values = {
 	.cap_unix = CAP_UNIX,
 	.cap_nt_find = CAP_NT_SMBS | CAP_NT_FIND,
 	.cap_large_files = CAP_LARGE_FILES,
+	.cap_unicode = CAP_UNICODE,
 	.signing_enabled = SECMODE_SIGN_ENABLED,
 	.signing_required = SECMODE_SIGN_REQUIRED,
 };
diff --git a/fs/smb/client/smb2file.c b/fs/smb/client/smb2file.c
index d609a20fb98a..a7f629238830 100644
--- a/fs/smb/client/smb2file.c
+++ b/fs/smb/client/smb2file.c
@@ -152,16 +152,35 @@ int smb2_open_file(const unsigned int xid, struct cifs_open_parms *oparms, __u32
 	int err_buftype = CIFS_NO_BUFFER;
 	struct cifs_fid *fid = oparms->fid;
 	struct network_resiliency_req nr_ioctl_req;
+	bool retry_without_read_attributes = false;
 
 	smb2_path = cifs_convert_path_to_utf16(oparms->path, oparms->cifs_sb);
 	if (smb2_path == NULL)
 		return -ENOMEM;
 
-	oparms->desired_access |= FILE_READ_ATTRIBUTES;
+	/*
+	 * GENERIC_READ, GENERIC_EXECUTE, GENERIC_ALL and MAXIMUM_ALLOWED
+	 * also contain the FILE_READ_ATTRIBUTES access right. So do not append
+	 * FILE_READ_ATTRIBUTES when it is not needed, and prevent calling the
+	 * code path for retry_without_read_attributes.
+	 */
+	if (!(oparms->desired_access & FILE_READ_ATTRIBUTES) &&
+	    !(oparms->desired_access & GENERIC_READ) &&
+	    !(oparms->desired_access & GENERIC_EXECUTE) &&
+	    !(oparms->desired_access & GENERIC_ALL) &&
+	    !(oparms->desired_access & MAXIMUM_ALLOWED)) {
+		oparms->desired_access |= FILE_READ_ATTRIBUTES;
+		retry_without_read_attributes = true;
+	}
 
 	smb2_oplock = SMB2_OPLOCK_LEVEL_BATCH;
 
 	rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov,
 		       &err_buftype);
+	if (rc == -EACCES && retry_without_read_attributes) {
+		oparms->desired_access &= ~FILE_READ_ATTRIBUTES;
+		rc = SMB2_open(xid, oparms, smb2_path, &smb2_oplock, smb2_data, NULL, &err_iov,
+			       &err_buftype);
+	}
 	if (rc && data) {
 		struct smb2_hdr *hdr = err_iov.iov_base;
diff --git a/fs/smb/client/smb2glob.h b/fs/smb/client/smb2glob.h
index 2466e6155136..224495322a05 100644
--- a/fs/smb/client/smb2glob.h
+++ b/fs/smb/client/smb2glob.h
@@ -38,6 +38,7 @@ enum smb2_compound_ops {
 	SMB2_OP_SET_REPARSE,
 	SMB2_OP_GET_REPARSE,
 	SMB2_OP_QUERY_WSL_EA,
+	SMB2_OP_OPEN_QUERY,
 };
 
 /* Used when constructing chained read requests. */
diff --git a/fs/smb/client/smb2inode.c b/fs/smb/client/smb2inode.c
index 826b57a5a2a8..2a3e46b8e15a 100644
--- a/fs/smb/client/smb2inode.c
+++ b/fs/smb/client/smb2inode.c
@@ -176,6 +176,7 @@ static int smb2_compound_op(const unsigned int xid, struct cifs_tcon *tcon,
 			    struct kvec *out_iov, int *out_buftype, struct dentry *dentry)
 {
+	struct smb2_create_rsp *create_rsp = NULL;
 	struct smb2_query_info_rsp *qi_rsp = NULL;
 	struct smb2_compound_vars *vars = NULL;
 	__u8 oplock = SMB2_OPLOCK_LEVEL_NONE;
@@ -265,7 +266,13 @@ replay_again:
 	num_rqst++;
 	rc = 0;
 
-	for (i = 0; i < num_cmds; i++) {
+	i = 0;
+
+	/* Skip the leading explicit OPEN operation */
+	if (num_cmds > 0 && cmds[0] == SMB2_OP_OPEN_QUERY)
+		i++;
+
+	for (; i < num_cmds; i++) {
 		/* Operation */
 		switch (cmds[i]) {
 		case SMB2_OP_QUERY_INFO:
@@ -640,6 +647,29 @@ finished:
 	}
 	tmp_rc = rc;
+
+	if (rc == 0 && num_cmds > 0 && cmds[0] == SMB2_OP_OPEN_QUERY) {
+		create_rsp = rsp_iov[0].iov_base;
+		idata = in_iov[0].iov_base;
+		idata->fi.CreationTime = create_rsp->CreationTime;
+		idata->fi.LastAccessTime = create_rsp->LastAccessTime;
+		idata->fi.LastWriteTime = create_rsp->LastWriteTime;
+		idata->fi.ChangeTime = create_rsp->ChangeTime;
+		idata->fi.Attributes = create_rsp->FileAttributes;
+		idata->fi.AllocationSize = create_rsp->AllocationSize;
+		idata->fi.EndOfFile = create_rsp->EndofFile;
+		if (le32_to_cpu(idata->fi.NumberOfLinks) == 0)
+			idata->fi.NumberOfLinks = cpu_to_le32(1); /* dummy value */
+		idata->fi.DeletePending = 0;
+		idata->fi.Directory = !!(le32_to_cpu(create_rsp->FileAttributes) & ATTR_DIRECTORY);
+
+		/* smb2_parse_contexts() fills idata->fi.IndexNumber */
+		rc = smb2_parse_contexts(server, &rsp_iov[0], &oparms->fid->epoch,
+					 oparms->fid->lease_key, &oplock, &idata->fi, NULL);
+		if (rc)
+			cifs_dbg(VFS, "rc: %d parsing context of compound op\n", rc);
+	}
+
 	for (i = 0; i < num_cmds; i++) {
 		char *buf = rsp_iov[i + i].iov_base;
@@ -978,6 +1008,43 @@ int smb2_query_path_info(const unsigned int xid,
 	case 0:
 		rc = parse_create_response(data, cifs_sb, full_path, &out_iov[0]);
 		break;
+	case -EACCES:
+		/*
+		 * If SMB2_OP_QUERY_INFO (called when POSIX extensions are not used) failed with
+		 * STATUS_ACCESS_DENIED then it means that the caller does not have permission to
+		 * open the path with FILE_READ_ATTRIBUTES access and therefore cannot issue the
+		 * SMB2_OP_QUERY_INFO command.
+		 *
+		 * There is an alternative way to query limited information about the path which
+		 * is still suitable for the stat() syscall: a successful SMB2 OPEN/CREATE
+		 * operation returns a subset of the query information in its response.
+		 *
+		 * So try to open the path without FILE_READ_ATTRIBUTES but with MAXIMUM_ALLOWED
+		 * access, which will grant the maximum possible access to the file, and the
+		 * response will contain the query information required for the stat() syscall.
+		 */
+
+		if (tcon->posix_extensions)
+			break;
+
+		num_cmds = 1;
+		cmds[0] = SMB2_OP_OPEN_QUERY;
+		in_iov[0].iov_base = data;
+		in_iov[0].iov_len = sizeof(*data);
+		oparms = CIFS_OPARMS(cifs_sb, tcon, full_path, MAXIMUM_ALLOWED,
+				     FILE_OPEN, create_options, ACL_NO_MODE);
+		free_rsp_iov(out_iov, out_buftype, ARRAY_SIZE(out_iov));
+		rc = smb2_compound_op(xid, tcon, cifs_sb, full_path,
+				      &oparms, in_iov, cmds, num_cmds,
+				      cfile, out_iov, out_buftype, NULL);
+
+		hdr = out_iov[0].iov_base;
+		if (!hdr || out_buftype[0] == CIFS_NO_BUFFER)
+			goto out;
+
+		if (!rc)
+			rc = parse_create_response(data, cifs_sb, full_path, &out_iov[0]);
+		break;
 	case -EOPNOTSUPP:
 		/*
 		 * BB TODO: When support for special files added to Samba
@@ -1273,6 +1340,14 @@ struct inode *smb2_get_reparse_inode(struct cifs_open_info_data *data,
 	int rc;
 	int i;
 
+	/*
+	 * If the server filesystem does not support reparse points then do not
+	 * attempt to create a reparse point. This will prevent creating an
+	 * unusable empty object on the server.
+	 */
+	if (!(le32_to_cpu(tcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS))
+		return ERR_PTR(-EOPNOTSUPP);
+
 	oparms = CIFS_OPARMS(cifs_sb, tcon, full_path,
 			     SYNCHRONIZE | DELETE |
 			     FILE_READ_ATTRIBUTES |
diff --git a/fs/smb/client/smb2misc.c b/fs/smb/client/smb2misc.c
index f3c4b70b77b9..cddf273c14ae 100644
--- a/fs/smb/client/smb2misc.c
+++ b/fs/smb/client/smb2misc.c
@@ -816,11 +816,12 @@ smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 persistent_fid,
 	WARN_ONCE(tcon->tc_count < 0, "tcon refcount is negative");
 	spin_unlock(&cifs_tcp_ses_lock);
 
-	if (tcon->ses)
+	if (tcon->ses) {
 		server = tcon->ses->server;
-
-	cifs_server_dbg(FYI, "tid=0x%x: tcon is closing, skipping async close retry of fid %llu %llu\n",
-			tcon->tid, persistent_fid, volatile_fid);
+		cifs_server_dbg(FYI,
+				"tid=0x%x: tcon is closing, skipping async close retry of fid %llu %llu\n",
+				tcon->tid, persistent_fid, volatile_fid);
+	}
 
 	return 0;
 }
diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c
index 4dd11eafb69d..2fe8eeb98535 100644
--- a/fs/smb/client/smb2ops.c
+++ b/fs/smb/client/smb2ops.c
@@ -464,12 +464,20 @@ smb2_negotiate(const unsigned int xid,
 	server->CurrentMid = 0;
 	spin_unlock(&server->mid_lock);
 	rc = SMB2_negotiate(xid, ses, server);
-	/* BB we probably don't need to retry with modern servers */
-	if (rc == -EAGAIN)
-		rc = -EHOSTDOWN;
 	return rc;
 }
 
+static inline unsigned int
+prevent_zero_iosize(unsigned int size, const char *type)
+{
+	if (size == 0) {
+		cifs_dbg(VFS, "SMB: Zero %ssize calculated, using minimum value %u\n",
+			 type, CIFS_MIN_DEFAULT_IOSIZE);
+		return CIFS_MIN_DEFAULT_IOSIZE;
+	}
+	return size;
+}
+
 static unsigned int
 smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
 {
@@ -477,12 +485,12 @@ smb2_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx)
 	unsigned int wsize;
 
 	/* start with specified wsize, or default */
-	wsize = ctx->wsize ? ctx->wsize : CIFS_DEFAULT_IOSIZE;
+	wsize = ctx->got_wsize ?
ctx->vol_wsize : CIFS_DEFAULT_IOSIZE; wsize = min_t(unsigned int, wsize, server->max_write); if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); - return wsize; + return prevent_zero_iosize(wsize, "w"); } static unsigned int @@ -492,7 +500,7 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) unsigned int wsize; /* start with specified wsize, or default */ - wsize = ctx->wsize ? ctx->wsize : SMB3_DEFAULT_IOSIZE; + wsize = ctx->got_wsize ? ctx->vol_wsize : SMB3_DEFAULT_IOSIZE; wsize = min_t(unsigned int, wsize, server->max_write); #ifdef CONFIG_CIFS_SMB_DIRECT if (server->rdma) { @@ -514,7 +522,7 @@ smb3_negotiate_wsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) wsize = min_t(unsigned int, wsize, SMB2_MAX_BUFFER_SIZE); - return wsize; + return prevent_zero_iosize(wsize, "w"); } static unsigned int @@ -524,13 +532,13 @@ smb2_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) unsigned int rsize; /* start with specified rsize, or default */ - rsize = ctx->rsize ? ctx->rsize : CIFS_DEFAULT_IOSIZE; + rsize = ctx->got_rsize ? ctx->vol_rsize : CIFS_DEFAULT_IOSIZE; rsize = min_t(unsigned int, rsize, server->max_read); if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); - return rsize; + return prevent_zero_iosize(rsize, "r"); } static unsigned int @@ -540,7 +548,7 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) unsigned int rsize; /* start with specified rsize, or default */ - rsize = ctx->rsize ? ctx->rsize : SMB3_DEFAULT_IOSIZE; + rsize = ctx->got_rsize ? ctx->vol_rsize : SMB3_DEFAULT_IOSIZE; rsize = min_t(unsigned int, rsize, server->max_read); #ifdef CONFIG_CIFS_SMB_DIRECT if (server->rdma) { @@ -563,7 +571,7 @@ smb3_negotiate_rsize(struct cifs_tcon *tcon, struct smb3_fs_context *ctx) if (!(server->capabilities & SMB2_GLOBAL_CAP_LARGE_MTU)) rsize = min_t(unsigned int, rsize, SMB2_MAX_BUFFER_SIZE); - return rsize; + return prevent_zero_iosize(rsize, "r"); } /* @@ -969,7 +977,7 @@ smb2_is_path_accessible(const unsigned int xid, struct cifs_tcon *tcon, if (islink) rc = -EREMOTE; } - if (rc == -EREMOTE && IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) && cifs_sb && + if (rc == -EREMOTE && IS_ENABLED(CONFIG_CIFS_DFS_UPCALL) && (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_NO_DFS)) rc = -EOPNOTSUPP; goto out; @@ -3526,8 +3534,6 @@ static long smb3_simple_falloc(struct file *file, struct cifs_tcon *tcon, if (rc == 0) { netfs_resize_file(&cifsi->netfs, new_eof, true); cifs_setsize(inode, new_eof); - cifs_truncate_page(inode->i_mapping, inode->i_size); - truncate_setsize(inode, new_eof); } goto out; } @@ -4549,9 +4555,9 @@ decrypt_raw_data(struct TCP_Server_Info *server, char *buf, return rc; } } else { - if (unlikely(!server->secmech.dec)) - return -EIO; - + rc = smb3_crypto_aead_allocate(server); + if (unlikely(rc)) + return rc; tfm = server->secmech.dec; } @@ -5229,7 +5235,7 @@ static int smb2_make_node(unsigned int xid, struct inode *inode, const char *full_path, umode_t mode, dev_t dev) { struct cifs_sb_info *cifs_sb = CIFS_SB(inode->i_sb); - int rc; + int rc = -EOPNOTSUPP; /* * Check if mounted with mount parm 'sfu' mount parm. 
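
The negotiate_wsize()/negotiate_rsize() hunks above all follow the same clamping pattern: start from the user-supplied size (ctx->vol_wsize/ctx->vol_rsize) or the dialect default, cap it by the server's advertised maximum, cap it again by SMB2_MAX_BUFFER_SIZE when the server lacks large-MTU support, and finally floor it through prevent_zero_iosize() so that a server advertising a zero max_read/max_write can no longer drive the negotiated I/O size to zero. Below is a minimal standalone C sketch of that clamping; SMB3_DEFAULT_IOSIZE and CIFS_MIN_DEFAULT_IOSIZE match the smb2pdu.h hunk later in this diff, while the SMB2_MAX_BUFFER_SIZE value and the negotiate_wsize() helper are illustrative assumptions, not the kernel's definitions.

#include <stdio.h>

/* The first two values appear in the smb2pdu.h hunk; the third is assumed. */
#define SMB3_DEFAULT_IOSIZE	(4 * 1024 * 1024)
#define CIFS_MIN_DEFAULT_IOSIZE	(65536)
#define SMB2_MAX_BUFFER_SIZE	(65536)	/* assumed cap without large-MTU support */

static unsigned int min_u(unsigned int a, unsigned int b)
{
	return a < b ? a : b;
}

/* Mirrors prevent_zero_iosize(): never let the negotiated size collapse to 0. */
static unsigned int prevent_zero_iosize(unsigned int size, const char *type)
{
	if (size == 0) {
		fprintf(stderr, "zero %ssize calculated, using minimum value %u\n",
			type, CIFS_MIN_DEFAULT_IOSIZE);
		return CIFS_MIN_DEFAULT_IOSIZE;
	}
	return size;
}

/* Hypothetical stand-in for the smb3_negotiate_wsize() flow shown above. */
static unsigned int negotiate_wsize(unsigned int requested_wsize,
				    unsigned int server_max_write,
				    int has_large_mtu)
{
	unsigned int wsize = requested_wsize ? requested_wsize : SMB3_DEFAULT_IOSIZE;

	wsize = min_u(wsize, server_max_write);	/* cap by server maximum */
	if (!has_large_mtu)
		wsize = min_u(wsize, SMB2_MAX_BUFFER_SIZE);
	return prevent_zero_iosize(wsize, "w");
}

int main(void)
{
	/* A server advertising max_write == 0 now yields 65536, not 0. */
	printf("%u\n", negotiate_wsize(0, 0, 1));
	/* A 1 MiB mount option against an 8 MiB server max stays 1048576. */
	printf("%u\n", negotiate_wsize(1024 * 1024, 8 * 1024 * 1024, 1));
	return 0;
}
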
@@ -5240,7 +5246,7 @@ static int smb2_make_node(unsigned int xid, struct inode *inode, if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_UNX_EMUL) { rc = cifs_sfu_make_node(xid, inode, dentry, tcon, full_path, mode, dev); - } else { + } else if (le32_to_cpu(tcon->fsAttrInfo.Attributes) & FILE_SUPPORTS_REPARSE_POINTS) { rc = smb2_mknod_reparse(xid, inode, dentry, tcon, full_path, mode, dev); } @@ -5297,7 +5303,7 @@ struct smb_version_operations smb20_operations = { .unlink = smb2_unlink, .rename = smb2_rename_path, .create_hardlink = smb2_create_hardlink, - .parse_reparse_point = smb2_parse_reparse_point, + .get_reparse_point_buffer = smb2_get_reparse_point_buffer, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, .create_reparse_symlink = smb2_create_reparse_symlink, @@ -5400,7 +5406,7 @@ struct smb_version_operations smb21_operations = { .unlink = smb2_unlink, .rename = smb2_rename_path, .create_hardlink = smb2_create_hardlink, - .parse_reparse_point = smb2_parse_reparse_point, + .get_reparse_point_buffer = smb2_get_reparse_point_buffer, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, .create_reparse_symlink = smb2_create_reparse_symlink, @@ -5507,7 +5513,7 @@ struct smb_version_operations smb30_operations = { .unlink = smb2_unlink, .rename = smb2_rename_path, .create_hardlink = smb2_create_hardlink, - .parse_reparse_point = smb2_parse_reparse_point, + .get_reparse_point_buffer = smb2_get_reparse_point_buffer, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, .create_reparse_symlink = smb2_create_reparse_symlink, @@ -5623,7 +5629,7 @@ struct smb_version_operations smb311_operations = { .unlink = smb2_unlink, .rename = smb2_rename_path, .create_hardlink = smb2_create_hardlink, - .parse_reparse_point = smb2_parse_reparse_point, + .get_reparse_point_buffer = smb2_get_reparse_point_buffer, .query_mf_symlink = smb3_query_mf_symlink, .create_mf_symlink = smb3_create_mf_symlink, .create_reparse_symlink = smb2_create_reparse_symlink, diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index ed7812247ebc..4e28632b5fd6 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -43,6 +43,7 @@ #endif #include "cached_dir.h" #include "compress.h" +#include "fs_context.h" /* * The following table defines the expected "StructureSize" of SMB2 requests @@ -300,32 +301,23 @@ again: mutex_lock(&ses->session_mutex); /* - * if this is called by delayed work, and the channel has been disabled - * in parallel, the delayed work can continue to execute in parallel - * there's a chance that this channel may not exist anymore + * Handle the case where a concurrent thread failed to negotiate or + * killed a channel. */ spin_lock(&server->srv_lock); - if (server->tcpStatus == CifsExiting) { + switch (server->tcpStatus) { + case CifsExiting: spin_unlock(&server->srv_lock); mutex_unlock(&ses->session_mutex); - rc = -EHOSTDOWN; - goto out; - } - - /* - * Recheck after acquire mutex. If another thread is negotiating - * and the server never sends an answer the socket will be closed - * and tcpStatus set to reconnect. 
- */ - if (server->tcpStatus == CifsNeedReconnect) { + return -EHOSTDOWN; + case CifsNeedReconnect: spin_unlock(&server->srv_lock); mutex_unlock(&ses->session_mutex); - - if (tcon->retry) - goto again; - - rc = -EHOSTDOWN; - goto out; + if (!tcon->retry) + return -EHOSTDOWN; + goto again; + default: + break; } spin_unlock(&server->srv_lock); @@ -350,43 +342,41 @@ again: spin_unlock(&ses->ses_lock); rc = cifs_negotiate_protocol(0, ses, server); - if (!rc) { - /* - * if server stopped supporting multichannel - * and the first channel reconnected, disable all the others. - */ - if (ses->chan_count > 1 && - !(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { - rc = cifs_chan_skip_or_disable(ses, server, - from_reconnect); - if (rc) { - mutex_unlock(&ses->session_mutex); - goto out; - } - } - - rc = cifs_setup_session(0, ses, server, ses->local_nls); - if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED)) { - /* - * Try alternate password for next reconnect (key rotation - * could be enabled on the server e.g.) if an alternate - * password is available and the current password is expired, - * but do not swap on non pwd related errors like host down - */ - if (ses->password2) - swap(ses->password2, ses->password); - } - - if ((rc == -EACCES) && !tcon->retry) { - mutex_unlock(&ses->session_mutex); - rc = -EHOSTDOWN; - goto failed; - } else if (rc) { + if (rc) { + mutex_unlock(&ses->session_mutex); + if (!tcon->retry) + return -EHOSTDOWN; + goto again; + } + /* + * if server stopped supporting multichannel + * and the first channel reconnected, disable all the others. + */ + if (ses->chan_count > 1 && + !(server->capabilities & SMB2_GLOBAL_CAP_MULTI_CHANNEL)) { + rc = cifs_chan_skip_or_disable(ses, server, + from_reconnect); + if (rc) { mutex_unlock(&ses->session_mutex); goto out; } - } else { + } + + rc = cifs_setup_session(0, ses, server, ses->local_nls); + if ((rc == -EACCES) || (rc == -EKEYEXPIRED) || (rc == -EKEYREVOKED)) { + /* + * Try alternate password for next reconnect (key rotation + * could be enabled on the server e.g.) if an alternate + * password is available and the current password is expired, + * but do not swap on non pwd related errors like host down + */ + if (ses->password2) + swap(ses->password2, ses->password); + } + if (rc) { mutex_unlock(&ses->session_mutex); + if (rc == -EACCES && !tcon->retry) + return -EHOSTDOWN; goto out; } @@ -490,7 +480,6 @@ out: case SMB2_IOCTL: rc = -EAGAIN; } -failed: return rc; } @@ -1263,15 +1252,8 @@ SMB2_negotiate(const unsigned int xid, cifs_server_dbg(VFS, "Missing expected negotiate contexts\n"); } - if (server->cipher_type && !rc) { - if (!SERVER_IS_CHAN(server)) { - rc = smb3_crypto_aead_allocate(server); - } else { - /* For channels, just reuse the primary server crypto secmech. 
*/ - server->secmech.enc = server->primary_server->secmech.enc; - server->secmech.dec = server->primary_server->secmech.dec; - } - } + if (server->cipher_type && !rc) + rc = smb3_crypto_aead_allocate(server); neg_exit: free_rsp_buf(resp_buftype, rsp); return rc; @@ -2939,6 +2921,7 @@ replay_again: req->CreateContextsOffset = cpu_to_le32( sizeof(struct smb2_create_req) + iov[1].iov_len); + le32_add_cpu(&req->CreateContextsLength, iov[n_iov-1].iov_len); pc_buf = iov[n_iov-1].iov_base; } @@ -2985,7 +2968,7 @@ replay_again: /* Eventually save off posix specific response info and timestamps */ err_free_rsp_buf: - free_rsp_buf(resp_buftype, rsp); + free_rsp_buf(resp_buftype, rsp_iov.iov_base); kfree(pc_buf); err_free_req: cifs_small_buf_release(req); @@ -3928,12 +3911,10 @@ SMB2_query_acl(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, void **data, u32 *plen, u32 extra_info) { - __u32 additional_info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO | - extra_info; *plen = 0; return query_info(xid, tcon, persistent_fid, volatile_fid, - 0, SMB2_O_INFO_SECURITY, additional_info, + 0, SMB2_O_INFO_SECURITY, extra_info, SMB2_MAX_BUFFER_SIZE, MIN_SEC_DESC_LEN, data, plen); } @@ -4103,6 +4084,20 @@ smb2_echo_callback(struct mid_q_entry *mid) add_credits(server, &credits, CIFS_ECHO_OP); } +static void cifs_renegotiate_iosize(struct TCP_Server_Info *server, + struct cifs_tcon *tcon) +{ + struct cifs_sb_info *cifs_sb; + + if (server == NULL || tcon == NULL) + return; + + spin_lock(&tcon->sb_list_lock); + list_for_each_entry(cifs_sb, &tcon->cifs_sb_list, tcon_sb_link) + cifs_negotiate_iosize(server, cifs_sb->ctx, tcon); + spin_unlock(&tcon->sb_list_lock); +} + void smb2_reconnect_server(struct work_struct *work) { struct TCP_Server_Info *server = container_of(work, @@ -4188,9 +4183,10 @@ void smb2_reconnect_server(struct work_struct *work) list_for_each_entry_safe(tcon, tcon2, &tmp_list, rlist) { rc = smb2_reconnect(SMB2_INTERNAL_CMD, tcon, server, true); - if (!rc) + if (!rc) { + cifs_renegotiate_iosize(server, tcon); cifs_reopen_persistent_handles(tcon); - else + } else resched = true; list_del_init(&tcon->rlist); if (tcon->ipc) diff --git a/fs/smb/client/transport.c b/fs/smb/client/transport.c index 0dc80959ce48..266af17aa7d9 100644 --- a/fs/smb/client/transport.c +++ b/fs/smb/client/transport.c @@ -179,7 +179,7 @@ delete_mid(struct mid_q_entry *mid) * Our basic "send data to server" function. Should be called with srv_mutex * held. The caller is responsible for handling the results. 
*/ -static int +int smb_send_kvec(struct TCP_Server_Info *server, struct msghdr *smb_msg, size_t *sent) { @@ -894,6 +894,9 @@ cifs_sync_mid_result(struct mid_q_entry *mid, struct TCP_Server_Info *server) case MID_SHUTDOWN: rc = -EHOSTDOWN; break; + case MID_RC: + rc = mid->mid_rc; + break; default: if (!(mid->mid_flags & MID_DELETED)) { list_del_init(&mid->qhead); diff --git a/fs/smb/client/xattr.c b/fs/smb/client/xattr.c index 58a584f0b27e..b88fa04f5792 100644 --- a/fs/smb/client/xattr.c +++ b/fs/smb/client/xattr.c @@ -31,6 +31,8 @@ * secure, replaced by SMB2 (then even more highly secure SMB3) many years ago */ #define SMB3_XATTR_CIFS_ACL "system.smb3_acl" /* DACL only */ +#define SMB3_XATTR_CIFS_NTSD_SACL "system.smb3_ntsd_sacl" /* SACL only */ +#define SMB3_XATTR_CIFS_NTSD_OWNER "system.smb3_ntsd_owner" /* owner only */ #define SMB3_XATTR_CIFS_NTSD "system.smb3_ntsd" /* owner plus DACL */ #define SMB3_XATTR_CIFS_NTSD_FULL "system.smb3_ntsd_full" /* owner/DACL/SACL */ #define SMB3_XATTR_ATTRIB "smb3.dosattrib" /* full name: user.smb3.dosattrib */ @@ -38,6 +40,7 @@ /* BB need to add server (Samba e.g) support for security and trusted prefix */ enum { XATTR_USER, XATTR_CIFS_ACL, XATTR_ACL_ACCESS, XATTR_ACL_DEFAULT, + XATTR_CIFS_NTSD_SACL, XATTR_CIFS_NTSD_OWNER, XATTR_CIFS_NTSD, XATTR_CIFS_NTSD_FULL }; static int cifs_attrib_set(unsigned int xid, struct cifs_tcon *pTcon, @@ -160,6 +163,8 @@ static int cifs_xattr_set(const struct xattr_handler *handler, break; case XATTR_CIFS_ACL: + case XATTR_CIFS_NTSD_SACL: + case XATTR_CIFS_NTSD_OWNER: case XATTR_CIFS_NTSD: case XATTR_CIFS_NTSD_FULL: { struct smb_ntsd *pacl; @@ -187,6 +192,13 @@ static int cifs_xattr_set(const struct xattr_handler *handler, CIFS_ACL_GROUP | CIFS_ACL_DACL); break; + case XATTR_CIFS_NTSD_OWNER: + aclflags = (CIFS_ACL_OWNER | + CIFS_ACL_GROUP); + break; + case XATTR_CIFS_NTSD_SACL: + aclflags = CIFS_ACL_SACL; + break; case XATTR_CIFS_ACL: default: aclflags = CIFS_ACL_DACL; @@ -308,6 +320,8 @@ static int cifs_xattr_get(const struct xattr_handler *handler, break; case XATTR_CIFS_ACL: + case XATTR_CIFS_NTSD_SACL: + case XATTR_CIFS_NTSD_OWNER: case XATTR_CIFS_NTSD: case XATTR_CIFS_NTSD_FULL: { /* @@ -320,10 +334,23 @@ static int cifs_xattr_get(const struct xattr_handler *handler, if (pTcon->ses->server->ops->get_acl == NULL) goto out; /* rc already EOPNOTSUPP */ - if (handler->flags == XATTR_CIFS_NTSD_FULL) { + switch (handler->flags) { + case XATTR_CIFS_NTSD_FULL: + extra_info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO | SACL_SECINFO; + break; + case XATTR_CIFS_NTSD: + extra_info = OWNER_SECINFO | GROUP_SECINFO | DACL_SECINFO; + break; + case XATTR_CIFS_NTSD_OWNER: + extra_info = OWNER_SECINFO | GROUP_SECINFO; + break; + case XATTR_CIFS_NTSD_SACL: extra_info = SACL_SECINFO; - } else { - extra_info = 0; + break; + case XATTR_CIFS_ACL: + default: + extra_info = DACL_SECINFO; + break; } pacl = pTcon->ses->server->ops->get_acl(cifs_sb, inode, full_path, &acllen, extra_info); @@ -441,6 +468,20 @@ static const struct xattr_handler smb3_acl_xattr_handler = { .set = cifs_xattr_set, }; +static const struct xattr_handler smb3_ntsd_sacl_xattr_handler = { + .name = SMB3_XATTR_CIFS_NTSD_SACL, + .flags = XATTR_CIFS_NTSD_SACL, + .get = cifs_xattr_get, + .set = cifs_xattr_set, +}; + +static const struct xattr_handler smb3_ntsd_owner_xattr_handler = { + .name = SMB3_XATTR_CIFS_NTSD_OWNER, + .flags = XATTR_CIFS_NTSD_OWNER, + .get = cifs_xattr_get, + .set = cifs_xattr_set, +}; + static const struct xattr_handler cifs_cifs_ntsd_xattr_handler 
= {
	.name = CIFS_XATTR_CIFS_NTSD,
	.flags = XATTR_CIFS_NTSD,
@@ -486,6 +527,8 @@ const struct xattr_handler * const cifs_xattr_handlers[] = {
 	&cifs_os2_xattr_handler,
 	&cifs_cifs_acl_xattr_handler,
 	&smb3_acl_xattr_handler, /* alias for above since avoiding "cifs" */
+	&smb3_ntsd_sacl_xattr_handler,
+	&smb3_ntsd_owner_xattr_handler,
 	&cifs_cifs_ntsd_xattr_handler,
 	&smb3_ntsd_xattr_handler, /* alias for above since avoiding "cifs" */
 	&cifs_cifs_ntsd_full_xattr_handler,
diff --git a/fs/smb/common/smb2pdu.h b/fs/smb/common/smb2pdu.h
index c7a0efda4403..f79a5165a7cc 100644
--- a/fs/smb/common/smb2pdu.h
+++ b/fs/smb/common/smb2pdu.h
@@ -95,6 +95,9 @@
  */
 #define SMB3_DEFAULT_IOSIZE (4 * 1024 * 1024)
 
+/* According to the MS-SMB2 specification, the minimum recommended value is 65536. */
+#define CIFS_MIN_DEFAULT_IOSIZE (65536)
+
 /*
  * SMB2 Header Definition
  *
@@ -1564,13 +1567,13 @@ struct reparse_nfs_data_buffer {
 	__u8	DataBuffer[];
 } __packed;
 
-/* For IO_REPARSE_TAG_LX_SYMLINK */
+/* For IO_REPARSE_TAG_LX_SYMLINK - see MS-FSCC 2.1.2.7 */
 struct reparse_wsl_symlink_data_buffer {
 	__le32	ReparseTag;
 	__le16	ReparseDataLength;
 	__u16	Reserved;
-	__le32	Flags;
-	__u8	PathBuffer[]; /* Variable Length UTF-8 string without nul-term */
+	__le32	Version; /* Always 2 */
+	__u8	Target[]; /* Variable Length UTF-8 string without nul-term */
 } __packed;
 
 struct validate_negotiate_info_req {
diff --git a/fs/smb/common/smbacl.h b/fs/smb/common/smbacl.h
index 6a60698fc6f0..a624ec9e4a14 100644
--- a/fs/smb/common/smbacl.h
+++ b/fs/smb/common/smbacl.h
@@ -107,7 +107,8 @@ struct smb_sid {
 struct smb_acl {
 	__le16 revision; /* revision level */
 	__le16 size;
-	__le32 num_aces;
+	__le16 num_aces;
+	__le16 reserved;
 } __attribute__((packed));
 
 struct smb_ace {
diff --git a/fs/smb/server/Kconfig b/fs/smb/server/Kconfig
index cabe6a843c6a..cf70e96ad4de 100644
--- a/fs/smb/server/Kconfig
+++ b/fs/smb/server/Kconfig
@@ -70,4 +70,4 @@ config SMB_SERVER_CHECK_CAP_NET_ADMIN
 config SMB_SERVER_KERBEROS5
 	bool "Support for Kerberos 5"
 	depends on SMB_SERVER
-	default n
+	default y
diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c
index 2a5b4a96bf99..b3d121052408 100644
--- a/fs/smb/server/auth.c
+++ b/fs/smb/server/auth.c
@@ -550,7 +550,19 @@ int ksmbd_krb5_authenticate(struct ksmbd_session *sess, char *in_blob,
 		retval = -ENOMEM;
 		goto out;
 	}
-	sess->user = user;
+
+	if (!sess->user) {
+		/* First successful authentication */
+		sess->user = user;
+	} else {
+		if (!ksmbd_compare_user(sess->user, user)) {
+			ksmbd_debug(AUTH, "different user tried to reuse session\n");
+			retval = -EPERM;
+			ksmbd_free_user(user);
+			goto out;
+		}
+		ksmbd_free_user(user);
+	}
 
 	memcpy(sess->sess_key, resp->payload, resp->session_key_len);
 	memcpy(out_blob, resp->payload + resp->session_key_len,
@@ -1016,9 +1028,9 @@ static int ksmbd_get_encryption_key(struct ksmbd_work *work, __u64 ses_id,
 
 	ses_enc_key = enc ?
		sess->smb3encryptionkey : sess->smb3decryptionkey;
-	if (enc)
-		ksmbd_user_session_get(sess);
 	memcpy(key, ses_enc_key, SMB3_ENC_DEC_KEY_SIZE);
+	if (!enc)
+		ksmbd_user_session_put(sess);
 
 	return 0;
 }
@@ -1218,7 +1230,7 @@ free_iv:
 free_sg:
 	kfree(sg);
 free_req:
-	kfree(req);
+	aead_request_free(req);
 free_ctx:
 	ksmbd_release_crypto_ctx(ctx);
 	return rc;
}
diff --git a/fs/smb/server/connection.c b/fs/smb/server/connection.c
index f8a40f65db6a..83764c230e9d 100644
--- a/fs/smb/server/connection.c
+++ b/fs/smb/server/connection.c
@@ -39,8 +39,10 @@ void ksmbd_conn_free(struct ksmbd_conn *conn)
 	xa_destroy(&conn->sessions);
 	kvfree(conn->request_buf);
 	kfree(conn->preauth_info);
-	if (atomic_dec_and_test(&conn->refcnt))
+	if (atomic_dec_and_test(&conn->refcnt)) {
+		ksmbd_free_transport(conn->transport);
 		kfree(conn);
+	}
 }
 
 /**
@@ -433,6 +435,26 @@ void ksmbd_conn_init_server_callbacks(struct ksmbd_conn_ops *ops)
 	default_conn_ops.terminate_fn = ops->terminate_fn;
 }
 
+void ksmbd_conn_r_count_inc(struct ksmbd_conn *conn)
+{
+	atomic_inc(&conn->r_count);
+}
+
+void ksmbd_conn_r_count_dec(struct ksmbd_conn *conn)
+{
+	/*
+	 * Check the waitqueue to drop pending requests on
+	 * disconnection. waitqueue_active() is safe because it
+	 * uses an atomic operation for the condition.
+	 */
+	atomic_inc(&conn->refcnt);
+	if (!atomic_dec_return(&conn->r_count) && waitqueue_active(&conn->r_count_q))
+		wake_up(&conn->r_count_q);
+
+	if (atomic_dec_and_test(&conn->refcnt))
+		kfree(conn);
+}
+
 int ksmbd_conn_transport_init(void)
 {
 	int ret;
diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h
index b379ae4fdcdf..14620e147dda 100644
--- a/fs/smb/server/connection.h
+++ b/fs/smb/server/connection.h
@@ -27,6 +27,7 @@ enum {
 	KSMBD_SESS_EXITING,
 	KSMBD_SESS_NEED_RECONNECT,
 	KSMBD_SESS_NEED_NEGOTIATE,
+	KSMBD_SESS_NEED_SETUP,
 	KSMBD_SESS_RELEASING
 };
 
@@ -168,6 +169,8 @@ int ksmbd_conn_transport_init(void);
 void ksmbd_conn_transport_destroy(void);
 void ksmbd_conn_lock(struct ksmbd_conn *conn);
 void ksmbd_conn_unlock(struct ksmbd_conn *conn);
+void ksmbd_conn_r_count_inc(struct ksmbd_conn *conn);
+void ksmbd_conn_r_count_dec(struct ksmbd_conn *conn);
 
 /*
  * WARNING
@@ -185,6 +188,11 @@ static inline bool ksmbd_conn_need_negotiate(struct ksmbd_conn *conn)
 	return READ_ONCE(conn->status) == KSMBD_SESS_NEED_NEGOTIATE;
 }
 
+static inline bool ksmbd_conn_need_setup(struct ksmbd_conn *conn)
+{
+	return READ_ONCE(conn->status) == KSMBD_SESS_NEED_SETUP;
+}
+
 static inline bool ksmbd_conn_need_reconnect(struct ksmbd_conn *conn)
 {
 	return READ_ONCE(conn->status) == KSMBD_SESS_NEED_RECONNECT;
@@ -215,6 +223,11 @@ static inline void ksmbd_conn_set_need_negotiate(struct ksmbd_conn *conn)
 	WRITE_ONCE(conn->status, KSMBD_SESS_NEED_NEGOTIATE);
 }
 
+static inline void ksmbd_conn_set_need_setup(struct ksmbd_conn *conn)
+{
+	WRITE_ONCE(conn->status, KSMBD_SESS_NEED_SETUP);
+}
+
 static inline void ksmbd_conn_set_need_reconnect(struct ksmbd_conn *conn)
 {
 	WRITE_ONCE(conn->status, KSMBD_SESS_NEED_RECONNECT);
diff --git a/fs/smb/server/ksmbd_work.c b/fs/smb/server/ksmbd_work.c
index 4af2e6007c29..72b00ca6e455 100644
--- a/fs/smb/server/ksmbd_work.c
+++ b/fs/smb/server/ksmbd_work.c
@@ -26,7 +26,6 @@ struct ksmbd_work *ksmbd_alloc_work_struct(void)
 		INIT_LIST_HEAD(&work->request_entry);
 		INIT_LIST_HEAD(&work->async_request_entry);
 		INIT_LIST_HEAD(&work->fp_entry);
-		INIT_LIST_HEAD(&work->interim_entry);
 		INIT_LIST_HEAD(&work->aux_read_list);
 		work->iov_alloc_cnt = 4;
 		work->iov = kcalloc(work->iov_alloc_cnt, sizeof(struct kvec),
@@ -56,8 +55,6 @@ void
ksmbd_free_work_struct(struct ksmbd_work *work) kfree(work->tr_buf); kvfree(work->request_buf); kfree(work->iov); - if (!list_empty(&work->interim_entry)) - list_del(&work->interim_entry); if (work->async_id) ksmbd_release_id(&work->conn->async_ida, work->async_id); diff --git a/fs/smb/server/ksmbd_work.h b/fs/smb/server/ksmbd_work.h index 8ca2c813246e..d36393ff8310 100644 --- a/fs/smb/server/ksmbd_work.h +++ b/fs/smb/server/ksmbd_work.h @@ -89,7 +89,6 @@ struct ksmbd_work { /* List head at conn->async_requests */ struct list_head async_request_entry; struct list_head fp_entry; - struct list_head interim_entry; }; /** diff --git a/fs/smb/server/mgmt/user_session.c b/fs/smb/server/mgmt/user_session.c index 71c6939dfbf1..9dec4c2940bc 100644 --- a/fs/smb/server/mgmt/user_session.c +++ b/fs/smb/server/mgmt/user_session.c @@ -59,10 +59,12 @@ static void ksmbd_session_rpc_clear_list(struct ksmbd_session *sess) struct ksmbd_session_rpc *entry; long index; + down_write(&sess->rpc_lock); xa_for_each(&sess->rpc_handle_list, index, entry) { xa_erase(&sess->rpc_handle_list, index); __session_rpc_close(sess, entry); } + up_write(&sess->rpc_lock); xa_destroy(&sess->rpc_handle_list); } @@ -92,7 +94,7 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) { struct ksmbd_session_rpc *entry, *old; struct ksmbd_rpc_command *resp; - int method; + int method, id; method = __rpc_method(rpc_name); if (!method) @@ -102,26 +104,29 @@ int ksmbd_session_rpc_open(struct ksmbd_session *sess, char *rpc_name) if (!entry) return -ENOMEM; + down_read(&sess->rpc_lock); entry->method = method; - entry->id = ksmbd_ipc_id_alloc(); - if (entry->id < 0) + entry->id = id = ksmbd_ipc_id_alloc(); + if (id < 0) goto free_entry; - old = xa_store(&sess->rpc_handle_list, entry->id, entry, KSMBD_DEFAULT_GFP); + old = xa_store(&sess->rpc_handle_list, id, entry, KSMBD_DEFAULT_GFP); if (xa_is_err(old)) goto free_id; - resp = ksmbd_rpc_open(sess, entry->id); + resp = ksmbd_rpc_open(sess, id); if (!resp) goto erase_xa; + up_read(&sess->rpc_lock); kvfree(resp); - return entry->id; + return id; erase_xa: xa_erase(&sess->rpc_handle_list, entry->id); free_id: ksmbd_rpc_id_free(entry->id); free_entry: kfree(entry); + up_read(&sess->rpc_lock); return -EINVAL; } @@ -129,9 +134,11 @@ void ksmbd_session_rpc_close(struct ksmbd_session *sess, int id) { struct ksmbd_session_rpc *entry; + down_write(&sess->rpc_lock); entry = xa_erase(&sess->rpc_handle_list, id); if (entry) __session_rpc_close(sess, entry); + up_write(&sess->rpc_lock); } int ksmbd_session_rpc_method(struct ksmbd_session *sess, int id) @@ -181,7 +188,7 @@ static void ksmbd_expire_session(struct ksmbd_conn *conn) down_write(&sessions_table_lock); down_write(&conn->session_lock); xa_for_each(&conn->sessions, id, sess) { - if (atomic_read(&sess->refcnt) == 0 && + if (atomic_read(&sess->refcnt) <= 1 && (sess->state != SMB2_SESSION_VALID || time_after(jiffies, sess->last_active + SMB2_SESSION_TIMEOUT))) { @@ -230,7 +237,11 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn) if (!ksmbd_chann_del(conn, sess) && xa_empty(&sess->ksmbd_chann_list)) { hash_del(&sess->hlist); - ksmbd_session_destroy(sess); + down_write(&conn->session_lock); + xa_erase(&conn->sessions, sess->id); + up_write(&conn->session_lock); + if (atomic_dec_and_test(&sess->refcnt)) + ksmbd_session_destroy(sess); } } } @@ -249,13 +260,30 @@ void ksmbd_sessions_deregister(struct ksmbd_conn *conn) if (xa_empty(&sess->ksmbd_chann_list)) { xa_erase(&conn->sessions, sess->id); hash_del(&sess->hlist); - 
ksmbd_session_destroy(sess); + if (atomic_dec_and_test(&sess->refcnt)) + ksmbd_session_destroy(sess); } } up_write(&conn->session_lock); up_write(&sessions_table_lock); } +bool is_ksmbd_session_in_connection(struct ksmbd_conn *conn, + unsigned long long id) +{ + struct ksmbd_session *sess; + + down_read(&conn->session_lock); + sess = xa_load(&conn->sessions, id); + if (sess) { + up_read(&conn->session_lock); + return true; + } + up_read(&conn->session_lock); + + return false; +} + struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn, unsigned long long id) { @@ -309,8 +337,8 @@ void ksmbd_user_session_put(struct ksmbd_session *sess) if (atomic_read(&sess->refcnt) <= 0) WARN_ON(1); - else - atomic_dec(&sess->refcnt); + else if (atomic_dec_and_test(&sess->refcnt)) + ksmbd_session_destroy(sess); } struct preauth_session *ksmbd_preauth_session_alloc(struct ksmbd_conn *conn, @@ -353,13 +381,13 @@ void destroy_previous_session(struct ksmbd_conn *conn, ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_RECONNECT); err = ksmbd_conn_wait_idle_sess_id(conn, id); if (err) { - ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_NEGOTIATE); + ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_SETUP); goto out; } ksmbd_destroy_file_table(&prev_sess->file_table); prev_sess->state = SMB2_SESSION_EXPIRED; - ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_NEGOTIATE); + ksmbd_all_conn_set_status(id, KSMBD_SESS_NEED_SETUP); ksmbd_launch_ksmbd_durable_scavenger(); out: up_write(&conn->session_lock); @@ -417,7 +445,8 @@ static struct ksmbd_session *__session_create(int protocol) xa_init(&sess->rpc_handle_list); sess->sequence_number = 1; rwlock_init(&sess->tree_conns_lock); - atomic_set(&sess->refcnt, 1); + atomic_set(&sess->refcnt, 2); + init_rwsem(&sess->rpc_lock); ret = __init_smb2_session(sess); if (ret) diff --git a/fs/smb/server/mgmt/user_session.h b/fs/smb/server/mgmt/user_session.h index c1c4b20bd5c6..c5749d6ec715 100644 --- a/fs/smb/server/mgmt/user_session.h +++ b/fs/smb/server/mgmt/user_session.h @@ -63,6 +63,7 @@ struct ksmbd_session { rwlock_t tree_conns_lock; atomic_t refcnt; + struct rw_semaphore rpc_lock; }; static inline int test_session_flag(struct ksmbd_session *sess, int bit) @@ -87,6 +88,8 @@ void ksmbd_session_destroy(struct ksmbd_session *sess); struct ksmbd_session *ksmbd_session_lookup_slowpath(unsigned long long id); struct ksmbd_session *ksmbd_session_lookup(struct ksmbd_conn *conn, unsigned long long id); +bool is_ksmbd_session_in_connection(struct ksmbd_conn *conn, + unsigned long long id); int ksmbd_session_register(struct ksmbd_conn *conn, struct ksmbd_session *sess); void ksmbd_sessions_deregister(struct ksmbd_conn *conn); diff --git a/fs/smb/server/oplock.c b/fs/smb/server/oplock.c index 3a3fe4afbdf0..d7a8a580d013 100644 --- a/fs/smb/server/oplock.c +++ b/fs/smb/server/oplock.c @@ -46,7 +46,6 @@ static struct oplock_info *alloc_opinfo(struct ksmbd_work *work, opinfo->fid = id; opinfo->Tid = Tid; INIT_LIST_HEAD(&opinfo->op_entry); - INIT_LIST_HEAD(&opinfo->interim_list); init_waitqueue_head(&opinfo->oplock_q); init_waitqueue_head(&opinfo->oplock_brk); atomic_set(&opinfo->refcount, 1); @@ -130,14 +129,6 @@ static void free_opinfo(struct oplock_info *opinfo) kfree(opinfo); } -static inline void opinfo_free_rcu(struct rcu_head *rcu_head) -{ - struct oplock_info *opinfo; - - opinfo = container_of(rcu_head, struct oplock_info, rcu_head); - free_opinfo(opinfo); -} - struct oplock_info *opinfo_get(struct ksmbd_file *fp) { struct oplock_info *opinfo; @@ -155,12 +146,9 @@ static struct 
oplock_info *opinfo_get_list(struct ksmbd_inode *ci) { struct oplock_info *opinfo; - if (list_empty(&ci->m_op_list)) - return NULL; - - rcu_read_lock(); - opinfo = list_first_or_null_rcu(&ci->m_op_list, struct oplock_info, - op_entry); + down_read(&ci->m_lock); + opinfo = list_first_entry_or_null(&ci->m_op_list, struct oplock_info, + op_entry); if (opinfo) { if (opinfo->conn == NULL || !atomic_inc_not_zero(&opinfo->refcount)) @@ -172,8 +160,7 @@ static struct oplock_info *opinfo_get_list(struct ksmbd_inode *ci) } } } - - rcu_read_unlock(); + up_read(&ci->m_lock); return opinfo; } @@ -186,7 +173,7 @@ void opinfo_put(struct oplock_info *opinfo) if (!atomic_dec_and_test(&opinfo->refcount)) return; - call_rcu(&opinfo->rcu_head, opinfo_free_rcu); + free_opinfo(opinfo); } static void opinfo_add(struct oplock_info *opinfo) @@ -194,7 +181,7 @@ static void opinfo_add(struct oplock_info *opinfo) struct ksmbd_inode *ci = opinfo->o_fp->f_ci; down_write(&ci->m_lock); - list_add_rcu(&opinfo->op_entry, &ci->m_op_list); + list_add(&opinfo->op_entry, &ci->m_op_list); up_write(&ci->m_lock); } @@ -208,7 +195,7 @@ static void opinfo_del(struct oplock_info *opinfo) write_unlock(&lease_list_lock); } down_write(&ci->m_lock); - list_del_rcu(&opinfo->op_entry); + list_del(&opinfo->op_entry); up_write(&ci->m_lock); } @@ -635,6 +622,7 @@ static void __smb2_oplock_break_noti(struct work_struct *wk) { struct smb2_oplock_break *rsp = NULL; struct ksmbd_work *work = container_of(wk, struct ksmbd_work, work); + struct ksmbd_conn *conn = work->conn; struct oplock_break_info *br_info = work->request_buf; struct smb2_hdr *rsp_hdr; struct ksmbd_file *fp; @@ -690,6 +678,7 @@ static void __smb2_oplock_break_noti(struct work_struct *wk) out: ksmbd_free_work_struct(work); + ksmbd_conn_r_count_dec(conn); } /** @@ -723,6 +712,7 @@ static int smb2_oplock_break_noti(struct oplock_info *opinfo) work->conn = conn; work->sess = opinfo->sess; + ksmbd_conn_r_count_inc(conn); if (opinfo->op_state == OPLOCK_ACK_WAIT) { INIT_WORK(&work->work, __smb2_oplock_break_noti); ksmbd_queue_work(work); @@ -745,6 +735,7 @@ static void __smb2_lease_break_noti(struct work_struct *wk) { struct smb2_lease_break *rsp = NULL; struct ksmbd_work *work = container_of(wk, struct ksmbd_work, work); + struct ksmbd_conn *conn = work->conn; struct lease_break_info *br_info = work->request_buf; struct smb2_hdr *rsp_hdr; @@ -791,6 +782,7 @@ static void __smb2_lease_break_noti(struct work_struct *wk) out: ksmbd_free_work_struct(work); + ksmbd_conn_r_count_dec(conn); } /** @@ -803,7 +795,6 @@ out: static int smb2_lease_break_noti(struct oplock_info *opinfo) { struct ksmbd_conn *conn = opinfo->conn; - struct list_head *tmp, *t; struct ksmbd_work *work; struct lease_break_info *br_info; struct lease *lease = opinfo->o_lease; @@ -830,17 +821,8 @@ static int smb2_lease_break_noti(struct oplock_info *opinfo) work->conn = conn; work->sess = opinfo->sess; + ksmbd_conn_r_count_inc(conn); if (opinfo->op_state == OPLOCK_ACK_WAIT) { - list_for_each_safe(tmp, t, &opinfo->interim_list) { - struct ksmbd_work *in_work; - - in_work = list_entry(tmp, struct ksmbd_work, - interim_entry); - setup_async_work(in_work, NULL, NULL); - smb2_send_interim_resp(in_work, STATUS_PENDING); - list_del_init(&in_work->interim_entry); - release_async_work(in_work); - } INIT_WORK(&work->work, __smb2_lease_break_noti); ksmbd_queue_work(work); wait_for_break_ack(opinfo); @@ -871,7 +853,8 @@ static void wait_lease_breaking(struct oplock_info *opinfo) } } -static int oplock_break(struct oplock_info 
*brk_opinfo, int req_op_level) +static int oplock_break(struct oplock_info *brk_opinfo, int req_op_level, + struct ksmbd_work *in_work) { int err = 0; @@ -914,9 +897,15 @@ static int oplock_break(struct oplock_info *brk_opinfo, int req_op_level) } if (lease->state & (SMB2_LEASE_WRITE_CACHING_LE | - SMB2_LEASE_HANDLE_CACHING_LE)) + SMB2_LEASE_HANDLE_CACHING_LE)) { + if (in_work) { + setup_async_work(in_work, NULL, NULL); + smb2_send_interim_resp(in_work, STATUS_PENDING); + release_async_work(in_work); + } + brk_opinfo->op_state = OPLOCK_ACK_WAIT; - else + } else atomic_dec(&brk_opinfo->breaking_cnt); } else { err = oplock_break_pending(brk_opinfo, req_op_level); @@ -1116,7 +1105,7 @@ void smb_send_parent_lease_break_noti(struct ksmbd_file *fp, if (ksmbd_conn_releasing(opinfo->conn)) continue; - oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE); + oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE, NULL); opinfo_put(opinfo); } } @@ -1152,7 +1141,7 @@ void smb_lazy_parent_lease_break_close(struct ksmbd_file *fp) if (ksmbd_conn_releasing(opinfo->conn)) continue; - oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE); + oplock_break(opinfo, SMB2_OPLOCK_LEVEL_NONE, NULL); opinfo_put(opinfo); } } @@ -1252,8 +1241,7 @@ int smb_grant_oplock(struct ksmbd_work *work, int req_op_level, u64 pid, goto op_break_not_needed; } - list_add(&work->interim_entry, &prev_opinfo->interim_list); - err = oplock_break(prev_opinfo, SMB2_OPLOCK_LEVEL_II); + err = oplock_break(prev_opinfo, SMB2_OPLOCK_LEVEL_II, work); opinfo_put(prev_opinfo); if (err == -ENOENT) goto set_lev; @@ -1322,8 +1310,7 @@ static void smb_break_all_write_oplock(struct ksmbd_work *work, } brk_opinfo->open_trunc = is_trunc; - list_add(&work->interim_entry, &brk_opinfo->interim_list); - oplock_break(brk_opinfo, SMB2_OPLOCK_LEVEL_II); + oplock_break(brk_opinfo, SMB2_OPLOCK_LEVEL_II, work); opinfo_put(brk_opinfo); } @@ -1348,8 +1335,8 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp, ci = fp->f_ci; op = opinfo_get(fp); - rcu_read_lock(); - list_for_each_entry_rcu(brk_op, &ci->m_op_list, op_entry) { + down_read(&ci->m_lock); + list_for_each_entry(brk_op, &ci->m_op_list, op_entry) { if (brk_op->conn == NULL) continue; @@ -1359,7 +1346,6 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp, if (ksmbd_conn_releasing(brk_op->conn)) continue; - rcu_read_unlock(); if (brk_op->is_lease && (brk_op->o_lease->state & (~(SMB2_LEASE_READ_CACHING_LE | SMB2_LEASE_HANDLE_CACHING_LE)))) { @@ -1386,12 +1372,11 @@ void smb_break_all_levII_oplock(struct ksmbd_work *work, struct ksmbd_file *fp, SMB2_LEASE_KEY_SIZE)) goto next; brk_op->open_trunc = is_trunc; - oplock_break(brk_op, SMB2_OPLOCK_LEVEL_NONE); + oplock_break(brk_op, SMB2_OPLOCK_LEVEL_NONE, NULL); next: opinfo_put(brk_op); - rcu_read_lock(); } - rcu_read_unlock(); + up_read(&ci->m_lock); if (op) opinfo_put(op); @@ -1506,6 +1491,10 @@ struct lease_ctx_info *parse_lease_state(void *open_req) if (sizeof(struct lease_context_v2) == le32_to_cpu(cc->DataLength)) { struct create_lease_v2 *lc = (struct create_lease_v2 *)cc; + if (le16_to_cpu(cc->DataOffset) + le32_to_cpu(cc->DataLength) < + sizeof(struct create_lease_v2) - 4) + goto err_out; + memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE); lreq->req_state = lc->lcontext.LeaseState; lreq->flags = lc->lcontext.LeaseFlags; @@ -1518,6 +1507,10 @@ struct lease_ctx_info *parse_lease_state(void *open_req) } else { struct create_lease *lc = (struct create_lease *)cc; + if (le16_to_cpu(cc->DataOffset) + 
le32_to_cpu(cc->DataLength) < + sizeof(struct create_lease)) + goto err_out; + memcpy(lreq->lease_key, lc->lcontext.LeaseKey, SMB2_LEASE_KEY_SIZE); lreq->req_state = lc->lcontext.LeaseState; lreq->flags = lc->lcontext.LeaseFlags; @@ -1525,6 +1518,9 @@ struct lease_ctx_info *parse_lease_state(void *open_req) lreq->version = 1; } return lreq; +err_out: + kfree(lreq); + return NULL; } /** diff --git a/fs/smb/server/oplock.h b/fs/smb/server/oplock.h index 72bc88a63a40..9a56eaadd0dd 100644 --- a/fs/smb/server/oplock.h +++ b/fs/smb/server/oplock.h @@ -67,12 +67,10 @@ struct oplock_info { bool is_lease; bool open_trunc; /* truncate on open */ struct lease *o_lease; - struct list_head interim_list; struct list_head op_entry; struct list_head lease_entry; wait_queue_head_t oplock_q; /* Other server threads */ wait_queue_head_t oplock_brk; /* oplock breaking wait */ - struct rcu_head rcu_head; }; struct lease_break_info { diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index 601e7fcbcf1e..ab533c602987 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -270,17 +270,7 @@ static void handle_ksmbd_work(struct work_struct *wk) ksmbd_conn_try_dequeue_request(work); ksmbd_free_work_struct(work); - /* - * Checking waitqueue to dropping pending requests on - * disconnection. waitqueue_active is safe because it - * uses atomic operation for condition. - */ - atomic_inc(&conn->refcnt); - if (!atomic_dec_return(&conn->r_count) && waitqueue_active(&conn->r_count_q)) - wake_up(&conn->r_count_q); - - if (atomic_dec_and_test(&conn->refcnt)) - kfree(conn); + ksmbd_conn_r_count_dec(conn); } /** @@ -310,7 +300,7 @@ static int queue_ksmbd_work(struct ksmbd_conn *conn) conn->request_buf = NULL; ksmbd_conn_enqueue_request(work); - atomic_inc(&conn->r_count); + ksmbd_conn_r_count_inc(conn); /* update activity on connection */ conn->last_active = jiffies; INIT_WORK(&work->work, handle_ksmbd_work); diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index f1efcd027475..8d414239b3fe 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -633,6 +633,11 @@ smb2_get_name(const char *src, const int maxlen, struct nls_table *local_nls) return name; } + if (*name == '\0') { + kfree(name); + return ERR_PTR(-EINVAL); + } + if (*name == '\\') { pr_err("not allow directory name included leading slash\n"); kfree(name); @@ -1249,7 +1254,7 @@ int smb2_handle_negotiate(struct ksmbd_work *work) } conn->srv_sec_mode = le16_to_cpu(rsp->SecurityMode); - ksmbd_conn_set_need_negotiate(conn); + ksmbd_conn_set_need_setup(conn); err_out: ksmbd_conn_unlock(conn); @@ -1271,6 +1276,9 @@ static int alloc_preauth_hash(struct ksmbd_session *sess, if (sess->Preauth_HashValue) return 0; + if (!conn->preauth_info) + return -ENOMEM; + sess->Preauth_HashValue = kmemdup(conn->preauth_info->Preauth_HashValue, PREAUTH_HASHVALUE_SIZE, KSMBD_DEFAULT_GFP); if (!sess->Preauth_HashValue) @@ -1442,7 +1450,7 @@ static int ntlm_authenticate(struct ksmbd_work *work, { struct ksmbd_conn *conn = work->conn; struct ksmbd_session *sess = work->sess; - struct channel *chann = NULL; + struct channel *chann = NULL, *old; struct ksmbd_user *user; u64 prev_id; int sz, rc; @@ -1554,7 +1562,12 @@ binding_session: return -ENOMEM; chann->conn = conn; - xa_store(&sess->ksmbd_chann_list, (long)conn, chann, KSMBD_DEFAULT_GFP); + old = xa_store(&sess->ksmbd_chann_list, (long)conn, chann, + KSMBD_DEFAULT_GFP); + if (xa_is_err(old)) { + kfree(chann); + return xa_err(old); + } } } @@ -1599,9 +1612,6 @@ static int 
krb5_authenticate(struct ksmbd_work *work, if (prev_sess_id && prev_sess_id != sess->id) destroy_previous_session(conn, sess->user, prev_sess_id); - if (sess->state == SMB2_SESSION_VALID) - ksmbd_free_user(sess->user); - retval = ksmbd_krb5_authenticate(sess, in_blob, in_len, out_blob, &out_len); if (retval) { @@ -1674,6 +1684,11 @@ int smb2_sess_setup(struct ksmbd_work *work) ksmbd_debug(SMB, "Received smb2 session setup request\n"); + if (!ksmbd_conn_need_setup(conn) && !ksmbd_conn_good(conn)) { + work->send_no_response = 1; + return rc; + } + WORK_BUFFERS(work, req, rsp); rsp->StructureSize = cpu_to_le16(9); @@ -1707,44 +1722,38 @@ int smb2_sess_setup(struct ksmbd_work *work) if (conn->dialect != sess->dialect) { rc = -EINVAL; - ksmbd_user_session_put(sess); goto out_err; } if (!(req->hdr.Flags & SMB2_FLAGS_SIGNED)) { rc = -EINVAL; - ksmbd_user_session_put(sess); goto out_err; } if (strncmp(conn->ClientGUID, sess->ClientGUID, SMB2_CLIENT_GUID_SIZE)) { rc = -ENOENT; - ksmbd_user_session_put(sess); goto out_err; } if (sess->state == SMB2_SESSION_IN_PROGRESS) { rc = -EACCES; - ksmbd_user_session_put(sess); goto out_err; } if (sess->state == SMB2_SESSION_EXPIRED) { rc = -EFAULT; - ksmbd_user_session_put(sess); goto out_err; } - ksmbd_user_session_put(sess); if (ksmbd_conn_need_reconnect(conn)) { rc = -EFAULT; + ksmbd_user_session_put(sess); sess = NULL; goto out_err; } - sess = ksmbd_session_lookup(conn, sess_id); - if (!sess) { + if (is_ksmbd_session_in_connection(conn, sess_id)) { rc = -EACCES; goto out_err; } @@ -1910,10 +1919,12 @@ out_err: sess->last_active = jiffies; sess->state = SMB2_SESSION_EXPIRED; + ksmbd_user_session_put(sess); + work->sess = NULL; if (try_delay) { ksmbd_conn_set_need_reconnect(conn); ssleep(5); - ksmbd_conn_set_need_negotiate(conn); + ksmbd_conn_set_need_setup(conn); } } smb2_set_err_rsp(work); @@ -2239,14 +2250,11 @@ int smb2_session_logoff(struct ksmbd_work *work) return -ENOENT; } - ksmbd_destroy_file_table(&sess->file_table); down_write(&conn->session_lock); sess->state = SMB2_SESSION_EXPIRED; up_write(&conn->session_lock); - ksmbd_free_user(sess->user); - sess->user = NULL; - ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_NEGOTIATE); + ksmbd_all_conn_set_status(sess_id, KSMBD_SESS_NEED_SETUP); rsp->StructureSize = cpu_to_le16(4); err = ksmbd_iov_pin_rsp(work, rsp, sizeof(struct smb2_logoff_rsp)); @@ -2708,6 +2716,13 @@ static int parse_durable_handle_context(struct ksmbd_work *work, goto out; } + if (le16_to_cpu(context->DataOffset) + + le32_to_cpu(context->DataLength) < + sizeof(struct create_durable_reconn_v2_req)) { + err = -EINVAL; + goto out; + } + recon_v2 = (struct create_durable_reconn_v2_req *)context; persistent_id = recon_v2->Fid.PersistentFileId; dh_info->fp = ksmbd_lookup_durable_fd(persistent_id); @@ -2741,6 +2756,13 @@ static int parse_durable_handle_context(struct ksmbd_work *work, goto out; } + if (le16_to_cpu(context->DataOffset) + + le32_to_cpu(context->DataLength) < + sizeof(struct create_durable_reconn_req)) { + err = -EINVAL; + goto out; + } + recon = (struct create_durable_reconn_req *)context; persistent_id = recon->Data.Fid.PersistentFileId; dh_info->fp = ksmbd_lookup_durable_fd(persistent_id); @@ -2766,6 +2788,13 @@ static int parse_durable_handle_context(struct ksmbd_work *work, goto out; } + if (le16_to_cpu(context->DataOffset) + + le32_to_cpu(context->DataLength) < + sizeof(struct create_durable_req_v2)) { + err = -EINVAL; + goto out; + } + durable_v2_blob = (struct create_durable_req_v2 *)context; ksmbd_debug(SMB, "Request 
for durable v2 open\n"); @@ -4091,9 +4120,10 @@ static int process_query_dir_entries(struct smb2_query_dir_private *priv) return -EINVAL; lock_dir(priv->dir_fp); - dent = lookup_one(idmap, priv->d_info->name, - priv->dir_fp->filp->f_path.dentry, - priv->d_info->name_len); + dent = lookup_one(idmap, + &QSTR_LEN(priv->d_info->name, + priv->d_info->name_len), + priv->dir_fp->filp->f_path.dentry); unlock_dir(priv->dir_fp); if (IS_ERR(dent)) { @@ -7458,17 +7488,17 @@ out_check_cl: } no_check_cl: + flock = smb_lock->fl; + list_del(&smb_lock->llist); + if (smb_lock->zero_len) { err = 0; goto skip; } - - flock = smb_lock->fl; - list_del(&smb_lock->llist); retry: rc = vfs_lock_file(filp, smb_lock->cmd, flock, NULL); skip: - if (flags & SMB2_LOCKFLAG_UNLOCK) { + if (smb_lock->flags & SMB2_LOCKFLAG_UNLOCK) { if (!rc) { ksmbd_debug(SMB, "File unlocked\n"); } else if (rc == -ENOENT) { diff --git a/fs/smb/server/smb_common.h b/fs/smb/server/smb_common.h index a3d8a905b07e..d742ba754348 100644 --- a/fs/smb/server/smb_common.h +++ b/fs/smb/server/smb_common.h @@ -72,6 +72,8 @@ #define FILE_SUPPORTS_ENCRYPTION 0x00020000 #define FILE_SUPPORTS_OBJECT_IDS 0x00010000 #define FILE_VOLUME_IS_COMPRESSED 0x00008000 +#define FILE_SUPPORTS_POSIX_UNLINK_RENAME 0x00000400 +#define FILE_RETURNS_CLEANUP_RESULT_INFO 0x00000200 #define FILE_SUPPORTS_REMOTE_STORAGE 0x00000100 #define FILE_SUPPORTS_REPARSE_POINTS 0x00000080 #define FILE_SUPPORTS_SPARSE_FILES 0x00000040 diff --git a/fs/smb/server/smbacl.c b/fs/smb/server/smbacl.c index d39d3e553366..5aa7a66334d9 100644 --- a/fs/smb/server/smbacl.c +++ b/fs/smb/server/smbacl.c @@ -270,6 +270,11 @@ static int sid_to_id(struct mnt_idmap *idmap, return -EIO; } + if (psid->num_subauth == 0) { + pr_err("%s: zero subauthorities!\n", __func__); + return -EIO; + } + if (sidtype == SIDOWNER) { kuid_t uid; uid_t id; @@ -333,7 +338,7 @@ void posix_state_to_acl(struct posix_acl_state *state, pace->e_perm = state->other.allow; } -int init_acl_state(struct posix_acl_state *state, int cnt) +int init_acl_state(struct posix_acl_state *state, u16 cnt) { int alloc; @@ -368,7 +373,7 @@ static void parse_dacl(struct mnt_idmap *idmap, struct smb_fattr *fattr) { int i, ret; - int num_aces = 0; + u16 num_aces = 0; unsigned int acl_size; char *acl_base; struct smb_ace **ppace; @@ -389,16 +394,18 @@ static void parse_dacl(struct mnt_idmap *idmap, ksmbd_debug(SMB, "DACL revision %d size %d num aces %d\n", le16_to_cpu(pdacl->revision), le16_to_cpu(pdacl->size), - le32_to_cpu(pdacl->num_aces)); + le16_to_cpu(pdacl->num_aces)); acl_base = (char *)pdacl; acl_size = sizeof(struct smb_acl); - num_aces = le32_to_cpu(pdacl->num_aces); + num_aces = le16_to_cpu(pdacl->num_aces); if (num_aces <= 0) return; - if (num_aces > ULONG_MAX / sizeof(struct smb_ace *)) + if (num_aces > (le16_to_cpu(pdacl->size) - sizeof(struct smb_acl)) / + (offsetof(struct smb_ace, sid) + + offsetof(struct smb_sid, sub_auth) + sizeof(__le16))) return; ret = init_acl_state(&acl_state, num_aces); @@ -432,6 +439,7 @@ static void parse_dacl(struct mnt_idmap *idmap, offsetof(struct smb_sid, sub_auth); if (end_of_acl - acl_base < acl_size || + ppace[i]->sid.num_subauth == 0 || ppace[i]->sid.num_subauth > SID_MAX_SUB_AUTHORITIES || (end_of_acl - acl_base < acl_size + sizeof(__le32) * ppace[i]->sid.num_subauth) || @@ -580,7 +588,7 @@ static void parse_dacl(struct mnt_idmap *idmap, static void set_posix_acl_entries_dacl(struct mnt_idmap *idmap, struct smb_ace *pndace, - struct smb_fattr *fattr, u32 *num_aces, + struct smb_fattr *fattr, u16 
*num_aces, u16 *size, u32 nt_aces_num) { struct posix_acl_entry *pace; @@ -701,7 +709,7 @@ static void set_ntacl_dacl(struct mnt_idmap *idmap, struct smb_fattr *fattr) { struct smb_ace *ntace, *pndace; - int nt_num_aces = le32_to_cpu(nt_dacl->num_aces), num_aces = 0; + u16 nt_num_aces = le16_to_cpu(nt_dacl->num_aces), num_aces = 0; unsigned short size = 0; int i; @@ -728,7 +736,7 @@ static void set_ntacl_dacl(struct mnt_idmap *idmap, set_posix_acl_entries_dacl(idmap, pndace, fattr, &num_aces, &size, nt_num_aces); - pndacl->num_aces = cpu_to_le32(num_aces); + pndacl->num_aces = cpu_to_le16(num_aces); pndacl->size = cpu_to_le16(le16_to_cpu(pndacl->size) + size); } @@ -736,7 +744,7 @@ static void set_mode_dacl(struct mnt_idmap *idmap, struct smb_acl *pndacl, struct smb_fattr *fattr) { struct smb_ace *pace, *pndace; - u32 num_aces = 0; + u16 num_aces = 0; u16 size = 0, ace_size = 0; uid_t uid; const struct smb_sid *sid; @@ -792,7 +800,7 @@ static void set_mode_dacl(struct mnt_idmap *idmap, fattr->cf_mode, 0007); out: - pndacl->num_aces = cpu_to_le32(num_aces); + pndacl->num_aces = cpu_to_le16(num_aces); pndacl->size = cpu_to_le16(le16_to_cpu(pndacl->size) + size); } @@ -807,6 +815,13 @@ static int parse_sid(struct smb_sid *psid, char *end_of_acl) return -EINVAL; } + if (!psid->num_subauth) + return 0; + + if (psid->num_subauth > SID_MAX_SUB_AUTHORITIES || + end_of_acl < (char *)psid + 8 + sizeof(__le32) * psid->num_subauth) + return -EINVAL; + return 0; } @@ -848,6 +863,9 @@ int parse_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd, pntsd->type = cpu_to_le16(DACL_PRESENT); if (pntsd->osidoffset) { + if (le32_to_cpu(pntsd->osidoffset) < sizeof(struct smb_ntsd)) + return -EINVAL; + rc = parse_sid(owner_sid_ptr, end_of_acl); if (rc) { pr_err("%s: Error %d parsing Owner SID\n", __func__, rc); @@ -863,6 +881,9 @@ int parse_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd, } if (pntsd->gsidoffset) { + if (le32_to_cpu(pntsd->gsidoffset) < sizeof(struct smb_ntsd)) + return -EINVAL; + rc = parse_sid(group_sid_ptr, end_of_acl); if (rc) { pr_err("%s: Error %d mapping Owner SID to gid\n", @@ -884,6 +905,9 @@ int parse_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd, pntsd->type |= cpu_to_le16(DACL_PROTECTED); if (dacloffset) { + if (dacloffset < sizeof(struct smb_ntsd)) + return -EINVAL; + parse_dacl(idmap, dacl_ptr, end_of_acl, owner_sid_ptr, group_sid_ptr, fattr); } @@ -1006,8 +1030,11 @@ int smb_inherit_dacl(struct ksmbd_conn *conn, struct smb_sid owner_sid, group_sid; struct dentry *parent = path->dentry->d_parent; struct mnt_idmap *idmap = mnt_idmap(path->mnt); - int inherited_flags = 0, flags = 0, i, ace_cnt = 0, nt_size = 0, pdacl_size; - int rc = 0, num_aces, dacloffset, pntsd_type, pntsd_size, acl_len, aces_size; + int inherited_flags = 0, flags = 0, i, nt_size = 0, pdacl_size; + int rc = 0, pntsd_type, pntsd_size, acl_len, aces_size; + unsigned int dacloffset; + size_t dacl_struct_end; + u16 num_aces, ace_cnt = 0; char *aces_base; bool is_dir = S_ISDIR(d_inode(path->dentry)->i_mode); @@ -1015,15 +1042,18 @@ int smb_inherit_dacl(struct ksmbd_conn *conn, parent, &parent_pntsd); if (pntsd_size <= 0) return -ENOENT; + dacloffset = le32_to_cpu(parent_pntsd->dacloffset); - if (!dacloffset || (dacloffset + sizeof(struct smb_acl) > pntsd_size)) { + if (!dacloffset || + check_add_overflow(dacloffset, sizeof(struct smb_acl), &dacl_struct_end) || + dacl_struct_end > (size_t)pntsd_size) { rc = -EINVAL; goto free_parent_pntsd; } parent_pdacl = (struct smb_acl *)((char *)parent_pntsd 
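
/*
 * [Editorial aside -- pattern sketch.] smb_inherit_dacl() here (and
 * smb_check_perm_dacl() below) stop writing "dacloffset + sizeof(struct
 * smb_acl) > pntsd_size" directly: with an attacker-chosen offset the
 * addition can wrap and slip under the bound. check_add_overflow() makes
 * the wrap itself the failure. Equivalent stand-alone check, using the
 * GCC/Clang builtin the kernel macro wraps:
 */
#include <stddef.h>
#include <stdbool.h>

static bool dacl_header_in_bounds(unsigned int dacloffset, size_t hdr_len,
				  size_t pntsd_size)
{
	size_t dacl_struct_end;

	if (__builtin_add_overflow((size_t)dacloffset, hdr_len, &dacl_struct_end))
		return false;	/* arithmetic wrapped: malformed descriptor */
	return dacl_struct_end <= pntsd_size;
}
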
+ dacloffset); acl_len = pntsd_size - dacloffset; - num_aces = le32_to_cpu(parent_pdacl->num_aces); + num_aces = le16_to_cpu(parent_pdacl->num_aces); pntsd_type = le16_to_cpu(parent_pntsd->type); pdacl_size = le16_to_cpu(parent_pdacl->size); @@ -1183,7 +1213,7 @@ pass: pdacl = (struct smb_acl *)((char *)pntsd + le32_to_cpu(pntsd->dacloffset)); pdacl->revision = cpu_to_le16(2); pdacl->size = cpu_to_le16(sizeof(struct smb_acl) + nt_size); - pdacl->num_aces = cpu_to_le32(ace_cnt); + pdacl->num_aces = cpu_to_le16(ace_cnt); pace = (struct smb_ace *)((char *)pdacl + sizeof(struct smb_acl)); memcpy(pace, aces_base, nt_size); pntsd_size += sizeof(struct smb_acl) + nt_size; @@ -1220,7 +1250,9 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, struct smb_ntsd *pntsd = NULL; struct smb_acl *pdacl; struct posix_acl *posix_acls; - int rc = 0, pntsd_size, acl_size, aces_size, pdacl_size, dacl_offset; + int rc = 0, pntsd_size, acl_size, aces_size, pdacl_size; + unsigned int dacl_offset; + size_t dacl_struct_end; struct smb_sid sid; int granted = le32_to_cpu(*pdaccess & ~FILE_MAXIMAL_ACCESS_LE); struct smb_ace *ace; @@ -1239,7 +1271,8 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, dacl_offset = le32_to_cpu(pntsd->dacloffset); if (!dacl_offset || - (dacl_offset + sizeof(struct smb_acl) > pntsd_size)) + check_add_overflow(dacl_offset, sizeof(struct smb_acl), &dacl_struct_end) || + dacl_struct_end > (size_t)pntsd_size) goto err_out; pdacl = (struct smb_acl *)((char *)pntsd + le32_to_cpu(pntsd->dacloffset)); @@ -1264,7 +1297,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, ace = (struct smb_ace *)((char *)pdacl + sizeof(struct smb_acl)); aces_size = acl_size - sizeof(struct smb_acl); - for (i = 0; i < le32_to_cpu(pdacl->num_aces); i++) { + for (i = 0; i < le16_to_cpu(pdacl->num_aces); i++) { if (offsetof(struct smb_ace, access_req) > aces_size) break; ace_size = le16_to_cpu(ace->size); @@ -1285,7 +1318,7 @@ int smb_check_perm_dacl(struct ksmbd_conn *conn, const struct path *path, ace = (struct smb_ace *)((char *)pdacl + sizeof(struct smb_acl)); aces_size = acl_size - sizeof(struct smb_acl); - for (i = 0; i < le32_to_cpu(pdacl->num_aces); i++) { + for (i = 0; i < le16_to_cpu(pdacl->num_aces); i++) { if (offsetof(struct smb_ace, access_req) > aces_size) break; ace_size = le16_to_cpu(ace->size); diff --git a/fs/smb/server/smbacl.h b/fs/smb/server/smbacl.h index 24ce576fc292..355adaee39b8 100644 --- a/fs/smb/server/smbacl.h +++ b/fs/smb/server/smbacl.h @@ -86,7 +86,7 @@ int parse_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd, int build_sec_desc(struct mnt_idmap *idmap, struct smb_ntsd *pntsd, struct smb_ntsd *ppntsd, int ppntsd_size, int addition_info, __u32 *secdesclen, struct smb_fattr *fattr); -int init_acl_state(struct posix_acl_state *state, int cnt); +int init_acl_state(struct posix_acl_state *state, u16 cnt); void free_acl_state(struct posix_acl_state *state); void posix_state_to_acl(struct posix_acl_state *state, struct posix_acl_entry *pace); diff --git a/fs/smb/server/transport_ipc.c b/fs/smb/server/transport_ipc.c index 0460ebea6ff0..2a3e2b0ce557 100644 --- a/fs/smb/server/transport_ipc.c +++ b/fs/smb/server/transport_ipc.c @@ -281,6 +281,7 @@ static int handle_response(int type, void *payload, size_t sz) if (entry->type + 1 != type) { pr_err("Waiting for IPC type %d, got %d. 
Ignore.\n", entry->type + 1, type); + continue; } entry->response = kvzalloc(sz, KSMBD_DEFAULT_GFP); @@ -309,7 +310,11 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) server_conf.signing = req->signing; server_conf.tcp_port = req->tcp_port; server_conf.ipc_timeout = req->ipc_timeout * HZ; - server_conf.deadtime = req->deadtime * SMB_ECHO_INTERVAL; + if (check_mul_overflow(req->deadtime, SMB_ECHO_INTERVAL, + &server_conf.deadtime)) { + ret = -EINVAL; + goto out; + } server_conf.share_fake_fscaps = req->share_fake_fscaps; ksmbd_init_domain(req->sub_auth); @@ -336,6 +341,7 @@ static int ipc_server_config_on_startup(struct ksmbd_startup_request *req) server_conf.bind_interfaces_only = req->bind_interfaces_only; ret |= ksmbd_tcp_set_interfaces(KSMBD_STARTUP_CONFIG_INTERFACES(req), req->ifc_list_sz); +out: if (ret) { pr_err("Server configuration error: %s %s %s\n", req->netbios_name, req->server_string, diff --git a/fs/smb/server/transport_rdma.c b/fs/smb/server/transport_rdma.c index c3785a5434f9..4998df04ab95 100644 --- a/fs/smb/server/transport_rdma.c +++ b/fs/smb/server/transport_rdma.c @@ -14,6 +14,7 @@ #include <linux/mempool.h> #include <linux/highmem.h> #include <linux/scatterlist.h> +#include <linux/string_choices.h> #include <rdma/ib_verbs.h> #include <rdma/rdma_cm.h> #include <rdma/rw.h> @@ -1396,7 +1397,7 @@ static int smb_direct_rdma_xmit(struct smb_direct_transport *t, } ksmbd_debug(RDMA, "RDMA %s, len %#x, needed credits %#x\n", - is_read ? "read" : "write", buf_len, credits_needed); + str_read_write(is_read), buf_len, credits_needed); ret = wait_for_rw_credits(t, credits_needed); if (ret < 0) @@ -2241,38 +2242,16 @@ bool ksmbd_rdma_capable_netdev(struct net_device *netdev) for (i = 0; i < smb_dev->ib_dev->phys_port_cnt; i++) { struct net_device *ndev; - if (smb_dev->ib_dev->ops.get_netdev) { - ndev = smb_dev->ib_dev->ops.get_netdev( - smb_dev->ib_dev, i + 1); - if (!ndev) - continue; + ndev = ib_device_get_netdev(smb_dev->ib_dev, i + 1); + if (!ndev) + continue; - if (ndev == netdev) { - dev_put(ndev); - rdma_capable = true; - goto out; - } + if (ndev == netdev) { dev_put(ndev); - /* if ib_dev does not implement ops.get_netdev - * check for matching infiniband GUID in hw_addr - */ - } else if (netdev->type == ARPHRD_INFINIBAND) { - struct netdev_hw_addr *ha; - union ib_gid gid; - u32 port_num; - int ret; - - netdev_hw_addr_list_for_each( - ha, &netdev->dev_addrs) { - memcpy(&gid, ha->addr + 4, sizeof(gid)); - ret = ib_find_gid(smb_dev->ib_dev, &gid, - &port_num, NULL); - if (!ret) { - rdma_capable = true; - goto out; - } - } + rdma_capable = true; + goto out; } + dev_put(ndev); } } out: @@ -2289,7 +2268,7 @@ out: } ksmbd_debug(RDMA, "netdev(%s) rdma capable : %s\n", - netdev->name, rdma_capable ? 
"true" : "false"); + netdev->name, str_true_false(rdma_capable)); return rdma_capable; } diff --git a/fs/smb/server/transport_tcp.c b/fs/smb/server/transport_tcp.c index 7f38a3c3f5bd..abedf510899a 100644 --- a/fs/smb/server/transport_tcp.c +++ b/fs/smb/server/transport_tcp.c @@ -93,17 +93,21 @@ static struct tcp_transport *alloc_transport(struct socket *client_sk) return t; } -static void free_transport(struct tcp_transport *t) +void ksmbd_free_transport(struct ksmbd_transport *kt) { - kernel_sock_shutdown(t->sock, SHUT_RDWR); - sock_release(t->sock); - t->sock = NULL; + struct tcp_transport *t = TCP_TRANS(kt); - ksmbd_conn_free(KSMBD_TRANS(t)->conn); + sock_release(t->sock); kfree(t->iov); kfree(t); } +static void free_transport(struct tcp_transport *t) +{ + kernel_sock_shutdown(t->sock, SHUT_RDWR); + ksmbd_conn_free(KSMBD_TRANS(t)->conn); +} + /** * kvec_array_init() - initialize a IO vector segment * @new: IO vector to be initialized diff --git a/fs/smb/server/transport_tcp.h b/fs/smb/server/transport_tcp.h index 8c9aa624cfe3..1e51675ee1b2 100644 --- a/fs/smb/server/transport_tcp.h +++ b/fs/smb/server/transport_tcp.h @@ -8,6 +8,7 @@ int ksmbd_tcp_set_interfaces(char *ifc_list, int ifc_list_sz); struct interface *ksmbd_find_netdev_name_iface_list(char *netdev_name); +void ksmbd_free_transport(struct ksmbd_transport *kt); int ksmbd_tcp_init(void); void ksmbd_tcp_destroy(void); diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index 6890016e1923..baf0d3031a44 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -113,11 +113,6 @@ static int ksmbd_vfs_path_lookup_locked(struct ksmbd_share_config *share_conf, if (IS_ERR(d)) goto err_out; - if (d_is_negative(d)) { - dput(d); - goto err_out; - } - path->dentry = d; path->mnt = mntget(parent_path->mnt); @@ -211,8 +206,8 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) { struct mnt_idmap *idmap; struct path path; - struct dentry *dentry; - int err; + struct dentry *dentry, *d; + int err = 0; dentry = ksmbd_vfs_kern_path_create(work, name, LOOKUP_NO_SYMLINKS | LOOKUP_DIRECTORY, @@ -227,27 +222,15 @@ int ksmbd_vfs_mkdir(struct ksmbd_work *work, const char *name, umode_t mode) idmap = mnt_idmap(path.mnt); mode |= S_IFDIR; - err = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode); - if (!err && d_unhashed(dentry)) { - struct dentry *d; - - d = lookup_one(idmap, dentry->d_name.name, dentry->d_parent, - dentry->d_name.len); - if (IS_ERR(d)) { - err = PTR_ERR(d); - goto out_err; - } - if (unlikely(d_is_negative(d))) { - dput(d); - err = -ENOENT; - goto out_err; - } - - ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(d)); - dput(d); - } + d = dentry; + dentry = vfs_mkdir(idmap, d_inode(path.dentry), dentry, mode); + if (IS_ERR(dentry)) + err = PTR_ERR(dentry); + else if (d_is_negative(dentry)) + err = -ENOENT; + if (!err && dentry != d) + ksmbd_vfs_inherit_owner(work, d_inode(path.dentry), d_inode(dentry)); -out_err: done_path_create(&path, dentry); if (err) pr_err("mkdir(%s): creation failed (err:%d)\n", name, err); @@ -426,10 +409,15 @@ static int ksmbd_vfs_stream_write(struct ksmbd_file *fp, char *buf, loff_t *pos, ksmbd_debug(VFS, "write stream data pos : %llu, count : %zd\n", *pos, count); + if (*pos >= XATTR_SIZE_MAX) { + pr_err("stream write position %lld is out of bounds\n", *pos); + return -EINVAL; + } + size = *pos + count; if (size > XATTR_SIZE_MAX) { size = XATTR_SIZE_MAX; - count = (*pos + count) - XATTR_SIZE_MAX; + count = XATTR_SIZE_MAX - *pos; } v_len = 
ksmbd_vfs_getcasexattr(idmap, @@ -496,7 +484,8 @@ int ksmbd_vfs_write(struct ksmbd_work *work, struct ksmbd_file *fp, int err = 0; if (work->conn->connection_type) { - if (!(fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE))) { + if (!(fp->daccess & (FILE_WRITE_DATA_LE | FILE_APPEND_DATA_LE)) || + S_ISDIR(file_inode(fp->filp)->i_mode)) { pr_err("no right to write(%pD)\n", fp->filp); err = -EACCES; goto out; @@ -693,6 +682,7 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path, struct ksmbd_file *parent_fp; int new_type; int err, lookup_flags = LOOKUP_NO_SYMLINKS; + int target_lookup_flags = LOOKUP_RENAME_TARGET | LOOKUP_CREATE; if (ksmbd_override_fsids(work)) return -ENOMEM; @@ -703,6 +693,14 @@ int ksmbd_vfs_rename(struct ksmbd_work *work, const struct path *old_path, goto revert_fsids; } + /* + * explicitly handle file overwrite case, for compatibility with + * filesystems that may not support rename flags (e.g: fuse) + */ + if (flags & RENAME_NOREPLACE) + target_lookup_flags |= LOOKUP_EXCL; + flags &= ~(RENAME_NOREPLACE); + retry: err = vfs_path_parent_lookup(to, lookup_flags | LOOKUP_BENEATH, &new_path, &new_last, &new_type, @@ -743,7 +741,7 @@ retry: } new_dentry = lookup_one_qstr_excl(&new_last, new_path.dentry, - lookup_flags | LOOKUP_RENAME_TARGET); + lookup_flags | target_lookup_flags); if (IS_ERR(new_dentry)) { err = PTR_ERR(new_dentry); goto out3; @@ -754,16 +752,6 @@ retry: goto out4; } - /* - * explicitly handle file overwrite case, for compatibility with - * filesystems that may not support rename flags (e.g: fuse) - */ - if ((flags & RENAME_NOREPLACE) && d_is_positive(new_dentry)) { - err = -EEXIST; - goto out4; - } - flags &= ~(RENAME_NOREPLACE); - if (old_child == trap) { err = -EINVAL; goto out4; diff --git a/fs/smb/server/vfs_cache.c b/fs/smb/server/vfs_cache.c index 8d1f30dcba7e..dfed6fce8904 100644 --- a/fs/smb/server/vfs_cache.c +++ b/fs/smb/server/vfs_cache.c @@ -661,21 +661,40 @@ __close_file_table_ids(struct ksmbd_file_table *ft, bool (*skip)(struct ksmbd_tree_connect *tcon, struct ksmbd_file *fp)) { - unsigned int id; - struct ksmbd_file *fp; - int num = 0; + struct ksmbd_file *fp; + unsigned int id = 0; + int num = 0; + + while (1) { + write_lock(&ft->lock); + fp = idr_get_next(ft->idr, &id); + if (!fp) { + write_unlock(&ft->lock); + break; + } - idr_for_each_entry(ft->idr, fp, id) { - if (skip(tcon, fp)) + if (skip(tcon, fp) || + !atomic_dec_and_test(&fp->refcount)) { + id++; + write_unlock(&ft->lock); continue; + } set_close_state_blocked_works(fp); + idr_remove(ft->idr, fp->volatile_id); + fp->volatile_id = KSMBD_NO_FID; + write_unlock(&ft->lock); + + down_write(&fp->f_ci->m_lock); + list_del_init(&fp->node); + up_write(&fp->f_ci->m_lock); - if (!atomic_dec_and_test(&fp->refcount)) - continue; __ksmbd_close_fd(ft, fp); + num++; + id++; } + return num; } @@ -713,12 +732,8 @@ static bool tree_conn_fd_check(struct ksmbd_tree_connect *tcon, static bool ksmbd_durable_scavenger_alive(void) { - mutex_lock(&durable_scavenger_lock); - if (!durable_scavenger_running) { - mutex_unlock(&durable_scavenger_lock); + if (!durable_scavenger_running) return false; - } - mutex_unlock(&durable_scavenger_lock); if (kthread_should_stop()) return false; @@ -799,9 +814,7 @@ static int ksmbd_durable_scavenger(void *dummy) break; } - mutex_lock(&durable_scavenger_lock); durable_scavenger_running = false; - mutex_unlock(&durable_scavenger_lock); module_put(THIS_MODULE); diff --git a/fs/splice.c b/fs/splice.c index 28cfa63aa236..4d6df083e0c0 100644 --- 
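
/*
 * [Editorial aside -- simplified model.] __close_file_table_ids() in the
 * vfs_cache.c hunk above no longer walks the IDR under one long lock hold:
 * it re-takes the lock per step, fetches the next populated id, unpublishes
 * the entry while still locked, then drops the lock for the heavyweight
 * close. Generic shape of that walk, with a 64-slot array standing in for
 * the IDR:
 */
#include <pthread.h>
#include <stddef.h>

struct table_sketch { pthread_mutex_t lock; void *slot[64]; };

/* first occupied slot with index >= *id, like idr_get_next() */
static void *get_next_slot(struct table_sketch *t, unsigned int *id)
{
	for (; *id < 64; (*id)++)
		if (t->slot[*id])
			return t->slot[*id];
	return NULL;
}

static int close_all(struct table_sketch *t, void (*slow_close)(void *))
{
	unsigned int id = 0;
	int num = 0;

	for (;;) {
		void *fp;

		pthread_mutex_lock(&t->lock);
		fp = get_next_slot(t, &id);
		if (!fp) {
			pthread_mutex_unlock(&t->lock);
			break;
		}
		t->slot[id] = NULL;		/* unpublish while locked */
		pthread_mutex_unlock(&t->lock);

		slow_close(fp);			/* heavy work with lock dropped */
		num++;
		id++;
	}
	return num;
}
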
a/fs/splice.c +++ b/fs/splice.c @@ -45,7 +45,7 @@ * here if set to avoid blocking other users of this pipe if splice is * being done on it. */ -static noinline void noinline pipe_clear_nowait(struct file *file) +static noinline void pipe_clear_nowait(struct file *file) { fmode_t fmode = READ_ONCE(file->f_mode); @@ -200,7 +200,6 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, unsigned int spd_pages = spd->nr_pages; unsigned int tail = pipe->tail; unsigned int head = pipe->head; - unsigned int mask = pipe->ring_size - 1; ssize_t ret = 0; int page_nr = 0; @@ -214,7 +213,7 @@ ssize_t splice_to_pipe(struct pipe_inode_info *pipe, } while (!pipe_full(head, tail, pipe->max_usage)) { - struct pipe_buffer *buf = &pipe->bufs[head & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, head); buf->page = spd->pages[page_nr]; buf->offset = spd->partial[page_nr].offset; @@ -247,7 +246,6 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) { unsigned int head = pipe->head; unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; int ret; if (unlikely(!pipe->readers)) { @@ -256,7 +254,7 @@ ssize_t add_to_pipe(struct pipe_inode_info *pipe, struct pipe_buffer *buf) } else if (pipe_full(head, tail, pipe->max_usage)) { ret = -EAGAIN; } else { - pipe->bufs[head & mask] = *buf; + *pipe_buf(pipe, head) = *buf; pipe->head = head + 1; return buf->len; } @@ -331,7 +329,7 @@ ssize_t copy_splice_read(struct file *in, loff_t *ppos, int i; /* Work out how much data we can actually add into the pipe */ - used = pipe_occupancy(pipe->head, pipe->tail); + used = pipe_buf_usage(pipe); npages = max_t(ssize_t, pipe->max_usage - used, 0); len = min_t(size_t, len, npages * PAGE_SIZE); npages = DIV_ROUND_UP(len, PAGE_SIZE); @@ -447,11 +445,10 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des { unsigned int head = pipe->head; unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; int ret; while (!pipe_empty(head, tail)) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); sd->len = buf->len; if (sd->len > sd->total_len) @@ -495,8 +492,7 @@ static int splice_from_pipe_feed(struct pipe_inode_info *pipe, struct splice_des static inline bool eat_empty_buffer(struct pipe_inode_info *pipe) { unsigned int tail = pipe->tail; - unsigned int mask = pipe->ring_size - 1; - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); if (unlikely(!buf->len)) { pipe_buf_release(pipe, buf); @@ -527,7 +523,7 @@ static int splice_from_pipe_next(struct pipe_inode_info *pipe, struct splice_des return -ERESTARTSYS; repeat: - while (pipe_empty(pipe->head, pipe->tail)) { + while (pipe_is_empty(pipe)) { if (!pipe->writers) return 0; @@ -690,7 +686,7 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, while (sd.total_len) { struct kiocb kiocb; struct iov_iter from; - unsigned int head, tail, mask; + unsigned int head, tail; size_t left; int n; @@ -711,12 +707,11 @@ iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, head = pipe->head; tail = pipe->tail; - mask = pipe->ring_size - 1; /* build the vector */ left = sd.total_len; for (n = 0; !pipe_empty(head, tail) && left && n < nbufs; tail++) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t this_len = buf->len; /* zero-length bvecs are not supported, skip them */ @@ -752,7 +747,7 @@ 
iter_file_splice_write(struct pipe_inode_info *pipe, struct file *out, /* dismiss the fully eaten buffers, adjust the partial one */ tail = pipe->tail; while (ret) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); if (ret >= buf->len) { ret -= buf->len; buf->len = 0; @@ -809,7 +804,7 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, pipe_lock(pipe); while (len > 0) { - unsigned int head, tail, mask, bc = 0; + unsigned int head, tail, bc = 0; size_t remain = len; /* @@ -820,7 +815,7 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, if (signal_pending(current)) break; - while (pipe_empty(pipe->head, pipe->tail)) { + while (pipe_is_empty(pipe)) { ret = 0; if (!pipe->writers) goto out; @@ -846,10 +841,9 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, head = pipe->head; tail = pipe->tail; - mask = pipe->ring_size - 1; while (!pipe_empty(head, tail)) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t seg; if (!buf->len) { @@ -894,7 +888,7 @@ ssize_t splice_to_socket(struct pipe_inode_info *pipe, struct file *out, len -= ret; tail = pipe->tail; while (ret > 0) { - struct pipe_buffer *buf = &pipe->bufs[tail & mask]; + struct pipe_buffer *buf = pipe_buf(pipe, tail); size_t seg = min_t(size_t, ret, buf->len); buf->offset += seg; @@ -968,7 +962,7 @@ static ssize_t do_splice_read(struct file *in, loff_t *ppos, return 0; /* Don't try to read more the pipe has space for. */ - p_space = pipe->max_usage - pipe_occupancy(pipe->head, pipe->tail); + p_space = pipe->max_usage - pipe_buf_usage(pipe); len = min_t(size_t, len, p_space << PAGE_SHIFT); if (unlikely(len > MAX_RW_COUNT)) @@ -1080,7 +1074,7 @@ ssize_t splice_direct_to_actor(struct file *in, struct splice_desc *sd, more = sd->flags & SPLICE_F_MORE; sd->flags |= SPLICE_F_MORE; - WARN_ON_ONCE(!pipe_empty(pipe->head, pipe->tail)); + WARN_ON_ONCE(!pipe_is_empty(pipe)); while (len) { size_t read_len; @@ -1268,7 +1262,7 @@ static int wait_for_space(struct pipe_inode_info *pipe, unsigned flags) send_sig(SIGPIPE, current, 0); return -EPIPE; } - if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (!pipe_is_full(pipe)) return 0; if (flags & SPLICE_F_NONBLOCK) return -EAGAIN; @@ -1652,13 +1646,13 @@ static int ipipe_prep(struct pipe_inode_info *pipe, unsigned int flags) * Check the pipe occupancy without the inode lock first. This function * is speculative anyways, so missing one is ok. */ - if (!pipe_empty(pipe->head, pipe->tail)) + if (!pipe_is_empty(pipe)) return 0; ret = 0; pipe_lock(pipe); - while (pipe_empty(pipe->head, pipe->tail)) { + while (pipe_is_empty(pipe)) { if (signal_pending(current)) { ret = -ERESTARTSYS; break; @@ -1688,13 +1682,13 @@ static int opipe_prep(struct pipe_inode_info *pipe, unsigned int flags) * Check pipe occupancy without the inode lock first. This function * is speculative anyways, so missing one is ok. 
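
/*
 * [Editorial aside -- illustrative, not the kernel's exact definitions.]
 * Throughout the splice.c hunks, every open-coded
 * "&pipe->bufs[idx & (pipe->ring_size - 1)]" becomes pipe_buf(pipe, idx),
 * and head/tail comparisons become pipe_is_empty()/pipe_is_full(), so
 * callers stop caching the ring mask. A plausible shape of the slot
 * helper, assuming the ring size stays a power of two:
 */
struct pipe_buf_sketch { void *page; unsigned int offset, len; };
struct pipe_sketch {
	unsigned int head;		/* next slot to fill; free-running */
	unsigned int tail;		/* next slot to drain; free-running */
	unsigned int ring_size;		/* power of two */
	struct pipe_buf_sketch *bufs;
};

static inline struct pipe_buf_sketch *
pipe_buf_at(const struct pipe_sketch *p, unsigned int slot)
{
	return &p->bufs[slot & (p->ring_size - 1)];	/* counters wrap for free */
}

static inline int pipe_sketch_is_empty(const struct pipe_sketch *p)
{
	return p->head == p->tail;
}
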
*/ - if (!pipe_full(pipe->head, pipe->tail, pipe->max_usage)) + if (!pipe_is_full(pipe)) return 0; ret = 0; pipe_lock(pipe); - while (pipe_full(pipe->head, pipe->tail, pipe->max_usage)) { + while (pipe_is_full(pipe)) { if (!pipe->readers) { send_sig(SIGPIPE, current, 0); ret = -EPIPE; @@ -1725,7 +1719,6 @@ static int splice_pipe_to_pipe(struct pipe_inode_info *ipipe, struct pipe_buffer *ibuf, *obuf; unsigned int i_head, o_head; unsigned int i_tail, o_tail; - unsigned int i_mask, o_mask; int ret = 0; bool input_wakeup = false; @@ -1747,9 +1740,7 @@ retry: pipe_double_lock(ipipe, opipe); i_tail = ipipe->tail; - i_mask = ipipe->ring_size - 1; o_head = opipe->head; - o_mask = opipe->ring_size - 1; do { size_t o_len; @@ -1792,8 +1783,8 @@ retry: goto retry; } - ibuf = &ipipe->bufs[i_tail & i_mask]; - obuf = &opipe->bufs[o_head & o_mask]; + ibuf = pipe_buf(ipipe, i_tail); + obuf = pipe_buf(opipe, o_head); if (len >= ibuf->len) { /* @@ -1862,7 +1853,6 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe, struct pipe_buffer *ibuf, *obuf; unsigned int i_head, o_head; unsigned int i_tail, o_tail; - unsigned int i_mask, o_mask; ssize_t ret = 0; /* @@ -1873,9 +1863,7 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe, pipe_double_lock(ipipe, opipe); i_tail = ipipe->tail; - i_mask = ipipe->ring_size - 1; o_head = opipe->head; - o_mask = opipe->ring_size - 1; do { if (!opipe->readers) { @@ -1896,8 +1884,8 @@ static ssize_t link_pipe(struct pipe_inode_info *ipipe, pipe_full(o_head, o_tail, opipe->max_usage)) break; - ibuf = &ipipe->bufs[i_tail & i_mask]; - obuf = &opipe->bufs[o_head & o_mask]; + ibuf = pipe_buf(ipipe, i_tail); + obuf = pipe_buf(opipe, o_head); /* * Get a reference to this pipe buffer, diff --git a/fs/squashfs/cache.c b/fs/squashfs/cache.c index 4db0d2b0aab8..181260e72680 100644 --- a/fs/squashfs/cache.c +++ b/fs/squashfs/cache.c @@ -198,7 +198,7 @@ void squashfs_cache_delete(struct squashfs_cache *cache) { int i, j; - if (cache == NULL) + if (IS_ERR(cache) || cache == NULL) return; for (i = 0; i < cache->entries; i++) { diff --git a/fs/stat.c b/fs/stat.c index f13308bfdc98..f95c1dc3eaa4 100644 --- a/fs/stat.c +++ b/fs/stat.c @@ -136,13 +136,15 @@ EXPORT_SYMBOL(generic_fill_statx_attr); * @stat: Where to fill in the attribute flags * @unit_min: Minimum supported atomic write length in bytes * @unit_max: Maximum supported atomic write length in bytes + * @unit_max_opt: Optimised maximum supported atomic write length in bytes * * Fill in the STATX{_ATTR}_WRITE_ATOMIC flags in the kstat structure from * atomic write unit_min and unit_max values. 
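
/*
 * [Editorial aside -- pattern sketch.] The squashfs hunk above makes the
 * cache destructor tolerate both failure sentinels: the setup path can
 * produce NULL or an ERR_PTR()-encoded error and then frees whatever it
 * got, so teardown must treat either as "nothing to free". A stand-alone
 * model of an ERR_PTR-safe destructor (4095 mirrors the kernel's
 * MAX_ERRNO):
 */
#include <stdint.h>
#include <stdlib.h>

static int is_err_sketch(const void *p)
{
	return (uintptr_t)p >= (uintptr_t)-4095;	/* IS_ERR()'s test */
}

static void cache_delete_sketch(void *cache)
{
	if (!cache || is_err_sketch(cache))
		return;			/* never fully constructed */
	free(cache);
}
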
*/ void generic_fill_statx_atomic_writes(struct kstat *stat, unsigned int unit_min, - unsigned int unit_max) + unsigned int unit_max, + unsigned int unit_max_opt) { /* Confirm that the request type is known */ stat->result_mask |= STATX_WRITE_ATOMIC; @@ -153,6 +155,7 @@ void generic_fill_statx_atomic_writes(struct kstat *stat, if (unit_min) { stat->atomic_write_unit_min = unit_min; stat->atomic_write_unit_max = unit_max; + stat->atomic_write_unit_max_opt = unit_max_opt; /* Initially only allow 1x segment */ stat->atomic_write_segments_max = 1; @@ -204,12 +207,25 @@ int vfs_getattr_nosec(const struct path *path, struct kstat *stat, STATX_ATTR_DAX); idmap = mnt_idmap(path->mnt); - if (inode->i_op->getattr) - return inode->i_op->getattr(idmap, path, stat, - request_mask, - query_flags); + if (inode->i_op->getattr) { + int ret; + + ret = inode->i_op->getattr(idmap, path, stat, request_mask, + query_flags); + if (ret) + return ret; + } else { + generic_fillattr(idmap, request_mask, inode, stat); + } + + /* + * If this is a block device inode, override the filesystem attributes + * with the block device specific parameters that need to be obtained + * from the bdev backing inode. + */ + if (S_ISBLK(stat->mode)) + bdev_statx(path, stat, request_mask); - generic_fillattr(idmap, request_mask, inode, stat); return 0; } EXPORT_SYMBOL(vfs_getattr_nosec); @@ -241,7 +257,7 @@ int vfs_getattr(const struct path *path, struct kstat *stat, int retval; retval = security_inode_getattr(path); - if (retval) + if (unlikely(retval)) return retval; return vfs_getattr_nosec(path, stat, request_mask, query_flags); } @@ -295,15 +311,6 @@ static int vfs_statx_path(struct path *path, int flags, struct kstat *stat, if (path_mounted(path)) stat->attributes |= STATX_ATTR_MOUNT_ROOT; stat->attributes_mask |= STATX_ATTR_MOUNT_ROOT; - - /* - * If this is a block device inode, override the filesystem - * attributes with the block device specific parameters that need to be - * obtained from the bdev backing inode. 
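
/*
 * [Editorial aside -- control flow in miniature, stand-in names.] The
 * vfs_getattr_nosec() rework above (with the matching deletion from
 * vfs_statx_path() just below) moves the block-device override: the
 * filesystem's ->getattr() or generic_fillattr() runs first, then
 * bdev_statx() overwrites the result for S_ISBLK inodes, so fd-based stat
 * paths see the bdev values too. The same region also threads the new
 * atomic-write field unit_max_opt through generic_fill_statx_atomic_writes().
 */
#include <stdbool.h>

struct stat_sketch { bool is_blockdev; int io_blksize; };

static int  fs_getattr_sketch(struct stat_sketch *st) { st->io_blksize = 1024; return 0; }
static void generic_fill_sketch(struct stat_sketch *st) { st->io_blksize = 512; }
static void bdev_statx_sketch(struct stat_sketch *st) { st->io_blksize = 4096; }

static int getattr_dispatch(int (*op)(struct stat_sketch *), struct stat_sketch *st)
{
	if (op) {
		int ret = op(st);		/* filesystem's own view first */
		if (ret)
			return ret;
	} else {
		generic_fill_sketch(st);
	}
	if (st->is_blockdev)
		bdev_statx_sketch(st);		/* bdev parameters win for S_ISBLK */
	return 0;
}
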
- */ - if (S_ISBLK(stat->mode)) - bdev_statx(path, stat, request_mask); - return 0; } @@ -421,7 +428,7 @@ SYSCALL_DEFINE2(stat, const char __user *, filename, int error; error = vfs_stat(filename, &stat); - if (error) + if (unlikely(error)) return error; return cp_old_stat(&stat, statbuf); @@ -434,7 +441,7 @@ SYSCALL_DEFINE2(lstat, const char __user *, filename, int error; error = vfs_lstat(filename, &stat); - if (error) + if (unlikely(error)) return error; return cp_old_stat(&stat, statbuf); @@ -443,12 +450,13 @@ SYSCALL_DEFINE2(lstat, const char __user *, filename, SYSCALL_DEFINE2(fstat, unsigned int, fd, struct __old_kernel_stat __user *, statbuf) { struct kstat stat; - int error = vfs_fstat(fd, &stat); + int error; - if (!error) - error = cp_old_stat(&stat, statbuf); + error = vfs_fstat(fd, &stat); + if (unlikely(error)) + return error; - return error; + return cp_old_stat(&stat, statbuf); } #endif /* __ARCH_WANT_OLD_STAT */ @@ -502,10 +510,12 @@ SYSCALL_DEFINE2(newstat, const char __user *, filename, struct stat __user *, statbuf) { struct kstat stat; - int error = vfs_stat(filename, &stat); + int error; - if (error) + error = vfs_stat(filename, &stat); + if (unlikely(error)) return error; + return cp_new_stat(&stat, statbuf); } @@ -516,7 +526,7 @@ SYSCALL_DEFINE2(newlstat, const char __user *, filename, int error; error = vfs_lstat(filename, &stat); - if (error) + if (unlikely(error)) return error; return cp_new_stat(&stat, statbuf); @@ -530,8 +540,9 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename, int error; error = vfs_fstatat(dfd, filename, &stat, flag); - if (error) + if (unlikely(error)) return error; + return cp_new_stat(&stat, statbuf); } #endif @@ -539,12 +550,13 @@ SYSCALL_DEFINE4(newfstatat, int, dfd, const char __user *, filename, SYSCALL_DEFINE2(newfstat, unsigned int, fd, struct stat __user *, statbuf) { struct kstat stat; - int error = vfs_fstat(fd, &stat); + int error; - if (!error) - error = cp_new_stat(&stat, statbuf); + error = vfs_fstat(fd, &stat); + if (unlikely(error)) + return error; - return error; + return cp_new_stat(&stat, statbuf); } #endif @@ -732,6 +744,7 @@ cp_statx(const struct kstat *stat, struct statx __user *buffer) tmp.stx_atomic_write_unit_min = stat->atomic_write_unit_min; tmp.stx_atomic_write_unit_max = stat->atomic_write_unit_max; tmp.stx_atomic_write_segments_max = stat->atomic_write_segments_max; + tmp.stx_atomic_write_unit_max_opt = stat->atomic_write_unit_max_opt; return copy_to_user(buffer, &tmp, sizeof(tmp)) ? 
-EFAULT : 0; } diff --git a/fs/super.c b/fs/super.c index 5a7db4a556e3..bcc4e87123c8 100644 --- a/fs/super.c +++ b/fs/super.c @@ -39,7 +39,8 @@ #include <uapi/linux/mount.h> #include "internal.h" -static int thaw_super_locked(struct super_block *sb, enum freeze_holder who); +static int thaw_super_locked(struct super_block *sb, enum freeze_holder who, + const void *freeze_owner); static LIST_HEAD(super_blocks); static DEFINE_SPINLOCK(sb_lock); @@ -201,7 +202,7 @@ static unsigned long super_cache_scan(struct shrinker *shrink, inodes = list_lru_shrink_count(&sb->s_inode_lru, sc); dentries = list_lru_shrink_count(&sb->s_dentry_lru, sc); - total_objects = dentries + inodes + fs_objects + 1; + total_objects = dentries + inodes + fs_objects; if (!total_objects) total_objects = 1; @@ -887,52 +888,48 @@ void drop_super_exclusive(struct super_block *sb) } EXPORT_SYMBOL(drop_super_exclusive); -static void __iterate_supers(void (*f)(struct super_block *)) -{ - struct super_block *sb, *p = NULL; - - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - if (super_flags(sb, SB_DYING)) - continue; - sb->s_count++; - spin_unlock(&sb_lock); +enum super_iter_flags_t { + SUPER_ITER_EXCL = (1U << 0), + SUPER_ITER_UNLOCKED = (1U << 1), + SUPER_ITER_REVERSE = (1U << 2), +}; - f(sb); +static inline struct super_block *first_super(enum super_iter_flags_t flags) +{ + if (flags & SUPER_ITER_REVERSE) + return list_last_entry(&super_blocks, struct super_block, s_list); + return list_first_entry(&super_blocks, struct super_block, s_list); +} - spin_lock(&sb_lock); - if (p) - __put_super(p); - p = sb; - } - if (p) - __put_super(p); - spin_unlock(&sb_lock); +static inline struct super_block *next_super(struct super_block *sb, + enum super_iter_flags_t flags) +{ + if (flags & SUPER_ITER_REVERSE) + return list_prev_entry(sb, s_list); + return list_next_entry(sb, s_list); } -/** - * iterate_supers - call function for all active superblocks - * @f: function to call - * @arg: argument to pass to it - * - * Scans the superblock list and calls given function, passing it - * locked superblock and given argument. 
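
/*
 * [Editorial aside -- simplified model.] __iterate_supers() above folds the
 * old walkers into one loop driven by flag bits: SUPER_ITER_EXCL (take
 * s_umount exclusively), SUPER_ITER_UNLOCKED (call back with no superblock
 * lock), SUPER_ITER_REVERSE (walk newest-first, as the freeze path wants).
 * The direction selection in isolation, over a circular list with a head
 * node (list_entry_is_head() is the kernel's loop terminator):
 */
struct lnode { struct lnode *prev, *next; };

#define ITER_REVERSE_SKETCH (1U << 2)

static struct lnode *walk_first(struct lnode *head, unsigned int flags)
{
	return (flags & ITER_REVERSE_SKETCH) ? head->prev : head->next;
}

static struct lnode *walk_next(struct lnode *n, unsigned int flags)
{
	return (flags & ITER_REVERSE_SKETCH) ? n->prev : n->next;
}

static void walk(struct lnode *head, unsigned int flags,
		 void (*f)(struct lnode *, void *), void *arg)
{
	for (struct lnode *n = walk_first(head, flags); n != head;
	     n = walk_next(n, flags))
		f(n, arg);
}
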
- */ -void iterate_supers(void (*f)(struct super_block *, void *), void *arg) + +static void __iterate_supers(void (*f)(struct super_block *, void *), void *arg, + enum super_iter_flags_t flags) { struct super_block *sb, *p = NULL; + bool excl = flags & SUPER_ITER_EXCL; - spin_lock(&sb_lock); - list_for_each_entry(sb, &super_blocks, s_list) { - bool locked; + guard(spinlock)(&sb_lock); + for (sb = first_super(flags); + !list_entry_is_head(sb, &super_blocks, s_list); + sb = next_super(sb, flags)) { + if (super_flags(sb, SB_DYING)) + continue; sb->s_count++; spin_unlock(&sb_lock); - locked = super_lock_shared(sb); - if (locked) { - if (sb->s_root) - f(sb, arg); - super_unlock_shared(sb); + if (flags & SUPER_ITER_UNLOCKED) { + f(sb, arg); + } else if (super_lock(sb, excl)) { + f(sb, arg); + super_unlock(sb, excl); } spin_lock(&sb_lock); @@ -942,7 +939,11 @@ void iterate_supers(void (*f)(struct super_block *, void *), void *arg) } if (p) __put_super(p); - spin_unlock(&sb_lock); +} + +void iterate_supers(void (*f)(struct super_block *, void *), void *arg) +{ + __iterate_supers(f, arg, 0); } /** @@ -963,15 +964,15 @@ void iterate_supers_type(struct file_system_type *type, hlist_for_each_entry(sb, &type->fs_supers, s_instances) { bool locked; + if (super_flags(sb, SB_DYING)) + continue; + sb->s_count++; spin_unlock(&sb_lock); locked = super_lock_shared(sb); - if (locked) { - if (sb->s_root) - f(sb, arg); - super_unlock_shared(sb); - } + if (locked) + f(sb, arg); spin_lock(&sb_lock); if (p) @@ -991,23 +992,21 @@ struct super_block *user_get_super(dev_t dev, bool excl) spin_lock(&sb_lock); list_for_each_entry(sb, &super_blocks, s_list) { - if (sb->s_dev == dev) { - bool locked; - - sb->s_count++; - spin_unlock(&sb_lock); - /* still alive? */ - locked = super_lock(sb, excl); - if (locked) { - if (sb->s_root) - return sb; - super_unlock(sb, excl); - } - /* nope, got unmounted */ - spin_lock(&sb_lock); - __put_super(sb); - break; - } + bool locked; + + if (sb->s_dev != dev) + continue; + + sb->s_count++; + spin_unlock(&sb_lock); + + locked = super_lock(sb, excl); + if (locked) + return sb; + + spin_lock(&sb_lock); + __put_super(sb); + break; } spin_unlock(&sb_lock); return NULL; @@ -1111,11 +1110,9 @@ cancel_readonly: return retval; } -static void do_emergency_remount_callback(struct super_block *sb) +static void do_emergency_remount_callback(struct super_block *sb, void *unused) { - bool locked = super_lock_excl(sb); - - if (locked && sb->s_root && sb->s_bdev && !sb_rdonly(sb)) { + if (sb->s_bdev && !sb_rdonly(sb)) { struct fs_context *fc; fc = fs_context_for_reconfigure(sb->s_root, @@ -1126,13 +1123,12 @@ static void do_emergency_remount_callback(struct super_block *sb) put_fs_context(fc); } } - if (locked) - super_unlock_excl(sb); } static void do_emergency_remount(struct work_struct *work) { - __iterate_supers(do_emergency_remount_callback); + __iterate_supers(do_emergency_remount_callback, NULL, + SUPER_ITER_EXCL | SUPER_ITER_REVERSE); kfree(work); printk("Emergency Remount complete\n"); } @@ -1148,24 +1144,18 @@ void emergency_remount(void) } } -static void do_thaw_all_callback(struct super_block *sb) +static void do_thaw_all_callback(struct super_block *sb, void *unused) { - bool locked = super_lock_excl(sb); - - if (locked && sb->s_root) { - if (IS_ENABLED(CONFIG_BLOCK)) - while (sb->s_bdev && !bdev_thaw(sb->s_bdev)) - pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); - thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE); - return; - } - if (locked) - super_unlock_excl(sb); + if 
(IS_ENABLED(CONFIG_BLOCK)) + while (sb->s_bdev && !bdev_thaw(sb->s_bdev)) + pr_warn("Emergency Thaw on %pg\n", sb->s_bdev); + thaw_super_locked(sb, FREEZE_HOLDER_USERSPACE, NULL); + return; } static void do_thaw_all(struct work_struct *work) { - __iterate_supers(do_thaw_all_callback); + __iterate_supers(do_thaw_all_callback, NULL, SUPER_ITER_EXCL); kfree(work); printk(KERN_WARNING "Emergency Thaw complete\n"); } @@ -1186,6 +1176,66 @@ void emergency_thaw_all(void) } } +static inline bool get_active_super(struct super_block *sb) +{ + bool active = false; + + if (super_lock_excl(sb)) { + active = atomic_inc_not_zero(&sb->s_active); + super_unlock_excl(sb); + } + return active; +} + +static const char *filesystems_freeze_ptr = "filesystems_freeze"; + +static void filesystems_freeze_callback(struct super_block *sb, void *unused) +{ + if (!sb->s_op->freeze_fs && !sb->s_op->freeze_super) + return; + + if (!get_active_super(sb)) + return; + + if (sb->s_op->freeze_super) + sb->s_op->freeze_super(sb, FREEZE_EXCL | FREEZE_HOLDER_KERNEL, + filesystems_freeze_ptr); + else + freeze_super(sb, FREEZE_EXCL | FREEZE_HOLDER_KERNEL, + filesystems_freeze_ptr); + + deactivate_super(sb); +} + +void filesystems_freeze(void) +{ + __iterate_supers(filesystems_freeze_callback, NULL, + SUPER_ITER_UNLOCKED | SUPER_ITER_REVERSE); +} + +static void filesystems_thaw_callback(struct super_block *sb, void *unused) +{ + if (!sb->s_op->freeze_fs && !sb->s_op->freeze_super) + return; + + if (!get_active_super(sb)) + return; + + if (sb->s_op->thaw_super) + sb->s_op->thaw_super(sb, FREEZE_EXCL | FREEZE_HOLDER_KERNEL, + filesystems_freeze_ptr); + else + thaw_super(sb, FREEZE_EXCL | FREEZE_HOLDER_KERNEL, + filesystems_freeze_ptr); + + deactivate_super(sb); +} + +void filesystems_thaw(void) +{ + __iterate_supers(filesystems_thaw_callback, NULL, SUPER_ITER_UNLOCKED); +} + static DEFINE_IDA(unnamed_dev_ida); /** @@ -1417,7 +1467,7 @@ static void fs_bdev_mark_dead(struct block_device *bdev, bool surprise) if (!surprise) sync_filesystem(sb); shrink_dcache_sb(sb); - invalidate_inodes(sb); + evict_inodes(sb); if (sb->s_op->shutdown) sb->s_op->shutdown(sb); @@ -1479,10 +1529,10 @@ static int fs_bdev_freeze(struct block_device *bdev) if (sb->s_op->freeze_super) error = sb->s_op->freeze_super(sb, - FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); + FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL); else error = freeze_super(sb, - FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); + FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL); if (!error) error = sync_blockdev(bdev); deactivate_super(sb); @@ -1528,10 +1578,10 @@ static int fs_bdev_thaw(struct block_device *bdev) if (sb->s_op->thaw_super) error = sb->s_op->thaw_super(sb, - FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); + FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL); else error = thaw_super(sb, - FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE); + FREEZE_MAY_NEST | FREEZE_HOLDER_USERSPACE, NULL); deactivate_super(sb); return error; } @@ -1737,61 +1787,6 @@ struct dentry *mount_nodev(struct file_system_type *fs_type, } EXPORT_SYMBOL(mount_nodev); -int reconfigure_single(struct super_block *s, - int flags, void *data) -{ - struct fs_context *fc; - int ret; - - /* The caller really need to be passing fc down into mount_single(), - * then a chunk of this can be removed. [Bollocks -- AV] - * Better yet, reconfiguration shouldn't happen, but rather the second - * mount should be rejected if the parameters are not compatible. 
- */ - fc = fs_context_for_reconfigure(s->s_root, flags, MS_RMT_MASK); - if (IS_ERR(fc)) - return PTR_ERR(fc); - - ret = parse_monolithic_mount_data(fc, data); - if (ret < 0) - goto out; - - ret = reconfigure_super(fc); -out: - put_fs_context(fc); - return ret; -} - -static int compare_single(struct super_block *s, void *p) -{ - return 1; -} - -struct dentry *mount_single(struct file_system_type *fs_type, - int flags, void *data, - int (*fill_super)(struct super_block *, void *, int)) -{ - struct super_block *s; - int error; - - s = sget(fs_type, compare_single, set_anon_super, flags, NULL); - if (IS_ERR(s)) - return ERR_CAST(s); - if (!s->s_root) { - error = fill_super(s, data, flags & SB_SILENT ? 1 : 0); - if (!error) - s->s_flags |= SB_ACTIVE; - } else { - error = reconfigure_single(s, flags, data); - } - if (unlikely(error)) { - deactivate_locked_super(s); - return ERR_PTR(error); - } - return dget(s->s_root); -} -EXPORT_SYMBOL(mount_single); - /** * vfs_get_tree - Get the mountable root * @fc: The superblock configuration context. @@ -1958,7 +1953,7 @@ static int wait_for_partially_frozen(struct super_block *sb) } #define FREEZE_HOLDERS (FREEZE_HOLDER_KERNEL | FREEZE_HOLDER_USERSPACE) -#define FREEZE_FLAGS (FREEZE_HOLDERS | FREEZE_MAY_NEST) +#define FREEZE_FLAGS (FREEZE_HOLDERS | FREEZE_MAY_NEST | FREEZE_EXCL) static inline int freeze_inc(struct super_block *sb, enum freeze_holder who) { @@ -1984,11 +1979,34 @@ static inline int freeze_dec(struct super_block *sb, enum freeze_holder who) return sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount; } -static inline bool may_freeze(struct super_block *sb, enum freeze_holder who) +static inline bool may_freeze(struct super_block *sb, enum freeze_holder who, + const void *freeze_owner) { + lockdep_assert_held(&sb->s_umount); + WARN_ON_ONCE((who & ~FREEZE_FLAGS)); WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); + if (who & FREEZE_EXCL) { + if (WARN_ON_ONCE(!(who & FREEZE_HOLDER_KERNEL))) + return false; + if (WARN_ON_ONCE(who & ~(FREEZE_EXCL | FREEZE_HOLDER_KERNEL))) + return false; + if (WARN_ON_ONCE(!freeze_owner)) + return false; + /* This freeze already has a specific owner. */ + if (sb->s_writers.freeze_owner) + return false; + /* + * This is already frozen multiple times so we're just + * going to take a reference count and mark the freeze as + * being owned by the caller. + */ + if (sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount) + sb->s_writers.freeze_owner = freeze_owner; + return true; + } + if (who & FREEZE_HOLDER_KERNEL) return (who & FREEZE_MAY_NEST) || sb->s_writers.freeze_kcount == 0; @@ -1998,10 +2016,61 @@ static inline bool may_freeze(struct super_block *sb, enum freeze_holder who) return false; } +static inline bool may_unfreeze(struct super_block *sb, enum freeze_holder who, + const void *freeze_owner) +{ + lockdep_assert_held(&sb->s_umount); + + WARN_ON_ONCE((who & ~FREEZE_FLAGS)); + WARN_ON_ONCE(hweight32(who & FREEZE_HOLDERS) > 1); + + if (who & FREEZE_EXCL) { + if (WARN_ON_ONCE(!(who & FREEZE_HOLDER_KERNEL))) + return false; + if (WARN_ON_ONCE(who & ~(FREEZE_EXCL | FREEZE_HOLDER_KERNEL))) + return false; + if (WARN_ON_ONCE(!freeze_owner)) + return false; + if (WARN_ON_ONCE(sb->s_writers.freeze_kcount == 0)) + return false; + /* This isn't exclusively frozen. */ + if (!sb->s_writers.freeze_owner) + return false; + /* This isn't exclusively frozen by us. 
*/ + if (sb->s_writers.freeze_owner != freeze_owner) + return false; + /* + * This is still frozen multiple times so we're just + * going to drop our reference count and undo our + * exclusive freeze. + */ + if ((sb->s_writers.freeze_kcount + sb->s_writers.freeze_ucount) > 1) + sb->s_writers.freeze_owner = NULL; + return true; + } + + if (who & FREEZE_HOLDER_KERNEL) { + /* + * Someone's trying to steal the reference belonging to + * @sb->s_writers.freeze_owner. + */ + if (sb->s_writers.freeze_kcount == 1 && + sb->s_writers.freeze_owner) + return false; + return sb->s_writers.freeze_kcount > 0; + } + + if (who & FREEZE_HOLDER_USERSPACE) + return sb->s_writers.freeze_ucount > 0; + + return false; +} + /** * freeze_super - lock the filesystem and force it into a consistent state * @sb: the super to lock * @who: context that wants to freeze + * @freeze_owner: owner of the freeze * * Syncs the super to make sure the filesystem is consistent and calls the fs's * freeze_fs. Subsequent calls to this without first thawing the fs may return @@ -2053,7 +2122,7 @@ static inline bool may_freeze(struct super_block *sb, enum freeze_holder who) * Return: If the freeze was successful zero is returned. If the freeze * failed a negative error code is returned. */ -int freeze_super(struct super_block *sb, enum freeze_holder who) +int freeze_super(struct super_block *sb, enum freeze_holder who, const void *freeze_owner) { int ret; @@ -2065,7 +2134,7 @@ int freeze_super(struct super_block *sb, enum freeze_holder who) retry: if (sb->s_writers.frozen == SB_FREEZE_COMPLETE) { - if (may_freeze(sb, who)) + if (may_freeze(sb, who, freeze_owner)) ret = !!WARN_ON_ONCE(freeze_inc(sb, who) == 1); else ret = -EBUSY; @@ -2087,6 +2156,7 @@ retry: if (sb_rdonly(sb)) { /* Nothing to do really... */ WARN_ON_ONCE(freeze_inc(sb, who) > 1); + sb->s_writers.freeze_owner = freeze_owner; sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); super_unlock_excl(sb); @@ -2134,6 +2204,7 @@ retry: * when frozen is set to SB_FREEZE_COMPLETE, and for thaw_super(). */ WARN_ON_ONCE(freeze_inc(sb, who) > 1); + sb->s_writers.freeze_owner = freeze_owner; sb->s_writers.frozen = SB_FREEZE_COMPLETE; wake_up_var(&sb->s_writers.frozen); lockdep_sb_freeze_release(sb); @@ -2148,13 +2219,17 @@ EXPORT_SYMBOL(freeze_super); * removes that state without releasing the other state or unlocking the * filesystem. */ -static int thaw_super_locked(struct super_block *sb, enum freeze_holder who) +static int thaw_super_locked(struct super_block *sb, enum freeze_holder who, + const void *freeze_owner) { int error = -EINVAL; if (sb->s_writers.frozen != SB_FREEZE_COMPLETE) goto out_unlock; + if (!may_unfreeze(sb, who, freeze_owner)) + goto out_unlock; + /* * All freezers share a single active reference. * So just unlock in case there are any left. 
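
/*
 * [Editorial aside -- the new rules restated as code, simplified.] With
 * FREEZE_EXCL, a kernel-initiated freeze records an owner token in
 * s_writers.freeze_owner: a second exclusive freeze is refused while an
 * owner is set, and only the matching owner may thaw; mixed with other
 * holders, the token is dropped without unfreezing. Condensed model of
 * may_freeze()/may_unfreeze() (the two counts collapsed into one field):
 */
#include <stdbool.h>
#include <stddef.h>

struct freeze_sketch { int holders; const void *owner; };

static bool may_freeze_excl(struct freeze_sketch *s, const void *owner)
{
	if (!owner || s->owner)
		return false;		/* need a token; one exclusive holder max */
	if (s->holders)
		s->owner = owner;	/* already frozen: just take ownership */
	return true;
}

static bool may_unfreeze_excl(struct freeze_sketch *s, const void *owner)
{
	if (!s->owner || s->owner != owner)
		return false;		/* not exclusively frozen by us */
	if (s->holders > 1)
		s->owner = NULL;	/* drop ownership, leave others frozen */
	return true;
}
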
@@ -2164,6 +2239,7 @@ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who) if (sb_rdonly(sb)) { sb->s_writers.frozen = SB_UNFROZEN; + sb->s_writers.freeze_owner = NULL; wake_up_var(&sb->s_writers.frozen); goto out_deactivate; } @@ -2181,6 +2257,7 @@ static int thaw_super_locked(struct super_block *sb, enum freeze_holder who) } sb->s_writers.frozen = SB_UNFROZEN; + sb->s_writers.freeze_owner = NULL; wake_up_var(&sb->s_writers.frozen); sb_freeze_unlock(sb, SB_FREEZE_FS); out_deactivate: @@ -2196,6 +2273,7 @@ out_unlock: * thaw_super -- unlock filesystem * @sb: the super to thaw * @who: context that wants to freeze + * @freeze_owner: owner of the freeze * * Unlocks the filesystem and marks it writeable again after freeze_super() * if there are no remaining freezes on the filesystem. @@ -2209,13 +2287,14 @@ out_unlock: * have been frozen through the block layer via multiple block devices. * The filesystem remains frozen until all block devices are unfrozen. */ -int thaw_super(struct super_block *sb, enum freeze_holder who) +int thaw_super(struct super_block *sb, enum freeze_holder who, + const void *freeze_owner) { if (!super_lock_excl(sb)) { WARN_ON_ONCE("Dying superblock while thawing!"); return -EINVAL; } - return thaw_super_locked(sb, who); + return thaw_super_locked(sb, who, freeze_owner); } EXPORT_SYMBOL(thaw_super); diff --git a/fs/sysfs/dir.c b/fs/sysfs/dir.c index 4df2afa551dc..94e12efd92f2 100644 --- a/fs/sysfs/dir.c +++ b/fs/sysfs/dir.c @@ -123,7 +123,7 @@ int sysfs_move_dir_ns(struct kobject *kobj, struct kobject *new_parent_kobj, new_parent = new_parent_kobj && new_parent_kobj->sd ? new_parent_kobj->sd : sysfs_root_kn; - return kernfs_rename_ns(kn, new_parent, kn->name, new_ns); + return kernfs_rename_ns(kn, new_parent, NULL, new_ns); } /** diff --git a/fs/sysfs/file.c b/fs/sysfs/file.c index 6931308876c4..c3d3b079aedd 100644 --- a/fs/sysfs/file.c +++ b/fs/sysfs/file.c @@ -19,13 +19,19 @@ #include "sysfs.h" +static struct kobject *sysfs_file_kobj(struct kernfs_node *kn) +{ + guard(rcu)(); + return rcu_dereference(kn->__parent)->priv; +} + /* * Determine ktype->sysfs_ops for the given kernfs_node. This function * must be called while holding an active reference. 
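
/*
 * [Editorial aside -- generic pattern, not kernfs internals.] The sysfs
 * hunks above replace every raw "kn->parent->priv" with sysfs_file_kobj(),
 * which dereferences the now RCU-managed __parent under guard(rcu)().
 * Outside the kernel, the analogous lock-free read of a re-pointable
 * parent looks like this with C11 atomics (RCU additionally guarantees the
 * parent's lifetime across the dereference, which the acquire load alone
 * does not):
 */
#include <stdatomic.h>
#include <stddef.h>

struct knode_sketch {
	struct knode_sketch *_Atomic parent;	/* re-pointed on rename */
	void *priv;
};

static void *node_owner_priv(struct knode_sketch *kn)
{
	/* pairs with a release store when the node is re-parented */
	struct knode_sketch *p =
		atomic_load_explicit(&kn->parent, memory_order_acquire);

	return p ? p->priv : NULL;
}
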
*/ static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn) { - struct kobject *kobj = kn->parent->priv; + struct kobject *kobj = sysfs_file_kobj(kn); if (kn->flags & KERNFS_LOCKDEP) lockdep_assert_held(kn); @@ -40,7 +46,7 @@ static const struct sysfs_ops *sysfs_file_ops(struct kernfs_node *kn) static int sysfs_kf_seq_show(struct seq_file *sf, void *v) { struct kernfs_open_file *of = sf->private; - struct kobject *kobj = of->kn->parent->priv; + struct kobject *kobj = sysfs_file_kobj(of->kn); const struct sysfs_ops *ops = sysfs_file_ops(of->kn); ssize_t count; char *buf; @@ -78,7 +84,7 @@ static ssize_t sysfs_kf_bin_read(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { struct bin_attribute *battr = of->kn->priv; - struct kobject *kobj = of->kn->parent->priv; + struct kobject *kobj = sysfs_file_kobj(of->kn); loff_t size = file_inode(of->file)->i_size; if (!count) @@ -105,7 +111,7 @@ static ssize_t sysfs_kf_read(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { const struct sysfs_ops *ops = sysfs_file_ops(of->kn); - struct kobject *kobj = of->kn->parent->priv; + struct kobject *kobj = sysfs_file_kobj(of->kn); ssize_t len; /* @@ -131,7 +137,7 @@ static ssize_t sysfs_kf_write(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { const struct sysfs_ops *ops = sysfs_file_ops(of->kn); - struct kobject *kobj = of->kn->parent->priv; + struct kobject *kobj = sysfs_file_kobj(of->kn); if (!count) return 0; @@ -144,7 +150,7 @@ static ssize_t sysfs_kf_bin_write(struct kernfs_open_file *of, char *buf, size_t count, loff_t pos) { struct bin_attribute *battr = of->kn->priv; - struct kobject *kobj = of->kn->parent->priv; + struct kobject *kobj = sysfs_file_kobj(of->kn); loff_t size = file_inode(of->file)->i_size; if (size) { @@ -168,7 +174,7 @@ static int sysfs_kf_bin_mmap(struct kernfs_open_file *of, struct vm_area_struct *vma) { struct bin_attribute *battr = of->kn->priv; - struct kobject *kobj = of->kn->parent->priv; + struct kobject *kobj = sysfs_file_kobj(of->kn); return battr->mmap(of->file, kobj, battr, vma); } @@ -177,7 +183,7 @@ static loff_t sysfs_kf_bin_llseek(struct kernfs_open_file *of, loff_t offset, int whence) { struct bin_attribute *battr = of->kn->priv; - struct kobject *kobj = of->kn->parent->priv; + struct kobject *kobj = sysfs_file_kobj(of->kn); if (battr->llseek) return battr->llseek(of->file, kobj, battr, offset, whence); @@ -494,7 +500,7 @@ EXPORT_SYMBOL_GPL(sysfs_break_active_protection); */ void sysfs_unbreak_active_protection(struct kernfs_node *kn) { - struct kobject *kobj = kn->parent->priv; + struct kobject *kobj = sysfs_file_kobj(kn); kernfs_unbreak_active_protection(kn); kernfs_put(kn); diff --git a/fs/sysv/Kconfig b/fs/sysv/Kconfig deleted file mode 100644 index 67b3f90afbfd..000000000000 --- a/fs/sysv/Kconfig +++ /dev/null @@ -1,38 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -config SYSV_FS - tristate "System V/Xenix/V7/Coherent file system support" - depends on BLOCK - select BUFFER_HEAD - help - SCO, Xenix and Coherent are commercial Unix systems for Intel - machines, and Version 7 was used on the DEC PDP-11. Saying Y - here would allow you to read from their floppies and hard disk - partitions. 
- - If you have floppies or hard disk partitions like that, it is likely - that they contain binaries from those other Unix systems; in order - to run these binaries, you will want to install linux-abi which is - a set of kernel modules that lets you run SCO, Xenix, Wyse, - UnixWare, Dell Unix and System V programs under Linux. It is - available via FTP (user: ftp) from - <ftp://ftp.openlinux.org/pub/people/hch/linux-abi/>). - NOTE: that will work only for binaries from Intel-based systems; - PDP ones will have to wait until somebody ports Linux to -11 ;-) - - If you only intend to mount files from some other Unix over the - network using NFS, you don't need the System V file system support - (but you need NFS file system support obviously). - - Note that this option is generally not needed for floppies, since a - good portable way to transport files and directories between unixes - (and even other operating systems) is given by the tar program ("man - tar" or preferably "info tar"). Note also that this option has - nothing whatsoever to do with the option "System V IPC". Read about - the System V file system in - <file:Documentation/filesystems/sysv-fs.rst>. - Saying Y here will enlarge your kernel by about 27 KB. - - To compile this as a module, choose M here: the module will be called - sysv. - - If you haven't heard about all of this before, it's safe to say N. diff --git a/fs/sysv/Makefile b/fs/sysv/Makefile deleted file mode 100644 index 17d12ba04b18..000000000000 --- a/fs/sysv/Makefile +++ /dev/null @@ -1,9 +0,0 @@ -# SPDX-License-Identifier: GPL-2.0-only -# -# Makefile for the Linux SystemV/Coherent filesystem routines. -# - -obj-$(CONFIG_SYSV_FS) += sysv.o - -sysv-objs := ialloc.o balloc.o inode.o itree.o file.o dir.o \ - namei.o super.o diff --git a/fs/sysv/balloc.c b/fs/sysv/balloc.c deleted file mode 100644 index 0e69dbdf7277..000000000000 --- a/fs/sysv/balloc.c +++ /dev/null @@ -1,240 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/sysv/balloc.c - * - * minix/bitmap.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * ext/freelists.c - * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) - * - * xenix/alloc.c - * Copyright (C) 1992 Doug Evans - * - * coh/alloc.c - * Copyright (C) 1993 Pascal Haible, Bruno Haible - * - * sysv/balloc.c - * Copyright (C) 1993 Bruno Haible - * - * This file contains code for allocating/freeing blocks. - */ - -#include <linux/buffer_head.h> -#include <linux/string.h> -#include "sysv.h" - -/* We don't trust the value of - sb->sv_sbd2->s_tfree = *sb->sv_free_blocks - but we nevertheless keep it up to date. */ - -static inline sysv_zone_t *get_chunk(struct super_block *sb, struct buffer_head *bh) -{ - char *bh_data = bh->b_data; - - if (SYSV_SB(sb)->s_type == FSTYPE_SYSV4) - return (sysv_zone_t*)(bh_data+4); - else - return (sysv_zone_t*)(bh_data+2); -} - -/* NOTE NOTE NOTE: nr is a block number _as_ _stored_ _on_ _disk_ */ - -void sysv_free_block(struct super_block * sb, sysv_zone_t nr) -{ - struct sysv_sb_info * sbi = SYSV_SB(sb); - struct buffer_head * bh; - sysv_zone_t *blocks = sbi->s_bcache; - unsigned count; - unsigned block = fs32_to_cpu(sbi, nr); - - /* - * This code does not work at all for AFS (it has a bitmap - * free list). As AFS is supposed to be read-only no one - * should call this for an AFS filesystem anyway... 
- */ - if (sbi->s_type == FSTYPE_AFS) - return; - - if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) { - printk("sysv_free_block: trying to free block not in datazone\n"); - return; - } - - mutex_lock(&sbi->s_lock); - count = fs16_to_cpu(sbi, *sbi->s_bcache_count); - - if (count > sbi->s_flc_size) { - printk("sysv_free_block: flc_count > flc_size\n"); - mutex_unlock(&sbi->s_lock); - return; - } - /* If the free list head in super-block is full, it is copied - * into this block being freed, ditto if it's completely empty - * (applies only on Coherent). - */ - if (count == sbi->s_flc_size || count == 0) { - block += sbi->s_block_base; - bh = sb_getblk(sb, block); - if (!bh) { - printk("sysv_free_block: getblk() failed\n"); - mutex_unlock(&sbi->s_lock); - return; - } - memset(bh->b_data, 0, sb->s_blocksize); - *(__fs16*)bh->b_data = cpu_to_fs16(sbi, count); - memcpy(get_chunk(sb,bh), blocks, count * sizeof(sysv_zone_t)); - mark_buffer_dirty(bh); - set_buffer_uptodate(bh); - brelse(bh); - count = 0; - } - sbi->s_bcache[count++] = nr; - - *sbi->s_bcache_count = cpu_to_fs16(sbi, count); - fs32_add(sbi, sbi->s_free_blocks, 1); - dirty_sb(sb); - mutex_unlock(&sbi->s_lock); -} - -sysv_zone_t sysv_new_block(struct super_block * sb) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - unsigned int block; - sysv_zone_t nr; - struct buffer_head * bh; - unsigned count; - - mutex_lock(&sbi->s_lock); - count = fs16_to_cpu(sbi, *sbi->s_bcache_count); - - if (count == 0) /* Applies only to Coherent FS */ - goto Enospc; - nr = sbi->s_bcache[--count]; - if (nr == 0) /* Applies only to Xenix FS, SystemV FS */ - goto Enospc; - - block = fs32_to_cpu(sbi, nr); - - *sbi->s_bcache_count = cpu_to_fs16(sbi, count); - - if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) { - printk("sysv_new_block: new block %d is not in data zone\n", - block); - goto Enospc; - } - - if (count == 0) { /* the last block continues the free list */ - unsigned count; - - block += sbi->s_block_base; - if (!(bh = sb_bread(sb, block))) { - printk("sysv_new_block: cannot read free-list block\n"); - /* retry this same block next time */ - *sbi->s_bcache_count = cpu_to_fs16(sbi, 1); - goto Enospc; - } - count = fs16_to_cpu(sbi, *(__fs16*)bh->b_data); - if (count > sbi->s_flc_size) { - printk("sysv_new_block: free-list block with >flc_size entries\n"); - brelse(bh); - goto Enospc; - } - *sbi->s_bcache_count = cpu_to_fs16(sbi, count); - memcpy(sbi->s_bcache, get_chunk(sb, bh), - count * sizeof(sysv_zone_t)); - brelse(bh); - } - /* Now the free list head in the superblock is valid again. */ - fs32_add(sbi, sbi->s_free_blocks, -1); - dirty_sb(sb); - mutex_unlock(&sbi->s_lock); - return nr; - -Enospc: - mutex_unlock(&sbi->s_lock); - return 0; -} - -unsigned long sysv_count_free_blocks(struct super_block * sb) -{ - struct sysv_sb_info * sbi = SYSV_SB(sb); - int sb_count; - int count; - struct buffer_head * bh = NULL; - sysv_zone_t *blocks; - unsigned block; - int n; - - /* - * This code does not work at all for AFS (it has a bitmap - * free list). As AFS is supposed to be read-only we just - * lie and say it has no free block at all. - */ - if (sbi->s_type == FSTYPE_AFS) - return 0; - - mutex_lock(&sbi->s_lock); - sb_count = fs32_to_cpu(sbi, *sbi->s_free_blocks); - - if (0) - goto trust_sb; - - /* this causes a lot of disk traffic ... 
*/ - count = 0; - n = fs16_to_cpu(sbi, *sbi->s_bcache_count); - blocks = sbi->s_bcache; - while (1) { - sysv_zone_t zone; - if (n > sbi->s_flc_size) - goto E2big; - zone = 0; - while (n && (zone = blocks[--n]) != 0) - count++; - if (zone == 0) - break; - - block = fs32_to_cpu(sbi, zone); - if (bh) - brelse(bh); - - if (block < sbi->s_firstdatazone || block >= sbi->s_nzones) - goto Einval; - block += sbi->s_block_base; - bh = sb_bread(sb, block); - if (!bh) - goto Eio; - n = fs16_to_cpu(sbi, *(__fs16*)bh->b_data); - blocks = get_chunk(sb, bh); - } - if (bh) - brelse(bh); - if (count != sb_count) - goto Ecount; -done: - mutex_unlock(&sbi->s_lock); - return count; - -Einval: - printk("sysv_count_free_blocks: new block %d is not in data zone\n", - block); - goto trust_sb; -Eio: - printk("sysv_count_free_blocks: cannot read free-list block\n"); - goto trust_sb; -E2big: - printk("sysv_count_free_blocks: >flc_size entries in free-list block\n"); - if (bh) - brelse(bh); -trust_sb: - count = sb_count; - goto done; -Ecount: - printk("sysv_count_free_blocks: free block count was %d, " - "correcting to %d\n", sb_count, count); - if (!sb_rdonly(sb)) { - *sbi->s_free_blocks = cpu_to_fs32(sbi, count); - dirty_sb(sb); - } - goto done; -} diff --git a/fs/sysv/dir.c b/fs/sysv/dir.c deleted file mode 100644 index 639307e2ff8c..000000000000 --- a/fs/sysv/dir.c +++ /dev/null @@ -1,378 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/sysv/dir.c - * - * minix/dir.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * coh/dir.c - * Copyright (C) 1993 Pascal Haible, Bruno Haible - * - * sysv/dir.c - * Copyright (C) 1993 Bruno Haible - * - * SystemV/Coherent directory handling functions - */ - -#include <linux/pagemap.h> -#include <linux/highmem.h> -#include <linux/swap.h> -#include "sysv.h" - -static int sysv_readdir(struct file *, struct dir_context *); - -const struct file_operations sysv_dir_operations = { - .llseek = generic_file_llseek, - .read = generic_read_dir, - .iterate_shared = sysv_readdir, - .fsync = generic_file_fsync, -}; - -static void dir_commit_chunk(struct folio *folio, loff_t pos, unsigned len) -{ - struct address_space *mapping = folio->mapping; - struct inode *dir = mapping->host; - - block_write_end(NULL, mapping, pos, len, len, folio, NULL); - if (pos+len > dir->i_size) { - i_size_write(dir, pos+len); - mark_inode_dirty(dir); - } - folio_unlock(folio); -} - -static int sysv_handle_dirsync(struct inode *dir) -{ - int err; - - err = filemap_write_and_wait(dir->i_mapping); - if (!err) - err = sync_inode_metadata(dir, 1); - return err; -} - -/* - * Calls to dir_get_folio()/folio_release_kmap() must be nested according to the - * rules documented in mm/highmem.rst. - * - * NOTE: sysv_find_entry() and sysv_dotdot() act as calls to dir_get_folio() - * and must be treated accordingly for nesting purposes. 
- */ -static void *dir_get_folio(struct inode *dir, unsigned long n, - struct folio **foliop) -{ - struct folio *folio = read_mapping_folio(dir->i_mapping, n, NULL); - - if (IS_ERR(folio)) - return ERR_CAST(folio); - *foliop = folio; - return kmap_local_folio(folio, 0); -} - -static int sysv_readdir(struct file *file, struct dir_context *ctx) -{ - unsigned long pos = ctx->pos; - struct inode *inode = file_inode(file); - struct super_block *sb = inode->i_sb; - unsigned long npages = dir_pages(inode); - unsigned offset; - unsigned long n; - - ctx->pos = pos = (pos + SYSV_DIRSIZE-1) & ~(SYSV_DIRSIZE-1); - if (pos >= inode->i_size) - return 0; - - offset = pos & ~PAGE_MASK; - n = pos >> PAGE_SHIFT; - - for ( ; n < npages; n++, offset = 0) { - char *kaddr, *limit; - struct sysv_dir_entry *de; - struct folio *folio; - - kaddr = dir_get_folio(inode, n, &folio); - if (IS_ERR(kaddr)) - continue; - de = (struct sysv_dir_entry *)(kaddr+offset); - limit = kaddr + PAGE_SIZE - SYSV_DIRSIZE; - for ( ;(char*)de <= limit; de++, ctx->pos += sizeof(*de)) { - char *name = de->name; - - if (!de->inode) - continue; - - if (!dir_emit(ctx, name, strnlen(name,SYSV_NAMELEN), - fs16_to_cpu(SYSV_SB(sb), de->inode), - DT_UNKNOWN)) { - folio_release_kmap(folio, kaddr); - return 0; - } - } - folio_release_kmap(folio, kaddr); - } - return 0; -} - -/* compare strings: name[0..len-1] (not zero-terminated) and - * buffer[0..] (filled with zeroes up to buffer[0..maxlen-1]) - */ -static inline int namecompare(int len, int maxlen, - const char * name, const char * buffer) -{ - if (len < maxlen && buffer[len]) - return 0; - return !memcmp(name, buffer, len); -} - -/* - * sysv_find_entry() - * - * finds an entry in the specified directory with the wanted name. - * It does NOT read the inode of the - * entry - you'll have to do that yourself if you want to. - * - * On Success folio_release_kmap() should be called on *foliop. - * - * sysv_find_entry() acts as a call to dir_get_folio() and must be treated - * accordingly for nesting purposes. 
- */ -struct sysv_dir_entry *sysv_find_entry(struct dentry *dentry, struct folio **foliop) -{ - const char * name = dentry->d_name.name; - int namelen = dentry->d_name.len; - struct inode * dir = d_inode(dentry->d_parent); - unsigned long start, n; - unsigned long npages = dir_pages(dir); - struct sysv_dir_entry *de; - - start = SYSV_I(dir)->i_dir_start_lookup; - if (start >= npages) - start = 0; - n = start; - - do { - char *kaddr = dir_get_folio(dir, n, foliop); - - if (!IS_ERR(kaddr)) { - de = (struct sysv_dir_entry *)kaddr; - kaddr += folio_size(*foliop) - SYSV_DIRSIZE; - for ( ; (char *) de <= kaddr ; de++) { - if (!de->inode) - continue; - if (namecompare(namelen, SYSV_NAMELEN, - name, de->name)) - goto found; - } - folio_release_kmap(*foliop, kaddr); - } - - if (++n >= npages) - n = 0; - } while (n != start); - - return NULL; - -found: - SYSV_I(dir)->i_dir_start_lookup = n; - return de; -} - -int sysv_add_link(struct dentry *dentry, struct inode *inode) -{ - struct inode *dir = d_inode(dentry->d_parent); - const char * name = dentry->d_name.name; - int namelen = dentry->d_name.len; - struct folio *folio = NULL; - struct sysv_dir_entry * de; - unsigned long npages = dir_pages(dir); - unsigned long n; - char *kaddr; - loff_t pos; - int err; - - /* We take care of directory expansion in the same loop */ - for (n = 0; n <= npages; n++) { - kaddr = dir_get_folio(dir, n, &folio); - if (IS_ERR(kaddr)) - return PTR_ERR(kaddr); - de = (struct sysv_dir_entry *)kaddr; - kaddr += PAGE_SIZE - SYSV_DIRSIZE; - while ((char *)de <= kaddr) { - if (!de->inode) - goto got_it; - err = -EEXIST; - if (namecompare(namelen, SYSV_NAMELEN, name, de->name)) - goto out_folio; - de++; - } - folio_release_kmap(folio, kaddr); - } - BUG(); - return -EINVAL; - -got_it: - pos = folio_pos(folio) + offset_in_folio(folio, de); - folio_lock(folio); - err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE); - if (err) - goto out_unlock; - memcpy (de->name, name, namelen); - memset (de->name + namelen, 0, SYSV_DIRSIZE - namelen - 2); - de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); - dir_commit_chunk(folio, pos, SYSV_DIRSIZE); - inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); - mark_inode_dirty(dir); - err = sysv_handle_dirsync(dir); -out_folio: - folio_release_kmap(folio, kaddr); - return err; -out_unlock: - folio_unlock(folio); - goto out_folio; -} - -int sysv_delete_entry(struct sysv_dir_entry *de, struct folio *folio) -{ - struct inode *inode = folio->mapping->host; - loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); - int err; - - folio_lock(folio); - err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE); - if (err) { - folio_unlock(folio); - return err; - } - de->inode = 0; - dir_commit_chunk(folio, pos, SYSV_DIRSIZE); - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); - mark_inode_dirty(inode); - return sysv_handle_dirsync(inode); -} - -int sysv_make_empty(struct inode *inode, struct inode *dir) -{ - struct folio *folio = filemap_grab_folio(inode->i_mapping, 0); - struct sysv_dir_entry * de; - char *kaddr; - int err; - - if (IS_ERR(folio)) - return PTR_ERR(folio); - err = sysv_prepare_chunk(folio, 0, 2 * SYSV_DIRSIZE); - if (err) { - folio_unlock(folio); - goto fail; - } - kaddr = kmap_local_folio(folio, 0); - memset(kaddr, 0, folio_size(folio)); - - de = (struct sysv_dir_entry *)kaddr; - de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); - strcpy(de->name,"."); - de++; - de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), dir->i_ino); - strcpy(de->name,".."); - - 
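The directory code above works on fixed 16-byte slots: a 16-bit inode number followed by a NUL-padded name of at most 14 bytes (SYSV_DIRSIZE and SYSV_NAMELEN), with inode == 0 marking a free slot. A minimal standalone sketch of that layout and of the namecompare() matching rule — the demo_* names are hypothetical, and the fs16_to_cpu() byte-order conversion the real code performs is skipped for brevity:

	#include <stdint.h>
	#include <stdio.h>
	#include <string.h>

	/* Hypothetical stand-in for struct sysv_dir_entry: a 2-byte inode
	 * number plus a NUL-padded name, 16 bytes per slot, at most 14
	 * name bytes. inode == 0 means the slot is free. The real code
	 * also byte-swaps the inode field with fs16_to_cpu(). */
	struct demo_dirent {
		uint16_t inode;
		char name[14];
	};

	/* Matching rule of namecompare(): a name shorter than 14 bytes
	 * matches only if the stored name is NUL-terminated there. */
	static int demo_namematch(const char *want, size_t len, const char *stored)
	{
		if (len < 14 && stored[len] != '\0')
			return 0;
		return memcmp(want, stored, len) == 0;
	}

	/* Walk one block of entries the way sysv_find_entry() scans a folio. */
	static uint16_t demo_lookup(const struct demo_dirent *de, size_t nents,
				    const char *want)
	{
		size_t len = strlen(want);

		if (len > 14)	/* the caller rejects long names earlier */
			return 0;
		for (size_t i = 0; i < nents; i++) {
			if (de[i].inode == 0)
				continue;	/* free slot */
			if (demo_namematch(want, len, de[i].name))
				return de[i].inode;
		}
		return 0;
	}

	int main(void)
	{
		const struct demo_dirent blk[] = {
			{ 2, "." }, { 2, ".." }, { 0, "" }, { 5, "file1" },
		};

		printf("%u\n", demo_lookup(blk, 4, "file1"));	/* prints 5 */
		return 0;
	}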
kunmap_local(kaddr); - dir_commit_chunk(folio, 0, 2 * SYSV_DIRSIZE); - err = sysv_handle_dirsync(inode); -fail: - folio_put(folio); - return err; -} - -/* - * routine to check that the specified directory is empty (for rmdir) - */ -int sysv_empty_dir(struct inode * inode) -{ - struct super_block *sb = inode->i_sb; - struct folio *folio = NULL; - unsigned long i, npages = dir_pages(inode); - char *kaddr; - - for (i = 0; i < npages; i++) { - struct sysv_dir_entry *de; - - kaddr = dir_get_folio(inode, i, &folio); - if (IS_ERR(kaddr)) - continue; - - de = (struct sysv_dir_entry *)kaddr; - kaddr += folio_size(folio) - SYSV_DIRSIZE; - - for ( ;(char *)de <= kaddr; de++) { - if (!de->inode) - continue; - /* check for . and .. */ - if (de->name[0] != '.') - goto not_empty; - if (!de->name[1]) { - if (de->inode == cpu_to_fs16(SYSV_SB(sb), - inode->i_ino)) - continue; - goto not_empty; - } - if (de->name[1] != '.' || de->name[2]) - goto not_empty; - } - folio_release_kmap(folio, kaddr); - } - return 1; - -not_empty: - folio_release_kmap(folio, kaddr); - return 0; -} - -/* Releases the page */ -int sysv_set_link(struct sysv_dir_entry *de, struct folio *folio, - struct inode *inode) -{ - struct inode *dir = folio->mapping->host; - loff_t pos = folio_pos(folio) + offset_in_folio(folio, de); - int err; - - folio_lock(folio); - err = sysv_prepare_chunk(folio, pos, SYSV_DIRSIZE); - if (err) { - folio_unlock(folio); - return err; - } - de->inode = cpu_to_fs16(SYSV_SB(inode->i_sb), inode->i_ino); - dir_commit_chunk(folio, pos, SYSV_DIRSIZE); - inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); - mark_inode_dirty(dir); - return sysv_handle_dirsync(inode); -} - -/* - * Calls to dir_get_folio()/folio_release_kmap() must be nested according to the - * rules documented in mm/highmem.rst. - * - * sysv_dotdot() acts as a call to dir_get_folio() and must be treated - * accordingly for nesting purposes. - */ -struct sysv_dir_entry *sysv_dotdot(struct inode *dir, struct folio **foliop) -{ - struct sysv_dir_entry *de = dir_get_folio(dir, 0, foliop); - - if (IS_ERR(de)) - return NULL; - /* ".." is the second directory entry */ - return de + 1; -} - -ino_t sysv_inode_by_name(struct dentry *dentry) -{ - struct folio *folio; - struct sysv_dir_entry *de = sysv_find_entry (dentry, &folio); - ino_t res = 0; - - if (de) { - res = fs16_to_cpu(SYSV_SB(dentry->d_sb), de->inode); - folio_release_kmap(folio, de); - } - return res; -} diff --git a/fs/sysv/file.c b/fs/sysv/file.c deleted file mode 100644 index c645f60bdb7f..000000000000 --- a/fs/sysv/file.c +++ /dev/null @@ -1,59 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/sysv/file.c - * - * minix/file.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * coh/file.c - * Copyright (C) 1993 Pascal Haible, Bruno Haible - * - * sysv/file.c - * Copyright (C) 1993 Bruno Haible - * - * SystemV/Coherent regular file handling primitives - */ - -#include "sysv.h" - -/* - * We have mostly NULLs here: the current defaults are OK for - * the coh filesystem. 
- */ -const struct file_operations sysv_file_operations = { - .llseek = generic_file_llseek, - .read_iter = generic_file_read_iter, - .write_iter = generic_file_write_iter, - .mmap = generic_file_mmap, - .fsync = generic_file_fsync, - .splice_read = filemap_splice_read, -}; - -static int sysv_setattr(struct mnt_idmap *idmap, - struct dentry *dentry, struct iattr *attr) -{ - struct inode *inode = d_inode(dentry); - int error; - - error = setattr_prepare(&nop_mnt_idmap, dentry, attr); - if (error) - return error; - - if ((attr->ia_valid & ATTR_SIZE) && - attr->ia_size != i_size_read(inode)) { - error = inode_newsize_ok(inode, attr->ia_size); - if (error) - return error; - truncate_setsize(inode, attr->ia_size); - sysv_truncate(inode); - } - - setattr_copy(&nop_mnt_idmap, inode, attr); - mark_inode_dirty(inode); - return 0; -} - -const struct inode_operations sysv_file_inode_operations = { - .setattr = sysv_setattr, - .getattr = sysv_getattr, -}; diff --git a/fs/sysv/ialloc.c b/fs/sysv/ialloc.c deleted file mode 100644 index 269df6d49815..000000000000 --- a/fs/sysv/ialloc.c +++ /dev/null @@ -1,235 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/sysv/ialloc.c - * - * minix/bitmap.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * ext/freelists.c - * Copyright (C) 1992 Remy Card (card@masi.ibp.fr) - * - * xenix/alloc.c - * Copyright (C) 1992 Doug Evans - * - * coh/alloc.c - * Copyright (C) 1993 Pascal Haible, Bruno Haible - * - * sysv/ialloc.c - * Copyright (C) 1993 Bruno Haible - * - * This file contains code for allocating/freeing inodes. - */ - -#include <linux/kernel.h> -#include <linux/stddef.h> -#include <linux/sched.h> -#include <linux/stat.h> -#include <linux/string.h> -#include <linux/buffer_head.h> -#include <linux/writeback.h> -#include "sysv.h" - -/* We don't trust the value of - sb->sv_sbd2->s_tinode = *sb->sv_sb_total_free_inodes - but we nevertheless keep it up to date. */ - -/* An inode on disk is considered free if both i_mode == 0 and i_nlink == 0. 
*/ - -/* return &sb->sv_sb_fic_inodes[i] = &sbd->s_inode[i]; */ -static inline sysv_ino_t * -sv_sb_fic_inode(struct super_block * sb, unsigned int i) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - - if (sbi->s_bh1 == sbi->s_bh2) - return &sbi->s_sb_fic_inodes[i]; - else { - /* 512 byte Xenix FS */ - unsigned int offset = offsetof(struct xenix_super_block, s_inode[i]); - if (offset < 512) - return (sysv_ino_t*)(sbi->s_sbd1 + offset); - else - return (sysv_ino_t*)(sbi->s_sbd2 + offset); - } -} - -struct sysv_inode * -sysv_raw_inode(struct super_block *sb, unsigned ino, struct buffer_head **bh) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - struct sysv_inode *res; - int block = sbi->s_firstinodezone + sbi->s_block_base; - - block += (ino-1) >> sbi->s_inodes_per_block_bits; - *bh = sb_bread(sb, block); - if (!*bh) - return NULL; - res = (struct sysv_inode *)(*bh)->b_data; - return res + ((ino-1) & sbi->s_inodes_per_block_1); -} - -static int refill_free_cache(struct super_block *sb) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - struct buffer_head * bh; - struct sysv_inode * raw_inode; - int i = 0, ino; - - ino = SYSV_ROOT_INO+1; - raw_inode = sysv_raw_inode(sb, ino, &bh); - if (!raw_inode) - goto out; - while (ino <= sbi->s_ninodes) { - if (raw_inode->i_mode == 0 && raw_inode->i_nlink == 0) { - *sv_sb_fic_inode(sb,i++) = cpu_to_fs16(SYSV_SB(sb), ino); - if (i == sbi->s_fic_size) - break; - } - if ((ino++ & sbi->s_inodes_per_block_1) == 0) { - brelse(bh); - raw_inode = sysv_raw_inode(sb, ino, &bh); - if (!raw_inode) - goto out; - } else - raw_inode++; - } - brelse(bh); -out: - return i; -} - -void sysv_free_inode(struct inode * inode) -{ - struct super_block *sb = inode->i_sb; - struct sysv_sb_info *sbi = SYSV_SB(sb); - unsigned int ino; - struct buffer_head * bh; - struct sysv_inode * raw_inode; - unsigned count; - - sb = inode->i_sb; - ino = inode->i_ino; - if (ino <= SYSV_ROOT_INO || ino > sbi->s_ninodes) { - printk("sysv_free_inode: inode 0,1,2 or nonexistent inode\n"); - return; - } - raw_inode = sysv_raw_inode(sb, ino, &bh); - if (!raw_inode) { - printk("sysv_free_inode: unable to read inode block on device " - "%s\n", inode->i_sb->s_id); - return; - } - mutex_lock(&sbi->s_lock); - count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count); - if (count < sbi->s_fic_size) { - *sv_sb_fic_inode(sb,count++) = cpu_to_fs16(sbi, ino); - *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); - } - fs16_add(sbi, sbi->s_sb_total_free_inodes, 1); - dirty_sb(sb); - memset(raw_inode, 0, sizeof(struct sysv_inode)); - mark_buffer_dirty(bh); - mutex_unlock(&sbi->s_lock); - brelse(bh); -} - -struct inode * sysv_new_inode(const struct inode * dir, umode_t mode) -{ - struct super_block *sb = dir->i_sb; - struct sysv_sb_info *sbi = SYSV_SB(sb); - struct inode *inode; - sysv_ino_t ino; - unsigned count; - struct writeback_control wbc = { - .sync_mode = WB_SYNC_NONE - }; - - inode = new_inode(sb); - if (!inode) - return ERR_PTR(-ENOMEM); - - mutex_lock(&sbi->s_lock); - count = fs16_to_cpu(sbi, *sbi->s_sb_fic_count); - if (count == 0 || (*sv_sb_fic_inode(sb,count-1) == 0)) { - count = refill_free_cache(sb); - if (count == 0) { - iput(inode); - mutex_unlock(&sbi->s_lock); - return ERR_PTR(-ENOSPC); - } - } - /* Now count > 0. 
*/ - ino = *sv_sb_fic_inode(sb,--count); - *sbi->s_sb_fic_count = cpu_to_fs16(sbi, count); - fs16_add(sbi, sbi->s_sb_total_free_inodes, -1); - dirty_sb(sb); - inode_init_owner(&nop_mnt_idmap, inode, dir, mode); - inode->i_ino = fs16_to_cpu(sbi, ino); - simple_inode_init_ts(inode); - inode->i_blocks = 0; - memset(SYSV_I(inode)->i_data, 0, sizeof(SYSV_I(inode)->i_data)); - SYSV_I(inode)->i_dir_start_lookup = 0; - insert_inode_hash(inode); - mark_inode_dirty(inode); - - sysv_write_inode(inode, &wbc); /* ensure inode not allocated again */ - mark_inode_dirty(inode); /* cleared by sysv_write_inode() */ - /* That's it. */ - mutex_unlock(&sbi->s_lock); - return inode; -} - -unsigned long sysv_count_free_inodes(struct super_block * sb) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - struct buffer_head * bh; - struct sysv_inode * raw_inode; - int ino, count, sb_count; - - mutex_lock(&sbi->s_lock); - - sb_count = fs16_to_cpu(sbi, *sbi->s_sb_total_free_inodes); - - if (0) - goto trust_sb; - - /* this causes a lot of disk traffic ... */ - count = 0; - ino = SYSV_ROOT_INO+1; - raw_inode = sysv_raw_inode(sb, ino, &bh); - if (!raw_inode) - goto Eio; - while (ino <= sbi->s_ninodes) { - if (raw_inode->i_mode == 0 && raw_inode->i_nlink == 0) - count++; - if ((ino++ & sbi->s_inodes_per_block_1) == 0) { - brelse(bh); - raw_inode = sysv_raw_inode(sb, ino, &bh); - if (!raw_inode) - goto Eio; - } else - raw_inode++; - } - brelse(bh); - if (count != sb_count) - goto Einval; -out: - mutex_unlock(&sbi->s_lock); - return count; - -Einval: - printk("sysv_count_free_inodes: " - "free inode count was %d, correcting to %d\n", - sb_count, count); - if (!sb_rdonly(sb)) { - *sbi->s_sb_total_free_inodes = cpu_to_fs16(SYSV_SB(sb), count); - dirty_sb(sb); - } - goto out; - -Eio: - printk("sysv_count_free_inodes: unable to read inode table\n"); -trust_sb: - count = sb_count; - goto out; -} diff --git a/fs/sysv/inode.c b/fs/sysv/inode.c deleted file mode 100644 index 76bc2d5e75a9..000000000000 --- a/fs/sysv/inode.c +++ /dev/null @@ -1,354 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/sysv/inode.c - * - * minix/inode.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * xenix/inode.c - * Copyright (C) 1992 Doug Evans - * - * coh/inode.c - * Copyright (C) 1993 Pascal Haible, Bruno Haible - * - * sysv/inode.c - * Copyright (C) 1993 Paul B. Monday - * - * sysv/inode.c - * Copyright (C) 1993 Bruno Haible - * Copyright (C) 1997, 1998 Krzysztof G. Baranowski - * - * This file contains code for allocating/freeing inodes and for read/writing - * the superblock. - */ - -#include <linux/highuid.h> -#include <linux/slab.h> -#include <linux/init.h> -#include <linux/buffer_head.h> -#include <linux/vfs.h> -#include <linux/writeback.h> -#include <linux/namei.h> -#include <asm/byteorder.h> -#include "sysv.h" - -static int sysv_sync_fs(struct super_block *sb, int wait) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - u32 time = (u32)ktime_get_real_seconds(), old_time; - - mutex_lock(&sbi->s_lock); - - /* - * If we are going to write out the super block, - * then attach current time stamp. - * But if the filesystem was marked clean, keep it clean. 
- */ - old_time = fs32_to_cpu(sbi, *sbi->s_sb_time); - if (sbi->s_type == FSTYPE_SYSV4) { - if (*sbi->s_sb_state == cpu_to_fs32(sbi, 0x7c269d38u - old_time)) - *sbi->s_sb_state = cpu_to_fs32(sbi, 0x7c269d38u - time); - *sbi->s_sb_time = cpu_to_fs32(sbi, time); - mark_buffer_dirty(sbi->s_bh2); - } - - mutex_unlock(&sbi->s_lock); - - return 0; -} - -static int sysv_remount(struct super_block *sb, int *flags, char *data) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - - sync_filesystem(sb); - if (sbi->s_forced_ro) - *flags |= SB_RDONLY; - return 0; -} - -static void sysv_put_super(struct super_block *sb) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - - if (!sb_rdonly(sb)) { - /* XXX ext2 also updates the state here */ - mark_buffer_dirty(sbi->s_bh1); - if (sbi->s_bh1 != sbi->s_bh2) - mark_buffer_dirty(sbi->s_bh2); - } - - brelse(sbi->s_bh1); - if (sbi->s_bh1 != sbi->s_bh2) - brelse(sbi->s_bh2); - - kfree(sbi); -} - -static int sysv_statfs(struct dentry *dentry, struct kstatfs *buf) -{ - struct super_block *sb = dentry->d_sb; - struct sysv_sb_info *sbi = SYSV_SB(sb); - u64 id = huge_encode_dev(sb->s_bdev->bd_dev); - - buf->f_type = sb->s_magic; - buf->f_bsize = sb->s_blocksize; - buf->f_blocks = sbi->s_ndatazones; - buf->f_bavail = buf->f_bfree = sysv_count_free_blocks(sb); - buf->f_files = sbi->s_ninodes; - buf->f_ffree = sysv_count_free_inodes(sb); - buf->f_namelen = SYSV_NAMELEN; - buf->f_fsid = u64_to_fsid(id); - return 0; -} - -/* - * NXI <-> N0XI for PDP, XIN <-> XIN0 for le32, NIX <-> 0NIX for be32 - */ -static inline void read3byte(struct sysv_sb_info *sbi, - unsigned char * from, unsigned char * to) -{ - if (sbi->s_bytesex == BYTESEX_PDP) { - to[0] = from[0]; - to[1] = 0; - to[2] = from[1]; - to[3] = from[2]; - } else if (sbi->s_bytesex == BYTESEX_LE) { - to[0] = from[0]; - to[1] = from[1]; - to[2] = from[2]; - to[3] = 0; - } else { - to[0] = 0; - to[1] = from[0]; - to[2] = from[1]; - to[3] = from[2]; - } -} - -static inline void write3byte(struct sysv_sb_info *sbi, - unsigned char * from, unsigned char * to) -{ - if (sbi->s_bytesex == BYTESEX_PDP) { - to[0] = from[0]; - to[1] = from[2]; - to[2] = from[3]; - } else if (sbi->s_bytesex == BYTESEX_LE) { - to[0] = from[0]; - to[1] = from[1]; - to[2] = from[2]; - } else { - to[0] = from[1]; - to[1] = from[2]; - to[2] = from[3]; - } -} - -static const struct inode_operations sysv_symlink_inode_operations = { - .get_link = page_get_link, - .getattr = sysv_getattr, -}; - -void sysv_set_inode(struct inode *inode, dev_t rdev) -{ - if (S_ISREG(inode->i_mode)) { - inode->i_op = &sysv_file_inode_operations; - inode->i_fop = &sysv_file_operations; - inode->i_mapping->a_ops = &sysv_aops; - } else if (S_ISDIR(inode->i_mode)) { - inode->i_op = &sysv_dir_inode_operations; - inode->i_fop = &sysv_dir_operations; - inode->i_mapping->a_ops = &sysv_aops; - } else if (S_ISLNK(inode->i_mode)) { - inode->i_op = &sysv_symlink_inode_operations; - inode_nohighmem(inode); - inode->i_mapping->a_ops = &sysv_aops; - } else - init_special_inode(inode, inode->i_mode, rdev); -} - -struct inode *sysv_iget(struct super_block *sb, unsigned int ino) -{ - struct sysv_sb_info * sbi = SYSV_SB(sb); - struct buffer_head * bh; - struct sysv_inode * raw_inode; - struct sysv_inode_info * si; - struct inode *inode; - unsigned int block; - - if (!ino || ino > sbi->s_ninodes) { - printk("Bad inode number on dev %s: %d is out of range\n", - sb->s_id, ino); - return ERR_PTR(-EIO); - } - - inode = iget_locked(sb, ino); - if (!inode) - return ERR_PTR(-ENOMEM); - if (!(inode->i_state & 
I_NEW)) - return inode; - - raw_inode = sysv_raw_inode(sb, ino, &bh); - if (!raw_inode) { - printk("Major problem: unable to read inode from dev %s\n", - inode->i_sb->s_id); - goto bad_inode; - } - /* SystemV FS: kludge permissions if ino==SYSV_ROOT_INO ?? */ - inode->i_mode = fs16_to_cpu(sbi, raw_inode->i_mode); - i_uid_write(inode, (uid_t)fs16_to_cpu(sbi, raw_inode->i_uid)); - i_gid_write(inode, (gid_t)fs16_to_cpu(sbi, raw_inode->i_gid)); - set_nlink(inode, fs16_to_cpu(sbi, raw_inode->i_nlink)); - inode->i_size = fs32_to_cpu(sbi, raw_inode->i_size); - inode_set_atime(inode, fs32_to_cpu(sbi, raw_inode->i_atime), 0); - inode_set_mtime(inode, fs32_to_cpu(sbi, raw_inode->i_mtime), 0); - inode_set_ctime(inode, fs32_to_cpu(sbi, raw_inode->i_ctime), 0); - inode->i_blocks = 0; - - si = SYSV_I(inode); - for (block = 0; block < 10+1+1+1; block++) - read3byte(sbi, &raw_inode->i_data[3*block], - (u8 *)&si->i_data[block]); - brelse(bh); - si->i_dir_start_lookup = 0; - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) - sysv_set_inode(inode, - old_decode_dev(fs32_to_cpu(sbi, si->i_data[0]))); - else - sysv_set_inode(inode, 0); - unlock_new_inode(inode); - return inode; - -bad_inode: - iget_failed(inode); - return ERR_PTR(-EIO); -} - -static int __sysv_write_inode(struct inode *inode, int wait) -{ - struct super_block * sb = inode->i_sb; - struct sysv_sb_info * sbi = SYSV_SB(sb); - struct buffer_head * bh; - struct sysv_inode * raw_inode; - struct sysv_inode_info * si; - unsigned int ino, block; - int err = 0; - - ino = inode->i_ino; - if (!ino || ino > sbi->s_ninodes) { - printk("Bad inode number on dev %s: %d is out of range\n", - inode->i_sb->s_id, ino); - return -EIO; - } - raw_inode = sysv_raw_inode(sb, ino, &bh); - if (!raw_inode) { - printk("unable to read i-node block\n"); - return -EIO; - } - - raw_inode->i_mode = cpu_to_fs16(sbi, inode->i_mode); - raw_inode->i_uid = cpu_to_fs16(sbi, fs_high2lowuid(i_uid_read(inode))); - raw_inode->i_gid = cpu_to_fs16(sbi, fs_high2lowgid(i_gid_read(inode))); - raw_inode->i_nlink = cpu_to_fs16(sbi, inode->i_nlink); - raw_inode->i_size = cpu_to_fs32(sbi, inode->i_size); - raw_inode->i_atime = cpu_to_fs32(sbi, inode_get_atime_sec(inode)); - raw_inode->i_mtime = cpu_to_fs32(sbi, inode_get_mtime_sec(inode)); - raw_inode->i_ctime = cpu_to_fs32(sbi, inode_get_ctime_sec(inode)); - - si = SYSV_I(inode); - if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) - si->i_data[0] = cpu_to_fs32(sbi, old_encode_dev(inode->i_rdev)); - for (block = 0; block < 10+1+1+1; block++) - write3byte(sbi, (u8 *)&si->i_data[block], - &raw_inode->i_data[3*block]); - mark_buffer_dirty(bh); - if (wait) { - sync_dirty_buffer(bh); - if (buffer_req(bh) && !buffer_uptodate(bh)) { - printk ("IO error syncing sysv inode [%s:%08x]\n", - sb->s_id, ino); - err = -EIO; - } - } - brelse(bh); - return err; -} - -int sysv_write_inode(struct inode *inode, struct writeback_control *wbc) -{ - return __sysv_write_inode(inode, wbc->sync_mode == WB_SYNC_ALL); -} - -int sysv_sync_inode(struct inode *inode) -{ - return __sysv_write_inode(inode, 1); -} - -static void sysv_evict_inode(struct inode *inode) -{ - truncate_inode_pages_final(&inode->i_data); - if (!inode->i_nlink) { - inode->i_size = 0; - sysv_truncate(inode); - } - invalidate_inode_buffers(inode); - clear_inode(inode); - if (!inode->i_nlink) - sysv_free_inode(inode); -} - -static struct kmem_cache *sysv_inode_cachep; - -static struct inode *sysv_alloc_inode(struct super_block *sb) -{ - struct sysv_inode_info *si; - - si = alloc_inode_sb(sb, 
sysv_inode_cachep, GFP_KERNEL); - if (!si) - return NULL; - return &si->vfs_inode; -} - -static void sysv_free_in_core_inode(struct inode *inode) -{ - kmem_cache_free(sysv_inode_cachep, SYSV_I(inode)); -} - -static void init_once(void *p) -{ - struct sysv_inode_info *si = (struct sysv_inode_info *)p; - - inode_init_once(&si->vfs_inode); -} - -const struct super_operations sysv_sops = { - .alloc_inode = sysv_alloc_inode, - .free_inode = sysv_free_in_core_inode, - .write_inode = sysv_write_inode, - .evict_inode = sysv_evict_inode, - .put_super = sysv_put_super, - .sync_fs = sysv_sync_fs, - .remount_fs = sysv_remount, - .statfs = sysv_statfs, -}; - -int __init sysv_init_icache(void) -{ - sysv_inode_cachep = kmem_cache_create("sysv_inode_cache", - sizeof(struct sysv_inode_info), 0, - SLAB_RECLAIM_ACCOUNT|SLAB_ACCOUNT, - init_once); - if (!sysv_inode_cachep) - return -ENOMEM; - return 0; -} - -void sysv_destroy_icache(void) -{ - /* - * Make sure all delayed rcu free inodes are flushed before we - * destroy cache. - */ - rcu_barrier(); - kmem_cache_destroy(sysv_inode_cachep); -} diff --git a/fs/sysv/itree.c b/fs/sysv/itree.c deleted file mode 100644 index 451e95f474fa..000000000000 --- a/fs/sysv/itree.c +++ /dev/null @@ -1,511 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/sysv/itree.c - * - * Handling of indirect blocks' trees. - * AV, Sep--Dec 2000 - */ - -#include <linux/buffer_head.h> -#include <linux/mount.h> -#include <linux/mpage.h> -#include <linux/string.h> -#include "sysv.h" - -enum {DIRECT = 10, DEPTH = 4}; /* Have triple indirect */ - -static inline void dirty_indirect(struct buffer_head *bh, struct inode *inode) -{ - mark_buffer_dirty_inode(bh, inode); - if (IS_SYNC(inode)) - sync_dirty_buffer(bh); -} - -static int block_to_path(struct inode *inode, long block, int offsets[DEPTH]) -{ - struct super_block *sb = inode->i_sb; - struct sysv_sb_info *sbi = SYSV_SB(sb); - int ptrs_bits = sbi->s_ind_per_block_bits; - unsigned long indirect_blocks = sbi->s_ind_per_block, - double_blocks = sbi->s_ind_per_block_2; - int n = 0; - - if (block < 0) { - printk("sysv_block_map: block < 0\n"); - } else if (block < DIRECT) { - offsets[n++] = block; - } else if ( (block -= DIRECT) < indirect_blocks) { - offsets[n++] = DIRECT; - offsets[n++] = block; - } else if ((block -= indirect_blocks) < double_blocks) { - offsets[n++] = DIRECT+1; - offsets[n++] = block >> ptrs_bits; - offsets[n++] = block & (indirect_blocks - 1); - } else if (((block -= double_blocks) >> (ptrs_bits * 2)) < indirect_blocks) { - offsets[n++] = DIRECT+2; - offsets[n++] = block >> (ptrs_bits * 2); - offsets[n++] = (block >> ptrs_bits) & (indirect_blocks - 1); - offsets[n++] = block & (indirect_blocks - 1); - } else { - /* nothing */; - } - return n; -} - -static inline int block_to_cpu(struct sysv_sb_info *sbi, sysv_zone_t nr) -{ - return sbi->s_block_base + fs32_to_cpu(sbi, nr); -} - -typedef struct { - sysv_zone_t *p; - sysv_zone_t key; - struct buffer_head *bh; -} Indirect; - -static DEFINE_RWLOCK(pointers_lock); - -static inline void add_chain(Indirect *p, struct buffer_head *bh, sysv_zone_t *v) -{ - p->key = *(p->p = v); - p->bh = bh; -} - -static inline int verify_chain(Indirect *from, Indirect *to) -{ - while (from <= to && from->key == *from->p) - from++; - return (from > to); -} - -static inline sysv_zone_t *block_end(struct buffer_head *bh) -{ - return (sysv_zone_t*)((char*)bh->b_data + bh->b_size); -} - -static Indirect *get_branch(struct inode *inode, - int depth, - int offsets[], - Indirect chain[], - int 
*err) -{ - struct super_block *sb = inode->i_sb; - Indirect *p = chain; - struct buffer_head *bh; - - *err = 0; - add_chain(chain, NULL, SYSV_I(inode)->i_data + *offsets); - if (!p->key) - goto no_block; - while (--depth) { - int block = block_to_cpu(SYSV_SB(sb), p->key); - bh = sb_bread(sb, block); - if (!bh) - goto failure; - read_lock(&pointers_lock); - if (!verify_chain(chain, p)) - goto changed; - add_chain(++p, bh, (sysv_zone_t*)bh->b_data + *++offsets); - read_unlock(&pointers_lock); - if (!p->key) - goto no_block; - } - return NULL; - -changed: - read_unlock(&pointers_lock); - brelse(bh); - *err = -EAGAIN; - goto no_block; -failure: - *err = -EIO; -no_block: - return p; -} - -static int alloc_branch(struct inode *inode, - int num, - int *offsets, - Indirect *branch) -{ - int blocksize = inode->i_sb->s_blocksize; - int n = 0; - int i; - - branch[0].key = sysv_new_block(inode->i_sb); - if (branch[0].key) for (n = 1; n < num; n++) { - struct buffer_head *bh; - int parent; - /* Allocate the next block */ - branch[n].key = sysv_new_block(inode->i_sb); - if (!branch[n].key) - break; - /* - * Get buffer_head for parent block, zero it out and set - * the pointer to new one, then send parent to disk. - */ - parent = block_to_cpu(SYSV_SB(inode->i_sb), branch[n-1].key); - bh = sb_getblk(inode->i_sb, parent); - if (!bh) { - sysv_free_block(inode->i_sb, branch[n].key); - break; - } - lock_buffer(bh); - memset(bh->b_data, 0, blocksize); - branch[n].bh = bh; - branch[n].p = (sysv_zone_t*) bh->b_data + offsets[n]; - *branch[n].p = branch[n].key; - set_buffer_uptodate(bh); - unlock_buffer(bh); - dirty_indirect(bh, inode); - } - if (n == num) - return 0; - - /* Allocation failed, free what we already allocated */ - for (i = 1; i < n; i++) - bforget(branch[i].bh); - for (i = 0; i < n; i++) - sysv_free_block(inode->i_sb, branch[i].key); - return -ENOSPC; -} - -static inline int splice_branch(struct inode *inode, - Indirect chain[], - Indirect *where, - int num) -{ - int i; - - /* Verify that place we are splicing to is still there and vacant */ - write_lock(&pointers_lock); - if (!verify_chain(chain, where-1) || *where->p) - goto changed; - *where->p = where->key; - write_unlock(&pointers_lock); - - inode_set_ctime_current(inode); - - /* had we spliced it onto indirect block? 
*/ - if (where->bh) - dirty_indirect(where->bh, inode); - - if (IS_SYNC(inode)) - sysv_sync_inode(inode); - else - mark_inode_dirty(inode); - return 0; - -changed: - write_unlock(&pointers_lock); - for (i = 1; i < num; i++) - bforget(where[i].bh); - for (i = 0; i < num; i++) - sysv_free_block(inode->i_sb, where[i].key); - return -EAGAIN; -} - -static int get_block(struct inode *inode, sector_t iblock, struct buffer_head *bh_result, int create) -{ - int err = -EIO; - int offsets[DEPTH]; - Indirect chain[DEPTH]; - struct super_block *sb = inode->i_sb; - Indirect *partial; - int left; - int depth = block_to_path(inode, iblock, offsets); - - if (depth == 0) - goto out; - -reread: - partial = get_branch(inode, depth, offsets, chain, &err); - - /* Simplest case - block found, no allocation needed */ - if (!partial) { -got_it: - map_bh(bh_result, sb, block_to_cpu(SYSV_SB(sb), - chain[depth-1].key)); - /* Clean up and exit */ - partial = chain+depth-1; /* the whole chain */ - goto cleanup; - } - - /* Next simple case - plain lookup or failed read of indirect block */ - if (!create || err == -EIO) { -cleanup: - while (partial > chain) { - brelse(partial->bh); - partial--; - } -out: - return err; - } - - /* - * Indirect block might be removed by truncate while we were - * reading it. Handling of that case (forget what we've got and - * reread) is taken out of the main path. - */ - if (err == -EAGAIN) - goto changed; - - left = (chain + depth) - partial; - err = alloc_branch(inode, left, offsets+(partial-chain), partial); - if (err) - goto cleanup; - - if (splice_branch(inode, chain, partial, left) < 0) - goto changed; - - set_buffer_new(bh_result); - goto got_it; - -changed: - while (partial > chain) { - brelse(partial->bh); - partial--; - } - goto reread; -} - -static inline int all_zeroes(sysv_zone_t *p, sysv_zone_t *q) -{ - while (p < q) - if (*p++) - return 0; - return 1; -} - -static Indirect *find_shared(struct inode *inode, - int depth, - int offsets[], - Indirect chain[], - sysv_zone_t *top) -{ - Indirect *partial, *p; - int k, err; - - *top = 0; - for (k = depth; k > 1 && !offsets[k-1]; k--) - ; - partial = get_branch(inode, k, offsets, chain, &err); - - write_lock(&pointers_lock); - if (!partial) - partial = chain + k-1; - /* - * If the branch acquired continuation since we've looked at it - - * fine, it should all survive and (new) top doesn't belong to us. - */ - if (!partial->key && *partial->p) { - write_unlock(&pointers_lock); - goto no_top; - } - for (p=partial; p>chain && all_zeroes((sysv_zone_t*)p->bh->b_data,p->p); p--) - ; - /* - * OK, we've found the last block that must survive. The rest of our - * branch should be detached before unlocking. However, if that rest - * of branch is all ours and does not grow immediately from the inode - * it's easier to cheat and just decrement partial->p. 
- */ - if (p == chain + k - 1 && p > chain) { - p->p--; - } else { - *top = *p->p; - *p->p = 0; - } - write_unlock(&pointers_lock); - - while (partial > p) { - brelse(partial->bh); - partial--; - } -no_top: - return partial; -} - -static inline void free_data(struct inode *inode, sysv_zone_t *p, sysv_zone_t *q) -{ - for ( ; p < q ; p++) { - sysv_zone_t nr = *p; - if (nr) { - *p = 0; - sysv_free_block(inode->i_sb, nr); - mark_inode_dirty(inode); - } - } -} - -static void free_branches(struct inode *inode, sysv_zone_t *p, sysv_zone_t *q, int depth) -{ - struct buffer_head * bh; - struct super_block *sb = inode->i_sb; - - if (depth--) { - for ( ; p < q ; p++) { - int block; - sysv_zone_t nr = *p; - if (!nr) - continue; - *p = 0; - block = block_to_cpu(SYSV_SB(sb), nr); - bh = sb_bread(sb, block); - if (!bh) - continue; - free_branches(inode, (sysv_zone_t*)bh->b_data, - block_end(bh), depth); - bforget(bh); - sysv_free_block(sb, nr); - mark_inode_dirty(inode); - } - } else - free_data(inode, p, q); -} - -void sysv_truncate (struct inode * inode) -{ - sysv_zone_t *i_data = SYSV_I(inode)->i_data; - int offsets[DEPTH]; - Indirect chain[DEPTH]; - Indirect *partial; - sysv_zone_t nr = 0; - int n; - long iblock; - unsigned blocksize; - - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return; - - blocksize = inode->i_sb->s_blocksize; - iblock = (inode->i_size + blocksize-1) - >> inode->i_sb->s_blocksize_bits; - - block_truncate_page(inode->i_mapping, inode->i_size, get_block); - - n = block_to_path(inode, iblock, offsets); - if (n == 0) - return; - - if (n == 1) { - free_data(inode, i_data+offsets[0], i_data + DIRECT); - goto do_indirects; - } - - partial = find_shared(inode, n, offsets, chain, &nr); - /* Kill the top of shared branch (already detached) */ - if (nr) { - if (partial == chain) - mark_inode_dirty(inode); - else - dirty_indirect(partial->bh, inode); - free_branches(inode, &nr, &nr+1, (chain+n-1) - partial); - } - /* Clear the ends of indirect blocks on the shared branch */ - while (partial > chain) { - free_branches(inode, partial->p + 1, block_end(partial->bh), - (chain+n-1) - partial); - dirty_indirect(partial->bh, inode); - brelse (partial->bh); - partial--; - } -do_indirects: - /* Kill the remaining (whole) subtrees (== subtrees deeper than...) 
*/ - while (n < DEPTH) { - nr = i_data[DIRECT + n - 1]; - if (nr) { - i_data[DIRECT + n - 1] = 0; - mark_inode_dirty(inode); - free_branches(inode, &nr, &nr+1, n); - } - n++; - } - inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode)); - if (IS_SYNC(inode)) - sysv_sync_inode (inode); - else - mark_inode_dirty(inode); -} - -static unsigned sysv_nblocks(struct super_block *s, loff_t size) -{ - struct sysv_sb_info *sbi = SYSV_SB(s); - int ptrs_bits = sbi->s_ind_per_block_bits; - unsigned blocks, res, direct = DIRECT, i = DEPTH; - blocks = (size + s->s_blocksize - 1) >> s->s_blocksize_bits; - res = blocks; - while (--i && blocks > direct) { - blocks = ((blocks - direct - 1) >> ptrs_bits) + 1; - res += blocks; - direct = 1; - } - return res; -} - -int sysv_getattr(struct mnt_idmap *idmap, const struct path *path, - struct kstat *stat, u32 request_mask, unsigned int flags) -{ - struct super_block *s = path->dentry->d_sb; - generic_fillattr(&nop_mnt_idmap, request_mask, d_inode(path->dentry), - stat); - stat->blocks = (s->s_blocksize / 512) * sysv_nblocks(s, stat->size); - stat->blksize = s->s_blocksize; - return 0; -} - -static int sysv_writepages(struct address_space *mapping, - struct writeback_control *wbc) -{ - return mpage_writepages(mapping, wbc, get_block); -} - -static int sysv_read_folio(struct file *file, struct folio *folio) -{ - return block_read_full_folio(folio, get_block); -} - -int sysv_prepare_chunk(struct folio *folio, loff_t pos, unsigned len) -{ - return __block_write_begin(folio, pos, len, get_block); -} - -static void sysv_write_failed(struct address_space *mapping, loff_t to) -{ - struct inode *inode = mapping->host; - - if (to > inode->i_size) { - truncate_pagecache(inode, inode->i_size); - sysv_truncate(inode); - } -} - -static int sysv_write_begin(struct file *file, struct address_space *mapping, - loff_t pos, unsigned len, - struct folio **foliop, void **fsdata) -{ - int ret; - - ret = block_write_begin(mapping, pos, len, foliop, get_block); - if (unlikely(ret)) - sysv_write_failed(mapping, pos + len); - - return ret; -} - -static sector_t sysv_bmap(struct address_space *mapping, sector_t block) -{ - return generic_block_bmap(mapping,block,get_block); -} - -const struct address_space_operations sysv_aops = { - .dirty_folio = block_dirty_folio, - .invalidate_folio = block_invalidate_folio, - .read_folio = sysv_read_folio, - .writepages = sysv_writepages, - .write_begin = sysv_write_begin, - .write_end = generic_write_end, - .migrate_folio = buffer_migrate_folio, - .bmap = sysv_bmap -}; diff --git a/fs/sysv/namei.c b/fs/sysv/namei.c deleted file mode 100644 index fb8bd8437872..000000000000 --- a/fs/sysv/namei.c +++ /dev/null @@ -1,280 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0 -/* - * linux/fs/sysv/namei.c - * - * minix/namei.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * coh/namei.c - * Copyright (C) 1993 Pascal Haible, Bruno Haible - * - * sysv/namei.c - * Copyright (C) 1993 Bruno Haible - * Copyright (C) 1997, 1998 Krzysztof G. 
Baranowski - */ - -#include <linux/pagemap.h> -#include "sysv.h" - -static int add_nondir(struct dentry *dentry, struct inode *inode) -{ - int err = sysv_add_link(dentry, inode); - if (!err) { - d_instantiate(dentry, inode); - return 0; - } - inode_dec_link_count(inode); - iput(inode); - return err; -} - -static struct dentry *sysv_lookup(struct inode * dir, struct dentry * dentry, unsigned int flags) -{ - struct inode * inode = NULL; - ino_t ino; - - if (dentry->d_name.len > SYSV_NAMELEN) - return ERR_PTR(-ENAMETOOLONG); - ino = sysv_inode_by_name(dentry); - if (ino) - inode = sysv_iget(dir->i_sb, ino); - return d_splice_alias(inode, dentry); -} - -static int sysv_mknod(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode, dev_t rdev) -{ - struct inode * inode; - int err; - - if (!old_valid_dev(rdev)) - return -EINVAL; - - inode = sysv_new_inode(dir, mode); - err = PTR_ERR(inode); - - if (!IS_ERR(inode)) { - sysv_set_inode(inode, rdev); - mark_inode_dirty(inode); - err = add_nondir(dentry, inode); - } - return err; -} - -static int sysv_create(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode, bool excl) -{ - return sysv_mknod(&nop_mnt_idmap, dir, dentry, mode, 0); -} - -static int sysv_symlink(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, const char *symname) -{ - int err = -ENAMETOOLONG; - int l = strlen(symname)+1; - struct inode * inode; - - if (l > dir->i_sb->s_blocksize) - goto out; - - inode = sysv_new_inode(dir, S_IFLNK|0777); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out; - - sysv_set_inode(inode, 0); - err = page_symlink(inode, symname, l); - if (err) - goto out_fail; - - mark_inode_dirty(inode); - err = add_nondir(dentry, inode); -out: - return err; - -out_fail: - inode_dec_link_count(inode); - iput(inode); - goto out; -} - -static int sysv_link(struct dentry * old_dentry, struct inode * dir, - struct dentry * dentry) -{ - struct inode *inode = d_inode(old_dentry); - - inode_set_ctime_current(inode); - inode_inc_link_count(inode); - ihold(inode); - - return add_nondir(dentry, inode); -} - -static int sysv_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) -{ - struct inode * inode; - int err; - - inode_inc_link_count(dir); - - inode = sysv_new_inode(dir, S_IFDIR|mode); - err = PTR_ERR(inode); - if (IS_ERR(inode)) - goto out_dir; - - sysv_set_inode(inode, 0); - - inode_inc_link_count(inode); - - err = sysv_make_empty(inode, dir); - if (err) - goto out_fail; - - err = sysv_add_link(dentry, inode); - if (err) - goto out_fail; - - d_instantiate(dentry, inode); -out: - return err; - -out_fail: - inode_dec_link_count(inode); - inode_dec_link_count(inode); - iput(inode); -out_dir: - inode_dec_link_count(dir); - goto out; -} - -static int sysv_unlink(struct inode * dir, struct dentry * dentry) -{ - struct inode * inode = d_inode(dentry); - struct folio *folio; - struct sysv_dir_entry * de; - int err; - - de = sysv_find_entry(dentry, &folio); - if (!de) - return -ENOENT; - - err = sysv_delete_entry(de, folio); - if (!err) { - inode_set_ctime_to_ts(inode, inode_get_ctime(dir)); - inode_dec_link_count(inode); - } - folio_release_kmap(folio, de); - return err; -} - -static int sysv_rmdir(struct inode * dir, struct dentry * dentry) -{ - struct inode *inode = d_inode(dentry); - int err = -ENOTEMPTY; - - if (sysv_empty_dir(inode)) { - err = sysv_unlink(dir, dentry); - if (!err) { - inode->i_size = 0; - inode_dec_link_count(inode); - inode_dec_link_count(dir); - } 
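The mkdir/rmdir paths above keep i_nlink in step with the "." and ".." entries: a new directory ends up with a link count of 2 (its "." entry plus the entry in its parent) and adds one link to the parent for its "..", and rmdir reverses both steps. A toy model of that bookkeeping, with hypothetical demo_* names and plain ints standing in for inodes; the real code reaches the same totals through several inode_inc/dec_link_count() calls:

	#include <assert.h>

	struct demo_dir { int nlink; };

	static void demo_mkdir(struct demo_dir *parent, struct demo_dir *child)
	{
		parent->nlink++;	/* child's ".." entry */
		child->nlink = 2;	/* "." plus the entry in parent */
	}

	static void demo_rmdir(struct demo_dir *parent, struct demo_dir *child)
	{
		child->nlink = 0;	/* "." and the parent's entry both go */
		parent->nlink--;	/* child's ".." is gone */
	}

	int main(void)
	{
		struct demo_dir root = { .nlink = 2 }, sub;

		demo_mkdir(&root, &sub);
		assert(root.nlink == 3 && sub.nlink == 2);
		demo_rmdir(&root, &sub);
		assert(root.nlink == 2 && sub.nlink == 0);
		return 0;
	}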
- } - return err; -} - -/* - * Anybody can rename anything with this: the permission checks are left to the - * higher-level routines. - */ -static int sysv_rename(struct mnt_idmap *idmap, struct inode *old_dir, - struct dentry *old_dentry, struct inode *new_dir, - struct dentry *new_dentry, unsigned int flags) -{ - struct inode * old_inode = d_inode(old_dentry); - struct inode * new_inode = d_inode(new_dentry); - struct folio *dir_folio; - struct sysv_dir_entry * dir_de = NULL; - struct folio *old_folio; - struct sysv_dir_entry * old_de; - int err = -ENOENT; - - if (flags & ~RENAME_NOREPLACE) - return -EINVAL; - - old_de = sysv_find_entry(old_dentry, &old_folio); - if (!old_de) - goto out; - - if (S_ISDIR(old_inode->i_mode)) { - err = -EIO; - dir_de = sysv_dotdot(old_inode, &dir_folio); - if (!dir_de) - goto out_old; - } - - if (new_inode) { - struct folio *new_folio; - struct sysv_dir_entry * new_de; - - err = -ENOTEMPTY; - if (dir_de && !sysv_empty_dir(new_inode)) - goto out_dir; - - err = -ENOENT; - new_de = sysv_find_entry(new_dentry, &new_folio); - if (!new_de) - goto out_dir; - err = sysv_set_link(new_de, new_folio, old_inode); - folio_release_kmap(new_folio, new_de); - if (err) - goto out_dir; - inode_set_ctime_current(new_inode); - if (dir_de) - drop_nlink(new_inode); - inode_dec_link_count(new_inode); - } else { - err = sysv_add_link(new_dentry, old_inode); - if (err) - goto out_dir; - if (dir_de) - inode_inc_link_count(new_dir); - } - - err = sysv_delete_entry(old_de, old_folio); - if (err) - goto out_dir; - - mark_inode_dirty(old_inode); - - if (dir_de) { - err = sysv_set_link(dir_de, dir_folio, new_dir); - if (!err) - inode_dec_link_count(old_dir); - } - -out_dir: - if (dir_de) - folio_release_kmap(dir_folio, dir_de); -out_old: - folio_release_kmap(old_folio, old_de); -out: - return err; -} - -/* - * directories can handle most operations... - */ -const struct inode_operations sysv_dir_inode_operations = { - .create = sysv_create, - .lookup = sysv_lookup, - .link = sysv_link, - .unlink = sysv_unlink, - .symlink = sysv_symlink, - .mkdir = sysv_mkdir, - .rmdir = sysv_rmdir, - .mknod = sysv_mknod, - .rename = sysv_rename, - .getattr = sysv_getattr, -}; diff --git a/fs/sysv/super.c b/fs/sysv/super.c deleted file mode 100644 index 5c0d07ddbda2..000000000000 --- a/fs/sysv/super.c +++ /dev/null @@ -1,595 +0,0 @@ -// SPDX-License-Identifier: GPL-2.0-only -/* - * linux/fs/sysv/inode.c - * - * minix/inode.c - * Copyright (C) 1991, 1992 Linus Torvalds - * - * xenix/inode.c - * Copyright (C) 1992 Doug Evans - * - * coh/inode.c - * Copyright (C) 1993 Pascal Haible, Bruno Haible - * - * sysv/inode.c - * Copyright (C) 1993 Paul B. Monday - * - * sysv/inode.c - * Copyright (C) 1993 Bruno Haible - * Copyright (C) 1997, 1998 Krzysztof G. Baranowski - * - * This file contains code for read/parsing the superblock. - */ - -#include <linux/module.h> -#include <linux/init.h> -#include <linux/slab.h> -#include <linux/buffer_head.h> -#include "sysv.h" - -/* - * The following functions try to recognize specific filesystems. - * - * We recognize: - * - Xenix FS by its magic number. - * - SystemV FS by its magic number. - * - Coherent FS by its funny fname/fpack field. - * - SCO AFS by s_nfree == 0xffff - * - V7 FS has no distinguishing features. - * - * We discriminate among SystemV4 and SystemV2 FS by the assumption that - * the time stamp is not < 01-01-1980. 
- */ - -enum { - JAN_1_1980 = (10*365 + 2) * 24 * 60 * 60 -}; - -static void detected_xenix(struct sysv_sb_info *sbi, unsigned *max_links) -{ - struct buffer_head *bh1 = sbi->s_bh1; - struct buffer_head *bh2 = sbi->s_bh2; - struct xenix_super_block * sbd1; - struct xenix_super_block * sbd2; - - if (bh1 != bh2) - sbd1 = sbd2 = (struct xenix_super_block *) bh1->b_data; - else { - /* block size = 512, so bh1 != bh2 */ - sbd1 = (struct xenix_super_block *) bh1->b_data; - sbd2 = (struct xenix_super_block *) (bh2->b_data - 512); - } - - *max_links = XENIX_LINK_MAX; - sbi->s_fic_size = XENIX_NICINOD; - sbi->s_flc_size = XENIX_NICFREE; - sbi->s_sbd1 = (char *)sbd1; - sbi->s_sbd2 = (char *)sbd2; - sbi->s_sb_fic_count = &sbd1->s_ninode; - sbi->s_sb_fic_inodes = &sbd1->s_inode[0]; - sbi->s_sb_total_free_inodes = &sbd2->s_tinode; - sbi->s_bcache_count = &sbd1->s_nfree; - sbi->s_bcache = &sbd1->s_free[0]; - sbi->s_free_blocks = &sbd2->s_tfree; - sbi->s_sb_time = &sbd2->s_time; - sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd1->s_isize); - sbi->s_nzones = fs32_to_cpu(sbi, sbd1->s_fsize); -} - -static void detected_sysv4(struct sysv_sb_info *sbi, unsigned *max_links) -{ - struct sysv4_super_block * sbd; - struct buffer_head *bh1 = sbi->s_bh1; - struct buffer_head *bh2 = sbi->s_bh2; - - if (bh1 == bh2) - sbd = (struct sysv4_super_block *) (bh1->b_data + BLOCK_SIZE/2); - else - sbd = (struct sysv4_super_block *) bh2->b_data; - - *max_links = SYSV_LINK_MAX; - sbi->s_fic_size = SYSV_NICINOD; - sbi->s_flc_size = SYSV_NICFREE; - sbi->s_sbd1 = (char *)sbd; - sbi->s_sbd2 = (char *)sbd; - sbi->s_sb_fic_count = &sbd->s_ninode; - sbi->s_sb_fic_inodes = &sbd->s_inode[0]; - sbi->s_sb_total_free_inodes = &sbd->s_tinode; - sbi->s_bcache_count = &sbd->s_nfree; - sbi->s_bcache = &sbd->s_free[0]; - sbi->s_free_blocks = &sbd->s_tfree; - sbi->s_sb_time = &sbd->s_time; - sbi->s_sb_state = &sbd->s_state; - sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); - sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); -} - -static void detected_sysv2(struct sysv_sb_info *sbi, unsigned *max_links) -{ - struct sysv2_super_block *sbd; - struct buffer_head *bh1 = sbi->s_bh1; - struct buffer_head *bh2 = sbi->s_bh2; - - if (bh1 == bh2) - sbd = (struct sysv2_super_block *) (bh1->b_data + BLOCK_SIZE/2); - else - sbd = (struct sysv2_super_block *) bh2->b_data; - - *max_links = SYSV_LINK_MAX; - sbi->s_fic_size = SYSV_NICINOD; - sbi->s_flc_size = SYSV_NICFREE; - sbi->s_sbd1 = (char *)sbd; - sbi->s_sbd2 = (char *)sbd; - sbi->s_sb_fic_count = &sbd->s_ninode; - sbi->s_sb_fic_inodes = &sbd->s_inode[0]; - sbi->s_sb_total_free_inodes = &sbd->s_tinode; - sbi->s_bcache_count = &sbd->s_nfree; - sbi->s_bcache = &sbd->s_free[0]; - sbi->s_free_blocks = &sbd->s_tfree; - sbi->s_sb_time = &sbd->s_time; - sbi->s_sb_state = &sbd->s_state; - sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); - sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); -} - -static void detected_coherent(struct sysv_sb_info *sbi, unsigned *max_links) -{ - struct coh_super_block * sbd; - struct buffer_head *bh1 = sbi->s_bh1; - - sbd = (struct coh_super_block *) bh1->b_data; - - *max_links = COH_LINK_MAX; - sbi->s_fic_size = COH_NICINOD; - sbi->s_flc_size = COH_NICFREE; - sbi->s_sbd1 = (char *)sbd; - sbi->s_sbd2 = (char *)sbd; - sbi->s_sb_fic_count = &sbd->s_ninode; - sbi->s_sb_fic_inodes = &sbd->s_inode[0]; - sbi->s_sb_total_free_inodes = &sbd->s_tinode; - sbi->s_bcache_count = &sbd->s_nfree; - sbi->s_bcache = &sbd->s_free[0]; - sbi->s_free_blocks = &sbd->s_tfree; - sbi->s_sb_time = 
&sbd->s_time; - sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); - sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); -} - -static void detected_v7(struct sysv_sb_info *sbi, unsigned *max_links) -{ - struct buffer_head *bh2 = sbi->s_bh2; - struct v7_super_block *sbd = (struct v7_super_block *)bh2->b_data; - - *max_links = V7_LINK_MAX; - sbi->s_fic_size = V7_NICINOD; - sbi->s_flc_size = V7_NICFREE; - sbi->s_sbd1 = (char *)sbd; - sbi->s_sbd2 = (char *)sbd; - sbi->s_sb_fic_count = &sbd->s_ninode; - sbi->s_sb_fic_inodes = &sbd->s_inode[0]; - sbi->s_sb_total_free_inodes = &sbd->s_tinode; - sbi->s_bcache_count = &sbd->s_nfree; - sbi->s_bcache = &sbd->s_free[0]; - sbi->s_free_blocks = &sbd->s_tfree; - sbi->s_sb_time = &sbd->s_time; - sbi->s_firstdatazone = fs16_to_cpu(sbi, sbd->s_isize); - sbi->s_nzones = fs32_to_cpu(sbi, sbd->s_fsize); -} - -static int detect_xenix(struct sysv_sb_info *sbi, struct buffer_head *bh) -{ - struct xenix_super_block *sbd = (struct xenix_super_block *)bh->b_data; - if (*(__le32 *)&sbd->s_magic == cpu_to_le32(0x2b5544)) - sbi->s_bytesex = BYTESEX_LE; - else if (*(__be32 *)&sbd->s_magic == cpu_to_be32(0x2b5544)) - sbi->s_bytesex = BYTESEX_BE; - else - return 0; - switch (fs32_to_cpu(sbi, sbd->s_type)) { - case 1: - sbi->s_type = FSTYPE_XENIX; - return 1; - case 2: - sbi->s_type = FSTYPE_XENIX; - return 2; - default: - return 0; - } -} - -static int detect_sysv(struct sysv_sb_info *sbi, struct buffer_head *bh) -{ - struct super_block *sb = sbi->s_sb; - /* All relevant fields are at the same offsets in R2 and R4 */ - struct sysv4_super_block * sbd; - u32 type; - - sbd = (struct sysv4_super_block *) (bh->b_data + BLOCK_SIZE/2); - if (*(__le32 *)&sbd->s_magic == cpu_to_le32(0xfd187e20)) - sbi->s_bytesex = BYTESEX_LE; - else if (*(__be32 *)&sbd->s_magic == cpu_to_be32(0xfd187e20)) - sbi->s_bytesex = BYTESEX_BE; - else - return 0; - - type = fs32_to_cpu(sbi, sbd->s_type); - - if (fs16_to_cpu(sbi, sbd->s_nfree) == 0xffff) { - sbi->s_type = FSTYPE_AFS; - sbi->s_forced_ro = 1; - if (!sb_rdonly(sb)) { - printk("SysV FS: SCO EAFS on %s detected, " - "forcing read-only mode.\n", - sb->s_id); - } - return type; - } - - if (fs32_to_cpu(sbi, sbd->s_time) < JAN_1_1980) { - /* this is likely to happen on SystemV2 FS */ - if (type > 3 || type < 1) - return 0; - sbi->s_type = FSTYPE_SYSV2; - return type; - } - if ((type > 3 || type < 1) && (type > 0x30 || type < 0x10)) - return 0; - - /* On Interactive Unix (ISC) Version 4.0/3.x s_type field = 0x10, - 0x20 or 0x30 indicates that symbolic links and the 14-character - filename limit is gone. Due to lack of information about this - feature read-only mode seems to be a reasonable approach... -KGB */ - - if (type >= 0x10) { - printk("SysV FS: can't handle long file names on %s, " - "forcing read-only mode.\n", sb->s_id); - sbi->s_forced_ro = 1; - } - - sbi->s_type = FSTYPE_SYSV4; - return type >= 0x10 ? type >> 4 : type; -} - -static int detect_coherent(struct sysv_sb_info *sbi, struct buffer_head *bh) -{ - struct coh_super_block * sbd; - - sbd = (struct coh_super_block *) (bh->b_data + BLOCK_SIZE/2); - if ((memcmp(sbd->s_fname,"noname",6) && memcmp(sbd->s_fname,"xxxxx ",6)) - || (memcmp(sbd->s_fpack,"nopack",6) && memcmp(sbd->s_fpack,"xxxxx\n",6))) - return 0; - sbi->s_bytesex = BYTESEX_PDP; - sbi->s_type = FSTYPE_COH; - return 1; -} - -static int detect_sysv_odd(struct sysv_sb_info *sbi, struct buffer_head *bh) -{ - int size = detect_sysv(sbi, bh); - - return size>2 ? 
0 : size; -} - -static struct { - int block; - int (*test)(struct sysv_sb_info *, struct buffer_head *); -} flavours[] = { - {1, detect_xenix}, - {0, detect_sysv}, - {0, detect_coherent}, - {9, detect_sysv_odd}, - {15,detect_sysv_odd}, - {18,detect_sysv}, -}; - -static char *flavour_names[] = { - [FSTYPE_XENIX] = "Xenix", - [FSTYPE_SYSV4] = "SystemV", - [FSTYPE_SYSV2] = "SystemV Release 2", - [FSTYPE_COH] = "Coherent", - [FSTYPE_V7] = "V7", - [FSTYPE_AFS] = "AFS", -}; - -static void (*flavour_setup[])(struct sysv_sb_info *, unsigned *) = { - [FSTYPE_XENIX] = detected_xenix, - [FSTYPE_SYSV4] = detected_sysv4, - [FSTYPE_SYSV2] = detected_sysv2, - [FSTYPE_COH] = detected_coherent, - [FSTYPE_V7] = detected_v7, - [FSTYPE_AFS] = detected_sysv4, -}; - -static int complete_read_super(struct super_block *sb, int silent, int size) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - struct inode *root_inode; - char *found = flavour_names[sbi->s_type]; - u_char n_bits = size+8; - int bsize = 1 << n_bits; - int bsize_4 = bsize >> 2; - - sbi->s_firstinodezone = 2; - - flavour_setup[sbi->s_type](sbi, &sb->s_max_links); - if (sbi->s_firstdatazone < sbi->s_firstinodezone) - return 0; - - sbi->s_ndatazones = sbi->s_nzones - sbi->s_firstdatazone; - sbi->s_inodes_per_block = bsize >> 6; - sbi->s_inodes_per_block_1 = (bsize >> 6)-1; - sbi->s_inodes_per_block_bits = n_bits-6; - sbi->s_ind_per_block = bsize_4; - sbi->s_ind_per_block_2 = bsize_4*bsize_4; - sbi->s_toobig_block = 10 + bsize_4 * (1 + bsize_4 * (1 + bsize_4)); - sbi->s_ind_per_block_bits = n_bits-2; - - sbi->s_ninodes = (sbi->s_firstdatazone - sbi->s_firstinodezone) - << sbi->s_inodes_per_block_bits; - - if (!silent) - printk("VFS: Found a %s FS (block size = %ld) on device %s\n", - found, sb->s_blocksize, sb->s_id); - - sb->s_magic = SYSV_MAGIC_BASE + sbi->s_type; - /* set up enough so that it can read an inode */ - sb->s_op = &sysv_sops; - if (sbi->s_forced_ro) - sb->s_flags |= SB_RDONLY; - root_inode = sysv_iget(sb, SYSV_ROOT_INO); - if (IS_ERR(root_inode)) { - printk("SysV FS: get root inode failed\n"); - return 0; - } - sb->s_root = d_make_root(root_inode); - if (!sb->s_root) { - printk("SysV FS: get root dentry failed\n"); - return 0; - } - return 1; -} - -static int sysv_fill_super(struct super_block *sb, void *data, int silent) -{ - struct buffer_head *bh1, *bh = NULL; - struct sysv_sb_info *sbi; - unsigned long blocknr; - int size = 0, i; - - BUILD_BUG_ON(1024 != sizeof (struct xenix_super_block)); - BUILD_BUG_ON(512 != sizeof (struct sysv4_super_block)); - BUILD_BUG_ON(512 != sizeof (struct sysv2_super_block)); - BUILD_BUG_ON(500 != sizeof (struct coh_super_block)); - BUILD_BUG_ON(64 != sizeof (struct sysv_inode)); - - sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL); - if (!sbi) - return -ENOMEM; - - sbi->s_sb = sb; - sbi->s_block_base = 0; - mutex_init(&sbi->s_lock); - sb->s_fs_info = sbi; - sb->s_time_min = 0; - sb->s_time_max = U32_MAX; - sb_set_blocksize(sb, BLOCK_SIZE); - - for (i = 0; i < ARRAY_SIZE(flavours) && !size; i++) { - brelse(bh); - bh = sb_bread(sb, flavours[i].block); - if (!bh) - continue; - size = flavours[i].test(SYSV_SB(sb), bh); - } - - if (!size) - goto Eunknown; - - switch (size) { - case 1: - blocknr = bh->b_blocknr << 1; - brelse(bh); - sb_set_blocksize(sb, 512); - bh1 = sb_bread(sb, blocknr); - bh = sb_bread(sb, blocknr + 1); - break; - case 2: - bh1 = bh; - break; - case 3: - blocknr = bh->b_blocknr >> 1; - brelse(bh); - sb_set_blocksize(sb, 2048); - bh1 = bh = sb_bread(sb, blocknr); - break; - default: - goto 
Ebadsize; - } - - if (bh && bh1) { - sbi->s_bh1 = bh1; - sbi->s_bh2 = bh; - if (complete_read_super(sb, silent, size)) - return 0; - } - - brelse(bh1); - brelse(bh); - sb_set_blocksize(sb, BLOCK_SIZE); - printk("oldfs: cannot read superblock\n"); -failed: - kfree(sbi); - return -EINVAL; - -Eunknown: - brelse(bh); - if (!silent) - printk("VFS: unable to find oldfs superblock on device %s\n", - sb->s_id); - goto failed; -Ebadsize: - brelse(bh); - if (!silent) - printk("VFS: oldfs: unsupported block size (%dKb)\n", - 1<<(size-2)); - goto failed; -} - -static int v7_sanity_check(struct super_block *sb, struct buffer_head *bh) -{ - struct v7_super_block *v7sb; - struct sysv_inode *v7i; - struct buffer_head *bh2; - struct sysv_sb_info *sbi; - - sbi = sb->s_fs_info; - - /* plausibility check on superblock */ - v7sb = (struct v7_super_block *) bh->b_data; - if (fs16_to_cpu(sbi, v7sb->s_nfree) > V7_NICFREE || - fs16_to_cpu(sbi, v7sb->s_ninode) > V7_NICINOD || - fs32_to_cpu(sbi, v7sb->s_fsize) > V7_MAXSIZE) - return 0; - - /* plausibility check on root inode: it is a directory, - with a nonzero size that is a multiple of 16 */ - bh2 = sb_bread(sb, 2); - if (bh2 == NULL) - return 0; - - v7i = (struct sysv_inode *)(bh2->b_data + 64); - if ((fs16_to_cpu(sbi, v7i->i_mode) & ~0777) != S_IFDIR || - (fs32_to_cpu(sbi, v7i->i_size) == 0) || - (fs32_to_cpu(sbi, v7i->i_size) & 017) || - (fs32_to_cpu(sbi, v7i->i_size) > V7_NFILES * - sizeof(struct sysv_dir_entry))) { - brelse(bh2); - return 0; - } - - brelse(bh2); - return 1; -} - -static int v7_fill_super(struct super_block *sb, void *data, int silent) -{ - struct sysv_sb_info *sbi; - struct buffer_head *bh; - - BUILD_BUG_ON(sizeof(struct v7_super_block) != 440); - BUILD_BUG_ON(sizeof(struct sysv_inode) != 64); - - sbi = kzalloc(sizeof(struct sysv_sb_info), GFP_KERNEL); - if (!sbi) - return -ENOMEM; - - sbi->s_sb = sb; - sbi->s_block_base = 0; - sbi->s_type = FSTYPE_V7; - mutex_init(&sbi->s_lock); - sb->s_fs_info = sbi; - sb->s_time_min = 0; - sb->s_time_max = U32_MAX; - - sb_set_blocksize(sb, 512); - - if ((bh = sb_bread(sb, 1)) == NULL) { - if (!silent) - printk("VFS: unable to read V7 FS superblock on " - "device %s.\n", sb->s_id); - goto failed; - } - - /* Try PDP-11 UNIX */ - sbi->s_bytesex = BYTESEX_PDP; - if (v7_sanity_check(sb, bh)) - goto detected; - - /* Try PC/IX, v7/x86 */ - sbi->s_bytesex = BYTESEX_LE; - if (v7_sanity_check(sb, bh)) - goto detected; - - goto failed; - -detected: - sbi->s_bh1 = bh; - sbi->s_bh2 = bh; - if (complete_read_super(sb, silent, 1)) - return 0; - -failed: - printk(KERN_ERR "VFS: could not find a valid V7 on %s.\n", - sb->s_id); - brelse(bh); - kfree(sbi); - return -EINVAL; -} - -/* Every kernel module contains stuff like this. 
*/ - -static struct dentry *sysv_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_bdev(fs_type, flags, dev_name, data, sysv_fill_super); -} - -static struct dentry *v7_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_bdev(fs_type, flags, dev_name, data, v7_fill_super); -} - -static struct file_system_type sysv_fs_type = { - .owner = THIS_MODULE, - .name = "sysv", - .mount = sysv_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; -MODULE_ALIAS_FS("sysv"); - -static struct file_system_type v7_fs_type = { - .owner = THIS_MODULE, - .name = "v7", - .mount = v7_mount, - .kill_sb = kill_block_super, - .fs_flags = FS_REQUIRES_DEV, -}; -MODULE_ALIAS_FS("v7"); -MODULE_ALIAS("v7"); - -static int __init init_sysv_fs(void) -{ - int error; - - error = sysv_init_icache(); - if (error) - goto out; - error = register_filesystem(&sysv_fs_type); - if (error) - goto destroy_icache; - error = register_filesystem(&v7_fs_type); - if (error) - goto unregister; - return 0; - -unregister: - unregister_filesystem(&sysv_fs_type); -destroy_icache: - sysv_destroy_icache(); -out: - return error; -} - -static void __exit exit_sysv_fs(void) -{ - unregister_filesystem(&sysv_fs_type); - unregister_filesystem(&v7_fs_type); - sysv_destroy_icache(); -} - -module_init(init_sysv_fs) -module_exit(exit_sysv_fs) -MODULE_DESCRIPTION("SystemV Filesystem"); -MODULE_LICENSE("GPL"); diff --git a/fs/sysv/sysv.h b/fs/sysv/sysv.h deleted file mode 100644 index 0a48b2e7edb1..000000000000 --- a/fs/sysv/sysv.h +++ /dev/null @@ -1,245 +0,0 @@ -/* SPDX-License-Identifier: GPL-2.0 */ -#ifndef _SYSV_H -#define _SYSV_H - -#include <linux/buffer_head.h> - -typedef __u16 __bitwise __fs16; -typedef __u32 __bitwise __fs32; - -#include <linux/sysv_fs.h> - -/* - * SystemV/V7/Coherent super-block data in memory - * - * The SystemV/V7/Coherent superblock contains dynamic data (it gets modified - * while the system is running). This is in contrast to the Minix and Berkeley - * filesystems (where the superblock is never modified). This affects the - * sync() operation: we must keep the superblock in a disk buffer and use this - * one as our "working copy". - */ - -struct sysv_sb_info { - struct super_block *s_sb; /* VFS superblock */ - int s_type; /* file system type: FSTYPE_{XENIX|SYSV|COH} */ - char s_bytesex; /* bytesex (le/be/pdp) */ - unsigned int s_inodes_per_block; /* number of inodes per block */ - unsigned int s_inodes_per_block_1; /* inodes_per_block - 1 */ - unsigned int s_inodes_per_block_bits; /* log2(inodes_per_block) */ - unsigned int s_ind_per_block; /* number of indirections per block */ - unsigned int s_ind_per_block_bits; /* log2(ind_per_block) */ - unsigned int s_ind_per_block_2; /* ind_per_block ^ 2 */ - unsigned int s_toobig_block; /* 10 + ipb + ipb^2 + ipb^3 */ - unsigned int s_block_base; /* physical block number of block 0 */ - unsigned short s_fic_size; /* free inode cache size, NICINOD */ - unsigned short s_flc_size; /* free block list chunk size, NICFREE */ - /* The superblock is kept in one or two disk buffers: */ - struct buffer_head *s_bh1; - struct buffer_head *s_bh2; - /* These are pointers into the disk buffer, to compensate for - different superblock layout. 
*/ - char * s_sbd1; /* entire superblock data, for part 1 */ - char * s_sbd2; /* entire superblock data, for part 2 */ - __fs16 *s_sb_fic_count; /* pointer to s_sbd->s_ninode */ - sysv_ino_t *s_sb_fic_inodes; /* pointer to s_sbd->s_inode */ - __fs16 *s_sb_total_free_inodes; /* pointer to s_sbd->s_tinode */ - __fs16 *s_bcache_count; /* pointer to s_sbd->s_nfree */ - sysv_zone_t *s_bcache; /* pointer to s_sbd->s_free */ - __fs32 *s_free_blocks; /* pointer to s_sbd->s_tfree */ - __fs32 *s_sb_time; /* pointer to s_sbd->s_time */ - __fs32 *s_sb_state; /* pointer to s_sbd->s_state, only FSTYPE_SYSV */ - /* We keep those superblock entities that don't change here; - this saves us an indirection and perhaps a conversion. */ - u32 s_firstinodezone; /* index of first inode zone */ - u32 s_firstdatazone; /* same as s_sbd->s_isize */ - u32 s_ninodes; /* total number of inodes */ - u32 s_ndatazones; /* total number of data zones */ - u32 s_nzones; /* same as s_sbd->s_fsize */ - u16 s_namelen; /* max length of dir entry */ - int s_forced_ro; - struct mutex s_lock; -}; - -/* - * SystemV/V7/Coherent FS inode data in memory - */ -struct sysv_inode_info { - __fs32 i_data[13]; - u32 i_dir_start_lookup; - struct inode vfs_inode; -}; - - -static inline struct sysv_inode_info *SYSV_I(struct inode *inode) -{ - return container_of(inode, struct sysv_inode_info, vfs_inode); -} - -static inline struct sysv_sb_info *SYSV_SB(struct super_block *sb) -{ - return sb->s_fs_info; -} - - -/* identify the FS in memory */ -enum { - FSTYPE_NONE = 0, - FSTYPE_XENIX, - FSTYPE_SYSV4, - FSTYPE_SYSV2, - FSTYPE_COH, - FSTYPE_V7, - FSTYPE_AFS, - FSTYPE_END, -}; - -#define SYSV_MAGIC_BASE 0x012FF7B3 - -#define XENIX_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_XENIX) -#define SYSV4_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_SYSV4) -#define SYSV2_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_SYSV2) -#define COH_SUPER_MAGIC (SYSV_MAGIC_BASE+FSTYPE_COH) - - -/* Admissible values for i_nlink: 0.._LINK_MAX */ -enum { - XENIX_LINK_MAX = 126, /* ?? */ - SYSV_LINK_MAX = 126, /* 127? 251? */ - V7_LINK_MAX = 126, /* ?? 
*/ - COH_LINK_MAX = 10000, -}; - - -static inline void dirty_sb(struct super_block *sb) -{ - struct sysv_sb_info *sbi = SYSV_SB(sb); - - mark_buffer_dirty(sbi->s_bh1); - if (sbi->s_bh1 != sbi->s_bh2) - mark_buffer_dirty(sbi->s_bh2); -} - - -/* ialloc.c */ -extern struct sysv_inode *sysv_raw_inode(struct super_block *, unsigned, - struct buffer_head **); -extern struct inode * sysv_new_inode(const struct inode *, umode_t); -extern void sysv_free_inode(struct inode *); -extern unsigned long sysv_count_free_inodes(struct super_block *); - -/* balloc.c */ -extern sysv_zone_t sysv_new_block(struct super_block *); -extern void sysv_free_block(struct super_block *, sysv_zone_t); -extern unsigned long sysv_count_free_blocks(struct super_block *); - -/* itree.c */ -void sysv_truncate(struct inode *); -int sysv_prepare_chunk(struct folio *folio, loff_t pos, unsigned len); - -/* inode.c */ -extern struct inode *sysv_iget(struct super_block *, unsigned int); -extern int sysv_write_inode(struct inode *, struct writeback_control *wbc); -extern int sysv_sync_inode(struct inode *); -extern void sysv_set_inode(struct inode *, dev_t); -extern int sysv_getattr(struct mnt_idmap *, const struct path *, - struct kstat *, u32, unsigned int); -extern int sysv_init_icache(void); -extern void sysv_destroy_icache(void); - - -/* dir.c */ -struct sysv_dir_entry *sysv_find_entry(struct dentry *, struct folio **); -int sysv_add_link(struct dentry *, struct inode *); -int sysv_delete_entry(struct sysv_dir_entry *, struct folio *); -int sysv_make_empty(struct inode *, struct inode *); -int sysv_empty_dir(struct inode *); -int sysv_set_link(struct sysv_dir_entry *, struct folio *, - struct inode *); -struct sysv_dir_entry *sysv_dotdot(struct inode *, struct folio **); -ino_t sysv_inode_by_name(struct dentry *); - - -extern const struct inode_operations sysv_file_inode_operations; -extern const struct inode_operations sysv_dir_inode_operations; -extern const struct file_operations sysv_file_operations; -extern const struct file_operations sysv_dir_operations; -extern const struct address_space_operations sysv_aops; -extern const struct super_operations sysv_sops; - - -enum { - BYTESEX_LE, - BYTESEX_PDP, - BYTESEX_BE, -}; - -static inline u32 PDP_swab(u32 x) -{ -#ifdef __LITTLE_ENDIAN - return ((x & 0xffff) << 16) | ((x & 0xffff0000) >> 16); -#else -#ifdef __BIG_ENDIAN - return ((x & 0xff00ff) << 8) | ((x & 0xff00ff00) >> 8); -#else -#error BYTESEX -#endif -#endif -} - -static inline __u32 fs32_to_cpu(struct sysv_sb_info *sbi, __fs32 n) -{ - if (sbi->s_bytesex == BYTESEX_PDP) - return PDP_swab((__force __u32)n); - else if (sbi->s_bytesex == BYTESEX_LE) - return le32_to_cpu((__force __le32)n); - else - return be32_to_cpu((__force __be32)n); -} - -static inline __fs32 cpu_to_fs32(struct sysv_sb_info *sbi, __u32 n) -{ - if (sbi->s_bytesex == BYTESEX_PDP) - return (__force __fs32)PDP_swab(n); - else if (sbi->s_bytesex == BYTESEX_LE) - return (__force __fs32)cpu_to_le32(n); - else - return (__force __fs32)cpu_to_be32(n); -} - -static inline __fs32 fs32_add(struct sysv_sb_info *sbi, __fs32 *n, int d) -{ - if (sbi->s_bytesex == BYTESEX_PDP) - *(__u32*)n = PDP_swab(PDP_swab(*(__u32*)n)+d); - else if (sbi->s_bytesex == BYTESEX_LE) - le32_add_cpu((__le32 *)n, d); - else - be32_add_cpu((__be32 *)n, d); - return *n; -} - -static inline __u16 fs16_to_cpu(struct sysv_sb_info *sbi, __fs16 n) -{ - if (sbi->s_bytesex != BYTESEX_BE) - return le16_to_cpu((__force __le16)n); - else - return be16_to_cpu((__force __be16)n); -} - -static inline 
__fs16 cpu_to_fs16(struct sysv_sb_info *sbi, __u16 n) -{ - if (sbi->s_bytesex != BYTESEX_BE) - return (__force __fs16)cpu_to_le16(n); - else - return (__force __fs16)cpu_to_be16(n); -} - -static inline __fs16 fs16_add(struct sysv_sb_info *sbi, __fs16 *n, int d) -{ - if (sbi->s_bytesex != BYTESEX_BE) - le16_add_cpu((__le16 *)n, d); - else - be16_add_cpu((__be16 *)n, d); - return *n; -} - -#endif /* _SYSV_H */ diff --git a/fs/timerfd.c b/fs/timerfd.c index 9f7eb451a60f..c68f28d9c426 100644 --- a/fs/timerfd.c +++ b/fs/timerfd.c @@ -205,9 +205,8 @@ static int timerfd_setup(struct timerfd_ctx *ctx, int flags, ALARM_REALTIME : ALARM_BOOTTIME, timerfd_alarmproc); } else { - hrtimer_init(&ctx->t.tmr, clockid, htmode); + hrtimer_setup(&ctx->t.tmr, timerfd_tmrproc, clockid, htmode); hrtimer_set_expires(&ctx->t.tmr, texp); - ctx->t.tmr.function = timerfd_tmrproc; } if (texp != 0) { @@ -429,7 +428,7 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) ALARM_REALTIME : ALARM_BOOTTIME, timerfd_alarmproc); else - hrtimer_init(&ctx->t.tmr, clockid, HRTIMER_MODE_ABS); + hrtimer_setup(&ctx->t.tmr, timerfd_tmrproc, clockid, HRTIMER_MODE_ABS); ctx->moffs = ktime_mono_to_real(0); @@ -439,15 +438,15 @@ SYSCALL_DEFINE2(timerfd_create, int, clockid, int, flags) return ufd; } - file = anon_inode_getfile("[timerfd]", &timerfd_fops, ctx, - O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS)); + file = anon_inode_getfile_fmode("[timerfd]", &timerfd_fops, ctx, + O_RDWR | (flags & TFD_SHARED_FCNTL_FLAGS), + FMODE_NOWAIT); if (IS_ERR(file)) { put_unused_fd(ufd); kfree(ctx); return PTR_ERR(file); } - file->f_mode |= FMODE_NOWAIT; fd_install(ufd, file); return ufd; } diff --git a/fs/tracefs/inode.c b/fs/tracefs/inode.c index 53214499e384..a3fd3cc591bd 100644 --- a/fs/tracefs/inode.c +++ b/fs/tracefs/inode.c @@ -109,9 +109,9 @@ static char *get_dname(struct dentry *dentry) return name; } -static int tracefs_syscall_mkdir(struct mnt_idmap *idmap, - struct inode *inode, struct dentry *dentry, - umode_t mode) +static struct dentry *tracefs_syscall_mkdir(struct mnt_idmap *idmap, + struct inode *inode, struct dentry *dentry, + umode_t mode) { struct tracefs_inode *ti; char *name; @@ -119,7 +119,7 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap, name = get_dname(dentry); if (!name) - return -ENOMEM; + return ERR_PTR(-ENOMEM); /* * This is a new directory that does not take the default of @@ -141,7 +141,7 @@ static int tracefs_syscall_mkdir(struct mnt_idmap *idmap, kfree(name); - return ret; + return ERR_PTR(ret); } static int tracefs_syscall_rmdir(struct inode *inode, struct dentry *dentry) @@ -555,7 +555,7 @@ struct dentry *tracefs_start_creating(const char *name, struct dentry *parent) if (unlikely(IS_DEADDIR(d_inode(parent)))) dentry = ERR_PTR(-ENOENT); else - dentry = lookup_one_len(name, parent, strlen(name)); + dentry = lookup_noperm(&QSTR(name), parent); if (!IS_ERR(dentry) && d_inode(dentry)) { dput(dentry); dentry = ERR_PTR(-EEXIST); diff --git a/fs/ubifs/compress.c b/fs/ubifs/compress.c index 0b48cbab8a3d..059a02691edd 100644 --- a/fs/ubifs/compress.c +++ b/fs/ubifs/compress.c @@ -15,9 +15,15 @@ * decompression. 
*/ -#include <linux/crypto.h> +#include <crypto/acompress.h> +#include <linux/highmem.h> #include "ubifs.h" +union ubifs_in_ptr { + const void *buf; + struct folio *folio; +}; + /* Fake description object for the "none" compressor */ static struct ubifs_compressor none_compr = { .compr_type = UBIFS_COMPR_NONE, @@ -26,11 +32,8 @@ static struct ubifs_compressor none_compr = { }; #ifdef CONFIG_UBIFS_FS_LZO -static DEFINE_MUTEX(lzo_mutex); - static struct ubifs_compressor lzo_compr = { .compr_type = UBIFS_COMPR_LZO, - .comp_mutex = &lzo_mutex, .name = "lzo", .capi_name = "lzo", }; @@ -42,13 +45,8 @@ static struct ubifs_compressor lzo_compr = { #endif #ifdef CONFIG_UBIFS_FS_ZLIB -static DEFINE_MUTEX(deflate_mutex); -static DEFINE_MUTEX(inflate_mutex); - static struct ubifs_compressor zlib_compr = { .compr_type = UBIFS_COMPR_ZLIB, - .comp_mutex = &deflate_mutex, - .decomp_mutex = &inflate_mutex, .name = "zlib", .capi_name = "deflate", }; @@ -60,13 +58,8 @@ static struct ubifs_compressor zlib_compr = { #endif #ifdef CONFIG_UBIFS_FS_ZSTD -static DEFINE_MUTEX(zstd_enc_mutex); -static DEFINE_MUTEX(zstd_dec_mutex); - static struct ubifs_compressor zstd_compr = { .compr_type = UBIFS_COMPR_ZSTD, - .comp_mutex = &zstd_enc_mutex, - .decomp_mutex = &zstd_dec_mutex, .name = "zstd", .capi_name = "zstd", }; @@ -80,6 +73,63 @@ static struct ubifs_compressor zstd_compr = { /* All UBIFS compressors */ struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; +static void ubifs_compress_common(int *compr_type, union ubifs_in_ptr in_ptr, + size_t in_offset, int in_len, bool in_folio, + void *out_buf, int *out_len) +{ + struct ubifs_compressor *compr = ubifs_compressors[*compr_type]; + int dlen = *out_len; + int err; + + if (*compr_type == UBIFS_COMPR_NONE) + goto no_compr; + + /* If the input data is small, do not even try to compress it */ + if (in_len < UBIFS_MIN_COMPR_LEN) + goto no_compr; + + dlen = min(dlen, in_len - UBIFS_MIN_COMPRESS_DIFF); + + do { + ACOMP_REQUEST_ON_STACK(req, compr->cc); + DECLARE_CRYPTO_WAIT(wait); + + acomp_request_set_callback(req, 0, NULL, NULL); + if (in_folio) + acomp_request_set_src_folio(req, in_ptr.folio, + in_offset, in_len); + else + acomp_request_set_src_dma(req, in_ptr.buf, in_len); + acomp_request_set_dst_dma(req, out_buf, dlen); + err = crypto_acomp_compress(req); + dlen = req->dlen; + if (err != -EAGAIN) + break; + + req = ACOMP_REQUEST_CLONE(req, GFP_NOFS | __GFP_NOWARN); + acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &wait); + err = crypto_acomp_compress(req); + err = crypto_wait_req(err, &wait); + dlen = req->dlen; + acomp_request_free(req); + } while (0); + + *out_len = dlen; + if (err) + goto no_compr; + + return; + +no_compr: + if (in_folio) + memcpy_from_folio(out_buf, in_ptr.folio, in_offset, in_len); + else + memcpy(out_buf, in_ptr.buf, in_len); + *out_len = in_len; + *compr_type = UBIFS_COMPR_NONE; +} + /** * ubifs_compress - compress data. 
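
For context: the hunks above move UBIFS off the legacy crypto_comp interface and onto the crypto acomp (asynchronous compression) API, which is also why the per-algorithm comp/decomp mutexes disappear; each acomp request carries its own state and needs no global serialization. For readers unfamiliar with acomp, below is a minimal sketch of a synchronous call using the long-standing scatterlist-based request API. It is illustrative only and not part of this patch; the "deflate" algorithm name and the kmalloc'ed linear buffers are assumptions of the example.

#include <crypto/acompress.h>
#include <linux/crypto.h>
#include <linux/scatterlist.h>
#include <linux/err.h>

/* Compress slen bytes from src into dst; *dlen is in/out. */
static int example_deflate(const void *src, unsigned int slen,
			   void *dst, unsigned int *dlen)
{
	struct crypto_acomp *tfm;
	struct acomp_req *req;
	struct scatterlist sg_src, sg_dst;
	DECLARE_CRYPTO_WAIT(wait);
	int err;

	tfm = crypto_alloc_acomp("deflate", 0, 0);
	if (IS_ERR(tfm))
		return PTR_ERR(tfm);

	req = acomp_request_alloc(tfm);
	if (!req) {
		crypto_free_acomp(tfm);
		return -ENOMEM;
	}

	/* Buffers must be linear (e.g. kmalloc'ed) for sg_init_one(). */
	sg_init_one(&sg_src, src, slen);
	sg_init_one(&sg_dst, dst, *dlen);
	acomp_request_set_params(req, &sg_src, &sg_dst, slen, *dlen);

	/* Make the async API behave synchronously via crypto_req_done(). */
	acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
				   crypto_req_done, &wait);
	err = crypto_wait_req(crypto_acomp_compress(req), &wait);
	if (!err)
		*dlen = req->dlen;	/* bytes actually produced */

	acomp_request_free(req);
	crypto_free_acomp(tfm);
	return err;
}
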
* @c: UBIFS file-system description object @@ -102,61 +152,51 @@ struct ubifs_compressor *ubifs_compressors[UBIFS_COMPR_TYPES_CNT]; void ubifs_compress(const struct ubifs_info *c, const void *in_buf, int in_len, void *out_buf, int *out_len, int *compr_type) { - int err; - struct ubifs_compressor *compr = ubifs_compressors[*compr_type]; - - if (*compr_type == UBIFS_COMPR_NONE) - goto no_compr; - - /* If the input data is small, do not even try to compress it */ - if (in_len < UBIFS_MIN_COMPR_LEN) - goto no_compr; - - if (compr->comp_mutex) - mutex_lock(compr->comp_mutex); - err = crypto_comp_compress(compr->cc, in_buf, in_len, out_buf, - (unsigned int *)out_len); - if (compr->comp_mutex) - mutex_unlock(compr->comp_mutex); - if (unlikely(err)) { - ubifs_warn(c, "cannot compress %d bytes, compressor %s, error %d, leave data uncompressed", - in_len, compr->name, err); - goto no_compr; - } + union ubifs_in_ptr in_ptr = { .buf = in_buf }; - /* - * If the data compressed only slightly, it is better to leave it - * uncompressed to improve read speed. - */ - if (in_len - *out_len < UBIFS_MIN_COMPRESS_DIFF) - goto no_compr; - - return; - -no_compr: - memcpy(out_buf, in_buf, in_len); - *out_len = in_len; - *compr_type = UBIFS_COMPR_NONE; + ubifs_compress_common(compr_type, in_ptr, 0, in_len, false, + out_buf, out_len); } /** - * ubifs_decompress - decompress data. + * ubifs_compress_folio - compress folio. * @c: UBIFS file-system description object - * @in_buf: data to decompress - * @in_len: length of the data to decompress - * @out_buf: output buffer where decompressed data should - * @out_len: output length is returned here - * @compr_type: type of compression + * @in_folio: data to compress + * @in_offset: offset into @in_folio + * @in_len: length of the data to compress + * @out_buf: output buffer where compressed data should be stored + * @out_len: output buffer length is returned here + * @compr_type: type of compression to use on enter, actually used compression + * type on exit * - * This function decompresses data from buffer @in_buf into buffer @out_buf. - * The length of the uncompressed data is returned in @out_len. This functions - * returns %0 on success or a negative error code on failure. + * This function compresses input folio @in_folio of length @in_len and + * stores the result in the output buffer @out_buf and the resulting length + * in @out_len. If the input buffer does not compress, it is just copied + * to the @out_buf. The same happens if @compr_type is %UBIFS_COMPR_NONE + * or if compression error occurred. + * + * Note, if the input buffer was not compressed, it is copied to the output + * buffer and %UBIFS_COMPR_NONE is returned in @compr_type. 
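
A structural note: rather than duplicating the whole compression path for buffer and folio sources, the patch threads a small union plus an in_folio flag through one common helper, and the exported entry points are thin wrappers that only fill in the union. Reduced to a sketch (illustrative, not code from the patch):

/* One helper serves both a kernel buffer and a folio source. */
union in_ptr {
	const void *buf;	/* used when in_folio == false */
	struct folio *folio;	/* used when in_folio == true  */
};

static void copy_in(void *dst, union in_ptr in, size_t off, int len,
		    bool in_folio)
{
	if (in_folio)
		memcpy_from_folio(dst, in.folio, off, len);
	else
		memcpy(dst, in.buf, len);
}

ubifs_compress() above and ubifs_compress_folio() below are exactly such wrappers around ubifs_compress_common().
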
*/ -int ubifs_decompress(const struct ubifs_info *c, const void *in_buf, - int in_len, void *out_buf, int *out_len, int compr_type) +void ubifs_compress_folio(const struct ubifs_info *c, struct folio *in_folio, + size_t in_offset, int in_len, void *out_buf, + int *out_len, int *compr_type) +{ + union ubifs_in_ptr in_ptr = { .folio = in_folio }; + + ubifs_compress_common(compr_type, in_ptr, in_offset, in_len, true, + out_buf, out_len); +} + +static int ubifs_decompress_common(const struct ubifs_info *c, + const void *in_buf, int in_len, + void *out_ptr, size_t out_offset, + int *out_len, bool out_folio, + int compr_type) { - int err; struct ubifs_compressor *compr; + int dlen = *out_len; + int err; if (unlikely(compr_type < 0 || compr_type >= UBIFS_COMPR_TYPES_CNT)) { ubifs_err(c, "invalid compression type %d", compr_type); @@ -171,17 +211,39 @@ int ubifs_decompress(const struct ubifs_info *c, const void *in_buf, } if (compr_type == UBIFS_COMPR_NONE) { - memcpy(out_buf, in_buf, in_len); + if (out_folio) + memcpy_to_folio(out_ptr, out_offset, in_buf, in_len); + else + memcpy(out_ptr, in_buf, in_len); *out_len = in_len; return 0; } - if (compr->decomp_mutex) - mutex_lock(compr->decomp_mutex); - err = crypto_comp_decompress(compr->cc, in_buf, in_len, out_buf, - (unsigned int *)out_len); - if (compr->decomp_mutex) - mutex_unlock(compr->decomp_mutex); + do { + ACOMP_REQUEST_ON_STACK(req, compr->cc); + DECLARE_CRYPTO_WAIT(wait); + + acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG, + crypto_req_done, &wait); + acomp_request_set_src_dma(req, in_buf, in_len); + if (out_folio) + acomp_request_set_dst_folio(req, out_ptr, out_offset, + dlen); + else + acomp_request_set_dst_dma(req, out_ptr, dlen); + err = crypto_acomp_decompress(req); + dlen = req->dlen; + if (err != -EAGAIN) + break; + + req = ACOMP_REQUEST_CLONE(req, GFP_NOFS | __GFP_NOWARN); + err = crypto_acomp_decompress(req); + err = crypto_wait_req(err, &wait); + dlen = req->dlen; + acomp_request_free(req); + } while (0); + + *out_len = dlen; if (err) ubifs_err(c, "cannot decompress %d bytes, compressor %s, error %d", in_len, compr->name, err); @@ -190,6 +252,49 @@ int ubifs_decompress(const struct ubifs_info *c, const void *in_buf, } /** + * ubifs_decompress - decompress data. + * @c: UBIFS file-system description object + * @in_buf: data to decompress + * @in_len: length of the data to decompress + * @out_buf: output buffer where decompressed data should + * @out_len: output length is returned here + * @compr_type: type of compression + * + * This function decompresses data from buffer @in_buf into buffer @out_buf. + * The length of the uncompressed data is returned in @out_len. This functions + * returns %0 on success or a negative error code on failure. + */ +int ubifs_decompress(const struct ubifs_info *c, const void *in_buf, + int in_len, void *out_buf, int *out_len, int compr_type) +{ + return ubifs_decompress_common(c, in_buf, in_len, out_buf, 0, out_len, + false, compr_type); +} + +/** + * ubifs_decompress_folio - decompress folio. + * @c: UBIFS file-system description object + * @in_buf: data to decompress + * @in_len: length of the data to decompress + * @out_folio: output folio where decompressed data should + * @out_offset: offset into @out_folio + * @out_len: output length is returned here + * @compr_type: type of compression + * + * This function decompresses data from buffer @in_buf into folio + * @out_folio. The length of the uncompressed data is returned in + * @out_len. 
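
Both directions share the retry shape visible in the hunk above: a first attempt is issued from a stack-allocated request, and only when the driver answers -EAGAIN (meaning it wants to run asynchronously) is the request cloned to the heap and completed with a sleeping wait. Condensed into a schematic sketch, using the ACOMP_REQUEST_ON_STACK() and ACOMP_REQUEST_CLONE() helpers this patch relies on (not the literal UBIFS code):

	do {
		ACOMP_REQUEST_ON_STACK(req, tfm);
		DECLARE_CRYPTO_WAIT(wait);

		/* Fast path: synchronous attempt from the on-stack request. */
		acomp_request_set_callback(req, 0, NULL, NULL);
		acomp_request_set_src_dma(req, in_buf, in_len);
		acomp_request_set_dst_dma(req, out_buf, out_len);
		err = crypto_acomp_decompress(req);
		if (err != -EAGAIN)
			break;

		/* Slow path: driver is async; move the request off the stack. */
		req = ACOMP_REQUEST_CLONE(req, GFP_NOFS | __GFP_NOWARN);
		acomp_request_set_callback(req, CRYPTO_TFM_REQ_MAY_BACKLOG,
					   crypto_req_done, &wait);
		err = crypto_wait_req(crypto_acomp_decompress(req), &wait);
		acomp_request_free(req);
	} while (0);
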
This functions returns %0 on success or a negative error + * code on failure. + */ +int ubifs_decompress_folio(const struct ubifs_info *c, const void *in_buf, + int in_len, struct folio *out_folio, + size_t out_offset, int *out_len, int compr_type) +{ + return ubifs_decompress_common(c, in_buf, in_len, out_folio, + out_offset, out_len, true, compr_type); +} + +/** * compr_init - initialize a compressor. * @compr: compressor description object * @@ -199,7 +304,7 @@ int ubifs_decompress(const struct ubifs_info *c, const void *in_buf, static int __init compr_init(struct ubifs_compressor *compr) { if (compr->capi_name) { - compr->cc = crypto_alloc_comp(compr->capi_name, 0, 0); + compr->cc = crypto_alloc_acomp(compr->capi_name, 0, 0); if (IS_ERR(compr->cc)) { pr_err("UBIFS error (pid %d): cannot initialize compressor %s, error %ld", current->pid, compr->name, PTR_ERR(compr->cc)); @@ -218,7 +323,7 @@ static int __init compr_init(struct ubifs_compressor *compr) static void compr_exit(struct ubifs_compressor *compr) { if (compr->capi_name) - crypto_free_comp(compr->cc); + crypto_free_acomp(compr->cc); } /** diff --git a/fs/ubifs/dir.c b/fs/ubifs/dir.c index fda82f3e16e8..3c3d3ad4fa6c 100644 --- a/fs/ubifs/dir.c +++ b/fs/ubifs/dir.c @@ -1002,8 +1002,8 @@ out_fname: return err; } -static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; struct ubifs_inode *dir_ui = ubifs_inode(dir); @@ -1023,7 +1023,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, err = ubifs_budget_space(c, &req); if (err) - return err; + return ERR_PTR(err); err = ubifs_prepare_create(dir, dentry, &nm); if (err) @@ -1060,7 +1060,7 @@ static int ubifs_mkdir(struct mnt_idmap *idmap, struct inode *dir, ubifs_release_budget(c, &req); d_instantiate(dentry, inode); fscrypt_free_filename(&nm); - return 0; + return NULL; out_cancel: dir->i_size -= sz_change; @@ -1074,7 +1074,7 @@ out_fname: fscrypt_free_filename(&nm); out_budg: ubifs_release_budget(c, &req); - return err; + return ERR_PTR(err); } static int ubifs_mknod(struct mnt_idmap *idmap, struct inode *dir, diff --git a/fs/ubifs/file.c b/fs/ubifs/file.c index 5130123005e4..bf311c38d9a8 100644 --- a/fs/ubifs/file.c +++ b/fs/ubifs/file.c @@ -42,8 +42,8 @@ #include <linux/slab.h> #include <linux/migrate.h> -static int read_block(struct inode *inode, void *addr, unsigned int block, - struct ubifs_data_node *dn) +static int read_block(struct inode *inode, struct folio *folio, size_t offset, + unsigned int block, struct ubifs_data_node *dn) { struct ubifs_info *c = inode->i_sb->s_fs_info; int err, len, out_len; @@ -55,7 +55,7 @@ static int read_block(struct inode *inode, void *addr, unsigned int block, if (err) { if (err == -ENOENT) /* Not found, so it must be a hole */ - memset(addr, 0, UBIFS_BLOCK_SIZE); + folio_zero_range(folio, offset, UBIFS_BLOCK_SIZE); return err; } @@ -74,8 +74,8 @@ static int read_block(struct inode *inode, void *addr, unsigned int block, } out_len = UBIFS_BLOCK_SIZE; - err = ubifs_decompress(c, &dn->data, dlen, addr, &out_len, - le16_to_cpu(dn->compr_type)); + err = ubifs_decompress_folio(c, &dn->data, dlen, folio, offset, + &out_len, le16_to_cpu(dn->compr_type)); if (err || len != out_len) goto dump; @@ -85,7 +85,7 @@ static int read_block(struct inode *inode, void *addr, unsigned int block, * appending data). Ensure that the remainder is zeroed out. 
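
ubifs_mkdir() above follows the tree-wide API change that runs through this whole diff (9p, tracefs, udf, ufs, vboxsf): the ->mkdir() inode operation now returns a struct dentry * instead of an int. The contract is NULL on success when the passed-in dentry was instantiated, a different dentry if the caller's was not used, or ERR_PTR(err) on failure. A minimal sketch for a hypothetical filesystem (foofs_new_inode() and foofs_add_link() are made-up helpers, not part of the patch):

#include <linux/fs.h>
#include <linux/dcache.h>
#include <linux/err.h>
#include <linux/mnt_idmap.h>

static struct dentry *foofs_mkdir(struct mnt_idmap *idmap, struct inode *dir,
				  struct dentry *dentry, umode_t mode)
{
	struct inode *inode;
	int err;

	inode = foofs_new_inode(dir, S_IFDIR | mode);	/* hypothetical */
	if (IS_ERR(inode))
		return ERR_CAST(inode);

	err = foofs_add_link(dentry, inode);		/* hypothetical */
	if (err) {
		iput(inode);
		return ERR_PTR(err);
	}

	inc_nlink(dir);
	d_instantiate(dentry, inode);
	return NULL;	/* success: @dentry itself was instantiated */
}
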
*/ if (len < UBIFS_BLOCK_SIZE) - memset(addr + len, 0, UBIFS_BLOCK_SIZE - len); + folio_zero_range(folio, offset + len, UBIFS_BLOCK_SIZE - len); return 0; @@ -98,27 +98,25 @@ dump: static int do_readpage(struct folio *folio) { - void *addr; int err = 0, i; unsigned int block, beyond; struct ubifs_data_node *dn = NULL; struct inode *inode = folio->mapping->host; struct ubifs_info *c = inode->i_sb->s_fs_info; loff_t i_size = i_size_read(inode); + size_t offset = 0; dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", inode->i_ino, folio->index, i_size, folio->flags); ubifs_assert(c, !folio_test_checked(folio)); ubifs_assert(c, !folio->private); - addr = kmap_local_folio(folio, 0); - block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; beyond = (i_size + UBIFS_BLOCK_SIZE - 1) >> UBIFS_BLOCK_SHIFT; if (block >= beyond) { /* Reading beyond inode */ folio_set_checked(folio); - addr = folio_zero_tail(folio, 0, addr); + folio_zero_range(folio, 0, folio_size(folio)); goto out; } @@ -135,9 +133,9 @@ static int do_readpage(struct folio *folio) if (block >= beyond) { /* Reading beyond inode */ err = -ENOENT; - memset(addr, 0, UBIFS_BLOCK_SIZE); + folio_zero_range(folio, offset, UBIFS_BLOCK_SIZE); } else { - ret = read_block(inode, addr, block, dn); + ret = read_block(inode, folio, offset, block, dn); if (ret) { err = ret; if (err != -ENOENT) @@ -147,17 +145,13 @@ static int do_readpage(struct folio *folio) int ilen = i_size & (UBIFS_BLOCK_SIZE - 1); if (ilen && ilen < dlen) - memset(addr + ilen, 0, dlen - ilen); + folio_zero_range(folio, offset + ilen, dlen - ilen); } } if (++i >= (UBIFS_BLOCKS_PER_PAGE << folio_order(folio))) break; block += 1; - addr += UBIFS_BLOCK_SIZE; - if (folio_test_highmem(folio) && (offset_in_page(addr) == 0)) { - kunmap_local(addr - UBIFS_BLOCK_SIZE); - addr = kmap_local_folio(folio, i * UBIFS_BLOCK_SIZE); - } + offset += UBIFS_BLOCK_SIZE; } if (err) { @@ -177,8 +171,6 @@ out: kfree(dn); if (!err) folio_mark_uptodate(folio); - flush_dcache_folio(folio); - kunmap_local(addr); return err; } @@ -602,18 +594,16 @@ static int populate_page(struct ubifs_info *c, struct folio *folio, struct inode *inode = folio->mapping->host; loff_t i_size = i_size_read(inode); unsigned int page_block; - void *addr, *zaddr; + size_t offset = 0; pgoff_t end_index; dbg_gen("ino %lu, pg %lu, i_size %lld, flags %#lx", inode->i_ino, folio->index, i_size, folio->flags); - addr = zaddr = kmap_local_folio(folio, 0); - end_index = (i_size - 1) >> PAGE_SHIFT; if (!i_size || folio->index > end_index) { hole = 1; - addr = folio_zero_tail(folio, 0, addr); + folio_zero_range(folio, 0, folio_size(folio)); goto out_hole; } @@ -623,7 +613,7 @@ static int populate_page(struct ubifs_info *c, struct folio *folio, if (nn >= bu->cnt) { hole = 1; - memset(addr, 0, UBIFS_BLOCK_SIZE); + folio_zero_range(folio, offset, UBIFS_BLOCK_SIZE); } else if (key_block(c, &bu->zbranch[nn].key) == page_block) { struct ubifs_data_node *dn; @@ -645,13 +635,15 @@ static int populate_page(struct ubifs_info *c, struct folio *folio, goto out_err; } - err = ubifs_decompress(c, &dn->data, dlen, addr, &out_len, - le16_to_cpu(dn->compr_type)); + err = ubifs_decompress_folio( + c, &dn->data, dlen, folio, offset, &out_len, + le16_to_cpu(dn->compr_type)); if (err || len != out_len) goto out_err; if (len < UBIFS_BLOCK_SIZE) - memset(addr + len, 0, UBIFS_BLOCK_SIZE - len); + folio_zero_range(folio, offset + len, + UBIFS_BLOCK_SIZE - len); nn += 1; read = (i << UBIFS_BLOCK_SHIFT) + len; @@ -660,23 +652,19 @@ static int populate_page(struct ubifs_info 
*c, struct folio *folio, continue; } else { hole = 1; - memset(addr, 0, UBIFS_BLOCK_SIZE); + folio_zero_range(folio, offset, UBIFS_BLOCK_SIZE); } if (++i >= UBIFS_BLOCKS_PER_PAGE) break; - addr += UBIFS_BLOCK_SIZE; + offset += UBIFS_BLOCK_SIZE; page_block += 1; - if (folio_test_highmem(folio) && (offset_in_page(addr) == 0)) { - kunmap_local(addr - UBIFS_BLOCK_SIZE); - addr = kmap_local_folio(folio, i * UBIFS_BLOCK_SIZE); - } } if (end_index == folio->index) { int len = i_size & (PAGE_SIZE - 1); if (len && len < read) - memset(zaddr + len, 0, read - len); + folio_zero_range(folio, len, read - len); } out_hole: @@ -686,14 +674,10 @@ out_hole: } folio_mark_uptodate(folio); - flush_dcache_folio(folio); - kunmap_local(addr); *n = nn; return 0; out_err: - flush_dcache_folio(folio); - kunmap_local(addr); ubifs_err(c, "bad data node (block %u, inode %lu)", page_block, inode->i_ino); return -EINVAL; @@ -898,7 +882,6 @@ static int do_writepage(struct folio *folio, size_t len) { int err = 0, blen; unsigned int block; - void *addr; size_t offset = 0; union ubifs_key key; struct inode *inode = folio->mapping->host; @@ -913,26 +896,19 @@ static int do_writepage(struct folio *folio, size_t len) folio_start_writeback(folio); - addr = kmap_local_folio(folio, offset); block = folio->index << UBIFS_BLOCKS_PER_PAGE_SHIFT; for (;;) { blen = min_t(size_t, len, UBIFS_BLOCK_SIZE); data_key_init(c, &key, inode->i_ino, block); - err = ubifs_jnl_write_data(c, inode, &key, addr, blen); + err = ubifs_jnl_write_data(c, inode, &key, folio, offset, blen); if (err) break; len -= blen; if (!len) break; block += 1; - addr += blen; - if (folio_test_highmem(folio) && !offset_in_page(addr)) { - kunmap_local(addr - blen); - offset += PAGE_SIZE; - addr = kmap_local_folio(folio, offset); - } + offset += blen; } - kunmap_local(addr); if (err) { mapping_set_error(folio->mapping, err); ubifs_err(c, "cannot write folio %lu of inode %lu, error %d", diff --git a/fs/ubifs/io.c b/fs/ubifs/io.c index 01d8eb170382..a79f229df475 100644 --- a/fs/ubifs/io.c +++ b/fs/ubifs/io.c @@ -1179,8 +1179,7 @@ int ubifs_wbuf_init(struct ubifs_info *c, struct ubifs_wbuf *wbuf) wbuf->c = c; wbuf->next_ino = 0; - hrtimer_init(&wbuf->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); - wbuf->timer.function = wbuf_timer_callback_nolock; + hrtimer_setup(&wbuf->timer, wbuf_timer_callback_nolock, CLOCK_MONOTONIC, HRTIMER_MODE_REL); return 0; } diff --git a/fs/ubifs/journal.c b/fs/ubifs/journal.c index 36ba79fbd2ff..ee954e64ce7f 100644 --- a/fs/ubifs/journal.c +++ b/fs/ubifs/journal.c @@ -845,14 +845,16 @@ out_ro: * @c: UBIFS file-system description object * @inode: inode the data node belongs to * @key: node key - * @buf: buffer to write + * @folio: buffer to write + * @offset: offset to write at * @len: data length (must not exceed %UBIFS_BLOCK_SIZE) * * This function writes a data node to the journal. Returns %0 if the data node * was successfully written, and a negative error code in case of failure. 
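
The fs/ubifs/io.c hunk above (like the fs/timerfd.c one earlier in this diff) is a mechanical conversion: hrtimer_init() followed by a manual assignment of the ->function callback becomes a single hrtimer_setup() call that takes the callback up front, removing the window in which the timer exists without a handler. The shape of the conversion, shown on a generic timer (illustrative):

	/* Before: two steps; ->function is briefly unset. */
	hrtimer_init(&t->timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
	t->timer.function = my_timer_fn;

	/* After: callback is wired at setup time. */
	hrtimer_setup(&t->timer, my_timer_fn, CLOCK_MONOTONIC, HRTIMER_MODE_REL);
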
*/ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, - const union ubifs_key *key, const void *buf, int len) + const union ubifs_key *key, struct folio *folio, + size_t offset, int len) { struct ubifs_data_node *data; int err, lnum, offs, compr_type, out_len, compr_len, auth_len; @@ -896,7 +898,8 @@ int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, compr_type = ui->compr_type; out_len = compr_len = dlen - UBIFS_DATA_NODE_SZ; - ubifs_compress(c, buf, len, &data->data, &compr_len, &compr_type); + ubifs_compress_folio(c, folio, offset, len, &data->data, &compr_len, + &compr_type); ubifs_assert(c, compr_len <= UBIFS_BLOCK_SIZE); if (encrypted) { @@ -1625,7 +1628,7 @@ static int truncate_data_node(const struct ubifs_info *c, const struct inode *in int err, dlen, compr_type, out_len, data_size; out_len = le32_to_cpu(dn->size); - buf = kmalloc_array(out_len, WORST_COMPR_FACTOR, GFP_NOFS); + buf = kmalloc(out_len, GFP_NOFS); if (!buf) return -ENOMEM; diff --git a/fs/ubifs/ubifs.h b/fs/ubifs/ubifs.h index 3375bbe0508c..256dbaeeb0de 100644 --- a/fs/ubifs/ubifs.h +++ b/fs/ubifs/ubifs.h @@ -124,13 +124,6 @@ #define OLD_ZNODE_AGE 20 #define YOUNG_ZNODE_AGE 5 -/* - * Some compressors, like LZO, may end up with more data then the input buffer. - * So UBIFS always allocates larger output buffer, to be sure the compressor - * will not corrupt memory in case of worst case compression. - */ -#define WORST_COMPR_FACTOR 2 - #ifdef CONFIG_FS_ENCRYPTION #define UBIFS_CIPHER_BLOCK_SIZE FSCRYPT_CONTENTS_ALIGNMENT #else @@ -141,7 +134,7 @@ * How much memory is needed for a buffer where we compress a data node. */ #define COMPRESSED_DATA_NODE_BUF_SZ \ - (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE * WORST_COMPR_FACTOR) + (UBIFS_DATA_NODE_SZ + UBIFS_BLOCK_SIZE) /* Maximum expected tree height for use by bottom_up_buf */ #define BOTTOM_UP_HEIGHT 64 @@ -270,6 +263,8 @@ enum { ASSACT_PANIC, }; +struct folio; + /** * struct ubifs_old_idx - index node obsoleted since last commit start. * @rb: rb-tree node @@ -835,16 +830,12 @@ struct ubifs_node_range { * struct ubifs_compressor - UBIFS compressor description structure. 
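
The truncate_data_node() change above, dropping kmalloc_array(out_len, WORST_COMPR_FACTOR, ...) for a plain kmalloc(out_len, ...), works because UBIFS no longer ever produces output larger than its input: ubifs_compress_common() caps the destination length below the input length, and any compressor failure (including "does not fit") falls back to storing the data uncompressed. Schematically (a condensed sketch of the logic above, not new code):

	/* Only accept output at least MIN_COMPRESS_DIFF smaller than input... */
	dlen = min(dlen, in_len - UBIFS_MIN_COMPRESS_DIFF);
	err = crypto_acomp_compress(req);	/* fails if dlen is too small */
	if (err) {
		/* ...otherwise store the data uncompressed. */
		memcpy(out_buf, in_buf, in_len);
		*out_len = in_len;
		*compr_type = UBIFS_COMPR_NONE;
	}

Hence a worst-case output buffer of in_len bytes always suffices, and the 2x WORST_COMPR_FACTOR margin can go.
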
* @compr_type: compressor type (%UBIFS_COMPR_LZO, etc) * @cc: cryptoapi compressor handle - * @comp_mutex: mutex used during compression - * @decomp_mutex: mutex used during decompression * @name: compressor name * @capi_name: cryptoapi compressor name */ struct ubifs_compressor { int compr_type; - struct crypto_comp *cc; - struct mutex *comp_mutex; - struct mutex *decomp_mutex; + struct crypto_acomp *cc; const char *name; const char *capi_name; }; @@ -1795,7 +1786,8 @@ int ubifs_jnl_update(struct ubifs_info *c, const struct inode *dir, const struct fscrypt_name *nm, const struct inode *inode, int deletion, int xent, int in_orphan); int ubifs_jnl_write_data(struct ubifs_info *c, const struct inode *inode, - const union ubifs_key *key, const void *buf, int len); + const union ubifs_key *key, struct folio *folio, + size_t offset, int len); int ubifs_jnl_write_inode(struct ubifs_info *c, const struct inode *inode); int ubifs_jnl_delete_inode(struct ubifs_info *c, const struct inode *inode); int ubifs_jnl_xrename(struct ubifs_info *c, const struct inode *fst_dir, @@ -2095,8 +2087,14 @@ int __init ubifs_compressors_init(void); void ubifs_compressors_exit(void); void ubifs_compress(const struct ubifs_info *c, const void *in_buf, int in_len, void *out_buf, int *out_len, int *compr_type); +void ubifs_compress_folio(const struct ubifs_info *c, struct folio *folio, + size_t offset, int in_len, void *out_buf, + int *out_len, int *compr_type); int ubifs_decompress(const struct ubifs_info *c, const void *buf, int len, void *out, int *out_len, int compr_type); +int ubifs_decompress_folio(const struct ubifs_info *c, const void *buf, + int len, struct folio *folio, size_t offset, + int *out_len, int compr_type); /* sysfs.c */ int ubifs_sysfs_init(void); diff --git a/fs/udf/file.c b/fs/udf/file.c index 412fe7c4d348..0d76c4f37b3e 100644 --- a/fs/udf/file.c +++ b/fs/udf/file.c @@ -69,7 +69,7 @@ static vm_fault_t udf_page_mkwrite(struct vm_fault *vmf) goto out_unlock; } - block_commit_write(&folio->page, 0, end); + block_commit_write(folio, 0, end); out_dirty: folio_mark_dirty(folio); folio_wait_stable(folio); diff --git a/fs/udf/inode.c b/fs/udf/inode.c index 70c907fe8af9..4386dd845e40 100644 --- a/fs/udf/inode.c +++ b/fs/udf/inode.c @@ -810,6 +810,7 @@ static int inode_getblk(struct inode *inode, struct udf_map_rq *map) } map->oflags = UDF_BLK_MAPPED; map->pblk = udf_get_lb_pblock(inode->i_sb, &eloc, offset); + ret = 0; goto out_free; } diff --git a/fs/udf/namei.c b/fs/udf/namei.c index 2cb49b6b0716..5f2e9a892bff 100644 --- a/fs/udf/namei.c +++ b/fs/udf/namei.c @@ -419,8 +419,8 @@ static int udf_mknod(struct mnt_idmap *idmap, struct inode *dir, return udf_add_nondir(dentry, inode); } -static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, - struct dentry *dentry, umode_t mode) +static struct dentry *udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, + struct dentry *dentry, umode_t mode) { struct inode *inode; struct udf_fileident_iter iter; @@ -430,7 +430,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, inode = udf_new_inode(dir, S_IFDIR | mode); if (IS_ERR(inode)) - return PTR_ERR(inode); + return ERR_CAST(inode); iinfo = UDF_I(inode); inode->i_op = &udf_dir_inode_operations; @@ -439,7 +439,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, if (err) { clear_nlink(inode); discard_new_inode(inode); - return err; + return ERR_PTR(err); } set_nlink(inode, 2); iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); @@ -456,7 +456,7 @@ static int 
udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, if (err) { clear_nlink(inode); discard_new_inode(inode); - return err; + return ERR_PTR(err); } iter.fi.icb.extLength = cpu_to_le32(inode->i_sb->s_blocksize); iter.fi.icb.extLocation = cpu_to_lelb(iinfo->i_location); @@ -471,7 +471,7 @@ static int udf_mkdir(struct mnt_idmap *idmap, struct inode *dir, mark_inode_dirty(dir); d_instantiate_new(dentry, inode); - return 0; + return NULL; } static int empty_dir(struct inode *dir) diff --git a/fs/udf/truncate.c b/fs/udf/truncate.c index 4f33a4a48886..b4071c9cf8c9 100644 --- a/fs/udf/truncate.c +++ b/fs/udf/truncate.c @@ -115,7 +115,7 @@ void udf_truncate_tail_extent(struct inode *inode) } /* This inode entry is in-memory only and thus we don't have to mark * the inode dirty */ - if (ret == 0) + if (ret >= 0) iinfo->i_lenExtents = inode->i_size; brelse(epos.bh); } diff --git a/fs/ufs/namei.c b/fs/ufs/namei.c index 38a024c8cccd..5b3c85c93242 100644 --- a/fs/ufs/namei.c +++ b/fs/ufs/namei.c @@ -166,8 +166,8 @@ static int ufs_link (struct dentry * old_dentry, struct inode * dir, return error; } -static int ufs_mkdir(struct mnt_idmap * idmap, struct inode * dir, - struct dentry * dentry, umode_t mode) +static struct dentry *ufs_mkdir(struct mnt_idmap * idmap, struct inode * dir, + struct dentry * dentry, umode_t mode) { struct inode * inode; int err; @@ -194,7 +194,7 @@ static int ufs_mkdir(struct mnt_idmap * idmap, struct inode * dir, goto out_fail; d_instantiate_new(dentry, inode); - return 0; + return NULL; out_fail: inode_dec_link_count(inode); @@ -202,7 +202,7 @@ out_fail: discard_new_inode(inode); out_dir: inode_dec_link_count(dir); - return err; + return ERR_PTR(err); } static int ufs_unlink(struct inode *dir, struct dentry *dentry) diff --git a/fs/unicode/Kconfig b/fs/unicode/Kconfig index da786a687fdc..4ad2c36550f1 100644 --- a/fs/unicode/Kconfig +++ b/fs/unicode/Kconfig @@ -10,6 +10,7 @@ config UNICODE be a separate loadable module that gets requested only when a file system actually use it. -config UNICODE_NORMALIZATION_SELFTEST +config UNICODE_NORMALIZATION_KUNIT_TEST tristate "Test UTF-8 normalization support" - depends on UNICODE + depends on UNICODE && KUNIT + default KUNIT_ALL_TESTS diff --git a/fs/unicode/Makefile b/fs/unicode/Makefile index e309afe2b2bb..d95be7fb9f6b 100644 --- a/fs/unicode/Makefile +++ b/fs/unicode/Makefile @@ -4,7 +4,7 @@ ifneq ($(CONFIG_UNICODE),) obj-y += unicode.o endif obj-$(CONFIG_UNICODE) += utf8data.o -obj-$(CONFIG_UNICODE_NORMALIZATION_SELFTEST) += utf8-selftest.o +obj-$(CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST) += tests/utf8_kunit.o unicode-y := utf8-norm.o utf8-core.o diff --git a/fs/unicode/tests/.kunitconfig b/fs/unicode/tests/.kunitconfig new file mode 100644 index 000000000000..62dd5c171f9c --- /dev/null +++ b/fs/unicode/tests/.kunitconfig @@ -0,0 +1,3 @@ +CONFIG_KUNIT=y +CONFIG_UNICODE=y +CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST=y diff --git a/fs/unicode/utf8-selftest.c b/fs/unicode/tests/utf8_kunit.c index 5ddaf27b21a6..5063e8138aec 100644 --- a/fs/unicode/utf8-selftest.c +++ b/fs/unicode/tests/utf8_kunit.c @@ -1,34 +1,14 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Kernel module for testing utf-8 support. + * KUnit tests for utf-8 support. * * Copyright 2017 Collabora Ltd. 
*/ -#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt - -#include <linux/module.h> -#include <linux/printk.h> #include <linux/unicode.h> -#include <linux/dcache.h> - -#include "utf8n.h" - -static unsigned int failed_tests; -static unsigned int total_tests; - -#define _test(cond, func, line, fmt, ...) do { \ - total_tests++; \ - if (!cond) { \ - failed_tests++; \ - pr_err("test %s:%d Failed: %s%s", \ - func, line, #cond, (fmt?":":".")); \ - if (fmt) \ - pr_err(fmt, ##__VA_ARGS__); \ - } \ - } while (0) -#define test_f(cond, fmt, ...) _test(cond, __func__, __LINE__, fmt, ##__VA_ARGS__) -#define test(cond) _test(cond, __func__, __LINE__, "") +#include <kunit/test.h> + +#include "../utf8n.h" static const struct { /* UTF-8 strings in this vector _must_ be NULL-terminated. */ @@ -167,69 +147,74 @@ static int utf8cursor(struct utf8cursor *u8c, const struct unicode_map *um, return utf8ncursor(u8c, um, n, s, (unsigned int)-1); } -static void check_utf8_nfdi(struct unicode_map *um) +static void check_utf8_nfdi(struct kunit *test) { int i; struct utf8cursor u8c; + struct unicode_map *um = test->priv; for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { int len = strlen(nfdi_test_data[i].str); int nlen = strlen(nfdi_test_data[i].dec); int j = 0; unsigned char c; + int ret; + + KUNIT_EXPECT_EQ(test, utf8len(um, UTF8_NFDI, nfdi_test_data[i].str), nlen); + KUNIT_EXPECT_EQ(test, utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len), + nlen); - test((utf8len(um, UTF8_NFDI, nfdi_test_data[i].str) == nlen)); - test((utf8nlen(um, UTF8_NFDI, nfdi_test_data[i].str, len) == - nlen)); - if (utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str) < 0) - pr_err("can't create cursor\n"); + ret = utf8cursor(&u8c, um, UTF8_NFDI, nfdi_test_data[i].str); + KUNIT_EXPECT_TRUE_MSG(test, ret >= 0, "Can't create cursor\n"); while ((c = utf8byte(&u8c)) > 0) { - test_f((c == nfdi_test_data[i].dec[j]), - "Unexpected byte 0x%x should be 0x%x\n", - c, nfdi_test_data[i].dec[j]); + KUNIT_EXPECT_EQ_MSG(test, c, nfdi_test_data[i].dec[j], + "Unexpected byte 0x%x should be 0x%x\n", + c, nfdi_test_data[i].dec[j]); j++; } - test((j == nlen)); + KUNIT_EXPECT_EQ(test, j, nlen); } } -static void check_utf8_nfdicf(struct unicode_map *um) +static void check_utf8_nfdicf(struct kunit *test) { int i; struct utf8cursor u8c; + struct unicode_map *um = test->priv; for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { int len = strlen(nfdicf_test_data[i].str); int nlen = strlen(nfdicf_test_data[i].ncf); int j = 0; + int ret; unsigned char c; - test((utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str) == - nlen)); - test((utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len) == - nlen)); + KUNIT_EXPECT_EQ(test, utf8len(um, UTF8_NFDICF, nfdicf_test_data[i].str), + nlen); + KUNIT_EXPECT_EQ(test, utf8nlen(um, UTF8_NFDICF, nfdicf_test_data[i].str, len), + nlen); - if (utf8cursor(&u8c, um, UTF8_NFDICF, - nfdicf_test_data[i].str) < 0) - pr_err("can't create cursor\n"); + ret = utf8cursor(&u8c, um, UTF8_NFDICF, nfdicf_test_data[i].str); + KUNIT_EXPECT_TRUE_MSG(test, ret >= 0, "Can't create cursor\n"); while ((c = utf8byte(&u8c)) > 0) { - test_f((c == nfdicf_test_data[i].ncf[j]), - "Unexpected byte 0x%x should be 0x%x\n", - c, nfdicf_test_data[i].ncf[j]); + KUNIT_EXPECT_EQ_MSG(test, c, nfdicf_test_data[i].ncf[j], + "Unexpected byte 0x%x should be 0x%x\n", + c, nfdicf_test_data[i].ncf[j]); j++; } - test((j == nlen)); + KUNIT_EXPECT_EQ(test, j, nlen); } } -static void check_utf8_comparisons(struct unicode_map *table) +static void check_utf8_comparisons(struct kunit *test) 
{ int i; + struct unicode_map *um = test->priv; for (i = 0; i < ARRAY_SIZE(nfdi_test_data); i++) { const struct qstr s1 = {.name = nfdi_test_data[i].str, @@ -237,8 +222,9 @@ static void check_utf8_comparisons(struct unicode_map *table) const struct qstr s2 = {.name = nfdi_test_data[i].dec, .len = sizeof(nfdi_test_data[i].dec)}; - test_f(!utf8_strncmp(table, &s1, &s2), - "%s %s comparison mismatch\n", s1.name, s2.name); + /* strncmp returns 0 when strings are equal */ + KUNIT_EXPECT_TRUE_MSG(test, utf8_strncmp(um, &s1, &s2) == 0, + "%s %s comparison mismatch\n", s1.name, s2.name); } for (i = 0; i < ARRAY_SIZE(nfdicf_test_data); i++) { @@ -247,62 +233,65 @@ static void check_utf8_comparisons(struct unicode_map *table) const struct qstr s2 = {.name = nfdicf_test_data[i].ncf, .len = sizeof(nfdicf_test_data[i].ncf)}; - test_f(!utf8_strncasecmp(table, &s1, &s2), - "%s %s comparison mismatch\n", s1.name, s2.name); + /* strncasecmp returns 0 when strings are equal */ + KUNIT_EXPECT_TRUE_MSG(test, utf8_strncasecmp(um, &s1, &s2) == 0, + "%s %s comparison mismatch\n", s1.name, s2.name); } } -static void check_supported_versions(struct unicode_map *um) +static void check_supported_versions(struct kunit *test) { + struct unicode_map *um = test->priv; /* Unicode 7.0.0 should be supported. */ - test(utf8version_is_supported(um, UNICODE_AGE(7, 0, 0))); + KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UNICODE_AGE(7, 0, 0))); /* Unicode 9.0.0 should be supported. */ - test(utf8version_is_supported(um, UNICODE_AGE(9, 0, 0))); + KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UNICODE_AGE(9, 0, 0))); /* Unicode 1x.0.0 (the latest version) should be supported. */ - test(utf8version_is_supported(um, UTF8_LATEST)); + KUNIT_EXPECT_TRUE(test, utf8version_is_supported(um, UTF8_LATEST)); /* Next versions don't exist. 
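
The rewrite above is the standard selftest-to-KUnit recipe: hand-rolled pass/fail counters and pr_err() calls become KUNIT_EXPECT_*() assertions, per-test setup and teardown move into the suite's .init/.exit hooks (here loading and unloading the unicode map via test->priv), and, as the following hunks show, the suite is registered with kunit_test_suite(). The skeleton, stripped of the UTF-8 specifics (illustrative):

#include <kunit/test.h>

static void my_case(struct kunit *test)
{
	KUNIT_EXPECT_EQ(test, 1 + 1, 2);
}

static struct kunit_case my_cases[] = {
	KUNIT_CASE(my_case),
	{}
};

static struct kunit_suite my_suite = {
	.name = "my_suite",
	.test_cases = my_cases,
};

kunit_test_suite(my_suite);

The .kunitconfig added above lets the KUnit tooling build and run just this suite.
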
*/ - test(!utf8version_is_supported(um, UNICODE_AGE(13, 0, 0))); - test(!utf8version_is_supported(um, UNICODE_AGE(0, 0, 0))); - test(!utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1))); + KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(13, 0, 0))); + KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(0, 0, 0))); + KUNIT_EXPECT_FALSE(test, utf8version_is_supported(um, UNICODE_AGE(-1, -1, -1))); } -static int __init init_test_ucd(void) +static struct kunit_case unicode_normalization_test_cases[] = { + KUNIT_CASE(check_supported_versions), + KUNIT_CASE(check_utf8_comparisons), + KUNIT_CASE(check_utf8_nfdicf), + KUNIT_CASE(check_utf8_nfdi), + {} +}; + +static int init_test_ucd(struct kunit *test) { - struct unicode_map *um; + struct unicode_map *um = utf8_load(UTF8_LATEST); - failed_tests = 0; - total_tests = 0; + test->priv = um; - um = utf8_load(UTF8_LATEST); - if (IS_ERR(um)) { - pr_err("%s: Unable to load utf8 table.\n", __func__); - return PTR_ERR(um); - } + KUNIT_EXPECT_EQ_MSG(test, IS_ERR(um), 0, + "%s: Unable to load utf8 table.\n", __func__); - check_supported_versions(um); - check_utf8_nfdi(um); - check_utf8_nfdicf(um); - check_utf8_comparisons(um); - - if (!failed_tests) - pr_info("All %u tests passed\n", total_tests); - else - pr_err("%u out of %u tests failed\n", failed_tests, - total_tests); - utf8_unload(um); return 0; } -static void __exit exit_test_ucd(void) +static void exit_test_ucd(struct kunit *test) { + utf8_unload(test->priv); } -module_init(init_test_ucd); -module_exit(exit_test_ucd); +static struct kunit_suite unicode_normalization_test_suite = { + .name = "unicode_normalization", + .test_cases = unicode_normalization_test_cases, + .init = init_test_ucd, + .exit = exit_test_ucd, +}; + +kunit_test_suite(unicode_normalization_test_suite); + MODULE_AUTHOR("Gabriel Krisman Bertazi <krisman@collabora.co.uk>"); -MODULE_DESCRIPTION("Kernel module for testing utf-8 support"); +MODULE_DESCRIPTION("KUnit tests for utf-8 support."); MODULE_LICENSE("GPL"); diff --git a/fs/unicode/utf8-norm.c b/fs/unicode/utf8-norm.c index 768f8ab448b8..7b998c99c88d 100644 --- a/fs/unicode/utf8-norm.c +++ b/fs/unicode/utf8-norm.c @@ -586,7 +586,7 @@ ccc_mismatch: } } -#ifdef CONFIG_UNICODE_NORMALIZATION_SELFTEST_MODULE +#if IS_MODULE(CONFIG_UNICODE_NORMALIZATION_KUNIT_TEST) EXPORT_SYMBOL_GPL(utf8version_is_supported); EXPORT_SYMBOL_GPL(utf8nlen); EXPORT_SYMBOL_GPL(utf8ncursor); diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index 97c4d71115d8..22f4bf956ba1 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c @@ -396,32 +396,6 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) goto out; /* - * If it's already released don't get it. This avoids to loop - * in __get_user_pages if userfaultfd_release waits on the - * caller of handle_userfault to release the mmap_lock. - */ - if (unlikely(READ_ONCE(ctx->released))) { - /* - * Don't return VM_FAULT_SIGBUS in this case, so a non - * cooperative manager can close the uffd after the - * last UFFDIO_COPY, without risking to trigger an - * involuntary SIGBUS if the process was starting the - * userfaultfd while the userfaultfd was still armed - * (but after the last UFFDIO_COPY). If the uffd - * wasn't already closed when the userfault reached - * this point, that would normally be solved by - * userfaultfd_must_wait returning 'false'. 
- * - * If we were to return VM_FAULT_SIGBUS here, the non - * cooperative manager would be instead forced to - * always call UFFDIO_UNREGISTER before it can safely - * close the uffd. - */ - ret = VM_FAULT_NOPAGE; - goto out; - } - - /* * Check that we can return VM_FAULT_RETRY. * * NOTE: it should become possible to return VM_FAULT_RETRY @@ -457,6 +431,31 @@ vm_fault_t handle_userfault(struct vm_fault *vmf, unsigned long reason) if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) goto out; + if (unlikely(READ_ONCE(ctx->released))) { + /* + * If a concurrent release is detected, do not return + * VM_FAULT_SIGBUS or VM_FAULT_NOPAGE, but instead always + * return VM_FAULT_RETRY with lock released proactively. + * + * If we were to return VM_FAULT_SIGBUS here, the non + * cooperative manager would be instead forced to + * always call UFFDIO_UNREGISTER before it can safely + * close the uffd, to avoid involuntary SIGBUS triggered. + * + * If we were to return VM_FAULT_NOPAGE, it would work for + * the fault path, in which the lock will be released + * later. However for GUP, faultin_page() does nothing + * special on NOPAGE, so GUP would spin retrying without + * releasing the mmap read lock, causing possible livelock. + * + * Here only VM_FAULT_RETRY would make sure the mmap lock + * be released immediately, so that the thread concurrently + * releasing the userfault would always make progress. + */ + release_fault_lock(vmf); + goto out; + } + /* take the reference before dropping the mmap_lock */ userfaultfd_ctx_get(ctx); @@ -1586,8 +1585,11 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, user_uffdio_copy = (struct uffdio_copy __user *) arg; ret = -EAGAIN; - if (atomic_read(&ctx->mmap_changing)) + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_copy->copy))) + return -EFAULT; goto out; + } ret = -EFAULT; if (copy_from_user(&uffdio_copy, user_uffdio_copy, @@ -1642,8 +1644,11 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; ret = -EAGAIN; - if (atomic_read(&ctx->mmap_changing)) + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage))) + return -EFAULT; goto out; + } ret = -EFAULT; if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage, @@ -1745,8 +1750,11 @@ static int userfaultfd_continue(struct userfaultfd_ctx *ctx, unsigned long arg) user_uffdio_continue = (struct uffdio_continue __user *)arg; ret = -EAGAIN; - if (atomic_read(&ctx->mmap_changing)) + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_continue->mapped))) + return -EFAULT; goto out; + } ret = -EFAULT; if (copy_from_user(&uffdio_continue, user_uffdio_continue, @@ -1802,8 +1810,11 @@ static inline int userfaultfd_poison(struct userfaultfd_ctx *ctx, unsigned long user_uffdio_poison = (struct uffdio_poison __user *)arg; ret = -EAGAIN; - if (atomic_read(&ctx->mmap_changing)) + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_poison->updated))) + return -EFAULT; goto out; + } ret = -EFAULT; if (copy_from_user(&uffdio_poison, user_uffdio_poison, @@ -1871,8 +1882,12 @@ static int userfaultfd_move(struct userfaultfd_ctx *ctx, user_uffdio_move = (struct uffdio_move __user *) arg; - if (atomic_read(&ctx->mmap_changing)) - return -EAGAIN; + ret = -EAGAIN; + if (unlikely(atomic_read(&ctx->mmap_changing))) { + if (unlikely(put_user(ret, &user_uffdio_move->move))) + return 
-EFAULT; + goto out; + } if (copy_from_user(&uffdio_move, user_uffdio_move, /* don't copy "move" last field */ diff --git a/fs/vboxsf/dir.c b/fs/vboxsf/dir.c index a859ac9b74ba..770e29ec3557 100644 --- a/fs/vboxsf/dir.c +++ b/fs/vboxsf/dir.c @@ -303,11 +303,11 @@ static int vboxsf_dir_mkfile(struct mnt_idmap *idmap, return vboxsf_dir_create(parent, dentry, mode, false, excl, NULL); } -static int vboxsf_dir_mkdir(struct mnt_idmap *idmap, - struct inode *parent, struct dentry *dentry, - umode_t mode) +static struct dentry *vboxsf_dir_mkdir(struct mnt_idmap *idmap, + struct inode *parent, struct dentry *dentry, + umode_t mode) { - return vboxsf_dir_create(parent, dentry, mode, true, true, NULL); + return ERR_PTR(vboxsf_dir_create(parent, dentry, mode, true, true, NULL)); } static int vboxsf_dir_atomic_open(struct inode *parent, struct dentry *dentry, diff --git a/fs/vboxsf/file.c b/fs/vboxsf/file.c index b780deb81b02..b492794f8e9a 100644 --- a/fs/vboxsf/file.c +++ b/fs/vboxsf/file.c @@ -262,40 +262,42 @@ static struct vboxsf_handle *vboxsf_get_write_handle(struct vboxsf_inode *sf_i) return sf_handle; } -static int vboxsf_writepage(struct page *page, struct writeback_control *wbc) +static int vboxsf_writepages(struct address_space *mapping, + struct writeback_control *wbc) { - struct inode *inode = page->mapping->host; + struct inode *inode = mapping->host; + struct folio *folio = NULL; struct vboxsf_inode *sf_i = VBOXSF_I(inode); struct vboxsf_handle *sf_handle; - loff_t off = page_offset(page); loff_t size = i_size_read(inode); - u32 nwrite = PAGE_SIZE; - u8 *buf; - int err; - - if (off + PAGE_SIZE > size) - nwrite = size & ~PAGE_MASK; + int error; sf_handle = vboxsf_get_write_handle(sf_i); if (!sf_handle) return -EBADF; - buf = kmap(page); - err = vboxsf_write(sf_handle->root, sf_handle->handle, - off, &nwrite, buf); - kunmap(page); + while ((folio = writeback_iter(mapping, wbc, folio, &error))) { + loff_t off = folio_pos(folio); + u32 nwrite = folio_size(folio); + u8 *buf; - kref_put(&sf_handle->refcount, vboxsf_handle_release); + if (nwrite > size - off) + nwrite = size - off; - if (err == 0) { - /* mtime changed */ - sf_i->force_restat = 1; - } else { - ClearPageUptodate(page); + buf = kmap_local_folio(folio, 0); + error = vboxsf_write(sf_handle->root, sf_handle->handle, + off, &nwrite, buf); + kunmap_local(buf); + + folio_unlock(folio); } - unlock_page(page); - return err; + kref_put(&sf_handle->refcount, vboxsf_handle_release); + + /* mtime changed */ + if (error == 0) + sf_i->force_restat = 1; + return error; } static int vboxsf_write_end(struct file *file, struct address_space *mapping, @@ -347,10 +349,11 @@ out: */ const struct address_space_operations vboxsf_reg_aops = { .read_folio = vboxsf_read_folio, - .writepage = vboxsf_writepage, + .writepages = vboxsf_writepages, .dirty_folio = filemap_dirty_folio, .write_begin = simple_write_begin, .write_end = vboxsf_write_end, + .migrate_folio = filemap_migrate_folio, }; static const char *vboxsf_get_link(struct dentry *dentry, struct inode *inode, diff --git a/fs/vboxsf/super.c b/fs/vboxsf/super.c index 1d94bb784108..0bc96ab6580b 100644 --- a/fs/vboxsf/super.c +++ b/fs/vboxsf/super.c @@ -21,8 +21,7 @@ #define VBOXSF_SUPER_MAGIC 0x786f4256 /* 'VBox' little endian */ -static const unsigned char VBSF_MOUNT_SIGNATURE[4] = { '\000', '\377', '\376', - '\375' }; +static const unsigned char VBSF_MOUNT_SIGNATURE[4] __nonstring = "\000\377\376\375"; static int follow_symlinks; module_param(follow_symlinks, int, 0444); diff --git 
a/fs/verity/Kconfig b/fs/verity/Kconfig index e1036e535352..40569d3527a7 100644 --- a/fs/verity/Kconfig +++ b/fs/verity/Kconfig @@ -4,13 +4,9 @@ config FS_VERITY bool "FS Verity (read-only file-based authenticity protection)" select CRYPTO select CRYPTO_HASH_INFO - # SHA-256 is implied as it's intended to be the default hash algorithm. + # SHA-256 is selected as it's intended to be the default hash algorithm. # To avoid bloat, other wanted algorithms must be selected explicitly. - # Note that CRYPTO_SHA256 denotes the generic C implementation, but - # some architectures provided optimized implementations of the same - # algorithm that may be used instead. In this case, CRYPTO_SHA256 may - # be omitted even if SHA-256 is being used. - imply CRYPTO_SHA256 + select CRYPTO_SHA256 help This option enables fs-verity. fs-verity is the dm-verity mechanism implemented at the file level. On supported diff --git a/fs/xattr.c b/fs/xattr.c index 02bee149ad96..8ec5b0204bfd 100644 --- a/fs/xattr.c +++ b/fs/xattr.c @@ -703,7 +703,7 @@ static int path_setxattrat(int dfd, const char __user *pathname, return error; filename = getname_maybe_null(pathname, at_flags); - if (!filename) { + if (!filename && dfd >= 0) { CLASS(fd, f)(dfd); if (fd_empty(f)) error = -EBADF; @@ -847,7 +847,7 @@ static ssize_t path_getxattrat(int dfd, const char __user *pathname, return error; filename = getname_maybe_null(pathname, at_flags); - if (!filename) { + if (!filename && dfd >= 0) { CLASS(fd, f)(dfd); if (fd_empty(f)) return -EBADF; @@ -1428,6 +1428,15 @@ static bool xattr_is_trusted(const char *name) return !strncmp(name, XATTR_TRUSTED_PREFIX, XATTR_TRUSTED_PREFIX_LEN); } +static bool xattr_is_maclabel(const char *name) +{ + const char *suffix = name + XATTR_SECURITY_PREFIX_LEN; + + return !strncmp(name, XATTR_SECURITY_PREFIX, + XATTR_SECURITY_PREFIX_LEN) && + security_ismaclabel(suffix); +} + /** * simple_xattr_list - list all xattr objects * @inode: inode from which to get the xattrs @@ -1460,6 +1469,17 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, if (err) return err; + err = security_inode_listsecurity(inode, buffer, remaining_size); + if (err < 0) + return err; + + if (buffer) { + if (remaining_size < err) + return -ERANGE; + buffer += err; + } + remaining_size -= err; + read_lock(&xattrs->lock); for (rbp = rb_first(&xattrs->rb_root); rbp; rbp = rb_next(rbp)) { xattr = rb_entry(rbp, struct simple_xattr, rb_node); @@ -1468,6 +1488,10 @@ ssize_t simple_xattr_list(struct inode *inode, struct simple_xattrs *xattrs, if (!trusted && xattr_is_trusted(xattr->name)) continue; + /* skip MAC labels; these are provided by LSM above */ + if (xattr_is_maclabel(xattr->name)) + continue; + err = xattr_list_one(&buffer, &remaining_size, xattr->name); if (err) break; diff --git a/fs/xfs/Kconfig b/fs/xfs/Kconfig index fffd6fffdce0..ae0ca6858496 100644 --- a/fs/xfs/Kconfig +++ b/fs/xfs/Kconfig @@ -3,7 +3,7 @@ config XFS_FS tristate "XFS filesystem support" depends on BLOCK select EXPORTFS - select LIBCRC32C + select CRC32 select FS_IOMAP help XFS is a high performance journaling filesystem which originated diff --git a/fs/xfs/Makefile b/fs/xfs/Makefile index 7afa51e41427..5bf501cf8271 100644 --- a/fs/xfs/Makefile +++ b/fs/xfs/Makefile @@ -64,6 +64,7 @@ xfs-y += $(addprefix libxfs/, \ xfs-$(CONFIG_XFS_RT) += $(addprefix libxfs/, \ xfs_rtbitmap.o \ xfs_rtgroup.o \ + xfs_zones.o \ ) # highlevel code @@ -136,7 +137,11 @@ xfs-$(CONFIG_XFS_QUOTA) += xfs_dquot.o \ xfs_quotaops.o # xfs_rtbitmap is shared with 
libxfs -xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o +xfs-$(CONFIG_XFS_RT) += xfs_rtalloc.o \ + xfs_zone_alloc.o \ + xfs_zone_gc.o \ + xfs_zone_info.o \ + xfs_zone_space_resv.o xfs-$(CONFIG_XFS_POSIX_ACL) += xfs_acl.o xfs-$(CONFIG_SYSCTL) += xfs_sysctl.o diff --git a/fs/xfs/libxfs/xfs_ag.c b/fs/xfs/libxfs/xfs_ag.c index b59cb461e096..e6ba914f6d06 100644 --- a/fs/xfs/libxfs/xfs_ag.c +++ b/fs/xfs/libxfs/xfs_ag.c @@ -301,7 +301,7 @@ xfs_get_aghdr_buf( struct xfs_buf *bp; int error; - error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, 0, &bp); + error = xfs_buf_get_uncached(mp->m_ddev_targp, numblks, &bp); if (error) return error; diff --git a/fs/xfs/libxfs/xfs_alloc.c b/fs/xfs/libxfs/xfs_alloc.c index 3d33e17f2e5c..7839efe050bf 100644 --- a/fs/xfs/libxfs/xfs_alloc.c +++ b/fs/xfs/libxfs/xfs_alloc.c @@ -33,8 +33,6 @@ struct kmem_cache *xfs_extfree_item_cache; struct workqueue_struct *xfs_alloc_wq; -#define XFS_ABSDIFF(a,b) (((a) <= (b)) ? ((b) - (a)) : ((a) - (b))) - #define XFSA_FIXUP_BNO_OK 1 #define XFSA_FIXUP_CNT_OK 2 @@ -410,8 +408,8 @@ xfs_alloc_compute_diff( if (newbno1 != NULLAGBLOCK && newbno2 != NULLAGBLOCK) { if (newlen1 < newlen2 || (newlen1 == newlen2 && - XFS_ABSDIFF(newbno1, wantbno) > - XFS_ABSDIFF(newbno2, wantbno))) + abs_diff(newbno1, wantbno) > + abs_diff(newbno2, wantbno))) newbno1 = newbno2; } else if (newbno2 != NULLAGBLOCK) newbno1 = newbno2; @@ -427,7 +425,7 @@ xfs_alloc_compute_diff( } else newbno1 = freeend - wantlen; *newbnop = newbno1; - return newbno1 == NULLAGBLOCK ? 0 : XFS_ABSDIFF(newbno1, wantbno); + return newbno1 == NULLAGBLOCK ? 0 : abs_diff(newbno1, wantbno); } /* diff --git a/fs/xfs/libxfs/xfs_bmap.c b/fs/xfs/libxfs/xfs_bmap.c index 0ef19f1469ec..d954f9b8071f 100644 --- a/fs/xfs/libxfs/xfs_bmap.c +++ b/fs/xfs/libxfs/xfs_bmap.c @@ -34,13 +34,13 @@ #include "xfs_ag.h" #include "xfs_ag_resv.h" #include "xfs_refcount.h" -#include "xfs_icache.h" #include "xfs_iomap.h" #include "xfs_health.h" #include "xfs_bmap_item.h" #include "xfs_symlink_remote.h" #include "xfs_inode_util.h" #include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" struct kmem_cache *xfs_bmap_intent_cache; @@ -171,18 +171,16 @@ xfs_bmbt_update( * Compute the worst-case number of indirect blocks that will be used * for ip's delayed extent of length "len". */ -STATIC xfs_filblks_t +xfs_filblks_t xfs_bmap_worst_indlen( - xfs_inode_t *ip, /* incore inode pointer */ - xfs_filblks_t len) /* delayed extent length */ + struct xfs_inode *ip, /* incore inode pointer */ + xfs_filblks_t len) /* delayed extent length */ { - int level; /* btree level number */ - int maxrecs; /* maximum record count at this level */ - xfs_mount_t *mp; /* mount structure */ - xfs_filblks_t rval; /* return value */ + struct xfs_mount *mp = ip->i_mount; + int maxrecs = mp->m_bmap_dmxr[0]; + int level; + xfs_filblks_t rval; - mp = ip->i_mount; - maxrecs = mp->m_bmap_dmxr[0]; for (level = 0, rval = 0; level < XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK); level++) { @@ -2572,146 +2570,6 @@ done: } /* - * Convert a hole to a delayed allocation. 
- */ -STATIC void -xfs_bmap_add_extent_hole_delay( - xfs_inode_t *ip, /* incore inode pointer */ - int whichfork, - struct xfs_iext_cursor *icur, - xfs_bmbt_irec_t *new) /* new data to add to file extents */ -{ - struct xfs_ifork *ifp; /* inode fork pointer */ - xfs_bmbt_irec_t left; /* left neighbor extent entry */ - xfs_filblks_t newlen=0; /* new indirect size */ - xfs_filblks_t oldlen=0; /* old indirect size */ - xfs_bmbt_irec_t right; /* right neighbor extent entry */ - uint32_t state = xfs_bmap_fork_to_state(whichfork); - xfs_filblks_t temp; /* temp for indirect calculations */ - - ifp = xfs_ifork_ptr(ip, whichfork); - ASSERT(isnullstartblock(new->br_startblock)); - - /* - * Check and set flags if this segment has a left neighbor - */ - if (xfs_iext_peek_prev_extent(ifp, icur, &left)) { - state |= BMAP_LEFT_VALID; - if (isnullstartblock(left.br_startblock)) - state |= BMAP_LEFT_DELAY; - } - - /* - * Check and set flags if the current (right) segment exists. - * If it doesn't exist, we're converting the hole at end-of-file. - */ - if (xfs_iext_get_extent(ifp, icur, &right)) { - state |= BMAP_RIGHT_VALID; - if (isnullstartblock(right.br_startblock)) - state |= BMAP_RIGHT_DELAY; - } - - /* - * Set contiguity flags on the left and right neighbors. - * Don't let extents get too large, even if the pieces are contiguous. - */ - if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && - left.br_startoff + left.br_blockcount == new->br_startoff && - left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) - state |= BMAP_LEFT_CONTIG; - - if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && - new->br_startoff + new->br_blockcount == right.br_startoff && - new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && - (!(state & BMAP_LEFT_CONTIG) || - (left.br_blockcount + new->br_blockcount + - right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))) - state |= BMAP_RIGHT_CONTIG; - - /* - * Switch out based on the contiguity flags. - */ - switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { - case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: - /* - * New allocation is contiguous with delayed allocations - * on the left and on the right. - * Merge all three into a single extent record. - */ - temp = left.br_blockcount + new->br_blockcount + - right.br_blockcount; - - oldlen = startblockval(left.br_startblock) + - startblockval(new->br_startblock) + - startblockval(right.br_startblock); - newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - oldlen); - left.br_startblock = nullstartblock(newlen); - left.br_blockcount = temp; - - xfs_iext_remove(ip, icur, state); - xfs_iext_prev(ifp, icur); - xfs_iext_update_extent(ip, state, icur, &left); - break; - - case BMAP_LEFT_CONTIG: - /* - * New allocation is contiguous with a delayed allocation - * on the left. - * Merge the new allocation with the left neighbor. - */ - temp = left.br_blockcount + new->br_blockcount; - - oldlen = startblockval(left.br_startblock) + - startblockval(new->br_startblock); - newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - oldlen); - left.br_blockcount = temp; - left.br_startblock = nullstartblock(newlen); - - xfs_iext_prev(ifp, icur); - xfs_iext_update_extent(ip, state, icur, &left); - break; - - case BMAP_RIGHT_CONTIG: - /* - * New allocation is contiguous with a delayed allocation - * on the right. - * Merge the new allocation with the right neighbor. 
- */ - temp = new->br_blockcount + right.br_blockcount; - oldlen = startblockval(new->br_startblock) + - startblockval(right.br_startblock); - newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), - oldlen); - right.br_startoff = new->br_startoff; - right.br_startblock = nullstartblock(newlen); - right.br_blockcount = temp; - xfs_iext_update_extent(ip, state, icur, &right); - break; - - case 0: - /* - * New allocation is not contiguous with another - * delayed allocation. - * Insert a new entry. - */ - oldlen = newlen = 0; - xfs_iext_insert(ip, icur, new, state); - break; - } - if (oldlen != newlen) { - ASSERT(oldlen > newlen); - xfs_add_fdblocks(ip->i_mount, oldlen - newlen); - - /* - * Nothing to do for disk quota accounting here. - */ - xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen); - } -} - -/* * Convert a hole to a real allocation. */ STATIC int /* error */ @@ -3454,6 +3312,11 @@ xfs_bmap_compute_alignments( align = xfs_get_cowextsz_hint(ap->ip); else if (ap->datatype & XFS_ALLOC_USERDATA) align = xfs_get_extsz_hint(ap->ip); + + /* Try to align start block to any minimum allocation alignment */ + if (align > 1 && (ap->flags & XFS_BMAPI_EXTSZALIGN)) + args->alignment = align; + if (align) { if (xfs_bmap_extsize_align(mp, &ap->got, &ap->prev, align, 0, ap->eof, 0, ap->conv, &ap->offset, @@ -4039,144 +3902,6 @@ xfs_bmapi_read( return 0; } -/* - * Add a delayed allocation extent to an inode. Blocks are reserved from the - * global pool and the extent inserted into the inode in-core extent tree. - * - * On entry, got refers to the first extent beyond the offset of the extent to - * allocate or eof is specified if no such extent exists. On return, got refers - * to the extent record that was inserted to the inode fork. - * - * Note that the allocated extent may have been merged with contiguous extents - * during insertion into the inode fork. Thus, got does not reflect the current - * state of the inode fork on return. If necessary, the caller can use lastx to - * look up the updated record in the inode fork. - */ -int -xfs_bmapi_reserve_delalloc( - struct xfs_inode *ip, - int whichfork, - xfs_fileoff_t off, - xfs_filblks_t len, - xfs_filblks_t prealloc, - struct xfs_bmbt_irec *got, - struct xfs_iext_cursor *icur, - int eof) -{ - struct xfs_mount *mp = ip->i_mount; - struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); - xfs_extlen_t alen; - xfs_extlen_t indlen; - uint64_t fdblocks; - int error; - xfs_fileoff_t aoff; - bool use_cowextszhint = - whichfork == XFS_COW_FORK && !prealloc; - -retry: - /* - * Cap the alloc length. Keep track of prealloc so we know whether to - * tag the inode before we return. - */ - aoff = off; - alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); - if (!eof) - alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); - if (prealloc && alen >= len) - prealloc = alen - len; - - /* - * If we're targetting the COW fork but aren't creating a speculative - * posteof preallocation, try to expand the reservation to align with - * the COW extent size hint if there's sufficient free space. - * - * Unlike the data fork, the CoW cancellation functions will free all - * the reservations at inactivation, so we don't require that every - * delalloc reservation have a dirty pagecache. 
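The xfs_bmap_compute_alignments() hunk above is the consumer of the new XFS_BMAPI_EXTSZALIGN flag (defined in the xfs_bmap.h diff further below): with it set, the start block of the allocation, not merely the file-offset range, is aligned to the extent size hint. A condensed, illustrative-only restatement of that decision:

/* Illustrative helper, not from the patch: pick args->alignment. */
static inline xfs_extlen_t
bmapi_alloc_alignment(xfs_extlen_t align, uint32_t bmapi_flags)
{
	if (align > 1 && (bmapi_flags & XFS_BMAPI_EXTSZALIGN))
		return align;	/* startblock rounded to the hint */
	return 0;		/* keep the allocator's default */
}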
- */ - if (use_cowextszhint) { - struct xfs_bmbt_irec prev; - xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip); - - if (!xfs_iext_peek_prev_extent(ifp, icur, &prev)) - prev.br_startoff = NULLFILEOFF; - - error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof, - 1, 0, &aoff, &alen); - ASSERT(!error); - } - - /* - * Make a transaction-less quota reservation for delayed allocation - * blocks. This number gets adjusted later. We return if we haven't - * allocated blocks already inside this loop. - */ - error = xfs_quota_reserve_blkres(ip, alen); - if (error) - goto out; - - /* - * Split changing sb for alen and indlen since they could be coming - * from different places. - */ - indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); - ASSERT(indlen > 0); - - fdblocks = indlen; - if (XFS_IS_REALTIME_INODE(ip)) { - error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); - if (error) - goto out_unreserve_quota; - } else { - fdblocks += alen; - } - - error = xfs_dec_fdblocks(mp, fdblocks, false); - if (error) - goto out_unreserve_frextents; - - ip->i_delayed_blks += alen; - xfs_mod_delalloc(ip, alen, indlen); - - got->br_startoff = aoff; - got->br_startblock = nullstartblock(indlen); - got->br_blockcount = alen; - got->br_state = XFS_EXT_NORM; - - xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got); - - /* - * Tag the inode if blocks were preallocated. Note that COW fork - * preallocation can occur at the start or end of the extent, even when - * prealloc == 0, so we must also check the aligned offset and length. - */ - if (whichfork == XFS_DATA_FORK && prealloc) - xfs_inode_set_eofblocks_tag(ip); - if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) - xfs_inode_set_cowblocks_tag(ip); - - return 0; - -out_unreserve_frextents: - if (XFS_IS_REALTIME_INODE(ip)) - xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); -out_unreserve_quota: - if (XFS_IS_QUOTA_ON(mp)) - xfs_quota_unreserve_blkres(ip, alen); -out: - if (error == -ENOSPC || error == -EDQUOT) { - trace_xfs_delalloc_enospc(ip, off, len); - - if (prealloc || use_cowextszhint) { - /* retry without any preallocation */ - use_cowextszhint = false; - prealloc = 0; - goto retry; - } - } - return error; -} - static int xfs_bmapi_allocate( struct xfs_bmalloca *bma) @@ -4948,7 +4673,8 @@ xfs_bmap_del_extent_delay( int whichfork, struct xfs_iext_cursor *icur, struct xfs_bmbt_irec *got, - struct xfs_bmbt_irec *del) + struct xfs_bmbt_irec *del, + uint32_t bflags) /* bmapi flags */ { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); @@ -5068,10 +4794,18 @@ xfs_bmap_del_extent_delay( da_diff = da_old - da_new; fdblocks = da_diff; - if (isrt) - xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, del->br_blockcount)); - else + if (bflags & XFS_BMAPI_REMAP) { + ; + } else if (isrt) { + xfs_rtbxlen_t rtxlen; + + rtxlen = xfs_blen_to_rtbxlen(mp, del->br_blockcount); + if (xfs_is_zoned_inode(ip)) + xfs_zoned_add_available(mp, rtxlen); + xfs_add_frextents(mp, rtxlen); + } else { fdblocks += del->br_blockcount; + } xfs_add_fdblocks(mp, fdblocks); xfs_mod_delalloc(ip, -(int64_t)del->br_blockcount, -da_diff); @@ -5670,7 +5404,8 @@ __xfs_bunmapi( delete: if (wasdel) { - xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del); + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, + &del, flags); } else { error = xfs_bmap_del_extent_real(ip, tp, &icur, cur, &del, &tmp_logflags, whichfork, diff --git a/fs/xfs/libxfs/xfs_bmap.h b/fs/xfs/libxfs/xfs_bmap.h index 4b721d935994..d5f2729305fa 100644 
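The removed xfs_bmapi_reserve_delalloc() above also documents a fallback worth keeping in mind wherever the function lands next: on -ENOSPC or -EDQUOT the speculative preallocation (and the COW extent size hint expansion) is dropped and the reservation is retried at minimal size. A distilled sketch of that shape, with try_reserve() as a hypothetical stand-in for the quota/counter reservations:

static int reserve_with_fallback(struct xfs_inode *ip, xfs_filblks_t len,
		xfs_filblks_t prealloc)
{
	int error;

retry:
	error = try_reserve(ip, len + prealloc);	/* hypothetical */
	if ((error == -ENOSPC || error == -EDQUOT) && prealloc) {
		prealloc = 0;	/* retry without any preallocation */
		goto retry;
	}
	return error;
}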
--- a/fs/xfs/libxfs/xfs_bmap.h +++ b/fs/xfs/libxfs/xfs_bmap.h @@ -87,6 +87,9 @@ struct xfs_bmalloca { /* Do not update the rmap btree. Used for reconstructing bmbt from rmapbt. */ #define XFS_BMAPI_NORMAP (1u << 10) +/* Try to align allocations to the extent size hint */ +#define XFS_BMAPI_EXTSZALIGN (1u << 11) + #define XFS_BMAPI_FLAGS \ { XFS_BMAPI_ENTIRE, "ENTIRE" }, \ { XFS_BMAPI_METADATA, "METADATA" }, \ @@ -98,7 +101,8 @@ struct xfs_bmalloca { { XFS_BMAPI_REMAP, "REMAP" }, \ { XFS_BMAPI_COWFORK, "COWFORK" }, \ { XFS_BMAPI_NODISCARD, "NODISCARD" }, \ - { XFS_BMAPI_NORMAP, "NORMAP" } + { XFS_BMAPI_NORMAP, "NORMAP" },\ + { XFS_BMAPI_EXTSZALIGN, "EXTSZALIGN" } static inline int xfs_bmapi_aflag(int w) @@ -204,7 +208,7 @@ int xfs_bunmapi(struct xfs_trans *tp, struct xfs_inode *ip, xfs_extnum_t nexts, int *done); void xfs_bmap_del_extent_delay(struct xfs_inode *ip, int whichfork, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, - struct xfs_bmbt_irec *del); + struct xfs_bmbt_irec *del, uint32_t bflags); void xfs_bmap_del_extent_cow(struct xfs_inode *ip, struct xfs_iext_cursor *cur, struct xfs_bmbt_irec *got, struct xfs_bmbt_irec *del); @@ -219,10 +223,6 @@ int xfs_bmap_insert_extents(struct xfs_trans *tp, struct xfs_inode *ip, bool *done, xfs_fileoff_t stop_fsb); int xfs_bmap_split_extent(struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t split_offset); -int xfs_bmapi_reserve_delalloc(struct xfs_inode *ip, int whichfork, - xfs_fileoff_t off, xfs_filblks_t len, xfs_filblks_t prealloc, - struct xfs_bmbt_irec *got, struct xfs_iext_cursor *cur, - int eof); int xfs_bmapi_convert_delalloc(struct xfs_inode *ip, int whichfork, xfs_off_t offset, struct iomap *iomap, unsigned int *seq); int xfs_bmap_add_extent_unwritten_real(struct xfs_trans *tp, @@ -233,6 +233,7 @@ xfs_extlen_t xfs_bmapi_minleft(struct xfs_trans *tp, struct xfs_inode *ip, int fork); int xfs_bmap_btalloc_low_space(struct xfs_bmalloca *ap, struct xfs_alloc_arg *args); +xfs_filblks_t xfs_bmap_worst_indlen(struct xfs_inode *ip, xfs_filblks_t len); enum xfs_bmap_intent_type { XFS_BMAP_MAP = 1, diff --git a/fs/xfs/libxfs/xfs_format.h b/fs/xfs/libxfs/xfs_format.h index b1007fb661ba..9566a7623365 100644 --- a/fs/xfs/libxfs/xfs_format.h +++ b/fs/xfs/libxfs/xfs_format.h @@ -178,9 +178,10 @@ typedef struct xfs_sb { xfs_rgnumber_t sb_rgcount; /* number of realtime groups */ xfs_rtxlen_t sb_rgextents; /* size of a realtime group in rtx */ - uint8_t sb_rgblklog; /* rt group number shift */ uint8_t sb_pad[7]; /* zeroes */ + xfs_rfsblock_t sb_rtstart; /* start of internal RT section (FSB) */ + xfs_filblks_t sb_rtreserved; /* reserved (zoned) RT blocks */ /* must be padded to 64 bit alignment */ } xfs_sb_t; @@ -270,9 +271,10 @@ struct xfs_dsb { __be64 sb_metadirino; /* metadata directory tree root */ __be32 sb_rgcount; /* # of realtime groups */ __be32 sb_rgextents; /* size of rtgroup in rtx */ - __u8 sb_rgblklog; /* rt group number shift */ __u8 sb_pad[7]; /* zeroes */ + __be64 sb_rtstart; /* start of internal RT section (FSB) */ + __be64 sb_rtreserved; /* reserved (zoned) RT blocks */ /* * The size of this structure must be padded to 64 bit alignment. 
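The di_cowextsize/di_used_blocks union added above is keyed off the inode metatype: only the rtrmap inode of a zoned filesystem reinterprets the field, which is exactly what the xfs_dinode_verify() hunk further below checks. Condensed into a single predicate (a sketch, not the verifier itself):

static inline bool
dinode_u32_is_used_blocks(struct xfs_mount *mp, struct xfs_dinode *dip)
{
	/* Everyone else keeps the COW extent size hint semantics. */
	return xfs_has_zoned(mp) &&
	       dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP);
}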
@@ -395,6 +397,9 @@ xfs_sb_has_ro_compat_feature( #define XFS_SB_FEAT_INCOMPAT_EXCHRANGE (1 << 6) /* exchangerange supported */ #define XFS_SB_FEAT_INCOMPAT_PARENT (1 << 7) /* parent pointers */ #define XFS_SB_FEAT_INCOMPAT_METADIR (1 << 8) /* metadata dir tree */ +#define XFS_SB_FEAT_INCOMPAT_ZONED (1 << 9) /* zoned RT allocator */ +#define XFS_SB_FEAT_INCOMPAT_ZONE_GAPS (1 << 10) /* RTGs have LBA gaps */ + #define XFS_SB_FEAT_INCOMPAT_ALL \ (XFS_SB_FEAT_INCOMPAT_FTYPE | \ XFS_SB_FEAT_INCOMPAT_SPINODES | \ @@ -404,7 +409,9 @@ xfs_sb_has_ro_compat_feature( XFS_SB_FEAT_INCOMPAT_NREXT64 | \ XFS_SB_FEAT_INCOMPAT_EXCHRANGE | \ XFS_SB_FEAT_INCOMPAT_PARENT | \ - XFS_SB_FEAT_INCOMPAT_METADIR) + XFS_SB_FEAT_INCOMPAT_METADIR | \ + XFS_SB_FEAT_INCOMPAT_ZONED | \ + XFS_SB_FEAT_INCOMPAT_ZONE_GAPS) #define XFS_SB_FEAT_INCOMPAT_UNKNOWN ~XFS_SB_FEAT_INCOMPAT_ALL static inline bool @@ -952,7 +959,12 @@ struct xfs_dinode { __be64 di_changecount; /* number of attribute changes */ __be64 di_lsn; /* flush sequence */ __be64 di_flags2; /* more random flags */ - __be32 di_cowextsize; /* basic cow extent size for file */ + union { + /* basic cow extent size for (regular) file */ + __be32 di_cowextsize; + /* used blocks in RTG for (zoned) rtrmap inode */ + __be32 di_used_blocks; + }; __u8 di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ diff --git a/fs/xfs/libxfs/xfs_fs.h b/fs/xfs/libxfs/xfs_fs.h index 2c3171262b44..12463ba766da 100644 --- a/fs/xfs/libxfs/xfs_fs.h +++ b/fs/xfs/libxfs/xfs_fs.h @@ -189,7 +189,9 @@ struct xfs_fsop_geom { uint32_t checked; /* o: checked fs & rt metadata */ __u32 rgextents; /* rt extents in a realtime group */ __u32 rgcount; /* number of realtime groups */ - __u64 reserved[16]; /* reserved space */ + __u64 rtstart; /* start of internal rt section */ + __u64 rtreserved; /* RT (zoned) reserved blocks */ + __u64 reserved[14]; /* reserved space */ }; #define XFS_FSOP_GEOM_SICK_COUNTERS (1 << 0) /* summary counters */ @@ -247,6 +249,7 @@ typedef struct xfs_fsop_resblks { #define XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE (1 << 24) /* exchange range */ #define XFS_FSOP_GEOM_FLAGS_PARENT (1 << 25) /* linux parent pointers */ #define XFS_FSOP_GEOM_FLAGS_METADIR (1 << 26) /* metadata directories */ +#define XFS_FSOP_GEOM_FLAGS_ZONED (1 << 27) /* zoned rt device */ /* * Minimum and maximum sizes need for growth checks. @@ -1079,6 +1082,15 @@ struct xfs_rtgroup_geometry { #define XFS_IOC_COMMIT_RANGE _IOW ('X', 131, struct xfs_commit_range) /* XFS_IOC_GETFSUUID ---------- deprecated 140 */ +/* + * Devices supported by a single XFS file system. Reported in fsmaps fmr_device + * when using internal RT devices. + */ +enum xfs_device { + XFS_DEV_DATA = 1, + XFS_DEV_LOG = 2, + XFS_DEV_RT = 3, +}; #ifndef HAVE_BBMACROS /* diff --git a/fs/xfs/libxfs/xfs_group.h b/fs/xfs/libxfs/xfs_group.h index 242b05627c7a..4423932a2313 100644 --- a/fs/xfs/libxfs/xfs_group.h +++ b/fs/xfs/libxfs/xfs_group.h @@ -19,10 +19,23 @@ struct xfs_group { #ifdef __KERNEL__ /* -- kernel only structures below this line -- */ - /* - * Track freed but not yet committed extents. - */ - struct xfs_extent_busy_tree *xg_busy_extents; + union { + /* + * For perags and non-zoned RT groups: + * Track freed but not yet committed extents. + */ + struct xfs_extent_busy_tree *xg_busy_extents; + + /* + * For zoned RT groups: + * List of groups that need a zone reset. 
+ * + * The zonegc code forces a log flush of the rtrmap inode before + * resetting the write pointer, so there is no need for + * individual busy extent tracking. + */ + struct xfs_group *xg_next_reset; + }; /* * Bitsets of per-ag metadata that have been checked and/or are sick. @@ -107,9 +120,15 @@ xfs_gbno_to_daddr( xfs_agblock_t gbno) { struct xfs_mount *mp = xg->xg_mount; - uint32_t blocks = mp->m_groups[xg->xg_type].blocks; + struct xfs_groups *g = &mp->m_groups[xg->xg_type]; + xfs_fsblock_t fsbno; + + if (g->has_daddr_gaps) + fsbno = xfs_gbno_to_fsb(xg, gbno); + else + fsbno = (xfs_fsblock_t)xg->xg_gno * g->blocks + gbno; - return XFS_FSB_TO_BB(mp, (xfs_fsblock_t)xg->xg_gno * blocks + gbno); + return XFS_FSB_TO_BB(mp, g->start_fsb + fsbno); } static inline uint32_t diff --git a/fs/xfs/libxfs/xfs_ialloc.c b/fs/xfs/libxfs/xfs_ialloc.c index f3a840a425f5..0c47b5c6ca7d 100644 --- a/fs/xfs/libxfs/xfs_ialloc.c +++ b/fs/xfs/libxfs/xfs_ialloc.c @@ -364,7 +364,7 @@ xfs_ialloc_inode_init( (j * M_IGEO(mp)->blocks_per_cluster)); error = xfs_trans_get_buf(tp, mp->m_ddev_targp, d, mp->m_bsize * M_IGEO(mp)->blocks_per_cluster, - XBF_UNMAPPED, &fbuf); + 0, &fbuf); if (error) return error; @@ -1927,7 +1927,7 @@ xfs_dialloc( * that we can immediately allocate, but then we allow allocation on the * second pass if we fail to find an AG with free inodes in it. */ - if (percpu_counter_read_positive(&mp->m_fdblocks) < + if (xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) < mp->m_low_space[XFS_LOWSP_1_PCNT]) { ok_alloc = false; low_space = true; diff --git a/fs/xfs/libxfs/xfs_inode_buf.c b/fs/xfs/libxfs/xfs_inode_buf.c index f24fa628fecf..aa13fc00afd7 100644 --- a/fs/xfs/libxfs/xfs_inode_buf.c +++ b/fs/xfs/libxfs/xfs_inode_buf.c @@ -137,7 +137,7 @@ xfs_imap_to_bp( int error; error = xfs_trans_read_buf(mp, tp, mp->m_ddev_targp, imap->im_blkno, - imap->im_len, XBF_UNMAPPED, bpp, &xfs_inode_buf_ops); + imap->im_len, 0, bpp, &xfs_inode_buf_ops); if (xfs_metadata_is_sick(error)) xfs_agno_mark_sick(mp, xfs_daddr_to_agno(mp, imap->im_blkno), XFS_SICK_AG_INODES); @@ -252,7 +252,10 @@ xfs_inode_from_disk( be64_to_cpu(from->di_changecount)); ip->i_crtime = xfs_inode_from_disk_ts(from, from->di_crtime); ip->i_diflags2 = be64_to_cpu(from->di_flags2); + /* also covers the di_used_blocks union arm: */ ip->i_cowextsize = be32_to_cpu(from->di_cowextsize); + BUILD_BUG_ON(sizeof(from->di_cowextsize) != + sizeof(from->di_used_blocks)); } error = xfs_iformat_data_fork(ip, from); @@ -349,6 +352,7 @@ xfs_inode_to_disk( to->di_changecount = cpu_to_be64(inode_peek_iversion(inode)); to->di_crtime = xfs_inode_to_disk_ts(ip, ip->i_crtime); to->di_flags2 = cpu_to_be64(ip->i_diflags2); + /* also covers the di_used_blocks union arm: */ to->di_cowextsize = cpu_to_be32(ip->i_cowextsize); to->di_ino = cpu_to_be64(ip->i_ino); to->di_lsn = cpu_to_be64(lsn); @@ -752,11 +756,18 @@ xfs_dinode_verify( !xfs_has_rtreflink(mp)) return __this_address; - /* COW extent size hint validation */ - fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), - mode, flags, flags2); - if (fa) - return fa; + if (xfs_has_zoned(mp) && + dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) { + if (be32_to_cpu(dip->di_used_blocks) > mp->m_sb.sb_rgextents) + return __this_address; + } else { + /* COW extent size hint validation */ + fa = xfs_inode_validate_cowextsize(mp, + be32_to_cpu(dip->di_cowextsize), + mode, flags, flags2); + if (fa) + return fa; + } /* bigtime iflag can only happen on bigtime filesystems */ if (xfs_dinode_has_bigtime(dip) && diff 
--git a/fs/xfs/libxfs/xfs_inode_util.c b/fs/xfs/libxfs/xfs_inode_util.c index deb0b7c00a1f..48fe49a5f050 100644 --- a/fs/xfs/libxfs/xfs_inode_util.c +++ b/fs/xfs/libxfs/xfs_inode_util.c @@ -322,6 +322,7 @@ xfs_inode_init( if (xfs_has_v3inodes(mp)) { inode_set_iversion(inode, 1); + /* also covers the di_used_blocks union arm: */ ip->i_cowextsize = 0; times |= XFS_ICHGTIME_CREATE; } diff --git a/fs/xfs/libxfs/xfs_log_format.h b/fs/xfs/libxfs/xfs_log_format.h index a472ac2e45d0..0d637c276db0 100644 --- a/fs/xfs/libxfs/xfs_log_format.h +++ b/fs/xfs/libxfs/xfs_log_format.h @@ -475,7 +475,12 @@ struct xfs_log_dinode { xfs_lsn_t di_lsn; uint64_t di_flags2; /* more random flags */ - uint32_t di_cowextsize; /* basic cow extent size for file */ + union { + /* basic cow extent size for (regular) file */ + uint32_t di_cowextsize; + /* used blocks in RTG for (zoned) rtrmap inode */ + uint32_t di_used_blocks; + }; uint8_t di_pad2[12]; /* more padding for future expansion */ /* fields only written to during inode creation */ diff --git a/fs/xfs/libxfs/xfs_log_rlimit.c b/fs/xfs/libxfs/xfs_log_rlimit.c index d3bd6a86c8fe..34bba96d30ca 100644 --- a/fs/xfs/libxfs/xfs_log_rlimit.c +++ b/fs/xfs/libxfs/xfs_log_rlimit.c @@ -91,6 +91,7 @@ xfs_log_calc_trans_resv_for_minlogblocks( */ if (xfs_want_minlogsize_fixes(&mp->m_sb)) { xfs_trans_resv_calc(mp, resv); + resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend; return; } @@ -107,6 +108,9 @@ xfs_log_calc_trans_resv_for_minlogblocks( xfs_trans_resv_calc(mp, resv); + /* Copy the dynamic transaction reservation types from the running fs */ + resv->tr_atomic_ioend = M_RES(mp)->tr_atomic_ioend; + if (xfs_has_reflink(mp)) { /* * In the early days of reflink, typical log operation counts diff --git a/fs/xfs/libxfs/xfs_metafile.c b/fs/xfs/libxfs/xfs_metafile.c index 2f5f554a36d4..225923e463c4 100644 --- a/fs/xfs/libxfs/xfs_metafile.c +++ b/fs/xfs/libxfs/xfs_metafile.c @@ -21,6 +21,9 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_alloc.h" +#include "xfs_rtgroup.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_rtrefcount_btree.h" static const struct { enum xfs_metafile_type mtype; @@ -74,12 +77,11 @@ xfs_metafile_clear_iflag( } /* - * Is the amount of space that could be allocated towards a given metadata - * file at or beneath a certain threshold? + * Is the metafile reservations at or beneath a certain threshold? */ static inline bool xfs_metafile_resv_can_cover( - struct xfs_inode *ip, + struct xfs_mount *mp, int64_t rhs) { /* @@ -88,43 +90,38 @@ xfs_metafile_resv_can_cover( * global free block count. Take care of the first case to avoid * touching the per-cpu counter. */ - if (ip->i_delayed_blks >= rhs) + if (mp->m_metafile_resv_avail >= rhs) return true; /* * There aren't enough blocks left in the inode's reservation, but it * isn't critical unless there also isn't enough free space. */ - return __percpu_counter_compare(&ip->i_mount->m_fdblocks, - rhs - ip->i_delayed_blks, 2048) >= 0; + return xfs_compare_freecounter(mp, XC_FREE_BLOCKS, + rhs - mp->m_metafile_resv_avail, 2048) >= 0; } /* - * Is this metadata file critically low on blocks? For now we'll define that - * as the number of blocks we can get our hands on being less than 10% of what - * we reserved or less than some arbitrary number (maximum btree height). + * Is the metafile reservation critically low on blocks? For now we'll define + * that as the number of blocks we can get our hands on being less than 10% of + * what we reserved or less than some arbitrary number (maximum btree height). 
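In concrete numbers: with m_rtbtree_maxlevels = 9 and a reservation target of 1000 blocks, the low-water mark is max(9, 1000 / 10) = 100 blocks. A simplified standalone form of the check, ignoring the 2048-block per-cpu batching slack that xfs_compare_freecounter() tolerates:

static bool
metafile_resv_is_critical(uint64_t avail, uint64_t global_free,
		uint64_t target, unsigned int max_btree_height)
{
	uint64_t low_water = target / 10;

	if (low_water < max_btree_height)
		low_water = max_btree_height;
	/* covered either by the reservation or by global free space */
	return avail + global_free < low_water;
}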
*/ bool xfs_metafile_resv_critical( - struct xfs_inode *ip) + struct xfs_mount *mp) { - uint64_t asked_low_water; + ASSERT(xfs_has_metadir(mp)); - if (!ip) - return false; - - ASSERT(xfs_is_metadir_inode(ip)); - trace_xfs_metafile_resv_critical(ip, 0); + trace_xfs_metafile_resv_critical(mp, 0); - if (!xfs_metafile_resv_can_cover(ip, ip->i_mount->m_rtbtree_maxlevels)) + if (!xfs_metafile_resv_can_cover(mp, mp->m_rtbtree_maxlevels)) return true; - asked_low_water = div_u64(ip->i_meta_resv_asked, 10); - if (!xfs_metafile_resv_can_cover(ip, asked_low_water)) + if (!xfs_metafile_resv_can_cover(mp, + div_u64(mp->m_metafile_resv_target, 10))) return true; - return XFS_TEST_ERROR(false, ip->i_mount, - XFS_ERRTAG_METAFILE_RESV_CRITICAL); + return XFS_TEST_ERROR(false, mp, XFS_ERRTAG_METAFILE_RESV_CRITICAL); } /* Allocate a block from the metadata file's reservation. */ @@ -133,22 +130,24 @@ xfs_metafile_resv_alloc_space( struct xfs_inode *ip, struct xfs_alloc_arg *args) { + struct xfs_mount *mp = ip->i_mount; int64_t len = args->len; ASSERT(xfs_is_metadir_inode(ip)); ASSERT(args->resv == XFS_AG_RESV_METAFILE); - trace_xfs_metafile_resv_alloc_space(ip, args->len); + trace_xfs_metafile_resv_alloc_space(mp, args->len); /* * Allocate the blocks from the metadata inode's block reservation * and update the ondisk sb counter. */ - if (ip->i_delayed_blks > 0) { + mutex_lock(&mp->m_metafile_resv_lock); + if (mp->m_metafile_resv_avail > 0) { int64_t from_resv; - from_resv = min_t(int64_t, len, ip->i_delayed_blks); - ip->i_delayed_blks -= from_resv; + from_resv = min_t(int64_t, len, mp->m_metafile_resv_avail); + mp->m_metafile_resv_avail -= from_resv; xfs_mod_delalloc(ip, 0, -from_resv); xfs_trans_mod_sb(args->tp, XFS_TRANS_SB_RES_FDBLOCKS, -from_resv); @@ -175,6 +174,9 @@ xfs_metafile_resv_alloc_space( xfs_trans_mod_sb(args->tp, field, -len); } + mp->m_metafile_resv_used += args->len; + mutex_unlock(&mp->m_metafile_resv_lock); + ip->i_nblocks += args->len; xfs_trans_log_inode(args->tp, ip, XFS_ILOG_CORE); } @@ -186,26 +188,33 @@ xfs_metafile_resv_free_space( struct xfs_trans *tp, xfs_filblks_t len) { + struct xfs_mount *mp = ip->i_mount; int64_t to_resv; ASSERT(xfs_is_metadir_inode(ip)); - trace_xfs_metafile_resv_free_space(ip, len); + + trace_xfs_metafile_resv_free_space(mp, len); ip->i_nblocks -= len; xfs_trans_log_inode(tp, ip, XFS_ILOG_CORE); + mutex_lock(&mp->m_metafile_resv_lock); + mp->m_metafile_resv_used -= len; + /* * Add the freed blocks back into the inode's delalloc reservation * until it reaches the maximum size. Update the ondisk fdblocks only. */ - to_resv = ip->i_meta_resv_asked - (ip->i_nblocks + ip->i_delayed_blks); + to_resv = mp->m_metafile_resv_target - + (mp->m_metafile_resv_used + mp->m_metafile_resv_avail); if (to_resv > 0) { to_resv = min_t(int64_t, to_resv, len); - ip->i_delayed_blks += to_resv; + mp->m_metafile_resv_avail += to_resv; xfs_mod_delalloc(ip, 0, to_resv); xfs_trans_mod_sb(tp, XFS_TRANS_SB_RES_FDBLOCKS, to_resv); len -= to_resv; } + mutex_unlock(&mp->m_metafile_resv_lock); /* * Everything else goes back to the filesystem, so update the in-core @@ -215,61 +224,99 @@ xfs_metafile_resv_free_space( xfs_trans_mod_sb(tp, XFS_TRANS_SB_FDBLOCKS, len); } -/* Release a metadata file's space reservation. 
*/ +static void +__xfs_metafile_resv_free( + struct xfs_mount *mp) +{ + if (mp->m_metafile_resv_avail) { + xfs_mod_sb_delalloc(mp, -(int64_t)mp->m_metafile_resv_avail); + xfs_add_fdblocks(mp, mp->m_metafile_resv_avail); + } + mp->m_metafile_resv_avail = 0; + mp->m_metafile_resv_used = 0; + mp->m_metafile_resv_target = 0; +} + +/* Release unused metafile space reservation. */ void xfs_metafile_resv_free( - struct xfs_inode *ip) + struct xfs_mount *mp) { - /* Non-btree metadata inodes don't need space reservations. */ - if (!ip || !ip->i_meta_resv_asked) + if (!xfs_has_metadir(mp)) return; - ASSERT(xfs_is_metadir_inode(ip)); - trace_xfs_metafile_resv_free(ip, 0); + trace_xfs_metafile_resv_free(mp, 0); - if (ip->i_delayed_blks) { - xfs_mod_delalloc(ip, 0, -ip->i_delayed_blks); - xfs_add_fdblocks(ip->i_mount, ip->i_delayed_blks); - ip->i_delayed_blks = 0; - } - ip->i_meta_resv_asked = 0; + mutex_lock(&mp->m_metafile_resv_lock); + __xfs_metafile_resv_free(mp); + mutex_unlock(&mp->m_metafile_resv_lock); } -/* Set up a metadata file's space reservation. */ +/* Set up a metafile space reservation. */ int xfs_metafile_resv_init( - struct xfs_inode *ip, - xfs_filblks_t ask) + struct xfs_mount *mp) { + struct xfs_rtgroup *rtg = NULL; + xfs_filblks_t used = 0, target = 0; xfs_filblks_t hidden_space; - xfs_filblks_t used; - int error; + xfs_rfsblock_t dblocks_avail = mp->m_sb.sb_dblocks / 4; + int error = 0; - if (!ip || ip->i_meta_resv_asked > 0) + if (!xfs_has_metadir(mp)) return 0; - ASSERT(xfs_is_metadir_inode(ip)); + /* + * Free any previous reservation to have a clean slate. + */ + mutex_lock(&mp->m_metafile_resv_lock); + __xfs_metafile_resv_free(mp); + + /* + * Currently the only btree metafiles that require reservations are the + * rtrmap and the rtrefcount. Anything new will have to be added here + * as well. + */ + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + if (xfs_has_rtrmapbt(mp)) { + used += rtg_rmap(rtg)->i_nblocks; + target += xfs_rtrmapbt_calc_reserves(mp); + } + if (xfs_has_rtreflink(mp)) { + used += rtg_refcount(rtg)->i_nblocks; + target += xfs_rtrefcountbt_calc_reserves(mp); + } + } + + if (!target) + goto out_unlock; /* - * Space taken by all other metadata btrees are accounted on-disk as + * Space taken by the per-AG metadata btrees are accounted on-disk as * used space. We therefore only hide the space that is reserved but * not used by the trees. 
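The target computation in the xfs_metafile_resv_init() rewrite boils down to three clamps, visible in the hunk just below: never hide less than zero (used > target), never hide more than a quarter of the data device (dblocks_avail), and hide only the delta. A condensed restatement for clarity (sketch only):

static uint64_t
resv_hidden_space(uint64_t used, uint64_t target, uint64_t dblocks_avail)
{
	if (used > target)
		target = used;		/* nothing left to hide */
	else if (target > dblocks_avail)
		target = dblocks_avail;	/* cap at sb_dblocks / 4 */
	return target - used;
}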
*/ - used = ip->i_nblocks; - if (used > ask) - ask = used; - hidden_space = ask - used; + if (used > target) + target = used; + else if (target > dblocks_avail) + target = dblocks_avail; + hidden_space = target - used; - error = xfs_dec_fdblocks(ip->i_mount, hidden_space, true); + error = xfs_dec_fdblocks(mp, hidden_space, true); if (error) { - trace_xfs_metafile_resv_init_error(ip, error, _RET_IP_); - return error; + trace_xfs_metafile_resv_init_error(mp, 0); + goto out_unlock; } - xfs_mod_delalloc(ip, 0, hidden_space); - ip->i_delayed_blks = hidden_space; - ip->i_meta_resv_asked = ask; + xfs_mod_sb_delalloc(mp, hidden_space); + + mp->m_metafile_resv_target = target; + mp->m_metafile_resv_used = used; + mp->m_metafile_resv_avail = hidden_space; + + trace_xfs_metafile_resv_init(mp, target); - trace_xfs_metafile_resv_init(ip, ask); - return 0; +out_unlock: + mutex_unlock(&mp->m_metafile_resv_lock); + return error; } diff --git a/fs/xfs/libxfs/xfs_metafile.h b/fs/xfs/libxfs/xfs_metafile.h index 95af4b52e5a7..ae6f9e779b98 100644 --- a/fs/xfs/libxfs/xfs_metafile.h +++ b/fs/xfs/libxfs/xfs_metafile.h @@ -26,13 +26,13 @@ void xfs_metafile_clear_iflag(struct xfs_trans *tp, struct xfs_inode *ip); /* Space reservations for metadata inodes. */ struct xfs_alloc_arg; -bool xfs_metafile_resv_critical(struct xfs_inode *ip); +bool xfs_metafile_resv_critical(struct xfs_mount *mp); void xfs_metafile_resv_alloc_space(struct xfs_inode *ip, struct xfs_alloc_arg *args); void xfs_metafile_resv_free_space(struct xfs_inode *ip, struct xfs_trans *tp, xfs_filblks_t len); -void xfs_metafile_resv_free(struct xfs_inode *ip); -int xfs_metafile_resv_init(struct xfs_inode *ip, xfs_filblks_t ask); +void xfs_metafile_resv_free(struct xfs_mount *mp); +int xfs_metafile_resv_init(struct xfs_mount *mp); /* Code specific to kernel/userspace; must be provided externally. */ diff --git a/fs/xfs/libxfs/xfs_ondisk.h b/fs/xfs/libxfs/xfs_ondisk.h index a85ecddaa48e..5ed44fdf7491 100644 --- a/fs/xfs/libxfs/xfs_ondisk.h +++ b/fs/xfs/libxfs/xfs_ondisk.h @@ -233,8 +233,8 @@ xfs_check_ondisk_structs(void) 16299260424LL); /* superblock field checks we got from xfs/122 */ - XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 288); - XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 288); + XFS_CHECK_STRUCT_SIZE(struct xfs_dsb, 304); + XFS_CHECK_STRUCT_SIZE(struct xfs_sb, 304); XFS_CHECK_SB_OFFSET(sb_magicnum, 0); XFS_CHECK_SB_OFFSET(sb_blocksize, 4); XFS_CHECK_SB_OFFSET(sb_dblocks, 8); @@ -295,6 +295,8 @@ xfs_check_ondisk_structs(void) XFS_CHECK_SB_OFFSET(sb_rgextents, 276); XFS_CHECK_SB_OFFSET(sb_rgblklog, 280); XFS_CHECK_SB_OFFSET(sb_pad, 281); + XFS_CHECK_SB_OFFSET(sb_rtstart, 288); + XFS_CHECK_SB_OFFSET(sb_rtreserved, 296); } #endif /* __XFS_ONDISK_H */ diff --git a/fs/xfs/libxfs/xfs_rtbitmap.c b/fs/xfs/libxfs/xfs_rtbitmap.c index 770adf60dd73..5057536e586c 100644 --- a/fs/xfs/libxfs/xfs_rtbitmap.c +++ b/fs/xfs/libxfs/xfs_rtbitmap.c @@ -1123,6 +1123,7 @@ xfs_rtfree_blocks( xfs_extlen_t mod; int error; + ASSERT(!xfs_has_zoned(mp)); ASSERT(rtlen <= XFS_MAX_BMBT_EXTLEN); mod = xfs_blen_to_rtxoff(mp, rtlen); @@ -1174,6 +1175,9 @@ xfs_rtalloc_query_range( end = min(end, rtg->rtg_extents - 1); + if (xfs_has_zoned(mp)) + return -EINVAL; + /* Iterate the bitmap, looking for discrepancies. 
*/ while (start <= end) { struct xfs_rtalloc_rec rec; @@ -1268,6 +1272,8 @@ xfs_rtbitmap_blockcount_len( struct xfs_mount *mp, xfs_rtbxlen_t rtextents) { + if (xfs_has_zoned(mp)) + return 0; return howmany_64(rtextents, xfs_rtbitmap_rtx_per_rbmblock(mp)); } @@ -1308,6 +1314,11 @@ xfs_rtsummary_blockcount( xfs_rtbxlen_t rextents = xfs_rtbitmap_bitcount(mp); unsigned long long rsumwords; + if (xfs_has_zoned(mp)) { + *rsumlevels = 0; + return 0; + } + *rsumlevels = xfs_compute_rextslog(rextents) + 1; rsumwords = xfs_rtbitmap_blockcount_len(mp, rextents) * (*rsumlevels); return howmany_64(rsumwords, mp->m_blockwsize); diff --git a/fs/xfs/libxfs/xfs_rtgroup.c b/fs/xfs/libxfs/xfs_rtgroup.c index d84d32f1b48f..9186c58e83d5 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.c +++ b/fs/xfs/libxfs/xfs_rtgroup.c @@ -194,15 +194,17 @@ xfs_rtgroup_lock( ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) || !(rtglock_flags & XFS_RTGLOCK_BITMAP)); - if (rtglock_flags & XFS_RTGLOCK_BITMAP) { - /* - * Lock both realtime free space metadata inodes for a freespace - * update. - */ - xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); - xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL); - } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { - xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); + if (!xfs_has_zoned(rtg_mount(rtg))) { + if (rtglock_flags & XFS_RTGLOCK_BITMAP) { + /* + * Lock both realtime free space metadata inodes for a + * freespace update. + */ + xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); + xfs_ilock(rtg_summary(rtg), XFS_ILOCK_EXCL); + } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { + xfs_ilock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); + } } if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) @@ -228,11 +230,13 @@ xfs_rtgroup_unlock( if ((rtglock_flags & XFS_RTGLOCK_RMAP) && rtg_rmap(rtg)) xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_EXCL); - if (rtglock_flags & XFS_RTGLOCK_BITMAP) { - xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL); - xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); - } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { - xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); + if (!xfs_has_zoned(rtg_mount(rtg))) { + if (rtglock_flags & XFS_RTGLOCK_BITMAP) { + xfs_iunlock(rtg_summary(rtg), XFS_ILOCK_EXCL); + xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_EXCL); + } else if (rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED) { + xfs_iunlock(rtg_bitmap(rtg), XFS_ILOCK_SHARED); + } } } @@ -249,7 +253,8 @@ xfs_rtgroup_trans_join( ASSERT(!(rtglock_flags & ~XFS_RTGLOCK_ALL_FLAGS)); ASSERT(!(rtglock_flags & XFS_RTGLOCK_BITMAP_SHARED)); - if (rtglock_flags & XFS_RTGLOCK_BITMAP) { + if (!xfs_has_zoned(rtg_mount(rtg)) && + (rtglock_flags & XFS_RTGLOCK_BITMAP)) { xfs_trans_ijoin(tp, rtg_bitmap(rtg), XFS_ILOCK_EXCL); xfs_trans_ijoin(tp, rtg_summary(rtg), XFS_ILOCK_EXCL); } @@ -270,7 +275,7 @@ xfs_rtgroup_get_geometry( /* Fill out form. 
*/ memset(rgeo, 0, sizeof(*rgeo)); rgeo->rg_number = rtg_rgno(rtg); - rgeo->rg_length = rtg_group(rtg)->xg_block_count; + rgeo->rg_length = rtg_blocks(rtg); xfs_rtgroup_geom_health(rtg, rgeo); return 0; } @@ -354,6 +359,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = { .sick = XFS_SICK_RG_BITMAP, .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) | (1U << XFS_DINODE_FMT_BTREE), + .enabled = xfs_has_nonzoned, .create = xfs_rtbitmap_create, }, [XFS_RTGI_SUMMARY] = { @@ -362,6 +368,7 @@ static const struct xfs_rtginode_ops xfs_rtginode_ops[XFS_RTGI_MAX] = { .sick = XFS_SICK_RG_SUMMARY, .fmt_mask = (1U << XFS_DINODE_FMT_EXTENTS) | (1U << XFS_DINODE_FMT_BTREE), + .enabled = xfs_has_nonzoned, .create = xfs_rtsummary_create, }, [XFS_RTGI_RMAP] = { diff --git a/fs/xfs/libxfs/xfs_rtgroup.h b/fs/xfs/libxfs/xfs_rtgroup.h index 03f39d4e43fc..d36a6ae0abe5 100644 --- a/fs/xfs/libxfs/xfs_rtgroup.h +++ b/fs/xfs/libxfs/xfs_rtgroup.h @@ -37,15 +37,33 @@ struct xfs_rtgroup { xfs_rtxnum_t rtg_extents; /* - * Cache of rt summary level per bitmap block with the invariant that - * rtg_rsum_cache[bbno] > the maximum i for which rsum[i][bbno] != 0, - * or 0 if rsum[i][bbno] == 0 for all i. - * + * For bitmap based RT devices this points to a cache of rt summary + * level per bitmap block with the invariant that rtg_rsum_cache[bbno] + * > the maximum i for which rsum[i][bbno] != 0, or 0 if + * rsum[i][bbno] == 0 for all i. * Reads and writes are serialized by the rsumip inode lock. + * + * For zoned RT devices this points to the open zone structure for + * a group that is open for writers, or is NULL. */ - uint8_t *rtg_rsum_cache; + union { + uint8_t *rtg_rsum_cache; + struct xfs_open_zone *rtg_open_zone; + }; }; +/* + * For zoned RT devices this is set on groups that have no written blocks + * and can be picked by the allocator for opening. + */ +#define XFS_RTG_FREE XA_MARK_0 + +/* + * For zoned RT devices this is set on groups that are fully written and that + * have unused blocks. Used by the garbage collection to pick targets. 
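These xarray marks let the zone allocator and the garbage collector iterate candidate groups without scanning every rtgroup. Assuming the group xarray lives at mp->m_groups[type].xa as in the group conversion, the lookup side would be along these lines (sketch only; group reference counting elided):

static struct xfs_rtgroup *
pick_free_rtg(struct xfs_mount *mp)
{
	struct xfs_group *xg;
	unsigned long index;

	/* Visit only groups tagged XFS_RTG_FREE. */
	xa_for_each_marked(&mp->m_groups[XG_TYPE_RTG].xa, index, xg,
			XFS_RTG_FREE)
		return to_rtg(xg);
	return NULL;
}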
+ */ +#define XFS_RTG_RECLAIMABLE XA_MARK_1 + static inline struct xfs_rtgroup *to_rtg(struct xfs_group *xg) { return container_of(xg, struct xfs_rtgroup, rtg_group); @@ -66,6 +84,11 @@ static inline xfs_rgnumber_t rtg_rgno(const struct xfs_rtgroup *rtg) return rtg->rtg_group.xg_gno; } +static inline xfs_rgblock_t rtg_blocks(const struct xfs_rtgroup *rtg) +{ + return rtg->rtg_group.xg_block_count; +} + static inline struct xfs_inode *rtg_bitmap(const struct xfs_rtgroup *rtg) { return rtg->rtg_inodes[XFS_RTGI_BITMAP]; @@ -222,10 +245,14 @@ xfs_rtb_to_daddr( xfs_rtblock_t rtbno) { struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; - xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno); - uint64_t start_bno = (xfs_rtblock_t)rgno * g->blocks; - return XFS_FSB_TO_BB(mp, start_bno + (rtbno & g->blkmask)); + if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) { + xfs_rgnumber_t rgno = xfs_rtb_to_rgno(mp, rtbno); + + rtbno = (xfs_rtblock_t)rgno * g->blocks + (rtbno & g->blkmask); + } + + return XFS_FSB_TO_BB(mp, g->start_fsb + rtbno); } static inline xfs_rtblock_t @@ -233,10 +260,11 @@ xfs_daddr_to_rtb( struct xfs_mount *mp, xfs_daddr_t daddr) { - xfs_rfsblock_t bno = XFS_BB_TO_FSBT(mp, daddr); + struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; + xfs_rfsblock_t bno; - if (xfs_has_rtgroups(mp)) { - struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; + bno = XFS_BB_TO_FSBT(mp, daddr) - g->start_fsb; + if (xfs_has_rtgroups(mp) && !g->has_daddr_gaps) { xfs_rgnumber_t rgno; uint32_t rgbno; diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.c b/fs/xfs/libxfs/xfs_rtrmap_btree.c index e4ec36943cb7..9bdc2cbfc113 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.c +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.c @@ -1033,3 +1033,22 @@ xfs_rtrmapbt_init_rtsb( xfs_btree_del_cursor(cur, error); return error; } + +/* + * Return the highest rgbno currently tracked by the rmap for this rtg. 
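The xfs_rtb_to_daddr()/xfs_daddr_to_rtb() hunks above are easy to spot-check with concrete numbers. A standalone check, assuming 4k blocks (one fsblock = 8 sectors), a hypothetical internal RT section starting at fsblock 1000, and a gapped (has_daddr_gaps) layout where rtbno passes through unscaled:

#include <assert.h>
#include <stdint.h>

int main(void)
{
	uint64_t start_fsb = 1000, rtbno = 42;
	uint64_t daddr = (start_fsb + rtbno) * 8;	/* FSB -> BB */

	assert(daddr == 8336);
	/* And back again: the round trip recovers the rtbno. */
	assert(daddr / 8 - start_fsb == rtbno);
	return 0;
}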
+ */ +xfs_rgblock_t +xfs_rtrmap_highest_rgbno( + struct xfs_rtgroup *rtg) +{ + struct xfs_btree_block *block = rtg_rmap(rtg)->i_df.if_broot; + union xfs_btree_key key = {}; + struct xfs_btree_cur *cur; + + if (block->bb_numrecs == 0) + return NULLRGBLOCK; + cur = xfs_rtrmapbt_init_cursor(NULL, rtg); + xfs_btree_get_keys(cur, block, &key); + xfs_btree_del_cursor(cur, XFS_BTREE_NOERROR); + return be32_to_cpu(key.__rmap_bigkey[1].rm_startblock); +} diff --git a/fs/xfs/libxfs/xfs_rtrmap_btree.h b/fs/xfs/libxfs/xfs_rtrmap_btree.h index 9d0915089891..e328fd62a149 100644 --- a/fs/xfs/libxfs/xfs_rtrmap_btree.h +++ b/fs/xfs/libxfs/xfs_rtrmap_btree.h @@ -207,4 +207,6 @@ struct xfs_btree_cur *xfs_rtrmapbt_mem_cursor(struct xfs_rtgroup *rtg, int xfs_rtrmapbt_mem_init(struct xfs_mount *mp, struct xfbtree *xfbtree, struct xfs_buftarg *btp, xfs_rgnumber_t rgno); +xfs_rgblock_t xfs_rtrmap_highest_rgbno(struct xfs_rtgroup *rtg); + #endif /* __XFS_RTRMAP_BTREE_H__ */ diff --git a/fs/xfs/libxfs/xfs_sb.c b/fs/xfs/libxfs/xfs_sb.c index 3dc5f5dba162..711e180f9ebb 100644 --- a/fs/xfs/libxfs/xfs_sb.c +++ b/fs/xfs/libxfs/xfs_sb.c @@ -185,6 +185,8 @@ xfs_sb_version_to_features( features |= XFS_FEAT_PARENT; if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_METADIR) features |= XFS_FEAT_METADIR; + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) + features |= XFS_FEAT_ZONED; return features; } @@ -266,6 +268,9 @@ static uint64_t xfs_expected_rbmblocks( struct xfs_sb *sbp) { + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) + return 0; return howmany_64(xfs_extents_per_rbm(sbp), NBBY * xfs_rtbmblock_size(sbp)); } @@ -275,9 +280,15 @@ bool xfs_validate_rt_geometry( struct xfs_sb *sbp) { - if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || - sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) - return false; + if (xfs_sb_is_v5(sbp) && + (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED)) { + if (sbp->sb_rextsize != 1) + return false; + } else { + if (sbp->sb_rextsize * sbp->sb_blocksize > XFS_MAX_RTEXTSIZE || + sbp->sb_rextsize * sbp->sb_blocksize < XFS_MIN_RTEXTSIZE) + return false; + } if (sbp->sb_rblocks == 0) { if (sbp->sb_rextents != 0 || sbp->sb_rbmblocks != 0 || @@ -435,6 +446,34 @@ xfs_validate_sb_rtgroups( return 0; } +static int +xfs_validate_sb_zoned( + struct xfs_mount *mp, + struct xfs_sb *sbp) +{ + if (sbp->sb_frextents != 0) { + xfs_warn(mp, +"sb_frextents must be zero for zoned file systems."); + return -EINVAL; + } + + if (sbp->sb_rtstart && sbp->sb_rtstart < sbp->sb_dblocks) { + xfs_warn(mp, +"sb_rtstart (%lld) overlaps sb_dblocks (%lld).", + sbp->sb_rtstart, sbp->sb_dblocks); + return -EINVAL; + } + + if (sbp->sb_rtreserved && sbp->sb_rtreserved >= sbp->sb_rblocks) { + xfs_warn(mp, +"sb_rtreserved (%lld) larger than sb_rblocks (%lld).", + sbp->sb_rtreserved, sbp->sb_rblocks); + return -EINVAL; + } + + return 0; +} + /* Check the validity of the SB. 
*/ STATIC int xfs_validate_sb_common( @@ -523,6 +562,11 @@ xfs_validate_sb_common( if (error) return error; } + if (sbp->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { + error = xfs_validate_sb_zoned(mp, sbp); + if (error) + return error; + } } else if (sbp->sb_qflags & (XFS_PQUOTA_ENFD | XFS_GQUOTA_ENFD | XFS_PQUOTA_CHKD | XFS_GQUOTA_CHKD)) { xfs_notice(mp, @@ -835,6 +879,14 @@ __xfs_sb_from_disk( to->sb_rgcount = 1; to->sb_rgextents = 0; } + + if (to->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { + to->sb_rtstart = be64_to_cpu(from->sb_rtstart); + to->sb_rtreserved = be64_to_cpu(from->sb_rtreserved); + } else { + to->sb_rtstart = 0; + to->sb_rtreserved = 0; + } } void @@ -1001,6 +1053,11 @@ xfs_sb_to_disk( to->sb_rbmino = cpu_to_be64(0); to->sb_rsumino = cpu_to_be64(0); } + + if (from->sb_features_incompat & XFS_SB_FEAT_INCOMPAT_ZONED) { + to->sb_rtstart = cpu_to_be64(from->sb_rtstart); + to->sb_rtreserved = cpu_to_be64(from->sb_rtreserved); + } } /* @@ -1146,6 +1203,10 @@ xfs_sb_mount_rextsize( rgs->blocks = sbp->sb_rgextents * sbp->sb_rextsize; rgs->blklog = mp->m_sb.sb_rgblklog; rgs->blkmask = xfs_mask32lo(mp->m_sb.sb_rgblklog); + rgs->start_fsb = mp->m_sb.sb_rtstart; + if (xfs_sb_has_incompat_feature(sbp, + XFS_SB_FEAT_INCOMPAT_ZONE_GAPS)) + rgs->has_daddr_gaps = true; } else { rgs->blocks = 0; rgs->blklog = 0; @@ -1265,8 +1326,7 @@ xfs_log_sb( mp->m_sb.sb_ifree = min_t(uint64_t, percpu_counter_sum_positive(&mp->m_ifree), mp->m_sb.sb_icount); - mp->m_sb.sb_fdblocks = - percpu_counter_sum_positive(&mp->m_fdblocks); + mp->m_sb.sb_fdblocks = xfs_sum_freecounter(mp, XC_FREE_BLOCKS); } /* @@ -1275,9 +1335,10 @@ xfs_log_sb( * we handle nearly-lockless reservations, so we must use the _positive * variant here to avoid writing out nonsense frextents. */ - if (xfs_has_rtgroups(mp)) + if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) { mp->m_sb.sb_frextents = - percpu_counter_sum_positive(&mp->m_frextents); + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS); + } xfs_sb_to_disk(bp->b_addr, &mp->m_sb); xfs_trans_buf_set_type(tp, bp, XFS_BLFT_SB_BUF); @@ -1510,6 +1571,8 @@ xfs_fs_geometry( geo->flags |= XFS_FSOP_GEOM_FLAGS_EXCHANGE_RANGE; if (xfs_has_metadir(mp)) geo->flags |= XFS_FSOP_GEOM_FLAGS_METADIR; + if (xfs_has_zoned(mp)) + geo->flags |= XFS_FSOP_GEOM_FLAGS_ZONED; geo->rtsectsize = sbp->sb_blocksize; geo->dirblocksize = xfs_dir2_dirblock_bytes(sbp); @@ -1530,6 +1593,10 @@ xfs_fs_geometry( geo->rgcount = sbp->sb_rgcount; geo->rgextents = sbp->sb_rgextents; } + if (xfs_has_zoned(mp)) { + geo->rtstart = sbp->sb_rtstart; + geo->rtreserved = sbp->sb_rtreserved; + } } /* Read a secondary superblock. 
*/ diff --git a/fs/xfs/libxfs/xfs_trans_resv.c b/fs/xfs/libxfs/xfs_trans_resv.c index 13d00c7166e1..86a111d0f2fc 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.c +++ b/fs/xfs/libxfs/xfs_trans_resv.c @@ -22,6 +22,12 @@ #include "xfs_rtbitmap.h" #include "xfs_attr_item.h" #include "xfs_log.h" +#include "xfs_defer.h" +#include "xfs_bmap_item.h" +#include "xfs_extfree_item.h" +#include "xfs_rmap_item.h" +#include "xfs_refcount_item.h" +#include "xfs_trace.h" #define _ALLOC true #define _FREE false @@ -264,6 +270,42 @@ xfs_rtalloc_block_count( */ /* + * Finishing a data device refcount updates (t1): + * the agfs of the ags containing the blocks: nr_ops * sector size + * the refcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_cui_reservation( + struct xfs_mount *mp, + unsigned int nr_ops) +{ + if (!xfs_has_reflink(mp)) + return 0; + + return xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), + mp->m_sb.sb_blocksize); +} + +/* + * Realtime refcount updates (t2); + * the rt refcount inode + * the rtrefcount btrees: nr_ops * 1 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_rt_cui_reservation( + struct xfs_mount *mp, + unsigned int nr_ops) +{ + if (!xfs_has_rtreflink(mp)) + return 0; + + return xfs_calc_inode_res(mp, 1) + + xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops), + mp->m_sb.sb_blocksize); +} + +/* * Compute the log reservation required to handle the refcount update * transaction. Refcount updates are always done via deferred log items. * @@ -280,19 +322,10 @@ xfs_calc_refcountbt_reservation( struct xfs_mount *mp, unsigned int nr_ops) { - unsigned int blksz = XFS_FSB_TO_B(mp, 1); - unsigned int t1, t2 = 0; + unsigned int t1, t2; - if (!xfs_has_reflink(mp)) - return 0; - - t1 = xfs_calc_buf_res(nr_ops, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_refcountbt_block_count(mp, nr_ops), blksz); - - if (xfs_has_realtime(mp)) - t2 = xfs_calc_inode_res(mp, 1) + - xfs_calc_buf_res(xfs_rtrefcountbt_block_count(mp, nr_ops), - blksz); + t1 = xfs_calc_finish_cui_reservation(mp, nr_ops); + t2 = xfs_calc_finish_rt_cui_reservation(mp, nr_ops); return max(t1, t2); } @@ -380,6 +413,96 @@ xfs_calc_write_reservation_minlogsize( } /* + * Finishing an EFI can free the blocks and bmap blocks (t2): + * the agf for each of the ags: nr * sector size + * the agfl for each of the ags: nr * sector size + * the super block to reflect the freed blocks: sector size + * worst case split in allocation btrees per extent assuming nr extents: + * nr exts * 2 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_efi_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + return xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr), + mp->m_sb.sb_blocksize); +} + +/* + * Or, if it's a realtime file (t3): + * the agf for each of the ags: 2 * sector size + * the agfl for each of the ags: 2 * sector size + * the super block to reflect the freed blocks: sector size + * the realtime bitmap: + * 2 exts * ((XFS_BMBT_MAX_EXTLEN / rtextsize) / NBBY) bytes + * the realtime summary: 2 exts * 1 block + * worst case split in allocation btrees per extent assuming 2 extents: + * 2 exts * 2 trees * (2 * max depth - 1) * block size + */ +inline unsigned int +xfs_calc_finish_rt_efi_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + if (!xfs_has_realtime(mp)) + return 0; + + return 
xfs_calc_buf_res((2 * nr) + 1, mp->m_sb.sb_sectsize) + + xfs_calc_buf_res(xfs_rtalloc_block_count(mp, nr), + mp->m_sb.sb_blocksize) + + xfs_calc_buf_res(xfs_allocfree_block_count(mp, nr), + mp->m_sb.sb_blocksize); +} + +/* + * Finishing an RUI is the same as an EFI. We can split the rmap btree twice + * on each end of the record, and that can cause the AGFL to be refilled or + * emptied out. + */ +inline unsigned int +xfs_calc_finish_rui_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + if (!xfs_has_rmapbt(mp)) + return 0; + return xfs_calc_finish_efi_reservation(mp, nr); +} + +/* + * Finishing an RUI is the same as an EFI. We can split the rmap btree twice + * on each end of the record, and that can cause the AGFL to be refilled or + * emptied out. + */ +inline unsigned int +xfs_calc_finish_rt_rui_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + if (!xfs_has_rtrmapbt(mp)) + return 0; + return xfs_calc_finish_rt_efi_reservation(mp, nr); +} + +/* + * In finishing a BUI, we can modify: + * the inode being truncated: inode size + * dquots + * the inode's bmap btree: (max depth + 1) * block size + */ +inline unsigned int +xfs_calc_finish_bui_reservation( + struct xfs_mount *mp, + unsigned int nr) +{ + return xfs_calc_inode_res(mp, 1) + XFS_DQUOT_LOGRES + + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, + mp->m_sb.sb_blocksize); +} + +/* * In truncating a file we free up to two extents at once. We can modify (t1): * the inode being truncated: inode size * the inode's bmap btree: (max depth + 1) * block size @@ -411,16 +534,8 @@ xfs_calc_itruncate_reservation( t1 = xfs_calc_inode_res(mp, 1) + xfs_calc_buf_res(XFS_BM_MAXLEVELS(mp, XFS_DATA_FORK) + 1, blksz); - t2 = xfs_calc_buf_res(9, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 4), blksz); - - if (xfs_has_realtime(mp)) { - t3 = xfs_calc_buf_res(5, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_rtalloc_block_count(mp, 2), blksz) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), blksz); - } else { - t3 = 0; - } + t2 = xfs_calc_finish_efi_reservation(mp, 4); + t3 = xfs_calc_finish_rt_efi_reservation(mp, 2); /* * In the early days of reflink, we included enough reservation to log @@ -501,9 +616,7 @@ xfs_calc_rename_reservation( xfs_calc_buf_res(2 * XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); - t2 = xfs_calc_buf_res(7, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 3), - XFS_FSB_TO_B(mp, 1)); + t2 = xfs_calc_finish_efi_reservation(mp, 3); if (xfs_has_parent(mp)) { unsigned int rename_overhead, exchange_overhead; @@ -611,9 +724,7 @@ xfs_calc_link_reservation( overhead += xfs_calc_iunlink_remove_reservation(mp); t1 = xfs_calc_inode_res(mp, 2) + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); - t2 = xfs_calc_buf_res(3, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 1), - XFS_FSB_TO_B(mp, 1)); + t2 = xfs_calc_finish_efi_reservation(mp, 1); if (xfs_has_parent(mp)) { t3 = resp->tr_attrsetm.tr_logres; @@ -676,9 +787,7 @@ xfs_calc_remove_reservation( t1 = xfs_calc_inode_res(mp, 2) + xfs_calc_buf_res(XFS_DIROP_LOG_COUNT(mp), XFS_FSB_TO_B(mp, 1)); - t2 = xfs_calc_buf_res(4, mp->m_sb.sb_sectsize) + - xfs_calc_buf_res(xfs_allocfree_block_count(mp, 2), - XFS_FSB_TO_B(mp, 1)); + t2 = xfs_calc_finish_efi_reservation(mp, 2); if (xfs_has_parent(mp)) { t3 = resp->tr_attrrm.tr_logres; @@ -1181,6 +1290,15 @@ xfs_calc_namespace_reservations( resp->tr_mkdir.tr_logflags |= XFS_TRANS_PERM_LOG_RES; } +STATIC void 
+xfs_calc_default_atomic_ioend_reservation( + struct xfs_mount *mp, + struct xfs_trans_resv *resp) +{ + /* Pick a default that will scale reasonably for the log size. */ + resp->tr_atomic_ioend = resp->tr_itruncate; +} + void xfs_trans_resv_calc( struct xfs_mount *mp, @@ -1275,4 +1393,167 @@ xfs_trans_resv_calc( resp->tr_itruncate.tr_logcount += logcount_adj; resp->tr_write.tr_logcount += logcount_adj; resp->tr_qm_dqalloc.tr_logcount += logcount_adj; + + /* + * Now that we've finished computing the static reservations, we can + * compute the dynamic reservation for atomic writes. + */ + xfs_calc_default_atomic_ioend_reservation(mp, resp); +} + +/* + * Return the per-extent and fixed transaction reservation sizes needed to + * complete an atomic write. + */ +STATIC unsigned int +xfs_calc_atomic_write_ioend_geometry( + struct xfs_mount *mp, + unsigned int *step_size) +{ + const unsigned int efi = xfs_efi_log_space(1); + const unsigned int efd = xfs_efd_log_space(1); + const unsigned int rui = xfs_rui_log_space(1); + const unsigned int rud = xfs_rud_log_space(); + const unsigned int cui = xfs_cui_log_space(1); + const unsigned int cud = xfs_cud_log_space(); + const unsigned int bui = xfs_bui_log_space(1); + const unsigned int bud = xfs_bud_log_space(); + + /* + * Maximum overhead to complete an atomic write ioend in software: + * remove data fork extent + remove cow fork extent + map extent into + * data fork. + * + * tx0: Creates a BUI and a CUI and that's all it needs. + * + * tx1: Roll to finish the BUI. Need space for the BUD, an RUI, and + * enough space to relog the CUI (== CUI + CUD). + * + * tx2: Roll again to finish the RUI. Need space for the RUD and space + * to relog the CUI. + * + * tx3: Roll again, need space for the CUD and possibly a new EFI. + * + * tx4: Roll again, need space for an EFD. + * + * If the extent referenced by the pair of BUI/CUI items is not the one + * being currently processed, then we need to reserve space to relog + * both items. + */ + const unsigned int tx0 = bui + cui; + const unsigned int tx1 = bud + rui + cui + cud; + const unsigned int tx2 = rud + cui + cud; + const unsigned int tx3 = cud + efi; + const unsigned int tx4 = efd; + const unsigned int relog = bui + bud + cui + cud; + + const unsigned int per_intent = max(max3(tx0, tx1, tx2), + max3(tx3, tx4, relog)); + + /* Overhead to finish one step of each intent item type */ + const unsigned int f1 = xfs_calc_finish_efi_reservation(mp, 1); + const unsigned int f2 = xfs_calc_finish_rui_reservation(mp, 1); + const unsigned int f3 = xfs_calc_finish_cui_reservation(mp, 1); + const unsigned int f4 = xfs_calc_finish_bui_reservation(mp, 1); + + /* We only finish one item per transaction in a chain */ + *step_size = max(f4, max3(f1, f2, f3)); + + return per_intent; +} + +/* + * Compute the maximum size (in fsblocks) of atomic writes that we can complete + * given the existing log reservations. 
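+ * + * With purely illustrative sizes (the real values come from the xfs_*_log_space() helpers above), if per_intent were 336 bytes and step_size 15000 bytes, a tr_logres of 120000 bytes would allow + * + *	max_blocks = (120000 - 15000) / 336 = 312 fsblocks + * + * per untorn write, which is exactly the computation done below.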
+ */ +xfs_extlen_t +xfs_calc_max_atomic_write_fsblocks( + struct xfs_mount *mp) +{ + const struct xfs_trans_res *resv = &M_RES(mp)->tr_atomic_ioend; + unsigned int per_intent = 0; + unsigned int step_size = 0; + unsigned int ret = 0; + + if (resv->tr_logres > 0) { + per_intent = xfs_calc_atomic_write_ioend_geometry(mp, + &step_size); + + if (resv->tr_logres >= step_size) + ret = (resv->tr_logres - step_size) / per_intent; + } + + trace_xfs_calc_max_atomic_write_fsblocks(mp, per_intent, step_size, + resv->tr_logres, ret); + + return ret; +} + +/* + * Compute the log blocks and transaction reservation needed to complete an + * atomic write of a given number of blocks. Worst case, each block requires + * separate handling. A return value of 0 means something went wrong. + */ +xfs_extlen_t +xfs_calc_atomic_write_log_geometry( + struct xfs_mount *mp, + xfs_extlen_t blockcount, + unsigned int *new_logres) +{ + struct xfs_trans_res *curr_res = &M_RES(mp)->tr_atomic_ioend; + uint old_logres = curr_res->tr_logres; + unsigned int per_intent, step_size; + unsigned int logres; + xfs_extlen_t min_logblocks; + + ASSERT(blockcount > 0); + + xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp)); + + per_intent = xfs_calc_atomic_write_ioend_geometry(mp, &step_size); + + /* Check for overflows */ + if (check_mul_overflow(blockcount, per_intent, &logres) || + check_add_overflow(logres, step_size, &logres)) + return 0; + + curr_res->tr_logres = logres; + min_logblocks = xfs_log_calc_minimum_size(mp); + curr_res->tr_logres = old_logres; + + trace_xfs_calc_max_atomic_write_log_geometry(mp, per_intent, step_size, + blockcount, min_logblocks, logres); + + *new_logres = logres; + return min_logblocks; +} + +/* + * Compute the transaction reservation needed to complete an out of place + * atomic write of a given number of blocks. + */ +int +xfs_calc_atomic_write_reservation( + struct xfs_mount *mp, + xfs_extlen_t blockcount) +{ + unsigned int new_logres; + xfs_extlen_t min_logblocks; + + /* + * If the caller doesn't ask for a specific atomic write size, then + * use the defaults. 
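+ * + * Otherwise, reusing the illustrative numbers from above, a 16-block untorn write would ask for logres = 16 * 336 + 15000 = 20376 bytes, and the request is rejected with -EINVAL below if the log cannot fit the minimum size recomputed for that reservation.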
+ */ + if (blockcount == 0) { + xfs_calc_default_atomic_ioend_reservation(mp, M_RES(mp)); + return 0; + } + + min_logblocks = xfs_calc_atomic_write_log_geometry(mp, blockcount, + &new_logres); + if (!min_logblocks || min_logblocks > mp->m_sb.sb_logblocks) + return -EINVAL; + + M_RES(mp)->tr_atomic_ioend.tr_logres = new_logres; + return 0; } diff --git a/fs/xfs/libxfs/xfs_trans_resv.h b/fs/xfs/libxfs/xfs_trans_resv.h index 0554b9d775d2..336279e0fc61 100644 --- a/fs/xfs/libxfs/xfs_trans_resv.h +++ b/fs/xfs/libxfs/xfs_trans_resv.h @@ -48,6 +48,7 @@ struct xfs_trans_resv { struct xfs_trans_res tr_qm_dqalloc; /* allocate quota on disk */ struct xfs_trans_res tr_sb; /* modify superblock */ struct xfs_trans_res tr_fsyncts; /* update timestamps on fsync */ + struct xfs_trans_res tr_atomic_ioend; /* untorn write completion */ }; /* shorthand way of accessing reservation structure */ @@ -98,8 +99,32 @@ struct xfs_trans_resv { void xfs_trans_resv_calc(struct xfs_mount *mp, struct xfs_trans_resv *resp); uint xfs_allocfree_block_count(struct xfs_mount *mp, uint num_ops); +unsigned int xfs_calc_finish_bui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + +unsigned int xfs_calc_finish_efi_reservation(struct xfs_mount *mp, + unsigned int nr_ops); +unsigned int xfs_calc_finish_rt_efi_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + +unsigned int xfs_calc_finish_rui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); +unsigned int xfs_calc_finish_rt_rui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + +unsigned int xfs_calc_finish_cui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); +unsigned int xfs_calc_finish_rt_cui_reservation(struct xfs_mount *mp, + unsigned int nr_ops); + unsigned int xfs_calc_itruncate_reservation_minlogsize(struct xfs_mount *mp); unsigned int xfs_calc_write_reservation_minlogsize(struct xfs_mount *mp); unsigned int xfs_calc_qm_dqalloc_reservation_minlogsize(struct xfs_mount *mp); +xfs_extlen_t xfs_calc_max_atomic_write_fsblocks(struct xfs_mount *mp); +xfs_extlen_t xfs_calc_atomic_write_log_geometry(struct xfs_mount *mp, + xfs_extlen_t blockcount, unsigned int *new_logres); +int xfs_calc_atomic_write_reservation(struct xfs_mount *mp, + xfs_extlen_t blockcount); + #endif /* __XFS_TRANS_RESV_H__ */ diff --git a/fs/xfs/libxfs/xfs_types.h b/fs/xfs/libxfs/xfs_types.h index ca2401c1facd..f6f4f2d4b5db 100644 --- a/fs/xfs/libxfs/xfs_types.h +++ b/fs/xfs/libxfs/xfs_types.h @@ -233,6 +233,34 @@ enum xfs_group_type { { XG_TYPE_AG, "ag" }, \ { XG_TYPE_RTG, "rtg" } +enum xfs_free_counter { + /* + * Number of free blocks on the data device. + */ + XC_FREE_BLOCKS, + + /* + * Number of free RT extents on the RT device. + */ + XC_FREE_RTEXTENTS, + + /* + * Number of RT extents available for use. + * + * This counter only exists for zoned RT devices and indicates the number + * of RT extents that can be directly used by writes. XC_FREE_RTEXTENTS + * also includes blocks that have been written previously and freed, but + * sit in an rtgroup that still needs a zone reset. 
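+ * + * In other words, on zoned filesystems the invariant is 0 <= XC_FREE_RTAVAILABLE <= XC_FREE_RTEXTENTS: a freed block sitting in a not-yet-reset zone counts as free, but not yet as available.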
+ */ + XC_FREE_RTAVAILABLE, + XC_FREE_NR, +}; + +#define XFS_FREECOUNTER_STR \ + { XC_FREE_BLOCKS, "blocks" }, \ + { XC_FREE_RTEXTENTS, "rtextents" }, \ + { XC_FREE_RTAVAILABLE, "rtavailable" } + /* * Type verifier functions */ diff --git a/fs/xfs/libxfs/xfs_zones.c b/fs/xfs/libxfs/xfs_zones.c new file mode 100644 index 000000000000..b0791a71931c --- /dev/null +++ b/fs/xfs/libxfs/xfs_zones.c @@ -0,0 +1,186 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. + */ +#include "xfs.h" +#include "xfs_fs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_rtgroup.h" +#include "xfs_zones.h" + +static bool +xfs_zone_validate_empty( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (rtg_rmap(rtg)->i_used_blocks > 0) { + xfs_warn(mp, "empty zone %u has non-zero used counter (0x%x).", + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks); + return false; + } + + *write_pointer = 0; + return true; +} + +static bool +xfs_zone_validate_wp( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + xfs_rtblock_t wp_fsb = xfs_daddr_to_rtb(mp, zone->wp); + + if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) { + xfs_warn(mp, "zone %u has too large used counter (0x%x).", + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks); + return false; + } + + if (xfs_rtb_to_rgno(mp, wp_fsb) != rtg_rgno(rtg)) { + xfs_warn(mp, "zone %u write pointer (0x%llx) outside of zone.", + rtg_rgno(rtg), wp_fsb); + return false; + } + + *write_pointer = xfs_rtb_to_rgbno(mp, wp_fsb); + if (*write_pointer >= rtg->rtg_extents) { + xfs_warn(mp, "zone %u has invalid write pointer (0x%x).", + rtg_rgno(rtg), *write_pointer); + return false; + } + + return true; +} + +static bool +xfs_zone_validate_full( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + if (rtg_rmap(rtg)->i_used_blocks > rtg->rtg_extents) { + xfs_warn(mp, "zone %u has too large used counter (0x%x).", + rtg_rgno(rtg), rtg_rmap(rtg)->i_used_blocks); + return false; + } + + *write_pointer = rtg->rtg_extents; + return true; +} + +static bool +xfs_zone_validate_seq( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + switch (zone->cond) { + case BLK_ZONE_COND_EMPTY: + return xfs_zone_validate_empty(zone, rtg, write_pointer); + case BLK_ZONE_COND_IMP_OPEN: + case BLK_ZONE_COND_EXP_OPEN: + case BLK_ZONE_COND_CLOSED: + return xfs_zone_validate_wp(zone, rtg, write_pointer); + case BLK_ZONE_COND_FULL: + return xfs_zone_validate_full(zone, rtg, write_pointer); + case BLK_ZONE_COND_NOT_WP: + case BLK_ZONE_COND_OFFLINE: + case BLK_ZONE_COND_READONLY: + xfs_warn(mp, "zone %u has unsupported zone condition 0x%x.", + rtg_rgno(rtg), zone->cond); + return false; + default: + xfs_warn(mp, "zone %u has unknown zone condition 0x%x.", + rtg_rgno(rtg), zone->cond); + return false; + } +} + +static bool +xfs_zone_validate_conv( + struct blk_zone *zone, + struct xfs_rtgroup *rtg) +{ + struct xfs_mount *mp = rtg_mount(rtg); + + switch (zone->cond) { + case BLK_ZONE_COND_NOT_WP: + return true; + default: + xfs_warn(mp, +"conventional zone %u has unsupported zone 
condition 0x%x.", + rtg_rgno(rtg), zone->cond); + return false; + } +} + +bool +xfs_zone_validate( + struct blk_zone *zone, + struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_groups *g = &mp->m_groups[XG_TYPE_RTG]; + uint32_t expected_size; + + /* + * Check that the zone capacity matches the rtgroup size stored in the + * superblock. Note that all zones including the last one must have a + * uniform capacity. + */ + if (XFS_BB_TO_FSB(mp, zone->capacity) != g->blocks) { + xfs_warn(mp, +"zone %u capacity (0x%llx) does not match RT group size (0x%x).", + rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->capacity), + g->blocks); + return false; + } + + if (g->has_daddr_gaps) { + expected_size = 1 << g->blklog; + } else { + if (zone->len != zone->capacity) { + xfs_warn(mp, +"zone %u has capacity != size ((0x%llx vs 0x%llx)", + rtg_rgno(rtg), + XFS_BB_TO_FSB(mp, zone->len), + XFS_BB_TO_FSB(mp, zone->capacity)); + return false; + } + expected_size = g->blocks; + } + + if (XFS_BB_TO_FSB(mp, zone->len) != expected_size) { + xfs_warn(mp, +"zone %u length (0x%llx) does match geometry (0x%x).", + rtg_rgno(rtg), XFS_BB_TO_FSB(mp, zone->len), + expected_size); + } + + switch (zone->type) { + case BLK_ZONE_TYPE_CONVENTIONAL: + return xfs_zone_validate_conv(zone, rtg); + case BLK_ZONE_TYPE_SEQWRITE_REQ: + return xfs_zone_validate_seq(zone, rtg, write_pointer); + default: + xfs_warn(mp, "zoned %u has unsupported type 0x%x.", + rtg_rgno(rtg), zone->type); + return false; + } +} diff --git a/fs/xfs/libxfs/xfs_zones.h b/fs/xfs/libxfs/xfs_zones.h new file mode 100644 index 000000000000..c4f1367b2cca --- /dev/null +++ b/fs/xfs/libxfs/xfs_zones.h @@ -0,0 +1,35 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _LIBXFS_ZONES_H +#define _LIBXFS_ZONES_H + +struct xfs_rtgroup; + +/* + * In order to guarantee forward progress for GC we need to reserve at least + * two zones: one that will be used for moving data into and one spare zone + * making sure that we have enough space to relocate a nearly-full zone. + * To allow for slightly sloppy accounting for when we need to reserve the + * second zone, we actually reserve three as that is easier than doing fully + * accurate bookkeeping. + */ +#define XFS_GC_ZONES 3U + +/* + * In addition we need two zones for user writes, one open zone for writing + * and one to still have available blocks without resetting the open zone + * when data in the open zone has been freed. + */ +#define XFS_RESERVED_ZONES (XFS_GC_ZONES + 1) +#define XFS_MIN_ZONES (XFS_RESERVED_ZONES + 1) + +/* + * Always keep one zone out of the general open zone pool to allow for GC to + * happen while other writers are waiting for free space. 
+ */ +#define XFS_OPEN_GC_ZONES 1U +#define XFS_MIN_OPEN_ZONES (XFS_OPEN_GC_ZONES + 1U) + +bool xfs_zone_validate(struct blk_zone *zone, struct xfs_rtgroup *rtg, + xfs_rgblock_t *write_pointer); + +#endif /* _LIBXFS_ZONES_H */ diff --git a/fs/xfs/scrub/agheader.c b/fs/xfs/scrub/agheader.c index 9f8c312dfd3c..303374df44bd 100644 --- a/fs/xfs/scrub/agheader.c +++ b/fs/xfs/scrub/agheader.c @@ -69,6 +69,8 @@ STATIC size_t xchk_superblock_ondisk_size( struct xfs_mount *mp) { + if (xfs_has_zoned(mp)) + return offsetofend(struct xfs_dsb, sb_rtreserved); if (xfs_has_metadir(mp)) return offsetofend(struct xfs_dsb, sb_pad); if (xfs_has_metauuid(mp)) diff --git a/fs/xfs/scrub/bmap.c b/fs/xfs/scrub/bmap.c index 66da7d4d56ba..4f1e2574660d 100644 --- a/fs/xfs/scrub/bmap.c +++ b/fs/xfs/scrub/bmap.c @@ -1038,8 +1038,8 @@ xchk_bmap( switch (whichfork) { case XFS_COW_FORK: - /* No CoW forks on non-reflink filesystems. */ - if (!xfs_has_reflink(mp)) { + /* No CoW forks on filesystems that don't support out of place writes. */ + if (!xfs_has_reflink(mp) && !xfs_has_zoned(mp)) { xchk_ino_set_corrupt(sc, sc->ip->i_ino); return 0; } diff --git a/fs/xfs/scrub/fscounters.c b/fs/xfs/scrub/fscounters.c index ca23cf4db6c5..9b598c5790ad 100644 --- a/fs/xfs/scrub/fscounters.c +++ b/fs/xfs/scrub/fscounters.c @@ -123,7 +123,7 @@ xchk_fsfreeze( { int error; - error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); + error = freeze_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL, NULL); trace_xchk_fsfreeze(sc, error); return error; } @@ -135,7 +135,7 @@ xchk_fsthaw( int error; /* This should always succeed, we have a kernel freeze */ - error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL); + error = thaw_super(sc->mp->m_super, FREEZE_HOLDER_KERNEL, NULL); trace_xchk_fsthaw(sc, error); return error; } @@ -350,7 +350,7 @@ retry: * The global incore space reservation is taken from the incore * counters, so leave that out of the computation. */ - fsc->fdblocks -= mp->m_resblks_avail; + fsc->fdblocks -= mp->m_free[XC_FREE_BLOCKS].res_avail; /* * Delayed allocation reservations are taken out of the incore counters @@ -413,7 +413,13 @@ xchk_fscount_count_frextents( fsc->frextents = 0; fsc->frextents_delayed = 0; - if (!xfs_has_realtime(mp)) + + /* + * Don't bother verifying and repairing the fs counters for zoned file + * systems as they don't track an on-disk frextents count, and the + * in-memory percpu counter also includes reservations. + */ + if (!xfs_has_realtime(mp) || xfs_has_zoned(mp)) return 0; while ((rtg = xfs_rtgroup_next(mp, rtg))) { @@ -513,8 +519,8 @@ xchk_fscounters( /* Snapshot the percpu counters. */ icount = percpu_counter_sum(&mp->m_icount); ifree = percpu_counter_sum(&mp->m_ifree); - fdblocks = percpu_counter_sum(&mp->m_fdblocks); - frextents = percpu_counter_sum(&mp->m_frextents); + fdblocks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS); + frextents = xfs_sum_freecounter_raw(mp, XC_FREE_RTEXTENTS); /* No negative values, please! 
*/ if (icount < 0 || ifree < 0) @@ -589,15 +595,17 @@ xchk_fscounters( try_again = true; } - if (!xchk_fscount_within_range(sc, fdblocks, &mp->m_fdblocks, - fsc->fdblocks)) { + if (!xchk_fscount_within_range(sc, fdblocks, + &mp->m_free[XC_FREE_BLOCKS].count, fsc->fdblocks)) { if (fsc->frozen) xchk_set_corrupt(sc); else try_again = true; } - if (!xchk_fscount_within_range(sc, frextents, &mp->m_frextents, + if (!xfs_has_zoned(mp) && + !xchk_fscount_within_range(sc, frextents, + &mp->m_free[XC_FREE_RTEXTENTS].count, fsc->frextents - fsc->frextents_delayed)) { if (fsc->frozen) xchk_set_corrupt(sc); diff --git a/fs/xfs/scrub/fscounters_repair.c b/fs/xfs/scrub/fscounters_repair.c index cda13447a373..f0d2b04644e4 100644 --- a/fs/xfs/scrub/fscounters_repair.c +++ b/fs/xfs/scrub/fscounters_repair.c @@ -64,7 +64,7 @@ xrep_fscounters( percpu_counter_set(&mp->m_icount, fsc->icount); percpu_counter_set(&mp->m_ifree, fsc->ifree); - percpu_counter_set(&mp->m_fdblocks, fsc->fdblocks); + xfs_set_freecounter(mp, XC_FREE_BLOCKS, fsc->fdblocks); /* * Online repair is only supported on v5 file systems, which require @@ -74,10 +74,12 @@ xrep_fscounters( * track of the delalloc reservations separately, as they are * subtracted from m_frextents, but not included in sb_frextents. */ - percpu_counter_set(&mp->m_frextents, - fsc->frextents - fsc->frextents_delayed); - if (!xfs_has_rtgroups(mp)) - mp->m_sb.sb_frextents = fsc->frextents; + if (!xfs_has_zoned(mp)) { + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, + fsc->frextents - fsc->frextents_delayed); + if (!xfs_has_rtgroups(mp)) + mp->m_sb.sb_frextents = fsc->frextents; + } return 0; } diff --git a/fs/xfs/scrub/inode.c b/fs/xfs/scrub/inode.c index db6edd5a5fe5..bb3f475b6353 100644 --- a/fs/xfs/scrub/inode.c +++ b/fs/xfs/scrub/inode.c @@ -273,6 +273,13 @@ xchk_inode_cowextsize( xfs_failaddr_t fa; uint32_t value = be32_to_cpu(dip->di_cowextsize); + /* + * The used block counter for rtrmap is checked and repaired elsewhere. + */ + if (xfs_has_zoned(sc->mp) && + dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP)) + return; + fa = xfs_inode_validate_cowextsize(sc->mp, value, mode, flags, flags2); if (fa) xchk_ino_set_corrupt(sc, ino); diff --git a/fs/xfs/scrub/inode_repair.c b/fs/xfs/scrub/inode_repair.c index 13ff1c933cb8..a90a011c7e5f 100644 --- a/fs/xfs/scrub/inode_repair.c +++ b/fs/xfs/scrub/inode_repair.c @@ -710,7 +710,9 @@ xrep_dinode_extsize_hints( XFS_DIFLAG_EXTSZINHERIT); } - if (dip->di_version < 3) + if (dip->di_version < 3 || + (xfs_has_zoned(sc->mp) && + dip->di_metatype == cpu_to_be16(XFS_METAFILE_RTRMAP))) return; fa = xfs_inode_validate_cowextsize(mp, be32_to_cpu(dip->di_cowextsize), @@ -1558,8 +1560,7 @@ xrep_dinode_core( /* Read the inode cluster buffer. 
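(The XBF_UNMAPPED flag disappears from this call: with the buffer cache rework later in this series every buffer stays mapped, so a plain 0 flags argument reads the cluster into directly addressable memory.) 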
*/ error = xfs_trans_read_buf(sc->mp, sc->tp, sc->mp->m_ddev_targp, - ri->imap.im_blkno, ri->imap.im_len, XBF_UNMAPPED, &bp, - NULL); + ri->imap.im_blkno, ri->imap.im_len, 0, &bp, NULL); if (error) return error; diff --git a/fs/xfs/scrub/newbt.c b/fs/xfs/scrub/newbt.c index ac38f5843090..1588ce971cb8 100644 --- a/fs/xfs/scrub/newbt.c +++ b/fs/xfs/scrub/newbt.c @@ -62,7 +62,7 @@ xrep_newbt_estimate_slack( free = sc->sa.pag->pagf_freeblks; sz = xfs_ag_block_count(sc->mp, pag_agno(sc->sa.pag)); } else { - free = percpu_counter_sum(&sc->mp->m_fdblocks); + free = xfs_sum_freecounter_raw(sc->mp, XC_FREE_BLOCKS); sz = sc->mp->m_sb.sb_dblocks; } diff --git a/fs/xfs/scrub/orphanage.c b/fs/xfs/scrub/orphanage.c index c287c755f2c5..9c12cb844231 100644 --- a/fs/xfs/scrub/orphanage.c +++ b/fs/xfs/scrub/orphanage.c @@ -153,8 +153,7 @@ xrep_orphanage_create( /* Try to find the orphanage directory. */ inode_lock_nested(root_inode, I_MUTEX_PARENT); - orphanage_dentry = lookup_one_len(ORPHANAGE, root_dentry, - strlen(ORPHANAGE)); + orphanage_dentry = lookup_noperm(&QSTR(ORPHANAGE), root_dentry); if (IS_ERR(orphanage_dentry)) { error = PTR_ERR(orphanage_dentry); goto out_unlock_root; @@ -167,10 +166,11 @@ xrep_orphanage_create( * directory to control access to a file we put in here. */ if (d_really_is_negative(orphanage_dentry)) { - error = vfs_mkdir(&nop_mnt_idmap, root_inode, orphanage_dentry, - 0750); - if (error) - goto out_dput_orphanage; + orphanage_dentry = vfs_mkdir(&nop_mnt_idmap, root_inode, + orphanage_dentry, 0750); + error = PTR_ERR(orphanage_dentry); + if (IS_ERR(orphanage_dentry)) + goto out_unlock_root; } /* Not a directory? Bail out. */ @@ -444,7 +444,7 @@ xrep_adoption_check_dcache( if (!d_orphanage) return 0; - d_child = d_hash_and_lookup(d_orphanage, &qname); + d_child = try_lookup_noperm(&qname, d_orphanage); if (d_child) { trace_xrep_adoption_check_child(sc->mp, d_child); @@ -481,7 +481,7 @@ xrep_adoption_zap_dcache( if (!d_orphanage) return; - d_child = d_hash_and_lookup(d_orphanage, &qname); + d_child = try_lookup_noperm(&qname, d_orphanage); while (d_child != NULL) { trace_xrep_adoption_invalidate_child(sc->mp, d_child); diff --git a/fs/xfs/scrub/reap.c b/fs/xfs/scrub/reap.c index b32fb233cf84..8703897c0a9c 100644 --- a/fs/xfs/scrub/reap.c +++ b/fs/xfs/scrub/reap.c @@ -935,10 +935,13 @@ xrep_reap_metadir_fsblocks( if (error) return error; - if (xreap_dirty(&rs)) - return xrep_defer_finish(sc); + if (xreap_dirty(&rs)) { + error = xrep_defer_finish(sc); + if (error) + return error; + } - return 0; + return xrep_reset_metafile_resv(sc); } /* diff --git a/fs/xfs/scrub/repair.c b/fs/xfs/scrub/repair.c index 3b5288d3ef4e..f8f9ed30f56b 100644 --- a/fs/xfs/scrub/repair.c +++ b/fs/xfs/scrub/repair.c @@ -43,6 +43,7 @@ #include "xfs_rtalloc.h" #include "xfs_metafile.h" #include "xfs_rtrefcount_btree.h" +#include "xfs_zone_alloc.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/trace.h" @@ -1050,7 +1051,13 @@ xrep_require_rtext_inuse( xfs_rtxnum_t startrtx; xfs_rtxnum_t endrtx; bool is_free = false; - int error; + int error = 0; + + if (xfs_has_zoned(mp)) { + if (!xfs_zone_rgbno_is_valid(sc->sr.rtg, rgbno + len - 1)) + return -EFSCORRUPTED; + return 0; + } startrtx = xfs_rgbno_to_rtx(mp, rgbno); endrtx = xfs_rgbno_to_rtx(mp, rgbno + len - 1); @@ -1386,11 +1393,12 @@ int xrep_reset_metafile_resv( struct xfs_scrub *sc) { - struct xfs_inode *ip = sc->ip; + struct xfs_mount *mp = sc->mp; int64_t delta; int error; - delta = ip->i_nblocks + ip->i_delayed_blks - 
ip->i_meta_resv_asked; + delta = mp->m_metafile_resv_used + mp->m_metafile_resv_avail - + mp->m_metafile_resv_target; if (delta == 0) return 0; @@ -1401,11 +1409,11 @@ xrep_reset_metafile_resv( if (delta > 0) { int64_t give_back; - give_back = min_t(uint64_t, delta, ip->i_delayed_blks); + give_back = min_t(uint64_t, delta, mp->m_metafile_resv_avail); if (give_back > 0) { - xfs_mod_delalloc(ip, 0, -give_back); - xfs_add_fdblocks(ip->i_mount, give_back); - ip->i_delayed_blks -= give_back; + xfs_mod_sb_delalloc(mp, -give_back); + xfs_add_fdblocks(mp, give_back); + mp->m_metafile_resv_avail -= give_back; } return 0; @@ -1413,24 +1421,23 @@ xrep_reset_metafile_resv( /* * Not enough reservation; try to take some blocks from the filesystem - * to the metadata inode. @delta is negative here, so invert the sign. + * to the metabtree reservation. */ - delta = -delta; - error = xfs_dec_fdblocks(sc->mp, delta, true); + delta = -delta; /* delta is negative here, so invert the sign. */ + error = xfs_dec_fdblocks(mp, delta, true); while (error == -ENOSPC) { delta--; if (delta == 0) { xfs_warn(sc->mp, -"Insufficient free space to reset space reservation for inode 0x%llx after repair.", - ip->i_ino); +"Insufficient free space to reset metabtree reservation after repair."); return 0; } - error = xfs_dec_fdblocks(sc->mp, delta, true); + error = xfs_dec_fdblocks(mp, delta, true); } if (error) return error; - xfs_mod_delalloc(ip, 0, delta); - ip->i_delayed_blks += delta; + xfs_mod_sb_delalloc(mp, delta); + mp->m_metafile_resv_avail += delta; return 0; } diff --git a/fs/xfs/scrub/rtbitmap.c b/fs/xfs/scrub/rtbitmap.c index e8c776a34c1d..d5ff8609dbfb 100644 --- a/fs/xfs/scrub/rtbitmap.c +++ b/fs/xfs/scrub/rtbitmap.c @@ -21,6 +21,7 @@ #include "xfs_rmap.h" #include "xfs_rtrmap_btree.h" #include "xfs_exchmaps.h" +#include "xfs_zone_alloc.h" #include "scrub/scrub.h" #include "scrub/common.h" #include "scrub/repair.h" @@ -272,7 +273,6 @@ xchk_xref_is_used_rt_space( xfs_extlen_t len) { struct xfs_rtgroup *rtg = sc->sr.rtg; - struct xfs_inode *rbmip = rtg_bitmap(rtg); xfs_rtxnum_t startext; xfs_rtxnum_t endext; bool is_free; @@ -281,6 +281,13 @@ xchk_xref_is_used_rt_space( if (xchk_skip_xref(sc->sm)) return; + if (xfs_has_zoned(sc->mp)) { + if (!xfs_zone_rgbno_is_valid(rtg, + xfs_rtb_to_rgbno(sc->mp, rtbno) + len - 1)) + xchk_ino_xref_set_corrupt(sc, rtg_rmap(rtg)->i_ino); + return; + } + startext = xfs_rtb_to_rtx(sc->mp, rtbno); endext = xfs_rtb_to_rtx(sc->mp, rtbno + len - 1); error = xfs_rtalloc_extent_is_free(rtg, sc->tp, startext, @@ -288,5 +295,5 @@ xchk_xref_is_used_rt_space( if (!xchk_should_check_xref(sc, &error, NULL)) return; if (is_free) - xchk_ino_xref_set_corrupt(sc, rbmip->i_ino); + xchk_ino_xref_set_corrupt(sc, rtg_bitmap(rtg)->i_ino); } diff --git a/fs/xfs/scrub/rtrefcount_repair.c b/fs/xfs/scrub/rtrefcount_repair.c index 257cfb24beb4..983362447826 100644 --- a/fs/xfs/scrub/rtrefcount_repair.c +++ b/fs/xfs/scrub/rtrefcount_repair.c @@ -697,32 +697,6 @@ err_cur: return error; } -/* - * Now that we've logged the roots of the new btrees, invalidate all of the - * old blocks and free them. - */ -STATIC int -xrep_rtrefc_remove_old_tree( - struct xrep_rtrefc *rr) -{ - int error; - - /* - * Free all the extents that were allocated to the former rtrefcountbt - * and aren't cross-linked with something else. 
- */ - error = xrep_reap_metadir_fsblocks(rr->sc, - &rr->old_rtrefcountbt_blocks); - if (error) - return error; - - /* - * Ensure the proper reservation for the rtrefcount inode so that we - * don't fail to expand the btree. - */ - return xrep_reset_metafile_resv(rr->sc); -} - /* Rebuild the rt refcount btree. */ int xrep_rtrefcountbt( @@ -769,8 +743,12 @@ xrep_rtrefcountbt( if (error) goto out_bitmap; - /* Kill the old tree. */ - error = xrep_rtrefc_remove_old_tree(rr); + /* + * Free all the extents that were allocated to the former rtrefcountbt + * and aren't cross-linked with something else. + */ + error = xrep_reap_metadir_fsblocks(rr->sc, + &rr->old_rtrefcountbt_blocks); if (error) goto out_bitmap; diff --git a/fs/xfs/scrub/rtrmap_repair.c b/fs/xfs/scrub/rtrmap_repair.c index f2fdd7a9fc24..fc2592c53af5 100644 --- a/fs/xfs/scrub/rtrmap_repair.c +++ b/fs/xfs/scrub/rtrmap_repair.c @@ -810,28 +810,6 @@ err_cur: /* Reaping the old btree. */ -/* Reap the old rtrmapbt blocks. */ -STATIC int -xrep_rtrmap_remove_old_tree( - struct xrep_rtrmap *rr) -{ - int error; - - /* - * Free all the extents that were allocated to the former rtrmapbt and - * aren't cross-linked with something else. - */ - error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks); - if (error) - return error; - - /* - * Ensure the proper reservation for the rtrmap inode so that we don't - * fail to expand the new btree. - */ - return xrep_reset_metafile_resv(rr->sc); -} - static inline bool xrep_rtrmapbt_want_live_update( struct xchk_iscan *iscan, @@ -995,8 +973,11 @@ xrep_rtrmapbt( if (error) goto out_records; - /* Kill the old tree. */ - error = xrep_rtrmap_remove_old_tree(rr); + /* + * Free all the extents that were allocated to the former rtrmapbt and + * aren't cross-linked with something else. + */ + error = xrep_reap_metadir_fsblocks(rr->sc, &rr->old_rtrmapbt_blocks); if (error) goto out_records; diff --git a/fs/xfs/scrub/scrub.c b/fs/xfs/scrub/scrub.c index 6fa9e3e5bab7..76e24032e99a 100644 --- a/fs/xfs/scrub/scrub.c +++ b/fs/xfs/scrub/scrub.c @@ -399,12 +399,14 @@ static const struct xchk_meta_ops meta_scrub_ops[] = { }, [XFS_SCRUB_TYPE_RTBITMAP] = { /* realtime bitmap */ .type = ST_RTGROUP, + .has = xfs_has_nonzoned, .setup = xchk_setup_rtbitmap, .scrub = xchk_rtbitmap, .repair = xrep_rtbitmap, }, [XFS_SCRUB_TYPE_RTSUM] = { /* realtime summary */ .type = ST_RTGROUP, + .has = xfs_has_nonzoned, .setup = xchk_setup_rtsummary, .scrub = xchk_rtsummary, .repair = xrep_rtsummary, @@ -678,8 +680,6 @@ xfs_scrub_metadata( if (error) goto out; - xfs_warn_experimental(mp, XFS_EXPERIMENTAL_SCRUB); - sc = kzalloc(sizeof(struct xfs_scrub), XCHK_GFP_FLAGS); if (!sc) { error = -ENOMEM; diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 6d9965b546cb..26a04a783489 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -1,7 +1,7 @@ // SPDX-License-Identifier: GPL-2.0 /* * Copyright (c) 2000-2005 Silicon Graphics, Inc. - * Copyright (c) 2016-2018 Christoph Hellwig. + * Copyright (c) 2016-2025 Christoph Hellwig. * All Rights Reserved. */ #include "xfs.h" @@ -20,6 +20,8 @@ #include "xfs_errortag.h" #include "xfs_error.h" #include "xfs_icache.h" +#include "xfs_zone_alloc.h" +#include "xfs_rtgroup.h" struct xfs_writepage_ctx { struct iomap_writepage_ctx ctx; @@ -77,6 +79,26 @@ xfs_setfilesize( return xfs_trans_commit(tp); } +static void +xfs_ioend_put_open_zones( + struct iomap_ioend *ioend) +{ + struct iomap_ioend *tmp; + + /* + * Put the open zone for all ioends merged into this one (if any). 
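+ * Each of them took its open zone reference in xfs_zone_alloc_and_submit at submission time, so one reference is dropped per merged ioend in the loop below.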
+ */ + list_for_each_entry(tmp, &ioend->io_list, io_list) + xfs_open_zone_put(tmp->io_private); + + /* + * The main ioend might not have an open zone if the submission failed + * before xfs_zone_alloc_and_submit got called. + */ + if (ioend->io_private) + xfs_open_zone_put(ioend->io_private); +} + /* * IO write completion. */ @@ -86,6 +108,7 @@ xfs_end_ioend( { struct xfs_inode *ip = XFS_I(ioend->io_inode); struct xfs_mount *mp = ip->i_mount; + bool is_zoned = xfs_is_zoned_inode(ip); xfs_off_t offset = ioend->io_offset; size_t size = ioend->io_size; unsigned int nofs_flag; @@ -115,10 +138,11 @@ xfs_end_ioend( */ error = blk_status_to_errno(ioend->io_bio.bi_status); if (unlikely(error)) { - if (ioend->io_flags & IOMAP_F_SHARED) { + if (ioend->io_flags & IOMAP_IOEND_SHARED) { + ASSERT(!is_zoned); xfs_reflink_cancel_cow_range(ip, offset, size, true); xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, offset, - offset + size); + offset + size, NULL); } goto done; } @@ -126,14 +150,21 @@ xfs_end_ioend( /* * Success: commit the COW or unwritten blocks if needed. */ - if (ioend->io_flags & IOMAP_F_SHARED) + if (is_zoned) + error = xfs_zoned_end_io(ip, offset, size, ioend->io_sector, + ioend->io_private, NULLFSBLOCK); + else if (ioend->io_flags & IOMAP_IOEND_SHARED) error = xfs_reflink_end_cow(ip, offset, size); - else if (ioend->io_type == IOMAP_UNWRITTEN) + else if (ioend->io_flags & IOMAP_IOEND_UNWRITTEN) error = xfs_iomap_write_unwritten(ip, offset, size, false); - if (!error && xfs_ioend_is_append(ioend)) + if (!error && + !(ioend->io_flags & IOMAP_IOEND_DIRECT) && + xfs_ioend_is_append(ioend)) error = xfs_setfilesize(ip, offset, size); done: + if (is_zoned) + xfs_ioend_put_open_zones(ioend); iomap_finish_ioends(ioend, error); memalloc_nofs_restore(nofs_flag); } @@ -176,17 +207,27 @@ xfs_end_io( } } -STATIC void +void xfs_end_bio( struct bio *bio) { struct iomap_ioend *ioend = iomap_ioend_from_bio(bio); struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_mount *mp = ip->i_mount; unsigned long flags; + /* + * For appends, record the actually written block number and set the + * boundary flag if needed. 
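+ * + * (For zone append bios the block layer reports the sector the data actually landed at through bi_iter.bi_sector at completion time, which is what is captured here.)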
+ */ + if (IS_ENABLED(CONFIG_XFS_RT) && bio_is_zone_append(bio)) { + ioend->io_sector = bio->bi_iter.bi_sector; + xfs_mark_rtg_boundary(ioend); + } + spin_lock_irqsave(&ip->i_ioend_lock, flags); if (list_empty(&ip->i_ioend_list)) - WARN_ON_ONCE(!queue_work(ip->i_mount->m_unwritten_workqueue, + WARN_ON_ONCE(!queue_work(mp->m_unwritten_workqueue, &ip->i_ioend_work)); list_add_tail(&ioend->io_list, &ip->i_ioend_list); spin_unlock_irqrestore(&ip->i_ioend_lock, flags); @@ -396,10 +437,11 @@ allocate_blocks: } static int -xfs_prepare_ioend( - struct iomap_ioend *ioend, +xfs_submit_ioend( + struct iomap_writepage_ctx *wpc, int status) { + struct iomap_ioend *ioend = wpc->ioend; unsigned int nofs_flag; /* @@ -410,7 +452,7 @@ xfs_prepare_ioend( nofs_flag = memalloc_nofs_save(); /* Convert CoW extents to regular */ - if (!status && (ioend->io_flags & IOMAP_F_SHARED)) { + if (!status && (ioend->io_flags & IOMAP_IOEND_SHARED)) { status = xfs_reflink_convert_cow(XFS_I(ioend->io_inode), ioend->io_offset, ioend->io_size); } @@ -418,10 +460,14 @@ xfs_prepare_ioend( memalloc_nofs_restore(nofs_flag); /* send ioends that might require a transaction to the completion wq */ - if (xfs_ioend_is_append(ioend) || ioend->io_type == IOMAP_UNWRITTEN || - (ioend->io_flags & IOMAP_F_SHARED)) + if (xfs_ioend_is_append(ioend) || + (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))) ioend->io_bio.bi_end_io = xfs_end_bio; - return status; + + if (status) + return status; + submit_bio(&ioend->io_bio); + return 0; } /* @@ -458,12 +504,107 @@ xfs_discard_folio( * folio itself and not the start offset that is passed in. */ xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, pos, - folio_pos(folio) + folio_size(folio)); + folio_pos(folio) + folio_size(folio), NULL); } static const struct iomap_writeback_ops xfs_writeback_ops = { .map_blocks = xfs_map_blocks, - .prepare_ioend = xfs_prepare_ioend, + .submit_ioend = xfs_submit_ioend, .discard_folio = xfs_discard_folio, }; + +struct xfs_zoned_writepage_ctx { + struct iomap_writepage_ctx ctx; + struct xfs_open_zone *open_zone; +}; + +static inline struct xfs_zoned_writepage_ctx * +XFS_ZWPC(struct iomap_writepage_ctx *ctx) +{ + return container_of(ctx, struct xfs_zoned_writepage_ctx, ctx); +} + +static int +xfs_zoned_map_blocks( + struct iomap_writepage_ctx *wpc, + struct inode *inode, + loff_t offset, + unsigned int len) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + len); + xfs_filblks_t count_fsb; + struct xfs_bmbt_irec imap, del; + struct xfs_iext_cursor icur; + + if (xfs_is_shutdown(mp)) + return -EIO; + + XFS_ERRORTAG_DELAY(mp, XFS_ERRTAG_WB_DELAY_MS); + + /* + * All dirty data must be covered by delalloc extents. But truncate can + * remove delalloc extents underneath us or reduce their size. + * Returning a hole tells iomap to not write back any data from this + * range, which is the right thing to do in that case. + * + * Otherwise just tell iomap to treat ranges previously covered by a + * delalloc extent as mapped. The actual block allocation will be done + * just before submitting the bio. + * + * This implies we never map outside folios that are locked or marked + * as under writeback, and thus there is no need to check the fork + * sequence count here. 
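+ * + * Concretely: if offset_fsb is 10 and the first extent in the COW fork starts at block 14, the lookup below reports a four-block hole at offset 10, iomap skips that range, and only the blocks from 14 onwards are mapped and bound to a zone at submit time.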
+ */ + xfs_ilock(ip, XFS_ILOCK_EXCL); + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &imap)) + imap.br_startoff = end_fsb; /* fake a hole past EOF */ + if (imap.br_startoff > offset_fsb) { + imap.br_blockcount = imap.br_startoff - offset_fsb; + imap.br_startoff = offset_fsb; + imap.br_startblock = HOLESTARTBLOCK; + imap.br_state = XFS_EXT_NORM; + xfs_iunlock(ip, XFS_ILOCK_EXCL); + xfs_bmbt_to_iomap(ip, &wpc->iomap, &imap, 0, 0, 0); + return 0; + } + end_fsb = min(end_fsb, imap.br_startoff + imap.br_blockcount); + count_fsb = end_fsb - offset_fsb; + + del = imap; + xfs_trim_extent(&del, offset_fsb, count_fsb); + xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &imap, &del, + XFS_BMAPI_REMAP); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + wpc->iomap.type = IOMAP_MAPPED; + wpc->iomap.flags = IOMAP_F_DIRTY | IOMAP_F_ANON_WRITE; + wpc->iomap.bdev = mp->m_rtdev_targp->bt_bdev; + wpc->iomap.offset = offset; + wpc->iomap.length = XFS_FSB_TO_B(mp, count_fsb); + + trace_xfs_zoned_map_blocks(ip, offset, wpc->iomap.length); + return 0; +} + +static int +xfs_zoned_submit_ioend( + struct iomap_writepage_ctx *wpc, + int status) +{ + wpc->ioend->io_bio.bi_end_io = xfs_end_bio; + if (status) + return status; + xfs_zone_alloc_and_submit(wpc->ioend, &XFS_ZWPC(wpc)->open_zone); + return 0; +} + +static const struct iomap_writeback_ops xfs_zoned_writeback_ops = { + .map_blocks = xfs_zoned_map_blocks, + .submit_ioend = xfs_zoned_submit_ioend, .discard_folio = xfs_discard_folio, }; @@ -472,10 +613,25 @@ xfs_vm_writepages( struct address_space *mapping, struct writeback_control *wbc) { - struct xfs_writepage_ctx wpc = { }; + struct xfs_inode *ip = XFS_I(mapping->host); + + xfs_iflags_clear(ip, XFS_ITRUNCATED); - xfs_iflags_clear(XFS_I(mapping->host), XFS_ITRUNCATED); - return iomap_writepages(mapping, wbc, &wpc.ctx, &xfs_writeback_ops); + if (xfs_is_zoned_inode(ip)) { + struct xfs_zoned_writepage_ctx xc = { }; + int error; + + error = iomap_writepages(mapping, wbc, &xc.ctx, + &xfs_zoned_writeback_ops); + if (xc.open_zone) + xfs_open_zone_put(xc.open_zone); + return error; + } else { + struct xfs_writepage_ctx wpc = { }; + + return iomap_writepages(mapping, wbc, &wpc.ctx, + &xfs_writeback_ops); + } } STATIC int diff --git a/fs/xfs/xfs_aops.h b/fs/xfs/xfs_aops.h index e0bd68419764..5a7a0f1a0b49 100644 --- a/fs/xfs/xfs_aops.h +++ b/fs/xfs/xfs_aops.h @@ -9,6 +9,7 @@ extern const struct address_space_operations xfs_address_space_operations; extern const struct address_space_operations xfs_dax_aops; -int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); +int xfs_setfilesize(struct xfs_inode *ip, xfs_off_t offset, size_t size); +void xfs_end_bio(struct bio *bio); #endif /* __XFS_AOPS_H__ */ diff --git a/fs/xfs/xfs_bio_io.c b/fs/xfs/xfs_bio_io.c index fe21c76f75b8..2a736d10eafb 100644 --- a/fs/xfs/xfs_bio_io.c +++ b/fs/xfs/xfs_bio_io.c @@ -18,42 +18,36 @@ xfs_rw_bdev( enum req_op op) { - unsigned int is_vmalloc = is_vmalloc_addr(data); - unsigned int left = count; + unsigned int done = 0, added; int error; struct bio *bio; - if (is_vmalloc && op == REQ_OP_WRITE) - flush_kernel_vmap_range(data, count); + op |= REQ_META | REQ_SYNC; + if (!is_vmalloc_addr(data)) + return bdev_rw_virt(bdev, sector, data, count, op); - bio = bio_alloc(bdev, bio_max_vecs(left), op | REQ_META | REQ_SYNC, - GFP_KERNEL); + bio = bio_alloc(bdev, bio_max_vecs(count), op, GFP_KERNEL); bio->bi_iter.bi_sector = sector; do { - struct page *page = kmem_to_page(data); - unsigned int off = 
offset_in_page(data); - unsigned int len = min_t(unsigned, left, PAGE_SIZE - off); - - while (bio_add_page(bio, page, len, off) != len) { + added = bio_add_vmalloc_chunk(bio, data + done, count - done); + if (!added) { struct bio *prev = bio; - bio = bio_alloc(prev->bi_bdev, bio_max_vecs(left), + bio = bio_alloc(prev->bi_bdev, + bio_max_vecs(count - done), prev->bi_opf, GFP_KERNEL); bio->bi_iter.bi_sector = bio_end_sector(prev); bio_chain(prev, bio); - submit_bio(prev); } - - data += len; - left -= len; - } while (left > 0); + done += added; + } while (done < count); error = submit_bio_wait(bio); bio_put(bio); - if (is_vmalloc && op == REQ_OP_READ) + if (op == REQ_OP_READ) invalidate_kernel_vmap_range(data, count); return error; } diff --git a/fs/xfs/xfs_bmap_item.c b/fs/xfs/xfs_bmap_item.c index 3d52e9d7ad57..646c515ee355 100644 --- a/fs/xfs/xfs_bmap_item.c +++ b/fs/xfs/xfs_bmap_item.c @@ -77,6 +77,11 @@ xfs_bui_item_size( *nbytes += xfs_bui_log_format_sizeof(buip->bui_format.bui_nextents); } +unsigned int xfs_bui_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_bui_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given bui log item. We use only 1 iovec, and we point that @@ -168,6 +173,11 @@ xfs_bud_item_size( *nbytes += sizeof(struct xfs_bud_log_format); } +unsigned int xfs_bud_log_space(void) +{ + return xlog_item_space(1, sizeof(struct xfs_bud_log_format)); +} + /* * This is called to fill in the vector of log iovecs for the * given bud log item. We use only 1 iovec, and we point that diff --git a/fs/xfs/xfs_bmap_item.h b/fs/xfs/xfs_bmap_item.h index 6fee6a508343..b42fee06899d 100644 --- a/fs/xfs/xfs_bmap_item.h +++ b/fs/xfs/xfs_bmap_item.h @@ -72,4 +72,7 @@ struct xfs_bmap_intent; void xfs_bmap_defer_add(struct xfs_trans *tp, struct xfs_bmap_intent *bi); +unsigned int xfs_bui_log_space(unsigned int nr); +unsigned int xfs_bud_log_space(void); + #endif /* __XFS_BMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_bmap_util.c b/fs/xfs/xfs_bmap_util.c index 0836fea2d6d8..06ca11731e43 100644 --- a/fs/xfs/xfs_bmap_util.c +++ b/fs/xfs/xfs_bmap_util.c @@ -30,6 +30,7 @@ #include "xfs_reflink.h" #include "xfs_rtbitmap.h" #include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" /* Kernel only BMAP related definitions and functions */ @@ -436,7 +437,8 @@ xfs_bmap_punch_delalloc_range( struct xfs_inode *ip, int whichfork, xfs_off_t start_byte, - xfs_off_t end_byte) + xfs_off_t end_byte, + struct xfs_zone_alloc_ctx *ac) { struct xfs_mount *mp = ip->i_mount; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); @@ -467,7 +469,21 @@ xfs_bmap_punch_delalloc_range( continue; } - xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, &del); + if (xfs_is_zoned_inode(ip) && ac) { + /* + * In a zoned buffered write context we need to return + * the punched delalloc allocations to the allocation + * context. This allows reusing them in the following + * iomap iterations. 
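+ * + * That is, ac->reserved_blocks grows by del.br_blockcount below instead of the blocks going back to the global free space counters, so the next iteration can consume them without taking a new reservation.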
+ */ + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, + &del, XFS_BMAPI_REMAP); + ac->reserved_blocks += del.br_blockcount; + } else { + xfs_bmap_del_extent_delay(ip, whichfork, &icur, &got, + &del, 0); + } + if (!xfs_iext_get_extent(ifp, &icur, &got)) break; } @@ -582,7 +598,7 @@ xfs_free_eofblocks( if (ip->i_delayed_blks) { xfs_bmap_punch_delalloc_range(ip, XFS_DATA_FORK, round_up(XFS_ISIZE(ip), mp->m_sb.sb_blocksize), - LLONG_MAX); + LLONG_MAX, NULL); } xfs_inode_clear_eofblocks_tag(ip); return 0; @@ -825,7 +841,8 @@ int xfs_free_file_space( struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len) + xfs_off_t len, + struct xfs_zone_alloc_ctx *ac) { struct xfs_mount *mp = ip->i_mount; xfs_fileoff_t startoffset_fsb; @@ -880,7 +897,7 @@ xfs_free_file_space( return 0; if (offset + len > XFS_ISIZE(ip)) len = XFS_ISIZE(ip) - offset; - error = xfs_zero_range(ip, offset, len, NULL); + error = xfs_zero_range(ip, offset, len, ac, NULL); if (error) return error; @@ -968,7 +985,8 @@ int xfs_collapse_file_space( struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len) + xfs_off_t len, + struct xfs_zone_alloc_ctx *ac) { struct xfs_mount *mp = ip->i_mount; struct xfs_trans *tp; @@ -981,7 +999,7 @@ xfs_collapse_file_space( trace_xfs_collapse_file_space(ip); - error = xfs_free_file_space(ip, offset, len); + error = xfs_free_file_space(ip, offset, len, ac); if (error) return error; diff --git a/fs/xfs/xfs_bmap_util.h b/fs/xfs/xfs_bmap_util.h index b29760d36e1a..c477b3361630 100644 --- a/fs/xfs/xfs_bmap_util.h +++ b/fs/xfs/xfs_bmap_util.h @@ -15,6 +15,7 @@ struct xfs_inode; struct xfs_mount; struct xfs_trans; struct xfs_bmalloca; +struct xfs_zone_alloc_ctx; #ifdef CONFIG_XFS_RT int xfs_bmap_rtalloc(struct xfs_bmalloca *ap); @@ -31,7 +32,8 @@ xfs_bmap_rtalloc(struct xfs_bmalloca *ap) #endif /* CONFIG_XFS_RT */ void xfs_bmap_punch_delalloc_range(struct xfs_inode *ip, int whichfork, - xfs_off_t start_byte, xfs_off_t end_byte); + xfs_off_t start_byte, xfs_off_t end_byte, + struct xfs_zone_alloc_ctx *ac); struct kgetbmap { __s64 bmv_offset; /* file offset of segment in blocks */ @@ -54,13 +56,13 @@ int xfs_bmap_last_extent(struct xfs_trans *tp, struct xfs_inode *ip, /* preallocation and hole punch interface */ int xfs_alloc_file_space(struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len); + xfs_off_t len); int xfs_free_file_space(struct xfs_inode *ip, xfs_off_t offset, - xfs_off_t len); + xfs_off_t len, struct xfs_zone_alloc_ctx *ac); int xfs_collapse_file_space(struct xfs_inode *, xfs_off_t offset, - xfs_off_t len); + xfs_off_t len, struct xfs_zone_alloc_ctx *ac); int xfs_insert_file_space(struct xfs_inode *, xfs_off_t offset, - xfs_off_t len); + xfs_off_t len); /* EOF block manipulation functions */ bool xfs_can_free_eofblocks(struct xfs_inode *ip); diff --git a/fs/xfs/xfs_buf.c b/fs/xfs/xfs_buf.c index 15bb790359f8..8af83bd161f9 100644 --- a/fs/xfs/xfs_buf.c +++ b/fs/xfs/xfs_buf.c @@ -29,11 +29,6 @@ struct kmem_cache *xfs_buf_cache; /* * Locking orders * - * xfs_buf_ioacct_inc: - * xfs_buf_ioacct_dec: - * b_sema (caller holds) - * b_lock - * * xfs_buf_stale: * b_sema (caller holds) * b_lock @@ -60,72 +55,6 @@ static inline bool xfs_buf_is_uncached(struct xfs_buf *bp) return bp->b_rhash_key == XFS_BUF_DADDR_NULL; } -static inline int -xfs_buf_is_vmapped( - struct xfs_buf *bp) -{ - /* - * Return true if the buffer is vmapped. 
- * - * b_addr is null if the buffer is not mapped, but the code is clever - * enough to know it doesn't have to map a single page, so the check has - * to be both for b_addr and bp->b_page_count > 1. - */ - return bp->b_addr && bp->b_page_count > 1; -} - -static inline int -xfs_buf_vmap_len( - struct xfs_buf *bp) -{ - return (bp->b_page_count * PAGE_SIZE); -} - -/* - * Bump the I/O in flight count on the buftarg if we haven't yet done so for - * this buffer. The count is incremented once per buffer (per hold cycle) - * because the corresponding decrement is deferred to buffer release. Buffers - * can undergo I/O multiple times in a hold-release cycle and per buffer I/O - * tracking adds unnecessary overhead. This is used for sychronization purposes - * with unmount (see xfs_buftarg_drain()), so all we really need is a count of - * in-flight buffers. - * - * Buffers that are never released (e.g., superblock, iclog buffers) must set - * the XBF_NO_IOACCT flag before I/O submission. Otherwise, the buftarg count - * never reaches zero and unmount hangs indefinitely. - */ -static inline void -xfs_buf_ioacct_inc( - struct xfs_buf *bp) -{ - if (bp->b_flags & XBF_NO_IOACCT) - return; - - ASSERT(bp->b_flags & XBF_ASYNC); - spin_lock(&bp->b_lock); - if (!(bp->b_state & XFS_BSTATE_IN_FLIGHT)) { - bp->b_state |= XFS_BSTATE_IN_FLIGHT; - percpu_counter_inc(&bp->b_target->bt_io_count); - } - spin_unlock(&bp->b_lock); -} - -/* - * Clear the in-flight state on a buffer about to be released to the LRU or - * freed and unaccount from the buftarg. - */ -static inline void -__xfs_buf_ioacct_dec( - struct xfs_buf *bp) -{ - lockdep_assert_held(&bp->b_lock); - - if (bp->b_state & XFS_BSTATE_IN_FLIGHT) { - bp->b_state &= ~XFS_BSTATE_IN_FLIGHT; - percpu_counter_dec(&bp->b_target->bt_io_count); - } -} - /* * When we mark a buffer stale, we remove the buffer from the LRU and clear the * b_lru_ref count so that the buffer is freed immediately when the buffer @@ -149,15 +78,7 @@ xfs_buf_stale( */ bp->b_flags &= ~_XBF_DELWRI_Q; - /* - * Once the buffer is marked stale and unlocked, a subsequent lookup - * could reset b_flags. There is no guarantee that the buffer is - * unaccounted (released to LRU) before that occurs. Drop in-flight - * status now to preserve accounting consistency. 
- */ spin_lock(&bp->b_lock); - __xfs_buf_ioacct_dec(bp); - atomic_set(&bp->b_lru_ref, 0); if (!(bp->b_state & XFS_BSTATE_DISPOSE) && (list_lru_del_obj(&bp->b_target->bt_lru, &bp->b_lru))) @@ -167,38 +88,169 @@ spin_unlock(&bp->b_lock); } +static void +xfs_buf_free_callback( + struct callback_head *cb) +{ + struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu); + + if (bp->b_maps != &bp->__b_map) + kfree(bp->b_maps); + kmem_cache_free(xfs_buf_cache, bp); +} + +static void +xfs_buf_free( + struct xfs_buf *bp) +{ + unsigned int size = BBTOB(bp->b_length); + + might_sleep(); + trace_xfs_buf_free(bp, _RET_IP_); + + ASSERT(list_empty(&bp->b_lru)); + + if (!xfs_buftarg_is_mem(bp->b_target) && size >= PAGE_SIZE) + mm_account_reclaimed_pages(howmany(size, PAGE_SHIFT)); + + if (is_vmalloc_addr(bp->b_addr)) + vfree(bp->b_addr); + else if (bp->b_flags & _XBF_KMEM) + kfree(bp->b_addr); + else + folio_put(virt_to_folio(bp->b_addr)); + + call_rcu(&bp->b_rcu, xfs_buf_free_callback); +} + static int -xfs_buf_get_maps( +xfs_buf_alloc_kmem( struct xfs_buf *bp, - int map_count) + size_t size, + gfp_t gfp_mask) { - ASSERT(bp->b_maps == NULL); - bp->b_map_count = map_count; + ASSERT(is_power_of_2(size)); + ASSERT(size < PAGE_SIZE); - if (map_count == 1) { - bp->b_maps = &bp->__b_map; - return 0; - } + bp->b_addr = kmalloc(size, gfp_mask | __GFP_NOFAIL); + if (!bp->b_addr) + return -ENOMEM; - bp->b_maps = kzalloc(map_count * sizeof(struct xfs_buf_map), - GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); - if (!bp->b_maps) + /* + * Slab guarantees that we get back naturally aligned allocations for + * power of two sizes. Keep this check as the canary in the coal mine + * if anything changes in slab. + */ + if (WARN_ON_ONCE(!IS_ALIGNED((unsigned long)bp->b_addr, size))) { + kfree(bp->b_addr); + bp->b_addr = NULL; return -ENOMEM; + } + bp->b_flags |= _XBF_KMEM; + trace_xfs_buf_backing_kmem(bp, _RET_IP_); return 0; } -static void -xfs_buf_free_maps( - struct xfs_buf *bp) +/* + * Allocate backing memory for a buffer. + * + * For tmpfs-backed buffers used by in-memory btrees this directly maps the + * tmpfs page cache folios. + * + * For real file system buffers there are three different kinds of backing + * memory: + * + * The first type backs the buffer by a kmalloc allocation. This is done for + * less than PAGE_SIZE allocations to avoid wasting memory. + * + * The second type is a single folio buffer - this may be a high order folio or + * just a single page sized folio, but either way they get treated the same way + * by the rest of the code - the buffer memory spans a single contiguous memory + * region that we don't have to map and unmap to access the data directly. + * + * The third type of buffer is the vmalloc()d buffer. This provides the buffer + * with the required contiguous memory region but backed by discontiguous + * physical pages. + */ +static int +xfs_buf_alloc_backing_mem( + struct xfs_buf *bp, + xfs_buf_flags_t flags) { - if (bp->b_maps != &bp->__b_map) { - kfree(bp->b_maps); - bp->b_maps = NULL; + size_t size = BBTOB(bp->b_length); + gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN; + struct folio *folio; + + if (xfs_buftarg_is_mem(bp->b_target)) + return xmbuf_map_backing_mem(bp); + + /* Assure zeroed buffer for non-read cases. */ + if (!(flags & XBF_READ)) + gfp_mask |= __GFP_ZERO; + + if (flags & XBF_READ_AHEAD) + gfp_mask |= __GFP_NORETRY; + + /* + * For buffers smaller than PAGE_SIZE use a kmalloc allocation if that + * is properly aligned. 
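+ * (Properly aligned for a power-of-two size means ((uintptr_t)addr & (size - 1)) == 0, which is what the IS_ALIGNED() canary in xfs_buf_alloc_kmem() verifies.)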
The slab allocator now guarantees an aligned + * allocation for all power of two sizes, which matches most of the + * smaller than PAGE_SIZE buffers used by XFS. + */ + if (size < PAGE_SIZE && is_power_of_2(size)) + return xfs_buf_alloc_kmem(bp, size, gfp_mask); + + /* + * Don't bother with the retry loop for single PAGE allocations: vmalloc + * won't do any better. + */ + if (size <= PAGE_SIZE) + gfp_mask |= __GFP_NOFAIL; + + /* + * Optimistically attempt a single high order folio allocation for + * larger than PAGE_SIZE buffers. + * + * Allocating a high order folio makes the assumption that buffers are a + * power-of-2 size, matching the power-of-2 folio sizes available. + * + * The exception here is user xattr data buffers, which can be arbitrarily + * sized up to 64kB plus structure metadata; skip straight to the vmalloc + * path for them instead of wasting memory here. + */ + if (size > PAGE_SIZE) { + if (!is_power_of_2(size)) + goto fallback; + gfp_mask &= ~__GFP_DIRECT_RECLAIM; + gfp_mask |= __GFP_NORETRY; } + folio = folio_alloc(gfp_mask, get_order(size)); + if (!folio) { + if (size <= PAGE_SIZE) + return -ENOMEM; + trace_xfs_buf_backing_fallback(bp, _RET_IP_); + goto fallback; + } + bp->b_addr = folio_address(folio); + trace_xfs_buf_backing_folio(bp, _RET_IP_); + return 0; + +fallback: + for (;;) { + bp->b_addr = __vmalloc(size, gfp_mask); + if (bp->b_addr) + break; + if (flags & XBF_READ_AHEAD) + return -ENOMEM; + XFS_STATS_INC(bp->b_mount, xb_page_retries); + memalloc_retry_wait(gfp_mask); + } + + trace_xfs_buf_backing_vmalloc(bp, _RET_IP_); + return 0; } static int -_xfs_buf_alloc( +xfs_buf_alloc( struct xfs_buftarg *target, struct xfs_buf_map *map, int nmaps, @@ -217,7 +269,7 @@ _xfs_buf_alloc( * We don't want certain flags to appear in b_flags unless they are * specifically set by later operations on the buffer. */ - flags &= ~(XBF_UNMAPPED | XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); + flags &= ~(XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD); /* * A new buffer is held and locked by the owner. 
This ensures that the @@ -237,15 +289,14 @@ _xfs_buf_alloc( bp->b_target = target; bp->b_mount = target->bt_mount; bp->b_flags = flags; - - error = xfs_buf_get_maps(bp, nmaps); - if (error) { - kmem_cache_free(xfs_buf_cache, bp); - return error; - } - bp->b_rhash_key = map[0].bm_bn; bp->b_length = 0; + bp->b_map_count = nmaps; + if (nmaps == 1) + bp->b_maps = &bp->__b_map; + else + bp->b_maps = kcalloc(nmaps, sizeof(struct xfs_buf_map), + GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL); for (i = 0; i < nmaps; i++) { bp->b_maps[i].bm_bn = map[i].bm_bn; bp->b_maps[i].bm_len = map[i].bm_len; @@ -258,195 +309,13 @@ _xfs_buf_alloc( XFS_STATS_INC(bp->b_mount, xb_create); trace_xfs_buf_init(bp, _RET_IP_); - *bpp = bp; - return 0; -} - -static void -xfs_buf_free_pages( - struct xfs_buf *bp) -{ - uint i; - - ASSERT(bp->b_flags & _XBF_PAGES); - - if (xfs_buf_is_vmapped(bp)) - vm_unmap_ram(bp->b_addr, bp->b_page_count); - - for (i = 0; i < bp->b_page_count; i++) { - if (bp->b_pages[i]) - __free_page(bp->b_pages[i]); - } - mm_account_reclaimed_pages(bp->b_page_count); - - if (bp->b_pages != bp->b_page_array) - kfree(bp->b_pages); - bp->b_pages = NULL; - bp->b_flags &= ~_XBF_PAGES; -} - -static void -xfs_buf_free_callback( - struct callback_head *cb) -{ - struct xfs_buf *bp = container_of(cb, struct xfs_buf, b_rcu); - - xfs_buf_free_maps(bp); - kmem_cache_free(xfs_buf_cache, bp); -} - -static void -xfs_buf_free( - struct xfs_buf *bp) -{ - trace_xfs_buf_free(bp, _RET_IP_); - - ASSERT(list_empty(&bp->b_lru)); - - if (xfs_buftarg_is_mem(bp->b_target)) - xmbuf_unmap_page(bp); - else if (bp->b_flags & _XBF_PAGES) - xfs_buf_free_pages(bp); - else if (bp->b_flags & _XBF_KMEM) - kfree(bp->b_addr); - - call_rcu(&bp->b_rcu, xfs_buf_free_callback); -} - -static int -xfs_buf_alloc_kmem( - struct xfs_buf *bp, - xfs_buf_flags_t flags) -{ - gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOFAIL; - size_t size = BBTOB(bp->b_length); - - /* Assure zeroed buffer for non-read cases. */ - if (!(flags & XBF_READ)) - gfp_mask |= __GFP_ZERO; - - bp->b_addr = kmalloc(size, gfp_mask); - if (!bp->b_addr) - return -ENOMEM; - - if (((unsigned long)(bp->b_addr + size - 1) & PAGE_MASK) != - ((unsigned long)bp->b_addr & PAGE_MASK)) { - /* b_addr spans two pages - use alloc_page instead */ - kfree(bp->b_addr); - bp->b_addr = NULL; - return -ENOMEM; - } - bp->b_offset = offset_in_page(bp->b_addr); - bp->b_pages = bp->b_page_array; - bp->b_pages[0] = kmem_to_page(bp->b_addr); - bp->b_page_count = 1; - bp->b_flags |= _XBF_KMEM; - return 0; -} - -static int -xfs_buf_alloc_pages( - struct xfs_buf *bp, - xfs_buf_flags_t flags) -{ - gfp_t gfp_mask = GFP_KERNEL | __GFP_NOLOCKDEP | __GFP_NOWARN; - long filled = 0; - - if (flags & XBF_READ_AHEAD) - gfp_mask |= __GFP_NORETRY; - - /* Make sure that we have a page list */ - bp->b_page_count = DIV_ROUND_UP(BBTOB(bp->b_length), PAGE_SIZE); - if (bp->b_page_count <= XB_PAGES) { - bp->b_pages = bp->b_page_array; - } else { - bp->b_pages = kzalloc(sizeof(struct page *) * bp->b_page_count, - gfp_mask); - if (!bp->b_pages) - return -ENOMEM; - } - bp->b_flags |= _XBF_PAGES; - - /* Assure zeroed buffer for non-read cases. */ - if (!(flags & XBF_READ)) - gfp_mask |= __GFP_ZERO; - - /* - * Bulk filling of pages can take multiple calls. Not filling the entire - * array is not an allocation failure, so don't back off if we get at - * least one extra page. 
- */ - for (;;) { - long last = filled; - - filled = alloc_pages_bulk(gfp_mask, bp->b_page_count, - bp->b_pages); - if (filled == bp->b_page_count) { - XFS_STATS_INC(bp->b_mount, xb_page_found); - break; - } - - if (filled != last) - continue; - - if (flags & XBF_READ_AHEAD) { - xfs_buf_free_pages(bp); - return -ENOMEM; - } - - XFS_STATS_INC(bp->b_mount, xb_page_retries); - memalloc_retry_wait(gfp_mask); - } - return 0; -} - -/* - * Map buffer into kernel address-space if necessary. - */ -STATIC int -_xfs_buf_map_pages( - struct xfs_buf *bp, - xfs_buf_flags_t flags) -{ - ASSERT(bp->b_flags & _XBF_PAGES); - if (bp->b_page_count == 1) { - /* A single page buffer is always mappable */ - bp->b_addr = page_address(bp->b_pages[0]); - } else if (flags & XBF_UNMAPPED) { - bp->b_addr = NULL; - } else { - int retried = 0; - unsigned nofs_flag; - - /* - * vm_map_ram() will allocate auxiliary structures (e.g. - * pagetables) with GFP_KERNEL, yet we often under a scoped nofs - * context here. Mixing GFP_KERNEL with GFP_NOFS allocations - * from the same call site that can be run from both above and - * below memory reclaim causes lockdep false positives. Hence we - * always need to force this allocation to nofs context because - * we can't pass __GFP_NOLOCKDEP down to auxillary structures to - * prevent false positive lockdep reports. - * - * XXX(dgc): I think dquot reclaim is the only place we can get - * to this function from memory reclaim context now. If we fix - * that like we've fixed inode reclaim to avoid writeback from - * reclaim, this nofs wrapping can go away. - */ - nofs_flag = memalloc_nofs_save(); - do { - bp->b_addr = vm_map_ram(bp->b_pages, bp->b_page_count, - -1); - if (bp->b_addr) - break; - vm_unmap_aliases(); - } while (retried++ <= 1); - memalloc_nofs_restore(nofs_flag); - - if (!bp->b_addr) - return -ENOMEM; + error = xfs_buf_alloc_backing_mem(bp, flags); + if (error) { + xfs_buf_free(bp); + return error; } + *bpp = bp; return 0; } @@ -565,7 +434,7 @@ xfs_buf_find_lock( return -ENOENT; } ASSERT((bp->b_flags & _XBF_DELWRI_Q) == 0); - bp->b_flags &= _XBF_KMEM | _XBF_PAGES; + bp->b_flags &= _XBF_KMEM; bp->b_ops = NULL; } return 0; @@ -633,25 +502,10 @@ xfs_buf_find_insert( struct xfs_buf *bp; int error; - error = _xfs_buf_alloc(btp, map, nmaps, flags, &new_bp); + error = xfs_buf_alloc(btp, map, nmaps, flags, &new_bp); if (error) goto out_drop_pag; - if (xfs_buftarg_is_mem(new_bp->b_target)) { - error = xmbuf_map_page(new_bp); - } else if (BBTOB(new_bp->b_length) >= PAGE_SIZE || - xfs_buf_alloc_kmem(new_bp, flags) < 0) { - /* - * For buffers that fit entirely within a single page, first - * attempt to allocate the memory from the heap to minimise - * memory usage. If we can't get heap memory for these small - * buffers, we fall back to using the page allocator. - */ - error = xfs_buf_alloc_pages(new_bp, flags); - } - if (error) - goto out_free_buf; - /* The new buffer keeps the perag reference until it is freed. */ new_bp->b_pag = pag; @@ -762,18 +616,6 @@ xfs_buf_get_map( xfs_perag_put(pag); } - /* We do not hold a perag reference anymore. */ - if (!bp->b_addr) { - error = _xfs_buf_map_pages(bp, flags); - if (unlikely(error)) { - xfs_warn_ratelimited(btp->bt_mount, - "%s: failed to map %u pages", __func__, - bp->b_page_count); - xfs_buf_relse(bp); - return error; - } - } - /* * Clear b_error if this is a lookup from a caller that doesn't expect * valid data to be found in the buffer. 
@@ -794,18 +636,13 @@ out_put_perag: int _xfs_buf_read( - struct xfs_buf *bp, - xfs_buf_flags_t flags) + struct xfs_buf *bp) { - ASSERT(!(flags & XBF_WRITE)); ASSERT(bp->b_maps[0].bm_bn != XFS_BUF_DADDR_NULL); bp->b_flags &= ~(XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD | XBF_DONE); - bp->b_flags |= flags & (XBF_READ | XBF_ASYNC | XBF_READ_AHEAD); - + bp->b_flags |= XBF_READ; xfs_buf_submit(bp); - if (flags & XBF_ASYNC) - return 0; return xfs_buf_iowait(bp); } @@ -857,6 +694,8 @@ xfs_buf_read_map( struct xfs_buf *bp; int error; + ASSERT(!(flags & (XBF_WRITE | XBF_ASYNC | XBF_READ_AHEAD))); + flags |= XBF_READ; *bpp = NULL; @@ -870,21 +709,11 @@ xfs_buf_read_map( /* Initiate the buffer read and wait. */ XFS_STATS_INC(target->bt_mount, xb_get_read); bp->b_ops = ops; - error = _xfs_buf_read(bp, flags); - - /* Readahead iodone already dropped the buffer, so exit. */ - if (flags & XBF_ASYNC) - return 0; + error = _xfs_buf_read(bp); } else { /* Buffer already read; all we need to do is check it. */ error = xfs_buf_reverify(bp, ops); - /* Readahead already finished; drop the buffer and exit. */ - if (flags & XBF_ASYNC) { - xfs_buf_relse(bp); - return 0; - } - /* We do not want read in the flags */ bp->b_flags &= ~XBF_READ; ASSERT(bp->b_ops != NULL || ops == NULL); @@ -936,6 +765,7 @@ xfs_buf_readahead_map( int nmaps, const struct xfs_buf_ops *ops) { + const xfs_buf_flags_t flags = XBF_READ | XBF_ASYNC | XBF_READ_AHEAD; struct xfs_buf *bp; /* @@ -945,9 +775,21 @@ xfs_buf_readahead_map( if (xfs_buftarg_is_mem(target)) return; - xfs_buf_read_map(target, map, nmaps, - XBF_TRYLOCK | XBF_ASYNC | XBF_READ_AHEAD, &bp, ops, - __this_address); + if (xfs_buf_get_map(target, map, nmaps, flags | XBF_TRYLOCK, &bp)) + return; + trace_xfs_buf_readahead(bp, 0, _RET_IP_); + + if (bp->b_flags & XBF_DONE) { + xfs_buf_reverify(bp, ops); + xfs_buf_relse(bp); + return; + } + XFS_STATS_INC(target->bt_mount, xb_get_read); + bp->b_ops = ops; + bp->b_flags &= ~(XBF_WRITE | XBF_DONE); + bp->b_flags |= flags; + percpu_counter_inc(&target->bt_readahead_count); + xfs_buf_submit(bp); } /* @@ -961,7 +803,6 @@ xfs_buf_read_uncached( struct xfs_buftarg *target, xfs_daddr_t daddr, size_t numblks, - xfs_buf_flags_t flags, struct xfs_buf **bpp, const struct xfs_buf_ops *ops) { @@ -970,7 +811,7 @@ xfs_buf_read_uncached( *bpp = NULL; - error = xfs_buf_get_uncached(target, numblks, flags, &bp); + error = xfs_buf_get_uncached(target, numblks, &bp); if (error) return error; @@ -996,40 +837,14 @@ int xfs_buf_get_uncached( struct xfs_buftarg *target, size_t numblks, - xfs_buf_flags_t flags, struct xfs_buf **bpp) { int error; - struct xfs_buf *bp; DEFINE_SINGLE_BUF_MAP(map, XFS_BUF_DADDR_NULL, numblks); - *bpp = NULL; - - /* flags might contain irrelevant bits, pass only what we care about */ - error = _xfs_buf_alloc(target, &map, 1, flags & XBF_NO_IOACCT, &bp); - if (error) - return error; - - if (xfs_buftarg_is_mem(bp->b_target)) - error = xmbuf_map_page(bp); - else - error = xfs_buf_alloc_pages(bp, flags); - if (error) - goto fail_free_buf; - - error = _xfs_buf_map_pages(bp, 0); - if (unlikely(error)) { - xfs_warn(target->bt_mount, - "%s: failed to map pages", __func__); - goto fail_free_buf; - } - - trace_xfs_buf_get_uncached(bp, _RET_IP_); - *bpp = bp; - return 0; - -fail_free_buf: - xfs_buf_free(bp); + error = xfs_buf_alloc(target, &map, 1, 0, bpp); + if (!error) + trace_xfs_buf_get_uncached(*bpp, _RET_IP_); return error; } @@ -1060,7 +875,6 @@ xfs_buf_rele_uncached( spin_unlock(&bp->b_lock); return; } - __xfs_buf_ioacct_dec(bp); 
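
With this rework, bt_io_count, which counted every asynchronous I/O, becomes bt_readahead_count, which only tracks speculative readahead submissions (incremented before xfs_buf_submit(), decremented in the read completion path). A simplified model of that accounting, with one C11 atomic standing in for the kernel's per-CPU counter:

#include <sched.h>
#include <stdatomic.h>

static _Atomic long readahead_count;	/* stand-in for bt_readahead_count */

static void readahead_submit(void)	/* before submitting the async read */
{
	atomic_fetch_add(&readahead_count, 1);
}

static void readahead_complete(void)	/* in the read completion path */
{
	atomic_fetch_sub(&readahead_count, 1);
}

/* Quiesce, as xfs_buftarg_wait() does before walking the LRU. */
static void readahead_drain(void)
{
	while (atomic_load(&readahead_count) > 0)
		sched_yield();		/* the kernel sleeps in delay(100) */
}
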
spin_unlock(&bp->b_lock); xfs_buf_free(bp); } @@ -1079,20 +893,12 @@ xfs_buf_rele_cached( spin_lock(&bp->b_lock); ASSERT(bp->b_hold >= 1); if (bp->b_hold > 1) { - /* - * Drop the in-flight state if the buffer is already on the LRU - * and it holds the only reference. This is racy because we - * haven't acquired the pag lock, but the use of _XBF_IN_FLIGHT - * ensures the decrement occurs only once per-buf. - */ - if (--bp->b_hold == 1 && !list_empty(&bp->b_lru)) - __xfs_buf_ioacct_dec(bp); + bp->b_hold--; goto out_unlock; } /* we are asked to drop the last reference */ - __xfs_buf_ioacct_dec(bp); - if (!(bp->b_flags & XBF_STALE) && atomic_read(&bp->b_lru_ref)) { + if (atomic_read(&bp->b_lru_ref)) { /* * If the buffer is added to the LRU, keep the reference to the * buffer for the LRU and clear the (now stale) dispose list @@ -1345,6 +1151,7 @@ xfs_buf_ioend_handle_error( resubmit: xfs_buf_ioerror(bp, 0); bp->b_flags |= (XBF_DONE | XBF_WRITE_FAIL); + reinit_completion(&bp->b_iowait); xfs_buf_submit(bp); return true; out_stale: @@ -1355,20 +1162,23 @@ out_stale: return false; } -static void -xfs_buf_ioend( +/* returns false if the caller needs to resubmit the I/O, else true */ +static bool +__xfs_buf_ioend( struct xfs_buf *bp) { trace_xfs_buf_iodone(bp, _RET_IP_); if (bp->b_flags & XBF_READ) { - if (!bp->b_error && xfs_buf_is_vmapped(bp)) + if (!bp->b_error && is_vmalloc_addr(bp->b_addr)) invalidate_kernel_vmap_range(bp->b_addr, - xfs_buf_vmap_len(bp)); + roundup(BBTOB(bp->b_length), PAGE_SIZE)); if (!bp->b_error && bp->b_ops) bp->b_ops->verify_read(bp); if (!bp->b_error) bp->b_flags |= XBF_DONE; + if (bp->b_flags & XBF_READ_AHEAD) + percpu_counter_dec(&bp->b_target->bt_readahead_count); } else { if (!bp->b_error) { bp->b_flags &= ~XBF_WRITE_FAIL; @@ -1376,7 +1186,7 @@ xfs_buf_ioend( } if (unlikely(bp->b_error) && xfs_buf_ioend_handle_error(bp)) - return; + return false; /* clear the retry state */ bp->b_last_error = 0; @@ -1397,7 +1207,15 @@ xfs_buf_ioend( bp->b_flags &= ~(XBF_READ | XBF_WRITE | XBF_READ_AHEAD | _XBF_LOGRECOVERY); + return true; +} +static void +xfs_buf_ioend( + struct xfs_buf *bp) +{ + if (!__xfs_buf_ioend(bp)) + return; if (bp->b_flags & XBF_ASYNC) xfs_buf_relse(bp); else @@ -1411,15 +1229,8 @@ xfs_buf_ioend_work( struct xfs_buf *bp = container_of(work, struct xfs_buf, b_ioend_work); - xfs_buf_ioend(bp); -} - -static void -xfs_buf_ioend_async( - struct xfs_buf *bp) -{ - INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work); - queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work); + if (__xfs_buf_ioend(bp)) + xfs_buf_relse(bp); } void @@ -1491,7 +1302,13 @@ xfs_buf_bio_end_io( XFS_TEST_ERROR(false, bp->b_mount, XFS_ERRTAG_BUF_IOERROR)) xfs_buf_ioerror(bp, -EIO); - xfs_buf_ioend_async(bp); + if (bp->b_flags & XBF_ASYNC) { + INIT_WORK(&bp->b_ioend_work, xfs_buf_ioend_work); + queue_work(bp->b_mount->m_buf_workqueue, &bp->b_ioend_work); + } else { + complete(&bp->b_iowait); + } + bio_put(bio); } @@ -1516,29 +1333,21 @@ static void xfs_buf_submit_bio( struct xfs_buf *bp) { - unsigned int size = BBTOB(bp->b_length); - unsigned int map = 0, p; + unsigned int len = BBTOB(bp->b_length); + unsigned int nr_vecs = bio_add_max_vecs(bp->b_addr, len); + unsigned int map = 0; struct blk_plug plug; struct bio *bio; - bio = bio_alloc(bp->b_target->bt_bdev, bp->b_page_count, - xfs_buf_bio_op(bp), GFP_NOIO); + bio = bio_alloc(bp->b_target->bt_bdev, nr_vecs, xfs_buf_bio_op(bp), + GFP_NOIO); + if (is_vmalloc_addr(bp->b_addr)) + bio_add_vmalloc(bio, bp->b_addr, len); + else + 
bio_add_virt_nofail(bio, bp->b_addr, len); bio->bi_private = bp; bio->bi_end_io = xfs_buf_bio_end_io; - if (bp->b_flags & _XBF_KMEM) { - __bio_add_page(bio, virt_to_page(bp->b_addr), size, - bp->b_offset); - } else { - for (p = 0; p < bp->b_page_count; p++) - __bio_add_page(bio, bp->b_pages[p], PAGE_SIZE, 0); - bio->bi_iter.bi_size = size; /* limit to the actual size used */ - - if (xfs_buf_is_vmapped(bp)) - flush_kernel_vmap_range(bp->b_addr, - xfs_buf_vmap_len(bp)); - } - /* * If there is more than one map segment, split out a new bio for each * map except of the last one. The last map is handled by the @@ -1568,9 +1377,11 @@ xfs_buf_iowait( { ASSERT(!(bp->b_flags & XBF_ASYNC)); - trace_xfs_buf_iowait(bp, _RET_IP_); - wait_for_completion(&bp->b_iowait); - trace_xfs_buf_iowait_done(bp, _RET_IP_); + do { + trace_xfs_buf_iowait(bp, _RET_IP_); + wait_for_completion(&bp->b_iowait); + trace_xfs_buf_iowait_done(bp, _RET_IP_); + } while (!__xfs_buf_ioend(bp)); return bp->b_error; } @@ -1648,9 +1459,6 @@ xfs_buf_submit( */ bp->b_error = 0; - if (bp->b_flags & XBF_ASYNC) - xfs_buf_ioacct_inc(bp); - if ((bp->b_flags & XBF_WRITE) && !xfs_buf_verify_write(bp)) { xfs_force_shutdown(bp->b_mount, SHUTDOWN_CORRUPT_INCORE); xfs_buf_ioend(bp); @@ -1666,47 +1474,6 @@ xfs_buf_submit( xfs_buf_submit_bio(bp); } -void * -xfs_buf_offset( - struct xfs_buf *bp, - size_t offset) -{ - struct page *page; - - if (bp->b_addr) - return bp->b_addr + offset; - - page = bp->b_pages[offset >> PAGE_SHIFT]; - return page_address(page) + (offset & (PAGE_SIZE-1)); -} - -void -xfs_buf_zero( - struct xfs_buf *bp, - size_t boff, - size_t bsize) -{ - size_t bend; - - bend = boff + bsize; - while (boff < bend) { - struct page *page; - int page_index, page_offset, csize; - - page_index = (boff + bp->b_offset) >> PAGE_SHIFT; - page_offset = (boff + bp->b_offset) & ~PAGE_MASK; - page = bp->b_pages[page_index]; - csize = min_t(size_t, PAGE_SIZE - page_offset, - BBTOB(bp->b_length) - boff); - - ASSERT((csize + page_offset) <= PAGE_SIZE); - - memset(page_address(page) + page_offset, 0, csize); - - boff += csize; - } -} - /* * Log a message about and stale a buffer that a caller has decided is corrupt. * @@ -1776,9 +1543,8 @@ xfs_buftarg_wait( struct xfs_buftarg *btp) { /* - * First wait on the buftarg I/O count for all in-flight buffers to be - * released. This is critical as new buffers do not make the LRU until - * they are released. + * First wait for all in-flight readahead buffers to be released. This is + * critical as new buffers do not make the LRU until they are released. * * Next, flush the buffer workqueue to ensure all completion processing * has finished. Just waiting on buffer locks is not sufficient for @@ -1787,7 +1553,7 @@ xfs_buftarg_wait( * all reference counts have been dropped before we start walking the * LRU list. 
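
Because __xfs_buf_ioend() can now ask for a resubmit (it returns false after the error handler requeues the I/O), synchronous waiters must loop rather than wait once, which is exactly what the new do/while in xfs_buf_iowait() does. A schematic model of that contract, with the completion wait elided:

#include <stdbool.h>

struct io { int error; int retries_left; };

/* Stand-in for __xfs_buf_ioend(): false means "resubmitted, wait again". */
static bool io_end(struct io *io)
{
	if (io->error && io->retries_left-- > 0) {
		io->error = 0;		/* error handler resubmitted the I/O */
		return false;
	}
	return true;
}

static int io_wait(struct io *io)
{
	do {
		/* wait_for_completion(&bp->b_iowait) in the real code */
	} while (!io_end(io));
	return io->error;
}
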
*/ - while (percpu_counter_sum(&btp->bt_io_count)) + while (percpu_counter_sum(&btp->bt_readahead_count)) delay(100); flush_workqueue(btp->bt_mount->m_buf_workqueue); } @@ -1904,8 +1670,8 @@ xfs_destroy_buftarg( struct xfs_buftarg *btp) { shrinker_free(btp->bt_shrinker); - ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); - percpu_counter_destroy(&btp->bt_io_count); + ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0); + percpu_counter_destroy(&btp->bt_readahead_count); list_lru_destroy(&btp->bt_lru); } @@ -1921,23 +1687,65 @@ xfs_free_buftarg( kfree(btp); } +/* + * Configure this buffer target for hardware-assisted atomic writes if the + * underlying block device's support is congruent with the filesystem geometry. + */ +static inline void +xfs_configure_buftarg_atomic_writes( + struct xfs_buftarg *btp) +{ + struct xfs_mount *mp = btp->bt_mount; + unsigned int min_bytes, max_bytes; + + min_bytes = bdev_atomic_write_unit_min_bytes(btp->bt_bdev); + max_bytes = bdev_atomic_write_unit_max_bytes(btp->bt_bdev); + + /* + * Ignore atomic write geometry that is nonsense or doesn't even cover + * a single fsblock. + */ + if (min_bytes > max_bytes || + min_bytes > mp->m_sb.sb_blocksize || + max_bytes < mp->m_sb.sb_blocksize) { + min_bytes = 0; + max_bytes = 0; + } + + btp->bt_bdev_awu_min = min_bytes; + btp->bt_bdev_awu_max = max_bytes; +} + +/* Configure a buffer target that abstracts a block device. */ int -xfs_setsize_buftarg( +xfs_configure_buftarg( struct xfs_buftarg *btp, unsigned int sectorsize) { + int error; + + ASSERT(btp->bt_bdev != NULL); + /* Set up metadata sector size info */ btp->bt_meta_sectorsize = sectorsize; btp->bt_meta_sectormask = sectorsize - 1; - if (set_blocksize(btp->bt_bdev_file, sectorsize)) { + error = bdev_validate_blocksize(btp->bt_bdev, sectorsize); + if (error) { xfs_warn(btp->bt_mount, - "Cannot set_blocksize to %u on device %pg", - sectorsize, btp->bt_bdev); + "Cannot use blocksize %u on device %pg, err %d", - sectorsize, btp->bt_bdev, error); return -EINVAL; } - return 0; + if (bdev_can_atomic_write(btp->bt_bdev)) + xfs_configure_buftarg_atomic_writes(btp); + + /* + * Flush the block device pagecache so our bios see anything dirtied + * before mount. + */ + return sync_blockdev(btp->bt_bdev); } int @@ -1959,7 +1767,7 @@ xfs_init_buftarg( if (list_lru_init(&btp->bt_lru)) return -ENOMEM; - if (percpu_counter_init(&btp->bt_io_count, 0, GFP_KERNEL)) + if (percpu_counter_init(&btp->bt_readahead_count, 0, GFP_KERNEL)) goto out_destroy_lru; btp->bt_shrinker = @@ -1973,7 +1781,7 @@ xfs_init_buftarg( return 0; out_destroy_io_count: - percpu_counter_destroy(&btp->bt_io_count); + percpu_counter_destroy(&btp->bt_readahead_count); out_destroy_lru: list_lru_destroy(&btp->bt_lru); return -ENOMEM; @@ -1986,6 +1794,8 @@ xfs_alloc_buftarg( { struct xfs_buftarg *btp; const struct dax_holder_operations *ops = NULL; + int error; + #if defined(CONFIG_FS_DAX) && defined(CONFIG_MEMORY_FAILURE) ops = &xfs_dax_holder_operations; @@ -1999,28 +1809,31 @@ xfs_alloc_buftarg( btp->bt_daxdev = fs_dax_get_by_bdev(btp->bt_bdev, &btp->bt_dax_part_off, mp, ops); - if (bdev_can_atomic_write(btp->bt_bdev)) { - btp->bt_bdev_awu_min = bdev_atomic_write_unit_min_bytes( - btp->bt_bdev); - btp->bt_bdev_awu_max = bdev_atomic_write_unit_max_bytes( - btp->bt_bdev); - } + /* + * Flush and invalidate all devices' pagecaches before reading any + * metadata because XFS doesn't use the bdev pagecache.
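
The geometry check in xfs_configure_buftarg_atomic_writes() is a pure function of three byte counts, so it can be restated (and tested) in isolation; this sketch mirrors the hunk above, with min_bytes/max_bytes corresponding to the bdev atomic write unit bounds:

/* Keep the device's atomic write range only if it can cover exactly one
 * filesystem block; otherwise disable the feature by zeroing both bounds. */
static void clamp_atomic_write_units(unsigned int *min_bytes,
				     unsigned int *max_bytes,
				     unsigned int blocksize)
{
	if (*min_bytes > *max_bytes ||
	    *min_bytes > blocksize || *max_bytes < blocksize) {
		*min_bytes = 0;
		*max_bytes = 0;
	}
}

For example, a device advertising min=512/max=65536 passes for a 4096-byte block size, while min=8192/max=8192 is rejected and atomic writes stay disabled.
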
+ */ + error = sync_blockdev(btp->bt_bdev); + if (error) + goto error_free; /* * When allocating the buftargs we have not yet read the super block and * thus don't know the file system sector size yet. */ - if (xfs_setsize_buftarg(btp, bdev_logical_block_size(btp->bt_bdev))) - goto error_free; - if (xfs_init_buftarg(btp, bdev_logical_block_size(btp->bt_bdev), - mp->m_super->s_id)) + btp->bt_meta_sectorsize = bdev_logical_block_size(btp->bt_bdev); + btp->bt_meta_sectormask = btp->bt_meta_sectorsize - 1; + + error = xfs_init_buftarg(btp, btp->bt_meta_sectorsize, + mp->m_super->s_id); + if (error) goto error_free; return btp; error_free: kfree(btp); - return NULL; + return ERR_PTR(error); } static inline void diff --git a/fs/xfs/xfs_buf.h b/fs/xfs/xfs_buf.h index 3b4ed42e11c0..9d2ab567cf81 100644 --- a/fs/xfs/xfs_buf.h +++ b/fs/xfs/xfs_buf.h @@ -27,7 +27,6 @@ struct xfs_buf; #define XBF_READ (1u << 0) /* buffer intended for reading from device */ #define XBF_WRITE (1u << 1) /* buffer intended for writing to device */ #define XBF_READ_AHEAD (1u << 2) /* asynchronous read-ahead */ -#define XBF_NO_IOACCT (1u << 3) /* bypass I/O accounting (non-LRU bufs) */ #define XBF_ASYNC (1u << 4) /* initiator will not wait for completion */ #define XBF_DONE (1u << 5) /* all pages in the buffer uptodate */ #define XBF_STALE (1u << 6) /* buffer has been staled, do not find it */ @@ -37,7 +36,6 @@ struct xfs_buf; #define _XBF_LOGRECOVERY (1u << 18)/* log recovery buffer */ /* flags used only internally */ -#define _XBF_PAGES (1u << 20)/* backed by refcounted pages */ #define _XBF_KMEM (1u << 21)/* backed by heap memory */ #define _XBF_DELWRI_Q (1u << 22)/* buffer on a delwri queue */ @@ -49,7 +47,6 @@ struct xfs_buf; #define XBF_LIVESCAN (1u << 28) #define XBF_INCORE (1u << 29)/* lookup only, return if found in cache */ #define XBF_TRYLOCK (1u << 30)/* lock requested, but do not wait */ -#define XBF_UNMAPPED (1u << 31)/* do not map the buffer */ typedef unsigned int xfs_buf_flags_t; @@ -58,26 +55,22 @@ typedef unsigned int xfs_buf_flags_t; { XBF_READ, "READ" }, \ { XBF_WRITE, "WRITE" }, \ { XBF_READ_AHEAD, "READ_AHEAD" }, \ - { XBF_NO_IOACCT, "NO_IOACCT" }, \ { XBF_ASYNC, "ASYNC" }, \ { XBF_DONE, "DONE" }, \ { XBF_STALE, "STALE" }, \ { XBF_WRITE_FAIL, "WRITE_FAIL" }, \ { _XBF_LOGRECOVERY, "LOG_RECOVERY" }, \ - { _XBF_PAGES, "PAGES" }, \ { _XBF_KMEM, "KMEM" }, \ { _XBF_DELWRI_Q, "DELWRI_Q" }, \ /* The following interface flags should never be set */ \ { XBF_LIVESCAN, "LIVESCAN" }, \ { XBF_INCORE, "INCORE" }, \ - { XBF_TRYLOCK, "TRYLOCK" }, \ - { XBF_UNMAPPED, "UNMAPPED" } + { XBF_TRYLOCK, "TRYLOCK" } /* * Internal state flags. 
*/ #define XFS_BSTATE_DISPOSE (1 << 0) /* buffer being discarded */ -#define XFS_BSTATE_IN_FLIGHT (1 << 1) /* I/O in flight */ struct xfs_buf_cache { struct rhashtable bc_hash; @@ -116,10 +109,10 @@ struct xfs_buftarg { struct shrinker *bt_shrinker; struct list_lru bt_lru; - struct percpu_counter bt_io_count; + struct percpu_counter bt_readahead_count; struct ratelimit_state bt_ioerror_rl; - /* Atomic write unit values */ + /* Atomic write unit values, bytes */ unsigned int bt_bdev_awu_min; unsigned int bt_bdev_awu_max; @@ -127,8 +120,6 @@ struct xfs_buftarg { struct xfs_buf_cache bt_cache[]; }; -#define XB_PAGES 2 - struct xfs_buf_map { xfs_daddr_t bm_bn; /* block number for I/O */ int bm_len; /* size of I/O */ @@ -190,15 +181,10 @@ struct xfs_buf { struct xfs_buf_log_item *b_log_item; struct list_head b_li_list; /* Log items list head */ struct xfs_trans *b_transp; - struct page **b_pages; /* array of page pointers */ - struct page *b_page_array[XB_PAGES]; /* inline pages */ struct xfs_buf_map *b_maps; /* compound buffer map */ struct xfs_buf_map __b_map; /* inline compound buffer map */ int b_map_count; atomic_t b_pin_count; /* pin count */ - unsigned int b_page_count; /* size of page array */ - unsigned int b_offset; /* page offset of b_addr, - only for _XBF_KMEM buffers */ int b_error; /* error code on I/O */ void (*b_iodone)(struct xfs_buf *bp); @@ -287,11 +273,11 @@ xfs_buf_readahead( } int xfs_buf_get_uncached(struct xfs_buftarg *target, size_t numblks, - xfs_buf_flags_t flags, struct xfs_buf **bpp); + struct xfs_buf **bpp); int xfs_buf_read_uncached(struct xfs_buftarg *target, xfs_daddr_t daddr, - size_t numblks, xfs_buf_flags_t flags, struct xfs_buf **bpp, + size_t numblks, struct xfs_buf **bpp, const struct xfs_buf_ops *ops); -int _xfs_buf_read(struct xfs_buf *bp, xfs_buf_flags_t flags); +int _xfs_buf_read(struct xfs_buf *bp); void xfs_buf_hold(struct xfs_buf *bp); /* Releasing Buffers */ @@ -318,12 +304,20 @@ extern void __xfs_buf_ioerror(struct xfs_buf *bp, int error, #define xfs_buf_ioerror(bp, err) __xfs_buf_ioerror((bp), (err), __this_address) extern void xfs_buf_ioerror_alert(struct xfs_buf *bp, xfs_failaddr_t fa); void xfs_buf_ioend_fail(struct xfs_buf *); -void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize); void __xfs_buf_mark_corrupt(struct xfs_buf *bp, xfs_failaddr_t fa); #define xfs_buf_mark_corrupt(bp) __xfs_buf_mark_corrupt((bp), __this_address) /* Buffer Utility Routines */ -extern void *xfs_buf_offset(struct xfs_buf *, size_t); +static inline void *xfs_buf_offset(struct xfs_buf *bp, size_t offset) +{ + return bp->b_addr + offset; +} + +static inline void xfs_buf_zero(struct xfs_buf *bp, size_t boff, size_t bsize) +{ + memset(bp->b_addr + boff, 0, bsize); +} + extern void xfs_buf_stale(struct xfs_buf *bp); /* Delayed Write Buffer Routines */ @@ -380,7 +374,7 @@ struct xfs_buftarg *xfs_alloc_buftarg(struct xfs_mount *mp, extern void xfs_free_buftarg(struct xfs_buftarg *); extern void xfs_buftarg_wait(struct xfs_buftarg *); extern void xfs_buftarg_drain(struct xfs_buftarg *); -extern int xfs_setsize_buftarg(struct xfs_buftarg *, unsigned int); +int xfs_configure_buftarg(struct xfs_buftarg *btp, unsigned int sectorsize); #define xfs_getsize_buftarg(buftarg) block_size((buftarg)->bt_bdev) #define xfs_readonly_buftarg(buftarg) bdev_read_only((buftarg)->bt_bdev) diff --git a/fs/xfs/xfs_buf_item.c b/fs/xfs/xfs_buf_item.c index 47549cfa61cd..90139e0f3271 100644 --- a/fs/xfs/xfs_buf_item.c +++ b/fs/xfs/xfs_buf_item.c @@ -57,24 +57,6 @@ xfs_buf_log_format_size( 
(blfp->blf_map_size * sizeof(blfp->blf_data_map[0])); } -static inline bool -xfs_buf_item_straddle( - struct xfs_buf *bp, - uint offset, - int first_bit, - int nbits) -{ - void *first, *last; - - first = xfs_buf_offset(bp, offset + (first_bit << XFS_BLF_SHIFT)); - last = xfs_buf_offset(bp, - offset + ((first_bit + nbits) << XFS_BLF_SHIFT)); - - if (last - first != nbits * XFS_BLF_CHUNK) - return true; - return false; -} - /* * Return the number of log iovecs and space needed to log the given buf log * item segment. @@ -91,11 +73,8 @@ xfs_buf_item_size_segment( int *nvecs, int *nbytes) { - struct xfs_buf *bp = bip->bli_buf; int first_bit; int nbits; - int next_bit; - int last_bit; first_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, 0); if (first_bit == -1) @@ -108,15 +87,6 @@ xfs_buf_item_size_segment( nbits = xfs_contig_bits(blfp->blf_data_map, blfp->blf_map_size, first_bit); ASSERT(nbits > 0); - - /* - * Straddling a page is rare because we don't log contiguous - * chunks of unmapped buffers anywhere. - */ - if (nbits > 1 && - xfs_buf_item_straddle(bp, offset, first_bit, nbits)) - goto slow_scan; - (*nvecs)++; *nbytes += nbits * XFS_BLF_CHUNK; @@ -131,40 +101,25 @@ xfs_buf_item_size_segment( } while (first_bit != -1); return; +} -slow_scan: - /* Count the first bit we jumped out of the above loop from */ - (*nvecs)++; - *nbytes += XFS_BLF_CHUNK; - last_bit = first_bit; - while (last_bit != -1) { - /* - * This takes the bit number to start looking from and - * returns the next set bit from there. It returns -1 - * if there are no more bits set or the start bit is - * beyond the end of the bitmap. - */ - next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, - last_bit + 1); - /* - * If we run out of bits, leave the loop, - * else if we find a new set of bits bump the number of vecs, - * else keep scanning the current set of bits. - */ - if (next_bit == -1) { - break; - } else if (next_bit != last_bit + 1 || - xfs_buf_item_straddle(bp, offset, first_bit, nbits)) { - last_bit = next_bit; - first_bit = next_bit; - (*nvecs)++; - nbits = 1; - } else { - last_bit++; - nbits++; - } - *nbytes += XFS_BLF_CHUNK; - } +/* + * Compute the worst case log item overhead for an invalidated buffer with the + * given map count and block size. + */ +unsigned int +xfs_buf_inval_log_space( + unsigned int map_count, + unsigned int blocksize) +{ + unsigned int chunks = DIV_ROUND_UP(blocksize, XFS_BLF_CHUNK); + unsigned int bitmap_size = DIV_ROUND_UP(chunks, NBWORD); + unsigned int ret = + offsetof(struct xfs_buf_log_format, blf_data_map) + + (bitmap_size * sizeof_field(struct xfs_buf_log_format, + blf_data_map[0])); + + return ret * map_count; } /* @@ -277,8 +232,6 @@ xfs_buf_item_format_segment( struct xfs_buf *bp = bip->bli_buf; uint base_size; int first_bit; - int last_bit; - int next_bit; uint nbits; /* copy the flags across from the base format item */ @@ -323,15 +276,6 @@ xfs_buf_item_format_segment( nbits = xfs_contig_bits(blfp->blf_data_map, blfp->blf_map_size, first_bit); ASSERT(nbits > 0); - - /* - * Straddling a page is rare because we don't log contiguous - * chunks of unmapped buffers anywhere. 
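
The overhead formula in xfs_buf_inval_log_space() above is simple enough to evaluate by hand. A self-contained restatement follows, with the XFS constants filled in as assumptions (XFS_BLF_CHUNK = 128 and 32-bit bitmap words, plus an illustrative 24-byte header; the real offsetof value may differ):

#include <stdio.h>

#define BLF_CHUNK	128u	/* assumed XFS_BLF_CHUNK */
#define NBWORD		32u	/* assumed bits per blf_data_map word */
#define BLF_HEADER	24u	/* assumed offsetof(..., blf_data_map) */

static unsigned int inval_log_space(unsigned int map_count,
				    unsigned int blocksize)
{
	unsigned int chunks = (blocksize + BLF_CHUNK - 1) / BLF_CHUNK;
	unsigned int words = (chunks + NBWORD - 1) / NBWORD;

	return (BLF_HEADER + words * 4u) * map_count;
}

int main(void)
{
	/* one map, 4096-byte blocks: 32 chunks -> 1 bitmap word -> 28 bytes */
	printf("%u\n", inval_log_space(1, 4096));
	return 0;
}
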
- */ - if (nbits > 1 && - xfs_buf_item_straddle(bp, offset, first_bit, nbits)) - goto slow_scan; - xfs_buf_item_copy_iovec(lv, vecp, bp, offset, first_bit, nbits); blfp->blf_size++; @@ -347,45 +291,6 @@ xfs_buf_item_format_segment( } while (first_bit != -1); return; - -slow_scan: - ASSERT(bp->b_addr == NULL); - last_bit = first_bit; - nbits = 1; - for (;;) { - /* - * This takes the bit number to start looking from and - * returns the next set bit from there. It returns -1 - * if there are no more bits set or the start bit is - * beyond the end of the bitmap. - */ - next_bit = xfs_next_bit(blfp->blf_data_map, blfp->blf_map_size, - (uint)last_bit + 1); - /* - * If we run out of bits fill in the last iovec and get out of - * the loop. Else if we start a new set of bits then fill in - * the iovec for the series we were looking at and start - * counting the bits in the new one. Else we're still in the - * same set of bits so just keep counting and scanning. - */ - if (next_bit == -1) { - xfs_buf_item_copy_iovec(lv, vecp, bp, offset, - first_bit, nbits); - blfp->blf_size++; - break; - } else if (next_bit != last_bit + 1 || - xfs_buf_item_straddle(bp, offset, first_bit, nbits)) { - xfs_buf_item_copy_iovec(lv, vecp, bp, offset, - first_bit, nbits); - blfp->blf_size++; - first_bit = next_bit; - last_bit = next_bit; - nbits = 1; - } else { - last_bit++; - nbits++; - } - } } /* diff --git a/fs/xfs/xfs_buf_item.h b/fs/xfs/xfs_buf_item.h index 8cde85259a58..e10e324cd245 100644 --- a/fs/xfs/xfs_buf_item.h +++ b/fs/xfs/xfs_buf_item.h @@ -64,6 +64,9 @@ static inline void xfs_buf_dquot_iodone(struct xfs_buf *bp) void xfs_buf_iodone(struct xfs_buf *); bool xfs_buf_log_check_iovec(struct xfs_log_iovec *iovec); +unsigned int xfs_buf_inval_log_space(unsigned int map_count, + unsigned int blocksize); + extern struct kmem_cache *xfs_buf_item_cache; #endif /* __XFS_BUF_ITEM_H__ */ diff --git a/fs/xfs/xfs_buf_item_recover.c b/fs/xfs/xfs_buf_item_recover.c index 05a2f6927c12..d4c5cef5bc43 100644 --- a/fs/xfs/xfs_buf_item_recover.c +++ b/fs/xfs/xfs_buf_item_recover.c @@ -1006,7 +1006,6 @@ xlog_recover_buf_commit_pass2( struct xfs_mount *mp = log->l_mp; struct xfs_buf *bp; int error; - uint buf_flags; xfs_lsn_t lsn; /* @@ -1025,13 +1024,8 @@ xlog_recover_buf_commit_pass2( } trace_xfs_log_recover_buf_recover(log, buf_f); - - buf_flags = 0; - if (buf_f->blf_flags & XFS_BLF_INODE_BUF) - buf_flags |= XBF_UNMAPPED; - error = xfs_buf_read(mp->m_ddev_targp, buf_f->blf_blkno, buf_f->blf_len, - buf_flags, &bp, NULL); + 0, &bp, NULL); if (error) return error; diff --git a/fs/xfs/xfs_buf_mem.c b/fs/xfs/xfs_buf_mem.c index 07bebbfb16ee..dcbfa274e06d 100644 --- a/fs/xfs/xfs_buf_mem.c +++ b/fs/xfs/xfs_buf_mem.c @@ -74,7 +74,7 @@ xmbuf_alloc( /* * We don't want to bother with kmapping data during repair, so don't - * allow highmem pages to back this mapping. + * allow highmem folios to back this mapping. */ mapping_set_gfp_mask(inode->i_mapping, GFP_KERNEL); @@ -117,7 +117,7 @@ xmbuf_free( struct xfs_buftarg *btp) { ASSERT(xfs_buftarg_is_mem(btp)); - ASSERT(percpu_counter_sum(&btp->bt_io_count) == 0); + ASSERT(percpu_counter_sum(&btp->bt_readahead_count) == 0); trace_xmbuf_free(btp); @@ -127,14 +127,13 @@ xmbuf_free( kfree(btp); } -/* Directly map a shmem page into the buffer cache. */ +/* Directly map a shmem folio into the buffer cache. 
*/ int -xmbuf_map_page( +xmbuf_map_backing_mem( struct xfs_buf *bp) { struct inode *inode = file_inode(bp->b_target->bt_file); struct folio *folio = NULL; - struct page *page; loff_t pos = BBTOB(xfs_buf_daddr(bp)); int error; @@ -159,39 +158,17 @@ xmbuf_map_page( return -EIO; } - page = folio_file_page(folio, pos >> PAGE_SHIFT); - /* - * Mark the page dirty so that it won't be reclaimed once we drop the - * (potentially last) reference in xmbuf_unmap_page. + * Mark the folio dirty so that it won't be reclaimed once we drop the + * (potentially last) reference in xfs_buf_free. */ - set_page_dirty(page); - unlock_page(page); + folio_set_dirty(folio); + folio_unlock(folio); - bp->b_addr = page_address(page); - bp->b_pages = bp->b_page_array; - bp->b_pages[0] = page; - bp->b_page_count = 1; + bp->b_addr = folio_address(folio) + offset_in_folio(folio, pos); return 0; } -/* Unmap a shmem page that was mapped into the buffer cache. */ -void -xmbuf_unmap_page( - struct xfs_buf *bp) -{ - struct page *page = bp->b_pages[0]; - - ASSERT(xfs_buftarg_is_mem(bp->b_target)); - - put_page(page); - - bp->b_addr = NULL; - bp->b_pages[0] = NULL; - bp->b_pages = NULL; - bp->b_page_count = 0; -} - /* Is this a valid daddr within the buftarg? */ bool xmbuf_verify_daddr( @@ -205,7 +182,7 @@ xmbuf_verify_daddr( return daddr < (inode->i_sb->s_maxbytes >> BBSHIFT); } -/* Discard the page backing this buffer. */ +/* Discard the folio backing this buffer. */ static void xmbuf_stale( struct xfs_buf *bp) @@ -220,7 +197,7 @@ xmbuf_stale( } /* - * Finalize a buffer -- discard the backing page if it's stale, or run the + * Finalize a buffer -- discard the backing folio if it's stale, or run the * write verifier to detect problems. */ int diff --git a/fs/xfs/xfs_buf_mem.h b/fs/xfs/xfs_buf_mem.h index eed4a7b63232..67d525cc1513 100644 --- a/fs/xfs/xfs_buf_mem.h +++ b/fs/xfs/xfs_buf_mem.h @@ -19,16 +19,14 @@ int xmbuf_alloc(struct xfs_mount *mp, const char *descr, struct xfs_buftarg **btpp); void xmbuf_free(struct xfs_buftarg *btp); -int xmbuf_map_page(struct xfs_buf *bp); -void xmbuf_unmap_page(struct xfs_buf *bp); bool xmbuf_verify_daddr(struct xfs_buftarg *btp, xfs_daddr_t daddr); void xmbuf_trans_bdetach(struct xfs_trans *tp, struct xfs_buf *bp); int xmbuf_finalize(struct xfs_buf *bp); #else # define xfs_buftarg_is_mem(...) (false) -# define xmbuf_map_page(...) (-ENOMEM) -# define xmbuf_unmap_page(...) ((void)0) # define xmbuf_verify_daddr(...) (false) #endif /* CONFIG_XFS_MEMORY_BUFS */ +int xmbuf_map_backing_mem(struct xfs_buf *bp); + #endif /* __XFS_BUF_MEM_H__ */ diff --git a/fs/xfs/xfs_discard.c b/fs/xfs/xfs_discard.c index 3f2403a7b49c..94d0873bcd62 100644 --- a/fs/xfs/xfs_discard.c +++ b/fs/xfs/xfs_discard.c @@ -167,6 +167,14 @@ xfs_discard_extents( return error; } +/* + * Care must be taken setting up the trim cursor as the perags may not have been + * initialised when the cursor is initialised. e.g. a clean mount which hasn't + * read in AGFs and the first operation run on the mounted fs is a trim. This + * can result in perag fields that aren't initialised until + * xfs_trim_gather_extents() calls xfs_alloc_read_agf() to lock down the AG for + * the free space search. + */ struct xfs_trim_cur { xfs_agblock_t start; xfs_extlen_t count; @@ -204,6 +212,14 @@ xfs_trim_gather_extents( if (error) goto out_trans_cancel; + /* + * First time through tcur->count will not have been initialised as + * pag->pagf_longest is not guaranteed to be valid before we read + * the AGF buffer above. 
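
The lazy initialisation that follows in xfs_trim_gather_extents() boils down to a one-line fixup once the AGF is locked; seen in isolation (field names simplified):

struct trim_cur { unsigned int start, end, count, minlen; };

/* pagf_longest is only trustworthy after xfs_alloc_read_agf(), so the
 * cursor's count is seeded on the first pass instead of at init time. */
static void trim_cur_seed(struct trim_cur *tcur, unsigned int pagf_longest)
{
	if (!tcur->count)
		tcur->count = pagf_longest;
}
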
+ */ + if (!tcur->count) + tcur->count = pag->pagf_longest; + if (tcur->by_bno) { /* sub-AG discard request always starts at tcur->start */ cur = xfs_bnobt_init_cursor(mp, tp, agbp, pag); @@ -350,7 +366,6 @@ xfs_trim_perag_extents( { struct xfs_trim_cur tcur = { .start = start, - .count = pag->pagf_longest, .end = end, .minlen = minlen, }; @@ -844,7 +859,8 @@ xfs_ioc_trim( if (!capable(CAP_SYS_ADMIN)) return -EPERM; - if (mp->m_rtdev_targp && + + if (mp->m_rtdev_targp && !xfs_has_zoned(mp) && bdev_max_discard_sectors(mp->m_rtdev_targp->bt_bdev)) rt_bdev = mp->m_rtdev_targp->bt_bdev; if (!bdev_max_discard_sectors(mp->m_ddev_targp->bt_bdev) && !rt_bdev) diff --git a/fs/xfs/xfs_dquot.c b/fs/xfs/xfs_dquot.c index edbc521870a1..b4e32f0860b7 100644 --- a/fs/xfs/xfs_dquot.c +++ b/fs/xfs/xfs_dquot.c @@ -1186,9 +1186,8 @@ xfs_qm_dqflush_done( if (test_bit(XFS_LI_IN_AIL, &lip->li_flags) && (lip->li_lsn == qlip->qli_flush_lsn || test_bit(XFS_LI_FAILED, &lip->li_flags))) { - spin_lock(&ailp->ail_lock); - xfs_clear_li_failed(lip); + clear_bit(XFS_LI_FAILED, &lip->li_flags); if (lip->li_lsn == qlip->qli_flush_lsn) { /* xfs_ail_update_finish() drops the AIL lock */ tail_lsn = xfs_ail_delete_one(ailp, lip); diff --git a/fs/xfs/xfs_extent_busy.c b/fs/xfs/xfs_extent_busy.c index ea43c9a6e54c..da3161572735 100644 --- a/fs/xfs/xfs_extent_busy.c +++ b/fs/xfs/xfs_extent_busy.c @@ -671,7 +671,7 @@ xfs_extent_busy_wait_all( while ((pag = xfs_perag_next(mp, pag))) xfs_extent_busy_wait_group(pag_group(pag)); - if (xfs_has_rtgroups(mp)) + if (xfs_has_rtgroups(mp) && !xfs_has_zoned(mp)) while ((rtg = xfs_rtgroup_next(mp, rtg))) xfs_extent_busy_wait_group(rtg_group(rtg)); } diff --git a/fs/xfs/xfs_extfree_item.c b/fs/xfs/xfs_extfree_item.c index a25c713ff888..d574f5f639fa 100644 --- a/fs/xfs/xfs_extfree_item.c +++ b/fs/xfs/xfs_extfree_item.c @@ -29,6 +29,7 @@ #include "xfs_inode.h" #include "xfs_rtbitmap.h" #include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" struct kmem_cache *xfs_efi_cache; struct kmem_cache *xfs_efd_cache; @@ -82,6 +83,11 @@ xfs_efi_item_size( *nbytes += xfs_efi_log_format_sizeof(efip->efi_format.efi_nextents); } +unsigned int xfs_efi_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_efi_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given efi log item. We use only 1 iovec, and we point that @@ -253,6 +259,11 @@ xfs_efd_item_size( *nbytes += xfs_efd_log_format_sizeof(efdp->efd_format.efd_nextents); } +unsigned int xfs_efd_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_efd_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given efd log item. 
We use only 1 iovec, and we point that @@ -767,21 +778,35 @@ xfs_rtextent_free_finish_item( trace_xfs_extent_free_deferred(mp, xefi); - if (!(xefi->xefi_flags & XFS_EFI_CANCELLED)) { - if (*rtgp != to_rtg(xefi->xefi_group)) { - *rtgp = to_rtg(xefi->xefi_group); - xfs_rtgroup_lock(*rtgp, XFS_RTGLOCK_BITMAP); - xfs_rtgroup_trans_join(tp, *rtgp, - XFS_RTGLOCK_BITMAP); - } - error = xfs_rtfree_blocks(tp, *rtgp, - xefi->xefi_startblock, xefi->xefi_blockcount); + if (xefi->xefi_flags & XFS_EFI_CANCELLED) + goto done; + + if (*rtgp != to_rtg(xefi->xefi_group)) { + unsigned int lock_flags; + + if (xfs_has_zoned(mp)) + lock_flags = XFS_RTGLOCK_RMAP; + else + lock_flags = XFS_RTGLOCK_BITMAP; + + *rtgp = to_rtg(xefi->xefi_group); + xfs_rtgroup_lock(*rtgp, lock_flags); + xfs_rtgroup_trans_join(tp, *rtgp, lock_flags); } + + if (xfs_has_zoned(mp)) { + error = xfs_zone_free_blocks(tp, *rtgp, xefi->xefi_startblock, + xefi->xefi_blockcount); + } else { + error = xfs_rtfree_blocks(tp, *rtgp, xefi->xefi_startblock, + xefi->xefi_blockcount); + } + if (error == -EAGAIN) { xfs_efd_from_efi(efdp); return error; } - +done: xfs_efd_add_extent(efdp, xefi); xfs_extent_free_cancel_item(item); return error; diff --git a/fs/xfs/xfs_extfree_item.h b/fs/xfs/xfs_extfree_item.h index 41b7c4306079..c8402040410b 100644 --- a/fs/xfs/xfs_extfree_item.h +++ b/fs/xfs/xfs_extfree_item.h @@ -94,4 +94,7 @@ void xfs_extent_free_defer_add(struct xfs_trans *tp, struct xfs_extent_free_item *xefi, struct xfs_defer_pending **dfpp); +unsigned int xfs_efi_log_space(unsigned int nr); +unsigned int xfs_efd_log_space(unsigned int nr); + #endif /* __XFS_EXTFREE_ITEM_H__ */ diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index f7a7d89c345e..48254a72071b 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -25,6 +25,8 @@ #include "xfs_iomap.h" #include "xfs_reflink.h" #include "xfs_file.h" +#include "xfs_aops.h" +#include "xfs_zone_alloc.h" #include <linux/dax.h> #include <linux/falloc.h> @@ -150,7 +152,7 @@ xfs_file_fsync( * ensure newly written file data make it to disk before logging the new * inode size in case of an extending write. 
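
The fsync change that follows makes the rt-device cache flush conditional on the rt device actually being separate from the data device. The device-selection rule reduces to a small pure function; a sketch with invented struct names:

#include <stdbool.h>
#include <stddef.h>

struct btarg { int id; };
struct mnt { struct btarg *ddev, *logdev, *rtdev; };

/* Which device needs a post-writeback cache flush; NULL means none,
 * because forcing the log on a shared device already covers it. */
static struct btarg *fsync_flush_target(struct mnt *mp, bool rt_inode)
{
	if (rt_inode && mp->rtdev != mp->ddev)
		return mp->rtdev;	/* data lives on a separate rt device */
	if (mp->logdev != mp->ddev)
		return mp->ddev;	/* external log: flush the data device */
	return NULL;
}
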
*/ - if (XFS_IS_REALTIME_INODE(ip)) + if (XFS_IS_REALTIME_INODE(ip) && mp->m_rtdev_targp != mp->m_ddev_targp) error = blkdev_issue_flush(mp->m_rtdev_targp->bt_bdev); else if (mp->m_logdev_targp != mp->m_ddev_targp) error = blkdev_issue_flush(mp->m_ddev_targp->bt_bdev); @@ -360,7 +362,8 @@ xfs_file_write_zero_eof( struct iov_iter *from, unsigned int *iolock, size_t count, - bool *drained_dio) + bool *drained_dio, + struct xfs_zone_alloc_ctx *ac) { struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); loff_t isize; @@ -414,7 +417,7 @@ xfs_file_write_zero_eof( trace_xfs_zero_eof(ip, isize, iocb->ki_pos - isize); xfs_ilock(ip, XFS_MMAPLOCK_EXCL); - error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, NULL); + error = xfs_zero_range(ip, isize, iocb->ki_pos - isize, ac, NULL); xfs_iunlock(ip, XFS_MMAPLOCK_EXCL); return error; @@ -431,7 +434,8 @@ STATIC ssize_t xfs_file_write_checks( struct kiocb *iocb, struct iov_iter *from, - unsigned int *iolock) + unsigned int *iolock, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = iocb->ki_filp->f_mapping->host; size_t count = iov_iter_count(from); @@ -481,7 +485,7 @@ restart: */ if (iocb->ki_pos > i_size_read(inode)) { error = xfs_file_write_zero_eof(iocb, from, iolock, count, - &drained_dio); + &drained_dio, ac); if (error == 1) goto restart; if (error) @@ -491,6 +495,48 @@ restart: return kiocb_modified(iocb); } +static ssize_t +xfs_zoned_write_space_reserve( + struct xfs_inode *ip, + struct kiocb *iocb, + struct iov_iter *from, + unsigned int flags, + struct xfs_zone_alloc_ctx *ac) +{ + loff_t count = iov_iter_count(from); + int error; + + if (iocb->ki_flags & IOCB_NOWAIT) + flags |= XFS_ZR_NOWAIT; + + /* + * Check the rlimit and LFS boundary first so that we don't over-reserve + * by possibly a lot. + * + * The generic write path will redo this check later, and it might have + * changed by then. If it got expanded we'll stick to our earlier + * smaller limit, and if it is decreased the new smaller limit will be + * used and our extra space reservation will be returned after finishing + * the write. + */ + error = generic_write_check_limits(iocb->ki_filp, iocb->ki_pos, &count); + if (error) + return error; + + /* + * Sloppily round up count to file system blocks. + * + * This will often reserve an extra block, but that avoids having to look + * at the start offset, which isn't stable for O_APPEND until taking the + * iolock. Also we need to reserve a block each for zeroing the old + * EOF block and the new start block if they are unaligned. + * + * Any remaining block will be returned after the write. 
+ */ + return xfs_zoned_space_reserve(ip, + XFS_B_TO_FSB(ip->i_mount, count) + 1 + 2, flags, ac); +} + static int xfs_dio_write_end_io( struct kiocb *iocb, @@ -503,6 +549,9 @@ xfs_dio_write_end_io( loff_t offset = iocb->ki_pos; unsigned int nofs_flag; + ASSERT(!xfs_is_zoned_inode(ip) || + !(flags & (IOMAP_DIO_UNWRITTEN | IOMAP_DIO_COW))); + trace_xfs_end_io_direct_write(ip, offset, size); if (xfs_is_shutdown(ip->i_mount)) @@ -527,7 +576,10 @@ xfs_dio_write_end_io( nofs_flag = memalloc_nofs_save(); if (flags & IOMAP_DIO_COW) { - error = xfs_reflink_end_cow(ip, offset, size); + if (iocb->ki_flags & IOCB_ATOMIC) + error = xfs_reflink_end_atomic_cow(ip, offset, size); + else + error = xfs_reflink_end_cow(ip, offset, size); if (error) goto out; } @@ -582,14 +634,51 @@ static const struct iomap_dio_ops xfs_dio_write_ops = { .end_io = xfs_dio_write_end_io, }; +static void +xfs_dio_zoned_submit_io( + const struct iomap_iter *iter, + struct bio *bio, + loff_t file_offset) +{ + struct xfs_mount *mp = XFS_I(iter->inode)->i_mount; + struct xfs_zone_alloc_ctx *ac = iter->private; + xfs_filblks_t count_fsb; + struct iomap_ioend *ioend; + + count_fsb = XFS_B_TO_FSB(mp, bio->bi_iter.bi_size); + if (count_fsb > ac->reserved_blocks) { + xfs_err(mp, +"allocation (%lld) larger than reservation (%lld).", + count_fsb, ac->reserved_blocks); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + bio_io_error(bio); + return; + } + ac->reserved_blocks -= count_fsb; + + bio->bi_end_io = xfs_end_bio; + ioend = iomap_init_ioend(iter->inode, bio, file_offset, + IOMAP_IOEND_DIRECT); + xfs_zone_alloc_and_submit(ioend, &ac->open_zone); +} + +static const struct iomap_dio_ops xfs_dio_zoned_write_ops = { + .bio_set = &iomap_ioend_bioset, + .submit_io = xfs_dio_zoned_submit_io, + .end_io = xfs_dio_write_end_io, +}; + /* - * Handle block aligned direct I/O writes + * Handle block aligned direct I/O writes. */ static noinline ssize_t xfs_file_dio_write_aligned( struct xfs_inode *ip, struct kiocb *iocb, - struct iov_iter *from) + struct iov_iter *from, + const struct iomap_ops *ops, + const struct iomap_dio_ops *dops, + struct xfs_zone_alloc_ctx *ac) { unsigned int iolock = XFS_IOLOCK_SHARED; ssize_t ret; @@ -597,7 +686,7 @@ xfs_file_dio_write_aligned( ret = xfs_ilock_iocb_for_write(iocb, &iolock); if (ret) return ret; - ret = xfs_file_write_checks(iocb, from, &iolock); + ret = xfs_file_write_checks(iocb, from, &iolock, ac); if (ret) goto out_unlock; @@ -611,8 +700,94 @@ xfs_file_dio_write_aligned( iolock = XFS_IOLOCK_SHARED; } trace_xfs_file_direct_write(iocb, from); - ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops, - &xfs_dio_write_ops, 0, NULL, 0); + ret = iomap_dio_rw(iocb, from, ops, dops, 0, ac, 0); +out_unlock: + xfs_iunlock(ip, iolock); + return ret; +} + +/* + * Handle block aligned direct I/O writes to zoned devices. 
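
The reservation sizing in xfs_zoned_write_space_reserve() above is worth restating as arithmetic: round the byte count up to filesystem blocks, add one block of rounding slop (the start offset is not yet stable under O_APPEND), and add two more for possibly zeroing the old EOF block and an unaligned new start block. A sketch:

/* Worst-case zoned-write reservation, in filesystem blocks. */
static long long zoned_write_reservation(long long count, unsigned int blocklog)
{
	long long blocksize = 1LL << blocklog;
	long long fsbs = (count + blocksize - 1) >> blocklog;

	return fsbs + 1 + 2;	/* slop + old-EOF and start-block zeroing */
}

For a 17 KiB write on 4 KiB blocks this reserves 5 + 3 = 8 blocks; any surplus is returned once the write finishes.
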
+ */ +static noinline ssize_t +xfs_file_dio_write_zoned( + struct xfs_inode *ip, + struct kiocb *iocb, + struct iov_iter *from) +{ + struct xfs_zone_alloc_ctx ac = { }; + ssize_t ret; + + ret = xfs_zoned_write_space_reserve(ip, iocb, from, 0, &ac); + if (ret < 0) + return ret; + ret = xfs_file_dio_write_aligned(ip, iocb, from, + &xfs_zoned_direct_write_iomap_ops, + &xfs_dio_zoned_write_ops, &ac); + xfs_zoned_space_unreserve(ip, &ac); + return ret; +} + +/* + * Handle block atomic writes. + * + * Two methods of atomic writes are supported: + * - REQ_ATOMIC-based, which would typically use some form of HW offload in the + * disk + * - COW-based, which uses a COW fork as a staging extent for data updates + * before atomically updating extent mappings for the range being written + * + */ +static noinline ssize_t +xfs_file_dio_write_atomic( + struct xfs_inode *ip, + struct kiocb *iocb, + struct iov_iter *from) +{ + unsigned int iolock = XFS_IOLOCK_SHARED; + ssize_t ret, ocount = iov_iter_count(from); + const struct iomap_ops *dops; + + /* + * HW offload should be faster, so try that first if it is already + * known that the write length is not too large. + */ + if (ocount > xfs_inode_buftarg(ip)->bt_bdev_awu_max) + dops = &xfs_atomic_write_cow_iomap_ops; + else + dops = &xfs_direct_write_iomap_ops; + +retry: + ret = xfs_ilock_iocb_for_write(iocb, &iolock); + if (ret) + return ret; + + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); + if (ret) + goto out_unlock; + + /* Demote similar to xfs_file_dio_write_aligned() */ + if (iolock == XFS_IOLOCK_EXCL) { + xfs_ilock_demote(ip, XFS_IOLOCK_EXCL); + iolock = XFS_IOLOCK_SHARED; + } + + trace_xfs_file_direct_write(iocb, from); + ret = iomap_dio_rw(iocb, from, dops, &xfs_dio_write_ops, + 0, NULL, 0); + + /* + * The retry mechanism is based on the ->iomap_begin method returning + * -ENOPROTOOPT, which would be when the REQ_ATOMIC-based write is not + * possible. The REQ_ATOMIC-based method will typically not be possible + * if the write spans multiple extents or the disk blocks are misaligned. + */ + if (ret == -ENOPROTOOPT && dops == &xfs_direct_write_iomap_ops) { + xfs_iunlock(ip, iolock); + dops = &xfs_atomic_write_cow_iomap_ops; + goto retry; + } + out_unlock: if (iolock) xfs_iunlock(ip, iolock); @@ -675,7 +850,7 @@ retry_exclusive: goto out_unlock; } - ret = xfs_file_write_checks(iocb, from, &iolock); + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); if (ret) goto out_unlock; @@ -721,9 +896,23 @@ xfs_file_dio_write( /* direct I/O must be aligned to device logical sector size */ if ((iocb->ki_pos | count) & target->bt_logical_sectormask) return -EINVAL; - if ((iocb->ki_pos | count) & ip->i_mount->m_blockmask) + + /* + * For always COW inodes we also must check the alignment of each + * individual iovec segment, as they could end up with different + * I/Os due to the way bio_iov_iter_get_pages works, and we'd + * then overwrite an already written block.
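
The HW-first/COW-fallback policy in xfs_file_dio_write_atomic() can be modelled without any XFS types; the submit callback below is a hypothetical stand-in for the iomap plumbing:

#include <errno.h>
#include <stdbool.h>
#include <stddef.h>

/* Try REQ_ATOMIC hardware offload first when the length fits, then fall
 * back to the COW staging method on -ENOPROTOOPT. */
static int atomic_write(size_t len, size_t hw_max,
			int (*submit)(bool use_cow))
{
	bool use_cow = len > hw_max;
	int ret = submit(use_cow);

	if (ret == -ENOPROTOOPT && !use_cow)
		ret = submit(true);	/* retry once with the COW method */
	return ret;
}
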
+ */ + if (((iocb->ki_pos | count) & ip->i_mount->m_blockmask) || + (xfs_is_always_cow_inode(ip) && + (iov_iter_alignment(from) & ip->i_mount->m_blockmask))) return xfs_file_dio_write_unaligned(ip, iocb, from); - return xfs_file_dio_write_aligned(ip, iocb, from); + if (xfs_is_zoned_inode(ip)) + return xfs_file_dio_write_zoned(ip, iocb, from); + if (iocb->ki_flags & IOCB_ATOMIC) + return xfs_file_dio_write_atomic(ip, iocb, from); + return xfs_file_dio_write_aligned(ip, iocb, from, + &xfs_direct_write_iomap_ops, &xfs_dio_write_ops, NULL); } static noinline ssize_t @@ -740,7 +929,7 @@ xfs_file_dax_write( ret = xfs_ilock_iocb(iocb, iolock); if (ret) return ret; - ret = xfs_file_write_checks(iocb, from, &iolock); + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); if (ret) goto out; @@ -784,7 +973,7 @@ write_retry: if (ret) return ret; - ret = xfs_file_write_checks(iocb, from, &iolock); + ret = xfs_file_write_checks(iocb, from, &iolock, NULL); if (ret) goto out; @@ -832,6 +1021,67 @@ out: } STATIC ssize_t +xfs_file_buffered_write_zoned( + struct kiocb *iocb, + struct iov_iter *from) +{ + struct xfs_inode *ip = XFS_I(iocb->ki_filp->f_mapping->host); + struct xfs_mount *mp = ip->i_mount; + unsigned int iolock = XFS_IOLOCK_EXCL; + bool cleared_space = false; + struct xfs_zone_alloc_ctx ac = { }; + ssize_t ret; + + ret = xfs_zoned_write_space_reserve(ip, iocb, from, XFS_ZR_GREEDY, &ac); + if (ret < 0) + return ret; + + ret = xfs_ilock_iocb(iocb, iolock); + if (ret) + goto out_unreserve; + + ret = xfs_file_write_checks(iocb, from, &iolock, &ac); + if (ret) + goto out_unlock; + + /* + * Truncate the iter to the length that we were actually able to + * allocate blocks for. This needs to happen after + * xfs_file_write_checks, because that assigns ki_pos for O_APPEND + * writes. + */ + iov_iter_truncate(from, + XFS_FSB_TO_B(mp, ac.reserved_blocks) - + (iocb->ki_pos & mp->m_blockmask)); + if (!iov_iter_count(from)) + goto out_unlock; + +retry: + trace_xfs_file_buffered_write(iocb, from); + ret = iomap_file_buffered_write(iocb, from, + &xfs_buffered_write_iomap_ops, &ac); + if (ret == -ENOSPC && !cleared_space) { + /* + * Kick off writeback to convert delalloc space and release the + * usually too pessimistic indirect block reservations. + */ + xfs_flush_inodes(mp); + cleared_space = true; + goto retry; + } + +out_unlock: + xfs_iunlock(ip, iolock); +out_unreserve: + xfs_zoned_space_unreserve(ip, &ac); + if (ret > 0) { + XFS_STATS_ADD(mp, xs_write_bytes, ret); + ret = generic_write_sync(iocb, ret); + } + return ret; +} + +STATIC ssize_t xfs_file_write_iter( struct kiocb *iocb, struct iov_iter *from) @@ -853,14 +1103,12 @@ xfs_file_write_iter( return xfs_file_dax_write(iocb, from); if (iocb->ki_flags & IOCB_ATOMIC) { - /* - * Currently only atomic writing of a single FS block is - * supported. It would be possible to atomic write smaller than - * a FS block, but there is no requirement to support this. - * Note that iomap also does not support this yet. 
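
iov_iter_alignment(), used in the new always-COW check above, effectively ORs every segment's base address and length together so that any set low bit reveals the coarsest misalignment. A simplified userspace equivalent:

#include <stddef.h>
#include <sys/uio.h>

static unsigned long iov_alignment_bits(const struct iovec *iov, int nr)
{
	unsigned long bits = 0;

	for (int i = 0; i < nr; i++)
		bits |= (unsigned long)iov[i].iov_base | iov[i].iov_len;
	return bits;
}

/* Misaligned for direct I/O if (iov_alignment_bits(...) & m_blockmask). */
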
- */ - if (ocount != ip->i_mount->m_sb.sb_blocksize) + if (ocount < xfs_get_atomic_write_min(ip)) return -EINVAL; + + if (ocount > xfs_get_atomic_write_max(ip)) + return -EINVAL; + ret = generic_atomic_write_valid(iocb, from); if (ret) return ret; @@ -878,6 +1126,8 @@ xfs_file_write_iter( return ret; } + if (xfs_is_zoned_inode(ip)) + return xfs_file_buffered_write_zoned(iocb, from); return xfs_file_buffered_write(iocb, from); } @@ -932,7 +1182,8 @@ static int xfs_falloc_collapse_range( struct file *file, loff_t offset, - loff_t len) + loff_t len, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(file); loff_t new_size = i_size_read(inode) - len; @@ -948,7 +1199,7 @@ xfs_falloc_collapse_range( if (offset + len >= i_size_read(inode)) return -EINVAL; - error = xfs_collapse_file_space(XFS_I(inode), offset, len); + error = xfs_collapse_file_space(XFS_I(inode), offset, len, ac); if (error) return error; return xfs_falloc_setsize(file, new_size); @@ -1004,7 +1255,8 @@ xfs_falloc_zero_range( struct file *file, int mode, loff_t offset, - loff_t len) + loff_t len, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(file); unsigned int blksize = i_blocksize(inode); @@ -1017,7 +1269,7 @@ xfs_falloc_zero_range( if (error) return error; - error = xfs_free_file_space(XFS_I(inode), offset, len); + error = xfs_free_file_space(XFS_I(inode), offset, len, ac); if (error) return error; @@ -1088,22 +1340,18 @@ xfs_falloc_allocate_range( FALLOC_FL_INSERT_RANGE | FALLOC_FL_UNSHARE_RANGE) STATIC long -xfs_file_fallocate( +__xfs_file_fallocate( struct file *file, int mode, loff_t offset, - loff_t len) + loff_t len, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(file); struct xfs_inode *ip = XFS_I(inode); long error; uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; - if (!S_ISREG(inode->i_mode)) - return -EINVAL; - if (mode & ~XFS_FALLOC_FL_SUPPORTED) - return -EOPNOTSUPP; - xfs_ilock(ip, iolock); error = xfs_break_layouts(inode, &iolock, BREAK_UNMAP); if (error) @@ -1124,16 +1372,16 @@ xfs_file_fallocate( switch (mode & FALLOC_FL_MODE_MASK) { case FALLOC_FL_PUNCH_HOLE: - error = xfs_free_file_space(ip, offset, len); + error = xfs_free_file_space(ip, offset, len, ac); break; case FALLOC_FL_COLLAPSE_RANGE: - error = xfs_falloc_collapse_range(file, offset, len); + error = xfs_falloc_collapse_range(file, offset, len, ac); break; case FALLOC_FL_INSERT_RANGE: error = xfs_falloc_insert_range(file, offset, len); break; case FALLOC_FL_ZERO_RANGE: - error = xfs_falloc_zero_range(file, mode, offset, len); + error = xfs_falloc_zero_range(file, mode, offset, len, ac); break; case FALLOC_FL_UNSHARE_RANGE: error = xfs_falloc_unshare_range(file, mode, offset, len); @@ -1154,6 +1402,54 @@ out_unlock: return error; } +static long +xfs_file_zoned_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct xfs_zone_alloc_ctx ac = { }; + struct xfs_inode *ip = XFS_I(file_inode(file)); + int error; + + error = xfs_zoned_space_reserve(ip, 2, XFS_ZR_RESERVED, &ac); + if (error) + return error; + error = __xfs_file_fallocate(file, mode, offset, len, &ac); + xfs_zoned_space_unreserve(ip, &ac); + return error; +} + +static long +xfs_file_fallocate( + struct file *file, + int mode, + loff_t offset, + loff_t len) +{ + struct inode *inode = file_inode(file); + + if (!S_ISREG(inode->i_mode)) + return -EINVAL; + if (mode & ~XFS_FALLOC_FL_SUPPORTED) + return -EOPNOTSUPP; + + /* + * For zoned file systems, zeroing the first and last block of a hole + * punch 
requires allocating a new block to rewrite the remaining data + * and new zeroes out of place. Get reservations for those before + * taking the iolock. Dip into the reserved pool because we are + * expected to be able to punch a hole even on a completely full + * file system. + */ + if (xfs_is_zoned_inode(XFS_I(inode)) && + (mode & (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_ZERO_RANGE | + FALLOC_FL_COLLAPSE_RANGE))) + return xfs_file_zoned_fallocate(file, mode, offset, len); + return __xfs_file_fallocate(file, mode, offset, len, NULL); +} + STATIC int xfs_file_fadvise( struct file *file, @@ -1261,7 +1557,7 @@ xfs_file_open( if (xfs_is_shutdown(XFS_M(inode->i_sb))) return -EIO; file->f_mode |= FMODE_NOWAIT | FMODE_CAN_ODIRECT; - if (xfs_inode_can_atomicwrite(XFS_I(inode))) + if (xfs_get_atomic_write_min(XFS_I(inode)) > 0) file->f_mode |= FMODE_CAN_ATOMIC_WRITE; return generic_file_open(inode, file); } @@ -1347,15 +1643,22 @@ xfs_file_release( * blocks. This avoids open/read/close workloads from removing EOF * blocks that other writers depend upon to reduce fragmentation. * + * Inodes on the zoned RT device never have preallocations, so skip + * taking the locks below. + */ + if (!inode->i_nlink || + !(file->f_mode & FMODE_WRITE) || + (ip->i_diflags & XFS_DIFLAG_APPEND) || + xfs_is_zoned_inode(ip)) + return 0; + + /* * If we can't get the iolock just skip truncating the blocks past EOF * because we could deadlock with the mmap_lock otherwise. We'll get * another chance to drop them once the last reference to the inode is * dropped, so we'll never leak blocks permanently. */ - if (inode->i_nlink && - (file->f_mode & FMODE_WRITE) && - !(ip->i_diflags & XFS_DIFLAG_APPEND) && - !xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) && + if (!xfs_iflags_test(ip, XFS_EOFBLOCKS_RELEASED) && xfs_ilock_nowait(ip, XFS_IOLOCK_EXCL)) { if (xfs_can_free_eofblocks(ip) && !xfs_iflags_test_and_set(ip, XFS_EOFBLOCKS_RELEASED)) @@ -1451,9 -1754,6 @@ xfs_dax_read_fault( trace_xfs_read_fault(ip, order); - ret = filemap_fsnotify_fault(vmf); - if (unlikely(ret)) - return ret; xfs_ilock(ip, XFS_MMAPLOCK_SHARED); ret = xfs_dax_fault_locked(vmf, order, false); xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); @@ -1472,9 +1772,10 @@ xfs_dax_read_fault( * i_lock (XFS - extent map serialisation) */ static vm_fault_t -xfs_write_fault( +__xfs_write_fault( struct vm_fault *vmf, - unsigned int order) + unsigned int order, + struct xfs_zone_alloc_ctx *ac) { struct inode *inode = file_inode(vmf->vma->vm_file); struct xfs_inode *ip = XFS_I(inode); @@ -1482,16 +1783,6 @@ xfs_write_fault( vm_fault_t ret; trace_xfs_write_fault(ip, order); - /* - * Usually we get here from ->page_mkwrite callback but in case of DAX - * we will get here also for ordinary write fault. Handle HSM - * notifications for that case.
- */ - if (IS_DAX(inode)) { - ret = filemap_fsnotify_fault(vmf); - if (unlikely(ret)) - return ret; - } sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); @@ -1511,13 +1802,50 @@ if (IS_DAX(inode)) ret = xfs_dax_fault_locked(vmf, order, true); else - ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops); + ret = iomap_page_mkwrite(vmf, &xfs_buffered_write_iomap_ops, + ac); xfs_iunlock(ip, lock_mode); sb_end_pagefault(inode->i_sb); return ret; } +static vm_fault_t +xfs_write_fault_zoned( + struct vm_fault *vmf, + unsigned int order) +{ + struct xfs_inode *ip = XFS_I(file_inode(vmf->vma->vm_file)); + unsigned int len = folio_size(page_folio(vmf->page)); + struct xfs_zone_alloc_ctx ac = { }; + int error; + vm_fault_t ret; + + /* + * This could over-allocate as it doesn't check for truncation. + * + * But as the overallocation is limited to less than a folio and will be + * released instantly, that's just fine. + */ + error = xfs_zoned_space_reserve(ip, XFS_B_TO_FSB(ip->i_mount, len), 0, + &ac); + if (error < 0) + return vmf_fs_error(error); + ret = __xfs_write_fault(vmf, order, &ac); + xfs_zoned_space_unreserve(ip, &ac); + return ret; +} + +static vm_fault_t +xfs_write_fault( + struct vm_fault *vmf, + unsigned int order) +{ + if (xfs_is_zoned_inode(XFS_I(file_inode(vmf->vma->vm_file)))) + return xfs_write_fault_zoned(vmf, order); + return __xfs_write_fault(vmf, order, NULL); +} + static inline bool xfs_is_write_fault( struct vm_fault *vmf) @@ -1626,7 +1954,8 @@ const struct file_operations xfs_file_operations = { .fadvise = xfs_file_fadvise, .remap_file_range = xfs_file_remap_range, .fop_flags = FOP_MMAP_SYNC | FOP_BUFFER_RASYNC | - FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE, + FOP_BUFFER_WASYNC | FOP_DIO_PARALLEL_WRITE | + FOP_DONTCACHE, }; const struct file_operations xfs_dir_file_operations = { diff --git a/fs/xfs/xfs_filestream.c b/fs/xfs/xfs_filestream.c index a961aa420c48..044918fbae06 100644 --- a/fs/xfs/xfs_filestream.c +++ b/fs/xfs/xfs_filestream.c @@ -304,11 +304,9 @@ xfs_filestream_create_association( * for us, so all we need to do here is take another active reference to * the perag for the cached association. * - * If we fail to store the association, we need to drop the fstrms - * counter as well as drop the perag reference we take here for the - * item. We do not need to return an error for this failure - as long as - * we return a referenced AG, the allocation can still go ahead just - * fine. + * If we fail to store the association, we do not need to return an + * error for this failure - as long as we return a referenced AG, the + * allocation can still go ahead just fine.
*/ item = kmalloc(sizeof(*item), GFP_KERNEL | __GFP_RETRY_MAYFAIL); if (!item) @@ -316,14 +314,9 @@ xfs_filestream_create_association( atomic_inc(&pag_group(args->pag)->xg_active_ref); item->pag = args->pag; - error = xfs_mru_cache_insert(mp->m_filestream, pino, &item->mru); - if (error) - goto out_free_item; + xfs_mru_cache_insert(mp->m_filestream, pino, &item->mru); return 0; -out_free_item: - xfs_perag_rele(item->pag); - kfree(item); out_put_fstrms: atomic_dec(&args->pag->pagf_fstrms); return 0; diff --git a/fs/xfs/xfs_fsmap.c b/fs/xfs/xfs_fsmap.c index 1dbd2d75f7ae..414b27a86458 100644 --- a/fs/xfs/xfs_fsmap.c +++ b/fs/xfs/xfs_fsmap.c @@ -876,21 +876,58 @@ xfs_getfsmap_rtdev_rmapbt( const struct xfs_fsmap *keys, struct xfs_getfsmap_info *info) { + struct xfs_fsmap key0 = *keys; /* struct copy */ struct xfs_mount *mp = tp->t_mountp; struct xfs_rtgroup *rtg = NULL; struct xfs_btree_cur *bt_cur = NULL; + xfs_daddr_t rtstart_daddr; xfs_rtblock_t start_rtb; xfs_rtblock_t end_rtb; xfs_rgnumber_t start_rg, end_rg; uint64_t eofs; int error = 0; - eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); - if (keys[0].fmr_physical >= eofs) + eofs = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart + mp->m_sb.sb_rblocks); + if (key0.fmr_physical >= eofs) return 0; - start_rtb = xfs_daddr_to_rtb(mp, keys[0].fmr_physical); - end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical)); + /* + * On zoned filesystems with an internal rt volume, the volume comes + * immediately after the end of the data volume. However, the + * xfs_rtblock_t address space is relative to the start of the data + * device, which means that the first @rtstart fsblocks do not actually + * point anywhere. If a fsmap query comes in with the low key starting + * below @rtstart, report it as "owned by filesystem". + */ + rtstart_daddr = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart); + if (xfs_has_zoned(mp) && key0.fmr_physical < rtstart_daddr) { + struct xfs_fsmap_irec frec = { + .owner = XFS_RMAP_OWN_FS, + .len_daddr = rtstart_daddr, + }; + + /* + * Adjust the start of the query range if we're picking up from + * a previous round, and only emit the record if we haven't + * already gone past. + */ + key0.fmr_physical += key0.fmr_length; + if (key0.fmr_physical < rtstart_daddr) { + error = xfs_getfsmap_helper(tp, info, &frec); + if (error) + return error; + + key0.fmr_physical = rtstart_daddr; + } + + /* Zero the other fields to avoid further adjustments. */ + key0.fmr_owner = 0; + key0.fmr_offset = 0; + key0.fmr_length = 0; + } + + start_rtb = xfs_daddr_to_rtb(mp, key0.fmr_physical); + end_rtb = xfs_daddr_to_rtb(mp, min(eofs - 1, keys[1].fmr_physical)); info->missing_owner = XFS_FMR_OWN_FREE; /* @@ -898,12 +935,12 @@ xfs_getfsmap_rtdev_rmapbt( * low to the fsmap low key and max out the high key to the end * of the rtgroup. */ - info->low.rm_offset = XFS_BB_TO_FSBT(mp, keys[0].fmr_offset); - error = xfs_fsmap_owner_to_rmap(&info->low, &keys[0]); + info->low.rm_offset = XFS_BB_TO_FSBT(mp, key0.fmr_offset); + error = xfs_fsmap_owner_to_rmap(&info->low, &key0); if (error) return error; - info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, keys[0].fmr_length); - xfs_getfsmap_set_irec_flags(&info->low, &keys[0]); + info->low.rm_blockcount = XFS_BB_TO_FSBT(mp, key0.fmr_length); + xfs_getfsmap_set_irec_flags(&info->low, &key0); /* Adjust the low key if we are continuing from where we left off. 
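The low key adjustment above matters because GETFSMAP callers iterate the mapping records by feeding the last returned record back in as the low key of the next call. A minimal sketch of a single query pass (standard FS_IOC_GETFSMAP usage; mount point hypothetical, error handling trimmed):

#include <fcntl.h>
#include <limits.h>
#include <linux/fsmap.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/ioctl.h>
#include <unistd.h>

int main(void)
{
	struct fsmap_head *head;
	unsigned int i;
	int fd = open("/mnt", O_RDONLY);	/* hypothetical mount point */

	head = calloc(1, sizeof(*head) + 64 * sizeof(struct fsmap));
	head->fmh_count = 64;
	/* low key stays zeroed for the first call; high key covers everything */
	head->fmh_keys[1].fmr_device = UINT_MAX;
	head->fmh_keys[1].fmr_flags = UINT_MAX;
	head->fmh_keys[1].fmr_physical = ULLONG_MAX;
	head->fmh_keys[1].fmr_owner = ULLONG_MAX;
	head->fmh_keys[1].fmr_offset = ULLONG_MAX;

	if (fd >= 0 && ioctl(fd, FS_IOC_GETFSMAP, head) == 0)
		for (i = 0; i < head->fmh_entries; i++)
			printf("dev 0x%x phys %llu len %llu\n",
			       head->fmh_recs[i].fmr_device,
			       (unsigned long long)head->fmh_recs[i].fmr_physical,
			       (unsigned long long)head->fmh_recs[i].fmr_length);
	return 0;
}

With the change above, such a query on a zoned filesystem with an internal RT volume reports the first sb_rtstart blocks of the RT address space as owned by the filesystem instead of pointing them at real storage.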
*/ if (info->low.rm_blockcount == 0) { @@ -1004,22 +1041,40 @@ xfs_getfsmap_rtdev_rmapbt( } #endif /* CONFIG_XFS_RT */ +static uint32_t +xfs_getfsmap_device( + struct xfs_mount *mp, + enum xfs_device dev) +{ + if (mp->m_sb.sb_rtstart) + return dev; + + switch (dev) { + case XFS_DEV_DATA: + return new_encode_dev(mp->m_ddev_targp->bt_dev); + case XFS_DEV_LOG: + return new_encode_dev(mp->m_logdev_targp->bt_dev); + case XFS_DEV_RT: + if (!mp->m_rtdev_targp) + break; + return new_encode_dev(mp->m_rtdev_targp->bt_dev); + } + + return -1; +} + /* Do we recognize the device? */ STATIC bool xfs_getfsmap_is_valid_device( struct xfs_mount *mp, struct xfs_fsmap *fm) { - if (fm->fmr_device == 0 || fm->fmr_device == UINT_MAX || - fm->fmr_device == new_encode_dev(mp->m_ddev_targp->bt_dev)) - return true; - if (mp->m_logdev_targp && - fm->fmr_device == new_encode_dev(mp->m_logdev_targp->bt_dev)) - return true; - if (mp->m_rtdev_targp && - fm->fmr_device == new_encode_dev(mp->m_rtdev_targp->bt_dev)) - return true; - return false; + return fm->fmr_device == 0 || + fm->fmr_device == UINT_MAX || + fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_DATA) || + fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_LOG) || + (mp->m_rtdev_targp && + fm->fmr_device == xfs_getfsmap_device(mp, XFS_DEV_RT)); } /* Ensure that the low key is less than the high key. */ @@ -1126,7 +1181,7 @@ xfs_getfsmap( /* Set up our device handlers. */ memset(handlers, 0, sizeof(handlers)); handlers[0].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_dblocks); - handlers[0].dev = new_encode_dev(mp->m_ddev_targp->bt_dev); + handlers[0].dev = xfs_getfsmap_device(mp, XFS_DEV_DATA); if (use_rmap) handlers[0].fn = xfs_getfsmap_datadev_rmapbt; else @@ -1134,13 +1189,17 @@ xfs_getfsmap( if (mp->m_logdev_targp != mp->m_ddev_targp) { handlers[1].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_logblocks); - handlers[1].dev = new_encode_dev(mp->m_logdev_targp->bt_dev); + handlers[1].dev = xfs_getfsmap_device(mp, XFS_DEV_LOG); handlers[1].fn = xfs_getfsmap_logdev; } #ifdef CONFIG_XFS_RT - if (mp->m_rtdev_targp) { + /* + * For zoned file systems there is no rtbitmap, so only support fsmap + * if the caller is privileged enough to use the full rmap version. + */ + if (mp->m_rtdev_targp && (use_rmap || !xfs_has_zoned(mp))) { handlers[2].nr_sectors = XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); - handlers[2].dev = new_encode_dev(mp->m_rtdev_targp->bt_dev); + handlers[2].dev = xfs_getfsmap_device(mp, XFS_DEV_RT); if (use_rmap) handlers[2].fn = xfs_getfsmap_rtdev_rmapbt; else @@ -1230,7 +1289,13 @@ xfs_getfsmap( if (tp) xfs_trans_cancel(tp); - head->fmh_oflags = FMH_OF_DEV_T; + + /* + * For an internal RT device we need to report different synthetic devices + * for a single physical device, and thus can't report the actual dev_t. + */ + if (!mp->m_sb.sb_rtstart) + head->fmh_oflags = FMH_OF_DEV_T; return error; } diff --git a/fs/xfs/xfs_fsops.c b/fs/xfs/xfs_fsops.c index 455298503d01..0ada73569394 100644 --- a/fs/xfs/xfs_fsops.c +++ b/fs/xfs/xfs_fsops.c @@ -24,6 +24,7 @@ #include "xfs_rtalloc.h" #include "xfs_rtrmap_btree.h" #include "xfs_rtrefcount_btree.h" +#include "xfs_metafile.h" /* * Write new AG headers to disk.
Non-transactional, but need to be @@ -110,7 +111,7 @@ xfs_growfs_data_private( if (nb > mp->m_sb.sb_dblocks) { error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_FSB_TO_BB(mp, nb) - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); + XFS_FSS_TO_BB(mp, 1), &bp, NULL); if (error) return error; xfs_buf_relse(bp); @@ -300,24 +301,30 @@ xfs_growfs_data( struct xfs_mount *mp, struct xfs_growfs_data *in) { - int error = 0; + int error; if (!capable(CAP_SYS_ADMIN)) return -EPERM; if (!mutex_trylock(&mp->m_growlock)) return -EWOULDBLOCK; + /* we can't grow the data section when an internal RT section exists */ + if (in->newblocks != mp->m_sb.sb_dblocks && mp->m_sb.sb_rtstart) { + error = -EINVAL; + goto out_unlock; + } + /* update imaxpct separately to the physical grow of the filesystem */ if (in->imaxpct != mp->m_sb.sb_imax_pct) { error = xfs_growfs_imaxpct(mp, in->imaxpct); if (error) - goto out_error; + goto out_unlock; } if (in->newblocks != mp->m_sb.sb_dblocks) { error = xfs_growfs_data_private(mp, in); if (error) - goto out_error; + goto out_unlock; } /* Post growfs calculations needed to reflect new state in operations */ @@ -331,13 +338,12 @@ xfs_growfs_data( /* Update secondary superblocks now the physical grow has completed */ error = xfs_update_secondary_sbs(mp); -out_error: /* - * Increment the generation unconditionally, the error could be from - * updating the secondary superblocks, in which case the new size - * is live already. + * Increment the generation unconditionally, after trying to update the + * secondary superblocks, as the new size is live already at this point. */ mp->m_generation++; +out_unlock: mutex_unlock(&mp->m_growlock); return error; } @@ -366,6 +372,7 @@ xfs_growfs_log( int xfs_reserve_blocks( struct xfs_mount *mp, + enum xfs_free_counter ctr, uint64_t request) { int64_t lcounter, delta; @@ -373,6 +380,8 @@ xfs_reserve_blocks( int64_t free; int error = 0; + ASSERT(ctr < XC_FREE_NR); + /* * With per-cpu counters, this becomes an interesting problem. we need * to work out if we are freeing or allocation blocks first, then we can @@ -391,16 +400,16 @@ xfs_reserve_blocks( * counters directly since we shouldn't have any problems unreserving * space. */ - if (mp->m_resblks > request) { - lcounter = mp->m_resblks_avail - request; + if (mp->m_free[ctr].res_total > request) { + lcounter = mp->m_free[ctr].res_avail - request; if (lcounter > 0) { /* release unused blocks */ fdblks_delta = lcounter; - mp->m_resblks_avail -= lcounter; + mp->m_free[ctr].res_avail -= lcounter; } - mp->m_resblks = request; + mp->m_free[ctr].res_total = request; if (fdblks_delta) { spin_unlock(&mp->m_sb_lock); - xfs_add_fdblocks(mp, fdblks_delta); + xfs_add_freecounter(mp, ctr, fdblks_delta); spin_lock(&mp->m_sb_lock); } @@ -409,7 +418,7 @@ xfs_reserve_blocks( /* * If the request is larger than the current reservation, reserve the - * blocks before we update the reserve counters. Sample m_fdblocks and + * blocks before we update the reserve counters. Sample m_free and * perform a partial reservation if the request exceeds free space. * * The code below estimates how many blocks it can request from @@ -419,10 +428,10 @@ xfs_reserve_blocks( * space to fill it because mod_fdblocks will refill an undersized * reserve when it can. 
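For orientation, the pool that xfs_reserve_blocks() manages here is the one visible through the XFS reserved-blocks ioctls, and per the xfs_default_resblks() rework later in this series it defaults to min(5% of the data device, 8192) blocks for XC_FREE_BLOCKS. A minimal query sketch (assumes the xfsprogs xfs/xfs_fs.h header is installed; mount point hypothetical):

#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <xfs/xfs_fs.h>	/* XFS_IOC_GET_RESBLKS, struct xfs_fsop_resblks */

int main(void)
{
	struct xfs_fsop_resblks res;
	int fd = open("/mnt", O_RDONLY);	/* hypothetical mount point */

	if (fd >= 0 && ioctl(fd, XFS_IOC_GET_RESBLKS, &res) == 0)
		printf("reserved %llu, available %llu\n",
		       (unsigned long long)res.resblks,
		       (unsigned long long)res.resblks_avail);
	return 0;
}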
*/ - free = percpu_counter_sum(&mp->m_fdblocks) - - xfs_fdblocks_unavailable(mp); - delta = request - mp->m_resblks; - mp->m_resblks = request; + free = xfs_sum_freecounter_raw(mp, ctr) - + xfs_freecounter_unavailable(mp, ctr); + delta = request - mp->m_free[ctr].res_total; + mp->m_free[ctr].res_total = request; if (delta > 0 && free > 0) { /* * We'll either succeed in getting space from the free block @@ -436,9 +445,9 @@ xfs_reserve_blocks( */ fdblks_delta = min(free, delta); spin_unlock(&mp->m_sb_lock); - error = xfs_dec_fdblocks(mp, fdblks_delta, 0); + error = xfs_dec_freecounter(mp, ctr, fdblks_delta, 0); if (!error) - xfs_add_fdblocks(mp, fdblks_delta); + xfs_add_freecounter(mp, ctr, fdblks_delta); spin_lock(&mp->m_sb_lock); } out: @@ -558,15 +567,13 @@ xfs_fs_reserve_ag_blocks( return error; } - if (xfs_has_realtime(mp)) { - err2 = xfs_rt_resv_init(mp); - if (err2 && err2 != -ENOSPC) { - xfs_warn(mp, - "Error %d reserving realtime metadata reserve pool.", err2); - xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - } + err2 = xfs_metafile_resv_init(mp); + if (err2 && err2 != -ENOSPC) { + xfs_warn(mp, + "Error %d reserving realtime metadata reserve pool.", err2); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); - if (err2 && !error) + if (!error) error = err2; } @@ -582,9 +589,7 @@ xfs_fs_unreserve_ag_blocks( { struct xfs_perag *pag = NULL; - if (xfs_has_realtime(mp)) - xfs_rt_resv_free(mp); - + xfs_metafile_resv_free(mp); while ((pag = xfs_perag_next(mp, pag))) xfs_ag_resv_free(pag); } diff --git a/fs/xfs/xfs_fsops.h b/fs/xfs/xfs_fsops.h index 3e2f73bcf831..9d23c361ef56 100644 --- a/fs/xfs/xfs_fsops.h +++ b/fs/xfs/xfs_fsops.h @@ -8,7 +8,8 @@ int xfs_growfs_data(struct xfs_mount *mp, struct xfs_growfs_data *in); int xfs_growfs_log(struct xfs_mount *mp, struct xfs_growfs_log *in); -int xfs_reserve_blocks(struct xfs_mount *mp, uint64_t request); +int xfs_reserve_blocks(struct xfs_mount *mp, enum xfs_free_counter cnt, + uint64_t request); int xfs_fs_goingdown(struct xfs_mount *mp, uint32_t inflags); int xfs_fs_reserve_ag_blocks(struct xfs_mount *mp); diff --git a/fs/xfs/xfs_globals.c b/fs/xfs/xfs_globals.c index f18fec0adf66..f6f628c01feb 100644 --- a/fs/xfs/xfs_globals.c +++ b/fs/xfs/xfs_globals.c @@ -23,8 +23,6 @@ xfs_param_t xfs_params = { .inherit_sync = { 0, 1, 1 }, .inherit_nodump = { 0, 1, 1 }, .inherit_noatim = { 0, 1, 1 }, - .xfs_buf_timer = { 100/2, 1*100, 30*100 }, - .xfs_buf_age = { 1*100, 15*100, 7200*100}, .inherit_nosym = { 0, 0, 1 }, .rotorstep = { 1, 1, 255 }, .inherit_nodfrg = { 0, 1, 1 }, diff --git a/fs/xfs/xfs_icache.c b/fs/xfs/xfs_icache.c index 7b6c026d01a1..726e29b837e6 100644 --- a/fs/xfs/xfs_icache.c +++ b/fs/xfs/xfs_icache.c @@ -230,7 +230,7 @@ xfs_blockgc_queue( rcu_read_lock(); if (radix_tree_tagged(&pag->pag_ici_root, XFS_ICI_BLOCKGC_TAG)) queue_delayed_work(mp->m_blockgc_wq, &pag->pag_blockgc_work, - msecs_to_jiffies(xfs_blockgc_secs * 1000)); + secs_to_jiffies(xfs_blockgc_secs)); rcu_read_unlock(); } @@ -2073,10 +2073,10 @@ xfs_inodegc_want_queue_rt_file( { struct xfs_mount *mp = ip->i_mount; - if (!XFS_IS_REALTIME_INODE(ip)) + if (!XFS_IS_REALTIME_INODE(ip) || xfs_has_zoned(mp)) return false; - if (__percpu_counter_compare(&mp->m_frextents, + if (xfs_compare_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_low_rtexts[XFS_LOWSP_5_PCNT], XFS_FDBLOCKS_BATCH) < 0) return true; @@ -2104,7 +2104,7 @@ xfs_inodegc_want_queue_work( if (items > mp->m_ino_geo.inodes_per_cluster) return true; - if (__percpu_counter_compare(&mp->m_fdblocks, + if (xfs_compare_freecounter(mp, 
XC_FREE_BLOCKS, mp->m_low_space[XFS_LOWSP_5_PCNT], XFS_FDBLOCKS_BATCH) < 0) return true; diff --git a/fs/xfs/xfs_inode.c b/fs/xfs/xfs_inode.c index b1f9f156ec88..ee3e0f284287 100644 --- a/fs/xfs/xfs_inode.c +++ b/fs/xfs/xfs_inode.c @@ -1721,8 +1721,7 @@ xfs_ifree_cluster( * to mark all the active inodes on the buffer stale. */ error = xfs_trans_get_buf(tp, mp->m_ddev_targp, blkno, - mp->m_bsize * igeo->blocks_per_cluster, - XBF_UNMAPPED, &bp); + mp->m_bsize * igeo->blocks_per_cluster, 0, &bp); if (error) return error; @@ -2735,21 +2734,16 @@ xfs_mmaplock_two_inodes_and_break_dax_layout( struct xfs_inode *ip2) { int error; - bool retry; - struct page *page; if (ip1->i_ino > ip2->i_ino) swap(ip1, ip2); again: - retry = false; /* Lock the first inode */ xfs_ilock(ip1, XFS_MMAPLOCK_EXCL); - error = xfs_break_dax_layouts(VFS_I(ip1), &retry); - if (error || retry) { + error = xfs_break_dax_layouts(VFS_I(ip1)); + if (error) { xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); - if (error == 0 && retry) - goto again; return error; } @@ -2763,8 +2757,8 @@ again: * need to unlock & lock the XFS_MMAPLOCK_EXCL which is not suitable * for this nested lock case. */ - page = dax_layout_busy_page(VFS_I(ip2)->i_mapping); - if (page && page_ref_count(page) != 1) { + error = dax_break_layout(VFS_I(ip2), 0, -1, NULL); + if (error) { xfs_iunlock(ip2, XFS_MMAPLOCK_EXCL); xfs_iunlock(ip1, XFS_MMAPLOCK_EXCL); goto again; @@ -3008,21 +3002,11 @@ xfs_wait_dax_page( int xfs_break_dax_layouts( - struct inode *inode, - bool *retry) + struct inode *inode) { - struct page *page; - xfs_assert_ilocked(XFS_I(inode), XFS_MMAPLOCK_EXCL); - page = dax_layout_busy_page(inode->i_mapping); - if (!page) - return 0; - - *retry = true; - return ___wait_var_event(&page->_refcount, - atomic_read(&page->_refcount) == 1, TASK_INTERRUPTIBLE, - 0, 0, xfs_wait_dax_page(inode)); + return dax_break_layout_inode(inode, xfs_wait_dax_page); } int @@ -3040,8 +3024,8 @@ xfs_break_layouts( retry = false; switch (reason) { case BREAK_UNMAP: - error = xfs_break_dax_layouts(inode, &retry); - if (error || retry) + error = xfs_break_dax_layouts(inode); + if (error) break; fallthrough; case BREAK_WRITE: @@ -3074,5 +3058,6 @@ bool xfs_is_always_cow_inode( const struct xfs_inode *ip) { - return ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount); + return xfs_is_zoned_inode(ip) || + (ip->i_mount->m_always_cow && xfs_has_reflink(ip->i_mount)); } diff --git a/fs/xfs/xfs_inode.h b/fs/xfs/xfs_inode.h index c08093a65352..d7e2b902ef5c 100644 --- a/fs/xfs/xfs_inode.h +++ b/fs/xfs/xfs_inode.h @@ -25,19 +25,9 @@ struct xfs_dquot; typedef struct xfs_inode { /* Inode linking and identification information. */ struct xfs_mount *i_mount; /* fs mount struct ptr */ - union { - struct { - struct xfs_dquot *i_udquot; /* user dquot */ - struct xfs_dquot *i_gdquot; /* group dquot */ - struct xfs_dquot *i_pdquot; /* project dquot */ - }; - - /* - * Space that has been set aside to accomodate expansions of a - * metadata btree rooted in this file. 
- */ - uint64_t i_meta_resv_asked; - }; + struct xfs_dquot *i_udquot; /* user dquot */ + struct xfs_dquot *i_gdquot; /* group dquot */ + struct xfs_dquot *i_pdquot; /* project dquot */ /* Inode location stuff */ xfs_ino_t i_ino; /* inode number (agno/agino)*/ @@ -69,8 +59,13 @@ typedef struct xfs_inode { xfs_rfsblock_t i_nblocks; /* # of direct & btree blocks */ prid_t i_projid; /* owner's project id */ xfs_extlen_t i_extsize; /* basic/minimum extent size */ - /* cowextsize is only used for v3 inodes, flushiter for v1/2 */ + /* + * i_used_blocks is used for zoned rtrmap inodes, + * i_cowextsize is used for other v3 inodes, + * i_flushiter for v1/2 inodes + */ union { + uint32_t i_used_blocks; /* used blocks in RTG */ xfs_extlen_t i_cowextsize; /* basic cow extent size */ uint16_t i_flushiter; /* incremented on flush */ }; @@ -309,6 +304,11 @@ static inline bool xfs_is_internal_inode(const struct xfs_inode *ip) xfs_is_quota_inode(&mp->m_sb, ip->i_ino); } +static inline bool xfs_is_zoned_inode(const struct xfs_inode *ip) +{ + return xfs_has_zoned(ip->i_mount) && XFS_IS_REALTIME_INODE(ip); +} + bool xfs_is_always_cow_inode(const struct xfs_inode *ip); static inline bool xfs_is_cow_inode(const struct xfs_inode *ip) @@ -356,19 +356,9 @@ static inline bool xfs_inode_has_bigrtalloc(const struct xfs_inode *ip) (XFS_IS_REALTIME_INODE(ip) ? \ (ip)->i_mount->m_rtdev_targp : (ip)->i_mount->m_ddev_targp) -static inline bool -xfs_inode_can_atomicwrite( - struct xfs_inode *ip) +static inline bool xfs_inode_can_hw_atomic_write(const struct xfs_inode *ip) { - struct xfs_mount *mp = ip->i_mount; - struct xfs_buftarg *target = xfs_inode_buftarg(ip); - - if (mp->m_sb.sb_blocksize < target->bt_bdev_awu_min) - return false; - if (mp->m_sb.sb_blocksize > target->bt_bdev_awu_max) - return false; - - return true; + return xfs_inode_buftarg(ip)->bt_bdev_awu_max > 0; } /* @@ -603,7 +593,7 @@ xfs_itruncate_extents( return xfs_itruncate_extents_flags(tpp, ip, whichfork, new_size, 0); } -int xfs_break_dax_layouts(struct inode *inode, bool *retry); +int xfs_break_dax_layouts(struct inode *inode); int xfs_break_layouts(struct inode *inode, uint *iolock, enum layout_break_reason reason); diff --git a/fs/xfs/xfs_inode_item.c b/fs/xfs/xfs_inode_item.c index 35803fcf0beb..c6cb0b6b9e46 100644 --- a/fs/xfs/xfs_inode_item.c +++ b/fs/xfs/xfs_inode_item.c @@ -596,6 +596,7 @@ xfs_inode_to_log_dinode( to->di_changecount = inode_peek_iversion(inode); to->di_crtime = xfs_inode_to_log_dinode_ts(ip, ip->i_crtime); to->di_flags2 = ip->i_diflags2; + /* also covers the di_used_blocks union arm: */ to->di_cowextsize = ip->i_cowextsize; to->di_ino = ip->i_ino; to->di_lsn = lsn; @@ -1088,13 +1089,7 @@ xfs_iflush_abort( * state. Whilst the inode is in the AIL, it should have a valid buffer * pointer for push operations to access - it is only safe to remove the * inode from the buffer once it has been removed from the AIL. - * - * We also clear the failed bit before removing the item from the AIL - * as xfs_trans_ail_delete()->xfs_clear_li_failed() will release buffer - * references the inode item owns and needs to hold until we've fully - * aborted the inode log item and detached it from the buffer. 
*/ - clear_bit(XFS_LI_FAILED, &iip->ili_item.li_flags); xfs_trans_ail_delete(&iip->ili_item, 0); /* diff --git a/fs/xfs/xfs_inode_item_recover.c b/fs/xfs/xfs_inode_item_recover.c index f3bfb814378c..7205fd14f6b3 100644 --- a/fs/xfs/xfs_inode_item_recover.c +++ b/fs/xfs/xfs_inode_item_recover.c @@ -203,6 +203,7 @@ xfs_log_dinode_to_disk( to->di_crtime = xfs_log_dinode_to_disk_ts(from, from->di_crtime); to->di_flags2 = cpu_to_be64(from->di_flags2); + /* also covers the di_used_blocks union arm: */ to->di_cowextsize = cpu_to_be32(from->di_cowextsize); to->di_ino = cpu_to_be64(from->di_ino); to->di_lsn = cpu_to_be64(lsn); diff --git a/fs/xfs/xfs_ioctl.c b/fs/xfs/xfs_ioctl.c index ed85322507dd..d250f7f74e3b 100644 --- a/fs/xfs/xfs_ioctl.c +++ b/fs/xfs/xfs_ioctl.c @@ -1131,15 +1131,15 @@ xfs_ioctl_getset_resblocks( error = mnt_want_write_file(filp); if (error) return error; - error = xfs_reserve_blocks(mp, fsop.resblks); + error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, fsop.resblks); mnt_drop_write_file(filp); if (error) return error; } spin_lock(&mp->m_sb_lock); - fsop.resblks = mp->m_resblks; - fsop.resblks_avail = mp->m_resblks_avail; + fsop.resblks = mp->m_free[XC_FREE_BLOCKS].res_total; + fsop.resblks_avail = mp->m_free[XC_FREE_BLOCKS].res_avail; spin_unlock(&mp->m_sb_lock); if (copy_to_user(arg, &fsop, sizeof(fsop))) @@ -1155,9 +1155,9 @@ xfs_ioctl_fs_counts( struct xfs_fsop_counts out = { .allocino = percpu_counter_read_positive(&mp->m_icount), .freeino = percpu_counter_read_positive(&mp->m_ifree), - .freedata = percpu_counter_read_positive(&mp->m_fdblocks) - - xfs_fdblocks_unavailable(mp), - .freertx = percpu_counter_read_positive(&mp->m_frextents), + .freedata = xfs_estimate_freecounter(mp, XC_FREE_BLOCKS) - + xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS), + .freertx = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS), }; if (copy_to_user(uarg, &out, sizeof(out))) diff --git a/fs/xfs/xfs_iomap.c b/fs/xfs/xfs_iomap.c index d61460309a78..ff05e6b1b0bb 100644 --- a/fs/xfs/xfs_iomap.c +++ b/fs/xfs/xfs_iomap.c @@ -30,6 +30,8 @@ #include "xfs_reflink.h" #include "xfs_health.h" #include "xfs_rtbitmap.h" +#include "xfs_icache.h" +#include "xfs_zone_alloc.h" #define XFS_ALLOC_ALIGN(mp, off) \ (((off) >> mp->m_allocsize_log) << mp->m_allocsize_log) @@ -431,13 +433,14 @@ xfs_quota_calc_throttle( static int64_t xfs_iomap_freesp( - struct percpu_counter *counter, + struct xfs_mount *mp, + unsigned int idx, uint64_t low_space[XFS_LOWSP_MAX], int *shift) { int64_t freesp; - freesp = percpu_counter_read_positive(counter); + freesp = xfs_estimate_freecounter(mp, idx); if (freesp < low_space[XFS_LOWSP_5_PCNT]) { *shift = 2; if (freesp < low_space[XFS_LOWSP_4_PCNT]) @@ -536,10 +539,10 @@ xfs_iomap_prealloc_size( if (unlikely(XFS_IS_REALTIME_INODE(ip))) freesp = xfs_rtbxlen_to_blen(mp, - xfs_iomap_freesp(&mp->m_frextents, + xfs_iomap_freesp(mp, XC_FREE_RTEXTENTS, mp->m_low_rtexts, &shift)); else - freesp = xfs_iomap_freesp(&mp->m_fdblocks, mp->m_low_space, + freesp = xfs_iomap_freesp(mp, XC_FREE_BLOCKS, mp->m_low_space, &shift); /* @@ -795,6 +798,38 @@ imap_spans_range( return true; } +static bool +xfs_bmap_hw_atomic_write_possible( + struct xfs_inode *ip, + struct xfs_bmbt_irec *imap, + xfs_fileoff_t offset_fsb, + xfs_fileoff_t end_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fsize_t len = XFS_FSB_TO_B(mp, end_fsb - offset_fsb); + + /* + * atomic writes are required to be naturally aligned for disk blocks, + * which ensures that we adhere to block layer rules that we won't + * straddle any boundary 
or violate write alignment requirements. + */ + if (!IS_ALIGNED(imap->br_startblock, imap->br_blockcount)) + return false; + + /* + * Spanning multiple extents would mean that multiple BIOs would be + * issued, and so would lose atomicity required for REQ_ATOMIC-based + * atomic writes. + */ + if (!imap_spans_range(imap, offset_fsb, end_fsb)) + return false; + + /* + * The ->iomap_begin caller should ensure this, but check anyway. + */ + return len <= xfs_inode_buftarg(ip)->bt_bdev_awu_max; +} + static int xfs_direct_write_iomap_begin( struct inode *inode, @@ -809,9 +844,11 @@ xfs_direct_write_iomap_begin( struct xfs_bmbt_irec imap, cmap; xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); + xfs_fileoff_t orig_end_fsb = end_fsb; int nimaps = 1, error = 0; bool shared = false; u16 iomap_flags = 0; + bool needs_alloc; unsigned int lockmode; u64 seq; @@ -828,6 +865,10 @@ xfs_direct_write_iomap_begin( if (offset + length > i_size_read(inode)) iomap_flags |= IOMAP_F_DIRTY; + /* HW-offload atomics are always used in this path */ + if (flags & IOMAP_ATOMIC) + iomap_flags |= IOMAP_F_ATOMIC_BIO; + /* * COW writes may allocate delalloc space or convert unwritten COW * extents, so we need to make sure to take the lock exclusively here. @@ -868,13 +909,37 @@ relock: (flags & IOMAP_DIRECT) || IS_DAX(inode)); if (error) goto out_unlock; - if (shared) + if (shared) { + if ((flags & IOMAP_ATOMIC) && + !xfs_bmap_hw_atomic_write_possible(ip, &cmap, + offset_fsb, end_fsb)) { + error = -ENOPROTOOPT; + goto out_unlock; + } goto out_found_cow; + } end_fsb = imap.br_startoff + imap.br_blockcount; length = XFS_FSB_TO_B(mp, end_fsb) - offset; } - if (imap_needs_alloc(inode, flags, &imap, nimaps)) + needs_alloc = imap_needs_alloc(inode, flags, &imap, nimaps); + + if (flags & IOMAP_ATOMIC) { + error = -ENOPROTOOPT; + /* + * If we allocate less than what is required for the write + * then we may end up with multiple extents, which means that + * REQ_ATOMIC-based writes cannot be used, so avoid this possibility. + */ + if (needs_alloc && orig_end_fsb - offset_fsb > 1) + goto out_unlock; + + if (!xfs_bmap_hw_atomic_write_possible(ip, &imap, offset_fsb, + orig_end_fsb)) + goto out_unlock; + } + + if (needs_alloc) goto allocate_blocks; /* @@ -962,6 +1027,187 @@ const struct iomap_ops xfs_direct_write_iomap_ops = { .iomap_begin = xfs_direct_write_iomap_begin, }; +#ifdef CONFIG_XFS_RT +/* + * This is really simple. The space has already been reserved before taking the + * IOLOCK, the actual block allocation is done just before submitting the bio + * and only recorded in the extent map on I/O completion. + */ +static int +xfs_zoned_direct_write_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t length, + unsigned flags, + struct iomap *iomap, + struct iomap *srcmap) +{ + struct xfs_inode *ip = XFS_I(inode); + int error; + + ASSERT(!(flags & IOMAP_OVERWRITE_ONLY)); + + /* + * Needs to be pushed down into the allocator so that only writes into + * a single zone can be supported. + */ + if (flags & IOMAP_NOWAIT) + return -EAGAIN; + + /* + * Ensure the extent list is in memory so that we don't have to + * read it in from the I/O completion handler.
+ */ + if (xfs_need_iread_extents(&ip->i_df)) { + xfs_ilock(ip, XFS_ILOCK_EXCL); + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + return error; + } + + iomap->type = IOMAP_MAPPED; + iomap->flags = IOMAP_F_DIRTY; + iomap->bdev = ip->i_mount->m_rtdev_targp->bt_bdev; + iomap->offset = offset; + iomap->length = length; + iomap->flags = IOMAP_F_ANON_WRITE; + return 0; +} + +const struct iomap_ops xfs_zoned_direct_write_iomap_ops = { + .iomap_begin = xfs_zoned_direct_write_iomap_begin, +}; +#endif /* CONFIG_XFS_RT */ + +static int +xfs_atomic_write_cow_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t length, + unsigned flags, + struct iomap *iomap, + struct iomap *srcmap) +{ + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + const xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, length); + xfs_filblks_t count_fsb = end_fsb - offset_fsb; + int nmaps = 1; + xfs_filblks_t resaligned; + struct xfs_bmbt_irec cmap; + struct xfs_iext_cursor icur; + struct xfs_trans *tp; + unsigned int dblocks = 0, rblocks = 0; + int error; + u64 seq; + + ASSERT(flags & IOMAP_WRITE); + ASSERT(flags & IOMAP_DIRECT); + + if (xfs_is_shutdown(mp)) + return -EIO; + + if (!xfs_can_sw_atomic_write(mp)) { + ASSERT(xfs_can_sw_atomic_write(mp)); + return -EINVAL; + } + + /* blocks are always allocated in this path */ + if (flags & IOMAP_NOWAIT) + return -EAGAIN; + + trace_xfs_iomap_atomic_write_cow(ip, offset, length); + + xfs_ilock(ip, XFS_ILOCK_EXCL); + + if (!ip->i_cowfp) { + ASSERT(!xfs_is_reflink_inode(ip)); + xfs_ifork_init_cow(ip); + } + + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) + cmap.br_startoff = end_fsb; + if (cmap.br_startoff <= offset_fsb) { + xfs_trim_extent(&cmap, offset_fsb, count_fsb); + goto found; + } + + end_fsb = cmap.br_startoff; + count_fsb = end_fsb - offset_fsb; + + resaligned = xfs_aligned_fsb_count(offset_fsb, count_fsb, + xfs_get_cowextsz_hint(ip)); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + + if (XFS_IS_REALTIME_INODE(ip)) { + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, 0); + rblocks = resaligned; + } else { + dblocks = XFS_DIOSTRAT_SPACE_RES(mp, resaligned); + rblocks = 0; + } + + error = xfs_trans_alloc_inode(ip, &M_RES(mp)->tr_write, dblocks, + rblocks, false, &tp); + if (error) + return error; + + /* extent layout could have changed since the unlock, so check again */ + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &cmap)) + cmap.br_startoff = end_fsb; + if (cmap.br_startoff <= offset_fsb) { + xfs_trim_extent(&cmap, offset_fsb, count_fsb); + xfs_trans_cancel(tp); + goto found; + } + + /* + * Allocate the entire reservation as unwritten blocks. + * + * Use XFS_BMAPI_EXTSZALIGN to hint at aligning new extents according to + * extszhint, such that there will be a greater chance that future + * atomic writes to that same range will be aligned (and don't require + * this COW-based method). 
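To show how the COW-based atomic write machinery above is reached from userspace: an application queries the advertised limits with statx() and then issues the write with pwritev2(..., RWF_ATOMIC). A sketch, assuming headers new enough to define STATX_WRITE_ATOMIC and RWF_ATOMIC (Linux 6.11+) and a hypothetical file path:

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/stat.h>
#include <sys/uio.h>
#include <unistd.h>

int main(void)
{
	struct statx stx;
	static char buf[8192] __attribute__((aligned(8192)));
	struct iovec iov = { .iov_base = buf, .iov_len = sizeof(buf) };
	int fd = open("/mnt/file", O_RDWR | O_DIRECT);	/* hypothetical */

	if (fd < 0)
		return 1;
	if (statx(fd, "", AT_EMPTY_PATH, STATX_WRITE_ATOMIC, &stx) == 0)
		printf("atomic write unit min %u max %u\n",
		       stx.stx_atomic_write_unit_min,
		       stx.stx_atomic_write_unit_max);
	/* an all-or-nothing write; size and alignment must fit the units */
	if (pwritev2(fd, &iov, 1, 0, RWF_ATOMIC) < 0)
		perror("pwritev2");
	return 0;
}

With this series, a write larger than the hardware atomic unit but within the advertised maximum would be expected to take the xfs_atomic_write_cow_iomap_begin() path shown above rather than fail.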
+ */ + error = xfs_bmapi_write(tp, ip, offset_fsb, count_fsb, + XFS_BMAPI_COWFORK | XFS_BMAPI_PREALLOC | + XFS_BMAPI_EXTSZALIGN, 0, &cmap, &nmaps); + if (error) { + xfs_trans_cancel(tp); + goto out_unlock; + } + + xfs_inode_set_cowblocks_tag(ip); + error = xfs_trans_commit(tp); + if (error) + goto out_unlock; + +found: + if (cmap.br_state != XFS_EXT_NORM) { + error = xfs_reflink_convert_cow_locked(ip, offset_fsb, + count_fsb); + if (error) + goto out_unlock; + cmap.br_state = XFS_EXT_NORM; + } + + length = XFS_FSB_TO_B(mp, cmap.br_startoff + cmap.br_blockcount); + trace_xfs_iomap_found(ip, offset, length - offset, XFS_COW_FORK, &cmap); + seq = xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return xfs_bmbt_to_iomap(ip, iomap, &cmap, flags, IOMAP_F_SHARED, seq); + +out_unlock: + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +const struct iomap_ops xfs_atomic_write_cow_iomap_ops = { + .iomap_begin = xfs_atomic_write_cow_iomap_begin, +}; + static int xfs_dax_write_iomap_end( struct inode *inode, @@ -987,6 +1233,455 @@ const struct iomap_ops xfs_dax_write_iomap_ops = { .iomap_end = xfs_dax_write_iomap_end, }; +/* + * Convert a hole to a delayed allocation. + */ +static void +xfs_bmap_add_extent_hole_delay( + struct xfs_inode *ip, /* incore inode pointer */ + int whichfork, + struct xfs_iext_cursor *icur, + struct xfs_bmbt_irec *new) /* new data to add to file extents */ +{ + struct xfs_ifork *ifp; /* inode fork pointer */ + xfs_bmbt_irec_t left; /* left neighbor extent entry */ + xfs_filblks_t newlen=0; /* new indirect size */ + xfs_filblks_t oldlen=0; /* old indirect size */ + xfs_bmbt_irec_t right; /* right neighbor extent entry */ + uint32_t state = xfs_bmap_fork_to_state(whichfork); + xfs_filblks_t temp; /* temp for indirect calculations */ + + ifp = xfs_ifork_ptr(ip, whichfork); + ASSERT(isnullstartblock(new->br_startblock)); + + /* + * Check and set flags if this segment has a left neighbor + */ + if (xfs_iext_peek_prev_extent(ifp, icur, &left)) { + state |= BMAP_LEFT_VALID; + if (isnullstartblock(left.br_startblock)) + state |= BMAP_LEFT_DELAY; + } + + /* + * Check and set flags if the current (right) segment exists. + * If it doesn't exist, we're converting the hole at end-of-file. + */ + if (xfs_iext_get_extent(ifp, icur, &right)) { + state |= BMAP_RIGHT_VALID; + if (isnullstartblock(right.br_startblock)) + state |= BMAP_RIGHT_DELAY; + } + + /* + * Set contiguity flags on the left and right neighbors. + * Don't let extents get too large, even if the pieces are contiguous. + */ + if ((state & BMAP_LEFT_VALID) && (state & BMAP_LEFT_DELAY) && + left.br_startoff + left.br_blockcount == new->br_startoff && + left.br_blockcount + new->br_blockcount <= XFS_MAX_BMBT_EXTLEN) + state |= BMAP_LEFT_CONTIG; + + if ((state & BMAP_RIGHT_VALID) && (state & BMAP_RIGHT_DELAY) && + new->br_startoff + new->br_blockcount == right.br_startoff && + new->br_blockcount + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN && + (!(state & BMAP_LEFT_CONTIG) || + (left.br_blockcount + new->br_blockcount + + right.br_blockcount <= XFS_MAX_BMBT_EXTLEN))) + state |= BMAP_RIGHT_CONTIG; + + /* + * Switch out based on the contiguity flags. + */ + switch (state & (BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG)) { + case BMAP_LEFT_CONTIG | BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with delayed allocations + * on the left and on the right. + * Merge all three into a single extent record. 
+ */ + temp = left.br_blockcount + new->br_blockcount + + right.br_blockcount; + + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock) + + startblockval(right.br_startblock); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); + left.br_startblock = nullstartblock(newlen); + left.br_blockcount = temp; + + xfs_iext_remove(ip, icur, state); + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &left); + break; + + case BMAP_LEFT_CONTIG: + /* + * New allocation is contiguous with a delayed allocation + * on the left. + * Merge the new allocation with the left neighbor. + */ + temp = left.br_blockcount + new->br_blockcount; + + oldlen = startblockval(left.br_startblock) + + startblockval(new->br_startblock); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); + left.br_blockcount = temp; + left.br_startblock = nullstartblock(newlen); + + xfs_iext_prev(ifp, icur); + xfs_iext_update_extent(ip, state, icur, &left); + break; + + case BMAP_RIGHT_CONTIG: + /* + * New allocation is contiguous with a delayed allocation + * on the right. + * Merge the new allocation with the right neighbor. + */ + temp = new->br_blockcount + right.br_blockcount; + oldlen = startblockval(new->br_startblock) + + startblockval(right.br_startblock); + newlen = XFS_FILBLKS_MIN(xfs_bmap_worst_indlen(ip, temp), + oldlen); + right.br_startoff = new->br_startoff; + right.br_startblock = nullstartblock(newlen); + right.br_blockcount = temp; + xfs_iext_update_extent(ip, state, icur, &right); + break; + + case 0: + /* + * New allocation is not contiguous with another + * delayed allocation. + * Insert a new entry. + */ + oldlen = newlen = 0; + xfs_iext_insert(ip, icur, new, state); + break; + } + if (oldlen != newlen) { + ASSERT(oldlen > newlen); + xfs_add_fdblocks(ip->i_mount, oldlen - newlen); + + /* + * Nothing to do for disk quota accounting here. + */ + xfs_mod_delalloc(ip, 0, (int64_t)newlen - oldlen); + } +} + +/* + * Add a delayed allocation extent to an inode. Blocks are reserved from the + * global pool and the extent inserted into the inode in-core extent tree. + * + * On entry, got refers to the first extent beyond the offset of the extent to + * allocate or eof is specified if no such extent exists. On return, got refers + * to the extent record that was inserted to the inode fork. + * + * Note that the allocated extent may have been merged with contiguous extents + * during insertion into the inode fork. Thus, got does not reflect the current + * state of the inode fork on return. If necessary, the caller can use lastx to + * look up the updated record in the inode fork. + */ +static int +xfs_bmapi_reserve_delalloc( + struct xfs_inode *ip, + int whichfork, + xfs_fileoff_t off, + xfs_filblks_t len, + xfs_filblks_t prealloc, + struct xfs_bmbt_irec *got, + struct xfs_iext_cursor *icur, + int eof) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_ifork *ifp = xfs_ifork_ptr(ip, whichfork); + xfs_extlen_t alen; + xfs_extlen_t indlen; + uint64_t fdblocks; + int error; + xfs_fileoff_t aoff; + bool use_cowextszhint = + whichfork == XFS_COW_FORK && !prealloc; + +retry: + /* + * Cap the alloc length. Keep track of prealloc so we know whether to + * tag the inode before we return. 
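A worked instance of the contiguity rules in xfs_bmap_add_extent_hole_delay() above, as a standalone toy rather than kernel code (TOY_MAX_EXTLEN mirrors XFS_MAX_BMBT_EXTLEN): three abutting delalloc ranges collapse into one record when each pair is contiguous and the merged length stays in bounds.

#include <stdint.h>
#include <stdio.h>

#define TOY_MAX_EXTLEN	((1u << 21) - 1)	/* mirrors XFS_MAX_BMBT_EXTLEN */

struct toy_irec { uint64_t off, len; };

/* the same shape as the BMAP_LEFT/RIGHT_CONTIG checks above */
static int contig(struct toy_irec l, struct toy_irec r)
{
	return l.off + l.len == r.off && l.len + r.len <= TOY_MAX_EXTLEN;
}

int main(void)
{
	struct toy_irec left = { 0, 10 }, hole = { 10, 5 }, right = { 15, 5 };

	/* the kernel case additionally bounds left+new+right as one sum */
	if (contig(left, hole) && contig(hole, right))
		printf("merged: [0, %llu) blocks\n",
		       (unsigned long long)(left.len + hole.len + right.len));
	return 0;
}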
+ */ + aoff = off; + alen = XFS_FILBLKS_MIN(len + prealloc, XFS_MAX_BMBT_EXTLEN); + if (!eof) + alen = XFS_FILBLKS_MIN(alen, got->br_startoff - aoff); + if (prealloc && alen >= len) + prealloc = alen - len; + + /* + * If we're targeting the COW fork but aren't creating a speculative + * posteof preallocation, try to expand the reservation to align with + * the COW extent size hint if there's sufficient free space. + * + * Unlike the data fork, the CoW cancellation functions will free all + * the reservations at inactivation, so we don't require that every + * delalloc reservation have a dirty pagecache. + */ + if (use_cowextszhint) { + struct xfs_bmbt_irec prev; + xfs_extlen_t extsz = xfs_get_cowextsz_hint(ip); + + if (!xfs_iext_peek_prev_extent(ifp, icur, &prev)) + prev.br_startoff = NULLFILEOFF; + + error = xfs_bmap_extsize_align(mp, got, &prev, extsz, 0, eof, + 1, 0, &aoff, &alen); + ASSERT(!error); + } + + /* + * Make a transaction-less quota reservation for delayed allocation + * blocks. This number gets adjusted later. We return if we haven't + * allocated blocks already inside this loop. + */ + error = xfs_quota_reserve_blkres(ip, alen); + if (error) + goto out; + + /* + * Split changing sb for alen and indlen since they could be coming + * from different places. + */ + indlen = (xfs_extlen_t)xfs_bmap_worst_indlen(ip, alen); + ASSERT(indlen > 0); + + fdblocks = indlen; + if (XFS_IS_REALTIME_INODE(ip)) { + ASSERT(!xfs_is_zoned_inode(ip)); + error = xfs_dec_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); + if (error) + goto out_unreserve_quota; + } else { + fdblocks += alen; + } + + error = xfs_dec_fdblocks(mp, fdblocks, false); + if (error) + goto out_unreserve_frextents; + + ip->i_delayed_blks += alen; + xfs_mod_delalloc(ip, alen, indlen); + + got->br_startoff = aoff; + got->br_startblock = nullstartblock(indlen); + got->br_blockcount = alen; + got->br_state = XFS_EXT_NORM; + + xfs_bmap_add_extent_hole_delay(ip, whichfork, icur, got); + + /* + * Tag the inode if blocks were preallocated. Note that COW fork + * preallocation can occur at the start or end of the extent, even when + * prealloc == 0, so we must also check the aligned offset and length.
+ */ + if (whichfork == XFS_DATA_FORK && prealloc) + xfs_inode_set_eofblocks_tag(ip); + if (whichfork == XFS_COW_FORK && (prealloc || aoff < off || alen > len)) + xfs_inode_set_cowblocks_tag(ip); + + return 0; + +out_unreserve_frextents: + if (XFS_IS_REALTIME_INODE(ip)) + xfs_add_frextents(mp, xfs_blen_to_rtbxlen(mp, alen)); +out_unreserve_quota: + if (XFS_IS_QUOTA_ON(mp)) + xfs_quota_unreserve_blkres(ip, alen); +out: + if (error == -ENOSPC || error == -EDQUOT) { + trace_xfs_delalloc_enospc(ip, off, len); + + if (prealloc || use_cowextszhint) { + /* retry without any preallocation */ + use_cowextszhint = false; + prealloc = 0; + goto retry; + } + } + return error; +} + +static int +xfs_zoned_buffered_write_iomap_begin( + struct inode *inode, + loff_t offset, + loff_t count, + unsigned flags, + struct iomap *iomap, + struct iomap *srcmap) +{ + struct iomap_iter *iter = + container_of(iomap, struct iomap_iter, iomap); + struct xfs_zone_alloc_ctx *ac = iter->private; + struct xfs_inode *ip = XFS_I(inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSBT(mp, offset); + xfs_fileoff_t end_fsb = xfs_iomap_end_fsb(mp, offset, count); + u16 iomap_flags = IOMAP_F_SHARED; + unsigned int lockmode = XFS_ILOCK_EXCL; + xfs_filblks_t count_fsb; + xfs_extlen_t indlen; + struct xfs_bmbt_irec got; + struct xfs_iext_cursor icur; + int error = 0; + + ASSERT(!xfs_get_extsz_hint(ip)); + ASSERT(!(flags & IOMAP_UNSHARE)); + ASSERT(ac); + + if (xfs_is_shutdown(mp)) + return -EIO; + + error = xfs_qm_dqattach(ip); + if (error) + return error; + + error = xfs_ilock_for_iomap(ip, flags, &lockmode); + if (error) + return error; + + if (XFS_IS_CORRUPT(mp, !xfs_ifork_has_extents(&ip->i_df)) || + XFS_TEST_ERROR(false, mp, XFS_ERRTAG_BMAPIFORMAT)) { + xfs_bmap_mark_sick(ip, XFS_DATA_FORK); + error = -EFSCORRUPTED; + goto out_unlock; + } + + XFS_STATS_INC(mp, xs_blk_mapw); + + error = xfs_iread_extents(NULL, ip, XFS_DATA_FORK); + if (error) + goto out_unlock; + + /* + * For zeroing operations check if there is any data to zero first. + * + * For regular writes we always need to allocate new blocks, but need to + * provide the source mapping when the range is unaligned to support + * read-modify-write of the whole block in the page cache. + * + * In either case we need to limit the reported range to the boundaries + * of the source map in the data fork. + */ + if (!IS_ALIGNED(offset, mp->m_sb.sb_blocksize) || + !IS_ALIGNED(offset + count, mp->m_sb.sb_blocksize) || + (flags & IOMAP_ZERO)) { + struct xfs_bmbt_irec smap; + struct xfs_iext_cursor scur; + + if (!xfs_iext_lookup_extent(ip, &ip->i_df, offset_fsb, &scur, + &smap)) + smap.br_startoff = end_fsb; /* fake hole until EOF */ + if (smap.br_startoff > offset_fsb) { + /* + * We never need to allocate blocks for zeroing a hole. 
*/ + if (flags & IOMAP_ZERO) { + xfs_hole_to_iomap(ip, iomap, offset_fsb, + smap.br_startoff); + goto out_unlock; + } + end_fsb = min(end_fsb, smap.br_startoff); + } else { + end_fsb = min(end_fsb, + smap.br_startoff + smap.br_blockcount); + xfs_trim_extent(&smap, offset_fsb, + end_fsb - offset_fsb); + error = xfs_bmbt_to_iomap(ip, srcmap, &smap, flags, 0, + xfs_iomap_inode_sequence(ip, 0)); + if (error) + goto out_unlock; + } + } + + if (!ip->i_cowfp) + xfs_ifork_init_cow(ip); + + if (!xfs_iext_lookup_extent(ip, ip->i_cowfp, offset_fsb, &icur, &got)) + got.br_startoff = end_fsb; + if (got.br_startoff <= offset_fsb) { + trace_xfs_reflink_cow_found(ip, &got); + goto done; + } + + /* + * Cap the maximum length to keep the chunks of work done here somewhat + * symmetric with the work writeback does. + */ + end_fsb = min(end_fsb, got.br_startoff); + count_fsb = min3(end_fsb - offset_fsb, XFS_MAX_BMBT_EXTLEN, + XFS_B_TO_FSB(mp, 1024 * PAGE_SIZE)); + + /* + * The block reservation is supposed to cover all blocks that the + * operation could possibly write, but there is a nasty corner case + * where blocks could be stolen from underneath us: + * + * 1) while this thread iterates over a larger buffered write, + * 2) another thread is causing a write fault that calls into + * ->page_mkwrite in the range this thread writes to, using up the + * delalloc reservation created by a previous call to this function. + * 3) another thread does direct I/O on the range that the write fault + * happened on, which causes writeback of the dirty data. + * 4) this then sets the stale flag, which cuts the current iomap + * iteration short, causing the new call to ->iomap_begin that gets + * us here again, but now without a sufficient reservation. + * + * This is a very unusual I/O pattern, and nothing but generic/095 is + * known to hit it. There's not really much we can do here, so turn this + * into a short write.
+ */ + if (count_fsb > ac->reserved_blocks) { + xfs_warn_ratelimited(mp, +"Short write on ino 0x%llx comm %.20s due to three-way race with write fault and direct I/O", + ip->i_ino, current->comm); + count_fsb = ac->reserved_blocks; + if (!count_fsb) { + error = -EIO; + goto out_unlock; + } + } + + error = xfs_quota_reserve_blkres(ip, count_fsb); + if (error) + goto out_unlock; + + indlen = xfs_bmap_worst_indlen(ip, count_fsb); + error = xfs_dec_fdblocks(mp, indlen, false); + if (error) + goto out_unlock; + ip->i_delayed_blks += count_fsb; + xfs_mod_delalloc(ip, count_fsb, indlen); + + got.br_startoff = offset_fsb; + got.br_startblock = nullstartblock(indlen); + got.br_blockcount = count_fsb; + got.br_state = XFS_EXT_NORM; + xfs_bmap_add_extent_hole_delay(ip, XFS_COW_FORK, &icur, &got); + ac->reserved_blocks -= count_fsb; + iomap_flags |= IOMAP_F_NEW; + + trace_xfs_iomap_alloc(ip, offset, XFS_FSB_TO_B(mp, count_fsb), + XFS_COW_FORK, &got); +done: + error = xfs_bmbt_to_iomap(ip, iomap, &got, flags, iomap_flags, + xfs_iomap_inode_sequence(ip, IOMAP_F_SHARED)); +out_unlock: + xfs_iunlock(ip, lockmode); + return error; +} + static int xfs_buffered_write_iomap_begin( struct inode *inode, @@ -1013,6 +1708,10 @@ xfs_buffered_write_iomap_begin( if (xfs_is_shutdown(mp)) return -EIO; + if (xfs_is_zoned_inode(ip)) + return xfs_zoned_buffered_write_iomap_begin(inode, offset, + count, flags, iomap, srcmap); + /* we can't use delayed allocations when using extent size hints */ if (xfs_get_extsz_hint(ip)) return xfs_direct_write_iomap_begin(inode, offset, count, @@ -1245,10 +1944,13 @@ xfs_buffered_write_delalloc_punch( loff_t length, struct iomap *iomap) { + struct iomap_iter *iter = + container_of(iomap, struct iomap_iter, iomap); + xfs_bmap_punch_delalloc_range(XFS_I(inode), (iomap->flags & IOMAP_F_SHARED) ? 
XFS_COW_FORK : XFS_DATA_FORK, - offset, offset + length); + offset, offset + length, iter->private); } static int @@ -1485,6 +2187,7 @@ xfs_zero_range( struct xfs_inode *ip, loff_t pos, loff_t len, + struct xfs_zone_alloc_ctx *ac, bool *did_zero) { struct inode *inode = VFS_I(ip); @@ -1495,13 +2198,14 @@ xfs_zero_range( return dax_zero_range(inode, pos, len, did_zero, &xfs_dax_write_iomap_ops); return iomap_zero_range(inode, pos, len, did_zero, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, ac); } int xfs_truncate_page( struct xfs_inode *ip, loff_t pos, + struct xfs_zone_alloc_ctx *ac, bool *did_zero) { struct inode *inode = VFS_I(ip); @@ -1510,5 +2214,5 @@ xfs_truncate_page( return dax_truncate_page(inode, pos, did_zero, &xfs_dax_write_iomap_ops); return iomap_truncate_page(inode, pos, did_zero, - &xfs_buffered_write_iomap_ops); + &xfs_buffered_write_iomap_ops, ac); } diff --git a/fs/xfs/xfs_iomap.h b/fs/xfs/xfs_iomap.h index 8347268af727..674f8ac1b9bd 100644 --- a/fs/xfs/xfs_iomap.h +++ b/fs/xfs/xfs_iomap.h @@ -10,6 +10,7 @@ struct xfs_inode; struct xfs_bmbt_irec; +struct xfs_zone_alloc_ctx; int xfs_iomap_write_direct(struct xfs_inode *ip, xfs_fileoff_t offset_fsb, xfs_fileoff_t count_fsb, unsigned int flags, @@ -24,8 +25,9 @@ int xfs_bmbt_to_iomap(struct xfs_inode *ip, struct iomap *iomap, u16 iomap_flags, u64 sequence_cookie); int xfs_zero_range(struct xfs_inode *ip, loff_t pos, loff_t len, - bool *did_zero); -int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, bool *did_zero); + struct xfs_zone_alloc_ctx *ac, bool *did_zero); +int xfs_truncate_page(struct xfs_inode *ip, loff_t pos, + struct xfs_zone_alloc_ctx *ac, bool *did_zero); static inline xfs_filblks_t xfs_aligned_fsb_count( @@ -49,9 +51,11 @@ xfs_aligned_fsb_count( extern const struct iomap_ops xfs_buffered_write_iomap_ops; extern const struct iomap_ops xfs_direct_write_iomap_ops; +extern const struct iomap_ops xfs_zoned_direct_write_iomap_ops; extern const struct iomap_ops xfs_read_iomap_ops; extern const struct iomap_ops xfs_seek_iomap_ops; extern const struct iomap_ops xfs_xattr_iomap_ops; extern const struct iomap_ops xfs_dax_write_iomap_ops; +extern const struct iomap_ops xfs_atomic_write_cow_iomap_ops; #endif /* __XFS_IOMAP_H__*/ diff --git a/fs/xfs/xfs_iops.c b/fs/xfs/xfs_iops.c index 40289fe6f5b2..8cddbb7c149b 100644 --- a/fs/xfs/xfs_iops.c +++ b/fs/xfs/xfs_iops.c @@ -29,6 +29,7 @@ #include "xfs_xattr.h" #include "xfs_file.h" #include "xfs_bmap.h" +#include "xfs_zone_alloc.h" #include <linux/posix_acl.h> #include <linux/security.h> @@ -298,14 +299,14 @@ xfs_vn_create( return xfs_generic_create(idmap, dir, dentry, mode, 0, NULL); } -STATIC int +STATIC struct dentry * xfs_vn_mkdir( struct mnt_idmap *idmap, struct inode *dir, struct dentry *dentry, umode_t mode) { - return xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL); + return ERR_PTR(xfs_generic_create(idmap, dir, dentry, mode | S_IFDIR, 0, NULL)); } STATIC struct dentry * @@ -600,16 +601,82 @@ xfs_report_dioalign( stat->dio_offset_align = stat->dio_read_offset_align; } +unsigned int +xfs_get_atomic_write_min( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + /* + * If we can complete an atomic write via atomic out of place writes, + * then advertise a minimum size of one fsblock. Without this + * mechanism, we can only guarantee atomic writes up to a single LBA. 
+ * + * If out of place writes are not available, we can guarantee an atomic + * write of exactly one single fsblock if the bdev will make that + * guarantee for us. + */ + if (xfs_inode_can_hw_atomic_write(ip) || xfs_can_sw_atomic_write(mp)) + return mp->m_sb.sb_blocksize; + + return 0; +} + +unsigned int +xfs_get_atomic_write_max( + struct xfs_inode *ip) +{ + struct xfs_mount *mp = ip->i_mount; + + /* + * If out of place writes are not available, we can guarantee an atomic + * write of exactly one single fsblock if the bdev will make that + * guarantee for us. + */ + if (!xfs_can_sw_atomic_write(mp)) { + if (xfs_inode_can_hw_atomic_write(ip)) + return mp->m_sb.sb_blocksize; + return 0; + } + + /* + * If we can complete an atomic write via atomic out of place writes, + * then advertise a maximum size of whatever we can complete through + * that means. Hardware support is reported via max_opt, not here. + */ + if (XFS_IS_REALTIME_INODE(ip)) + return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_RTG].awu_max); + return XFS_FSB_TO_B(mp, mp->m_groups[XG_TYPE_AG].awu_max); +} + +unsigned int +xfs_get_atomic_write_max_opt( + struct xfs_inode *ip) +{ + unsigned int awu_max = xfs_get_atomic_write_max(ip); + + /* if the max is 1x block, then just keep behaviour that opt is 0 */ + if (awu_max <= ip->i_mount->m_sb.sb_blocksize) + return 0; + + /* + * Advertise the maximum size of an atomic write that we can tell the + * block device to perform for us. In general the bdev limit will be + * less than our out of place write limit, but we don't want to exceed + * the awu_max. + */ + return min(awu_max, xfs_inode_buftarg(ip)->bt_bdev_awu_max); +} + static void xfs_report_atomic_write( struct xfs_inode *ip, struct kstat *stat) { - unsigned int unit_min = 0, unit_max = 0; - - if (xfs_inode_can_atomicwrite(ip)) - unit_min = unit_max = ip->i_mount->m_sb.sb_blocksize; - generic_fill_statx_atomic_writes(stat, unit_min, unit_max); + generic_fill_statx_atomic_writes(stat, + xfs_get_atomic_write_min(ip), + xfs_get_atomic_write_max(ip), + xfs_get_atomic_write_max_opt(ip)); } STATIC int @@ -854,6 +921,7 @@ xfs_setattr_size( uint lock_flags = 0; uint resblks = 0; bool did_zeroing = false; + struct xfs_zone_alloc_ctx ac = { }; xfs_assert_ilocked(ip, XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL); ASSERT(S_ISREG(inode->i_mode)); @@ -890,6 +958,28 @@ xfs_setattr_size( inode_dio_wait(inode); /* + * Normally xfs_zoned_space_reserve is supposed to be called outside the + * IOLOCK. For truncate we can't do that since ->setattr is called with + * it already held by the VFS. So for now chicken out and try to + * allocate space under it. + * + * To avoid deadlocks this means we can't block waiting for space, which + * can lead to spurious -ENOSPC if there are no directly available + * blocks. We mitigate this a bit by allowing zeroing to dip into the + * reserved pool, but eventually the VFS calling convention needs to + * change. + */ + if (xfs_is_zoned_inode(ip)) { + error = xfs_zoned_space_reserve(ip, 1, + XFS_ZR_NOWAIT | XFS_ZR_RESERVED, &ac); + if (error) { + if (error == -EAGAIN) + return -ENOSPC; + return error; + } + } + + /* * File data changes must be complete before we start the transaction to * modify the inode. 
This needs to be done before joining the inode to * the transaction because the inode cannot be unlocked once it is a @@ -902,11 +992,14 @@ xfs_setattr_size( if (newsize > oldsize) { trace_xfs_zero_eof(ip, oldsize, newsize - oldsize); error = xfs_zero_range(ip, oldsize, newsize - oldsize, - &did_zeroing); + &ac, &did_zeroing); } else { - error = xfs_truncate_page(ip, newsize, &did_zeroing); + error = xfs_truncate_page(ip, newsize, &ac, &did_zeroing); } + if (xfs_is_zoned_inode(ip)) + xfs_zoned_space_unreserve(ip, &ac); + if (error) return error; diff --git a/fs/xfs/xfs_iops.h b/fs/xfs/xfs_iops.h index 3c1a2605ffd2..0896f6b8b3b8 100644 --- a/fs/xfs/xfs_iops.h +++ b/fs/xfs/xfs_iops.h @@ -19,5 +19,8 @@ int xfs_inode_init_security(struct inode *inode, struct inode *dir, extern void xfs_setup_inode(struct xfs_inode *ip); extern void xfs_setup_iops(struct xfs_inode *ip); extern void xfs_diflags_to_iflags(struct xfs_inode *ip, bool init); +unsigned int xfs_get_atomic_write_min(struct xfs_inode *ip); +unsigned int xfs_get_atomic_write_max(struct xfs_inode *ip); +unsigned int xfs_get_atomic_write_max_opt(struct xfs_inode *ip); #endif /* __XFS_IOPS_H__ */ diff --git a/fs/xfs/xfs_log.c b/fs/xfs/xfs_log.c index f8851ff835de..793468b4d30d 100644 --- a/fs/xfs/xfs_log.c +++ b/fs/xfs/xfs_log.c @@ -20,6 +20,7 @@ #include "xfs_sysfs.h" #include "xfs_sb.h" #include "xfs_health.h" +#include "xfs_zone_alloc.h" struct kmem_cache *xfs_log_ticket_cache; @@ -1606,27 +1607,6 @@ xlog_bio_end_io( &iclog->ic_end_io_work); } -static int -xlog_map_iclog_data( - struct bio *bio, - void *data, - size_t count) -{ - do { - struct page *page = kmem_to_page(data); - unsigned int off = offset_in_page(data); - size_t len = min_t(size_t, count, PAGE_SIZE - off); - - if (bio_add_page(bio, page, len, off) != len) - return -EIO; - - data += len; - count -= len; - } while (count); - - return 0; -} - STATIC void xlog_write_iclog( struct xlog *log, @@ -1692,11 +1672,12 @@ xlog_write_iclog( iclog->ic_flags &= ~(XLOG_ICL_NEED_FLUSH | XLOG_ICL_NEED_FUA); - if (xlog_map_iclog_data(&iclog->ic_bio, iclog->ic_data, count)) - goto shutdown; - - if (is_vmalloc_addr(iclog->ic_data)) - flush_kernel_vmap_range(iclog->ic_data, count); + if (is_vmalloc_addr(iclog->ic_data)) { + if (!bio_add_vmalloc(&iclog->ic_bio, iclog->ic_data, count)) + goto shutdown; + } else { + bio_add_virt_nofail(&iclog->ic_bio, iclog->ic_data, count); + } /* * If this log buffer would straddle the end of the log we will have @@ -2887,7 +2868,7 @@ xlog_force_and_check_iclog( * * 1. the current iclog is active and has no data; the previous iclog * is in the active or dirty state. - * 2. the current iclog is drity, and the previous iclog is in the + * 2. the current iclog is dirty, and the previous iclog is in the * active or dirty state. * * We may sleep if: @@ -3540,6 +3521,9 @@ xlog_force_shutdown( spin_unlock(&log->l_icloglock); wake_up_var(&log->l_opstate); + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(log->l_mp)) + xfs_zoned_wake_all(log->l_mp); + return log_error; } diff --git a/fs/xfs/xfs_log_cil.c b/fs/xfs/xfs_log_cil.c index 1ca406ec1b40..f66d2d430e4f 100644 --- a/fs/xfs/xfs_log_cil.c +++ b/fs/xfs/xfs_log_cil.c @@ -309,9 +309,7 @@ xlog_cil_alloc_shadow_bufs( * Then round nbytes up to 64-bit alignment so that the initial * buffer alignment is easy to calculate and verify. 
*/ - nbytes += niovecs * - (sizeof(uint64_t) + sizeof(struct xlog_op_header)); - nbytes = round_up(nbytes, sizeof(uint64_t)); + nbytes = xlog_item_space(niovecs, nbytes); /* * The data buffer needs to start 64-bit aligned, so round up diff --git a/fs/xfs/xfs_log_priv.h b/fs/xfs/xfs_log_priv.h index f3d78869e5e5..39a102cc1b43 100644 --- a/fs/xfs/xfs_log_priv.h +++ b/fs/xfs/xfs_log_priv.h @@ -698,4 +698,17 @@ xlog_kvmalloc( return p; } +/* + * Given a count of iovecs and space for a log item, compute the space we need + * in the log to store that data plus the log headers. + */ +static inline unsigned int +xlog_item_space( + unsigned int niovecs, + unsigned int nbytes) +{ + nbytes += niovecs * (sizeof(uint64_t) + sizeof(struct xlog_op_header)); + return round_up(nbytes, sizeof(uint64_t)); +} + #endif /* __XFS_LOG_PRIV_H__ */ diff --git a/fs/xfs/xfs_log_recover.c b/fs/xfs/xfs_log_recover.c index b3c27dbccce8..2f76531842f8 100644 --- a/fs/xfs/xfs_log_recover.c +++ b/fs/xfs/xfs_log_recover.c @@ -3380,7 +3380,7 @@ xlog_do_recover( */ xfs_buf_lock(bp); xfs_buf_hold(bp); - error = _xfs_buf_read(bp, XBF_READ); + error = _xfs_buf_read(bp); if (error) { if (!xlog_is_shutdown(log)) { xfs_buf_ioerror_alert(bp, __this_address); diff --git a/fs/xfs/xfs_message.c b/fs/xfs/xfs_message.c index 6ed485ff2756..19aba2c3d525 100644 --- a/fs/xfs/xfs_message.c +++ b/fs/xfs/xfs_message.c @@ -141,14 +141,6 @@ xfs_warn_experimental( const char *name; long opstate; } features[] = { - [XFS_EXPERIMENTAL_PNFS] = { - .opstate = XFS_OPSTATE_WARNED_PNFS, - .name = "pNFS", - }, - [XFS_EXPERIMENTAL_SCRUB] = { - .opstate = XFS_OPSTATE_WARNED_SCRUB, - .name = "online scrub", - }, [XFS_EXPERIMENTAL_SHRINK] = { .opstate = XFS_OPSTATE_WARNED_SHRINK, .name = "online shrink", @@ -161,18 +153,14 @@ xfs_warn_experimental( .opstate = XFS_OPSTATE_WARNED_LBS, .name = "large block size", }, - [XFS_EXPERIMENTAL_EXCHRANGE] = { - .opstate = XFS_OPSTATE_WARNED_EXCHRANGE, - .name = "exchange range", - }, - [XFS_EXPERIMENTAL_PPTR] = { - .opstate = XFS_OPSTATE_WARNED_PPTR, - .name = "parent pointer", - }, [XFS_EXPERIMENTAL_METADIR] = { .opstate = XFS_OPSTATE_WARNED_METADIR, .name = "metadata directory tree", }, + [XFS_EXPERIMENTAL_ZONED] = { + .opstate = XFS_OPSTATE_WARNED_ZONED, + .name = "zoned RT device", + }, }; ASSERT(feat >= 0 && feat < XFS_EXPERIMENTAL_MAX); BUILD_BUG_ON(ARRAY_SIZE(features) != XFS_EXPERIMENTAL_MAX); diff --git a/fs/xfs/xfs_message.h b/fs/xfs/xfs_message.h index 7fb36ced9df7..d68e72379f9d 100644 --- a/fs/xfs/xfs_message.h +++ b/fs/xfs/xfs_message.h @@ -91,14 +91,11 @@ void xfs_buf_alert_ratelimited(struct xfs_buf *bp, const char *rlmsg, const char *fmt, ...); enum xfs_experimental_feat { - XFS_EXPERIMENTAL_PNFS, - XFS_EXPERIMENTAL_SCRUB, XFS_EXPERIMENTAL_SHRINK, XFS_EXPERIMENTAL_LARP, XFS_EXPERIMENTAL_LBS, - XFS_EXPERIMENTAL_EXCHRANGE, - XFS_EXPERIMENTAL_PPTR, XFS_EXPERIMENTAL_METADIR, + XFS_EXPERIMENTAL_ZONED, XFS_EXPERIMENTAL_MAX, }; diff --git a/fs/xfs/xfs_mount.c b/fs/xfs/xfs_mount.c index 477c5262cf91..29276fe60df9 100644 --- a/fs/xfs/xfs_mount.c +++ b/fs/xfs/xfs_mount.c @@ -40,6 +40,7 @@ #include "xfs_rtrmap_btree.h" #include "xfs_rtrefcount_btree.h" #include "scrub/stats.h" +#include "xfs_zone_alloc.h" static DEFINE_MUTEX(xfs_uuid_table_mutex); static int xfs_uuid_table_size; @@ -181,14 +182,11 @@ xfs_readsb( /* * Allocate a (locked) buffer to hold the superblock. This will be kept - * around at all times to optimize access to the superblock. 
Therefore, - * set XBF_NO_IOACCT to make sure it doesn't hold the buftarg count - * elevated. + * around at all times to optimize access to the superblock. */ reread: error = xfs_buf_read_uncached(mp->m_ddev_targp, XFS_SB_DADDR, - BTOBB(sector_size), XBF_NO_IOACCT, &bp, - buf_ops); + BTOBB(sector_size), &bp, buf_ops); if (error) { if (loud) xfs_warn(mp, "SB validate failed with error %d.", error); @@ -416,7 +414,7 @@ xfs_check_sizes( } error = xfs_buf_read_uncached(mp->m_ddev_targp, d - XFS_FSS_TO_BB(mp, 1), - XFS_FSS_TO_BB(mp, 1), 0, &bp, NULL); + XFS_FSS_TO_BB(mp, 1), &bp, NULL); if (error) { xfs_warn(mp, "last sector read failed"); return error; @@ -433,7 +431,7 @@ xfs_check_sizes( } error = xfs_buf_read_uncached(mp->m_logdev_targp, d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + XFS_FSB_TO_BB(mp, 1), &bp, NULL); if (error) { xfs_warn(mp, "log device read failed"); return error; @@ -464,22 +462,38 @@ xfs_mount_reset_sbqflags( return xfs_sync_sb(mp, false); } +static const char *const xfs_free_pool_name[] = { + [XC_FREE_BLOCKS] = "free blocks", + [XC_FREE_RTEXTENTS] = "free rt extents", + [XC_FREE_RTAVAILABLE] = "available rt extents", +}; + uint64_t -xfs_default_resblks(xfs_mount_t *mp) +xfs_default_resblks( + struct xfs_mount *mp, + enum xfs_free_counter ctr) { - uint64_t resblks; - - /* - * We default to 5% or 8192 fsbs of space reserved, whichever is - * smaller. This is intended to cover concurrent allocation - * transactions when we initially hit enospc. These each require a 4 - * block reservation. Hence by default we cover roughly 2000 concurrent - * allocation reservations. - */ - resblks = mp->m_sb.sb_dblocks; - do_div(resblks, 20); - resblks = min_t(uint64_t, resblks, 8192); - return resblks; + switch (ctr) { + case XC_FREE_BLOCKS: + /* + * Default to 5% or 8192 FSBs of space reserved, whichever is + * smaller. + * + * This is intended to cover concurrent allocation transactions + * when we initially hit ENOSPC. These each require a 4 block + * reservation. Hence by default we cover roughly 2000 + * concurrent allocation reservations. + */ + return min(div_u64(mp->m_sb.sb_dblocks, 20), 8192ULL); + case XC_FREE_RTEXTENTS: + case XC_FREE_RTAVAILABLE: + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) + return xfs_zoned_default_resblks(mp, ctr); + return 0; + default: + ASSERT(0); + return 0; + } } /* Ensure the summary counts are correct. */ @@ -546,7 +560,7 @@ xfs_check_summary_counts( * If we're mounting the rt volume after recovering the log, recompute * frextents from the rtbitmap file to fix the inconsistency. */ - if (xfs_has_realtime(mp) && !xfs_is_clean(mp)) { + if (xfs_has_realtime(mp) && !xfs_has_zoned(mp) && !xfs_is_clean(mp)) { error = xfs_rtalloc_reinit_frextents(mp); if (error) return error; @@ -652,6 +666,158 @@ xfs_agbtree_compute_maxlevels( mp->m_agbtree_maxlevels = max(levels, mp->m_refc_maxlevels); } +/* Maximum atomic write IO size that the kernel allows. */ +static inline xfs_extlen_t xfs_calc_atomic_write_max(struct xfs_mount *mp) +{ + return rounddown_pow_of_two(XFS_B_TO_FSB(mp, MAX_RW_COUNT)); +} + +static inline unsigned int max_pow_of_two_factor(const unsigned int nr) +{ + return 1 << (ffs(nr) - 1); +} + +/* + * If the data device advertises atomic write support, limit the size of data + * device atomic writes to the greatest power-of-two factor of the AG size so + * that every atomic write unit aligns with the start of every AG. 
This is + required so that the per-AG allocations for an atomic write will always be + aligned compatibly with the alignment requirements of the storage. + * + * If the data device doesn't advertise atomic writes, then there are no + * alignment restrictions and the largest out-of-place write we can do + * ourselves is the number of blocks that user files can allocate from any AG. + */ +static inline xfs_extlen_t xfs_calc_perag_awu_max(struct xfs_mount *mp) +{ + if (mp->m_ddev_targp->bt_bdev_awu_min > 0) + return max_pow_of_two_factor(mp->m_sb.sb_agblocks); + return rounddown_pow_of_two(mp->m_ag_max_usable); +} + +/* + * Reflink on the realtime device requires rtgroups, and atomic writes require + * reflink. + * + * If the realtime device advertises atomic write support, limit the size of + * realtime device atomic writes to the greatest power-of-two factor of the rtgroup + * size so that every atomic write unit aligns with the start of every rtgroup. + * This is required so that the per-rtgroup allocations for an atomic write + * will always be aligned compatibly with the alignment requirements of the + * storage. + * + * If the rt device doesn't advertise atomic writes, then there are no + * alignment restrictions and the largest out-of-place write we can do + * ourselves is the number of blocks that user files can allocate from any + * rtgroup. + */ +static inline xfs_extlen_t xfs_calc_rtgroup_awu_max(struct xfs_mount *mp) +{ + struct xfs_groups *rgs = &mp->m_groups[XG_TYPE_RTG]; + + if (rgs->blocks == 0) + return 0; + if (mp->m_rtdev_targp && mp->m_rtdev_targp->bt_bdev_awu_min > 0) + return max_pow_of_two_factor(rgs->blocks); + return rounddown_pow_of_two(rgs->blocks); +} + +/* Compute the maximum atomic write unit size for each section. */ +static inline void +xfs_calc_atomic_write_unit_max( + struct xfs_mount *mp) +{ + struct xfs_groups *ags = &mp->m_groups[XG_TYPE_AG]; + struct xfs_groups *rgs = &mp->m_groups[XG_TYPE_RTG]; + + const xfs_extlen_t max_write = xfs_calc_atomic_write_max(mp); + const xfs_extlen_t max_ioend = xfs_reflink_max_atomic_cow(mp); + const xfs_extlen_t max_agsize = xfs_calc_perag_awu_max(mp); + const xfs_extlen_t max_rgsize = xfs_calc_rtgroup_awu_max(mp); + + ags->awu_max = min3(max_write, max_ioend, max_agsize); + rgs->awu_max = min3(max_write, max_ioend, max_rgsize); + + trace_xfs_calc_atomic_write_unit_max(mp, max_write, max_ioend, + max_agsize, max_rgsize); +} + +/* + * Try to set the atomic write maximum to a new value that we got from + * userspace via the mount option.
+ */ +int +xfs_set_max_atomic_write_opt( + struct xfs_mount *mp, + unsigned long long new_max_bytes) +{ + const xfs_filblks_t new_max_fsbs = XFS_B_TO_FSBT(mp, new_max_bytes); + const xfs_extlen_t max_write = xfs_calc_atomic_write_max(mp); + const xfs_extlen_t max_group = + max(mp->m_groups[XG_TYPE_AG].blocks, + mp->m_groups[XG_TYPE_RTG].blocks); + const xfs_extlen_t max_group_write = + max(xfs_calc_perag_awu_max(mp), xfs_calc_rtgroup_awu_max(mp)); + int error; + + if (new_max_bytes == 0) + goto set_limit; + + ASSERT(max_write <= U32_MAX); + + /* generic_atomic_write_valid enforces power of two length */ + if (!is_power_of_2(new_max_bytes)) { + xfs_warn(mp, + "max atomic write size of %llu bytes is not a power of 2", + new_max_bytes); + return -EINVAL; + } + + if (new_max_bytes & mp->m_blockmask) { + xfs_warn(mp, + "max atomic write size of %llu bytes not aligned with fsblock", + new_max_bytes); + return -EINVAL; + } + + if (new_max_fsbs > max_write) { + xfs_warn(mp, + "max atomic write size of %lluk cannot be larger than max write size %lluk", + new_max_bytes >> 10, + XFS_FSB_TO_B(mp, max_write) >> 10); + return -EINVAL; + } + + if (new_max_fsbs > max_group) { + xfs_warn(mp, + "max atomic write size of %lluk cannot be larger than allocation group size %lluk", + new_max_bytes >> 10, + XFS_FSB_TO_B(mp, max_group) >> 10); + return -EINVAL; + } + + if (new_max_fsbs > max_group_write) { + xfs_warn(mp, + "max atomic write size of %lluk cannot be larger than max allocation group write size %lluk", + new_max_bytes >> 10, + XFS_FSB_TO_B(mp, max_group_write) >> 10); + return -EINVAL; + } + +set_limit: + error = xfs_calc_atomic_write_reservation(mp, new_max_fsbs); + if (error) { + xfs_warn(mp, + "cannot support completing atomic writes of %lluk", + new_max_bytes >> 10); + return error; + } + + xfs_calc_atomic_write_unit_max(mp); + mp->m_awu_max_bytes = new_max_bytes; + return 0; +} + /* Compute maximum possible height for realtime btree types for this fs. */ static inline void xfs_rtbtree_compute_maxlevels( @@ -681,6 +847,7 @@ xfs_mountfs( uint quotamount = 0; uint quotaflags = 0; int error = 0; + int i; xfs_sb_mount_common(mp, sbp); @@ -750,27 +917,15 @@ xfs_mountfs( /* enable fail_at_unmount as default */ mp->m_fail_unmount = true; - super_set_sysfs_name_id(mp->m_super); - - error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, - NULL, mp->m_super->s_id); - if (error) - goto out; - - error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype, - &mp->m_kobj, "stats"); + error = xfs_mount_sysfs_init(mp); if (error) - goto out_remove_sysfs; + goto out_remove_scrub_stats; xchk_stats_register(mp->m_scrub_stats, mp->m_debugfs); - error = xfs_error_sysfs_init(mp); - if (error) - goto out_remove_scrub_stats; - error = xfs_errortag_init(mp); if (error) - goto out_remove_error_sysfs; + goto out_remove_sysfs; error = xfs_uuid_mount(mp); if (error) @@ -1034,6 +1189,12 @@ xfs_mountfs( if (xfs_is_readonly(mp) && !xfs_has_norecovery(mp)) xfs_log_clean(mp); + if (xfs_has_zoned(mp)) { + error = xfs_mount_zones(mp); + if (error) + goto out_rtunmount; + } + /* * Complete the quota initialisation, post-log-replay component. */ @@ -1049,29 +1210,46 @@ xfs_mountfs( * privileged transactions. This is needed so that transaction * space required for critical operations can dip into this pool * when at ENOSPC. This is needed for operations like create with - * attr, unwritten extent conversion at ENOSPC, etc. Data allocations - * are not allowed to use this reserved space. 
+ * attr, unwritten extent conversion at ENOSPC, garbage collection + * etc. Data allocations are not allowed to use this reserved space. * * This may drive us straight to ENOSPC on mount, but that implies * we were already there on the last unmount. Warn if this occurs. */ if (!xfs_is_readonly(mp)) { - error = xfs_reserve_blocks(mp, xfs_default_resblks(mp)); - if (error) - xfs_warn(mp, - "Unable to allocate reserve blocks. Continuing without reserve pool."); + for (i = 0; i < XC_FREE_NR; i++) { + error = xfs_reserve_blocks(mp, i, + xfs_default_resblks(mp, i)); + if (error) + xfs_warn(mp, +"Unable to allocate reserve blocks. Continuing without reserve pool for %s.", + xfs_free_pool_name[i]); + } /* Reserve AG blocks for future btree expansion. */ error = xfs_fs_reserve_ag_blocks(mp); if (error && error != -ENOSPC) goto out_agresv; + + xfs_zone_gc_start(mp); } + /* + * Pre-calculate atomic write unit max. This involves computations + * derived from transaction reservations, so we must do this after the + * log is fully initialized. + */ + error = xfs_set_max_atomic_write_opt(mp, mp->m_awu_max_bytes); + if (error) + goto out_agresv; + return 0; out_agresv: xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); + if (xfs_has_zoned(mp)) + xfs_unmount_zones(mp); out_rtunmount: xfs_rtunmount_inodes(mp); out_rele_rip: @@ -1119,13 +1297,10 @@ xfs_mountfs( xfs_uuid_unmount(mp); out_remove_errortag: xfs_errortag_del(mp); - out_remove_error_sysfs: - xfs_error_sysfs_del(mp); + out_remove_sysfs: + xfs_mount_sysfs_del(mp); out_remove_scrub_stats: xchk_stats_unregister(mp->m_scrub_stats); - xfs_sysfs_del(&mp->m_stats.xs_kobj); - out_remove_sysfs: - xfs_sysfs_del(&mp->m_kobj); out: return error; } @@ -1151,8 +1326,12 @@ xfs_unmountfs( xfs_inodegc_flush(mp); xfs_blockgc_stop(mp); + if (!test_bit(XFS_OPSTATE_READONLY, &mp->m_opstate)) + xfs_zone_gc_stop(mp); xfs_fs_unreserve_ag_blocks(mp); xfs_qm_unmount_quotas(mp); + if (xfs_has_zoned(mp)) + xfs_unmount_zones(mp); xfs_rtunmount_inodes(mp); xfs_irele(mp->m_rootip); if (mp->m_metadirip) @@ -1176,7 +1355,7 @@ xfs_unmountfs( * we only every apply deltas to the superblock and hence the incore * value does not matter.... */ - error = xfs_reserve_blocks(mp, 0); + error = xfs_reserve_blocks(mp, XC_FREE_BLOCKS, 0); if (error) xfs_warn(mp, "Unable to free reserved block pool. " "Freespace may not be correct on next mount."); @@ -1198,10 +1377,8 @@ xfs_unmountfs( xfs_free_rtgroups(mp, 0, mp->m_sb.sb_rgcount); xfs_free_perag_range(mp, 0, mp->m_sb.sb_agcount); xfs_errortag_del(mp); - xfs_error_sysfs_del(mp); xchk_stats_unregister(mp->m_scrub_stats); - xfs_sysfs_del(&mp->m_stats.xs_kobj); - xfs_sysfs_del(&mp->m_kobj); + xfs_mount_sysfs_del(mp); } /* @@ -1223,52 +1400,67 @@ xfs_fs_writable( return true; } +/* + * Estimate the amount of free space that is not available to userspace and is + * not explicitly reserved from the incore fdblocks. 
This includes: + * + * - The minimum number of blocks needed to support splitting a bmap btree + * - The blocks currently in use by the freespace btrees because they record + * the actual blocks that will fill per-AG metadata space reservations + */ +uint64_t +xfs_freecounter_unavailable( + struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + if (ctr != XC_FREE_BLOCKS) + return 0; + return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); +} + void xfs_add_freecounter( struct xfs_mount *mp, - struct percpu_counter *counter, + enum xfs_free_counter ctr, uint64_t delta) { - bool has_resv_pool = (counter == &mp->m_fdblocks); + struct xfs_freecounter *counter = &mp->m_free[ctr]; uint64_t res_used; /* * If the reserve pool is depleted, put blocks back into it first. * Most of the time the pool is full. */ - if (!has_resv_pool || mp->m_resblks == mp->m_resblks_avail) { - percpu_counter_add(counter, delta); + if (likely(counter->res_avail == counter->res_total)) { + percpu_counter_add(&counter->count, delta); return; } spin_lock(&mp->m_sb_lock); - res_used = mp->m_resblks - mp->m_resblks_avail; + res_used = counter->res_total - counter->res_avail; if (res_used > delta) { - mp->m_resblks_avail += delta; + counter->res_avail += delta; } else { delta -= res_used; - mp->m_resblks_avail = mp->m_resblks; - percpu_counter_add(counter, delta); + counter->res_avail = counter->res_total; + percpu_counter_add(&counter->count, delta); } spin_unlock(&mp->m_sb_lock); } + +/* Adjust in-core free blocks or RT extents. */ int xfs_dec_freecounter( struct xfs_mount *mp, - struct percpu_counter *counter, + enum xfs_free_counter ctr, uint64_t delta, bool rsvd) { - int64_t lcounter; - uint64_t set_aside = 0; + struct xfs_freecounter *counter = &mp->m_free[ctr]; s32 batch; - bool has_resv_pool; - ASSERT(counter == &mp->m_fdblocks || counter == &mp->m_frextents); - has_resv_pool = (counter == &mp->m_fdblocks); - if (rsvd) - ASSERT(has_resv_pool); + ASSERT(ctr < XC_FREE_NR); /* * Taking blocks away, need to be more accurate the closer we @@ -1278,7 +1470,7 @@ xfs_dec_freecounter( * then make everything serialise as we are real close to * ENOSPC. */ - if (__percpu_counter_compare(counter, 2 * XFS_FDBLOCKS_BATCH, + if (__percpu_counter_compare(&counter->count, 2 * XFS_FDBLOCKS_BATCH, XFS_FDBLOCKS_BATCH) < 0) batch = 1; else @@ -1295,34 +1487,34 @@ xfs_dec_freecounter( * problems (i.e. transaction abort, pagecache discards, etc.) than * slightly premature -ENOSPC. */ - if (has_resv_pool) - set_aside = xfs_fdblocks_unavailable(mp); - percpu_counter_add_batch(counter, -((int64_t)delta), batch); - if (__percpu_counter_compare(counter, set_aside, - XFS_FDBLOCKS_BATCH) >= 0) { - /* we had space! */ - return 0; - } - - /* - * lock up the sb for dipping into reserves before releasing the space - * that took us to ENOSPC. - */ - spin_lock(&mp->m_sb_lock); - percpu_counter_add(counter, delta); - if (!has_resv_pool || !rsvd) - goto fdblocks_enospc; - - lcounter = (long long)mp->m_resblks_avail - delta; - if (lcounter >= 0) { - mp->m_resblks_avail = lcounter; + percpu_counter_add_batch(&counter->count, -((int64_t)delta), batch); + if (__percpu_counter_compare(&counter->count, + xfs_freecounter_unavailable(mp, ctr), + XFS_FDBLOCKS_BATCH) < 0) { + /* + * Lock up the sb for dipping into reserves before releasing the + * space that took us to ENOSPC. 
+ */ + spin_lock(&mp->m_sb_lock); + percpu_counter_add(&counter->count, delta); + if (!rsvd) + goto fdblocks_enospc; + if (delta > counter->res_avail) { + if (ctr == XC_FREE_BLOCKS) + xfs_warn_once(mp, +"Reserve blocks depleted! Consider increasing reserve pool size."); + goto fdblocks_enospc; + } + counter->res_avail -= delta; + trace_xfs_freecounter_reserved(mp, ctr, delta, _RET_IP_); spin_unlock(&mp->m_sb_lock); - return 0; } - xfs_warn_once(mp, -"Reserve blocks depleted! Consider increasing reserve pool size."); + + /* we had space! */ + return 0; fdblocks_enospc: + trace_xfs_freecounter_enospc(mp, ctr, delta, _RET_IP_); spin_unlock(&mp->m_sb_lock); return -ENOSPC; } diff --git a/fs/xfs/xfs_mount.h b/fs/xfs/xfs_mount.h index fbed172d6770..d85084f9f317 100644 --- a/fs/xfs/xfs_mount.h +++ b/fs/xfs/xfs_mount.h @@ -98,11 +98,47 @@ struct xfs_groups { uint8_t blklog; /* + * Zoned devices can have gaps between the usable capacity of a zone and + * its end in the LBA/daddr address space. In other words, the hardware + * equivalent to the RT groups already takes care of the power of 2 + * alignment for us. In this case the sparse FSB/RTB address space maps + * 1:1 to the device address space. + */ + bool has_daddr_gaps; + + /* * Mask to extract the group-relative block number from a FSB. * For a pre-rtgroups filesystem we pretend to have one very large * rtgroup, so this mask must be 64-bit. */ uint64_t blkmask; + + /* + * Start of the first group in the device. This is used to support an + * RT device following the data device on the same block device for + * SMR hard drives. + */ + xfs_fsblock_t start_fsb; + + /* + * Maximum length of an atomic write for files stored in this + * collection of allocation groups, in fsblocks. + */ + xfs_extlen_t awu_max; +}; + +struct xfs_freecounter { + /* free blocks for general use: */ + struct percpu_counter count; + + /* total reserved blocks: */ + uint64_t res_total; + + /* available reserved blocks: */ + uint64_t res_avail; + + /* reserved blks @ remount,ro: */ + uint64_t res_saved; }; /* @@ -198,6 +234,12 @@ typedef struct xfs_mount { bool m_fail_unmount; bool m_finobt_nores; /* no per-AG finobt resv. */ bool m_update_sb; /* sb needs update in mount */ + unsigned int m_max_open_zones; + unsigned int m_zonegc_low_space; + struct xfs_mru_cache *m_zone_cache; /* Inode to open zone cache */ + + /* max_atomic_write mount option value */ + unsigned long long m_awu_max_bytes; /* * Bitsets of per-fs metadata that have been checked and/or are sick.
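The xfs_freecounter conversion above replaces the old m_resblks/m_resblks_avail pair with a per-pool reserve (res_total/res_avail) sitting next to each percpu counter. As a reading aid, here is a minimal single-threaded C model of the refill/dip behaviour implemented by xfs_add_freecounter() and xfs_dec_freecounter(); all names in the sketch are illustrative, and the percpu batching, the m_sb_lock locking and the xfs_freecounter_unavailable() set-aside are deliberately omitted.

#include <stdio.h>

/*
 * Single-threaded model of one free-space pool with a reserve, mirroring
 * the logic of xfs_add_freecounter()/xfs_dec_freecounter() above.  The
 * kernel version uses a percpu counter with batched compares under
 * m_sb_lock and subtracts the unavailable set-aside; none of that is
 * modelled here.
 */
struct freecounter {
	long long		count;		/* blocks free for general use */
	unsigned long long	res_total;	/* total reserved blocks */
	unsigned long long	res_avail;	/* reserved blocks still usable */
};

/* Returning blocks: top up a depleted reserve before the general pool. */
static void add_free(struct freecounter *fc, unsigned long long delta)
{
	unsigned long long res_used = fc->res_total - fc->res_avail;

	if (res_used == 0) {			/* common case: reserve full */
		fc->count += delta;
	} else if (res_used > delta) {
		fc->res_avail += delta;
	} else {
		fc->res_avail = fc->res_total;
		fc->count += delta - res_used;
	}
}

/* Taking blocks: only callers passing @rsvd may dip into the reserve. */
static int dec_free(struct freecounter *fc, unsigned long long delta, int rsvd)
{
	fc->count -= (long long)delta;
	if (fc->count >= 0)
		return 0;			/* we had space! */
	fc->count += (long long)delta;		/* undo, then try the reserve */
	if (!rsvd || delta > fc->res_avail)
		return -1;			/* -ENOSPC in the kernel */
	fc->res_avail -= delta;
	return 0;
}

int main(void)
{
	struct freecounter fc = { .count = 4, .res_total = 8, .res_avail = 8 };

	printf("plain alloc of 4: %d\n", dec_free(&fc, 4, 0));	/* ok */
	printf("plain alloc of 2: %d\n", dec_free(&fc, 2, 0));	/* ENOSPC */
	printf("rsvd alloc of 2: %d\n", dec_free(&fc, 2, 1));	/* dips pool */
	add_free(&fc, 5);			/* reserve refilled first */
	printf("count=%lld res_avail=%llu\n", fc.count, fc.res_avail);
	return 0;
}

Note how returning space tops the reserve back up before the general-use counter; this is why the kernel's xfs_add_freecounter() can stay lock-free in the common case and only takes m_sb_lock when res_avail != res_total, i.e. when someone has actually dipped into the reserve.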
@@ -222,8 +264,8 @@ typedef struct xfs_mount { spinlock_t ____cacheline_aligned m_sb_lock; /* sb counter lock */ struct percpu_counter m_icount; /* allocated inodes counter */ struct percpu_counter m_ifree; /* free inodes counter */ - struct percpu_counter m_fdblocks; /* free block counter */ - struct percpu_counter m_frextents; /* free rt extent counter */ + + struct xfs_freecounter m_free[XC_FREE_NR]; /* * Count of data device blocks reserved for delayed allocations, @@ -245,10 +287,8 @@ typedef struct xfs_mount { atomic64_t m_allocbt_blks; struct xfs_groups m_groups[XG_TYPE_MAX]; - uint64_t m_resblks; /* total reserved blocks */ - uint64_t m_resblks_avail;/* available reserved blocks */ - uint64_t m_resblks_save; /* reserved blks @ remount,ro */ struct delayed_work m_reclaim_work; /* background inode reclaim */ + struct xfs_zone_info *m_zone_info; /* zone allocator information */ struct dentry *m_debugfs; /* debugfs parent */ struct xfs_kobj m_kobj; struct xfs_kobj m_error_kobj; @@ -258,10 +298,16 @@ typedef struct xfs_mount { #ifdef CONFIG_XFS_ONLINE_SCRUB_STATS struct xchk_stats *m_scrub_stats; #endif + struct xfs_kobj m_zoned_kobj; xfs_agnumber_t m_agfrotor; /* last ag where space found */ atomic_t m_agirotor; /* last ag dir inode alloced */ atomic_t m_rtgrotor; /* last rtgroup rtpicked */ + struct mutex m_metafile_resv_lock; + uint64_t m_metafile_resv_target; + uint64_t m_metafile_resv_used; + uint64_t m_metafile_resv_avail; + /* Memory shrinker to throttle and reprioritize inodegc */ struct shrinker *m_inodegc_shrinker; /* @@ -336,8 +382,10 @@ typedef struct xfs_mount { #define XFS_FEAT_NREXT64 (1ULL << 26) /* large extent counters */ #define XFS_FEAT_EXCHANGE_RANGE (1ULL << 27) /* exchange range */ #define XFS_FEAT_METADIR (1ULL << 28) /* metadata directory tree */ +#define XFS_FEAT_ZONED (1ULL << 29) /* zoned RT device */ /* Mount features */ +#define XFS_FEAT_NOLIFETIME (1ULL << 47) /* disable lifetime hints */ #define XFS_FEAT_NOATTR2 (1ULL << 48) /* disable attr2 creation */ #define XFS_FEAT_NOALIGN (1ULL << 49) /* ignore alignment */ #define XFS_FEAT_ALLOCSIZE (1ULL << 50) /* user specified allocation size */ @@ -392,6 +440,8 @@ __XFS_HAS_FEAT(needsrepair, NEEDSREPAIR) __XFS_HAS_FEAT(large_extent_counts, NREXT64) __XFS_HAS_FEAT(exchange_range, EXCHANGE_RANGE) __XFS_HAS_FEAT(metadir, METADIR) +__XFS_HAS_FEAT(zoned, ZONED) +__XFS_HAS_FEAT(nolifetime, NOLIFETIME) static inline bool xfs_has_rtgroups(const struct xfs_mount *mp) { @@ -402,7 +452,9 @@ static inline bool xfs_has_rtgroups(const struct xfs_mount *mp) static inline bool xfs_has_rtsb(const struct xfs_mount *mp) { /* all rtgroups filesystems with an rt section have an rtsb */ - return xfs_has_rtgroups(mp) && xfs_has_realtime(mp); + return xfs_has_rtgroups(mp) && + xfs_has_realtime(mp) && + !xfs_has_zoned(mp); } static inline bool xfs_has_rtrmapbt(const struct xfs_mount *mp) @@ -417,6 +469,16 @@ static inline bool xfs_has_rtreflink(const struct xfs_mount *mp) xfs_has_reflink(mp); } +static inline bool xfs_has_nonzoned(const struct xfs_mount *mp) +{ + return !xfs_has_zoned(mp); +} + +static inline bool xfs_can_sw_atomic_write(struct xfs_mount *mp) +{ + return xfs_has_reflink(mp); +} + /* * Some features are always on for v5 file systems, allow the compiler to * eliminiate dead code when building without v4 support. @@ -496,10 +558,6 @@ __XFS_HAS_FEAT(nouuid, NOUUID) */ #define XFS_OPSTATE_BLOCKGC_ENABLED 6 -/* Kernel has logged a warning about pNFS being used on this fs. 
*/ -#define XFS_OPSTATE_WARNED_PNFS 7 -/* Kernel has logged a warning about online fsck being used on this fs. */ -#define XFS_OPSTATE_WARNED_SCRUB 8 /* Kernel has logged a warning about shrink being used on this fs. */ #define XFS_OPSTATE_WARNED_SHRINK 9 /* Kernel has logged a warning about logged xattr updates being used. */ @@ -512,14 +570,14 @@ __XFS_HAS_FEAT(nouuid, NOUUID) #define XFS_OPSTATE_USE_LARP 13 /* Kernel has logged a warning about blocksize > pagesize on this fs. */ #define XFS_OPSTATE_WARNED_LBS 14 -/* Kernel has logged a warning about exchange-range being used on this fs. */ -#define XFS_OPSTATE_WARNED_EXCHRANGE 15 -/* Kernel has logged a warning about parent pointers being used on this fs. */ -#define XFS_OPSTATE_WARNED_PPTR 16 /* Kernel has logged a warning about metadata dirs being used on this fs. */ #define XFS_OPSTATE_WARNED_METADIR 17 /* Filesystem should use qflags to determine quotaon status */ #define XFS_OPSTATE_RESUMING_QUOTAON 18 +/* Kernel has logged a warning about zoned RT device being used on this fs. */ +#define XFS_OPSTATE_WARNED_ZONED 19 +/* (Zoned) GC is in progress */ +#define XFS_OPSTATE_ZONEGC_RUNNING 20 #define __XFS_IS_OPSTATE(name, NAME) \ static inline bool xfs_is_ ## name (struct xfs_mount *mp) \ @@ -564,6 +622,7 @@ static inline bool xfs_clear_resuming_quotaon(struct xfs_mount *mp) #endif /* CONFIG_XFS_QUOTA */ __XFS_IS_OPSTATE(done_with_log_incompat, UNSET_LOG_INCOMPAT) __XFS_IS_OPSTATE(using_logged_xattrs, USE_LARP) +__XFS_IS_OPSTATE(zonegc_running, ZONEGC_RUNNING) static inline bool xfs_should_warn(struct xfs_mount *mp, long nr) @@ -579,7 +638,6 @@ xfs_should_warn(struct xfs_mount *mp, long nr) { (1UL << XFS_OPSTATE_READONLY), "read_only" }, \ { (1UL << XFS_OPSTATE_INODEGC_ENABLED), "inodegc" }, \ { (1UL << XFS_OPSTATE_BLOCKGC_ENABLED), "blockgc" }, \ - { (1UL << XFS_OPSTATE_WARNED_SCRUB), "wscrub" }, \ { (1UL << XFS_OPSTATE_WARNED_SHRINK), "wshrink" }, \ { (1UL << XFS_OPSTATE_WARNED_LARP), "wlarp" }, \ { (1UL << XFS_OPSTATE_QUOTACHECK_RUNNING), "quotacheck" }, \ @@ -633,7 +691,8 @@ xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d) } extern void xfs_uuid_table_free(void); -extern uint64_t xfs_default_resblks(xfs_mount_t *mp); +uint64_t xfs_default_resblks(struct xfs_mount *mp, + enum xfs_free_counter ctr); extern int xfs_mountfs(xfs_mount_t *mp); extern void xfs_unmountfs(xfs_mount_t *); @@ -646,45 +705,74 @@ extern void xfs_unmountfs(xfs_mount_t *); */ #define XFS_FDBLOCKS_BATCH 1024 +uint64_t xfs_freecounter_unavailable(struct xfs_mount *mp, + enum xfs_free_counter ctr); + /* - * Estimate the amount of free space that is not available to userspace and is - * not explicitly reserved from the incore fdblocks. This includes: - * - * - The minimum number of blocks needed to support splitting a bmap btree - * - The blocks currently in use by the freespace btrees because they record - * the actual blocks that will fill per-AG metadata space reservations + * Sum up the freecount, but never return negative values. + */ +static inline s64 xfs_sum_freecounter(struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + return percpu_counter_sum_positive(&mp->m_free[ctr].count); +} + +/* + * Same as above, but does return negative values. Mostly useful for + * special cases like repair and tracing. 
+ */ +static inline s64 xfs_sum_freecounter_raw(struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + return percpu_counter_sum(&mp->m_free[ctr].count); +} + +/* + * This just provides an estimate without the cpu-local updates; use + * xfs_sum_freecounter for the exact value. */ -static inline uint64_t -xfs_fdblocks_unavailable( - struct xfs_mount *mp) +static inline s64 xfs_estimate_freecounter(struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + return percpu_counter_read_positive(&mp->m_free[ctr].count); +} + +static inline int xfs_compare_freecounter(struct xfs_mount *mp, + enum xfs_free_counter ctr, s64 rhs, s32 batch) { - return mp->m_alloc_set_aside + atomic64_read(&mp->m_allocbt_blks); + return __percpu_counter_compare(&mp->m_free[ctr].count, rhs, batch); } -int xfs_dec_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, +static inline void xfs_set_freecounter(struct xfs_mount *mp, + enum xfs_free_counter ctr, uint64_t val) +{ + percpu_counter_set(&mp->m_free[ctr].count, val); +} + +int xfs_dec_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr, uint64_t delta, bool rsvd); -void xfs_add_freecounter(struct xfs_mount *mp, struct percpu_counter *counter, +void xfs_add_freecounter(struct xfs_mount *mp, enum xfs_free_counter ctr, uint64_t delta); static inline int xfs_dec_fdblocks(struct xfs_mount *mp, uint64_t delta, bool reserved) { - return xfs_dec_freecounter(mp, &mp->m_fdblocks, delta, reserved); + return xfs_dec_freecounter(mp, XC_FREE_BLOCKS, delta, reserved); } static inline void xfs_add_fdblocks(struct xfs_mount *mp, uint64_t delta) { - xfs_add_freecounter(mp, &mp->m_fdblocks, delta); + xfs_add_freecounter(mp, XC_FREE_BLOCKS, delta); } static inline int xfs_dec_frextents(struct xfs_mount *mp, uint64_t delta) { - return xfs_dec_freecounter(mp, &mp->m_frextents, delta, false); + return xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, delta, false); } static inline void xfs_add_frextents(struct xfs_mount *mp, uint64_t delta) { - xfs_add_freecounter(mp, &mp->m_frextents, delta); + xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, delta); } extern int xfs_readsb(xfs_mount_t *, int); @@ -706,5 +794,12 @@ int xfs_add_incompat_log_feature(struct xfs_mount *mp, uint32_t feature); bool xfs_clear_incompat_log_features(struct xfs_mount *mp); void xfs_mod_delalloc(struct xfs_inode *ip, int64_t data_delta, int64_t ind_delta); +static inline void xfs_mod_sb_delalloc(struct xfs_mount *mp, int64_t delta) +{ + percpu_counter_add(&mp->m_delalloc_blks, delta); +} + +int xfs_set_max_atomic_write_opt(struct xfs_mount *mp, + unsigned long long new_max_bytes); #endif /* __XFS_MOUNT_H__ */ diff --git a/fs/xfs/xfs_mru_cache.c b/fs/xfs/xfs_mru_cache.c index d0f5b403bdbe..08443ceec329 100644 --- a/fs/xfs/xfs_mru_cache.c +++ b/fs/xfs/xfs_mru_cache.c @@ -414,6 +414,8 @@ xfs_mru_cache_destroy( * To insert an element, call xfs_mru_cache_insert() with the data store, the * element's key and the client data pointer. This function returns 0 on * success or ENOMEM if memory for the data element couldn't be allocated. + * + * The passed-in elem is freed through the per-cache free_func on failure.
*/ int xfs_mru_cache_insert( @@ -421,14 +423,15 @@ xfs_mru_cache_insert( unsigned long key, struct xfs_mru_cache_elem *elem) { - int error; + int error = -EINVAL; ASSERT(mru && mru->lists); if (!mru || !mru->lists) - return -EINVAL; + goto out_free; + error = -ENOMEM; if (radix_tree_preload(GFP_KERNEL)) - return -ENOMEM; + goto out_free; INIT_LIST_HEAD(&elem->list_node); elem->key = key; @@ -440,6 +443,12 @@ xfs_mru_cache_insert( _xfs_mru_cache_list_insert(mru, elem); spin_unlock(&mru->lock); + if (error) + goto out_free; + return 0; + +out_free: + mru->free_func(mru->data, elem); return error; } diff --git a/fs/xfs/xfs_notify_failure.c b/fs/xfs/xfs_notify_failure.c index ed8d8ed42f0a..3545dc1d953c 100644 --- a/fs/xfs/xfs_notify_failure.c +++ b/fs/xfs/xfs_notify_failure.c @@ -127,7 +127,7 @@ xfs_dax_notify_failure_freeze( struct super_block *sb = mp->m_super; int error; - error = freeze_super(sb, FREEZE_HOLDER_KERNEL); + error = freeze_super(sb, FREEZE_HOLDER_KERNEL, NULL); if (error) xfs_emerg(mp, "already frozen by kernel, err=%d", error); @@ -143,7 +143,7 @@ xfs_dax_notify_failure_thaw( int error; if (kernel_frozen) { - error = thaw_super(sb, FREEZE_HOLDER_KERNEL); + error = thaw_super(sb, FREEZE_HOLDER_KERNEL, NULL); if (error) xfs_emerg(mp, "still frozen after notify failure, err=%d", error); @@ -153,7 +153,7 @@ xfs_dax_notify_failure_thaw( * Also thaw userspace call anyway because the device is about to be * removed immediately. */ - thaw_super(sb, FREEZE_HOLDER_USERSPACE); + thaw_super(sb, FREEZE_HOLDER_USERSPACE, NULL); } static int diff --git a/fs/xfs/xfs_pnfs.c b/fs/xfs/xfs_pnfs.c index 6f4479deac6d..afe7497012d4 100644 --- a/fs/xfs/xfs_pnfs.c +++ b/fs/xfs/xfs_pnfs.c @@ -58,8 +58,6 @@ xfs_fs_get_uuid( { struct xfs_mount *mp = XFS_M(sb); - xfs_warn_experimental(mp, XFS_EXPERIMENTAL_PNFS); - if (*len < sizeof(uuid_t)) return -EINVAL; diff --git a/fs/xfs/xfs_qm.c b/fs/xfs/xfs_qm.c index e1ba5af6250f..417439b58785 100644 --- a/fs/xfs/xfs_qm.c +++ b/fs/xfs/xfs_qm.c @@ -1711,7 +1711,8 @@ xfs_qm_mount_quotas( * immediately. We only support rtquota if rtgroups are enabled to * avoid problems with older kernels. */ - if (mp->m_sb.sb_rextents && !xfs_has_rtgroups(mp)) { + if (mp->m_sb.sb_rextents && + (!xfs_has_rtgroups(mp) || xfs_has_zoned(mp))) { xfs_notice(mp, "Cannot turn on quotas for realtime filesystem"); mp->m_qflags = 0; goto write_changes; diff --git a/fs/xfs/xfs_refcount_item.c b/fs/xfs/xfs_refcount_item.c index fe2d7aab8554..076501123d89 100644 --- a/fs/xfs/xfs_refcount_item.c +++ b/fs/xfs/xfs_refcount_item.c @@ -78,6 +78,11 @@ xfs_cui_item_size( *nbytes += xfs_cui_log_format_sizeof(cuip->cui_format.cui_nextents); } +unsigned int xfs_cui_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_cui_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given cui log item. We use only 1 iovec, and we point that @@ -179,6 +184,11 @@ xfs_cud_item_size( *nbytes += sizeof(struct xfs_cud_log_format); } +unsigned int xfs_cud_log_space(void) +{ + return xlog_item_space(1, sizeof(struct xfs_cud_log_format)); +} + /* * This is called to fill in the vector of log iovecs for the * given cud log item. 
We use only 1 iovec, and we point that diff --git a/fs/xfs/xfs_refcount_item.h b/fs/xfs/xfs_refcount_item.h index bfee8f30c63c..0fc3f493342b 100644 --- a/fs/xfs/xfs_refcount_item.h +++ b/fs/xfs/xfs_refcount_item.h @@ -76,4 +76,7 @@ struct xfs_refcount_intent; void xfs_refcount_defer_add(struct xfs_trans *tp, struct xfs_refcount_intent *ri); +unsigned int xfs_cui_log_space(unsigned int nr); +unsigned int xfs_cud_log_space(void); + #endif /* __XFS_REFCOUNT_ITEM_H__ */ diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index 59f7fc16eb80..ad3bcb76d805 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -235,7 +235,7 @@ xfs_reflink_trim_around_shared( int error = 0; /* Holes, unwritten, and delalloc extents cannot be shared */ - if (!xfs_is_cow_inode(ip) || !xfs_bmap_is_written_extent(irec)) { + if (!xfs_is_reflink_inode(ip) || !xfs_bmap_is_written_extent(irec)) { *shared = false; return 0; } @@ -293,7 +293,7 @@ xfs_bmap_trim_cow( return xfs_reflink_trim_around_shared(ip, imap, shared); } -static int +int xfs_reflink_convert_cow_locked( struct xfs_inode *ip, xfs_fileoff_t offset_fsb, @@ -651,7 +651,7 @@ xfs_reflink_cancel_cow_blocks( if (isnullstartblock(del.br_startblock)) { xfs_bmap_del_extent_delay(ip, XFS_COW_FORK, &icur, &got, - &del); + &del, 0); } else if (del.br_state == XFS_EXT_UNWRITTEN || cancel_real) { ASSERT((*tpp)->t_highest_agno == NULLAGNUMBER); @@ -786,35 +786,19 @@ xfs_reflink_update_quota( * requirements as low as possible. */ STATIC int -xfs_reflink_end_cow_extent( +xfs_reflink_end_cow_extent_locked( + struct xfs_trans *tp, struct xfs_inode *ip, xfs_fileoff_t *offset_fsb, xfs_fileoff_t end_fsb) { struct xfs_iext_cursor icur; struct xfs_bmbt_irec got, del, data; - struct xfs_mount *mp = ip->i_mount; - struct xfs_trans *tp; struct xfs_ifork *ifp = xfs_ifork_ptr(ip, XFS_COW_FORK); - unsigned int resblks; int nmaps; bool isrt = XFS_IS_REALTIME_INODE(ip); int error; - resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); - error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, - XFS_TRANS_RESERVE, &tp); - if (error) - return error; - - /* - * Lock the inode. We have to ijoin without automatic unlock because - * the lead transaction is the refcountbt record deletion; the data - * fork update follows as a deferred log item. - */ - xfs_ilock(ip, XFS_ILOCK_EXCL); - xfs_trans_ijoin(tp, ip, 0); - /* * In case of racing, overlapping AIO writes no COW extents might be * left by the time I/O completes for the loser of the race. In that @@ -823,7 +807,7 @@ xfs_reflink_end_cow_extent( if (!xfs_iext_lookup_extent(ip, ifp, *offset_fsb, &icur, &got) || got.br_startoff >= end_fsb) { *offset_fsb = end_fsb; - goto out_cancel; + return 0; } /* @@ -837,7 +821,7 @@ xfs_reflink_end_cow_extent( if (!xfs_iext_next_extent(ifp, &icur, &got) || got.br_startoff >= end_fsb) { *offset_fsb = end_fsb; - goto out_cancel; + return 0; } } del = got; @@ -846,14 +830,14 @@ xfs_reflink_end_cow_extent( error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, XFS_IEXT_REFLINK_END_COW_CNT); if (error) - goto out_cancel; + return error; /* Grab the corresponding mapping in the data fork. */ nmaps = 1; error = xfs_bmapi_read(ip, del.br_startoff, del.br_blockcount, &data, &nmaps, 0); if (error) - goto out_cancel; + return error; /* We can only remap the smaller of the two extent sizes. 
*/ data.br_blockcount = min(data.br_blockcount, del.br_blockcount); @@ -882,7 +866,7 @@ xfs_reflink_end_cow_extent( error = xfs_bunmapi(NULL, ip, data.br_startoff, data.br_blockcount, 0, 1, &done); if (error) - goto out_cancel; + return error; ASSERT(done); } @@ -899,17 +883,45 @@ xfs_reflink_end_cow_extent( /* Remove the mapping from the CoW fork. */ xfs_bmap_del_extent_cow(ip, &icur, &got, &del); - error = xfs_trans_commit(tp); - xfs_iunlock(ip, XFS_ILOCK_EXCL); - if (error) - return error; - /* Update the caller about how much progress we made. */ *offset_fsb = del.br_startoff + del.br_blockcount; return 0; +} -out_cancel: - xfs_trans_cancel(tp); +/* + * Remap part of the CoW fork into the data fork. + * + * We aim to remap the range starting at @offset_fsb and ending at @end_fsb + * into the data fork; this function will remap what it can (at the end of the + * range) and update @end_fsb appropriately. Each remap gets its own + * transaction because we can end up merging and splitting bmbt blocks for + * every remap operation and we'd like to keep the block reservation + * requirements as low as possible. + */ +STATIC int +xfs_reflink_end_cow_extent( + struct xfs_inode *ip, + xfs_fileoff_t *offset_fsb, + xfs_fileoff_t end_fsb) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + unsigned int resblks; + int error; + + resblks = XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, + XFS_TRANS_RESERVE, &tp); + if (error) + return error; + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + error = xfs_reflink_end_cow_extent_locked(tp, ip, offset_fsb, end_fsb); + if (error) + xfs_trans_cancel(tp); + else + error = xfs_trans_commit(tp); xfs_iunlock(ip, XFS_ILOCK_EXCL); return error; } @@ -973,6 +985,78 @@ xfs_reflink_end_cow( } /* + * Fully remap all of the file's data fork at once, which is the critical part + * in achieving atomic behaviour. + * The regular CoW end path does not use this function so as to keep the block + * reservation per transaction as low as possible. + */ +int +xfs_reflink_end_atomic_cow( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count) +{ + xfs_fileoff_t offset_fsb; + xfs_fileoff_t end_fsb; + int error = 0; + struct xfs_mount *mp = ip->i_mount; + struct xfs_trans *tp; + unsigned int resblks; + + trace_xfs_reflink_end_cow(ip, offset, count); + + offset_fsb = XFS_B_TO_FSBT(mp, offset); + end_fsb = XFS_B_TO_FSB(mp, offset + count); + + /* + * Each remapping operation could cause a btree split, so in the worst + * case that's one for each block. + */ + resblks = (end_fsb - offset_fsb) * + XFS_NEXTENTADD_SPACE_RES(mp, 1, XFS_DATA_FORK); + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_atomic_ioend, resblks, 0, + XFS_TRANS_RESERVE, &tp); + if (error) + return error; + + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + while (end_fsb > offset_fsb && !error) { + error = xfs_reflink_end_cow_extent_locked(tp, ip, &offset_fsb, + end_fsb); + } + if (error) { + trace_xfs_reflink_end_cow_error(ip, error, _RET_IP_); + goto out_cancel; + } + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +out_cancel: + xfs_trans_cancel(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + return error; +} + +/* Compute the largest atomic write that we can complete through software. */ +xfs_extlen_t +xfs_reflink_max_atomic_cow( + struct xfs_mount *mp) +{ + /* We cannot do any atomic writes without out-of-place writes.
*/ + if (!xfs_can_sw_atomic_write(mp)) + return 0; + + /* + * Atomic write limits must always be a power-of-2, according to + * generic_atomic_write_valid. + */ + return rounddown_pow_of_two(xfs_calc_max_atomic_write_fsblocks(mp)); +} + +/* * Free all CoW staging blocks that are still referenced by the ondisk refcount * metadata. The ondisk metadata does not track which inode created the * staging extent, so callers must ensure that there are no cached inodes with @@ -1207,15 +1291,9 @@ xfs_reflink_ag_has_free_space( if (!xfs_has_rmapbt(mp)) return 0; if (XFS_IS_REALTIME_INODE(ip)) { - struct xfs_rtgroup *rtg; - xfs_rgnumber_t rgno; - - rgno = xfs_rtb_to_rgno(mp, fsb); - rtg = xfs_rtgroup_get(mp, rgno); - if (xfs_metafile_resv_critical(rtg_rmap(rtg))) - error = -ENOSPC; - xfs_rtgroup_put(rtg); - return error; + if (xfs_metafile_resv_critical(mp)) + return -ENOSPC; + return 0; } agno = XFS_FSB_TO_AGNO(mp, fsb); @@ -1538,7 +1616,7 @@ xfs_reflink_zero_posteof( return 0; trace_xfs_zero_eof(ip, isize, pos - isize); - return xfs_zero_range(ip, isize, pos - isize, NULL); + return xfs_zero_range(ip, isize, pos - isize, NULL, NULL); } /* diff --git a/fs/xfs/xfs_reflink.h b/fs/xfs/xfs_reflink.h index cc4e92278279..36cda724da89 100644 --- a/fs/xfs/xfs_reflink.h +++ b/fs/xfs/xfs_reflink.h @@ -35,6 +35,8 @@ int xfs_reflink_allocate_cow(struct xfs_inode *ip, struct xfs_bmbt_irec *imap, bool convert_now); extern int xfs_reflink_convert_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); +int xfs_reflink_convert_cow_locked(struct xfs_inode *ip, + xfs_fileoff_t offset_fsb, xfs_filblks_t count_fsb); extern int xfs_reflink_cancel_cow_blocks(struct xfs_inode *ip, struct xfs_trans **tpp, xfs_fileoff_t offset_fsb, @@ -43,6 +45,8 @@ extern int xfs_reflink_cancel_cow_range(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count, bool cancel_real); extern int xfs_reflink_end_cow(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count); +int xfs_reflink_end_atomic_cow(struct xfs_inode *ip, xfs_off_t offset, + xfs_off_t count); extern int xfs_reflink_recover_cow(struct xfs_mount *mp); extern loff_t xfs_reflink_remap_range(struct file *file_in, loff_t pos_in, struct file *file_out, loff_t pos_out, loff_t len, @@ -64,4 +68,6 @@ extern int xfs_reflink_update_dest(struct xfs_inode *dest, xfs_off_t newlen, bool xfs_reflink_supports_rextsize(struct xfs_mount *mp, unsigned int rextsize); +xfs_extlen_t xfs_reflink_max_atomic_cow(struct xfs_mount *mp); + #endif /* __XFS_REFLINK_H */ diff --git a/fs/xfs/xfs_rmap_item.c b/fs/xfs/xfs_rmap_item.c index 89decffe76c8..c99700318ec2 100644 --- a/fs/xfs/xfs_rmap_item.c +++ b/fs/xfs/xfs_rmap_item.c @@ -77,6 +77,11 @@ xfs_rui_item_size( *nbytes += xfs_rui_log_format_sizeof(ruip->rui_format.rui_nextents); } +unsigned int xfs_rui_log_space(unsigned int nr) +{ + return xlog_item_space(1, xfs_rui_log_format_sizeof(nr)); +} + /* * This is called to fill in the vector of log iovecs for the * given rui log item. We use only 1 iovec, and we point that @@ -180,6 +185,11 @@ xfs_rud_item_size( *nbytes += sizeof(struct xfs_rud_log_format); } +unsigned int xfs_rud_log_space(void) +{ + return xlog_item_space(1, sizeof(struct xfs_rud_log_format)); +} + /* * This is called to fill in the vector of log iovecs for the * given rud log item. 
We use only 1 iovec, and we point that diff --git a/fs/xfs/xfs_rmap_item.h b/fs/xfs/xfs_rmap_item.h index 40d331555675..3a99f0117f2d 100644 --- a/fs/xfs/xfs_rmap_item.h +++ b/fs/xfs/xfs_rmap_item.h @@ -75,4 +75,7 @@ struct xfs_rmap_intent; void xfs_rmap_defer_add(struct xfs_trans *tp, struct xfs_rmap_intent *ri); +unsigned int xfs_rui_log_space(unsigned int nr); +unsigned int xfs_rud_log_space(void); + #endif /* __XFS_RMAP_ITEM_H__ */ diff --git a/fs/xfs/xfs_rtalloc.c b/fs/xfs/xfs_rtalloc.c index d8e6d073d64d..6484c596ecea 100644 --- a/fs/xfs/xfs_rtalloc.c +++ b/fs/xfs/xfs_rtalloc.c @@ -33,6 +33,7 @@ #include "xfs_trace.h" #include "xfs_rtrefcount_btree.h" #include "xfs_reflink.h" +#include "xfs_zone_alloc.h" /* * Return whether there are any free extents in the size range given @@ -663,7 +664,8 @@ xfs_rtunmount_rtg( for (i = 0; i < XFS_RTGI_MAX; i++) xfs_rtginode_irele(&rtg->rtg_inodes[i]); - kvfree(rtg->rtg_rsum_cache); + if (!xfs_has_zoned(rtg_mount(rtg))) + kvfree(rtg->rtg_rsum_cache); } static int @@ -837,7 +839,7 @@ xfs_growfs_rt_init_rtsb( return 0; error = xfs_buf_get_uncached(mp->m_rtdev_targp, XFS_FSB_TO_BB(mp, 1), - 0, &rtsb_bp); + &rtsb_bp); if (error) return error; @@ -858,6 +860,84 @@ xfs_growfs_rt_init_rtsb( return error; } +static void +xfs_growfs_rt_sb_fields( + struct xfs_trans *tp, + const struct xfs_mount *nmp) +{ + struct xfs_mount *mp = tp->t_mountp; + + if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSIZE, + nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize); + if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBMBLOCKS, + nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks); + if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RBLOCKS, + nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks); + if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTENTS, + nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents); + if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_REXTSLOG, + nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog); + if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount) + xfs_trans_mod_sb(tp, XFS_TRANS_SB_RGCOUNT, + nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount); +} + +static int +xfs_growfs_rt_zoned( + struct xfs_rtgroup *rtg, + xfs_rfsblock_t nrblocks) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_mount *nmp; + struct xfs_trans *tp; + xfs_rtbxlen_t freed_rtx; + int error; + + /* + * Calculate new sb and mount fields for this round. Also ensure the + * rtg_extents value is uptodate as the rtbitmap code relies on it. + */ + nmp = xfs_growfs_rt_alloc_fake_mount(mp, nrblocks, + mp->m_sb.sb_rextsize); + if (!nmp) + return -ENOMEM; + freed_rtx = nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents; + + xfs_rtgroup_calc_geometry(nmp, rtg, rtg_rgno(rtg), + nmp->m_sb.sb_rgcount, nmp->m_sb.sb_rextents); + + error = xfs_trans_alloc(mp, &M_RES(nmp)->tr_growrtfree, 0, 0, 0, &tp); + if (error) + goto out_free; + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); + + xfs_growfs_rt_sb_fields(tp, nmp); + xfs_trans_mod_sb(tp, XFS_TRANS_SB_FREXTENTS, freed_rtx); + + error = xfs_trans_commit(tp); + if (error) + goto out_free; + + /* + * Ensure the mount RT feature flag is now set, and compute new + * maxlevels for rt btrees. 
+ */ + mp->m_features |= XFS_FEAT_REALTIME; + xfs_rtrmapbt_compute_maxlevels(mp); + xfs_rtrefcountbt_compute_maxlevels(mp); + xfs_zoned_add_available(mp, freed_rtx); +out_free: + kfree(nmp); + return error; +} + static int xfs_growfs_rt_bmblock( struct xfs_rtgroup *rtg, @@ -943,24 +1023,7 @@ xfs_growfs_rt_bmblock( /* * Update superblock fields. */ - if (nmp->m_sb.sb_rextsize != mp->m_sb.sb_rextsize) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSIZE, - nmp->m_sb.sb_rextsize - mp->m_sb.sb_rextsize); - if (nmp->m_sb.sb_rbmblocks != mp->m_sb.sb_rbmblocks) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBMBLOCKS, - nmp->m_sb.sb_rbmblocks - mp->m_sb.sb_rbmblocks); - if (nmp->m_sb.sb_rblocks != mp->m_sb.sb_rblocks) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RBLOCKS, - nmp->m_sb.sb_rblocks - mp->m_sb.sb_rblocks); - if (nmp->m_sb.sb_rextents != mp->m_sb.sb_rextents) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTENTS, - nmp->m_sb.sb_rextents - mp->m_sb.sb_rextents); - if (nmp->m_sb.sb_rextslog != mp->m_sb.sb_rextslog) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_REXTSLOG, - nmp->m_sb.sb_rextslog - mp->m_sb.sb_rextslog); - if (nmp->m_sb.sb_rgcount != mp->m_sb.sb_rgcount) - xfs_trans_mod_sb(args.tp, XFS_TRANS_SB_RGCOUNT, - nmp->m_sb.sb_rgcount - mp->m_sb.sb_rgcount); + xfs_growfs_rt_sb_fields(args.tp, nmp); /* * Free the new extent. @@ -1127,6 +1190,11 @@ xfs_growfs_rtg( goto out_rele; } + if (xfs_has_zoned(mp)) { + error = xfs_growfs_rt_zoned(rtg, nrblocks); + goto out_rele; + } + error = xfs_growfs_rt_alloc_blocks(rtg, nrblocks, rextsize, &bmblocks); if (error) goto out_rele; @@ -1144,10 +1212,8 @@ xfs_growfs_rtg( goto out_error; } - if (old_rsum_cache) - kvfree(old_rsum_cache); - xfs_rtgroup_rele(rtg); - return 0; + kvfree(old_rsum_cache); + goto out_rele; out_error: /* @@ -1195,6 +1261,22 @@ xfs_growfs_check_rtgeom( if (min_logfsbs > mp->m_sb.sb_logblocks) return -EINVAL; + + if (xfs_has_zoned(mp)) { + uint32_t gblocks = mp->m_groups[XG_TYPE_RTG].blocks; + uint32_t rem; + + if (rextsize != 1) + return -EINVAL; + div_u64_rem(mp->m_sb.sb_rblocks, gblocks, &rem); + if (rem) { + xfs_warn(mp, +"new RT volume size (%lld) not aligned to RT group size (%d)", + mp->m_sb.sb_rblocks, gblocks); + return -EINVAL; + } + } + return 0; } @@ -1249,6 +1331,35 @@ xfs_grow_last_rtg( } /* + * Read in the last block of the RT device to make sure it is accessible. + */ +static int +xfs_rt_check_size( + struct xfs_mount *mp, + xfs_rfsblock_t last_block) +{ + xfs_daddr_t daddr = XFS_FSB_TO_BB(mp, last_block); + struct xfs_buf *bp; + int error; + + if (XFS_BB_TO_FSB(mp, daddr) != last_block) { + xfs_warn(mp, "RT device size overflow: %llu != %llu", + XFS_BB_TO_FSB(mp, daddr), last_block); + return -EFBIG; + } + + error = xfs_buf_read_uncached(mp->m_rtdev_targp, + XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart) + daddr, + XFS_FSB_TO_BB(mp, 1), &bp, NULL); + if (error) + xfs_warn(mp, "cannot read last RT device sector (%lld)", + last_block); + else + xfs_buf_relse(bp); + return error; +} + +/* * Grow the realtime area of the filesystem. */ int @@ -1259,7 +1370,6 @@ xfs_growfs_rt( xfs_rgnumber_t old_rgcount = mp->m_sb.sb_rgcount; xfs_rgnumber_t new_rgcount = 1; xfs_rgnumber_t rgno; - struct xfs_buf *bp; xfs_agblock_t old_rextsize = mp->m_sb.sb_rextsize; int error; @@ -1302,15 +1412,10 @@ xfs_growfs_rt( error = xfs_sb_validate_fsb_count(&mp->m_sb, in->newblocks); if (error) goto out_unlock; - /* - * Read in the last block of the device, make sure it exists. 
- */ - error = xfs_buf_read_uncached(mp->m_rtdev_targp, - XFS_FSB_TO_BB(mp, in->newblocks - 1), - XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); + + error = xfs_rt_check_size(mp, in->newblocks - 1); if (error) goto out_unlock; - xfs_buf_relse(bp); /* * Calculate new parameters. These are the final values to be reached. @@ -1376,8 +1481,7 @@ xfs_growfs_rt( error = error2; /* Reset the rt metadata btree space reservations. */ - xfs_rt_resv_free(mp); - error2 = xfs_rt_resv_init(mp); + error2 = xfs_metafile_resv_init(mp); if (error2 && error2 != -ENOSPC) error = error2; } @@ -1407,7 +1511,7 @@ xfs_rtmount_readsb( /* m_blkbb_log is not set up yet */ error = xfs_buf_read_uncached(mp->m_rtdev_targp, XFS_RTSB_DADDR, - mp->m_sb.sb_blocksize >> BBSHIFT, XBF_NO_IOACCT, &bp, + mp->m_sb.sb_blocksize >> BBSHIFT, &bp, &xfs_rtsb_buf_ops); if (error) { xfs_warn(mp, "rt sb validate failed with error %d.", error); @@ -1444,10 +1548,6 @@ int /* error */ xfs_rtmount_init( struct xfs_mount *mp) /* file system mount structure */ { - struct xfs_buf *bp; /* buffer for last block of subvolume */ - xfs_daddr_t d; /* address of last block of subvolume */ - int error; - if (mp->m_sb.sb_rblocks == 0) return 0; if (mp->m_rtdev_targp == NULL) { @@ -1458,25 +1558,7 @@ xfs_rtmount_init( mp->m_rsumblocks = xfs_rtsummary_blockcount(mp, &mp->m_rsumlevels); - /* - * Check that the realtime section is an ok size. - */ - d = (xfs_daddr_t)XFS_FSB_TO_BB(mp, mp->m_sb.sb_rblocks); - if (XFS_BB_TO_FSB(mp, d) != mp->m_sb.sb_rblocks) { - xfs_warn(mp, "realtime mount -- %llu != %llu", - (unsigned long long) XFS_BB_TO_FSB(mp, d), - (unsigned long long) mp->m_sb.sb_rblocks); - return -EFBIG; - } - error = xfs_buf_read_uncached(mp->m_rtdev_targp, - d - XFS_FSB_TO_BB(mp, 1), - XFS_FSB_TO_BB(mp, 1), 0, &bp, NULL); - if (error) { - xfs_warn(mp, "realtime device size check failed"); - return error; - } - xfs_buf_relse(bp); - return 0; + return xfs_rt_check_size(mp, mp->m_sb.sb_rblocks - 1); } static int @@ -1519,50 +1601,10 @@ xfs_rtalloc_reinit_frextents( spin_lock(&mp->m_sb_lock); mp->m_sb.sb_frextents = val; spin_unlock(&mp->m_sb_lock); - percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, mp->m_sb.sb_frextents); return 0; } -/* Free space reservations for rt metadata inodes. */ -void -xfs_rt_resv_free( - struct xfs_mount *mp) -{ - struct xfs_rtgroup *rtg = NULL; - unsigned int i; - - while ((rtg = xfs_rtgroup_next(mp, rtg))) { - for (i = 0; i < XFS_RTGI_MAX; i++) - xfs_metafile_resv_free(rtg->rtg_inodes[i]); - } -} - -/* Reserve space for rt metadata inodes' space expansion. */ -int -xfs_rt_resv_init( - struct xfs_mount *mp) -{ - struct xfs_rtgroup *rtg = NULL; - xfs_filblks_t ask; - int error = 0; - - while ((rtg = xfs_rtgroup_next(mp, rtg))) { - int err2; - - ask = xfs_rtrmapbt_calc_reserves(mp); - err2 = xfs_metafile_resv_init(rtg_rmap(rtg), ask); - if (err2 && !error) - error = err2; - - ask = xfs_rtrefcountbt_calc_reserves(mp); - err2 = xfs_metafile_resv_init(rtg_refcount(rtg), ask); - if (err2 && !error) - error = err2; - } - - return error; -} - /* * Read in the bmbt of an rt metadata inode so that we never have to load them * at runtime. This enables the use of shared ILOCKs for rtbitmap scans. 
Use @@ -1613,6 +1655,8 @@ xfs_rtmount_rtg( } } + if (xfs_has_zoned(mp)) + return 0; return xfs_alloc_rsum_cache(rtg, mp->m_sb.sb_rbmblocks); } @@ -2097,6 +2141,8 @@ xfs_bmap_rtalloc( ap->datatype & XFS_ALLOC_INITIAL_USER_DATA; int error; + ASSERT(!xfs_has_zoned(ap->tp->t_mountp)); + retry: error = xfs_rtallocate_align(ap, &ralen, &raminlen, &prod, &noalign); if (error) diff --git a/fs/xfs/xfs_rtalloc.h b/fs/xfs/xfs_rtalloc.h index 0d95b29092c9..78a690b489ed 100644 --- a/fs/xfs/xfs_rtalloc.h +++ b/fs/xfs/xfs_rtalloc.h @@ -34,9 +34,6 @@ int /* error */ xfs_rtmount_inodes( struct xfs_mount *mp); /* file system mount structure */ -void xfs_rt_resv_free(struct xfs_mount *mp); -int xfs_rt_resv_init(struct xfs_mount *mp); - /* * Grow the realtime area of the filesystem. */ @@ -65,8 +62,6 @@ xfs_rtmount_init( } # define xfs_rtmount_inodes(m) (((mp)->m_sb.sb_rblocks == 0)? 0 : (-ENOSYS)) # define xfs_rtunmount_inodes(m) -# define xfs_rt_resv_free(mp) ((void)0) -# define xfs_rt_resv_init(mp) (0) static inline int xfs_growfs_check_rtgeom(const struct xfs_mount *mp, diff --git a/fs/xfs/xfs_super.c b/fs/xfs/xfs_super.c index 0055066fb1d9..0bc4b5489078 100644 --- a/fs/xfs/xfs_super.c +++ b/fs/xfs/xfs_super.c @@ -46,6 +46,7 @@ #include "xfs_exchmaps_item.h" #include "xfs_parent.h" #include "xfs_rtalloc.h" +#include "xfs_zone_alloc.h" #include "scrub/stats.h" #include "scrub/rcbag_btree.h" @@ -109,7 +110,8 @@ enum { Opt_filestreams, Opt_quota, Opt_noquota, Opt_usrquota, Opt_grpquota, Opt_prjquota, Opt_uquota, Opt_gquota, Opt_pquota, Opt_uqnoenforce, Opt_gqnoenforce, Opt_pqnoenforce, Opt_qnoenforce, - Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, + Opt_discard, Opt_nodiscard, Opt_dax, Opt_dax_enum, Opt_max_open_zones, + Opt_lifetime, Opt_nolifetime, Opt_max_atomic_write, }; static const struct fs_parameter_spec xfs_fs_parameters[] = { @@ -154,6 +156,10 @@ static const struct fs_parameter_spec xfs_fs_parameters[] = { fsparam_flag("nodiscard", Opt_nodiscard), fsparam_flag("dax", Opt_dax), fsparam_enum("dax", Opt_dax_enum, dax_param_enums), + fsparam_u32("max_open_zones", Opt_max_open_zones), + fsparam_flag("lifetime", Opt_lifetime), + fsparam_flag("nolifetime", Opt_nolifetime), + fsparam_string("max_atomic_write", Opt_max_atomic_write), {} }; @@ -182,6 +188,7 @@ xfs_fs_show_options( { XFS_FEAT_LARGE_IOSIZE, ",largeio" }, { XFS_FEAT_DAX_ALWAYS, ",dax=always" }, { XFS_FEAT_DAX_NEVER, ",dax=never" }, + { XFS_FEAT_NOLIFETIME, ",nolifetime" }, { 0, NULL } }; struct xfs_mount *mp = XFS_M(root->d_sb); @@ -233,6 +240,12 @@ xfs_fs_show_options( if (!(mp->m_qflags & XFS_ALL_QUOTA_ACCT)) seq_puts(m, ",noquota"); + if (mp->m_max_open_zones) + seq_printf(m, ",max_open_zones=%u", mp->m_max_open_zones); + if (mp->m_awu_max_bytes) + seq_printf(m, ",max_atomic_write=%lluk", + mp->m_awu_max_bytes >> 10); + return 0; } @@ -371,10 +384,11 @@ xfs_blkdev_get( struct file **bdev_filep) { int error = 0; + blk_mode_t mode; - *bdev_filep = bdev_file_open_by_path(name, - BLK_OPEN_READ | BLK_OPEN_WRITE | BLK_OPEN_RESTRICT_WRITES, - mp->m_super, &fs_holder_ops); + mode = sb_open_mode(mp->m_super->s_flags); + *bdev_filep = bdev_file_open_by_path(name, mode, + mp->m_super, &fs_holder_ops); if (IS_ERR(*bdev_filep)) { error = PTR_ERR(*bdev_filep); *bdev_filep = NULL; @@ -472,21 +486,29 @@ xfs_open_devices( /* * Setup xfs_mount buffer target pointers */ - error = -ENOMEM; mp->m_ddev_targp = xfs_alloc_buftarg(mp, sb->s_bdev_file); - if (!mp->m_ddev_targp) + if (IS_ERR(mp->m_ddev_targp)) { + error = PTR_ERR(mp->m_ddev_targp); + 
mp->m_ddev_targp = NULL; goto out_close_rtdev; + } if (rtdev_file) { mp->m_rtdev_targp = xfs_alloc_buftarg(mp, rtdev_file); - if (!mp->m_rtdev_targp) + if (IS_ERR(mp->m_rtdev_targp)) { + error = PTR_ERR(mp->m_rtdev_targp); + mp->m_rtdev_targp = NULL; goto out_free_ddev_targ; + } } if (logdev_file && file_bdev(logdev_file) != ddev) { mp->m_logdev_targp = xfs_alloc_buftarg(mp, logdev_file); - if (!mp->m_logdev_targp) + if (IS_ERR(mp->m_logdev_targp)) { + error = PTR_ERR(mp->m_logdev_targp); + mp->m_logdev_targp = NULL; goto out_free_rtdev_targ; + } } else { mp->m_logdev_targp = mp->m_ddev_targp; /* Handle won't be used, drop it */ @@ -519,7 +541,7 @@ xfs_setup_devices( { int error; - error = xfs_setsize_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize); + error = xfs_configure_buftarg(mp->m_ddev_targp, mp->m_sb.sb_sectsize); if (error) return error; @@ -528,13 +550,21 @@ xfs_setup_devices( if (xfs_has_sector(mp)) log_sector_size = mp->m_sb.sb_logsectsize; - error = xfs_setsize_buftarg(mp->m_logdev_targp, + error = xfs_configure_buftarg(mp->m_logdev_targp, log_sector_size); if (error) return error; } - if (mp->m_rtdev_targp) { - error = xfs_setsize_buftarg(mp->m_rtdev_targp, + + if (mp->m_sb.sb_rtstart) { + if (mp->m_rtdev_targp) { + xfs_warn(mp, + "can't use internal and external rtdev at the same time"); + return -EINVAL; + } + mp->m_rtdev_targp = mp->m_ddev_targp; + } else if (mp->m_rtname) { + error = xfs_configure_buftarg(mp->m_rtdev_targp, mp->m_sb.sb_sectsize); if (error) return error; @@ -751,13 +781,24 @@ xfs_fs_drop_inode( return generic_drop_inode(inode); } +STATIC void +xfs_fs_evict_inode( + struct inode *inode) +{ + if (IS_DAX(inode)) + dax_break_layout_final(inode); + + truncate_inode_pages_final(&inode->i_data); + clear_inode(inode); +} + static void xfs_mount_free( struct xfs_mount *mp) { if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp) xfs_free_buftarg(mp->m_logdev_targp); - if (mp->m_rtdev_targp) + if (mp->m_rtdev_targp && mp->m_rtdev_targp != mp->m_ddev_targp) xfs_free_buftarg(mp->m_rtdev_targp); if (mp->m_ddev_targp) xfs_free_buftarg(mp->m_ddev_targp); @@ -814,6 +855,7 @@ xfs_fs_sync_fs( if (sb->s_writers.frozen == SB_FREEZE_PAGEFAULT) { xfs_inodegc_stop(mp); xfs_blockgc_stop(mp); + xfs_zone_gc_stop(mp); } return 0; @@ -834,10 +876,12 @@ xfs_statfs_data( struct kstatfs *st) { int64_t fdblocks = - percpu_counter_sum(&mp->m_fdblocks); + xfs_sum_freecounter(mp, XC_FREE_BLOCKS); /* make sure st->f_bfree does not underflow */ - st->f_bfree = max(0LL, fdblocks - xfs_fdblocks_unavailable(mp)); + st->f_bfree = max(0LL, + fdblocks - xfs_freecounter_unavailable(mp, XC_FREE_BLOCKS)); + /* * sb_dblocks can change during growfs, but nothing cares about reporting * the old or new value during growfs. 
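The xfs_statfs_data() hunk above and the xfs_statfs_rt() hunk that follows change how the reported numbers are derived from the new counters: the data device clamps f_bfree so the unavailable set-aside can never drive it negative, and the RT device now hides the reserved rt extents from f_blocks. The following is a small sketch of that arithmetic with made-up numbers; the struct and its field names are invented for illustration only.

#include <stdio.h>

/* Stand-ins for the counters read by xfs_statfs_data()/xfs_statfs_rt();
 * all field names and values here are invented for illustration. */
struct fs_counters {
	long long fdblocks;	/* xfs_sum_freecounter(mp, XC_FREE_BLOCKS) */
	long long set_aside;	/* xfs_freecounter_unavailable() */
	long long rblocks;	/* mp->m_sb.sb_rblocks */
	long long rt_free;	/* XC_FREE_RTEXTENTS sum, converted to blocks */
	long long rt_res_total;	/* m_free[XC_FREE_RTEXTENTS].res_total, blocks */
};

int main(void)
{
	struct fs_counters c = {
		.fdblocks = 1000, .set_aside = 1200,	/* near-ENOSPC case */
		.rblocks = 1 << 20, .rt_free = 4096, .rt_res_total = 512,
	};
	long long bfree = c.fdblocks - c.set_aside;

	/* data device: clamp so st->f_bfree never underflows */
	if (bfree < 0)
		bfree = 0;
	printf("data: f_bfree=%lld\n", bfree);		/* 0, not -200 */

	/* rt device: blocks held in the rt reserve are not reported */
	printf("rt: f_blocks=%lld f_bfree=%lld\n",
	       c.rblocks - c.rt_res_total, c.rt_free);
	return 0;
}

Subtracting res_total from the rt f_blocks keeps df output honest on zoned filesystems, where a slice of the rt capacity is permanently held back for garbage collection rather than being available to user data.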
@@ -856,8 +900,9 @@ xfs_statfs_rt( struct kstatfs *st) { st->f_bfree = xfs_rtbxlen_to_blen(mp, - percpu_counter_sum_positive(&mp->m_frextents)); - st->f_blocks = mp->m_sb.sb_rblocks; + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS)); + st->f_blocks = mp->m_sb.sb_rblocks - xfs_rtbxlen_to_blen(mp, + mp->m_free[XC_FREE_RTEXTENTS].res_total); } static void @@ -922,24 +967,32 @@ xfs_fs_statfs( } STATIC void -xfs_save_resvblks(struct xfs_mount *mp) +xfs_save_resvblks( + struct xfs_mount *mp) { - mp->m_resblks_save = mp->m_resblks; - xfs_reserve_blocks(mp, 0); + enum xfs_free_counter i; + + for (i = 0; i < XC_FREE_NR; i++) { + mp->m_free[i].res_saved = mp->m_free[i].res_total; + xfs_reserve_blocks(mp, i, 0); + } } STATIC void -xfs_restore_resvblks(struct xfs_mount *mp) +xfs_restore_resvblks( + struct xfs_mount *mp) { - uint64_t resblks; + uint64_t resblks; + enum xfs_free_counter i; - if (mp->m_resblks_save) { - resblks = mp->m_resblks_save; - mp->m_resblks_save = 0; - } else - resblks = xfs_default_resblks(mp); - - xfs_reserve_blocks(mp, resblks); + for (i = 0; i < XC_FREE_NR; i++) { + if (mp->m_free[i].res_saved) { + resblks = mp->m_free[i].res_saved; + mp->m_free[i].res_saved = 0; + } else + resblks = xfs_default_resblks(mp, i); + xfs_reserve_blocks(mp, i, resblks); + } } /* @@ -976,6 +1029,7 @@ xfs_fs_freeze( if (ret && !xfs_is_readonly(mp)) { xfs_blockgc_start(mp); xfs_inodegc_start(mp); + xfs_zone_gc_start(mp); } return ret; @@ -997,6 +1051,7 @@ xfs_fs_unfreeze( * filesystem. */ if (!xfs_is_readonly(mp)) { + xfs_zone_gc_start(mp); xfs_blockgc_start(mp); xfs_inodegc_start(mp); } @@ -1058,6 +1113,19 @@ xfs_finish_flags( return -EINVAL; } + if (!xfs_has_zoned(mp)) { + if (mp->m_max_open_zones) { + xfs_warn(mp, +"max_open_zones mount option only supported on zoned file systems."); + return -EINVAL; + } + if (mp->m_features & XFS_FEAT_NOLIFETIME) { + xfs_warn(mp, +"nolifetime mount option only supported on zoned file systems."); + return -EINVAL; + } + } + return 0; } @@ -1065,7 +1133,8 @@ static int xfs_init_percpu_counters( struct xfs_mount *mp) { - int error; + int error; + int i; error = percpu_counter_init(&mp->m_icount, 0, GFP_KERNEL); if (error) @@ -1075,30 +1144,29 @@ xfs_init_percpu_counters( if (error) goto free_icount; - error = percpu_counter_init(&mp->m_fdblocks, 0, GFP_KERNEL); - if (error) - goto free_ifree; - error = percpu_counter_init(&mp->m_delalloc_blks, 0, GFP_KERNEL); if (error) - goto free_fdblocks; + goto free_ifree; error = percpu_counter_init(&mp->m_delalloc_rtextents, 0, GFP_KERNEL); if (error) goto free_delalloc; - error = percpu_counter_init(&mp->m_frextents, 0, GFP_KERNEL); - if (error) - goto free_delalloc_rt; + for (i = 0; i < XC_FREE_NR; i++) { + error = percpu_counter_init(&mp->m_free[i].count, 0, + GFP_KERNEL); + if (error) + goto free_freecounters; + } return 0; -free_delalloc_rt: +free_freecounters: + while (--i >= 0) + percpu_counter_destroy(&mp->m_free[i].count); percpu_counter_destroy(&mp->m_delalloc_rtextents); free_delalloc: percpu_counter_destroy(&mp->m_delalloc_blks); -free_fdblocks: - percpu_counter_destroy(&mp->m_fdblocks); free_ifree: percpu_counter_destroy(&mp->m_ifree); free_icount: @@ -1112,24 +1180,28 @@ xfs_reinit_percpu_counters( { percpu_counter_set(&mp->m_icount, mp->m_sb.sb_icount); percpu_counter_set(&mp->m_ifree, mp->m_sb.sb_ifree); - percpu_counter_set(&mp->m_fdblocks, mp->m_sb.sb_fdblocks); - percpu_counter_set(&mp->m_frextents, mp->m_sb.sb_frextents); + xfs_set_freecounter(mp, XC_FREE_BLOCKS, mp->m_sb.sb_fdblocks); + if 
(!xfs_has_zoned(mp)) + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, + mp->m_sb.sb_frextents); } static void xfs_destroy_percpu_counters( struct xfs_mount *mp) { + enum xfs_free_counter i; + + for (i = 0; i < XC_FREE_NR; i++) + percpu_counter_destroy(&mp->m_free[i].count); percpu_counter_destroy(&mp->m_icount); percpu_counter_destroy(&mp->m_ifree); - percpu_counter_destroy(&mp->m_fdblocks); ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_rtextents) == 0); percpu_counter_destroy(&mp->m_delalloc_rtextents); ASSERT(xfs_is_shutdown(mp) || percpu_counter_sum(&mp->m_delalloc_blks) == 0); percpu_counter_destroy(&mp->m_delalloc_blks); - percpu_counter_destroy(&mp->m_frextents); } static int @@ -1210,11 +1282,24 @@ xfs_fs_shutdown( xfs_force_shutdown(XFS_M(sb), SHUTDOWN_DEVICE_REMOVED); } +static int +xfs_fs_show_stats( + struct seq_file *m, + struct dentry *root) +{ + struct xfs_mount *mp = XFS_M(root->d_sb); + + if (xfs_has_zoned(mp) && IS_ENABLED(CONFIG_XFS_RT)) + xfs_zoned_show_stats(m, mp); + return 0; +} + static const struct super_operations xfs_super_operations = { .alloc_inode = xfs_fs_alloc_inode, .destroy_inode = xfs_fs_destroy_inode, .dirty_inode = xfs_fs_dirty_inode, .drop_inode = xfs_fs_drop_inode, + .evict_inode = xfs_fs_evict_inode, .put_super = xfs_fs_put_super, .sync_fs = xfs_fs_sync_fs, .freeze_fs = xfs_fs_freeze, @@ -1224,6 +1309,7 @@ static const struct super_operations xfs_super_operations = { .nr_cached_objects = xfs_fs_nr_cached_objects, .free_cached_objects = xfs_fs_free_cached_objects, .shutdown = xfs_fs_shutdown, + .show_stats = xfs_fs_show_stats, }; static int @@ -1261,6 +1347,42 @@ suffix_kstrtoint( return ret; } +static int +suffix_kstrtoull( + const char *s, + unsigned int base, + unsigned long long *res) +{ + int last, shift_left_factor = 0; + unsigned long long _res; + char *value; + int ret = 0; + + value = kstrdup(s, GFP_KERNEL); + if (!value) + return -ENOMEM; + + last = strlen(value) - 1; + if (value[last] == 'K' || value[last] == 'k') { + shift_left_factor = 10; + value[last] = '\0'; + } + if (value[last] == 'M' || value[last] == 'm') { + shift_left_factor = 20; + value[last] = '\0'; + } + if (value[last] == 'G' || value[last] == 'g') { + shift_left_factor = 30; + value[last] = '\0'; + } + + if (kstrtoull(value, base, &_res)) + ret = -EINVAL; + kfree(value); + *res = _res << shift_left_factor; + return ret; +} + static inline void xfs_fs_warn_deprecated( struct fs_context *fc, @@ -1436,6 +1558,23 @@ xfs_fs_parse_param( xfs_fs_warn_deprecated(fc, param, XFS_FEAT_NOATTR2, true); parsing_mp->m_features |= XFS_FEAT_NOATTR2; return 0; + case Opt_max_open_zones: + parsing_mp->m_max_open_zones = result.uint_32; + return 0; + case Opt_lifetime: + parsing_mp->m_features &= ~XFS_FEAT_NOLIFETIME; + return 0; + case Opt_nolifetime: + parsing_mp->m_features |= XFS_FEAT_NOLIFETIME; + return 0; + case Opt_max_atomic_write: + if (suffix_kstrtoull(param->string, 10, + &parsing_mp->m_awu_max_bytes)) { + xfs_warn(parsing_mp, + "max atomic write size must be positive integer"); + return -EINVAL; + } + return 0; default: xfs_warn(parsing_mp, "unknown mount option [%s].", param->key); return -EINVAL; @@ -1780,8 +1919,17 @@ xfs_fs_fill_super( mp->m_features &= ~XFS_FEAT_DISCARD; } - if (xfs_has_metadir(mp)) + if (xfs_has_zoned(mp)) { + if (!xfs_has_metadir(mp)) { + xfs_alert(mp, + "metadir feature required for zoned realtime devices."); + error = -EINVAL; + goto out_filestream_unmount; + } + xfs_warn_experimental(mp, XFS_EXPERIMENTAL_ZONED); + } else if 
(xfs_has_metadir(mp)) { xfs_warn_experimental(mp, XFS_EXPERIMENTAL_METADIR); + } if (xfs_has_reflink(mp)) { if (xfs_has_realtime(mp) && @@ -1793,19 +1941,19 @@ xfs_fs_fill_super( goto out_filestream_unmount; } + if (xfs_has_zoned(mp)) { + xfs_alert(mp, + "reflink not compatible with zoned RT device!"); + error = -EINVAL; + goto out_filestream_unmount; + } + if (xfs_globals.always_cow) { xfs_info(mp, "using DEBUG-only always_cow mode."); mp->m_always_cow = true; } } - - if (xfs_has_exchange_range(mp)) - xfs_warn_experimental(mp, XFS_EXPERIMENTAL_EXCHRANGE); - - if (xfs_has_parent(mp)) - xfs_warn_experimental(mp, XFS_EXPERIMENTAL_PPTR); - /* * If no quota mount options were provided, maybe we'll try to pick * up the quota accounting and enforcement flags from the ondisk sb. @@ -1871,6 +2019,20 @@ xfs_remount_rw( struct xfs_sb *sbp = &mp->m_sb; int error; + if (mp->m_logdev_targp && mp->m_logdev_targp != mp->m_ddev_targp && + bdev_read_only(mp->m_logdev_targp->bt_bdev)) { + xfs_warn(mp, + "ro->rw transition prohibited by read-only logdev"); + return -EACCES; + } + + if (mp->m_rtdev_targp && + bdev_read_only(mp->m_rtdev_targp->bt_bdev)) { + xfs_warn(mp, + "ro->rw transition prohibited by read-only rtdev"); + return -EACCES; + } + if (xfs_has_norecovery(mp)) { xfs_warn(mp, "ro->rw transition prohibited on norecovery mount"); @@ -1917,6 +2079,9 @@ xfs_remount_rw( /* Re-enable the background inode inactivation worker. */ xfs_inodegc_start(mp); + /* Restart zone reclaim */ + xfs_zone_gc_start(mp); + return 0; } @@ -1961,6 +2126,9 @@ xfs_remount_ro( */ xfs_inodegc_stop(mp); + /* Stop zone reclaim */ + xfs_zone_gc_stop(mp); + /* Free the per-AG metadata reservation pool. */ xfs_fs_unreserve_ag_blocks(mp); @@ -2010,6 +2178,29 @@ xfs_fs_reconfigure( if (error) return error; + /* attr2 -> noattr2 */ + if (xfs_has_noattr2(new_mp)) { + if (xfs_has_crc(mp)) { + xfs_warn(mp, + "attr2 is always enabled for a V5 filesystem - can't be changed."); + return -EINVAL; + } + mp->m_features &= ~XFS_FEAT_ATTR2; + mp->m_features |= XFS_FEAT_NOATTR2; + } else if (xfs_has_attr2(new_mp)) { + /* noattr2 -> attr2 */ + mp->m_features &= ~XFS_FEAT_NOATTR2; + mp->m_features |= XFS_FEAT_ATTR2; + } + + /* Validate new max_atomic_write option before making other changes */ + if (mp->m_awu_max_bytes != new_mp->m_awu_max_bytes) { + error = xfs_set_max_atomic_write_opt(mp, + new_mp->m_awu_max_bytes); + if (error) + return error; + } + /* inode32 -> inode64 */ if (xfs_has_small_inums(mp) && !xfs_has_small_inums(new_mp)) { mp->m_features &= ~XFS_FEAT_SMALL_INUMS; mp->m_maxagi = xfs_set_inode_alloc(mp, mp->m_sb.sb_agcount); @@ -2022,6 +2213,17 @@ xfs_fs_reconfigure( mp->m_maxagi = xfs_set_inode_alloc(mp, mp->m_sb.sb_agcount); } + /* + * Now that mp has been modified according to the remount options, we + * do a final option validation with xfs_finish_flags() just like it is + * done during mount. We cannot use xfs_finish_flags() on new_mp as it + * contains only the user given options. 
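For reference, the suffix_kstrtoull() helper added earlier in this file accepts the K/M/G suffixes used by the new max_atomic_write option, shifting the parsed value left by 10, 20 or 30 bits; note that it writes *res even when parsing fails, so callers must rely on the return value. A standalone userspace sketch of the same parsing, simplified and with invented names:

	#include <stdio.h>
	#include <stdlib.h>
	#include <string.h>

	static int parse_size(const char *s, unsigned long long *res)
	{
		char buf[32];
		size_t last;
		int shift = 0;

		if (!*s || strlen(s) >= sizeof(buf))
			return -1;
		strcpy(buf, s);
		last = strlen(buf) - 1;
		switch (buf[last]) {
		case 'K': case 'k': shift = 10; buf[last] = '\0'; break;
		case 'M': case 'm': shift = 20; buf[last] = '\0'; break;
		case 'G': case 'g': shift = 30; buf[last] = '\0'; break;
		}
		*res = strtoull(buf, NULL, 10) << shift;
		return 0;
	}

	int main(void)
	{
		unsigned long long v;

		if (parse_size("64k", &v) == 0)
			printf("%llu\n", v);	/* 65536 */
		return 0;
	}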
+ */ + error = xfs_finish_flags(mp); + if (error) + return error; + /* ro -> rw */ if (xfs_is_readonly(mp) && !(flags & SB_RDONLY)) { error = xfs_remount_rw(mp); @@ -2082,6 +2284,7 @@ xfs_init_fs_context( for (i = 0; i < XG_TYPE_MAX; i++) xa_init(&mp->m_groups[i].xa); mutex_init(&mp->m_growlock); + mutex_init(&mp->m_metafile_resv_lock); INIT_WORK(&mp->m_flush_inodes_work, xfs_flush_inodes_worker); INIT_DELAYED_WORK(&mp->m_reclaim_work, xfs_reclaim_worker); mp->m_kobj.kobject.kset = xfs_kset; @@ -2122,7 +2325,8 @@ static struct file_system_type xfs_fs_type = { .init_fs_context = xfs_init_fs_context, .parameters = xfs_fs_parameters, .kill_sb = xfs_kill_sb, - .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME, + .fs_flags = FS_REQUIRES_DEV | FS_ALLOW_IDMAP | FS_MGTIME | + FS_LBS, }; MODULE_ALIAS_FS("xfs"); diff --git a/fs/xfs/xfs_sysctl.h b/fs/xfs/xfs_sysctl.h index 276696a07040..51646f066c4f 100644 --- a/fs/xfs/xfs_sysctl.h +++ b/fs/xfs/xfs_sysctl.h @@ -29,8 +29,6 @@ typedef struct xfs_param { xfs_sysctl_val_t inherit_sync; /* Inherit the "sync" inode flag. */ xfs_sysctl_val_t inherit_nodump;/* Inherit the "nodump" inode flag. */ xfs_sysctl_val_t inherit_noatim;/* Inherit the "noatime" inode flag. */ - xfs_sysctl_val_t xfs_buf_timer; /* Interval between xfsbufd wakeups. */ - xfs_sysctl_val_t xfs_buf_age; /* Metadata buffer age before flush. */ xfs_sysctl_val_t inherit_nosym; /* Inherit the "nosymlinks" flag. */ xfs_sysctl_val_t rotorstep; /* inode32 AG rotoring control knob */ xfs_sysctl_val_t inherit_nodfrg;/* Inherit the "nodefrag" inode flag. */ diff --git a/fs/xfs/xfs_sysfs.c b/fs/xfs/xfs_sysfs.c index 60cb5318fdae..7a5c5ef2db92 100644 --- a/fs/xfs/xfs_sysfs.c +++ b/fs/xfs/xfs_sysfs.c @@ -13,6 +13,7 @@ #include "xfs_log.h" #include "xfs_log_priv.h" #include "xfs_mount.h" +#include "xfs_zones.h" struct xfs_sysfs_attr { struct attribute attr; @@ -69,7 +70,7 @@ static struct attribute *xfs_mp_attrs[] = { }; ATTRIBUTE_GROUPS(xfs_mp); -const struct kobj_type xfs_mp_ktype = { +static const struct kobj_type xfs_mp_ktype = { .release = xfs_sysfs_release, .sysfs_ops = &xfs_sysfs_ops, .default_groups = xfs_mp_groups, @@ -568,8 +569,8 @@ retry_timeout_seconds_store( if (val == -1) cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; else { - cfg->retry_timeout = msecs_to_jiffies(val * MSEC_PER_SEC); - ASSERT(msecs_to_jiffies(val * MSEC_PER_SEC) < LONG_MAX); + cfg->retry_timeout = secs_to_jiffies(val); + ASSERT(secs_to_jiffies(val) < LONG_MAX); } return count; } @@ -686,8 +687,8 @@ xfs_error_sysfs_init_class( if (init[i].retry_timeout == XFS_ERR_RETRY_FOREVER) cfg->retry_timeout = XFS_ERR_RETRY_FOREVER; else - cfg->retry_timeout = msecs_to_jiffies( - init[i].retry_timeout * MSEC_PER_SEC); + cfg->retry_timeout = + secs_to_jiffies(init[i].retry_timeout); } return 0; @@ -701,45 +702,135 @@ out_error: return error; } +static inline struct xfs_mount *zoned_to_mp(struct kobject *kobj) +{ + return container_of(to_kobj(kobj), struct xfs_mount, m_zoned_kobj); +} + +static ssize_t +max_open_zones_show( + struct kobject *kobj, + char *buf) +{ + /* only report the open zones available for user data */ + return sysfs_emit(buf, "%u\n", + zoned_to_mp(kobj)->m_max_open_zones - XFS_OPEN_GC_ZONES); +} +XFS_SYSFS_ATTR_RO(max_open_zones); + +static ssize_t +zonegc_low_space_store( + struct kobject *kobj, + const char *buf, + size_t count) +{ + int ret; + unsigned int val; + + ret = kstrtouint(buf, 0, &val); + if (ret) + return ret; + + if (val > 100) + return -EINVAL; + + zoned_to_mp(kobj)->m_zonegc_low_space = val; + + 
return count; +} + +static ssize_t +zonegc_low_space_show( + struct kobject *kobj, + char *buf) +{ + return sysfs_emit(buf, "%u\n", + zoned_to_mp(kobj)->m_zonegc_low_space); +} +XFS_SYSFS_ATTR_RW(zonegc_low_space); + +static struct attribute *xfs_zoned_attrs[] = { + ATTR_LIST(max_open_zones), + ATTR_LIST(zonegc_low_space), + NULL, +}; +ATTRIBUTE_GROUPS(xfs_zoned); + +static const struct kobj_type xfs_zoned_ktype = { + .release = xfs_sysfs_release, + .sysfs_ops = &xfs_sysfs_ops, + .default_groups = xfs_zoned_groups, +}; + int -xfs_error_sysfs_init( +xfs_mount_sysfs_init( struct xfs_mount *mp) { int error; + super_set_sysfs_name_id(mp->m_super); + + /* .../xfs/<dev>/ */ + error = xfs_sysfs_init(&mp->m_kobj, &xfs_mp_ktype, + NULL, mp->m_super->s_id); + if (error) + return error; + + /* .../xfs/<dev>/stats/ */ + error = xfs_sysfs_init(&mp->m_stats.xs_kobj, &xfs_stats_ktype, + &mp->m_kobj, "stats"); + if (error) + goto out_remove_fsdir; + /* .../xfs/<dev>/error/ */ error = xfs_sysfs_init(&mp->m_error_kobj, &xfs_error_ktype, &mp->m_kobj, "error"); if (error) - return error; + goto out_remove_stats_dir; + /* .../xfs/<dev>/error/fail_at_unmount */ error = sysfs_create_file(&mp->m_error_kobj.kobject, ATTR_LIST(fail_at_unmount)); if (error) - goto out_error; + goto out_remove_error_dir; /* .../xfs/<dev>/error/metadata/ */ error = xfs_error_sysfs_init_class(mp, XFS_ERR_METADATA, "metadata", &mp->m_error_meta_kobj, xfs_error_meta_init); if (error) - goto out_error; + goto out_remove_error_dir; + + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) { + /* .../xfs/<dev>/zoned/ */ + error = xfs_sysfs_init(&mp->m_zoned_kobj, &xfs_zoned_ktype, + &mp->m_kobj, "zoned"); + if (error) + goto out_remove_error_dir; + } return 0; -out_error: +out_remove_error_dir: xfs_sysfs_del(&mp->m_error_kobj); +out_remove_stats_dir: + xfs_sysfs_del(&mp->m_stats.xs_kobj); +out_remove_fsdir: + xfs_sysfs_del(&mp->m_kobj); return error; } void -xfs_error_sysfs_del( +xfs_mount_sysfs_del( struct xfs_mount *mp) { struct xfs_error_cfg *cfg; int i, j; + if (IS_ENABLED(CONFIG_XFS_RT) && xfs_has_zoned(mp)) + xfs_sysfs_del(&mp->m_zoned_kobj); + for (i = 0; i < XFS_ERR_CLASS_MAX; i++) { for (j = 0; j < XFS_ERR_ERRNO_MAX; j++) { cfg = &mp->m_error_cfg[i][j]; @@ -749,6 +840,8 @@ xfs_error_sysfs_del( } xfs_sysfs_del(&mp->m_error_meta_kobj); xfs_sysfs_del(&mp->m_error_kobj); + xfs_sysfs_del(&mp->m_stats.xs_kobj); + xfs_sysfs_del(&mp->m_kobj); } struct xfs_error_cfg * diff --git a/fs/xfs/xfs_sysfs.h b/fs/xfs/xfs_sysfs.h index 148893ebfdef..1622fe80ad3e 100644 --- a/fs/xfs/xfs_sysfs.h +++ b/fs/xfs/xfs_sysfs.h @@ -7,7 +7,6 @@ #ifndef __XFS_SYSFS_H__ #define __XFS_SYSFS_H__ -extern const struct kobj_type xfs_mp_ktype; /* xfs_mount */ extern const struct kobj_type xfs_dbg_ktype; /* debug */ extern const struct kobj_type xfs_log_ktype; /* xlog */ extern const struct kobj_type xfs_stats_ktype; /* stats */ @@ -53,7 +52,7 @@ xfs_sysfs_del( wait_for_completion(&kobj->complete); } -int xfs_error_sysfs_init(struct xfs_mount *mp); -void xfs_error_sysfs_del(struct xfs_mount *mp); +int xfs_mount_sysfs_init(struct xfs_mount *mp); +void xfs_mount_sysfs_del(struct xfs_mount *mp); #endif /* __XFS_SYSFS_H__ */ diff --git a/fs/xfs/xfs_trace.c b/fs/xfs/xfs_trace.c index 8f530e69c18a..a60556dbd172 100644 --- a/fs/xfs/xfs_trace.c +++ b/fs/xfs/xfs_trace.c @@ -49,6 +49,8 @@ #include "xfs_metafile.h" #include "xfs_metadir.h" #include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" /* * We include this last to have the helpers above available 
for the trace diff --git a/fs/xfs/xfs_trace.h b/fs/xfs/xfs_trace.h index b29462363b81..01d284a1c759 100644 --- a/fs/xfs/xfs_trace.h +++ b/fs/xfs/xfs_trace.h @@ -102,6 +102,7 @@ struct xfs_rmap_intent; struct xfs_refcount_intent; struct xfs_metadir_update; struct xfs_rtgroup; +struct xfs_open_zone; #define XFS_ATTR_FILTER_FLAGS \ { XFS_ATTR_ROOT, "ROOT" }, \ @@ -169,6 +170,99 @@ DEFINE_ATTR_LIST_EVENT(xfs_attr_list_notfound); DEFINE_ATTR_LIST_EVENT(xfs_attr_leaf_list); DEFINE_ATTR_LIST_EVENT(xfs_attr_node_list); +TRACE_EVENT(xfs_calc_atomic_write_unit_max, + TP_PROTO(struct xfs_mount *mp, unsigned int max_write, + unsigned int max_ioend, unsigned int max_agsize, + unsigned int max_rgsize), + TP_ARGS(mp, max_write, max_ioend, max_agsize, max_rgsize), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, max_write) + __field(unsigned int, max_ioend) + __field(unsigned int, max_agsize) + __field(unsigned int, max_rgsize) + __field(unsigned int, data_awu_max) + __field(unsigned int, rt_awu_max) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->max_write = max_write; + __entry->max_ioend = max_ioend; + __entry->max_agsize = max_agsize; + __entry->max_rgsize = max_rgsize; + __entry->data_awu_max = mp->m_groups[XG_TYPE_AG].awu_max; + __entry->rt_awu_max = mp->m_groups[XG_TYPE_RTG].awu_max; + ), + TP_printk("dev %d:%d max_write %u max_ioend %u max_agsize %u max_rgsize %u data_awu_max %u rt_awu_max %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->max_write, + __entry->max_ioend, + __entry->max_agsize, + __entry->max_rgsize, + __entry->data_awu_max, + __entry->rt_awu_max) +); + +TRACE_EVENT(xfs_calc_max_atomic_write_fsblocks, + TP_PROTO(struct xfs_mount *mp, unsigned int per_intent, + unsigned int step_size, unsigned int logres, + unsigned int blockcount), + TP_ARGS(mp, per_intent, step_size, logres, blockcount), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, per_intent) + __field(unsigned int, step_size) + __field(unsigned int, logres) + __field(unsigned int, blockcount) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->per_intent = per_intent; + __entry->step_size = step_size; + __entry->logres = logres; + __entry->blockcount = blockcount; + ), + TP_printk("dev %d:%d per_intent %u step_size %u logres %u blockcount %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->per_intent, + __entry->step_size, + __entry->logres, + __entry->blockcount) +); + +TRACE_EVENT(xfs_calc_max_atomic_write_log_geometry, + TP_PROTO(struct xfs_mount *mp, unsigned int per_intent, + unsigned int step_size, unsigned int blockcount, + unsigned int min_logblocks, unsigned int logres), + TP_ARGS(mp, per_intent, step_size, blockcount, min_logblocks, logres), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(unsigned int, per_intent) + __field(unsigned int, step_size) + __field(unsigned int, blockcount) + __field(unsigned int, min_logblocks) + __field(unsigned int, cur_logblocks) + __field(unsigned int, logres) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->per_intent = per_intent; + __entry->step_size = step_size; + __entry->blockcount = blockcount; + __entry->min_logblocks = min_logblocks; + __entry->cur_logblocks = mp->m_sb.sb_logblocks; + __entry->logres = logres; + ), + TP_printk("dev %d:%d per_intent %u step_size %u blockcount %u min_logblocks %u logblocks %u logres %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->per_intent, + __entry->step_size, + __entry->blockcount, + __entry->min_logblocks, + 
__entry->cur_logblocks, + __entry->logres) +); + TRACE_EVENT(xlog_intent_recovery_failed, TP_PROTO(struct xfs_mount *mp, const struct xfs_defer_op_type *ops, int error), @@ -265,6 +359,152 @@ DEFINE_GROUP_REF_EVENT(xfs_group_grab); DEFINE_GROUP_REF_EVENT(xfs_group_grab_next_tag); DEFINE_GROUP_REF_EVENT(xfs_group_rele); +#ifdef CONFIG_XFS_RT +DECLARE_EVENT_CLASS(xfs_zone_class, + TP_PROTO(struct xfs_rtgroup *rtg), + TP_ARGS(rtg), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, used) + __field(unsigned int, nr_open) + ), + TP_fast_assign( + struct xfs_mount *mp = rtg_mount(rtg); + + __entry->dev = mp->m_super->s_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->used = rtg_rmap(rtg)->i_used_blocks; + __entry->nr_open = mp->m_zone_info->zi_nr_open_zones; + ), + TP_printk("dev %d:%d rgno 0x%x used 0x%x nr_open %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->used, + __entry->nr_open) +); + +#define DEFINE_ZONE_EVENT(name) \ +DEFINE_EVENT(xfs_zone_class, name, \ + TP_PROTO(struct xfs_rtgroup *rtg), \ + TP_ARGS(rtg)) +DEFINE_ZONE_EVENT(xfs_zone_emptied); +DEFINE_ZONE_EVENT(xfs_zone_full); +DEFINE_ZONE_EVENT(xfs_zone_opened); +DEFINE_ZONE_EVENT(xfs_zone_reset); +DEFINE_ZONE_EVENT(xfs_zone_gc_target_opened); + +TRACE_EVENT(xfs_zone_free_blocks, + TP_PROTO(struct xfs_rtgroup *rtg, xfs_rgblock_t rgbno, + xfs_extlen_t len), + TP_ARGS(rtg, rgbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, used) + __field(xfs_rgblock_t, rgbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = rtg_mount(rtg)->m_super->s_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->used = rtg_rmap(rtg)->i_used_blocks; + __entry->rgbno = rgbno; + __entry->len = len; + ), + TP_printk("dev %d:%d rgno 0x%x used 0x%x rgbno 0x%x len 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->used, + __entry->rgbno, + __entry->len) +); + +DECLARE_EVENT_CLASS(xfs_zone_alloc_class, + TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno, + xfs_extlen_t len), + TP_ARGS(oz, rgbno, len), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, used) + __field(xfs_rgblock_t, written) + __field(xfs_rgblock_t, write_pointer) + __field(xfs_rgblock_t, rgbno) + __field(xfs_extlen_t, len) + ), + TP_fast_assign( + __entry->dev = rtg_mount(oz->oz_rtg)->m_super->s_dev; + __entry->rgno = rtg_rgno(oz->oz_rtg); + __entry->used = rtg_rmap(oz->oz_rtg)->i_used_blocks; + __entry->written = oz->oz_written; + __entry->write_pointer = oz->oz_write_pointer; + __entry->rgbno = rgbno; + __entry->len = len; + ), + TP_printk("dev %d:%d rgno 0x%x used 0x%x written 0x%x wp 0x%x rgbno 0x%x len 0x%x", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->used, + __entry->written, + __entry->write_pointer, + __entry->rgbno, + __entry->len) +); + +#define DEFINE_ZONE_ALLOC_EVENT(name) \ +DEFINE_EVENT(xfs_zone_alloc_class, name, \ + TP_PROTO(struct xfs_open_zone *oz, xfs_rgblock_t rgbno, \ + xfs_extlen_t len), \ + TP_ARGS(oz, rgbno, len)) +DEFINE_ZONE_ALLOC_EVENT(xfs_zone_record_blocks); +DEFINE_ZONE_ALLOC_EVENT(xfs_zone_alloc_blocks); + +TRACE_EVENT(xfs_zone_gc_select_victim, + TP_PROTO(struct xfs_rtgroup *rtg, unsigned int bucket), + TP_ARGS(rtg, bucket), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgno) + __field(xfs_rgblock_t, used) + __field(unsigned int, bucket) + ), + TP_fast_assign( + __entry->dev = 
rtg_mount(rtg)->m_super->s_dev; + __entry->rgno = rtg_rgno(rtg); + __entry->used = rtg_rmap(rtg)->i_used_blocks; + __entry->bucket = bucket; + ), + TP_printk("dev %d:%d rgno 0x%x used 0x%x bucket %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgno, + __entry->used, + __entry->bucket) +); + +TRACE_EVENT(xfs_zones_mount, + TP_PROTO(struct xfs_mount *mp), + TP_ARGS(mp), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_rgnumber_t, rgcount) + __field(uint32_t, blocks) + __field(unsigned int, max_open_zones) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->rgcount = mp->m_sb.sb_rgcount; + __entry->blocks = mp->m_groups[XG_TYPE_RTG].blocks; + __entry->max_open_zones = mp->m_max_open_zones; + ), + TP_printk("dev %d:%d zoned %u blocks_per_zone %u, max_open %u", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->rgcount, + __entry->blocks, + __entry->max_open_zones) +); +#endif /* CONFIG_XFS_RT */ + TRACE_EVENT(xfs_inodegc_worker, TP_PROTO(struct xfs_mount *mp, unsigned int shrinker_hits), TP_ARGS(mp, shrinker_hits), @@ -545,6 +785,10 @@ DEFINE_BUF_EVENT(xfs_buf_iodone_async); DEFINE_BUF_EVENT(xfs_buf_error_relse); DEFINE_BUF_EVENT(xfs_buf_drain_buftarg); DEFINE_BUF_EVENT(xfs_trans_read_buf_shut); +DEFINE_BUF_EVENT(xfs_buf_backing_folio); +DEFINE_BUF_EVENT(xfs_buf_backing_kmem); +DEFINE_BUF_EVENT(xfs_buf_backing_vmalloc); +DEFINE_BUF_EVENT(xfs_buf_backing_fallback); /* not really buffer traces, but the buf provides useful information */ DEFINE_BUF_EVENT(xfs_btree_corrupt); @@ -593,6 +837,7 @@ DEFINE_EVENT(xfs_buf_flags_class, name, \ DEFINE_BUF_FLAGS_EVENT(xfs_buf_find); DEFINE_BUF_FLAGS_EVENT(xfs_buf_get); DEFINE_BUF_FLAGS_EVENT(xfs_buf_read); +DEFINE_BUF_FLAGS_EVENT(xfs_buf_readahead); TRACE_EVENT(xfs_buf_ioerror, TP_PROTO(struct xfs_buf *bp, int error, xfs_failaddr_t caller_ip), @@ -1505,6 +1750,28 @@ DEFINE_RW_EVENT(xfs_file_direct_write); DEFINE_RW_EVENT(xfs_file_dax_write); DEFINE_RW_EVENT(xfs_reflink_bounce_dio_write); +TRACE_EVENT(xfs_iomap_atomic_write_cow, + TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count), + TP_ARGS(ip, offset, count), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(xfs_ino_t, ino) + __field(xfs_off_t, offset) + __field(ssize_t, count) + ), + TP_fast_assign( + __entry->dev = VFS_I(ip)->i_sb->s_dev; + __entry->ino = ip->i_ino; + __entry->offset = offset; + __entry->count = count; + ), + TP_printk("dev %d:%d ino 0x%llx pos 0x%llx bytecount 0x%zx", + MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ino, + __entry->offset, + __entry->count) +) + DECLARE_EVENT_CLASS(xfs_imap_class, TP_PROTO(struct xfs_inode *ip, xfs_off_t offset, ssize_t count, int whichfork, struct xfs_bmbt_irec *irec), @@ -1595,6 +1862,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write); DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_unwritten); DEFINE_SIMPLE_IO_EVENT(xfs_end_io_direct_write_append); DEFINE_SIMPLE_IO_EVENT(xfs_file_splice_read); +DEFINE_SIMPLE_IO_EVENT(xfs_zoned_map_blocks); DECLARE_EVENT_CLASS(xfs_itrunc_class, TP_PROTO(struct xfs_inode *ip, xfs_fsize_t new_size), @@ -3982,6 +4250,7 @@ DEFINE_SIMPLE_IO_EVENT(xfs_reflink_cancel_cow_range); DEFINE_SIMPLE_IO_EVENT(xfs_reflink_end_cow); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_from); DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_to); +DEFINE_INODE_IREC_EVENT(xfs_reflink_cow_remap_skip); DEFINE_INODE_ERROR_EVENT(xfs_reflink_cancel_cow_range_error); DEFINE_INODE_ERROR_EVENT(xfs_reflink_end_cow_error); @@ -5605,11 +5874,10 @@ DEFINE_METADIR_EVENT(xfs_metadir_lookup); /* 
metadata inode space reservations */ DECLARE_EVENT_CLASS(xfs_metafile_resv_class, - TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), - TP_ARGS(ip, len), + TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len), + TP_ARGS(mp, len), TP_STRUCT__entry( __field(dev_t, dev) - __field(xfs_ino_t, ino) __field(unsigned long long, freeblks) __field(unsigned long long, reserved) __field(unsigned long long, asked) @@ -5617,19 +5885,15 @@ DECLARE_EVENT_CLASS(xfs_metafile_resv_class, __field(unsigned long long, len) ), TP_fast_assign( - struct xfs_mount *mp = ip->i_mount; - __entry->dev = mp->m_super->s_dev; - __entry->ino = ip->i_ino; - __entry->freeblks = percpu_counter_sum(&mp->m_fdblocks); - __entry->reserved = ip->i_delayed_blks; - __entry->asked = ip->i_meta_resv_asked; - __entry->used = ip->i_nblocks; + __entry->freeblks = xfs_sum_freecounter_raw(mp, XC_FREE_BLOCKS); + __entry->reserved = mp->m_metafile_resv_avail; + __entry->asked = mp->m_metafile_resv_target; + __entry->used = mp->m_metafile_resv_used; __entry->len = len; ), - TP_printk("dev %d:%d ino 0x%llx freeblks %llu resv %llu ask %llu used %llu len %llu", + TP_printk("dev %d:%d freeblks %llu resv %llu ask %llu used %llu len %llu", MAJOR(__entry->dev), MINOR(__entry->dev), - __entry->ino, __entry->freeblks, __entry->reserved, __entry->asked, @@ -5638,14 +5902,14 @@ DECLARE_EVENT_CLASS(xfs_metafile_resv_class, ) #define DEFINE_METAFILE_RESV_EVENT(name) \ DEFINE_EVENT(xfs_metafile_resv_class, name, \ - TP_PROTO(struct xfs_inode *ip, xfs_filblks_t len), \ - TP_ARGS(ip, len)) + TP_PROTO(struct xfs_mount *mp, xfs_filblks_t len), \ + TP_ARGS(mp, len)) DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init); DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free); DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_alloc_space); DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_free_space); DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_critical); -DEFINE_INODE_ERROR_EVENT(xfs_metafile_resv_init_error); +DEFINE_METAFILE_RESV_EVENT(xfs_metafile_resv_init_error); #ifdef CONFIG_XFS_RT TRACE_EVENT(xfs_growfs_check_rtgeom, @@ -5668,6 +5932,46 @@ TRACE_EVENT(xfs_growfs_check_rtgeom, ); #endif /* CONFIG_XFS_RT */ +TRACE_DEFINE_ENUM(XC_FREE_BLOCKS); +TRACE_DEFINE_ENUM(XC_FREE_RTEXTENTS); +TRACE_DEFINE_ENUM(XC_FREE_RTAVAILABLE); + +DECLARE_EVENT_CLASS(xfs_freeblocks_resv_class, + TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr, + uint64_t delta, unsigned long caller_ip), + TP_ARGS(mp, ctr, delta, caller_ip), + TP_STRUCT__entry( + __field(dev_t, dev) + __field(enum xfs_free_counter, ctr) + __field(uint64_t, delta) + __field(uint64_t, avail) + __field(uint64_t, total) + __field(unsigned long, caller_ip) + ), + TP_fast_assign( + __entry->dev = mp->m_super->s_dev; + __entry->ctr = ctr; + __entry->delta = delta; + __entry->avail = mp->m_free[ctr].res_avail; + __entry->total = mp->m_free[ctr].res_total; + __entry->caller_ip = caller_ip; + ), + TP_printk("dev %d:%d ctr %s delta %llu avail %llu total %llu caller %pS", + MAJOR(__entry->dev), MINOR(__entry->dev), + __print_symbolic(__entry->ctr, XFS_FREECOUNTER_STR), + __entry->delta, + __entry->avail, + __entry->total, + (char *)__entry->caller_ip) +) +#define DEFINE_FREEBLOCKS_RESV_EVENT(name) \ +DEFINE_EVENT(xfs_freeblocks_resv_class, name, \ + TP_PROTO(struct xfs_mount *mp, enum xfs_free_counter ctr, \ + uint64_t delta, unsigned long caller_ip), \ + TP_ARGS(mp, ctr, delta, caller_ip)) +DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_reserved); +DEFINE_FREEBLOCKS_RESV_EVENT(xfs_freecounter_enospc); + #endif /* _TRACE_XFS_H */ #undef 
TRACE_INCLUDE_PATH diff --git a/fs/xfs/xfs_trans_ail.c b/fs/xfs/xfs_trans_ail.c index 0fcb1828e598..67c328d23e4a 100644 --- a/fs/xfs/xfs_trans_ail.c +++ b/fs/xfs/xfs_trans_ail.c @@ -315,7 +315,7 @@ xfs_ail_splice( } /* - * Delete the given item from the AIL. Return a pointer to the item. + * Delete the given item from the AIL. */ static void xfs_ail_delete( @@ -777,26 +777,28 @@ xfs_ail_update_finish( } /* - * xfs_trans_ail_update - bulk AIL insertion operation. + * xfs_trans_ail_update_bulk - bulk AIL insertion operation. * - * @xfs_trans_ail_update takes an array of log items that all need to be + * @xfs_trans_ail_update_bulk takes an array of log items that all need to be * positioned at the same LSN in the AIL. If an item is not in the AIL, it will - * be added. Otherwise, it will be repositioned by removing it and re-adding - * it to the AIL. If we move the first item in the AIL, update the log tail to - * match the new minimum LSN in the AIL. + * be added. Otherwise, it will be repositioned by removing it and re-adding + * it to the AIL. * - * This function takes the AIL lock once to execute the update operations on - * all the items in the array, and as such should not be called with the AIL - * lock held. As a result, once we have the AIL lock, we need to check each log - * item LSN to confirm it needs to be moved forward in the AIL. + * If we move the first item in the AIL, update the log tail to match the new + * minimum LSN in the AIL. * - * To optimise the insert operation, we delete all the items from the AIL in - * the first pass, moving them into a temporary list, then splice the temporary - * list into the correct position in the AIL. This avoids needing to do an - * insert operation on every item. + * This function should be called with the AIL lock held. * - * This function must be called with the AIL lock held. The lock is dropped - * before returning. + * To optimise the insert operation, we add all items to a temporary list, then + * splice this list into the correct position in the AIL. + * + * Items that are already in the AIL are first deleted from their current + * location before being added to the temporary list. + * + * This avoids needing to do an insert operation on every item. + * + * The AIL lock is dropped by xfs_ail_update_finish() before returning to + * the caller. 
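The collect-then-splice scheme described in the rewritten xfs_trans_ail_update_bulk() comment replaces per-item ordered inserts with a single positional splice. A tiny singly-linked userspace model of the splice step; the kernel itself uses list_head splicing under the AIL lock:

	#include <stdio.h>

	struct item { int lsn; struct item *next; };

	/* Link the whole staging chain in after 'pos' with two pointer
	 * updates, instead of walking the list once per staged item. */
	static void splice_after(struct item *pos, struct item *chain)
	{
		struct item *tail = chain;

		while (tail->next)
			tail = tail->next;
		tail->next = pos->next;
		pos->next = chain;
	}

	int main(void)
	{
		struct item c = { 30, NULL }, a = { 10, &c };		/* list: 10 -> 30 */
		struct item s2 = { 22, NULL }, s1 = { 20, &s2 };	/* staged at LSN 20 */

		splice_after(&a, &s1);
		for (struct item *i = &a; i; i = i->next)
			printf("%d ", i->lsn);	/* 10 20 22 30 */
		printf("\n");
		return 0;
	}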
*/ void xfs_trans_ail_update_bulk( @@ -909,10 +911,9 @@ xfs_trans_ail_delete( return; } - /* xfs_ail_update_finish() drops the AIL lock */ - xfs_clear_li_failed(lip); + clear_bit(XFS_LI_FAILED, &lip->li_flags); tail_lsn = xfs_ail_delete_one(ailp, lip); - xfs_ail_update_finish(ailp, tail_lsn); + xfs_ail_update_finish(ailp, tail_lsn); /* drops the AIL lock */ } int diff --git a/fs/xfs/xfs_trans_priv.h b/fs/xfs/xfs_trans_priv.h index bd841df93021..f945f0450b16 100644 --- a/fs/xfs/xfs_trans_priv.h +++ b/fs/xfs/xfs_trans_priv.h @@ -167,32 +167,4 @@ xfs_trans_ail_copy_lsn( } #endif -static inline void -xfs_clear_li_failed( - struct xfs_log_item *lip) -{ - struct xfs_buf *bp = lip->li_buf; - - ASSERT(test_bit(XFS_LI_IN_AIL, &lip->li_flags)); - lockdep_assert_held(&lip->li_ailp->ail_lock); - - if (test_and_clear_bit(XFS_LI_FAILED, &lip->li_flags)) { - lip->li_buf = NULL; - xfs_buf_rele(bp); - } -} - -static inline void -xfs_set_li_failed( - struct xfs_log_item *lip, - struct xfs_buf *bp) -{ - lockdep_assert_held(&lip->li_ailp->ail_lock); - - if (!test_and_set_bit(XFS_LI_FAILED, &lip->li_flags)) { - xfs_buf_hold(bp); - lip->li_buf = bp; - } -} - #endif /* __XFS_TRANS_PRIV_H__ */ diff --git a/fs/xfs/xfs_zone_alloc.c b/fs/xfs/xfs_zone_alloc.c new file mode 100644 index 000000000000..80add26c0111 --- /dev/null +++ b/fs/xfs/xfs_zone_alloc.c @@ -0,0 +1,1336 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_error.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_iomap.h" +#include "xfs_trans.h" +#include "xfs_alloc.h" +#include "xfs_bmap.h" +#include "xfs_bmap_btree.h" +#include "xfs_trans_space.h" +#include "xfs_refcount.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" +#include "xfs_zones.h" +#include "xfs_trace.h" +#include "xfs_mru_cache.h" + +void +xfs_open_zone_put( + struct xfs_open_zone *oz) +{ + if (atomic_dec_and_test(&oz->oz_ref)) { + xfs_rtgroup_rele(oz->oz_rtg); + kfree(oz); + } +} + +static inline uint32_t +xfs_zone_bucket( + struct xfs_mount *mp, + uint32_t used_blocks) +{ + return XFS_ZONE_USED_BUCKETS * used_blocks / + mp->m_groups[XG_TYPE_RTG].blocks; +} + +static inline void +xfs_zone_add_to_bucket( + struct xfs_zone_info *zi, + xfs_rgnumber_t rgno, + uint32_t to_bucket) +{ + __set_bit(rgno, zi->zi_used_bucket_bitmap[to_bucket]); + zi->zi_used_bucket_entries[to_bucket]++; +} + +static inline void +xfs_zone_remove_from_bucket( + struct xfs_zone_info *zi, + xfs_rgnumber_t rgno, + uint32_t from_bucket) +{ + __clear_bit(rgno, zi->zi_used_bucket_bitmap[from_bucket]); + zi->zi_used_bucket_entries[from_bucket]--; +} + +static void +xfs_zone_account_reclaimable( + struct xfs_rtgroup *rtg, + uint32_t freed) +{ + struct xfs_group *xg = &rtg->rtg_group; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_zone_info *zi = mp->m_zone_info; + uint32_t used = rtg_rmap(rtg)->i_used_blocks; + xfs_rgnumber_t rgno = rtg_rgno(rtg); + uint32_t from_bucket = xfs_zone_bucket(mp, used + freed); + uint32_t to_bucket = xfs_zone_bucket(mp, used); + bool was_full = (used + freed == rtg_blocks(rtg)); + + /* + * This can be called from log recovery, where the zone_info structure + * hasn't been allocated yet. 
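The bucket index computed by xfs_zone_bucket() above is simply the zone's used fraction scaled to the number of buckets. A worked userspace example, assuming four buckets purely for illustration; the real XFS_ZONE_USED_BUCKETS value is not visible in this hunk:

	#include <stdio.h>
	#include <stdint.h>

	#define ZONE_USED_BUCKETS 4	/* illustrative stand-in */

	static uint32_t zone_bucket(uint32_t used_blocks, uint32_t zone_blocks)
	{
		return ZONE_USED_BUCKETS * used_blocks / zone_blocks;
	}

	int main(void)
	{
		uint32_t zone_blocks = 65536;

		/* roughly 10%, 40% and 90% used map to buckets 0, 1 and 3 */
		printf("%u %u %u\n",
		       zone_bucket(6554, zone_blocks),
		       zone_bucket(26214, zone_blocks),
		       zone_bucket(58982, zone_blocks));
		return 0;
	}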
Skip all work as xfs_mount_zones will + * add the zones to the right buckets before the file systems becomes + * active. + */ + if (!zi) + return; + + if (!used) { + /* + * The zone is now empty, remove it from the bottom bucket and + * trigger a reset. + */ + trace_xfs_zone_emptied(rtg); + + if (!was_full) + xfs_group_clear_mark(xg, XFS_RTG_RECLAIMABLE); + + spin_lock(&zi->zi_used_buckets_lock); + if (!was_full) + xfs_zone_remove_from_bucket(zi, rgno, from_bucket); + spin_unlock(&zi->zi_used_buckets_lock); + + spin_lock(&zi->zi_reset_list_lock); + xg->xg_next_reset = zi->zi_reset_list; + zi->zi_reset_list = xg; + spin_unlock(&zi->zi_reset_list_lock); + + if (zi->zi_gc_thread) + wake_up_process(zi->zi_gc_thread); + } else if (was_full) { + /* + * The zone transitioned from full, mark it up as reclaimable + * and wake up GC which might be waiting for zones to reclaim. + */ + spin_lock(&zi->zi_used_buckets_lock); + xfs_zone_add_to_bucket(zi, rgno, to_bucket); + spin_unlock(&zi->zi_used_buckets_lock); + + xfs_group_set_mark(xg, XFS_RTG_RECLAIMABLE); + if (zi->zi_gc_thread && xfs_zoned_need_gc(mp)) + wake_up_process(zi->zi_gc_thread); + } else if (to_bucket != from_bucket) { + /* + * Move the zone to a new bucket if it dropped below the + * threshold. + */ + spin_lock(&zi->zi_used_buckets_lock); + xfs_zone_add_to_bucket(zi, rgno, to_bucket); + xfs_zone_remove_from_bucket(zi, rgno, from_bucket); + spin_unlock(&zi->zi_used_buckets_lock); + } +} + +static void +xfs_open_zone_mark_full( + struct xfs_open_zone *oz) +{ + struct xfs_rtgroup *rtg = oz->oz_rtg; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_zone_info *zi = mp->m_zone_info; + uint32_t used = rtg_rmap(rtg)->i_used_blocks; + + trace_xfs_zone_full(rtg); + + WRITE_ONCE(rtg->rtg_open_zone, NULL); + + spin_lock(&zi->zi_open_zones_lock); + if (oz->oz_is_gc) { + ASSERT(current == zi->zi_gc_thread); + zi->zi_open_gc_zone = NULL; + } else { + zi->zi_nr_open_zones--; + list_del_init(&oz->oz_entry); + } + spin_unlock(&zi->zi_open_zones_lock); + xfs_open_zone_put(oz); + + wake_up_all(&zi->zi_zone_wait); + if (used < rtg_blocks(rtg)) + xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used); +} + +static void +xfs_zone_record_blocks( + struct xfs_trans *tp, + xfs_fsblock_t fsbno, + xfs_filblks_t len, + struct xfs_open_zone *oz, + bool used) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_rtgroup *rtg = oz->oz_rtg; + struct xfs_inode *rmapip = rtg_rmap(rtg); + + trace_xfs_zone_record_blocks(oz, xfs_rtb_to_rgbno(mp, fsbno), len); + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + xfs_rtgroup_trans_join(tp, rtg, XFS_RTGLOCK_RMAP); + if (used) { + rmapip->i_used_blocks += len; + ASSERT(rmapip->i_used_blocks <= rtg_blocks(rtg)); + } else { + xfs_add_frextents(mp, len); + } + oz->oz_written += len; + if (oz->oz_written == rtg_blocks(rtg)) + xfs_open_zone_mark_full(oz); + xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); +} + +static int +xfs_zoned_map_extent( + struct xfs_trans *tp, + struct xfs_inode *ip, + struct xfs_bmbt_irec *new, + struct xfs_open_zone *oz, + xfs_fsblock_t old_startblock) +{ + struct xfs_bmbt_irec data; + int nmaps = 1; + int error; + + /* Grab the corresponding mapping in the data fork. */ + error = xfs_bmapi_read(ip, new->br_startoff, new->br_blockcount, &data, + &nmaps, 0); + if (error) + return error; + + /* + * Cap the update to the existing extent in the data fork because we can + * only overwrite one extent at a time. 
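xfs_zone_account_reclaimable() above implements three transitions: a zone that drops to zero used blocks is queued for reset, a zone leaving the full state becomes reclaimable, and anything else may just move between used-space buckets. A toy model of that accounting, userspace only and with invented names:

	#include <stdio.h>

	#define ZONE_BLOCKS 64

	static void account_free(unsigned *used, unsigned freed)
	{
		unsigned was_full = (*used == ZONE_BLOCKS);

		*used -= freed;
		if (*used == 0)
			printf("zone empty: queue for reset\n");
		else if (was_full)
			printf("zone left the full state: now reclaimable\n");
		else
			printf("zone moved buckets: %u blocks still used\n", *used);
	}

	int main(void)
	{
		unsigned used = ZONE_BLOCKS;

		account_free(&used, 16);	/* full -> reclaimable */
		account_free(&used, 48);	/* last blocks gone -> reset */
		return 0;
	}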
+ */ + ASSERT(new->br_blockcount >= data.br_blockcount); + new->br_blockcount = data.br_blockcount; + + /* + * If a data write raced with this GC write, keep the existing data in + * the data fork, mark our newly written GC extent as reclaimable, then + * move on to the next extent. + */ + if (old_startblock != NULLFSBLOCK && + old_startblock != data.br_startblock) + goto skip; + + trace_xfs_reflink_cow_remap_from(ip, new); + trace_xfs_reflink_cow_remap_to(ip, &data); + + error = xfs_iext_count_extend(tp, ip, XFS_DATA_FORK, + XFS_IEXT_REFLINK_END_COW_CNT); + if (error) + return error; + + if (data.br_startblock != HOLESTARTBLOCK) { + ASSERT(data.br_startblock != DELAYSTARTBLOCK); + ASSERT(!isnullstartblock(data.br_startblock)); + + xfs_bmap_unmap_extent(tp, ip, XFS_DATA_FORK, &data); + if (xfs_is_reflink_inode(ip)) { + xfs_refcount_decrease_extent(tp, true, &data); + } else { + error = xfs_free_extent_later(tp, data.br_startblock, + data.br_blockcount, NULL, + XFS_AG_RESV_NONE, + XFS_FREE_EXTENT_REALTIME); + if (error) + return error; + } + } + + xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz, + true); + + /* Map the new blocks into the data fork. */ + xfs_bmap_map_extent(tp, ip, XFS_DATA_FORK, new); + return 0; + +skip: + trace_xfs_reflink_cow_remap_skip(ip, new); + xfs_zone_record_blocks(tp, new->br_startblock, new->br_blockcount, oz, + false); + return 0; +} + +int +xfs_zoned_end_io( + struct xfs_inode *ip, + xfs_off_t offset, + xfs_off_t count, + xfs_daddr_t daddr, + struct xfs_open_zone *oz, + xfs_fsblock_t old_startblock) +{ + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t end_fsb = XFS_B_TO_FSB(mp, offset + count); + struct xfs_bmbt_irec new = { + .br_startoff = XFS_B_TO_FSBT(mp, offset), + .br_startblock = xfs_daddr_to_rtb(mp, daddr), + .br_state = XFS_EXT_NORM, + }; + unsigned int resblks = + XFS_EXTENTADD_SPACE_RES(mp, XFS_DATA_FORK); + struct xfs_trans *tp; + int error; + + if (xfs_is_shutdown(mp)) + return -EIO; + + while (new.br_startoff < end_fsb) { + new.br_blockcount = end_fsb - new.br_startoff; + + error = xfs_trans_alloc(mp, &M_RES(mp)->tr_write, resblks, 0, + XFS_TRANS_RESERVE | XFS_TRANS_RES_FDBLKS, &tp); + if (error) + return error; + xfs_ilock(ip, XFS_ILOCK_EXCL); + xfs_trans_ijoin(tp, ip, 0); + + error = xfs_zoned_map_extent(tp, ip, &new, oz, old_startblock); + if (error) + xfs_trans_cancel(tp); + else + error = xfs_trans_commit(tp); + xfs_iunlock(ip, XFS_ILOCK_EXCL); + if (error) + return error; + + new.br_startoff += new.br_blockcount; + new.br_startblock += new.br_blockcount; + if (old_startblock != NULLFSBLOCK) + old_startblock += new.br_blockcount; + } + + return 0; +} + +/* + * "Free" blocks allocated in a zone. + * + * Just decrement the used blocks counter and report the space as freed. + */ +int +xfs_zone_free_blocks( + struct xfs_trans *tp, + struct xfs_rtgroup *rtg, + xfs_fsblock_t fsbno, + xfs_filblks_t len) +{ + struct xfs_mount *mp = tp->t_mountp; + struct xfs_inode *rmapip = rtg_rmap(rtg); + + xfs_assert_ilocked(rmapip, XFS_ILOCK_EXCL); + + if (len > rmapip->i_used_blocks) { + xfs_err(mp, +"trying to free more blocks (%lld) than used counter (%u).", + len, rmapip->i_used_blocks); + ASSERT(len <= rmapip->i_used_blocks); + xfs_rtginode_mark_sick(rtg, XFS_RTGI_RMAP); + xfs_force_shutdown(mp, SHUTDOWN_CORRUPT_INCORE); + return -EFSCORRUPTED; + } + + trace_xfs_zone_free_blocks(rtg, xfs_rtb_to_rgbno(mp, fsbno), len); + + rmapip->i_used_blocks -= len; + /* + * Don't add open zones to the reclaimable buckets. 
The I/O completion + * for writing the last block will take care of accounting for already + * unused blocks instead. + */ + if (!READ_ONCE(rtg->rtg_open_zone)) + xfs_zone_account_reclaimable(rtg, len); + xfs_add_frextents(mp, len); + xfs_trans_log_inode(tp, rmapip, XFS_ILOG_CORE); + return 0; +} + +/* + * Check if the zone containing the data just before the offset we are + * writing to is still open and has space. + */ +static struct xfs_open_zone * +xfs_last_used_zone( + struct iomap_ioend *ioend) +{ + struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_mount *mp = ip->i_mount; + xfs_fileoff_t offset_fsb = XFS_B_TO_FSB(mp, ioend->io_offset); + struct xfs_rtgroup *rtg = NULL; + struct xfs_open_zone *oz = NULL; + struct xfs_iext_cursor icur; + struct xfs_bmbt_irec got; + + xfs_ilock(ip, XFS_ILOCK_SHARED); + if (!xfs_iext_lookup_extent_before(ip, &ip->i_df, &offset_fsb, + &icur, &got)) { + xfs_iunlock(ip, XFS_ILOCK_SHARED); + return NULL; + } + xfs_iunlock(ip, XFS_ILOCK_SHARED); + + rtg = xfs_rtgroup_grab(mp, xfs_rtb_to_rgno(mp, got.br_startblock)); + if (!rtg) + return NULL; + + xfs_ilock(rtg_rmap(rtg), XFS_ILOCK_SHARED); + oz = READ_ONCE(rtg->rtg_open_zone); + if (oz && (oz->oz_is_gc || !atomic_inc_not_zero(&oz->oz_ref))) + oz = NULL; + xfs_iunlock(rtg_rmap(rtg), XFS_ILOCK_SHARED); + + xfs_rtgroup_rele(rtg); + return oz; +} + +static struct xfs_group * +xfs_find_free_zone( + struct xfs_mount *mp, + unsigned long start, + unsigned long end) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + XA_STATE (xas, &mp->m_groups[XG_TYPE_RTG].xa, start); + struct xfs_group *xg; + + xas_lock(&xas); + xas_for_each_marked(&xas, xg, end, XFS_RTG_FREE) + if (atomic_inc_not_zero(&xg->xg_active_ref)) + goto found; + xas_unlock(&xas); + return NULL; + +found: + xas_clear_mark(&xas, XFS_RTG_FREE); + atomic_dec(&zi->zi_nr_free_zones); + zi->zi_free_zone_cursor = xg->xg_gno; + xas_unlock(&xas); + return xg; +} + +static struct xfs_open_zone * +xfs_init_open_zone( + struct xfs_rtgroup *rtg, + xfs_rgblock_t write_pointer, + enum rw_hint write_hint, + bool is_gc) +{ + struct xfs_open_zone *oz; + + oz = kzalloc(sizeof(*oz), GFP_NOFS | __GFP_NOFAIL); + spin_lock_init(&oz->oz_alloc_lock); + atomic_set(&oz->oz_ref, 1); + oz->oz_rtg = rtg; + oz->oz_write_pointer = write_pointer; + oz->oz_written = write_pointer; + oz->oz_write_hint = write_hint; + oz->oz_is_gc = is_gc; + + /* + * All dereferences of rtg->rtg_open_zone hold the ILOCK for the rmap + * inode, but we don't really want to take that here because we are + * under the zone_list_lock. Ensure the pointer is only set for a fully + * initialized open zone structure so that a racy lookup finding it is + * fine. + */ + WRITE_ONCE(rtg->rtg_open_zone, oz); + return oz; +} + +/* + * Find a completely free zone, open it, and return a reference. 
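xfs_open_zone() below probes the free-zone set in two passes, from the cursor to the end and then wrapping from zero back to the cursor, so allocation rotates through zones instead of always reusing the lowest group numbers. The same pattern as a standalone sketch:

	#include <stdio.h>

	/* Two-pass cursor search: [cursor, nr) first, then wrap to [0, cursor). */
	static int find_free(const int *is_free, int nr, int cursor)
	{
		for (int i = cursor; i < nr; i++)
			if (is_free[i])
				return i;
		for (int i = 0; i < cursor; i++)
			if (is_free[i])
				return i;
		return -1;
	}

	int main(void)
	{
		int free_map[] = { 1, 0, 0, 1, 0 };

		printf("%d\n", find_free(free_map, 5, 2));	/* 3: found past the cursor */
		printf("%d\n", find_free(free_map, 5, 4));	/* 0: wrapped around */
		return 0;
	}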
+ */ +struct xfs_open_zone * +xfs_open_zone( + struct xfs_mount *mp, + enum rw_hint write_hint, + bool is_gc) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_group *xg; + + xg = xfs_find_free_zone(mp, zi->zi_free_zone_cursor, ULONG_MAX); + if (!xg) + xg = xfs_find_free_zone(mp, 0, zi->zi_free_zone_cursor); + if (!xg) + return NULL; + + set_current_state(TASK_RUNNING); + return xfs_init_open_zone(to_rtg(xg), 0, write_hint, is_gc); +} + +static struct xfs_open_zone * +xfs_try_open_zone( + struct xfs_mount *mp, + enum rw_hint write_hint) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_open_zone *oz; + + if (zi->zi_nr_open_zones >= mp->m_max_open_zones - XFS_OPEN_GC_ZONES) + return NULL; + if (atomic_read(&zi->zi_nr_free_zones) < + XFS_GC_ZONES - XFS_OPEN_GC_ZONES) + return NULL; + + /* + * Increment the open zone count to reserve our slot before dropping + * zi_open_zones_lock. + */ + zi->zi_nr_open_zones++; + spin_unlock(&zi->zi_open_zones_lock); + oz = xfs_open_zone(mp, write_hint, false); + spin_lock(&zi->zi_open_zones_lock); + if (!oz) { + zi->zi_nr_open_zones--; + return NULL; + } + + atomic_inc(&oz->oz_ref); + list_add_tail(&oz->oz_entry, &zi->zi_open_zones); + + /* + * If this was the last free zone, other waiters might be waiting + * on us to write to it as well. + */ + wake_up_all(&zi->zi_zone_wait); + + if (xfs_zoned_need_gc(mp)) + wake_up_process(zi->zi_gc_thread); + + trace_xfs_zone_opened(oz->oz_rtg); + return oz; +} + +/* + * For data with short or medium lifetime, try to colocated it into an + * already open zone with a matching temperature. + */ +static bool +xfs_colocate_eagerly( + enum rw_hint file_hint) +{ + switch (file_hint) { + case WRITE_LIFE_MEDIUM: + case WRITE_LIFE_SHORT: + case WRITE_LIFE_NONE: + return true; + default: + return false; + } +} + +static bool +xfs_good_hint_match( + struct xfs_open_zone *oz, + enum rw_hint file_hint) +{ + switch (oz->oz_write_hint) { + case WRITE_LIFE_LONG: + case WRITE_LIFE_EXTREME: + /* colocate long and extreme */ + if (file_hint == WRITE_LIFE_LONG || + file_hint == WRITE_LIFE_EXTREME) + return true; + break; + case WRITE_LIFE_MEDIUM: + /* colocate medium with medium */ + if (file_hint == WRITE_LIFE_MEDIUM) + return true; + break; + case WRITE_LIFE_SHORT: + case WRITE_LIFE_NONE: + case WRITE_LIFE_NOT_SET: + /* colocate short and none */ + if (file_hint <= WRITE_LIFE_SHORT) + return true; + break; + } + return false; +} + +static bool +xfs_try_use_zone( + struct xfs_zone_info *zi, + enum rw_hint file_hint, + struct xfs_open_zone *oz, + bool lowspace) +{ + if (oz->oz_write_pointer == rtg_blocks(oz->oz_rtg)) + return false; + if (!lowspace && !xfs_good_hint_match(oz, file_hint)) + return false; + if (!atomic_inc_not_zero(&oz->oz_ref)) + return false; + + /* + * If we have a hint set for the data, use that for the zone even if + * some data was written already without any hint set, but don't change + * the temperature after that as that would make little sense without + * tracking per-temperature class written block counts, which is + * probably overkill anyway. + */ + if (file_hint != WRITE_LIFE_NOT_SET && + oz->oz_write_hint == WRITE_LIFE_NOT_SET) + oz->oz_write_hint = file_hint; + + /* + * If we couldn't match by inode or life time we just pick the first + * zone with enough space above. For that we want the least busy zone + * for some definition of "least" busy. For now this simple LRU + * algorithm that rotates every zone to the end of the list will do it, + * even if it isn't exactly cache friendly. 
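The matching rules in xfs_good_hint_match() above form a small matrix: long and extreme data share zones, medium only pairs with medium, and short, none and unset all mix. Restated as a compilable sketch; the enum ordering mirrors enum rw_hint, which is an assumption of this sketch:

	#include <stdio.h>

	enum hint { NOT_SET, NONE, SHORT, MEDIUM, LONG, EXTREME };

	static int good_match(enum hint zone, enum hint file)
	{
		switch (zone) {
		case LONG:
		case EXTREME:
			return file == LONG || file == EXTREME;
		case MEDIUM:
			return file == MEDIUM;
		default:	/* SHORT, NONE, NOT_SET */
			return file <= SHORT;
		}
	}

	int main(void)
	{
		printf("%d %d %d\n",
		       good_match(LONG, EXTREME),	/* 1 */
		       good_match(MEDIUM, SHORT),	/* 0 */
		       good_match(SHORT, NOT_SET));	/* 1 */
		return 0;
	}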
+ */ + if (!list_is_last(&oz->oz_entry, &zi->zi_open_zones)) + list_move_tail(&oz->oz_entry, &zi->zi_open_zones); + return true; +} + +static struct xfs_open_zone * +xfs_select_open_zone_lru( + struct xfs_zone_info *zi, + enum rw_hint file_hint, + bool lowspace) +{ + struct xfs_open_zone *oz; + + lockdep_assert_held(&zi->zi_open_zones_lock); + + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) + if (xfs_try_use_zone(zi, file_hint, oz, lowspace)) + return oz; + + cond_resched_lock(&zi->zi_open_zones_lock); + return NULL; +} + +static struct xfs_open_zone * +xfs_select_open_zone_mru( + struct xfs_zone_info *zi, + enum rw_hint file_hint) +{ + struct xfs_open_zone *oz; + + lockdep_assert_held(&zi->zi_open_zones_lock); + + list_for_each_entry_reverse(oz, &zi->zi_open_zones, oz_entry) + if (xfs_try_use_zone(zi, file_hint, oz, false)) + return oz; + + cond_resched_lock(&zi->zi_open_zones_lock); + return NULL; +} + +static inline enum rw_hint xfs_inode_write_hint(struct xfs_inode *ip) +{ + if (xfs_has_nolifetime(ip->i_mount)) + return WRITE_LIFE_NOT_SET; + return VFS_I(ip)->i_write_hint; +} + +/* + * Try to pack inodes that are written back after they were closed tight instead + * of trying to open new zones for them or spread them to the least recently + * used zone. This optimizes the data layout for workloads that untar or copy + * a lot of small files. Right now this does not separate multiple such + * streams. + */ +static inline bool xfs_zoned_pack_tight(struct xfs_inode *ip) +{ + return !inode_is_open_for_write(VFS_I(ip)) && + !(ip->i_diflags & XFS_DIFLAG_APPEND); +} + +/* + * Pick a new zone for writes. + * + * If we aren't using up our budget of open zones just open a new one from the + * freelist. Else try to find one that matches the expected data lifetime. If + * we don't find one that is good pick any zone that is available. + */ +static struct xfs_open_zone * +xfs_select_zone_nowait( + struct xfs_mount *mp, + enum rw_hint write_hint, + bool pack_tight) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_open_zone *oz = NULL; + + if (xfs_is_shutdown(mp)) + return NULL; + + /* + * Try to fill up open zones with matching temperature if available. It + * is better to try to co-locate data when this is favorable, so we can + * activate empty zones when it is statistically better to separate + * data. + */ + spin_lock(&zi->zi_open_zones_lock); + if (xfs_colocate_eagerly(write_hint)) + oz = xfs_select_open_zone_lru(zi, write_hint, false); + else if (pack_tight) + oz = xfs_select_open_zone_mru(zi, write_hint); + if (oz) + goto out_unlock; + + /* + * See if we can open a new zone and use that. + */ + oz = xfs_try_open_zone(mp, write_hint); + if (oz) + goto out_unlock; + + /* + * Try to colocate cold data with other cold data if we failed to open a + * new zone for it. 
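The selection order in xfs_select_zone_nowait() above cascades from a temperature match through hintless zones down to any zone with space. A compressed, runnable model of that cascade; it deliberately omits the MRU pack-tight path and the open-new-zone step:

	#include <stdio.h>

	enum hint { NOT_SET, NONE, SHORT, MEDIUM, LONG, EXTREME };

	struct zone { enum hint hint; int free_blocks; };

	/* LRU scan: first zone with space and, unless any_hint, a hint match. */
	static int select_lru(struct zone *z, int nr, enum hint want, int any_hint)
	{
		for (int i = 0; i < nr; i++)
			if (z[i].free_blocks && (any_hint || z[i].hint == want))
				return i;
		return -1;
	}

	static int pick_zone(struct zone *z, int nr, enum hint want)
	{
		int i = select_lru(z, nr, want, 0);

		if (i < 0)
			i = select_lru(z, nr, NOT_SET, 0);
		if (i < 0)
			i = select_lru(z, nr, want, 1);	/* lowspace: ignore hints */
		return i;
	}

	int main(void)
	{
		struct zone zones[] = { { MEDIUM, 0 }, { LONG, 8 }, { NOT_SET, 4 } };

		/* MEDIUM data: its zone is full, so the hintless zone wins. */
		printf("%d\n", pick_zone(zones, 3, MEDIUM));	/* 2 */
		return 0;
	}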
+ */ + if (write_hint != WRITE_LIFE_NOT_SET && + !xfs_colocate_eagerly(write_hint)) + oz = xfs_select_open_zone_lru(zi, write_hint, false); + if (!oz) + oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, false); + if (!oz) + oz = xfs_select_open_zone_lru(zi, WRITE_LIFE_NOT_SET, true); +out_unlock: + spin_unlock(&zi->zi_open_zones_lock); + return oz; +} + +static struct xfs_open_zone * +xfs_select_zone( + struct xfs_mount *mp, + enum rw_hint write_hint, + bool pack_tight) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + DEFINE_WAIT (wait); + struct xfs_open_zone *oz; + + oz = xfs_select_zone_nowait(mp, write_hint, pack_tight); + if (oz) + return oz; + + for (;;) { + prepare_to_wait(&zi->zi_zone_wait, &wait, TASK_UNINTERRUPTIBLE); + oz = xfs_select_zone_nowait(mp, write_hint, pack_tight); + if (oz) + break; + schedule(); + } + finish_wait(&zi->zi_zone_wait, &wait); + return oz; +} + +static unsigned int +xfs_zone_alloc_blocks( + struct xfs_open_zone *oz, + xfs_filblks_t count_fsb, + sector_t *sector, + bool *is_seq) +{ + struct xfs_rtgroup *rtg = oz->oz_rtg; + struct xfs_mount *mp = rtg_mount(rtg); + xfs_rgblock_t rgbno; + + spin_lock(&oz->oz_alloc_lock); + count_fsb = min3(count_fsb, XFS_MAX_BMBT_EXTLEN, + (xfs_filblks_t)rtg_blocks(rtg) - oz->oz_write_pointer); + if (!count_fsb) { + spin_unlock(&oz->oz_alloc_lock); + return 0; + } + rgbno = oz->oz_write_pointer; + oz->oz_write_pointer += count_fsb; + spin_unlock(&oz->oz_alloc_lock); + + trace_xfs_zone_alloc_blocks(oz, rgbno, count_fsb); + + *sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0); + *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *sector); + if (!*is_seq) + *sector += XFS_FSB_TO_BB(mp, rgbno); + return XFS_FSB_TO_B(mp, count_fsb); +} + +void +xfs_mark_rtg_boundary( + struct iomap_ioend *ioend) +{ + struct xfs_mount *mp = XFS_I(ioend->io_inode)->i_mount; + sector_t sector = ioend->io_bio.bi_iter.bi_sector; + + if (xfs_rtb_to_rgbno(mp, xfs_daddr_to_rtb(mp, sector)) == 0) + ioend->io_flags |= IOMAP_IOEND_BOUNDARY; +} + +static void +xfs_submit_zoned_bio( + struct iomap_ioend *ioend, + struct xfs_open_zone *oz, + bool is_seq) +{ + ioend->io_bio.bi_iter.bi_sector = ioend->io_sector; + ioend->io_private = oz; + atomic_inc(&oz->oz_ref); /* for xfs_zoned_end_io */ + + if (is_seq) { + ioend->io_bio.bi_opf &= ~REQ_OP_WRITE; + ioend->io_bio.bi_opf |= REQ_OP_ZONE_APPEND; + } else { + xfs_mark_rtg_boundary(ioend); + } + + submit_bio(&ioend->io_bio); +} + +/* + * Cache the last zone written to for an inode so that it is considered first + * for subsequent writes. + */ +struct xfs_zone_cache_item { + struct xfs_mru_cache_elem mru; + struct xfs_open_zone *oz; +}; + +static inline struct xfs_zone_cache_item * +xfs_zone_cache_item(struct xfs_mru_cache_elem *mru) +{ + return container_of(mru, struct xfs_zone_cache_item, mru); +} + +static void +xfs_zone_cache_free_func( + void *data, + struct xfs_mru_cache_elem *mru) +{ + struct xfs_zone_cache_item *item = xfs_zone_cache_item(mru); + + xfs_open_zone_put(item->oz); + kfree(item); +} + +/* + * Check if we have a cached last open zone available for the inode and + * if yes return a reference to it. + */ +static struct xfs_open_zone * +xfs_cached_zone( + struct xfs_mount *mp, + struct xfs_inode *ip) +{ + struct xfs_mru_cache_elem *mru; + struct xfs_open_zone *oz; + + mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino); + if (!mru) + return NULL; + oz = xfs_zone_cache_item(mru)->oz; + if (oz) { + /* + * GC only steals open zones at mount time, so no GC zones + * should end up in the cache. 
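The zone cache described above keeps one counted zone reference per inode and re-points it when the inode writes to a different zone. A toy refcounting model, userspace only, single slot, invented names:

	#include <stdio.h>

	struct zone { int refs; int id; };

	struct cache_slot { struct zone *oz; };

	static void zone_put(struct zone *z)
	{
		if (--z->refs == 0)
			printf("zone %d freed\n", z->id);
	}

	static void cache_associate(struct cache_slot *slot, struct zone *z)
	{
		z->refs++;			/* reference now owned by the cache */
		if (slot->oz)
			zone_put(slot->oz);	/* drop the previous association */
		slot->oz = z;
	}

	int main(void)
	{
		struct zone a = { .refs = 1, .id = 1 }, b = { .refs = 1, .id = 2 };
		struct cache_slot slot = { 0 };

		cache_associate(&slot, &a);
		cache_associate(&slot, &b);	/* re-point the slot, drop zone 1's ref */
		printf("a.refs=%d b.refs=%d\n", a.refs, b.refs);	/* 1 2 */
		return 0;
	}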
*/ + ASSERT(!oz->oz_is_gc); + ASSERT(atomic_read(&oz->oz_ref) > 0); + atomic_inc(&oz->oz_ref); + } + xfs_mru_cache_done(mp->m_zone_cache); + return oz; +} + +/* + * Update the last used zone cache for a given inode. + * + * The caller must have a reference on the open zone. + */ +static void +xfs_zone_cache_create_association( + struct xfs_inode *ip, + struct xfs_open_zone *oz) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_zone_cache_item *item = NULL; + struct xfs_mru_cache_elem *mru; + + ASSERT(atomic_read(&oz->oz_ref) > 0); + atomic_inc(&oz->oz_ref); + + mru = xfs_mru_cache_lookup(mp->m_zone_cache, ip->i_ino); + if (mru) { + /* + * If we have an association already, update it to point to the + * new zone. + */ + item = xfs_zone_cache_item(mru); + xfs_open_zone_put(item->oz); + item->oz = oz; + xfs_mru_cache_done(mp->m_zone_cache); + return; + } + + item = kmalloc(sizeof(*item), GFP_KERNEL); + if (!item) { + xfs_open_zone_put(oz); + return; + } + item->oz = oz; + xfs_mru_cache_insert(mp->m_zone_cache, ip->i_ino, &item->mru); +} + +void +xfs_zone_alloc_and_submit( + struct iomap_ioend *ioend, + struct xfs_open_zone **oz) +{ + struct xfs_inode *ip = XFS_I(ioend->io_inode); + struct xfs_mount *mp = ip->i_mount; + enum rw_hint write_hint = xfs_inode_write_hint(ip); + bool pack_tight = xfs_zoned_pack_tight(ip); + unsigned int alloc_len; + struct iomap_ioend *split; + bool is_seq; + + if (xfs_is_shutdown(mp)) + goto out_error; + + /* + * If we don't have a cached zone in this write context, see if the + * last extent before the one we are writing to points to an active + * zone. If so, just continue writing to it. + */ + if (!*oz && ioend->io_offset) + *oz = xfs_last_used_zone(ioend); + if (!*oz) + *oz = xfs_cached_zone(mp, ip); + + if (!*oz) { +select_zone: + *oz = xfs_select_zone(mp, write_hint, pack_tight); + if (!*oz) + goto out_error; + + xfs_zone_cache_create_association(ip, *oz); + } + + alloc_len = xfs_zone_alloc_blocks(*oz, XFS_B_TO_FSB(mp, ioend->io_size), + &ioend->io_sector, &is_seq); + if (!alloc_len) { + xfs_open_zone_put(*oz); + goto select_zone; + } + + while ((split = iomap_split_ioend(ioend, alloc_len, is_seq))) { + if (IS_ERR(split)) + goto out_split_error; + alloc_len -= split->io_bio.bi_iter.bi_size; + xfs_submit_zoned_bio(split, *oz, is_seq); + if (!alloc_len) { + xfs_open_zone_put(*oz); + goto select_zone; + } + } + + xfs_submit_zoned_bio(ioend, *oz, is_seq); + return; + +out_split_error: + ioend->io_bio.bi_status = errno_to_blk_status(PTR_ERR(split)); +out_error: + bio_io_error(&ioend->io_bio); +} + +/* + * Wake up all threads waiting for a zoned space allocation when the file system + * is shut down. + */ +void +xfs_zoned_wake_all( + struct xfs_mount *mp) +{ + /* + * Don't wake up if there is no m_zone_info. This is complicated by the + * fact that unmount can't atomically clear m_zone_info and thus we need + * to check SB_ACTIVE for that, but mount temporarily enables SB_ACTIVE + * during log recovery so we can't entirely rely on that either. + */ + if ((mp->m_super->s_flags & SB_ACTIVE) && mp->m_zone_info) + wake_up_all(&mp->m_zone_info->zi_zone_wait); +} + +/* + * Check if @rgbno in @rtg is a potentially valid block. It might still be + * unused, but that information is only found in the rmap.
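+ *
+ * As a hedged userspace model of this check (hypothetical names, not the
+ * kernel API): in an open zone only blocks below the write pointer can
+ * hold data, while a fully written zone may hold valid data anywhere
+ * unless the whole zone is marked free:
+ *
+ *	bool rgbno_possibly_valid(bool zone_open, unsigned int write_pointer,
+ *				  bool zone_free, unsigned int rgbno)
+ *	{
+ *		if (zone_open)
+ *			return rgbno < write_pointer;
+ *		return !zone_free;
+ *	}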
*/ +bool +xfs_zone_rgbno_is_valid( + struct xfs_rtgroup *rtg, + xfs_rgnumber_t rgbno) +{ + lockdep_assert_held(&rtg_rmap(rtg)->i_lock); + + if (rtg->rtg_open_zone) + return rgbno < rtg->rtg_open_zone->oz_write_pointer; + return !xa_get_mark(&rtg_mount(rtg)->m_groups[XG_TYPE_RTG].xa, + rtg_rgno(rtg), XFS_RTG_FREE); +} + +static void +xfs_free_open_zones( + struct xfs_zone_info *zi) +{ + struct xfs_open_zone *oz; + + spin_lock(&zi->zi_open_zones_lock); + while ((oz = list_first_entry_or_null(&zi->zi_open_zones, + struct xfs_open_zone, oz_entry))) { + list_del(&oz->oz_entry); + xfs_open_zone_put(oz); + } + spin_unlock(&zi->zi_open_zones_lock); +} + +struct xfs_init_zones { + struct xfs_mount *mp; + uint64_t available; + uint64_t reclaimable; +}; + +static int +xfs_init_zone( + struct xfs_init_zones *iz, + struct xfs_rtgroup *rtg, + struct blk_zone *zone) +{ + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_zone_info *zi = mp->m_zone_info; + uint64_t used = rtg_rmap(rtg)->i_used_blocks; + xfs_rgblock_t write_pointer, highest_rgbno; + int error; + + if (zone && !xfs_zone_validate(zone, rtg, &write_pointer)) + return -EFSCORRUPTED; + + /* + * For sequential write required zones, we retrieved the hardware write + * pointer above. + * + * For conventional zones or conventional devices we don't have that + * luxury. Instead, query the rmap to find the highest recorded block + * and set the write pointer to the block after that. In case of a + * power loss this misses blocks where the data I/O has completed but + * was not yet recorded in the rmap, and it also rewrites blocks if the + * most recently written ones got deleted again before unmount, but this + * is the best we can do without hardware support. + */ + if (!zone || zone->cond == BLK_ZONE_COND_NOT_WP) { + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + highest_rgbno = xfs_rtrmap_highest_rgbno(rtg); + if (highest_rgbno == NULLRGBLOCK) + write_pointer = 0; + else + write_pointer = highest_rgbno + 1; + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); + } + + /* + * If there are no used blocks, but the zone is not yet in the empty + * state, we lost power before the zone reset. In that case finish the + * work here.
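+ *
+ * The overall per-zone classification performed below can be sketched in
+ * plain C (hypothetical names, for illustration only):
+ *
+ *	enum zone_state { Z_EMPTY, Z_OPEN, Z_FULL_RECLAIMABLE, Z_FULL };
+ *
+ *	enum zone_state classify(unsigned int wp, unsigned int used,
+ *				 unsigned int zone_blocks)
+ *	{
+ *		if (wp == 0)
+ *			return Z_EMPTY;
+ *		if (wp < zone_blocks)
+ *			return Z_OPEN;
+ *		return used < zone_blocks ? Z_FULL_RECLAIMABLE : Z_FULL;
+ *	}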
*/ + if (write_pointer == rtg_blocks(rtg) && used == 0) { + error = xfs_zone_gc_reset_sync(rtg); + if (error) + return error; + write_pointer = 0; + } + + if (write_pointer == 0) { + /* zone is empty */ + atomic_inc(&zi->zi_nr_free_zones); + xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE); + iz->available += rtg_blocks(rtg); + } else if (write_pointer < rtg_blocks(rtg)) { + /* zone is open */ + struct xfs_open_zone *oz; + + atomic_inc(&rtg_group(rtg)->xg_active_ref); + oz = xfs_init_open_zone(rtg, write_pointer, WRITE_LIFE_NOT_SET, + false); + list_add_tail(&oz->oz_entry, &zi->zi_open_zones); + zi->zi_nr_open_zones++; + + iz->available += (rtg_blocks(rtg) - write_pointer); + iz->reclaimable += write_pointer - used; + } else if (used < rtg_blocks(rtg)) { + /* zone fully written, but has freed blocks */ + xfs_zone_account_reclaimable(rtg, rtg_blocks(rtg) - used); + iz->reclaimable += (rtg_blocks(rtg) - used); + } + + return 0; +} + +static int +xfs_get_zone_info_cb( + struct blk_zone *zone, + unsigned int idx, + void *data) +{ + struct xfs_init_zones *iz = data; + struct xfs_mount *mp = iz->mp; + xfs_fsblock_t zsbno = xfs_daddr_to_rtb(mp, zone->start); + xfs_rgnumber_t rgno; + struct xfs_rtgroup *rtg; + int error; + + if (xfs_rtb_to_rgbno(mp, zsbno) != 0) { + xfs_warn(mp, "mismatched zone start 0x%llx.", zsbno); + return -EFSCORRUPTED; + } + + rgno = xfs_rtb_to_rgno(mp, zsbno); + rtg = xfs_rtgroup_grab(mp, rgno); + if (!rtg) { + xfs_warn(mp, "realtime group not found for zone %u.", rgno); + return -EFSCORRUPTED; + } + error = xfs_init_zone(iz, rtg, zone); + xfs_rtgroup_rele(rtg); + return error; +} + +/* + * Calculate the max open zone limit based on the number of + * backing zones available. + */ +static inline uint32_t +xfs_max_open_zones( + struct xfs_mount *mp) +{ + unsigned int max_open, max_open_data_zones; + /* + * We need two zones for every open data zone, + * one in reserve, as we don't reclaim open zones. One data zone + * and its spare is included in XFS_MIN_ZONES. + */ + max_open_data_zones = (mp->m_sb.sb_rgcount - XFS_MIN_ZONES) / 2 + 1; + max_open = max_open_data_zones + XFS_OPEN_GC_ZONES; + + /* + * Cap the max open limit to 1/4 of the available zones. + */ + max_open = min(max_open, mp->m_sb.sb_rgcount / 4); + + return max(XFS_MIN_OPEN_ZONES, max_open); +} + +/* + * Normally we use the open zone limit that the device reports. If there is + * none, let the user pick one from the command line. + * + * If the device doesn't report an open zone limit and there is no override, + * allow holding about a quarter of the zones open. In theory we could allow + * all to be open, but at that point we run into GC deadlocks because we can't + * reclaim open zones. + * + * When used on conventional SSDs, a lower open limit is advisable, as we'll + * otherwise overwhelm the FTL just as much as a conventional block allocator. + * + * Note: To debug the open zone management code, force max_open to 1 here.
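+ *
+ * A worked example with made-up constants (the actual values of
+ * XFS_MIN_ZONES, XFS_OPEN_GC_ZONES and XFS_MIN_OPEN_ZONES may differ):
+ * for 100 zones, XFS_MIN_ZONES = 5 and XFS_OPEN_GC_ZONES = 1,
+ * xfs_max_open_zones() computes
+ *
+ *	max_open_data_zones = (100 - 5) / 2 + 1 = 48
+ *	max_open = 48 + 1 = 49
+ *	max_open = min(49, 100 / 4) = 25
+ *
+ * and then clamps the result to at least XFS_MIN_OPEN_ZONES, so the
+ * quarter-of-capacity cap dominates for large zone counts.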
*/ +static int +xfs_calc_open_zones( + struct xfs_mount *mp) +{ + struct block_device *bdev = mp->m_rtdev_targp->bt_bdev; + unsigned int bdev_open_zones = bdev_max_open_zones(bdev); + + if (!mp->m_max_open_zones) { + if (bdev_open_zones) + mp->m_max_open_zones = bdev_open_zones; + else + mp->m_max_open_zones = xfs_max_open_zones(mp); + } + + if (mp->m_max_open_zones < XFS_MIN_OPEN_ZONES) { + xfs_notice(mp, "need at least %u open zones.", + XFS_MIN_OPEN_ZONES); + return -EIO; + } + + if (bdev_open_zones && bdev_open_zones < mp->m_max_open_zones) { + mp->m_max_open_zones = bdev_open_zones; + xfs_info(mp, "limiting open zones to %u due to hardware limit.", + bdev_open_zones); + } + + if (mp->m_max_open_zones > xfs_max_open_zones(mp)) { + mp->m_max_open_zones = xfs_max_open_zones(mp); + xfs_info(mp, +"limiting open zones to %u due to total zone count (%u)", + mp->m_max_open_zones, mp->m_sb.sb_rgcount); + } + + return 0; +} + +static unsigned long * +xfs_alloc_bucket_bitmap( + struct xfs_mount *mp) +{ + return kvmalloc_array(BITS_TO_LONGS(mp->m_sb.sb_rgcount), + sizeof(unsigned long), GFP_KERNEL | __GFP_ZERO); +} + +static struct xfs_zone_info * +xfs_alloc_zone_info( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi; + int i; + + zi = kzalloc(sizeof(*zi), GFP_KERNEL); + if (!zi) + return NULL; + INIT_LIST_HEAD(&zi->zi_open_zones); + INIT_LIST_HEAD(&zi->zi_reclaim_reservations); + spin_lock_init(&zi->zi_reset_list_lock); + spin_lock_init(&zi->zi_open_zones_lock); + spin_lock_init(&zi->zi_reservation_lock); + init_waitqueue_head(&zi->zi_zone_wait); + spin_lock_init(&zi->zi_used_buckets_lock); + for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) { + zi->zi_used_bucket_bitmap[i] = xfs_alloc_bucket_bitmap(mp); + if (!zi->zi_used_bucket_bitmap[i]) + goto out_free_bitmaps; + } + return zi; + +out_free_bitmaps: + while (--i >= 0) + kvfree(zi->zi_used_bucket_bitmap[i]); + kfree(zi); + return NULL; +} + +static void +xfs_free_zone_info( + struct xfs_zone_info *zi) +{ + int i; + + xfs_free_open_zones(zi); + for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) + kvfree(zi->zi_used_bucket_bitmap[i]); + kfree(zi); +} + +int +xfs_mount_zones( + struct xfs_mount *mp) +{ + struct xfs_init_zones iz = { + .mp = mp, + }; + struct xfs_buftarg *bt = mp->m_rtdev_targp; + int error; + + if (!bt) { + xfs_notice(mp, "RT device missing."); + return -EINVAL; + } + + if (!xfs_has_rtgroups(mp) || !xfs_has_rmapbt(mp)) { + xfs_notice(mp, "invalid flag combination."); + return -EFSCORRUPTED; + } + if (mp->m_sb.sb_rextsize != 1) { + xfs_notice(mp, "zoned file systems do not support rextsize."); + return -EFSCORRUPTED; + } + if (mp->m_sb.sb_rgcount < XFS_MIN_ZONES) { + xfs_notice(mp, +"zoned file systems need to have at least %u zones.", XFS_MIN_ZONES); + return -EFSCORRUPTED; + } + + error = xfs_calc_open_zones(mp); + if (error) + return error; + + mp->m_zone_info = xfs_alloc_zone_info(mp); + if (!mp->m_zone_info) + return -ENOMEM; + + xfs_info(mp, "%u zones of %u blocks each (%u max open)", + mp->m_sb.sb_rgcount, mp->m_groups[XG_TYPE_RTG].blocks, + mp->m_max_open_zones); + trace_xfs_zones_mount(mp); + + if (bdev_is_zoned(bt->bt_bdev)) { + error = blkdev_report_zones(bt->bt_bdev, + XFS_FSB_TO_BB(mp, mp->m_sb.sb_rtstart), + mp->m_sb.sb_rgcount, xfs_get_zone_info_cb, &iz); + if (error < 0) + goto out_free_zone_info; + } else { + struct xfs_rtgroup *rtg = NULL; + + while ((rtg = xfs_rtgroup_next(mp, rtg))) { + error = xfs_init_zone(&iz, rtg, NULL); + if (error) + goto out_free_zone_info; + } + } + + xfs_set_freecounter(mp, 
XC_FREE_RTAVAILABLE, iz.available); + xfs_set_freecounter(mp, XC_FREE_RTEXTENTS, + iz.available + iz.reclaimable); + + /* + * The user may configure GC to free up a percentage of unused blocks. + * By default this is 0. GC will always trigger at the minimum level + * needed to keep max_open_zones available for data placement. + */ + mp->m_zonegc_low_space = 0; + + error = xfs_zone_gc_mount(mp); + if (error) + goto out_free_zone_info; + + /* + * Set up an MRU cache to track the inode to open zone mapping for data + * placement purposes. The magic values for group count and lifetime + * are the same as the defaults for filestreams, which seems sane + * enough. + */ + xfs_mru_cache_create(&mp->m_zone_cache, mp, + 5000, 10, xfs_zone_cache_free_func); + return 0; + +out_free_zone_info: + xfs_free_zone_info(mp->m_zone_info); + return error; +} + +void +xfs_unmount_zones( + struct xfs_mount *mp) +{ + xfs_zone_gc_unmount(mp); + xfs_free_zone_info(mp->m_zone_info); + xfs_mru_cache_destroy(mp->m_zone_cache); +} diff --git a/fs/xfs/xfs_zone_alloc.h b/fs/xfs/xfs_zone_alloc.h new file mode 100644 index 000000000000..ecf39106704c --- /dev/null +++ b/fs/xfs/xfs_zone_alloc.h @@ -0,0 +1,70 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _XFS_ZONE_ALLOC_H +#define _XFS_ZONE_ALLOC_H + +struct iomap_ioend; +struct xfs_open_zone; + +struct xfs_zone_alloc_ctx { + struct xfs_open_zone *open_zone; + xfs_filblks_t reserved_blocks; +}; + +/* + * Grab any available space, even if it is less than what the caller asked for. + */ +#define XFS_ZR_GREEDY (1U << 0) +/* + * Only grab instantly available space, don't wait or GC. + */ +#define XFS_ZR_NOWAIT (1U << 1) +/* + * Dip into the reserved pool. + */ +#define XFS_ZR_RESERVED (1U << 2) + +int xfs_zoned_space_reserve(struct xfs_inode *ip, xfs_filblks_t count_fsb, + unsigned int flags, struct xfs_zone_alloc_ctx *ac); +void xfs_zoned_space_unreserve(struct xfs_inode *ip, + struct xfs_zone_alloc_ctx *ac); +void xfs_zoned_add_available(struct xfs_mount *mp, xfs_filblks_t count_fsb); + +void xfs_zone_alloc_and_submit(struct iomap_ioend *ioend, + struct xfs_open_zone **oz); +int xfs_zone_free_blocks(struct xfs_trans *tp, struct xfs_rtgroup *rtg, + xfs_fsblock_t fsbno, xfs_filblks_t len); +int xfs_zoned_end_io(struct xfs_inode *ip, xfs_off_t offset, xfs_off_t count, + xfs_daddr_t daddr, struct xfs_open_zone *oz, + xfs_fsblock_t old_startblock); +void xfs_open_zone_put(struct xfs_open_zone *oz); + +void xfs_zoned_wake_all(struct xfs_mount *mp); +bool xfs_zone_rgbno_is_valid(struct xfs_rtgroup *rtg, xfs_rgnumber_t rgbno); +void xfs_mark_rtg_boundary(struct iomap_ioend *ioend); + +uint64_t xfs_zoned_default_resblks(struct xfs_mount *mp, + enum xfs_free_counter ctr); +void xfs_zoned_show_stats(struct seq_file *m, struct xfs_mount *mp); + +#ifdef CONFIG_XFS_RT +int xfs_mount_zones(struct xfs_mount *mp); +void xfs_unmount_zones(struct xfs_mount *mp); +void xfs_zone_gc_start(struct xfs_mount *mp); +void xfs_zone_gc_stop(struct xfs_mount *mp); +#else +static inline int xfs_mount_zones(struct xfs_mount *mp) +{ + return -EIO; +} +static inline void xfs_unmount_zones(struct xfs_mount *mp) +{ +} +static inline void xfs_zone_gc_start(struct xfs_mount *mp) +{ +} +static inline void xfs_zone_gc_stop(struct xfs_mount *mp) +{ +} +#endif /* CONFIG_XFS_RT */ + +#endif /* _XFS_ZONE_ALLOC_H */ diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c new file mode 100644 index 000000000000..d613a4094db6 --- /dev/null +++ b/fs/xfs/xfs_zone_gc.c @@ -0,0 +1,1184 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * 
Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_log_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_btree.h" +#include "xfs_trans.h" +#include "xfs_icache.h" +#include "xfs_rmap.h" +#include "xfs_rtbitmap.h" +#include "xfs_rtrmap_btree.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" +#include "xfs_zones.h" +#include "xfs_trace.h" + +/* + * Implement Garbage Collection (GC) of partially used zones. + * + * To support the purely sequential writes in each zone, zoned XFS needs to be + * able to move data remaining in a zone out of it to reset the zone to prepare + * for writing to it again. + * + * This is done by the GC thread implemented in this file. To support that, a + * number of zones (XFS_GC_ZONES) is reserved from the user visible capacity to + * write the garbage collected data into. + * + * Whenever the available space is below the chosen threshold, the GC thread + * looks for potential non-empty but not fully used zones that are worth + * reclaiming. Once found, the rmap for the victim zone is queried, and after + * a bit of sorting to reduce fragmentation, the still live extents are read + * into memory and written to the GC target zone, and the bmap btree of the + * files is updated to point to the new location. To avoid taking the IOLOCK + * and MMAPLOCK for the entire GC process and thus affecting the latency of + * user reads and writes to the files, the GC writes are speculative and the + * I/O completion checks that no other writes happened for the affected regions + * before remapping. + * + * Once a zone does not contain any valid data, be that through GC or user + * block removal, it is queued for a zone reset. The reset operation + * carefully ensures that the RT device cache is flushed and all transactions + * referencing the rmap have been committed to disk. + */ + +/* + * Size of each GC scratch pad. This is also the upper bound for each + * GC I/O, which helps to keep latency down. + */ +#define XFS_GC_CHUNK_SIZE SZ_1M + +/* + * Scratchpad data to read GCed data into. + * + * The offset member tracks where the next allocation starts, and freed tracks + * the amount of space that is not used anymore. + */ +#define XFS_ZONE_GC_NR_SCRATCH 2 +struct xfs_zone_scratch { + struct folio *folio; + unsigned int offset; + unsigned int freed; +}; + +/* + * Chunk that is read and written for each GC operation. + * + * Note that for writes to actual zoned devices, the chunk can be split when + * reaching the hardware limit. + */ +struct xfs_gc_bio { + struct xfs_zone_gc_data *data; + + /* + * Entry into the reading/writing/resetting list. Only accessed from + * the GC thread, so no locking needed. + */ + struct list_head entry; + + /* + * State of this gc_bio. Done means the current I/O completed. + * Set from the bio end I/O handler, read from the GC thread. + */ + enum { + XFS_GC_BIO_NEW, + XFS_GC_BIO_DONE, + } state; + + /* + * Pointer to the inode and byte range in the inode that this + * GC chunk is operating on. + */ + struct xfs_inode *ip; + loff_t offset; + unsigned int len; + + /* + * Existing startblock (in the zone to be freed) and newly assigned + * daddr in the zone GCed into. + */ + xfs_fsblock_t old_startblock; + xfs_daddr_t new_daddr; + struct xfs_zone_scratch *scratch; + + /* Are we writing to a sequential write required zone? 
*/ + bool is_seq; + + /* Open zone being written to */ + struct xfs_open_zone *oz; + + /* Bio used for reads and writes, including the bvec used by it */ + struct bio_vec bv; + struct bio bio; /* must be last */ +}; + +#define XFS_ZONE_GC_RECS 1024 + +/* iterator, needs to be reinitialized for each victim zone */ +struct xfs_zone_gc_iter { + struct xfs_rtgroup *victim_rtg; + unsigned int rec_count; + unsigned int rec_idx; + xfs_agblock_t next_startblock; + struct xfs_rmap_irec *recs; +}; + +/* + * Per-mount GC state. + */ +struct xfs_zone_gc_data { + struct xfs_mount *mp; + + /* bioset used to allocate the gc_bios */ + struct bio_set bio_set; + + /* + * Scratchpads used, and an index to indicate which one is in use. + */ + struct xfs_zone_scratch scratch[XFS_ZONE_GC_NR_SCRATCH]; + unsigned int scratch_idx; + + /* + * List of bios currently being read, written and reset. + * These lists are only accessed by the GC thread itself, and must only + * be processed in order. + */ + struct list_head reading; + struct list_head writing; + struct list_head resetting; + + /* + * Iterator for the victim zone. + */ + struct xfs_zone_gc_iter iter; +}; + +/* + * We aim to keep enough zones free in stock to fully use the open zone limit + * for data placement purposes. Additionally, the m_zonegc_low_space tunable + * can be set to make sure a fraction of the unused blocks are available for + * writing. + */ +bool +xfs_zoned_need_gc( + struct xfs_mount *mp) +{ + s64 available, free, threshold; + s32 remainder; + + if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_RECLAIMABLE)) + return false; + + available = xfs_estimate_freecounter(mp, XC_FREE_RTAVAILABLE); + + if (available < + mp->m_groups[XG_TYPE_RTG].blocks * + (mp->m_max_open_zones - XFS_OPEN_GC_ZONES)) + return true; + + free = xfs_estimate_freecounter(mp, XC_FREE_RTEXTENTS); + + threshold = div_s64_rem(free, 100, &remainder); + threshold = threshold * mp->m_zonegc_low_space + + remainder * div_s64(mp->m_zonegc_low_space, 100); + + if (available < threshold) + return true; + + return false; +} + +static struct xfs_zone_gc_data * +xfs_zone_gc_data_alloc( + struct xfs_mount *mp) +{ + struct xfs_zone_gc_data *data; + int i; + + data = kzalloc(sizeof(*data), GFP_KERNEL); + if (!data) + return NULL; + data->iter.recs = kcalloc(XFS_ZONE_GC_RECS, sizeof(*data->iter.recs), + GFP_KERNEL); + if (!data->iter.recs) + goto out_free_data; + + /* + * We actually only need a single bio_vec. It would be nice to have + * a flag that only allocates the inline bvecs and not the separate + * bvec pool. 
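+ *
+ * The gc_bio wrapping follows the usual bioset embedding pattern: the bio
+ * is the last member of struct xfs_gc_bio, the bioset is initialized with
+ * that offset, and the wrapper is recovered with container_of(), as in
+ * this excerpt of the allocation done later in this file:
+ *
+ *	bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS,
+ *			&data->bio_set);
+ *	chunk = container_of(bio, struct xfs_gc_bio, bio);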
+ */ + if (bioset_init(&data->bio_set, 16, offsetof(struct xfs_gc_bio, bio), + BIOSET_NEED_BVECS)) + goto out_free_recs; + for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) { + data->scratch[i].folio = + folio_alloc(GFP_KERNEL, get_order(XFS_GC_CHUNK_SIZE)); + if (!data->scratch[i].folio) + goto out_free_scratch; + } + INIT_LIST_HEAD(&data->reading); + INIT_LIST_HEAD(&data->writing); + INIT_LIST_HEAD(&data->resetting); + data->mp = mp; + return data; + +out_free_scratch: + while (--i >= 0) + folio_put(data->scratch[i].folio); + bioset_exit(&data->bio_set); +out_free_recs: + kfree(data->iter.recs); +out_free_data: + kfree(data); + return NULL; +} + +static void +xfs_zone_gc_data_free( + struct xfs_zone_gc_data *data) +{ + int i; + + for (i = 0; i < XFS_ZONE_GC_NR_SCRATCH; i++) + folio_put(data->scratch[i].folio); + bioset_exit(&data->bio_set); + kfree(data->iter.recs); + kfree(data); +} + +static void +xfs_zone_gc_iter_init( + struct xfs_zone_gc_iter *iter, + struct xfs_rtgroup *victim_rtg) + +{ + iter->next_startblock = 0; + iter->rec_count = 0; + iter->rec_idx = 0; + iter->victim_rtg = victim_rtg; +} + +/* + * Query the rmap of the victim zone to gather the records to evacuate. + */ +static int +xfs_zone_gc_query_cb( + struct xfs_btree_cur *cur, + const struct xfs_rmap_irec *irec, + void *private) +{ + struct xfs_zone_gc_iter *iter = private; + + ASSERT(!XFS_RMAP_NON_INODE_OWNER(irec->rm_owner)); + ASSERT(!xfs_is_sb_inum(cur->bc_mp, irec->rm_owner)); + ASSERT(!(irec->rm_flags & (XFS_RMAP_ATTR_FORK | XFS_RMAP_BMBT_BLOCK))); + + iter->recs[iter->rec_count] = *irec; + if (++iter->rec_count == XFS_ZONE_GC_RECS) { + iter->next_startblock = + irec->rm_startblock + irec->rm_blockcount; + return 1; + } + return 0; +} + +#define cmp_int(l, r) ((l > r) - (l < r)) + +static int +xfs_zone_gc_rmap_rec_cmp( + const void *a, + const void *b) +{ + const struct xfs_rmap_irec *reca = a; + const struct xfs_rmap_irec *recb = b; + int diff; + + diff = cmp_int(reca->rm_owner, recb->rm_owner); + if (diff) + return diff; + return cmp_int(reca->rm_offset, recb->rm_offset); +} + +static int +xfs_zone_gc_query( + struct xfs_mount *mp, + struct xfs_zone_gc_iter *iter) +{ + struct xfs_rtgroup *rtg = iter->victim_rtg; + struct xfs_rmap_irec ri_low = { }; + struct xfs_rmap_irec ri_high; + struct xfs_btree_cur *cur; + struct xfs_trans *tp; + int error; + + ASSERT(iter->next_startblock <= rtg_blocks(rtg)); + if (iter->next_startblock == rtg_blocks(rtg)) + goto done; + + ASSERT(iter->next_startblock < rtg_blocks(rtg)); + ri_low.rm_startblock = iter->next_startblock; + memset(&ri_high, 0xFF, sizeof(ri_high)); + + iter->rec_idx = 0; + iter->rec_count = 0; + + error = xfs_trans_alloc_empty(mp, &tp); + if (error) + return error; + + xfs_rtgroup_lock(rtg, XFS_RTGLOCK_RMAP); + cur = xfs_rtrmapbt_init_cursor(tp, rtg); + error = xfs_rmap_query_range(cur, &ri_low, &ri_high, + xfs_zone_gc_query_cb, iter); + xfs_rtgroup_unlock(rtg, XFS_RTGLOCK_RMAP); + xfs_btree_del_cursor(cur, error < 0 ? error : 0); + xfs_trans_cancel(tp); + + if (error < 0) + return error; + + /* + * Sort the rmap records by inode number and increasing offset to + * defragment the mappings. + * + * This could be further enhanced by an even bigger look ahead window, + * but that's better left until we have better detection of changes to + * inode mapping to avoid the potential of GCing already dead data. 
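+ *
+ * The comparator keys on the owner first and the offset second. The same
+ * defragmenting order in userspace C with qsort(3) would look like this
+ * (hypothetical record type, sketch only):
+ *
+ *	static int rec_cmp(const void *a, const void *b)
+ *	{
+ *		const struct rec *ra = a, *rb = b;
+ *
+ *		if (ra->owner != rb->owner)
+ *			return ra->owner < rb->owner ? -1 : 1;
+ *		return (ra->offset > rb->offset) - (ra->offset < rb->offset);
+ *	}
+ *
+ *	qsort(recs, nrecs, sizeof(recs[0]), rec_cmp);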
*/ + sort(iter->recs, iter->rec_count, sizeof(iter->recs[0]), + xfs_zone_gc_rmap_rec_cmp, NULL); + + if (error == 0) { + /* + * We finished iterating through the zone. + */ + iter->next_startblock = rtg_blocks(rtg); + if (iter->rec_count == 0) + goto done; + } + + return 0; +done: + xfs_rtgroup_rele(iter->victim_rtg); + iter->victim_rtg = NULL; + return 0; +} + +static bool +xfs_zone_gc_iter_next( + struct xfs_mount *mp, + struct xfs_zone_gc_iter *iter, + struct xfs_rmap_irec *chunk_rec, + struct xfs_inode **ipp) +{ + struct xfs_rmap_irec *irec; + int error; + + if (!iter->victim_rtg) + return false; + +retry: + if (iter->rec_idx == iter->rec_count) { + error = xfs_zone_gc_query(mp, iter); + if (error) + goto fail; + if (!iter->victim_rtg) + return false; + } + + irec = &iter->recs[iter->rec_idx]; + error = xfs_iget(mp, NULL, irec->rm_owner, + XFS_IGET_UNTRUSTED | XFS_IGET_DONTCACHE, 0, ipp); + if (error) { + /* + * If the inode was already deleted, skip over it. + */ + if (error == -ENOENT) { + iter->rec_idx++; + goto retry; + } + goto fail; + } + + if (!S_ISREG(VFS_I(*ipp)->i_mode) || !XFS_IS_REALTIME_INODE(*ipp)) { + iter->rec_idx++; + xfs_irele(*ipp); + goto retry; + } + + *chunk_rec = *irec; + return true; + +fail: + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + return false; +} + +static void +xfs_zone_gc_iter_advance( + struct xfs_zone_gc_iter *iter, + xfs_extlen_t count_fsb) +{ + struct xfs_rmap_irec *irec = &iter->recs[iter->rec_idx]; + + irec->rm_offset += count_fsb; + irec->rm_startblock += count_fsb; + irec->rm_blockcount -= count_fsb; + if (!irec->rm_blockcount) + iter->rec_idx++; +} + +static struct xfs_rtgroup * +xfs_zone_gc_pick_victim_from( + struct xfs_mount *mp, + uint32_t bucket) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + uint32_t victim_used = U32_MAX; + struct xfs_rtgroup *victim_rtg = NULL; + uint32_t bit; + + if (!zi->zi_used_bucket_entries[bucket]) + return NULL; + + for_each_set_bit(bit, zi->zi_used_bucket_bitmap[bucket], + mp->m_sb.sb_rgcount) { + struct xfs_rtgroup *rtg = xfs_rtgroup_grab(mp, bit); + + if (!rtg) + continue; + + /* skip zones that are just waiting for a reset */ + if (rtg_rmap(rtg)->i_used_blocks == 0 || + rtg_rmap(rtg)->i_used_blocks >= victim_used) { + xfs_rtgroup_rele(rtg); + continue; + } + + if (victim_rtg) + xfs_rtgroup_rele(victim_rtg); + victim_rtg = rtg; + victim_used = rtg_rmap(rtg)->i_used_blocks; + + /* + * Any zone that is less than 1 percent used is fair game for + * instant reclaim. All of these zones are in the first + * bucket, so avoid the expensive division for the zones + * in the other buckets. + */ + if (bucket == 0 && + rtg_rmap(rtg)->i_used_blocks < rtg_blocks(rtg) / 100) + break; + } + + return victim_rtg; +} + +/* + * Iterate through all zones marked as reclaimable and find a candidate to + * reclaim. + */ +static bool +xfs_zone_gc_select_victim( + struct xfs_zone_gc_data *data) +{ + struct xfs_zone_gc_iter *iter = &data->iter; + struct xfs_mount *mp = data->mp; + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_rtgroup *victim_rtg = NULL; + unsigned int bucket; + + if (xfs_is_shutdown(mp)) + return false; + + if (iter->victim_rtg) + return true; + + /* + * Don't start new work if we are asked to stop or park. 
+ */ + if (kthread_should_stop() || kthread_should_park()) + return false; + + if (!xfs_zoned_need_gc(mp)) + return false; + + spin_lock(&zi->zi_used_buckets_lock); + for (bucket = 0; bucket < XFS_ZONE_USED_BUCKETS; bucket++) { + victim_rtg = xfs_zone_gc_pick_victim_from(mp, bucket); + if (victim_rtg) + break; + } + spin_unlock(&zi->zi_used_buckets_lock); + + if (!victim_rtg) + return false; + + trace_xfs_zone_gc_select_victim(victim_rtg, bucket); + xfs_zone_gc_iter_init(iter, victim_rtg); + return true; +} + +static struct xfs_open_zone * +xfs_zone_gc_steal_open( + struct xfs_zone_info *zi) +{ + struct xfs_open_zone *oz, *found = NULL; + + spin_lock(&zi->zi_open_zones_lock); + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) { + if (!found || + oz->oz_write_pointer < found->oz_write_pointer) + found = oz; + } + + if (found) { + found->oz_is_gc = true; + list_del_init(&found->oz_entry); + zi->zi_nr_open_zones--; + } + + spin_unlock(&zi->zi_open_zones_lock); + return found; +} + +static struct xfs_open_zone * +xfs_zone_gc_select_target( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_open_zone *oz = zi->zi_open_gc_zone; + + /* + * We need to wait for pending writes to finish. + */ + if (oz && oz->oz_written < rtg_blocks(oz->oz_rtg)) + return NULL; + + ASSERT(zi->zi_nr_open_zones <= + mp->m_max_open_zones - XFS_OPEN_GC_ZONES); + oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true); + if (oz) + trace_xfs_zone_gc_target_opened(oz->oz_rtg); + spin_lock(&zi->zi_open_zones_lock); + zi->zi_open_gc_zone = oz; + spin_unlock(&zi->zi_open_zones_lock); + return oz; +} + +/* + * Ensure we have a valid open zone to write the GC data to. + * + * If the current target zone has space keep writing to it, else first wait for + * all pending writes and then pick a new one. + */ +static struct xfs_open_zone * +xfs_zone_gc_ensure_target( + struct xfs_mount *mp) +{ + struct xfs_open_zone *oz = mp->m_zone_info->zi_open_gc_zone; + + if (!oz || oz->oz_write_pointer == rtg_blocks(oz->oz_rtg)) + return xfs_zone_gc_select_target(mp); + return oz; +} + +static unsigned int +xfs_zone_gc_scratch_available( + struct xfs_zone_gc_data *data) +{ + return XFS_GC_CHUNK_SIZE - data->scratch[data->scratch_idx].offset; +} + +static bool +xfs_zone_gc_space_available( + struct xfs_zone_gc_data *data) +{ + struct xfs_open_zone *oz; + + oz = xfs_zone_gc_ensure_target(data->mp); + if (!oz) + return false; + return oz->oz_write_pointer < rtg_blocks(oz->oz_rtg) && + xfs_zone_gc_scratch_available(data); +} + +static void +xfs_zone_gc_end_io( + struct bio *bio) +{ + struct xfs_gc_bio *chunk = + container_of(bio, struct xfs_gc_bio, bio); + struct xfs_zone_gc_data *data = chunk->data; + + WRITE_ONCE(chunk->state, XFS_GC_BIO_DONE); + wake_up_process(data->mp->m_zone_info->zi_gc_thread); +} + +static struct xfs_open_zone * +xfs_zone_gc_alloc_blocks( + struct xfs_zone_gc_data *data, + xfs_extlen_t *count_fsb, + xfs_daddr_t *daddr, + bool *is_seq) +{ + struct xfs_mount *mp = data->mp; + struct xfs_open_zone *oz; + + oz = xfs_zone_gc_ensure_target(mp); + if (!oz) + return NULL; + + *count_fsb = min(*count_fsb, + XFS_B_TO_FSB(mp, xfs_zone_gc_scratch_available(data))); + + /* + * Directly allocate GC blocks from the reserved pool. + * + * If we'd take them from the normal pool we could be stealing blocks + * from a regular writer, which would then have to wait for GC and + * deadlock. 
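+ *
+ * Conceptually the clamping below is (hedged pseudo-C over the fields
+ * this function touches):
+ *
+ *	count = min(count, zone_blocks - write_pointer);
+ *	count = min3(count, rtextents_reserved, rtavailable_reserved);
+ *	rtextents_reserved -= count;
+ *	rtavailable_reserved -= count;
+ *
+ * so a GC write can never overrun the target zone or drain either
+ * reserved pool below zero.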
+ */ + spin_lock(&mp->m_sb_lock); + *count_fsb = min(*count_fsb, + rtg_blocks(oz->oz_rtg) - oz->oz_write_pointer); + *count_fsb = min3(*count_fsb, + mp->m_free[XC_FREE_RTEXTENTS].res_avail, + mp->m_free[XC_FREE_RTAVAILABLE].res_avail); + mp->m_free[XC_FREE_RTEXTENTS].res_avail -= *count_fsb; + mp->m_free[XC_FREE_RTAVAILABLE].res_avail -= *count_fsb; + spin_unlock(&mp->m_sb_lock); + + if (!*count_fsb) + return NULL; + + *daddr = xfs_gbno_to_daddr(&oz->oz_rtg->rtg_group, 0); + *is_seq = bdev_zone_is_seq(mp->m_rtdev_targp->bt_bdev, *daddr); + if (!*is_seq) + *daddr += XFS_FSB_TO_BB(mp, oz->oz_write_pointer); + oz->oz_write_pointer += *count_fsb; + atomic_inc(&oz->oz_ref); + return oz; +} + +static bool +xfs_zone_gc_start_chunk( + struct xfs_zone_gc_data *data) +{ + struct xfs_zone_gc_iter *iter = &data->iter; + struct xfs_mount *mp = data->mp; + struct block_device *bdev = mp->m_rtdev_targp->bt_bdev; + struct xfs_open_zone *oz; + struct xfs_rmap_irec irec; + struct xfs_gc_bio *chunk; + struct xfs_inode *ip; + struct bio *bio; + xfs_daddr_t daddr; + bool is_seq; + + if (xfs_is_shutdown(mp)) + return false; + + if (!xfs_zone_gc_iter_next(mp, iter, &irec, &ip)) + return false; + oz = xfs_zone_gc_alloc_blocks(data, &irec.rm_blockcount, &daddr, + &is_seq); + if (!oz) { + xfs_irele(ip); + return false; + } + + bio = bio_alloc_bioset(bdev, 1, REQ_OP_READ, GFP_NOFS, &data->bio_set); + + chunk = container_of(bio, struct xfs_gc_bio, bio); + chunk->ip = ip; + chunk->offset = XFS_FSB_TO_B(mp, irec.rm_offset); + chunk->len = XFS_FSB_TO_B(mp, irec.rm_blockcount); + chunk->old_startblock = + xfs_rgbno_to_rtb(iter->victim_rtg, irec.rm_startblock); + chunk->new_daddr = daddr; + chunk->is_seq = is_seq; + chunk->scratch = &data->scratch[data->scratch_idx]; + chunk->data = data; + chunk->oz = oz; + + bio->bi_iter.bi_sector = xfs_rtb_to_daddr(mp, chunk->old_startblock); + bio->bi_end_io = xfs_zone_gc_end_io; + bio_add_folio_nofail(bio, chunk->scratch->folio, chunk->len, + chunk->scratch->offset); + chunk->scratch->offset += chunk->len; + if (chunk->scratch->offset == XFS_GC_CHUNK_SIZE) { + data->scratch_idx = + (data->scratch_idx + 1) % XFS_ZONE_GC_NR_SCRATCH; + } + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); + list_add_tail(&chunk->entry, &data->reading); + xfs_zone_gc_iter_advance(iter, irec.rm_blockcount); + + submit_bio(bio); + return true; +} + +static void +xfs_zone_gc_free_chunk( + struct xfs_gc_bio *chunk) +{ + list_del(&chunk->entry); + xfs_open_zone_put(chunk->oz); + xfs_irele(chunk->ip); + bio_put(&chunk->bio); +} + +static void +xfs_zone_gc_submit_write( + struct xfs_zone_gc_data *data, + struct xfs_gc_bio *chunk) +{ + if (chunk->is_seq) { + chunk->bio.bi_opf &= ~REQ_OP_WRITE; + chunk->bio.bi_opf |= REQ_OP_ZONE_APPEND; + } + chunk->bio.bi_iter.bi_sector = chunk->new_daddr; + chunk->bio.bi_end_io = xfs_zone_gc_end_io; + submit_bio(&chunk->bio); +} + +static struct xfs_gc_bio * +xfs_zone_gc_split_write( + struct xfs_zone_gc_data *data, + struct xfs_gc_bio *chunk) +{ + struct queue_limits *lim = + &bdev_get_queue(chunk->bio.bi_bdev)->limits; + struct xfs_gc_bio *split_chunk; + int split_sectors; + unsigned int split_len; + struct bio *split; + unsigned int nsegs; + + if (!chunk->is_seq) + return NULL; + + split_sectors = bio_split_rw_at(&chunk->bio, lim, &nsegs, + lim->max_zone_append_sectors << SECTOR_SHIFT); + if (!split_sectors) + return NULL; + + /* ensure the split chunk is still block size aligned */ + split_sectors = ALIGN_DOWN(split_sectors << SECTOR_SHIFT, + data->mp->m_sb.sb_blocksize) >> 
SECTOR_SHIFT; + split_len = split_sectors << SECTOR_SHIFT; + + split = bio_split(&chunk->bio, split_sectors, GFP_NOFS, &data->bio_set); + split_chunk = container_of(split, struct xfs_gc_bio, bio); + split_chunk->data = data; + ihold(VFS_I(chunk->ip)); + split_chunk->ip = chunk->ip; + split_chunk->is_seq = chunk->is_seq; + split_chunk->scratch = chunk->scratch; + split_chunk->offset = chunk->offset; + split_chunk->len = split_len; + split_chunk->old_startblock = chunk->old_startblock; + split_chunk->new_daddr = chunk->new_daddr; + split_chunk->oz = chunk->oz; + atomic_inc(&chunk->oz->oz_ref); + + chunk->offset += split_len; + chunk->len -= split_len; + chunk->old_startblock += XFS_B_TO_FSB(data->mp, split_len); + + /* add right before the original chunk */ + WRITE_ONCE(split_chunk->state, XFS_GC_BIO_NEW); + list_add_tail(&split_chunk->entry, &chunk->entry); + return split_chunk; +} + +static void +xfs_zone_gc_write_chunk( + struct xfs_gc_bio *chunk) +{ + struct xfs_zone_gc_data *data = chunk->data; + struct xfs_mount *mp = chunk->ip->i_mount; + phys_addr_t bvec_paddr = + bvec_phys(bio_first_bvec_all(&chunk->bio)); + struct xfs_gc_bio *split_chunk; + + if (chunk->bio.bi_status) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + if (xfs_is_shutdown(mp)) { + xfs_zone_gc_free_chunk(chunk); + return; + } + + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); + list_move_tail(&chunk->entry, &data->writing); + + bio_reset(&chunk->bio, mp->m_rtdev_targp->bt_bdev, REQ_OP_WRITE); + bio_add_folio_nofail(&chunk->bio, chunk->scratch->folio, chunk->len, + offset_in_folio(chunk->scratch->folio, bvec_paddr)); + + while ((split_chunk = xfs_zone_gc_split_write(data, chunk))) + xfs_zone_gc_submit_write(data, split_chunk); + xfs_zone_gc_submit_write(data, chunk); +} + +static void +xfs_zone_gc_finish_chunk( + struct xfs_gc_bio *chunk) +{ + uint iolock = XFS_IOLOCK_EXCL | XFS_MMAPLOCK_EXCL; + struct xfs_inode *ip = chunk->ip; + struct xfs_mount *mp = ip->i_mount; + int error; + + if (chunk->bio.bi_status) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + if (xfs_is_shutdown(mp)) { + xfs_zone_gc_free_chunk(chunk); + return; + } + + chunk->scratch->freed += chunk->len; + if (chunk->scratch->freed == chunk->scratch->offset) { + chunk->scratch->offset = 0; + chunk->scratch->freed = 0; + } + + /* + * Cycle through the iolock and wait for direct I/O and layouts to + * ensure no one is reading from the old mapping before it goes away. + * + * Note that xfs_zoned_end_io() below checks that no other writer raced + * with us to update the mapping by checking that the old startblock + * didn't change. 
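+ *
+ * In outline (a hedged sketch, not the actual remap helpers), the
+ * completion side only remaps when the extent still points at the blocks
+ * that were read:
+ *
+ *	if (current_startblock(ip, offset) != chunk->old_startblock)
+ *		return 0;	/* raced with a new write, discard GC copy */
+ *	remap_to(ip, offset, chunk->new_daddr);
+ *
+ * so a concurrent overwrite simply wins and the speculative GC data is
+ * thrown away.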
+ */ + xfs_ilock(ip, iolock); + error = xfs_break_layouts(VFS_I(ip), &iolock, BREAK_UNMAP); + if (!error) + inode_dio_wait(VFS_I(ip)); + xfs_iunlock(ip, iolock); + if (error) + goto free; + + if (chunk->is_seq) + chunk->new_daddr = chunk->bio.bi_iter.bi_sector; + error = xfs_zoned_end_io(ip, chunk->offset, chunk->len, + chunk->new_daddr, chunk->oz, chunk->old_startblock); +free: + if (error) + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + xfs_zone_gc_free_chunk(chunk); +} + +static void +xfs_zone_gc_finish_reset( + struct xfs_gc_bio *chunk) +{ + struct xfs_rtgroup *rtg = chunk->bio.bi_private; + struct xfs_mount *mp = rtg_mount(rtg); + struct xfs_zone_info *zi = mp->m_zone_info; + + if (chunk->bio.bi_status) { + xfs_force_shutdown(mp, SHUTDOWN_META_IO_ERROR); + goto out; + } + + xfs_group_set_mark(&rtg->rtg_group, XFS_RTG_FREE); + atomic_inc(&zi->zi_nr_free_zones); + + xfs_zoned_add_available(mp, rtg_blocks(rtg)); + + wake_up_all(&zi->zi_zone_wait); +out: + list_del(&chunk->entry); + bio_put(&chunk->bio); +} + +static bool +xfs_zone_gc_prepare_reset( + struct bio *bio, + struct xfs_rtgroup *rtg) +{ + trace_xfs_zone_reset(rtg); + + ASSERT(rtg_rmap(rtg)->i_used_blocks == 0); + bio->bi_iter.bi_sector = xfs_gbno_to_daddr(&rtg->rtg_group, 0); + if (!bdev_zone_is_seq(bio->bi_bdev, bio->bi_iter.bi_sector)) { + if (!bdev_max_discard_sectors(bio->bi_bdev)) + return false; + bio->bi_opf = REQ_OP_DISCARD | REQ_SYNC; + bio->bi_iter.bi_size = + XFS_FSB_TO_B(rtg_mount(rtg), rtg_blocks(rtg)); + } + + return true; +} + +int +xfs_zone_gc_reset_sync( + struct xfs_rtgroup *rtg) +{ + int error = 0; + struct bio bio; + + bio_init(&bio, rtg_mount(rtg)->m_rtdev_targp->bt_bdev, NULL, 0, + REQ_OP_ZONE_RESET); + if (xfs_zone_gc_prepare_reset(&bio, rtg)) + error = submit_bio_wait(&bio); + bio_uninit(&bio); + + return error; +} + +static void +xfs_zone_gc_reset_zones( + struct xfs_zone_gc_data *data, + struct xfs_group *reset_list) +{ + struct xfs_group *next = reset_list; + + if (blkdev_issue_flush(data->mp->m_rtdev_targp->bt_bdev) < 0) { + xfs_force_shutdown(data->mp, SHUTDOWN_META_IO_ERROR); + return; + } + + do { + struct xfs_rtgroup *rtg = to_rtg(next); + struct xfs_gc_bio *chunk; + struct bio *bio; + + xfs_log_force_inode(rtg_rmap(rtg)); + + next = rtg_group(rtg)->xg_next_reset; + rtg_group(rtg)->xg_next_reset = NULL; + + bio = bio_alloc_bioset(rtg_mount(rtg)->m_rtdev_targp->bt_bdev, + 0, REQ_OP_ZONE_RESET, GFP_NOFS, &data->bio_set); + bio->bi_private = rtg; + bio->bi_end_io = xfs_zone_gc_end_io; + + chunk = container_of(bio, struct xfs_gc_bio, bio); + chunk->data = data; + WRITE_ONCE(chunk->state, XFS_GC_BIO_NEW); + list_add_tail(&chunk->entry, &data->resetting); + + /* + * Also use the bio to drive the state machine when neither + * zone reset nor discard is supported to keep things simple. + */ + if (xfs_zone_gc_prepare_reset(bio, rtg)) + submit_bio(bio); + else + bio_endio(bio); + } while (next); +} + +/* + * Handle the work to read and write data for GC and to reset the zones, + * including handling all completions. + * + * Note that the order of the chunks is preserved so that we don't undo the + * optimal order established by xfs_zone_gc_query(). 
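+ *
+ * Each list below is drained with the same in-order pattern, stopping at
+ * the first chunk that has not completed yet (sketch, with finish() as a
+ * placeholder for the per-list handler):
+ *
+ *	list_for_each_entry_safe(chunk, next, &list, entry) {
+ *		if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE)
+ *			break;
+ *		finish(chunk);
+ *	}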
+ */ +static bool +xfs_zone_gc_handle_work( + struct xfs_zone_gc_data *data) +{ + struct xfs_zone_info *zi = data->mp->m_zone_info; + struct xfs_gc_bio *chunk, *next; + struct xfs_group *reset_list; + struct blk_plug plug; + + spin_lock(&zi->zi_reset_list_lock); + reset_list = zi->zi_reset_list; + zi->zi_reset_list = NULL; + spin_unlock(&zi->zi_reset_list_lock); + + if (!xfs_zone_gc_select_victim(data) || + !xfs_zone_gc_space_available(data)) { + if (list_empty(&data->reading) && + list_empty(&data->writing) && + list_empty(&data->resetting) && + !reset_list) + return false; + } + + __set_current_state(TASK_RUNNING); + try_to_freeze(); + + if (reset_list) + xfs_zone_gc_reset_zones(data, reset_list); + + list_for_each_entry_safe(chunk, next, &data->resetting, entry) { + if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) + break; + xfs_zone_gc_finish_reset(chunk); + } + + list_for_each_entry_safe(chunk, next, &data->writing, entry) { + if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) + break; + xfs_zone_gc_finish_chunk(chunk); + } + + blk_start_plug(&plug); + list_for_each_entry_safe(chunk, next, &data->reading, entry) { + if (READ_ONCE(chunk->state) != XFS_GC_BIO_DONE) + break; + xfs_zone_gc_write_chunk(chunk); + } + blk_finish_plug(&plug); + + blk_start_plug(&plug); + while (xfs_zone_gc_start_chunk(data)) + ; + blk_finish_plug(&plug); + return true; +} + +/* + * Note that the current GC algorithm would break reflinks and thus duplicate + * data that was shared by multiple owners before. Because of that reflinks + * are currently not supported on zoned file systems and can't be created or + * mounted. + */ +static int +xfs_zoned_gcd( + void *private) +{ + struct xfs_zone_gc_data *data = private; + struct xfs_mount *mp = data->mp; + struct xfs_zone_info *zi = mp->m_zone_info; + unsigned int nofs_flag; + + nofs_flag = memalloc_nofs_save(); + set_freezable(); + + for (;;) { + set_current_state(TASK_INTERRUPTIBLE | TASK_FREEZABLE); + xfs_set_zonegc_running(mp); + if (xfs_zone_gc_handle_work(data)) + continue; + + if (list_empty(&data->reading) && + list_empty(&data->writing) && + list_empty(&data->resetting) && + !zi->zi_reset_list) { + xfs_clear_zonegc_running(mp); + xfs_zoned_resv_wake_all(mp); + + if (kthread_should_stop()) { + __set_current_state(TASK_RUNNING); + break; + } + + if (kthread_should_park()) { + __set_current_state(TASK_RUNNING); + kthread_parkme(); + continue; + } + } + + schedule(); + } + xfs_clear_zonegc_running(mp); + + if (data->iter.victim_rtg) + xfs_rtgroup_rele(data->iter.victim_rtg); + + memalloc_nofs_restore(nofs_flag); + xfs_zone_gc_data_free(data); + return 0; +} + +void +xfs_zone_gc_start( + struct xfs_mount *mp) +{ + if (xfs_has_zoned(mp)) + kthread_unpark(mp->m_zone_info->zi_gc_thread); +} + +void +xfs_zone_gc_stop( + struct xfs_mount *mp) +{ + if (xfs_has_zoned(mp)) + kthread_park(mp->m_zone_info->zi_gc_thread); +} + +int +xfs_zone_gc_mount( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_zone_gc_data *data; + struct xfs_open_zone *oz; + int error; + + /* + * If there are no free zones available for GC, pick the open zone with + * the least used space to GC into. This should only happen after an + * unclean shutdown near ENOSPC while GC was ongoing. + * + * We also need to do this for the first gc zone allocation if we + * unmounted while at the open limit. 
+ */ + if (!xfs_group_marked(mp, XG_TYPE_RTG, XFS_RTG_FREE) || + zi->zi_nr_open_zones == mp->m_max_open_zones) + oz = xfs_zone_gc_steal_open(zi); + else + oz = xfs_open_zone(mp, WRITE_LIFE_NOT_SET, true); + if (!oz) { + xfs_warn(mp, "unable to allocate a zone for gc"); + error = -EIO; + goto out; + } + + trace_xfs_zone_gc_target_opened(oz->oz_rtg); + zi->zi_open_gc_zone = oz; + + data = xfs_zone_gc_data_alloc(mp); + if (!data) { + error = -ENOMEM; + goto out_put_gc_zone; + } + + mp->m_zone_info->zi_gc_thread = kthread_create(xfs_zoned_gcd, data, + "xfs-zone-gc/%s", mp->m_super->s_id); + if (IS_ERR(mp->m_zone_info->zi_gc_thread)) { + xfs_warn(mp, "unable to create zone gc thread"); + error = PTR_ERR(mp->m_zone_info->zi_gc_thread); + goto out_free_gc_data; + } + + /* xfs_zone_gc_start will unpark for rw mounts */ + kthread_park(mp->m_zone_info->zi_gc_thread); + return 0; + +out_free_gc_data: + kfree(data); +out_put_gc_zone: + xfs_open_zone_put(zi->zi_open_gc_zone); +out: + return error; +} + +void +xfs_zone_gc_unmount( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + + kthread_stop(zi->zi_gc_thread); + if (zi->zi_open_gc_zone) + xfs_open_zone_put(zi->zi_open_gc_zone); +} diff --git a/fs/xfs/xfs_zone_info.c b/fs/xfs/xfs_zone_info.c new file mode 100644 index 000000000000..733bcc2f8645 --- /dev/null +++ b/fs/xfs/xfs_zone_info.c @@ -0,0 +1,105 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_rtgroup.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" + +static const char xfs_write_hint_shorthand[6][16] = { + "NOT_SET", "NONE", "SHORT", "MEDIUM", "LONG", "EXTREME"}; + +static inline const char * +xfs_write_hint_to_str( + uint8_t write_hint) +{ + if (write_hint > WRITE_LIFE_EXTREME) + return "UNKNOWN"; + return xfs_write_hint_shorthand[write_hint]; +} + +static void +xfs_show_open_zone( + struct seq_file *m, + struct xfs_open_zone *oz) +{ + seq_printf(m, "\t zone %d, wp %u, written %u, used %u, hint %s\n", + rtg_rgno(oz->oz_rtg), + oz->oz_write_pointer, oz->oz_written, + rtg_rmap(oz->oz_rtg)->i_used_blocks, + xfs_write_hint_to_str(oz->oz_write_hint)); +} + +static void +xfs_show_full_zone_used_distribution( + struct seq_file *m, + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + unsigned int reclaimable = 0, full, i; + + spin_lock(&zi->zi_used_buckets_lock); + for (i = 0; i < XFS_ZONE_USED_BUCKETS; i++) { + unsigned int entries = zi->zi_used_bucket_entries[i]; + + seq_printf(m, "\t %2u..%2u%%: %u\n", + i * (100 / XFS_ZONE_USED_BUCKETS), + (i + 1) * (100 / XFS_ZONE_USED_BUCKETS) - 1, + entries); + reclaimable += entries; + } + spin_unlock(&zi->zi_used_buckets_lock); + + full = mp->m_sb.sb_rgcount; + if (zi->zi_open_gc_zone) + full--; + full -= zi->zi_nr_open_zones; + full -= atomic_read(&zi->zi_nr_free_zones); + full -= reclaimable; + + seq_printf(m, "\t 100%%: %u\n", full); +} + +void +xfs_zoned_show_stats( + struct seq_file *m, + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_open_zone *oz; + + seq_puts(m, "\n"); + + seq_printf(m, "\tuser free RT blocks: %lld\n", + xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS)); + seq_printf(m, "\treserved free RT blocks: %lld\n", + mp->m_free[XC_FREE_RTEXTENTS].res_avail); + seq_printf(m, 
"\tuser available RT blocks: %lld\n", + xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE)); + seq_printf(m, "\treserved available RT blocks: %lld\n", + mp->m_free[XC_FREE_RTAVAILABLE].res_avail); + seq_printf(m, "\tRT reservations required: %d\n", + !list_empty_careful(&zi->zi_reclaim_reservations)); + seq_printf(m, "\tRT GC required: %d\n", + xfs_zoned_need_gc(mp)); + + seq_printf(m, "\tfree zones: %d\n", atomic_read(&zi->zi_nr_free_zones)); + seq_puts(m, "\topen zones:\n"); + spin_lock(&zi->zi_open_zones_lock); + list_for_each_entry(oz, &zi->zi_open_zones, oz_entry) + xfs_show_open_zone(m, oz); + if (zi->zi_open_gc_zone) { + seq_puts(m, "\topen gc zone:\n"); + xfs_show_open_zone(m, zi->zi_open_gc_zone); + } + spin_unlock(&zi->zi_open_zones_lock); + seq_puts(m, "\tused blocks distribution (fully written zones):\n"); + xfs_show_full_zone_used_distribution(m, mp); +} diff --git a/fs/xfs/xfs_zone_priv.h b/fs/xfs/xfs_zone_priv.h new file mode 100644 index 000000000000..ab696975a993 --- /dev/null +++ b/fs/xfs/xfs_zone_priv.h @@ -0,0 +1,119 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _XFS_ZONE_PRIV_H +#define _XFS_ZONE_PRIV_H + +struct xfs_open_zone { + /* + * Entry in the open zone list and refcount. Protected by + * zi_open_zones_lock in struct xfs_zone_info. + */ + struct list_head oz_entry; + atomic_t oz_ref; + + /* + * oz_write_pointer is the write pointer at which space is handed out + * for conventional zones, or simple the count of blocks handed out + * so far for sequential write required zones and is protected by + * oz_alloc_lock/ + */ + spinlock_t oz_alloc_lock; + xfs_rgblock_t oz_write_pointer; + + /* + * oz_written is the number of blocks for which we've received a + * write completion. oz_written must always be <= oz_write_pointer + * and is protected by the ILOCK of the rmap inode. + */ + xfs_rgblock_t oz_written; + + /* + * Write hint (data temperature) assigned to this zone, or + * WRITE_LIFE_NOT_SET if none was set. + */ + enum rw_hint oz_write_hint; + + /* + * Is this open zone used for garbage collection? There can only be a + * single open GC zone, which is pointed to by zi_open_gc_zone in + * struct xfs_zone_info. Constant over the life time of an open zone. + */ + bool oz_is_gc; + + /* + * Pointer to the RT groups structure for this open zone. Constant over + * the life time of an open zone. + */ + struct xfs_rtgroup *oz_rtg; +}; + +/* + * Number of bitmap buckets to track reclaimable zones. There are 10 buckets + * so that each 10% of the usable capacity get their own bucket and GC can + * only has to walk the bitmaps of the lesser used zones if there are any. + */ +#define XFS_ZONE_USED_BUCKETS 10u + +struct xfs_zone_info { + /* + * List of pending space reservations: + */ + spinlock_t zi_reservation_lock; + struct list_head zi_reclaim_reservations; + + /* + * List and number of open zones: + */ + spinlock_t zi_open_zones_lock; + struct list_head zi_open_zones; + unsigned int zi_nr_open_zones; + + /* + * Free zone search cursor and number of free zones: + */ + unsigned long zi_free_zone_cursor; + atomic_t zi_nr_free_zones; + + /* + * Wait queue to wait for free zones or open zone resources to become + * available: + */ + wait_queue_head_t zi_zone_wait; + + /* + * Pointer to the GC thread, and the current open zone used by GC + * (if any). + * + * zi_open_gc_zone is mostly private to the GC thread, but can be read + * for debugging from other threads, in which case zi_open_zones_lock + * must be taken to access it. 
*/ + struct task_struct *zi_gc_thread; + struct xfs_open_zone *zi_open_gc_zone; + + /* + * List of zones that need a reset: + */ + spinlock_t zi_reset_list_lock; + struct xfs_group *zi_reset_list; + + /* + * A set of bitmaps to bucket-sort reclaimable zones by used blocks to help + * garbage collection quickly find the best candidates for reclaim. + */ + spinlock_t zi_used_buckets_lock; + unsigned int zi_used_bucket_entries[XFS_ZONE_USED_BUCKETS]; + unsigned long *zi_used_bucket_bitmap[XFS_ZONE_USED_BUCKETS]; +}; + +struct xfs_open_zone *xfs_open_zone(struct xfs_mount *mp, + enum rw_hint write_hint, bool is_gc); + +int xfs_zone_gc_reset_sync(struct xfs_rtgroup *rtg); +bool xfs_zoned_need_gc(struct xfs_mount *mp); +int xfs_zone_gc_mount(struct xfs_mount *mp); +void xfs_zone_gc_unmount(struct xfs_mount *mp); + +void xfs_zoned_resv_wake_all(struct xfs_mount *mp); + +#endif /* _XFS_ZONE_PRIV_H */ diff --git a/fs/xfs/xfs_zone_space_resv.c b/fs/xfs/xfs_zone_space_resv.c new file mode 100644 index 000000000000..93c9a7721139 --- /dev/null +++ b/fs/xfs/xfs_zone_space_resv.c @@ -0,0 +1,263 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Copyright (c) 2023-2025 Christoph Hellwig. + * Copyright (c) 2024-2025, Western Digital Corporation or its affiliates. + */ +#include "xfs.h" +#include "xfs_shared.h" +#include "xfs_format.h" +#include "xfs_trans_resv.h" +#include "xfs_mount.h" +#include "xfs_inode.h" +#include "xfs_rtbitmap.h" +#include "xfs_zone_alloc.h" +#include "xfs_zone_priv.h" +#include "xfs_zones.h" + +/* + * Note: the zoned allocator does not support a rtextsize > 1, so this code and + * the allocator itself uses file system blocks interchangeably with realtime + * extents without doing the otherwise required conversions. + */ + +/* + * Per-task space reservation. + * + * Tasks that need to wait for GC to free up space allocate one of these + * on the stack and add it to the per-mount zi_reclaim_reservations list. + * The GC thread will then wake the tasks in order when space becomes available. + */ +struct xfs_zone_reservation { + struct list_head entry; + struct task_struct *task; + xfs_filblks_t count_fsb; +}; + +/* + * Calculate the number of reserved blocks. + * + * XC_FREE_RTEXTENTS counts the user-available capacity up to which the file + * system can be filled, while XC_FREE_RTAVAILABLE counts the blocks instantly + * available for writes without waiting for GC. + * + * For XC_FREE_RTAVAILABLE only the smaller reservation required for GC and + * block zeroing is excluded from the user capacity, while XC_FREE_RTEXTENTS + * is further restricted by at least one zone as well as the optional + * persistently reserved blocks. This allows the allocator to run more + * smoothly by not always triggering GC. 
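+ *
+ * A worked example with illustrative numbers (the real values of
+ * XFS_RESERVED_ZONES and XFS_GC_ZONES may differ): for zones of 65536
+ * blocks, XFS_RESERVED_ZONES = 2, XFS_GC_ZONES = 1 and sb_rtreserved = 0,
+ * xfs_zoned_default_resblks() returns
+ *
+ *	XC_FREE_RTEXTENTS:   2 * 65536 + 0 = 131072 blocks
+ *	XC_FREE_RTAVAILABLE: 1 * 65536     =  65536 blocks
+ *
+ * i.e. the user-visible capacity excludes more space than the instantly
+ * writable pool does.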
*/ +uint64_t +xfs_zoned_default_resblks( + struct xfs_mount *mp, + enum xfs_free_counter ctr) +{ + switch (ctr) { + case XC_FREE_RTEXTENTS: + return (uint64_t)XFS_RESERVED_ZONES * + mp->m_groups[XG_TYPE_RTG].blocks + + mp->m_sb.sb_rtreserved; + case XC_FREE_RTAVAILABLE: + return (uint64_t)XFS_GC_ZONES * + mp->m_groups[XG_TYPE_RTG].blocks; + default: + ASSERT(0); + return 0; + } +} + +void +xfs_zoned_resv_wake_all( + struct xfs_mount *mp) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_zone_reservation *reservation; + + spin_lock(&zi->zi_reservation_lock); + list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) + wake_up_process(reservation->task); + spin_unlock(&zi->zi_reservation_lock); +} + +void +xfs_zoned_add_available( + struct xfs_mount *mp, + xfs_filblks_t count_fsb) +{ + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_zone_reservation *reservation; + + if (list_empty_careful(&zi->zi_reclaim_reservations)) { + xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb); + return; + } + + spin_lock(&zi->zi_reservation_lock); + xfs_add_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb); + count_fsb = xfs_sum_freecounter(mp, XC_FREE_RTAVAILABLE); + list_for_each_entry(reservation, &zi->zi_reclaim_reservations, entry) { + if (reservation->count_fsb > count_fsb) + break; + wake_up_process(reservation->task); + count_fsb -= reservation->count_fsb; + } + spin_unlock(&zi->zi_reservation_lock); +} + +static int +xfs_zoned_space_wait_error( + struct xfs_mount *mp) +{ + if (xfs_is_shutdown(mp)) + return -EIO; + if (fatal_signal_pending(current)) + return -EINTR; + return 0; +} + +static int +xfs_zoned_reserve_available( + struct xfs_inode *ip, + xfs_filblks_t count_fsb, + unsigned int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_zone_info *zi = mp->m_zone_info; + struct xfs_zone_reservation reservation = { + .task = current, + .count_fsb = count_fsb, + }; + int error; + + /* + * If there are no waiters, try to directly grab the available blocks + * from the percpu counter. + * + * If the caller wants to dip into the reserved pool, also bypass the + * wait list. This relies on the fact that we have a very generously + * sized reserved pool that always has enough space. If the reserved + * allocations fail we're in trouble. + */ + if (likely(list_empty_careful(&zi->zi_reclaim_reservations) || + (flags & XFS_ZR_RESERVED))) { + error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb, + flags & XFS_ZR_RESERVED); + if (error != -ENOSPC) + return error; + } + + if (flags & XFS_ZR_NOWAIT) + return -EAGAIN; + + spin_lock(&zi->zi_reservation_lock); + list_add_tail(&reservation.entry, &zi->zi_reclaim_reservations); + while ((error = xfs_zoned_space_wait_error(mp)) == 0) { + set_current_state(TASK_KILLABLE); + + error = xfs_dec_freecounter(mp, XC_FREE_RTAVAILABLE, count_fsb, + flags & XFS_ZR_RESERVED); + if (error != -ENOSPC) + break; + + /* + * Make sure to start GC if it is not running already. As we + * check the rtavailable count when filling up zones, GC is + * normally already running at this point, but in some setups + * with very few zones we may completely run out of + * non-reserved blocks in between filling zones. + */ + if (!xfs_is_zonegc_running(mp)) + wake_up_process(zi->zi_gc_thread); + + /* + * If there is no reclaimable group left and we aren't still + * processing a pending GC request, give up as we're fully out + * of space. 
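+ *
+ * The surrounding wait loop follows the classic pattern (hedged outline
+ * of the code around this comment, not literal C):
+ *
+ *	add reservation to zi_reclaim_reservations;
+ *	for (;;) {
+ *		set_current_state(TASK_KILLABLE);
+ *		if (xfs_dec_freecounter(...) != -ENOSPC)
+ *			break;
+ *		if (!gc_running)
+ *			wake GC thread;
+ *		if (!reclaimable_zones && !gc_running)
+ *			break;		/* truly out of space */
+ *		schedule();
+ *	}
+ *	remove reservation; __set_current_state(TASK_RUNNING);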
+ +/* + * Implement greedy space allocation for short writes by trying to grab all + * that is left after locking out other threads from trying to do the same. + * + * This isn't exactly optimal and can hopefully be replaced by a proper + * percpu_counter primitive one day. + */ +static int +xfs_zoned_reserve_extents_greedy( + struct xfs_inode *ip, + xfs_filblks_t *count_fsb, + unsigned int flags) +{ + struct xfs_mount *mp = ip->i_mount; + struct xfs_zone_info *zi = mp->m_zone_info; + s64 len = *count_fsb; + int error = -ENOSPC; + + spin_lock(&zi->zi_reservation_lock); + len = min(len, xfs_sum_freecounter(mp, XC_FREE_RTEXTENTS)); + if (len > 0) { + *count_fsb = len; + error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, *count_fsb, + flags & XFS_ZR_RESERVED); + } + spin_unlock(&zi->zi_reservation_lock); + return error; +} + +int +xfs_zoned_space_reserve( + struct xfs_inode *ip, + xfs_filblks_t count_fsb, + unsigned int flags, + struct xfs_zone_alloc_ctx *ac) +{ + struct xfs_mount *mp = ip->i_mount; + int error; + + ASSERT(ac->reserved_blocks == 0); + ASSERT(ac->open_zone == NULL); + + error = xfs_dec_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb, + flags & XFS_ZR_RESERVED); + if (error == -ENOSPC && (flags & XFS_ZR_GREEDY) && count_fsb > 1) + error = xfs_zoned_reserve_extents_greedy(ip, &count_fsb, flags); + if (error) + return error; + + error = xfs_zoned_reserve_available(ip, count_fsb, flags); + if (error) { + xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, count_fsb); + return error; + } + ac->reserved_blocks = count_fsb; + return 0; +} + +void +xfs_zoned_space_unreserve( + struct xfs_inode *ip, + struct xfs_zone_alloc_ctx *ac) +{ + if (ac->reserved_blocks > 0) { + struct xfs_mount *mp = ip->i_mount; + + xfs_zoned_add_available(mp, ac->reserved_blocks); + xfs_add_freecounter(mp, XC_FREE_RTEXTENTS, ac->reserved_blocks); + } + if (ac->open_zone) + xfs_open_zone_put(ac->open_zone); +}
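
xfs_zoned_space_reserve() only falls back to the greedy helper for short writes (XFS_ZR_GREEDY with more than one block), where a partial reservation is still useful. A minimal userspace sketch of that try-full-then-clamp flow, with hypothetical names and a plain mutex standing in for the percpu counter:

/* Sketch of the greedy fallback: try the full request first; on -ENOSPC,
 * clamp the request to whatever is left under the lock and take all of it. */
#include <errno.h>
#include <pthread.h>
#include <stdbool.h>
#include <stdint.h>

static pthread_mutex_t lock = PTHREAD_MUTEX_INITIALIZER;
static int64_t free_blocks = 100;

static int dec_free(int64_t count)
{
	int ret = -ENOSPC;

	pthread_mutex_lock(&lock);
	if (free_blocks >= count) {
		free_blocks -= count;
		ret = 0;
	}
	pthread_mutex_unlock(&lock);
	return ret;
}

/* Like xfs_zoned_reserve_extents_greedy(): shrink *count to fit. */
static int dec_free_greedy(int64_t *count)
{
	int ret = -ENOSPC;

	pthread_mutex_lock(&lock);
	if (free_blocks > 0) {
		if (*count > free_blocks)
			*count = free_blocks;
		free_blocks -= *count;
		ret = 0;
	}
	pthread_mutex_unlock(&lock);
	return ret;
}

int reserve(int64_t count, bool greedy)
{
	int ret = dec_free(count);

	if (ret == -ENOSPC && greedy && count > 1)
		ret = dec_free_greedy(&count);
	return ret;		/* caller must remember the clamped count */
}
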
diff --git a/fs/zonefs/file.c b/fs/zonefs/file.c index 35166c92420c..42e2c0065bb3 100644 --- a/fs/zonefs/file.c +++ b/fs/zonefs/file.c @@ -299,7 +299,7 @@ static vm_fault_t zonefs_filemap_page_mkwrite(struct vm_fault *vmf) /* Serialize against truncates */ filemap_invalidate_lock_shared(inode->i_mapping); - ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops); + ret = iomap_page_mkwrite(vmf, &zonefs_write_iomap_ops, NULL); filemap_invalidate_unlock_shared(inode->i_mapping); sb_end_pagefault(inode->i_sb); diff --git a/fs/zonefs/super.c b/fs/zonefs/super.c index faf1eb87895d..d165eb979f21 100644 --- a/fs/zonefs/super.c +++ b/fs/zonefs/super.c @@ -1111,28 +1111,19 @@ static int zonefs_read_super(struct super_block *sb) struct zonefs_sb_info *sbi = ZONEFS_SB(sb); struct zonefs_super *super; u32 crc, stored_crc; - struct page *page; - struct bio_vec bio_vec; - struct bio bio; int ret; - page = alloc_page(GFP_KERNEL); - if (!page) + super = kmalloc(PAGE_SIZE, GFP_KERNEL); + if (!super) return -ENOMEM; - bio_init(&bio, sb->s_bdev, &bio_vec, 1, REQ_OP_READ); - bio.bi_iter.bi_sector = 0; - __bio_add_page(&bio, page, PAGE_SIZE, 0); - - ret = submit_bio_wait(&bio); + ret = bdev_rw_virt(sb->s_bdev, 0, super, PAGE_SIZE, REQ_OP_READ); if (ret) - goto free_page; - - super = page_address(page); + goto free_super; ret = -EINVAL; if (le32_to_cpu(super->s_magic) != ZONEFS_MAGIC) - goto free_page; + goto free_super; stored_crc = le32_to_cpu(super->s_crc); super->s_crc = 0; @@ -1140,14 +1131,14 @@ static int zonefs_read_super(struct super_block *sb) if (crc != stored_crc) { zonefs_err(sb, "Invalid checksum (Expected 0x%08x, got 0x%08x)", crc, stored_crc); - goto free_page; + goto free_super; } sbi->s_features = le64_to_cpu(super->s_features); if (sbi->s_features & ~ZONEFS_F_DEFINED_FEATURES) { zonefs_err(sb, "Unknown features set 0x%llx\n", sbi->s_features); - goto free_page; + goto free_super; } if (sbi->s_features & ZONEFS_F_UID) { @@ -1155,7 +1146,7 @@ static int zonefs_read_super(struct super_block *sb) le32_to_cpu(super->s_uid)); if (!uid_valid(sbi->s_uid)) { zonefs_err(sb, "Invalid UID feature\n"); - goto free_page; + goto free_super; } } @@ -1164,7 +1155,7 @@ static int zonefs_read_super(struct super_block *sb) le32_to_cpu(super->s_gid)); if (!gid_valid(sbi->s_gid)) { zonefs_err(sb, "Invalid GID feature\n"); - goto free_page; + goto free_super; } } @@ -1173,15 +1164,14 @@ static int zonefs_read_super(struct super_block *sb) if (memchr_inv(super->s_reserved, 0, sizeof(super->s_reserved))) { zonefs_err(sb, "Reserved area is being used\n"); - goto free_page; + goto free_super; } import_uuid(&sbi->s_uuid, super->s_uuid); ret = 0; -free_page: - __free_page(page); - +free_super: + kfree(super); return ret; }
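
The zonefs hunk above drops the hand-rolled single-page bio in favour of kmalloc() plus the bdev_rw_virt() helper, which issues synchronous block I/O against a kernel-virtual buffer. A userspace sketch of the same read-validate-unwind shape; the struct layout is illustrative only (the real zonefs superblock carries many more fields, and the kernel also verifies a crc32 over the whole block):

/* Read the first 4 KiB of a device into a heap buffer and sanity-check it,
 * mirroring the reworked zonefs_read_super() control flow. */
#include <fcntl.h>
#include <stdint.h>
#include <stdlib.h>
#include <unistd.h>

#define ZONEFS_MAGIC 0x5a4f4653u	/* "ZOFS" */
#define SB_SIZE 4096

struct zonefs_super_hdr {		/* first fields only, illustrative */
	uint32_t s_magic;
	uint32_t s_crc;
};

static int read_super(const char *dev)
{
	struct zonefs_super_hdr *super;
	int fd, ret = -1;

	super = malloc(SB_SIZE);
	if (!super)
		return -1;

	fd = open(dev, O_RDONLY);
	if (fd < 0)
		goto free_super;	/* same unwind shape as the kernel code */
	if (pread(fd, super, SB_SIZE, 0) != SB_SIZE)
		goto close_fd;
	if (super->s_magic != ZONEFS_MAGIC)	/* endianness ignored here */
		goto close_fd;
	ret = 0;
close_fd:
	close(fd);
free_super:
	free(super);
	return ret;
}
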