Diffstat (limited to 'fs')
328 files changed, 16185 insertions, 6431 deletions
diff --git a/fs/9p/vfs_addr.c b/fs/9p/vfs_addr.c index 1286d96a29bc..862164181bac 100644 --- a/fs/9p/vfs_addr.c +++ b/fs/9p/vfs_addr.c @@ -59,7 +59,7 @@ static void v9fs_issue_write(struct netfs_io_subrequest *subreq) len = p9_client_write(fid, subreq->start, &subreq->io_iter, &err); if (len > 0) __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); - netfs_write_subrequest_terminated(subreq, len ?: err, false); + netfs_write_subrequest_terminated(subreq, len ?: err); } /** @@ -77,7 +77,8 @@ static void v9fs_issue_read(struct netfs_io_subrequest *subreq) /* if we just extended the file size, any portion not in * cache won't be on server and is zeroes */ - if (subreq->rreq->origin != NETFS_DIO_READ) + if (subreq->rreq->origin != NETFS_UNBUFFERED_READ && + subreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); if (pos + total >= i_size_read(rreq->inode)) __set_bit(NETFS_SREQ_HIT_EOF, &subreq->flags); diff --git a/fs/Kconfig b/fs/Kconfig index 5b4847bd2fbb..44b6cdd36dc1 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -335,6 +335,7 @@ source "fs/omfs/Kconfig" source "fs/hpfs/Kconfig" source "fs/qnx4/Kconfig" source "fs/qnx6/Kconfig" +source "fs/resctrl/Kconfig" source "fs/romfs/Kconfig" source "fs/pstore/Kconfig" source "fs/ufs/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index 77fd7f7b5d02..79c08b914c47 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -128,3 +128,4 @@ obj-$(CONFIG_EROFS_FS) += erofs/ obj-$(CONFIG_VBOXSF_FS) += vboxsf/ obj-$(CONFIG_ZONEFS_FS) += zonefs/ obj-$(CONFIG_BPF_LSM) += bpf_fs_kfuncs.o +obj-$(CONFIG_RESCTRL_FS) += resctrl/ diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig index fc8ba9142f2f..682bd8ec2c10 100644 --- a/fs/afs/Kconfig +++ b/fs/afs/Kconfig @@ -5,6 +5,7 @@ config AFS_FS select AF_RXRPC select DNS_RESOLVER select NETFS_SUPPORT + select CRYPTO_KRB5 help If you say Y here, you will get an experimental Andrew File System driver. It currently only supports unsecured read-only AFS access. diff --git a/fs/afs/Makefile b/fs/afs/Makefile index 5efd7e13b304..b49b8fe682f3 100644 --- a/fs/afs/Makefile +++ b/fs/afs/Makefile @@ -8,6 +8,7 @@ kafs-y := \ addr_prefs.o \ callback.o \ cell.o \ + cm_security.o \ cmservice.o \ dir.o \ dir_edit.o \ diff --git a/fs/afs/cm_security.c b/fs/afs/cm_security.c new file mode 100644 index 000000000000..edcbd249d202 --- /dev/null +++ b/fs/afs/cm_security.c @@ -0,0 +1,340 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* Cache manager security. + * + * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved. + * Written by David Howells (dhowells@redhat.com) + */ + +#include <linux/slab.h> +#include <crypto/krb5.h> +#include "internal.h" +#include "afs_cm.h" +#include "afs_fs.h" +#include "protocol_yfs.h" +#define RXRPC_TRACE_ONLY_DEFINE_ENUMS +#include <trace/events/rxrpc.h> + +#define RXGK_SERVER_ENC_TOKEN 1036U // 0x40c +#define xdr_round_up(x) (round_up((x), sizeof(__be32))) +#define xdr_len_object(x) (4 + round_up((x), sizeof(__be32))) + +#ifdef CONFIG_RXGK +static int afs_create_yfs_cm_token(struct sk_buff *challenge, + struct afs_server *server); +#endif + +/* + * Respond to an RxGK challenge, adding appdata. 
+ */ +static int afs_respond_to_challenge(struct sk_buff *challenge) +{ +#ifdef CONFIG_RXGK + struct krb5_buffer appdata = {}; + struct afs_server *server; +#endif + struct rxrpc_peer *peer; + unsigned long peer_data; + u16 service_id; + u8 security_index; + + rxrpc_kernel_query_challenge(challenge, &peer, &peer_data, + &service_id, &security_index); + + _enter("%u,%u", service_id, security_index); + + switch (service_id) { + /* We don't send CM_SERVICE RPCs, so don't expect a challenge + * therefrom. + */ + case FS_SERVICE: + case VL_SERVICE: + case YFS_FS_SERVICE: + case YFS_VL_SERVICE: + break; + default: + pr_warn("Can't respond to unknown challenge %u:%u", + service_id, security_index); + return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO, + afs_abort_unsupported_sec_class); + } + + switch (security_index) { +#ifdef CONFIG_RXKAD + case RXRPC_SECURITY_RXKAD: + return rxkad_kernel_respond_to_challenge(challenge); +#endif + +#ifdef CONFIG_RXGK + case RXRPC_SECURITY_RXGK: + return rxgk_kernel_respond_to_challenge(challenge, &appdata); + + case RXRPC_SECURITY_YFS_RXGK: + switch (service_id) { + case FS_SERVICE: + case YFS_FS_SERVICE: + server = (struct afs_server *)peer_data; + if (!server->cm_rxgk_appdata.data) { + mutex_lock(&server->cm_token_lock); + if (!server->cm_rxgk_appdata.data) + afs_create_yfs_cm_token(challenge, server); + mutex_unlock(&server->cm_token_lock); + } + if (server->cm_rxgk_appdata.data) + appdata = server->cm_rxgk_appdata; + break; + } + return rxgk_kernel_respond_to_challenge(challenge, &appdata); +#endif + + default: + return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO, + afs_abort_unsupported_sec_class); + } +} + +/* + * Process the OOB message queue, processing challenge packets. + */ +void afs_process_oob_queue(struct work_struct *work) +{ + struct afs_net *net = container_of(work, struct afs_net, rx_oob_work); + struct sk_buff *oob; + enum rxrpc_oob_type type; + + while ((oob = rxrpc_kernel_dequeue_oob(net->socket, &type))) { + switch (type) { + case RXRPC_OOB_CHALLENGE: + afs_respond_to_challenge(oob); + break; + } + rxrpc_kernel_free_oob(oob); + } +} + +#ifdef CONFIG_RXGK +/* + * Create a securities keyring for the cache manager and attach a key to it for + * the RxGK tokens we want to use to secure the callback connection back from + * the fileserver. 
+ */ +int afs_create_token_key(struct afs_net *net, struct socket *socket) +{ + const struct krb5_enctype *krb5; + struct key *ring; + key_ref_t key; + char K0[32], *desc; + int ret; + + ring = keyring_alloc("kafs", + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(), + KEY_POS_SEARCH | KEY_POS_WRITE | + KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH, + KEY_ALLOC_NOT_IN_QUOTA, + NULL, NULL); + if (IS_ERR(ring)) + return PTR_ERR(ring); + + ret = rxrpc_sock_set_security_keyring(socket->sk, ring); + if (ret < 0) + goto out; + + ret = -ENOPKG; + krb5 = crypto_krb5_find_enctype(KRB5_ENCTYPE_AES128_CTS_HMAC_SHA1_96); + if (!krb5) + goto out; + + if (WARN_ON_ONCE(krb5->key_len > sizeof(K0))) + goto out; + + ret = -ENOMEM; + desc = kasprintf(GFP_KERNEL, "%u:%u:%u:%u", + YFS_CM_SERVICE, RXRPC_SECURITY_YFS_RXGK, 1, krb5->etype); + if (!desc) + goto out; + + wait_for_random_bytes(); + get_random_bytes(K0, krb5->key_len); + + key = key_create(make_key_ref(ring, true), + "rxrpc_s", desc, + K0, krb5->key_len, + KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | KEY_USR_VIEW, + KEY_ALLOC_NOT_IN_QUOTA); + kfree(desc); + if (IS_ERR(key)) { + ret = PTR_ERR(key); + goto out; + } + + net->fs_cm_token_key = key_ref_to_ptr(key); + ret = 0; +out: + key_put(ring); + return ret; +} + +/* + * Create an YFS RxGK GSS token to use as a ticket to the specified fileserver. + */ +static int afs_create_yfs_cm_token(struct sk_buff *challenge, + struct afs_server *server) +{ + const struct krb5_enctype *conn_krb5, *token_krb5; + const struct krb5_buffer *token_key; + struct crypto_aead *aead; + struct scatterlist sg; + struct afs_net *net = server->cell->net; + const struct key *key = net->fs_cm_token_key; + size_t keysize, uuidsize, authsize, toksize, encsize, contsize, adatasize, offset; + __be32 caps[1] = { + [0] = htonl(AFS_CAP_ERROR_TRANSLATION), + }; + __be32 *xdr; + void *appdata, *K0, *encbase; + u32 enctype; + int ret; + + if (!key) + return -ENOKEY; + + /* Assume that the fileserver is happy to use the same encoding type as + * we were told to use by the token obtained by the user. 
+ */ + enctype = rxgk_kernel_query_challenge(challenge); + + conn_krb5 = crypto_krb5_find_enctype(enctype); + if (!conn_krb5) + return -ENOPKG; + token_krb5 = key->payload.data[0]; + token_key = (const struct krb5_buffer *)&key->payload.data[2]; + + /* struct rxgk_key { + * afs_uint32 enctype; + * opaque key<>; + * }; + */ + keysize = 4 + xdr_len_object(conn_krb5->key_len); + + /* struct RXGK_AuthName { + * afs_int32 kind; + * opaque data<AUTHDATAMAX>; + * opaque display<AUTHPRINTABLEMAX>; + * }; + */ + uuidsize = sizeof(server->uuid); + authsize = 4 + xdr_len_object(uuidsize) + xdr_len_object(0); + + /* struct RXGK_Token { + * rxgk_key K0; + * RXGK_Level level; + * rxgkTime starttime; + * afs_int32 lifetime; + * afs_int32 bytelife; + * rxgkTime expirationtime; + * struct RXGK_AuthName identities<>; + * }; + */ + toksize = keysize + 8 + 4 + 4 + 8 + xdr_len_object(authsize); + + offset = 0; + encsize = crypto_krb5_how_much_buffer(token_krb5, KRB5_ENCRYPT_MODE, toksize, &offset); + + /* struct RXGK_TokenContainer { + * afs_int32 kvno; + * afs_int32 enctype; + * opaque encrypted_token<>; + * }; + */ + contsize = 4 + 4 + xdr_len_object(encsize); + + /* struct YFSAppData { + * opr_uuid initiatorUuid; + * opr_uuid acceptorUuid; + * Capabilities caps; + * afs_int32 enctype; + * opaque callbackKey<>; + * opaque callbackToken<>; + * }; + */ + adatasize = 16 + 16 + + xdr_len_object(sizeof(caps)) + + 4 + + xdr_len_object(conn_krb5->key_len) + + xdr_len_object(contsize); + + ret = -ENOMEM; + appdata = kzalloc(adatasize, GFP_KERNEL); + if (!appdata) + goto out; + xdr = appdata; + + memcpy(xdr, &net->uuid, 16); /* appdata.initiatorUuid */ + xdr += 16 / 4; + memcpy(xdr, &server->uuid, 16); /* appdata.acceptorUuid */ + xdr += 16 / 4; + *xdr++ = htonl(ARRAY_SIZE(caps)); /* appdata.caps.len */ + memcpy(xdr, &caps, sizeof(caps)); /* appdata.caps */ + xdr += ARRAY_SIZE(caps); + *xdr++ = htonl(conn_krb5->etype); /* appdata.enctype */ + + *xdr++ = htonl(conn_krb5->key_len); /* appdata.callbackKey.len */ + K0 = xdr; + get_random_bytes(K0, conn_krb5->key_len); /* appdata.callbackKey.data */ + xdr += xdr_round_up(conn_krb5->key_len) / 4; + + *xdr++ = htonl(contsize); /* appdata.callbackToken.len */ + *xdr++ = htonl(1); /* cont.kvno */ + *xdr++ = htonl(token_krb5->etype); /* cont.enctype */ + *xdr++ = htonl(encsize); /* cont.encrypted_token.len */ + + encbase = xdr; + xdr += offset / 4; + *xdr++ = htonl(conn_krb5->etype); /* token.K0.enctype */ + *xdr++ = htonl(conn_krb5->key_len); /* token.K0.key.len */ + memcpy(xdr, K0, conn_krb5->key_len); /* token.K0.key.data */ + xdr += xdr_round_up(conn_krb5->key_len) / 4; + + *xdr++ = htonl(RXRPC_SECURITY_ENCRYPT); /* token.level */ + *xdr++ = htonl(0); /* token.starttime */ + *xdr++ = htonl(0); /* " */ + *xdr++ = htonl(0); /* token.lifetime */ + *xdr++ = htonl(0); /* token.bytelife */ + *xdr++ = htonl(0); /* token.expirationtime */ + *xdr++ = htonl(0); /* " */ + *xdr++ = htonl(1); /* token.identities.count */ + *xdr++ = htonl(0); /* token.identities[0].kind */ + *xdr++ = htonl(uuidsize); /* token.identities[0].data.len */ + memcpy(xdr, &server->uuid, uuidsize); + xdr += xdr_round_up(uuidsize) / 4; + *xdr++ = htonl(0); /* token.identities[0].display.len */ + + xdr = encbase + xdr_round_up(encsize); + + if ((unsigned long)xdr - (unsigned long)appdata != adatasize) + pr_err("Appdata size incorrect %lx != %zx\n", + (unsigned long)xdr - (unsigned long)appdata, adatasize); + + aead = crypto_krb5_prepare_encryption(token_krb5, token_key, RXGK_SERVER_ENC_TOKEN, + GFP_KERNEL); + if 
(IS_ERR(aead)) { + ret = PTR_ERR(aead); + goto out_token; + } + + sg_init_one(&sg, encbase, encsize); + ret = crypto_krb5_encrypt(token_krb5, aead, &sg, 1, encsize, offset, toksize, false); + if (ret < 0) + goto out_aead; + + server->cm_rxgk_appdata.len = adatasize; + server->cm_rxgk_appdata.data = appdata; + appdata = NULL; + +out_aead: + crypto_free_aead(aead); +out_token: + kfree(appdata); +out: + return ret; +} +#endif /* CONFIG_RXGK */ diff --git a/fs/afs/internal.h b/fs/afs/internal.h index 440b0e731093..1124ea4000cb 100644 --- a/fs/afs/internal.h +++ b/fs/afs/internal.h @@ -20,6 +20,7 @@ #include <linux/uuid.h> #include <linux/mm_types.h> #include <linux/dns_resolver.h> +#include <crypto/krb5.h> #include <net/net_namespace.h> #include <net/netns/generic.h> #include <net/sock.h> @@ -176,8 +177,10 @@ struct afs_call { bool intr; /* T if interruptible */ bool unmarshalling_error; /* T if an unmarshalling error occurred */ bool responded; /* Got a response from the call (may be abort) */ + u8 security_ix; /* Security class */ u16 service_id; /* Actual service ID (after upgrade) */ unsigned int debug_id; /* Trace ID */ + u32 enctype; /* Security encoding type */ u32 operation_ID; /* operation ID for an incoming call */ u32 count; /* count for use in unmarshalling */ union { /* place to extract temporary data */ @@ -281,6 +284,7 @@ struct afs_net { struct socket *socket; struct afs_call *spare_incoming_call; struct work_struct charge_preallocation_work; + struct work_struct rx_oob_work; struct mutex socket_mutex; atomic_t nr_outstanding_calls; atomic_t nr_superblocks; @@ -305,6 +309,7 @@ struct afs_net { struct list_head fs_probe_slow; /* List of afs_server to probe at 5m intervals */ struct hlist_head fs_proc; /* procfs servers list */ + struct key *fs_cm_token_key; /* Key for creating CM tokens */ struct work_struct fs_prober; struct timer_list fs_probe_timer; atomic_t servers_outstanding; @@ -540,6 +545,8 @@ struct afs_server { struct list_head volumes; /* RCU list of afs_server_entry objects */ struct work_struct destroyer; /* Work item to try and destroy a server */ struct timer_list timer; /* Management timer */ + struct mutex cm_token_lock; /* Lock governing creation of appdata */ + struct krb5_buffer cm_rxgk_appdata; /* Appdata to be included in RESPONSE packet */ time64_t unuse_time; /* Time at which last unused */ unsigned long flags; #define AFS_SERVER_FL_RESPONDING 0 /* The server is responding */ @@ -1059,6 +1066,19 @@ extern void __net_exit afs_cell_purge(struct afs_net *); extern bool afs_cm_incoming_call(struct afs_call *); /* + * cm_security.c + */ +void afs_process_oob_queue(struct work_struct *work); +#ifdef CONFIG_RXGK +int afs_create_token_key(struct afs_net *net, struct socket *socket); +#else +static inline int afs_create_token_key(struct afs_net *net, struct socket *socket) +{ + return 0; +} +#endif + +/* * dir.c */ extern const struct file_operations afs_dir_file_operations; diff --git a/fs/afs/main.c b/fs/afs/main.c index c845c5daaeba..02475d415d88 100644 --- a/fs/afs/main.c +++ b/fs/afs/main.c @@ -73,6 +73,7 @@ static int __net_init afs_net_init(struct net *net_ns) generate_random_uuid((unsigned char *)&net->uuid); INIT_WORK(&net->charge_preallocation_work, afs_charge_preallocation); + INIT_WORK(&net->rx_oob_work, afs_process_oob_queue); mutex_init(&net->socket_mutex); net->cells = RB_ROOT; diff --git a/fs/afs/misc.c b/fs/afs/misc.c index b8180bf2281f..8f2b3a177690 100644 --- a/fs/afs/misc.c +++ b/fs/afs/misc.c @@ -8,6 +8,7 @@ #include <linux/kernel.h> #include 
<linux/module.h> #include <linux/errno.h> +#include <crypto/krb5.h> #include "internal.h" #include "afs_fs.h" #include "protocol_uae.h" @@ -103,6 +104,32 @@ int afs_abort_to_error(u32 abort_code) case RXKADDATALEN: return -EKEYREJECTED; case RXKADILLEGALLEVEL: return -EKEYREJECTED; + case RXGK_INCONSISTENCY: return -EPROTO; + case RXGK_PACKETSHORT: return -EPROTO; + case RXGK_BADCHALLENGE: return -EPROTO; + case RXGK_SEALEDINCON: return -EKEYREJECTED; + case RXGK_NOTAUTH: return -EKEYREJECTED; + case RXGK_EXPIRED: return -EKEYEXPIRED; + case RXGK_BADLEVEL: return -EKEYREJECTED; + case RXGK_BADKEYNO: return -EKEYREJECTED; + case RXGK_NOTRXGK: return -EKEYREJECTED; + case RXGK_UNSUPPORTED: return -EKEYREJECTED; + case RXGK_GSSERROR: return -EKEYREJECTED; +#ifdef RXGK_BADETYPE + case RXGK_BADETYPE: return -ENOPKG; +#endif +#ifdef RXGK_BADTOKEN + case RXGK_BADTOKEN: return -EKEYREJECTED; +#endif +#ifdef RXGK_BADETYPE + case RXGK_DATALEN: return -EPROTO; +#endif +#ifdef RXGK_BADQOP + case RXGK_BADQOP: return -EKEYREJECTED; +#endif + + case KRB5_PROG_KEYTYPE_NOSUPP: return -ENOPKG; + case RXGEN_OPCODE: return -ENOTSUPP; default: return -EREMOTEIO; diff --git a/fs/afs/mntpt.c b/fs/afs/mntpt.c index 45cee6534122..9434a5399f2b 100644 --- a/fs/afs/mntpt.c +++ b/fs/afs/mntpt.c @@ -189,7 +189,6 @@ struct vfsmount *afs_d_automount(struct path *path) if (IS_ERR(newmnt)) return newmnt; - mntget(newmnt); /* prevent immediate expiration */ mnt_set_expiry(newmnt, &afs_vfsmounts); queue_delayed_work(afs_wq, &afs_mntpt_expiry_timer, afs_mntpt_expiry_timeout * HZ); diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c index d5e480a33859..c1cadf8fb346 100644 --- a/fs/afs/rxrpc.c +++ b/fs/afs/rxrpc.c @@ -24,8 +24,17 @@ static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned static void afs_process_async_call(struct work_struct *); static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long); static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long); +static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID); +static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob); static int afs_deliver_cm_op_id(struct afs_call *); +static const struct rxrpc_kernel_ops afs_rxrpc_callback_ops = { + .notify_new_call = afs_rx_new_call, + .discard_new_call = afs_rx_discard_new_call, + .user_attach_call = afs_rx_attach, + .notify_oob = afs_rx_notify_oob, +}; + /* asynchronous incoming call initial processing */ static const struct afs_call_type afs_RXCMxxxx = { .name = "CB.xxxx", @@ -49,6 +58,7 @@ int afs_open_socket(struct afs_net *net) goto error_1; socket->sk->sk_allocation = GFP_NOFS; + socket->sk->sk_user_data = net; /* bind the callback manager's address to make this a server socket */ memset(&srx, 0, sizeof(srx)); @@ -64,6 +74,14 @@ int afs_open_socket(struct afs_net *net) if (ret < 0) goto error_2; + ret = rxrpc_sock_set_manage_response(socket->sk, true); + if (ret < 0) + goto error_2; + + ret = afs_create_token_key(net, socket); + if (ret < 0) + pr_err("Couldn't create RxGK CM key: %d\n", ret); + ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx)); if (ret == -EADDRINUSE) { srx.transport.sin6.sin6_port = 0; @@ -84,8 +102,7 @@ int afs_open_socket(struct afs_net *net) * it sends back to us. 
*/ - rxrpc_kernel_new_call_notification(socket, afs_rx_new_call, - afs_rx_discard_new_call); + rxrpc_kernel_set_notifications(socket, &afs_rxrpc_callback_ops); ret = kernel_listen(socket, INT_MAX); if (ret < 0) @@ -125,7 +142,9 @@ void afs_close_socket(struct afs_net *net) kernel_sock_shutdown(net->socket, SHUT_RDWR); flush_workqueue(afs_async_calls); + net->socket->sk->sk_user_data = NULL; sock_release(net->socket); + key_put(net->fs_cm_token_key); _debug("dework"); _leave(""); @@ -738,7 +757,6 @@ void afs_charge_preallocation(struct work_struct *work) if (rxrpc_kernel_charge_accept(net->socket, afs_wake_up_async_call, - afs_rx_attach, (unsigned long)call, GFP_KERNEL, call->debug_id) < 0) @@ -800,10 +818,14 @@ static int afs_deliver_cm_op_id(struct afs_call *call) if (!afs_cm_incoming_call(call)) return -ENOTSUPP; + call->security_ix = rxrpc_kernel_query_call_security(call->rxcall, + &call->service_id, + &call->enctype); + trace_afs_cb_call(call); call->work.func = call->type->work; - /* pass responsibility for the remainer of this message off to the + /* pass responsibility for the remainder of this message off to the * cache manager op */ return call->type->deliver(call); } @@ -952,3 +974,13 @@ noinline int afs_protocol_error(struct afs_call *call, call->unmarshalling_error = true; return -EBADMSG; } + +/* + * Wake up OOB notification processing. + */ +static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob) +{ + struct afs_net *net = sk->sk_user_data; + + schedule_work(&net->rx_oob_work); +} diff --git a/fs/afs/server.c b/fs/afs/server.c index 8755f2703815..a97562f831eb 100644 --- a/fs/afs/server.c +++ b/fs/afs/server.c @@ -131,6 +131,7 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell, const uuid_t * timer_setup(&server->timer, afs_server_timer, 0); INIT_LIST_HEAD(&server->volumes); init_waitqueue_head(&server->probe_wq); + mutex_init(&server->cm_token_lock); INIT_LIST_HEAD(&server->probe_link); INIT_HLIST_NODE(&server->proc_link); spin_lock_init(&server->probe_lock); @@ -396,6 +397,7 @@ static void afs_server_rcu(struct rcu_head *rcu) afs_put_endpoint_state(rcu_access_pointer(server->endpoint_state), afs_estate_trace_put_server); afs_put_cell(server->cell, afs_cell_trace_put_server); + kfree(server->cm_rxgk_appdata.data); kfree(server); } diff --git a/fs/afs/write.c b/fs/afs/write.c index 18b0a9f1615e..2e7526ea883a 100644 --- a/fs/afs/write.c +++ b/fs/afs/write.c @@ -120,17 +120,17 @@ static void afs_issue_write_worker(struct work_struct *work) #if 0 // Error injection if (subreq->debug_index == 3) - return netfs_write_subrequest_terminated(subreq, -ENOANO, false); + return netfs_write_subrequest_terminated(subreq, -ENOANO); if (!subreq->retry_count) { set_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags); - return netfs_write_subrequest_terminated(subreq, -EAGAIN, false); + return netfs_write_subrequest_terminated(subreq, -EAGAIN); } #endif op = afs_alloc_operation(wreq->netfs_priv, vnode->volume); if (IS_ERR(op)) - return netfs_write_subrequest_terminated(subreq, -EAGAIN, false); + return netfs_write_subrequest_terminated(subreq, -EAGAIN); afs_op_set_vnode(op, 0, vnode); op->file[0].dv_delta = 1; @@ -166,7 +166,7 @@ static void afs_issue_write_worker(struct work_struct *work) break; } - netfs_write_subrequest_terminated(subreq, ret < 0 ? ret : subreq->len, false); + netfs_write_subrequest_terminated(subreq, ret < 0 ? 
ret : subreq->len); } void afs_issue_write(struct netfs_io_subrequest *subreq) @@ -202,6 +202,7 @@ void afs_retry_request(struct netfs_io_request *wreq, struct netfs_io_stream *st case NETFS_READ_GAPS: case NETFS_READ_SINGLE: case NETFS_READ_FOR_WRITE: + case NETFS_UNBUFFERED_READ: case NETFS_DIO_READ: return; default: diff --git a/fs/bcachefs/alloc_background.c b/fs/bcachefs/alloc_background.c index 173e81c2bbcb..b228a5a64479 100644 --- a/fs/bcachefs/alloc_background.c +++ b/fs/bcachefs/alloc_background.c @@ -21,7 +21,6 @@ #include "error.h" #include "lru.h" #include "recovery.h" -#include "trace.h" #include "varint.h" #include <linux/kthread.h> @@ -337,11 +336,10 @@ void bch2_alloc_v4_swab(struct bkey_s k) a->stripe_sectors = swab32(a->stripe_sectors); } -void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +static inline void __bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, + unsigned dev, const struct bch_alloc_v4 *a) { - struct bch_alloc_v4 _a; - const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); - struct bch_dev *ca = c ? bch2_dev_bucket_tryget_noerror(c, k.k->p) : NULL; + struct bch_dev *ca = c ? bch2_dev_tryget_noerror(c, dev) : NULL; prt_newline(out); printbuf_indent_add(out, 2); @@ -369,6 +367,19 @@ void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c bch2_dev_put(ca); } +void bch2_alloc_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + struct bch_alloc_v4 _a; + const struct bch_alloc_v4 *a = bch2_alloc_to_v4(k, &_a); + + __bch2_alloc_v4_to_text(out, c, k.k->p.inode, a); +} + +void bch2_alloc_v4_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c k) +{ + __bch2_alloc_v4_to_text(out, c, k.k->p.inode, bkey_s_c_to_alloc_v4(k).v); +} + void __bch2_alloc_to_v4(struct bkey_s_c k, struct bch_alloc_v4 *out) { if (k.k->type == KEY_TYPE_alloc_v4) { @@ -697,8 +708,8 @@ static int __need_discard_or_freespace_err(struct btree_trans *trans, set ? 
"" : "un", bch2_btree_id_str(btree), buf.buf); - if (ret == -BCH_ERR_fsck_ignore || - ret == -BCH_ERR_fsck_errors_not_fixed) + if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) || + bch2_err_matches(ret, BCH_ERR_fsck_errors_not_fixed)) ret = 0; printbuf_exit(&buf); @@ -854,7 +865,7 @@ int bch2_trigger_alloc(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_bucket_tryget(c, new.k->p); if (!ca) - return -BCH_ERR_trigger_alloc; + return bch_err_throw(c, trigger_alloc); struct bch_alloc_v4 old_a_convert; const struct bch_alloc_v4 *old_a = bch2_alloc_to_v4(old, &old_a_convert); @@ -988,14 +999,11 @@ int bch2_trigger_alloc(struct btree_trans *trans, } if (new_a->gen != old_a->gen) { - rcu_read_lock(); + guard(rcu)(); u8 *gen = bucket_gen(ca, new.k->p.offset); - if (unlikely(!gen)) { - rcu_read_unlock(); + if (unlikely(!gen)) goto invalid_bucket; - } *gen = new_a->gen; - rcu_read_unlock(); } #define eval_state(_a, expr) ({ const struct bch_alloc_v4 *a = _a; expr; }) @@ -1021,15 +1029,12 @@ int bch2_trigger_alloc(struct btree_trans *trans, } if ((flags & BTREE_TRIGGER_gc) && (flags & BTREE_TRIGGER_insert)) { - rcu_read_lock(); + guard(rcu)(); struct bucket *g = gc_bucket(ca, new.k->p.offset); - if (unlikely(!g)) { - rcu_read_unlock(); + if (unlikely(!g)) goto invalid_bucket; - } g->gen_valid = 1; g->gen = new_a->gen; - rcu_read_unlock(); } err: fsck_err: @@ -1039,7 +1044,7 @@ fsck_err: invalid_bucket: bch2_fs_inconsistent(c, "reference to invalid bucket\n%s", (bch2_bkey_val_to_text(&buf, c, new.s_c), buf.buf)); - ret = -BCH_ERR_trigger_alloc; + ret = bch_err_throw(c, trigger_alloc); goto err; } @@ -1105,13 +1110,12 @@ static bool next_bucket(struct bch_fs *c, struct bch_dev **ca, struct bpos *buck bucket->offset = 0; } - rcu_read_lock(); + guard(rcu)(); *ca = __bch2_next_dev_idx(c, bucket->inode, NULL); if (*ca) { *bucket = POS((*ca)->dev_idx, (*ca)->mi.first_bucket); bch2_dev_get(*ca); } - rcu_read_unlock(); return *ca != NULL; } @@ -1454,7 +1458,7 @@ delete: ret = bch2_btree_bit_mod_iter(trans, iter, false) ?: bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_commit; + bch_err_throw(c, transaction_restart_commit); goto out; } else { /* @@ -1777,14 +1781,16 @@ int bch2_check_alloc_to_lru_refs(struct bch_fs *c) static int discard_in_flight_add(struct bch_dev *ca, u64 bucket, bool in_progress) { + struct bch_fs *c = ca->fs; int ret; mutex_lock(&ca->discard_buckets_in_flight_lock); - darray_for_each(ca->discard_buckets_in_flight, i) - if (i->bucket == bucket) { - ret = -BCH_ERR_EEXIST_discard_in_flight_add; - goto out; - } + struct discard_in_flight *i = + darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket); + if (i) { + ret = bch_err_throw(c, EEXIST_discard_in_flight_add); + goto out; + } ret = darray_push(&ca->discard_buckets_in_flight, ((struct discard_in_flight) { .in_progress = in_progress, @@ -1798,14 +1804,11 @@ out: static void discard_in_flight_remove(struct bch_dev *ca, u64 bucket) { mutex_lock(&ca->discard_buckets_in_flight_lock); - darray_for_each(ca->discard_buckets_in_flight, i) - if (i->bucket == bucket) { - BUG_ON(!i->in_progress); - darray_remove_item(&ca->discard_buckets_in_flight, i); - goto found; - } - BUG(); -found: + struct discard_in_flight *i = + darray_find_p(ca->discard_buckets_in_flight, i, i->bucket == bucket); + BUG_ON(!i || !i->in_progress); + + darray_remove_item(&ca->discard_buckets_in_flight, i); mutex_unlock(&ca->discard_buckets_in_flight_lock); } @@ -2504,7 +2507,7 @@ void 
bch2_recalc_capacity(struct bch_fs *c) lockdep_assert_held(&c->state_lock); - rcu_read_lock(); + guard(rcu)(); for_each_member_device_rcu(c, ca, NULL) { struct block_device *bdev = READ_ONCE(ca->disk_sb.bdev); if (bdev) @@ -2549,7 +2552,6 @@ void bch2_recalc_capacity(struct bch_fs *c) bucket_size_max = max_t(unsigned, bucket_size_max, ca->mi.bucket_size); } - rcu_read_unlock(); bch2_set_ra_pages(c, ra_pages); @@ -2574,10 +2576,9 @@ u64 bch2_min_rw_member_capacity(struct bch_fs *c) { u64 ret = U64_MAX; - rcu_read_lock(); + guard(rcu)(); for_each_rw_member_rcu(c, ca) ret = min(ret, ca->mi.nbuckets * ca->mi.bucket_size); - rcu_read_unlock(); return ret; } diff --git a/fs/bcachefs/alloc_background.h b/fs/bcachefs/alloc_background.h index 4f94c6a661bf..0cc5adc55b6f 100644 --- a/fs/bcachefs/alloc_background.h +++ b/fs/bcachefs/alloc_background.h @@ -13,11 +13,9 @@ static inline bool bch2_dev_bucket_exists(struct bch_fs *c, struct bpos pos) { - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, pos.inode); - bool ret = ca && bucket_valid(ca, pos.offset); - rcu_read_unlock(); - return ret; + return ca && bucket_valid(ca, pos.offset); } static inline u64 bucket_to_u64(struct bpos bucket) @@ -253,6 +251,7 @@ int bch2_alloc_v4_validate(struct bch_fs *, struct bkey_s_c, struct bkey_validate_context); void bch2_alloc_v4_swab(struct bkey_s); void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); +void bch2_alloc_v4_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc ((struct bkey_ops) { \ .key_validate = bch2_alloc_v1_validate, \ @@ -277,7 +276,7 @@ void bch2_alloc_to_text(struct printbuf *, struct bch_fs *, struct bkey_s_c); #define bch2_bkey_ops_alloc_v4 ((struct bkey_ops) { \ .key_validate = bch2_alloc_v4_validate, \ - .val_to_text = bch2_alloc_to_text, \ + .val_to_text = bch2_alloc_v4_to_text, \ .swab = bch2_alloc_v4_swab, \ .trigger = bch2_trigger_alloc, \ .min_val_size = 48, \ diff --git a/fs/bcachefs/alloc_foreground.c b/fs/bcachefs/alloc_foreground.c index 1a52c12c51ae..b375ad610acd 100644 --- a/fs/bcachefs/alloc_foreground.c +++ b/fs/bcachefs/alloc_foreground.c @@ -69,10 +69,9 @@ const char * const bch2_watermarks[] = { void bch2_reset_alloc_cursors(struct bch_fs *c) { - rcu_read_lock(); + guard(rcu)(); for_each_member_device_rcu(c, ca, NULL) memset(ca->alloc_cursor, 0, sizeof(ca->alloc_cursor)); - rcu_read_unlock(); } static void bch2_open_bucket_hash_add(struct bch_fs *c, struct open_bucket *ob) @@ -166,9 +165,8 @@ static void open_bucket_free_unused(struct bch_fs *c, struct open_bucket *ob) ARRAY_SIZE(c->open_buckets_partial)); spin_lock(&c->freelist_lock); - rcu_read_lock(); - bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++; - rcu_read_unlock(); + scoped_guard(rcu) + bch2_dev_rcu(c, ob->dev)->nr_partial_buckets++; ob->on_partial_list = true; c->open_buckets_partial[c->open_buckets_partial_nr++] = @@ -229,7 +227,7 @@ static struct open_bucket *__try_alloc_bucket(struct bch_fs *c, track_event_change(&c->times[BCH_TIME_blocked_allocate_open_bucket], true); spin_unlock(&c->freelist_lock); - return ERR_PTR(-BCH_ERR_open_buckets_empty); + return ERR_PTR(bch_err_throw(c, open_buckets_empty)); } /* Recheck under lock: */ @@ -535,7 +533,7 @@ again: track_event_change(&c->times[BCH_TIME_blocked_allocate], true); - ob = ERR_PTR(-BCH_ERR_freelist_empty); + ob = ERR_PTR(bch_err_throw(c, freelist_empty)); goto err; } @@ -560,7 +558,7 @@ alloc: } err: if (!ob) - ob = ERR_PTR(-BCH_ERR_no_buckets_found); + ob = 
ERR_PTR(bch_err_throw(c, no_buckets_found)); if (!IS_ERR(ob)) ob->data_type = req->data_type; @@ -603,18 +601,18 @@ static int __dev_stripe_cmp(struct dev_stripe_state *stripe, #define dev_stripe_cmp(l, r) __dev_stripe_cmp(stripe, l, r) -struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *c, - struct dev_stripe_state *stripe, - struct bch_devs_mask *devs) +void bch2_dev_alloc_list(struct bch_fs *c, + struct dev_stripe_state *stripe, + struct bch_devs_mask *devs, + struct dev_alloc_list *ret) { - struct dev_alloc_list ret = { .nr = 0 }; - unsigned i; + ret->nr = 0; + unsigned i; for_each_set_bit(i, devs->d, BCH_SB_MEMBERS_MAX) - ret.data[ret.nr++] = i; + ret->data[ret->nr++] = i; - bubble_sort(ret.data, ret.nr, dev_stripe_cmp); - return ret; + bubble_sort(ret->data, ret->nr, dev_stripe_cmp); } static const u64 stripe_clock_hand_rescale = 1ULL << 62; /* trigger rescale at */ @@ -705,18 +703,19 @@ static int add_new_bucket(struct bch_fs *c, return 0; } -int bch2_bucket_alloc_set_trans(struct btree_trans *trans, - struct alloc_request *req, - struct dev_stripe_state *stripe, - struct closure *cl) +inline int bch2_bucket_alloc_set_trans(struct btree_trans *trans, + struct alloc_request *req, + struct dev_stripe_state *stripe, + struct closure *cl) { struct bch_fs *c = trans->c; - int ret = -BCH_ERR_insufficient_devices; + int ret = 0; BUG_ON(req->nr_effective >= req->nr_replicas); - struct dev_alloc_list devs_sorted = bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc); - darray_for_each(devs_sorted, i) { + bch2_dev_alloc_list(c, stripe, &req->devs_may_alloc, &req->devs_sorted); + + darray_for_each(req->devs_sorted, i) { req->ca = bch2_dev_tryget_noerror(c, *i); if (!req->ca) continue; @@ -739,13 +738,16 @@ int bch2_bucket_alloc_set_trans(struct btree_trans *trans, continue; } - if (add_new_bucket(c, req, ob)) { - ret = 0; + ret = add_new_bucket(c, req, ob); + if (ret) break; - } } - return ret; + if (ret == 1) + return 0; + if (ret) + return ret; + return bch_err_throw(c, insufficient_devices); } /* Allocate from stripes: */ @@ -776,9 +778,9 @@ static int bucket_alloc_from_stripe(struct btree_trans *trans, if (!h) return 0; - struct dev_alloc_list devs_sorted = - bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc); - darray_for_each(devs_sorted, i) + bch2_dev_alloc_list(c, &req->wp->stripe, &req->devs_may_alloc, &req->devs_sorted); + + darray_for_each(req->devs_sorted, i) for (unsigned ec_idx = 0; ec_idx < h->s->nr_data; ec_idx++) { if (!h->s->blocks[ec_idx]) continue; @@ -872,9 +874,8 @@ static int bucket_alloc_set_partial(struct bch_fs *c, i); ob->on_partial_list = false; - rcu_read_lock(); - bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; - rcu_read_unlock(); + scoped_guard(rcu) + bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; ret = add_new_bucket(c, req, ob); if (ret) @@ -1056,9 +1057,8 @@ void bch2_open_buckets_stop(struct bch_fs *c, struct bch_dev *ca, ob->on_partial_list = false; - rcu_read_lock(); - bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; - rcu_read_unlock(); + scoped_guard(rcu) + bch2_dev_rcu(c, ob->dev)->nr_partial_buckets--; spin_unlock(&c->freelist_lock); bch2_open_bucket_put(c, ob); @@ -1086,14 +1086,11 @@ static struct write_point *__writepoint_find(struct hlist_head *head, { struct write_point *wp; - rcu_read_lock(); + guard(rcu)(); hlist_for_each_entry_rcu(wp, head, node) if (wp->write_point == write_point) - goto out; - wp = NULL; -out: - rcu_read_unlock(); - return wp; + return wp; + return NULL; } static inline bool too_many_writepoints(struct 
bch_fs *c, unsigned factor) @@ -1104,7 +1101,7 @@ static inline bool too_many_writepoints(struct bch_fs *c, unsigned factor) return stranded * factor > free; } -static bool try_increase_writepoints(struct bch_fs *c) +static noinline bool try_increase_writepoints(struct bch_fs *c) { struct write_point *wp; @@ -1117,7 +1114,7 @@ static bool try_increase_writepoints(struct bch_fs *c) return true; } -static bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr) +static noinline bool try_decrease_writepoints(struct btree_trans *trans, unsigned old_nr) { struct bch_fs *c = trans->c; struct write_point *wp; @@ -1379,11 +1376,11 @@ err: goto retry; if (cl && bch2_err_matches(ret, BCH_ERR_open_buckets_empty)) - ret = -BCH_ERR_bucket_alloc_blocked; + ret = bch_err_throw(c, bucket_alloc_blocked); if (cl && !(flags & BCH_WRITE_alloc_nowait) && bch2_err_matches(ret, BCH_ERR_freelist_empty)) - ret = -BCH_ERR_bucket_alloc_blocked; + ret = bch_err_throw(c, bucket_alloc_blocked); return ret; } @@ -1637,19 +1634,16 @@ static noinline void bch2_print_allocator_stuck(struct bch_fs *c) bch2_printbuf_make_room(&buf, 4096); - rcu_read_lock(); buf.atomic++; - - for_each_online_member_rcu(c, ca) { - prt_printf(&buf, "Dev %u:\n", ca->dev_idx); - printbuf_indent_add(&buf, 2); - bch2_dev_alloc_debug_to_text(&buf, ca); - printbuf_indent_sub(&buf, 2); - prt_newline(&buf); - } - + scoped_guard(rcu) + for_each_online_member_rcu(c, ca) { + prt_printf(&buf, "Dev %u:\n", ca->dev_idx); + printbuf_indent_add(&buf, 2); + bch2_dev_alloc_debug_to_text(&buf, ca); + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + } --buf.atomic; - rcu_read_unlock(); prt_printf(&buf, "Copygc debug:\n"); printbuf_indent_add(&buf, 2); diff --git a/fs/bcachefs/alloc_foreground.h b/fs/bcachefs/alloc_foreground.h index 2e01c7b61ed1..1b3fc8460096 100644 --- a/fs/bcachefs/alloc_foreground.h +++ b/fs/bcachefs/alloc_foreground.h @@ -42,6 +42,7 @@ struct alloc_request { struct bch_devs_mask devs_may_alloc; /* bch2_bucket_alloc_set_trans(): */ + struct dev_alloc_list devs_sorted; struct bch_dev_usage usage; /* bch2_bucket_alloc_trans(): */ @@ -71,9 +72,10 @@ struct alloc_request { struct bch_devs_mask scratch_devs_may_alloc; }; -struct dev_alloc_list bch2_dev_alloc_list(struct bch_fs *, - struct dev_stripe_state *, - struct bch_devs_mask *); +void bch2_dev_alloc_list(struct bch_fs *, + struct dev_stripe_state *, + struct bch_devs_mask *, + struct dev_alloc_list *); void bch2_dev_stripe_increment(struct bch_dev *, struct dev_stripe_state *); static inline struct bch_dev *ob_dev(struct bch_fs *c, struct open_bucket *ob) diff --git a/fs/bcachefs/backpointers.c b/fs/bcachefs/backpointers.c index cde7dd115267..e76809e71858 100644 --- a/fs/bcachefs/backpointers.c +++ b/fs/bcachefs/backpointers.c @@ -48,17 +48,19 @@ void bch2_backpointer_to_text(struct printbuf *out, struct bch_fs *c, struct bke { struct bkey_s_c_backpointer bp = bkey_s_c_to_backpointer(k); - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); - if (ca) { - u32 bucket_offset; - struct bpos bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); - rcu_read_unlock(); + struct bch_dev *ca; + u32 bucket_offset; + struct bpos bucket; + scoped_guard(rcu) { + ca = bch2_dev_rcu_noerror(c, bp.k->p.inode); + if (ca) + bucket = bp_pos_to_bucket_and_offset(ca, bp.k->p, &bucket_offset); + } + + if (ca) prt_printf(out, "bucket=%llu:%llu:%u ", bucket.inode, bucket.offset, bucket_offset); - } else { - rcu_read_unlock(); + else prt_printf(out, 
"sector=%llu:%llu ", bp.k->p.inode, bp.k->p.offset >> MAX_EXTENT_COMPRESS_RATIO_SHIFT); - } bch2_btree_id_level_to_text(out, bp.v->btree_id, bp.v->level); prt_str(out, " data_type="); @@ -140,7 +142,7 @@ static noinline int backpointer_mod_err(struct btree_trans *trans, } if (!will_check && __bch2_inconsistent_error(c, &buf)) - ret = -BCH_ERR_erofs_unfixed_errors; + ret = bch_err_throw(c, erofs_unfixed_errors); bch_err(c, "%s", buf.buf); printbuf_exit(&buf); @@ -293,7 +295,7 @@ static struct btree *__bch2_backpointer_get_node(struct btree_trans *trans, return b; if (btree_node_will_make_reachable(b)) { - b = ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node); + b = ERR_PTR(bch_err_throw(c, backpointer_to_overwritten_btree_node)); } else { int ret = backpointer_target_not_found(trans, bp, bkey_i_to_s_c(&b->key), last_flushed, commit); @@ -351,7 +353,7 @@ static struct bkey_s_c __bch2_backpointer_get_key(struct btree_trans *trans, return ret ? bkey_s_c_err(ret) : bkey_s_c_null; } else { struct btree *b = __bch2_backpointer_get_node(trans, bp, iter, last_flushed, commit); - if (b == ERR_PTR(-BCH_ERR_backpointer_to_overwritten_btree_node)) + if (b == ERR_PTR(bch_err_throw(c, backpointer_to_overwritten_btree_node))) return bkey_s_c_null; if (IS_ERR_OR_NULL(b)) return ((struct bkey_s_c) { .k = ERR_CAST(b) }); @@ -591,6 +593,7 @@ check_existing_bp: bkey_for_each_ptr(other_extent_ptrs, ptr) if (ptr->dev == bp->k.p.inode && dev_ptr_stale_rcu(ca, ptr)) { + rcu_read_unlock(); ret = drop_dev_and_update(trans, other_bp.v->btree_id, other_extent, bp->k.p.inode); if (ret) @@ -648,7 +651,7 @@ check_existing_bp: prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, other_extent); bch_err(c, "%s", buf.buf); - ret = -BCH_ERR_fsck_repair_unimplemented; + ret = bch_err_throw(c, fsck_repair_unimplemented); goto err; missing: printbuf_reset(&buf); @@ -679,26 +682,23 @@ static int check_extent_to_backpointers(struct btree_trans *trans, if (p.ptr.dev == BCH_SB_MEMBER_INVALID) continue; - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); - if (!ca) { - rcu_read_unlock(); - continue; - } + bool empty; + { + /* scoped_guard() is a loop, so it breaks continue */ + guard(rcu)(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, p.ptr.dev); + if (!ca) + continue; - if (p.ptr.cached && dev_ptr_stale_rcu(ca, &p.ptr)) { - rcu_read_unlock(); - continue; - } + if (p.ptr.cached && dev_ptr_stale_rcu(ca, &p.ptr)) + continue; - u64 b = PTR_BUCKET_NR(ca, &p.ptr); - if (!bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b)) { - rcu_read_unlock(); - continue; - } + u64 b = PTR_BUCKET_NR(ca, &p.ptr); + if (!bch2_bucket_bitmap_test(&ca->bucket_backpointer_mismatch, b)) + continue; - bool empty = bch2_bucket_bitmap_test(&ca->bucket_backpointer_empty, b); - rcu_read_unlock(); + empty = bch2_bucket_bitmap_test(&ca->bucket_backpointer_empty, b); + } struct bkey_i_backpointer bp; bch2_extent_ptr_to_bp(c, btree, level, k, p, entry, &bp); @@ -953,7 +953,7 @@ static int check_bucket_backpointer_mismatch(struct btree_trans *trans, struct b sectors[ALLOC_cached] > a->cached_sectors || sectors[ALLOC_stripe] > a->stripe_sectors) { ret = check_bucket_backpointers_to_extents(trans, ca, alloc_k.k->p) ?: - -BCH_ERR_transaction_restart_nested; + bch_err_throw(c, transaction_restart_nested); goto err; } @@ -981,7 +981,7 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) case KEY_TYPE_btree_ptr_v2: { bool ret = false; - rcu_read_lock(); + guard(rcu)(); struct bpos pos = 
bkey_s_c_to_btree_ptr_v2(k).v->min_key; while (pos.inode <= k.k->p.inode) { if (pos.inode >= c->sb.nr_devices) @@ -1009,7 +1009,6 @@ static bool backpointer_node_has_missing(struct bch_fs *c, struct bkey_s_c k) next: pos = SPOS(pos.inode + 1, 0, 0); } - rcu_read_unlock(); return ret; } @@ -1352,7 +1351,7 @@ static int bch2_bucket_bitmap_set(struct bch_dev *ca, struct bucket_bitmap *b, u b->buckets = kvcalloc(BITS_TO_LONGS(ca->mi.nbuckets), sizeof(unsigned long), GFP_KERNEL); if (!b->buckets) - return -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; + return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap); } b->nr += !__test_and_set_bit(bit, b->buckets); @@ -1361,7 +1360,8 @@ static int bch2_bucket_bitmap_set(struct bch_dev *ca, struct bucket_bitmap *b, u return 0; } -int bch2_bucket_bitmap_resize(struct bucket_bitmap *b, u64 old_size, u64 new_size) +int bch2_bucket_bitmap_resize(struct bch_dev *ca, struct bucket_bitmap *b, + u64 old_size, u64 new_size) { scoped_guard(mutex, &b->lock) { if (!b->buckets) @@ -1370,7 +1370,7 @@ int bch2_bucket_bitmap_resize(struct bucket_bitmap *b, u64 old_size, u64 new_siz unsigned long *n = kvcalloc(BITS_TO_LONGS(new_size), sizeof(unsigned long), GFP_KERNEL); if (!n) - return -BCH_ERR_ENOMEM_backpointer_mismatches_bitmap; + return bch_err_throw(ca->fs, ENOMEM_backpointer_mismatches_bitmap); memcpy(n, b->buckets, BITS_TO_LONGS(min(old_size, new_size)) * sizeof(unsigned long)); diff --git a/fs/bcachefs/backpointers.h b/fs/bcachefs/backpointers.h index 6840561084ce..7e71afee1ac0 100644 --- a/fs/bcachefs/backpointers.h +++ b/fs/bcachefs/backpointers.h @@ -53,11 +53,10 @@ static inline struct bpos bp_pos_to_bucket_and_offset(const struct bch_dev *ca, static inline bool bp_pos_to_bucket_nodev_noerror(struct bch_fs *c, struct bpos bp_pos, struct bpos *bucket) { - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, bp_pos.inode); if (ca) *bucket = bp_pos_to_bucket(ca, bp_pos); - rcu_read_unlock(); return ca != NULL; } @@ -195,7 +194,7 @@ static inline bool bch2_bucket_bitmap_test(struct bucket_bitmap *b, u64 i) return bitmap && test_bit(i, bitmap); } -int bch2_bucket_bitmap_resize(struct bucket_bitmap *, u64, u64); +int bch2_bucket_bitmap_resize(struct bch_dev *, struct bucket_bitmap *, u64, u64); void bch2_bucket_bitmap_free(struct bucket_bitmap *); #endif /* _BCACHEFS_BACKPOINTERS_BACKGROUND_H */ diff --git a/fs/bcachefs/bcachefs.h b/fs/bcachefs/bcachefs.h index 7824da2af9d0..3651a296d506 100644 --- a/fs/bcachefs/bcachefs.h +++ b/fs/bcachefs/bcachefs.h @@ -183,6 +183,16 @@ #define pr_fmt(fmt) "%s() " fmt "\n", __func__ #endif +#ifdef CONFIG_BCACHEFS_DEBUG +#define ENUMERATED_REF_DEBUG +#endif + +#ifndef dynamic_fault +#define dynamic_fault(...) 0 +#endif + +#define race_fault(...) dynamic_fault("bcachefs:race") + #include <linux/backing-dev-defs.h> #include <linux/bug.h> #include <linux/bio.h> @@ -219,15 +229,30 @@ #include "time_stats.h" #include "util.h" -#ifdef CONFIG_BCACHEFS_DEBUG -#define ENUMERATED_REF_DEBUG -#endif - -#ifndef dynamic_fault -#define dynamic_fault(...) 
0 -#endif +#include "alloc_types.h" +#include "async_objs_types.h" +#include "btree_gc_types.h" +#include "btree_types.h" +#include "btree_node_scan_types.h" +#include "btree_write_buffer_types.h" +#include "buckets_types.h" +#include "buckets_waiting_for_journal_types.h" +#include "clock_types.h" +#include "disk_groups_types.h" +#include "ec_types.h" +#include "enumerated_ref_types.h" +#include "journal_types.h" +#include "keylist_types.h" +#include "quota_types.h" +#include "rebalance_types.h" +#include "recovery_passes_types.h" +#include "replicas_types.h" +#include "sb-members_types.h" +#include "subvolume_types.h" +#include "super_types.h" +#include "thread_with_file_types.h" -#define race_fault(...) dynamic_fault("bcachefs:race") +#include "trace.h" #define count_event(_c, _name) this_cpu_inc((_c)->counters[BCH_COUNTER_##_name]) @@ -380,6 +405,14 @@ do { \ pr_info(fmt, ##__VA_ARGS__); \ } while (0) +static inline int __bch2_err_trace(struct bch_fs *c, int err) +{ + trace_error_throw(c, err, _THIS_IP_); + return err; +} + +#define bch_err_throw(_c, _err) __bch2_err_trace(_c, -BCH_ERR_##_err) + /* Parameters that are useful for debugging, but should always be compiled in: */ #define BCH_DEBUG_PARAMS_ALWAYS() \ BCH_DEBUG_PARAM(key_merging_disabled, \ @@ -486,29 +519,6 @@ enum bch_time_stats { BCH_TIME_STAT_NR }; -#include "alloc_types.h" -#include "async_objs_types.h" -#include "btree_gc_types.h" -#include "btree_types.h" -#include "btree_node_scan_types.h" -#include "btree_write_buffer_types.h" -#include "buckets_types.h" -#include "buckets_waiting_for_journal_types.h" -#include "clock_types.h" -#include "disk_groups_types.h" -#include "ec_types.h" -#include "enumerated_ref_types.h" -#include "journal_types.h" -#include "keylist_types.h" -#include "quota_types.h" -#include "rebalance_types.h" -#include "recovery_passes_types.h" -#include "replicas_types.h" -#include "sb-members_types.h" -#include "subvolume_types.h" -#include "super_types.h" -#include "thread_with_file_types.h" - /* Number of nodes btree coalesce will try to coalesce at once */ #define GC_MERGE_NODES 4U diff --git a/fs/bcachefs/btree_cache.c b/fs/bcachefs/btree_cache.c index 8557cbd3d818..91e0aa796e6b 100644 --- a/fs/bcachefs/btree_cache.c +++ b/fs/bcachefs/btree_cache.c @@ -149,7 +149,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) b->data = kvmalloc(btree_buf_bytes(b), gfp); if (!b->data) - return -BCH_ERR_ENOMEM_btree_node_mem_alloc; + return bch_err_throw(c, ENOMEM_btree_node_mem_alloc); #ifdef __KERNEL__ b->aux_data = kvmalloc(btree_aux_data_bytes(b), gfp); #else @@ -162,7 +162,7 @@ static int btree_node_data_alloc(struct bch_fs *c, struct btree *b, gfp_t gfp) if (!b->aux_data) { kvfree(b->data); b->data = NULL; - return -BCH_ERR_ENOMEM_btree_node_mem_alloc; + return bch_err_throw(c, ENOMEM_btree_node_mem_alloc); } return 0; @@ -353,21 +353,21 @@ static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b, if (btree_node_noevict(b)) { bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_noevict]++; - return -BCH_ERR_ENOMEM_btree_node_reclaim; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); } if (btree_node_write_blocked(b)) { bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_blocked]++; - return -BCH_ERR_ENOMEM_btree_node_reclaim; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); } if (btree_node_will_make_reachable(b)) { bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_will_make_reachable]++; - return -BCH_ERR_ENOMEM_btree_node_reclaim; + return bch_err_throw(c, 
ENOMEM_btree_node_reclaim); } if (btree_node_dirty(b)) { if (!flush) { bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_dirty]++; - return -BCH_ERR_ENOMEM_btree_node_reclaim; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); } if (locked) { @@ -393,7 +393,7 @@ static int __btree_node_reclaim_checks(struct bch_fs *c, struct btree *b, bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_read_in_flight]++; else if (btree_node_write_in_flight(b)) bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_write_in_flight]++; - return -BCH_ERR_ENOMEM_btree_node_reclaim; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); } if (locked) @@ -424,13 +424,13 @@ retry_unlocked: if (!six_trylock_intent(&b->c.lock)) { bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_intent]++; - return -BCH_ERR_ENOMEM_btree_node_reclaim; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); } if (!six_trylock_write(&b->c.lock)) { bc->not_freed[BCH_BTREE_CACHE_NOT_FREED_lock_write]++; six_unlock_intent(&b->c.lock); - return -BCH_ERR_ENOMEM_btree_node_reclaim; + return bch_err_throw(c, ENOMEM_btree_node_reclaim); } /* recheck under lock */ @@ -682,7 +682,7 @@ int bch2_fs_btree_cache_init(struct bch_fs *c) return 0; err: - return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return bch_err_throw(c, ENOMEM_fs_btree_cache_init); } void bch2_fs_btree_cache_init_early(struct btree_cache *bc) @@ -727,7 +727,7 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure if (!cl) { trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); - return -BCH_ERR_ENOMEM_btree_cache_cannibalize_lock; + return bch_err_throw(c, ENOMEM_btree_cache_cannibalize_lock); } closure_wait(&bc->alloc_wait, cl); @@ -741,7 +741,7 @@ int bch2_btree_cache_cannibalize_lock(struct btree_trans *trans, struct closure } trace_and_count(c, btree_cache_cannibalize_lock_fail, trans); - return -BCH_ERR_btree_cache_cannibalize_lock_blocked; + return bch_err_throw(c, btree_cache_cannibalize_lock_blocked); success: trace_and_count(c, btree_cache_cannibalize_lock, trans); diff --git a/fs/bcachefs/btree_gc.c b/fs/bcachefs/btree_gc.c index 91b6395421df..9ddcbe1bda78 100644 --- a/fs/bcachefs/btree_gc.c +++ b/fs/bcachefs/btree_gc.c @@ -150,7 +150,7 @@ static int set_node_min(struct bch_fs *c, struct btree *b, struct bpos new_min) new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); if (!new) - return -BCH_ERR_ENOMEM_gc_repair_key; + return bch_err_throw(c, ENOMEM_gc_repair_key); btree_ptr_to_v2(b, new); b->data->min_key = new_min; @@ -190,7 +190,7 @@ static int set_node_max(struct bch_fs *c, struct btree *b, struct bpos new_max) new = kmalloc_array(BKEY_BTREE_PTR_U64s_MAX, sizeof(u64), GFP_KERNEL); if (!new) - return -BCH_ERR_ENOMEM_gc_repair_key; + return bch_err_throw(c, ENOMEM_gc_repair_key); btree_ptr_to_v2(b, new); b->data->max_key = new_max; @@ -935,7 +935,7 @@ static int bch2_gc_alloc_start(struct bch_fs *c) ret = genradix_prealloc(&ca->buckets_gc, ca->mi.nbuckets, GFP_KERNEL); if (ret) { bch2_dev_put(ca); - ret = -BCH_ERR_ENOMEM_gc_alloc_start; + ret = bch_err_throw(c, ENOMEM_gc_alloc_start); break; } } @@ -1093,42 +1093,41 @@ static int gc_btree_gens_key(struct btree_trans *trans, { struct bch_fs *c = trans->c; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - struct bkey_i *u; - int ret; if (unlikely(test_bit(BCH_FS_going_ro, &c->flags))) return -EROFS; - rcu_read_lock(); - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (!ca) - continue; + bool too_stale = false; + scoped_guard(rcu) { + bkey_for_each_ptr(ptrs, ptr) { + 
struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ca) + continue; - if (dev_ptr_stale(ca, ptr) > 16) { - rcu_read_unlock(); - goto update; + too_stale |= dev_ptr_stale(ca, ptr) > 16; } + + if (!too_stale) + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + if (!ca) + continue; + + u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; + if (gen_after(*gen, ptr->gen)) + *gen = ptr->gen; + } } - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (!ca) - continue; + if (too_stale) { + struct bkey_i *u = bch2_bkey_make_mut(trans, iter, &k, 0); + int ret = PTR_ERR_OR_ZERO(u); + if (ret) + return ret; - u8 *gen = &ca->oldest_gen[PTR_BUCKET_NR(ca, ptr)]; - if (gen_after(*gen, ptr->gen)) - *gen = ptr->gen; + bch2_extent_normalize(c, bkey_i_to_s(u)); } - rcu_read_unlock(); - return 0; -update: - u = bch2_bkey_make_mut(trans, iter, &k, 0); - ret = PTR_ERR_OR_ZERO(u); - if (ret) - return ret; - bch2_extent_normalize(c, bkey_i_to_s(u)); return 0; } @@ -1181,7 +1180,7 @@ int bch2_gc_gens(struct bch_fs *c) ca->oldest_gen = kvmalloc(gens->nbuckets, GFP_KERNEL); if (!ca->oldest_gen) { bch2_dev_put(ca); - ret = -BCH_ERR_ENOMEM_gc_gens; + ret = bch_err_throw(c, ENOMEM_gc_gens); goto err; } diff --git a/fs/bcachefs/btree_io.c b/fs/bcachefs/btree_io.c index 34018296053a..57eff3012a7b 100644 --- a/fs/bcachefs/btree_io.c +++ b/fs/bcachefs/btree_io.c @@ -557,7 +557,7 @@ static int __btree_err(int ret, const char *fmt, ...) { if (c->recovery.curr_pass == BCH_RECOVERY_PASS_scan_for_btree_nodes) - return -BCH_ERR_fsck_fix; + return bch_err_throw(c, fsck_fix); bool have_retry = false; int ret2; @@ -572,9 +572,9 @@ static int __btree_err(int ret, } if (!have_retry && ret == -BCH_ERR_btree_node_read_err_want_retry) - ret = -BCH_ERR_btree_node_read_err_fixable; + ret = bch_err_throw(c, btree_node_read_err_fixable); if (!have_retry && ret == -BCH_ERR_btree_node_read_err_must_retry) - ret = -BCH_ERR_btree_node_read_err_bad_node; + ret = bch_err_throw(c, btree_node_read_err_bad_node); bch2_sb_error_count(c, err_type); @@ -602,14 +602,14 @@ static int __btree_err(int ret, switch (ret) { case -BCH_ERR_btree_node_read_err_fixable: ret2 = bch2_fsck_err_opt(c, FSCK_CAN_FIX, err_type); - if (ret2 != -BCH_ERR_fsck_fix && - ret2 != -BCH_ERR_fsck_ignore) { + if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) && + !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) { ret = ret2; goto fsck_err; } if (!have_retry) - ret = -BCH_ERR_fsck_fix; + ret = bch_err_throw(c, fsck_fix); goto out; case -BCH_ERR_btree_node_read_err_bad_node: prt_str(&out, ", "); @@ -631,14 +631,14 @@ static int __btree_err(int ret, switch (ret) { case -BCH_ERR_btree_node_read_err_fixable: ret2 = __bch2_fsck_err(c, NULL, FSCK_CAN_FIX, err_type, "%s", out.buf); - if (ret2 != -BCH_ERR_fsck_fix && - ret2 != -BCH_ERR_fsck_ignore) { + if (!bch2_err_matches(ret2, BCH_ERR_fsck_fix) && + !bch2_err_matches(ret2, BCH_ERR_fsck_ignore)) { ret = ret2; goto fsck_err; } if (!have_retry) - ret = -BCH_ERR_fsck_fix; + ret = bch_err_throw(c, fsck_fix); goto out; case -BCH_ERR_btree_node_read_err_bad_node: prt_str(&out, ", "); @@ -660,7 +660,7 @@ fsck_err: failed, err_msg, \ msg, ##__VA_ARGS__); \ \ - if (_ret != -BCH_ERR_fsck_fix) { \ + if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix)) { \ ret = _ret; \ goto fsck_err; \ } \ @@ -1325,14 +1325,13 @@ int bch2_btree_node_read_done(struct bch_fs *c, struct bch_dev *ca, btree_node_reset_sib_u64s(b); - rcu_read_lock(); - bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { - 
struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev); + scoped_guard(rcu) + bkey_for_each_ptr(bch2_bkey_ptrs(bkey_i_to_s(&b->key)), ptr) { + struct bch_dev *ca2 = bch2_dev_rcu(c, ptr->dev); - if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) - set_btree_node_need_rewrite(b); - } - rcu_read_unlock(); + if (!ca2 || ca2->mi.state != BCH_MEMBER_STATE_rw) + set_btree_node_need_rewrite(b); + } if (!ptr_written) set_btree_node_need_rewrite(b); @@ -1688,7 +1687,7 @@ static int btree_node_read_all_replicas(struct bch_fs *c, struct btree *b, bool ra = kzalloc(sizeof(*ra), GFP_NOFS); if (!ra) - return -BCH_ERR_ENOMEM_btree_node_read_all_replicas; + return bch_err_throw(c, ENOMEM_btree_node_read_all_replicas); closure_init(&ra->cl, NULL); ra->c = c; @@ -1870,7 +1869,7 @@ static int __bch2_btree_root_read(struct btree_trans *trans, enum btree_id id, bch2_btree_node_hash_remove(&c->btree_cache, b); mutex_unlock(&c->btree_cache.lock); - ret = -BCH_ERR_btree_node_read_error; + ret = bch_err_throw(c, btree_node_read_error); goto err; } @@ -2020,7 +2019,7 @@ int bch2_btree_node_scrub(struct btree_trans *trans, struct bch_fs *c = trans->c; if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_node_scrub)) - return -BCH_ERR_erofs_no_writes; + return bch_err_throw(c, erofs_no_writes); struct extent_ptr_decoded pick; int ret = bch2_bkey_pick_read_device(c, k, NULL, &pick, dev); @@ -2030,7 +2029,7 @@ int bch2_btree_node_scrub(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_get_ioref(c, pick.ptr.dev, READ, BCH_DEV_READ_REF_btree_node_scrub); if (!ca) { - ret = -BCH_ERR_device_offline; + ret = bch_err_throw(c, device_offline); goto err; } @@ -2167,7 +2166,7 @@ static void btree_node_write_work(struct work_struct *work) bch2_dev_list_has_dev(wbio->wbio.failed, ptr->dev)); if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(&wbio->key))) { - ret = -BCH_ERR_btree_node_write_all_failed; + ret = bch_err_throw(c, btree_node_write_all_failed); goto err; } diff --git a/fs/bcachefs/btree_iter.c b/fs/bcachefs/btree_iter.c index b4bf4217a3fa..b78403376c07 100644 --- a/fs/bcachefs/btree_iter.c +++ b/fs/bcachefs/btree_iter.c @@ -890,8 +890,7 @@ static noinline void btree_node_mem_ptr_set(struct btree_trans *trans, static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, struct btree_path *path, - unsigned flags, - struct bkey_buf *out) + unsigned flags) { struct bch_fs *c = trans->c; struct btree_path_level *l = path_l(path); @@ -915,7 +914,7 @@ static noinline int btree_node_iter_and_journal_peek(struct btree_trans *trans, goto err; } - bch2_bkey_buf_reassemble(out, c, k); + bkey_reassemble(&trans->btree_path_down, k); if ((flags & BTREE_ITER_prefetch) && c->opts.btree_node_prefetch) @@ -926,6 +925,22 @@ err: return ret; } +static noinline_for_stack int btree_node_missing_err(struct btree_trans *trans, + struct btree_path *path) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + + prt_str(&buf, "node not found at pos "); + bch2_bpos_to_text(&buf, path->pos); + prt_str(&buf, " within parent node "); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&path_l(path)->b->key)); + + bch2_fs_fatal_error(c, "%s", buf.buf); + printbuf_exit(&buf); + return bch_err_throw(c, btree_need_topology_repair); +} + static __always_inline int btree_path_down(struct btree_trans *trans, struct btree_path *path, unsigned flags, @@ -936,51 +951,38 @@ static __always_inline int btree_path_down(struct btree_trans *trans, struct btree *b; unsigned level = path->level - 1; enum six_lock_type lock_type = 
__btree_lock_want(path, level); - struct bkey_buf tmp; int ret; EBUG_ON(!btree_node_locked(path, path->level)); - bch2_bkey_buf_init(&tmp); - if (unlikely(trans->journal_replay_not_finished)) { - ret = btree_node_iter_and_journal_peek(trans, path, flags, &tmp); + ret = btree_node_iter_and_journal_peek(trans, path, flags); if (ret) - goto err; + return ret; } else { struct bkey_packed *k = bch2_btree_node_iter_peek(&l->iter, l->b); - if (!k) { - struct printbuf buf = PRINTBUF; - - prt_str(&buf, "node not found at pos "); - bch2_bpos_to_text(&buf, path->pos); - prt_str(&buf, " within parent node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&l->b->key)); + if (unlikely(!k)) + return btree_node_missing_err(trans, path); - bch2_fs_fatal_error(c, "%s", buf.buf); - printbuf_exit(&buf); - ret = -BCH_ERR_btree_need_topology_repair; - goto err; - } - - bch2_bkey_buf_unpack(&tmp, c, l->b, k); + bch2_bkey_unpack(l->b, &trans->btree_path_down, k); - if ((flags & BTREE_ITER_prefetch) && + if (unlikely((flags & BTREE_ITER_prefetch)) && c->opts.btree_node_prefetch) { ret = btree_path_prefetch(trans, path); if (ret) - goto err; + return ret; } } - b = bch2_btree_node_get(trans, path, tmp.k, level, lock_type, trace_ip); + b = bch2_btree_node_get(trans, path, &trans->btree_path_down, + level, lock_type, trace_ip); ret = PTR_ERR_OR_ZERO(b); if (unlikely(ret)) - goto err; + return ret; - if (likely(!trans->journal_replay_not_finished && - tmp.k->k.type == KEY_TYPE_btree_ptr_v2) && - unlikely(b != btree_node_mem_ptr(tmp.k))) + if (unlikely(b != btree_node_mem_ptr(&trans->btree_path_down)) && + likely(!trans->journal_replay_not_finished && + trans->btree_path_down.k.type == KEY_TYPE_btree_ptr_v2)) btree_node_mem_ptr_set(trans, path, level + 1, b); if (btree_node_read_locked(path, level + 1)) @@ -992,9 +994,7 @@ static __always_inline int btree_path_down(struct btree_trans *trans, bch2_btree_path_level_init(trans, path, b); bch2_btree_path_verify_locks(trans, path); -err: - bch2_bkey_buf_exit(&tmp, c); - return ret; + return 0; } static int bch2_btree_path_traverse_all(struct btree_trans *trans) @@ -1006,7 +1006,7 @@ static int bch2_btree_path_traverse_all(struct btree_trans *trans) int ret = 0; if (trans->in_traverse_all) - return -BCH_ERR_transaction_restart_in_traverse_all; + return bch_err_throw(c, transaction_restart_in_traverse_all); trans->in_traverse_all = true; retry_all: @@ -3568,13 +3568,12 @@ bch2_btree_bkey_cached_common_to_text(struct printbuf *out, struct btree_bkey_cached_common *b) { struct six_lock_count c = six_lock_counts(&b->lock); - struct task_struct *owner; pid_t pid; - rcu_read_lock(); - owner = READ_ONCE(b->lock.owner); - pid = owner ? owner->pid : 0; - rcu_read_unlock(); + scoped_guard(rcu) { + struct task_struct *owner = READ_ONCE(b->lock.owner); + pid = owner ? owner->pid : 0; + } prt_printf(out, "\t%px %c ", b, b->cached ? 'c' : 'b'); bch2_btree_id_to_text(out, b->btree_id); @@ -3603,7 +3602,7 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) prt_printf(out, "%i %s\n", task ? task->pid : 0, trans->fn); /* trans->paths is rcu protected vs. 
freeing */ - rcu_read_lock(); + guard(rcu)(); out->atomic++; struct btree_path *paths = rcu_dereference(trans->paths); @@ -3646,7 +3645,6 @@ void bch2_btree_trans_to_text(struct printbuf *out, struct btree_trans *trans) } out: --out->atomic; - rcu_read_unlock(); } void bch2_fs_btree_iter_exit(struct bch_fs *c) diff --git a/fs/bcachefs/btree_iter.h b/fs/bcachefs/btree_iter.h index 2cabb5f0f484..09dd3e52622e 100644 --- a/fs/bcachefs/btree_iter.h +++ b/fs/bcachefs/btree_iter.h @@ -963,16 +963,6 @@ struct bkey_s_c bch2_btree_iter_peek_and_restart_outlined(struct btree_trans *, _p; \ }) -#define bch2_trans_run(_c, _do) \ -({ \ - struct btree_trans *trans = bch2_trans_get(_c); \ - int _ret = (_do); \ - bch2_trans_put(trans); \ - _ret; \ -}) - -#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do)) - struct btree_trans *__bch2_trans_get(struct bch_fs *, unsigned); void bch2_trans_put(struct btree_trans *); @@ -990,6 +980,27 @@ unsigned bch2_trans_get_fn_idx(const char *); __bch2_trans_get(_c, trans_fn_idx); \ }) +/* + * We don't use DEFINE_CLASS() because using a function for the constructor + * breaks bch2_trans_get()'s use of __func__ + */ +typedef struct btree_trans * class_btree_trans_t; +static inline void class_btree_trans_destructor(struct btree_trans **p) +{ + struct btree_trans *trans = *p; + bch2_trans_put(trans); +} + +#define class_btree_trans_constructor(_c) bch2_trans_get(_c) + +#define bch2_trans_run(_c, _do) \ +({ \ + CLASS(btree_trans, trans)(_c); \ + (_do); \ +}) + +#define bch2_trans_do(_c, _do) bch2_trans_run(_c, lockrestart_do(trans, _do)) + void bch2_btree_trans_to_text(struct printbuf *, struct btree_trans *); void bch2_fs_btree_iter_exit(struct bch_fs *); diff --git a/fs/bcachefs/btree_journal_iter.c b/fs/bcachefs/btree_journal_iter.c index ade3b5addd75..cf7398751644 100644 --- a/fs/bcachefs/btree_journal_iter.c +++ b/fs/bcachefs/btree_journal_iter.c @@ -292,7 +292,7 @@ int bch2_journal_key_insert_take(struct bch_fs *c, enum btree_id id, if (!new_keys.data) { bch_err(c, "%s: error allocating new key array (size %zu)", __func__, new_keys.size); - return -BCH_ERR_ENOMEM_journal_key_insert; + return bch_err_throw(c, ENOMEM_journal_key_insert); } /* Since @keys was full, there was no gap: */ @@ -331,7 +331,7 @@ int bch2_journal_key_insert(struct bch_fs *c, enum btree_id id, n = kmalloc(bkey_bytes(&k->k), GFP_KERNEL); if (!n) - return -BCH_ERR_ENOMEM_journal_key_insert; + return bch_err_throw(c, ENOMEM_journal_key_insert); bkey_copy(n, k); ret = bch2_journal_key_insert_take(c, id, level, n); @@ -457,11 +457,9 @@ static void bch2_journal_iter_advance(struct journal_iter *iter) static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) { - struct bkey_s_c ret = bkey_s_c_null; - journal_iter_verify(iter); - rcu_read_lock(); + guard(rcu)(); while (iter->idx < iter->keys->size) { struct journal_key *k = iter->keys->data + iter->idx; @@ -470,19 +468,16 @@ static struct bkey_s_c bch2_journal_iter_peek(struct journal_iter *iter) break; BUG_ON(cmp); - if (!k->overwritten) { - ret = bkey_i_to_s_c(k->k); - break; - } + if (!k->overwritten) + return bkey_i_to_s_c(k->k); if (k->overwritten_range) iter->idx = idx_to_pos(iter->keys, rcu_dereference(k->overwritten_range)->end); else bch2_journal_iter_advance(iter); } - rcu_read_unlock(); - return ret; + return bkey_s_c_null; } static void bch2_journal_iter_exit(struct journal_iter *iter) @@ -741,7 +736,7 @@ int bch2_journal_keys_sort(struct bch_fs *c) if (keys->nr * 8 > keys->size * 7) { bch_err(c, "Too 
many journal keys for slowpath; have %zu compacted, buf size %zu, processed %zu keys at seq %llu", keys->nr, keys->size, nr_read, le64_to_cpu(i->j.seq)); - return -BCH_ERR_ENOMEM_journal_keys_sort; + return bch_err_throw(c, ENOMEM_journal_keys_sort); } BUG_ON(darray_push(keys, n)); diff --git a/fs/bcachefs/btree_key_cache.c b/fs/bcachefs/btree_key_cache.c index 9da950e7eb7d..d96188b92db2 100644 --- a/fs/bcachefs/btree_key_cache.c +++ b/fs/bcachefs/btree_key_cache.c @@ -187,27 +187,23 @@ lock: static struct bkey_cached * bkey_cached_reuse(struct btree_key_cache *c) { - struct bucket_table *tbl; + + guard(rcu)(); + struct bucket_table *tbl = rht_dereference_rcu(c->table.tbl, &c->table); struct rhash_head *pos; struct bkey_cached *ck; - unsigned i; - rcu_read_lock(); - tbl = rht_dereference_rcu(c->table.tbl, &c->table); - for (i = 0; i < tbl->size; i++) + for (unsigned i = 0; i < tbl->size; i++) rht_for_each_entry_rcu(ck, pos, tbl, i, hash) { if (!test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bkey_cached_lock_for_evict(ck)) { if (bkey_cached_evict(c, ck)) - goto out; + return ck; six_unlock_write(&ck->c.lock); six_unlock_intent(&ck->c.lock); } } - ck = NULL; -out: - rcu_read_unlock(); - return ck; + return NULL; } static int btree_key_cache_create(struct btree_trans *trans, @@ -242,7 +238,7 @@ static int btree_key_cache_create(struct btree_trans *trans, if (unlikely(!ck)) { bch_err(c, "error allocating memory for key cache item, btree %s", bch2_btree_id_str(ck_path->btree_id)); - return -BCH_ERR_ENOMEM_btree_key_cache_create; + return bch_err_throw(c, ENOMEM_btree_key_cache_create); } } @@ -260,7 +256,7 @@ static int btree_key_cache_create(struct btree_trans *trans, if (unlikely(!new_k)) { bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", bch2_btree_id_str(ck->key.btree_id), key_u64s); - ret = -BCH_ERR_ENOMEM_btree_key_cache_fill; + ret = bch_err_throw(c, ENOMEM_btree_key_cache_fill); } else if (ret) { kfree(new_k); goto err; @@ -826,20 +822,20 @@ int bch2_fs_btree_key_cache_init(struct btree_key_cache *bc) bc->nr_pending = alloc_percpu(size_t); if (!bc->nr_pending) - return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return bch_err_throw(c, ENOMEM_fs_btree_cache_init); if (rcu_pending_init(&bc->pending[0], &c->btree_trans_barrier, __bkey_cached_free) || rcu_pending_init(&bc->pending[1], &c->btree_trans_barrier, __bkey_cached_free)) - return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return bch_err_throw(c, ENOMEM_fs_btree_cache_init); if (rhashtable_init(&bc->table, &bch2_btree_key_cache_params)) - return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return bch_err_throw(c, ENOMEM_fs_btree_cache_init); bc->table_init_done = true; shrink = shrinker_alloc(0, "%s-btree_key_cache", c->name); if (!shrink) - return -BCH_ERR_ENOMEM_fs_btree_cache_init; + return bch_err_throw(c, ENOMEM_fs_btree_cache_init); bc->shrink = shrink; shrink->count_objects = bch2_btree_key_cache_count; shrink->scan_objects = bch2_btree_key_cache_scan; diff --git a/fs/bcachefs/btree_locking.c b/fs/bcachefs/btree_locking.c index 2f2aed0c9916..47035aae232e 100644 --- a/fs/bcachefs/btree_locking.c +++ b/fs/bcachefs/btree_locking.c @@ -194,6 +194,30 @@ static int btree_trans_abort_preference(struct btree_trans *trans) return 3; } +static noinline __noreturn void break_cycle_fail(struct lock_graph *g) +{ + struct printbuf buf = PRINTBUF; + buf.atomic++; + + prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks")); + + for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++) { + struct 
btree_trans *trans = i->trans; + + bch2_btree_trans_to_text(&buf, trans); + + prt_printf(&buf, "backtrace:\n"); + printbuf_indent_add(&buf, 2); + bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT); + printbuf_indent_sub(&buf, 2); + prt_newline(&buf); + } + + bch2_print_str_nonblocking(g->g->trans->c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + BUG(); +} + static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle, struct trans_waiting_for_lock *from) { @@ -219,28 +243,8 @@ static noinline int break_cycle(struct lock_graph *g, struct printbuf *cycle, } } - if (unlikely(!best)) { - struct printbuf buf = PRINTBUF; - buf.atomic++; - - prt_printf(&buf, bch2_fmt(g->g->trans->c, "cycle of nofail locks")); - - for (i = g->g; i < g->g + g->nr; i++) { - struct btree_trans *trans = i->trans; - - bch2_btree_trans_to_text(&buf, trans); - - prt_printf(&buf, "backtrace:\n"); - printbuf_indent_add(&buf, 2); - bch2_prt_task_backtrace(&buf, trans->locking_wait.task, 2, GFP_NOWAIT); - printbuf_indent_sub(&buf, 2); - prt_newline(&buf); - } - - bch2_print_str_nonblocking(g->g->trans->c, KERN_ERR, buf.buf); - printbuf_exit(&buf); - BUG(); - } + if (unlikely(!best)) + break_cycle_fail(g); ret = abort_lock(g, abort); out: @@ -255,15 +259,14 @@ static int lock_graph_descend(struct lock_graph *g, struct btree_trans *trans, struct printbuf *cycle) { struct btree_trans *orig_trans = g->g->trans; - struct trans_waiting_for_lock *i; - for (i = g->g; i < g->g + g->nr; i++) + for (struct trans_waiting_for_lock *i = g->g; i < g->g + g->nr; i++) if (i->trans == trans) { closure_put(&trans->ref); return break_cycle(g, cycle, i); } - if (g->nr == ARRAY_SIZE(g->g)) { + if (unlikely(g->nr == ARRAY_SIZE(g->g))) { closure_put(&trans->ref); if (orig_trans->lock_may_not_fail) @@ -308,7 +311,7 @@ int bch2_check_for_deadlock(struct btree_trans *trans, struct printbuf *cycle) lock_graph_down(&g, trans); /* trans->paths is rcu protected vs. 
freeing */ - rcu_read_lock(); + guard(rcu)(); if (cycle) cycle->atomic++; next: @@ -406,7 +409,6 @@ up: out: if (cycle) --cycle->atomic; - rcu_read_unlock(); return ret; } diff --git a/fs/bcachefs/btree_node_scan.c b/fs/bcachefs/btree_node_scan.c index 5a97a6b8a757..a35847734a60 100644 --- a/fs/bcachefs/btree_node_scan.c +++ b/fs/bcachefs/btree_node_scan.c @@ -363,6 +363,8 @@ static int handle_overwrites(struct bch_fs *c, min_heap_sift_down(nodes_heap, 0, &found_btree_node_heap_cbs, NULL); } } + + cond_resched(); } return 0; diff --git a/fs/bcachefs/btree_trans_commit.c b/fs/bcachefs/btree_trans_commit.c index 1c03c965d836..d9710801e3ee 100644 --- a/fs/bcachefs/btree_trans_commit.c +++ b/fs/bcachefs/btree_trans_commit.c @@ -376,7 +376,7 @@ static inline int btree_key_can_insert(struct btree_trans *trans, struct btree *b, unsigned u64s) { if (!bch2_btree_node_insert_fits(b, u64s)) - return -BCH_ERR_btree_insert_btree_node_full; + return bch_err_throw(trans->c, btree_insert_btree_node_full); return 0; } @@ -394,9 +394,10 @@ btree_key_can_insert_cached_slowpath(struct btree_trans *trans, unsigned flags, new_k = kmalloc(new_u64s * sizeof(u64), GFP_KERNEL); if (!new_k) { - bch_err(trans->c, "error allocating memory for key cache key, btree %s u64s %u", + struct bch_fs *c = trans->c; + bch_err(c, "error allocating memory for key cache key, btree %s u64s %u", bch2_btree_id_str(path->btree_id), new_u64s); - return -BCH_ERR_ENOMEM_btree_key_cache_insert; + return bch_err_throw(c, ENOMEM_btree_key_cache_insert); } ret = bch2_trans_relock(trans) ?: @@ -432,7 +433,7 @@ static int btree_key_can_insert_cached(struct btree_trans *trans, unsigned flags if (watermark < BCH_WATERMARK_reclaim && !test_bit(BKEY_CACHED_DIRTY, &ck->flags) && bch2_btree_key_cache_must_wait(c)) - return -BCH_ERR_btree_insert_need_journal_reclaim; + return bch_err_throw(c, btree_insert_need_journal_reclaim); /* * bch2_varint_decode can read past the end of the buffer by at most 7 @@ -894,7 +895,7 @@ int bch2_trans_commit_error(struct btree_trans *trans, unsigned flags, */ if ((flags & BCH_TRANS_COMMIT_journal_reclaim) && watermark < BCH_WATERMARK_reclaim) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; + ret = bch_err_throw(c, journal_reclaim_would_deadlock); goto out; } @@ -966,14 +967,27 @@ do_bch2_trans_commit_to_journal_replay(struct btree_trans *trans) for (struct jset_entry *i = btree_trans_journal_entries_start(trans); i != btree_trans_journal_entries_top(trans); - i = vstruct_next(i)) + i = vstruct_next(i)) { if (i->type == BCH_JSET_ENTRY_btree_keys || i->type == BCH_JSET_ENTRY_write_buffer_keys) { - int ret = bch2_journal_key_insert(c, i->btree_id, i->level, i->start); - if (ret) - return ret; + jset_entry_for_each_key(i, k) { + int ret = bch2_journal_key_insert(c, i->btree_id, i->level, k); + if (ret) + return ret; + } } + if (i->type == BCH_JSET_ENTRY_btree_root) { + guard(mutex)(&c->btree_root_lock); + + struct btree_root *r = bch2_btree_id_root(c, i->btree_id); + + bkey_copy(&r->key, i->start); + r->level = i->level; + r->alive = true; + } + } + for (struct bkey_i *i = btree_trans_subbuf_base(trans, &trans->accounting); i != btree_trans_subbuf_top(trans, &trans->accounting); i = bkey_next(i)) { @@ -1011,7 +1025,7 @@ int __bch2_trans_commit(struct btree_trans *trans, unsigned flags) if (unlikely(!test_bit(BCH_FS_may_go_rw, &c->flags))) ret = do_bch2_trans_commit_to_journal_replay(trans); else - ret = -BCH_ERR_erofs_trans_commit; + ret = bch_err_throw(c, erofs_trans_commit); goto out_reset; } @@ -1093,7 +1107,7 @@ 
err: * restart: */ if (flags & BCH_TRANS_COMMIT_no_journal_res) { - ret = -BCH_ERR_transaction_restart_nested; + ret = bch_err_throw(c, transaction_restart_nested); goto out; } diff --git a/fs/bcachefs/btree_types.h b/fs/bcachefs/btree_types.h index 9d641bf9d2a2..c61c4171ae50 100644 --- a/fs/bcachefs/btree_types.h +++ b/fs/bcachefs/btree_types.h @@ -555,6 +555,8 @@ struct btree_trans { unsigned journal_u64s; unsigned extra_disk_res; /* XXX kill */ + __BKEY_PADDED(btree_path_down, BKEY_BTREE_PTR_VAL_U64s_MAX); + #ifdef CONFIG_DEBUG_LOCK_ALLOC struct lockdep_map dep_map; #endif diff --git a/fs/bcachefs/btree_update.c b/fs/bcachefs/btree_update.c index 5dac09c98026..e97e78c10f49 100644 --- a/fs/bcachefs/btree_update.c +++ b/fs/bcachefs/btree_update.c @@ -123,65 +123,44 @@ static int need_whiteout_for_snapshot(struct btree_trans *trans, } int __bch2_insert_snapshot_whiteouts(struct btree_trans *trans, - enum btree_id id, - struct bpos old_pos, - struct bpos new_pos) + enum btree_id btree, struct bpos pos, + snapshot_id_list *s) { - struct bch_fs *c = trans->c; - struct btree_iter old_iter, new_iter = {}; - struct bkey_s_c old_k, new_k; - snapshot_id_list s; - struct bkey_i *update; int ret = 0; - if (!bch2_snapshot_has_children(c, old_pos.snapshot)) - return 0; - - darray_init(&s); + darray_for_each(*s, id) { + pos.snapshot = *id; - bch2_trans_iter_init(trans, &old_iter, id, old_pos, - BTREE_ITER_not_extents| - BTREE_ITER_all_snapshots); - while ((old_k = bch2_btree_iter_prev(trans, &old_iter)).k && - !(ret = bkey_err(old_k)) && - bkey_eq(old_pos, old_k.k->p)) { - struct bpos whiteout_pos = - SPOS(new_pos.inode, new_pos.offset, old_k.k->p.snapshot); - - if (!bch2_snapshot_is_ancestor(c, old_k.k->p.snapshot, old_pos.snapshot) || - snapshot_list_has_ancestor(c, &s, old_k.k->p.snapshot)) - continue; - - new_k = bch2_bkey_get_iter(trans, &new_iter, id, whiteout_pos, - BTREE_ITER_not_extents| - BTREE_ITER_intent); - ret = bkey_err(new_k); + struct btree_iter iter; + struct bkey_s_c k = bch2_bkey_get_iter(trans, &iter, btree, pos, + BTREE_ITER_not_extents| + BTREE_ITER_intent); + ret = bkey_err(k); if (ret) break; - if (new_k.k->type == KEY_TYPE_deleted) { - update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); + if (k.k->type == KEY_TYPE_deleted) { + struct bkey_i *update = bch2_trans_kmalloc(trans, sizeof(struct bkey_i)); ret = PTR_ERR_OR_ZERO(update); - if (ret) + if (ret) { + bch2_trans_iter_exit(trans, &iter); break; + } bkey_init(&update->k); - update->k.p = whiteout_pos; + update->k.p = pos; update->k.type = KEY_TYPE_whiteout; - ret = bch2_trans_update(trans, &new_iter, update, + ret = bch2_trans_update(trans, &iter, update, BTREE_UPDATE_internal_snapshot_node); } - bch2_trans_iter_exit(trans, &new_iter); + bch2_trans_iter_exit(trans, &iter); - ret = snapshot_list_add(c, &s, old_k.k->p.snapshot); if (ret) break; } - bch2_trans_iter_exit(trans, &new_iter); - bch2_trans_iter_exit(trans, &old_iter); - darray_exit(&s); + darray_exit(s); return ret; } @@ -608,7 +587,7 @@ int bch2_bkey_get_empty_slot(struct btree_trans *trans, struct btree_iter *iter, BUG_ON(k.k->type != KEY_TYPE_deleted); if (bkey_gt(k.k->p, end)) { - ret = -BCH_ERR_ENOSPC_btree_slot; + ret = bch_err_throw(trans->c, ENOSPC_btree_slot); goto err; } diff --git a/fs/bcachefs/btree_update.h b/fs/bcachefs/btree_update.h index f907eaa8b185..9feef1dc4de5 100644 --- a/fs/bcachefs/btree_update.h +++ b/fs/bcachefs/btree_update.h @@ -4,6 +4,7 @@ #include "btree_iter.h" #include "journal.h" +#include "snapshot.h" struct bch_fs; struct 
btree; @@ -74,7 +75,7 @@ static inline int bch2_btree_delete_at_buffered(struct btree_trans *trans, } int __bch2_insert_snapshot_whiteouts(struct btree_trans *, enum btree_id, - struct bpos, struct bpos); + struct bpos, snapshot_id_list *); /* * For use when splitting extents in existing snapshots: @@ -88,11 +89,20 @@ static inline int bch2_insert_snapshot_whiteouts(struct btree_trans *trans, struct bpos old_pos, struct bpos new_pos) { + BUG_ON(old_pos.snapshot != new_pos.snapshot); + if (!btree_type_has_snapshots(btree) || bkey_eq(old_pos, new_pos)) return 0; - return __bch2_insert_snapshot_whiteouts(trans, btree, old_pos, new_pos); + snapshot_id_list s; + int ret = bch2_get_snapshot_overwrites(trans, btree, old_pos, &s); + if (ret) + return ret; + + return s.nr + ? __bch2_insert_snapshot_whiteouts(trans, btree, new_pos, &s) + : 0; } int bch2_trans_update_extent_overwrite(struct btree_trans *, struct btree_iter *, diff --git a/fs/bcachefs/btree_update_interior.c b/fs/bcachefs/btree_update_interior.c index 74e65714fecd..d2ecb782919b 100644 --- a/fs/bcachefs/btree_update_interior.c +++ b/fs/bcachefs/btree_update_interior.c @@ -57,8 +57,6 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) struct bkey_buf prev; int ret = 0; - printbuf_indent_add_nextline(&buf, 2); - BUG_ON(b->key.k.type == KEY_TYPE_btree_ptr_v2 && !bpos_eq(bkey_i_to_btree_ptr_v2(&b->key)->v.min_key, b->data->min_key)); @@ -69,20 +67,23 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) if (b == btree_node_root(c, b)) { if (!bpos_eq(b->data->min_key, POS_MIN)) { - ret = __bch2_topology_error(c, &buf); - + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "btree root with incorrect min_key: "); bch2_bpos_to_text(&buf, b->data->min_key); - log_fsck_err(trans, btree_root_bad_min_key, - "btree root with incorrect min_key: %s", buf.buf); - goto out; + prt_newline(&buf); + + bch2_count_fsck_err(c, btree_root_bad_min_key, &buf); + goto err; } if (!bpos_eq(b->data->max_key, SPOS_MAX)) { - ret = __bch2_topology_error(c, &buf); + bch2_log_msg_start(c, &buf); + prt_printf(&buf, "btree root with incorrect max_key: "); bch2_bpos_to_text(&buf, b->data->max_key); - log_fsck_err(trans, btree_root_bad_max_key, - "btree root with incorrect max_key: %s", buf.buf); - goto out; + prt_newline(&buf); + + bch2_count_fsck_err(c, btree_root_bad_max_key, &buf); + goto err; } } @@ -100,19 +101,15 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) : bpos_successor(prev.k->k.p); if (!bpos_eq(expected_min, bp.v->min_key)) { - ret = __bch2_topology_error(c, &buf); - - prt_str(&buf, "end of prev node doesn't match start of next node\nin "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, " node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_str(&buf, "end of prev node doesn't match start of next node"); prt_str(&buf, "\nprev "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); prt_str(&buf, "\nnext "); bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); - log_fsck_err(trans, btree_node_topology_bad_min_key, "%s", buf.buf); - goto out; + bch2_count_fsck_err(c, btree_node_topology_bad_min_key, &buf); + goto err; } bch2_bkey_buf_reassemble(&prev, c, k); @@ -120,32 +117,34 @@ int bch2_btree_node_check_topology(struct btree_trans *trans, struct btree *b) } if (bkey_deleted(&prev.k->k)) { - ret = __bch2_topology_error(c, &buf); - - prt_str(&buf, "empty interior node\nin "); - bch2_btree_id_level_to_text(&buf, 
b->c.btree_id, b->c.level); - prt_str(&buf, " node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - - log_fsck_err(trans, btree_node_topology_empty_interior_node, "%s", buf.buf); - } else if (!bpos_eq(prev.k->k.p, b->key.k.p)) { - ret = __bch2_topology_error(c, &buf); + prt_printf(&buf, "empty interior node\n"); + bch2_count_fsck_err(c, btree_node_topology_empty_interior_node, &buf); + goto err; + } - prt_str(&buf, "last child node doesn't end at end of parent node\nin "); - bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); - prt_str(&buf, " node "); - bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); - prt_str(&buf, "\nlast key "); + if (!bpos_eq(prev.k->k.p, b->key.k.p)) { + prt_str(&buf, "last child node doesn't end at end of parent node\nchild: "); bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(prev.k)); + prt_newline(&buf); - log_fsck_err(trans, btree_node_topology_bad_max_key, "%s", buf.buf); + bch2_count_fsck_err(c, btree_node_topology_bad_max_key, &buf); + goto err; } out: -fsck_err: bch2_btree_and_journal_iter_exit(&iter); bch2_bkey_buf_exit(&prev, c); printbuf_exit(&buf); return ret; +err: + bch2_btree_id_level_to_text(&buf, b->c.btree_id, b->c.level); + prt_char(&buf, ' '); + bch2_bkey_val_to_text(&buf, c, bkey_i_to_s_c(&b->key)); + prt_newline(&buf); + + ret = __bch2_topology_error(c, &buf); + bch2_print_str(c, KERN_ERR, buf.buf); + BUG_ON(!ret); + goto out; } /* Calculate ideal packed bkey format for new btree nodes: */ @@ -685,12 +684,31 @@ static void btree_update_nodes_written(struct btree_update *as) /* * Wait for any in flight writes to finish before we free the old nodes - * on disk: + * on disk. But we haven't pinned those old nodes in the btree cache, + * they might have already been evicted. + * + * The update we're completing deleted references to those nodes from the + * btree, so we know if they've been evicted they can't be pulled back in. + * We just have to check if the nodes we have pointers to are still those + * old nodes, and haven't been reused. + * + * This can't be done locklessly because the data buffer might have been + * vmalloc allocated, and they're not RCU freed. We also need the + * __no_kmsan_checks annotation because even with the btree node read + * lock, nothing tells us that the data buffer has been initialized (if + * the btree node has been reused for a different node, and the data + * buffer swapped for a new data buffer). 
*/ for (i = 0; i < as->nr_old_nodes; i++) { b = as->old_nodes[i]; - if (btree_node_seq_matches(b, as->old_nodes_seq[i])) + bch2_trans_begin(trans); + btree_node_lock_nopath_nofail(trans, &b->c, SIX_LOCK_read); + bool seq_matches = btree_node_seq_matches(b, as->old_nodes_seq[i]); + six_unlock_read(&b->c.lock); + bch2_trans_unlock_long(trans); + + if (seq_matches) wait_on_bit_io(&b->flags, BTREE_NODE_write_in_flight_inner, TASK_UNINTERRUPTIBLE); } @@ -1245,7 +1263,7 @@ bch2_btree_update_start(struct btree_trans *trans, struct btree_path *path, if (bch2_err_matches(ret, ENOSPC) && (flags & BCH_TRANS_COMMIT_journal_reclaim) && watermark < BCH_WATERMARK_reclaim) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; + ret = bch_err_throw(c, journal_reclaim_would_deadlock); goto err; } @@ -2178,7 +2196,7 @@ static int get_iter_to_node(struct btree_trans *trans, struct btree_iter *iter, if (btree_iter_path(trans, iter)->l[b->c.level].b != b) { /* node has been freed: */ BUG_ON(!btree_node_dying(b)); - ret = -BCH_ERR_btree_node_dying; + ret = bch_err_throw(trans->c, btree_node_dying); goto err; } @@ -2792,16 +2810,16 @@ int bch2_fs_btree_interior_update_init(struct bch_fs *c) c->btree_interior_update_worker = alloc_workqueue("btree_update", WQ_UNBOUND|WQ_MEM_RECLAIM, 8); if (!c->btree_interior_update_worker) - return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; + return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init); c->btree_node_rewrite_worker = alloc_ordered_workqueue("btree_node_rewrite", WQ_UNBOUND); if (!c->btree_node_rewrite_worker) - return -BCH_ERR_ENOMEM_btree_interior_update_worker_init; + return bch_err_throw(c, ENOMEM_btree_interior_update_worker_init); if (mempool_init_kmalloc_pool(&c->btree_interior_update_pool, 1, sizeof(struct btree_update))) - return -BCH_ERR_ENOMEM_btree_interior_update_pool_init; + return bch_err_throw(c, ENOMEM_btree_interior_update_pool_init); return 0; } diff --git a/fs/bcachefs/btree_write_buffer.c b/fs/bcachefs/btree_write_buffer.c index efb0c64d0aac..90b21e61d2b6 100644 --- a/fs/bcachefs/btree_write_buffer.c +++ b/fs/bcachefs/btree_write_buffer.c @@ -394,7 +394,7 @@ static int bch2_btree_write_buffer_flush_locked(struct btree_trans *trans) bool accounting_accumulated = false; do { if (race_fault()) { - ret = -BCH_ERR_journal_reclaim_would_deadlock; + ret = bch_err_throw(c, journal_reclaim_would_deadlock); break; } @@ -633,7 +633,7 @@ int bch2_btree_write_buffer_tryflush(struct btree_trans *trans) struct bch_fs *c = trans->c; if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_btree_write_buffer)) - return -BCH_ERR_erofs_no_writes; + return bch_err_throw(c, erofs_no_writes); int ret = bch2_btree_write_buffer_flush_nocheck_rw(trans); enumerated_ref_put(&c->writes, BCH_WRITE_REF_btree_write_buffer); @@ -676,7 +676,7 @@ int bch2_btree_write_buffer_maybe_flush(struct btree_trans *trans, goto err; bch2_bkey_buf_copy(last_flushed, c, tmp.k); - ret = -BCH_ERR_transaction_restart_write_buffer_flush; + ret = bch_err_throw(c, transaction_restart_write_buffer_flush); } err: bch2_bkey_buf_exit(&tmp, c); diff --git a/fs/bcachefs/buckets.c b/fs/bcachefs/buckets.c index 09eb5a543ae4..f25903c10e8a 100644 --- a/fs/bcachefs/buckets.c +++ b/fs/bcachefs/buckets.c @@ -221,6 +221,20 @@ static int bch2_check_fix_ptr(struct btree_trans *trans, bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { if (!p.ptr.cached && data_type == BCH_DATA_btree) { + switch (g->data_type) { + case BCH_DATA_sb: + bch_err(c, "btree and superblock in the same bucket - cannot repair"); + ret 
= bch_err_throw(c, fsck_repair_unimplemented); + goto out; + case BCH_DATA_journal: + ret = bch2_dev_journal_bucket_delete(ca, PTR_BUCKET_NR(ca, &p.ptr)); + bch_err_msg(c, ret, "error deleting journal bucket %zu", + PTR_BUCKET_NR(ca, &p.ptr)); + if (ret) + goto out; + break; + } + g->data_type = data_type; g->stripe_sectors = 0; g->dirty_sectors = 0; @@ -270,6 +284,9 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, struct printbuf buf = PRINTBUF; int ret = 0; + /* We don't yet do btree key updates correctly for when we're RW */ + BUG_ON(test_bit(BCH_FS_rw, &c->flags)); + bkey_for_each_ptr_decode(k.k, ptrs_c, p, entry_c) { ret = bch2_check_fix_ptr(trans, k, p, entry_c, &do_update); if (ret) @@ -277,20 +294,13 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, } if (do_update) { - if (flags & BTREE_TRIGGER_is_root) { - bch_err(c, "cannot update btree roots yet"); - ret = -EINVAL; - goto err; - } - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); ret = PTR_ERR_OR_ZERO(new); if (ret) goto err; - rcu_read_lock(); - bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev)); - rcu_read_unlock(); + scoped_guard(rcu) + bch2_bkey_drop_ptrs(bkey_i_to_s(new), ptr, !bch2_dev_exists(c, ptr->dev)); if (level) { /* @@ -299,14 +309,11 @@ int bch2_check_fix_ptrs(struct btree_trans *trans, * sort it out: */ struct bkey_ptrs ptrs = bch2_bkey_ptrs(bkey_i_to_s(new)); - rcu_read_lock(); - bkey_for_each_ptr(ptrs, ptr) { - struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - struct bucket *g = PTR_GC_BUCKET(ca, ptr); - - ptr->gen = g->gen; - } - rcu_read_unlock(); + scoped_guard(rcu) + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + ptr->gen = PTR_GC_BUCKET(ca, ptr)->gen; + } } else { struct bkey_ptrs ptrs; union bch_extent_entry *entry; @@ -370,19 +377,41 @@ found: bch_info(c, "new key %s", buf.buf); } - struct btree_iter iter; - bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, - BTREE_ITER_intent|BTREE_ITER_all_snapshots); - ret = bch2_btree_iter_traverse(trans, &iter) ?: - bch2_trans_update(trans, &iter, new, - BTREE_UPDATE_internal_snapshot_node| - BTREE_TRIGGER_norun); - bch2_trans_iter_exit(trans, &iter); - if (ret) - goto err; + if (!(flags & BTREE_TRIGGER_is_root)) { + struct btree_iter iter; + bch2_trans_node_iter_init(trans, &iter, btree, new->k.p, 0, level, + BTREE_ITER_intent|BTREE_ITER_all_snapshots); + ret = bch2_btree_iter_traverse(trans, &iter) ?: + bch2_trans_update(trans, &iter, new, + BTREE_UPDATE_internal_snapshot_node| + BTREE_TRIGGER_norun); + bch2_trans_iter_exit(trans, &iter); + if (ret) + goto err; + + if (level) + bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); + } else { + struct jset_entry *e = bch2_trans_jset_entry_alloc(trans, + jset_u64s(new->k.u64s)); + ret = PTR_ERR_OR_ZERO(e); + if (ret) + goto err; + + journal_entry_set(e, + BCH_JSET_ENTRY_btree_root, + btree, level - 1, + new, new->k.u64s); - if (level) - bch2_btree_node_update_key_early(trans, btree, level - 1, k, new); + /* + * no locking, we're single threaded and not rw yet, see + * the big assertino above that we repeat here: + */ + BUG_ON(test_bit(BCH_FS_rw, &c->flags)); + + struct btree *b = bch2_btree_id_root(c, btree)->b; + bkey_copy(&b->key, new); + } } err: printbuf_exit(&buf); @@ -406,7 +435,15 @@ static int bucket_ref_update_err(struct btree_trans *trans, struct printbuf *buf if (insert) { bch2_trans_updates_to_text(buf, trans); __bch2_inconsistent_error(c, buf); - ret = -BCH_ERR_bucket_ref_update; + /* + * If 
we're in recovery, run_explicit_recovery_pass might give + * us an error code for rewinding recovery + */ + if (!ret) + ret = bch_err_throw(c, bucket_ref_update); + } else { + /* Always ignore overwrite errors, so that deletion works */ + ret = 0; } if (print || insert) @@ -595,7 +632,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_tryget(c, p.ptr.dev); if (unlikely(!ca)) { if (insert && p.ptr.dev != BCH_SB_MEMBER_INVALID) - ret = -BCH_ERR_trigger_pointer; + ret = bch_err_throw(c, trigger_pointer); goto err; } @@ -603,7 +640,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, if (!bucket_valid(ca, bucket.offset)) { if (insert) { bch2_dev_bucket_missing(ca, bucket.offset); - ret = -BCH_ERR_trigger_pointer; + ret = bch_err_throw(c, trigger_pointer); } goto err; } @@ -625,7 +662,7 @@ static int bch2_trigger_pointer(struct btree_trans *trans, if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n %s", p.ptr.dev, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = -BCH_ERR_trigger_pointer; + ret = bch_err_throw(c, trigger_pointer); goto err; } @@ -651,6 +688,8 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, s64 sectors, enum btree_iter_update_trigger_flags flags) { + struct bch_fs *c = trans->c; + if (flags & BTREE_TRIGGER_transactional) { struct btree_iter iter; struct bkey_i_stripe *s = bch2_bkey_get_mut_typed(trans, &iter, @@ -668,7 +707,7 @@ static int bch2_trigger_stripe_ptr(struct btree_trans *trans, bch2_trans_inconsistent(trans, "stripe pointer doesn't match stripe %llu", (u64) p.ec.idx); - ret = -BCH_ERR_trigger_stripe_pointer; + ret = bch_err_throw(c, trigger_stripe_pointer); goto err; } @@ -688,13 +727,11 @@ err: } if (flags & BTREE_TRIGGER_gc) { - struct bch_fs *c = trans->c; - struct gc_stripe *m = genradix_ptr_alloc(&c->gc_stripes, p.ec.idx, GFP_KERNEL); if (!m) { bch_err(c, "error allocating memory for gc_stripes, idx %llu", (u64) p.ec.idx); - return -BCH_ERR_ENOMEM_mark_stripe_ptr; + return bch_err_throw(c, ENOMEM_mark_stripe_ptr); } gc_stripe_lock(m); @@ -709,7 +746,7 @@ err: __bch2_inconsistent_error(c, &buf); bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); - return -BCH_ERR_trigger_stripe_pointer; + return bch_err_throw(c, trigger_stripe_pointer); } m->block_sectors[p.ec.block] += sectors; @@ -732,8 +769,7 @@ err: static int __trigger_extent(struct btree_trans *trans, enum btree_id btree_id, unsigned level, struct bkey_s_c k, - enum btree_iter_update_trigger_flags flags, - s64 *replicas_sectors) + enum btree_iter_update_trigger_flags flags) { bool gc = flags & BTREE_TRIGGER_gc; struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -744,6 +780,8 @@ static int __trigger_extent(struct btree_trans *trans, : BCH_DATA_user; int ret = 0; + s64 replicas_sectors = 0; + struct disk_accounting_pos acc_replicas_key; memset(&acc_replicas_key, 0, sizeof(acc_replicas_key)); acc_replicas_key.type = BCH_DISK_ACCOUNTING_replicas; @@ -770,7 +808,7 @@ static int __trigger_extent(struct btree_trans *trans, if (ret) return ret; } else if (!p.has_ec) { - *replicas_sectors += disk_sectors; + replicas_sectors += disk_sectors; replicas_entry_add_dev(&acc_replicas_key.replicas, p.ptr.dev); } else { ret = bch2_trigger_stripe_ptr(trans, k, p, data_type, disk_sectors, flags); @@ -808,13 +846,13 @@ static int __trigger_extent(struct btree_trans *trans, } if (acc_replicas_key.replicas.nr_devs) { - ret = bch2_disk_accounting_mod(trans, &acc_replicas_key, replicas_sectors, 1, gc); + ret = 
bch2_disk_accounting_mod(trans, &acc_replicas_key, &replicas_sectors, 1, gc); if (ret) return ret; } if (acc_replicas_key.replicas.nr_devs && !level && k.k->p.snapshot) { - ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, snapshot, k.k->p.snapshot); + ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, snapshot, k.k->p.snapshot); if (ret) return ret; } @@ -830,7 +868,7 @@ static int __trigger_extent(struct btree_trans *trans, } if (level) { - ret = bch2_disk_accounting_mod2_nr(trans, gc, replicas_sectors, 1, btree, btree_id); + ret = bch2_disk_accounting_mod2_nr(trans, gc, &replicas_sectors, 1, btree, btree_id); if (ret) return ret; } else { @@ -839,7 +877,7 @@ static int __trigger_extent(struct btree_trans *trans, s64 v[3] = { insert ? 1 : -1, insert ? k.k->size : -((s64) k.k->size), - *replicas_sectors, + replicas_sectors, }; ret = bch2_disk_accounting_mod2(trans, gc, v, inum, k.k->p.inode); if (ret) @@ -871,20 +909,16 @@ int bch2_trigger_extent(struct btree_trans *trans, return 0; if (flags & (BTREE_TRIGGER_transactional|BTREE_TRIGGER_gc)) { - s64 old_replicas_sectors = 0, new_replicas_sectors = 0; - if (old.k->type) { int ret = __trigger_extent(trans, btree, level, old, - flags & ~BTREE_TRIGGER_insert, - &old_replicas_sectors); + flags & ~BTREE_TRIGGER_insert); if (ret) return ret; } if (new.k->type) { int ret = __trigger_extent(trans, btree, level, new.s_c, - flags & ~BTREE_TRIGGER_overwrite, - &new_replicas_sectors); + flags & ~BTREE_TRIGGER_overwrite); if (ret) return ret; } @@ -971,15 +1005,16 @@ static int __bch2_trans_mark_metadata_bucket(struct btree_trans *trans, bch2_data_type_str(type), bch2_data_type_str(type)); - bool print = bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf); + bch2_count_fsck_err(c, bucket_metadata_type_mismatch, &buf); - bch2_run_explicit_recovery_pass(c, &buf, + ret = bch2_run_explicit_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_allocations, 0); - if (print) - bch2_print_str(c, KERN_ERR, buf.buf); + /* Always print, this is always fatal */ + bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); - ret = -BCH_ERR_metadata_bucket_inconsistency; + if (!ret) + ret = bch_err_throw(c, metadata_bucket_inconsistency); goto err; } @@ -1032,7 +1067,7 @@ static int bch2_mark_metadata_bucket(struct btree_trans *trans, struct bch_dev * err_unlock: bucket_unlock(g); err: - return -BCH_ERR_metadata_bucket_inconsistency; + return bch_err_throw(c, metadata_bucket_inconsistency); } int bch2_trans_mark_metadata_bucket(struct btree_trans *trans, @@ -1247,7 +1282,7 @@ recalculate: ret = 0; } else { atomic64_set(&c->sectors_available, sectors_available); - ret = -BCH_ERR_ENOSPC_disk_reservation; + ret = bch_err_throw(c, ENOSPC_disk_reservation); } mutex_unlock(&c->sectors_available_lock); @@ -1276,7 +1311,7 @@ int bch2_buckets_nouse_alloc(struct bch_fs *c) GFP_KERNEL|__GFP_ZERO); if (!ca->buckets_nouse) { bch2_dev_put(ca); - return -BCH_ERR_ENOMEM_buckets_nouse; + return bch_err_throw(c, ENOMEM_buckets_nouse); } } @@ -1301,12 +1336,12 @@ int bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) lockdep_assert_held(&c->state_lock); if (resize && ca->buckets_nouse) - return -BCH_ERR_no_resize_with_buckets_nouse; + return bch_err_throw(c, no_resize_with_buckets_nouse); bucket_gens = bch2_kvmalloc(struct_size(bucket_gens, b, nbuckets), GFP_KERNEL|__GFP_ZERO); if (!bucket_gens) { - ret = -BCH_ERR_ENOMEM_bucket_gens; + ret = bch_err_throw(c, ENOMEM_bucket_gens); goto err; } @@ -1325,9 +1360,9 @@ int 
bch2_dev_buckets_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) sizeof(bucket_gens->b[0]) * copy); } - ret = bch2_bucket_bitmap_resize(&ca->bucket_backpointer_mismatch, + ret = bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_mismatch, ca->mi.nbuckets, nbuckets) ?: - bch2_bucket_bitmap_resize(&ca->bucket_backpointer_empty, + bch2_bucket_bitmap_resize(ca, &ca->bucket_backpointer_empty, ca->mi.nbuckets, nbuckets); rcu_assign_pointer(ca->bucket_gens, bucket_gens); @@ -1354,7 +1389,7 @@ int bch2_dev_buckets_alloc(struct bch_fs *c, struct bch_dev *ca) { ca->usage = alloc_percpu(struct bch_dev_usage_full); if (!ca->usage) - return -BCH_ERR_ENOMEM_usage_init; + return bch_err_throw(c, ENOMEM_usage_init); return bch2_dev_buckets_resize(c, ca, ca->mi.nbuckets); } diff --git a/fs/bcachefs/buckets.h b/fs/bcachefs/buckets.h index af1532de4a37..49a3807a5eab 100644 --- a/fs/bcachefs/buckets.h +++ b/fs/bcachefs/buckets.h @@ -84,10 +84,8 @@ static inline int bucket_gen_get_rcu(struct bch_dev *ca, size_t b) static inline int bucket_gen_get(struct bch_dev *ca, size_t b) { - rcu_read_lock(); - int ret = bucket_gen_get_rcu(ca, b); - rcu_read_unlock(); - return ret; + guard(rcu)(); + return bucket_gen_get_rcu(ca, b); } static inline size_t PTR_BUCKET_NR(const struct bch_dev *ca, @@ -156,10 +154,8 @@ static inline int dev_ptr_stale_rcu(struct bch_dev *ca, const struct bch_extent_ */ static inline int dev_ptr_stale(struct bch_dev *ca, const struct bch_extent_ptr *ptr) { - rcu_read_lock(); - int ret = dev_ptr_stale_rcu(ca, ptr); - rcu_read_unlock(); - return ret; + guard(rcu)(); + return dev_ptr_stale_rcu(ca, ptr); } /* Device usage: */ diff --git a/fs/bcachefs/buckets_waiting_for_journal.c b/fs/bcachefs/buckets_waiting_for_journal.c index c8a488e6b7b8..832eff93acb6 100644 --- a/fs/bcachefs/buckets_waiting_for_journal.c +++ b/fs/bcachefs/buckets_waiting_for_journal.c @@ -108,7 +108,8 @@ int bch2_set_bucket_needs_journal_commit(struct buckets_waiting_for_journal *b, realloc: n = kvmalloc(sizeof(*n) + (sizeof(n->d[0]) << new_bits), GFP_KERNEL); if (!n) { - ret = -BCH_ERR_ENOMEM_buckets_waiting_for_journal_set; + struct bch_fs *c = container_of(b, struct bch_fs, buckets_waiting_for_journal); + ret = bch_err_throw(c, ENOMEM_buckets_waiting_for_journal_set); goto out; } diff --git a/fs/bcachefs/chardev.c b/fs/bcachefs/chardev.c index 4066946b26bc..2d38466eddfd 100644 --- a/fs/bcachefs/chardev.c +++ b/fs/bcachefs/chardev.c @@ -613,15 +613,12 @@ static long bch2_ioctl_disk_get_idx(struct bch_fs *c, if (!dev) return -EINVAL; - rcu_read_lock(); + guard(rcu)(); for_each_online_member_rcu(c, ca) - if (ca->dev == dev) { - rcu_read_unlock(); + if (ca->dev == dev) return ca->dev_idx; - } - rcu_read_unlock(); - return -BCH_ERR_ENOENT_dev_idx_not_found; + return bch_err_throw(c, ENOENT_dev_idx_not_found); } static long bch2_ioctl_disk_resize(struct bch_fs *c, diff --git a/fs/bcachefs/checksum.c b/fs/bcachefs/checksum.c index d3e2e4f776c6..a6795e73f0b9 100644 --- a/fs/bcachefs/checksum.c +++ b/fs/bcachefs/checksum.c @@ -173,7 +173,7 @@ int bch2_encrypt(struct bch_fs *c, unsigned type, if (bch2_fs_inconsistent_on(!c->chacha20_key_set, c, "attempting to encrypt without encryption key")) - return -BCH_ERR_no_encryption_key; + return bch_err_throw(c, no_encryption_key); bch2_chacha20(&c->chacha20_key, nonce, data, len); return 0; @@ -262,7 +262,7 @@ int __bch2_encrypt_bio(struct bch_fs *c, unsigned type, if (bch2_fs_inconsistent_on(!c->chacha20_key_set, c, "attempting to encrypt without encryption key")) - return 
-BCH_ERR_no_encryption_key; + return bch_err_throw(c, no_encryption_key); bch2_chacha20_init(&chacha_state, &c->chacha20_key, nonce); @@ -375,7 +375,7 @@ int bch2_rechecksum_bio(struct bch_fs *c, struct bio *bio, prt_str(&buf, ")"); WARN_RATELIMIT(1, "%s", buf.buf); printbuf_exit(&buf); - return -BCH_ERR_recompute_checksum; + return bch_err_throw(c, recompute_checksum); } for (i = splits; i < splits + ARRAY_SIZE(splits); i++) { @@ -659,7 +659,7 @@ int bch2_enable_encryption(struct bch_fs *c, bool keyed) crypt = bch2_sb_field_resize(&c->disk_sb, crypt, sizeof(*crypt) / sizeof(u64)); if (!crypt) { - ret = -BCH_ERR_ENOSPC_sb_crypt; + ret = bch_err_throw(c, ENOSPC_sb_crypt); goto err; } diff --git a/fs/bcachefs/clock.c b/fs/bcachefs/clock.c index d6dd12d74d4f..8e9264b5a84e 100644 --- a/fs/bcachefs/clock.c +++ b/fs/bcachefs/clock.c @@ -53,7 +53,6 @@ void bch2_io_timer_del(struct io_clock *clock, struct io_timer *timer) struct io_clock_wait { struct io_timer io_timer; - struct timer_list cpu_timer; struct task_struct *task; int expired; }; @@ -67,15 +66,6 @@ static void io_clock_wait_fn(struct io_timer *timer) wake_up_process(wait->task); } -static void io_clock_cpu_timeout(struct timer_list *timer) -{ - struct io_clock_wait *wait = container_of(timer, - struct io_clock_wait, cpu_timer); - - wait->expired = 1; - wake_up_process(wait->task); -} - void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until) { struct io_clock_wait wait = { @@ -90,8 +80,8 @@ void bch2_io_clock_schedule_timeout(struct io_clock *clock, u64 until) bch2_io_timer_del(clock, &wait.io_timer); } -void bch2_kthread_io_clock_wait(struct io_clock *clock, - u64 io_until, unsigned long cpu_timeout) +unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *clock, + u64 io_until, unsigned long cpu_timeout) { bool kthread = (current->flags & PF_KTHREAD) != 0; struct io_clock_wait wait = { @@ -103,27 +93,26 @@ void bch2_kthread_io_clock_wait(struct io_clock *clock, bch2_io_timer_add(clock, &wait.io_timer); - timer_setup_on_stack(&wait.cpu_timer, io_clock_cpu_timeout, 0); - - if (cpu_timeout != MAX_SCHEDULE_TIMEOUT) - mod_timer(&wait.cpu_timer, cpu_timeout + jiffies); - - do { - set_current_state(TASK_INTERRUPTIBLE); - if (kthread && kthread_should_stop()) - break; - - if (wait.expired) - break; - - schedule(); + set_current_state(TASK_INTERRUPTIBLE); + if (!(kthread && kthread_should_stop())) { + cpu_timeout = schedule_timeout(cpu_timeout); try_to_freeze(); - } while (0); + } __set_current_state(TASK_RUNNING); - timer_delete_sync(&wait.cpu_timer); - destroy_timer_on_stack(&wait.cpu_timer); bch2_io_timer_del(clock, &wait.io_timer); + return cpu_timeout; +} + +void bch2_kthread_io_clock_wait(struct io_clock *clock, + u64 io_until, unsigned long cpu_timeout) +{ + bool kthread = (current->flags & PF_KTHREAD) != 0; + + while (!(kthread && kthread_should_stop()) && + cpu_timeout && + atomic64_read(&clock->now) < io_until) + cpu_timeout = bch2_kthread_io_clock_wait_once(clock, io_until, cpu_timeout); } static struct io_timer *get_expired_timer(struct io_clock *clock, u64 now) diff --git a/fs/bcachefs/clock.h b/fs/bcachefs/clock.h index 82c79c8baf92..8769be2aa21e 100644 --- a/fs/bcachefs/clock.h +++ b/fs/bcachefs/clock.h @@ -4,6 +4,7 @@ void bch2_io_timer_add(struct io_clock *, struct io_timer *); void bch2_io_timer_del(struct io_clock *, struct io_timer *); +unsigned long bch2_kthread_io_clock_wait_once(struct io_clock *, u64, unsigned long); void bch2_kthread_io_clock_wait(struct io_clock *, u64, unsigned long); void 
__bch2_increment_clock(struct io_clock *, u64); diff --git a/fs/bcachefs/compress.c b/fs/bcachefs/compress.c index 1bca61d17092..b37b1f325f0a 100644 --- a/fs/bcachefs/compress.c +++ b/fs/bcachefs/compress.c @@ -187,7 +187,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, __bch2_compression_types[crc.compression_type])) ret = bch2_check_set_has_compressed_data(c, opt); else - ret = -BCH_ERR_compression_workspace_not_initialized; + ret = bch_err_throw(c, compression_workspace_not_initialized); if (ret) goto err; } @@ -200,7 +200,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, ret2 = LZ4_decompress_safe_partial(src_data.b, dst_data, src_len, dst_len, dst_len); if (ret2 != dst_len) - ret = -BCH_ERR_decompress_lz4; + ret = bch_err_throw(c, decompress_lz4); break; case BCH_COMPRESSION_TYPE_gzip: { z_stream strm = { @@ -219,7 +219,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, mempool_free(workspace, workspace_pool); if (ret2 != Z_STREAM_END) - ret = -BCH_ERR_decompress_gzip; + ret = bch_err_throw(c, decompress_gzip); break; } case BCH_COMPRESSION_TYPE_zstd: { @@ -227,7 +227,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, size_t real_src_len = le32_to_cpup(src_data.b); if (real_src_len > src_len - 4) { - ret = -BCH_ERR_decompress_zstd_src_len_bad; + ret = bch_err_throw(c, decompress_zstd_src_len_bad); goto err; } @@ -241,7 +241,7 @@ static int __bio_uncompress(struct bch_fs *c, struct bio *src, mempool_free(workspace, workspace_pool); if (ret2 != dst_len) - ret = -BCH_ERR_decompress_zstd; + ret = bch_err_throw(c, decompress_zstd); break; } default: @@ -270,7 +270,7 @@ int bch2_bio_uncompress_inplace(struct bch_write_op *op, bch2_write_op_error(op, op->pos.offset, "extent too big to decompress (%u > %u)", crc->uncompressed_size << 9, c->opts.encoded_extent_max); - return -BCH_ERR_decompress_exceeded_max_encoded_extent; + return bch_err_throw(c, decompress_exceeded_max_encoded_extent); } data = __bounce_alloc(c, dst_len, WRITE); @@ -314,7 +314,7 @@ int bch2_bio_uncompress(struct bch_fs *c, struct bio *src, if (crc.uncompressed_size << 9 > c->opts.encoded_extent_max || crc.compressed_size << 9 > c->opts.encoded_extent_max) - return -BCH_ERR_decompress_exceeded_max_encoded_extent; + return bch_err_throw(c, decompress_exceeded_max_encoded_extent); dst_data = dst_len == dst_iter.bi_size ? 
__bio_map_or_bounce(c, dst, dst_iter, WRITE) @@ -656,12 +656,12 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) if (!mempool_initialized(&c->compression_bounce[READ]) && mempool_init_kvmalloc_pool(&c->compression_bounce[READ], 1, c->opts.encoded_extent_max)) - return -BCH_ERR_ENOMEM_compression_bounce_read_init; + return bch_err_throw(c, ENOMEM_compression_bounce_read_init); if (!mempool_initialized(&c->compression_bounce[WRITE]) && mempool_init_kvmalloc_pool(&c->compression_bounce[WRITE], 1, c->opts.encoded_extent_max)) - return -BCH_ERR_ENOMEM_compression_bounce_write_init; + return bch_err_throw(c, ENOMEM_compression_bounce_write_init); for (i = compression_types; i < compression_types + ARRAY_SIZE(compression_types); @@ -675,7 +675,7 @@ static int __bch2_fs_compress_init(struct bch_fs *c, u64 features) if (mempool_init_kvmalloc_pool( &c->compress_workspace[i->type], 1, i->compress_workspace)) - return -BCH_ERR_ENOMEM_compression_workspace_init; + return bch_err_throw(c, ENOMEM_compression_workspace_init); } return 0; diff --git a/fs/bcachefs/darray.h b/fs/bcachefs/darray.h index 50ec3decfe8c..4080ee99aadd 100644 --- a/fs/bcachefs/darray.h +++ b/fs/bcachefs/darray.h @@ -8,6 +8,7 @@ * Inspired by CCAN's darray */ +#include <linux/cleanup.h> #include <linux/slab.h> #define DARRAY_PREALLOCATED(_type, _nr) \ @@ -87,7 +88,23 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); #define darray_remove_item(_d, _pos) \ array_remove_item((_d)->data, (_d)->nr, (_pos) - (_d)->data) -#define __darray_for_each(_d, _i) \ +#define darray_find_p(_d, _i, cond) \ +({ \ + typeof((_d).data) _ret = NULL; \ + \ + darray_for_each(_d, _i) \ + if (cond) { \ + _ret = _i; \ + break; \ + } \ + _ret; \ +}) + +#define darray_find(_d, _item) darray_find_p(_d, _i, *_i == _item) + +/* Iteration: */ + +#define __darray_for_each(_d, _i) \ for ((_i) = (_d).data; _i < (_d).data + (_d).nr; _i++) #define darray_for_each(_d, _i) \ @@ -96,6 +113,8 @@ int __bch2_darray_resize_noprof(darray_char *, size_t, size_t, gfp_t); #define darray_for_each_reverse(_d, _i) \ for (typeof(&(_d).data[0]) _i = (_d).data + (_d).nr - 1; _i >= (_d).data && (_d).nr; --_i) +/* Init/exit */ + #define darray_init(_d) \ do { \ (_d)->nr = 0; \ @@ -111,4 +130,29 @@ do { \ darray_init(_d); \ } while (0) +#define DEFINE_DARRAY_CLASS(_type) \ +DEFINE_CLASS(_type, _type, darray_exit(&(_T)), (_type) {}, void) + +#define DEFINE_DARRAY(_type) \ +typedef DARRAY(_type) darray_##_type; \ +DEFINE_DARRAY_CLASS(darray_##_type) + +#define DEFINE_DARRAY_NAMED(_name, _type) \ +typedef DARRAY(_type) _name; \ +DEFINE_DARRAY_CLASS(_name) + +DEFINE_DARRAY_CLASS(darray_char); +DEFINE_DARRAY_CLASS(darray_str) +DEFINE_DARRAY_CLASS(darray_const_str) + +DEFINE_DARRAY_CLASS(darray_u8) +DEFINE_DARRAY_CLASS(darray_u16) +DEFINE_DARRAY_CLASS(darray_u32) +DEFINE_DARRAY_CLASS(darray_u64) + +DEFINE_DARRAY_CLASS(darray_s8) +DEFINE_DARRAY_CLASS(darray_s16) +DEFINE_DARRAY_CLASS(darray_s32) +DEFINE_DARRAY_CLASS(darray_s64) + #endif /* _BCACHEFS_DARRAY_H */ diff --git a/fs/bcachefs/data_update.c b/fs/bcachefs/data_update.c index c34e5b88ba9d..5f1174348974 100644 --- a/fs/bcachefs/data_update.c +++ b/fs/bcachefs/data_update.c @@ -66,37 +66,46 @@ static void bkey_nocow_unlock(struct bch_fs *c, struct bkey_s_c k) } } -static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_s_c k) +static noinline_for_stack +bool __bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs, + const struct 
bch_extent_ptr *start) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); + if (!ctxt) { + bkey_for_each_ptr(ptrs, ptr) { + if (ptr == start) + break; + + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); + bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); + } + return false; + } - bkey_for_each_ptr(ptrs, ptr) { + __bkey_for_each_ptr(start, ptrs.end, ptr) { struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - if (ctxt) { - bool locked; - - move_ctxt_wait_event(ctxt, - (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) || - list_empty(&ctxt->ios)); + bool locked; + move_ctxt_wait_event(ctxt, + (locked = bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) || + list_empty(&ctxt->ios)); + if (!locked) + bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); + } + return true; +} - if (!locked) - bch2_bucket_nocow_lock(&c->nocow_locks, bucket, 0); - } else { - if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) { - bkey_for_each_ptr(ptrs, ptr2) { - if (ptr2 == ptr) - break; +static bool bkey_nocow_lock(struct bch_fs *c, struct moving_context *ctxt, struct bkey_ptrs_c ptrs) +{ + bkey_for_each_ptr(ptrs, ptr) { + struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); + struct bpos bucket = PTR_BUCKET_POS(ca, ptr); - ca = bch2_dev_have_ref(c, ptr2->dev); - bucket = PTR_BUCKET_POS(ca, ptr2); - bch2_bucket_nocow_unlock(&c->nocow_locks, bucket, 0); - } - return false; - } - } + if (!bch2_bucket_nocow_trylock(&c->nocow_locks, bucket, 0)) + return __bkey_nocow_lock(c, ctxt, ptrs, ptr); } + return true; } @@ -246,7 +255,7 @@ static int data_update_invalid_bkey(struct data_update *m, bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); - return -BCH_ERR_invalid_bkey; + return bch_err_throw(c, invalid_bkey); } static int __bch2_data_update_index_update(struct btree_trans *trans, @@ -367,21 +376,21 @@ restart_drop_conflicting_replicas: bch2_bkey_durability(c, bkey_i_to_s_c(&new->k_i)); /* Now, drop excess replicas: */ - rcu_read_lock(); + scoped_guard(rcu) { restart_drop_extra_replicas: - bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { - unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); + bkey_for_each_ptr_decode(old.k, bch2_bkey_ptrs(bkey_i_to_s(insert)), p, entry) { + unsigned ptr_durability = bch2_extent_ptr_durability(c, &p); - if (!p.ptr.cached && - durability - ptr_durability >= m->op.opts.data_replicas) { - durability -= ptr_durability; + if (!p.ptr.cached && + durability - ptr_durability >= m->op.opts.data_replicas) { + durability -= ptr_durability; - bch2_extent_ptr_set_cached(c, &m->op.opts, - bkey_i_to_s(insert), &entry->ptr); - goto restart_drop_extra_replicas; + bch2_extent_ptr_set_cached(c, &m->op.opts, + bkey_i_to_s(insert), &entry->ptr); + goto restart_drop_extra_replicas; + } } } - rcu_read_unlock(); /* Finally, add the pointers we just wrote: */ extent_for_each_ptr_decode(extent_i_to_s(new), p, entry) @@ -523,8 +532,9 @@ void bch2_data_update_exit(struct data_update *update) bch2_bkey_buf_exit(&update->k, c); } -static int bch2_update_unwritten_extent(struct btree_trans *trans, - struct data_update *update) +static noinline_for_stack +int bch2_update_unwritten_extent(struct btree_trans *trans, + struct data_update *update) { struct bch_fs *c = update->op.c; struct bkey_i_extent *e; @@ -716,18 +726,10 @@ int bch2_extent_drop_ptrs(struct btree_trans *trans, bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); 
} -int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, - struct bch_io_opts *io_opts) +static int __bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, + struct bch_io_opts *io_opts, + unsigned buf_bytes) { - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - - /* write path might have to decompress data: */ - unsigned buf_bytes = 0; - bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry) - buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); - unsigned nr_vecs = DIV_ROUND_UP(buf_bytes, PAGE_SIZE); m->bvecs = kmalloc_array(nr_vecs, sizeof*(m->bvecs), GFP_KERNEL); @@ -751,11 +753,26 @@ int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, return 0; } +int bch2_data_update_bios_init(struct data_update *m, struct bch_fs *c, + struct bch_io_opts *io_opts) +{ + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + + /* write path might have to decompress data: */ + unsigned buf_bytes = 0; + bkey_for_each_ptr_decode(&m->k.k->k, ptrs, p, entry) + buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); + + return __bch2_data_update_bios_init(m, c, io_opts, buf_bytes); +} + static int can_write_extent(struct bch_fs *c, struct data_update *m) { if ((m->op.flags & BCH_WRITE_alloc_nowait) && unlikely(c->open_buckets_nr_free <= bch2_open_buckets_reserved(m->op.watermark))) - return -BCH_ERR_data_update_done_would_block; + return bch_err_throw(c, data_update_done_would_block); unsigned target = m->op.flags & BCH_WRITE_only_specified_devs ? m->op.target @@ -765,7 +782,8 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m) darray_for_each(m->op.devs_have, i) __clear_bit(*i, devs.d); - rcu_read_lock(); + guard(rcu)(); + unsigned nr_replicas = 0, i; for_each_set_bit(i, devs.d, BCH_SB_MEMBERS_MAX) { struct bch_dev *ca = bch2_dev_rcu_noerror(c, i); @@ -782,12 +800,11 @@ static int can_write_extent(struct bch_fs *c, struct data_update *m) if (nr_replicas >= m->op.nr_replicas) break; } - rcu_read_unlock(); if (!nr_replicas) - return -BCH_ERR_data_update_done_no_rw_devs; + return bch_err_throw(c, data_update_done_no_rw_devs); if (nr_replicas < m->op.nr_replicas) - return -BCH_ERR_insufficient_devices; + return bch_err_throw(c, insufficient_devices); return 0; } @@ -802,19 +819,21 @@ int bch2_data_update_init(struct btree_trans *trans, struct bkey_s_c k) { struct bch_fs *c = trans->c; - struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); - const union bch_extent_entry *entry; - struct extent_ptr_decoded p; - unsigned reserve_sectors = k.k->size * data_opts.extra_replicas; int ret = 0; - /* - * fs is corrupt we have a key for a snapshot node that doesn't exist, - * and we have to check for this because we go rw before repairing the - * snapshots table - just skip it, we can move it later. 
- */ - if (unlikely(k.k->p.snapshot && !bch2_snapshot_exists(c, k.k->p.snapshot))) - return -BCH_ERR_data_update_done_no_snapshot; + if (k.k->p.snapshot) { + ret = bch2_check_key_has_snapshot(trans, iter, k); + if (bch2_err_matches(ret, BCH_ERR_recovery_will_run)) { + /* Can't repair yet, waiting on other recovery passes */ + return bch_err_throw(c, data_update_done_no_snapshot); + } + if (ret < 0) + return ret; + if (ret) /* key was deleted */ + return bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + bch_err_throw(c, data_update_done_no_snapshot); + ret = 0; + } bch2_bkey_buf_init(&m->k); bch2_bkey_buf_reassemble(&m->k, c, k); @@ -842,10 +861,17 @@ int bch2_data_update_init(struct btree_trans *trans, unsigned durability_have = 0, durability_removing = 0; + struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(bkey_i_to_s_c(m->k.k)); + const union bch_extent_entry *entry; + struct extent_ptr_decoded p; + unsigned reserve_sectors = k.k->size * data_opts.extra_replicas; + unsigned buf_bytes = 0; + bool unwritten = false; + unsigned ptr_bit = 1; bkey_for_each_ptr_decode(k.k, ptrs, p, entry) { if (!p.ptr.cached) { - rcu_read_lock(); + guard(rcu)(); if (ptr_bit & m->data_opts.rewrite_ptrs) { if (crc_is_compressed(p.crc)) reserve_sectors += k.k->size; @@ -856,7 +882,6 @@ int bch2_data_update_init(struct btree_trans *trans, bch2_dev_list_add_dev(&m->op.devs_have, p.ptr.dev); durability_have += bch2_extent_ptr_durability(c, &p); } - rcu_read_unlock(); } /* @@ -872,6 +897,9 @@ int bch2_data_update_init(struct btree_trans *trans, if (p.crc.compression_type == BCH_COMPRESSION_TYPE_incompressible) m->op.incompressible = true; + buf_bytes = max_t(unsigned, buf_bytes, p.crc.uncompressed_size << 9); + unwritten |= p.ptr.unwritten; + ptr_bit <<= 1; } @@ -910,7 +938,7 @@ int bch2_data_update_init(struct btree_trans *trans, if (iter) ret = bch2_extent_drop_ptrs(trans, iter, k, io_opts, &m->data_opts); if (!ret) - ret = -BCH_ERR_data_update_done_no_writes_needed; + ret = bch_err_throw(c, data_update_done_no_writes_needed); goto out_bkey_buf_exit; } @@ -941,23 +969,25 @@ int bch2_data_update_init(struct btree_trans *trans, } if (!bkey_get_dev_refs(c, k)) { - ret = -BCH_ERR_data_update_done_no_dev_refs; + ret = bch_err_throw(c, data_update_done_no_dev_refs); goto out_put_disk_res; } if (c->opts.nocow_enabled && - !bkey_nocow_lock(c, ctxt, k)) { - ret = -BCH_ERR_nocow_lock_blocked; + !bkey_nocow_lock(c, ctxt, ptrs)) { + ret = bch_err_throw(c, nocow_lock_blocked); goto out_put_dev_refs; } - if (bkey_extent_is_unwritten(k)) { + if (unwritten) { ret = bch2_update_unwritten_extent(trans, m) ?: - -BCH_ERR_data_update_done_unwritten; + bch_err_throw(c, data_update_done_unwritten); goto out_nocow_unlock; } - ret = bch2_data_update_bios_init(m, c, io_opts); + bch2_trans_unlock(trans); + + ret = __bch2_data_update_bios_init(m, c, io_opts, buf_bytes); if (ret) goto out_nocow_unlock; diff --git a/fs/bcachefs/debug.c b/fs/bcachefs/debug.c index 4fa70634c90e..901f643ead83 100644 --- a/fs/bcachefs/debug.c +++ b/fs/bcachefs/debug.c @@ -492,6 +492,8 @@ static void bch2_cached_btree_node_to_text(struct printbuf *out, struct bch_fs * prt_printf(out, "journal pin %px:\t%llu\n", &b->writes[1].journal, b->writes[1].journal.seq); + prt_printf(out, "ob:\t%u\n", b->ob.nr); + printbuf_indent_sub(out, 2); } @@ -508,27 +510,27 @@ static ssize_t bch2_cached_btree_nodes_read(struct file *file, char __user *buf, i->ret = 0; do { - struct bucket_table *tbl; - struct rhash_head *pos; - struct btree *b; - ret = 
bch2_debugfs_flush_buf(i); if (ret) return ret; - rcu_read_lock(); i->buf.atomic++; - tbl = rht_dereference_rcu(c->btree_cache.table.tbl, - &c->btree_cache.table); - if (i->iter < tbl->size) { - rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) - bch2_cached_btree_node_to_text(&i->buf, c, b); - i->iter++; - } else { - done = true; + scoped_guard(rcu) { + struct bucket_table *tbl = + rht_dereference_rcu(c->btree_cache.table.tbl, + &c->btree_cache.table); + if (i->iter < tbl->size) { + struct rhash_head *pos; + struct btree *b; + + rht_for_each_entry_rcu(b, pos, tbl, i->iter, hash) + bch2_cached_btree_node_to_text(&i->buf, c, b); + i->iter++; + } else { + done = true; + } } --i->buf.atomic; - rcu_read_unlock(); } while (!done); if (i->buf.allocation_failure) diff --git a/fs/bcachefs/dirent.c b/fs/bcachefs/dirent.c index d198001838f3..300f7cc8abdf 100644 --- a/fs/bcachefs/dirent.c +++ b/fs/bcachefs/dirent.c @@ -231,70 +231,64 @@ void bch2_dirent_to_text(struct printbuf *out, struct bch_fs *c, struct bkey_s_c prt_printf(out, " type %s", bch2_d_type_str(d.v->d_type)); } -static struct bkey_i_dirent *dirent_alloc_key(struct btree_trans *trans, - subvol_inum dir, - u8 type, - int name_len, int cf_name_len, - u64 dst) +int bch2_dirent_init_name(struct bkey_i_dirent *dirent, + const struct bch_hash_info *hash_info, + const struct qstr *name, + const struct qstr *cf_name) { - struct bkey_i_dirent *dirent; - unsigned u64s = BKEY_U64s + dirent_val_u64s(name_len, cf_name_len); + EBUG_ON(hash_info->cf_encoding == NULL && cf_name); + int cf_len = 0; - BUG_ON(u64s > U8_MAX); - - dirent = bch2_trans_kmalloc(trans, u64s * sizeof(u64)); - if (IS_ERR(dirent)) - return dirent; + if (name->len > BCH_NAME_MAX) + return -ENAMETOOLONG; - bkey_dirent_init(&dirent->k_i); - dirent->k.u64s = u64s; + dirent->v.d_casefold = hash_info->cf_encoding != NULL; - if (type != DT_SUBVOL) { - dirent->v.d_inum = cpu_to_le64(dst); + if (!dirent->v.d_casefold) { + memcpy(&dirent->v.d_name[0], name->name, name->len); + memset(&dirent->v.d_name[name->len], 0, + bkey_val_bytes(&dirent->k) - + offsetof(struct bch_dirent, d_name) - + name->len); } else { - dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol); - dirent->v.d_child_subvol = cpu_to_le32(dst); - } +#ifdef CONFIG_UNICODE + memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len); - dirent->v.d_type = type; - dirent->v.d_unused = 0; - dirent->v.d_casefold = cf_name_len ? 
1 : 0; + char *cf_out = &dirent->v.d_cf_name_block.d_names[name->len]; - return dirent; -} + if (cf_name) { + cf_len = cf_name->len; -static void dirent_init_regular_name(struct bkey_i_dirent *dirent, - const struct qstr *name) -{ - EBUG_ON(dirent->v.d_casefold); + memcpy(cf_out, cf_name->name, cf_name->len); + } else { + cf_len = utf8_casefold(hash_info->cf_encoding, name, + cf_out, + bkey_val_end(bkey_i_to_s(&dirent->k_i)) - (void *) cf_out); + if (cf_len <= 0) + return cf_len; + } - memcpy(&dirent->v.d_name[0], name->name, name->len); - memset(&dirent->v.d_name[name->len], 0, - bkey_val_bytes(&dirent->k) - - offsetof(struct bch_dirent, d_name) - - name->len); -} + memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_len], 0, + bkey_val_bytes(&dirent->k) - + offsetof(struct bch_dirent, d_cf_name_block.d_names) - + name->len + cf_len); -static void dirent_init_casefolded_name(struct bkey_i_dirent *dirent, - const struct qstr *name, - const struct qstr *cf_name) -{ - EBUG_ON(!dirent->v.d_casefold); - EBUG_ON(!cf_name->len); - - dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len); - dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_name->len); - memcpy(&dirent->v.d_cf_name_block.d_names[0], name->name, name->len); - memcpy(&dirent->v.d_cf_name_block.d_names[name->len], cf_name->name, cf_name->len); - memset(&dirent->v.d_cf_name_block.d_names[name->len + cf_name->len], 0, - bkey_val_bytes(&dirent->k) - - offsetof(struct bch_dirent, d_cf_name_block.d_names) - - name->len + cf_name->len); - - EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_name->len); + dirent->v.d_cf_name_block.d_name_len = cpu_to_le16(name->len); + dirent->v.d_cf_name_block.d_cf_name_len = cpu_to_le16(cf_len); + + EBUG_ON(bch2_dirent_get_casefold_name(dirent_i_to_s_c(dirent)).len != cf_len); +#else + return -EOPNOTSUPP; +#endif + } + + unsigned u64s = dirent_val_u64s(name->len, cf_len); + BUG_ON(u64s > bkey_val_u64s(&dirent->k)); + set_bkey_val_u64s(&dirent->k, u64s); + return 0; } -static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, +struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *trans, const struct bch_hash_info *hash_info, subvol_inum dir, u8 type, @@ -302,31 +296,28 @@ static struct bkey_i_dirent *dirent_create_key(struct btree_trans *trans, const struct qstr *cf_name, u64 dst) { - struct bkey_i_dirent *dirent; - struct qstr _cf_name; - - if (name->len > BCH_NAME_MAX) - return ERR_PTR(-ENAMETOOLONG); + struct bkey_i_dirent *dirent = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64)); + if (IS_ERR(dirent)) + return dirent; - if (hash_info->cf_encoding && !cf_name) { - int ret = bch2_casefold(trans, hash_info, name, &_cf_name); - if (ret) - return ERR_PTR(ret); + bkey_dirent_init(&dirent->k_i); + dirent->k.u64s = BKEY_U64s_MAX; - cf_name = &_cf_name; + if (type != DT_SUBVOL) { + dirent->v.d_inum = cpu_to_le64(dst); + } else { + dirent->v.d_parent_subvol = cpu_to_le32(dir.subvol); + dirent->v.d_child_subvol = cpu_to_le32(dst); } - dirent = dirent_alloc_key(trans, dir, type, name->len, cf_name ? 
cf_name->len : 0, dst); - if (IS_ERR(dirent)) - return dirent; + dirent->v.d_type = type; + dirent->v.d_unused = 0; - if (cf_name) - dirent_init_casefolded_name(dirent, name, cf_name); - else - dirent_init_regular_name(dirent, name); + int ret = bch2_dirent_init_name(dirent, hash_info, name, cf_name); + if (ret) + return ERR_PTR(ret); EBUG_ON(bch2_dirent_get_name(dirent_i_to_s_c(dirent)).len != name->len); - return dirent; } @@ -341,7 +332,7 @@ int bch2_dirent_create_snapshot(struct btree_trans *trans, struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum); + dirent = bch2_dirent_create_key(trans, hash_info, dir_inum, type, name, NULL, dst_inum); ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; @@ -365,7 +356,7 @@ int bch2_dirent_create(struct btree_trans *trans, subvol_inum dir, struct bkey_i_dirent *dirent; int ret; - dirent = dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum); + dirent = bch2_dirent_create_key(trans, hash_info, dir, type, name, NULL, dst_inum); ret = PTR_ERR_OR_ZERO(dirent); if (ret) return ret; @@ -402,8 +393,8 @@ int bch2_dirent_read_target(struct btree_trans *trans, subvol_inum dir, } int bch2_dirent_rename(struct btree_trans *trans, - subvol_inum src_dir, struct bch_hash_info *src_hash, u64 *src_dir_i_size, - subvol_inum dst_dir, struct bch_hash_info *dst_hash, u64 *dst_dir_i_size, + subvol_inum src_dir, struct bch_hash_info *src_hash, + subvol_inum dst_dir, struct bch_hash_info *dst_hash, const struct qstr *src_name, subvol_inum *src_inum, u64 *src_offset, const struct qstr *dst_name, subvol_inum *dst_inum, u64 *dst_offset, enum bch_rename_mode mode) @@ -470,8 +461,8 @@ int bch2_dirent_rename(struct btree_trans *trans, *src_offset = dst_iter.pos.offset; /* Create new dst key: */ - new_dst = dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name, - dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0); + new_dst = bch2_dirent_create_key(trans, dst_hash, dst_dir, 0, dst_name, + dst_hash->cf_encoding ? &dst_name_lookup : NULL, 0); ret = PTR_ERR_OR_ZERO(new_dst); if (ret) goto out; @@ -481,8 +472,8 @@ int bch2_dirent_rename(struct btree_trans *trans, /* Create new src key: */ if (mode == BCH_RENAME_EXCHANGE) { - new_src = dirent_create_key(trans, src_hash, src_dir, 0, src_name, - src_hash->cf_encoding ? &src_name_lookup : NULL, 0); + new_src = bch2_dirent_create_key(trans, src_hash, src_dir, 0, src_name, + src_hash->cf_encoding ? 
&src_name_lookup : NULL, 0); ret = PTR_ERR_OR_ZERO(new_src); if (ret) goto out; @@ -542,14 +533,6 @@ int bch2_dirent_rename(struct btree_trans *trans, new_src->v.d_type == DT_SUBVOL) new_src->v.d_parent_subvol = cpu_to_le32(src_dir.subvol); - if (old_dst.k) - *dst_dir_i_size -= bkey_bytes(old_dst.k); - *src_dir_i_size -= bkey_bytes(old_src.k); - - if (mode == BCH_RENAME_EXCHANGE) - *src_dir_i_size += bkey_bytes(&new_src->k); - *dst_dir_i_size += bkey_bytes(&new_dst->k); - ret = bch2_trans_update(trans, &dst_iter, &new_dst->k_i, 0); if (ret) goto out; @@ -656,7 +639,7 @@ int bch2_empty_dir_snapshot(struct btree_trans *trans, u64 dir, u32 subvol, u32 struct bkey_s_c_dirent d = bkey_s_c_to_dirent(k); if (d.v->d_type == DT_SUBVOL && le32_to_cpu(d.v->d_parent_subvol) != subvol) continue; - ret = -BCH_ERR_ENOTEMPTY_dir_not_empty; + ret = bch_err_throw(trans->c, ENOTEMPTY_dir_not_empty); break; } bch2_trans_iter_exit(trans, &iter); @@ -692,7 +675,9 @@ static int bch2_dir_emit(struct dir_context *ctx, struct bkey_s_c_dirent d, subv return !ret; } -int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) +int bch2_readdir(struct bch_fs *c, subvol_inum inum, + struct bch_hash_info *hash_info, + struct dir_context *ctx) { struct bkey_buf sk; bch2_bkey_buf_init(&sk); @@ -710,7 +695,11 @@ int bch2_readdir(struct bch_fs *c, subvol_inum inum, struct dir_context *ctx) struct bkey_s_c_dirent dirent = bkey_i_to_s_c_dirent(sk.k); subvol_inum target; - int ret2 = bch2_dirent_read_target(trans, inum, dirent, &target); + + bool need_second_pass = false; + int ret2 = bch2_str_hash_check_key(trans, NULL, &bch2_dirent_hash_desc, + hash_info, &iter, k, &need_second_pass) ?: + bch2_dirent_read_target(trans, inum, dirent, &target); if (ret2 > 0) continue; @@ -740,7 +729,7 @@ static int lookup_first_inode(struct btree_trans *trans, u64 inode_nr, ret = bch2_inode_unpack(k, inode); goto found; } - ret = -BCH_ERR_ENOENT_inode; + ret = bch_err_throw(trans->c, ENOENT_inode); found: bch_err_msg(trans->c, ret, "fetching inode %llu", inode_nr); bch2_trans_iter_exit(trans, &iter); diff --git a/fs/bcachefs/dirent.h b/fs/bcachefs/dirent.h index d3e7ae669575..70fb0b581221 100644 --- a/fs/bcachefs/dirent.h +++ b/fs/bcachefs/dirent.h @@ -38,7 +38,7 @@ static inline int bch2_maybe_casefold(struct btree_trans *trans, } } -struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent d); +struct qstr bch2_dirent_get_name(struct bkey_s_c_dirent); static inline unsigned dirent_val_u64s(unsigned len, unsigned cf_len) { @@ -59,6 +59,14 @@ static inline void dirent_copy_target(struct bkey_i_dirent *dst, dst->v.d_type = src.v->d_type; } +int bch2_dirent_init_name(struct bkey_i_dirent *, + const struct bch_hash_info *, + const struct qstr *, + const struct qstr *); +struct bkey_i_dirent *bch2_dirent_create_key(struct btree_trans *, + const struct bch_hash_info *, subvol_inum, u8, + const struct qstr *, const struct qstr *, u64); + int bch2_dirent_create_snapshot(struct btree_trans *, u32, u64, u32, const struct bch_hash_info *, u8, const struct qstr *, u64, u64 *, @@ -80,8 +88,8 @@ enum bch_rename_mode { }; int bch2_dirent_rename(struct btree_trans *, - subvol_inum, struct bch_hash_info *, u64 *, - subvol_inum, struct bch_hash_info *, u64 *, + subvol_inum, struct bch_hash_info *, + subvol_inum, struct bch_hash_info *, const struct qstr *, subvol_inum *, u64 *, const struct qstr *, subvol_inum *, u64 *, enum bch_rename_mode); @@ -95,7 +103,7 @@ u64 bch2_dirent_lookup(struct bch_fs *, subvol_inum, int 
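The dirent.c rework above collapses dirent_alloc_key() and the two name-init helpers into bch2_dirent_create_key()/bch2_dirent_init_name(), which allocate the key at BKEY_U64s_MAX and shrink the value once the name (and, when an encoding is set, its casefolded form) has been laid out. It also adds a bch_hash_info argument to bch2_readdir() so each dirent's hash can be checked during the walk; below is a sketch of the resulting caller convention, mirroring the fs.c hunk later in this diff.

    /* Sketch of a bch2_readdir() caller under the new signature. */
    static int example_readdir(struct bch_fs *c, struct bch_inode_info *inode,
                               struct dir_context *ctx)
    {
            struct bch_hash_info hash = bch2_hash_info_init(c, &inode->ei_inode);

            return bch2_readdir(c, inode_inum(inode), &hash, ctx);
    }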
bch2_empty_dir_snapshot(struct btree_trans *, u64, u32, u32); int bch2_empty_dir_trans(struct btree_trans *, subvol_inum); -int bch2_readdir(struct bch_fs *, subvol_inum, struct dir_context *); +int bch2_readdir(struct bch_fs *, subvol_inum, struct bch_hash_info *, struct dir_context *); int bch2_fsck_remove_dirent(struct btree_trans *, struct bpos); diff --git a/fs/bcachefs/disk_accounting.c b/fs/bcachefs/disk_accounting.c index b3840ff7c407..3d59a57a5256 100644 --- a/fs/bcachefs/disk_accounting.c +++ b/fs/bcachefs/disk_accounting.c @@ -390,7 +390,7 @@ static int __bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accoun err: free_percpu(n.v[1]); free_percpu(n.v[0]); - return -BCH_ERR_ENOMEM_disk_accounting; + return bch_err_throw(c, ENOMEM_disk_accounting); } int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, @@ -401,7 +401,7 @@ int bch2_accounting_mem_insert(struct bch_fs *c, struct bkey_s_c_accounting a, if (mode != BCH_ACCOUNTING_read && accounting_to_replicas(&r.e, a.k->p) && !bch2_replicas_marked_locked(c, &r.e)) - return -BCH_ERR_btree_insert_need_mark_replicas; + return bch_err_throw(c, btree_insert_need_mark_replicas); percpu_up_read(&c->mark_lock); percpu_down_write(&c->mark_lock); @@ -419,7 +419,7 @@ int bch2_accounting_mem_insert_locked(struct bch_fs *c, struct bkey_s_c_accounti if (mode != BCH_ACCOUNTING_read && accounting_to_replicas(&r.e, a.k->p) && !bch2_replicas_marked_locked(c, &r.e)) - return -BCH_ERR_btree_insert_need_mark_replicas; + return bch_err_throw(c, btree_insert_need_mark_replicas); return __bch2_accounting_mem_insert(c, a); } @@ -559,7 +559,7 @@ int bch2_gc_accounting_start(struct bch_fs *c) sizeof(u64), GFP_KERNEL); if (!e->v[1]) { bch2_accounting_free_counters(acc, true); - ret = -BCH_ERR_ENOMEM_disk_accounting; + ret = bch_err_throw(c, ENOMEM_disk_accounting); break; } } @@ -737,7 +737,7 @@ invalid_device: bch2_disk_accounting_mod(trans, acc, v, nr, false)) ?: -BCH_ERR_remove_disk_accounting_entry; } else { - ret = -BCH_ERR_remove_disk_accounting_entry; + ret = bch_err_throw(c, remove_disk_accounting_entry); } goto fsck_err; } @@ -897,8 +897,8 @@ int bch2_accounting_read(struct bch_fs *c) case BCH_DISK_ACCOUNTING_replicas: fs_usage_data_type_to_base(usage, k.replicas.data_type, v[0]); break; - case BCH_DISK_ACCOUNTING_dev_data_type: - rcu_read_lock(); + case BCH_DISK_ACCOUNTING_dev_data_type: { + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.dev_data_type.dev); if (ca) { struct bch_dev_usage_type __percpu *d = &ca->usage->d[k.dev_data_type.data_type]; @@ -910,9 +910,9 @@ int bch2_accounting_read(struct bch_fs *c) k.dev_data_type.data_type == BCH_DATA_journal) usage->hidden += v[0] * ca->mi.bucket_size; } - rcu_read_unlock(); break; } + } } preempt_enable(); fsck_err: @@ -1006,19 +1006,18 @@ void bch2_verify_accounting_clean(struct bch_fs *c) case BCH_DISK_ACCOUNTING_replicas: fs_usage_data_type_to_base(&base, acc_k.replicas.data_type, a.v->d[0]); break; - case BCH_DISK_ACCOUNTING_dev_data_type: { - rcu_read_lock(); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); - if (!ca) { - rcu_read_unlock(); - continue; + case BCH_DISK_ACCOUNTING_dev_data_type: + { + guard(rcu)(); /* scoped guard is a loop, and doesn't play nicely with continue */ + struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); + if (!ca) + continue; + + v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets); + v[1] = 
percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors); + v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented); } - v[0] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].buckets); - v[1] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].sectors); - v[2] = percpu_u64_get(&ca->usage->d[acc_k.dev_data_type.data_type].fragmented); - rcu_read_unlock(); - if (memcmp(a.v->d, v, 3 * sizeof(u64))) { struct printbuf buf = PRINTBUF; @@ -1032,7 +1031,6 @@ void bch2_verify_accounting_clean(struct bch_fs *c) mismatch = true; } } - } 0; }))); diff --git a/fs/bcachefs/disk_accounting.h b/fs/bcachefs/disk_accounting.h index f6098e33ab30..d61abebf3e0b 100644 --- a/fs/bcachefs/disk_accounting.h +++ b/fs/bcachefs/disk_accounting.h @@ -174,17 +174,17 @@ static inline int bch2_accounting_mem_mod_locked(struct btree_trans *trans, case BCH_DISK_ACCOUNTING_replicas: fs_usage_data_type_to_base(&trans->fs_usage_delta, acc_k.replicas.data_type, a.v->d[0]); break; - case BCH_DISK_ACCOUNTING_dev_data_type: - rcu_read_lock(); + case BCH_DISK_ACCOUNTING_dev_data_type: { + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, acc_k.dev_data_type.dev); if (ca) { this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].buckets, a.v->d[0]); this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].sectors, a.v->d[1]); this_cpu_add(ca->usage->d[acc_k.dev_data_type.data_type].fragmented, a.v->d[2]); } - rcu_read_unlock(); break; } + } } unsigned idx; diff --git a/fs/bcachefs/disk_groups.c b/fs/bcachefs/disk_groups.c index c20ecf5e5381..cde842ac1886 100644 --- a/fs/bcachefs/disk_groups.c +++ b/fs/bcachefs/disk_groups.c @@ -130,7 +130,7 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) cpu_g = kzalloc(struct_size(cpu_g, entries, nr_groups), GFP_KERNEL); if (!cpu_g) - return -BCH_ERR_ENOMEM_disk_groups_to_cpu; + return bch_err_throw(c, ENOMEM_disk_groups_to_cpu); cpu_g->nr = nr_groups; @@ -170,36 +170,28 @@ int bch2_sb_disk_groups_to_cpu(struct bch_fs *c) const struct bch_devs_mask *bch2_target_to_mask(struct bch_fs *c, unsigned target) { struct target t = target_decode(target); - struct bch_devs_mask *devs; - rcu_read_lock(); + guard(rcu)(); switch (t.type) { case TARGET_NULL: - devs = NULL; - break; + return NULL; case TARGET_DEV: { struct bch_dev *ca = t.dev < c->sb.nr_devices ? rcu_dereference(c->devs[t.dev]) : NULL; - devs = ca ? &ca->self : NULL; - break; + return ca ? &ca->self : NULL; } case TARGET_GROUP: { struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); - devs = g && t.group < g->nr && !g->entries[t.group].deleted + return g && t.group < g->nr && !g->entries[t.group].deleted ? &g->entries[t.group].devs : NULL; - break; } default: BUG(); } - - rcu_read_unlock(); - - return devs; } bool bch2_dev_in_target(struct bch_fs *c, unsigned dev, unsigned target) @@ -384,7 +376,7 @@ void bch2_disk_groups_to_text(struct printbuf *out, struct bch_fs *c) bch2_printbuf_make_room(out, 4096); out->atomic++; - rcu_read_lock(); + guard(rcu)(); struct bch_disk_groups_cpu *g = rcu_dereference(c->disk_groups); for (unsigned i = 0; i < (g ? 
g->nr : 0); i++) { @@ -405,16 +397,14 @@ next: prt_newline(out); } - rcu_read_unlock(); out->atomic--; } void bch2_disk_path_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) { out->atomic++; - rcu_read_lock(); + guard(rcu)(); __bch2_disk_path_to_text(out, rcu_dereference(c->disk_groups), v), - rcu_read_unlock(); --out->atomic; } @@ -535,13 +525,11 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) switch (t.type) { case TARGET_NULL: prt_printf(out, "none"); - break; + return; case TARGET_DEV: { - struct bch_dev *ca; - out->atomic++; - rcu_read_lock(); - ca = t.dev < c->sb.nr_devices + guard(rcu)(); + struct bch_dev *ca = t.dev < c->sb.nr_devices ? rcu_dereference(c->devs[t.dev]) : NULL; @@ -552,13 +540,12 @@ void bch2_target_to_text(struct printbuf *out, struct bch_fs *c, unsigned v) else prt_printf(out, "invalid device %u", t.dev); - rcu_read_unlock(); out->atomic--; - break; + return; } case TARGET_GROUP: bch2_disk_path_to_text(out, c, t.group); - break; + return; default: BUG(); } diff --git a/fs/bcachefs/ec.c b/fs/bcachefs/ec.c index c581426e3894..543dbba9b14f 100644 --- a/fs/bcachefs/ec.c +++ b/fs/bcachefs/ec.c @@ -213,7 +213,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, a->dirty_sectors, a->stripe, s.k->p.offset, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; + ret = bch_err_throw(c, mark_stripe); goto err; } @@ -224,7 +224,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, a->dirty_sectors, a->cached_sectors, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; + ret = bch_err_throw(c, mark_stripe); goto err; } } else { @@ -234,7 +234,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, bucket.inode, bucket.offset, a->gen, a->stripe, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; + ret = bch_err_throw(c, mark_stripe); goto err; } @@ -244,7 +244,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, bch2_data_type_str(a->data_type), bch2_data_type_str(data_type), (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; + ret = bch_err_throw(c, mark_stripe); goto err; } @@ -256,7 +256,7 @@ static int __mark_stripe_bucket(struct btree_trans *trans, a->dirty_sectors, a->cached_sectors, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; + ret = bch_err_throw(c, mark_stripe); goto err; } } @@ -295,7 +295,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, struct bch_dev *ca = bch2_dev_tryget(c, ptr->dev); if (unlikely(!ca)) { if (ptr->dev != BCH_SB_MEMBER_INVALID && !(flags & BTREE_TRIGGER_overwrite)) - ret = -BCH_ERR_mark_stripe; + ret = bch_err_throw(c, mark_stripe); goto err; } @@ -325,7 +325,7 @@ static int mark_stripe_bucket(struct btree_trans *trans, if (bch2_fs_inconsistent_on(!g, c, "reference to invalid bucket on device %u\n%s", ptr->dev, (bch2_bkey_val_to_text(&buf, c, s.s_c), buf.buf))) { - ret = -BCH_ERR_mark_stripe; + ret = bch_err_throw(c, mark_stripe); goto err; } @@ -428,7 +428,7 @@ int bch2_trigger_stripe(struct btree_trans *trans, gc = genradix_ptr_alloc(&c->gc_stripes, idx, GFP_KERNEL); if (!gc) { bch_err(c, "error allocating memory for gc_stripes, idx %llu", idx); - return -BCH_ERR_ENOMEM_mark_stripe; + return bch_err_throw(c, ENOMEM_mark_stripe); } /* @@ -536,7 +536,8 @@ static void ec_stripe_buf_exit(struct ec_stripe_buf *buf) } /* XXX: this is a non-mempoolified memory allocation: */ -static int 
ec_stripe_buf_init(struct ec_stripe_buf *buf, +static int ec_stripe_buf_init(struct bch_fs *c, + struct ec_stripe_buf *buf, unsigned offset, unsigned size) { struct bch_stripe *v = &bkey_i_to_stripe(&buf->key)->v; @@ -564,7 +565,7 @@ static int ec_stripe_buf_init(struct ec_stripe_buf *buf, return 0; err: ec_stripe_buf_exit(buf); - return -BCH_ERR_ENOMEM_stripe_buf; + return bch_err_throw(c, ENOMEM_stripe_buf); } /* Checksumming: */ @@ -840,7 +841,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, buf = kzalloc(sizeof(*buf), GFP_NOFS); if (!buf) - return -BCH_ERR_ENOMEM_ec_read_extent; + return bch_err_throw(c, ENOMEM_ec_read_extent); ret = lockrestart_do(trans, get_stripe_key_trans(trans, rbio->pick.ec.idx, buf)); if (ret) { @@ -861,7 +862,7 @@ int bch2_ec_read_extent(struct btree_trans *trans, struct bch_read_bio *rbio, goto err; } - ret = ec_stripe_buf_init(buf, offset, bio_sectors(&rbio->bio)); + ret = ec_stripe_buf_init(c, buf, offset, bio_sectors(&rbio->bio)); if (ret) { msg = "-ENOMEM"; goto err; @@ -894,7 +895,7 @@ err: bch_err_ratelimited(c, "error doing reconstruct read: %s\n %s", msg, msgbuf.buf); printbuf_exit(&msgbuf); - ret = -BCH_ERR_stripe_reconstruct; + ret = bch_err_throw(c, stripe_reconstruct); goto out; } @@ -904,7 +905,7 @@ static int __ec_stripe_mem_alloc(struct bch_fs *c, size_t idx, gfp_t gfp) { if (c->gc_pos.phase != GC_PHASE_not_running && !genradix_ptr_alloc(&c->gc_stripes, idx, gfp)) - return -BCH_ERR_ENOMEM_ec_stripe_mem_alloc; + return bch_err_throw(c, ENOMEM_ec_stripe_mem_alloc); return 0; } @@ -1129,7 +1130,7 @@ static int ec_stripe_update_extent(struct btree_trans *trans, bch2_fs_inconsistent(c, "%s", buf.buf); printbuf_exit(&buf); - return -BCH_ERR_erasure_coding_found_btree_node; + return bch_err_throw(c, erasure_coding_found_btree_node); } k = bch2_backpointer_get_key(trans, bp, &iter, BTREE_ITER_intent, last_flushed); @@ -1195,7 +1196,7 @@ static int ec_stripe_update_bucket(struct btree_trans *trans, struct ec_stripe_b struct bch_dev *ca = bch2_dev_tryget(c, ptr.dev); if (!ca) - return -BCH_ERR_ENOENT_dev_not_found; + return bch_err_throw(c, ENOENT_dev_not_found); struct bpos bucket_pos = PTR_BUCKET_POS(ca, &ptr); @@ -1256,7 +1257,7 @@ static void zero_out_rest_of_ec_bucket(struct bch_fs *c, struct bch_dev *ca = bch2_dev_get_ioref(c, ob->dev, WRITE, BCH_DEV_WRITE_REF_ec_bucket_zero); if (!ca) { - s->err = -BCH_ERR_erofs_no_writes; + s->err = bch_err_throw(c, erofs_no_writes); return; } @@ -1320,7 +1321,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ec_do_recov(c, &s->existing_stripe)) { bch_err(c, "error creating stripe: error reading existing stripe"); - ret = -BCH_ERR_ec_block_read; + ret = bch_err_throw(c, ec_block_read); goto err; } @@ -1346,7 +1347,7 @@ static void ec_stripe_create(struct ec_stripe_new *s) if (ec_nr_failed(&s->new_stripe)) { bch_err(c, "error creating stripe: error writing redundancy buckets"); - ret = -BCH_ERR_ec_block_write; + ret = bch_err_throw(c, ec_block_write); goto err; } @@ -1578,26 +1579,26 @@ static struct ec_stripe_new *ec_new_stripe_alloc(struct bch_fs *c, struct ec_str static void ec_stripe_head_devs_update(struct bch_fs *c, struct ec_stripe_head *h) { struct bch_devs_mask devs = h->devs; + unsigned nr_devs, nr_devs_with_durability; - rcu_read_lock(); - h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label - ? 
group_to_target(h->disk_label - 1) - : 0); - unsigned nr_devs = dev_mask_nr(&h->devs); + scoped_guard(rcu) { + h->devs = target_rw_devs(c, BCH_DATA_user, h->disk_label + ? group_to_target(h->disk_label - 1) + : 0); + nr_devs = dev_mask_nr(&h->devs); - for_each_member_device_rcu(c, ca, &h->devs) - if (!ca->mi.durability) - __clear_bit(ca->dev_idx, h->devs.d); - unsigned nr_devs_with_durability = dev_mask_nr(&h->devs); + for_each_member_device_rcu(c, ca, &h->devs) + if (!ca->mi.durability) + __clear_bit(ca->dev_idx, h->devs.d); + nr_devs_with_durability = dev_mask_nr(&h->devs); - h->blocksize = pick_blocksize(c, &h->devs); + h->blocksize = pick_blocksize(c, &h->devs); - h->nr_active_devs = 0; - for_each_member_device_rcu(c, ca, &h->devs) - if (ca->mi.bucket_size == h->blocksize) - h->nr_active_devs++; - - rcu_read_unlock(); + h->nr_active_devs = 0; + for_each_member_device_rcu(c, ca, &h->devs) + if (ca->mi.bucket_size == h->blocksize) + h->nr_active_devs++; + } /* * If we only have redundancy + 1 devices, we're better off with just @@ -1865,7 +1866,7 @@ static int init_new_stripe_from_existing(struct bch_fs *c, struct ec_stripe_new s->nr_data = existing_v->nr_blocks - existing_v->nr_redundant; - int ret = ec_stripe_buf_init(&s->existing_stripe, 0, le16_to_cpu(existing_v->sectors)); + int ret = ec_stripe_buf_init(c, &s->existing_stripe, 0, le16_to_cpu(existing_v->sectors)); if (ret) { bch2_stripe_close(c, s); return ret; @@ -1925,7 +1926,7 @@ static int __bch2_ec_stripe_head_reuse(struct btree_trans *trans, struct ec_stri } bch2_trans_iter_exit(trans, &lru_iter); if (!ret) - ret = -BCH_ERR_stripe_alloc_blocked; + ret = bch_err_throw(c, stripe_alloc_blocked); if (ret == 1) ret = 0; if (ret) @@ -1966,7 +1967,7 @@ static int __bch2_ec_stripe_head_reserve(struct btree_trans *trans, struct ec_st continue; } - ret = -BCH_ERR_ENOSPC_stripe_create; + ret = bch_err_throw(c, ENOSPC_stripe_create); break; } @@ -2024,7 +2025,7 @@ struct ec_stripe_head *bch2_ec_stripe_head_get(struct btree_trans *trans, if (!h->s) { h->s = ec_new_stripe_alloc(c, h); if (!h->s) { - ret = -BCH_ERR_ENOMEM_ec_new_stripe_alloc; + ret = bch_err_throw(c, ENOMEM_ec_new_stripe_alloc); bch_err(c, "failed to allocate new stripe"); goto err; } @@ -2089,7 +2090,7 @@ alloc_existing: goto err; allocate_buf: - ret = ec_stripe_buf_init(&s->new_stripe, 0, h->blocksize); + ret = ec_stripe_buf_init(c, &s->new_stripe, 0, h->blocksize); if (ret) goto err; @@ -2115,6 +2116,7 @@ int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, if (k.k->type != KEY_TYPE_stripe) return 0; + struct bch_fs *c = trans->c; struct bkey_i_stripe *s = bch2_bkey_make_mut_typed(trans, iter, &k, 0, stripe); int ret = PTR_ERR_OR_ZERO(s); @@ -2141,23 +2143,22 @@ int bch2_invalidate_stripe_to_dev(struct btree_trans *trans, unsigned nr_good = 0; - rcu_read_lock(); - bkey_for_each_ptr(ptrs, ptr) { - if (ptr->dev == dev_idx) - ptr->dev = BCH_SB_MEMBER_INVALID; + scoped_guard(rcu) + bkey_for_each_ptr(ptrs, ptr) { + if (ptr->dev == dev_idx) + ptr->dev = BCH_SB_MEMBER_INVALID; - struct bch_dev *ca = bch2_dev_rcu(trans->c, ptr->dev); - nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed; - } - rcu_read_unlock(); + struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); + nr_good += ca && ca->mi.state != BCH_MEMBER_STATE_failed; + } if (nr_good < s->v.nr_blocks && !(flags & BCH_FORCE_IF_DATA_DEGRADED)) - return -BCH_ERR_remove_would_lose_data; + return bch_err_throw(c, remove_would_lose_data); unsigned nr_data = s->v.nr_blocks - s->v.nr_redundant; if (nr_good < nr_data && 
!(flags & BCH_FORCE_IF_DATA_LOST)) - return -BCH_ERR_remove_would_lose_data; + return bch_err_throw(c, remove_would_lose_data); sectors = -sectors; @@ -2178,14 +2179,15 @@ static int bch2_invalidate_stripe_to_dev_from_alloc(struct btree_trans *trans, s return 0; if (a->stripe_sectors) { - bch_err(trans->c, "trying to invalidate device in stripe when bucket has stripe data"); - return -BCH_ERR_invalidate_stripe_to_dev; + struct bch_fs *c = trans->c; + bch_err(c, "trying to invalidate device in stripe when bucket has stripe data"); + return bch_err_throw(c, invalidate_stripe_to_dev); } struct btree_iter iter; struct bkey_s_c_stripe s = bch2_bkey_get_iter_typed(trans, &iter, BTREE_ID_stripes, POS(0, a->stripe), - BTREE_ITER_slots, stripe); + BTREE_ITER_slots, stripe); int ret = bkey_err(s); if (ret) return ret; diff --git a/fs/bcachefs/errcode.c b/fs/bcachefs/errcode.c index 43557bebd0f8..c39cf304c681 100644 --- a/fs/bcachefs/errcode.c +++ b/fs/bcachefs/errcode.c @@ -13,12 +13,13 @@ static const char * const bch2_errcode_strs[] = { NULL }; -static unsigned bch2_errcode_parents[] = { +static const unsigned bch2_errcode_parents[] = { #define x(class, err) [BCH_ERR_##err - BCH_ERR_START] = class, BCH_ERRCODES() #undef x }; +__attribute__((const)) const char *bch2_err_str(int err) { const char *errstr; @@ -36,6 +37,7 @@ const char *bch2_err_str(int err) return errstr ?: "(Invalid error)"; } +__attribute__((const)) bool __bch2_err_matches(int err, int class) { err = abs(err); diff --git a/fs/bcachefs/errcode.h b/fs/bcachefs/errcode.h index 62843e772b2c..ac3264134a15 100644 --- a/fs/bcachefs/errcode.h +++ b/fs/bcachefs/errcode.h @@ -182,9 +182,12 @@ x(BCH_ERR_fsck, fsck_errors_not_fixed) \ x(BCH_ERR_fsck, fsck_repair_unimplemented) \ x(BCH_ERR_fsck, fsck_repair_impossible) \ - x(EINVAL, restart_recovery) \ - x(EINVAL, cannot_rewind_recovery) \ + x(EINVAL, recovery_will_run) \ + x(BCH_ERR_recovery_will_run, restart_recovery) \ + x(BCH_ERR_recovery_will_run, cannot_rewind_recovery) \ + x(BCH_ERR_recovery_will_run, recovery_pass_will_run) \ x(0, data_update_done) \ + x(0, bkey_was_deleted) \ x(BCH_ERR_data_update_done, data_update_done_would_block) \ x(BCH_ERR_data_update_done, data_update_done_unwritten) \ x(BCH_ERR_data_update_done, data_update_done_no_writes_needed) \ @@ -211,6 +214,8 @@ x(EINVAL, remove_would_lose_data) \ x(EINVAL, no_resize_with_buckets_nouse) \ x(EINVAL, inode_unpack_error) \ + x(EINVAL, inode_not_unlinked) \ + x(EINVAL, inode_has_child_snapshot) \ x(EINVAL, varint_decode_error) \ x(EINVAL, erasure_coding_found_btree_node) \ x(EINVAL, option_negative) \ @@ -357,9 +362,11 @@ enum bch_errcode { BCH_ERR_MAX }; -const char *bch2_err_str(int); -bool __bch2_err_matches(int, int); +__attribute__((const)) const char *bch2_err_str(int); +__attribute__((const)) bool __bch2_err_matches(int, int); + +__attribute__((const)) static inline bool _bch2_err_matches(int err, int class) { return err < 0 && __bch2_err_matches(err, class); diff --git a/fs/bcachefs/error.c b/fs/bcachefs/error.c index c2cad28635bf..63951e293c47 100644 --- a/fs/bcachefs/error.c +++ b/fs/bcachefs/error.c @@ -100,10 +100,10 @@ int __bch2_topology_error(struct bch_fs *c, struct printbuf *out) set_bit(BCH_FS_topology_error, &c->flags); if (!test_bit(BCH_FS_in_recovery, &c->flags)) { __bch2_inconsistent_error(c, out); - return -BCH_ERR_btree_need_topology_repair; + return bch_err_throw(c, btree_need_topology_repair); } else { return bch2_run_explicit_recovery_pass(c, out, BCH_RECOVERY_PASS_check_topology, 0) ?: - 
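The errcode.h hunk above introduces a recovery_will_run class and makes restart_recovery, cannot_rewind_recovery and recovery_pass_will_run its children, which is what lets the data_update.c change earlier in this diff test bch2_err_matches(ret, BCH_ERR_recovery_will_run) instead of enumerating individual codes. Matching conceptually walks the x(parent, child) table toward the root; the sketch below only illustrates that walk and omits whatever sanity checks the real helper performs.

    /* Illustrative parent walk over bch2_errcode_parents[]; not verbatim. */
    static bool example_err_matches(int err, int class)
    {
            err   = abs(err);
            class = abs(class);

            /* private codes chain to their parent until the requested class
             * (or a plain errno below BCH_ERR_START) is reached */
            while (err >= BCH_ERR_START && err != class)
                    err = bch2_errcode_parents[err - BCH_ERR_START];

            return err == class;
    }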
-BCH_ERR_btree_node_read_validate_error; + bch_err_throw(c, btree_node_read_validate_error); } } @@ -403,23 +403,23 @@ int bch2_fsck_err_opt(struct bch_fs *c, if (test_bit(BCH_FS_in_fsck, &c->flags)) { if (!(flags & (FSCK_CAN_FIX|FSCK_CAN_IGNORE))) - return -BCH_ERR_fsck_repair_unimplemented; + return bch_err_throw(c, fsck_repair_unimplemented); switch (c->opts.fix_errors) { case FSCK_FIX_exit: - return -BCH_ERR_fsck_errors_not_fixed; + return bch_err_throw(c, fsck_errors_not_fixed); case FSCK_FIX_yes: if (flags & FSCK_CAN_FIX) - return -BCH_ERR_fsck_fix; + return bch_err_throw(c, fsck_fix); fallthrough; case FSCK_FIX_no: if (flags & FSCK_CAN_IGNORE) - return -BCH_ERR_fsck_ignore; - return -BCH_ERR_fsck_errors_not_fixed; + return bch_err_throw(c, fsck_ignore); + return bch_err_throw(c, fsck_errors_not_fixed); case FSCK_FIX_ask: if (flags & FSCK_AUTOFIX) - return -BCH_ERR_fsck_fix; - return -BCH_ERR_fsck_ask; + return bch_err_throw(c, fsck_fix); + return bch_err_throw(c, fsck_ask); default: BUG(); } @@ -427,12 +427,12 @@ int bch2_fsck_err_opt(struct bch_fs *c, if ((flags & FSCK_AUTOFIX) && (c->opts.errors == BCH_ON_ERROR_continue || c->opts.errors == BCH_ON_ERROR_fix_safe)) - return -BCH_ERR_fsck_fix; + return bch_err_throw(c, fsck_fix); if (c->opts.errors == BCH_ON_ERROR_continue && (flags & FSCK_CAN_IGNORE)) - return -BCH_ERR_fsck_ignore; - return -BCH_ERR_fsck_errors_not_fixed; + return bch_err_throw(c, fsck_ignore); + return bch_err_throw(c, fsck_errors_not_fixed); } } @@ -444,7 +444,7 @@ int __bch2_fsck_err(struct bch_fs *c, { va_list args; struct printbuf buf = PRINTBUF, *out = &buf; - int ret = -BCH_ERR_fsck_ignore; + int ret = 0; const char *action_orig = "fix?", *action = action_orig; might_sleep(); @@ -474,8 +474,8 @@ int __bch2_fsck_err(struct bch_fs *c, if (test_bit(err, c->sb.errors_silent)) return flags & FSCK_CAN_FIX - ? -BCH_ERR_fsck_fix - : -BCH_ERR_fsck_ignore; + ? bch_err_throw(c, fsck_fix) + : bch_err_throw(c, fsck_ignore); printbuf_indent_add_nextline(out, 2); @@ -517,10 +517,10 @@ int __bch2_fsck_err(struct bch_fs *c, prt_str(out, ", "); if (flags & FSCK_CAN_FIX) { prt_actioning(out, action); - ret = -BCH_ERR_fsck_fix; + ret = bch_err_throw(c, fsck_fix); } else { prt_str(out, ", continuing"); - ret = -BCH_ERR_fsck_ignore; + ret = bch_err_throw(c, fsck_ignore); } goto print; @@ -532,18 +532,18 @@ int __bch2_fsck_err(struct bch_fs *c, "run fsck, and forward to devs so error can be marked for self-healing"); inconsistent = true; print = true; - ret = -BCH_ERR_fsck_errors_not_fixed; + ret = bch_err_throw(c, fsck_errors_not_fixed); } else if (flags & FSCK_CAN_FIX) { prt_str(out, ", "); prt_actioning(out, action); - ret = -BCH_ERR_fsck_fix; + ret = bch_err_throw(c, fsck_fix); } else { prt_str(out, ", continuing"); - ret = -BCH_ERR_fsck_ignore; + ret = bch_err_throw(c, fsck_ignore); } } else if (c->opts.fix_errors == FSCK_FIX_exit) { prt_str(out, ", exiting"); - ret = -BCH_ERR_fsck_errors_not_fixed; + ret = bch_err_throw(c, fsck_errors_not_fixed); } else if (flags & FSCK_CAN_FIX) { int fix = s && s->fix ? s->fix @@ -562,30 +562,37 @@ int __bch2_fsck_err(struct bch_fs *c, : FSCK_FIX_yes; ret = ret & 1 - ? -BCH_ERR_fsck_fix - : -BCH_ERR_fsck_ignore; + ? 
bch_err_throw(c, fsck_fix) + : bch_err_throw(c, fsck_ignore); } else if (fix == FSCK_FIX_yes || (c->opts.nochanges && !(flags & FSCK_CAN_IGNORE))) { prt_str(out, ", "); prt_actioning(out, action); - ret = -BCH_ERR_fsck_fix; + ret = bch_err_throw(c, fsck_fix); } else { prt_str(out, ", not "); prt_actioning(out, action); + ret = bch_err_throw(c, fsck_ignore); + } + } else { + if (flags & FSCK_CAN_IGNORE) { + prt_str(out, ", continuing"); + ret = bch_err_throw(c, fsck_ignore); + } else { + prt_str(out, " (repair unimplemented)"); + ret = bch_err_throw(c, fsck_repair_unimplemented); } - } else if (!(flags & FSCK_CAN_IGNORE)) { - prt_str(out, " (repair unimplemented)"); } - if (ret == -BCH_ERR_fsck_ignore && + if (bch2_err_matches(ret, BCH_ERR_fsck_ignore) && (c->opts.fix_errors == FSCK_FIX_exit || !(flags & FSCK_CAN_IGNORE))) - ret = -BCH_ERR_fsck_errors_not_fixed; + ret = bch_err_throw(c, fsck_errors_not_fixed); if (test_bit(BCH_FS_in_fsck, &c->flags) && - (ret != -BCH_ERR_fsck_fix && - ret != -BCH_ERR_fsck_ignore)) { + (!bch2_err_matches(ret, BCH_ERR_fsck_fix) && + !bch2_err_matches(ret, BCH_ERR_fsck_ignore))) { exiting = true; print = true; } @@ -613,26 +620,26 @@ print: if (s) s->ret = ret; - +err_unlock: + mutex_unlock(&c->fsck_error_msgs_lock); +err: /* * We don't yet track whether the filesystem currently has errors, for * log_fsck_err()s: that would require us to track for every error type * which recovery pass corrects it, to get the fsck exit status correct: */ - if (flags & FSCK_CAN_FIX) { - if (ret == -BCH_ERR_fsck_fix) { - set_bit(BCH_FS_errors_fixed, &c->flags); - } else { - set_bit(BCH_FS_errors_not_fixed, &c->flags); - set_bit(BCH_FS_error, &c->flags); - } + if (bch2_err_matches(ret, BCH_ERR_fsck_fix)) { + set_bit(BCH_FS_errors_fixed, &c->flags); + } else { + set_bit(BCH_FS_errors_not_fixed, &c->flags); + set_bit(BCH_FS_error, &c->flags); } -err_unlock: - mutex_unlock(&c->fsck_error_msgs_lock); -err: + if (action != action_orig) kfree(action); printbuf_exit(&buf); + + BUG_ON(!ret); return ret; } @@ -650,12 +657,12 @@ int __bch2_bkey_fsck_err(struct bch_fs *c, const char *fmt, ...) { if (from.flags & BCH_VALIDATE_silent) - return -BCH_ERR_fsck_delete_bkey; + return bch_err_throw(c, fsck_delete_bkey); unsigned fsck_flags = 0; if (!(from.flags & (BCH_VALIDATE_write|BCH_VALIDATE_commit))) { if (test_bit(err, c->sb.errors_silent)) - return -BCH_ERR_fsck_delete_bkey; + return bch_err_throw(c, fsck_delete_bkey); fsck_flags |= FSCK_AUTOFIX|FSCK_CAN_FIX; } diff --git a/fs/bcachefs/error.h b/fs/bcachefs/error.h index 5123d4c86770..0c3c3a24fc6f 100644 --- a/fs/bcachefs/error.h +++ b/fs/bcachefs/error.h @@ -105,13 +105,13 @@ void bch2_free_fsck_errs(struct bch_fs *); #define fsck_err_wrap(_do) \ ({ \ int _ret = _do; \ - if (_ret != -BCH_ERR_fsck_fix && \ - _ret != -BCH_ERR_fsck_ignore) { \ + if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \ + !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) { \ ret = _ret; \ goto fsck_err; \ } \ \ - _ret == -BCH_ERR_fsck_fix; \ + bch2_err_matches(_ret, BCH_ERR_fsck_fix); \ }) #define __fsck_err(...) 
fsck_err_wrap(bch2_fsck_err(__VA_ARGS__)) @@ -170,10 +170,10 @@ do { \ int _ret = __bch2_bkey_fsck_err(c, k, from, \ BCH_FSCK_ERR_##_err_type, \ _err_msg, ##__VA_ARGS__); \ - if (_ret != -BCH_ERR_fsck_fix && \ - _ret != -BCH_ERR_fsck_ignore) \ + if (!bch2_err_matches(_ret, BCH_ERR_fsck_fix) && \ + !bch2_err_matches(_ret, BCH_ERR_fsck_ignore)) \ ret = _ret; \ - ret = -BCH_ERR_fsck_delete_bkey; \ + ret = bch_err_throw(c, fsck_delete_bkey); \ goto fsck_err; \ } while (0) diff --git a/fs/bcachefs/extents.c b/fs/bcachefs/extents.c index 1ac9897f189d..036e4ad95987 100644 --- a/fs/bcachefs/extents.c +++ b/fs/bcachefs/extents.c @@ -65,15 +65,15 @@ void bch2_io_failures_to_text(struct printbuf *out, continue; bch2_printbuf_make_room(out, 1024); - rcu_read_lock(); out->atomic++; - struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev); - if (ca) - prt_str(out, ca->name); - else - prt_printf(out, "(invalid device %u)", f->dev); + scoped_guard(rcu) { + struct bch_dev *ca = bch2_dev_rcu_noerror(c, f->dev); + if (ca) + prt_str(out, ca->name); + else + prt_printf(out, "(invalid device %u)", f->dev); + } --out->atomic; - rcu_read_unlock(); prt_char(out, ' '); @@ -193,7 +193,7 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, bool have_dirty_ptrs = false, have_pick = false; if (k.k->type == KEY_TYPE_error) - return -BCH_ERR_key_type_error; + return bch_err_throw(c, key_type_error); rcu_read_lock(); struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); @@ -286,17 +286,17 @@ int bch2_bkey_pick_read_device(struct bch_fs *c, struct bkey_s_c k, if (!have_dirty_ptrs) return 0; if (have_missing_devs) - return -BCH_ERR_no_device_to_read_from; + return bch_err_throw(c, no_device_to_read_from); if (have_csum_errors) - return -BCH_ERR_data_read_csum_err; + return bch_err_throw(c, data_read_csum_err); if (have_io_errors) - return -BCH_ERR_data_read_io_err; + return bch_err_throw(c, data_read_io_err); /* * If we get here, we have pointers (bkey_ptrs_validate() ensures that), * but they don't point to valid devices: */ - return -BCH_ERR_no_devices_valid; + return bch_err_throw(c, no_devices_valid); } /* KEY_TYPE_btree_ptr: */ @@ -407,6 +407,8 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) lp.crc = bch2_extent_crc_unpack(l.k, NULL); rp.crc = bch2_extent_crc_unpack(r.k, NULL); + guard(rcu)(); + while (__bkey_ptr_next_decode(l.k, l_ptrs.end, lp, en_l) && __bkey_ptr_next_decode(r.k, r_ptrs.end, rp, en_r)) { if (lp.ptr.offset + lp.crc.offset + lp.crc.live_size != @@ -418,10 +420,8 @@ bool bch2_extent_merge(struct bch_fs *c, struct bkey_s l, struct bkey_s_c r) return false; /* Extents may not straddle buckets: */ - rcu_read_lock(); struct bch_dev *ca = bch2_dev_rcu(c, lp.ptr.dev); bool same_bucket = ca && PTR_BUCKET_NR(ca, &lp.ptr) == PTR_BUCKET_NR(ca, &rp.ptr); - rcu_read_unlock(); if (!same_bucket) return false; @@ -838,11 +838,9 @@ unsigned bch2_bkey_durability(struct bch_fs *c, struct bkey_s_c k) struct extent_ptr_decoded p; unsigned durability = 0; - rcu_read_lock(); + guard(rcu)(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) durability += bch2_extent_ptr_durability(c, &p); - rcu_read_unlock(); - return durability; } @@ -853,12 +851,10 @@ static unsigned bch2_bkey_durability_safe(struct bch_fs *c, struct bkey_s_c k) struct extent_ptr_decoded p; unsigned durability = 0; - rcu_read_lock(); + guard(rcu)(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (p.ptr.dev < c->sb.nr_devices && c->devs[p.ptr.dev]) durability += bch2_extent_ptr_durability(c, &p); - rcu_read_unlock(); - 
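With __bch2_fsck_err() now guaranteed to return a nonzero code (note the BUG_ON(!ret) added above), its callers in error.h stop comparing against specific constants and classify the result with bch2_err_matches() instead. The following is a simplified sketch of that caller-side contract; do_one_check() is hypothetical, and the real fsck_err_wrap() additionally records the error and jumps to the fsck_err label.

    /* Sketch of the classification callers now perform; not verbatim. */
    static int example_handle_fsck_result(struct bch_fs *c)
    {
            int ret = do_one_check(c);      /* hypothetical check returning a fsck code */

            if (bch2_err_matches(ret, BCH_ERR_fsck_fix))
                    return 1;               /* caller should apply the repair */
            if (bch2_err_matches(ret, BCH_ERR_fsck_ignore))
                    return 0;               /* explicitly ignored, keep going */
            return ret;                     /* anything else aborts the pass */
    }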
return durability; } @@ -1015,20 +1011,16 @@ bool bch2_bkey_has_target(struct bch_fs *c, struct bkey_s_c k, unsigned target) { struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); struct bch_dev *ca; - bool ret = false; - rcu_read_lock(); + guard(rcu)(); bkey_for_each_ptr(ptrs, ptr) if (bch2_dev_in_target(c, ptr->dev, target) && (ca = bch2_dev_rcu(c, ptr->dev)) && (!ptr->cached || - !dev_ptr_stale_rcu(ca, ptr))) { - ret = true; - break; - } - rcu_read_unlock(); + !dev_ptr_stale_rcu(ca, ptr))) + return true; - return ret; + return false; } bool bch2_bkey_matches_ptr(struct bch_fs *c, struct bkey_s_c k, @@ -1142,7 +1134,7 @@ void bch2_extent_ptr_set_cached(struct bch_fs *c, bool have_cached_ptr; unsigned drop_dev = ptr->dev; - rcu_read_lock(); + guard(rcu)(); restart_drop_ptrs: ptrs = bch2_bkey_ptrs(k); have_cached_ptr = false; @@ -1175,10 +1167,8 @@ restart_drop_ptrs: goto drop; ptr->cached = true; - rcu_read_unlock(); return; drop: - rcu_read_unlock(); bch2_bkey_drop_ptr_noerror(k, ptr); } @@ -1194,12 +1184,11 @@ bool bch2_extent_normalize(struct bch_fs *c, struct bkey_s k) { struct bch_dev *ca; - rcu_read_lock(); + guard(rcu)(); bch2_bkey_drop_ptrs(k, ptr, ptr->cached && (!(ca = bch2_dev_rcu(c, ptr->dev)) || dev_ptr_stale_rcu(ca, ptr) > 0)); - rcu_read_unlock(); return bkey_deleted(k.k); } @@ -1217,7 +1206,7 @@ bool bch2_extent_normalize_by_opts(struct bch_fs *c, struct bkey_ptrs ptrs; bool have_cached_ptr; - rcu_read_lock(); + guard(rcu)(); restart_drop_ptrs: ptrs = bch2_bkey_ptrs(k); have_cached_ptr = false; @@ -1230,7 +1219,6 @@ restart_drop_ptrs: } have_cached_ptr = true; } - rcu_read_unlock(); return bkey_deleted(k.k); } @@ -1238,7 +1226,7 @@ restart_drop_ptrs: void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struct bch_extent_ptr *ptr) { out->atomic++; - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, ptr->dev); if (!ca) { prt_printf(out, "ptr: %u:%llu gen %u%s", ptr->dev, @@ -1262,7 +1250,6 @@ void bch2_extent_ptr_to_text(struct printbuf *out, struct bch_fs *c, const struc else if (stale) prt_printf(out, " invalid"); } - rcu_read_unlock(); --out->atomic; } @@ -1528,7 +1515,7 @@ int bch2_bkey_ptrs_validate(struct bch_fs *c, struct bkey_s_c k, struct bch_compression_opt opt = __bch2_compression_decode(r->compression); prt_printf(err, "invalid compression opt %u:%u", opt.type, opt.level); - return -BCH_ERR_invalid_bkey; + return bch_err_throw(c, invalid_bkey); } #endif break; diff --git a/fs/bcachefs/fs-io-buffered.c b/fs/bcachefs/fs-io-buffered.c index e3a75dcca60c..66bacdd49f78 100644 --- a/fs/bcachefs/fs-io-buffered.c +++ b/fs/bcachefs/fs-io-buffered.c @@ -394,17 +394,9 @@ struct bch_writepage_state { struct bch_io_opts opts; struct bch_folio_sector *tmp; unsigned tmp_sectors; + struct blk_plug plug; }; -static inline struct bch_writepage_state bch_writepage_state_init(struct bch_fs *c, - struct bch_inode_info *inode) -{ - struct bch_writepage_state ret = { 0 }; - - bch2_inode_opts_get(&ret.opts, c, &inode->ei_inode); - return ret; -} - /* * Determine when a writepage io is full. We have to limit writepage bios to a * single page per bvec (i.e. 
1MB with 4k pages) because that is the limit to @@ -666,17 +658,17 @@ do_io: int bch2_writepages(struct address_space *mapping, struct writeback_control *wbc) { struct bch_fs *c = mapping->host->i_sb->s_fs_info; - struct bch_writepage_state w = - bch_writepage_state_init(c, to_bch_ei(mapping->host)); - struct blk_plug plug; - int ret; + struct bch_writepage_state *w = kzalloc(sizeof(*w), GFP_NOFS|__GFP_NOFAIL); - blk_start_plug(&plug); - ret = write_cache_pages(mapping, wbc, __bch2_writepage, &w); - if (w.io) - bch2_writepage_do_io(&w); - blk_finish_plug(&plug); - kfree(w.tmp); + bch2_inode_opts_get(&w->opts, c, &to_bch_ei(mapping->host)->ei_inode); + + blk_start_plug(&w->plug); + int ret = write_cache_pages(mapping, wbc, __bch2_writepage, w); + if (w->io) + bch2_writepage_do_io(w); + blk_finish_plug(&w->plug); + kfree(w->tmp); + kfree(w); return bch2_err_class(ret); } diff --git a/fs/bcachefs/fs-io-pagecache.c b/fs/bcachefs/fs-io-pagecache.c index fbae9c1de746..c2cc405822f2 100644 --- a/fs/bcachefs/fs-io-pagecache.c +++ b/fs/bcachefs/fs-io-pagecache.c @@ -447,7 +447,7 @@ static int __bch2_folio_reservation_get(struct bch_fs *c, if (!reserved) { bch2_disk_reservation_put(c, &disk_res); - return -BCH_ERR_ENOSPC_disk_reservation; + return bch_err_throw(c, ENOSPC_disk_reservation); } break; } diff --git a/fs/bcachefs/fs-io.c b/fs/bcachefs/fs-io.c index b1e9ee28fc0f..a233f45875e9 100644 --- a/fs/bcachefs/fs-io.c +++ b/fs/bcachefs/fs-io.c @@ -71,12 +71,12 @@ void bch2_inode_flush_nocow_writes_async(struct bch_fs *c, memset(&inode->ei_devs_need_flush, 0, sizeof(inode->ei_devs_need_flush)); for_each_set_bit(dev, devs.d, BCH_SB_MEMBERS_MAX) { - rcu_read_lock(); - ca = rcu_dereference(c->devs[dev]); - if (ca && !enumerated_ref_tryget(&ca->io_ref[WRITE], - BCH_DEV_WRITE_REF_nocow_flush)) - ca = NULL; - rcu_read_unlock(); + scoped_guard(rcu) { + ca = rcu_dereference(c->devs[dev]); + if (ca && !enumerated_ref_tryget(&ca->io_ref[WRITE], + BCH_DEV_WRITE_REF_nocow_flush)) + ca = NULL; + } if (!ca) continue; diff --git a/fs/bcachefs/fs-ioctl.c b/fs/bcachefs/fs-ioctl.c index 05361a793206..4e72e654da96 100644 --- a/fs/bcachefs/fs-ioctl.c +++ b/fs/bcachefs/fs-ioctl.c @@ -268,13 +268,13 @@ static long bch2_ioctl_subvolume_create(struct bch_fs *c, struct file *filp, } if (dst_dentry->d_inode) { - error = -BCH_ERR_EEXIST_subvolume_create; + error = bch_err_throw(c, EEXIST_subvolume_create); goto err3; } dir = dst_path.dentry->d_inode; if (IS_DEADDIR(dir)) { - error = -BCH_ERR_ENOENT_directory_dead; + error = bch_err_throw(c, ENOENT_directory_dead); goto err3; } diff --git a/fs/bcachefs/fs.c b/fs/bcachefs/fs.c index ddfe89d84966..85d13f800165 100644 --- a/fs/bcachefs/fs.c +++ b/fs/bcachefs/fs.c @@ -124,8 +124,9 @@ retry: goto err; struct bch_extent_rebalance new_r = bch2_inode_rebalance_opts_get(c, &inode_u); + bool rebalance_changed = memcmp(&old_r, &new_r, sizeof(new_r)); - if (memcmp(&old_r, &new_r, sizeof(new_r))) { + if (rebalance_changed) { ret = bch2_set_rebalance_needs_scan_trans(trans, inode_u.bi_inum); if (ret) goto err; @@ -146,6 +147,9 @@ err: if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) goto retry; + if (rebalance_changed) + bch2_rebalance_wakeup(c); + bch2_fs_fatal_err_on(bch2_err_matches(ret, ENOENT), c, "%s: inode %llu:%llu not found when updating", bch2_err_str(ret), @@ -1569,11 +1573,12 @@ static int bch2_vfs_readdir(struct file *file, struct dir_context *ctx) { struct bch_inode_info *inode = file_bch_inode(file); struct bch_fs *c = inode->v.i_sb->s_fs_info; + struct bch_hash_info 
hash = bch2_hash_info_init(c, &inode->ei_inode); if (!dir_emit_dots(file, ctx)) return 0; - int ret = bch2_readdir(c, inode_inum(inode), ctx); + int ret = bch2_readdir(c, inode_inum(inode), &hash, ctx); bch_err_fn(c, ret); return bch2_err_class(ret); @@ -2002,14 +2007,14 @@ retry: goto err; if (k.k->type != KEY_TYPE_dirent) { - ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; + ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode); goto err; } d = bkey_s_c_to_dirent(k); ret = bch2_dirent_read_target(trans, inode_inum(dir), d, &target); if (ret > 0) - ret = -BCH_ERR_ENOENT_dirent_doesnt_match_inode; + ret = bch_err_throw(c, ENOENT_dirent_doesnt_match_inode); if (ret) goto err; @@ -2175,7 +2180,13 @@ static void bch2_evict_inode(struct inode *vinode) KEY_TYPE_QUOTA_WARN); bch2_quota_acct(c, inode->ei_qid, Q_INO, -1, KEY_TYPE_QUOTA_WARN); - bch2_inode_rm(c, inode_inum(inode)); + int ret = bch2_inode_rm(c, inode_inum(inode)); + if (ret && !bch2_err_matches(ret, EROFS)) { + bch_err_msg(c, ret, "VFS incorrectly tried to delete inode %llu:%llu", + inode->ei_inum.subvol, + inode->ei_inum.inum); + bch2_sb_error_count(c, BCH_FSCK_ERR_vfs_bad_inode_rm); + } /* * If we are deleting, we need it present in the vfs hash table @@ -2322,14 +2333,13 @@ static int bch2_show_devname(struct seq_file *seq, struct dentry *root) struct bch_fs *c = root->d_sb->s_fs_info; bool first = true; - rcu_read_lock(); + guard(rcu)(); for_each_online_member_rcu(c, ca) { if (!first) seq_putc(seq, ':'); first = false; seq_puts(seq, ca->disk_sb.sb_name); } - rcu_read_unlock(); return 0; } @@ -2526,16 +2536,16 @@ got_sb: sb->s_bdi->ra_pages = VM_READAHEAD_PAGES; - rcu_read_lock(); - for_each_online_member_rcu(c, ca) { - struct block_device *bdev = ca->disk_sb.bdev; + scoped_guard(rcu) { + for_each_online_member_rcu(c, ca) { + struct block_device *bdev = ca->disk_sb.bdev; - /* XXX: create an anonymous device for multi device filesystems */ - sb->s_bdev = bdev; - sb->s_dev = bdev->bd_dev; - break; + /* XXX: create an anonymous device for multi device filesystems */ + sb->s_bdev = bdev; + sb->s_dev = bdev->bd_dev; + break; + } } - rcu_read_unlock(); c->dev = sb->s_dev; diff --git a/fs/bcachefs/fsck.c b/fs/bcachefs/fsck.c index 49f46df8340e..68ed69a255e1 100644 --- a/fs/bcachefs/fsck.c +++ b/fs/bcachefs/fsck.c @@ -23,14 +23,15 @@ #include <linux/bsearch.h> #include <linux/dcache.h> /* struct qstr */ -static int dirent_points_to_inode_nowarn(struct bkey_s_c_dirent d, +static int dirent_points_to_inode_nowarn(struct bch_fs *c, + struct bkey_s_c_dirent d, struct bch_inode_unpacked *inode) { if (d.v->d_type == DT_SUBVOL ? 
le32_to_cpu(d.v->d_child_subvol) == inode->bi_subvol : le64_to_cpu(d.v->d_inum) == inode->bi_inum) return 0; - return -BCH_ERR_ENOENT_dirent_doesnt_match_inode; + return bch_err_throw(c, ENOENT_dirent_doesnt_match_inode); } static void dirent_inode_mismatch_msg(struct printbuf *out, @@ -49,7 +50,7 @@ static int dirent_points_to_inode(struct bch_fs *c, struct bkey_s_c_dirent dirent, struct bch_inode_unpacked *inode) { - int ret = dirent_points_to_inode_nowarn(dirent, inode); + int ret = dirent_points_to_inode_nowarn(c, dirent, inode); if (ret) { struct printbuf buf = PRINTBUF; dirent_inode_mismatch_msg(&buf, c, dirent, inode); @@ -152,7 +153,7 @@ static int find_snapshot_tree_subvol(struct btree_trans *trans, goto found; } } - ret = -BCH_ERR_ENOENT_no_snapshot_tree_subvol; + ret = bch_err_throw(trans->c, ENOENT_no_snapshot_tree_subvol); found: bch2_trans_iter_exit(trans, &iter); return ret; @@ -229,7 +230,7 @@ static int lookup_lostfound(struct btree_trans *trans, u32 snapshot, if (d_type != DT_DIR) { bch_err(c, "error looking up lost+found: not a directory"); - return -BCH_ERR_ENOENT_not_directory; + return bch_err_throw(c, ENOENT_not_directory); } /* @@ -531,7 +532,7 @@ static int reconstruct_subvol(struct btree_trans *trans, u32 snapshotid, u32 sub if (!bch2_snapshot_is_leaf(c, snapshotid)) { bch_err(c, "need to reconstruct subvol, but have interior node snapshot"); - return -BCH_ERR_fsck_repair_unimplemented; + return bch_err_throw(c, fsck_repair_unimplemented); } /* @@ -643,11 +644,6 @@ static int reconstruct_inode(struct btree_trans *trans, enum btree_id btree, u32 return __bch2_fsck_write_inode(trans, &new_inode); } -struct snapshots_seen { - struct bpos pos; - snapshot_id_list ids; -}; - static inline void snapshots_seen_exit(struct snapshots_seen *s) { darray_exit(&s->ids); @@ -890,14 +886,11 @@ lookup_inode_for_snapshot(struct btree_trans *trans, struct inode_walker *w, str { struct bch_fs *c = trans->c; - struct inode_walker_entry *i; - __darray_for_each(w->inodes, i) - if (bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot)) - goto found; + struct inode_walker_entry *i = darray_find_p(w->inodes, i, + bch2_snapshot_is_ancestor(c, k.k->p.snapshot, i->inode.bi_snapshot)); - return NULL; -found: - BUG_ON(k.k->p.snapshot > i->inode.bi_snapshot); + if (!i) + return NULL; struct printbuf buf = PRINTBUF; int ret = 0; @@ -947,7 +940,7 @@ found: if (ret) goto fsck_err; - ret = -BCH_ERR_transaction_restart_nested; + ret = bch_err_throw(c, transaction_restart_nested); goto fsck_err; } @@ -992,7 +985,8 @@ int bch2_fsck_update_backpointers(struct btree_trans *trans, int ret = 0; if (d->v.d_type == DT_SUBVOL) { - BUG(); + bch_err(trans->c, "%s does not support DT_SUBVOL", __func__); + ret = -BCH_ERR_fsck_repair_unimplemented; } else { ret = get_visible_inodes(trans, &target, s, le64_to_cpu(d->v.d_inum)); if (ret) @@ -1048,7 +1042,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans, if (ret && !bch2_err_matches(ret, ENOENT)) return ret; - if ((ret || dirent_points_to_inode_nowarn(d, inode)) && + if ((ret || dirent_points_to_inode_nowarn(c, d, inode)) && inode->bi_subvol && (inode->bi_flags & BCH_INODE_has_child_snapshot)) { /* Older version of a renamed subvolume root: we won't have a @@ -1069,7 +1063,7 @@ static int check_inode_dirent_inode(struct btree_trans *trans, trans, inode_points_to_missing_dirent, "inode points to missing dirent\n%s", (bch2_inode_unpacked_to_text(&buf, inode), buf.buf)) || - fsck_err_on(!ret && dirent_points_to_inode_nowarn(d, inode), + 
fsck_err_on(!ret && dirent_points_to_inode_nowarn(c, d, inode), trans, inode_points_to_wrong_dirent, "%s", (printbuf_reset(&buf), @@ -1174,6 +1168,14 @@ static int check_inode(struct btree_trans *trans, ret = 0; } + if (fsck_err_on(S_ISDIR(u.bi_mode) && u.bi_size, + trans, inode_dir_has_nonzero_i_size, + "directory %llu:%u with nonzero i_size %lli", + u.bi_inum, u.bi_snapshot, u.bi_size)) { + u.bi_size = 0; + do_update = true; + } + ret = bch2_inode_has_child_snapshots(trans, k.k->p); if (ret < 0) goto err; @@ -1452,7 +1454,7 @@ static int check_key_has_inode(struct btree_trans *trans, goto err; inode->last_pos.inode--; - ret = -BCH_ERR_transaction_restart_nested; + ret = bch_err_throw(c, transaction_restart_nested); goto err; } @@ -1569,7 +1571,7 @@ static int extent_ends_at(struct bch_fs *c, sizeof(seen->ids.data[0]) * seen->ids.size, GFP_KERNEL); if (!n.seen.ids.data) - return -BCH_ERR_ENOMEM_fsck_extent_ends_at; + return bch_err_throw(c, ENOMEM_fsck_extent_ends_at); __darray_for_each(extent_ends->e, i) { if (i->snapshot == k.k->p.snapshot) { @@ -1619,7 +1621,7 @@ static int overlapping_extents_found(struct btree_trans *trans, bch_err(c, "%s: error finding first overlapping extent when repairing, got%s", __func__, buf.buf); - ret = -BCH_ERR_internal_fsck_err; + ret = bch_err_throw(c, internal_fsck_err); goto err; } @@ -1644,7 +1646,7 @@ static int overlapping_extents_found(struct btree_trans *trans, pos2.size != k2.k->size) { bch_err(c, "%s: error finding seconding overlapping extent when repairing%s", __func__, buf.buf); - ret = -BCH_ERR_internal_fsck_err; + ret = bch_err_throw(c, internal_fsck_err); goto err; } @@ -1692,7 +1694,7 @@ static int overlapping_extents_found(struct btree_trans *trans, * We overwrote the second extent - restart * check_extent() from the top: */ - ret = -BCH_ERR_transaction_restart_nested; + ret = bch_err_throw(c, transaction_restart_nested); } } fsck_err: @@ -2045,7 +2047,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * (bch2_bkey_val_to_text(&buf, c, d.s_c), buf.buf))) { if (!new_parent_subvol) { bch_err(c, "could not find a subvol for snapshot %u", d.k->p.snapshot); - return -BCH_ERR_fsck_repair_unimplemented; + return bch_err_throw(c, fsck_repair_unimplemented); } struct bkey_i_dirent *new_dirent = bch2_bkey_make_mut_typed(trans, iter, &d.s_c, 0, dirent); @@ -2107,7 +2109,7 @@ static int check_dirent_to_subvol(struct btree_trans *trans, struct btree_iter * if (ret) { bch_err(c, "subvol %u points to missing inode root %llu", target_subvol, target_inum); - ret = -BCH_ERR_fsck_repair_unimplemented; + ret = bch_err_throw(c, fsck_repair_unimplemented); goto err; } @@ -2139,7 +2141,8 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, struct bch_hash_info *hash_info, struct inode_walker *dir, struct inode_walker *target, - struct snapshots_seen *s) + struct snapshots_seen *s, + bool *need_second_pass) { struct bch_fs *c = trans->c; struct inode_walker_entry *i; @@ -2181,7 +2184,12 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, &i->inode); dir->first_this_inode = false; - ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, iter, k); +#ifdef CONFIG_UNICODE + hash_info->cf_encoding = bch2_inode_casefold(c, &i->inode) ? 
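
lookup_inode_for_snapshot() in the hunk above now uses darray_find_p() in place of an open-coded __darray_for_each loop plus goto label. Roughly, such a helper is a statement-expression macro that yields a pointer to the first element matching a predicate, or NULL; the version below is an illustrative stand-in, not the darray.h definition:

	/* GNU C statement expression; _i is the caller-chosen cursor name */
	#define array_find_p(_arr, _nr, _i, _cond)			\
	({								\
		typeof(&(_arr)[0]) _i, _found = NULL;			\
		for (_i = (_arr); _i < (_arr) + (_nr); _i++)		\
			if (_cond) {					\
				_found = _i;				\
				break;					\
			}						\
		_found;							\
	})

	/* usage: pointer to the first even element, or NULL if none */
	static int *first_even(int *vals, unsigned nr)
	{
		return array_find_p(vals, nr, i, (*i & 1) == 0);
	}
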
c->cf_encoding : NULL; +#endif + + ret = bch2_str_hash_check_key(trans, s, &bch2_dirent_hash_desc, hash_info, + iter, k, need_second_pass); if (ret < 0) goto err; if (ret) { @@ -2202,31 +2210,34 @@ static int check_dirent(struct btree_trans *trans, struct btree_iter *iter, (printbuf_reset(&buf), bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - struct qstr name = bch2_dirent_get_name(d); - u32 subvol = d.v->d_type == DT_SUBVOL - ? le32_to_cpu(d.v->d_parent_subvol) - : 0; + subvol_inum dir_inum = { .subvol = d.v->d_type == DT_SUBVOL + ? le32_to_cpu(d.v->d_parent_subvol) + : 0, + }; u64 target = d.v->d_type == DT_SUBVOL ? le32_to_cpu(d.v->d_child_subvol) : le64_to_cpu(d.v->d_inum); - u64 dir_offset; + struct qstr name = bch2_dirent_get_name(d); + + struct bkey_i_dirent *new_d = + bch2_dirent_create_key(trans, hash_info, dir_inum, + d.v->d_type, &name, NULL, target); + ret = PTR_ERR_OR_ZERO(new_d); + if (ret) + goto out; - ret = bch2_hash_delete_at(trans, + new_d->k.p.inode = d.k->p.inode; + new_d->k.p.snapshot = d.k->p.snapshot; + + struct btree_iter dup_iter = {}; + ret = bch2_hash_delete_at(trans, bch2_dirent_hash_desc, hash_info, iter, BTREE_UPDATE_internal_snapshot_node) ?: - bch2_dirent_create_snapshot(trans, subvol, - d.k->p.inode, d.k->p.snapshot, - hash_info, - d.v->d_type, - &name, - target, - &dir_offset, - BTREE_ITER_with_updates| - BTREE_UPDATE_internal_snapshot_node| - STR_HASH_must_create) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc); - - /* might need another check_dirents pass */ + bch2_str_hash_repair_key(trans, s, + &bch2_dirent_hash_desc, hash_info, + iter, bkey_i_to_s_c(&new_d->k_i), + &dup_iter, bkey_s_c_null, + need_second_pass); goto out; } @@ -2294,7 +2305,6 @@ out: err: fsck_err: printbuf_exit(&buf); - bch_err_fn(c, ret); return ret; } @@ -2308,16 +2318,31 @@ int bch2_check_dirents(struct bch_fs *c) struct inode_walker target = inode_walker_init(); struct snapshots_seen s; struct bch_hash_info hash_info; + bool need_second_pass = false, did_second_pass = false; + int ret; snapshots_seen_init(&s); - - int ret = bch2_trans_run(c, - for_each_btree_key(trans, iter, BTREE_ID_dirents, +again: + ret = bch2_trans_run(c, + for_each_btree_key_commit(trans, iter, BTREE_ID_dirents, POS(BCACHEFS_ROOT_INO, 0), BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, - check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s)) ?: + NULL, NULL, BCH_TRANS_COMMIT_no_enospc, + check_dirent(trans, &iter, k, &hash_info, &dir, &target, &s, + &need_second_pass)) ?: check_subdir_count_notnested(trans, &dir)); + if (!ret && need_second_pass && !did_second_pass) { + bch_info(c, "check_dirents requires second pass"); + swap(did_second_pass, need_second_pass); + goto again; + } + + if (!ret && need_second_pass) { + bch_err(c, "dirents not repairing"); + ret = -EINVAL; + } + snapshots_seen_exit(&s); inode_walker_exit(&dir); inode_walker_exit(&target); @@ -2331,16 +2356,14 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, struct inode_walker *inode) { struct bch_fs *c = trans->c; - struct inode_walker_entry *i; - int ret; - ret = bch2_check_key_has_snapshot(trans, iter, k); + int ret = bch2_check_key_has_snapshot(trans, iter, k); if (ret < 0) return ret; if (ret) return 0; - i = walk_inode(trans, inode, k); + struct inode_walker_entry *i = walk_inode(trans, inode, k); ret = PTR_ERR_OR_ZERO(i); if (ret) return ret; @@ -2356,9 +2379,9 @@ static int check_xattr(struct btree_trans *trans, struct btree_iter *iter, *hash_info = bch2_hash_info_init(c, 
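
bch2_check_dirents() above now reruns the whole btree walk once if a hash repair signalled that a second pass is needed, and only gives up if a further pass would still be required after that. The control flow, reduced to its skeleton (run_pass() is a stand-in for the for_each_btree_key_commit loop):

	static int check_with_retry(struct bch_fs *c)
	{
		bool need_second_pass = false, did_second_pass = false;
		int ret;
	again:
		ret = run_pass(c, &need_second_pass);

		if (!ret && need_second_pass && !did_second_pass) {
			/* what the swap() above does: note we retried, clear the request */
			did_second_pass = true;
			need_second_pass = false;
			goto again;
		}

		if (!ret && need_second_pass)
			ret = -EINVAL;	/* a second pass was requested again: not converging */

		return ret;
	}
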
&i->inode); inode->first_this_inode = false; - ret = bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, iter, k); - bch_err_fn(c, ret); - return ret; + bool need_second_pass = false; + return bch2_str_hash_check_key(trans, NULL, &bch2_xattr_hash_desc, hash_info, + iter, k, &need_second_pass); } /* @@ -2747,7 +2770,7 @@ static int add_nlink(struct bch_fs *c, struct nlink_table *t, if (!d) { bch_err(c, "fsck: error allocating memory for nlink_table, size %zu", new_size); - return -BCH_ERR_ENOMEM_fsck_add_nlink; + return bch_err_throw(c, ENOMEM_fsck_add_nlink); } if (t->d) diff --git a/fs/bcachefs/fsck.h b/fs/bcachefs/fsck.h index 574948278cd4..e5fe7cf7b251 100644 --- a/fs/bcachefs/fsck.h +++ b/fs/bcachefs/fsck.h @@ -4,6 +4,12 @@ #include "str_hash.h" +/* recoverds snapshot IDs of overwrites at @pos */ +struct snapshots_seen { + struct bpos pos; + snapshot_id_list ids; +}; + int bch2_fsck_update_backpointers(struct btree_trans *, struct snapshots_seen *, const struct bch_hash_desc, diff --git a/fs/bcachefs/inode.c b/fs/bcachefs/inode.c index 5cf70108ae2f..53e5dc1f6ac1 100644 --- a/fs/bcachefs/inode.c +++ b/fs/bcachefs/inode.c @@ -38,6 +38,7 @@ static const char * const bch2_inode_flag_strs[] = { #undef x static int delete_ancestor_snapshot_inodes(struct btree_trans *, struct bpos); +static int may_delete_deleted_inum(struct btree_trans *, subvol_inum); static const u8 byte_table[8] = { 1, 2, 3, 4, 6, 8, 10, 13 }; @@ -1041,7 +1042,7 @@ again: goto found_slot; if (!ret && start == min) - ret = -BCH_ERR_ENOSPC_inode_create; + ret = bch_err_throw(trans->c, ENOSPC_inode_create); if (ret) { bch2_trans_iter_exit(trans, iter); @@ -1130,19 +1131,23 @@ int bch2_inode_rm(struct bch_fs *c, subvol_inum inum) u32 snapshot; int ret; + ret = lockrestart_do(trans, may_delete_deleted_inum(trans, inum)); + if (ret) + goto err2; + /* * If this was a directory, there shouldn't be any real dirents left - * but there could be whiteouts (from hash collisions) that we should * delete: * - * XXX: the dirent could ideally would delete whiteouts when they're no + * XXX: the dirent code ideally would delete whiteouts when they're no * longer needed */ ret = bch2_inode_delete_keys(trans, inum, BTREE_ID_extents) ?: bch2_inode_delete_keys(trans, inum, BTREE_ID_xattrs) ?: bch2_inode_delete_keys(trans, inum, BTREE_ID_dirents); if (ret) - goto err; + goto err2; retry: bch2_trans_begin(trans); @@ -1161,7 +1166,7 @@ retry: bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum.inum, snapshot); - ret = -BCH_ERR_ENOENT_inode; + ret = bch_err_throw(c, ENOENT_inode); goto err; } @@ -1328,7 +1333,7 @@ retry: bch2_fs_inconsistent(c, "inode %llu:%u not found when deleting", inum, snapshot); - ret = -BCH_ERR_ENOENT_inode; + ret = bch_err_throw(c, ENOENT_inode); goto err; } @@ -1392,10 +1397,8 @@ int bch2_inode_rm_snapshot(struct btree_trans *trans, u64 inum, u32 snapshot) delete_ancestor_snapshot_inodes(trans, SPOS(0, inum, snapshot)); } -static int may_delete_deleted_inode(struct btree_trans *trans, - struct btree_iter *iter, - struct bpos pos, - bool *need_another_pass) +static int may_delete_deleted_inode(struct btree_trans *trans, struct bpos pos, + bool from_deleted_inodes) { struct bch_fs *c = trans->c; struct btree_iter inode_iter; @@ -1409,12 +1412,14 @@ static int may_delete_deleted_inode(struct btree_trans *trans, if (ret) return ret; - ret = bkey_is_inode(k.k) ? 0 : -BCH_ERR_ENOENT_inode; - if (fsck_err_on(!bkey_is_inode(k.k), + ret = bkey_is_inode(k.k) ? 
0 : bch_err_throw(c, ENOENT_inode); + if (fsck_err_on(from_deleted_inodes && ret, trans, deleted_inode_missing, "nonexistent inode %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; + if (ret) + goto out; ret = bch2_inode_unpack(k, &inode); if (ret) @@ -1422,7 +1427,8 @@ static int may_delete_deleted_inode(struct btree_trans *trans, if (S_ISDIR(inode.bi_mode)) { ret = bch2_empty_dir_snapshot(trans, pos.offset, 0, pos.snapshot); - if (fsck_err_on(bch2_err_matches(ret, ENOTEMPTY), + if (fsck_err_on(from_deleted_inodes && + bch2_err_matches(ret, ENOTEMPTY), trans, deleted_inode_is_dir, "non empty directory %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) @@ -1431,17 +1437,25 @@ static int may_delete_deleted_inode(struct btree_trans *trans, goto out; } - if (fsck_err_on(!(inode.bi_flags & BCH_INODE_unlinked), + ret = inode.bi_flags & BCH_INODE_unlinked ? 0 : bch_err_throw(c, inode_not_unlinked); + if (fsck_err_on(from_deleted_inodes && ret, trans, deleted_inode_not_unlinked, "non-deleted inode %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; + if (ret) + goto out; - if (fsck_err_on(inode.bi_flags & BCH_INODE_has_child_snapshot, + ret = !(inode.bi_flags & BCH_INODE_has_child_snapshot) + ? 0 : bch_err_throw(c, inode_has_child_snapshot); + + if (fsck_err_on(from_deleted_inodes && ret, trans, deleted_inode_has_child_snapshots, "inode with child snapshots %llu:%u in deleted_inodes btree", pos.offset, pos.snapshot)) goto delete; + if (ret) + goto out; ret = bch2_inode_has_child_snapshots(trans, k.k->p); if (ret < 0) @@ -1458,19 +1472,28 @@ static int may_delete_deleted_inode(struct btree_trans *trans, if (ret) goto out; } + + if (!from_deleted_inodes) { + ret = bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + bch_err_throw(c, inode_has_child_snapshot); + goto out; + } + goto delete; } - if (test_bit(BCH_FS_clean_recovery, &c->flags) && - !fsck_err(trans, deleted_inode_but_clean, - "filesystem marked as clean but have deleted inode %llu:%u", - pos.offset, pos.snapshot)) { - ret = 0; - goto out; - } + if (from_deleted_inodes) { + if (test_bit(BCH_FS_clean_recovery, &c->flags) && + !fsck_err(trans, deleted_inode_but_clean, + "filesystem marked as clean but have deleted inode %llu:%u", + pos.offset, pos.snapshot)) { + ret = 0; + goto out; + } - ret = 1; + ret = 1; + } out: fsck_err: bch2_trans_iter_exit(trans, &inode_iter); @@ -1481,12 +1504,19 @@ delete: goto out; } +static int may_delete_deleted_inum(struct btree_trans *trans, subvol_inum inum) +{ + u32 snapshot; + + return bch2_subvolume_get_snapshot(trans, inum.subvol, &snapshot) ?: + may_delete_deleted_inode(trans, SPOS(0, inum.inum, snapshot), false); +} + int bch2_delete_dead_inodes(struct bch_fs *c) { struct btree_trans *trans = bch2_trans_get(c); - bool need_another_pass; int ret; -again: + /* * if we ran check_inodes() unlinked inodes will have already been * cleaned up but the write buffer will be out of sync; therefore we @@ -1496,8 +1526,6 @@ again: if (ret) goto err; - need_another_pass = false; - /* * Weird transaction restart handling here because on successful delete, * bch2_inode_rm_snapshot() will return a nested transaction restart, @@ -1507,7 +1535,7 @@ again: ret = for_each_btree_key_commit(trans, iter, BTREE_ID_deleted_inodes, POS_MIN, BTREE_ITER_prefetch|BTREE_ITER_all_snapshots, k, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, ({ - ret = may_delete_deleted_inode(trans, &iter, k.k->p, &need_another_pass); + ret = may_delete_deleted_inode(trans, k.k->p, 
true); if (ret > 0) { bch_verbose_ratelimited(c, "deleting unlinked inode %llu:%u", k.k->p.offset, k.k->p.snapshot); @@ -1528,10 +1556,8 @@ again: ret; })); - - if (!ret && need_another_pass) - goto again; err: bch2_trans_put(trans); + bch_err_fn(c, ret); return ret; } diff --git a/fs/bcachefs/inode.h b/fs/bcachefs/inode.h index 77ad2d549541..82cec2836cbd 100644 --- a/fs/bcachefs/inode.h +++ b/fs/bcachefs/inode.h @@ -283,15 +283,6 @@ static inline void bch2_inode_nlink_set(struct bch_inode_unpacked *bi, int bch2_inode_nlink_inc(struct bch_inode_unpacked *); void bch2_inode_nlink_dec(struct btree_trans *, struct bch_inode_unpacked *); -static inline bool bch2_inode_should_have_single_bp(struct bch_inode_unpacked *inode) -{ - bool inode_has_bp = inode->bi_dir || inode->bi_dir_offset; - - return S_ISDIR(inode->bi_mode) || - inode->bi_subvol || - (!inode->bi_nlink && inode_has_bp); -} - struct bch_opts bch2_inode_opts_to_opts(struct bch_inode_unpacked *); void bch2_inode_opts_get(struct bch_io_opts *, struct bch_fs *, struct bch_inode_unpacked *); diff --git a/fs/bcachefs/io_misc.c b/fs/bcachefs/io_misc.c index cc07729a4b62..bf72b1d2e2cb 100644 --- a/fs/bcachefs/io_misc.c +++ b/fs/bcachefs/io_misc.c @@ -91,7 +91,7 @@ int bch2_extent_fallocate(struct btree_trans *trans, opts.data_replicas, BCH_WATERMARK_normal, 0, &cl, &wp); if (bch2_err_matches(ret, BCH_ERR_operation_blocked)) - ret = -BCH_ERR_transaction_restart_nested; + ret = bch_err_throw(c, transaction_restart_nested); if (ret) goto err; diff --git a/fs/bcachefs/io_read.c b/fs/bcachefs/io_read.c index cc708d46557e..a77779afad01 100644 --- a/fs/bcachefs/io_read.c +++ b/fs/bcachefs/io_read.c @@ -56,7 +56,7 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target) if (!target) return false; - rcu_read_lock(); + guard(rcu)(); devs = bch2_target_to_mask(c, target) ?: &c->rw_devs[BCH_DATA_user]; @@ -73,7 +73,6 @@ static bool bch2_target_congested(struct bch_fs *c, u16 target) total += max(congested, 0LL); nr++; } - rcu_read_unlock(); return get_random_u32_below(nr * CONGESTED_MAX) < total; } @@ -138,21 +137,21 @@ static inline int should_promote(struct bch_fs *c, struct bkey_s_c k, BUG_ON(!opts.promote_target); if (!(flags & BCH_READ_may_promote)) - return -BCH_ERR_nopromote_may_not; + return bch_err_throw(c, nopromote_may_not); if (bch2_bkey_has_target(c, k, opts.promote_target)) - return -BCH_ERR_nopromote_already_promoted; + return bch_err_throw(c, nopromote_already_promoted); if (bkey_extent_is_unwritten(k)) - return -BCH_ERR_nopromote_unwritten; + return bch_err_throw(c, nopromote_unwritten); if (bch2_target_congested(c, opts.promote_target)) - return -BCH_ERR_nopromote_congested; + return bch_err_throw(c, nopromote_congested); } if (rhashtable_lookup_fast(&c->promote_table, &pos, bch_promote_params)) - return -BCH_ERR_nopromote_in_flight; + return bch_err_throw(c, nopromote_in_flight); return 0; } @@ -240,7 +239,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, struct promote_op *op = kzalloc(sizeof(*op), GFP_KERNEL); if (!op) { - ret = -BCH_ERR_nopromote_enomem; + ret = bch_err_throw(c, nopromote_enomem); goto err_put; } @@ -249,7 +248,7 @@ static struct bch_read_bio *__promote_alloc(struct btree_trans *trans, if (rhashtable_lookup_insert_fast(&c->promote_table, &op->hash, bch_promote_params)) { - ret = -BCH_ERR_nopromote_in_flight; + ret = bch_err_throw(c, nopromote_in_flight); goto err; } @@ -545,7 +544,7 @@ retry: if (!bkey_and_val_eq(k, bkey_i_to_s_c(u->k.k))) { /* extent we wanted to read no longer 
exists: */ - rbio->ret = -BCH_ERR_data_read_key_overwritten; + rbio->ret = bch_err_throw(trans->c, data_read_key_overwritten); goto err; } @@ -1036,7 +1035,7 @@ int __bch2_read_extent(struct btree_trans *trans, struct bch_read_bio *orig, if ((bch2_bkey_extent_flags(k) & BIT_ULL(BCH_EXTENT_FLAG_poisoned)) && !orig->data_update) - return -BCH_ERR_extent_poisoned; + return bch_err_throw(c, extent_poisoned); retry_pick: ret = bch2_bkey_pick_read_device(c, k, failed, &pick, dev); @@ -1074,7 +1073,7 @@ retry_pick: bch_err_ratelimited(c, "%s", buf.buf); printbuf_exit(&buf); - ret = -BCH_ERR_data_read_no_encryption_key; + ret = bch_err_throw(c, data_read_no_encryption_key); goto err; } @@ -1128,7 +1127,7 @@ retry_pick: if (ca) enumerated_ref_put(&ca->io_ref[READ], BCH_DEV_READ_REF_io_read); - rbio->ret = -BCH_ERR_data_read_buffer_too_small; + rbio->ret = bch_err_throw(c, data_read_buffer_too_small); goto out_read_done; } @@ -1333,7 +1332,7 @@ hole: * have to signal that: */ if (u) - orig->ret = -BCH_ERR_data_read_key_overwritten; + orig->ret = bch_err_throw(c, data_read_key_overwritten); zero_fill_bio_iter(&orig->bio, iter); out_read_done: @@ -1510,18 +1509,18 @@ int bch2_fs_io_read_init(struct bch_fs *c) c->opts.btree_node_size, c->opts.encoded_extent_max) / PAGE_SIZE, 0)) - return -BCH_ERR_ENOMEM_bio_bounce_pages_init; + return bch_err_throw(c, ENOMEM_bio_bounce_pages_init); if (bioset_init(&c->bio_read, 1, offsetof(struct bch_read_bio, bio), BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_bio_read_init; + return bch_err_throw(c, ENOMEM_bio_read_init); if (bioset_init(&c->bio_read_split, 1, offsetof(struct bch_read_bio, bio), BIOSET_NEED_BVECS)) - return -BCH_ERR_ENOMEM_bio_read_split_init; + return bch_err_throw(c, ENOMEM_bio_read_split_init); if (rhashtable_init(&c->promote_table, &bch_promote_params)) - return -BCH_ERR_ENOMEM_promote_table_init; + return bch_err_throw(c, ENOMEM_promote_table_init); return 0; } diff --git a/fs/bcachefs/io_read.h b/fs/bcachefs/io_read.h index c08b9c047b3e..45c959018919 100644 --- a/fs/bcachefs/io_read.h +++ b/fs/bcachefs/io_read.h @@ -91,6 +91,8 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans, return 0; *data_btree = BTREE_ID_reflink; + + struct bch_fs *c = trans->c; struct btree_iter iter; struct bkey_s_c k = bch2_lookup_indirect_extent(trans, &iter, offset_into_extent, @@ -102,10 +104,10 @@ static inline int bch2_read_indirect_extent(struct btree_trans *trans, if (bkey_deleted(k.k)) { bch2_trans_iter_exit(trans, &iter); - return -BCH_ERR_missing_indirect_extent; + return bch_err_throw(c, missing_indirect_extent); } - bch2_bkey_buf_reassemble(extent, trans->c, k); + bch2_bkey_buf_reassemble(extent, c, k); bch2_trans_iter_exit(trans, &iter); return 0; } diff --git a/fs/bcachefs/io_write.c b/fs/bcachefs/io_write.c index 52a60982a66b..88b1eec8eff3 100644 --- a/fs/bcachefs/io_write.c +++ b/fs/bcachefs/io_write.c @@ -558,6 +558,7 @@ static void bch2_write_done(struct closure *cl) static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) { + struct bch_fs *c = op->c; struct keylist *keys = &op->insert_keys; struct bkey_i *src, *dst = keys->keys, *n; @@ -569,7 +570,7 @@ static noinline int bch2_write_drop_io_error_ptrs(struct bch_write_op *op) test_bit(ptr->dev, op->failed.d)); if (!bch2_bkey_nr_ptrs(bkey_i_to_s_c(src))) - return -BCH_ERR_data_write_io; + return bch_err_throw(c, data_write_io); } if (dst != src) @@ -976,7 +977,7 @@ csum_err: op->crc.csum_type < BCH_CSUM_NR ? 
__bch2_csum_types[op->crc.csum_type] : "(unknown)"); - return -BCH_ERR_data_write_csum; + return bch_err_throw(c, data_write_csum); } static int bch2_write_extent(struct bch_write_op *op, struct write_point *wp, @@ -1208,16 +1209,13 @@ static bool bch2_extent_is_writeable(struct bch_write_op *op, e = bkey_s_c_to_extent(k); - rcu_read_lock(); + guard(rcu)(); extent_for_each_ptr_decode(e, p, entry) { - if (crc_is_encoded(p.crc) || p.has_ec) { - rcu_read_unlock(); + if (crc_is_encoded(p.crc) || p.has_ec) return false; - } replicas += bch2_extent_ptr_durability(c, &p); } - rcu_read_unlock(); return replicas >= op->opts.data_replicas; } @@ -1290,7 +1288,7 @@ static void bch2_nocow_write_convert_unwritten(struct bch_write_op *op) static void __bch2_nocow_write_done(struct bch_write_op *op) { if (unlikely(op->flags & BCH_WRITE_io_error)) { - op->error = -BCH_ERR_data_write_io; + op->error = bch_err_throw(op->c, data_write_io); } else if (unlikely(op->flags & BCH_WRITE_convert_unwritten)) bch2_nocow_write_convert_unwritten(op); } @@ -1483,10 +1481,10 @@ err_bucket_stale: "pointer to invalid bucket in nocow path on device %llu\n %s", stale_at->b.inode, (bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { - ret = -BCH_ERR_data_write_invalid_ptr; + ret = bch_err_throw(c, data_write_invalid_ptr); } else { /* We can retry this: */ - ret = -BCH_ERR_transaction_restart; + ret = bch_err_throw(c, transaction_restart); } printbuf_exit(&buf); @@ -1693,18 +1691,18 @@ CLOSURE_CALLBACK(bch2_write) if (unlikely(bio->bi_iter.bi_size & (c->opts.block_size - 1))) { bch2_write_op_error(op, op->pos.offset, "misaligned write"); - op->error = -BCH_ERR_data_write_misaligned; + op->error = bch_err_throw(c, data_write_misaligned); goto err; } if (c->opts.nochanges) { - op->error = -BCH_ERR_erofs_no_writes; + op->error = bch_err_throw(c, erofs_no_writes); goto err; } if (!(op->flags & BCH_WRITE_move) && !enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_write)) { - op->error = -BCH_ERR_erofs_no_writes; + op->error = bch_err_throw(c, erofs_no_writes); goto err; } @@ -1776,7 +1774,7 @@ int bch2_fs_io_write_init(struct bch_fs *c) { if (bioset_init(&c->bio_write, 1, offsetof(struct bch_write_bio, bio), BIOSET_NEED_BVECS) || bioset_init(&c->replica_set, 4, offsetof(struct bch_write_bio, bio), 0)) - return -BCH_ERR_ENOMEM_bio_write_init; + return bch_err_throw(c, ENOMEM_bio_write_init); return 0; } diff --git a/fs/bcachefs/journal.c b/fs/bcachefs/journal.c index 09b70fd140a1..dda802a656cf 100644 --- a/fs/bcachefs/journal.c +++ b/fs/bcachefs/journal.c @@ -397,7 +397,7 @@ static int journal_entry_open(struct journal *j) BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); if (j->blocked) - return -BCH_ERR_journal_blocked; + return bch_err_throw(c, journal_blocked); if (j->cur_entry_error) return j->cur_entry_error; @@ -407,23 +407,23 @@ static int journal_entry_open(struct journal *j) return ret; if (!fifo_free(&j->pin)) - return -BCH_ERR_journal_pin_full; + return bch_err_throw(c, journal_pin_full); if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf)) - return -BCH_ERR_journal_max_in_flight; + return bch_err_throw(c, journal_max_in_flight); if (atomic64_read(&j->seq) - j->seq_write_started == JOURNAL_STATE_BUF_NR) - return -BCH_ERR_journal_max_open; + return bch_err_throw(c, journal_max_open); if (unlikely(journal_cur_seq(j) >= JOURNAL_SEQ_MAX)) { bch_err(c, "cannot start: journal seq overflow"); if (bch2_fs_emergency_read_only_locked(c)) bch_err(c, "fatal error - emergency read only"); - return -BCH_ERR_journal_shutdown; + return 
bch_err_throw(c, journal_shutdown); } if (!j->free_buf && !buf->data) - return -BCH_ERR_journal_buf_enomem; /* will retry after write completion frees up a buf */ + return bch_err_throw(c, journal_buf_enomem); /* will retry after write completion frees up a buf */ BUG_ON(!j->cur_entry_sectors); @@ -447,7 +447,7 @@ static int journal_entry_open(struct journal *j) u64s = clamp_t(int, u64s, 0, JOURNAL_ENTRY_CLOSED_VAL - 1); if (u64s <= (ssize_t) j->early_journal_entries.nr) - return -BCH_ERR_journal_full; + return bch_err_throw(c, journal_full); if (fifo_empty(&j->pin) && j->reclaim_thread) wake_up_process(j->reclaim_thread); @@ -464,7 +464,7 @@ static int journal_entry_open(struct journal *j) journal_cur_seq(j)); if (bch2_fs_emergency_read_only_locked(c)) bch_err(c, "fatal error - emergency read only"); - return -BCH_ERR_journal_shutdown; + return bch_err_throw(c, journal_shutdown); } BUG_ON(j->pin.back - 1 != atomic64_read(&j->seq)); @@ -597,16 +597,16 @@ retry: return ret; if (j->blocked) - return -BCH_ERR_journal_blocked; + return bch_err_throw(c, journal_blocked); if ((flags & BCH_WATERMARK_MASK) < j->watermark) { - ret = -BCH_ERR_journal_full; + ret = bch_err_throw(c, journal_full); can_discard = j->can_discard; goto out; } if (nr_unwritten_journal_entries(j) == ARRAY_SIZE(j->buf) && !journal_entry_is_open(j)) { - ret = -BCH_ERR_journal_max_in_flight; + ret = bch_err_throw(c, journal_max_in_flight); goto out; } @@ -647,7 +647,7 @@ out: goto retry; if (journal_error_check_stuck(j, ret, flags)) - ret = -BCH_ERR_journal_stuck; + ret = bch_err_throw(c, journal_stuck); if (ret == -BCH_ERR_journal_max_in_flight && track_event_change(&c->times[BCH_TIME_blocked_journal_max_in_flight], true) && @@ -708,10 +708,9 @@ static unsigned max_dev_latency(struct bch_fs *c) { u64 nsecs = 0; - rcu_read_lock(); + guard(rcu)(); for_each_rw_member_rcu(c, ca) nsecs = max(nsecs, ca->io_latency[WRITE].stats.max_duration); - rcu_read_unlock(); return nsecs_to_jiffies(nsecs); } @@ -813,6 +812,7 @@ out: int bch2_journal_flush_seq_async(struct journal *j, u64 seq, struct closure *parent) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_buf *buf; int ret = 0; @@ -828,7 +828,7 @@ int bch2_journal_flush_seq_async(struct journal *j, u64 seq, /* Recheck under lock: */ if (j->err_seq && seq >= j->err_seq) { - ret = -BCH_ERR_journal_flush_err; + ret = bch_err_throw(c, journal_flush_err); goto out; } @@ -999,7 +999,7 @@ int bch2_journal_meta(struct journal *j) struct bch_fs *c = container_of(j, struct bch_fs, journal); if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_journal)) - return -BCH_ERR_erofs_no_writes; + return bch_err_throw(c, erofs_no_writes); int ret = __bch2_journal_meta(j); enumerated_ref_put(&c->writes, BCH_WRITE_REF_journal); @@ -1132,7 +1132,7 @@ static int bch2_set_nr_journal_buckets_iter(struct bch_dev *ca, unsigned nr, new_buckets = kcalloc(nr, sizeof(u64), GFP_KERNEL); new_bucket_seq = kcalloc(nr, sizeof(u64), GFP_KERNEL); if (!bu || !ob || !new_buckets || !new_bucket_seq) { - ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets; + ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets); goto err_free; } @@ -1304,6 +1304,66 @@ int bch2_set_nr_journal_buckets(struct bch_fs *c, struct bch_dev *ca, return ret; } +int bch2_dev_journal_bucket_delete(struct bch_dev *ca, u64 b) +{ + struct bch_fs *c = ca->fs; + struct journal *j = &c->journal; + struct journal_device *ja = &ca->journal; + + guard(mutex)(&c->sb_lock); + unsigned pos; + for (pos = 0; pos < ja->nr; pos++) + if 
(ja->buckets[pos] == b) + break; + + if (pos == ja->nr) { + bch_err(ca, "journal bucket %llu not found when deleting", b); + return -EINVAL; + } + + u64 *new_buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL);; + if (!new_buckets) + return bch_err_throw(c, ENOMEM_set_nr_journal_buckets); + + memcpy(new_buckets, ja->buckets, ja->nr * sizeof(u64)); + memmove(&new_buckets[pos], + &new_buckets[pos + 1], + (ja->nr - 1 - pos) * sizeof(new_buckets[0])); + + int ret = bch2_journal_buckets_to_sb(c, ca, ja->buckets, ja->nr - 1) ?: + bch2_write_super(c); + if (ret) { + kfree(new_buckets); + return ret; + } + + scoped_guard(spinlock, &j->lock) { + if (pos < ja->discard_idx) + --ja->discard_idx; + if (pos < ja->dirty_idx_ondisk) + --ja->dirty_idx_ondisk; + if (pos < ja->dirty_idx) + --ja->dirty_idx; + if (pos < ja->cur_idx) + --ja->cur_idx; + + ja->nr--; + + memmove(&ja->buckets[pos], + &ja->buckets[pos + 1], + (ja->nr - pos) * sizeof(ja->buckets[0])); + + memmove(&ja->bucket_seq[pos], + &ja->bucket_seq[pos + 1], + (ja->nr - pos) * sizeof(ja->bucket_seq[0])); + + bch2_journal_space_available(j); + } + + kfree(new_buckets); + return 0; +} + int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) { struct bch_fs *c = ca->fs; @@ -1313,14 +1373,14 @@ int bch2_dev_journal_alloc(struct bch_dev *ca, bool new_fs) if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { bch_err(c, "cannot allocate journal, filesystem is an unresized image file"); - return -BCH_ERR_erofs_filesystem_full; + return bch_err_throw(c, erofs_filesystem_full); } unsigned nr; int ret; if (dynamic_fault("bcachefs:add:journal_alloc")) { - ret = -BCH_ERR_ENOMEM_set_nr_journal_buckets; + ret = bch_err_throw(c, ENOMEM_set_nr_journal_buckets); goto err; } @@ -1459,7 +1519,7 @@ int bch2_fs_journal_start(struct journal *j, u64 cur_seq) init_fifo(&j->pin, roundup_pow_of_two(nr), GFP_KERNEL); if (!j->pin.data) { bch_err(c, "error reallocating journal fifo (%llu open entries)", nr); - return -BCH_ERR_ENOMEM_journal_pin_fifo; + return bch_err_throw(c, ENOMEM_journal_pin_fifo); } j->replay_journal_seq = last_seq; @@ -1547,6 +1607,7 @@ void bch2_dev_journal_exit(struct bch_dev *ca) int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) { + struct bch_fs *c = ca->fs; struct journal_device *ja = &ca->journal; struct bch_sb_field_journal *journal_buckets = bch2_sb_field_get(sb, journal); @@ -1566,7 +1627,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) ja->bucket_seq = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->bucket_seq) - return -BCH_ERR_ENOMEM_dev_journal_init; + return bch_err_throw(c, ENOMEM_dev_journal_init); unsigned nr_bvecs = DIV_ROUND_UP(JOURNAL_ENTRY_SIZE_MAX, PAGE_SIZE); @@ -1574,7 +1635,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) ja->bio[i] = kzalloc(struct_size(ja->bio[i], bio.bi_inline_vecs, nr_bvecs), GFP_KERNEL); if (!ja->bio[i]) - return -BCH_ERR_ENOMEM_dev_journal_init; + return bch_err_throw(c, ENOMEM_dev_journal_init); ja->bio[i]->ca = ca; ja->bio[i]->buf_idx = i; @@ -1583,7 +1644,7 @@ int bch2_dev_journal_init(struct bch_dev *ca, struct bch_sb *sb) ja->buckets = kcalloc(ja->nr, sizeof(u64), GFP_KERNEL); if (!ja->buckets) - return -BCH_ERR_ENOMEM_dev_journal_init; + return bch_err_throw(c, ENOMEM_dev_journal_init); if (journal_buckets_v2) { unsigned nr = bch2_sb_field_journal_v2_nr_entries(journal_buckets_v2); @@ -1637,10 +1698,12 @@ void bch2_fs_journal_init_early(struct journal *j) int bch2_fs_journal_init(struct journal *j) { + struct bch_fs *c = 
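
The new bch2_dev_journal_bucket_delete() above has to keep the ring indices (discard_idx, dirty_idx_ondisk, dirty_idx, cur_idx) valid while closing the gap in the bucket arrays: any index pointing past the removed slot moves down by one. The core step, as a small self-contained sketch with a single cursor (the real function adjusts all four under j->lock):

	#include <stdint.h>
	#include <string.h>

	static void delete_slot(uint64_t *buckets, unsigned *nr, unsigned pos,
				unsigned *cur_idx)
	{
		if (pos < *cur_idx)	/* cursor pointed past the hole: shift it down */
			--(*cur_idx);

		--(*nr);
		memmove(&buckets[pos], &buckets[pos + 1],
			(*nr - pos) * sizeof(buckets[0]));
	}
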
container_of(j, struct bch_fs, journal); + j->free_buf_size = j->buf_size_want = JOURNAL_ENTRY_SIZE_MIN; j->free_buf = kvmalloc(j->free_buf_size, GFP_KERNEL); if (!j->free_buf) - return -BCH_ERR_ENOMEM_journal_buf; + return bch_err_throw(c, ENOMEM_journal_buf); for (unsigned i = 0; i < ARRAY_SIZE(j->buf); i++) j->buf[i].idx = i; @@ -1648,7 +1711,7 @@ int bch2_fs_journal_init(struct journal *j) j->wq = alloc_workqueue("bcachefs_journal", WQ_HIGHPRI|WQ_FREEZABLE|WQ_UNBOUND|WQ_MEM_RECLAIM, 512); if (!j->wq) - return -BCH_ERR_ENOMEM_fs_other_alloc; + return bch_err_throw(c, ENOMEM_fs_other_alloc); return 0; } @@ -1672,7 +1735,7 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) printbuf_tabstop_push(out, 28); out->atomic++; - rcu_read_lock(); + guard(rcu)(); s = READ_ONCE(j->reservations); prt_printf(out, "flags:\t"); @@ -1763,8 +1826,6 @@ void __bch2_journal_debug_to_text(struct printbuf *out, struct journal *j) prt_printf(out, "replicas want %u need %u\n", c->opts.metadata_replicas, c->opts.metadata_replicas_required); - rcu_read_unlock(); - --out->atomic; } diff --git a/fs/bcachefs/journal.h b/fs/bcachefs/journal.h index 8ff00a0ec778..83734fe4331f 100644 --- a/fs/bcachefs/journal.h +++ b/fs/bcachefs/journal.h @@ -444,8 +444,9 @@ struct journal_buf *bch2_next_write_buffer_flush_journal_buf(struct journal *, u void __bch2_journal_debug_to_text(struct printbuf *, struct journal *); void bch2_journal_debug_to_text(struct printbuf *, struct journal *); -int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, - unsigned nr); +int bch2_set_nr_journal_buckets(struct bch_fs *, struct bch_dev *, unsigned); +int bch2_dev_journal_bucket_delete(struct bch_dev *, u64); + int bch2_dev_journal_alloc(struct bch_dev *, bool); int bch2_fs_journal_alloc(struct bch_fs *); diff --git a/fs/bcachefs/journal_io.c b/fs/bcachefs/journal_io.c index 63bb207208b2..0b15d71a8d2d 100644 --- a/fs/bcachefs/journal_io.c +++ b/fs/bcachefs/journal_io.c @@ -49,25 +49,27 @@ void bch2_journal_pos_from_member_info_resume(struct bch_fs *c) mutex_unlock(&c->sb_lock); } -void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, - struct journal_replay *j) +static void bch2_journal_ptr_to_text(struct printbuf *out, struct bch_fs *c, struct journal_ptr *p) +{ + struct bch_dev *ca = bch2_dev_tryget_noerror(c, p->dev); + prt_printf(out, "%s %u:%u:%u (sector %llu)", + ca ? 
ca->name : "(invalid dev)", + p->dev, p->bucket, p->bucket_offset, p->sector); + bch2_dev_put(ca); +} + +void bch2_journal_ptrs_to_text(struct printbuf *out, struct bch_fs *c, struct journal_replay *j) { darray_for_each(j->ptrs, i) { if (i != j->ptrs.data) prt_printf(out, " "); - prt_printf(out, "%u:%u:%u (sector %llu)", - i->dev, i->bucket, i->bucket_offset, i->sector); + bch2_journal_ptr_to_text(out, c, i); } } -static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, - struct journal_replay *j) +static void bch2_journal_datetime_to_text(struct printbuf *out, struct jset *j) { - prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); - - bch2_journal_ptrs_to_text(out, c, j); - - for_each_jset_entry_type(entry, &j->j, BCH_JSET_ENTRY_datetime) { + for_each_jset_entry_type(entry, j, BCH_JSET_ENTRY_datetime) { struct jset_entry_datetime *datetime = container_of(entry, struct jset_entry_datetime, entry); bch2_prt_datetime(out, le64_to_cpu(datetime->seconds)); @@ -75,6 +77,15 @@ static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, } } +static void bch2_journal_replay_to_text(struct printbuf *out, struct bch_fs *c, + struct journal_replay *j) +{ + prt_printf(out, "seq %llu ", le64_to_cpu(j->j.seq)); + bch2_journal_datetime_to_text(out, &j->j); + prt_char(out, ' '); + bch2_journal_ptrs_to_text(out, c, j); +} + static struct nonce journal_nonce(const struct jset *jset) { return (struct nonce) {{ @@ -188,7 +199,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, journal_entry_radix_idx(c, le64_to_cpu(j->seq)), GFP_KERNEL); if (!_i) - return -BCH_ERR_ENOMEM_journal_entry_add; + return bch_err_throw(c, ENOMEM_journal_entry_add); /* * Duplicate journal entries? If so we want the one that didn't have a @@ -231,7 +242,7 @@ static int journal_entry_add(struct bch_fs *c, struct bch_dev *ca, replace: i = kvmalloc(offsetof(struct journal_replay, j) + bytes, GFP_KERNEL); if (!i) - return -BCH_ERR_ENOMEM_journal_entry_add; + return bch_err_throw(c, ENOMEM_journal_entry_add); darray_init(&i->ptrs); i->csum_good = entry_ptr.csum_good; @@ -311,7 +322,7 @@ static void journal_entry_err_msg(struct printbuf *out, bch2_sb_error_count(c, BCH_FSCK_ERR_##_err); \ if (bch2_fs_inconsistent(c, \ "corrupt metadata before write: %s\n", _buf.buf)) {\ - ret = -BCH_ERR_fsck_errors_not_fixed; \ + ret = bch_err_throw(c, fsck_errors_not_fixed); \ goto fsck_err; \ } \ break; \ @@ -418,6 +429,10 @@ static void journal_entry_btree_keys_to_text(struct printbuf *out, struct bch_fs bool first = true; jset_entry_for_each_key(entry, k) { + /* We may be called on entries that haven't been validated: */ + if (!k->k.u64s) + break; + if (!first) { prt_newline(out); bch2_prt_jset_entry_type(out, entry->type); @@ -1005,19 +1020,19 @@ struct journal_read_buf { size_t size; }; -static int journal_read_buf_realloc(struct journal_read_buf *b, +static int journal_read_buf_realloc(struct bch_fs *c, struct journal_read_buf *b, size_t new_size) { void *n; /* the bios are sized for this many pages, max: */ if (new_size > JOURNAL_ENTRY_SIZE_MAX) - return -BCH_ERR_ENOMEM_journal_read_buf_realloc; + return bch_err_throw(c, ENOMEM_journal_read_buf_realloc); new_size = roundup_pow_of_two(new_size); n = kvmalloc(new_size, GFP_KERNEL); if (!n) - return -BCH_ERR_ENOMEM_journal_read_buf_realloc; + return bch_err_throw(c, ENOMEM_journal_read_buf_realloc); kvfree(b->data); b->data = n; @@ -1037,7 +1052,6 @@ static int journal_read_bucket(struct bch_dev *ca, u64 offset = bucket_to_sector(ca, 
ja->buckets[bucket]), end = offset + ca->mi.bucket_size; bool saw_bad = false, csum_good; - struct printbuf err = PRINTBUF; int ret = 0; pr_debug("reading %u", bucket); @@ -1053,7 +1067,7 @@ reread: bio = bio_kmalloc(nr_bvecs, GFP_KERNEL); if (!bio) - return -BCH_ERR_ENOMEM_journal_read_bucket; + return bch_err_throw(c, ENOMEM_journal_read_bucket); bio_init(bio, ca->disk_sb.bdev, bio->bi_inline_vecs, nr_bvecs, REQ_OP_READ); bio->bi_iter.bi_sector = offset; @@ -1064,7 +1078,7 @@ reread: kfree(bio); if (!ret && bch2_meta_read_fault("journal")) - ret = -BCH_ERR_EIO_fault_injected; + ret = bch_err_throw(c, EIO_fault_injected); bch2_account_io_completion(ca, BCH_MEMBER_ERROR_read, submit_time, !ret); @@ -1078,7 +1092,7 @@ reread: * found on a different device, and missing or * no journal entries will be handled later */ - goto out; + return 0; } j = buf->data; @@ -1092,15 +1106,15 @@ reread: break; case JOURNAL_ENTRY_REREAD: if (vstruct_bytes(j) > buf->size) { - ret = journal_read_buf_realloc(buf, + ret = journal_read_buf_realloc(c, buf, vstruct_bytes(j)); if (ret) - goto err; + return ret; } goto reread; case JOURNAL_ENTRY_NONE: if (!saw_bad) - goto out; + return 0; /* * On checksum error we don't really trust the size * field of the journal entry we read, so try reading @@ -1109,7 +1123,7 @@ reread: sectors = block_sectors(c); goto next_block; default: - goto err; + return ret; } if (le64_to_cpu(j->seq) > ja->highest_seq_found) { @@ -1126,22 +1140,20 @@ reread: * bucket: */ if (le64_to_cpu(j->seq) < ja->bucket_seq[bucket]) - goto out; + return 0; ja->bucket_seq[bucket] = le64_to_cpu(j->seq); - enum bch_csum_type csum_type = JSET_CSUM_TYPE(j); struct bch_csum csum; csum_good = jset_csum_good(c, j, &csum); bch2_account_io_completion(ca, BCH_MEMBER_ERROR_checksum, 0, csum_good); if (!csum_good) { - bch_err_dev_ratelimited(ca, "%s", - (printbuf_reset(&err), - prt_str(&err, "journal "), - bch2_csum_err_msg(&err, csum_type, j->csum, csum), - err.buf)); + /* + * Don't print an error here, we'll print the error + * later if we need this journal entry + */ saw_bad = true; } @@ -1153,6 +1165,7 @@ reread: mutex_lock(&jlist->lock); ret = journal_entry_add(c, ca, (struct journal_ptr) { .csum_good = csum_good, + .csum = csum, .dev = ca->dev_idx, .bucket = bucket, .bucket_offset = offset - @@ -1167,7 +1180,7 @@ reread: case JOURNAL_ENTRY_ADD_OUT_OF_RANGE: break; default: - goto err; + return ret; } next_block: pr_debug("next"); @@ -1176,11 +1189,7 @@ next_block: j = ((void *) j) + (sectors << 9); } -out: - ret = 0; -err: - printbuf_exit(&err); - return ret; + return 0; } static CLOSURE_CALLBACK(bch2_journal_read_device) @@ -1197,7 +1206,7 @@ static CLOSURE_CALLBACK(bch2_journal_read_device) if (!ja->nr) goto out; - ret = journal_read_buf_realloc(&buf, PAGE_SIZE); + ret = journal_read_buf_realloc(c, &buf, PAGE_SIZE); if (ret) goto err; @@ -1229,13 +1238,105 @@ err: goto out; } +noinline_for_stack +static void bch2_journal_print_checksum_error(struct bch_fs *c, struct journal_replay *j) +{ + struct printbuf buf = PRINTBUF; + enum bch_csum_type csum_type = JSET_CSUM_TYPE(&j->j); + bool have_good = false; + + prt_printf(&buf, "invalid journal checksum(s) at seq %llu ", le64_to_cpu(j->j.seq)); + bch2_journal_datetime_to_text(&buf, &j->j); + prt_newline(&buf); + + darray_for_each(j->ptrs, ptr) + if (!ptr->csum_good) { + bch2_journal_ptr_to_text(&buf, c, ptr); + prt_char(&buf, ' '); + bch2_csum_to_text(&buf, csum_type, ptr->csum); + prt_newline(&buf); + } else { + have_good = true; + } + + prt_printf(&buf, "should 
be "); + bch2_csum_to_text(&buf, csum_type, j->j.csum); + + if (have_good) + prt_printf(&buf, "\n(had good copy on another device)"); + + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); +} + +noinline_for_stack +static int bch2_journal_check_for_missing(struct bch_fs *c, u64 start_seq, u64 end_seq) +{ + struct printbuf buf = PRINTBUF; + int ret = 0; + + struct genradix_iter radix_iter; + struct journal_replay *i, **_i, *prev = NULL; + u64 seq = start_seq; + + genradix_for_each(&c->journal_entries, radix_iter, _i) { + i = *_i; + + if (journal_replay_ignore(i)) + continue; + + BUG_ON(seq > le64_to_cpu(i->j.seq)); + + while (seq < le64_to_cpu(i->j.seq)) { + while (seq < le64_to_cpu(i->j.seq) && + bch2_journal_seq_is_blacklisted(c, seq, false)) + seq++; + + if (seq == le64_to_cpu(i->j.seq)) + break; + + u64 missing_start = seq; + + while (seq < le64_to_cpu(i->j.seq) && + !bch2_journal_seq_is_blacklisted(c, seq, false)) + seq++; + + u64 missing_end = seq - 1; + + printbuf_reset(&buf); + prt_printf(&buf, "journal entries %llu-%llu missing! (replaying %llu-%llu)", + missing_start, missing_end, + start_seq, end_seq); + + prt_printf(&buf, "\nprev at "); + if (prev) { + bch2_journal_ptrs_to_text(&buf, c, prev); + prt_printf(&buf, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); + } else + prt_printf(&buf, "(none)"); + + prt_printf(&buf, "\nnext at "); + bch2_journal_ptrs_to_text(&buf, c, i); + prt_printf(&buf, ", continue?"); + + fsck_err(c, journal_entries_missing, "%s", buf.buf); + } + + prev = i; + seq++; + } +fsck_err: + printbuf_exit(&buf); + return ret; +} + int bch2_journal_read(struct bch_fs *c, u64 *last_seq, u64 *blacklist_seq, u64 *start_seq) { struct journal_list jlist; - struct journal_replay *i, **_i, *prev = NULL; + struct journal_replay *i, **_i; struct genradix_iter radix_iter; struct printbuf buf = PRINTBUF; bool degraded = false, last_write_torn = false; @@ -1326,12 +1427,12 @@ int bch2_journal_read(struct bch_fs *c, return 0; } - bch_info(c, "journal read done, replaying entries %llu-%llu", - *last_seq, *blacklist_seq - 1); - + printbuf_reset(&buf); + prt_printf(&buf, "journal read done, replaying entries %llu-%llu", + *last_seq, *blacklist_seq - 1); if (*start_seq != *blacklist_seq) - bch_info(c, "dropped unflushed entries %llu-%llu", - *blacklist_seq, *start_seq - 1); + prt_printf(&buf, " (unflushed %llu-%llu)", *blacklist_seq, *start_seq - 1); + bch_info(c, "%s", buf.buf); /* Drop blacklisted entries and entries older than last_seq: */ genradix_for_each(&c->journal_entries, radix_iter, _i) { @@ -1354,56 +1455,9 @@ int bch2_journal_read(struct bch_fs *c, } } - /* Check for missing entries: */ - seq = *last_seq; - genradix_for_each(&c->journal_entries, radix_iter, _i) { - i = *_i; - - if (journal_replay_ignore(i)) - continue; - - BUG_ON(seq > le64_to_cpu(i->j.seq)); - - while (seq < le64_to_cpu(i->j.seq)) { - u64 missing_start, missing_end; - struct printbuf buf1 = PRINTBUF, buf2 = PRINTBUF; - - while (seq < le64_to_cpu(i->j.seq) && - bch2_journal_seq_is_blacklisted(c, seq, false)) - seq++; - - if (seq == le64_to_cpu(i->j.seq)) - break; - - missing_start = seq; - - while (seq < le64_to_cpu(i->j.seq) && - !bch2_journal_seq_is_blacklisted(c, seq, false)) - seq++; - - if (prev) { - bch2_journal_ptrs_to_text(&buf1, c, prev); - prt_printf(&buf1, " size %zu", vstruct_sectors(&prev->j, c->block_bits)); - } else - prt_printf(&buf1, "(none)"); - bch2_journal_ptrs_to_text(&buf2, c, i); - - missing_end = seq - 1; - fsck_err(c, journal_entries_missing, - "journal entries 
%llu-%llu missing! (replaying %llu-%llu)\n" - "prev at %s\n" - "next at %s, continue?", - missing_start, missing_end, - *last_seq, *blacklist_seq - 1, - buf1.buf, buf2.buf); - - printbuf_exit(&buf1); - printbuf_exit(&buf2); - } - - prev = i; - seq++; - } + ret = bch2_journal_check_for_missing(c, *last_seq, *blacklist_seq - 1); + if (ret) + goto err; genradix_for_each(&c->journal_entries, radix_iter, _i) { union bch_replicas_padded replicas = { @@ -1416,15 +1470,15 @@ int bch2_journal_read(struct bch_fs *c, if (journal_replay_ignore(i)) continue; - darray_for_each(i->ptrs, ptr) { - struct bch_dev *ca = bch2_dev_have_ref(c, ptr->dev); - - if (!ptr->csum_good) - bch_err_dev_offset(ca, ptr->sector, - "invalid journal checksum, seq %llu%s", - le64_to_cpu(i->j.seq), - i->csum_good ? " (had good copy on another device)" : ""); - } + /* + * Don't print checksum errors until we know we're going to use + * a given journal entry: + */ + darray_for_each(i->ptrs, ptr) + if (!ptr->csum_good) { + bch2_journal_print_checksum_error(c, i); + break; + } ret = jset_validate(c, bch2_dev_have_ref(c, i->ptrs.data[0].dev), @@ -1467,7 +1521,7 @@ static void journal_advance_devs_to_next_bucket(struct journal *j, { struct bch_fs *c = container_of(j, struct bch_fs, journal); - rcu_read_lock(); + guard(rcu)(); darray_for_each(*devs, i) { struct bch_dev *ca = rcu_dereference(c->devs[*i]); if (!ca) @@ -1489,7 +1543,6 @@ static void journal_advance_devs_to_next_bucket(struct journal *j, ja->bucket_seq[ja->cur_idx] = le64_to_cpu(seq); } } - rcu_read_unlock(); } static void __journal_write_alloc(struct journal *j, @@ -1559,7 +1612,7 @@ static int journal_write_alloc(struct journal *j, struct journal_buf *w, retry_target: devs = target_rw_devs(c, BCH_DATA_journal, target); - devs_sorted = bch2_dev_alloc_list(c, &j->wp.stripe, &devs); + bch2_dev_alloc_list(c, &j->wp.stripe, &devs, &devs_sorted); retry_alloc: __journal_write_alloc(j, w, &devs_sorted, sectors, replicas, replicas_want); @@ -1581,6 +1634,16 @@ retry_alloc: done: BUG_ON(bkey_val_u64s(&w->key.k) > BCH_REPLICAS_MAX); +#if 0 + /* + * XXX: we need a way to alert the user when we go degraded for any + * reason + */ + if (*replicas < min(replicas_want, + dev_mask_nr(&c->rw_devs[BCH_DATA_free]))) { + } +#endif + return *replicas >= replicas_need ? 
0 : -BCH_ERR_insufficient_journal_devices; } @@ -1628,7 +1691,7 @@ static CLOSURE_CALLBACK(journal_write_done) : j->noflush_write_time, j->write_start_time); if (!w->devs_written.nr) { - err = -BCH_ERR_journal_write_err; + err = bch_err_throw(c, journal_write_err); } else { bch2_devlist_to_replicas(&replicas.e, BCH_DATA_journal, w->devs_written); @@ -2058,7 +2121,7 @@ CLOSURE_CALLBACK(bch2_journal_write) struct journal *j = container_of(w, struct journal, buf[w->idx]); struct bch_fs *c = container_of(j, struct bch_fs, journal); union bch_replicas_padded replicas; - unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_journal]); + unsigned nr_rw_members = dev_mask_nr(&c->rw_devs[BCH_DATA_free]); int ret; BUG_ON(BCH_SB_CLEAN(c->disk_sb.sb)); diff --git a/fs/bcachefs/journal_io.h b/fs/bcachefs/journal_io.h index 12b39fcb4424..6fa82c4050fe 100644 --- a/fs/bcachefs/journal_io.h +++ b/fs/bcachefs/journal_io.h @@ -9,6 +9,7 @@ void bch2_journal_pos_from_member_info_resume(struct bch_fs *); struct journal_ptr { bool csum_good; + struct bch_csum csum; u8 dev; u32 bucket; u32 bucket_offset; diff --git a/fs/bcachefs/journal_reclaim.c b/fs/bcachefs/journal_reclaim.c index 70f36f6bc482..cd6201741c59 100644 --- a/fs/bcachefs/journal_reclaim.c +++ b/fs/bcachefs/journal_reclaim.c @@ -83,18 +83,20 @@ static struct journal_space journal_dev_space_available(struct journal *j, struct bch_dev *ca, enum journal_space_from from) { + struct bch_fs *c = container_of(j, struct bch_fs, journal); struct journal_device *ja = &ca->journal; unsigned sectors, buckets, unwritten; + unsigned bucket_size_aligned = round_down(ca->mi.bucket_size, block_sectors(c)); u64 seq; if (from == journal_space_total) return (struct journal_space) { - .next_entry = ca->mi.bucket_size, - .total = ca->mi.bucket_size * ja->nr, + .next_entry = bucket_size_aligned, + .total = bucket_size_aligned * ja->nr, }; buckets = bch2_journal_dev_buckets_available(j, ja, from); - sectors = ja->sectors_free; + sectors = round_down(ja->sectors_free, block_sectors(c)); /* * We that we don't allocate the space for a journal entry @@ -109,7 +111,7 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca, continue; /* entry won't fit on this device, skip: */ - if (unwritten > ca->mi.bucket_size) + if (unwritten > bucket_size_aligned) continue; if (unwritten >= sectors) { @@ -119,7 +121,7 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca, } buckets--; - sectors = ca->mi.bucket_size; + sectors = bucket_size_aligned; } sectors -= unwritten; @@ -127,12 +129,12 @@ journal_dev_space_available(struct journal *j, struct bch_dev *ca, if (sectors < ca->mi.bucket_size && buckets) { buckets--; - sectors = ca->mi.bucket_size; + sectors = bucket_size_aligned; } return (struct journal_space) { .next_entry = sectors, - .total = sectors + buckets * ca->mi.bucket_size, + .total = sectors + buckets * bucket_size_aligned, }; } @@ -146,7 +148,6 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne BUG_ON(nr_devs_want > ARRAY_SIZE(dev_space)); - rcu_read_lock(); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { if (!ca->journal.nr || !ca->mi.durability) @@ -164,7 +165,6 @@ static struct journal_space __journal_space_available(struct journal *j, unsigne array_insert_item(dev_space, nr_devs, pos, space); } - rcu_read_unlock(); if (nr_devs < nr_devs_want) return (struct journal_space) { 0, 0 }; @@ -189,8 +189,8 @@ void bch2_journal_space_available(struct journal *j) int ret = 0; 
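
journal_dev_space_available() above now rounds both the bucket size and ja->sectors_free down to a whole number of filesystem blocks, rather than rounding the final next_entry value later in bch2_journal_space_available(). For a power-of-two block size this is the usual mask trick; a named stand-in for the kernel's round_down() helper:

	/* drop the partial block; 'block' is assumed to be a power of two */
	#define round_down_pow2(x, block)	((x) & ~((typeof(x))(block) - 1))

	/* e.g. a 2053-sector bucket with 8-sector blocks yields 2048 usable sectors */
	static const unsigned usable_sectors = round_down_pow2(2053u, 8u);	/* 2048 */
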
lockdep_assert_held(&j->lock); + guard(rcu)(); - rcu_read_lock(); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) { struct journal_device *ja = &ca->journal; @@ -210,7 +210,6 @@ void bch2_journal_space_available(struct journal *j) max_entry_size = min_t(unsigned, max_entry_size, ca->mi.bucket_size); nr_online++; } - rcu_read_unlock(); j->can_discard = can_discard; @@ -221,15 +220,13 @@ void bch2_journal_space_available(struct journal *j) prt_printf(&buf, "insufficient writeable journal devices available: have %u, need %u\n" "rw journal devs:", nr_online, metadata_replicas_required(c)); - rcu_read_lock(); for_each_member_device_rcu(c, ca, &c->rw_devs[BCH_DATA_journal]) prt_printf(&buf, " %s", ca->name); - rcu_read_unlock(); bch_err(c, "%s", buf.buf); printbuf_exit(&buf); } - ret = -BCH_ERR_insufficient_journal_devices; + ret = bch_err_throw(c, insufficient_journal_devices); goto out; } @@ -243,7 +240,7 @@ void bch2_journal_space_available(struct journal *j) total = j->space[journal_space_total].total; if (!j->space[journal_space_discarded].next_entry) - ret = -BCH_ERR_journal_full; + ret = bch_err_throw(c, journal_full); if ((j->space[journal_space_clean_ondisk].next_entry < j->space[journal_space_clean_ondisk].total) && @@ -256,8 +253,7 @@ void bch2_journal_space_available(struct journal *j) bch2_journal_set_watermark(j); out: j->cur_entry_sectors = !ret - ? round_down(j->space[journal_space_discarded].next_entry, - block_sectors(c)) + ? j->space[journal_space_discarded].next_entry : 0; j->cur_entry_error = ret; @@ -625,9 +621,9 @@ static u64 journal_seq_to_flush(struct journal *j) struct bch_fs *c = container_of(j, struct bch_fs, journal); u64 seq_to_flush = 0; - spin_lock(&j->lock); + guard(spinlock)(&j->lock); + guard(rcu)(); - rcu_read_lock(); for_each_rw_member_rcu(c, ca) { struct journal_device *ja = &ca->journal; unsigned nr_buckets, bucket_to_flush; @@ -642,15 +638,11 @@ static u64 journal_seq_to_flush(struct journal *j) seq_to_flush = max(seq_to_flush, ja->bucket_seq[bucket_to_flush]); } - rcu_read_unlock(); /* Also flush if the pin fifo is more than half full */ - seq_to_flush = max_t(s64, seq_to_flush, - (s64) journal_cur_seq(j) - - (j->pin.size >> 1)); - spin_unlock(&j->lock); - - return seq_to_flush; + return max_t(s64, seq_to_flush, + (s64) journal_cur_seq(j) - + (j->pin.size >> 1)); } /** diff --git a/fs/bcachefs/journal_sb.c b/fs/bcachefs/journal_sb.c index 62b910f2fb27..0cb9b93f13e7 100644 --- a/fs/bcachefs/journal_sb.c +++ b/fs/bcachefs/journal_sb.c @@ -210,7 +210,7 @@ int bch2_journal_buckets_to_sb(struct bch_fs *c, struct bch_dev *ca, j = bch2_sb_field_resize(&ca->disk_sb, journal_v2, (sizeof(*j) + sizeof(j->d[0]) * nr_compacted) / sizeof(u64)); if (!j) - return -BCH_ERR_ENOSPC_sb_journal; + return bch_err_throw(c, ENOSPC_sb_journal); bch2_sb_field_delete(&ca->disk_sb, BCH_SB_FIELD_journal); diff --git a/fs/bcachefs/journal_seq_blacklist.c b/fs/bcachefs/journal_seq_blacklist.c index c5a7d800a0f5..af4fe416d9ec 100644 --- a/fs/bcachefs/journal_seq_blacklist.c +++ b/fs/bcachefs/journal_seq_blacklist.c @@ -78,7 +78,7 @@ int bch2_journal_seq_blacklist_add(struct bch_fs *c, u64 start, u64 end) bl = bch2_sb_field_resize(&c->disk_sb, journal_seq_blacklist, sb_blacklist_u64s(nr + 1)); if (!bl) { - ret = -BCH_ERR_ENOSPC_sb_journal_seq_blacklist; + ret = bch_err_throw(c, ENOSPC_sb_journal_seq_blacklist); goto out; } @@ -152,7 +152,7 @@ int bch2_blacklist_table_initialize(struct bch_fs *c) t = kzalloc(struct_size(t, entries, nr), GFP_KERNEL); if (!t) - return 
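
journal_seq_to_flush() above shows the other payoff of the guards: once the spinlock and RCU are scope-managed, the result can be returned directly instead of being staged in a local and unlocked by hand. A reduced sketch, where dev_seq_to_flush() is a hypothetical stand-in for the per-device computation in the hunk:

	static u64 seq_to_flush_sketch(struct journal *j, struct bch_fs *c)
	{
		u64 seq_to_flush = 0;

		guard(spinlock)(&j->lock);	/* spin_lock() now, spin_unlock() at scope exit */
		guard(rcu)();

		for_each_rw_member_rcu(c, ca)
			seq_to_flush = max(seq_to_flush, dev_seq_to_flush(c, ca));

		/* also flush if the pin fifo is more than half full */
		return max_t(s64, seq_to_flush,
			     (s64) journal_cur_seq(j) - (j->pin.size >> 1));
	}
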
-BCH_ERR_ENOMEM_blacklist_table_init; + return bch_err_throw(c, ENOMEM_blacklist_table_init); t->nr = nr; diff --git a/fs/bcachefs/lru.c b/fs/bcachefs/lru.c index 2f63fc6d456f..57b5b3263b08 100644 --- a/fs/bcachefs/lru.c +++ b/fs/bcachefs/lru.c @@ -145,13 +145,11 @@ static u64 bkey_lru_type_idx(struct bch_fs *c, case BCH_LRU_fragmentation: { a = bch2_alloc_to_v4(k, &a_convert); - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, k.k->p.inode); - u64 idx = ca + return ca ? alloc_lru_idx_fragmentation(*a, ca) : 0; - rcu_read_unlock(); - return idx; } case BCH_LRU_stripes: return k.k->type == KEY_TYPE_stripe diff --git a/fs/bcachefs/migrate.c b/fs/bcachefs/migrate.c index bb7a92270c09..f296cce95338 100644 --- a/fs/bcachefs/migrate.c +++ b/fs/bcachefs/migrate.c @@ -35,7 +35,7 @@ static int drop_dev_ptrs(struct bch_fs *c, struct bkey_s k, nr_good = bch2_bkey_durability(c, k.s_c); if ((!nr_good && !(flags & lost)) || (nr_good < replicas && !(flags & degraded))) - return -BCH_ERR_remove_would_lose_data; + return bch_err_throw(c, remove_would_lose_data); return 0; } @@ -156,7 +156,7 @@ static int bch2_dev_metadata_drop(struct bch_fs *c, /* don't handle this yet: */ if (flags & BCH_FORCE_IF_METADATA_LOST) - return -BCH_ERR_remove_with_metadata_missing_unimplemented; + return bch_err_throw(c, remove_with_metadata_missing_unimplemented); trans = bch2_trans_get(c); bch2_bkey_buf_init(&k); diff --git a/fs/bcachefs/move.c b/fs/bcachefs/move.c index 79f4722621d5..eec591e947bd 100644 --- a/fs/bcachefs/move.c +++ b/fs/bcachefs/move.c @@ -38,30 +38,74 @@ const char * const bch2_data_ops_strs[] = { NULL }; -static void trace_io_move2(struct bch_fs *c, struct bkey_s_c k, - struct bch_io_opts *io_opts, - struct data_update_opts *data_opts) +struct evacuate_bucket_arg { + struct bpos bucket; + int gen; + struct data_update_opts data_opts; +}; + +static bool evacuate_bucket_pred(struct bch_fs *, void *, + enum btree_id, struct bkey_s_c, + struct bch_io_opts *, + struct data_update_opts *); + +static noinline void +trace_io_move2(struct bch_fs *c, struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts) { - if (trace_io_move_enabled()) { - struct printbuf buf = PRINTBUF; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, k); - prt_newline(&buf); - bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); - trace_io_move(c, buf.buf); - printbuf_exit(&buf); - } + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); + bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); + trace_io_move(c, buf.buf); + printbuf_exit(&buf); } -static void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k) +static noinline void trace_io_move_read2(struct bch_fs *c, struct bkey_s_c k) { - if (trace_io_move_read_enabled()) { - struct printbuf buf = PRINTBUF; + struct printbuf buf = PRINTBUF; - bch2_bkey_val_to_text(&buf, c, k); - trace_io_move_read(c, buf.buf); - printbuf_exit(&buf); + bch2_bkey_val_to_text(&buf, c, k); + trace_io_move_read(c, buf.buf); + printbuf_exit(&buf); +} + +static noinline void +trace_io_move_pred2(struct bch_fs *c, struct bkey_s_c k, + struct bch_io_opts *io_opts, + struct data_update_opts *data_opts, + move_pred_fn pred, void *_arg, bool p) +{ + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "%ps: %u", pred, p); + + if (pred == evacuate_bucket_pred) { + struct evacuate_bucket_arg *arg = _arg; + prt_printf(&buf, " gen=%u", arg->gen); } + + prt_newline(&buf); + bch2_bkey_val_to_text(&buf, c, k); + prt_newline(&buf); 
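
The move.c tracepoint helpers in the hunks above and below are reworked so that the static-key predicate is checked at the call site and the string-building helper is marked noinline: the printbuf work and its stack footprint stay out of the common path entirely. The shape of the pattern, with trace_foo()/trace_foo_enabled() standing in for the generated tracepoint API:

	static noinline void trace_foo2(struct bch_fs *c, struct bkey_s_c k)
	{
		struct printbuf buf = PRINTBUF;

		bch2_bkey_val_to_text(&buf, c, k);	/* the expensive formatting */
		trace_foo(c, buf.buf);
		printbuf_exit(&buf);
	}

	static void move_one_extent(struct bch_fs *c, struct bkey_s_c k)
	{
		/* only pay for the formatting when the tracepoint is enabled */
		if (trace_foo_enabled())
			trace_foo2(c, k);
		/* ... the actual work ... */
	}
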
+ bch2_data_update_opts_to_text(&buf, c, io_opts, data_opts); + trace_io_move_pred(c, buf.buf); + printbuf_exit(&buf); +} + +static noinline void +trace_io_move_evacuate_bucket2(struct bch_fs *c, struct bpos bucket, int gen) +{ + struct printbuf buf = PRINTBUF; + + prt_printf(&buf, "bucket: "); + bch2_bpos_to_text(&buf, bucket); + prt_printf(&buf, " gen: %i\n", gen); + + trace_io_move_evacuate_bucket(c, buf.buf); + printbuf_exit(&buf); } struct moving_io { @@ -298,7 +342,8 @@ int bch2_move_extent(struct moving_context *ctxt, struct bch_fs *c = trans->c; int ret = -ENOMEM; - trace_io_move2(c, k, &io_opts, &data_opts); + if (trace_io_move_enabled()) + trace_io_move2(c, k, &io_opts, &data_opts); this_cpu_add(c->counters[BCH_COUNTER_io_move], k.k->size); if (ctxt->stats) @@ -314,16 +359,14 @@ int bch2_move_extent(struct moving_context *ctxt, return 0; } - /* - * Before memory allocations & taking nocow locks in - * bch2_data_update_init(): - */ - bch2_trans_unlock(trans); - - struct moving_io *io = kzalloc(sizeof(struct moving_io), GFP_KERNEL); + struct moving_io *io = allocate_dropping_locks(trans, ret, + kzalloc(sizeof(struct moving_io), _gfp)); if (!io) goto err; + if (ret) + goto err_free; + INIT_LIST_HEAD(&io->io_list); io->write.ctxt = ctxt; io->read_sectors = k.k->size; @@ -343,6 +386,8 @@ int bch2_move_extent(struct moving_context *ctxt, io->write.op.c = c; io->write.data_opts = data_opts; + bch2_trans_unlock(trans); + ret = bch2_data_update_bios_init(&io->write, c, &io_opts); if (ret) goto err_free; @@ -364,7 +409,8 @@ int bch2_move_extent(struct moving_context *ctxt, atomic_inc(&io->b->count); } - trace_io_move_read2(c, k); + if (trace_io_move_read_enabled()) + trace_io_move_read2(c, k); mutex_lock(&ctxt->lock); atomic_add(io->read_sectors, &ctxt->read_sectors); @@ -390,9 +436,6 @@ int bch2_move_extent(struct moving_context *ctxt, err_free: kfree(io); err: - if (bch2_err_matches(ret, BCH_ERR_data_update_done)) - return 0; - if (bch2_err_matches(ret, EROFS) || bch2_err_matches(ret, BCH_ERR_transaction_restart)) return ret; @@ -408,6 +451,9 @@ err: trace_io_move_start_fail(c, buf.buf); printbuf_exit(&buf); } + + if (bch2_err_matches(ret, BCH_ERR_data_update_done)) + return 0; return ret; } @@ -496,6 +542,7 @@ int bch2_move_get_io_opts_one(struct btree_trans *trans, bch2_inode_opts_get(io_opts, c, &inode); } bch2_trans_iter_exit(trans, &inode_iter); + /* seem to be spinning here? 
*/ out: return bch2_get_update_rebalance_opts(trans, io_opts, extent_iter, extent_k); } @@ -910,7 +957,13 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, } struct data_update_opts data_opts = {}; - if (!pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts)) { + bool p = pred(c, arg, bp.v->btree_id, k, &io_opts, &data_opts); + + if (trace_io_move_pred_enabled()) + trace_io_move_pred2(c, k, &io_opts, &data_opts, + pred, arg, p); + + if (!p) { bch2_trans_iter_exit(trans, &iter); goto next; } @@ -918,7 +971,7 @@ static int __bch2_move_data_phys(struct moving_context *ctxt, if (data_opts.scrub && !bch2_dev_idx_is_online(c, data_opts.read_dev)) { bch2_trans_iter_exit(trans, &iter); - ret = -BCH_ERR_device_offline; + ret = bch_err_throw(c, device_offline); break; } @@ -993,12 +1046,6 @@ int bch2_move_data_phys(struct bch_fs *c, return ret; } -struct evacuate_bucket_arg { - struct bpos bucket; - int gen; - struct data_update_opts data_opts; -}; - static bool evacuate_bucket_pred(struct bch_fs *c, void *_arg, enum btree_id btree, struct bkey_s_c k, struct bch_io_opts *io_opts, @@ -1025,8 +1072,13 @@ int bch2_evacuate_bucket(struct moving_context *ctxt, struct bpos bucket, int gen, struct data_update_opts data_opts) { + struct bch_fs *c = ctxt->trans->c; struct evacuate_bucket_arg arg = { bucket, gen, data_opts, }; + count_event(c, io_move_evacuate_bucket); + if (trace_io_move_evacuate_bucket_enabled()) + trace_io_move_evacuate_bucket2(c, bucket, gen); + return __bch2_move_data_phys(ctxt, bucket_in_flight, bucket.inode, bucket.offset, @@ -1124,7 +1176,7 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg, ? c->opts.metadata_replicas : io_opts->data_replicas; - rcu_read_lock(); + guard(rcu)(); struct bkey_ptrs_c ptrs = bch2_bkey_ptrs_c(k); unsigned i = 0; bkey_for_each_ptr(ptrs, ptr) { @@ -1134,7 +1186,6 @@ static bool rereplicate_pred(struct bch_fs *c, void *arg, data_opts->kill_ptrs |= BIT(i); i++; } - rcu_read_unlock(); if (!data_opts->kill_ptrs && (!nr_good || nr_good >= replicas)) @@ -1242,7 +1293,7 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, struct extent_ptr_decoded p; unsigned i = 0; - rcu_read_lock(); + guard(rcu)(); bkey_for_each_ptr_decode(k.k, bch2_bkey_ptrs_c(k), p, entry) { unsigned d = bch2_extent_ptr_durability(c, &p); @@ -1253,7 +1304,6 @@ static bool drop_extra_replicas_pred(struct bch_fs *c, void *arg, i++; } - rcu_read_unlock(); return data_opts->kill_ptrs != 0; } diff --git a/fs/bcachefs/movinggc.c b/fs/bcachefs/movinggc.c index e7a2a13554d7..6d7b1d5f7697 100644 --- a/fs/bcachefs/movinggc.c +++ b/fs/bcachefs/movinggc.c @@ -293,11 +293,9 @@ u64 bch2_copygc_wait_amount(struct bch_fs *c) { u64 wait = U64_MAX; - rcu_read_lock(); + guard(rcu)(); for_each_rw_member_rcu(c, ca) wait = min(wait, bch2_copygc_dev_wait_amount(ca)); - rcu_read_unlock(); - return wait; } @@ -321,21 +319,21 @@ void bch2_copygc_wait_to_text(struct printbuf *out, struct bch_fs *c) bch2_printbuf_make_room(out, 4096); - rcu_read_lock(); + struct task_struct *t; out->atomic++; + scoped_guard(rcu) { + prt_printf(out, "Currently calculated wait:\n"); + for_each_rw_member_rcu(c, ca) { + prt_printf(out, " %s:\t", ca->name); + prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca)); + prt_newline(out); + } - prt_printf(out, "Currently calculated wait:\n"); - for_each_rw_member_rcu(c, ca) { - prt_printf(out, " %s:\t", ca->name); - prt_human_readable_u64(out, bch2_copygc_dev_wait_amount(ca)); - prt_newline(out); + t = rcu_dereference(c->copygc_thread); + if (t) + 
get_task_struct(t); } - - struct task_struct *t = rcu_dereference(c->copygc_thread); - if (t) - get_task_struct(t); --out->atomic; - rcu_read_unlock(); if (t) { bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); diff --git a/fs/bcachefs/movinggc.h b/fs/bcachefs/movinggc.h index b9683d22bab0..f615910d6f98 100644 --- a/fs/bcachefs/movinggc.h +++ b/fs/bcachefs/movinggc.h @@ -7,11 +7,10 @@ void bch2_copygc_wait_to_text(struct printbuf *, struct bch_fs *); static inline void bch2_copygc_wakeup(struct bch_fs *c) { - rcu_read_lock(); + guard(rcu)(); struct task_struct *p = rcu_dereference(c->copygc_thread); if (p) wake_up_process(p); - rcu_read_unlock(); } void bch2_copygc_stop(struct bch_fs *); diff --git a/fs/bcachefs/namei.c b/fs/bcachefs/namei.c index a84b69d6caef..24120037c031 100644 --- a/fs/bcachefs/namei.c +++ b/fs/bcachefs/namei.c @@ -287,7 +287,7 @@ int bch2_unlink_trans(struct btree_trans *trans, } if (deleting_subvol && !inode_u->bi_subvol) { - ret = -BCH_ERR_ENOENT_not_subvol; + ret = bch_err_throw(c, ENOENT_not_subvol); goto err; } @@ -425,8 +425,8 @@ int bch2_rename_trans(struct btree_trans *trans, } ret = bch2_dirent_rename(trans, - src_dir, &src_hash, &src_dir_u->bi_size, - dst_dir, &dst_hash, &dst_dir_u->bi_size, + src_dir, &src_hash, + dst_dir, &dst_hash, src_name, &src_inum, &src_offset, dst_name, &dst_inum, &dst_offset, mode); @@ -633,7 +633,7 @@ static int __bch2_inum_to_path(struct btree_trans *trans, break; if (!inode.bi_dir && !inode.bi_dir_offset) { - ret = -BCH_ERR_ENOENT_inode_no_backpointer; + ret = bch_err_throw(trans->c, ENOENT_inode_no_backpointer); goto disconnected; } @@ -733,15 +733,6 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, return __bch2_fsck_write_inode(trans, target); } - if (bch2_inode_should_have_single_bp(target) && - !fsck_err(trans, inode_wrong_backpointer, - "dirent points to inode that does not point back:\n%s", - (bch2_bkey_val_to_text(&buf, c, d.s_c), - prt_newline(&buf), - bch2_inode_unpacked_to_text(&buf, target), - buf.buf))) - goto err; - struct bkey_s_c_dirent bp_dirent = bch2_bkey_get_iter_typed(trans, &bp_iter, BTREE_ID_dirents, SPOS(target->bi_dir, target->bi_dir_offset, target->bi_snapshot), @@ -768,6 +759,7 @@ static int bch2_check_dirent_inode_dirent(struct btree_trans *trans, ret = __bch2_fsck_write_inode(trans, target); } } else { + printbuf_reset(&buf); bch2_bkey_val_to_text(&buf, c, d.s_c); prt_newline(&buf); bch2_bkey_val_to_text(&buf, c, bp_dirent.s_c); @@ -857,7 +849,8 @@ int __bch2_check_dirent_target(struct btree_trans *trans, n->v.d_inum = cpu_to_le64(target->bi_inum); } - ret = bch2_trans_update(trans, dirent_iter, &n->k_i, 0); + ret = bch2_trans_update(trans, dirent_iter, &n->k_i, + BTREE_UPDATE_internal_snapshot_node); if (ret) goto err; } diff --git a/fs/bcachefs/printbuf.h b/fs/bcachefs/printbuf.h index 1ca476adbf6f..8f4e28d440ac 100644 --- a/fs/bcachefs/printbuf.h +++ b/fs/bcachefs/printbuf.h @@ -140,6 +140,14 @@ void bch2_prt_bitflags_vector(struct printbuf *, const char * const[], .size = _size, \ }) +static inline struct printbuf bch2_printbuf_init(void) +{ + return PRINTBUF; +} + +DEFINE_CLASS(printbuf, struct printbuf, + bch2_printbuf_exit(&_T), bch2_printbuf_init(), void) + /* * Returns size remaining of output buffer: */ diff --git a/fs/bcachefs/quota.c b/fs/bcachefs/quota.c index 3d4755d73af7..f241efb1fb50 100644 --- a/fs/bcachefs/quota.c +++ b/fs/bcachefs/quota.c @@ -527,7 +527,7 @@ int bch2_fs_quota_read(struct bch_fs *c) struct bch_sb_field_quota *sb_quota = 
bch2_sb_get_or_create_quota(&c->disk_sb); if (!sb_quota) { mutex_unlock(&c->sb_lock); - return -BCH_ERR_ENOSPC_sb_quota; + return bch_err_throw(c, ENOSPC_sb_quota); } bch2_sb_quota_read(c); @@ -572,7 +572,7 @@ static int bch2_quota_enable(struct super_block *sb, unsigned uflags) mutex_lock(&c->sb_lock); sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); if (!sb_quota) { - ret = -BCH_ERR_ENOSPC_sb_quota; + ret = bch_err_throw(c, ENOSPC_sb_quota); goto unlock; } @@ -726,7 +726,7 @@ static int bch2_quota_set_info(struct super_block *sb, int type, mutex_lock(&c->sb_lock); sb_quota = bch2_sb_get_or_create_quota(&c->disk_sb); if (!sb_quota) { - ret = -BCH_ERR_ENOSPC_sb_quota; + ret = bch_err_throw(c, ENOSPC_sb_quota); goto unlock; } diff --git a/fs/bcachefs/rebalance.c b/fs/bcachefs/rebalance.c index de1ec9e0caa0..1c345b86b1c0 100644 --- a/fs/bcachefs/rebalance.c +++ b/fs/bcachefs/rebalance.c @@ -80,13 +80,12 @@ static inline unsigned bch2_bkey_ptrs_need_move(struct bch_fs *c, unsigned ptr_bit = 1; unsigned rewrite_ptrs = 0; - rcu_read_lock(); + guard(rcu)(); bkey_for_each_ptr(ptrs, ptr) { if (!ptr->cached && !bch2_dev_in_target(c, ptr->dev, opts->background_target)) rewrite_ptrs |= ptr_bit; ptr_bit <<= 1; } - rcu_read_unlock(); return rewrite_ptrs; } @@ -135,12 +134,11 @@ u64 bch2_bkey_sectors_need_rebalance(struct bch_fs *c, struct bkey_s_c k) } incompressible: if (opts->background_target) { - rcu_read_lock(); + guard(rcu)(); bkey_for_each_ptr_decode(k.k, ptrs, p, entry) if (!p.ptr.cached && !bch2_dev_in_target(c, p.ptr.dev, opts->background_target)) sectors += p.crc.compressed_size; - rcu_read_unlock(); } return sectors; @@ -445,7 +443,7 @@ static int do_rebalance_extent(struct moving_context *ctxt, if (bch2_err_matches(ret, ENOMEM)) { /* memory allocation failure, wait for some IO to finish */ bch2_move_ctxt_wait_for_io(ctxt); - ret = -BCH_ERR_transaction_restart_nested; + ret = bch_err_throw(c, transaction_restart_nested); } if (bch2_err_matches(ret, BCH_ERR_transaction_restart)) @@ -527,7 +525,7 @@ static void rebalance_wait(struct bch_fs *c) r->state = BCH_REBALANCE_waiting; } - bch2_kthread_io_clock_wait(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); + bch2_kthread_io_clock_wait_once(clock, r->wait_iotime_end, MAX_SCHEDULE_TIMEOUT); } static bool bch2_rebalance_enabled(struct bch_fs *c) @@ -544,6 +542,7 @@ static int do_rebalance(struct moving_context *ctxt) struct bch_fs_rebalance *r = &c->rebalance; struct btree_iter rebalance_work_iter, extent_iter = {}; struct bkey_s_c k; + u32 kick = r->kick; int ret = 0; bch2_trans_begin(trans); @@ -593,7 +592,8 @@ static int do_rebalance(struct moving_context *ctxt) if (!ret && !kthread_should_stop() && !atomic64_read(&r->work_stats.sectors_seen) && - !atomic64_read(&r->scan_stats.sectors_seen)) { + !atomic64_read(&r->scan_stats.sectors_seen) && + kick == r->kick) { bch2_moving_ctxt_flush_all(ctxt); bch2_trans_unlock_long(trans); rebalance_wait(c); @@ -677,11 +677,12 @@ void bch2_rebalance_status_to_text(struct printbuf *out, struct bch_fs *c) } prt_newline(out); - rcu_read_lock(); - struct task_struct *t = rcu_dereference(c->rebalance.thread); - if (t) - get_task_struct(t); - rcu_read_unlock(); + struct task_struct *t; + scoped_guard(rcu) { + t = rcu_dereference(c->rebalance.thread); + if (t) + get_task_struct(t); + } if (t) { bch2_prt_task_backtrace(out, t, 0, GFP_KERNEL); @@ -794,7 +795,7 @@ static int check_rebalance_work_one(struct btree_trans *trans, BTREE_ID_extents, POS_MIN, BTREE_ITER_prefetch| BTREE_ITER_all_snapshots); - return 
-BCH_ERR_transaction_restart_nested; + return bch_err_throw(c, transaction_restart_nested); } if (!extent_k.k && !rebalance_k.k) diff --git a/fs/bcachefs/rebalance.h b/fs/bcachefs/rebalance.h index 5d9214fe1a22..7a565ea7dbfc 100644 --- a/fs/bcachefs/rebalance.h +++ b/fs/bcachefs/rebalance.h @@ -39,13 +39,11 @@ int bch2_set_fs_needs_rebalance(struct bch_fs *); static inline void bch2_rebalance_wakeup(struct bch_fs *c) { - struct task_struct *p; - - rcu_read_lock(); - p = rcu_dereference(c->rebalance.thread); + c->rebalance.kick++; + guard(rcu)(); + struct task_struct *p = rcu_dereference(c->rebalance.thread); if (p) wake_up_process(p); - rcu_read_unlock(); } void bch2_rebalance_status_to_text(struct printbuf *, struct bch_fs *); diff --git a/fs/bcachefs/rebalance_types.h b/fs/bcachefs/rebalance_types.h index 33d77286f1d5..c659da149fa3 100644 --- a/fs/bcachefs/rebalance_types.h +++ b/fs/bcachefs/rebalance_types.h @@ -18,6 +18,7 @@ enum bch_rebalance_states { struct bch_fs_rebalance { struct task_struct __rcu *thread; + u32 kick; struct bch_pd_controller pd; enum bch_rebalance_states state; diff --git a/fs/bcachefs/recovery.c b/fs/bcachefs/recovery.c index 4fca57575565..1e68e61f08e8 100644 --- a/fs/bcachefs/recovery.c +++ b/fs/bcachefs/recovery.c @@ -879,7 +879,7 @@ int bch2_fs_recovery(struct bch_fs *c) use_clean: if (!clean) { bch_err(c, "no superblock clean section found"); - ret = -BCH_ERR_fsck_repair_impossible; + ret = bch_err_throw(c, fsck_repair_impossible); goto err; } @@ -1093,10 +1093,6 @@ use_clean: out: bch2_flush_fsck_errs(c); - if (!c->opts.retain_recovery_info) { - bch2_journal_keys_put_initial(c); - bch2_find_btree_nodes_exit(&c->found_btree_nodes); - } if (!IS_ERR(clean)) kfree(clean); diff --git a/fs/bcachefs/recovery_passes.c b/fs/bcachefs/recovery_passes.c index dabb29b08ad0..605588e33fb3 100644 --- a/fs/bcachefs/recovery_passes.c +++ b/fs/bcachefs/recovery_passes.c @@ -103,20 +103,20 @@ static void bch2_sb_recovery_passes_to_text(struct printbuf *out, prt_tab(out); bch2_pr_time_units(out, le32_to_cpu(i->last_runtime) * NSEC_PER_SEC); + + if (BCH_RECOVERY_PASS_NO_RATELIMIT(i)) + prt_str(out, " (no ratelimit)"); + prt_newline(out); } } -static void bch2_sb_recovery_pass_complete(struct bch_fs *c, - enum bch_recovery_pass pass, - s64 start_time) +static struct recovery_pass_entry *bch2_sb_recovery_pass_entry(struct bch_fs *c, + enum bch_recovery_pass pass) { enum bch_recovery_pass_stable stable = bch2_recovery_pass_to_stable(pass); - s64 end_time = ktime_get_real_seconds(); - mutex_lock(&c->sb_lock); - struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); - __clear_bit_le64(stable, ext->recovery_passes_required); + lockdep_assert_held(&c->sb_lock); struct bch_sb_field_recovery_passes *r = bch2_sb_field_get(c->disk_sb.sb, recovery_passes); @@ -127,15 +127,43 @@ static void bch2_sb_recovery_pass_complete(struct bch_fs *c, r = bch2_sb_field_resize(&c->disk_sb, recovery_passes, u64s); if (!r) { bch_err(c, "error creating recovery_passes sb section"); - goto out; + return NULL; } } - r->start[stable].last_run = cpu_to_le64(end_time); - r->start[stable].last_runtime = cpu_to_le32(max(0, end_time - start_time)); -out: + return r->start + stable; +} + +static void bch2_sb_recovery_pass_complete(struct bch_fs *c, + enum bch_recovery_pass pass, + s64 start_time) +{ + guard(mutex)(&c->sb_lock); + struct bch_sb_field_ext *ext = bch2_sb_field_get(c->disk_sb.sb, ext); + __clear_bit_le64(bch2_recovery_pass_to_stable(pass), + ext->recovery_passes_required); + + struct 
recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass); + if (e) { + s64 end_time = ktime_get_real_seconds(); + e->last_run = cpu_to_le64(end_time); + e->last_runtime = cpu_to_le32(max(0, end_time - start_time)); + SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false); + } + bch2_write_super(c); - mutex_unlock(&c->sb_lock); +} + +void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *c, + enum bch_recovery_pass pass) +{ + guard(mutex)(&c->sb_lock); + + struct recovery_pass_entry *e = bch2_sb_recovery_pass_entry(c, pass); + if (e && !BCH_RECOVERY_PASS_NO_RATELIMIT(e)) { + SET_BCH_RECOVERY_PASS_NO_RATELIMIT(e, false); + bch2_write_super(c); + } } static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recovery_pass pass) @@ -157,6 +185,9 @@ static bool bch2_recovery_pass_want_ratelimit(struct bch_fs *c, enum bch_recover */ ret = (u64) le32_to_cpu(i->last_runtime) * 100 > ktime_get_real_seconds() - le64_to_cpu(i->last_run); + + if (BCH_RECOVERY_PASS_NO_RATELIMIT(i)) + ret = false; } return ret; @@ -315,7 +346,9 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c, goto out; bool in_recovery = test_bit(BCH_FS_in_recovery, &c->flags); - bool rewind = in_recovery && r->curr_pass > pass; + bool rewind = in_recovery && + r->curr_pass > pass && + !(r->passes_complete & BIT_ULL(pass)); bool ratelimit = flags & RUN_RECOVERY_PASS_ratelimit; if (!(in_recovery && (flags & RUN_RECOVERY_PASS_nopersistent))) { @@ -327,7 +360,7 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c, (!in_recovery || r->curr_pass >= BCH_RECOVERY_PASS_set_may_go_rw)) { prt_printf(out, "need recovery pass %s (%u), but already rw\n", bch2_recovery_passes[pass], pass); - ret = -BCH_ERR_cannot_rewind_recovery; + ret = bch_err_throw(c, cannot_rewind_recovery); goto out; } @@ -347,7 +380,7 @@ int __bch2_run_explicit_recovery_pass(struct bch_fs *c, if (rewind) { r->next_pass = pass; r->passes_complete &= (1ULL << pass) >> 1; - ret = -BCH_ERR_restart_recovery; + ret = bch_err_throw(c, restart_recovery); } } else { prt_printf(out, "scheduling recovery pass %s (%u)%s\n", @@ -382,6 +415,35 @@ int bch2_run_explicit_recovery_pass(struct bch_fs *c, return ret; } +/* + * Returns 0 if @pass has run recently, otherwise one of + * -BCH_ERR_restart_recovery + * -BCH_ERR_recovery_pass_will_run + */ +int bch2_require_recovery_pass(struct bch_fs *c, + struct printbuf *out, + enum bch_recovery_pass pass) +{ + if (test_bit(BCH_FS_in_recovery, &c->flags) && + c->recovery.passes_complete & BIT_ULL(pass)) + return 0; + + guard(mutex)(&c->sb_lock); + + if (bch2_recovery_pass_want_ratelimit(c, pass)) + return 0; + + enum bch_run_recovery_pass_flags flags = 0; + int ret = 0; + + if (recovery_pass_needs_set(c, pass, &flags)) { + ret = __bch2_run_explicit_recovery_pass(c, out, pass, flags); + bch2_write_super(c); + } + + return ret ?: bch_err_throw(c, recovery_pass_will_run); +} + int bch2_run_print_explicit_recovery_pass(struct bch_fs *c, enum bch_recovery_pass pass) { enum bch_run_recovery_pass_flags flags = RUN_RECOVERY_PASS_nopersistent; diff --git a/fs/bcachefs/recovery_passes.h b/fs/bcachefs/recovery_passes.h index dc0d2014ff9b..260571c7105e 100644 --- a/fs/bcachefs/recovery_passes.h +++ b/fs/bcachefs/recovery_passes.h @@ -10,6 +10,8 @@ u64 bch2_recovery_passes_from_stable(u64 v); u64 bch2_fsck_recovery_passes(void); +void bch2_recovery_pass_set_no_ratelimit(struct bch_fs *, enum bch_recovery_pass); + enum bch_run_recovery_pass_flags { RUN_RECOVERY_PASS_nopersistent = BIT(0), RUN_RECOVERY_PASS_ratelimit = BIT(1), @@ -24,6 +26,9 
@@ int bch2_run_explicit_recovery_pass(struct bch_fs *, struct printbuf *, enum bch_recovery_pass, enum bch_run_recovery_pass_flags); +int bch2_require_recovery_pass(struct bch_fs *, struct printbuf *, + enum bch_recovery_pass); + int bch2_run_online_recovery_passes(struct bch_fs *, u64); int bch2_run_recovery_passes(struct bch_fs *, enum bch_recovery_pass); diff --git a/fs/bcachefs/recovery_passes_format.h b/fs/bcachefs/recovery_passes_format.h index c434eafbca19..b63c20558d3d 100644 --- a/fs/bcachefs/recovery_passes_format.h +++ b/fs/bcachefs/recovery_passes_format.h @@ -87,6 +87,8 @@ struct recovery_pass_entry { __le32 flags; }; +LE32_BITMASK(BCH_RECOVERY_PASS_NO_RATELIMIT, struct recovery_pass_entry, flags, 0, 1) + struct bch_sb_field_recovery_passes { struct bch_sb_field field; struct recovery_pass_entry start[]; diff --git a/fs/bcachefs/reflink.c b/fs/bcachefs/reflink.c index 3a13dbcab6ba..a535abd44df3 100644 --- a/fs/bcachefs/reflink.c +++ b/fs/bcachefs/reflink.c @@ -312,7 +312,7 @@ static int trans_trigger_reflink_p_segment(struct btree_trans *trans, if (!bkey_refcount_c(k)) { if (!(flags & BTREE_TRIGGER_overwrite)) - ret = -BCH_ERR_missing_indirect_extent; + ret = bch_err_throw(c, missing_indirect_extent); goto next; } @@ -612,7 +612,7 @@ s64 bch2_remap_range(struct bch_fs *c, int ret = 0, ret2 = 0; if (!enumerated_ref_tryget(&c->writes, BCH_WRITE_REF_reflink)) - return -BCH_ERR_erofs_no_writes; + return bch_err_throw(c, erofs_no_writes); bch2_check_set_feature(c, BCH_FEATURE_reflink); @@ -711,7 +711,8 @@ s64 bch2_remap_range(struct bch_fs *c, SET_REFLINK_P_IDX(&dst_p->v, offset); if (reflink_p_may_update_opts_field && - may_change_src_io_path_opts) + may_change_src_io_path_opts && + REFLINK_P_MAY_UPDATE_OPTIONS(src_p.v)) SET_REFLINK_P_MAY_UPDATE_OPTIONS(&dst_p->v, true); } else { BUG(); @@ -847,7 +848,7 @@ int bch2_gc_reflink_start(struct bch_fs *c) struct reflink_gc *r = genradix_ptr_alloc(&c->reflink_gc_table, c->reflink_gc_nr++, GFP_KERNEL); if (!r) { - ret = -BCH_ERR_ENOMEM_gc_reflink_start; + ret = bch_err_throw(c, ENOMEM_gc_reflink_start); break; } diff --git a/fs/bcachefs/replicas.c b/fs/bcachefs/replicas.c index 477ef0997949..8383bd7fdb3f 100644 --- a/fs/bcachefs/replicas.c +++ b/fs/bcachefs/replicas.c @@ -119,7 +119,7 @@ int bch2_replicas_entry_validate(struct bch_replicas_entry_v1 *r, return 0; bad: bch2_replicas_entry_to_text(err, r); - return -BCH_ERR_invalid_replicas_entry; + return bch_err_throw(c, invalid_replicas_entry); } void bch2_cpu_replicas_to_text(struct printbuf *out, @@ -311,7 +311,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, !__replicas_has_entry(&c->replicas_gc, new_entry)) { new_gc = cpu_replicas_add_entry(c, &c->replicas_gc, new_entry); if (!new_gc.entries) { - ret = -BCH_ERR_ENOMEM_cpu_replicas; + ret = bch_err_throw(c, ENOMEM_cpu_replicas); goto err; } } @@ -319,7 +319,7 @@ static int bch2_mark_replicas_slowpath(struct bch_fs *c, if (!__replicas_has_entry(&c->replicas, new_entry)) { new_r = cpu_replicas_add_entry(c, &c->replicas, new_entry); if (!new_r.entries) { - ret = -BCH_ERR_ENOMEM_cpu_replicas; + ret = bch_err_throw(c, ENOMEM_cpu_replicas); goto err; } @@ -422,7 +422,7 @@ int bch2_replicas_gc_start(struct bch_fs *c, unsigned typemask) if (!c->replicas_gc.entries) { mutex_unlock(&c->sb_lock); bch_err(c, "error allocating c->replicas_gc"); - return -BCH_ERR_ENOMEM_replicas_gc; + return bch_err_throw(c, ENOMEM_replicas_gc); } for_each_cpu_replicas_entry(&c->replicas, e) @@ -458,7 +458,7 @@ retry: new.entries = kcalloc(nr, 
new.entry_size, GFP_KERNEL); if (!new.entries) { bch_err(c, "error allocating c->replicas_gc"); - return -BCH_ERR_ENOMEM_replicas_gc; + return bch_err_throw(c, ENOMEM_replicas_gc); } mutex_lock(&c->sb_lock); @@ -622,7 +622,7 @@ static int bch2_cpu_replicas_to_sb_replicas_v0(struct bch_fs *c, sb_r = bch2_sb_field_resize(&c->disk_sb, replicas_v0, DIV_ROUND_UP(bytes, sizeof(u64))); if (!sb_r) - return -BCH_ERR_ENOSPC_sb_replicas; + return bch_err_throw(c, ENOSPC_sb_replicas); bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas); sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas_v0); @@ -667,7 +667,7 @@ static int bch2_cpu_replicas_to_sb_replicas(struct bch_fs *c, sb_r = bch2_sb_field_resize(&c->disk_sb, replicas, DIV_ROUND_UP(bytes, sizeof(u64))); if (!sb_r) - return -BCH_ERR_ENOSPC_sb_replicas; + return bch_err_throw(c, ENOSPC_sb_replicas); bch2_sb_field_delete(&c->disk_sb, BCH_SB_FIELD_replicas_v0); sb_r = bch2_sb_field_get(c->disk_sb.sb, replicas); @@ -819,19 +819,18 @@ bool bch2_have_enough_devs(struct bch_fs *c, struct bch_devs_mask devs, if (e->data_type == BCH_DATA_cached) continue; - rcu_read_lock(); - for (unsigned i = 0; i < e->nr_devs; i++) { - if (e->devs[i] == BCH_SB_MEMBER_INVALID) { - nr_failed++; - continue; - } + scoped_guard(rcu) + for (unsigned i = 0; i < e->nr_devs; i++) { + if (e->devs[i] == BCH_SB_MEMBER_INVALID) { + nr_failed++; + continue; + } - nr_online += test_bit(e->devs[i], devs.d); + nr_online += test_bit(e->devs[i], devs.d); - struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]); - nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed; - } - rcu_read_unlock(); + struct bch_dev *ca = bch2_dev_rcu_noerror(c, e->devs[i]); + nr_failed += !ca || ca->mi.state == BCH_MEMBER_STATE_failed; + } if (nr_online + nr_failed == e->nr_devs) continue; diff --git a/fs/bcachefs/sb-counters_format.h b/fs/bcachefs/sb-counters_format.h index 7c0c9c842b4e..b868702a431a 100644 --- a/fs/bcachefs/sb-counters_format.h +++ b/fs/bcachefs/sb-counters_format.h @@ -26,6 +26,7 @@ enum counters_flags { x(io_move_write_fail, 82, TYPE_COUNTER) \ x(io_move_start_fail, 39, TYPE_COUNTER) \ x(io_move_created_rebalance, 83, TYPE_COUNTER) \ + x(io_move_evacuate_bucket, 84, TYPE_COUNTER) \ x(bucket_invalidate, 3, TYPE_COUNTER) \ x(bucket_discard, 4, TYPE_COUNTER) \ x(bucket_discard_fast, 79, TYPE_COUNTER) \ diff --git a/fs/bcachefs/sb-downgrade.c b/fs/bcachefs/sb-downgrade.c index 861fce1630f0..b61f88450a6d 100644 --- a/fs/bcachefs/sb-downgrade.c +++ b/fs/bcachefs/sb-downgrade.c @@ -417,7 +417,7 @@ int bch2_sb_downgrade_update(struct bch_fs *c) d = bch2_sb_field_resize(&c->disk_sb, downgrade, sb_u64s); if (!d) { - ret = -BCH_ERR_ENOSPC_sb_downgrade; + ret = bch_err_throw(c, ENOSPC_sb_downgrade); goto out; } diff --git a/fs/bcachefs/sb-errors.c b/fs/bcachefs/sb-errors.c index 013a96883b4e..48853efdc105 100644 --- a/fs/bcachefs/sb-errors.c +++ b/fs/bcachefs/sb-errors.c @@ -78,6 +78,28 @@ const struct bch_sb_field_ops bch_sb_field_ops_errors = { .to_text = bch2_sb_errors_to_text, }; +void bch2_fs_errors_to_text(struct printbuf *out, struct bch_fs *c) +{ + if (out->nr_tabstops < 1) + printbuf_tabstop_push(out, 48); + if (out->nr_tabstops < 2) + printbuf_tabstop_push(out, 8); + if (out->nr_tabstops < 3) + printbuf_tabstop_push(out, 16); + + guard(mutex)(&c->fsck_error_counts_lock); + + bch_sb_errors_cpu *e = &c->fsck_error_counts; + darray_for_each(*e, i) { + bch2_sb_error_id_to_text(out, i->id); + prt_tab(out); + prt_u64(out, i->nr); + prt_tab(out); + bch2_prt_datetime(out, i->last_error_time); 
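/*
 * bch2_fs_errors_to_text() above relies on scope-based locking:
 * guard(mutex)(&c->fsck_error_counts_lock) takes the mutex immediately and
 * releases it automatically when the enclosing scope is left, on every return
 * path.  A minimal sketch of the idiom using the real guard(mutex) helper
 * from <linux/mutex.h>/<linux/cleanup.h>; example_bump() and its arguments
 * are placeholders.
 */
static u64 example_bump(struct mutex *lock, u64 *counter)
{
	guard(mutex)(lock);	/* mutex_lock(lock) here */
	return ++(*counter);	/* mutex_unlock(lock) runs on scope exit,
				 * after the return value has been evaluated */
}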
+ prt_newline(out); + } +} + void bch2_sb_error_count(struct bch_fs *c, enum bch_sb_error_id err) { bch_sb_errors_cpu *e = &c->fsck_error_counts; diff --git a/fs/bcachefs/sb-errors.h b/fs/bcachefs/sb-errors.h index b2357b8e6107..e86267264692 100644 --- a/fs/bcachefs/sb-errors.h +++ b/fs/bcachefs/sb-errors.h @@ -7,6 +7,7 @@ extern const char * const bch2_sb_error_strs[]; void bch2_sb_error_id_to_text(struct printbuf *, enum bch_sb_error_id); +void bch2_fs_errors_to_text(struct printbuf *, struct bch_fs *); extern const struct bch_sb_field_ops bch_sb_field_ops_errors; diff --git a/fs/bcachefs/sb-errors_format.h b/fs/bcachefs/sb-errors_format.h index 0bfb151da9cf..6fdbf265e4c0 100644 --- a/fs/bcachefs/sb-errors_format.h +++ b/fs/bcachefs/sb-errors_format.h @@ -232,6 +232,7 @@ enum bch_fsck_flags { x(inode_dir_multiple_links, 206, FSCK_AUTOFIX) \ x(inode_dir_missing_backpointer, 284, FSCK_AUTOFIX) \ x(inode_dir_unlinked_but_not_empty, 286, FSCK_AUTOFIX) \ + x(inode_dir_has_nonzero_i_size, 319, FSCK_AUTOFIX) \ x(inode_multiple_links_but_nlink_0, 207, FSCK_AUTOFIX) \ x(inode_wrong_backpointer, 208, FSCK_AUTOFIX) \ x(inode_wrong_nlink, 209, FSCK_AUTOFIX) \ @@ -243,6 +244,7 @@ enum bch_fsck_flags { x(inode_parent_has_case_insensitive_not_set, 317, FSCK_AUTOFIX) \ x(vfs_inode_i_blocks_underflow, 311, FSCK_AUTOFIX) \ x(vfs_inode_i_blocks_not_zero_at_truncate, 313, FSCK_AUTOFIX) \ + x(vfs_bad_inode_rm, 320, 0) \ x(deleted_inode_but_clean, 211, FSCK_AUTOFIX) \ x(deleted_inode_missing, 212, FSCK_AUTOFIX) \ x(deleted_inode_is_dir, 213, FSCK_AUTOFIX) \ @@ -328,7 +330,7 @@ enum bch_fsck_flags { x(dirent_stray_data_after_cf_name, 305, 0) \ x(rebalance_work_incorrectly_set, 309, FSCK_AUTOFIX) \ x(rebalance_work_incorrectly_unset, 310, FSCK_AUTOFIX) \ - x(MAX, 319, 0) + x(MAX, 321, 0) enum bch_sb_error_id { #define x(t, n, ...) 
BCH_FSCK_ERR_##t = n, diff --git a/fs/bcachefs/sb-members.c b/fs/bcachefs/sb-members.c index 3398906660a5..363eb0c6eb7c 100644 --- a/fs/bcachefs/sb-members.c +++ b/fs/bcachefs/sb-members.c @@ -101,7 +101,7 @@ static int sb_members_v2_resize_entries(struct bch_fs *c) mi = bch2_sb_field_resize(&c->disk_sb, members_v2, u64s); if (!mi) - return -BCH_ERR_ENOSPC_sb_members_v2; + return bch_err_throw(c, ENOSPC_sb_members_v2); for (int i = c->disk_sb.sb->nr_devices - 1; i >= 0; --i) { void *dst = (void *) mi->_members + (i * sizeof(struct bch_member)); @@ -378,14 +378,13 @@ void bch2_sb_members_from_cpu(struct bch_fs *c) { struct bch_sb_field_members_v2 *mi = bch2_sb_field_get(c->disk_sb.sb, members_v2); - rcu_read_lock(); + guard(rcu)(); for_each_member_device_rcu(c, ca, NULL) { struct bch_member *m = __bch2_members_v2_get_mut(mi, ca->dev_idx); for (unsigned e = 0; e < BCH_MEMBER_ERROR_NR; e++) m->errors[e] = cpu_to_le64(atomic64_read(&ca->errors[e])); } - rcu_read_unlock(); } void bch2_dev_io_errors_to_text(struct printbuf *out, struct bch_dev *ca) @@ -443,20 +442,14 @@ void bch2_dev_errors_reset(struct bch_dev *ca) bool bch2_dev_btree_bitmap_marked(struct bch_fs *c, struct bkey_s_c k) { - bool ret = true; - rcu_read_lock(); + guard(rcu)(); bkey_for_each_ptr(bch2_bkey_ptrs_c(k), ptr) { struct bch_dev *ca = bch2_dev_rcu(c, ptr->dev); - if (!ca) - continue; - - if (!bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) { - ret = false; - break; - } + if (ca && + !bch2_dev_btree_bitmap_marked_sectors(ca, ptr->offset, btree_sectors(c))) + return false; } - rcu_read_unlock(); - return ret; + return true; } static void __bch2_dev_btree_bitmap_mark(struct bch_sb_field_members_v2 *mi, unsigned dev, diff --git a/fs/bcachefs/sb-members.h b/fs/bcachefs/sb-members.h index 6bd9b86aee5b..8d8a8a857648 100644 --- a/fs/bcachefs/sb-members.h +++ b/fs/bcachefs/sb-members.h @@ -28,12 +28,9 @@ static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *, unsigned); static inline bool bch2_dev_idx_is_online(struct bch_fs *c, unsigned dev) { - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu(c, dev); - bool ret = ca && bch2_dev_is_online(ca); - rcu_read_unlock(); - - return ret; + return ca && bch2_dev_is_online(ca); } static inline bool bch2_dev_is_healthy(struct bch_dev *ca) @@ -142,12 +139,10 @@ static inline void bch2_dev_put(struct bch_dev *ca) static inline struct bch_dev *bch2_get_next_dev(struct bch_fs *c, struct bch_dev *ca) { - rcu_read_lock(); + guard(rcu)(); bch2_dev_put(ca); if ((ca = __bch2_next_dev(c, ca, NULL))) bch2_dev_get(ca); - rcu_read_unlock(); - return ca; } @@ -166,7 +161,7 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, unsigned state_mask, int rw, unsigned ref_idx) { - rcu_read_lock(); + guard(rcu)(); if (ca) enumerated_ref_put(&ca->io_ref[rw], ref_idx); @@ -174,7 +169,6 @@ static inline struct bch_dev *bch2_get_next_online_dev(struct bch_fs *c, (!((1 << ca->mi.state) & state_mask) || !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx))) ; - rcu_read_unlock(); return ca; } @@ -239,11 +233,10 @@ static inline struct bch_dev *bch2_dev_rcu(struct bch_fs *c, unsigned dev) static inline struct bch_dev *bch2_dev_tryget_noerror(struct bch_fs *c, unsigned dev) { - rcu_read_lock(); + guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev); if (ca) bch2_dev_get(ca); - rcu_read_unlock(); return ca; } @@ -299,19 +292,16 @@ static inline struct bch_dev *bch2_dev_get_ioref(struct bch_fs *c, unsigned dev, { might_sleep(); - rcu_read_lock(); 
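/*
 * This and the following hunks replace open-coded rcu_read_lock()/
 * rcu_read_unlock() pairs with guard(rcu)() from <linux/rcupdate.h>, which
 * ends the read-side section automatically on scope exit.  A minimal sketch
 * of the pattern, mirroring the tryget-style helpers converted in these
 * hunks; example_dev_tryget() is a placeholder name.
 */
static inline struct bch_dev *example_dev_tryget(struct bch_fs *c, unsigned dev)
{
	guard(rcu)();				/* rcu_read_lock() */
	struct bch_dev *ca = bch2_dev_rcu_noerror(c, dev);
	if (ca)
		bch2_dev_get(ca);		/* reference taken while still under RCU */
	return ca;				/* rcu_read_unlock() runs on return */
}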
+ guard(rcu)(); struct bch_dev *ca = bch2_dev_rcu(c, dev); - if (ca && !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx)) - ca = NULL; - rcu_read_unlock(); + if (!ca || !enumerated_ref_tryget(&ca->io_ref[rw], ref_idx)) + return NULL; - if (ca && - (ca->mi.state == BCH_MEMBER_STATE_rw || - (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ))) + if (ca->mi.state == BCH_MEMBER_STATE_rw || + (ca->mi.state == BCH_MEMBER_STATE_ro && rw == READ)) return ca; - if (ca) - enumerated_ref_put(&ca->io_ref[rw], ref_idx); + enumerated_ref_put(&ca->io_ref[rw], ref_idx); return NULL; } diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c index 7c403427fbdb..538c324f4765 100644 --- a/fs/bcachefs/six.c +++ b/fs/bcachefs/six.c @@ -339,12 +339,9 @@ static inline bool six_owner_running(struct six_lock *lock) * acquiring the lock and setting the owner field. If we're an RT task * that will live-lock because we won't let the owner complete. */ - rcu_read_lock(); + guard(rcu)(); struct task_struct *owner = READ_ONCE(lock->owner); - bool ret = owner ? owner_on_cpu(owner) : !rt_or_dl_task(current); - rcu_read_unlock(); - - return ret; + return owner ? owner_on_cpu(owner) : !rt_or_dl_task(current); } static inline bool six_optimistic_spin(struct six_lock *lock, diff --git a/fs/bcachefs/snapshot.c b/fs/bcachefs/snapshot.c index 00d62d1190ef..23a332d76b32 100644 --- a/fs/bcachefs/snapshot.c +++ b/fs/bcachefs/snapshot.c @@ -54,7 +54,7 @@ int bch2_snapshot_tree_lookup(struct btree_trans *trans, u32 id, BTREE_ITER_with_updates, snapshot_tree, s); if (bch2_err_matches(ret, ENOENT)) - ret = -BCH_ERR_ENOENT_snapshot_tree; + ret = bch_err_throw(trans->c, ENOENT_snapshot_tree); return ret; } @@ -67,7 +67,7 @@ __bch2_snapshot_tree_create(struct btree_trans *trans) struct bkey_i_snapshot_tree *s_t; if (ret == -BCH_ERR_ENOSPC_btree_slot) - ret = -BCH_ERR_ENOSPC_snapshot_tree; + ret = bch_err_throw(trans->c, ENOSPC_snapshot_tree); if (ret) return ERR_PTR(ret); @@ -105,11 +105,8 @@ static bool __bch2_snapshot_is_ancestor_early(struct snapshot_table *t, u32 id, static bool bch2_snapshot_is_ancestor_early(struct bch_fs *c, u32 id, u32 ancestor) { - rcu_read_lock(); - bool ret = __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor); - rcu_read_unlock(); - - return ret; + guard(rcu)(); + return __bch2_snapshot_is_ancestor_early(rcu_dereference(c->snapshots), id, ancestor); } static inline u32 get_ancestor_below(struct snapshot_table *t, u32 id, u32 ancestor) @@ -140,13 +137,11 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) { bool ret; - rcu_read_lock(); + guard(rcu)(); struct snapshot_table *t = rcu_dereference(c->snapshots); - if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots)) { - ret = __bch2_snapshot_is_ancestor_early(t, id, ancestor); - goto out; - } + if (unlikely(c->recovery.pass_done < BCH_RECOVERY_PASS_check_snapshots)) + return __bch2_snapshot_is_ancestor_early(t, id, ancestor); if (likely(ancestor >= IS_ANCESTOR_BITMAP)) while (id && id < ancestor - IS_ANCESTOR_BITMAP) @@ -157,9 +152,6 @@ bool __bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ancestor) : id == ancestor; EBUG_ON(ret != __bch2_snapshot_is_ancestor_early(t, id, ancestor)); -out: - rcu_read_unlock(); - return ret; } @@ -293,7 +285,7 @@ static int bch2_snapshot_table_make_room(struct bch_fs *c, u32 id) mutex_lock(&c->snapshot_table_lock); int ret = snapshot_t_mut(c, id) ? 
0 - : -BCH_ERR_ENOMEM_mark_snapshot; + : bch_err_throw(c, ENOMEM_mark_snapshot); mutex_unlock(&c->snapshot_table_lock); return ret; } @@ -312,7 +304,7 @@ static int __bch2_mark_snapshot(struct btree_trans *trans, t = snapshot_t_mut(c, id); if (!t) { - ret = -BCH_ERR_ENOMEM_mark_snapshot; + ret = bch_err_throw(c, ENOMEM_mark_snapshot); goto err; } @@ -412,10 +404,10 @@ static u32 bch2_snapshot_tree_next(struct bch_fs *c, u32 id) u32 bch2_snapshot_oldest_subvol(struct bch_fs *c, u32 snapshot_root, snapshot_id_list *skip) { + guard(rcu)(); u32 id, subvol = 0, s; retry: id = snapshot_root; - rcu_read_lock(); while (id && bch2_snapshot_exists(c, id)) { if (!(skip && snapshot_list_has_id(skip, id))) { s = snapshot_t(c, id)->subvol; @@ -427,7 +419,6 @@ retry: if (id == snapshot_root) break; } - rcu_read_unlock(); if (!subvol && skip) { skip = NULL; @@ -617,18 +608,14 @@ static int snapshot_tree_ptr_good(struct btree_trans *trans, u32 bch2_snapshot_skiplist_get(struct bch_fs *c, u32 id) { - const struct snapshot_t *s; - if (!id) return 0; - rcu_read_lock(); - s = snapshot_t(c, id); - if (s->parent) - id = bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)); - rcu_read_unlock(); - - return id; + guard(rcu)(); + const struct snapshot_t *s = snapshot_t(c, id); + return s->parent + ? bch2_snapshot_nth_parent(c, id, get_random_u32_below(s->depth)) + : id; } static int snapshot_skiplist_good(struct btree_trans *trans, u32 id, struct bch_snapshot s) @@ -947,10 +934,7 @@ static inline bool same_snapshot(struct snapshot_tree_reconstruct *r, struct bpo static inline bool snapshot_id_lists_have_common(snapshot_id_list *l, snapshot_id_list *r) { - darray_for_each(*l, i) - if (snapshot_list_has_id(r, *i)) - return true; - return false; + return darray_find_p(*l, i, snapshot_list_has_id(r, *i)) != NULL; } static void snapshot_id_list_to_text(struct printbuf *out, snapshot_id_list *s) @@ -1022,7 +1006,7 @@ int bch2_reconstruct_snapshots(struct bch_fs *c) "snapshot node %u from tree %s missing, recreate?", *id, buf.buf)) { if (t->nr > 1) { bch_err(c, "cannot reconstruct snapshot trees with multiple nodes"); - ret = -BCH_ERR_fsck_repair_unimplemented; + ret = bch_err_throw(c, fsck_repair_unimplemented); goto err; } @@ -1061,24 +1045,73 @@ int __bch2_check_key_has_snapshot(struct btree_trans *trans, ret = bch2_btree_delete_at(trans, iter, BTREE_UPDATE_internal_snapshot_node) ?: 1; - /* - * Snapshot missing: we should have caught this with btree_lost_data and - * kicked off reconstruct_snapshots, so if we end up here we have no - * idea what happened: - */ - if (fsck_err_on(state == SNAPSHOT_ID_empty, - trans, bkey_in_missing_snapshot, - "key in missing snapshot %s, delete?", - (bch2_btree_id_to_text(&buf, iter->btree_id), - prt_char(&buf, ' '), - bch2_bkey_val_to_text(&buf, c, k), buf.buf))) - ret = bch2_btree_delete_at(trans, iter, - BTREE_UPDATE_internal_snapshot_node) ?: 1; + if (state == SNAPSHOT_ID_empty) { + /* + * Snapshot missing: we should have caught this with btree_lost_data and + * kicked off reconstruct_snapshots, so if we end up here we have no + * idea what happened. + * + * Do not delete unless we know that subvolumes and snapshots + * are consistent: + * + * XXX: + * + * We could be smarter here, and instead of using the generic + * recovery pass ratelimiting, track if there have been any + * changes to the snapshots or inodes btrees since those passes + * last ran. 
+ */ + ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_snapshots) ?: ret; + ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_check_subvols) ?: ret; + + if (c->sb.btrees_lost_data & BIT_ULL(BTREE_ID_snapshots)) + ret = bch2_require_recovery_pass(c, &buf, BCH_RECOVERY_PASS_reconstruct_snapshots) ?: ret; + + unsigned repair_flags = FSCK_CAN_IGNORE | (!ret ? FSCK_CAN_FIX : 0); + + if (__fsck_err(trans, repair_flags, bkey_in_missing_snapshot, + "key in missing snapshot %s, delete?", + (bch2_btree_id_to_text(&buf, iter->btree_id), + prt_char(&buf, ' '), + bch2_bkey_val_to_text(&buf, c, k), buf.buf))) { + ret = bch2_btree_delete_at(trans, iter, + BTREE_UPDATE_internal_snapshot_node) ?: 1; + } + } fsck_err: printbuf_exit(&buf); return ret; } +int __bch2_get_snapshot_overwrites(struct btree_trans *trans, + enum btree_id btree, struct bpos pos, + snapshot_id_list *s) +{ + struct bch_fs *c = trans->c; + struct btree_iter iter; + struct bkey_s_c k; + int ret = 0; + + for_each_btree_key_reverse_norestart(trans, iter, btree, bpos_predecessor(pos), + BTREE_ITER_all_snapshots, k, ret) { + if (!bkey_eq(k.k->p, pos)) + break; + + if (!bch2_snapshot_is_ancestor(c, k.k->p.snapshot, pos.snapshot) || + snapshot_list_has_ancestor(c, s, k.k->p.snapshot)) + continue; + + ret = snapshot_list_add(c, s, k.k->p.snapshot); + if (ret) + break; + } + bch2_trans_iter_exit(trans, &iter); + if (ret) + darray_exit(s); + + return ret; +} + /* * Mark a snapshot as deleted, for future cleanup: */ @@ -1263,7 +1296,7 @@ static int create_snapids(struct btree_trans *trans, u32 parent, u32 tree, goto err; if (!k.k || !k.k->p.offset) { - ret = -BCH_ERR_ENOSPC_snapshot_create; + ret = bch_err_throw(c, ENOSPC_snapshot_create); goto err; } @@ -1399,10 +1432,8 @@ int bch2_snapshot_node_create(struct btree_trans *trans, u32 parent, static inline u32 interior_delete_has_id(interior_delete_list *l, u32 id) { - darray_for_each(*l, i) - if (i->id == id) - return i->live_child; - return 0; + struct snapshot_interior_delete *i = darray_find_p(*l, i, i->id == id); + return i ? 
i->live_child : 0; } static unsigned __live_child(struct snapshot_table *t, u32 id, @@ -1434,11 +1465,9 @@ static unsigned live_child(struct bch_fs *c, u32 id) { struct snapshot_delete *d = &c->snapshot_delete; - rcu_read_lock(); - u32 ret = __live_child(rcu_dereference(c->snapshots), id, - &d->delete_leaves, &d->delete_interior); - rcu_read_unlock(); - return ret; + guard(rcu)(); + return __live_child(rcu_dereference(c->snapshots), id, + &d->delete_leaves, &d->delete_interior); } static bool snapshot_id_dying(struct snapshot_delete *d, unsigned id) @@ -1695,7 +1724,7 @@ static int check_should_delete_snapshot(struct btree_trans *trans, struct bkey_s static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, interior_delete_list *skip) { - rcu_read_lock(); + guard(rcu)(); while (interior_delete_has_id(skip, id)) id = __bch2_snapshot_parent(c, id); @@ -1704,7 +1733,6 @@ static inline u32 bch2_snapshot_nth_parent_skip(struct bch_fs *c, u32 id, u32 n, id = __bch2_snapshot_parent(c, id); } while (interior_delete_has_id(skip, id)); } - rcu_read_unlock(); return id; } @@ -1870,6 +1898,8 @@ err: d->running = false; mutex_unlock(&d->progress_lock); bch2_trans_put(trans); + + bch2_recovery_pass_set_no_ratelimit(c, BCH_RECOVERY_PASS_check_snapshots); out_unlock: mutex_unlock(&d->lock); if (!bch2_err_matches(ret, EROFS)) @@ -1905,7 +1935,7 @@ void bch2_delete_dead_snapshots_async(struct bch_fs *c) BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); - if (!queue_work(c->write_ref_wq, &c->snapshot_delete.work)) + if (!queue_work(system_long_wq, &c->snapshot_delete.work)) enumerated_ref_put(&c->writes, BCH_WRITE_REF_delete_dead_snapshots); } diff --git a/fs/bcachefs/snapshot.h b/fs/bcachefs/snapshot.h index 382a171f5413..6766bf673ed9 100644 --- a/fs/bcachefs/snapshot.h +++ b/fs/bcachefs/snapshot.h @@ -46,12 +46,9 @@ static inline const struct snapshot_t *snapshot_t(struct bch_fs *c, u32 id) static inline u32 bch2_snapshot_tree(struct bch_fs *c, u32 id) { - rcu_read_lock(); + guard(rcu)(); const struct snapshot_t *s = snapshot_t(c, id); - id = s ? s->tree : 0; - rcu_read_unlock(); - - return id; + return s ? 
s->tree : 0; } static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) @@ -62,11 +59,8 @@ static inline u32 __bch2_snapshot_parent_early(struct bch_fs *c, u32 id) static inline u32 bch2_snapshot_parent_early(struct bch_fs *c, u32 id) { - rcu_read_lock(); - id = __bch2_snapshot_parent_early(c, id); - rcu_read_unlock(); - - return id; + guard(rcu)(); + return __bch2_snapshot_parent_early(c, id); } static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) @@ -88,20 +82,15 @@ static inline u32 __bch2_snapshot_parent(struct bch_fs *c, u32 id) static inline u32 bch2_snapshot_parent(struct bch_fs *c, u32 id) { - rcu_read_lock(); - id = __bch2_snapshot_parent(c, id); - rcu_read_unlock(); - - return id; + guard(rcu)(); + return __bch2_snapshot_parent(c, id); } static inline u32 bch2_snapshot_nth_parent(struct bch_fs *c, u32 id, u32 n) { - rcu_read_lock(); + guard(rcu)(); while (n--) id = __bch2_snapshot_parent(c, id); - rcu_read_unlock(); - return id; } @@ -110,13 +99,11 @@ u32 bch2_snapshot_skiplist_get(struct bch_fs *, u32); static inline u32 bch2_snapshot_root(struct bch_fs *c, u32 id) { - u32 parent; + guard(rcu)(); - rcu_read_lock(); + u32 parent; while ((parent = __bch2_snapshot_parent(c, id))) id = parent; - rcu_read_unlock(); - return id; } @@ -128,11 +115,8 @@ static inline enum snapshot_id_state __bch2_snapshot_id_state(struct bch_fs *c, static inline enum snapshot_id_state bch2_snapshot_id_state(struct bch_fs *c, u32 id) { - rcu_read_lock(); - enum snapshot_id_state ret = __bch2_snapshot_id_state(c, id); - rcu_read_unlock(); - - return ret; + guard(rcu)(); + return __bch2_snapshot_id_state(c, id); } static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) @@ -142,12 +126,9 @@ static inline bool bch2_snapshot_exists(struct bch_fs *c, u32 id) static inline int bch2_snapshot_is_internal_node(struct bch_fs *c, u32 id) { - rcu_read_lock(); + guard(rcu)(); const struct snapshot_t *s = snapshot_t(c, id); - int ret = s ? s->children[0] : -BCH_ERR_invalid_snapshot_node; - rcu_read_unlock(); - - return ret; + return s ? s->children[0] : -BCH_ERR_invalid_snapshot_node; } static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) @@ -160,13 +141,8 @@ static inline int bch2_snapshot_is_leaf(struct bch_fs *c, u32 id) static inline u32 bch2_snapshot_depth(struct bch_fs *c, u32 parent) { - u32 depth; - - rcu_read_lock(); - depth = parent ? snapshot_t(c, parent)->depth + 1 : 0; - rcu_read_unlock(); - - return depth; + guard(rcu)(); + return parent ? 
snapshot_t(c, parent)->depth + 1 : 0; } bool __bch2_snapshot_is_ancestor(struct bch_fs *, u32, u32); @@ -180,20 +156,14 @@ static inline bool bch2_snapshot_is_ancestor(struct bch_fs *c, u32 id, u32 ances static inline bool bch2_snapshot_has_children(struct bch_fs *c, u32 id) { - rcu_read_lock(); + guard(rcu)(); const struct snapshot_t *t = snapshot_t(c, id); - bool ret = t && (t->children[0]|t->children[1]) != 0; - rcu_read_unlock(); - - return ret; + return t && (t->children[0]|t->children[1]) != 0; } static inline bool snapshot_list_has_id(snapshot_id_list *s, u32 id) { - darray_for_each(*s, i) - if (*i == id) - return true; - return false; + return darray_find(*s, id) != NULL; } static inline bool snapshot_list_has_ancestor(struct bch_fs *c, snapshot_id_list *s, u32 id) @@ -258,6 +228,25 @@ static inline int bch2_check_key_has_snapshot(struct btree_trans *trans, : __bch2_check_key_has_snapshot(trans, iter, k); } +int __bch2_get_snapshot_overwrites(struct btree_trans *, + enum btree_id, struct bpos, + snapshot_id_list *); + +/* + * Get a list of snapshot IDs that have overwritten a given key: + */ +static inline int bch2_get_snapshot_overwrites(struct btree_trans *trans, + enum btree_id btree, struct bpos pos, + snapshot_id_list *s) +{ + darray_init(s); + + return bch2_snapshot_has_children(trans->c, pos.snapshot) + ? __bch2_get_snapshot_overwrites(trans, btree, pos, s) + : 0; + +} + int bch2_snapshot_node_set_deleted(struct btree_trans *, u32); int __bch2_key_has_snapshot_overwrites(struct btree_trans *, enum btree_id, struct bpos); diff --git a/fs/bcachefs/str_hash.c b/fs/bcachefs/str_hash.c index 0cbf5508a32c..71b735a85026 100644 --- a/fs/bcachefs/str_hash.c +++ b/fs/bcachefs/str_hash.c @@ -31,14 +31,15 @@ static int bch2_dirent_has_target(struct btree_trans *trans, struct bkey_s_c_dir } } -static noinline int fsck_rename_dirent(struct btree_trans *trans, - struct snapshots_seen *s, - const struct bch_hash_desc desc, - struct bch_hash_info *hash_info, - struct bkey_s_c_dirent old) +static int bch2_fsck_rename_dirent(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc desc, + struct bch_hash_info *hash_info, + struct bkey_s_c_dirent old, + bool *updated_before_k_pos) { struct qstr old_name = bch2_dirent_get_name(old); - struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, bkey_bytes(old.k) + 32); + struct bkey_i_dirent *new = bch2_trans_kmalloc(trans, BKEY_U64s_MAX * sizeof(u64)); int ret = PTR_ERR_OR_ZERO(new); if (ret) return ret; @@ -47,28 +48,39 @@ static noinline int fsck_rename_dirent(struct btree_trans *trans, dirent_copy_target(new, old); new->k.p = old.k->p; + char *renamed_buf = bch2_trans_kmalloc(trans, old_name.len + 20); + ret = PTR_ERR_OR_ZERO(renamed_buf); + if (ret) + return ret; + for (unsigned i = 0; i < 1000; i++) { - unsigned len = sprintf(new->v.d_name, "%.*s.fsck_renamed-%u", - old_name.len, old_name.name, i); - unsigned u64s = BKEY_U64s + dirent_val_u64s(len, 0); + new->k.u64s = BKEY_U64s_MAX; - if (u64s > U8_MAX) - return -EINVAL; + struct qstr renamed_name = (struct qstr) QSTR_INIT(renamed_buf, + sprintf(renamed_buf, "%.*s.fsck_renamed-%u", + old_name.len, old_name.name, i)); - new->k.u64s = u64s; + ret = bch2_dirent_init_name(new, hash_info, &renamed_name, NULL); + if (ret) + return ret; ret = bch2_hash_set_in_snapshot(trans, bch2_dirent_hash_desc, hash_info, (subvol_inum) { 0, old.k->p.inode }, old.k->p.snapshot, &new->k_i, - BTREE_UPDATE_internal_snapshot_node); - if (!bch2_err_matches(ret, EEXIST)) + 
BTREE_UPDATE_internal_snapshot_node| + STR_HASH_must_create); + if (ret && !bch2_err_matches(ret, EEXIST)) + break; + if (!ret) { + if (bpos_lt(new->k.p, old.k->p)) + *updated_before_k_pos = true; break; + } } - if (ret) - return ret; - - return bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); + ret = ret ?: bch2_fsck_update_backpointers(trans, s, desc, hash_info, &new->k_i); + bch_err_fn(trans->c, ret); + return ret; } static noinline int hash_pick_winner(struct btree_trans *trans, @@ -186,7 +198,7 @@ int bch2_repair_inode_hash_info(struct btree_trans *trans, #endif bch2_print_str(c, KERN_ERR, buf.buf); printbuf_exit(&buf); - ret = -BCH_ERR_fsck_repair_unimplemented; + ret = bch_err_throw(c, fsck_repair_unimplemented); goto err; } @@ -221,11 +233,115 @@ static noinline int check_inode_hash_info_matches_root(struct btree_trans *trans return ret; } +/* Put a str_hash key in its proper location, checking for duplicates */ +int bch2_str_hash_repair_key(struct btree_trans *trans, + struct snapshots_seen *s, + const struct bch_hash_desc *desc, + struct bch_hash_info *hash_info, + struct btree_iter *k_iter, struct bkey_s_c k, + struct btree_iter *dup_iter, struct bkey_s_c dup_k, + bool *updated_before_k_pos) +{ + struct bch_fs *c = trans->c; + struct printbuf buf = PRINTBUF; + bool free_snapshots_seen = false; + int ret = 0; + + if (!s) { + s = bch2_trans_kmalloc(trans, sizeof(*s)); + ret = PTR_ERR_OR_ZERO(s); + if (ret) + goto out; + + s->pos = k_iter->pos; + darray_init(&s->ids); + + ret = bch2_get_snapshot_overwrites(trans, desc->btree_id, k_iter->pos, &s->ids); + if (ret) + goto out; + + free_snapshots_seen = true; + } + + if (!dup_k.k) { + struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, k); + ret = PTR_ERR_OR_ZERO(new); + if (ret) + goto out; + + dup_k = bch2_hash_set_or_get_in_snapshot(trans, dup_iter, *desc, hash_info, + (subvol_inum) { 0, new->k.p.inode }, + new->k.p.snapshot, new, + STR_HASH_must_create| + BTREE_ITER_with_updates| + BTREE_UPDATE_internal_snapshot_node); + ret = bkey_err(dup_k); + if (ret) + goto out; + if (dup_k.k) + goto duplicate_entries; + + if (bpos_lt(new->k.p, k.k->p)) + *updated_before_k_pos = true; + + ret = bch2_insert_snapshot_whiteouts(trans, desc->btree_id, + k_iter->pos, new->k.p) ?: + bch2_hash_delete_at(trans, *desc, hash_info, k_iter, + BTREE_ITER_with_updates| + BTREE_UPDATE_internal_snapshot_node) ?: + bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?: + bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: + -BCH_ERR_transaction_restart_commit; + } else { +duplicate_entries: + ret = hash_pick_winner(trans, *desc, hash_info, k, dup_k); + if (ret < 0) + goto out; + + if (!fsck_err(trans, hash_table_key_duplicate, + "duplicate hash table keys%s:\n%s", + ret != 2 ? 
"" : ", both point to valid inodes", + (printbuf_reset(&buf), + bch2_bkey_val_to_text(&buf, c, k), + prt_newline(&buf), + bch2_bkey_val_to_text(&buf, c, dup_k), + buf.buf))) + goto out; + + switch (ret) { + case 0: + ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); + break; + case 1: + ret = bch2_hash_delete_at(trans, *desc, hash_info, dup_iter, 0); + break; + case 2: + ret = bch2_fsck_rename_dirent(trans, s, *desc, hash_info, + bkey_s_c_to_dirent(k), + updated_before_k_pos) ?: + bch2_hash_delete_at(trans, *desc, hash_info, k_iter, + BTREE_ITER_with_updates); + goto out; + } + + ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: + -BCH_ERR_transaction_restart_commit; + } +out: +fsck_err: + bch2_trans_iter_exit(trans, dup_iter); + printbuf_exit(&buf); + if (free_snapshots_seen) + darray_exit(&s->ids); + return ret; +} + int __bch2_str_hash_check_key(struct btree_trans *trans, struct snapshots_seen *s, const struct bch_hash_desc *desc, struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c hash_k) + struct btree_iter *k_iter, struct bkey_s_c hash_k, + bool *updated_before_k_pos) { struct bch_fs *c = trans->c; struct btree_iter iter = {}; @@ -239,24 +355,31 @@ int __bch2_str_hash_check_key(struct btree_trans *trans, for_each_btree_key_norestart(trans, iter, desc->btree_id, SPOS(hash_k.k->p.inode, hash, hash_k.k->p.snapshot), - BTREE_ITER_slots, k, ret) { + BTREE_ITER_slots| + BTREE_ITER_with_updates, k, ret) { if (bkey_eq(k.k->p, hash_k.k->p)) break; if (k.k->type == desc->key_type && - !desc->cmp_bkey(k, hash_k)) - goto duplicate_entries; + !desc->cmp_bkey(k, hash_k)) { + ret = check_inode_hash_info_matches_root(trans, hash_k.k->p.inode, + hash_info) ?: + bch2_str_hash_repair_key(trans, s, desc, hash_info, + k_iter, hash_k, + &iter, k, updated_before_k_pos); + break; + } - if (bkey_deleted(k.k)) { - bch2_trans_iter_exit(trans, &iter); + if (bkey_deleted(k.k)) goto bad_hash; - } } -out: bch2_trans_iter_exit(trans, &iter); +out: +fsck_err: printbuf_exit(&buf); return ret; bad_hash: + bch2_trans_iter_exit(trans, &iter); /* * Before doing any repair, check hash_info itself: */ @@ -265,64 +388,12 @@ bad_hash: goto out; if (fsck_err(trans, hash_table_key_wrong_offset, - "hash table key at wrong offset: btree %s inode %llu offset %llu, hashed to %llu\n%s", - bch2_btree_id_str(desc->btree_id), hash_k.k->p.inode, hash_k.k->p.offset, hash, - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) { - struct bkey_i *new = bch2_bkey_make_mut_noupdate(trans, hash_k); - if (IS_ERR(new)) - return PTR_ERR(new); - - k = bch2_hash_set_or_get_in_snapshot(trans, &iter, *desc, hash_info, - (subvol_inum) { 0, hash_k.k->p.inode }, - hash_k.k->p.snapshot, new, - STR_HASH_must_create| - BTREE_ITER_with_updates| - BTREE_UPDATE_internal_snapshot_node); - ret = bkey_err(k); - if (ret) - goto out; - if (k.k) - goto duplicate_entries; - - ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, - BTREE_UPDATE_internal_snapshot_node) ?: - bch2_fsck_update_backpointers(trans, s, *desc, hash_info, new) ?: - bch2_trans_commit(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc) ?: - -BCH_ERR_transaction_restart_nested; - goto out; - } -fsck_err: - goto out; -duplicate_entries: - ret = hash_pick_winner(trans, *desc, hash_info, hash_k, k); - if (ret < 0) - goto out; - - if (!fsck_err(trans, hash_table_key_duplicate, - "duplicate hash table keys%s:\n%s", - ret != 2 ? 
"" : ", both point to valid inodes", - (printbuf_reset(&buf), - bch2_bkey_val_to_text(&buf, c, hash_k), - prt_newline(&buf), - bch2_bkey_val_to_text(&buf, c, k), - buf.buf))) - goto out; - - switch (ret) { - case 0: - ret = bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); - break; - case 1: - ret = bch2_hash_delete_at(trans, *desc, hash_info, &iter, 0); - break; - case 2: - ret = fsck_rename_dirent(trans, s, *desc, hash_info, bkey_s_c_to_dirent(hash_k)) ?: - bch2_hash_delete_at(trans, *desc, hash_info, k_iter, 0); - goto out; - } - - ret = bch2_trans_commit(trans, NULL, NULL, 0) ?: - -BCH_ERR_transaction_restart_nested; + "hash table key at wrong offset: should be at %llu\n%s", + hash, + (bch2_bkey_val_to_text(&buf, c, hash_k), buf.buf))) + ret = bch2_str_hash_repair_key(trans, s, desc, hash_info, + k_iter, hash_k, + &iter, bkey_s_c_null, + updated_before_k_pos); goto out; } diff --git a/fs/bcachefs/str_hash.h b/fs/bcachefs/str_hash.h index 6762b3627e1b..79d51aef70aa 100644 --- a/fs/bcachefs/str_hash.h +++ b/fs/bcachefs/str_hash.h @@ -261,6 +261,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, struct bkey_i *insert, enum btree_iter_update_trigger_flags flags) { + struct bch_fs *c = trans->c; struct btree_iter slot = {}; struct bkey_s_c k; bool found = false; @@ -288,7 +289,7 @@ struct bkey_s_c bch2_hash_set_or_get_in_snapshot(struct btree_trans *trans, } if (!ret) - ret = -BCH_ERR_ENOSPC_str_hash_create; + ret = bch_err_throw(c, ENOSPC_str_hash_create); out: bch2_trans_iter_exit(trans, &slot); bch2_trans_iter_exit(trans, iter); @@ -300,7 +301,7 @@ not_found: bch2_trans_iter_exit(trans, &slot); return k; } else if (!found && (flags & STR_HASH_must_replace)) { - ret = -BCH_ERR_ENOENT_str_hash_set_must_replace; + ret = bch_err_throw(c, ENOENT_str_hash_set_must_replace); } else { if (!found && slot.path) swap(*iter, slot); @@ -328,7 +329,7 @@ int bch2_hash_set_in_snapshot(struct btree_trans *trans, return ret; if (k.k) { bch2_trans_iter_exit(trans, &iter); - return -BCH_ERR_EEXIST_str_hash_set; + return bch_err_throw(trans->c, EEXIST_str_hash_set); } return 0; @@ -397,17 +398,27 @@ int bch2_hash_delete(struct btree_trans *trans, int bch2_repair_inode_hash_info(struct btree_trans *, struct bch_inode_unpacked *); struct snapshots_seen; +int bch2_str_hash_repair_key(struct btree_trans *, + struct snapshots_seen *, + const struct bch_hash_desc *, + struct bch_hash_info *, + struct btree_iter *, struct bkey_s_c, + struct btree_iter *, struct bkey_s_c, + bool *); + int __bch2_str_hash_check_key(struct btree_trans *, struct snapshots_seen *, const struct bch_hash_desc *, struct bch_hash_info *, - struct btree_iter *, struct bkey_s_c); + struct btree_iter *, struct bkey_s_c, + bool *); static inline int bch2_str_hash_check_key(struct btree_trans *trans, struct snapshots_seen *s, const struct bch_hash_desc *desc, struct bch_hash_info *hash_info, - struct btree_iter *k_iter, struct bkey_s_c hash_k) + struct btree_iter *k_iter, struct bkey_s_c hash_k, + bool *updated_before_k_pos) { if (hash_k.k->type != desc->key_type) return 0; @@ -415,7 +426,8 @@ static inline int bch2_str_hash_check_key(struct btree_trans *trans, if (likely(desc->hash_bkey(hash_info, hash_k) == hash_k.k->p.offset)) return 0; - return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k); + return __bch2_str_hash_check_key(trans, s, desc, hash_info, k_iter, hash_k, + updated_before_k_pos); } #endif /* _BCACHEFS_STR_HASH_H */ diff --git a/fs/bcachefs/subvolume.c 
b/fs/bcachefs/subvolume.c index 35c9f86a73c1..020587449123 100644 --- a/fs/bcachefs/subvolume.c +++ b/fs/bcachefs/subvolume.c @@ -130,10 +130,20 @@ static int check_subvol(struct btree_trans *trans, "subvolume %llu points to missing subvolume root %llu:%u", k.k->p.offset, le64_to_cpu(subvol.v->inode), le32_to_cpu(subvol.v->snapshot))) { - ret = bch2_subvolume_delete(trans, iter->pos.offset); - bch_err_msg(c, ret, "deleting subvolume %llu", iter->pos.offset); - ret = ret ?: -BCH_ERR_transaction_restart_nested; - goto err; + /* + * Recreate - any contents that are still disconnected + * will then get reattached under lost+found + */ + bch2_inode_init_early(c, &inode); + bch2_inode_init_late(c, &inode, bch2_current_time(c), + 0, 0, S_IFDIR|0700, 0, NULL); + inode.bi_inum = le64_to_cpu(subvol.v->inode); + inode.bi_snapshot = le32_to_cpu(subvol.v->snapshot); + inode.bi_subvol = k.k->p.offset; + inode.bi_parent_subvol = le32_to_cpu(subvol.v->fs_path_parent); + ret = __bch2_fsck_write_inode(trans, &inode); + if (ret) + goto err; } } else { goto err; @@ -141,13 +151,9 @@ static int check_subvol(struct btree_trans *trans, if (!BCH_SUBVOLUME_SNAP(subvol.v)) { u32 snapshot_root = bch2_snapshot_root(c, le32_to_cpu(subvol.v->snapshot)); - u32 snapshot_tree; - struct bch_snapshot_tree st; - - rcu_read_lock(); - snapshot_tree = snapshot_t(c, snapshot_root)->tree; - rcu_read_unlock(); + u32 snapshot_tree = bch2_snapshot_tree(c, snapshot_root); + struct bch_snapshot_tree st; ret = bch2_snapshot_tree_lookup(trans, snapshot_tree, &st); bch2_fs_inconsistent_on(bch2_err_matches(ret, ENOENT), c, @@ -259,6 +265,13 @@ void bch2_subvolume_to_text(struct printbuf *out, struct bch_fs *c, prt_printf(out, " creation_parent %u", le32_to_cpu(s.v->creation_parent)); prt_printf(out, " fs_parent %u", le32_to_cpu(s.v->fs_path_parent)); } + + if (BCH_SUBVOLUME_RO(s.v)) + prt_printf(out, " ro"); + if (BCH_SUBVOLUME_SNAP(s.v)) + prt_printf(out, " snapshot"); + if (BCH_SUBVOLUME_UNLINKED(s.v)) + prt_printf(out, " unlinked"); } static int subvolume_children_mod(struct btree_trans *trans, struct bpos pos, bool set) @@ -486,9 +499,12 @@ err: static int bch2_subvolume_delete(struct btree_trans *trans, u32 subvolid) { - return bch2_subvolumes_reparent(trans, subvolid) ?: + int ret = bch2_subvolumes_reparent(trans, subvolid) ?: commit_do(trans, NULL, NULL, BCH_TRANS_COMMIT_no_enospc, __bch2_subvolume_delete(trans, subvolid)); + + bch2_recovery_pass_set_no_ratelimit(trans->c, BCH_RECOVERY_PASS_check_subvols); + return ret; } static void bch2_subvolume_wait_for_pagecache_and_delete(struct work_struct *work) @@ -597,7 +613,7 @@ int bch2_subvolume_create(struct btree_trans *trans, u64 inode, ret = bch2_bkey_get_empty_slot(trans, &dst_iter, BTREE_ID_subvolumes, POS(0, U32_MAX)); if (ret == -BCH_ERR_ENOSPC_btree_slot) - ret = -BCH_ERR_ENOSPC_subvolume_create; + ret = bch_err_throw(c, ENOSPC_subvolume_create); if (ret) return ret; @@ -703,8 +719,9 @@ static int __bch2_fs_upgrade_for_subvolumes(struct btree_trans *trans) return ret; if (!bkey_is_inode(k.k)) { - bch_err(trans->c, "root inode not found"); - ret = -BCH_ERR_ENOENT_inode; + struct bch_fs *c = trans->c; + bch_err(c, "root inode not found"); + ret = bch_err_throw(c, ENOENT_inode); goto err; } diff --git a/fs/bcachefs/super-io.c b/fs/bcachefs/super-io.c index 6687b9235d3c..6c2e1d647403 100644 --- a/fs/bcachefs/super-io.c +++ b/fs/bcachefs/super-io.c @@ -1112,7 +1112,7 @@ int bch2_write_super(struct bch_fs *c) prt_str(&buf, ")"); bch2_fs_fatal_error(c, ": %s", buf.buf); 
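/*
 * The bcachefs hunks above and below convert bare "return -BCH_ERR_xxx;"
 * statements into "return bch_err_throw(c, xxx);".  As a hedged
 * illustration only (the identifiers below and the exact body of the
 * kernel macro are assumptions, not taken from this patch), the idea is
 * a macro that records *where* a private error code is generated before
 * handing the negative value back to the caller, which is what the new
 * error_throw tracepoint later in this series hooks into.  A minimal
 * self-contained userspace sketch of that pattern:
 */
#include <stdio.h>

enum demo_err {
	DEMO_ERR_START = 2048,
	DEMO_ERR_mismatched_block_size,
	DEMO_ERR_bucket_size_too_small,
};

/* stand-in for the per-filesystem context the real macro receives */
struct demo_fs { const char *name; };

/* stand-in for a trace hook fired at the throw site */
static void demo_trace_throw(struct demo_fs *fs, int err, const char *fn)
{
	fprintf(stderr, "%s: throwing err %d from %s()\n", fs->name, err, fn);
}

/* record the throw site, then return the negative private error code */
#define demo_err_throw(_fs, _err) \
	({ demo_trace_throw((_fs), DEMO_ERR_##_err, __func__); -DEMO_ERR_##_err; })

static int demo_check_block_size(struct demo_fs *fs, unsigned dev_bs, unsigned fs_bs)
{
	if (dev_bs != fs_bs)
		return demo_err_throw(fs, mismatched_block_size);
	return 0;
}

int main(void)
{
	struct demo_fs fs = { .name = "demo" };

	/* mismatch -> negative private code, with the throw site reported */
	printf("ret = %d\n", demo_check_block_size(&fs, 512, 4096));
	return 0;
}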
printbuf_exit(&buf); - ret = -BCH_ERR_sb_not_downgraded; + ret = bch_err_throw(c, sb_not_downgraded); goto out; } @@ -1142,7 +1142,7 @@ int bch2_write_super(struct bch_fs *c) if (c->opts.errors != BCH_ON_ERROR_continue && c->opts.errors != BCH_ON_ERROR_fix_safe) { - ret = -BCH_ERR_erofs_sb_err; + ret = bch_err_throw(c, erofs_sb_err); bch2_fs_fatal_error(c, "%s", buf.buf); } else { bch_err(c, "%s", buf.buf); @@ -1161,7 +1161,7 @@ int bch2_write_super(struct bch_fs *c) ca->disk_sb.seq); bch2_fs_fatal_error(c, "%s", buf.buf); printbuf_exit(&buf); - ret = -BCH_ERR_erofs_sb_err; + ret = bch_err_throw(c, erofs_sb_err); } } @@ -1215,7 +1215,7 @@ int bch2_write_super(struct bch_fs *c) !can_mount_with_written), c, ": Unable to write superblock to sufficient devices (from %ps)", (void *) _RET_IP_)) - ret = -BCH_ERR_erofs_sb_err; + ret = bch_err_throw(c, erofs_sb_err); out: /* Make new options visible after they're persistent: */ bch2_sb_update(c); diff --git a/fs/bcachefs/super.c b/fs/bcachefs/super.c index 11579b74c640..397a69da5a75 100644 --- a/fs/bcachefs/super.c +++ b/fs/bcachefs/super.c @@ -219,23 +219,17 @@ static int bch2_fs_init_rw(struct bch_fs *); struct bch_fs *bch2_dev_to_fs(dev_t dev) { - struct bch_fs *c; - - mutex_lock(&bch_fs_list_lock); - rcu_read_lock(); + guard(mutex)(&bch_fs_list_lock); + guard(rcu)(); + struct bch_fs *c; list_for_each_entry(c, &bch_fs_list, list) for_each_member_device_rcu(c, ca, NULL) if (ca->disk_sb.bdev && ca->disk_sb.bdev->bd_dev == dev) { closure_get(&c->cl); - goto found; + return c; } - c = NULL; -found: - rcu_read_unlock(); - mutex_unlock(&bch_fs_list_lock); - - return c; + return NULL; } static struct bch_fs *__bch2_uuid_to_fs(__uuid_t uuid) @@ -480,16 +474,16 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) BUG_ON(!test_bit(BCH_FS_may_go_rw, &c->flags)); if (WARN_ON(c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info))) - return -BCH_ERR_erofs_no_alloc_info; + return bch_err_throw(c, erofs_no_alloc_info); if (test_bit(BCH_FS_initial_gc_unfixed, &c->flags)) { bch_err(c, "cannot go rw, unfixed btree errors"); - return -BCH_ERR_erofs_unfixed_errors; + return bch_err_throw(c, erofs_unfixed_errors); } if (c->sb.features & BIT_ULL(BCH_FEATURE_small_image)) { bch_err(c, "cannot go rw, filesystem is an unresized image file"); - return -BCH_ERR_erofs_filesystem_full; + return bch_err_throw(c, erofs_filesystem_full); } if (test_bit(BCH_FS_rw, &c->flags)) @@ -507,13 +501,12 @@ static int __bch2_fs_read_write(struct bch_fs *c, bool early) clear_bit(BCH_FS_clean_shutdown, &c->flags); - rcu_read_lock(); - for_each_online_member_rcu(c, ca) - if (ca->mi.state == BCH_MEMBER_STATE_rw) { - bch2_dev_allocator_add(c, ca); - enumerated_ref_start(&ca->io_ref[WRITE]); - } - rcu_read_unlock(); + scoped_guard(rcu) + for_each_online_member_rcu(c, ca) + if (ca->mi.state == BCH_MEMBER_STATE_rw) { + bch2_dev_allocator_add(c, ca); + enumerated_ref_start(&ca->io_ref[WRITE]); + } bch2_recalc_capacity(c); @@ -571,13 +564,13 @@ int bch2_fs_read_write(struct bch_fs *c) { if (c->opts.recovery_pass_last && c->opts.recovery_pass_last < BCH_RECOVERY_PASS_journal_replay) - return -BCH_ERR_erofs_norecovery; + return bch_err_throw(c, erofs_norecovery); if (c->opts.nochanges) - return -BCH_ERR_erofs_nochanges; + return bch_err_throw(c, erofs_nochanges); if (c->sb.features & BIT_ULL(BCH_FEATURE_no_alloc_info)) - return -BCH_ERR_erofs_no_alloc_info; + return bch_err_throw(c, erofs_no_alloc_info); return __bch2_fs_read_write(c, false); } @@ -762,7 +755,7 @@ static int 
bch2_fs_online(struct bch_fs *c) if (c->sb.multi_device && __bch2_uuid_to_fs(c->sb.uuid)) { bch_err(c, "filesystem UUID already open"); - return -BCH_ERR_filesystem_uuid_already_open; + return bch_err_throw(c, filesystem_uuid_already_open); } ret = bch2_fs_chardev_init(c); @@ -821,7 +814,7 @@ static int bch2_fs_init_rw(struct bch_fs *c) WQ_HIGHPRI|WQ_FREEZABLE|WQ_MEM_RECLAIM, 1)) || !(c->write_ref_wq = alloc_workqueue("bcachefs_write_ref", WQ_FREEZABLE, 0))) - return -BCH_ERR_ENOMEM_fs_other_alloc; + return bch_err_throw(c, ENOMEM_fs_other_alloc); int ret = bch2_fs_btree_interior_update_init(c) ?: bch2_fs_btree_write_buffer_init(c) ?: @@ -1002,7 +995,7 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, mempool_init_kvmalloc_pool(&c->btree_bounce_pool, 1, c->opts.btree_node_size) || mempool_init_kmalloc_pool(&c->large_bkey_pool, 1, 2048)) { - ret = -BCH_ERR_ENOMEM_fs_other_alloc; + ret = bch_err_throw(c, ENOMEM_fs_other_alloc); goto err; } @@ -1038,10 +1031,6 @@ static struct bch_fs *bch2_fs_alloc(struct bch_sb *sb, struct bch_opts *opts, ret = -EINVAL; goto err; } - bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u", - unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), - unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); #else if (c->sb.features & BIT_ULL(BCH_FEATURE_casefolding)) { printk(KERN_ERR "Cannot mount a filesystem with casefolding on a kernel without CONFIG_UNICODE\n"); @@ -1159,8 +1148,15 @@ int bch2_fs_start(struct bch_fs *c) print_mount_opts(c); +#ifdef CONFIG_UNICODE + bch_info(c, "Using encoding defined by superblock: utf8-%u.%u.%u", + unicode_major(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_minor(BCH_FS_DEFAULT_UTF8_ENCODING), + unicode_rev(BCH_FS_DEFAULT_UTF8_ENCODING)); +#endif + if (!bch2_fs_may_start(c)) - return -BCH_ERR_insufficient_devices_to_start; + return bch_err_throw(c, insufficient_devices_to_start); down_write(&c->state_lock); mutex_lock(&c->sb_lock); @@ -1171,7 +1167,7 @@ int bch2_fs_start(struct bch_fs *c) sizeof(struct bch_sb_field_ext) / sizeof(u64))) { mutex_unlock(&c->sb_lock); up_write(&c->state_lock); - ret = -BCH_ERR_ENOSPC_sb; + ret = bch_err_throw(c, ENOSPC_sb); goto err; } @@ -1182,22 +1178,20 @@ int bch2_fs_start(struct bch_fs *c) goto err; } - rcu_read_lock(); - for_each_online_member_rcu(c, ca) - bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = - cpu_to_le64(now); - rcu_read_unlock(); + scoped_guard(rcu) + for_each_online_member_rcu(c, ca) + bch2_members_v2_get_mut(c->disk_sb.sb, ca->dev_idx)->last_mount = + cpu_to_le64(now); /* * Dno't write superblock yet: recovery might have to downgrade */ mutex_unlock(&c->sb_lock); - rcu_read_lock(); - for_each_online_member_rcu(c, ca) - if (ca->mi.state == BCH_MEMBER_STATE_rw) - bch2_dev_allocator_add(c, ca); - rcu_read_unlock(); + scoped_guard(rcu) + for_each_online_member_rcu(c, ca) + if (ca->mi.state == BCH_MEMBER_STATE_rw) + bch2_dev_allocator_add(c, ca); bch2_recalc_capacity(c); up_write(&c->state_lock); @@ -1215,7 +1209,7 @@ int bch2_fs_start(struct bch_fs *c) goto err; if (bch2_fs_init_fault("fs_start")) { - ret = -BCH_ERR_injected_fs_start; + ret = bch_err_throw(c, injected_fs_start); goto err; } @@ -1242,11 +1236,11 @@ static int bch2_dev_may_add(struct bch_sb *sb, struct bch_fs *c) struct bch_member m = bch2_sb_member_get(sb, sb->dev_idx); if (le16_to_cpu(sb->block_size) != block_sectors(c)) - return -BCH_ERR_mismatched_block_size; + return bch_err_throw(c, mismatched_block_size); if 
(le16_to_cpu(m.bucket_size) < BCH_SB_BTREE_NODE_SIZE(c->disk_sb.sb)) - return -BCH_ERR_bucket_size_too_small; + return bch_err_throw(c, bucket_size_too_small); return 0; } @@ -1557,7 +1551,7 @@ static int bch2_dev_alloc(struct bch_fs *c, unsigned dev_idx) bch2_dev_attach(c, ca, dev_idx); return 0; err: - return -BCH_ERR_ENOMEM_dev_alloc; + return bch_err_throw(c, ENOMEM_dev_alloc); } static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) @@ -1567,13 +1561,13 @@ static int __bch2_dev_attach_bdev(struct bch_dev *ca, struct bch_sb_handle *sb) if (bch2_dev_is_online(ca)) { bch_err(ca, "already have device online in slot %u", sb->sb->dev_idx); - return -BCH_ERR_device_already_online; + return bch_err_throw(ca->fs, device_already_online); } if (get_capacity(sb->bdev->bd_disk) < ca->mi.bucket_size * ca->mi.nbuckets) { bch_err(ca, "cannot online: device too small"); - return -BCH_ERR_device_size_too_small; + return bch_err_throw(ca->fs, device_size_too_small); } BUG_ON(!enumerated_ref_is_zero(&ca->io_ref[READ])); @@ -1725,7 +1719,7 @@ int __bch2_dev_set_state(struct bch_fs *c, struct bch_dev *ca, return 0; if (!bch2_dev_state_allowed(c, ca, new_state, flags)) - return -BCH_ERR_device_state_not_allowed; + return bch_err_throw(c, device_state_not_allowed); if (new_state != BCH_MEMBER_STATE_rw) __bch2_dev_read_only(c, ca); @@ -1778,7 +1772,7 @@ int bch2_dev_remove(struct bch_fs *c, struct bch_dev *ca, int flags) if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { bch_err(ca, "Cannot remove without losing data"); - ret = -BCH_ERR_device_state_not_allowed; + ret = bch_err_throw(c, device_state_not_allowed); goto err; } @@ -1914,7 +1908,7 @@ int bch2_dev_add(struct bch_fs *c, const char *path) if (list_empty(&c->list)) { mutex_lock(&bch_fs_list_lock); if (__bch2_uuid_to_fs(c->sb.uuid)) - ret = -BCH_ERR_filesystem_uuid_already_open; + ret = bch_err_throw(c, filesystem_uuid_already_open); else list_add(&c->list, &bch_fs_list); mutex_unlock(&bch_fs_list_lock); @@ -2101,7 +2095,7 @@ int bch2_dev_offline(struct bch_fs *c, struct bch_dev *ca, int flags) if (!bch2_dev_state_allowed(c, ca, BCH_MEMBER_STATE_failed, flags)) { bch_err(ca, "Cannot offline required disk"); up_write(&c->state_lock); - return -BCH_ERR_device_state_not_allowed; + return bch_err_throw(c, device_state_not_allowed); } __bch2_dev_offline(c, ca); @@ -2140,7 +2134,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) if (nbuckets > BCH_MEMBER_NBUCKETS_MAX) { bch_err(ca, "New device size too big (%llu greater than max %u)", nbuckets, BCH_MEMBER_NBUCKETS_MAX); - ret = -BCH_ERR_device_size_too_big; + ret = bch_err_throw(c, device_size_too_big); goto err; } @@ -2148,7 +2142,7 @@ int bch2_dev_resize(struct bch_fs *c, struct bch_dev *ca, u64 nbuckets) get_capacity(ca->disk_sb.bdev->bd_disk) < ca->mi.bucket_size * nbuckets) { bch_err(ca, "New size larger than device"); - ret = -BCH_ERR_device_size_too_small; + ret = bch_err_throw(c, device_size_too_small); goto err; } @@ -2383,7 +2377,7 @@ struct bch_fs *bch2_fs_open(darray_const_str *devices, } if (opts->nochanges && !opts->read_only) { - ret = -BCH_ERR_erofs_nochanges; + ret = bch_err_throw(c, erofs_nochanges); goto err_print; } diff --git a/fs/bcachefs/sysfs.c b/fs/bcachefs/sysfs.c index 1a55196d69f1..05848375cea2 100644 --- a/fs/bcachefs/sysfs.c +++ b/fs/bcachefs/sysfs.c @@ -26,6 +26,7 @@ #include "disk_groups.h" #include "ec.h" #include "enumerated_ref.h" +#include "error.h" #include "inode.h" #include "journal.h" #include 
"journal_reclaim.h" @@ -37,6 +38,7 @@ #include "rebalance.h" #include "recovery_passes.h" #include "replicas.h" +#include "sb-errors.h" #include "super-io.h" #include "tests.h" @@ -143,6 +145,7 @@ do { \ write_attribute(trigger_gc); write_attribute(trigger_discards); write_attribute(trigger_invalidates); +write_attribute(trigger_journal_commit); write_attribute(trigger_journal_flush); write_attribute(trigger_journal_writes); write_attribute(trigger_btree_cache_shrink); @@ -151,6 +154,7 @@ write_attribute(trigger_btree_updates); write_attribute(trigger_freelist_wakeup); write_attribute(trigger_recalc_capacity); write_attribute(trigger_delete_dead_snapshots); +write_attribute(trigger_emergency_read_only); read_attribute(gc_gens_pos); read_attribute(uuid); @@ -172,6 +176,7 @@ read_attribute(btree_write_stats); read_attribute(btree_cache_size); read_attribute(compression_stats); +read_attribute(errors); read_attribute(journal_debug); read_attribute(btree_cache); read_attribute(btree_key_cache); @@ -353,6 +358,9 @@ SHOW(bch2_fs) if (attr == &sysfs_compression_stats) bch2_compression_stats_to_text(out, c); + if (attr == &sysfs_errors) + bch2_fs_errors_to_text(out, c); + if (attr == &sysfs_new_stripes) bch2_new_stripes_to_text(out, c); @@ -428,6 +436,9 @@ STORE(bch2_fs) if (attr == &sysfs_trigger_invalidates) bch2_do_invalidates(c); + if (attr == &sysfs_trigger_journal_commit) + bch2_journal_flush(&c->journal); + if (attr == &sysfs_trigger_journal_flush) { bch2_journal_flush_all_pins(&c->journal); bch2_journal_meta(&c->journal); @@ -448,6 +459,16 @@ STORE(bch2_fs) if (attr == &sysfs_trigger_delete_dead_snapshots) __bch2_delete_dead_snapshots(c); + if (attr == &sysfs_trigger_emergency_read_only) { + struct printbuf buf = PRINTBUF; + bch2_log_msg_start(c, &buf); + + prt_printf(&buf, "shutdown by sysfs\n"); + bch2_fs_emergency_read_only2(c, &buf); + bch2_print_str(c, KERN_ERR, buf.buf); + printbuf_exit(&buf); + } + #ifdef CONFIG_BCACHEFS_TESTS if (attr == &sysfs_perf_test) { char *tmp = kstrdup(buf, GFP_KERNEL), *p = tmp; @@ -483,6 +504,7 @@ struct attribute *bch2_fs_files[] = { &sysfs_recovery_status, &sysfs_compression_stats, + &sysfs_errors, #ifdef CONFIG_BCACHEFS_TESTS &sysfs_perf_test, @@ -571,6 +593,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_gc, &sysfs_trigger_discards, &sysfs_trigger_invalidates, + &sysfs_trigger_journal_commit, &sysfs_trigger_journal_flush, &sysfs_trigger_journal_writes, &sysfs_trigger_btree_cache_shrink, @@ -579,6 +602,7 @@ struct attribute *bch2_fs_internal_files[] = { &sysfs_trigger_freelist_wakeup, &sysfs_trigger_recalc_capacity, &sysfs_trigger_delete_dead_snapshots, + &sysfs_trigger_emergency_read_only, &sysfs_gc_gens_pos, diff --git a/fs/bcachefs/trace.h b/fs/bcachefs/trace.h index 8cb5b40704fd..dc09532796af 100644 --- a/fs/bcachefs/trace.h +++ b/fs/bcachefs/trace.h @@ -199,6 +199,50 @@ DECLARE_EVENT_CLASS(bio, (unsigned long long)__entry->sector, __entry->nr_sector) ); +/* errors */ + +TRACE_EVENT(error_throw, + TP_PROTO(struct bch_fs *c, int bch_err, unsigned long ip), + TP_ARGS(c, bch_err, ip), + + TP_STRUCT__entry( + __field(dev_t, dev ) + __field(int, err ) + __array(char, err_str, 32 ) + __array(char, ip, 32 ) + ), + + TP_fast_assign( + __entry->dev = c->dev; + __entry->err = bch_err; + strscpy(__entry->err_str, bch2_err_str(bch_err), sizeof(__entry->err_str)); + snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip); + ), + + TP_printk("%d,%d %s ret %s", MAJOR(__entry->dev), MINOR(__entry->dev), + __entry->ip, __entry->err_str) 
+); + +TRACE_EVENT(error_downcast, + TP_PROTO(int bch_err, int std_err, unsigned long ip), + TP_ARGS(bch_err, std_err, ip), + + TP_STRUCT__entry( + __array(char, bch_err, 32 ) + __array(char, std_err, 32 ) + __array(char, ip, 32 ) + ), + + TP_fast_assign( + strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err)); + strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err)); + snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip); + ), + + TP_printk("%s ret %s -> %s %s", __entry->ip, + __entry->bch_err, __entry->std_err, __entry->ip) +); + /* disk_accounting.c */ TRACE_EVENT(accounting_mem_insert, @@ -1431,28 +1475,19 @@ DEFINE_EVENT(fs_str, data_update, TP_ARGS(c, str) ); -DEFINE_EVENT(fs_str, io_move_created_rebalance, +DEFINE_EVENT(fs_str, io_move_pred, TP_PROTO(struct bch_fs *c, const char *str), TP_ARGS(c, str) ); -TRACE_EVENT(error_downcast, - TP_PROTO(int bch_err, int std_err, unsigned long ip), - TP_ARGS(bch_err, std_err, ip), - - TP_STRUCT__entry( - __array(char, bch_err, 32 ) - __array(char, std_err, 32 ) - __array(char, ip, 32 ) - ), - - TP_fast_assign( - strscpy(__entry->bch_err, bch2_err_str(bch_err), sizeof(__entry->bch_err)); - strscpy(__entry->std_err, bch2_err_str(std_err), sizeof(__entry->std_err)); - snprintf(__entry->ip, sizeof(__entry->ip), "%ps", (void *) ip); - ), +DEFINE_EVENT(fs_str, io_move_created_rebalance, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) +); - TP_printk("%s -> %s %s", __entry->bch_err, __entry->std_err, __entry->ip) +DEFINE_EVENT(fs_str, io_move_evacuate_bucket, + TP_PROTO(struct bch_fs *c, const char *str), + TP_ARGS(c, str) ); #ifdef CONFIG_BCACHEFS_PATH_TRACEPOINTS diff --git a/fs/bcachefs/util.h b/fs/bcachefs/util.h index 25cf61ebd40c..0a4b1d433621 100644 --- a/fs/bcachefs/util.h +++ b/fs/bcachefs/util.h @@ -17,6 +17,7 @@ #include <linux/random.h> #include <linux/ratelimit.h> #include <linux/slab.h> +#include <linux/sort.h> #include <linux/vmalloc.h> #include <linux/workqueue.h> @@ -672,8 +673,6 @@ static inline void percpu_memset(void __percpu *p, int c, size_t bytes) u64 *bch2_acc_percpu_u64s(u64 __percpu *, unsigned); -#define cmp_int(l, r) ((l > r) - (l < r)) - static inline int u8_cmp(u8 l, u8 r) { return cmp_int(l, r); diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c index e43f6280f954..849199768664 100644 --- a/fs/btrfs/extent_io.c +++ b/fs/btrfs/extent_io.c @@ -2189,7 +2189,6 @@ retry: done = 1; break; } - free_extent_buffer(eb); continue; } diff --git a/fs/cachefiles/io.c b/fs/cachefiles/io.c index 92058ae43488..c08e4a66ac07 100644 --- a/fs/cachefiles/io.c +++ b/fs/cachefiles/io.c @@ -63,7 +63,7 @@ static void cachefiles_read_complete(struct kiocb *iocb, long ret) ret = -ESTALE; } - ki->term_func(ki->term_func_priv, ret, ki->was_async); + ki->term_func(ki->term_func_priv, ret); } cachefiles_put_kiocb(ki); @@ -188,7 +188,7 @@ in_progress: presubmission_error: if (term_func) - term_func(term_func_priv, ret < 0 ? ret : skipped, false); + term_func(term_func_priv, ret < 0 ? 
ret : skipped); return ret; } @@ -271,7 +271,7 @@ static void cachefiles_write_complete(struct kiocb *iocb, long ret) atomic_long_sub(ki->b_writing, &object->volume->cache->b_writing); set_bit(FSCACHE_COOKIE_HAVE_DATA, &object->cookie->flags); if (ki->term_func) - ki->term_func(ki->term_func_priv, ret, ki->was_async); + ki->term_func(ki->term_func_priv, ret); cachefiles_put_kiocb(ki); } @@ -301,7 +301,7 @@ int __cachefiles_write(struct cachefiles_object *object, ki = kzalloc(sizeof(struct cachefiles_kiocb), GFP_KERNEL); if (!ki) { if (term_func) - term_func(term_func_priv, -ENOMEM, false); + term_func(term_func_priv, -ENOMEM); return -ENOMEM; } @@ -366,7 +366,7 @@ static int cachefiles_write(struct netfs_cache_resources *cres, { if (!fscache_wait_for_operation(cres, FSCACHE_WANT_WRITE)) { if (term_func) - term_func(term_func_priv, -ENOBUFS, false); + term_func(term_func_priv, -ENOBUFS); trace_netfs_sreq(term_func_priv, netfs_sreq_trace_cache_nowrite); return -ENOBUFS; } @@ -665,7 +665,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq) pre = CACHEFILES_DIO_BLOCK_SIZE - off; if (pre >= len) { fscache_count_dio_misfit(); - netfs_write_subrequest_terminated(subreq, len, false); + netfs_write_subrequest_terminated(subreq, len); return; } subreq->transferred += pre; @@ -691,7 +691,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq) len -= post; if (len == 0) { fscache_count_dio_misfit(); - netfs_write_subrequest_terminated(subreq, post, false); + netfs_write_subrequest_terminated(subreq, post); return; } iov_iter_truncate(&subreq->io_iter, len); @@ -703,7 +703,7 @@ static void cachefiles_issue_write(struct netfs_io_subrequest *subreq) &start, &len, len, true); cachefiles_end_secure(cache, saved_cred); if (ret < 0) { - netfs_write_subrequest_terminated(subreq, ret, false); + netfs_write_subrequest_terminated(subreq, ret); return; } diff --git a/fs/ceph/addr.c b/fs/ceph/addr.c index 29be367905a1..b95c4cb21c13 100644 --- a/fs/ceph/addr.c +++ b/fs/ceph/addr.c @@ -238,6 +238,7 @@ static void finish_netfs_read(struct ceph_osd_request *req) if (sparse && err > 0) err = ceph_sparse_ext_map_end(op); if (err < subreq->len && + subreq->rreq->origin != NETFS_UNBUFFERED_READ && subreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); if (IS_ENCRYPTED(inode) && err > 0) { @@ -281,7 +282,8 @@ static bool ceph_netfs_issue_op_inline(struct netfs_io_subrequest *subreq) size_t len; int mode; - if (rreq->origin != NETFS_DIO_READ) + if (rreq->origin != NETFS_UNBUFFERED_READ && + rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); __clear_bit(NETFS_SREQ_COPY_TO_CACHE, &subreq->flags); @@ -539,7 +541,7 @@ static void ceph_set_page_fscache(struct page *page) folio_start_private_2(page_folio(page)); /* [DEPRECATED] */ } -static void ceph_fscache_write_terminated(void *priv, ssize_t error, bool was_async) +static void ceph_fscache_write_terminated(void *priv, ssize_t error) { struct inode *inode = priv; diff --git a/fs/configfs/Kconfig b/fs/configfs/Kconfig index 272b64456999..1fcd761fe7be 100644 --- a/fs/configfs/Kconfig +++ b/fs/configfs/Kconfig @@ -1,7 +1,6 @@ # SPDX-License-Identifier: GPL-2.0-only config CONFIGFS_FS tristate "Userspace-driven configuration filesystem" - select SYSFS help configfs is a RAM-based filesystem that provides the converse of sysfs's functionality. 
Where sysfs is a filesystem-based @@ -257,7 +257,7 @@ static void *wait_entry_unlocked_exclusive(struct xa_state *xas, void *entry) wq = dax_entry_waitqueue(xas, entry, &ewait.key); prepare_to_wait_exclusive(wq, &ewait.wait, TASK_UNINTERRUPTIBLE); - xas_pause(xas); + xas_reset(xas); xas_unlock_irq(xas); schedule(); finish_wait(wq, &ewait.wait); @@ -1422,8 +1422,7 @@ static vm_fault_t dax_pmd_load_hole(struct xa_state *xas, struct vm_fault *vmf, pgtable_trans_huge_deposit(vma->vm_mm, vmf->pmd, pgtable); mm_inc_nr_ptes(vma->vm_mm); } - pmd_entry = mk_pmd(&zero_folio->page, vmf->vma->vm_page_prot); - pmd_entry = pmd_mkhuge(pmd_entry); + pmd_entry = folio_mk_pmd(zero_folio, vmf->vma->vm_page_prot); set_pmd_at(vmf->vma->vm_mm, pmd_addr, vmf->pmd, pmd_entry); spin_unlock(ptl); trace_dax_pmd_load_hole(inode, vmf, zero_folio, *entry); diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig index f82a4952769d..b46165df5a91 100644 --- a/fs/dlm/Kconfig +++ b/fs/dlm/Kconfig @@ -3,7 +3,6 @@ menuconfig DLM tristate "Distributed Lock Manager (DLM)" depends on INET depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n) - select IP_SCTP help A general purpose distributed lock manager for kernel or userspace applications. diff --git a/fs/dlm/config.c b/fs/dlm/config.c index cf9ba6fd7a28..a23fd524a6ee 100644 --- a/fs/dlm/config.c +++ b/fs/dlm/config.c @@ -197,6 +197,9 @@ static int dlm_check_protocol_and_dlm_running(unsigned int x) break; case 1: /* SCTP */ + if (!IS_ENABLED(CONFIG_IP_SCTP)) + return -EOPNOTSUPP; + break; default: return -EINVAL; diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c index 70abd4da17a6..e4373bce1bc2 100644 --- a/fs/dlm/lowcomms.c +++ b/fs/dlm/lowcomms.c @@ -160,6 +160,7 @@ struct dlm_proto_ops { bool try_new_addr; const char *name; int proto; + int how; void (*sockopts)(struct socket *sock); int (*bind)(struct socket *sock); @@ -533,7 +534,7 @@ static void lowcomms_state_change(struct sock *sk) /* SCTP layer is not calling sk_data_ready when the connection * is done, so we catch the signal through here. 
*/ - if (sk->sk_shutdown == RCV_SHUTDOWN) + if (sk->sk_shutdown & RCV_SHUTDOWN) lowcomms_data_ready(sk); } @@ -810,7 +811,7 @@ static void shutdown_connection(struct connection *con, bool and_other) return; } - ret = kernel_sock_shutdown(con->sock, SHUT_WR); + ret = kernel_sock_shutdown(con->sock, dlm_proto_ops->how); up_read(&con->sock_lock); if (ret) { log_print("Connection %p failed to shutdown: %d will force close", @@ -1858,6 +1859,7 @@ static int dlm_tcp_listen_bind(struct socket *sock) static const struct dlm_proto_ops dlm_tcp_ops = { .name = "TCP", .proto = IPPROTO_TCP, + .how = SHUT_WR, .sockopts = dlm_tcp_sockopts, .bind = dlm_tcp_bind, .listen_validate = dlm_tcp_listen_validate, @@ -1896,6 +1898,7 @@ static void dlm_sctp_sockopts(struct socket *sock) static const struct dlm_proto_ops dlm_sctp_ops = { .name = "SCTP", .proto = IPPROTO_SCTP, + .how = SHUT_RDWR, .try_new_addr = true, .sockopts = dlm_sctp_sockopts, .bind = dlm_sctp_bind, diff --git a/fs/erofs/fscache.c b/fs/erofs/fscache.c index 9c9129bca346..34517ca9df91 100644 --- a/fs/erofs/fscache.c +++ b/fs/erofs/fscache.c @@ -102,8 +102,7 @@ static void erofs_fscache_req_io_put(struct erofs_fscache_io *io) erofs_fscache_req_put(req); } -static void erofs_fscache_req_end_io(void *priv, - ssize_t transferred_or_error, bool was_async) +static void erofs_fscache_req_end_io(void *priv, ssize_t transferred_or_error) { struct erofs_fscache_io *io = priv; struct erofs_fscache_rq *req = io->private; @@ -180,8 +179,7 @@ struct erofs_fscache_bio { struct bio_vec bvecs[BIO_MAX_VECS]; }; -static void erofs_fscache_bio_endio(void *priv, - ssize_t transferred_or_error, bool was_async) +static void erofs_fscache_bio_endio(void *priv, ssize_t transferred_or_error) { struct erofs_fscache_bio *io = priv; diff --git a/fs/exec.c b/fs/exec.c index cfbb2b9ee3c9..1f5fdd2e096e 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -78,6 +78,9 @@ #include <trace/events/sched.h> +/* For vma exec functions. */ +#include "../mm/internal.h" + static int bprm_creds_from_file(struct linux_binprm *bprm); int suid_dumpable = 0; @@ -182,60 +185,6 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, flush_cache_page(bprm->vma, pos, page_to_pfn(page)); } -static int __bprm_mm_init(struct linux_binprm *bprm) -{ - int err; - struct vm_area_struct *vma = NULL; - struct mm_struct *mm = bprm->mm; - - bprm->vma = vma = vm_area_alloc(mm); - if (!vma) - return -ENOMEM; - vma_set_anonymous(vma); - - if (mmap_write_lock_killable(mm)) { - err = -EINTR; - goto err_free; - } - - /* - * Need to be called with mmap write lock - * held, to avoid race with ksmd. - */ - err = ksm_execve(mm); - if (err) - goto err_ksm; - - /* - * Place the stack at the largest stack address the architecture - * supports. Later, we'll move this to an appropriate place. We don't - * use STACK_TOP because that can depend on attributes which aren't - * configured yet. 
- */ - BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP); - vma->vm_end = STACK_TOP_MAX; - vma->vm_start = vma->vm_end - PAGE_SIZE; - vm_flags_init(vma, VM_SOFTDIRTY | VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP); - vma->vm_page_prot = vm_get_page_prot(vma->vm_flags); - - err = insert_vm_struct(mm, vma); - if (err) - goto err; - - mm->stack_vm = mm->total_vm = 1; - mmap_write_unlock(mm); - bprm->p = vma->vm_end - sizeof(void *); - return 0; -err: - ksm_exit(mm); -err_ksm: - mmap_write_unlock(mm); -err_free: - bprm->vma = NULL; - vm_area_free(vma); - return err; -} - static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= MAX_ARG_STRLEN; @@ -288,12 +237,6 @@ static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos, { } -static int __bprm_mm_init(struct linux_binprm *bprm) -{ - bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); - return 0; -} - static bool valid_arg_len(struct linux_binprm *bprm, long len) { return len <= bprm->p; @@ -322,9 +265,13 @@ static int bprm_mm_init(struct linux_binprm *bprm) bprm->rlim_stack = current->signal->rlim[RLIMIT_STACK]; task_unlock(current->group_leader); - err = __bprm_mm_init(bprm); +#ifndef CONFIG_MMU + bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *); +#else + err = create_init_stack_vma(bprm->mm, &bprm->vma, &bprm->p); if (err) goto err; +#endif return 0; diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c index d47896a89596..1729bf42eb51 100644 --- a/fs/exfat/nls.c +++ b/fs/exfat/nls.c @@ -801,4 +801,5 @@ load_default: void exfat_free_upcase_table(struct exfat_sb_info *sbi) { kvfree(sbi->vol_utbl); + sbi->vol_utbl = NULL; } diff --git a/fs/exfat/super.c b/fs/exfat/super.c index 8465033a6cf0..7ed858937d45 100644 --- a/fs/exfat/super.c +++ b/fs/exfat/super.c @@ -36,31 +36,12 @@ static void exfat_put_super(struct super_block *sb) struct exfat_sb_info *sbi = EXFAT_SB(sb); mutex_lock(&sbi->s_lock); + exfat_clear_volume_dirty(sb); exfat_free_bitmap(sbi); brelse(sbi->boot_bh); mutex_unlock(&sbi->s_lock); } -static int exfat_sync_fs(struct super_block *sb, int wait) -{ - struct exfat_sb_info *sbi = EXFAT_SB(sb); - int err = 0; - - if (unlikely(exfat_forced_shutdown(sb))) - return 0; - - if (!wait) - return 0; - - /* If there are some dirty buffers in the bdev inode */ - mutex_lock(&sbi->s_lock); - sync_blockdev(sb->s_bdev); - if (exfat_clear_volume_dirty(sb)) - err = -EIO; - mutex_unlock(&sbi->s_lock); - return err; -} - static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf) { struct super_block *sb = dentry->d_sb; @@ -219,7 +200,6 @@ static const struct super_operations exfat_sops = { .write_inode = exfat_write_inode, .evict_inode = exfat_evict_inode, .put_super = exfat_put_super, - .sync_fs = exfat_sync_fs, .statfs = exfat_statfs, .show_options = exfat_show_options, .shutdown = exfat_shutdown, @@ -751,10 +731,14 @@ static void exfat_free(struct fs_context *fc) static int exfat_reconfigure(struct fs_context *fc) { + struct super_block *sb = fc->root->d_sb; fc->sb_flags |= SB_NODIRATIME; - /* volume flag will be updated in exfat_sync_fs */ - sync_filesystem(fc->root->d_sb); + sync_filesystem(sb); + mutex_lock(&EXFAT_SB(sb)->s_lock); + exfat_clear_volume_dirty(sb); + mutex_unlock(&EXFAT_SB(sb)->s_lock); + return 0; } diff --git a/fs/ext2/super.c b/fs/ext2/super.c index 28ff47ec4be6..121e634c792a 100644 --- a/fs/ext2/super.c +++ b/fs/ext2/super.c @@ -601,7 +601,8 @@ static int ext2_parse_param(struct fs_context *fc, struct fs_parameter *param) case Opt_dax: #ifdef CONFIG_FS_DAX ext2_msg_fc(fc, 
KERN_WARNING, - "DAX enabled. Warning: EXPERIMENTAL, use at your own risk"); + "DAX enabled. Warning: DAX support in ext2 driver is deprecated" + " and will be removed at the end of 2025. Please use ext4 driver instead."); ctx_set_mount_opt(ctx, EXT2_MOUNT_DAX); #else ext2_msg_fc(fc, KERN_INFO, "dax option not supported"); diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c index a4dbaccee6e7..87760fabdd2e 100644 --- a/fs/ext4/bitmap.c +++ b/fs/ext4/bitmap.c @@ -30,7 +30,7 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb, sz = EXT4_INODES_PER_GROUP(sb) >> 3; provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo); - calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) { hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi); provided |= (hi << 16); @@ -52,7 +52,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb, return; sz = EXT4_INODES_PER_GROUP(sb) >> 3; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16); @@ -71,7 +71,7 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb, return 1; provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo); - calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) { hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi); provided |= (hi << 16); @@ -92,7 +92,7 @@ void ext4_block_bitmap_csum_set(struct super_block *sb, if (!ext4_has_feature_metadata_csum(sb)) return; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz); + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz); gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF); if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16); diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h index 5a20e9cd7184..18373de980f2 100644 --- a/fs/ext4/ext4.h +++ b/fs/ext4/ext4.h @@ -256,9 +256,19 @@ struct ext4_allocation_request { #define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten) #define EXT4_MAP_BOUNDARY BIT(BH_Boundary) #define EXT4_MAP_DELAYED BIT(BH_Delay) +/* + * This is for use in ext4_map_query_blocks() for a special case where we can + * have a physically and logically contiguous blocks split across two leaf + * nodes instead of a single extent. This is required in case of atomic writes + * to know whether the returned extent is last in leaf. If yes, then lookup for + * next in leaf block in ext4_map_query_blocks_next_in_leaf(). + * - This is never going to be added to any buffer head state. + * - We use the next available bit after BH_BITMAP_UPTODATE. 
+ */ +#define EXT4_MAP_QUERY_LAST_IN_LEAF BIT(BH_BITMAP_UPTODATE + 1) #define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\ EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\ - EXT4_MAP_DELAYED) + EXT4_MAP_DELAYED | EXT4_MAP_QUERY_LAST_IN_LEAF) struct ext4_map_blocks { ext4_fsblk_t m_pblk; @@ -706,9 +716,6 @@ enum { #define EXT4_GET_BLOCKS_CONVERT 0x0010 #define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) - /* Convert extent to initialized after IO complete */ -#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\ - EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT) /* Eventual metadata allocation (due to growing extent tree) * should not fail, so try to use reserved blocks for that.*/ #define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020 @@ -720,11 +727,23 @@ enum { #define EXT4_GET_BLOCKS_ZERO 0x0200 #define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\ EXT4_GET_BLOCKS_ZERO) - /* Caller will submit data before dropping transaction handle. This - * allows jbd2 to avoid submitting data before commit. */ + /* Caller is in the context of data submission, such as writeback, + * fsync, etc. Especially, in the generic writeback path, caller will + * submit data before dropping transaction handle. This allows jbd2 + * to avoid submitting data before commit. */ #define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400 + /* Convert extent to initialized after IO complete */ +#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT |\ + EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |\ + EXT4_GET_BLOCKS_IO_SUBMIT) /* Caller is in the atomic contex, find extent if it has been cached */ #define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800 +/* + * Atomic write caller needs this to query in the slow path of mixed mapping + * case, when a contiguous extent can be split across two adjacent leaf nodes. + * Look EXT4_MAP_QUERY_LAST_IN_LEAF. + */ +#define EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF 0x1000 /* * The bit position of these flags must not overlap with any of the @@ -738,6 +757,13 @@ enum { #define EXT4_EX_NOCACHE 0x40000000 #define EXT4_EX_FORCE_CACHE 0x20000000 #define EXT4_EX_NOFAIL 0x10000000 +/* + * ext4_map_query_blocks() uses this filter mask to filter the flags needed to + * pass while lookup/querying of on disk extent tree. + */ +#define EXT4_EX_QUERY_FILTER (EXT4_EX_NOCACHE | EXT4_EX_FORCE_CACHE |\ + EXT4_EX_NOFAIL |\ + EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) /* * Flags used by ext4_free_blocks @@ -1061,16 +1087,16 @@ struct ext4_inode_info { /* End of lblk range that needs to be committed in this fast commit */ ext4_lblk_t i_fc_lblk_len; - /* Number of ongoing updates on this inode */ - atomic_t i_fc_updates; - spinlock_t i_raw_lock; /* protects updates to the raw inode */ /* Fast commit wait queue for this inode */ wait_queue_head_t i_fc_wait; - /* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */ - struct mutex i_fc_lock; + /* + * Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len + * and inode's EXT4_FC_STATE_COMMITTING state bit. + */ + spinlock_t i_fc_lock; /* * i_disksize keeps track of what the inode size is ON DISK, not @@ -1754,7 +1780,7 @@ struct ext4_sb_info { * following fields: * ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh. 
*/ - spinlock_t s_fc_lock; + struct mutex s_fc_lock; struct buffer_head *s_fc_bh; struct ext4_fc_stats s_fc_stats; tid_t s_fc_ineligible_tid; @@ -1913,6 +1939,7 @@ enum { EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */ EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */ EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */ + EXT4_STATE_FC_FLUSHING_DATA, /* Fast commit flushing data */ EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */ }; @@ -2295,10 +2322,12 @@ static inline int ext4_emergency_state(struct super_block *sb) #define EXT4_DEFM_NODELALLOC 0x0800 /* - * Default journal batch times + * Default journal batch times and ioprio. */ #define EXT4_DEF_MIN_BATCH_TIME 0 #define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */ +#define EXT4_DEF_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) + /* * Default values for superblock update @@ -2487,8 +2516,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize) #define DX_HASH_SIPHASH 6 #define DX_HASH_LAST DX_HASH_SIPHASH -static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc, - const void *address, unsigned int length) +static inline u32 ext4_chksum(u32 crc, const void *address, unsigned int length) { return crc32c(crc, address, length); } @@ -2922,8 +2950,6 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode, void ext4_fc_track_create(handle_t *handle, struct dentry *dentry); void ext4_fc_track_inode(handle_t *handle, struct inode *inode); void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle); -void ext4_fc_start_update(struct inode *inode); -void ext4_fc_stop_update(struct inode *inode); void ext4_fc_del(struct inode *inode); bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block); void ext4_fc_replay_cleanup(struct super_block *sb); @@ -2973,6 +2999,7 @@ static inline bool ext4_mb_cr_expensive(enum criteria cr) void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei); int ext4_inode_is_fast_symlink(struct inode *inode); +void ext4_check_map_extents_env(struct inode *inode); struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int); struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int); int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count, @@ -2993,6 +3020,7 @@ int ext4_walk_page_buffers(handle_t *handle, struct buffer_head *bh)); int do_journal_get_write_access(handle_t *handle, struct inode *inode, struct buffer_head *bh); +bool ext4_should_enable_large_folio(struct inode *inode); #define FALL_BACK_TO_NONDELALLOC 1 #define CONVERT_INLINE_DATA 2 @@ -3039,6 +3067,8 @@ extern void ext4_set_aops(struct inode *inode); extern int ext4_writepage_trans_blocks(struct inode *); extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode); extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks); +extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks, + int pextents); extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode, loff_t lstart, loff_t lend); extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf); @@ -3050,6 +3080,17 @@ extern void ext4_da_update_reserve_space(struct inode *inode, extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, ext4_lblk_t len); +static inline bool is_special_ino(struct super_block *sb, unsigned long ino) +{ + struct ext4_super_block *es = EXT4_SB(sb)->s_es; + + return (ino < 
EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || + ino == le32_to_cpu(es->s_usr_quota_inum) || + ino == le32_to_cpu(es->s_grp_quota_inum) || + ino == le32_to_cpu(es->s_prj_quota_inum) || + ino == le32_to_cpu(es->s_orphan_file_inum); +} + /* indirect.c */ extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode, struct ext4_map_blocks *map, int flags); @@ -3119,8 +3160,7 @@ extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wa extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block); extern int ext4_seq_options_show(struct seq_file *seq, void *offset); extern int ext4_calculate_overhead(struct super_block *sb); -extern __le32 ext4_superblock_csum(struct super_block *sb, - struct ext4_super_block *es); +extern __le32 ext4_superblock_csum(struct ext4_super_block *es); extern void ext4_superblock_csum_set(struct super_block *sb); extern int ext4_alloc_flex_bg_array(struct super_block *sb, ext4_group_t ngroup); @@ -3378,6 +3418,13 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi) return 1 << sbi->s_log_groups_per_flex; } +static inline loff_t ext4_get_maxbytes(struct inode *inode) +{ + if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + return inode->i_sb->s_maxbytes; + return EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; +} + #define ext4_std_error(sb, errno) \ do { \ if ((errno)) \ @@ -3710,6 +3757,8 @@ extern long ext4_fallocate(struct file *file, int mode, loff_t offset, loff_t len); extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, loff_t offset, ssize_t len); +extern int ext4_convert_unwritten_extents_atomic(handle_t *handle, + struct inode *inode, loff_t offset, ssize_t len); extern int ext4_convert_unwritten_io_end_vec(handle_t *handle, ext4_io_end_t *io_end); extern int ext4_map_blocks(handle_t *handle, struct inode *inode, @@ -3847,7 +3896,9 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh) static inline bool ext4_inode_can_atomic_write(struct inode *inode) { - return S_ISREG(inode->i_mode) && EXT4_SB(inode->i_sb)->s_awu_min > 0; + return S_ISREG(inode->i_mode) && + ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) && + EXT4_SB(inode->i_sb)->s_awu_min > 0; } extern int ext4_block_write_begin(handle_t *handle, struct folio *folio, diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c index 135e278c832e..b3e9b7bd7978 100644 --- a/fs/ext4/ext4_jbd2.c +++ b/fs/ext4/ext4_jbd2.c @@ -16,7 +16,8 @@ int ext4_inode_journal_mode(struct inode *inode) ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || (ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) && - !test_opt(inode->i_sb, DELALLOC))) { + !test_opt(inode->i_sb, DELALLOC) && + !mapping_large_folio_support(inode->i_mapping))) { /* We do not support data journalling for encrypted data */ if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode)) return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */ diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h index 3221714d9901..63d17c5201b5 100644 --- a/fs/ext4/ext4_jbd2.h +++ b/fs/ext4/ext4_jbd2.h @@ -319,10 +319,10 @@ static inline int ext4_journal_ensure_credits(handle_t *handle, int credits, revoke_creds, 0); } -static inline int ext4_journal_blocks_per_page(struct inode *inode) +static inline int ext4_journal_blocks_per_folio(struct inode *inode) { if (EXT4_JOURNAL(inode) != NULL) - return jbd2_journal_blocks_per_page(inode); + return jbd2_journal_blocks_per_folio(inode); return 0; } diff --git 
a/fs/ext4/extents.c b/fs/ext4/extents.c index c616a16a9f36..b543a46fc809 100644 --- a/fs/ext4/extents.c +++ b/fs/ext4/extents.c @@ -50,10 +50,9 @@ static __le32 ext4_extent_block_csum(struct inode *inode, struct ext4_extent_header *eh) { struct ext4_inode_info *ei = EXT4_I(inode); - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh, + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)eh, EXT4_EXTENT_TAIL_OFFSET(eh)); return cpu_to_le32(csum); } @@ -611,6 +610,8 @@ int ext4_ext_precache(struct inode *inode) if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) return 0; /* not an extent-mapped inode */ + ext4_check_map_extents_env(inode); + down_read(&ei->i_data_sem); depth = ext_depth(inode); @@ -1530,7 +1531,7 @@ static int ext4_ext_search_left(struct inode *inode, static int ext4_ext_search_right(struct inode *inode, struct ext4_ext_path *path, ext4_lblk_t *logical, ext4_fsblk_t *phys, - struct ext4_extent *ret_ex) + struct ext4_extent *ret_ex, int flags) { struct buffer_head *bh = NULL; struct ext4_extent_header *eh; @@ -1604,7 +1605,8 @@ got_index: ix++; while (++depth < path->p_depth) { /* subtract from p_depth to get proper eh_depth */ - bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0); + bh = read_extent_tree_block(inode, ix, path->p_depth - depth, + flags); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); @@ -1612,7 +1614,7 @@ got_index: put_bh(bh); } - bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0); + bh = read_extent_tree_block(inode, ix, path->p_depth - depth, flags); if (IS_ERR(bh)) return PTR_ERR(bh); eh = ext_block_hdr(bh); @@ -2396,18 +2398,20 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks, int ext4_ext_index_trans_blocks(struct inode *inode, int extents) { int index; - int depth; /* If we are converting the inline data, only one is needed here. */ if (ext4_has_inline_data(inode)) return 1; - depth = ext_depth(inode); - + /* + * Extent tree can change between the time we estimate credits and + * the time we actually modify the tree. Assume the worst case. + */ if (extents <= 1) - index = depth * 2; + index = (EXT4_MAX_EXTENT_DEPTH * 2) + extents; else - index = depth * 3; + index = (EXT4_MAX_EXTENT_DEPTH * 3) + + DIV_ROUND_UP(extents, ext4_ext_space_block(inode, 0)); return index; } @@ -2821,6 +2825,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start, struct partial_cluster partial; handle_t *handle; int i = 0, err = 0; + int flags = EXT4_EX_NOCACHE | EXT4_EX_NOFAIL; partial.pclu = 0; partial.lblk = 0; @@ -2851,8 +2856,7 @@ again: ext4_fsblk_t pblk; /* find extent for or closest extent to this block */ - path = ext4_find_extent(inode, end, NULL, - EXT4_EX_NOCACHE | EXT4_EX_NOFAIL); + path = ext4_find_extent(inode, end, NULL, flags); if (IS_ERR(path)) { ext4_journal_stop(handle); return PTR_ERR(path); @@ -2918,7 +2922,7 @@ again: */ lblk = ex_end + 1; err = ext4_ext_search_right(inode, path, &lblk, &pblk, - NULL); + NULL, flags); if (err < 0) goto out; if (pblk) { @@ -2994,8 +2998,7 @@ again: i + 1, ext4_idx_pblock(path[i].p_idx)); memset(path + i + 1, 0, sizeof(*path)); bh = read_extent_tree_block(inode, path[i].p_idx, - depth - i - 1, - EXT4_EX_NOCACHE); + depth - i - 1, flags); if (IS_ERR(bh)) { /* should we reset i_size? 
*/ err = PTR_ERR(bh); @@ -4202,7 +4205,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags); /* find extent for this block */ - path = ext4_find_extent(inode, map->m_lblk, NULL, 0); + path = ext4_find_extent(inode, map->m_lblk, NULL, flags); if (IS_ERR(path)) { err = PTR_ERR(path); goto out; @@ -4314,7 +4317,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode, if (err) goto out; ar.lright = map->m_lblk; - err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2); + err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, + &ex2, flags); if (err < 0) goto out; @@ -4433,6 +4437,20 @@ got_allocated_blocks: allocated = map->m_len; ext4_ext_show_leaf(inode, path); out: + /* + * We never use EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF with CREATE flag. + * So we know that the depth used here is correct, since there was no + * block allocation done if EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF is set. + * If tomorrow we start using this QUERY flag with CREATE, then we will + * need to re-calculate the depth as it might have changed due to block + * allocation. + */ + if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) { + WARN_ON_ONCE(flags & EXT4_GET_BLOCKS_CREATE); + if (!err && ex && (ex == EXT_LAST_EXTENT(path[depth].p_hdr))) + map->m_flags |= EXT4_MAP_QUERY_LAST_IN_LEAF; + } + ext4_free_ext_path(path); trace_ext4_ext_map_blocks_exit(inode, flags, map, @@ -4781,6 +4799,93 @@ out_inode_lock: } /* + * This function converts a range of blocks to written extents. The caller of + * this function will pass the start offset and the size. all unwritten extents + * within this range will be converted to written extents. + * + * This function is called from the direct IO end io call back function for + * atomic writes, to convert the unwritten extents after IO is completed. + * + * Note that the requirement for atomic writes is that all conversion should + * happen atomically in a single fs journal transaction. We mainly only allocate + * unwritten extents either on a hole on a pre-exiting unwritten extent range in + * ext4_map_blocks_atomic_write(). The only case where we can have multiple + * unwritten extents in a range [offset, offset+len) is when there is a split + * unwritten extent between two leaf nodes which was cached in extent status + * cache during ext4_iomap_alloc() time. That will allow + * ext4_map_blocks_atomic_write() to return the unwritten extent range w/o going + * into the slow path. That means we might need a loop for conversion of this + * unwritten extent split across leaf block within a single journal transaction. + * Split extents across leaf nodes is a rare case, but let's still handle that + * to meet the requirements of multi-fsblock atomic writes. + * + * Returns 0 on success. + */ +int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode, + loff_t offset, ssize_t len) +{ + unsigned int max_blocks; + int ret = 0, ret2 = 0, ret3 = 0; + struct ext4_map_blocks map; + unsigned int blkbits = inode->i_blkbits; + unsigned int credits = 0; + int flags = EXT4_GET_BLOCKS_IO_CONVERT_EXT | EXT4_EX_NOCACHE; + + map.m_lblk = offset >> blkbits; + max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits); + + if (!handle) { + /* + * TODO: An optimization can be added later by having an extent + * status flag e.g. EXTENT_STATUS_SPLIT_LEAF. If we query that + * it can tell if the extent in the cache is a split extent. + * But for now let's assume pextents as 2 always. 
+ */ + credits = ext4_meta_trans_blocks(inode, max_blocks, 2); + } + + if (credits) { + handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + return ret; + } + } + + while (ret >= 0 && ret < max_blocks) { + map.m_lblk += ret; + map.m_len = (max_blocks -= ret); + ret = ext4_map_blocks(handle, inode, &map, flags); + if (ret != max_blocks) + ext4_msg(inode->i_sb, KERN_INFO, + "inode #%lu: block %u: len %u: " + "split block mapping found for atomic write, " + "ret = %d", + inode->i_ino, map.m_lblk, + map.m_len, ret); + if (ret <= 0) + break; + } + + ret2 = ext4_mark_inode_dirty(handle, inode); + + if (credits) { + ret3 = ext4_journal_stop(handle); + if (unlikely(ret3)) + ret2 = ret3; + } + + if (ret <= 0 || ret2) + ext4_warning(inode->i_sb, + "inode #%lu: block %u: len %u: " + "returned %d or %d", + inode->i_ino, map.m_lblk, + map.m_len, ret, ret2); + + return ret > 0 ? ret2 : ret; +} + +/* * This function convert a range of blocks to written extents * The caller of this function will pass the start offset and the size. * all unwritten extents within this range will be converted to @@ -4819,8 +4924,14 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode, break; } } + /* + * Do not cache any unrelated extents, as it does not hold the + * i_rwsem or invalidate_lock, which could corrupt the extent + * status tree. + */ ret = ext4_map_blocks(handle, inode, &map, - EXT4_GET_BLOCKS_IO_CONVERT_EXT); + EXT4_GET_BLOCKS_IO_CONVERT_EXT | + EXT4_EX_NOCACHE); if (ret <= 0) ext4_warning(inode->i_sb, "inode #%lu: block %u: len %u: " @@ -4931,12 +5042,7 @@ static const struct iomap_ops ext4_iomap_xattr_ops = { static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len) { - u64 maxbytes; - - if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - maxbytes = inode->i_sb->s_maxbytes; - else - maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; + u64 maxbytes = ext4_get_maxbytes(inode); if (*len == 0) return -EINVAL; @@ -4956,10 +5062,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, { int error = 0; + inode_lock_shared(inode); if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { error = ext4_ext_precache(inode); if (error) - return error; + goto unlock; fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; } @@ -4970,15 +5077,19 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, */ error = ext4_fiemap_check_ranges(inode, start, &len); if (error) - return error; + goto unlock; if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) { fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR; - return iomap_fiemap(inode, fieinfo, start, len, - &ext4_iomap_xattr_ops); + error = iomap_fiemap(inode, fieinfo, start, len, + &ext4_iomap_xattr_ops); + } else { + error = iomap_fiemap(inode, fieinfo, start, len, + &ext4_iomap_report_ops); } - - return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops); +unlock: + inode_unlock_shared(inode); + return error; } int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -4999,7 +5110,9 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo, } if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) { + inode_lock_shared(inode); error = ext4_ext_precache(inode); + inode_unlock_shared(inode); if (error) return error; fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE; @@ -5328,6 +5441,8 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len) start_lblk = offset >> inode->i_blkbits; end_lblk = (offset + len) >> 
inode->i_blkbits; + ext4_check_map_extents_env(inode); + down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk); @@ -5429,6 +5544,8 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len) start_lblk = offset >> inode->i_blkbits; len_lblk = len >> inode->i_blkbits; + ext4_check_map_extents_env(inode); + down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c index d1401d4a5513..31dc0496f8d0 100644 --- a/fs/ext4/extents_status.c +++ b/fs/ext4/extents_status.c @@ -120,9 +120,40 @@ * memory. Hence, we will reclaim written/unwritten/hole extents from * the tree under a heavy memory pressure. * + * ========================================================================== + * 3. Assurance of Ext4 extent status tree consistency + * + * When mapping blocks, Ext4 queries the extent status tree first and should + * always trusts that the extent status tree is consistent and up to date. + * Therefore, it is important to adheres to the following rules when createing, + * modifying and removing extents. + * + * 1. Besides fastcommit replay, when Ext4 creates or queries block mappings, + * the extent information should always be processed through the extent + * status tree instead of being organized manually through the on-disk + * extent tree. + * + * 2. When updating the extent tree, Ext4 should acquire the i_data_sem + * exclusively and update the extent status tree atomically. If the extents + * to be modified are large enough to exceed the range that a single + * i_data_sem can process (as ext4_datasem_ensure_credits() may drop + * i_data_sem to restart a transaction), it must (e.g. as ext4_punch_hole() + * does): + * + * a) Hold the i_rwsem and invalidate_lock exclusively. This ensures + * exclusion against page faults, as well as reads and writes that may + * concurrently modify the extent status tree. + * b) Evict all page cache in the affected range and recommend rebuilding + * or dropping the extent status tree after modifying the on-disk + * extent tree. This ensures exclusion against concurrent writebacks + * that do not hold those locks but only holds a folio lock. + * + * 3. Based on the rules above, when querying block mappings, Ext4 should at + * least hold the i_rwsem or invalidate_lock or folio lock(s) for the + * specified querying range. * * ========================================================================== - * 3. Performance analysis + * 4. Performance analysis * * -- overhead * 1. There is a cache extent for write access, so if writes are @@ -134,7 +165,7 @@ * * * ========================================================================== - * 4. TODO list + * 5. TODO list * * -- Refactor delayed space reservation * diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c index da4263a14a20..42bee1d4f9f9 100644 --- a/fs/ext4/fast_commit.c +++ b/fs/ext4/fast_commit.c @@ -12,6 +12,7 @@ #include "ext4_extents.h" #include "mballoc.h" +#include <linux/lockdep.h> /* * Ext4 Fast Commits * ----------------- @@ -49,19 +50,27 @@ * that need to be committed during a fast commit in another in memory queue of * inodes. 
During the commit operation, we commit in the following order: * - * [1] Lock inodes for any further data updates by setting COMMITTING state - * [2] Submit data buffers of all the inodes - * [3] Wait for [2] to complete - * [4] Commit all the directory entry updates in the fast commit space - * [5] Commit all the changed inode structures - * [6] Write tail tag (this tag ensures the atomicity, please read the following + * [1] Prepare all the inodes to write out their data by setting + * "EXT4_STATE_FC_FLUSHING_DATA". This ensures that inode cannot be + * deleted while it is being flushed. + * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA" + * state. + * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that + * all the exsiting handles finish and no new handles can start. + * [4] Mark all the fast commit eligible inodes as undergoing fast commit + * by setting "EXT4_STATE_FC_COMMITTING" state. + * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows + * starting of new handles. If new handles try to start an update on + * any of the inodes that are being committed, ext4_fc_track_inode() + * will block until those inodes have finished the fast commit. + * [6] Commit all the directory entry updates in the fast commit space. + * [7] Commit all the changed inodes in the fast commit space and clear + * "EXT4_STATE_FC_COMMITTING" for these inodes. + * [8] Write tail tag (this tag ensures the atomicity, please read the following * section for more details). - * [7] Wait for [4], [5] and [6] to complete. * - * All the inode updates must call ext4_fc_start_update() before starting an - * update. If such an ongoing update is present, fast commit waits for it to - * complete. The completion of such an update is marked by - * ext4_fc_stop_update(). + * All the inode updates must be enclosed within jbd2_jounrnal_start() + * and jbd2_journal_stop() similar to JBD2 journaling. * * Fast Commit Ineligibility * ------------------------- @@ -142,6 +151,13 @@ * similarly. Thus, by converting a non-idempotent procedure into a series of * idempotent outcomes, fast commits ensured idempotence during the replay. * + * Locking + * ------- + * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit + * dentry queue. ei->i_fc_lock protects the fast commit related info in a given + * inode. Most of the code avoids acquiring both the locks, but if one must do + * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock. + * * TODOs * ----- * @@ -156,13 +172,12 @@ * fast commit recovery even if that area is invalidated by later full * commits. * - * 1) Fast commit's commit path locks the entire file system during fast - * commit. This has significant performance penalty. Instead of that, we - * should use ext4_fc_start/stop_update functions to start inode level - * updates from ext4_journal_start/stop. Once we do that we can drop file - * system locking during commit path. + * 1) Handle more ineligible cases. * - * 2) Handle more ineligible cases. + * 2) Change ext4_fc_commit() to lookup logical to physical mapping using extent + * status tree. This would get rid of the need to call ext4_fc_track_inode() + * before acquiring i_data_sem. To do that we would need to ensure that + * modified extents from the extent status tree are not evicted from memory. 
*/ #include <trace/events/ext4.h> @@ -201,32 +216,6 @@ void ext4_fc_init_inode(struct inode *inode) INIT_LIST_HEAD(&ei->i_fc_list); INIT_LIST_HEAD(&ei->i_fc_dilist); init_waitqueue_head(&ei->i_fc_wait); - atomic_set(&ei->i_fc_updates, 0); -} - -/* This function must be called with sbi->s_fc_lock held. */ -static void ext4_fc_wait_committing_inode(struct inode *inode) -__releases(&EXT4_SB(inode->i_sb)->s_fc_lock) -{ - wait_queue_head_t *wq; - struct ext4_inode_info *ei = EXT4_I(inode); - -#if (BITS_PER_LONG < 64) - DEFINE_WAIT_BIT(wait, &ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_state_flags, - EXT4_STATE_FC_COMMITTING); -#else - DEFINE_WAIT_BIT(wait, &ei->i_flags, - EXT4_STATE_FC_COMMITTING); - wq = bit_waitqueue(&ei->i_flags, - EXT4_STATE_FC_COMMITTING); -#endif - lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock); - prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); - schedule(); - finish_wait(wq, &wait.wq_entry); } static bool ext4_fc_disabled(struct super_block *sb) @@ -236,48 +225,6 @@ static bool ext4_fc_disabled(struct super_block *sb) } /* - * Inform Ext4's fast about start of an inode update - * - * This function is called by the high level call VFS callbacks before - * performing any inode update. This function blocks if there's an ongoing - * fast commit on the inode in question. - */ -void ext4_fc_start_update(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - - if (ext4_fc_disabled(inode->i_sb)) - return; - -restart: - spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock); - if (list_empty(&ei->i_fc_list)) - goto out; - - if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { - ext4_fc_wait_committing_inode(inode); - goto restart; - } -out: - atomic_inc(&ei->i_fc_updates); - spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock); -} - -/* - * Stop inode update and wake up waiting fast commits if any. - */ -void ext4_fc_stop_update(struct inode *inode) -{ - struct ext4_inode_info *ei = EXT4_I(inode); - - if (ext4_fc_disabled(inode->i_sb)) - return; - - if (atomic_dec_and_test(&ei->i_fc_updates)) - wake_up_all(&ei->i_fc_wait); -} - -/* * Remove inode from fast commit list. If the inode is being committed * we wait until inode commit is done. */ @@ -286,31 +233,62 @@ void ext4_fc_del(struct inode *inode) struct ext4_inode_info *ei = EXT4_I(inode); struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_fc_dentry_update *fc_dentry; + wait_queue_head_t *wq; if (ext4_fc_disabled(inode->i_sb)) return; -restart: - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) { - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); return; } - if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { - ext4_fc_wait_committing_inode(inode); - goto restart; + /* + * Since ext4_fc_del is called from ext4_evict_inode while having a + * handle open, there is no need for us to wait here even if a fast + * commit is going on. That is because, if this inode is being + * committed, ext4_mark_inode_dirty would have waited for inode commit + * operation to finish before we come here. So, by the time we come + * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So, + * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode + * here. + * + * We may come here without any handles open in the "no_delete" case of + * ext4_evict_inode as well. 
However, if that happens, we first mark the + * file system as fast commit ineligible anyway. So, even in that case, + * it is okay to remove the inode from the fc list. + */ + WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING) + && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)); + while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { +#if (BITS_PER_LONG < 64) + DEFINE_WAIT_BIT(wait, &ei->i_state_flags, + EXT4_STATE_FC_FLUSHING_DATA); + wq = bit_waitqueue(&ei->i_state_flags, + EXT4_STATE_FC_FLUSHING_DATA); +#else + DEFINE_WAIT_BIT(wait, &ei->i_flags, + EXT4_STATE_FC_FLUSHING_DATA); + wq = bit_waitqueue(&ei->i_flags, + EXT4_STATE_FC_FLUSHING_DATA); +#endif + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) { + mutex_unlock(&sbi->s_fc_lock); + schedule(); + mutex_lock(&sbi->s_fc_lock); + } + finish_wait(wq, &wait.wq_entry); } - - if (!list_empty(&ei->i_fc_list)) - list_del_init(&ei->i_fc_list); + list_del_init(&ei->i_fc_list); /* * Since this inode is getting removed, let's also remove all FC * dentry create references, since it is not needed to log it anyways. */ if (list_empty(&ei->i_fc_dilist)) { - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); return; } @@ -320,12 +298,10 @@ restart: list_del_init(&fc_dentry->fcd_dilist); WARN_ON(!list_empty(&ei->i_fc_dilist)); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); - - return; } /* @@ -353,12 +329,12 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl has_transaction = false; read_unlock(&sbi->s_journal->j_state_lock); } - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid))) sbi->s_fc_ineligible_tid = tid; ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); WARN_ON(reason >= EXT4_FC_REASON_MAX); sbi->s_fc_stats.fc_ineligible_reason_count[reason]++; } @@ -385,7 +361,7 @@ static int ext4_fc_track_template( int ret; tid = handle->h_transaction->t_tid; - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); if (tid == ei->i_sync_tid) { update = true; } else { @@ -393,19 +369,18 @@ static int ext4_fc_track_template( ei->i_sync_tid = tid; } ret = __fc_track_fn(handle, inode, args, update); - mutex_unlock(&ei->i_fc_lock); - + spin_unlock(&ei->i_fc_lock); if (!enqueue) return ret; - spin_lock(&sbi->s_fc_lock); + mutex_lock(&sbi->s_fc_lock); if (list_empty(&EXT4_I(inode)->i_fc_list)) list_add_tail(&EXT4_I(inode)->i_fc_list, (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ? 
&sbi->s_fc_q[FC_Q_STAGING] : &sbi->s_fc_q[FC_Q_MAIN]); - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); return ret; } @@ -428,19 +403,19 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, struct super_block *sb = inode->i_sb; struct ext4_sb_info *sbi = EXT4_SB(sb); - mutex_unlock(&ei->i_fc_lock); + spin_unlock(&ei->i_fc_lock); if (IS_ENCRYPTED(dir)) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME, handle); - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); return -EOPNOTSUPP; } node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS); if (!node) { ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle); - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); return -ENOMEM; } @@ -449,7 +424,8 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, node->fcd_ino = inode->i_ino; take_dentry_name_snapshot(&node->fcd_name, dentry); INIT_LIST_HEAD(&node->fcd_dilist); - spin_lock(&sbi->s_fc_lock); + INIT_LIST_HEAD(&node->fcd_list); + mutex_lock(&sbi->s_fc_lock); if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING || sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) list_add_tail(&node->fcd_list, @@ -470,8 +446,8 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode, WARN_ON(!list_empty(&ei->i_fc_dilist)); list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist); } - spin_unlock(&sbi->s_fc_lock); - mutex_lock(&ei->i_fc_lock); + mutex_unlock(&sbi->s_fc_lock); + spin_lock(&ei->i_fc_lock); return 0; } @@ -571,6 +547,8 @@ static int __track_inode(handle_t *handle, struct inode *inode, void *arg, void ext4_fc_track_inode(handle_t *handle, struct inode *inode) { + struct ext4_inode_info *ei = EXT4_I(inode); + wait_queue_head_t *wq; int ret; if (S_ISDIR(inode->i_mode)) @@ -588,6 +566,35 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode) if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE)) return; + /* + * If we come here, we may sleep while waiting for the inode to + * commit. We shouldn't be holding i_data_sem when we go to sleep since + * the commit path needs to grab the lock while committing the inode. + */ + lockdep_assert_not_held(&ei->i_data_sem); + + while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) { +#if (BITS_PER_LONG < 64) + DEFINE_WAIT_BIT(wait, &ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_state_flags, + EXT4_STATE_FC_COMMITTING); +#else + DEFINE_WAIT_BIT(wait, &ei->i_flags, + EXT4_STATE_FC_COMMITTING); + wq = bit_waitqueue(&ei->i_flags, + EXT4_STATE_FC_COMMITTING); +#endif + prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE); + if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) + schedule(); + finish_wait(wq, &wait.wq_entry); + } + + /* + * From this point on, this inode will not be committed either + * by fast or full commit as long as the handle is open. 
+ */ ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1); trace_ext4_fc_track_inode(handle, inode, ret); } @@ -727,7 +734,7 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc) tl.fc_len = cpu_to_le16(remaining); memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN); memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining); - *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize); + *crc = ext4_chksum(*crc, sbi->s_fc_bh->b_data, bsize); ext4_fc_submit_bh(sb, false); @@ -774,7 +781,7 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc) tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid); memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid)); dst += sizeof(tail.fc_tid); - crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data, + crc = ext4_chksum(crc, sbi->s_fc_bh->b_data, dst - (u8 *)sbi->s_fc_bh->b_data); tail.fc_crc = cpu_to_le32(crc); memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc)); @@ -893,15 +900,15 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) struct ext4_extent *ex; int ret; - mutex_lock(&ei->i_fc_lock); + spin_lock(&ei->i_fc_lock); if (ei->i_fc_lblk_len == 0) { - mutex_unlock(&ei->i_fc_lock); + spin_unlock(&ei->i_fc_lock); return 0; } old_blk_size = ei->i_fc_lblk_start; new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1; ei->i_fc_lblk_len = 0; - mutex_unlock(&ei->i_fc_lock); + spin_unlock(&ei->i_fc_lock); cur_lblk_off = old_blk_size; ext4_debug("will try writing %d to %d for inode %ld\n", @@ -910,7 +917,9 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) while (cur_lblk_off <= new_blk_size) { map.m_lblk = cur_lblk_off; map.m_len = new_blk_size - cur_lblk_off + 1; - ret = ext4_map_blocks(NULL, inode, &map, 0); + ret = ext4_map_blocks(NULL, inode, &map, + EXT4_GET_BLOCKS_IO_SUBMIT | + EXT4_EX_NOCACHE); if (ret < 0) return -ECANCELED; @@ -954,69 +963,31 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc) } -/* Submit data for all the fast commit inodes */ -static int ext4_fc_submit_inode_data_all(journal_t *journal) +/* Flushes data of all the inodes in the commit queue. 
*/ +static int ext4_fc_flush_data(journal_t *journal) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); struct ext4_inode_info *ei; int ret = 0; - spin_lock(&sbi->s_fc_lock); list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { - ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); - while (atomic_read(&ei->i_fc_updates)) { - DEFINE_WAIT(wait); - - prepare_to_wait(&ei->i_fc_wait, &wait, - TASK_UNINTERRUPTIBLE); - if (atomic_read(&ei->i_fc_updates)) { - spin_unlock(&sbi->s_fc_lock); - schedule(); - spin_lock(&sbi->s_fc_lock); - } - finish_wait(&ei->i_fc_wait, &wait); - } - spin_unlock(&sbi->s_fc_lock); ret = jbd2_submit_inode_data(journal, ei->jinode); if (ret) return ret; - spin_lock(&sbi->s_fc_lock); } - spin_unlock(&sbi->s_fc_lock); - - return ret; -} - -/* Wait for completion of data for all the fast commit inodes */ -static int ext4_fc_wait_inode_data_all(journal_t *journal) -{ - struct super_block *sb = journal->j_private; - struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_inode_info *pos, *n; - int ret = 0; - - spin_lock(&sbi->s_fc_lock); - list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { - if (!ext4_test_inode_state(&pos->vfs_inode, - EXT4_STATE_FC_COMMITTING)) - continue; - spin_unlock(&sbi->s_fc_lock); - ret = jbd2_wait_inode_data(journal, pos->jinode); + list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ret = jbd2_wait_inode_data(journal, ei->jinode); if (ret) return ret; - spin_lock(&sbi->s_fc_lock); } - spin_unlock(&sbi->s_fc_lock); return 0; } /* Commit all the directory entry updates */ static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc) -__acquires(&sbi->s_fc_lock) -__releases(&sbi->s_fc_lock) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); @@ -1030,26 +1001,22 @@ __releases(&sbi->s_fc_lock) list_for_each_entry_safe(fc_dentry, fc_dentry_n, &sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) { if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) { - spin_unlock(&sbi->s_fc_lock); - if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { - ret = -ENOSPC; - goto lock_and_exit; - } - spin_lock(&sbi->s_fc_lock); + if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) + return -ENOSPC; continue; } /* * With fcd_dilist we need not loop in sbi->s_fc_q to get the - * corresponding inode pointer + * corresponding inode. Also, the corresponding inode could have been + * deleted, in which case, we don't need to do anything. */ - WARN_ON(list_empty(&fc_dentry->fcd_dilist)); + if (list_empty(&fc_dentry->fcd_dilist)) + continue; ei = list_first_entry(&fc_dentry->fcd_dilist, struct ext4_inode_info, i_fc_dilist); inode = &ei->vfs_inode; WARN_ON(inode->i_ino != fc_dentry->fcd_ino); - spin_unlock(&sbi->s_fc_lock); - /* * We first write the inode and then the create dirent. 
This * allows the recovery code to create an unnamed inode first @@ -1059,23 +1026,14 @@ __releases(&sbi->s_fc_lock) */ ret = ext4_fc_write_inode(inode, crc); if (ret) - goto lock_and_exit; - + return ret; ret = ext4_fc_write_inode_data(inode, crc); if (ret) - goto lock_and_exit; - - if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) { - ret = -ENOSPC; - goto lock_and_exit; - } - - spin_lock(&sbi->s_fc_lock); + return ret; + if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) + return -ENOSPC; } return 0; -lock_and_exit: - spin_lock(&sbi->s_fc_lock); - return ret; } static int ext4_fc_perform_commit(journal_t *journal) @@ -1089,26 +1047,81 @@ static int ext4_fc_perform_commit(journal_t *journal) int ret = 0; u32 crc = 0; - ret = ext4_fc_submit_inode_data_all(journal); - if (ret) - return ret; + /* + * Step 1: Mark all inodes on s_fc_q[MAIN] with + * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being + * freed until the data flush is over. + */ + mutex_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_set_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_FLUSHING_DATA); + } + mutex_unlock(&sbi->s_fc_lock); + + /* Step 2: Flush data for all the eligible inodes. */ + ret = ext4_fc_flush_data(journal); - ret = ext4_fc_wait_inode_data_all(journal); + /* + * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning + * any error from step 2. This ensures that waiters waiting on + * EXT4_STATE_FC_FLUSHING_DATA can resume. + */ + mutex_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_clear_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_FLUSHING_DATA); +#if (BITS_PER_LONG < 64) + wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA); +#else + wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA); +#endif + } + + /* + * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before + * the waiter checks the bit. Pairs with implicit barrier in + * prepare_to_wait() in ext4_fc_del(). + */ + smp_mb(); + mutex_unlock(&sbi->s_fc_lock); + + /* + * If we encountered error in Step 2, return it now after clearing + * EXT4_STATE_FC_FLUSHING_DATA bit. + */ if (ret) return ret; + + /* Step 4: Mark all inodes as being committed. */ + jbd2_journal_lock_updates(journal); /* - * If file system device is different from journal device, issue a cache - * flush before we start writing fast commit blocks. + * The journal is now locked. No more handles can start and all the + * previous handles are now drained. We now mark the inodes on the + * commit queue as being committed. + */ + mutex_lock(&sbi->s_fc_lock); + list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { + ext4_set_inode_state(&iter->vfs_inode, + EXT4_STATE_FC_COMMITTING); + } + mutex_unlock(&sbi->s_fc_lock); + jbd2_journal_unlock_updates(journal); + + /* + * Step 5: If file system device is different from journal device, + * issue a cache flush before we start writing fast commit blocks. */ if (journal->j_fs_dev != journal->j_dev) blkdev_issue_flush(journal->j_fs_dev); blk_start_plug(&plug); + /* Step 6: Write fast commit blocks to disk. */ if (sbi->s_fc_bytes == 0) { /* - * Add a head tag only if this is the first fast commit - * in this TID. + * Step 6.1: Add a head tag only if this is the first fast + * commit in this TID. 
*/ head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES); head.fc_tid = cpu_to_le32( @@ -1120,32 +1133,30 @@ static int ext4_fc_perform_commit(journal_t *journal) } } - spin_lock(&sbi->s_fc_lock); + /* Step 6.2: Now write all the dentry updates. */ + mutex_lock(&sbi->s_fc_lock); ret = ext4_fc_commit_dentry_updates(journal, &crc); - if (ret) { - spin_unlock(&sbi->s_fc_lock); + if (ret) goto out; - } + /* Step 6.3: Now write all the changed inodes to disk. */ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) { inode = &iter->vfs_inode; if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) continue; - spin_unlock(&sbi->s_fc_lock); ret = ext4_fc_write_inode_data(inode, &crc); if (ret) goto out; ret = ext4_fc_write_inode(inode, &crc); if (ret) goto out; - spin_lock(&sbi->s_fc_lock); } - spin_unlock(&sbi->s_fc_lock); - + /* Step 6.4: Finally write tail tag to conclude this fast commit. */ ret = ext4_fc_write_tail(sb, crc); out: + mutex_unlock(&sbi->s_fc_lock); blk_finish_plug(&plug); return ret; } @@ -1191,6 +1202,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) int subtid = atomic_read(&sbi->s_fc_subtid); int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0; ktime_t start_time, commit_time; + int old_ioprio, journal_ioprio; if (!test_opt2(sb, JOURNAL_FAST_COMMIT)) return jbd2_complete_transaction(journal, commit_tid); @@ -1198,6 +1210,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid) trace_ext4_fc_commit_start(sb, commit_tid); start_time = ktime_get(); + old_ioprio = get_current_ioprio(); restart_fc: ret = jbd2_fc_begin_commit(journal, commit_tid); @@ -1228,6 +1241,15 @@ restart_fc: goto fallback; } + /* + * Now that we know that this thread is going to do a fast commit, + * elevate the priority to match that of the journal thread. 
+ */ + if (journal->j_task->io_context) + journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; + else + journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; + set_task_ioprio(current, journal_ioprio); fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize; ret = ext4_fc_perform_commit(journal); if (ret < 0) { @@ -1242,6 +1264,7 @@ restart_fc: } atomic_inc(&sbi->s_fc_subtid); ret = jbd2_fc_end_commit(journal); + set_task_ioprio(current, old_ioprio); /* * weight the commit time higher than the average time so we * don't react too strongly to vast changes in the commit time @@ -1251,6 +1274,7 @@ restart_fc: return ret; fallback: + set_task_ioprio(current, old_ioprio); ret = jbd2_fc_end_commit_fallback(journal); ext4_fc_update_stats(sb, status, 0, 0, commit_tid); return ret; @@ -1264,7 +1288,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) { struct super_block *sb = journal->j_private; struct ext4_sb_info *sbi = EXT4_SB(sb); - struct ext4_inode_info *iter, *iter_n; + struct ext4_inode_info *ei; struct ext4_fc_dentry_update *fc_dentry; if (full && sbi->s_fc_bh) @@ -1273,14 +1297,16 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) trace_ext4_fc_cleanup(journal, full, tid); jbd2_fc_release_bufs(journal); - spin_lock(&sbi->s_fc_lock); - list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN], - i_fc_list) { - list_del_init(&iter->i_fc_list); - ext4_clear_inode_state(&iter->vfs_inode, + mutex_lock(&sbi->s_fc_lock); + while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) { + ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN], + struct ext4_inode_info, + i_fc_list); + list_del_init(&ei->i_fc_list); + ext4_clear_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING); - if (tid_geq(tid, iter->i_sync_tid)) { - ext4_fc_reset_inode(&iter->vfs_inode); + if (tid_geq(tid, ei->i_sync_tid)) { + ext4_fc_reset_inode(&ei->vfs_inode); } else if (full) { /* * We are called after a full commit, inode has been @@ -1291,15 +1317,19 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) * time in that case (and tid doesn't increase so * tid check above isn't reliable). */ - list_add_tail(&EXT4_I(&iter->vfs_inode)->i_fc_list, + list_add_tail(&ei->i_fc_list, &sbi->s_fc_q[FC_Q_STAGING]); } - /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */ + /* + * Make sure clearing of EXT4_STATE_FC_COMMITTING is + * visible before we send the wakeup. Pairs with implicit + * barrier in prepare_to_wait() in ext4_fc_track_inode(). 
+ */ smp_mb(); #if (BITS_PER_LONG < 64) - wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING); + wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING); #else - wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING); + wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING); #endif } @@ -1309,11 +1339,9 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) fcd_list); list_del_init(&fc_dentry->fcd_list); list_del_init(&fc_dentry->fcd_dilist); - spin_unlock(&sbi->s_fc_lock); release_dentry_name_snapshot(&fc_dentry->fcd_name); kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry); - spin_lock(&sbi->s_fc_lock); } list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING], @@ -1328,7 +1356,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid) if (full) sbi->s_fc_bytes = 0; - spin_unlock(&sbi->s_fc_lock); + mutex_unlock(&sbi->s_fc_lock); trace_ext4_fc_stats(sb); } @@ -2105,13 +2133,13 @@ static int ext4_fc_replay_scan(journal_t *journal, case EXT4_FC_TAG_INODE: case EXT4_FC_TAG_PAD: state->fc_cur_tag++; - state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, + state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; case EXT4_FC_TAG_TAIL: state->fc_cur_tag++; memcpy(&tail, val, sizeof(tail)); - state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, + state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + offsetof(struct ext4_fc_tail, fc_crc)); @@ -2138,7 +2166,7 @@ static int ext4_fc_replay_scan(journal_t *journal, break; } state->fc_cur_tag++; - state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur, + state->fc_crc = ext4_chksum(state->fc_crc, cur, EXT4_FC_TAG_BASE_LEN + tl.fc_len); break; default: diff --git a/fs/ext4/file.c b/fs/ext4/file.c index beb078ee4811..21df81347147 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -377,7 +377,12 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size, loff_t pos = iocb->ki_pos; struct inode *inode = file_inode(iocb->ki_filp); - if (!error && size && flags & IOMAP_DIO_UNWRITTEN) + + if (!error && size && (flags & IOMAP_DIO_UNWRITTEN) && + (iocb->ki_flags & IOCB_ATOMIC)) + error = ext4_convert_unwritten_extents_atomic(NULL, inode, pos, + size); + else if (!error && size && flags & IOMAP_DIO_UNWRITTEN) error = ext4_convert_unwritten_extents(NULL, inode, pos, size); if (error) return error; @@ -929,12 +934,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp) loff_t ext4_llseek(struct file *file, loff_t offset, int whence) { struct inode *inode = file->f_mapping->host; - loff_t maxbytes; - - if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))) - maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes; - else - maxbytes = inode->i_sb->s_maxbytes; + loff_t maxbytes = ext4_get_maxbytes(inode); switch (whence) { default: diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c index e7ecc7c8a729..79aa3df8d019 100644 --- a/fs/ext4/ialloc.c +++ b/fs/ext4/ialloc.c @@ -1288,10 +1288,9 @@ got: __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = cpu_to_le32(inode->i_generation); - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); - ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, - sizeof(gen)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */ @@ -1336,6 +1335,9 @@ got: } } + if (ext4_should_enable_large_folio(inode)) + mapping_set_large_folios(inode->i_mapping); + 
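Both ext4_fiemap_check_ranges() and ext4_llseek() above now take their limit from ext4_get_maxbytes() instead of open-coding the extents/bitmap distinction. The helper's definition is not part of these hunks; a minimal sketch, assuming it simply folds the removed open-coded check and the usual ext4.h definitions, would be:

	/* Illustrative sketch only, not part of the patch. */
	static inline loff_t ext4_get_maxbytes(struct inode *inode)
	{
		/* Extent-mapped files can use the full per-sb limit. */
		if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
			return inode->i_sb->s_maxbytes;
		/* Indirect-block files are bounded by the bitmap-mapped limit. */
		return EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
	}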
ext4_update_inode_fsync_trans(handle, inode, 1); err = ext4_mark_inode_dirty(handle, inode); diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c index 2c9b762925c7..a1bbcdf40824 100644 --- a/fs/ext4/inline.c +++ b/fs/ext4/inline.c @@ -397,7 +397,7 @@ out: } static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode, - unsigned int len) + loff_t len) { int ret, size, no_expand; struct ext4_inode_info *ei = EXT4_I(inode); @@ -601,6 +601,7 @@ retry: goto out; } + ext4_fc_track_inode(handle, inode); ret = ext4_destroy_inline_data_nolock(handle, inode); if (ret) goto out; diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c index cdf01e60fa6d..be9a4cba35fd 100644 --- a/fs/ext4/inode.c +++ b/fs/ext4/inode.c @@ -58,29 +58,27 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle, static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw, struct ext4_inode_info *ei) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); __u32 csum; __u16 dummy_csum = 0; int offset = offsetof(struct ext4_inode, i_checksum_lo); unsigned int csum_size = sizeof(dummy_csum); - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset); - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size); + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)raw, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + csum = ext4_chksum(csum, (__u8 *)raw + offset, EXT4_GOOD_OLD_INODE_SIZE - offset); if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) { offset = offsetof(struct ext4_inode, i_checksum_hi); - csum = ext4_chksum(sbi, csum, (__u8 *)raw + - EXT4_GOOD_OLD_INODE_SIZE, + csum = ext4_chksum(csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE, offset - EXT4_GOOD_OLD_INODE_SIZE); if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) { - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size); offset += csum_size; } - csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset, + csum = ext4_chksum(csum, (__u8 *)raw + offset, EXT4_INODE_SIZE(inode->i_sb) - offset); } @@ -142,9 +140,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode, new_size); } -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, - int pextents); - /* * Test whether an inode is a fast symlink. * A fast symlink has its symlink data stored in ext4_inode_info->i_data. @@ -416,6 +411,32 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk, return ret; } +/* + * For generic regular files, when updating the extent tree, Ext4 should + * hold the i_rwsem and invalidate_lock exclusively. This ensures + * exclusion against concurrent page faults, as well as reads and writes. 
+ */ +#ifdef CONFIG_EXT4_DEBUG +void ext4_check_map_extents_env(struct inode *inode) +{ + if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) + return; + + if (!S_ISREG(inode->i_mode) || + IS_NOQUOTA(inode) || IS_VERITY(inode) || + is_special_ino(inode->i_sb, inode->i_ino) || + (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) || + ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) || + ext4_verity_in_progress(inode)) + return; + + WARN_ON_ONCE(!inode_is_locked(inode) && + !rwsem_is_locked(&inode->i_mapping->invalidate_lock)); +} +#else +void ext4_check_map_extents_env(struct inode *inode) {} +#endif + #define check_block_validity(inode, map) \ __check_block_validity((inode), __func__, __LINE__, (map)) @@ -462,16 +483,73 @@ static void ext4_map_blocks_es_recheck(handle_t *handle, } #endif /* ES_AGGRESSIVE_TEST */ +static int ext4_map_query_blocks_next_in_leaf(handle_t *handle, + struct inode *inode, struct ext4_map_blocks *map, + unsigned int orig_mlen) +{ + struct ext4_map_blocks map2; + unsigned int status, status2; + int retval; + + status = map->m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + + WARN_ON_ONCE(!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF)); + WARN_ON_ONCE(orig_mlen <= map->m_len); + + /* Prepare map2 for lookup in next leaf block */ + map2.m_lblk = map->m_lblk + map->m_len; + map2.m_len = orig_mlen - map->m_len; + map2.m_flags = 0; + retval = ext4_ext_map_blocks(handle, inode, &map2, 0); + + if (retval <= 0) { + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + return map->m_len; + } + + if (unlikely(retval != map2.m_len)) { + ext4_warning(inode->i_sb, + "ES len assertion failed for inode " + "%lu: retval %d != map->m_len %d", + inode->i_ino, retval, map2.m_len); + WARN_ON(1); + } + + status2 = map2.m_flags & EXT4_MAP_UNWRITTEN ? + EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + + /* + * If map2 is contiguous with map, then let's insert it as a single + * extent in es cache and return the combined length of both the maps. + */ + if (map->m_pblk + map->m_len == map2.m_pblk && + status == status2) { + ext4_es_insert_extent(inode, map->m_lblk, + map->m_len + map2.m_len, map->m_pblk, + status, false); + map->m_len += map2.m_len; + } else { + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + } + + return map->m_len; +} + static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, - struct ext4_map_blocks *map) + struct ext4_map_blocks *map, int flags) { unsigned int status; int retval; + unsigned int orig_mlen = map->m_len; + flags &= EXT4_EX_QUERY_FILTER; if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) - retval = ext4_ext_map_blocks(handle, inode, map, 0); + retval = ext4_ext_map_blocks(handle, inode, map, flags); else - retval = ext4_ind_map_blocks(handle, inode, map, 0); + retval = ext4_ind_map_blocks(handle, inode, map, flags); if (retval <= 0) return retval; @@ -484,11 +562,22 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode, WARN_ON(1); } - status = map->m_flags & EXT4_MAP_UNWRITTEN ? - EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; - ext4_es_insert_extent(inode, map->m_lblk, map->m_len, - map->m_pblk, status, false); - return retval; + /* + * No need to query next in leaf: + * - if returned extent is not last in leaf or + * - if the last in leaf is the full requested range + */ + if (!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) || + map->m_len == orig_mlen) { + status = map->m_flags & EXT4_MAP_UNWRITTEN ? 
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN; + ext4_es_insert_extent(inode, map->m_lblk, map->m_len, + map->m_pblk, status, false); + return retval; + } + + return ext4_map_query_blocks_next_in_leaf(handle, inode, map, + orig_mlen); } static int ext4_map_create_blocks(handle_t *handle, struct inode *inode, @@ -602,6 +691,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, struct extent_status es; int retval; int ret = 0; + unsigned int orig_mlen = map->m_len; #ifdef ES_AGGRESSIVE_TEST struct ext4_map_blocks orig_map; @@ -622,6 +712,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS)) return -EFSCORRUPTED; + /* + * Callers from the context of data submission are the only exceptions + * for regular files that do not hold the i_rwsem or invalidate_lock. + * However, caching unrelated ranges is not permitted. + */ + if (flags & EXT4_GET_BLOCKS_IO_SUBMIT) + WARN_ON_ONCE(!(flags & EXT4_EX_NOCACHE)); + else + ext4_check_map_extents_env(inode); + /* Lookup extent status tree firstly */ if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) && ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { @@ -653,7 +753,12 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, ext4_map_blocks_es_recheck(handle, inode, map, &orig_map, flags); #endif - goto found; + if (!(flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) || + orig_mlen == map->m_len) + goto found; + + if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) + map->m_len = orig_mlen; } /* * In the query cache no-wait mode, nothing we can do more if we @@ -667,7 +772,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode, * file system block. */ down_read(&EXT4_I(inode)->i_data_sem); - retval = ext4_map_query_blocks(handle, inode, map); + retval = ext4_map_query_blocks(handle, inode, map, flags); up_read((&EXT4_I(inode)->i_data_sem)); found: @@ -696,6 +801,8 @@ found: if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN)) return retval; + + ext4_fc_track_inode(handle, inode); /* * New blocks allocate and/or writing to unwritten extent * will possibly result in updating i_data, so we take @@ -1009,7 +1116,12 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode, */ static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh) { - folio_mark_dirty(bh->b_folio); + struct folio *folio = bh->b_folio; + struct inode *inode = folio->mapping->host; + + /* only regular files have a_ops */ + if (S_ISREG(inode->i_mode)) + folio_mark_dirty(folio); return ext4_handle_dirty_metadata(handle, NULL, bh); } @@ -1027,7 +1139,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, loff_t pos, unsigned len, get_block_t *get_block) { - unsigned from = pos & (PAGE_SIZE - 1); + unsigned int from = offset_in_folio(folio, pos); unsigned to = from + len; struct inode *inode = folio->mapping->host; unsigned block_start, block_end; @@ -1041,8 +1153,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio, bool should_journal_data = ext4_should_journal_data(inode); BUG_ON(!folio_test_locked(folio)); - BUG_ON(from > PAGE_SIZE); - BUG_ON(to > PAGE_SIZE); + BUG_ON(to > folio_size(folio)); BUG_ON(from > to); head = folio_buffers(folio); @@ -1152,6 +1263,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, struct folio *folio; pgoff_t index; unsigned from, to; + fgf_t fgp = FGP_WRITEBEGIN; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) @@ -1164,8 +1276,6 @@ static int ext4_write_begin(struct file 
*file, struct address_space *mapping, */ needed_blocks = ext4_writepage_trans_blocks(inode) + 1; index = pos >> PAGE_SHIFT; - from = pos & (PAGE_SIZE - 1); - to = from + len; if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) { ret = ext4_try_to_write_inline_data(mapping, inode, pos, len, @@ -1184,10 +1294,18 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping, * the folio (if needed) without using GFP_NOFS. */ retry_grab: - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, - mapping_gfp_mask(mapping)); + fgp |= fgf_set_order(len); + folio = __filemap_get_folio(mapping, index, fgp, + mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); + + if (pos + len > folio_pos(folio) + folio_size(folio)) + len = folio_pos(folio) + folio_size(folio) - pos; + + from = offset_in_folio(folio, pos); + to = from + len; + /* * The same as page allocation, we prealloc buffer heads before * starting the handle. @@ -1765,6 +1883,8 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map) ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len, (unsigned long) map->m_lblk); + ext4_check_map_extents_env(inode); + /* Lookup extent status tree firstly */ if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) { map->m_len = min_t(unsigned int, map->m_len, @@ -1805,7 +1925,7 @@ found: if (ext4_has_inline_data(inode)) retval = 0; else - retval = ext4_map_query_blocks(NULL, inode, map); + retval = ext4_map_query_blocks(NULL, inode, map, 0); up_read(&EXT4_I(inode)->i_data_sem); if (retval) return retval < 0 ? retval : 0; @@ -1828,7 +1948,7 @@ add_delayed: goto found; } } else if (!ext4_has_inline_data(inode)) { - retval = ext4_map_query_blocks(NULL, inode, map); + retval = ext4_map_query_blocks(NULL, inode, map, 0); if (retval) { up_write(&EXT4_I(inode)->i_data_sem); return retval < 0 ? retval : 0; @@ -1936,7 +2056,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio) len = size & (len - 1); err = ext4_bio_write_folio(&mpd->io_submit, folio, len); if (!err) - mpd->wbc->nr_to_write--; + mpd->wbc->nr_to_write -= folio_nr_pages(folio); return err; } @@ -2159,7 +2279,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) start = mpd->map.m_lblk >> bpp_bits; end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits; - lblk = start << bpp_bits; pblock = mpd->map.m_pblk; folio_batch_init(&fbatch); @@ -2170,6 +2289,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd) for (i = 0; i < nr; i++) { struct folio *folio = fbatch.folios[i]; + lblk = folio->index << bpp_bits; err = mpage_process_folio(mpd, folio, &lblk, &pblock, &map_bh); /* @@ -2212,11 +2332,15 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd) * previously reserved. However we must not fail because we're in * writeback and there is nothing we can do about it so it might result * in data loss. So use reserved blocks to allocate metadata if - * possible. + * possible. In addition, do not cache any unrelated extents, as it + * only holds the folio lock but does not hold the i_rwsem or + * invalidate_lock, which could corrupt the extent status tree. 
*/ get_blocks_flags = EXT4_GET_BLOCKS_CREATE | EXT4_GET_BLOCKS_METADATA_NOFAIL | - EXT4_GET_BLOCKS_IO_SUBMIT; + EXT4_GET_BLOCKS_IO_SUBMIT | + EXT4_EX_NOCACHE; + dioread_nolock = ext4_should_dioread_nolock(inode); if (dioread_nolock) get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT; @@ -2355,7 +2479,7 @@ update_disksize: */ static int ext4_da_writepages_trans_blocks(struct inode *inode) { - int bpp = ext4_journal_blocks_per_page(inode); + int bpp = ext4_journal_blocks_per_folio(inode); return ext4_meta_trans_blocks(inode, MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp); @@ -2391,7 +2515,7 @@ static int mpage_journal_page_buffers(handle_t *handle, size_t len = folio_size(folio); folio_clear_checked(folio); - mpd->wbc->nr_to_write--; + mpd->wbc->nr_to_write -= folio_nr_pages(folio); if (folio_pos(folio) + len > size && !ext4_verity_in_progress(inode)) @@ -2433,7 +2557,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd) ext4_lblk_t lblk; struct buffer_head *head; handle_t *handle = NULL; - int bpp = ext4_journal_blocks_per_page(mpd->inode); + int bpp = ext4_journal_blocks_per_folio(mpd->inode); if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages) tag = PAGECACHE_TAG_TOWRITE; @@ -2920,6 +3044,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, struct folio *folio; pgoff_t index; struct inode *inode = mapping->host; + fgf_t fgp = FGP_WRITEBEGIN; ret = ext4_emergency_state(inode->i_sb); if (unlikely(ret)) @@ -2945,11 +3070,15 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping, } retry: - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, - mapping_gfp_mask(mapping)); + fgp |= fgf_set_order(len); + folio = __filemap_get_folio(mapping, index, fgp, + mapping_gfp_mask(mapping)); if (IS_ERR(folio)) return PTR_ERR(folio); + if (pos + len > folio_pos(folio) + folio_size(folio)) + len = folio_pos(folio) + folio_size(folio) - pos; + ret = ext4_block_write_begin(NULL, folio, pos, len, ext4_da_get_block_prep); if (ret < 0) { @@ -3038,7 +3167,7 @@ static int ext4_da_do_write_end(struct address_space *mapping, unsigned long end; i_size_write(inode, new_i_size); - end = (new_i_size - 1) & (PAGE_SIZE - 1); + end = offset_in_folio(folio, new_i_size - 1); if (copied && ext4_da_should_update_i_disksize(folio, end)) { ext4_update_i_disksize(inode, new_i_size); disksize_changed = true; @@ -3340,12 +3469,149 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap, } } +static int ext4_map_blocks_atomic_write_slow(handle_t *handle, + struct inode *inode, struct ext4_map_blocks *map) +{ + ext4_lblk_t m_lblk = map->m_lblk; + unsigned int m_len = map->m_len; + unsigned int mapped_len = 0, m_flags = 0; + ext4_fsblk_t next_pblk; + bool check_next_pblk = false; + int ret = 0; + + WARN_ON_ONCE(!ext4_has_feature_bigalloc(inode->i_sb)); + + /* + * This is a slow path in case of mixed mapping. We use + * EXT4_GET_BLOCKS_CREATE_ZERO flag here to make sure we get a single + * contiguous mapped mapping. This will ensure any unwritten or hole + * regions within the requested range is zeroed out and we return + * a single contiguous mapped extent. + */ + m_flags = EXT4_GET_BLOCKS_CREATE_ZERO; + + do { + ret = ext4_map_blocks(handle, inode, map, m_flags); + if (ret < 0 && ret != -ENOSPC) + goto out_err; + /* + * This should never happen, but let's return an error code to + * avoid an infinite loop in here. 
+ */ + if (ret == 0) { + ret = -EFSCORRUPTED; + ext4_warning_inode(inode, + "ext4_map_blocks() couldn't allocate blocks m_flags: 0x%x, ret:%d", + m_flags, ret); + goto out_err; + } + /* + * With bigalloc we should never get ENOSPC nor discontiguous + * physical extents. + */ + if ((check_next_pblk && next_pblk != map->m_pblk) || + ret == -ENOSPC) { + ext4_warning_inode(inode, + "Non-contiguous allocation detected: expected %llu, got %llu, " + "or ext4_map_blocks() returned out of space ret: %d", + next_pblk, map->m_pblk, ret); + ret = -EFSCORRUPTED; + goto out_err; + } + next_pblk = map->m_pblk + map->m_len; + check_next_pblk = true; + + mapped_len += map->m_len; + map->m_lblk += map->m_len; + map->m_len = m_len - mapped_len; + } while (mapped_len < m_len); + + /* + * We might have done some work in above loop, so we need to query the + * start of the physical extent, based on the origin m_lblk and m_len. + * Let's also ensure we were able to allocate the required range for + * mixed mapping case. + */ + map->m_lblk = m_lblk; + map->m_len = m_len; + map->m_flags = 0; + + ret = ext4_map_blocks(handle, inode, map, + EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF); + if (ret != m_len) { + ext4_warning_inode(inode, + "allocation failed for atomic write request m_lblk:%u, m_len:%u, ret:%d\n", + m_lblk, m_len, ret); + ret = -EINVAL; + } + return ret; + +out_err: + /* reset map before returning an error */ + map->m_lblk = m_lblk; + map->m_len = m_len; + map->m_flags = 0; + return ret; +} + +/* + * ext4_map_blocks_atomic: Helper routine to ensure the entire requested + * range in @map [lblk, lblk + len) is one single contiguous extent with no + * mixed mappings. + * + * We first use m_flags passed to us by our caller (ext4_iomap_alloc()). + * We only call EXT4_GET_BLOCKS_ZERO in the slow path, when the underlying + * physical extent for the requested range does not have a single contiguous + * mapping type i.e. (Hole, Mapped, or Unwritten) throughout. + * In that case we will loop over the requested range to allocate and zero out + * the unwritten / holes in between, to get a single mapped extent from + * [m_lblk, m_lblk + m_len). Note that this is only possible because we know + * this can be called only with bigalloc enabled filesystem where the underlying + * cluster is already allocated. This avoids allocating discontiguous extents + * in the slow path due to multiple calls to ext4_map_blocks(). + * The slow path is mostly non-performance critical path, so it should be ok to + * loop using ext4_map_blocks() with appropriate flags to allocate & zero the + * underlying short holes/unwritten extents within the requested range. + */ +static int ext4_map_blocks_atomic_write(handle_t *handle, struct inode *inode, + struct ext4_map_blocks *map, int m_flags, + bool *force_commit) +{ + ext4_lblk_t m_lblk = map->m_lblk; + unsigned int m_len = map->m_len; + int ret = 0; + + WARN_ON_ONCE(m_len > 1 && !ext4_has_feature_bigalloc(inode->i_sb)); + + ret = ext4_map_blocks(handle, inode, map, m_flags); + if (ret < 0 || ret == m_len) + goto out; + /* + * This is a mixed mapping case where we were not able to allocate + * a single contiguous extent. In that case let's reset requested + * mapping and call the slow path. + */ + map->m_lblk = m_lblk; + map->m_len = m_len; + map->m_flags = 0; + + /* + * slow path means we have mixed mapping, that means we will need + * to force txn commit. 
+ */ + *force_commit = true; + return ext4_map_blocks_atomic_write_slow(handle, inode, map); +out: + return ret; +} + static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, unsigned int flags) { handle_t *handle; u8 blkbits = inode->i_blkbits; int ret, dio_credits, m_flags = 0, retries = 0; + bool force_commit = false; /* * Trim the mapping request to the maximum value that we can map at @@ -3353,7 +3619,30 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map, */ if (map->m_len > DIO_MAX_BLOCKS) map->m_len = DIO_MAX_BLOCKS; - dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); + + /* + * journal credits estimation for atomic writes. We call + * ext4_map_blocks(), to find if there could be a mixed mapping. If yes, + * then let's assume the no. of pextents required can be m_len i.e. + * every alternate block can be unwritten and hole. + */ + if (flags & IOMAP_ATOMIC) { + unsigned int orig_mlen = map->m_len; + + ret = ext4_map_blocks(NULL, inode, map, 0); + if (ret < 0) + return ret; + if (map->m_len < orig_mlen) { + map->m_len = orig_mlen; + dio_credits = ext4_meta_trans_blocks(inode, orig_mlen, + map->m_len); + } else { + dio_credits = ext4_chunk_trans_blocks(inode, + map->m_len); + } + } else { + dio_credits = ext4_chunk_trans_blocks(inode, map->m_len); + } retry: /* @@ -3384,7 +3673,11 @@ retry: else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT; - ret = ext4_map_blocks(handle, inode, map, m_flags); + if (flags & IOMAP_ATOMIC) + ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags, + &force_commit); + else + ret = ext4_map_blocks(handle, inode, map, m_flags); /* * We cannot fill holes in indirect tree based inodes as that could @@ -3398,6 +3691,22 @@ retry: if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries)) goto retry; + /* + * Force commit the current transaction if the allocation spans a mixed + * mapping range. This ensures any pending metadata updates (like + * unwritten to written extents conversion) in this range are in + * consistent state with the file data blocks, before performing the + * actual write I/O. If the commit fails, the whole I/O must be aborted + * to prevent any possible torn writes. + */ + if (ret > 0 && force_commit) { + int ret2; + + ret2 = ext4_force_commit(inode->i_sb); + if (ret2) + return ret2; + } + return ret; } @@ -3408,6 +3717,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, int ret; struct ext4_map_blocks map; u8 blkbits = inode->i_blkbits; + unsigned int orig_mlen; if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK) return -EINVAL; @@ -3421,6 +3731,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, map.m_lblk = offset >> blkbits; map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits, EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1; + orig_mlen = map.m_len; if (flags & IOMAP_WRITE) { /* @@ -3431,11 +3742,23 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length, */ if (offset + length <= i_size_read(inode)) { ret = ext4_map_blocks(NULL, inode, &map, 0); - if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED)) - goto out; + /* + * For atomic writes the entire requested length should + * be mapped. 
+ */ + if (map.m_flags & EXT4_MAP_MAPPED) { + if ((!(flags & IOMAP_ATOMIC) && ret > 0) || + (flags & IOMAP_ATOMIC && ret >= orig_mlen)) + goto out; + } + map.m_len = orig_mlen; } ret = ext4_iomap_alloc(inode, &map, flags); } else { + /* + * This can be called for overwrites path from + * ext4_iomap_overwrite_begin(). + */ ret = ext4_map_blocks(NULL, inode, &map, 0); } @@ -3449,6 +3772,16 @@ out: */ map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len); + /* + * Before returning to iomap, let's ensure the allocated mapping + * covers the entire requested length for atomic writes. + */ + if (flags & IOMAP_ATOMIC) { + if (map.m_len < (length >> blkbits)) { + WARN_ON_ONCE(1); + return -EINVAL; + } + } ext4_set_iomap(inode, iomap, &map, offset, length, flags); return 0; @@ -3690,9 +4023,7 @@ void ext4_set_aops(struct inode *inode) static int __ext4_block_zero_page_range(handle_t *handle, struct address_space *mapping, loff_t from, loff_t length) { - ext4_fsblk_t index = from >> PAGE_SHIFT; - unsigned offset = from & (PAGE_SIZE-1); - unsigned blocksize, pos; + unsigned int offset, blocksize, pos; ext4_lblk_t iblock; struct inode *inode = mapping->host; struct buffer_head *bh; @@ -3707,13 +4038,14 @@ static int __ext4_block_zero_page_range(handle_t *handle, blocksize = inode->i_sb->s_blocksize; - iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); + iblock = folio->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); bh = folio_buffers(folio); if (!bh) bh = create_empty_buffers(folio, blocksize, 0); /* Find the buffer that contains "offset" */ + offset = offset_in_folio(folio, from); pos = blocksize; while (offset >= pos) { bh = bh->b_this_page; @@ -4006,7 +4338,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) struct inode *inode = file_inode(file); struct super_block *sb = inode->i_sb; ext4_lblk_t start_lblk, end_lblk; - loff_t max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize; + loff_t max_end = sb->s_maxbytes; loff_t end = offset + length; handle_t *handle; unsigned int credits; @@ -4015,14 +4347,20 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) trace_ext4_punch_hole(inode, offset, length, 0); WARN_ON_ONCE(!inode_is_locked(inode)); + /* + * For indirect-block based inodes, make sure that the hole within + * one block before last range. + */ + if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) + max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize; + /* No need to punch hole beyond i_size */ - if (offset >= inode->i_size) + if (offset >= inode->i_size || offset >= max_end) return 0; /* * If the hole extends beyond i_size, set the hole to end after - * the page that contains i_size, and also make sure that the hole - * within one block before last range. + * the page that contains i_size. 
*/ if (end > inode->i_size) end = round_up(inode->i_size, PAGE_SIZE); @@ -4072,6 +4410,8 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length) if (end_lblk > start_lblk) { ext4_lblk_t hole_len = end_lblk - start_lblk; + ext4_fc_track_inode(handle, inode); + ext4_check_map_extents_env(inode); down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); @@ -4224,8 +4564,10 @@ int ext4_truncate(struct inode *inode) if (err) goto out_stop; - down_write(&EXT4_I(inode)->i_data_sem); + ext4_fc_track_inode(handle, inode); + ext4_check_map_extents_env(inode); + down_write(&EXT4_I(inode)->i_data_sem); ext4_discard_preallocations(inode); if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)) @@ -4760,10 +5102,27 @@ static int check_igot_inode(struct inode *inode, ext4_iget_flags flags, return 0; error: - ext4_error_inode(inode, function, line, 0, err_str); + ext4_error_inode(inode, function, line, 0, "%s", err_str); return -EFSCORRUPTED; } +bool ext4_should_enable_large_folio(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + + if (!S_ISREG(inode->i_mode)) + return false; + if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA || + ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA)) + return false; + if (ext4_has_feature_verity(sb)) + return false; + if (ext4_has_feature_encrypt(sb)) + return false; + + return true; +} + struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ext4_iget_flags flags, const char *function, unsigned int line) @@ -4781,12 +5140,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, gid_t i_gid; projid_t i_projid; - if ((!(flags & EXT4_IGET_SPECIAL) && - ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) || - ino == le32_to_cpu(es->s_usr_quota_inum) || - ino == le32_to_cpu(es->s_grp_quota_inum) || - ino == le32_to_cpu(es->s_prj_quota_inum) || - ino == le32_to_cpu(es->s_orphan_file_inum))) || + if ((!(flags & EXT4_IGET_SPECIAL) && is_special_ino(sb, ino)) || (ino < EXT4_ROOT_INO) || (ino > le32_to_cpu(es->s_inodes_count))) { if (flags & EXT4_IGET_HANDLE) @@ -4845,10 +5199,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, __u32 csum; __le32 inum = cpu_to_le32(inode->i_ino); __le32 gen = raw_inode->i_generation; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); - ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, - sizeof(gen)); + ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } if ((!ext4_inode_csum_verify(inode, raw_inode, ei) || @@ -4916,7 +5269,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ei->i_file_acl |= ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; inode->i_size = ext4_isize(sb, raw_inode); - if ((size = i_size_read(inode)) < 0) { + size = i_size_read(inode); + if (size < 0 || size > ext4_get_maxbytes(inode)) { ext4_error_inode(inode, function, line, 0, "iget: bad i_size value: %lld", size); ret = -EFSCORRUPTED; @@ -5086,6 +5440,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino, ret = -EFSCORRUPTED; goto bad_inode; } + if (ext4_should_enable_large_folio(inode)) + mapping_set_large_folios(inode->i_mapping); + ret = check_igot_inode(inode, flags, function, line); /* * -ESTALE here means there is nothing inherently wrong with the inode, @@ -5564,9 +5921,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, down_write(&EXT4_I(inode)->i_data_sem); old_disksize = EXT4_I(inode)->i_disksize; 
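The special-inode test that __ext4_iget() used to open-code is replaced above by is_special_ino(), which ext4_check_map_extents_env() in inode.c also relies on. Its definition lies outside these hunks; a minimal sketch, assuming it merely wraps the removed condition and the usual ext4.h definitions, would be:

	/* Illustrative sketch only, not part of the patch. */
	static inline bool is_special_ino(struct super_block *sb, unsigned long ino)
	{
		struct ext4_super_block *es = EXT4_SB(sb)->s_es;

		/* Reserved inodes below the first regular inode, except the root. */
		return (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) ||
			ino == le32_to_cpu(es->s_usr_quota_inum) ||
			ino == le32_to_cpu(es->s_grp_quota_inum) ||
			ino == le32_to_cpu(es->s_prj_quota_inum) ||
			ino == le32_to_cpu(es->s_orphan_file_inum);
	}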
EXT4_I(inode)->i_disksize = attr->ia_size; - rc = ext4_mark_inode_dirty(handle, inode); - if (!error) - error = rc; + /* * We have to update i_size under i_data_sem together * with i_disksize to avoid races with writeback code @@ -5577,6 +5932,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry, else EXT4_I(inode)->i_disksize = old_disksize; up_write(&EXT4_I(inode)->i_data_sem); + rc = ext4_mark_inode_dirty(handle, inode); + if (!error) + error = rc; ext4_journal_stop(handle); if (error) goto out_mmap_sem; @@ -5773,8 +6131,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks, * * Also account for superblock, inode, quota and xattr blocks */ -static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, - int pextents) +int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents) { ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb); int gdpblocks; @@ -5782,18 +6139,16 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int ret; /* - * How many index blocks need to touch to map @lblocks logical blocks - * to @pextents physical extents? + * How many index and lead blocks need to touch to map @lblocks + * logical blocks to @pextents physical extents? */ idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents); - ret = idxblocks; - /* * Now let's see how many group bitmaps and group descriptors need * to account */ - groups = idxblocks + pextents; + groups = idxblocks; gdpblocks = groups; if (groups > ngroups) groups = ngroups; @@ -5801,7 +6156,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count; /* bitmaps and block group descriptor blocks */ - ret += groups + gdpblocks; + ret = idxblocks + groups + gdpblocks; /* Blocks for super block, inode, quota and xattr blocks */ ret += EXT4_META_TRANS_BLOCKS(inode->i_sb); @@ -5821,7 +6176,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks, */ int ext4_writepage_trans_blocks(struct inode *inode) { - int bpp = ext4_journal_blocks_per_page(inode); + int bpp = ext4_journal_blocks_per_folio(inode); int ret; ret = ext4_meta_trans_blocks(inode, bpp, bpp); @@ -5895,6 +6250,7 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode, brelse(iloc->bh); iloc->bh = NULL; } + ext4_fc_track_inode(handle, inode); } ext4_std_error(inode->i_sb, err); return err; diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c index d17207386ead..5668a17458ae 100644 --- a/fs/ext4/ioctl.c +++ b/fs/ext4/ioctl.c @@ -143,7 +143,7 @@ static int ext4_update_backup_sb(struct super_block *sb, es = (struct ext4_super_block *) (bh->b_data + offset); lock_buffer(bh); if (ext4_has_feature_metadata_csum(sb) && - es->s_checksum != ext4_superblock_csum(sb, es)) { + es->s_checksum != ext4_superblock_csum(es)) { ext4_msg(sb, KERN_ERR, "Invalid checksum for backup " "superblock %llu", sb_block); unlock_buffer(bh); @@ -151,7 +151,7 @@ static int ext4_update_backup_sb(struct super_block *sb, } func(es, arg); if (ext4_has_feature_metadata_csum(sb)) - es->s_checksum = ext4_superblock_csum(sb, es); + es->s_checksum = ext4_superblock_csum(es); set_buffer_uptodate(bh); unlock_buffer(bh); @@ -354,8 +354,8 @@ void ext4_reset_inode_seed(struct inode *inode) if (!ext4_has_feature_metadata_csum(inode->i_sb)) return; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); - ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen)); + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum)); + 
ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen)); } /* @@ -1505,8 +1505,14 @@ resizefs_out: return 0; } case EXT4_IOC_PRECACHE_EXTENTS: - return ext4_ext_precache(inode); + { + int ret; + inode_lock_shared(inode); + ret = ext4_ext_precache(inode); + inode_unlock_shared(inode); + return ret; + } case FS_IOC_SET_ENCRYPTION_POLICY: if (!ext4_has_feature_encrypt(sb)) return -EOPNOTSUPP; diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c index 3e26464b1425..51661570cf3b 100644 --- a/fs/ext4/mmp.c +++ b/fs/ext4/mmp.c @@ -14,7 +14,7 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp) int offset = offsetof(struct mmp_struct, mmp_checksum); __u32 csum; - csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset); + csum = ext4_chksum(sbi->s_csum_seed, (char *)mmp, offset); return cpu_to_le32(csum); } diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c index 48649be64d6a..1f8493a56e8f 100644 --- a/fs/ext4/move_extent.c +++ b/fs/ext4/move_extent.c @@ -269,7 +269,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode, unsigned int tmp_data_size, data_size, replaced_size; int i, err2, jblocks, retries = 0; int replaced_count = 0; - int from = data_offset_in_page << orig_inode->i_blkbits; + int from; int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits; struct super_block *sb = orig_inode->i_sb; struct buffer_head *bh = NULL; @@ -323,11 +323,6 @@ again: * hold page's lock, if it is still the case data copy is not * necessary, just swap data blocks between orig and donor. */ - - VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]); - VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]); - VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]); - if (unwritten) { ext4_double_down_write_data_sem(orig_inode, donor_inode); /* If any of extents in range became initialized we have to @@ -360,6 +355,8 @@ again: goto unlock_folios; } data_copy: + from = offset_in_folio(folio[0], + orig_blk_offset << orig_inode->i_blkbits); *err = mext_page_mkuptodate(folio[0], from, from + replaced_size); if (*err) goto unlock_folios; @@ -390,7 +387,7 @@ data_copy: if (!bh) bh = create_empty_buffers(folio[0], 1 << orig_inode->i_blkbits, 0); - for (i = 0; i < data_offset_in_page; i++) + for (i = 0; i < from >> orig_inode->i_blkbits; i++) bh = bh->b_this_page; for (i = 0; i < block_len_in_page; i++) { *err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0); diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c index e9712e64ec8f..a178ac229489 100644 --- a/fs/ext4/namei.c +++ b/fs/ext4/namei.c @@ -346,11 +346,10 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode, static __le32 ext4_dirblock_csum(struct inode *inode, void *dirent, int size) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size); return cpu_to_le32(csum); } @@ -442,7 +441,6 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode, static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, int count_offset, int count, struct dx_tail *t) { - struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb); struct ext4_inode_info *ei = EXT4_I(inode); __u32 csum; int size; @@ -450,9 +448,9 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent, int offset = offsetof(struct dx_tail, dt_checksum); size = count_offset + (count * sizeof(struct 
dx_entry)); - csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size); - csum = ext4_chksum(sbi, csum, (__u8 *)t, offset); - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); + csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size); + csum = ext4_chksum(csum, (__u8 *)t, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); return cpu_to_le32(csum); } diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c index c66e0cb29bd4..7c7f792ad6ab 100644 --- a/fs/ext4/orphan.c +++ b/fs/ext4/orphan.c @@ -541,9 +541,9 @@ static int ext4_orphan_file_block_csum_verify(struct super_block *sb, return 1; ot = ext4_orphan_block_tail(sb, bh); - calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, - (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); - calculated = ext4_chksum(EXT4_SB(sb), calculated, (__u8 *)bh->b_data, + calculated = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr, + sizeof(dsk_block_nr)); + calculated = ext4_chksum(calculated, (__u8 *)bh->b_data, inodes_per_ob * sizeof(__u32)); return le32_to_cpu(ot->ob_checksum) == calculated; } @@ -560,10 +560,9 @@ void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers, struct ext4_orphan_block_tail *ot; __le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr); - csum = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed, - (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); - csum = ext4_chksum(EXT4_SB(sb), csum, (__u8 *)data, - inodes_per_ob * sizeof(__u32)); + csum = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr, + sizeof(dsk_block_nr)); + csum = ext4_chksum(csum, (__u8 *)data, inodes_per_ob * sizeof(__u32)); ot = ext4_orphan_block_tail(sb, bh); ot->ob_checksum = cpu_to_le32(csum); } diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c index 5d3a9dc9a32d..f329daf6e5c7 100644 --- a/fs/ext4/readpage.c +++ b/fs/ext4/readpage.c @@ -227,24 +227,30 @@ int ext4_mpage_readpages(struct inode *inode, int length; unsigned relative_block = 0; struct ext4_map_blocks map; - unsigned int nr_pages = rac ? readahead_count(rac) : 1; + unsigned int nr_pages, folio_pages; map.m_pblk = 0; map.m_lblk = 0; map.m_len = 0; map.m_flags = 0; - for (; nr_pages; nr_pages--) { + nr_pages = rac ? readahead_count(rac) : folio_nr_pages(folio); + for (; nr_pages; nr_pages -= folio_pages) { int fully_mapped = 1; - unsigned first_hole = blocks_per_page; + unsigned int first_hole; + unsigned int blocks_per_folio; if (rac) folio = readahead_folio(rac); + + folio_pages = folio_nr_pages(folio); prefetchw(&folio->flags); if (folio_buffers(folio)) goto confused; + blocks_per_folio = folio_size(folio) >> blkbits; + first_hole = blocks_per_folio; block_in_file = next_block = (sector_t)folio->index << (PAGE_SHIFT - blkbits); last_block = block_in_file + nr_pages * blocks_per_page; @@ -270,7 +276,7 @@ int ext4_mpage_readpages(struct inode *inode, map.m_flags &= ~EXT4_MAP_MAPPED; break; } - if (page_block == blocks_per_page) + if (page_block == blocks_per_folio) break; page_block++; block_in_file++; @@ -281,7 +287,7 @@ int ext4_mpage_readpages(struct inode *inode, * Then do more ext4_map_blocks() calls until we are * done with this folio. 
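The ext4_mpage_readpages() hunks above and below replace the fixed blocks_per_page with a blocks_per_folio recomputed from folio_size() on each iteration, and first_hole now starts at that per-folio value. A small sketch of the arithmetic with assumed example sizes (4 KiB blocks, a 32 KiB folio, a page shift of 12); the values are illustrative only.

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

int main(void)
{
        unsigned int page_shift = 12;           /* assumed 4 KiB pages      */
        unsigned int blkbits = 12;              /* 4 KiB filesystem blocks  */
        size_t folio_size = 32 * 1024;          /* one large folio          */
        uint64_t folio_index = 24;              /* folio->index in the file */

        unsigned int blocks_per_folio = folio_size >> blkbits;
        /* "No hole seen yet" is encoded as first_hole == blocks_per_folio. */
        unsigned int first_hole = blocks_per_folio;
        /* First file block covered by this folio. */
        uint64_t block_in_file = folio_index << (page_shift - blkbits);

        printf("blocks_per_folio=%u first_hole=%u block_in_file=%llu\n",
               blocks_per_folio, first_hole,
               (unsigned long long)block_in_file);
        return 0;
}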
*/ - while (page_block < blocks_per_page) { + while (page_block < blocks_per_folio) { if (block_in_file < last_block) { map.m_lblk = block_in_file; map.m_len = last_block - block_in_file; @@ -296,13 +302,13 @@ int ext4_mpage_readpages(struct inode *inode, } if ((map.m_flags & EXT4_MAP_MAPPED) == 0) { fully_mapped = 0; - if (first_hole == blocks_per_page) + if (first_hole == blocks_per_folio) first_hole = page_block; page_block++; block_in_file++; continue; } - if (first_hole != blocks_per_page) + if (first_hole != blocks_per_folio) goto confused; /* hole -> non-hole */ /* Contiguous blocks? */ @@ -315,13 +321,13 @@ int ext4_mpage_readpages(struct inode *inode, /* needed? */ map.m_flags &= ~EXT4_MAP_MAPPED; break; - } else if (page_block == blocks_per_page) + } else if (page_block == blocks_per_folio) break; page_block++; block_in_file++; } } - if (first_hole != blocks_per_page) { + if (first_hole != blocks_per_folio) { folio_zero_segment(folio, first_hole << blkbits, folio_size(folio)); if (first_hole == 0) { @@ -367,11 +373,11 @@ int ext4_mpage_readpages(struct inode *inode, if (((map.m_flags & EXT4_MAP_BOUNDARY) && (relative_block == map.m_len)) || - (first_hole != blocks_per_page)) { + (first_hole != blocks_per_folio)) { submit_bio(bio); bio = NULL; } else - last_block_in_bio = first_block + blocks_per_page - 1; + last_block_in_bio = first_block + blocks_per_folio - 1; continue; confused: if (bio) { diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c index b7ff0d955f0d..050f26168d97 100644 --- a/fs/ext4/resize.c +++ b/fs/ext4/resize.c @@ -1119,7 +1119,7 @@ static inline void ext4_set_block_group_nr(struct super_block *sb, char *data, es->s_block_group_nr = cpu_to_le16(group); if (ext4_has_feature_metadata_csum(sb)) - es->s_checksum = ext4_superblock_csum(sb, es); + es->s_checksum = ext4_superblock_csum(es); } /* diff --git a/fs/ext4/super.c b/fs/ext4/super.c index 181934499624..a7f80ca01174 100644 --- a/fs/ext4/super.c +++ b/fs/ext4/super.c @@ -286,14 +286,12 @@ static int ext4_verify_csum_type(struct super_block *sb, return es->s_checksum_type == EXT4_CRC32C_CHKSUM; } -__le32 ext4_superblock_csum(struct super_block *sb, - struct ext4_super_block *es) +__le32 ext4_superblock_csum(struct ext4_super_block *es) { - struct ext4_sb_info *sbi = EXT4_SB(sb); int offset = offsetof(struct ext4_super_block, s_checksum); __u32 csum; - csum = ext4_chksum(sbi, ~0, (char *)es, offset); + csum = ext4_chksum(~0, (char *)es, offset); return cpu_to_le32(csum); } @@ -304,7 +302,7 @@ static int ext4_superblock_csum_verify(struct super_block *sb, if (!ext4_has_feature_metadata_csum(sb)) return 1; - return es->s_checksum == ext4_superblock_csum(sb, es); + return es->s_checksum == ext4_superblock_csum(es); } void ext4_superblock_csum_set(struct super_block *sb) @@ -314,7 +312,7 @@ void ext4_superblock_csum_set(struct super_block *sb) if (!ext4_has_feature_metadata_csum(sb)) return; - es->s_checksum = ext4_superblock_csum(sb, es); + es->s_checksum = ext4_superblock_csum(es); } ext4_fsblk_t ext4_block_bitmap(struct super_block *sb, @@ -508,21 +506,9 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn) ext4_maybe_update_superblock(sb); } -/* - * This writepage callback for write_cache_pages() - * takes care of a few cases after page cleaning. - * - * write_cache_pages() already checks for dirty pages - * and calls clear_page_dirty_for_io(), which we want, - * to write protect the pages. - * - * However, we may have to redirty a page (see below.) 
- */ -static int ext4_journalled_writepage_callback(struct folio *folio, - struct writeback_control *wbc, - void *data) +static bool ext4_journalled_writepage_needs_redirty(struct jbd2_inode *jinode, + struct folio *folio) { - transaction_t *transaction = (transaction_t *) data; struct buffer_head *bh, *head; struct journal_head *jh; @@ -543,15 +529,12 @@ static int ext4_journalled_writepage_callback(struct folio *folio, */ jh = bh2jh(bh); if (buffer_dirty(bh) || - (jh && (jh->b_transaction != transaction || - jh->b_next_transaction))) { - folio_redirty_for_writepage(wbc, folio); - goto out; - } + (jh && (jh->b_transaction != jinode->i_transaction || + jh->b_next_transaction))) + return true; } while ((bh = bh->b_this_page) != head); -out: - return AOP_WRITEPAGE_ACTIVATE; + return false; } static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) @@ -563,10 +546,23 @@ static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode) .range_start = jinode->i_dirty_start, .range_end = jinode->i_dirty_end, }; + struct folio *folio = NULL; + int error; - return write_cache_pages(mapping, &wbc, - ext4_journalled_writepage_callback, - jinode->i_transaction); + /* + * writeback_iter() already checks for dirty pages and calls + * folio_clear_dirty_for_io(), which we want to write protect the + * folios. + * + * However, we may have to redirty a folio sometimes. + */ + while ((folio = writeback_iter(mapping, &wbc, folio, &error))) { + if (ext4_journalled_writepage_needs_redirty(jinode, folio)) + folio_redirty_for_writepage(&wbc, folio); + folio_unlock(folio); + } + + return error; } static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode) @@ -1415,7 +1411,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb) ei->i_datasync_tid = 0; INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work); ext4_fc_init_inode(&ei->vfs_inode); - mutex_init(&ei->i_fc_lock); + spin_lock_init(&ei->i_fc_lock); return &ei->vfs_inode; } @@ -1809,7 +1805,6 @@ static const struct fs_parameter_spec ext4_param_specs[] = { {} }; -#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3)) #define MOPT_SET 0x0001 #define MOPT_CLEAR 0x0002 @@ -3209,14 +3204,14 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group, __u32 csum32; __u16 dummy_csum = 0; - csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group, + csum32 = ext4_chksum(sbi->s_csum_seed, (__u8 *)&le_group, sizeof(le_group)); - csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset); - csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum, + csum32 = ext4_chksum(csum32, (__u8 *)gdp, offset); + csum32 = ext4_chksum(csum32, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); if (offset < sbi->s_desc_size) - csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset, + csum32 = ext4_chksum(csum32, (__u8 *)gdp + offset, sbi->s_desc_size - offset); crc = csum32 & 0xFFFF; @@ -4441,13 +4436,16 @@ static int ext4_handle_clustersize(struct super_block *sb) /* * ext4_atomic_write_init: Initializes filesystem min & max atomic write units. + * With non-bigalloc filesystem awu will be based upon filesystem blocksize + * & bdev awu units. + * With bigalloc it will be based upon bigalloc cluster size & bdev awu units. 
* @sb: super block - * TODO: Later add support for bigalloc */ static void ext4_atomic_write_init(struct super_block *sb) { struct ext4_sb_info *sbi = EXT4_SB(sb); struct block_device *bdev = sb->s_bdev; + unsigned int clustersize = EXT4_CLUSTER_SIZE(sb); if (!bdev_can_atomic_write(bdev)) return; @@ -4457,7 +4455,7 @@ static void ext4_atomic_write_init(struct super_block *sb) sbi->s_awu_min = max(sb->s_blocksize, bdev_atomic_write_unit_min_bytes(bdev)); - sbi->s_awu_max = min(sb->s_blocksize, + sbi->s_awu_max = min(clustersize, bdev_atomic_write_unit_max_bytes(bdev)); if (sbi->s_awu_min && sbi->s_awu_max && sbi->s_awu_min <= sbi->s_awu_max) { @@ -4482,7 +4480,7 @@ static void ext4_fast_commit_init(struct super_block *sb) sbi->s_fc_bytes = 0; ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE); sbi->s_fc_ineligible_tid = 0; - spin_lock_init(&sbi->s_fc_lock); + mutex_init(&sbi->s_fc_lock); memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats)); sbi->s_fc_replay_state.fc_regions = NULL; sbi->s_fc_replay_state.fc_regions_size = 0; @@ -4644,7 +4642,7 @@ static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_blo sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed); else if (ext4_has_feature_metadata_csum(sb) || ext4_has_feature_ea_inode(sb)) - sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid, + sbi->s_csum_seed = ext4_chksum(~0, es->s_uuid, sizeof(es->s_uuid)); return 0; } @@ -5255,7 +5253,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb) /* Set defaults for the variables that will be set during parsing */ if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO)) - ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS; sbi->s_sectors_written_start = @@ -5916,7 +5914,7 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb, if ((le32_to_cpu(es->s_feature_ro_compat) & EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) && - es->s_checksum != ext4_superblock_csum(sb, es)) { + es->s_checksum != ext4_superblock_csum(es)) { ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock"); errno = -EFSCORRUPTED; goto out_bh; @@ -6495,7 +6493,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb) ctx->journal_ioprio = sbi->s_journal->j_task->io_context->ioprio; else - ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO; + ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO; } diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c index 7ab8f2e8e815..8d15acbacc20 100644 --- a/fs/ext4/xattr.c +++ b/fs/ext4/xattr.c @@ -139,12 +139,12 @@ static __le32 ext4_xattr_block_csum(struct inode *inode, __u32 dummy_csum = 0; int offset = offsetof(struct ext4_xattr_header, h_checksum); - csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr, + csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr)); - csum = ext4_chksum(sbi, csum, (__u8 *)hdr, offset); - csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); + csum = ext4_chksum(csum, (__u8 *)hdr, offset); + csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum)); offset += sizeof(dummy_csum); - csum = ext4_chksum(sbi, csum, (__u8 *)hdr + offset, + csum = ext4_chksum(csum, (__u8 *)hdr + offset, EXT4_BLOCK_SIZE(inode->i_sb) - offset); return cpu_to_le32(csum); @@ -348,7 +348,7 @@ xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry, static u32 ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size) { - return ext4_chksum(sbi, 
sbi->s_csum_seed, buffer, size); + return ext4_chksum(sbi->s_csum_seed, buffer, size); } static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode) diff --git a/fs/f2fs/acl.c b/fs/f2fs/acl.c index 1fbc0607363b..d4d7f329d23f 100644 --- a/fs/f2fs/acl.c +++ b/fs/f2fs/acl.c @@ -166,7 +166,7 @@ fail: } static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, - struct page *dpage) + struct folio *dfolio) { int name_index = F2FS_XATTR_INDEX_POSIX_ACL_DEFAULT; void *value = NULL; @@ -176,13 +176,13 @@ static struct posix_acl *__f2fs_get_acl(struct inode *inode, int type, if (type == ACL_TYPE_ACCESS) name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; - retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dpage); + retval = f2fs_getxattr(inode, name_index, "", NULL, 0, dfolio); if (retval > 0) { value = f2fs_kmalloc(F2FS_I_SB(inode), retval, GFP_F2FS_ZERO); if (!value) return ERR_PTR(-ENOMEM); retval = f2fs_getxattr(inode, name_index, "", value, - retval, dpage); + retval, dfolio); } if (retval > 0) @@ -227,7 +227,7 @@ static int f2fs_acl_update_mode(struct mnt_idmap *idmap, static int __f2fs_set_acl(struct mnt_idmap *idmap, struct inode *inode, int type, - struct posix_acl *acl, struct page *ipage) + struct posix_acl *acl, struct folio *ifolio) { int name_index; void *value = NULL; @@ -238,9 +238,8 @@ static int __f2fs_set_acl(struct mnt_idmap *idmap, switch (type) { case ACL_TYPE_ACCESS: name_index = F2FS_XATTR_INDEX_POSIX_ACL_ACCESS; - if (acl && !ipage) { - error = f2fs_acl_update_mode(idmap, inode, - &mode, &acl); + if (acl && !ifolio) { + error = f2fs_acl_update_mode(idmap, inode, &mode, &acl); if (error) return error; set_acl_inode(inode, mode); @@ -265,7 +264,7 @@ static int __f2fs_set_acl(struct mnt_idmap *idmap, } } - error = f2fs_setxattr(inode, name_index, "", value, size, ipage, 0); + error = f2fs_setxattr(inode, name_index, "", value, size, ifolio, 0); kfree(value); if (!error) @@ -360,7 +359,7 @@ static int f2fs_acl_create_masq(struct posix_acl *acl, umode_t *mode_p) static int f2fs_acl_create(struct inode *dir, umode_t *mode, struct posix_acl **default_acl, struct posix_acl **acl, - struct page *dpage) + struct folio *dfolio) { struct posix_acl *p; struct posix_acl *clone; @@ -372,7 +371,7 @@ static int f2fs_acl_create(struct inode *dir, umode_t *mode, if (S_ISLNK(*mode) || !IS_POSIXACL(dir)) return 0; - p = __f2fs_get_acl(dir, ACL_TYPE_DEFAULT, dpage); + p = __f2fs_get_acl(dir, ACL_TYPE_DEFAULT, dfolio); if (!p || p == ERR_PTR(-EOPNOTSUPP)) { *mode &= ~current_umask(); return 0; @@ -409,29 +408,29 @@ release_acl: return ret; } -int f2fs_init_acl(struct inode *inode, struct inode *dir, struct page *ipage, - struct page *dpage) +int f2fs_init_acl(struct inode *inode, struct inode *dir, struct folio *ifolio, + struct folio *dfolio) { struct posix_acl *default_acl = NULL, *acl = NULL; int error; - error = f2fs_acl_create(dir, &inode->i_mode, &default_acl, &acl, dpage); + error = f2fs_acl_create(dir, &inode->i_mode, &default_acl, &acl, dfolio); if (error) return error; f2fs_mark_inode_dirty_sync(inode, true); if (default_acl) { - error = __f2fs_set_acl(NULL, inode, ACL_TYPE_DEFAULT, default_acl, - ipage); + error = __f2fs_set_acl(NULL, inode, ACL_TYPE_DEFAULT, + default_acl, ifolio); posix_acl_release(default_acl); } else { inode->i_default_acl = NULL; } if (acl) { if (!error) - error = __f2fs_set_acl(NULL, inode, ACL_TYPE_ACCESS, acl, - ipage); + error = __f2fs_set_acl(NULL, inode, ACL_TYPE_ACCESS, + acl, ifolio); posix_acl_release(acl); } else { inode->i_acl = NULL; diff 
--git a/fs/f2fs/acl.h b/fs/f2fs/acl.h index 94ebfbfbdc6f..20e87e63c089 100644 --- a/fs/f2fs/acl.h +++ b/fs/f2fs/acl.h @@ -33,17 +33,17 @@ struct f2fs_acl_header { #ifdef CONFIG_F2FS_FS_POSIX_ACL -extern struct posix_acl *f2fs_get_acl(struct inode *, int, bool); -extern int f2fs_set_acl(struct mnt_idmap *, struct dentry *, +struct posix_acl *f2fs_get_acl(struct inode *, int, bool); +int f2fs_set_acl(struct mnt_idmap *, struct dentry *, struct posix_acl *, int); -extern int f2fs_init_acl(struct inode *, struct inode *, struct page *, - struct page *); +int f2fs_init_acl(struct inode *, struct inode *, struct folio *ifolio, + struct folio *dfolio); #else #define f2fs_get_acl NULL #define f2fs_set_acl NULL static inline int f2fs_init_acl(struct inode *inode, struct inode *dir, - struct page *ipage, struct page *dpage) + struct folio *ifolio, struct folio *dfolio) { return 0; } diff --git a/fs/f2fs/checkpoint.c b/fs/f2fs/checkpoint.c index cf77987d0698..f149ec28aefd 100644 --- a/fs/f2fs/checkpoint.c +++ b/fs/f2fs/checkpoint.c @@ -29,7 +29,7 @@ struct kmem_cache *f2fs_inode_entry_slab; void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, unsigned char reason) { - f2fs_build_fault_attr(sbi, 0, 0); + f2fs_build_fault_attr(sbi, 0, 0, FAULT_ALL); if (!end_io) f2fs_flush_merged_writes(sbi); f2fs_handle_critical_error(sbi, reason); @@ -38,23 +38,23 @@ void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, /* * We guarantee no failure on the returned page. */ -struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +struct folio *f2fs_grab_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index) { struct address_space *mapping = META_MAPPING(sbi); - struct page *page; + struct folio *folio; repeat: - page = f2fs_grab_cache_page(mapping, index, false); - if (!page) { + folio = f2fs_grab_cache_folio(mapping, index, false); + if (IS_ERR(folio)) { cond_resched(); goto repeat; } - f2fs_wait_on_page_writeback(page, META, true, true); - if (!PageUptodate(page)) - SetPageUptodate(page); - return page; + f2fs_folio_wait_writeback(folio, META, true, true); + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + return folio; } -static struct page *__get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index, +static struct folio *__get_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index, bool is_meta) { struct address_space *mapping = META_MAPPING(sbi); @@ -93,7 +93,7 @@ repeat: f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, F2FS_BLKSIZE); folio_lock(folio); - if (unlikely(folio->mapping != mapping)) { + if (unlikely(!is_meta_folio(folio))) { f2fs_folio_put(folio, true); goto repeat; } @@ -104,34 +104,34 @@ repeat: return ERR_PTR(-EIO); } out: - return &folio->page; + return folio; } -struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index) +struct folio *f2fs_get_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index) { - return __get_meta_page(sbi, index, true); + return __get_meta_folio(sbi, index, true); } -struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index) +struct folio *f2fs_get_meta_folio_retry(struct f2fs_sb_info *sbi, pgoff_t index) { - struct page *page; + struct folio *folio; int count = 0; retry: - page = __get_meta_page(sbi, index, true); - if (IS_ERR(page)) { - if (PTR_ERR(page) == -EIO && + folio = __get_meta_folio(sbi, index, true); + if (IS_ERR(folio)) { + if (PTR_ERR(folio) == -EIO && ++count <= DEFAULT_RETRY_IO_COUNT) goto retry; f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_META_PAGE); } - return page; + return 
folio; } /* for POR only */ -struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index) +struct folio *f2fs_get_tmp_folio(struct f2fs_sb_info *sbi, pgoff_t index) { - return __get_meta_page(sbi, index, false); + return __get_meta_folio(sbi, index, false); } static bool __is_bitmap_valid(struct f2fs_sb_info *sbi, block_t blkaddr, @@ -252,7 +252,6 @@ bool f2fs_is_valid_blkaddr_raw(struct f2fs_sb_info *sbi, int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, int type, bool sync) { - struct page *page; block_t blkno = start; struct f2fs_io_info fio = { .sbi = sbi, @@ -271,6 +270,7 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, blk_start_plug(&plug); for (; nrpages-- > 0; blkno++) { + struct folio *folio; if (!f2fs_is_valid_blkaddr(sbi, blkno, type)) goto out; @@ -300,18 +300,18 @@ int f2fs_ra_meta_pages(struct f2fs_sb_info *sbi, block_t start, int nrpages, BUG(); } - page = f2fs_grab_cache_page(META_MAPPING(sbi), + folio = f2fs_grab_cache_folio(META_MAPPING(sbi), fio.new_blkaddr, false); - if (!page) + if (IS_ERR(folio)) continue; - if (PageUptodate(page)) { - f2fs_put_page(page, 1); + if (folio_test_uptodate(folio)) { + f2fs_folio_put(folio, true); continue; } - fio.page = page; + fio.page = &folio->page; err = f2fs_submit_page_bio(&fio); - f2fs_put_page(page, err ? 1 : 0); + f2fs_folio_put(folio, err ? true : false); if (!err) f2fs_update_iostat(sbi, NULL, FS_META_READ_IO, @@ -325,27 +325,26 @@ out: void f2fs_ra_meta_pages_cond(struct f2fs_sb_info *sbi, pgoff_t index, unsigned int ra_blocks) { - struct page *page; + struct folio *folio; bool readahead = false; if (ra_blocks == RECOVERY_MIN_RA_BLOCKS) return; - page = find_get_page(META_MAPPING(sbi), index); - if (!page || !PageUptodate(page)) + folio = filemap_get_folio(META_MAPPING(sbi), index); + if (IS_ERR(folio) || !folio_test_uptodate(folio)) readahead = true; - f2fs_put_page(page, 0); + f2fs_folio_put(folio, false); if (readahead) f2fs_ra_meta_pages(sbi, index, ra_blocks, META_POR, true); } -static int __f2fs_write_meta_page(struct page *page, +static bool __f2fs_write_meta_folio(struct folio *folio, struct writeback_control *wbc, enum iostat_type io_type) { - struct f2fs_sb_info *sbi = F2FS_P_SB(page); - struct folio *folio = page_folio(page); + struct f2fs_sb_info *sbi = F2FS_F_SB(folio); trace_f2fs_writepage(folio, META); @@ -354,31 +353,26 @@ static int __f2fs_write_meta_page(struct page *page, folio_clear_uptodate(folio); dec_page_count(sbi, F2FS_DIRTY_META); folio_unlock(folio); - return 0; + return true; } goto redirty_out; } if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) goto redirty_out; - if (wbc->for_reclaim && folio->index < GET_SUM_BLOCK(sbi, 0)) - goto redirty_out; f2fs_do_write_meta_page(sbi, folio, io_type); dec_page_count(sbi, F2FS_DIRTY_META); - if (wbc->for_reclaim) - f2fs_submit_merged_write_cond(sbi, NULL, page, 0, META); - folio_unlock(folio); if (unlikely(f2fs_cp_error(sbi))) f2fs_submit_merged_write(sbi, META); - return 0; + return true; redirty_out: - redirty_page_for_writepage(wbc, page); - return AOP_WRITEPAGE_ACTIVATE; + folio_redirty_for_writepage(wbc, folio); + return false; } static int f2fs_write_meta_pages(struct address_space *mapping, @@ -421,9 +415,7 @@ long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, struct folio_batch fbatch; long nwritten = 0; int nr_folios; - struct writeback_control wbc = { - .for_reclaim = 0, - }; + struct writeback_control wbc = {}; struct blk_plug plug; 
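f2fs_get_meta_folio_retry() above preserves the old behaviour across the folio conversion: a meta read that fails with -EIO is retried a bounded number of times before the checkpoint is declared broken. A hedged userspace sketch of that retry shape follows; RETRY_IO_COUNT and fake_read_meta_block() are stand-ins for DEFAULT_RETRY_IO_COUNT and the real meta read, not kernel APIs.

#include <errno.h>
#include <stdio.h>

#define RETRY_IO_COUNT 8        /* stand-in for DEFAULT_RETRY_IO_COUNT */

/* Pretend meta read: fails with -EIO twice, then succeeds. */
static int fake_read_meta_block(long index, int *attempts)
{
        (void)index;
        return (++(*attempts) <= 2) ? -EIO : 0;
}

static int get_meta_block_retry(long index)
{
        int attempts = 0, count = 0, err;

retry:
        err = fake_read_meta_block(index, &attempts);
        if (err == -EIO && ++count <= RETRY_IO_COUNT)
                goto retry;
        if (err)
                fprintf(stderr, "giving up: would stop the checkpoint here\n");
        return err;
}

int main(void)
{
        printf("result=%d (0 means the read eventually succeeded)\n",
               get_meta_block_retry(42));
        return 0;
}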
folio_batch_init(&fbatch); @@ -447,7 +439,7 @@ long f2fs_sync_meta_pages(struct f2fs_sb_info *sbi, enum page_type type, folio_lock(folio); - if (unlikely(folio->mapping != mapping)) { + if (unlikely(!is_meta_folio(folio))) { continue_unlock: folio_unlock(folio); continue; @@ -457,13 +449,12 @@ continue_unlock: goto continue_unlock; } - f2fs_wait_on_page_writeback(&folio->page, META, - true, true); + f2fs_folio_wait_writeback(folio, META, true, true); if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; - if (__f2fs_write_meta_page(&folio->page, &wbc, + if (!__f2fs_write_meta_folio(folio, &wbc, io_type)) { folio_unlock(folio); break; @@ -513,6 +504,7 @@ static void __add_ino_entry(struct f2fs_sb_info *sbi, nid_t ino, { struct inode_management *im = &sbi->im[type]; struct ino_entry *e = NULL, *new = NULL; + int ret; if (type == FLUSH_INO) { rcu_read_lock(); @@ -525,7 +517,8 @@ retry: new = f2fs_kmem_cache_alloc(ino_entry_slab, GFP_NOFS, true, NULL); - radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); + ret = radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); + f2fs_bug_on(sbi, ret); spin_lock(&im->ino_lock); e = radix_tree_lookup(&im->ino_root, ino); @@ -750,26 +743,26 @@ int f2fs_recover_orphan_inodes(struct f2fs_sb_info *sbi) f2fs_ra_meta_pages(sbi, start_blk, orphan_blocks, META_CP, true); for (i = 0; i < orphan_blocks; i++) { - struct page *page; + struct folio *folio; struct f2fs_orphan_block *orphan_blk; - page = f2fs_get_meta_page(sbi, start_blk + i); - if (IS_ERR(page)) { - err = PTR_ERR(page); + folio = f2fs_get_meta_folio(sbi, start_blk + i); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); goto out; } - orphan_blk = (struct f2fs_orphan_block *)page_address(page); + orphan_blk = folio_address(folio); for (j = 0; j < le32_to_cpu(orphan_blk->entry_count); j++) { nid_t ino = le32_to_cpu(orphan_blk->ino[j]); err = recover_orphan_inode(sbi, ino); if (err) { - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); goto out; } } - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); } /* clear Orphan Flag */ clear_ckpt_flags(sbi, CP_ORPHAN_PRESENT_FLAG); @@ -786,7 +779,7 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) unsigned int nentries = 0; unsigned short index = 1; unsigned short orphan_blocks; - struct page *page = NULL; + struct folio *folio = NULL; struct ino_entry *orphan = NULL; struct inode_management *im = &sbi->im[ORPHAN_INO]; @@ -801,10 +794,9 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) /* loop for each orphan inode entry and write them in journal block */ list_for_each_entry(orphan, head, list) { - if (!page) { - page = f2fs_grab_meta_page(sbi, start_blk++); - orphan_blk = - (struct f2fs_orphan_block *)page_address(page); + if (!folio) { + folio = f2fs_grab_meta_folio(sbi, start_blk++); + orphan_blk = folio_address(folio); memset(orphan_blk, 0, sizeof(*orphan_blk)); } @@ -819,62 +811,61 @@ static void write_orphan_inodes(struct f2fs_sb_info *sbi, block_t start_blk) orphan_blk->blk_addr = cpu_to_le16(index); orphan_blk->blk_count = cpu_to_le16(orphan_blocks); orphan_blk->entry_count = cpu_to_le32(nentries); - set_page_dirty(page); - f2fs_put_page(page, 1); + folio_mark_dirty(folio); + f2fs_folio_put(folio, true); index++; nentries = 0; - page = NULL; + folio = NULL; } } - if (page) { + if (folio) { orphan_blk->blk_addr = cpu_to_le16(index); orphan_blk->blk_count = cpu_to_le16(orphan_blocks); orphan_blk->entry_count = cpu_to_le32(nentries); - set_page_dirty(page); - f2fs_put_page(page, 1); + 
folio_mark_dirty(folio); + f2fs_folio_put(folio, true); } } -static __u32 f2fs_checkpoint_chksum(struct f2fs_sb_info *sbi, - struct f2fs_checkpoint *ckpt) +static __u32 f2fs_checkpoint_chksum(struct f2fs_checkpoint *ckpt) { unsigned int chksum_ofs = le32_to_cpu(ckpt->checksum_offset); __u32 chksum; - chksum = f2fs_crc32(sbi, ckpt, chksum_ofs); + chksum = f2fs_crc32(ckpt, chksum_ofs); if (chksum_ofs < CP_CHKSUM_OFFSET) { chksum_ofs += sizeof(chksum); - chksum = f2fs_chksum(sbi, chksum, (__u8 *)ckpt + chksum_ofs, - F2FS_BLKSIZE - chksum_ofs); + chksum = f2fs_chksum(chksum, (__u8 *)ckpt + chksum_ofs, + F2FS_BLKSIZE - chksum_ofs); } return chksum; } static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, - struct f2fs_checkpoint **cp_block, struct page **cp_page, + struct f2fs_checkpoint **cp_block, struct folio **cp_folio, unsigned long long *version) { size_t crc_offset = 0; __u32 crc; - *cp_page = f2fs_get_meta_page(sbi, cp_addr); - if (IS_ERR(*cp_page)) - return PTR_ERR(*cp_page); + *cp_folio = f2fs_get_meta_folio(sbi, cp_addr); + if (IS_ERR(*cp_folio)) + return PTR_ERR(*cp_folio); - *cp_block = (struct f2fs_checkpoint *)page_address(*cp_page); + *cp_block = folio_address(*cp_folio); crc_offset = le32_to_cpu((*cp_block)->checksum_offset); if (crc_offset < CP_MIN_CHKSUM_OFFSET || crc_offset > CP_CHKSUM_OFFSET) { - f2fs_put_page(*cp_page, 1); + f2fs_folio_put(*cp_folio, true); f2fs_warn(sbi, "invalid crc_offset: %zu", crc_offset); return -EINVAL; } - crc = f2fs_checkpoint_chksum(sbi, *cp_block); + crc = f2fs_checkpoint_chksum(*cp_block); if (crc != cur_cp_crc(*cp_block)) { - f2fs_put_page(*cp_page, 1); + f2fs_folio_put(*cp_folio, true); f2fs_warn(sbi, "invalid crc value"); return -EINVAL; } @@ -883,17 +874,17 @@ static int get_checkpoint_version(struct f2fs_sb_info *sbi, block_t cp_addr, return 0; } -static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, +static struct folio *validate_checkpoint(struct f2fs_sb_info *sbi, block_t cp_addr, unsigned long long *version) { - struct page *cp_page_1 = NULL, *cp_page_2 = NULL; + struct folio *cp_folio_1 = NULL, *cp_folio_2 = NULL; struct f2fs_checkpoint *cp_block = NULL; unsigned long long cur_version = 0, pre_version = 0; unsigned int cp_blocks; int err; err = get_checkpoint_version(sbi, cp_addr, &cp_block, - &cp_page_1, version); + &cp_folio_1, version); if (err) return NULL; @@ -908,19 +899,19 @@ static struct page *validate_checkpoint(struct f2fs_sb_info *sbi, cp_addr += cp_blocks - 1; err = get_checkpoint_version(sbi, cp_addr, &cp_block, - &cp_page_2, version); + &cp_folio_2, version); if (err) goto invalid_cp; cur_version = *version; if (cur_version == pre_version) { *version = cur_version; - f2fs_put_page(cp_page_2, 1); - return cp_page_1; + f2fs_folio_put(cp_folio_2, true); + return cp_folio_1; } - f2fs_put_page(cp_page_2, 1); + f2fs_folio_put(cp_folio_2, true); invalid_cp: - f2fs_put_page(cp_page_1, 1); + f2fs_folio_put(cp_folio_1, true); return NULL; } @@ -928,7 +919,7 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) { struct f2fs_checkpoint *cp_block; struct f2fs_super_block *fsb = sbi->raw_super; - struct page *cp1, *cp2, *cur_page; + struct folio *cp1, *cp2, *cur_folio; unsigned long blk_size = sbi->blocksize; unsigned long long cp1_version = 0, cp2_version = 0; unsigned long long cp_start_blk_no; @@ -955,22 +946,22 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) if (cp1 && cp2) { if (ver_after(cp2_version, cp1_version)) - cur_page = cp2; + cur_folio = cp2; else - cur_page = cp1; + 
cur_folio = cp1; } else if (cp1) { - cur_page = cp1; + cur_folio = cp1; } else if (cp2) { - cur_page = cp2; + cur_folio = cp2; } else { err = -EFSCORRUPTED; goto fail_no_cp; } - cp_block = (struct f2fs_checkpoint *)page_address(cur_page); + cp_block = folio_address(cur_folio); memcpy(sbi->ckpt, cp_block, blk_size); - if (cur_page == cp1) + if (cur_folio == cp1) sbi->cur_cp_pack = 1; else sbi->cur_cp_pack = 2; @@ -985,30 +976,30 @@ int f2fs_get_valid_checkpoint(struct f2fs_sb_info *sbi) goto done; cp_blk_no = le32_to_cpu(fsb->cp_blkaddr); - if (cur_page == cp2) + if (cur_folio == cp2) cp_blk_no += BIT(le32_to_cpu(fsb->log_blocks_per_seg)); for (i = 1; i < cp_blks; i++) { void *sit_bitmap_ptr; unsigned char *ckpt = (unsigned char *)sbi->ckpt; - cur_page = f2fs_get_meta_page(sbi, cp_blk_no + i); - if (IS_ERR(cur_page)) { - err = PTR_ERR(cur_page); + cur_folio = f2fs_get_meta_folio(sbi, cp_blk_no + i); + if (IS_ERR(cur_folio)) { + err = PTR_ERR(cur_folio); goto free_fail_no_cp; } - sit_bitmap_ptr = page_address(cur_page); + sit_bitmap_ptr = folio_address(cur_folio); memcpy(ckpt + i * blk_size, sit_bitmap_ptr, blk_size); - f2fs_put_page(cur_page, 1); + f2fs_folio_put(cur_folio, true); } done: - f2fs_put_page(cp1, 1); - f2fs_put_page(cp2, 1); + f2fs_folio_put(cp1, true); + f2fs_folio_put(cp2, true); return 0; free_fail_no_cp: - f2fs_put_page(cp1, 1); - f2fs_put_page(cp2, 1); + f2fs_folio_put(cp1, true); + f2fs_folio_put(cp2, true); fail_no_cp: kvfree(sbi->ckpt); return err; @@ -1218,7 +1209,6 @@ static int block_operations(struct f2fs_sb_info *sbi) struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, - .for_reclaim = 0, }; int err = 0, cnt = 0; @@ -1402,35 +1392,31 @@ static void update_ckpt_flags(struct f2fs_sb_info *sbi, struct cp_control *cpc) static void commit_checkpoint(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) { - struct writeback_control wbc = { - .for_reclaim = 0, - }; + struct writeback_control wbc = {}; /* - * filemap_get_folios_tag and lock_page again will take + * filemap_get_folios_tag and folio_lock again will take * some extra time. Therefore, f2fs_update_meta_pages and * f2fs_sync_meta_pages are combined in this function. 
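f2fs_checkpoint_chksum() above (now taking only the checkpoint block) covers the block in up to two ranges so that the embedded checksum field is excluded: the bytes before checksum_offset and, when that offset is not the end-of-block position, the bytes after the 4-byte field as well. Below is a userspace model of the range split; the toy byte-sum stands in for the real f2fs_crc32()/f2fs_chksum() helpers, and the offset passed in main() is an arbitrary example.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define BLKSIZE 4096u

/* Toy stand-in for f2fs_crc32()/f2fs_chksum(): a chained byte sum. */
static uint32_t toy_chksum(uint32_t sum, const uint8_t *p, size_t len)
{
        while (len--)
                sum += *p++;
        return sum;
}

static uint32_t checkpoint_chksum(const uint8_t *blk, uint32_t chksum_ofs)
{
        uint32_t sum = toy_chksum(0, blk, chksum_ofs);

        /* When the checksum field is not at the very end of the block,
         * also cover the bytes after it, skipping the 4-byte field. */
        if (chksum_ofs + sizeof(uint32_t) < BLKSIZE) {
                chksum_ofs += sizeof(uint32_t);
                sum = toy_chksum(sum, blk + chksum_ofs, BLKSIZE - chksum_ofs);
        }
        return sum;
}

int main(void)
{
        uint8_t blk[BLKSIZE];

        memset(blk, 0xab, sizeof(blk));
        printf("chksum=%u\n", (unsigned)checkpoint_chksum(blk, 1020));
        return 0;
}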
*/ - struct page *page = f2fs_grab_meta_page(sbi, blk_addr); - int err; + struct folio *folio = f2fs_grab_meta_folio(sbi, blk_addr); - f2fs_wait_on_page_writeback(page, META, true, true); + memcpy(folio_address(folio), src, PAGE_SIZE); - memcpy(page_address(page), src, PAGE_SIZE); - - set_page_dirty(page); - if (unlikely(!clear_page_dirty_for_io(page))) + folio_mark_dirty(folio); + if (unlikely(!folio_clear_dirty_for_io(folio))) f2fs_bug_on(sbi, 1); /* writeout cp pack 2 page */ - err = __f2fs_write_meta_page(page, &wbc, FS_CP_META_IO); - if (unlikely(err && f2fs_cp_error(sbi))) { - f2fs_put_page(page, 1); - return; + if (unlikely(!__f2fs_write_meta_folio(folio, &wbc, FS_CP_META_IO))) { + if (f2fs_cp_error(sbi)) { + f2fs_folio_put(folio, true); + return; + } + f2fs_bug_on(sbi, true); } - f2fs_bug_on(sbi, err); - f2fs_put_page(page, 0); + f2fs_folio_put(folio, false); /* submit checkpoint (with barrier if NOBARRIER is not set) */ f2fs_submit_merged_write(sbi, META_FLUSH); @@ -1520,7 +1506,7 @@ static int do_checkpoint(struct f2fs_sb_info *sbi, struct cp_control *cpc) get_sit_bitmap(sbi, __bitmap_ptr(sbi, SIT_BITMAP)); get_nat_bitmap(sbi, __bitmap_ptr(sbi, NAT_BITMAP)); - crc32 = f2fs_checkpoint_chksum(sbi, ckpt); + crc32 = f2fs_checkpoint_chksum(ckpt); *((__le32 *)((unsigned char *)ckpt + le32_to_cpu(ckpt->checksum_offset))) = cpu_to_le32(crc32); diff --git a/fs/f2fs/compress.c b/fs/f2fs/compress.c index 9b94810675c1..b3c1df93a163 100644 --- a/fs/f2fs/compress.c +++ b/fs/f2fs/compress.c @@ -82,7 +82,7 @@ bool f2fs_is_compressed_page(struct page *page) if (page_private_nonpointer(page)) return false; - f2fs_bug_on(F2FS_M_SB(page->mapping), + f2fs_bug_on(F2FS_P_SB(page), *((u32 *)page_private(page)) != F2FS_COMPRESSED_PAGE_MAGIC); return true; } @@ -137,9 +137,11 @@ static void f2fs_put_rpages_wbc(struct compress_ctx *cc, } } -struct page *f2fs_compress_control_page(struct page *page) +struct folio *f2fs_compress_control_folio(struct folio *folio) { - return ((struct compress_io_ctx *)page_private(page))->rpages[0]; + struct compress_io_ctx *ctx = folio->private; + + return page_folio(ctx->rpages[0]); } int f2fs_init_compress_ctx(struct compress_ctx *cc) @@ -178,8 +180,8 @@ void f2fs_compress_ctx_add_page(struct compress_ctx *cc, struct folio *folio) #ifdef CONFIG_F2FS_FS_LZO static int lzo_init_compress_ctx(struct compress_ctx *cc) { - cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), - LZO1X_MEM_COMPRESS, GFP_NOFS); + cc->private = f2fs_vmalloc(F2FS_I_SB(cc->inode), + LZO1X_MEM_COMPRESS); if (!cc->private) return -ENOMEM; @@ -189,7 +191,7 @@ static int lzo_init_compress_ctx(struct compress_ctx *cc) static void lzo_destroy_compress_ctx(struct compress_ctx *cc) { - kvfree(cc->private); + vfree(cc->private); cc->private = NULL; } @@ -246,7 +248,7 @@ static int lz4_init_compress_ctx(struct compress_ctx *cc) size = LZ4HC_MEM_COMPRESS; #endif - cc->private = f2fs_kvmalloc(F2FS_I_SB(cc->inode), size, GFP_NOFS); + cc->private = f2fs_vmalloc(F2FS_I_SB(cc->inode), size); if (!cc->private) return -ENOMEM; @@ -261,7 +263,7 @@ static int lz4_init_compress_ctx(struct compress_ctx *cc) static void lz4_destroy_compress_ctx(struct compress_ctx *cc) { - kvfree(cc->private); + vfree(cc->private); cc->private = NULL; } @@ -342,8 +344,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc) params = zstd_get_params(level, cc->rlen); workspace_size = zstd_cstream_workspace_bound(¶ms.cParams); - workspace = f2fs_kvmalloc(F2FS_I_SB(cc->inode), - workspace_size, GFP_NOFS); + workspace = 
f2fs_vmalloc(F2FS_I_SB(cc->inode), workspace_size); if (!workspace) return -ENOMEM; @@ -351,7 +352,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc) if (!stream) { f2fs_err_ratelimited(F2FS_I_SB(cc->inode), "%s zstd_init_cstream failed", __func__); - kvfree(workspace); + vfree(workspace); return -EIO; } @@ -364,7 +365,7 @@ static int zstd_init_compress_ctx(struct compress_ctx *cc) static void zstd_destroy_compress_ctx(struct compress_ctx *cc) { - kvfree(cc->private); + vfree(cc->private); cc->private = NULL; cc->private2 = NULL; } @@ -423,8 +424,7 @@ static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic) workspace_size = zstd_dstream_workspace_bound(max_window_size); - workspace = f2fs_kvmalloc(F2FS_I_SB(dic->inode), - workspace_size, GFP_NOFS); + workspace = f2fs_vmalloc(F2FS_I_SB(dic->inode), workspace_size); if (!workspace) return -ENOMEM; @@ -432,7 +432,7 @@ static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic) if (!stream) { f2fs_err_ratelimited(F2FS_I_SB(dic->inode), "%s zstd_init_dstream failed", __func__); - kvfree(workspace); + vfree(workspace); return -EIO; } @@ -444,7 +444,7 @@ static int zstd_init_decompress_ctx(struct decompress_io_ctx *dic) static void zstd_destroy_decompress_ctx(struct decompress_io_ctx *dic) { - kvfree(dic->private); + vfree(dic->private); dic->private = NULL; dic->private2 = NULL; } @@ -593,11 +593,14 @@ static struct page *f2fs_compress_alloc_page(void) static void f2fs_compress_free_page(struct page *page) { + struct folio *folio; + if (!page) return; - detach_page_private(page); - page->mapping = NULL; - unlock_page(page); + folio = page_folio(page); + folio_detach_private(folio); + folio->mapping = NULL; + folio_unlock(folio); mempool_free(page, compress_page_pool); } @@ -674,8 +677,7 @@ static int f2fs_compress_pages(struct compress_ctx *cc) cc->cbuf->clen = cpu_to_le32(cc->clen); if (fi->i_compress_flag & BIT(COMPRESS_CHKSUM)) - chksum = f2fs_crc32(F2FS_I_SB(cc->inode), - cc->cbuf->cdata, cc->clen); + chksum = f2fs_crc32(cc->cbuf->cdata, cc->clen); cc->cbuf->chksum = cpu_to_le32(chksum); for (i = 0; i < COMPRESS_DATA_RESERVED_SIZE; i++) @@ -771,7 +773,7 @@ void f2fs_decompress_cluster(struct decompress_io_ctx *dic, bool in_task) if (!ret && (fi->i_compress_flag & BIT(COMPRESS_CHKSUM))) { u32 provided = le32_to_cpu(dic->cbuf->chksum); - u32 calculated = f2fs_crc32(sbi, dic->cbuf->cdata, dic->clen); + u32 calculated = f2fs_crc32(dic->cbuf->cdata, dic->clen); if (provided != calculated) { if (!is_inode_flag_set(dic->inode, FI_COMPRESS_CORRUPT)) { @@ -909,7 +911,7 @@ bool f2fs_sanity_check_cluster(struct dnode_of_data *dn) } for (i = 1, count = 1; i < cluster_size; i++, count++) { - block_t blkaddr = data_blkaddr(dn->inode, dn->node_page, + block_t blkaddr = data_blkaddr(dn->inode, dn->node_folio, dn->ofs_in_node + i); /* [COMPR_ADDR, ..., COMPR_ADDR] */ @@ -950,7 +952,7 @@ static int __f2fs_get_cluster_blocks(struct inode *inode, int count, i; for (i = 0, count = 0; i < cluster_size; i++) { - block_t blkaddr = data_blkaddr(dn->inode, dn->node_page, + block_t blkaddr = data_blkaddr(dn->inode, dn->node_folio, dn->ofs_in_node + i); if (__is_valid_data_blkaddr(blkaddr)) @@ -1090,7 +1092,7 @@ static int prepare_compress_overwrite(struct compress_ctx *cc, { struct f2fs_sb_info *sbi = F2FS_I_SB(cc->inode); struct address_space *mapping = cc->inode->i_mapping; - struct page *page; + struct folio *folio; sector_t last_block_in_bio; fgf_t fgp_flag = FGP_LOCK | FGP_WRITE | FGP_CREAT; pgoff_t start_idx = 
start_idx_of_cluster(cc); @@ -1105,19 +1107,19 @@ retry: if (ret) return ret; - /* keep page reference to avoid page reclaim */ + /* keep folio reference to avoid page reclaim */ for (i = 0; i < cc->cluster_size; i++) { - page = f2fs_pagecache_get_page(mapping, start_idx + i, - fgp_flag, GFP_NOFS); - if (!page) { - ret = -ENOMEM; + folio = f2fs_filemap_get_folio(mapping, start_idx + i, + fgp_flag, GFP_NOFS); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); goto unlock_pages; } - if (PageUptodate(page)) - f2fs_put_page(page, 1); + if (folio_test_uptodate(folio)) + f2fs_folio_put(folio, true); else - f2fs_compress_ctx_add_page(cc, page_folio(page)); + f2fs_compress_ctx_add_page(cc, folio); } if (!f2fs_cluster_is_empty(cc)) { @@ -1140,17 +1142,17 @@ retry: for (i = 0; i < cc->cluster_size; i++) { f2fs_bug_on(sbi, cc->rpages[i]); - page = find_lock_page(mapping, start_idx + i); - if (!page) { - /* page can be truncated */ + folio = filemap_lock_folio(mapping, start_idx + i); + if (IS_ERR(folio)) { + /* folio could be truncated */ goto release_and_retry; } - f2fs_wait_on_page_writeback(page, DATA, true, true); - f2fs_compress_ctx_add_page(cc, page_folio(page)); + f2fs_folio_wait_writeback(folio, DATA, true, true); + f2fs_compress_ctx_add_page(cc, folio); - if (!PageUptodate(page)) { - f2fs_handle_page_eio(sbi, page_folio(page), DATA); + if (!folio_test_uptodate(folio)) { + f2fs_handle_page_eio(sbi, folio, DATA); release_and_retry: f2fs_put_rpages(cc); f2fs_unlock_rpages(cc, i + 1); @@ -1317,7 +1319,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, goto out_unlock_op; for (i = 0; i < cc->cluster_size; i++) { - if (data_blkaddr(dn.inode, dn.node_page, + if (data_blkaddr(dn.inode, dn.node_folio, dn.ofs_in_node + i) == NULL_ADDR) goto out_put_dnode; } @@ -1349,7 +1351,7 @@ static int f2fs_write_compressed_pages(struct compress_ctx *cc, page_folio(cc->rpages[i + 1])->index, cic); fio.compressed_page = cc->cpages[i]; - fio.old_blkaddr = data_blkaddr(dn.inode, dn.node_page, + fio.old_blkaddr = data_blkaddr(dn.inode, dn.node_folio, dn.ofs_in_node + i + 1); /* wait for GCed page writeback via META_MAPPING */ @@ -1481,7 +1483,7 @@ void f2fs_compress_write_end_io(struct bio *bio, struct page *page) f2fs_is_compressed_page(page)); int i; - if (unlikely(bio->bi_status)) + if (unlikely(bio->bi_status != BLK_STS_OK)) mapping_set_error(cic->inode->i_mapping, -EIO); f2fs_compress_free_page(page); @@ -1529,37 +1531,38 @@ static int f2fs_write_raw_pages(struct compress_ctx *cc, f2fs_lock_op(sbi); for (i = 0; i < cc->cluster_size; i++) { + struct folio *folio; + if (!cc->rpages[i]) continue; + folio = page_folio(cc->rpages[i]); retry_write: - lock_page(cc->rpages[i]); + folio_lock(folio); - if (cc->rpages[i]->mapping != mapping) { + if (folio->mapping != mapping) { continue_unlock: - unlock_page(cc->rpages[i]); + folio_unlock(folio); continue; } - if (!PageDirty(cc->rpages[i])) + if (!folio_test_dirty(folio)) goto continue_unlock; - if (folio_test_writeback(page_folio(cc->rpages[i]))) { + if (folio_test_writeback(folio)) { if (wbc->sync_mode == WB_SYNC_NONE) goto continue_unlock; - f2fs_wait_on_page_writeback(cc->rpages[i], DATA, true, true); + f2fs_folio_wait_writeback(folio, DATA, true, true); } - if (!clear_page_dirty_for_io(cc->rpages[i])) + if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; submitted = 0; - ret = f2fs_write_single_data_page(page_folio(cc->rpages[i]), - &submitted, + ret = f2fs_write_single_data_page(folio, &submitted, NULL, NULL, wbc, io_type, compr_blocks, false); 
if (ret) { - if (ret == AOP_WRITEPAGE_ACTIVATE) { - unlock_page(cc->rpages[i]); + if (ret == 1) { ret = 0; } else if (ret == -EAGAIN) { ret = 0; @@ -1862,14 +1865,13 @@ void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, } /* - * Put a reference to a compressed page's decompress_io_ctx. + * Put a reference to a compressed folio's decompress_io_ctx. * - * This is called when the page is no longer needed and can be freed. + * This is called when the folio is no longer needed and can be freed. */ -void f2fs_put_page_dic(struct page *page, bool in_task) +void f2fs_put_folio_dic(struct folio *folio, bool in_task) { - struct decompress_io_ctx *dic = - (struct decompress_io_ctx *)page_private(page); + struct decompress_io_ctx *dic = folio->private; f2fs_put_dic(dic, in_task); } @@ -1881,14 +1883,14 @@ void f2fs_put_page_dic(struct page *page, bool in_task) unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn, unsigned int ofs_in_node) { - bool compressed = data_blkaddr(dn->inode, dn->node_page, + bool compressed = data_blkaddr(dn->inode, dn->node_folio, ofs_in_node) == COMPRESS_ADDR; int i = compressed ? 1 : 0; - block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_page, + block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_folio, ofs_in_node + i); for (i += 1; i < F2FS_I(dn->inode)->i_cluster_size; i++) { - block_t blkaddr = data_blkaddr(dn->inode, dn->node_page, + block_t blkaddr = data_blkaddr(dn->inode, dn->node_folio, ofs_in_node + i); if (!__is_valid_data_blkaddr(blkaddr)) @@ -1922,7 +1924,7 @@ void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi, void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, nid_t ino, block_t blkaddr) { - struct page *cpage; + struct folio *cfolio; int ret; if (!test_opt(sbi, COMPRESS_CACHE)) @@ -1934,49 +1936,49 @@ void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, if (!f2fs_available_free_memory(sbi, COMPRESS_PAGE)) return; - cpage = find_get_page(COMPRESS_MAPPING(sbi), blkaddr); - if (cpage) { - f2fs_put_page(cpage, 0); + cfolio = filemap_get_folio(COMPRESS_MAPPING(sbi), blkaddr); + if (!IS_ERR(cfolio)) { + f2fs_folio_put(cfolio, false); return; } - cpage = alloc_page(__GFP_NOWARN | __GFP_IO); - if (!cpage) + cfolio = filemap_alloc_folio(__GFP_NOWARN | __GFP_IO, 0); + if (!cfolio) return; - ret = add_to_page_cache_lru(cpage, COMPRESS_MAPPING(sbi), + ret = filemap_add_folio(COMPRESS_MAPPING(sbi), cfolio, blkaddr, GFP_NOFS); if (ret) { - f2fs_put_page(cpage, 0); + f2fs_folio_put(cfolio, false); return; } - set_page_private_data(cpage, ino); + set_page_private_data(&cfolio->page, ino); - memcpy(page_address(cpage), page_address(page), PAGE_SIZE); - SetPageUptodate(cpage); - f2fs_put_page(cpage, 1); + memcpy(folio_address(cfolio), page_address(page), PAGE_SIZE); + folio_mark_uptodate(cfolio); + f2fs_folio_put(cfolio, true); } -bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page, +bool f2fs_load_compressed_folio(struct f2fs_sb_info *sbi, struct folio *folio, block_t blkaddr) { - struct page *cpage; + struct folio *cfolio; bool hitted = false; if (!test_opt(sbi, COMPRESS_CACHE)) return false; - cpage = f2fs_pagecache_get_page(COMPRESS_MAPPING(sbi), + cfolio = f2fs_filemap_get_folio(COMPRESS_MAPPING(sbi), blkaddr, FGP_LOCK | FGP_NOWAIT, GFP_NOFS); - if (cpage) { - if (PageUptodate(cpage)) { + if (!IS_ERR(cfolio)) { + if (folio_test_uptodate(cfolio)) { atomic_inc(&sbi->compress_page_hit); - memcpy(page_address(page), - 
page_address(cpage), PAGE_SIZE); + memcpy(folio_address(folio), + folio_address(cfolio), folio_size(folio)); hitted = true; } - f2fs_put_page(cpage, 1); + f2fs_folio_put(cfolio, true); } return hitted; diff --git a/fs/f2fs/data.c b/fs/f2fs/data.c index 54f89f0ee69b..31e892842625 100644 --- a/fs/f2fs/data.c +++ b/fs/f2fs/data.c @@ -49,12 +49,12 @@ void f2fs_destroy_bioset(void) bool f2fs_is_cp_guaranteed(struct page *page) { - struct address_space *mapping = page->mapping; + struct address_space *mapping = page_folio(page)->mapping; struct inode *inode; struct f2fs_sb_info *sbi; - if (!mapping) - return false; + if (fscrypt_is_bounce_page(page)) + return page_private_gcing(fscrypt_pagecache_page(page)); inode = mapping->host; sbi = F2FS_I_SB(inode); @@ -146,12 +146,12 @@ static void f2fs_finish_read_bio(struct bio *bio, bool in_task) if (ctx && !ctx->decompression_attempted) f2fs_end_read_compressed_page(&folio->page, true, 0, in_task); - f2fs_put_page_dic(&folio->page, in_task); + f2fs_put_folio_dic(folio, in_task); continue; } dec_page_count(F2FS_F_SB(folio), __read_io_type(folio)); - folio_end_read(folio, bio->bi_status == 0); + folio_end_read(folio, bio->bi_status == BLK_STS_OK); } if (ctx) @@ -290,7 +290,7 @@ static void f2fs_read_end_io(struct bio *bio) if (time_to_inject(sbi, FAULT_READ_IO)) bio->bi_status = BLK_STS_IOERR; - if (bio->bi_status) { + if (bio->bi_status != BLK_STS_OK) { f2fs_finish_read_bio(bio, intask); return; } @@ -347,19 +347,19 @@ static void f2fs_write_end_io(struct bio *bio) type = WB_DATA_TYPE(&folio->page, false); - if (unlikely(bio->bi_status)) { + if (unlikely(bio->bi_status != BLK_STS_OK)) { mapping_set_error(folio->mapping, -EIO); if (type == F2FS_WB_CP_DATA) f2fs_stop_checkpoint(sbi, true, STOP_CP_REASON_WRITE_FAIL); } - f2fs_bug_on(sbi, folio->mapping == NODE_MAPPING(sbi) && + f2fs_bug_on(sbi, is_node_folio(folio) && folio->index != nid_of_node(&folio->page)); dec_page_count(sbi, type); if (f2fs_in_warm_node_list(sbi, folio)) - f2fs_del_fsync_node_entry(sbi, &folio->page); + f2fs_del_fsync_node_entry(sbi, folio); clear_page_private_gcing(&folio->page); folio_end_writeback(folio); } @@ -548,8 +548,7 @@ static void __submit_merged_bio(struct f2fs_bio_info *io) static bool __has_merged_page(struct bio *bio, struct inode *inode, struct page *page, nid_t ino) { - struct bio_vec *bvec; - struct bvec_iter_all iter_all; + struct folio_iter fi; if (!bio) return false; @@ -557,25 +556,25 @@ static bool __has_merged_page(struct bio *bio, struct inode *inode, if (!inode && !page && !ino) return true; - bio_for_each_segment_all(bvec, bio, iter_all) { - struct page *target = bvec->bv_page; + bio_for_each_folio_all(fi, bio) { + struct folio *target = fi.folio; - if (fscrypt_is_bounce_page(target)) { - target = fscrypt_pagecache_page(target); + if (fscrypt_is_bounce_folio(target)) { + target = fscrypt_pagecache_folio(target); if (IS_ERR(target)) continue; } - if (f2fs_is_compressed_page(target)) { - target = f2fs_compress_control_page(target); + if (f2fs_is_compressed_page(&target->page)) { + target = f2fs_compress_control_folio(target); if (IS_ERR(target)) continue; } if (inode && inode == target->mapping->host) return true; - if (page && page == target) + if (page && page == &target->page) return true; - if (ino && ino == ino_of_node(target)) + if (ino && ino == ino_of_node(&target->page)) return true; } @@ -780,6 +779,7 @@ static void del_bio_entry(struct bio_entry *be) static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio, struct page *page) { + struct 
folio *fio_folio = page_folio(fio->page); struct f2fs_sb_info *sbi = fio->sbi; enum temp_type temp; bool found = false; @@ -801,8 +801,8 @@ static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio, *fio->last_block, fio->new_blkaddr)); if (f2fs_crypt_mergeable_bio(*bio, - fio->page->mapping->host, - page_folio(fio->page)->index, fio) && + fio_folio->mapping->host, + fio_folio->index, fio) && bio_add_page(*bio, page, PAGE_SIZE, 0) == PAGE_SIZE) { ret = 0; @@ -826,13 +826,13 @@ static int add_ipu_page(struct f2fs_io_info *fio, struct bio **bio, } void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, - struct bio **bio, struct page *page) + struct bio **bio, struct folio *folio) { enum temp_type temp; bool found = false; struct bio *target = bio ? *bio : NULL; - f2fs_bug_on(sbi, !target && !page); + f2fs_bug_on(sbi, !target && !folio); for (temp = HOT; temp < NR_TEMP_TYPE && !found; temp++) { struct f2fs_bio_info *io = sbi->write_io[DATA] + temp; @@ -848,7 +848,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, found = (target == be->bio); else found = __has_merged_page(be->bio, NULL, - page, 0); + &folio->page, 0); if (found) break; } @@ -865,7 +865,7 @@ void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, found = (target == be->bio); else found = __has_merged_page(be->bio, NULL, - page, 0); + &folio->page, 0); if (found) { target = be->bio; del_bio_entry(be); @@ -995,13 +995,13 @@ next: if (io->bio && (!io_is_mergeable(sbi, io->bio, io, fio, io->last_block_in_bio, fio->new_blkaddr) || - !f2fs_crypt_mergeable_bio(io->bio, fio->page->mapping->host, + !f2fs_crypt_mergeable_bio(io->bio, fio_inode(fio), page_folio(bio_page)->index, fio))) __submit_merged_bio(io); alloc_new: if (io->bio == NULL) { io->bio = __bio_alloc(fio, BIO_MAX_VECS); - f2fs_set_bio_crypt_ctx(io->bio, fio->page->mapping->host, + f2fs_set_bio_crypt_ctx(io->bio, fio_inode(fio), page_folio(bio_page)->index, fio, GFP_NOIO); io->fio = *fio; } @@ -1116,7 +1116,7 @@ static int f2fs_submit_page_read(struct inode *inode, struct folio *folio, static void __set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { - __le32 *addr = get_dnode_addr(dn->inode, dn->node_page); + __le32 *addr = get_dnode_addr(dn->inode, dn->node_folio); dn->data_blkaddr = blkaddr; addr[dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr); @@ -1125,14 +1125,14 @@ static void __set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) /* * Lock ordering for the change of data block address: * ->data_page - * ->node_page + * ->node_folio * update block addresses in the node page */ void f2fs_set_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr) { - f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true); + f2fs_folio_wait_writeback(dn->node_folio, NODE, true, true); __set_data_blkaddr(dn, blkaddr); - if (set_page_dirty(dn->node_page)) + if (folio_mark_dirty(dn->node_folio)) dn->node_changed = true; } @@ -1160,7 +1160,7 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) trace_f2fs_reserve_new_blocks(dn->inode, dn->nid, dn->ofs_in_node, count); - f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true); + f2fs_folio_wait_writeback(dn->node_folio, NODE, true, true); for (; count > 0; dn->ofs_in_node++) { block_t blkaddr = f2fs_data_blkaddr(dn); @@ -1171,7 +1171,7 @@ int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count) } } - if (set_page_dirty(dn->node_page)) + if (folio_mark_dirty(dn->node_folio)) dn->node_changed = true; return 0; } @@ -1189,7 +1189,7 @@ int 
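The block-address updates above all follow one sequence: wait out any writeback of the node folio, rewrite the address slot, then mark the folio dirty and note whether the node changed. A hedged sketch of that sequence using the helpers from this patch; the dnode is assumed to be fully set up by the caller and the function name is illustrative:

/* Sketch: update one block address through a dnode, folio-style. */
static void sketch_set_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
{
        /* the node folio must be stable before its address array is touched */
        f2fs_folio_wait_writeback(dn->node_folio, NODE, true, true);

        dn->data_blkaddr = blkaddr;
        get_dnode_addr(dn->inode, dn->node_folio)[dn->ofs_in_node] =
                                                cpu_to_le32(blkaddr);

        /* folio_mark_dirty() returns true only on a clean-to-dirty transition */
        if (folio_mark_dirty(dn->node_folio))
                dn->node_changed = true;
}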
f2fs_reserve_new_block(struct dnode_of_data *dn) int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index) { - bool need_put = dn->inode_page ? false : true; + bool need_put = dn->inode_folio ? false : true; int err; err = f2fs_get_dnode_of_data(dn, index, ALLOC_NODE); @@ -1257,7 +1257,7 @@ got_it: * A new dentry page is allocated but not able to be written, since its * new inode page couldn't be allocated due to -ENOSPC. * In such the case, its blkaddr can be remained as NEW_ADDR. - * see, f2fs_add_link -> f2fs_get_new_data_page -> + * see, f2fs_add_link -> f2fs_get_new_data_folio -> * f2fs_init_inode_metadata. */ if (dn.data_blkaddr == NEW_ADDR) { @@ -1338,57 +1338,57 @@ struct folio *f2fs_get_lock_data_folio(struct inode *inode, pgoff_t index, * * Also, caller should grab and release a rwsem by calling f2fs_lock_op() and * f2fs_unlock_op(). - * Note that, ipage is set only by make_empty_dir, and if any error occur, - * ipage should be released by this function. + * Note that, ifolio is set only by make_empty_dir, and if any error occur, + * ifolio should be released by this function. */ -struct page *f2fs_get_new_data_page(struct inode *inode, - struct page *ipage, pgoff_t index, bool new_i_size) +struct folio *f2fs_get_new_data_folio(struct inode *inode, + struct folio *ifolio, pgoff_t index, bool new_i_size) { struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; struct dnode_of_data dn; int err; - page = f2fs_grab_cache_page(mapping, index, true); - if (!page) { + folio = f2fs_grab_cache_folio(mapping, index, true); + if (IS_ERR(folio)) { /* - * before exiting, we should make sure ipage will be released + * before exiting, we should make sure ifolio will be released * if any error occur. */ - f2fs_put_page(ipage, 1); + f2fs_folio_put(ifolio, true); return ERR_PTR(-ENOMEM); } - set_new_dnode(&dn, inode, ipage, NULL, 0); + set_new_dnode(&dn, inode, ifolio, NULL, 0); err = f2fs_reserve_block(&dn, index); if (err) { - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return ERR_PTR(err); } - if (!ipage) + if (!ifolio) f2fs_put_dnode(&dn); - if (PageUptodate(page)) + if (folio_test_uptodate(folio)) goto got_it; if (dn.data_blkaddr == NEW_ADDR) { - zero_user_segment(page, 0, PAGE_SIZE); - if (!PageUptodate(page)) - SetPageUptodate(page); + folio_zero_segment(folio, 0, folio_size(folio)); + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); } else { - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); - /* if ipage exists, blkaddr should be NEW_ADDR */ - f2fs_bug_on(F2FS_I_SB(inode), ipage); - page = f2fs_get_lock_data_page(inode, index, true); - if (IS_ERR(page)) - return page; + /* if ifolio exists, blkaddr should be NEW_ADDR */ + f2fs_bug_on(F2FS_I_SB(inode), ifolio); + folio = f2fs_get_lock_data_folio(inode, index, true); + if (IS_ERR(folio)) + return folio; } got_it: if (new_i_size && i_size_read(inode) < ((loff_t)(index + 1) << PAGE_SHIFT)) f2fs_i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT)); - return page; + return folio; } static int __allocate_data_block(struct dnode_of_data *dn, int seg_type) @@ -1589,7 +1589,7 @@ next_dnode: start_pgofs = pgofs; prealloc = 0; last_ofs_in_node = ofs_in_node = dn.ofs_in_node; - end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); next_block: blkaddr = f2fs_data_blkaddr(&dn); @@ -1825,7 +1825,6 @@ static int f2fs_xattr_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo) { struct f2fs_sb_info *sbi = 
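One subtlety the comment above preserves: f2fs_get_new_data_folio() owns the ifolio it is handed and releases it itself on every error path, so callers must not put it again. A hedged sketch of the expected calling pattern, modelled on make_empty_dir(); the function name and the hard-coded index are illustrative:

/* Sketch: ownership of ifolio passes to f2fs_get_new_data_folio(). */
static int sketch_make_first_block(struct inode *dir, struct folio *ifolio)
{
        struct folio *dfolio = f2fs_get_new_data_folio(dir, ifolio, 0, true);

        if (IS_ERR(dfolio))
                return PTR_ERR(dfolio); /* ifolio was already released */

        /* ... fill the fresh dentry block via folio_address(dfolio) ... */
        folio_mark_dirty(dfolio);
        f2fs_folio_put(dfolio, true);   /* unlock and drop the reference */
        return 0;
}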
F2FS_I_SB(inode); - struct page *page; struct node_info ni; __u64 phys = 0, len; __u32 flags; @@ -1834,15 +1833,15 @@ static int f2fs_xattr_fiemap(struct inode *inode, if (f2fs_has_inline_xattr(inode)) { int offset; + struct folio *folio = f2fs_grab_cache_folio(NODE_MAPPING(sbi), + inode->i_ino, false); - page = f2fs_grab_cache_page(NODE_MAPPING(sbi), - inode->i_ino, false); - if (!page) - return -ENOMEM; + if (IS_ERR(folio)) + return PTR_ERR(folio); err = f2fs_get_node_info(sbi, inode->i_ino, &ni, false); if (err) { - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return err; } @@ -1854,7 +1853,7 @@ static int f2fs_xattr_fiemap(struct inode *inode, phys += offset; len = inline_xattr_size(inode); - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED; @@ -1868,20 +1867,22 @@ static int f2fs_xattr_fiemap(struct inode *inode, } if (xnid) { - page = f2fs_grab_cache_page(NODE_MAPPING(sbi), xnid, false); - if (!page) - return -ENOMEM; + struct folio *folio = f2fs_grab_cache_folio(NODE_MAPPING(sbi), + xnid, false); + + if (IS_ERR(folio)) + return PTR_ERR(folio); err = f2fs_get_node_info(sbi, xnid, &ni, false); if (err) { - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return err; } phys = F2FS_BLK_TO_BYTES(ni.blk_addr); len = inode->i_sb->s_blocksize; - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); flags = FIEMAP_EXTENT_LAST; } @@ -2077,7 +2078,7 @@ static int f2fs_read_single_page(struct inode *inode, struct folio *folio, sector_t last_block; sector_t last_block_in_file; sector_t block_nr; - pgoff_t index = folio_index(folio); + pgoff_t index = folio->index; int ret = 0; block_in_file = (sector_t)index; @@ -2245,7 +2246,7 @@ skip_reading_dnode: for (i = 1; i < cc->cluster_size; i++) { block_t blkaddr; - blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_page, + blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_folio, dn.ofs_in_node + i) : ei.blk + i - 1; @@ -2279,14 +2280,13 @@ skip_reading_dnode: block_t blkaddr; struct bio_post_read_ctx *ctx; - blkaddr = from_dnode ? data_blkaddr(dn.inode, dn.node_page, + blkaddr = from_dnode ? 
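The xattr fiemap conversion above also changes how allocation failures surface: f2fs_grab_cache_folio() hands back an ERR_PTR, so the caller forwards PTR_ERR(folio) instead of hard-coding -ENOMEM. A small illustrative sketch; the function name is a placeholder and the node-info handling is elided:

/* Sketch: grab a locked node folio and forward the precise error. */
static int sketch_grab_node_folio(struct f2fs_sb_info *sbi, nid_t nid)
{
        struct folio *folio = f2fs_grab_cache_folio(NODE_MAPPING(sbi), nid, false);

        if (IS_ERR(folio))
                return PTR_ERR(folio);  /* propagate the real error code */

        /* ... consult f2fs_get_node_info() and the folio contents ... */
        f2fs_folio_put(folio, true);    /* folio came back locked */
        return 0;
}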
data_blkaddr(dn.inode, dn.node_folio, dn.ofs_in_node + i + 1) : ei.blk + i; f2fs_wait_on_block_writeback(inode, blkaddr); - if (f2fs_load_compressed_page(sbi, folio_page(folio, 0), - blkaddr)) { + if (f2fs_load_compressed_folio(sbi, folio, blkaddr)) { if (atomic_dec_and_test(&dic->remaining_pages)) { f2fs_decompress_cluster(dic, true); break; @@ -2392,7 +2392,7 @@ static int f2fs_mpage_readpages(struct inode *inode, } #ifdef CONFIG_F2FS_FS_COMPRESSION - index = folio_index(folio); + index = folio->index; if (!f2fs_compressed_file(inode)) goto read_single_page; @@ -2501,8 +2501,9 @@ static void f2fs_readahead(struct readahead_control *rac) int f2fs_encrypt_one_page(struct f2fs_io_info *fio) { - struct inode *inode = fio->page->mapping->host; - struct page *mpage, *page; + struct inode *inode = fio_inode(fio); + struct folio *mfolio; + struct page *page; gfp_t gfp_flags = GFP_NOFS; if (!f2fs_encrypted_file(inode)) @@ -2527,12 +2528,12 @@ retry_encrypt: return PTR_ERR(fio->encrypted_page); } - mpage = find_lock_page(META_MAPPING(fio->sbi), fio->old_blkaddr); - if (mpage) { - if (PageUptodate(mpage)) - memcpy(page_address(mpage), + mfolio = filemap_lock_folio(META_MAPPING(fio->sbi), fio->old_blkaddr); + if (!IS_ERR(mfolio)) { + if (folio_test_uptodate(mfolio)) + memcpy(folio_address(mfolio), page_address(fio->encrypted_page), PAGE_SIZE); - f2fs_put_page(mpage, 1); + f2fs_folio_put(mfolio, true); } return 0; } @@ -2631,7 +2632,7 @@ bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio) static inline bool need_inplace_update(struct f2fs_io_info *fio) { - struct inode *inode = fio->page->mapping->host; + struct inode *inode = fio_inode(fio); if (f2fs_should_update_outplace(inode, fio)) return false; @@ -2855,13 +2856,7 @@ write: goto done; } - if (!wbc->for_reclaim) - need_balance_fs = true; - else if (has_not_enough_free_secs(sbi, 0, 0)) - goto redirty_out; - else - set_inode_flag(inode, FI_HOT_DATA); - + need_balance_fs = true; err = -EAGAIN; if (f2fs_has_inline_data(inode)) { err = f2fs_write_inline_data(inode, folio); @@ -2897,13 +2892,6 @@ out: folio_clear_uptodate(folio); clear_page_private_gcing(page); } - - if (wbc->for_reclaim) { - f2fs_submit_merged_write_cond(sbi, NULL, page, 0, DATA); - clear_inode_flag(inode, FI_HOT_DATA); - f2fs_remove_dirty_inode(inode); - submitted = NULL; - } folio_unlock(folio); if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) && !F2FS_I(inode)->wb_task && allow_balance) @@ -2929,9 +2917,9 @@ redirty_out: * file_write_and_wait_range() will see EIO error, which is critical * to return value of fsync() followed by atomic_write failure to user. */ - if (!err || wbc->for_reclaim) - return AOP_WRITEPAGE_ACTIVATE; folio_unlock(folio); + if (!err) + return 1; return err; } @@ -3128,7 +3116,7 @@ continue_unlock: if (folio_test_writeback(folio)) { if (wbc->sync_mode == WB_SYNC_NONE) goto continue_unlock; - f2fs_wait_on_page_writeback(&folio->page, DATA, true, true); + f2fs_folio_wait_writeback(folio, DATA, true, true); } if (!folio_clear_dirty_for_io(folio)) @@ -3145,8 +3133,6 @@ continue_unlock: ret = f2fs_write_single_data_page(folio, &submitted, &bio, &last_block, wbc, io_type, 0, true); - if (ret == AOP_WRITEPAGE_ACTIVATE) - folio_unlock(folio); #ifdef CONFIG_F2FS_FS_COMPRESSION result: #endif @@ -3158,7 +3144,7 @@ result: * keep nr_to_write, since vfs uses this to * get # of written pages. 
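The writepage change above retires AOP_WRITEPAGE_ACTIVATE: f2fs_write_single_data_page() now unlocks the folio itself and returns 1 when the write was skipped, and callers fold that 1 (and -EAGAIN) back into success instead of unlocking. A hedged sketch of the caller-side normalisation; the helper name is illustrative:

/* Sketch: normalise the writeback return code under the new convention. */
static int sketch_handle_write_ret(int ret)
{
        if (ret == 1)           /* skipped: the callee already unlocked the folio */
                return 0;
        if (ret == -EAGAIN)     /* transient: the caller retries later */
                return 0;
        return ret;             /* genuine error, propagate */
}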
*/ - if (ret == AOP_WRITEPAGE_ACTIVATE) { + if (ret == 1) { ret = 0; goto next; } else if (ret == -EAGAIN) { @@ -3352,7 +3338,7 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, struct inode *inode = folio->mapping->host; pgoff_t index = folio->index; struct dnode_of_data dn; - struct page *ipage; + struct folio *ifolio; bool locked = false; int flag = F2FS_GET_BLOCK_PRE_AIO; int err = 0; @@ -3377,23 +3363,23 @@ static int prepare_write_begin(struct f2fs_sb_info *sbi, restart: /* check inline_data */ - ipage = f2fs_get_inode_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - err = PTR_ERR(ipage); + ifolio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(ifolio)) { + err = PTR_ERR(ifolio); goto unlock_out; } - set_new_dnode(&dn, inode, ipage, ipage, 0); + set_new_dnode(&dn, inode, ifolio, ifolio, 0); if (f2fs_has_inline_data(inode)) { if (pos + len <= MAX_INLINE_DATA(inode)) { - f2fs_do_read_inline_data(folio, ipage); + f2fs_do_read_inline_data(folio, ifolio); set_inode_flag(inode, FI_DATA_EXIST); if (inode->i_nlink) - set_page_private_inline(ipage); + set_page_private_inline(&ifolio->page); goto out; } - err = f2fs_convert_inline_page(&dn, folio_page(folio, 0)); + err = f2fs_convert_inline_folio(&dn, folio); if (err || dn.data_blkaddr != NULL_ADDR) goto out; } @@ -3437,14 +3423,14 @@ static int __find_data_block(struct inode *inode, pgoff_t index, block_t *blk_addr) { struct dnode_of_data dn; - struct page *ipage; + struct folio *ifolio; int err = 0; - ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); - if (IS_ERR(ipage)) - return PTR_ERR(ipage); + ifolio = f2fs_get_inode_folio(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ifolio)) + return PTR_ERR(ifolio); - set_new_dnode(&dn, inode, ipage, ipage, 0); + set_new_dnode(&dn, inode, ifolio, ifolio, 0); if (!f2fs_lookup_read_extent_cache_block(inode, index, &dn.data_blkaddr)) { @@ -3465,17 +3451,17 @@ static int __reserve_data_block(struct inode *inode, pgoff_t index, { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; - struct page *ipage; + struct folio *ifolio; int err = 0; f2fs_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO); - ipage = f2fs_get_inode_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - err = PTR_ERR(ipage); + ifolio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(ifolio)) { + err = PTR_ERR(ifolio); goto unlock_out; } - set_new_dnode(&dn, inode, ipage, ipage, 0); + set_new_dnode(&dn, inode, ifolio, ifolio, 0); if (!f2fs_lookup_read_extent_cache_block(dn.inode, index, &dn.data_blkaddr)) @@ -3623,7 +3609,7 @@ repeat: } } - f2fs_wait_on_page_writeback(&folio->page, DATA, false, true); + f2fs_folio_wait_writeback(folio, DATA, false, true); if (len == folio_size(folio) || folio_test_uptodate(folio)) return 0; @@ -3878,18 +3864,18 @@ static int f2fs_migrate_blocks(struct inode *inode, block_t start_blk, set_inode_flag(inode, FI_SKIP_WRITES); for (blkofs = 0; blkofs <= blkofs_end; blkofs++) { - struct page *page; + struct folio *folio; unsigned int blkidx = secidx * blk_per_sec + blkofs; - page = f2fs_get_lock_data_page(inode, blkidx, true); - if (IS_ERR(page)) { + folio = f2fs_get_lock_data_folio(inode, blkidx, true); + if (IS_ERR(folio)) { f2fs_up_write(&sbi->pin_sem); - ret = PTR_ERR(page); + ret = PTR_ERR(folio); goto done; } - set_page_dirty(page); - f2fs_put_page(page, 1); + folio_mark_dirty(folio); + f2fs_folio_put(folio, true); } clear_inode_flag(inode, FI_SKIP_WRITES); @@ -3966,7 +3952,7 @@ retry: if ((pblock - SM_I(sbi)->main_blkaddr) % blks_per_sec || nr_pblocks % blks_per_sec 
|| - !f2fs_valid_pinned_area(sbi, pblock)) { + f2fs_is_sequential_zone_area(sbi, pblock)) { bool last_extent = false; not_aligned++; diff --git a/fs/f2fs/dir.c b/fs/f2fs/dir.c index 5a63ff0df03b..c36b3b22bfff 100644 --- a/fs/f2fs/dir.c +++ b/fs/f2fs/dir.c @@ -173,7 +173,7 @@ static unsigned long dir_block_index(unsigned int level, } static struct f2fs_dir_entry *find_in_block(struct inode *dir, - struct page *dentry_page, + struct folio *dentry_folio, const struct f2fs_filename *fname, int *max_slots, bool use_hash) @@ -181,7 +181,7 @@ static struct f2fs_dir_entry *find_in_block(struct inode *dir, struct f2fs_dentry_block *dentry_blk; struct f2fs_dentry_ptr d; - dentry_blk = (struct f2fs_dentry_block *)page_address(dentry_page); + dentry_blk = folio_address(dentry_folio); make_dentry_ptr_block(dir, &d, dentry_blk); return f2fs_find_target_dentry(&d, fname, max_slots, use_hash); @@ -260,13 +260,12 @@ found: static struct f2fs_dir_entry *find_in_level(struct inode *dir, unsigned int level, const struct f2fs_filename *fname, - struct page **res_page, + struct folio **res_folio, bool use_hash) { int s = GET_DENTRY_SLOTS(fname->disk_name.len); unsigned int nbucket, nblock; unsigned int bidx, end_block, bucket_no; - struct page *dentry_page; struct f2fs_dir_entry *de = NULL; pgoff_t next_pgofs; bool room = false; @@ -284,31 +283,32 @@ start_find_bucket: while (bidx < end_block) { /* no need to allocate new dentry pages to all the indices */ - dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs); - if (IS_ERR(dentry_page)) { - if (PTR_ERR(dentry_page) == -ENOENT) { + struct folio *dentry_folio; + dentry_folio = f2fs_find_data_folio(dir, bidx, &next_pgofs); + if (IS_ERR(dentry_folio)) { + if (PTR_ERR(dentry_folio) == -ENOENT) { room = true; bidx = next_pgofs; continue; } else { - *res_page = dentry_page; + *res_folio = dentry_folio; break; } } - de = find_in_block(dir, dentry_page, fname, &max_slots, use_hash); + de = find_in_block(dir, dentry_folio, fname, &max_slots, use_hash); if (IS_ERR(de)) { - *res_page = ERR_CAST(de); + *res_folio = ERR_CAST(de); de = NULL; break; } else if (de) { - *res_page = dentry_page; + *res_folio = dentry_folio; break; } if (max_slots >= s) room = true; - f2fs_put_page(dentry_page, 0); + f2fs_folio_put(dentry_folio, false); bidx++; } @@ -329,7 +329,7 @@ start_find_bucket: struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, const struct f2fs_filename *fname, - struct page **res_page) + struct folio **res_folio) { unsigned long npages = dir_blocks(dir); struct f2fs_dir_entry *de = NULL; @@ -337,13 +337,13 @@ struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, unsigned int level; bool use_hash = true; - *res_page = NULL; + *res_folio = NULL; #if IS_ENABLED(CONFIG_UNICODE) start_find_entry: #endif if (f2fs_has_inline_dentry(dir)) { - de = f2fs_find_in_inline_dir(dir, fname, res_page, use_hash); + de = f2fs_find_in_inline_dir(dir, fname, res_folio, use_hash); goto out; } @@ -359,14 +359,15 @@ start_find_entry: } for (level = 0; level < max_depth; level++) { - de = find_in_level(dir, level, fname, res_page, use_hash); - if (de || IS_ERR(*res_page)) + de = find_in_level(dir, level, fname, res_folio, use_hash); + if (de || IS_ERR(*res_folio)) break; } out: #if IS_ENABLED(CONFIG_UNICODE) - if (IS_CASEFOLDED(dir) && !de && use_hash) { + if (!sb_no_casefold_compat_fallback(dir->i_sb) && + IS_CASEFOLDED(dir) && !de && use_hash) { use_hash = false; goto start_find_entry; } @@ -384,7 +385,7 @@ out: * Entry is guaranteed to be valid. 
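The lookup loop above keeps the long-standing f2fs convention that a missing dentry block is a hole rather than an error: f2fs_find_data_folio() returns ERR_PTR(-ENOENT) and reports the next allocated index, letting the scan jump ahead. A hedged sketch of that scan shape, condensed from find_in_level(); the function name and range parameters are placeholders:

/* Sketch: walk dentry blocks, treating -ENOENT as a skippable hole. */
static int sketch_scan_dir_blocks(struct inode *dir, pgoff_t bidx, pgoff_t end)
{
        while (bidx < end) {
                pgoff_t next_pgofs;
                struct folio *folio = f2fs_find_data_folio(dir, bidx, &next_pgofs);

                if (IS_ERR(folio)) {
                        if (PTR_ERR(folio) == -ENOENT) {
                                bidx = next_pgofs;      /* jump over the hole */
                                continue;
                        }
                        return PTR_ERR(folio);          /* real I/O error */
                }

                /* ... inspect folio_address(folio) as an f2fs_dentry_block ... */
                f2fs_folio_put(folio, false);           /* folio was not locked */
                bidx++;
        }
        return 0;
}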
*/ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, - const struct qstr *child, struct page **res_page) + const struct qstr *child, struct folio **res_folio) { struct f2fs_dir_entry *de = NULL; struct f2fs_filename fname; @@ -393,67 +394,67 @@ struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, err = f2fs_setup_filename(dir, child, 1, &fname); if (err) { if (err == -ENOENT) - *res_page = NULL; + *res_folio = NULL; else - *res_page = ERR_PTR(err); + *res_folio = ERR_PTR(err); return NULL; } - de = __f2fs_find_entry(dir, &fname, res_page); + de = __f2fs_find_entry(dir, &fname, res_folio); f2fs_free_filename(&fname); return de; } -struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p) +struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct folio **f) { - return f2fs_find_entry(dir, &dotdot_name, p); + return f2fs_find_entry(dir, &dotdot_name, f); } ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, - struct page **page) + struct folio **folio) { ino_t res = 0; struct f2fs_dir_entry *de; - de = f2fs_find_entry(dir, qstr, page); + de = f2fs_find_entry(dir, qstr, folio); if (de) { res = le32_to_cpu(de->ino); - f2fs_put_page(*page, 0); + f2fs_folio_put(*folio, false); } return res; } void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, - struct page *page, struct inode *inode) + struct folio *folio, struct inode *inode) { enum page_type type = f2fs_has_inline_dentry(dir) ? NODE : DATA; - lock_page(page); - f2fs_wait_on_page_writeback(page, type, true, true); + folio_lock(folio); + f2fs_folio_wait_writeback(folio, type, true, true); de->ino = cpu_to_le32(inode->i_ino); de->file_type = fs_umode_to_ftype(inode->i_mode); - set_page_dirty(page); + folio_mark_dirty(folio); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); f2fs_mark_inode_dirty_sync(dir, false); - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); } static void init_dent_inode(struct inode *dir, struct inode *inode, const struct f2fs_filename *fname, - struct page *ipage) + struct folio *ifolio) { struct f2fs_inode *ri; if (!fname) /* tmpfile case? */ return; - f2fs_wait_on_page_writeback(ipage, NODE, true, true); + f2fs_folio_wait_writeback(ifolio, NODE, true, true); - /* copy name info. to this inode page */ - ri = F2FS_INODE(ipage); + /* copy name info. 
to this inode folio */ + ri = F2FS_INODE(&ifolio->page); ri->i_namelen = cpu_to_le32(fname->disk_name.len); memcpy(ri->i_name, fname->disk_name.name, fname->disk_name.len); if (IS_ENCRYPTED(dir)) { @@ -474,7 +475,7 @@ static void init_dent_inode(struct inode *dir, struct inode *inode, file_lost_pino(inode); } } - set_page_dirty(ipage); + folio_mark_dirty(ifolio); } void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent, @@ -491,72 +492,73 @@ void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent, } static int make_empty_dir(struct inode *inode, - struct inode *parent, struct page *page) + struct inode *parent, struct folio *folio) { - struct page *dentry_page; + struct folio *dentry_folio; struct f2fs_dentry_block *dentry_blk; struct f2fs_dentry_ptr d; if (f2fs_has_inline_dentry(inode)) - return f2fs_make_empty_inline_dir(inode, parent, page); + return f2fs_make_empty_inline_dir(inode, parent, folio); - dentry_page = f2fs_get_new_data_page(inode, page, 0, true); - if (IS_ERR(dentry_page)) - return PTR_ERR(dentry_page); + dentry_folio = f2fs_get_new_data_folio(inode, folio, 0, true); + if (IS_ERR(dentry_folio)) + return PTR_ERR(dentry_folio); - dentry_blk = page_address(dentry_page); + dentry_blk = folio_address(dentry_folio); make_dentry_ptr_block(NULL, &d, dentry_blk); f2fs_do_make_empty_dir(inode, parent, &d); - set_page_dirty(dentry_page); - f2fs_put_page(dentry_page, 1); + folio_mark_dirty(dentry_folio); + f2fs_folio_put(dentry_folio, true); return 0; } -struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, - const struct f2fs_filename *fname, struct page *dpage) +struct folio *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, + const struct f2fs_filename *fname, struct folio *dfolio) { - struct page *page; + struct folio *folio; int err; if (is_inode_flag_set(inode, FI_NEW_INODE)) { - page = f2fs_new_inode_page(inode); - if (IS_ERR(page)) - return page; + folio = f2fs_new_inode_folio(inode); + if (IS_ERR(folio)) + return folio; if (S_ISDIR(inode->i_mode)) { /* in order to handle error case */ - get_page(page); - err = make_empty_dir(inode, dir, page); + folio_get(folio); + err = make_empty_dir(inode, dir, folio); if (err) { - lock_page(page); + folio_lock(folio); goto put_error; } - put_page(page); + folio_put(folio); } - err = f2fs_init_acl(inode, dir, page, dpage); + err = f2fs_init_acl(inode, dir, folio, dfolio); if (err) goto put_error; err = f2fs_init_security(inode, dir, - fname ? fname->usr_fname : NULL, page); + fname ? fname->usr_fname : NULL, + folio); if (err) goto put_error; if (IS_ENCRYPTED(inode)) { - err = fscrypt_set_context(inode, page); + err = fscrypt_set_context(inode, folio); if (err) goto put_error; } } else { - page = f2fs_get_inode_page(F2FS_I_SB(dir), inode->i_ino); - if (IS_ERR(page)) - return page; + folio = f2fs_get_inode_folio(F2FS_I_SB(dir), inode->i_ino); + if (IS_ERR(folio)) + return folio; } - init_dent_inode(dir, inode, fname, page); + init_dent_inode(dir, inode, fname, folio); /* * This file should be checkpointed during fsync. 
@@ -573,12 +575,12 @@ struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, f2fs_remove_orphan_inode(F2FS_I_SB(dir), inode->i_ino); f2fs_i_links_write(inode, true); } - return page; + return folio; put_error: clear_nlink(inode); - f2fs_update_inode(inode, page); - f2fs_put_page(page, 1); + f2fs_update_inode(inode, folio); + f2fs_folio_put(folio, true); return ERR_PTR(err); } @@ -620,14 +622,14 @@ next: goto next; } -bool f2fs_has_enough_room(struct inode *dir, struct page *ipage, +bool f2fs_has_enough_room(struct inode *dir, struct folio *ifolio, const struct f2fs_filename *fname) { struct f2fs_dentry_ptr d; unsigned int bit_pos; int slots = GET_DENTRY_SLOTS(fname->disk_name.len); - make_dentry_ptr_inline(dir, &d, inline_data_addr(dir, ipage)); + make_dentry_ptr_inline(dir, &d, inline_data_addr(dir, ifolio)); bit_pos = f2fs_room_for_filename(d.bitmap, slots, d.max); @@ -664,10 +666,10 @@ int f2fs_add_regular_entry(struct inode *dir, const struct f2fs_filename *fname, unsigned int current_depth; unsigned long bidx, block; unsigned int nbucket, nblock; - struct page *dentry_page = NULL; + struct folio *dentry_folio = NULL; struct f2fs_dentry_block *dentry_blk = NULL; struct f2fs_dentry_ptr d; - struct page *page = NULL; + struct folio *folio = NULL; int slots, err = 0; level = 0; @@ -697,30 +699,30 @@ start: (le32_to_cpu(fname->hash) % nbucket)); for (block = bidx; block <= (bidx + nblock - 1); block++) { - dentry_page = f2fs_get_new_data_page(dir, NULL, block, true); - if (IS_ERR(dentry_page)) - return PTR_ERR(dentry_page); + dentry_folio = f2fs_get_new_data_folio(dir, NULL, block, true); + if (IS_ERR(dentry_folio)) + return PTR_ERR(dentry_folio); - dentry_blk = page_address(dentry_page); + dentry_blk = folio_address(dentry_folio); bit_pos = f2fs_room_for_filename(&dentry_blk->dentry_bitmap, slots, NR_DENTRY_IN_BLOCK); if (bit_pos < NR_DENTRY_IN_BLOCK) goto add_dentry; - f2fs_put_page(dentry_page, 1); + f2fs_folio_put(dentry_folio, true); } /* Move to next level to find the empty slot for new dentry */ ++level; goto start; add_dentry: - f2fs_wait_on_page_writeback(dentry_page, DATA, true, true); + f2fs_folio_wait_writeback(dentry_folio, DATA, true, true); if (inode) { f2fs_down_write(&F2FS_I(inode)->i_sem); - page = f2fs_init_inode_metadata(inode, dir, fname, NULL); - if (IS_ERR(page)) { - err = PTR_ERR(page); + folio = f2fs_init_inode_metadata(inode, dir, fname, NULL); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); goto fail; } } @@ -729,16 +731,16 @@ add_dentry: f2fs_update_dentry(ino, mode, &d, &fname->disk_name, fname->hash, bit_pos); - set_page_dirty(dentry_page); + folio_mark_dirty(dentry_folio); if (inode) { f2fs_i_pino_write(inode, dir->i_ino); /* synchronize inode page's data from inode cache */ if (is_inode_flag_set(inode, FI_NEW_INODE)) - f2fs_update_inode(inode, page); + f2fs_update_inode(inode, folio); - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); } f2fs_update_parent_metadata(dir, inode, current_depth); @@ -746,7 +748,7 @@ fail: if (inode) f2fs_up_write(&F2FS_I(inode)->i_sem); - f2fs_put_page(dentry_page, 1); + f2fs_folio_put(dentry_folio, true); return err; } @@ -780,7 +782,7 @@ int f2fs_do_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode) { struct f2fs_filename fname; - struct page *page = NULL; + struct folio *folio = NULL; struct f2fs_dir_entry *de = NULL; int err; @@ -796,14 +798,14 @@ int f2fs_do_add_link(struct inode *dir, const struct qstr *name, * consistency more. 
*/ if (current != F2FS_I(dir)->task) { - de = __f2fs_find_entry(dir, &fname, &page); + de = __f2fs_find_entry(dir, &fname, &folio); F2FS_I(dir)->task = NULL; } if (de) { - f2fs_put_page(page, 0); + f2fs_folio_put(folio, false); err = -EEXIST; - } else if (IS_ERR(page)) { - err = PTR_ERR(page); + } else if (IS_ERR(folio)) { + err = PTR_ERR(folio); } else { err = f2fs_add_dentry(dir, &fname, inode, ino, mode); } @@ -814,16 +816,16 @@ int f2fs_do_add_link(struct inode *dir, const struct qstr *name, int f2fs_do_tmpfile(struct inode *inode, struct inode *dir, struct f2fs_filename *fname) { - struct page *page; + struct folio *folio; int err = 0; f2fs_down_write(&F2FS_I(inode)->i_sem); - page = f2fs_init_inode_metadata(inode, dir, fname, NULL); - if (IS_ERR(page)) { - err = PTR_ERR(page); + folio = f2fs_init_inode_metadata(inode, dir, fname, NULL); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); goto fail; } - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); clear_inode_flag(inode, FI_NEW_INODE); f2fs_update_time(F2FS_I_SB(inode), REQ_TIME); @@ -859,13 +861,13 @@ void f2fs_drop_nlink(struct inode *dir, struct inode *inode) * It only removes the dentry from the dentry page, corresponding name * entry in name page does not need to be touched during deletion. */ -void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, +void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct folio *folio, struct inode *dir, struct inode *inode) { - struct f2fs_dentry_block *dentry_blk; + struct f2fs_dentry_block *dentry_blk; unsigned int bit_pos; int slots = GET_DENTRY_SLOTS(le16_to_cpu(dentry->name_len)); - pgoff_t index = page_folio(page)->index; + pgoff_t index = folio->index; int i; f2fs_update_time(F2FS_I_SB(dir), REQ_TIME); @@ -874,12 +876,12 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, f2fs_add_ino_entry(F2FS_I_SB(dir), dir->i_ino, TRANS_DIR_INO); if (f2fs_has_inline_dentry(dir)) - return f2fs_delete_inline_entry(dentry, page, dir, inode); + return f2fs_delete_inline_entry(dentry, folio, dir, inode); - lock_page(page); - f2fs_wait_on_page_writeback(page, DATA, true, true); + folio_lock(folio); + f2fs_folio_wait_writeback(folio, DATA, true, true); - dentry_blk = page_address(page); + dentry_blk = folio_address(folio); bit_pos = dentry - dentry_blk->dentry; for (i = 0; i < slots; i++) __clear_bit_le(bit_pos + i, &dentry_blk->dentry_bitmap); @@ -888,19 +890,19 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, bit_pos = find_next_bit_le(&dentry_blk->dentry_bitmap, NR_DENTRY_IN_BLOCK, 0); - set_page_dirty(page); + folio_mark_dirty(folio); if (bit_pos == NR_DENTRY_IN_BLOCK && !f2fs_truncate_hole(dir, index, index + 1)) { - f2fs_clear_page_cache_dirty_tag(page_folio(page)); - clear_page_dirty_for_io(page); - ClearPageUptodate(page); - clear_page_private_all(page); + f2fs_clear_page_cache_dirty_tag(folio); + folio_clear_dirty_for_io(folio); + folio_clear_uptodate(folio); + clear_page_private_all(&folio->page); inode_dec_dirty_pages(dir); f2fs_remove_dirty_inode(dir); } - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); f2fs_mark_inode_dirty_sync(dir, false); @@ -912,7 +914,6 @@ void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, bool f2fs_empty_dir(struct inode *dir) { unsigned long bidx = 0; - struct page *dentry_page; unsigned int bit_pos; struct f2fs_dentry_block *dentry_blk; unsigned long nblock = dir_blocks(dir); @@ -922,10 +923,11 @@ bool 
f2fs_empty_dir(struct inode *dir) while (bidx < nblock) { pgoff_t next_pgofs; + struct folio *dentry_folio; - dentry_page = f2fs_find_data_page(dir, bidx, &next_pgofs); - if (IS_ERR(dentry_page)) { - if (PTR_ERR(dentry_page) == -ENOENT) { + dentry_folio = f2fs_find_data_folio(dir, bidx, &next_pgofs); + if (IS_ERR(dentry_folio)) { + if (PTR_ERR(dentry_folio) == -ENOENT) { bidx = next_pgofs; continue; } else { @@ -933,7 +935,7 @@ bool f2fs_empty_dir(struct inode *dir) } } - dentry_blk = page_address(dentry_page); + dentry_blk = folio_address(dentry_folio); if (bidx == 0) bit_pos = 2; else @@ -942,7 +944,7 @@ bool f2fs_empty_dir(struct inode *dir) NR_DENTRY_IN_BLOCK, bit_pos); - f2fs_put_page(dentry_page, 0); + f2fs_folio_put(dentry_folio, false); if (bit_pos < NR_DENTRY_IN_BLOCK) return false; @@ -1041,7 +1043,6 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) struct inode *inode = file_inode(file); unsigned long npages = dir_blocks(inode); struct f2fs_dentry_block *dentry_blk = NULL; - struct page *dentry_page = NULL; struct file_ra_state *ra = &file->f_ra; loff_t start_pos = ctx->pos; unsigned int n = ((unsigned long)ctx->pos / NR_DENTRY_IN_BLOCK); @@ -1065,6 +1066,7 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) } for (; n < npages; ctx->pos = n * NR_DENTRY_IN_BLOCK) { + struct folio *dentry_folio; pgoff_t next_pgofs; /* allow readdir() to be interrupted */ @@ -1079,9 +1081,9 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) page_cache_sync_readahead(inode->i_mapping, ra, file, n, min(npages - n, (pgoff_t)MAX_DIR_RA_PAGES)); - dentry_page = f2fs_find_data_page(inode, n, &next_pgofs); - if (IS_ERR(dentry_page)) { - err = PTR_ERR(dentry_page); + dentry_folio = f2fs_find_data_folio(inode, n, &next_pgofs); + if (IS_ERR(dentry_folio)) { + err = PTR_ERR(dentry_folio); if (err == -ENOENT) { err = 0; n = next_pgofs; @@ -1091,18 +1093,15 @@ static int f2fs_readdir(struct file *file, struct dir_context *ctx) } } - dentry_blk = page_address(dentry_page); + dentry_blk = folio_address(dentry_folio); make_dentry_ptr_block(inode, &d, dentry_blk); err = f2fs_fill_dentries(ctx, &d, n * NR_DENTRY_IN_BLOCK, &fstr); - if (err) { - f2fs_put_page(dentry_page, 0); + f2fs_folio_put(dentry_folio, false); + if (err) break; - } - - f2fs_put_page(dentry_page, 0); n++; } diff --git a/fs/f2fs/extent_cache.c b/fs/f2fs/extent_cache.c index 347b3b647834..cfe925a3d555 100644 --- a/fs/f2fs/extent_cache.c +++ b/fs/f2fs/extent_cache.c @@ -407,11 +407,11 @@ static void __drop_largest_extent(struct extent_tree *et, } } -void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage) +void f2fs_init_read_extent_tree(struct inode *inode, struct folio *ifolio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct extent_tree_info *eti = &sbi->extent_tree[EX_READ]; - struct f2fs_extent *i_ext = &F2FS_INODE(ipage)->i_ext; + struct f2fs_extent *i_ext = &F2FS_INODE(&ifolio->page)->i_ext; struct extent_tree *et; struct extent_node *en; struct extent_info ei; @@ -419,9 +419,9 @@ void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage) if (!__may_extent_tree(inode, EX_READ)) { /* drop largest read extent */ if (i_ext->len) { - f2fs_wait_on_page_writeback(ipage, NODE, true, true); + f2fs_folio_wait_writeback(ifolio, NODE, true, true); i_ext->len = 0; - set_page_dirty(ipage); + folio_mark_dirty(ifolio); } set_inode_flag(inode, FI_NO_EXTENT); return; @@ -934,7 +934,7 @@ static void __update_extent_cache(struct dnode_of_data *dn, enum extent_type typ 
if (!__may_extent_tree(dn->inode, type)) return; - ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), dn->inode) + + ei.fofs = f2fs_start_bidx_of_node(ofs_of_node(&dn->node_folio->page), dn->inode) + dn->ofs_in_node; ei.len = 1; diff --git a/fs/f2fs/f2fs.h b/fs/f2fs/f2fs.h index f1576dc6ec67..9333a22b9a01 100644 --- a/fs/f2fs/f2fs.h +++ b/fs/f2fs/f2fs.h @@ -63,16 +63,25 @@ enum { FAULT_BLKADDR_CONSISTENCE, FAULT_NO_SEGMENT, FAULT_INCONSISTENT_FOOTER, + FAULT_TIMEOUT, + FAULT_VMALLOC, FAULT_MAX, }; -#ifdef CONFIG_F2FS_FAULT_INJECTION -#define F2FS_ALL_FAULT_TYPE (GENMASK(FAULT_MAX - 1, 0)) +/* indicate which option to update */ +enum fault_option { + FAULT_RATE = 1, /* only update fault rate */ + FAULT_TYPE = 2, /* only update fault type */ + FAULT_ALL = 4, /* reset all fault injection options/stats */ +}; +#ifdef CONFIG_F2FS_FAULT_INJECTION struct f2fs_fault_info { atomic_t inject_ops; int inject_rate; unsigned int inject_type; + /* Used to account total count of injection for each type */ + unsigned int inject_count[FAULT_MAX]; }; extern const char *f2fs_fault_name[FAULT_MAX]; @@ -317,7 +326,7 @@ struct inode_entry { struct fsync_node_entry { struct list_head list; /* list head */ - struct page *page; /* warm node page pointer */ + struct folio *folio; /* warm node folio pointer */ unsigned int seq_id; /* sequence id */ }; @@ -606,6 +615,9 @@ enum { /* congestion wait timeout value, default: 20ms */ #define DEFAULT_IO_TIMEOUT (msecs_to_jiffies(20)) +/* timeout value injected, default: 1000ms */ +#define DEFAULT_FAULT_TIMEOUT (msecs_to_jiffies(1000)) + /* maximum retry quota flush count */ #define DEFAULT_RETRY_QUOTA_FLUSH_COUNT 8 @@ -821,6 +833,7 @@ enum { FI_ATOMIC_DIRTIED, /* indicate atomic file is dirtied */ FI_ATOMIC_REPLACE, /* indicate atomic replace */ FI_OPENED_FILE, /* indicate file has been opened */ + FI_DONATE_FINISHED, /* indicate page donation of file has been finished */ FI_MAX, /* max flag, never be used */ }; @@ -994,11 +1007,11 @@ struct f2fs_nm_info { */ struct dnode_of_data { struct inode *inode; /* vfs inode pointer */ - struct page *inode_page; /* its inode page, NULL is possible */ - struct page *node_page; /* cached direct node page */ + struct folio *inode_folio; /* its inode folio, NULL is possible */ + struct folio *node_folio; /* cached direct node folio */ nid_t nid; /* node id of the direct node block */ unsigned int ofs_in_node; /* data offset in the node page */ - bool inode_page_locked; /* inode page is locked or not */ + bool inode_folio_locked; /* inode folio is locked or not */ bool node_changed; /* is node block changed */ char cur_level; /* level of hole node page */ char max_level; /* level of current page located */ @@ -1006,12 +1019,12 @@ struct dnode_of_data { }; static inline void set_new_dnode(struct dnode_of_data *dn, struct inode *inode, - struct page *ipage, struct page *npage, nid_t nid) + struct folio *ifolio, struct folio *nfolio, nid_t nid) { memset(dn, 0, sizeof(*dn)); dn->inode = inode; - dn->inode_page = ipage; - dn->node_page = npage; + dn->inode_folio = ifolio; + dn->node_folio = nfolio; dn->nid = nid; } @@ -1780,7 +1793,7 @@ struct f2fs_sb_info { unsigned int dirty_device; /* for checkpoint data flush */ spinlock_t dev_lock; /* protect dirty_device */ bool aligned_blksize; /* all devices has the same logical blksize */ - unsigned int first_zoned_segno; /* first zoned segno */ + unsigned int first_seq_zone_segno; /* first segno in sequential zone */ /* For write statistics */ u64 sectors_written_start; @@ -1902,6 +1915,7 
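With dnode_of_data now carrying inode_folio and node_folio, the usual set-up/tear-down pair looks like the sketch below: set_new_dnode() zeroes the structure and records the folios, and f2fs_put_dnode() (further down in this header) releases them. The function name, the index and the LOOKUP_NODE mode are illustrative choices, not mandated by the patch:

/* Sketch: typical dnode lifetime with the folio-based fields. */
static int sketch_peek_blkaddr(struct inode *inode, pgoff_t index)
{
        struct dnode_of_data dn;
        int err;

        set_new_dnode(&dn, inode, NULL, NULL, 0);       /* nothing cached yet */
        err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
        if (err)
                return err;

        /* dn.node_folio now points at the locked direct node folio */
        pr_debug("blkaddr %u\n", f2fs_data_blkaddr(&dn));

        f2fs_put_dnode(&dn);            /* puts node_folio and inode_folio */
        return 0;
}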
@@ static inline bool __time_to_inject(struct f2fs_sb_info *sbi, int type, atomic_inc(&ffi->inject_ops); if (atomic_read(&ffi->inject_ops) >= ffi->inject_rate) { atomic_set(&ffi->inject_ops, 0); + ffi->inject_count[type]++; f2fs_info_ratelimited(sbi, "inject %s in %s of %pS", f2fs_fault_name[type], func, parent_func); return true; @@ -1963,28 +1977,20 @@ static inline unsigned int f2fs_time_to_wait(struct f2fs_sb_info *sbi, /* * Inline functions */ -static inline u32 __f2fs_crc32(struct f2fs_sb_info *sbi, u32 crc, - const void *address, unsigned int length) +static inline u32 __f2fs_crc32(u32 crc, const void *address, + unsigned int length) { return crc32(crc, address, length); } -static inline u32 f2fs_crc32(struct f2fs_sb_info *sbi, const void *address, - unsigned int length) +static inline u32 f2fs_crc32(const void *address, unsigned int length) { - return __f2fs_crc32(sbi, F2FS_SUPER_MAGIC, address, length); + return __f2fs_crc32(F2FS_SUPER_MAGIC, address, length); } -static inline bool f2fs_crc_valid(struct f2fs_sb_info *sbi, __u32 blk_crc, - void *buf, size_t buf_size) +static inline u32 f2fs_chksum(u32 crc, const void *address, unsigned int length) { - return f2fs_crc32(sbi, buf, buf_size) == blk_crc; -} - -static inline u32 f2fs_chksum(struct f2fs_sb_info *sbi, u32 crc, - const void *address, unsigned int length) -{ - return __f2fs_crc32(sbi, crc, address, length); + return __f2fs_crc32(crc, address, length); } static inline struct f2fs_inode_info *F2FS_I(struct inode *inode) @@ -2082,6 +2088,16 @@ static inline struct address_space *NODE_MAPPING(struct f2fs_sb_info *sbi) return sbi->node_inode->i_mapping; } +static inline bool is_meta_folio(struct folio *folio) +{ + return folio->mapping == META_MAPPING(F2FS_F_SB(folio)); +} + +static inline bool is_node_folio(struct folio *folio) +{ + return folio->mapping == NODE_MAPPING(F2FS_F_SB(folio)); +} + static inline bool is_sbi_flag_set(struct f2fs_sb_info *sbi, unsigned int type) { return test_bit(type, &sbi->s_flag); @@ -2518,8 +2534,14 @@ static inline void dec_valid_block_count(struct f2fs_sb_info *sbi, blkcnt_t sectors = count << F2FS_LOG_SECTORS_PER_BLOCK; spin_lock(&sbi->stat_lock); - f2fs_bug_on(sbi, sbi->total_valid_block_count < (block_t) count); - sbi->total_valid_block_count -= (block_t)count; + if (unlikely(sbi->total_valid_block_count < count)) { + f2fs_warn(sbi, "Inconsistent total_valid_block_count:%u, ino:%lu, count:%u", + sbi->total_valid_block_count, inode->i_ino, count); + sbi->total_valid_block_count = 0; + set_sbi_flag(sbi, SBI_NEED_FSCK); + } else { + sbi->total_valid_block_count -= count; + } if (sbi->reserved_blocks && sbi->current_reserved_blocks < sbi->reserved_blocks) sbi->current_reserved_blocks = min(sbi->reserved_blocks, @@ -2849,14 +2871,14 @@ static inline struct folio *f2fs_grab_cache_folio(struct address_space *mapping, return folio; } -static inline struct page *f2fs_grab_cache_page(struct address_space *mapping, - pgoff_t index, bool for_write) +static inline struct folio *f2fs_filemap_get_folio( + struct address_space *mapping, pgoff_t index, + fgf_t fgp_flags, gfp_t gfp_mask) { - struct folio *folio = f2fs_grab_cache_folio(mapping, index, for_write); + if (time_to_inject(F2FS_M_SB(mapping), FAULT_PAGE_GET)) + return ERR_PTR(-ENOMEM); - if (IS_ERR(folio)) - return NULL; - return &folio->page; + return __filemap_get_folio(mapping, index, fgp_flags, gfp_mask); } static inline struct page *f2fs_pagecache_get_page( @@ -2871,7 +2893,7 @@ static inline struct page *f2fs_pagecache_get_page( static inline 
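The checksum helpers above drop their unused sbi argument: f2fs_crc32() seeds the CRC with F2FS_SUPER_MAGIC internally, while f2fs_chksum() continues a caller-supplied CRC. Two trivial hedged usage sketches (the function names and the expected-value parameter are placeholders):

/* Sketch: verify a buffer against a stored CRC with the new helper. */
static bool sketch_crc_matches(const void *buf, unsigned int len, u32 expected)
{
        return f2fs_crc32(buf, len) == expected;
}

/* Sketch: extend a running CRC, e.g. across an inode footer. */
static u32 sketch_crc_continue(u32 crc, const void *buf, unsigned int len)
{
        return f2fs_chksum(crc, buf, len);
}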
void f2fs_folio_put(struct folio *folio, bool unlock) { - if (!folio) + if (IS_ERR_OR_NULL(folio)) return; if (unlock) { @@ -2890,12 +2912,12 @@ static inline void f2fs_put_page(struct page *page, int unlock) static inline void f2fs_put_dnode(struct dnode_of_data *dn) { - if (dn->node_page) - f2fs_put_page(dn->node_page, 1); - if (dn->inode_page && dn->node_page != dn->inode_page) - f2fs_put_page(dn->inode_page, 0); - dn->node_page = NULL; - dn->inode_page = NULL; + if (dn->node_folio) + f2fs_folio_put(dn->node_folio, true); + if (dn->inode_folio && dn->node_folio != dn->inode_folio) + f2fs_folio_put(dn->inode_folio, false); + dn->node_folio = NULL; + dn->inode_folio = NULL; } static inline struct kmem_cache *f2fs_kmem_cache_create(const char *name, @@ -3019,21 +3041,21 @@ static inline unsigned int get_dnode_base(struct inode *inode, } static inline __le32 *get_dnode_addr(struct inode *inode, - struct page *node_page) + struct folio *node_folio) { - return blkaddr_in_node(F2FS_NODE(node_page)) + - get_dnode_base(inode, node_page); + return blkaddr_in_node(F2FS_NODE(&node_folio->page)) + + get_dnode_base(inode, &node_folio->page); } static inline block_t data_blkaddr(struct inode *inode, - struct page *node_page, unsigned int offset) + struct folio *node_folio, unsigned int offset) { - return le32_to_cpu(*(get_dnode_addr(inode, node_page) + offset)); + return le32_to_cpu(*(get_dnode_addr(inode, node_folio) + offset)); } static inline block_t f2fs_data_blkaddr(struct dnode_of_data *dn) { - return data_blkaddr(dn->inode, dn->node_page, dn->ofs_in_node); + return data_blkaddr(dn->inode, dn->node_folio, dn->ofs_in_node); } static inline int f2fs_test_bit(unsigned int nr, char *addr) @@ -3344,9 +3366,9 @@ static inline unsigned int addrs_per_page(struct inode *inode, return addrs; } -static inline void *inline_xattr_addr(struct inode *inode, struct page *page) +static inline void *inline_xattr_addr(struct inode *inode, struct folio *folio) { - struct f2fs_inode *ri = F2FS_INODE(page); + struct f2fs_inode *ri = F2FS_INODE(&folio->page); return (void *)&(ri->i_addr[DEF_ADDRS_PER_INODE - get_inline_xattr_addrs(inode)]); @@ -3361,7 +3383,7 @@ static inline int inline_xattr_size(struct inode *inode) /* * Notice: check inline_data flag without inode page lock is unsafe. - * It could change at any time by f2fs_convert_inline_page(). + * It could change at any time by f2fs_convert_inline_folio(). 
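f2fs_folio_put() above now tolerates ERR_PTR values as well as NULL, so error paths can release whatever a getter returned without an extra check. A hedged sketch of the simplification this allows; the function name and the validation step are illustrative:

/* Sketch: put the folio unconditionally, even after a failed lookup. */
static int sketch_check_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino)
{
        struct folio *folio = f2fs_get_inode_folio(sbi, ino);

        if (!IS_ERR(folio)) {
                /* ... validate the locked inode folio here ... */
        }

        f2fs_folio_put(folio, true);    /* no-op for ERR_PTR or NULL */
        return IS_ERR(folio) ? PTR_ERR(folio) : 0;
}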
*/ static inline int f2fs_has_inline_data(struct inode *inode) { @@ -3393,9 +3415,9 @@ static inline bool f2fs_is_cow_file(struct inode *inode) return is_inode_flag_set(inode, FI_COW_FILE); } -static inline void *inline_data_addr(struct inode *inode, struct page *page) +static inline void *inline_data_addr(struct inode *inode, struct folio *folio) { - __le32 *addr = get_dnode_addr(inode, page); + __le32 *addr = get_dnode_addr(inode, folio); return (void *)(addr + DEF_INLINE_RESERVED_SIZE); } @@ -3521,6 +3543,14 @@ static inline void *f2fs_kvzalloc(struct f2fs_sb_info *sbi, return f2fs_kvmalloc(sbi, size, flags | __GFP_ZERO); } +static inline void *f2fs_vmalloc(struct f2fs_sb_info *sbi, size_t size) +{ + if (time_to_inject(sbi, FAULT_VMALLOC)) + return NULL; + + return vmalloc(size); +} + static inline int get_extra_isize(struct inode *inode) { return F2FS_I(inode)->i_extra_isize / sizeof(__le32); @@ -3597,12 +3627,12 @@ int f2fs_pin_file_control(struct inode *inode, bool inc); * inode.c */ void f2fs_set_inode_flags(struct inode *inode); -bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page); +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct folio *folio); void f2fs_inode_chksum_set(struct f2fs_sb_info *sbi, struct page *page); struct inode *f2fs_iget(struct super_block *sb, unsigned long ino); struct inode *f2fs_iget_retry(struct super_block *sb, unsigned long ino); int f2fs_try_to_free_nats(struct f2fs_sb_info *sbi, int nr_shrink); -void f2fs_update_inode(struct inode *inode, struct page *node_page); +void f2fs_update_inode(struct inode *inode, struct folio *node_folio); void f2fs_update_inode_page(struct inode *inode); int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc); void f2fs_evict_inode(struct inode *inode); @@ -3648,23 +3678,22 @@ int f2fs_fill_dentries(struct dir_context *ctx, struct f2fs_dentry_ptr *d, unsigned int start_pos, struct fscrypt_str *fstr); void f2fs_do_make_empty_dir(struct inode *inode, struct inode *parent, struct f2fs_dentry_ptr *d); -struct page *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, - const struct f2fs_filename *fname, struct page *dpage); +struct folio *f2fs_init_inode_metadata(struct inode *inode, struct inode *dir, + const struct f2fs_filename *fname, struct folio *dfolio); void f2fs_update_parent_metadata(struct inode *dir, struct inode *inode, unsigned int current_depth); int f2fs_room_for_filename(const void *bitmap, int slots, int max_slots); void f2fs_drop_nlink(struct inode *dir, struct inode *inode); struct f2fs_dir_entry *__f2fs_find_entry(struct inode *dir, - const struct f2fs_filename *fname, - struct page **res_page); + const struct f2fs_filename *fname, struct folio **res_folio); struct f2fs_dir_entry *f2fs_find_entry(struct inode *dir, - const struct qstr *child, struct page **res_page); -struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct page **p); + const struct qstr *child, struct folio **res_folio); +struct f2fs_dir_entry *f2fs_parent_dir(struct inode *dir, struct folio **f); ino_t f2fs_inode_by_name(struct inode *dir, const struct qstr *qstr, - struct page **page); + struct folio **folio); void f2fs_set_link(struct inode *dir, struct f2fs_dir_entry *de, - struct page *page, struct inode *inode); -bool f2fs_has_enough_room(struct inode *dir, struct page *ipage, + struct folio *folio, struct inode *inode); +bool f2fs_has_enough_room(struct inode *dir, struct folio *ifolio, const struct f2fs_filename *fname); void f2fs_update_dentry(nid_t ino, 
umode_t mode, struct f2fs_dentry_ptr *d, const struct fscrypt_str *name, f2fs_hash_t name_hash, @@ -3675,7 +3704,7 @@ int f2fs_add_dentry(struct inode *dir, const struct f2fs_filename *fname, struct inode *inode, nid_t ino, umode_t mode); int f2fs_do_add_link(struct inode *dir, const struct qstr *name, struct inode *inode, nid_t ino, umode_t mode); -void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct page *page, +void f2fs_delete_entry(struct f2fs_dir_entry *dentry, struct folio *folio, struct inode *dir, struct inode *inode); int f2fs_do_tmpfile(struct inode *inode, struct inode *dir, struct f2fs_filename *fname); @@ -3719,10 +3748,9 @@ struct node_info; int f2fs_check_nid_range(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type); -bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, - const struct folio *folio); +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct folio *folio); void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi); -void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page); +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct folio *folio); void f2fs_reset_fsync_node_info(struct f2fs_sb_info *sbi); int f2fs_need_dentry_mark(struct f2fs_sb_info *sbi, nid_t nid); bool f2fs_is_checkpointed_node(struct f2fs_sb_info *sbi, nid_t nid); @@ -3736,15 +3764,13 @@ int f2fs_truncate_xattr_node(struct inode *inode); int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, unsigned int seq_id); int f2fs_remove_inode_page(struct inode *inode); -struct page *f2fs_new_inode_page(struct inode *inode); -struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs); +struct folio *f2fs_new_inode_folio(struct inode *inode); +struct folio *f2fs_new_node_folio(struct dnode_of_data *dn, unsigned int ofs); void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid); -struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid); +struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid); struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino); -struct page *f2fs_get_inode_page(struct f2fs_sb_info *sbi, pgoff_t ino); -struct page *f2fs_get_xnode_page(struct f2fs_sb_info *sbi, pgoff_t xnid); -struct page *f2fs_get_node_page_ra(struct page *parent, int start); -int f2fs_move_node_page(struct page *node_page, int gc_type); +struct folio *f2fs_get_xnode_folio(struct f2fs_sb_info *sbi, pgoff_t xnid); +int f2fs_move_node_folio(struct folio *node_folio, int gc_type); void f2fs_flush_inline_data(struct f2fs_sb_info *sbi); int f2fs_fsync_node_pages(struct f2fs_sb_info *sbi, struct inode *inode, struct writeback_control *wbc, bool atomic, @@ -3757,7 +3783,7 @@ bool f2fs_alloc_nid(struct f2fs_sb_info *sbi, nid_t *nid); void f2fs_alloc_nid_done(struct f2fs_sb_info *sbi, nid_t nid); void f2fs_alloc_nid_failed(struct f2fs_sb_info *sbi, nid_t nid); int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink); -int f2fs_recover_inline_xattr(struct inode *inode, struct page *page); +int f2fs_recover_inline_xattr(struct inode *inode, struct folio *folio); int f2fs_recover_xattr_data(struct inode *inode, struct page *page); int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page); int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, @@ -3807,7 +3833,7 @@ int f2fs_allocate_new_segments(struct f2fs_sb_info *sbi); int f2fs_trim_fs(struct f2fs_sb_info *sbi, struct fstrim_range *range); bool f2fs_exist_trim_candidates(struct 
f2fs_sb_info *sbi, struct cp_control *cpc); -struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno); +struct folio *f2fs_get_sum_folio(struct f2fs_sb_info *sbi, unsigned int segno); void f2fs_update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr); void f2fs_do_write_meta_page(struct f2fs_sb_info *sbi, struct folio *folio, @@ -3858,6 +3884,11 @@ unsigned int f2fs_usable_blks_in_seg(struct f2fs_sb_info *sbi, unsigned long long f2fs_get_section_mtime(struct f2fs_sb_info *sbi, unsigned int segno); +static inline struct inode *fio_inode(struct f2fs_io_info *fio) +{ + return page_folio(fio->page)->mapping->host; +} + #define DEF_FRAGMENT_SIZE 4 #define MIN_FRAGMENT_SIZE 1 #define MAX_FRAGMENT_SIZE 512 @@ -3874,10 +3905,10 @@ static inline bool f2fs_need_rand_seg(struct f2fs_sb_info *sbi) void f2fs_stop_checkpoint(struct f2fs_sb_info *sbi, bool end_io, unsigned char reason); void f2fs_flush_ckpt_thread(struct f2fs_sb_info *sbi); -struct page *f2fs_grab_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); -struct page *f2fs_get_meta_page(struct f2fs_sb_info *sbi, pgoff_t index); -struct page *f2fs_get_meta_page_retry(struct f2fs_sb_info *sbi, pgoff_t index); -struct page *f2fs_get_tmp_page(struct f2fs_sb_info *sbi, pgoff_t index); +struct folio *f2fs_grab_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index); +struct folio *f2fs_get_meta_folio(struct f2fs_sb_info *sbi, pgoff_t index); +struct folio *f2fs_get_meta_folio_retry(struct f2fs_sb_info *sbi, pgoff_t index); +struct folio *f2fs_get_tmp_folio(struct f2fs_sb_info *sbi, pgoff_t index); bool f2fs_is_valid_blkaddr(struct f2fs_sb_info *sbi, block_t blkaddr, int type); bool f2fs_is_valid_blkaddr_raw(struct f2fs_sb_info *sbi, @@ -3933,7 +3964,7 @@ void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi, struct inode *inode, struct page *page, nid_t ino, enum page_type type); void f2fs_submit_merged_ipu_write(struct f2fs_sb_info *sbi, - struct bio **bio, struct page *page); + struct bio **bio, struct folio *folio); void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi); int f2fs_submit_page_bio(struct f2fs_io_info *fio); int f2fs_merge_page_bio(struct f2fs_io_info *fio); @@ -3953,8 +3984,8 @@ struct folio *f2fs_find_data_folio(struct inode *inode, pgoff_t index, pgoff_t *next_pgofs); struct folio *f2fs_get_lock_data_folio(struct inode *inode, pgoff_t index, bool for_write); -struct page *f2fs_get_new_data_page(struct inode *inode, - struct page *ipage, pgoff_t index, bool new_i_size); +struct folio *f2fs_get_new_data_folio(struct inode *inode, + struct folio *ifolio, pgoff_t index, bool new_i_size); int f2fs_do_write_data_page(struct f2fs_io_info *fio); int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map, int flag); int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo, @@ -3978,22 +4009,6 @@ int f2fs_init_post_read_wq(struct f2fs_sb_info *sbi); void f2fs_destroy_post_read_wq(struct f2fs_sb_info *sbi); extern const struct iomap_ops f2fs_iomap_ops; -static inline struct page *f2fs_find_data_page(struct inode *inode, - pgoff_t index, pgoff_t *next_pgofs) -{ - struct folio *folio = f2fs_find_data_folio(inode, index, next_pgofs); - - return &folio->page; -} - -static inline struct page *f2fs_get_lock_data_page(struct inode *inode, - pgoff_t index, bool for_write) -{ - struct folio *folio = f2fs_get_lock_data_folio(inode, index, for_write); - - return &folio->page; -} - /* * gc.c */ @@ -4290,26 +4305,24 @@ extern struct kmem_cache *f2fs_inode_entry_slab; bool 
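The new fio_inode() helper above replaces the repeated fio->page->mapping->host chain, going through page_folio() so the mapping is always read from the head page. A tiny hedged usage sketch; the wrapper name is a placeholder:

/* Sketch: derive the owning inode of an I/O request via fio_inode(). */
static bool sketch_fio_is_quota(struct f2fs_io_info *fio)
{
        return IS_NOQUOTA(fio_inode(fio));
}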
f2fs_may_inline_data(struct inode *inode); bool f2fs_sanity_check_inline_data(struct inode *inode, struct page *ipage); bool f2fs_may_inline_dentry(struct inode *inode); -void f2fs_do_read_inline_data(struct folio *folio, struct page *ipage); -void f2fs_truncate_inline_inode(struct inode *inode, - struct page *ipage, u64 from); +void f2fs_do_read_inline_data(struct folio *folio, struct folio *ifolio); +void f2fs_truncate_inline_inode(struct inode *inode, struct folio *ifolio, + u64 from); int f2fs_read_inline_data(struct inode *inode, struct folio *folio); -int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page); +int f2fs_convert_inline_folio(struct dnode_of_data *dn, struct folio *folio); int f2fs_convert_inline_inode(struct inode *inode); int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry); int f2fs_write_inline_data(struct inode *inode, struct folio *folio); -int f2fs_recover_inline_data(struct inode *inode, struct page *npage); +int f2fs_recover_inline_data(struct inode *inode, struct folio *nfolio); struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, - const struct f2fs_filename *fname, - struct page **res_page, - bool use_hash); + const struct f2fs_filename *fname, struct folio **res_folio, + bool use_hash); int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent, - struct page *ipage); + struct folio *ifolio); int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, struct inode *inode, nid_t ino, umode_t mode); void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, - struct page *page, struct inode *dir, - struct inode *inode); + struct folio *folio, struct inode *dir, struct inode *inode); bool f2fs_empty_inline_dir(struct inode *dir); int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct fscrypt_str *fstr); @@ -4342,7 +4355,7 @@ int __init f2fs_create_extent_cache(void); void f2fs_destroy_extent_cache(void); /* read extent cache ops */ -void f2fs_init_read_extent_tree(struct inode *inode, struct page *ipage); +void f2fs_init_read_extent_tree(struct inode *inode, struct folio *ifolio); bool f2fs_lookup_read_extent_cache(struct inode *inode, pgoff_t pgofs, struct extent_info *ei); bool f2fs_lookup_read_extent_cache_block(struct inode *inode, pgoff_t index, @@ -4423,7 +4436,7 @@ enum cluster_check_type { CLUSTER_RAW_BLKS /* return # of raw blocks in a cluster */ }; bool f2fs_is_compressed_page(struct page *page); -struct page *f2fs_compress_control_page(struct page *page); +struct folio *f2fs_compress_control_folio(struct folio *folio); int f2fs_prepare_compress_overwrite(struct inode *inode, struct page **pagep, pgoff_t index, void **fsdata); bool f2fs_compress_write_end(struct inode *inode, void *fsdata, @@ -4458,7 +4471,7 @@ int f2fs_read_multi_pages(struct compress_ctx *cc, struct bio **bio_ret, struct decompress_io_ctx *f2fs_alloc_dic(struct compress_ctx *cc); void f2fs_decompress_end_io(struct decompress_io_ctx *dic, bool failed, bool in_task); -void f2fs_put_page_dic(struct page *page, bool in_task); +void f2fs_put_folio_dic(struct folio *folio, bool in_task); unsigned int f2fs_cluster_blocks_are_contiguous(struct dnode_of_data *dn, unsigned int ofs_in_node); int f2fs_init_compress_ctx(struct compress_ctx *cc); @@ -4475,7 +4488,7 @@ void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi, block_t blkaddr, unsigned int len); void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, nid_t ino, block_t blkaddr); -bool 
f2fs_load_compressed_page(struct f2fs_sb_info *sbi, struct page *page, +bool f2fs_load_compressed_folio(struct f2fs_sb_info *sbi, struct folio *folio, block_t blkaddr); void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino); #define inc_compr_inode_stat(inode) \ @@ -4500,7 +4513,7 @@ static inline bool f2fs_is_compress_backend_ready(struct inode *inode) return false; } static inline bool f2fs_is_compress_level_valid(int alg, int lvl) { return false; } -static inline struct page *f2fs_compress_control_page(struct page *page) +static inline struct folio *f2fs_compress_control_folio(struct folio *folio) { WARN_ON_ONCE(1); return ERR_PTR(-EINVAL); @@ -4514,7 +4527,7 @@ static inline void f2fs_end_read_compressed_page(struct page *page, { WARN_ON_ONCE(1); } -static inline void f2fs_put_page_dic(struct page *page, bool in_task) +static inline void f2fs_put_folio_dic(struct folio *folio, bool in_task) { WARN_ON_ONCE(1); } @@ -4531,8 +4544,8 @@ static inline void f2fs_invalidate_compress_pages_range(struct f2fs_sb_info *sbi block_t blkaddr, unsigned int len) { } static inline void f2fs_cache_compressed_page(struct f2fs_sb_info *sbi, struct page *page, nid_t ino, block_t blkaddr) { } -static inline bool f2fs_load_compressed_page(struct f2fs_sb_info *sbi, - struct page *page, block_t blkaddr) { return false; } +static inline bool f2fs_load_compressed_folio(struct f2fs_sb_info *sbi, + struct folio *folio, block_t blkaddr) { return false; } static inline void f2fs_invalidate_compress_pages(struct f2fs_sb_info *sbi, nid_t ino) { } #define inc_compr_inode_stat(inode) do { } while (0) @@ -4622,12 +4635,16 @@ F2FS_FEATURE_FUNCS(readonly, RO); F2FS_FEATURE_FUNCS(device_alias, DEVICE_ALIAS); #ifdef CONFIG_BLK_DEV_ZONED -static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, - block_t blkaddr) +static inline bool f2fs_zone_is_seq(struct f2fs_sb_info *sbi, int devi, + unsigned int zone) { - unsigned int zno = blkaddr / sbi->blocks_per_blkz; + return test_bit(zone, FDEV(devi).blkz_seq); +} - return test_bit(zno, FDEV(devi).blkz_seq); +static inline bool f2fs_blkz_is_seq(struct f2fs_sb_info *sbi, int devi, + block_t blkaddr) +{ + return f2fs_zone_is_seq(sbi, devi, blkaddr / sbi->blocks_per_blkz); } #endif @@ -4699,15 +4716,31 @@ static inline bool f2fs_lfs_mode(struct f2fs_sb_info *sbi) return F2FS_OPTION(sbi).fs_mode == FS_MODE_LFS; } -static inline bool f2fs_valid_pinned_area(struct f2fs_sb_info *sbi, +static inline bool f2fs_is_sequential_zone_area(struct f2fs_sb_info *sbi, block_t blkaddr) { if (f2fs_sb_has_blkzoned(sbi)) { +#ifdef CONFIG_BLK_DEV_ZONED int devi = f2fs_target_device_index(sbi, blkaddr); - return !bdev_is_zoned(FDEV(devi).bdev); + if (!bdev_is_zoned(FDEV(devi).bdev)) + return false; + + if (f2fs_is_multi_device(sbi)) { + if (blkaddr < FDEV(devi).start_blk || + blkaddr > FDEV(devi).end_blk) { + f2fs_err(sbi, "Invalid block %x", blkaddr); + return false; + } + blkaddr -= FDEV(devi).start_blk; + } + + return f2fs_blkz_is_seq(sbi, devi, blkaddr); +#else + return false; +#endif } - return true; + return false; } static inline bool f2fs_low_mem_mode(struct f2fs_sb_info *sbi) @@ -4762,10 +4795,11 @@ static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx) #ifdef CONFIG_F2FS_FAULT_INJECTION extern int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate, - unsigned long type); + unsigned long type, enum fault_option fo); #else static inline int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, - unsigned long rate, unsigned long type) 
+ unsigned long rate, unsigned long type, + enum fault_option fo) { return 0; } @@ -4795,6 +4829,19 @@ static inline void f2fs_io_schedule_timeout(long timeout) io_schedule_timeout(timeout); } +static inline void f2fs_io_schedule_timeout_killable(long timeout) +{ + while (timeout) { + if (fatal_signal_pending(current)) + return; + set_current_state(TASK_UNINTERRUPTIBLE); + io_schedule_timeout(DEFAULT_IO_TIMEOUT); + if (timeout <= DEFAULT_IO_TIMEOUT) + return; + timeout -= DEFAULT_IO_TIMEOUT; + } +} + static inline void f2fs_handle_page_eio(struct f2fs_sb_info *sbi, struct folio *folio, enum page_type type) { @@ -4824,13 +4871,13 @@ static inline void f2fs_truncate_meta_inode_pages(struct f2fs_sb_info *sbi, int i = 0; do { - struct page *page; + struct folio *folio; - page = find_get_page(META_MAPPING(sbi), blkaddr + i); - if (page) { - if (folio_test_writeback(page_folio(page))) + folio = filemap_get_folio(META_MAPPING(sbi), blkaddr + i); + if (!IS_ERR(folio)) { + if (folio_test_writeback(folio)) need_submit = true; - f2fs_put_page(page, 0); + f2fs_folio_put(folio, false); } } while (++i < cnt && !need_submit); diff --git a/fs/f2fs/file.c b/fs/f2fs/file.c index abbcbb5865a3..6bd3de64f2a8 100644 --- a/fs/f2fs/file.c +++ b/fs/f2fs/file.c @@ -131,7 +131,7 @@ static vm_fault_t f2fs_vm_page_mkwrite(struct vm_fault *vmf) goto out_sem; } - f2fs_wait_on_page_writeback(folio_page(folio, 0), DATA, false, true); + f2fs_folio_wait_writeback(folio, DATA, false, true); /* wait for GCed page writeback via META_MAPPING */ f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); @@ -226,12 +226,13 @@ static inline enum cp_reason_type need_do_checkpoint(struct inode *inode) static bool need_inode_page_update(struct f2fs_sb_info *sbi, nid_t ino) { - struct page *i = find_get_page(NODE_MAPPING(sbi), ino); + struct folio *i = filemap_get_folio(NODE_MAPPING(sbi), ino); bool ret = false; /* But we need to avoid that there are some inode updates */ - if ((i && PageDirty(i)) || f2fs_need_inode_block_update(sbi, ino)) + if ((!IS_ERR(i) && folio_test_dirty(i)) || + f2fs_need_inode_block_update(sbi, ino)) ret = true; - f2fs_put_page(i, 0); + f2fs_folio_put(i, false); return ret; } @@ -260,7 +261,6 @@ static int f2fs_do_sync_file(struct file *file, loff_t start, loff_t end, struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = LONG_MAX, - .for_reclaim = 0, }; unsigned int seq_id = 0; @@ -403,7 +403,7 @@ static bool __found_offset(struct address_space *mapping, bool compressed_cluster = false; if (f2fs_compressed_file(inode)) { - block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_page, + block_t first_blkaddr = data_blkaddr(dn->inode, dn->node_folio, ALIGN_DOWN(dn->ofs_in_node, F2FS_I(inode)->i_cluster_size)); compressed_cluster = first_blkaddr == COMPRESS_ADDR; @@ -473,7 +473,7 @@ static loff_t f2fs_seek_block(struct file *file, loff_t offset, int whence) } } - end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); /* find data/hole in dnode block */ for (; dn.ofs_in_node < end_offset; @@ -554,19 +554,21 @@ static int f2fs_file_mmap(struct file *file, struct vm_area_struct *vma) static int finish_preallocate_blocks(struct inode *inode) { - int ret; + int ret = 0; + bool opened; - inode_lock(inode); - if (is_inode_flag_set(inode, FI_OPENED_FILE)) { - inode_unlock(inode); + f2fs_down_read(&F2FS_I(inode)->i_sem); + opened = is_inode_flag_set(inode, FI_OPENED_FILE); + f2fs_up_read(&F2FS_I(inode)->i_sem); + if (opened) return 0; - } - if 
(!file_should_truncate(inode)) { - set_inode_flag(inode, FI_OPENED_FILE); - inode_unlock(inode); - return 0; - } + inode_lock(inode); + if (is_inode_flag_set(inode, FI_OPENED_FILE)) + goto out_unlock; + + if (!file_should_truncate(inode)) + goto out_update; f2fs_down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); filemap_invalidate_lock(inode->i_mapping); @@ -576,16 +578,17 @@ static int finish_preallocate_blocks(struct inode *inode) filemap_invalidate_unlock(inode->i_mapping); f2fs_up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]); - - if (!ret) - set_inode_flag(inode, FI_OPENED_FILE); - - inode_unlock(inode); if (ret) - return ret; + goto out_unlock; file_dont_truncate(inode); - return 0; +out_update: + f2fs_down_write(&F2FS_I(inode)->i_sem); + set_inode_flag(inode, FI_OPENED_FILE); + f2fs_up_write(&F2FS_I(inode)->i_sem); +out_unlock: + inode_unlock(inode); + return ret; } static int f2fs_file_open(struct inode *inode, struct file *filp) @@ -624,7 +627,7 @@ void f2fs_truncate_data_blocks_range(struct dnode_of_data *dn, int count) block_t blkstart; int blklen = 0; - addr = get_dnode_addr(dn->inode, dn->node_page) + ofs; + addr = get_dnode_addr(dn->inode, dn->node_folio) + ofs; blkstart = le32_to_cpu(*addr); /* Assumption: truncation starts with cluster */ @@ -688,7 +691,7 @@ next: * once we invalidate valid blkaddr in range [ofs, ofs + count], * we will invalidate all blkaddr in the whole range. */ - fofs = f2fs_start_bidx_of_node(ofs_of_node(dn->node_page), + fofs = f2fs_start_bidx_of_node(ofs_of_node(&dn->node_folio->page), dn->inode) + ofs; f2fs_update_read_extent_cache_range(dn, fofs, 0, len); f2fs_update_age_extent_cache_range(dn, fofs, len); @@ -743,7 +746,7 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) struct dnode_of_data dn; pgoff_t free_from; int count = 0, err = 0; - struct page *ipage; + struct folio *ifolio; bool truncate_page = false; trace_f2fs_truncate_blocks_enter(inode, from); @@ -761,9 +764,9 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) if (lock) f2fs_lock_op(sbi); - ipage = f2fs_get_inode_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - err = PTR_ERR(ipage); + ifolio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(ifolio)) { + err = PTR_ERR(ifolio); goto out; } @@ -776,18 +779,18 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) dec_valid_block_count(sbi, inode, ei.len); f2fs_update_time(sbi, REQ_TIME); - f2fs_put_page(ipage, 1); + f2fs_folio_put(ifolio, true); goto out; } if (f2fs_has_inline_data(inode)) { - f2fs_truncate_inline_inode(inode, ipage, from); - f2fs_put_page(ipage, 1); + f2fs_truncate_inline_inode(inode, ifolio, from); + f2fs_folio_put(ifolio, true); truncate_page = true; goto out; } - set_new_dnode(&dn, inode, ipage, NULL, 0); + set_new_dnode(&dn, inode, ifolio, NULL, 0); err = f2fs_get_dnode_of_data(&dn, free_from, LOOKUP_NODE_RA); if (err) { if (err == -ENOENT) @@ -795,12 +798,12 @@ int f2fs_do_truncate_blocks(struct inode *inode, u64 from, bool lock) goto out; } - count = ADDRS_PER_PAGE(dn.node_page, inode); + count = ADDRS_PER_PAGE(&dn.node_folio->page, inode); count -= dn.ofs_in_node; f2fs_bug_on(sbi, count < 0); - if (dn.ofs_in_node || IS_INODE(dn.node_page)) { + if (dn.ofs_in_node || IS_INODE(&dn.node_folio->page)) { f2fs_truncate_data_blocks_range(&dn, count); free_from += count; } @@ -1161,7 +1164,7 @@ static int fill_zero(struct inode *inode, pgoff_t index, loff_t start, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct page *page; + struct folio 
*folio; if (!len) return 0; @@ -1169,16 +1172,16 @@ static int fill_zero(struct inode *inode, pgoff_t index, f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); - page = f2fs_get_new_data_page(inode, NULL, index, false); + folio = f2fs_get_new_data_folio(inode, NULL, index, false); f2fs_unlock_op(sbi); - if (IS_ERR(page)) - return PTR_ERR(page); + if (IS_ERR(folio)) + return PTR_ERR(folio); - f2fs_wait_on_page_writeback(page, DATA, true, true); - zero_user(page, start, len); - set_page_dirty(page); - f2fs_put_page(page, 1); + f2fs_folio_wait_writeback(folio, DATA, true, true); + folio_zero_range(folio, start, len); + folio_mark_dirty(folio); + f2fs_folio_put(folio, true); return 0; } @@ -1201,7 +1204,7 @@ int f2fs_truncate_hole(struct inode *inode, pgoff_t pg_start, pgoff_t pg_end) return err; } - end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); count = min(end_offset - dn.ofs_in_node, pg_end - pg_start); f2fs_bug_on(F2FS_I_SB(inode), count == 0 || count > end_offset); @@ -1296,7 +1299,7 @@ next_dnode: goto next; } - done = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, inode) - + done = min((pgoff_t)ADDRS_PER_PAGE(&dn.node_folio->page, inode) - dn.ofs_in_node, len); for (i = 0; i < done; i++, blkaddr++, do_replace++, dn.ofs_in_node++) { *blkaddr = f2fs_data_blkaddr(&dn); @@ -1385,7 +1388,7 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, } ilen = min((pgoff_t) - ADDRS_PER_PAGE(dn.node_page, dst_inode) - + ADDRS_PER_PAGE(&dn.node_folio->page, dst_inode) - dn.ofs_in_node, len - i); do { dn.data_blkaddr = f2fs_data_blkaddr(&dn); @@ -1410,26 +1413,26 @@ static int __clone_blkaddrs(struct inode *src_inode, struct inode *dst_inode, f2fs_put_dnode(&dn); } else { - struct page *psrc, *pdst; + struct folio *fsrc, *fdst; - psrc = f2fs_get_lock_data_page(src_inode, + fsrc = f2fs_get_lock_data_folio(src_inode, src + i, true); - if (IS_ERR(psrc)) - return PTR_ERR(psrc); - pdst = f2fs_get_new_data_page(dst_inode, NULL, dst + i, + if (IS_ERR(fsrc)) + return PTR_ERR(fsrc); + fdst = f2fs_get_new_data_folio(dst_inode, NULL, dst + i, true); - if (IS_ERR(pdst)) { - f2fs_put_page(psrc, 1); - return PTR_ERR(pdst); + if (IS_ERR(fdst)) { + f2fs_folio_put(fsrc, true); + return PTR_ERR(fdst); } - f2fs_wait_on_page_writeback(pdst, DATA, true, true); + f2fs_folio_wait_writeback(fdst, DATA, true, true); - memcpy_page(pdst, 0, psrc, 0, PAGE_SIZE); - set_page_dirty(pdst); - set_page_private_gcing(pdst); - f2fs_put_page(pdst, 1); - f2fs_put_page(psrc, 1); + memcpy_folio(fdst, 0, fsrc, 0, PAGE_SIZE); + folio_mark_dirty(fdst); + set_page_private_gcing(&fdst->page); + f2fs_folio_put(fdst, true); + f2fs_folio_put(fsrc, true); ret = f2fs_truncate_hole(src_inode, src + i, src + i + 1); @@ -1675,7 +1678,7 @@ static int f2fs_zero_range(struct inode *inode, loff_t offset, loff_t len, goto out; } - end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); end = min(pg_end, end_offset - dn.ofs_in_node + index); ret = f2fs_do_zero_range(&dn, index, end); @@ -2464,19 +2467,20 @@ static int f2fs_ioc_shutdown(struct file *filp, unsigned long arg) return ret; } -static void f2fs_keep_noreuse_range(struct inode *inode, +static int f2fs_keep_noreuse_range(struct inode *inode, loff_t offset, loff_t len) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); u64 max_bytes = F2FS_BLK_TO_BYTES(max_file_blocks(inode)); u64 start, end; + int ret = 0; if (!S_ISREG(inode->i_mode)) - return; + return 0; if (offset >= 
max_bytes || len > max_bytes || (offset + len) > max_bytes) - return; + return 0; start = offset >> PAGE_SHIFT; end = DIV_ROUND_UP(offset + len, PAGE_SIZE); @@ -2484,7 +2488,7 @@ static void f2fs_keep_noreuse_range(struct inode *inode, inode_lock(inode); if (f2fs_is_atomic_file(inode)) { inode_unlock(inode); - return; + return 0; } spin_lock(&sbi->inode_lock[DONATE_INODE]); @@ -2493,7 +2497,12 @@ static void f2fs_keep_noreuse_range(struct inode *inode, if (!list_empty(&F2FS_I(inode)->gdonate_list)) { list_del_init(&F2FS_I(inode)->gdonate_list); sbi->donate_files--; - } + if (is_inode_flag_set(inode, FI_DONATE_FINISHED)) + ret = -EALREADY; + else + set_inode_flag(inode, FI_DONATE_FINISHED); + } else + ret = -ENOENT; } else { if (list_empty(&F2FS_I(inode)->gdonate_list)) { list_add_tail(&F2FS_I(inode)->gdonate_list, @@ -2505,9 +2514,12 @@ static void f2fs_keep_noreuse_range(struct inode *inode, } F2FS_I(inode)->donate_start = start; F2FS_I(inode)->donate_end = end - 1; + clear_inode_flag(inode, FI_DONATE_FINISHED); } spin_unlock(&sbi->inode_lock[DONATE_INODE]); inode_unlock(inode); + + return ret; } static int f2fs_ioc_fitrim(struct file *filp, unsigned long arg) @@ -2920,19 +2932,19 @@ do_map: idx = map.m_lblk; while (idx < map.m_lblk + map.m_len && cnt < BLKS_PER_SEG(sbi)) { - struct page *page; + struct folio *folio; - page = f2fs_get_lock_data_page(inode, idx, true); - if (IS_ERR(page)) { - err = PTR_ERR(page); + folio = f2fs_get_lock_data_folio(inode, idx, true); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); goto clear_out; } - f2fs_wait_on_page_writeback(page, DATA, true, true); + f2fs_folio_wait_writeback(folio, DATA, true, true); - set_page_dirty(page); - set_page_private_gcing(page); - f2fs_put_page(page, 1); + folio_mark_dirty(folio); + set_page_private_gcing(&folio->page); + f2fs_folio_put(folio, true); idx++; cnt++; @@ -3711,7 +3723,7 @@ static int release_compress_blocks(struct dnode_of_data *dn, pgoff_t count) int i; for (i = 0; i < count; i++) { - blkaddr = data_blkaddr(dn->inode, dn->node_page, + blkaddr = data_blkaddr(dn->inode, dn->node_folio, dn->ofs_in_node + i); if (!__is_valid_data_blkaddr(blkaddr)) @@ -3829,7 +3841,7 @@ static int f2fs_release_compress_blocks(struct file *filp, unsigned long arg) break; } - end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); count = min(end_offset - dn.ofs_in_node, last_idx - page_idx); count = round_up(count, fi->i_cluster_size); @@ -3880,7 +3892,7 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count, int i; for (i = 0; i < count; i++) { - blkaddr = data_blkaddr(dn->inode, dn->node_page, + blkaddr = data_blkaddr(dn->inode, dn->node_folio, dn->ofs_in_node + i); if (!__is_valid_data_blkaddr(blkaddr)) @@ -3897,7 +3909,7 @@ static int reserve_compress_blocks(struct dnode_of_data *dn, pgoff_t count, int ret; for (i = 0; i < cluster_size; i++) { - blkaddr = data_blkaddr(dn->inode, dn->node_page, + blkaddr = data_blkaddr(dn->inode, dn->node_folio, dn->ofs_in_node + i); if (i == 0) { @@ -4007,7 +4019,7 @@ static int f2fs_reserve_compress_blocks(struct file *filp, unsigned long arg) break; } - end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); count = min(end_offset - dn.ofs_in_node, last_idx - page_idx); count = round_up(count, fi->i_cluster_size); @@ -4171,7 +4183,7 @@ static int f2fs_sec_trim_file(struct file *filp, unsigned long arg) goto out; } - end_offset = ADDRS_PER_PAGE(dn.node_page, inode); + 
end_offset = ADDRS_PER_PAGE(&dn.node_folio->page, inode); count = min(end_offset - dn.ofs_in_node, pg_end - index); for (i = 0; i < count; i++, index++, dn.ofs_in_node++) { struct block_device *cur_bdev; @@ -4343,34 +4355,36 @@ static int redirty_blocks(struct inode *inode, pgoff_t page_idx, int len) { DEFINE_READAHEAD(ractl, NULL, NULL, inode->i_mapping, page_idx); struct address_space *mapping = inode->i_mapping; - struct page *page; + struct folio *folio; pgoff_t redirty_idx = page_idx; - int i, page_len = 0, ret = 0; + int page_len = 0, ret = 0; page_cache_ra_unbounded(&ractl, len, 0); - for (i = 0; i < len; i++, page_idx++) { - page = read_cache_page(mapping, page_idx, NULL, NULL); - if (IS_ERR(page)) { - ret = PTR_ERR(page); + do { + folio = read_cache_folio(mapping, page_idx, NULL, NULL); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); break; } - page_len++; - } + page_len += folio_nr_pages(folio) - (page_idx - folio->index); + page_idx = folio_next_index(folio); + } while (page_len < len); - for (i = 0; i < page_len; i++, redirty_idx++) { - page = find_lock_page(mapping, redirty_idx); + do { + folio = filemap_lock_folio(mapping, redirty_idx); - /* It will never fail, when page has pinned above */ - f2fs_bug_on(F2FS_I_SB(inode), !page); + /* It will never fail, when folio has pinned above */ + f2fs_bug_on(F2FS_I_SB(inode), IS_ERR(folio)); - f2fs_wait_on_page_writeback(page, DATA, true, true); + f2fs_folio_wait_writeback(folio, DATA, true, true); - set_page_dirty(page); - set_page_private_gcing(page); - f2fs_put_page(page, 1); - f2fs_put_page(page, 0); - } + folio_mark_dirty(folio); + set_page_private_gcing(&folio->page); + redirty_idx = folio_next_index(folio); + folio_unlock(folio); + folio_put_refs(folio, 2); + } while (redirty_idx < page_idx); return ret; } @@ -5236,8 +5250,8 @@ static int f2fs_file_fadvise(struct file *filp, loff_t offset, loff_t len, f2fs_compressed_file(inode))) f2fs_invalidate_compress_pages(F2FS_I_SB(inode), inode->i_ino); else if (advice == POSIX_FADV_NOREUSE) - f2fs_keep_noreuse_range(inode, offset, len); - return 0; + err = f2fs_keep_noreuse_range(inode, offset, len); + return err; } #ifdef CONFIG_COMPAT diff --git a/fs/f2fs/gc.c b/fs/f2fs/gc.c index dd0ba0532e01..3cb5242f4ddf 100644 --- a/fs/f2fs/gc.c +++ b/fs/f2fs/gc.c @@ -1045,7 +1045,7 @@ next_step: for (off = 0; off < usable_blks_in_seg; off++, entry++) { nid_t nid = le32_to_cpu(entry->nid); - struct page *node_page; + struct folio *node_folio; struct node_info ni; int err; @@ -1068,27 +1068,27 @@ next_step: } /* phase == 2 */ - node_page = f2fs_get_node_page(sbi, nid); - if (IS_ERR(node_page)) + node_folio = f2fs_get_node_folio(sbi, nid); + if (IS_ERR(node_folio)) continue; - /* block may become invalid during f2fs_get_node_page */ + /* block may become invalid during f2fs_get_node_folio */ if (check_valid_map(sbi, segno, off) == 0) { - f2fs_put_page(node_page, 1); + f2fs_folio_put(node_folio, true); continue; } if (f2fs_get_node_info(sbi, nid, &ni, false)) { - f2fs_put_page(node_page, 1); + f2fs_folio_put(node_folio, true); continue; } if (ni.blk_addr != start_addr + off) { - f2fs_put_page(node_page, 1); + f2fs_folio_put(node_folio, true); continue; } - err = f2fs_move_node_page(node_page, gc_type); + err = f2fs_move_node_folio(node_folio, gc_type); if (!err && gc_type == FG_GC) submitted++; stat_inc_node_blk_count(sbi, 1, gc_type); @@ -1134,7 +1134,7 @@ block_t f2fs_start_bidx_of_node(unsigned int node_ofs, struct inode *inode) static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary 
*sum, struct node_info *dni, block_t blkaddr, unsigned int *nofs) { - struct page *node_page; + struct folio *node_folio; nid_t nid; unsigned int ofs_in_node, max_addrs, base; block_t source_blkaddr; @@ -1142,12 +1142,12 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, nid = le32_to_cpu(sum->nid); ofs_in_node = le16_to_cpu(sum->ofs_in_node); - node_page = f2fs_get_node_page(sbi, nid); - if (IS_ERR(node_page)) + node_folio = f2fs_get_node_folio(sbi, nid); + if (IS_ERR(node_folio)) return false; if (f2fs_get_node_info(sbi, nid, dni, false)) { - f2fs_put_page(node_page, 1); + f2fs_folio_put(node_folio, true); return false; } @@ -1158,12 +1158,12 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, } if (f2fs_check_nid_range(sbi, dni->ino)) { - f2fs_put_page(node_page, 1); + f2fs_folio_put(node_folio, true); return false; } - if (IS_INODE(node_page)) { - base = offset_in_addr(F2FS_INODE(node_page)); + if (IS_INODE(&node_folio->page)) { + base = offset_in_addr(F2FS_INODE(&node_folio->page)); max_addrs = DEF_ADDRS_PER_INODE; } else { base = 0; @@ -1173,13 +1173,13 @@ static bool is_alive(struct f2fs_sb_info *sbi, struct f2fs_summary *sum, if (base + ofs_in_node >= max_addrs) { f2fs_err(sbi, "Inconsistent blkaddr offset: base:%u, ofs_in_node:%u, max:%u, ino:%u, nid:%u", base, ofs_in_node, max_addrs, dni->ino, dni->nid); - f2fs_put_page(node_page, 1); + f2fs_folio_put(node_folio, true); return false; } - *nofs = ofs_of_node(node_page); - source_blkaddr = data_blkaddr(NULL, node_page, ofs_in_node); - f2fs_put_page(node_page, 1); + *nofs = ofs_of_node(&node_folio->page); + source_blkaddr = data_blkaddr(NULL, node_folio, ofs_in_node); + f2fs_folio_put(node_folio, true); if (source_blkaddr != blkaddr) { #ifdef CONFIG_F2FS_CHECK_FS @@ -1205,7 +1205,7 @@ static int ra_data_block(struct inode *inode, pgoff_t index) struct address_space *mapping = f2fs_is_cow_file(inode) ? F2FS_I(inode)->atomic_inode->i_mapping : inode->i_mapping; struct dnode_of_data dn; - struct page *page; + struct folio *folio; struct f2fs_io_info fio = { .sbi = sbi, .ino = inode->i_ino, @@ -1218,16 +1218,16 @@ static int ra_data_block(struct inode *inode, pgoff_t index) }; int err; - page = f2fs_grab_cache_page(mapping, index, true); - if (!page) - return -ENOMEM; + folio = f2fs_grab_cache_folio(mapping, index, true); + if (IS_ERR(folio)) + return PTR_ERR(folio); if (f2fs_lookup_read_extent_cache_block(inode, index, &dn.data_blkaddr)) { if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE_READ))) { err = -EFSCORRUPTED; - goto put_page; + goto put_folio; } goto got_it; } @@ -1235,28 +1235,28 @@ static int ra_data_block(struct inode *inode, pgoff_t index) set_new_dnode(&dn, inode, NULL, NULL, 0); err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE); if (err) - goto put_page; + goto put_folio; f2fs_put_dnode(&dn); if (!__is_valid_data_blkaddr(dn.data_blkaddr)) { err = -ENOENT; - goto put_page; + goto put_folio; } if (unlikely(!f2fs_is_valid_blkaddr(sbi, dn.data_blkaddr, DATA_GENERIC_ENHANCE))) { err = -EFSCORRUPTED; - goto put_page; + goto put_folio; } got_it: - /* read page */ - fio.page = page; + /* read folio */ + fio.page = &folio->page; fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; /* * don't cache encrypted data into meta inode until previous dirty * data were writebacked to avoid racing between GC and flush. 
*/ - f2fs_wait_on_page_writeback(page, DATA, true, true); + f2fs_folio_wait_writeback(folio, DATA, true, true); f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); @@ -1265,14 +1265,14 @@ got_it: FGP_LOCK | FGP_CREAT, GFP_NOFS); if (!fio.encrypted_page) { err = -ENOMEM; - goto put_page; + goto put_folio; } err = f2fs_submit_page_bio(&fio); if (err) goto put_encrypted_page; f2fs_put_page(fio.encrypted_page, 0); - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); f2fs_update_iostat(sbi, inode, FS_DATA_READ_IO, F2FS_BLKSIZE); f2fs_update_iostat(sbi, NULL, FS_GDATA_READ_IO, F2FS_BLKSIZE); @@ -1280,8 +1280,8 @@ got_it: return 0; put_encrypted_page: f2fs_put_page(fio.encrypted_page, 1); -put_page: - f2fs_put_page(page, 1); +put_folio: + f2fs_folio_put(folio, true); return err; } @@ -1307,7 +1307,7 @@ static int move_data_block(struct inode *inode, block_t bidx, struct dnode_of_data dn; struct f2fs_summary sum; struct node_info ni; - struct page *page, *mpage; + struct folio *folio, *mfolio; block_t newaddr; int err = 0; bool lfs_mode = f2fs_lfs_mode(fio.sbi); @@ -1316,9 +1316,9 @@ static int move_data_block(struct inode *inode, block_t bidx, CURSEG_ALL_DATA_ATGC : CURSEG_COLD_DATA; /* do not read out */ - page = f2fs_grab_cache_page(mapping, bidx, false); - if (!page) - return -ENOMEM; + folio = f2fs_grab_cache_folio(mapping, bidx, false); + if (IS_ERR(folio)) + return PTR_ERR(folio); if (!check_valid_map(F2FS_I_SB(inode), segno, off)) { err = -ENOENT; @@ -1335,7 +1335,7 @@ static int move_data_block(struct inode *inode, block_t bidx, goto out; if (unlikely(dn.data_blkaddr == NULL_ADDR)) { - ClearPageUptodate(page); + folio_clear_uptodate(folio); err = -ENOENT; goto put_out; } @@ -1344,7 +1344,7 @@ static int move_data_block(struct inode *inode, block_t bidx, * don't cache encrypted data into meta inode until previous dirty * data were writebacked to avoid racing between GC and flush. 
*/ - f2fs_wait_on_page_writeback(page, DATA, true, true); + f2fs_folio_wait_writeback(folio, DATA, true, true); f2fs_wait_on_block_writeback(inode, dn.data_blkaddr); @@ -1353,26 +1353,26 @@ static int move_data_block(struct inode *inode, block_t bidx, goto put_out; /* read page */ - fio.page = page; + fio.page = &folio->page; fio.new_blkaddr = fio.old_blkaddr = dn.data_blkaddr; if (lfs_mode) f2fs_down_write(&fio.sbi->io_order_lock); - mpage = f2fs_grab_cache_page(META_MAPPING(fio.sbi), + mfolio = f2fs_grab_cache_folio(META_MAPPING(fio.sbi), fio.old_blkaddr, false); - if (!mpage) { - err = -ENOMEM; + if (IS_ERR(mfolio)) { + err = PTR_ERR(mfolio); goto up_out; } - fio.encrypted_page = mpage; + fio.encrypted_page = folio_file_page(mfolio, fio.old_blkaddr); - /* read source block in mpage */ - if (!PageUptodate(mpage)) { + /* read source block in mfolio */ + if (!folio_test_uptodate(mfolio)) { err = f2fs_submit_page_bio(&fio); if (err) { - f2fs_put_page(mpage, 1); + f2fs_folio_put(mfolio, true); goto up_out; } @@ -1381,11 +1381,11 @@ static int move_data_block(struct inode *inode, block_t bidx, f2fs_update_iostat(fio.sbi, NULL, FS_GDATA_READ_IO, F2FS_BLKSIZE); - lock_page(mpage); - if (unlikely(mpage->mapping != META_MAPPING(fio.sbi) || - !PageUptodate(mpage))) { + folio_lock(mfolio); + if (unlikely(!is_meta_folio(mfolio) || + !folio_test_uptodate(mfolio))) { err = -EIO; - f2fs_put_page(mpage, 1); + f2fs_folio_put(mfolio, true); goto up_out; } } @@ -1396,7 +1396,7 @@ static int move_data_block(struct inode *inode, block_t bidx, err = f2fs_allocate_data_block(fio.sbi, NULL, fio.old_blkaddr, &newaddr, &sum, type, NULL); if (err) { - f2fs_put_page(mpage, 1); + f2fs_folio_put(mfolio, true); /* filesystem should shutdown, no need to recovery block */ goto up_out; } @@ -1405,15 +1405,15 @@ static int move_data_block(struct inode *inode, block_t bidx, newaddr, FGP_LOCK | FGP_CREAT, GFP_NOFS); if (!fio.encrypted_page) { err = -ENOMEM; - f2fs_put_page(mpage, 1); + f2fs_folio_put(mfolio, true); goto recover_block; } /* write target block */ f2fs_wait_on_page_writeback(fio.encrypted_page, DATA, true, true); memcpy(page_address(fio.encrypted_page), - page_address(mpage), PAGE_SIZE); - f2fs_put_page(mpage, 1); + folio_address(mfolio), PAGE_SIZE); + f2fs_folio_put(mfolio, true); f2fs_invalidate_internal_cache(fio.sbi, fio.old_blkaddr, 1); @@ -1444,7 +1444,7 @@ up_out: put_out: f2fs_put_dnode(&dn); out: - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return err; } @@ -1718,8 +1718,6 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, struct gc_inode_list *gc_list, int gc_type, bool force_migrate, bool one_time) { - struct page *sum_page; - struct f2fs_summary_block *sum; struct blk_plug plug; unsigned int segno = start_segno; unsigned int end_segno = start_segno + SEGS_PER_SEC(sbi); @@ -1769,40 +1767,40 @@ static int do_garbage_collect(struct f2fs_sb_info *sbi, /* reference all summary page */ while (segno < end_segno) { - sum_page = f2fs_get_sum_page(sbi, segno++); - if (IS_ERR(sum_page)) { - int err = PTR_ERR(sum_page); + struct folio *sum_folio = f2fs_get_sum_folio(sbi, segno++); + if (IS_ERR(sum_folio)) { + int err = PTR_ERR(sum_folio); end_segno = segno - 1; for (segno = start_segno; segno < end_segno; segno++) { - sum_page = find_get_page(META_MAPPING(sbi), + sum_folio = filemap_get_folio(META_MAPPING(sbi), GET_SUM_BLOCK(sbi, segno)); - f2fs_put_page(sum_page, 0); - f2fs_put_page(sum_page, 0); + folio_put_refs(sum_folio, 2); } return err; } - unlock_page(sum_page); + 
folio_unlock(sum_folio); } blk_start_plug(&plug); for (segno = start_segno; segno < end_segno; segno++) { + struct f2fs_summary_block *sum; /* find segment summary of victim */ - sum_page = find_get_page(META_MAPPING(sbi), + struct folio *sum_folio = filemap_get_folio(META_MAPPING(sbi), GET_SUM_BLOCK(sbi, segno)); - f2fs_put_page(sum_page, 0); if (get_valid_blocks(sbi, segno, false) == 0) goto freed; if (gc_type == BG_GC && __is_large_section(sbi) && migrated >= sbi->migration_granularity) goto skip; - if (!PageUptodate(sum_page) || unlikely(f2fs_cp_error(sbi))) + if (!folio_test_uptodate(sum_folio) || + unlikely(f2fs_cp_error(sbi))) goto skip; - sum = page_address(sum_page); + sum = folio_address(sum_folio); if (type != GET_SUM_TYPE((&sum->footer))) { f2fs_err(sbi, "Inconsistent segment (%u) type [%d, %d] in SSA and SIT", segno, type, GET_SUM_TYPE((&sum->footer))); @@ -1840,7 +1838,7 @@ freed: (segno + 1 < sec_end_segno) ? segno + 1 : NULL_SEGNO; skip: - f2fs_put_page(sum_page, 0); + folio_put_refs(sum_folio, 2); } if (submitted) @@ -2066,6 +2064,9 @@ int f2fs_gc_range(struct f2fs_sb_info *sbi, .iroot = RADIX_TREE_INIT(gc_list.iroot, GFP_NOFS), }; + if (IS_CURSEC(sbi, GET_SEC_FROM_SEG(sbi, segno))) + continue; + do_garbage_collect(sbi, segno, &gc_list, FG_GC, true, false); put_gc_inode(&gc_list); diff --git a/fs/f2fs/inline.c b/fs/f2fs/inline.c index ad92e9008781..901c630685ce 100644 --- a/fs/f2fs/inline.c +++ b/fs/f2fs/inline.c @@ -79,37 +79,37 @@ bool f2fs_may_inline_dentry(struct inode *inode) return true; } -void f2fs_do_read_inline_data(struct folio *folio, struct page *ipage) +void f2fs_do_read_inline_data(struct folio *folio, struct folio *ifolio) { struct inode *inode = folio->mapping->host; if (folio_test_uptodate(folio)) return; - f2fs_bug_on(F2FS_I_SB(inode), folio_index(folio)); + f2fs_bug_on(F2FS_I_SB(inode), folio->index); folio_zero_segment(folio, MAX_INLINE_DATA(inode), folio_size(folio)); /* Copy the whole inline data block */ - memcpy_to_folio(folio, 0, inline_data_addr(inode, ipage), + memcpy_to_folio(folio, 0, inline_data_addr(inode, ifolio), MAX_INLINE_DATA(inode)); if (!folio_test_uptodate(folio)) folio_mark_uptodate(folio); } -void f2fs_truncate_inline_inode(struct inode *inode, - struct page *ipage, u64 from) +void f2fs_truncate_inline_inode(struct inode *inode, struct folio *ifolio, + u64 from) { void *addr; if (from >= MAX_INLINE_DATA(inode)) return; - addr = inline_data_addr(inode, ipage); + addr = inline_data_addr(inode, ifolio); - f2fs_wait_on_page_writeback(ipage, NODE, true, true); + f2fs_folio_wait_writeback(ifolio, NODE, true, true); memset(addr + from, 0, MAX_INLINE_DATA(inode) - from); - set_page_dirty(ipage); + folio_mark_dirty(ifolio); if (from == 0) clear_inode_flag(inode, FI_DATA_EXIST); @@ -117,32 +117,32 @@ void f2fs_truncate_inline_inode(struct inode *inode, int f2fs_read_inline_data(struct inode *inode, struct folio *folio) { - struct page *ipage; + struct folio *ifolio; - ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); - if (IS_ERR(ipage)) { + ifolio = f2fs_get_inode_folio(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ifolio)) { folio_unlock(folio); - return PTR_ERR(ipage); + return PTR_ERR(ifolio); } if (!f2fs_has_inline_data(inode)) { - f2fs_put_page(ipage, 1); + f2fs_folio_put(ifolio, true); return -EAGAIN; } - if (folio_index(folio)) + if (folio->index) folio_zero_segment(folio, 0, folio_size(folio)); else - f2fs_do_read_inline_data(folio, ipage); + f2fs_do_read_inline_data(folio, ifolio); if (!folio_test_uptodate(folio)) 
folio_mark_uptodate(folio); - f2fs_put_page(ipage, 1); + f2fs_folio_put(ifolio, true); folio_unlock(folio); return 0; } -int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) +int f2fs_convert_inline_folio(struct dnode_of_data *dn, struct folio *folio) { struct f2fs_io_info fio = { .sbi = F2FS_I_SB(dn->inode), @@ -150,7 +150,7 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) .type = DATA, .op = REQ_OP_WRITE, .op_flags = REQ_SYNC | REQ_PRIO, - .page = page, + .page = &folio->page, .encrypted_page = NULL, .io_type = FS_DATA_IO, }; @@ -182,20 +182,20 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) return -EFSCORRUPTED; } - f2fs_bug_on(F2FS_P_SB(page), folio_test_writeback(page_folio(page))); + f2fs_bug_on(F2FS_F_SB(folio), folio_test_writeback(folio)); - f2fs_do_read_inline_data(page_folio(page), dn->inode_page); - set_page_dirty(page); + f2fs_do_read_inline_data(folio, dn->inode_folio); + folio_mark_dirty(folio); /* clear dirty state */ - dirty = clear_page_dirty_for_io(page); + dirty = folio_clear_dirty_for_io(folio); /* write data page to try to make data consistent */ - set_page_writeback(page); + folio_start_writeback(folio); fio.old_blkaddr = dn->data_blkaddr; set_inode_flag(dn->inode, FI_HOT_DATA); f2fs_outplace_write_data(dn, &fio); - f2fs_wait_on_page_writeback(page, DATA, true, true); + f2fs_folio_wait_writeback(folio, DATA, true, true); if (dirty) { inode_dec_dirty_pages(dn->inode); f2fs_remove_dirty_inode(dn->inode); @@ -205,8 +205,8 @@ int f2fs_convert_inline_page(struct dnode_of_data *dn, struct page *page) set_inode_flag(dn->inode, FI_APPEND_WRITE); /* clear inline data and flag after data writeback */ - f2fs_truncate_inline_inode(dn->inode, dn->inode_page, 0); - clear_page_private_inline(dn->inode_page); + f2fs_truncate_inline_inode(dn->inode, dn->inode_folio, 0); + clear_page_private_inline(&dn->inode_folio->page); clear_out: stat_dec_inline_inode(dn->inode); clear_inode_flag(dn->inode, FI_INLINE_DATA); @@ -218,7 +218,7 @@ int f2fs_convert_inline_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct dnode_of_data dn; - struct page *ipage, *page; + struct folio *ifolio, *folio; int err = 0; if (f2fs_hw_is_readonly(sbi) || f2fs_readonly(sbi->sb)) @@ -231,28 +231,28 @@ int f2fs_convert_inline_inode(struct inode *inode) if (err) return err; - page = f2fs_grab_cache_page(inode->i_mapping, 0, false); - if (!page) - return -ENOMEM; + folio = f2fs_grab_cache_folio(inode->i_mapping, 0, false); + if (IS_ERR(folio)) + return PTR_ERR(folio); f2fs_lock_op(sbi); - ipage = f2fs_get_inode_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) { - err = PTR_ERR(ipage); + ifolio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(ifolio)) { + err = PTR_ERR(ifolio); goto out; } - set_new_dnode(&dn, inode, ipage, ipage, 0); + set_new_dnode(&dn, inode, ifolio, ifolio, 0); if (f2fs_has_inline_data(inode)) - err = f2fs_convert_inline_page(&dn, page); + err = f2fs_convert_inline_folio(&dn, folio); f2fs_put_dnode(&dn); out: f2fs_unlock_op(sbi); - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); if (!err) f2fs_balance_fs(sbi, dn.node_changed); @@ -263,40 +263,39 @@ out: int f2fs_write_inline_data(struct inode *inode, struct folio *folio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct page *ipage; + struct folio *ifolio; - ipage = f2fs_get_inode_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) - return PTR_ERR(ipage); + ifolio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(ifolio)) + 
return PTR_ERR(ifolio); if (!f2fs_has_inline_data(inode)) { - f2fs_put_page(ipage, 1); + f2fs_folio_put(ifolio, true); return -EAGAIN; } f2fs_bug_on(F2FS_I_SB(inode), folio->index); - f2fs_wait_on_page_writeback(ipage, NODE, true, true); - memcpy_from_folio(inline_data_addr(inode, ipage), + f2fs_folio_wait_writeback(ifolio, NODE, true, true); + memcpy_from_folio(inline_data_addr(inode, ifolio), folio, 0, MAX_INLINE_DATA(inode)); - set_page_dirty(ipage); + folio_mark_dirty(ifolio); f2fs_clear_page_cache_dirty_tag(folio); set_inode_flag(inode, FI_APPEND_WRITE); set_inode_flag(inode, FI_DATA_EXIST); - clear_page_private_inline(ipage); - f2fs_put_page(ipage, 1); + clear_page_private_inline(&ifolio->page); + f2fs_folio_put(ifolio, 1); return 0; } -int f2fs_recover_inline_data(struct inode *inode, struct page *npage) +int f2fs_recover_inline_data(struct inode *inode, struct folio *nfolio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode *ri = NULL; void *src_addr, *dst_addr; - struct page *ipage; /* * The inline_data recovery policy is as follows. @@ -306,38 +305,39 @@ int f2fs_recover_inline_data(struct inode *inode, struct page *npage) * x o -> remove data blocks, and then recover inline_data * x x -> recover data blocks */ - if (IS_INODE(npage)) - ri = F2FS_INODE(npage); + if (IS_INODE(&nfolio->page)) + ri = F2FS_INODE(&nfolio->page); if (f2fs_has_inline_data(inode) && ri && (ri->i_inline & F2FS_INLINE_DATA)) { + struct folio *ifolio; process_inline: - ipage = f2fs_get_inode_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) - return PTR_ERR(ipage); + ifolio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(ifolio)) + return PTR_ERR(ifolio); - f2fs_wait_on_page_writeback(ipage, NODE, true, true); + f2fs_folio_wait_writeback(ifolio, NODE, true, true); - src_addr = inline_data_addr(inode, npage); - dst_addr = inline_data_addr(inode, ipage); + src_addr = inline_data_addr(inode, nfolio); + dst_addr = inline_data_addr(inode, ifolio); memcpy(dst_addr, src_addr, MAX_INLINE_DATA(inode)); set_inode_flag(inode, FI_INLINE_DATA); set_inode_flag(inode, FI_DATA_EXIST); - set_page_dirty(ipage); - f2fs_put_page(ipage, 1); + folio_mark_dirty(ifolio); + f2fs_folio_put(ifolio, true); return 1; } if (f2fs_has_inline_data(inode)) { - ipage = f2fs_get_inode_page(sbi, inode->i_ino); - if (IS_ERR(ipage)) - return PTR_ERR(ipage); - f2fs_truncate_inline_inode(inode, ipage, 0); + struct folio *ifolio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(ifolio)) + return PTR_ERR(ifolio); + f2fs_truncate_inline_inode(inode, ifolio, 0); stat_dec_inline_inode(inode); clear_inode_flag(inode, FI_INLINE_DATA); - f2fs_put_page(ipage, 1); + f2fs_folio_put(ifolio, true); } else if (ri && (ri->i_inline & F2FS_INLINE_DATA)) { int ret; @@ -352,50 +352,50 @@ process_inline: struct f2fs_dir_entry *f2fs_find_in_inline_dir(struct inode *dir, const struct f2fs_filename *fname, - struct page **res_page, + struct folio **res_folio, bool use_hash) { struct f2fs_sb_info *sbi = F2FS_SB(dir->i_sb); struct f2fs_dir_entry *de; struct f2fs_dentry_ptr d; - struct page *ipage; + struct folio *ifolio; void *inline_dentry; - ipage = f2fs_get_inode_page(sbi, dir->i_ino); - if (IS_ERR(ipage)) { - *res_page = ipage; + ifolio = f2fs_get_inode_folio(sbi, dir->i_ino); + if (IS_ERR(ifolio)) { + *res_folio = ifolio; return NULL; } - inline_dentry = inline_data_addr(dir, ipage); + inline_dentry = inline_data_addr(dir, ifolio); make_dentry_ptr_inline(dir, &d, inline_dentry); de = f2fs_find_target_dentry(&d, fname, NULL, use_hash); - 
unlock_page(ipage); + folio_unlock(ifolio); if (IS_ERR(de)) { - *res_page = ERR_CAST(de); + *res_folio = ERR_CAST(de); de = NULL; } if (de) - *res_page = ipage; + *res_folio = ifolio; else - f2fs_put_page(ipage, 0); + f2fs_folio_put(ifolio, false); return de; } int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent, - struct page *ipage) + struct folio *ifolio) { struct f2fs_dentry_ptr d; void *inline_dentry; - inline_dentry = inline_data_addr(inode, ipage); + inline_dentry = inline_data_addr(inode, ifolio); make_dentry_ptr_inline(inode, &d, inline_dentry); f2fs_do_make_empty_dir(inode, parent, &d); - set_page_dirty(ipage); + folio_mark_dirty(ifolio); /* update i_size to MAX_INLINE_DATA */ if (i_size_read(inode) < MAX_INLINE_DATA(inode)) @@ -407,39 +407,39 @@ int f2fs_make_empty_inline_dir(struct inode *inode, struct inode *parent, * NOTE: ipage is grabbed by caller, but if any error occurs, we should * release ipage in this function. */ -static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, +static int f2fs_move_inline_dirents(struct inode *dir, struct folio *ifolio, void *inline_dentry) { - struct page *page; + struct folio *folio; struct dnode_of_data dn; struct f2fs_dentry_block *dentry_blk; struct f2fs_dentry_ptr src, dst; int err; - page = f2fs_grab_cache_page(dir->i_mapping, 0, true); - if (!page) { - f2fs_put_page(ipage, 1); - return -ENOMEM; + folio = f2fs_grab_cache_folio(dir->i_mapping, 0, true); + if (IS_ERR(folio)) { + f2fs_folio_put(ifolio, true); + return PTR_ERR(folio); } - set_new_dnode(&dn, dir, ipage, NULL, 0); + set_new_dnode(&dn, dir, ifolio, NULL, 0); err = f2fs_reserve_block(&dn, 0); if (err) goto out; if (unlikely(dn.data_blkaddr != NEW_ADDR)) { f2fs_put_dnode(&dn); - set_sbi_flag(F2FS_P_SB(page), SBI_NEED_FSCK); - f2fs_warn(F2FS_P_SB(page), "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.", + set_sbi_flag(F2FS_F_SB(folio), SBI_NEED_FSCK); + f2fs_warn(F2FS_F_SB(folio), "%s: corrupted inline inode ino=%lx, i_addr[0]:0x%x, run fsck to fix.", __func__, dir->i_ino, dn.data_blkaddr); - f2fs_handle_error(F2FS_P_SB(page), ERROR_INVALID_BLKADDR); + f2fs_handle_error(F2FS_F_SB(folio), ERROR_INVALID_BLKADDR); err = -EFSCORRUPTED; goto out; } - f2fs_wait_on_page_writeback(page, DATA, true, true); + f2fs_folio_wait_writeback(folio, DATA, true, true); - dentry_blk = page_address(page); + dentry_blk = folio_address(folio); /* * Start by zeroing the full block, to ensure that all unused space is @@ -455,12 +455,12 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, memcpy(dst.dentry, src.dentry, SIZE_OF_DIR_ENTRY * src.max); memcpy(dst.filename, src.filename, src.max * F2FS_SLOT_LEN); - if (!PageUptodate(page)) - SetPageUptodate(page); - set_page_dirty(page); + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + folio_mark_dirty(folio); /* clear inline dir and flag after data writeback */ - f2fs_truncate_inline_inode(dir, ipage, 0); + f2fs_truncate_inline_inode(dir, ifolio, 0); stat_dec_inline_dir(dir); clear_inode_flag(dir, FI_INLINE_DENTRY); @@ -477,7 +477,7 @@ static int f2fs_move_inline_dirents(struct inode *dir, struct page *ipage, if (i_size_read(dir) < PAGE_SIZE) f2fs_i_size_write(dir, PAGE_SIZE); out: - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return err; } @@ -533,7 +533,7 @@ punch_dentry_pages: return err; } -static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, +static int f2fs_move_rehashed_dirents(struct inode *dir, struct folio 
*ifolio, void *inline_dentry) { void *backup_dentry; @@ -542,20 +542,20 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, backup_dentry = f2fs_kmalloc(F2FS_I_SB(dir), MAX_INLINE_DATA(dir), GFP_F2FS_ZERO); if (!backup_dentry) { - f2fs_put_page(ipage, 1); + f2fs_folio_put(ifolio, true); return -ENOMEM; } memcpy(backup_dentry, inline_dentry, MAX_INLINE_DATA(dir)); - f2fs_truncate_inline_inode(dir, ipage, 0); + f2fs_truncate_inline_inode(dir, ifolio, 0); - unlock_page(ipage); + folio_unlock(ifolio); err = f2fs_add_inline_entries(dir, backup_dentry); if (err) goto recover; - lock_page(ipage); + folio_lock(ifolio); stat_dec_inline_dir(dir); clear_inode_flag(dir, FI_INLINE_DENTRY); @@ -571,31 +571,31 @@ static int f2fs_move_rehashed_dirents(struct inode *dir, struct page *ipage, kfree(backup_dentry); return 0; recover: - lock_page(ipage); - f2fs_wait_on_page_writeback(ipage, NODE, true, true); + folio_lock(ifolio); + f2fs_folio_wait_writeback(ifolio, NODE, true, true); memcpy(inline_dentry, backup_dentry, MAX_INLINE_DATA(dir)); f2fs_i_depth_write(dir, 0); f2fs_i_size_write(dir, MAX_INLINE_DATA(dir)); - set_page_dirty(ipage); - f2fs_put_page(ipage, 1); + folio_mark_dirty(ifolio); + f2fs_folio_put(ifolio, 1); kfree(backup_dentry); return err; } -static int do_convert_inline_dir(struct inode *dir, struct page *ipage, +static int do_convert_inline_dir(struct inode *dir, struct folio *ifolio, void *inline_dentry) { if (!F2FS_I(dir)->i_dir_level) - return f2fs_move_inline_dirents(dir, ipage, inline_dentry); + return f2fs_move_inline_dirents(dir, ifolio, inline_dentry); else - return f2fs_move_rehashed_dirents(dir, ipage, inline_dentry); + return f2fs_move_rehashed_dirents(dir, ifolio, inline_dentry); } int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - struct page *ipage; + struct folio *ifolio; struct f2fs_filename fname; void *inline_dentry = NULL; int err = 0; @@ -609,22 +609,22 @@ int f2fs_try_convert_inline_dir(struct inode *dir, struct dentry *dentry) if (err) goto out; - ipage = f2fs_get_inode_page(sbi, dir->i_ino); - if (IS_ERR(ipage)) { - err = PTR_ERR(ipage); + ifolio = f2fs_get_inode_folio(sbi, dir->i_ino); + if (IS_ERR(ifolio)) { + err = PTR_ERR(ifolio); goto out_fname; } - if (f2fs_has_enough_room(dir, ipage, &fname)) { - f2fs_put_page(ipage, 1); + if (f2fs_has_enough_room(dir, ifolio, &fname)) { + f2fs_folio_put(ifolio, true); goto out_fname; } - inline_dentry = inline_data_addr(dir, ipage); + inline_dentry = inline_data_addr(dir, ifolio); - err = do_convert_inline_dir(dir, ipage, inline_dentry); + err = do_convert_inline_dir(dir, ifolio, inline_dentry); if (!err) - f2fs_put_page(ipage, 1); + f2fs_folio_put(ifolio, true); out_fname: f2fs_free_filename(&fname); out: @@ -636,24 +636,24 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, struct inode *inode, nid_t ino, umode_t mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - struct page *ipage; + struct folio *ifolio; unsigned int bit_pos; void *inline_dentry = NULL; struct f2fs_dentry_ptr d; int slots = GET_DENTRY_SLOTS(fname->disk_name.len); - struct page *page = NULL; + struct folio *folio = NULL; int err = 0; - ipage = f2fs_get_inode_page(sbi, dir->i_ino); - if (IS_ERR(ipage)) - return PTR_ERR(ipage); + ifolio = f2fs_get_inode_folio(sbi, dir->i_ino); + if (IS_ERR(ifolio)) + return PTR_ERR(ifolio); - inline_dentry = inline_data_addr(dir, ipage); + inline_dentry = inline_data_addr(dir, ifolio); 
make_dentry_ptr_inline(dir, &d, inline_dentry); bit_pos = f2fs_room_for_filename(d.bitmap, slots, d.max); if (bit_pos >= d.max) { - err = do_convert_inline_dir(dir, ipage, inline_dentry); + err = do_convert_inline_dir(dir, ifolio, inline_dentry); if (err) return err; err = -EAGAIN; @@ -663,19 +663,19 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, if (inode) { f2fs_down_write_nested(&F2FS_I(inode)->i_sem, SINGLE_DEPTH_NESTING); - page = f2fs_init_inode_metadata(inode, dir, fname, ipage); - if (IS_ERR(page)) { - err = PTR_ERR(page); + folio = f2fs_init_inode_metadata(inode, dir, fname, ifolio); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); goto fail; } } - f2fs_wait_on_page_writeback(ipage, NODE, true, true); + f2fs_folio_wait_writeback(ifolio, NODE, true, true); f2fs_update_dentry(ino, mode, &d, &fname->disk_name, fname->hash, bit_pos); - set_page_dirty(ipage); + folio_mark_dirty(ifolio); /* we don't need to mark_inode_dirty now */ if (inode) { @@ -683,9 +683,9 @@ int f2fs_add_inline_entry(struct inode *dir, const struct f2fs_filename *fname, /* synchronize inode page's data from inode cache */ if (is_inode_flag_set(inode, FI_NEW_INODE)) - f2fs_update_inode(inode, page); + f2fs_update_inode(inode, folio); - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); } f2fs_update_parent_metadata(dir, inode, 0); @@ -693,12 +693,12 @@ fail: if (inode) f2fs_up_write(&F2FS_I(inode)->i_sem); out: - f2fs_put_page(ipage, 1); + f2fs_folio_put(ifolio, true); return err; } -void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, - struct inode *dir, struct inode *inode) +void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, + struct folio *folio, struct inode *dir, struct inode *inode) { struct f2fs_dentry_ptr d; void *inline_dentry; @@ -706,18 +706,18 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, unsigned int bit_pos; int i; - lock_page(page); - f2fs_wait_on_page_writeback(page, NODE, true, true); + folio_lock(folio); + f2fs_folio_wait_writeback(folio, NODE, true, true); - inline_dentry = inline_data_addr(dir, page); + inline_dentry = inline_data_addr(dir, folio); make_dentry_ptr_inline(dir, &d, inline_dentry); bit_pos = dentry - d.dentry; for (i = 0; i < slots; i++) __clear_bit_le(bit_pos + i, d.bitmap); - set_page_dirty(page); - f2fs_put_page(page, 1); + folio_mark_dirty(folio); + f2fs_folio_put(folio, true); inode_set_mtime_to_ts(dir, inode_set_ctime_current(dir)); f2fs_mark_inode_dirty_sync(dir, false); @@ -729,21 +729,21 @@ void f2fs_delete_inline_entry(struct f2fs_dir_entry *dentry, struct page *page, bool f2fs_empty_inline_dir(struct inode *dir) { struct f2fs_sb_info *sbi = F2FS_I_SB(dir); - struct page *ipage; + struct folio *ifolio; unsigned int bit_pos = 2; void *inline_dentry; struct f2fs_dentry_ptr d; - ipage = f2fs_get_inode_page(sbi, dir->i_ino); - if (IS_ERR(ipage)) + ifolio = f2fs_get_inode_folio(sbi, dir->i_ino); + if (IS_ERR(ifolio)) return false; - inline_dentry = inline_data_addr(dir, ipage); + inline_dentry = inline_data_addr(dir, ifolio); make_dentry_ptr_inline(dir, &d, inline_dentry); bit_pos = find_next_bit_le(d.bitmap, d.max, bit_pos); - f2fs_put_page(ipage, 1); + f2fs_folio_put(ifolio, true); if (bit_pos < d.max) return false; @@ -755,7 +755,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, struct fscrypt_str *fstr) { struct inode *inode = file_inode(file); - struct page *ipage = NULL; + struct folio *ifolio = NULL; struct f2fs_dentry_ptr d; void 
*inline_dentry = NULL; int err; @@ -765,17 +765,17 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (ctx->pos == d.max) return 0; - ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); - if (IS_ERR(ipage)) - return PTR_ERR(ipage); + ifolio = f2fs_get_inode_folio(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ifolio)) + return PTR_ERR(ifolio); /* * f2fs_readdir was protected by inode.i_rwsem, it is safe to access * ipage without page's lock held. */ - unlock_page(ipage); + folio_unlock(ifolio); - inline_dentry = inline_data_addr(inode, ipage); + inline_dentry = inline_data_addr(inode, ifolio); make_dentry_ptr_inline(inode, &d, inline_dentry); @@ -783,7 +783,7 @@ int f2fs_read_inline_dir(struct file *file, struct dir_context *ctx, if (!err) ctx->pos = d.max; - f2fs_put_page(ipage, 0); + f2fs_folio_put(ifolio, false); return err < 0 ? err : 0; } @@ -794,12 +794,12 @@ int f2fs_inline_data_fiemap(struct inode *inode, __u32 flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED | FIEMAP_EXTENT_LAST; struct node_info ni; - struct page *ipage; + struct folio *ifolio; int err = 0; - ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); - if (IS_ERR(ipage)) - return PTR_ERR(ipage); + ifolio = f2fs_get_inode_folio(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ifolio)) + return PTR_ERR(ifolio); if ((S_ISREG(inode->i_mode) || S_ISLNK(inode->i_mode)) && !f2fs_has_inline_data(inode)) { @@ -824,11 +824,11 @@ int f2fs_inline_data_fiemap(struct inode *inode, goto out; byteaddr = (__u64)ni.blk_addr << inode->i_sb->s_blocksize_bits; - byteaddr += (char *)inline_data_addr(inode, ipage) - - (char *)F2FS_INODE(ipage); + byteaddr += (char *)inline_data_addr(inode, ifolio) - + (char *)F2FS_INODE(&ifolio->page); err = fiemap_fill_next_extent(fieinfo, start, byteaddr, ilen, flags); trace_f2fs_fiemap(inode, start, byteaddr, ilen, flags, err); out: - f2fs_put_page(ipage, 1); + f2fs_folio_put(ifolio, true); return err; } diff --git a/fs/f2fs/inode.c b/fs/f2fs/inode.c index 83f862578fc8..083d52a42bfb 100644 --- a/fs/f2fs/inode.c +++ b/fs/f2fs/inode.c @@ -34,7 +34,9 @@ void f2fs_mark_inode_dirty_sync(struct inode *inode, bool sync) if (f2fs_inode_dirtied(inode, sync)) return; - if (f2fs_is_atomic_file(inode)) + /* only atomic file w/ FI_ATOMIC_COMMITTED can be set vfs dirty */ + if (f2fs_is_atomic_file(inode) && + !is_inode_flag_set(inode, FI_ATOMIC_COMMITTED)) return; mark_inode_dirty_sync(inode); @@ -66,9 +68,9 @@ void f2fs_set_inode_flags(struct inode *inode) S_ENCRYPTED|S_VERITY|S_CASEFOLD); } -static void __get_inode_rdev(struct inode *inode, struct page *node_page) +static void __get_inode_rdev(struct inode *inode, struct folio *node_folio) { - __le32 *addr = get_dnode_addr(inode, node_page); + __le32 *addr = get_dnode_addr(inode, node_folio); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode) || S_ISFIFO(inode->i_mode) || S_ISSOCK(inode->i_mode)) { @@ -79,9 +81,9 @@ static void __get_inode_rdev(struct inode *inode, struct page *node_page) } } -static void __set_inode_rdev(struct inode *inode, struct page *node_page) +static void __set_inode_rdev(struct inode *inode, struct folio *node_folio) { - __le32 *addr = get_dnode_addr(inode, node_page); + __le32 *addr = get_dnode_addr(inode, node_folio); if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode)) { if (old_valid_dev(inode->i_rdev)) { @@ -95,19 +97,19 @@ static void __set_inode_rdev(struct inode *inode, struct page *node_page) } } -static void __recover_inline_status(struct inode *inode, struct page *ipage) +static 
void __recover_inline_status(struct inode *inode, struct folio *ifolio) { - void *inline_data = inline_data_addr(inode, ipage); + void *inline_data = inline_data_addr(inode, ifolio); __le32 *start = inline_data; __le32 *end = start + MAX_INLINE_DATA(inode) / sizeof(__le32); while (start < end) { if (*start++) { - f2fs_wait_on_page_writeback(ipage, NODE, true, true); + f2fs_folio_wait_writeback(ifolio, NODE, true, true); set_inode_flag(inode, FI_DATA_EXIST); - set_raw_inline(inode, F2FS_INODE(ipage)); - set_page_dirty(ipage); + set_raw_inline(inode, F2FS_INODE(&ifolio->page)); + folio_mark_dirty(ifolio); return; } } @@ -142,19 +144,18 @@ static __u32 f2fs_inode_chksum(struct f2fs_sb_info *sbi, struct page *page) unsigned int offset = offsetof(struct f2fs_inode, i_inode_checksum); unsigned int cs_size = sizeof(dummy_cs); - chksum = f2fs_chksum(sbi, sbi->s_chksum_seed, (__u8 *)&ino, - sizeof(ino)); - chksum_seed = f2fs_chksum(sbi, chksum, (__u8 *)&gen, sizeof(gen)); + chksum = f2fs_chksum(sbi->s_chksum_seed, (__u8 *)&ino, sizeof(ino)); + chksum_seed = f2fs_chksum(chksum, (__u8 *)&gen, sizeof(gen)); - chksum = f2fs_chksum(sbi, chksum_seed, (__u8 *)ri, offset); - chksum = f2fs_chksum(sbi, chksum, (__u8 *)&dummy_cs, cs_size); + chksum = f2fs_chksum(chksum_seed, (__u8 *)ri, offset); + chksum = f2fs_chksum(chksum, (__u8 *)&dummy_cs, cs_size); offset += cs_size; - chksum = f2fs_chksum(sbi, chksum, (__u8 *)ri + offset, - F2FS_BLKSIZE - offset); + chksum = f2fs_chksum(chksum, (__u8 *)ri + offset, + F2FS_BLKSIZE - offset); return chksum; } -bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page) +bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct folio *folio) { struct f2fs_inode *ri; __u32 provided, calculated; @@ -163,21 +164,21 @@ bool f2fs_inode_chksum_verify(struct f2fs_sb_info *sbi, struct page *page) return true; #ifdef CONFIG_F2FS_CHECK_FS - if (!f2fs_enable_inode_chksum(sbi, page)) + if (!f2fs_enable_inode_chksum(sbi, &folio->page)) #else - if (!f2fs_enable_inode_chksum(sbi, page) || - PageDirty(page) || - folio_test_writeback(page_folio(page))) + if (!f2fs_enable_inode_chksum(sbi, &folio->page) || + folio_test_dirty(folio) || + folio_test_writeback(folio)) #endif return true; - ri = &F2FS_NODE(page)->i; + ri = &F2FS_NODE(&folio->page)->i; provided = le32_to_cpu(ri->i_inode_checksum); - calculated = f2fs_inode_chksum(sbi, page); + calculated = f2fs_inode_chksum(sbi, &folio->page); if (provided != calculated) f2fs_warn(sbi, "checksum invalid, nid = %lu, ino_of_node = %x, %x vs. 
%x", - page_folio(page)->index, ino_of_node(page), + folio->index, ino_of_node(&folio->page), provided, calculated); return provided == calculated; @@ -286,6 +287,12 @@ static bool sanity_check_inode(struct inode *inode, struct page *node_page) return false; } + if (ino_of_node(node_page) == fi->i_xattr_nid) { + f2fs_warn(sbi, "%s: corrupted inode i_ino=%lx, xnid=%x, run fsck to fix.", + __func__, inode->i_ino, fi->i_xattr_nid); + return false; + } + if (f2fs_has_extra_attr(inode)) { if (!f2fs_sb_has_extra_attr(sbi)) { f2fs_warn(sbi, "%s: inode (ino=%lx) is with extra_attr, but extra_attr feature is off", @@ -400,7 +407,7 @@ static int do_read_inode(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_inode_info *fi = F2FS_I(inode); - struct page *node_page; + struct folio *node_folio; struct f2fs_inode *ri; projid_t i_projid; @@ -408,11 +415,11 @@ static int do_read_inode(struct inode *inode) if (f2fs_check_nid_range(sbi, inode->i_ino)) return -EINVAL; - node_page = f2fs_get_inode_page(sbi, inode->i_ino); - if (IS_ERR(node_page)) - return PTR_ERR(node_page); + node_folio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(node_folio)) + return PTR_ERR(node_folio); - ri = F2FS_INODE(node_page); + ri = F2FS_INODE(&node_folio->page); inode->i_mode = le16_to_cpu(ri->i_mode); i_uid_write(inode, le32_to_cpu(ri->i_uid)); @@ -462,8 +469,8 @@ static int do_read_inode(struct inode *inode) fi->i_inline_xattr_size = 0; } - if (!sanity_check_inode(inode, node_page)) { - f2fs_put_page(node_page, 1); + if (!sanity_check_inode(inode, &node_folio->page)) { + f2fs_folio_put(node_folio, true); set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); return -EFSCORRUPTED; @@ -471,17 +478,17 @@ static int do_read_inode(struct inode *inode) /* check data exist */ if (f2fs_has_inline_data(inode) && !f2fs_exist_data(inode)) - __recover_inline_status(inode, node_page); + __recover_inline_status(inode, node_folio); /* try to recover cold bit for non-dir inode */ - if (!S_ISDIR(inode->i_mode) && !is_cold_node(node_page)) { - f2fs_wait_on_page_writeback(node_page, NODE, true, true); - set_cold_node(node_page, false); - set_page_dirty(node_page); + if (!S_ISDIR(inode->i_mode) && !is_cold_node(&node_folio->page)) { + f2fs_folio_wait_writeback(node_folio, NODE, true, true); + set_cold_node(&node_folio->page, false); + folio_mark_dirty(node_folio); } /* get rdev by using inline_info */ - __get_inode_rdev(inode, node_page); + __get_inode_rdev(inode, node_folio); if (!f2fs_need_inode_block_update(sbi, inode->i_ino)) fi->last_disk_size = inode->i_size; @@ -524,17 +531,17 @@ static int do_read_inode(struct inode *inode) init_idisk_time(inode); - if (!sanity_check_extent_cache(inode, node_page)) { - f2fs_put_page(node_page, 1); + if (!sanity_check_extent_cache(inode, &node_folio->page)) { + f2fs_folio_put(node_folio, true); f2fs_handle_error(sbi, ERROR_CORRUPTED_INODE); return -EFSCORRUPTED; } /* Need all the flag bits */ - f2fs_init_read_extent_tree(inode, node_page); + f2fs_init_read_extent_tree(inode, node_folio); f2fs_init_age_extent_tree(inode); - f2fs_put_page(node_page, 1); + f2fs_folio_put(node_folio, true); stat_inc_inline_xattr(inode); stat_inc_inline_inode(inode); @@ -651,18 +658,18 @@ retry: return inode; } -void f2fs_update_inode(struct inode *inode, struct page *node_page) +void f2fs_update_inode(struct inode *inode, struct folio *node_folio) { struct f2fs_inode_info *fi = F2FS_I(inode); struct f2fs_inode *ri; struct extent_tree *et = 
fi->extent_tree[EX_READ]; - f2fs_wait_on_page_writeback(node_page, NODE, true, true); - set_page_dirty(node_page); + f2fs_folio_wait_writeback(node_folio, NODE, true, true); + folio_mark_dirty(node_folio); f2fs_inode_synced(inode); - ri = F2FS_INODE(node_page); + ri = F2FS_INODE(&node_folio->page); ri->i_mode = cpu_to_le16(inode->i_mode); ri->i_advise = fi->i_advise; @@ -737,27 +744,27 @@ void f2fs_update_inode(struct inode *inode, struct page *node_page) } } - __set_inode_rdev(inode, node_page); + __set_inode_rdev(inode, node_folio); /* deleted inode */ if (inode->i_nlink == 0) - clear_page_private_inline(node_page); + clear_page_private_inline(&node_folio->page); init_idisk_time(inode); #ifdef CONFIG_F2FS_CHECK_FS - f2fs_inode_chksum_set(F2FS_I_SB(inode), node_page); + f2fs_inode_chksum_set(F2FS_I_SB(inode), &node_folio->page); #endif } void f2fs_update_inode_page(struct inode *inode) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct page *node_page; + struct folio *node_folio; int count = 0; retry: - node_page = f2fs_get_inode_page(sbi, inode->i_ino); - if (IS_ERR(node_page)) { - int err = PTR_ERR(node_page); + node_folio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(node_folio)) { + int err = PTR_ERR(node_folio); /* The node block was truncated. */ if (err == -ENOENT) @@ -772,8 +779,8 @@ stop_checkpoint: f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_UPDATE_INODE); return; } - f2fs_update_inode(inode, node_page); - f2fs_put_page(node_page, 1); + f2fs_update_inode(inode, node_folio); + f2fs_folio_put(node_folio, true); } int f2fs_write_inode(struct inode *inode, struct writeback_control *wbc) diff --git a/fs/f2fs/namei.c b/fs/f2fs/namei.c index 8f8b9b843bdf..07e333ee21b7 100644 --- a/fs/f2fs/namei.c +++ b/fs/f2fs/namei.c @@ -414,7 +414,7 @@ static int f2fs_link(struct dentry *old_dentry, struct inode *dir, if (is_inode_flag_set(dir, FI_PROJ_INHERIT) && (!projid_eq(F2FS_I(dir)->i_projid, - F2FS_I(old_dentry->d_inode)->i_projid))) + F2FS_I(inode)->i_projid))) return -EXDEV; err = f2fs_dquot_initialize(dir); @@ -447,12 +447,12 @@ out: struct dentry *f2fs_get_parent(struct dentry *child) { - struct page *page; - unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot_name, &page); + struct folio *folio; + unsigned long ino = f2fs_inode_by_name(d_inode(child), &dotdot_name, &folio); if (!ino) { - if (IS_ERR(page)) - return ERR_CAST(page); + if (IS_ERR(folio)) + return ERR_CAST(folio); return ERR_PTR(-ENOENT); } return d_obtain_alias(f2fs_iget(child->d_sb, ino)); @@ -463,7 +463,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, { struct inode *inode = NULL; struct f2fs_dir_entry *de; - struct page *page; + struct folio *folio; struct dentry *new; nid_t ino = -1; int err = 0; @@ -481,12 +481,12 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, goto out_splice; if (err) goto out; - de = __f2fs_find_entry(dir, &fname, &page); + de = __f2fs_find_entry(dir, &fname, &folio); f2fs_free_filename(&fname); if (!de) { - if (IS_ERR(page)) { - err = PTR_ERR(page); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); goto out; } err = -ENOENT; @@ -494,7 +494,7 @@ static struct dentry *f2fs_lookup(struct inode *dir, struct dentry *dentry, } ino = le32_to_cpu(de->ino); - f2fs_put_page(page, 0); + f2fs_folio_put(folio, false); inode = f2fs_iget(dir->i_sb, ino); if (IS_ERR(inode)) { @@ -545,7 +545,7 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) struct f2fs_sb_info *sbi = F2FS_I_SB(dir); struct inode *inode = 
d_inode(dentry); struct f2fs_dir_entry *de; - struct page *page; + struct folio *folio; int err; trace_f2fs_unlink_enter(dir, dentry); @@ -562,10 +562,19 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) if (err) goto fail; - de = f2fs_find_entry(dir, &dentry->d_name, &page); + de = f2fs_find_entry(dir, &dentry->d_name, &folio); if (!de) { - if (IS_ERR(page)) - err = PTR_ERR(page); + if (IS_ERR(folio)) + err = PTR_ERR(folio); + goto fail; + } + + if (unlikely(inode->i_nlink == 0)) { + f2fs_warn(F2FS_I_SB(inode), "%s: inode (ino=%lx) has zero i_nlink", + __func__, inode->i_ino); + err = -EFSCORRUPTED; + set_sbi_flag(F2FS_I_SB(inode), SBI_NEED_FSCK); + f2fs_folio_put(folio, false); goto fail; } @@ -575,10 +584,10 @@ static int f2fs_unlink(struct inode *dir, struct dentry *dentry) err = f2fs_acquire_orphan_inode(sbi); if (err) { f2fs_unlock_op(sbi); - f2fs_put_page(page, 0); + f2fs_folio_put(folio, false); goto fail; } - f2fs_delete_entry(de, page, dir, inode); + f2fs_delete_entry(de, folio, dir, inode); f2fs_unlock_op(sbi); /* VFS negative dentries are incompatible with Encoding and @@ -899,8 +908,8 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); struct inode *whiteout = NULL; - struct page *old_dir_page = NULL; - struct page *old_page, *new_page = NULL; + struct folio *old_dir_folio = NULL; + struct folio *old_folio, *new_folio = NULL; struct f2fs_dir_entry *old_dir_entry = NULL; struct f2fs_dir_entry *old_entry; struct f2fs_dir_entry *new_entry; @@ -914,7 +923,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && (!projid_eq(F2FS_I(new_dir)->i_projid, - F2FS_I(old_dentry->d_inode)->i_projid))) + F2FS_I(old_inode)->i_projid))) return -EXDEV; /* @@ -959,18 +968,18 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, } err = -ENOENT; - old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_folio); if (!old_entry) { - if (IS_ERR(old_page)) - err = PTR_ERR(old_page); + if (IS_ERR(old_folio)) + err = PTR_ERR(old_folio); goto out; } if (old_is_dir && old_dir != new_dir) { - old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_page); + old_dir_entry = f2fs_parent_dir(old_inode, &old_dir_folio); if (!old_dir_entry) { - if (IS_ERR(old_dir_page)) - err = PTR_ERR(old_dir_page); + if (IS_ERR(old_dir_folio)) + err = PTR_ERR(old_dir_folio); goto out_old; } } @@ -983,10 +992,10 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, err = -ENOENT; new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, - &new_page); + &new_folio); if (!new_entry) { - if (IS_ERR(new_page)) - err = PTR_ERR(new_page); + if (IS_ERR(new_folio)) + err = PTR_ERR(new_folio); goto out_dir; } @@ -998,8 +1007,8 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, if (err) goto put_out_dir; - f2fs_set_link(new_dir, new_entry, new_page, old_inode); - new_page = NULL; + f2fs_set_link(new_dir, new_entry, new_folio, old_inode); + new_folio = NULL; inode_set_ctime_current(new_inode); f2fs_down_write(&F2FS_I(new_inode)->i_sem); @@ -1038,8 +1047,8 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, inode_set_ctime_current(old_inode); f2fs_mark_inode_dirty_sync(old_inode, false); - f2fs_delete_entry(old_entry, old_page, old_dir, NULL); - old_page = NULL; + 
f2fs_delete_entry(old_entry, old_folio, old_dir, NULL); + old_folio = NULL; if (whiteout) { set_inode_flag(whiteout, FI_INC_LINK); @@ -1055,7 +1064,7 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, } if (old_dir_entry) - f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); + f2fs_set_link(old_inode, old_dir_entry, old_dir_folio, new_dir); if (old_is_dir) f2fs_i_links_write(old_dir, false); @@ -1076,12 +1085,12 @@ static int f2fs_rename(struct mnt_idmap *idmap, struct inode *old_dir, put_out_dir: f2fs_unlock_op(sbi); - f2fs_put_page(new_page, 0); + f2fs_folio_put(new_folio, false); out_dir: if (old_dir_entry) - f2fs_put_page(old_dir_page, 0); + f2fs_folio_put(old_dir_folio, false); out_old: - f2fs_put_page(old_page, 0); + f2fs_folio_put(old_folio, false); out: iput(whiteout); return err; @@ -1093,8 +1102,8 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, struct f2fs_sb_info *sbi = F2FS_I_SB(old_dir); struct inode *old_inode = d_inode(old_dentry); struct inode *new_inode = d_inode(new_dentry); - struct page *old_dir_page, *new_dir_page; - struct page *old_page, *new_page; + struct folio *old_dir_folio, *new_dir_folio; + struct folio *old_folio, *new_folio; struct f2fs_dir_entry *old_dir_entry = NULL, *new_dir_entry = NULL; struct f2fs_dir_entry *old_entry, *new_entry; int old_nlink = 0, new_nlink = 0; @@ -1107,10 +1116,10 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if ((is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && !projid_eq(F2FS_I(new_dir)->i_projid, - F2FS_I(old_dentry->d_inode)->i_projid)) || - (is_inode_flag_set(new_dir, FI_PROJ_INHERIT) && + F2FS_I(old_inode)->i_projid)) || + (is_inode_flag_set(old_dir, FI_PROJ_INHERIT) && !projid_eq(F2FS_I(old_dir)->i_projid, - F2FS_I(new_dentry->d_inode)->i_projid))) + F2FS_I(new_inode)->i_projid))) return -EXDEV; err = f2fs_dquot_initialize(old_dir); @@ -1122,17 +1131,17 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, goto out; err = -ENOENT; - old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_page); + old_entry = f2fs_find_entry(old_dir, &old_dentry->d_name, &old_folio); if (!old_entry) { - if (IS_ERR(old_page)) - err = PTR_ERR(old_page); + if (IS_ERR(old_folio)) + err = PTR_ERR(old_folio); goto out; } - new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_page); + new_entry = f2fs_find_entry(new_dir, &new_dentry->d_name, &new_folio); if (!new_entry) { - if (IS_ERR(new_page)) - err = PTR_ERR(new_page); + if (IS_ERR(new_folio)) + err = PTR_ERR(new_folio); goto out_old; } @@ -1140,20 +1149,20 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, if (old_dir != new_dir) { if (S_ISDIR(old_inode->i_mode)) { old_dir_entry = f2fs_parent_dir(old_inode, - &old_dir_page); + &old_dir_folio); if (!old_dir_entry) { - if (IS_ERR(old_dir_page)) - err = PTR_ERR(old_dir_page); + if (IS_ERR(old_dir_folio)) + err = PTR_ERR(old_dir_folio); goto out_new; } } if (S_ISDIR(new_inode->i_mode)) { new_dir_entry = f2fs_parent_dir(new_inode, - &new_dir_page); + &new_dir_folio); if (!new_dir_entry) { - if (IS_ERR(new_dir_page)) - err = PTR_ERR(new_dir_page); + if (IS_ERR(new_dir_folio)) + err = PTR_ERR(new_dir_folio); goto out_old_dir; } } @@ -1180,14 +1189,14 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, /* update ".." 
directory entry info of old dentry */ if (old_dir_entry) - f2fs_set_link(old_inode, old_dir_entry, old_dir_page, new_dir); + f2fs_set_link(old_inode, old_dir_entry, old_dir_folio, new_dir); /* update ".." directory entry info of new dentry */ if (new_dir_entry) - f2fs_set_link(new_inode, new_dir_entry, new_dir_page, old_dir); + f2fs_set_link(new_inode, new_dir_entry, new_dir_folio, old_dir); /* update directory entry info of old dir inode */ - f2fs_set_link(old_dir, old_entry, old_page, new_inode); + f2fs_set_link(old_dir, old_entry, old_folio, new_inode); f2fs_down_write(&F2FS_I(old_inode)->i_sem); if (!old_dir_entry) @@ -1206,7 +1215,7 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, f2fs_mark_inode_dirty_sync(old_dir, false); /* update directory entry info of new dir inode */ - f2fs_set_link(new_dir, new_entry, new_page, old_inode); + f2fs_set_link(new_dir, new_entry, new_folio, old_inode); f2fs_down_write(&F2FS_I(new_inode)->i_sem); if (!new_dir_entry) @@ -1238,16 +1247,16 @@ static int f2fs_cross_rename(struct inode *old_dir, struct dentry *old_dentry, return 0; out_new_dir: if (new_dir_entry) { - f2fs_put_page(new_dir_page, 0); + f2fs_folio_put(new_dir_folio, 0); } out_old_dir: if (old_dir_entry) { - f2fs_put_page(old_dir_page, 0); + f2fs_folio_put(old_dir_folio, 0); } out_new: - f2fs_put_page(new_page, 0); + f2fs_folio_put(new_folio, false); out_old: - f2fs_put_page(old_page, 0); + f2fs_folio_put(old_folio, false); out: return err; } diff --git a/fs/f2fs/node.c b/fs/f2fs/node.c index 5f15c224bf78..1cb4cba7f961 100644 --- a/fs/f2fs/node.c +++ b/fs/f2fs/node.c @@ -120,25 +120,25 @@ bool f2fs_available_free_memory(struct f2fs_sb_info *sbi, int type) return res; } -static void clear_node_page_dirty(struct page *page) +static void clear_node_folio_dirty(struct folio *folio) { - if (PageDirty(page)) { - f2fs_clear_page_cache_dirty_tag(page_folio(page)); - clear_page_dirty_for_io(page); - dec_page_count(F2FS_P_SB(page), F2FS_DIRTY_NODES); + if (folio_test_dirty(folio)) { + f2fs_clear_page_cache_dirty_tag(folio); + folio_clear_dirty_for_io(folio); + dec_page_count(F2FS_F_SB(folio), F2FS_DIRTY_NODES); } - ClearPageUptodate(page); + folio_clear_uptodate(folio); } -static struct page *get_current_nat_page(struct f2fs_sb_info *sbi, nid_t nid) +static struct folio *get_current_nat_folio(struct f2fs_sb_info *sbi, nid_t nid) { - return f2fs_get_meta_page_retry(sbi, current_nat_addr(sbi, nid)); + return f2fs_get_meta_folio_retry(sbi, current_nat_addr(sbi, nid)); } static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) { - struct page *src_page; - struct page *dst_page; + struct folio *src_folio; + struct folio *dst_folio; pgoff_t dst_off; void *src_addr; void *dst_addr; @@ -147,21 +147,21 @@ static struct page *get_next_nat_page(struct f2fs_sb_info *sbi, nid_t nid) dst_off = next_nat_addr(sbi, current_nat_addr(sbi, nid)); /* get current nat block page with lock */ - src_page = get_current_nat_page(sbi, nid); - if (IS_ERR(src_page)) - return src_page; - dst_page = f2fs_grab_meta_page(sbi, dst_off); - f2fs_bug_on(sbi, PageDirty(src_page)); - - src_addr = page_address(src_page); - dst_addr = page_address(dst_page); + src_folio = get_current_nat_folio(sbi, nid); + if (IS_ERR(src_folio)) + return &src_folio->page; + dst_folio = f2fs_grab_meta_folio(sbi, dst_off); + f2fs_bug_on(sbi, folio_test_dirty(src_folio)); + + src_addr = folio_address(src_folio); + dst_addr = folio_address(dst_folio); memcpy(dst_addr, src_addr, PAGE_SIZE); - 
set_page_dirty(dst_page); - f2fs_put_page(src_page, 1); + folio_mark_dirty(dst_folio); + f2fs_folio_put(src_folio, true); set_to_next_nat(nm_i, nid); - return dst_page; + return &dst_folio->page; } static struct nat_entry *__alloc_nat_entry(struct f2fs_sb_info *sbi, @@ -310,10 +310,10 @@ static unsigned int __gang_lookup_nat_set(struct f2fs_nm_info *nm_i, start, nr); } -bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, const struct folio *folio) +bool f2fs_in_warm_node_list(struct f2fs_sb_info *sbi, struct folio *folio) { - return NODE_MAPPING(sbi) == folio->mapping && - IS_DNODE(&folio->page) && is_cold_node(&folio->page); + return is_node_folio(folio) && IS_DNODE(&folio->page) && + is_cold_node(&folio->page); } void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi) @@ -325,7 +325,7 @@ void f2fs_init_fsync_node_info(struct f2fs_sb_info *sbi) } static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi, - struct page *page) + struct folio *folio) { struct fsync_node_entry *fn; unsigned long flags; @@ -334,8 +334,8 @@ static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi, fn = f2fs_kmem_cache_alloc(fsync_node_entry_slab, GFP_NOFS, true, NULL); - get_page(page); - fn->page = page; + folio_get(folio); + fn->folio = folio; INIT_LIST_HEAD(&fn->list); spin_lock_irqsave(&sbi->fsync_node_lock, flags); @@ -348,19 +348,19 @@ static unsigned int f2fs_add_fsync_node_entry(struct f2fs_sb_info *sbi, return seq_id; } -void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct page *page) +void f2fs_del_fsync_node_entry(struct f2fs_sb_info *sbi, struct folio *folio) { struct fsync_node_entry *fn; unsigned long flags; spin_lock_irqsave(&sbi->fsync_node_lock, flags); list_for_each_entry(fn, &sbi->fsync_node_list, list) { - if (fn->page == page) { + if (fn->folio == folio) { list_del(&fn->list); sbi->fsync_node_num--; spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); kmem_cache_free(fsync_node_entry_slab, fn); - put_page(page); + folio_put(folio); return; } } @@ -551,7 +551,7 @@ int f2fs_get_node_info(struct f2fs_sb_info *sbi, nid_t nid, struct f2fs_journal *journal = curseg->journal; nid_t start_nid = START_NID(nid); struct f2fs_nat_block *nat_blk; - struct page *page = NULL; + struct folio *folio = NULL; struct f2fs_nat_entry ne; struct nat_entry *e; pgoff_t index; @@ -601,14 +601,14 @@ retry: index = current_nat_addr(sbi, nid); f2fs_up_read(&nm_i->nat_tree_lock); - page = f2fs_get_meta_page(sbi, index); - if (IS_ERR(page)) - return PTR_ERR(page); + folio = f2fs_get_meta_folio(sbi, index); + if (IS_ERR(folio)) + return PTR_ERR(folio); - nat_blk = (struct f2fs_nat_block *)page_address(page); + nat_blk = folio_address(folio); ne = nat_blk->entries[nid - start_nid]; node_info_from_raw_nat(ni, &ne); - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); cache: blkaddr = le32_to_cpu(ne.block_addr); if (__is_valid_data_blkaddr(blkaddr) && @@ -623,9 +623,9 @@ cache: /* * readahead MAX_RA_NODE number of node pages. 
*/ -static void f2fs_ra_node_pages(struct page *parent, int start, int n) +static void f2fs_ra_node_pages(struct folio *parent, int start, int n) { - struct f2fs_sb_info *sbi = F2FS_P_SB(parent); + struct f2fs_sb_info *sbi = F2FS_F_SB(parent); struct blk_plug plug; int i, end; nid_t nid; @@ -636,7 +636,7 @@ static void f2fs_ra_node_pages(struct page *parent, int start, int n) end = start + n; end = min(end, (int)NIDS_PER_BLOCK); for (i = start; i < end; i++) { - nid = get_nid(parent, i, false); + nid = get_nid(&parent->page, i, false); f2fs_ra_node_page(sbi, nid); } @@ -754,6 +754,8 @@ got: return level; } +static struct folio *f2fs_get_node_folio_ra(struct folio *parent, int start); + /* * Caller should call f2fs_put_dnode(dn). * Also, it should grab and release a rwsem by calling f2fs_lock_op() and @@ -762,8 +764,8 @@ got: int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); - struct page *npage[4]; - struct page *parent = NULL; + struct folio *nfolio[4]; + struct folio *parent = NULL; int offset[4]; unsigned int noffset[4]; nid_t nids[4]; @@ -775,26 +777,27 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) return level; nids[0] = dn->inode->i_ino; - npage[0] = dn->inode_page; - if (!npage[0]) { - npage[0] = f2fs_get_inode_page(sbi, nids[0]); - if (IS_ERR(npage[0])) - return PTR_ERR(npage[0]); + if (!dn->inode_folio) { + nfolio[0] = f2fs_get_inode_folio(sbi, nids[0]); + if (IS_ERR(nfolio[0])) + return PTR_ERR(nfolio[0]); + } else { + nfolio[0] = dn->inode_folio; } /* if inline_data is set, should not report any block indices */ if (f2fs_has_inline_data(dn->inode) && index) { err = -ENOENT; - f2fs_put_page(npage[0], 1); + f2fs_folio_put(nfolio[0], true); goto release_out; } - parent = npage[0]; + parent = nfolio[0]; if (level != 0) - nids[1] = get_nid(parent, offset[0], true); - dn->inode_page = npage[0]; - dn->inode_page_locked = true; + nids[1] = get_nid(&parent->page, offset[0], true); + dn->inode_folio = nfolio[0]; + dn->inode_folio_locked = true; /* get indirect or direct nodes */ for (i = 1; i <= level; i++) { @@ -808,10 +811,10 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) } dn->nid = nids[i]; - npage[i] = f2fs_new_node_page(dn, noffset[i]); - if (IS_ERR(npage[i])) { + nfolio[i] = f2fs_new_node_folio(dn, noffset[i]); + if (IS_ERR(nfolio[i])) { f2fs_alloc_nid_failed(sbi, nids[i]); - err = PTR_ERR(npage[i]); + err = PTR_ERR(nfolio[i]); goto release_pages; } @@ -819,36 +822,36 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) f2fs_alloc_nid_done(sbi, nids[i]); done = true; } else if (mode == LOOKUP_NODE_RA && i == level && level > 1) { - npage[i] = f2fs_get_node_page_ra(parent, offset[i - 1]); - if (IS_ERR(npage[i])) { - err = PTR_ERR(npage[i]); + nfolio[i] = f2fs_get_node_folio_ra(parent, offset[i - 1]); + if (IS_ERR(nfolio[i])) { + err = PTR_ERR(nfolio[i]); goto release_pages; } done = true; } if (i == 1) { - dn->inode_page_locked = false; - unlock_page(parent); + dn->inode_folio_locked = false; + folio_unlock(parent); } else { - f2fs_put_page(parent, 1); + f2fs_folio_put(parent, true); } if (!done) { - npage[i] = f2fs_get_node_page(sbi, nids[i]); - if (IS_ERR(npage[i])) { - err = PTR_ERR(npage[i]); - f2fs_put_page(npage[0], 0); + nfolio[i] = f2fs_get_node_folio(sbi, nids[i]); + if (IS_ERR(nfolio[i])) { + err = PTR_ERR(nfolio[i]); + f2fs_folio_put(nfolio[0], false); goto release_out; } } if (i < level) { - parent 
= npage[i]; - nids[i + 1] = get_nid(parent, offset[i], false); + parent = nfolio[i]; + nids[i + 1] = get_nid(&parent->page, offset[i], false); } } dn->nid = nids[level]; dn->ofs_in_node = offset[level]; - dn->node_page = npage[level]; + dn->node_folio = nfolio[level]; dn->data_blkaddr = f2fs_data_blkaddr(dn); if (is_inode_flag_set(dn->inode, FI_COMPRESSED_FILE) && @@ -869,9 +872,9 @@ int f2fs_get_dnode_of_data(struct dnode_of_data *dn, pgoff_t index, int mode) if (!c_len) goto out; - blkaddr = data_blkaddr(dn->inode, dn->node_page, ofs_in_node); + blkaddr = data_blkaddr(dn->inode, dn->node_folio, ofs_in_node); if (blkaddr == COMPRESS_ADDR) - blkaddr = data_blkaddr(dn->inode, dn->node_page, + blkaddr = data_blkaddr(dn->inode, dn->node_folio, ofs_in_node + 1); f2fs_update_read_extent_tree_range_compressed(dn->inode, @@ -881,12 +884,12 @@ out: return 0; release_pages: - f2fs_put_page(parent, 1); + f2fs_folio_put(parent, true); if (i > 1) - f2fs_put_page(npage[0], 0); + f2fs_folio_put(nfolio[0], false); release_out: - dn->inode_page = NULL; - dn->node_page = NULL; + dn->inode_folio = NULL; + dn->node_folio = NULL; if (err == -ENOENT) { dn->cur_level = i; dn->max_level = level; @@ -927,16 +930,16 @@ static int truncate_node(struct dnode_of_data *dn) f2fs_inode_synced(dn->inode); } - clear_node_page_dirty(dn->node_page); + clear_node_folio_dirty(dn->node_folio); set_sbi_flag(sbi, SBI_IS_DIRTY); - index = page_folio(dn->node_page)->index; - f2fs_put_page(dn->node_page, 1); + index = dn->node_folio->index; + f2fs_folio_put(dn->node_folio, true); invalidate_mapping_pages(NODE_MAPPING(sbi), index, index); - dn->node_page = NULL; + dn->node_folio = NULL; trace_f2fs_truncate_node(dn->inode, dn->nid, ni.blk_addr); return 0; @@ -945,35 +948,35 @@ static int truncate_node(struct dnode_of_data *dn) static int truncate_dnode(struct dnode_of_data *dn) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); - struct page *page; + struct folio *folio; int err; if (dn->nid == 0) return 1; /* get direct node */ - page = f2fs_get_node_page(sbi, dn->nid); - if (PTR_ERR(page) == -ENOENT) + folio = f2fs_get_node_folio(sbi, dn->nid); + if (PTR_ERR(folio) == -ENOENT) return 1; - else if (IS_ERR(page)) - return PTR_ERR(page); + else if (IS_ERR(folio)) + return PTR_ERR(folio); - if (IS_INODE(page) || ino_of_node(page) != dn->inode->i_ino) { + if (IS_INODE(&folio->page) || ino_of_node(&folio->page) != dn->inode->i_ino) { f2fs_err(sbi, "incorrect node reference, ino: %lu, nid: %u, ino_of_node: %u", - dn->inode->i_ino, dn->nid, ino_of_node(page)); + dn->inode->i_ino, dn->nid, ino_of_node(&folio->page)); set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_INVALID_NODE_REFERENCE); - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return -EFSCORRUPTED; } /* Make dnode_of_data for parameter */ - dn->node_page = page; + dn->node_folio = folio; dn->ofs_in_node = 0; f2fs_truncate_data_blocks_range(dn, ADDRS_PER_BLOCK(dn->inode)); err = truncate_node(dn); if (err) { - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return err; } @@ -984,7 +987,7 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, int ofs, int depth) { struct dnode_of_data rdn = *dn; - struct page *page; + struct folio *folio; struct f2fs_node *rn; nid_t child_nid; unsigned int child_nofs; @@ -996,15 +999,15 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, trace_f2fs_truncate_nodes_enter(dn->inode, dn->nid, dn->data_blkaddr); - page = f2fs_get_node_page(F2FS_I_SB(dn->inode), dn->nid); - if 
(IS_ERR(page)) { - trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(page)); - return PTR_ERR(page); + folio = f2fs_get_node_folio(F2FS_I_SB(dn->inode), dn->nid); + if (IS_ERR(folio)) { + trace_f2fs_truncate_nodes_exit(dn->inode, PTR_ERR(folio)); + return PTR_ERR(folio); } - f2fs_ra_node_pages(page, ofs, NIDS_PER_BLOCK); + f2fs_ra_node_pages(folio, ofs, NIDS_PER_BLOCK); - rn = F2FS_NODE(page); + rn = F2FS_NODE(&folio->page); if (depth < 3) { for (i = ofs; i < NIDS_PER_BLOCK; i++, freed++) { child_nid = le32_to_cpu(rn->in.nid[i]); @@ -1014,7 +1017,7 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, ret = truncate_dnode(&rdn); if (ret < 0) goto out_err; - if (set_nid(page, i, 0, false)) + if (set_nid(folio, i, 0, false)) dn->node_changed = true; } } else { @@ -1028,7 +1031,7 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, rdn.nid = child_nid; ret = truncate_nodes(&rdn, child_nofs, 0, depth - 1); if (ret == (NIDS_PER_BLOCK + 1)) { - if (set_nid(page, i, 0, false)) + if (set_nid(folio, i, 0, false)) dn->node_changed = true; child_nofs += ret; } else if (ret < 0 && ret != -ENOENT) { @@ -1040,19 +1043,19 @@ static int truncate_nodes(struct dnode_of_data *dn, unsigned int nofs, if (!ofs) { /* remove current indirect node */ - dn->node_page = page; + dn->node_folio = folio; ret = truncate_node(dn); if (ret) goto out_err; freed++; } else { - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); } trace_f2fs_truncate_nodes_exit(dn->inode, freed); return freed; out_err: - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); trace_f2fs_truncate_nodes_exit(dn->inode, ret); return ret; } @@ -1060,59 +1063,59 @@ out_err: static int truncate_partial_nodes(struct dnode_of_data *dn, struct f2fs_inode *ri, int *offset, int depth) { - struct page *pages[2]; + struct folio *folios[2]; nid_t nid[3]; nid_t child_nid; int err = 0; int i; int idx = depth - 2; - nid[0] = get_nid(dn->inode_page, offset[0], true); + nid[0] = get_nid(&dn->inode_folio->page, offset[0], true); if (!nid[0]) return 0; /* get indirect nodes in the path */ for (i = 0; i < idx + 1; i++) { /* reference count'll be increased */ - pages[i] = f2fs_get_node_page(F2FS_I_SB(dn->inode), nid[i]); - if (IS_ERR(pages[i])) { - err = PTR_ERR(pages[i]); + folios[i] = f2fs_get_node_folio(F2FS_I_SB(dn->inode), nid[i]); + if (IS_ERR(folios[i])) { + err = PTR_ERR(folios[i]); idx = i - 1; goto fail; } - nid[i + 1] = get_nid(pages[i], offset[i + 1], false); + nid[i + 1] = get_nid(&folios[i]->page, offset[i + 1], false); } - f2fs_ra_node_pages(pages[idx], offset[idx + 1], NIDS_PER_BLOCK); + f2fs_ra_node_pages(folios[idx], offset[idx + 1], NIDS_PER_BLOCK); /* free direct nodes linked to a partial indirect node */ for (i = offset[idx + 1]; i < NIDS_PER_BLOCK; i++) { - child_nid = get_nid(pages[idx], i, false); + child_nid = get_nid(&folios[idx]->page, i, false); if (!child_nid) continue; dn->nid = child_nid; err = truncate_dnode(dn); if (err < 0) goto fail; - if (set_nid(pages[idx], i, 0, false)) + if (set_nid(folios[idx], i, 0, false)) dn->node_changed = true; } if (offset[idx + 1] == 0) { - dn->node_page = pages[idx]; + dn->node_folio = folios[idx]; dn->nid = nid[idx]; err = truncate_node(dn); if (err) goto fail; } else { - f2fs_put_page(pages[idx], 1); + f2fs_folio_put(folios[idx], true); } offset[idx]++; offset[idx + 1] = 0; idx--; fail: for (i = idx; i >= 0; i--) - f2fs_put_page(pages[i], 1); + f2fs_folio_put(folios[i], true); trace_f2fs_truncate_partial_nodes(dn->inode, nid, depth, err); @@ -1153,7 +1156,7 
@@ int f2fs_truncate_inode_blocks(struct inode *inode, pgoff_t from) return PTR_ERR(folio); } - set_new_dnode(&dn, inode, &folio->page, NULL, 0); + set_new_dnode(&dn, inode, folio, NULL, 0); folio_unlock(folio); ri = F2FS_INODE(&folio->page); @@ -1219,8 +1222,8 @@ skip_partial: goto fail; if (offset[1] == 0 && get_nid(&folio->page, offset[0], true)) { folio_lock(folio); - BUG_ON(folio->mapping != NODE_MAPPING(sbi)); - set_nid(&folio->page, offset[0], 0, true); + BUG_ON(!is_node_folio(folio)); + set_nid(folio, offset[0], 0, true); folio_unlock(folio); } offset[1] = 0; @@ -1239,20 +1242,20 @@ int f2fs_truncate_xattr_node(struct inode *inode) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t nid = F2FS_I(inode)->i_xattr_nid; struct dnode_of_data dn; - struct page *npage; + struct folio *nfolio; int err; if (!nid) return 0; - npage = f2fs_get_xnode_page(sbi, nid); - if (IS_ERR(npage)) - return PTR_ERR(npage); + nfolio = f2fs_get_xnode_folio(sbi, nid); + if (IS_ERR(nfolio)) + return PTR_ERR(nfolio); - set_new_dnode(&dn, inode, NULL, npage, nid); + set_new_dnode(&dn, inode, NULL, nfolio, nid); err = truncate_node(&dn); if (err) { - f2fs_put_page(npage, 1); + f2fs_folio_put(nfolio, true); return err; } @@ -1309,30 +1312,30 @@ int f2fs_remove_inode_page(struct inode *inode) return 0; } -struct page *f2fs_new_inode_page(struct inode *inode) +struct folio *f2fs_new_inode_folio(struct inode *inode) { struct dnode_of_data dn; /* allocate inode page for new inode */ set_new_dnode(&dn, inode, NULL, NULL, inode->i_ino); - /* caller should f2fs_put_page(page, 1); */ - return f2fs_new_node_page(&dn, 0); + /* caller should f2fs_folio_put(folio, true); */ + return f2fs_new_node_folio(&dn, 0); } -struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) +struct folio *f2fs_new_node_folio(struct dnode_of_data *dn, unsigned int ofs) { struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode); struct node_info new_ni; - struct page *page; + struct folio *folio; int err; if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC))) return ERR_PTR(-EPERM); - page = f2fs_grab_cache_page(NODE_MAPPING(sbi), dn->nid, false); - if (!page) - return ERR_PTR(-ENOMEM); + folio = f2fs_grab_cache_folio(NODE_MAPPING(sbi), dn->nid, false); + if (IS_ERR(folio)) + return folio; if (unlikely((err = inc_valid_node_count(sbi, dn->inode, !ofs)))) goto fail; @@ -1348,7 +1351,7 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) dec_valid_node_count(sbi, dn->inode, !ofs); set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_warn_ratelimited(sbi, - "f2fs_new_node_page: inconsistent nat entry, " + "f2fs_new_node_folio: inconsistent nat entry, " "ino:%u, nid:%u, blkaddr:%u, ver:%u, flag:%u", new_ni.ino, new_ni.nid, new_ni.blk_addr, new_ni.version, new_ni.flag); @@ -1363,12 +1366,12 @@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) new_ni.version = 0; set_node_addr(sbi, &new_ni, NEW_ADDR, false); - f2fs_wait_on_page_writeback(page, NODE, true, true); - fill_node_footer(page, dn->nid, dn->inode->i_ino, ofs, true); - set_cold_node(page, S_ISDIR(dn->inode->i_mode)); - if (!PageUptodate(page)) - SetPageUptodate(page); - if (set_page_dirty(page)) + f2fs_folio_wait_writeback(folio, NODE, true, true); + fill_node_footer(&folio->page, dn->nid, dn->inode->i_ino, ofs, true); + set_cold_node(&folio->page, S_ISDIR(dn->inode->i_mode)); + if (!folio_test_uptodate(folio)) + folio_mark_uptodate(folio); + if (folio_mark_dirty(folio)) dn->node_changed = true; if (f2fs_has_xattr_block(ofs)) @@ -1376,35 +1379,34 
@@ struct page *f2fs_new_node_page(struct dnode_of_data *dn, unsigned int ofs) if (ofs == 0) inc_valid_inode_count(sbi); - return page; + return folio; fail: - clear_node_page_dirty(page); - f2fs_put_page(page, 1); + clear_node_folio_dirty(folio); + f2fs_folio_put(folio, true); return ERR_PTR(err); } /* * Caller should do after getting the following values. - * 0: f2fs_put_page(page, 0) - * LOCKED_PAGE or error: f2fs_put_page(page, 1) + * 0: f2fs_folio_put(folio, false) + * LOCKED_PAGE or error: f2fs_folio_put(folio, true) */ -static int read_node_page(struct page *page, blk_opf_t op_flags) +static int read_node_folio(struct folio *folio, blk_opf_t op_flags) { - struct folio *folio = page_folio(page); - struct f2fs_sb_info *sbi = F2FS_P_SB(page); + struct f2fs_sb_info *sbi = F2FS_F_SB(folio); struct node_info ni; struct f2fs_io_info fio = { .sbi = sbi, .type = NODE, .op = REQ_OP_READ, .op_flags = op_flags, - .page = page, + .page = &folio->page, .encrypted_page = NULL, }; int err; if (folio_test_uptodate(folio)) { - if (!f2fs_inode_chksum_verify(sbi, page)) { + if (!f2fs_inode_chksum_verify(sbi, folio)) { folio_clear_uptodate(folio); return -EFSBADCRC; } @@ -1436,7 +1438,7 @@ static int read_node_page(struct page *page, blk_opf_t op_flags) */ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) { - struct page *apage; + struct folio *afolio; int err; if (!nid) @@ -1444,22 +1446,24 @@ void f2fs_ra_node_page(struct f2fs_sb_info *sbi, nid_t nid) if (f2fs_check_nid_range(sbi, nid)) return; - apage = xa_load(&NODE_MAPPING(sbi)->i_pages, nid); - if (apage) + afolio = xa_load(&NODE_MAPPING(sbi)->i_pages, nid); + if (afolio) return; - apage = f2fs_grab_cache_page(NODE_MAPPING(sbi), nid, false); - if (!apage) + afolio = f2fs_grab_cache_folio(NODE_MAPPING(sbi), nid, false); + if (IS_ERR(afolio)) return; - err = read_node_page(apage, REQ_RAHEAD); - f2fs_put_page(apage, err ? 1 : 0); + err = read_node_folio(afolio, REQ_RAHEAD); + f2fs_folio_put(afolio, err ? 
true : false); } static int sanity_check_node_footer(struct f2fs_sb_info *sbi, - struct page *page, pgoff_t nid, + struct folio *folio, pgoff_t nid, enum node_type ntype) { + struct page *page = &folio->page; + if (unlikely(nid != nid_of_node(page) || (ntype == NODE_TYPE_INODE && !IS_INODE(page)) || (ntype == NODE_TYPE_XATTR && @@ -1469,7 +1473,7 @@ static int sanity_check_node_footer(struct f2fs_sb_info *sbi, "node_footer[nid:%u,ino:%u,ofs:%u,cpver:%llu,blkaddr:%u]", ntype, nid, nid_of_node(page), ino_of_node(page), ofs_of_node(page), cpver_of_node(page), - next_blkaddr_of_node(page)); + next_blkaddr_of_node(folio)); set_sbi_flag(sbi, SBI_NEED_FSCK); f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); return -EFSCORRUPTED; @@ -1478,8 +1482,7 @@ static int sanity_check_node_footer(struct f2fs_sb_info *sbi, } static struct folio *__get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid, - struct page *parent, int start, - enum node_type ntype) + struct folio *parent, int start, enum node_type ntype) { struct folio *folio; int err; @@ -1493,20 +1496,18 @@ repeat: if (IS_ERR(folio)) return folio; - err = read_node_page(&folio->page, 0); - if (err < 0) { + err = read_node_folio(folio, 0); + if (err < 0) goto out_put_err; - } else if (err == LOCKED_PAGE) { - err = 0; + if (err == LOCKED_PAGE) goto page_hit; - } if (parent) f2fs_ra_node_pages(parent, start + 1, MAX_RA_NODE); folio_lock(folio); - if (unlikely(folio->mapping != NODE_MAPPING(sbi))) { + if (unlikely(!is_node_folio(folio))) { f2fs_folio_put(folio, true); goto repeat; } @@ -1516,30 +1517,27 @@ repeat: goto out_err; } - if (!f2fs_inode_chksum_verify(sbi, &folio->page)) { + if (!f2fs_inode_chksum_verify(sbi, folio)) { err = -EFSBADCRC; goto out_err; } page_hit: - err = sanity_check_node_footer(sbi, &folio->page, nid, ntype); + err = sanity_check_node_footer(sbi, folio, nid, ntype); if (!err) return folio; out_err: folio_clear_uptodate(folio); out_put_err: - /* ENOENT comes from read_node_page which is not an error. */ + /* ENOENT comes from read_node_folio which is not an error. 
*/ if (err != -ENOENT) f2fs_handle_page_eio(sbi, folio, NODE); f2fs_folio_put(folio, true); return ERR_PTR(err); } -struct page *f2fs_get_node_page(struct f2fs_sb_info *sbi, pgoff_t nid) +struct folio *f2fs_get_node_folio(struct f2fs_sb_info *sbi, pgoff_t nid) { - struct folio *folio = __get_node_folio(sbi, nid, NULL, 0, - NODE_TYPE_REGULAR); - - return &folio->page; + return __get_node_folio(sbi, nid, NULL, 0, NODE_TYPE_REGULAR); } struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino) @@ -1547,35 +1545,23 @@ struct folio *f2fs_get_inode_folio(struct f2fs_sb_info *sbi, pgoff_t ino) return __get_node_folio(sbi, ino, NULL, 0, NODE_TYPE_INODE); } -struct page *f2fs_get_inode_page(struct f2fs_sb_info *sbi, pgoff_t ino) +struct folio *f2fs_get_xnode_folio(struct f2fs_sb_info *sbi, pgoff_t xnid) { - struct folio *folio = f2fs_get_inode_folio(sbi, ino); - - return &folio->page; -} - -struct page *f2fs_get_xnode_page(struct f2fs_sb_info *sbi, pgoff_t xnid) -{ - struct folio *folio = __get_node_folio(sbi, xnid, NULL, 0, - NODE_TYPE_XATTR); - - return &folio->page; + return __get_node_folio(sbi, xnid, NULL, 0, NODE_TYPE_XATTR); } -struct page *f2fs_get_node_page_ra(struct page *parent, int start) +static struct folio *f2fs_get_node_folio_ra(struct folio *parent, int start) { - struct f2fs_sb_info *sbi = F2FS_P_SB(parent); - nid_t nid = get_nid(parent, start, false); - struct folio *folio = __get_node_folio(sbi, nid, parent, start, - NODE_TYPE_REGULAR); + struct f2fs_sb_info *sbi = F2FS_F_SB(parent); + nid_t nid = get_nid(&parent->page, start, false); - return &folio->page; + return __get_node_folio(sbi, nid, parent, start, NODE_TYPE_REGULAR); } static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) { struct inode *inode; - struct page *page; + struct folio *folio; int ret; /* should flush inline_data before evict_inode */ @@ -1583,27 +1569,27 @@ static void flush_inline_data(struct f2fs_sb_info *sbi, nid_t ino) if (!inode) return; - page = f2fs_pagecache_get_page(inode->i_mapping, 0, + folio = f2fs_filemap_get_folio(inode->i_mapping, 0, FGP_LOCK|FGP_NOWAIT, 0); - if (!page) + if (IS_ERR(folio)) goto iput_out; - if (!PageUptodate(page)) - goto page_out; + if (!folio_test_uptodate(folio)) + goto folio_out; - if (!PageDirty(page)) - goto page_out; + if (!folio_test_dirty(folio)) + goto folio_out; - if (!clear_page_dirty_for_io(page)) - goto page_out; + if (!folio_clear_dirty_for_io(folio)) + goto folio_out; - ret = f2fs_write_inline_data(inode, page_folio(page)); + ret = f2fs_write_inline_data(inode, folio); inode_dec_dirty_pages(inode); f2fs_remove_dirty_inode(inode); if (ret) - set_page_dirty(page); -page_out: - f2fs_put_page(page, 1); + folio_mark_dirty(folio); +folio_out: + f2fs_folio_put(folio, true); iput_out: iput(inode); } @@ -1639,7 +1625,7 @@ static struct folio *last_fsync_dnode(struct f2fs_sb_info *sbi, nid_t ino) folio_lock(folio); - if (unlikely(folio->mapping != NODE_MAPPING(sbi))) { + if (unlikely(!is_node_folio(folio))) { continue_unlock: folio_unlock(folio); continue; @@ -1665,21 +1651,20 @@ continue_unlock: return last_folio; } -static int __write_node_page(struct page *page, bool atomic, bool *submitted, +static bool __write_node_folio(struct folio *folio, bool atomic, bool *submitted, struct writeback_control *wbc, bool do_balance, enum iostat_type io_type, unsigned int *seq_id) { - struct f2fs_sb_info *sbi = F2FS_P_SB(page); - struct folio *folio = page_folio(page); + struct f2fs_sb_info *sbi = F2FS_F_SB(folio); nid_t nid; struct node_info ni; 
struct f2fs_io_info fio = { .sbi = sbi, - .ino = ino_of_node(page), + .ino = ino_of_node(&folio->page), .type = NODE, .op = REQ_OP_WRITE, .op_flags = wbc_to_write_flags(wbc), - .page = page, + .page = &folio->page, .encrypted_page = NULL, .submitted = 0, .io_type = io_type, @@ -1696,7 +1681,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, folio_clear_uptodate(folio); dec_page_count(sbi, F2FS_DIRTY_NODES); folio_unlock(folio); - return 0; + return true; } if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING))) @@ -1704,22 +1689,17 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED) && wbc->sync_mode == WB_SYNC_NONE && - IS_DNODE(page) && is_cold_node(page)) + IS_DNODE(&folio->page) && is_cold_node(&folio->page)) goto redirty_out; /* get old block addr of this node page */ - nid = nid_of_node(page); + nid = nid_of_node(&folio->page); f2fs_bug_on(sbi, folio->index != nid); if (f2fs_get_node_info(sbi, nid, &ni, !do_balance)) goto redirty_out; - if (wbc->for_reclaim) { - if (!f2fs_down_read_trylock(&sbi->node_write)) - goto redirty_out; - } else { - f2fs_down_read(&sbi->node_write); - } + f2fs_down_read(&sbi->node_write); /* This page is already truncated */ if (unlikely(ni.blk_addr == NULL_ADDR)) { @@ -1727,7 +1707,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, dec_page_count(sbi, F2FS_DIRTY_NODES); f2fs_up_read(&sbi->node_write); folio_unlock(folio); - return 0; + return true; } if (__is_valid_data_blkaddr(ni.blk_addr) && @@ -1742,7 +1722,7 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, /* should add to global list before clearing PAGECACHE status */ if (f2fs_in_warm_node_list(sbi, folio)) { - seq = f2fs_add_fsync_node_entry(sbi, page); + seq = f2fs_add_fsync_node_entry(sbi, folio); if (seq_id) *seq_id = seq; } @@ -1751,15 +1731,10 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, fio.old_blkaddr = ni.blk_addr; f2fs_do_write_node_page(nid, &fio); - set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(page)); + set_node_addr(sbi, &ni, fio.new_blkaddr, is_fsync_dnode(&folio->page)); dec_page_count(sbi, F2FS_DIRTY_NODES); f2fs_up_read(&sbi->node_write); - if (wbc->for_reclaim) { - f2fs_submit_merged_write_cond(sbi, NULL, page, 0, NODE); - submitted = NULL; - } - folio_unlock(folio); if (unlikely(f2fs_cp_error(sbi))) { @@ -1771,14 +1746,15 @@ static int __write_node_page(struct page *page, bool atomic, bool *submitted, if (do_balance) f2fs_balance_fs(sbi, false); - return 0; + return true; redirty_out: folio_redirty_for_writepage(wbc, folio); - return AOP_WRITEPAGE_ACTIVATE; + folio_unlock(folio); + return false; } -int f2fs_move_node_page(struct page *node_page, int gc_type) +int f2fs_move_node_folio(struct folio *node_folio, int gc_type) { int err = 0; @@ -1786,33 +1762,30 @@ int f2fs_move_node_page(struct page *node_page, int gc_type) struct writeback_control wbc = { .sync_mode = WB_SYNC_ALL, .nr_to_write = 1, - .for_reclaim = 0, }; - f2fs_wait_on_page_writeback(node_page, NODE, true, true); + f2fs_folio_wait_writeback(node_folio, NODE, true, true); - set_page_dirty(node_page); + folio_mark_dirty(node_folio); - if (!clear_page_dirty_for_io(node_page)) { + if (!folio_clear_dirty_for_io(node_folio)) { err = -EAGAIN; goto out_page; } - if (__write_node_page(node_page, false, NULL, - &wbc, false, FS_GC_NODE_IO, NULL)) { + if (!__write_node_folio(node_folio, false, NULL, + &wbc, false, 
FS_GC_NODE_IO, NULL)) err = -EAGAIN; - unlock_page(node_page); - } goto release_page; } else { /* set page dirty and write it */ - if (!folio_test_writeback(page_folio(node_page))) - set_page_dirty(node_page); + if (!folio_test_writeback(node_folio)) + folio_mark_dirty(node_folio); } out_page: - unlock_page(node_page); + folio_unlock(node_folio); release_page: - f2fs_put_page(node_page, 0); + f2fs_folio_put(node_folio, false); return err; } @@ -1861,7 +1834,7 @@ retry: folio_lock(folio); - if (unlikely(folio->mapping != NODE_MAPPING(sbi))) { + if (unlikely(!is_node_folio(folio))) { continue_unlock: folio_unlock(folio); continue; @@ -1885,7 +1858,7 @@ continue_unlock: if (IS_INODE(&folio->page)) { if (is_inode_flag_set(inode, FI_DIRTY_INODE)) - f2fs_update_inode(inode, &folio->page); + f2fs_update_inode(inode, folio); set_dentry_mark(&folio->page, f2fs_need_dentry_mark(sbi, ino)); } @@ -1897,31 +1870,29 @@ continue_unlock: if (!folio_clear_dirty_for_io(folio)) goto continue_unlock; - ret = __write_node_page(&folio->page, atomic && + if (!__write_node_folio(folio, atomic && folio == last_folio, &submitted, wbc, true, - FS_NODE_IO, seq_id); - if (ret) { - folio_unlock(folio); + FS_NODE_IO, seq_id)) { f2fs_folio_put(last_folio, false); - break; - } else if (submitted) { - nwritten++; + folio_batch_release(&fbatch); + ret = -EIO; + goto out; } + if (submitted) + nwritten++; if (folio == last_folio) { f2fs_folio_put(folio, false); + folio_batch_release(&fbatch); marked = true; - break; + goto out; } } folio_batch_release(&fbatch); cond_resched(); - - if (ret || marked) - break; } - if (!ret && atomic && !marked) { + if (atomic && !marked) { f2fs_debug(sbi, "Retry to write fsync mark: ino=%u, idx=%lx", ino, last_folio->index); folio_lock(last_folio); @@ -1933,7 +1904,7 @@ continue_unlock: out: if (nwritten) f2fs_submit_merged_write_cond(sbi, NULL, NULL, ino, NODE); - return ret ? 
-EIO : 0; + return ret; } static int f2fs_match_ino(struct inode *inode, unsigned long ino, void *data) @@ -1970,7 +1941,7 @@ static bool flush_dirty_inode(struct folio *folio) if (!inode) return false; - f2fs_update_inode(inode, &folio->page); + f2fs_update_inode(inode, folio); folio_unlock(folio); iput(inode); @@ -1998,7 +1969,7 @@ void f2fs_flush_inline_data(struct f2fs_sb_info *sbi) folio_lock(folio); - if (unlikely(folio->mapping != NODE_MAPPING(sbi))) + if (unlikely(!is_node_folio(folio))) goto unlock; if (!folio_test_dirty(folio)) goto unlock; @@ -2070,7 +2041,7 @@ lock_node: else if (!folio_trylock(folio)) continue; - if (unlikely(folio->mapping != NODE_MAPPING(sbi))) { + if (unlikely(!is_node_folio(folio))) { continue_unlock: folio_unlock(folio); continue; @@ -2105,11 +2076,14 @@ write_node: set_fsync_mark(&folio->page, 0); set_dentry_mark(&folio->page, 0); - ret = __write_node_page(&folio->page, false, &submitted, - wbc, do_balance, io_type, NULL); - if (ret) + if (!__write_node_folio(folio, false, &submitted, + wbc, do_balance, io_type, NULL)) { folio_unlock(folio); - else if (submitted) + folio_batch_release(&fbatch); + ret = -EIO; + goto out; + } + if (submitted) nwritten++; if (--wbc->nr_to_write == 0) @@ -2144,12 +2118,13 @@ int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, unsigned int seq_id) { struct fsync_node_entry *fn; - struct page *page; struct list_head *head = &sbi->fsync_node_list; unsigned long flags; unsigned int cur_seq_id = 0; while (seq_id && cur_seq_id < seq_id) { + struct folio *folio; + spin_lock_irqsave(&sbi->fsync_node_lock, flags); if (list_empty(head)) { spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); @@ -2161,13 +2136,13 @@ int f2fs_wait_on_node_pages_writeback(struct f2fs_sb_info *sbi, break; } cur_seq_id = fn->seq_id; - page = fn->page; - get_page(page); + folio = fn->folio; + folio_get(folio); spin_unlock_irqrestore(&sbi->fsync_node_lock, flags); - f2fs_wait_on_page_writeback(page, NODE, true, false); + f2fs_folio_wait_writeback(folio, NODE, true, false); - put_page(page); + folio_put(folio); } return filemap_check_errors(NODE_MAPPING(sbi)); @@ -2334,7 +2309,7 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, struct f2fs_nm_info *nm_i = NM_I(sbi); struct free_nid *i, *e; struct nat_entry *ne; - int err = -EINVAL; + int err; bool ret = false; /* 0 nid should not be used */ @@ -2348,7 +2323,10 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, i->nid = nid; i->state = FREE_NID; - radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); + err = radix_tree_preload(GFP_NOFS | __GFP_NOFAIL); + f2fs_bug_on(sbi, err); + + err = -EINVAL; spin_lock(&nm_i->nid_list_lock); @@ -2367,8 +2345,8 @@ static bool add_free_nid(struct f2fs_sb_info *sbi, * - __lookup_nat_cache * - f2fs_add_link * - f2fs_init_inode_metadata - * - f2fs_new_inode_page - * - f2fs_new_node_page + * - f2fs_new_inode_folio + * - f2fs_new_node_folio * - set_node_addr * - f2fs_alloc_nid_done * - __remove_nid_from_list(PREALLOC_NID) @@ -2421,10 +2399,9 @@ static void remove_free_nid(struct f2fs_sb_info *sbi, nid_t nid) } static int scan_nat_page(struct f2fs_sb_info *sbi, - struct page *nat_page, nid_t start_nid) + struct f2fs_nat_block *nat_blk, nid_t start_nid) { struct f2fs_nm_info *nm_i = NM_I(sbi); - struct f2fs_nat_block *nat_blk = page_address(nat_page); block_t blk_addr; unsigned int nat_ofs = NAT_BLOCK_OFFSET(start_nid); int i; @@ -2544,13 +2521,14 @@ static int __f2fs_build_free_nids(struct f2fs_sb_info *sbi, while (1) { if (!test_bit_le(NAT_BLOCK_OFFSET(nid), 
nm_i->nat_block_bitmap)) { - struct page *page = get_current_nat_page(sbi, nid); + struct folio *folio = get_current_nat_folio(sbi, nid); - if (IS_ERR(page)) { - ret = PTR_ERR(page); + if (IS_ERR(folio)) { + ret = PTR_ERR(folio); } else { - ret = scan_nat_page(sbi, page, nid); - f2fs_put_page(page, 1); + ret = scan_nat_page(sbi, folio_address(folio), + nid); + f2fs_folio_put(folio, true); } if (ret) { @@ -2726,18 +2704,18 @@ int f2fs_try_to_free_nids(struct f2fs_sb_info *sbi, int nr_shrink) return nr - nr_shrink; } -int f2fs_recover_inline_xattr(struct inode *inode, struct page *page) +int f2fs_recover_inline_xattr(struct inode *inode, struct folio *folio) { void *src_addr, *dst_addr; size_t inline_size; - struct page *ipage; + struct folio *ifolio; struct f2fs_inode *ri; - ipage = f2fs_get_inode_page(F2FS_I_SB(inode), inode->i_ino); - if (IS_ERR(ipage)) - return PTR_ERR(ipage); + ifolio = f2fs_get_inode_folio(F2FS_I_SB(inode), inode->i_ino); + if (IS_ERR(ifolio)) + return PTR_ERR(ifolio); - ri = F2FS_INODE(page); + ri = F2FS_INODE(&folio->page); if (ri->i_inline & F2FS_INLINE_XATTR) { if (!f2fs_has_inline_xattr(inode)) { set_inode_flag(inode, FI_INLINE_XATTR); @@ -2751,15 +2729,15 @@ int f2fs_recover_inline_xattr(struct inode *inode, struct page *page) goto update_inode; } - dst_addr = inline_xattr_addr(inode, ipage); - src_addr = inline_xattr_addr(inode, page); + dst_addr = inline_xattr_addr(inode, ifolio); + src_addr = inline_xattr_addr(inode, folio); inline_size = inline_xattr_size(inode); - f2fs_wait_on_page_writeback(ipage, NODE, true, true); + f2fs_folio_wait_writeback(ifolio, NODE, true, true); memcpy(dst_addr, src_addr, inline_size); update_inode: - f2fs_update_inode(inode, ipage); - f2fs_put_page(ipage, 1); + f2fs_update_inode(inode, ifolio); + f2fs_folio_put(ifolio, true); return 0; } @@ -2770,7 +2748,7 @@ int f2fs_recover_xattr_data(struct inode *inode, struct page *page) nid_t new_xnid; struct dnode_of_data dn; struct node_info ni; - struct page *xpage; + struct folio *xfolio; int err; if (!prev_xnid) @@ -2791,10 +2769,10 @@ recover_xnid: return -ENOSPC; set_new_dnode(&dn, inode, NULL, NULL, new_xnid); - xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET); - if (IS_ERR(xpage)) { + xfolio = f2fs_new_node_folio(&dn, XATTR_NODE_OFFSET); + if (IS_ERR(xfolio)) { f2fs_alloc_nid_failed(sbi, new_xnid); - return PTR_ERR(xpage); + return PTR_ERR(xfolio); } f2fs_alloc_nid_done(sbi, new_xnid); @@ -2802,11 +2780,11 @@ recover_xnid: /* 3: update and set xattr node page dirty */ if (page) { - memcpy(F2FS_NODE(xpage), F2FS_NODE(page), + memcpy(F2FS_NODE(&xfolio->page), F2FS_NODE(page), VALID_XATTR_BLOCK_SIZE); - set_page_dirty(xpage); + folio_mark_dirty(xfolio); } - f2fs_put_page(xpage, 1); + f2fs_folio_put(xfolio, true); return 0; } @@ -2816,7 +2794,7 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) struct f2fs_inode *src, *dst; nid_t ino = ino_of_node(page); struct node_info old_ni, new_ni; - struct page *ipage; + struct folio *ifolio; int err; err = f2fs_get_node_info(sbi, ino, &old_ni, false); @@ -2826,8 +2804,8 @@ int f2fs_recover_inode_page(struct f2fs_sb_info *sbi, struct page *page) if (unlikely(old_ni.blk_addr != NULL_ADDR)) return -EINVAL; retry: - ipage = f2fs_grab_cache_page(NODE_MAPPING(sbi), ino, false); - if (!ipage) { + ifolio = f2fs_grab_cache_folio(NODE_MAPPING(sbi), ino, false); + if (IS_ERR(ifolio)) { memalloc_retry_wait(GFP_NOFS); goto retry; } @@ -2835,13 +2813,13 @@ retry: /* Should not use this inode from free nid list */ remove_free_nid(sbi, 
ino); - if (!PageUptodate(ipage)) - SetPageUptodate(ipage); - fill_node_footer(ipage, ino, ino, 0, true); - set_cold_node(ipage, false); + if (!folio_test_uptodate(ifolio)) + folio_mark_uptodate(ifolio); + fill_node_footer(&ifolio->page, ino, ino, 0, true); + set_cold_node(&ifolio->page, false); src = F2FS_INODE(page); - dst = F2FS_INODE(ipage); + dst = F2FS_INODE(&ifolio->page); memcpy(dst, src, offsetof(struct f2fs_inode, i_ext)); dst->i_size = 0; @@ -2877,8 +2855,8 @@ retry: WARN_ON(1); set_node_addr(sbi, &new_ni, NEW_ADDR, false); inc_valid_inode_count(sbi); - set_page_dirty(ipage); - f2fs_put_page(ipage, 1); + folio_mark_dirty(ifolio); + f2fs_folio_put(ifolio, true); return 0; } @@ -2902,17 +2880,17 @@ int f2fs_restore_node_summary(struct f2fs_sb_info *sbi, f2fs_ra_meta_pages(sbi, addr, nrpages, META_POR, true); for (idx = addr; idx < addr + nrpages; idx++) { - struct page *page = f2fs_get_tmp_page(sbi, idx); + struct folio *folio = f2fs_get_tmp_folio(sbi, idx); - if (IS_ERR(page)) - return PTR_ERR(page); + if (IS_ERR(folio)) + return PTR_ERR(folio); - rn = F2FS_NODE(page); + rn = F2FS_NODE(&folio->page); sum_entry->nid = rn->footer.nid; sum_entry->version = 0; sum_entry->ofs_in_node = 0; sum_entry++; - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); } invalidate_mapping_pages(META_MAPPING(sbi), addr, @@ -3173,15 +3151,15 @@ static int __get_nat_bitmaps(struct f2fs_sb_info *sbi) nat_bits_addr = __start_cp_addr(sbi) + BLKS_PER_SEG(sbi) - nm_i->nat_bits_blocks; for (i = 0; i < nm_i->nat_bits_blocks; i++) { - struct page *page; + struct folio *folio; - page = f2fs_get_meta_page(sbi, nat_bits_addr++); - if (IS_ERR(page)) - return PTR_ERR(page); + folio = f2fs_get_meta_folio(sbi, nat_bits_addr++); + if (IS_ERR(folio)) + return PTR_ERR(folio); memcpy(nm_i->nat_bits + F2FS_BLK_TO_BYTES(i), - page_address(page), F2FS_BLKSIZE); - f2fs_put_page(page, 1); + folio_address(folio), F2FS_BLKSIZE); + f2fs_folio_put(folio, true); } cp_ver |= (cur_cp_crc(ckpt) << 32); diff --git a/fs/f2fs/node.h b/fs/f2fs/node.h index 103a437e6425..1446c433b3ec 100644 --- a/fs/f2fs/node.h +++ b/fs/f2fs/node.h @@ -268,9 +268,9 @@ static inline __u64 cpver_of_node(struct page *node_page) return le64_to_cpu(rn->footer.cp_ver); } -static inline block_t next_blkaddr_of_node(struct page *node_page) +static inline block_t next_blkaddr_of_node(struct folio *node_folio) { - struct f2fs_node *rn = F2FS_NODE(node_page); + struct f2fs_node *rn = F2FS_NODE(&node_folio->page); return le32_to_cpu(rn->footer.next_blkaddr); } @@ -367,17 +367,17 @@ static inline bool IS_DNODE(const struct page *node_page) return true; } -static inline int set_nid(struct page *p, int off, nid_t nid, bool i) +static inline int set_nid(struct folio *folio, int off, nid_t nid, bool i) { - struct f2fs_node *rn = F2FS_NODE(p); + struct f2fs_node *rn = F2FS_NODE(&folio->page); - f2fs_wait_on_page_writeback(p, NODE, true, true); + f2fs_folio_wait_writeback(folio, NODE, true, true); if (i) rn->i.i_nid[off - NODE_DIR1_BLOCK] = cpu_to_le32(nid); else rn->in.nid[off] = cpu_to_le32(nid); - return set_page_dirty(p); + return folio_mark_dirty(folio); } static inline nid_t get_nid(struct page *p, int off, bool i) diff --git a/fs/f2fs/recovery.c b/fs/f2fs/recovery.c index 69a2027e3ebc..51ebed4e1521 100644 --- a/fs/f2fs/recovery.c +++ b/fs/f2fs/recovery.c @@ -165,7 +165,7 @@ static int recover_dentry(struct inode *inode, struct page *ipage, struct f2fs_dir_entry *de; struct f2fs_filename fname; struct qstr usr_fname; - struct page *page; + struct folio *folio; 
struct inode *dir, *einode; struct fsync_inode_entry *entry; int err = 0; @@ -187,7 +187,7 @@ static int recover_dentry(struct inode *inode, struct page *ipage, if (err) goto out; retry: - de = __f2fs_find_entry(dir, &fname, &page); + de = __f2fs_find_entry(dir, &fname, &folio); if (de && inode->i_ino == le32_to_cpu(de->ino)) goto out_put; @@ -212,11 +212,11 @@ retry: iput(einode); goto out_put; } - f2fs_delete_entry(de, page, dir, einode); + f2fs_delete_entry(de, folio, dir, einode); iput(einode); goto retry; - } else if (IS_ERR(page)) { - err = PTR_ERR(page); + } else if (IS_ERR(folio)) { + err = PTR_ERR(folio); } else { err = f2fs_add_dentry(dir, &fname, inode, inode->i_ino, inode->i_mode); @@ -226,7 +226,7 @@ retry: goto out; out_put: - f2fs_put_page(page, 0); + f2fs_folio_put(folio, false); out: if (file_enc_name(inode)) name = "<encrypted>"; @@ -358,33 +358,34 @@ static int sanity_check_node_chain(struct f2fs_sb_info *sbi, block_t blkaddr, block_t *blkaddr_fast, bool *is_detecting) { unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS; - struct page *page = NULL; int i; if (!*is_detecting) return 0; for (i = 0; i < 2; i++) { + struct folio *folio; + if (!f2fs_is_valid_blkaddr(sbi, *blkaddr_fast, META_POR)) { *is_detecting = false; return 0; } - page = f2fs_get_tmp_page(sbi, *blkaddr_fast); - if (IS_ERR(page)) - return PTR_ERR(page); + folio = f2fs_get_tmp_folio(sbi, *blkaddr_fast); + if (IS_ERR(folio)) + return PTR_ERR(folio); - if (!is_recoverable_dnode(page)) { - f2fs_put_page(page, 1); + if (!is_recoverable_dnode(&folio->page)) { + f2fs_folio_put(folio, true); *is_detecting = false; return 0; } ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, *blkaddr_fast, - next_blkaddr_of_node(page)); + next_blkaddr_of_node(folio)); - *blkaddr_fast = next_blkaddr_of_node(page); - f2fs_put_page(page, 1); + *blkaddr_fast = next_blkaddr_of_node(folio); + f2fs_folio_put(folio, true); f2fs_ra_meta_pages_cond(sbi, *blkaddr_fast, ra_blocks); } @@ -401,7 +402,6 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, bool check_only) { struct curseg_info *curseg; - struct page *page = NULL; block_t blkaddr, blkaddr_fast; bool is_detecting = true; int err = 0; @@ -413,33 +413,35 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, while (1) { struct fsync_inode_entry *entry; + struct folio *folio; if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR)) return 0; - page = f2fs_get_tmp_page(sbi, blkaddr); - if (IS_ERR(page)) { - err = PTR_ERR(page); + folio = f2fs_get_tmp_folio(sbi, blkaddr); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); break; } - if (!is_recoverable_dnode(page)) { - f2fs_put_page(page, 1); + if (!is_recoverable_dnode(&folio->page)) { + f2fs_folio_put(folio, true); break; } - if (!is_fsync_dnode(page)) + if (!is_fsync_dnode(&folio->page)) goto next; - entry = get_fsync_inode(head, ino_of_node(page)); + entry = get_fsync_inode(head, ino_of_node(&folio->page)); if (!entry) { bool quota_inode = false; if (!check_only && - IS_INODE(page) && is_dent_dnode(page)) { - err = f2fs_recover_inode_page(sbi, page); + IS_INODE(&folio->page) && + is_dent_dnode(&folio->page)) { + err = f2fs_recover_inode_page(sbi, &folio->page); if (err) { - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); break; } quota_inode = true; @@ -449,24 +451,24 @@ static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head, * CP | dnode(F) | inode(DF) * For this case, we should not give up now. 
*/ - entry = add_fsync_inode(sbi, head, ino_of_node(page), + entry = add_fsync_inode(sbi, head, ino_of_node(&folio->page), quota_inode); if (IS_ERR(entry)) { err = PTR_ERR(entry); if (err == -ENOENT) goto next; - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); break; } } entry->blkaddr = blkaddr; - if (IS_INODE(page) && is_dent_dnode(page)) + if (IS_INODE(&folio->page) && is_dent_dnode(&folio->page)) entry->last_dentry = blkaddr; next: /* check next segment */ - blkaddr = next_blkaddr_of_node(page); - f2fs_put_page(page, 1); + blkaddr = next_blkaddr_of_node(folio); + f2fs_folio_put(folio, true); err = sanity_check_node_chain(sbi, blkaddr, &blkaddr_fast, &is_detecting); @@ -492,7 +494,7 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr); struct f2fs_summary_block *sum_node; struct f2fs_summary sum; - struct page *sum_page, *node_page; + struct folio *sum_folio, *node_folio; struct dnode_of_data tdn = *dn; nid_t ino, nid; struct inode *inode; @@ -514,18 +516,18 @@ static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi, } } - sum_page = f2fs_get_sum_page(sbi, segno); - if (IS_ERR(sum_page)) - return PTR_ERR(sum_page); - sum_node = (struct f2fs_summary_block *)page_address(sum_page); + sum_folio = f2fs_get_sum_folio(sbi, segno); + if (IS_ERR(sum_folio)) + return PTR_ERR(sum_folio); + sum_node = folio_address(sum_folio); sum = sum_node->entries[blkoff]; - f2fs_put_page(sum_page, 1); + f2fs_folio_put(sum_folio, true); got_it: /* Use the locked dnode page and inode */ nid = le32_to_cpu(sum.nid); ofs_in_node = le16_to_cpu(sum.ofs_in_node); - max_addrs = ADDRS_PER_PAGE(dn->node_page, dn->inode); + max_addrs = ADDRS_PER_PAGE(&dn->node_folio->page, dn->inode); if (ofs_in_node >= max_addrs) { f2fs_err(sbi, "Inconsistent ofs_in_node:%u in summary, ino:%lu, nid:%u, max:%u", ofs_in_node, dn->inode->i_ino, nid, max_addrs); @@ -535,9 +537,9 @@ got_it: if (dn->inode->i_ino == nid) { tdn.nid = nid; - if (!dn->inode_page_locked) - lock_page(dn->inode_page); - tdn.node_page = dn->inode_page; + if (!dn->inode_folio_locked) + folio_lock(dn->inode_folio); + tdn.node_folio = dn->inode_folio; tdn.ofs_in_node = ofs_in_node; goto truncate_out; } else if (dn->nid == nid) { @@ -546,13 +548,13 @@ got_it: } /* Get the node page */ - node_page = f2fs_get_node_page(sbi, nid); - if (IS_ERR(node_page)) - return PTR_ERR(node_page); + node_folio = f2fs_get_node_folio(sbi, nid); + if (IS_ERR(node_folio)) + return PTR_ERR(node_folio); - offset = ofs_of_node(node_page); - ino = ino_of_node(node_page); - f2fs_put_page(node_page, 1); + offset = ofs_of_node(&node_folio->page); + ino = ino_of_node(&node_folio->page); + f2fs_folio_put(node_folio, true); if (ino != dn->inode->i_ino) { int ret; @@ -578,8 +580,8 @@ got_it: * if inode page is locked, unlock temporarily, but its reference * count keeps alive. 
*/ - if (ino == dn->inode->i_ino && dn->inode_page_locked) - unlock_page(dn->inode_page); + if (ino == dn->inode->i_ino && dn->inode_folio_locked) + folio_unlock(dn->inode_folio); set_new_dnode(&tdn, inode, NULL, NULL, 0); if (f2fs_get_dnode_of_data(&tdn, bidx, LOOKUP_NODE)) @@ -592,15 +594,15 @@ got_it: out: if (ino != dn->inode->i_ino) iput(inode); - else if (dn->inode_page_locked) - lock_page(dn->inode_page); + else if (dn->inode_folio_locked) + folio_lock(dn->inode_folio); return 0; truncate_out: if (f2fs_data_blkaddr(&tdn) == blkaddr) f2fs_truncate_data_blocks_range(&tdn, 1); - if (dn->inode->i_ino == nid && !dn->inode_page_locked) - unlock_page(dn->inode_page); + if (dn->inode->i_ino == nid && !dn->inode_folio_locked) + folio_unlock(dn->inode_folio); return 0; } @@ -618,7 +620,7 @@ static int f2fs_reserve_new_block_retry(struct dnode_of_data *dn) } static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, - struct page *page) + struct folio *folio) { struct dnode_of_data dn; struct node_info ni; @@ -626,19 +628,19 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, int err = 0, recovered = 0; /* step 1: recover xattr */ - if (IS_INODE(page)) { - err = f2fs_recover_inline_xattr(inode, page); + if (IS_INODE(&folio->page)) { + err = f2fs_recover_inline_xattr(inode, folio); if (err) goto out; - } else if (f2fs_has_xattr_block(ofs_of_node(page))) { - err = f2fs_recover_xattr_data(inode, page); + } else if (f2fs_has_xattr_block(ofs_of_node(&folio->page))) { + err = f2fs_recover_xattr_data(inode, &folio->page); if (!err) recovered++; goto out; } /* step 2: recover inline data */ - err = f2fs_recover_inline_data(inode, page); + err = f2fs_recover_inline_data(inode, folio); if (err) { if (err == 1) err = 0; @@ -646,8 +648,8 @@ static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode, } /* step 3: recover data indices */ - start = f2fs_start_bidx_of_node(ofs_of_node(page), inode); - end = start + ADDRS_PER_PAGE(page, inode); + start = f2fs_start_bidx_of_node(ofs_of_node(&folio->page), inode); + end = start + ADDRS_PER_PAGE(&folio->page, inode); set_new_dnode(&dn, inode, NULL, NULL, 0); retry_dn: @@ -660,18 +662,18 @@ retry_dn: goto out; } - f2fs_wait_on_page_writeback(dn.node_page, NODE, true, true); + f2fs_folio_wait_writeback(dn.node_folio, NODE, true, true); err = f2fs_get_node_info(sbi, dn.nid, &ni, false); if (err) goto err; - f2fs_bug_on(sbi, ni.ino != ino_of_node(page)); + f2fs_bug_on(sbi, ni.ino != ino_of_node(&folio->page)); - if (ofs_of_node(dn.node_page) != ofs_of_node(page)) { + if (ofs_of_node(&dn.node_folio->page) != ofs_of_node(&folio->page)) { f2fs_warn(sbi, "Inconsistent ofs_of_node, ino:%lu, ofs:%u, %u", - inode->i_ino, ofs_of_node(dn.node_page), - ofs_of_node(page)); + inode->i_ino, ofs_of_node(&dn.node_folio->page), + ofs_of_node(&folio->page)); err = -EFSCORRUPTED; f2fs_handle_error(sbi, ERROR_INCONSISTENT_FOOTER); goto err; @@ -681,7 +683,7 @@ retry_dn: block_t src, dest; src = f2fs_data_blkaddr(&dn); - dest = data_blkaddr(dn.inode, page, dn.ofs_in_node); + dest = data_blkaddr(dn.inode, folio, dn.ofs_in_node); if (__is_valid_data_blkaddr(src) && !f2fs_is_valid_blkaddr(sbi, src, META_POR)) { @@ -756,10 +758,10 @@ retry_prev: } } - copy_node_footer(dn.node_page, page); - fill_node_footer(dn.node_page, dn.nid, ni.ino, - ofs_of_node(page), false); - set_page_dirty(dn.node_page); + copy_node_footer(&dn.node_folio->page, &folio->page); + fill_node_footer(&dn.node_folio->page, dn.nid, ni.ino, + ofs_of_node(&folio->page), 
false); + folio_mark_dirty(dn.node_folio); err: f2fs_put_dnode(&dn); out: @@ -773,7 +775,6 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, struct list_head *tmp_inode_list, struct list_head *dir_list) { struct curseg_info *curseg; - struct page *page = NULL; int err = 0; block_t blkaddr; unsigned int ra_blocks = RECOVERY_MAX_RA_BLOCKS; @@ -784,22 +785,23 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, while (1) { struct fsync_inode_entry *entry; + struct folio *folio; if (!f2fs_is_valid_blkaddr(sbi, blkaddr, META_POR)) break; - page = f2fs_get_tmp_page(sbi, blkaddr); - if (IS_ERR(page)) { - err = PTR_ERR(page); + folio = f2fs_get_tmp_folio(sbi, blkaddr); + if (IS_ERR(folio)) { + err = PTR_ERR(folio); break; } - if (!is_recoverable_dnode(page)) { - f2fs_put_page(page, 1); + if (!is_recoverable_dnode(&folio->page)) { + f2fs_folio_put(folio, true); break; } - entry = get_fsync_inode(inode_list, ino_of_node(page)); + entry = get_fsync_inode(inode_list, ino_of_node(&folio->page)); if (!entry) goto next; /* @@ -807,23 +809,23 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, * In this case, we can lose the latest inode(x). * So, call recover_inode for the inode update. */ - if (IS_INODE(page)) { - err = recover_inode(entry->inode, page); + if (IS_INODE(&folio->page)) { + err = recover_inode(entry->inode, &folio->page); if (err) { - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); break; } } if (entry->last_dentry == blkaddr) { - err = recover_dentry(entry->inode, page, dir_list); + err = recover_dentry(entry->inode, &folio->page, dir_list); if (err) { - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); break; } } - err = do_recover_data(sbi, entry->inode, page); + err = do_recover_data(sbi, entry->inode, folio); if (err) { - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); break; } @@ -831,11 +833,11 @@ static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list, list_move_tail(&entry->list, tmp_inode_list); next: ra_blocks = adjust_por_ra_blocks(sbi, ra_blocks, blkaddr, - next_blkaddr_of_node(page)); + next_blkaddr_of_node(folio)); /* check next segment */ - blkaddr = next_blkaddr_of_node(page); - f2fs_put_page(page, 1); + blkaddr = next_blkaddr_of_node(folio); + f2fs_folio_put(folio, true); f2fs_ra_meta_pages_cond(sbi, blkaddr, ra_blocks); } diff --git a/fs/f2fs/segment.c b/fs/f2fs/segment.c index 396ef71f41e3..ae1223ef648f 100644 --- a/fs/f2fs/segment.c +++ b/fs/f2fs/segment.c @@ -334,7 +334,7 @@ static int __f2fs_commit_atomic_write(struct inode *inode) goto next; } - blen = min((pgoff_t)ADDRS_PER_PAGE(dn.node_page, cow_inode), + blen = min((pgoff_t)ADDRS_PER_PAGE(&dn.node_folio->page, cow_inode), len); index = off; for (i = 0; i < blen; i++, dn.ofs_in_node++, index++) { @@ -371,12 +371,21 @@ next: } out: + if (time_to_inject(sbi, FAULT_TIMEOUT)) + f2fs_io_schedule_timeout_killable(DEFAULT_FAULT_TIMEOUT); + if (ret) { sbi->revoked_atomic_block += fi->atomic_write_cnt; } else { sbi->committed_atomic_block += fi->atomic_write_cnt; set_inode_flag(inode, FI_ATOMIC_COMMITTED); + + /* + * inode may has no FI_ATOMIC_DIRTIED flag due to no write + * before commit. 
+ */ if (is_inode_flag_set(inode, FI_ATOMIC_DIRTIED)) { + /* clear atomic dirty status and set vfs dirty status */ clear_inode_flag(inode, FI_ATOMIC_DIRTIED); f2fs_mark_inode_dirty_sync(inode, true); } @@ -424,7 +433,7 @@ void f2fs_balance_fs(struct f2fs_sb_info *sbi, bool need) if (need && excess_cached_nats(sbi)) f2fs_balance_fs_bg(sbi, false); - if (!f2fs_is_checkpoint_ready(sbi)) + if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED))) return; /* @@ -2438,7 +2447,7 @@ static void update_segment_mtime(struct f2fs_sb_info *sbi, block_t blkaddr, * that the consecutive input blocks belong to the same segment. */ static int update_sit_entry_for_release(struct f2fs_sb_info *sbi, struct seg_entry *se, - block_t blkaddr, unsigned int offset, int del) + unsigned int segno, block_t blkaddr, unsigned int offset, int del) { bool exist; #ifdef CONFIG_F2FS_CHECK_FS @@ -2483,15 +2492,21 @@ static int update_sit_entry_for_release(struct f2fs_sb_info *sbi, struct seg_ent f2fs_test_and_clear_bit(offset + i, se->discard_map)) sbi->discard_blks++; - if (!f2fs_test_bit(offset + i, se->ckpt_valid_map)) + if (!f2fs_test_bit(offset + i, se->ckpt_valid_map)) { se->ckpt_valid_blocks -= 1; + if (__is_large_section(sbi)) + get_sec_entry(sbi, segno)->ckpt_valid_blocks -= 1; + } } + if (__is_large_section(sbi)) + sanity_check_valid_blocks(sbi, segno); + return del; } static int update_sit_entry_for_alloc(struct f2fs_sb_info *sbi, struct seg_entry *se, - block_t blkaddr, unsigned int offset, int del) + unsigned int segno, block_t blkaddr, unsigned int offset, int del) { bool exist; #ifdef CONFIG_F2FS_CHECK_FS @@ -2524,12 +2539,21 @@ static int update_sit_entry_for_alloc(struct f2fs_sb_info *sbi, struct seg_entry * or newly invalidated. */ if (!is_sbi_flag_set(sbi, SBI_CP_DISABLED)) { - if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) + if (!f2fs_test_and_set_bit(offset, se->ckpt_valid_map)) { se->ckpt_valid_blocks++; + if (__is_large_section(sbi)) + get_sec_entry(sbi, segno)->ckpt_valid_blocks++; + } } - if (!f2fs_test_bit(offset, se->ckpt_valid_map)) + if (!f2fs_test_bit(offset, se->ckpt_valid_map)) { se->ckpt_valid_blocks += del; + if (__is_large_section(sbi)) + get_sec_entry(sbi, segno)->ckpt_valid_blocks += del; + } + + if (__is_large_section(sbi)) + sanity_check_valid_blocks(sbi, segno); return del; } @@ -2560,9 +2584,9 @@ static void update_sit_entry(struct f2fs_sb_info *sbi, block_t blkaddr, int del) /* Update valid block bitmap */ if (del > 0) { - del = update_sit_entry_for_alloc(sbi, se, blkaddr, offset, del); + del = update_sit_entry_for_alloc(sbi, se, segno, blkaddr, offset, del); } else { - del = update_sit_entry_for_release(sbi, se, blkaddr, offset, del); + del = update_sit_entry_for_release(sbi, se, segno, blkaddr, offset, del); } __mark_sit_entry_dirty(sbi, segno); @@ -2675,23 +2699,23 @@ int f2fs_npages_for_summary_flush(struct f2fs_sb_info *sbi, bool for_ra) } /* - * Caller should put this summary page + * Caller should put this summary folio */ -struct page *f2fs_get_sum_page(struct f2fs_sb_info *sbi, unsigned int segno) +struct folio *f2fs_get_sum_folio(struct f2fs_sb_info *sbi, unsigned int segno) { if (unlikely(f2fs_cp_error(sbi))) return ERR_PTR(-EIO); - return f2fs_get_meta_page_retry(sbi, GET_SUM_BLOCK(sbi, segno)); + return f2fs_get_meta_folio_retry(sbi, GET_SUM_BLOCK(sbi, segno)); } void f2fs_update_meta_page(struct f2fs_sb_info *sbi, void *src, block_t blk_addr) { - struct page *page = f2fs_grab_meta_page(sbi, blk_addr); + struct folio *folio = f2fs_grab_meta_folio(sbi, blk_addr); - 
memcpy(page_address(page), src, PAGE_SIZE); - set_page_dirty(page); - f2fs_put_page(page, 1); + memcpy(folio_address(folio), src, PAGE_SIZE); + folio_mark_dirty(folio); + f2fs_folio_put(folio, true); } static void write_sum_page(struct f2fs_sb_info *sbi, @@ -2704,11 +2728,11 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi, int type, block_t blk_addr) { struct curseg_info *curseg = CURSEG_I(sbi, type); - struct page *page = f2fs_grab_meta_page(sbi, blk_addr); + struct folio *folio = f2fs_grab_meta_folio(sbi, blk_addr); struct f2fs_summary_block *src = curseg->sum_blk; struct f2fs_summary_block *dst; - dst = (struct f2fs_summary_block *)page_address(page); + dst = folio_address(folio); memset(dst, 0, PAGE_SIZE); mutex_lock(&curseg->curseg_mutex); @@ -2722,8 +2746,8 @@ static void write_current_sum_page(struct f2fs_sb_info *sbi, mutex_unlock(&curseg->curseg_mutex); - set_page_dirty(page); - f2fs_put_page(page, 1); + folio_mark_dirty(folio); + f2fs_folio_put(folio, true); } static int is_next_segment_free(struct f2fs_sb_info *sbi, @@ -2777,7 +2801,7 @@ static int get_new_segment(struct f2fs_sb_info *sbi, if (sbi->blkzone_alloc_policy == BLKZONE_ALLOC_PRIOR_CONV || pinning) segno = 0; else - segno = max(sbi->first_zoned_segno, *newseg); + segno = max(sbi->first_seq_zone_segno, *newseg); hint = GET_SEC_FROM_SEG(sbi, segno); } #endif @@ -2789,7 +2813,7 @@ find_other_zone: if (secno >= MAIN_SECS(sbi) && f2fs_sb_has_blkzoned(sbi)) { /* Write only to sequential zones */ if (sbi->blkzone_alloc_policy == BLKZONE_ALLOC_ONLY_SEQ) { - hint = GET_SEC_FROM_SEG(sbi, sbi->first_zoned_segno); + hint = GET_SEC_FROM_SEG(sbi, sbi->first_seq_zone_segno); secno = find_next_zero_bit(free_i->free_secmap, MAIN_SECS(sbi), hint); } else secno = find_first_zero_bit(free_i->free_secmap, @@ -2836,11 +2860,15 @@ find_other_zone: } got_it: /* set it as dirty segment in free segmap */ - f2fs_bug_on(sbi, test_bit(segno, free_i->free_segmap)); + if (test_bit(segno, free_i->free_segmap)) { + ret = -EFSCORRUPTED; + f2fs_stop_checkpoint(sbi, false, STOP_CP_REASON_CORRUPTED_FREE_BITMAP); + goto out_unlock; + } - /* no free section in conventional zone */ + /* no free section in conventional device or conventional zone */ if (new_sec && pinning && - !f2fs_valid_pinned_area(sbi, START_BLOCK(sbi, segno))) { + f2fs_is_sequential_zone_area(sbi, START_BLOCK(sbi, segno))) { ret = -EAGAIN; goto out_unlock; } @@ -2997,7 +3025,7 @@ static int change_curseg(struct f2fs_sb_info *sbi, int type) struct curseg_info *curseg = CURSEG_I(sbi, type); unsigned int new_segno = curseg->next_segno; struct f2fs_summary_block *sum_node; - struct page *sum_page; + struct folio *sum_folio; if (curseg->inited) write_sum_page(sbi, curseg->sum_blk, GET_SUM_BLOCK(sbi, curseg->segno)); @@ -3013,15 +3041,15 @@ static int change_curseg(struct f2fs_sb_info *sbi, int type) curseg->alloc_type = SSR; curseg->next_blkoff = __next_free_blkoff(sbi, curseg->segno, 0); - sum_page = f2fs_get_sum_page(sbi, new_segno); - if (IS_ERR(sum_page)) { + sum_folio = f2fs_get_sum_folio(sbi, new_segno); + if (IS_ERR(sum_folio)) { /* GC won't be able to use stale summary pages by cp_error */ memset(curseg->sum_blk, 0, SUM_ENTRY_SIZE); - return PTR_ERR(sum_page); + return PTR_ERR(sum_folio); } - sum_node = (struct f2fs_summary_block *)page_address(sum_page); + sum_node = folio_address(sum_folio); memcpy(curseg->sum_blk, sum_node, SUM_ENTRY_SIZE); - f2fs_put_page(sum_page, 1); + f2fs_folio_put(sum_folio, true); return 0; } @@ -3311,7 +3339,7 @@ retry: if 
(f2fs_sb_has_blkzoned(sbi) && err == -EAGAIN && gc_required) { f2fs_down_write(&sbi->gc_lock); - err = f2fs_gc_range(sbi, 0, GET_SEGNO(sbi, FDEV(0).end_blk), + err = f2fs_gc_range(sbi, 0, sbi->first_seq_zone_segno - 1, true, ZONED_PIN_SEC_REQUIRED_COUNT); f2fs_up_write(&sbi->gc_lock); @@ -3584,7 +3612,7 @@ static int __get_segment_type_2(struct f2fs_io_info *fio) static int __get_segment_type_4(struct f2fs_io_info *fio) { if (fio->type == DATA) { - struct inode *inode = fio->page->mapping->host; + struct inode *inode = fio_inode(fio); if (S_ISDIR(inode->i_mode)) return CURSEG_HOT_DATA; @@ -3618,7 +3646,7 @@ static int __get_age_segment_type(struct inode *inode, pgoff_t pgofs) static int __get_segment_type_6(struct f2fs_io_info *fio) { if (fio->type == DATA) { - struct inode *inode = fio->page->mapping->host; + struct inode *inode = fio_inode(fio); int type; if (is_inode_flag_set(inode, FI_ALIGNED_WRITE)) @@ -3918,7 +3946,7 @@ static void do_write_page(struct f2fs_summary *sum, struct f2fs_io_info *fio) fscrypt_finalize_bounce_page(&fio->encrypted_page); folio_end_writeback(folio); if (f2fs_in_warm_node_list(fio->sbi, folio)) - f2fs_del_fsync_node_entry(fio->sbi, fio->page); + f2fs_del_fsync_node_entry(fio->sbi, folio); goto out; } if (GET_SEGNO(fio->sbi, fio->old_blkaddr) != NULL_SEGNO) @@ -4023,7 +4051,7 @@ int f2fs_inplace_write_data(struct f2fs_io_info *fio) if (!err) { f2fs_update_device_state(fio->sbi, fio->ino, fio->new_blkaddr, 1); - f2fs_update_iostat(fio->sbi, fio->page->mapping->host, + f2fs_update_iostat(fio->sbi, fio_inode(fio), fio->io_type, F2FS_BLKSIZE); } @@ -4165,7 +4193,7 @@ void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type, /* submit cached LFS IO */ f2fs_submit_merged_write_cond(sbi, NULL, &folio->page, 0, type); /* submit cached IPU IO */ - f2fs_submit_merged_ipu_write(sbi, NULL, &folio->page); + f2fs_submit_merged_ipu_write(sbi, NULL, folio); if (ordered) { folio_wait_writeback(folio); f2fs_bug_on(sbi, locked && folio_test_writeback(folio)); @@ -4178,7 +4206,7 @@ void f2fs_folio_wait_writeback(struct folio *folio, enum page_type type, void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); - struct page *cpage; + struct folio *cfolio; if (!f2fs_meta_inode_gc_required(inode)) return; @@ -4186,10 +4214,10 @@ void f2fs_wait_on_block_writeback(struct inode *inode, block_t blkaddr) if (!__is_valid_data_blkaddr(blkaddr)) return; - cpage = find_lock_page(META_MAPPING(sbi), blkaddr); - if (cpage) { - f2fs_wait_on_page_writeback(cpage, DATA, true, true); - f2fs_put_page(cpage, 1); + cfolio = filemap_lock_folio(META_MAPPING(sbi), blkaddr); + if (!IS_ERR(cfolio)) { + f2fs_folio_wait_writeback(cfolio, DATA, true, true); + f2fs_folio_put(cfolio, true); } } @@ -4213,16 +4241,16 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct curseg_info *seg_i; unsigned char *kaddr; - struct page *page; + struct folio *folio; block_t start; int i, j, offset; start = start_sum_block(sbi); - page = f2fs_get_meta_page(sbi, start++); - if (IS_ERR(page)) - return PTR_ERR(page); - kaddr = (unsigned char *)page_address(page); + folio = f2fs_get_meta_folio(sbi, start++); + if (IS_ERR(folio)) + return PTR_ERR(folio); + kaddr = folio_address(folio); /* Step 1: restore nat cache */ seg_i = CURSEG_I(sbi, CURSEG_HOT_DATA); @@ -4259,17 +4287,16 @@ static int read_compacted_summaries(struct f2fs_sb_info *sbi) SUM_FOOTER_SIZE) continue; - f2fs_put_page(page, 
1); - page = NULL; + f2fs_folio_put(folio, true); - page = f2fs_get_meta_page(sbi, start++); - if (IS_ERR(page)) - return PTR_ERR(page); - kaddr = (unsigned char *)page_address(page); + folio = f2fs_get_meta_folio(sbi, start++); + if (IS_ERR(folio)) + return PTR_ERR(folio); + kaddr = folio_address(folio); offset = 0; } } - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return 0; } @@ -4278,7 +4305,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) struct f2fs_checkpoint *ckpt = F2FS_CKPT(sbi); struct f2fs_summary_block *sum; struct curseg_info *curseg; - struct page *new; + struct folio *new; unsigned short blk_off; unsigned int segno = 0; block_t blk_addr = 0; @@ -4305,10 +4332,10 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) blk_addr = GET_SUM_BLOCK(sbi, segno); } - new = f2fs_get_meta_page(sbi, blk_addr); + new = f2fs_get_meta_folio(sbi, blk_addr); if (IS_ERR(new)) return PTR_ERR(new); - sum = (struct f2fs_summary_block *)page_address(new); + sum = folio_address(new); if (IS_NODESEG(type)) { if (__exist_node_summaries(sbi)) { @@ -4343,7 +4370,7 @@ static int read_normal_summaries(struct f2fs_sb_info *sbi, int type) curseg->next_blkoff = blk_off; mutex_unlock(&curseg->curseg_mutex); out: - f2fs_put_page(new, 1); + f2fs_folio_put(new, true); return err; } @@ -4392,15 +4419,15 @@ static int restore_curseg_summaries(struct f2fs_sb_info *sbi) static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) { - struct page *page; + struct folio *folio; unsigned char *kaddr; struct f2fs_summary *summary; struct curseg_info *seg_i; int written_size = 0; int i, j; - page = f2fs_grab_meta_page(sbi, blkaddr++); - kaddr = (unsigned char *)page_address(page); + folio = f2fs_grab_meta_folio(sbi, blkaddr++); + kaddr = folio_address(folio); memset(kaddr, 0, PAGE_SIZE); /* Step 1: write nat cache */ @@ -4417,9 +4444,9 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) { seg_i = CURSEG_I(sbi, i); for (j = 0; j < f2fs_curseg_valid_blocks(sbi, i); j++) { - if (!page) { - page = f2fs_grab_meta_page(sbi, blkaddr++); - kaddr = (unsigned char *)page_address(page); + if (!folio) { + folio = f2fs_grab_meta_folio(sbi, blkaddr++); + kaddr = folio_address(folio); memset(kaddr, 0, PAGE_SIZE); written_size = 0; } @@ -4431,14 +4458,14 @@ static void write_compacted_summaries(struct f2fs_sb_info *sbi, block_t blkaddr) SUM_FOOTER_SIZE) continue; - set_page_dirty(page); - f2fs_put_page(page, 1); - page = NULL; + folio_mark_dirty(folio); + f2fs_folio_put(folio, true); + folio = NULL; } } - if (page) { - set_page_dirty(page); - f2fs_put_page(page, 1); + if (folio) { + folio_mark_dirty(folio); + f2fs_folio_put(folio, true); } } @@ -4491,29 +4518,29 @@ int f2fs_lookup_journal_in_cursum(struct f2fs_journal *journal, int type, return -1; } -static struct page *get_current_sit_page(struct f2fs_sb_info *sbi, +static struct folio *get_current_sit_folio(struct f2fs_sb_info *sbi, unsigned int segno) { - return f2fs_get_meta_page(sbi, current_sit_addr(sbi, segno)); + return f2fs_get_meta_folio(sbi, current_sit_addr(sbi, segno)); } -static struct page *get_next_sit_page(struct f2fs_sb_info *sbi, +static struct folio *get_next_sit_folio(struct f2fs_sb_info *sbi, unsigned int start) { struct sit_info *sit_i = SIT_I(sbi); - struct page *page; + struct folio *folio; pgoff_t src_off, dst_off; src_off = current_sit_addr(sbi, start); dst_off = next_sit_addr(sbi, src_off); - page = 
f2fs_grab_meta_page(sbi, dst_off); - seg_info_to_sit_page(sbi, page, start); + folio = f2fs_grab_meta_folio(sbi, dst_off); + seg_info_to_sit_folio(sbi, folio, start); - set_page_dirty(page); + folio_mark_dirty(folio); set_to_next_sit(sit_i, start); - return page; + return folio; } static struct sit_entry_set *grab_sit_entry_set(void) @@ -4643,7 +4670,7 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) * #2, flush sit entries to sit page. */ list_for_each_entry_safe(ses, tmp, head, set_list) { - struct page *page = NULL; + struct folio *folio = NULL; struct f2fs_sit_block *raw_sit = NULL; unsigned int start_segno = ses->start_segno; unsigned int end = min(start_segno + SIT_ENTRY_PER_BLOCK, @@ -4657,8 +4684,8 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (to_journal) { down_write(&curseg->journal_rwsem); } else { - page = get_next_sit_page(sbi, start_segno); - raw_sit = page_address(page); + folio = get_next_sit_folio(sbi, start_segno); + raw_sit = folio_address(folio); } /* flush dirty sit entries in region of current sit set */ @@ -4696,6 +4723,12 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) &raw_sit->entries[sit_offset]); } + /* update ckpt_valid_block */ + if (__is_large_section(sbi)) { + set_ckpt_valid_blocks(sbi, segno); + sanity_check_valid_blocks(sbi, segno); + } + __clear_bit(segno, bitmap); sit_i->dirty_sentries--; ses->entry_cnt--; @@ -4704,7 +4737,7 @@ void f2fs_flush_sit_entries(struct f2fs_sb_info *sbi, struct cp_control *cpc) if (to_journal) up_write(&curseg->journal_rwsem); else - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); f2fs_bug_on(sbi, ses->entry_cnt); release_sit_entry_set(ses); @@ -4916,15 +4949,15 @@ static int build_sit_entries(struct f2fs_sb_info *sbi) for (; start < end && start < MAIN_SEGS(sbi); start++) { struct f2fs_sit_block *sit_blk; - struct page *page; + struct folio *folio; se = &sit_i->sentries[start]; - page = get_current_sit_page(sbi, start); - if (IS_ERR(page)) - return PTR_ERR(page); - sit_blk = (struct f2fs_sit_block *)page_address(page); + folio = get_current_sit_folio(sbi, start); + if (IS_ERR(folio)) + return PTR_ERR(folio); + sit_blk = folio_address(folio); sit = sit_blk->entries[SIT_ENTRY_OFFSET(sit_i, start)]; - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); err = check_block_count(sbi, start, &sit); if (err) @@ -5017,6 +5050,16 @@ init_discard_map_done: } up_read(&curseg->journal_rwsem); + /* update ckpt_valid_block */ + if (__is_large_section(sbi)) { + unsigned int segno; + + for (segno = 0; segno < MAIN_SEGS(sbi); segno += SEGS_PER_SEC(sbi)) { + set_ckpt_valid_blocks(sbi, segno); + sanity_check_valid_blocks(sbi, segno); + } + } + if (err) return err; diff --git a/fs/f2fs/segment.h b/fs/f2fs/segment.h index 0465dc00b349..db619fd2f51a 100644 --- a/fs/f2fs/segment.h +++ b/fs/f2fs/segment.h @@ -102,6 +102,8 @@ static inline void sanity_check_seg_type(struct f2fs_sb_info *sbi, #define CAP_SEGS_PER_SEC(sbi) \ (SEGS_PER_SEC(sbi) - \ BLKS_TO_SEGS(sbi, (sbi)->unusable_blocks_per_sec)) +#define GET_START_SEG_FROM_SEC(sbi, segno) \ + (rounddown(segno, SEGS_PER_SEC(sbi))) #define GET_SEC_FROM_SEG(sbi, segno) \ (((segno) == -1) ? 
-1 : (segno) / SEGS_PER_SEC(sbi)) #define GET_SEG_FROM_SEC(sbi, secno) \ @@ -209,6 +211,7 @@ struct seg_entry { struct sec_entry { unsigned int valid_blocks; /* # of valid blocks in a section */ + unsigned int ckpt_valid_blocks; /* # of valid blocks last cp in a section */ }; #define MAX_SKIP_GC_COUNT 16 @@ -345,22 +348,57 @@ static inline unsigned int get_valid_blocks(struct f2fs_sb_info *sbi, static inline unsigned int get_ckpt_valid_blocks(struct f2fs_sb_info *sbi, unsigned int segno, bool use_section) { - if (use_section && __is_large_section(sbi)) { - unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); - unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); - unsigned int blocks = 0; - int i; + if (use_section && __is_large_section(sbi)) + return get_sec_entry(sbi, segno)->ckpt_valid_blocks; + else + return get_seg_entry(sbi, segno)->ckpt_valid_blocks; +} + +static inline void set_ckpt_valid_blocks(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); + unsigned int blocks = 0; + int i; - for (i = 0; i < SEGS_PER_SEC(sbi); i++, start_segno++) { - struct seg_entry *se = get_seg_entry(sbi, start_segno); + for (i = 0; i < SEGS_PER_SEC(sbi); i++, start_segno++) { + struct seg_entry *se = get_seg_entry(sbi, start_segno); - blocks += se->ckpt_valid_blocks; - } - return blocks; + blocks += se->ckpt_valid_blocks; } - return get_seg_entry(sbi, segno)->ckpt_valid_blocks; + get_sec_entry(sbi, segno)->ckpt_valid_blocks = blocks; } +#ifdef CONFIG_F2FS_CHECK_FS +static inline void sanity_check_valid_blocks(struct f2fs_sb_info *sbi, + unsigned int segno) +{ + unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); + unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); + unsigned int blocks = 0; + int i; + + for (i = 0; i < SEGS_PER_SEC(sbi); i++, start_segno++) { + struct seg_entry *se = get_seg_entry(sbi, start_segno); + + blocks += se->ckpt_valid_blocks; + } + + if (blocks != get_sec_entry(sbi, segno)->ckpt_valid_blocks) { + f2fs_err(sbi, + "Inconsistent ckpt valid blocks: " + "seg entry(%d) vs sec entry(%d) at secno %d", + blocks, get_sec_entry(sbi, segno)->ckpt_valid_blocks, secno); + f2fs_bug_on(sbi, 1); + } +} +#else +static inline void sanity_check_valid_blocks(struct f2fs_sb_info *sbi, + unsigned int segno) +{ +} +#endif static inline void seg_info_from_raw_sit(struct seg_entry *se, struct f2fs_sit_entry *rs) { @@ -385,8 +423,8 @@ static inline void __seg_info_to_raw_sit(struct seg_entry *se, rs->mtime = cpu_to_le64(se->mtime); } -static inline void seg_info_to_sit_page(struct f2fs_sb_info *sbi, - struct page *page, unsigned int start) +static inline void seg_info_to_sit_folio(struct f2fs_sb_info *sbi, + struct folio *folio, unsigned int start) { struct f2fs_sit_block *raw_sit; struct seg_entry *se; @@ -395,7 +433,7 @@ static inline void seg_info_to_sit_page(struct f2fs_sb_info *sbi, (unsigned long)MAIN_SEGS(sbi)); int i; - raw_sit = (struct f2fs_sit_block *)page_address(page); + raw_sit = folio_address(folio); memset(raw_sit, 0, PAGE_SIZE); for (i = 0; i < end - start; i++) { rs = &raw_sit->entries[i]; @@ -429,7 +467,6 @@ static inline void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; - unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi); spin_lock(&free_i->segmap_lock); clear_bit(segno, free_i->free_segmap); @@ -437,7 +474,7 @@ static inline 
void __set_free(struct f2fs_sb_info *sbi, unsigned int segno) next = find_next_bit(free_i->free_segmap, start_segno + SEGS_PER_SEC(sbi), start_segno); - if (next >= start_segno + usable_segs) { + if (next >= start_segno + f2fs_usable_segs_in_sec(sbi)) { clear_bit(secno, free_i->free_secmap); free_i->free_sections++; } @@ -463,22 +500,36 @@ static inline void __set_test_and_free(struct f2fs_sb_info *sbi, unsigned int secno = GET_SEC_FROM_SEG(sbi, segno); unsigned int start_segno = GET_SEG_FROM_SEC(sbi, secno); unsigned int next; - unsigned int usable_segs = f2fs_usable_segs_in_sec(sbi); + bool ret; spin_lock(&free_i->segmap_lock); - if (test_and_clear_bit(segno, free_i->free_segmap)) { - free_i->free_segments++; - - if (!inmem && IS_CURSEC(sbi, secno)) - goto skip_free; - next = find_next_bit(free_i->free_segmap, - start_segno + SEGS_PER_SEC(sbi), start_segno); - if (next >= start_segno + usable_segs) { - if (test_and_clear_bit(secno, free_i->free_secmap)) - free_i->free_sections++; - } - } -skip_free: + ret = test_and_clear_bit(segno, free_i->free_segmap); + if (!ret) + goto unlock_out; + + free_i->free_segments++; + + if (!inmem && IS_CURSEC(sbi, secno)) + goto unlock_out; + + /* check large section */ + next = find_next_bit(free_i->free_segmap, + start_segno + SEGS_PER_SEC(sbi), start_segno); + if (next < start_segno + f2fs_usable_segs_in_sec(sbi)) + goto unlock_out; + + ret = test_and_clear_bit(secno, free_i->free_secmap); + if (!ret) + goto unlock_out; + + free_i->free_sections++; + + if (GET_SEC_FROM_SEG(sbi, sbi->next_victim_seg[BG_GC]) == secno) + sbi->next_victim_seg[BG_GC] = NULL_SEGNO; + if (GET_SEC_FROM_SEG(sbi, sbi->next_victim_seg[FG_GC]) == secno) + sbi->next_victim_seg[FG_GC] = NULL_SEGNO; + +unlock_out: spin_unlock(&free_i->segmap_lock); } @@ -569,8 +620,14 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, if (unlikely(segno == NULL_SEGNO)) return false; - left_blocks = CAP_BLKS_PER_SEC(sbi) - - get_ckpt_valid_blocks(sbi, segno, true); + if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) { + left_blocks = CAP_BLKS_PER_SEC(sbi) - + SEGS_TO_BLKS(sbi, (segno - GET_START_SEG_FROM_SEC(sbi, segno))) - + CURSEG_I(sbi, i)->next_blkoff; + } else { + left_blocks = CAP_BLKS_PER_SEC(sbi) - + get_ckpt_valid_blocks(sbi, segno, true); + } blocks = i <= CURSEG_COLD_DATA ? data_blocks : node_blocks; if (blocks > left_blocks) @@ -583,8 +640,15 @@ static inline bool has_curseg_enough_space(struct f2fs_sb_info *sbi, if (unlikely(segno == NULL_SEGNO)) return false; - left_blocks = CAP_BLKS_PER_SEC(sbi) - - get_ckpt_valid_blocks(sbi, segno, true); + if (f2fs_lfs_mode(sbi) && __is_large_section(sbi)) { + left_blocks = CAP_BLKS_PER_SEC(sbi) - + SEGS_TO_BLKS(sbi, (segno - GET_START_SEG_FROM_SEC(sbi, segno))) - + CURSEG_I(sbi, CURSEG_HOT_DATA)->next_blkoff; + } else { + left_blocks = CAP_BLKS_PER_SEC(sbi) - + get_ckpt_valid_blocks(sbi, segno, true); + } + if (dent_blocks > left_blocks) return false; return true; diff --git a/fs/f2fs/shrinker.c b/fs/f2fs/shrinker.c index 9c8d3aee89af..b88babcf6ab4 100644 --- a/fs/f2fs/shrinker.c +++ b/fs/f2fs/shrinker.c @@ -184,10 +184,17 @@ static unsigned int do_reclaim_caches(struct f2fs_sb_info *sbi, if (!inode) continue; - len = fi->donate_end - fi->donate_start + 1; - npages = npages < len ? 0 : npages - len; - invalidate_inode_pages2_range(inode->i_mapping, + inode_lock(inode); + if (!is_inode_flag_set(inode, FI_DONATE_FINISHED)) { + len = fi->donate_end - fi->donate_start + 1; + npages = npages < len ? 
0 : npages - len; + + invalidate_inode_pages2_range(inode->i_mapping, fi->donate_start, fi->donate_end); + set_inode_flag(inode, FI_DONATE_FINISHED); + } + inode_unlock(inode); + iput(inode); cond_resched(); } diff --git a/fs/f2fs/super.c b/fs/f2fs/super.c index f087b2b71c89..bbf1dad6843f 100644 --- a/fs/f2fs/super.c +++ b/fs/f2fs/super.c @@ -47,6 +47,7 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_KVMALLOC] = "kvmalloc", [FAULT_PAGE_ALLOC] = "page alloc", [FAULT_PAGE_GET] = "page get", + [FAULT_ALLOC_BIO] = "alloc bio(obsolete)", [FAULT_ALLOC_NID] = "alloc nid", [FAULT_ORPHAN] = "orphan", [FAULT_BLOCK] = "no more block", @@ -64,32 +65,35 @@ const char *f2fs_fault_name[FAULT_MAX] = { [FAULT_BLKADDR_CONSISTENCE] = "inconsistent blkaddr", [FAULT_NO_SEGMENT] = "no free segment", [FAULT_INCONSISTENT_FOOTER] = "inconsistent footer", + [FAULT_TIMEOUT] = "timeout", + [FAULT_VMALLOC] = "vmalloc", }; int f2fs_build_fault_attr(struct f2fs_sb_info *sbi, unsigned long rate, - unsigned long type) + unsigned long type, enum fault_option fo) { struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; - if (rate) { + if (fo & FAULT_ALL) { + memset(ffi, 0, sizeof(struct f2fs_fault_info)); + return 0; + } + + if (fo & FAULT_RATE) { if (rate > INT_MAX) return -EINVAL; atomic_set(&ffi->inject_ops, 0); ffi->inject_rate = (int)rate; + f2fs_info(sbi, "build fault injection rate: %lu", rate); } - if (type) { + if (fo & FAULT_TYPE) { if (type >= BIT(FAULT_MAX)) return -EINVAL; ffi->inject_type = (unsigned int)type; + f2fs_info(sbi, "build fault injection type: 0x%lx", type); } - if (!rate && !type) - memset(ffi, 0, sizeof(struct f2fs_fault_info)); - else - f2fs_info(sbi, - "build fault injection attr: rate: %lu, type: 0x%lx", - rate, type); return 0; } #endif @@ -896,8 +900,7 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options, bool is_remoun case Opt_fault_injection: if (args->from && match_int(args, &arg)) return -EINVAL; - if (f2fs_build_fault_attr(sbi, arg, - F2FS_ALL_FAULT_TYPE)) + if (f2fs_build_fault_attr(sbi, arg, 0, FAULT_RATE)) return -EINVAL; set_opt(sbi, FAULT_INJECTION); break; @@ -905,7 +908,7 @@ static int parse_options(struct f2fs_sb_info *sbi, char *options, bool is_remoun case Opt_fault_type: if (args->from && match_int(args, &arg)) return -EINVAL; - if (f2fs_build_fault_attr(sbi, 0, arg)) + if (f2fs_build_fault_attr(sbi, 0, arg, FAULT_TYPE)) return -EINVAL; set_opt(sbi, FAULT_INJECTION); break; @@ -1531,7 +1534,9 @@ int f2fs_inode_dirtied(struct inode *inode, bool sync) } spin_unlock(&sbi->inode_lock[DIRTY_META]); - if (!ret && f2fs_is_atomic_file(inode)) + /* if atomic write is not committed, set inode w/ atomic dirty */ + if (!ret && f2fs_is_atomic_file(inode) && + !is_inode_flag_set(inode, FI_ATOMIC_COMMITTED)) set_inode_flag(inode, FI_ATOMIC_DIRTIED); return ret; @@ -1804,26 +1809,32 @@ static int f2fs_statfs_project(struct super_block *sb, limit = min_not_zero(dquot->dq_dqb.dqb_bsoftlimit, dquot->dq_dqb.dqb_bhardlimit); - if (limit) - limit >>= sb->s_blocksize_bits; + limit >>= sb->s_blocksize_bits; + + if (limit) { + uint64_t remaining = 0; - if (limit && buf->f_blocks > limit) { curblock = (dquot->dq_dqb.dqb_curspace + dquot->dq_dqb.dqb_rsvspace) >> sb->s_blocksize_bits; - buf->f_blocks = limit; - buf->f_bfree = buf->f_bavail = - (buf->f_blocks > curblock) ? 
- (buf->f_blocks - curblock) : 0; + if (limit > curblock) + remaining = limit - curblock; + + buf->f_blocks = min(buf->f_blocks, limit); + buf->f_bfree = min(buf->f_bfree, remaining); + buf->f_bavail = min(buf->f_bavail, remaining); } limit = min_not_zero(dquot->dq_dqb.dqb_isoftlimit, dquot->dq_dqb.dqb_ihardlimit); - if (limit && buf->f_files > limit) { - buf->f_files = limit; - buf->f_ffree = - (buf->f_files > dquot->dq_dqb.dqb_curinodes) ? - (buf->f_files - dquot->dq_dqb.dqb_curinodes) : 0; + if (limit) { + uint64_t remaining = 0; + + if (limit > dquot->dq_dqb.dqb_curinodes) + remaining = limit - dquot->dq_dqb.dqb_curinodes; + + buf->f_files = min(buf->f_files, limit); + buf->f_ffree = min(buf->f_ffree, remaining); } spin_unlock(&dquot->dq_dqb_lock); @@ -1882,9 +1893,9 @@ static int f2fs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid = u64_to_fsid(id); #ifdef CONFIG_QUOTA - if (is_inode_flag_set(dentry->d_inode, FI_PROJ_INHERIT) && + if (is_inode_flag_set(d_inode(dentry), FI_PROJ_INHERIT) && sb_has_quota_limits_enabled(sb, PRJQUOTA)) { - f2fs_statfs_project(sb, F2FS_I(dentry->d_inode)->i_projid, buf); + f2fs_statfs_project(sb, F2FS_I(d_inode(dentry))->i_projid, buf); } #endif return 0; @@ -2208,7 +2219,7 @@ static void default_options(struct f2fs_sb_info *sbi, bool remount) set_opt(sbi, POSIX_ACL); #endif - f2fs_build_fault_attr(sbi, 0, 0); + f2fs_build_fault_attr(sbi, 0, 0, FAULT_ALL); } #ifdef CONFIG_QUOTA @@ -2689,12 +2700,9 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, { struct inode *inode = sb_dqopt(sb)->files[type]; struct address_space *mapping = inode->i_mapping; - block_t blkidx = F2FS_BYTES_TO_BLK(off); - int offset = off & (sb->s_blocksize - 1); int tocopy; size_t toread; loff_t i_size = i_size_read(inode); - struct page *page; if (off > i_size) return 0; @@ -2703,37 +2711,42 @@ static ssize_t f2fs_quota_read(struct super_block *sb, int type, char *data, len = i_size - off; toread = len; while (toread > 0) { - tocopy = min_t(unsigned long, sb->s_blocksize - offset, toread); + struct folio *folio; + size_t offset; + repeat: - page = read_cache_page_gfp(mapping, blkidx, GFP_NOFS); - if (IS_ERR(page)) { - if (PTR_ERR(page) == -ENOMEM) { + folio = mapping_read_folio_gfp(mapping, off >> PAGE_SHIFT, + GFP_NOFS); + if (IS_ERR(folio)) { + if (PTR_ERR(folio) == -ENOMEM) { memalloc_retry_wait(GFP_NOFS); goto repeat; } set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); - return PTR_ERR(page); + return PTR_ERR(folio); } + offset = offset_in_folio(folio, off); + tocopy = min(folio_size(folio) - offset, toread); - lock_page(page); + folio_lock(folio); - if (unlikely(page->mapping != mapping)) { - f2fs_put_page(page, 1); + if (unlikely(folio->mapping != mapping)) { + f2fs_folio_put(folio, true); goto repeat; } - if (unlikely(!PageUptodate(page))) { - f2fs_put_page(page, 1); - set_sbi_flag(F2FS_SB(sb), SBI_QUOTA_NEED_REPAIR); - return -EIO; - } - memcpy_from_page(data, page, offset, tocopy); - f2fs_put_page(page, 1); + /* + * should never happen, just leave f2fs_bug_on() here to catch + * any potential bug. 
+ */ + f2fs_bug_on(F2FS_SB(sb), !folio_test_uptodate(folio)); + + memcpy_from_folio(data, folio, offset, tocopy); + f2fs_folio_put(folio, true); - offset = 0; toread -= tocopy; data += tocopy; - blkidx++; + off += tocopy; } return len; } @@ -3432,7 +3445,7 @@ static int __f2fs_commit_super(struct f2fs_sb_info *sbi, struct folio *folio, bio = bio_alloc(sbi->sb->s_bdev, 1, opf, GFP_NOFS); /* it doesn't need to set crypto context for superblock update */ - bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(folio_index(folio)); + bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(folio->index); if (!bio_add_folio(bio, folio, folio_size(folio), 0)) f2fs_bug_on(sbi, 1); @@ -3558,7 +3571,7 @@ static int sanity_check_raw_super(struct f2fs_sb_info *sbi, return -EFSCORRUPTED; } crc = le32_to_cpu(raw_super->crc); - if (!f2fs_crc_valid(sbi, crc, raw_super, crc_offset)) { + if (crc != f2fs_crc32(raw_super, crc_offset)) { f2fs_info(sbi, "Invalid SB checksum value: %u", crc); return -EFSCORRUPTED; } @@ -3717,6 +3730,7 @@ int f2fs_sanity_check_ckpt(struct f2fs_sb_info *sbi) block_t user_block_count, valid_user_blocks; block_t avail_node_count, valid_node_count; unsigned int nat_blocks, nat_bits_bytes, nat_bits_blocks; + unsigned int sit_blk_cnt; int i, j; total = le32_to_cpu(raw_super->segment_count); @@ -3828,6 +3842,13 @@ skip_cross: return 1; } + sit_blk_cnt = DIV_ROUND_UP(main_segs, SIT_ENTRY_PER_BLOCK); + if (sit_bitmap_size * 8 < sit_blk_cnt) { + f2fs_err(sbi, "Wrong bitmap size: sit: %u, sit_blk_cnt:%u", + sit_bitmap_size, sit_blk_cnt); + return 1; + } + cp_pack_start_sum = __start_sum_addr(sbi); cp_payload = __cp_payload(sbi); if (cp_pack_start_sum < cp_payload + 1 || @@ -4106,7 +4127,7 @@ int f2fs_commit_super(struct f2fs_sb_info *sbi, bool recover) /* we should update superblock crc here */ if (!recover && f2fs_sb_has_sb_chksum(sbi)) { - crc = f2fs_crc32(sbi, F2FS_RAW_SUPER(sbi), + crc = f2fs_crc32(F2FS_RAW_SUPER(sbi), offsetof(struct f2fs_super_block, crc)); F2FS_RAW_SUPER(sbi)->crc = cpu_to_le32(crc); } @@ -4304,14 +4325,35 @@ static void f2fs_record_error_work(struct work_struct *work) f2fs_record_stop_reason(sbi); } -static inline unsigned int get_first_zoned_segno(struct f2fs_sb_info *sbi) +static inline unsigned int get_first_seq_zone_segno(struct f2fs_sb_info *sbi) { +#ifdef CONFIG_BLK_DEV_ZONED + unsigned int zoneno, total_zones; int devi; - for (devi = 0; devi < sbi->s_ndevs; devi++) - if (bdev_is_zoned(FDEV(devi).bdev)) - return GET_SEGNO(sbi, FDEV(devi).start_blk); - return 0; + if (!f2fs_sb_has_blkzoned(sbi)) + return NULL_SEGNO; + + for (devi = 0; devi < sbi->s_ndevs; devi++) { + if (!bdev_is_zoned(FDEV(devi).bdev)) + continue; + + total_zones = GET_ZONE_FROM_SEG(sbi, FDEV(devi).total_segments); + + for (zoneno = 0; zoneno < total_zones; zoneno++) { + unsigned int segs, blks; + + if (!f2fs_zone_is_seq(sbi, devi, zoneno)) + continue; + + segs = GET_SEG_FROM_SEC(sbi, + zoneno * sbi->secs_per_zone); + blks = SEGS_TO_BLKS(sbi, segs); + return GET_SEGNO(sbi, FDEV(devi).start_blk + blks); + } + } +#endif + return NULL_SEGNO; } static int f2fs_scan_devices(struct f2fs_sb_info *sbi) @@ -4348,6 +4390,14 @@ static int f2fs_scan_devices(struct f2fs_sb_info *sbi) #endif for (i = 0; i < max_devices; i++) { + if (max_devices == 1) { + FDEV(i).total_segments = + le32_to_cpu(raw_super->segment_count_main); + FDEV(i).start_blk = 0; + FDEV(i).end_blk = FDEV(i).total_segments * + BLKS_PER_SEG(sbi); + } + if (i == 0) FDEV(0).bdev_file = sbi->sb->s_bdev_file; else if (!RDEV(i).path[0]) @@ -4538,8 +4588,8 @@ 
try_onemore: /* precompute checksum seed for metadata */ if (f2fs_sb_has_inode_chksum(sbi)) - sbi->s_chksum_seed = f2fs_chksum(sbi, ~0, raw_super->uuid, - sizeof(raw_super->uuid)); + sbi->s_chksum_seed = f2fs_chksum(~0, raw_super->uuid, + sizeof(raw_super->uuid)); default_options(sbi, false); /* parse mount options */ @@ -4718,7 +4768,7 @@ try_onemore: sbi->sectors_written_start = f2fs_get_sectors_written(sbi); /* get segno of first zoned block device */ - sbi->first_zoned_segno = get_first_zoned_segno(sbi); + sbi->first_seq_zone_segno = get_first_seq_zone_segno(sbi); /* Read accumulated write IO statistics if exists */ seg_i = CURSEG_I(sbi, CURSEG_HOT_NODE); diff --git a/fs/f2fs/sysfs.c b/fs/f2fs/sysfs.c index c69161366467..75134d69a0bd 100644 --- a/fs/f2fs/sysfs.c +++ b/fs/f2fs/sysfs.c @@ -274,6 +274,13 @@ static ssize_t encoding_show(struct f2fs_attr *a, return sysfs_emit(buf, "(none)\n"); } +static ssize_t encoding_flags_show(struct f2fs_attr *a, + struct f2fs_sb_info *sbi, char *buf) +{ + return sysfs_emit(buf, "%x\n", + le16_to_cpu(F2FS_RAW_SUPER(sbi)->s_encoding_flags)); +} + static ssize_t mounted_time_sec_show(struct f2fs_attr *a, struct f2fs_sb_info *sbi, char *buf) { @@ -494,12 +501,12 @@ out: return ret; #ifdef CONFIG_F2FS_FAULT_INJECTION if (a->struct_type == FAULT_INFO_TYPE) { - if (f2fs_build_fault_attr(sbi, 0, t)) + if (f2fs_build_fault_attr(sbi, 0, t, FAULT_TYPE)) return -EINVAL; return count; } if (a->struct_type == FAULT_INFO_RATE) { - if (f2fs_build_fault_attr(sbi, t, 0)) + if (f2fs_build_fault_attr(sbi, t, 0, FAULT_RATE)) return -EINVAL; return count; } @@ -1158,6 +1165,7 @@ F2FS_GENERAL_RO_ATTR(features); F2FS_GENERAL_RO_ATTR(current_reserved_blocks); F2FS_GENERAL_RO_ATTR(unusable); F2FS_GENERAL_RO_ATTR(encoding); +F2FS_GENERAL_RO_ATTR(encoding_flags); F2FS_GENERAL_RO_ATTR(mounted_time_sec); F2FS_GENERAL_RO_ATTR(main_blkaddr); F2FS_GENERAL_RO_ATTR(pending_discard); @@ -1199,6 +1207,9 @@ F2FS_FEATURE_RO_ATTR(readonly); F2FS_FEATURE_RO_ATTR(compression); #endif F2FS_FEATURE_RO_ATTR(pin_file); +#ifdef CONFIG_UNICODE +F2FS_FEATURE_RO_ATTR(linear_lookup); +#endif #define ATTR_LIST(name) (&f2fs_attr_##name.attr) static struct attribute *f2fs_attrs[] = { @@ -1270,6 +1281,7 @@ static struct attribute *f2fs_attrs[] = { ATTR_LIST(reserved_blocks), ATTR_LIST(current_reserved_blocks), ATTR_LIST(encoding), + ATTR_LIST(encoding_flags), ATTR_LIST(mounted_time_sec), #ifdef CONFIG_F2FS_STAT_FS ATTR_LIST(cp_foreground_calls), @@ -1347,6 +1359,9 @@ static struct attribute *f2fs_feat_attrs[] = { BASE_ATTR_LIST(compression), #endif BASE_ATTR_LIST(pin_file), +#ifdef CONFIG_UNICODE + BASE_ATTR_LIST(linear_lookup), +#endif NULL, }; ATTRIBUTE_GROUPS(f2fs_feat); @@ -1679,6 +1694,24 @@ static int __maybe_unused disk_map_seq_show(struct seq_file *seq, return 0; } +#ifdef CONFIG_F2FS_FAULT_INJECTION +static int __maybe_unused inject_stats_seq_show(struct seq_file *seq, + void *offset) +{ + struct super_block *sb = seq->private; + struct f2fs_sb_info *sbi = F2FS_SB(sb); + struct f2fs_fault_info *ffi = &F2FS_OPTION(sbi).fault_info; + int i; + + seq_puts(seq, "fault_type injected_count\n"); + + for (i = 0; i < FAULT_MAX; i++) + seq_printf(seq, "%-24s%-10u\n", f2fs_fault_name[i], + ffi->inject_count[i]); + return 0; +} +#endif + int __init f2fs_init_sysfs(void) { int ret; @@ -1770,6 +1803,10 @@ int f2fs_register_sysfs(struct f2fs_sb_info *sbi) discard_plist_seq_show, sb); proc_create_single_data("disk_map", 0444, sbi->s_proc, disk_map_seq_show, sb); +#ifdef CONFIG_F2FS_FAULT_INJECTION + 
proc_create_single_data("inject_stats", 0444, sbi->s_proc, + inject_stats_seq_show, sb); +#endif return 0; put_feature_list_kobj: kobject_put(&sbi->s_feature_list_kobj); diff --git a/fs/f2fs/xattr.c b/fs/f2fs/xattr.c index c691b35618ad..58632a2b6613 100644 --- a/fs/f2fs/xattr.c +++ b/fs/f2fs/xattr.c @@ -136,7 +136,7 @@ static int f2fs_xattr_advise_set(const struct xattr_handler *handler, #ifdef CONFIG_F2FS_FS_SECURITY static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array, - void *page) + void *folio) { const struct xattr *xattr; int err = 0; @@ -144,7 +144,7 @@ static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array, for (xattr = xattr_array; xattr->name != NULL; xattr++) { err = f2fs_setxattr(inode, F2FS_XATTR_INDEX_SECURITY, xattr->name, xattr->value, - xattr->value_len, (struct page *)page, 0); + xattr->value_len, folio, 0); if (err < 0) break; } @@ -152,10 +152,10 @@ static int f2fs_initxattrs(struct inode *inode, const struct xattr *xattr_array, } int f2fs_init_security(struct inode *inode, struct inode *dir, - const struct qstr *qstr, struct page *ipage) + const struct qstr *qstr, struct folio *ifolio) { return security_inode_init_security(inode, dir, qstr, - &f2fs_initxattrs, ipage); + f2fs_initxattrs, ifolio); } #endif @@ -271,25 +271,25 @@ static struct f2fs_xattr_entry *__find_inline_xattr(struct inode *inode, return entry; } -static int read_inline_xattr(struct inode *inode, struct page *ipage, +static int read_inline_xattr(struct inode *inode, struct folio *ifolio, void *txattr_addr) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); unsigned int inline_size = inline_xattr_size(inode); - struct page *page = NULL; + struct folio *folio = NULL; void *inline_addr; - if (ipage) { - inline_addr = inline_xattr_addr(inode, ipage); + if (ifolio) { + inline_addr = inline_xattr_addr(inode, ifolio); } else { - page = f2fs_get_inode_page(sbi, inode->i_ino); - if (IS_ERR(page)) - return PTR_ERR(page); + folio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(folio)) + return PTR_ERR(folio); - inline_addr = inline_xattr_addr(inode, page); + inline_addr = inline_xattr_addr(inode, folio); } memcpy(txattr_addr, inline_addr, inline_size); - f2fs_put_page(page, 1); + f2fs_folio_put(folio, true); return 0; } @@ -299,22 +299,22 @@ static int read_xattr_block(struct inode *inode, void *txattr_addr) struct f2fs_sb_info *sbi = F2FS_I_SB(inode); nid_t xnid = F2FS_I(inode)->i_xattr_nid; unsigned int inline_size = inline_xattr_size(inode); - struct page *xpage; + struct folio *xfolio; void *xattr_addr; /* The inode already has an extended attribute block. 
*/ - xpage = f2fs_get_xnode_page(sbi, xnid); - if (IS_ERR(xpage)) - return PTR_ERR(xpage); + xfolio = f2fs_get_xnode_folio(sbi, xnid); + if (IS_ERR(xfolio)) + return PTR_ERR(xfolio); - xattr_addr = page_address(xpage); + xattr_addr = folio_address(xfolio); memcpy(txattr_addr + inline_size, xattr_addr, VALID_XATTR_BLOCK_SIZE); - f2fs_put_page(xpage, 1); + f2fs_folio_put(xfolio, true); return 0; } -static int lookup_all_xattrs(struct inode *inode, struct page *ipage, +static int lookup_all_xattrs(struct inode *inode, struct folio *ifolio, unsigned int index, unsigned int len, const char *name, struct f2fs_xattr_entry **xe, void **base_addr, int *base_size, @@ -338,7 +338,7 @@ static int lookup_all_xattrs(struct inode *inode, struct page *ipage, /* read from inline xattr */ if (inline_size) { - err = read_inline_xattr(inode, ipage, txattr_addr); + err = read_inline_xattr(inode, ifolio, txattr_addr); if (err) goto out; @@ -385,7 +385,7 @@ out: return err; } -static int read_all_xattrs(struct inode *inode, struct page *ipage, +static int read_all_xattrs(struct inode *inode, struct folio *ifolio, void **base_addr) { struct f2fs_xattr_header *header; @@ -402,7 +402,7 @@ static int read_all_xattrs(struct inode *inode, struct page *ipage, /* read from inline xattr */ if (inline_size) { - err = read_inline_xattr(inode, ipage, txattr_addr); + err = read_inline_xattr(inode, ifolio, txattr_addr); if (err) goto fail; } @@ -429,14 +429,14 @@ fail: } static inline int write_all_xattrs(struct inode *inode, __u32 hsize, - void *txattr_addr, struct page *ipage) + void *txattr_addr, struct folio *ifolio) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); size_t inline_size = inline_xattr_size(inode); - struct page *in_page = NULL; + struct folio *in_folio = NULL; void *xattr_addr; void *inline_addr = NULL; - struct page *xpage; + struct folio *xfolio; nid_t new_nid = 0; int err = 0; @@ -446,73 +446,73 @@ static inline int write_all_xattrs(struct inode *inode, __u32 hsize, /* write to inline xattr */ if (inline_size) { - if (ipage) { - inline_addr = inline_xattr_addr(inode, ipage); + if (ifolio) { + inline_addr = inline_xattr_addr(inode, ifolio); } else { - in_page = f2fs_get_inode_page(sbi, inode->i_ino); - if (IS_ERR(in_page)) { + in_folio = f2fs_get_inode_folio(sbi, inode->i_ino); + if (IS_ERR(in_folio)) { f2fs_alloc_nid_failed(sbi, new_nid); - return PTR_ERR(in_page); + return PTR_ERR(in_folio); } - inline_addr = inline_xattr_addr(inode, in_page); + inline_addr = inline_xattr_addr(inode, in_folio); } - f2fs_wait_on_page_writeback(ipage ? ipage : in_page, + f2fs_folio_wait_writeback(ifolio ? ifolio : in_folio, NODE, true, true); /* no need to use xattr node block */ if (hsize <= inline_size) { err = f2fs_truncate_xattr_node(inode); f2fs_alloc_nid_failed(sbi, new_nid); if (err) { - f2fs_put_page(in_page, 1); + f2fs_folio_put(in_folio, true); return err; } memcpy(inline_addr, txattr_addr, inline_size); - set_page_dirty(ipage ? ipage : in_page); + folio_mark_dirty(ifolio ? 
ifolio : in_folio); goto in_page_out; } } /* write to xattr node block */ if (F2FS_I(inode)->i_xattr_nid) { - xpage = f2fs_get_xnode_page(sbi, F2FS_I(inode)->i_xattr_nid); - if (IS_ERR(xpage)) { - err = PTR_ERR(xpage); + xfolio = f2fs_get_xnode_folio(sbi, F2FS_I(inode)->i_xattr_nid); + if (IS_ERR(xfolio)) { + err = PTR_ERR(xfolio); f2fs_alloc_nid_failed(sbi, new_nid); goto in_page_out; } f2fs_bug_on(sbi, new_nid); - f2fs_wait_on_page_writeback(xpage, NODE, true, true); + f2fs_folio_wait_writeback(xfolio, NODE, true, true); } else { struct dnode_of_data dn; set_new_dnode(&dn, inode, NULL, NULL, new_nid); - xpage = f2fs_new_node_page(&dn, XATTR_NODE_OFFSET); - if (IS_ERR(xpage)) { - err = PTR_ERR(xpage); + xfolio = f2fs_new_node_folio(&dn, XATTR_NODE_OFFSET); + if (IS_ERR(xfolio)) { + err = PTR_ERR(xfolio); f2fs_alloc_nid_failed(sbi, new_nid); goto in_page_out; } f2fs_alloc_nid_done(sbi, new_nid); } - xattr_addr = page_address(xpage); + xattr_addr = folio_address(xfolio); if (inline_size) memcpy(inline_addr, txattr_addr, inline_size); memcpy(xattr_addr, txattr_addr + inline_size, VALID_XATTR_BLOCK_SIZE); if (inline_size) - set_page_dirty(ipage ? ipage : in_page); - set_page_dirty(xpage); + folio_mark_dirty(ifolio ? ifolio : in_folio); + folio_mark_dirty(xfolio); - f2fs_put_page(xpage, 1); + f2fs_folio_put(xfolio, true); in_page_out: - f2fs_put_page(in_page, 1); + f2fs_folio_put(in_folio, true); return err; } int f2fs_getxattr(struct inode *inode, int index, const char *name, - void *buffer, size_t buffer_size, struct page *ipage) + void *buffer, size_t buffer_size, struct folio *ifolio) { struct f2fs_xattr_entry *entry = NULL; int error; @@ -528,11 +528,11 @@ int f2fs_getxattr(struct inode *inode, int index, const char *name, if (len > F2FS_NAME_LEN) return -ERANGE; - if (!ipage) + if (!ifolio) f2fs_down_read(&F2FS_I(inode)->i_xattr_sem); - error = lookup_all_xattrs(inode, ipage, index, len, name, + error = lookup_all_xattrs(inode, ifolio, index, len, name, &entry, &base_addr, &base_size, &is_inline); - if (!ipage) + if (!ifolio) f2fs_up_read(&F2FS_I(inode)->i_xattr_sem); if (error) return error; @@ -627,7 +627,7 @@ static bool f2fs_xattr_value_same(struct f2fs_xattr_entry *entry, static int __f2fs_setxattr(struct inode *inode, int index, const char *name, const void *value, size_t size, - struct page *ipage, int flags) + struct folio *ifolio, int flags) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); struct f2fs_xattr_entry *here, *last; @@ -651,7 +651,7 @@ static int __f2fs_setxattr(struct inode *inode, int index, if (size > MAX_VALUE_LEN(inode)) return -E2BIG; retry: - error = read_all_xattrs(inode, ipage, &base_addr); + error = read_all_xattrs(inode, ifolio, &base_addr); if (error) return error; @@ -766,7 +766,7 @@ retry: *(u32 *)((u8 *)last + newsize) = 0; } - error = write_all_xattrs(inode, new_hsize, base_addr, ipage); + error = write_all_xattrs(inode, new_hsize, base_addr, ifolio); if (error) goto exit; @@ -800,7 +800,7 @@ exit: int f2fs_setxattr(struct inode *inode, int index, const char *name, const void *value, size_t size, - struct page *ipage, int flags) + struct folio *ifolio, int flags) { struct f2fs_sb_info *sbi = F2FS_I_SB(inode); int err; @@ -815,14 +815,14 @@ int f2fs_setxattr(struct inode *inode, int index, const char *name, return err; /* this case is only from f2fs_init_inode_metadata */ - if (ipage) + if (ifolio) return __f2fs_setxattr(inode, index, name, value, - size, ipage, flags); + size, ifolio, flags); f2fs_balance_fs(sbi, true); f2fs_lock_op(sbi); 
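Note: the f2fs hunks above convert the xattr read paths (read_inline_xattr(), read_xattr_block(), lookup_all_xattrs()) from struct page to struct folio while keeping the same shape: the inline xattr area and the separate xattr node block are copied back-to-back into one temporary buffer, which is then scanned entry by entry. The following is a minimal userspace sketch of that lookup model only; the entry layout, struct names and helper names here are invented for illustration and are not the f2fs on-disk format or kernel API.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

struct xent {                  /* simplified header before name+value bytes */
	uint8_t index;         /* namespace id (user, security, ...) */
	uint8_t name_len;
	uint16_t value_len;
};

/* Linear scan over the concatenated buffer, loosely modelling what
 * lookup_all_xattrs() does after the inline area and xattr block have
 * been copied into one allocation. */
static int find_xattr(const uint8_t *buf, size_t len, uint8_t index,
		      const char *name, size_t *val_off, size_t *val_len)
{
	size_t off = 0, nlen = strlen(name);
	struct xent e;

	while (off + sizeof(e) <= len) {
		memcpy(&e, buf + off, sizeof(e));       /* avoid unaligned access */
		if (e.name_len == 0)                    /* end marker */
			break;
		size_t esize = sizeof(e) + e.name_len + e.value_len;
		if (off + esize > len)                  /* truncated entry */
			break;
		if (e.index == index && e.name_len == nlen &&
		    !memcmp(buf + off + sizeof(e), name, nlen)) {
			*val_off = off + sizeof(e) + e.name_len;
			*val_len = e.value_len;
			return 1;
		}
		off += esize;
	}
	return 0;
}

int main(void)
{
	uint8_t buf[128] = { 0 };     /* "inline area" followed by "xattr block" */
	size_t off = 0, voff, vlen;
	struct xent e1 = { .index = 1, .name_len = 4, .value_len = 3 };
	struct xent e2 = { .index = 2, .name_len = 7, .value_len = 5 };

	memcpy(buf + off, &e1, sizeof(e1)); off += sizeof(e1);
	memcpy(buf + off, "mode", 4);       off += 4;
	memcpy(buf + off, "rwx", 3);        off += 3;
	memcpy(buf + off, &e2, sizeof(e2)); off += sizeof(e2);
	memcpy(buf + off, "selinux", 7);    off += 7;
	memcpy(buf + off, "label", 5);      off += 5;

	if (find_xattr(buf, sizeof(buf), 2, "selinux", &voff, &vlen))
		printf("found, %zu value bytes at offset %zu\n", vlen, voff);
	return 0;
}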
f2fs_down_write(&F2FS_I(inode)->i_xattr_sem); - err = __f2fs_setxattr(inode, index, name, value, size, ipage, flags); + err = __f2fs_setxattr(inode, index, name, value, size, NULL, flags); f2fs_up_write(&F2FS_I(inode)->i_xattr_sem); f2fs_unlock_op(sbi); diff --git a/fs/f2fs/xattr.h b/fs/f2fs/xattr.h index a005ffdcf717..4fc0b2305fbd 100644 --- a/fs/f2fs/xattr.h +++ b/fs/f2fs/xattr.h @@ -127,26 +127,26 @@ extern const struct xattr_handler f2fs_xattr_security_handler; extern const struct xattr_handler * const f2fs_xattr_handlers[]; -extern int f2fs_setxattr(struct inode *, int, const char *, - const void *, size_t, struct page *, int); -extern int f2fs_getxattr(struct inode *, int, const char *, void *, - size_t, struct page *); -extern ssize_t f2fs_listxattr(struct dentry *, char *, size_t); -extern int f2fs_init_xattr_caches(struct f2fs_sb_info *); -extern void f2fs_destroy_xattr_caches(struct f2fs_sb_info *); +int f2fs_setxattr(struct inode *, int, const char *, const void *, + size_t, struct folio *, int); +int f2fs_getxattr(struct inode *, int, const char *, void *, + size_t, struct folio *); +ssize_t f2fs_listxattr(struct dentry *, char *, size_t); +int f2fs_init_xattr_caches(struct f2fs_sb_info *); +void f2fs_destroy_xattr_caches(struct f2fs_sb_info *); #else #define f2fs_xattr_handlers NULL #define f2fs_listxattr NULL static inline int f2fs_setxattr(struct inode *inode, int index, const char *name, const void *value, size_t size, - struct page *page, int flags) + struct folio *folio, int flags) { return -EOPNOTSUPP; } static inline int f2fs_getxattr(struct inode *inode, int index, const char *name, void *buffer, - size_t buffer_size, struct page *dpage) + size_t buffer_size, struct folio *dfolio) { return -EOPNOTSUPP; } @@ -155,11 +155,11 @@ static inline void f2fs_destroy_xattr_caches(struct f2fs_sb_info *sbi) { } #endif #ifdef CONFIG_F2FS_FS_SECURITY -extern int f2fs_init_security(struct inode *, struct inode *, - const struct qstr *, struct page *); +int f2fs_init_security(struct inode *, struct inode *, + const struct qstr *, struct folio *); #else static inline int f2fs_init_security(struct inode *inode, struct inode *dir, - const struct qstr *qstr, struct page *ipage) + const struct qstr *qstr, struct folio *ifolio) { return 0; } diff --git a/fs/fuse/dev.c b/fs/fuse/dev.c index 6dcbaa218b7a..e80cd8f2c049 100644 --- a/fs/fuse/dev.c +++ b/fs/fuse/dev.c @@ -23,6 +23,7 @@ #include <linux/swap.h> #include <linux/splice.h> #include <linux/sched.h> +#include <linux/seq_file.h> #define CREATE_TRACE_POINTS #include "fuse_trace.h" @@ -45,7 +46,7 @@ bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list) return time_is_before_jiffies(req->create_time + fc->timeout.req_timeout); } -bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing) +static bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing) { int i; @@ -816,7 +817,7 @@ static int unlock_request(struct fuse_req *req) return err; } -void fuse_copy_init(struct fuse_copy_state *cs, int write, +void fuse_copy_init(struct fuse_copy_state *cs, bool write, struct iov_iter *iter) { memset(cs, 0, sizeof(*cs)); @@ -955,10 +956,10 @@ static int fuse_check_folio(struct folio *folio) * folio that was originally in @pagep will lose a reference and the new * folio returned in @pagep will carry a reference. 
*/ -static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) +static int fuse_try_move_folio(struct fuse_copy_state *cs, struct folio **foliop) { int err; - struct folio *oldfolio = page_folio(*pagep); + struct folio *oldfolio = *foliop; struct folio *newfolio; struct pipe_buffer *buf = cs->pipebufs; @@ -979,7 +980,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) cs->pipebufs++; cs->nr_segs--; - if (cs->len != PAGE_SIZE) + if (cs->len != folio_size(oldfolio)) goto out_fallback; if (!pipe_buf_try_steal(cs->pipe, buf)) @@ -1025,7 +1026,7 @@ static int fuse_try_move_page(struct fuse_copy_state *cs, struct page **pagep) if (test_bit(FR_ABORTED, &cs->req->flags)) err = -ENOENT; else - *pagep = &newfolio->page; + *foliop = newfolio; spin_unlock(&cs->req->waitq.lock); if (err) { @@ -1058,8 +1059,8 @@ out_fallback: goto out_put_old; } -static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, - unsigned offset, unsigned count) +static int fuse_ref_folio(struct fuse_copy_state *cs, struct folio *folio, + unsigned offset, unsigned count) { struct pipe_buffer *buf; int err; @@ -1067,17 +1068,17 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, if (cs->nr_segs >= cs->pipe->max_usage) return -EIO; - get_page(page); + folio_get(folio); err = unlock_request(cs->req); if (err) { - put_page(page); + folio_put(folio); return err; } fuse_copy_finish(cs); buf = cs->pipebufs; - buf->page = page; + buf->page = &folio->page; buf->offset = offset; buf->len = count; @@ -1089,20 +1090,24 @@ static int fuse_ref_page(struct fuse_copy_state *cs, struct page *page, } /* - * Copy a page in the request to/from the userspace buffer. Must be + * Copy a folio in the request to/from the userspace buffer. Must be * done atomically */ -static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, - unsigned offset, unsigned count, int zeroing) +static int fuse_copy_folio(struct fuse_copy_state *cs, struct folio **foliop, + unsigned offset, unsigned count, int zeroing) { int err; - struct page *page = *pagep; + struct folio *folio = *foliop; + size_t size; - if (page && zeroing && count < PAGE_SIZE) - clear_highpage(page); + if (folio) { + size = folio_size(folio); + if (zeroing && count < size) + folio_zero_range(folio, 0, size); + } while (count) { - if (cs->write && cs->pipebufs && page) { + if (cs->write && cs->pipebufs && folio) { /* * Can't control lifetime of pipe buffers, so always * copy user pages. 
@@ -1112,12 +1117,12 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, if (err) return err; } else { - return fuse_ref_page(cs, page, offset, count); + return fuse_ref_folio(cs, folio, offset, count); } } else if (!cs->len) { - if (cs->move_pages && page && - offset == 0 && count == PAGE_SIZE) { - err = fuse_try_move_page(cs, pagep); + if (cs->move_folios && folio && + offset == 0 && count == size) { + err = fuse_try_move_folio(cs, foliop); if (err <= 0) return err; } else { @@ -1126,22 +1131,30 @@ static int fuse_copy_page(struct fuse_copy_state *cs, struct page **pagep, return err; } } - if (page) { - void *mapaddr = kmap_local_page(page); - void *buf = mapaddr + offset; - offset += fuse_copy_do(cs, &buf, &count); + if (folio) { + void *mapaddr = kmap_local_folio(folio, offset); + void *buf = mapaddr; + unsigned int copy = count; + unsigned int bytes_copied; + + if (folio_test_highmem(folio) && count > PAGE_SIZE - offset_in_page(offset)) + copy = PAGE_SIZE - offset_in_page(offset); + + bytes_copied = fuse_copy_do(cs, &buf, ©); kunmap_local(mapaddr); + offset += bytes_copied; + count -= bytes_copied; } else offset += fuse_copy_do(cs, NULL, &count); } - if (page && !cs->write) - flush_dcache_page(page); + if (folio && !cs->write) + flush_dcache_folio(folio); return 0; } -/* Copy pages in the request to/from userspace buffer */ -static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, - int zeroing) +/* Copy folios in the request to/from userspace buffer */ +static int fuse_copy_folios(struct fuse_copy_state *cs, unsigned nbytes, + int zeroing) { unsigned i; struct fuse_req *req = cs->req; @@ -1151,23 +1164,12 @@ static int fuse_copy_pages(struct fuse_copy_state *cs, unsigned nbytes, int err; unsigned int offset = ap->descs[i].offset; unsigned int count = min(nbytes, ap->descs[i].length); - struct page *orig, *pagep; - orig = pagep = &ap->folios[i]->page; - - err = fuse_copy_page(cs, &pagep, offset, count, zeroing); + err = fuse_copy_folio(cs, &ap->folios[i], offset, count, zeroing); if (err) return err; nbytes -= count; - - /* - * fuse_copy_page may have moved a page from a pipe instead of - * copying into our given page, so update the folios if it was - * replaced. 
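Note: in the fuse_copy_folio() hunk above, kmap_local_folio() can only map a single page of a highmem folio, so each copy is clamped to PAGE_SIZE - offset_in_page(offset) and the loop then advances offset and count by the bytes actually copied. Below is a small standalone sketch of that same chunking arithmetic, with memcpy standing in for the per-page map/copy/unmap; the names and the fixed PAGE_SIZE are illustrative, not kernel API.

#include <stdio.h>
#include <string.h>
#include <stddef.h>

#define PAGE_SIZE 4096u

/* Copy `count` bytes into a large (multi-page) buffer starting at byte
 * `offset`, never crossing a page boundary in a single step -- the same
 * clamping the diff applies when only one page can be mapped at a time. */
static void copy_chunked(unsigned char *dst, size_t offset,
			 const unsigned char *src, size_t count)
{
	while (count) {
		size_t in_page = offset % PAGE_SIZE;
		size_t chunk = count;

		if (chunk > PAGE_SIZE - in_page)
			chunk = PAGE_SIZE - in_page;   /* stop at page end */

		/* In the kernel this is kmap_local_folio() + copy + kunmap. */
		memcpy(dst + offset, src, chunk);

		offset += chunk;
		src += chunk;
		count -= chunk;
	}
}

int main(void)
{
	static unsigned char folio[4 * PAGE_SIZE];
	unsigned char data[6000];

	memset(data, 0xab, sizeof(data));
	/* Starts 100 bytes before a page boundary, so the first chunk is
	 * 100 bytes and the next one a full PAGE_SIZE. */
	copy_chunked(folio, PAGE_SIZE - 100, data, sizeof(data));
	printf("copied %zu bytes in page-sized steps\n", sizeof(data));
	return 0;
}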
- */ - if (pagep != orig) - ap->folios[i] = page_folio(pagep); } return 0; } @@ -1197,7 +1199,7 @@ int fuse_copy_args(struct fuse_copy_state *cs, unsigned numargs, for (i = 0; !err && i < numargs; i++) { struct fuse_arg *arg = &args[i]; if (i == numargs - 1 && argpages) - err = fuse_copy_pages(cs, arg->size, zeroing); + err = fuse_copy_folios(cs, arg->size, zeroing); else err = fuse_copy_one(cs, arg->value, arg->size); } @@ -1538,7 +1540,7 @@ static ssize_t fuse_dev_read(struct kiocb *iocb, struct iov_iter *to) if (!user_backed_iter(to)) return -EINVAL; - fuse_copy_init(&cs, 1, to); + fuse_copy_init(&cs, true, to); return fuse_dev_do_read(fud, file, &cs, iov_iter_count(to)); } @@ -1561,7 +1563,7 @@ static ssize_t fuse_dev_splice_read(struct file *in, loff_t *ppos, if (!bufs) return -ENOMEM; - fuse_copy_init(&cs, 1, NULL); + fuse_copy_init(&cs, true, NULL); cs.pipebufs = bufs; cs.pipe = pipe; ret = fuse_dev_do_read(fud, in, &cs, len); @@ -1786,20 +1788,23 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, num = outarg.size; while (num) { struct folio *folio; - struct page *page; - unsigned int this_num; + unsigned int folio_offset; + unsigned int nr_bytes; + unsigned int nr_pages; folio = filemap_grab_folio(mapping, index); err = PTR_ERR(folio); if (IS_ERR(folio)) goto out_iput; - page = &folio->page; - this_num = min_t(unsigned, num, folio_size(folio) - offset); - err = fuse_copy_page(cs, &page, offset, this_num, 0); + folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset; + nr_bytes = min_t(unsigned, num, folio_size(folio) - folio_offset); + nr_pages = (offset + nr_bytes + PAGE_SIZE - 1) >> PAGE_SHIFT; + + err = fuse_copy_folio(cs, &folio, folio_offset, nr_bytes, 0); if (!folio_test_uptodate(folio) && !err && offset == 0 && - (this_num == folio_size(folio) || file_size == end)) { - folio_zero_segment(folio, this_num, folio_size(folio)); + (nr_bytes == folio_size(folio) || file_size == end)) { + folio_zero_segment(folio, nr_bytes, folio_size(folio)); folio_mark_uptodate(folio); } folio_unlock(folio); @@ -1808,9 +1813,9 @@ static int fuse_notify_store(struct fuse_conn *fc, unsigned int size, if (err) goto out_iput; - num -= this_num; + num -= nr_bytes; offset = 0; - index++; + index += nr_pages; } err = 0; @@ -1849,7 +1854,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, unsigned int num; unsigned int offset; size_t total_len = 0; - unsigned int num_pages, cur_pages = 0; + unsigned int num_pages; struct fuse_conn *fc = fm->fc; struct fuse_retrieve_args *ra; size_t args_size = sizeof(*ra); @@ -1867,6 +1872,7 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, num_pages = (num + offset + PAGE_SIZE - 1) >> PAGE_SHIFT; num_pages = min(num_pages, fc->max_pages); + num = min(num, num_pages << PAGE_SHIFT); args_size += num_pages * (sizeof(ap->folios[0]) + sizeof(ap->descs[0])); @@ -1887,25 +1893,29 @@ static int fuse_retrieve(struct fuse_mount *fm, struct inode *inode, index = outarg->offset >> PAGE_SHIFT; - while (num && cur_pages < num_pages) { + while (num) { struct folio *folio; - unsigned int this_num; + unsigned int folio_offset; + unsigned int nr_bytes; + unsigned int nr_pages; folio = filemap_get_folio(mapping, index); if (IS_ERR(folio)) break; - this_num = min_t(unsigned, num, PAGE_SIZE - offset); + folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset; + nr_bytes = min(folio_size(folio) - folio_offset, num); + nr_pages = (offset + nr_bytes + PAGE_SIZE - 1) >> PAGE_SHIFT; + ap->folios[ap->num_folios] = 
folio; - ap->descs[ap->num_folios].offset = offset; - ap->descs[ap->num_folios].length = this_num; + ap->descs[ap->num_folios].offset = folio_offset; + ap->descs[ap->num_folios].length = nr_bytes; ap->num_folios++; - cur_pages++; offset = 0; - num -= this_num; - total_len += this_num; - index++; + num -= nr_bytes; + total_len += nr_bytes; + index += nr_pages; } ra->inarg.offset = outarg->offset; ra->inarg.size = total_len; @@ -2021,11 +2031,24 @@ static int fuse_notify_resend(struct fuse_conn *fc) return 0; } +/* + * Increments the fuse connection epoch. This will result of dentries from + * previous epochs to be invalidated. + * + * XXX optimization: add call to shrink_dcache_sb()? + */ +static int fuse_notify_inc_epoch(struct fuse_conn *fc) +{ + atomic_inc(&fc->epoch); + + return 0; +} + static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, unsigned int size, struct fuse_copy_state *cs) { - /* Don't try to move pages (yet) */ - cs->move_pages = 0; + /* Don't try to move folios (yet) */ + cs->move_folios = false; switch (code) { case FUSE_NOTIFY_POLL: @@ -2049,6 +2072,9 @@ static int fuse_notify(struct fuse_conn *fc, enum fuse_notify_code code, case FUSE_NOTIFY_RESEND: return fuse_notify_resend(fc); + case FUSE_NOTIFY_INC_EPOCH: + return fuse_notify_inc_epoch(fc); + default: fuse_copy_finish(cs); return -EINVAL; @@ -2173,7 +2199,7 @@ static ssize_t fuse_dev_do_write(struct fuse_dev *fud, spin_unlock(&fpq->lock); cs->req = req; if (!req->args->page_replace) - cs->move_pages = 0; + cs->move_folios = false; if (oh.error) err = nbytes != sizeof(oh) ? -EINVAL : 0; @@ -2211,7 +2237,7 @@ static ssize_t fuse_dev_write(struct kiocb *iocb, struct iov_iter *from) if (!user_backed_iter(from)) return -EINVAL; - fuse_copy_init(&cs, 0, from); + fuse_copy_init(&cs, false, from); return fuse_dev_do_write(fud, &cs, iov_iter_count(from)); } @@ -2285,13 +2311,13 @@ static ssize_t fuse_dev_splice_write(struct pipe_inode_info *pipe, } pipe_unlock(pipe); - fuse_copy_init(&cs, 0, NULL); + fuse_copy_init(&cs, false, NULL); cs.pipebufs = bufs; cs.nr_segs = nbuf; cs.pipe = pipe; if (flags & SPLICE_F_MOVE) - cs.move_pages = 1; + cs.move_folios = true; ret = fuse_dev_do_write(fud, &cs, len); @@ -2602,6 +2628,17 @@ static long fuse_dev_ioctl(struct file *file, unsigned int cmd, } } +#ifdef CONFIG_PROC_FS +static void fuse_dev_show_fdinfo(struct seq_file *seq, struct file *file) +{ + struct fuse_dev *fud = fuse_get_dev(file); + if (!fud) + return; + + seq_printf(seq, "fuse_connection:\t%u\n", fud->fc->dev); +} +#endif + const struct file_operations fuse_dev_operations = { .owner = THIS_MODULE, .open = fuse_dev_open, @@ -2617,6 +2654,9 @@ const struct file_operations fuse_dev_operations = { #ifdef CONFIG_FUSE_IO_URING .uring_cmd = fuse_uring_cmd, #endif +#ifdef CONFIG_PROC_FS + .show_fdinfo = fuse_dev_show_fdinfo, +#endif }; EXPORT_SYMBOL_GPL(fuse_dev_operations); diff --git a/fs/fuse/dev_uring.c b/fs/fuse/dev_uring.c index accdce2977c5..249b210becb1 100644 --- a/fs/fuse/dev_uring.c +++ b/fs/fuse/dev_uring.c @@ -140,6 +140,21 @@ void fuse_uring_abort_end_requests(struct fuse_ring *ring) } } +static bool ent_list_request_expired(struct fuse_conn *fc, struct list_head *list) +{ + struct fuse_ring_ent *ent; + struct fuse_req *req; + + ent = list_first_entry_or_null(list, struct fuse_ring_ent, list); + if (!ent) + return false; + + req = ent->fuse_req; + + return time_is_before_jiffies(req->create_time + + fc->timeout.req_timeout); +} + bool fuse_uring_request_expired(struct fuse_conn *fc) { struct 
fuse_ring *ring = fc->ring; @@ -157,7 +172,8 @@ bool fuse_uring_request_expired(struct fuse_conn *fc) spin_lock(&queue->lock); if (fuse_request_expired(fc, &queue->fuse_req_queue) || fuse_request_expired(fc, &queue->fuse_req_bg_queue) || - fuse_fpq_processing_expired(fc, queue->fpq.processing)) { + ent_list_request_expired(fc, &queue->ent_w_req_queue) || + ent_list_request_expired(fc, &queue->ent_in_userspace)) { spin_unlock(&queue->lock); return true; } @@ -494,7 +510,7 @@ static void fuse_uring_cancel(struct io_uring_cmd *cmd, spin_lock(&queue->lock); if (ent->state == FRRS_AVAILABLE) { ent->state = FRRS_USERSPACE; - list_move(&ent->list, &queue->ent_in_userspace); + list_move_tail(&ent->list, &queue->ent_in_userspace); need_cmd_done = true; ent->cmd = NULL; } @@ -577,8 +593,8 @@ static int fuse_uring_copy_from_ring(struct fuse_ring *ring, if (err) return err; - fuse_copy_init(&cs, 0, &iter); - cs.is_uring = 1; + fuse_copy_init(&cs, false, &iter); + cs.is_uring = true; cs.req = req; return fuse_copy_out_args(&cs, args, ring_in_out.payload_sz); @@ -607,8 +623,8 @@ static int fuse_uring_args_to_ring(struct fuse_ring *ring, struct fuse_req *req, return err; } - fuse_copy_init(&cs, 1, &iter); - cs.is_uring = 1; + fuse_copy_init(&cs, true, &iter); + cs.is_uring = true; cs.req = req; if (num_args > 0) { @@ -714,7 +730,7 @@ static int fuse_uring_send_next_to_ring(struct fuse_ring_ent *ent, cmd = ent->cmd; ent->cmd = NULL; ent->state = FRRS_USERSPACE; - list_move(&ent->list, &queue->ent_in_userspace); + list_move_tail(&ent->list, &queue->ent_in_userspace); spin_unlock(&queue->lock); io_uring_cmd_done(cmd, 0, 0, issue_flags); @@ -764,7 +780,7 @@ static void fuse_uring_add_req_to_ring_ent(struct fuse_ring_ent *ent, clear_bit(FR_PENDING, &req->flags); ent->fuse_req = req; ent->state = FRRS_FUSE_REQ; - list_move(&ent->list, &queue->ent_w_req_queue); + list_move_tail(&ent->list, &queue->ent_w_req_queue); fuse_uring_add_to_pq(ent, req); } @@ -1180,7 +1196,7 @@ static void fuse_uring_send(struct fuse_ring_ent *ent, struct io_uring_cmd *cmd, spin_lock(&queue->lock); ent->state = FRRS_USERSPACE; - list_move(&ent->list, &queue->ent_in_userspace); + list_move_tail(&ent->list, &queue->ent_in_userspace); ent->cmd = NULL; spin_unlock(&queue->lock); diff --git a/fs/fuse/dir.c b/fs/fuse/dir.c index 33b82529cb6e..45b4c3cc1396 100644 --- a/fs/fuse/dir.c +++ b/fs/fuse/dir.c @@ -200,9 +200,14 @@ static int fuse_dentry_revalidate(struct inode *dir, const struct qstr *name, { struct inode *inode; struct fuse_mount *fm; + struct fuse_conn *fc; struct fuse_inode *fi; int ret; + fc = get_fuse_conn_super(dir->i_sb); + if (entry->d_time < atomic_read(&fc->epoch)) + goto invalid; + inode = d_inode_rcu(entry); if (inode && fuse_is_bad(inode)) goto invalid; @@ -319,9 +324,6 @@ static struct vfsmount *fuse_dentry_automount(struct path *path) /* Create the submount */ mnt = fc_mount(fsc); - if (!IS_ERR(mnt)) - mntget(mnt); - put_fs_context(fsc); return mnt; } @@ -415,16 +417,20 @@ int fuse_lookup_name(struct super_block *sb, u64 nodeid, const struct qstr *name static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, unsigned int flags) { - int err; struct fuse_entry_out outarg; + struct fuse_conn *fc; struct inode *inode; struct dentry *newent; + int err, epoch; bool outarg_valid = true; bool locked; if (fuse_is_bad(dir)) return ERR_PTR(-EIO); + fc = get_fuse_conn_super(dir->i_sb); + epoch = atomic_read(&fc->epoch); + locked = fuse_lock_inode(dir); err = fuse_lookup_name(dir->i_sb, get_node_id(dir), 
&entry->d_name, &outarg, &inode); @@ -446,6 +452,7 @@ static struct dentry *fuse_lookup(struct inode *dir, struct dentry *entry, goto out_err; entry = newent ? newent : entry; + entry->d_time = epoch; if (outarg_valid) fuse_change_entry_timeout(entry, &outarg); else @@ -619,7 +626,6 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, struct dentry *entry, struct file *file, unsigned int flags, umode_t mode, u32 opcode) { - int err; struct inode *inode; struct fuse_mount *fm = get_fuse_mount(dir); FUSE_ARGS(args); @@ -629,11 +635,13 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, struct fuse_entry_out outentry; struct fuse_inode *fi; struct fuse_file *ff; + int epoch, err; bool trunc = flags & O_TRUNC; /* Userspace expects S_IFREG in create mode */ BUG_ON((mode & S_IFMT) != S_IFREG); + epoch = atomic_read(&fm->fc->epoch); forget = fuse_alloc_forget(); err = -ENOMEM; if (!forget) @@ -702,6 +710,7 @@ static int fuse_create_open(struct mnt_idmap *idmap, struct inode *dir, } kfree(forget); d_instantiate(entry, inode); + entry->d_time = epoch; fuse_change_entry_timeout(entry, &outentry); fuse_dir_changed(dir); err = generic_file_open(inode, file); @@ -788,12 +797,14 @@ static struct dentry *create_new_entry(struct mnt_idmap *idmap, struct fuse_moun struct fuse_entry_out outarg; struct inode *inode; struct dentry *d; - int err; struct fuse_forget_link *forget; + int epoch, err; if (fuse_is_bad(dir)) return ERR_PTR(-EIO); + epoch = atomic_read(&fm->fc->epoch); + forget = fuse_alloc_forget(); if (!forget) return ERR_PTR(-ENOMEM); @@ -835,10 +846,13 @@ static struct dentry *create_new_entry(struct mnt_idmap *idmap, struct fuse_moun if (IS_ERR(d)) return d; - if (d) + if (d) { + d->d_time = epoch; fuse_change_entry_timeout(d, &outarg); - else + } else { + entry->d_time = epoch; fuse_change_entry_timeout(entry, &outarg); + } fuse_dir_changed(dir); return d; @@ -1612,10 +1626,10 @@ static int fuse_permission(struct mnt_idmap *idmap, return err; } -static int fuse_readlink_page(struct inode *inode, struct folio *folio) +static int fuse_readlink_folio(struct inode *inode, struct folio *folio) { struct fuse_mount *fm = get_fuse_mount(inode); - struct fuse_folio_desc desc = { .length = PAGE_SIZE - 1 }; + struct fuse_folio_desc desc = { .length = folio_size(folio) - 1 }; struct fuse_args_pages ap = { .num_folios = 1, .folios = &folio, @@ -1670,7 +1684,7 @@ static const char *fuse_get_link(struct dentry *dentry, struct inode *inode, if (!folio) goto out_err; - err = fuse_readlink_page(inode, folio); + err = fuse_readlink_folio(inode, folio); if (err) { folio_put(folio); goto out_err; @@ -1946,6 +1960,7 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, int err; bool trust_local_cmtime = is_wb; bool fault_blocked = false; + u64 attr_version; if (!fc->default_permissions) attr->ia_valid |= ATTR_FORCE; @@ -2030,6 +2045,8 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, if (fc->handle_killpriv_v2 && !capable(CAP_FSETID)) inarg.valid |= FATTR_KILL_SUIDGID; } + + attr_version = fuse_get_attr_version(fm->fc); fuse_setattr_fill(fc, &args, inode, &inarg, &outarg); err = fuse_simple_request(fm, &args); if (err) { @@ -2055,6 +2072,14 @@ int fuse_do_setattr(struct mnt_idmap *idmap, struct dentry *dentry, /* FIXME: clear I_DIRTY_SYNC? */ } + if (fi->attr_version > attr_version) { + /* + * Apply attributes, for example for fsnotify_change(), but set + * attribute timeout to zero. 
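Note: the epoch changes above stamp each dentry's d_time with the value of fc->epoch sampled before the lookup/create, and fuse_dentry_revalidate() treats any dentry whose stamp is older than the current epoch as invalid; FUSE_NOTIFY_INC_EPOCH (earlier in this diff) just bumps the counter so every existing dentry goes stale at once. A minimal C11 sketch of that invalidation pattern, entirely outside the kernel and with made-up names:

#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

static atomic_int epoch = 1;            /* stands in for fc->epoch */

struct cache_entry {
	const char *name;
	int stamp;                      /* epoch at creation, like d_time */
};

static struct cache_entry make_entry(const char *name)
{
	/* Sample the epoch before the lookup, stamp after success,
	 * mirroring fuse_lookup()/fuse_create_open(). */
	int e = atomic_load(&epoch);
	return (struct cache_entry){ .name = name, .stamp = e };
}

static bool entry_valid(const struct cache_entry *ce)
{
	/* fuse_dentry_revalidate(): anything from an older epoch is stale. */
	return ce->stamp >= atomic_load(&epoch);
}

static void notify_inc_epoch(void)
{
	/* FUSE_NOTIFY_INC_EPOCH: one increment invalidates every entry. */
	atomic_fetch_add(&epoch, 1);
}

int main(void)
{
	struct cache_entry a = make_entry("a.txt");

	printf("before notify: %s\n", entry_valid(&a) ? "valid" : "stale");
	notify_inc_epoch();
	printf("after notify:  %s\n", entry_valid(&a) ? "valid" : "stale");
	return 0;
}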
+ */ + outarg.attr_valid = outarg.attr_valid_nsec = 0; + } + fuse_change_attributes_common(inode, &outarg.attr, NULL, ATTR_TIMEOUT(&outarg), fuse_get_cache_mask(inode), 0); @@ -2260,7 +2285,7 @@ void fuse_init_dir(struct inode *inode) static int fuse_symlink_read_folio(struct file *null, struct folio *folio) { - int err = fuse_readlink_page(folio->mapping->host, folio); + int err = fuse_readlink_folio(folio->mapping->host, folio); if (!err) folio_mark_uptodate(folio); diff --git a/fs/fuse/file.c b/fs/fuse/file.c index 754378dd9f71..f102afc03359 100644 --- a/fs/fuse/file.c +++ b/fs/fuse/file.c @@ -415,89 +415,11 @@ u64 fuse_lock_owner_id(struct fuse_conn *fc, fl_owner_t id) struct fuse_writepage_args { struct fuse_io_args ia; - struct rb_node writepages_entry; struct list_head queue_entry; - struct fuse_writepage_args *next; struct inode *inode; struct fuse_sync_bucket *bucket; }; -static struct fuse_writepage_args *fuse_find_writeback(struct fuse_inode *fi, - pgoff_t idx_from, pgoff_t idx_to) -{ - struct rb_node *n; - - n = fi->writepages.rb_node; - - while (n) { - struct fuse_writepage_args *wpa; - pgoff_t curr_index; - - wpa = rb_entry(n, struct fuse_writepage_args, writepages_entry); - WARN_ON(get_fuse_inode(wpa->inode) != fi); - curr_index = wpa->ia.write.in.offset >> PAGE_SHIFT; - if (idx_from >= curr_index + wpa->ia.ap.num_folios) - n = n->rb_right; - else if (idx_to < curr_index) - n = n->rb_left; - else - return wpa; - } - return NULL; -} - -/* - * Check if any page in a range is under writeback - */ -static bool fuse_range_is_writeback(struct inode *inode, pgoff_t idx_from, - pgoff_t idx_to) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - bool found; - - if (RB_EMPTY_ROOT(&fi->writepages)) - return false; - - spin_lock(&fi->lock); - found = fuse_find_writeback(fi, idx_from, idx_to); - spin_unlock(&fi->lock); - - return found; -} - -static inline bool fuse_page_is_writeback(struct inode *inode, pgoff_t index) -{ - return fuse_range_is_writeback(inode, index, index); -} - -/* - * Wait for page writeback to be completed. - * - * Since fuse doesn't rely on the VM writeback tracking, this has to - * use some other means. - */ -static void fuse_wait_on_page_writeback(struct inode *inode, pgoff_t index) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - - wait_event(fi->page_waitq, !fuse_page_is_writeback(inode, index)); -} - -static inline bool fuse_folio_is_writeback(struct inode *inode, - struct folio *folio) -{ - pgoff_t last = folio_next_index(folio) - 1; - return fuse_range_is_writeback(inode, folio_index(folio), last); -} - -static void fuse_wait_on_folio_writeback(struct inode *inode, - struct folio *folio) -{ - struct fuse_inode *fi = get_fuse_inode(inode); - - wait_event(fi->page_waitq, !fuse_folio_is_writeback(inode, folio)); -} - /* * Wait for all pending writepages on the inode to finish. 
* @@ -532,10 +454,6 @@ static int fuse_flush(struct file *file, fl_owner_t id) if (err) return err; - inode_lock(inode); - fuse_sync_writes(inode); - inode_unlock(inode); - err = filemap_check_errors(file->f_mapping); if (err) return err; @@ -875,7 +793,7 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio) struct inode *inode = folio->mapping->host; struct fuse_mount *fm = get_fuse_mount(inode); loff_t pos = folio_pos(folio); - struct fuse_folio_desc desc = { .length = PAGE_SIZE }; + struct fuse_folio_desc desc = { .length = folio_size(folio) }; struct fuse_io_args ia = { .ap.args.page_zeroing = true, .ap.args.out_pages = true, @@ -886,13 +804,6 @@ static int fuse_do_readfolio(struct file *file, struct folio *folio) ssize_t res; u64 attr_ver; - /* - * With the temporary pages that are used to complete writeback, we can - * have writeback that extends beyond the lifetime of the folio. So - * make sure we read a properly synced folio. - */ - fuse_wait_on_folio_writeback(inode, folio); - attr_ver = fuse_get_attr_version(fm->fc); /* Don't overflow end offset */ @@ -965,14 +876,13 @@ static void fuse_readpages_end(struct fuse_mount *fm, struct fuse_args *args, fuse_io_free(ia); } -static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) +static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file, + unsigned int count) { struct fuse_file *ff = file->private_data; struct fuse_mount *fm = ff->fm; struct fuse_args_pages *ap = &ia->ap; loff_t pos = folio_pos(ap->folios[0]); - /* Currently, all folios in FUSE are one page */ - size_t count = ap->num_folios << PAGE_SHIFT; ssize_t res; int err; @@ -1005,17 +915,13 @@ static void fuse_send_readpages(struct fuse_io_args *ia, struct file *file) static void fuse_readahead(struct readahead_control *rac) { struct inode *inode = rac->mapping->host; - struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); unsigned int max_pages, nr_pages; - pgoff_t first = readahead_index(rac); - pgoff_t last = first + readahead_count(rac) - 1; + struct folio *folio = NULL; if (fuse_is_bad(inode)) return; - wait_event(fi->page_waitq, !fuse_range_is_writeback(inode, first, last)); - max_pages = min_t(unsigned int, fc->max_pages, fc->max_read / PAGE_SIZE); @@ -1033,8 +939,8 @@ static void fuse_readahead(struct readahead_control *rac) while (nr_pages) { struct fuse_io_args *ia; struct fuse_args_pages *ap; - struct folio *folio; unsigned cur_pages = min(max_pages, nr_pages); + unsigned int pages = 0; if (fc->num_background >= fc->congestion_threshold && rac->ra->async_size >= readahead_count(rac)) @@ -1046,10 +952,12 @@ static void fuse_readahead(struct readahead_control *rac) ia = fuse_io_alloc(NULL, cur_pages); if (!ia) - return; + break; ap = &ia->ap; - while (ap->num_folios < cur_pages) { + while (pages < cur_pages) { + unsigned int folio_pages; + /* * This returns a folio with a ref held on it. * The ref needs to be held until the request is @@ -1057,13 +965,31 @@ static void fuse_readahead(struct readahead_control *rac) * fuse_try_move_page()) drops the ref after it's * replaced in the page cache. */ - folio = __readahead_folio(rac); + if (!folio) + folio = __readahead_folio(rac); + + folio_pages = folio_nr_pages(folio); + if (folio_pages > cur_pages - pages) { + /* + * Large folios belonging to fuse will never + * have more pages than max_pages. 
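Note: the reworked fuse_readahead() (continued just below) packs whole folios into each read request until the per-request page budget is reached; a folio that would overflow the current request is held in a local variable and becomes the first member of the next one. The sketch below shows the same batching shape with plain item sizes; it simplifies the carry-over by peeking at the next size instead of holding an already-consumed folio, and all names and limits are illustrative.

#include <stdio.h>

#define MAX_PER_BATCH 8u    /* stands in for the per-request page budget */

/* Pack items (sizes in "pages") into batches of at most MAX_PER_BATCH pages;
 * an item that does not fit starts the next batch instead. */
static void pack(const unsigned int *sizes, unsigned int n)
{
	unsigned int i = 0;

	while (i < n) {
		unsigned int used = 0;

		printf("batch:");
		while (i < n && sizes[i] <= MAX_PER_BATCH - used) {
			printf(" %u", sizes[i]);
			used += sizes[i];
			i++;
		}
		printf("  -> %u/%u pages\n", used, MAX_PER_BATCH);

		if (used == 0) {
			/* A single item bigger than the whole budget; the
			 * kernel WARNs here because fuse folios never exceed
			 * max_pages.  Skip it so the demo terminates. */
			i++;
		}
	}
}

int main(void)
{
	unsigned int sizes[] = { 1, 1, 4, 4, 2, 8, 1 };

	pack(sizes, sizeof(sizes) / sizeof(sizes[0]));
	return 0;
}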
+ */ + WARN_ON(!pages); + break; + } + ap->folios[ap->num_folios] = folio; ap->descs[ap->num_folios].length = folio_size(folio); ap->num_folios++; + pages += folio_pages; + folio = NULL; } - fuse_send_readpages(ia, rac->file); - nr_pages -= cur_pages; + fuse_send_readpages(ia, rac->file, pages << PAGE_SHIFT); + nr_pages -= pages; + } + if (folio) { + folio_end_read(folio, false); + folio_put(folio); } } @@ -1181,7 +1107,7 @@ static ssize_t fuse_send_write_pages(struct fuse_io_args *ia, int err; for (i = 0; i < ap->num_folios; i++) - fuse_wait_on_folio_writeback(inode, ap->folios[i]); + folio_wait_writeback(ap->folios[i]); fuse_write_args_fill(ia, ff, pos, count); ia->write.in.flags = fuse_write_flags(iocb); @@ -1226,27 +1152,24 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, struct fuse_args_pages *ap = &ia->ap; struct fuse_conn *fc = get_fuse_conn(mapping->host); unsigned offset = pos & (PAGE_SIZE - 1); - unsigned int nr_pages = 0; size_t count = 0; - int err; + unsigned int num; + int err = 0; + + num = min(iov_iter_count(ii), fc->max_write); + num = min(num, max_pages << PAGE_SHIFT); ap->args.in_pages = true; ap->descs[0].offset = offset; - do { + while (num) { size_t tmp; struct folio *folio; pgoff_t index = pos >> PAGE_SHIFT; - size_t bytes = min_t(size_t, PAGE_SIZE - offset, - iov_iter_count(ii)); - - bytes = min_t(size_t, bytes, fc->max_write - count); + unsigned int bytes; + unsigned int folio_offset; again: - err = -EFAULT; - if (fault_in_iov_iter_readable(ii, bytes)) - break; - folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN, mapping_gfp_mask(mapping)); if (IS_ERR(folio)) { @@ -1257,29 +1180,42 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, if (mapping_writably_mapped(mapping)) flush_dcache_folio(folio); - tmp = copy_folio_from_iter_atomic(folio, offset, bytes, ii); + folio_offset = ((index - folio->index) << PAGE_SHIFT) + offset; + bytes = min(folio_size(folio) - folio_offset, num); + + tmp = copy_folio_from_iter_atomic(folio, folio_offset, bytes, ii); flush_dcache_folio(folio); if (!tmp) { folio_unlock(folio); folio_put(folio); + + /* + * Ensure forward progress by faulting in + * while not holding the folio lock: + */ + if (fault_in_iov_iter_readable(ii, bytes)) { + err = -EFAULT; + break; + } + goto again; } - err = 0; ap->folios[ap->num_folios] = folio; + ap->descs[ap->num_folios].offset = folio_offset; ap->descs[ap->num_folios].length = tmp; ap->num_folios++; - nr_pages++; count += tmp; pos += tmp; + num -= tmp; offset += tmp; - if (offset == PAGE_SIZE) + if (offset == folio_size(folio)) offset = 0; - /* If we copied full page, mark it uptodate */ - if (tmp == PAGE_SIZE) + /* If we copied full folio, mark it uptodate */ + if (tmp == folio_size(folio)) folio_mark_uptodate(folio); if (folio_test_uptodate(folio)) { @@ -1288,10 +1224,9 @@ static ssize_t fuse_fill_write_pages(struct fuse_io_args *ia, ia->write.folio_locked = true; break; } - if (!fc->big_writes) + if (!fc->big_writes || offset != 0) break; - } while (iov_iter_count(ii) && count < fc->max_write && - nr_pages < max_pages && offset == 0); + } return count > 0 ? 
count : err; } @@ -1638,7 +1573,7 @@ ssize_t fuse_direct_io(struct fuse_io_priv *io, struct iov_iter *iter, return res; } } - if (!cuse && fuse_range_is_writeback(inode, idx_from, idx_to)) { + if (!cuse && filemap_range_has_writeback(mapping, pos, (pos + count - 1))) { if (!write) inode_lock(inode); fuse_sync_writes(inode); @@ -1835,38 +1770,34 @@ static ssize_t fuse_splice_write(struct pipe_inode_info *pipe, struct file *out, static void fuse_writepage_free(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; - int i; if (wpa->bucket) fuse_sync_bucket_dec(wpa->bucket); - for (i = 0; i < ap->num_folios; i++) - folio_put(ap->folios[i]); - fuse_file_put(wpa->ia.ff, false); kfree(ap->folios); kfree(wpa); } -static void fuse_writepage_finish_stat(struct inode *inode, struct folio *folio) -{ - struct backing_dev_info *bdi = inode_to_bdi(inode); - - dec_wb_stat(&bdi->wb, WB_WRITEBACK); - node_stat_sub_folio(folio, NR_WRITEBACK_TEMP); - wb_writeout_inc(&bdi->wb); -} - static void fuse_writepage_finish(struct fuse_writepage_args *wpa) { struct fuse_args_pages *ap = &wpa->ia.ap; struct inode *inode = wpa->inode; struct fuse_inode *fi = get_fuse_inode(inode); + struct backing_dev_info *bdi = inode_to_bdi(inode); int i; - for (i = 0; i < ap->num_folios; i++) - fuse_writepage_finish_stat(inode, ap->folios[i]); + for (i = 0; i < ap->num_folios; i++) { + /* + * Benchmarks showed that ending writeback within the + * scope of the fi->lock alleviates xarray lock + * contention and noticeably improves performance. + */ + folio_end_writeback(ap->folios[i]); + dec_wb_stat(&bdi->wb, WB_WRITEBACK); + wb_writeout_inc(&bdi->wb); + } wake_up(&fi->page_waitq); } @@ -1877,13 +1808,15 @@ static void fuse_send_writepage(struct fuse_mount *fm, __releases(fi->lock) __acquires(fi->lock) { - struct fuse_writepage_args *aux, *next; struct fuse_inode *fi = get_fuse_inode(wpa->inode); + struct fuse_args_pages *ap = &wpa->ia.ap; struct fuse_write_in *inarg = &wpa->ia.write.in; - struct fuse_args *args = &wpa->ia.ap.args; - /* Currently, all folios in FUSE are one page */ - __u64 data_size = wpa->ia.ap.num_folios * PAGE_SIZE; - int err; + struct fuse_args *args = &ap->args; + __u64 data_size = 0; + int err, i; + + for (i = 0; i < ap->num_folios; i++) + data_size += ap->descs[i].length; fi->writectr++; if (inarg->offset + data_size <= size) { @@ -1914,19 +1847,8 @@ __acquires(fi->lock) out_free: fi->writectr--; - rb_erase(&wpa->writepages_entry, &fi->writepages); fuse_writepage_finish(wpa); spin_unlock(&fi->lock); - - /* After rb_erase() aux request list is private */ - for (aux = wpa->next; aux; aux = next) { - next = aux->next; - aux->next = NULL; - fuse_writepage_finish_stat(aux->inode, - aux->ia.ap.folios[0]); - fuse_writepage_free(aux); - } - fuse_writepage_free(wpa); spin_lock(&fi->lock); } @@ -1954,43 +1876,6 @@ __acquires(fi->lock) } } -static struct fuse_writepage_args *fuse_insert_writeback(struct rb_root *root, - struct fuse_writepage_args *wpa) -{ - pgoff_t idx_from = wpa->ia.write.in.offset >> PAGE_SHIFT; - pgoff_t idx_to = idx_from + wpa->ia.ap.num_folios - 1; - struct rb_node **p = &root->rb_node; - struct rb_node *parent = NULL; - - WARN_ON(!wpa->ia.ap.num_folios); - while (*p) { - struct fuse_writepage_args *curr; - pgoff_t curr_index; - - parent = *p; - curr = rb_entry(parent, struct fuse_writepage_args, - writepages_entry); - WARN_ON(curr->inode != wpa->inode); - curr_index = curr->ia.write.in.offset >> PAGE_SHIFT; - - if (idx_from >= curr_index + curr->ia.ap.num_folios) - p = 
&(*p)->rb_right; - else if (idx_to < curr_index) - p = &(*p)->rb_left; - else - return curr; - } - - rb_link_node(&wpa->writepages_entry, parent, p); - rb_insert_color(&wpa->writepages_entry, root); - return NULL; -} - -static void tree_insert(struct rb_root *root, struct fuse_writepage_args *wpa) -{ - WARN_ON(fuse_insert_writeback(root, wpa)); -} - static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, int error) { @@ -2010,41 +1895,6 @@ static void fuse_writepage_end(struct fuse_mount *fm, struct fuse_args *args, if (!fc->writeback_cache) fuse_invalidate_attr_mask(inode, FUSE_STATX_MODIFY); spin_lock(&fi->lock); - rb_erase(&wpa->writepages_entry, &fi->writepages); - while (wpa->next) { - struct fuse_mount *fm = get_fuse_mount(inode); - struct fuse_write_in *inarg = &wpa->ia.write.in; - struct fuse_writepage_args *next = wpa->next; - - wpa->next = next->next; - next->next = NULL; - tree_insert(&fi->writepages, next); - - /* - * Skip fuse_flush_writepages() to make it easy to crop requests - * based on primary request size. - * - * 1st case (trivial): there are no concurrent activities using - * fuse_set/release_nowrite. Then we're on safe side because - * fuse_flush_writepages() would call fuse_send_writepage() - * anyway. - * - * 2nd case: someone called fuse_set_nowrite and it is waiting - * now for completion of all in-flight requests. This happens - * rarely and no more than once per page, so this should be - * okay. - * - * 3rd case: someone (e.g. fuse_do_setattr()) is in the middle - * of fuse_set_nowrite..fuse_release_nowrite section. The fact - * that fuse_set_nowrite returned implies that all in-flight - * requests were completed along with all of their secondary - * requests. Further primary requests are blocked by negative - * writectr. Hence there cannot be any in-flight requests and - * no invocations of fuse_writepage_end() while we're in - * fuse_set_nowrite..fuse_release_nowrite section. 
- */ - fuse_send_writepage(fm, next, inarg->offset + inarg->size); - } fi->writectr--; fuse_writepage_finish(wpa); spin_unlock(&fi->lock); @@ -2131,19 +1981,16 @@ static void fuse_writepage_add_to_bucket(struct fuse_conn *fc, } static void fuse_writepage_args_page_fill(struct fuse_writepage_args *wpa, struct folio *folio, - struct folio *tmp_folio, uint32_t folio_index) + uint32_t folio_index) { struct inode *inode = folio->mapping->host; struct fuse_args_pages *ap = &wpa->ia.ap; - folio_copy(tmp_folio, folio); - - ap->folios[folio_index] = tmp_folio; + ap->folios[folio_index] = folio; ap->descs[folio_index].offset = 0; - ap->descs[folio_index].length = PAGE_SIZE; + ap->descs[folio_index].length = folio_size(folio); inc_wb_stat(&inode_to_bdi(inode)->wb, WB_WRITEBACK); - node_stat_add_folio(tmp_folio, NR_WRITEBACK_TEMP); } static struct fuse_writepage_args *fuse_writepage_args_setup(struct folio *folio, @@ -2178,18 +2025,12 @@ static int fuse_writepage_locked(struct folio *folio) struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_writepage_args *wpa; struct fuse_args_pages *ap; - struct folio *tmp_folio; struct fuse_file *ff; - int error = -ENOMEM; + int error = -EIO; - tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); - if (!tmp_folio) - goto err; - - error = -EIO; ff = fuse_write_file_get(fi); if (!ff) - goto err_nofile; + goto err; wpa = fuse_writepage_args_setup(folio, ff); error = -ENOMEM; @@ -2200,22 +2041,17 @@ static int fuse_writepage_locked(struct folio *folio) ap->num_folios = 1; folio_start_writeback(folio); - fuse_writepage_args_page_fill(wpa, folio, tmp_folio, 0); + fuse_writepage_args_page_fill(wpa, folio, 0); spin_lock(&fi->lock); - tree_insert(&fi->writepages, wpa); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); - folio_end_writeback(folio); - return 0; err_writepage_args: fuse_file_put(ff, false); -err_nofile: - folio_put(tmp_folio); err: mapping_set_error(folio->mapping, error); return error; @@ -2225,8 +2061,8 @@ struct fuse_fill_wb_data { struct fuse_writepage_args *wpa; struct fuse_file *ff; struct inode *inode; - struct folio **orig_folios; unsigned int max_folios; + unsigned int nr_pages; }; static bool fuse_pages_realloc(struct fuse_fill_wb_data *data) @@ -2260,69 +2096,11 @@ static void fuse_writepages_send(struct fuse_fill_wb_data *data) struct fuse_writepage_args *wpa = data->wpa; struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); - int num_folios = wpa->ia.ap.num_folios; - int i; spin_lock(&fi->lock); list_add_tail(&wpa->queue_entry, &fi->queued_writes); fuse_flush_writepages(inode); spin_unlock(&fi->lock); - - for (i = 0; i < num_folios; i++) - folio_end_writeback(data->orig_folios[i]); -} - -/* - * Check under fi->lock if the page is under writeback, and insert it onto the - * rb_tree if not. Otherwise iterate auxiliary write requests, to see if there's - * one already added for a page at this offset. If there's none, then insert - * this new request onto the auxiliary list, otherwise reuse the existing one by - * swapping the new temp page with the old one. 
- */ -static bool fuse_writepage_add(struct fuse_writepage_args *new_wpa, - struct folio *folio) -{ - struct fuse_inode *fi = get_fuse_inode(new_wpa->inode); - struct fuse_writepage_args *tmp; - struct fuse_writepage_args *old_wpa; - struct fuse_args_pages *new_ap = &new_wpa->ia.ap; - - WARN_ON(new_ap->num_folios != 0); - new_ap->num_folios = 1; - - spin_lock(&fi->lock); - old_wpa = fuse_insert_writeback(&fi->writepages, new_wpa); - if (!old_wpa) { - spin_unlock(&fi->lock); - return true; - } - - for (tmp = old_wpa->next; tmp; tmp = tmp->next) { - pgoff_t curr_index; - - WARN_ON(tmp->inode != new_wpa->inode); - curr_index = tmp->ia.write.in.offset >> PAGE_SHIFT; - if (curr_index == folio->index) { - WARN_ON(tmp->ia.ap.num_folios != 1); - swap(tmp->ia.ap.folios[0], new_ap->folios[0]); - break; - } - } - - if (!tmp) { - new_wpa->next = old_wpa->next; - old_wpa->next = new_wpa; - } - - spin_unlock(&fi->lock); - - if (tmp) { - fuse_writepage_finish_stat(new_wpa->inode, - folio); - fuse_writepage_free(new_wpa); - } - - return false; } static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio, @@ -2331,25 +2109,16 @@ static bool fuse_writepage_need_send(struct fuse_conn *fc, struct folio *folio, { WARN_ON(!ap->num_folios); - /* - * Being under writeback is unlikely but possible. For example direct - * read to an mmaped fuse file will set the page dirty twice; once when - * the pages are faulted with get_user_pages(), and then after the read - * completed. - */ - if (fuse_folio_is_writeback(data->inode, folio)) - return true; - /* Reached max pages */ - if (ap->num_folios == fc->max_pages) + if (data->nr_pages + folio_nr_pages(folio) > fc->max_pages) return true; /* Reached max write bytes */ - if ((ap->num_folios + 1) * PAGE_SIZE > fc->max_write) + if ((data->nr_pages * PAGE_SIZE) + folio_size(folio) > fc->max_write) return true; /* Discontinuity */ - if (data->orig_folios[ap->num_folios - 1]->index + 1 != folio_index(folio)) + if (folio_next_index(ap->folios[ap->num_folios - 1]) != folio->index) return true; /* Need to grow the pages array? If so, did the expansion fail? */ @@ -2368,7 +2137,6 @@ static int fuse_writepages_fill(struct folio *folio, struct inode *inode = data->inode; struct fuse_inode *fi = get_fuse_inode(inode); struct fuse_conn *fc = get_fuse_conn(inode); - struct folio *tmp_folio; int err; if (!data->ff) { @@ -2381,56 +2149,27 @@ static int fuse_writepages_fill(struct folio *folio, if (wpa && fuse_writepage_need_send(fc, folio, ap, data)) { fuse_writepages_send(data); data->wpa = NULL; + data->nr_pages = 0; } - err = -ENOMEM; - tmp_folio = folio_alloc(GFP_NOFS | __GFP_HIGHMEM, 0); - if (!tmp_folio) - goto out_unlock; - - /* - * The page must not be redirtied until the writeout is completed - * (i.e. userspace has sent a reply to the write request). Otherwise - * there could be more than one temporary page instance for each real - * page. - * - * This is ensured by holding the page lock in page_mkwrite() while - * checking fuse_page_is_writeback(). We already hold the page lock - * since clear_page_dirty_for_io() and keep it held until we add the - * request to the fi->writepages list and increment ap->num_folios. - * After this fuse_page_is_writeback() will indicate that the page is - * under writeback, so we can release the page lock. 
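Note: with the temporary-page machinery removed, fuse_writepage_need_send() above is reduced to three checks: flush the request being built if adding the next folio would exceed the connection's page budget, exceed max_write bytes, or break contiguity with the previous folio. A compact userspace sketch of those checks, with hypothetical struct and field names standing in for the fuse types:

#include <stdbool.h>
#include <stdint.h>
#include <stdio.h>

#define PAGE_SIZE 4096u

struct batch {
	uint64_t next_index;     /* page index right after the last folio */
	unsigned int nr_pages;   /* pages accumulated so far */
};

struct limits {
	unsigned int max_pages;  /* like fc->max_pages */
	unsigned int max_write;  /* like fc->max_write, in bytes */
};

/* Mirror of the three tests in fuse_writepage_need_send(): send the current
 * write request before adding this folio if any of them trips. */
static bool need_send(const struct batch *b, const struct limits *l,
		      uint64_t folio_index, unsigned int folio_pages)
{
	if (b->nr_pages + folio_pages > l->max_pages)
		return true;                            /* page budget */
	if (b->nr_pages * PAGE_SIZE + folio_pages * PAGE_SIZE > l->max_write)
		return true;                            /* byte budget */
	if (folio_index != b->next_index)
		return true;                            /* discontiguous */
	return false;
}

int main(void)
{
	struct limits l = { .max_pages = 32, .max_write = 64 * 1024 };
	struct batch b = { .next_index = 100, .nr_pages = 12 };

	printf("contiguous 4-page folio: %s\n",
	       need_send(&b, &l, 100, 4) ? "flush first" : "append");
	printf("folio at index 200:      %s\n",
	       need_send(&b, &l, 200, 4) ? "flush first" : "append");
	printf("8-page folio, over max_write: %s\n",
	       need_send(&b, &l, 100, 8) ? "flush first" : "append");
	return 0;
}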
- */ if (data->wpa == NULL) { err = -ENOMEM; wpa = fuse_writepage_args_setup(folio, data->ff); - if (!wpa) { - folio_put(tmp_folio); + if (!wpa) goto out_unlock; - } fuse_file_get(wpa->ia.ff); data->max_folios = 1; ap = &wpa->ia.ap; } folio_start_writeback(folio); - fuse_writepage_args_page_fill(wpa, folio, tmp_folio, ap->num_folios); - data->orig_folios[ap->num_folios] = folio; + fuse_writepage_args_page_fill(wpa, folio, ap->num_folios); + data->nr_pages += folio_nr_pages(folio); err = 0; - if (data->wpa) { - /* - * Protected by fi->lock against concurrent access by - * fuse_page_is_writeback(). - */ - spin_lock(&fi->lock); - ap->num_folios++; - spin_unlock(&fi->lock); - } else if (fuse_writepage_add(wpa, folio)) { + ap->num_folios++; + if (!data->wpa) data->wpa = wpa; - } else { - folio_end_writeback(folio); - } out_unlock: folio_unlock(folio); @@ -2456,13 +2195,7 @@ static int fuse_writepages(struct address_space *mapping, data.inode = inode; data.wpa = NULL; data.ff = NULL; - - err = -ENOMEM; - data.orig_folios = kcalloc(fc->max_pages, - sizeof(struct folio *), - GFP_NOFS); - if (!data.orig_folios) - goto out; + data.nr_pages = 0; err = write_cache_pages(mapping, wbc, fuse_writepages_fill, &data); if (data.wpa) { @@ -2472,7 +2205,6 @@ static int fuse_writepages(struct address_space *mapping, if (data.ff) fuse_file_put(data.ff, false); - kfree(data.orig_folios); out: return err; } @@ -2497,8 +2229,6 @@ static int fuse_write_begin(struct file *file, struct address_space *mapping, if (IS_ERR(folio)) goto error; - fuse_wait_on_page_writeback(mapping->host, folio->index); - if (folio_test_uptodate(folio) || len >= folio_size(folio)) goto success; /* @@ -2561,13 +2291,9 @@ static int fuse_launder_folio(struct folio *folio) { int err = 0; if (folio_clear_dirty_for_io(folio)) { - struct inode *inode = folio->mapping->host; - - /* Serialize with pending writeback for the same page */ - fuse_wait_on_page_writeback(inode, folio->index); err = fuse_writepage_locked(folio); if (!err) - fuse_wait_on_page_writeback(inode, folio->index); + folio_wait_writeback(folio); } return err; } @@ -2611,7 +2337,7 @@ static vm_fault_t fuse_page_mkwrite(struct vm_fault *vmf) return VM_FAULT_NOPAGE; } - fuse_wait_on_folio_writeback(inode, folio); + folio_wait_writeback(folio); return VM_FAULT_LOCKED; } @@ -3429,9 +3155,12 @@ static const struct address_space_operations fuse_file_aops = { void fuse_init_file_inode(struct inode *inode, unsigned int flags) { struct fuse_inode *fi = get_fuse_inode(inode); + struct fuse_conn *fc = get_fuse_conn(inode); inode->i_fop = &fuse_file_operations; inode->i_data.a_ops = &fuse_file_aops; + if (fc->writeback_cache) + mapping_set_writeback_may_deadlock_on_reclaim(&inode->i_data); INIT_LIST_HEAD(&fi->write_files); INIT_LIST_HEAD(&fi->queued_writes); @@ -3439,7 +3168,6 @@ void fuse_init_file_inode(struct inode *inode, unsigned int flags) fi->iocachectr = 0; init_waitqueue_head(&fi->page_waitq); init_waitqueue_head(&fi->direct_io_waitq); - fi->writepages = RB_ROOT; if (IS_ENABLED(CONFIG_FUSE_DAX)) fuse_dax_inode_init(inode, flags); diff --git a/fs/fuse/fuse_dev_i.h b/fs/fuse/fuse_dev_i.h index b3c2e32254ba..5a9bd771a319 100644 --- a/fs/fuse/fuse_dev_i.h +++ b/fs/fuse/fuse_dev_i.h @@ -20,7 +20,6 @@ struct fuse_iqueue; struct fuse_forget_link; struct fuse_copy_state { - int write; struct fuse_req *req; struct iov_iter *iter; struct pipe_buffer *pipebufs; @@ -30,8 +29,9 @@ struct fuse_copy_state { struct page *pg; unsigned int len; unsigned int offset; - unsigned int move_pages:1; - 
unsigned int is_uring:1; + bool write:1; + bool move_folios:1; + bool is_uring:1; struct { unsigned int copied_sz; /* copied size into the user buffer */ } ring; @@ -51,7 +51,7 @@ struct fuse_req *fuse_request_find(struct fuse_pqueue *fpq, u64 unique); void fuse_dev_end_requests(struct list_head *head); -void fuse_copy_init(struct fuse_copy_state *cs, int write, +void fuse_copy_init(struct fuse_copy_state *cs, bool write, struct iov_iter *iter); int fuse_copy_args(struct fuse_copy_state *cs, unsigned int numargs, unsigned int argpages, struct fuse_arg *args, @@ -64,7 +64,6 @@ void fuse_dev_queue_interrupt(struct fuse_iqueue *fiq, struct fuse_req *req); bool fuse_remove_pending_req(struct fuse_req *req, spinlock_t *lock); bool fuse_request_expired(struct fuse_conn *fc, struct list_head *list); -bool fuse_fpq_processing_expired(struct fuse_conn *fc, struct list_head *processing); #endif diff --git a/fs/fuse/fuse_i.h b/fs/fuse/fuse_i.h index d56d4fd956db..b54f4f57789f 100644 --- a/fs/fuse/fuse_i.h +++ b/fs/fuse/fuse_i.h @@ -74,8 +74,8 @@ extern struct list_head fuse_conn_list; extern struct mutex fuse_mutex; /** Module parameters */ -extern unsigned max_user_bgreq; -extern unsigned max_user_congthresh; +extern unsigned int max_user_bgreq; +extern unsigned int max_user_congthresh; /* One forget request */ struct fuse_forget_link { @@ -161,9 +161,6 @@ struct fuse_inode { /* waitq for direct-io completion */ wait_queue_head_t direct_io_waitq; - - /* List of writepage requestst (pending or sent) */ - struct rb_root writepages; }; /* readdir cache (directory only) */ @@ -636,6 +633,9 @@ struct fuse_conn { /** Number of fuse_dev's */ atomic_t dev_count; + /** Current epoch for up-to-date dentries */ + atomic_t epoch; + struct rcu_head rcu; /** The user id for this mount */ diff --git a/fs/fuse/inode.c b/fs/fuse/inode.c index fd48e8d37f2e..bfe8d8af46f3 100644 --- a/fs/fuse/inode.c +++ b/fs/fuse/inode.c @@ -41,7 +41,7 @@ unsigned int fuse_max_pages_limit = 256; unsigned int fuse_default_req_timeout; unsigned int fuse_max_req_timeout; -unsigned max_user_bgreq; +unsigned int max_user_bgreq; module_param_call(max_user_bgreq, set_global_limit, param_get_uint, &max_user_bgreq, 0644); __MODULE_PARM_TYPE(max_user_bgreq, "uint"); @@ -49,7 +49,7 @@ MODULE_PARM_DESC(max_user_bgreq, "Global limit for the maximum number of backgrounded requests an " "unprivileged user can set"); -unsigned max_user_congthresh; +unsigned int max_user_congthresh; module_param_call(max_user_congthresh, set_global_limit, param_get_uint, &max_user_congthresh, 0644); __MODULE_PARM_TYPE(max_user_congthresh, "uint"); @@ -962,6 +962,7 @@ void fuse_conn_init(struct fuse_conn *fc, struct fuse_mount *fm, init_rwsem(&fc->killsb); refcount_set(&fc->count, 1); atomic_set(&fc->dev_count, 1); + atomic_set(&fc->epoch, 1); init_waitqueue_head(&fc->blocked_waitq); fuse_iqueue_init(&fc->iq, fiq_ops, fiq_priv); INIT_LIST_HEAD(&fc->bg_queue); @@ -1036,7 +1037,7 @@ struct fuse_conn *fuse_conn_get(struct fuse_conn *fc) } EXPORT_SYMBOL_GPL(fuse_conn_get); -static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned mode) +static struct inode *fuse_get_root_inode(struct super_block *sb, unsigned int mode) { struct fuse_attr attr; memset(&attr, 0, sizeof(attr)); @@ -1211,7 +1212,7 @@ static const struct super_operations fuse_super_operations = { .show_options = fuse_show_options, }; -static void sanitize_global_limit(unsigned *limit) +static void sanitize_global_limit(unsigned int *limit) { /* * The default maximum number of async requests is 
calculated to consume @@ -1232,7 +1233,7 @@ static int set_global_limit(const char *val, const struct kernel_param *kp) if (rv) return rv; - sanitize_global_limit((unsigned *)kp->arg); + sanitize_global_limit((unsigned int *)kp->arg); return 0; } diff --git a/fs/fuse/readdir.c b/fs/fuse/readdir.c index edcd6f18a8a8..c2aae2eef086 100644 --- a/fs/fuse/readdir.c +++ b/fs/fuse/readdir.c @@ -161,6 +161,7 @@ static int fuse_direntplus_link(struct file *file, struct fuse_conn *fc; struct inode *inode; DECLARE_WAIT_QUEUE_HEAD_ONSTACK(wq); + int epoch; if (!o->nodeid) { /* @@ -190,6 +191,7 @@ static int fuse_direntplus_link(struct file *file, return -EIO; fc = get_fuse_conn(dir); + epoch = atomic_read(&fc->epoch); name.hash = full_name_hash(parent, name.name, name.len); dentry = d_lookup(parent, &name); @@ -256,6 +258,7 @@ retry: } if (fc->readdirplus_auto) set_bit(FUSE_I_INIT_RDPLUS, &get_fuse_inode(inode)->state); + dentry->d_time = epoch; fuse_change_entry_timeout(dentry, o); dput(dentry); @@ -332,35 +335,32 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) { int plus; ssize_t res; - struct folio *folio; struct inode *inode = file_inode(file); struct fuse_mount *fm = get_fuse_mount(inode); + struct fuse_conn *fc = fm->fc; struct fuse_io_args ia = {}; - struct fuse_args_pages *ap = &ia.ap; - struct fuse_folio_desc desc = { .length = PAGE_SIZE }; + struct fuse_args *args = &ia.ap.args; + void *buf; + size_t bufsize = clamp((unsigned int) ctx->count, PAGE_SIZE, fc->max_pages << PAGE_SHIFT); u64 attr_version = 0, evict_ctr = 0; bool locked; - folio = folio_alloc(GFP_KERNEL, 0); - if (!folio) + buf = kvmalloc(bufsize, GFP_KERNEL); + if (!buf) return -ENOMEM; + args->out_args[0].value = buf; + plus = fuse_use_readdirplus(inode, ctx); - ap->args.out_pages = true; - ap->num_folios = 1; - ap->folios = &folio; - ap->descs = &desc; if (plus) { attr_version = fuse_get_attr_version(fm->fc); evict_ctr = fuse_get_evict_ctr(fm->fc); - fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, - FUSE_READDIRPLUS); + fuse_read_args_fill(&ia, file, ctx->pos, bufsize, FUSE_READDIRPLUS); } else { - fuse_read_args_fill(&ia, file, ctx->pos, PAGE_SIZE, - FUSE_READDIR); + fuse_read_args_fill(&ia, file, ctx->pos, bufsize, FUSE_READDIR); } locked = fuse_lock_inode(inode); - res = fuse_simple_request(fm, &ap->args); + res = fuse_simple_request(fm, args); fuse_unlock_inode(inode, locked); if (res >= 0) { if (!res) { @@ -369,16 +369,14 @@ static int fuse_readdir_uncached(struct file *file, struct dir_context *ctx) if (ff->open_flags & FOPEN_CACHE_DIR) fuse_readdir_cache_end(file, ctx->pos); } else if (plus) { - res = parse_dirplusfile(folio_address(folio), res, - file, ctx, attr_version, + res = parse_dirplusfile(buf, res, file, ctx, attr_version, evict_ctr); } else { - res = parse_dirfile(folio_address(folio), res, file, - ctx); + res = parse_dirfile(buf, res, file, ctx); } } - folio_put(folio); + kvfree(buf); fuse_invalidate_atime(inode); return res; } diff --git a/fs/iomap/buffered-io.c b/fs/iomap/buffered-io.c index 233abf598f65..3729391a18f3 100644 --- a/fs/iomap/buffered-io.c +++ b/fs/iomap/buffered-io.c @@ -1691,6 +1691,8 @@ static int iomap_add_to_ioend(struct iomap_writepage_ctx *wpc, ioend_flags |= IOMAP_IOEND_UNWRITTEN; if (wpc->iomap.flags & IOMAP_F_SHARED) ioend_flags |= IOMAP_IOEND_SHARED; + if (folio_test_dropbehind(folio)) + ioend_flags |= IOMAP_IOEND_DONTCACHE; if (pos == wpc->iomap.offset && (wpc->iomap.flags & IOMAP_F_BOUNDARY)) ioend_flags |= IOMAP_IOEND_BOUNDARY; diff --git 
a/fs/isofs/inode.c b/fs/isofs/inode.c index 47038e660812..d5da9817df9b 100644 --- a/fs/isofs/inode.c +++ b/fs/isofs/inode.c @@ -1275,6 +1275,7 @@ static int isofs_read_inode(struct inode *inode, int relocated) unsigned long offset; struct iso_inode_info *ei = ISOFS_I(inode); int ret = -EIO; + struct timespec64 ts; block = ei->i_iget5_block; bh = sb_bread(inode->i_sb, block); @@ -1387,8 +1388,10 @@ static int isofs_read_inode(struct inode *inode, int relocated) inode->i_ino, de->flags[-high_sierra]); } #endif - inode_set_mtime_to_ts(inode, - inode_set_atime_to_ts(inode, inode_set_ctime(inode, iso_date(de->date, high_sierra), 0))); + ts = iso_date(de->date, high_sierra ? ISO_DATE_HIGH_SIERRA : 0); + inode_set_ctime_to_ts(inode, ts); + inode_set_atime_to_ts(inode, ts); + inode_set_mtime_to_ts(inode, ts); ei->i_first_extent = (isonum_733(de->extent) + isonum_711(de->ext_attr_length)); diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h index 2d55207c9a99..506555837533 100644 --- a/fs/isofs/isofs.h +++ b/fs/isofs/isofs.h @@ -106,7 +106,9 @@ static inline unsigned int isonum_733(u8 *p) /* Ignore bigendian datum due to broken mastering programs */ return get_unaligned_le32(p); } -extern int iso_date(u8 *, int); +#define ISO_DATE_HIGH_SIERRA (1 << 0) +#define ISO_DATE_LONG_FORM (1 << 1) +struct timespec64 iso_date(u8 *p, int flags); struct inode; /* To make gcc happy */ diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c index dbf911126e61..576498245b9d 100644 --- a/fs/isofs/rock.c +++ b/fs/isofs/rock.c @@ -412,7 +412,12 @@ repeat: } } break; - case SIG('T', 'F'): + case SIG('T', 'F'): { + int flags, size, slen; + + flags = rr->u.TF.flags & TF_LONG_FORM ? ISO_DATE_LONG_FORM : 0; + size = rr->u.TF.flags & TF_LONG_FORM ? 17 : 7; + slen = rr->len - 5; /* * Some RRIP writers incorrectly place ctime in the * TF_CREATE field. 
Try to handle this correctly for @@ -420,27 +425,28 @@ repeat: */ /* Rock ridge never appears on a High Sierra disk */ cnt = 0; - if (rr->u.TF.flags & TF_CREATE) { - inode_set_ctime(inode, - iso_date(rr->u.TF.times[cnt++].time, 0), - 0); + if ((rr->u.TF.flags & TF_CREATE) && size <= slen) { + inode_set_ctime_to_ts(inode, + iso_date(rr->u.TF.data + size * cnt++, flags)); + slen -= size; } - if (rr->u.TF.flags & TF_MODIFY) { - inode_set_mtime(inode, - iso_date(rr->u.TF.times[cnt++].time, 0), - 0); + if ((rr->u.TF.flags & TF_MODIFY) && size <= slen) { + inode_set_mtime_to_ts(inode, + iso_date(rr->u.TF.data + size * cnt++, flags)); + slen -= size; } - if (rr->u.TF.flags & TF_ACCESS) { - inode_set_atime(inode, - iso_date(rr->u.TF.times[cnt++].time, 0), - 0); + if ((rr->u.TF.flags & TF_ACCESS) && size <= slen) { + inode_set_atime_to_ts(inode, + iso_date(rr->u.TF.data + size * cnt++, flags)); + slen -= size; } - if (rr->u.TF.flags & TF_ATTRIBUTES) { - inode_set_ctime(inode, - iso_date(rr->u.TF.times[cnt++].time, 0), - 0); + if ((rr->u.TF.flags & TF_ATTRIBUTES) && size <= slen) { + inode_set_ctime_to_ts(inode, + iso_date(rr->u.TF.data + size * cnt++, flags)); + slen -= size; } break; + } case SIG('S', 'L'): { int slen; diff --git a/fs/isofs/rock.h b/fs/isofs/rock.h index 7755e587f778..c0856fa9bb6a 100644 --- a/fs/isofs/rock.h +++ b/fs/isofs/rock.h @@ -65,13 +65,9 @@ struct RR_PL_s { __u8 location[8]; }; -struct stamp { - __u8 time[7]; /* actually 6 unsigned, 1 signed */ -} __attribute__ ((packed)); - struct RR_TF_s { __u8 flags; - struct stamp times[]; /* Variable number of these beasts */ + __u8 data[]; } __attribute__ ((packed)); /* Linux-specific extension for transparent decompression */ diff --git a/fs/isofs/util.c b/fs/isofs/util.c index e88dba721661..42f479da0b28 100644 --- a/fs/isofs/util.c +++ b/fs/isofs/util.c @@ -16,29 +16,44 @@ * to GMT. Thus we should always be correct. */ -int iso_date(u8 *p, int flag) +struct timespec64 iso_date(u8 *p, int flags) { int year, month, day, hour, minute, second, tz; - int crtime; + struct timespec64 ts; + + if (flags & ISO_DATE_LONG_FORM) { + year = (p[0] - '0') * 1000 + + (p[1] - '0') * 100 + + (p[2] - '0') * 10 + + (p[3] - '0') - 1900; + month = ((p[4] - '0') * 10 + (p[5] - '0')); + day = ((p[6] - '0') * 10 + (p[7] - '0')); + hour = ((p[8] - '0') * 10 + (p[9] - '0')); + minute = ((p[10] - '0') * 10 + (p[11] - '0')); + second = ((p[12] - '0') * 10 + (p[13] - '0')); + ts.tv_nsec = ((p[14] - '0') * 10 + (p[15] - '0')) * 10000000; + tz = p[16]; + } else { + year = p[0]; + month = p[1]; + day = p[2]; + hour = p[3]; + minute = p[4]; + second = p[5]; + ts.tv_nsec = 0; + /* High sierra has no time zone */ + tz = flags & ISO_DATE_HIGH_SIERRA ? 0 : p[6]; + } - year = p[0]; - month = p[1]; - day = p[2]; - hour = p[3]; - minute = p[4]; - second = p[5]; - if (flag == 0) tz = p[6]; /* High sierra has no time zone */ - else tz = 0; - if (year < 0) { - crtime = 0; + ts.tv_sec = 0; } else { - crtime = mktime64(year+1900, month, day, hour, minute, second); + ts.tv_sec = mktime64(year+1900, month, day, hour, minute, second); /* sign extend */ if (tz & 0x80) tz |= (-1 << 8); - + /* * The timezone offset is unreliable on some disks, * so we make a sanity check. In no case is it ever @@ -65,7 +80,7 @@ int iso_date(u8 *p, int flag) * for pointing out the sign error. 
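For orientation, the reworked iso_date() distinguishes two on-disk timestamp layouts: the short form is seven binary bytes (year since 1900, month, day, hour, minute, second, then a timezone byte that High Sierra discs do not carry), while the ISO_DATE_LONG_FORM layout is sixteen ASCII digits plus a timezone byte (four digits of year, then two digits each for month, day, hour, minute, second and hundredths of a second, with the offset in 15-minute units in the final byte). Below is a minimal standalone sketch of the long-form decode; it is not taken from the patch, and the names (iso_long_stamp, decode_long_form, two_digits) are illustrative only.

    #include <stdio.h>

    struct iso_long_stamp {             /* decoded 17-byte "long form" timestamp */
            int year, month, day;
            int hour, minute, second;
            int hundredths;
            int tz_quarters;            /* offset from GMT in 15-minute units */
    };

    static int two_digits(const unsigned char *p)
    {
            return (p[0] - '0') * 10 + (p[1] - '0');
    }

    static struct iso_long_stamp decode_long_form(const unsigned char *p)
    {
            struct iso_long_stamp s;

            s.year        = (p[0] - '0') * 1000 + (p[1] - '0') * 100 +
                            (p[2] - '0') * 10 + (p[3] - '0');
            s.month       = two_digits(p + 4);
            s.day         = two_digits(p + 6);
            s.hour        = two_digits(p + 8);
            s.minute      = two_digits(p + 10);
            s.second      = two_digits(p + 12);
            s.hundredths  = two_digits(p + 14);
            /* sign extension, equivalent to the patch's (tz & 0x80) check */
            s.tz_quarters = (signed char)p[16];
            return s;
    }

    int main(void)
    {
            /* "2025-06-02 12:34:56.78", offset +8 quarter hours (+2h) */
            const unsigned char sample[17] = "2025060212345678\x08";
            struct iso_long_stamp s = decode_long_form(sample);

            printf("%04d-%02d-%02d %02d:%02d:%02d.%02d (tz %+d x 15min)\n",
                   s.year, s.month, s.day, s.hour, s.minute, s.second,
                   s.hundredths, s.tz_quarters);
            return 0;
    }

The kernel version additionally subtracts 1900 from the year before calling mktime64() and converts the hundredths field to nanoseconds, as the hunks here show.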
*/ if (-52 <= tz && tz <= 52) - crtime -= tz * 15 * 60; + ts.tv_sec -= tz * 15 * 60; } - return crtime; -} + return ts; +} diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c index 1c7c49356878..7203d2d2624d 100644 --- a/fs/jbd2/commit.c +++ b/fs/jbd2/commit.c @@ -99,7 +99,7 @@ static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh) h->h_chksum_type = 0; h->h_chksum_size = 0; h->h_chksum[0] = 0; - csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); + csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize); h->h_chksum[0] = cpu_to_be32(csum); } @@ -330,8 +330,8 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag, seq = cpu_to_be32(sequence); addr = kmap_local_folio(bh->b_folio, bh_offset(bh)); - csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); - csum32 = jbd2_chksum(j, csum32, addr, bh->b_size); + csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); + csum32 = jbd2_chksum(csum32, addr, bh->b_size); kunmap_local(addr); if (jbd2_has_feature_csum3(j)) diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c index 743a1d7633cd..6d5e76848733 100644 --- a/fs/jbd2/journal.c +++ b/fs/jbd2/journal.c @@ -83,7 +83,7 @@ EXPORT_SYMBOL(jbd2_log_wait_commit); EXPORT_SYMBOL(jbd2_journal_start_commit); EXPORT_SYMBOL(jbd2_journal_force_commit_nested); EXPORT_SYMBOL(jbd2_journal_wipe); -EXPORT_SYMBOL(jbd2_journal_blocks_per_page); +EXPORT_SYMBOL(jbd2_journal_blocks_per_folio); EXPORT_SYMBOL(jbd2_journal_invalidate_folio); EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers); EXPORT_SYMBOL(jbd2_journal_force_commit); @@ -115,14 +115,14 @@ void __jbd2_debug(int level, const char *file, const char *func, #endif /* Checksumming functions */ -static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb) +static __be32 jbd2_superblock_csum(journal_superblock_t *sb) { __u32 csum; __be32 old_csum; old_csum = sb->s_checksum; sb->s_checksum = 0; - csum = jbd2_chksum(j, ~0, (char *)sb, sizeof(journal_superblock_t)); + csum = jbd2_chksum(~0, (char *)sb, sizeof(journal_superblock_t)); sb->s_checksum = old_csum; return cpu_to_be32(csum); @@ -728,7 +728,6 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid) } journal->j_flags |= JBD2_FAST_COMMIT_ONGOING; write_unlock(&journal->j_state_lock); - jbd2_journal_lock_updates(journal); return 0; } @@ -742,7 +741,6 @@ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback) { if (journal->j_fc_cleanup_callback) journal->j_fc_cleanup_callback(journal, 0, tid); - jbd2_journal_unlock_updates(journal); write_lock(&journal->j_state_lock); journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING; if (fallback) @@ -1002,7 +1000,7 @@ void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh) tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize - sizeof(struct jbd2_journal_block_tail)); tail->t_checksum = 0; - csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize); + csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize); tail->t_checksum = cpu_to_be32(csum); } @@ -1386,7 +1384,7 @@ static int journal_check_superblock(journal_t *journal) } /* Check superblock checksum */ - if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) { + if (sb->s_checksum != jbd2_superblock_csum(sb)) { printk(KERN_ERR "JBD2: journal checksum error\n"); err = -EFSBADCRC; return err; @@ -1492,7 +1490,7 @@ static int journal_load_superblock(journal_t *journal) journal->j_total_len = be32_to_cpu(sb->s_maxlen); /* Precompute checksum seed for all 
metadata */ if (jbd2_journal_has_csum_v2or3(journal)) - journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + journal->j_csum_seed = jbd2_chksum(~0, sb->s_uuid, sizeof(sb->s_uuid)); /* After journal features are set, we can compute transaction limits */ jbd2_journal_init_transaction_limits(journal); @@ -1821,7 +1819,7 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags) set_buffer_uptodate(bh); } if (jbd2_journal_has_csum_v2or3(journal)) - sb->s_checksum = jbd2_superblock_csum(journal, sb); + sb->s_checksum = jbd2_superblock_csum(sb); get_bh(bh); bh->b_end_io = end_buffer_write_sync; submit_bh(REQ_OP_WRITE | write_flags, bh); @@ -2338,7 +2336,7 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat, sb->s_checksum_type = JBD2_CRC32C_CHKSUM; sb->s_feature_compat &= ~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM); - journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid, + journal->j_csum_seed = jbd2_chksum(~0, sb->s_uuid, sizeof(sb->s_uuid)); } @@ -2657,9 +2655,10 @@ void jbd2_journal_ack_err(journal_t *journal) write_unlock(&journal->j_state_lock); } -int jbd2_journal_blocks_per_page(struct inode *inode) +int jbd2_journal_blocks_per_folio(struct inode *inode) { - return 1 << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits); + return 1 << (PAGE_SHIFT + mapping_max_folio_order(inode->i_mapping) - + inode->i_sb->s_blocksize_bits); } /* diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c index c271a050b7e6..cac8c2cd4a92 100644 --- a/fs/jbd2/recovery.c +++ b/fs/jbd2/recovery.c @@ -185,7 +185,7 @@ static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf) j->j_blocksize - sizeof(struct jbd2_journal_block_tail)); provided = tail->t_checksum; tail->t_checksum = 0; - calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); + calculated = jbd2_chksum(j->j_csum_seed, buf, j->j_blocksize); tail->t_checksum = provided; return provided == cpu_to_be32(calculated); @@ -440,7 +440,7 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf) h = buf; provided = h->h_chksum[0]; h->h_chksum[0] = 0; - calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize); + calculated = jbd2_chksum(j->j_csum_seed, buf, j->j_blocksize); h->h_chksum[0] = provided; return provided == cpu_to_be32(calculated); @@ -461,7 +461,7 @@ static bool jbd2_commit_block_csum_verify_partial(journal_t *j, void *buf) h = tmpbuf; provided = h->h_chksum[0]; h->h_chksum[0] = 0; - calculated = jbd2_chksum(j, j->j_csum_seed, tmpbuf, j->j_blocksize); + calculated = jbd2_chksum(j->j_csum_seed, tmpbuf, j->j_blocksize); kfree(tmpbuf); return provided == cpu_to_be32(calculated); @@ -478,8 +478,8 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag, return 1; seq = cpu_to_be32(sequence); - csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); - csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize); + csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq)); + csum32 = jbd2_chksum(csum32, buf, j->j_blocksize); if (jbd2_has_feature_csum3(j)) return tag3->t_checksum == cpu_to_be32(csum32); diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c index cbc4785462f5..c7867139af69 100644 --- a/fs/jbd2/transaction.c +++ b/fs/jbd2/transaction.c @@ -1509,7 +1509,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) jh->b_next_transaction == transaction); spin_unlock(&jh->b_state_lock); } - if (jh->b_modified == 1) { + if (data_race(jh->b_modified == 1)) { /* If it's in our transaction it must be in 
BJ_Metadata list. */ if (data_race(jh->b_transaction == transaction && jh->b_jlist != BJ_Metadata)) { @@ -1528,7 +1528,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) goto out; } - journal = transaction->t_journal; spin_lock(&jh->b_state_lock); if (is_handle_aborted(handle)) { @@ -1543,6 +1542,8 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh) goto out_unlock_bh; } + journal = transaction->t_journal; + if (jh->b_modified == 0) { /* * This buffer's got modified and becoming part diff --git a/fs/jfs/jfs_discard.c b/fs/jfs/jfs_discard.c index 5f4b305030ad..4b660296caf3 100644 --- a/fs/jfs/jfs_discard.c +++ b/fs/jfs/jfs_discard.c @@ -86,7 +86,8 @@ int jfs_ioc_trim(struct inode *ip, struct fstrim_range *range) down_read(&sb->s_umount); bmp = JFS_SBI(ip->i_sb)->bmap; - if (minlen > bmp->db_agsize || + if (bmp == NULL || + minlen > bmp->db_agsize || start >= bmp->db_mapsize || range->len < sb->s_blocksize) { up_read(&sb->s_umount); diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c index 26e89d0c69b6..35e063c9f3a4 100644 --- a/fs/jfs/jfs_dmap.c +++ b/fs/jfs/jfs_dmap.c @@ -194,7 +194,11 @@ int dbMount(struct inode *ipbmap) !bmp->db_numag || (bmp->db_numag > MAXAG) || (bmp->db_maxag >= MAXAG) || (bmp->db_maxag < 0) || (bmp->db_agpref >= MAXAG) || (bmp->db_agpref < 0) || - !bmp->db_agwidth || + (bmp->db_agheight < 0) || (bmp->db_agheight > (L2LPERCTL >> 1)) || + (bmp->db_agwidth < 1) || (bmp->db_agwidth > (LPERCTL / MAXAG)) || + (bmp->db_agwidth > (1 << (L2LPERCTL - (bmp->db_agheight << 1)))) || + (bmp->db_agstart < 0) || + (bmp->db_agstart > (CTLTREESIZE - 1 - bmp->db_agwidth * (MAXAG - 1))) || (bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG) || (bmp->db_agl2size < 0) || ((bmp->db_mapsize - 1) >> bmp->db_agl2size) > MAXAG) { diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c index 93db6eec4465..ab11849cf9cc 100644 --- a/fs/jfs/jfs_dtree.c +++ b/fs/jfs/jfs_dtree.c @@ -2613,7 +2613,7 @@ void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot) * fsck.jfs should really fix this, but it currently does not. * Called from jfs_readdir when bad index is detected. 
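A brief note on the dtree hunk that follows: add_missing_indices() now returns an error code, and the loop validates each sorted-table entry before it is used to index p->slot[]. Restated as a caller-side sketch (assembled from the hunks in this patch, not new behaviour):

    /* stbl[i] is an on-disk index into p->slot[]; a negative value means
     * the directory page is corrupt, so the fix-up fails with -EIO instead
     * of dereferencing a bogus slot. */
    rc = add_missing_indices(ip, bn);
    if (rc)
            goto out;       /* jfs_readdir abandons the listing */
    page_fixed = 1;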
*/ -static void add_missing_indices(struct inode *inode, s64 bn) +static int add_missing_indices(struct inode *inode, s64 bn) { struct ldtentry *d; struct dt_lock *dtlck; @@ -2622,7 +2622,7 @@ static void add_missing_indices(struct inode *inode, s64 bn) struct lv *lv; struct metapage *mp; dtpage_t *p; - int rc; + int rc = 0; s8 *stbl; tid_t tid; struct tlock *tlck; @@ -2647,6 +2647,16 @@ static void add_missing_indices(struct inode *inode, s64 bn) stbl = DT_GETSTBL(p); for (i = 0; i < p->header.nextindex; i++) { + if (stbl[i] < 0) { + jfs_err("jfs: add_missing_indices: Invalid stbl[%d] = %d for inode %ld, block = %lld", + i, stbl[i], (long)inode->i_ino, (long long)bn); + rc = -EIO; + + DT_PUTPAGE(mp); + txAbort(tid, 0); + goto end; + } + d = (struct ldtentry *) &p->slot[stbl[i]]; index = le32_to_cpu(d->index); if ((index < 2) || (index >= JFS_IP(inode)->next_index)) { @@ -2664,6 +2674,7 @@ static void add_missing_indices(struct inode *inode, s64 bn) (void) txCommit(tid, 1, &inode, 0); end: txEnd(tid); + return rc; } /* @@ -3017,7 +3028,8 @@ skip_one: } if (fix_page) { - add_missing_indices(ip, bn); + if ((rc = add_missing_indices(ip, bn))) + goto out; page_fixed = 1; } diff --git a/fs/jfs/jfs_metapage.c b/fs/jfs/jfs_metapage.c index df575a873ec6..9029cd216912 100644 --- a/fs/jfs/jfs_metapage.c +++ b/fs/jfs/jfs_metapage.c @@ -15,6 +15,7 @@ #include <linux/mempool.h> #include <linux/seq_file.h> #include <linux/writeback.h> +#include <linux/migrate.h> #include "jfs_incore.h" #include "jfs_superblock.h" #include "jfs_filsys.h" @@ -151,7 +152,59 @@ static inline void dec_io(struct folio *folio, blk_status_t status, handler(folio, anchor->status); } +#ifdef CONFIG_MIGRATION +static int __metapage_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, + enum migrate_mode mode) +{ + struct meta_anchor *src_anchor = src->private; + struct metapage *mps[MPS_PER_PAGE] = {0}; + struct metapage *mp; + int i, rc; + + for (i = 0; i < MPS_PER_PAGE; i++) { + mp = src_anchor->mp[i]; + if (mp && metapage_locked(mp)) + return -EAGAIN; + } + + rc = filemap_migrate_folio(mapping, dst, src, mode); + if (rc != MIGRATEPAGE_SUCCESS) + return rc; + + for (i = 0; i < MPS_PER_PAGE; i++) { + mp = src_anchor->mp[i]; + if (!mp) + continue; + if (unlikely(insert_metapage(dst, mp))) { + /* If error, roll-back previosly inserted pages */ + for (int j = 0 ; j < i; j++) { + if (mps[j]) + remove_metapage(dst, mps[j]); + } + return -EAGAIN; + } + mps[i] = mp; + } + + /* Update the metapage and remove it from src */ + for (i = 0; i < MPS_PER_PAGE; i++) { + mp = mps[i]; + if (mp) { + int page_offset = mp->data - folio_address(src); + + mp->data = folio_address(dst) + page_offset; + mp->folio = dst; + remove_metapage(src, mp); + } + } + + return MIGRATEPAGE_SUCCESS; +} +#endif /* CONFIG_MIGRATION */ + #else + static inline struct metapage *folio_to_mp(struct folio *folio, int offset) { return folio->private; @@ -175,6 +228,35 @@ static inline void remove_metapage(struct folio *folio, struct metapage *mp) #define inc_io(folio) do {} while(0) #define dec_io(folio, status, handler) handler(folio, status) +#ifdef CONFIG_MIGRATION +static int __metapage_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, + enum migrate_mode mode) +{ + struct metapage *mp; + int page_offset; + int rc; + + mp = folio_to_mp(src, 0); + if (metapage_locked(mp)) + return -EAGAIN; + + rc = filemap_migrate_folio(mapping, dst, src, mode); + if (rc != MIGRATEPAGE_SUCCESS) + return rc; + + if 
(unlikely(insert_metapage(dst, mp))) + return -EAGAIN; + + page_offset = mp->data - folio_address(src); + mp->data = folio_address(dst) + page_offset; + mp->folio = dst; + remove_metapage(src, mp); + + return MIGRATEPAGE_SUCCESS; +} +#endif /* CONFIG_MIGRATION */ + #endif static inline struct metapage *alloc_metapage(gfp_t gfp_mask) @@ -554,6 +636,29 @@ static bool metapage_release_folio(struct folio *folio, gfp_t gfp_mask) return ret; } +#ifdef CONFIG_MIGRATION +/* + * metapage_migrate_folio - Migration function for JFS metapages + */ +static int metapage_migrate_folio(struct address_space *mapping, + struct folio *dst, struct folio *src, + enum migrate_mode mode) +{ + int expected_count; + + if (!src->private) + return filemap_migrate_folio(mapping, dst, src, mode); + + /* Check whether page does not have extra refs before we do more work */ + expected_count = folio_expected_ref_count(src) + 1; + if (folio_ref_count(src) != expected_count) + return -EAGAIN; + return __metapage_migrate_folio(mapping, dst, src, mode); +} +#else +#define metapage_migrate_folio NULL +#endif /* CONFIG_MIGRATION */ + static void metapage_invalidate_folio(struct folio *folio, size_t offset, size_t length) { @@ -570,6 +675,7 @@ const struct address_space_operations jfs_metapage_aops = { .release_folio = metapage_release_folio, .invalidate_folio = metapage_invalidate_folio, .dirty_folio = filemap_dirty_folio, + .migrate_folio = metapage_migrate_folio, }; struct metapage *__get_metapage(struct inode *inode, unsigned long lblock, diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c index fc70d72c3fe8..a670ba3e565e 100644 --- a/fs/kernfs/dir.c +++ b/fs/kernfs/dir.c @@ -17,7 +17,6 @@ #include "kernfs-internal.h" -DEFINE_RWLOCK(kernfs_rename_lock); /* kn->parent and ->name */ /* * Don't use rename_lock to piggy back on pr_cont_buf. We don't want to * call pr_cont() while holding rename_lock. 
Because sometimes pr_cont() @@ -27,7 +26,6 @@ DEFINE_RWLOCK(kernfs_rename_lock); /* kn->parent and ->name */ */ static DEFINE_SPINLOCK(kernfs_pr_cont_lock); static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by pr_cont_lock */ -static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */ #define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb) @@ -229,7 +227,7 @@ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from, if (to) { root = kernfs_root(to); if (!(root->flags & KERNFS_ROOT_INVARIANT_PARENT)) { - guard(read_lock_irqsave)(&kernfs_rename_lock); + guard(read_lock_irqsave)(&root->kernfs_rename_lock); return kernfs_path_from_node_locked(to, from, buf, buflen); } } @@ -296,12 +294,14 @@ out: struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn) { struct kernfs_node *parent; + struct kernfs_root *root; unsigned long flags; - read_lock_irqsave(&kernfs_rename_lock, flags); + root = kernfs_root(kn); + read_lock_irqsave(&root->kernfs_rename_lock, flags); parent = kernfs_parent(kn); kernfs_get(parent); - read_unlock_irqrestore(&kernfs_rename_lock, flags); + read_unlock_irqrestore(&root->kernfs_rename_lock, flags); return parent; } @@ -584,9 +584,9 @@ void kernfs_put(struct kernfs_node *kn) if (kernfs_type(kn) == KERNFS_LINK) kernfs_put(kn->symlink.target_kn); - spin_lock(&kernfs_idr_lock); + spin_lock(&root->kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); - spin_unlock(&kernfs_idr_lock); + spin_unlock(&root->kernfs_idr_lock); call_rcu(&kn->rcu, kernfs_free_rcu); @@ -639,13 +639,13 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, goto err_out1; idr_preload(GFP_KERNEL); - spin_lock(&kernfs_idr_lock); + spin_lock(&root->kernfs_idr_lock); ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC); if (ret >= 0 && ret < root->last_id_lowbits) root->id_highbits++; id_highbits = root->id_highbits; root->last_id_lowbits = ret; - spin_unlock(&kernfs_idr_lock); + spin_unlock(&root->kernfs_idr_lock); idr_preload_end(); if (ret < 0) goto err_out2; @@ -681,9 +681,9 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root, return kn; err_out3: - spin_lock(&kernfs_idr_lock); + spin_lock(&root->kernfs_idr_lock); idr_remove(&root->ino_idr, (u32)kernfs_ino(kn)); - spin_unlock(&kernfs_idr_lock); + spin_unlock(&root->kernfs_idr_lock); err_out2: kmem_cache_free(kernfs_node_cache, kn); err_out1: @@ -989,10 +989,12 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops, return ERR_PTR(-ENOMEM); idr_init(&root->ino_idr); + spin_lock_init(&root->kernfs_idr_lock); init_rwsem(&root->kernfs_rwsem); init_rwsem(&root->kernfs_iattr_rwsem); init_rwsem(&root->kernfs_supers_rwsem); INIT_LIST_HEAD(&root->supers); + rwlock_init(&root->kernfs_rename_lock); /* * On 64bit ino setups, id is ino. On 32bit, low 32bits are ino. @@ -1580,8 +1582,9 @@ void kernfs_break_active_protection(struct kernfs_node *kn) * invoked before finishing the kernfs operation. Note that while this * function restores the active reference, it doesn't and can't actually * restore the active protection - @kn may already or be in the process of - * being removed. Once kernfs_break_active_protection() is invoked, that - * protection is irreversibly gone for the kernfs operation instance. + * being drained and removed. Once kernfs_break_active_protection() is + * invoked, that protection is irreversibly gone for the kernfs operation + * instance. 
* * While this function may be called at any point after * kernfs_break_active_protection() is invoked, its most useful location @@ -1789,7 +1792,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, /* rename_lock protects ->parent accessors */ if (old_parent != new_parent) { kernfs_get(new_parent); - write_lock_irq(&kernfs_rename_lock); + write_lock_irq(&root->kernfs_rename_lock); rcu_assign_pointer(kn->__parent, new_parent); @@ -1797,7 +1800,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent, if (new_name) rcu_assign_pointer(kn->name, new_name); - write_unlock_irq(&kernfs_rename_lock); + write_unlock_irq(&root->kernfs_rename_lock); kernfs_put(old_parent); } else { /* name assignment is RCU protected, parent is the same */ diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c index 66fe8fe41f06..a6c692cac616 100644 --- a/fs/kernfs/file.c +++ b/fs/kernfs/file.c @@ -778,8 +778,9 @@ bool kernfs_should_drain_open_files(struct kernfs_node *kn) /* * @kn being deactivated guarantees that @kn->attr.open can't change * beneath us making the lockless test below safe. + * Callers post kernfs_unbreak_active_protection may be counted in + * kn->active by now, do not WARN_ON because of them. */ - WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS); rcu_read_lock(); on = rcu_dereference(kn->attr.open); diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h index 40a2a9cd819d..6061b6f70d2a 100644 --- a/fs/kernfs/kernfs-internal.h +++ b/fs/kernfs/kernfs-internal.h @@ -19,8 +19,6 @@ #include <linux/kernfs.h> #include <linux/fs_context.h> -extern rwlock_t kernfs_rename_lock; - struct kernfs_iattrs { kuid_t ia_uid; kgid_t ia_gid; @@ -40,6 +38,7 @@ struct kernfs_root { /* private fields, do not use outside kernfs proper */ struct idr ino_idr; + spinlock_t kernfs_idr_lock; /* root->ino_idr */ u32 last_id_lowbits; u32 id_highbits; struct kernfs_syscall_ops *syscall_ops; @@ -52,6 +51,9 @@ struct kernfs_root { struct rw_semaphore kernfs_iattr_rwsem; struct rw_semaphore kernfs_supers_rwsem; + /* kn->parent and kn->name */ + rwlock_t kernfs_rename_lock; + struct rcu_head rcu; }; @@ -107,6 +109,11 @@ static inline bool kernfs_root_is_locked(const struct kernfs_node *kn) return lockdep_is_held(&kernfs_root(kn)->kernfs_rwsem); } +static inline bool kernfs_rename_is_locked(const struct kernfs_node *kn) +{ + return lockdep_is_held(&kernfs_root(kn)->kernfs_rename_lock); +} + static inline const char *kernfs_rcu_name(const struct kernfs_node *kn) { return rcu_dereference_check(kn->name, kernfs_root_is_locked(kn)); @@ -117,14 +124,15 @@ static inline struct kernfs_node *kernfs_parent(const struct kernfs_node *kn) /* * The kernfs_node::__parent remains valid within a RCU section. The kn * can be reparented (and renamed) which changes the entry. This can be - * avoided by locking kernfs_root::kernfs_rwsem or kernfs_rename_lock. + * avoided by locking kernfs_root::kernfs_rwsem or + * kernfs_root::kernfs_rename_lock. * Both locks can be used to obtain a reference on __parent. Once the * reference count reaches 0 then the node is about to be freed * and can not be renamed (or become a different parent) anymore. 
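To restate the locking rule described above in code form (a sketch assembled from the kernfs hunks in this patch, not an addition to them): the rename lock now lives in struct kernfs_root, so a stable view of kn->__parent is taken against that root's lock rather than against a single global lock.

    struct kernfs_root *root = kernfs_root(kn);
    struct kernfs_node *parent;
    unsigned long flags;

    read_lock_irqsave(&root->kernfs_rename_lock, flags);
    parent = kernfs_parent(kn);     /* __parent cannot change under the lock */
    kernfs_get(parent);             /* take a ref before dropping the lock */
    read_unlock_irqrestore(&root->kernfs_rename_lock, flags);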
*/ return rcu_dereference_check(kn->__parent, kernfs_root_is_locked(kn) || - lockdep_is_held(&kernfs_rename_lock) || + kernfs_rename_is_locked(kn) || !atomic_read(&kn->count)); } diff --git a/fs/mount.h b/fs/mount.h index 7aecf2a60472..ad7173037924 100644 --- a/fs/mount.h +++ b/fs/mount.h @@ -7,10 +7,6 @@ extern struct list_head notify_list; -typedef __u32 __bitwise mntns_flags_t; - -#define MNTNS_PROPAGATING ((__force mntns_flags_t)(1 << 0)) - struct mnt_namespace { struct ns_common ns; struct mount * root; @@ -37,7 +33,6 @@ struct mnt_namespace { struct rb_node mnt_ns_tree_node; /* node in the mnt_ns_tree */ struct list_head mnt_ns_list; /* entry in the sequential list of mounts namespace */ refcount_t passive; /* number references not pinning @mounts */ - mntns_flags_t mntns_flags; } __randomize_layout; struct mnt_pcp { diff --git a/fs/namespace.c b/fs/namespace.c index 552ad7f4d18b..2f2e93927f46 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1326,21 +1326,6 @@ struct vfsmount *vfs_kern_mount(struct file_system_type *type, } EXPORT_SYMBOL_GPL(vfs_kern_mount); -struct vfsmount * -vfs_submount(const struct dentry *mountpoint, struct file_system_type *type, - const char *name, void *data) -{ - /* Until it is worked out how to pass the user namespace - * through from the parent mount to the submount don't support - * unprivileged mounts with submounts. - */ - if (mountpoint->d_sb->s_user_ns != &init_user_ns) - return ERR_PTR(-EPERM); - - return vfs_kern_mount(type, SB_SUBMOUNT, name, data); -} -EXPORT_SYMBOL_GPL(vfs_submount); - static struct mount *clone_mnt(struct mount *old, struct dentry *root, int flag) { @@ -3649,7 +3634,7 @@ static int do_move_mount(struct path *old_path, if (!(attached ? check_mnt(old) : is_anon_ns(ns))) goto out; - if (is_anon_ns(ns)) { + if (is_anon_ns(ns) && ns == p->mnt_ns) { /* * Ending up with two files referring to the root of the * same anonymous mount namespace would cause an error @@ -3657,16 +3642,7 @@ static int do_move_mount(struct path *old_path, * twice into the mount tree which would be rejected * later. But be explicit about it right here. */ - if ((is_anon_ns(p->mnt_ns) && ns == p->mnt_ns)) - goto out; - - /* - * If this is an anonymous mount tree ensure that mount - * propagation can detect mounts that were just - * propagated to the target mount tree so we don't - * propagate onto them. 
- */ - ns->mntns_flags |= MNTNS_PROPAGATING; + goto out; } else if (is_anon_ns(p->mnt_ns)) { /* * Don't allow moving an attached mount tree to an @@ -3723,8 +3699,6 @@ static int do_move_mount(struct path *old_path, if (attached) put_mountpoint(old_mp); out: - if (is_anon_ns(ns)) - ns->mntns_flags &= ~MNTNS_PROPAGATING; unlock_mount(mp); if (!err) { if (attached) { @@ -3900,10 +3874,6 @@ int finish_automount(struct vfsmount *m, const struct path *path) return PTR_ERR(m); mnt = real_mount(m); - /* The new mount record should have at least 2 refs to prevent it being - * expired before we get a chance to add it - */ - BUG_ON(mnt_get_count(mnt) < 2); if (m->mnt_sb == path->mnt->mnt_sb && m->mnt_root == dentry) { @@ -3936,7 +3906,6 @@ int finish_automount(struct vfsmount *m, const struct path *path) unlock_mount(mp); if (unlikely(err)) goto discard; - mntput(m); return 0; discard_locked: @@ -3950,7 +3919,6 @@ discard: namespace_unlock(); } mntput(m); - mntput(m); return err; } @@ -3987,11 +3955,14 @@ void mark_mounts_for_expiry(struct list_head *mounts) /* extract from the expiration list every vfsmount that matches the * following criteria: + * - already mounted * - only referenced by its parent vfsmount * - still marked for expiry (marked on the last call here; marks are * cleared by mntput()) */ list_for_each_entry_safe(mnt, next, mounts, mnt_expire) { + if (!is_mounted(&mnt->mnt)) + continue; if (!xchg(&mnt->mnt_expiry_mark, 1) || propagate_mount_busy(mnt, 1)) continue; diff --git a/fs/netfs/buffered_read.c b/fs/netfs/buffered_read.c index 0d1b6d35ff3b..18b3dc74c70e 100644 --- a/fs/netfs/buffered_read.c +++ b/fs/netfs/buffered_read.c @@ -78,7 +78,8 @@ static int netfs_begin_cache_read(struct netfs_io_request *rreq, struct netfs_in * [!] NOTE: This must be run in the same thread as ->issue_read() was called * in as we access the readahead_control struct. */ -static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq) +static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq, + struct readahead_control *ractl) { struct netfs_io_request *rreq = subreq->rreq; size_t rsize = subreq->len; @@ -86,7 +87,7 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq) if (subreq->source == NETFS_DOWNLOAD_FROM_SERVER) rsize = umin(rsize, rreq->io_streams[0].sreq_max_len); - if (rreq->ractl) { + if (ractl) { /* If we don't have sufficient folios in the rolling buffer, * extract a folioq's worth from the readahead region at a time * into the buffer. Note that this acquires a ref on each page @@ -99,7 +100,7 @@ static ssize_t netfs_prepare_read_iterator(struct netfs_io_subrequest *subreq) while (rreq->submitted < subreq->start + rsize) { ssize_t added; - added = rolling_buffer_load_from_ra(&rreq->buffer, rreq->ractl, + added = rolling_buffer_load_from_ra(&rreq->buffer, ractl, &put_batch); if (added < 0) return added; @@ -211,7 +212,8 @@ static void netfs_issue_read(struct netfs_io_request *rreq, * slicing up the region to be read according to available cache blocks and * network rsize. */ -static void netfs_read_to_pagecache(struct netfs_io_request *rreq) +static void netfs_read_to_pagecache(struct netfs_io_request *rreq, + struct readahead_control *ractl) { struct netfs_inode *ictx = netfs_inode(rreq->inode); unsigned long long start = rreq->start; @@ -262,9 +264,9 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) if (ret < 0) { subreq->error = ret; /* Not queued - release both refs. 
*/ - netfs_put_subrequest(subreq, false, + netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); - netfs_put_subrequest(subreq, false, + netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); break; } @@ -291,14 +293,14 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) break; issue: - slice = netfs_prepare_read_iterator(subreq); + slice = netfs_prepare_read_iterator(subreq, ractl); if (slice < 0) { ret = slice; subreq->error = ret; trace_netfs_sreq(subreq, netfs_sreq_trace_cancel); /* Not queued - release both refs. */ - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); break; } size -= slice; @@ -312,7 +314,7 @@ static void netfs_read_to_pagecache(struct netfs_io_request *rreq) if (unlikely(size > 0)) { smp_wmb(); /* Write lists before ALL_QUEUED. */ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); - netfs_wake_read_collector(rreq); + netfs_wake_collector(rreq); } /* Defer error return as we may need to wait for outstanding I/O. */ @@ -359,18 +361,15 @@ void netfs_readahead(struct readahead_control *ractl) netfs_rreq_expand(rreq, ractl); - rreq->ractl = ractl; rreq->submitted = rreq->start; if (rolling_buffer_init(&rreq->buffer, rreq->debug_id, ITER_DEST) < 0) goto cleanup_free; - netfs_read_to_pagecache(rreq); + netfs_read_to_pagecache(rreq, ractl); - netfs_put_request(rreq, true, netfs_rreq_trace_put_return); - return; + return netfs_put_request(rreq, netfs_rreq_trace_put_return); cleanup_free: - netfs_put_request(rreq, false, netfs_rreq_trace_put_failed); - return; + return netfs_put_request(rreq, netfs_rreq_trace_put_failed); } EXPORT_SYMBOL(netfs_readahead); @@ -389,7 +388,6 @@ static int netfs_create_singular_buffer(struct netfs_io_request *rreq, struct fo if (added < 0) return added; rreq->submitted = rreq->start + added; - rreq->ractl = (struct readahead_control *)1UL; return 0; } @@ -459,7 +457,7 @@ static int netfs_read_gaps(struct file *file, struct folio *folio) iov_iter_bvec(&rreq->buffer.iter, ITER_DEST, bvec, i, rreq->len); rreq->submitted = rreq->start + flen; - netfs_read_to_pagecache(rreq); + netfs_read_to_pagecache(rreq, NULL); if (sink) folio_put(sink); @@ -470,11 +468,11 @@ static int netfs_read_gaps(struct file *file, struct folio *folio) folio_mark_uptodate(folio); } folio_unlock(folio); - netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + netfs_put_request(rreq, netfs_rreq_trace_put_return); return ret < 0 ? ret : 0; discard: - netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); + netfs_put_request(rreq, netfs_rreq_trace_put_discard); alloc_error: folio_unlock(folio); return ret; @@ -528,13 +526,13 @@ int netfs_read_folio(struct file *file, struct folio *folio) if (ret < 0) goto discard; - netfs_read_to_pagecache(rreq); + netfs_read_to_pagecache(rreq, NULL); ret = netfs_wait_for_read(rreq); - netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + netfs_put_request(rreq, netfs_rreq_trace_put_return); return ret < 0 ? 
ret : 0; discard: - netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); + netfs_put_request(rreq, netfs_rreq_trace_put_discard); alloc_error: folio_unlock(folio); return ret; @@ -685,11 +683,11 @@ retry: if (ret < 0) goto error_put; - netfs_read_to_pagecache(rreq); + netfs_read_to_pagecache(rreq, NULL); ret = netfs_wait_for_read(rreq); if (ret < 0) goto error; - netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + netfs_put_request(rreq, netfs_rreq_trace_put_return); have_folio: ret = folio_wait_private_2_killable(folio); @@ -701,7 +699,7 @@ have_folio_no_wait: return 0; error_put: - netfs_put_request(rreq, false, netfs_rreq_trace_put_failed); + netfs_put_request(rreq, netfs_rreq_trace_put_failed); error: if (folio) { folio_unlock(folio); @@ -750,13 +748,13 @@ int netfs_prefetch_for_write(struct file *file, struct folio *folio, if (ret < 0) goto error_put; - netfs_read_to_pagecache(rreq); + netfs_read_to_pagecache(rreq, NULL); ret = netfs_wait_for_read(rreq); - netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + netfs_put_request(rreq, netfs_rreq_trace_put_return); return ret < 0 ? ret : 0; error_put: - netfs_put_request(rreq, false, netfs_rreq_trace_put_discard); + netfs_put_request(rreq, netfs_rreq_trace_put_discard); error: _leave(" = %d", ret); return ret; diff --git a/fs/netfs/buffered_write.c b/fs/netfs/buffered_write.c index b4826360a411..72a3e6db2524 100644 --- a/fs/netfs/buffered_write.c +++ b/fs/netfs/buffered_write.c @@ -115,8 +115,7 @@ ssize_t netfs_perform_write(struct kiocb *iocb, struct iov_iter *iter, size_t max_chunk = mapping_max_folio_size(mapping); bool maybe_trouble = false; - if (unlikely(test_bit(NETFS_ICTX_WRITETHROUGH, &ctx->flags) || - iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) + if (unlikely(iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) ) { wbc_attach_fdatawrite_inode(&wbc, mapping->host); @@ -386,7 +385,7 @@ out: wbc_detach_inode(&wbc); if (ret2 == -EIOCBQUEUED) return ret2; - if (ret == 0) + if (ret == 0 && ret2 < 0) ret = ret2; } diff --git a/fs/netfs/direct_read.c b/fs/netfs/direct_read.c index 5e3f0aeb51f3..a05e13472baf 100644 --- a/fs/netfs/direct_read.c +++ b/fs/netfs/direct_read.c @@ -85,7 +85,7 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) if (rreq->netfs_ops->prepare_read) { ret = rreq->netfs_ops->prepare_read(subreq); if (ret < 0) { - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); break; } } @@ -103,19 +103,16 @@ static int netfs_dispatch_unbuffered_reads(struct netfs_io_request *rreq) rreq->netfs_ops->issue_read(subreq); if (test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) - netfs_wait_for_pause(rreq); + netfs_wait_for_paused_read(rreq); if (test_bit(NETFS_RREQ_FAILED, &rreq->flags)) break; - if (test_bit(NETFS_RREQ_BLOCKED, &rreq->flags) && - test_bit(NETFS_RREQ_NONBLOCK, &rreq->flags)) - break; cond_resched(); } while (size > 0); if (unlikely(size > 0)) { smp_wmb(); /* Write lists before ALL_QUEUED. 
*/ set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); - netfs_wake_read_collector(rreq); + netfs_wake_collector(rreq); } return ret; @@ -144,7 +141,7 @@ static ssize_t netfs_unbuffered_read(struct netfs_io_request *rreq, bool sync) ret = netfs_dispatch_unbuffered_reads(rreq); if (!rreq->submitted) { - netfs_put_request(rreq, false, netfs_rreq_trace_put_no_submit); + netfs_put_request(rreq, netfs_rreq_trace_put_no_submit); inode_dio_end(rreq->inode); ret = 0; goto out; @@ -188,7 +185,8 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i rreq = netfs_alloc_request(iocb->ki_filp->f_mapping, iocb->ki_filp, iocb->ki_pos, orig_count, - NETFS_DIO_READ); + iocb->ki_flags & IOCB_DIRECT ? + NETFS_DIO_READ : NETFS_UNBUFFERED_READ); if (IS_ERR(rreq)) return PTR_ERR(rreq); @@ -236,7 +234,7 @@ ssize_t netfs_unbuffered_read_iter_locked(struct kiocb *iocb, struct iov_iter *i } out: - netfs_put_request(rreq, false, netfs_rreq_trace_put_return); + netfs_put_request(rreq, netfs_rreq_trace_put_return); if (ret > 0) orig_count -= ret; return ret; diff --git a/fs/netfs/direct_write.c b/fs/netfs/direct_write.c index 42ce53cc216e..fa9a5bf3c6d5 100644 --- a/fs/netfs/direct_write.c +++ b/fs/netfs/direct_write.c @@ -87,6 +87,8 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * } __set_bit(NETFS_RREQ_USE_IO_ITER, &wreq->flags); + if (async) + __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags); /* Copy the data into the bounce buffer and encrypt it. */ // TODO @@ -105,19 +107,15 @@ ssize_t netfs_unbuffered_write_iter_locked(struct kiocb *iocb, struct iov_iter * if (!async) { trace_netfs_rreq(wreq, netfs_rreq_trace_wait_ip); - wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, - TASK_UNINTERRUPTIBLE); - ret = wreq->error; - if (ret == 0) { - ret = wreq->transferred; + ret = netfs_wait_for_write(wreq); + if (ret > 0) iocb->ki_pos += ret; - } } else { ret = -EIOCBQUEUED; } out: - netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + netfs_put_request(wreq, netfs_rreq_trace_put_return); return ret; } EXPORT_SYMBOL(netfs_unbuffered_write_iter_locked); diff --git a/fs/netfs/fscache_io.c b/fs/netfs/fscache_io.c index b1722a82c03d..e4308457633c 100644 --- a/fs/netfs/fscache_io.c +++ b/fs/netfs/fscache_io.c @@ -192,8 +192,7 @@ EXPORT_SYMBOL(__fscache_clear_page_bits); /* * Deal with the completion of writing the data to the cache. 
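One theme of the netfs/fscache hunks in this area is that the was_async argument disappears from the termination callbacks: a completion handler now receives only its private pointer and the byte count or negative error. A minimal standalone sketch of a handler under the new convention follows; the typedef and names (io_terminated_t, my_write_done, my_write_ctx) are illustrative stand-ins, not kernel identifiers.

    #include <sys/types.h>
    #include <stdio.h>

    /* Shape of the termination callback after this patch: no was_async flag. */
    typedef void (*io_terminated_t)(void *priv, ssize_t transferred_or_error);

    struct my_write_ctx {
            const char *tag;
    };

    static void my_write_done(void *priv, ssize_t transferred_or_error)
    {
            struct my_write_ctx *ctx = priv;

            if (transferred_or_error < 0)
                    fprintf(stderr, "%s: write failed: %zd\n", ctx->tag,
                            transferred_or_error);
            else
                    printf("%s: wrote %zd bytes\n", ctx->tag,
                           transferred_or_error);
    }

    int main(void)
    {
            struct my_write_ctx ctx = { .tag = "cache" };
            io_terminated_t done = my_write_done;

            done(&ctx, 4096);       /* success path: byte count */
            done(&ctx, -5);         /* failure path: negative errno (-EIO) */
            return 0;
    }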
*/ -static void fscache_wreq_done(void *priv, ssize_t transferred_or_error, - bool was_async) +static void fscache_wreq_done(void *priv, ssize_t transferred_or_error) { struct fscache_write_request *wreq = priv; @@ -202,8 +201,7 @@ static void fscache_wreq_done(void *priv, ssize_t transferred_or_error, wreq->set_bits); if (wreq->term_func) - wreq->term_func(wreq->term_func_priv, transferred_or_error, - was_async); + wreq->term_func(wreq->term_func_priv, transferred_or_error); fscache_end_operation(&wreq->cache_resources); kfree(wreq); } @@ -255,14 +253,14 @@ void __fscache_write_to_cache(struct fscache_cookie *cookie, return; abandon_end: - return fscache_wreq_done(wreq, ret, false); + return fscache_wreq_done(wreq, ret); abandon_free: kfree(wreq); abandon: if (using_pgpriv2) fscache_clear_page_bits(mapping, start, len, cond); if (term_func) - term_func(term_func_priv, ret, false); + term_func(term_func_priv, ret); } EXPORT_SYMBOL(__fscache_write_to_cache); diff --git a/fs/netfs/internal.h b/fs/netfs/internal.h index 1c4f953c3d68..e2ee9183392b 100644 --- a/fs/netfs/internal.h +++ b/fs/netfs/internal.h @@ -23,7 +23,7 @@ /* * buffered_read.c */ -void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async); +void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error); int netfs_prefetch_for_write(struct file *file, struct folio *folio, size_t offset, size_t len); @@ -62,6 +62,14 @@ static inline void netfs_proc_del_rreq(struct netfs_io_request *rreq) {} struct folio_queue *netfs_buffer_make_space(struct netfs_io_request *rreq, enum netfs_folioq_trace trace); void netfs_reset_iter(struct netfs_io_subrequest *subreq); +void netfs_wake_collector(struct netfs_io_request *rreq); +void netfs_subreq_clear_in_progress(struct netfs_io_subrequest *subreq); +void netfs_wait_for_in_progress_stream(struct netfs_io_request *rreq, + struct netfs_io_stream *stream); +ssize_t netfs_wait_for_read(struct netfs_io_request *rreq); +ssize_t netfs_wait_for_write(struct netfs_io_request *rreq); +void netfs_wait_for_paused_read(struct netfs_io_request *rreq); +void netfs_wait_for_paused_write(struct netfs_io_request *rreq); /* * objects.c @@ -71,9 +79,8 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, loff_t start, size_t len, enum netfs_io_origin origin); void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what); -void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async); -void netfs_put_request(struct netfs_io_request *rreq, bool was_async, - enum netfs_rreq_ref_trace what); +void netfs_clear_subrequests(struct netfs_io_request *rreq); +void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what); struct netfs_io_subrequest *netfs_alloc_subrequest(struct netfs_io_request *rreq); static inline void netfs_see_request(struct netfs_io_request *rreq, @@ -92,11 +99,9 @@ static inline void netfs_see_subrequest(struct netfs_io_subrequest *subreq, /* * read_collect.c */ +bool netfs_read_collection(struct netfs_io_request *rreq); void netfs_read_collection_worker(struct work_struct *work); -void netfs_wake_read_collector(struct netfs_io_request *rreq); -void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async); -ssize_t netfs_wait_for_read(struct netfs_io_request *rreq); -void netfs_wait_for_pause(struct netfs_io_request *rreq); +void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error); /* * read_pgpriv2.c @@ -176,8 +181,8 @@ 
static inline void netfs_stat_d(atomic_t *stat) * write_collect.c */ int netfs_folio_written_back(struct folio *folio); +bool netfs_write_collection(struct netfs_io_request *wreq); void netfs_write_collection_worker(struct work_struct *work); -void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async); /* * write_issue.c @@ -198,8 +203,8 @@ struct netfs_io_request *netfs_begin_writethrough(struct kiocb *iocb, size_t len int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, struct folio *folio, size_t copied, bool to_page_end, struct folio **writethrough_cache); -int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, - struct folio *writethrough_cache); +ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, + struct folio *writethrough_cache); int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t len); /* @@ -255,6 +260,21 @@ static inline void netfs_put_group_many(struct netfs_group *netfs_group, int nr) } /* + * Clear and wake up a NETFS_RREQ_* flag bit on a request. + */ +static inline void netfs_wake_rreq_flag(struct netfs_io_request *rreq, + unsigned int rreq_flag, + enum netfs_rreq_trace trace) +{ + if (test_bit(rreq_flag, &rreq->flags)) { + trace_netfs_rreq(rreq, trace); + clear_bit_unlock(rreq_flag, &rreq->flags); + smp_mb__after_atomic(); /* Set flag before task state */ + wake_up(&rreq->waitq); + } +} + +/* * fscache-cache.c */ #ifdef CONFIG_PROC_FS diff --git a/fs/netfs/main.c b/fs/netfs/main.c index 70ecc8f5f210..3db401d269e7 100644 --- a/fs/netfs/main.c +++ b/fs/netfs/main.c @@ -39,6 +39,7 @@ static const char *netfs_origins[nr__netfs_io_origin] = { [NETFS_READ_GAPS] = "RG", [NETFS_READ_SINGLE] = "R1", [NETFS_READ_FOR_WRITE] = "RW", + [NETFS_UNBUFFERED_READ] = "UR", [NETFS_DIO_READ] = "DR", [NETFS_WRITEBACK] = "WB", [NETFS_WRITEBACK_SINGLE] = "W1", diff --git a/fs/netfs/misc.c b/fs/netfs/misc.c index 7099aa07737a..43b67a28a8fa 100644 --- a/fs/netfs/misc.c +++ b/fs/netfs/misc.c @@ -313,3 +313,222 @@ bool netfs_release_folio(struct folio *folio, gfp_t gfp) return true; } EXPORT_SYMBOL(netfs_release_folio); + +/* + * Wake the collection work item. + */ +void netfs_wake_collector(struct netfs_io_request *rreq) +{ + if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) && + !test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) { + queue_work(system_unbound_wq, &rreq->work); + } else { + trace_netfs_rreq(rreq, netfs_rreq_trace_wake_queue); + wake_up(&rreq->waitq); + } +} + +/* + * Mark a subrequest as no longer being in progress and, if need be, wake the + * collector. + */ +void netfs_subreq_clear_in_progress(struct netfs_io_subrequest *subreq) +{ + struct netfs_io_request *rreq = subreq->rreq; + struct netfs_io_stream *stream = &rreq->io_streams[subreq->stream_nr]; + + clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags); + smp_mb__after_atomic(); /* Clear IN_PROGRESS before task state */ + + /* If we are at the head of the queue, wake up the collector. */ + if (list_is_first(&subreq->rreq_link, &stream->subrequests) || + test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) + netfs_wake_collector(rreq); +} + +/* + * Wait for all outstanding I/O in a stream to quiesce. 
+ */ +void netfs_wait_for_in_progress_stream(struct netfs_io_request *rreq, + struct netfs_io_stream *stream) +{ + struct netfs_io_subrequest *subreq; + DEFINE_WAIT(myself); + + list_for_each_entry(subreq, &stream->subrequests, rreq_link) { + if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) + continue; + + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); + for (;;) { + prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); + + if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) + break; + + trace_netfs_sreq(subreq, netfs_sreq_trace_wait_for); + schedule(); + trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); + } + } + + finish_wait(&rreq->waitq, &myself); +} + +/* + * Perform collection in app thread if not offloaded to workqueue. + */ +static int netfs_collect_in_app(struct netfs_io_request *rreq, + bool (*collector)(struct netfs_io_request *rreq)) +{ + bool need_collect = false, inactive = true; + + for (int i = 0; i < NR_IO_STREAMS; i++) { + struct netfs_io_subrequest *subreq; + struct netfs_io_stream *stream = &rreq->io_streams[i]; + + if (!stream->active) + continue; + inactive = false; + trace_netfs_collect_stream(rreq, stream); + subreq = list_first_entry_or_null(&stream->subrequests, + struct netfs_io_subrequest, + rreq_link); + if (subreq && + (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) || + test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) { + need_collect = true; + break; + } + } + + if (!need_collect && !inactive) + return 0; /* Sleep */ + + __set_current_state(TASK_RUNNING); + if (collector(rreq)) { + /* Drop the ref from the NETFS_RREQ_IN_PROGRESS flag. */ + netfs_put_request(rreq, netfs_rreq_trace_put_work_ip); + return 1; /* Done */ + } + + if (inactive) { + WARN(true, "Failed to collect inactive req R=%08x\n", + rreq->debug_id); + cond_resched(); + } + return 2; /* Again */ +} + +/* + * Wait for a request to complete, successfully or otherwise. + */ +static ssize_t netfs_wait_for_request(struct netfs_io_request *rreq, + bool (*collector)(struct netfs_io_request *rreq)) +{ + DEFINE_WAIT(myself); + ssize_t ret; + + for (;;) { + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); + prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); + + if (!test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) { + switch (netfs_collect_in_app(rreq, collector)) { + case 0: + break; + case 1: + goto all_collected; + case 2: + continue; + } + } + + if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) + break; + + schedule(); + trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); + } + +all_collected: + finish_wait(&rreq->waitq, &myself); + + ret = rreq->error; + if (ret == 0) { + ret = rreq->transferred; + switch (rreq->origin) { + case NETFS_DIO_READ: + case NETFS_DIO_WRITE: + case NETFS_READ_SINGLE: + case NETFS_UNBUFFERED_READ: + case NETFS_UNBUFFERED_WRITE: + break; + default: + if (rreq->submitted < rreq->len) { + trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); + ret = -EIO; + } + break; + } + } + + return ret; +} + +ssize_t netfs_wait_for_read(struct netfs_io_request *rreq) +{ + return netfs_wait_for_request(rreq, netfs_read_collection); +} + +ssize_t netfs_wait_for_write(struct netfs_io_request *rreq) +{ + return netfs_wait_for_request(rreq, netfs_write_collection); +} + +/* + * Wait for a paused operation to unpause or complete in some manner. 
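A short note on the helpers defined just above: netfs_wait_for_read() and netfs_wait_for_write() both funnel through netfs_wait_for_request(), which performs collection in the waiting thread unless NETFS_RREQ_OFFLOAD_COLLECTION is set, and returns either the request's error or the number of bytes transferred. The caller-side contract, restated as a fragment (rreq stands for a struct netfs_io_request the caller already holds):

    ssize_t ret;

    ret = netfs_wait_for_read(rreq);
    if (ret < 0)
            return ret;             /* rreq->error */
    /* ret == rreq->transferred; buffered origins additionally turn a short
     * submission into -EIO inside the wait helper. */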
+ */ +static void netfs_wait_for_pause(struct netfs_io_request *rreq, + bool (*collector)(struct netfs_io_request *rreq)) +{ + DEFINE_WAIT(myself); + + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_pause); + + for (;;) { + trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); + prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); + + if (!test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) { + switch (netfs_collect_in_app(rreq, collector)) { + case 0: + break; + case 1: + goto all_collected; + case 2: + continue; + } + } + + if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags) || + !test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) + break; + + schedule(); + trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); + } + +all_collected: + finish_wait(&rreq->waitq, &myself); +} + +void netfs_wait_for_paused_read(struct netfs_io_request *rreq) +{ + return netfs_wait_for_pause(rreq, netfs_read_collection); +} + +void netfs_wait_for_paused_write(struct netfs_io_request *rreq) +{ + return netfs_wait_for_pause(rreq, netfs_write_collection); +} diff --git a/fs/netfs/objects.c b/fs/netfs/objects.c index dc6b41ef18b0..e8c99738b5bb 100644 --- a/fs/netfs/objects.c +++ b/fs/netfs/objects.c @@ -10,6 +10,8 @@ #include <linux/delay.h> #include "internal.h" +static void netfs_free_request(struct work_struct *work); + /* * Allocate an I/O request and initialise it. */ @@ -34,6 +36,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, } memset(rreq, 0, kmem_cache_size(cache)); + INIT_WORK(&rreq->cleanup_work, netfs_free_request); rreq->start = start; rreq->len = len; rreq->origin = origin; @@ -49,13 +52,14 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, INIT_LIST_HEAD(&rreq->io_streams[0].subrequests); INIT_LIST_HEAD(&rreq->io_streams[1].subrequests); init_waitqueue_head(&rreq->waitq); - refcount_set(&rreq->ref, 1); + refcount_set(&rreq->ref, 2); if (origin == NETFS_READAHEAD || origin == NETFS_READPAGE || origin == NETFS_READ_GAPS || origin == NETFS_READ_SINGLE || origin == NETFS_READ_FOR_WRITE || + origin == NETFS_UNBUFFERED_READ || origin == NETFS_DIO_READ) { INIT_WORK(&rreq->work, netfs_read_collection_worker); rreq->io_streams[0].avail = true; @@ -64,8 +68,6 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, } __set_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); - if (file && file->f_flags & O_NONBLOCK) - __set_bit(NETFS_RREQ_NONBLOCK, &rreq->flags); if (rreq->netfs_ops->init_request) { ret = rreq->netfs_ops->init_request(rreq, file); if (ret < 0) { @@ -75,7 +77,7 @@ struct netfs_io_request *netfs_alloc_request(struct address_space *mapping, } atomic_inc(&ctx->io_count); - trace_netfs_rreq_ref(rreq->debug_id, 1, netfs_rreq_trace_new); + trace_netfs_rreq_ref(rreq->debug_id, refcount_read(&rreq->ref), netfs_rreq_trace_new); netfs_proc_add_rreq(rreq); netfs_stat(&netfs_n_rh_rreq); return rreq; @@ -89,7 +91,7 @@ void netfs_get_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace trace_netfs_rreq_ref(rreq->debug_id, r + 1, what); } -void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async) +void netfs_clear_subrequests(struct netfs_io_request *rreq) { struct netfs_io_subrequest *subreq; struct netfs_io_stream *stream; @@ -101,8 +103,7 @@ void netfs_clear_subrequests(struct netfs_io_request *rreq, bool was_async) subreq = list_first_entry(&stream->subrequests, struct netfs_io_subrequest, rreq_link); list_del(&subreq->rreq_link); - netfs_put_subrequest(subreq, was_async, - netfs_sreq_trace_put_clear); + 
netfs_put_subrequest(subreq, netfs_sreq_trace_put_clear); } } } @@ -118,13 +119,19 @@ static void netfs_free_request_rcu(struct rcu_head *rcu) static void netfs_free_request(struct work_struct *work) { struct netfs_io_request *rreq = - container_of(work, struct netfs_io_request, work); + container_of(work, struct netfs_io_request, cleanup_work); struct netfs_inode *ictx = netfs_inode(rreq->inode); unsigned int i; trace_netfs_rreq(rreq, netfs_rreq_trace_free); + + /* Cancel/flush the result collection worker. That does not carry a + * ref of its own, so we must wait for it somewhere. + */ + cancel_work_sync(&rreq->work); + netfs_proc_del_rreq(rreq); - netfs_clear_subrequests(rreq, false); + netfs_clear_subrequests(rreq); if (rreq->netfs_ops->free_request) rreq->netfs_ops->free_request(rreq); if (rreq->cache_resources.ops) @@ -145,8 +152,7 @@ static void netfs_free_request(struct work_struct *work) call_rcu(&rreq->rcu, netfs_free_request_rcu); } -void netfs_put_request(struct netfs_io_request *rreq, bool was_async, - enum netfs_rreq_ref_trace what) +void netfs_put_request(struct netfs_io_request *rreq, enum netfs_rreq_ref_trace what) { unsigned int debug_id; bool dead; @@ -156,15 +162,8 @@ void netfs_put_request(struct netfs_io_request *rreq, bool was_async, debug_id = rreq->debug_id; dead = __refcount_dec_and_test(&rreq->ref, &r); trace_netfs_rreq_ref(debug_id, r - 1, what); - if (dead) { - if (was_async) { - rreq->work.func = netfs_free_request; - if (!queue_work(system_unbound_wq, &rreq->work)) - WARN_ON(1); - } else { - netfs_free_request(&rreq->work); - } - } + if (dead) + WARN_ON(!queue_work(system_unbound_wq, &rreq->cleanup_work)); } } @@ -206,8 +205,7 @@ void netfs_get_subrequest(struct netfs_io_subrequest *subreq, what); } -static void netfs_free_subrequest(struct netfs_io_subrequest *subreq, - bool was_async) +static void netfs_free_subrequest(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; @@ -216,10 +214,10 @@ static void netfs_free_subrequest(struct netfs_io_subrequest *subreq, rreq->netfs_ops->free_subrequest(subreq); mempool_free(subreq, rreq->netfs_ops->subrequest_pool ?: &netfs_subrequest_pool); netfs_stat_d(&netfs_n_rh_sreq); - netfs_put_request(rreq, was_async, netfs_rreq_trace_put_subreq); + netfs_put_request(rreq, netfs_rreq_trace_put_subreq); } -void netfs_put_subrequest(struct netfs_io_subrequest *subreq, bool was_async, +void netfs_put_subrequest(struct netfs_io_subrequest *subreq, enum netfs_sreq_ref_trace what) { unsigned int debug_index = subreq->debug_index; @@ -230,5 +228,5 @@ void netfs_put_subrequest(struct netfs_io_subrequest *subreq, bool was_async, dead = __refcount_dec_and_test(&subreq->ref, &r); trace_netfs_sreq_ref(debug_id, debug_index, r - 1, what); if (dead) - netfs_free_subrequest(subreq, was_async); + netfs_free_subrequest(subreq); } diff --git a/fs/netfs/read_collect.c b/fs/netfs/read_collect.c index 23c75755ad4e..96ee18af28ef 100644 --- a/fs/netfs/read_collect.c +++ b/fs/netfs/read_collect.c @@ -83,14 +83,12 @@ static void netfs_unlock_read_folio(struct netfs_io_request *rreq, } just_unlock: - if (!test_bit(NETFS_RREQ_DONT_UNLOCK_FOLIOS, &rreq->flags)) { - if (folio->index == rreq->no_unlock_folio && - test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) { - _debug("no unlock"); - } else { - trace_netfs_folio(folio, netfs_folio_trace_read_unlock); - folio_unlock(folio); - } + if (folio->index == rreq->no_unlock_folio && + test_bit(NETFS_RREQ_NO_UNLOCK_FOLIO, &rreq->flags)) { + _debug("no unlock"); + } else { + 
trace_netfs_folio(folio, netfs_folio_trace_read_unlock); + folio_unlock(folio); } folioq_clear(folioq, slot); @@ -280,9 +278,13 @@ reassess: stream->need_retry = true; notes |= NEED_RETRY | MADE_PROGRESS; break; + } else if (test_bit(NETFS_RREQ_SHORT_TRANSFER, &rreq->flags)) { + notes |= MADE_PROGRESS; } else { if (!stream->failed) - stream->transferred = stream->collected_to - rreq->start; + stream->transferred += transferred; + if (front->transferred < front->len) + set_bit(NETFS_RREQ_SHORT_TRANSFER, &rreq->flags); notes |= MADE_PROGRESS; } @@ -297,7 +299,7 @@ reassess: struct netfs_io_subrequest, rreq_link); stream->front = front; spin_unlock(&rreq->lock); - netfs_put_subrequest(remove, false, + netfs_put_subrequest(remove, notes & ABANDON_SREQ ? netfs_sreq_trace_put_abandon : netfs_sreq_trace_put_done); @@ -311,14 +313,8 @@ reassess: if (notes & NEED_RETRY) goto need_retry; - if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) { - trace_netfs_rreq(rreq, netfs_rreq_trace_unpause); - clear_bit_unlock(NETFS_RREQ_PAUSE, &rreq->flags); - smp_mb__after_atomic(); /* Set PAUSE before task state */ - wake_up(&rreq->waitq); - } - if (notes & MADE_PROGRESS) { + netfs_wake_rreq_flag(rreq, NETFS_RREQ_PAUSE, netfs_rreq_trace_unpause); //cond_resched(); goto reassess; } @@ -342,24 +338,10 @@ need_retry: */ static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) { - struct netfs_io_subrequest *subreq; - struct netfs_io_stream *stream = &rreq->io_streams[0]; unsigned int i; - /* Collect unbuffered reads and direct reads, adding up the transfer - * sizes until we find the first short or failed subrequest. - */ - list_for_each_entry(subreq, &stream->subrequests, rreq_link) { - rreq->transferred += subreq->transferred; - - if (subreq->transferred < subreq->len || - test_bit(NETFS_SREQ_FAILED, &subreq->flags)) { - rreq->error = subreq->error; - break; - } - } - - if (rreq->origin == NETFS_DIO_READ) { + if (rreq->origin == NETFS_UNBUFFERED_READ || + rreq->origin == NETFS_DIO_READ) { for (i = 0; i < rreq->direct_bv_count; i++) { flush_dcache_page(rreq->direct_bv[i].bv_page); // TODO: cifs marks pages in the destination buffer @@ -377,7 +359,8 @@ static void netfs_rreq_assess_dio(struct netfs_io_request *rreq) } if (rreq->netfs_ops->done) rreq->netfs_ops->done(rreq); - if (rreq->origin == NETFS_DIO_READ) + if (rreq->origin == NETFS_UNBUFFERED_READ || + rreq->origin == NETFS_DIO_READ) inode_dio_end(rreq->inode); } @@ -410,7 +393,7 @@ static void netfs_rreq_assess_single(struct netfs_io_request *rreq) * Note that we're in normal kernel thread context at this point, possibly * running on a workqueue. */ -static void netfs_read_collection(struct netfs_io_request *rreq) +bool netfs_read_collection(struct netfs_io_request *rreq) { struct netfs_io_stream *stream = &rreq->io_streams[0]; @@ -420,11 +403,11 @@ static void netfs_read_collection(struct netfs_io_request *rreq) * queue is empty. */ if (!test_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags)) - return; + return false; smp_rmb(); /* Read ALL_QUEUED before subreq lists. */ if (!list_empty(&stream->subrequests)) - return; + return false; /* Okay, declare that all I/O is complete. 
*/ rreq->transferred = stream->transferred; @@ -433,6 +416,7 @@ static void netfs_read_collection(struct netfs_io_request *rreq) //netfs_rreq_is_still_valid(rreq); switch (rreq->origin) { + case NETFS_UNBUFFERED_READ: case NETFS_DIO_READ: case NETFS_READ_GAPS: netfs_rreq_assess_dio(rreq); @@ -445,14 +429,15 @@ static void netfs_read_collection(struct netfs_io_request *rreq) } task_io_account_read(rreq->transferred); - trace_netfs_rreq(rreq, netfs_rreq_trace_wake_ip); - clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags); + netfs_wake_rreq_flag(rreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip); + /* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */ trace_netfs_rreq(rreq, netfs_rreq_trace_done); - netfs_clear_subrequests(rreq, false); + netfs_clear_subrequests(rreq); netfs_unlock_abandoned_read_pages(rreq); if (unlikely(rreq->copy_to_cache)) netfs_pgpriv2_end_copy_to_cache(rreq); + return true; } void netfs_read_collection_worker(struct work_struct *work) @@ -460,26 +445,12 @@ void netfs_read_collection_worker(struct work_struct *work) struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work); netfs_see_request(rreq, netfs_rreq_trace_see_work); - if (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) - netfs_read_collection(rreq); - netfs_put_request(rreq, false, netfs_rreq_trace_put_work); -} - -/* - * Wake the collection work item. - */ -void netfs_wake_read_collector(struct netfs_io_request *rreq) -{ - if (test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags) && - !test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) { - if (!work_pending(&rreq->work)) { - netfs_get_request(rreq, netfs_rreq_trace_get_work); - if (!queue_work(system_unbound_wq, &rreq->work)) - netfs_put_request(rreq, true, netfs_rreq_trace_put_work_nq); - } - } else { - trace_netfs_rreq(rreq, netfs_rreq_trace_wake_queue); - wake_up(&rreq->waitq); + if (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) { + if (netfs_read_collection(rreq)) + /* Drop the ref from the IN_PROGRESS flag. */ + netfs_put_request(rreq, netfs_rreq_trace_put_work_ip); + else + netfs_see_request(rreq, netfs_rreq_trace_see_work_complete); } } @@ -511,7 +482,7 @@ void netfs_read_subreq_progress(struct netfs_io_subrequest *subreq) list_is_first(&subreq->rreq_link, &stream->subrequests) ) { __set_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags); - netfs_wake_read_collector(rreq); + netfs_wake_collector(rreq); } } EXPORT_SYMBOL(netfs_read_subreq_progress); @@ -535,7 +506,6 @@ EXPORT_SYMBOL(netfs_read_subreq_progress); void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq) { struct netfs_io_request *rreq = subreq->rreq; - struct netfs_io_stream *stream = &rreq->io_streams[0]; switch (subreq->source) { case NETFS_READ_FROM_CACHE: @@ -582,23 +552,15 @@ void netfs_read_subreq_terminated(struct netfs_io_subrequest *subreq) } trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); - - clear_bit_unlock(NETFS_SREQ_IN_PROGRESS, &subreq->flags); - smp_mb__after_atomic(); /* Clear IN_PROGRESS before task state */ - - /* If we are at the head of the queue, wake up the collector. */ - if (list_is_first(&subreq->rreq_link, &stream->subrequests) || - test_bit(NETFS_RREQ_RETRYING, &rreq->flags)) - netfs_wake_read_collector(rreq); - - netfs_put_subrequest(subreq, true, netfs_sreq_trace_put_terminated); + netfs_subreq_clear_in_progress(subreq); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_terminated); } EXPORT_SYMBOL(netfs_read_subreq_terminated); /* * Handle termination of a read from the cache. 
*/ -void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool was_async) +void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error) { struct netfs_io_subrequest *subreq = priv; @@ -613,94 +575,3 @@ void netfs_cache_read_terminated(void *priv, ssize_t transferred_or_error, bool } netfs_read_subreq_terminated(subreq); } - -/* - * Wait for the read operation to complete, successfully or otherwise. - */ -ssize_t netfs_wait_for_read(struct netfs_io_request *rreq) -{ - struct netfs_io_subrequest *subreq; - struct netfs_io_stream *stream = &rreq->io_streams[0]; - DEFINE_WAIT(myself); - ssize_t ret; - - for (;;) { - trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); - prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); - - subreq = list_first_entry_or_null(&stream->subrequests, - struct netfs_io_subrequest, rreq_link); - if (subreq && - (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) || - test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) { - __set_current_state(TASK_RUNNING); - netfs_read_collection(rreq); - continue; - } - - if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) - break; - - schedule(); - trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); - } - - finish_wait(&rreq->waitq, &myself); - - ret = rreq->error; - if (ret == 0) { - ret = rreq->transferred; - switch (rreq->origin) { - case NETFS_DIO_READ: - case NETFS_READ_SINGLE: - ret = rreq->transferred; - break; - default: - if (rreq->submitted < rreq->len) { - trace_netfs_failure(rreq, NULL, ret, netfs_fail_short_read); - ret = -EIO; - } - break; - } - } - - return ret; -} - -/* - * Wait for a paused read operation to unpause or complete in some manner. - */ -void netfs_wait_for_pause(struct netfs_io_request *rreq) -{ - struct netfs_io_subrequest *subreq; - struct netfs_io_stream *stream = &rreq->io_streams[0]; - DEFINE_WAIT(myself); - - trace_netfs_rreq(rreq, netfs_rreq_trace_wait_pause); - - for (;;) { - trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); - prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); - - if (!test_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &rreq->flags)) { - subreq = list_first_entry_or_null(&stream->subrequests, - struct netfs_io_subrequest, rreq_link); - if (subreq && - (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags) || - test_bit(NETFS_SREQ_MADE_PROGRESS, &subreq->flags))) { - __set_current_state(TASK_RUNNING); - netfs_read_collection(rreq); - continue; - } - } - - if (!test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags) || - !test_bit(NETFS_RREQ_PAUSE, &rreq->flags)) - break; - - schedule(); - trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); - } - - finish_wait(&rreq->waitq, &myself); -} diff --git a/fs/netfs/read_pgpriv2.c b/fs/netfs/read_pgpriv2.c index cf7727060215..5bbe906a551d 100644 --- a/fs/netfs/read_pgpriv2.c +++ b/fs/netfs/read_pgpriv2.c @@ -116,7 +116,7 @@ static struct netfs_io_request *netfs_pgpriv2_begin_copy_to_cache( return creq; cancel_put: - netfs_put_request(creq, false, netfs_rreq_trace_put_return); + netfs_put_request(creq, netfs_rreq_trace_put_return); cancel: rreq->copy_to_cache = ERR_PTR(-ENOBUFS); clear_bit(NETFS_RREQ_FOLIO_COPY_TO_CACHE, &rreq->flags); @@ -155,7 +155,7 @@ void netfs_pgpriv2_end_copy_to_cache(struct netfs_io_request *rreq) smp_wmb(); /* Write lists before ALL_QUEUED. 
*/ set_bit(NETFS_RREQ_ALL_QUEUED, &creq->flags); - netfs_put_request(creq, false, netfs_rreq_trace_put_return); + netfs_put_request(creq, netfs_rreq_trace_put_return); creq->copy_to_cache = NULL; } diff --git a/fs/netfs/read_retry.c b/fs/netfs/read_retry.c index 0f294b26e08c..b99e84a8170a 100644 --- a/fs/netfs/read_retry.c +++ b/fs/netfs/read_retry.c @@ -173,7 +173,7 @@ static void netfs_retry_read_subrequests(struct netfs_io_request *rreq) &stream->subrequests, rreq_link) { trace_netfs_sreq(subreq, netfs_sreq_trace_superfluous); list_del(&subreq->rreq_link); - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_done); if (subreq == to) break; } @@ -257,35 +257,15 @@ abandon: */ void netfs_retry_reads(struct netfs_io_request *rreq) { - struct netfs_io_subrequest *subreq; struct netfs_io_stream *stream = &rreq->io_streams[0]; - DEFINE_WAIT(myself); netfs_stat(&netfs_n_rh_retry_read_req); - set_bit(NETFS_RREQ_RETRYING, &rreq->flags); - /* Wait for all outstanding I/O to quiesce before performing retries as * we may need to renegotiate the I/O sizes. */ - list_for_each_entry(subreq, &stream->subrequests, rreq_link) { - if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) - continue; - - trace_netfs_rreq(rreq, netfs_rreq_trace_wait_queue); - for (;;) { - prepare_to_wait(&rreq->waitq, &myself, TASK_UNINTERRUPTIBLE); - - if (!test_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags)) - break; - - trace_netfs_sreq(subreq, netfs_sreq_trace_wait_for); - schedule(); - trace_netfs_rreq(rreq, netfs_rreq_trace_woke_queue); - } - - finish_wait(&rreq->waitq, &myself); - } + set_bit(NETFS_RREQ_RETRYING, &rreq->flags); + netfs_wait_for_in_progress_stream(rreq, stream); clear_bit(NETFS_RREQ_RETRYING, &rreq->flags); trace_netfs_rreq(rreq, netfs_rreq_trace_resubmit); diff --git a/fs/netfs/read_single.c b/fs/netfs/read_single.c index fea0ecdecc53..fa622a6cd56d 100644 --- a/fs/netfs/read_single.c +++ b/fs/netfs/read_single.c @@ -142,7 +142,7 @@ static int netfs_single_dispatch_read(struct netfs_io_request *rreq) set_bit(NETFS_RREQ_ALL_QUEUED, &rreq->flags); return ret; cancel: - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_cancel); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_cancel); return ret; } @@ -185,11 +185,11 @@ ssize_t netfs_read_single(struct inode *inode, struct file *file, struct iov_ite netfs_single_dispatch_read(rreq); ret = netfs_wait_for_read(rreq); - netfs_put_request(rreq, true, netfs_rreq_trace_put_return); + netfs_put_request(rreq, netfs_rreq_trace_put_return); return ret; cleanup_free: - netfs_put_request(rreq, false, netfs_rreq_trace_put_failed); + netfs_put_request(rreq, netfs_rreq_trace_put_failed); return ret; } EXPORT_SYMBOL(netfs_read_single); diff --git a/fs/netfs/write_collect.c b/fs/netfs/write_collect.c index 3fca59e6475d..e2b102ffb768 100644 --- a/fs/netfs/write_collect.c +++ b/fs/netfs/write_collect.c @@ -280,7 +280,7 @@ reassess_streams: struct netfs_io_subrequest, rreq_link); stream->front = front; spin_unlock(&wreq->lock); - netfs_put_subrequest(remove, false, + netfs_put_subrequest(remove, notes & SAW_FAILURE ? 
netfs_sreq_trace_put_cancel : netfs_sreq_trace_put_done); @@ -321,18 +321,14 @@ reassess_streams: if (notes & NEED_RETRY) goto need_retry; - if ((notes & MADE_PROGRESS) && test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { - trace_netfs_rreq(wreq, netfs_rreq_trace_unpause); - clear_bit_unlock(NETFS_RREQ_PAUSE, &wreq->flags); - smp_mb__after_atomic(); /* Set PAUSE before task state */ - wake_up(&wreq->waitq); - } - if (notes & NEED_REASSESS) { + if (notes & MADE_PROGRESS) { + netfs_wake_rreq_flag(wreq, NETFS_RREQ_PAUSE, netfs_rreq_trace_unpause); //cond_resched(); goto reassess_streams; } - if (notes & MADE_PROGRESS) { + + if (notes & NEED_REASSESS) { //cond_resched(); goto reassess_streams; } @@ -356,30 +352,21 @@ need_retry: /* * Perform the collection of subrequests, folios and encryption buffers. */ -void netfs_write_collection_worker(struct work_struct *work) +bool netfs_write_collection(struct netfs_io_request *wreq) { - struct netfs_io_request *wreq = container_of(work, struct netfs_io_request, work); struct netfs_inode *ictx = netfs_inode(wreq->inode); size_t transferred; int s; _enter("R=%x", wreq->debug_id); - netfs_see_request(wreq, netfs_rreq_trace_see_work); - if (!test_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags)) { - netfs_put_request(wreq, false, netfs_rreq_trace_put_work); - return; - } - netfs_collect_write_results(wreq); /* We're done when the app thread has finished posting subreqs and all * the queues in all the streams are empty. */ - if (!test_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags)) { - netfs_put_request(wreq, false, netfs_rreq_trace_put_work); - return; - } + if (!test_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags)) + return false; smp_rmb(); /* Read ALL_QUEUED before lists. */ transferred = LONG_MAX; @@ -387,10 +374,8 @@ void netfs_write_collection_worker(struct work_struct *work) struct netfs_io_stream *stream = &wreq->io_streams[s]; if (!stream->active) continue; - if (!list_empty(&stream->subrequests)) { - netfs_put_request(wreq, false, netfs_rreq_trace_put_work); - return; - } + if (!list_empty(&stream->subrequests)) + return false; if (stream->transferred < transferred) transferred = stream->transferred; } @@ -428,8 +413,8 @@ void netfs_write_collection_worker(struct work_struct *work) inode_dio_end(wreq->inode); _debug("finished"); - trace_netfs_rreq(wreq, netfs_rreq_trace_wake_ip); - clear_and_wake_up_bit(NETFS_RREQ_IN_PROGRESS, &wreq->flags); + netfs_wake_rreq_flag(wreq, NETFS_RREQ_IN_PROGRESS, netfs_rreq_trace_wake_ip); + /* As we cleared NETFS_RREQ_IN_PROGRESS, we acquired its ref. */ if (wreq->iocb) { size_t written = min(wreq->transferred, wreq->len); @@ -440,19 +425,21 @@ void netfs_write_collection_worker(struct work_struct *work) wreq->iocb = VFS_PTR_POISON; } - netfs_clear_subrequests(wreq, false); - netfs_put_request(wreq, false, netfs_rreq_trace_put_work_complete); + netfs_clear_subrequests(wreq); + return true; } -/* - * Wake the collection work item. 
- */ -void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async) +void netfs_write_collection_worker(struct work_struct *work) { - if (!work_pending(&wreq->work)) { - netfs_get_request(wreq, netfs_rreq_trace_get_work); - if (!queue_work(system_unbound_wq, &wreq->work)) - netfs_put_request(wreq, was_async, netfs_rreq_trace_put_work_nq); + struct netfs_io_request *rreq = container_of(work, struct netfs_io_request, work); + + netfs_see_request(rreq, netfs_rreq_trace_see_work); + if (test_bit(NETFS_RREQ_IN_PROGRESS, &rreq->flags)) { + if (netfs_write_collection(rreq)) + /* Drop the ref from the IN_PROGRESS flag. */ + netfs_put_request(rreq, netfs_rreq_trace_put_work_ip); + else + netfs_see_request(rreq, netfs_rreq_trace_see_work_complete); } } @@ -460,7 +447,6 @@ void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async) * netfs_write_subrequest_terminated - Note the termination of a write operation. * @_op: The I/O request that has terminated. * @transferred_or_error: The amount of data transferred or an error code. - * @was_async: The termination was asynchronous * * This tells the library that a contributory write I/O operation has * terminated, one way or another, and that it should collect the results. @@ -470,21 +456,16 @@ void netfs_wake_write_collector(struct netfs_io_request *wreq, bool was_async) * negative error code. The library will look after reissuing I/O operations * as appropriate and writing downloaded data to the cache. * - * If @was_async is true, the caller might be running in softirq or interrupt - * context and we can't sleep. - * * When this is called, ownership of the subrequest is transferred back to the * library, along with a ref. * * Note that %_op is a void* so that the function can be passed to * kiocb::term_func without the need for a casting wrapper. */ -void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, - bool was_async) +void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error) { struct netfs_io_subrequest *subreq = _op; struct netfs_io_request *wreq = subreq->rreq; - struct netfs_io_stream *stream = &wreq->io_streams[subreq->stream_nr]; _enter("%x[%x] %zd", wreq->debug_id, subreq->debug_index, transferred_or_error); @@ -495,8 +476,6 @@ void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, case NETFS_WRITE_TO_CACHE: netfs_stat(&netfs_n_wh_write_done); break; - case NETFS_INVALID_WRITE: - break; default: BUG(); } @@ -536,15 +515,7 @@ void netfs_write_subrequest_terminated(void *_op, ssize_t transferred_or_error, } trace_netfs_sreq(subreq, netfs_sreq_trace_terminated); - - clear_and_wake_up_bit(NETFS_SREQ_IN_PROGRESS, &subreq->flags); - - /* If we are at the head of the queue, wake up the collector, - * transferring a ref to it if we were the ones to do so. 
- */ - if (list_is_first(&subreq->rreq_link, &stream->subrequests)) - netfs_wake_write_collector(wreq, was_async); - - netfs_put_subrequest(subreq, was_async, netfs_sreq_trace_put_terminated); + netfs_subreq_clear_in_progress(subreq); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_terminated); } EXPORT_SYMBOL(netfs_write_subrequest_terminated); diff --git a/fs/netfs/write_issue.c b/fs/netfs/write_issue.c index 77279fc5b5a7..50bee2c4130d 100644 --- a/fs/netfs/write_issue.c +++ b/fs/netfs/write_issue.c @@ -134,7 +134,7 @@ struct netfs_io_request *netfs_create_write_req(struct address_space *mapping, return wreq; nomem: wreq->error = -ENOMEM; - netfs_put_request(wreq, false, netfs_rreq_trace_put_failed); + netfs_put_request(wreq, netfs_rreq_trace_put_failed); return ERR_PTR(-ENOMEM); } @@ -233,7 +233,7 @@ static void netfs_do_issue_write(struct netfs_io_stream *stream, _enter("R=%x[%x],%zx", wreq->debug_id, subreq->debug_index, subreq->len); if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) - return netfs_write_subrequest_terminated(subreq, subreq->error, false); + return netfs_write_subrequest_terminated(subreq, subreq->error); trace_netfs_sreq(subreq, netfs_sreq_trace_submit); stream->issue_write(subreq); @@ -542,7 +542,7 @@ static void netfs_end_issue_write(struct netfs_io_request *wreq) } if (needs_poke) - netfs_wake_write_collector(wreq, false); + netfs_wake_collector(wreq); } /* @@ -576,6 +576,7 @@ int netfs_writepages(struct address_space *mapping, goto couldnt_start; } + __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags); trace_netfs_write(wreq, netfs_write_trace_writeback); netfs_stat(&netfs_n_wh_writepages); @@ -599,8 +600,9 @@ int netfs_writepages(struct address_space *mapping, netfs_end_issue_write(wreq); mutex_unlock(&ictx->wb_lock); + netfs_wake_collector(wreq); - netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + netfs_put_request(wreq, netfs_rreq_trace_put_return); _leave(" = %d", error); return error; @@ -673,11 +675,11 @@ int netfs_advance_writethrough(struct netfs_io_request *wreq, struct writeback_c /* * End a write operation used when writing through the pagecache. 
*/ -int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, - struct folio *writethrough_cache) +ssize_t netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_control *wbc, + struct folio *writethrough_cache) { struct netfs_inode *ictx = netfs_inode(wreq->inode); - int ret; + ssize_t ret; _enter("R=%x", wreq->debug_id); @@ -688,13 +690,11 @@ int netfs_end_writethrough(struct netfs_io_request *wreq, struct writeback_contr mutex_unlock(&ictx->wb_lock); - if (wreq->iocb) { + if (wreq->iocb) ret = -EIOCBQUEUED; - } else { - wait_on_bit(&wreq->flags, NETFS_RREQ_IN_PROGRESS, TASK_UNINTERRUPTIBLE); - ret = wreq->error; - } - netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + else + ret = netfs_wait_for_write(wreq); + netfs_put_request(wreq, netfs_rreq_trace_put_return); return ret; } @@ -722,10 +722,8 @@ int netfs_unbuffered_write(struct netfs_io_request *wreq, bool may_wait, size_t start += part; len -= part; rolling_buffer_advance(&wreq->buffer, part); - if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) { - trace_netfs_rreq(wreq, netfs_rreq_trace_wait_pause); - wait_event(wreq->waitq, !test_bit(NETFS_RREQ_PAUSE, &wreq->flags)); - } + if (test_bit(NETFS_RREQ_PAUSE, &wreq->flags)) + netfs_wait_for_paused_write(wreq); if (test_bit(NETFS_RREQ_FAILED, &wreq->flags)) break; } @@ -885,7 +883,8 @@ int netfs_writeback_single(struct address_space *mapping, goto couldnt_start; } - trace_netfs_write(wreq, netfs_write_trace_writeback); + __set_bit(NETFS_RREQ_OFFLOAD_COLLECTION, &wreq->flags); + trace_netfs_write(wreq, netfs_write_trace_writeback_single); netfs_stat(&netfs_n_wh_writepages); if (__test_and_set_bit(NETFS_RREQ_UPLOAD_TO_SERVER, &wreq->flags)) @@ -914,8 +913,9 @@ stop: set_bit(NETFS_RREQ_ALL_QUEUED, &wreq->flags); mutex_unlock(&ictx->wb_lock); + netfs_wake_collector(wreq); - netfs_put_request(wreq, false, netfs_rreq_trace_put_return); + netfs_put_request(wreq, netfs_rreq_trace_put_return); _leave(" = %d", ret); return ret; diff --git a/fs/netfs/write_retry.c b/fs/netfs/write_retry.c index 545d33079a77..9d1d8a8bab72 100644 --- a/fs/netfs/write_retry.c +++ b/fs/netfs/write_retry.c @@ -39,9 +39,10 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, if (test_bit(NETFS_SREQ_FAILED, &subreq->flags)) break; if (__test_and_clear_bit(NETFS_SREQ_NEED_RETRY, &subreq->flags)) { - struct iov_iter source = subreq->io_iter; + struct iov_iter source; - iov_iter_revert(&source, subreq->len - source.count); + netfs_reset_iter(subreq); + source = subreq->io_iter; netfs_get_subrequest(subreq, netfs_sreq_trace_get_resubmit); netfs_reissue_write(stream, subreq, &source); } @@ -131,7 +132,7 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, &stream->subrequests, rreq_link) { trace_netfs_sreq(subreq, netfs_sreq_trace_discard); list_del(&subreq->rreq_link); - netfs_put_subrequest(subreq, false, netfs_sreq_trace_put_done); + netfs_put_subrequest(subreq, netfs_sreq_trace_put_done); if (subreq == to) break; } @@ -199,7 +200,6 @@ static void netfs_retry_write_stream(struct netfs_io_request *wreq, */ void netfs_retry_writes(struct netfs_io_request *wreq) { - struct netfs_io_subrequest *subreq; struct netfs_io_stream *stream; int s; @@ -208,16 +208,13 @@ void netfs_retry_writes(struct netfs_io_request *wreq) /* Wait for all outstanding I/O to quiesce before performing retries as * we may need to renegotiate the I/O sizes. 
*/ + set_bit(NETFS_RREQ_RETRYING, &wreq->flags); for (s = 0; s < NR_IO_STREAMS; s++) { stream = &wreq->io_streams[s]; - if (!stream->active) - continue; - - list_for_each_entry(subreq, &stream->subrequests, rreq_link) { - wait_on_bit(&subreq->flags, NETFS_SREQ_IN_PROGRESS, - TASK_UNINTERRUPTIBLE); - } + if (stream->active) + netfs_wait_for_in_progress_stream(wreq, stream); } + clear_bit(NETFS_RREQ_RETRYING, &wreq->flags); // TODO: Enc: Fetch changed partial pages // TODO: Enc: Reencrypt content if needed. diff --git a/fs/nfs/client.c b/fs/nfs/client.c index 6d63b958c4bb..cf35ad3f818a 100644 --- a/fs/nfs/client.c +++ b/fs/nfs/client.c @@ -180,7 +180,7 @@ struct nfs_client *nfs_alloc_client(const struct nfs_client_initdata *cl_init) clp->cl_proto = cl_init->proto; clp->cl_nconnect = cl_init->nconnect; clp->cl_max_connect = cl_init->max_connect ? cl_init->max_connect : 1; - clp->cl_net = get_net(cl_init->net); + clp->cl_net = get_net_track(cl_init->net, &clp->cl_ns_tracker, GFP_KERNEL); #if IS_ENABLED(CONFIG_NFS_LOCALIO) seqlock_init(&clp->cl_boot_lock); @@ -250,7 +250,7 @@ void nfs_free_client(struct nfs_client *clp) if (!IS_ERR(clp->cl_rpcclient)) rpc_shutdown_client(clp->cl_rpcclient); - put_net(clp->cl_net); + put_net_track(clp->cl_net, &clp->cl_ns_tracker); put_nfs_version(clp->cl_nfs_mod); kfree(clp->cl_hostname); kfree(clp->cl_acceptor); @@ -439,7 +439,7 @@ struct nfs_client *nfs_get_client(const struct nfs_client_initdata *cl_init) spin_unlock(&nn->nfs_client_lock); new = rpc_ops->init_client(new, cl_init); if (!IS_ERR(new)) - nfs_local_probe(new); + nfs_local_probe_async(new); return new; } diff --git a/fs/nfs/delegation.c b/fs/nfs/delegation.c index 8bdbc4dca89c..10ef46e29b25 100644 --- a/fs/nfs/delegation.c +++ b/fs/nfs/delegation.c @@ -1021,13 +1021,6 @@ out: nfs_inode_find_state_and_recover(inode, stateid); } -void nfs_remove_bad_delegation(struct inode *inode, - const nfs4_stateid *stateid) -{ - nfs_revoke_delegation(inode, stateid); -} -EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); - void nfs_delegation_mark_returned(struct inode *inode, const nfs4_stateid *stateid) { @@ -1070,6 +1063,24 @@ out_rcu_unlock: } /** + * nfs_remove_bad_delegation - handle delegations that are unusable + * @inode: inode to process + * @stateid: the delegation's stateid + * + * If the server ACK-ed our FREE_STATEID then clean + * up the delegation, else mark and keep the revoked state. 
+ */ +void nfs_remove_bad_delegation(struct inode *inode, + const nfs4_stateid *stateid) +{ + if (stateid && stateid->type == NFS4_FREED_STATEID_TYPE) + nfs_delegation_mark_returned(inode, stateid); + else + nfs_revoke_delegation(inode, stateid); +} +EXPORT_SYMBOL_GPL(nfs_remove_bad_delegation); + +/** * nfs_expire_unused_delegation_types * @clp: client to process * @flags: delegation types to expire diff --git a/fs/nfs/flexfilelayout/flexfilelayout.c b/fs/nfs/flexfilelayout/flexfilelayout.c index e6909cafab68..df4807460596 100644 --- a/fs/nfs/flexfilelayout/flexfilelayout.c +++ b/fs/nfs/flexfilelayout/flexfilelayout.c @@ -1129,6 +1129,8 @@ static int ff_layout_async_handle_error_v4(struct rpc_task *task, nfs4_schedule_session_recovery(clp->cl_session, task->tk_status); break; case -NFS4ERR_DELAY: + nfs_inc_stats(lseg->pls_layout->plh_inode, NFSIOS_DELAY); + fallthrough; case -NFS4ERR_GRACE: rpc_delay(task, FF_LAYOUT_POLL_RETRY_MAX); break; diff --git a/fs/nfs/flexfilelayout/flexfilelayoutdev.c b/fs/nfs/flexfilelayout/flexfilelayoutdev.c index 4a304cf17c4b..656d5c50bbce 100644 --- a/fs/nfs/flexfilelayout/flexfilelayoutdev.c +++ b/fs/nfs/flexfilelayout/flexfilelayoutdev.c @@ -400,7 +400,7 @@ nfs4_ff_layout_prepare_ds(struct pnfs_layout_segment *lseg, * keep ds_clp even if DS is local, so that if local IO cannot * proceed somehow, we can fall back to NFS whenever we want. */ - nfs_local_probe(ds->ds_clp); + nfs_local_probe_async(ds->ds_clp); max_payload = nfs_block_size(rpc_max_payload(ds->ds_clp->cl_rpcclient), NULL); diff --git a/fs/nfs/fscache.c b/fs/nfs/fscache.c index e278a1ad1ca3..8b0785178731 100644 --- a/fs/nfs/fscache.c +++ b/fs/nfs/fscache.c @@ -367,6 +367,7 @@ void nfs_netfs_read_completion(struct nfs_pgio_header *hdr) sreq = netfs->sreq; if (test_bit(NFS_IOHDR_EOF, &hdr->flags) && + sreq->rreq->origin != NETFS_UNBUFFERED_READ && sreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &sreq->flags); diff --git a/fs/nfs/inode.c b/fs/nfs/inode.c index 119e447758b9..8ab7868807a7 100644 --- a/fs/nfs/inode.c +++ b/fs/nfs/inode.c @@ -557,6 +557,8 @@ nfs_fhget(struct super_block *sb, struct nfs_fh *fh, struct nfs_fattr *fattr) set_nlink(inode, fattr->nlink); else if (fattr_supported & NFS_ATTR_FATTR_NLINK) nfs_set_cache_invalid(inode, NFS_INO_INVALID_NLINK); + else + set_nlink(inode, 1); if (fattr->valid & NFS_ATTR_FATTR_OWNER) inode->i_uid = fattr->uid; else if (fattr_supported & NFS_ATTR_FATTR_OWNER) @@ -633,6 +635,34 @@ nfs_fattr_fixup_delegated(struct inode *inode, struct nfs_fattr *fattr) } } +static void nfs_set_timestamps_to_ts(struct inode *inode, struct iattr *attr) +{ + unsigned int cache_flags = 0; + + if (attr->ia_valid & ATTR_MTIME_SET) { + struct timespec64 ctime = inode_get_ctime(inode); + struct timespec64 mtime = inode_get_mtime(inode); + struct timespec64 now; + int updated = 0; + + now = inode_set_ctime_current(inode); + if (!timespec64_equal(&now, &ctime)) + updated |= S_CTIME; + + inode_set_mtime_to_ts(inode, attr->ia_mtime); + if (!timespec64_equal(&now, &mtime)) + updated |= S_MTIME; + + inode_maybe_inc_iversion(inode, updated); + cache_flags |= NFS_INO_INVALID_CTIME | NFS_INO_INVALID_MTIME; + } + if (attr->ia_valid & ATTR_ATIME_SET) { + inode_set_atime_to_ts(inode, attr->ia_atime); + cache_flags |= NFS_INO_INVALID_ATIME; + } + NFS_I(inode)->cache_validity &= ~cache_flags; +} + static void nfs_update_timestamps(struct inode *inode, unsigned int ia_valid) { enum file_time_flags time_flags = 0; @@ -701,14 +731,27 @@ nfs_setattr(struct mnt_idmap *idmap, 
struct dentry *dentry, if (nfs_have_delegated_mtime(inode) && attr->ia_valid & ATTR_MTIME) { spin_lock(&inode->i_lock); - nfs_update_timestamps(inode, attr->ia_valid); + if (attr->ia_valid & ATTR_MTIME_SET) { + nfs_set_timestamps_to_ts(inode, attr); + attr->ia_valid &= ~(ATTR_MTIME|ATTR_MTIME_SET| + ATTR_ATIME|ATTR_ATIME_SET); + } else { + nfs_update_timestamps(inode, attr->ia_valid); + attr->ia_valid &= ~(ATTR_MTIME|ATTR_ATIME); + } spin_unlock(&inode->i_lock); - attr->ia_valid &= ~(ATTR_MTIME | ATTR_ATIME); } else if (nfs_have_delegated_atime(inode) && attr->ia_valid & ATTR_ATIME && !(attr->ia_valid & ATTR_MTIME)) { - nfs_update_delegated_atime(inode); - attr->ia_valid &= ~ATTR_ATIME; + if (attr->ia_valid & ATTR_ATIME_SET) { + spin_lock(&inode->i_lock); + nfs_set_timestamps_to_ts(inode, attr); + spin_unlock(&inode->i_lock); + attr->ia_valid &= ~(ATTR_ATIME|ATTR_ATIME_SET); + } else { + nfs_update_delegated_atime(inode); + attr->ia_valid &= ~ATTR_ATIME; + } } /* Optimization: if the end result is no change, don't RPC */ diff --git a/fs/nfs/internal.h b/fs/nfs/internal.h index 6655e5f32ec6..69c2c10ee658 100644 --- a/fs/nfs/internal.h +++ b/fs/nfs/internal.h @@ -455,7 +455,6 @@ extern int nfs_wait_bit_killable(struct wait_bit_key *key, int mode); #if IS_ENABLED(CONFIG_NFS_LOCALIO) /* localio.c */ -extern void nfs_local_probe(struct nfs_client *); extern void nfs_local_probe_async(struct nfs_client *); extern void nfs_local_probe_async_work(struct work_struct *); extern struct nfsd_file *nfs_local_open_fh(struct nfs_client *, diff --git a/fs/nfs/localio.c b/fs/nfs/localio.c index 4ec952f9f47d..510d0a16cfe9 100644 --- a/fs/nfs/localio.c +++ b/fs/nfs/localio.c @@ -171,7 +171,7 @@ static bool nfs_server_uuid_is_local(struct nfs_client *clp) * - called after alloc_client and init_client (so cl_rpcclient exists) * - this function is idempotent, it can be called for old or new clients */ -void nfs_local_probe(struct nfs_client *clp) +static void nfs_local_probe(struct nfs_client *clp) { /* Disallow localio if disabled via sysfs or AUTH_SYS isn't used */ if (!localio_enabled || @@ -191,14 +191,16 @@ void nfs_local_probe(struct nfs_client *clp) nfs_localio_enable_client(clp); nfs_uuid_end(&clp->cl_uuid); } -EXPORT_SYMBOL_GPL(nfs_local_probe); void nfs_local_probe_async_work(struct work_struct *work) { struct nfs_client *clp = container_of(work, struct nfs_client, cl_local_probe_work); + if (!refcount_inc_not_zero(&clp->cl_count)) + return; nfs_local_probe(clp); + nfs_put_client(clp); } void nfs_local_probe_async(struct nfs_client *clp) @@ -207,14 +209,16 @@ void nfs_local_probe_async(struct nfs_client *clp) } EXPORT_SYMBOL_GPL(nfs_local_probe_async); -static inline struct nfsd_file *nfs_local_file_get(struct nfsd_file *nf) +static inline void nfs_local_file_put(struct nfsd_file *localio) { - return nfs_to->nfsd_file_get(nf); -} + /* nfs_to_nfsd_file_put_local() expects an __rcu pointer + * but we have a __kernel pointer. It is always safe + * to cast a __kernel pointer to an __rcu pointer + * because the cast only weakens what is known about the pointer. 
+ */ + struct nfsd_file __rcu *nf = (struct nfsd_file __rcu*) localio; -static inline void nfs_local_file_put(struct nfsd_file *nf) -{ - nfs_to->nfsd_file_put(nf); + nfs_to_nfsd_file_put_local(&nf); } /* @@ -226,12 +230,13 @@ static inline void nfs_local_file_put(struct nfsd_file *nf) static struct nfsd_file * __nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, struct nfs_fh *fh, struct nfs_file_localio *nfl, + struct nfsd_file __rcu **pnf, const fmode_t mode) { struct nfsd_file *localio; localio = nfs_open_local_fh(&clp->cl_uuid, clp->cl_rpcclient, - cred, fh, nfl, mode); + cred, fh, nfl, pnf, mode); if (IS_ERR(localio)) { int status = PTR_ERR(localio); trace_nfs_local_open_fh(fh, mode, status); @@ -258,7 +263,7 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, struct nfs_fh *fh, struct nfs_file_localio *nfl, const fmode_t mode) { - struct nfsd_file *nf, *new, __rcu **pnf; + struct nfsd_file *nf, __rcu **pnf; if (!nfs_server_is_local(clp)) return NULL; @@ -270,29 +275,9 @@ nfs_local_open_fh(struct nfs_client *clp, const struct cred *cred, else pnf = &nfl->ro_file; - new = NULL; - rcu_read_lock(); - nf = rcu_dereference(*pnf); - if (!nf) { - rcu_read_unlock(); - new = __nfs_local_open_fh(clp, cred, fh, nfl, mode); - if (IS_ERR(new)) - return NULL; - rcu_read_lock(); - /* try to swap in the pointer */ - spin_lock(&clp->cl_uuid.lock); - nf = rcu_dereference_protected(*pnf, 1); - if (!nf) { - nf = new; - new = NULL; - rcu_assign_pointer(*pnf, nf); - } - spin_unlock(&clp->cl_uuid.lock); - } - nf = nfs_local_file_get(nf); - rcu_read_unlock(); - if (new) - nfs_to_nfsd_file_put_local(new); + nf = __nfs_local_open_fh(clp, cred, fh, nfl, pnf, mode); + if (IS_ERR(nf)) + return NULL; return nf; } EXPORT_SYMBOL_GPL(nfs_local_open_fh); diff --git a/fs/nfs/namespace.c b/fs/nfs/namespace.c index 973aed9cc5fe..7f1ec9c67ff2 100644 --- a/fs/nfs/namespace.c +++ b/fs/nfs/namespace.c @@ -195,7 +195,6 @@ struct vfsmount *nfs_d_automount(struct path *path) if (IS_ERR(mnt)) goto out_fc; - mntget(mnt); /* prevent immediate expiration */ if (timeout <= 0) goto out_fc; diff --git a/fs/nfs/nfs42.h b/fs/nfs/nfs42.h index 0282d93c8bcc..aafd15a4afce 100644 --- a/fs/nfs/nfs42.h +++ b/fs/nfs/nfs42.h @@ -21,6 +21,7 @@ int nfs42_proc_allocate(struct file *, loff_t, loff_t); ssize_t nfs42_proc_copy(struct file *, loff_t, struct file *, loff_t, size_t, struct nl4_server *, nfs4_stateid *, bool); int nfs42_proc_deallocate(struct file *, loff_t, loff_t); +int nfs42_proc_zero_range(struct file *, loff_t, loff_t); loff_t nfs42_proc_llseek(struct file *, loff_t, int); int nfs42_proc_layoutstats_generic(struct nfs_server *, struct nfs42_layoutstat_data *); diff --git a/fs/nfs/nfs42proc.c b/fs/nfs/nfs42proc.c index 5cf52ece96ac..01c01f45358b 100644 --- a/fs/nfs/nfs42proc.c +++ b/fs/nfs/nfs42proc.c @@ -146,7 +146,8 @@ int nfs42_proc_allocate(struct file *filep, loff_t offset, loff_t len) err = nfs42_proc_fallocate(&msg, filep, offset, len); if (err == -EOPNOTSUPP) - NFS_SERVER(inode)->caps &= ~NFS_CAP_ALLOCATE; + NFS_SERVER(inode)->caps &= ~(NFS_CAP_ALLOCATE | + NFS_CAP_ZERO_RANGE); inode_unlock(inode); return err; @@ -169,7 +170,31 @@ int nfs42_proc_deallocate(struct file *filep, loff_t offset, loff_t len) if (err == 0) truncate_pagecache_range(inode, offset, (offset + len) -1); if (err == -EOPNOTSUPP) - NFS_SERVER(inode)->caps &= ~NFS_CAP_DEALLOCATE; + NFS_SERVER(inode)->caps &= ~(NFS_CAP_DEALLOCATE | + NFS_CAP_ZERO_RANGE); + + inode_unlock(inode); + return err; +} + +int 
nfs42_proc_zero_range(struct file *filep, loff_t offset, loff_t len) +{ + struct rpc_message msg = { + .rpc_proc = &nfs4_procedures[NFSPROC4_CLNT_ZERO_RANGE], + }; + struct inode *inode = file_inode(filep); + int err; + + if (!nfs_server_capable(inode, NFS_CAP_ZERO_RANGE)) + return -EOPNOTSUPP; + + inode_lock(inode); + + err = nfs42_proc_fallocate(&msg, filep, offset, len); + if (err == 0) + truncate_pagecache_range(inode, offset, (offset + len) -1); + if (err == -EOPNOTSUPP) + NFS_SERVER(inode)->caps &= ~NFS_CAP_ZERO_RANGE; inode_unlock(inode); return err; diff --git a/fs/nfs/nfs42xdr.c b/fs/nfs/nfs42xdr.c index b1b663468249..4cc915d5741d 100644 --- a/fs/nfs/nfs42xdr.c +++ b/fs/nfs/nfs42xdr.c @@ -174,6 +174,18 @@ decode_putfh_maxsz + \ decode_deallocate_maxsz + \ decode_getattr_maxsz) +#define NFS4_enc_zero_range_sz (compound_encode_hdr_maxsz + \ + encode_sequence_maxsz + \ + encode_putfh_maxsz + \ + encode_deallocate_maxsz + \ + encode_allocate_maxsz + \ + encode_getattr_maxsz) +#define NFS4_dec_zero_range_sz (compound_decode_hdr_maxsz + \ + decode_sequence_maxsz + \ + decode_putfh_maxsz + \ + decode_deallocate_maxsz + \ + decode_allocate_maxsz + \ + decode_getattr_maxsz) #define NFS4_enc_read_plus_sz (compound_encode_hdr_maxsz + \ encode_sequence_maxsz + \ encode_putfh_maxsz + \ @@ -649,6 +661,27 @@ static void nfs4_xdr_enc_deallocate(struct rpc_rqst *req, } /* + * Encode ZERO_RANGE request + */ +static void nfs4_xdr_enc_zero_range(struct rpc_rqst *req, + struct xdr_stream *xdr, + const void *data) +{ + const struct nfs42_falloc_args *args = data; + struct compound_hdr hdr = { + .minorversion = nfs4_xdr_minorversion(&args->seq_args), + }; + + encode_compound_hdr(xdr, req, &hdr); + encode_sequence(xdr, &args->seq_args, &hdr); + encode_putfh(xdr, args->falloc_fh, &hdr); + encode_deallocate(xdr, args, &hdr); + encode_allocate(xdr, args, &hdr); + encode_getfattr(xdr, args->falloc_bitmask, &hdr); + encode_nops(&hdr); +} + +/* * Encode READ_PLUS request */ static void nfs4_xdr_enc_read_plus(struct rpc_rqst *req, @@ -1511,6 +1544,37 @@ out: } /* + * Decode ZERO_RANGE request + */ +static int nfs4_xdr_dec_zero_range(struct rpc_rqst *rqstp, + struct xdr_stream *xdr, + void *data) +{ + struct nfs42_falloc_res *res = data; + struct compound_hdr hdr; + int status; + + status = decode_compound_hdr(xdr, &hdr); + if (status) + goto out; + status = decode_sequence(xdr, &res->seq_res, rqstp); + if (status) + goto out; + status = decode_putfh(xdr); + if (status) + goto out; + status = decode_deallocate(xdr, res); + if (status) + goto out; + status = decode_allocate(xdr, res); + if (status) + goto out; + decode_getfattr(xdr, res->falloc_fattr, res->falloc_server); +out: + return status; +} + +/* * Decode READ_PLUS request */ static int nfs4_xdr_dec_read_plus(struct rpc_rqst *rqstp, diff --git a/fs/nfs/nfs4_fs.h b/fs/nfs/nfs4_fs.h index 7d383d29a995..d3ca91f60fc1 100644 --- a/fs/nfs/nfs4_fs.h +++ b/fs/nfs/nfs4_fs.h @@ -67,8 +67,7 @@ struct nfs4_minor_version_ops { void (*free_lock_state)(struct nfs_server *, struct nfs4_lock_state *); int (*test_and_free_expired)(struct nfs_server *, - const nfs4_stateid *, - const struct cred *); + nfs4_stateid *, const struct cred *); struct nfs_seqid * (*alloc_seqid)(struct nfs_seqid_counter *, gfp_t); void (*session_trunk)(struct rpc_clnt *clnt, diff --git a/fs/nfs/nfs4file.c b/fs/nfs/nfs4file.c index 1cd9652f3c28..5e9d66f3466c 100644 --- a/fs/nfs/nfs4file.c +++ b/fs/nfs/nfs4file.c @@ -225,8 +225,14 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t 
offset, loff_t if (!S_ISREG(inode->i_mode)) return -EOPNOTSUPP; - if ((mode != 0) && (mode != (FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE))) + switch (mode) { + case 0: + case FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE: + case FALLOC_FL_ZERO_RANGE: + break; + default: return -EOPNOTSUPP; + } ret = inode_newsize_ok(inode, offset + len); if (ret < 0) @@ -234,6 +240,8 @@ static long nfs42_fallocate(struct file *filep, int mode, loff_t offset, loff_t if (mode & FALLOC_FL_PUNCH_HOLE) return nfs42_proc_deallocate(filep, offset, len); + else if (mode & FALLOC_FL_ZERO_RANGE) + return nfs42_proc_zero_range(filep, offset ,len); return nfs42_proc_allocate(filep, offset, len); } diff --git a/fs/nfs/nfs4proc.c b/fs/nfs/nfs4proc.c index b1d2122bd5a7..341740fa293d 100644 --- a/fs/nfs/nfs4proc.c +++ b/fs/nfs/nfs4proc.c @@ -105,7 +105,7 @@ static struct rpc_task *_nfs41_proc_sequence(struct nfs_client *clp, bool is_privileged); static int nfs41_test_stateid(struct nfs_server *, const nfs4_stateid *, const struct cred *); -static int nfs41_free_stateid(struct nfs_server *, const nfs4_stateid *, +static int nfs41_free_stateid(struct nfs_server *, nfs4_stateid *, const struct cred *, bool); #endif @@ -325,14 +325,14 @@ static void nfs4_bitmap_copy_adjust(__u32 *dst, const __u32 *src, if (nfs_have_delegated_mtime(inode)) { if (!(cache_validity & NFS_INO_INVALID_ATIME)) - dst[1] &= ~FATTR4_WORD1_TIME_ACCESS; + dst[1] &= ~(FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET); if (!(cache_validity & NFS_INO_INVALID_MTIME)) - dst[1] &= ~FATTR4_WORD1_TIME_MODIFY; + dst[1] &= ~(FATTR4_WORD1_TIME_MODIFY|FATTR4_WORD1_TIME_MODIFY_SET); if (!(cache_validity & NFS_INO_INVALID_CTIME)) - dst[1] &= ~FATTR4_WORD1_TIME_METADATA; + dst[1] &= ~(FATTR4_WORD1_TIME_METADATA|FATTR4_WORD1_TIME_MODIFY_SET); } else if (nfs_have_delegated_atime(inode)) { if (!(cache_validity & NFS_INO_INVALID_ATIME)) - dst[1] &= ~FATTR4_WORD1_TIME_ACCESS; + dst[1] &= ~(FATTR4_WORD1_TIME_ACCESS|FATTR4_WORD1_TIME_ACCESS_SET); } } @@ -2903,16 +2903,14 @@ static int nfs40_open_expired(struct nfs4_state_owner *sp, struct nfs4_state *st } static int nfs40_test_and_free_expired_stateid(struct nfs_server *server, - const nfs4_stateid *stateid, - const struct cred *cred) + nfs4_stateid *stateid, const struct cred *cred) { return -NFS4ERR_BAD_STATEID; } #if defined(CONFIG_NFS_V4_1) static int nfs41_test_and_free_expired_stateid(struct nfs_server *server, - const nfs4_stateid *stateid, - const struct cred *cred) + nfs4_stateid *stateid, const struct cred *cred) { int status; @@ -2921,6 +2919,7 @@ static int nfs41_test_and_free_expired_stateid(struct nfs_server *server, break; case NFS4_INVALID_STATEID_TYPE: case NFS4_SPECIAL_STATEID_TYPE: + case NFS4_FREED_STATEID_TYPE: return -NFS4ERR_BAD_STATEID; case NFS4_REVOKED_STATEID_TYPE: goto out_free; @@ -3976,8 +3975,9 @@ static int _nfs4_server_capabilities(struct nfs_server *server, struct nfs_fh *f FATTR4_WORD0_CASE_INSENSITIVE | FATTR4_WORD0_CASE_PRESERVING; if (minorversion) - bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT | - FATTR4_WORD2_OPEN_ARGUMENTS; + bitmask[2] = FATTR4_WORD2_SUPPATTR_EXCLCREAT; + if (minorversion > 1) + bitmask[2] |= FATTR4_WORD2_OPEN_ARGUMENTS; status = nfs4_call_sync(server->client, server, &msg, &args.seq_args, &res.seq_res, 0); if (status == 0) { @@ -5164,13 +5164,15 @@ static int nfs4_do_create(struct inode *dir, struct dentry *dentry, struct nfs4_ } static struct dentry *nfs4_do_mkdir(struct inode *dir, struct dentry *dentry, - struct nfs4_createdata *data) + struct nfs4_createdata 
*data, int *statusp) { - int status = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, + struct dentry *ret; + + *statusp = nfs4_call_sync(NFS_SERVER(dir)->client, NFS_SERVER(dir), &data->msg, &data->arg.seq_args, &data->res.seq_res, 1); - if (status) - return ERR_PTR(status); + if (*statusp) + return NULL; spin_lock(&dir->i_lock); /* Creating a directory bumps nlink in the parent */ @@ -5179,7 +5181,11 @@ static struct dentry *nfs4_do_mkdir(struct inode *dir, struct dentry *dentry, data->res.fattr->time_start, NFS_INO_INVALID_DATA); spin_unlock(&dir->i_lock); - return nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr); + ret = nfs_add_or_obtain(dentry, data->res.fh, data->res.fattr); + if (!IS_ERR(ret)) + return ret; + *statusp = PTR_ERR(ret); + return NULL; } static void nfs4_free_createdata(struct nfs4_createdata *data) @@ -5240,17 +5246,18 @@ static int nfs4_proc_symlink(struct inode *dir, struct dentry *dentry, static struct dentry *_nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, struct iattr *sattr, - struct nfs4_label *label) + struct nfs4_label *label, int *statusp) { struct nfs4_createdata *data; - struct dentry *ret = ERR_PTR(-ENOMEM); + struct dentry *ret = NULL; + *statusp = -ENOMEM; data = nfs4_alloc_createdata(dir, &dentry->d_name, sattr, NF4DIR); if (data == NULL) goto out; data->arg.label = label; - ret = nfs4_do_mkdir(dir, dentry, data); + ret = nfs4_do_mkdir(dir, dentry, data, statusp); nfs4_free_createdata(data); out: @@ -5273,11 +5280,12 @@ static struct dentry *nfs4_proc_mkdir(struct inode *dir, struct dentry *dentry, if (!(server->attr_bitmask[2] & FATTR4_WORD2_MODE_UMASK)) sattr->ia_mode &= ~current_umask(); do { - alias = _nfs4_proc_mkdir(dir, dentry, sattr, label); - err = PTR_ERR_OR_ZERO(alias); + alias = _nfs4_proc_mkdir(dir, dentry, sattr, label, &err); trace_nfs4_mkdir(dir, &dentry->d_name, err); - err = nfs4_handle_exception(NFS_SERVER(dir), err, - &exception); + if (err) + alias = ERR_PTR(nfs4_handle_exception(NFS_SERVER(dir), + err, + &exception)); } while (exception.retry); nfs4_label_release_security(label); @@ -6211,6 +6219,8 @@ static ssize_t nfs4_proc_get_acl(struct inode *inode, void *buf, size_t buflen, struct nfs_server *server = NFS_SERVER(inode); int ret; + if (unlikely(NFS_FH(inode)->size == 0)) + return -ENODATA; if (!nfs4_server_supports_acls(server, type)) return -EOPNOTSUPP; ret = nfs_revalidate_inode(inode, NFS_INO_INVALID_CHANGE); @@ -6285,6 +6295,9 @@ static int nfs4_proc_set_acl(struct inode *inode, const void *buf, { struct nfs4_exception exception = { }; int err; + + if (unlikely(NFS_FH(inode)->size == 0)) + return -ENODATA; do { err = __nfs4_proc_set_acl(inode, buf, buflen, type); trace_nfs4_set_acl(inode, err); @@ -10611,7 +10624,7 @@ static const struct rpc_call_ops nfs41_free_stateid_ops = { * Note: this function is always asynchronous. 
*/ static int nfs41_free_stateid(struct nfs_server *server, - const nfs4_stateid *stateid, + nfs4_stateid *stateid, const struct cred *cred, bool privileged) { @@ -10651,6 +10664,7 @@ static int nfs41_free_stateid(struct nfs_server *server, if (IS_ERR(task)) return PTR_ERR(task); rpc_put_task(task); + stateid->type = NFS4_FREED_STATEID_TYPE; return 0; } @@ -10817,6 +10831,7 @@ static const struct nfs4_minor_version_ops nfs_v4_2_minor_ops = { | NFS_CAP_OFFLOAD_CANCEL | NFS_CAP_COPY_NOTIFY | NFS_CAP_DEALLOCATE + | NFS_CAP_ZERO_RANGE | NFS_CAP_SEEK | NFS_CAP_LAYOUTSTATS | NFS_CAP_CLONE @@ -10852,7 +10867,7 @@ const struct nfs4_minor_version_ops *nfs_v4_minor_ops[] = { static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) { - ssize_t error, error2, error3; + ssize_t error, error2, error3, error4; size_t left = size; error = generic_listxattr(dentry, list, left); @@ -10875,8 +10890,16 @@ static ssize_t nfs4_listxattr(struct dentry *dentry, char *list, size_t size) error3 = nfs4_listxattr_nfs4_user(d_inode(dentry), list, left); if (error3 < 0) return error3; + if (list) { + list += error3; + left -= error3; + } + + error4 = security_inode_listsecurity(d_inode(dentry), list, left); + if (error4 < 0) + return error4; - error += error2 + error3; + error += error2 + error3 + error4; if (size && error > size) return -ERANGE; return error; diff --git a/fs/nfs/nfs4xdr.c b/fs/nfs/nfs4xdr.c index 55bef5fbfa47..318afde38057 100644 --- a/fs/nfs/nfs4xdr.c +++ b/fs/nfs/nfs4xdr.c @@ -7711,6 +7711,7 @@ const struct rpc_procinfo nfs4_procedures[] = { PROC42(LISTXATTRS, enc_listxattrs, dec_listxattrs), PROC42(REMOVEXATTR, enc_removexattr, dec_removexattr), PROC42(READ_PLUS, enc_read_plus, dec_read_plus), + PROC42(ZERO_RANGE, enc_zero_range, dec_zero_range), }; static unsigned int nfs_version4_counts[ARRAY_SIZE(nfs4_procedures)]; diff --git a/fs/nfs/pnfs_nfs.c b/fs/nfs/pnfs_nfs.c index 91ef486f40b9..b4ccdf78d4dd 100644 --- a/fs/nfs/pnfs_nfs.c +++ b/fs/nfs/pnfs_nfs.c @@ -830,10 +830,16 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, .servername = clp->cl_hostname, .connect_timeout = connect_timeout, .reconnect_timeout = connect_timeout, + .xprtsec = clp->cl_xprtsec, }; - if (da->da_transport != clp->cl_proto) + if (da->da_transport != clp->cl_proto && + clp->cl_proto != XPRT_TRANSPORT_TCP_TLS) continue; + if (da->da_transport == XPRT_TRANSPORT_TCP && + mds_srv->nfs_client->cl_proto == XPRT_TRANSPORT_TCP_TLS) + xprt_args.ident = XPRT_TRANSPORT_TCP_TLS; + if (da->da_addr.ss_family != clp->cl_addr.ss_family) continue; /* Add this address as an alias */ @@ -841,6 +847,9 @@ static int _nfs4_pnfs_v3_ds_connect(struct nfs_server *mds_srv, rpc_clnt_test_and_add_xprt, NULL); continue; } + if (da->da_transport == XPRT_TRANSPORT_TCP && + mds_srv->nfs_client->cl_proto == XPRT_TRANSPORT_TCP_TLS) + da->da_transport = XPRT_TRANSPORT_TCP_TLS; clp = get_v3_ds_connect(mds_srv, &da->da_addr, da->da_addrlen, da->da_transport, diff --git a/fs/nfs/read.c b/fs/nfs/read.c index 81bd1b9aba17..3c1fa320b3f1 100644 --- a/fs/nfs/read.c +++ b/fs/nfs/read.c @@ -56,7 +56,8 @@ static int nfs_return_empty_folio(struct folio *folio) { folio_zero_segment(folio, 0, folio_size(folio)); folio_mark_uptodate(folio); - folio_unlock(folio); + if (nfs_netfs_folio_unlock(folio)) + folio_unlock(folio); return 0; } diff --git a/fs/nfs/super.c b/fs/nfs/super.c index 9eea9e62afc9..91b5503b6f74 100644 --- a/fs/nfs/super.c +++ b/fs/nfs/super.c @@ -1052,6 +1052,16 @@ int nfs_reconfigure(struct fs_context *fc) 
sync_filesystem(sb); /* + * The SB_RDONLY flag has been removed from the superblock during + * mounts to prevent interference between different filesystems. + * Similarly, it is also necessary to ignore the SB_RDONLY flag + * during reconfiguration; otherwise, it may also result in the + * creation of redundant superblocks when mounting a directory with + * different rw and ro flags multiple times. + */ + fc->sb_flags_mask &= ~SB_RDONLY; + + /* * Userspace mount programs that send binary options generally send * them populated with default values. We have no way to know which * ones were explicitly specified. Fall back to legacy behavior and @@ -1308,8 +1318,17 @@ int nfs_get_tree_common(struct fs_context *fc) if (IS_ERR(server)) return PTR_ERR(server); + /* + * When NFS_MOUNT_UNSHARED is not set, NFS forces the sharing of a + * superblock among each filesystem that mounts sub-directories + * belonging to a single exported root path. + * To prevent interference between different filesystems, the + * SB_RDONLY flag should be removed from the superblock. + */ if (server->flags & NFS_MOUNT_UNSHARED) compare_super = NULL; + else + fc->sb_flags &= ~SB_RDONLY; /* -o noac implies -o sync */ if (server->flags & NFS_MOUNT_NOAC) diff --git a/fs/nfs/sysfs.c b/fs/nfs/sysfs.c index 37cb2b776435..545148d42dcc 100644 --- a/fs/nfs/sysfs.c +++ b/fs/nfs/sysfs.c @@ -387,6 +387,33 @@ static inline void nfs_sysfs_add_nfsv41_server(struct nfs_server *server) } #endif /* CONFIG_NFS_V4_1 */ +#if IS_ENABLED(CONFIG_NFS_LOCALIO) + +static ssize_t +localio_show(struct kobject *kobj, struct kobj_attribute *attr, + char *buf) +{ + struct nfs_server *server = container_of(kobj, struct nfs_server, kobj); + bool localio = nfs_server_is_local(server->nfs_client); + return sysfs_emit(buf, "%d\n", localio); +} + +static struct kobj_attribute nfs_sysfs_attr_localio = __ATTR_RO(localio); + +static void nfs_sysfs_add_nfs_localio_server(struct nfs_server *server) +{ + int ret = sysfs_create_file_ns(&server->kobj, &nfs_sysfs_attr_localio.attr, + nfs_netns_server_namespace(&server->kobj)); + if (ret < 0) + pr_warn("NFS: sysfs_create_file_ns for server-%d failed (%d)\n", + server->s_sysfs_id, ret); +} +#else +static inline void nfs_sysfs_add_nfs_localio_server(struct nfs_server *server) +{ +} +#endif /* IS_ENABLED(CONFIG_NFS_LOCALIO) */ + void nfs_sysfs_add_server(struct nfs_server *server) { int ret; @@ -405,6 +432,7 @@ void nfs_sysfs_add_server(struct nfs_server *server) server->s_sysfs_id, ret); nfs_sysfs_add_nfsv41_server(server); + nfs_sysfs_add_nfs_localio_server(server); } EXPORT_SYMBOL_GPL(nfs_sysfs_add_server); diff --git a/fs/nfs/write.c b/fs/nfs/write.c index 23df8b214474..374fc6b34c79 100644 --- a/fs/nfs/write.c +++ b/fs/nfs/write.c @@ -632,19 +632,19 @@ static void nfs_write_error(struct nfs_page *req, int error) * Find an associated nfs write request, and prepare to flush it out * May return an error if the user signalled nfs_wait_on_request(). 
*/ -static int nfs_page_async_flush(struct folio *folio, - struct writeback_control *wbc, - struct nfs_pageio_descriptor *pgio) +static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc, + struct nfs_pageio_descriptor *pgio) { struct nfs_page *req; - int ret = 0; + int ret; + + nfs_pageio_cond_complete(pgio, folio->index); req = nfs_lock_and_join_requests(folio); if (!req) - goto out; - ret = PTR_ERR(req); + return 0; if (IS_ERR(req)) - goto out; + return PTR_ERR(req); nfs_folio_set_writeback(folio); WARN_ON_ONCE(test_bit(PG_CLEAN, &req->wb_flags)); @@ -654,7 +654,6 @@ static int nfs_page_async_flush(struct folio *folio, if (nfs_error_is_fatal_on_server(ret)) goto out_launder; - ret = 0; if (!nfs_pageio_add_request(pgio, req)) { ret = pgio->pg_error; /* @@ -662,28 +661,20 @@ static int nfs_page_async_flush(struct folio *folio, */ if (nfs_error_is_fatal_on_server(ret)) goto out_launder; - if (wbc->sync_mode == WB_SYNC_NONE) - ret = AOP_WRITEPAGE_ACTIVATE; folio_redirty_for_writepage(wbc, folio); nfs_redirty_request(req); pgio->pg_error = 0; - } else - nfs_add_stats(folio->mapping->host, - NFSIOS_WRITEPAGES, 1); -out: - return ret; + return ret; + } + + nfs_add_stats(folio->mapping->host, NFSIOS_WRITEPAGES, 1); + return 0; + out_launder: nfs_write_error(req, ret); return 0; } -static int nfs_do_writepage(struct folio *folio, struct writeback_control *wbc, - struct nfs_pageio_descriptor *pgio) -{ - nfs_pageio_cond_complete(pgio, folio->index); - return nfs_page_async_flush(folio, wbc, pgio); -} - /* * Write an mmapped page to the server. */ @@ -703,17 +694,6 @@ static int nfs_writepage_locked(struct folio *folio, return err; } -static int nfs_writepages_callback(struct folio *folio, - struct writeback_control *wbc, void *data) -{ - int ret; - - ret = nfs_do_writepage(folio, wbc, data); - if (ret != AOP_WRITEPAGE_ACTIVATE) - folio_unlock(folio); - return ret; -} - static void nfs_io_completion_commit(void *inode) { nfs_commit_inode(inode, 0); @@ -749,11 +729,15 @@ int nfs_writepages(struct address_space *mapping, struct writeback_control *wbc) } do { + struct folio *folio = NULL; + nfs_pageio_init_write(&pgio, inode, priority, false, &nfs_async_write_completion_ops); pgio.pg_io_completion = ioc; - err = write_cache_pages(mapping, wbc, nfs_writepages_callback, - &pgio); + while ((folio = writeback_iter(mapping, wbc, folio, &err))) { + err = nfs_do_writepage(folio, wbc, &pgio); + folio_unlock(folio); + } pgio.pg_error = 0; nfs_pageio_complete(&pgio); if (err == -EAGAIN && mntflags & NFS_MOUNT_SOFTERR) diff --git a/fs/nfs_common/nfslocalio.c b/fs/nfs_common/nfslocalio.c index 6a0bdea6d644..05c7c16e37ab 100644 --- a/fs/nfs_common/nfslocalio.c +++ b/fs/nfs_common/nfslocalio.c @@ -151,8 +151,7 @@ EXPORT_SYMBOL_GPL(nfs_localio_enable_client); */ static bool nfs_uuid_put(nfs_uuid_t *nfs_uuid) { - LIST_HEAD(local_files); - struct nfs_file_localio *nfl, *tmp; + struct nfs_file_localio *nfl; spin_lock(&nfs_uuid->lock); if (unlikely(!rcu_access_pointer(nfs_uuid->net))) { @@ -166,17 +165,42 @@ static bool nfs_uuid_put(nfs_uuid_t *nfs_uuid) nfs_uuid->dom = NULL; } - list_splice_init(&nfs_uuid->files, &local_files); - spin_unlock(&nfs_uuid->lock); - /* Walk list of files and ensure their last references dropped */ - list_for_each_entry_safe(nfl, tmp, &local_files, list) { - nfs_close_local_fh(nfl); + + while ((nfl = list_first_entry_or_null(&nfs_uuid->files, + struct nfs_file_localio, + list)) != NULL) { + /* If nfs_uuid is already NULL, nfs_close_local_fh is + * closing and we must 
wait, else we unlink and close. + */ + if (rcu_access_pointer(nfl->nfs_uuid) == NULL) { + /* nfs_close_local_fh() is doing the + * close and we must wait. until it unlinks + */ + wait_var_event_spinlock(nfl, + list_first_entry_or_null( + &nfs_uuid->files, + struct nfs_file_localio, + list) != nfl, + &nfs_uuid->lock); + continue; + } + + /* Remove nfl from nfs_uuid->files list */ + list_del_init(&nfl->list); + spin_unlock(&nfs_uuid->lock); + + nfs_to_nfsd_file_put_local(&nfl->ro_file); + nfs_to_nfsd_file_put_local(&nfl->rw_file); cond_resched(); - } - spin_lock(&nfs_uuid->lock); - BUG_ON(!list_empty(&nfs_uuid->files)); + spin_lock(&nfs_uuid->lock); + /* Now we can allow racing nfs_close_local_fh() to + * skip the locking. + */ + RCU_INIT_POINTER(nfl->nfs_uuid, NULL); + wake_up_var_locked(&nfl->nfs_uuid, &nfs_uuid->lock); + } /* Remove client from nn->local_clients */ if (nfs_uuid->list_lock) { @@ -237,6 +261,7 @@ static void nfs_uuid_add_file(nfs_uuid_t *nfs_uuid, struct nfs_file_localio *nfl struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid, struct rpc_clnt *rpc_clnt, const struct cred *cred, const struct nfs_fh *nfs_fh, struct nfs_file_localio *nfl, + struct nfsd_file __rcu **pnf, const fmode_t fmode) { struct net *net; @@ -261,10 +286,9 @@ struct nfsd_file *nfs_open_local_fh(nfs_uuid_t *uuid, rcu_read_unlock(); /* We have an implied reference to net thanks to nfsd_net_try_get */ localio = nfs_to->nfsd_open_local_fh(net, uuid->dom, rpc_clnt, - cred, nfs_fh, fmode); - if (IS_ERR(localio)) - nfs_to_nfsd_net_put(net); - else + cred, nfs_fh, pnf, fmode); + nfs_to_nfsd_net_put(net); + if (!IS_ERR(localio)) nfs_uuid_add_file(uuid, nfl); return localio; @@ -273,8 +297,6 @@ EXPORT_SYMBOL_GPL(nfs_open_local_fh); void nfs_close_local_fh(struct nfs_file_localio *nfl) { - struct nfsd_file *ro_nf = NULL; - struct nfsd_file *rw_nf = NULL; nfs_uuid_t *nfs_uuid; rcu_read_lock(); @@ -285,28 +307,39 @@ void nfs_close_local_fh(struct nfs_file_localio *nfl) return; } - ro_nf = rcu_access_pointer(nfl->ro_file); - rw_nf = rcu_access_pointer(nfl->rw_file); - if (ro_nf || rw_nf) { - spin_lock(&nfs_uuid->lock); - if (ro_nf) - ro_nf = rcu_dereference_protected(xchg(&nfl->ro_file, NULL), 1); - if (rw_nf) - rw_nf = rcu_dereference_protected(xchg(&nfl->rw_file, NULL), 1); - - /* Remove nfl from nfs_uuid->files list */ - RCU_INIT_POINTER(nfl->nfs_uuid, NULL); - list_del_init(&nfl->list); + spin_lock(&nfs_uuid->lock); + if (!rcu_access_pointer(nfl->nfs_uuid)) { + /* nfs_uuid_put has finished here */ spin_unlock(&nfs_uuid->lock); rcu_read_unlock(); - - if (ro_nf) - nfs_to_nfsd_file_put_local(ro_nf); - if (rw_nf) - nfs_to_nfsd_file_put_local(rw_nf); return; } + if (list_empty(&nfs_uuid->files)) { + /* nfs_uuid_put() has started closing files, wait for it + * to finished + */ + spin_unlock(&nfs_uuid->lock); + rcu_read_unlock(); + wait_var_event(&nfl->nfs_uuid, + rcu_access_pointer(nfl->nfs_uuid) == NULL); + return; + } + /* tell nfs_uuid_put() to wait for us */ + RCU_INIT_POINTER(nfl->nfs_uuid, NULL); + spin_unlock(&nfs_uuid->lock); rcu_read_unlock(); + + nfs_to_nfsd_file_put_local(&nfl->ro_file); + nfs_to_nfsd_file_put_local(&nfl->rw_file); + + /* Remove nfl from nfs_uuid->files list and signal nfs_uuid_put() + * that we are done. The moment we drop the spinlock the + * nfs_uuid could be freed. 
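/*
 * A minimal sketch, with hypothetical types, of the wait/wake pairing the
 * nfslocalio.c rework above relies on: the side doing the slow teardown
 * drops the spinlock while it works and signals under the lock when done;
 * the other side, finding the entry busy, sleeps with
 * wait_var_event_spinlock(), which releases and retakes that same lock.
 */
struct busy_item {
        struct list_head        list;
        bool                    closing;
};

static void busy_item_close(struct busy_item *it, spinlock_t *lock)
{
        spin_lock(lock);
        it->closing = true;
        spin_unlock(lock);

        /* ... teardown that must not run under the lock ... */

        spin_lock(lock);
        it->closing = false;
        list_del_init(&it->list);
        wake_up_var_locked(it, lock);
        spin_unlock(lock);
}

static void busy_item_wait(struct busy_item *it, spinlock_t *lock)
{
        spin_lock(lock);
        if (it->closing)
                wait_var_event_spinlock(it, !it->closing, lock);
        spin_unlock(lock);
}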
+ */ + spin_lock(&nfs_uuid->lock); + list_del_init(&nfl->list); + wake_up_var_locked(&nfl->nfs_uuid, &nfs_uuid->lock); + spin_unlock(&nfs_uuid->lock); } EXPORT_SYMBOL_GPL(nfs_close_local_fh); diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig index 731a88f6313e..879e0b104d1c 100644 --- a/fs/nfsd/Kconfig +++ b/fs/nfsd/Kconfig @@ -77,8 +77,8 @@ config NFSD_V4 select FS_POSIX_ACL select RPCSEC_GSS_KRB5 select CRYPTO + select CRYPTO_LIB_SHA256 select CRYPTO_MD5 - select CRYPTO_SHA256 select GRACE_PERIOD select NFS_V4_2_SSC_HELPER if NFS_V4_2 help diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile index 2f687619f65b..55744bb786c9 100644 --- a/fs/nfsd/Makefile +++ b/fs/nfsd/Makefile @@ -24,6 +24,7 @@ nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o nfsd-$(CONFIG_NFS_LOCALIO) += localio.o +nfsd-$(CONFIG_DEBUG_FS) += debugfs.o .PHONY: xdrgen diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c new file mode 100644 index 000000000000..84b0c8b559dc --- /dev/null +++ b/fs/nfsd/debugfs.c @@ -0,0 +1,47 @@ +// SPDX-License-Identifier: GPL-2.0 + +#include <linux/debugfs.h> + +#include "nfsd.h" + +static struct dentry *nfsd_top_dir __read_mostly; + +/* + * /sys/kernel/debug/nfsd/disable-splice-read + * + * Contents: + * %0: NFS READ is allowed to use page splicing + * %1: NFS READ uses only iov iter read + * + * The default value of this setting is zero (page splicing is + * allowed). This setting takes immediate effect for all NFS + * versions, all exports, and in all NFSD net namespaces. + */ + +static int nfsd_dsr_get(void *data, u64 *val) +{ + *val = nfsd_disable_splice_read ? 1 : 0; + return 0; +} + +static int nfsd_dsr_set(void *data, u64 val) +{ + nfsd_disable_splice_read = (val > 0) ? true : false; + return 0; +} + +DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n"); + +void nfsd_debugfs_exit(void) +{ + debugfs_remove_recursive(nfsd_top_dir); + nfsd_top_dir = NULL; +} + +void nfsd_debugfs_init(void) +{ + nfsd_top_dir = debugfs_create_dir("nfsd", NULL); + + debugfs_create_file("disable-splice-read", S_IWUSR | S_IRUGO, + nfsd_top_dir, NULL, &nfsd_dsr_fops); +} diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c index 0363720280d4..88ae410b4113 100644 --- a/fs/nfsd/export.c +++ b/fs/nfsd/export.c @@ -1124,7 +1124,8 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp, test_bit(XPT_PEER_AUTH, &xprt->xpt_flags)) goto ok; } - goto denied; + if (!may_bypass_gss) + goto denied; ok: /* legacy gss-only clients are always OK: */ diff --git a/fs/nfsd/filecache.c b/fs/nfsd/filecache.c index ab85e6a2454f..e108b6c705b4 100644 --- a/fs/nfsd/filecache.c +++ b/fs/nfsd/filecache.c @@ -378,15 +378,41 @@ nfsd_file_put(struct nfsd_file *nf) * the reference of the nfsd_file. */ struct net * -nfsd_file_put_local(struct nfsd_file *nf) +nfsd_file_put_local(struct nfsd_file __rcu **pnf) { - struct net *net = nf->nf_net; + struct nfsd_file *nf; + struct net *net = NULL; - nfsd_file_put(nf); + nf = unrcu_pointer(xchg(pnf, NULL)); + if (nf) { + net = nf->nf_net; + nfsd_file_put(nf); + } return net; } /** + * nfsd_file_get_local - get nfsd_file reference and reference to net + * @nf: nfsd_file of which to put the reference + * + * Get reference to both the nfsd_file and nf->nf_net. 
+ */ +struct nfsd_file * +nfsd_file_get_local(struct nfsd_file *nf) +{ + struct net *net = nf->nf_net; + + if (nfsd_net_try_get(net)) { + nf = nfsd_file_get(nf); + if (!nf) + nfsd_net_put(net); + } else { + nf = NULL; + } + return nf; +} + +/** * nfsd_file_file - get the backing file of an nfsd_file * @nf: nfsd_file of which to access the backing file. * diff --git a/fs/nfsd/filecache.h b/fs/nfsd/filecache.h index 5865f9c72712..722b26c71e45 100644 --- a/fs/nfsd/filecache.h +++ b/fs/nfsd/filecache.h @@ -62,7 +62,8 @@ void nfsd_file_cache_shutdown(void); int nfsd_file_cache_start_net(struct net *net); void nfsd_file_cache_shutdown_net(struct net *net); void nfsd_file_put(struct nfsd_file *nf); -struct net *nfsd_file_put_local(struct nfsd_file *nf); +struct net *nfsd_file_put_local(struct nfsd_file __rcu **nf); +struct nfsd_file *nfsd_file_get_local(struct nfsd_file *nf); struct nfsd_file *nfsd_file_get(struct nfsd_file *nf); struct file *nfsd_file_file(struct nfsd_file *nf); void nfsd_file_close_inode_sync(struct inode *inode); diff --git a/fs/nfsd/localio.c b/fs/nfsd/localio.c index 238647fa379e..80d9ff6608a7 100644 --- a/fs/nfsd/localio.c +++ b/fs/nfsd/localio.c @@ -24,21 +24,6 @@ #include "filecache.h" #include "cache.h" -static const struct nfsd_localio_operations nfsd_localio_ops = { - .nfsd_net_try_get = nfsd_net_try_get, - .nfsd_net_put = nfsd_net_put, - .nfsd_open_local_fh = nfsd_open_local_fh, - .nfsd_file_put_local = nfsd_file_put_local, - .nfsd_file_get = nfsd_file_get, - .nfsd_file_put = nfsd_file_put, - .nfsd_file_file = nfsd_file_file, -}; - -void nfsd_localio_ops_init(void) -{ - nfs_to = &nfsd_localio_ops; -} - /** * nfsd_open_local_fh - lookup a local filehandle @nfs_fh and map to nfsd_file * @@ -47,6 +32,7 @@ void nfsd_localio_ops_init(void) * @rpc_clnt: rpc_clnt that the client established * @cred: cred that the client established * @nfs_fh: filehandle to lookup + * @nfp: place to find the nfsd_file, or store it if it was non-NULL * @fmode: fmode_t to use for open * * This function maps a local fh to a path on a local filesystem. @@ -57,10 +43,11 @@ void nfsd_localio_ops_init(void) * set. Caller (NFS client) is responsible for calling nfsd_net_put and * nfsd_file_put (via nfs_to_nfsd_file_put_local). 
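/*
 * A minimal sketch, with hypothetical type and helpers, of the
 * "install once" caching idiom the new nfsd_open_local_fh() below
 * applies to @pnf: the first caller publishes its object with
 * cmpxchg(); a racing caller drops its own copy and reuses the
 * published one. The real code additionally retries when the
 * published file is already being torn down and keeps the slot
 * RCU-annotated.
 */
static struct cached_obj *install_once(struct cached_obj **slot,
                                       struct cached_obj *mine)
{
        struct cached_obj *old = cmpxchg(slot, NULL, mine);

        if (!old)
                return mine;            /* our copy is now the cached one */
        cached_obj_put(mine);           /* lost the race: drop our copy */
        return cached_obj_get(old);     /* and reuse the winner's */
}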
*/ -struct nfsd_file * +static struct nfsd_file * nfsd_open_local_fh(struct net *net, struct auth_domain *dom, struct rpc_clnt *rpc_clnt, const struct cred *cred, - const struct nfs_fh *nfs_fh, const fmode_t fmode) + const struct nfs_fh *nfs_fh, struct nfsd_file __rcu **pnf, + const fmode_t fmode) { int mayflags = NFSD_MAY_LOCALIO; struct svc_cred rq_cred; @@ -71,6 +58,15 @@ nfsd_open_local_fh(struct net *net, struct auth_domain *dom, if (nfs_fh->size > NFS4_FHSIZE) return ERR_PTR(-EINVAL); + if (!nfsd_net_try_get(net)) + return ERR_PTR(-ENXIO); + + rcu_read_lock(); + localio = nfsd_file_get(rcu_dereference(*pnf)); + rcu_read_unlock(); + if (localio) + return localio; + /* nfs_fh -> svc_fh */ fh_init(&fh, NFS4_FHSIZE); fh.fh_handle.fh_size = nfs_fh->size; @@ -92,9 +88,47 @@ nfsd_open_local_fh(struct net *net, struct auth_domain *dom, if (rq_cred.cr_group_info) put_group_info(rq_cred.cr_group_info); + if (!IS_ERR(localio)) { + struct nfsd_file *new; + if (!nfsd_net_try_get(net)) { + nfsd_file_put(localio); + nfsd_net_put(net); + return ERR_PTR(-ENXIO); + } + nfsd_file_get(localio); + again: + new = unrcu_pointer(cmpxchg(pnf, NULL, RCU_INITIALIZER(localio))); + if (new) { + /* Some other thread installed an nfsd_file */ + if (nfsd_file_get(new) == NULL) + goto again; + /* + * Drop the ref we were going to install and the + * one we were going to return. + */ + nfsd_file_put(localio); + nfsd_file_put(localio); + localio = new; + } + } else + nfsd_net_put(net); + return localio; } -EXPORT_SYMBOL_GPL(nfsd_open_local_fh); + +static const struct nfsd_localio_operations nfsd_localio_ops = { + .nfsd_net_try_get = nfsd_net_try_get, + .nfsd_net_put = nfsd_net_put, + .nfsd_open_local_fh = nfsd_open_local_fh, + .nfsd_file_put_local = nfsd_file_put_local, + .nfsd_file_get_local = nfsd_file_get_local, + .nfsd_file_file = nfsd_file_file, +}; + +void nfsd_localio_ops_init(void) +{ + nfs_to = &nfsd_localio_ops; +} /* * UUID_IS_LOCAL XDR functions diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c index ac1731eb34ab..a817d8485d21 100644 --- a/fs/nfsd/nfs3proc.c +++ b/fs/nfsd/nfs3proc.c @@ -14,6 +14,7 @@ #include "xdr3.h" #include "vfs.h" #include "filecache.h" +#include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -69,8 +70,7 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp) struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd3_attrstat *resp = rqstp->rq_resp; - dprintk("nfsd: GETATTR(3) %s\n", - SVCFH_fmt(&argp->fh)); + trace_nfsd_vfs_getattr(rqstp, &argp->fh); fh_copy(&resp->fh, &argp->fh); resp->status = fh_verify(rqstp, &resp->fh, 0, @@ -220,7 +220,6 @@ nfsd3_proc_write(struct svc_rqst *rqstp) struct nfsd3_writeargs *argp = rqstp->rq_argp; struct nfsd3_writeres *resp = rqstp->rq_resp; unsigned long cnt = argp->len; - unsigned int nvecs; dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n", SVCFH_fmt(&argp->fh), @@ -235,10 +234,8 @@ nfsd3_proc_write(struct svc_rqst *rqstp) fh_copy(&resp->fh, &argp->fh); resp->committed = argp->stable; - nvecs = svc_fill_write_vector(rqstp, &argp->payload); - resp->status = nfsd_write(rqstp, &resp->fh, argp->offset, - rqstp->rq_vec, nvecs, &cnt, + &argp->payload, &cnt, resp->committed, resp->verf); resp->count = cnt; resp->status = nfsd3_map_status(resp->status); @@ -266,6 +263,8 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 status; int host_err; + trace_nfsd_vfs_create(rqstp, fhp, S_IFREG, argp->name, argp->len); + if (isdotent(argp->name, argp->len)) return nfserr_exist; if (!(iap->ia_valid & ATTR_MODE)) @@ -382,11 +381,6 @@ 
nfsd3_proc_create(struct svc_rqst *rqstp) struct nfsd3_diropres *resp = rqstp->rq_resp; svc_fh *dirfhp, *newfhp; - dprintk("nfsd: CREATE(3) %s %.*s\n", - SVCFH_fmt(&argp->fh), - argp->len, - argp->name); - dirfhp = fh_copy(&resp->dirfh, &argp->fh); newfhp = fh_init(&resp->fh, NFS3_FHSIZE); @@ -407,11 +401,6 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp) .na_iattr = &argp->attrs, }; - dprintk("nfsd: MKDIR(3) %s %.*s\n", - SVCFH_fmt(&argp->fh), - argp->len, - argp->name); - argp->attrs.ia_valid &= ~ATTR_SIZE; fh_copy(&resp->dirfh, &argp->fh); fh_init(&resp->fh, NFS3_FHSIZE); @@ -447,11 +436,6 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp) goto out; } - dprintk("nfsd: SYMLINK(3) %s %.*s -> %.*s\n", - SVCFH_fmt(&argp->ffh), - argp->flen, argp->fname, - argp->tlen, argp->tname); - fh_copy(&resp->dirfh, &argp->ffh); fh_init(&resp->fh, NFS3_FHSIZE); resp->status = nfsd_symlink(rqstp, &resp->dirfh, argp->fname, @@ -476,11 +460,6 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp) int type; dev_t rdev = 0; - dprintk("nfsd: MKNOD(3) %s %.*s\n", - SVCFH_fmt(&argp->fh), - argp->len, - argp->name); - fh_copy(&resp->dirfh, &argp->fh); fh_init(&resp->fh, NFS3_FHSIZE); @@ -513,11 +492,6 @@ nfsd3_proc_remove(struct svc_rqst *rqstp) struct nfsd3_diropargs *argp = rqstp->rq_argp; struct nfsd3_attrstat *resp = rqstp->rq_resp; - dprintk("nfsd: REMOVE(3) %s %.*s\n", - SVCFH_fmt(&argp->fh), - argp->len, - argp->name); - /* Unlink. -S_IFDIR means file must not be a directory */ fh_copy(&resp->fh, &argp->fh); resp->status = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR, @@ -535,11 +509,6 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp) struct nfsd3_diropargs *argp = rqstp->rq_argp; struct nfsd3_attrstat *resp = rqstp->rq_resp; - dprintk("nfsd: RMDIR(3) %s %.*s\n", - SVCFH_fmt(&argp->fh), - argp->len, - argp->name); - fh_copy(&resp->fh, &argp->fh); resp->status = nfsd_unlink(rqstp, &resp->fh, S_IFDIR, argp->name, argp->len); @@ -553,15 +522,6 @@ nfsd3_proc_rename(struct svc_rqst *rqstp) struct nfsd3_renameargs *argp = rqstp->rq_argp; struct nfsd3_renameres *resp = rqstp->rq_resp; - dprintk("nfsd: RENAME(3) %s %.*s ->\n", - SVCFH_fmt(&argp->ffh), - argp->flen, - argp->fname); - dprintk("nfsd: -> %s %.*s\n", - SVCFH_fmt(&argp->tfh), - argp->tlen, - argp->tname); - fh_copy(&resp->ffh, &argp->ffh); fh_copy(&resp->tfh, &argp->tfh); resp->status = nfsd_rename(rqstp, &resp->ffh, argp->fname, argp->flen, @@ -576,13 +536,6 @@ nfsd3_proc_link(struct svc_rqst *rqstp) struct nfsd3_linkargs *argp = rqstp->rq_argp; struct nfsd3_linkres *resp = rqstp->rq_resp; - dprintk("nfsd: LINK(3) %s ->\n", - SVCFH_fmt(&argp->ffh)); - dprintk("nfsd: -> %s %.*s\n", - SVCFH_fmt(&argp->tfh), - argp->tlen, - argp->tname); - fh_copy(&resp->fh, &argp->ffh); fh_copy(&resp->tfh, &argp->tfh); resp->status = nfsd_link(rqstp, &resp->tfh, argp->tname, argp->tlen, @@ -621,9 +574,7 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp) struct nfsd3_readdirres *resp = rqstp->rq_resp; loff_t offset; - dprintk("nfsd: READDIR(3) %s %d bytes at %d\n", - SVCFH_fmt(&argp->fh), - argp->count, (u32) argp->cookie); + trace_nfsd_vfs_readdir(rqstp, &argp->fh, argp->count, argp->cookie); nfsd3_init_dirlist_pages(rqstp, resp, argp->count); @@ -655,9 +606,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp) struct nfsd3_readdirres *resp = rqstp->rq_resp; loff_t offset; - dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n", - SVCFH_fmt(&argp->fh), - argp->count, (u32) argp->cookie); + trace_nfsd_vfs_readdir(rqstp, &argp->fh, argp->count, argp->cookie); nfsd3_init_dirlist_pages(rqstp, resp, argp->count); @@ 
-698,9 +647,6 @@ nfsd3_proc_fsstat(struct svc_rqst *rqstp) struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd3_fsstatres *resp = rqstp->rq_resp; - dprintk("nfsd: FSSTAT(3) %s\n", - SVCFH_fmt(&argp->fh)); - resp->status = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0); fh_put(&argp->fh); resp->status = nfsd3_map_status(resp->status); diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c index ec6539cec0fe..ccb00aa93be0 100644 --- a/fs/nfsd/nfs4callback.c +++ b/fs/nfsd/nfs4callback.c @@ -417,6 +417,29 @@ static u32 highest_slotid(struct nfsd4_session *ses) return idx; } +static void +encode_referring_call4(struct xdr_stream *xdr, + const struct nfsd4_referring_call *rc) +{ + encode_uint32(xdr, rc->rc_sequenceid); + encode_uint32(xdr, rc->rc_slotid); +} + +static void +encode_referring_call_list4(struct xdr_stream *xdr, + const struct nfsd4_referring_call_list *rcl) +{ + struct nfsd4_referring_call *rc; + __be32 *p; + + p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN); + xdr_encode_opaque_fixed(p, rcl->rcl_sessionid.data, + NFS4_MAX_SESSIONID_LEN); + encode_uint32(xdr, rcl->__nr_referring_calls); + list_for_each_entry(rc, &rcl->rcl_referring_calls, __list) + encode_referring_call4(xdr, rc); +} + /* * CB_SEQUENCE4args * @@ -434,6 +457,7 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr, struct nfs4_cb_compound_hdr *hdr) { struct nfsd4_session *session = cb->cb_clp->cl_cb_session; + struct nfsd4_referring_call_list *rcl; __be32 *p; if (hdr->minorversion == 0) @@ -442,12 +466,16 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr, encode_nfs_cb_opnum4(xdr, OP_CB_SEQUENCE); encode_sessionid4(xdr, session); - p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4); + p = xdr_reserve_space(xdr, XDR_UNIT * 4); *p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]); /* csa_sequenceid */ *p++ = cpu_to_be32(cb->cb_held_slot); /* csa_slotid */ *p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */ *p++ = xdr_zero; /* csa_cachethis */ - xdr_encode_empty_array(p); /* csa_referring_call_lists */ + + /* csa_referring_call_lists */ + encode_uint32(xdr, cb->cb_nr_referring_call_list); + list_for_each_entry(rcl, &cb->cb_referring_call_list, __list) + encode_referring_call_list4(xdr, rcl); hdr->nops++; } @@ -1320,10 +1348,102 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb) nfsd41_cb_inflight_end(clp); } -/* - * TODO: cb_sequence should support referring call lists, cachethis, - * and mark callback channel down on communication errors. +/** + * nfsd41_cb_referring_call - add a referring call to a callback operation + * @cb: context of callback to add the rc to + * @sessionid: referring call's session ID + * @slotid: referring call's session slot index + * @seqno: referring call's slot sequence number + * + * Caller serializes access to @cb. + * + * NB: If memory allocation fails, the referring call is not added. 
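/*
 * A minimal usage sketch for the helpers documented above; @cb, @session
 * and @slot are illustrative stand-ins for a callback and the forechannel
 * request that triggered it (the CB_OFFLOAD path later in this series
 * follows this pattern when recording the originating COPY call):
 */
        nfsd41_cb_referring_call(cb, &session->se_sessionid,
                                 slot->sl_index, slot->sl_seqid);
        /* ... queue and run the callback; once it completes: */
        nfsd41_cb_destroy_referring_call_list(cb);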
*/ +void nfsd41_cb_referring_call(struct nfsd4_callback *cb, + struct nfs4_sessionid *sessionid, + u32 slotid, u32 seqno) +{ + struct nfsd4_referring_call_list *rcl; + struct nfsd4_referring_call *rc; + bool found; + + might_sleep(); + + found = false; + list_for_each_entry(rcl, &cb->cb_referring_call_list, __list) { + if (!memcmp(rcl->rcl_sessionid.data, sessionid->data, + NFS4_MAX_SESSIONID_LEN)) { + found = true; + break; + } + } + if (!found) { + rcl = kmalloc(sizeof(*rcl), GFP_KERNEL); + if (!rcl) + return; + memcpy(rcl->rcl_sessionid.data, sessionid->data, + NFS4_MAX_SESSIONID_LEN); + rcl->__nr_referring_calls = 0; + INIT_LIST_HEAD(&rcl->rcl_referring_calls); + list_add(&rcl->__list, &cb->cb_referring_call_list); + cb->cb_nr_referring_call_list++; + } + + found = false; + list_for_each_entry(rc, &rcl->rcl_referring_calls, __list) { + if (rc->rc_sequenceid == seqno && rc->rc_slotid == slotid) { + found = true; + break; + } + } + if (!found) { + rc = kmalloc(sizeof(*rc), GFP_KERNEL); + if (!rc) + goto out; + rc->rc_sequenceid = seqno; + rc->rc_slotid = slotid; + rcl->__nr_referring_calls++; + list_add(&rc->__list, &rcl->rcl_referring_calls); + } + +out: + if (!rcl->__nr_referring_calls) { + cb->cb_nr_referring_call_list--; + kfree(rcl); + } +} + +/** + * nfsd41_cb_destroy_referring_call_list - release referring call info + * @cb: context of a callback that has completed + * + * Callers who allocate referring calls using nfsd41_cb_referring_call() must + * release those resources by calling nfsd41_cb_destroy_referring_call_list. + * + * Caller serializes access to @cb. + */ +void nfsd41_cb_destroy_referring_call_list(struct nfsd4_callback *cb) +{ + struct nfsd4_referring_call_list *rcl; + struct nfsd4_referring_call *rc; + + while (!list_empty(&cb->cb_referring_call_list)) { + rcl = list_first_entry(&cb->cb_referring_call_list, + struct nfsd4_referring_call_list, + __list); + + while (!list_empty(&rcl->rcl_referring_calls)) { + rc = list_first_entry(&rcl->rcl_referring_calls, + struct nfsd4_referring_call, + __list); + list_del(&rc->__list); + kfree(rc); + } + list_del(&rcl->__list); + kfree(rcl); + } +} + static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata) { struct nfsd4_callback *cb = calldata; @@ -1643,6 +1763,8 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, INIT_WORK(&cb->cb_work, nfsd4_run_cb_work); cb->cb_status = 0; cb->cb_held_slot = -1; + cb->cb_nr_referring_call_list = 0; + INIT_LIST_HEAD(&cb->cb_referring_call_list); } /** diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c index fd560dcf6059..f13abbb13b38 100644 --- a/fs/nfsd/nfs4proc.c +++ b/fs/nfsd/nfs4proc.c @@ -878,6 +878,8 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd4_getattr *getattr = &u->getattr; __be32 status; + trace_nfsd_vfs_getattr(rqstp, &cstate->current_fh); + status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP); if (status) return status; @@ -1000,6 +1002,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, u64 cookie = readdir->rd_cookie; static const nfs4_verifier zeroverf; + trace_nfsd_vfs_readdir(rqstp, &cstate->current_fh, + readdir->rd_maxcount, readdir->rd_cookie); + /* no need to check permission - this will be done in nfsd_readdir() */ if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1) @@ -1213,7 +1218,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, struct nfsd_file *nf = NULL; __be32 status = nfs_ok; unsigned long cnt; - int nvecs; if 
(write->wr_offset > (u64)OFFSET_MAX || write->wr_offset + write->wr_buflen > (u64)OFFSET_MAX) @@ -1228,13 +1232,9 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, return status; write->wr_how_written = write->wr_stable_how; - - nvecs = svc_fill_write_vector(rqstp, &write->wr_payload); - WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec)); - status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf, - write->wr_offset, rqstp->rq_vec, nvecs, &cnt, - write->wr_how_written, + write->wr_offset, &write->wr_payload, + &cnt, write->wr_how_written, (__be32 *)write->wr_verifier.data); nfsd_file_put(nf); @@ -1381,8 +1381,11 @@ static void nfs4_put_copy(struct nfsd4_copy *copy) static void nfsd4_stop_copy(struct nfsd4_copy *copy) { trace_nfsd_copy_async_cancel(copy); - if (!test_and_set_bit(NFSD4_COPY_F_STOPPED, ©->cp_flags)) + if (!test_and_set_bit(NFSD4_COPY_F_STOPPED, ©->cp_flags)) { kthread_stop(copy->copy_task); + copy->nfserr = nfs_ok; + set_bit(NFSD4_COPY_F_COMPLETED, ©->cp_flags); + } nfs4_put_copy(copy); } @@ -1711,10 +1714,11 @@ static int nfsd4_cb_offload_done(struct nfsd4_callback *cb, switch (task->tk_status) { case -NFS4ERR_DELAY: if (cbo->co_retries--) { - rpc_delay(task, 1 * HZ); + rpc_delay(task, HZ / 5); return 0; } } + nfsd41_cb_destroy_referring_call_list(cb); return 1; } @@ -1847,6 +1851,9 @@ static void nfsd4_send_cb_offload(struct nfsd4_copy *copy) nfsd4_init_cb(&cbo->co_cb, copy->cp_clp, &nfsd4_cb_offload_ops, NFSPROC4_CLNT_CB_OFFLOAD); + nfsd41_cb_referring_call(&cbo->co_cb, &cbo->co_referring_sessionid, + cbo->co_referring_slotid, + cbo->co_referring_seqno); trace_nfsd_cb_offload(copy->cp_clp, &cbo->co_res.cb_stateid, &cbo->co_fh, copy->cp_count, copy->nfserr); nfsd4_try_run_cb(&cbo->co_cb); @@ -1963,6 +1970,11 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, memcpy(&result->cb_stateid, ©->cp_stateid.cs_stid, sizeof(result->cb_stateid)); dup_copy_fields(copy, async_copy); + memcpy(async_copy->cp_cb_offload.co_referring_sessionid.data, + cstate->session->se_sessionid.data, + NFS4_MAX_SESSIONID_LEN); + async_copy->cp_cb_offload.co_referring_slotid = cstate->slot->sl_index; + async_copy->cp_cb_offload.co_referring_seqno = cstate->slot->sl_seqid; async_copy->copy_task = kthread_create(nfsd4_do_async_copy, async_copy, "%s", "copy thread"); if (IS_ERR(async_copy->copy_task)) @@ -3768,7 +3780,8 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp) struct nfs4_op_map *allow = &cstate->clp->cl_spo_must_allow; u32 opiter; - if (!cstate->minorversion) + if (rqstp->rq_procinfo != &nfsd_version4.vs_proc[NFSPROC4_COMPOUND] || + cstate->minorversion == 0) return false; if (cstate->spo_must_allowed) @@ -3834,7 +3847,7 @@ static const struct svc_procedure nfsd_procedures4[2] = { .pc_ressize = sizeof(struct nfsd4_compoundres), .pc_release = nfsd4_release_compoundargs, .pc_cachetype = RC_NOCACHE, - .pc_xdrressize = NFSD_BUFSIZE/4, + .pc_xdrressize = 3+NFSSVC_MAXBLKSIZE/4, .pc_name = "COMPOUND", }, }; diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c index acde3edab733..82785db730d9 100644 --- a/fs/nfsd/nfs4recover.c +++ b/fs/nfsd/nfs4recover.c @@ -33,6 +33,7 @@ */ #include <crypto/hash.h> +#include <crypto/sha2.h> #include <linux/file.h> #include <linux/slab.h> #include <linux/namei.h> @@ -737,7 +738,6 @@ struct cld_net { spinlock_t cn_lock; struct list_head cn_list; unsigned int cn_xid; - struct crypto_shash *cn_tfm; #ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING bool cn_has_legacy; #endif @@ -1063,8 +1063,6 @@ nfsd4_remove_cld_pipe(struct net 
*net) nfsd4_cld_unregister_net(net, cn->cn_pipe); rpc_destroy_pipe_data(cn->cn_pipe); - if (cn->cn_tfm) - crypto_free_shash(cn->cn_tfm); kfree(nn->cld_net); nn->cld_net = NULL; } @@ -1158,8 +1156,6 @@ nfsd4_cld_create_v2(struct nfs4_client *clp) struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); struct cld_net *cn = nn->cld_net; struct cld_msg_v2 *cmsg; - struct crypto_shash *tfm = cn->cn_tfm; - struct xdr_netobj cksum; char *principal = NULL; /* Don't upcall if it's already stored */ @@ -1182,22 +1178,9 @@ nfsd4_cld_create_v2(struct nfs4_client *clp) else if (clp->cl_cred.cr_principal) principal = clp->cl_cred.cr_principal; if (principal) { - cksum.len = crypto_shash_digestsize(tfm); - cksum.data = kmalloc(cksum.len, GFP_KERNEL); - if (cksum.data == NULL) { - ret = -ENOMEM; - goto out; - } - ret = crypto_shash_tfm_digest(tfm, principal, strlen(principal), - cksum.data); - if (ret) { - kfree(cksum.data); - goto out; - } - cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = cksum.len; - memcpy(cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data, - cksum.data, cksum.len); - kfree(cksum.data); + sha256(principal, strlen(principal), + cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data); + cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = SHA256_DIGEST_SIZE; } else cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = 0; @@ -1207,7 +1190,6 @@ nfsd4_cld_create_v2(struct nfs4_client *clp) set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags); } -out: free_cld_upcall(cup); out_err: if (ret) @@ -1346,12 +1328,11 @@ found: static int nfsd4_cld_check_v2(struct nfs4_client *clp) { - struct nfs4_client_reclaim *crp; struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id); +#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING struct cld_net *cn = nn->cld_net; - int status; - struct crypto_shash *tfm = cn->cn_tfm; - struct xdr_netobj cksum; +#endif + struct nfs4_client_reclaim *crp; char *principal = NULL; /* did we already find that this client is stable? 
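/*
 * The recovery-tracking hunks above drop the "sha256" crypto_shash in
 * favour of the one-shot library helper from <crypto/sha2.h>, which needs
 * no transform allocation and cannot fail. A minimal sketch (helper name
 * hypothetical):
 */
#include <crypto/sha2.h>

static void hash_principal(const char *principal,
                           u8 digest[SHA256_DIGEST_SIZE])
{
        sha256(principal, strlen(principal), digest);
}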
*/ @@ -1367,6 +1348,7 @@ nfsd4_cld_check_v2(struct nfs4_client *clp) if (cn->cn_has_legacy) { struct xdr_netobj name; char dname[HEXDIR_LEN]; + int status; status = nfs4_make_rec_clidname(dname, &clp->cl_name); if (status) @@ -1389,28 +1371,18 @@ nfsd4_cld_check_v2(struct nfs4_client *clp) return -ENOENT; found: if (crp->cr_princhash.len) { + u8 digest[SHA256_DIGEST_SIZE]; + if (clp->cl_cred.cr_raw_principal) principal = clp->cl_cred.cr_raw_principal; else if (clp->cl_cred.cr_principal) principal = clp->cl_cred.cr_principal; if (principal == NULL) return -ENOENT; - cksum.len = crypto_shash_digestsize(tfm); - cksum.data = kmalloc(cksum.len, GFP_KERNEL); - if (cksum.data == NULL) - return -ENOENT; - status = crypto_shash_tfm_digest(tfm, principal, - strlen(principal), cksum.data); - if (status) { - kfree(cksum.data); + sha256(principal, strlen(principal), digest); + if (memcmp(crp->cr_princhash.data, digest, + crp->cr_princhash.len)) return -ENOENT; - } - if (memcmp(crp->cr_princhash.data, cksum.data, - crp->cr_princhash.len)) { - kfree(cksum.data); - return -ENOENT; - } - kfree(cksum.data); } crp->cr_clp = clp; return 0; @@ -1590,7 +1562,6 @@ nfsd4_cld_tracking_init(struct net *net) struct nfsd_net *nn = net_generic(net, nfsd_net_id); bool running; int retries = 10; - struct crypto_shash *tfm; status = nfs4_cld_state_init(net); if (status) @@ -1615,12 +1586,6 @@ nfsd4_cld_tracking_init(struct net *net) status = -ETIMEDOUT; goto err_remove; } - tfm = crypto_alloc_shash("sha256", 0, 0); - if (IS_ERR(tfm)) { - status = PTR_ERR(tfm); - goto err_remove; - } - nn->cld_net->cn_tfm = tfm; status = nfsd4_cld_get_version(nn); if (status == -EOPNOTSUPP) diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c index 59a693f22452..d5694987f86f 100644 --- a/fs/nfsd/nfs4state.c +++ b/fs/nfsd/nfs4state.c @@ -1987,26 +1987,30 @@ reduce_session_slots(struct nfsd4_session *ses, int dec) return ret; } -/* - * We don't actually need to cache the rpc and session headers, so we - * can allocate a little less for each slot: - */ -static inline u32 slot_bytes(struct nfsd4_channel_attrs *ca) +static struct nfsd4_slot *nfsd4_alloc_slot(struct nfsd4_channel_attrs *fattrs, + int index, gfp_t gfp) { - u32 size; + struct nfsd4_slot *slot; + size_t size; - if (ca->maxresp_cached < NFSD_MIN_HDR_SEQ_SZ) - size = 0; - else - size = ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; - return size + sizeof(struct nfsd4_slot); + /* + * The RPC and NFS session headers are never saved in + * the slot reply cache buffer. + */ + size = fattrs->maxresp_cached < NFSD_MIN_HDR_SEQ_SZ ? 
+ 0 : fattrs->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ; + + slot = kzalloc(struct_size(slot, sl_data, size), gfp); + if (!slot) + return NULL; + slot->sl_index = index; + return slot; } static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, struct nfsd4_channel_attrs *battrs) { int numslots = fattrs->maxreqs; - int slotsize = slot_bytes(fattrs); struct nfsd4_session *new; struct nfsd4_slot *slot; int i; @@ -2015,14 +2019,14 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs, if (!new) return NULL; xa_init(&new->se_slots); - /* allocate each struct nfsd4_slot and data cache in one piece */ - slot = kzalloc(slotsize, GFP_KERNEL); + + slot = nfsd4_alloc_slot(fattrs, 0, GFP_KERNEL); if (!slot || xa_is_err(xa_store(&new->se_slots, 0, slot, GFP_KERNEL))) goto out_free; for (i = 1; i < numslots; i++) { const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN; - slot = kzalloc(slotsize, gfp); + slot = nfsd4_alloc_slot(fattrs, i, gfp); if (!slot) break; if (xa_is_err(xa_store(&new->se_slots, i, slot, gfp))) { @@ -4402,7 +4406,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, nfserr_rep_too_big; if (xdr_restrict_buflen(xdr, buflen - rqstp->rq_auth_slack)) goto out_put_session; - svc_reserve(rqstp, buflen); + svc_reserve_auth(rqstp, buflen); status = nfs_ok; /* Success! accept new slot seqid */ @@ -4438,8 +4442,8 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate, * spinlock, and only succeeds if there is * plenty of memory. */ - slot = kzalloc(slot_bytes(&session->se_fchannel), - GFP_NOWAIT); + slot = nfsd4_alloc_slot(&session->se_fchannel, s, + GFP_NOWAIT); prev_slot = xa_load(&session->se_slots, s); if (xa_is_value(prev_slot) && slot) { slot->sl_seqid = xa_to_value(prev_slot); diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c index fe876395985a..3afcdbed6e14 100644 --- a/fs/nfsd/nfs4xdr.c +++ b/fs/nfsd/nfs4xdr.c @@ -2564,7 +2564,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp) /* Sessions make the DRC unnecessary: */ if (argp->minorversion) cachethis = false; - svc_reserve(argp->rqstp, max_reply + readbytes); + svc_reserve_auth(argp->rqstp, max_reply + readbytes); argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE; argp->splice_ok = nfsd_read_splice_ok(argp->rqstp); @@ -3391,6 +3391,23 @@ static __be32 nfsd4_encode_fattr4_suppattr_exclcreat(struct xdr_stream *xdr, return nfsd4_encode_bitmap4(xdr, supp[0], supp[1], supp[2]); } +/* + * Copied from generic_remap_checks/generic_remap_file_range_prep. + * + * These generic functions use the file system's s_blocksize, but + * individual file systems aren't required to use + * generic_remap_file_range_prep. Until there is a mechanism for + * determining a particular file system's (or file's) clone block + * size, this is the best NFSD can do. 
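/*
 * A minimal sketch of the struct_size() idiom nfsd4_alloc_slot() above
 * uses to allocate the slot header and its reply-cache buffer as a single
 * object; the type below is hypothetical.
 */
struct blob {
        size_t  len;
        u8      data[];                 /* flexible array member */
};

static struct blob *blob_alloc(size_t len, gfp_t gfp)
{
        /* struct_size() computes sizeof(*b) + len (element size 1 here)
         * with overflow checking. */
        struct blob *b = kzalloc(struct_size(b, data, len), gfp);

        if (b)
                b->len = len;
        return b;
}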
+ */ +static __be32 nfsd4_encode_fattr4_clone_blksize(struct xdr_stream *xdr, + const struct nfsd4_fattr_args *args) +{ + struct inode *inode = d_inode(args->dentry); + + return nfsd4_encode_uint32_t(xdr, inode->i_sb->s_blocksize); +} + #ifdef CONFIG_NFSD_V4_SECURITY_LABEL static __be32 nfsd4_encode_fattr4_sec_label(struct xdr_stream *xdr, const struct nfsd4_fattr_args *args) @@ -3545,7 +3562,7 @@ static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = { [FATTR4_MODE_SET_MASKED] = nfsd4_encode_fattr4__noop, [FATTR4_SUPPATTR_EXCLCREAT] = nfsd4_encode_fattr4_suppattr_exclcreat, [FATTR4_FS_CHARSET_CAP] = nfsd4_encode_fattr4__noop, - [FATTR4_CLONE_BLKSIZE] = nfsd4_encode_fattr4__noop, + [FATTR4_CLONE_BLKSIZE] = nfsd4_encode_fattr4_clone_blksize, [FATTR4_SPACE_FREED] = nfsd4_encode_fattr4__noop, [FATTR4_CHANGE_ATTR_TYPE] = nfsd4_encode_fattr4__noop, diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c index ac265d6fde35..3f3e9f6c4250 100644 --- a/fs/nfsd/nfsctl.c +++ b/fs/nfsd/nfsctl.c @@ -2281,6 +2281,8 @@ static int __init init_nfsd(void) { int retval; + nfsd_debugfs_init(); + retval = nfsd4_init_slabs(); if (retval) return retval; @@ -2291,12 +2293,9 @@ static int __init init_nfsd(void) if (retval) goto out_free_pnfs; nfsd_lockd_init(); /* lockd->nfsd callbacks */ - retval = create_proc_exports_entry(); - if (retval) - goto out_free_lockd; retval = register_pernet_subsys(&nfsd_net_ops); if (retval < 0) - goto out_free_exports; + goto out_free_lockd; retval = register_cld_notifier(); if (retval) goto out_free_subsys; @@ -2305,22 +2304,26 @@ static int __init init_nfsd(void) goto out_free_cld; retval = register_filesystem(&nfsd_fs_type); if (retval) - goto out_free_all; + goto out_free_nfsd4; retval = genl_register_family(&nfsd_nl_family); if (retval) + goto out_free_filesystem; + retval = create_proc_exports_entry(); + if (retval) goto out_free_all; nfsd_localio_ops_init(); return 0; out_free_all: + genl_unregister_family(&nfsd_nl_family); +out_free_filesystem: + unregister_filesystem(&nfsd_fs_type); +out_free_nfsd4: nfsd4_destroy_laundry_wq(); out_free_cld: unregister_cld_notifier(); out_free_subsys: unregister_pernet_subsys(&nfsd_net_ops); -out_free_exports: - remove_proc_entry("fs/nfs/exports", NULL); - remove_proc_entry("fs/nfs", NULL); out_free_lockd: nfsd_lockd_shutdown(); nfsd_drc_slab_free(); @@ -2328,22 +2331,24 @@ out_free_pnfs: nfsd4_exit_pnfs(); out_free_slabs: nfsd4_free_slabs(); + nfsd_debugfs_exit(); return retval; } static void __exit exit_nfsd(void) { + remove_proc_entry("fs/nfs/exports", NULL); + remove_proc_entry("fs/nfs", NULL); genl_unregister_family(&nfsd_nl_family); unregister_filesystem(&nfsd_fs_type); nfsd4_destroy_laundry_wq(); unregister_cld_notifier(); unregister_pernet_subsys(&nfsd_net_ops); nfsd_drc_slab_free(); - remove_proc_entry("fs/nfs/exports", NULL); - remove_proc_entry("fs/nfs", NULL); nfsd_lockd_shutdown(); nfsd4_free_slabs(); nfsd4_exit_pnfs(); + nfsd_debugfs_exit(); } MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>"); diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h index e2997f0ffbc5..1bfd0b4e9af7 100644 --- a/fs/nfsd/nfsd.h +++ b/fs/nfsd/nfsd.h @@ -44,24 +44,14 @@ bool nfsd_support_version(int vers); #include "stats.h" /* - * Maximum blocksizes supported by daemon under various circumstances. + * Default and maximum payload size (NFS READ or WRITE), in bytes. + * The default is historical, and the maximum is an implementation + * limit. 
*/ -#define NFSSVC_MAXBLKSIZE RPCSVC_MAXPAYLOAD -/* NFSv2 is limited by the protocol specification, see RFC 1094 */ -#define NFSSVC_MAXBLKSIZE_V2 (8*1024) - - -/* - * Largest number of bytes we need to allocate for an NFS - * call or reply. Used to control buffer sizes. We use - * the length of v3 WRITE, READDIR and READDIR replies - * which are an RPC header, up to 26 XDR units of reply - * data, and some page data. - * - * Note that accuracy here doesn't matter too much as the - * size is rounded up to a page size when allocating space. - */ -#define NFSD_BUFSIZE ((RPC_MAX_HEADER_WITH_AUTH+26)*XDR_UNIT + NFSSVC_MAXBLKSIZE) +enum { + NFSSVC_DEFBLKSIZE = 1 * 1024 * 1024, + NFSSVC_MAXBLKSIZE = RPCSVC_MAXPAYLOAD, +}; struct readdir_cd { __be32 err; /* 0, nfserr, or nfserr_eof */ @@ -156,6 +146,16 @@ void nfsd_reset_versions(struct nfsd_net *nn); int nfsd_create_serv(struct net *net); void nfsd_destroy_serv(struct net *net); +#ifdef CONFIG_DEBUG_FS +void nfsd_debugfs_init(void); +void nfsd_debugfs_exit(void); +#else +static inline void nfsd_debugfs_init(void) {} +static inline void nfsd_debugfs_exit(void) {} +#endif + +extern bool nfsd_disable_splice_read __read_mostly; + extern int nfsd_max_blksize; static inline int nfsd_v4client(struct svc_rqst *rq) diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c index 6370ac0a85fd..c10fa8128a8a 100644 --- a/fs/nfsd/nfsproc.c +++ b/fs/nfsd/nfsproc.c @@ -10,6 +10,7 @@ #include "cache.h" #include "xdr.h" #include "vfs.h" +#include "trace.h" #define NFSDDBG_FACILITY NFSDDBG_PROC @@ -54,7 +55,7 @@ nfsd_proc_getattr(struct svc_rqst *rqstp) struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd_attrstat *resp = rqstp->rq_resp; - dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh)); + trace_nfsd_vfs_getattr(rqstp, &argp->fh); fh_copy(&resp->fh, &argp->fh); resp->status = fh_verify(rqstp, &resp->fh, 0, @@ -211,7 +212,7 @@ nfsd_proc_read(struct svc_rqst *rqstp) SVCFH_fmt(&argp->fh), argp->count, argp->offset); - argp->count = min_t(u32, argp->count, NFSSVC_MAXBLKSIZE_V2); + argp->count = min_t(u32, argp->count, NFS_MAXDATA); argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen); resp->pages = rqstp->rq_next_page; @@ -250,17 +251,14 @@ nfsd_proc_write(struct svc_rqst *rqstp) struct nfsd_writeargs *argp = rqstp->rq_argp; struct nfsd_attrstat *resp = rqstp->rq_resp; unsigned long cnt = argp->len; - unsigned int nvecs; dprintk("nfsd: WRITE %s %u bytes at %d\n", SVCFH_fmt(&argp->fh), argp->len, argp->offset); - nvecs = svc_fill_write_vector(rqstp, &argp->payload); - - resp->status = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh), - argp->offset, rqstp->rq_vec, nvecs, - &cnt, NFS_DATA_SYNC, NULL); + fh_copy(&resp->fh, &argp->fh); + resp->status = nfsd_write(rqstp, &resp->fh, argp->offset, + &argp->payload, &cnt, NFS_DATA_SYNC, NULL); if (resp->status == nfs_ok) resp->status = fh_getattr(&resp->fh, &resp->stat); else if (resp->status == nfserr_jukebox) @@ -292,9 +290,6 @@ nfsd_proc_create(struct svc_rqst *rqstp) int hosterr; dev_t rdev = 0, wanted = new_decode_dev(attr->ia_size); - dprintk("nfsd: CREATE %s %.*s\n", - SVCFH_fmt(dirfhp), argp->len, argp->name); - /* First verify the parent file handle */ resp->status = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_EXEC); if (resp->status != nfs_ok) @@ -446,9 +441,6 @@ nfsd_proc_remove(struct svc_rqst *rqstp) struct nfsd_diropargs *argp = rqstp->rq_argp; struct nfsd_stat *resp = rqstp->rq_resp; - dprintk("nfsd: REMOVE %s %.*s\n", SVCFH_fmt(&argp->fh), - argp->len, argp->name); - /* Unlink. 
-SIFDIR means file must not be a directory */ resp->status = nfsd_unlink(rqstp, &argp->fh, -S_IFDIR, argp->name, argp->len); @@ -463,11 +455,6 @@ nfsd_proc_rename(struct svc_rqst *rqstp) struct nfsd_renameargs *argp = rqstp->rq_argp; struct nfsd_stat *resp = rqstp->rq_resp; - dprintk("nfsd: RENAME %s %.*s -> \n", - SVCFH_fmt(&argp->ffh), argp->flen, argp->fname); - dprintk("nfsd: -> %s %.*s\n", - SVCFH_fmt(&argp->tfh), argp->tlen, argp->tname); - resp->status = nfsd_rename(rqstp, &argp->ffh, argp->fname, argp->flen, &argp->tfh, argp->tname, argp->tlen); fh_put(&argp->ffh); @@ -482,13 +469,6 @@ nfsd_proc_link(struct svc_rqst *rqstp) struct nfsd_linkargs *argp = rqstp->rq_argp; struct nfsd_stat *resp = rqstp->rq_resp; - dprintk("nfsd: LINK %s ->\n", - SVCFH_fmt(&argp->ffh)); - dprintk("nfsd: %s %.*s\n", - SVCFH_fmt(&argp->tfh), - argp->tlen, - argp->tname); - resp->status = nfsd_link(rqstp, &argp->tfh, argp->tname, argp->tlen, &argp->ffh); fh_put(&argp->ffh); @@ -520,10 +500,6 @@ nfsd_proc_symlink(struct svc_rqst *rqstp) goto out; } - dprintk("nfsd: SYMLINK %s %.*s -> %.*s\n", - SVCFH_fmt(&argp->ffh), argp->flen, argp->fname, - argp->tlen, argp->tname); - fh_init(&newfh, NFS_FHSIZE); resp->status = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen, argp->tname, &attrs, &newfh); @@ -549,8 +525,6 @@ nfsd_proc_mkdir(struct svc_rqst *rqstp) .na_iattr = &argp->attrs, }; - dprintk("nfsd: MKDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name); - if (resp->fh.fh_dentry) { printk(KERN_WARNING "nfsd_proc_mkdir: response already verified??\n"); @@ -579,8 +553,6 @@ nfsd_proc_rmdir(struct svc_rqst *rqstp) struct nfsd_diropargs *argp = rqstp->rq_argp; struct nfsd_stat *resp = rqstp->rq_resp; - dprintk("nfsd: RMDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name); - resp->status = nfsd_unlink(rqstp, &argp->fh, S_IFDIR, argp->name, argp->len); fh_put(&argp->fh); @@ -616,9 +588,7 @@ nfsd_proc_readdir(struct svc_rqst *rqstp) struct nfsd_readdirres *resp = rqstp->rq_resp; loff_t offset; - dprintk("nfsd: READDIR %s %d bytes at %d\n", - SVCFH_fmt(&argp->fh), - argp->count, argp->cookie); + trace_nfsd_vfs_readdir(rqstp, &argp->fh, argp->count, argp->cookie); nfsd_init_dirlist_pages(rqstp, resp, argp->count); @@ -643,8 +613,6 @@ nfsd_proc_statfs(struct svc_rqst *rqstp) struct nfsd_fhandle *argp = rqstp->rq_argp; struct nfsd_statfsres *resp = rqstp->rq_resp; - dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh)); - resp->status = nfsd_statfs(rqstp, &argp->fh, &resp->stats, NFSD_MAY_BYPASS_GSS_ON_ROOT); fh_put(&argp->fh); @@ -740,7 +708,7 @@ static const struct svc_procedure nfsd_procedures2[18] = { .pc_argzero = sizeof(struct nfsd_readargs), .pc_ressize = sizeof(struct nfsd_readres), .pc_cachetype = RC_NOCACHE, - .pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4, + .pc_xdrressize = ST+AT+1+NFS_MAXDATA/4, .pc_name = "READ", }, [NFSPROC_WRITECACHE] = { diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c index 9b3d6cff0e1e..82b0111ac469 100644 --- a/fs/nfsd/nfssvc.c +++ b/fs/nfsd/nfssvc.c @@ -396,13 +396,13 @@ static int nfsd_startup_net(struct net *net, const struct cred *cred) if (ret) goto out_filecache; +#ifdef CONFIG_NFSD_V4_2_INTER_SSC + nfsd4_ssc_init_umount_work(nn); +#endif ret = nfs4_state_start_net(net); if (ret) goto out_reply_cache; -#ifdef CONFIG_NFSD_V4_2_INTER_SSC - nfsd4_ssc_init_umount_work(nn); -#endif nn->nfsd_net_up = true; return 0; @@ -582,7 +582,7 @@ static int nfsd_get_default_max_blksize(void) */ target >>= 12; - ret = NFSSVC_MAXBLKSIZE; + ret = NFSSVC_DEFBLKSIZE; while (ret > 
target && ret >= 8*1024*2) ret /= 2; return ret; diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c index 5777f40c7353..fc262ceafca9 100644 --- a/fs/nfsd/nfsxdr.c +++ b/fs/nfsd/nfsxdr.c @@ -336,7 +336,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr) /* opaque data */ if (xdr_stream_decode_u32(xdr, &args->len) < 0) return false; - if (args->len > NFSSVC_MAXBLKSIZE_V2) + if (args->len > NFS_MAXDATA) return false; return xdr_stream_subsegment(xdr, &args->payload, args->len); @@ -540,7 +540,7 @@ nfssvc_encode_statfsres(struct svc_rqst *rqstp, struct xdr_stream *xdr) p = xdr_reserve_space(xdr, XDR_UNIT * 5); if (!p) return false; - *p++ = cpu_to_be32(NFSSVC_MAXBLKSIZE_V2); + *p++ = cpu_to_be32(NFS_MAXDATA); *p++ = cpu_to_be32(stat->f_bsize); *p++ = cpu_to_be32(stat->f_blocks); *p++ = cpu_to_be32(stat->f_bfree); diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h index 290e29dd43eb..1995bca158b8 100644 --- a/fs/nfsd/state.h +++ b/fs/nfsd/state.h @@ -64,6 +64,21 @@ typedef struct { refcount_t cs_count; } copy_stateid_t; +struct nfsd4_referring_call { + struct list_head __list; + + u32 rc_sequenceid; + u32 rc_slotid; +}; + +struct nfsd4_referring_call_list { + struct list_head __list; + + struct nfs4_sessionid rcl_sessionid; + int __nr_referring_calls; + struct list_head rcl_referring_calls; +}; + struct nfsd4_callback { struct nfs4_client *cb_clp; struct rpc_message cb_msg; @@ -76,6 +91,9 @@ struct nfsd4_callback { int cb_seq_status; int cb_status; int cb_held_slot; + + int cb_nr_referring_call_list; + struct list_head cb_referring_call_list; }; struct nfsd4_callback_ops { @@ -260,6 +278,7 @@ struct nfsd4_slot { u32 sl_seqid; __be32 sl_status; struct svc_cred sl_cred; + u32 sl_index; u32 sl_datalen; u16 sl_opcnt; u16 sl_generation; @@ -774,6 +793,10 @@ extern __be32 nfs4_check_open_reclaim(struct nfs4_client *); extern void nfsd4_probe_callback(struct nfs4_client *clp); extern void nfsd4_probe_callback_sync(struct nfs4_client *clp); extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *); +extern void nfsd41_cb_referring_call(struct nfsd4_callback *cb, + struct nfs4_sessionid *sessionid, + u32 slotid, u32 seqno); +extern void nfsd41_cb_destroy_referring_call_list(struct nfsd4_callback *cb); extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp, const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op); extern bool nfsd4_run_cb(struct nfsd4_callback *cb); diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h index a7630e9f6577..3c5505ef5e3a 100644 --- a/fs/nfsd/trace.h +++ b/fs/nfsd/trace.h @@ -11,6 +11,7 @@ #include <linux/tracepoint.h> #include <linux/sunrpc/clnt.h> #include <linux/sunrpc/xprt.h> +#include <trace/misc/fs.h> #include <trace/misc/nfs.h> #include <trace/misc/sunrpc.h> @@ -18,22 +19,40 @@ #include "nfsfh.h" #include "xdr4.h" -#define NFSD_TRACE_PROC_RES_FIELDS \ +#define NFSD_TRACE_PROC_CALL_FIELDS(r) \ + __field(unsigned int, netns_ino) \ + __field(u32, xid) \ + __sockaddr(server, (r)->rq_xprt->xpt_locallen) \ + __sockaddr(client, (r)->rq_xprt->xpt_remotelen) + +#define NFSD_TRACE_PROC_CALL_ASSIGNMENTS(r) \ + do { \ + struct svc_xprt *xprt = (r)->rq_xprt; \ + __entry->netns_ino = SVC_NET(r)->ns.inum; \ + __entry->xid = be32_to_cpu((r)->rq_xid); \ + __assign_sockaddr(server, &xprt->xpt_local, \ + xprt->xpt_locallen); \ + __assign_sockaddr(client, &xprt->xpt_remote, \ + xprt->xpt_remotelen); \ + } while (0) + +#define NFSD_TRACE_PROC_RES_FIELDS(r) \ __field(unsigned int, netns_ino) \ __field(u32, xid) \ __field(unsigned 
long, status) \ - __array(unsigned char, server, sizeof(struct sockaddr_in6)) \ - __array(unsigned char, client, sizeof(struct sockaddr_in6)) + __sockaddr(server, (r)->rq_xprt->xpt_locallen) \ + __sockaddr(client, (r)->rq_xprt->xpt_remotelen) -#define NFSD_TRACE_PROC_RES_ASSIGNMENTS(error) \ +#define NFSD_TRACE_PROC_RES_ASSIGNMENTS(r, error) \ do { \ - __entry->netns_ino = SVC_NET(rqstp)->ns.inum; \ - __entry->xid = be32_to_cpu(rqstp->rq_xid); \ + struct svc_xprt *xprt = (r)->rq_xprt; \ + __entry->netns_ino = SVC_NET(r)->ns.inum; \ + __entry->xid = be32_to_cpu((r)->rq_xid); \ __entry->status = be32_to_cpu(error); \ - memcpy(__entry->server, &rqstp->rq_xprt->xpt_local, \ - rqstp->rq_xprt->xpt_locallen); \ - memcpy(__entry->client, &rqstp->rq_xprt->xpt_remote, \ - rqstp->rq_xprt->xpt_remotelen); \ + __assign_sockaddr(server, &xprt->xpt_local, \ + xprt->xpt_locallen); \ + __assign_sockaddr(client, &xprt->xpt_remote, \ + xprt->xpt_remotelen); \ } while (0); DECLARE_EVENT_CLASS(nfsd_xdr_err_class, @@ -145,14 +164,14 @@ TRACE_EVENT(nfsd_compound_decode_err, ), TP_ARGS(rqstp, args_opcnt, resp_opcnt, opnum, status), TP_STRUCT__entry( - NFSD_TRACE_PROC_RES_FIELDS + NFSD_TRACE_PROC_RES_FIELDS(rqstp) __field(u32, args_opcnt) __field(u32, resp_opcnt) __field(u32, opnum) ), TP_fast_assign( - NFSD_TRACE_PROC_RES_ASSIGNMENTS(status) + NFSD_TRACE_PROC_RES_ASSIGNMENTS(rqstp, status) __entry->args_opcnt = args_opcnt; __entry->resp_opcnt = resp_opcnt; @@ -171,12 +190,12 @@ DECLARE_EVENT_CLASS(nfsd_compound_err_class, ), TP_ARGS(rqstp, opnum, status), TP_STRUCT__entry( - NFSD_TRACE_PROC_RES_FIELDS + NFSD_TRACE_PROC_RES_FIELDS(rqstp) __field(u32, opnum) ), TP_fast_assign( - NFSD_TRACE_PROC_RES_ASSIGNMENTS(status) + NFSD_TRACE_PROC_RES_ASSIGNMENTS(rqstp, status) __entry->opnum = opnum; ), @@ -451,6 +470,8 @@ DEFINE_NFSD_IO_EVENT(write_start); DEFINE_NFSD_IO_EVENT(write_opened); DEFINE_NFSD_IO_EVENT(write_io_done); DEFINE_NFSD_IO_EVENT(write_done); +DEFINE_NFSD_IO_EVENT(commit_start); +DEFINE_NFSD_IO_EVENT(commit_done); DECLARE_EVENT_CLASS(nfsd_err_class, TP_PROTO(struct svc_rqst *rqstp, @@ -2335,6 +2356,259 @@ DEFINE_EVENT(nfsd_copy_async_done_class, \ DEFINE_COPY_ASYNC_DONE_EVENT(done); DEFINE_COPY_ASYNC_DONE_EVENT(cancel); +TRACE_EVENT(nfsd_vfs_setattr, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + const struct iattr *iap, + const struct timespec64 *guardtime + ), + TP_ARGS(rqstp, fhp, iap, guardtime), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + __field(s64, gtime_tv_sec) + __field(u32, gtime_tv_nsec) + __field(unsigned int, ia_valid) + __field(loff_t, ia_size) + __field(uid_t, ia_uid) + __field(gid_t, ia_gid) + __field(umode_t, ia_mode) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __entry->gtime_tv_sec = guardtime ? guardtime->tv_sec : 0; + __entry->gtime_tv_nsec = guardtime ? 
guardtime->tv_nsec : 0; + __entry->ia_valid = iap->ia_valid; + __entry->ia_size = iap->ia_size; + __entry->ia_uid = __kuid_val(iap->ia_uid); + __entry->ia_gid = __kgid_val(iap->ia_gid); + __entry->ia_mode = iap->ia_mode; + ), + TP_printk( + "xid=0x%08x fh_hash=0x%08x ia_valid=%s ia_size=%llu ia_mode=0%o ia_uid=%u ia_gid=%u guard_time=%lld.%u", + __entry->xid, __entry->fh_hash, show_ia_valid_flags(__entry->ia_valid), + __entry->ia_size, __entry->ia_mode, __entry->ia_uid, __entry->ia_gid, + __entry->gtime_tv_sec, __entry->gtime_tv_nsec + ) +) + +TRACE_EVENT(nfsd_vfs_lookup, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + const char *name, + unsigned int len + ), + TP_ARGS(rqstp, fhp, name, len), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + __string_len(name, name, len) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __assign_str(name); + ), + TP_printk("xid=0x%08x fh_hash=0x%08x name=%s", + __entry->xid, __entry->fh_hash, __get_str(name) + ) +); + +TRACE_EVENT(nfsd_vfs_create, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + umode_t type, + const char *name, + unsigned int len + ), + TP_ARGS(rqstp, fhp, type, name, len), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + __field(umode_t, type) + __string_len(name, name, len) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __entry->type = type; + __assign_str(name); + ), + TP_printk("xid=0x%08x fh_hash=0x%08x type=%s name=%s", + __entry->xid, __entry->fh_hash, + show_fs_file_type(__entry->type), __get_str(name) + ) +); + +TRACE_EVENT(nfsd_vfs_symlink, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + const char *name, + unsigned int namelen, + const char *target + ), + TP_ARGS(rqstp, fhp, name, namelen, target), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + __string_len(name, name, namelen) + __string(target, target) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __assign_str(name); + __assign_str(target); + ), + TP_printk("xid=0x%08x fh_hash=0x%08x name=%s target=%s", + __entry->xid, __entry->fh_hash, + __get_str(name), __get_str(target) + ) +); + +TRACE_EVENT(nfsd_vfs_link, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *sfhp, + const struct svc_fh *tfhp, + const char *name, + unsigned int namelen + ), + TP_ARGS(rqstp, sfhp, tfhp, name, namelen), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, sfh_hash) + __field(u32, tfh_hash) + __string_len(name, name, namelen) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->sfh_hash = knfsd_fh_hash(&sfhp->fh_handle); + __entry->tfh_hash = knfsd_fh_hash(&tfhp->fh_handle); + __assign_str(name); + ), + TP_printk("xid=0x%08x src_fh=0x%08x tgt_fh=0x%08x name=%s", + __entry->xid, __entry->sfh_hash, __entry->tfh_hash, + __get_str(name) + ) +); + +TRACE_EVENT(nfsd_vfs_unlink, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + const char *name, + unsigned int len + ), + TP_ARGS(rqstp, fhp, name, len), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + __string_len(name, name, len) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = 
knfsd_fh_hash(&fhp->fh_handle); + __assign_str(name); + ), + TP_printk("xid=0x%08x fh_hash=0x%08x name=%s", + __entry->xid, __entry->fh_hash, + __get_str(name) + ) +); + +TRACE_EVENT(nfsd_vfs_rename, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *sfhp, + const struct svc_fh *tfhp, + const char *source, + unsigned int sourcelen, + const char *target, + unsigned int targetlen + ), + TP_ARGS(rqstp, sfhp, tfhp, source, sourcelen, target, targetlen), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, sfh_hash) + __field(u32, tfh_hash) + __string_len(source, source, sourcelen) + __string_len(target, target, targetlen) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->sfh_hash = knfsd_fh_hash(&sfhp->fh_handle); + __entry->tfh_hash = knfsd_fh_hash(&tfhp->fh_handle); + __assign_str(source); + __assign_str(target); + ), + TP_printk("xid=0x%08x sfh_hash=0x%08x tfh_hash=0x%08x source=%s target=%s", + __entry->xid, __entry->sfh_hash, __entry->tfh_hash, + __get_str(source), __get_str(target) + ) +); + +TRACE_EVENT(nfsd_vfs_readdir, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp, + u32 count, + u64 offset + ), + TP_ARGS(rqstp, fhp, count, offset), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + __field(u32, count) + __field(u64, offset) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + __entry->count = count; + __entry->offset = offset; + ), + TP_printk("xid=0x%08x fh_hash=0x%08x offset=%llu count=%u", + __entry->xid, __entry->fh_hash, + __entry->offset, __entry->count + ) +); + +DECLARE_EVENT_CLASS(nfsd_vfs_getattr_class, + TP_PROTO( + const struct svc_rqst *rqstp, + const struct svc_fh *fhp + ), + TP_ARGS(rqstp, fhp), + TP_STRUCT__entry( + NFSD_TRACE_PROC_CALL_FIELDS(rqstp) + __field(u32, fh_hash) + ), + TP_fast_assign( + NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp); + __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle); + ), + TP_printk("xid=0x%08x fh_hash=0x%08x", + __entry->xid, __entry->fh_hash + ) +); + +#define DEFINE_NFSD_VFS_GETATTR_EVENT(__name) \ +DEFINE_EVENT(nfsd_vfs_getattr_class, __name, \ + TP_PROTO( \ + const struct svc_rqst *rqstp, \ + const struct svc_fh *fhp \ + ), \ + TP_ARGS(rqstp, fhp)) + +DEFINE_NFSD_VFS_GETATTR_EVENT(nfsd_vfs_getattr); +DEFINE_NFSD_VFS_GETATTR_EVENT(nfsd_vfs_statfs); + #endif /* _NFSD_TRACE_H */ #undef TRACE_INCLUDE_PATH diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c index 160a839af405..cd689df2ca5d 100644 --- a/fs/nfsd/vfs.c +++ b/fs/nfsd/vfs.c @@ -31,6 +31,7 @@ #include <linux/exportfs.h> #include <linux/writeback.h> #include <linux/security.h> +#include <linux/sunrpc/xdr.h> #include "xdr3.h" @@ -47,6 +48,8 @@ #define NFSDDBG_FACILITY NFSDDBG_FILEOP +bool nfsd_disable_splice_read __read_mostly; + /** * nfserrno - Map Linux errnos to NFS errnos * @errno: POSIX(-ish) error code to be mapped @@ -244,7 +247,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp, struct dentry *dentry; int host_err; - dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name); + trace_nfsd_vfs_lookup(rqstp, fhp, name, len); dparent = fhp->fh_dentry; exp = exp_get(fhp->fh_export); @@ -500,6 +503,8 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp, bool size_change = (iap->ia_valid & ATTR_SIZE); int retries; + trace_nfsd_vfs_setattr(rqstp, fhp, iap, guardtime); + if (iap->ia_valid & ATTR_SIZE) { accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE; ftype = S_IFREG; @@ 
-1082,23 +1087,23 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp, unsigned long v, total; struct iov_iter iter; loff_t ppos = offset; - struct page *page; ssize_t host_err; + size_t len; v = 0; total = *count; while (total) { - page = *(rqstp->rq_next_page++); - rqstp->rq_vec[v].iov_base = page_address(page) + base; - rqstp->rq_vec[v].iov_len = min_t(size_t, total, PAGE_SIZE - base); - total -= rqstp->rq_vec[v].iov_len; + len = min_t(size_t, total, PAGE_SIZE - base); + bvec_set_page(&rqstp->rq_bvec[v], *(rqstp->rq_next_page++), + len, base); + total -= len; ++v; base = 0; } - WARN_ON_ONCE(v > ARRAY_SIZE(rqstp->rq_vec)); + WARN_ON_ONCE(v > rqstp->rq_maxpages); trace_nfsd_read_vector(rqstp, fhp, offset, *count); - iov_iter_kvec(&iter, ITER_DEST, rqstp->rq_vec, v, *count); + iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, *count); host_err = vfs_iter_read(file, &iter, &ppos, 0); return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err); } @@ -1140,11 +1145,27 @@ static int wait_for_concurrent_writes(struct file *file) return err; } +/** + * nfsd_vfs_write - write data to an already-open file + * @rqstp: RPC execution context + * @fhp: File handle of file to write into + * @nf: An open file matching @fhp + * @offset: Byte offset of start + * @payload: xdr_buf containing the write payload + * @cnt: IN: number of bytes to write, OUT: number of bytes actually written + * @stable: An NFS stable_how value + * @verf: NFS WRITE verifier + * + * Upon return, caller must invoke fh_put on @fhp. + * + * Return values: + * An nfsstat value in network byte order. + */ __be32 -nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, - loff_t offset, struct kvec *vec, int vlen, - unsigned long *cnt, int stable, - __be32 *verf) +nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, + struct nfsd_file *nf, loff_t offset, + const struct xdr_buf *payload, unsigned long *cnt, + int stable, __be32 *verf) { struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id); struct file *file = nf->nf_file; @@ -1159,6 +1180,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, unsigned int pflags = current->flags; rwf_t flags = 0; bool restore_flags = false; + unsigned int nvecs; trace_nfsd_write_opened(rqstp, fhp, offset, *cnt); @@ -1186,7 +1208,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, if (stable && !fhp->fh_use_wgather) flags |= RWF_SYNC; - iov_iter_kvec(&iter, ITER_SOURCE, vec, vlen, *cnt); + nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload); + iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt); since = READ_ONCE(file->f_wb_err); if (verf) nfsd_copy_write_verifier(verf, nn); @@ -1237,6 +1260,8 @@ out_nfserr: */ bool nfsd_read_splice_ok(struct svc_rqst *rqstp) { + if (nfsd_disable_splice_read) + return false; switch (svc_auth_flavor(rqstp)) { case RPC_AUTH_GSS_KRB5I: case RPC_AUTH_GSS_KRB5P: @@ -1284,14 +1309,24 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, return err; } -/* - * Write data to a file. - * The stable flag requests synchronous writes. - * N.B. 
After this call fhp needs an fh_put +/** + * nfsd_write - open a file and write data to it + * @rqstp: RPC execution context + * @fhp: File handle of file to write into; nfsd_write() may modify it + * @offset: Byte offset of start + * @payload: xdr_buf containing the write payload + * @cnt: IN: number of bytes to write, OUT: number of bytes actually written + * @stable: An NFS stable_how value + * @verf: NFS WRITE verifier + * + * Upon return, caller must invoke fh_put on @fhp. + * + * Return values: + * An nfsstat value in network byte order. */ __be32 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, - struct kvec *vec, int vlen, unsigned long *cnt, int stable, + const struct xdr_buf *payload, unsigned long *cnt, int stable, __be32 *verf) { struct nfsd_file *nf; @@ -1303,8 +1338,8 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, if (err) goto out; - err = nfsd_vfs_write(rqstp, fhp, nf, offset, vec, - vlen, cnt, stable, verf); + err = nfsd_vfs_write(rqstp, fhp, nf, offset, payload, cnt, + stable, verf); nfsd_file_put(nf); out: trace_nfsd_write_done(rqstp, fhp, offset, *cnt); @@ -1340,6 +1375,8 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, loff_t start, end; struct nfsd_net *nn; + trace_nfsd_commit_start(rqstp, fhp, offset, count); + /* * Convert the client-provided (offset, count) range to a * (start, end) range. If the client-provided range falls @@ -1378,6 +1415,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, } else nfsd_copy_write_verifier(verf, nn); + trace_nfsd_commit_done(rqstp, fhp, offset, count); return err; } @@ -1541,6 +1579,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 err; int host_err; + trace_nfsd_vfs_create(rqstp, fhp, type, fname, flen); + if (isdotent(fname, flen)) return nfserr_exist; @@ -1641,6 +1681,8 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp, __be32 err, cerr; int host_err; + trace_nfsd_vfs_symlink(rqstp, fhp, fname, flen, path); + err = nfserr_noent; if (!flen || path[0] == '\0') goto out; @@ -1709,6 +1751,8 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp, __be32 err; int host_err; + trace_nfsd_vfs_link(rqstp, ffhp, tfhp, name, len); + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE); if (err) goto out; @@ -1826,6 +1870,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen, int host_err; bool close_cached = false; + trace_nfsd_vfs_rename(rqstp, ffhp, tfhp, fname, flen, tname, tlen); + err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE); if (err) goto out; @@ -1984,6 +2030,8 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type, __be32 err; int host_err; + trace_nfsd_vfs_unlink(rqstp, fhp, fname, flen); + err = nfserr_acces; if (!flen || isdotent(fname, flen)) goto out; @@ -2272,6 +2320,8 @@ nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, in { __be32 err; + trace_nfsd_vfs_statfs(rqstp, fhp); + err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access); if (!err) { struct path path = { diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h index f9b09b842856..eff04959606f 100644 --- a/fs/nfsd/vfs.h +++ b/fs/nfsd/vfs.h @@ -128,13 +128,13 @@ bool nfsd_read_splice_ok(struct svc_rqst *rqstp); __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset, unsigned long *count, u32 *eof); -__be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t, - struct kvec *, int, unsigned long *, - int stable, __be32 *verf); +__be32 
nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, + loff_t offset, const struct xdr_buf *payload, + unsigned long *cnt, int stable, __be32 *verf); __be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf, loff_t offset, - struct kvec *vec, int vlen, unsigned long *cnt, - int stable, __be32 *verf); + const struct xdr_buf *payload, + unsigned long *cnt, int stable, __be32 *verf); __be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *, char *, int *); __be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *, diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h index c26ba86dbdfd..aa2a356da784 100644 --- a/fs/nfsd/xdr4.h +++ b/fs/nfsd/xdr4.h @@ -676,6 +676,10 @@ struct nfsd4_cb_offload { __be32 co_nfserr; unsigned int co_retries; struct knfsd_fh co_fh; + + struct nfs4_sessionid co_referring_sessionid; + u32 co_referring_slotid; + u32 co_referring_seqno; }; struct nfsd4_copy { diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h index f1a315cd31b7..f4e29c0c701c 100644 --- a/fs/nfsd/xdr4cb.h +++ b/fs/nfsd/xdr4cb.h @@ -6,8 +6,11 @@ #define cb_compound_enc_hdr_sz 4 #define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2)) #define sessionid_sz (NFS4_MAX_SESSIONID_LEN >> 2) +#define enc_referring_call4_sz (1 + 1) +#define enc_referring_call_list4_sz (sessionid_sz + 1 + \ + enc_referring_call4_sz) #define cb_sequence_enc_sz (sessionid_sz + 4 + \ - 1 /* no referring calls list yet */) + enc_referring_call_list4_sz) #define cb_sequence_dec_sz (op_dec_sz + sessionid_sz + 4) #define op_enc_sz 1 diff --git a/fs/nilfs2/btree.c b/fs/nilfs2/btree.c index 0d8f7fb15c2e..dd0c8e560ef6 100644 --- a/fs/nilfs2/btree.c +++ b/fs/nilfs2/btree.c @@ -2102,11 +2102,13 @@ static int nilfs_btree_propagate(struct nilfs_bmap *btree, ret = nilfs_btree_do_lookup(btree, path, key, NULL, level + 1, 0); if (ret < 0) { - if (unlikely(ret == -ENOENT)) + if (unlikely(ret == -ENOENT)) { nilfs_crit(btree->b_inode->i_sb, "writing node/leaf block does not appear in b-tree (ino=%lu) at key=%llu, level=%d", btree->b_inode->i_ino, (unsigned long long)key, level); + ret = -EINVAL; + } goto out; } diff --git a/fs/nilfs2/direct.c b/fs/nilfs2/direct.c index 893ab36824cc..2d8dc6b35b54 100644 --- a/fs/nilfs2/direct.c +++ b/fs/nilfs2/direct.c @@ -273,6 +273,9 @@ static int nilfs_direct_propagate(struct nilfs_bmap *bmap, dat = nilfs_bmap_get_dat(bmap); key = nilfs_bmap_data_get_key(bmap, bh); ptr = nilfs_direct_get_ptr(bmap, key); + if (ptr == NILFS_BMAP_INVALID_PTR) + return -EINVAL; + if (!buffer_nilfs_volatile(bh)) { oldreq.pr_entry_nr = ptr; newreq.pr_entry_nr = ptr; diff --git a/fs/nilfs2/mdt.c b/fs/nilfs2/mdt.c index 2f850a18d6e7..946b0d3534a5 100644 --- a/fs/nilfs2/mdt.c +++ b/fs/nilfs2/mdt.c @@ -422,8 +422,6 @@ static int nilfs_mdt_write_folio(struct folio *folio, if (wbc->sync_mode == WB_SYNC_ALL) err = nilfs_construct_segment(sb); - else if (wbc->for_reclaim) - nilfs_flush_segment(sb, inode->i_ino); return err; } diff --git a/fs/nilfs2/segment.c b/fs/nilfs2/segment.c index 83970d97840b..61a4141f8d6b 100644 --- a/fs/nilfs2/segment.c +++ b/fs/nilfs2/segment.c @@ -2221,22 +2221,6 @@ static void nilfs_segctor_do_flush(struct nilfs_sc_info *sci, int bn) spin_unlock(&sci->sc_state_lock); } -/** - * nilfs_flush_segment - trigger a segment construction for resource control - * @sb: super block - * @ino: inode number of the file to be flushed out. 
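/*
 * Illustration (not part of the patch): the NFSD read/write hunks above
 * replace kvec-based iterators with bio_vec-based ones, so the iov_iter is
 * built directly over page structs instead of kernel virtual addresses.
 * A minimal kernel-side sketch of that pattern, assuming the caller already
 * holds an array of pages covering @count bytes starting at @base within
 * the first page; the function name and fixed segment count are hypothetical.
 */
#include <linux/kernel.h>
#include <linux/minmax.h>
#include <linux/mm.h>
#include <linux/bvec.h>
#include <linux/uio.h>
#include <linux/fs.h>

static ssize_t read_pages_via_bvec(struct file *file, struct page **pages,
				   unsigned int base, size_t count, loff_t pos)
{
	struct bio_vec bv[16];		/* enough segments for this sketch */
	struct iov_iter iter;
	unsigned int v = 0;
	size_t total = count;

	while (total && v < ARRAY_SIZE(bv)) {
		size_t len = min_t(size_t, total, PAGE_SIZE - base);

		/* Each segment records a page, a length and an offset. */
		bvec_set_page(&bv[v], pages[v], len, base);
		v++;
		total -= len;
		base = 0;	/* only the first page starts mid-page */
	}

	iov_iter_bvec(&iter, ITER_DEST, bv, v, count - total);
	return vfs_iter_read(file, &iter, &pos, 0);
}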
- */ -void nilfs_flush_segment(struct super_block *sb, ino_t ino) -{ - struct the_nilfs *nilfs = sb->s_fs_info; - struct nilfs_sc_info *sci = nilfs->ns_writer; - - if (!sci || nilfs_doing_construction()) - return; - nilfs_segctor_do_flush(sci, NILFS_MDT_INODE(sb, ino) ? ino : 0); - /* assign bit 0 to data files */ -} - struct nilfs_segctor_wait_request { wait_queue_entry_t wq; __u32 seq; diff --git a/fs/nilfs2/segment.h b/fs/nilfs2/segment.h index f723f47ddc4e..4b39ed43ae72 100644 --- a/fs/nilfs2/segment.h +++ b/fs/nilfs2/segment.h @@ -226,7 +226,6 @@ extern void nilfs_relax_pressure_in_lock(struct super_block *); extern int nilfs_construct_segment(struct super_block *); extern int nilfs_construct_dsync_segment(struct super_block *, struct inode *, loff_t, loff_t); -extern void nilfs_flush_segment(struct super_block *, ino_t); extern int nilfs_clean_segments(struct super_block *, struct nilfs_argv *, void **); diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c index 6d386080faf2..3083643b864b 100644 --- a/fs/notify/fanotify/fanotify.c +++ b/fs/notify/fanotify/fanotify.c @@ -415,7 +415,7 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode, { int dwords, type = 0; char *ext_buf = NULL; - void *buf = fh->buf; + void *buf = fh + 1; int err; fh->type = FILEID_ROOT; @@ -1009,6 +1009,7 @@ finish: static void fanotify_free_group_priv(struct fsnotify_group *group) { + put_user_ns(group->user_ns); kfree(group->fanotify_data.merge_hash); if (group->fanotify_data.ucounts) dec_ucount(group->fanotify_data.ucounts, diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h index b44e70e44be6..b78308975082 100644 --- a/fs/notify/fanotify/fanotify.h +++ b/fs/notify/fanotify/fanotify.h @@ -25,7 +25,7 @@ enum { * stored in either the first or last 2 dwords. */ #define FANOTIFY_INLINE_FH_LEN (3 << 2) -#define FANOTIFY_FH_HDR_LEN offsetof(struct fanotify_fh, buf) +#define FANOTIFY_FH_HDR_LEN sizeof(struct fanotify_fh) /* Fixed size struct for file handle */ struct fanotify_fh { @@ -34,7 +34,6 @@ struct fanotify_fh { #define FANOTIFY_FH_FLAG_EXT_BUF 1 u8 flags; u8 pad; - unsigned char buf[]; } __aligned(4); /* Variable size struct for dir file handle + child file handle + name */ @@ -92,7 +91,7 @@ static inline char **fanotify_fh_ext_buf_ptr(struct fanotify_fh *fh) BUILD_BUG_ON(FANOTIFY_FH_HDR_LEN % 4); BUILD_BUG_ON(__alignof__(char *) - 4 + sizeof(char *) > FANOTIFY_INLINE_FH_LEN); - return (char **)ALIGN((unsigned long)(fh->buf), __alignof__(char *)); + return (char **)ALIGN((unsigned long)(fh + 1), __alignof__(char *)); } static inline void *fanotify_fh_ext_buf(struct fanotify_fh *fh) @@ -102,7 +101,7 @@ static inline void *fanotify_fh_ext_buf(struct fanotify_fh *fh) static inline void *fanotify_fh_buf(struct fanotify_fh *fh) { - return fanotify_fh_has_ext_buf(fh) ? fanotify_fh_ext_buf(fh) : fh->buf; + return fanotify_fh_has_ext_buf(fh) ? 
fanotify_fh_ext_buf(fh) : fh + 1; } static inline int fanotify_info_dir_fh_len(struct fanotify_info *info) @@ -278,7 +277,7 @@ static inline void fanotify_init_event(struct fanotify_event *event, #define FANOTIFY_INLINE_FH(name, size) \ struct { \ struct fanotify_fh name; \ - /* Space for object_fh.buf[] - access with fanotify_fh_buf() */ \ + /* Space for filehandle - access with fanotify_fh_buf() */ \ unsigned char _inline_fh_buf[size]; \ } diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c index 87f861e9004f..b192ee068a7a 100644 --- a/fs/notify/fanotify/fanotify_user.c +++ b/fs/notify/fanotify/fanotify_user.c @@ -1334,6 +1334,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group, * A group with FAN_UNLIMITED_MARKS does not contribute to mark count * in the limited groups account. */ + BUILD_BUG_ON(!(FANOTIFY_ADMIN_INIT_FLAGS & FAN_UNLIMITED_MARKS)); if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) && !inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS)) return ERR_PTR(-ENOSPC); @@ -1498,6 +1499,7 @@ static struct hlist_head *fanotify_alloc_merge_hash(void) /* fanotify syscalls */ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) { + struct user_namespace *user_ns = current_user_ns(); struct fsnotify_group *group; int f_flags, fd; unsigned int fid_mode = flags & FANOTIFY_FID_BITS; @@ -1512,10 +1514,11 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) /* * An unprivileged user can setup an fanotify group with * limited functionality - an unprivileged group is limited to - * notification events with file handles and it cannot use - * unlimited queue/marks. + * notification events with file handles or mount ids and it + * cannot use unlimited queue/marks. 
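/*
 * Illustration (not part of the patch): with the flexible buf[] member gone,
 * the fanotify file-handle payload simply starts at (fh + 1), i.e. right after
 * the fixed header, and the external-buffer pointer is stored at the first
 * address past the header that is aligned for a pointer. A small user-space
 * model of that layout; the struct and names here are made up, only the
 * (hdr + 1) / ALIGN technique mirrors the code above.
 */
#include <stdint.h>
#include <stdio.h>
#include <string.h>

#define ALIGN(x, a)	(((x) + ((uintptr_t)(a) - 1)) & ~((uintptr_t)(a) - 1))

struct fh_hdr {
	uint8_t type;
	uint8_t len;
	uint8_t flags;
	uint8_t pad;
};					/* payload follows immediately after */

int main(void)
{
	unsigned char storage[sizeof(struct fh_hdr) + 32];
	struct fh_hdr *fh = (struct fh_hdr *)storage;
	void *inline_buf = fh + 1;	/* like fanotify_fh_buf() */
	char **ext_slot = (char **)ALIGN((uintptr_t)(fh + 1),
					 __alignof__(char *));	/* like fanotify_fh_ext_buf_ptr() */

	memcpy(inline_buf, "\x01\x02\x03\x04", 4);	/* inline handle bytes */
	printf("header %zu bytes, payload at +%td, ext pointer slot at +%td\n",
	       sizeof(*fh), (char *)inline_buf - (char *)fh,
	       (char *)ext_slot - (char *)fh);
	return 0;
}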
*/ - if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode) + if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || + !(flags & (FANOTIFY_FID_BITS | FAN_REPORT_MNT))) return -EPERM; /* @@ -1594,8 +1597,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) } /* Enforce groups limits per user in all containing user ns */ - group->fanotify_data.ucounts = inc_ucount(current_user_ns(), - current_euid(), + group->fanotify_data.ucounts = inc_ucount(user_ns, current_euid(), UCOUNT_FANOTIFY_GROUPS); if (!group->fanotify_data.ucounts) { fd = -EMFILE; @@ -1604,6 +1606,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) group->fanotify_data.flags = flags | internal_flags; group->memcg = get_mem_cgroup_from_mm(current->mm); + group->user_ns = get_user_ns(user_ns); group->fanotify_data.merge_hash = fanotify_alloc_merge_hash(); if (!group->fanotify_data.merge_hash) { @@ -1637,21 +1640,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags) goto out_destroy_group; } + BUILD_BUG_ON(!(FANOTIFY_ADMIN_INIT_FLAGS & FAN_UNLIMITED_QUEUE)); if (flags & FAN_UNLIMITED_QUEUE) { - fd = -EPERM; - if (!capable(CAP_SYS_ADMIN)) - goto out_destroy_group; group->max_events = UINT_MAX; } else { group->max_events = fanotify_max_queued_events; } - if (flags & FAN_UNLIMITED_MARKS) { - fd = -EPERM; - if (!capable(CAP_SYS_ADMIN)) - goto out_destroy_group; - } - if (flags & FAN_ENABLE_AUDIT) { fd = -EPERM; if (!capable(CAP_AUDIT_WRITE)) @@ -1811,6 +1806,8 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, struct fsnotify_group *group; struct path path; struct fan_fsid __fsid, *fsid = NULL; + struct user_namespace *user_ns = NULL; + struct mnt_namespace *mntns; u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS; unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS; unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS; @@ -1904,12 +1901,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, } /* - * An unprivileged user is not allowed to setup mount nor filesystem - * marks. This also includes setting up such marks by a group that - * was initialized by an unprivileged user. + * A user is allowed to setup sb/mount/mntns marks only if it is + * capable in the user ns where the group was created. */ - if ((!capable(CAP_SYS_ADMIN) || - FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) && + if (!ns_capable(group->user_ns, CAP_SYS_ADMIN) && mark_type != FAN_MARK_INODE) return -EPERM; @@ -1988,18 +1983,31 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask, fsid = &__fsid; } - /* inode held in place by reference to path; group by fget on fd */ + /* + * In addition to being capable in the user ns where group was created, + * the user also needs to be capable in the user ns associated with + * the filesystem or in the user ns associated with the mntns + * (when marking mntns). 
+ */ if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) { inode = path.dentry->d_inode; obj = inode; } else if (obj_type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) { + user_ns = path.mnt->mnt_sb->s_user_ns; obj = path.mnt; } else if (obj_type == FSNOTIFY_OBJ_TYPE_SB) { + user_ns = path.mnt->mnt_sb->s_user_ns; obj = path.mnt->mnt_sb; } else if (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) { - obj = mnt_ns_from_dentry(path.dentry); + mntns = mnt_ns_from_dentry(path.dentry); + user_ns = mntns->user_ns; + obj = mntns; } + ret = -EPERM; + if (user_ns && !ns_capable(user_ns, CAP_SYS_ADMIN)) + goto path_put_and_out; + ret = -EINVAL; if (!obj) goto path_put_and_out; diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c index e946f75eb540..eced9013a881 100644 --- a/fs/ntfs3/attrib.c +++ b/fs/ntfs3/attrib.c @@ -2605,75 +2605,3 @@ int attr_force_nonresident(struct ntfs_inode *ni) return err; } - -/* - * Change the compression of data attribute - */ -int attr_set_compress(struct ntfs_inode *ni, bool compr) -{ - struct ATTRIB *attr; - struct mft_inode *mi; - - attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, &mi); - if (!attr) - return -ENOENT; - - if (is_attr_compressed(attr) == !!compr) { - /* Already required compressed state. */ - return 0; - } - - if (attr->non_res) { - u16 run_off; - u32 run_size; - char *run; - - if (attr->nres.data_size) { - /* - * There are rare cases when it possible to change - * compress state without big changes. - * TODO: Process these cases. - */ - return -EOPNOTSUPP; - } - - run_off = le16_to_cpu(attr->nres.run_off); - run_size = le32_to_cpu(attr->size) - run_off; - run = Add2Ptr(attr, run_off); - - if (!compr) { - /* remove field 'attr->nres.total_size'. */ - memmove(run - 8, run, run_size); - run_off -= 8; - } - - if (!mi_resize_attr(mi, attr, compr ? +8 : -8)) { - /* - * Ignore rare case when there are no 8 bytes in record with attr. - * TODO: split attribute. - */ - return -EOPNOTSUPP; - } - - if (compr) { - /* Make a gap for 'attr->nres.total_size'. */ - memmove(run + 8, run, run_size); - run_off += 8; - attr->nres.total_size = attr->nres.alloc_size; - } - attr->nres.run_off = cpu_to_le16(run_off); - } - - /* Update attribute flags. 
*/ - if (compr) { - attr->flags &= ~ATTR_FLAG_SPARSED; - attr->flags |= ATTR_FLAG_COMPRESSED; - attr->nres.c_unit = NTFS_LZNT_CUNIT; - } else { - attr->flags &= ~ATTR_FLAG_COMPRESSED; - attr->nres.c_unit = 0; - } - mi->dirty = true; - - return 0; -} diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c index 9b6a3f8d2e7c..1e99a35691cd 100644 --- a/fs/ntfs3/file.c +++ b/fs/ntfs3/file.c @@ -50,90 +50,6 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg) } /* - * ntfs_fileattr_get - inode_operations::fileattr_get - */ -int ntfs_fileattr_get(struct dentry *dentry, struct fileattr *fa) -{ - struct inode *inode = d_inode(dentry); - struct ntfs_inode *ni = ntfs_i(inode); - u32 flags = 0; - - if (inode->i_flags & S_IMMUTABLE) - flags |= FS_IMMUTABLE_FL; - - if (inode->i_flags & S_APPEND) - flags |= FS_APPEND_FL; - - if (is_compressed(ni)) - flags |= FS_COMPR_FL; - - if (is_encrypted(ni)) - flags |= FS_ENCRYPT_FL; - - fileattr_fill_flags(fa, flags); - - return 0; -} - -/* - * ntfs_fileattr_set - inode_operations::fileattr_set - */ -int ntfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, - struct fileattr *fa) -{ - struct inode *inode = d_inode(dentry); - struct ntfs_inode *ni = ntfs_i(inode); - u32 flags = fa->flags; - unsigned int new_fl = 0; - - if (fileattr_has_fsx(fa)) - return -EOPNOTSUPP; - - if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_COMPR_FL)) - return -EOPNOTSUPP; - - if (flags & FS_IMMUTABLE_FL) - new_fl |= S_IMMUTABLE; - - if (flags & FS_APPEND_FL) - new_fl |= S_APPEND; - - /* Allowed to change compression for empty files and for directories only. */ - if (!is_dedup(ni) && !is_encrypted(ni) && - (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) { - int err = 0; - struct address_space *mapping = inode->i_mapping; - - /* write out all data and wait. */ - filemap_invalidate_lock(mapping); - err = filemap_write_and_wait(mapping); - - if (err >= 0) { - /* Change compress state. */ - bool compr = flags & FS_COMPR_FL; - err = ni_set_compress(inode, compr); - - /* For files change a_ops too. */ - if (!err) - mapping->a_ops = compr ? 
&ntfs_aops_cmpr : - &ntfs_aops; - } - - filemap_invalidate_unlock(mapping); - - if (err) - return err; - } - - inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND); - - inode_set_ctime_current(inode); - mark_inode_dirty(inode); - - return 0; -} - -/* * ntfs_ioctl - file_operations::unlocked_ioctl */ long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg) @@ -430,7 +346,6 @@ static int ntfs_extend(struct inode *inode, loff_t pos, size_t count, } if (extend_init && !is_compressed(ni)) { - WARN_ON(ni->i_valid >= pos); err = ntfs_extend_initialized_size(file, ni, ni->i_valid, pos); if (err) goto out; @@ -998,7 +913,8 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) struct ntfs_inode *ni = ntfs_i(inode); u64 valid = ni->i_valid; struct ntfs_sb_info *sbi = ni->mi.sbi; - struct page *page, **pages = NULL; + struct page **pages = NULL; + struct folio *folio; size_t written = 0; u8 frame_bits = NTFS_LZNT_CUNIT + sbi->cluster_bits; u32 frame_size = 1u << frame_bits; @@ -1008,7 +924,6 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) u64 frame_vbo; pgoff_t index; bool frame_uptodate; - struct folio *folio; if (frame_size < PAGE_SIZE) { /* @@ -1062,8 +977,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) pages_per_frame); if (err) { for (ip = 0; ip < pages_per_frame; ip++) { - page = pages[ip]; - folio = page_folio(page); + folio = page_folio(pages[ip]); folio_unlock(folio); folio_put(folio); } @@ -1074,10 +988,9 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) ip = off >> PAGE_SHIFT; off = offset_in_page(valid); for (; ip < pages_per_frame; ip++, off = 0) { - page = pages[ip]; - folio = page_folio(page); - zero_user_segment(page, off, PAGE_SIZE); - flush_dcache_page(page); + folio = page_folio(pages[ip]); + folio_zero_segment(folio, off, PAGE_SIZE); + flush_dcache_folio(folio); folio_mark_uptodate(folio); } @@ -1086,8 +999,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) ni_unlock(ni); for (ip = 0; ip < pages_per_frame; ip++) { - page = pages[ip]; - folio = page_folio(page); + folio = page_folio(pages[ip]); folio_mark_uptodate(folio); folio_unlock(folio); folio_put(folio); @@ -1131,8 +1043,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) if (err) { for (ip = 0; ip < pages_per_frame; ip++) { - page = pages[ip]; - folio = page_folio(page); + folio = page_folio(pages[ip]); folio_unlock(folio); folio_put(folio); } @@ -1150,10 +1061,10 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) for (;;) { size_t cp, tail = PAGE_SIZE - off; - page = pages[ip]; - cp = copy_page_from_iter_atomic(page, off, + folio = page_folio(pages[ip]); + cp = copy_folio_from_iter_atomic(folio, off, min(tail, bytes), from); - flush_dcache_page(page); + flush_dcache_folio(folio); copied += cp; bytes -= cp; @@ -1173,9 +1084,8 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) ni_unlock(ni); for (ip = 0; ip < pages_per_frame; ip++) { - page = pages[ip]; - ClearPageDirty(page); - folio = page_folio(page); + folio = page_folio(pages[ip]); + folio_clear_dirty(folio); folio_mark_uptodate(folio); folio_unlock(folio); folio_put(folio); @@ -1409,8 +1319,6 @@ const struct inode_operations ntfs_file_inode_operations = { .get_acl = ntfs_get_acl, .set_acl = ntfs_set_acl, .fiemap = ntfs_fiemap, - .fileattr_get = ntfs_fileattr_get, - .fileattr_set = ntfs_fileattr_set, }; const struct 
file_operations ntfs_file_operations = { diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c index b7a83200f2cc..756e1306fe6c 100644 --- a/fs/ntfs3/frecord.c +++ b/fs/ntfs3/frecord.c @@ -3327,77 +3327,3 @@ out: return 0; } - -/* - * ni_set_compress - * - * Helper for 'ntfs_fileattr_set'. - * Changes compression for empty files and directories only. - */ -int ni_set_compress(struct inode *inode, bool compr) -{ - int err; - struct ntfs_inode *ni = ntfs_i(inode); - struct ATTR_STD_INFO *std; - const char *bad_inode; - - if (is_compressed(ni) == !!compr) - return 0; - - if (is_sparsed(ni)) { - /* sparse and compress not compatible. */ - return -EOPNOTSUPP; - } - - if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) { - /*Skip other inodes. (symlink,fifo,...) */ - return -EOPNOTSUPP; - } - - bad_inode = NULL; - - ni_lock(ni); - - std = ni_std(ni); - if (!std) { - bad_inode = "no std"; - goto out; - } - - if (S_ISREG(inode->i_mode)) { - err = attr_set_compress(ni, compr); - if (err) { - if (err == -ENOENT) { - /* Fix on the fly? */ - /* Each file must contain data attribute. */ - bad_inode = "no data attribute"; - } - goto out; - } - } - - ni->std_fa = std->fa; - if (compr) { - std->fa &= ~FILE_ATTRIBUTE_SPARSE_FILE; - std->fa |= FILE_ATTRIBUTE_COMPRESSED; - } else { - std->fa &= ~FILE_ATTRIBUTE_COMPRESSED; - } - - if (ni->std_fa != std->fa) { - ni->std_fa = std->fa; - ni->mi.dirty = true; - } - /* update duplicate information and directory entries in ni_write_inode.*/ - ni->ni_flags |= NI_FLAG_UPDATE_PARENT; - err = 0; - -out: - ni_unlock(ni); - if (bad_inode) { - ntfs_bad_inode(inode, bad_inode); - err = -EINVAL; - } - - return err; -} diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c index d0d530f4e2b9..38934e6978ec 100644 --- a/fs/ntfs3/fslog.c +++ b/fs/ntfs3/fslog.c @@ -3091,16 +3091,16 @@ static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe, inode = ilookup(sbi->sb, rno); if (inode) { mi = &ntfs_i(inode)->mi; - } else if (op == InitializeFileRecordSegment) { - mi = kzalloc(sizeof(struct mft_inode), GFP_NOFS); - if (!mi) - return -ENOMEM; - err = mi_format_new(mi, sbi, rno, 0, false); - if (err) - goto out; } else { /* Read from disk. 
*/ err = mi_get(sbi, rno, &mi); + if (err && op == InitializeFileRecordSegment) { + mi = kzalloc(sizeof(struct mft_inode), + GFP_NOFS); + if (!mi) + return -ENOMEM; + err = mi_format_new(mi, sbi, rno, 0, false); + } if (err) return err; } @@ -3109,15 +3109,13 @@ static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe, if (op == DeallocateFileRecordSegment) goto skip_load_parent; - if (InitializeFileRecordSegment != op) { - if (rec->rhdr.sign == NTFS_BAAD_SIGNATURE) - goto dirty_vol; - if (!check_lsn(&rec->rhdr, rlsn)) - goto out; - if (!check_file_record(rec, NULL, sbi)) - goto dirty_vol; - attr = Add2Ptr(rec, roff); - } + if (rec->rhdr.sign == NTFS_BAAD_SIGNATURE) + goto dirty_vol; + if (!check_lsn(&rec->rhdr, rlsn)) + goto out; + if (!check_file_record(rec, NULL, sbi)) + goto dirty_vol; + attr = Add2Ptr(rec, roff); if (is_rec_base(rec) || InitializeFileRecordSegment == op) { rno_base = rno; @@ -3143,7 +3141,7 @@ static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe, if (inode) iput(inode); - else if (mi) + else mi_put(mi); inode = inode_parent; diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c index 78d20e4baa2c..1bf2a6593dec 100644 --- a/fs/ntfs3/index.c +++ b/fs/ntfs3/index.c @@ -2182,6 +2182,10 @@ static int indx_get_entry_to_replace(struct ntfs_index *indx, e = hdr_first_de(&n->index->ihdr); fnd_push(fnd, n, e); + if (!e) { + err = -EINVAL; + goto out; + } if (!de_is_last(e)) { /* @@ -2203,6 +2207,10 @@ static int indx_get_entry_to_replace(struct ntfs_index *indx, n = fnd->nodes[level]; te = hdr_first_de(&n->index->ihdr); + if (!te) { + err = -EINVAL; + goto out; + } /* Copy the candidate entry into the replacement entry buffer. */ re = kmalloc(le16_to_cpu(te->size) + sizeof(u64), GFP_NOFS); if (!re) { diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c index 3e2957a1e360..0f0d27d4644a 100644 --- a/fs/ntfs3/inode.c +++ b/fs/ntfs3/inode.c @@ -805,6 +805,10 @@ static ssize_t ntfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter) ret = 0; goto out; } + if (is_compressed(ni)) { + ret = 0; + goto out; + } ret = blockdev_direct_IO(iocb, inode, iter, wr ? 
ntfs_get_block_direct_IO_W : @@ -2068,5 +2072,6 @@ const struct address_space_operations ntfs_aops_cmpr = { .read_folio = ntfs_read_folio, .readahead = ntfs_readahead, .dirty_folio = block_dirty_folio, + .direct_IO = ntfs_direct_IO, }; // clang-format on diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c index 652735a0b0c4..b807744fc6a9 100644 --- a/fs/ntfs3/namei.c +++ b/fs/ntfs3/namei.c @@ -507,8 +507,6 @@ const struct inode_operations ntfs_dir_inode_operations = { .getattr = ntfs_getattr, .listxattr = ntfs_listxattr, .fiemap = ntfs_fiemap, - .fileattr_get = ntfs_fileattr_get, - .fileattr_set = ntfs_fileattr_set, }; const struct inode_operations ntfs_special_inode_operations = { diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h index d628977e2556..36b8052660d5 100644 --- a/fs/ntfs3/ntfs_fs.h +++ b/fs/ntfs3/ntfs_fs.h @@ -454,7 +454,6 @@ int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes); int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes); int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size); int attr_force_nonresident(struct ntfs_inode *ni); -int attr_set_compress(struct ntfs_inode *ni, bool compr); /* Functions from attrlist.c */ void al_destroy(struct ntfs_inode *ni); @@ -497,9 +496,6 @@ extern const struct file_operations ntfs_dir_operations; extern const struct file_operations ntfs_legacy_dir_operations; /* Globals from file.c */ -int ntfs_fileattr_get(struct dentry *dentry, struct fileattr *fa); -int ntfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry, - struct fileattr *fa); int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path, struct kstat *stat, u32 request_mask, u32 flags); int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry, @@ -585,7 +581,6 @@ int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni, bool *is_bad); bool ni_is_dirty(struct inode *inode); -int ni_set_compress(struct inode *inode, bool compr); /* Globals from fslog.c */ bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes); diff --git a/fs/ocfs2/cluster/tcp.c b/fs/ocfs2/cluster/tcp.c index fce9beb214f0..43e652a2adaf 100644 --- a/fs/ocfs2/cluster/tcp.c +++ b/fs/ocfs2/cluster/tcp.c @@ -1483,7 +1483,7 @@ static void o2net_sc_send_keep_req(struct work_struct *work) sc_put(sc); } -/* socket shutdown does a del_timer_sync against this as it tears down. +/* socket shutdown does a timer_delete_sync against this as it tears down. * we can't start this timer until we've got to the point in sc buildup * where shutdown is going to be involved */ static void o2net_idle_timer(struct timer_list *t) diff --git a/fs/ocfs2/filecheck.c b/fs/ocfs2/filecheck.c index 1ad7106741f8..3ad7baf67658 100644 --- a/fs/ocfs2/filecheck.c +++ b/fs/ocfs2/filecheck.c @@ -505,5 +505,5 @@ static ssize_t ocfs2_filecheck_attr_store(struct kobject *kobj, ocfs2_filecheck_handle_entry(ent, entry); exit: - return (!ret ? 
count : ret); + return ret ?: count; } diff --git a/fs/ocfs2/quota_local.c b/fs/ocfs2/quota_local.c index e272429da3db..de7f12858729 100644 --- a/fs/ocfs2/quota_local.c +++ b/fs/ocfs2/quota_local.c @@ -674,7 +674,7 @@ out_put: break; } out: - kfree(rec); + ocfs2_free_quota_recovery(rec); return status; } diff --git a/fs/ocfs2/stackglue.c b/fs/ocfs2/stackglue.c index ddd761cf44c8..a28c127b9934 100644 --- a/fs/ocfs2/stackglue.c +++ b/fs/ocfs2/stackglue.c @@ -691,8 +691,7 @@ static void __exit ocfs2_stack_glue_exit(void) memset(&locking_max_version, 0, sizeof(struct ocfs2_protocol_version)); ocfs2_sysfs_exit(); - if (ocfs2_table_header) - unregister_sysctl_table(ocfs2_table_header); + unregister_sysctl_table(ocfs2_table_header); } MODULE_AUTHOR("Oracle"); diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h index 3d4b883a7660..3e153c2f6b82 100644 --- a/fs/orangefs/orangefs-kernel.h +++ b/fs/orangefs/orangefs-kernel.h @@ -32,6 +32,8 @@ #include <linux/slab.h> #include <linux/types.h> #include <linux/fs.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/vmalloc.h> #include <linux/aio.h> @@ -328,11 +330,9 @@ void purge_waiting_ops(void); * defined in super.c */ extern uint64_t orangefs_features; +extern const struct fs_parameter_spec orangefs_fs_param_spec[]; -struct dentry *orangefs_mount(struct file_system_type *fst, - int flags, - const char *devname, - void *data); +int orangefs_init_fs_context(struct fs_context *fc); void orangefs_kill_sb(struct super_block *sb); int orangefs_remount(struct orangefs_sb_info_s *); diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c index 5ab741c60b7e..7ac16a4d2dc6 100644 --- a/fs/orangefs/orangefs-mod.c +++ b/fs/orangefs/orangefs-mod.c @@ -46,7 +46,8 @@ MODULE_PARM_DESC(hash_table_size, static struct file_system_type orangefs_fs_type = { .name = "pvfs2", - .mount = orangefs_mount, + .init_fs_context = orangefs_init_fs_context, + .parameters = orangefs_fs_param_spec, .kill_sb = orangefs_kill_sb, .owner = THIS_MODULE, }; diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c index eba3e357192e..64ca9498f550 100644 --- a/fs/orangefs/super.c +++ b/fs/orangefs/super.c @@ -9,7 +9,6 @@ #include "orangefs-kernel.h" #include "orangefs-bufmap.h" -#include <linux/parser.h> #include <linux/hashtable.h> #include <linux/seq_file.h> @@ -22,18 +21,16 @@ LIST_HEAD(orangefs_superblocks); DEFINE_SPINLOCK(orangefs_superblocks_lock); enum { - Opt_intr, Opt_acl, + Opt_intr, Opt_local_lock, - - Opt_err }; -static const match_table_t tokens = { - { Opt_acl, "acl" }, - { Opt_intr, "intr" }, - { Opt_local_lock, "local_lock" }, - { Opt_err, NULL } +const struct fs_parameter_spec orangefs_fs_param_spec[] = { + fsparam_flag ("acl", Opt_acl), + fsparam_flag ("intr", Opt_intr), + fsparam_flag ("local_lock", Opt_local_lock), + {} }; uint64_t orangefs_features; @@ -51,48 +48,30 @@ static int orangefs_show_options(struct seq_file *m, struct dentry *root) return 0; } -static int parse_mount_options(struct super_block *sb, char *options, - int silent) +static int orangefs_parse_param(struct fs_context *fc, + struct fs_parameter *param) { - struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(sb); - substring_t args[MAX_OPT_ARGS]; - char *p; - - /* - * Force any potential flags that might be set from the mount - * to zero, ie, initialize to unset. 
- */ - sb->s_flags &= ~SB_POSIXACL; - orangefs_sb->flags &= ~ORANGEFS_OPT_INTR; - orangefs_sb->flags &= ~ORANGEFS_OPT_LOCAL_LOCK; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_acl: - sb->s_flags |= SB_POSIXACL; - break; - case Opt_intr: - orangefs_sb->flags |= ORANGEFS_OPT_INTR; - break; - case Opt_local_lock: - orangefs_sb->flags |= ORANGEFS_OPT_LOCAL_LOCK; - break; - default: - goto fail; - } + struct orangefs_sb_info_s *orangefs_sb = fc->s_fs_info; + struct fs_parse_result result; + int opt; + + opt = fs_parse(fc, orangefs_fs_param_spec, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_acl: + fc->sb_flags |= SB_POSIXACL; + break; + case Opt_intr: + orangefs_sb->flags |= ORANGEFS_OPT_INTR; + break; + case Opt_local_lock: + orangefs_sb->flags |= ORANGEFS_OPT_LOCAL_LOCK; + break; } return 0; -fail: - if (!silent) - gossip_err("Error: mount option [%s] is not supported.\n", p); - return -EINVAL; } static void orangefs_inode_cache_ctor(void *req) @@ -223,10 +202,20 @@ out_op_release: * Remount as initiated by VFS layer. We just need to reparse the mount * options, no need to signal pvfs2-client-core about it. */ -static int orangefs_remount_fs(struct super_block *sb, int *flags, char *data) +static int orangefs_reconfigure(struct fs_context *fc) { - gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_remount_fs: called\n"); - return parse_mount_options(sb, data, 1); + struct super_block *sb = fc->root->d_sb; + struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(sb); + struct orangefs_sb_info_s *revised = fc->s_fs_info; + unsigned int flags; + + flags = orangefs_sb->flags; + flags &= ~(ORANGEFS_OPT_INTR | ORANGEFS_OPT_LOCAL_LOCK); + flags |= revised->flags; + WRITE_ONCE(orangefs_sb->flags, flags); + + gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_reconfigure: called\n"); + return 0; } /* @@ -319,7 +308,6 @@ static const struct super_operations orangefs_s_ops = { .write_inode = orangefs_write_inode, .drop_inode = generic_delete_inode, .statfs = orangefs_statfs, - .remount_fs = orangefs_remount_fs, .show_options = orangefs_show_options, }; @@ -410,8 +398,8 @@ static int orangefs_unmount(int id, __s32 fs_id, const char *devname) } static int orangefs_fill_sb(struct super_block *sb, - struct orangefs_fs_mount_response *fs_mount, - void *data, int silent) + struct fs_context *fc, + struct orangefs_fs_mount_response *fs_mount) { int ret; struct inode *root; @@ -424,12 +412,6 @@ static int orangefs_fill_sb(struct super_block *sb, ORANGEFS_SB(sb)->fs_id = fs_mount->fs_id; ORANGEFS_SB(sb)->id = fs_mount->id; - if (data) { - ret = parse_mount_options(sb, data, silent); - if (ret) - return ret; - } - /* Hang the xattr handlers off the superblock */ sb->s_xattr = orangefs_xattr_handlers; sb->s_magic = ORANGEFS_SUPER_MAGIC; @@ -470,30 +452,24 @@ static int orangefs_fill_sb(struct super_block *sb, return 0; } -struct dentry *orangefs_mount(struct file_system_type *fst, - int flags, - const char *devname, - void *data) +static int orangefs_get_tree(struct fs_context *fc) { int ret; struct super_block *sb = ERR_PTR(-EINVAL); struct orangefs_kernel_op_s *new_op; - struct dentry *d = ERR_PTR(-EINVAL); + + if (!fc->source) + return invalf(fc, "Device name not specified.\n"); gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_mount: called with devname %s\n", - devname); - - if (!devname) { - gossip_err("ERROR: device name not specified.\n"); - return ERR_PTR(-EINVAL); - } + fc->source); 
new_op = op_alloc(ORANGEFS_VFS_OP_FS_MOUNT); if (!new_op) - return ERR_PTR(-ENOMEM); + return -ENOMEM; - strscpy(new_op->upcall.req.fs_mount.orangefs_config_server, devname); + strscpy(new_op->upcall.req.fs_mount.orangefs_config_server, fc->source); gossip_debug(GOSSIP_SUPER_DEBUG, "Attempting ORANGEFS Mount via host %s\n", @@ -511,37 +487,27 @@ struct dentry *orangefs_mount(struct file_system_type *fst, goto free_op; } - sb = sget(fst, NULL, set_anon_super, flags, NULL); + sb = sget_fc(fc, NULL, set_anon_super_fc); if (IS_ERR(sb)) { - d = ERR_CAST(sb); + ret = PTR_ERR(sb); orangefs_unmount(new_op->downcall.resp.fs_mount.id, - new_op->downcall.resp.fs_mount.fs_id, devname); - goto free_op; - } - - /* alloc and init our private orangefs sb info */ - sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL); - if (!ORANGEFS_SB(sb)) { - d = ERR_PTR(-ENOMEM); + new_op->downcall.resp.fs_mount.fs_id, + fc->source); goto free_op; } - ret = orangefs_fill_sb(sb, - &new_op->downcall.resp.fs_mount, data, - flags & SB_SILENT ? 1 : 0); + /* init our private orangefs sb info */ + ret = orangefs_fill_sb(sb, fc, &new_op->downcall.resp.fs_mount); - if (ret) { - d = ERR_PTR(ret); + if (ret) goto free_sb_and_op; - } /* * on successful mount, store the devname and data * used */ - strscpy(ORANGEFS_SB(sb)->devname, devname); - + strscpy(ORANGEFS_SB(sb)->devname, fc->source); /* mount_pending must be cleared */ ORANGEFS_SB(sb)->mount_pending = 0; @@ -564,7 +530,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst, if (orangefs_userspace_version >= 20906) { new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES); if (!new_op) - return ERR_PTR(-ENOMEM); + return -ENOMEM; new_op->upcall.req.features.features = 0; ret = service_operation(new_op, "orangefs_features", 0); orangefs_features = new_op->downcall.resp.features.features; @@ -573,7 +539,8 @@ struct dentry *orangefs_mount(struct file_system_type *fst, orangefs_features = 0; } - return dget(sb->s_root); + fc->root = dget(sb->s_root); + return 0; free_sb_and_op: /* Will call orangefs_kill_sb with sb not in list. */ @@ -589,7 +556,43 @@ free_op: op_release(new_op); - return d; + return ret; +} + +static void orangefs_free_fc(struct fs_context *fc) +{ + kfree(fc->s_fs_info); +} + +static const struct fs_context_operations orangefs_context_ops = { + .free = orangefs_free_fc, + .parse_param = orangefs_parse_param, + .get_tree = orangefs_get_tree, + .reconfigure = orangefs_reconfigure, +}; + +/* + * Set up the filesystem mount context. + */ +int orangefs_init_fs_context(struct fs_context *fc) +{ + struct orangefs_sb_info_s *osi; + + osi = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL); + if (!osi) + return -ENOMEM; + + /* + * Force any potential flags that might be set from the mount + * to zero, ie, initialize to unset. 
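/*
 * Illustration (not part of the patch): the orangefs conversion above follows
 * the generic new-mount-API shape. A condensed sketch of that shape for a
 * hypothetical filesystem "examplefs" (the names and the single "intr" option
 * are invented; only the fs_parser/fs_context calls mirror real APIs).
 */
#include <linux/fs.h>
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/slab.h>

struct examplefs_info { unsigned int flags; };
#define EXAMPLEFS_OPT_INTR	0x01

enum { Opt_intr };

static const struct fs_parameter_spec examplefs_param_spec[] = {
	fsparam_flag("intr", Opt_intr),
	{}
};

static int examplefs_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	struct examplefs_info *info = fc->s_fs_info;
	struct fs_parse_result result;
	int opt;

	opt = fs_parse(fc, examplefs_param_spec, param, &result);
	if (opt < 0)
		return opt;
	if (opt == Opt_intr)
		info->flags |= EXAMPLEFS_OPT_INTR;
	return 0;
}

static int examplefs_fill_super(struct super_block *sb, struct fs_context *fc)
{
	return -ENOSYS;		/* a real filesystem sets up the root inode here */
}

static int examplefs_get_tree(struct fs_context *fc)
{
	return get_tree_nodev(fc, examplefs_fill_super);
}

static void examplefs_free_fc(struct fs_context *fc)
{
	kfree(fc->s_fs_info);
}

static const struct fs_context_operations examplefs_context_ops = {
	.free		= examplefs_free_fc,
	.parse_param	= examplefs_parse_param,
	.get_tree	= examplefs_get_tree,
};

static int examplefs_init_fs_context(struct fs_context *fc)
{
	fc->s_fs_info = kzalloc(sizeof(struct examplefs_info), GFP_KERNEL);
	if (!fc->s_fs_info)
		return -ENOMEM;
	fc->ops = &examplefs_context_ops;
	return 0;
}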
+ */ + fc->sb_flags_mask &= ~SB_POSIXACL; + osi->flags &= ~ORANGEFS_OPT_INTR; + osi->flags &= ~ORANGEFS_OPT_LOCAL_LOCK; + + fc->s_fs_info = osi; + fc->ops = &orangefs_context_ops; + return 0; } void orangefs_kill_sb(struct super_block *sb) diff --git a/fs/pipe.c b/fs/pipe.c index da45edd68c41..45077c37bad1 100644 --- a/fs/pipe.c +++ b/fs/pipe.c @@ -26,6 +26,7 @@ #include <linux/memcontrol.h> #include <linux/watch_queue.h> #include <linux/sysctl.h> +#include <linux/sort.h> #include <linux/uaccess.h> #include <asm/ioctls.h> @@ -76,8 +77,6 @@ static unsigned long pipe_user_pages_soft = PIPE_DEF_BUFFERS * INR_OPEN_CUR; * -- Manfred Spraul <manfred@colorfullife.com> 2002-05-09 */ -#define cmp_int(l, r) ((l > r) - (l < r)) - #ifdef CONFIG_PROVE_LOCKING static int pipe_lock_cmp_fn(const struct lockdep_map *a, const struct lockdep_map *b) diff --git a/fs/pnode.c b/fs/pnode.c index fb77427df39e..ffd429b760d5 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -231,8 +231,8 @@ static int propagate_one(struct mount *m, struct mountpoint *dest_mp) /* skip if mountpoint isn't visible in m */ if (!is_subdir(dest_mp->m_dentry, m->mnt.mnt_root)) return 0; - /* skip if m is in the anon_ns we are emptying */ - if (m->mnt_ns->mntns_flags & MNTNS_PROPAGATING) + /* skip if m is in the anon_ns */ + if (is_anon_ns(m->mnt_ns)) return 0; if (peers(m, last_dest)) { diff --git a/fs/proc/base.c b/fs/proc/base.c index fe33a5843fbd..c667702dc69b 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -827,7 +827,13 @@ static const struct file_operations proc_single_file_operations = { .release = single_release, }; - +/* + * proc_mem_open() can return errno, NULL or mm_struct*. + * + * - Returns NULL if the task has no mm (PF_KTHREAD or PF_EXITING) + * - Returns mm_struct* on success + * - Returns error code on failure + */ struct mm_struct *proc_mem_open(struct inode *inode, unsigned int mode) { struct task_struct *task = get_proc_task(inode); @@ -854,8 +860,8 @@ static int __mem_open(struct inode *inode, struct file *file, unsigned int mode) { struct mm_struct *mm = proc_mem_open(inode, mode); - if (IS_ERR(mm)) - return PTR_ERR(mm); + if (IS_ERR_OR_NULL(mm)) + return mm ? PTR_ERR(mm) : -ESRCH; file->private_data = mm; return 0; diff --git a/fs/proc/page.c b/fs/proc/page.c index 23fc771100ae..999af26c7298 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -22,6 +22,12 @@ #define KPMMASK (KPMSIZE - 1) #define KPMBITS (KPMSIZE * BITS_PER_BYTE) +enum kpage_operation { + KPAGE_FLAGS, + KPAGE_COUNT, + KPAGE_CGROUP, +}; + static inline unsigned long get_max_dump_pfn(void) { #ifdef CONFIG_SPARSEMEM @@ -37,19 +43,17 @@ static inline unsigned long get_max_dump_pfn(void) #endif } -/* /proc/kpagecount - an array exposing page mapcounts - * - * Each entry is a u64 representing the corresponding - * physical page mapcount. 
- */ -static ssize_t kpagecount_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) +static ssize_t kpage_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos, + enum kpage_operation op) { const unsigned long max_dump_pfn = get_max_dump_pfn(); u64 __user *out = (u64 __user *)buf; + struct page *page; unsigned long src = *ppos; unsigned long pfn; ssize_t ret = 0; + u64 info; pfn = src / KPMSIZE; if (src & KPMMASK || count & KPMMASK) @@ -59,24 +63,34 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); while (count > 0) { - struct page *page; - u64 mapcount = 0; - /* * TODO: ZONE_DEVICE support requires to identify * memmaps that were actually initialized. */ page = pfn_to_online_page(pfn); - if (page) { - struct folio *folio = page_folio(page); - if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) - mapcount = folio_precise_page_mapcount(folio, page); - else - mapcount = folio_average_page_mapcount(folio); - } - - if (put_user(mapcount, out)) { + if (page) { + switch (op) { + case KPAGE_FLAGS: + info = stable_page_flags(page); + break; + case KPAGE_COUNT: + if (IS_ENABLED(CONFIG_PAGE_MAPCOUNT)) + info = folio_precise_page_mapcount(page_folio(page), page); + else + info = folio_average_page_mapcount(page_folio(page)); + break; + case KPAGE_CGROUP: + info = page_cgroup_ino(page); + break; + default: + info = 0; + break; + } + } else + info = 0; + + if (put_user(info, out)) { ret = -EFAULT; break; } @@ -94,17 +108,23 @@ static ssize_t kpagecount_read(struct file *file, char __user *buf, return ret; } +/* /proc/kpagecount - an array exposing page mapcounts + * + * Each entry is a u64 representing the corresponding + * physical page mapcount. + */ +static ssize_t kpagecount_read(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + return kpage_read(file, buf, count, ppos, KPAGE_COUNT); +} + static const struct proc_ops kpagecount_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, .proc_read = kpagecount_read, }; -/* /proc/kpageflags - an array exposing page flags - * - * Each entry is a u64 representing the corresponding - * physical page flags. - */ static inline u64 kpf_copy_bit(u64 kflags, int ubit, int kbit) { @@ -225,47 +245,17 @@ u64 stable_page_flags(const struct page *page) #endif return u; -}; +} +/* /proc/kpageflags - an array exposing page flags + * + * Each entry is a u64 representing the corresponding + * physical page flags. + */ static ssize_t kpageflags_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) + size_t count, loff_t *ppos) { - const unsigned long max_dump_pfn = get_max_dump_pfn(); - u64 __user *out = (u64 __user *)buf; - unsigned long src = *ppos; - unsigned long pfn; - ssize_t ret = 0; - - pfn = src / KPMSIZE; - if (src & KPMMASK || count & KPMMASK) - return -EINVAL; - if (src >= max_dump_pfn * KPMSIZE) - return 0; - count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); - - while (count > 0) { - /* - * TODO: ZONE_DEVICE support requires to identify - * memmaps that were actually initialized. 
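/*
 * Illustration (not part of the patch): the consolidated kpage_read() keeps
 * the existing user-space contract - each /proc/kpage* file is an array of
 * u64 entries indexed by PFN. A minimal reader for /proc/kpagecount (needs
 * root; the PFN used here is just an example value):
 */
#include <fcntl.h>
#include <stdint.h>
#include <stdio.h>
#include <unistd.h>

int main(void)
{
	unsigned long pfn = 0x1000;	/* example PFN */
	uint64_t mapcount;
	int fd = open("/proc/kpagecount", O_RDONLY);

	if (fd < 0)
		return 1;
	/* The entry for PFN n lives at byte offset n * sizeof(u64). */
	if (pread(fd, &mapcount, sizeof(mapcount),
		  pfn * sizeof(mapcount)) == sizeof(mapcount))
		printf("pfn 0x%lx mapcount %llu\n", pfn,
		       (unsigned long long)mapcount);
	close(fd);
	return 0;
}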
- */ - struct page *page = pfn_to_online_page(pfn); - - if (put_user(stable_page_flags(page), out)) { - ret = -EFAULT; - break; - } - - pfn++; - out++; - count -= KPMSIZE; - - cond_resched(); - } - - *ppos += (char __user *)out - buf; - if (!ret) - ret = (char __user *)out - buf; - return ret; + return kpage_read(file, buf, count, ppos, KPAGE_FLAGS); } static const struct proc_ops kpageflags_proc_ops = { @@ -276,53 +266,10 @@ static const struct proc_ops kpageflags_proc_ops = { #ifdef CONFIG_MEMCG static ssize_t kpagecgroup_read(struct file *file, char __user *buf, - size_t count, loff_t *ppos) + size_t count, loff_t *ppos) { - const unsigned long max_dump_pfn = get_max_dump_pfn(); - u64 __user *out = (u64 __user *)buf; - struct page *ppage; - unsigned long src = *ppos; - unsigned long pfn; - ssize_t ret = 0; - u64 ino; - - pfn = src / KPMSIZE; - if (src & KPMMASK || count & KPMMASK) - return -EINVAL; - if (src >= max_dump_pfn * KPMSIZE) - return 0; - count = min_t(unsigned long, count, (max_dump_pfn * KPMSIZE) - src); - - while (count > 0) { - /* - * TODO: ZONE_DEVICE support requires to identify - * memmaps that were actually initialized. - */ - ppage = pfn_to_online_page(pfn); - - if (ppage) - ino = page_cgroup_ino(ppage); - else - ino = 0; - - if (put_user(ino, out)) { - ret = -EFAULT; - break; - } - - pfn++; - out++; - count -= KPMSIZE; - - cond_resched(); - } - - *ppos += (char __user *)out - buf; - if (!ret) - ret = (char __user *)out - buf; - return ret; + return kpage_read(file, buf, count, ppos, KPAGE_CGROUP); } - static const struct proc_ops kpagecgroup_proc_ops = { .proc_flags = PROC_ENTRY_PERMANENT, .proc_lseek = mem_lseek, diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 994cde10e3f4..27972c0749e7 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -212,8 +212,8 @@ static int proc_maps_open(struct inode *inode, struct file *file, priv->inode = inode; priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR(priv->mm)) { - int err = PTR_ERR(priv->mm); + if (IS_ERR_OR_NULL(priv->mm)) { + int err = priv->mm ? PTR_ERR(priv->mm) : -ESRCH; seq_release_private(inode, file); return err; @@ -1325,8 +1325,8 @@ static int smaps_rollup_open(struct inode *inode, struct file *file) priv->inode = inode; priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR(priv->mm)) { - ret = PTR_ERR(priv->mm); + if (IS_ERR_OR_NULL(priv->mm)) { + ret = priv->mm ? PTR_ERR(priv->mm) : -ESRCH; single_release(inode, file); goto out_free; @@ -2069,8 +2069,8 @@ static int pagemap_open(struct inode *inode, struct file *file) struct mm_struct *mm; mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR(mm)) - return PTR_ERR(mm); + if (IS_ERR_OR_NULL(mm)) + return mm ? 
PTR_ERR(mm) : -ESRCH; file->private_data = mm; return 0; } @@ -2087,7 +2087,8 @@ static int pagemap_release(struct inode *inode, struct file *file) #define PM_SCAN_CATEGORIES (PAGE_IS_WPALLOWED | PAGE_IS_WRITTEN | \ PAGE_IS_FILE | PAGE_IS_PRESENT | \ PAGE_IS_SWAPPED | PAGE_IS_PFNZERO | \ - PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY) + PAGE_IS_HUGE | PAGE_IS_SOFT_DIRTY | \ + PAGE_IS_GUARD) #define PM_SCAN_FLAGS (PM_SCAN_WP_MATCHING | PM_SCAN_CHECK_WPASYNC) struct pagemap_scan_private { @@ -2128,12 +2129,14 @@ static unsigned long pagemap_page_category(struct pagemap_scan_private *p, if (!pte_swp_uffd_wp_any(pte)) categories |= PAGE_IS_WRITTEN; - if (p->masks_of_interest & PAGE_IS_FILE) { - swp = pte_to_swp_entry(pte); - if (is_pfn_swap_entry(swp) && - !folio_test_anon(pfn_swap_entry_folio(swp))) - categories |= PAGE_IS_FILE; - } + swp = pte_to_swp_entry(pte); + if (is_guard_swp_entry(swp)) + categories |= PAGE_IS_GUARD; + else if ((p->masks_of_interest & PAGE_IS_FILE) && + is_pfn_swap_entry(swp) && + !folio_test_anon(pfn_swap_entry_folio(swp))) + categories |= PAGE_IS_FILE; + if (pte_swp_soft_dirty(pte)) categories |= PAGE_IS_SOFT_DIRTY; } diff --git a/fs/proc/task_nommu.c b/fs/proc/task_nommu.c index bce674533000..59bfd61d653a 100644 --- a/fs/proc/task_nommu.c +++ b/fs/proc/task_nommu.c @@ -260,8 +260,8 @@ static int maps_open(struct inode *inode, struct file *file, priv->inode = inode; priv->mm = proc_mem_open(inode, PTRACE_MODE_READ); - if (IS_ERR(priv->mm)) { - int err = PTR_ERR(priv->mm); + if (IS_ERR_OR_NULL(priv->mm)) { + int err = priv->mm ? PTR_ERR(priv->mm) : -ESRCH; seq_release_private(inode, file); return err; diff --git a/fs/resctrl/Kconfig b/fs/resctrl/Kconfig new file mode 100644 index 000000000000..21671301bd8a --- /dev/null +++ b/fs/resctrl/Kconfig @@ -0,0 +1,39 @@ +config RESCTRL_FS + bool "CPU Resource Control Filesystem (resctrl)" + depends on ARCH_HAS_CPU_RESCTRL + select KERNFS + select PROC_CPU_RESCTRL if PROC_FS + help + Some architectures provide hardware facilities to group tasks and + monitor and control their usage of memory system resources such as + caches and memory bandwidth. Examples of such facilities include + Intel's Resource Director Technology (Intel(R) RDT) and AMD's + Platform Quality of Service (AMD QoS). + + If your system has the necessary support and you want to be able to + assign tasks to groups and manipulate the associated resource + monitors and controls from userspace, say Y here to get a mountable + 'resctrl' filesystem that lets you do just that. + + If nothing mounts or prods the 'resctrl' filesystem, resource + controls and monitors are left in a quiescent, permissive state. + + On architectures where this can be disabled independently, it is + safe to say N. + + See <file:Documentation/filesystems/resctrl.rst> for more information. + +config RESCTRL_FS_PSEUDO_LOCK + bool + depends on RESCTRL_FS + help + Software mechanism to pin data in a cache portion using + micro-architecture specific knowledge. + +config RESCTRL_RMID_DEPENDS_ON_CLOSID + bool + depends on RESCTRL_FS + help + Enabled by the architecture when the RMID values depend on the CLOSID. + This causes the CLOSID allocator to search for CLOSID with clean + RMID. 
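/*
 * Illustration (not part of the patch): once RESCTRL_FS is enabled and the
 * hardware support is present, the interface described by the Kconfig help
 * above is reached by mounting the 'resctrl' filesystem, conventionally on
 * /sys/fs/resctrl:
 */
#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
	/* Equivalent to: mount -t resctrl resctrl /sys/fs/resctrl */
	if (mount("resctrl", "/sys/fs/resctrl", "resctrl", 0, NULL)) {
		perror("mount resctrl");
		return 1;
	}
	puts("resctrl mounted");
	return 0;
}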
diff --git a/fs/resctrl/Makefile b/fs/resctrl/Makefile new file mode 100644 index 000000000000..e67f34d2236a --- /dev/null +++ b/fs/resctrl/Makefile @@ -0,0 +1,6 @@ +# SPDX-License-Identifier: GPL-2.0 +obj-$(CONFIG_RESCTRL_FS) += rdtgroup.o ctrlmondata.o monitor.o +obj-$(CONFIG_RESCTRL_FS_PSEUDO_LOCK) += pseudo_lock.o + +# To allow define_trace.h's recursive include: +CFLAGS_monitor.o = -I$(src) diff --git a/fs/resctrl/ctrlmondata.c b/fs/resctrl/ctrlmondata.c new file mode 100644 index 000000000000..6ed2dfd4dbbd --- /dev/null +++ b/fs/resctrl/ctrlmondata.c @@ -0,0 +1,661 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Resource Director Technology(RDT) + * - Cache Allocation code. + * + * Copyright (C) 2016 Intel Corporation + * + * Authors: + * Fenghua Yu <fenghua.yu@intel.com> + * Tony Luck <tony.luck@intel.com> + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cpu.h> +#include <linux/kernfs.h> +#include <linux/seq_file.h> +#include <linux/slab.h> +#include <linux/tick.h> + +#include "internal.h" + +struct rdt_parse_data { + struct rdtgroup *rdtgrp; + char *buf; +}; + +typedef int (ctrlval_parser_t)(struct rdt_parse_data *data, + struct resctrl_schema *s, + struct rdt_ctrl_domain *d); + +/* + * Check whether MBA bandwidth percentage value is correct. The value is + * checked against the minimum and max bandwidth values specified by the + * hardware. The allocated bandwidth percentage is rounded to the next + * control step available on the hardware. + */ +static bool bw_validate(char *buf, u32 *data, struct rdt_resource *r) +{ + int ret; + u32 bw; + + /* + * Only linear delay values is supported for current Intel SKUs. + */ + if (!r->membw.delay_linear && r->membw.arch_needs_linear) { + rdt_last_cmd_puts("No support for non-linear MB domains\n"); + return false; + } + + ret = kstrtou32(buf, 10, &bw); + if (ret) { + rdt_last_cmd_printf("Invalid MB value %s\n", buf); + return false; + } + + /* Nothing else to do if software controller is enabled. */ + if (is_mba_sc(r)) { + *data = bw; + return true; + } + + if (bw < r->membw.min_bw || bw > r->membw.max_bw) { + rdt_last_cmd_printf("MB value %u out of range [%d,%d]\n", + bw, r->membw.min_bw, r->membw.max_bw); + return false; + } + + *data = roundup(bw, (unsigned long)r->membw.bw_gran); + return true; +} + +static int parse_bw(struct rdt_parse_data *data, struct resctrl_schema *s, + struct rdt_ctrl_domain *d) +{ + struct resctrl_staged_config *cfg; + u32 closid = data->rdtgrp->closid; + struct rdt_resource *r = s->res; + u32 bw_val; + + cfg = &d->staged_config[s->conf_type]; + if (cfg->have_new_ctrl) { + rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); + return -EINVAL; + } + + if (!bw_validate(data->buf, &bw_val, r)) + return -EINVAL; + + if (is_mba_sc(r)) { + d->mbps_val[closid] = bw_val; + return 0; + } + + cfg->new_ctrl = bw_val; + cfg->have_new_ctrl = true; + + return 0; +} + +/* + * Check whether a cache bit mask is valid. + * On Intel CPUs, non-contiguous 1s value support is indicated by CPUID: + * - CPUID.0x10.1:ECX[3]: L3 non-contiguous 1s value supported if 1 + * - CPUID.0x10.2:ECX[3]: L2 non-contiguous 1s value supported if 1 + * + * Haswell does not support a non-contiguous 1s value and additionally + * requires at least two bits set. + * AMD allows non-contiguous bitmasks. 
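/*
 * Illustration (not part of the patch): bw_validate() above rounds a
 * requested memory-bandwidth percentage up to the next control step the
 * hardware exposes. The arithmetic is the usual round-up-to-multiple; e.g.
 * with a hypothetical granularity of 10, a request of 23 becomes 30. The
 * macro below mirrors the kernel's roundup() for a user-space demo.
 */
#include <stdio.h>

#define roundup(x, y)	((((x) + ((y) - 1)) / (y)) * (y))

int main(void)
{
	unsigned int bw_gran = 10;	/* example; the real value is r->membw.bw_gran */
	unsigned int requested = 23;

	printf("%u%% -> %u%%\n", requested, roundup(requested, bw_gran));
	return 0;
}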
+ */ +static bool cbm_validate(char *buf, u32 *data, struct rdt_resource *r) +{ + u32 supported_bits = BIT_MASK(r->cache.cbm_len) - 1; + unsigned int cbm_len = r->cache.cbm_len; + unsigned long first_bit, zero_bit, val; + int ret; + + ret = kstrtoul(buf, 16, &val); + if (ret) { + rdt_last_cmd_printf("Non-hex character in the mask %s\n", buf); + return false; + } + + if ((r->cache.min_cbm_bits > 0 && val == 0) || val > supported_bits) { + rdt_last_cmd_puts("Mask out of range\n"); + return false; + } + + first_bit = find_first_bit(&val, cbm_len); + zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); + + /* Are non-contiguous bitmasks allowed? */ + if (!r->cache.arch_has_sparse_bitmasks && + (find_next_bit(&val, cbm_len, zero_bit) < cbm_len)) { + rdt_last_cmd_printf("The mask %lx has non-consecutive 1-bits\n", val); + return false; + } + + if ((zero_bit - first_bit) < r->cache.min_cbm_bits) { + rdt_last_cmd_printf("Need at least %d bits in the mask\n", + r->cache.min_cbm_bits); + return false; + } + + *data = val; + return true; +} + +/* + * Read one cache bit mask (hex). Check that it is valid for the current + * resource type. + */ +static int parse_cbm(struct rdt_parse_data *data, struct resctrl_schema *s, + struct rdt_ctrl_domain *d) +{ + struct rdtgroup *rdtgrp = data->rdtgrp; + struct resctrl_staged_config *cfg; + struct rdt_resource *r = s->res; + u32 cbm_val; + + cfg = &d->staged_config[s->conf_type]; + if (cfg->have_new_ctrl) { + rdt_last_cmd_printf("Duplicate domain %d\n", d->hdr.id); + return -EINVAL; + } + + /* + * Cannot set up more than one pseudo-locked region in a cache + * hierarchy. + */ + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && + rdtgroup_pseudo_locked_in_hierarchy(d)) { + rdt_last_cmd_puts("Pseudo-locked region in hierarchy\n"); + return -EINVAL; + } + + if (!cbm_validate(data->buf, &cbm_val, r)) + return -EINVAL; + + if ((rdtgrp->mode == RDT_MODE_EXCLUSIVE || + rdtgrp->mode == RDT_MODE_SHAREABLE) && + rdtgroup_cbm_overlaps_pseudo_locked(d, cbm_val)) { + rdt_last_cmd_puts("CBM overlaps with pseudo-locked region\n"); + return -EINVAL; + } + + /* + * The CBM may not overlap with the CBM of another closid if + * either is exclusive. + */ + if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, true)) { + rdt_last_cmd_puts("Overlaps with exclusive group\n"); + return -EINVAL; + } + + if (rdtgroup_cbm_overlaps(s, d, cbm_val, rdtgrp->closid, false)) { + if (rdtgrp->mode == RDT_MODE_EXCLUSIVE || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + rdt_last_cmd_puts("Overlaps with other group\n"); + return -EINVAL; + } + } + + cfg->new_ctrl = cbm_val; + cfg->have_new_ctrl = true; + + return 0; +} + +/* + * For each domain in this resource we expect to find a series of: + * id=mask + * separated by ";". The "id" is in decimal, and must match one of + * the "id"s for this resource. 
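+ *
+ * For example, on an L3 resource with two cache domains the line
+ *	"0=0ff;1=3f0"
+ * stages CBM 0x0ff for domain 0 and CBM 0x3f0 for domain 1.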
+ */ +static int parse_line(char *line, struct resctrl_schema *s, + struct rdtgroup *rdtgrp) +{ + enum resctrl_conf_type t = s->conf_type; + ctrlval_parser_t *parse_ctrlval = NULL; + struct resctrl_staged_config *cfg; + struct rdt_resource *r = s->res; + struct rdt_parse_data data; + struct rdt_ctrl_domain *d; + char *dom = NULL, *id; + unsigned long dom_id; + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + switch (r->schema_fmt) { + case RESCTRL_SCHEMA_BITMAP: + parse_ctrlval = &parse_cbm; + break; + case RESCTRL_SCHEMA_RANGE: + parse_ctrlval = &parse_bw; + break; + } + + if (WARN_ON_ONCE(!parse_ctrlval)) + return -EINVAL; + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP && + (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA)) { + rdt_last_cmd_puts("Cannot pseudo-lock MBA resource\n"); + return -EINVAL; + } + +next: + if (!line || line[0] == '\0') + return 0; + dom = strsep(&line, ";"); + id = strsep(&dom, "="); + if (!dom || kstrtoul(id, 10, &dom_id)) { + rdt_last_cmd_puts("Missing '=' or non-numeric domain\n"); + return -EINVAL; + } + dom = strim(dom); + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + if (d->hdr.id == dom_id) { + data.buf = dom; + data.rdtgrp = rdtgrp; + if (parse_ctrlval(&data, s, d)) + return -EINVAL; + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + cfg = &d->staged_config[t]; + /* + * In pseudo-locking setup mode and just + * parsed a valid CBM that should be + * pseudo-locked. Only one locked region per + * resource group and domain so just do + * the required initialization for single + * region and return. + */ + rdtgrp->plr->s = s; + rdtgrp->plr->d = d; + rdtgrp->plr->cbm = cfg->new_ctrl; + d->plr = rdtgrp->plr; + return 0; + } + goto next; + } + } + return -EINVAL; +} + +static int rdtgroup_parse_resource(char *resname, char *tok, + struct rdtgroup *rdtgrp) +{ + struct resctrl_schema *s; + + list_for_each_entry(s, &resctrl_schema_all, list) { + if (!strcmp(resname, s->name) && rdtgrp->closid < s->num_closid) + return parse_line(tok, s, rdtgrp); + } + rdt_last_cmd_printf("Unknown or unsupported resource name '%s'\n", resname); + return -EINVAL; +} + +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct resctrl_schema *s; + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + char *tok, *resname; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + rdt_last_cmd_clear(); + + /* + * No changes to pseudo-locked region allowed. It has to be removed + * and re-created instead. + */ + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + ret = -EINVAL; + rdt_last_cmd_puts("Resource group is pseudo-locked\n"); + goto out; + } + + rdt_staged_configs_clear(); + + while ((tok = strsep(&buf, "\n")) != NULL) { + resname = strim(strsep(&tok, ":")); + if (!tok) { + rdt_last_cmd_puts("Missing ':'\n"); + ret = -EINVAL; + goto out; + } + if (tok[0] == '\0') { + rdt_last_cmd_printf("Missing '%s' value\n", resname); + ret = -EINVAL; + goto out; + } + ret = rdtgroup_parse_resource(resname, tok, rdtgrp); + if (ret) + goto out; + } + + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; + + /* + * Writes to mba_sc resources update the software controller, + * not the control MSR. 
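+ * For example, with the software controller enabled a line such as
+ * "MB:0=2048" (MBps) only updates mbps_val[] here; the throttling MSRs
+ * are adjusted later by update_mba_bw() from the MBM overflow worker.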
+ */ + if (is_mba_sc(r)) + continue; + + ret = resctrl_arch_update_domains(r, rdtgrp->closid); + if (ret) + goto out; + } + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + /* + * If pseudo-locking fails we keep the resource group in + * mode RDT_MODE_PSEUDO_LOCKSETUP with its class of service + * active and updated for just the domain the pseudo-locked + * region was requested for. + */ + ret = rdtgroup_pseudo_lock_create(rdtgrp); + } + +out: + rdt_staged_configs_clear(); + rdtgroup_kn_unlock(of->kn); + return ret ?: nbytes; +} + +static void show_doms(struct seq_file *s, struct resctrl_schema *schema, int closid) +{ + struct rdt_resource *r = schema->res; + struct rdt_ctrl_domain *dom; + bool sep = false; + u32 ctrl_val; + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + seq_printf(s, "%*s:", max_name_width, schema->name); + list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { + if (sep) + seq_puts(s, ";"); + + if (is_mba_sc(r)) + ctrl_val = dom->mbps_val[closid]; + else + ctrl_val = resctrl_arch_get_config(r, dom, closid, + schema->conf_type); + + seq_printf(s, schema->fmt_str, dom->hdr.id, ctrl_val); + sep = true; + } + seq_puts(s, "\n"); +} + +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct resctrl_schema *schema; + struct rdtgroup *rdtgrp; + int ret = 0; + u32 closid; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + list_for_each_entry(schema, &resctrl_schema_all, list) { + seq_printf(s, "%s:uninitialized\n", schema->name); + } + } else if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + if (!rdtgrp->plr->d) { + rdt_last_cmd_clear(); + rdt_last_cmd_puts("Cache domain offline\n"); + ret = -ENODEV; + } else { + seq_printf(s, "%s:%d=%x\n", + rdtgrp->plr->s->res->name, + rdtgrp->plr->d->hdr.id, + rdtgrp->plr->cbm); + } + } else { + closid = rdtgrp->closid; + list_for_each_entry(schema, &resctrl_schema_all, list) { + if (closid < schema->num_closid) + show_doms(s, schema, closid); + } + } + } else { + ret = -ENOENT; + } + rdtgroup_kn_unlock(of->kn); + return ret; +} + +static int smp_mon_event_count(void *arg) +{ + mon_event_count(arg); + + return 0; +} + +ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + rdt_last_cmd_clear(); + + if (!strcmp(buf, "mbm_local_bytes")) { + if (resctrl_arch_is_mbm_local_enabled()) + rdtgrp->mba_mbps_event = QOS_L3_MBM_LOCAL_EVENT_ID; + else + ret = -EINVAL; + } else if (!strcmp(buf, "mbm_total_bytes")) { + if (resctrl_arch_is_mbm_total_enabled()) + rdtgrp->mba_mbps_event = QOS_L3_MBM_TOTAL_EVENT_ID; + else + ret = -EINVAL; + } else { + ret = -EINVAL; + } + + if (ret) + rdt_last_cmd_printf("Unsupported event id '%s'\n", buf); + + rdtgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + +int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + if (rdtgrp) { + switch (rdtgrp->mba_mbps_event) { + case QOS_L3_MBM_LOCAL_EVENT_ID: + seq_puts(s, "mbm_local_bytes\n"); + break; + case QOS_L3_MBM_TOTAL_EVENT_ID: + seq_puts(s, 
"mbm_total_bytes\n"); + break; + default: + pr_warn_once("Bad event %d\n", rdtgrp->mba_mbps_event); + ret = -EINVAL; + break; + } + } else { + ret = -ENOENT; + } + + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +struct rdt_domain_hdr *resctrl_find_domain(struct list_head *h, int id, + struct list_head **pos) +{ + struct rdt_domain_hdr *d; + struct list_head *l; + + list_for_each(l, h) { + d = list_entry(l, struct rdt_domain_hdr, list); + /* When id is found, return its domain. */ + if (id == d->id) + return d; + /* Stop searching when finding id's position in sorted list. */ + if (id < d->id) + break; + } + + if (pos) + *pos = l; + + return NULL; +} + +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + cpumask_t *cpumask, int evtid, int first) +{ + int cpu; + + /* When picking a CPU from cpu_mask, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + /* + * Setup the parameters to pass to mon_event_count() to read the data. + */ + rr->rgrp = rdtgrp; + rr->evtid = evtid; + rr->r = r; + rr->d = d; + rr->first = first; + rr->arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, evtid); + if (IS_ERR(rr->arch_mon_ctx)) { + rr->err = -EINVAL; + return; + } + + cpu = cpumask_any_housekeeping(cpumask, RESCTRL_PICK_ANY_CPU); + + /* + * cpumask_any_housekeeping() prefers housekeeping CPUs, but + * are all the CPUs nohz_full? If yes, pick a CPU to IPI. + * MPAM's resctrl_arch_rmid_read() is unable to read the + * counters on some platforms if its called in IRQ context. + */ + if (tick_nohz_full_cpu(cpu)) + smp_call_function_any(cpumask, mon_event_count, rr, 1); + else + smp_call_on_cpu(cpu, smp_mon_event_count, rr, false); + + resctrl_arch_mon_ctx_free(r, evtid, rr->arch_mon_ctx); +} + +int rdtgroup_mondata_show(struct seq_file *m, void *arg) +{ + struct kernfs_open_file *of = m->private; + enum resctrl_res_level resid; + enum resctrl_event_id evtid; + struct rdt_domain_hdr *hdr; + struct rmid_read rr = {0}; + struct rdt_mon_domain *d; + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + struct mon_data *md; + int domid, ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto out; + } + + md = of->kn->priv; + if (WARN_ON_ONCE(!md)) { + ret = -EIO; + goto out; + } + + resid = md->rid; + domid = md->domid; + evtid = md->evtid; + r = resctrl_arch_get_resource(resid); + + if (md->sum) { + /* + * This file requires summing across all domains that share + * the L3 cache id that was provided in the "domid" field of the + * struct mon_data. Search all domains in the resource for + * one that matches this cache id. + */ + list_for_each_entry(d, &r->mon_domains, hdr.list) { + if (d->ci->id == domid) { + rr.ci = d->ci; + mon_event_read(&rr, r, NULL, rdtgrp, + &d->ci->shared_cpu_map, evtid, false); + goto checkresult; + } + } + ret = -ENOENT; + goto out; + } else { + /* + * This file provides data from a single domain. Search + * the resource to find the domain with "domid". 
+ */ + hdr = resctrl_find_domain(&r->mon_domains, domid, NULL); + if (!hdr || WARN_ON_ONCE(hdr->type != RESCTRL_MON_DOMAIN)) { + ret = -ENOENT; + goto out; + } + d = container_of(hdr, struct rdt_mon_domain, hdr); + mon_event_read(&rr, r, d, rdtgrp, &d->hdr.cpu_mask, evtid, false); + } + +checkresult: + + if (rr.err == -EIO) + seq_puts(m, "Error\n"); + else if (rr.err == -EINVAL) + seq_puts(m, "Unavailable\n"); + else + seq_printf(m, "%llu\n", rr.val); + +out: + rdtgroup_kn_unlock(of->kn); + return ret; +} diff --git a/fs/resctrl/internal.h b/fs/resctrl/internal.h new file mode 100644 index 000000000000..9a8cf6f11151 --- /dev/null +++ b/fs/resctrl/internal.h @@ -0,0 +1,426 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef _FS_RESCTRL_INTERNAL_H +#define _FS_RESCTRL_INTERNAL_H + +#include <linux/resctrl.h> +#include <linux/kernfs.h> +#include <linux/fs_context.h> +#include <linux/tick.h> + +#define CQM_LIMBOCHECK_INTERVAL 1000 + +/** + * cpumask_any_housekeeping() - Choose any CPU in @mask, preferring those that + * aren't marked nohz_full + * @mask: The mask to pick a CPU from. + * @exclude_cpu:The CPU to avoid picking. + * + * Returns a CPU from @mask, but not @exclude_cpu. If there are housekeeping + * CPUs that don't use nohz_full, these are preferred. Pass + * RESCTRL_PICK_ANY_CPU to avoid excluding any CPUs. + * + * When a CPU is excluded, returns >= nr_cpu_ids if no CPUs are available. + */ +static inline unsigned int +cpumask_any_housekeeping(const struct cpumask *mask, int exclude_cpu) +{ + unsigned int cpu; + + /* Try to find a CPU that isn't nohz_full to use in preference */ + if (tick_nohz_full_enabled()) { + cpu = cpumask_any_andnot_but(mask, tick_nohz_full_mask, exclude_cpu); + if (cpu < nr_cpu_ids) + return cpu; + } + + return cpumask_any_but(mask, exclude_cpu); +} + +struct rdt_fs_context { + struct kernfs_fs_context kfc; + bool enable_cdpl2; + bool enable_cdpl3; + bool enable_mba_mbps; + bool enable_debug; +}; + +static inline struct rdt_fs_context *rdt_fc2context(struct fs_context *fc) +{ + struct kernfs_fs_context *kfc = fc->fs_private; + + return container_of(kfc, struct rdt_fs_context, kfc); +} + +/** + * struct mon_evt - Entry in the event list of a resource + * @evtid: event id + * @name: name of the event + * @configurable: true if the event is configurable + * @list: entry in &rdt_resource->evt_list + */ +struct mon_evt { + enum resctrl_event_id evtid; + char *name; + bool configurable; + struct list_head list; +}; + +/** + * struct mon_data - Monitoring details for each event file. + * @list: Member of the global @mon_data_kn_priv_list list. + * @rid: Resource id associated with the event file. + * @evtid: Event id associated with the event file. + * @sum: Set when event must be summed across multiple + * domains. + * @domid: When @sum is zero this is the domain to which + * the event file belongs. When @sum is one this + * is the id of the L3 cache that all domains to be + * summed share. + * + * Pointed to by the kernfs kn->priv field of monitoring event files. + * Readers and writers must hold rdtgroup_mutex. + */ +struct mon_data { + struct list_head list; + enum resctrl_res_level rid; + enum resctrl_event_id evtid; + int domid; + bool sum; +}; + +/** + * struct rmid_read - Data passed across smp_call*() to read event count. + * @rgrp: Resource group for which the counter is being read. If it is a parent + * resource group then its event count is summed with the count from all + * its child resource groups. 
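+ *	Summing over the child resource groups is done by mon_event_count().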
+ * @r: Resource describing the properties of the event being read. + * @d: Domain that the counter should be read from. If NULL then sum all + * domains in @r sharing L3 @ci.id + * @evtid: Which monitor event to read. + * @first: Initialize MBM counter when true. + * @ci: Cacheinfo for L3. Only set when @d is NULL. Used when summing domains. + * @err: Error encountered when reading counter. + * @val: Returned value of event counter. If @rgrp is a parent resource group, + * @val includes the sum of event counts from its child resource groups. + * If @d is NULL, @val includes the sum of all domains in @r sharing @ci.id, + * (summed across child resource groups if @rgrp is a parent resource group). + * @arch_mon_ctx: Hardware monitor allocated for this read request (MPAM only). + */ +struct rmid_read { + struct rdtgroup *rgrp; + struct rdt_resource *r; + struct rdt_mon_domain *d; + enum resctrl_event_id evtid; + bool first; + struct cacheinfo *ci; + int err; + u64 val; + void *arch_mon_ctx; +}; + +extern struct list_head resctrl_schema_all; + +extern bool resctrl_mounted; + +enum rdt_group_type { + RDTCTRL_GROUP = 0, + RDTMON_GROUP, + RDT_NUM_GROUP, +}; + +/** + * enum rdtgrp_mode - Mode of a RDT resource group + * @RDT_MODE_SHAREABLE: This resource group allows sharing of its allocations + * @RDT_MODE_EXCLUSIVE: No sharing of this resource group's allocations allowed + * @RDT_MODE_PSEUDO_LOCKSETUP: Resource group will be used for Pseudo-Locking + * @RDT_MODE_PSEUDO_LOCKED: No sharing of this resource group's allocations + * allowed AND the allocations are Cache Pseudo-Locked + * @RDT_NUM_MODES: Total number of modes + * + * The mode of a resource group enables control over the allowed overlap + * between allocations associated with different resource groups (classes + * of service). User is able to modify the mode of a resource group by + * writing to the "mode" resctrl file associated with the resource group. + * + * The "shareable", "exclusive", and "pseudo-locksetup" modes are set by + * writing the appropriate text to the "mode" file. A resource group enters + * "pseudo-locked" mode after the schemata is written while the resource + * group is in "pseudo-locksetup" mode. + */ +enum rdtgrp_mode { + RDT_MODE_SHAREABLE = 0, + RDT_MODE_EXCLUSIVE, + RDT_MODE_PSEUDO_LOCKSETUP, + RDT_MODE_PSEUDO_LOCKED, + + /* Must be last */ + RDT_NUM_MODES, +}; + +/** + * struct mongroup - store mon group's data in resctrl fs. + * @mon_data_kn: kernfs node for the mon_data directory + * @parent: parent rdtgrp + * @crdtgrp_list: child rdtgroup node list + * @rmid: rmid for this rdtgroup + */ +struct mongroup { + struct kernfs_node *mon_data_kn; + struct rdtgroup *parent; + struct list_head crdtgrp_list; + u32 rmid; +}; + +/** + * struct rdtgroup - store rdtgroup's data in resctrl file system. 
+ * @kn: kernfs node + * @rdtgroup_list: linked list for all rdtgroups + * @closid: closid for this rdtgroup + * @cpu_mask: CPUs assigned to this rdtgroup + * @flags: status bits + * @waitcount: how many cpus expect to find this + * group when they acquire rdtgroup_mutex + * @type: indicates type of this rdtgroup - either + * monitor only or ctrl_mon group + * @mon: mongroup related data + * @mode: mode of resource group + * @mba_mbps_event: input monitoring event id when mba_sc is enabled + * @plr: pseudo-locked region + */ +struct rdtgroup { + struct kernfs_node *kn; + struct list_head rdtgroup_list; + u32 closid; + struct cpumask cpu_mask; + int flags; + atomic_t waitcount; + enum rdt_group_type type; + struct mongroup mon; + enum rdtgrp_mode mode; + enum resctrl_event_id mba_mbps_event; + struct pseudo_lock_region *plr; +}; + +/* rdtgroup.flags */ +#define RDT_DELETED 1 + +/* rftype.flags */ +#define RFTYPE_FLAGS_CPUS_LIST 1 + +/* + * Define the file type flags for base and info directories. + */ +#define RFTYPE_INFO BIT(0) + +#define RFTYPE_BASE BIT(1) + +#define RFTYPE_CTRL BIT(4) + +#define RFTYPE_MON BIT(5) + +#define RFTYPE_TOP BIT(6) + +#define RFTYPE_RES_CACHE BIT(8) + +#define RFTYPE_RES_MB BIT(9) + +#define RFTYPE_DEBUG BIT(10) + +#define RFTYPE_CTRL_INFO (RFTYPE_INFO | RFTYPE_CTRL) + +#define RFTYPE_MON_INFO (RFTYPE_INFO | RFTYPE_MON) + +#define RFTYPE_TOP_INFO (RFTYPE_INFO | RFTYPE_TOP) + +#define RFTYPE_CTRL_BASE (RFTYPE_BASE | RFTYPE_CTRL) + +#define RFTYPE_MON_BASE (RFTYPE_BASE | RFTYPE_MON) + +/* List of all resource groups */ +extern struct list_head rdt_all_groups; + +extern int max_name_width; + +/** + * struct rftype - describe each file in the resctrl file system + * @name: File name + * @mode: Access mode + * @kf_ops: File operations + * @flags: File specific RFTYPE_FLAGS_* flags + * @fflags: File specific RFTYPE_* flags + * @seq_show: Show content of the file + * @write: Write to the file + */ +struct rftype { + char *name; + umode_t mode; + const struct kernfs_ops *kf_ops; + unsigned long flags; + unsigned long fflags; + + int (*seq_show)(struct kernfs_open_file *of, + struct seq_file *sf, void *v); + /* + * write() is the generic write callback which maps directly to + * kernfs write operation and overrides all other operations. + * Maximum write size is determined by ->max_write_len. 
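+ * For resctrl's "schemata" file, for example, this is
+ * rdtgroup_schemata_write().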
+ */ + ssize_t (*write)(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); +}; + +/** + * struct mbm_state - status for each MBM counter in each domain + * @prev_bw_bytes: Previous bytes value read for bandwidth calculation + * @prev_bw: The most recent bandwidth in MBps + */ +struct mbm_state { + u64 prev_bw_bytes; + u32 prev_bw; +}; + +extern struct mutex rdtgroup_mutex; + +static inline const char *rdt_kn_name(const struct kernfs_node *kn) +{ + return rcu_dereference_check(kn->name, lockdep_is_held(&rdtgroup_mutex)); +} + +extern struct rdtgroup rdtgroup_default; + +extern struct dentry *debugfs_resctrl; + +extern enum resctrl_event_id mba_mbps_default_event; + +void rdt_last_cmd_clear(void); + +void rdt_last_cmd_puts(const char *s); + +__printf(1, 2) +void rdt_last_cmd_printf(const char *fmt, ...); + +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn); + +void rdtgroup_kn_unlock(struct kernfs_node *kn); + +int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name); + +int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, + umode_t mask); + +ssize_t rdtgroup_schemata_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); + +int rdtgroup_schemata_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); + +ssize_t rdtgroup_mba_mbps_event_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off); + +int rdtgroup_mba_mbps_event_show(struct kernfs_open_file *of, + struct seq_file *s, void *v); + +bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, + unsigned long cbm, int closid, bool exclusive); + +unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, struct rdt_ctrl_domain *d, + unsigned long cbm); + +enum rdtgrp_mode rdtgroup_mode_by_closid(int closid); + +int rdtgroup_tasks_assigned(struct rdtgroup *r); + +int closids_supported(void); + +void closid_free(int closid); + +int alloc_rmid(u32 closid); + +void free_rmid(u32 closid, u32 rmid); + +void resctrl_mon_resource_exit(void); + +void mon_event_count(void *info); + +int rdtgroup_mondata_show(struct seq_file *m, void *arg); + +void mon_event_read(struct rmid_read *rr, struct rdt_resource *r, + struct rdt_mon_domain *d, struct rdtgroup *rdtgrp, + cpumask_t *cpumask, int evtid, int first); + +int resctrl_mon_resource_init(void); + +void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, + unsigned long delay_ms, + int exclude_cpu); + +void mbm_handle_overflow(struct work_struct *work); + +bool is_mba_sc(struct rdt_resource *r); + +void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, + int exclude_cpu); + +void cqm_handle_limbo(struct work_struct *work); + +bool has_busy_rmid(struct rdt_mon_domain *d); + +void __check_limbo(struct rdt_mon_domain *d, bool force_free); + +void resctrl_file_fflags_init(const char *config, unsigned long fflags); + +void rdt_staged_configs_clear(void); + +bool closid_allocated(unsigned int closid); + +int resctrl_find_cleanest_closid(void); + +#ifdef CONFIG_RESCTRL_FS_PSEUDO_LOCK +int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp); + +int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp); + +bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm); + +bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d); + +int rdt_pseudo_lock_init(void); + +void rdt_pseudo_lock_release(void); + +int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp); + +void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp); + 
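+/*
+ * Stubs for the !CONFIG_RESCTRL_FS_PSEUDO_LOCK case below: the locksetup
+ * enter/exit and create helpers fail with -EOPNOTSUPP and the rest are
+ * no-ops, so callers need no #ifdefs of their own.
+ */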
+#else +static inline int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) +{ + return -EOPNOTSUPP; +} + +static inline int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) +{ + return -EOPNOTSUPP; +} + +static inline bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm) +{ + return false; +} + +static inline bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) +{ + return false; +} + +static inline int rdt_pseudo_lock_init(void) { return 0; } +static inline void rdt_pseudo_lock_release(void) { } +static inline int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) +{ + return -EOPNOTSUPP; +} + +static inline void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) { } +#endif /* CONFIG_RESCTRL_FS_PSEUDO_LOCK */ + +#endif /* _FS_RESCTRL_INTERNAL_H */ diff --git a/fs/resctrl/monitor.c b/fs/resctrl/monitor.c new file mode 100644 index 000000000000..bde2801289d3 --- /dev/null +++ b/fs/resctrl/monitor.c @@ -0,0 +1,929 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * Resource Director Technology(RDT) + * - Monitoring code + * + * Copyright (C) 2017 Intel Corporation + * + * Author: + * Vikas Shivappa <vikas.shivappa@intel.com> + * + * This replaces the cqm.c based on perf but we reuse a lot of + * code and datastructures originally from Peter Zijlstra and Matt Fleming. + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual June 2016, volume 3, section 17.17. + */ + +#define pr_fmt(fmt) "resctrl: " fmt + +#include <linux/cpu.h> +#include <linux/resctrl.h> +#include <linux/sizes.h> +#include <linux/slab.h> + +#include "internal.h" + +#define CREATE_TRACE_POINTS + +#include "monitor_trace.h" + +/** + * struct rmid_entry - dirty tracking for all RMID. + * @closid: The CLOSID for this entry. + * @rmid: The RMID for this entry. + * @busy: The number of domains with cached data using this RMID. + * @list: Member of the rmid_free_lru list when busy == 0. + * + * Depending on the architecture the correct monitor is accessed using + * both @closid and @rmid, or @rmid only. + * + * Take the rdtgroup_mutex when accessing. + */ +struct rmid_entry { + u32 closid; + u32 rmid; + int busy; + struct list_head list; +}; + +/* + * @rmid_free_lru - A least recently used list of free RMIDs + * These RMIDs are guaranteed to have an occupancy less than the + * threshold occupancy + */ +static LIST_HEAD(rmid_free_lru); + +/* + * @closid_num_dirty_rmid The number of dirty RMID each CLOSID has. + * Only allocated when CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID is defined. + * Indexed by CLOSID. Protected by rdtgroup_mutex. + */ +static u32 *closid_num_dirty_rmid; + +/* + * @rmid_limbo_count - count of currently unused but (potentially) + * dirty RMIDs. + * This counts RMIDs that no one is currently using but that + * may have a occupancy value > resctrl_rmid_realloc_threshold. User can + * change the threshold occupancy value. + */ +static unsigned int rmid_limbo_count; + +/* + * @rmid_entry - The entry in the limbo and free lists. + */ +static struct rmid_entry *rmid_ptrs; + +/* + * This is the threshold cache occupancy in bytes at which we will consider an + * RMID available for re-allocation. + */ +unsigned int resctrl_rmid_realloc_threshold; + +/* + * This is the maximum value for the reallocation threshold, in bytes. + */ +unsigned int resctrl_rmid_realloc_limit; + +/* + * x86 and arm64 differ in their handling of monitoring. 
+ * x86's RMID are independent numbers, there is only one source of traffic + * with an RMID value of '1'. + * arm64's PMG extends the PARTID/CLOSID space, there are multiple sources of + * traffic with a PMG value of '1', one for each CLOSID, meaning the RMID + * value is no longer unique. + * To account for this, resctrl uses an index. On x86 this is just the RMID, + * on arm64 it encodes the CLOSID and RMID. This gives a unique number. + * + * The domain's rmid_busy_llc and rmid_ptrs[] are sized by index. The arch code + * must accept an attempt to read every index. + */ +static inline struct rmid_entry *__rmid_entry(u32 idx) +{ + struct rmid_entry *entry; + u32 closid, rmid; + + entry = &rmid_ptrs[idx]; + resctrl_arch_rmid_idx_decode(idx, &closid, &rmid); + + WARN_ON_ONCE(entry->closid != closid); + WARN_ON_ONCE(entry->rmid != rmid); + + return entry; +} + +static void limbo_release_entry(struct rmid_entry *entry) +{ + lockdep_assert_held(&rdtgroup_mutex); + + rmid_limbo_count--; + list_add_tail(&entry->list, &rmid_free_lru); + + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + closid_num_dirty_rmid[entry->closid]--; +} + +/* + * Check the RMIDs that are marked as busy for this domain. If the + * reported LLC occupancy is below the threshold clear the busy bit and + * decrement the count. If the busy count gets to zero on an RMID, we + * free the RMID + */ +void __check_limbo(struct rdt_mon_domain *d, bool force_free) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + struct rmid_entry *entry; + u32 idx, cur_idx = 1; + void *arch_mon_ctx; + bool rmid_dirty; + u64 val = 0; + + arch_mon_ctx = resctrl_arch_mon_ctx_alloc(r, QOS_L3_OCCUP_EVENT_ID); + if (IS_ERR(arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(arch_mon_ctx)); + return; + } + + /* + * Skip RMID 0 and start from RMID 1 and check all the RMIDs that + * are marked as busy for occupancy < threshold. If the occupancy + * is less than the threshold decrement the busy counter of the + * RMID and move it to the free list when the counter reaches 0. + */ + for (;;) { + idx = find_next_bit(d->rmid_busy_llc, idx_limit, cur_idx); + if (idx >= idx_limit) + break; + + entry = __rmid_entry(idx); + if (resctrl_arch_rmid_read(r, d, entry->closid, entry->rmid, + QOS_L3_OCCUP_EVENT_ID, &val, + arch_mon_ctx)) { + rmid_dirty = true; + } else { + rmid_dirty = (val >= resctrl_rmid_realloc_threshold); + + /* + * x86's CLOSID and RMID are independent numbers, so the entry's + * CLOSID is an empty CLOSID (X86_RESCTRL_EMPTY_CLOSID). On Arm the + * RMID (PMG) extends the CLOSID (PARTID) space with bits that aren't + * used to select the configuration. It is thus necessary to track both + * CLOSID and RMID because there may be dependencies between them + * on some architectures. 
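+ * That is why the tracepoint below reports both the CLOSID and the
+ * RMID rather than the combined index.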
+ */ + trace_mon_llc_occupancy_limbo(entry->closid, entry->rmid, d->hdr.id, val); + } + + if (force_free || !rmid_dirty) { + clear_bit(idx, d->rmid_busy_llc); + if (!--entry->busy) + limbo_release_entry(entry); + } + cur_idx = idx + 1; + } + + resctrl_arch_mon_ctx_free(r, QOS_L3_OCCUP_EVENT_ID, arch_mon_ctx); +} + +bool has_busy_rmid(struct rdt_mon_domain *d) +{ + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + + return find_first_bit(d->rmid_busy_llc, idx_limit) != idx_limit; +} + +static struct rmid_entry *resctrl_find_free_rmid(u32 closid) +{ + struct rmid_entry *itr; + u32 itr_idx, cmp_idx; + + if (list_empty(&rmid_free_lru)) + return rmid_limbo_count ? ERR_PTR(-EBUSY) : ERR_PTR(-ENOSPC); + + list_for_each_entry(itr, &rmid_free_lru, list) { + /* + * Get the index of this free RMID, and the index it would need + * to be if it were used with this CLOSID. + * If the CLOSID is irrelevant on this architecture, the two + * index values are always the same on every entry and thus the + * very first entry will be returned. + */ + itr_idx = resctrl_arch_rmid_idx_encode(itr->closid, itr->rmid); + cmp_idx = resctrl_arch_rmid_idx_encode(closid, itr->rmid); + + if (itr_idx == cmp_idx) + return itr; + } + + return ERR_PTR(-ENOSPC); +} + +/** + * resctrl_find_cleanest_closid() - Find a CLOSID where all the associated + * RMID are clean, or the CLOSID that has + * the most clean RMID. + * + * MPAM's equivalent of RMID are per-CLOSID, meaning a freshly allocated CLOSID + * may not be able to allocate clean RMID. To avoid this the allocator will + * choose the CLOSID with the most clean RMID. + * + * When the CLOSID and RMID are independent numbers, the first free CLOSID will + * be returned. + */ +int resctrl_find_cleanest_closid(void) +{ + u32 cleanest_closid = ~0; + int i = 0; + + lockdep_assert_held(&rdtgroup_mutex); + + if (!IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + return -EIO; + + for (i = 0; i < closids_supported(); i++) { + int num_dirty; + + if (closid_allocated(i)) + continue; + + num_dirty = closid_num_dirty_rmid[i]; + if (num_dirty == 0) + return i; + + if (cleanest_closid == ~0) + cleanest_closid = i; + + if (num_dirty < closid_num_dirty_rmid[cleanest_closid]) + cleanest_closid = i; + } + + if (cleanest_closid == ~0) + return -ENOSPC; + + return cleanest_closid; +} + +/* + * For MPAM the RMID value is not unique, and has to be considered with + * the CLOSID. The (CLOSID, RMID) pair is allocated on all domains, which + * allows all domains to be managed by a single free list. + * Each domain also has a rmid_busy_llc to reduce the work of the limbo handler. + */ +int alloc_rmid(u32 closid) +{ + struct rmid_entry *entry; + + lockdep_assert_held(&rdtgroup_mutex); + + entry = resctrl_find_free_rmid(closid); + if (IS_ERR(entry)) + return PTR_ERR(entry); + + list_del(&entry->list); + return entry->rmid; +} + +static void add_rmid_to_limbo(struct rmid_entry *entry) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + struct rdt_mon_domain *d; + u32 idx; + + lockdep_assert_held(&rdtgroup_mutex); + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + idx = resctrl_arch_rmid_idx_encode(entry->closid, entry->rmid); + + entry->busy = 0; + list_for_each_entry(d, &r->mon_domains, hdr.list) { + /* + * For the first limbo RMID in the domain, + * setup up the limbo worker. 
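+ * Later limbo RMIDs only set their bit in rmid_busy_llc; the worker
+ * that is already scheduled will check them on its next pass.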
+ */ + if (!has_busy_rmid(d)) + cqm_setup_limbo_handler(d, CQM_LIMBOCHECK_INTERVAL, + RESCTRL_PICK_ANY_CPU); + set_bit(idx, d->rmid_busy_llc); + entry->busy++; + } + + rmid_limbo_count++; + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) + closid_num_dirty_rmid[entry->closid]++; +} + +void free_rmid(u32 closid, u32 rmid) +{ + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); + struct rmid_entry *entry; + + lockdep_assert_held(&rdtgroup_mutex); + + /* + * Do not allow the default rmid to be free'd. Comparing by index + * allows architectures that ignore the closid parameter to avoid an + * unnecessary check. + */ + if (!resctrl_arch_mon_capable() || + idx == resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, + RESCTRL_RESERVED_RMID)) + return; + + entry = __rmid_entry(idx); + + if (resctrl_arch_is_llc_occupancy_enabled()) + add_rmid_to_limbo(entry); + else + list_add_tail(&entry->list, &rmid_free_lru); +} + +static struct mbm_state *get_mbm_state(struct rdt_mon_domain *d, u32 closid, + u32 rmid, enum resctrl_event_id evtid) +{ + u32 idx = resctrl_arch_rmid_idx_encode(closid, rmid); + + switch (evtid) { + case QOS_L3_MBM_TOTAL_EVENT_ID: + return &d->mbm_total[idx]; + case QOS_L3_MBM_LOCAL_EVENT_ID: + return &d->mbm_local[idx]; + default: + return NULL; + } +} + +static int __mon_event_count(u32 closid, u32 rmid, struct rmid_read *rr) +{ + int cpu = smp_processor_id(); + struct rdt_mon_domain *d; + struct mbm_state *m; + int err, ret; + u64 tval = 0; + + if (rr->first) { + resctrl_arch_reset_rmid(rr->r, rr->d, closid, rmid, rr->evtid); + m = get_mbm_state(rr->d, closid, rmid, rr->evtid); + if (m) + memset(m, 0, sizeof(struct mbm_state)); + return 0; + } + + if (rr->d) { + /* Reading a single domain, must be on a CPU in that domain. */ + if (!cpumask_test_cpu(cpu, &rr->d->hdr.cpu_mask)) + return -EINVAL; + rr->err = resctrl_arch_rmid_read(rr->r, rr->d, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); + if (rr->err) + return rr->err; + + rr->val += tval; + + return 0; + } + + /* Summing domains that share a cache, must be on a CPU for that cache. */ + if (!cpumask_test_cpu(cpu, &rr->ci->shared_cpu_map)) + return -EINVAL; + + /* + * Legacy files must report the sum of an event across all + * domains that share the same L3 cache instance. + * Report success if a read from any domain succeeds, -EINVAL + * (translated to "Unavailable" for user space) if reading from + * all domains fail for any reason. + */ + ret = -EINVAL; + list_for_each_entry(d, &rr->r->mon_domains, hdr.list) { + if (d->ci->id != rr->ci->id) + continue; + err = resctrl_arch_rmid_read(rr->r, d, closid, rmid, + rr->evtid, &tval, rr->arch_mon_ctx); + if (!err) { + rr->val += tval; + ret = 0; + } + } + + if (ret) + rr->err = ret; + + return ret; +} + +/* + * mbm_bw_count() - Update bw count from values previously read by + * __mon_event_count(). + * @closid: The closid used to identify the cached mbm_state. + * @rmid: The rmid used to identify the cached mbm_state. + * @rr: The struct rmid_read populated by __mon_event_count(). + * + * Supporting function to calculate the memory bandwidth + * and delta bandwidth in MBps. The chunks value previously read by + * __mon_event_count() is compared with the chunks value from the previous + * invocation. This must be called once per second to maintain values in MBps. 
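+ *
+ * For example, if the previous read left prev_bw_bytes at 3 GiB and the
+ * current read returns 5 GiB, the 2 GiB delta over the one second interval
+ * gives prev_bw = 2048 MBps.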
+ */ +static void mbm_bw_count(u32 closid, u32 rmid, struct rmid_read *rr) +{ + u64 cur_bw, bytes, cur_bytes; + struct mbm_state *m; + + m = get_mbm_state(rr->d, closid, rmid, rr->evtid); + if (WARN_ON_ONCE(!m)) + return; + + cur_bytes = rr->val; + bytes = cur_bytes - m->prev_bw_bytes; + m->prev_bw_bytes = cur_bytes; + + cur_bw = bytes / SZ_1M; + + m->prev_bw = cur_bw; +} + +/* + * This is scheduled by mon_event_read() to read the CQM/MBM counters + * on a domain. + */ +void mon_event_count(void *info) +{ + struct rdtgroup *rdtgrp, *entry; + struct rmid_read *rr = info; + struct list_head *head; + int ret; + + rdtgrp = rr->rgrp; + + ret = __mon_event_count(rdtgrp->closid, rdtgrp->mon.rmid, rr); + + /* + * For Ctrl groups read data from child monitor groups and + * add them together. Count events which are read successfully. + * Discard the rmid_read's reporting errors. + */ + head = &rdtgrp->mon.crdtgrp_list; + + if (rdtgrp->type == RDTCTRL_GROUP) { + list_for_each_entry(entry, head, mon.crdtgrp_list) { + if (__mon_event_count(entry->closid, entry->mon.rmid, + rr) == 0) + ret = 0; + } + } + + /* + * __mon_event_count() calls for newly created monitor groups may + * report -EINVAL/Unavailable if the monitor hasn't seen any traffic. + * Discard error if any of the monitor event reads succeeded. + */ + if (ret == 0) + rr->err = 0; +} + +static struct rdt_ctrl_domain *get_ctrl_domain_from_cpu(int cpu, + struct rdt_resource *r) +{ + struct rdt_ctrl_domain *d; + + lockdep_assert_cpus_held(); + + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + /* Find the domain that contains this CPU */ + if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) + return d; + } + + return NULL; +} + +/* + * Feedback loop for MBA software controller (mba_sc) + * + * mba_sc is a feedback loop where we periodically read MBM counters and + * adjust the bandwidth percentage values via the IA32_MBA_THRTL_MSRs so + * that: + * + * current bandwidth(cur_bw) < user specified bandwidth(user_bw) + * + * This uses the MBM counters to measure the bandwidth and MBA throttle + * MSRs to control the bandwidth for a particular rdtgrp. It builds on the + * fact that resctrl rdtgroups have both monitoring and control. + * + * The frequency of the checks is 1s and we just tag along the MBM overflow + * timer. Having 1s interval makes the calculation of bandwidth simpler. + * + * Although MBA's goal is to restrict the bandwidth to a maximum, there may + * be a need to increase the bandwidth to avoid unnecessarily restricting + * the L2 <-> L3 traffic. + * + * Since MBA controls the L2 external bandwidth where as MBM measures the + * L3 external bandwidth the following sequence could lead to such a + * situation. + * + * Consider an rdtgroup which had high L3 <-> memory traffic in initial + * phases -> mba_sc kicks in and reduced bandwidth percentage values -> but + * after some time rdtgroup has mostly L2 <-> L3 traffic. + * + * In this case we may restrict the rdtgroup's L2 <-> L3 traffic as its + * throttle MSRs already have low percentage values. To avoid + * unnecessarily restricting such rdtgroups, we also increase the bandwidth. 
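+ *
+ * This function runs from the MBM overflow worker (mbm_handle_overflow()),
+ * so the adjustment below happens once per second per domain.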
+ */ +static void update_mba_bw(struct rdtgroup *rgrp, struct rdt_mon_domain *dom_mbm) +{ + u32 closid, rmid, cur_msr_val, new_msr_val; + struct mbm_state *pmbm_data, *cmbm_data; + struct rdt_ctrl_domain *dom_mba; + enum resctrl_event_id evt_id; + struct rdt_resource *r_mba; + struct list_head *head; + struct rdtgroup *entry; + u32 cur_bw, user_bw; + + r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); + evt_id = rgrp->mba_mbps_event; + + closid = rgrp->closid; + rmid = rgrp->mon.rmid; + pmbm_data = get_mbm_state(dom_mbm, closid, rmid, evt_id); + if (WARN_ON_ONCE(!pmbm_data)) + return; + + dom_mba = get_ctrl_domain_from_cpu(smp_processor_id(), r_mba); + if (!dom_mba) { + pr_warn_once("Failure to get domain for MBA update\n"); + return; + } + + cur_bw = pmbm_data->prev_bw; + user_bw = dom_mba->mbps_val[closid]; + + /* MBA resource doesn't support CDP */ + cur_msr_val = resctrl_arch_get_config(r_mba, dom_mba, closid, CDP_NONE); + + /* + * For Ctrl groups read data from child monitor groups. + */ + head = &rgrp->mon.crdtgrp_list; + list_for_each_entry(entry, head, mon.crdtgrp_list) { + cmbm_data = get_mbm_state(dom_mbm, entry->closid, entry->mon.rmid, evt_id); + if (WARN_ON_ONCE(!cmbm_data)) + return; + cur_bw += cmbm_data->prev_bw; + } + + /* + * Scale up/down the bandwidth linearly for the ctrl group. The + * bandwidth step is the bandwidth granularity specified by the + * hardware. + * Always increase throttling if current bandwidth is above the + * target set by user. + * But avoid thrashing up and down on every poll by checking + * whether a decrease in throttling is likely to push the group + * back over target. E.g. if currently throttling to 30% of bandwidth + * on a system with 10% granularity steps, check whether moving to + * 40% would go past the limit by multiplying current bandwidth by + * "(30 + 10) / 30". + */ + if (cur_msr_val > r_mba->membw.min_bw && user_bw < cur_bw) { + new_msr_val = cur_msr_val - r_mba->membw.bw_gran; + } else if (cur_msr_val < MAX_MBA_BW && + (user_bw > (cur_bw * (cur_msr_val + r_mba->membw.min_bw) / cur_msr_val))) { + new_msr_val = cur_msr_val + r_mba->membw.bw_gran; + } else { + return; + } + + resctrl_arch_update_one(r_mba, dom_mba, closid, CDP_NONE, new_msr_val); +} + +static void mbm_update_one_event(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid, enum resctrl_event_id evtid) +{ + struct rmid_read rr = {0}; + + rr.r = r; + rr.d = d; + rr.evtid = evtid; + rr.arch_mon_ctx = resctrl_arch_mon_ctx_alloc(rr.r, rr.evtid); + if (IS_ERR(rr.arch_mon_ctx)) { + pr_warn_ratelimited("Failed to allocate monitor context: %ld", + PTR_ERR(rr.arch_mon_ctx)); + return; + } + + __mon_event_count(closid, rmid, &rr); + + /* + * If the software controller is enabled, compute the + * bandwidth for this event id. + */ + if (is_mba_sc(NULL)) + mbm_bw_count(closid, rmid, &rr); + + resctrl_arch_mon_ctx_free(rr.r, rr.evtid, rr.arch_mon_ctx); +} + +static void mbm_update(struct rdt_resource *r, struct rdt_mon_domain *d, + u32 closid, u32 rmid) +{ + /* + * This is protected from concurrent reads from user as both + * the user and overflow handler hold the global mutex. + */ + if (resctrl_arch_is_mbm_total_enabled()) + mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_TOTAL_EVENT_ID); + + if (resctrl_arch_is_mbm_local_enabled()) + mbm_update_one_event(r, d, closid, rmid, QOS_L3_MBM_LOCAL_EVENT_ID); +} + +/* + * Handler to scan the limbo list and move the RMIDs + * to free list whose occupancy < threshold_occupancy. 
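+ * Re-arms itself via schedule_delayed_work_on() for as long as any RMID
+ * in the domain remains busy.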
+ */ +void cqm_handle_limbo(struct work_struct *work) +{ + unsigned long delay = msecs_to_jiffies(CQM_LIMBOCHECK_INTERVAL); + struct rdt_mon_domain *d; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + d = container_of(work, struct rdt_mon_domain, cqm_limbo.work); + + __check_limbo(d, false); + + if (has_busy_rmid(d)) { + d->cqm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, + RESCTRL_PICK_ANY_CPU); + schedule_delayed_work_on(d->cqm_work_cpu, &d->cqm_limbo, + delay); + } + + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); +} + +/** + * cqm_setup_limbo_handler() - Schedule the limbo handler to run for this + * domain. + * @dom: The domain the limbo handler should run for. + * @delay_ms: How far in the future the handler should run. + * @exclude_cpu: Which CPU the handler should not run on, + * RESCTRL_PICK_ANY_CPU to pick any CPU. + */ +void cqm_setup_limbo_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, + int exclude_cpu) +{ + unsigned long delay = msecs_to_jiffies(delay_ms); + int cpu; + + cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); + dom->cqm_work_cpu = cpu; + + if (cpu < nr_cpu_ids) + schedule_delayed_work_on(cpu, &dom->cqm_limbo, delay); +} + +void mbm_handle_overflow(struct work_struct *work) +{ + unsigned long delay = msecs_to_jiffies(MBM_OVERFLOW_INTERVAL); + struct rdtgroup *prgrp, *crgrp; + struct rdt_mon_domain *d; + struct list_head *head; + struct rdt_resource *r; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + /* + * If the filesystem has been unmounted this work no longer needs to + * run. + */ + if (!resctrl_mounted || !resctrl_arch_mon_capable()) + goto out_unlock; + + r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + d = container_of(work, struct rdt_mon_domain, mbm_over.work); + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + mbm_update(r, d, prgrp->closid, prgrp->mon.rmid); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) + mbm_update(r, d, crgrp->closid, crgrp->mon.rmid); + + if (is_mba_sc(NULL)) + update_mba_bw(prgrp, d); + } + + /* + * Re-check for housekeeping CPUs. This allows the overflow handler to + * move off a nohz_full CPU quickly. + */ + d->mbm_work_cpu = cpumask_any_housekeeping(&d->hdr.cpu_mask, + RESCTRL_PICK_ANY_CPU); + schedule_delayed_work_on(d->mbm_work_cpu, &d->mbm_over, delay); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); +} + +/** + * mbm_setup_overflow_handler() - Schedule the overflow handler to run for this + * domain. + * @dom: The domain the overflow handler should run for. + * @delay_ms: How far in the future the handler should run. + * @exclude_cpu: Which CPU the handler should not run on, + * RESCTRL_PICK_ANY_CPU to pick any CPU. + */ +void mbm_setup_overflow_handler(struct rdt_mon_domain *dom, unsigned long delay_ms, + int exclude_cpu) +{ + unsigned long delay = msecs_to_jiffies(delay_ms); + int cpu; + + /* + * When a domain comes online there is no guarantee the filesystem is + * mounted. If not, there is no need to catch counter overflow. 
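+ * If the filesystem is mounted later, the overflow handler is started
+ * from the mount path instead, once resctrl_mounted is set.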
+ */ + if (!resctrl_mounted || !resctrl_arch_mon_capable()) + return; + cpu = cpumask_any_housekeeping(&dom->hdr.cpu_mask, exclude_cpu); + dom->mbm_work_cpu = cpu; + + if (cpu < nr_cpu_ids) + schedule_delayed_work_on(cpu, &dom->mbm_over, delay); +} + +static int dom_data_init(struct rdt_resource *r) +{ + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + u32 num_closid = resctrl_arch_get_num_closid(r); + struct rmid_entry *entry = NULL; + int err = 0, i; + u32 idx; + + mutex_lock(&rdtgroup_mutex); + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + u32 *tmp; + + /* + * If the architecture hasn't provided a sanitised value here, + * this may result in larger arrays than necessary. Resctrl will + * use a smaller system wide value based on the resources in + * use. + */ + tmp = kcalloc(num_closid, sizeof(*tmp), GFP_KERNEL); + if (!tmp) { + err = -ENOMEM; + goto out_unlock; + } + + closid_num_dirty_rmid = tmp; + } + + rmid_ptrs = kcalloc(idx_limit, sizeof(struct rmid_entry), GFP_KERNEL); + if (!rmid_ptrs) { + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + kfree(closid_num_dirty_rmid); + closid_num_dirty_rmid = NULL; + } + err = -ENOMEM; + goto out_unlock; + } + + for (i = 0; i < idx_limit; i++) { + entry = &rmid_ptrs[i]; + INIT_LIST_HEAD(&entry->list); + + resctrl_arch_rmid_idx_decode(i, &entry->closid, &entry->rmid); + list_add_tail(&entry->list, &rmid_free_lru); + } + + /* + * RESCTRL_RESERVED_CLOSID and RESCTRL_RESERVED_RMID are special and + * are always allocated. These are used for the rdtgroup_default + * control group, which will be setup later in resctrl_init(). + */ + idx = resctrl_arch_rmid_idx_encode(RESCTRL_RESERVED_CLOSID, + RESCTRL_RESERVED_RMID); + entry = __rmid_entry(idx); + list_del(&entry->list); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return err; +} + +static void dom_data_exit(struct rdt_resource *r) +{ + mutex_lock(&rdtgroup_mutex); + + if (!r->mon_capable) + goto out_unlock; + + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID)) { + kfree(closid_num_dirty_rmid); + closid_num_dirty_rmid = NULL; + } + + kfree(rmid_ptrs); + rmid_ptrs = NULL; + +out_unlock: + mutex_unlock(&rdtgroup_mutex); +} + +static struct mon_evt llc_occupancy_event = { + .name = "llc_occupancy", + .evtid = QOS_L3_OCCUP_EVENT_ID, +}; + +static struct mon_evt mbm_total_event = { + .name = "mbm_total_bytes", + .evtid = QOS_L3_MBM_TOTAL_EVENT_ID, +}; + +static struct mon_evt mbm_local_event = { + .name = "mbm_local_bytes", + .evtid = QOS_L3_MBM_LOCAL_EVENT_ID, +}; + +/* + * Initialize the event list for the resource. + * + * Note that MBM events are also part of RDT_RESOURCE_L3 resource + * because as per the SDM the total and local memory bandwidth + * are enumerated as part of L3 monitoring. + */ +static void l3_mon_evt_init(struct rdt_resource *r) +{ + INIT_LIST_HEAD(&r->evt_list); + + if (resctrl_arch_is_llc_occupancy_enabled()) + list_add_tail(&llc_occupancy_event.list, &r->evt_list); + if (resctrl_arch_is_mbm_total_enabled()) + list_add_tail(&mbm_total_event.list, &r->evt_list); + if (resctrl_arch_is_mbm_local_enabled()) + list_add_tail(&mbm_local_event.list, &r->evt_list); +} + +/** + * resctrl_mon_resource_init() - Initialise global monitoring structures. + * + * Allocate and initialise global monitor resources that do not belong to a + * specific domain. i.e. the rmid_ptrs[] used for the limbo and free lists. + * Called once during boot after the struct rdt_resource's have been configured + * but before the filesystem is mounted. 
+ * Resctrl's cpuhp callbacks may be called before this point to bring a domain + * online. + * + * Returns 0 for success, or -ENOMEM. + */ +int resctrl_mon_resource_init(void) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + int ret; + + if (!r->mon_capable) + return 0; + + ret = dom_data_init(r); + if (ret) + return ret; + + l3_mon_evt_init(r); + + if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_TOTAL_EVENT_ID)) { + mbm_total_event.configurable = true; + resctrl_file_fflags_init("mbm_total_bytes_config", + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + } + if (resctrl_arch_is_evt_configurable(QOS_L3_MBM_LOCAL_EVENT_ID)) { + mbm_local_event.configurable = true; + resctrl_file_fflags_init("mbm_local_bytes_config", + RFTYPE_MON_INFO | RFTYPE_RES_CACHE); + } + + if (resctrl_arch_is_mbm_local_enabled()) + mba_mbps_default_event = QOS_L3_MBM_LOCAL_EVENT_ID; + else if (resctrl_arch_is_mbm_total_enabled()) + mba_mbps_default_event = QOS_L3_MBM_TOTAL_EVENT_ID; + + return 0; +} + +void resctrl_mon_resource_exit(void) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + + dom_data_exit(r); +} diff --git a/fs/resctrl/monitor_trace.h b/fs/resctrl/monitor_trace.h new file mode 100644 index 000000000000..fdf49f22576a --- /dev/null +++ b/fs/resctrl/monitor_trace.h @@ -0,0 +1,33 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#undef TRACE_SYSTEM +#define TRACE_SYSTEM resctrl + +#if !defined(_FS_RESCTRL_MONITOR_TRACE_H) || defined(TRACE_HEADER_MULTI_READ) +#define _FS_RESCTRL_MONITOR_TRACE_H + +#include <linux/tracepoint.h> + +TRACE_EVENT(mon_llc_occupancy_limbo, + TP_PROTO(u32 ctrl_hw_id, u32 mon_hw_id, int domain_id, u64 llc_occupancy_bytes), + TP_ARGS(ctrl_hw_id, mon_hw_id, domain_id, llc_occupancy_bytes), + TP_STRUCT__entry(__field(u32, ctrl_hw_id) + __field(u32, mon_hw_id) + __field(int, domain_id) + __field(u64, llc_occupancy_bytes)), + TP_fast_assign(__entry->ctrl_hw_id = ctrl_hw_id; + __entry->mon_hw_id = mon_hw_id; + __entry->domain_id = domain_id; + __entry->llc_occupancy_bytes = llc_occupancy_bytes;), + TP_printk("ctrl_hw_id=%u mon_hw_id=%u domain_id=%d llc_occupancy_bytes=%llu", + __entry->ctrl_hw_id, __entry->mon_hw_id, __entry->domain_id, + __entry->llc_occupancy_bytes) + ); + +#endif /* _FS_RESCTRL_MONITOR_TRACE_H */ + +#undef TRACE_INCLUDE_PATH +#define TRACE_INCLUDE_PATH . + +#define TRACE_INCLUDE_FILE monitor_trace + +#include <trace/define_trace.h> diff --git a/fs/resctrl/pseudo_lock.c b/fs/resctrl/pseudo_lock.c new file mode 100644 index 000000000000..ccc2f9213b4b --- /dev/null +++ b/fs/resctrl/pseudo_lock.c @@ -0,0 +1,1105 @@ +// SPDX-License-Identifier: GPL-2.0 +/* + * Resource Director Technology (RDT) + * + * Pseudo-locking support built on top of Cache Allocation Technology (CAT) + * + * Copyright (C) 2018 Intel Corporation + * + * Author: Reinette Chatre <reinette.chatre@intel.com> + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cacheinfo.h> +#include <linux/cpu.h> +#include <linux/cpumask.h> +#include <linux/debugfs.h> +#include <linux/kthread.h> +#include <linux/mman.h> +#include <linux/pm_qos.h> +#include <linux/resctrl.h> +#include <linux/slab.h> +#include <linux/uaccess.h> + +#include "internal.h" + +/* + * Major number assigned to and shared by all devices exposing + * pseudo-locked regions. 
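+ * Individual regions are told apart by their minor number, allocated
+ * from pseudo_lock_minor_avail below.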
+ */ +static unsigned int pseudo_lock_major; + +static unsigned long pseudo_lock_minor_avail = GENMASK(MINORBITS, 0); + +static char *pseudo_lock_devnode(const struct device *dev, umode_t *mode) +{ + const struct rdtgroup *rdtgrp; + + rdtgrp = dev_get_drvdata(dev); + if (mode) + *mode = 0600; + guard(mutex)(&rdtgroup_mutex); + return kasprintf(GFP_KERNEL, "pseudo_lock/%s", rdt_kn_name(rdtgrp->kn)); +} + +static const struct class pseudo_lock_class = { + .name = "pseudo_lock", + .devnode = pseudo_lock_devnode, +}; + +/** + * pseudo_lock_minor_get - Obtain available minor number + * @minor: Pointer to where new minor number will be stored + * + * A bitmask is used to track available minor numbers. Here the next free + * minor number is marked as unavailable and returned. + * + * Return: 0 on success, <0 on failure. + */ +static int pseudo_lock_minor_get(unsigned int *minor) +{ + unsigned long first_bit; + + first_bit = find_first_bit(&pseudo_lock_minor_avail, MINORBITS); + + if (first_bit == MINORBITS) + return -ENOSPC; + + __clear_bit(first_bit, &pseudo_lock_minor_avail); + *minor = first_bit; + + return 0; +} + +/** + * pseudo_lock_minor_release - Return minor number to available + * @minor: The minor number made available + */ +static void pseudo_lock_minor_release(unsigned int minor) +{ + __set_bit(minor, &pseudo_lock_minor_avail); +} + +/** + * region_find_by_minor - Locate a pseudo-lock region by inode minor number + * @minor: The minor number of the device representing pseudo-locked region + * + * When the character device is accessed we need to determine which + * pseudo-locked region it belongs to. This is done by matching the minor + * number of the device to the pseudo-locked region it belongs. + * + * Minor numbers are assigned at the time a pseudo-locked region is associated + * with a cache instance. + * + * Return: On success return pointer to resource group owning the pseudo-locked + * region, NULL on failure. + */ +static struct rdtgroup *region_find_by_minor(unsigned int minor) +{ + struct rdtgroup *rdtgrp, *rdtgrp_match = NULL; + + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (rdtgrp->plr && rdtgrp->plr->minor == minor) { + rdtgrp_match = rdtgrp; + break; + } + } + return rdtgrp_match; +} + +/** + * struct pseudo_lock_pm_req - A power management QoS request list entry + * @list: Entry within the @pm_reqs list for a pseudo-locked region + * @req: PM QoS request + */ +struct pseudo_lock_pm_req { + struct list_head list; + struct dev_pm_qos_request req; +}; + +static void pseudo_lock_cstates_relax(struct pseudo_lock_region *plr) +{ + struct pseudo_lock_pm_req *pm_req, *next; + + list_for_each_entry_safe(pm_req, next, &plr->pm_reqs, list) { + dev_pm_qos_remove_request(&pm_req->req); + list_del(&pm_req->list); + kfree(pm_req); + } +} + +/** + * pseudo_lock_cstates_constrain - Restrict cores from entering C6 + * @plr: Pseudo-locked region + * + * To prevent the cache from being affected by power management entering + * C6 has to be avoided. This is accomplished by requesting a latency + * requirement lower than lowest C6 exit latency of all supported + * platforms as found in the cpuidle state tables in the intel_idle driver. + * At this time it is possible to do so with a single latency requirement + * for all supported platforms. + * + * Since Goldmont is supported, which is affected by X86_BUG_MONITOR, + * the ACPI latencies need to be considered while keeping in mind that C2 + * may be set to map to deeper sleep states. 
In this case the latency + * requirement needs to prevent entering C2 also. + * + * Return: 0 on success, <0 on failure + */ +static int pseudo_lock_cstates_constrain(struct pseudo_lock_region *plr) +{ + struct pseudo_lock_pm_req *pm_req; + int cpu; + int ret; + + for_each_cpu(cpu, &plr->d->hdr.cpu_mask) { + pm_req = kzalloc(sizeof(*pm_req), GFP_KERNEL); + if (!pm_req) { + rdt_last_cmd_puts("Failure to allocate memory for PM QoS\n"); + ret = -ENOMEM; + goto out_err; + } + ret = dev_pm_qos_add_request(get_cpu_device(cpu), + &pm_req->req, + DEV_PM_QOS_RESUME_LATENCY, + 30); + if (ret < 0) { + rdt_last_cmd_printf("Failed to add latency req CPU%d\n", + cpu); + kfree(pm_req); + ret = -1; + goto out_err; + } + list_add(&pm_req->list, &plr->pm_reqs); + } + + return 0; + +out_err: + pseudo_lock_cstates_relax(plr); + return ret; +} + +/** + * pseudo_lock_region_clear - Reset pseudo-lock region data + * @plr: pseudo-lock region + * + * All content of the pseudo-locked region is reset - any memory allocated + * freed. + * + * Return: void + */ +static void pseudo_lock_region_clear(struct pseudo_lock_region *plr) +{ + plr->size = 0; + plr->line_size = 0; + kfree(plr->kmem); + plr->kmem = NULL; + plr->s = NULL; + if (plr->d) + plr->d->plr = NULL; + plr->d = NULL; + plr->cbm = 0; + plr->debugfs_dir = NULL; +} + +/** + * pseudo_lock_region_init - Initialize pseudo-lock region information + * @plr: pseudo-lock region + * + * Called after user provided a schemata to be pseudo-locked. From the + * schemata the &struct pseudo_lock_region is on entry already initialized + * with the resource, domain, and capacity bitmask. Here the information + * required for pseudo-locking is deduced from this data and &struct + * pseudo_lock_region initialized further. This information includes: + * - size in bytes of the region to be pseudo-locked + * - cache line size to know the stride with which data needs to be accessed + * to be pseudo-locked + * - a cpu associated with the cache instance on which the pseudo-locking + * flow can be executed + * + * Return: 0 on success, <0 on failure. Descriptive error will be written + * to last_cmd_status buffer. + */ +static int pseudo_lock_region_init(struct pseudo_lock_region *plr) +{ + enum resctrl_scope scope = plr->s->res->ctrl_scope; + struct cacheinfo *ci; + int ret; + + if (WARN_ON_ONCE(scope != RESCTRL_L2_CACHE && scope != RESCTRL_L3_CACHE)) + return -ENODEV; + + /* Pick the first cpu we find that is associated with the cache. */ + plr->cpu = cpumask_first(&plr->d->hdr.cpu_mask); + + if (!cpu_online(plr->cpu)) { + rdt_last_cmd_printf("CPU %u associated with cache not online\n", + plr->cpu); + ret = -ENODEV; + goto out_region; + } + + ci = get_cpu_cacheinfo_level(plr->cpu, scope); + if (ci) { + plr->line_size = ci->coherency_line_size; + plr->size = rdtgroup_cbm_to_size(plr->s->res, plr->d, plr->cbm); + return 0; + } + + ret = -1; + rdt_last_cmd_puts("Unable to determine cache line size\n"); +out_region: + pseudo_lock_region_clear(plr); + return ret; +} + +/** + * pseudo_lock_init - Initialize a pseudo-lock region + * @rdtgrp: resource group to which new pseudo-locked region will belong + * + * A pseudo-locked region is associated with a resource group. When this + * association is created the pseudo-locked region is initialized. The + * details of the pseudo-locked region are not known at this time so only + * allocation is done and association established. 
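+ * The backing memory itself is only allocated later, once a schemata is
+ * written and rdtgroup_pseudo_lock_create() calls
+ * pseudo_lock_region_alloc().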
+ * + * Return: 0 on success, <0 on failure + */ +static int pseudo_lock_init(struct rdtgroup *rdtgrp) +{ + struct pseudo_lock_region *plr; + + plr = kzalloc(sizeof(*plr), GFP_KERNEL); + if (!plr) + return -ENOMEM; + + init_waitqueue_head(&plr->lock_thread_wq); + INIT_LIST_HEAD(&plr->pm_reqs); + rdtgrp->plr = plr; + return 0; +} + +/** + * pseudo_lock_region_alloc - Allocate kernel memory that will be pseudo-locked + * @plr: pseudo-lock region + * + * Initialize the details required to set up the pseudo-locked region and + * allocate the contiguous memory that will be pseudo-locked to the cache. + * + * Return: 0 on success, <0 on failure. Descriptive error will be written + * to last_cmd_status buffer. + */ +static int pseudo_lock_region_alloc(struct pseudo_lock_region *plr) +{ + int ret; + + ret = pseudo_lock_region_init(plr); + if (ret < 0) + return ret; + + /* + * We do not yet support contiguous regions larger than + * KMALLOC_MAX_SIZE. + */ + if (plr->size > KMALLOC_MAX_SIZE) { + rdt_last_cmd_puts("Requested region exceeds maximum size\n"); + ret = -E2BIG; + goto out_region; + } + + plr->kmem = kzalloc(plr->size, GFP_KERNEL); + if (!plr->kmem) { + rdt_last_cmd_puts("Unable to allocate memory\n"); + ret = -ENOMEM; + goto out_region; + } + + ret = 0; + goto out; +out_region: + pseudo_lock_region_clear(plr); +out: + return ret; +} + +/** + * pseudo_lock_free - Free a pseudo-locked region + * @rdtgrp: resource group to which pseudo-locked region belonged + * + * The pseudo-locked region's resources have already been released, or not + * yet created at this point. Now it can be freed and disassociated from the + * resource group. + * + * Return: void + */ +static void pseudo_lock_free(struct rdtgroup *rdtgrp) +{ + pseudo_lock_region_clear(rdtgrp->plr); + kfree(rdtgrp->plr); + rdtgrp->plr = NULL; +} + +/** + * rdtgroup_monitor_in_progress - Test if monitoring in progress + * @rdtgrp: resource group being queried + * + * Return: 1 if monitor groups have been created for this resource + * group, 0 otherwise. + */ +static int rdtgroup_monitor_in_progress(struct rdtgroup *rdtgrp) +{ + return !list_empty(&rdtgrp->mon.crdtgrp_list); +} + +/** + * rdtgroup_locksetup_user_restrict - Restrict user access to group + * @rdtgrp: resource group needing access restricted + * + * A resource group used for cache pseudo-locking cannot have cpus or tasks + * assigned to it. This is communicated to the user by restricting access + * to all the files that can be used to make such changes. + * + * Permissions restored with rdtgroup_locksetup_user_restore() + * + * Return: 0 on success, <0 on failure. If a failure occurs during the + * restriction of access an attempt will be made to restore permissions but + * the state of the mode of these files will be uncertain when a failure + * occurs. 
+ */ +static int rdtgroup_locksetup_user_restrict(struct rdtgroup *rdtgrp) +{ + int ret; + + ret = rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); + if (ret) + return ret; + + ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); + if (ret) + goto err_tasks; + + ret = rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); + if (ret) + goto err_cpus; + + if (resctrl_arch_mon_capable()) { + ret = rdtgroup_kn_mode_restrict(rdtgrp, "mon_groups"); + if (ret) + goto err_cpus_list; + } + + ret = 0; + goto out; + +err_cpus_list: + rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); +err_cpus: + rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); +err_tasks: + rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); +out: + return ret; +} + +/** + * rdtgroup_locksetup_user_restore - Restore user access to group + * @rdtgrp: resource group needing access restored + * + * Restore all file access previously removed using + * rdtgroup_locksetup_user_restrict() + * + * Return: 0 on success, <0 on failure. If a failure occurs during the + * restoration of access an attempt will be made to restrict permissions + * again but the state of the mode of these files will be uncertain when + * a failure occurs. + */ +static int rdtgroup_locksetup_user_restore(struct rdtgroup *rdtgrp) +{ + int ret; + + ret = rdtgroup_kn_mode_restore(rdtgrp, "tasks", 0777); + if (ret) + return ret; + + ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0777); + if (ret) + goto err_tasks; + + ret = rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0777); + if (ret) + goto err_cpus; + + if (resctrl_arch_mon_capable()) { + ret = rdtgroup_kn_mode_restore(rdtgrp, "mon_groups", 0777); + if (ret) + goto err_cpus_list; + } + + ret = 0; + goto out; + +err_cpus_list: + rdtgroup_kn_mode_restrict(rdtgrp, "cpus_list"); +err_cpus: + rdtgroup_kn_mode_restrict(rdtgrp, "cpus"); +err_tasks: + rdtgroup_kn_mode_restrict(rdtgrp, "tasks"); +out: + return ret; +} + +/** + * rdtgroup_locksetup_enter - Resource group enters locksetup mode + * @rdtgrp: resource group requested to enter locksetup mode + * + * A resource group enters locksetup mode to reflect that it would be used + * to represent a pseudo-locked region and is in the process of being set + * up to do so. A resource group used for a pseudo-locked region would + * lose the closid associated with it so we cannot allow it to have any + * tasks or cpus assigned nor permit tasks or cpus to be assigned in the + * future. Monitoring of a pseudo-locked region is not allowed either. + * + * The above and more restrictions on a pseudo-locked region are checked + * for and enforced before the resource group enters the locksetup mode. + * + * Returns: 0 if the resource group successfully entered locksetup mode, <0 + * on failure. On failure the last_cmd_status buffer is updated with text to + * communicate details of failure to the user. + */ +int rdtgroup_locksetup_enter(struct rdtgroup *rdtgrp) +{ + int ret; + + /* + * The default resource group can neither be removed nor lose the + * default closid associated with it. + */ + if (rdtgrp == &rdtgroup_default) { + rdt_last_cmd_puts("Cannot pseudo-lock default group\n"); + return -EINVAL; + } + + /* + * Cache Pseudo-locking not supported when CDP is enabled. + * + * Some things to consider if you would like to enable this + * support (using L3 CDP as example): + * - When CDP is enabled two separate resources are exposed, + * L3DATA and L3CODE, but they are actually on the same cache. 
+ * The implication for pseudo-locking is that if a + * pseudo-locked region is created on a domain of one + * resource (eg. L3CODE), then a pseudo-locked region cannot + * be created on that same domain of the other resource + * (eg. L3DATA). This is because the creation of a + * pseudo-locked region involves a call to wbinvd that will + * affect all cache allocations on particular domain. + * - Considering the previous, it may be possible to only + * expose one of the CDP resources to pseudo-locking and + * hide the other. For example, we could consider to only + * expose L3DATA and since the L3 cache is unified it is + * still possible to place instructions there are execute it. + * - If only one region is exposed to pseudo-locking we should + * still keep in mind that availability of a portion of cache + * for pseudo-locking should take into account both resources. + * Similarly, if a pseudo-locked region is created in one + * resource, the portion of cache used by it should be made + * unavailable to all future allocations from both resources. + */ + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3) || + resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) { + rdt_last_cmd_puts("CDP enabled\n"); + return -EINVAL; + } + + /* + * Not knowing the bits to disable prefetching implies that this + * platform does not support Cache Pseudo-Locking. + */ + if (resctrl_arch_get_prefetch_disable_bits() == 0) { + rdt_last_cmd_puts("Pseudo-locking not supported\n"); + return -EINVAL; + } + + if (rdtgroup_monitor_in_progress(rdtgrp)) { + rdt_last_cmd_puts("Monitoring in progress\n"); + return -EINVAL; + } + + if (rdtgroup_tasks_assigned(rdtgrp)) { + rdt_last_cmd_puts("Tasks assigned to resource group\n"); + return -EINVAL; + } + + if (!cpumask_empty(&rdtgrp->cpu_mask)) { + rdt_last_cmd_puts("CPUs assigned to resource group\n"); + return -EINVAL; + } + + if (rdtgroup_locksetup_user_restrict(rdtgrp)) { + rdt_last_cmd_puts("Unable to modify resctrl permissions\n"); + return -EIO; + } + + ret = pseudo_lock_init(rdtgrp); + if (ret) { + rdt_last_cmd_puts("Unable to init pseudo-lock region\n"); + goto out_release; + } + + /* + * If this system is capable of monitoring a rmid would have been + * allocated when the control group was created. This is not needed + * anymore when this group would be used for pseudo-locking. This + * is safe to call on platforms not capable of monitoring. + */ + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); + + ret = 0; + goto out; + +out_release: + rdtgroup_locksetup_user_restore(rdtgrp); +out: + return ret; +} + +/** + * rdtgroup_locksetup_exit - resource group exist locksetup mode + * @rdtgrp: resource group + * + * When a resource group exits locksetup mode the earlier restrictions are + * lifted. + * + * Return: 0 on success, <0 on failure + */ +int rdtgroup_locksetup_exit(struct rdtgroup *rdtgrp) +{ + int ret; + + if (resctrl_arch_mon_capable()) { + ret = alloc_rmid(rdtgrp->closid); + if (ret < 0) { + rdt_last_cmd_puts("Out of RMIDs\n"); + return ret; + } + rdtgrp->mon.rmid = ret; + } + + ret = rdtgroup_locksetup_user_restore(rdtgrp); + if (ret) { + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); + return ret; + } + + pseudo_lock_free(rdtgrp); + return 0; +} + +/** + * rdtgroup_cbm_overlaps_pseudo_locked - Test if CBM or portion is pseudo-locked + * @d: RDT domain + * @cbm: CBM to test + * + * @d represents a cache instance and @cbm a capacity bitmask that is + * considered for it. Determine if @cbm overlaps with any existing + * pseudo-locked region on @d. 
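+ *
+ * For example, if the pseudo-locked region on @d uses CBM 0x0f, a @cbm of
+ * 0x18 overlaps (bit 3 is shared) while a @cbm of 0x30 does not.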
+ * + * @cbm is unsigned long, even if only 32 bits are used, to make the + * bitmap functions work correctly. + * + * Return: true if @cbm overlaps with pseudo-locked region on @d, false + * otherwise. + */ +bool rdtgroup_cbm_overlaps_pseudo_locked(struct rdt_ctrl_domain *d, unsigned long cbm) +{ + unsigned int cbm_len; + unsigned long cbm_b; + + if (d->plr) { + cbm_len = d->plr->s->res->cache.cbm_len; + cbm_b = d->plr->cbm; + if (bitmap_intersects(&cbm, &cbm_b, cbm_len)) + return true; + } + return false; +} + +/** + * rdtgroup_pseudo_locked_in_hierarchy - Pseudo-locked region in cache hierarchy + * @d: RDT domain under test + * + * The setup of a pseudo-locked region affects all cache instances within + * the hierarchy of the region. It is thus essential to know if any + * pseudo-locked regions exist within a cache hierarchy to prevent any + * attempts to create new pseudo-locked regions in the same hierarchy. + * + * Return: true if a pseudo-locked region exists in the hierarchy of @d or + * if it is not possible to test due to memory allocation issue, + * false otherwise. + */ +bool rdtgroup_pseudo_locked_in_hierarchy(struct rdt_ctrl_domain *d) +{ + struct rdt_ctrl_domain *d_i; + cpumask_var_t cpu_with_psl; + struct rdt_resource *r; + bool ret = false; + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + if (!zalloc_cpumask_var(&cpu_with_psl, GFP_KERNEL)) + return true; + + /* + * First determine which cpus have pseudo-locked regions + * associated with them. + */ + for_each_alloc_capable_rdt_resource(r) { + list_for_each_entry(d_i, &r->ctrl_domains, hdr.list) { + if (d_i->plr) + cpumask_or(cpu_with_psl, cpu_with_psl, + &d_i->hdr.cpu_mask); + } + } + + /* + * Next test if new pseudo-locked region would intersect with + * existing region. + */ + if (cpumask_intersects(&d->hdr.cpu_mask, cpu_with_psl)) + ret = true; + + free_cpumask_var(cpu_with_psl); + return ret; +} + +/** + * pseudo_lock_measure_cycles - Trigger latency measure to pseudo-locked region + * @rdtgrp: Resource group to which the pseudo-locked region belongs. + * @sel: Selector of which measurement to perform on a pseudo-locked region. + * + * The measurement of latency to access a pseudo-locked region should be + * done from a cpu that is associated with that pseudo-locked region. + * Determine which cpu is associated with this region and start a thread on + * that cpu to perform the measurement, wait for that thread to complete. 
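+ *
+ * @sel selects the measurement: 1 measures access latency, 2 measures L2
+ * cache residency and 3 measures L3 cache residency (see
+ * pseudo_lock_measure_trigger()). The user triggers it by writing the
+ * selector to the region's "pseudo_lock_measure" debugfs file, typically
+ * /sys/kernel/debug/resctrl/<group>/pseudo_lock_measure, and the results
+ * are reported by the architecture's measurement functions, normally as
+ * tracepoints.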
+ * + * Return: 0 on success, <0 on failure + */ +static int pseudo_lock_measure_cycles(struct rdtgroup *rdtgrp, int sel) +{ + struct pseudo_lock_region *plr = rdtgrp->plr; + struct task_struct *thread; + unsigned int cpu; + int ret = -1; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + if (rdtgrp->flags & RDT_DELETED) { + ret = -ENODEV; + goto out; + } + + if (!plr->d) { + ret = -ENODEV; + goto out; + } + + plr->thread_done = 0; + cpu = cpumask_first(&plr->d->hdr.cpu_mask); + if (!cpu_online(cpu)) { + ret = -ENODEV; + goto out; + } + + plr->cpu = cpu; + + if (sel == 1) + thread = kthread_run_on_cpu(resctrl_arch_measure_cycles_lat_fn, + plr, cpu, "pseudo_lock_measure/%u"); + else if (sel == 2) + thread = kthread_run_on_cpu(resctrl_arch_measure_l2_residency, + plr, cpu, "pseudo_lock_measure/%u"); + else if (sel == 3) + thread = kthread_run_on_cpu(resctrl_arch_measure_l3_residency, + plr, cpu, "pseudo_lock_measure/%u"); + else + goto out; + + if (IS_ERR(thread)) { + ret = PTR_ERR(thread); + goto out; + } + + ret = wait_event_interruptible(plr->lock_thread_wq, + plr->thread_done == 1); + if (ret < 0) + goto out; + + ret = 0; + +out: + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + return ret; +} + +static ssize_t pseudo_lock_measure_trigger(struct file *file, + const char __user *user_buf, + size_t count, loff_t *ppos) +{ + struct rdtgroup *rdtgrp = file->private_data; + size_t buf_size; + char buf[32]; + int ret; + int sel; + + buf_size = min(count, (sizeof(buf) - 1)); + if (copy_from_user(buf, user_buf, buf_size)) + return -EFAULT; + + buf[buf_size] = '\0'; + ret = kstrtoint(buf, 10, &sel); + if (ret == 0) { + if (sel != 1 && sel != 2 && sel != 3) + return -EINVAL; + ret = debugfs_file_get(file->f_path.dentry); + if (ret) + return ret; + ret = pseudo_lock_measure_cycles(rdtgrp, sel); + if (ret == 0) + ret = count; + debugfs_file_put(file->f_path.dentry); + } + + return ret; +} + +static const struct file_operations pseudo_measure_fops = { + .write = pseudo_lock_measure_trigger, + .open = simple_open, + .llseek = default_llseek, +}; + +/** + * rdtgroup_pseudo_lock_create - Create a pseudo-locked region + * @rdtgrp: resource group to which pseudo-lock region belongs + * + * Called when a resource group in the pseudo-locksetup mode receives a + * valid schemata that should be pseudo-locked. Since the resource group is + * in pseudo-locksetup mode the &struct pseudo_lock_region has already been + * allocated and initialized with the essential information. If a failure + * occurs the resource group remains in the pseudo-locksetup mode with the + * &struct pseudo_lock_region associated with it, but cleared from all + * information and ready for the user to re-attempt pseudo-locking by + * writing the schemata again. + * + * Return: 0 if the pseudo-locked region was successfully pseudo-locked, <0 + * on failure. Descriptive error will be written to last_cmd_status buffer. 
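+ *
+ * From user space the usual sequence is: create a resource group, write
+ * "pseudo-locksetup" to its "mode" file, then write the schemata of the
+ * region to be locked to its "schemata" file, which lands here. On success
+ * the group's "mode" file reads back "pseudo-locked" and a character
+ * device for the region is created (see pseudo_lock_devnode()).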
+ */ +int rdtgroup_pseudo_lock_create(struct rdtgroup *rdtgrp) +{ + struct pseudo_lock_region *plr = rdtgrp->plr; + struct task_struct *thread; + unsigned int new_minor; + struct device *dev; + char *kn_name __free(kfree) = NULL; + int ret; + + ret = pseudo_lock_region_alloc(plr); + if (ret < 0) + return ret; + + ret = pseudo_lock_cstates_constrain(plr); + if (ret < 0) { + ret = -EINVAL; + goto out_region; + } + kn_name = kstrdup(rdt_kn_name(rdtgrp->kn), GFP_KERNEL); + if (!kn_name) { + ret = -ENOMEM; + goto out_cstates; + } + + plr->thread_done = 0; + + thread = kthread_run_on_cpu(resctrl_arch_pseudo_lock_fn, plr, + plr->cpu, "pseudo_lock/%u"); + if (IS_ERR(thread)) { + ret = PTR_ERR(thread); + rdt_last_cmd_printf("Locking thread returned error %d\n", ret); + goto out_cstates; + } + + ret = wait_event_interruptible(plr->lock_thread_wq, + plr->thread_done == 1); + if (ret < 0) { + /* + * If the thread does not get on the CPU for whatever + * reason and the process which sets up the region is + * interrupted then this will leave the thread in runnable + * state and once it gets on the CPU it will dereference + * the cleared, but not freed, plr struct resulting in an + * empty pseudo-locking loop. + */ + rdt_last_cmd_puts("Locking thread interrupted\n"); + goto out_cstates; + } + + ret = pseudo_lock_minor_get(&new_minor); + if (ret < 0) { + rdt_last_cmd_puts("Unable to obtain a new minor number\n"); + goto out_cstates; + } + + /* + * Unlock access but do not release the reference. The + * pseudo-locked region will still be here on return. + * + * The mutex has to be released temporarily to avoid a potential + * deadlock with the mm->mmap_lock which is obtained in the + * device_create() and debugfs_create_dir() callpath below as well as + * before the mmap() callback is called. + */ + mutex_unlock(&rdtgroup_mutex); + + if (!IS_ERR_OR_NULL(debugfs_resctrl)) { + plr->debugfs_dir = debugfs_create_dir(kn_name, debugfs_resctrl); + if (!IS_ERR_OR_NULL(plr->debugfs_dir)) + debugfs_create_file("pseudo_lock_measure", 0200, + plr->debugfs_dir, rdtgrp, + &pseudo_measure_fops); + } + + dev = device_create(&pseudo_lock_class, NULL, + MKDEV(pseudo_lock_major, new_minor), + rdtgrp, "%s", kn_name); + + mutex_lock(&rdtgroup_mutex); + + if (IS_ERR(dev)) { + ret = PTR_ERR(dev); + rdt_last_cmd_printf("Failed to create character device: %d\n", + ret); + goto out_debugfs; + } + + /* We released the mutex - check if group was removed while we did so */ + if (rdtgrp->flags & RDT_DELETED) { + ret = -ENODEV; + goto out_device; + } + + plr->minor = new_minor; + + rdtgrp->mode = RDT_MODE_PSEUDO_LOCKED; + closid_free(rdtgrp->closid); + rdtgroup_kn_mode_restore(rdtgrp, "cpus", 0444); + rdtgroup_kn_mode_restore(rdtgrp, "cpus_list", 0444); + + ret = 0; + goto out; + +out_device: + device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, new_minor)); +out_debugfs: + debugfs_remove_recursive(plr->debugfs_dir); + pseudo_lock_minor_release(new_minor); +out_cstates: + pseudo_lock_cstates_relax(plr); +out_region: + pseudo_lock_region_clear(plr); +out: + return ret; +} + +/** + * rdtgroup_pseudo_lock_remove - Remove a pseudo-locked region + * @rdtgrp: resource group to which the pseudo-locked region belongs + * + * The removal of a pseudo-locked region can be initiated when the resource + * group is removed from user space via a "rmdir" from userspace or the + * unmount of the resctrl filesystem. 
On removal the resource group does + * not go back to pseudo-locksetup mode before it is removed, instead it is + * removed directly. There is thus asymmetry with the creation where the + * &struct pseudo_lock_region is removed here while it was not created in + * rdtgroup_pseudo_lock_create(). + * + * Return: void + */ +void rdtgroup_pseudo_lock_remove(struct rdtgroup *rdtgrp) +{ + struct pseudo_lock_region *plr = rdtgrp->plr; + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + /* + * Default group cannot be a pseudo-locked region so we can + * free closid here. + */ + closid_free(rdtgrp->closid); + goto free; + } + + pseudo_lock_cstates_relax(plr); + debugfs_remove_recursive(rdtgrp->plr->debugfs_dir); + device_destroy(&pseudo_lock_class, MKDEV(pseudo_lock_major, plr->minor)); + pseudo_lock_minor_release(plr->minor); + +free: + pseudo_lock_free(rdtgrp); +} + +static int pseudo_lock_dev_open(struct inode *inode, struct file *filp) +{ + struct rdtgroup *rdtgrp; + + mutex_lock(&rdtgroup_mutex); + + rdtgrp = region_find_by_minor(iminor(inode)); + if (!rdtgrp) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + + filp->private_data = rdtgrp; + atomic_inc(&rdtgrp->waitcount); + /* Perform a non-seekable open - llseek is not supported */ + filp->f_mode &= ~(FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE); + + mutex_unlock(&rdtgroup_mutex); + + return 0; +} + +static int pseudo_lock_dev_release(struct inode *inode, struct file *filp) +{ + struct rdtgroup *rdtgrp; + + mutex_lock(&rdtgroup_mutex); + rdtgrp = filp->private_data; + WARN_ON(!rdtgrp); + if (!rdtgrp) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + filp->private_data = NULL; + atomic_dec(&rdtgrp->waitcount); + mutex_unlock(&rdtgroup_mutex); + return 0; +} + +static int pseudo_lock_dev_mremap(struct vm_area_struct *area) +{ + /* Not supported */ + return -EINVAL; +} + +static const struct vm_operations_struct pseudo_mmap_ops = { + .mremap = pseudo_lock_dev_mremap, +}; + +static int pseudo_lock_dev_mmap(struct file *filp, struct vm_area_struct *vma) +{ + unsigned long vsize = vma->vm_end - vma->vm_start; + unsigned long off = vma->vm_pgoff << PAGE_SHIFT; + struct pseudo_lock_region *plr; + struct rdtgroup *rdtgrp; + unsigned long physical; + unsigned long psize; + + mutex_lock(&rdtgroup_mutex); + + rdtgrp = filp->private_data; + WARN_ON(!rdtgrp); + if (!rdtgrp) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + + plr = rdtgrp->plr; + + if (!plr->d) { + mutex_unlock(&rdtgroup_mutex); + return -ENODEV; + } + + /* + * Task is required to run with affinity to the cpus associated + * with the pseudo-locked region. If this is not the case the task + * may be scheduled elsewhere and invalidate entries in the + * pseudo-locked region. + */ + if (!cpumask_subset(current->cpus_ptr, &plr->d->hdr.cpu_mask)) { + mutex_unlock(&rdtgroup_mutex); + return -EINVAL; + } + + physical = __pa(plr->kmem) >> PAGE_SHIFT; + psize = plr->size - off; + + if (off > plr->size) { + mutex_unlock(&rdtgroup_mutex); + return -ENOSPC; + } + + /* + * Ensure changes are carried directly to the memory being mapped, + * do not allow copy-on-write mapping. 
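+ *
+ * A minimal sketch of how a user might map the region (the group name,
+ * region size and cache line stride below are illustrative, not queried
+ * from the kernel here):
+ *
+ *	int fd = open("/dev/pseudo_lock/p0", O_RDWR);
+ *	char *mem = mmap(NULL, 262144, PROT_READ | PROT_WRITE,
+ *			 MAP_SHARED, fd, 0);
+ *	for (size_t i = 0; i < 262144; i += 64)
+ *		mem[i] = 1;
+ *
+ * MAP_PRIVATE mappings are rejected below and the calling task must
+ * already be affine to the CPUs of the locked cache domain (checked
+ * above).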
+ */ + if (!(vma->vm_flags & VM_SHARED)) { + mutex_unlock(&rdtgroup_mutex); + return -EINVAL; + } + + if (vsize > psize) { + mutex_unlock(&rdtgroup_mutex); + return -ENOSPC; + } + + memset(plr->kmem + off, 0, vsize); + + if (remap_pfn_range(vma, vma->vm_start, physical + vma->vm_pgoff, + vsize, vma->vm_page_prot)) { + mutex_unlock(&rdtgroup_mutex); + return -EAGAIN; + } + vma->vm_ops = &pseudo_mmap_ops; + mutex_unlock(&rdtgroup_mutex); + return 0; +} + +static const struct file_operations pseudo_lock_dev_fops = { + .owner = THIS_MODULE, + .read = NULL, + .write = NULL, + .open = pseudo_lock_dev_open, + .release = pseudo_lock_dev_release, + .mmap = pseudo_lock_dev_mmap, +}; + +int rdt_pseudo_lock_init(void) +{ + int ret; + + ret = register_chrdev(0, "pseudo_lock", &pseudo_lock_dev_fops); + if (ret < 0) + return ret; + + pseudo_lock_major = ret; + + ret = class_register(&pseudo_lock_class); + if (ret) { + unregister_chrdev(pseudo_lock_major, "pseudo_lock"); + return ret; + } + + return 0; +} + +void rdt_pseudo_lock_release(void) +{ + class_unregister(&pseudo_lock_class); + unregister_chrdev(pseudo_lock_major, "pseudo_lock"); + pseudo_lock_major = 0; +} diff --git a/fs/resctrl/rdtgroup.c b/fs/resctrl/rdtgroup.c new file mode 100644 index 000000000000..cc37f58b47dd --- /dev/null +++ b/fs/resctrl/rdtgroup.c @@ -0,0 +1,4353 @@ +// SPDX-License-Identifier: GPL-2.0-only +/* + * User interface for Resource Allocation in Resource Director Technology(RDT) + * + * Copyright (C) 2016 Intel Corporation + * + * Author: Fenghua Yu <fenghua.yu@intel.com> + * + * More information about RDT be found in the Intel (R) x86 Architecture + * Software Developer Manual. + */ + +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include <linux/cpu.h> +#include <linux/debugfs.h> +#include <linux/fs.h> +#include <linux/fs_parser.h> +#include <linux/sysfs.h> +#include <linux/kernfs.h> +#include <linux/resctrl.h> +#include <linux/seq_buf.h> +#include <linux/seq_file.h> +#include <linux/sched/task.h> +#include <linux/slab.h> +#include <linux/user_namespace.h> + +#include <uapi/linux/magic.h> + +#include "internal.h" + +/* Mutex to protect rdtgroup access. */ +DEFINE_MUTEX(rdtgroup_mutex); + +static struct kernfs_root *rdt_root; + +struct rdtgroup rdtgroup_default; + +LIST_HEAD(rdt_all_groups); + +/* list of entries for the schemata file */ +LIST_HEAD(resctrl_schema_all); + +/* + * List of struct mon_data containing private data of event files for use by + * rdtgroup_mondata_show(). Protected by rdtgroup_mutex. + */ +static LIST_HEAD(mon_data_kn_priv_list); + +/* The filesystem can only be mounted once. */ +bool resctrl_mounted; + +/* Kernel fs node for "info" directory under root */ +static struct kernfs_node *kn_info; + +/* Kernel fs node for "mon_groups" directory under root */ +static struct kernfs_node *kn_mongrp; + +/* Kernel fs node for "mon_data" directory under root */ +static struct kernfs_node *kn_mondata; + +/* + * Used to store the max resource name width to display the schemata names in + * a tabular format. + */ +int max_name_width; + +static struct seq_buf last_cmd_status; + +static char last_cmd_status_buf[512]; + +static int rdtgroup_setup_root(struct rdt_fs_context *ctx); + +static void rdtgroup_destroy_root(void); + +struct dentry *debugfs_resctrl; + +/* + * Memory bandwidth monitoring event to use for the default CTRL_MON group + * and each new CTRL_MON group created by the user. 
Only relevant when + * the filesystem is mounted with the "mba_MBps" option so it does not + * matter that it remains uninitialized on systems that do not support + * the "mba_MBps" option. + */ +enum resctrl_event_id mba_mbps_default_event; + +static bool resctrl_debug; + +void rdt_last_cmd_clear(void) +{ + lockdep_assert_held(&rdtgroup_mutex); + seq_buf_clear(&last_cmd_status); +} + +void rdt_last_cmd_puts(const char *s) +{ + lockdep_assert_held(&rdtgroup_mutex); + seq_buf_puts(&last_cmd_status, s); +} + +void rdt_last_cmd_printf(const char *fmt, ...) +{ + va_list ap; + + va_start(ap, fmt); + lockdep_assert_held(&rdtgroup_mutex); + seq_buf_vprintf(&last_cmd_status, fmt, ap); + va_end(ap); +} + +void rdt_staged_configs_clear(void) +{ + struct rdt_ctrl_domain *dom; + struct rdt_resource *r; + + lockdep_assert_held(&rdtgroup_mutex); + + for_each_alloc_capable_rdt_resource(r) { + list_for_each_entry(dom, &r->ctrl_domains, hdr.list) + memset(dom->staged_config, 0, sizeof(dom->staged_config)); + } +} + +static bool resctrl_is_mbm_enabled(void) +{ + return (resctrl_arch_is_mbm_total_enabled() || + resctrl_arch_is_mbm_local_enabled()); +} + +static bool resctrl_is_mbm_event(int e) +{ + return (e >= QOS_L3_MBM_TOTAL_EVENT_ID && + e <= QOS_L3_MBM_LOCAL_EVENT_ID); +} + +/* + * Trivial allocator for CLOSIDs. Use BITMAP APIs to manipulate a bitmap + * of free CLOSIDs. + * + * Using a global CLOSID across all resources has some advantages and + * some drawbacks: + * + We can simply set current's closid to assign a task to a resource + * group. + * + Context switch code can avoid extra memory references deciding which + * CLOSID to load into the PQR_ASSOC MSR + * - We give up some options in configuring resource groups across multi-socket + * systems. + * - Our choices on how to configure each resource become progressively more + * limited as the number of resources grows. 
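+ *
+ * For example (illustrative numbers): if L3 CAT supports 16 CLOSIDs but
+ * MBA only supports 8, closid_init() below limits the shared pool to 8 by
+ * taking the minimum number of CLOSIDs across all schemata.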
+ */ +static unsigned long *closid_free_map; + +static int closid_free_map_len; + +int closids_supported(void) +{ + return closid_free_map_len; +} + +static int closid_init(void) +{ + struct resctrl_schema *s; + u32 rdt_min_closid = ~0; + + /* Monitor only platforms still call closid_init() */ + if (list_empty(&resctrl_schema_all)) + return 0; + + /* Compute rdt_min_closid across all resources */ + list_for_each_entry(s, &resctrl_schema_all, list) + rdt_min_closid = min(rdt_min_closid, s->num_closid); + + closid_free_map = bitmap_alloc(rdt_min_closid, GFP_KERNEL); + if (!closid_free_map) + return -ENOMEM; + bitmap_fill(closid_free_map, rdt_min_closid); + + /* RESCTRL_RESERVED_CLOSID is always reserved for the default group */ + __clear_bit(RESCTRL_RESERVED_CLOSID, closid_free_map); + closid_free_map_len = rdt_min_closid; + + return 0; +} + +static void closid_exit(void) +{ + bitmap_free(closid_free_map); + closid_free_map = NULL; +} + +static int closid_alloc(void) +{ + int cleanest_closid; + u32 closid; + + lockdep_assert_held(&rdtgroup_mutex); + + if (IS_ENABLED(CONFIG_RESCTRL_RMID_DEPENDS_ON_CLOSID) && + resctrl_arch_is_llc_occupancy_enabled()) { + cleanest_closid = resctrl_find_cleanest_closid(); + if (cleanest_closid < 0) + return cleanest_closid; + closid = cleanest_closid; + } else { + closid = find_first_bit(closid_free_map, closid_free_map_len); + if (closid == closid_free_map_len) + return -ENOSPC; + } + __clear_bit(closid, closid_free_map); + + return closid; +} + +void closid_free(int closid) +{ + lockdep_assert_held(&rdtgroup_mutex); + + __set_bit(closid, closid_free_map); +} + +/** + * closid_allocated - test if provided closid is in use + * @closid: closid to be tested + * + * Return: true if @closid is currently associated with a resource group, + * false if @closid is free + */ +bool closid_allocated(unsigned int closid) +{ + lockdep_assert_held(&rdtgroup_mutex); + + return !test_bit(closid, closid_free_map); +} + +/** + * rdtgroup_mode_by_closid - Return mode of resource group with closid + * @closid: closid if the resource group + * + * Each resource group is associated with a @closid. Here the mode + * of a resource group can be queried by searching for it using its closid. 
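+ * This is used, for example, by rdt_bit_usage_show() below to classify
+ * how the bits of each allocated closid are currently being used.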
+ * + * Return: mode as &enum rdtgrp_mode of resource group with closid @closid + */ +enum rdtgrp_mode rdtgroup_mode_by_closid(int closid) +{ + struct rdtgroup *rdtgrp; + + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (rdtgrp->closid == closid) + return rdtgrp->mode; + } + + return RDT_NUM_MODES; +} + +static const char * const rdt_mode_str[] = { + [RDT_MODE_SHAREABLE] = "shareable", + [RDT_MODE_EXCLUSIVE] = "exclusive", + [RDT_MODE_PSEUDO_LOCKSETUP] = "pseudo-locksetup", + [RDT_MODE_PSEUDO_LOCKED] = "pseudo-locked", +}; + +/** + * rdtgroup_mode_str - Return the string representation of mode + * @mode: the resource group mode as &enum rdtgroup_mode + * + * Return: string representation of valid mode, "unknown" otherwise + */ +static const char *rdtgroup_mode_str(enum rdtgrp_mode mode) +{ + if (mode < RDT_MODE_SHAREABLE || mode >= RDT_NUM_MODES) + return "unknown"; + + return rdt_mode_str[mode]; +} + +/* set uid and gid of rdtgroup dirs and files to that of the creator */ +static int rdtgroup_kn_set_ugid(struct kernfs_node *kn) +{ + struct iattr iattr = { .ia_valid = ATTR_UID | ATTR_GID, + .ia_uid = current_fsuid(), + .ia_gid = current_fsgid(), }; + + if (uid_eq(iattr.ia_uid, GLOBAL_ROOT_UID) && + gid_eq(iattr.ia_gid, GLOBAL_ROOT_GID)) + return 0; + + return kernfs_setattr(kn, &iattr); +} + +static int rdtgroup_add_file(struct kernfs_node *parent_kn, struct rftype *rft) +{ + struct kernfs_node *kn; + int ret; + + kn = __kernfs_create_file(parent_kn, rft->name, rft->mode, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, + 0, rft->kf_ops, rft, NULL, NULL); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + kernfs_remove(kn); + return ret; + } + + return 0; +} + +static int rdtgroup_seqfile_show(struct seq_file *m, void *arg) +{ + struct kernfs_open_file *of = m->private; + struct rftype *rft = of->kn->priv; + + if (rft->seq_show) + return rft->seq_show(of, m, arg); + return 0; +} + +static ssize_t rdtgroup_file_write(struct kernfs_open_file *of, char *buf, + size_t nbytes, loff_t off) +{ + struct rftype *rft = of->kn->priv; + + if (rft->write) + return rft->write(of, buf, nbytes, off); + + return -EINVAL; +} + +static const struct kernfs_ops rdtgroup_kf_single_ops = { + .atomic_write_len = PAGE_SIZE, + .write = rdtgroup_file_write, + .seq_show = rdtgroup_seqfile_show, +}; + +static const struct kernfs_ops kf_mondata_ops = { + .atomic_write_len = PAGE_SIZE, + .seq_show = rdtgroup_mondata_show, +}; + +static bool is_cpu_list(struct kernfs_open_file *of) +{ + struct rftype *rft = of->kn->priv; + + return rft->flags & RFTYPE_FLAGS_CPUS_LIST; +} + +static int rdtgroup_cpus_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + struct cpumask *mask; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + + if (rdtgrp) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + if (!rdtgrp->plr->d) { + rdt_last_cmd_clear(); + rdt_last_cmd_puts("Cache domain offline\n"); + ret = -ENODEV; + } else { + mask = &rdtgrp->plr->d->hdr.cpu_mask; + seq_printf(s, is_cpu_list(of) ? + "%*pbl\n" : "%*pb\n", + cpumask_pr_args(mask)); + } + } else { + seq_printf(s, is_cpu_list(of) ? "%*pbl\n" : "%*pb\n", + cpumask_pr_args(&rdtgrp->cpu_mask)); + } + } else { + ret = -ENOENT; + } + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +/* + * Update the PGR_ASSOC MSR on all cpus in @cpu_mask, + * + * Per task closids/rmids must have been set up before calling this function. + * @r may be NULL. 
+ */ +static void +update_closid_rmid(const struct cpumask *cpu_mask, struct rdtgroup *r) +{ + struct resctrl_cpu_defaults defaults, *p = NULL; + + if (r) { + defaults.closid = r->closid; + defaults.rmid = r->mon.rmid; + p = &defaults; + } + + on_each_cpu_mask(cpu_mask, resctrl_arch_sync_cpu_closid_rmid, p, 1); +} + +static int cpus_mon_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask) +{ + struct rdtgroup *prgrp = rdtgrp->mon.parent, *crgrp; + struct list_head *head; + + /* Check whether cpus belong to parent ctrl group */ + cpumask_andnot(tmpmask, newmask, &prgrp->cpu_mask); + if (!cpumask_empty(tmpmask)) { + rdt_last_cmd_puts("Can only add CPUs to mongroup that belong to parent\n"); + return -EINVAL; + } + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (!cpumask_empty(tmpmask)) { + /* Give any dropped cpus to parent rdtgroup */ + cpumask_or(&prgrp->cpu_mask, &prgrp->cpu_mask, tmpmask); + update_closid_rmid(tmpmask, prgrp); + } + + /* + * If we added cpus, remove them from previous group that owned them + * and update per-cpu rmid + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (!cpumask_empty(tmpmask)) { + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + if (crgrp == rdtgrp) + continue; + cpumask_andnot(&crgrp->cpu_mask, &crgrp->cpu_mask, + tmpmask); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + return 0; +} + +static void cpumask_rdtgrp_clear(struct rdtgroup *r, struct cpumask *m) +{ + struct rdtgroup *crgrp; + + cpumask_andnot(&r->cpu_mask, &r->cpu_mask, m); + /* update the child mon group masks as well*/ + list_for_each_entry(crgrp, &r->mon.crdtgrp_list, mon.crdtgrp_list) + cpumask_and(&crgrp->cpu_mask, &r->cpu_mask, &crgrp->cpu_mask); +} + +static int cpus_ctrl_write(struct rdtgroup *rdtgrp, cpumask_var_t newmask, + cpumask_var_t tmpmask, cpumask_var_t tmpmask1) +{ + struct rdtgroup *r, *crgrp; + struct list_head *head; + + /* Check whether cpus are dropped from this group */ + cpumask_andnot(tmpmask, &rdtgrp->cpu_mask, newmask); + if (!cpumask_empty(tmpmask)) { + /* Can't drop from default group */ + if (rdtgrp == &rdtgroup_default) { + rdt_last_cmd_puts("Can't drop CPUs from default group\n"); + return -EINVAL; + } + + /* Give any dropped cpus to rdtgroup_default */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, tmpmask); + update_closid_rmid(tmpmask, &rdtgroup_default); + } + + /* + * If we added cpus, remove them from previous group and + * the prev group's child groups that owned them + * and update per-cpu closid/rmid. + */ + cpumask_andnot(tmpmask, newmask, &rdtgrp->cpu_mask); + if (!cpumask_empty(tmpmask)) { + list_for_each_entry(r, &rdt_all_groups, rdtgroup_list) { + if (r == rdtgrp) + continue; + cpumask_and(tmpmask1, &r->cpu_mask, tmpmask); + if (!cpumask_empty(tmpmask1)) + cpumask_rdtgrp_clear(r, tmpmask1); + } + update_closid_rmid(tmpmask, rdtgrp); + } + + /* Done pushing/pulling - update this group with new mask */ + cpumask_copy(&rdtgrp->cpu_mask, newmask); + + /* + * Clear child mon group masks since there is a new parent mask + * now and update the rmid for the cpus the child lost. 
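+ * After this every child monitor group starts with an empty CPU mask and
+ * must be given CPUs again explicitly; the CPUs a child held that remain
+ * in the new parent mask are counted against the parent's RMID from now
+ * on.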
+ */ + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + cpumask_and(tmpmask, &rdtgrp->cpu_mask, &crgrp->cpu_mask); + update_closid_rmid(tmpmask, rdtgrp); + cpumask_clear(&crgrp->cpu_mask); + } + + return 0; +} + +static ssize_t rdtgroup_cpus_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + cpumask_var_t tmpmask, newmask, tmpmask1; + struct rdtgroup *rdtgrp; + int ret; + + if (!buf) + return -EINVAL; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + if (!zalloc_cpumask_var(&newmask, GFP_KERNEL)) { + free_cpumask_var(tmpmask); + return -ENOMEM; + } + if (!zalloc_cpumask_var(&tmpmask1, GFP_KERNEL)) { + free_cpumask_var(tmpmask); + free_cpumask_var(newmask); + return -ENOMEM; + } + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + ret = -ENOENT; + goto unlock; + } + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = -EINVAL; + rdt_last_cmd_puts("Pseudo-locking in progress\n"); + goto unlock; + } + + if (is_cpu_list(of)) + ret = cpulist_parse(buf, newmask); + else + ret = cpumask_parse(buf, newmask); + + if (ret) { + rdt_last_cmd_puts("Bad CPU list/mask\n"); + goto unlock; + } + + /* check that user didn't specify any offline cpus */ + cpumask_andnot(tmpmask, newmask, cpu_online_mask); + if (!cpumask_empty(tmpmask)) { + ret = -EINVAL; + rdt_last_cmd_puts("Can only assign online CPUs\n"); + goto unlock; + } + + if (rdtgrp->type == RDTCTRL_GROUP) + ret = cpus_ctrl_write(rdtgrp, newmask, tmpmask, tmpmask1); + else if (rdtgrp->type == RDTMON_GROUP) + ret = cpus_mon_write(rdtgrp, newmask, tmpmask); + else + ret = -EINVAL; + +unlock: + rdtgroup_kn_unlock(of->kn); + free_cpumask_var(tmpmask); + free_cpumask_var(newmask); + free_cpumask_var(tmpmask1); + + return ret ?: nbytes; +} + +/** + * rdtgroup_remove - the helper to remove resource group safely + * @rdtgrp: resource group to remove + * + * On resource group creation via a mkdir, an extra kernfs_node reference is + * taken to ensure that the rdtgroup structure remains accessible for the + * rdtgroup_kn_unlock() calls where it is removed. + * + * Drop the extra reference here, then free the rdtgroup structure. + * + * Return: void + */ +static void rdtgroup_remove(struct rdtgroup *rdtgrp) +{ + kernfs_put(rdtgrp->kn); + kfree(rdtgrp); +} + +static void _update_task_closid_rmid(void *task) +{ + /* + * If the task is still current on this CPU, update PQR_ASSOC MSR. + * Otherwise, the MSR is updated when the task is scheduled in. + */ + if (task == current) + resctrl_arch_sched_in(task); +} + +static void update_task_closid_rmid(struct task_struct *t) +{ + if (IS_ENABLED(CONFIG_SMP) && task_curr(t)) + smp_call_function_single(task_cpu(t), _update_task_closid_rmid, t, 1); + else + _update_task_closid_rmid(t); +} + +static bool task_in_rdtgroup(struct task_struct *tsk, struct rdtgroup *rdtgrp) +{ + u32 closid, rmid = rdtgrp->mon.rmid; + + if (rdtgrp->type == RDTCTRL_GROUP) + closid = rdtgrp->closid; + else if (rdtgrp->type == RDTMON_GROUP) + closid = rdtgrp->mon.parent->closid; + else + return false; + + return resctrl_arch_match_closid(tsk, closid) && + resctrl_arch_match_rmid(tsk, closid, rmid); +} + +static int __rdtgroup_move_task(struct task_struct *tsk, + struct rdtgroup *rdtgrp) +{ + /* If the task is already in rdtgrp, no need to move the task. */ + if (task_in_rdtgroup(tsk, rdtgrp)) + return 0; + + /* + * Set the task's closid/rmid before the PQR_ASSOC MSR can be + * updated by them. 
+ * + * For ctrl_mon groups, move both closid and rmid. + * For monitor groups, can move the tasks only from + * their parent CTRL group. + */ + if (rdtgrp->type == RDTMON_GROUP && + !resctrl_arch_match_closid(tsk, rdtgrp->mon.parent->closid)) { + rdt_last_cmd_puts("Can't move task to different control group\n"); + return -EINVAL; + } + + if (rdtgrp->type == RDTMON_GROUP) + resctrl_arch_set_closid_rmid(tsk, rdtgrp->mon.parent->closid, + rdtgrp->mon.rmid); + else + resctrl_arch_set_closid_rmid(tsk, rdtgrp->closid, + rdtgrp->mon.rmid); + + /* + * Ensure the task's closid and rmid are written before determining if + * the task is current that will decide if it will be interrupted. + * This pairs with the full barrier between the rq->curr update and + * resctrl_arch_sched_in() during context switch. + */ + smp_mb(); + + /* + * By now, the task's closid and rmid are set. If the task is current + * on a CPU, the PQR_ASSOC MSR needs to be updated to make the resource + * group go into effect. If the task is not current, the MSR will be + * updated when the task is scheduled in. + */ + update_task_closid_rmid(tsk); + + return 0; +} + +static bool is_closid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (resctrl_arch_alloc_capable() && (r->type == RDTCTRL_GROUP) && + resctrl_arch_match_closid(t, r->closid)); +} + +static bool is_rmid_match(struct task_struct *t, struct rdtgroup *r) +{ + return (resctrl_arch_mon_capable() && (r->type == RDTMON_GROUP) && + resctrl_arch_match_rmid(t, r->mon.parent->closid, + r->mon.rmid)); +} + +/** + * rdtgroup_tasks_assigned - Test if tasks have been assigned to resource group + * @r: Resource group + * + * Return: 1 if tasks have been assigned to @r, 0 otherwise + */ +int rdtgroup_tasks_assigned(struct rdtgroup *r) +{ + struct task_struct *p, *t; + int ret = 0; + + lockdep_assert_held(&rdtgroup_mutex); + + rcu_read_lock(); + for_each_process_thread(p, t) { + if (is_closid_match(t, r) || is_rmid_match(t, r)) { + ret = 1; + break; + } + } + rcu_read_unlock(); + + return ret; +} + +static int rdtgroup_task_write_permission(struct task_struct *task, + struct kernfs_open_file *of) +{ + const struct cred *tcred = get_task_cred(task); + const struct cred *cred = current_cred(); + int ret = 0; + + /* + * Even if we're attaching all tasks in the thread group, we only + * need to check permissions on one of them. 
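+ *
+ * The move is permitted when the writer is root or when the writer's
+ * effective uid matches the target task's real or saved uid, similar to
+ * the checks applied to cgroup task attachment.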
+ */ + if (!uid_eq(cred->euid, GLOBAL_ROOT_UID) && + !uid_eq(cred->euid, tcred->uid) && + !uid_eq(cred->euid, tcred->suid)) { + rdt_last_cmd_printf("No permission to move task %d\n", task->pid); + ret = -EPERM; + } + + put_cred(tcred); + return ret; +} + +static int rdtgroup_move_task(pid_t pid, struct rdtgroup *rdtgrp, + struct kernfs_open_file *of) +{ + struct task_struct *tsk; + int ret; + + rcu_read_lock(); + if (pid) { + tsk = find_task_by_vpid(pid); + if (!tsk) { + rcu_read_unlock(); + rdt_last_cmd_printf("No task %d\n", pid); + return -ESRCH; + } + } else { + tsk = current; + } + + get_task_struct(tsk); + rcu_read_unlock(); + + ret = rdtgroup_task_write_permission(tsk, of); + if (!ret) + ret = __rdtgroup_move_task(tsk, rdtgrp); + + put_task_struct(tsk); + return ret; +} + +static ssize_t rdtgroup_tasks_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + char *pid_str; + int ret = 0; + pid_t pid; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + rdt_last_cmd_clear(); + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = -EINVAL; + rdt_last_cmd_puts("Pseudo-locking in progress\n"); + goto unlock; + } + + while (buf && buf[0] != '\0' && buf[0] != '\n') { + pid_str = strim(strsep(&buf, ",")); + + if (kstrtoint(pid_str, 0, &pid)) { + rdt_last_cmd_printf("Task list parsing error pid %s\n", pid_str); + ret = -EINVAL; + break; + } + + if (pid < 0) { + rdt_last_cmd_printf("Invalid pid %d\n", pid); + ret = -EINVAL; + break; + } + + ret = rdtgroup_move_task(pid, rdtgrp, of); + if (ret) { + rdt_last_cmd_printf("Error while processing task %d\n", pid); + break; + } + } + +unlock: + rdtgroup_kn_unlock(of->kn); + + return ret ?: nbytes; +} + +static void show_rdt_tasks(struct rdtgroup *r, struct seq_file *s) +{ + struct task_struct *p, *t; + pid_t pid; + + rcu_read_lock(); + for_each_process_thread(p, t) { + if (is_closid_match(t, r) || is_rmid_match(t, r)) { + pid = task_pid_vnr(t); + if (pid) + seq_printf(s, "%d\n", pid); + } + } + rcu_read_unlock(); +} + +static int rdtgroup_tasks_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + show_rdt_tasks(rdtgrp, s); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +static int rdtgroup_closid_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + seq_printf(s, "%u\n", rdtgrp->closid); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +static int rdtgroup_rmid_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + int ret = 0; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (rdtgrp) + seq_printf(s, "%u\n", rdtgrp->mon.rmid); + else + ret = -ENOENT; + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +#ifdef CONFIG_PROC_CPU_RESCTRL +/* + * A task can only be part of one resctrl control group and of one monitor + * group which is associated to that control group. + * + * 1) res: + * mon: + * + * resctrl is not available. + * + * 2) res:/ + * mon: + * + * Task is part of the root resctrl control group, and it is not associated + * to any monitor group. + * + * 3) res:/ + * mon:mon0 + * + * Task is part of the root resctrl control group and monitor group mon0. 
+ * + * 4) res:group0 + * mon: + * + * Task is part of resctrl control group group0, and it is not associated + * to any monitor group. + * + * 5) res:group0 + * mon:mon1 + * + * Task is part of resctrl control group group0 and monitor group mon1. + */ +int proc_resctrl_show(struct seq_file *s, struct pid_namespace *ns, + struct pid *pid, struct task_struct *tsk) +{ + struct rdtgroup *rdtg; + int ret = 0; + + mutex_lock(&rdtgroup_mutex); + + /* Return empty if resctrl has not been mounted. */ + if (!resctrl_mounted) { + seq_puts(s, "res:\nmon:\n"); + goto unlock; + } + + list_for_each_entry(rdtg, &rdt_all_groups, rdtgroup_list) { + struct rdtgroup *crg; + + /* + * Task information is only relevant for shareable + * and exclusive groups. + */ + if (rdtg->mode != RDT_MODE_SHAREABLE && + rdtg->mode != RDT_MODE_EXCLUSIVE) + continue; + + if (!resctrl_arch_match_closid(tsk, rdtg->closid)) + continue; + + seq_printf(s, "res:%s%s\n", (rdtg == &rdtgroup_default) ? "/" : "", + rdt_kn_name(rdtg->kn)); + seq_puts(s, "mon:"); + list_for_each_entry(crg, &rdtg->mon.crdtgrp_list, + mon.crdtgrp_list) { + if (!resctrl_arch_match_rmid(tsk, crg->mon.parent->closid, + crg->mon.rmid)) + continue; + seq_printf(s, "%s", rdt_kn_name(crg->kn)); + break; + } + seq_putc(s, '\n'); + goto unlock; + } + /* + * The above search should succeed. Otherwise return + * with an error. + */ + ret = -ENOENT; +unlock: + mutex_unlock(&rdtgroup_mutex); + + return ret; +} +#endif + +static int rdt_last_cmd_status_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + int len; + + mutex_lock(&rdtgroup_mutex); + len = seq_buf_used(&last_cmd_status); + if (len) + seq_printf(seq, "%.*s", len, last_cmd_status_buf); + else + seq_puts(seq, "ok\n"); + mutex_unlock(&rdtgroup_mutex); + return 0; +} + +static void *rdt_kn_parent_priv(struct kernfs_node *kn) +{ + /* + * The parent pointer is only valid within RCU section since it can be + * replaced. + */ + guard(rcu)(); + return rcu_dereference(kn->__parent)->priv; +} + +static int rdt_num_closids_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + + seq_printf(seq, "%u\n", s->num_closid); + return 0; +} + +static int rdt_default_ctrl_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + seq_printf(seq, "%x\n", resctrl_get_default_ctrl(r)); + return 0; +} + +static int rdt_min_cbm_bits_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + seq_printf(seq, "%u\n", r->cache.min_cbm_bits); + return 0; +} + +static int rdt_shareable_bits_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + seq_printf(seq, "%x\n", r->cache.shareable_bits); + return 0; +} + +/* + * rdt_bit_usage_show - Display current usage of resources + * + * A domain is a shared resource that can now be allocated differently. Here + * we display the current regions of the domain as an annotated bitmask. 
+ * For each domain of this resource its allocation bitmask + * is annotated as below to indicate the current usage of the corresponding bit: + * 0 - currently unused + * X - currently available for sharing and used by software and hardware + * H - currently used by hardware only but available for software use + * S - currently used and shareable by software only + * E - currently used exclusively by one resource group + * P - currently pseudo-locked by one resource group + */ +static int rdt_bit_usage_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + /* + * Use unsigned long even though only 32 bits are used to ensure + * test_bit() is used safely. + */ + unsigned long sw_shareable = 0, hw_shareable = 0; + unsigned long exclusive = 0, pseudo_locked = 0; + struct rdt_resource *r = s->res; + struct rdt_ctrl_domain *dom; + int i, hwb, swb, excl, psl; + enum rdtgrp_mode mode; + bool sep = false; + u32 ctrl_val; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + hw_shareable = r->cache.shareable_bits; + list_for_each_entry(dom, &r->ctrl_domains, hdr.list) { + if (sep) + seq_putc(seq, ';'); + sw_shareable = 0; + exclusive = 0; + seq_printf(seq, "%d=", dom->hdr.id); + for (i = 0; i < closids_supported(); i++) { + if (!closid_allocated(i)) + continue; + ctrl_val = resctrl_arch_get_config(r, dom, i, + s->conf_type); + mode = rdtgroup_mode_by_closid(i); + switch (mode) { + case RDT_MODE_SHAREABLE: + sw_shareable |= ctrl_val; + break; + case RDT_MODE_EXCLUSIVE: + exclusive |= ctrl_val; + break; + case RDT_MODE_PSEUDO_LOCKSETUP: + /* + * RDT_MODE_PSEUDO_LOCKSETUP is possible + * here but not included since the CBM + * associated with this CLOSID in this mode + * is not initialized and no task or cpu can be + * assigned this CLOSID. + */ + break; + case RDT_MODE_PSEUDO_LOCKED: + case RDT_NUM_MODES: + WARN(1, + "invalid mode for closid %d\n", i); + break; + } + } + for (i = r->cache.cbm_len - 1; i >= 0; i--) { + pseudo_locked = dom->plr ? 
dom->plr->cbm : 0; + hwb = test_bit(i, &hw_shareable); + swb = test_bit(i, &sw_shareable); + excl = test_bit(i, &exclusive); + psl = test_bit(i, &pseudo_locked); + if (hwb && swb) + seq_putc(seq, 'X'); + else if (hwb && !swb) + seq_putc(seq, 'H'); + else if (!hwb && swb) + seq_putc(seq, 'S'); + else if (excl) + seq_putc(seq, 'E'); + else if (psl) + seq_putc(seq, 'P'); + else /* Unused bits remain */ + seq_putc(seq, '0'); + } + sep = true; + } + seq_putc(seq, '\n'); + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + return 0; +} + +static int rdt_min_bw_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + seq_printf(seq, "%u\n", r->membw.min_bw); + return 0; +} + +static int rdt_num_rmids_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + + seq_printf(seq, "%d\n", r->num_rmid); + + return 0; +} + +static int rdt_mon_features_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + struct mon_evt *mevt; + + list_for_each_entry(mevt, &r->evt_list, list) { + seq_printf(seq, "%s\n", mevt->name); + if (mevt->configurable) + seq_printf(seq, "%s_config\n", mevt->name); + } + + return 0; +} + +static int rdt_bw_gran_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + seq_printf(seq, "%u\n", r->membw.bw_gran); + return 0; +} + +static int rdt_delay_linear_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + seq_printf(seq, "%u\n", r->membw.delay_linear); + return 0; +} + +static int max_threshold_occ_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + seq_printf(seq, "%u\n", resctrl_rmid_realloc_threshold); + + return 0; +} + +static int rdt_thread_throttle_mode_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + switch (r->membw.throttle_mode) { + case THREAD_THROTTLE_PER_THREAD: + seq_puts(seq, "per-thread\n"); + return 0; + case THREAD_THROTTLE_MAX: + seq_puts(seq, "max\n"); + return 0; + case THREAD_THROTTLE_UNDEFINED: + seq_puts(seq, "undefined\n"); + return 0; + } + + WARN_ON_ONCE(1); + + return 0; +} + +static ssize_t max_threshold_occ_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + unsigned int bytes; + int ret; + + ret = kstrtouint(buf, 0, &bytes); + if (ret) + return ret; + + if (bytes > resctrl_rmid_realloc_limit) + return -EINVAL; + + resctrl_rmid_realloc_threshold = resctrl_arch_round_mon_val(bytes); + + return nbytes; +} + +/* + * rdtgroup_mode_show - Display mode of this resource group + */ +static int rdtgroup_mode_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct rdtgroup *rdtgrp; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + seq_printf(s, "%s\n", rdtgroup_mode_str(rdtgrp->mode)); + + rdtgroup_kn_unlock(of->kn); + return 0; +} + +static enum resctrl_conf_type resctrl_peer_type(enum resctrl_conf_type my_type) +{ + switch (my_type) { + case CDP_CODE: + return CDP_DATA; + case CDP_DATA: + return CDP_CODE; + default: + case CDP_NONE: + return 
CDP_NONE; + } +} + +static int rdt_has_sparse_bitmasks_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct resctrl_schema *s = rdt_kn_parent_priv(of->kn); + struct rdt_resource *r = s->res; + + seq_printf(seq, "%u\n", r->cache.arch_has_sparse_bitmasks); + + return 0; +} + +/** + * __rdtgroup_cbm_overlaps - Does CBM for intended closid overlap with other + * @r: Resource to which domain instance @d belongs. + * @d: The domain instance for which @closid is being tested. + * @cbm: Capacity bitmask being tested. + * @closid: Intended closid for @cbm. + * @type: CDP type of @r. + * @exclusive: Only check if overlaps with exclusive resource groups + * + * Checks if provided @cbm intended to be used for @closid on domain + * @d overlaps with any other closids or other hardware usage associated + * with this domain. If @exclusive is true then only overlaps with + * resource groups in exclusive mode will be considered. If @exclusive + * is false then overlaps with any resource group or hardware entities + * will be considered. + * + * @cbm is unsigned long, even if only 32 bits are used, to make the + * bitmap functions work correctly. + * + * Return: false if CBM does not overlap, true if it does. + */ +static bool __rdtgroup_cbm_overlaps(struct rdt_resource *r, struct rdt_ctrl_domain *d, + unsigned long cbm, int closid, + enum resctrl_conf_type type, bool exclusive) +{ + enum rdtgrp_mode mode; + unsigned long ctrl_b; + int i; + + /* Check for any overlap with regions used by hardware directly */ + if (!exclusive) { + ctrl_b = r->cache.shareable_bits; + if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) + return true; + } + + /* Check for overlap with other resource groups */ + for (i = 0; i < closids_supported(); i++) { + ctrl_b = resctrl_arch_get_config(r, d, i, type); + mode = rdtgroup_mode_by_closid(i); + if (closid_allocated(i) && i != closid && + mode != RDT_MODE_PSEUDO_LOCKSETUP) { + if (bitmap_intersects(&cbm, &ctrl_b, r->cache.cbm_len)) { + if (exclusive) { + if (mode == RDT_MODE_EXCLUSIVE) + return true; + continue; + } + return true; + } + } + } + + return false; +} + +/** + * rdtgroup_cbm_overlaps - Does CBM overlap with other use of hardware + * @s: Schema for the resource to which domain instance @d belongs. + * @d: The domain instance for which @closid is being tested. + * @cbm: Capacity bitmask being tested. + * @closid: Intended closid for @cbm. + * @exclusive: Only check if overlaps with exclusive resource groups + * + * Resources that can be allocated using a CBM can use the CBM to control + * the overlap of these allocations. rdtgroup_cmb_overlaps() is the test + * for overlap. Overlap test is not limited to the specific resource for + * which the CBM is intended though - when dealing with CDP resources that + * share the underlying hardware the overlap check should be performed on + * the CDP resource sharing the hardware also. + * + * Refer to description of __rdtgroup_cbm_overlaps() for the details of the + * overlap test. 
+ * + * Return: true if CBM overlap detected, false if there is no overlap + */ +bool rdtgroup_cbm_overlaps(struct resctrl_schema *s, struct rdt_ctrl_domain *d, + unsigned long cbm, int closid, bool exclusive) +{ + enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); + struct rdt_resource *r = s->res; + + if (__rdtgroup_cbm_overlaps(r, d, cbm, closid, s->conf_type, + exclusive)) + return true; + + if (!resctrl_arch_get_cdp_enabled(r->rid)) + return false; + return __rdtgroup_cbm_overlaps(r, d, cbm, closid, peer_type, exclusive); +} + +/** + * rdtgroup_mode_test_exclusive - Test if this resource group can be exclusive + * @rdtgrp: Resource group identified through its closid. + * + * An exclusive resource group implies that there should be no sharing of + * its allocated resources. At the time this group is considered to be + * exclusive this test can determine if its current schemata supports this + * setting by testing for overlap with all other resource groups. + * + * Return: true if resource group can be exclusive, false if there is overlap + * with allocations of other resource groups and thus this resource group + * cannot be exclusive. + */ +static bool rdtgroup_mode_test_exclusive(struct rdtgroup *rdtgrp) +{ + int closid = rdtgrp->closid; + struct rdt_ctrl_domain *d; + struct resctrl_schema *s; + struct rdt_resource *r; + bool has_cache = false; + u32 ctrl; + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; + if (r->rid == RDT_RESOURCE_MBA || r->rid == RDT_RESOURCE_SMBA) + continue; + has_cache = true; + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + ctrl = resctrl_arch_get_config(r, d, closid, + s->conf_type); + if (rdtgroup_cbm_overlaps(s, d, ctrl, closid, false)) { + rdt_last_cmd_puts("Schemata overlaps\n"); + return false; + } + } + } + + if (!has_cache) { + rdt_last_cmd_puts("Cannot be exclusive without CAT/CDP\n"); + return false; + } + + return true; +} + +/* + * rdtgroup_mode_write - Modify the resource group's mode + */ +static ssize_t rdtgroup_mode_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct rdtgroup *rdtgrp; + enum rdtgrp_mode mode; + int ret = 0; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + buf[nbytes - 1] = '\0'; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + rdt_last_cmd_clear(); + + mode = rdtgrp->mode; + + if ((!strcmp(buf, "shareable") && mode == RDT_MODE_SHAREABLE) || + (!strcmp(buf, "exclusive") && mode == RDT_MODE_EXCLUSIVE) || + (!strcmp(buf, "pseudo-locksetup") && + mode == RDT_MODE_PSEUDO_LOCKSETUP) || + (!strcmp(buf, "pseudo-locked") && mode == RDT_MODE_PSEUDO_LOCKED)) + goto out; + + if (mode == RDT_MODE_PSEUDO_LOCKED) { + rdt_last_cmd_puts("Cannot change pseudo-locked group\n"); + ret = -EINVAL; + goto out; + } + + if (!strcmp(buf, "shareable")) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = rdtgroup_locksetup_exit(rdtgrp); + if (ret) + goto out; + } + rdtgrp->mode = RDT_MODE_SHAREABLE; + } else if (!strcmp(buf, "exclusive")) { + if (!rdtgroup_mode_test_exclusive(rdtgrp)) { + ret = -EINVAL; + goto out; + } + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + ret = rdtgroup_locksetup_exit(rdtgrp); + if (ret) + goto out; + } + rdtgrp->mode = RDT_MODE_EXCLUSIVE; + } else if (IS_ENABLED(CONFIG_RESCTRL_FS_PSEUDO_LOCK) && + !strcmp(buf, 
"pseudo-locksetup")) { + ret = rdtgroup_locksetup_enter(rdtgrp); + if (ret) + goto out; + rdtgrp->mode = RDT_MODE_PSEUDO_LOCKSETUP; + } else { + rdt_last_cmd_puts("Unknown or unsupported mode\n"); + ret = -EINVAL; + } + +out: + rdtgroup_kn_unlock(of->kn); + return ret ?: nbytes; +} + +/** + * rdtgroup_cbm_to_size - Translate CBM to size in bytes + * @r: RDT resource to which @d belongs. + * @d: RDT domain instance. + * @cbm: bitmask for which the size should be computed. + * + * The bitmask provided associated with the RDT domain instance @d will be + * translated into how many bytes it represents. The size in bytes is + * computed by first dividing the total cache size by the CBM length to + * determine how many bytes each bit in the bitmask represents. The result + * is multiplied with the number of bits set in the bitmask. + * + * @cbm is unsigned long, even if only 32 bits are used to make the + * bitmap functions work correctly. + */ +unsigned int rdtgroup_cbm_to_size(struct rdt_resource *r, + struct rdt_ctrl_domain *d, unsigned long cbm) +{ + unsigned int size = 0; + struct cacheinfo *ci; + int num_b; + + if (WARN_ON_ONCE(r->ctrl_scope != RESCTRL_L2_CACHE && r->ctrl_scope != RESCTRL_L3_CACHE)) + return size; + + num_b = bitmap_weight(&cbm, r->cache.cbm_len); + ci = get_cpu_cacheinfo_level(cpumask_any(&d->hdr.cpu_mask), r->ctrl_scope); + if (ci) + size = ci->size / r->cache.cbm_len * num_b; + + return size; +} + +bool is_mba_sc(struct rdt_resource *r) +{ + if (!r) + r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); + + /* + * The software controller support is only applicable to MBA resource. + * Make sure to check for resource type. + */ + if (r->rid != RDT_RESOURCE_MBA) + return false; + + return r->membw.mba_sc; +} + +/* + * rdtgroup_size_show - Display size in bytes of allocated regions + * + * The "size" file mirrors the layout of the "schemata" file, printing the + * size in bytes of each region instead of the capacity bitmask. 
+ */ +static int rdtgroup_size_show(struct kernfs_open_file *of, + struct seq_file *s, void *v) +{ + struct resctrl_schema *schema; + enum resctrl_conf_type type; + struct rdt_ctrl_domain *d; + struct rdtgroup *rdtgrp; + struct rdt_resource *r; + unsigned int size; + int ret = 0; + u32 closid; + bool sep; + u32 ctrl; + + rdtgrp = rdtgroup_kn_lock_live(of->kn); + if (!rdtgrp) { + rdtgroup_kn_unlock(of->kn); + return -ENOENT; + } + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + if (!rdtgrp->plr->d) { + rdt_last_cmd_clear(); + rdt_last_cmd_puts("Cache domain offline\n"); + ret = -ENODEV; + } else { + seq_printf(s, "%*s:", max_name_width, + rdtgrp->plr->s->name); + size = rdtgroup_cbm_to_size(rdtgrp->plr->s->res, + rdtgrp->plr->d, + rdtgrp->plr->cbm); + seq_printf(s, "%d=%u\n", rdtgrp->plr->d->hdr.id, size); + } + goto out; + } + + closid = rdtgrp->closid; + + list_for_each_entry(schema, &resctrl_schema_all, list) { + r = schema->res; + type = schema->conf_type; + sep = false; + seq_printf(s, "%*s:", max_name_width, schema->name); + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + if (sep) + seq_putc(s, ';'); + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP) { + size = 0; + } else { + if (is_mba_sc(r)) + ctrl = d->mbps_val[closid]; + else + ctrl = resctrl_arch_get_config(r, d, + closid, + type); + if (r->rid == RDT_RESOURCE_MBA || + r->rid == RDT_RESOURCE_SMBA) + size = ctrl; + else + size = rdtgroup_cbm_to_size(r, d, ctrl); + } + seq_printf(s, "%d=%u", d->hdr.id, size); + sep = true; + } + seq_putc(s, '\n'); + } + +out: + rdtgroup_kn_unlock(of->kn); + + return ret; +} + +static void mondata_config_read(struct resctrl_mon_config_info *mon_info) +{ + smp_call_function_any(&mon_info->d->hdr.cpu_mask, + resctrl_arch_mon_event_config_read, mon_info, 1); +} + +static int mbm_config_show(struct seq_file *s, struct rdt_resource *r, u32 evtid) +{ + struct resctrl_mon_config_info mon_info; + struct rdt_mon_domain *dom; + bool sep = false; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + list_for_each_entry(dom, &r->mon_domains, hdr.list) { + if (sep) + seq_puts(s, ";"); + + memset(&mon_info, 0, sizeof(struct resctrl_mon_config_info)); + mon_info.r = r; + mon_info.d = dom; + mon_info.evtid = evtid; + mondata_config_read(&mon_info); + + seq_printf(s, "%d=0x%02x", dom->hdr.id, mon_info.mon_config); + sep = true; + } + seq_puts(s, "\n"); + + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return 0; +} + +static int mbm_total_bytes_config_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + + mbm_config_show(seq, r, QOS_L3_MBM_TOTAL_EVENT_ID); + + return 0; +} + +static int mbm_local_bytes_config_show(struct kernfs_open_file *of, + struct seq_file *seq, void *v) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + + mbm_config_show(seq, r, QOS_L3_MBM_LOCAL_EVENT_ID); + + return 0; +} + +static void mbm_config_write_domain(struct rdt_resource *r, + struct rdt_mon_domain *d, u32 evtid, u32 val) +{ + struct resctrl_mon_config_info mon_info = {0}; + + /* + * Read the current config value first. If both are the same then + * no need to write it again. + */ + mon_info.r = r; + mon_info.d = d; + mon_info.evtid = evtid; + mondata_config_read(&mon_info); + if (mon_info.mon_config == val) + return; + + mon_info.mon_config = val; + + /* + * Update MSR_IA32_EVT_CFG_BASE MSR on one of the CPUs in the + * domain. The MSRs offset from MSR MSR_IA32_EVT_CFG_BASE + * are scoped at the domain level. 
Writing any of these MSRs + * on one CPU is observed by all the CPUs in the domain. + */ + smp_call_function_any(&d->hdr.cpu_mask, resctrl_arch_mon_event_config_write, + &mon_info, 1); + + /* + * When an Event Configuration is changed, the bandwidth counters + * for all RMIDs and Events will be cleared by the hardware. The + * hardware also sets MSR_IA32_QM_CTR.Unavailable (bit 62) for + * every RMID on the next read to any event for every RMID. + * Subsequent reads will have MSR_IA32_QM_CTR.Unavailable (bit 62) + * cleared while it is tracked by the hardware. Clear the + * mbm_local and mbm_total counts for all the RMIDs. + */ + resctrl_arch_reset_rmid_all(r, d); +} + +static int mon_config_write(struct rdt_resource *r, char *tok, u32 evtid) +{ + char *dom_str = NULL, *id_str; + unsigned long dom_id, val; + struct rdt_mon_domain *d; + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + +next: + if (!tok || tok[0] == '\0') + return 0; + + /* Start processing the strings for each domain */ + dom_str = strim(strsep(&tok, ";")); + id_str = strsep(&dom_str, "="); + + if (!id_str || kstrtoul(id_str, 10, &dom_id)) { + rdt_last_cmd_puts("Missing '=' or non-numeric domain id\n"); + return -EINVAL; + } + + if (!dom_str || kstrtoul(dom_str, 16, &val)) { + rdt_last_cmd_puts("Non-numeric event configuration value\n"); + return -EINVAL; + } + + /* Value from user cannot be more than the supported set of events */ + if ((val & r->mbm_cfg_mask) != val) { + rdt_last_cmd_printf("Invalid event configuration: max valid mask is 0x%02x\n", + r->mbm_cfg_mask); + return -EINVAL; + } + + list_for_each_entry(d, &r->mon_domains, hdr.list) { + if (d->hdr.id == dom_id) { + mbm_config_write_domain(r, d, evtid, val); + goto next; + } + } + + return -EINVAL; +} + +static ssize_t mbm_total_bytes_config_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + int ret; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + buf[nbytes - 1] = '\0'; + + ret = mon_config_write(r, buf, QOS_L3_MBM_TOTAL_EVENT_ID); + + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret ?: nbytes; +} + +static ssize_t mbm_local_bytes_config_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, + loff_t off) +{ + struct rdt_resource *r = rdt_kn_parent_priv(of->kn); + int ret; + + /* Valid input requires a trailing newline */ + if (nbytes == 0 || buf[nbytes - 1] != '\n') + return -EINVAL; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + buf[nbytes - 1] = '\0'; + + ret = mon_config_write(r, buf, QOS_L3_MBM_LOCAL_EVENT_ID); + + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + return ret ?: nbytes; +} + +/* rdtgroup information files for one cache resource. 
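+ * The RFTYPE_* fflags of each entry select where the file appears:
+ * rdtgroup_add_files() creates a file only when all of its fflags are
+ * set in the flags passed for the directory being populated. Entries
+ * with no fflags (e.g. thread_throttle_mode) are enabled later through
+ * resctrl_file_fflags_init().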
*/ +static struct rftype res_common_files[] = { + { + .name = "last_cmd_status", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_last_cmd_status_show, + .fflags = RFTYPE_TOP_INFO, + }, + { + .name = "num_closids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_closids_show, + .fflags = RFTYPE_CTRL_INFO, + }, + { + .name = "mon_features", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_mon_features_show, + .fflags = RFTYPE_MON_INFO, + }, + { + .name = "num_rmids", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_num_rmids_show, + .fflags = RFTYPE_MON_INFO, + }, + { + .name = "cbm_mask", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_default_ctrl_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "min_cbm_bits", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_min_cbm_bits_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "shareable_bits", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_shareable_bits_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "bit_usage", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_bit_usage_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "min_bandwidth", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_min_bw_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, + }, + { + .name = "bandwidth_gran", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_bw_gran_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, + }, + { + .name = "delay_linear", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_delay_linear_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_MB, + }, + /* + * Platform specific which (if any) capabilities are provided by + * thread_throttle_mode. Defer "fflags" initialization to platform + * discovery. 
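+ * thread_throttle_mode_init() below sets RFTYPE_CTRL_INFO | RFTYPE_RES_MB
+ * once either MBA or SMBA reports a defined throttle mode.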
+ */ + { + .name = "thread_throttle_mode", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_thread_throttle_mode_show, + }, + { + .name = "max_threshold_occupancy", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = max_threshold_occ_write, + .seq_show = max_threshold_occ_show, + .fflags = RFTYPE_MON_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "mbm_total_bytes_config", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = mbm_total_bytes_config_show, + .write = mbm_total_bytes_config_write, + }, + { + .name = "mbm_local_bytes_config", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = mbm_local_bytes_config_show, + .write = mbm_local_bytes_config_write, + }, + { + .name = "cpus", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "cpus_list", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_cpus_write, + .seq_show = rdtgroup_cpus_show, + .flags = RFTYPE_FLAGS_CPUS_LIST, + .fflags = RFTYPE_BASE, + }, + { + .name = "tasks", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_tasks_write, + .seq_show = rdtgroup_tasks_show, + .fflags = RFTYPE_BASE, + }, + { + .name = "mon_hw_id", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdtgroup_rmid_show, + .fflags = RFTYPE_MON_BASE | RFTYPE_DEBUG, + }, + { + .name = "schemata", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_schemata_write, + .seq_show = rdtgroup_schemata_show, + .fflags = RFTYPE_CTRL_BASE, + }, + { + .name = "mba_MBps_event", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_mba_mbps_event_write, + .seq_show = rdtgroup_mba_mbps_event_show, + }, + { + .name = "mode", + .mode = 0644, + .kf_ops = &rdtgroup_kf_single_ops, + .write = rdtgroup_mode_write, + .seq_show = rdtgroup_mode_show, + .fflags = RFTYPE_CTRL_BASE, + }, + { + .name = "size", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdtgroup_size_show, + .fflags = RFTYPE_CTRL_BASE, + }, + { + .name = "sparse_masks", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdt_has_sparse_bitmasks_show, + .fflags = RFTYPE_CTRL_INFO | RFTYPE_RES_CACHE, + }, + { + .name = "ctrl_hw_id", + .mode = 0444, + .kf_ops = &rdtgroup_kf_single_ops, + .seq_show = rdtgroup_closid_show, + .fflags = RFTYPE_CTRL_BASE | RFTYPE_DEBUG, + }, +}; + +static int rdtgroup_add_files(struct kernfs_node *kn, unsigned long fflags) +{ + struct rftype *rfts, *rft; + int ret, len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + lockdep_assert_held(&rdtgroup_mutex); + + if (resctrl_debug) + fflags |= RFTYPE_DEBUG; + + for (rft = rfts; rft < rfts + len; rft++) { + if (rft->fflags && ((fflags & rft->fflags) == rft->fflags)) { + ret = rdtgroup_add_file(kn, rft); + if (ret) + goto error; + } + } + + return 0; +error: + pr_warn("Failed to add %s, err=%d\n", rft->name, ret); + while (--rft >= rfts) { + if ((fflags & rft->fflags) == rft->fflags) + kernfs_remove_by_name(kn, rft->name); + } + return ret; +} + +static struct rftype *rdtgroup_get_rftype_by_name(const char *name) +{ + struct rftype *rfts, *rft; + int len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + for (rft = rfts; rft < rfts + len; rft++) { + if (!strcmp(rft->name, name)) + return rft; + } + + return NULL; +} + +static void thread_throttle_mode_init(void) +{ + enum membw_throttle_mode throttle_mode = 
THREAD_THROTTLE_UNDEFINED; + struct rdt_resource *r_mba, *r_smba; + + r_mba = resctrl_arch_get_resource(RDT_RESOURCE_MBA); + if (r_mba->alloc_capable && + r_mba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) + throttle_mode = r_mba->membw.throttle_mode; + + r_smba = resctrl_arch_get_resource(RDT_RESOURCE_SMBA); + if (r_smba->alloc_capable && + r_smba->membw.throttle_mode != THREAD_THROTTLE_UNDEFINED) + throttle_mode = r_smba->membw.throttle_mode; + + if (throttle_mode == THREAD_THROTTLE_UNDEFINED) + return; + + resctrl_file_fflags_init("thread_throttle_mode", + RFTYPE_CTRL_INFO | RFTYPE_RES_MB); +} + +void resctrl_file_fflags_init(const char *config, unsigned long fflags) +{ + struct rftype *rft; + + rft = rdtgroup_get_rftype_by_name(config); + if (rft) + rft->fflags = fflags; +} + +/** + * rdtgroup_kn_mode_restrict - Restrict user access to named resctrl file + * @r: The resource group with which the file is associated. + * @name: Name of the file + * + * The permissions of named resctrl file, directory, or link are modified + * to not allow read, write, or execute by any user. + * + * WARNING: This function is intended to communicate to the user that the + * resctrl file has been locked down - that it is not relevant to the + * particular state the system finds itself in. It should not be relied + * on to protect from user access because after the file's permissions + * are restricted the user can still change the permissions using chmod + * from the command line. + * + * Return: 0 on success, <0 on failure. + */ +int rdtgroup_kn_mode_restrict(struct rdtgroup *r, const char *name) +{ + struct iattr iattr = {.ia_valid = ATTR_MODE,}; + struct kernfs_node *kn; + int ret = 0; + + kn = kernfs_find_and_get_ns(r->kn, name, NULL); + if (!kn) + return -ENOENT; + + switch (kernfs_type(kn)) { + case KERNFS_DIR: + iattr.ia_mode = S_IFDIR; + break; + case KERNFS_FILE: + iattr.ia_mode = S_IFREG; + break; + case KERNFS_LINK: + iattr.ia_mode = S_IFLNK; + break; + } + + ret = kernfs_setattr(kn, &iattr); + kernfs_put(kn); + return ret; +} + +/** + * rdtgroup_kn_mode_restore - Restore user access to named resctrl file + * @r: The resource group with which the file is associated. + * @name: Name of the file + * @mask: Mask of permissions that should be restored + * + * Restore the permissions of the named file. If @name is a directory the + * permissions of its parent will be used. + * + * Return: 0 on success, <0 on failure. 
+ */ +int rdtgroup_kn_mode_restore(struct rdtgroup *r, const char *name, + umode_t mask) +{ + struct iattr iattr = {.ia_valid = ATTR_MODE,}; + struct kernfs_node *kn, *parent; + struct rftype *rfts, *rft; + int ret, len; + + rfts = res_common_files; + len = ARRAY_SIZE(res_common_files); + + for (rft = rfts; rft < rfts + len; rft++) { + if (!strcmp(rft->name, name)) + iattr.ia_mode = rft->mode & mask; + } + + kn = kernfs_find_and_get_ns(r->kn, name, NULL); + if (!kn) + return -ENOENT; + + switch (kernfs_type(kn)) { + case KERNFS_DIR: + parent = kernfs_get_parent(kn); + if (parent) { + iattr.ia_mode |= parent->mode; + kernfs_put(parent); + } + iattr.ia_mode |= S_IFDIR; + break; + case KERNFS_FILE: + iattr.ia_mode |= S_IFREG; + break; + case KERNFS_LINK: + iattr.ia_mode |= S_IFLNK; + break; + } + + ret = kernfs_setattr(kn, &iattr); + kernfs_put(kn); + return ret; +} + +static int rdtgroup_mkdir_info_resdir(void *priv, char *name, + unsigned long fflags) +{ + struct kernfs_node *kn_subdir; + int ret; + + kn_subdir = kernfs_create_dir(kn_info, name, + kn_info->mode, priv); + if (IS_ERR(kn_subdir)) + return PTR_ERR(kn_subdir); + + ret = rdtgroup_kn_set_ugid(kn_subdir); + if (ret) + return ret; + + ret = rdtgroup_add_files(kn_subdir, fflags); + if (!ret) + kernfs_activate(kn_subdir); + + return ret; +} + +static unsigned long fflags_from_resource(struct rdt_resource *r) +{ + switch (r->rid) { + case RDT_RESOURCE_L3: + case RDT_RESOURCE_L2: + return RFTYPE_RES_CACHE; + case RDT_RESOURCE_MBA: + case RDT_RESOURCE_SMBA: + return RFTYPE_RES_MB; + } + + return WARN_ON_ONCE(1); +} + +static int rdtgroup_create_info_dir(struct kernfs_node *parent_kn) +{ + struct resctrl_schema *s; + struct rdt_resource *r; + unsigned long fflags; + char name[32]; + int ret; + + /* create the directory */ + kn_info = kernfs_create_dir(parent_kn, "info", parent_kn->mode, NULL); + if (IS_ERR(kn_info)) + return PTR_ERR(kn_info); + + ret = rdtgroup_add_files(kn_info, RFTYPE_TOP_INFO); + if (ret) + goto out_destroy; + + /* loop over enabled controls, these are all alloc_capable */ + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; + fflags = fflags_from_resource(r) | RFTYPE_CTRL_INFO; + ret = rdtgroup_mkdir_info_resdir(s, s->name, fflags); + if (ret) + goto out_destroy; + } + + for_each_mon_capable_rdt_resource(r) { + fflags = fflags_from_resource(r) | RFTYPE_MON_INFO; + sprintf(name, "%s_MON", r->name); + ret = rdtgroup_mkdir_info_resdir(r, name, fflags); + if (ret) + goto out_destroy; + } + + ret = rdtgroup_kn_set_ugid(kn_info); + if (ret) + goto out_destroy; + + kernfs_activate(kn_info); + + return 0; + +out_destroy: + kernfs_remove(kn_info); + return ret; +} + +static int +mongroup_create_dir(struct kernfs_node *parent_kn, struct rdtgroup *prgrp, + char *name, struct kernfs_node **dest_kn) +{ + struct kernfs_node *kn; + int ret; + + /* create the directory */ + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + if (dest_kn) + *dest_kn = kn; + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + + kernfs_activate(kn); + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +static inline bool is_mba_linear(void) +{ + return resctrl_arch_get_resource(RDT_RESOURCE_MBA)->membw.delay_linear; +} + +static int mba_sc_domain_allocate(struct rdt_resource *r, struct rdt_ctrl_domain *d) +{ + u32 num_closid = resctrl_arch_get_num_closid(r); + int cpu = cpumask_any(&d->hdr.cpu_mask); + int i; + + d->mbps_val = kcalloc_node(num_closid, 
sizeof(*d->mbps_val), + GFP_KERNEL, cpu_to_node(cpu)); + if (!d->mbps_val) + return -ENOMEM; + + for (i = 0; i < num_closid; i++) + d->mbps_val[i] = MBA_MAX_MBPS; + + return 0; +} + +static void mba_sc_domain_destroy(struct rdt_resource *r, + struct rdt_ctrl_domain *d) +{ + kfree(d->mbps_val); + d->mbps_val = NULL; +} + +/* + * MBA software controller is supported only if + * MBM is supported and MBA is in linear scale, + * and the MBM monitor scope is the same as MBA + * control scope. + */ +static bool supports_mba_mbps(void) +{ + struct rdt_resource *rmbm = resctrl_arch_get_resource(RDT_RESOURCE_L3); + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); + + return (resctrl_is_mbm_enabled() && + r->alloc_capable && is_mba_linear() && + r->ctrl_scope == rmbm->mon_scope); +} + +/* + * Enable or disable the MBA software controller + * which helps user specify bandwidth in MBps. + */ +static int set_mba_sc(bool mba_sc) +{ + struct rdt_resource *r = resctrl_arch_get_resource(RDT_RESOURCE_MBA); + u32 num_closid = resctrl_arch_get_num_closid(r); + struct rdt_ctrl_domain *d; + unsigned long fflags; + int i; + + if (!supports_mba_mbps() || mba_sc == is_mba_sc(r)) + return -EINVAL; + + r->membw.mba_sc = mba_sc; + + rdtgroup_default.mba_mbps_event = mba_mbps_default_event; + + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + for (i = 0; i < num_closid; i++) + d->mbps_val[i] = MBA_MAX_MBPS; + } + + fflags = mba_sc ? RFTYPE_CTRL_BASE | RFTYPE_MON_BASE : 0; + resctrl_file_fflags_init("mba_MBps_event", fflags); + + return 0; +} + +/* + * We don't allow rdtgroup directories to be created anywhere + * except the root directory. Thus when looking for the rdtgroup + * structure for a kernfs node we are either looking at a directory, + * in which case the rdtgroup structure is pointed at by the "priv" + * field, otherwise we have a file, and need only look to the parent + * to find the rdtgroup. + */ +static struct rdtgroup *kernfs_to_rdtgroup(struct kernfs_node *kn) +{ + if (kernfs_type(kn) == KERNFS_DIR) { + /* + * All the resource directories use "kn->priv" + * to point to the "struct rdtgroup" for the + * resource. "info" and its subdirectories don't + * have rdtgroup structures, so return NULL here. + */ + if (kn == kn_info || + rcu_access_pointer(kn->__parent) == kn_info) + return NULL; + else + return kn->priv; + } else { + return rdt_kn_parent_priv(kn); + } +} + +static void rdtgroup_kn_get(struct rdtgroup *rdtgrp, struct kernfs_node *kn) +{ + atomic_inc(&rdtgrp->waitcount); + kernfs_break_active_protection(kn); +} + +static void rdtgroup_kn_put(struct rdtgroup *rdtgrp, struct kernfs_node *kn) +{ + if (atomic_dec_and_test(&rdtgrp->waitcount) && + (rdtgrp->flags & RDT_DELETED)) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) + rdtgroup_pseudo_lock_remove(rdtgrp); + kernfs_unbreak_active_protection(kn); + rdtgroup_remove(rdtgrp); + } else { + kernfs_unbreak_active_protection(kn); + } +} + +struct rdtgroup *rdtgroup_kn_lock_live(struct kernfs_node *kn) +{ + struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); + + if (!rdtgrp) + return NULL; + + rdtgroup_kn_get(rdtgrp, kn); + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + /* Was this group deleted while we waited? 
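+	 * If so, return NULL with both locks still held; the caller is
+	 * expected to call rdtgroup_kn_unlock(), which drops the locks and
+	 * the reference taken in rdtgroup_kn_get() above.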
*/ + if (rdtgrp->flags & RDT_DELETED) + return NULL; + + return rdtgrp; +} + +void rdtgroup_kn_unlock(struct kernfs_node *kn) +{ + struct rdtgroup *rdtgrp = kernfs_to_rdtgroup(kn); + + if (!rdtgrp) + return; + + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + + rdtgroup_kn_put(rdtgrp, kn); +} + +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **mon_data_kn); + +static void rdt_disable_ctx(void) +{ + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); + set_mba_sc(false); + + resctrl_debug = false; +} + +static int rdt_enable_ctx(struct rdt_fs_context *ctx) +{ + int ret = 0; + + if (ctx->enable_cdpl2) { + ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, true); + if (ret) + goto out_done; + } + + if (ctx->enable_cdpl3) { + ret = resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, true); + if (ret) + goto out_cdpl2; + } + + if (ctx->enable_mba_mbps) { + ret = set_mba_sc(true); + if (ret) + goto out_cdpl3; + } + + if (ctx->enable_debug) + resctrl_debug = true; + + return 0; + +out_cdpl3: + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L3, false); +out_cdpl2: + resctrl_arch_set_cdp_enabled(RDT_RESOURCE_L2, false); +out_done: + return ret; +} + +static int schemata_list_add(struct rdt_resource *r, enum resctrl_conf_type type) +{ + struct resctrl_schema *s; + const char *suffix = ""; + int ret, cl; + + s = kzalloc(sizeof(*s), GFP_KERNEL); + if (!s) + return -ENOMEM; + + s->res = r; + s->num_closid = resctrl_arch_get_num_closid(r); + if (resctrl_arch_get_cdp_enabled(r->rid)) + s->num_closid /= 2; + + s->conf_type = type; + switch (type) { + case CDP_CODE: + suffix = "CODE"; + break; + case CDP_DATA: + suffix = "DATA"; + break; + case CDP_NONE: + suffix = ""; + break; + } + + ret = snprintf(s->name, sizeof(s->name), "%s%s", r->name, suffix); + if (ret >= sizeof(s->name)) { + kfree(s); + return -EINVAL; + } + + cl = strlen(s->name); + + /* + * If CDP is supported by this resource, but not enabled, + * include the suffix. This ensures the tabular format of the + * schemata file does not change between mounts of the filesystem. + */ + if (r->cdp_capable && !resctrl_arch_get_cdp_enabled(r->rid)) + cl += 4; + + if (cl > max_name_width) + max_name_width = cl; + + switch (r->schema_fmt) { + case RESCTRL_SCHEMA_BITMAP: + s->fmt_str = "%d=%x"; + break; + case RESCTRL_SCHEMA_RANGE: + s->fmt_str = "%d=%u"; + break; + } + + if (WARN_ON_ONCE(!s->fmt_str)) { + kfree(s); + return -EINVAL; + } + + INIT_LIST_HEAD(&s->list); + list_add(&s->list, &resctrl_schema_all); + + return 0; +} + +static int schemata_list_create(void) +{ + struct rdt_resource *r; + int ret = 0; + + for_each_alloc_capable_rdt_resource(r) { + if (resctrl_arch_get_cdp_enabled(r->rid)) { + ret = schemata_list_add(r, CDP_CODE); + if (ret) + break; + + ret = schemata_list_add(r, CDP_DATA); + } else { + ret = schemata_list_add(r, CDP_NONE); + } + + if (ret) + break; + } + + return ret; +} + +static void schemata_list_destroy(void) +{ + struct resctrl_schema *s, *tmp; + + list_for_each_entry_safe(s, tmp, &resctrl_schema_all, list) { + list_del(&s->list); + kfree(s); + } +} + +static int rdt_get_tree(struct fs_context *fc) +{ + struct rdt_fs_context *ctx = rdt_fc2context(fc); + unsigned long flags = RFTYPE_CTRL_BASE; + struct rdt_mon_domain *dom; + struct rdt_resource *r; + int ret; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + /* + * resctrl file system can only be mounted once. 
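+	 * Subsequent mount attempts fail with -EBUSY. The filesystem is
+	 * normally mounted at /sys/fs/resctrl, e.g.:
+	 *   mount -t resctrl resctrl [-o cdp[,cdpl2][,mba_MBps][,debug]] /sys/fs/resctrl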
+ */ + if (resctrl_mounted) { + ret = -EBUSY; + goto out; + } + + ret = rdtgroup_setup_root(ctx); + if (ret) + goto out; + + ret = rdt_enable_ctx(ctx); + if (ret) + goto out_root; + + ret = schemata_list_create(); + if (ret) { + schemata_list_destroy(); + goto out_ctx; + } + + ret = closid_init(); + if (ret) + goto out_schemata_free; + + if (resctrl_arch_mon_capable()) + flags |= RFTYPE_MON; + + ret = rdtgroup_add_files(rdtgroup_default.kn, flags); + if (ret) + goto out_closid_exit; + + kernfs_activate(rdtgroup_default.kn); + + ret = rdtgroup_create_info_dir(rdtgroup_default.kn); + if (ret < 0) + goto out_closid_exit; + + if (resctrl_arch_mon_capable()) { + ret = mongroup_create_dir(rdtgroup_default.kn, + &rdtgroup_default, "mon_groups", + &kn_mongrp); + if (ret < 0) + goto out_info; + + ret = mkdir_mondata_all(rdtgroup_default.kn, + &rdtgroup_default, &kn_mondata); + if (ret < 0) + goto out_mongrp; + rdtgroup_default.mon.mon_data_kn = kn_mondata; + } + + ret = rdt_pseudo_lock_init(); + if (ret) + goto out_mondata; + + ret = kernfs_get_tree(fc); + if (ret < 0) + goto out_psl; + + if (resctrl_arch_alloc_capable()) + resctrl_arch_enable_alloc(); + if (resctrl_arch_mon_capable()) + resctrl_arch_enable_mon(); + + if (resctrl_arch_alloc_capable() || resctrl_arch_mon_capable()) + resctrl_mounted = true; + + if (resctrl_is_mbm_enabled()) { + r = resctrl_arch_get_resource(RDT_RESOURCE_L3); + list_for_each_entry(dom, &r->mon_domains, hdr.list) + mbm_setup_overflow_handler(dom, MBM_OVERFLOW_INTERVAL, + RESCTRL_PICK_ANY_CPU); + } + + goto out; + +out_psl: + rdt_pseudo_lock_release(); +out_mondata: + if (resctrl_arch_mon_capable()) + kernfs_remove(kn_mondata); +out_mongrp: + if (resctrl_arch_mon_capable()) + kernfs_remove(kn_mongrp); +out_info: + kernfs_remove(kn_info); +out_closid_exit: + closid_exit(); +out_schemata_free: + schemata_list_destroy(); +out_ctx: + rdt_disable_ctx(); +out_root: + rdtgroup_destroy_root(); +out: + rdt_last_cmd_clear(); + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); + return ret; +} + +enum rdt_param { + Opt_cdp, + Opt_cdpl2, + Opt_mba_mbps, + Opt_debug, + nr__rdt_params +}; + +static const struct fs_parameter_spec rdt_fs_parameters[] = { + fsparam_flag("cdp", Opt_cdp), + fsparam_flag("cdpl2", Opt_cdpl2), + fsparam_flag("mba_MBps", Opt_mba_mbps), + fsparam_flag("debug", Opt_debug), + {} +}; + +static int rdt_parse_param(struct fs_context *fc, struct fs_parameter *param) +{ + struct rdt_fs_context *ctx = rdt_fc2context(fc); + struct fs_parse_result result; + const char *msg; + int opt; + + opt = fs_parse(fc, rdt_fs_parameters, param, &result); + if (opt < 0) + return opt; + + switch (opt) { + case Opt_cdp: + ctx->enable_cdpl3 = true; + return 0; + case Opt_cdpl2: + ctx->enable_cdpl2 = true; + return 0; + case Opt_mba_mbps: + msg = "mba_MBps requires MBM and linear scale MBA at L3 scope"; + if (!supports_mba_mbps()) + return invalfc(fc, msg); + ctx->enable_mba_mbps = true; + return 0; + case Opt_debug: + ctx->enable_debug = true; + return 0; + } + + return -EINVAL; +} + +static void rdt_fs_context_free(struct fs_context *fc) +{ + struct rdt_fs_context *ctx = rdt_fc2context(fc); + + kernfs_free_fs_context(fc); + kfree(ctx); +} + +static const struct fs_context_operations rdt_fs_context_ops = { + .free = rdt_fs_context_free, + .parse_param = rdt_parse_param, + .get_tree = rdt_get_tree, +}; + +static int rdt_init_fs_context(struct fs_context *fc) +{ + struct rdt_fs_context *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + ctx->kfc.magic 
= RDTGROUP_SUPER_MAGIC; + fc->fs_private = &ctx->kfc; + fc->ops = &rdt_fs_context_ops; + put_user_ns(fc->user_ns); + fc->user_ns = get_user_ns(&init_user_ns); + fc->global = true; + return 0; +} + +/* + * Move tasks from one to the other group. If @from is NULL, then all tasks + * in the systems are moved unconditionally (used for teardown). + * + * If @mask is not NULL the cpus on which moved tasks are running are set + * in that mask so the update smp function call is restricted to affected + * cpus. + */ +static void rdt_move_group_tasks(struct rdtgroup *from, struct rdtgroup *to, + struct cpumask *mask) +{ + struct task_struct *p, *t; + + read_lock(&tasklist_lock); + for_each_process_thread(p, t) { + if (!from || is_closid_match(t, from) || + is_rmid_match(t, from)) { + resctrl_arch_set_closid_rmid(t, to->closid, + to->mon.rmid); + + /* + * Order the closid/rmid stores above before the loads + * in task_curr(). This pairs with the full barrier + * between the rq->curr update and + * resctrl_arch_sched_in() during context switch. + */ + smp_mb(); + + /* + * If the task is on a CPU, set the CPU in the mask. + * The detection is inaccurate as tasks might move or + * schedule before the smp function call takes place. + * In such a case the function call is pointless, but + * there is no other side effect. + */ + if (IS_ENABLED(CONFIG_SMP) && mask && task_curr(t)) + cpumask_set_cpu(task_cpu(t), mask); + } + } + read_unlock(&tasklist_lock); +} + +static void free_all_child_rdtgrp(struct rdtgroup *rdtgrp) +{ + struct rdtgroup *sentry, *stmp; + struct list_head *head; + + head = &rdtgrp->mon.crdtgrp_list; + list_for_each_entry_safe(sentry, stmp, head, mon.crdtgrp_list) { + free_rmid(sentry->closid, sentry->mon.rmid); + list_del(&sentry->mon.crdtgrp_list); + + if (atomic_read(&sentry->waitcount) != 0) + sentry->flags = RDT_DELETED; + else + rdtgroup_remove(sentry); + } +} + +/* + * Forcibly remove all of subdirectories under root. + */ +static void rmdir_all_sub(void) +{ + struct rdtgroup *rdtgrp, *tmp; + + /* Move all tasks to the default resource group */ + rdt_move_group_tasks(NULL, &rdtgroup_default, NULL); + + list_for_each_entry_safe(rdtgrp, tmp, &rdt_all_groups, rdtgroup_list) { + /* Free any child rmids */ + free_all_child_rdtgrp(rdtgrp); + + /* Remove each rdtgroup other than root */ + if (rdtgrp == &rdtgroup_default) + continue; + + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) + rdtgroup_pseudo_lock_remove(rdtgrp); + + /* + * Give any CPUs back to the default group. We cannot copy + * cpu_online_mask because a CPU might have executed the + * offline callback already, but is still marked online. + */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); + + kernfs_remove(rdtgrp->kn); + list_del(&rdtgrp->rdtgroup_list); + + if (atomic_read(&rdtgrp->waitcount) != 0) + rdtgrp->flags = RDT_DELETED; + else + rdtgroup_remove(rdtgrp); + } + /* Notify online CPUs to update per cpu storage and PQR_ASSOC MSR */ + update_closid_rmid(cpu_online_mask, &rdtgroup_default); + + kernfs_remove(kn_info); + kernfs_remove(kn_mongrp); + kernfs_remove(kn_mondata); +} + +/** + * mon_get_kn_priv() - Get the mon_data priv data for this event. + * + * The same values are used across the mon_data directories of all control and + * monitor groups for the same event in the same domain. 
Keep a list of + * allocated structures and re-use an existing one with the same values for + * @rid, @domid, etc. + * + * @rid: The resource id for the event file being created. + * @domid: The domain id for the event file being created. + * @mevt: The type of event file being created. + * @do_sum: Whether SNC summing monitors are being created. + */ +static struct mon_data *mon_get_kn_priv(enum resctrl_res_level rid, int domid, + struct mon_evt *mevt, + bool do_sum) +{ + struct mon_data *priv; + + lockdep_assert_held(&rdtgroup_mutex); + + list_for_each_entry(priv, &mon_data_kn_priv_list, list) { + if (priv->rid == rid && priv->domid == domid && + priv->sum == do_sum && priv->evtid == mevt->evtid) + return priv; + } + + priv = kzalloc(sizeof(*priv), GFP_KERNEL); + if (!priv) + return NULL; + + priv->rid = rid; + priv->domid = domid; + priv->sum = do_sum; + priv->evtid = mevt->evtid; + list_add_tail(&priv->list, &mon_data_kn_priv_list); + + return priv; +} + +/** + * mon_put_kn_priv() - Free all allocated mon_data structures. + * + * Called when resctrl file system is unmounted. + */ +static void mon_put_kn_priv(void) +{ + struct mon_data *priv, *tmp; + + lockdep_assert_held(&rdtgroup_mutex); + + list_for_each_entry_safe(priv, tmp, &mon_data_kn_priv_list, list) { + list_del(&priv->list); + kfree(priv); + } +} + +static void resctrl_fs_teardown(void) +{ + lockdep_assert_held(&rdtgroup_mutex); + + /* Cleared by rdtgroup_destroy_root() */ + if (!rdtgroup_default.kn) + return; + + rmdir_all_sub(); + mon_put_kn_priv(); + rdt_pseudo_lock_release(); + rdtgroup_default.mode = RDT_MODE_SHAREABLE; + closid_exit(); + schemata_list_destroy(); + rdtgroup_destroy_root(); +} + +static void rdt_kill_sb(struct super_block *sb) +{ + struct rdt_resource *r; + + cpus_read_lock(); + mutex_lock(&rdtgroup_mutex); + + rdt_disable_ctx(); + + /* Put everything back to default values. */ + for_each_alloc_capable_rdt_resource(r) + resctrl_arch_reset_all_ctrls(r); + + resctrl_fs_teardown(); + if (resctrl_arch_alloc_capable()) + resctrl_arch_disable_alloc(); + if (resctrl_arch_mon_capable()) + resctrl_arch_disable_mon(); + resctrl_mounted = false; + kernfs_kill_sb(sb); + mutex_unlock(&rdtgroup_mutex); + cpus_read_unlock(); +} + +static struct file_system_type rdt_fs_type = { + .name = "resctrl", + .init_fs_context = rdt_init_fs_context, + .parameters = rdt_fs_parameters, + .kill_sb = rdt_kill_sb, +}; + +static int mon_addfile(struct kernfs_node *parent_kn, const char *name, + void *priv) +{ + struct kernfs_node *kn; + int ret = 0; + + kn = __kernfs_create_file(parent_kn, name, 0444, + GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, 0, + &kf_mondata_ops, priv, NULL, NULL); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + kernfs_remove(kn); + return ret; + } + + return ret; +} + +static void mon_rmdir_one_subdir(struct kernfs_node *pkn, char *name, char *subname) +{ + struct kernfs_node *kn; + + kn = kernfs_find_and_get(pkn, name); + if (!kn) + return; + kernfs_put(kn); + + if (kn->dir.subdirs <= 1) + kernfs_remove(kn); + else + kernfs_remove_by_name(kn, subname); +} + +/* + * Remove all subdirectories of mon_data of ctrl_mon groups + * and monitor groups for the given domain. + * Remove files and directories containing "sum" of domain data + * when last domain being summed is removed. 
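+ * In SNC mode only the domain's "mon_sub_*" directory is removed; the
+ * summing "mon_<resource>_*" directory disappears together with its
+ * last remaining sub-directory.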
+ */ +static void rmdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_mon_domain *d) +{ + struct rdtgroup *prgrp, *crgrp; + char subname[32]; + bool snc_mode; + char name[32]; + + snc_mode = r->mon_scope == RESCTRL_L3_NODE; + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); + if (snc_mode) + sprintf(subname, "mon_sub_%s_%02d", r->name, d->hdr.id); + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + mon_rmdir_one_subdir(prgrp->mon.mon_data_kn, name, subname); + + list_for_each_entry(crgrp, &prgrp->mon.crdtgrp_list, mon.crdtgrp_list) + mon_rmdir_one_subdir(crgrp->mon.mon_data_kn, name, subname); + } +} + +static int mon_add_all_files(struct kernfs_node *kn, struct rdt_mon_domain *d, + struct rdt_resource *r, struct rdtgroup *prgrp, + bool do_sum) +{ + struct rmid_read rr = {0}; + struct mon_data *priv; + struct mon_evt *mevt; + int ret, domid; + + if (WARN_ON(list_empty(&r->evt_list))) + return -EPERM; + + list_for_each_entry(mevt, &r->evt_list, list) { + domid = do_sum ? d->ci->id : d->hdr.id; + priv = mon_get_kn_priv(r->rid, domid, mevt, do_sum); + if (WARN_ON_ONCE(!priv)) + return -EINVAL; + + ret = mon_addfile(kn, mevt->name, priv); + if (ret) + return ret; + + if (!do_sum && resctrl_is_mbm_event(mevt->evtid)) + mon_event_read(&rr, r, d, prgrp, &d->hdr.cpu_mask, mevt->evtid, true); + } + + return 0; +} + +static int mkdir_mondata_subdir(struct kernfs_node *parent_kn, + struct rdt_mon_domain *d, + struct rdt_resource *r, struct rdtgroup *prgrp) +{ + struct kernfs_node *kn, *ckn; + char name[32]; + bool snc_mode; + int ret = 0; + + lockdep_assert_held(&rdtgroup_mutex); + + snc_mode = r->mon_scope == RESCTRL_L3_NODE; + sprintf(name, "mon_%s_%02d", r->name, snc_mode ? d->ci->id : d->hdr.id); + kn = kernfs_find_and_get(parent_kn, name); + if (kn) { + /* + * rdtgroup_mutex will prevent this directory from being + * removed. No need to keep this hold. + */ + kernfs_put(kn); + } else { + kn = kernfs_create_dir(parent_kn, name, parent_kn->mode, prgrp); + if (IS_ERR(kn)) + return PTR_ERR(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) + goto out_destroy; + ret = mon_add_all_files(kn, d, r, prgrp, snc_mode); + if (ret) + goto out_destroy; + } + + if (snc_mode) { + sprintf(name, "mon_sub_%s_%02d", r->name, d->hdr.id); + ckn = kernfs_create_dir(kn, name, parent_kn->mode, prgrp); + if (IS_ERR(ckn)) { + ret = -EINVAL; + goto out_destroy; + } + + ret = rdtgroup_kn_set_ugid(ckn); + if (ret) + goto out_destroy; + + ret = mon_add_all_files(ckn, d, r, prgrp, false); + if (ret) + goto out_destroy; + } + + kernfs_activate(kn); + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +/* + * Add all subdirectories of mon_data for "ctrl_mon" groups + * and "monitor" groups with given domain id. 
+ */ +static void mkdir_mondata_subdir_allrdtgrp(struct rdt_resource *r, + struct rdt_mon_domain *d) +{ + struct kernfs_node *parent_kn; + struct rdtgroup *prgrp, *crgrp; + struct list_head *head; + + list_for_each_entry(prgrp, &rdt_all_groups, rdtgroup_list) { + parent_kn = prgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, prgrp); + + head = &prgrp->mon.crdtgrp_list; + list_for_each_entry(crgrp, head, mon.crdtgrp_list) { + parent_kn = crgrp->mon.mon_data_kn; + mkdir_mondata_subdir(parent_kn, d, r, crgrp); + } + } +} + +static int mkdir_mondata_subdir_alldom(struct kernfs_node *parent_kn, + struct rdt_resource *r, + struct rdtgroup *prgrp) +{ + struct rdt_mon_domain *dom; + int ret; + + /* Walking r->domains, ensure it can't race with cpuhp */ + lockdep_assert_cpus_held(); + + list_for_each_entry(dom, &r->mon_domains, hdr.list) { + ret = mkdir_mondata_subdir(parent_kn, dom, r, prgrp); + if (ret) + return ret; + } + + return 0; +} + +/* + * This creates a directory mon_data which contains the monitored data. + * + * mon_data has one directory for each domain which are named + * in the format mon_<domain_name>_<domain_id>. For ex: A mon_data + * with L3 domain looks as below: + * ./mon_data: + * mon_L3_00 + * mon_L3_01 + * mon_L3_02 + * ... + * + * Each domain directory has one file per event: + * ./mon_L3_00/: + * llc_occupancy + * + */ +static int mkdir_mondata_all(struct kernfs_node *parent_kn, + struct rdtgroup *prgrp, + struct kernfs_node **dest_kn) +{ + struct rdt_resource *r; + struct kernfs_node *kn; + int ret; + + /* + * Create the mon_data directory first. + */ + ret = mongroup_create_dir(parent_kn, prgrp, "mon_data", &kn); + if (ret) + return ret; + + if (dest_kn) + *dest_kn = kn; + + /* + * Create the subdirectories for each domain. Note that all events + * in a domain like L3 are grouped into a resource whose domain is L3 + */ + for_each_mon_capable_rdt_resource(r) { + ret = mkdir_mondata_subdir_alldom(kn, r, prgrp); + if (ret) + goto out_destroy; + } + + return 0; + +out_destroy: + kernfs_remove(kn); + return ret; +} + +/** + * cbm_ensure_valid - Enforce validity on provided CBM + * @_val: Candidate CBM + * @r: RDT resource to which the CBM belongs + * + * The provided CBM represents all cache portions available for use. This + * may be represented by a bitmap that does not consist of contiguous ones + * and thus be an invalid CBM. + * Here the provided CBM is forced to be a valid CBM by only considering + * the first set of contiguous bits as valid and clearing all bits. + * The intention here is to provide a valid default CBM with which a new + * resource group is initialized. The user can follow this with a + * modification to the CBM if the default does not satisfy the + * requirements. + */ +static u32 cbm_ensure_valid(u32 _val, struct rdt_resource *r) +{ + unsigned int cbm_len = r->cache.cbm_len; + unsigned long first_bit, zero_bit; + unsigned long val = _val; + + if (!val) + return 0; + + first_bit = find_first_bit(&val, cbm_len); + zero_bit = find_next_zero_bit(&val, cbm_len, first_bit); + + /* Clear any remaining bits to ensure contiguous region */ + bitmap_clear(&val, zero_bit, cbm_len - zero_bit); + return (u32)val; +} + +/* + * Initialize cache resources per RDT domain + * + * Set the RDT domain up to start off with all usable allocations. That is, + * all shareable and unused bits. All-zero CBM is invalid. 
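+ * Bits in use by exclusive or pseudo-locked groups (and, when CDP is
+ * enabled, by the peer configuration type) are left out, and the result
+ * is forced to be contiguous by cbm_ensure_valid().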
+ */ +static int __init_one_rdt_domain(struct rdt_ctrl_domain *d, struct resctrl_schema *s, + u32 closid) +{ + enum resctrl_conf_type peer_type = resctrl_peer_type(s->conf_type); + enum resctrl_conf_type t = s->conf_type; + struct resctrl_staged_config *cfg; + struct rdt_resource *r = s->res; + u32 used_b = 0, unused_b = 0; + unsigned long tmp_cbm; + enum rdtgrp_mode mode; + u32 peer_ctl, ctrl_val; + int i; + + cfg = &d->staged_config[t]; + cfg->have_new_ctrl = false; + cfg->new_ctrl = r->cache.shareable_bits; + used_b = r->cache.shareable_bits; + for (i = 0; i < closids_supported(); i++) { + if (closid_allocated(i) && i != closid) { + mode = rdtgroup_mode_by_closid(i); + if (mode == RDT_MODE_PSEUDO_LOCKSETUP) + /* + * ctrl values for locksetup aren't relevant + * until the schemata is written, and the mode + * becomes RDT_MODE_PSEUDO_LOCKED. + */ + continue; + /* + * If CDP is active include peer domain's + * usage to ensure there is no overlap + * with an exclusive group. + */ + if (resctrl_arch_get_cdp_enabled(r->rid)) + peer_ctl = resctrl_arch_get_config(r, d, i, + peer_type); + else + peer_ctl = 0; + ctrl_val = resctrl_arch_get_config(r, d, i, + s->conf_type); + used_b |= ctrl_val | peer_ctl; + if (mode == RDT_MODE_SHAREABLE) + cfg->new_ctrl |= ctrl_val | peer_ctl; + } + } + if (d->plr && d->plr->cbm > 0) + used_b |= d->plr->cbm; + unused_b = used_b ^ (BIT_MASK(r->cache.cbm_len) - 1); + unused_b &= BIT_MASK(r->cache.cbm_len) - 1; + cfg->new_ctrl |= unused_b; + /* + * Force the initial CBM to be valid, user can + * modify the CBM based on system availability. + */ + cfg->new_ctrl = cbm_ensure_valid(cfg->new_ctrl, r); + /* + * Assign the u32 CBM to an unsigned long to ensure that + * bitmap_weight() does not access out-of-bound memory. + */ + tmp_cbm = cfg->new_ctrl; + if (bitmap_weight(&tmp_cbm, r->cache.cbm_len) < r->cache.min_cbm_bits) { + rdt_last_cmd_printf("No space on %s:%d\n", s->name, d->hdr.id); + return -ENOSPC; + } + cfg->have_new_ctrl = true; + + return 0; +} + +/* + * Initialize cache resources with default values. + * + * A new RDT group is being created on an allocation capable (CAT) + * supporting system. Set this group up to start off with all usable + * allocations. + * + * If there are no more shareable bits available on any domain then + * the entire allocation will fail. + */ +static int rdtgroup_init_cat(struct resctrl_schema *s, u32 closid) +{ + struct rdt_ctrl_domain *d; + int ret; + + list_for_each_entry(d, &s->res->ctrl_domains, hdr.list) { + ret = __init_one_rdt_domain(d, s, closid); + if (ret < 0) + return ret; + } + + return 0; +} + +/* Initialize MBA resource with default values. */ +static void rdtgroup_init_mba(struct rdt_resource *r, u32 closid) +{ + struct resctrl_staged_config *cfg; + struct rdt_ctrl_domain *d; + + list_for_each_entry(d, &r->ctrl_domains, hdr.list) { + if (is_mba_sc(r)) { + d->mbps_val[closid] = MBA_MAX_MBPS; + continue; + } + + cfg = &d->staged_config[CDP_NONE]; + cfg->new_ctrl = resctrl_get_default_ctrl(r); + cfg->have_new_ctrl = true; + } +} + +/* Initialize the RDT group's allocations. 
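+ * Defaults are staged for each schema and written out with
+ * resctrl_arch_update_domains(); the MBA software-controller case only
+ * updates mbps_val. On success the group starts out in RDT_MODE_SHAREABLE.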
*/ +static int rdtgroup_init_alloc(struct rdtgroup *rdtgrp) +{ + struct resctrl_schema *s; + struct rdt_resource *r; + int ret = 0; + + rdt_staged_configs_clear(); + + list_for_each_entry(s, &resctrl_schema_all, list) { + r = s->res; + if (r->rid == RDT_RESOURCE_MBA || + r->rid == RDT_RESOURCE_SMBA) { + rdtgroup_init_mba(r, rdtgrp->closid); + if (is_mba_sc(r)) + continue; + } else { + ret = rdtgroup_init_cat(s, rdtgrp->closid); + if (ret < 0) + goto out; + } + + ret = resctrl_arch_update_domains(r, rdtgrp->closid); + if (ret < 0) { + rdt_last_cmd_puts("Failed to initialize allocations\n"); + goto out; + } + } + + rdtgrp->mode = RDT_MODE_SHAREABLE; + +out: + rdt_staged_configs_clear(); + return ret; +} + +static int mkdir_rdt_prepare_rmid_alloc(struct rdtgroup *rdtgrp) +{ + int ret; + + if (!resctrl_arch_mon_capable()) + return 0; + + ret = alloc_rmid(rdtgrp->closid); + if (ret < 0) { + rdt_last_cmd_puts("Out of RMIDs\n"); + return ret; + } + rdtgrp->mon.rmid = ret; + + ret = mkdir_mondata_all(rdtgrp->kn, rdtgrp, &rdtgrp->mon.mon_data_kn); + if (ret) { + rdt_last_cmd_puts("kernfs subdir error\n"); + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); + return ret; + } + + return 0; +} + +static void mkdir_rdt_prepare_rmid_free(struct rdtgroup *rgrp) +{ + if (resctrl_arch_mon_capable()) + free_rmid(rgrp->closid, rgrp->mon.rmid); +} + +/* + * We allow creating mon groups only with in a directory called "mon_groups" + * which is present in every ctrl_mon group. Check if this is a valid + * "mon_groups" directory. + * + * 1. The directory should be named "mon_groups". + * 2. The mon group itself should "not" be named "mon_groups". + * This makes sure "mon_groups" directory always has a ctrl_mon group + * as parent. + */ +static bool is_mon_groups(struct kernfs_node *kn, const char *name) +{ + return (!strcmp(rdt_kn_name(kn), "mon_groups") && + strcmp(name, "mon_groups")); +} + +static int mkdir_rdt_prepare(struct kernfs_node *parent_kn, + const char *name, umode_t mode, + enum rdt_group_type rtype, struct rdtgroup **r) +{ + struct rdtgroup *prdtgrp, *rdtgrp; + unsigned long files = 0; + struct kernfs_node *kn; + int ret; + + prdtgrp = rdtgroup_kn_lock_live(parent_kn); + if (!prdtgrp) { + ret = -ENODEV; + goto out_unlock; + } + + /* + * Check that the parent directory for a monitor group is a "mon_groups" + * directory. + */ + if (rtype == RDTMON_GROUP && !is_mon_groups(parent_kn, name)) { + ret = -EPERM; + goto out_unlock; + } + + if (rtype == RDTMON_GROUP && + (prdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + prdtgrp->mode == RDT_MODE_PSEUDO_LOCKED)) { + ret = -EINVAL; + rdt_last_cmd_puts("Pseudo-locking in progress\n"); + goto out_unlock; + } + + /* allocate the rdtgroup. */ + rdtgrp = kzalloc(sizeof(*rdtgrp), GFP_KERNEL); + if (!rdtgrp) { + ret = -ENOSPC; + rdt_last_cmd_puts("Kernel out of memory\n"); + goto out_unlock; + } + *r = rdtgrp; + rdtgrp->mon.parent = prdtgrp; + rdtgrp->type = rtype; + INIT_LIST_HEAD(&rdtgrp->mon.crdtgrp_list); + + /* kernfs creates the directory for rdtgrp */ + kn = kernfs_create_dir(parent_kn, name, mode, rdtgrp); + if (IS_ERR(kn)) { + ret = PTR_ERR(kn); + rdt_last_cmd_puts("kernfs create error\n"); + goto out_free_rgrp; + } + rdtgrp->kn = kn; + + /* + * kernfs_remove() will drop the reference count on "kn" which + * will free it. But we still need it to stick around for the + * rdtgroup_kn_unlock(kn) call. Take one extra reference here, + * which will be dropped by kernfs_put() in rdtgroup_remove(). 
+ */ + kernfs_get(kn); + + ret = rdtgroup_kn_set_ugid(kn); + if (ret) { + rdt_last_cmd_puts("kernfs perm error\n"); + goto out_destroy; + } + + if (rtype == RDTCTRL_GROUP) { + files = RFTYPE_BASE | RFTYPE_CTRL; + if (resctrl_arch_mon_capable()) + files |= RFTYPE_MON; + } else { + files = RFTYPE_BASE | RFTYPE_MON; + } + + ret = rdtgroup_add_files(kn, files); + if (ret) { + rdt_last_cmd_puts("kernfs fill error\n"); + goto out_destroy; + } + + /* + * The caller unlocks the parent_kn upon success. + */ + return 0; + +out_destroy: + kernfs_put(rdtgrp->kn); + kernfs_remove(rdtgrp->kn); +out_free_rgrp: + kfree(rdtgrp); +out_unlock: + rdtgroup_kn_unlock(parent_kn); + return ret; +} + +static void mkdir_rdt_prepare_clean(struct rdtgroup *rgrp) +{ + kernfs_remove(rgrp->kn); + rdtgroup_remove(rgrp); +} + +/* + * Create a monitor group under "mon_groups" directory of a control + * and monitor group(ctrl_mon). This is a resource group + * to monitor a subset of tasks and cpus in its parent ctrl_mon group. + */ +static int rdtgroup_mkdir_mon(struct kernfs_node *parent_kn, + const char *name, umode_t mode) +{ + struct rdtgroup *rdtgrp, *prgrp; + int ret; + + ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTMON_GROUP, &rdtgrp); + if (ret) + return ret; + + prgrp = rdtgrp->mon.parent; + rdtgrp->closid = prgrp->closid; + + ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); + if (ret) { + mkdir_rdt_prepare_clean(rdtgrp); + goto out_unlock; + } + + kernfs_activate(rdtgrp->kn); + + /* + * Add the rdtgrp to the list of rdtgrps the parent + * ctrl_mon group has to track. + */ + list_add_tail(&rdtgrp->mon.crdtgrp_list, &prgrp->mon.crdtgrp_list); + +out_unlock: + rdtgroup_kn_unlock(parent_kn); + return ret; +} + +/* + * These are rdtgroups created under the root directory. Can be used + * to allocate and monitor resources. + */ +static int rdtgroup_mkdir_ctrl_mon(struct kernfs_node *parent_kn, + const char *name, umode_t mode) +{ + struct rdtgroup *rdtgrp; + struct kernfs_node *kn; + u32 closid; + int ret; + + ret = mkdir_rdt_prepare(parent_kn, name, mode, RDTCTRL_GROUP, &rdtgrp); + if (ret) + return ret; + + kn = rdtgrp->kn; + ret = closid_alloc(); + if (ret < 0) { + rdt_last_cmd_puts("Out of CLOSIDs\n"); + goto out_common_fail; + } + closid = ret; + ret = 0; + + rdtgrp->closid = closid; + + ret = mkdir_rdt_prepare_rmid_alloc(rdtgrp); + if (ret) + goto out_closid_free; + + kernfs_activate(rdtgrp->kn); + + ret = rdtgroup_init_alloc(rdtgrp); + if (ret < 0) + goto out_rmid_free; + + list_add(&rdtgrp->rdtgroup_list, &rdt_all_groups); + + if (resctrl_arch_mon_capable()) { + /* + * Create an empty mon_groups directory to hold the subset + * of tasks and cpus to monitor. + */ + ret = mongroup_create_dir(kn, rdtgrp, "mon_groups", NULL); + if (ret) { + rdt_last_cmd_puts("kernfs subdir error\n"); + goto out_del_list; + } + if (is_mba_sc(NULL)) + rdtgrp->mba_mbps_event = mba_mbps_default_event; + } + + goto out_unlock; + +out_del_list: + list_del(&rdtgrp->rdtgroup_list); +out_rmid_free: + mkdir_rdt_prepare_rmid_free(rdtgrp); +out_closid_free: + closid_free(closid); +out_common_fail: + mkdir_rdt_prepare_clean(rdtgrp); +out_unlock: + rdtgroup_kn_unlock(parent_kn); + return ret; +} + +static int rdtgroup_mkdir(struct kernfs_node *parent_kn, const char *name, + umode_t mode) +{ + /* Do not accept '\n' to avoid unparsable situation. 
*/ + if (strchr(name, '\n')) + return -EINVAL; + + /* + * If the parent directory is the root directory and RDT + * allocation is supported, add a control and monitoring + * subdirectory + */ + if (resctrl_arch_alloc_capable() && parent_kn == rdtgroup_default.kn) + return rdtgroup_mkdir_ctrl_mon(parent_kn, name, mode); + + /* Else, attempt to add a monitoring subdirectory. */ + if (resctrl_arch_mon_capable()) + return rdtgroup_mkdir_mon(parent_kn, name, mode); + + return -EPERM; +} + +static int rdtgroup_rmdir_mon(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) +{ + struct rdtgroup *prdtgrp = rdtgrp->mon.parent; + u32 closid, rmid; + int cpu; + + /* Give any tasks back to the parent group */ + rdt_move_group_tasks(rdtgrp, prdtgrp, tmpmask); + + /* + * Update per cpu closid/rmid of the moved CPUs first. + * Note: the closid will not change, but the arch code still needs it. + */ + closid = prdtgrp->closid; + rmid = prdtgrp->mon.rmid; + for_each_cpu(cpu, &rdtgrp->cpu_mask) + resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid); + + /* + * Update the MSR on moved CPUs and CPUs which have moved + * task running on them. + */ + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); + update_closid_rmid(tmpmask, NULL); + + rdtgrp->flags = RDT_DELETED; + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); + + /* + * Remove the rdtgrp from the parent ctrl_mon group's list + */ + WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); + list_del(&rdtgrp->mon.crdtgrp_list); + + kernfs_remove(rdtgrp->kn); + + return 0; +} + +static int rdtgroup_ctrl_remove(struct rdtgroup *rdtgrp) +{ + rdtgrp->flags = RDT_DELETED; + list_del(&rdtgrp->rdtgroup_list); + + kernfs_remove(rdtgrp->kn); + return 0; +} + +static int rdtgroup_rmdir_ctrl(struct rdtgroup *rdtgrp, cpumask_var_t tmpmask) +{ + u32 closid, rmid; + int cpu; + + /* Give any tasks back to the default group */ + rdt_move_group_tasks(rdtgrp, &rdtgroup_default, tmpmask); + + /* Give any CPUs back to the default group */ + cpumask_or(&rdtgroup_default.cpu_mask, + &rdtgroup_default.cpu_mask, &rdtgrp->cpu_mask); + + /* Update per cpu closid and rmid of the moved CPUs first */ + closid = rdtgroup_default.closid; + rmid = rdtgroup_default.mon.rmid; + for_each_cpu(cpu, &rdtgrp->cpu_mask) + resctrl_arch_set_cpu_default_closid_rmid(cpu, closid, rmid); + + /* + * Update the MSR on moved CPUs and CPUs which have moved + * task running on them. + */ + cpumask_or(tmpmask, tmpmask, &rdtgrp->cpu_mask); + update_closid_rmid(tmpmask, NULL); + + free_rmid(rdtgrp->closid, rdtgrp->mon.rmid); + closid_free(rdtgrp->closid); + + rdtgroup_ctrl_remove(rdtgrp); + + /* + * Free all the child monitor group rmids. + */ + free_all_child_rdtgrp(rdtgrp); + + return 0; +} + +static struct kernfs_node *rdt_kn_parent(struct kernfs_node *kn) +{ + /* + * Valid within the RCU section it was obtained or while rdtgroup_mutex + * is held. + */ + return rcu_dereference_check(kn->__parent, lockdep_is_held(&rdtgroup_mutex)); +} + +static int rdtgroup_rmdir(struct kernfs_node *kn) +{ + struct kernfs_node *parent_kn; + struct rdtgroup *rdtgrp; + cpumask_var_t tmpmask; + int ret = 0; + + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) + return -ENOMEM; + + rdtgrp = rdtgroup_kn_lock_live(kn); + if (!rdtgrp) { + ret = -EPERM; + goto out; + } + parent_kn = rdt_kn_parent(kn); + + /* + * If the rdtgroup is a ctrl_mon group and parent directory + * is the root directory, remove the ctrl_mon group. + * + * If the rdtgroup is a mon group and parent directory + * is a valid "mon_groups" directory, remove the mon group. 
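rdtgroup_rmdir_mon() above hands every task in the removed monitor group back to its parent ctrl_mon group and frees only the RMID, since the CLOSID belongs to the parent. Seen from userspace (paths illustrative, resctrl assumed mounted at /sys/fs/resctrl):

#include <stdio.h>
#include <unistd.h>

int main(void)
{
        const char *tasks = "/sys/fs/resctrl/grp0/mon_groups/mon0/tasks";
        FILE *f = fopen(tasks, "w");

        if (!f) {
                perror(tasks);
                return 1;
        }
        /* Move the current process into the monitor group. */
        fprintf(f, "%d\n", (int)getpid());
        fclose(f);

        /*
         * Removing the group hands the task back to grp0: the RMID is
         * freed, the CLOSID is untouched because the parent owns it.
         */
        if (rmdir("/sys/fs/resctrl/grp0/mon_groups/mon0"))
                perror("rmdir");
        return 0;
}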
+ */ + if (rdtgrp->type == RDTCTRL_GROUP && parent_kn == rdtgroup_default.kn && + rdtgrp != &rdtgroup_default) { + if (rdtgrp->mode == RDT_MODE_PSEUDO_LOCKSETUP || + rdtgrp->mode == RDT_MODE_PSEUDO_LOCKED) { + ret = rdtgroup_ctrl_remove(rdtgrp); + } else { + ret = rdtgroup_rmdir_ctrl(rdtgrp, tmpmask); + } + } else if (rdtgrp->type == RDTMON_GROUP && + is_mon_groups(parent_kn, rdt_kn_name(kn))) { + ret = rdtgroup_rmdir_mon(rdtgrp, tmpmask); + } else { + ret = -EPERM; + } + +out: + rdtgroup_kn_unlock(kn); + free_cpumask_var(tmpmask); + return ret; +} + +/** + * mongrp_reparent() - replace parent CTRL_MON group of a MON group + * @rdtgrp: the MON group whose parent should be replaced + * @new_prdtgrp: replacement parent CTRL_MON group for @rdtgrp + * @cpus: cpumask provided by the caller for use during this call + * + * Replaces the parent CTRL_MON group for a MON group, resulting in all member + * tasks' CLOSID immediately changing to that of the new parent group. + * Monitoring data for the group is unaffected by this operation. + */ +static void mongrp_reparent(struct rdtgroup *rdtgrp, + struct rdtgroup *new_prdtgrp, + cpumask_var_t cpus) +{ + struct rdtgroup *prdtgrp = rdtgrp->mon.parent; + + WARN_ON(rdtgrp->type != RDTMON_GROUP); + WARN_ON(new_prdtgrp->type != RDTCTRL_GROUP); + + /* Nothing to do when simply renaming a MON group. */ + if (prdtgrp == new_prdtgrp) + return; + + WARN_ON(list_empty(&prdtgrp->mon.crdtgrp_list)); + list_move_tail(&rdtgrp->mon.crdtgrp_list, + &new_prdtgrp->mon.crdtgrp_list); + + rdtgrp->mon.parent = new_prdtgrp; + rdtgrp->closid = new_prdtgrp->closid; + + /* Propagate updated closid to all tasks in this group. */ + rdt_move_group_tasks(rdtgrp, rdtgrp, cpus); + + update_closid_rmid(cpus, NULL); +} + +static int rdtgroup_rename(struct kernfs_node *kn, + struct kernfs_node *new_parent, const char *new_name) +{ + struct kernfs_node *kn_parent; + struct rdtgroup *new_prdtgrp; + struct rdtgroup *rdtgrp; + cpumask_var_t tmpmask; + int ret; + + rdtgrp = kernfs_to_rdtgroup(kn); + new_prdtgrp = kernfs_to_rdtgroup(new_parent); + if (!rdtgrp || !new_prdtgrp) + return -ENOENT; + + /* Release both kernfs active_refs before obtaining rdtgroup mutex. */ + rdtgroup_kn_get(rdtgrp, kn); + rdtgroup_kn_get(new_prdtgrp, new_parent); + + mutex_lock(&rdtgroup_mutex); + + rdt_last_cmd_clear(); + + /* + * Don't allow kernfs_to_rdtgroup() to return a parent rdtgroup if + * either kernfs_node is a file. + */ + if (kernfs_type(kn) != KERNFS_DIR || + kernfs_type(new_parent) != KERNFS_DIR) { + rdt_last_cmd_puts("Source and destination must be directories"); + ret = -EPERM; + goto out; + } + + if ((rdtgrp->flags & RDT_DELETED) || (new_prdtgrp->flags & RDT_DELETED)) { + ret = -ENOENT; + goto out; + } + + kn_parent = rdt_kn_parent(kn); + if (rdtgrp->type != RDTMON_GROUP || !kn_parent || + !is_mon_groups(kn_parent, rdt_kn_name(kn))) { + rdt_last_cmd_puts("Source must be a MON group\n"); + ret = -EPERM; + goto out; + } + + if (!is_mon_groups(new_parent, new_name)) { + rdt_last_cmd_puts("Destination must be a mon_groups subdirectory\n"); + ret = -EPERM; + goto out; + } + + /* + * If the MON group is monitoring CPUs, the CPUs must be assigned to the + * current parent CTRL_MON group and therefore cannot be assigned to + * the new parent, making the move illegal. 
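mongrp_reparent() and rdtgroup_rename() above allow a MON group to be moved to another ctrl_mon group with rename(2), switching its tasks to the new parent's CLOSID while keeping its monitoring data. A userspace sketch with illustrative group names:

#include <stdio.h>

int main(void)
{
        /*
         * The source must be a MON group and the destination a
         * mon_groups subdirectory; the move is refused while the group
         * has CPUs assigned to it.
         */
        if (rename("/sys/fs/resctrl/grp0/mon_groups/mon0",
                   "/sys/fs/resctrl/grp1/mon_groups/mon0")) {
                perror("rename");
                return 1;
        }
        return 0;
}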
+ */ + if (!cpumask_empty(&rdtgrp->cpu_mask) && + rdtgrp->mon.parent != new_prdtgrp) { + rdt_last_cmd_puts("Cannot move a MON group that monitors CPUs\n"); + ret = -EPERM; + goto out; + } + + /* + * Allocate the cpumask for use in mongrp_reparent() to avoid the + * possibility of failing to allocate it after kernfs_rename() has + * succeeded. + */ + if (!zalloc_cpumask_var(&tmpmask, GFP_KERNEL)) { + ret = -ENOMEM; + goto out; + } + + /* + * Perform all input validation and allocations needed to ensure + * mongrp_reparent() will succeed before calling kernfs_rename(), + * otherwise it would be necessary to revert this call if + * mongrp_reparent() failed. + */ + ret = kernfs_rename(kn, new_parent, new_name); + if (!ret) + mongrp_reparent(rdtgrp, new_prdtgrp, tmpmask); + + free_cpumask_var(tmpmask); + +out: + mutex_unlock(&rdtgroup_mutex); + rdtgroup_kn_put(rdtgrp, kn); + rdtgroup_kn_put(new_prdtgrp, new_parent); + return ret; +} + +static int rdtgroup_show_options(struct seq_file *seq, struct kernfs_root *kf) +{ + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L3)) + seq_puts(seq, ",cdp"); + + if (resctrl_arch_get_cdp_enabled(RDT_RESOURCE_L2)) + seq_puts(seq, ",cdpl2"); + + if (is_mba_sc(resctrl_arch_get_resource(RDT_RESOURCE_MBA))) + seq_puts(seq, ",mba_MBps"); + + if (resctrl_debug) + seq_puts(seq, ",debug"); + + return 0; +} + +static struct kernfs_syscall_ops rdtgroup_kf_syscall_ops = { + .mkdir = rdtgroup_mkdir, + .rmdir = rdtgroup_rmdir, + .rename = rdtgroup_rename, + .show_options = rdtgroup_show_options, +}; + +static int rdtgroup_setup_root(struct rdt_fs_context *ctx) +{ + rdt_root = kernfs_create_root(&rdtgroup_kf_syscall_ops, + KERNFS_ROOT_CREATE_DEACTIVATED | + KERNFS_ROOT_EXTRA_OPEN_PERM_CHECK, + &rdtgroup_default); + if (IS_ERR(rdt_root)) + return PTR_ERR(rdt_root); + + ctx->kfc.root = rdt_root; + rdtgroup_default.kn = kernfs_root_to_node(rdt_root); + + return 0; +} + +static void rdtgroup_destroy_root(void) +{ + lockdep_assert_held(&rdtgroup_mutex); + + kernfs_destroy_root(rdt_root); + rdtgroup_default.kn = NULL; +} + +static void rdtgroup_setup_default(void) +{ + mutex_lock(&rdtgroup_mutex); + + rdtgroup_default.closid = RESCTRL_RESERVED_CLOSID; + rdtgroup_default.mon.rmid = RESCTRL_RESERVED_RMID; + rdtgroup_default.type = RDTCTRL_GROUP; + INIT_LIST_HEAD(&rdtgroup_default.mon.crdtgrp_list); + + list_add(&rdtgroup_default.rdtgroup_list, &rdt_all_groups); + + mutex_unlock(&rdtgroup_mutex); +} + +static void domain_destroy_mon_state(struct rdt_mon_domain *d) +{ + bitmap_free(d->rmid_busy_llc); + kfree(d->mbm_total); + kfree(d->mbm_local); +} + +void resctrl_offline_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) +{ + mutex_lock(&rdtgroup_mutex); + + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) + mba_sc_domain_destroy(r, d); + + mutex_unlock(&rdtgroup_mutex); +} + +void resctrl_offline_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + mutex_lock(&rdtgroup_mutex); + + /* + * If resctrl is mounted, remove all the + * per domain monitor data directories. + */ + if (resctrl_mounted && resctrl_arch_mon_capable()) + rmdir_mondata_subdir_allrdtgrp(r, d); + + if (resctrl_is_mbm_enabled()) + cancel_delayed_work(&d->mbm_over); + if (resctrl_arch_is_llc_occupancy_enabled() && has_busy_rmid(d)) { + /* + * When a package is going down, forcefully + * decrement rmid->ebusy. 
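rdtgroup_show_options() above lists the mount options resctrl reports (cdp, cdpl2, mba_MBps, debug); the same strings are what a mount(2) caller passes in. A sketch, assuming the corresponding CDP and MBA software-controller features are available on the running system:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        /* Option string mirrors what rdtgroup_show_options() reports. */
        if (mount("resctrl", "/sys/fs/resctrl", "resctrl", 0,
                  "cdp,mba_MBps")) {
                perror("mount resctrl");
                return 1;
        }
        return 0;
}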
There is no way to know + * that the L3 was flushed and hence may lead to + * incorrect counts in rare scenarios, but leaving + * the RMID as busy creates RMID leaks if the + * package never comes back. + */ + __check_limbo(d, true); + cancel_delayed_work(&d->cqm_limbo); + } + + domain_destroy_mon_state(d); + + mutex_unlock(&rdtgroup_mutex); +} + +/** + * domain_setup_mon_state() - Initialise domain monitoring structures. + * @r: The resource for the newly online domain. + * @d: The newly online domain. + * + * Allocate monitor resources that belong to this domain. + * Called when the first CPU of a domain comes online, regardless of whether + * the filesystem is mounted. + * During boot this may be called before global allocations have been made by + * resctrl_mon_resource_init(). + * + * Returns 0 for success, or -ENOMEM. + */ +static int domain_setup_mon_state(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + u32 idx_limit = resctrl_arch_system_num_rmid_idx(); + size_t tsize; + + if (resctrl_arch_is_llc_occupancy_enabled()) { + d->rmid_busy_llc = bitmap_zalloc(idx_limit, GFP_KERNEL); + if (!d->rmid_busy_llc) + return -ENOMEM; + } + if (resctrl_arch_is_mbm_total_enabled()) { + tsize = sizeof(*d->mbm_total); + d->mbm_total = kcalloc(idx_limit, tsize, GFP_KERNEL); + if (!d->mbm_total) { + bitmap_free(d->rmid_busy_llc); + return -ENOMEM; + } + } + if (resctrl_arch_is_mbm_local_enabled()) { + tsize = sizeof(*d->mbm_local); + d->mbm_local = kcalloc(idx_limit, tsize, GFP_KERNEL); + if (!d->mbm_local) { + bitmap_free(d->rmid_busy_llc); + kfree(d->mbm_total); + return -ENOMEM; + } + } + + return 0; +} + +int resctrl_online_ctrl_domain(struct rdt_resource *r, struct rdt_ctrl_domain *d) +{ + int err = 0; + + mutex_lock(&rdtgroup_mutex); + + if (supports_mba_mbps() && r->rid == RDT_RESOURCE_MBA) { + /* RDT_RESOURCE_MBA is never mon_capable */ + err = mba_sc_domain_allocate(r, d); + } + + mutex_unlock(&rdtgroup_mutex); + + return err; +} + +int resctrl_online_mon_domain(struct rdt_resource *r, struct rdt_mon_domain *d) +{ + int err; + + mutex_lock(&rdtgroup_mutex); + + err = domain_setup_mon_state(r, d); + if (err) + goto out_unlock; + + if (resctrl_is_mbm_enabled()) { + INIT_DELAYED_WORK(&d->mbm_over, mbm_handle_overflow); + mbm_setup_overflow_handler(d, MBM_OVERFLOW_INTERVAL, + RESCTRL_PICK_ANY_CPU); + } + + if (resctrl_arch_is_llc_occupancy_enabled()) + INIT_DELAYED_WORK(&d->cqm_limbo, cqm_handle_limbo); + + /* + * If the filesystem is not mounted then only the default resource group + * exists. Creation of its directories is deferred until mount time + * by rdt_get_tree() calling mkdir_mondata_all(). + * If resctrl is mounted, add per domain monitor data directories. + */ + if (resctrl_mounted && resctrl_arch_mon_capable()) + mkdir_mondata_subdir_allrdtgrp(r, d); + +out_unlock: + mutex_unlock(&rdtgroup_mutex); + + return err; +} + +void resctrl_online_cpu(unsigned int cpu) +{ + mutex_lock(&rdtgroup_mutex); + /* The CPU is set in default rdtgroup after online. 
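domain_setup_mon_state() above sizes each per-domain buffer by the RMID index limit and rolls back the earlier allocations when a later one fails. A standalone sketch of the same allocate-or-unwind shape, with illustrative field names and limits:

#include <stdlib.h>

struct mon_state {
        unsigned long *rmid_busy;
        unsigned long *mbm_total;
        unsigned long *mbm_local;
};

static int mon_state_alloc(struct mon_state *s, size_t idx_limit)
{
        s->rmid_busy = calloc(idx_limit, sizeof(*s->rmid_busy));
        if (!s->rmid_busy)
                return -1;
        s->mbm_total = calloc(idx_limit, sizeof(*s->mbm_total));
        if (!s->mbm_total)
                goto err_busy;
        s->mbm_local = calloc(idx_limit, sizeof(*s->mbm_local));
        if (!s->mbm_local)
                goto err_total;
        return 0;

err_total:
        free(s->mbm_total);
err_busy:
        free(s->rmid_busy);
        return -1;
}

int main(void)
{
        struct mon_state s;

        return mon_state_alloc(&s, 256) ? 1 : 0;
}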
*/ + cpumask_set_cpu(cpu, &rdtgroup_default.cpu_mask); + mutex_unlock(&rdtgroup_mutex); +} + +static void clear_childcpus(struct rdtgroup *r, unsigned int cpu) +{ + struct rdtgroup *cr; + + list_for_each_entry(cr, &r->mon.crdtgrp_list, mon.crdtgrp_list) { + if (cpumask_test_and_clear_cpu(cpu, &cr->cpu_mask)) + break; + } +} + +static struct rdt_mon_domain *get_mon_domain_from_cpu(int cpu, + struct rdt_resource *r) +{ + struct rdt_mon_domain *d; + + lockdep_assert_cpus_held(); + + list_for_each_entry(d, &r->mon_domains, hdr.list) { + /* Find the domain that contains this CPU */ + if (cpumask_test_cpu(cpu, &d->hdr.cpu_mask)) + return d; + } + + return NULL; +} + +void resctrl_offline_cpu(unsigned int cpu) +{ + struct rdt_resource *l3 = resctrl_arch_get_resource(RDT_RESOURCE_L3); + struct rdt_mon_domain *d; + struct rdtgroup *rdtgrp; + + mutex_lock(&rdtgroup_mutex); + list_for_each_entry(rdtgrp, &rdt_all_groups, rdtgroup_list) { + if (cpumask_test_and_clear_cpu(cpu, &rdtgrp->cpu_mask)) { + clear_childcpus(rdtgrp, cpu); + break; + } + } + + if (!l3->mon_capable) + goto out_unlock; + + d = get_mon_domain_from_cpu(cpu, l3); + if (d) { + if (resctrl_is_mbm_enabled() && cpu == d->mbm_work_cpu) { + cancel_delayed_work(&d->mbm_over); + mbm_setup_overflow_handler(d, 0, cpu); + } + if (resctrl_arch_is_llc_occupancy_enabled() && + cpu == d->cqm_work_cpu && has_busy_rmid(d)) { + cancel_delayed_work(&d->cqm_limbo); + cqm_setup_limbo_handler(d, 0, cpu); + } + } + +out_unlock: + mutex_unlock(&rdtgroup_mutex); +} + +/* + * resctrl_init - resctrl filesystem initialization + * + * Setup resctrl file system including set up root, create mount point, + * register resctrl filesystem, and initialize files under root directory. + * + * Return: 0 on success or -errno + */ +int resctrl_init(void) +{ + int ret = 0; + + seq_buf_init(&last_cmd_status, last_cmd_status_buf, + sizeof(last_cmd_status_buf)); + + rdtgroup_setup_default(); + + thread_throttle_mode_init(); + + ret = resctrl_mon_resource_init(); + if (ret) + return ret; + + ret = sysfs_create_mount_point(fs_kobj, "resctrl"); + if (ret) { + resctrl_mon_resource_exit(); + return ret; + } + + ret = register_filesystem(&rdt_fs_type); + if (ret) + goto cleanup_mountpoint; + + /* + * Adding the resctrl debugfs directory here may not be ideal since + * it would let the resctrl debugfs directory appear on the debugfs + * filesystem before the resctrl filesystem is mounted. + * It may also be ok since that would enable debugging of RDT before + * resctrl is mounted. + * The reason why the debugfs directory is created here and not in + * rdt_get_tree() is because rdt_get_tree() takes rdtgroup_mutex and + * during the debugfs directory creation also &sb->s_type->i_mutex_key + * (the lockdep class of inode->i_rwsem). Other filesystem + * interactions (eg. SyS_getdents) have the lock ordering: + * &sb->s_type->i_mutex_key --> &mm->mmap_lock + * During mmap(), called with &mm->mmap_lock, the rdtgroup_mutex + * is taken, thus creating dependency: + * &mm->mmap_lock --> rdtgroup_mutex for the latter that can cause + * issues considering the other two lock dependencies. + * By creating the debugfs directory here we avoid a dependency + * that may cause deadlock (even though file operations cannot + * occur until the filesystem is mounted, but I do not know how to + * tell lockdep that). 
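The comment above describes a potential lock-order cycle: i_mutex -> mmap_lock, mmap_lock -> rdtgroup_mutex, and, had the debugfs directory been created under rdtgroup_mutex, rdtgroup_mutex -> i_mutex. A toy pthread illustration of why such a cycle can deadlock (the mutex names only mirror the kernel locks; this is not kernel code):

#include <pthread.h>
#include <stdio.h>

static pthread_mutex_t i_mutex = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t mmap_lock = PTHREAD_MUTEX_INITIALIZER;
static pthread_mutex_t rdt_mutex = PTHREAD_MUTEX_INITIALIZER;

/* i_mutex -> mmap_lock, as in the getdents ordering cited above. */
static void *reader(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&i_mutex);
        pthread_mutex_lock(&mmap_lock);
        pthread_mutex_unlock(&mmap_lock);
        pthread_mutex_unlock(&i_mutex);
        return NULL;
}

/* mmap_lock -> rdt_mutex, as in the mmap() ordering cited above. */
static void *mapper(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&mmap_lock);
        pthread_mutex_lock(&rdt_mutex);
        pthread_mutex_unlock(&rdt_mutex);
        pthread_mutex_unlock(&mmap_lock);
        return NULL;
}

/* rdt_mutex -> i_mutex: the edge the patch avoids; it closes the cycle. */
static void *mounter(void *arg)
{
        (void)arg;
        pthread_mutex_lock(&rdt_mutex);
        pthread_mutex_lock(&i_mutex);
        pthread_mutex_unlock(&i_mutex);
        pthread_mutex_unlock(&rdt_mutex);
        return NULL;
}

int main(void)
{
        pthread_t t[3];

        pthread_create(&t[0], NULL, reader, NULL);
        pthread_create(&t[1], NULL, mapper, NULL);
        pthread_create(&t[2], NULL, mounter, NULL);     /* may deadlock */
        for (int i = 0; i < 3; i++)
                pthread_join(t[i], NULL);
        puts("no deadlock this run (the outcome is timing dependent)");
        return 0;
}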
+ */ + debugfs_resctrl = debugfs_create_dir("resctrl", NULL); + + return 0; + +cleanup_mountpoint: + sysfs_remove_mount_point(fs_kobj, "resctrl"); + resctrl_mon_resource_exit(); + + return ret; +} + +static bool resctrl_online_domains_exist(void) +{ + struct rdt_resource *r; + + /* + * Only walk capable resources to allow resctrl_arch_get_resource() + * to return dummy 'not capable' resources. + */ + for_each_alloc_capable_rdt_resource(r) { + if (!list_empty(&r->ctrl_domains)) + return true; + } + + for_each_mon_capable_rdt_resource(r) { + if (!list_empty(&r->mon_domains)) + return true; + } + + return false; +} + +/** + * resctrl_exit() - Remove the resctrl filesystem and free resources. + * + * Called by the architecture code in response to a fatal error. + * Removes resctrl files and structures from kernfs to prevent further + * configuration. + * + * When called by the architecture code, all CPUs and resctrl domains must be + * offline. This ensures the limbo and overflow handlers are not scheduled to + * run, meaning the data structures they access can be freed by + * resctrl_mon_resource_exit(). + * + * After resctrl_exit() returns, the architecture code should return an + * error from all resctrl_arch_ functions that can do this. + * resctrl_arch_get_resource() must continue to return struct rdt_resources + * with the correct rid field to ensure the filesystem can be unmounted. + */ +void resctrl_exit(void) +{ + cpus_read_lock(); + WARN_ON_ONCE(resctrl_online_domains_exist()); + + mutex_lock(&rdtgroup_mutex); + resctrl_fs_teardown(); + mutex_unlock(&rdtgroup_mutex); + + cpus_read_unlock(); + + debugfs_remove_recursive(debugfs_resctrl); + debugfs_resctrl = NULL; + unregister_filesystem(&rdt_fs_type); + + /* + * Do not remove the sysfs mount point added by resctrl_init() so that + * it can be used to umount resctrl. 
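As the comment above notes, the sysfs mount point is left in place so an already-mounted resctrl instance can still be unmounted after resctrl_exit(); from userspace that is a single umount2() call:

#include <stdio.h>
#include <sys/mount.h>

int main(void)
{
        if (umount2("/sys/fs/resctrl", 0)) {
                perror("umount resctrl");
                return 1;
        }
        return 0;
}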
+ */ + + resctrl_mon_resource_exit(); +} diff --git a/fs/smb/client/cached_dir.c b/fs/smb/client/cached_dir.c index 89d2dbbb742c..5200a0f3cafc 100644 --- a/fs/smb/client/cached_dir.c +++ b/fs/smb/client/cached_dir.c @@ -155,6 +155,7 @@ int open_cached_dir(unsigned int xid, struct cifs_tcon *tcon, struct cached_fids *cfids; const char *npath; int retries = 0, cur_sleep = 1; + __le32 lease_flags = 0; if (cifs_sb->root == NULL) return -ENOENT; @@ -201,6 +202,8 @@ replay_again: } spin_unlock(&cfids->cfid_list_lock); + pfid = &cfid->fid; + /* * Skip any prefix paths in @path as lookup_noperm_positive_unlocked() ends up * calling ->lookup() which already adds those through @@ -222,6 +225,25 @@ replay_again: rc = -ENOENT; goto out; } + if (dentry->d_parent && server->dialect >= SMB30_PROT_ID) { + struct cached_fid *parent_cfid; + + spin_lock(&cfids->cfid_list_lock); + list_for_each_entry(parent_cfid, &cfids->entries, entry) { + if (parent_cfid->dentry == dentry->d_parent) { + cifs_dbg(FYI, "found a parent cached file handle\n"); + if (parent_cfid->has_lease && parent_cfid->time) { + lease_flags + |= SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE; + memcpy(pfid->parent_lease_key, + parent_cfid->fid.lease_key, + SMB2_LEASE_KEY_SIZE); + } + break; + } + } + spin_unlock(&cfids->cfid_list_lock); + } } cfid->dentry = dentry; cfid->tcon = tcon; @@ -236,7 +258,6 @@ replay_again: if (smb3_encryption_required(tcon)) flags |= CIFS_TRANSFORM_REQ; - pfid = &cfid->fid; server->ops->new_lease_key(pfid); memset(rqst, 0, sizeof(rqst)); @@ -256,6 +277,7 @@ replay_again: FILE_READ_EA, .disposition = FILE_OPEN, .fid = pfid, + .lease_flags = lease_flags, .replay = !!(retries), }; diff --git a/fs/smb/client/cifsfs.c b/fs/smb/client/cifsfs.c index fb04e263611c..0a5266ecfd15 100644 --- a/fs/smb/client/cifsfs.c +++ b/fs/smb/client/cifsfs.c @@ -70,7 +70,6 @@ bool require_gcm_256; /* false by default */ bool enable_negotiate_signing; /* false by default */ unsigned int global_secflags = CIFSSEC_DEF; /* unsigned int ntlmv2_support = 0; */ -unsigned int sign_CIFS_PDUs = 1; /* * Global transaction id (XID) information diff --git a/fs/smb/client/cifsglob.h b/fs/smb/client/cifsglob.h index 3b32116b0b49..ad7dd16db3e9 100644 --- a/fs/smb/client/cifsglob.h +++ b/fs/smb/client/cifsglob.h @@ -556,7 +556,7 @@ struct smb_version_operations { void (*set_oplock_level)(struct cifsInodeInfo *cinode, __u32 oplock, __u16 epoch, bool *purge_cache); /* create lease context buffer for CREATE request */ - char * (*create_lease_buf)(u8 *lease_key, u8 oplock); + char * (*create_lease_buf)(u8 *lease_key, u8 oplock, u8 *parent_lease_key, __le32 le_flags); /* parse lease context buffer and return oplock/epoch info */ __u8 (*parse_lease_buf)(void *buf, __u16 *epoch, char *lkey); ssize_t (*copychunk_range)(const unsigned int, @@ -773,6 +773,7 @@ struct TCP_Server_Info { char workstation_RFC1001_name[RFC1001_NAME_LEN_WITH_NULL]; __u32 sequence_number; /* for signing, protected by srv_mutex */ __u32 reconnect_instance; /* incremented on each reconnect */ + __le32 session_key_id; /* retrieved from negotiate response and send in session setup request */ struct session_key session_key; unsigned long lstrp; /* when we got last response from this server */ struct cifs_secmech secmech; /* crypto sec mech functs, descriptors */ @@ -1441,6 +1442,7 @@ struct cifs_open_parms { bool reconnect:1; bool replay:1; /* indicates that this open is for a replay */ struct kvec *ea_cctx; + __le32 lease_flags; }; struct cifs_fid { @@ -1448,6 +1450,7 @@ struct cifs_fid { __u64 
persistent_fid; /* persist file id for smb2 */ __u64 volatile_fid; /* volatile file id for smb2 */ __u8 lease_key[SMB2_LEASE_KEY_SIZE]; /* lease key for smb2 */ + __u8 parent_lease_key[SMB2_LEASE_KEY_SIZE]; __u8 create_guid[16]; __u32 access; struct cifs_pending_open *pending_open; @@ -1988,8 +1991,7 @@ require use of the stronger protocol */ * TCP_Server_Info-> TCP_Server_Info cifs_get_tcp_session * reconnect_mutex * TCP_Server_Info->srv_mutex TCP_Server_Info cifs_get_tcp_session - * cifs_ses->session_mutex cifs_ses sesInfoAlloc - * cifs_tcon + * cifs_ses->session_mutex cifs_ses sesInfoAlloc * cifs_tcon->open_file_lock cifs_tcon->openFileList tconInfoAlloc * cifs_tcon->pending_opens * cifs_tcon->stat_lock cifs_tcon->bytes_read tconInfoAlloc @@ -2008,21 +2010,25 @@ require use of the stronger protocol */ * ->oplock_credits * ->reconnect_instance * cifs_ses->ses_lock (anything that is not protected by another lock and can change) + * sesInfoAlloc * cifs_ses->iface_lock cifs_ses->iface_list sesInfoAlloc * ->iface_count * ->iface_last_update - * cifs_ses->chan_lock cifs_ses->chans + * cifs_ses->chan_lock cifs_ses->chans sesInfoAlloc * ->chans_need_reconnect * ->chans_in_reconnect * cifs_tcon->tc_lock (anything that is not protected by another lock and can change) + * tcon_info_alloc * inode->i_rwsem, taken by fs/netfs/locking.c e.g. should be taken before cifsInodeInfo locks * cifsInodeInfo->open_file_lock cifsInodeInfo->openFileList cifs_alloc_inode * cifsInodeInfo->writers_lock cifsInodeInfo->writers cifsInodeInfo_alloc * cifsInodeInfo->lock_sem cifsInodeInfo->llist cifs_init_once * ->can_cache_brlcks * cifsInodeInfo->deferred_lock cifsInodeInfo->deferred_closes cifsInodeInfo_alloc - * cached_fids->cfid_list_lock cifs_tcon->cfids->entries init_cached_dirs - * cifsFileInfo->fh_mutex cifsFileInfo cifs_new_fileinfo + * cached_fids->cfid_list_lock cifs_tcon->cfids->entries init_cached_dirs + * cached_fid->fid_lock (anything that is not protected by another lock and can change) + * init_cached_dir + * cifsFileInfo->fh_mutex cifsFileInfo cifs_new_fileinfo * cifsFileInfo->file_info_lock cifsFileInfo->count cifs_new_fileinfo * ->invalidHandle initiate_cifs_search * ->oplock_break_cancelled diff --git a/fs/smb/client/cifspdu.h b/fs/smb/client/cifspdu.h index 1b79fe07476f..d9cf7db0ac35 100644 --- a/fs/smb/client/cifspdu.h +++ b/fs/smb/client/cifspdu.h @@ -597,7 +597,7 @@ typedef union smb_com_session_setup_andx { __le16 MaxBufferSize; __le16 MaxMpxCount; __le16 VcNumber; - __u32 SessionKey; + __le32 SessionKey; __le16 SecurityBlobLength; __u32 Reserved; __le32 Capabilities; /* see below */ @@ -616,7 +616,7 @@ typedef union smb_com_session_setup_andx { __le16 MaxBufferSize; __le16 MaxMpxCount; __le16 VcNumber; - __u32 SessionKey; + __le32 SessionKey; __le16 CaseInsensitivePasswordLength; /* ASCII password len */ __le16 CaseSensitivePasswordLength; /* Unicode password length*/ __u32 Reserved; /* see below */ @@ -654,7 +654,7 @@ typedef union smb_com_session_setup_andx { __le16 MaxBufferSize; __le16 MaxMpxCount; __le16 VcNumber; - __u32 SessionKey; + __le32 SessionKey; __le16 PasswordLength; __u32 Reserved; /* encrypt key len and offset */ __le16 ByteCount; diff --git a/fs/smb/client/cifsproto.h b/fs/smb/client/cifsproto.h index ecf774a8f1ca..66093fa78aed 100644 --- a/fs/smb/client/cifsproto.h +++ b/fs/smb/client/cifsproto.h @@ -151,8 +151,7 @@ extern bool is_size_safe_to_change(struct cifsInodeInfo *cifsInode, __u64 eof, bool from_readdir); extern void cifs_update_eof(struct cifsInodeInfo *cifsi, 
loff_t offset, unsigned int bytes_written); -void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result, - bool was_async); +void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result); extern struct cifsFileInfo *find_writable_file(struct cifsInodeInfo *, int); extern int cifs_get_writable_file(struct cifsInodeInfo *cifs_inode, int flags, diff --git a/fs/smb/client/cifssmb.c b/fs/smb/client/cifssmb.c index f55457b4b82e..7216fcec79e8 100644 --- a/fs/smb/client/cifssmb.c +++ b/fs/smb/client/cifssmb.c @@ -498,6 +498,7 @@ CIFSSMBNegotiate(const unsigned int xid, server->max_rw = le32_to_cpu(pSMBr->MaxRawSize); cifs_dbg(NOISY, "Max buf = %d\n", ses->server->maxBuf); server->capabilities = le32_to_cpu(pSMBr->Capabilities); + server->session_key_id = pSMBr->SessionKey; server->timeAdj = (int)(__s16)le16_to_cpu(pSMBr->ServerTimeZone); server->timeAdj *= 60; @@ -1725,7 +1726,7 @@ cifs_writev_callback(struct mid_q_entry *mid) server->credits, server->in_flight, 0, cifs_trace_rw_credits_write_response_clear); wdata->credits.value = 0; - cifs_write_subrequest_terminated(wdata, result, true); + cifs_write_subrequest_terminated(wdata, result); release_mid(mid); trace_smb3_rw_credits(credits.rreq_debug_id, credits.rreq_debug_index, 0, server->credits, server->in_flight, @@ -1813,7 +1814,7 @@ async_writev_out: out: if (rc) { add_credits_and_wake_if(wdata->server, &wdata->credits, 0); - cifs_write_subrequest_terminated(wdata, rc, false); + cifs_write_subrequest_terminated(wdata, rc); } } @@ -2753,10 +2754,10 @@ int cifs_query_reparse_point(const unsigned int xid, io_req->TotalParameterCount = 0; io_req->TotalDataCount = 0; - io_req->MaxParameterCount = cpu_to_le32(2); + io_req->MaxParameterCount = cpu_to_le32(0); /* BB find exact data count max from sess structure BB */ io_req->MaxDataCount = cpu_to_le32(CIFSMaxBufSize & 0xFFFFFF00); - io_req->MaxSetupCount = 4; + io_req->MaxSetupCount = 1; io_req->Reserved = 0; io_req->ParameterOffset = 0; io_req->DataCount = 0; @@ -2783,6 +2784,22 @@ int cifs_query_reparse_point(const unsigned int xid, goto error; } + /* SetupCount must be 1, otherwise offset to ByteCount is incorrect. */ + if (io_rsp->SetupCount != 1) { + rc = -EIO; + goto error; + } + + /* + * ReturnedDataLen is output length of executed IOCTL. + * DataCount is output length transferred over network. + * Check that we have full FSCTL_GET_REPARSE_POINT buffer. + */ + if (data_count != le16_to_cpu(io_rsp->ReturnedDataLen)) { + rc = -EIO; + goto error; + } + end = 2 + get_bcc(&io_rsp->hdr) + (__u8 *)&io_rsp->ByteCount; start = (__u8 *)&io_rsp->hdr.Protocol + data_offset; if (start >= end) { diff --git a/fs/smb/client/connect.c b/fs/smb/client/connect.c index 6bf04d9a5491..024817d40c5f 100644 --- a/fs/smb/client/connect.c +++ b/fs/smb/client/connect.c @@ -377,6 +377,13 @@ static int __cifs_reconnect(struct TCP_Server_Info *server, if (!cifs_tcp_ses_needs_reconnect(server, 1)) return 0; + /* + * if smb session has been marked for reconnect, also reconnect all + * connections. This way, the other connections do not end up bad. 
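The SMB1 SessionKey handling earlier in this patch treats the field as __le32 wire data: the value from the negotiate response is stored in server->session_key_id and echoed back in the session setup request without conversion, and only code that interprets it would byte-swap. A userspace-style sketch of that distinction, using glibc's htole32()/le32toh() and an illustrative struct layout:

#include <endian.h>     /* glibc's htole32()/le32toh() */
#include <stdint.h>
#include <stdio.h>

struct negotiate_rsp {
        uint32_t session_key;   /* little-endian on the wire */
};

struct session_setup_req {
        uint32_t session_key;   /* echoed back, still little-endian */
};

int main(void)
{
        struct negotiate_rsp rsp = { .session_key = htole32(0x11223344) };
        struct session_setup_req req;

        /* Echoing the opaque wire value needs no conversion ... */
        req.session_key = rsp.session_key;

        /* ... a byte swap is only needed when the host interprets it. */
        printf("session key (host order): 0x%08x\n",
               (unsigned int)le32toh(req.session_key));
        return 0;
}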
+ */ + if (mark_smb_session) + cifs_signal_cifsd_for_reconnect(server, mark_smb_session); + cifs_mark_tcp_ses_conns_for_reconnect(server, mark_smb_session); cifs_abort_connection(server); @@ -385,7 +392,8 @@ static int __cifs_reconnect(struct TCP_Server_Info *server, try_to_freeze(); cifs_server_lock(server); - if (!cifs_swn_set_server_dstaddr(server)) { + if (!cifs_swn_set_server_dstaddr(server) && + !SERVER_IS_CHAN(server)) { /* resolve the hostname again to make sure that IP address is up-to-date */ rc = reconn_set_ipaddr_from_hostname(server); cifs_dbg(FYI, "%s: reconn_set_ipaddr_from_hostname: rc=%d\n", __func__, rc); diff --git a/fs/smb/client/dir.c b/fs/smb/client/dir.c index d1e95632ac54..1c6e5389c51f 100644 --- a/fs/smb/client/dir.c +++ b/fs/smb/client/dir.c @@ -23,6 +23,7 @@ #include "fs_context.h" #include "cifs_ioctl.h" #include "fscache.h" +#include "cached_dir.h" static void renew_parental_timestamps(struct dentry *direntry) @@ -190,6 +191,7 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned struct TCP_Server_Info *server = tcon->ses->server; struct cifs_open_parms oparms; int rdwr_for_fscache = 0; + __le32 lease_flags = 0; *oplock = 0; if (tcon->ses->server->oplocks) @@ -312,6 +314,26 @@ static int cifs_do_create(struct inode *inode, struct dentry *direntry, unsigned create_options |= CREATE_OPTION_READONLY; retry_open: + if (tcon->cfids && direntry->d_parent && server->dialect >= SMB30_PROT_ID) { + struct cached_fid *parent_cfid; + + spin_lock(&tcon->cfids->cfid_list_lock); + list_for_each_entry(parent_cfid, &tcon->cfids->entries, entry) { + if (parent_cfid->dentry == direntry->d_parent) { + cifs_dbg(FYI, "found a parent cached file handle\n"); + if (parent_cfid->has_lease && parent_cfid->time) { + lease_flags + |= SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE; + memcpy(fid->parent_lease_key, + parent_cfid->fid.lease_key, + SMB2_LEASE_KEY_SIZE); + } + break; + } + } + spin_unlock(&tcon->cfids->cfid_list_lock); + } + oparms = (struct cifs_open_parms) { .tcon = tcon, .cifs_sb = cifs_sb, @@ -320,6 +342,7 @@ retry_open: .disposition = disposition, .path = full_path, .fid = fid, + .lease_flags = lease_flags, .mode = mode, }; rc = server->ops->open(xid, &oparms, oplock, buf); diff --git a/fs/smb/client/file.c b/fs/smb/client/file.c index 950aa4f912f5..d2df10b8e6fd 100644 --- a/fs/smb/client/file.c +++ b/fs/smb/client/file.c @@ -130,7 +130,7 @@ fail: else trace_netfs_sreq(subreq, netfs_sreq_trace_fail); add_credits_and_wake_if(wdata->server, &wdata->credits, 0); - cifs_write_subrequest_terminated(wdata, rc, false); + cifs_write_subrequest_terminated(wdata, rc); goto out; } @@ -219,7 +219,8 @@ static void cifs_issue_read(struct netfs_io_subrequest *subreq) goto failed; } - if (subreq->rreq->origin != NETFS_DIO_READ) + if (subreq->rreq->origin != NETFS_UNBUFFERED_READ && + subreq->rreq->origin != NETFS_DIO_READ) __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); trace_netfs_sreq(subreq, netfs_sreq_trace_submit); @@ -2423,8 +2424,7 @@ int cifs_lock(struct file *file, int cmd, struct file_lock *flock) return rc; } -void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result, - bool was_async) +void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t result) { struct netfs_io_request *wreq = wdata->rreq; struct netfs_inode *ictx = netfs_inode(wreq->inode); @@ -2441,7 +2441,7 @@ void cifs_write_subrequest_terminated(struct cifs_io_subrequest *wdata, ssize_t netfs_resize_file(ictx, wrend, true); } - 
netfs_write_subrequest_terminated(&wdata->subreq, result, was_async); + netfs_write_subrequest_terminated(&wdata->subreq, result); } struct cifsFileInfo *find_readable_file(struct cifsInodeInfo *cifs_inode, diff --git a/fs/smb/client/misc.c b/fs/smb/client/misc.c index 7b6ed9b23e71..e77017f47084 100644 --- a/fs/smb/client/misc.c +++ b/fs/smb/client/misc.c @@ -326,6 +326,14 @@ check_smb_hdr(struct smb_hdr *smb) if (smb->Command == SMB_COM_LOCKING_ANDX) return 0; + /* + * Windows NT server returns error resposne (e.g. STATUS_DELETE_PENDING + * or STATUS_OBJECT_NAME_NOT_FOUND or ERRDOS/ERRbadfile or any other) + * for some TRANS2 requests without the RESPONSE flag set in header. + */ + if (smb->Command == SMB_COM_TRANSACTION2 && smb->Status.CifsError != 0) + return 0; + cifs_dbg(VFS, "Server sent request, not response. mid=%u\n", get_mid(smb)); return 1; diff --git a/fs/smb/client/namespace.c b/fs/smb/client/namespace.c index e3f9213131c4..52a520349cb7 100644 --- a/fs/smb/client/namespace.c +++ b/fs/smb/client/namespace.c @@ -146,6 +146,9 @@ static char *automount_fullpath(struct dentry *dentry, void *page) } spin_unlock(&tcon->tc_lock); + if (unlikely(!page)) + return ERR_PTR(-ENOMEM); + s = dentry_path_raw(dentry, page, PATH_MAX); if (IS_ERR(s)) return s; @@ -283,7 +286,6 @@ struct vfsmount *cifs_d_automount(struct path *path) return newmnt; } - mntget(newmnt); /* prevent immediate expiration */ mnt_set_expiry(newmnt, &cifs_automount_list); schedule_delayed_work(&cifs_automount_task, cifs_mountpoint_expiry_timeout); diff --git a/fs/smb/client/sess.c b/fs/smb/client/sess.c index b3fa9ee26912..ec0db32c7d98 100644 --- a/fs/smb/client/sess.c +++ b/fs/smb/client/sess.c @@ -445,6 +445,10 @@ cifs_chan_update_iface(struct cifs_ses *ses, struct TCP_Server_Info *server) ses->chans[chan_index].iface = iface; spin_unlock(&ses->chan_lock); + + spin_lock(&server->srv_lock); + memcpy(&server->dstaddr, &iface->sockaddr, sizeof(server->dstaddr)); + spin_unlock(&server->srv_lock); } static int @@ -628,6 +632,7 @@ static __u32 cifs_ssetup_hdr(struct cifs_ses *ses, USHRT_MAX)); pSMB->req.MaxMpxCount = cpu_to_le16(server->maxReq); pSMB->req.VcNumber = cpu_to_le16(1); + pSMB->req.SessionKey = server->session_key_id; /* Now no need to set SMBFLG_CASELESS or obsolete CANONICAL PATH */ @@ -1684,22 +1689,22 @@ _sess_auth_rawntlmssp_assemble_req(struct sess_data *sess_data) pSMB = (SESSION_SETUP_ANDX *)sess_data->iov[0].iov_base; capabilities = cifs_ssetup_hdr(ses, server, pSMB); - if ((pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) == 0) { - cifs_dbg(VFS, "NTLMSSP requires Unicode support\n"); - return -ENOSYS; - } - pSMB->req.hdr.Flags2 |= SMBFLG2_EXT_SEC; capabilities |= CAP_EXTENDED_SECURITY; pSMB->req.Capabilities |= cpu_to_le32(capabilities); bcc_ptr = sess_data->iov[2].iov_base; - /* unicode strings must be word aligned */ - if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) { - *bcc_ptr = 0; - bcc_ptr++; + + if (pSMB->req.hdr.Flags2 & SMBFLG2_UNICODE) { + /* unicode strings must be word aligned */ + if (!IS_ALIGNED(sess_data->iov[0].iov_len + sess_data->iov[1].iov_len, 2)) { + *bcc_ptr = 0; + bcc_ptr++; + } + unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp); + } else { + ascii_oslm_strings(&bcc_ptr, sess_data->nls_cp); } - unicode_oslm_strings(&bcc_ptr, sess_data->nls_cp); sess_data->iov[2].iov_len = (long) bcc_ptr - (long) sess_data->iov[2].iov_base; diff --git a/fs/smb/client/smb2ops.c b/fs/smb/client/smb2ops.c index 2fe8eeb98535..bab9f567d9b7 100644 --- a/fs/smb/client/smb2ops.c +++ 
b/fs/smb/client/smb2ops.c @@ -4069,7 +4069,7 @@ map_oplock_to_lease(u8 oplock) } static char * -smb2_create_lease_buf(u8 *lease_key, u8 oplock) +smb2_create_lease_buf(u8 *lease_key, u8 oplock, u8 *parent_lease_key, __le32 flags) { struct create_lease *buf; @@ -4095,7 +4095,7 @@ smb2_create_lease_buf(u8 *lease_key, u8 oplock) } static char * -smb3_create_lease_buf(u8 *lease_key, u8 oplock) +smb3_create_lease_buf(u8 *lease_key, u8 oplock, u8 *parent_lease_key, __le32 flags) { struct create_lease_v2 *buf; @@ -4105,6 +4105,9 @@ smb3_create_lease_buf(u8 *lease_key, u8 oplock) memcpy(&buf->lcontext.LeaseKey, lease_key, SMB2_LEASE_KEY_SIZE); buf->lcontext.LeaseState = map_oplock_to_lease(oplock); + buf->lcontext.LeaseFlags = flags; + if (flags & SMB2_LEASE_FLAG_PARENT_LEASE_KEY_SET_LE) + memcpy(&buf->lcontext.ParentLeaseKey, parent_lease_key, SMB2_LEASE_KEY_SIZE); buf->ccontext.DataOffset = cpu_to_le16(offsetof (struct create_lease_v2, lcontext)); diff --git a/fs/smb/client/smb2pdu.c b/fs/smb/client/smb2pdu.c index 4e28632b5fd6..0c320d06809c 100644 --- a/fs/smb/client/smb2pdu.c +++ b/fs/smb/client/smb2pdu.c @@ -2392,11 +2392,16 @@ static int add_lease_context(struct TCP_Server_Info *server, struct smb2_create_req *req, struct kvec *iov, - unsigned int *num_iovec, u8 *lease_key, __u8 *oplock) + unsigned int *num_iovec, + u8 *lease_key, + __u8 *oplock, + u8 *parent_lease_key, + __le32 flags) { unsigned int num = *num_iovec; - iov[num].iov_base = server->ops->create_lease_buf(lease_key, *oplock); + iov[num].iov_base = server->ops->create_lease_buf(lease_key, *oplock, + parent_lease_key, flags); if (iov[num].iov_base == NULL) return -ENOMEM; iov[num].iov_len = server->vals->create_lease_size; @@ -3069,7 +3074,9 @@ SMB2_open_init(struct cifs_tcon *tcon, struct TCP_Server_Info *server, req->RequestedOplockLevel = *oplock; /* no srv lease support */ else { rc = add_lease_context(server, req, iov, &n_iov, - oparms->fid->lease_key, oplock); + oparms->fid->lease_key, oplock, + oparms->fid->parent_lease_key, + oparms->lease_flags); if (rc) return rc; } @@ -4888,7 +4895,7 @@ smb2_writev_callback(struct mid_q_entry *mid) 0, cifs_trace_rw_credits_write_response_clear); wdata->credits.value = 0; trace_netfs_sreq(&wdata->subreq, netfs_sreq_trace_io_progress); - cifs_write_subrequest_terminated(wdata, result ?: written, true); + cifs_write_subrequest_terminated(wdata, result ?: written); release_mid(mid); trace_smb3_rw_credits(rreq_debug_id, subreq_debug_index, 0, server->credits, server->in_flight, @@ -5061,7 +5068,7 @@ out: -(int)wdata->credits.value, cifs_trace_rw_credits_write_response_clear); add_credits_and_wake_if(wdata->server, &wdata->credits, 0); - cifs_write_subrequest_terminated(wdata, rc, true); + cifs_write_subrequest_terminated(wdata, rc); } } @@ -5917,71 +5924,6 @@ posix_qfsinf_exit: } int -SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_fid, u64 volatile_fid, struct kstatfs *fsdata) -{ - struct smb_rqst rqst; - struct smb2_query_info_rsp *rsp = NULL; - struct kvec iov; - struct kvec rsp_iov; - int rc = 0; - int resp_buftype; - struct cifs_ses *ses = tcon->ses; - struct TCP_Server_Info *server; - struct smb2_fs_full_size_info *info = NULL; - int flags = 0; - int retries = 0, cur_sleep = 1; - -replay_again: - /* reinitialize for possible replay */ - flags = 0; - server = cifs_pick_channel(ses); - - rc = build_qfs_info_req(&iov, tcon, server, - FS_FULL_SIZE_INFORMATION, - sizeof(struct smb2_fs_full_size_info), - persistent_fid, volatile_fid); - if (rc) - return rc; - - 
if (smb3_encryption_required(tcon)) - flags |= CIFS_TRANSFORM_REQ; - - memset(&rqst, 0, sizeof(struct smb_rqst)); - rqst.rq_iov = &iov; - rqst.rq_nvec = 1; - - if (retries) - smb2_set_replay(server, &rqst); - - rc = cifs_send_recv(xid, ses, server, - &rqst, &resp_buftype, flags, &rsp_iov); - free_qfs_info_req(&iov); - if (rc) { - cifs_stats_fail_inc(tcon, SMB2_QUERY_INFO_HE); - goto qfsinf_exit; - } - rsp = (struct smb2_query_info_rsp *)rsp_iov.iov_base; - - info = (struct smb2_fs_full_size_info *)( - le16_to_cpu(rsp->OutputBufferOffset) + (char *)rsp); - rc = smb2_validate_iov(le16_to_cpu(rsp->OutputBufferOffset), - le32_to_cpu(rsp->OutputBufferLength), &rsp_iov, - sizeof(struct smb2_fs_full_size_info)); - if (!rc) - smb2_copy_fs_info_to_kstatfs(info, fsdata); - -qfsinf_exit: - free_rsp_buf(resp_buftype, rsp_iov.iov_base); - - if (is_replayable_error(rc) && - smb2_should_replay(tcon, &retries, &cur_sleep)) - goto replay_again; - - return rc; -} - -int SMB2_QFS_attr(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_fid, u64 volatile_fid, int level) { diff --git a/fs/smb/client/smb2proto.h b/fs/smb/client/smb2proto.h index 4662c7e2d259..035aa1624053 100644 --- a/fs/smb/client/smb2proto.h +++ b/fs/smb/client/smb2proto.h @@ -259,9 +259,6 @@ extern int smb2_handle_cancelled_close(struct cifs_tcon *tcon, __u64 volatile_fid); extern int smb2_handle_cancelled_mid(struct mid_q_entry *mid, struct TCP_Server_Info *server); void smb2_cancelled_close_fid(struct work_struct *work); -extern int SMB2_QFS_info(const unsigned int xid, struct cifs_tcon *tcon, - u64 persistent_file_id, u64 volatile_file_id, - struct kstatfs *FSData); extern int SMB311_posix_qfs_info(const unsigned int xid, struct cifs_tcon *tcon, u64 persistent_file_id, u64 volatile_file_id, struct kstatfs *FSData); diff --git a/fs/smb/server/Kconfig b/fs/smb/server/Kconfig index cf70e96ad4de..4a23a5e7e8fe 100644 --- a/fs/smb/server/Kconfig +++ b/fs/smb/server/Kconfig @@ -11,6 +11,7 @@ config SMB_SERVER select CRYPTO_HMAC select CRYPTO_ECB select CRYPTO_LIB_DES + select CRYPTO_LIB_SHA256 select CRYPTO_SHA256 select CRYPTO_CMAC select CRYPTO_SHA512 diff --git a/fs/smb/server/auth.c b/fs/smb/server/auth.c index b3d121052408..d99871c21451 100644 --- a/fs/smb/server/auth.c +++ b/fs/smb/server/auth.c @@ -979,40 +979,6 @@ out: return rc; } -int ksmbd_gen_sd_hash(struct ksmbd_conn *conn, char *sd_buf, int len, - __u8 *pi_hash) -{ - int rc; - struct ksmbd_crypto_ctx *ctx = NULL; - - ctx = ksmbd_crypto_ctx_find_sha256(); - if (!ctx) { - ksmbd_debug(AUTH, "could not alloc sha256\n"); - return -ENOMEM; - } - - rc = crypto_shash_init(CRYPTO_SHA256(ctx)); - if (rc) { - ksmbd_debug(AUTH, "could not init shashn"); - goto out; - } - - rc = crypto_shash_update(CRYPTO_SHA256(ctx), sd_buf, len); - if (rc) { - ksmbd_debug(AUTH, "could not update with n\n"); - goto out; - } - - rc = crypto_shash_final(CRYPTO_SHA256(ctx), pi_hash); - if (rc) { - ksmbd_debug(AUTH, "Could not generate hash err : %d\n", rc); - goto out; - } -out: - ksmbd_release_crypto_ctx(ctx); - return rc; -} - static int ksmbd_get_encryption_key(struct ksmbd_work *work, __u64 ses_id, int enc, u8 *key) { diff --git a/fs/smb/server/auth.h b/fs/smb/server/auth.h index 362b6159a6cf..6879a1bd1b91 100644 --- a/fs/smb/server/auth.h +++ b/fs/smb/server/auth.h @@ -66,6 +66,4 @@ int ksmbd_gen_smb311_encryptionkey(struct ksmbd_conn *conn, struct ksmbd_session *sess); int ksmbd_gen_preauth_integrity_hash(struct ksmbd_conn *conn, char *buf, __u8 *pi_hash); -int ksmbd_gen_sd_hash(struct ksmbd_conn 
*conn, char *sd_buf, int len, - __u8 *pi_hash); #endif diff --git a/fs/smb/server/connection.h b/fs/smb/server/connection.h index 14620e147dda..6efed923bd68 100644 --- a/fs/smb/server/connection.h +++ b/fs/smb/server/connection.h @@ -108,6 +108,7 @@ struct ksmbd_conn { __le16 signing_algorithm; bool binding; atomic_t refcnt; + bool is_aapl; }; struct ksmbd_conn_ops { diff --git a/fs/smb/server/crypto_ctx.c b/fs/smb/server/crypto_ctx.c index ce733dc9a4a3..80bd68c8635e 100644 --- a/fs/smb/server/crypto_ctx.c +++ b/fs/smb/server/crypto_ctx.c @@ -75,9 +75,6 @@ static struct shash_desc *alloc_shash_desc(int id) case CRYPTO_SHASH_CMACAES: tfm = crypto_alloc_shash("cmac(aes)", 0, 0); break; - case CRYPTO_SHASH_SHA256: - tfm = crypto_alloc_shash("sha256", 0, 0); - break; case CRYPTO_SHASH_SHA512: tfm = crypto_alloc_shash("sha512", 0, 0); break; @@ -198,11 +195,6 @@ struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_cmacaes(void) return ____crypto_shash_ctx_find(CRYPTO_SHASH_CMACAES); } -struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha256(void) -{ - return ____crypto_shash_ctx_find(CRYPTO_SHASH_SHA256); -} - struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha512(void) { return ____crypto_shash_ctx_find(CRYPTO_SHASH_SHA512); diff --git a/fs/smb/server/crypto_ctx.h b/fs/smb/server/crypto_ctx.h index 4a367c62f653..ac64801d52d3 100644 --- a/fs/smb/server/crypto_ctx.h +++ b/fs/smb/server/crypto_ctx.h @@ -13,7 +13,6 @@ enum { CRYPTO_SHASH_HMACMD5 = 0, CRYPTO_SHASH_HMACSHA256, CRYPTO_SHASH_CMACAES, - CRYPTO_SHASH_SHA256, CRYPTO_SHASH_SHA512, CRYPTO_SHASH_MAX, }; @@ -39,14 +38,12 @@ struct ksmbd_crypto_ctx { #define CRYPTO_HMACMD5(c) ((c)->desc[CRYPTO_SHASH_HMACMD5]) #define CRYPTO_HMACSHA256(c) ((c)->desc[CRYPTO_SHASH_HMACSHA256]) #define CRYPTO_CMACAES(c) ((c)->desc[CRYPTO_SHASH_CMACAES]) -#define CRYPTO_SHA256(c) ((c)->desc[CRYPTO_SHASH_SHA256]) #define CRYPTO_SHA512(c) ((c)->desc[CRYPTO_SHASH_SHA512]) #define CRYPTO_HMACMD5_TFM(c) ((c)->desc[CRYPTO_SHASH_HMACMD5]->tfm) #define CRYPTO_HMACSHA256_TFM(c)\ ((c)->desc[CRYPTO_SHASH_HMACSHA256]->tfm) #define CRYPTO_CMACAES_TFM(c) ((c)->desc[CRYPTO_SHASH_CMACAES]->tfm) -#define CRYPTO_SHA256_TFM(c) ((c)->desc[CRYPTO_SHASH_SHA256]->tfm) #define CRYPTO_SHA512_TFM(c) ((c)->desc[CRYPTO_SHASH_SHA512]->tfm) #define CRYPTO_GCM(c) ((c)->ccmaes[CRYPTO_AEAD_AES_GCM]) @@ -57,7 +54,6 @@ struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_hmacmd5(void); struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_hmacsha256(void); struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_cmacaes(void); struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha512(void); -struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_sha256(void); struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_gcm(void); struct ksmbd_crypto_ctx *ksmbd_crypto_ctx_find_ccm(void); void ksmbd_crypto_destroy(void); diff --git a/fs/smb/server/server.c b/fs/smb/server/server.c index ab533c602987..8c9c49c3a0a4 100644 --- a/fs/smb/server/server.c +++ b/fs/smb/server/server.c @@ -631,6 +631,5 @@ MODULE_SOFTDEP("pre: sha512"); MODULE_SOFTDEP("pre: aead2"); MODULE_SOFTDEP("pre: ccm"); MODULE_SOFTDEP("pre: gcm"); -MODULE_SOFTDEP("pre: crc32"); module_init(ksmbd_server_init) module_exit(ksmbd_server_exit) diff --git a/fs/smb/server/smb2pdu.c b/fs/smb/server/smb2pdu.c index 8d414239b3fe..1a308171b599 100644 --- a/fs/smb/server/smb2pdu.c +++ b/fs/smb/server/smb2pdu.c @@ -2874,7 +2874,7 @@ int smb2_open(struct ksmbd_work *work) int req_op_level = 0, open_flags = 0, may_flags = 0, file_info = 0; int rc = 0; int contxt_cnt = 0, query_disk_id = 0; - int 
maximal_access_ctxt = 0, posix_ctxt = 0; + bool maximal_access_ctxt = false, posix_ctxt = false; int s_type = 0; int next_off = 0; char *name = NULL; @@ -2903,6 +2903,27 @@ int smb2_open(struct ksmbd_work *work) return create_smb2_pipe(work); } + if (req->CreateContextsOffset && tcon->posix_extensions) { + context = smb2_find_context_vals(req, SMB2_CREATE_TAG_POSIX, 16); + if (IS_ERR(context)) { + rc = PTR_ERR(context); + goto err_out2; + } else if (context) { + struct create_posix *posix = (struct create_posix *)context; + + if (le16_to_cpu(context->DataOffset) + + le32_to_cpu(context->DataLength) < + sizeof(struct create_posix) - 4) { + rc = -EINVAL; + goto err_out2; + } + ksmbd_debug(SMB, "get posix context\n"); + + posix_mode = le32_to_cpu(posix->Mode); + posix_ctxt = true; + } + } + if (req->NameLength) { name = smb2_get_name((char *)req + le16_to_cpu(req->NameOffset), le16_to_cpu(req->NameLength), @@ -2925,9 +2946,11 @@ int smb2_open(struct ksmbd_work *work) goto err_out2; } - rc = ksmbd_validate_filename(name); - if (rc < 0) - goto err_out2; + if (posix_ctxt == false) { + rc = ksmbd_validate_filename(name); + if (rc < 0) + goto err_out2; + } if (ksmbd_share_veto_filename(share, name)) { rc = -ENOENT; @@ -3085,28 +3108,6 @@ int smb2_open(struct ksmbd_work *work) rc = -EBADF; goto err_out2; } - - if (tcon->posix_extensions) { - context = smb2_find_context_vals(req, - SMB2_CREATE_TAG_POSIX, 16); - if (IS_ERR(context)) { - rc = PTR_ERR(context); - goto err_out2; - } else if (context) { - struct create_posix *posix = - (struct create_posix *)context; - if (le16_to_cpu(context->DataOffset) + - le32_to_cpu(context->DataLength) < - sizeof(struct create_posix) - 4) { - rc = -EINVAL; - goto err_out2; - } - ksmbd_debug(SMB, "get posix context\n"); - - posix_mode = le32_to_cpu(posix->Mode); - posix_ctxt = 1; - } - } } if (ksmbd_override_fsids(work)) { @@ -3539,6 +3540,15 @@ int smb2_open(struct ksmbd_work *work) ksmbd_debug(SMB, "get query on disk id context\n"); query_disk_id = 1; } + + if (conn->is_aapl == false) { + context = smb2_find_context_vals(req, SMB2_CREATE_AAPL, 4); + if (IS_ERR(context)) { + rc = PTR_ERR(context); + goto err_out1; + } else if (context) + conn->is_aapl = true; + } } rc = ksmbd_vfs_getattr(&path, &stat); @@ -3978,7 +3988,10 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level, if (dinfo->EaSize) dinfo->ExtFileAttributes = FILE_ATTRIBUTE_REPARSE_POINT_LE; dinfo->Reserved = 0; - dinfo->UniqueId = cpu_to_le64(ksmbd_kstat->kstat->ino); + if (conn->is_aapl) + dinfo->UniqueId = 0; + else + dinfo->UniqueId = cpu_to_le64(ksmbd_kstat->kstat->ino); if (d_info->hide_dot_file && d_info->name[0] == '.') dinfo->ExtFileAttributes |= FILE_ATTRIBUTE_HIDDEN_LE; memcpy(dinfo->FileName, conv_name, conv_len); @@ -3995,7 +4008,10 @@ static int smb2_populate_readdir_entry(struct ksmbd_conn *conn, int info_level, smb2_get_reparse_tag_special_file(ksmbd_kstat->kstat->mode); if (fibdinfo->EaSize) fibdinfo->ExtFileAttributes = FILE_ATTRIBUTE_REPARSE_POINT_LE; - fibdinfo->UniqueId = cpu_to_le64(ksmbd_kstat->kstat->ino); + if (conn->is_aapl) + fibdinfo->UniqueId = 0; + else + fibdinfo->UniqueId = cpu_to_le64(ksmbd_kstat->kstat->ino); fibdinfo->ShortNameLength = 0; fibdinfo->Reserved = 0; fibdinfo->Reserved2 = cpu_to_le16(0); diff --git a/fs/smb/server/smb2pdu.h b/fs/smb/server/smb2pdu.h index 17a0b18a8406..16ae8a10490b 100644 --- a/fs/smb/server/smb2pdu.h +++ b/fs/smb/server/smb2pdu.h @@ -63,6 +63,9 @@ struct preauth_integrity_info { #define SMB2_SESSION_TIMEOUT (10 
* HZ) +/* Apple Defined Contexts */ +#define SMB2_CREATE_AAPL "AAPL" + struct create_durable_req_v2 { struct create_context_hdr ccontext; __u8 Name[8]; diff --git a/fs/smb/server/vfs.c b/fs/smb/server/vfs.c index baf0d3031a44..ba45e809555a 100644 --- a/fs/smb/server/vfs.c +++ b/fs/smb/server/vfs.c @@ -4,6 +4,7 @@ * Copyright (C) 2018 Samsung Electronics Co., Ltd. */ +#include <crypto/sha2.h> #include <linux/kernel.h> #include <linux/fs.h> #include <linux/filelock.h> @@ -1476,11 +1477,7 @@ int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, acl.sd_buf = (char *)pntsd; acl.sd_size = len; - rc = ksmbd_gen_sd_hash(conn, acl.sd_buf, acl.sd_size, acl.hash); - if (rc) { - pr_err("failed to generate hash for ndr acl\n"); - return rc; - } + sha256(acl.sd_buf, acl.sd_size, acl.hash); smb_acl = ksmbd_vfs_make_xattr_posix_acl(idmap, inode, ACL_TYPE_ACCESS); @@ -1495,12 +1492,7 @@ int ksmbd_vfs_set_sd_xattr(struct ksmbd_conn *conn, goto out; } - rc = ksmbd_gen_sd_hash(conn, acl_ndr.data, acl_ndr.offset, - acl.posix_acl_hash); - if (rc) { - pr_err("failed to generate hash for ndr acl\n"); - goto out; - } + sha256(acl_ndr.data, acl_ndr.offset, acl.posix_acl_hash); rc = ndr_encode_v4_ntacl(&sd_ndr, &acl); if (rc) { @@ -1557,11 +1549,7 @@ int ksmbd_vfs_get_sd_xattr(struct ksmbd_conn *conn, goto out_free; } - rc = ksmbd_gen_sd_hash(conn, acl_ndr.data, acl_ndr.offset, cmp_hash); - if (rc) { - pr_err("failed to generate hash for ndr acl\n"); - goto out_free; - } + sha256(acl_ndr.data, acl_ndr.offset, cmp_hash); if (memcmp(cmp_hash, acl.posix_acl_hash, XATTR_SD_HASH_SIZE)) { pr_err("hash value diff\n"); diff --git a/fs/squashfs/Kconfig b/fs/squashfs/Kconfig index b1091e70434a..a9602aae21ef 100644 --- a/fs/squashfs/Kconfig +++ b/fs/squashfs/Kconfig @@ -149,6 +149,27 @@ config SQUASHFS_XATTR If unsure, say N. +config SQUASHFS_COMP_CACHE_FULL + bool "Enable full caching of compressed blocks" + depends on SQUASHFS + default n + help + This option enables caching of all compressed blocks, Without caching, + repeated reads of the same files trigger excessive disk I/O, significantly + reducinng performance in workloads like fio-based benchmarks. + + For example, fio tests (iodepth=1, numjobs=1, ioengine=psync) show: + With caching: IOPS=2223, BW=278MiB/s (291MB/s) + Without caching: IOPS=815, BW=102MiB/s (107MB/s) + + Enabling this option restores performance to pre-regression levels by + caching all compressed blocks in the page cache, reducing disk I/O for + repeated reads. However, this increases memory usage, which may be a + concern in memory-constrained environments. + + Enable this option if your workload involves frequent repeated reads and + memory usage is not a limiting factor. If unsure, say N. 
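The ksmbd vfs.c changes above replace the crypto_shash boilerplate with the one-shot sha256() library helper and later compare the stored hash with memcmp(). A userspace sketch of the same hash-and-compare check, substituting OpenSSL's one-shot SHA256() (an assumption of this sketch) for the kernel helper:

#include <stdio.h>
#include <string.h>
#include <openssl/sha.h>        /* assumption: OpenSSL's one-shot SHA256() */

int main(void)
{
        const unsigned char blob[] = "ndr-encoded posix acl (illustrative)";
        unsigned char stored[SHA256_DIGEST_LENGTH];
        unsigned char check[SHA256_DIGEST_LENGTH];

        /* Hash at store time ... */
        SHA256(blob, sizeof(blob) - 1, stored);

        /* ... and again at read time; a mismatch means the xattr is stale. */
        SHA256(blob, sizeof(blob) - 1, check);
        if (memcmp(stored, check, SHA256_DIGEST_LENGTH))
                fprintf(stderr, "hash value diff\n");
        else
                printf("acl hash matches\n");
        return 0;
}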
+ config SQUASHFS_ZLIB bool "Include support for ZLIB compressed file systems" depends on SQUASHFS diff --git a/fs/squashfs/block.c b/fs/squashfs/block.c index 2dc730800f44..3061043e915c 100644 --- a/fs/squashfs/block.c +++ b/fs/squashfs/block.c @@ -88,6 +88,10 @@ static int squashfs_bio_read_cached(struct bio *fullbio, struct bio_vec *bv; int idx = 0; int err = 0; +#ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL + struct page **cache_pages = kmalloc_array(page_count, + sizeof(void *), GFP_KERNEL | __GFP_ZERO); +#endif bio_for_each_segment_all(bv, fullbio, iter_all) { struct page *page = bv->bv_page; @@ -110,6 +114,11 @@ static int squashfs_bio_read_cached(struct bio *fullbio, head_to_cache = page; else if (idx == page_count - 1 && index + length != read_end) tail_to_cache = page; +#ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL + /* Cache all pages in the BIO for repeated reads */ + else if (cache_pages) + cache_pages[idx] = page; +#endif if (!bio || idx != end_idx) { struct bio *new = bio_alloc_clone(bdev, fullbio, @@ -163,6 +172,25 @@ static int squashfs_bio_read_cached(struct bio *fullbio, } } +#ifdef CONFIG_SQUASHFS_COMP_CACHE_FULL + if (!cache_pages) + goto out; + + for (idx = 0; idx < page_count; idx++) { + if (!cache_pages[idx]) + continue; + int ret = add_to_page_cache_lru(cache_pages[idx], cache_mapping, + (read_start >> PAGE_SHIFT) + idx, + GFP_NOIO); + + if (!ret) { + SetPageUptodate(cache_pages[idx]); + unlock_page(cache_pages[idx]); + } + } + kfree(cache_pages); +out: +#endif return 0; } diff --git a/fs/squashfs/super.c b/fs/squashfs/super.c index 67c55fe32ce8..992ea0e37257 100644 --- a/fs/squashfs/super.c +++ b/fs/squashfs/super.c @@ -202,6 +202,11 @@ static int squashfs_fill_super(struct super_block *sb, struct fs_context *fc) msblk->panic_on_errors = (opts->errors == Opt_errors_panic); msblk->devblksize = sb_min_blocksize(sb, SQUASHFS_DEVBLK_SIZE); + if (!msblk->devblksize) { + errorf(fc, "squashfs: unable to set blocksize\n"); + return -EINVAL; + } + msblk->devblksize_log2 = ffz(~msblk->devblksize); mutex_init(&msblk->meta_index_mutex); diff --git a/fs/super.c b/fs/super.c index bcc4e87123c8..21799e213fd7 100644 --- a/fs/super.c +++ b/fs/super.c @@ -824,13 +824,6 @@ struct super_block *sget(struct file_system_type *type, struct super_block *old; int err; - /* We don't yet pass the user namespace of the parent - * mount through to here so always use &init_user_ns - * until that changes. 
- */ - if (flags & SB_SUBMOUNT) - user_ns = &init_user_ns; - retry: spin_lock(&sb_lock); if (test) { @@ -850,7 +843,7 @@ retry: } if (!s) { spin_unlock(&sb_lock); - s = alloc_super(type, (flags & ~SB_SUBMOUNT), user_ns); + s = alloc_super(type, flags, user_ns); if (!s) return ERR_PTR(-ENOMEM); goto retry; diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c index 8b01a7eda5fb..2d78e94072a0 100644 --- a/fs/sysfs/group.c +++ b/fs/sysfs/group.c @@ -21,7 +21,7 @@ static void remove_files(struct kernfs_node *parent, const struct attribute_group *grp) { struct attribute *const *attr; - struct bin_attribute *const *bin_attr; + const struct bin_attribute *const *bin_attr; if (grp->attrs) for (attr = grp->attrs; *attr; attr++) @@ -47,7 +47,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj, const struct attribute_group *grp, int update) { struct attribute *const *attr; - struct bin_attribute *const *bin_attr; + const struct bin_attribute *const *bin_attr; int error = 0, i; if (grp->attrs) { @@ -521,7 +521,7 @@ static int sysfs_group_attrs_change_owner(struct kernfs_node *grp_kn, } if (grp->bin_attrs) { - struct bin_attribute *const *bin_attr; + const struct bin_attribute *const *bin_attr; for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) { kn = kernfs_find_and_get(grp_kn, (*bin_attr)->attr.name); diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 762699c1bcf6..eea718ac66b4 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -83,11 +83,11 @@ #include <linux/blkdev.h> #include <linux/backing-dev.h> #include <linux/init.h> -#include <linux/parser.h> +#include <linux/fs_context.h> +#include <linux/fs_parser.h> #include <linux/buffer_head.h> #include <linux/vfs.h> #include <linux/log2.h> -#include <linux/mount.h> #include <linux/seq_file.h> #include <linux/iversion.h> @@ -289,7 +289,7 @@ void ufs_error (struct super_block * sb, const char * function, va_start(args, fmt); vaf.fmt = fmt; vaf.va = &args; - switch (UFS_SB(sb)->s_mount_opt & UFS_MOUNT_ONERROR) { + switch (UFS_SB(sb)->s_on_err) { case UFS_MOUNT_ONERROR_PANIC: panic("panic (device %s): %s: %pV\n", sb->s_id, function, &vaf); @@ -342,124 +342,74 @@ void ufs_warning (struct super_block * sb, const char * function, va_end(args); } -enum { - Opt_type_old = UFS_MOUNT_UFSTYPE_OLD, - Opt_type_sunx86 = UFS_MOUNT_UFSTYPE_SUNx86, - Opt_type_sun = UFS_MOUNT_UFSTYPE_SUN, - Opt_type_sunos = UFS_MOUNT_UFSTYPE_SUNOS, - Opt_type_44bsd = UFS_MOUNT_UFSTYPE_44BSD, - Opt_type_ufs2 = UFS_MOUNT_UFSTYPE_UFS2, - Opt_type_hp = UFS_MOUNT_UFSTYPE_HP, - Opt_type_nextstepcd = UFS_MOUNT_UFSTYPE_NEXTSTEP_CD, - Opt_type_nextstep = UFS_MOUNT_UFSTYPE_NEXTSTEP, - Opt_type_openstep = UFS_MOUNT_UFSTYPE_OPENSTEP, - Opt_onerror_panic = UFS_MOUNT_ONERROR_PANIC, - Opt_onerror_lock = UFS_MOUNT_ONERROR_LOCK, - Opt_onerror_umount = UFS_MOUNT_ONERROR_UMOUNT, - Opt_onerror_repair = UFS_MOUNT_ONERROR_REPAIR, - Opt_err +enum { Opt_type, Opt_onerror }; + +static const struct constant_table ufs_param_ufstype[] = { + {"old", UFS_MOUNT_UFSTYPE_OLD}, + {"sunx86", UFS_MOUNT_UFSTYPE_SUNx86}, + {"sun", UFS_MOUNT_UFSTYPE_SUN}, + {"sunos", UFS_MOUNT_UFSTYPE_SUNOS}, + {"44bsd", UFS_MOUNT_UFSTYPE_44BSD}, + {"ufs2", UFS_MOUNT_UFSTYPE_UFS2}, + {"5xbsd", UFS_MOUNT_UFSTYPE_UFS2}, + {"hp", UFS_MOUNT_UFSTYPE_HP}, + {"nextstep-cd", UFS_MOUNT_UFSTYPE_NEXTSTEP_CD}, + {"nextstep", UFS_MOUNT_UFSTYPE_NEXTSTEP}, + {"openstep", UFS_MOUNT_UFSTYPE_OPENSTEP}, + {} }; -static const match_table_t tokens = { - {Opt_type_old, "ufstype=old"}, - {Opt_type_sunx86, "ufstype=sunx86"}, - {Opt_type_sun, 
"ufstype=sun"}, - {Opt_type_sunos, "ufstype=sunos"}, - {Opt_type_44bsd, "ufstype=44bsd"}, - {Opt_type_ufs2, "ufstype=ufs2"}, - {Opt_type_ufs2, "ufstype=5xbsd"}, - {Opt_type_hp, "ufstype=hp"}, - {Opt_type_nextstepcd, "ufstype=nextstep-cd"}, - {Opt_type_nextstep, "ufstype=nextstep"}, - {Opt_type_openstep, "ufstype=openstep"}, -/*end of possible ufs types */ - {Opt_onerror_panic, "onerror=panic"}, - {Opt_onerror_lock, "onerror=lock"}, - {Opt_onerror_umount, "onerror=umount"}, - {Opt_onerror_repair, "onerror=repair"}, - {Opt_err, NULL} +static const struct constant_table ufs_param_onerror[] = { + {"panic", UFS_MOUNT_ONERROR_PANIC}, + {"lock", UFS_MOUNT_ONERROR_LOCK}, + {"umount", UFS_MOUNT_ONERROR_UMOUNT}, + {"repair", UFS_MOUNT_ONERROR_REPAIR}, + {} }; -static int ufs_parse_options (char * options, unsigned * mount_options) +static const struct fs_parameter_spec ufs_param_spec[] = { + fsparam_enum ("ufstype", Opt_type, ufs_param_ufstype), + fsparam_enum ("onerror", Opt_onerror, ufs_param_onerror), + {} +}; + +struct ufs_fs_context { + unsigned int flavour, on_err; +}; + +static int ufs_parse_param(struct fs_context *fc, struct fs_parameter *param) { - char * p; - + struct ufs_fs_context *ctx = fc->fs_private; + struct fs_parse_result result; + int opt; + UFSD("ENTER\n"); - - if (!options) - return 1; - while ((p = strsep(&options, ",")) != NULL) { - substring_t args[MAX_OPT_ARGS]; - int token; - if (!*p) - continue; + opt = fs_parse(fc, ufs_param_spec, param, &result); + if (opt < 0) + return opt; - token = match_token(p, tokens, args); - switch (token) { - case Opt_type_old: - ufs_clear_opt (*mount_options, UFSTYPE); - ufs_set_opt (*mount_options, UFSTYPE_OLD); - break; - case Opt_type_sunx86: - ufs_clear_opt (*mount_options, UFSTYPE); - ufs_set_opt (*mount_options, UFSTYPE_SUNx86); - break; - case Opt_type_sun: - ufs_clear_opt (*mount_options, UFSTYPE); - ufs_set_opt (*mount_options, UFSTYPE_SUN); - break; - case Opt_type_sunos: - ufs_clear_opt(*mount_options, UFSTYPE); - ufs_set_opt(*mount_options, UFSTYPE_SUNOS); - break; - case Opt_type_44bsd: - ufs_clear_opt (*mount_options, UFSTYPE); - ufs_set_opt (*mount_options, UFSTYPE_44BSD); - break; - case Opt_type_ufs2: - ufs_clear_opt(*mount_options, UFSTYPE); - ufs_set_opt(*mount_options, UFSTYPE_UFS2); - break; - case Opt_type_hp: - ufs_clear_opt (*mount_options, UFSTYPE); - ufs_set_opt (*mount_options, UFSTYPE_HP); - break; - case Opt_type_nextstepcd: - ufs_clear_opt (*mount_options, UFSTYPE); - ufs_set_opt (*mount_options, UFSTYPE_NEXTSTEP_CD); - break; - case Opt_type_nextstep: - ufs_clear_opt (*mount_options, UFSTYPE); - ufs_set_opt (*mount_options, UFSTYPE_NEXTSTEP); - break; - case Opt_type_openstep: - ufs_clear_opt (*mount_options, UFSTYPE); - ufs_set_opt (*mount_options, UFSTYPE_OPENSTEP); - break; - case Opt_onerror_panic: - ufs_clear_opt (*mount_options, ONERROR); - ufs_set_opt (*mount_options, ONERROR_PANIC); - break; - case Opt_onerror_lock: - ufs_clear_opt (*mount_options, ONERROR); - ufs_set_opt (*mount_options, ONERROR_LOCK); - break; - case Opt_onerror_umount: - ufs_clear_opt (*mount_options, ONERROR); - ufs_set_opt (*mount_options, ONERROR_UMOUNT); - break; - case Opt_onerror_repair: - pr_err("Unable to do repair on error, will lock lock instead\n"); - ufs_clear_opt (*mount_options, ONERROR); - ufs_set_opt (*mount_options, ONERROR_REPAIR); - break; - default: - pr_err("Invalid option: \"%s\" or missing value\n", p); + switch (opt) { + case Opt_type: + if (ctx->flavour == result.uint_32) /* no-op */ return 0; + if (fc->purpose 
== FS_CONTEXT_FOR_RECONFIGURE) { + pr_err("ufstype can't be changed during remount\n"); + return -EINVAL; + } + if (ctx->flavour) { + pr_err("conflicting ufstype options\n"); + return -EINVAL; + } + ctx->flavour = result.uint_32; + break; + case Opt_onerror: + ctx->on_err = result.uint_32; + break; + default: + return -EINVAL; } - return 1; + return 0; } /* @@ -474,7 +424,7 @@ static void ufs_setup_cstotal(struct super_block *sb) struct ufs_super_block_first *usb1; struct ufs_super_block_second *usb2; struct ufs_super_block_third *usb3; - unsigned mtype = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; + unsigned mtype = sbi->s_flavour; UFSD("ENTER, mtype=%u\n", mtype); usb1 = ubh_get_usb_first(uspi); @@ -580,7 +530,7 @@ failed: */ static void ufs_put_cstotal(struct super_block *sb) { - unsigned mtype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE; + unsigned mtype = UFS_SB(sb)->s_flavour; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct ufs_super_block_first *usb1; struct ufs_super_block_second *usb2; @@ -764,8 +714,10 @@ static u64 ufs_max_bytes(struct super_block *sb) return res << uspi->s_bshift; } -static int ufs_fill_super(struct super_block *sb, void *data, int silent) +static int ufs_fill_super(struct super_block *sb, struct fs_context *fc) { + struct ufs_fs_context *ctx = fc->fs_private; + int silent = fc->sb_flags & SB_SILENT; struct ufs_sb_info * sbi; struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; @@ -803,24 +755,18 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) mutex_init(&sbi->s_lock); spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); - /* - * Set default mount options - * Parse mount options - */ - sbi->s_mount_opt = 0; - ufs_set_opt (sbi->s_mount_opt, ONERROR_LOCK); - if (!ufs_parse_options ((char *) data, &sbi->s_mount_opt)) { - pr_err("wrong mount options\n"); - goto failed; - } - if (!(sbi->s_mount_opt & UFS_MOUNT_UFSTYPE)) { + + sbi->s_flavour = ctx->flavour; + sbi->s_on_err = ctx->on_err; + + if (!sbi->s_flavour) { if (!silent) pr_err("You didn't specify the type of your ufs filesystem\n\n" "mount -t ufs -o ufstype=" "sun|sunx86|44bsd|ufs2|5xbsd|old|hp|nextstep|nextstep-cd|openstep ...\n\n" ">>>WARNING<<< Wrong ufstype may corrupt your filesystem, " "default is ufstype=old\n"); - ufs_set_opt (sbi->s_mount_opt, UFSTYPE_OLD); + sbi->s_flavour = UFS_MOUNT_UFSTYPE_OLD; } uspi = kzalloc(sizeof(struct ufs_sb_private_info), GFP_KERNEL); @@ -836,7 +782,7 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) sb->s_time_min = S32_MIN; sb->s_time_max = S32_MAX; - switch (sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) { + switch (sbi->s_flavour) { case UFS_MOUNT_UFSTYPE_44BSD: UFSD("ufstype=44bsd\n"); uspi->s_fsize = block_size = 512; @@ -1035,9 +981,9 @@ again: goto magic_found; } - if ((((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_NEXTSTEP) - || ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_NEXTSTEP_CD) - || ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_OPENSTEP)) + if ((sbi->s_flavour == UFS_MOUNT_UFSTYPE_NEXTSTEP + || sbi->s_flavour == UFS_MOUNT_UFSTYPE_NEXTSTEP_CD + || sbi->s_flavour == UFS_MOUNT_UFSTYPE_OPENSTEP) && uspi->s_sbbase < 256) { ubh_brelse_uspi(uspi); ubh = NULL; @@ -1237,8 +1183,8 @@ magic_found: uspi->s_bpf = uspi->s_fsize << 3; uspi->s_bpfshift = uspi->s_fshift + 3; uspi->s_bpfmask = uspi->s_bpf - 1; - if ((sbi->s_mount_opt & UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_44BSD || - (sbi->s_mount_opt & 
UFS_MOUNT_UFSTYPE) == UFS_MOUNT_UFSTYPE_UFS2) + if (sbi->s_flavour == UFS_MOUNT_UFSTYPE_44BSD || + sbi->s_flavour == UFS_MOUNT_UFSTYPE_UFS2) uspi->s_maxsymlinklen = fs32_to_cpu(sb, usb3->fs_un2.fs_44.fs_maxsymlinklen); @@ -1290,13 +1236,15 @@ failed_nomem: return -ENOMEM; } -static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) +static int ufs_reconfigure(struct fs_context *fc) { struct ufs_sb_private_info * uspi; struct ufs_super_block_first * usb1; struct ufs_super_block_third * usb3; - unsigned new_mount_opt, ufstype; - unsigned flags; + struct ufs_fs_context *ctx = fc->fs_private; + struct super_block *sb = fc->root->d_sb; + unsigned int ufstype; + unsigned int flags; sync_filesystem(sb); mutex_lock(&UFS_SB(sb)->s_lock); @@ -1305,27 +1253,10 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) usb1 = ubh_get_usb_first(uspi); usb3 = ubh_get_usb_third(uspi); - /* - * Allow the "check" option to be passed as a remount option. - * It is not possible to change ufstype option during remount - */ - ufstype = UFS_SB(sb)->s_mount_opt & UFS_MOUNT_UFSTYPE; - new_mount_opt = 0; - ufs_set_opt (new_mount_opt, ONERROR_LOCK); - if (!ufs_parse_options (data, &new_mount_opt)) { - mutex_unlock(&UFS_SB(sb)->s_lock); - return -EINVAL; - } - if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { - new_mount_opt |= ufstype; - } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { - pr_err("ufstype can't be changed during remount\n"); - mutex_unlock(&UFS_SB(sb)->s_lock); - return -EINVAL; - } + ufstype = UFS_SB(sb)->s_flavour; - if ((bool)(*mount_flags & SB_RDONLY) == sb_rdonly(sb)) { - UFS_SB(sb)->s_mount_opt = new_mount_opt; + if ((bool)(fc->sb_flags & SB_RDONLY) == sb_rdonly(sb)) { + UFS_SB(sb)->s_on_err = ctx->on_err; mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } @@ -1333,7 +1264,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) /* * fs was mouted as rw, remounting ro */ - if (*mount_flags & SB_RDONLY) { + if (fc->sb_flags & SB_RDONLY) { ufs_put_super_internal(sb); usb1->fs_time = ufs_get_seconds(sb); if ((flags & UFS_ST_MASK) == UFS_ST_SUN @@ -1369,7 +1300,7 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) sb->s_flags &= ~SB_RDONLY; #endif } - UFS_SB(sb)->s_mount_opt = new_mount_opt; + UFS_SB(sb)->s_on_err = ctx->on_err; mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } @@ -1377,19 +1308,19 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) static int ufs_show_options(struct seq_file *seq, struct dentry *root) { struct ufs_sb_info *sbi = UFS_SB(root->d_sb); - unsigned mval = sbi->s_mount_opt & UFS_MOUNT_UFSTYPE; - const struct match_token *tp = tokens; + unsigned mval = sbi->s_flavour; + const struct constant_table *tp; - while (tp->token != Opt_onerror_panic && tp->token != mval) + tp = ufs_param_ufstype; + while (tp->value && tp->value != mval) ++tp; - BUG_ON(tp->token == Opt_onerror_panic); - seq_printf(seq, ",%s", tp->pattern); + seq_printf(seq, ",ufstype=%s", tp->name); - mval = sbi->s_mount_opt & UFS_MOUNT_ONERROR; - while (tp->token != Opt_err && tp->token != mval) + tp = ufs_param_onerror; + mval = sbi->s_on_err; + while (tp->value && tp->value != mval) ++tp; - BUG_ON(tp->token == Opt_err); - seq_printf(seq, ",%s", tp->pattern); + seq_printf(seq, ",onerror=%s", tp->name); return 0; } @@ -1483,21 +1414,57 @@ static const struct super_operations ufs_super_ops = { .put_super = ufs_put_super, .sync_fs = ufs_sync_fs, .statfs = ufs_statfs, - .remount_fs = 
ufs_remount, .show_options = ufs_show_options, }; -static struct dentry *ufs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) +static int ufs_get_tree(struct fs_context *fc) +{ + return get_tree_bdev(fc, ufs_fill_super); +} + +static void ufs_free_fc(struct fs_context *fc) +{ + kfree(fc->fs_private); +} + +static const struct fs_context_operations ufs_context_ops = { + .parse_param = ufs_parse_param, + .get_tree = ufs_get_tree, + .reconfigure = ufs_reconfigure, + .free = ufs_free_fc, +}; + +static int ufs_init_fs_context(struct fs_context *fc) { - return mount_bdev(fs_type, flags, dev_name, data, ufs_fill_super); + struct ufs_fs_context *ctx; + + ctx = kzalloc(sizeof(*ctx), GFP_KERNEL); + if (!ctx) + return -ENOMEM; + + if (fc->purpose == FS_CONTEXT_FOR_RECONFIGURE) { + struct super_block *sb = fc->root->d_sb; + struct ufs_sb_info *sbi = UFS_SB(sb); + + ctx->flavour = sbi->s_flavour; + ctx->on_err = sbi->s_on_err; + } else { + ctx->flavour = 0; + ctx->on_err = UFS_MOUNT_ONERROR_LOCK; + } + + fc->fs_private = ctx; + fc->ops = &ufs_context_ops; + + return 0; } static struct file_system_type ufs_fs_type = { .owner = THIS_MODULE, .name = "ufs", - .mount = ufs_mount, .kill_sb = kill_block_super, + .init_fs_context = ufs_init_fs_context, + .parameters = ufs_param_spec, .fs_flags = FS_REQUIRES_DEV, }; MODULE_ALIAS_FS("ufs"); diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index e7df65dd4351..788e025056b2 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -24,7 +24,8 @@ struct ufs_sb_info { struct ufs_cg_private_info * s_ucpi[UFS_MAX_GROUP_LOADED]; unsigned s_cgno[UFS_MAX_GROUP_LOADED]; unsigned short s_cg_loaded; - unsigned s_mount_opt; + unsigned s_flavour; + unsigned s_on_err; struct super_block *sb; int work_queued; /* non-zero if the delayed work is queued */ struct delayed_work sync_work; /* FS sync delayed work */ @@ -52,13 +53,11 @@ struct ufs_inode_info { }; /* mount options */ -#define UFS_MOUNT_ONERROR 0x0000000F #define UFS_MOUNT_ONERROR_PANIC 0x00000001 #define UFS_MOUNT_ONERROR_LOCK 0x00000002 #define UFS_MOUNT_ONERROR_UMOUNT 0x00000004 #define UFS_MOUNT_ONERROR_REPAIR 0x00000008 -#define UFS_MOUNT_UFSTYPE 0x0000FFF0 #define UFS_MOUNT_UFSTYPE_OLD 0x00000010 #define UFS_MOUNT_UFSTYPE_44BSD 0x00000020 #define UFS_MOUNT_UFSTYPE_SUN 0x00000040 @@ -70,10 +69,6 @@ struct ufs_inode_info { #define UFS_MOUNT_UFSTYPE_UFS2 0x00001000 #define UFS_MOUNT_UFSTYPE_SUNOS 0x00002000 -#define ufs_clear_opt(o,opt) o &= ~UFS_MOUNT_##opt -#define ufs_set_opt(o,opt) o |= UFS_MOUNT_##opt -#define ufs_test_opt(o,opt) ((o) & UFS_MOUNT_##opt) - /* * Debug code */ diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 26a04a783489..63151feb9c3f 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -436,6 +436,25 @@ allocate_blocks: return 0; } +static bool +xfs_ioend_needs_wq_completion( + struct iomap_ioend *ioend) +{ + /* Changing inode size requires a transaction. */ + if (xfs_ioend_is_append(ioend)) + return true; + + /* Extent manipulation requires a transaction. */ + if (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED)) + return true; + + /* Page cache invalidation cannot be done in irq context. 
*/ + if (ioend->io_flags & IOMAP_IOEND_DONTCACHE) + return true; + + return false; +} + static int xfs_submit_ioend( struct iomap_writepage_ctx *wpc, @@ -460,8 +479,7 @@ xfs_submit_ioend( memalloc_nofs_restore(nofs_flag); /* send ioends that might require a transaction to the completion wq */ - if (xfs_ioend_is_append(ioend) || - (ioend->io_flags & (IOMAP_IOEND_UNWRITTEN | IOMAP_IOEND_SHARED))) + if (xfs_ioend_needs_wq_completion(ioend)) ioend->io_bio.bi_end_io = xfs_end_bio; if (status) diff --git a/fs/xfs/xfs_zone_gc.c b/fs/xfs/xfs_zone_gc.c index d613a4094db6..9c00fc5baa30 100644 --- a/fs/xfs/xfs_zone_gc.c +++ b/fs/xfs/xfs_zone_gc.c @@ -290,8 +290,6 @@ xfs_zone_gc_query_cb( return 0; } -#define cmp_int(l, r) ((l > r) - (l < r)) - static int xfs_zone_gc_rmap_rec_cmp( const void *a, |
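Note: the ufs hunks above follow the usual new-mount-API conversion pattern: a fs_parameter_spec table (here built from fsparam_enum entries backed by constant_table arrays), a ->parse_param hook that fills a per-mount private context, and a ->init_fs_context/->free pair that allocate and release that context. The sketch below is a minimal, hypothetical illustration of that pattern only; the "foo" names, the "mode" option and its "fast"/"safe" values are invented for illustration and are not part of the patch (and ->get_tree is omitted for brevity).

/* Minimal sketch of the fs_context option-parsing pattern used by the
 * ufs conversion above.  All "foo" identifiers are hypothetical. */
#include <linux/fs_context.h>
#include <linux/fs_parser.h>
#include <linux/slab.h>

enum { Opt_mode };

/* Maps option strings to numeric values, like ufs_param_ufstype above. */
static const struct constant_table foo_param_mode[] = {
	{ "fast",	1 },
	{ "safe",	2 },
	{}
};

static const struct fs_parameter_spec foo_param_spec[] = {
	fsparam_enum("mode", Opt_mode, foo_param_mode),
	{}
};

/* Per-mount parsing state, analogous to struct ufs_fs_context. */
struct foo_fs_context {
	unsigned int mode;
};

static int foo_parse_param(struct fs_context *fc, struct fs_parameter *param)
{
	struct foo_fs_context *ctx = fc->fs_private;
	struct fs_parse_result result;
	int opt;

	opt = fs_parse(fc, foo_param_spec, param, &result);
	if (opt < 0)
		return opt;

	switch (opt) {
	case Opt_mode:
		/* For fsparam_enum, result.uint_32 is the constant_table value. */
		ctx->mode = result.uint_32;
		break;
	}
	return 0;
}

static void foo_free_fc(struct fs_context *fc)
{
	kfree(fc->fs_private);
}

static const struct fs_context_operations foo_context_ops = {
	.parse_param	= foo_parse_param,
	.free		= foo_free_fc,
	/* .get_tree omitted in this sketch */
};

/* Hooked up via file_system_type::init_fs_context, as ufs does. */
static int foo_init_fs_context(struct fs_context *fc)
{
	struct foo_fs_context *ctx;

	ctx = kzalloc(sizeof(*ctx), GFP_KERNEL);
	if (!ctx)
		return -ENOMEM;
	ctx->mode = 2;		/* default: "safe" */
	fc->fs_private = ctx;
	fc->ops = &foo_context_ops;
	return 0;
}

As in the ufs patch, the same parser serves both mount and remount: for FS_CONTEXT_FOR_RECONFIGURE the context would be pre-filled from the existing superblock in ->init_fs_context(), so ->parse_param can detect no-ops and reject changes that are not allowed across a remount.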