summaryrefslogtreecommitdiff
path: root/fs
diff options
context:
space:
mode:
Diffstat (limited to 'fs')
-rw-r--r--fs/afs/Kconfig1
-rw-r--r--fs/afs/Makefile1
-rw-r--r--fs/afs/cm_security.c340
-rw-r--r--fs/afs/internal.h20
-rw-r--r--fs/afs/main.c1
-rw-r--r--fs/afs/misc.c27
-rw-r--r--fs/afs/rxrpc.c40
-rw-r--r--fs/afs/server.c2
-rw-r--r--fs/btrfs/extent_io.c1
-rw-r--r--fs/dlm/Kconfig1
-rw-r--r--fs/dlm/config.c3
-rw-r--r--fs/dlm/lowcomms.c7
-rw-r--r--fs/exfat/nls.c1
-rw-r--r--fs/exfat/super.c30
-rw-r--r--fs/ext2/super.c3
-rw-r--r--fs/ext4/bitmap.c8
-rw-r--r--fs/ext4/ext4.h91
-rw-r--r--fs/ext4/ext4_jbd2.c3
-rw-r--r--fs/ext4/ext4_jbd2.h4
-rw-r--r--fs/ext4/extents.c177
-rw-r--r--fs/ext4/extents_status.c35
-rw-r--r--fs/ext4/fast_commit.c460
-rw-r--r--fs/ext4/file.c14
-rw-r--r--fs/ext4/ialloc.c8
-rw-r--r--fs/ext4/inline.c3
-rw-r--r--fs/ext4/inode.c510
-rw-r--r--fs/ext4/ioctl.c16
-rw-r--r--fs/ext4/mmp.c2
-rw-r--r--fs/ext4/move_extent.c11
-rw-r--r--fs/ext4/namei.c10
-rw-r--r--fs/ext4/orphan.c13
-rw-r--r--fs/ext4/readpage.c28
-rw-r--r--fs/ext4/resize.c2
-rw-r--r--fs/ext4/super.c84
-rw-r--r--fs/ext4/xattr.c10
-rw-r--r--fs/isofs/inode.c7
-rw-r--r--fs/isofs/isofs.h4
-rw-r--r--fs/isofs/rock.c40
-rw-r--r--fs/isofs/rock.h6
-rw-r--r--fs/isofs/util.c49
-rw-r--r--fs/jbd2/commit.c6
-rw-r--r--fs/jbd2/journal.c23
-rw-r--r--fs/jbd2/recovery.c10
-rw-r--r--fs/jbd2/transaction.c5
-rw-r--r--fs/jfs/jfs_discard.c3
-rw-r--r--fs/jfs/jfs_dmap.c6
-rw-r--r--fs/jfs/jfs_dtree.c18
-rw-r--r--fs/kernfs/dir.c33
-rw-r--r--fs/kernfs/file.c3
-rw-r--r--fs/kernfs/kernfs-internal.h16
-rw-r--r--fs/nfsd/Kconfig2
-rw-r--r--fs/nfsd/Makefile1
-rw-r--r--fs/nfsd/debugfs.c47
-rw-r--r--fs/nfsd/export.c3
-rw-r--r--fs/nfsd/nfs3proc.c68
-rw-r--r--fs/nfsd/nfs4callback.c132
-rw-r--r--fs/nfsd/nfs4proc.c35
-rw-r--r--fs/nfsd/nfs4recover.c61
-rw-r--r--fs/nfsd/nfs4state.c40
-rw-r--r--fs/nfsd/nfs4xdr.c21
-rw-r--r--fs/nfsd/nfsctl.c25
-rw-r--r--fs/nfsd/nfsd.h34
-rw-r--r--fs/nfsd/nfsproc.c48
-rw-r--r--fs/nfsd/nfssvc.c8
-rw-r--r--fs/nfsd/nfsxdr.c4
-rw-r--r--fs/nfsd/state.h23
-rw-r--r--fs/nfsd/trace.h302
-rw-r--r--fs/nfsd/vfs.c90
-rw-r--r--fs/nfsd/vfs.h10
-rw-r--r--fs/nfsd/xdr4.h4
-rw-r--r--fs/nfsd/xdr4cb.h5
-rw-r--r--fs/notify/fanotify/fanotify.c3
-rw-r--r--fs/notify/fanotify/fanotify.h9
-rw-r--r--fs/notify/fanotify/fanotify_user.c50
-rw-r--r--fs/ntfs3/attrib.c72
-rw-r--r--fs/ntfs3/file.c87
-rw-r--r--fs/ntfs3/frecord.c74
-rw-r--r--fs/ntfs3/fslog.c32
-rw-r--r--fs/ntfs3/index.c8
-rw-r--r--fs/ntfs3/inode.c5
-rw-r--r--fs/ntfs3/namei.c2
-rw-r--r--fs/ntfs3/ntfs_fs.h5
-rw-r--r--fs/orangefs/orangefs-kernel.h8
-rw-r--r--fs/orangefs/orangefs-mod.c3
-rw-r--r--fs/orangefs/super.c189
-rw-r--r--fs/sysfs/group.c6
86 files changed, 2497 insertions, 1215 deletions
diff --git a/fs/afs/Kconfig b/fs/afs/Kconfig
index fc8ba9142f2f..682bd8ec2c10 100644
--- a/fs/afs/Kconfig
+++ b/fs/afs/Kconfig
@@ -5,6 +5,7 @@ config AFS_FS
select AF_RXRPC
select DNS_RESOLVER
select NETFS_SUPPORT
+ select CRYPTO_KRB5
help
If you say Y here, you will get an experimental Andrew File System
driver. It currently only supports unsecured read-only AFS access.
diff --git a/fs/afs/Makefile b/fs/afs/Makefile
index 5efd7e13b304..b49b8fe682f3 100644
--- a/fs/afs/Makefile
+++ b/fs/afs/Makefile
@@ -8,6 +8,7 @@ kafs-y := \
addr_prefs.o \
callback.o \
cell.o \
+ cm_security.o \
cmservice.o \
dir.o \
dir_edit.o \
diff --git a/fs/afs/cm_security.c b/fs/afs/cm_security.c
new file mode 100644
index 000000000000..edcbd249d202
--- /dev/null
+++ b/fs/afs/cm_security.c
@@ -0,0 +1,340 @@
+// SPDX-License-Identifier: GPL-2.0-or-later
+/* Cache manager security.
+ *
+ * Copyright (C) 2025 Red Hat, Inc. All Rights Reserved.
+ * Written by David Howells (dhowells@redhat.com)
+ */
+
+#include <linux/slab.h>
+#include <crypto/krb5.h>
+#include "internal.h"
+#include "afs_cm.h"
+#include "afs_fs.h"
+#include "protocol_yfs.h"
+#define RXRPC_TRACE_ONLY_DEFINE_ENUMS
+#include <trace/events/rxrpc.h>
+
+#define RXGK_SERVER_ENC_TOKEN 1036U // 0x40c
+#define xdr_round_up(x) (round_up((x), sizeof(__be32)))
+#define xdr_len_object(x) (4 + round_up((x), sizeof(__be32)))
+
+#ifdef CONFIG_RXGK
+static int afs_create_yfs_cm_token(struct sk_buff *challenge,
+ struct afs_server *server);
+#endif
+
+/*
+ * Respond to an RxGK challenge, adding appdata.
+ */
+static int afs_respond_to_challenge(struct sk_buff *challenge)
+{
+#ifdef CONFIG_RXGK
+ struct krb5_buffer appdata = {};
+ struct afs_server *server;
+#endif
+ struct rxrpc_peer *peer;
+ unsigned long peer_data;
+ u16 service_id;
+ u8 security_index;
+
+ rxrpc_kernel_query_challenge(challenge, &peer, &peer_data,
+ &service_id, &security_index);
+
+ _enter("%u,%u", service_id, security_index);
+
+ switch (service_id) {
+ /* We don't send CM_SERVICE RPCs, so don't expect a challenge
+ * therefrom.
+ */
+ case FS_SERVICE:
+ case VL_SERVICE:
+ case YFS_FS_SERVICE:
+ case YFS_VL_SERVICE:
+ break;
+ default:
+ pr_warn("Can't respond to unknown challenge %u:%u",
+ service_id, security_index);
+ return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO,
+ afs_abort_unsupported_sec_class);
+ }
+
+ switch (security_index) {
+#ifdef CONFIG_RXKAD
+ case RXRPC_SECURITY_RXKAD:
+ return rxkad_kernel_respond_to_challenge(challenge);
+#endif
+
+#ifdef CONFIG_RXGK
+ case RXRPC_SECURITY_RXGK:
+ return rxgk_kernel_respond_to_challenge(challenge, &appdata);
+
+ case RXRPC_SECURITY_YFS_RXGK:
+ switch (service_id) {
+ case FS_SERVICE:
+ case YFS_FS_SERVICE:
+ server = (struct afs_server *)peer_data;
+ if (!server->cm_rxgk_appdata.data) {
+ mutex_lock(&server->cm_token_lock);
+ if (!server->cm_rxgk_appdata.data)
+ afs_create_yfs_cm_token(challenge, server);
+ mutex_unlock(&server->cm_token_lock);
+ }
+ if (server->cm_rxgk_appdata.data)
+ appdata = server->cm_rxgk_appdata;
+ break;
+ }
+ return rxgk_kernel_respond_to_challenge(challenge, &appdata);
+#endif
+
+ default:
+ return rxrpc_kernel_reject_challenge(challenge, RX_USER_ABORT, -EPROTO,
+ afs_abort_unsupported_sec_class);
+ }
+}
+
+/*
+ * Process the OOB message queue, processing challenge packets.
+ */
+void afs_process_oob_queue(struct work_struct *work)
+{
+ struct afs_net *net = container_of(work, struct afs_net, rx_oob_work);
+ struct sk_buff *oob;
+ enum rxrpc_oob_type type;
+
+ while ((oob = rxrpc_kernel_dequeue_oob(net->socket, &type))) {
+ switch (type) {
+ case RXRPC_OOB_CHALLENGE:
+ afs_respond_to_challenge(oob);
+ break;
+ }
+ rxrpc_kernel_free_oob(oob);
+ }
+}
+
+#ifdef CONFIG_RXGK
+/*
+ * Create a securities keyring for the cache manager and attach a key to it for
+ * the RxGK tokens we want to use to secure the callback connection back from
+ * the fileserver.
+ */
+int afs_create_token_key(struct afs_net *net, struct socket *socket)
+{
+ const struct krb5_enctype *krb5;
+ struct key *ring;
+ key_ref_t key;
+ char K0[32], *desc;
+ int ret;
+
+ ring = keyring_alloc("kafs",
+ GLOBAL_ROOT_UID, GLOBAL_ROOT_GID, current_cred(),
+ KEY_POS_SEARCH | KEY_POS_WRITE |
+ KEY_USR_VIEW | KEY_USR_READ | KEY_USR_SEARCH,
+ KEY_ALLOC_NOT_IN_QUOTA,
+ NULL, NULL);
+ if (IS_ERR(ring))
+ return PTR_ERR(ring);
+
+ ret = rxrpc_sock_set_security_keyring(socket->sk, ring);
+ if (ret < 0)
+ goto out;
+
+ ret = -ENOPKG;
+ krb5 = crypto_krb5_find_enctype(KRB5_ENCTYPE_AES128_CTS_HMAC_SHA1_96);
+ if (!krb5)
+ goto out;
+
+ if (WARN_ON_ONCE(krb5->key_len > sizeof(K0)))
+ goto out;
+
+ ret = -ENOMEM;
+ desc = kasprintf(GFP_KERNEL, "%u:%u:%u:%u",
+ YFS_CM_SERVICE, RXRPC_SECURITY_YFS_RXGK, 1, krb5->etype);
+ if (!desc)
+ goto out;
+
+ wait_for_random_bytes();
+ get_random_bytes(K0, krb5->key_len);
+
+ key = key_create(make_key_ref(ring, true),
+ "rxrpc_s", desc,
+ K0, krb5->key_len,
+ KEY_POS_VIEW | KEY_POS_READ | KEY_POS_SEARCH | KEY_USR_VIEW,
+ KEY_ALLOC_NOT_IN_QUOTA);
+ kfree(desc);
+ if (IS_ERR(key)) {
+ ret = PTR_ERR(key);
+ goto out;
+ }
+
+ net->fs_cm_token_key = key_ref_to_ptr(key);
+ ret = 0;
+out:
+ key_put(ring);
+ return ret;
+}
+
+/*
+ * Create an YFS RxGK GSS token to use as a ticket to the specified fileserver.
+ */
+static int afs_create_yfs_cm_token(struct sk_buff *challenge,
+ struct afs_server *server)
+{
+ const struct krb5_enctype *conn_krb5, *token_krb5;
+ const struct krb5_buffer *token_key;
+ struct crypto_aead *aead;
+ struct scatterlist sg;
+ struct afs_net *net = server->cell->net;
+ const struct key *key = net->fs_cm_token_key;
+ size_t keysize, uuidsize, authsize, toksize, encsize, contsize, adatasize, offset;
+ __be32 caps[1] = {
+ [0] = htonl(AFS_CAP_ERROR_TRANSLATION),
+ };
+ __be32 *xdr;
+ void *appdata, *K0, *encbase;
+ u32 enctype;
+ int ret;
+
+ if (!key)
+ return -ENOKEY;
+
+ /* Assume that the fileserver is happy to use the same encoding type as
+ * we were told to use by the token obtained by the user.
+ */
+ enctype = rxgk_kernel_query_challenge(challenge);
+
+ conn_krb5 = crypto_krb5_find_enctype(enctype);
+ if (!conn_krb5)
+ return -ENOPKG;
+ token_krb5 = key->payload.data[0];
+ token_key = (const struct krb5_buffer *)&key->payload.data[2];
+
+ /* struct rxgk_key {
+ * afs_uint32 enctype;
+ * opaque key<>;
+ * };
+ */
+ keysize = 4 + xdr_len_object(conn_krb5->key_len);
+
+ /* struct RXGK_AuthName {
+ * afs_int32 kind;
+ * opaque data<AUTHDATAMAX>;
+ * opaque display<AUTHPRINTABLEMAX>;
+ * };
+ */
+ uuidsize = sizeof(server->uuid);
+ authsize = 4 + xdr_len_object(uuidsize) + xdr_len_object(0);
+
+ /* struct RXGK_Token {
+ * rxgk_key K0;
+ * RXGK_Level level;
+ * rxgkTime starttime;
+ * afs_int32 lifetime;
+ * afs_int32 bytelife;
+ * rxgkTime expirationtime;
+ * struct RXGK_AuthName identities<>;
+ * };
+ */
+ toksize = keysize + 8 + 4 + 4 + 8 + xdr_len_object(authsize);
+
+ offset = 0;
+ encsize = crypto_krb5_how_much_buffer(token_krb5, KRB5_ENCRYPT_MODE, toksize, &offset);
+
+ /* struct RXGK_TokenContainer {
+ * afs_int32 kvno;
+ * afs_int32 enctype;
+ * opaque encrypted_token<>;
+ * };
+ */
+ contsize = 4 + 4 + xdr_len_object(encsize);
+
+ /* struct YFSAppData {
+ * opr_uuid initiatorUuid;
+ * opr_uuid acceptorUuid;
+ * Capabilities caps;
+ * afs_int32 enctype;
+ * opaque callbackKey<>;
+ * opaque callbackToken<>;
+ * };
+ */
+ adatasize = 16 + 16 +
+ xdr_len_object(sizeof(caps)) +
+ 4 +
+ xdr_len_object(conn_krb5->key_len) +
+ xdr_len_object(contsize);
+
+ ret = -ENOMEM;
+ appdata = kzalloc(adatasize, GFP_KERNEL);
+ if (!appdata)
+ goto out;
+ xdr = appdata;
+
+ memcpy(xdr, &net->uuid, 16); /* appdata.initiatorUuid */
+ xdr += 16 / 4;
+ memcpy(xdr, &server->uuid, 16); /* appdata.acceptorUuid */
+ xdr += 16 / 4;
+ *xdr++ = htonl(ARRAY_SIZE(caps)); /* appdata.caps.len */
+ memcpy(xdr, &caps, sizeof(caps)); /* appdata.caps */
+ xdr += ARRAY_SIZE(caps);
+ *xdr++ = htonl(conn_krb5->etype); /* appdata.enctype */
+
+ *xdr++ = htonl(conn_krb5->key_len); /* appdata.callbackKey.len */
+ K0 = xdr;
+ get_random_bytes(K0, conn_krb5->key_len); /* appdata.callbackKey.data */
+ xdr += xdr_round_up(conn_krb5->key_len) / 4;
+
+ *xdr++ = htonl(contsize); /* appdata.callbackToken.len */
+ *xdr++ = htonl(1); /* cont.kvno */
+ *xdr++ = htonl(token_krb5->etype); /* cont.enctype */
+ *xdr++ = htonl(encsize); /* cont.encrypted_token.len */
+
+ encbase = xdr;
+ xdr += offset / 4;
+ *xdr++ = htonl(conn_krb5->etype); /* token.K0.enctype */
+ *xdr++ = htonl(conn_krb5->key_len); /* token.K0.key.len */
+ memcpy(xdr, K0, conn_krb5->key_len); /* token.K0.key.data */
+ xdr += xdr_round_up(conn_krb5->key_len) / 4;
+
+ *xdr++ = htonl(RXRPC_SECURITY_ENCRYPT); /* token.level */
+ *xdr++ = htonl(0); /* token.starttime */
+ *xdr++ = htonl(0); /* " */
+ *xdr++ = htonl(0); /* token.lifetime */
+ *xdr++ = htonl(0); /* token.bytelife */
+ *xdr++ = htonl(0); /* token.expirationtime */
+ *xdr++ = htonl(0); /* " */
+ *xdr++ = htonl(1); /* token.identities.count */
+ *xdr++ = htonl(0); /* token.identities[0].kind */
+ *xdr++ = htonl(uuidsize); /* token.identities[0].data.len */
+ memcpy(xdr, &server->uuid, uuidsize);
+ xdr += xdr_round_up(uuidsize) / 4;
+ *xdr++ = htonl(0); /* token.identities[0].display.len */
+
+ xdr = encbase + xdr_round_up(encsize);
+
+ if ((unsigned long)xdr - (unsigned long)appdata != adatasize)
+ pr_err("Appdata size incorrect %lx != %zx\n",
+ (unsigned long)xdr - (unsigned long)appdata, adatasize);
+
+ aead = crypto_krb5_prepare_encryption(token_krb5, token_key, RXGK_SERVER_ENC_TOKEN,
+ GFP_KERNEL);
+ if (IS_ERR(aead)) {
+ ret = PTR_ERR(aead);
+ goto out_token;
+ }
+
+ sg_init_one(&sg, encbase, encsize);
+ ret = crypto_krb5_encrypt(token_krb5, aead, &sg, 1, encsize, offset, toksize, false);
+ if (ret < 0)
+ goto out_aead;
+
+ server->cm_rxgk_appdata.len = adatasize;
+ server->cm_rxgk_appdata.data = appdata;
+ appdata = NULL;
+
+out_aead:
+ crypto_free_aead(aead);
+out_token:
+ kfree(appdata);
+out:
+ return ret;
+}
+#endif /* CONFIG_RXGK */
diff --git a/fs/afs/internal.h b/fs/afs/internal.h
index 440b0e731093..1124ea4000cb 100644
--- a/fs/afs/internal.h
+++ b/fs/afs/internal.h
@@ -20,6 +20,7 @@
#include <linux/uuid.h>
#include <linux/mm_types.h>
#include <linux/dns_resolver.h>
+#include <crypto/krb5.h>
#include <net/net_namespace.h>
#include <net/netns/generic.h>
#include <net/sock.h>
@@ -176,8 +177,10 @@ struct afs_call {
bool intr; /* T if interruptible */
bool unmarshalling_error; /* T if an unmarshalling error occurred */
bool responded; /* Got a response from the call (may be abort) */
+ u8 security_ix; /* Security class */
u16 service_id; /* Actual service ID (after upgrade) */
unsigned int debug_id; /* Trace ID */
+ u32 enctype; /* Security encoding type */
u32 operation_ID; /* operation ID for an incoming call */
u32 count; /* count for use in unmarshalling */
union { /* place to extract temporary data */
@@ -281,6 +284,7 @@ struct afs_net {
struct socket *socket;
struct afs_call *spare_incoming_call;
struct work_struct charge_preallocation_work;
+ struct work_struct rx_oob_work;
struct mutex socket_mutex;
atomic_t nr_outstanding_calls;
atomic_t nr_superblocks;
@@ -305,6 +309,7 @@ struct afs_net {
struct list_head fs_probe_slow; /* List of afs_server to probe at 5m intervals */
struct hlist_head fs_proc; /* procfs servers list */
+ struct key *fs_cm_token_key; /* Key for creating CM tokens */
struct work_struct fs_prober;
struct timer_list fs_probe_timer;
atomic_t servers_outstanding;
@@ -540,6 +545,8 @@ struct afs_server {
struct list_head volumes; /* RCU list of afs_server_entry objects */
struct work_struct destroyer; /* Work item to try and destroy a server */
struct timer_list timer; /* Management timer */
+ struct mutex cm_token_lock; /* Lock governing creation of appdata */
+ struct krb5_buffer cm_rxgk_appdata; /* Appdata to be included in RESPONSE packet */
time64_t unuse_time; /* Time at which last unused */
unsigned long flags;
#define AFS_SERVER_FL_RESPONDING 0 /* The server is responding */
@@ -1059,6 +1066,19 @@ extern void __net_exit afs_cell_purge(struct afs_net *);
extern bool afs_cm_incoming_call(struct afs_call *);
/*
+ * cm_security.c
+ */
+void afs_process_oob_queue(struct work_struct *work);
+#ifdef CONFIG_RXGK
+int afs_create_token_key(struct afs_net *net, struct socket *socket);
+#else
+static inline int afs_create_token_key(struct afs_net *net, struct socket *socket)
+{
+ return 0;
+}
+#endif
+
+/*
* dir.c
*/
extern const struct file_operations afs_dir_file_operations;
diff --git a/fs/afs/main.c b/fs/afs/main.c
index c845c5daaeba..02475d415d88 100644
--- a/fs/afs/main.c
+++ b/fs/afs/main.c
@@ -73,6 +73,7 @@ static int __net_init afs_net_init(struct net *net_ns)
generate_random_uuid((unsigned char *)&net->uuid);
INIT_WORK(&net->charge_preallocation_work, afs_charge_preallocation);
+ INIT_WORK(&net->rx_oob_work, afs_process_oob_queue);
mutex_init(&net->socket_mutex);
net->cells = RB_ROOT;
diff --git a/fs/afs/misc.c b/fs/afs/misc.c
index b8180bf2281f..8f2b3a177690 100644
--- a/fs/afs/misc.c
+++ b/fs/afs/misc.c
@@ -8,6 +8,7 @@
#include <linux/kernel.h>
#include <linux/module.h>
#include <linux/errno.h>
+#include <crypto/krb5.h>
#include "internal.h"
#include "afs_fs.h"
#include "protocol_uae.h"
@@ -103,6 +104,32 @@ int afs_abort_to_error(u32 abort_code)
case RXKADDATALEN: return -EKEYREJECTED;
case RXKADILLEGALLEVEL: return -EKEYREJECTED;
+ case RXGK_INCONSISTENCY: return -EPROTO;
+ case RXGK_PACKETSHORT: return -EPROTO;
+ case RXGK_BADCHALLENGE: return -EPROTO;
+ case RXGK_SEALEDINCON: return -EKEYREJECTED;
+ case RXGK_NOTAUTH: return -EKEYREJECTED;
+ case RXGK_EXPIRED: return -EKEYEXPIRED;
+ case RXGK_BADLEVEL: return -EKEYREJECTED;
+ case RXGK_BADKEYNO: return -EKEYREJECTED;
+ case RXGK_NOTRXGK: return -EKEYREJECTED;
+ case RXGK_UNSUPPORTED: return -EKEYREJECTED;
+ case RXGK_GSSERROR: return -EKEYREJECTED;
+#ifdef RXGK_BADETYPE
+ case RXGK_BADETYPE: return -ENOPKG;
+#endif
+#ifdef RXGK_BADTOKEN
+ case RXGK_BADTOKEN: return -EKEYREJECTED;
+#endif
+#ifdef RXGK_BADETYPE
+ case RXGK_DATALEN: return -EPROTO;
+#endif
+#ifdef RXGK_BADQOP
+ case RXGK_BADQOP: return -EKEYREJECTED;
+#endif
+
+ case KRB5_PROG_KEYTYPE_NOSUPP: return -ENOPKG;
+
case RXGEN_OPCODE: return -ENOTSUPP;
default: return -EREMOTEIO;
diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
index d5e480a33859..c1cadf8fb346 100644
--- a/fs/afs/rxrpc.c
+++ b/fs/afs/rxrpc.c
@@ -24,8 +24,17 @@ static void afs_wake_up_async_call(struct sock *, struct rxrpc_call *, unsigned
static void afs_process_async_call(struct work_struct *);
static void afs_rx_new_call(struct sock *, struct rxrpc_call *, unsigned long);
static void afs_rx_discard_new_call(struct rxrpc_call *, unsigned long);
+static void afs_rx_attach(struct rxrpc_call *rxcall, unsigned long user_call_ID);
+static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob);
static int afs_deliver_cm_op_id(struct afs_call *);
+static const struct rxrpc_kernel_ops afs_rxrpc_callback_ops = {
+ .notify_new_call = afs_rx_new_call,
+ .discard_new_call = afs_rx_discard_new_call,
+ .user_attach_call = afs_rx_attach,
+ .notify_oob = afs_rx_notify_oob,
+};
+
/* asynchronous incoming call initial processing */
static const struct afs_call_type afs_RXCMxxxx = {
.name = "CB.xxxx",
@@ -49,6 +58,7 @@ int afs_open_socket(struct afs_net *net)
goto error_1;
socket->sk->sk_allocation = GFP_NOFS;
+ socket->sk->sk_user_data = net;
/* bind the callback manager's address to make this a server socket */
memset(&srx, 0, sizeof(srx));
@@ -64,6 +74,14 @@ int afs_open_socket(struct afs_net *net)
if (ret < 0)
goto error_2;
+ ret = rxrpc_sock_set_manage_response(socket->sk, true);
+ if (ret < 0)
+ goto error_2;
+
+ ret = afs_create_token_key(net, socket);
+ if (ret < 0)
+ pr_err("Couldn't create RxGK CM key: %d\n", ret);
+
ret = kernel_bind(socket, (struct sockaddr *) &srx, sizeof(srx));
if (ret == -EADDRINUSE) {
srx.transport.sin6.sin6_port = 0;
@@ -84,8 +102,7 @@ int afs_open_socket(struct afs_net *net)
* it sends back to us.
*/
- rxrpc_kernel_new_call_notification(socket, afs_rx_new_call,
- afs_rx_discard_new_call);
+ rxrpc_kernel_set_notifications(socket, &afs_rxrpc_callback_ops);
ret = kernel_listen(socket, INT_MAX);
if (ret < 0)
@@ -125,7 +142,9 @@ void afs_close_socket(struct afs_net *net)
kernel_sock_shutdown(net->socket, SHUT_RDWR);
flush_workqueue(afs_async_calls);
+ net->socket->sk->sk_user_data = NULL;
sock_release(net->socket);
+ key_put(net->fs_cm_token_key);
_debug("dework");
_leave("");
@@ -738,7 +757,6 @@ void afs_charge_preallocation(struct work_struct *work)
if (rxrpc_kernel_charge_accept(net->socket,
afs_wake_up_async_call,
- afs_rx_attach,
(unsigned long)call,
GFP_KERNEL,
call->debug_id) < 0)
@@ -800,10 +818,14 @@ static int afs_deliver_cm_op_id(struct afs_call *call)
if (!afs_cm_incoming_call(call))
return -ENOTSUPP;
+ call->security_ix = rxrpc_kernel_query_call_security(call->rxcall,
+ &call->service_id,
+ &call->enctype);
+
trace_afs_cb_call(call);
call->work.func = call->type->work;
- /* pass responsibility for the remainer of this message off to the
+ /* pass responsibility for the remainder of this message off to the
* cache manager op */
return call->type->deliver(call);
}
@@ -952,3 +974,13 @@ noinline int afs_protocol_error(struct afs_call *call,
call->unmarshalling_error = true;
return -EBADMSG;
}
+
+/*
+ * Wake up OOB notification processing.
+ */
+static void afs_rx_notify_oob(struct sock *sk, struct sk_buff *oob)
+{
+ struct afs_net *net = sk->sk_user_data;
+
+ schedule_work(&net->rx_oob_work);
+}
diff --git a/fs/afs/server.c b/fs/afs/server.c
index 8755f2703815..a97562f831eb 100644
--- a/fs/afs/server.c
+++ b/fs/afs/server.c
@@ -131,6 +131,7 @@ static struct afs_server *afs_alloc_server(struct afs_cell *cell, const uuid_t *
timer_setup(&server->timer, afs_server_timer, 0);
INIT_LIST_HEAD(&server->volumes);
init_waitqueue_head(&server->probe_wq);
+ mutex_init(&server->cm_token_lock);
INIT_LIST_HEAD(&server->probe_link);
INIT_HLIST_NODE(&server->proc_link);
spin_lock_init(&server->probe_lock);
@@ -396,6 +397,7 @@ static void afs_server_rcu(struct rcu_head *rcu)
afs_put_endpoint_state(rcu_access_pointer(server->endpoint_state),
afs_estate_trace_put_server);
afs_put_cell(server->cell, afs_cell_trace_put_server);
+ kfree(server->cm_rxgk_appdata.data);
kfree(server);
}
diff --git a/fs/btrfs/extent_io.c b/fs/btrfs/extent_io.c
index e43f6280f954..849199768664 100644
--- a/fs/btrfs/extent_io.c
+++ b/fs/btrfs/extent_io.c
@@ -2189,7 +2189,6 @@ retry:
done = 1;
break;
}
- free_extent_buffer(eb);
continue;
}
diff --git a/fs/dlm/Kconfig b/fs/dlm/Kconfig
index f82a4952769d..b46165df5a91 100644
--- a/fs/dlm/Kconfig
+++ b/fs/dlm/Kconfig
@@ -3,7 +3,6 @@ menuconfig DLM
tristate "Distributed Lock Manager (DLM)"
depends on INET
depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
- select IP_SCTP
help
A general purpose distributed lock manager for kernel or userspace
applications.
diff --git a/fs/dlm/config.c b/fs/dlm/config.c
index cf9ba6fd7a28..a23fd524a6ee 100644
--- a/fs/dlm/config.c
+++ b/fs/dlm/config.c
@@ -197,6 +197,9 @@ static int dlm_check_protocol_and_dlm_running(unsigned int x)
break;
case 1:
/* SCTP */
+ if (!IS_ENABLED(CONFIG_IP_SCTP))
+ return -EOPNOTSUPP;
+
break;
default:
return -EINVAL;
diff --git a/fs/dlm/lowcomms.c b/fs/dlm/lowcomms.c
index 70abd4da17a6..e4373bce1bc2 100644
--- a/fs/dlm/lowcomms.c
+++ b/fs/dlm/lowcomms.c
@@ -160,6 +160,7 @@ struct dlm_proto_ops {
bool try_new_addr;
const char *name;
int proto;
+ int how;
void (*sockopts)(struct socket *sock);
int (*bind)(struct socket *sock);
@@ -533,7 +534,7 @@ static void lowcomms_state_change(struct sock *sk)
/* SCTP layer is not calling sk_data_ready when the connection
* is done, so we catch the signal through here.
*/
- if (sk->sk_shutdown == RCV_SHUTDOWN)
+ if (sk->sk_shutdown & RCV_SHUTDOWN)
lowcomms_data_ready(sk);
}
@@ -810,7 +811,7 @@ static void shutdown_connection(struct connection *con, bool and_other)
return;
}
- ret = kernel_sock_shutdown(con->sock, SHUT_WR);
+ ret = kernel_sock_shutdown(con->sock, dlm_proto_ops->how);
up_read(&con->sock_lock);
if (ret) {
log_print("Connection %p failed to shutdown: %d will force close",
@@ -1858,6 +1859,7 @@ static int dlm_tcp_listen_bind(struct socket *sock)
static const struct dlm_proto_ops dlm_tcp_ops = {
.name = "TCP",
.proto = IPPROTO_TCP,
+ .how = SHUT_WR,
.sockopts = dlm_tcp_sockopts,
.bind = dlm_tcp_bind,
.listen_validate = dlm_tcp_listen_validate,
@@ -1896,6 +1898,7 @@ static void dlm_sctp_sockopts(struct socket *sock)
static const struct dlm_proto_ops dlm_sctp_ops = {
.name = "SCTP",
.proto = IPPROTO_SCTP,
+ .how = SHUT_RDWR,
.try_new_addr = true,
.sockopts = dlm_sctp_sockopts,
.bind = dlm_sctp_bind,
diff --git a/fs/exfat/nls.c b/fs/exfat/nls.c
index d47896a89596..1729bf42eb51 100644
--- a/fs/exfat/nls.c
+++ b/fs/exfat/nls.c
@@ -801,4 +801,5 @@ load_default:
void exfat_free_upcase_table(struct exfat_sb_info *sbi)
{
kvfree(sbi->vol_utbl);
+ sbi->vol_utbl = NULL;
}
diff --git a/fs/exfat/super.c b/fs/exfat/super.c
index 8465033a6cf0..7ed858937d45 100644
--- a/fs/exfat/super.c
+++ b/fs/exfat/super.c
@@ -36,31 +36,12 @@ static void exfat_put_super(struct super_block *sb)
struct exfat_sb_info *sbi = EXFAT_SB(sb);
mutex_lock(&sbi->s_lock);
+ exfat_clear_volume_dirty(sb);
exfat_free_bitmap(sbi);
brelse(sbi->boot_bh);
mutex_unlock(&sbi->s_lock);
}
-static int exfat_sync_fs(struct super_block *sb, int wait)
-{
- struct exfat_sb_info *sbi = EXFAT_SB(sb);
- int err = 0;
-
- if (unlikely(exfat_forced_shutdown(sb)))
- return 0;
-
- if (!wait)
- return 0;
-
- /* If there are some dirty buffers in the bdev inode */
- mutex_lock(&sbi->s_lock);
- sync_blockdev(sb->s_bdev);
- if (exfat_clear_volume_dirty(sb))
- err = -EIO;
- mutex_unlock(&sbi->s_lock);
- return err;
-}
-
static int exfat_statfs(struct dentry *dentry, struct kstatfs *buf)
{
struct super_block *sb = dentry->d_sb;
@@ -219,7 +200,6 @@ static const struct super_operations exfat_sops = {
.write_inode = exfat_write_inode,
.evict_inode = exfat_evict_inode,
.put_super = exfat_put_super,
- .sync_fs = exfat_sync_fs,
.statfs = exfat_statfs,
.show_options = exfat_show_options,
.shutdown = exfat_shutdown,
@@ -751,10 +731,14 @@ static void exfat_free(struct fs_context *fc)
static int exfat_reconfigure(struct fs_context *fc)
{
+ struct super_block *sb = fc->root->d_sb;
fc->sb_flags |= SB_NODIRATIME;
- /* volume flag will be updated in exfat_sync_fs */
- sync_filesystem(fc->root->d_sb);
+ sync_filesystem(sb);
+ mutex_lock(&EXFAT_SB(sb)->s_lock);
+ exfat_clear_volume_dirty(sb);
+ mutex_unlock(&EXFAT_SB(sb)->s_lock);
+
return 0;
}
diff --git a/fs/ext2/super.c b/fs/ext2/super.c
index 28ff47ec4be6..121e634c792a 100644
--- a/fs/ext2/super.c
+++ b/fs/ext2/super.c
@@ -601,7 +601,8 @@ static int ext2_parse_param(struct fs_context *fc, struct fs_parameter *param)
case Opt_dax:
#ifdef CONFIG_FS_DAX
ext2_msg_fc(fc, KERN_WARNING,
- "DAX enabled. Warning: EXPERIMENTAL, use at your own risk");
+ "DAX enabled. Warning: DAX support in ext2 driver is deprecated"
+ " and will be removed at the end of 2025. Please use ext4 driver instead.");
ctx_set_mount_opt(ctx, EXT2_MOUNT_DAX);
#else
ext2_msg_fc(fc, KERN_INFO, "dax option not supported");
diff --git a/fs/ext4/bitmap.c b/fs/ext4/bitmap.c
index a4dbaccee6e7..87760fabdd2e 100644
--- a/fs/ext4/bitmap.c
+++ b/fs/ext4/bitmap.c
@@ -30,7 +30,7 @@ int ext4_inode_bitmap_csum_verify(struct super_block *sb,
sz = EXT4_INODES_PER_GROUP(sb) >> 3;
provided = le16_to_cpu(gdp->bg_inode_bitmap_csum_lo);
- calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END) {
hi = le16_to_cpu(gdp->bg_inode_bitmap_csum_hi);
provided |= (hi << 16);
@@ -52,7 +52,7 @@ void ext4_inode_bitmap_csum_set(struct super_block *sb,
return;
sz = EXT4_INODES_PER_GROUP(sb) >> 3;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
gdp->bg_inode_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
if (sbi->s_desc_size >= EXT4_BG_INODE_BITMAP_CSUM_HI_END)
gdp->bg_inode_bitmap_csum_hi = cpu_to_le16(csum >> 16);
@@ -71,7 +71,7 @@ int ext4_block_bitmap_csum_verify(struct super_block *sb,
return 1;
provided = le16_to_cpu(gdp->bg_block_bitmap_csum_lo);
- calculated = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ calculated = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END) {
hi = le16_to_cpu(gdp->bg_block_bitmap_csum_hi);
provided |= (hi << 16);
@@ -92,7 +92,7 @@ void ext4_block_bitmap_csum_set(struct super_block *sb,
if (!ext4_has_feature_metadata_csum(sb))
return;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)bh->b_data, sz);
gdp->bg_block_bitmap_csum_lo = cpu_to_le16(csum & 0xFFFF);
if (sbi->s_desc_size >= EXT4_BG_BLOCK_BITMAP_CSUM_HI_END)
gdp->bg_block_bitmap_csum_hi = cpu_to_le16(csum >> 16);
diff --git a/fs/ext4/ext4.h b/fs/ext4/ext4.h
index 5a20e9cd7184..18373de980f2 100644
--- a/fs/ext4/ext4.h
+++ b/fs/ext4/ext4.h
@@ -256,9 +256,19 @@ struct ext4_allocation_request {
#define EXT4_MAP_UNWRITTEN BIT(BH_Unwritten)
#define EXT4_MAP_BOUNDARY BIT(BH_Boundary)
#define EXT4_MAP_DELAYED BIT(BH_Delay)
+/*
+ * This is for use in ext4_map_query_blocks() for a special case where we can
+ * have a physically and logically contiguous blocks split across two leaf
+ * nodes instead of a single extent. This is required in case of atomic writes
+ * to know whether the returned extent is last in leaf. If yes, then lookup for
+ * next in leaf block in ext4_map_query_blocks_next_in_leaf().
+ * - This is never going to be added to any buffer head state.
+ * - We use the next available bit after BH_BITMAP_UPTODATE.
+ */
+#define EXT4_MAP_QUERY_LAST_IN_LEAF BIT(BH_BITMAP_UPTODATE + 1)
#define EXT4_MAP_FLAGS (EXT4_MAP_NEW | EXT4_MAP_MAPPED |\
EXT4_MAP_UNWRITTEN | EXT4_MAP_BOUNDARY |\
- EXT4_MAP_DELAYED)
+ EXT4_MAP_DELAYED | EXT4_MAP_QUERY_LAST_IN_LEAF)
struct ext4_map_blocks {
ext4_fsblk_t m_pblk;
@@ -706,9 +716,6 @@ enum {
#define EXT4_GET_BLOCKS_CONVERT 0x0010
#define EXT4_GET_BLOCKS_IO_CREATE_EXT (EXT4_GET_BLOCKS_PRE_IO|\
EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
- /* Convert extent to initialized after IO complete */
-#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT|\
- EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT)
/* Eventual metadata allocation (due to growing extent tree)
* should not fail, so try to use reserved blocks for that.*/
#define EXT4_GET_BLOCKS_METADATA_NOFAIL 0x0020
@@ -720,11 +727,23 @@ enum {
#define EXT4_GET_BLOCKS_ZERO 0x0200
#define EXT4_GET_BLOCKS_CREATE_ZERO (EXT4_GET_BLOCKS_CREATE |\
EXT4_GET_BLOCKS_ZERO)
- /* Caller will submit data before dropping transaction handle. This
- * allows jbd2 to avoid submitting data before commit. */
+ /* Caller is in the context of data submission, such as writeback,
+ * fsync, etc. Especially, in the generic writeback path, caller will
+ * submit data before dropping transaction handle. This allows jbd2
+ * to avoid submitting data before commit. */
#define EXT4_GET_BLOCKS_IO_SUBMIT 0x0400
+ /* Convert extent to initialized after IO complete */
+#define EXT4_GET_BLOCKS_IO_CONVERT_EXT (EXT4_GET_BLOCKS_CONVERT |\
+ EXT4_GET_BLOCKS_CREATE_UNWRIT_EXT |\
+ EXT4_GET_BLOCKS_IO_SUBMIT)
/* Caller is in the atomic contex, find extent if it has been cached */
#define EXT4_GET_BLOCKS_CACHED_NOWAIT 0x0800
+/*
+ * Atomic write caller needs this to query in the slow path of mixed mapping
+ * case, when a contiguous extent can be split across two adjacent leaf nodes.
+ * Look EXT4_MAP_QUERY_LAST_IN_LEAF.
+ */
+#define EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF 0x1000
/*
* The bit position of these flags must not overlap with any of the
@@ -738,6 +757,13 @@ enum {
#define EXT4_EX_NOCACHE 0x40000000
#define EXT4_EX_FORCE_CACHE 0x20000000
#define EXT4_EX_NOFAIL 0x10000000
+/*
+ * ext4_map_query_blocks() uses this filter mask to filter the flags needed to
+ * pass while lookup/querying of on disk extent tree.
+ */
+#define EXT4_EX_QUERY_FILTER (EXT4_EX_NOCACHE | EXT4_EX_FORCE_CACHE |\
+ EXT4_EX_NOFAIL |\
+ EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF)
/*
* Flags used by ext4_free_blocks
@@ -1061,16 +1087,16 @@ struct ext4_inode_info {
/* End of lblk range that needs to be committed in this fast commit */
ext4_lblk_t i_fc_lblk_len;
- /* Number of ongoing updates on this inode */
- atomic_t i_fc_updates;
-
spinlock_t i_raw_lock; /* protects updates to the raw inode */
/* Fast commit wait queue for this inode */
wait_queue_head_t i_fc_wait;
- /* Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len */
- struct mutex i_fc_lock;
+ /*
+ * Protect concurrent accesses on i_fc_lblk_start, i_fc_lblk_len
+ * and inode's EXT4_FC_STATE_COMMITTING state bit.
+ */
+ spinlock_t i_fc_lock;
/*
* i_disksize keeps track of what the inode size is ON DISK, not
@@ -1754,7 +1780,7 @@ struct ext4_sb_info {
* following fields:
* ei->i_fc_list, s_fc_dentry_q, s_fc_q, s_fc_bytes, s_fc_bh.
*/
- spinlock_t s_fc_lock;
+ struct mutex s_fc_lock;
struct buffer_head *s_fc_bh;
struct ext4_fc_stats s_fc_stats;
tid_t s_fc_ineligible_tid;
@@ -1913,6 +1939,7 @@ enum {
EXT4_STATE_LUSTRE_EA_INODE, /* Lustre-style ea_inode */
EXT4_STATE_VERITY_IN_PROGRESS, /* building fs-verity Merkle tree */
EXT4_STATE_FC_COMMITTING, /* Fast commit ongoing */
+ EXT4_STATE_FC_FLUSHING_DATA, /* Fast commit flushing data */
EXT4_STATE_ORPHAN_FILE, /* Inode orphaned in orphan file */
};
@@ -2295,10 +2322,12 @@ static inline int ext4_emergency_state(struct super_block *sb)
#define EXT4_DEFM_NODELALLOC 0x0800
/*
- * Default journal batch times
+ * Default journal batch times and ioprio.
*/
#define EXT4_DEF_MIN_BATCH_TIME 0
#define EXT4_DEF_MAX_BATCH_TIME 15000 /* 15ms */
+#define EXT4_DEF_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
+
/*
* Default values for superblock update
@@ -2487,8 +2516,7 @@ static inline __le16 ext4_rec_len_to_disk(unsigned len, unsigned blocksize)
#define DX_HASH_SIPHASH 6
#define DX_HASH_LAST DX_HASH_SIPHASH
-static inline u32 ext4_chksum(struct ext4_sb_info *sbi, u32 crc,
- const void *address, unsigned int length)
+static inline u32 ext4_chksum(u32 crc, const void *address, unsigned int length)
{
return crc32c(crc, address, length);
}
@@ -2922,8 +2950,6 @@ void __ext4_fc_track_create(handle_t *handle, struct inode *inode,
void ext4_fc_track_create(handle_t *handle, struct dentry *dentry);
void ext4_fc_track_inode(handle_t *handle, struct inode *inode);
void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handle);
-void ext4_fc_start_update(struct inode *inode);
-void ext4_fc_stop_update(struct inode *inode);
void ext4_fc_del(struct inode *inode);
bool ext4_fc_replay_check_excluded(struct super_block *sb, ext4_fsblk_t block);
void ext4_fc_replay_cleanup(struct super_block *sb);
@@ -2973,6 +2999,7 @@ static inline bool ext4_mb_cr_expensive(enum criteria cr)
void ext4_inode_csum_set(struct inode *inode, struct ext4_inode *raw,
struct ext4_inode_info *ei);
int ext4_inode_is_fast_symlink(struct inode *inode);
+void ext4_check_map_extents_env(struct inode *inode);
struct buffer_head *ext4_getblk(handle_t *, struct inode *, ext4_lblk_t, int);
struct buffer_head *ext4_bread(handle_t *, struct inode *, ext4_lblk_t, int);
int ext4_bread_batch(struct inode *inode, ext4_lblk_t block, int bh_count,
@@ -2993,6 +3020,7 @@ int ext4_walk_page_buffers(handle_t *handle,
struct buffer_head *bh));
int do_journal_get_write_access(handle_t *handle, struct inode *inode,
struct buffer_head *bh);
+bool ext4_should_enable_large_folio(struct inode *inode);
#define FALL_BACK_TO_NONDELALLOC 1
#define CONVERT_INLINE_DATA 2
@@ -3039,6 +3067,8 @@ extern void ext4_set_aops(struct inode *inode);
extern int ext4_writepage_trans_blocks(struct inode *);
extern int ext4_normal_submit_inode_data_buffers(struct jbd2_inode *jinode);
extern int ext4_chunk_trans_blocks(struct inode *, int nrblocks);
+extern int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
+ int pextents);
extern int ext4_zero_partial_blocks(handle_t *handle, struct inode *inode,
loff_t lstart, loff_t lend);
extern vm_fault_t ext4_page_mkwrite(struct vm_fault *vmf);
@@ -3050,6 +3080,17 @@ extern void ext4_da_update_reserve_space(struct inode *inode,
extern int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk,
ext4_fsblk_t pblk, ext4_lblk_t len);
+static inline bool is_special_ino(struct super_block *sb, unsigned long ino)
+{
+ struct ext4_super_block *es = EXT4_SB(sb)->s_es;
+
+ return (ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) ||
+ ino == le32_to_cpu(es->s_usr_quota_inum) ||
+ ino == le32_to_cpu(es->s_grp_quota_inum) ||
+ ino == le32_to_cpu(es->s_prj_quota_inum) ||
+ ino == le32_to_cpu(es->s_orphan_file_inum);
+}
+
/* indirect.c */
extern int ext4_ind_map_blocks(handle_t *handle, struct inode *inode,
struct ext4_map_blocks *map, int flags);
@@ -3119,8 +3160,7 @@ extern int ext4_read_bh_lock(struct buffer_head *bh, blk_opf_t op_flags, bool wa
extern void ext4_sb_breadahead_unmovable(struct super_block *sb, sector_t block);
extern int ext4_seq_options_show(struct seq_file *seq, void *offset);
extern int ext4_calculate_overhead(struct super_block *sb);
-extern __le32 ext4_superblock_csum(struct super_block *sb,
- struct ext4_super_block *es);
+extern __le32 ext4_superblock_csum(struct ext4_super_block *es);
extern void ext4_superblock_csum_set(struct super_block *sb);
extern int ext4_alloc_flex_bg_array(struct super_block *sb,
ext4_group_t ngroup);
@@ -3378,6 +3418,13 @@ static inline unsigned int ext4_flex_bg_size(struct ext4_sb_info *sbi)
return 1 << sbi->s_log_groups_per_flex;
}
+static inline loff_t ext4_get_maxbytes(struct inode *inode)
+{
+ if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ return inode->i_sb->s_maxbytes;
+ return EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+}
+
#define ext4_std_error(sb, errno) \
do { \
if ((errno)) \
@@ -3710,6 +3757,8 @@ extern long ext4_fallocate(struct file *file, int mode, loff_t offset,
loff_t len);
extern int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
loff_t offset, ssize_t len);
+extern int ext4_convert_unwritten_extents_atomic(handle_t *handle,
+ struct inode *inode, loff_t offset, ssize_t len);
extern int ext4_convert_unwritten_io_end_vec(handle_t *handle,
ext4_io_end_t *io_end);
extern int ext4_map_blocks(handle_t *handle, struct inode *inode,
@@ -3847,7 +3896,9 @@ static inline int ext4_buffer_uptodate(struct buffer_head *bh)
static inline bool ext4_inode_can_atomic_write(struct inode *inode)
{
- return S_ISREG(inode->i_mode) && EXT4_SB(inode->i_sb)->s_awu_min > 0;
+ return S_ISREG(inode->i_mode) &&
+ ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS) &&
+ EXT4_SB(inode->i_sb)->s_awu_min > 0;
}
extern int ext4_block_write_begin(handle_t *handle, struct folio *folio,
diff --git a/fs/ext4/ext4_jbd2.c b/fs/ext4/ext4_jbd2.c
index 135e278c832e..b3e9b7bd7978 100644
--- a/fs/ext4/ext4_jbd2.c
+++ b/fs/ext4/ext4_jbd2.c
@@ -16,7 +16,8 @@ int ext4_inode_journal_mode(struct inode *inode)
ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
test_opt(inode->i_sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
(ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA) &&
- !test_opt(inode->i_sb, DELALLOC))) {
+ !test_opt(inode->i_sb, DELALLOC) &&
+ !mapping_large_folio_support(inode->i_mapping))) {
/* We do not support data journalling for encrypted data */
if (S_ISREG(inode->i_mode) && IS_ENCRYPTED(inode))
return EXT4_INODE_ORDERED_DATA_MODE; /* ordered */
diff --git a/fs/ext4/ext4_jbd2.h b/fs/ext4/ext4_jbd2.h
index 3221714d9901..63d17c5201b5 100644
--- a/fs/ext4/ext4_jbd2.h
+++ b/fs/ext4/ext4_jbd2.h
@@ -319,10 +319,10 @@ static inline int ext4_journal_ensure_credits(handle_t *handle, int credits,
revoke_creds, 0);
}
-static inline int ext4_journal_blocks_per_page(struct inode *inode)
+static inline int ext4_journal_blocks_per_folio(struct inode *inode)
{
if (EXT4_JOURNAL(inode) != NULL)
- return jbd2_journal_blocks_per_page(inode);
+ return jbd2_journal_blocks_per_folio(inode);
return 0;
}
diff --git a/fs/ext4/extents.c b/fs/ext4/extents.c
index c616a16a9f36..b543a46fc809 100644
--- a/fs/ext4/extents.c
+++ b/fs/ext4/extents.c
@@ -50,10 +50,9 @@ static __le32 ext4_extent_block_csum(struct inode *inode,
struct ext4_extent_header *eh)
{
struct ext4_inode_info *ei = EXT4_I(inode);
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum;
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)eh,
+ csum = ext4_chksum(ei->i_csum_seed, (__u8 *)eh,
EXT4_EXTENT_TAIL_OFFSET(eh));
return cpu_to_le32(csum);
}
@@ -611,6 +610,8 @@ int ext4_ext_precache(struct inode *inode)
if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
return 0; /* not an extent-mapped inode */
+ ext4_check_map_extents_env(inode);
+
down_read(&ei->i_data_sem);
depth = ext_depth(inode);
@@ -1530,7 +1531,7 @@ static int ext4_ext_search_left(struct inode *inode,
static int ext4_ext_search_right(struct inode *inode,
struct ext4_ext_path *path,
ext4_lblk_t *logical, ext4_fsblk_t *phys,
- struct ext4_extent *ret_ex)
+ struct ext4_extent *ret_ex, int flags)
{
struct buffer_head *bh = NULL;
struct ext4_extent_header *eh;
@@ -1604,7 +1605,8 @@ got_index:
ix++;
while (++depth < path->p_depth) {
/* subtract from p_depth to get proper eh_depth */
- bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, ix, path->p_depth - depth,
+ flags);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
@@ -1612,7 +1614,7 @@ got_index:
put_bh(bh);
}
- bh = read_extent_tree_block(inode, ix, path->p_depth - depth, 0);
+ bh = read_extent_tree_block(inode, ix, path->p_depth - depth, flags);
if (IS_ERR(bh))
return PTR_ERR(bh);
eh = ext_block_hdr(bh);
@@ -2396,18 +2398,20 @@ int ext4_ext_calc_credits_for_single_extent(struct inode *inode, int nrblocks,
int ext4_ext_index_trans_blocks(struct inode *inode, int extents)
{
int index;
- int depth;
/* If we are converting the inline data, only one is needed here. */
if (ext4_has_inline_data(inode))
return 1;
- depth = ext_depth(inode);
-
+ /*
+ * Extent tree can change between the time we estimate credits and
+ * the time we actually modify the tree. Assume the worst case.
+ */
if (extents <= 1)
- index = depth * 2;
+ index = (EXT4_MAX_EXTENT_DEPTH * 2) + extents;
else
- index = depth * 3;
+ index = (EXT4_MAX_EXTENT_DEPTH * 3) +
+ DIV_ROUND_UP(extents, ext4_ext_space_block(inode, 0));
return index;
}
@@ -2821,6 +2825,7 @@ int ext4_ext_remove_space(struct inode *inode, ext4_lblk_t start,
struct partial_cluster partial;
handle_t *handle;
int i = 0, err = 0;
+ int flags = EXT4_EX_NOCACHE | EXT4_EX_NOFAIL;
partial.pclu = 0;
partial.lblk = 0;
@@ -2851,8 +2856,7 @@ again:
ext4_fsblk_t pblk;
/* find extent for or closest extent to this block */
- path = ext4_find_extent(inode, end, NULL,
- EXT4_EX_NOCACHE | EXT4_EX_NOFAIL);
+ path = ext4_find_extent(inode, end, NULL, flags);
if (IS_ERR(path)) {
ext4_journal_stop(handle);
return PTR_ERR(path);
@@ -2918,7 +2922,7 @@ again:
*/
lblk = ex_end + 1;
err = ext4_ext_search_right(inode, path, &lblk, &pblk,
- NULL);
+ NULL, flags);
if (err < 0)
goto out;
if (pblk) {
@@ -2994,8 +2998,7 @@ again:
i + 1, ext4_idx_pblock(path[i].p_idx));
memset(path + i + 1, 0, sizeof(*path));
bh = read_extent_tree_block(inode, path[i].p_idx,
- depth - i - 1,
- EXT4_EX_NOCACHE);
+ depth - i - 1, flags);
if (IS_ERR(bh)) {
/* should we reset i_size? */
err = PTR_ERR(bh);
@@ -4202,7 +4205,7 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
trace_ext4_ext_map_blocks_enter(inode, map->m_lblk, map->m_len, flags);
/* find extent for this block */
- path = ext4_find_extent(inode, map->m_lblk, NULL, 0);
+ path = ext4_find_extent(inode, map->m_lblk, NULL, flags);
if (IS_ERR(path)) {
err = PTR_ERR(path);
goto out;
@@ -4314,7 +4317,8 @@ int ext4_ext_map_blocks(handle_t *handle, struct inode *inode,
if (err)
goto out;
ar.lright = map->m_lblk;
- err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright, &ex2);
+ err = ext4_ext_search_right(inode, path, &ar.lright, &ar.pright,
+ &ex2, flags);
if (err < 0)
goto out;
@@ -4433,6 +4437,20 @@ got_allocated_blocks:
allocated = map->m_len;
ext4_ext_show_leaf(inode, path);
out:
+ /*
+ * We never use EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF with CREATE flag.
+ * So we know that the depth used here is correct, since there was no
+ * block allocation done if EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF is set.
+ * If tomorrow we start using this QUERY flag with CREATE, then we will
+ * need to re-calculate the depth as it might have changed due to block
+ * allocation.
+ */
+ if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) {
+ WARN_ON_ONCE(flags & EXT4_GET_BLOCKS_CREATE);
+ if (!err && ex && (ex == EXT_LAST_EXTENT(path[depth].p_hdr)))
+ map->m_flags |= EXT4_MAP_QUERY_LAST_IN_LEAF;
+ }
+
ext4_free_ext_path(path);
trace_ext4_ext_map_blocks_exit(inode, flags, map,
@@ -4781,6 +4799,93 @@ out_inode_lock:
}
/*
+ * This function converts a range of blocks to written extents. The caller of
+ * this function will pass the start offset and the size. all unwritten extents
+ * within this range will be converted to written extents.
+ *
+ * This function is called from the direct IO end io call back function for
+ * atomic writes, to convert the unwritten extents after IO is completed.
+ *
+ * Note that the requirement for atomic writes is that all conversion should
+ * happen atomically in a single fs journal transaction. We mainly only allocate
+ * unwritten extents either on a hole on a pre-exiting unwritten extent range in
+ * ext4_map_blocks_atomic_write(). The only case where we can have multiple
+ * unwritten extents in a range [offset, offset+len) is when there is a split
+ * unwritten extent between two leaf nodes which was cached in extent status
+ * cache during ext4_iomap_alloc() time. That will allow
+ * ext4_map_blocks_atomic_write() to return the unwritten extent range w/o going
+ * into the slow path. That means we might need a loop for conversion of this
+ * unwritten extent split across leaf block within a single journal transaction.
+ * Split extents across leaf nodes is a rare case, but let's still handle that
+ * to meet the requirements of multi-fsblock atomic writes.
+ *
+ * Returns 0 on success.
+ */
+int ext4_convert_unwritten_extents_atomic(handle_t *handle, struct inode *inode,
+ loff_t offset, ssize_t len)
+{
+ unsigned int max_blocks;
+ int ret = 0, ret2 = 0, ret3 = 0;
+ struct ext4_map_blocks map;
+ unsigned int blkbits = inode->i_blkbits;
+ unsigned int credits = 0;
+ int flags = EXT4_GET_BLOCKS_IO_CONVERT_EXT | EXT4_EX_NOCACHE;
+
+ map.m_lblk = offset >> blkbits;
+ max_blocks = EXT4_MAX_BLOCKS(len, offset, blkbits);
+
+ if (!handle) {
+ /*
+ * TODO: An optimization can be added later by having an extent
+ * status flag e.g. EXTENT_STATUS_SPLIT_LEAF. If we query that
+ * it can tell if the extent in the cache is a split extent.
+ * But for now let's assume pextents as 2 always.
+ */
+ credits = ext4_meta_trans_blocks(inode, max_blocks, 2);
+ }
+
+ if (credits) {
+ handle = ext4_journal_start(inode, EXT4_HT_MAP_BLOCKS, credits);
+ if (IS_ERR(handle)) {
+ ret = PTR_ERR(handle);
+ return ret;
+ }
+ }
+
+ while (ret >= 0 && ret < max_blocks) {
+ map.m_lblk += ret;
+ map.m_len = (max_blocks -= ret);
+ ret = ext4_map_blocks(handle, inode, &map, flags);
+ if (ret != max_blocks)
+ ext4_msg(inode->i_sb, KERN_INFO,
+ "inode #%lu: block %u: len %u: "
+ "split block mapping found for atomic write, "
+ "ret = %d",
+ inode->i_ino, map.m_lblk,
+ map.m_len, ret);
+ if (ret <= 0)
+ break;
+ }
+
+ ret2 = ext4_mark_inode_dirty(handle, inode);
+
+ if (credits) {
+ ret3 = ext4_journal_stop(handle);
+ if (unlikely(ret3))
+ ret2 = ret3;
+ }
+
+ if (ret <= 0 || ret2)
+ ext4_warning(inode->i_sb,
+ "inode #%lu: block %u: len %u: "
+ "returned %d or %d",
+ inode->i_ino, map.m_lblk,
+ map.m_len, ret, ret2);
+
+ return ret > 0 ? ret2 : ret;
+}
+
+/*
* This function convert a range of blocks to written extents
* The caller of this function will pass the start offset and the size.
* all unwritten extents within this range will be converted to
@@ -4819,8 +4924,14 @@ int ext4_convert_unwritten_extents(handle_t *handle, struct inode *inode,
break;
}
}
+ /*
+ * Do not cache any unrelated extents, as it does not hold the
+ * i_rwsem or invalidate_lock, which could corrupt the extent
+ * status tree.
+ */
ret = ext4_map_blocks(handle, inode, &map,
- EXT4_GET_BLOCKS_IO_CONVERT_EXT);
+ EXT4_GET_BLOCKS_IO_CONVERT_EXT |
+ EXT4_EX_NOCACHE);
if (ret <= 0)
ext4_warning(inode->i_sb,
"inode #%lu: block %u: len %u: "
@@ -4931,12 +5042,7 @@ static const struct iomap_ops ext4_iomap_xattr_ops = {
static int ext4_fiemap_check_ranges(struct inode *inode, u64 start, u64 *len)
{
- u64 maxbytes;
-
- if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- maxbytes = inode->i_sb->s_maxbytes;
- else
- maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
+ u64 maxbytes = ext4_get_maxbytes(inode);
if (*len == 0)
return -EINVAL;
@@ -4956,10 +5062,11 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
{
int error = 0;
+ inode_lock_shared(inode);
if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
error = ext4_ext_precache(inode);
if (error)
- return error;
+ goto unlock;
fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
}
@@ -4970,15 +5077,19 @@ int ext4_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
*/
error = ext4_fiemap_check_ranges(inode, start, &len);
if (error)
- return error;
+ goto unlock;
if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
fieinfo->fi_flags &= ~FIEMAP_FLAG_XATTR;
- return iomap_fiemap(inode, fieinfo, start, len,
- &ext4_iomap_xattr_ops);
+ error = iomap_fiemap(inode, fieinfo, start, len,
+ &ext4_iomap_xattr_ops);
+ } else {
+ error = iomap_fiemap(inode, fieinfo, start, len,
+ &ext4_iomap_report_ops);
}
-
- return iomap_fiemap(inode, fieinfo, start, len, &ext4_iomap_report_ops);
+unlock:
+ inode_unlock_shared(inode);
+ return error;
}
int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -4999,7 +5110,9 @@ int ext4_get_es_cache(struct inode *inode, struct fiemap_extent_info *fieinfo,
}
if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
+ inode_lock_shared(inode);
error = ext4_ext_precache(inode);
+ inode_unlock_shared(inode);
if (error)
return error;
fieinfo->fi_flags &= ~FIEMAP_FLAG_CACHE;
@@ -5328,6 +5441,8 @@ static int ext4_collapse_range(struct file *file, loff_t offset, loff_t len)
start_lblk = offset >> inode->i_blkbits;
end_lblk = (offset + len) >> inode->i_blkbits;
+ ext4_check_map_extents_env(inode);
+
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
ext4_es_remove_extent(inode, start_lblk, EXT_MAX_BLOCKS - start_lblk);
@@ -5429,6 +5544,8 @@ static int ext4_insert_range(struct file *file, loff_t offset, loff_t len)
start_lblk = offset >> inode->i_blkbits;
len_lblk = len >> inode->i_blkbits;
+ ext4_check_map_extents_env(inode);
+
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
diff --git a/fs/ext4/extents_status.c b/fs/ext4/extents_status.c
index d1401d4a5513..31dc0496f8d0 100644
--- a/fs/ext4/extents_status.c
+++ b/fs/ext4/extents_status.c
@@ -120,9 +120,40 @@
* memory. Hence, we will reclaim written/unwritten/hole extents from
* the tree under a heavy memory pressure.
*
+ * ==========================================================================
+ * 3. Assurance of Ext4 extent status tree consistency
+ *
+ * When mapping blocks, Ext4 queries the extent status tree first and should
+ * always trusts that the extent status tree is consistent and up to date.
+ * Therefore, it is important to adheres to the following rules when createing,
+ * modifying and removing extents.
+ *
+ * 1. Besides fastcommit replay, when Ext4 creates or queries block mappings,
+ * the extent information should always be processed through the extent
+ * status tree instead of being organized manually through the on-disk
+ * extent tree.
+ *
+ * 2. When updating the extent tree, Ext4 should acquire the i_data_sem
+ * exclusively and update the extent status tree atomically. If the extents
+ * to be modified are large enough to exceed the range that a single
+ * i_data_sem can process (as ext4_datasem_ensure_credits() may drop
+ * i_data_sem to restart a transaction), it must (e.g. as ext4_punch_hole()
+ * does):
+ *
+ * a) Hold the i_rwsem and invalidate_lock exclusively. This ensures
+ * exclusion against page faults, as well as reads and writes that may
+ * concurrently modify the extent status tree.
+ * b) Evict all page cache in the affected range and recommend rebuilding
+ * or dropping the extent status tree after modifying the on-disk
+ * extent tree. This ensures exclusion against concurrent writebacks
+ * that do not hold those locks but only holds a folio lock.
+ *
+ * 3. Based on the rules above, when querying block mappings, Ext4 should at
+ * least hold the i_rwsem or invalidate_lock or folio lock(s) for the
+ * specified querying range.
*
* ==========================================================================
- * 3. Performance analysis
+ * 4. Performance analysis
*
* -- overhead
* 1. There is a cache extent for write access, so if writes are
@@ -134,7 +165,7 @@
*
*
* ==========================================================================
- * 4. TODO list
+ * 5. TODO list
*
* -- Refactor delayed space reservation
*
diff --git a/fs/ext4/fast_commit.c b/fs/ext4/fast_commit.c
index da4263a14a20..42bee1d4f9f9 100644
--- a/fs/ext4/fast_commit.c
+++ b/fs/ext4/fast_commit.c
@@ -12,6 +12,7 @@
#include "ext4_extents.h"
#include "mballoc.h"
+#include <linux/lockdep.h>
/*
* Ext4 Fast Commits
* -----------------
@@ -49,19 +50,27 @@
* that need to be committed during a fast commit in another in memory queue of
* inodes. During the commit operation, we commit in the following order:
*
- * [1] Lock inodes for any further data updates by setting COMMITTING state
- * [2] Submit data buffers of all the inodes
- * [3] Wait for [2] to complete
- * [4] Commit all the directory entry updates in the fast commit space
- * [5] Commit all the changed inode structures
- * [6] Write tail tag (this tag ensures the atomicity, please read the following
+ * [1] Prepare all the inodes to write out their data by setting
+ * "EXT4_STATE_FC_FLUSHING_DATA". This ensures that inode cannot be
+ * deleted while it is being flushed.
+ * [2] Flush data buffers to disk and clear "EXT4_STATE_FC_FLUSHING_DATA"
+ * state.
+ * [3] Lock the journal by calling jbd2_journal_lock_updates. This ensures that
+ * all the exsiting handles finish and no new handles can start.
+ * [4] Mark all the fast commit eligible inodes as undergoing fast commit
+ * by setting "EXT4_STATE_FC_COMMITTING" state.
+ * [5] Unlock the journal by calling jbd2_journal_unlock_updates. This allows
+ * starting of new handles. If new handles try to start an update on
+ * any of the inodes that are being committed, ext4_fc_track_inode()
+ * will block until those inodes have finished the fast commit.
+ * [6] Commit all the directory entry updates in the fast commit space.
+ * [7] Commit all the changed inodes in the fast commit space and clear
+ * "EXT4_STATE_FC_COMMITTING" for these inodes.
+ * [8] Write tail tag (this tag ensures the atomicity, please read the following
* section for more details).
- * [7] Wait for [4], [5] and [6] to complete.
*
- * All the inode updates must call ext4_fc_start_update() before starting an
- * update. If such an ongoing update is present, fast commit waits for it to
- * complete. The completion of such an update is marked by
- * ext4_fc_stop_update().
+ * All the inode updates must be enclosed within jbd2_jounrnal_start()
+ * and jbd2_journal_stop() similar to JBD2 journaling.
*
* Fast Commit Ineligibility
* -------------------------
@@ -142,6 +151,13 @@
* similarly. Thus, by converting a non-idempotent procedure into a series of
* idempotent outcomes, fast commits ensured idempotence during the replay.
*
+ * Locking
+ * -------
+ * sbi->s_fc_lock protects the fast commit inodes queue and the fast commit
+ * dentry queue. ei->i_fc_lock protects the fast commit related info in a given
+ * inode. Most of the code avoids acquiring both the locks, but if one must do
+ * that then sbi->s_fc_lock must be acquired before ei->i_fc_lock.
+ *
* TODOs
* -----
*
@@ -156,13 +172,12 @@
* fast commit recovery even if that area is invalidated by later full
* commits.
*
- * 1) Fast commit's commit path locks the entire file system during fast
- * commit. This has significant performance penalty. Instead of that, we
- * should use ext4_fc_start/stop_update functions to start inode level
- * updates from ext4_journal_start/stop. Once we do that we can drop file
- * system locking during commit path.
+ * 1) Handle more ineligible cases.
*
- * 2) Handle more ineligible cases.
+ * 2) Change ext4_fc_commit() to lookup logical to physical mapping using extent
+ * status tree. This would get rid of the need to call ext4_fc_track_inode()
+ * before acquiring i_data_sem. To do that we would need to ensure that
+ * modified extents from the extent status tree are not evicted from memory.
*/
#include <trace/events/ext4.h>
@@ -201,32 +216,6 @@ void ext4_fc_init_inode(struct inode *inode)
INIT_LIST_HEAD(&ei->i_fc_list);
INIT_LIST_HEAD(&ei->i_fc_dilist);
init_waitqueue_head(&ei->i_fc_wait);
- atomic_set(&ei->i_fc_updates, 0);
-}
-
-/* This function must be called with sbi->s_fc_lock held. */
-static void ext4_fc_wait_committing_inode(struct inode *inode)
-__releases(&EXT4_SB(inode->i_sb)->s_fc_lock)
-{
- wait_queue_head_t *wq;
- struct ext4_inode_info *ei = EXT4_I(inode);
-
-#if (BITS_PER_LONG < 64)
- DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
- EXT4_STATE_FC_COMMITTING);
- wq = bit_waitqueue(&ei->i_state_flags,
- EXT4_STATE_FC_COMMITTING);
-#else
- DEFINE_WAIT_BIT(wait, &ei->i_flags,
- EXT4_STATE_FC_COMMITTING);
- wq = bit_waitqueue(&ei->i_flags,
- EXT4_STATE_FC_COMMITTING);
-#endif
- lockdep_assert_held(&EXT4_SB(inode->i_sb)->s_fc_lock);
- prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
- spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
- schedule();
- finish_wait(wq, &wait.wq_entry);
}
static bool ext4_fc_disabled(struct super_block *sb)
@@ -236,48 +225,6 @@ static bool ext4_fc_disabled(struct super_block *sb)
}
/*
- * Inform Ext4's fast about start of an inode update
- *
- * This function is called by the high level call VFS callbacks before
- * performing any inode update. This function blocks if there's an ongoing
- * fast commit on the inode in question.
- */
-void ext4_fc_start_update(struct inode *inode)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
-
- if (ext4_fc_disabled(inode->i_sb))
- return;
-
-restart:
- spin_lock(&EXT4_SB(inode->i_sb)->s_fc_lock);
- if (list_empty(&ei->i_fc_list))
- goto out;
-
- if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
- ext4_fc_wait_committing_inode(inode);
- goto restart;
- }
-out:
- atomic_inc(&ei->i_fc_updates);
- spin_unlock(&EXT4_SB(inode->i_sb)->s_fc_lock);
-}
-
-/*
- * Stop inode update and wake up waiting fast commits if any.
- */
-void ext4_fc_stop_update(struct inode *inode)
-{
- struct ext4_inode_info *ei = EXT4_I(inode);
-
- if (ext4_fc_disabled(inode->i_sb))
- return;
-
- if (atomic_dec_and_test(&ei->i_fc_updates))
- wake_up_all(&ei->i_fc_wait);
-}
-
-/*
* Remove inode from fast commit list. If the inode is being committed
* we wait until inode commit is done.
*/
@@ -286,31 +233,62 @@ void ext4_fc_del(struct inode *inode)
struct ext4_inode_info *ei = EXT4_I(inode);
struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_fc_dentry_update *fc_dentry;
+ wait_queue_head_t *wq;
if (ext4_fc_disabled(inode->i_sb))
return;
-restart:
- spin_lock(&sbi->s_fc_lock);
+ mutex_lock(&sbi->s_fc_lock);
if (list_empty(&ei->i_fc_list) && list_empty(&ei->i_fc_dilist)) {
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
return;
}
- if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
- ext4_fc_wait_committing_inode(inode);
- goto restart;
+ /*
+ * Since ext4_fc_del is called from ext4_evict_inode while having a
+ * handle open, there is no need for us to wait here even if a fast
+ * commit is going on. That is because, if this inode is being
+ * committed, ext4_mark_inode_dirty would have waited for inode commit
+ * operation to finish before we come here. So, by the time we come
+ * here, inode's EXT4_STATE_FC_COMMITTING would have been cleared. So,
+ * we shouldn't see EXT4_STATE_FC_COMMITTING to be set on this inode
+ * here.
+ *
+ * We may come here without any handles open in the "no_delete" case of
+ * ext4_evict_inode as well. However, if that happens, we first mark the
+ * file system as fast commit ineligible anyway. So, even in that case,
+ * it is okay to remove the inode from the fc list.
+ */
+ WARN_ON(ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)
+ && !ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE));
+ while (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
+#if (BITS_PER_LONG < 64)
+ DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
+ EXT4_STATE_FC_FLUSHING_DATA);
+ wq = bit_waitqueue(&ei->i_state_flags,
+ EXT4_STATE_FC_FLUSHING_DATA);
+#else
+ DEFINE_WAIT_BIT(wait, &ei->i_flags,
+ EXT4_STATE_FC_FLUSHING_DATA);
+ wq = bit_waitqueue(&ei->i_flags,
+ EXT4_STATE_FC_FLUSHING_DATA);
+#endif
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ if (ext4_test_inode_state(inode, EXT4_STATE_FC_FLUSHING_DATA)) {
+ mutex_unlock(&sbi->s_fc_lock);
+ schedule();
+ mutex_lock(&sbi->s_fc_lock);
+ }
+ finish_wait(wq, &wait.wq_entry);
}
-
- if (!list_empty(&ei->i_fc_list))
- list_del_init(&ei->i_fc_list);
+ list_del_init(&ei->i_fc_list);
/*
* Since this inode is getting removed, let's also remove all FC
* dentry create references, since it is not needed to log it anyways.
*/
if (list_empty(&ei->i_fc_dilist)) {
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
return;
}
@@ -320,12 +298,10 @@ restart:
list_del_init(&fc_dentry->fcd_dilist);
WARN_ON(!list_empty(&ei->i_fc_dilist));
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
release_dentry_name_snapshot(&fc_dentry->fcd_name);
kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
-
- return;
}
/*
@@ -353,12 +329,12 @@ void ext4_fc_mark_ineligible(struct super_block *sb, int reason, handle_t *handl
has_transaction = false;
read_unlock(&sbi->s_journal->j_state_lock);
}
- spin_lock(&sbi->s_fc_lock);
+ mutex_lock(&sbi->s_fc_lock);
is_ineligible = ext4_test_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
if (has_transaction && (!is_ineligible || tid_gt(tid, sbi->s_fc_ineligible_tid)))
sbi->s_fc_ineligible_tid = tid;
ext4_set_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
WARN_ON(reason >= EXT4_FC_REASON_MAX);
sbi->s_fc_stats.fc_ineligible_reason_count[reason]++;
}
@@ -385,7 +361,7 @@ static int ext4_fc_track_template(
int ret;
tid = handle->h_transaction->t_tid;
- mutex_lock(&ei->i_fc_lock);
+ spin_lock(&ei->i_fc_lock);
if (tid == ei->i_sync_tid) {
update = true;
} else {
@@ -393,19 +369,18 @@ static int ext4_fc_track_template(
ei->i_sync_tid = tid;
}
ret = __fc_track_fn(handle, inode, args, update);
- mutex_unlock(&ei->i_fc_lock);
-
+ spin_unlock(&ei->i_fc_lock);
if (!enqueue)
return ret;
- spin_lock(&sbi->s_fc_lock);
+ mutex_lock(&sbi->s_fc_lock);
if (list_empty(&EXT4_I(inode)->i_fc_list))
list_add_tail(&EXT4_I(inode)->i_fc_list,
(sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING) ?
&sbi->s_fc_q[FC_Q_STAGING] :
&sbi->s_fc_q[FC_Q_MAIN]);
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
return ret;
}
@@ -428,19 +403,19 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode,
struct super_block *sb = inode->i_sb;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- mutex_unlock(&ei->i_fc_lock);
+ spin_unlock(&ei->i_fc_lock);
if (IS_ENCRYPTED(dir)) {
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_ENCRYPTED_FILENAME,
handle);
- mutex_lock(&ei->i_fc_lock);
+ spin_lock(&ei->i_fc_lock);
return -EOPNOTSUPP;
}
node = kmem_cache_alloc(ext4_fc_dentry_cachep, GFP_NOFS);
if (!node) {
ext4_fc_mark_ineligible(sb, EXT4_FC_REASON_NOMEM, handle);
- mutex_lock(&ei->i_fc_lock);
+ spin_lock(&ei->i_fc_lock);
return -ENOMEM;
}
@@ -449,7 +424,8 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode,
node->fcd_ino = inode->i_ino;
take_dentry_name_snapshot(&node->fcd_name, dentry);
INIT_LIST_HEAD(&node->fcd_dilist);
- spin_lock(&sbi->s_fc_lock);
+ INIT_LIST_HEAD(&node->fcd_list);
+ mutex_lock(&sbi->s_fc_lock);
if (sbi->s_journal->j_flags & JBD2_FULL_COMMIT_ONGOING ||
sbi->s_journal->j_flags & JBD2_FAST_COMMIT_ONGOING)
list_add_tail(&node->fcd_list,
@@ -470,8 +446,8 @@ static int __track_dentry_update(handle_t *handle, struct inode *inode,
WARN_ON(!list_empty(&ei->i_fc_dilist));
list_add_tail(&node->fcd_dilist, &ei->i_fc_dilist);
}
- spin_unlock(&sbi->s_fc_lock);
- mutex_lock(&ei->i_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
+ spin_lock(&ei->i_fc_lock);
return 0;
}
@@ -571,6 +547,8 @@ static int __track_inode(handle_t *handle, struct inode *inode, void *arg,
void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
{
+ struct ext4_inode_info *ei = EXT4_I(inode);
+ wait_queue_head_t *wq;
int ret;
if (S_ISDIR(inode->i_mode))
@@ -588,6 +566,35 @@ void ext4_fc_track_inode(handle_t *handle, struct inode *inode)
if (ext4_test_mount_flag(inode->i_sb, EXT4_MF_FC_INELIGIBLE))
return;
+ /*
+ * If we come here, we may sleep while waiting for the inode to
+ * commit. We shouldn't be holding i_data_sem when we go to sleep since
+ * the commit path needs to grab the lock while committing the inode.
+ */
+ lockdep_assert_not_held(&ei->i_data_sem);
+
+ while (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING)) {
+#if (BITS_PER_LONG < 64)
+ DEFINE_WAIT_BIT(wait, &ei->i_state_flags,
+ EXT4_STATE_FC_COMMITTING);
+ wq = bit_waitqueue(&ei->i_state_flags,
+ EXT4_STATE_FC_COMMITTING);
+#else
+ DEFINE_WAIT_BIT(wait, &ei->i_flags,
+ EXT4_STATE_FC_COMMITTING);
+ wq = bit_waitqueue(&ei->i_flags,
+ EXT4_STATE_FC_COMMITTING);
+#endif
+ prepare_to_wait(wq, &wait.wq_entry, TASK_UNINTERRUPTIBLE);
+ if (ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
+ schedule();
+ finish_wait(wq, &wait.wq_entry);
+ }
+
+ /*
+ * From this point on, this inode will not be committed either
+ * by fast or full commit as long as the handle is open.
+ */
ret = ext4_fc_track_template(handle, inode, __track_inode, NULL, 1);
trace_ext4_fc_track_inode(handle, inode, ret);
}
@@ -727,7 +734,7 @@ static u8 *ext4_fc_reserve_space(struct super_block *sb, int len, u32 *crc)
tl.fc_len = cpu_to_le16(remaining);
memcpy(dst, &tl, EXT4_FC_TAG_BASE_LEN);
memset(dst + EXT4_FC_TAG_BASE_LEN, 0, remaining);
- *crc = ext4_chksum(sbi, *crc, sbi->s_fc_bh->b_data, bsize);
+ *crc = ext4_chksum(*crc, sbi->s_fc_bh->b_data, bsize);
ext4_fc_submit_bh(sb, false);
@@ -774,7 +781,7 @@ static int ext4_fc_write_tail(struct super_block *sb, u32 crc)
tail.fc_tid = cpu_to_le32(sbi->s_journal->j_running_transaction->t_tid);
memcpy(dst, &tail.fc_tid, sizeof(tail.fc_tid));
dst += sizeof(tail.fc_tid);
- crc = ext4_chksum(sbi, crc, sbi->s_fc_bh->b_data,
+ crc = ext4_chksum(crc, sbi->s_fc_bh->b_data,
dst - (u8 *)sbi->s_fc_bh->b_data);
tail.fc_crc = cpu_to_le32(crc);
memcpy(dst, &tail.fc_crc, sizeof(tail.fc_crc));
@@ -893,15 +900,15 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
struct ext4_extent *ex;
int ret;
- mutex_lock(&ei->i_fc_lock);
+ spin_lock(&ei->i_fc_lock);
if (ei->i_fc_lblk_len == 0) {
- mutex_unlock(&ei->i_fc_lock);
+ spin_unlock(&ei->i_fc_lock);
return 0;
}
old_blk_size = ei->i_fc_lblk_start;
new_blk_size = ei->i_fc_lblk_start + ei->i_fc_lblk_len - 1;
ei->i_fc_lblk_len = 0;
- mutex_unlock(&ei->i_fc_lock);
+ spin_unlock(&ei->i_fc_lock);
cur_lblk_off = old_blk_size;
ext4_debug("will try writing %d to %d for inode %ld\n",
@@ -910,7 +917,9 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
while (cur_lblk_off <= new_blk_size) {
map.m_lblk = cur_lblk_off;
map.m_len = new_blk_size - cur_lblk_off + 1;
- ret = ext4_map_blocks(NULL, inode, &map, 0);
+ ret = ext4_map_blocks(NULL, inode, &map,
+ EXT4_GET_BLOCKS_IO_SUBMIT |
+ EXT4_EX_NOCACHE);
if (ret < 0)
return -ECANCELED;
@@ -954,69 +963,31 @@ static int ext4_fc_write_inode_data(struct inode *inode, u32 *crc)
}
-/* Submit data for all the fast commit inodes */
-static int ext4_fc_submit_inode_data_all(journal_t *journal)
+/* Flushes data of all the inodes in the commit queue. */
+static int ext4_fc_flush_data(journal_t *journal)
{
struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct ext4_inode_info *ei;
int ret = 0;
- spin_lock(&sbi->s_fc_lock);
list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
- ext4_set_inode_state(&ei->vfs_inode, EXT4_STATE_FC_COMMITTING);
- while (atomic_read(&ei->i_fc_updates)) {
- DEFINE_WAIT(wait);
-
- prepare_to_wait(&ei->i_fc_wait, &wait,
- TASK_UNINTERRUPTIBLE);
- if (atomic_read(&ei->i_fc_updates)) {
- spin_unlock(&sbi->s_fc_lock);
- schedule();
- spin_lock(&sbi->s_fc_lock);
- }
- finish_wait(&ei->i_fc_wait, &wait);
- }
- spin_unlock(&sbi->s_fc_lock);
ret = jbd2_submit_inode_data(journal, ei->jinode);
if (ret)
return ret;
- spin_lock(&sbi->s_fc_lock);
}
- spin_unlock(&sbi->s_fc_lock);
-
- return ret;
-}
-
-/* Wait for completion of data for all the fast commit inodes */
-static int ext4_fc_wait_inode_data_all(journal_t *journal)
-{
- struct super_block *sb = journal->j_private;
- struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_inode_info *pos, *n;
- int ret = 0;
-
- spin_lock(&sbi->s_fc_lock);
- list_for_each_entry_safe(pos, n, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
- if (!ext4_test_inode_state(&pos->vfs_inode,
- EXT4_STATE_FC_COMMITTING))
- continue;
- spin_unlock(&sbi->s_fc_lock);
- ret = jbd2_wait_inode_data(journal, pos->jinode);
+ list_for_each_entry(ei, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ ret = jbd2_wait_inode_data(journal, ei->jinode);
if (ret)
return ret;
- spin_lock(&sbi->s_fc_lock);
}
- spin_unlock(&sbi->s_fc_lock);
return 0;
}
/* Commit all the directory entry updates */
static int ext4_fc_commit_dentry_updates(journal_t *journal, u32 *crc)
-__acquires(&sbi->s_fc_lock)
-__releases(&sbi->s_fc_lock)
{
struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
@@ -1030,26 +1001,22 @@ __releases(&sbi->s_fc_lock)
list_for_each_entry_safe(fc_dentry, fc_dentry_n,
&sbi->s_fc_dentry_q[FC_Q_MAIN], fcd_list) {
if (fc_dentry->fcd_op != EXT4_FC_TAG_CREAT) {
- spin_unlock(&sbi->s_fc_lock);
- if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
- ret = -ENOSPC;
- goto lock_and_exit;
- }
- spin_lock(&sbi->s_fc_lock);
+ if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry))
+ return -ENOSPC;
continue;
}
/*
* With fcd_dilist we need not loop in sbi->s_fc_q to get the
- * corresponding inode pointer
+ * corresponding inode. Also, the corresponding inode could have been
+ * deleted, in which case, we don't need to do anything.
*/
- WARN_ON(list_empty(&fc_dentry->fcd_dilist));
+ if (list_empty(&fc_dentry->fcd_dilist))
+ continue;
ei = list_first_entry(&fc_dentry->fcd_dilist,
struct ext4_inode_info, i_fc_dilist);
inode = &ei->vfs_inode;
WARN_ON(inode->i_ino != fc_dentry->fcd_ino);
- spin_unlock(&sbi->s_fc_lock);
-
/*
* We first write the inode and then the create dirent. This
* allows the recovery code to create an unnamed inode first
@@ -1059,23 +1026,14 @@ __releases(&sbi->s_fc_lock)
*/
ret = ext4_fc_write_inode(inode, crc);
if (ret)
- goto lock_and_exit;
-
+ return ret;
ret = ext4_fc_write_inode_data(inode, crc);
if (ret)
- goto lock_and_exit;
-
- if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry)) {
- ret = -ENOSPC;
- goto lock_and_exit;
- }
-
- spin_lock(&sbi->s_fc_lock);
+ return ret;
+ if (!ext4_fc_add_dentry_tlv(sb, crc, fc_dentry))
+ return -ENOSPC;
}
return 0;
-lock_and_exit:
- spin_lock(&sbi->s_fc_lock);
- return ret;
}
static int ext4_fc_perform_commit(journal_t *journal)
@@ -1089,26 +1047,81 @@ static int ext4_fc_perform_commit(journal_t *journal)
int ret = 0;
u32 crc = 0;
- ret = ext4_fc_submit_inode_data_all(journal);
- if (ret)
- return ret;
+ /*
+ * Step 1: Mark all inodes on s_fc_q[MAIN] with
+ * EXT4_STATE_FC_FLUSHING_DATA. This prevents these inodes from being
+ * freed until the data flush is over.
+ */
+ mutex_lock(&sbi->s_fc_lock);
+ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ ext4_set_inode_state(&iter->vfs_inode,
+ EXT4_STATE_FC_FLUSHING_DATA);
+ }
+ mutex_unlock(&sbi->s_fc_lock);
+
+ /* Step 2: Flush data for all the eligible inodes. */
+ ret = ext4_fc_flush_data(journal);
- ret = ext4_fc_wait_inode_data_all(journal);
+ /*
+ * Step 3: Clear EXT4_STATE_FC_FLUSHING_DATA flag, before returning
+ * any error from step 2. This ensures that waiters waiting on
+ * EXT4_STATE_FC_FLUSHING_DATA can resume.
+ */
+ mutex_lock(&sbi->s_fc_lock);
+ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ ext4_clear_inode_state(&iter->vfs_inode,
+ EXT4_STATE_FC_FLUSHING_DATA);
+#if (BITS_PER_LONG < 64)
+ wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_FLUSHING_DATA);
+#else
+ wake_up_bit(&iter->i_flags, EXT4_STATE_FC_FLUSHING_DATA);
+#endif
+ }
+
+ /*
+ * Make sure clearing of EXT4_STATE_FC_FLUSHING_DATA is visible before
+ * the waiter checks the bit. Pairs with implicit barrier in
+ * prepare_to_wait() in ext4_fc_del().
+ */
+ smp_mb();
+ mutex_unlock(&sbi->s_fc_lock);
+
+ /*
+ * If we encountered error in Step 2, return it now after clearing
+ * EXT4_STATE_FC_FLUSHING_DATA bit.
+ */
if (ret)
return ret;
+
+ /* Step 4: Mark all inodes as being committed. */
+ jbd2_journal_lock_updates(journal);
/*
- * If file system device is different from journal device, issue a cache
- * flush before we start writing fast commit blocks.
+ * The journal is now locked. No more handles can start and all the
+ * previous handles are now drained. We now mark the inodes on the
+ * commit queue as being committed.
+ */
+ mutex_lock(&sbi->s_fc_lock);
+ list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
+ ext4_set_inode_state(&iter->vfs_inode,
+ EXT4_STATE_FC_COMMITTING);
+ }
+ mutex_unlock(&sbi->s_fc_lock);
+ jbd2_journal_unlock_updates(journal);
+
+ /*
+ * Step 5: If file system device is different from journal device,
+ * issue a cache flush before we start writing fast commit blocks.
*/
if (journal->j_fs_dev != journal->j_dev)
blkdev_issue_flush(journal->j_fs_dev);
blk_start_plug(&plug);
+ /* Step 6: Write fast commit blocks to disk. */
if (sbi->s_fc_bytes == 0) {
/*
- * Add a head tag only if this is the first fast commit
- * in this TID.
+ * Step 6.1: Add a head tag only if this is the first fast
+ * commit in this TID.
*/
head.fc_features = cpu_to_le32(EXT4_FC_SUPPORTED_FEATURES);
head.fc_tid = cpu_to_le32(
@@ -1120,32 +1133,30 @@ static int ext4_fc_perform_commit(journal_t *journal)
}
}
- spin_lock(&sbi->s_fc_lock);
+ /* Step 6.2: Now write all the dentry updates. */
+ mutex_lock(&sbi->s_fc_lock);
ret = ext4_fc_commit_dentry_updates(journal, &crc);
- if (ret) {
- spin_unlock(&sbi->s_fc_lock);
+ if (ret)
goto out;
- }
+ /* Step 6.3: Now write all the changed inodes to disk. */
list_for_each_entry(iter, &sbi->s_fc_q[FC_Q_MAIN], i_fc_list) {
inode = &iter->vfs_inode;
if (!ext4_test_inode_state(inode, EXT4_STATE_FC_COMMITTING))
continue;
- spin_unlock(&sbi->s_fc_lock);
ret = ext4_fc_write_inode_data(inode, &crc);
if (ret)
goto out;
ret = ext4_fc_write_inode(inode, &crc);
if (ret)
goto out;
- spin_lock(&sbi->s_fc_lock);
}
- spin_unlock(&sbi->s_fc_lock);
-
+ /* Step 6.4: Finally write tail tag to conclude this fast commit. */
ret = ext4_fc_write_tail(sb, crc);
out:
+ mutex_unlock(&sbi->s_fc_lock);
blk_finish_plug(&plug);
return ret;
}
@@ -1191,6 +1202,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
int subtid = atomic_read(&sbi->s_fc_subtid);
int status = EXT4_FC_STATUS_OK, fc_bufs_before = 0;
ktime_t start_time, commit_time;
+ int old_ioprio, journal_ioprio;
if (!test_opt2(sb, JOURNAL_FAST_COMMIT))
return jbd2_complete_transaction(journal, commit_tid);
@@ -1198,6 +1210,7 @@ int ext4_fc_commit(journal_t *journal, tid_t commit_tid)
trace_ext4_fc_commit_start(sb, commit_tid);
start_time = ktime_get();
+ old_ioprio = get_current_ioprio();
restart_fc:
ret = jbd2_fc_begin_commit(journal, commit_tid);
@@ -1228,6 +1241,15 @@ restart_fc:
goto fallback;
}
+ /*
+ * Now that we know that this thread is going to do a fast commit,
+ * elevate the priority to match that of the journal thread.
+ */
+ if (journal->j_task->io_context)
+ journal_ioprio = sbi->s_journal->j_task->io_context->ioprio;
+ else
+ journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
+ set_task_ioprio(current, journal_ioprio);
fc_bufs_before = (sbi->s_fc_bytes + bsize - 1) / bsize;
ret = ext4_fc_perform_commit(journal);
if (ret < 0) {
@@ -1242,6 +1264,7 @@ restart_fc:
}
atomic_inc(&sbi->s_fc_subtid);
ret = jbd2_fc_end_commit(journal);
+ set_task_ioprio(current, old_ioprio);
/*
* weight the commit time higher than the average time so we
* don't react too strongly to vast changes in the commit time
@@ -1251,6 +1274,7 @@ restart_fc:
return ret;
fallback:
+ set_task_ioprio(current, old_ioprio);
ret = jbd2_fc_end_commit_fallback(journal);
ext4_fc_update_stats(sb, status, 0, 0, commit_tid);
return ret;
@@ -1264,7 +1288,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
{
struct super_block *sb = journal->j_private;
struct ext4_sb_info *sbi = EXT4_SB(sb);
- struct ext4_inode_info *iter, *iter_n;
+ struct ext4_inode_info *ei;
struct ext4_fc_dentry_update *fc_dentry;
if (full && sbi->s_fc_bh)
@@ -1273,14 +1297,16 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
trace_ext4_fc_cleanup(journal, full, tid);
jbd2_fc_release_bufs(journal);
- spin_lock(&sbi->s_fc_lock);
- list_for_each_entry_safe(iter, iter_n, &sbi->s_fc_q[FC_Q_MAIN],
- i_fc_list) {
- list_del_init(&iter->i_fc_list);
- ext4_clear_inode_state(&iter->vfs_inode,
+ mutex_lock(&sbi->s_fc_lock);
+ while (!list_empty(&sbi->s_fc_q[FC_Q_MAIN])) {
+ ei = list_first_entry(&sbi->s_fc_q[FC_Q_MAIN],
+ struct ext4_inode_info,
+ i_fc_list);
+ list_del_init(&ei->i_fc_list);
+ ext4_clear_inode_state(&ei->vfs_inode,
EXT4_STATE_FC_COMMITTING);
- if (tid_geq(tid, iter->i_sync_tid)) {
- ext4_fc_reset_inode(&iter->vfs_inode);
+ if (tid_geq(tid, ei->i_sync_tid)) {
+ ext4_fc_reset_inode(&ei->vfs_inode);
} else if (full) {
/*
* We are called after a full commit, inode has been
@@ -1291,15 +1317,19 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
* time in that case (and tid doesn't increase so
* tid check above isn't reliable).
*/
- list_add_tail(&EXT4_I(&iter->vfs_inode)->i_fc_list,
+ list_add_tail(&ei->i_fc_list,
&sbi->s_fc_q[FC_Q_STAGING]);
}
- /* Make sure EXT4_STATE_FC_COMMITTING bit is clear */
+ /*
+ * Make sure clearing of EXT4_STATE_FC_COMMITTING is
+ * visible before we send the wakeup. Pairs with implicit
+ * barrier in prepare_to_wait() in ext4_fc_track_inode().
+ */
smp_mb();
#if (BITS_PER_LONG < 64)
- wake_up_bit(&iter->i_state_flags, EXT4_STATE_FC_COMMITTING);
+ wake_up_bit(&ei->i_state_flags, EXT4_STATE_FC_COMMITTING);
#else
- wake_up_bit(&iter->i_flags, EXT4_STATE_FC_COMMITTING);
+ wake_up_bit(&ei->i_flags, EXT4_STATE_FC_COMMITTING);
#endif
}
@@ -1309,11 +1339,9 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
fcd_list);
list_del_init(&fc_dentry->fcd_list);
list_del_init(&fc_dentry->fcd_dilist);
- spin_unlock(&sbi->s_fc_lock);
release_dentry_name_snapshot(&fc_dentry->fcd_name);
kmem_cache_free(ext4_fc_dentry_cachep, fc_dentry);
- spin_lock(&sbi->s_fc_lock);
}
list_splice_init(&sbi->s_fc_dentry_q[FC_Q_STAGING],
@@ -1328,7 +1356,7 @@ static void ext4_fc_cleanup(journal_t *journal, int full, tid_t tid)
if (full)
sbi->s_fc_bytes = 0;
- spin_unlock(&sbi->s_fc_lock);
+ mutex_unlock(&sbi->s_fc_lock);
trace_ext4_fc_stats(sb);
}
@@ -2105,13 +2133,13 @@ static int ext4_fc_replay_scan(journal_t *journal,
case EXT4_FC_TAG_INODE:
case EXT4_FC_TAG_PAD:
state->fc_cur_tag++;
- state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
+ state->fc_crc = ext4_chksum(state->fc_crc, cur,
EXT4_FC_TAG_BASE_LEN + tl.fc_len);
break;
case EXT4_FC_TAG_TAIL:
state->fc_cur_tag++;
memcpy(&tail, val, sizeof(tail));
- state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
+ state->fc_crc = ext4_chksum(state->fc_crc, cur,
EXT4_FC_TAG_BASE_LEN +
offsetof(struct ext4_fc_tail,
fc_crc));
@@ -2138,7 +2166,7 @@ static int ext4_fc_replay_scan(journal_t *journal,
break;
}
state->fc_cur_tag++;
- state->fc_crc = ext4_chksum(sbi, state->fc_crc, cur,
+ state->fc_crc = ext4_chksum(state->fc_crc, cur,
EXT4_FC_TAG_BASE_LEN + tl.fc_len);
break;
default:
diff --git a/fs/ext4/file.c b/fs/ext4/file.c
index beb078ee4811..21df81347147 100644
--- a/fs/ext4/file.c
+++ b/fs/ext4/file.c
@@ -377,7 +377,12 @@ static int ext4_dio_write_end_io(struct kiocb *iocb, ssize_t size,
loff_t pos = iocb->ki_pos;
struct inode *inode = file_inode(iocb->ki_filp);
- if (!error && size && flags & IOMAP_DIO_UNWRITTEN)
+
+ if (!error && size && (flags & IOMAP_DIO_UNWRITTEN) &&
+ (iocb->ki_flags & IOCB_ATOMIC))
+ error = ext4_convert_unwritten_extents_atomic(NULL, inode, pos,
+ size);
+ else if (!error && size && flags & IOMAP_DIO_UNWRITTEN)
error = ext4_convert_unwritten_extents(NULL, inode, pos, size);
if (error)
return error;
@@ -929,12 +934,7 @@ static int ext4_file_open(struct inode *inode, struct file *filp)
loff_t ext4_llseek(struct file *file, loff_t offset, int whence)
{
struct inode *inode = file->f_mapping->host;
- loff_t maxbytes;
-
- if (!(ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS)))
- maxbytes = EXT4_SB(inode->i_sb)->s_bitmap_maxbytes;
- else
- maxbytes = inode->i_sb->s_maxbytes;
+ loff_t maxbytes = ext4_get_maxbytes(inode);
switch (whence) {
default:
diff --git a/fs/ext4/ialloc.c b/fs/ext4/ialloc.c
index e7ecc7c8a729..79aa3df8d019 100644
--- a/fs/ext4/ialloc.c
+++ b/fs/ext4/ialloc.c
@@ -1288,10 +1288,9 @@ got:
__u32 csum;
__le32 inum = cpu_to_le32(inode->i_ino);
__le32 gen = cpu_to_le32(inode->i_generation);
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum,
sizeof(inum));
- ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
- sizeof(gen));
+ ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen));
}
ext4_clear_state_flags(ei); /* Only relevant on 32-bit archs */
@@ -1336,6 +1335,9 @@ got:
}
}
+ if (ext4_should_enable_large_folio(inode))
+ mapping_set_large_folios(inode->i_mapping);
+
ext4_update_inode_fsync_trans(handle, inode, 1);
err = ext4_mark_inode_dirty(handle, inode);
diff --git a/fs/ext4/inline.c b/fs/ext4/inline.c
index 2c9b762925c7..a1bbcdf40824 100644
--- a/fs/ext4/inline.c
+++ b/fs/ext4/inline.c
@@ -397,7 +397,7 @@ out:
}
static int ext4_prepare_inline_data(handle_t *handle, struct inode *inode,
- unsigned int len)
+ loff_t len)
{
int ret, size, no_expand;
struct ext4_inode_info *ei = EXT4_I(inode);
@@ -601,6 +601,7 @@ retry:
goto out;
}
+ ext4_fc_track_inode(handle, inode);
ret = ext4_destroy_inline_data_nolock(handle, inode);
if (ret)
goto out;
diff --git a/fs/ext4/inode.c b/fs/ext4/inode.c
index cdf01e60fa6d..be9a4cba35fd 100644
--- a/fs/ext4/inode.c
+++ b/fs/ext4/inode.c
@@ -58,29 +58,27 @@ static void ext4_journalled_zero_new_buffers(handle_t *handle,
static __u32 ext4_inode_csum(struct inode *inode, struct ext4_inode *raw,
struct ext4_inode_info *ei)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
__u32 csum;
__u16 dummy_csum = 0;
int offset = offsetof(struct ext4_inode, i_checksum_lo);
unsigned int csum_size = sizeof(dummy_csum);
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)raw, offset);
- csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, csum_size);
+ csum = ext4_chksum(ei->i_csum_seed, (__u8 *)raw, offset);
+ csum = ext4_chksum(csum, (__u8 *)&dummy_csum, csum_size);
offset += csum_size;
- csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
+ csum = ext4_chksum(csum, (__u8 *)raw + offset,
EXT4_GOOD_OLD_INODE_SIZE - offset);
if (EXT4_INODE_SIZE(inode->i_sb) > EXT4_GOOD_OLD_INODE_SIZE) {
offset = offsetof(struct ext4_inode, i_checksum_hi);
- csum = ext4_chksum(sbi, csum, (__u8 *)raw +
- EXT4_GOOD_OLD_INODE_SIZE,
+ csum = ext4_chksum(csum, (__u8 *)raw + EXT4_GOOD_OLD_INODE_SIZE,
offset - EXT4_GOOD_OLD_INODE_SIZE);
if (EXT4_FITS_IN_INODE(raw, ei, i_checksum_hi)) {
- csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum,
+ csum = ext4_chksum(csum, (__u8 *)&dummy_csum,
csum_size);
offset += csum_size;
}
- csum = ext4_chksum(sbi, csum, (__u8 *)raw + offset,
+ csum = ext4_chksum(csum, (__u8 *)raw + offset,
EXT4_INODE_SIZE(inode->i_sb) - offset);
}
@@ -142,9 +140,6 @@ static inline int ext4_begin_ordered_truncate(struct inode *inode,
new_size);
}
-static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
- int pextents);
-
/*
* Test whether an inode is a fast symlink.
* A fast symlink has its symlink data stored in ext4_inode_info->i_data.
@@ -416,6 +411,32 @@ int ext4_issue_zeroout(struct inode *inode, ext4_lblk_t lblk, ext4_fsblk_t pblk,
return ret;
}
+/*
+ * For generic regular files, when updating the extent tree, Ext4 should
+ * hold the i_rwsem and invalidate_lock exclusively. This ensures
+ * exclusion against concurrent page faults, as well as reads and writes.
+ */
+#ifdef CONFIG_EXT4_DEBUG
+void ext4_check_map_extents_env(struct inode *inode)
+{
+ if (EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY)
+ return;
+
+ if (!S_ISREG(inode->i_mode) ||
+ IS_NOQUOTA(inode) || IS_VERITY(inode) ||
+ is_special_ino(inode->i_sb, inode->i_ino) ||
+ (inode->i_state & (I_FREEING | I_WILL_FREE | I_NEW)) ||
+ ext4_test_inode_flag(inode, EXT4_INODE_EA_INODE) ||
+ ext4_verity_in_progress(inode))
+ return;
+
+ WARN_ON_ONCE(!inode_is_locked(inode) &&
+ !rwsem_is_locked(&inode->i_mapping->invalidate_lock));
+}
+#else
+void ext4_check_map_extents_env(struct inode *inode) {}
+#endif
+
#define check_block_validity(inode, map) \
__check_block_validity((inode), __func__, __LINE__, (map))
@@ -462,16 +483,73 @@ static void ext4_map_blocks_es_recheck(handle_t *handle,
}
#endif /* ES_AGGRESSIVE_TEST */
+static int ext4_map_query_blocks_next_in_leaf(handle_t *handle,
+ struct inode *inode, struct ext4_map_blocks *map,
+ unsigned int orig_mlen)
+{
+ struct ext4_map_blocks map2;
+ unsigned int status, status2;
+ int retval;
+
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+
+ WARN_ON_ONCE(!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF));
+ WARN_ON_ONCE(orig_mlen <= map->m_len);
+
+ /* Prepare map2 for lookup in next leaf block */
+ map2.m_lblk = map->m_lblk + map->m_len;
+ map2.m_len = orig_mlen - map->m_len;
+ map2.m_flags = 0;
+ retval = ext4_ext_map_blocks(handle, inode, &map2, 0);
+
+ if (retval <= 0) {
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status, false);
+ return map->m_len;
+ }
+
+ if (unlikely(retval != map2.m_len)) {
+ ext4_warning(inode->i_sb,
+ "ES len assertion failed for inode "
+ "%lu: retval %d != map->m_len %d",
+ inode->i_ino, retval, map2.m_len);
+ WARN_ON(1);
+ }
+
+ status2 = map2.m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+
+ /*
+ * If map2 is contiguous with map, then let's insert it as a single
+ * extent in es cache and return the combined length of both the maps.
+ */
+ if (map->m_pblk + map->m_len == map2.m_pblk &&
+ status == status2) {
+ ext4_es_insert_extent(inode, map->m_lblk,
+ map->m_len + map2.m_len, map->m_pblk,
+ status, false);
+ map->m_len += map2.m_len;
+ } else {
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status, false);
+ }
+
+ return map->m_len;
+}
+
static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
- struct ext4_map_blocks *map)
+ struct ext4_map_blocks *map, int flags)
{
unsigned int status;
int retval;
+ unsigned int orig_mlen = map->m_len;
+ flags &= EXT4_EX_QUERY_FILTER;
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
- retval = ext4_ext_map_blocks(handle, inode, map, 0);
+ retval = ext4_ext_map_blocks(handle, inode, map, flags);
else
- retval = ext4_ind_map_blocks(handle, inode, map, 0);
+ retval = ext4_ind_map_blocks(handle, inode, map, flags);
if (retval <= 0)
return retval;
@@ -484,11 +562,22 @@ static int ext4_map_query_blocks(handle_t *handle, struct inode *inode,
WARN_ON(1);
}
- status = map->m_flags & EXT4_MAP_UNWRITTEN ?
- EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
- ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
- map->m_pblk, status, false);
- return retval;
+ /*
+ * No need to query next in leaf:
+ * - if returned extent is not last in leaf or
+ * - if the last in leaf is the full requested range
+ */
+ if (!(map->m_flags & EXT4_MAP_QUERY_LAST_IN_LEAF) ||
+ map->m_len == orig_mlen) {
+ status = map->m_flags & EXT4_MAP_UNWRITTEN ?
+ EXTENT_STATUS_UNWRITTEN : EXTENT_STATUS_WRITTEN;
+ ext4_es_insert_extent(inode, map->m_lblk, map->m_len,
+ map->m_pblk, status, false);
+ return retval;
+ }
+
+ return ext4_map_query_blocks_next_in_leaf(handle, inode, map,
+ orig_mlen);
}
static int ext4_map_create_blocks(handle_t *handle, struct inode *inode,
@@ -602,6 +691,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
struct extent_status es;
int retval;
int ret = 0;
+ unsigned int orig_mlen = map->m_len;
#ifdef ES_AGGRESSIVE_TEST
struct ext4_map_blocks orig_map;
@@ -622,6 +712,16 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
if (unlikely(map->m_lblk >= EXT_MAX_BLOCKS))
return -EFSCORRUPTED;
+ /*
+ * Callers from the context of data submission are the only exceptions
+ * for regular files that do not hold the i_rwsem or invalidate_lock.
+ * However, caching unrelated ranges is not permitted.
+ */
+ if (flags & EXT4_GET_BLOCKS_IO_SUBMIT)
+ WARN_ON_ONCE(!(flags & EXT4_EX_NOCACHE));
+ else
+ ext4_check_map_extents_env(inode);
+
/* Lookup extent status tree firstly */
if (!(EXT4_SB(inode->i_sb)->s_mount_state & EXT4_FC_REPLAY) &&
ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
@@ -653,7 +753,12 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
ext4_map_blocks_es_recheck(handle, inode, map,
&orig_map, flags);
#endif
- goto found;
+ if (!(flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF) ||
+ orig_mlen == map->m_len)
+ goto found;
+
+ if (flags & EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF)
+ map->m_len = orig_mlen;
}
/*
* In the query cache no-wait mode, nothing we can do more if we
@@ -667,7 +772,7 @@ int ext4_map_blocks(handle_t *handle, struct inode *inode,
* file system block.
*/
down_read(&EXT4_I(inode)->i_data_sem);
- retval = ext4_map_query_blocks(handle, inode, map);
+ retval = ext4_map_query_blocks(handle, inode, map, flags);
up_read((&EXT4_I(inode)->i_data_sem));
found:
@@ -696,6 +801,8 @@ found:
if (!(flags & EXT4_GET_BLOCKS_CONVERT_UNWRITTEN))
return retval;
+
+ ext4_fc_track_inode(handle, inode);
/*
* New blocks allocate and/or writing to unwritten extent
* will possibly result in updating i_data, so we take
@@ -1009,7 +1116,12 @@ int ext4_walk_page_buffers(handle_t *handle, struct inode *inode,
*/
static int ext4_dirty_journalled_data(handle_t *handle, struct buffer_head *bh)
{
- folio_mark_dirty(bh->b_folio);
+ struct folio *folio = bh->b_folio;
+ struct inode *inode = folio->mapping->host;
+
+ /* only regular files have a_ops */
+ if (S_ISREG(inode->i_mode))
+ folio_mark_dirty(folio);
return ext4_handle_dirty_metadata(handle, NULL, bh);
}
@@ -1027,7 +1139,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
loff_t pos, unsigned len,
get_block_t *get_block)
{
- unsigned from = pos & (PAGE_SIZE - 1);
+ unsigned int from = offset_in_folio(folio, pos);
unsigned to = from + len;
struct inode *inode = folio->mapping->host;
unsigned block_start, block_end;
@@ -1041,8 +1153,7 @@ int ext4_block_write_begin(handle_t *handle, struct folio *folio,
bool should_journal_data = ext4_should_journal_data(inode);
BUG_ON(!folio_test_locked(folio));
- BUG_ON(from > PAGE_SIZE);
- BUG_ON(to > PAGE_SIZE);
+ BUG_ON(to > folio_size(folio));
BUG_ON(from > to);
head = folio_buffers(folio);
@@ -1152,6 +1263,7 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
struct folio *folio;
pgoff_t index;
unsigned from, to;
+ fgf_t fgp = FGP_WRITEBEGIN;
ret = ext4_emergency_state(inode->i_sb);
if (unlikely(ret))
@@ -1164,8 +1276,6 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
*/
needed_blocks = ext4_writepage_trans_blocks(inode) + 1;
index = pos >> PAGE_SHIFT;
- from = pos & (PAGE_SIZE - 1);
- to = from + len;
if (ext4_test_inode_state(inode, EXT4_STATE_MAY_INLINE_DATA)) {
ret = ext4_try_to_write_inline_data(mapping, inode, pos, len,
@@ -1184,10 +1294,18 @@ static int ext4_write_begin(struct file *file, struct address_space *mapping,
* the folio (if needed) without using GFP_NOFS.
*/
retry_grab:
- folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
- mapping_gfp_mask(mapping));
+ fgp |= fgf_set_order(len);
+ folio = __filemap_get_folio(mapping, index, fgp,
+ mapping_gfp_mask(mapping));
if (IS_ERR(folio))
return PTR_ERR(folio);
+
+ if (pos + len > folio_pos(folio) + folio_size(folio))
+ len = folio_pos(folio) + folio_size(folio) - pos;
+
+ from = offset_in_folio(folio, pos);
+ to = from + len;
+
/*
* The same as page allocation, we prealloc buffer heads before
* starting the handle.
@@ -1765,6 +1883,8 @@ static int ext4_da_map_blocks(struct inode *inode, struct ext4_map_blocks *map)
ext_debug(inode, "max_blocks %u, logical block %lu\n", map->m_len,
(unsigned long) map->m_lblk);
+ ext4_check_map_extents_env(inode);
+
/* Lookup extent status tree firstly */
if (ext4_es_lookup_extent(inode, map->m_lblk, NULL, &es)) {
map->m_len = min_t(unsigned int, map->m_len,
@@ -1805,7 +1925,7 @@ found:
if (ext4_has_inline_data(inode))
retval = 0;
else
- retval = ext4_map_query_blocks(NULL, inode, map);
+ retval = ext4_map_query_blocks(NULL, inode, map, 0);
up_read(&EXT4_I(inode)->i_data_sem);
if (retval)
return retval < 0 ? retval : 0;
@@ -1828,7 +1948,7 @@ add_delayed:
goto found;
}
} else if (!ext4_has_inline_data(inode)) {
- retval = ext4_map_query_blocks(NULL, inode, map);
+ retval = ext4_map_query_blocks(NULL, inode, map, 0);
if (retval) {
up_write(&EXT4_I(inode)->i_data_sem);
return retval < 0 ? retval : 0;
@@ -1936,7 +2056,7 @@ static int mpage_submit_folio(struct mpage_da_data *mpd, struct folio *folio)
len = size & (len - 1);
err = ext4_bio_write_folio(&mpd->io_submit, folio, len);
if (!err)
- mpd->wbc->nr_to_write--;
+ mpd->wbc->nr_to_write -= folio_nr_pages(folio);
return err;
}
@@ -2159,7 +2279,6 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
start = mpd->map.m_lblk >> bpp_bits;
end = (mpd->map.m_lblk + mpd->map.m_len - 1) >> bpp_bits;
- lblk = start << bpp_bits;
pblock = mpd->map.m_pblk;
folio_batch_init(&fbatch);
@@ -2170,6 +2289,7 @@ static int mpage_map_and_submit_buffers(struct mpage_da_data *mpd)
for (i = 0; i < nr; i++) {
struct folio *folio = fbatch.folios[i];
+ lblk = folio->index << bpp_bits;
err = mpage_process_folio(mpd, folio, &lblk, &pblock,
&map_bh);
/*
@@ -2212,11 +2332,15 @@ static int mpage_map_one_extent(handle_t *handle, struct mpage_da_data *mpd)
* previously reserved. However we must not fail because we're in
* writeback and there is nothing we can do about it so it might result
* in data loss. So use reserved blocks to allocate metadata if
- * possible.
+ * possible. In addition, do not cache any unrelated extents, as it
+ * only holds the folio lock but does not hold the i_rwsem or
+ * invalidate_lock, which could corrupt the extent status tree.
*/
get_blocks_flags = EXT4_GET_BLOCKS_CREATE |
EXT4_GET_BLOCKS_METADATA_NOFAIL |
- EXT4_GET_BLOCKS_IO_SUBMIT;
+ EXT4_GET_BLOCKS_IO_SUBMIT |
+ EXT4_EX_NOCACHE;
+
dioread_nolock = ext4_should_dioread_nolock(inode);
if (dioread_nolock)
get_blocks_flags |= EXT4_GET_BLOCKS_IO_CREATE_EXT;
@@ -2355,7 +2479,7 @@ update_disksize:
*/
static int ext4_da_writepages_trans_blocks(struct inode *inode)
{
- int bpp = ext4_journal_blocks_per_page(inode);
+ int bpp = ext4_journal_blocks_per_folio(inode);
return ext4_meta_trans_blocks(inode,
MAX_WRITEPAGES_EXTENT_LEN + bpp - 1, bpp);
@@ -2391,7 +2515,7 @@ static int mpage_journal_page_buffers(handle_t *handle,
size_t len = folio_size(folio);
folio_clear_checked(folio);
- mpd->wbc->nr_to_write--;
+ mpd->wbc->nr_to_write -= folio_nr_pages(folio);
if (folio_pos(folio) + len > size &&
!ext4_verity_in_progress(inode))
@@ -2433,7 +2557,7 @@ static int mpage_prepare_extent_to_map(struct mpage_da_data *mpd)
ext4_lblk_t lblk;
struct buffer_head *head;
handle_t *handle = NULL;
- int bpp = ext4_journal_blocks_per_page(mpd->inode);
+ int bpp = ext4_journal_blocks_per_folio(mpd->inode);
if (mpd->wbc->sync_mode == WB_SYNC_ALL || mpd->wbc->tagged_writepages)
tag = PAGECACHE_TAG_TOWRITE;
@@ -2920,6 +3044,7 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
struct folio *folio;
pgoff_t index;
struct inode *inode = mapping->host;
+ fgf_t fgp = FGP_WRITEBEGIN;
ret = ext4_emergency_state(inode->i_sb);
if (unlikely(ret))
@@ -2945,11 +3070,15 @@ static int ext4_da_write_begin(struct file *file, struct address_space *mapping,
}
retry:
- folio = __filemap_get_folio(mapping, index, FGP_WRITEBEGIN,
- mapping_gfp_mask(mapping));
+ fgp |= fgf_set_order(len);
+ folio = __filemap_get_folio(mapping, index, fgp,
+ mapping_gfp_mask(mapping));
if (IS_ERR(folio))
return PTR_ERR(folio);
+ if (pos + len > folio_pos(folio) + folio_size(folio))
+ len = folio_pos(folio) + folio_size(folio) - pos;
+
ret = ext4_block_write_begin(NULL, folio, pos, len,
ext4_da_get_block_prep);
if (ret < 0) {
@@ -3038,7 +3167,7 @@ static int ext4_da_do_write_end(struct address_space *mapping,
unsigned long end;
i_size_write(inode, new_i_size);
- end = (new_i_size - 1) & (PAGE_SIZE - 1);
+ end = offset_in_folio(folio, new_i_size - 1);
if (copied && ext4_da_should_update_i_disksize(folio, end)) {
ext4_update_i_disksize(inode, new_i_size);
disksize_changed = true;
@@ -3340,12 +3469,149 @@ static void ext4_set_iomap(struct inode *inode, struct iomap *iomap,
}
}
+static int ext4_map_blocks_atomic_write_slow(handle_t *handle,
+ struct inode *inode, struct ext4_map_blocks *map)
+{
+ ext4_lblk_t m_lblk = map->m_lblk;
+ unsigned int m_len = map->m_len;
+ unsigned int mapped_len = 0, m_flags = 0;
+ ext4_fsblk_t next_pblk;
+ bool check_next_pblk = false;
+ int ret = 0;
+
+ WARN_ON_ONCE(!ext4_has_feature_bigalloc(inode->i_sb));
+
+ /*
+ * This is a slow path in case of mixed mapping. We use
+ * EXT4_GET_BLOCKS_CREATE_ZERO flag here to make sure we get a single
+ * contiguous mapped mapping. This will ensure any unwritten or hole
+ * regions within the requested range is zeroed out and we return
+ * a single contiguous mapped extent.
+ */
+ m_flags = EXT4_GET_BLOCKS_CREATE_ZERO;
+
+ do {
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
+ if (ret < 0 && ret != -ENOSPC)
+ goto out_err;
+ /*
+ * This should never happen, but let's return an error code to
+ * avoid an infinite loop in here.
+ */
+ if (ret == 0) {
+ ret = -EFSCORRUPTED;
+ ext4_warning_inode(inode,
+ "ext4_map_blocks() couldn't allocate blocks m_flags: 0x%x, ret:%d",
+ m_flags, ret);
+ goto out_err;
+ }
+ /*
+ * With bigalloc we should never get ENOSPC nor discontiguous
+ * physical extents.
+ */
+ if ((check_next_pblk && next_pblk != map->m_pblk) ||
+ ret == -ENOSPC) {
+ ext4_warning_inode(inode,
+ "Non-contiguous allocation detected: expected %llu, got %llu, "
+ "or ext4_map_blocks() returned out of space ret: %d",
+ next_pblk, map->m_pblk, ret);
+ ret = -EFSCORRUPTED;
+ goto out_err;
+ }
+ next_pblk = map->m_pblk + map->m_len;
+ check_next_pblk = true;
+
+ mapped_len += map->m_len;
+ map->m_lblk += map->m_len;
+ map->m_len = m_len - mapped_len;
+ } while (mapped_len < m_len);
+
+ /*
+ * We might have done some work in above loop, so we need to query the
+ * start of the physical extent, based on the origin m_lblk and m_len.
+ * Let's also ensure we were able to allocate the required range for
+ * mixed mapping case.
+ */
+ map->m_lblk = m_lblk;
+ map->m_len = m_len;
+ map->m_flags = 0;
+
+ ret = ext4_map_blocks(handle, inode, map,
+ EXT4_GET_BLOCKS_QUERY_LAST_IN_LEAF);
+ if (ret != m_len) {
+ ext4_warning_inode(inode,
+ "allocation failed for atomic write request m_lblk:%u, m_len:%u, ret:%d\n",
+ m_lblk, m_len, ret);
+ ret = -EINVAL;
+ }
+ return ret;
+
+out_err:
+ /* reset map before returning an error */
+ map->m_lblk = m_lblk;
+ map->m_len = m_len;
+ map->m_flags = 0;
+ return ret;
+}
+
+/*
+ * ext4_map_blocks_atomic: Helper routine to ensure the entire requested
+ * range in @map [lblk, lblk + len) is one single contiguous extent with no
+ * mixed mappings.
+ *
+ * We first use m_flags passed to us by our caller (ext4_iomap_alloc()).
+ * We only call EXT4_GET_BLOCKS_ZERO in the slow path, when the underlying
+ * physical extent for the requested range does not have a single contiguous
+ * mapping type i.e. (Hole, Mapped, or Unwritten) throughout.
+ * In that case we will loop over the requested range to allocate and zero out
+ * the unwritten / holes in between, to get a single mapped extent from
+ * [m_lblk, m_lblk + m_len). Note that this is only possible because we know
+ * this can be called only with bigalloc enabled filesystem where the underlying
+ * cluster is already allocated. This avoids allocating discontiguous extents
+ * in the slow path due to multiple calls to ext4_map_blocks().
+ * The slow path is mostly non-performance critical path, so it should be ok to
+ * loop using ext4_map_blocks() with appropriate flags to allocate & zero the
+ * underlying short holes/unwritten extents within the requested range.
+ */
+static int ext4_map_blocks_atomic_write(handle_t *handle, struct inode *inode,
+ struct ext4_map_blocks *map, int m_flags,
+ bool *force_commit)
+{
+ ext4_lblk_t m_lblk = map->m_lblk;
+ unsigned int m_len = map->m_len;
+ int ret = 0;
+
+ WARN_ON_ONCE(m_len > 1 && !ext4_has_feature_bigalloc(inode->i_sb));
+
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
+ if (ret < 0 || ret == m_len)
+ goto out;
+ /*
+ * This is a mixed mapping case where we were not able to allocate
+ * a single contiguous extent. In that case let's reset requested
+ * mapping and call the slow path.
+ */
+ map->m_lblk = m_lblk;
+ map->m_len = m_len;
+ map->m_flags = 0;
+
+ /*
+ * slow path means we have mixed mapping, that means we will need
+ * to force txn commit.
+ */
+ *force_commit = true;
+ return ext4_map_blocks_atomic_write_slow(handle, inode, map);
+out:
+ return ret;
+}
+
static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
unsigned int flags)
{
handle_t *handle;
u8 blkbits = inode->i_blkbits;
int ret, dio_credits, m_flags = 0, retries = 0;
+ bool force_commit = false;
/*
* Trim the mapping request to the maximum value that we can map at
@@ -3353,7 +3619,30 @@ static int ext4_iomap_alloc(struct inode *inode, struct ext4_map_blocks *map,
*/
if (map->m_len > DIO_MAX_BLOCKS)
map->m_len = DIO_MAX_BLOCKS;
- dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
+
+ /*
+ * journal credits estimation for atomic writes. We call
+ * ext4_map_blocks(), to find if there could be a mixed mapping. If yes,
+ * then let's assume the no. of pextents required can be m_len i.e.
+ * every alternate block can be unwritten and hole.
+ */
+ if (flags & IOMAP_ATOMIC) {
+ unsigned int orig_mlen = map->m_len;
+
+ ret = ext4_map_blocks(NULL, inode, map, 0);
+ if (ret < 0)
+ return ret;
+ if (map->m_len < orig_mlen) {
+ map->m_len = orig_mlen;
+ dio_credits = ext4_meta_trans_blocks(inode, orig_mlen,
+ map->m_len);
+ } else {
+ dio_credits = ext4_chunk_trans_blocks(inode,
+ map->m_len);
+ }
+ } else {
+ dio_credits = ext4_chunk_trans_blocks(inode, map->m_len);
+ }
retry:
/*
@@ -3384,7 +3673,11 @@ retry:
else if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
m_flags = EXT4_GET_BLOCKS_IO_CREATE_EXT;
- ret = ext4_map_blocks(handle, inode, map, m_flags);
+ if (flags & IOMAP_ATOMIC)
+ ret = ext4_map_blocks_atomic_write(handle, inode, map, m_flags,
+ &force_commit);
+ else
+ ret = ext4_map_blocks(handle, inode, map, m_flags);
/*
* We cannot fill holes in indirect tree based inodes as that could
@@ -3398,6 +3691,22 @@ retry:
if (ret == -ENOSPC && ext4_should_retry_alloc(inode->i_sb, &retries))
goto retry;
+ /*
+ * Force commit the current transaction if the allocation spans a mixed
+ * mapping range. This ensures any pending metadata updates (like
+ * unwritten to written extents conversion) in this range are in
+ * consistent state with the file data blocks, before performing the
+ * actual write I/O. If the commit fails, the whole I/O must be aborted
+ * to prevent any possible torn writes.
+ */
+ if (ret > 0 && force_commit) {
+ int ret2;
+
+ ret2 = ext4_force_commit(inode->i_sb);
+ if (ret2)
+ return ret2;
+ }
+
return ret;
}
@@ -3408,6 +3717,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
int ret;
struct ext4_map_blocks map;
u8 blkbits = inode->i_blkbits;
+ unsigned int orig_mlen;
if ((offset >> blkbits) > EXT4_MAX_LOGICAL_BLOCK)
return -EINVAL;
@@ -3421,6 +3731,7 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
map.m_lblk = offset >> blkbits;
map.m_len = min_t(loff_t, (offset + length - 1) >> blkbits,
EXT4_MAX_LOGICAL_BLOCK) - map.m_lblk + 1;
+ orig_mlen = map.m_len;
if (flags & IOMAP_WRITE) {
/*
@@ -3431,11 +3742,23 @@ static int ext4_iomap_begin(struct inode *inode, loff_t offset, loff_t length,
*/
if (offset + length <= i_size_read(inode)) {
ret = ext4_map_blocks(NULL, inode, &map, 0);
- if (ret > 0 && (map.m_flags & EXT4_MAP_MAPPED))
- goto out;
+ /*
+ * For atomic writes the entire requested length should
+ * be mapped.
+ */
+ if (map.m_flags & EXT4_MAP_MAPPED) {
+ if ((!(flags & IOMAP_ATOMIC) && ret > 0) ||
+ (flags & IOMAP_ATOMIC && ret >= orig_mlen))
+ goto out;
+ }
+ map.m_len = orig_mlen;
}
ret = ext4_iomap_alloc(inode, &map, flags);
} else {
+ /*
+ * This can be called for overwrites path from
+ * ext4_iomap_overwrite_begin().
+ */
ret = ext4_map_blocks(NULL, inode, &map, 0);
}
@@ -3449,6 +3772,16 @@ out:
*/
map.m_len = fscrypt_limit_io_blocks(inode, map.m_lblk, map.m_len);
+ /*
+ * Before returning to iomap, let's ensure the allocated mapping
+ * covers the entire requested length for atomic writes.
+ */
+ if (flags & IOMAP_ATOMIC) {
+ if (map.m_len < (length >> blkbits)) {
+ WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
+ }
ext4_set_iomap(inode, iomap, &map, offset, length, flags);
return 0;
@@ -3690,9 +4023,7 @@ void ext4_set_aops(struct inode *inode)
static int __ext4_block_zero_page_range(handle_t *handle,
struct address_space *mapping, loff_t from, loff_t length)
{
- ext4_fsblk_t index = from >> PAGE_SHIFT;
- unsigned offset = from & (PAGE_SIZE-1);
- unsigned blocksize, pos;
+ unsigned int offset, blocksize, pos;
ext4_lblk_t iblock;
struct inode *inode = mapping->host;
struct buffer_head *bh;
@@ -3707,13 +4038,14 @@ static int __ext4_block_zero_page_range(handle_t *handle,
blocksize = inode->i_sb->s_blocksize;
- iblock = index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
+ iblock = folio->index << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
bh = folio_buffers(folio);
if (!bh)
bh = create_empty_buffers(folio, blocksize, 0);
/* Find the buffer that contains "offset" */
+ offset = offset_in_folio(folio, from);
pos = blocksize;
while (offset >= pos) {
bh = bh->b_this_page;
@@ -4006,7 +4338,7 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
struct inode *inode = file_inode(file);
struct super_block *sb = inode->i_sb;
ext4_lblk_t start_lblk, end_lblk;
- loff_t max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize;
+ loff_t max_end = sb->s_maxbytes;
loff_t end = offset + length;
handle_t *handle;
unsigned int credits;
@@ -4015,14 +4347,20 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
trace_ext4_punch_hole(inode, offset, length, 0);
WARN_ON_ONCE(!inode_is_locked(inode));
+ /*
+ * For indirect-block based inodes, make sure that the hole within
+ * one block before last range.
+ */
+ if (!ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
+ max_end = EXT4_SB(sb)->s_bitmap_maxbytes - sb->s_blocksize;
+
/* No need to punch hole beyond i_size */
- if (offset >= inode->i_size)
+ if (offset >= inode->i_size || offset >= max_end)
return 0;
/*
* If the hole extends beyond i_size, set the hole to end after
- * the page that contains i_size, and also make sure that the hole
- * within one block before last range.
+ * the page that contains i_size.
*/
if (end > inode->i_size)
end = round_up(inode->i_size, PAGE_SIZE);
@@ -4072,6 +4410,8 @@ int ext4_punch_hole(struct file *file, loff_t offset, loff_t length)
if (end_lblk > start_lblk) {
ext4_lblk_t hole_len = end_lblk - start_lblk;
+ ext4_fc_track_inode(handle, inode);
+ ext4_check_map_extents_env(inode);
down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
@@ -4224,8 +4564,10 @@ int ext4_truncate(struct inode *inode)
if (err)
goto out_stop;
- down_write(&EXT4_I(inode)->i_data_sem);
+ ext4_fc_track_inode(handle, inode);
+ ext4_check_map_extents_env(inode);
+ down_write(&EXT4_I(inode)->i_data_sem);
ext4_discard_preallocations(inode);
if (ext4_test_inode_flag(inode, EXT4_INODE_EXTENTS))
@@ -4760,10 +5102,27 @@ static int check_igot_inode(struct inode *inode, ext4_iget_flags flags,
return 0;
error:
- ext4_error_inode(inode, function, line, 0, err_str);
+ ext4_error_inode(inode, function, line, 0, "%s", err_str);
return -EFSCORRUPTED;
}
+bool ext4_should_enable_large_folio(struct inode *inode)
+{
+ struct super_block *sb = inode->i_sb;
+
+ if (!S_ISREG(inode->i_mode))
+ return false;
+ if (test_opt(sb, DATA_FLAGS) == EXT4_MOUNT_JOURNAL_DATA ||
+ ext4_test_inode_flag(inode, EXT4_INODE_JOURNAL_DATA))
+ return false;
+ if (ext4_has_feature_verity(sb))
+ return false;
+ if (ext4_has_feature_encrypt(sb))
+ return false;
+
+ return true;
+}
+
struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ext4_iget_flags flags, const char *function,
unsigned int line)
@@ -4781,12 +5140,7 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
gid_t i_gid;
projid_t i_projid;
- if ((!(flags & EXT4_IGET_SPECIAL) &&
- ((ino < EXT4_FIRST_INO(sb) && ino != EXT4_ROOT_INO) ||
- ino == le32_to_cpu(es->s_usr_quota_inum) ||
- ino == le32_to_cpu(es->s_grp_quota_inum) ||
- ino == le32_to_cpu(es->s_prj_quota_inum) ||
- ino == le32_to_cpu(es->s_orphan_file_inum))) ||
+ if ((!(flags & EXT4_IGET_SPECIAL) && is_special_ino(sb, ino)) ||
(ino < EXT4_ROOT_INO) ||
(ino > le32_to_cpu(es->s_inodes_count))) {
if (flags & EXT4_IGET_HANDLE)
@@ -4845,10 +5199,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
__u32 csum;
__le32 inum = cpu_to_le32(inode->i_ino);
__le32 gen = raw_inode->i_generation;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum,
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum,
sizeof(inum));
- ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen,
- sizeof(gen));
+ ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen));
}
if ((!ext4_inode_csum_verify(inode, raw_inode, ei) ||
@@ -4916,7 +5269,8 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ei->i_file_acl |=
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
inode->i_size = ext4_isize(sb, raw_inode);
- if ((size = i_size_read(inode)) < 0) {
+ size = i_size_read(inode);
+ if (size < 0 || size > ext4_get_maxbytes(inode)) {
ext4_error_inode(inode, function, line, 0,
"iget: bad i_size value: %lld", size);
ret = -EFSCORRUPTED;
@@ -5086,6 +5440,9 @@ struct inode *__ext4_iget(struct super_block *sb, unsigned long ino,
ret = -EFSCORRUPTED;
goto bad_inode;
}
+ if (ext4_should_enable_large_folio(inode))
+ mapping_set_large_folios(inode->i_mapping);
+
ret = check_igot_inode(inode, flags, function, line);
/*
* -ESTALE here means there is nothing inherently wrong with the inode,
@@ -5564,9 +5921,7 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
down_write(&EXT4_I(inode)->i_data_sem);
old_disksize = EXT4_I(inode)->i_disksize;
EXT4_I(inode)->i_disksize = attr->ia_size;
- rc = ext4_mark_inode_dirty(handle, inode);
- if (!error)
- error = rc;
+
/*
* We have to update i_size under i_data_sem together
* with i_disksize to avoid races with writeback code
@@ -5577,6 +5932,9 @@ int ext4_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
else
EXT4_I(inode)->i_disksize = old_disksize;
up_write(&EXT4_I(inode)->i_data_sem);
+ rc = ext4_mark_inode_dirty(handle, inode);
+ if (!error)
+ error = rc;
ext4_journal_stop(handle);
if (error)
goto out_mmap_sem;
@@ -5773,8 +6131,7 @@ static int ext4_index_trans_blocks(struct inode *inode, int lblocks,
*
* Also account for superblock, inode, quota and xattr blocks
*/
-static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
- int pextents)
+int ext4_meta_trans_blocks(struct inode *inode, int lblocks, int pextents)
{
ext4_group_t groups, ngroups = ext4_get_groups_count(inode->i_sb);
int gdpblocks;
@@ -5782,18 +6139,16 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
int ret;
/*
- * How many index blocks need to touch to map @lblocks logical blocks
- * to @pextents physical extents?
+ * How many index and lead blocks need to touch to map @lblocks
+ * logical blocks to @pextents physical extents?
*/
idxblocks = ext4_index_trans_blocks(inode, lblocks, pextents);
- ret = idxblocks;
-
/*
* Now let's see how many group bitmaps and group descriptors need
* to account
*/
- groups = idxblocks + pextents;
+ groups = idxblocks;
gdpblocks = groups;
if (groups > ngroups)
groups = ngroups;
@@ -5801,7 +6156,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
gdpblocks = EXT4_SB(inode->i_sb)->s_gdb_count;
/* bitmaps and block group descriptor blocks */
- ret += groups + gdpblocks;
+ ret = idxblocks + groups + gdpblocks;
/* Blocks for super block, inode, quota and xattr blocks */
ret += EXT4_META_TRANS_BLOCKS(inode->i_sb);
@@ -5821,7 +6176,7 @@ static int ext4_meta_trans_blocks(struct inode *inode, int lblocks,
*/
int ext4_writepage_trans_blocks(struct inode *inode)
{
- int bpp = ext4_journal_blocks_per_page(inode);
+ int bpp = ext4_journal_blocks_per_folio(inode);
int ret;
ret = ext4_meta_trans_blocks(inode, bpp, bpp);
@@ -5895,6 +6250,7 @@ ext4_reserve_inode_write(handle_t *handle, struct inode *inode,
brelse(iloc->bh);
iloc->bh = NULL;
}
+ ext4_fc_track_inode(handle, inode);
}
ext4_std_error(inode->i_sb, err);
return err;
diff --git a/fs/ext4/ioctl.c b/fs/ext4/ioctl.c
index d17207386ead..5668a17458ae 100644
--- a/fs/ext4/ioctl.c
+++ b/fs/ext4/ioctl.c
@@ -143,7 +143,7 @@ static int ext4_update_backup_sb(struct super_block *sb,
es = (struct ext4_super_block *) (bh->b_data + offset);
lock_buffer(bh);
if (ext4_has_feature_metadata_csum(sb) &&
- es->s_checksum != ext4_superblock_csum(sb, es)) {
+ es->s_checksum != ext4_superblock_csum(es)) {
ext4_msg(sb, KERN_ERR, "Invalid checksum for backup "
"superblock %llu", sb_block);
unlock_buffer(bh);
@@ -151,7 +151,7 @@ static int ext4_update_backup_sb(struct super_block *sb,
}
func(es, arg);
if (ext4_has_feature_metadata_csum(sb))
- es->s_checksum = ext4_superblock_csum(sb, es);
+ es->s_checksum = ext4_superblock_csum(es);
set_buffer_uptodate(bh);
unlock_buffer(bh);
@@ -354,8 +354,8 @@ void ext4_reset_inode_seed(struct inode *inode)
if (!ext4_has_feature_metadata_csum(inode->i_sb))
return;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum));
- ei->i_csum_seed = ext4_chksum(sbi, csum, (__u8 *)&gen, sizeof(gen));
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&inum, sizeof(inum));
+ ei->i_csum_seed = ext4_chksum(csum, (__u8 *)&gen, sizeof(gen));
}
/*
@@ -1505,8 +1505,14 @@ resizefs_out:
return 0;
}
case EXT4_IOC_PRECACHE_EXTENTS:
- return ext4_ext_precache(inode);
+ {
+ int ret;
+ inode_lock_shared(inode);
+ ret = ext4_ext_precache(inode);
+ inode_unlock_shared(inode);
+ return ret;
+ }
case FS_IOC_SET_ENCRYPTION_POLICY:
if (!ext4_has_feature_encrypt(sb))
return -EOPNOTSUPP;
diff --git a/fs/ext4/mmp.c b/fs/ext4/mmp.c
index 3e26464b1425..51661570cf3b 100644
--- a/fs/ext4/mmp.c
+++ b/fs/ext4/mmp.c
@@ -14,7 +14,7 @@ static __le32 ext4_mmp_csum(struct super_block *sb, struct mmp_struct *mmp)
int offset = offsetof(struct mmp_struct, mmp_checksum);
__u32 csum;
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (char *)mmp, offset);
+ csum = ext4_chksum(sbi->s_csum_seed, (char *)mmp, offset);
return cpu_to_le32(csum);
}
diff --git a/fs/ext4/move_extent.c b/fs/ext4/move_extent.c
index 48649be64d6a..1f8493a56e8f 100644
--- a/fs/ext4/move_extent.c
+++ b/fs/ext4/move_extent.c
@@ -269,7 +269,7 @@ move_extent_per_page(struct file *o_filp, struct inode *donor_inode,
unsigned int tmp_data_size, data_size, replaced_size;
int i, err2, jblocks, retries = 0;
int replaced_count = 0;
- int from = data_offset_in_page << orig_inode->i_blkbits;
+ int from;
int blocks_per_page = PAGE_SIZE >> orig_inode->i_blkbits;
struct super_block *sb = orig_inode->i_sb;
struct buffer_head *bh = NULL;
@@ -323,11 +323,6 @@ again:
* hold page's lock, if it is still the case data copy is not
* necessary, just swap data blocks between orig and donor.
*/
-
- VM_BUG_ON_FOLIO(folio_test_large(folio[0]), folio[0]);
- VM_BUG_ON_FOLIO(folio_test_large(folio[1]), folio[1]);
- VM_BUG_ON_FOLIO(folio_nr_pages(folio[0]) != folio_nr_pages(folio[1]), folio[1]);
-
if (unwritten) {
ext4_double_down_write_data_sem(orig_inode, donor_inode);
/* If any of extents in range became initialized we have to
@@ -360,6 +355,8 @@ again:
goto unlock_folios;
}
data_copy:
+ from = offset_in_folio(folio[0],
+ orig_blk_offset << orig_inode->i_blkbits);
*err = mext_page_mkuptodate(folio[0], from, from + replaced_size);
if (*err)
goto unlock_folios;
@@ -390,7 +387,7 @@ data_copy:
if (!bh)
bh = create_empty_buffers(folio[0],
1 << orig_inode->i_blkbits, 0);
- for (i = 0; i < data_offset_in_page; i++)
+ for (i = 0; i < from >> orig_inode->i_blkbits; i++)
bh = bh->b_this_page;
for (i = 0; i < block_len_in_page; i++) {
*err = ext4_get_block(orig_inode, orig_blk_offset + i, bh, 0);
diff --git a/fs/ext4/namei.c b/fs/ext4/namei.c
index e9712e64ec8f..a178ac229489 100644
--- a/fs/ext4/namei.c
+++ b/fs/ext4/namei.c
@@ -346,11 +346,10 @@ static struct ext4_dir_entry_tail *get_dirent_tail(struct inode *inode,
static __le32 ext4_dirblock_csum(struct inode *inode, void *dirent, int size)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
__u32 csum;
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
+ csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size);
return cpu_to_le32(csum);
}
@@ -442,7 +441,6 @@ static struct dx_countlimit *get_dx_countlimit(struct inode *inode,
static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
int count_offset, int count, struct dx_tail *t)
{
- struct ext4_sb_info *sbi = EXT4_SB(inode->i_sb);
struct ext4_inode_info *ei = EXT4_I(inode);
__u32 csum;
int size;
@@ -450,9 +448,9 @@ static __le32 ext4_dx_csum(struct inode *inode, struct ext4_dir_entry *dirent,
int offset = offsetof(struct dx_tail, dt_checksum);
size = count_offset + (count * sizeof(struct dx_entry));
- csum = ext4_chksum(sbi, ei->i_csum_seed, (__u8 *)dirent, size);
- csum = ext4_chksum(sbi, csum, (__u8 *)t, offset);
- csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
+ csum = ext4_chksum(ei->i_csum_seed, (__u8 *)dirent, size);
+ csum = ext4_chksum(csum, (__u8 *)t, offset);
+ csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
return cpu_to_le32(csum);
}
diff --git a/fs/ext4/orphan.c b/fs/ext4/orphan.c
index c66e0cb29bd4..7c7f792ad6ab 100644
--- a/fs/ext4/orphan.c
+++ b/fs/ext4/orphan.c
@@ -541,9 +541,9 @@ static int ext4_orphan_file_block_csum_verify(struct super_block *sb,
return 1;
ot = ext4_orphan_block_tail(sb, bh);
- calculated = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed,
- (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr));
- calculated = ext4_chksum(EXT4_SB(sb), calculated, (__u8 *)bh->b_data,
+ calculated = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr,
+ sizeof(dsk_block_nr));
+ calculated = ext4_chksum(calculated, (__u8 *)bh->b_data,
inodes_per_ob * sizeof(__u32));
return le32_to_cpu(ot->ob_checksum) == calculated;
}
@@ -560,10 +560,9 @@ void ext4_orphan_file_block_trigger(struct jbd2_buffer_trigger_type *triggers,
struct ext4_orphan_block_tail *ot;
__le64 dsk_block_nr = cpu_to_le64(bh->b_blocknr);
- csum = ext4_chksum(EXT4_SB(sb), oi->of_csum_seed,
- (__u8 *)&dsk_block_nr, sizeof(dsk_block_nr));
- csum = ext4_chksum(EXT4_SB(sb), csum, (__u8 *)data,
- inodes_per_ob * sizeof(__u32));
+ csum = ext4_chksum(oi->of_csum_seed, (__u8 *)&dsk_block_nr,
+ sizeof(dsk_block_nr));
+ csum = ext4_chksum(csum, (__u8 *)data, inodes_per_ob * sizeof(__u32));
ot = ext4_orphan_block_tail(sb, bh);
ot->ob_checksum = cpu_to_le32(csum);
}
diff --git a/fs/ext4/readpage.c b/fs/ext4/readpage.c
index 5d3a9dc9a32d..f329daf6e5c7 100644
--- a/fs/ext4/readpage.c
+++ b/fs/ext4/readpage.c
@@ -227,24 +227,30 @@ int ext4_mpage_readpages(struct inode *inode,
int length;
unsigned relative_block = 0;
struct ext4_map_blocks map;
- unsigned int nr_pages = rac ? readahead_count(rac) : 1;
+ unsigned int nr_pages, folio_pages;
map.m_pblk = 0;
map.m_lblk = 0;
map.m_len = 0;
map.m_flags = 0;
- for (; nr_pages; nr_pages--) {
+ nr_pages = rac ? readahead_count(rac) : folio_nr_pages(folio);
+ for (; nr_pages; nr_pages -= folio_pages) {
int fully_mapped = 1;
- unsigned first_hole = blocks_per_page;
+ unsigned int first_hole;
+ unsigned int blocks_per_folio;
if (rac)
folio = readahead_folio(rac);
+
+ folio_pages = folio_nr_pages(folio);
prefetchw(&folio->flags);
if (folio_buffers(folio))
goto confused;
+ blocks_per_folio = folio_size(folio) >> blkbits;
+ first_hole = blocks_per_folio;
block_in_file = next_block =
(sector_t)folio->index << (PAGE_SHIFT - blkbits);
last_block = block_in_file + nr_pages * blocks_per_page;
@@ -270,7 +276,7 @@ int ext4_mpage_readpages(struct inode *inode,
map.m_flags &= ~EXT4_MAP_MAPPED;
break;
}
- if (page_block == blocks_per_page)
+ if (page_block == blocks_per_folio)
break;
page_block++;
block_in_file++;
@@ -281,7 +287,7 @@ int ext4_mpage_readpages(struct inode *inode,
* Then do more ext4_map_blocks() calls until we are
* done with this folio.
*/
- while (page_block < blocks_per_page) {
+ while (page_block < blocks_per_folio) {
if (block_in_file < last_block) {
map.m_lblk = block_in_file;
map.m_len = last_block - block_in_file;
@@ -296,13 +302,13 @@ int ext4_mpage_readpages(struct inode *inode,
}
if ((map.m_flags & EXT4_MAP_MAPPED) == 0) {
fully_mapped = 0;
- if (first_hole == blocks_per_page)
+ if (first_hole == blocks_per_folio)
first_hole = page_block;
page_block++;
block_in_file++;
continue;
}
- if (first_hole != blocks_per_page)
+ if (first_hole != blocks_per_folio)
goto confused; /* hole -> non-hole */
/* Contiguous blocks? */
@@ -315,13 +321,13 @@ int ext4_mpage_readpages(struct inode *inode,
/* needed? */
map.m_flags &= ~EXT4_MAP_MAPPED;
break;
- } else if (page_block == blocks_per_page)
+ } else if (page_block == blocks_per_folio)
break;
page_block++;
block_in_file++;
}
}
- if (first_hole != blocks_per_page) {
+ if (first_hole != blocks_per_folio) {
folio_zero_segment(folio, first_hole << blkbits,
folio_size(folio));
if (first_hole == 0) {
@@ -367,11 +373,11 @@ int ext4_mpage_readpages(struct inode *inode,
if (((map.m_flags & EXT4_MAP_BOUNDARY) &&
(relative_block == map.m_len)) ||
- (first_hole != blocks_per_page)) {
+ (first_hole != blocks_per_folio)) {
submit_bio(bio);
bio = NULL;
} else
- last_block_in_bio = first_block + blocks_per_page - 1;
+ last_block_in_bio = first_block + blocks_per_folio - 1;
continue;
confused:
if (bio) {
diff --git a/fs/ext4/resize.c b/fs/ext4/resize.c
index b7ff0d955f0d..050f26168d97 100644
--- a/fs/ext4/resize.c
+++ b/fs/ext4/resize.c
@@ -1119,7 +1119,7 @@ static inline void ext4_set_block_group_nr(struct super_block *sb, char *data,
es->s_block_group_nr = cpu_to_le16(group);
if (ext4_has_feature_metadata_csum(sb))
- es->s_checksum = ext4_superblock_csum(sb, es);
+ es->s_checksum = ext4_superblock_csum(es);
}
/*
diff --git a/fs/ext4/super.c b/fs/ext4/super.c
index 181934499624..a7f80ca01174 100644
--- a/fs/ext4/super.c
+++ b/fs/ext4/super.c
@@ -286,14 +286,12 @@ static int ext4_verify_csum_type(struct super_block *sb,
return es->s_checksum_type == EXT4_CRC32C_CHKSUM;
}
-__le32 ext4_superblock_csum(struct super_block *sb,
- struct ext4_super_block *es)
+__le32 ext4_superblock_csum(struct ext4_super_block *es)
{
- struct ext4_sb_info *sbi = EXT4_SB(sb);
int offset = offsetof(struct ext4_super_block, s_checksum);
__u32 csum;
- csum = ext4_chksum(sbi, ~0, (char *)es, offset);
+ csum = ext4_chksum(~0, (char *)es, offset);
return cpu_to_le32(csum);
}
@@ -304,7 +302,7 @@ static int ext4_superblock_csum_verify(struct super_block *sb,
if (!ext4_has_feature_metadata_csum(sb))
return 1;
- return es->s_checksum == ext4_superblock_csum(sb, es);
+ return es->s_checksum == ext4_superblock_csum(es);
}
void ext4_superblock_csum_set(struct super_block *sb)
@@ -314,7 +312,7 @@ void ext4_superblock_csum_set(struct super_block *sb)
if (!ext4_has_feature_metadata_csum(sb))
return;
- es->s_checksum = ext4_superblock_csum(sb, es);
+ es->s_checksum = ext4_superblock_csum(es);
}
ext4_fsblk_t ext4_block_bitmap(struct super_block *sb,
@@ -508,21 +506,9 @@ static void ext4_journal_commit_callback(journal_t *journal, transaction_t *txn)
ext4_maybe_update_superblock(sb);
}
-/*
- * This writepage callback for write_cache_pages()
- * takes care of a few cases after page cleaning.
- *
- * write_cache_pages() already checks for dirty pages
- * and calls clear_page_dirty_for_io(), which we want,
- * to write protect the pages.
- *
- * However, we may have to redirty a page (see below.)
- */
-static int ext4_journalled_writepage_callback(struct folio *folio,
- struct writeback_control *wbc,
- void *data)
+static bool ext4_journalled_writepage_needs_redirty(struct jbd2_inode *jinode,
+ struct folio *folio)
{
- transaction_t *transaction = (transaction_t *) data;
struct buffer_head *bh, *head;
struct journal_head *jh;
@@ -543,15 +529,12 @@ static int ext4_journalled_writepage_callback(struct folio *folio,
*/
jh = bh2jh(bh);
if (buffer_dirty(bh) ||
- (jh && (jh->b_transaction != transaction ||
- jh->b_next_transaction))) {
- folio_redirty_for_writepage(wbc, folio);
- goto out;
- }
+ (jh && (jh->b_transaction != jinode->i_transaction ||
+ jh->b_next_transaction)))
+ return true;
} while ((bh = bh->b_this_page) != head);
-out:
- return AOP_WRITEPAGE_ACTIVATE;
+ return false;
}
static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
@@ -563,10 +546,23 @@ static int ext4_journalled_submit_inode_data_buffers(struct jbd2_inode *jinode)
.range_start = jinode->i_dirty_start,
.range_end = jinode->i_dirty_end,
};
+ struct folio *folio = NULL;
+ int error;
- return write_cache_pages(mapping, &wbc,
- ext4_journalled_writepage_callback,
- jinode->i_transaction);
+ /*
+ * writeback_iter() already checks for dirty pages and calls
+ * folio_clear_dirty_for_io(), which we want to write protect the
+ * folios.
+ *
+ * However, we may have to redirty a folio sometimes.
+ */
+ while ((folio = writeback_iter(mapping, &wbc, folio, &error))) {
+ if (ext4_journalled_writepage_needs_redirty(jinode, folio))
+ folio_redirty_for_writepage(&wbc, folio);
+ folio_unlock(folio);
+ }
+
+ return error;
}
static int ext4_journal_submit_inode_data_buffers(struct jbd2_inode *jinode)
@@ -1415,7 +1411,7 @@ static struct inode *ext4_alloc_inode(struct super_block *sb)
ei->i_datasync_tid = 0;
INIT_WORK(&ei->i_rsv_conversion_work, ext4_end_io_rsv_work);
ext4_fc_init_inode(&ei->vfs_inode);
- mutex_init(&ei->i_fc_lock);
+ spin_lock_init(&ei->i_fc_lock);
return &ei->vfs_inode;
}
@@ -1809,7 +1805,6 @@ static const struct fs_parameter_spec ext4_param_specs[] = {
{}
};
-#define DEFAULT_JOURNAL_IOPRIO (IOPRIO_PRIO_VALUE(IOPRIO_CLASS_BE, 3))
#define MOPT_SET 0x0001
#define MOPT_CLEAR 0x0002
@@ -3209,14 +3204,14 @@ static __le16 ext4_group_desc_csum(struct super_block *sb, __u32 block_group,
__u32 csum32;
__u16 dummy_csum = 0;
- csum32 = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&le_group,
+ csum32 = ext4_chksum(sbi->s_csum_seed, (__u8 *)&le_group,
sizeof(le_group));
- csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp, offset);
- csum32 = ext4_chksum(sbi, csum32, (__u8 *)&dummy_csum,
+ csum32 = ext4_chksum(csum32, (__u8 *)gdp, offset);
+ csum32 = ext4_chksum(csum32, (__u8 *)&dummy_csum,
sizeof(dummy_csum));
offset += sizeof(dummy_csum);
if (offset < sbi->s_desc_size)
- csum32 = ext4_chksum(sbi, csum32, (__u8 *)gdp + offset,
+ csum32 = ext4_chksum(csum32, (__u8 *)gdp + offset,
sbi->s_desc_size - offset);
crc = csum32 & 0xFFFF;
@@ -4441,13 +4436,16 @@ static int ext4_handle_clustersize(struct super_block *sb)
/*
* ext4_atomic_write_init: Initializes filesystem min & max atomic write units.
+ * With non-bigalloc filesystem awu will be based upon filesystem blocksize
+ * & bdev awu units.
+ * With bigalloc it will be based upon bigalloc cluster size & bdev awu units.
* @sb: super block
- * TODO: Later add support for bigalloc
*/
static void ext4_atomic_write_init(struct super_block *sb)
{
struct ext4_sb_info *sbi = EXT4_SB(sb);
struct block_device *bdev = sb->s_bdev;
+ unsigned int clustersize = EXT4_CLUSTER_SIZE(sb);
if (!bdev_can_atomic_write(bdev))
return;
@@ -4457,7 +4455,7 @@ static void ext4_atomic_write_init(struct super_block *sb)
sbi->s_awu_min = max(sb->s_blocksize,
bdev_atomic_write_unit_min_bytes(bdev));
- sbi->s_awu_max = min(sb->s_blocksize,
+ sbi->s_awu_max = min(clustersize,
bdev_atomic_write_unit_max_bytes(bdev));
if (sbi->s_awu_min && sbi->s_awu_max &&
sbi->s_awu_min <= sbi->s_awu_max) {
@@ -4482,7 +4480,7 @@ static void ext4_fast_commit_init(struct super_block *sb)
sbi->s_fc_bytes = 0;
ext4_clear_mount_flag(sb, EXT4_MF_FC_INELIGIBLE);
sbi->s_fc_ineligible_tid = 0;
- spin_lock_init(&sbi->s_fc_lock);
+ mutex_init(&sbi->s_fc_lock);
memset(&sbi->s_fc_stats, 0, sizeof(sbi->s_fc_stats));
sbi->s_fc_replay_state.fc_regions = NULL;
sbi->s_fc_replay_state.fc_regions_size = 0;
@@ -4644,7 +4642,7 @@ static int ext4_init_metadata_csum(struct super_block *sb, struct ext4_super_blo
sbi->s_csum_seed = le32_to_cpu(es->s_checksum_seed);
else if (ext4_has_feature_metadata_csum(sb) ||
ext4_has_feature_ea_inode(sb))
- sbi->s_csum_seed = ext4_chksum(sbi, ~0, es->s_uuid,
+ sbi->s_csum_seed = ext4_chksum(~0, es->s_uuid,
sizeof(es->s_uuid));
return 0;
}
@@ -5255,7 +5253,7 @@ static int __ext4_fill_super(struct fs_context *fc, struct super_block *sb)
/* Set defaults for the variables that will be set during parsing */
if (!(ctx->spec & EXT4_SPEC_JOURNAL_IOPRIO))
- ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+ ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
sbi->s_inode_readahead_blks = EXT4_DEF_INODE_READAHEAD_BLKS;
sbi->s_sectors_written_start =
@@ -5916,7 +5914,7 @@ static struct file *ext4_get_journal_blkdev(struct super_block *sb,
if ((le32_to_cpu(es->s_feature_ro_compat) &
EXT4_FEATURE_RO_COMPAT_METADATA_CSUM) &&
- es->s_checksum != ext4_superblock_csum(sb, es)) {
+ es->s_checksum != ext4_superblock_csum(es)) {
ext4_msg(sb, KERN_ERR, "external journal has corrupt superblock");
errno = -EFSCORRUPTED;
goto out_bh;
@@ -6495,7 +6493,7 @@ static int __ext4_remount(struct fs_context *fc, struct super_block *sb)
ctx->journal_ioprio =
sbi->s_journal->j_task->io_context->ioprio;
else
- ctx->journal_ioprio = DEFAULT_JOURNAL_IOPRIO;
+ ctx->journal_ioprio = EXT4_DEF_JOURNAL_IOPRIO;
}
diff --git a/fs/ext4/xattr.c b/fs/ext4/xattr.c
index 7ab8f2e8e815..8d15acbacc20 100644
--- a/fs/ext4/xattr.c
+++ b/fs/ext4/xattr.c
@@ -139,12 +139,12 @@ static __le32 ext4_xattr_block_csum(struct inode *inode,
__u32 dummy_csum = 0;
int offset = offsetof(struct ext4_xattr_header, h_checksum);
- csum = ext4_chksum(sbi, sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
+ csum = ext4_chksum(sbi->s_csum_seed, (__u8 *)&dsk_block_nr,
sizeof(dsk_block_nr));
- csum = ext4_chksum(sbi, csum, (__u8 *)hdr, offset);
- csum = ext4_chksum(sbi, csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
+ csum = ext4_chksum(csum, (__u8 *)hdr, offset);
+ csum = ext4_chksum(csum, (__u8 *)&dummy_csum, sizeof(dummy_csum));
offset += sizeof(dummy_csum);
- csum = ext4_chksum(sbi, csum, (__u8 *)hdr + offset,
+ csum = ext4_chksum(csum, (__u8 *)hdr + offset,
EXT4_BLOCK_SIZE(inode->i_sb) - offset);
return cpu_to_le32(csum);
@@ -348,7 +348,7 @@ xattr_find_entry(struct inode *inode, struct ext4_xattr_entry **pentry,
static u32
ext4_xattr_inode_hash(struct ext4_sb_info *sbi, const void *buffer, size_t size)
{
- return ext4_chksum(sbi, sbi->s_csum_seed, buffer, size);
+ return ext4_chksum(sbi->s_csum_seed, buffer, size);
}
static u64 ext4_xattr_inode_get_ref(struct inode *ea_inode)
diff --git a/fs/isofs/inode.c b/fs/isofs/inode.c
index 47038e660812..d5da9817df9b 100644
--- a/fs/isofs/inode.c
+++ b/fs/isofs/inode.c
@@ -1275,6 +1275,7 @@ static int isofs_read_inode(struct inode *inode, int relocated)
unsigned long offset;
struct iso_inode_info *ei = ISOFS_I(inode);
int ret = -EIO;
+ struct timespec64 ts;
block = ei->i_iget5_block;
bh = sb_bread(inode->i_sb, block);
@@ -1387,8 +1388,10 @@ static int isofs_read_inode(struct inode *inode, int relocated)
inode->i_ino, de->flags[-high_sierra]);
}
#endif
- inode_set_mtime_to_ts(inode,
- inode_set_atime_to_ts(inode, inode_set_ctime(inode, iso_date(de->date, high_sierra), 0)));
+ ts = iso_date(de->date, high_sierra ? ISO_DATE_HIGH_SIERRA : 0);
+ inode_set_ctime_to_ts(inode, ts);
+ inode_set_atime_to_ts(inode, ts);
+ inode_set_mtime_to_ts(inode, ts);
ei->i_first_extent = (isonum_733(de->extent) +
isonum_711(de->ext_attr_length));
diff --git a/fs/isofs/isofs.h b/fs/isofs/isofs.h
index 2d55207c9a99..506555837533 100644
--- a/fs/isofs/isofs.h
+++ b/fs/isofs/isofs.h
@@ -106,7 +106,9 @@ static inline unsigned int isonum_733(u8 *p)
/* Ignore bigendian datum due to broken mastering programs */
return get_unaligned_le32(p);
}
-extern int iso_date(u8 *, int);
+#define ISO_DATE_HIGH_SIERRA (1 << 0)
+#define ISO_DATE_LONG_FORM (1 << 1)
+struct timespec64 iso_date(u8 *p, int flags);
struct inode; /* To make gcc happy */
diff --git a/fs/isofs/rock.c b/fs/isofs/rock.c
index dbf911126e61..576498245b9d 100644
--- a/fs/isofs/rock.c
+++ b/fs/isofs/rock.c
@@ -412,7 +412,12 @@ repeat:
}
}
break;
- case SIG('T', 'F'):
+ case SIG('T', 'F'): {
+ int flags, size, slen;
+
+ flags = rr->u.TF.flags & TF_LONG_FORM ? ISO_DATE_LONG_FORM : 0;
+ size = rr->u.TF.flags & TF_LONG_FORM ? 17 : 7;
+ slen = rr->len - 5;
/*
* Some RRIP writers incorrectly place ctime in the
* TF_CREATE field. Try to handle this correctly for
@@ -420,27 +425,28 @@ repeat:
*/
/* Rock ridge never appears on a High Sierra disk */
cnt = 0;
- if (rr->u.TF.flags & TF_CREATE) {
- inode_set_ctime(inode,
- iso_date(rr->u.TF.times[cnt++].time, 0),
- 0);
+ if ((rr->u.TF.flags & TF_CREATE) && size <= slen) {
+ inode_set_ctime_to_ts(inode,
+ iso_date(rr->u.TF.data + size * cnt++, flags));
+ slen -= size;
}
- if (rr->u.TF.flags & TF_MODIFY) {
- inode_set_mtime(inode,
- iso_date(rr->u.TF.times[cnt++].time, 0),
- 0);
+ if ((rr->u.TF.flags & TF_MODIFY) && size <= slen) {
+ inode_set_mtime_to_ts(inode,
+ iso_date(rr->u.TF.data + size * cnt++, flags));
+ slen -= size;
}
- if (rr->u.TF.flags & TF_ACCESS) {
- inode_set_atime(inode,
- iso_date(rr->u.TF.times[cnt++].time, 0),
- 0);
+ if ((rr->u.TF.flags & TF_ACCESS) && size <= slen) {
+ inode_set_atime_to_ts(inode,
+ iso_date(rr->u.TF.data + size * cnt++, flags));
+ slen -= size;
}
- if (rr->u.TF.flags & TF_ATTRIBUTES) {
- inode_set_ctime(inode,
- iso_date(rr->u.TF.times[cnt++].time, 0),
- 0);
+ if ((rr->u.TF.flags & TF_ATTRIBUTES) && size <= slen) {
+ inode_set_ctime_to_ts(inode,
+ iso_date(rr->u.TF.data + size * cnt++, flags));
+ slen -= size;
}
break;
+ }
case SIG('S', 'L'):
{
int slen;
diff --git a/fs/isofs/rock.h b/fs/isofs/rock.h
index 7755e587f778..c0856fa9bb6a 100644
--- a/fs/isofs/rock.h
+++ b/fs/isofs/rock.h
@@ -65,13 +65,9 @@ struct RR_PL_s {
__u8 location[8];
};
-struct stamp {
- __u8 time[7]; /* actually 6 unsigned, 1 signed */
-} __attribute__ ((packed));
-
struct RR_TF_s {
__u8 flags;
- struct stamp times[]; /* Variable number of these beasts */
+ __u8 data[];
} __attribute__ ((packed));
/* Linux-specific extension for transparent decompression */
diff --git a/fs/isofs/util.c b/fs/isofs/util.c
index e88dba721661..42f479da0b28 100644
--- a/fs/isofs/util.c
+++ b/fs/isofs/util.c
@@ -16,29 +16,44 @@
* to GMT. Thus we should always be correct.
*/
-int iso_date(u8 *p, int flag)
+struct timespec64 iso_date(u8 *p, int flags)
{
int year, month, day, hour, minute, second, tz;
- int crtime;
+ struct timespec64 ts;
+
+ if (flags & ISO_DATE_LONG_FORM) {
+ year = (p[0] - '0') * 1000 +
+ (p[1] - '0') * 100 +
+ (p[2] - '0') * 10 +
+ (p[3] - '0') - 1900;
+ month = ((p[4] - '0') * 10 + (p[5] - '0'));
+ day = ((p[6] - '0') * 10 + (p[7] - '0'));
+ hour = ((p[8] - '0') * 10 + (p[9] - '0'));
+ minute = ((p[10] - '0') * 10 + (p[11] - '0'));
+ second = ((p[12] - '0') * 10 + (p[13] - '0'));
+ ts.tv_nsec = ((p[14] - '0') * 10 + (p[15] - '0')) * 10000000;
+ tz = p[16];
+ } else {
+ year = p[0];
+ month = p[1];
+ day = p[2];
+ hour = p[3];
+ minute = p[4];
+ second = p[5];
+ ts.tv_nsec = 0;
+ /* High sierra has no time zone */
+ tz = flags & ISO_DATE_HIGH_SIERRA ? 0 : p[6];
+ }
- year = p[0];
- month = p[1];
- day = p[2];
- hour = p[3];
- minute = p[4];
- second = p[5];
- if (flag == 0) tz = p[6]; /* High sierra has no time zone */
- else tz = 0;
-
if (year < 0) {
- crtime = 0;
+ ts.tv_sec = 0;
} else {
- crtime = mktime64(year+1900, month, day, hour, minute, second);
+ ts.tv_sec = mktime64(year+1900, month, day, hour, minute, second);
/* sign extend */
if (tz & 0x80)
tz |= (-1 << 8);
-
+
/*
* The timezone offset is unreliable on some disks,
* so we make a sanity check. In no case is it ever
@@ -65,7 +80,7 @@ int iso_date(u8 *p, int flag)
* for pointing out the sign error.
*/
if (-52 <= tz && tz <= 52)
- crtime -= tz * 15 * 60;
+ ts.tv_sec -= tz * 15 * 60;
}
- return crtime;
-}
+ return ts;
+}
diff --git a/fs/jbd2/commit.c b/fs/jbd2/commit.c
index 1c7c49356878..7203d2d2624d 100644
--- a/fs/jbd2/commit.c
+++ b/fs/jbd2/commit.c
@@ -99,7 +99,7 @@ static void jbd2_commit_block_csum_set(journal_t *j, struct buffer_head *bh)
h->h_chksum_type = 0;
h->h_chksum_size = 0;
h->h_chksum[0] = 0;
- csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
+ csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize);
h->h_chksum[0] = cpu_to_be32(csum);
}
@@ -330,8 +330,8 @@ static void jbd2_block_tag_csum_set(journal_t *j, journal_block_tag_t *tag,
seq = cpu_to_be32(sequence);
addr = kmap_local_folio(bh->b_folio, bh_offset(bh));
- csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
- csum32 = jbd2_chksum(j, csum32, addr, bh->b_size);
+ csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
+ csum32 = jbd2_chksum(csum32, addr, bh->b_size);
kunmap_local(addr);
if (jbd2_has_feature_csum3(j))
diff --git a/fs/jbd2/journal.c b/fs/jbd2/journal.c
index 743a1d7633cd..6d5e76848733 100644
--- a/fs/jbd2/journal.c
+++ b/fs/jbd2/journal.c
@@ -83,7 +83,7 @@ EXPORT_SYMBOL(jbd2_log_wait_commit);
EXPORT_SYMBOL(jbd2_journal_start_commit);
EXPORT_SYMBOL(jbd2_journal_force_commit_nested);
EXPORT_SYMBOL(jbd2_journal_wipe);
-EXPORT_SYMBOL(jbd2_journal_blocks_per_page);
+EXPORT_SYMBOL(jbd2_journal_blocks_per_folio);
EXPORT_SYMBOL(jbd2_journal_invalidate_folio);
EXPORT_SYMBOL(jbd2_journal_try_to_free_buffers);
EXPORT_SYMBOL(jbd2_journal_force_commit);
@@ -115,14 +115,14 @@ void __jbd2_debug(int level, const char *file, const char *func,
#endif
/* Checksumming functions */
-static __be32 jbd2_superblock_csum(journal_t *j, journal_superblock_t *sb)
+static __be32 jbd2_superblock_csum(journal_superblock_t *sb)
{
__u32 csum;
__be32 old_csum;
old_csum = sb->s_checksum;
sb->s_checksum = 0;
- csum = jbd2_chksum(j, ~0, (char *)sb, sizeof(journal_superblock_t));
+ csum = jbd2_chksum(~0, (char *)sb, sizeof(journal_superblock_t));
sb->s_checksum = old_csum;
return cpu_to_be32(csum);
@@ -728,7 +728,6 @@ int jbd2_fc_begin_commit(journal_t *journal, tid_t tid)
}
journal->j_flags |= JBD2_FAST_COMMIT_ONGOING;
write_unlock(&journal->j_state_lock);
- jbd2_journal_lock_updates(journal);
return 0;
}
@@ -742,7 +741,6 @@ static int __jbd2_fc_end_commit(journal_t *journal, tid_t tid, bool fallback)
{
if (journal->j_fc_cleanup_callback)
journal->j_fc_cleanup_callback(journal, 0, tid);
- jbd2_journal_unlock_updates(journal);
write_lock(&journal->j_state_lock);
journal->j_flags &= ~JBD2_FAST_COMMIT_ONGOING;
if (fallback)
@@ -1002,7 +1000,7 @@ void jbd2_descriptor_block_csum_set(journal_t *j, struct buffer_head *bh)
tail = (struct jbd2_journal_block_tail *)(bh->b_data + j->j_blocksize -
sizeof(struct jbd2_journal_block_tail));
tail->t_checksum = 0;
- csum = jbd2_chksum(j, j->j_csum_seed, bh->b_data, j->j_blocksize);
+ csum = jbd2_chksum(j->j_csum_seed, bh->b_data, j->j_blocksize);
tail->t_checksum = cpu_to_be32(csum);
}
@@ -1386,7 +1384,7 @@ static int journal_check_superblock(journal_t *journal)
}
/* Check superblock checksum */
- if (sb->s_checksum != jbd2_superblock_csum(journal, sb)) {
+ if (sb->s_checksum != jbd2_superblock_csum(sb)) {
printk(KERN_ERR "JBD2: journal checksum error\n");
err = -EFSBADCRC;
return err;
@@ -1492,7 +1490,7 @@ static int journal_load_superblock(journal_t *journal)
journal->j_total_len = be32_to_cpu(sb->s_maxlen);
/* Precompute checksum seed for all metadata */
if (jbd2_journal_has_csum_v2or3(journal))
- journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
+ journal->j_csum_seed = jbd2_chksum(~0, sb->s_uuid,
sizeof(sb->s_uuid));
/* After journal features are set, we can compute transaction limits */
jbd2_journal_init_transaction_limits(journal);
@@ -1821,7 +1819,7 @@ static int jbd2_write_superblock(journal_t *journal, blk_opf_t write_flags)
set_buffer_uptodate(bh);
}
if (jbd2_journal_has_csum_v2or3(journal))
- sb->s_checksum = jbd2_superblock_csum(journal, sb);
+ sb->s_checksum = jbd2_superblock_csum(sb);
get_bh(bh);
bh->b_end_io = end_buffer_write_sync;
submit_bh(REQ_OP_WRITE | write_flags, bh);
@@ -2338,7 +2336,7 @@ int jbd2_journal_set_features(journal_t *journal, unsigned long compat,
sb->s_checksum_type = JBD2_CRC32C_CHKSUM;
sb->s_feature_compat &=
~cpu_to_be32(JBD2_FEATURE_COMPAT_CHECKSUM);
- journal->j_csum_seed = jbd2_chksum(journal, ~0, sb->s_uuid,
+ journal->j_csum_seed = jbd2_chksum(~0, sb->s_uuid,
sizeof(sb->s_uuid));
}
@@ -2657,9 +2655,10 @@ void jbd2_journal_ack_err(journal_t *journal)
write_unlock(&journal->j_state_lock);
}
-int jbd2_journal_blocks_per_page(struct inode *inode)
+int jbd2_journal_blocks_per_folio(struct inode *inode)
{
- return 1 << (PAGE_SHIFT - inode->i_sb->s_blocksize_bits);
+ return 1 << (PAGE_SHIFT + mapping_max_folio_order(inode->i_mapping) -
+ inode->i_sb->s_blocksize_bits);
}
/*
diff --git a/fs/jbd2/recovery.c b/fs/jbd2/recovery.c
index c271a050b7e6..cac8c2cd4a92 100644
--- a/fs/jbd2/recovery.c
+++ b/fs/jbd2/recovery.c
@@ -185,7 +185,7 @@ static int jbd2_descriptor_block_csum_verify(journal_t *j, void *buf)
j->j_blocksize - sizeof(struct jbd2_journal_block_tail));
provided = tail->t_checksum;
tail->t_checksum = 0;
- calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
+ calculated = jbd2_chksum(j->j_csum_seed, buf, j->j_blocksize);
tail->t_checksum = provided;
return provided == cpu_to_be32(calculated);
@@ -440,7 +440,7 @@ static int jbd2_commit_block_csum_verify(journal_t *j, void *buf)
h = buf;
provided = h->h_chksum[0];
h->h_chksum[0] = 0;
- calculated = jbd2_chksum(j, j->j_csum_seed, buf, j->j_blocksize);
+ calculated = jbd2_chksum(j->j_csum_seed, buf, j->j_blocksize);
h->h_chksum[0] = provided;
return provided == cpu_to_be32(calculated);
@@ -461,7 +461,7 @@ static bool jbd2_commit_block_csum_verify_partial(journal_t *j, void *buf)
h = tmpbuf;
provided = h->h_chksum[0];
h->h_chksum[0] = 0;
- calculated = jbd2_chksum(j, j->j_csum_seed, tmpbuf, j->j_blocksize);
+ calculated = jbd2_chksum(j->j_csum_seed, tmpbuf, j->j_blocksize);
kfree(tmpbuf);
return provided == cpu_to_be32(calculated);
@@ -478,8 +478,8 @@ static int jbd2_block_tag_csum_verify(journal_t *j, journal_block_tag_t *tag,
return 1;
seq = cpu_to_be32(sequence);
- csum32 = jbd2_chksum(j, j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
- csum32 = jbd2_chksum(j, csum32, buf, j->j_blocksize);
+ csum32 = jbd2_chksum(j->j_csum_seed, (__u8 *)&seq, sizeof(seq));
+ csum32 = jbd2_chksum(csum32, buf, j->j_blocksize);
if (jbd2_has_feature_csum3(j))
return tag3->t_checksum == cpu_to_be32(csum32);
diff --git a/fs/jbd2/transaction.c b/fs/jbd2/transaction.c
index cbc4785462f5..c7867139af69 100644
--- a/fs/jbd2/transaction.c
+++ b/fs/jbd2/transaction.c
@@ -1509,7 +1509,7 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
jh->b_next_transaction == transaction);
spin_unlock(&jh->b_state_lock);
}
- if (jh->b_modified == 1) {
+ if (data_race(jh->b_modified == 1)) {
/* If it's in our transaction it must be in BJ_Metadata list. */
if (data_race(jh->b_transaction == transaction &&
jh->b_jlist != BJ_Metadata)) {
@@ -1528,7 +1528,6 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
goto out;
}
- journal = transaction->t_journal;
spin_lock(&jh->b_state_lock);
if (is_handle_aborted(handle)) {
@@ -1543,6 +1542,8 @@ int jbd2_journal_dirty_metadata(handle_t *handle, struct buffer_head *bh)
goto out_unlock_bh;
}
+ journal = transaction->t_journal;
+
if (jh->b_modified == 0) {
/*
* This buffer's got modified and becoming part
diff --git a/fs/jfs/jfs_discard.c b/fs/jfs/jfs_discard.c
index 5f4b305030ad..4b660296caf3 100644
--- a/fs/jfs/jfs_discard.c
+++ b/fs/jfs/jfs_discard.c
@@ -86,7 +86,8 @@ int jfs_ioc_trim(struct inode *ip, struct fstrim_range *range)
down_read(&sb->s_umount);
bmp = JFS_SBI(ip->i_sb)->bmap;
- if (minlen > bmp->db_agsize ||
+ if (bmp == NULL ||
+ minlen > bmp->db_agsize ||
start >= bmp->db_mapsize ||
range->len < sb->s_blocksize) {
up_read(&sb->s_umount);
diff --git a/fs/jfs/jfs_dmap.c b/fs/jfs/jfs_dmap.c
index 26e89d0c69b6..35e063c9f3a4 100644
--- a/fs/jfs/jfs_dmap.c
+++ b/fs/jfs/jfs_dmap.c
@@ -194,7 +194,11 @@ int dbMount(struct inode *ipbmap)
!bmp->db_numag || (bmp->db_numag > MAXAG) ||
(bmp->db_maxag >= MAXAG) || (bmp->db_maxag < 0) ||
(bmp->db_agpref >= MAXAG) || (bmp->db_agpref < 0) ||
- !bmp->db_agwidth ||
+ (bmp->db_agheight < 0) || (bmp->db_agheight > (L2LPERCTL >> 1)) ||
+ (bmp->db_agwidth < 1) || (bmp->db_agwidth > (LPERCTL / MAXAG)) ||
+ (bmp->db_agwidth > (1 << (L2LPERCTL - (bmp->db_agheight << 1)))) ||
+ (bmp->db_agstart < 0) ||
+ (bmp->db_agstart > (CTLTREESIZE - 1 - bmp->db_agwidth * (MAXAG - 1))) ||
(bmp->db_agl2size > L2MAXL2SIZE - L2MAXAG) ||
(bmp->db_agl2size < 0) ||
((bmp->db_mapsize - 1) >> bmp->db_agl2size) > MAXAG) {
diff --git a/fs/jfs/jfs_dtree.c b/fs/jfs/jfs_dtree.c
index 93db6eec4465..ab11849cf9cc 100644
--- a/fs/jfs/jfs_dtree.c
+++ b/fs/jfs/jfs_dtree.c
@@ -2613,7 +2613,7 @@ void dtInitRoot(tid_t tid, struct inode *ip, u32 idotdot)
* fsck.jfs should really fix this, but it currently does not.
* Called from jfs_readdir when bad index is detected.
*/
-static void add_missing_indices(struct inode *inode, s64 bn)
+static int add_missing_indices(struct inode *inode, s64 bn)
{
struct ldtentry *d;
struct dt_lock *dtlck;
@@ -2622,7 +2622,7 @@ static void add_missing_indices(struct inode *inode, s64 bn)
struct lv *lv;
struct metapage *mp;
dtpage_t *p;
- int rc;
+ int rc = 0;
s8 *stbl;
tid_t tid;
struct tlock *tlck;
@@ -2647,6 +2647,16 @@ static void add_missing_indices(struct inode *inode, s64 bn)
stbl = DT_GETSTBL(p);
for (i = 0; i < p->header.nextindex; i++) {
+ if (stbl[i] < 0) {
+ jfs_err("jfs: add_missing_indices: Invalid stbl[%d] = %d for inode %ld, block = %lld",
+ i, stbl[i], (long)inode->i_ino, (long long)bn);
+ rc = -EIO;
+
+ DT_PUTPAGE(mp);
+ txAbort(tid, 0);
+ goto end;
+ }
+
d = (struct ldtentry *) &p->slot[stbl[i]];
index = le32_to_cpu(d->index);
if ((index < 2) || (index >= JFS_IP(inode)->next_index)) {
@@ -2664,6 +2674,7 @@ static void add_missing_indices(struct inode *inode, s64 bn)
(void) txCommit(tid, 1, &inode, 0);
end:
txEnd(tid);
+ return rc;
}
/*
@@ -3017,7 +3028,8 @@ skip_one:
}
if (fix_page) {
- add_missing_indices(ip, bn);
+ if ((rc = add_missing_indices(ip, bn)))
+ goto out;
page_fixed = 1;
}
diff --git a/fs/kernfs/dir.c b/fs/kernfs/dir.c
index fc70d72c3fe8..a670ba3e565e 100644
--- a/fs/kernfs/dir.c
+++ b/fs/kernfs/dir.c
@@ -17,7 +17,6 @@
#include "kernfs-internal.h"
-DEFINE_RWLOCK(kernfs_rename_lock); /* kn->parent and ->name */
/*
* Don't use rename_lock to piggy back on pr_cont_buf. We don't want to
* call pr_cont() while holding rename_lock. Because sometimes pr_cont()
@@ -27,7 +26,6 @@ DEFINE_RWLOCK(kernfs_rename_lock); /* kn->parent and ->name */
*/
static DEFINE_SPINLOCK(kernfs_pr_cont_lock);
static char kernfs_pr_cont_buf[PATH_MAX]; /* protected by pr_cont_lock */
-static DEFINE_SPINLOCK(kernfs_idr_lock); /* root->ino_idr */
#define rb_to_kn(X) rb_entry((X), struct kernfs_node, rb)
@@ -229,7 +227,7 @@ int kernfs_path_from_node(struct kernfs_node *to, struct kernfs_node *from,
if (to) {
root = kernfs_root(to);
if (!(root->flags & KERNFS_ROOT_INVARIANT_PARENT)) {
- guard(read_lock_irqsave)(&kernfs_rename_lock);
+ guard(read_lock_irqsave)(&root->kernfs_rename_lock);
return kernfs_path_from_node_locked(to, from, buf, buflen);
}
}
@@ -296,12 +294,14 @@ out:
struct kernfs_node *kernfs_get_parent(struct kernfs_node *kn)
{
struct kernfs_node *parent;
+ struct kernfs_root *root;
unsigned long flags;
- read_lock_irqsave(&kernfs_rename_lock, flags);
+ root = kernfs_root(kn);
+ read_lock_irqsave(&root->kernfs_rename_lock, flags);
parent = kernfs_parent(kn);
kernfs_get(parent);
- read_unlock_irqrestore(&kernfs_rename_lock, flags);
+ read_unlock_irqrestore(&root->kernfs_rename_lock, flags);
return parent;
}
@@ -584,9 +584,9 @@ void kernfs_put(struct kernfs_node *kn)
if (kernfs_type(kn) == KERNFS_LINK)
kernfs_put(kn->symlink.target_kn);
- spin_lock(&kernfs_idr_lock);
+ spin_lock(&root->kernfs_idr_lock);
idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
- spin_unlock(&kernfs_idr_lock);
+ spin_unlock(&root->kernfs_idr_lock);
call_rcu(&kn->rcu, kernfs_free_rcu);
@@ -639,13 +639,13 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
goto err_out1;
idr_preload(GFP_KERNEL);
- spin_lock(&kernfs_idr_lock);
+ spin_lock(&root->kernfs_idr_lock);
ret = idr_alloc_cyclic(&root->ino_idr, kn, 1, 0, GFP_ATOMIC);
if (ret >= 0 && ret < root->last_id_lowbits)
root->id_highbits++;
id_highbits = root->id_highbits;
root->last_id_lowbits = ret;
- spin_unlock(&kernfs_idr_lock);
+ spin_unlock(&root->kernfs_idr_lock);
idr_preload_end();
if (ret < 0)
goto err_out2;
@@ -681,9 +681,9 @@ static struct kernfs_node *__kernfs_new_node(struct kernfs_root *root,
return kn;
err_out3:
- spin_lock(&kernfs_idr_lock);
+ spin_lock(&root->kernfs_idr_lock);
idr_remove(&root->ino_idr, (u32)kernfs_ino(kn));
- spin_unlock(&kernfs_idr_lock);
+ spin_unlock(&root->kernfs_idr_lock);
err_out2:
kmem_cache_free(kernfs_node_cache, kn);
err_out1:
@@ -989,10 +989,12 @@ struct kernfs_root *kernfs_create_root(struct kernfs_syscall_ops *scops,
return ERR_PTR(-ENOMEM);
idr_init(&root->ino_idr);
+ spin_lock_init(&root->kernfs_idr_lock);
init_rwsem(&root->kernfs_rwsem);
init_rwsem(&root->kernfs_iattr_rwsem);
init_rwsem(&root->kernfs_supers_rwsem);
INIT_LIST_HEAD(&root->supers);
+ rwlock_init(&root->kernfs_rename_lock);
/*
* On 64bit ino setups, id is ino. On 32bit, low 32bits are ino.
@@ -1580,8 +1582,9 @@ void kernfs_break_active_protection(struct kernfs_node *kn)
* invoked before finishing the kernfs operation. Note that while this
* function restores the active reference, it doesn't and can't actually
* restore the active protection - @kn may already or be in the process of
- * being removed. Once kernfs_break_active_protection() is invoked, that
- * protection is irreversibly gone for the kernfs operation instance.
+ * being drained and removed. Once kernfs_break_active_protection() is
+ * invoked, that protection is irreversibly gone for the kernfs operation
+ * instance.
*
* While this function may be called at any point after
* kernfs_break_active_protection() is invoked, its most useful location
@@ -1789,7 +1792,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
/* rename_lock protects ->parent accessors */
if (old_parent != new_parent) {
kernfs_get(new_parent);
- write_lock_irq(&kernfs_rename_lock);
+ write_lock_irq(&root->kernfs_rename_lock);
rcu_assign_pointer(kn->__parent, new_parent);
@@ -1797,7 +1800,7 @@ int kernfs_rename_ns(struct kernfs_node *kn, struct kernfs_node *new_parent,
if (new_name)
rcu_assign_pointer(kn->name, new_name);
- write_unlock_irq(&kernfs_rename_lock);
+ write_unlock_irq(&root->kernfs_rename_lock);
kernfs_put(old_parent);
} else {
/* name assignment is RCU protected, parent is the same */
diff --git a/fs/kernfs/file.c b/fs/kernfs/file.c
index 66fe8fe41f06..a6c692cac616 100644
--- a/fs/kernfs/file.c
+++ b/fs/kernfs/file.c
@@ -778,8 +778,9 @@ bool kernfs_should_drain_open_files(struct kernfs_node *kn)
/*
* @kn being deactivated guarantees that @kn->attr.open can't change
* beneath us making the lockless test below safe.
+ * Callers post kernfs_unbreak_active_protection may be counted in
+ * kn->active by now, do not WARN_ON because of them.
*/
- WARN_ON_ONCE(atomic_read(&kn->active) != KN_DEACTIVATED_BIAS);
rcu_read_lock();
on = rcu_dereference(kn->attr.open);
diff --git a/fs/kernfs/kernfs-internal.h b/fs/kernfs/kernfs-internal.h
index 40a2a9cd819d..6061b6f70d2a 100644
--- a/fs/kernfs/kernfs-internal.h
+++ b/fs/kernfs/kernfs-internal.h
@@ -19,8 +19,6 @@
#include <linux/kernfs.h>
#include <linux/fs_context.h>
-extern rwlock_t kernfs_rename_lock;
-
struct kernfs_iattrs {
kuid_t ia_uid;
kgid_t ia_gid;
@@ -40,6 +38,7 @@ struct kernfs_root {
/* private fields, do not use outside kernfs proper */
struct idr ino_idr;
+ spinlock_t kernfs_idr_lock; /* root->ino_idr */
u32 last_id_lowbits;
u32 id_highbits;
struct kernfs_syscall_ops *syscall_ops;
@@ -52,6 +51,9 @@ struct kernfs_root {
struct rw_semaphore kernfs_iattr_rwsem;
struct rw_semaphore kernfs_supers_rwsem;
+ /* kn->parent and kn->name */
+ rwlock_t kernfs_rename_lock;
+
struct rcu_head rcu;
};
@@ -107,6 +109,11 @@ static inline bool kernfs_root_is_locked(const struct kernfs_node *kn)
return lockdep_is_held(&kernfs_root(kn)->kernfs_rwsem);
}
+static inline bool kernfs_rename_is_locked(const struct kernfs_node *kn)
+{
+ return lockdep_is_held(&kernfs_root(kn)->kernfs_rename_lock);
+}
+
static inline const char *kernfs_rcu_name(const struct kernfs_node *kn)
{
return rcu_dereference_check(kn->name, kernfs_root_is_locked(kn));
@@ -117,14 +124,15 @@ static inline struct kernfs_node *kernfs_parent(const struct kernfs_node *kn)
/*
* The kernfs_node::__parent remains valid within a RCU section. The kn
* can be reparented (and renamed) which changes the entry. This can be
- * avoided by locking kernfs_root::kernfs_rwsem or kernfs_rename_lock.
+ * avoided by locking kernfs_root::kernfs_rwsem or
+ * kernfs_root::kernfs_rename_lock.
* Both locks can be used to obtain a reference on __parent. Once the
* reference count reaches 0 then the node is about to be freed
* and can not be renamed (or become a different parent) anymore.
*/
return rcu_dereference_check(kn->__parent,
kernfs_root_is_locked(kn) ||
- lockdep_is_held(&kernfs_rename_lock) ||
+ kernfs_rename_is_locked(kn) ||
!atomic_read(&kn->count));
}
diff --git a/fs/nfsd/Kconfig b/fs/nfsd/Kconfig
index 731a88f6313e..879e0b104d1c 100644
--- a/fs/nfsd/Kconfig
+++ b/fs/nfsd/Kconfig
@@ -77,8 +77,8 @@ config NFSD_V4
select FS_POSIX_ACL
select RPCSEC_GSS_KRB5
select CRYPTO
+ select CRYPTO_LIB_SHA256
select CRYPTO_MD5
- select CRYPTO_SHA256
select GRACE_PERIOD
select NFS_V4_2_SSC_HELPER if NFS_V4_2
help
diff --git a/fs/nfsd/Makefile b/fs/nfsd/Makefile
index 2f687619f65b..55744bb786c9 100644
--- a/fs/nfsd/Makefile
+++ b/fs/nfsd/Makefile
@@ -24,6 +24,7 @@ nfsd-$(CONFIG_NFSD_BLOCKLAYOUT) += blocklayout.o blocklayoutxdr.o
nfsd-$(CONFIG_NFSD_SCSILAYOUT) += blocklayout.o blocklayoutxdr.o
nfsd-$(CONFIG_NFSD_FLEXFILELAYOUT) += flexfilelayout.o flexfilelayoutxdr.o
nfsd-$(CONFIG_NFS_LOCALIO) += localio.o
+nfsd-$(CONFIG_DEBUG_FS) += debugfs.o
.PHONY: xdrgen
diff --git a/fs/nfsd/debugfs.c b/fs/nfsd/debugfs.c
new file mode 100644
index 000000000000..84b0c8b559dc
--- /dev/null
+++ b/fs/nfsd/debugfs.c
@@ -0,0 +1,47 @@
+// SPDX-License-Identifier: GPL-2.0
+
+#include <linux/debugfs.h>
+
+#include "nfsd.h"
+
+static struct dentry *nfsd_top_dir __read_mostly;
+
+/*
+ * /sys/kernel/debug/nfsd/disable-splice-read
+ *
+ * Contents:
+ * %0: NFS READ is allowed to use page splicing
+ * %1: NFS READ uses only iov iter read
+ *
+ * The default value of this setting is zero (page splicing is
+ * allowed). This setting takes immediate effect for all NFS
+ * versions, all exports, and in all NFSD net namespaces.
+ */
+
+static int nfsd_dsr_get(void *data, u64 *val)
+{
+ *val = nfsd_disable_splice_read ? 1 : 0;
+ return 0;
+}
+
+static int nfsd_dsr_set(void *data, u64 val)
+{
+ nfsd_disable_splice_read = (val > 0) ? true : false;
+ return 0;
+}
+
+DEFINE_DEBUGFS_ATTRIBUTE(nfsd_dsr_fops, nfsd_dsr_get, nfsd_dsr_set, "%llu\n");
+
+void nfsd_debugfs_exit(void)
+{
+ debugfs_remove_recursive(nfsd_top_dir);
+ nfsd_top_dir = NULL;
+}
+
+void nfsd_debugfs_init(void)
+{
+ nfsd_top_dir = debugfs_create_dir("nfsd", NULL);
+
+ debugfs_create_file("disable-splice-read", S_IWUSR | S_IRUGO,
+ nfsd_top_dir, NULL, &nfsd_dsr_fops);
+}
diff --git a/fs/nfsd/export.c b/fs/nfsd/export.c
index 0363720280d4..88ae410b4113 100644
--- a/fs/nfsd/export.c
+++ b/fs/nfsd/export.c
@@ -1124,7 +1124,8 @@ __be32 check_nfsd_access(struct svc_export *exp, struct svc_rqst *rqstp,
test_bit(XPT_PEER_AUTH, &xprt->xpt_flags))
goto ok;
}
- goto denied;
+ if (!may_bypass_gss)
+ goto denied;
ok:
/* legacy gss-only clients are always OK: */
diff --git a/fs/nfsd/nfs3proc.c b/fs/nfsd/nfs3proc.c
index ac1731eb34ab..a817d8485d21 100644
--- a/fs/nfsd/nfs3proc.c
+++ b/fs/nfsd/nfs3proc.c
@@ -14,6 +14,7 @@
#include "xdr3.h"
#include "vfs.h"
#include "filecache.h"
+#include "trace.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -69,8 +70,7 @@ nfsd3_proc_getattr(struct svc_rqst *rqstp)
struct nfsd_fhandle *argp = rqstp->rq_argp;
struct nfsd3_attrstat *resp = rqstp->rq_resp;
- dprintk("nfsd: GETATTR(3) %s\n",
- SVCFH_fmt(&argp->fh));
+ trace_nfsd_vfs_getattr(rqstp, &argp->fh);
fh_copy(&resp->fh, &argp->fh);
resp->status = fh_verify(rqstp, &resp->fh, 0,
@@ -220,7 +220,6 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
struct nfsd3_writeargs *argp = rqstp->rq_argp;
struct nfsd3_writeres *resp = rqstp->rq_resp;
unsigned long cnt = argp->len;
- unsigned int nvecs;
dprintk("nfsd: WRITE(3) %s %d bytes at %Lu%s\n",
SVCFH_fmt(&argp->fh),
@@ -235,10 +234,8 @@ nfsd3_proc_write(struct svc_rqst *rqstp)
fh_copy(&resp->fh, &argp->fh);
resp->committed = argp->stable;
- nvecs = svc_fill_write_vector(rqstp, &argp->payload);
-
resp->status = nfsd_write(rqstp, &resp->fh, argp->offset,
- rqstp->rq_vec, nvecs, &cnt,
+ &argp->payload, &cnt,
resp->committed, resp->verf);
resp->count = cnt;
resp->status = nfsd3_map_status(resp->status);
@@ -266,6 +263,8 @@ nfsd3_create_file(struct svc_rqst *rqstp, struct svc_fh *fhp,
__be32 status;
int host_err;
+ trace_nfsd_vfs_create(rqstp, fhp, S_IFREG, argp->name, argp->len);
+
if (isdotent(argp->name, argp->len))
return nfserr_exist;
if (!(iap->ia_valid & ATTR_MODE))
@@ -382,11 +381,6 @@ nfsd3_proc_create(struct svc_rqst *rqstp)
struct nfsd3_diropres *resp = rqstp->rq_resp;
svc_fh *dirfhp, *newfhp;
- dprintk("nfsd: CREATE(3) %s %.*s\n",
- SVCFH_fmt(&argp->fh),
- argp->len,
- argp->name);
-
dirfhp = fh_copy(&resp->dirfh, &argp->fh);
newfhp = fh_init(&resp->fh, NFS3_FHSIZE);
@@ -407,11 +401,6 @@ nfsd3_proc_mkdir(struct svc_rqst *rqstp)
.na_iattr = &argp->attrs,
};
- dprintk("nfsd: MKDIR(3) %s %.*s\n",
- SVCFH_fmt(&argp->fh),
- argp->len,
- argp->name);
-
argp->attrs.ia_valid &= ~ATTR_SIZE;
fh_copy(&resp->dirfh, &argp->fh);
fh_init(&resp->fh, NFS3_FHSIZE);
@@ -447,11 +436,6 @@ nfsd3_proc_symlink(struct svc_rqst *rqstp)
goto out;
}
- dprintk("nfsd: SYMLINK(3) %s %.*s -> %.*s\n",
- SVCFH_fmt(&argp->ffh),
- argp->flen, argp->fname,
- argp->tlen, argp->tname);
-
fh_copy(&resp->dirfh, &argp->ffh);
fh_init(&resp->fh, NFS3_FHSIZE);
resp->status = nfsd_symlink(rqstp, &resp->dirfh, argp->fname,
@@ -476,11 +460,6 @@ nfsd3_proc_mknod(struct svc_rqst *rqstp)
int type;
dev_t rdev = 0;
- dprintk("nfsd: MKNOD(3) %s %.*s\n",
- SVCFH_fmt(&argp->fh),
- argp->len,
- argp->name);
-
fh_copy(&resp->dirfh, &argp->fh);
fh_init(&resp->fh, NFS3_FHSIZE);
@@ -513,11 +492,6 @@ nfsd3_proc_remove(struct svc_rqst *rqstp)
struct nfsd3_diropargs *argp = rqstp->rq_argp;
struct nfsd3_attrstat *resp = rqstp->rq_resp;
- dprintk("nfsd: REMOVE(3) %s %.*s\n",
- SVCFH_fmt(&argp->fh),
- argp->len,
- argp->name);
-
/* Unlink. -S_IFDIR means file must not be a directory */
fh_copy(&resp->fh, &argp->fh);
resp->status = nfsd_unlink(rqstp, &resp->fh, -S_IFDIR,
@@ -535,11 +509,6 @@ nfsd3_proc_rmdir(struct svc_rqst *rqstp)
struct nfsd3_diropargs *argp = rqstp->rq_argp;
struct nfsd3_attrstat *resp = rqstp->rq_resp;
- dprintk("nfsd: RMDIR(3) %s %.*s\n",
- SVCFH_fmt(&argp->fh),
- argp->len,
- argp->name);
-
fh_copy(&resp->fh, &argp->fh);
resp->status = nfsd_unlink(rqstp, &resp->fh, S_IFDIR,
argp->name, argp->len);
@@ -553,15 +522,6 @@ nfsd3_proc_rename(struct svc_rqst *rqstp)
struct nfsd3_renameargs *argp = rqstp->rq_argp;
struct nfsd3_renameres *resp = rqstp->rq_resp;
- dprintk("nfsd: RENAME(3) %s %.*s ->\n",
- SVCFH_fmt(&argp->ffh),
- argp->flen,
- argp->fname);
- dprintk("nfsd: -> %s %.*s\n",
- SVCFH_fmt(&argp->tfh),
- argp->tlen,
- argp->tname);
-
fh_copy(&resp->ffh, &argp->ffh);
fh_copy(&resp->tfh, &argp->tfh);
resp->status = nfsd_rename(rqstp, &resp->ffh, argp->fname, argp->flen,
@@ -576,13 +536,6 @@ nfsd3_proc_link(struct svc_rqst *rqstp)
struct nfsd3_linkargs *argp = rqstp->rq_argp;
struct nfsd3_linkres *resp = rqstp->rq_resp;
- dprintk("nfsd: LINK(3) %s ->\n",
- SVCFH_fmt(&argp->ffh));
- dprintk("nfsd: -> %s %.*s\n",
- SVCFH_fmt(&argp->tfh),
- argp->tlen,
- argp->tname);
-
fh_copy(&resp->fh, &argp->ffh);
fh_copy(&resp->tfh, &argp->tfh);
resp->status = nfsd_link(rqstp, &resp->tfh, argp->tname, argp->tlen,
@@ -621,9 +574,7 @@ nfsd3_proc_readdir(struct svc_rqst *rqstp)
struct nfsd3_readdirres *resp = rqstp->rq_resp;
loff_t offset;
- dprintk("nfsd: READDIR(3) %s %d bytes at %d\n",
- SVCFH_fmt(&argp->fh),
- argp->count, (u32) argp->cookie);
+ trace_nfsd_vfs_readdir(rqstp, &argp->fh, argp->count, argp->cookie);
nfsd3_init_dirlist_pages(rqstp, resp, argp->count);
@@ -655,9 +606,7 @@ nfsd3_proc_readdirplus(struct svc_rqst *rqstp)
struct nfsd3_readdirres *resp = rqstp->rq_resp;
loff_t offset;
- dprintk("nfsd: READDIR+(3) %s %d bytes at %d\n",
- SVCFH_fmt(&argp->fh),
- argp->count, (u32) argp->cookie);
+ trace_nfsd_vfs_readdir(rqstp, &argp->fh, argp->count, argp->cookie);
nfsd3_init_dirlist_pages(rqstp, resp, argp->count);
@@ -698,9 +647,6 @@ nfsd3_proc_fsstat(struct svc_rqst *rqstp)
struct nfsd_fhandle *argp = rqstp->rq_argp;
struct nfsd3_fsstatres *resp = rqstp->rq_resp;
- dprintk("nfsd: FSSTAT(3) %s\n",
- SVCFH_fmt(&argp->fh));
-
resp->status = nfsd_statfs(rqstp, &argp->fh, &resp->stats, 0);
fh_put(&argp->fh);
resp->status = nfsd3_map_status(resp->status);
diff --git a/fs/nfsd/nfs4callback.c b/fs/nfsd/nfs4callback.c
index ec6539cec0fe..ccb00aa93be0 100644
--- a/fs/nfsd/nfs4callback.c
+++ b/fs/nfsd/nfs4callback.c
@@ -417,6 +417,29 @@ static u32 highest_slotid(struct nfsd4_session *ses)
return idx;
}
+static void
+encode_referring_call4(struct xdr_stream *xdr,
+ const struct nfsd4_referring_call *rc)
+{
+ encode_uint32(xdr, rc->rc_sequenceid);
+ encode_uint32(xdr, rc->rc_slotid);
+}
+
+static void
+encode_referring_call_list4(struct xdr_stream *xdr,
+ const struct nfsd4_referring_call_list *rcl)
+{
+ struct nfsd4_referring_call *rc;
+ __be32 *p;
+
+ p = xdr_reserve_space(xdr, NFS4_MAX_SESSIONID_LEN);
+ xdr_encode_opaque_fixed(p, rcl->rcl_sessionid.data,
+ NFS4_MAX_SESSIONID_LEN);
+ encode_uint32(xdr, rcl->__nr_referring_calls);
+ list_for_each_entry(rc, &rcl->rcl_referring_calls, __list)
+ encode_referring_call4(xdr, rc);
+}
+
/*
* CB_SEQUENCE4args
*
@@ -434,6 +457,7 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
struct nfs4_cb_compound_hdr *hdr)
{
struct nfsd4_session *session = cb->cb_clp->cl_cb_session;
+ struct nfsd4_referring_call_list *rcl;
__be32 *p;
if (hdr->minorversion == 0)
@@ -442,12 +466,16 @@ static void encode_cb_sequence4args(struct xdr_stream *xdr,
encode_nfs_cb_opnum4(xdr, OP_CB_SEQUENCE);
encode_sessionid4(xdr, session);
- p = xdr_reserve_space(xdr, 4 + 4 + 4 + 4 + 4);
+ p = xdr_reserve_space(xdr, XDR_UNIT * 4);
*p++ = cpu_to_be32(session->se_cb_seq_nr[cb->cb_held_slot]); /* csa_sequenceid */
*p++ = cpu_to_be32(cb->cb_held_slot); /* csa_slotid */
*p++ = cpu_to_be32(highest_slotid(session)); /* csa_highest_slotid */
*p++ = xdr_zero; /* csa_cachethis */
- xdr_encode_empty_array(p); /* csa_referring_call_lists */
+
+ /* csa_referring_call_lists */
+ encode_uint32(xdr, cb->cb_nr_referring_call_list);
+ list_for_each_entry(rcl, &cb->cb_referring_call_list, __list)
+ encode_referring_call_list4(xdr, rcl);
hdr->nops++;
}
@@ -1320,10 +1348,102 @@ static void nfsd41_destroy_cb(struct nfsd4_callback *cb)
nfsd41_cb_inflight_end(clp);
}
-/*
- * TODO: cb_sequence should support referring call lists, cachethis,
- * and mark callback channel down on communication errors.
+/**
+ * nfsd41_cb_referring_call - add a referring call to a callback operation
+ * @cb: context of callback to add the rc to
+ * @sessionid: referring call's session ID
+ * @slotid: referring call's session slot index
+ * @seqno: referring call's slot sequence number
+ *
+ * Caller serializes access to @cb.
+ *
+ * NB: If memory allocation fails, the referring call is not added.
*/
+void nfsd41_cb_referring_call(struct nfsd4_callback *cb,
+ struct nfs4_sessionid *sessionid,
+ u32 slotid, u32 seqno)
+{
+ struct nfsd4_referring_call_list *rcl;
+ struct nfsd4_referring_call *rc;
+ bool found;
+
+ might_sleep();
+
+ found = false;
+ list_for_each_entry(rcl, &cb->cb_referring_call_list, __list) {
+ if (!memcmp(rcl->rcl_sessionid.data, sessionid->data,
+ NFS4_MAX_SESSIONID_LEN)) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ rcl = kmalloc(sizeof(*rcl), GFP_KERNEL);
+ if (!rcl)
+ return;
+ memcpy(rcl->rcl_sessionid.data, sessionid->data,
+ NFS4_MAX_SESSIONID_LEN);
+ rcl->__nr_referring_calls = 0;
+ INIT_LIST_HEAD(&rcl->rcl_referring_calls);
+ list_add(&rcl->__list, &cb->cb_referring_call_list);
+ cb->cb_nr_referring_call_list++;
+ }
+
+ found = false;
+ list_for_each_entry(rc, &rcl->rcl_referring_calls, __list) {
+ if (rc->rc_sequenceid == seqno && rc->rc_slotid == slotid) {
+ found = true;
+ break;
+ }
+ }
+ if (!found) {
+ rc = kmalloc(sizeof(*rc), GFP_KERNEL);
+ if (!rc)
+ goto out;
+ rc->rc_sequenceid = seqno;
+ rc->rc_slotid = slotid;
+ rcl->__nr_referring_calls++;
+ list_add(&rc->__list, &rcl->rcl_referring_calls);
+ }
+
+out:
+ if (!rcl->__nr_referring_calls) {
+ cb->cb_nr_referring_call_list--;
+ kfree(rcl);
+ }
+}
+
+/**
+ * nfsd41_cb_destroy_referring_call_list - release referring call info
+ * @cb: context of a callback that has completed
+ *
+ * Callers who allocate referring calls using nfsd41_cb_referring_call() must
+ * release those resources by calling nfsd41_cb_destroy_referring_call_list.
+ *
+ * Caller serializes access to @cb.
+ */
+void nfsd41_cb_destroy_referring_call_list(struct nfsd4_callback *cb)
+{
+ struct nfsd4_referring_call_list *rcl;
+ struct nfsd4_referring_call *rc;
+
+ while (!list_empty(&cb->cb_referring_call_list)) {
+ rcl = list_first_entry(&cb->cb_referring_call_list,
+ struct nfsd4_referring_call_list,
+ __list);
+
+ while (!list_empty(&rcl->rcl_referring_calls)) {
+ rc = list_first_entry(&rcl->rcl_referring_calls,
+ struct nfsd4_referring_call,
+ __list);
+ list_del(&rc->__list);
+ kfree(rc);
+ }
+ list_del(&rcl->__list);
+ kfree(rcl);
+ }
+}
+
static void nfsd4_cb_prepare(struct rpc_task *task, void *calldata)
{
struct nfsd4_callback *cb = calldata;
@@ -1643,6 +1763,8 @@ void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
INIT_WORK(&cb->cb_work, nfsd4_run_cb_work);
cb->cb_status = 0;
cb->cb_held_slot = -1;
+ cb->cb_nr_referring_call_list = 0;
+ INIT_LIST_HEAD(&cb->cb_referring_call_list);
}
/**
diff --git a/fs/nfsd/nfs4proc.c b/fs/nfsd/nfs4proc.c
index fd560dcf6059..f13abbb13b38 100644
--- a/fs/nfsd/nfs4proc.c
+++ b/fs/nfsd/nfs4proc.c
@@ -878,6 +878,8 @@ nfsd4_getattr(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd4_getattr *getattr = &u->getattr;
__be32 status;
+ trace_nfsd_vfs_getattr(rqstp, &cstate->current_fh);
+
status = fh_verify(rqstp, &cstate->current_fh, 0, NFSD_MAY_NOP);
if (status)
return status;
@@ -1000,6 +1002,9 @@ nfsd4_readdir(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
u64 cookie = readdir->rd_cookie;
static const nfs4_verifier zeroverf;
+ trace_nfsd_vfs_readdir(rqstp, &cstate->current_fh,
+ readdir->rd_maxcount, readdir->rd_cookie);
+
/* no need to check permission - this will be done in nfsd_readdir() */
if (readdir->rd_bmval[1] & NFSD_WRITEONLY_ATTRS_WORD1)
@@ -1213,7 +1218,6 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
struct nfsd_file *nf = NULL;
__be32 status = nfs_ok;
unsigned long cnt;
- int nvecs;
if (write->wr_offset > (u64)OFFSET_MAX ||
write->wr_offset + write->wr_buflen > (u64)OFFSET_MAX)
@@ -1228,13 +1232,9 @@ nfsd4_write(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
return status;
write->wr_how_written = write->wr_stable_how;
-
- nvecs = svc_fill_write_vector(rqstp, &write->wr_payload);
- WARN_ON_ONCE(nvecs > ARRAY_SIZE(rqstp->rq_vec));
-
status = nfsd_vfs_write(rqstp, &cstate->current_fh, nf,
- write->wr_offset, rqstp->rq_vec, nvecs, &cnt,
- write->wr_how_written,
+ write->wr_offset, &write->wr_payload,
+ &cnt, write->wr_how_written,
(__be32 *)write->wr_verifier.data);
nfsd_file_put(nf);
@@ -1381,8 +1381,11 @@ static void nfs4_put_copy(struct nfsd4_copy *copy)
static void nfsd4_stop_copy(struct nfsd4_copy *copy)
{
trace_nfsd_copy_async_cancel(copy);
- if (!test_and_set_bit(NFSD4_COPY_F_STOPPED, &copy->cp_flags))
+ if (!test_and_set_bit(NFSD4_COPY_F_STOPPED, &copy->cp_flags)) {
kthread_stop(copy->copy_task);
+ copy->nfserr = nfs_ok;
+ set_bit(NFSD4_COPY_F_COMPLETED, &copy->cp_flags);
+ }
nfs4_put_copy(copy);
}
@@ -1711,10 +1714,11 @@ static int nfsd4_cb_offload_done(struct nfsd4_callback *cb,
switch (task->tk_status) {
case -NFS4ERR_DELAY:
if (cbo->co_retries--) {
- rpc_delay(task, 1 * HZ);
+ rpc_delay(task, HZ / 5);
return 0;
}
}
+ nfsd41_cb_destroy_referring_call_list(cb);
return 1;
}
@@ -1847,6 +1851,9 @@ static void nfsd4_send_cb_offload(struct nfsd4_copy *copy)
nfsd4_init_cb(&cbo->co_cb, copy->cp_clp, &nfsd4_cb_offload_ops,
NFSPROC4_CLNT_CB_OFFLOAD);
+ nfsd41_cb_referring_call(&cbo->co_cb, &cbo->co_referring_sessionid,
+ cbo->co_referring_slotid,
+ cbo->co_referring_seqno);
trace_nfsd_cb_offload(copy->cp_clp, &cbo->co_res.cb_stateid,
&cbo->co_fh, copy->cp_count, copy->nfserr);
nfsd4_try_run_cb(&cbo->co_cb);
@@ -1963,6 +1970,11 @@ nfsd4_copy(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
memcpy(&result->cb_stateid, &copy->cp_stateid.cs_stid,
sizeof(result->cb_stateid));
dup_copy_fields(copy, async_copy);
+ memcpy(async_copy->cp_cb_offload.co_referring_sessionid.data,
+ cstate->session->se_sessionid.data,
+ NFS4_MAX_SESSIONID_LEN);
+ async_copy->cp_cb_offload.co_referring_slotid = cstate->slot->sl_index;
+ async_copy->cp_cb_offload.co_referring_seqno = cstate->slot->sl_seqid;
async_copy->copy_task = kthread_create(nfsd4_do_async_copy,
async_copy, "%s", "copy thread");
if (IS_ERR(async_copy->copy_task))
@@ -3768,7 +3780,8 @@ bool nfsd4_spo_must_allow(struct svc_rqst *rqstp)
struct nfs4_op_map *allow = &cstate->clp->cl_spo_must_allow;
u32 opiter;
- if (!cstate->minorversion)
+ if (rqstp->rq_procinfo != &nfsd_version4.vs_proc[NFSPROC4_COMPOUND] ||
+ cstate->minorversion == 0)
return false;
if (cstate->spo_must_allowed)
@@ -3834,7 +3847,7 @@ static const struct svc_procedure nfsd_procedures4[2] = {
.pc_ressize = sizeof(struct nfsd4_compoundres),
.pc_release = nfsd4_release_compoundargs,
.pc_cachetype = RC_NOCACHE,
- .pc_xdrressize = NFSD_BUFSIZE/4,
+ .pc_xdrressize = 3+NFSSVC_MAXBLKSIZE/4,
.pc_name = "COMPOUND",
},
};
diff --git a/fs/nfsd/nfs4recover.c b/fs/nfsd/nfs4recover.c
index acde3edab733..82785db730d9 100644
--- a/fs/nfsd/nfs4recover.c
+++ b/fs/nfsd/nfs4recover.c
@@ -33,6 +33,7 @@
*/
#include <crypto/hash.h>
+#include <crypto/sha2.h>
#include <linux/file.h>
#include <linux/slab.h>
#include <linux/namei.h>
@@ -737,7 +738,6 @@ struct cld_net {
spinlock_t cn_lock;
struct list_head cn_list;
unsigned int cn_xid;
- struct crypto_shash *cn_tfm;
#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING
bool cn_has_legacy;
#endif
@@ -1063,8 +1063,6 @@ nfsd4_remove_cld_pipe(struct net *net)
nfsd4_cld_unregister_net(net, cn->cn_pipe);
rpc_destroy_pipe_data(cn->cn_pipe);
- if (cn->cn_tfm)
- crypto_free_shash(cn->cn_tfm);
kfree(nn->cld_net);
nn->cld_net = NULL;
}
@@ -1158,8 +1156,6 @@ nfsd4_cld_create_v2(struct nfs4_client *clp)
struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
struct cld_net *cn = nn->cld_net;
struct cld_msg_v2 *cmsg;
- struct crypto_shash *tfm = cn->cn_tfm;
- struct xdr_netobj cksum;
char *principal = NULL;
/* Don't upcall if it's already stored */
@@ -1182,22 +1178,9 @@ nfsd4_cld_create_v2(struct nfs4_client *clp)
else if (clp->cl_cred.cr_principal)
principal = clp->cl_cred.cr_principal;
if (principal) {
- cksum.len = crypto_shash_digestsize(tfm);
- cksum.data = kmalloc(cksum.len, GFP_KERNEL);
- if (cksum.data == NULL) {
- ret = -ENOMEM;
- goto out;
- }
- ret = crypto_shash_tfm_digest(tfm, principal, strlen(principal),
- cksum.data);
- if (ret) {
- kfree(cksum.data);
- goto out;
- }
- cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = cksum.len;
- memcpy(cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data,
- cksum.data, cksum.len);
- kfree(cksum.data);
+ sha256(principal, strlen(principal),
+ cmsg->cm_u.cm_clntinfo.cc_princhash.cp_data);
+ cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = SHA256_DIGEST_SIZE;
} else
cmsg->cm_u.cm_clntinfo.cc_princhash.cp_len = 0;
@@ -1207,7 +1190,6 @@ nfsd4_cld_create_v2(struct nfs4_client *clp)
set_bit(NFSD4_CLIENT_STABLE, &clp->cl_flags);
}
-out:
free_cld_upcall(cup);
out_err:
if (ret)
@@ -1346,12 +1328,11 @@ found:
static int
nfsd4_cld_check_v2(struct nfs4_client *clp)
{
- struct nfs4_client_reclaim *crp;
struct nfsd_net *nn = net_generic(clp->net, nfsd_net_id);
+#ifdef CONFIG_NFSD_LEGACY_CLIENT_TRACKING
struct cld_net *cn = nn->cld_net;
- int status;
- struct crypto_shash *tfm = cn->cn_tfm;
- struct xdr_netobj cksum;
+#endif
+ struct nfs4_client_reclaim *crp;
char *principal = NULL;
/* did we already find that this client is stable? */
@@ -1367,6 +1348,7 @@ nfsd4_cld_check_v2(struct nfs4_client *clp)
if (cn->cn_has_legacy) {
struct xdr_netobj name;
char dname[HEXDIR_LEN];
+ int status;
status = nfs4_make_rec_clidname(dname, &clp->cl_name);
if (status)
@@ -1389,28 +1371,18 @@ nfsd4_cld_check_v2(struct nfs4_client *clp)
return -ENOENT;
found:
if (crp->cr_princhash.len) {
+ u8 digest[SHA256_DIGEST_SIZE];
+
if (clp->cl_cred.cr_raw_principal)
principal = clp->cl_cred.cr_raw_principal;
else if (clp->cl_cred.cr_principal)
principal = clp->cl_cred.cr_principal;
if (principal == NULL)
return -ENOENT;
- cksum.len = crypto_shash_digestsize(tfm);
- cksum.data = kmalloc(cksum.len, GFP_KERNEL);
- if (cksum.data == NULL)
- return -ENOENT;
- status = crypto_shash_tfm_digest(tfm, principal,
- strlen(principal), cksum.data);
- if (status) {
- kfree(cksum.data);
+ sha256(principal, strlen(principal), digest);
+ if (memcmp(crp->cr_princhash.data, digest,
+ crp->cr_princhash.len))
return -ENOENT;
- }
- if (memcmp(crp->cr_princhash.data, cksum.data,
- crp->cr_princhash.len)) {
- kfree(cksum.data);
- return -ENOENT;
- }
- kfree(cksum.data);
}
crp->cr_clp = clp;
return 0;
@@ -1590,7 +1562,6 @@ nfsd4_cld_tracking_init(struct net *net)
struct nfsd_net *nn = net_generic(net, nfsd_net_id);
bool running;
int retries = 10;
- struct crypto_shash *tfm;
status = nfs4_cld_state_init(net);
if (status)
@@ -1615,12 +1586,6 @@ nfsd4_cld_tracking_init(struct net *net)
status = -ETIMEDOUT;
goto err_remove;
}
- tfm = crypto_alloc_shash("sha256", 0, 0);
- if (IS_ERR(tfm)) {
- status = PTR_ERR(tfm);
- goto err_remove;
- }
- nn->cld_net->cn_tfm = tfm;
status = nfsd4_cld_get_version(nn);
if (status == -EOPNOTSUPP)
diff --git a/fs/nfsd/nfs4state.c b/fs/nfsd/nfs4state.c
index 59a693f22452..d5694987f86f 100644
--- a/fs/nfsd/nfs4state.c
+++ b/fs/nfsd/nfs4state.c
@@ -1987,26 +1987,30 @@ reduce_session_slots(struct nfsd4_session *ses, int dec)
return ret;
}
-/*
- * We don't actually need to cache the rpc and session headers, so we
- * can allocate a little less for each slot:
- */
-static inline u32 slot_bytes(struct nfsd4_channel_attrs *ca)
+static struct nfsd4_slot *nfsd4_alloc_slot(struct nfsd4_channel_attrs *fattrs,
+ int index, gfp_t gfp)
{
- u32 size;
+ struct nfsd4_slot *slot;
+ size_t size;
- if (ca->maxresp_cached < NFSD_MIN_HDR_SEQ_SZ)
- size = 0;
- else
- size = ca->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
- return size + sizeof(struct nfsd4_slot);
+ /*
+ * The RPC and NFS session headers are never saved in
+ * the slot reply cache buffer.
+ */
+ size = fattrs->maxresp_cached < NFSD_MIN_HDR_SEQ_SZ ?
+ 0 : fattrs->maxresp_cached - NFSD_MIN_HDR_SEQ_SZ;
+
+ slot = kzalloc(struct_size(slot, sl_data, size), gfp);
+ if (!slot)
+ return NULL;
+ slot->sl_index = index;
+ return slot;
}
static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
struct nfsd4_channel_attrs *battrs)
{
int numslots = fattrs->maxreqs;
- int slotsize = slot_bytes(fattrs);
struct nfsd4_session *new;
struct nfsd4_slot *slot;
int i;
@@ -2015,14 +2019,14 @@ static struct nfsd4_session *alloc_session(struct nfsd4_channel_attrs *fattrs,
if (!new)
return NULL;
xa_init(&new->se_slots);
- /* allocate each struct nfsd4_slot and data cache in one piece */
- slot = kzalloc(slotsize, GFP_KERNEL);
+
+ slot = nfsd4_alloc_slot(fattrs, 0, GFP_KERNEL);
if (!slot || xa_is_err(xa_store(&new->se_slots, 0, slot, GFP_KERNEL)))
goto out_free;
for (i = 1; i < numslots; i++) {
const gfp_t gfp = GFP_KERNEL | __GFP_NORETRY | __GFP_NOWARN;
- slot = kzalloc(slotsize, gfp);
+ slot = nfsd4_alloc_slot(fattrs, i, gfp);
if (!slot)
break;
if (xa_is_err(xa_store(&new->se_slots, i, slot, gfp))) {
@@ -4402,7 +4406,7 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
nfserr_rep_too_big;
if (xdr_restrict_buflen(xdr, buflen - rqstp->rq_auth_slack))
goto out_put_session;
- svc_reserve(rqstp, buflen);
+ svc_reserve_auth(rqstp, buflen);
status = nfs_ok;
/* Success! accept new slot seqid */
@@ -4438,8 +4442,8 @@ nfsd4_sequence(struct svc_rqst *rqstp, struct nfsd4_compound_state *cstate,
* spinlock, and only succeeds if there is
* plenty of memory.
*/
- slot = kzalloc(slot_bytes(&session->se_fchannel),
- GFP_NOWAIT);
+ slot = nfsd4_alloc_slot(&session->se_fchannel, s,
+ GFP_NOWAIT);
prev_slot = xa_load(&session->se_slots, s);
if (xa_is_value(prev_slot) && slot) {
slot->sl_seqid = xa_to_value(prev_slot);
diff --git a/fs/nfsd/nfs4xdr.c b/fs/nfsd/nfs4xdr.c
index fe876395985a..3afcdbed6e14 100644
--- a/fs/nfsd/nfs4xdr.c
+++ b/fs/nfsd/nfs4xdr.c
@@ -2564,7 +2564,7 @@ nfsd4_decode_compound(struct nfsd4_compoundargs *argp)
/* Sessions make the DRC unnecessary: */
if (argp->minorversion)
cachethis = false;
- svc_reserve(argp->rqstp, max_reply + readbytes);
+ svc_reserve_auth(argp->rqstp, max_reply + readbytes);
argp->rqstp->rq_cachetype = cachethis ? RC_REPLBUFF : RC_NOCACHE;
argp->splice_ok = nfsd_read_splice_ok(argp->rqstp);
@@ -3391,6 +3391,23 @@ static __be32 nfsd4_encode_fattr4_suppattr_exclcreat(struct xdr_stream *xdr,
return nfsd4_encode_bitmap4(xdr, supp[0], supp[1], supp[2]);
}
+/*
+ * Copied from generic_remap_checks/generic_remap_file_range_prep.
+ *
+ * These generic functions use the file system's s_blocksize, but
+ * individual file systems aren't required to use
+ * generic_remap_file_range_prep. Until there is a mechanism for
+ * determining a particular file system's (or file's) clone block
+ * size, this is the best NFSD can do.
+ */
+static __be32 nfsd4_encode_fattr4_clone_blksize(struct xdr_stream *xdr,
+ const struct nfsd4_fattr_args *args)
+{
+ struct inode *inode = d_inode(args->dentry);
+
+ return nfsd4_encode_uint32_t(xdr, inode->i_sb->s_blocksize);
+}
+
#ifdef CONFIG_NFSD_V4_SECURITY_LABEL
static __be32 nfsd4_encode_fattr4_sec_label(struct xdr_stream *xdr,
const struct nfsd4_fattr_args *args)
@@ -3545,7 +3562,7 @@ static const nfsd4_enc_attr nfsd4_enc_fattr4_encode_ops[] = {
[FATTR4_MODE_SET_MASKED] = nfsd4_encode_fattr4__noop,
[FATTR4_SUPPATTR_EXCLCREAT] = nfsd4_encode_fattr4_suppattr_exclcreat,
[FATTR4_FS_CHARSET_CAP] = nfsd4_encode_fattr4__noop,
- [FATTR4_CLONE_BLKSIZE] = nfsd4_encode_fattr4__noop,
+ [FATTR4_CLONE_BLKSIZE] = nfsd4_encode_fattr4_clone_blksize,
[FATTR4_SPACE_FREED] = nfsd4_encode_fattr4__noop,
[FATTR4_CHANGE_ATTR_TYPE] = nfsd4_encode_fattr4__noop,
diff --git a/fs/nfsd/nfsctl.c b/fs/nfsd/nfsctl.c
index ac265d6fde35..3f3e9f6c4250 100644
--- a/fs/nfsd/nfsctl.c
+++ b/fs/nfsd/nfsctl.c
@@ -2281,6 +2281,8 @@ static int __init init_nfsd(void)
{
int retval;
+ nfsd_debugfs_init();
+
retval = nfsd4_init_slabs();
if (retval)
return retval;
@@ -2291,12 +2293,9 @@ static int __init init_nfsd(void)
if (retval)
goto out_free_pnfs;
nfsd_lockd_init(); /* lockd->nfsd callbacks */
- retval = create_proc_exports_entry();
- if (retval)
- goto out_free_lockd;
retval = register_pernet_subsys(&nfsd_net_ops);
if (retval < 0)
- goto out_free_exports;
+ goto out_free_lockd;
retval = register_cld_notifier();
if (retval)
goto out_free_subsys;
@@ -2305,22 +2304,26 @@ static int __init init_nfsd(void)
goto out_free_cld;
retval = register_filesystem(&nfsd_fs_type);
if (retval)
- goto out_free_all;
+ goto out_free_nfsd4;
retval = genl_register_family(&nfsd_nl_family);
if (retval)
+ goto out_free_filesystem;
+ retval = create_proc_exports_entry();
+ if (retval)
goto out_free_all;
nfsd_localio_ops_init();
return 0;
out_free_all:
+ genl_unregister_family(&nfsd_nl_family);
+out_free_filesystem:
+ unregister_filesystem(&nfsd_fs_type);
+out_free_nfsd4:
nfsd4_destroy_laundry_wq();
out_free_cld:
unregister_cld_notifier();
out_free_subsys:
unregister_pernet_subsys(&nfsd_net_ops);
-out_free_exports:
- remove_proc_entry("fs/nfs/exports", NULL);
- remove_proc_entry("fs/nfs", NULL);
out_free_lockd:
nfsd_lockd_shutdown();
nfsd_drc_slab_free();
@@ -2328,22 +2331,24 @@ out_free_pnfs:
nfsd4_exit_pnfs();
out_free_slabs:
nfsd4_free_slabs();
+ nfsd_debugfs_exit();
return retval;
}
static void __exit exit_nfsd(void)
{
+ remove_proc_entry("fs/nfs/exports", NULL);
+ remove_proc_entry("fs/nfs", NULL);
genl_unregister_family(&nfsd_nl_family);
unregister_filesystem(&nfsd_fs_type);
nfsd4_destroy_laundry_wq();
unregister_cld_notifier();
unregister_pernet_subsys(&nfsd_net_ops);
nfsd_drc_slab_free();
- remove_proc_entry("fs/nfs/exports", NULL);
- remove_proc_entry("fs/nfs", NULL);
nfsd_lockd_shutdown();
nfsd4_free_slabs();
nfsd4_exit_pnfs();
+ nfsd_debugfs_exit();
}
MODULE_AUTHOR("Olaf Kirch <okir@monad.swb.de>");
diff --git a/fs/nfsd/nfsd.h b/fs/nfsd/nfsd.h
index e2997f0ffbc5..1bfd0b4e9af7 100644
--- a/fs/nfsd/nfsd.h
+++ b/fs/nfsd/nfsd.h
@@ -44,24 +44,14 @@ bool nfsd_support_version(int vers);
#include "stats.h"
/*
- * Maximum blocksizes supported by daemon under various circumstances.
+ * Default and maximum payload size (NFS READ or WRITE), in bytes.
+ * The default is historical, and the maximum is an implementation
+ * limit.
*/
-#define NFSSVC_MAXBLKSIZE RPCSVC_MAXPAYLOAD
-/* NFSv2 is limited by the protocol specification, see RFC 1094 */
-#define NFSSVC_MAXBLKSIZE_V2 (8*1024)
-
-
-/*
- * Largest number of bytes we need to allocate for an NFS
- * call or reply. Used to control buffer sizes. We use
- * the length of v3 WRITE, READDIR and READDIR replies
- * which are an RPC header, up to 26 XDR units of reply
- * data, and some page data.
- *
- * Note that accuracy here doesn't matter too much as the
- * size is rounded up to a page size when allocating space.
- */
-#define NFSD_BUFSIZE ((RPC_MAX_HEADER_WITH_AUTH+26)*XDR_UNIT + NFSSVC_MAXBLKSIZE)
+enum {
+ NFSSVC_DEFBLKSIZE = 1 * 1024 * 1024,
+ NFSSVC_MAXBLKSIZE = RPCSVC_MAXPAYLOAD,
+};
struct readdir_cd {
__be32 err; /* 0, nfserr, or nfserr_eof */
@@ -156,6 +146,16 @@ void nfsd_reset_versions(struct nfsd_net *nn);
int nfsd_create_serv(struct net *net);
void nfsd_destroy_serv(struct net *net);
+#ifdef CONFIG_DEBUG_FS
+void nfsd_debugfs_init(void);
+void nfsd_debugfs_exit(void);
+#else
+static inline void nfsd_debugfs_init(void) {}
+static inline void nfsd_debugfs_exit(void) {}
+#endif
+
+extern bool nfsd_disable_splice_read __read_mostly;
+
extern int nfsd_max_blksize;
static inline int nfsd_v4client(struct svc_rqst *rq)
diff --git a/fs/nfsd/nfsproc.c b/fs/nfsd/nfsproc.c
index 6370ac0a85fd..c10fa8128a8a 100644
--- a/fs/nfsd/nfsproc.c
+++ b/fs/nfsd/nfsproc.c
@@ -10,6 +10,7 @@
#include "cache.h"
#include "xdr.h"
#include "vfs.h"
+#include "trace.h"
#define NFSDDBG_FACILITY NFSDDBG_PROC
@@ -54,7 +55,7 @@ nfsd_proc_getattr(struct svc_rqst *rqstp)
struct nfsd_fhandle *argp = rqstp->rq_argp;
struct nfsd_attrstat *resp = rqstp->rq_resp;
- dprintk("nfsd: GETATTR %s\n", SVCFH_fmt(&argp->fh));
+ trace_nfsd_vfs_getattr(rqstp, &argp->fh);
fh_copy(&resp->fh, &argp->fh);
resp->status = fh_verify(rqstp, &resp->fh, 0,
@@ -211,7 +212,7 @@ nfsd_proc_read(struct svc_rqst *rqstp)
SVCFH_fmt(&argp->fh),
argp->count, argp->offset);
- argp->count = min_t(u32, argp->count, NFSSVC_MAXBLKSIZE_V2);
+ argp->count = min_t(u32, argp->count, NFS_MAXDATA);
argp->count = min_t(u32, argp->count, rqstp->rq_res.buflen);
resp->pages = rqstp->rq_next_page;
@@ -250,17 +251,14 @@ nfsd_proc_write(struct svc_rqst *rqstp)
struct nfsd_writeargs *argp = rqstp->rq_argp;
struct nfsd_attrstat *resp = rqstp->rq_resp;
unsigned long cnt = argp->len;
- unsigned int nvecs;
dprintk("nfsd: WRITE %s %u bytes at %d\n",
SVCFH_fmt(&argp->fh),
argp->len, argp->offset);
- nvecs = svc_fill_write_vector(rqstp, &argp->payload);
-
- resp->status = nfsd_write(rqstp, fh_copy(&resp->fh, &argp->fh),
- argp->offset, rqstp->rq_vec, nvecs,
- &cnt, NFS_DATA_SYNC, NULL);
+ fh_copy(&resp->fh, &argp->fh);
+ resp->status = nfsd_write(rqstp, &resp->fh, argp->offset,
+ &argp->payload, &cnt, NFS_DATA_SYNC, NULL);
if (resp->status == nfs_ok)
resp->status = fh_getattr(&resp->fh, &resp->stat);
else if (resp->status == nfserr_jukebox)
@@ -292,9 +290,6 @@ nfsd_proc_create(struct svc_rqst *rqstp)
int hosterr;
dev_t rdev = 0, wanted = new_decode_dev(attr->ia_size);
- dprintk("nfsd: CREATE %s %.*s\n",
- SVCFH_fmt(dirfhp), argp->len, argp->name);
-
/* First verify the parent file handle */
resp->status = fh_verify(rqstp, dirfhp, S_IFDIR, NFSD_MAY_EXEC);
if (resp->status != nfs_ok)
@@ -446,9 +441,6 @@ nfsd_proc_remove(struct svc_rqst *rqstp)
struct nfsd_diropargs *argp = rqstp->rq_argp;
struct nfsd_stat *resp = rqstp->rq_resp;
- dprintk("nfsd: REMOVE %s %.*s\n", SVCFH_fmt(&argp->fh),
- argp->len, argp->name);
-
/* Unlink. -SIFDIR means file must not be a directory */
resp->status = nfsd_unlink(rqstp, &argp->fh, -S_IFDIR,
argp->name, argp->len);
@@ -463,11 +455,6 @@ nfsd_proc_rename(struct svc_rqst *rqstp)
struct nfsd_renameargs *argp = rqstp->rq_argp;
struct nfsd_stat *resp = rqstp->rq_resp;
- dprintk("nfsd: RENAME %s %.*s -> \n",
- SVCFH_fmt(&argp->ffh), argp->flen, argp->fname);
- dprintk("nfsd: -> %s %.*s\n",
- SVCFH_fmt(&argp->tfh), argp->tlen, argp->tname);
-
resp->status = nfsd_rename(rqstp, &argp->ffh, argp->fname, argp->flen,
&argp->tfh, argp->tname, argp->tlen);
fh_put(&argp->ffh);
@@ -482,13 +469,6 @@ nfsd_proc_link(struct svc_rqst *rqstp)
struct nfsd_linkargs *argp = rqstp->rq_argp;
struct nfsd_stat *resp = rqstp->rq_resp;
- dprintk("nfsd: LINK %s ->\n",
- SVCFH_fmt(&argp->ffh));
- dprintk("nfsd: %s %.*s\n",
- SVCFH_fmt(&argp->tfh),
- argp->tlen,
- argp->tname);
-
resp->status = nfsd_link(rqstp, &argp->tfh, argp->tname, argp->tlen,
&argp->ffh);
fh_put(&argp->ffh);
@@ -520,10 +500,6 @@ nfsd_proc_symlink(struct svc_rqst *rqstp)
goto out;
}
- dprintk("nfsd: SYMLINK %s %.*s -> %.*s\n",
- SVCFH_fmt(&argp->ffh), argp->flen, argp->fname,
- argp->tlen, argp->tname);
-
fh_init(&newfh, NFS_FHSIZE);
resp->status = nfsd_symlink(rqstp, &argp->ffh, argp->fname, argp->flen,
argp->tname, &attrs, &newfh);
@@ -549,8 +525,6 @@ nfsd_proc_mkdir(struct svc_rqst *rqstp)
.na_iattr = &argp->attrs,
};
- dprintk("nfsd: MKDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name);
-
if (resp->fh.fh_dentry) {
printk(KERN_WARNING
"nfsd_proc_mkdir: response already verified??\n");
@@ -579,8 +553,6 @@ nfsd_proc_rmdir(struct svc_rqst *rqstp)
struct nfsd_diropargs *argp = rqstp->rq_argp;
struct nfsd_stat *resp = rqstp->rq_resp;
- dprintk("nfsd: RMDIR %s %.*s\n", SVCFH_fmt(&argp->fh), argp->len, argp->name);
-
resp->status = nfsd_unlink(rqstp, &argp->fh, S_IFDIR,
argp->name, argp->len);
fh_put(&argp->fh);
@@ -616,9 +588,7 @@ nfsd_proc_readdir(struct svc_rqst *rqstp)
struct nfsd_readdirres *resp = rqstp->rq_resp;
loff_t offset;
- dprintk("nfsd: READDIR %s %d bytes at %d\n",
- SVCFH_fmt(&argp->fh),
- argp->count, argp->cookie);
+ trace_nfsd_vfs_readdir(rqstp, &argp->fh, argp->count, argp->cookie);
nfsd_init_dirlist_pages(rqstp, resp, argp->count);
@@ -643,8 +613,6 @@ nfsd_proc_statfs(struct svc_rqst *rqstp)
struct nfsd_fhandle *argp = rqstp->rq_argp;
struct nfsd_statfsres *resp = rqstp->rq_resp;
- dprintk("nfsd: STATFS %s\n", SVCFH_fmt(&argp->fh));
-
resp->status = nfsd_statfs(rqstp, &argp->fh, &resp->stats,
NFSD_MAY_BYPASS_GSS_ON_ROOT);
fh_put(&argp->fh);
@@ -740,7 +708,7 @@ static const struct svc_procedure nfsd_procedures2[18] = {
.pc_argzero = sizeof(struct nfsd_readargs),
.pc_ressize = sizeof(struct nfsd_readres),
.pc_cachetype = RC_NOCACHE,
- .pc_xdrressize = ST+AT+1+NFSSVC_MAXBLKSIZE_V2/4,
+ .pc_xdrressize = ST+AT+1+NFS_MAXDATA/4,
.pc_name = "READ",
},
[NFSPROC_WRITECACHE] = {
diff --git a/fs/nfsd/nfssvc.c b/fs/nfsd/nfssvc.c
index 9b3d6cff0e1e..82b0111ac469 100644
--- a/fs/nfsd/nfssvc.c
+++ b/fs/nfsd/nfssvc.c
@@ -396,13 +396,13 @@ static int nfsd_startup_net(struct net *net, const struct cred *cred)
if (ret)
goto out_filecache;
+#ifdef CONFIG_NFSD_V4_2_INTER_SSC
+ nfsd4_ssc_init_umount_work(nn);
+#endif
ret = nfs4_state_start_net(net);
if (ret)
goto out_reply_cache;
-#ifdef CONFIG_NFSD_V4_2_INTER_SSC
- nfsd4_ssc_init_umount_work(nn);
-#endif
nn->nfsd_net_up = true;
return 0;
@@ -582,7 +582,7 @@ static int nfsd_get_default_max_blksize(void)
*/
target >>= 12;
- ret = NFSSVC_MAXBLKSIZE;
+ ret = NFSSVC_DEFBLKSIZE;
while (ret > target && ret >= 8*1024*2)
ret /= 2;
return ret;
diff --git a/fs/nfsd/nfsxdr.c b/fs/nfsd/nfsxdr.c
index 5777f40c7353..fc262ceafca9 100644
--- a/fs/nfsd/nfsxdr.c
+++ b/fs/nfsd/nfsxdr.c
@@ -336,7 +336,7 @@ nfssvc_decode_writeargs(struct svc_rqst *rqstp, struct xdr_stream *xdr)
/* opaque data */
if (xdr_stream_decode_u32(xdr, &args->len) < 0)
return false;
- if (args->len > NFSSVC_MAXBLKSIZE_V2)
+ if (args->len > NFS_MAXDATA)
return false;
return xdr_stream_subsegment(xdr, &args->payload, args->len);
@@ -540,7 +540,7 @@ nfssvc_encode_statfsres(struct svc_rqst *rqstp, struct xdr_stream *xdr)
p = xdr_reserve_space(xdr, XDR_UNIT * 5);
if (!p)
return false;
- *p++ = cpu_to_be32(NFSSVC_MAXBLKSIZE_V2);
+ *p++ = cpu_to_be32(NFS_MAXDATA);
*p++ = cpu_to_be32(stat->f_bsize);
*p++ = cpu_to_be32(stat->f_blocks);
*p++ = cpu_to_be32(stat->f_bfree);
diff --git a/fs/nfsd/state.h b/fs/nfsd/state.h
index 290e29dd43eb..1995bca158b8 100644
--- a/fs/nfsd/state.h
+++ b/fs/nfsd/state.h
@@ -64,6 +64,21 @@ typedef struct {
refcount_t cs_count;
} copy_stateid_t;
+struct nfsd4_referring_call {
+ struct list_head __list;
+
+ u32 rc_sequenceid;
+ u32 rc_slotid;
+};
+
+struct nfsd4_referring_call_list {
+ struct list_head __list;
+
+ struct nfs4_sessionid rcl_sessionid;
+ int __nr_referring_calls;
+ struct list_head rcl_referring_calls;
+};
+
struct nfsd4_callback {
struct nfs4_client *cb_clp;
struct rpc_message cb_msg;
@@ -76,6 +91,9 @@ struct nfsd4_callback {
int cb_seq_status;
int cb_status;
int cb_held_slot;
+
+ int cb_nr_referring_call_list;
+ struct list_head cb_referring_call_list;
};
struct nfsd4_callback_ops {
@@ -260,6 +278,7 @@ struct nfsd4_slot {
u32 sl_seqid;
__be32 sl_status;
struct svc_cred sl_cred;
+ u32 sl_index;
u32 sl_datalen;
u16 sl_opcnt;
u16 sl_generation;
@@ -774,6 +793,10 @@ extern __be32 nfs4_check_open_reclaim(struct nfs4_client *);
extern void nfsd4_probe_callback(struct nfs4_client *clp);
extern void nfsd4_probe_callback_sync(struct nfs4_client *clp);
extern void nfsd4_change_callback(struct nfs4_client *clp, struct nfs4_cb_conn *);
+extern void nfsd41_cb_referring_call(struct nfsd4_callback *cb,
+ struct nfs4_sessionid *sessionid,
+ u32 slotid, u32 seqno);
+extern void nfsd41_cb_destroy_referring_call_list(struct nfsd4_callback *cb);
extern void nfsd4_init_cb(struct nfsd4_callback *cb, struct nfs4_client *clp,
const struct nfsd4_callback_ops *ops, enum nfsd4_cb_op op);
extern bool nfsd4_run_cb(struct nfsd4_callback *cb);
diff --git a/fs/nfsd/trace.h b/fs/nfsd/trace.h
index a7630e9f6577..3c5505ef5e3a 100644
--- a/fs/nfsd/trace.h
+++ b/fs/nfsd/trace.h
@@ -11,6 +11,7 @@
#include <linux/tracepoint.h>
#include <linux/sunrpc/clnt.h>
#include <linux/sunrpc/xprt.h>
+#include <trace/misc/fs.h>
#include <trace/misc/nfs.h>
#include <trace/misc/sunrpc.h>
@@ -18,22 +19,40 @@
#include "nfsfh.h"
#include "xdr4.h"
-#define NFSD_TRACE_PROC_RES_FIELDS \
+#define NFSD_TRACE_PROC_CALL_FIELDS(r) \
+ __field(unsigned int, netns_ino) \
+ __field(u32, xid) \
+ __sockaddr(server, (r)->rq_xprt->xpt_locallen) \
+ __sockaddr(client, (r)->rq_xprt->xpt_remotelen)
+
+#define NFSD_TRACE_PROC_CALL_ASSIGNMENTS(r) \
+ do { \
+ struct svc_xprt *xprt = (r)->rq_xprt; \
+ __entry->netns_ino = SVC_NET(r)->ns.inum; \
+ __entry->xid = be32_to_cpu((r)->rq_xid); \
+ __assign_sockaddr(server, &xprt->xpt_local, \
+ xprt->xpt_locallen); \
+ __assign_sockaddr(client, &xprt->xpt_remote, \
+ xprt->xpt_remotelen); \
+ } while (0)
+
+#define NFSD_TRACE_PROC_RES_FIELDS(r) \
__field(unsigned int, netns_ino) \
__field(u32, xid) \
__field(unsigned long, status) \
- __array(unsigned char, server, sizeof(struct sockaddr_in6)) \
- __array(unsigned char, client, sizeof(struct sockaddr_in6))
+ __sockaddr(server, (r)->rq_xprt->xpt_locallen) \
+ __sockaddr(client, (r)->rq_xprt->xpt_remotelen)
-#define NFSD_TRACE_PROC_RES_ASSIGNMENTS(error) \
+#define NFSD_TRACE_PROC_RES_ASSIGNMENTS(r, error) \
do { \
- __entry->netns_ino = SVC_NET(rqstp)->ns.inum; \
- __entry->xid = be32_to_cpu(rqstp->rq_xid); \
+ struct svc_xprt *xprt = (r)->rq_xprt; \
+ __entry->netns_ino = SVC_NET(r)->ns.inum; \
+ __entry->xid = be32_to_cpu((r)->rq_xid); \
__entry->status = be32_to_cpu(error); \
- memcpy(__entry->server, &rqstp->rq_xprt->xpt_local, \
- rqstp->rq_xprt->xpt_locallen); \
- memcpy(__entry->client, &rqstp->rq_xprt->xpt_remote, \
- rqstp->rq_xprt->xpt_remotelen); \
+ __assign_sockaddr(server, &xprt->xpt_local, \
+ xprt->xpt_locallen); \
+ __assign_sockaddr(client, &xprt->xpt_remote, \
+ xprt->xpt_remotelen); \
} while (0);
DECLARE_EVENT_CLASS(nfsd_xdr_err_class,
@@ -145,14 +164,14 @@ TRACE_EVENT(nfsd_compound_decode_err,
),
TP_ARGS(rqstp, args_opcnt, resp_opcnt, opnum, status),
TP_STRUCT__entry(
- NFSD_TRACE_PROC_RES_FIELDS
+ NFSD_TRACE_PROC_RES_FIELDS(rqstp)
__field(u32, args_opcnt)
__field(u32, resp_opcnt)
__field(u32, opnum)
),
TP_fast_assign(
- NFSD_TRACE_PROC_RES_ASSIGNMENTS(status)
+ NFSD_TRACE_PROC_RES_ASSIGNMENTS(rqstp, status)
__entry->args_opcnt = args_opcnt;
__entry->resp_opcnt = resp_opcnt;
@@ -171,12 +190,12 @@ DECLARE_EVENT_CLASS(nfsd_compound_err_class,
),
TP_ARGS(rqstp, opnum, status),
TP_STRUCT__entry(
- NFSD_TRACE_PROC_RES_FIELDS
+ NFSD_TRACE_PROC_RES_FIELDS(rqstp)
__field(u32, opnum)
),
TP_fast_assign(
- NFSD_TRACE_PROC_RES_ASSIGNMENTS(status)
+ NFSD_TRACE_PROC_RES_ASSIGNMENTS(rqstp, status)
__entry->opnum = opnum;
),
@@ -451,6 +470,8 @@ DEFINE_NFSD_IO_EVENT(write_start);
DEFINE_NFSD_IO_EVENT(write_opened);
DEFINE_NFSD_IO_EVENT(write_io_done);
DEFINE_NFSD_IO_EVENT(write_done);
+DEFINE_NFSD_IO_EVENT(commit_start);
+DEFINE_NFSD_IO_EVENT(commit_done);
DECLARE_EVENT_CLASS(nfsd_err_class,
TP_PROTO(struct svc_rqst *rqstp,
@@ -2335,6 +2356,259 @@ DEFINE_EVENT(nfsd_copy_async_done_class, \
DEFINE_COPY_ASYNC_DONE_EVENT(done);
DEFINE_COPY_ASYNC_DONE_EVENT(cancel);
+TRACE_EVENT(nfsd_vfs_setattr,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct svc_fh *fhp,
+ const struct iattr *iap,
+ const struct timespec64 *guardtime
+ ),
+ TP_ARGS(rqstp, fhp, iap, guardtime),
+ TP_STRUCT__entry(
+ NFSD_TRACE_PROC_CALL_FIELDS(rqstp)
+ __field(u32, fh_hash)
+ __field(s64, gtime_tv_sec)
+ __field(u32, gtime_tv_nsec)
+ __field(unsigned int, ia_valid)
+ __field(loff_t, ia_size)
+ __field(uid_t, ia_uid)
+ __field(gid_t, ia_gid)
+ __field(umode_t, ia_mode)
+ ),
+ TP_fast_assign(
+ NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ __entry->gtime_tv_sec = guardtime ? guardtime->tv_sec : 0;
+ __entry->gtime_tv_nsec = guardtime ? guardtime->tv_nsec : 0;
+ __entry->ia_valid = iap->ia_valid;
+ __entry->ia_size = iap->ia_size;
+ __entry->ia_uid = __kuid_val(iap->ia_uid);
+ __entry->ia_gid = __kgid_val(iap->ia_gid);
+ __entry->ia_mode = iap->ia_mode;
+ ),
+ TP_printk(
+ "xid=0x%08x fh_hash=0x%08x ia_valid=%s ia_size=%llu ia_mode=0%o ia_uid=%u ia_gid=%u guard_time=%lld.%u",
+ __entry->xid, __entry->fh_hash, show_ia_valid_flags(__entry->ia_valid),
+ __entry->ia_size, __entry->ia_mode, __entry->ia_uid, __entry->ia_gid,
+ __entry->gtime_tv_sec, __entry->gtime_tv_nsec
+ )
+)
+
+TRACE_EVENT(nfsd_vfs_lookup,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct svc_fh *fhp,
+ const char *name,
+ unsigned int len
+ ),
+ TP_ARGS(rqstp, fhp, name, len),
+ TP_STRUCT__entry(
+ NFSD_TRACE_PROC_CALL_FIELDS(rqstp)
+ __field(u32, fh_hash)
+ __string_len(name, name, len)
+ ),
+ TP_fast_assign(
+ NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ __assign_str(name);
+ ),
+ TP_printk("xid=0x%08x fh_hash=0x%08x name=%s",
+ __entry->xid, __entry->fh_hash, __get_str(name)
+ )
+);
+
+TRACE_EVENT(nfsd_vfs_create,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct svc_fh *fhp,
+ umode_t type,
+ const char *name,
+ unsigned int len
+ ),
+ TP_ARGS(rqstp, fhp, type, name, len),
+ TP_STRUCT__entry(
+ NFSD_TRACE_PROC_CALL_FIELDS(rqstp)
+ __field(u32, fh_hash)
+ __field(umode_t, type)
+ __string_len(name, name, len)
+ ),
+ TP_fast_assign(
+ NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ __entry->type = type;
+ __assign_str(name);
+ ),
+ TP_printk("xid=0x%08x fh_hash=0x%08x type=%s name=%s",
+ __entry->xid, __entry->fh_hash,
+ show_fs_file_type(__entry->type), __get_str(name)
+ )
+);
+
+TRACE_EVENT(nfsd_vfs_symlink,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct svc_fh *fhp,
+ const char *name,
+ unsigned int namelen,
+ const char *target
+ ),
+ TP_ARGS(rqstp, fhp, name, namelen, target),
+ TP_STRUCT__entry(
+ NFSD_TRACE_PROC_CALL_FIELDS(rqstp)
+ __field(u32, fh_hash)
+ __string_len(name, name, namelen)
+ __string(target, target)
+ ),
+ TP_fast_assign(
+ NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ __assign_str(name);
+ __assign_str(target);
+ ),
+ TP_printk("xid=0x%08x fh_hash=0x%08x name=%s target=%s",
+ __entry->xid, __entry->fh_hash,
+ __get_str(name), __get_str(target)
+ )
+);
+
+TRACE_EVENT(nfsd_vfs_link,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct svc_fh *sfhp,
+ const struct svc_fh *tfhp,
+ const char *name,
+ unsigned int namelen
+ ),
+ TP_ARGS(rqstp, sfhp, tfhp, name, namelen),
+ TP_STRUCT__entry(
+ NFSD_TRACE_PROC_CALL_FIELDS(rqstp)
+ __field(u32, sfh_hash)
+ __field(u32, tfh_hash)
+ __string_len(name, name, namelen)
+ ),
+ TP_fast_assign(
+ NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp);
+ __entry->sfh_hash = knfsd_fh_hash(&sfhp->fh_handle);
+ __entry->tfh_hash = knfsd_fh_hash(&tfhp->fh_handle);
+ __assign_str(name);
+ ),
+ TP_printk("xid=0x%08x src_fh=0x%08x tgt_fh=0x%08x name=%s",
+ __entry->xid, __entry->sfh_hash, __entry->tfh_hash,
+ __get_str(name)
+ )
+);
+
+TRACE_EVENT(nfsd_vfs_unlink,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct svc_fh *fhp,
+ const char *name,
+ unsigned int len
+ ),
+ TP_ARGS(rqstp, fhp, name, len),
+ TP_STRUCT__entry(
+ NFSD_TRACE_PROC_CALL_FIELDS(rqstp)
+ __field(u32, fh_hash)
+ __string_len(name, name, len)
+ ),
+ TP_fast_assign(
+ NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ __assign_str(name);
+ ),
+ TP_printk("xid=0x%08x fh_hash=0x%08x name=%s",
+ __entry->xid, __entry->fh_hash,
+ __get_str(name)
+ )
+);
+
+TRACE_EVENT(nfsd_vfs_rename,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct svc_fh *sfhp,
+ const struct svc_fh *tfhp,
+ const char *source,
+ unsigned int sourcelen,
+ const char *target,
+ unsigned int targetlen
+ ),
+ TP_ARGS(rqstp, sfhp, tfhp, source, sourcelen, target, targetlen),
+ TP_STRUCT__entry(
+ NFSD_TRACE_PROC_CALL_FIELDS(rqstp)
+ __field(u32, sfh_hash)
+ __field(u32, tfh_hash)
+ __string_len(source, source, sourcelen)
+ __string_len(target, target, targetlen)
+ ),
+ TP_fast_assign(
+ NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp);
+ __entry->sfh_hash = knfsd_fh_hash(&sfhp->fh_handle);
+ __entry->tfh_hash = knfsd_fh_hash(&tfhp->fh_handle);
+ __assign_str(source);
+ __assign_str(target);
+ ),
+ TP_printk("xid=0x%08x sfh_hash=0x%08x tfh_hash=0x%08x source=%s target=%s",
+ __entry->xid, __entry->sfh_hash, __entry->tfh_hash,
+ __get_str(source), __get_str(target)
+ )
+);
+
+TRACE_EVENT(nfsd_vfs_readdir,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct svc_fh *fhp,
+ u32 count,
+ u64 offset
+ ),
+ TP_ARGS(rqstp, fhp, count, offset),
+ TP_STRUCT__entry(
+ NFSD_TRACE_PROC_CALL_FIELDS(rqstp)
+ __field(u32, fh_hash)
+ __field(u32, count)
+ __field(u64, offset)
+ ),
+ TP_fast_assign(
+ NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ __entry->count = count;
+ __entry->offset = offset;
+ ),
+ TP_printk("xid=0x%08x fh_hash=0x%08x offset=%llu count=%u",
+ __entry->xid, __entry->fh_hash,
+ __entry->offset, __entry->count
+ )
+);
+
+DECLARE_EVENT_CLASS(nfsd_vfs_getattr_class,
+ TP_PROTO(
+ const struct svc_rqst *rqstp,
+ const struct svc_fh *fhp
+ ),
+ TP_ARGS(rqstp, fhp),
+ TP_STRUCT__entry(
+ NFSD_TRACE_PROC_CALL_FIELDS(rqstp)
+ __field(u32, fh_hash)
+ ),
+ TP_fast_assign(
+ NFSD_TRACE_PROC_CALL_ASSIGNMENTS(rqstp);
+ __entry->fh_hash = knfsd_fh_hash(&fhp->fh_handle);
+ ),
+ TP_printk("xid=0x%08x fh_hash=0x%08x",
+ __entry->xid, __entry->fh_hash
+ )
+);
+
+#define DEFINE_NFSD_VFS_GETATTR_EVENT(__name) \
+DEFINE_EVENT(nfsd_vfs_getattr_class, __name, \
+ TP_PROTO( \
+ const struct svc_rqst *rqstp, \
+ const struct svc_fh *fhp \
+ ), \
+ TP_ARGS(rqstp, fhp))
+
+DEFINE_NFSD_VFS_GETATTR_EVENT(nfsd_vfs_getattr);
+DEFINE_NFSD_VFS_GETATTR_EVENT(nfsd_vfs_statfs);
+
#endif /* _NFSD_TRACE_H */
#undef TRACE_INCLUDE_PATH
diff --git a/fs/nfsd/vfs.c b/fs/nfsd/vfs.c
index 160a839af405..cd689df2ca5d 100644
--- a/fs/nfsd/vfs.c
+++ b/fs/nfsd/vfs.c
@@ -31,6 +31,7 @@
#include <linux/exportfs.h>
#include <linux/writeback.h>
#include <linux/security.h>
+#include <linux/sunrpc/xdr.h>
#include "xdr3.h"
@@ -47,6 +48,8 @@
#define NFSDDBG_FACILITY NFSDDBG_FILEOP
+bool nfsd_disable_splice_read __read_mostly;
+
/**
* nfserrno - Map Linux errnos to NFS errnos
* @errno: POSIX(-ish) error code to be mapped
@@ -244,7 +247,7 @@ nfsd_lookup_dentry(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct dentry *dentry;
int host_err;
- dprintk("nfsd: nfsd_lookup(fh %s, %.*s)\n", SVCFH_fmt(fhp), len,name);
+ trace_nfsd_vfs_lookup(rqstp, fhp, name, len);
dparent = fhp->fh_dentry;
exp = exp_get(fhp->fh_export);
@@ -500,6 +503,8 @@ nfsd_setattr(struct svc_rqst *rqstp, struct svc_fh *fhp,
bool size_change = (iap->ia_valid & ATTR_SIZE);
int retries;
+ trace_nfsd_vfs_setattr(rqstp, fhp, iap, guardtime);
+
if (iap->ia_valid & ATTR_SIZE) {
accmode |= NFSD_MAY_WRITE|NFSD_MAY_OWNER_OVERRIDE;
ftype = S_IFREG;
@@ -1082,23 +1087,23 @@ __be32 nfsd_iter_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
unsigned long v, total;
struct iov_iter iter;
loff_t ppos = offset;
- struct page *page;
ssize_t host_err;
+ size_t len;
v = 0;
total = *count;
while (total) {
- page = *(rqstp->rq_next_page++);
- rqstp->rq_vec[v].iov_base = page_address(page) + base;
- rqstp->rq_vec[v].iov_len = min_t(size_t, total, PAGE_SIZE - base);
- total -= rqstp->rq_vec[v].iov_len;
+ len = min_t(size_t, total, PAGE_SIZE - base);
+ bvec_set_page(&rqstp->rq_bvec[v], *(rqstp->rq_next_page++),
+ len, base);
+ total -= len;
++v;
base = 0;
}
- WARN_ON_ONCE(v > ARRAY_SIZE(rqstp->rq_vec));
+ WARN_ON_ONCE(v > rqstp->rq_maxpages);
trace_nfsd_read_vector(rqstp, fhp, offset, *count);
- iov_iter_kvec(&iter, ITER_DEST, rqstp->rq_vec, v, *count);
+ iov_iter_bvec(&iter, ITER_DEST, rqstp->rq_bvec, v, *count);
host_err = vfs_iter_read(file, &iter, &ppos, 0);
return nfsd_finish_read(rqstp, fhp, file, offset, count, eof, host_err);
}
@@ -1140,11 +1145,27 @@ static int wait_for_concurrent_writes(struct file *file)
return err;
}
+/**
+ * nfsd_vfs_write - write data to an already-open file
+ * @rqstp: RPC execution context
+ * @fhp: File handle of file to write into
+ * @nf: An open file matching @fhp
+ * @offset: Byte offset of start
+ * @payload: xdr_buf containing the write payload
+ * @cnt: IN: number of bytes to write, OUT: number of bytes actually written
+ * @stable: An NFS stable_how value
+ * @verf: NFS WRITE verifier
+ *
+ * Upon return, caller must invoke fh_put on @fhp.
+ *
+ * Return values:
+ * An nfsstat value in network byte order.
+ */
__be32
-nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
- loff_t offset, struct kvec *vec, int vlen,
- unsigned long *cnt, int stable,
- __be32 *verf)
+nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ struct nfsd_file *nf, loff_t offset,
+ const struct xdr_buf *payload, unsigned long *cnt,
+ int stable, __be32 *verf)
{
struct nfsd_net *nn = net_generic(SVC_NET(rqstp), nfsd_net_id);
struct file *file = nf->nf_file;
@@ -1159,6 +1180,7 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
unsigned int pflags = current->flags;
rwf_t flags = 0;
bool restore_flags = false;
+ unsigned int nvecs;
trace_nfsd_write_opened(rqstp, fhp, offset, *cnt);
@@ -1186,7 +1208,8 @@ nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
if (stable && !fhp->fh_use_wgather)
flags |= RWF_SYNC;
- iov_iter_kvec(&iter, ITER_SOURCE, vec, vlen, *cnt);
+ nvecs = xdr_buf_to_bvec(rqstp->rq_bvec, rqstp->rq_maxpages, payload);
+ iov_iter_bvec(&iter, ITER_SOURCE, rqstp->rq_bvec, nvecs, *cnt);
since = READ_ONCE(file->f_wb_err);
if (verf)
nfsd_copy_write_verifier(verf, nn);
@@ -1237,6 +1260,8 @@ out_nfserr:
*/
bool nfsd_read_splice_ok(struct svc_rqst *rqstp)
{
+ if (nfsd_disable_splice_read)
+ return false;
switch (svc_auth_flavor(rqstp)) {
case RPC_AUTH_GSS_KRB5I:
case RPC_AUTH_GSS_KRB5P:
@@ -1284,14 +1309,24 @@ __be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
return err;
}
-/*
- * Write data to a file.
- * The stable flag requests synchronous writes.
- * N.B. After this call fhp needs an fh_put
+/**
+ * nfsd_write - open a file and write data to it
+ * @rqstp: RPC execution context
+ * @fhp: File handle of file to write into; nfsd_write() may modify it
+ * @offset: Byte offset of start
+ * @payload: xdr_buf containing the write payload
+ * @cnt: IN: number of bytes to write, OUT: number of bytes actually written
+ * @stable: An NFS stable_how value
+ * @verf: NFS WRITE verifier
+ *
+ * Upon return, caller must invoke fh_put on @fhp.
+ *
+ * Return values:
+ * An nfsstat value in network byte order.
*/
__be32
nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
- struct kvec *vec, int vlen, unsigned long *cnt, int stable,
+ const struct xdr_buf *payload, unsigned long *cnt, int stable,
__be32 *verf)
{
struct nfsd_file *nf;
@@ -1303,8 +1338,8 @@ nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp, loff_t offset,
if (err)
goto out;
- err = nfsd_vfs_write(rqstp, fhp, nf, offset, vec,
- vlen, cnt, stable, verf);
+ err = nfsd_vfs_write(rqstp, fhp, nf, offset, payload, cnt,
+ stable, verf);
nfsd_file_put(nf);
out:
trace_nfsd_write_done(rqstp, fhp, offset, *cnt);
@@ -1340,6 +1375,8 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
loff_t start, end;
struct nfsd_net *nn;
+ trace_nfsd_commit_start(rqstp, fhp, offset, count);
+
/*
* Convert the client-provided (offset, count) range to a
* (start, end) range. If the client-provided range falls
@@ -1378,6 +1415,7 @@ nfsd_commit(struct svc_rqst *rqstp, struct svc_fh *fhp, struct nfsd_file *nf,
} else
nfsd_copy_write_verifier(verf, nn);
+ trace_nfsd_commit_done(rqstp, fhp, offset, count);
return err;
}
@@ -1541,6 +1579,8 @@ nfsd_create(struct svc_rqst *rqstp, struct svc_fh *fhp,
__be32 err;
int host_err;
+ trace_nfsd_vfs_create(rqstp, fhp, type, fname, flen);
+
if (isdotent(fname, flen))
return nfserr_exist;
@@ -1641,6 +1681,8 @@ nfsd_symlink(struct svc_rqst *rqstp, struct svc_fh *fhp,
__be32 err, cerr;
int host_err;
+ trace_nfsd_vfs_symlink(rqstp, fhp, fname, flen, path);
+
err = nfserr_noent;
if (!flen || path[0] == '\0')
goto out;
@@ -1709,6 +1751,8 @@ nfsd_link(struct svc_rqst *rqstp, struct svc_fh *ffhp,
__be32 err;
int host_err;
+ trace_nfsd_vfs_link(rqstp, ffhp, tfhp, name, len);
+
err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_CREATE);
if (err)
goto out;
@@ -1826,6 +1870,8 @@ nfsd_rename(struct svc_rqst *rqstp, struct svc_fh *ffhp, char *fname, int flen,
int host_err;
bool close_cached = false;
+ trace_nfsd_vfs_rename(rqstp, ffhp, tfhp, fname, flen, tname, tlen);
+
err = fh_verify(rqstp, ffhp, S_IFDIR, NFSD_MAY_REMOVE);
if (err)
goto out;
@@ -1984,6 +2030,8 @@ nfsd_unlink(struct svc_rqst *rqstp, struct svc_fh *fhp, int type,
__be32 err;
int host_err;
+ trace_nfsd_vfs_unlink(rqstp, fhp, fname, flen);
+
err = nfserr_acces;
if (!flen || isdotent(fname, flen))
goto out;
@@ -2272,6 +2320,8 @@ nfsd_statfs(struct svc_rqst *rqstp, struct svc_fh *fhp, struct kstatfs *stat, in
{
__be32 err;
+ trace_nfsd_vfs_statfs(rqstp, fhp);
+
err = fh_verify(rqstp, fhp, 0, NFSD_MAY_NOP | access);
if (!err) {
struct path path = {
diff --git a/fs/nfsd/vfs.h b/fs/nfsd/vfs.h
index f9b09b842856..eff04959606f 100644
--- a/fs/nfsd/vfs.h
+++ b/fs/nfsd/vfs.h
@@ -128,13 +128,13 @@ bool nfsd_read_splice_ok(struct svc_rqst *rqstp);
__be32 nfsd_read(struct svc_rqst *rqstp, struct svc_fh *fhp,
loff_t offset, unsigned long *count,
u32 *eof);
-__be32 nfsd_write(struct svc_rqst *, struct svc_fh *, loff_t,
- struct kvec *, int, unsigned long *,
- int stable, __be32 *verf);
+__be32 nfsd_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
+ loff_t offset, const struct xdr_buf *payload,
+ unsigned long *cnt, int stable, __be32 *verf);
__be32 nfsd_vfs_write(struct svc_rqst *rqstp, struct svc_fh *fhp,
struct nfsd_file *nf, loff_t offset,
- struct kvec *vec, int vlen, unsigned long *cnt,
- int stable, __be32 *verf);
+ const struct xdr_buf *payload,
+ unsigned long *cnt, int stable, __be32 *verf);
__be32 nfsd_readlink(struct svc_rqst *, struct svc_fh *,
char *, int *);
__be32 nfsd_symlink(struct svc_rqst *, struct svc_fh *,
diff --git a/fs/nfsd/xdr4.h b/fs/nfsd/xdr4.h
index c26ba86dbdfd..aa2a356da784 100644
--- a/fs/nfsd/xdr4.h
+++ b/fs/nfsd/xdr4.h
@@ -676,6 +676,10 @@ struct nfsd4_cb_offload {
__be32 co_nfserr;
unsigned int co_retries;
struct knfsd_fh co_fh;
+
+ struct nfs4_sessionid co_referring_sessionid;
+ u32 co_referring_slotid;
+ u32 co_referring_seqno;
};
struct nfsd4_copy {
diff --git a/fs/nfsd/xdr4cb.h b/fs/nfsd/xdr4cb.h
index f1a315cd31b7..f4e29c0c701c 100644
--- a/fs/nfsd/xdr4cb.h
+++ b/fs/nfsd/xdr4cb.h
@@ -6,8 +6,11 @@
#define cb_compound_enc_hdr_sz 4
#define cb_compound_dec_hdr_sz (3 + (NFS4_MAXTAGLEN >> 2))
#define sessionid_sz (NFS4_MAX_SESSIONID_LEN >> 2)
+#define enc_referring_call4_sz (1 + 1)
+#define enc_referring_call_list4_sz (sessionid_sz + 1 + \
+ enc_referring_call4_sz)
#define cb_sequence_enc_sz (sessionid_sz + 4 + \
- 1 /* no referring calls list yet */)
+ enc_referring_call_list4_sz)
#define cb_sequence_dec_sz (op_dec_sz + sessionid_sz + 4)
#define op_enc_sz 1
diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 6d386080faf2..3083643b864b 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -415,7 +415,7 @@ static int fanotify_encode_fh(struct fanotify_fh *fh, struct inode *inode,
{
int dwords, type = 0;
char *ext_buf = NULL;
- void *buf = fh->buf;
+ void *buf = fh + 1;
int err;
fh->type = FILEID_ROOT;
@@ -1009,6 +1009,7 @@ finish:
static void fanotify_free_group_priv(struct fsnotify_group *group)
{
+ put_user_ns(group->user_ns);
kfree(group->fanotify_data.merge_hash);
if (group->fanotify_data.ucounts)
dec_ucount(group->fanotify_data.ucounts,
diff --git a/fs/notify/fanotify/fanotify.h b/fs/notify/fanotify/fanotify.h
index b44e70e44be6..b78308975082 100644
--- a/fs/notify/fanotify/fanotify.h
+++ b/fs/notify/fanotify/fanotify.h
@@ -25,7 +25,7 @@ enum {
* stored in either the first or last 2 dwords.
*/
#define FANOTIFY_INLINE_FH_LEN (3 << 2)
-#define FANOTIFY_FH_HDR_LEN offsetof(struct fanotify_fh, buf)
+#define FANOTIFY_FH_HDR_LEN sizeof(struct fanotify_fh)
/* Fixed size struct for file handle */
struct fanotify_fh {
@@ -34,7 +34,6 @@ struct fanotify_fh {
#define FANOTIFY_FH_FLAG_EXT_BUF 1
u8 flags;
u8 pad;
- unsigned char buf[];
} __aligned(4);
/* Variable size struct for dir file handle + child file handle + name */
@@ -92,7 +91,7 @@ static inline char **fanotify_fh_ext_buf_ptr(struct fanotify_fh *fh)
BUILD_BUG_ON(FANOTIFY_FH_HDR_LEN % 4);
BUILD_BUG_ON(__alignof__(char *) - 4 + sizeof(char *) >
FANOTIFY_INLINE_FH_LEN);
- return (char **)ALIGN((unsigned long)(fh->buf), __alignof__(char *));
+ return (char **)ALIGN((unsigned long)(fh + 1), __alignof__(char *));
}
static inline void *fanotify_fh_ext_buf(struct fanotify_fh *fh)
@@ -102,7 +101,7 @@ static inline void *fanotify_fh_ext_buf(struct fanotify_fh *fh)
static inline void *fanotify_fh_buf(struct fanotify_fh *fh)
{
- return fanotify_fh_has_ext_buf(fh) ? fanotify_fh_ext_buf(fh) : fh->buf;
+ return fanotify_fh_has_ext_buf(fh) ? fanotify_fh_ext_buf(fh) : fh + 1;
}
static inline int fanotify_info_dir_fh_len(struct fanotify_info *info)
@@ -278,7 +277,7 @@ static inline void fanotify_init_event(struct fanotify_event *event,
#define FANOTIFY_INLINE_FH(name, size) \
struct { \
struct fanotify_fh name; \
- /* Space for object_fh.buf[] - access with fanotify_fh_buf() */ \
+ /* Space for filehandle - access with fanotify_fh_buf() */ \
unsigned char _inline_fh_buf[size]; \
}
diff --git a/fs/notify/fanotify/fanotify_user.c b/fs/notify/fanotify/fanotify_user.c
index 87f861e9004f..b192ee068a7a 100644
--- a/fs/notify/fanotify/fanotify_user.c
+++ b/fs/notify/fanotify/fanotify_user.c
@@ -1334,6 +1334,7 @@ static struct fsnotify_mark *fanotify_add_new_mark(struct fsnotify_group *group,
* A group with FAN_UNLIMITED_MARKS does not contribute to mark count
* in the limited groups account.
*/
+ BUILD_BUG_ON(!(FANOTIFY_ADMIN_INIT_FLAGS & FAN_UNLIMITED_MARKS));
if (!FAN_GROUP_FLAG(group, FAN_UNLIMITED_MARKS) &&
!inc_ucount(ucounts->ns, ucounts->uid, UCOUNT_FANOTIFY_MARKS))
return ERR_PTR(-ENOSPC);
@@ -1498,6 +1499,7 @@ static struct hlist_head *fanotify_alloc_merge_hash(void)
/* fanotify syscalls */
SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
{
+ struct user_namespace *user_ns = current_user_ns();
struct fsnotify_group *group;
int f_flags, fd;
unsigned int fid_mode = flags & FANOTIFY_FID_BITS;
@@ -1512,10 +1514,11 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
/*
* An unprivileged user can setup an fanotify group with
* limited functionality - an unprivileged group is limited to
- * notification events with file handles and it cannot use
- * unlimited queue/marks.
+ * notification events with file handles or mount ids and it
+ * cannot use unlimited queue/marks.
*/
- if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) || !fid_mode)
+ if ((flags & FANOTIFY_ADMIN_INIT_FLAGS) ||
+ !(flags & (FANOTIFY_FID_BITS | FAN_REPORT_MNT)))
return -EPERM;
/*
@@ -1594,8 +1597,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
}
/* Enforce groups limits per user in all containing user ns */
- group->fanotify_data.ucounts = inc_ucount(current_user_ns(),
- current_euid(),
+ group->fanotify_data.ucounts = inc_ucount(user_ns, current_euid(),
UCOUNT_FANOTIFY_GROUPS);
if (!group->fanotify_data.ucounts) {
fd = -EMFILE;
@@ -1604,6 +1606,7 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
group->fanotify_data.flags = flags | internal_flags;
group->memcg = get_mem_cgroup_from_mm(current->mm);
+ group->user_ns = get_user_ns(user_ns);
group->fanotify_data.merge_hash = fanotify_alloc_merge_hash();
if (!group->fanotify_data.merge_hash) {
@@ -1637,21 +1640,13 @@ SYSCALL_DEFINE2(fanotify_init, unsigned int, flags, unsigned int, event_f_flags)
goto out_destroy_group;
}
+ BUILD_BUG_ON(!(FANOTIFY_ADMIN_INIT_FLAGS & FAN_UNLIMITED_QUEUE));
if (flags & FAN_UNLIMITED_QUEUE) {
- fd = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out_destroy_group;
group->max_events = UINT_MAX;
} else {
group->max_events = fanotify_max_queued_events;
}
- if (flags & FAN_UNLIMITED_MARKS) {
- fd = -EPERM;
- if (!capable(CAP_SYS_ADMIN))
- goto out_destroy_group;
- }
-
if (flags & FAN_ENABLE_AUDIT) {
fd = -EPERM;
if (!capable(CAP_AUDIT_WRITE))
@@ -1811,6 +1806,8 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
struct fsnotify_group *group;
struct path path;
struct fan_fsid __fsid, *fsid = NULL;
+ struct user_namespace *user_ns = NULL;
+ struct mnt_namespace *mntns;
u32 valid_mask = FANOTIFY_EVENTS | FANOTIFY_EVENT_FLAGS;
unsigned int mark_type = flags & FANOTIFY_MARK_TYPE_BITS;
unsigned int mark_cmd = flags & FANOTIFY_MARK_CMD_BITS;
@@ -1904,12 +1901,10 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
}
/*
- * An unprivileged user is not allowed to setup mount nor filesystem
- * marks. This also includes setting up such marks by a group that
- * was initialized by an unprivileged user.
+ * A user is allowed to setup sb/mount/mntns marks only if it is
+ * capable in the user ns where the group was created.
*/
- if ((!capable(CAP_SYS_ADMIN) ||
- FAN_GROUP_FLAG(group, FANOTIFY_UNPRIV)) &&
+ if (!ns_capable(group->user_ns, CAP_SYS_ADMIN) &&
mark_type != FAN_MARK_INODE)
return -EPERM;
@@ -1988,18 +1983,31 @@ static int do_fanotify_mark(int fanotify_fd, unsigned int flags, __u64 mask,
fsid = &__fsid;
}
- /* inode held in place by reference to path; group by fget on fd */
+ /*
+ * In addition to being capable in the user ns where group was created,
+ * the user also needs to be capable in the user ns associated with
+ * the filesystem or in the user ns associated with the mntns
+ * (when marking mntns).
+ */
if (obj_type == FSNOTIFY_OBJ_TYPE_INODE) {
inode = path.dentry->d_inode;
obj = inode;
} else if (obj_type == FSNOTIFY_OBJ_TYPE_VFSMOUNT) {
+ user_ns = path.mnt->mnt_sb->s_user_ns;
obj = path.mnt;
} else if (obj_type == FSNOTIFY_OBJ_TYPE_SB) {
+ user_ns = path.mnt->mnt_sb->s_user_ns;
obj = path.mnt->mnt_sb;
} else if (obj_type == FSNOTIFY_OBJ_TYPE_MNTNS) {
- obj = mnt_ns_from_dentry(path.dentry);
+ mntns = mnt_ns_from_dentry(path.dentry);
+ user_ns = mntns->user_ns;
+ obj = mntns;
}
+ ret = -EPERM;
+ if (user_ns && !ns_capable(user_ns, CAP_SYS_ADMIN))
+ goto path_put_and_out;
+
ret = -EINVAL;
if (!obj)
goto path_put_and_out;
diff --git a/fs/ntfs3/attrib.c b/fs/ntfs3/attrib.c
index e946f75eb540..eced9013a881 100644
--- a/fs/ntfs3/attrib.c
+++ b/fs/ntfs3/attrib.c
@@ -2605,75 +2605,3 @@ int attr_force_nonresident(struct ntfs_inode *ni)
return err;
}
-
-/*
- * Change the compression of data attribute
- */
-int attr_set_compress(struct ntfs_inode *ni, bool compr)
-{
- struct ATTRIB *attr;
- struct mft_inode *mi;
-
- attr = ni_find_attr(ni, NULL, NULL, ATTR_DATA, NULL, 0, NULL, &mi);
- if (!attr)
- return -ENOENT;
-
- if (is_attr_compressed(attr) == !!compr) {
- /* Already required compressed state. */
- return 0;
- }
-
- if (attr->non_res) {
- u16 run_off;
- u32 run_size;
- char *run;
-
- if (attr->nres.data_size) {
- /*
- * There are rare cases when it possible to change
- * compress state without big changes.
- * TODO: Process these cases.
- */
- return -EOPNOTSUPP;
- }
-
- run_off = le16_to_cpu(attr->nres.run_off);
- run_size = le32_to_cpu(attr->size) - run_off;
- run = Add2Ptr(attr, run_off);
-
- if (!compr) {
- /* remove field 'attr->nres.total_size'. */
- memmove(run - 8, run, run_size);
- run_off -= 8;
- }
-
- if (!mi_resize_attr(mi, attr, compr ? +8 : -8)) {
- /*
- * Ignore rare case when there are no 8 bytes in record with attr.
- * TODO: split attribute.
- */
- return -EOPNOTSUPP;
- }
-
- if (compr) {
- /* Make a gap for 'attr->nres.total_size'. */
- memmove(run + 8, run, run_size);
- run_off += 8;
- attr->nres.total_size = attr->nres.alloc_size;
- }
- attr->nres.run_off = cpu_to_le16(run_off);
- }
-
- /* Update attribute flags. */
- if (compr) {
- attr->flags &= ~ATTR_FLAG_SPARSED;
- attr->flags |= ATTR_FLAG_COMPRESSED;
- attr->nres.c_unit = NTFS_LZNT_CUNIT;
- } else {
- attr->flags &= ~ATTR_FLAG_COMPRESSED;
- attr->nres.c_unit = 0;
- }
- mi->dirty = true;
-
- return 0;
-}
diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c
index 9b6a3f8d2e7c..34ed242e1063 100644
--- a/fs/ntfs3/file.c
+++ b/fs/ntfs3/file.c
@@ -50,90 +50,6 @@ static int ntfs_ioctl_fitrim(struct ntfs_sb_info *sbi, unsigned long arg)
}
/*
- * ntfs_fileattr_get - inode_operations::fileattr_get
- */
-int ntfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
-{
- struct inode *inode = d_inode(dentry);
- struct ntfs_inode *ni = ntfs_i(inode);
- u32 flags = 0;
-
- if (inode->i_flags & S_IMMUTABLE)
- flags |= FS_IMMUTABLE_FL;
-
- if (inode->i_flags & S_APPEND)
- flags |= FS_APPEND_FL;
-
- if (is_compressed(ni))
- flags |= FS_COMPR_FL;
-
- if (is_encrypted(ni))
- flags |= FS_ENCRYPT_FL;
-
- fileattr_fill_flags(fa, flags);
-
- return 0;
-}
-
-/*
- * ntfs_fileattr_set - inode_operations::fileattr_set
- */
-int ntfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry,
- struct fileattr *fa)
-{
- struct inode *inode = d_inode(dentry);
- struct ntfs_inode *ni = ntfs_i(inode);
- u32 flags = fa->flags;
- unsigned int new_fl = 0;
-
- if (fileattr_has_fsx(fa))
- return -EOPNOTSUPP;
-
- if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | FS_COMPR_FL))
- return -EOPNOTSUPP;
-
- if (flags & FS_IMMUTABLE_FL)
- new_fl |= S_IMMUTABLE;
-
- if (flags & FS_APPEND_FL)
- new_fl |= S_APPEND;
-
- /* Allowed to change compression for empty files and for directories only. */
- if (!is_dedup(ni) && !is_encrypted(ni) &&
- (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode))) {
- int err = 0;
- struct address_space *mapping = inode->i_mapping;
-
- /* write out all data and wait. */
- filemap_invalidate_lock(mapping);
- err = filemap_write_and_wait(mapping);
-
- if (err >= 0) {
- /* Change compress state. */
- bool compr = flags & FS_COMPR_FL;
- err = ni_set_compress(inode, compr);
-
- /* For files change a_ops too. */
- if (!err)
- mapping->a_ops = compr ? &ntfs_aops_cmpr :
- &ntfs_aops;
- }
-
- filemap_invalidate_unlock(mapping);
-
- if (err)
- return err;
- }
-
- inode_set_flags(inode, new_fl, S_IMMUTABLE | S_APPEND);
-
- inode_set_ctime_current(inode);
- mark_inode_dirty(inode);
-
- return 0;
-}
-
-/*
* ntfs_ioctl - file_operations::unlocked_ioctl
*/
long ntfs_ioctl(struct file *filp, u32 cmd, unsigned long arg)
@@ -430,7 +346,6 @@ static int ntfs_extend(struct inode *inode, loff_t pos, size_t count,
}
if (extend_init && !is_compressed(ni)) {
- WARN_ON(ni->i_valid >= pos);
err = ntfs_extend_initialized_size(file, ni, ni->i_valid, pos);
if (err)
goto out;
@@ -1409,8 +1324,6 @@ const struct inode_operations ntfs_file_inode_operations = {
.get_acl = ntfs_get_acl,
.set_acl = ntfs_set_acl,
.fiemap = ntfs_fiemap,
- .fileattr_get = ntfs_fileattr_get,
- .fileattr_set = ntfs_fileattr_set,
};
const struct file_operations ntfs_file_operations = {
diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c
index b7a83200f2cc..756e1306fe6c 100644
--- a/fs/ntfs3/frecord.c
+++ b/fs/ntfs3/frecord.c
@@ -3327,77 +3327,3 @@ out:
return 0;
}
-
-/*
- * ni_set_compress
- *
- * Helper for 'ntfs_fileattr_set'.
- * Changes compression for empty files and directories only.
- */
-int ni_set_compress(struct inode *inode, bool compr)
-{
- int err;
- struct ntfs_inode *ni = ntfs_i(inode);
- struct ATTR_STD_INFO *std;
- const char *bad_inode;
-
- if (is_compressed(ni) == !!compr)
- return 0;
-
- if (is_sparsed(ni)) {
- /* sparse and compress not compatible. */
- return -EOPNOTSUPP;
- }
-
- if (!S_ISREG(inode->i_mode) && !S_ISDIR(inode->i_mode)) {
- /*Skip other inodes. (symlink,fifo,...) */
- return -EOPNOTSUPP;
- }
-
- bad_inode = NULL;
-
- ni_lock(ni);
-
- std = ni_std(ni);
- if (!std) {
- bad_inode = "no std";
- goto out;
- }
-
- if (S_ISREG(inode->i_mode)) {
- err = attr_set_compress(ni, compr);
- if (err) {
- if (err == -ENOENT) {
- /* Fix on the fly? */
- /* Each file must contain data attribute. */
- bad_inode = "no data attribute";
- }
- goto out;
- }
- }
-
- ni->std_fa = std->fa;
- if (compr) {
- std->fa &= ~FILE_ATTRIBUTE_SPARSE_FILE;
- std->fa |= FILE_ATTRIBUTE_COMPRESSED;
- } else {
- std->fa &= ~FILE_ATTRIBUTE_COMPRESSED;
- }
-
- if (ni->std_fa != std->fa) {
- ni->std_fa = std->fa;
- ni->mi.dirty = true;
- }
- /* update duplicate information and directory entries in ni_write_inode.*/
- ni->ni_flags |= NI_FLAG_UPDATE_PARENT;
- err = 0;
-
-out:
- ni_unlock(ni);
- if (bad_inode) {
- ntfs_bad_inode(inode, bad_inode);
- err = -EINVAL;
- }
-
- return err;
-}
diff --git a/fs/ntfs3/fslog.c b/fs/ntfs3/fslog.c
index d0d530f4e2b9..38934e6978ec 100644
--- a/fs/ntfs3/fslog.c
+++ b/fs/ntfs3/fslog.c
@@ -3091,16 +3091,16 @@ static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe,
inode = ilookup(sbi->sb, rno);
if (inode) {
mi = &ntfs_i(inode)->mi;
- } else if (op == InitializeFileRecordSegment) {
- mi = kzalloc(sizeof(struct mft_inode), GFP_NOFS);
- if (!mi)
- return -ENOMEM;
- err = mi_format_new(mi, sbi, rno, 0, false);
- if (err)
- goto out;
} else {
/* Read from disk. */
err = mi_get(sbi, rno, &mi);
+ if (err && op == InitializeFileRecordSegment) {
+ mi = kzalloc(sizeof(struct mft_inode),
+ GFP_NOFS);
+ if (!mi)
+ return -ENOMEM;
+ err = mi_format_new(mi, sbi, rno, 0, false);
+ }
if (err)
return err;
}
@@ -3109,15 +3109,13 @@ static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe,
if (op == DeallocateFileRecordSegment)
goto skip_load_parent;
- if (InitializeFileRecordSegment != op) {
- if (rec->rhdr.sign == NTFS_BAAD_SIGNATURE)
- goto dirty_vol;
- if (!check_lsn(&rec->rhdr, rlsn))
- goto out;
- if (!check_file_record(rec, NULL, sbi))
- goto dirty_vol;
- attr = Add2Ptr(rec, roff);
- }
+ if (rec->rhdr.sign == NTFS_BAAD_SIGNATURE)
+ goto dirty_vol;
+ if (!check_lsn(&rec->rhdr, rlsn))
+ goto out;
+ if (!check_file_record(rec, NULL, sbi))
+ goto dirty_vol;
+ attr = Add2Ptr(rec, roff);
if (is_rec_base(rec) || InitializeFileRecordSegment == op) {
rno_base = rno;
@@ -3143,7 +3141,7 @@ static int do_action(struct ntfs_log *log, struct OPEN_ATTR_ENRTY *oe,
if (inode)
iput(inode);
- else if (mi)
+ else
mi_put(mi);
inode = inode_parent;
diff --git a/fs/ntfs3/index.c b/fs/ntfs3/index.c
index 78d20e4baa2c..1bf2a6593dec 100644
--- a/fs/ntfs3/index.c
+++ b/fs/ntfs3/index.c
@@ -2182,6 +2182,10 @@ static int indx_get_entry_to_replace(struct ntfs_index *indx,
e = hdr_first_de(&n->index->ihdr);
fnd_push(fnd, n, e);
+ if (!e) {
+ err = -EINVAL;
+ goto out;
+ }
if (!de_is_last(e)) {
/*
@@ -2203,6 +2207,10 @@ static int indx_get_entry_to_replace(struct ntfs_index *indx,
n = fnd->nodes[level];
te = hdr_first_de(&n->index->ihdr);
+ if (!te) {
+ err = -EINVAL;
+ goto out;
+ }
/* Copy the candidate entry into the replacement entry buffer. */
re = kmalloc(le16_to_cpu(te->size) + sizeof(u64), GFP_NOFS);
if (!re) {
diff --git a/fs/ntfs3/inode.c b/fs/ntfs3/inode.c
index 3e2957a1e360..0f0d27d4644a 100644
--- a/fs/ntfs3/inode.c
+++ b/fs/ntfs3/inode.c
@@ -805,6 +805,10 @@ static ssize_t ntfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
ret = 0;
goto out;
}
+ if (is_compressed(ni)) {
+ ret = 0;
+ goto out;
+ }
ret = blockdev_direct_IO(iocb, inode, iter,
wr ? ntfs_get_block_direct_IO_W :
@@ -2068,5 +2072,6 @@ const struct address_space_operations ntfs_aops_cmpr = {
.read_folio = ntfs_read_folio,
.readahead = ntfs_readahead,
.dirty_folio = block_dirty_folio,
+ .direct_IO = ntfs_direct_IO,
};
// clang-format on
diff --git a/fs/ntfs3/namei.c b/fs/ntfs3/namei.c
index 652735a0b0c4..b807744fc6a9 100644
--- a/fs/ntfs3/namei.c
+++ b/fs/ntfs3/namei.c
@@ -507,8 +507,6 @@ const struct inode_operations ntfs_dir_inode_operations = {
.getattr = ntfs_getattr,
.listxattr = ntfs_listxattr,
.fiemap = ntfs_fiemap,
- .fileattr_get = ntfs_fileattr_get,
- .fileattr_set = ntfs_fileattr_set,
};
const struct inode_operations ntfs_special_inode_operations = {
diff --git a/fs/ntfs3/ntfs_fs.h b/fs/ntfs3/ntfs_fs.h
index d628977e2556..36b8052660d5 100644
--- a/fs/ntfs3/ntfs_fs.h
+++ b/fs/ntfs3/ntfs_fs.h
@@ -454,7 +454,6 @@ int attr_collapse_range(struct ntfs_inode *ni, u64 vbo, u64 bytes);
int attr_insert_range(struct ntfs_inode *ni, u64 vbo, u64 bytes);
int attr_punch_hole(struct ntfs_inode *ni, u64 vbo, u64 bytes, u32 *frame_size);
int attr_force_nonresident(struct ntfs_inode *ni);
-int attr_set_compress(struct ntfs_inode *ni, bool compr);
/* Functions from attrlist.c */
void al_destroy(struct ntfs_inode *ni);
@@ -497,9 +496,6 @@ extern const struct file_operations ntfs_dir_operations;
extern const struct file_operations ntfs_legacy_dir_operations;
/* Globals from file.c */
-int ntfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
-int ntfs_fileattr_set(struct mnt_idmap *idmap, struct dentry *dentry,
- struct fileattr *fa);
int ntfs_getattr(struct mnt_idmap *idmap, const struct path *path,
struct kstat *stat, u32 request_mask, u32 flags);
int ntfs_setattr(struct mnt_idmap *idmap, struct dentry *dentry,
@@ -585,7 +581,6 @@ int ni_rename(struct ntfs_inode *dir_ni, struct ntfs_inode *new_dir_ni,
bool *is_bad);
bool ni_is_dirty(struct inode *inode);
-int ni_set_compress(struct inode *inode, bool compr);
/* Globals from fslog.c */
bool check_index_header(const struct INDEX_HDR *hdr, size_t bytes);
diff --git a/fs/orangefs/orangefs-kernel.h b/fs/orangefs/orangefs-kernel.h
index 3d4b883a7660..3e153c2f6b82 100644
--- a/fs/orangefs/orangefs-kernel.h
+++ b/fs/orangefs/orangefs-kernel.h
@@ -32,6 +32,8 @@
#include <linux/slab.h>
#include <linux/types.h>
#include <linux/fs.h>
+#include <linux/fs_context.h>
+#include <linux/fs_parser.h>
#include <linux/vmalloc.h>
#include <linux/aio.h>
@@ -328,11 +330,9 @@ void purge_waiting_ops(void);
* defined in super.c
*/
extern uint64_t orangefs_features;
+extern const struct fs_parameter_spec orangefs_fs_param_spec[];
-struct dentry *orangefs_mount(struct file_system_type *fst,
- int flags,
- const char *devname,
- void *data);
+int orangefs_init_fs_context(struct fs_context *fc);
void orangefs_kill_sb(struct super_block *sb);
int orangefs_remount(struct orangefs_sb_info_s *);
diff --git a/fs/orangefs/orangefs-mod.c b/fs/orangefs/orangefs-mod.c
index 5ab741c60b7e..7ac16a4d2dc6 100644
--- a/fs/orangefs/orangefs-mod.c
+++ b/fs/orangefs/orangefs-mod.c
@@ -46,7 +46,8 @@ MODULE_PARM_DESC(hash_table_size,
static struct file_system_type orangefs_fs_type = {
.name = "pvfs2",
- .mount = orangefs_mount,
+ .init_fs_context = orangefs_init_fs_context,
+ .parameters = orangefs_fs_param_spec,
.kill_sb = orangefs_kill_sb,
.owner = THIS_MODULE,
};
diff --git a/fs/orangefs/super.c b/fs/orangefs/super.c
index eba3e357192e..64ca9498f550 100644
--- a/fs/orangefs/super.c
+++ b/fs/orangefs/super.c
@@ -9,7 +9,6 @@
#include "orangefs-kernel.h"
#include "orangefs-bufmap.h"
-#include <linux/parser.h>
#include <linux/hashtable.h>
#include <linux/seq_file.h>
@@ -22,18 +21,16 @@ LIST_HEAD(orangefs_superblocks);
DEFINE_SPINLOCK(orangefs_superblocks_lock);
enum {
- Opt_intr,
Opt_acl,
+ Opt_intr,
Opt_local_lock,
-
- Opt_err
};
-static const match_table_t tokens = {
- { Opt_acl, "acl" },
- { Opt_intr, "intr" },
- { Opt_local_lock, "local_lock" },
- { Opt_err, NULL }
+const struct fs_parameter_spec orangefs_fs_param_spec[] = {
+ fsparam_flag ("acl", Opt_acl),
+ fsparam_flag ("intr", Opt_intr),
+ fsparam_flag ("local_lock", Opt_local_lock),
+ {}
};
uint64_t orangefs_features;
@@ -51,48 +48,30 @@ static int orangefs_show_options(struct seq_file *m, struct dentry *root)
return 0;
}
-static int parse_mount_options(struct super_block *sb, char *options,
- int silent)
+static int orangefs_parse_param(struct fs_context *fc,
+ struct fs_parameter *param)
{
- struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(sb);
- substring_t args[MAX_OPT_ARGS];
- char *p;
-
- /*
- * Force any potential flags that might be set from the mount
- * to zero, ie, initialize to unset.
- */
- sb->s_flags &= ~SB_POSIXACL;
- orangefs_sb->flags &= ~ORANGEFS_OPT_INTR;
- orangefs_sb->flags &= ~ORANGEFS_OPT_LOCAL_LOCK;
-
- while ((p = strsep(&options, ",")) != NULL) {
- int token;
-
- if (!*p)
- continue;
-
- token = match_token(p, tokens, args);
- switch (token) {
- case Opt_acl:
- sb->s_flags |= SB_POSIXACL;
- break;
- case Opt_intr:
- orangefs_sb->flags |= ORANGEFS_OPT_INTR;
- break;
- case Opt_local_lock:
- orangefs_sb->flags |= ORANGEFS_OPT_LOCAL_LOCK;
- break;
- default:
- goto fail;
- }
+ struct orangefs_sb_info_s *orangefs_sb = fc->s_fs_info;
+ struct fs_parse_result result;
+ int opt;
+
+ opt = fs_parse(fc, orangefs_fs_param_spec, param, &result);
+ if (opt < 0)
+ return opt;
+
+ switch (opt) {
+ case Opt_acl:
+ fc->sb_flags |= SB_POSIXACL;
+ break;
+ case Opt_intr:
+ orangefs_sb->flags |= ORANGEFS_OPT_INTR;
+ break;
+ case Opt_local_lock:
+ orangefs_sb->flags |= ORANGEFS_OPT_LOCAL_LOCK;
+ break;
}
return 0;
-fail:
- if (!silent)
- gossip_err("Error: mount option [%s] is not supported.\n", p);
- return -EINVAL;
}
static void orangefs_inode_cache_ctor(void *req)
@@ -223,10 +202,20 @@ out_op_release:
* Remount as initiated by VFS layer. We just need to reparse the mount
* options, no need to signal pvfs2-client-core about it.
*/
-static int orangefs_remount_fs(struct super_block *sb, int *flags, char *data)
+static int orangefs_reconfigure(struct fs_context *fc)
{
- gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_remount_fs: called\n");
- return parse_mount_options(sb, data, 1);
+ struct super_block *sb = fc->root->d_sb;
+ struct orangefs_sb_info_s *orangefs_sb = ORANGEFS_SB(sb);
+ struct orangefs_sb_info_s *revised = fc->s_fs_info;
+ unsigned int flags;
+
+ flags = orangefs_sb->flags;
+ flags &= ~(ORANGEFS_OPT_INTR | ORANGEFS_OPT_LOCAL_LOCK);
+ flags |= revised->flags;
+ WRITE_ONCE(orangefs_sb->flags, flags);
+
+ gossip_debug(GOSSIP_SUPER_DEBUG, "orangefs_reconfigure: called\n");
+ return 0;
}
/*
@@ -319,7 +308,6 @@ static const struct super_operations orangefs_s_ops = {
.write_inode = orangefs_write_inode,
.drop_inode = generic_delete_inode,
.statfs = orangefs_statfs,
- .remount_fs = orangefs_remount_fs,
.show_options = orangefs_show_options,
};
@@ -410,8 +398,8 @@ static int orangefs_unmount(int id, __s32 fs_id, const char *devname)
}
static int orangefs_fill_sb(struct super_block *sb,
- struct orangefs_fs_mount_response *fs_mount,
- void *data, int silent)
+ struct fs_context *fc,
+ struct orangefs_fs_mount_response *fs_mount)
{
int ret;
struct inode *root;
@@ -424,12 +412,6 @@ static int orangefs_fill_sb(struct super_block *sb,
ORANGEFS_SB(sb)->fs_id = fs_mount->fs_id;
ORANGEFS_SB(sb)->id = fs_mount->id;
- if (data) {
- ret = parse_mount_options(sb, data, silent);
- if (ret)
- return ret;
- }
-
/* Hang the xattr handlers off the superblock */
sb->s_xattr = orangefs_xattr_handlers;
sb->s_magic = ORANGEFS_SUPER_MAGIC;
@@ -470,30 +452,24 @@ static int orangefs_fill_sb(struct super_block *sb,
return 0;
}
-struct dentry *orangefs_mount(struct file_system_type *fst,
- int flags,
- const char *devname,
- void *data)
+static int orangefs_get_tree(struct fs_context *fc)
{
int ret;
struct super_block *sb = ERR_PTR(-EINVAL);
struct orangefs_kernel_op_s *new_op;
- struct dentry *d = ERR_PTR(-EINVAL);
+
+ if (!fc->source)
+ return invalf(fc, "Device name not specified.\n");
gossip_debug(GOSSIP_SUPER_DEBUG,
"orangefs_mount: called with devname %s\n",
- devname);
-
- if (!devname) {
- gossip_err("ERROR: device name not specified.\n");
- return ERR_PTR(-EINVAL);
- }
+ fc->source);
new_op = op_alloc(ORANGEFS_VFS_OP_FS_MOUNT);
if (!new_op)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
- strscpy(new_op->upcall.req.fs_mount.orangefs_config_server, devname);
+ strscpy(new_op->upcall.req.fs_mount.orangefs_config_server, fc->source);
gossip_debug(GOSSIP_SUPER_DEBUG,
"Attempting ORANGEFS Mount via host %s\n",
@@ -511,37 +487,27 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
goto free_op;
}
- sb = sget(fst, NULL, set_anon_super, flags, NULL);
+ sb = sget_fc(fc, NULL, set_anon_super_fc);
if (IS_ERR(sb)) {
- d = ERR_CAST(sb);
+ ret = PTR_ERR(sb);
orangefs_unmount(new_op->downcall.resp.fs_mount.id,
- new_op->downcall.resp.fs_mount.fs_id, devname);
- goto free_op;
- }
-
- /* alloc and init our private orangefs sb info */
- sb->s_fs_info = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL);
- if (!ORANGEFS_SB(sb)) {
- d = ERR_PTR(-ENOMEM);
+ new_op->downcall.resp.fs_mount.fs_id,
+ fc->source);
goto free_op;
}
- ret = orangefs_fill_sb(sb,
- &new_op->downcall.resp.fs_mount, data,
- flags & SB_SILENT ? 1 : 0);
+ /* init our private orangefs sb info */
+ ret = orangefs_fill_sb(sb, fc, &new_op->downcall.resp.fs_mount);
- if (ret) {
- d = ERR_PTR(ret);
+ if (ret)
goto free_sb_and_op;
- }
/*
* on successful mount, store the devname and data
* used
*/
- strscpy(ORANGEFS_SB(sb)->devname, devname);
-
+ strscpy(ORANGEFS_SB(sb)->devname, fc->source);
/* mount_pending must be cleared */
ORANGEFS_SB(sb)->mount_pending = 0;
@@ -564,7 +530,7 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
if (orangefs_userspace_version >= 20906) {
new_op = op_alloc(ORANGEFS_VFS_OP_FEATURES);
if (!new_op)
- return ERR_PTR(-ENOMEM);
+ return -ENOMEM;
new_op->upcall.req.features.features = 0;
ret = service_operation(new_op, "orangefs_features", 0);
orangefs_features = new_op->downcall.resp.features.features;
@@ -573,7 +539,8 @@ struct dentry *orangefs_mount(struct file_system_type *fst,
orangefs_features = 0;
}
- return dget(sb->s_root);
+ fc->root = dget(sb->s_root);
+ return 0;
free_sb_and_op:
/* Will call orangefs_kill_sb with sb not in list. */
@@ -589,7 +556,43 @@ free_op:
op_release(new_op);
- return d;
+ return ret;
+}
+
+static void orangefs_free_fc(struct fs_context *fc)
+{
+ kfree(fc->s_fs_info);
+}
+
+static const struct fs_context_operations orangefs_context_ops = {
+ .free = orangefs_free_fc,
+ .parse_param = orangefs_parse_param,
+ .get_tree = orangefs_get_tree,
+ .reconfigure = orangefs_reconfigure,
+};
+
+/*
+ * Set up the filesystem mount context.
+ */
+int orangefs_init_fs_context(struct fs_context *fc)
+{
+ struct orangefs_sb_info_s *osi;
+
+ osi = kzalloc(sizeof(struct orangefs_sb_info_s), GFP_KERNEL);
+ if (!osi)
+ return -ENOMEM;
+
+ /*
+ * Force any potential flags that might be set from the mount
+ * to zero, ie, initialize to unset.
+ */
+ fc->sb_flags_mask &= ~SB_POSIXACL;
+ osi->flags &= ~ORANGEFS_OPT_INTR;
+ osi->flags &= ~ORANGEFS_OPT_LOCAL_LOCK;
+
+ fc->s_fs_info = osi;
+ fc->ops = &orangefs_context_ops;
+ return 0;
}
void orangefs_kill_sb(struct super_block *sb)
diff --git a/fs/sysfs/group.c b/fs/sysfs/group.c
index 8b01a7eda5fb..2d78e94072a0 100644
--- a/fs/sysfs/group.c
+++ b/fs/sysfs/group.c
@@ -21,7 +21,7 @@ static void remove_files(struct kernfs_node *parent,
const struct attribute_group *grp)
{
struct attribute *const *attr;
- struct bin_attribute *const *bin_attr;
+ const struct bin_attribute *const *bin_attr;
if (grp->attrs)
for (attr = grp->attrs; *attr; attr++)
@@ -47,7 +47,7 @@ static int create_files(struct kernfs_node *parent, struct kobject *kobj,
const struct attribute_group *grp, int update)
{
struct attribute *const *attr;
- struct bin_attribute *const *bin_attr;
+ const struct bin_attribute *const *bin_attr;
int error = 0, i;
if (grp->attrs) {
@@ -521,7 +521,7 @@ static int sysfs_group_attrs_change_owner(struct kernfs_node *grp_kn,
}
if (grp->bin_attrs) {
- struct bin_attribute *const *bin_attr;
+ const struct bin_attribute *const *bin_attr;
for (bin_attr = grp->bin_attrs; *bin_attr; bin_attr++) {
kn = kernfs_find_and_get(grp_kn, (*bin_attr)->attr.name);