summaryrefslogtreecommitdiff
path: root/mm/filemap.c
diff options
context:
space:
mode:
authorLinus Torvalds <torvalds@linux-foundation.org>2025-01-23 13:36:06 -0800
committerLinus Torvalds <torvalds@linux-foundation.org>2025-01-23 13:36:06 -0800
commit8883957b3c9de2087fb6cf9691c1188cccf1ac9c (patch)
treeedd99014c5e520d44d6fe572a0c9d4776307b53a /mm/filemap.c
parentfb6fec6bdd9b16a935a0557773e313262366d071 (diff)
parent0c0214df28f0dba8de084cb4dedc0c459dfbc083 (diff)
Merge tag 'fsnotify_hsm_for_v6.14-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs
Pull fsnotify pre-content notification support from Jan Kara: "This introduces a new fsnotify event (FS_PRE_ACCESS) that gets generated before a file contents is accessed. The event is synchronous so if there is listener for this event, the kernel waits for reply. On success the execution continues as usual, on failure we propagate the error to userspace. This allows userspace to fill in file content on demand from slow storage. The context in which the events are generated has been picked so that we don't hold any locks and thus there's no risk of a deadlock for the userspace handler. The new pre-content event is available only for users with global CAP_SYS_ADMIN capability (similarly to other parts of fanotify functionality) and it is an administrator responsibility to make sure the userspace event handler doesn't do stupid stuff that can DoS the system. Based on your feedback from the last submission, fsnotify code has been improved and now file->f_mode encodes whether pre-content event needs to be generated for the file so the fast path when nobody wants pre-content event for the file just grows the additional file->f_mode check. As a bonus this also removes the checks whether the old FS_ACCESS event needs to be generated from the fast path. Also the place where the event is generated during page fault has been moved so now filemap_fault() generates the event if and only if there is no uptodate folio in the page cache. Also we have dropped FS_PRE_MODIFY event as current real-world users of the pre-content functionality don't really use it so let's start with the minimal useful feature set" * tag 'fsnotify_hsm_for_v6.14-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/jack/linux-fs: (21 commits) fanotify: Fix crash in fanotify_init(2) fs: don't block write during exec on pre-content watched files fs: enable pre-content events on supported file systems ext4: add pre-content fsnotify hook for DAX faults btrfs: disable defrag on pre-content watched files xfs: add pre-content fsnotify hook for DAX faults fsnotify: generate pre-content permission event on page fault mm: don't allow huge faults for files with pre content watches fanotify: disable readahead if we have pre-content watches fanotify: allow to set errno in FAN_DENY permission response fanotify: report file range info with pre-content events fanotify: introduce FAN_PRE_ACCESS permission event fsnotify: generate pre-content permission event on truncate fsnotify: pass optional file access range in pre-content event fsnotify: introduce pre-content permission events fanotify: reserve event bit of deprecated FAN_DIR_MODIFY fanotify: rename a misnamed constant fanotify: don't skip extra event info if no info_mode is set fsnotify: check if file is actually being watched for pre-content events on open fsnotify: opt-in for permission events at file open time ...
Diffstat (limited to 'mm/filemap.c')
-rw-r--r--mm/filemap.c86
1 files changed, 86 insertions, 0 deletions
diff --git a/mm/filemap.c b/mm/filemap.c
index 440922a7d8f1..b8ed647416e9 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -47,6 +47,7 @@
#include <linux/splice.h>
#include <linux/rcupdate_wait.h>
#include <linux/sched/mm.h>
+#include <linux/fsnotify.h>
#include <asm/pgalloc.h>
#include <asm/tlbflush.h>
#include "internal.h"
@@ -3141,6 +3142,14 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
unsigned long vm_flags = vmf->vma->vm_flags;
unsigned int mmap_miss;
+ /*
+ * If we have pre-content watches we need to disable readahead to make
+ * sure that we don't populate our mapping with 0 filled pages that we
+ * never emitted an event for.
+ */
+ if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
+ return fpin;
+
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
/* Use the readahead code, even if readahead is disabled */
if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) {
@@ -3209,6 +3218,10 @@ static struct file *do_async_mmap_readahead(struct vm_fault *vmf,
struct file *fpin = NULL;
unsigned int mmap_miss;
+ /* See comment in do_sync_mmap_readahead. */
+ if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode)))
+ return fpin;
+
/* If we don't want any read-ahead, don't bother */
if (vmf->vma->vm_flags & VM_RAND_READ || !ra->ra_pages)
return fpin;
@@ -3268,6 +3281,48 @@ static vm_fault_t filemap_fault_recheck_pte_none(struct vm_fault *vmf)
}
/**
+ * filemap_fsnotify_fault - maybe emit a pre-content event.
+ * @vmf: struct vm_fault containing details of the fault.
+ *
+ * If we have a pre-content watch on this file we will emit an event for this
+ * range. If we return anything the fault caller should return immediately, we
+ * will return VM_FAULT_RETRY if we had to emit an event, which will trigger the
+ * fault again and then the fault handler will run the second time through.
+ *
+ * Return: a bitwise-OR of %VM_FAULT_ codes, 0 if nothing happened.
+ */
+vm_fault_t filemap_fsnotify_fault(struct vm_fault *vmf)
+{
+ struct file *fpin = NULL;
+ int mask = (vmf->flags & FAULT_FLAG_WRITE) ? MAY_WRITE : MAY_ACCESS;
+ loff_t pos = vmf->pgoff >> PAGE_SHIFT;
+ size_t count = PAGE_SIZE;
+ int err;
+
+ /*
+ * We already did this and now we're retrying with everything locked,
+ * don't emit the event and continue.
+ */
+ if (vmf->flags & FAULT_FLAG_TRIED)
+ return 0;
+
+ /* No watches, we're done. */
+ if (likely(!FMODE_FSNOTIFY_HSM(vmf->vma->vm_file->f_mode)))
+ return 0;
+
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+ if (!fpin)
+ return VM_FAULT_SIGBUS;
+
+ err = fsnotify_file_area_perm(fpin, mask, &pos, count);
+ fput(fpin);
+ if (err)
+ return VM_FAULT_SIGBUS;
+ return VM_FAULT_RETRY;
+}
+EXPORT_SYMBOL_GPL(filemap_fsnotify_fault);
+
+/**
* filemap_fault - read in file data for page fault handling
* @vmf: struct vm_fault containing details of the fault
*
@@ -3371,6 +3426,37 @@ retry_find:
*/
if (unlikely(!folio_test_uptodate(folio))) {
/*
+ * If this is a precontent file we have can now emit an event to
+ * try and populate the folio.
+ */
+ if (!(vmf->flags & FAULT_FLAG_TRIED) &&
+ unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
+ loff_t pos = folio_pos(folio);
+ size_t count = folio_size(folio);
+
+ /* We're NOWAIT, we have to retry. */
+ if (vmf->flags & FAULT_FLAG_RETRY_NOWAIT) {
+ folio_unlock(folio);
+ goto out_retry;
+ }
+
+ if (mapping_locked)
+ filemap_invalidate_unlock_shared(mapping);
+ mapping_locked = false;
+
+ folio_unlock(folio);
+ fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+ if (!fpin)
+ goto out_retry;
+
+ error = fsnotify_file_area_perm(fpin, MAY_ACCESS, &pos,
+ count);
+ if (error)
+ ret = VM_FAULT_SIGBUS;
+ goto out_retry;
+ }
+
+ /*
* If the invalidate lock is not held, the folio was in cache
* and uptodate and now it is not. Strange but possible since we
* didn't hold the page lock all the time. Let's drop