Diffstat (limited to 'mm/filemap.c')
-rw-r--r--	mm/filemap.c	326
1 file changed, 195 insertions(+), 131 deletions(-)
diff --git a/mm/filemap.c b/mm/filemap.c
index 13f0259d993c..ebd75684cb0a 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -21,7 +21,7 @@
 #include <linux/gfp.h>
 #include <linux/mm.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/syscalls.h>
 #include <linux/mman.h>
 #include <linux/pagemap.h>
@@ -48,7 +48,8 @@
 #include <linux/rcupdate_wait.h>
 #include <linux/sched/mm.h>
 #include <linux/sysctl.h>
-#include <asm/pgalloc.h>
+#include <linux/pgalloc.h>
+
 #include <asm/tlbflush.h>
 #include "internal.h"
@@ -181,13 +182,13 @@ static void filemap_unaccount_folio(struct address_space *mapping,
 	nr = folio_nr_pages(folio);
-	__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
+	lruvec_stat_mod_folio(folio, NR_FILE_PAGES, -nr);
 	if (folio_test_swapbacked(folio)) {
-		__lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
+		lruvec_stat_mod_folio(folio, NR_SHMEM, -nr);
 		if (folio_test_pmd_mappable(folio))
-			__lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
+			lruvec_stat_mod_folio(folio, NR_SHMEM_THPS, -nr);
 	} else if (folio_test_pmd_mappable(folio)) {
-		__lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
+		lruvec_stat_mod_folio(folio, NR_FILE_THPS, -nr);
 		filemap_nr_thps_dec(mapping);
 	}
 	if (test_bit(AS_KERNEL_FILE, &folio->mapping->flags))
@@ -256,7 +257,7 @@ void filemap_remove_folio(struct folio *folio)
 	__filemap_remove_folio(folio, NULL);
 	xa_unlock_irq(&mapping->i_pages);
 	if (mapping_shrinkable(mapping))
-		inode_add_lru(mapping->host);
+		inode_lru_list_add(mapping->host);
 	spin_unlock(&mapping->host->i_lock);
 	filemap_free_folio(mapping, folio);
@@ -335,7 +336,7 @@ void delete_from_page_cache_batch(struct address_space *mapping,
 	page_cache_delete_batch(mapping, fbatch);
 	xa_unlock_irq(&mapping->i_pages);
 	if (mapping_shrinkable(mapping))
-		inode_add_lru(mapping->host);
+		inode_lru_list_add(mapping->host);
 	spin_unlock(&mapping->host->i_lock);
 	for (i = 0; i < folio_batch_count(fbatch); i++)
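The hunks above drop the __-prefixed lruvec stat helpers in favour of the plain variants. As a hedged illustration only (the helper name my_account_folio is made up, and the premise that lruvec_stat_mod_folio() no longer needs the caller to disable interrupts is inferred from this change), the accounting pattern mirrors __filemap_add_folio():

	/*
	 * Sketch: account a folio added to the page cache. One call covers
	 * the whole large folio, so the delta is folio_nr_pages(), not 1.
	 */
	static void my_account_folio(struct folio *folio)
	{
		long nr = folio_nr_pages(folio);

		lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
		if (folio_test_pmd_mappable(folio))
			lruvec_stat_mod_folio(folio, NR_FILE_THPS, nr);
	}
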
@@ -366,83 +367,60 @@ static int filemap_check_and_keep_errors(struct address_space *mapping)
 	return 0;
 }
-/**
- * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
- * @mapping:	address space structure to write
- * @wbc:	the writeback_control controlling the writeout
- *
- * Call writepages on the mapping using the provided wbc to control the
- * writeout.
- *
- * Return: %0 on success, negative error code otherwise.
- */
-int filemap_fdatawrite_wbc(struct address_space *mapping,
-			   struct writeback_control *wbc)
+static int filemap_writeback(struct address_space *mapping, loff_t start,
+		loff_t end, enum writeback_sync_modes sync_mode,
+		long *nr_to_write)
 {
+	struct writeback_control wbc = {
+		.sync_mode = sync_mode,
+		.nr_to_write = nr_to_write ? *nr_to_write : LONG_MAX,
+		.range_start = start,
+		.range_end = end,
+	};
 	int ret;
 	if (!mapping_can_writeback(mapping) ||
 	    !mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
 		return 0;
-	wbc_attach_fdatawrite_inode(wbc, mapping->host);
-	ret = do_writepages(mapping, wbc);
-	wbc_detach_inode(wbc);
+	wbc_attach_fdatawrite_inode(&wbc, mapping->host);
+	ret = do_writepages(mapping, &wbc);
+	wbc_detach_inode(&wbc);
+
+	if (!ret && nr_to_write)
+		*nr_to_write = wbc.nr_to_write;
 	return ret;
 }
-EXPORT_SYMBOL(filemap_fdatawrite_wbc);
 /**
- * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
+ * filemap_fdatawrite_range - start writeback on mapping dirty pages in range
  * @mapping:	address space structure to write
  * @start:	offset in bytes where the range starts
  * @end:	offset in bytes where the range ends (inclusive)
- * @sync_mode:	enable synchronous operation
  *
  * Start writeback against all of a mapping's dirty pages that lie
  * within the byte offsets <start, end> inclusive.
  *
- * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
- * opposed to a regular memory cleansing writeback. The difference between
- * these two operations is that if a dirty page/buffer is encountered, it must
- * be waited upon, and not just skipped over.
+ * This is a data integrity operation that waits on pages that are dirty or
+ * under writeback.
  *
  * Return: %0 on success, negative error code otherwise.
  */
-int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
-		loff_t end, int sync_mode)
-{
-	struct writeback_control wbc = {
-		.sync_mode = sync_mode,
-		.nr_to_write = LONG_MAX,
-		.range_start = start,
-		.range_end = end,
-	};
-
-	return filemap_fdatawrite_wbc(mapping, &wbc);
-}
-
-static inline int __filemap_fdatawrite(struct address_space *mapping,
-		int sync_mode)
+int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+		loff_t end)
 {
-	return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
+	return filemap_writeback(mapping, start, end, WB_SYNC_ALL, NULL);
 }
+EXPORT_SYMBOL(filemap_fdatawrite_range);
 int filemap_fdatawrite(struct address_space *mapping)
 {
-	return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
+	return filemap_fdatawrite_range(mapping, 0, LLONG_MAX);
 }
 EXPORT_SYMBOL(filemap_fdatawrite);
-int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
-		loff_t end)
-{
-	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
-}
-EXPORT_SYMBOL(filemap_fdatawrite_range);
-
 /**
- * filemap_fdatawrite_range_kick - start writeback on a range
+ * filemap_flush_range - start writeback on a range
  * @mapping:	target address_space
  * @start:	index to start writeback on
  * @end:	last (inclusive) index for writeback
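With filemap_writeback() as the single implementation, filemap_fdatawrite_range() is now unconditionally WB_SYNC_ALL. A minimal sketch of a write-and-wait caller built on the reworked entry point (my_sync_range is a hypothetical name; the wait step uses the existing filemap_fdatawait_range()):

	/* Sketch: integrity writeback over a byte range, then wait for it. */
	static int my_sync_range(struct address_space *mapping,
			loff_t start, loff_t end)
	{
		int err;

		err = filemap_fdatawrite_range(mapping, start, end);
		if (err)
			return err;
		return filemap_fdatawait_range(mapping, start, end);
	}
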
@@ -452,12 +430,12 @@ EXPORT_SYMBOL(filemap_fdatawrite_range);
  *
  * Return: %0 on success, negative error code otherwise.
  */
-int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start,
+int filemap_flush_range(struct address_space *mapping, loff_t start,
 		loff_t end)
 {
-	return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE);
+	return filemap_writeback(mapping, start, end, WB_SYNC_NONE, NULL);
 }
-EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick);
+EXPORT_SYMBOL_GPL(filemap_flush_range);
 /**
  * filemap_flush - mostly a non-blocking flush
@@ -470,10 +448,22 @@ EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick);
  */
 int filemap_flush(struct address_space *mapping)
 {
-	return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
+	return filemap_flush_range(mapping, 0, LLONG_MAX);
 }
 EXPORT_SYMBOL(filemap_flush);
+/*
+ * Start writeback on @nr_to_write pages from @mapping. No one but the existing
+ * btrfs caller should be using this. Talk to linux-mm if you think adding a
+ * new caller is a good idea.
+ */
+int filemap_flush_nr(struct address_space *mapping, long *nr_to_write)
+{
+	return filemap_writeback(mapping, 0, LLONG_MAX, WB_SYNC_NONE,
+			nr_to_write);
+}
+EXPORT_SYMBOL_FOR_MODULES(filemap_flush_nr, "btrfs");
+
 /**
  * filemap_range_has_page - check if a page exists in range.
  * @mapping:		address space within which to check
@@ -691,8 +681,7 @@ int filemap_write_and_wait_range(struct address_space *mapping,
 		return 0;
 	if (mapping_needs_writeback(mapping)) {
-		err = __filemap_fdatawrite_range(mapping, lstart, lend,
-						 WB_SYNC_ALL);
+		err = filemap_fdatawrite_range(mapping, lstart, lend);
 		/*
 		 * Even if the above returned error, the pages may be
 		 * written partially (e.g. -ENOSPC), so we wait for it.
@@ -794,8 +783,7 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
 		return 0;
 	if (mapping_needs_writeback(mapping)) {
-		err = __filemap_fdatawrite_range(mapping, lstart, lend,
-						 WB_SYNC_ALL);
+		err = filemap_fdatawrite_range(mapping, lstart, lend);
 		/* See comment of filemap_write_and_wait() */
 		if (err != -EIO)
 			__filemap_fdatawait_range(mapping, lstart, lend);
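filemap_flush_nr() lets its single intended caller cap how much it writes and learn how much of the budget is left. A hedged sketch of that usage pattern (my_background_flush and the 1024-page budget are illustrative, not taken from btrfs):

	/* Sketch: best-effort flush of at most 1024 pages. */
	static void my_background_flush(struct address_space *mapping)
	{
		long nr_to_write = 1024;

		/* WB_SYNC_NONE underneath: busy folios are skipped, not waited on. */
		if (!filemap_flush_nr(mapping, &nr_to_write))
			pr_debug("pages left in budget: %ld\n", nr_to_write);
	}
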
@@ -843,13 +831,13 @@ void replace_page_cache_folio(struct folio *old, struct folio *new)
 	old->mapping = NULL;
 	/* hugetlb pages do not participate in page cache accounting. */
 	if (!folio_test_hugetlb(old))
-		__lruvec_stat_sub_folio(old, NR_FILE_PAGES);
+		lruvec_stat_sub_folio(old, NR_FILE_PAGES);
 	if (!folio_test_hugetlb(new))
-		__lruvec_stat_add_folio(new, NR_FILE_PAGES);
+		lruvec_stat_add_folio(new, NR_FILE_PAGES);
 	if (folio_test_swapbacked(old))
-		__lruvec_stat_sub_folio(old, NR_SHMEM);
+		lruvec_stat_sub_folio(old, NR_SHMEM);
 	if (folio_test_swapbacked(new))
-		__lruvec_stat_add_folio(new, NR_SHMEM);
+		lruvec_stat_add_folio(new, NR_SHMEM);
 	xas_unlock_irq(&xas);
 	if (free_folio)
 		free_folio(old);
@@ -932,9 +920,9 @@ noinline int __filemap_add_folio(struct address_space *mapping,
 		/* hugetlb pages do not participate in page cache accounting */
 		if (!huge) {
-			__lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
+			lruvec_stat_mod_folio(folio, NR_FILE_PAGES, nr);
 			if (folio_test_pmd_mappable(folio))
-				__lruvec_stat_mod_folio(folio,
+				lruvec_stat_mod_folio(folio,
 						NR_FILE_THPS, nr);
 		}
@@ -1002,11 +990,16 @@ int filemap_add_folio(struct address_space *mapping, struct folio *folio,
 EXPORT_SYMBOL_GPL(filemap_add_folio);
 #ifdef CONFIG_NUMA
-struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order)
+struct folio *filemap_alloc_folio_noprof(gfp_t gfp, unsigned int order,
+		struct mempolicy *policy)
 {
 	int n;
 	struct folio *folio;
+	if (policy)
+		return folio_alloc_mpol_noprof(gfp, order, policy,
+				NO_INTERLEAVE_INDEX, numa_node_id());
+
 	if (cpuset_do_page_mem_spread()) {
 		unsigned int cpuset_mems_cookie;
 		do {
@@ -1401,7 +1394,7 @@ repeat:
  * This follows the same logic as folio_wait_bit_common() so see the comments
  * there.
  */
-void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
+void migration_entry_wait_on_locked(softleaf_t entry, spinlock_t *ptl)
 	__releases(ptl)
 {
 	struct wait_page_queue wait_page;
@@ -1410,7 +1403,7 @@ void migration_entry_wait_on_locked(swp_entry_t entry, spinlock_t *ptl)
 	unsigned long pflags;
 	bool in_thrashing;
 	wait_queue_head_t *q;
-	struct folio *folio = pfn_swap_entry_folio(entry);
+	struct folio *folio = softleaf_to_folio(entry);
 	q = folio_waitqueue(folio);
 	if (!folio_test_uptodate(folio) && folio_test_workingset(folio)) {
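filemap_alloc_folio() now takes a mempolicy, with NULL preserving the old cpuset-spread/local-node behaviour. A hedged sketch of an allocation site after the API change (my_grab_folio is a hypothetical helper):

	/* Sketch: allocate a page cache folio, optionally policy-placed. */
	static struct folio *my_grab_folio(struct address_space *mapping,
			struct mempolicy *policy)
	{
		gfp_t gfp = mapping_gfp_mask(mapping);
		unsigned int order = mapping_min_folio_order(mapping);

		/* policy == NULL keeps the default NUMA placement logic. */
		return filemap_alloc_folio(gfp, order, policy);
	}
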
@@ -1923,11 +1916,12 @@ out:
 }
 /**
- * __filemap_get_folio - Find and get a reference to a folio.
+ * __filemap_get_folio_mpol - Find and get a reference to a folio.
  * @mapping: The address_space to search.
  * @index: The page index.
  * @fgp_flags: %FGP flags modify how the folio is returned.
  * @gfp: Memory allocation flags to use if %FGP_CREAT is specified.
+ * @policy: NUMA memory allocation policy to follow.
  *
  * Looks up the page cache entry at @mapping & @index.
  *
 * If this function returns a folio, it is returned with an increased refcount.
 *
 * Return: The found folio or an ERR_PTR() otherwise.
 */
-struct folio *__filemap_get_folio(struct address_space *mapping, pgoff_t index,
-		fgf_t fgp_flags, gfp_t gfp)
+struct folio *__filemap_get_folio_mpol(struct address_space *mapping,
+		pgoff_t index, fgf_t fgp_flags, gfp_t gfp, struct mempolicy *policy)
 {
 	struct folio *folio;
@@ -2009,7 +2003,7 @@ no_page:
 			err = -ENOMEM;
 			if (order > min_order)
 				alloc_gfp |= __GFP_NORETRY | __GFP_NOWARN;
-			folio = filemap_alloc_folio(alloc_gfp, order);
+			folio = filemap_alloc_folio(alloc_gfp, order, policy);
 			if (!folio)
 				continue;
@@ -2056,7 +2050,7 @@ no_page:
 	folio_clear_dropbehind(folio);
 	return folio;
 }
-EXPORT_SYMBOL(__filemap_get_folio);
+EXPORT_SYMBOL(__filemap_get_folio_mpol);
 static inline struct folio *find_get_entry(struct xa_state *xas, pgoff_t max,
 		xa_mark_t mark)
@@ -2366,6 +2360,64 @@ out:
 }
 EXPORT_SYMBOL(filemap_get_folios_tag);
+/**
+ * filemap_get_folios_dirty - Get a batch of dirty folios
+ * @mapping:	The address_space to search
+ * @start:	The starting folio index
+ * @end:	The final folio index (inclusive)
+ * @fbatch:	The batch to fill
+ *
+ * filemap_get_folios_dirty() works exactly like filemap_get_folios(), except
+ * that the returned folios are presumed to be dirty or under writeback. The
+ * state is only presumed because we neither block on the folio lock nor want
+ * to miss folios; callers that need certainty can recheck the state once they
+ * hold the folio lock.
+ *
+ * This may not return all dirty folios if the batch gets filled up.
+ *
+ * Return: The number of folios found.
+ * Also update @start to be positioned for traversal of the next folio.
+ */
+unsigned filemap_get_folios_dirty(struct address_space *mapping, pgoff_t *start,
+		pgoff_t end, struct folio_batch *fbatch)
+{
+	XA_STATE(xas, &mapping->i_pages, *start);
+	struct folio *folio;
+
+	rcu_read_lock();
+	while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
+		if (xa_is_value(folio))
+			continue;
+		if (folio_trylock(folio)) {
+			bool clean = !folio_test_dirty(folio) &&
+				     !folio_test_writeback(folio);
+
+			folio_unlock(folio);
+			if (clean) {
+				folio_put(folio);
+				continue;
+			}
+		}
+		if (!folio_batch_add(fbatch, folio)) {
+			unsigned long nr = folio_nr_pages(folio);
+
+			*start = folio->index + nr;
+			goto out;
+		}
+	}
+	/*
+	 * We come here when there is no folio beyond @end. We take care to not
+	 * overflow the index @start as it confuses some of the callers. This
+	 * breaks the iteration when there is a folio at index -1 but that is
+	 * already broken anyway.
+	 */
+	if (end == (pgoff_t)-1)
+		*start = (pgoff_t)-1;
+	else
+		*start = end + 1;
+out:
+	rcu_read_unlock();
+
+	return folio_batch_count(fbatch);
+}
+
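filemap_get_folios_dirty() follows the usual folio_batch iteration contract. A hedged sketch of a caller (my_writeout stands in for whatever per-folio work a real user would do; the recheck under the folio lock follows the function's own documentation):

	static void my_writeout(struct folio *folio);	/* hypothetical */

	/* Sketch: visit every folio in [start, end] that may be dirty. */
	static void my_walk_dirty(struct address_space *mapping,
			pgoff_t start, pgoff_t end)
	{
		struct folio_batch fbatch;
		unsigned int i;

		folio_batch_init(&fbatch);
		while (filemap_get_folios_dirty(mapping, &start, end, &fbatch)) {
			for (i = 0; i < folio_batch_count(&fbatch); i++) {
				struct folio *folio = fbatch.folios[i];

				folio_lock(folio);
				/* Dirtiness was only presumed; recheck it. */
				if (folio_test_dirty(folio))
					my_writeout(folio);
				folio_unlock(folio);
			}
			/* Drops the batch's folio references and resets it. */
			folio_batch_release(&fbatch);
			cond_resched();
		}
	}
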
 /*
  * CD/DVDs are error prone. When a medium error occurs, the driver may fail
  * a _large_ part of the i/o request. Imagine the worst scenario:
@@ -2551,7 +2603,7 @@ static int filemap_create_folio(struct kiocb *iocb, struct folio_batch *fbatch)
 	if (iocb->ki_flags & (IOCB_NOWAIT | IOCB_WAITQ))
 		return -EAGAIN;
-	folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order);
+	folio = filemap_alloc_folio(mapping_gfp_mask(mapping), min_order, NULL);
 	if (!folio)
 		return -ENOMEM;
 	if (iocb->ki_flags & IOCB_DONTCACHE)
@@ -3253,11 +3305,47 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 	DEFINE_READAHEAD(ractl, file, ra, mapping, vmf->pgoff);
 	struct file *fpin = NULL;
 	vm_flags_t vm_flags = vmf->vma->vm_flags;
+	bool force_thp_readahead = false;
 	unsigned short mmap_miss;
-#ifdef CONFIG_TRANSPARENT_HUGEPAGE
 	/* Use the readahead code, even if readahead is disabled */
-	if ((vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER) {
+	if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) &&
+	    (vm_flags & VM_HUGEPAGE) && HPAGE_PMD_ORDER <= MAX_PAGECACHE_ORDER)
+		force_thp_readahead = true;
+
+	if (!force_thp_readahead) {
+		/*
+		 * If we don't want any read-ahead, don't bother.
+		 * VM_EXEC case below is already intended for random access.
+		 */
+		if ((vm_flags & (VM_RAND_READ | VM_EXEC)) == VM_RAND_READ)
+			return fpin;
+
+		if (!ra->ra_pages)
+			return fpin;
+
+		if (vm_flags & VM_SEQ_READ) {
+			fpin = maybe_unlock_mmap_for_io(vmf, fpin);
+			page_cache_sync_ra(&ractl, ra->ra_pages);
+			return fpin;
+		}
+	}
+
+	if (!(vm_flags & VM_SEQ_READ)) {
+		/* Avoid banging the cache line if not needed */
+		mmap_miss = READ_ONCE(ra->mmap_miss);
+		if (mmap_miss < MMAP_LOTSAMISS * 10)
+			WRITE_ONCE(ra->mmap_miss, ++mmap_miss);
+
+		/*
+		 * Do we miss much more than hit in this file? If so,
+		 * stop bothering with read-ahead. It will only hurt.
+		 */
+		if (mmap_miss > MMAP_LOTSAMISS)
+			return fpin;
+	}
+
+	if (force_thp_readahead) {
 		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
 		ractl._index &= ~((unsigned long)HPAGE_PMD_NR - 1);
 		ra->size = HPAGE_PMD_NR;
@@ -3272,34 +3360,6 @@ static struct file *do_sync_mmap_readahead(struct vm_fault *vmf)
 		page_cache_ra_order(&ractl, ra);
 		return fpin;
 	}
-#endif
-
-	/*
-	 * If we don't want any read-ahead, don't bother. VM_EXEC case below is
-	 * already intended for random access.
-	 */
-	if ((vm_flags & (VM_RAND_READ | VM_EXEC)) == VM_RAND_READ)
-		return fpin;
-	if (!ra->ra_pages)
-		return fpin;
-
-	if (vm_flags & VM_SEQ_READ) {
-		fpin = maybe_unlock_mmap_for_io(vmf, fpin);
-		page_cache_sync_ra(&ractl, ra->ra_pages);
-		return fpin;
-	}
-
-	/* Avoid banging the cache line if not needed */
-	mmap_miss = READ_ONCE(ra->mmap_miss);
-	if (mmap_miss < MMAP_LOTSAMISS * 10)
-		WRITE_ONCE(ra->mmap_miss, ++mmap_miss);
-
-	/*
-	 * Do we miss much more than hit in this file? If so,
-	 * stop bothering with read-ahead. It will only hurt.
-	 */
-	if (mmap_miss > MMAP_LOTSAMISS)
-		return fpin;
 	if (vm_flags & VM_EXEC) {
 		/*
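One effect of the restructuring above is that the mmap_miss accounting now also runs when THP readahead is forced, as long as the access is not sequential. The heuristic itself is unchanged; a hedged sketch of it in isolation (my_readahead_worthwhile is an illustrative name; MMAP_LOTSAMISS is the constant defined locally in mm/filemap.c):

	/* Sketch: the miss-dominated cutoff used by the fault path. */
	static bool my_readahead_worthwhile(struct file_ra_state *ra)
	{
		unsigned short mmap_miss = READ_ONCE(ra->mmap_miss);

		/* Saturate the counter to avoid endless cache-line dirtying. */
		if (mmap_miss < MMAP_LOTSAMISS * 10)
			WRITE_ONCE(ra->mmap_miss, ++mmap_miss);

		/* Far more misses than hits: readahead would only hurt. */
		return mmap_miss <= MMAP_LOTSAMISS;
	}
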
@@ -3681,8 +3741,10 @@ skip:
 static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
 			struct folio *folio, unsigned long start,
 			unsigned long addr, unsigned int nr_pages,
-			unsigned long *rss, unsigned short *mmap_miss)
+			unsigned long *rss, unsigned short *mmap_miss,
+			pgoff_t file_end)
 {
+	struct address_space *mapping = folio->mapping;
 	unsigned int ref_from_caller = 1;
 	vm_fault_t ret = 0;
 	struct page *page = folio_page(folio, start);
@@ -3691,12 +3753,16 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
 	unsigned long addr0;
 	/*
-	 * Map the large folio fully where possible.
+	 * Map the large folio fully where possible:
 	 *
-	 * The folio must not cross VMA or page table boundary.
+	 * - The folio is fully within the size of the file, or belongs
+	 *   to shmem/tmpfs;
+	 * - The folio doesn't cross a VMA boundary;
+	 * - The folio doesn't cross a page table boundary.
 	 */
 	addr0 = addr - start * PAGE_SIZE;
-	if (folio_within_vma(folio, vmf->vma) &&
+	if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) &&
+	    folio_within_vma(folio, vmf->vma) &&
 	    (addr0 & PMD_MASK) == ((addr0 + folio_size(folio) - 1) & PMD_MASK)) {
 		vmf->pte -= start;
 		page -= start;
@@ -3817,7 +3883,18 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 	if (!folio)
 		goto out;
-	if (filemap_map_pmd(vmf, folio, start_pgoff)) {
+	file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1;
+	end_pgoff = min(end_pgoff, file_end);
+
+	/*
+	 * Do not allow mapping with a PMD across i_size, to preserve
+	 * SIGBUS semantics.
+	 *
+	 * Make an exception for shmem/tmpfs, which has long been
+	 * intentionally mapped with PMDs across i_size.
+	 */
+	if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) &&
+	    filemap_map_pmd(vmf, folio, start_pgoff)) {
 		ret = VM_FAULT_NOPAGE;
 		goto out;
 	}
@@ -3830,10 +3907,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 		goto out;
 	}
-	file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1;
-	if (end_pgoff > file_end)
-		end_pgoff = file_end;
-
 	folio_type = mm_counter_file(folio);
 	do {
 		unsigned long end;
@@ -3850,7 +3923,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
 		else
 			ret |= filemap_map_folio_range(vmf, folio,
 					xas.xa_index - folio->index, addr,
-					nr_pages, &rss, &mmap_miss);
+					nr_pages, &rss, &mmap_miss, file_end);
 		folio_unlock(folio);
 	} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
@@ -3983,8 +4056,7 @@ static struct folio *do_read_cache_folio(struct address_space *mapping,
 repeat:
 	folio = filemap_get_folio(mapping, index);
 	if (IS_ERR(folio)) {
-		folio = filemap_alloc_folio(gfp,
-				mapping_min_folio_order(mapping));
+		folio = filemap_alloc_folio(gfp, mapping_min_folio_order(mapping), NULL);
 		if (!folio)
 			return ERR_PTR(-ENOMEM);
 		index = mapping_align_index(mapping, index);
@@ -4457,16 +4529,8 @@ int filemap_invalidate_inode(struct inode *inode, bool flush,
 		unmap_mapping_pages(mapping, first, nr, false);
 	/* Write back the data if we're asked to. */
-	if (flush) {
-		struct writeback_control wbc = {
-			.sync_mode	= WB_SYNC_ALL,
-			.nr_to_write	= LONG_MAX,
-			.range_start	= start,
-			.range_end	= end,
-		};
-
-		filemap_fdatawrite_wbc(mapping, &wbc);
-	}
+	if (flush)
+		filemap_fdatawrite_range(mapping, start, end);
 	/* Wait for writeback to complete on all folios and discard. */
 	invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE);
@@ -4545,7 +4609,7 @@ static void filemap_cachestat(struct address_space *mapping,
 			swp_entry_t swp = radix_to_swp_entry(folio);
 			/* swapin error results in poisoned entry */
-			if (non_swap_entry(swp))
+			if (!softleaf_is_swap(swp))
 				goto resched;
 			/*
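The SIGBUS-preserving check appears twice above, once before filemap_map_pmd() and once inside filemap_map_folio_range(). A hedged sketch of the predicate pulled out on its own (my_folio_within_eof is an illustrative name, not a helper this patch adds):

	/* Sketch: may this folio be mapped as one block without crossing EOF? */
	static bool my_folio_within_eof(struct address_space *mapping,
			struct folio *folio, pgoff_t file_end)
	{
		/*
		 * file_end is the index of the last page i_size reaches. The
		 * folio must lie wholly below it, except on shmem/tmpfs which
		 * has long mapped PMDs across i_size.
		 */
		return file_end >= folio_next_index(folio) ||
		       shmem_mapping(mapping);
	}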
