Diffstat (limited to 'mm')
-rw-r--r--  mm/damon/Kconfig          |  1
-rw-r--r--  mm/damon/sysfs-schemes.c  |  1
-rw-r--r--  mm/execmem.c              | 40
-rw-r--r--  mm/gup.c                  | 14
-rw-r--r--  mm/hugetlb.c              | 54
-rw-r--r--  mm/kmemleak.c             | 14
-rw-r--r--  mm/madvise.c              |  2
-rw-r--r--  mm/memory.c               | 20
-rw-r--r--  mm/shmem.c                |  6
-rw-r--r--  mm/swap.h                 | 23
-rw-r--r--  mm/userfaultfd.c          | 33
-rw-r--r--  mm/util.c                 | 40
-rw-r--r--  mm/vma.c                  | 23
-rw-r--r--  mm/vma.h                  | 47
14 files changed, 197 insertions, 121 deletions
diff --git a/mm/damon/Kconfig b/mm/damon/Kconfig
index 551745df011b..c93d0c56b963 100644
--- a/mm/damon/Kconfig
+++ b/mm/damon/Kconfig
@@ -4,7 +4,6 @@ menu "Data Access Monitoring"
 
 config DAMON
 	bool "DAMON: Data Access Monitoring Framework"
-	default y
 	help
 	  This builds a framework that allows kernel subsystems to monitor
 	  access frequency of each memory region. The information can be useful
diff --git a/mm/damon/sysfs-schemes.c b/mm/damon/sysfs-schemes.c
index 0f6c9e1fec0b..30ae7518ffbf 100644
--- a/mm/damon/sysfs-schemes.c
+++ b/mm/damon/sysfs-schemes.c
@@ -472,6 +472,7 @@ static ssize_t memcg_path_store(struct kobject *kobj,
 		return -ENOMEM;
 
 	strscpy(path, buf, count + 1);
+	kfree(filter->memcg_path);
 	filter->memcg_path = path;
 	return count;
 }
diff --git a/mm/execmem.c b/mm/execmem.c
index 9720ac2dfa41..2b683e7d864d 100644
--- a/mm/execmem.c
+++ b/mm/execmem.c
@@ -254,34 +254,6 @@ out_unlock:
 	return ptr;
 }
 
-static bool execmem_cache_rox = false;
-
-void execmem_cache_make_ro(void)
-{
-	struct maple_tree *free_areas = &execmem_cache.free_areas;
-	struct maple_tree *busy_areas = &execmem_cache.busy_areas;
-	MA_STATE(mas_free, free_areas, 0, ULONG_MAX);
-	MA_STATE(mas_busy, busy_areas, 0, ULONG_MAX);
-	struct mutex *mutex = &execmem_cache.mutex;
-	void *area;
-
-	execmem_cache_rox = true;
-
-	mutex_lock(mutex);
-
-	mas_for_each(&mas_free, area, ULONG_MAX) {
-		unsigned long pages = mas_range_len(&mas_free) >> PAGE_SHIFT;
-		set_memory_ro(mas_free.index, pages);
-	}
-
-	mas_for_each(&mas_busy, area, ULONG_MAX) {
-		unsigned long pages = mas_range_len(&mas_busy) >> PAGE_SHIFT;
-		set_memory_ro(mas_busy.index, pages);
-	}
-
-	mutex_unlock(mutex);
-}
-
 static int execmem_cache_populate(struct execmem_range *range, size_t size)
 {
 	unsigned long vm_flags = VM_ALLOW_HUGE_VMAP;
@@ -302,15 +274,9 @@ static int execmem_cache_populate(struct execmem_range *range, size_t size)
 	/* fill memory with instructions that will trap */
 	execmem_fill_trapping_insns(p, alloc_size, /* writable = */ true);
 
-	if (execmem_cache_rox) {
-		err = set_memory_rox((unsigned long)p, vm->nr_pages);
-		if (err)
-			goto err_free_mem;
-	} else {
-		err = set_memory_x((unsigned long)p, vm->nr_pages);
-		if (err)
-			goto err_free_mem;
-	}
+	err = set_memory_rox((unsigned long)p, vm->nr_pages);
+	if (err)
+		goto err_free_mem;
 
 	err = execmem_cache_add(p, alloc_size);
 	if (err)
diff --git a/mm/gup.c b/mm/gup.c
--- a/mm/gup.c
+++ b/mm/gup.c
@@ -2303,13 +2303,13 @@ static void pofs_unpin(struct pages_or_folios *pofs)
 /*
  * Returns the number of collected folios. Return value is always >= 0.
  */
-static void collect_longterm_unpinnable_folios(
+static unsigned long collect_longterm_unpinnable_folios(
 		struct list_head *movable_folio_list,
 		struct pages_or_folios *pofs)
 {
+	unsigned long i, collected = 0;
 	struct folio *prev_folio = NULL;
 	bool drain_allow = true;
-	unsigned long i;
 
 	for (i = 0; i < pofs->nr_entries; i++) {
 		struct folio *folio = pofs_get_folio(pofs, i);
@@ -2321,6 +2321,8 @@ static void collect_longterm_unpinnable_folios(
 		if (folio_is_longterm_pinnable(folio))
 			continue;
 
+		collected++;
+
 		if (folio_is_device_coherent(folio))
 			continue;
 
@@ -2342,6 +2344,8 @@ static void collect_longterm_unpinnable_folios(
 				    NR_ISOLATED_ANON + folio_is_file_lru(folio),
 				    folio_nr_pages(folio));
 	}
+
+	return collected;
 }
 
 /*
@@ -2418,9 +2422,11 @@ static long
 check_and_migrate_movable_pages_or_folios(struct pages_or_folios *pofs)
 {
 	LIST_HEAD(movable_folio_list);
+	unsigned long collected;
 
-	collect_longterm_unpinnable_folios(&movable_folio_list, pofs);
-	if (list_empty(&movable_folio_list))
+	collected = collect_longterm_unpinnable_folios(&movable_folio_list,
+						       pofs);
+	if (!collected)
 		return 0;
 
 	return migrate_longterm_unpinnable_folios(&movable_folio_list, pofs);
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 8746ed2fec13..9dc95eac558c 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2787,20 +2787,24 @@ void restore_reserve_on_error(struct hstate *h, struct vm_area_struct *vma,
 /*
  * alloc_and_dissolve_hugetlb_folio - Allocate a new folio and dissolve
  * the old one
- * @h: struct hstate old page belongs to
  * @old_folio: Old folio to dissolve
  * @list: List to isolate the page in case we need to
  * Returns 0 on success, otherwise negated error.
  */
-static int alloc_and_dissolve_hugetlb_folio(struct hstate *h,
-		struct folio *old_folio, struct list_head *list)
+static int alloc_and_dissolve_hugetlb_folio(struct folio *old_folio,
+		struct list_head *list)
 {
-	gfp_t gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
+	gfp_t gfp_mask;
+	struct hstate *h;
 	int nid = folio_nid(old_folio);
 	struct folio *new_folio = NULL;
 	int ret = 0;
 
 retry:
+	/*
+	 * The old_folio might have been dissolved from under our feet, so make sure
+	 * to carefully check the state under the lock.
+	 */
 	spin_lock_irq(&hugetlb_lock);
 	if (!folio_test_hugetlb(old_folio)) {
 		/*
@@ -2829,8 +2833,10 @@ retry:
 			cond_resched();
 			goto retry;
 		} else {
+			h = folio_hstate(old_folio);
 			if (!new_folio) {
 				spin_unlock_irq(&hugetlb_lock);
+				gfp_mask = htlb_alloc_mask(h) | __GFP_THISNODE;
 				new_folio = alloc_buddy_hugetlb_folio(h, gfp_mask,
 						nid, NULL, NULL);
 				if (!new_folio)
@@ -2874,35 +2880,24 @@ free_new:
 
 int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list)
 {
-	struct hstate *h;
 	int ret = -EBUSY;
 
-	/*
-	 * The page might have been dissolved from under our feet, so make sure
-	 * to carefully check the state under the lock.
-	 * Return success when racing as if we dissolved the page ourselves.
-	 */
-	spin_lock_irq(&hugetlb_lock);
-	if (folio_test_hugetlb(folio)) {
-		h = folio_hstate(folio);
-	} else {
-		spin_unlock_irq(&hugetlb_lock);
+	/* Not to disrupt normal path by vainly holding hugetlb_lock */
+	if (!folio_test_hugetlb(folio))
 		return 0;
-	}
-	spin_unlock_irq(&hugetlb_lock);
 
 	/*
 	 * Fence off gigantic pages as there is a cyclic dependency between
 	 * alloc_contig_range and them. Return -ENOMEM as this has the effect
 	 * of bailing out right away without further retrying.
  */
-	if (hstate_is_gigantic(h))
+	if (folio_order(folio) > MAX_PAGE_ORDER)
 		return -ENOMEM;
 
 	if (folio_ref_count(folio) && folio_isolate_hugetlb(folio, list))
 		ret = 0;
 	else if (!folio_ref_count(folio))
-		ret = alloc_and_dissolve_hugetlb_folio(h, folio, list);
+		ret = alloc_and_dissolve_hugetlb_folio(folio, list);
 
 	return ret;
 }
@@ -2916,7 +2911,6 @@ int isolate_or_dissolve_huge_folio(struct folio *folio, struct list_head *list)
  */
 int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
 {
-	struct hstate *h;
 	struct folio *folio;
 	int ret = 0;
 
@@ -2925,23 +2919,9 @@ int replace_free_hugepage_folios(unsigned long start_pfn, unsigned long end_pfn)
 	while (start_pfn < end_pfn) {
 		folio = pfn_folio(start_pfn);
 
-		/*
-		 * The folio might have been dissolved from under our feet, so make sure
-		 * to carefully check the state under the lock.
-		 */
-		spin_lock_irq(&hugetlb_lock);
-		if (folio_test_hugetlb(folio)) {
-			h = folio_hstate(folio);
-		} else {
-			spin_unlock_irq(&hugetlb_lock);
-			start_pfn++;
-			continue;
-		}
-		spin_unlock_irq(&hugetlb_lock);
-
-		if (!folio_ref_count(folio)) {
-			ret = alloc_and_dissolve_hugetlb_folio(h, folio,
-							&isolate_list);
+		/* Not to disrupt normal path by vainly holding hugetlb_lock */
+		if (folio_test_hugetlb(folio) && !folio_ref_count(folio)) {
+			ret = alloc_and_dissolve_hugetlb_folio(folio, &isolate_list);
 			if (ret)
 				break;
 
diff --git a/mm/kmemleak.c b/mm/kmemleak.c
index da9cee34ee1b..8d588e685311 100644
--- a/mm/kmemleak.c
+++ b/mm/kmemleak.c
@@ -1247,6 +1247,20 @@ void __ref kmemleak_transient_leak(const void *ptr)
 EXPORT_SYMBOL(kmemleak_transient_leak);
 
 /**
+ * kmemleak_ignore_percpu - similar to kmemleak_ignore but taking a percpu
+ *			    address argument
+ * @ptr:	percpu address of the object
+ */
+void __ref kmemleak_ignore_percpu(const void __percpu *ptr)
+{
+	pr_debug("%s(0x%px)\n", __func__, ptr);
+
+	if (kmemleak_enabled && ptr && !IS_ERR_PCPU(ptr))
+		make_black_object((unsigned long)ptr, OBJECT_PERCPU);
+}
+EXPORT_SYMBOL_GPL(kmemleak_ignore_percpu);
+
+/**
  * kmemleak_ignore - ignore an allocated object
  * @ptr:	pointer to beginning of the object
  *
diff --git a/mm/madvise.c b/mm/madvise.c
index 5f7a66a1617e..1d44a35ae85c 100644
--- a/mm/madvise.c
+++ b/mm/madvise.c
@@ -508,6 +508,7 @@ restart:
 			pte_offset_map_lock(mm, pmd, addr, &ptl);
 		if (!start_pte)
 			break;
+		flush_tlb_batched_pending(mm);
 		arch_enter_lazy_mmu_mode();
 		if (!err)
 			nr = 0;
@@ -741,6 +742,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
 		start_pte = pte;
 		if (!start_pte)
 			break;
+		flush_tlb_batched_pending(mm);
 		arch_enter_lazy_mmu_mode();
 		if (!err)
 			nr = 0;
diff --git a/mm/memory.c b/mm/memory.c
index 8eba595056fe..b0cda5aab398 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -4315,26 +4315,6 @@ static struct folio *__alloc_swap_folio(struct vm_fault *vmf)
 }
 
 #ifdef CONFIG_TRANSPARENT_HUGEPAGE
-static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
-{
-	struct swap_info_struct *si = swp_swap_info(entry);
-	pgoff_t offset = swp_offset(entry);
-	int i;
-
-	/*
-	 * While allocating a large folio and doing swap_read_folio, which is
-	 * the case the being faulted pte doesn't have swapcache. We need to
-	 * ensure all PTEs have no cache as well, otherwise, we might go to
-	 * swap devices while the content is in swapcache.
-	 */
-	for (i = 0; i < max_nr; i++) {
-		if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
-			return i;
-	}
-
-	return i;
-}
-
 /*
  * Check if the PTEs within a range are contiguous swap entries
  * and have consistent swapcache, zeromap.
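
As a quick illustration of the kmemleak hunk above (not part of this series; the variable and init function below are made up), a caller that keeps a per-CPU allocation alive for the lifetime of the system can now suppress the leak report for it:

#include <linux/percpu.h>
#include <linux/kmemleak.h>

static u64 __percpu *scratch;	/* hypothetical long-lived per-CPU buffer */

static int __init scratch_init(void)
{
	scratch = alloc_percpu(u64);
	if (!scratch)
		return -ENOMEM;

	/* Never freed by design; tell kmemleak to ignore this object. */
	kmemleak_ignore_percpu(scratch);
	return 0;
}
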
diff --git a/mm/shmem.c b/mm/shmem.c
index 0c5fb4ffa03a..3a5a65b1f41a 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -2259,6 +2259,7 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 	folio = swap_cache_get_folio(swap, NULL, 0);
 	order = xa_get_order(&mapping->i_pages, index);
 	if (!folio) {
+		int nr_pages = 1 << order;
 		bool fallback_order0 = false;
 
 		/* Or update major stats only when swapin succeeds?? */
@@ -2272,9 +2273,12 @@ static int shmem_swapin_folio(struct inode *inode, pgoff_t index,
 		 * If uffd is active for the vma, we need per-page fault
 		 * fidelity to maintain the uffd semantics, then fallback
 		 * to swapin order-0 folio, as well as for zswap case.
+		 * Any existing sub folio in the swap cache also blocks
+		 * mTHP swapin.
 		 */
 		if (order > 0 && ((vma && unlikely(userfaultfd_armed(vma))) ||
-				  !zswap_never_enabled()))
+				  !zswap_never_enabled() ||
+				  non_swapcache_batch(swap, nr_pages) != nr_pages))
 			fallback_order0 = true;
 
 		/* Skip swapcache for synchronous device. */
diff --git a/mm/swap.h b/mm/swap.h
index 2269eb9df0af..9096082a915e 100644
--- a/mm/swap.h
+++ b/mm/swap.h
@@ -106,6 +106,25 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
 	return find_next_bit(sis->zeromap, end, start) - start;
 }
 
+static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
+{
+	struct swap_info_struct *si = swp_swap_info(entry);
+	pgoff_t offset = swp_offset(entry);
+	int i;
+
+	/*
+	 * While allocating a large folio and doing mTHP swapin, we need to
+	 * ensure all entries are not cached, otherwise, the mTHP folio will
+	 * be in conflict with the folio in swap cache.
+	 */
+	for (i = 0; i < max_nr; i++) {
+		if ((si->swap_map[offset + i] & SWAP_HAS_CACHE))
+			return i;
+	}
+
+	return i;
+}
+
 #else /* CONFIG_SWAP */
 struct swap_iocb;
 static inline void swap_read_folio(struct folio *folio, struct swap_iocb **plug)
@@ -199,6 +218,10 @@ static inline int swap_zeromap_batch(swp_entry_t entry, int max_nr,
 	return 0;
 }
 
+static inline int non_swapcache_batch(swp_entry_t entry, int max_nr)
+{
+	return 0;
+}
 #endif /* CONFIG_SWAP */
 
 /**
diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c
index bc473ad21202..8253978ee0fb 100644
--- a/mm/userfaultfd.c
+++ b/mm/userfaultfd.c
@@ -1084,8 +1084,18 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
 			 pte_t orig_dst_pte, pte_t orig_src_pte,
 			 pmd_t *dst_pmd, pmd_t dst_pmdval,
 			 spinlock_t *dst_ptl, spinlock_t *src_ptl,
-			 struct folio *src_folio)
+			 struct folio *src_folio,
+			 struct swap_info_struct *si, swp_entry_t entry)
 {
+	/*
+	 * Check if the folio still belongs to the target swap entry after
+	 * acquiring the lock. Folio can be freed in the swap cache while
+	 * not locked.
+	 */
+	if (src_folio && unlikely(!folio_test_swapcache(src_folio) ||
+				  entry.val != src_folio->swap.val))
+		return -EAGAIN;
+
 	double_pt_lock(dst_ptl, src_ptl);
 
 	if (!is_pte_pages_stable(dst_pte, src_pte, orig_dst_pte, orig_src_pte,
@@ -1102,6 +1112,25 @@ static int move_swap_pte(struct mm_struct *mm, struct vm_area_struct *dst_vma,
 	if (src_folio) {
 		folio_move_anon_rmap(src_folio, dst_vma);
 		src_folio->index = linear_page_index(dst_vma, dst_addr);
+	} else {
+		/*
+		 * Check if the swap entry is cached after acquiring the src_pte
+		 * lock. Otherwise, we might miss a newly loaded swap cache folio.
+		 *
+		 * Check swap_map directly to minimize overhead, READ_ONCE is sufficient.
+		 * We are trying to catch newly added swap cache, the only possible case is
+		 * when a folio is swapped in and out again staying in swap cache, using the
+		 * same entry before the PTE check above. The PTL is acquired and released
+		 * twice, each time after updating the swap_map's flag. So holding
+		 * the PTL here ensures we see the updated value. False positive is possible,
+		 * e.g. SWP_SYNCHRONOUS_IO swapin may set the flag without touching the
+		 * cache, or during the tiny synchronization window between swap cache and
+		 * swap_map, but it will be gone very quickly, worst result is retry jitters.
+		 */
+		if (READ_ONCE(si->swap_map[swp_offset(entry)]) & SWAP_HAS_CACHE) {
+			double_pt_unlock(dst_ptl, src_ptl);
+			return -EAGAIN;
+		}
 	}
 
 	orig_src_pte = ptep_get_and_clear(mm, src_addr, src_pte);
@@ -1412,7 +1441,7 @@ retry:
 		}
 		err = move_swap_pte(mm, dst_vma, dst_addr, src_addr, dst_pte, src_pte,
 				orig_dst_pte, orig_src_pte, dst_pmd, dst_pmdval,
-				dst_ptl, src_ptl, src_folio);
+				dst_ptl, src_ptl, src_folio, si, entry);
 	}
 
 out:
diff --git a/mm/util.c b/mm/util.c
index 448117da071f..0b270c43d7d1 100644
--- a/mm/util.c
+++ b/mm/util.c
@@ -1131,3 +1131,43 @@ void flush_dcache_folio(struct folio *folio)
 }
 EXPORT_SYMBOL(flush_dcache_folio);
 #endif
+
+/**
+ * compat_vma_mmap_prepare() - Apply the file's .mmap_prepare() hook to an
+ * existing VMA
+ * @file: The file which possesss an f_op->mmap_prepare() hook
+ * @vma: The VMA to apply the .mmap_prepare() hook to.
+ *
+ * Ordinarily, .mmap_prepare() is invoked directly upon mmap(). However, certain
+ * 'wrapper' file systems invoke a nested mmap hook of an underlying file.
+ *
+ * Until all filesystems are converted to use .mmap_prepare(), we must be
+ * conservative and continue to invoke these 'wrapper' filesystems using the
+ * deprecated .mmap() hook.
+ *
+ * However we have a problem if the underlying file system possesses an
+ * .mmap_prepare() hook, as we are in a different context when we invoke the
+ * .mmap() hook, already having a VMA to deal with.
+ *
+ * compat_vma_mmap_prepare() is a compatibility function that takes VMA state,
+ * establishes a struct vm_area_desc descriptor, passes to the underlying
+ * .mmap_prepare() hook and applies any changes performed by it.
+ *
+ * Once the conversion of filesystems is complete this function will no longer
+ * be required and will be removed.
+ *
+ * Returns: 0 on success or error.
+ */
+int compat_vma_mmap_prepare(struct file *file, struct vm_area_struct *vma)
+{
+	struct vm_area_desc desc;
+	int err;
+
+	err = file->f_op->mmap_prepare(vma_to_desc(vma, &desc));
+	if (err)
+		return err;
+	set_vma_from_desc(vma, &desc);
+
+	return 0;
+}
+EXPORT_SYMBOL(compat_vma_mmap_prepare);
diff --git a/mm/vma.c b/mm/vma.c
--- a/mm/vma.c
+++ b/mm/vma.c
@@ -967,26 +967,9 @@ static __must_check struct vm_area_struct *vma_merge_existing_range(
 		err = dup_anon_vma(next, middle, &anon_dup);
 	}
 
-	if (err)
+	if (err || commit_merge(vmg))
 		goto abort;
 
-	err = commit_merge(vmg);
-	if (err) {
-		VM_WARN_ON(err != -ENOMEM);
-
-		if (anon_dup)
-			unlink_anon_vmas(anon_dup);
-
-		/*
-		 * We've cleaned up any cloned anon_vma's, no VMAs have been
-		 * modified, no harm no foul if the user requests that we not
-		 * report this and just give up, leaving the VMAs unmerged.
-		 */
-		if (!vmg->give_up_on_oom)
-			vmg->state = VMA_MERGE_ERROR_NOMEM;
-		return NULL;
-	}
-
 	khugepaged_enter_vma(vmg->target, vmg->flags);
 	vmg->state = VMA_MERGE_SUCCESS;
 	return vmg->target;
@@ -995,6 +978,9 @@ abort:
 	vma_iter_set(vmg->vmi, start);
 	vma_iter_load(vmg->vmi);
 
+	if (anon_dup)
+		unlink_anon_vmas(anon_dup);
+
 	/*
 	 * This means we have failed to clone anon_vma's correctly, but no
 	 * actual changes to VMAs have occurred, so no harm no foul - if the
@@ -3127,7 +3113,6 @@ int __vm_munmap(unsigned long start, size_t len, bool unlock)
 	return ret;
 }
 
-
 /* Insert vm structure into process list sorted by address
  * and into the inode's i_mmap tree.  If vm_file is non-NULL
  * then i_mmap_rwsem is taken here.
diff --git a/mm/vma.h b/mm/vma.h
--- a/mm/vma.h
+++ b/mm/vma.h
@@ -222,6 +222,53 @@ static inline int vma_iter_store_gfp(struct vma_iterator *vmi,
 
 	return 0;
 }
+
+/*
+ * Temporary helper functions for file systems which wrap an invocation of
+ * f_op->mmap() but which might have an underlying file system which implements
+ * f_op->mmap_prepare().
+ */
+
+static inline struct vm_area_desc *vma_to_desc(struct vm_area_struct *vma,
+		struct vm_area_desc *desc)
+{
+	desc->mm = vma->vm_mm;
+	desc->start = vma->vm_start;
+	desc->end = vma->vm_end;
+
+	desc->pgoff = vma->vm_pgoff;
+	desc->file = vma->vm_file;
+	desc->vm_flags = vma->vm_flags;
+	desc->page_prot = vma->vm_page_prot;
+
+	desc->vm_ops = NULL;
+	desc->private_data = NULL;
+
+	return desc;
+}
+
+static inline void set_vma_from_desc(struct vm_area_struct *vma,
+		struct vm_area_desc *desc)
+{
+	/*
+	 * Since we're invoking .mmap_prepare() despite having a partially
+	 * established VMA, we must take care to handle setting fields
+	 * correctly.
+	 */
+
+	/* Mutable fields. Populated with initial state. */
+	vma->vm_pgoff = desc->pgoff;
+	if (vma->vm_file != desc->file)
+		vma_set_file(vma, desc->file);
+	if (vma->vm_flags != desc->vm_flags)
+		vm_flags_set(vma, desc->vm_flags);
+	vma->vm_page_prot = desc->page_prot;
+
+	/* User-defined fields. */
+	vma->vm_ops = desc->vm_ops;
+	vma->vm_private_data = desc->private_data;
+}
+
 int do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
 		struct mm_struct *mm, unsigned long start,
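
For context on the compat_vma_mmap_prepare()/vma_to_desc()/set_vma_from_desc() additions above, here is a minimal sketch of how a 'wrapper' filesystem's legacy .mmap hook might use them. The 'wrapfs' name and its private_data layout are hypothetical; call_mmap(), vma_set_file() and the new helper are the real interfaces involved.

static int wrapfs_mmap(struct file *file, struct vm_area_struct *vma)
{
	struct file *realfile = file->private_data;	/* assumed layout */

	/* Point the VMA at the underlying file before delegating. */
	vma_set_file(vma, realfile);

	/* The underlying fs may already be converted to .mmap_prepare(). */
	if (realfile->f_op->mmap_prepare)
		return compat_vma_mmap_prepare(realfile, vma);

	return call_mmap(realfile, vma);
}
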