Diffstat (limited to 'mm/swapfile.c')
| -rw-r--r-- | mm/swapfile.c | 234 |
1 file changed, 96 insertions(+), 138 deletions(-)
diff --git a/mm/swapfile.c b/mm/swapfile.c
index a1b4b9d80e3b..46d2008e4b99 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -44,7 +44,7 @@
 #include <linux/plist.h>
 #include <asm/tlbflush.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/swap_cgroup.h>
 #include "swap_table.h"
 #include "internal.h"
 
@@ -74,7 +74,7 @@ atomic_long_t nr_swap_pages;
 EXPORT_SYMBOL_GPL(nr_swap_pages);
 /* protected with swap_lock. reading in vm_swap_full() doesn't need lock */
 long total_swap_pages;
-static int least_priority = -1;
+#define DEF_SWAP_PRIO	-1
 unsigned long swapfile_maximum_size;
 #ifdef CONFIG_MIGRATION
 bool swap_migration_ad_supported;
@@ -103,7 +103,7 @@ static PLIST_HEAD(swap_active_head);
  * is held and the locking order requires swap_lock to be taken
  * before any swap_info_struct->lock.
  */
-static struct plist_head *swap_avail_heads;
+static PLIST_HEAD(swap_avail_head);
 static DEFINE_SPINLOCK(swap_avail_lock);
 
 struct swap_info_struct *swap_info[MAX_SWAPFILES];
@@ -236,11 +236,10 @@ again:
 	ret = -nr_pages;
 
 	/*
-	 * When this function is called from scan_swap_map_slots() and it's
-	 * called by vmscan.c at reclaiming folios. So we hold a folio lock
-	 * here. We have to use trylock for avoiding deadlock. This is a special
-	 * case and you should use folio_free_swap() with explicit folio_lock()
-	 * in usual operations.
+	 * We hold a folio lock here. We have to use trylock for
+	 * avoiding deadlock. This is a special case and you should
+	 * use folio_free_swap() with explicit folio_lock() in usual
+	 * operations.
 	 */
 	if (!folio_trylock(folio))
 		goto out;
@@ -594,7 +593,7 @@ static void __free_cluster(struct swap_info_struct *si, struct swap_cluster_info
  * this returns NULL for an non-empty list.
  */
 static struct swap_cluster_info *isolate_lock_cluster(
-		struct swap_info_struct *si, struct list_head *list, int order)
+		struct swap_info_struct *si, struct list_head *list)
 {
 	struct swap_cluster_info *ci, *found = NULL;
 
@@ -751,14 +750,14 @@ static void relocate_cluster(struct swap_info_struct *si,
 }
 
 /*
- * The cluster corresponding to page_nr will be used. The cluster will not be
- * added to free cluster list and its usage counter will be increased by 1.
- * Only used for initialization.
+ * The cluster corresponding to @offset will be accounted as having one bad
+ * slot. The cluster will not be added to the free cluster list, and its
+ * usage counter will be increased by 1. Only used for initialization.
  */
-static int inc_cluster_info_page(struct swap_info_struct *si,
-	struct swap_cluster_info *cluster_info, unsigned long page_nr)
+static int swap_cluster_setup_bad_slot(struct swap_cluster_info *cluster_info,
+	unsigned long offset)
 {
-	unsigned long idx = page_nr / SWAPFILE_CLUSTER;
+	unsigned long idx = offset / SWAPFILE_CLUSTER;
 	struct swap_table *table;
 	struct swap_cluster_info *ci;
 
@@ -772,8 +771,8 @@ static int inc_cluster_info_page(struct swap_info_struct *si,
 
 	ci->count++;
 
-	VM_BUG_ON(ci->count > SWAPFILE_CLUSTER);
-	VM_BUG_ON(ci->flags);
+	WARN_ON(ci->count > SWAPFILE_CLUSTER);
+	WARN_ON(ci->flags);
 	return 0;
 }
 
@@ -957,7 +956,7 @@ static unsigned int alloc_swap_scan_list(struct swap_info_struct *si,
 	unsigned int found = SWAP_ENTRY_INVALID;
 
 	do {
-		struct swap_cluster_info *ci = isolate_lock_cluster(si, list, order);
+		struct swap_cluster_info *ci = isolate_lock_cluster(si, list);
 		unsigned long offset;
 
 		if (!ci)
@@ -982,7 +981,7 @@ static void swap_reclaim_full_clusters(struct swap_info_struct *si, bool force)
 	if (force)
 		to_scan = swap_usage_in_pages(si) / SWAPFILE_CLUSTER;
 
-	while ((ci = isolate_lock_cluster(si, &si->full_clusters, 0))) {
+	while ((ci = isolate_lock_cluster(si, &si->full_clusters))) {
 		offset = cluster_offset(si, ci);
 		end = min(si->max, offset + SWAPFILE_CLUSTER);
 		to_scan--;
@@ -1101,13 +1100,6 @@ new_cluster:
 		goto done;
 	}
 
-	/*
-	 * We don't have free cluster but have some clusters in discarding,
-	 * do discard now and reclaim them.
-	 */
-	if ((si->flags & SWP_PAGE_DISCARD) && swap_do_scheduled_discard(si))
-		goto new_cluster;
-
 	if (order)
 		goto done;
 
@@ -1137,7 +1129,6 @@ done:
 /* SWAP_USAGE_OFFLIST_BIT can only be set by this helper. */
 static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
 {
-	int nid;
 	unsigned long pages;
 
 	spin_lock(&swap_avail_lock);
@@ -1166,8 +1157,7 @@ static void del_from_avail_list(struct swap_info_struct *si, bool swapoff)
 		goto skip;
 	}
 
-	for_each_node(nid)
-		plist_del(&si->avail_lists[nid], &swap_avail_heads[nid]);
+	plist_del(&si->avail_list, &swap_avail_head);
 
 skip:
 	spin_unlock(&swap_avail_lock);
@@ -1176,7 +1166,6 @@ skip:
 /* SWAP_USAGE_OFFLIST_BIT can only be cleared by this helper. */
 static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
 {
-	int nid;
 	long val;
 	unsigned long pages;
 
@@ -1209,8 +1198,7 @@ static void add_to_avail_list(struct swap_info_struct *si, bool swapon)
 		goto skip;
 	}
 
-	for_each_node(nid)
-		plist_add(&si->avail_lists[nid], &swap_avail_heads[nid]);
+	plist_add(&si->avail_list, &swap_avail_head);
 
 skip:
 	spin_unlock(&swap_avail_lock);
@@ -1350,54 +1338,79 @@ static bool swap_alloc_fast(swp_entry_t *entry,
 }
 
 /* Rotate the device and switch to a new cluster */
-static bool swap_alloc_slow(swp_entry_t *entry,
+static void swap_alloc_slow(swp_entry_t *entry,
 			    int order)
 {
-	int node;
 	unsigned long offset;
 	struct swap_info_struct *si, *next;
 
-	node = numa_node_id();
 	spin_lock(&swap_avail_lock);
 start_over:
-	plist_for_each_entry_safe(si, next, &swap_avail_heads[node], avail_lists[node]) {
+	plist_for_each_entry_safe(si, next, &swap_avail_head, avail_list) {
 		/* Rotate the device and switch to a new cluster */
-		plist_requeue(&si->avail_lists[node], &swap_avail_heads[node]);
+		plist_requeue(&si->avail_list, &swap_avail_head);
 		spin_unlock(&swap_avail_lock);
 		if (get_swap_device_info(si)) {
 			offset = cluster_alloc_swap_entry(si, order, SWAP_HAS_CACHE);
 			put_swap_device(si);
 			if (offset) {
 				*entry = swp_entry(si->type, offset);
-				return true;
+				return;
 			}
 			if (order)
-				return false;
+				return;
 		}
 
 		spin_lock(&swap_avail_lock);
 		/*
 		 * if we got here, it's likely that si was almost full before,
-		 * and since scan_swap_map_slots() can drop the si->lock,
 		 * multiple callers probably all tried to get a page from the
 		 * same si and it filled up before we could get one; or, the si
-		 * filled up between us dropping swap_avail_lock and taking
-		 * si->lock. Since we dropped the swap_avail_lock, the
-		 * swap_avail_head list may have been modified; so if next is
-		 * still in the swap_avail_head list then try it, otherwise
-		 * start over if we have not gotten any slots.
+		 * filled up after we dropped swap_avail_lock.
+		 * Since we dropped the swap_avail_lock, the swap_avail_head
+		 * list may have been modified; so if next is still in the
+		 * swap_avail_head list then try it, otherwise start over if we
+		 * have not gotten any slots.
 		 */
-		if (plist_node_empty(&next->avail_lists[node]))
+		if (plist_node_empty(&next->avail_list))
 			goto start_over;
 	}
 	spin_unlock(&swap_avail_lock);
-	return false;
+}
+
+/*
+ * Discard pending clusters synchronously when under high pressure.
+ * Return: true if any cluster is discarded.
+ */
+static bool swap_sync_discard(void)
+{
+	bool ret = false;
+	struct swap_info_struct *si, *next;
+
+	spin_lock(&swap_lock);
+start_over:
+	plist_for_each_entry_safe(si, next, &swap_active_head, list) {
+		spin_unlock(&swap_lock);
+		if (get_swap_device_info(si)) {
+			if (si->flags & SWP_PAGE_DISCARD)
+				ret = swap_do_scheduled_discard(si);
+			put_swap_device(si);
+		}
+		if (ret)
+			return true;
+
+		spin_lock(&swap_lock);
+		if (plist_node_empty(&next->list))
+			goto start_over;
+	}
+	spin_unlock(&swap_lock);
+	return false;
 }
 
 /**
  * folio_alloc_swap - allocate swap space for a folio
  * @folio: folio we want to move to swap
- * @gfp: gfp mask for shadow nodes
  *
  * Allocate swap space for the folio and add the folio to the
  * swap cache.
@@ -1405,7 +1418,7 @@ start_over:
  * Context: Caller needs to hold the folio lock.
  * Return: Whether the folio was added to the swap cache.
  */
-int folio_alloc_swap(struct folio *folio, gfp_t gfp)
+int folio_alloc_swap(struct folio *folio)
 {
 	unsigned int order = folio_order(folio);
 	unsigned int size = 1 << order;
@@ -1432,11 +1445,17 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp)
 		}
 	}
 
+again:
 	local_lock(&percpu_swap_cluster.lock);
 	if (!swap_alloc_fast(&entry, order))
 		swap_alloc_slow(&entry, order);
 	local_unlock(&percpu_swap_cluster.lock);
 
+	if (unlikely(!order && !entry.val)) {
+		if (swap_sync_discard())
+			goto again;
+	}
+
 	/* Need to call this even if allocation failed, for MEMCG_SWAP_FAIL. */
 	if (mem_cgroup_try_charge_swap(folio, entry))
 		goto out_free;
@@ -1677,7 +1696,7 @@ static bool swap_entries_put_map_nr(struct swap_info_struct *si,
 
 /*
  * Check if it's the last ref of swap entry in the freeing path.
- * Qualified vlaue includes 1, SWAP_HAS_CACHE or SWAP_MAP_SHMEM.
+ * Qualified value includes 1, SWAP_HAS_CACHE or SWAP_MAP_SHMEM.
  */
 static inline bool __maybe_unused swap_is_last_ref(unsigned char count)
 {
@@ -2239,7 +2258,7 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 	struct folio *folio;
 	unsigned long offset;
 	unsigned char swp_count;
-	swp_entry_t entry;
+	softleaf_t entry;
 	int ret;
 	pte_t ptent;
 
@@ -2250,11 +2269,10 @@ static int unuse_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
 		}
 
 		ptent = ptep_get_lockless(pte);
+		entry = softleaf_from_pte(ptent);
 
-		if (!is_swap_pte(ptent))
+		if (!softleaf_is_swap(entry))
 			continue;
-
-		entry = pte_to_swp_entry(ptent);
 		if (swp_type(entry) != type)
 			continue;
 
@@ -2682,44 +2700,18 @@ static int setup_swap_extents(struct swap_info_struct *sis, sector_t *span)
 	return generic_swapfile_activate(sis, swap_file, span);
 }
 
-static int swap_node(struct swap_info_struct *si)
-{
-	struct block_device *bdev;
-
-	if (si->bdev)
-		bdev = si->bdev;
-	else
-		bdev = si->swap_file->f_inode->i_sb->s_bdev;
-
-	return bdev ? bdev->bd_disk->node_id : NUMA_NO_NODE;
-}
-
 static void setup_swap_info(struct swap_info_struct *si, int prio,
 			    unsigned char *swap_map,
 			    struct swap_cluster_info *cluster_info,
 			    unsigned long *zeromap)
 {
-	int i;
-
-	if (prio >= 0)
-		si->prio = prio;
-	else
-		si->prio = --least_priority;
+	si->prio = prio;
 	/*
 	 * the plist prio is negated because plist ordering is
 	 * low-to-high, while swap ordering is high-to-low
 	 */
 	si->list.prio = -si->prio;
-	for_each_node(i) {
-		if (si->prio >= 0)
-			si->avail_lists[i].prio = -si->prio;
-		else {
-			if (swap_node(si) == i)
-				si->avail_lists[i].prio = 1;
-			else
-				si->avail_lists[i].prio = -si->prio;
-		}
-	}
+	si->avail_list.prio = -si->prio;
 	si->swap_map = swap_map;
 	si->cluster_info = cluster_info;
 	si->zeromap = zeromap;
@@ -2731,16 +2723,7 @@ static void _enable_swap_info(struct swap_info_struct *si)
 
 	total_swap_pages += si->pages;
 	assert_spin_locked(&swap_lock);
-	/*
-	 * both lists are plists, and thus priority ordered.
-	 * swap_active_head needs to be priority ordered for swapoff(),
-	 * which on removal of any swap_info_struct with an auto-assigned
-	 * (i.e. negative) priority increments the auto-assigned priority
-	 * of any lower-priority swap_info_structs.
-	 * swap_avail_head needs to be priority ordered for folio_alloc_swap(),
-	 * which allocates swap pages from the highest available priority
-	 * swap_info_struct.
-	 */
+
 	plist_add(&si->list, &swap_active_head);
 
 	/* Add back to available list */
@@ -2890,20 +2873,6 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	}
 	spin_lock(&p->lock);
 	del_from_avail_list(p, true);
-	if (p->prio < 0) {
-		struct swap_info_struct *si = p;
-		int nid;
-
-		plist_for_each_entry_continue(si, &swap_active_head, list) {
-			si->prio++;
-			si->list.prio--;
-			for_each_node(nid) {
-				if (si->avail_lists[nid].prio != 1)
-					si->avail_lists[nid].prio--;
-			}
-		}
-		least_priority++;
-	}
 	plist_del(&p->list, &swap_active_head);
 	atomic_long_sub(p->pages, &nr_swap_pages);
 	total_swap_pages -= p->pages;
@@ -2942,7 +2911,7 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
 	if (p->flags & SWP_CONTINUED)
 		free_swap_count_continuations(p);
 
-	if (!p->bdev || !bdev_nonrot(p->bdev))
+	if (!(p->flags & SWP_SOLIDSTATE))
 		atomic_dec(&nr_rotate_swap);
 
 	mutex_lock(&swapon_mutex);
@@ -3141,9 +3110,8 @@ static struct swap_info_struct *alloc_swap_info(void)
 	struct swap_info_struct *p;
 	struct swap_info_struct *defer = NULL;
 	unsigned int type;
-	int i;
 
-	p = kvzalloc(struct_size(p, avail_lists, nr_node_ids), GFP_KERNEL);
+	p = kvzalloc(sizeof(struct swap_info_struct), GFP_KERNEL);
 	if (!p)
 		return ERR_PTR(-ENOMEM);
 
@@ -3182,8 +3150,7 @@ static struct swap_info_struct *alloc_swap_info(void)
 	}
 	p->swap_extent_root = RB_ROOT;
 	plist_node_init(&p->list, 0);
-	for_each_node(i)
-		plist_node_init(&p->avail_lists[i], 0);
+	plist_node_init(&p->avail_list, 0);
 	p->flags = SWP_USED;
 	spin_unlock(&swap_lock);
 	if (defer) {
@@ -3236,8 +3203,17 @@ static int claim_swapfile(struct swap_info_struct *si, struct inode *inode)
  */
 unsigned long generic_max_swapfile_size(void)
 {
-	return swp_offset(pte_to_swp_entry(
-			swp_entry_to_pte(swp_entry(0, ~0UL)))) + 1;
+	swp_entry_t entry = swp_entry(0, ~0UL);
+	const pte_t pte = softleaf_to_pte(entry);
+
+	/*
+	 * Since the PTE can be an invalid softleaf entry (e.g. the none PTE),
+	 * we need to do this manually.
+	 */
+	entry = __pte_to_swp_entry(pte);
+	entry = swp_entry(__swp_type(entry), __swp_offset(entry));
+
+	return swp_offset(entry) + 1;
 }
 
 /* Can be overridden by an architecture for additional checks. */
@@ -3355,7 +3331,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
 		si->global_cluster = kmalloc(sizeof(*si->global_cluster),
 				     GFP_KERNEL);
 		if (!si->global_cluster)
-			goto err_free;
+			goto err;
 		for (i = 0; i < SWAP_NR_ORDERS; i++)
 			si->global_cluster->next[i] = SWAP_ENTRY_INVALID;
 		spin_lock_init(&si->global_cluster_lock);
@@ -3368,7 +3344,7 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
 	 * See setup_swap_map(): header page, bad pages,
 	 * and the EOF part of the last cluster.
 	 */
-	err = inc_cluster_info_page(si, cluster_info, 0);
+	err = swap_cluster_setup_bad_slot(cluster_info, 0);
 	if (err)
 		goto err;
 	for (i = 0; i < swap_header->info.nr_badpages; i++) {
@@ -3376,12 +3352,12 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
 		if (page_nr >= maxpages)
 			continue;
-		err = inc_cluster_info_page(si, cluster_info, page_nr);
+		err = swap_cluster_setup_bad_slot(cluster_info, page_nr);
 		if (err)
 			goto err;
 	}
 	for (i = maxpages; i < round_up(maxpages, SWAPFILE_CLUSTER); i++) {
-		err = inc_cluster_info_page(si, cluster_info, i);
+		err = swap_cluster_setup_bad_slot(cluster_info, i);
 		if (err)
 			goto err;
 	}
 
@@ -3408,9 +3384,8 @@ static struct swap_cluster_info *setup_clusters(struct swap_info_struct *si,
 	}
 	return cluster_info;
 
-err_free:
-	free_cluster_info(cluster_info, maxpages);
 err:
+	free_cluster_info(cluster_info, maxpages);
 	return ERR_PTR(err);
 }
 
@@ -3440,9 +3415,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	if (!capable(CAP_SYS_ADMIN))
 		return -EPERM;
 
-	if (!swap_avail_heads)
-		return -ENOMEM;
-
 	si = alloc_swap_info();
 	if (IS_ERR(si))
 		return PTR_ERR(si);
@@ -3619,7 +3591,7 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
 	}
 
 	mutex_lock(&swapon_mutex);
-	prio = -1;
+	prio = DEF_SWAP_PRIO;
 	if (swap_flags & SWAP_FLAG_PREFER)
 		prio = swap_flags & SWAP_FLAG_PRIO_MASK;
 	enable_swap_info(si, prio, swap_map, cluster_info, zeromap);
@@ -4051,8 +4023,7 @@ static bool __has_usable_swap(void)
 
 void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
 {
-	struct swap_info_struct *si, *next;
-	int nid = folio_nid(folio);
+	struct swap_info_struct *si;
 
 	if (!(gfp & __GFP_IO))
 		return;
@@ -4071,8 +4042,7 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
 		return;
 
 	spin_lock(&swap_avail_lock);
-	plist_for_each_entry_safe(si, next, &swap_avail_heads[nid],
-				  avail_lists[nid]) {
+	plist_for_each_entry(si, &swap_avail_head, avail_list) {
 		if (si->bdev) {
 			blkcg_schedule_throttle(si->bdev->bd_disk, true);
 			break;
@@ -4084,18 +4054,6 @@ void __folio_throttle_swaprate(struct folio *folio, gfp_t gfp)
 
 static int __init swapfile_init(void)
 {
-	int nid;
-
-	swap_avail_heads = kmalloc_array(nr_node_ids, sizeof(struct plist_head),
-					 GFP_KERNEL);
-	if (!swap_avail_heads) {
-		pr_emerg("Not enough memory for swap heads, swap is disabled\n");
-		return -ENOMEM;
-	}
-
-	for_each_node(nid)
-		plist_head_init(&swap_avail_heads[nid]);
-
 	swapfile_maximum_size = arch_max_swapfile_size();
 
 	/*
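
The one subtlety worth illustrating in the new single-list scheme is the negated priority kept in setup_swap_info() above: a kernel plist sorts its entries low-to-high, while swap devices must be tried from the highest priority down, so every node is keyed with the negation (si->list.prio = -si->prio; si->avail_list.prio = -si->prio). Below is a minimal userspace sketch of that ordering rule only, not kernel code: the struct, device names, and priorities are invented for illustration, and qsort() stands in for plist insertion.

/*
 * Userspace sketch of the negated-priority ordering used for
 * swap_avail_head. Not kernel code; all names here are made up.
 */
#include <stdio.h>
#include <stdlib.h>

struct fake_swap_dev {
	const char *name;
	int prio;	/* swap priority: higher is preferred */
};

static int by_plist_key(const void *a, const void *b)
{
	/* plist ordering is ascending; the sort key is the negated priority */
	int ka = -((const struct fake_swap_dev *)a)->prio;
	int kb = -((const struct fake_swap_dev *)b)->prio;

	return (ka > kb) - (ka < kb);
}

int main(void)
{
	/* Invented devices; DEF_SWAP_PRIO (-1) models a plain swapon */
	struct fake_swap_dev devs[] = {
		{ "zram0 (prio 100)",    100 },
		{ "nvme-swap (prio 10)",  10 },
		{ "swapfile (prio -1)",   -1 },
	};
	size_t i, n = sizeof(devs) / sizeof(devs[0]);

	qsort(devs, n, sizeof(devs[0]), by_plist_key);

	/* The head of the "plist" is the highest-priority device: zram0 first */
	for (i = 0; i < n; i++)
		printf("%zu: %s\n", i, devs[i].name);
	return 0;
}

With the per-node avail_lists[] and the swap_node() local-node special case removed, this single negation is the only ordering rule the allocator maintains, and a swapon without SWAP_FLAG_PREFER now simply gets DEF_SWAP_PRIO (-1) instead of a decrementing least_priority.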
