summaryrefslogtreecommitdiff
path: root/mm/swapfile.c
diff options
context:
space:
mode:
Diffstat (limited to 'mm/swapfile.c')
-rw-r--r--mm/swapfile.c218
1 files changed, 113 insertions, 105 deletions
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 2eff8b51a945..68ce283e84be 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -52,9 +52,9 @@
static bool swap_count_continued(struct swap_info_struct *, pgoff_t,
unsigned char);
static void free_swap_count_continuations(struct swap_info_struct *);
-static void swap_entry_range_free(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- swp_entry_t entry, unsigned int nr_pages);
+static void swap_entries_free(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry, unsigned int nr_pages);
static void swap_range_alloc(struct swap_info_struct *si,
unsigned int nr_entries);
static bool folio_swapcache_freeable(struct folio *folio);
@@ -192,7 +192,7 @@ static bool swap_is_last_map(struct swap_info_struct *si,
unsigned char *map_end = map + nr_pages;
unsigned char count = *map;
- if (swap_count(count) != 1)
+ if (swap_count(count) != 1 && swap_count(count) != SWAP_MAP_SHMEM)
return false;
while (++map < map_end) {
@@ -1272,13 +1272,22 @@ int folio_alloc_swap(struct folio *folio, gfp_t gfp)
VM_BUG_ON_FOLIO(!folio_test_locked(folio), folio);
VM_BUG_ON_FOLIO(!folio_test_uptodate(folio), folio);
- /*
- * Should not even be attempting large allocations when huge
- * page swap is disabled. Warn and fail the allocation.
- */
- if (order && (!IS_ENABLED(CONFIG_THP_SWAP) || size > SWAPFILE_CLUSTER)) {
- VM_WARN_ON_ONCE(1);
- return -EINVAL;
+ if (order) {
+ /*
+ * Reject large allocation when THP_SWAP is disabled,
+ * the caller should split the folio and try again.
+ */
+ if (!IS_ENABLED(CONFIG_THP_SWAP))
+ return -EAGAIN;
+
+ /*
+ * Allocation size should never exceed cluster size
+ * (HPAGE_PMD_SIZE).
+ */
+ if (size > SWAPFILE_CLUSTER) {
+ VM_WARN_ON_ONCE(1);
+ return -EINVAL;
+ }
}
local_lock(&percpu_swap_cluster.lock);
@@ -1346,10 +1355,12 @@ out:
return NULL;
}
-static unsigned char __swap_entry_free_locked(struct swap_info_struct *si,
- unsigned long offset,
- unsigned char usage)
+static unsigned char swap_entry_put_locked(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry,
+ unsigned char usage)
{
+ unsigned long offset = swp_offset(entry);
unsigned char count;
unsigned char has_cache;
@@ -1381,7 +1392,7 @@ static unsigned char __swap_entry_free_locked(struct swap_info_struct *si,
if (usage)
WRITE_ONCE(si->swap_map[offset], usage);
else
- WRITE_ONCE(si->swap_map[offset], SWAP_HAS_CACHE);
+ swap_entries_free(si, ci, entry, 1);
return usage;
}
@@ -1452,71 +1463,104 @@ put_out:
return NULL;
}
-static unsigned char __swap_entry_free(struct swap_info_struct *si,
- swp_entry_t entry)
+static void swap_entries_put_cache(struct swap_info_struct *si,
+ swp_entry_t entry, int nr)
{
- struct swap_cluster_info *ci;
unsigned long offset = swp_offset(entry);
- unsigned char usage;
+ struct swap_cluster_info *ci;
ci = lock_cluster(si, offset);
- usage = __swap_entry_free_locked(si, offset, 1);
- if (!usage)
- swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1);
+ if (swap_only_has_cache(si, offset, nr))
+ swap_entries_free(si, ci, entry, nr);
+ else {
+ for (int i = 0; i < nr; i++, entry.val++)
+ swap_entry_put_locked(si, ci, entry, SWAP_HAS_CACHE);
+ }
unlock_cluster(ci);
-
- return usage;
}
-static bool __swap_entries_free(struct swap_info_struct *si,
- swp_entry_t entry, int nr)
+static bool swap_entries_put_map(struct swap_info_struct *si,
+ swp_entry_t entry, int nr)
{
unsigned long offset = swp_offset(entry);
- unsigned int type = swp_type(entry);
struct swap_cluster_info *ci;
bool has_cache = false;
unsigned char count;
int i;
- if (nr <= 1 || swap_count(data_race(si->swap_map[offset])) != 1)
+ if (nr <= 1)
goto fallback;
- /* cross into another cluster */
- if (nr > SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER)
+ count = swap_count(data_race(si->swap_map[offset]));
+ if (count != 1 && count != SWAP_MAP_SHMEM)
goto fallback;
ci = lock_cluster(si, offset);
if (!swap_is_last_map(si, offset, nr, &has_cache)) {
- unlock_cluster(ci);
- goto fallback;
+ goto locked_fallback;
}
- for (i = 0; i < nr; i++)
- WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
if (!has_cache)
- swap_entry_range_free(si, ci, entry, nr);
+ swap_entries_free(si, ci, entry, nr);
+ else
+ for (i = 0; i < nr; i++)
+ WRITE_ONCE(si->swap_map[offset + i], SWAP_HAS_CACHE);
unlock_cluster(ci);
return has_cache;
fallback:
- for (i = 0; i < nr; i++) {
- if (data_race(si->swap_map[offset + i])) {
- count = __swap_entry_free(si, swp_entry(type, offset + i));
- if (count == SWAP_HAS_CACHE)
- has_cache = true;
- } else {
- WARN_ON_ONCE(1);
- }
+ ci = lock_cluster(si, offset);
+locked_fallback:
+ for (i = 0; i < nr; i++, entry.val++) {
+ count = swap_entry_put_locked(si, ci, entry, 1);
+ if (count == SWAP_HAS_CACHE)
+ has_cache = true;
}
+ unlock_cluster(ci);
return has_cache;
+
}
/*
- * Drop the last HAS_CACHE flag of swap entries, caller have to
- * ensure all entries belong to the same cgroup.
+ * Only functions with "_nr" suffix are able to free entries spanning
+ * cross multi clusters, so ensure the range is within a single cluster
+ * when freeing entries with functions without "_nr" suffix.
*/
-static void swap_entry_range_free(struct swap_info_struct *si,
- struct swap_cluster_info *ci,
- swp_entry_t entry, unsigned int nr_pages)
+static bool swap_entries_put_map_nr(struct swap_info_struct *si,
+ swp_entry_t entry, int nr)
+{
+ int cluster_nr, cluster_rest;
+ unsigned long offset = swp_offset(entry);
+ bool has_cache = false;
+
+ cluster_rest = SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER;
+ while (nr) {
+ cluster_nr = min(nr, cluster_rest);
+ has_cache |= swap_entries_put_map(si, entry, cluster_nr);
+ cluster_rest = SWAPFILE_CLUSTER;
+ nr -= cluster_nr;
+ entry.val += cluster_nr;
+ }
+
+ return has_cache;
+}
+
+/*
+ * Check if it's the last ref of swap entry in the freeing path.
+ * Qualified vlaue includes 1, SWAP_HAS_CACHE or SWAP_MAP_SHMEM.
+ */
+static inline bool __maybe_unused swap_is_last_ref(unsigned char count)
+{
+ return (count == SWAP_HAS_CACHE) || (count == 1) ||
+ (count == SWAP_MAP_SHMEM);
+}
+
+/*
+ * Drop the last ref of swap entries, caller have to ensure all entries
+ * belong to the same cgroup and cluster.
+ */
+static void swap_entries_free(struct swap_info_struct *si,
+ struct swap_cluster_info *ci,
+ swp_entry_t entry, unsigned int nr_pages)
{
unsigned long offset = swp_offset(entry);
unsigned char *map = si->swap_map + offset;
@@ -1529,7 +1573,7 @@ static void swap_entry_range_free(struct swap_info_struct *si,
ci->count -= nr_pages;
do {
- VM_BUG_ON(*map != SWAP_HAS_CACHE);
+ VM_BUG_ON(!swap_is_last_ref(*map));
*map = 0;
} while (++map < map_end);
@@ -1542,21 +1586,6 @@ static void swap_entry_range_free(struct swap_info_struct *si,
partial_free_cluster(si, ci);
}
-static void cluster_swap_free_nr(struct swap_info_struct *si,
- unsigned long offset, int nr_pages,
- unsigned char usage)
-{
- struct swap_cluster_info *ci;
- unsigned long end = offset + nr_pages;
-
- ci = lock_cluster(si, offset);
- do {
- if (!__swap_entry_free_locked(si, offset, usage))
- swap_entry_range_free(si, ci, swp_entry(si->type, offset), 1);
- } while (++offset < end);
- unlock_cluster(ci);
-}
-
/*
* Caller has made sure that the swap device corresponding to entry
* is still around or has not been recycled.
@@ -1573,7 +1602,7 @@ void swap_free_nr(swp_entry_t entry, int nr_pages)
while (nr_pages) {
nr = min_t(int, nr_pages, SWAPFILE_CLUSTER - offset % SWAPFILE_CLUSTER);
- cluster_swap_free_nr(sis, offset, nr, 1);
+ swap_entries_put_map(sis, swp_entry(sis->type, offset), nr);
offset += nr;
nr_pages -= nr;
}
@@ -1584,8 +1613,6 @@ void swap_free_nr(swp_entry_t entry, int nr_pages)
*/
void put_swap_folio(struct folio *folio, swp_entry_t entry)
{
- unsigned long offset = swp_offset(entry);
- struct swap_cluster_info *ci;
struct swap_info_struct *si;
int size = 1 << swap_entry_order(folio_order(folio));
@@ -1593,16 +1620,7 @@ void put_swap_folio(struct folio *folio, swp_entry_t entry)
if (!si)
return;
- ci = lock_cluster(si, offset);
- if (swap_only_has_cache(si, offset, size))
- swap_entry_range_free(si, ci, entry, size);
- else {
- for (int i = 0; i < size; i++, entry.val++) {
- if (!__swap_entry_free_locked(si, offset + i, SWAP_HAS_CACHE))
- swap_entry_range_free(si, ci, entry, 1);
- }
- }
- unlock_cluster(ci);
+ swap_entries_put_cache(si, entry, size);
}
int __swap_count(swp_entry_t entry)
@@ -1797,7 +1815,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
/*
* First free all entries in the range.
*/
- any_only_cache = __swap_entries_free(si, entry, nr);
+ any_only_cache = swap_entries_put_map_nr(si, entry, nr);
/*
* Short-circuit the below loop if none of the entries had their
@@ -1807,13 +1825,7 @@ void free_swap_and_cache_nr(swp_entry_t entry, int nr)
goto out;
/*
- * Now go back over the range trying to reclaim the swap cache. This is
- * more efficient for large folios because we will only try to reclaim
- * the swap once per folio in the common case. If we do
- * __swap_entry_free() and __try_to_reclaim_swap() in the same loop, the
- * latter will get a reference and lock the folio for every individual
- * page but will only succeed once the swap slot for every subpage is
- * zero.
+ * Now go back over the range trying to reclaim the swap cache.
*/
for (offset = start_offset; offset < end_offset; offset += nr) {
nr = 1;
@@ -2359,7 +2371,7 @@ retry:
* Limit the number of retries? No: when mmget_not_zero()
* above fails, that mm is likely to be freeing swap from
* exit_mmap(), which proceeds at its own independent pace;
- * and even shmem_writepage() could have been preempted after
+ * and even shmem_writeout() could have been preempted after
* folio_alloc_swap(), temporarily hiding that swap. It's easy
* and robust (though cpu-intensive) just to keep retrying.
*/
@@ -3323,6 +3335,15 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags)
}
/*
+ * The swap subsystem needs a major overhaul to support this.
+ * It doesn't work yet so just disable it for now.
+ */
+ if (mapping_min_folio_order(mapping) > 0) {
+ error = -EINVAL;
+ goto bad_swap_unlock_inode;
+ }
+
+ /*
* Read the swap header.
*/
if (!mapping->a_ops->read_folio) {
@@ -3636,11 +3657,13 @@ int swapcache_prepare(swp_entry_t entry, int nr)
return __swap_duplicate(entry, SWAP_HAS_CACHE, nr);
}
+/*
+ * Caller should ensure entries belong to the same folio so
+ * the entries won't span cross cluster boundary.
+ */
void swapcache_clear(struct swap_info_struct *si, swp_entry_t entry, int nr)
{
- unsigned long offset = swp_offset(entry);
-
- cluster_swap_free_nr(si, offset, nr, SWAP_HAS_CACHE);
+ swap_entries_put_cache(si, entry, nr);
}
struct swap_info_struct *swp_swap_info(swp_entry_t entry)
@@ -3649,21 +3672,6 @@ struct swap_info_struct *swp_swap_info(swp_entry_t entry)
}
/*
- * out-of-line methods to avoid include hell.
- */
-struct address_space *swapcache_mapping(struct folio *folio)
-{
- return swp_swap_info(folio->swap)->swap_file->f_mapping;
-}
-EXPORT_SYMBOL_GPL(swapcache_mapping);
-
-pgoff_t __folio_swap_cache_index(struct folio *folio)
-{
- return swap_cache_index(folio->swap);
-}
-EXPORT_SYMBOL_GPL(__folio_swap_cache_index);
-
-/*
* add_swap_count_continuation - called when a swap count is duplicated
* beyond SWAP_MAP_MAX, it allocates a new page and links that to the entry's
* page of the original vmalloc'ed swap_map, to hold the continuation count
@@ -3780,7 +3788,7 @@ outer:
* into, carry if so, or else fail until a new continuation page is allocated;
* when the original swap_map count is decremented from 0 with continuation,
* borrow from the continuation and report whether it still holds more.
- * Called while __swap_duplicate() or caller of __swap_entry_free_locked()
+ * Called while __swap_duplicate() or caller of swap_entry_put_locked()
* holds cluster lock.
*/
static bool swap_count_continued(struct swap_info_struct *si,