summaryrefslogtreecommitdiff
path: root/mm
diff options
context:
space:
mode:
Diffstat (limited to 'mm')
-rw-r--r--mm/Kconfig7
-rw-r--r--mm/backing-dev.c2
-rw-r--r--mm/damon/stat.c9
-rw-r--r--mm/damon/sysfs.c10
-rw-r--r--mm/fadvise.c3
-rw-r--r--mm/filemap.c204
-rw-r--r--mm/huge_memory.c96
-rw-r--r--mm/internal.h2
-rw-r--r--mm/kmsan/core.c3
-rw-r--r--mm/kmsan/hooks.c6
-rw-r--r--mm/kmsan/shadow.c2
-rw-r--r--mm/ksm.c113
-rw-r--r--mm/memblock.c3
-rw-r--r--mm/memfd.c56
-rw-r--r--mm/memory.c82
-rw-r--r--mm/memory_hotplug.c17
-rw-r--r--mm/mempool.c32
-rw-r--r--mm/mm_init.c2
-rw-r--r--mm/mmap_lock.c1
-rw-r--r--mm/mremap.c2
-rw-r--r--mm/page-writeback.c6
-rw-r--r--mm/page_alloc.c9
-rw-r--r--mm/secretmem.c22
-rw-r--r--mm/shmem.c32
-rw-r--r--mm/slub.c14
-rw-r--r--mm/sparse.c3
-rw-r--r--mm/swap_state.c13
-rw-r--r--mm/swapfile.c4
-rw-r--r--mm/truncate.c47
-rw-r--r--mm/vmscan.c2
-rw-r--r--mm/workingset.c2
31 files changed, 533 insertions, 273 deletions
diff --git a/mm/Kconfig b/mm/Kconfig
index 0e26f4fc8717..ca3f146bc705 100644
--- a/mm/Kconfig
+++ b/mm/Kconfig
@@ -908,6 +908,13 @@ config PAGE_MAPCOUNT
config PGTABLE_HAS_HUGE_LEAVES
def_bool TRANSPARENT_HUGEPAGE || HUGETLB_PAGE
+#
+# We can end up creating gigantic folio.
+#
+config HAVE_GIGANTIC_FOLIOS
+ def_bool (HUGETLB_PAGE && ARCH_HAS_GIGANTIC_PAGE) || \
+ (ZONE_DEVICE && HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD)
+
# TODO: Allow to be enabled without THP
config ARCH_SUPPORTS_HUGE_PFNMAP
def_bool n
diff --git a/mm/backing-dev.c b/mm/backing-dev.c
index 41b6c9386b69..c5740c6d37a2 100644
--- a/mm/backing-dev.c
+++ b/mm/backing-dev.c
@@ -72,7 +72,7 @@ static void collect_wb_stats(struct wb_stats *stats,
list_for_each_entry(inode, &wb->b_more_io, i_io_list)
stats->nr_more_io++;
list_for_each_entry(inode, &wb->b_dirty_time, i_io_list)
- if (inode->i_state & I_DIRTY_TIME)
+ if (inode_state_read_once(inode) & I_DIRTY_TIME)
stats->nr_dirty_time++;
spin_unlock(&wb->list_lock);
diff --git a/mm/damon/stat.c b/mm/damon/stat.c
index d8010968bbed..bf8626859902 100644
--- a/mm/damon/stat.c
+++ b/mm/damon/stat.c
@@ -46,6 +46,8 @@ MODULE_PARM_DESC(aggr_interval_us,
static struct damon_ctx *damon_stat_context;
+static unsigned long damon_stat_last_refresh_jiffies;
+
static void damon_stat_set_estimated_memory_bandwidth(struct damon_ctx *c)
{
struct damon_target *t;
@@ -130,13 +132,12 @@ static void damon_stat_set_idletime_percentiles(struct damon_ctx *c)
static int damon_stat_damon_call_fn(void *data)
{
struct damon_ctx *c = data;
- static unsigned long last_refresh_jiffies;
/* avoid unnecessarily frequent stat update */
- if (time_before_eq(jiffies, last_refresh_jiffies +
+ if (time_before_eq(jiffies, damon_stat_last_refresh_jiffies +
msecs_to_jiffies(5 * MSEC_PER_SEC)))
return 0;
- last_refresh_jiffies = jiffies;
+ damon_stat_last_refresh_jiffies = jiffies;
aggr_interval_us = c->attrs.aggr_interval;
damon_stat_set_estimated_memory_bandwidth(c);
@@ -210,6 +211,8 @@ static int damon_stat_start(void)
err = damon_start(&damon_stat_context, 1, true);
if (err)
return err;
+
+ damon_stat_last_refresh_jiffies = jiffies;
call_control.data = damon_stat_context;
return damon_call(damon_stat_context, &call_control);
}
diff --git a/mm/damon/sysfs.c b/mm/damon/sysfs.c
index cd6815ecc04e..3c0d727788c8 100644
--- a/mm/damon/sysfs.c
+++ b/mm/damon/sysfs.c
@@ -1552,16 +1552,17 @@ static struct damon_ctx *damon_sysfs_build_ctx(
return ctx;
}
+static unsigned long damon_sysfs_next_update_jiffies;
+
static int damon_sysfs_repeat_call_fn(void *data)
{
struct damon_sysfs_kdamond *sysfs_kdamond = data;
- static unsigned long next_update_jiffies;
if (!sysfs_kdamond->refresh_ms)
return 0;
- if (time_before(jiffies, next_update_jiffies))
+ if (time_before(jiffies, damon_sysfs_next_update_jiffies))
return 0;
- next_update_jiffies = jiffies +
+ damon_sysfs_next_update_jiffies = jiffies +
msecs_to_jiffies(sysfs_kdamond->refresh_ms);
if (!mutex_trylock(&damon_sysfs_lock))
@@ -1607,6 +1608,9 @@ static int damon_sysfs_turn_damon_on(struct damon_sysfs_kdamond *kdamond)
}
kdamond->damon_ctx = ctx;
+ damon_sysfs_next_update_jiffies =
+ jiffies + msecs_to_jiffies(kdamond->refresh_ms);
+
repeat_call_control->fn = damon_sysfs_repeat_call_fn;
repeat_call_control->data = kdamond;
repeat_call_control->repeat = true;
diff --git a/mm/fadvise.c b/mm/fadvise.c
index 588fe76c5a14..67028e30aa91 100644
--- a/mm/fadvise.c
+++ b/mm/fadvise.c
@@ -111,8 +111,7 @@ int generic_fadvise(struct file *file, loff_t offset, loff_t len, int advice)
spin_unlock(&file->f_lock);
break;
case POSIX_FADV_DONTNEED:
- __filemap_fdatawrite_range(mapping, offset, endbyte,
- WB_SYNC_NONE);
+ filemap_flush_range(mapping, offset, endbyte);
/*
* First and last FULL page! Partial pages are deliberately
diff --git a/mm/filemap.c b/mm/filemap.c
index 13f0259d993c..dfc8a31f1222 100644
--- a/mm/filemap.c
+++ b/mm/filemap.c
@@ -256,7 +256,7 @@ void filemap_remove_folio(struct folio *folio)
__filemap_remove_folio(folio, NULL);
xa_unlock_irq(&mapping->i_pages);
if (mapping_shrinkable(mapping))
- inode_add_lru(mapping->host);
+ inode_lru_list_add(mapping->host);
spin_unlock(&mapping->host->i_lock);
filemap_free_folio(mapping, folio);
@@ -335,7 +335,7 @@ void delete_from_page_cache_batch(struct address_space *mapping,
page_cache_delete_batch(mapping, fbatch);
xa_unlock_irq(&mapping->i_pages);
if (mapping_shrinkable(mapping))
- inode_add_lru(mapping->host);
+ inode_lru_list_add(mapping->host);
spin_unlock(&mapping->host->i_lock);
for (i = 0; i < folio_batch_count(fbatch); i++)
@@ -366,83 +366,60 @@ static int filemap_check_and_keep_errors(struct address_space *mapping)
return 0;
}
-/**
- * filemap_fdatawrite_wbc - start writeback on mapping dirty pages in range
- * @mapping: address space structure to write
- * @wbc: the writeback_control controlling the writeout
- *
- * Call writepages on the mapping using the provided wbc to control the
- * writeout.
- *
- * Return: %0 on success, negative error code otherwise.
- */
-int filemap_fdatawrite_wbc(struct address_space *mapping,
- struct writeback_control *wbc)
+static int filemap_writeback(struct address_space *mapping, loff_t start,
+ loff_t end, enum writeback_sync_modes sync_mode,
+ long *nr_to_write)
{
+ struct writeback_control wbc = {
+ .sync_mode = sync_mode,
+ .nr_to_write = nr_to_write ? *nr_to_write : LONG_MAX,
+ .range_start = start,
+ .range_end = end,
+ };
int ret;
if (!mapping_can_writeback(mapping) ||
!mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
return 0;
- wbc_attach_fdatawrite_inode(wbc, mapping->host);
- ret = do_writepages(mapping, wbc);
- wbc_detach_inode(wbc);
+ wbc_attach_fdatawrite_inode(&wbc, mapping->host);
+ ret = do_writepages(mapping, &wbc);
+ wbc_detach_inode(&wbc);
+
+ if (!ret && nr_to_write)
+ *nr_to_write = wbc.nr_to_write;
return ret;
}
-EXPORT_SYMBOL(filemap_fdatawrite_wbc);
/**
- * __filemap_fdatawrite_range - start writeback on mapping dirty pages in range
+ * filemap_fdatawrite_range - start writeback on mapping dirty pages in range
* @mapping: address space structure to write
* @start: offset in bytes where the range starts
* @end: offset in bytes where the range ends (inclusive)
- * @sync_mode: enable synchronous operation
*
* Start writeback against all of a mapping's dirty pages that lie
* within the byte offsets <start, end> inclusive.
*
- * If sync_mode is WB_SYNC_ALL then this is a "data integrity" operation, as
- * opposed to a regular memory cleansing writeback. The difference between
- * these two operations is that if a dirty page/buffer is encountered, it must
- * be waited upon, and not just skipped over.
+ * This is a data integrity operation that waits upon dirty or in writeback
+ * pages.
*
* Return: %0 on success, negative error code otherwise.
*/
-int __filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
- loff_t end, int sync_mode)
-{
- struct writeback_control wbc = {
- .sync_mode = sync_mode,
- .nr_to_write = LONG_MAX,
- .range_start = start,
- .range_end = end,
- };
-
- return filemap_fdatawrite_wbc(mapping, &wbc);
-}
-
-static inline int __filemap_fdatawrite(struct address_space *mapping,
- int sync_mode)
+int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
+ loff_t end)
{
- return __filemap_fdatawrite_range(mapping, 0, LLONG_MAX, sync_mode);
+ return filemap_writeback(mapping, start, end, WB_SYNC_ALL, NULL);
}
+EXPORT_SYMBOL(filemap_fdatawrite_range);
int filemap_fdatawrite(struct address_space *mapping)
{
- return __filemap_fdatawrite(mapping, WB_SYNC_ALL);
+ return filemap_fdatawrite_range(mapping, 0, LLONG_MAX);
}
EXPORT_SYMBOL(filemap_fdatawrite);
-int filemap_fdatawrite_range(struct address_space *mapping, loff_t start,
- loff_t end)
-{
- return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_ALL);
-}
-EXPORT_SYMBOL(filemap_fdatawrite_range);
-
/**
- * filemap_fdatawrite_range_kick - start writeback on a range
+ * filemap_flush_range - start writeback on a range
* @mapping: target address_space
* @start: index to start writeback on
* @end: last (inclusive) index for writeback
@@ -452,12 +429,12 @@ EXPORT_SYMBOL(filemap_fdatawrite_range);
*
* Return: %0 on success, negative error code otherwise.
*/
-int filemap_fdatawrite_range_kick(struct address_space *mapping, loff_t start,
+int filemap_flush_range(struct address_space *mapping, loff_t start,
loff_t end)
{
- return __filemap_fdatawrite_range(mapping, start, end, WB_SYNC_NONE);
+ return filemap_writeback(mapping, start, end, WB_SYNC_NONE, NULL);
}
-EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick);
+EXPORT_SYMBOL_GPL(filemap_flush_range);
/**
* filemap_flush - mostly a non-blocking flush
@@ -470,10 +447,22 @@ EXPORT_SYMBOL_GPL(filemap_fdatawrite_range_kick);
*/
int filemap_flush(struct address_space *mapping)
{
- return __filemap_fdatawrite(mapping, WB_SYNC_NONE);
+ return filemap_flush_range(mapping, 0, LLONG_MAX);
}
EXPORT_SYMBOL(filemap_flush);
+/*
+ * Start writeback on @nr_to_write pages from @mapping. No one but the existing
+ * btrfs caller should be using this. Talk to linux-mm if you think adding a
+ * new caller is a good idea.
+ */
+int filemap_flush_nr(struct address_space *mapping, long *nr_to_write)
+{
+ return filemap_writeback(mapping, 0, LLONG_MAX, WB_SYNC_NONE,
+ nr_to_write);
+}
+EXPORT_SYMBOL_FOR_MODULES(filemap_flush_nr, "btrfs");
+
/**
* filemap_range_has_page - check if a page exists in range.
* @mapping: address space within which to check
@@ -691,8 +680,7 @@ int filemap_write_and_wait_range(struct address_space *mapping,
return 0;
if (mapping_needs_writeback(mapping)) {
- err = __filemap_fdatawrite_range(mapping, lstart, lend,
- WB_SYNC_ALL);
+ err = filemap_fdatawrite_range(mapping, lstart, lend);
/*
* Even if the above returned error, the pages may be
* written partially (e.g. -ENOSPC), so we wait for it.
@@ -794,8 +782,7 @@ int file_write_and_wait_range(struct file *file, loff_t lstart, loff_t lend)
return 0;
if (mapping_needs_writeback(mapping)) {
- err = __filemap_fdatawrite_range(mapping, lstart, lend,
- WB_SYNC_ALL);
+ err = filemap_fdatawrite_range(mapping, lstart, lend);
/* See comment of filemap_write_and_wait() */
if (err != -EIO)
__filemap_fdatawait_range(mapping, lstart, lend);
@@ -2366,6 +2353,64 @@ out:
}
EXPORT_SYMBOL(filemap_get_folios_tag);
+/**
+ * filemap_get_folios_dirty - Get a batch of dirty folios
+ * @mapping: The address_space to search
+ * @start: The starting folio index
+ * @end: The final folio index (inclusive)
+ * @fbatch: The batch to fill
+ *
+ * filemap_get_folios_dirty() works exactly like filemap_get_folios(), except
+ * the returned folios are presumed to be dirty or undergoing writeback. Dirty
+ * state is presumed because we don't block on folio lock nor want to miss
+ * folios. Callers that need to can recheck state upon locking the folio.
+ *
+ * This may not return all dirty folios if the batch gets filled up.
+ *
+ * Return: The number of folios found.
+ * Also update @start to be positioned for traversal of the next folio.
+ */
+unsigned filemap_get_folios_dirty(struct address_space *mapping, pgoff_t *start,
+ pgoff_t end, struct folio_batch *fbatch)
+{
+ XA_STATE(xas, &mapping->i_pages, *start);
+ struct folio *folio;
+
+ rcu_read_lock();
+ while ((folio = find_get_entry(&xas, end, XA_PRESENT)) != NULL) {
+ if (xa_is_value(folio))
+ continue;
+ if (folio_trylock(folio)) {
+ bool clean = !folio_test_dirty(folio) &&
+ !folio_test_writeback(folio);
+ folio_unlock(folio);
+ if (clean) {
+ folio_put(folio);
+ continue;
+ }
+ }
+ if (!folio_batch_add(fbatch, folio)) {
+ unsigned long nr = folio_nr_pages(folio);
+ *start = folio->index + nr;
+ goto out;
+ }
+ }
+ /*
+ * We come here when there is no folio beyond @end. We take care to not
+ * overflow the index @start as it confuses some of the callers. This
+ * breaks the iteration when there is a folio at index -1 but that is
+ * already broke anyway.
+ */
+ if (end == (pgoff_t)-1)
+ *start = (pgoff_t)-1;
+ else
+ *start = end + 1;
+out:
+ rcu_read_unlock();
+
+ return folio_batch_count(fbatch);
+}
+
/*
* CD/DVDs are error prone. When a medium error occurs, the driver may fail
* a _large_ part of the i/o request. Imagine the worst scenario:
@@ -3681,8 +3726,10 @@ skip:
static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
struct folio *folio, unsigned long start,
unsigned long addr, unsigned int nr_pages,
- unsigned long *rss, unsigned short *mmap_miss)
+ unsigned long *rss, unsigned short *mmap_miss,
+ pgoff_t file_end)
{
+ struct address_space *mapping = folio->mapping;
unsigned int ref_from_caller = 1;
vm_fault_t ret = 0;
struct page *page = folio_page(folio, start);
@@ -3691,12 +3738,16 @@ static vm_fault_t filemap_map_folio_range(struct vm_fault *vmf,
unsigned long addr0;
/*
- * Map the large folio fully where possible.
+ * Map the large folio fully where possible:
*
- * The folio must not cross VMA or page table boundary.
+ * - The folio is fully within size of the file or belong
+ * to shmem/tmpfs;
+ * - The folio doesn't cross VMA boundary;
+ * - The folio doesn't cross page table boundary;
*/
addr0 = addr - start * PAGE_SIZE;
- if (folio_within_vma(folio, vmf->vma) &&
+ if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) &&
+ folio_within_vma(folio, vmf->vma) &&
(addr0 & PMD_MASK) == ((addr0 + folio_size(folio) - 1) & PMD_MASK)) {
vmf->pte -= start;
page -= start;
@@ -3817,7 +3868,18 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
if (!folio)
goto out;
- if (filemap_map_pmd(vmf, folio, start_pgoff)) {
+ file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1;
+ end_pgoff = min(end_pgoff, file_end);
+
+ /*
+ * Do not allow to map with PMD across i_size to preserve
+ * SIGBUS semantics.
+ *
+ * Make an exception for shmem/tmpfs that for long time
+ * intentionally mapped with PMDs across i_size.
+ */
+ if ((file_end >= folio_next_index(folio) || shmem_mapping(mapping)) &&
+ filemap_map_pmd(vmf, folio, start_pgoff)) {
ret = VM_FAULT_NOPAGE;
goto out;
}
@@ -3830,10 +3892,6 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
goto out;
}
- file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE) - 1;
- if (end_pgoff > file_end)
- end_pgoff = file_end;
-
folio_type = mm_counter_file(folio);
do {
unsigned long end;
@@ -3850,7 +3908,7 @@ vm_fault_t filemap_map_pages(struct vm_fault *vmf,
else
ret |= filemap_map_folio_range(vmf, folio,
xas.xa_index - folio->index, addr,
- nr_pages, &rss, &mmap_miss);
+ nr_pages, &rss, &mmap_miss, file_end);
folio_unlock(folio);
} while ((folio = next_uptodate_folio(&xas, mapping, end_pgoff)) != NULL);
@@ -4457,16 +4515,8 @@ int filemap_invalidate_inode(struct inode *inode, bool flush,
unmap_mapping_pages(mapping, first, nr, false);
/* Write back the data if we're asked to. */
- if (flush) {
- struct writeback_control wbc = {
- .sync_mode = WB_SYNC_ALL,
- .nr_to_write = LONG_MAX,
- .range_start = start,
- .range_end = end,
- };
-
- filemap_fdatawrite_wbc(mapping, &wbc);
- }
+ if (flush)
+ filemap_fdatawrite_range(mapping, start, end);
/* Wait for writeback to complete on all folios and discard. */
invalidate_inode_pages2_range(mapping, start / PAGE_SIZE, end / PAGE_SIZE);
diff --git a/mm/huge_memory.c b/mm/huge_memory.c
index 1d1b74950332..1192e62531cd 100644
--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -214,7 +214,8 @@ retry:
if (likely(atomic_inc_not_zero(&huge_zero_refcount)))
return true;
- zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO) & ~__GFP_MOVABLE,
+ zero_folio = folio_alloc((GFP_TRANSHUGE | __GFP_ZERO | __GFP_ZEROTAGS) &
+ ~__GFP_MOVABLE,
HPAGE_PMD_ORDER);
if (!zero_folio) {
count_vm_event(THP_ZERO_PAGE_ALLOC_FAILED);
@@ -1641,17 +1642,30 @@ vm_fault_t vmf_insert_folio_pud(struct vm_fault *vmf, struct folio *folio,
EXPORT_SYMBOL_GPL(vmf_insert_folio_pud);
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
+/**
+ * touch_pmd - Mark page table pmd entry as accessed and dirty (for write)
+ * @vma: The VMA covering @addr
+ * @addr: The virtual address
+ * @pmd: pmd pointer into the page table mapping @addr
+ * @write: Whether it's a write access
+ *
+ * Return: whether the pmd entry is changed
+ */
+bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, bool write)
{
- pmd_t _pmd;
+ pmd_t entry;
- _pmd = pmd_mkyoung(*pmd);
+ entry = pmd_mkyoung(*pmd);
if (write)
- _pmd = pmd_mkdirty(_pmd);
+ entry = pmd_mkdirty(entry);
if (pmdp_set_access_flags(vma, addr & HPAGE_PMD_MASK,
- pmd, _pmd, write))
+ pmd, entry, write)) {
update_mmu_cache_pmd(vma, addr, pmd);
+ return true;
+ }
+
+ return false;
}
int copy_huge_pmd(struct mm_struct *dst_mm, struct mm_struct *src_mm,
@@ -1841,18 +1855,14 @@ unlock:
}
#endif /* CONFIG_HAVE_ARCH_TRANSPARENT_HUGEPAGE_PUD */
-void huge_pmd_set_accessed(struct vm_fault *vmf)
+bool huge_pmd_set_accessed(struct vm_fault *vmf)
{
bool write = vmf->flags & FAULT_FLAG_WRITE;
- vmf->ptl = pmd_lock(vmf->vma->vm_mm, vmf->pmd);
if (unlikely(!pmd_same(*vmf->pmd, vmf->orig_pmd)))
- goto unlock;
-
- touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
+ return false;
-unlock:
- spin_unlock(vmf->ptl);
+ return touch_pmd(vmf->vma, vmf->address, vmf->pmd, write);
}
static vm_fault_t do_huge_zero_wp_pmd(struct vm_fault *vmf)
@@ -3263,6 +3273,14 @@ bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
caller_pins;
}
+static bool page_range_has_hwpoisoned(struct page *page, long nr_pages)
+{
+ for (; nr_pages; page++, nr_pages--)
+ if (PageHWPoison(page))
+ return true;
+ return false;
+}
+
/*
* It splits @folio into @new_order folios and copies the @folio metadata to
* all the resulting folios.
@@ -3270,17 +3288,24 @@ bool can_split_folio(struct folio *folio, int caller_pins, int *pextra_pins)
static void __split_folio_to_order(struct folio *folio, int old_order,
int new_order)
{
+ /* Scan poisoned pages when split a poisoned folio to large folios */
+ const bool handle_hwpoison = folio_test_has_hwpoisoned(folio) && new_order;
long new_nr_pages = 1 << new_order;
long nr_pages = 1 << old_order;
long i;
+ folio_clear_has_hwpoisoned(folio);
+
+ /* Check first new_nr_pages since the loop below skips them */
+ if (handle_hwpoison &&
+ page_range_has_hwpoisoned(folio_page(folio, 0), new_nr_pages))
+ folio_set_has_hwpoisoned(folio);
/*
* Skip the first new_nr_pages, since the new folio from them have all
* the flags from the original folio.
*/
for (i = new_nr_pages; i < nr_pages; i += new_nr_pages) {
struct page *new_head = &folio->page + i;
-
/*
* Careful: new_folio is not a "real" folio before we cleared PageTail.
* Don't pass it around before clear_compound_head().
@@ -3322,6 +3347,10 @@ static void __split_folio_to_order(struct folio *folio, int old_order,
(1L << PG_dirty) |
LRU_GEN_MASK | LRU_REFS_MASK));
+ if (handle_hwpoison &&
+ page_range_has_hwpoisoned(new_head, new_nr_pages))
+ folio_set_has_hwpoisoned(new_folio);
+
new_folio->mapping = folio->mapping;
new_folio->index = folio->index + i;
@@ -3422,8 +3451,6 @@ static int __split_unmapped_folio(struct folio *folio, int new_order,
if (folio_test_anon(folio))
mod_mthp_stat(order, MTHP_STAT_NR_ANON, -1);
- folio_clear_has_hwpoisoned(folio);
-
/*
* split to new_order one order at a time. For uniform split,
* folio is split to new_order directly.
@@ -3504,7 +3531,8 @@ bool non_uniform_split_supported(struct folio *folio, unsigned int new_order,
/* order-1 is not supported for anonymous THP. */
VM_WARN_ONCE(warns && new_order == 1,
"Cannot split to order-1 folio");
- return new_order != 1;
+ if (new_order == 1)
+ return false;
} else if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
!mapping_large_folio_support(folio->mapping)) {
/*
@@ -3535,7 +3563,8 @@ bool uniform_split_supported(struct folio *folio, unsigned int new_order,
if (folio_test_anon(folio)) {
VM_WARN_ONCE(warns && new_order == 1,
"Cannot split to order-1 folio");
- return new_order != 1;
+ if (new_order == 1)
+ return false;
} else if (new_order) {
if (IS_ENABLED(CONFIG_READ_ONLY_THP_FOR_FS) &&
!mapping_large_folio_support(folio->mapping)) {
@@ -3599,6 +3628,16 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
if (folio != page_folio(split_at) || folio != page_folio(lock_at))
return -EINVAL;
+ /*
+ * Folios that just got truncated cannot get split. Signal to the
+ * caller that there was a race.
+ *
+ * TODO: this will also currently refuse shmem folios that are in the
+ * swapcache.
+ */
+ if (!is_anon && !folio->mapping)
+ return -EBUSY;
+
if (new_order >= folio_order(folio))
return -EINVAL;
@@ -3639,22 +3678,8 @@ static int __folio_split(struct folio *folio, unsigned int new_order,
gfp_t gfp;
mapping = folio->mapping;
-
- /* Truncated ? */
- /*
- * TODO: add support for large shmem folio in swap cache.
- * When shmem is in swap cache, mapping is NULL and
- * folio_test_swapcache() is true.
- */
- if (!mapping) {
- ret = -EBUSY;
- goto out;
- }
-
min_order = mapping_min_folio_order(folio->mapping);
if (new_order < min_order) {
- VM_WARN_ONCE(1, "Cannot split mapped folio below min-order: %u",
- min_order);
ret = -EINVAL;
goto out;
}
@@ -3986,12 +4011,7 @@ int min_order_for_split(struct folio *folio)
int split_folio_to_list(struct folio *folio, struct list_head *list)
{
- int ret = min_order_for_split(folio);
-
- if (ret < 0)
- return ret;
-
- return split_huge_page_to_list_to_order(&folio->page, list, ret);
+ return split_huge_page_to_list_to_order(&folio->page, list, 0);
}
/*
diff --git a/mm/internal.h b/mm/internal.h
index 1561fc2ff5b8..27ad37a41868 100644
--- a/mm/internal.h
+++ b/mm/internal.h
@@ -1402,7 +1402,7 @@ int __must_check try_grab_folio(struct folio *folio, int refs,
*/
void touch_pud(struct vm_area_struct *vma, unsigned long addr,
pud_t *pud, bool write);
-void touch_pmd(struct vm_area_struct *vma, unsigned long addr,
+bool touch_pmd(struct vm_area_struct *vma, unsigned long addr,
pmd_t *pmd, bool write);
/*
diff --git a/mm/kmsan/core.c b/mm/kmsan/core.c
index 8bca7fece47f..35ceaa8adb41 100644
--- a/mm/kmsan/core.c
+++ b/mm/kmsan/core.c
@@ -72,9 +72,6 @@ depot_stack_handle_t kmsan_save_stack_with_flags(gfp_t flags,
nr_entries = stack_trace_save(entries, KMSAN_STACK_DEPTH, 0);
- /* Don't sleep. */
- flags &= ~(__GFP_DIRECT_RECLAIM | __GFP_KSWAPD_RECLAIM);
-
handle = stack_depot_save(entries, nr_entries, flags);
return stack_depot_set_extra_bits(handle, extra);
}
diff --git a/mm/kmsan/hooks.c b/mm/kmsan/hooks.c
index 2cee59d89c80..8f22d1f22981 100644
--- a/mm/kmsan/hooks.c
+++ b/mm/kmsan/hooks.c
@@ -84,7 +84,8 @@ void kmsan_slab_free(struct kmem_cache *s, void *object)
if (s->ctor)
return;
kmsan_enter_runtime();
- kmsan_internal_poison_memory(object, s->object_size, GFP_KERNEL,
+ kmsan_internal_poison_memory(object, s->object_size,
+ GFP_KERNEL & ~(__GFP_RECLAIM),
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
}
@@ -114,7 +115,8 @@ void kmsan_kfree_large(const void *ptr)
kmsan_enter_runtime();
page = virt_to_head_page((void *)ptr);
KMSAN_WARN_ON(ptr != page_address(page));
- kmsan_internal_poison_memory((void *)ptr, page_size(page), GFP_KERNEL,
+ kmsan_internal_poison_memory((void *)ptr, page_size(page),
+ GFP_KERNEL & ~(__GFP_RECLAIM),
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
}
diff --git a/mm/kmsan/shadow.c b/mm/kmsan/shadow.c
index 54f3c3c962f0..55fdea199aaf 100644
--- a/mm/kmsan/shadow.c
+++ b/mm/kmsan/shadow.c
@@ -208,7 +208,7 @@ void kmsan_free_page(struct page *page, unsigned int order)
return;
kmsan_enter_runtime();
kmsan_internal_poison_memory(page_address(page), page_size(page),
- GFP_KERNEL,
+ GFP_KERNEL & ~(__GFP_RECLAIM),
KMSAN_POISON_CHECK | KMSAN_POISON_FREE);
kmsan_leave_runtime();
}
diff --git a/mm/ksm.c b/mm/ksm.c
index 7bc726b50b2f..c4e730409949 100644
--- a/mm/ksm.c
+++ b/mm/ksm.c
@@ -2455,6 +2455,95 @@ static bool should_skip_rmap_item(struct folio *folio,
return true;
}
+struct ksm_next_page_arg {
+ struct folio *folio;
+ struct page *page;
+ unsigned long addr;
+};
+
+static int ksm_next_page_pmd_entry(pmd_t *pmdp, unsigned long addr, unsigned long end,
+ struct mm_walk *walk)
+{
+ struct ksm_next_page_arg *private = walk->private;
+ struct vm_area_struct *vma = walk->vma;
+ pte_t *start_ptep = NULL, *ptep, pte;
+ struct mm_struct *mm = walk->mm;
+ struct folio *folio;
+ struct page *page;
+ spinlock_t *ptl;
+ pmd_t pmd;
+
+ if (ksm_test_exit(mm))
+ return 0;
+
+ cond_resched();
+
+ pmd = pmdp_get_lockless(pmdp);
+ if (!pmd_present(pmd))
+ return 0;
+
+ if (IS_ENABLED(CONFIG_TRANSPARENT_HUGEPAGE) && pmd_leaf(pmd)) {
+ ptl = pmd_lock(mm, pmdp);
+ pmd = pmdp_get(pmdp);
+
+ if (!pmd_present(pmd)) {
+ goto not_found_unlock;
+ } else if (pmd_leaf(pmd)) {
+ page = vm_normal_page_pmd(vma, addr, pmd);
+ if (!page)
+ goto not_found_unlock;
+ folio = page_folio(page);
+
+ if (folio_is_zone_device(folio) || !folio_test_anon(folio))
+ goto not_found_unlock;
+
+ page += ((addr & (PMD_SIZE - 1)) >> PAGE_SHIFT);
+ goto found_unlock;
+ }
+ spin_unlock(ptl);
+ }
+
+ start_ptep = pte_offset_map_lock(mm, pmdp, addr, &ptl);
+ if (!start_ptep)
+ return 0;
+
+ for (ptep = start_ptep; addr < end; ptep++, addr += PAGE_SIZE) {
+ pte = ptep_get(ptep);
+
+ if (!pte_present(pte))
+ continue;
+
+ page = vm_normal_page(vma, addr, pte);
+ if (!page)
+ continue;
+ folio = page_folio(page);
+
+ if (folio_is_zone_device(folio) || !folio_test_anon(folio))
+ continue;
+ goto found_unlock;
+ }
+
+not_found_unlock:
+ spin_unlock(ptl);
+ if (start_ptep)
+ pte_unmap(start_ptep);
+ return 0;
+found_unlock:
+ folio_get(folio);
+ spin_unlock(ptl);
+ if (start_ptep)
+ pte_unmap(start_ptep);
+ private->page = page;
+ private->folio = folio;
+ private->addr = addr;
+ return 1;
+}
+
+static struct mm_walk_ops ksm_next_page_ops = {
+ .pmd_entry = ksm_next_page_pmd_entry,
+ .walk_lock = PGWALK_RDLOCK,
+};
+
static struct ksm_rmap_item *scan_get_next_rmap_item(struct page **page)
{
struct mm_struct *mm;
@@ -2542,21 +2631,27 @@ next_mm:
ksm_scan.address = vma->vm_end;
while (ksm_scan.address < vma->vm_end) {
+ struct ksm_next_page_arg ksm_next_page_arg;
struct page *tmp_page = NULL;
- struct folio_walk fw;
struct folio *folio;
if (ksm_test_exit(mm))
break;
- folio = folio_walk_start(&fw, vma, ksm_scan.address, 0);
- if (folio) {
- if (!folio_is_zone_device(folio) &&
- folio_test_anon(folio)) {
- folio_get(folio);
- tmp_page = fw.page;
- }
- folio_walk_end(&fw, vma);
+ int found;
+
+ found = walk_page_range_vma(vma, ksm_scan.address,
+ vma->vm_end,
+ &ksm_next_page_ops,
+ &ksm_next_page_arg);
+
+ if (found > 0) {
+ folio = ksm_next_page_arg.folio;
+ tmp_page = ksm_next_page_arg.page;
+ ksm_scan.address = ksm_next_page_arg.addr;
+ } else {
+ VM_WARN_ON_ONCE(found < 0);
+ ksm_scan.address = vma->vm_end - PAGE_SIZE;
}
if (tmp_page) {
diff --git a/mm/memblock.c b/mm/memblock.c
index e23e16618e9b..f0f2dc66e9a2 100644
--- a/mm/memblock.c
+++ b/mm/memblock.c
@@ -1826,7 +1826,8 @@ phys_addr_t __init_memblock memblock_reserved_kern_size(phys_addr_t limit, int n
*/
unsigned long __init memblock_estimated_nr_free_pages(void)
{
- return PHYS_PFN(memblock_phys_mem_size() - memblock_reserved_size());
+ return PHYS_PFN(memblock_phys_mem_size() -
+ memblock_reserved_kern_size(MEMBLOCK_ALLOC_ANYWHERE, NUMA_NO_NODE));
}
/* lowest address */
diff --git a/mm/memfd.c b/mm/memfd.c
index 1d109c1acf21..805e297916e5 100644
--- a/mm/memfd.c
+++ b/mm/memfd.c
@@ -96,9 +96,36 @@ struct folio *memfd_alloc_folio(struct file *memfd, pgoff_t idx)
NULL,
gfp_mask);
if (folio) {
+ u32 hash;
+
+ /*
+ * Zero the folio to prevent information leaks to userspace.
+ * Use folio_zero_user() which is optimized for huge/gigantic
+ * pages. Pass 0 as addr_hint since this is not a faulting path
+ * and we don't have a user virtual address yet.
+ */
+ folio_zero_user(folio, 0);
+
+ /*
+ * Mark the folio uptodate before adding to page cache,
+ * as required by filemap.c and other hugetlb paths.
+ */
+ __folio_mark_uptodate(folio);
+
+ /*
+ * Serialize hugepage allocation and instantiation to prevent
+ * races with concurrent allocations, as required by all other
+ * callers of hugetlb_add_to_page_cache().
+ */
+ hash = hugetlb_fault_mutex_hash(memfd->f_mapping, idx);
+ mutex_lock(&hugetlb_fault_mutex_table[hash]);
+
err = hugetlb_add_to_page_cache(folio,
memfd->f_mapping,
idx);
+
+ mutex_unlock(&hugetlb_fault_mutex_table[hash]);
+
if (err) {
folio_put(folio);
goto err_unresv;
@@ -470,9 +497,9 @@ SYSCALL_DEFINE2(memfd_create,
const char __user *, uname,
unsigned int, flags)
{
- struct file *file;
- int fd, error;
- char *name;
+ char *name __free(kfree) = NULL;
+ unsigned int fd_flags;
+ int error;
error = sanitize_flags(&flags);
if (error < 0)
@@ -482,25 +509,6 @@ SYSCALL_DEFINE2(memfd_create,
if (IS_ERR(name))
return PTR_ERR(name);
- fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0);
- if (fd < 0) {
- error = fd;
- goto err_free_name;
- }
-
- file = alloc_file(name, flags);
- if (IS_ERR(file)) {
- error = PTR_ERR(file);
- goto err_free_fd;
- }
-
- fd_install(fd, file);
- kfree(name);
- return fd;
-
-err_free_fd:
- put_unused_fd(fd);
-err_free_name:
- kfree(name);
- return error;
+ fd_flags = (flags & MFD_CLOEXEC) ? O_CLOEXEC : 0;
+ return FD_ADD(fd_flags, alloc_file(name, flags));
}
diff --git a/mm/memory.c b/mm/memory.c
index 74b45e258323..aad432e71251 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -65,6 +65,7 @@
#include <linux/gfp.h>
#include <linux/migrate.h>
#include <linux/string.h>
+#include <linux/shmem_fs.h>
#include <linux/memory-tiers.h>
#include <linux/debugfs.h>
#include <linux/userfaultfd_k.h>
@@ -5501,8 +5502,25 @@ fallback:
return ret;
}
+ if (!needs_fallback && vma->vm_file) {
+ struct address_space *mapping = vma->vm_file->f_mapping;
+ pgoff_t file_end;
+
+ file_end = DIV_ROUND_UP(i_size_read(mapping->host), PAGE_SIZE);
+
+ /*
+ * Do not allow to map with PTEs beyond i_size and with PMD
+ * across i_size to preserve SIGBUS semantics.
+ *
+ * Make an exception for shmem/tmpfs that for long time
+ * intentionally mapped with PMDs across i_size.
+ */
+ needs_fallback = !shmem_mapping(mapping) &&
+ file_end < folio_next_index(folio);
+ }
+
if (pmd_none(*vmf->pmd)) {
- if (folio_test_pmd_mappable(folio)) {
+ if (!needs_fallback && folio_test_pmd_mappable(folio)) {
ret = do_set_pmd(vmf, folio, page);
if (ret != VM_FAULT_FALLBACK)
return ret;
@@ -6116,6 +6134,45 @@ split:
}
/*
+ * The page faults may be spurious because of the racy access to the
+ * page table. For example, a non-populated virtual page is accessed
+ * on 2 CPUs simultaneously, thus the page faults are triggered on
+ * both CPUs. However, it's possible that one CPU (say CPU A) cannot
+ * find the reason for the page fault if the other CPU (say CPU B) has
+ * changed the page table before the PTE is checked on CPU A. Most of
+ * the time, the spurious page faults can be ignored safely. However,
+ * if the page fault is for the write access, it's possible that a
+ * stale read-only TLB entry exists in the local CPU and needs to be
+ * flushed on some architectures. This is called the spurious page
+ * fault fixing.
+ *
+ * Note: flush_tlb_fix_spurious_fault() is defined as flush_tlb_page()
+ * by default and used as such on most architectures, while
+ * flush_tlb_fix_spurious_fault_pmd() is defined as NOP by default and
+ * used as such on most architectures.
+ */
+static void fix_spurious_fault(struct vm_fault *vmf,
+ enum pgtable_level ptlevel)
+{
+ /* Skip spurious TLB flush for retried page fault */
+ if (vmf->flags & FAULT_FLAG_TRIED)
+ return;
+ /*
+ * This is needed only for protection faults but the arch code
+ * is not yet telling us if this is a protection fault or not.
+ * This still avoids useless tlb flushes for .text page faults
+ * with threads.
+ */
+ if (vmf->flags & FAULT_FLAG_WRITE) {
+ if (ptlevel == PGTABLE_LEVEL_PTE)
+ flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
+ vmf->pte);
+ else
+ flush_tlb_fix_spurious_fault_pmd(vmf->vma, vmf->address,
+ vmf->pmd);
+ }
+}
+/*
* These routines also need to handle stuff like marking pages dirty
* and/or accessed for architectures that don't do it in hardware (most
* RISC architectures). The early dirtying is also good on the i386.
@@ -6196,23 +6253,11 @@ static vm_fault_t handle_pte_fault(struct vm_fault *vmf)
}
entry = pte_mkyoung(entry);
if (ptep_set_access_flags(vmf->vma, vmf->address, vmf->pte, entry,
- vmf->flags & FAULT_FLAG_WRITE)) {
+ vmf->flags & FAULT_FLAG_WRITE))
update_mmu_cache_range(vmf, vmf->vma, vmf->address,
vmf->pte, 1);
- } else {
- /* Skip spurious TLB flush for retried page fault */
- if (vmf->flags & FAULT_FLAG_TRIED)
- goto unlock;
- /*
- * This is needed only for protection faults but the arch code
- * is not yet telling us if this is a protection fault or not.
- * This still avoids useless tlb flushes for .text page faults
- * with threads.
- */
- if (vmf->flags & FAULT_FLAG_WRITE)
- flush_tlb_fix_spurious_fault(vmf->vma, vmf->address,
- vmf->pte);
- }
+ else
+ fix_spurious_fault(vmf, PGTABLE_LEVEL_PTE);
unlock:
pte_unmap_unlock(vmf->pte, vmf->ptl);
return 0;
@@ -6309,7 +6354,10 @@ retry_pud:
if (!(ret & VM_FAULT_FALLBACK))
return ret;
} else {
- huge_pmd_set_accessed(&vmf);
+ vmf.ptl = pmd_lock(mm, vmf.pmd);
+ if (!huge_pmd_set_accessed(&vmf))
+ fix_spurious_fault(&vmf, PGTABLE_LEVEL_PMD);
+ spin_unlock(vmf.ptl);
return 0;
}
}
diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c
index 0be83039c3b5..238a6712738e 100644
--- a/mm/memory_hotplug.c
+++ b/mm/memory_hotplug.c
@@ -1088,7 +1088,7 @@ void adjust_present_page_count(struct page *page, struct memory_group *group,
}
int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
- struct zone *zone, bool mhp_off_inaccessible)
+ struct zone *zone)
{
unsigned long end_pfn = pfn + nr_pages;
int ret, i;
@@ -1097,15 +1097,6 @@ int mhp_init_memmap_on_memory(unsigned long pfn, unsigned long nr_pages,
if (ret)
return ret;
- /*
- * Memory block is accessible at this stage and hence poison the struct
- * pages now. If the memory block is accessible during memory hotplug
- * addition phase, then page poisining is already performed in
- * sparse_add_section().
- */
- if (mhp_off_inaccessible)
- page_init_poison(pfn_to_page(pfn), sizeof(struct page) * nr_pages);
-
move_pfn_range_to_zone(zone, pfn, nr_pages, NULL, MIGRATE_UNMOVABLE,
false);
@@ -1444,7 +1435,7 @@ static void remove_memory_blocks_and_altmaps(u64 start, u64 size)
}
static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
- u64 start, u64 size, mhp_t mhp_flags)
+ u64 start, u64 size)
{
unsigned long memblock_size = memory_block_size_bytes();
u64 cur_start;
@@ -1460,8 +1451,6 @@ static int create_altmaps_and_memory_blocks(int nid, struct memory_group *group,
};
mhp_altmap.free = memory_block_memmap_on_memory_pages();
- if (mhp_flags & MHP_OFFLINE_INACCESSIBLE)
- mhp_altmap.inaccessible = true;
params.altmap = kmemdup(&mhp_altmap, sizeof(struct vmem_altmap),
GFP_KERNEL);
if (!params.altmap) {
@@ -1555,7 +1544,7 @@ int add_memory_resource(int nid, struct resource *res, mhp_t mhp_flags)
*/
if ((mhp_flags & MHP_MEMMAP_ON_MEMORY) &&
mhp_supports_memmap_on_memory()) {
- ret = create_altmaps_and_memory_blocks(nid, group, start, size, mhp_flags);
+ ret = create_altmaps_and_memory_blocks(nid, group, start, size);
if (ret)
goto error;
} else {
diff --git a/mm/mempool.c b/mm/mempool.c
index 1c38e873e546..d7bbf1189db9 100644
--- a/mm/mempool.c
+++ b/mm/mempool.c
@@ -68,10 +68,20 @@ static void check_element(mempool_t *pool, void *element)
} else if (pool->free == mempool_free_pages) {
/* Mempools backed by page allocator */
int order = (int)(long)pool->pool_data;
- void *addr = kmap_local_page((struct page *)element);
- __check_element(pool, addr, 1UL << (PAGE_SHIFT + order));
- kunmap_local(addr);
+#ifdef CONFIG_HIGHMEM
+ for (int i = 0; i < (1 << order); i++) {
+ struct page *page = (struct page *)element;
+ void *addr = kmap_local_page(page + i);
+
+ __check_element(pool, addr, PAGE_SIZE);
+ kunmap_local(addr);
+ }
+#else
+ void *addr = page_address((struct page *)element);
+
+ __check_element(pool, addr, PAGE_SIZE << order);
+#endif
}
}
@@ -97,10 +107,20 @@ static void poison_element(mempool_t *pool, void *element)
} else if (pool->alloc == mempool_alloc_pages) {
/* Mempools backed by page allocator */
int order = (int)(long)pool->pool_data;
- void *addr = kmap_local_page((struct page *)element);
- __poison_element(addr, 1UL << (PAGE_SHIFT + order));
- kunmap_local(addr);
+#ifdef CONFIG_HIGHMEM
+ for (int i = 0; i < (1 << order); i++) {
+ struct page *page = (struct page *)element;
+ void *addr = kmap_local_page(page + i);
+
+ __poison_element(addr, PAGE_SIZE);
+ kunmap_local(addr);
+ }
+#else
+ void *addr = page_address((struct page *)element);
+
+ __poison_element(addr, PAGE_SIZE << order);
+#endif
}
}
#else /* CONFIG_SLUB_DEBUG_ON */
diff --git a/mm/mm_init.c b/mm/mm_init.c
index 3db2dea7db4c..7712d887b696 100644
--- a/mm/mm_init.c
+++ b/mm/mm_init.c
@@ -2469,7 +2469,7 @@ void *__init alloc_large_system_hash(const char *tablename,
panic("Failed to allocate %s hash table\n", tablename);
pr_info("%s hash table entries: %ld (order: %d, %lu bytes, %s)\n",
- tablename, 1UL << log2qty, ilog2(size) - PAGE_SHIFT, size,
+ tablename, 1UL << log2qty, get_order(size), size,
virt ? (huge ? "vmalloc hugepage" : "vmalloc") : "linear");
if (_hash_shift)
diff --git a/mm/mmap_lock.c b/mm/mmap_lock.c
index 0a0db5849b8e..42e3dde73e74 100644
--- a/mm/mmap_lock.c
+++ b/mm/mmap_lock.c
@@ -241,6 +241,7 @@ retry:
if (PTR_ERR(vma) == -EAGAIN) {
count_vm_vma_lock_event(VMA_LOCK_MISS);
/* The area was replaced with another one */
+ mas_set(&mas, address);
goto retry;
}
diff --git a/mm/mremap.c b/mm/mremap.c
index bd7314898ec5..419a0ea0a870 100644
--- a/mm/mremap.c
+++ b/mm/mremap.c
@@ -187,7 +187,7 @@ static int mremap_folio_pte_batch(struct vm_area_struct *vma, unsigned long addr
if (!folio || !folio_test_large(folio))
return 1;
- return folio_pte_batch(folio, ptep, pte, max_nr);
+ return folio_pte_batch_flags(folio, NULL, ptep, &pte, max_nr, FPB_RESPECT_WRITE);
}
static int move_ptes(struct pagetable_move_control *pmc,
diff --git a/mm/page-writeback.c b/mm/page-writeback.c
index 757bc4d3b5b5..a124ab6a205d 100644
--- a/mm/page-writeback.c
+++ b/mm/page-writeback.c
@@ -2434,12 +2434,6 @@ static bool folio_prepare_writeback(struct address_space *mapping,
return true;
}
-static xa_mark_t wbc_to_tag(struct writeback_control *wbc)
-{
- if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
- return PAGECACHE_TAG_TOWRITE;
- return PAGECACHE_TAG_DIRTY;
-}
static pgoff_t wbc_end(struct writeback_control *wbc)
{
diff --git a/mm/page_alloc.c b/mm/page_alloc.c
index 600d9e981c23..ed82ee55e66a 100644
--- a/mm/page_alloc.c
+++ b/mm/page_alloc.c
@@ -1822,14 +1822,9 @@ inline void post_alloc_hook(struct page *page, unsigned int order,
* If memory tags should be zeroed
* (which happens only when memory should be initialized as well).
*/
- if (zero_tags) {
- /* Initialize both memory and memory tags. */
- for (i = 0; i != 1 << order; ++i)
- tag_clear_highpage(page + i);
+ if (zero_tags)
+ init = !tag_clear_highpages(page, 1 << order);
- /* Take note that memory was initialized by the loop above. */
- init = false;
- }
if (!should_skip_kasan_unpoison(gfp_flags) &&
kasan_unpoison_pages(page, order, init)) {
/* Take note that memory was initialized by KASAN. */
diff --git a/mm/secretmem.c b/mm/secretmem.c
index 60137305bc20..f0ef4e198884 100644
--- a/mm/secretmem.c
+++ b/mm/secretmem.c
@@ -82,13 +82,13 @@ retry:
__folio_mark_uptodate(folio);
err = filemap_add_folio(mapping, folio, offset, gfp);
if (unlikely(err)) {
- folio_put(folio);
/*
* If a split of large page was required, it
* already happened when we marked the page invalid
* which guarantees that this call won't fail
*/
set_direct_map_default_noflush(folio_page(folio, 0));
+ folio_put(folio);
if (err == -EEXIST)
goto retry;
@@ -224,9 +224,6 @@ err_free_inode:
SYSCALL_DEFINE1(memfd_secret, unsigned int, flags)
{
- struct file *file;
- int fd, err;
-
/* make sure local flags do not confict with global fcntl.h */
BUILD_BUG_ON(SECRETMEM_FLAGS_MASK & O_CLOEXEC);
@@ -238,22 +235,7 @@ SYSCALL_DEFINE1(memfd_secret, unsigned int, flags)
if (atomic_read(&secretmem_users) < 0)
return -ENFILE;
- fd = get_unused_fd_flags(flags & O_CLOEXEC);
- if (fd < 0)
- return fd;
-
- file = secretmem_file_create(flags);
- if (IS_ERR(file)) {
- err = PTR_ERR(file);
- goto err_put_fd;
- }
-
- fd_install(fd, file);
- return fd;
-
-err_put_fd:
- put_unused_fd(fd);
- return err;
+ return FD_ADD(flags & O_CLOEXEC, secretmem_file_create(flags));
}
static int secretmem_init_fs_context(struct fs_context *fc)
diff --git a/mm/shmem.c b/mm/shmem.c
index b9081b817d28..899303d8c9aa 100644
--- a/mm/shmem.c
+++ b/mm/shmem.c
@@ -131,8 +131,7 @@ struct shmem_options {
#define SHMEM_SEEN_INODES 2
#define SHMEM_SEEN_HUGE 4
#define SHMEM_SEEN_INUMS 8
-#define SHMEM_SEEN_NOSWAP 16
-#define SHMEM_SEEN_QUOTA 32
+#define SHMEM_SEEN_QUOTA 16
};
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
@@ -1076,7 +1075,7 @@ static struct folio *shmem_get_partial_folio(struct inode *inode, pgoff_t index)
* Remove range of pages and swap entries from page cache, and free them.
* If !unfalloc, truncate or punch hole; if unfalloc, undo failed fallocate.
*/
-static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
+static void shmem_undo_range(struct inode *inode, loff_t lstart, uoff_t lend,
bool unfalloc)
{
struct address_space *mapping = inode->i_mapping;
@@ -1133,7 +1132,7 @@ static void shmem_undo_range(struct inode *inode, loff_t lstart, loff_t lend,
same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
folio = shmem_get_partial_folio(inode, lstart >> PAGE_SHIFT);
if (folio) {
- same_folio = lend < folio_pos(folio) + folio_size(folio);
+ same_folio = lend < folio_next_pos(folio);
folio_mark_dirty(folio);
if (!truncate_inode_partial_folio(folio, lstart, lend)) {
start = folio_next_index(folio);
@@ -1227,7 +1226,7 @@ whole_folios:
shmem_recalc_inode(inode, 0, -nr_swaps_freed);
}
-void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend)
{
shmem_undo_range(inode, lstart, lend, false);
inode_set_mtime_to_ts(inode, inode_set_ctime_current(inode));
@@ -1882,6 +1881,7 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
struct shmem_inode_info *info = SHMEM_I(inode);
unsigned long suitable_orders = 0;
struct folio *folio = NULL;
+ pgoff_t aligned_index;
long pages;
int error, order;
@@ -1895,10 +1895,12 @@ static struct folio *shmem_alloc_and_add_folio(struct vm_fault *vmf,
order = highest_order(suitable_orders);
while (suitable_orders) {
pages = 1UL << order;
- index = round_down(index, pages);
- folio = shmem_alloc_folio(gfp, order, info, index);
- if (folio)
+ aligned_index = round_down(index, pages);
+ folio = shmem_alloc_folio(gfp, order, info, aligned_index);
+ if (folio) {
+ index = aligned_index;
goto allocated;
+ }
if (pages == HPAGE_PMD_NR)
count_vm_event(THP_FILE_FALLBACK);
@@ -4677,7 +4679,6 @@ static int shmem_parse_one(struct fs_context *fc, struct fs_parameter *param)
"Turning off swap in unprivileged tmpfs mounts unsupported");
}
ctx->noswap = true;
- ctx->seen |= SHMEM_SEEN_NOSWAP;
break;
case Opt_quota:
if (fc->user_ns != &init_user_ns)
@@ -4827,14 +4828,15 @@ static int shmem_reconfigure(struct fs_context *fc)
err = "Current inum too high to switch to 32-bit inums";
goto out;
}
- if ((ctx->seen & SHMEM_SEEN_NOSWAP) && ctx->noswap && !sbinfo->noswap) {
+
+ /*
+ * "noswap" doesn't use fsparam_flag_no, i.e. there's no "swap"
+ * counterpart for (re-)enabling swap.
+ */
+ if (ctx->noswap && !sbinfo->noswap) {
err = "Cannot disable swap on remount";
goto out;
}
- if (!(ctx->seen & SHMEM_SEEN_NOSWAP) && !ctx->noswap && sbinfo->noswap) {
- err = "Cannot enable swap on remount if it was disabled on first mount";
- goto out;
- }
if (ctx->seen & SHMEM_SEEN_QUOTA &&
!sb_any_quota_loaded(fc->root->d_sb)) {
@@ -5776,7 +5778,7 @@ unsigned long shmem_get_unmapped_area(struct file *file,
}
#endif
-void shmem_truncate_range(struct inode *inode, loff_t lstart, loff_t lend)
+void shmem_truncate_range(struct inode *inode, loff_t lstart, uoff_t lend)
{
truncate_inode_pages_range(inode->i_mapping, lstart, lend);
}
diff --git a/mm/slub.c b/mm/slub.c
index f1a5373eee7b..a0b905c2a557 100644
--- a/mm/slub.c
+++ b/mm/slub.c
@@ -2046,7 +2046,11 @@ static inline void mark_objexts_empty(struct slabobj_ext *obj_exts)
if (slab_exts) {
unsigned int offs = obj_to_index(obj_exts_slab->slab_cache,
obj_exts_slab, obj_exts);
- /* codetag should be NULL */
+
+ if (unlikely(is_codetag_empty(&slab_exts[offs].ref)))
+ return;
+
+ /* codetag should be NULL here */
WARN_ON(slab_exts[offs].ref.ct);
set_codetag_empty(&slab_exts[offs].ref);
}
@@ -6332,8 +6336,6 @@ next_remote_batch:
if (unlikely(!slab_free_hook(s, p[i], init, false))) {
p[i] = p[--size];
- if (!size)
- goto flush_remote;
continue;
}
@@ -6348,6 +6350,9 @@ next_remote_batch:
i++;
}
+ if (!size)
+ goto flush_remote;
+
next_batch:
if (!local_trylock(&s->cpu_sheaves->lock))
goto fallback;
@@ -6402,6 +6407,9 @@ do_free:
goto next_batch;
}
+ if (remote_nr)
+ goto flush_remote;
+
return;
no_empty:
diff --git a/mm/sparse.c b/mm/sparse.c
index 17c50a6415c2..b5b2b6f7041b 100644
--- a/mm/sparse.c
+++ b/mm/sparse.c
@@ -951,8 +951,7 @@ int __meminit sparse_add_section(int nid, unsigned long start_pfn,
* Poison uninitialized struct pages in order to catch invalid flags
* combinations.
*/
- if (!altmap || !altmap->inaccessible)
- page_init_poison(memmap, sizeof(struct page) * nr_pages);
+ page_init_poison(memmap, sizeof(struct page) * nr_pages);
ms = __nr_to_section(section_nr);
set_section_nid(section_nr, nid);
diff --git a/mm/swap_state.c b/mm/swap_state.c
index b13e9c4baa90..f4980dde5394 100644
--- a/mm/swap_state.c
+++ b/mm/swap_state.c
@@ -748,6 +748,8 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
blk_start_plug(&plug);
for (addr = start; addr < end; ilx++, addr += PAGE_SIZE) {
+ struct swap_info_struct *si = NULL;
+
if (!pte++) {
pte = pte_offset_map(vmf->pmd, addr);
if (!pte)
@@ -761,8 +763,19 @@ static struct folio *swap_vma_readahead(swp_entry_t targ_entry, gfp_t gfp_mask,
continue;
pte_unmap(pte);
pte = NULL;
+ /*
+ * Readahead entry may come from a device that we are not
+ * holding a reference to, try to grab a reference, or skip.
+ */
+ if (swp_type(entry) != swp_type(targ_entry)) {
+ si = get_swap_device(entry);
+ if (!si)
+ continue;
+ }
folio = __read_swap_cache_async(entry, gfp_mask, mpol, ilx,
&page_allocated, false);
+ if (si)
+ put_swap_device(si);
if (!folio)
continue;
if (page_allocated) {
diff --git a/mm/swapfile.c b/mm/swapfile.c
index 10760240a3a2..a1b4b9d80e3b 100644
--- a/mm/swapfile.c
+++ b/mm/swapfile.c
@@ -2005,10 +2005,8 @@ swp_entry_t get_swap_page_of_type(int type)
local_lock(&percpu_swap_cluster.lock);
offset = cluster_alloc_swap_entry(si, 0, 1);
local_unlock(&percpu_swap_cluster.lock);
- if (offset) {
+ if (offset)
entry = swp_entry(si->type, offset);
- atomic_long_dec(&nr_swap_pages);
- }
}
put_swap_device(si);
}
diff --git a/mm/truncate.c b/mm/truncate.c
index 91eb92a5ce4f..12467c1bd711 100644
--- a/mm/truncate.c
+++ b/mm/truncate.c
@@ -46,7 +46,7 @@ static void clear_shadow_entries(struct address_space *mapping,
xas_unlock_irq(&xas);
if (mapping_shrinkable(mapping))
- inode_add_lru(mapping->host);
+ inode_lru_list_add(mapping->host);
spin_unlock(&mapping->host->i_lock);
}
@@ -111,7 +111,7 @@ static void truncate_folio_batch_exceptionals(struct address_space *mapping,
xas_unlock_irq(&xas);
if (mapping_shrinkable(mapping))
- inode_add_lru(mapping->host);
+ inode_lru_list_add(mapping->host);
spin_unlock(&mapping->host->i_lock);
out:
folio_batch_remove_exceptionals(fbatch);
@@ -177,6 +177,32 @@ int truncate_inode_folio(struct address_space *mapping, struct folio *folio)
return 0;
}
+static int try_folio_split_or_unmap(struct folio *folio, struct page *split_at,
+ unsigned long min_order)
+{
+ enum ttu_flags ttu_flags =
+ TTU_SYNC |
+ TTU_SPLIT_HUGE_PMD |
+ TTU_IGNORE_MLOCK;
+ int ret;
+
+ ret = try_folio_split_to_order(folio, split_at, min_order);
+
+ /*
+ * If the split fails, unmap the folio, so it will be refaulted
+ * with PTEs to respect SIGBUS semantics.
+ *
+ * Make an exception for shmem/tmpfs that for long time
+ * intentionally mapped with PMDs across i_size.
+ */
+ if (ret && !shmem_mapping(folio->mapping)) {
+ try_to_unmap(folio, ttu_flags);
+ WARN_ON(folio_mapped(folio));
+ }
+
+ return ret;
+}
+
/*
* Handle partial folios. The folio may be entirely within the
* range if a split has raced with us. If not, we zero the part of the
@@ -194,6 +220,7 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
size_t size = folio_size(folio);
unsigned int offset, length;
struct page *split_at, *split_at2;
+ unsigned int min_order;
if (pos < start)
offset = start - pos;
@@ -223,8 +250,9 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
if (!folio_test_large(folio))
return true;
+ min_order = mapping_min_folio_order(folio->mapping);
split_at = folio_page(folio, PAGE_ALIGN_DOWN(offset) / PAGE_SIZE);
- if (!try_folio_split(folio, split_at, NULL)) {
+ if (!try_folio_split_or_unmap(folio, split_at, min_order)) {
/*
* try to split at offset + length to make sure folios within
* the range can be dropped, especially to avoid memory waste
@@ -248,13 +276,10 @@ bool truncate_inode_partial_folio(struct folio *folio, loff_t start, loff_t end)
if (!folio_trylock(folio2))
goto out;
- /*
- * make sure folio2 is large and does not change its mapping.
- * Its split result does not matter here.
- */
+ /* make sure folio2 is large and does not change its mapping */
if (folio_test_large(folio2) &&
folio2->mapping == folio->mapping)
- try_folio_split(folio2, split_at2, NULL);
+ try_folio_split_or_unmap(folio2, split_at2, min_order);
folio_unlock(folio2);
out:
@@ -339,7 +364,7 @@ long mapping_evict_folio(struct address_space *mapping, struct folio *folio)
* page aligned properly.
*/
void truncate_inode_pages_range(struct address_space *mapping,
- loff_t lstart, loff_t lend)
+ loff_t lstart, uoff_t lend)
{
pgoff_t start; /* inclusive */
pgoff_t end; /* exclusive */
@@ -387,7 +412,7 @@ void truncate_inode_pages_range(struct address_space *mapping,
same_folio = (lstart >> PAGE_SHIFT) == (lend >> PAGE_SHIFT);
folio = __filemap_get_folio(mapping, lstart >> PAGE_SHIFT, FGP_LOCK, 0);
if (!IS_ERR(folio)) {
- same_folio = lend < folio_pos(folio) + folio_size(folio);
+ same_folio = lend < folio_next_pos(folio);
if (!truncate_inode_partial_folio(folio, lstart, lend)) {
start = folio_next_index(folio);
if (same_folio)
@@ -622,7 +647,7 @@ int folio_unmap_invalidate(struct address_space *mapping, struct folio *folio,
__filemap_remove_folio(folio, NULL);
xa_unlock_irq(&mapping->i_pages);
if (mapping_shrinkable(mapping))
- inode_add_lru(mapping->host);
+ inode_lru_list_add(mapping->host);
spin_unlock(&mapping->host->i_lock);
filemap_free_folio(mapping, folio);
diff --git a/mm/vmscan.c b/mm/vmscan.c
index b2fc8b626d3d..bb4a96c7b682 100644
--- a/mm/vmscan.c
+++ b/mm/vmscan.c
@@ -811,7 +811,7 @@ static int __remove_mapping(struct address_space *mapping, struct folio *folio,
__filemap_remove_folio(folio, shadow);
xa_unlock_irq(&mapping->i_pages);
if (mapping_shrinkable(mapping))
- inode_add_lru(mapping->host);
+ inode_lru_list_add(mapping->host);
spin_unlock(&mapping->host->i_lock);
if (free_folio)
diff --git a/mm/workingset.c b/mm/workingset.c
index 68a76a91111f..d32dc2e02a61 100644
--- a/mm/workingset.c
+++ b/mm/workingset.c
@@ -755,7 +755,7 @@ out_invalid:
xa_unlock_irq(&mapping->i_pages);
if (mapping->host != NULL) {
if (mapping_shrinkable(mapping))
- inode_add_lru(mapping->host);
+ inode_lru_list_add(mapping->host);
spin_unlock(&mapping->host->i_lock);
}
ret = LRU_REMOVED_RETRY;