diff options
author | Yu Zhao <yuzhao@google.com> | 2024-12-30 21:35:37 -0700 |
---|---|---|
committer | Andrew Morton <akpm@linux-foundation.org> | 2025-01-25 20:22:39 -0800 |
commit | 4d5d14a01e2c9091b128fb46e1d07475e9a7bb72 (patch) | |
tree | 2385dd6c0eebf80cdeed4d76104f86b047e32e63 /mm/vmscan.c | |
parent | b1a71694fb00c9a7bad788a0b49198eab20621b3 (diff) |
mm/mglru: rework workingset protection
With the aging feedback no longer considering the distribution of folios
in each generation, rework workingset protection to better distribute
folios across MAX_NR_GENS. This is achieved by reusing PG_workingset and
PG_referenced/LRU_REFS_FLAGS in a slightly different way.
For folios accessed multiple times through file descriptors, make
lru_gen_inc_refs() set additional bits of LRU_REFS_WIDTH in folio->flags
after PG_referenced, then PG_workingset after LRU_REFS_WIDTH. After all
its bits are set, i.e., LRU_REFS_FLAGS|BIT(PG_workingset), a folio is
lazily promoted into the second oldest generation in the eviction path.
And when folio_inc_gen() does that, it clears LRU_REFS_FLAGS so that
lru_gen_inc_refs() can start over. For this case, LRU_REFS_MASK is only
valid when PG_referenced is set.
For folios accessed multiple times through page tables, folio_update_gen()
from a page table walk or lru_gen_set_refs() from a rmap walk sets
PG_referenced after the accessed bit is cleared for the first time.
Thereafter, those two paths set PG_workingset and promote folios to the
youngest generation. Like folio_inc_gen(), when folio_update_gen() does
that, it also clears PG_referenced. For this case, LRU_REFS_MASK is not
used.
For both of the cases, after PG_workingset is set on a folio, it remains
until this folio is either reclaimed, or "deactivated" by
lru_gen_clear_refs(). It can be set again if lru_gen_test_recent()
returns true upon a refault.
When adding folios to the LRU lists, lru_gen_folio_seq() distributes
them as follows:
+---------------------------------+---------------------------------+
| Accessed thru page tables | Accessed thru file descriptors |
+---------------------------------+---------------------------------+
| PG_active (set while isolated) | |
+----------------+----------------+----------------+----------------+
| PG_workingset | PG_referenced | PG_workingset | LRU_REFS_FLAGS |
+---------------------------------+---------------------------------+
|<--------- MIN_NR_GENS --------->| |
|<-------------------------- MAX_NR_GENS -------------------------->|
After this patch, some typical client and server workloads showed
improvements under heavy memory pressure. For example, Python TPC-C,
which was used to benchmark a different approach [1] to better detect
refault distances, showed a significant decrease in total refaults:
Before After Change
Time (seconds) 10801 10801 0%
Executed (transactions) 41472 43663 +5%
workingset_nodes 109070 120244 +10%
workingset_refault_anon 5019627 7281831 +45%
workingset_refault_file 1294678786 554855564 -57%
workingset_refault_total 1299698413 562137395 -57%
[1] https://lore.kernel.org/20230920190244.16839-1-ryncsn@gmail.com/
Link: https://lkml.kernel.org/r/20241231043538.4075764-7-yuzhao@google.com
Signed-off-by: Yu Zhao <yuzhao@google.com>
Reported-by: Kairui Song <kasong@tencent.com>
Closes: https://lore.kernel.org/CAOUHufahuWcKf5f1Sg3emnqX+cODuR=2TQo7T4Gr-QYLujn4RA@mail.gmail.com/
Tested-by: Kalesh Singh <kaleshsingh@google.com>
Cc: Barry Song <v-songbaohua@oppo.com>
Cc: Bharata B Rao <bharata@amd.com>
Cc: David Stevens <stevensd@chromium.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Diffstat (limited to 'mm/vmscan.c')
-rw-r--r-- | mm/vmscan.c | 143 |
1 files changed, 85 insertions, 58 deletions
diff --git a/mm/vmscan.c b/mm/vmscan.c index 761298d9c09a..9b8e0a9fc9d5 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -862,6 +862,31 @@ enum folio_references { FOLIOREF_ACTIVATE, }; +#ifdef CONFIG_LRU_GEN +/* + * Only used on a mapped folio in the eviction (rmap walk) path, where promotion + * needs to be done by taking the folio off the LRU list and then adding it back + * with PG_active set. In contrast, the aging (page table walk) path uses + * folio_update_gen(). + */ +static bool lru_gen_set_refs(struct folio *folio) +{ + /* see the comment on LRU_REFS_FLAGS */ + if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { + set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + return false; + } + + set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_workingset)); + return true; +} +#else +static bool lru_gen_set_refs(struct folio *folio) +{ + return false; +} +#endif /* CONFIG_LRU_GEN */ + static enum folio_references folio_check_references(struct folio *folio, struct scan_control *sc) { @@ -870,7 +895,6 @@ static enum folio_references folio_check_references(struct folio *folio, referenced_ptes = folio_referenced(folio, 1, sc->target_mem_cgroup, &vm_flags); - referenced_folio = folio_test_clear_referenced(folio); /* * The supposedly reclaimable folio was found to be in a VM_LOCKED vma. @@ -888,6 +912,15 @@ static enum folio_references folio_check_references(struct folio *folio, if (referenced_ptes == -1) return FOLIOREF_KEEP; + if (lru_gen_enabled()) { + if (!referenced_ptes) + return FOLIOREF_RECLAIM; + + return lru_gen_set_refs(folio) ? FOLIOREF_ACTIVATE : FOLIOREF_KEEP; + } + + referenced_folio = folio_test_clear_referenced(folio); + if (referenced_ptes) { /* * All mapped folios start out with page table @@ -1092,11 +1125,6 @@ retry: if (!sc->may_unmap && folio_mapped(folio)) goto keep_locked; - /* folio_update_gen() tried to promote this page? */ - if (lru_gen_enabled() && !ignore_references && - folio_mapped(folio) && folio_test_referenced(folio)) - goto keep_locked; - /* * The number of dirty pages determines if a node is marked * reclaim_congested. kswapd will stall and start writing @@ -3159,16 +3187,19 @@ static int folio_update_gen(struct folio *folio, int gen) VM_WARN_ON_ONCE(gen >= MAX_NR_GENS); + /* see the comment on LRU_REFS_FLAGS */ + if (!folio_test_referenced(folio) && !folio_test_workingset(folio)) { + set_mask_bits(&folio->flags, LRU_REFS_MASK, BIT(PG_referenced)); + return -1; + } + do { /* lru_gen_del_folio() has isolated this page? */ - if (!(old_flags & LRU_GEN_MASK)) { - /* for shrink_folio_list() */ - new_flags = old_flags | BIT(PG_referenced); - continue; - } + if (!(old_flags & LRU_GEN_MASK)) + return -1; - new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); - new_flags |= (gen + 1UL) << LRU_GEN_PGOFF; + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); + new_flags |= ((gen + 1UL) << LRU_GEN_PGOFF) | BIT(PG_workingset); } while (!try_cmpxchg(&folio->flags, &old_flags, new_flags)); return ((old_flags & LRU_GEN_MASK) >> LRU_GEN_PGOFF) - 1; @@ -3192,7 +3223,7 @@ static int folio_inc_gen(struct lruvec *lruvec, struct folio *folio, bool reclai new_gen = (old_gen + 1) % MAX_NR_GENS; - new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_MASK | LRU_REFS_FLAGS); + new_flags = old_flags & ~(LRU_GEN_MASK | LRU_REFS_FLAGS); new_flags |= (new_gen + 1UL) << LRU_GEN_PGOFF; /* for folio_end_writeback() */ if (reclaiming) @@ -3370,9 +3401,11 @@ static unsigned long get_pmd_pfn(pmd_t pmd, struct vm_area_struct *vma, unsigned static struct folio *get_pfn_folio(unsigned long pfn, struct mem_cgroup *memcg, struct pglist_data *pgdat) { - struct folio *folio; + struct folio *folio = pfn_folio(pfn); + + if (folio_lru_gen(folio) < 0) + return NULL; - folio = pfn_folio(pfn); if (folio_nid(folio) != pgdat->node_id) return NULL; @@ -3749,8 +3782,7 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness) while (!list_empty(head)) { struct folio *folio = lru_to_folio(head); int refs = folio_lru_refs(folio); - int tier = lru_tier_from_refs(refs); - int delta = folio_nr_pages(folio); + bool workingset = folio_test_workingset(folio); VM_WARN_ON_ONCE_FOLIO(folio_test_unevictable(folio), folio); VM_WARN_ON_ONCE_FOLIO(folio_test_active(folio), folio); @@ -3760,8 +3792,14 @@ static bool inc_min_seq(struct lruvec *lruvec, int type, int swappiness) new_gen = folio_inc_gen(lruvec, folio, false); list_move_tail(&folio->lru, &lrugen->folios[new_gen][type][zone]); - WRITE_ONCE(lrugen->protected[hist][type][tier], - lrugen->protected[hist][type][tier] + delta); + /* don't count the workingset being lazily promoted */ + if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) { + int tier = lru_tier_from_refs(refs, workingset); + int delta = folio_nr_pages(folio); + + WRITE_ONCE(lrugen->protected[hist][type][tier], + lrugen->protected[hist][type][tier] + delta); + } if (!--remaining) return false; @@ -4147,16 +4185,10 @@ bool lru_gen_look_around(struct page_vma_mapped_walk *pvmw) old_gen = folio_update_gen(folio, new_gen); if (old_gen >= 0 && old_gen != new_gen) update_batch_size(walk, folio, old_gen, new_gen); - - continue; - } - - old_gen = folio_lru_gen(folio); - if (old_gen < 0) - folio_set_referenced(folio); - else if (old_gen != new_gen) { - folio_clear_lru_refs(folio); - folio_activate(folio); + } else if (lru_gen_set_refs(folio)) { + old_gen = folio_lru_gen(folio); + if (old_gen >= 0 && old_gen != new_gen) + folio_activate(folio); } } @@ -4317,7 +4349,8 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c int zone = folio_zonenum(folio); int delta = folio_nr_pages(folio); int refs = folio_lru_refs(folio); - int tier = lru_tier_from_refs(refs); + bool workingset = folio_test_workingset(folio); + int tier = lru_tier_from_refs(refs, workingset); struct lru_gen_folio *lrugen = &lruvec->lrugen; VM_WARN_ON_ONCE_FOLIO(gen >= MAX_NR_GENS, folio); @@ -4339,14 +4372,17 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c } /* protected */ - if (tier > tier_idx || refs == BIT(LRU_REFS_WIDTH)) { - int hist = lru_hist_from_seq(lrugen->min_seq[type]); - + if (tier > tier_idx || refs + workingset == BIT(LRU_REFS_WIDTH) + 1) { gen = folio_inc_gen(lruvec, folio, false); - list_move_tail(&folio->lru, &lrugen->folios[gen][type][zone]); + list_move(&folio->lru, &lrugen->folios[gen][type][zone]); + + /* don't count the workingset being lazily promoted */ + if (refs + workingset != BIT(LRU_REFS_WIDTH) + 1) { + int hist = lru_hist_from_seq(lrugen->min_seq[type]); - WRITE_ONCE(lrugen->protected[hist][type][tier], - lrugen->protected[hist][type][tier] + delta); + WRITE_ONCE(lrugen->protected[hist][type][tier], + lrugen->protected[hist][type][tier] + delta); + } return true; } @@ -4366,8 +4402,7 @@ static bool sort_folio(struct lruvec *lruvec, struct folio *folio, struct scan_c } /* waiting for writeback */ - if (folio_test_locked(folio) || writeback || - (type == LRU_GEN_FILE && dirty)) { + if (writeback || (type == LRU_GEN_FILE && dirty)) { gen = folio_inc_gen(lruvec, folio, true); list_move(&folio->lru, &lrugen->folios[gen][type][zone]); return true; @@ -4396,13 +4431,12 @@ static bool isolate_folio(struct lruvec *lruvec, struct folio *folio, struct sca return false; } - /* see the comment on MAX_NR_TIERS */ + /* see the comment on LRU_REFS_FLAGS */ if (!folio_test_referenced(folio)) - folio_clear_lru_refs(folio); + set_mask_bits(&folio->flags, LRU_REFS_MASK, 0); /* for shrink_folio_list() */ folio_clear_reclaim(folio); - folio_clear_referenced(folio); success = lru_gen_del_folio(lruvec, folio, true); VM_WARN_ON_ONCE_FOLIO(!success, folio); @@ -4592,31 +4626,24 @@ retry: type ? LRU_INACTIVE_FILE : LRU_INACTIVE_ANON); list_for_each_entry_safe_reverse(folio, next, &list, lru) { + DEFINE_MIN_SEQ(lruvec); + if (!folio_evictable(folio)) { list_del(&folio->lru); folio_putback_lru(folio); continue; } - if (folio_test_reclaim(folio) && - (folio_test_dirty(folio) || folio_test_writeback(folio))) { - /* restore LRU_REFS_FLAGS cleared by isolate_folio() */ - if (folio_test_workingset(folio)) - folio_set_referenced(folio); - continue; - } - - if (skip_retry || folio_test_active(folio) || folio_test_referenced(folio) || - folio_mapped(folio) || folio_test_locked(folio) || - folio_test_dirty(folio) || folio_test_writeback(folio)) { - /* don't add rejected folios to the oldest generation */ - set_mask_bits(&folio->flags, LRU_REFS_MASK | LRU_REFS_FLAGS, - BIT(PG_active)); + /* retry folios that may have missed folio_rotate_reclaimable() */ + if (!skip_retry && !folio_test_active(folio) && !folio_mapped(folio) && + !folio_test_dirty(folio) && !folio_test_writeback(folio)) { + list_move(&folio->lru, &clean); continue; } - /* retry folios that may have missed folio_rotate_reclaimable() */ - list_move(&folio->lru, &clean); + /* don't add rejected folios to the oldest generation */ + if (lru_gen_folio_seq(lruvec, folio, false) == min_seq[type]) + set_mask_bits(&folio->flags, LRU_REFS_FLAGS, BIT(PG_active)); } spin_lock_irq(&lruvec->lru_lock); |