Diffstat (limited to 'mm/hugetlb.c')
-rw-r--r--	mm/hugetlb.c	983
1 file changed, 143 insertions(+), 840 deletions(-)
diff --git a/mm/hugetlb.c b/mm/hugetlb.c
index 0455119716ec..9e7815b4f058 100644
--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -7,7 +7,6 @@
 #include <linux/init.h>
 #include <linux/mm.h>
 #include <linux/seq_file.h>
-#include <linux/sysctl.h>
 #include <linux/highmem.h>
 #include <linux/mmu_notifier.h>
 #include <linux/nodemask.h>
@@ -19,7 +18,6 @@
 #include <linux/mutex.h>
 #include <linux/memblock.h>
 #include <linux/minmax.h>
-#include <linux/sysfs.h>
 #include <linux/slab.h>
 #include <linux/sched/mm.h>
 #include <linux/mmdebug.h>
@@ -28,7 +26,7 @@
 #include <linux/string_choices.h>
 #include <linux/string_helpers.h>
 #include <linux/swap.h>
-#include <linux/swapops.h>
+#include <linux/leafops.h>
 #include <linux/jhash.h>
 #include <linux/numa.h>
 #include <linux/llist.h>
@@ -39,20 +37,19 @@
 #include <linux/memory.h>
 #include <linux/mm_inline.h>
 #include <linux/padata.h>
+#include <linux/pgalloc.h>
 
 #include <asm/page.h>
-#include <asm/pgalloc.h>
 #include <asm/tlb.h>
 #include <asm/setup.h>
 
 #include <linux/io.h>
-#include <linux/hugetlb.h>
-#include <linux/hugetlb_cgroup.h>
 #include <linux/node.h>
 #include <linux/page_owner.h>
 #include "internal.h"
 #include "hugetlb_vmemmap.h"
 #include "hugetlb_cma.h"
+#include "hugetlb_internal.h"
 #include <linux/page-isolation.h>
 
 int hugetlb_max_hstate __read_mostly;
@@ -119,7 +116,6 @@ struct mutex *hugetlb_fault_mutex_table __ro_after_init;
 /* Forward declaration */
 static int hugetlb_acct_memory(struct hstate *h, long delta);
 static void hugetlb_vma_lock_free(struct vm_area_struct *vma);
-static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma);
 static void __hugetlb_vma_unlock_write_free(struct vm_area_struct *vma);
 static void hugetlb_unshare_pmds(struct vm_area_struct *vma,
 		unsigned long start, unsigned long end, bool take_locks);
@@ -427,17 +423,21 @@ static void hugetlb_vma_lock_free(struct vm_area_struct *vma)
 	}
 }
 
-static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
+/*
+ * vma specific semaphore used for pmd sharing and fault/truncation
+ * synchronization
+ */
+int hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
 {
 	struct hugetlb_vma_lock *vma_lock;
 
 	/* Only establish in (flags) sharable vmas */
 	if (!vma || !(vma->vm_flags & VM_MAYSHARE))
-		return;
+		return 0;
 
 	/* Should never get here with non-NULL vm_private_data */
 	if (vma->vm_private_data)
-		return;
+		return -EINVAL;
 
 	vma_lock = kmalloc(sizeof(*vma_lock), GFP_KERNEL);
 	if (!vma_lock) {
@@ -452,13 +452,15 @@ static void hugetlb_vma_lock_alloc(struct vm_area_struct *vma)
 		 * allocation failure.
 		 */
 		pr_warn_once("HugeTLB: unable to allocate vma specific lock\n");
-		return;
+		return -EINVAL;
 	}
 
 	kref_init(&vma_lock->refs);
 	init_rwsem(&vma_lock->rw_sema);
 	vma_lock->vma = vma;
 	vma->vm_private_data = vma_lock;
+
+	return 0;
 }
 
 /* Helper that removes a struct file_region from the resv_map cache and returns
@@ -1190,20 +1192,28 @@ static struct resv_map *vma_resv_map(struct vm_area_struct *vma)
 	}
 }
 
-static void set_vma_resv_map(struct vm_area_struct *vma, struct resv_map *map)
+static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
 {
-	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
-	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
+	VM_WARN_ON_ONCE_VMA(!is_vm_hugetlb_page(vma), vma);
+	VM_WARN_ON_ONCE_VMA(vma->vm_flags & VM_MAYSHARE, vma);
 
-	set_vma_private_data(vma, (unsigned long)map);
+	set_vma_private_data(vma, get_vma_private_data(vma) | flags);
 }
 
-static void set_vma_resv_flags(struct vm_area_struct *vma, unsigned long flags)
+static void set_vma_desc_resv_map(struct vm_area_desc *desc, struct resv_map *map)
 {
-	VM_BUG_ON_VMA(!is_vm_hugetlb_page(vma), vma);
-	VM_BUG_ON_VMA(vma->vm_flags & VM_MAYSHARE, vma);
+	VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
+	VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE);
 
-	set_vma_private_data(vma, get_vma_private_data(vma) | flags);
+	desc->private_data = map;
+}
+
+static void set_vma_desc_resv_flags(struct vm_area_desc *desc, unsigned long flags)
+{
+	VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
+	VM_WARN_ON_ONCE(desc->vm_flags & VM_MAYSHARE);
+
+	desc->private_data = (void *)((unsigned long)desc->private_data | flags);
 }
 
 static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
@@ -1213,6 +1223,13 @@ static int is_vma_resv_set(struct vm_area_struct *vma, unsigned long flag)
 	return (get_vma_private_data(vma) & flag) != 0;
 }
 
+static bool is_vma_desc_resv_set(struct vm_area_desc *desc, unsigned long flag)
+{
+	VM_WARN_ON_ONCE(!is_vm_hugetlb_flags(desc->vm_flags));
+
+	return ((unsigned long)desc->private_data) & flag;
+}
+
 bool __vma_private_lock(struct vm_area_struct *vma)
 {
 	return !(vma->vm_flags & VM_MAYSHARE) &&
@@ -1400,77 +1417,6 @@ err:
 	return NULL;
 }
 
-/*
- * common helper functions for hstate_next_node_to_{alloc|free}.
- * We may have allocated or freed a huge page based on a different
- * nodes_allowed previously, so h->next_node_to_{alloc|free} might
- * be outside of *nodes_allowed.  Ensure that we use an allowed
- * node for alloc or free.
- */
-static int next_node_allowed(int nid, nodemask_t *nodes_allowed)
-{
-	nid = next_node_in(nid, *nodes_allowed);
-	VM_BUG_ON(nid >= MAX_NUMNODES);
-
-	return nid;
-}
-
-static int get_valid_node_allowed(int nid, nodemask_t *nodes_allowed)
-{
-	if (!node_isset(nid, *nodes_allowed))
-		nid = next_node_allowed(nid, nodes_allowed);
-	return nid;
-}
-
-/*
- * returns the previously saved node ["this node"] from which to
- * allocate a persistent huge page for the pool and advance the
- * next node from which to allocate, handling wrap at end of node
- * mask.
- */
-static int hstate_next_node_to_alloc(int *next_node,
-					nodemask_t *nodes_allowed)
-{
-	int nid;
-
-	VM_BUG_ON(!nodes_allowed);
-
-	nid = get_valid_node_allowed(*next_node, nodes_allowed);
-	*next_node = next_node_allowed(nid, nodes_allowed);
-
-	return nid;
-}
-
-/*
- * helper for remove_pool_hugetlb_folio() - return the previously saved
- * node ["this node"] from which to free a huge page.  Advance the
- * next node id whether or not we find a free huge page to free so
- * that the next attempt to free addresses the next node.
- */
-static int hstate_next_node_to_free(struct hstate *h, nodemask_t *nodes_allowed)
-{
-	int nid;
-
-	VM_BUG_ON(!nodes_allowed);
-
-	nid = get_valid_node_allowed(h->next_nid_to_free, nodes_allowed);
-	h->next_nid_to_free = next_node_allowed(nid, nodes_allowed);
-
-	return nid;
-}
-
-#define for_each_node_mask_to_alloc(next_node, nr_nodes, node, mask)		\
-	for (nr_nodes = nodes_weight(*mask);				\
-		nr_nodes > 0 &&						\
-		((node = hstate_next_node_to_alloc(next_node, mask)) || 1);	\
-		nr_nodes--)
-
-#define for_each_node_mask_to_free(hs, nr_nodes, node, mask)		\
-	for (nr_nodes = nodes_weight(*mask);				\
-		nr_nodes > 0 &&						\
-		((node = hstate_next_node_to_free(hs, mask)) || 1);	\
-		nr_nodes--)
-
 #ifdef CONFIG_ARCH_HAS_GIGANTIC_PAGE
 #ifdef CONFIG_CONTIG_ALLOC
 static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask,
@@ -1526,8 +1472,8 @@ static struct folio *alloc_gigantic_folio(int order, gfp_t gfp_mask, int nid,
  *
  * Must be called with hugetlb lock held.
  */
-static void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
-							bool adjust_surplus)
+void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
+			  bool adjust_surplus)
 {
 	int nid = folio_nid(folio);
 
@@ -1535,7 +1481,7 @@ static void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
 	VM_BUG_ON_FOLIO(hugetlb_cgroup_from_folio_rsvd(folio), folio);
 
 	lockdep_assert_held(&hugetlb_lock);
-	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+	if (hstate_is_gigantic_no_runtime(h))
 		return;
 
 	list_del(&folio->lru);
@@ -1562,8 +1508,8 @@ static void remove_hugetlb_folio(struct hstate *h, struct folio *folio,
 	h->nr_huge_pages_node[nid]--;
 }
 
-static void add_hugetlb_folio(struct hstate *h, struct folio *folio,
-			      bool adjust_surplus)
+void add_hugetlb_folio(struct hstate *h, struct folio *folio,
+		       bool adjust_surplus)
 {
 	int nid = folio_nid(folio);
 
@@ -1597,7 +1543,7 @@ static void __update_and_free_hugetlb_folio(struct hstate *h,
 {
 	bool clear_flag = folio_test_hugetlb_vmemmap_optimized(folio);
 
-	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+	if (hstate_is_gigantic_no_runtime(h))
 		return;
 
 	/*
@@ -1894,7 +1840,7 @@ static void account_new_hugetlb_folio(struct hstate *h, struct folio *folio)
 	h->nr_huge_pages_node[folio_nid(folio)]++;
 }
 
-static void init_new_hugetlb_folio(struct folio *folio)
+void init_new_hugetlb_folio(struct folio *folio)
 {
 	__folio_set_hugetlb(folio);
 	INIT_LIST_HEAD(&folio->lru);
@@ -2006,8 +1952,8 @@ static struct folio *alloc_fresh_hugetlb_folio(struct hstate *h,
 	return folio;
 }
 
-static void prep_and_add_allocated_folios(struct hstate *h,
-					  struct list_head *folio_list)
+void prep_and_add_allocated_folios(struct hstate *h,
+				   struct list_head *folio_list)
 {
 	unsigned long flags;
 	struct folio *folio, *tmp_f;
@@ -2212,7 +2158,7 @@ static struct folio *alloc_surplus_hugetlb_folio(struct hstate *h,
 {
 	struct folio *folio = NULL;
 
-	if (hstate_is_gigantic(h))
+	if (hstate_is_gigantic_no_runtime(h))
 		return NULL;
 
 	spin_lock_irq(&hugetlb_lock);
@@ -2491,7 +2437,7 @@ static void return_unused_surplus_pages(struct hstate *h,
 	/* Uncommit the reservation */
 	h->resv_huge_pages -= unused_resv_pages;
 
-	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+	if (hstate_is_gigantic_no_runtime(h))
 		goto out;
 
 	/*
@@ -2934,7 +2880,7 @@ typedef enum {
 	 * NOTE: This is mostly identical to MAP_CHG_NEEDED, except
 	 * that currently vma_needs_reservation() has an unwanted side
 	 * effect to either use end() or commit() to complete the
-	 * transaction.  Hence it needs to differenciate from NEEDED.
+	 * transaction.  Hence it needs to differentiate from NEEDED.
 	 */
 	MAP_CHG_ENFORCED = 2,
 } map_chg_state;
@@ -3215,7 +3161,7 @@ static void __init hugetlb_folio_init_tail_vmemmap(struct folio *folio,
 					unsigned long start_page_number,
 					unsigned long end_page_number)
 {
-	enum zone_type zone = zone_idx(folio_zone(folio));
+	enum zone_type zone = folio_zonenum(folio);
 	int nid = folio_nid(folio);
 	struct page *page = folio_page(folio, start_page_number);
 	unsigned long head_pfn = folio_pfn(folio);
@@ -3248,7 +3194,7 @@ static void __init hugetlb_folio_init_vmemmap(struct folio *folio,
 	ret = folio_ref_freeze(folio, 1);
 	VM_BUG_ON(!ret);
 	hugetlb_folio_init_tail_vmemmap(folio, 1, nr_pages);
-	prep_compound_head((struct page *)folio, huge_page_order(h));
+	prep_compound_head(&folio->page, huge_page_order(h));
 }
 
 static bool __init hugetlb_bootmem_page_prehvo(struct huge_bootmem_page *m)
@@ -3705,7 +3651,7 @@ static void __init hugetlb_init_hstates(void)
 		 *   - If CMA allocation is possible, we can not demote
 		 *     HUGETLB_PAGE_ORDER or smaller size pages.
 		 */
-		if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+		if (hstate_is_gigantic_no_runtime(h))
 			continue;
 		if (hugetlb_cma_total_size() && h->order <= HUGETLB_PAGE_ORDER)
 			continue;
@@ -4062,8 +4008,8 @@ static long demote_free_hugetlb_folios(struct hstate *src, struct hstate *dst,
 	return rc;
 }
 
-static long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed,
-				  unsigned long nr_to_demote)
+long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed,
+			   unsigned long nr_to_demote)
 	__must_hold(&hugetlb_lock)
 {
 	int nr_nodes, node;
@@ -4131,58 +4077,14 @@ static long demote_pool_huge_page(struct hstate *src, nodemask_t *nodes_allowed,
 	return -EBUSY;
 }
 
-#define HSTATE_ATTR_RO(_name) \
-	static struct kobj_attribute _name##_attr = __ATTR_RO(_name)
-
-#define HSTATE_ATTR_WO(_name) \
-	static struct kobj_attribute _name##_attr = __ATTR_WO(_name)
-
-#define HSTATE_ATTR(_name) \
-	static struct kobj_attribute _name##_attr = __ATTR_RW(_name)
-
-static struct kobject *hugepages_kobj;
-static struct kobject *hstate_kobjs[HUGE_MAX_HSTATE];
-
-static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp);
-
-static struct hstate *kobj_to_hstate(struct kobject *kobj, int *nidp)
-{
-	int i;
-
-	for (i = 0; i < HUGE_MAX_HSTATE; i++)
-		if (hstate_kobjs[i] == kobj) {
-			if (nidp)
-				*nidp = NUMA_NO_NODE;
-			return &hstates[i];
-		}
-
-	return kobj_to_node_hstate(kobj, nidp);
-}
-
-static ssize_t nr_hugepages_show_common(struct kobject *kobj,
-					struct kobj_attribute *attr, char *buf)
-{
-	struct hstate *h;
-	unsigned long nr_huge_pages;
-	int nid;
-
-	h = kobj_to_hstate(kobj, &nid);
-	if (nid == NUMA_NO_NODE)
-		nr_huge_pages = h->nr_huge_pages;
-	else
-		nr_huge_pages = h->nr_huge_pages_node[nid];
-
-	return sysfs_emit(buf, "%lu\n", nr_huge_pages);
-}
-
-static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
+ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
 					   struct hstate *h, int nid,
 					   unsigned long count, size_t len)
 {
 	int err;
 	nodemask_t nodes_allowed, *n_mask;
 
-	if (hstate_is_gigantic(h) && !gigantic_page_runtime_supported())
+	if (hstate_is_gigantic_no_runtime(h))
 		return -EINVAL;
 
 	if (nid == NUMA_NO_NODE) {
@@ -4208,458 +4110,6 @@ static ssize_t __nr_hugepages_store_common(bool obey_mempolicy,
 	return err ? err : len;
 }
 
-static ssize_t nr_hugepages_store_common(bool obey_mempolicy,
-					 struct kobject *kobj, const char *buf,
-					 size_t len)
-{
-	struct hstate *h;
-	unsigned long count;
-	int nid;
-	int err;
-
-	err = kstrtoul(buf, 10, &count);
-	if (err)
-		return err;
-
-	h = kobj_to_hstate(kobj, &nid);
-	return __nr_hugepages_store_common(obey_mempolicy, h, nid, count, len);
-}
-
-static ssize_t nr_hugepages_show(struct kobject *kobj,
-				       struct kobj_attribute *attr, char *buf)
-{
-	return nr_hugepages_show_common(kobj, attr, buf);
-}
-
-static ssize_t nr_hugepages_store(struct kobject *kobj,
-	       struct kobj_attribute *attr, const char *buf, size_t len)
-{
-	return nr_hugepages_store_common(false, kobj, buf, len);
-}
-HSTATE_ATTR(nr_hugepages);
-
-#ifdef CONFIG_NUMA
-
-/*
- * hstate attribute for optionally mempolicy-based constraint on persistent
- * huge page alloc/free.
- */
-static ssize_t nr_hugepages_mempolicy_show(struct kobject *kobj,
-					   struct kobj_attribute *attr,
-					   char *buf)
-{
-	return nr_hugepages_show_common(kobj, attr, buf);
-}
-
-static ssize_t nr_hugepages_mempolicy_store(struct kobject *kobj,
-	       struct kobj_attribute *attr, const char *buf, size_t len)
-{
-	return nr_hugepages_store_common(true, kobj, buf, len);
-}
-HSTATE_ATTR(nr_hugepages_mempolicy);
-#endif
-
-
-static ssize_t nr_overcommit_hugepages_show(struct kobject *kobj,
-					struct kobj_attribute *attr, char *buf)
-{
-	struct hstate *h = kobj_to_hstate(kobj, NULL);
-	return sysfs_emit(buf, "%lu\n", h->nr_overcommit_huge_pages);
-}
-
-static ssize_t nr_overcommit_hugepages_store(struct kobject *kobj,
-		struct kobj_attribute *attr, const char *buf, size_t count)
-{
-	int err;
-	unsigned long input;
-	struct hstate *h = kobj_to_hstate(kobj, NULL);
-
-	if (hstate_is_gigantic(h))
-		return -EINVAL;
-
-	err = kstrtoul(buf, 10, &input);
-	if (err)
-		return err;
-
-	spin_lock_irq(&hugetlb_lock);
-	h->nr_overcommit_huge_pages = input;
-	spin_unlock_irq(&hugetlb_lock);
-
-	return count;
-}
-HSTATE_ATTR(nr_overcommit_hugepages);
-
-static ssize_t free_hugepages_show(struct kobject *kobj,
-					struct kobj_attribute *attr, char *buf)
-{
-	struct hstate *h;
-	unsigned long free_huge_pages;
-	int nid;
-
-	h = kobj_to_hstate(kobj, &nid);
-	if (nid == NUMA_NO_NODE)
-		free_huge_pages = h->free_huge_pages;
-	else
-		free_huge_pages = h->free_huge_pages_node[nid];
-
-	return sysfs_emit(buf, "%lu\n", free_huge_pages);
-}
-HSTATE_ATTR_RO(free_hugepages);
-
-static ssize_t resv_hugepages_show(struct kobject *kobj,
-					struct kobj_attribute *attr, char *buf)
-{
-	struct hstate *h = kobj_to_hstate(kobj, NULL);
-	return sysfs_emit(buf, "%lu\n", h->resv_huge_pages);
-}
-HSTATE_ATTR_RO(resv_hugepages);
-
-static ssize_t surplus_hugepages_show(struct kobject *kobj,
-					struct kobj_attribute *attr, char *buf)
-{
-	struct hstate *h;
-	unsigned long surplus_huge_pages;
-	int nid;
-
-	h = kobj_to_hstate(kobj, &nid);
-	if (nid == NUMA_NO_NODE)
-		surplus_huge_pages = h->surplus_huge_pages;
-	else
-		surplus_huge_pages = h->surplus_huge_pages_node[nid];
-
-	return sysfs_emit(buf, "%lu\n", surplus_huge_pages);
-}
-HSTATE_ATTR_RO(surplus_hugepages);
-
-static ssize_t demote_store(struct kobject *kobj,
-	       struct kobj_attribute *attr, const char *buf, size_t len)
-{
-	unsigned long nr_demote;
-	unsigned long nr_available;
-	nodemask_t nodes_allowed, *n_mask;
-	struct hstate *h;
-	int err;
-	int nid;
-
-	err = kstrtoul(buf, 10, &nr_demote);
-	if (err)
-		return err;
-	h = kobj_to_hstate(kobj, &nid);
-
-	if (nid != NUMA_NO_NODE) {
-		init_nodemask_of_node(&nodes_allowed, nid);
-		n_mask = &nodes_allowed;
-	} else {
-		n_mask = &node_states[N_MEMORY];
-	}
-
-	/* Synchronize with other sysfs operations modifying huge pages */
-	mutex_lock(&h->resize_lock);
-	spin_lock_irq(&hugetlb_lock);
-
-	while (nr_demote) {
-		long rc;
-
-		/*
-		 * Check for available pages to demote each time thorough the
-		 * loop as demote_pool_huge_page will drop hugetlb_lock.
-		 */
-		if (nid != NUMA_NO_NODE)
-			nr_available = h->free_huge_pages_node[nid];
-		else
-			nr_available = h->free_huge_pages;
-		nr_available -= h->resv_huge_pages;
-		if (!nr_available)
-			break;
-
-		rc = demote_pool_huge_page(h, n_mask, nr_demote);
-		if (rc < 0) {
-			err = rc;
-			break;
-		}
-
-		nr_demote -= rc;
-	}
-
-	spin_unlock_irq(&hugetlb_lock);
-	mutex_unlock(&h->resize_lock);
-
-	if (err)
-		return err;
-	return len;
-}
-HSTATE_ATTR_WO(demote);
-
-static ssize_t demote_size_show(struct kobject *kobj,
-					struct kobj_attribute *attr, char *buf)
-{
-	struct hstate *h = kobj_to_hstate(kobj, NULL);
-	unsigned long demote_size = (PAGE_SIZE << h->demote_order) / SZ_1K;
-
-	return sysfs_emit(buf, "%lukB\n", demote_size);
-}
-
-static ssize_t demote_size_store(struct kobject *kobj,
-					struct kobj_attribute *attr,
-					const char *buf, size_t count)
-{
-	struct hstate *h, *demote_hstate;
-	unsigned long demote_size;
-	unsigned int demote_order;
-
-	demote_size = (unsigned long)memparse(buf, NULL);
-
-	demote_hstate = size_to_hstate(demote_size);
-	if (!demote_hstate)
-		return -EINVAL;
-	demote_order = demote_hstate->order;
-	if (demote_order < HUGETLB_PAGE_ORDER)
-		return -EINVAL;
-
-	/* demote order must be smaller than hstate order */
-	h = kobj_to_hstate(kobj, NULL);
-	if (demote_order >= h->order)
-		return -EINVAL;
-
-	/* resize_lock synchronizes access to demote size and writes */
-	mutex_lock(&h->resize_lock);
-	h->demote_order = demote_order;
-	mutex_unlock(&h->resize_lock);
-
-	return count;
-}
-HSTATE_ATTR(demote_size);
-
-static struct attribute *hstate_attrs[] = {
-	&nr_hugepages_attr.attr,
-	&nr_overcommit_hugepages_attr.attr,
-	&free_hugepages_attr.attr,
-	&resv_hugepages_attr.attr,
-	&surplus_hugepages_attr.attr,
-#ifdef CONFIG_NUMA
-	&nr_hugepages_mempolicy_attr.attr,
-#endif
-	NULL,
-};
-
-static const struct attribute_group hstate_attr_group = {
-	.attrs = hstate_attrs,
-};
-
-static struct attribute *hstate_demote_attrs[] = {
-	&demote_size_attr.attr,
-	&demote_attr.attr,
-	NULL,
-};
-
-static const struct attribute_group hstate_demote_attr_group = {
-	.attrs = hstate_demote_attrs,
-};
-
-static int hugetlb_sysfs_add_hstate(struct hstate *h, struct kobject *parent,
-				    struct kobject **hstate_kobjs,
-				    const struct attribute_group *hstate_attr_group)
-{
-	int retval;
-	int hi = hstate_index(h);
-
-	hstate_kobjs[hi] = kobject_create_and_add(h->name, parent);
-	if (!hstate_kobjs[hi])
-		return -ENOMEM;
-
-	retval = sysfs_create_group(hstate_kobjs[hi], hstate_attr_group);
-	if (retval) {
-		kobject_put(hstate_kobjs[hi]);
-		hstate_kobjs[hi] = NULL;
-		return retval;
-	}
-
-	if (h->demote_order) {
-		retval = sysfs_create_group(hstate_kobjs[hi],
-					    &hstate_demote_attr_group);
-		if (retval) {
-			pr_warn("HugeTLB unable to create demote interfaces for %s\n", h->name);
-			sysfs_remove_group(hstate_kobjs[hi], hstate_attr_group);
-			kobject_put(hstate_kobjs[hi]);
-			hstate_kobjs[hi] = NULL;
-			return retval;
-		}
-	}
-
-	return 0;
-}
-
-#ifdef CONFIG_NUMA
-static bool hugetlb_sysfs_initialized __ro_after_init;
-
-/*
- * node_hstate/s - associate per node hstate attributes, via their kobjects,
- * with node devices in node_devices[] using a parallel array.  The array
- * index of a node device or _hstate == node id.
- * This is here to avoid any static dependency of the node device driver, in
- * the base kernel, on the hugetlb module.
- */
-struct node_hstate {
-	struct kobject		*hugepages_kobj;
-	struct kobject		*hstate_kobjs[HUGE_MAX_HSTATE];
-};
-static struct node_hstate node_hstates[MAX_NUMNODES];
-
-/*
- * A subset of global hstate attributes for node devices
- */
-static struct attribute *per_node_hstate_attrs[] = {
-	&nr_hugepages_attr.attr,
-	&free_hugepages_attr.attr,
-	&surplus_hugepages_attr.attr,
-	NULL,
-};
-
-static const struct attribute_group per_node_hstate_attr_group = {
-	.attrs = per_node_hstate_attrs,
-};
-
-/*
- * kobj_to_node_hstate - lookup global hstate for node device hstate attr kobj.
- * Returns node id via non-NULL nidp.
- */
-static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
-{
-	int nid;
-
-	for (nid = 0; nid < nr_node_ids; nid++) {
-		struct node_hstate *nhs = &node_hstates[nid];
-		int i;
-		for (i = 0; i < HUGE_MAX_HSTATE; i++)
-			if (nhs->hstate_kobjs[i] == kobj) {
-				if (nidp)
-					*nidp = nid;
-				return &hstates[i];
-			}
-	}
-
-	BUG();
-	return NULL;
-}
-
-/*
- * Unregister hstate attributes from a single node device.
- * No-op if no hstate attributes attached.
- */
-void hugetlb_unregister_node(struct node *node)
-{
-	struct hstate *h;
-	struct node_hstate *nhs = &node_hstates[node->dev.id];
-
-	if (!nhs->hugepages_kobj)
-		return;		/* no hstate attributes */
-
-	for_each_hstate(h) {
-		int idx = hstate_index(h);
-		struct kobject *hstate_kobj = nhs->hstate_kobjs[idx];
-
-		if (!hstate_kobj)
-			continue;
-		if (h->demote_order)
-			sysfs_remove_group(hstate_kobj, &hstate_demote_attr_group);
-		sysfs_remove_group(hstate_kobj, &per_node_hstate_attr_group);
-		kobject_put(hstate_kobj);
-		nhs->hstate_kobjs[idx] = NULL;
-	}
-
-	kobject_put(nhs->hugepages_kobj);
-	nhs->hugepages_kobj = NULL;
-}
-
-
-/*
- * Register hstate attributes for a single node device.
- * No-op if attributes already registered.
- */
-void hugetlb_register_node(struct node *node)
-{
-	struct hstate *h;
-	struct node_hstate *nhs = &node_hstates[node->dev.id];
-	int err;
-
-	if (!hugetlb_sysfs_initialized)
-		return;
-
-	if (nhs->hugepages_kobj)
-		return;		/* already allocated */
-
-	nhs->hugepages_kobj = kobject_create_and_add("hugepages",
-							&node->dev.kobj);
-	if (!nhs->hugepages_kobj)
-		return;
-
-	for_each_hstate(h) {
-		err = hugetlb_sysfs_add_hstate(h, nhs->hugepages_kobj,
-						nhs->hstate_kobjs,
-						&per_node_hstate_attr_group);
-		if (err) {
-			pr_err("HugeTLB: Unable to add hstate %s for node %d\n",
-				h->name, node->dev.id);
-			hugetlb_unregister_node(node);
-			break;
-		}
-	}
-}
-
-/*
- * hugetlb init time:  register hstate attributes for all registered node
- * devices of nodes that have memory.  All on-line nodes should have
- * registered their associated device by this time.
- */
-static void __init hugetlb_register_all_nodes(void)
-{
-	int nid;
-
-	for_each_online_node(nid)
-		hugetlb_register_node(node_devices[nid]);
-}
-#else	/* !CONFIG_NUMA */
-
-static struct hstate *kobj_to_node_hstate(struct kobject *kobj, int *nidp)
-{
-	BUG();
-	if (nidp)
-		*nidp = -1;
-	return NULL;
-}
-
-static void hugetlb_register_all_nodes(void) { }
-
-#endif
-
-static void __init hugetlb_sysfs_init(void)
-{
-	struct hstate *h;
-	int err;
-
-	hugepages_kobj = kobject_create_and_add("hugepages", mm_kobj);
-	if (!hugepages_kobj)
-		return;
-
-	for_each_hstate(h) {
-		err = hugetlb_sysfs_add_hstate(h, hugepages_kobj,
-					 hstate_kobjs, &hstate_attr_group);
-		if (err)
-			pr_err("HugeTLB: Unable to add hstate %s\n", h->name);
-	}
-
-#ifdef CONFIG_NUMA
-	hugetlb_sysfs_initialized = true;
-#endif
-	hugetlb_register_all_nodes();
-}
-
-#ifdef CONFIG_SYSCTL
-static void hugetlb_sysctl_init(void);
-#else
-static inline void hugetlb_sysctl_init(void) { }
-#endif
-
 static int __init hugetlb_init(void)
 {
 	int i;
@@ -5092,131 +4542,6 @@ static unsigned int allowed_mems_nr(struct hstate *h)
 	return nr;
 }
 
-#ifdef CONFIG_SYSCTL
-static int proc_hugetlb_doulongvec_minmax(const struct ctl_table *table, int write,
-					  void *buffer, size_t *length,
-					  loff_t *ppos, unsigned long *out)
-{
-	struct ctl_table dup_table;
-
-	/*
-	 * In order to avoid races with __do_proc_doulongvec_minmax(), we
-	 * can duplicate the @table and alter the duplicate of it.
-	 */
-	dup_table = *table;
-	dup_table.data = out;
-
-	return proc_doulongvec_minmax(&dup_table, write, buffer, length, ppos);
-}
-
-static int hugetlb_sysctl_handler_common(bool obey_mempolicy,
-			  const struct ctl_table *table, int write,
-			  void *buffer, size_t *length, loff_t *ppos)
-{
-	struct hstate *h = &default_hstate;
-	unsigned long tmp = h->max_huge_pages;
-	int ret;
-
-	if (!hugepages_supported())
-		return -EOPNOTSUPP;
-
-	ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
-					     &tmp);
-	if (ret)
-		goto out;
-
-	if (write)
-		ret = __nr_hugepages_store_common(obey_mempolicy, h,
-						  NUMA_NO_NODE, tmp, *length);
-out:
-	return ret;
-}
-
-static int hugetlb_sysctl_handler(const struct ctl_table *table, int write,
-			  void *buffer, size_t *length, loff_t *ppos)
-{
-
-	return hugetlb_sysctl_handler_common(false, table, write,
-							buffer, length, ppos);
-}
-
-#ifdef CONFIG_NUMA
-static int hugetlb_mempolicy_sysctl_handler(const struct ctl_table *table, int write,
-			  void *buffer, size_t *length, loff_t *ppos)
-{
-	return hugetlb_sysctl_handler_common(true, table, write,
-							buffer, length, ppos);
-}
-#endif /* CONFIG_NUMA */
-
-static int hugetlb_overcommit_handler(const struct ctl_table *table, int write,
-			  void *buffer, size_t *length, loff_t *ppos)
-{
-	struct hstate *h = &default_hstate;
-	unsigned long tmp;
-	int ret;
-
-	if (!hugepages_supported())
-		return -EOPNOTSUPP;
-
-	tmp = h->nr_overcommit_huge_pages;
-
-	if (write && hstate_is_gigantic(h))
-		return -EINVAL;
-
-	ret = proc_hugetlb_doulongvec_minmax(table, write, buffer, length, ppos,
-					     &tmp);
-	if (ret)
-		goto out;
-
-	if (write) {
-		spin_lock_irq(&hugetlb_lock);
-		h->nr_overcommit_huge_pages = tmp;
-		spin_unlock_irq(&hugetlb_lock);
-	}
-out:
-	return ret;
-}
-
-static const struct ctl_table hugetlb_table[] = {
-	{
-		.procname	= "nr_hugepages",
-		.data		= NULL,
-		.maxlen		= sizeof(unsigned long),
-		.mode		= 0644,
-		.proc_handler	= hugetlb_sysctl_handler,
-	},
-#ifdef CONFIG_NUMA
-	{
-		.procname	= "nr_hugepages_mempolicy",
-		.data		= NULL,
-		.maxlen		= sizeof(unsigned long),
-		.mode		= 0644,
-		.proc_handler	= &hugetlb_mempolicy_sysctl_handler,
-	},
-#endif
-	{
-		.procname	= "hugetlb_shm_group",
-		.data		= &sysctl_hugetlb_shm_group,
-		.maxlen		= sizeof(gid_t),
-		.mode		= 0644,
-		.proc_handler	= proc_dointvec,
-	},
-	{
-		.procname	= "nr_overcommit_hugepages",
-		.data		= NULL,
-		.maxlen		= sizeof(unsigned long),
-		.mode		= 0644,
-		.proc_handler	= hugetlb_overcommit_handler,
-	},
-};
-
-static void __init hugetlb_sysctl_init(void)
-{
-	register_sysctl_init("vm", hugetlb_table);
-}
-#endif /* CONFIG_SYSCTL */
-
 void hugetlb_report_meminfo(struct seq_file *m)
 {
 	struct hstate *h;
@@ -5521,32 +4846,6 @@ static void set_huge_ptep_maybe_writable(struct vm_area_struct *vma,
 		set_huge_ptep_writable(vma, address, ptep);
 }
 
-bool is_hugetlb_entry_migration(pte_t pte)
-{
-	swp_entry_t swp;
-
-	if (huge_pte_none(pte) || pte_present(pte))
-		return false;
-	swp = pte_to_swp_entry(pte);
-	if (is_migration_entry(swp))
-		return true;
-	else
-		return false;
-}
-
-bool is_hugetlb_entry_hwpoisoned(pte_t pte)
-{
-	swp_entry_t swp;
-
-	if (huge_pte_none(pte) || pte_present(pte))
-		return false;
-	swp = pte_to_swp_entry(pte);
-	if (is_hwpoison_entry(swp))
-		return true;
-	else
-		return false;
-}
-
 static void
 hugetlb_install_folio(struct vm_area_struct *vma, pte_t *ptep, unsigned long addr,
 		      struct folio *new_folio, pte_t old, unsigned long sz)
@@ -5575,6 +4874,7 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 	unsigned long npages = pages_per_huge_page(h);
 	struct mmu_notifier_range range;
 	unsigned long last_addr_mask;
+	softleaf_t softleaf;
 	int ret = 0;
 
 	if (cow) {
@@ -5622,26 +4922,26 @@ int copy_hugetlb_page_range(struct mm_struct *dst, struct mm_struct *src,
 		entry = huge_ptep_get(src_vma->vm_mm, addr, src_pte);
again:
 		if (huge_pte_none(entry)) {
-			/*
-			 * Skip if src entry none.
-			 */
-			;
-		} else if (unlikely(is_hugetlb_entry_hwpoisoned(entry))) {
+			/* Skip if src entry none. */
+			goto next;
+		}
+
+		softleaf = softleaf_from_pte(entry);
+		if (unlikely(softleaf_is_hwpoison(softleaf))) {
 			if (!userfaultfd_wp(dst_vma))
 				entry = huge_pte_clear_uffd_wp(entry);
 			set_huge_pte_at(dst, addr, dst_pte, entry, sz);
-		} else if (unlikely(is_hugetlb_entry_migration(entry))) {
-			swp_entry_t swp_entry = pte_to_swp_entry(entry);
+		} else if (unlikely(softleaf_is_migration(softleaf))) {
 			bool uffd_wp = pte_swp_uffd_wp(entry);
 
-			if (!is_readable_migration_entry(swp_entry) && cow) {
+			if (!softleaf_is_migration_read(softleaf) && cow) {
 				/*
 				 * COW mappings require pages in both
 				 * parent and child to be set to read.
 				 */
-				swp_entry = make_readable_migration_entry(
-							swp_offset(swp_entry));
-				entry = swp_entry_to_pte(swp_entry);
+				softleaf = make_readable_migration_entry(
+							swp_offset(softleaf));
+				entry = swp_entry_to_pte(softleaf);
 				if (userfaultfd_wp(src_vma) && uffd_wp)
 					entry = pte_swp_mkuffd_wp(entry);
 				set_huge_pte_at(src, addr, src_pte, entry, sz);
@@ -5649,9 +4949,8 @@ again:
 			if (!userfaultfd_wp(dst_vma))
 				entry = huge_pte_clear_uffd_wp(entry);
 			set_huge_pte_at(dst, addr, dst_pte, entry, sz);
-		} else if (unlikely(is_pte_marker(entry))) {
-			pte_marker marker = copy_pte_marker(
-				pte_to_swp_entry(entry), dst_vma);
+		} else if (unlikely(pte_is_marker(entry))) {
+			const pte_marker marker = copy_pte_marker(softleaf, dst_vma);
 
 			if (marker)
 				set_huge_pte_at(dst, addr, dst_pte,
@@ -5708,9 +5007,7 @@ again:
 			}
 			hugetlb_install_folio(dst_vma, dst_pte, addr,
 					      new_folio, src_pte_old, sz);
-			spin_unlock(src_ptl);
-			spin_unlock(dst_ptl);
-			continue;
+			goto next;
 		}
 
 		if (cow) {
@@ -5731,6 +5028,8 @@ again:
 			set_huge_pte_at(dst, addr, dst_pte, entry, sz);
 			hugetlb_count_add(npages, dst);
 		}
+
+next:
 		spin_unlock(src_ptl);
 		spin_unlock(dst_ptl);
 	}
@@ -5767,13 +5066,13 @@ static void move_huge_pte(struct vm_area_struct *vma, unsigned long old_addr,
 
 	pte = huge_ptep_get_and_clear(mm, old_addr, src_pte, sz);
 
-	if (need_clear_uffd_wp && pte_marker_uffd_wp(pte))
+	if (need_clear_uffd_wp && pte_is_uffd_wp_marker(pte)) {
 		huge_pte_clear(mm, new_addr, dst_pte, sz);
-	else {
+	} else {
 		if (need_clear_uffd_wp) {
 			if (pte_present(pte))
 				pte = huge_pte_clear_uffd_wp(pte);
-			else if (is_swap_pte(pte))
+			else
 				pte = pte_swp_clear_uffd_wp(pte);
 		}
 		set_huge_pte_at(mm, new_addr, dst_pte, pte, sz);
@@ -6007,7 +5306,7 @@ void __unmap_hugepage_range(struct mmu_gather *tlb, struct vm_area_struct *vma,
 	/*
 	 * If we unshared PMDs, the TLB flush was not recorded in mmu_gather. We
 	 * could defer the flush until now, since by holding i_mmap_rwsem we
-	 * guaranteed that the last refernece would not be dropped. But we must
+	 * guaranteed that the last reference would not be dropped. But we must
 	 * do the flushing before we return, as otherwise i_mmap_rwsem will be
 	 * dropped and the last reference to the shared PMDs page might be
 	 * dropped as well.
@@ -6586,7 +5885,7 @@ static vm_fault_t hugetlb_no_page(struct address_space *mapping,
 	 * If this pte was previously wr-protected, keep it wr-protected even
 	 * if populated.
	 */
-	if (unlikely(pte_marker_uffd_wp(vmf->orig_pte)))
+	if (unlikely(pte_is_uffd_wp_marker(vmf->orig_pte)))
 		new_pte = huge_pte_mkuffd_wp(new_pte);
 	set_huge_pte_at(mm, vmf->address, vmf->pte, new_pte, huge_page_size(h));
@@ -6712,36 +6011,37 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 	}
 
 	vmf.orig_pte = huge_ptep_get(mm, vmf.address, vmf.pte);
-	if (huge_pte_none_mostly(vmf.orig_pte)) {
-		if (is_pte_marker(vmf.orig_pte)) {
-			pte_marker marker =
-				pte_marker_get(pte_to_swp_entry(vmf.orig_pte));
-
-			if (marker & PTE_MARKER_POISONED) {
-				ret = VM_FAULT_HWPOISON_LARGE |
-				      VM_FAULT_SET_HINDEX(hstate_index(h));
-				goto out_mutex;
-			} else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) {
-				/* This isn't supported in hugetlb. */
-				ret = VM_FAULT_SIGSEGV;
-				goto out_mutex;
-			}
-		}
-
+	if (huge_pte_none(vmf.orig_pte))
 		/*
-		 * Other PTE markers should be handled the same way as none PTE.
-		 *
 		 * hugetlb_no_page will drop vma lock and hugetlb fault
 		 * mutex internally, which make us return immediately.
		 */
 		return hugetlb_no_page(mapping, &vmf);
+
+	if (pte_is_marker(vmf.orig_pte)) {
+		const pte_marker marker =
+			softleaf_to_marker(softleaf_from_pte(vmf.orig_pte));
+
+		if (marker & PTE_MARKER_POISONED) {
+			ret = VM_FAULT_HWPOISON_LARGE |
+			      VM_FAULT_SET_HINDEX(hstate_index(h));
+			goto out_mutex;
+		} else if (WARN_ON_ONCE(marker & PTE_MARKER_GUARD)) {
+			/* This isn't supported in hugetlb. */
+			ret = VM_FAULT_SIGSEGV;
+			goto out_mutex;
+		}
+
+		return hugetlb_no_page(mapping, &vmf);
 	}
 
 	ret = 0;
 
 	/* Not present, either a migration or a hwpoisoned entry */
-	if (!pte_present(vmf.orig_pte)) {
-		if (is_hugetlb_entry_migration(vmf.orig_pte)) {
+	if (!pte_present(vmf.orig_pte) && !huge_pte_none(vmf.orig_pte)) {
+		const softleaf_t softleaf = softleaf_from_pte(vmf.orig_pte);
+
+		if (softleaf_is_migration(softleaf)) {
 			/*
 			 * Release the hugetlb fault lock now, but retain
 			 * the vma lock, because it is needed to guard the
@@ -6752,9 +6052,12 @@ vm_fault_t hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
 			mutex_unlock(&hugetlb_fault_mutex_table[hash]);
 			migration_entry_wait_huge(vma, vmf.address, vmf.pte);
 			return 0;
-		} else if (is_hugetlb_entry_hwpoisoned(vmf.orig_pte))
+		}
+		if (softleaf_is_hwpoison(softleaf)) {
 			ret = VM_FAULT_HWPOISON_LARGE |
 			      VM_FAULT_SET_HINDEX(hstate_index(h));
+		}
+
 		goto out_mutex;
 	}
 
@@ -6903,6 +6206,7 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 	int ret = -ENOMEM;
 	struct folio *folio;
 	bool folio_in_pagecache = false;
+	pte_t dst_ptep;
 
 	if (uffd_flags_mode_is(flags, MFILL_ATOMIC_POISON)) {
 		ptl = huge_pte_lock(h, dst_mm, dst_pte);
@@ -7042,13 +6346,14 @@ int hugetlb_mfill_atomic_pte(pte_t *dst_pte,
 	if (folio_test_hwpoison(folio))
 		goto out_release_unlock;
 
+	ret = -EEXIST;
+
+	dst_ptep = huge_ptep_get(dst_mm, dst_addr, dst_pte);
 	/*
-	 * We allow to overwrite a pte marker: consider when both MISSING|WP
-	 * registered, we firstly wr-protect a none pte which has no page cache
-	 * page backing it, then access the page.
+	 * See comment about UFFD marker overwriting in
+	 * mfill_atomic_install_pte().
	 */
-	ret = -EEXIST;
-	if (!huge_pte_none_mostly(huge_ptep_get(dst_mm, dst_addr, dst_pte)))
+	if (!huge_pte_none(dst_ptep) && !pte_is_uffd_marker(dst_ptep))
 		goto out_release_unlock;
 
 	if (folio_in_pagecache)
@@ -7134,7 +6439,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 	i_mmap_lock_write(vma->vm_file->f_mapping);
 	last_addr_mask = hugetlb_mask_last_page(h);
 	for (; address < end; address += psize) {
+		softleaf_t entry;
 		spinlock_t *ptl;
+
 		ptep = hugetlb_walk(vma, address, psize);
 		if (!ptep) {
 			if (!uffd_wp) {
@@ -7166,14 +6473,23 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 			continue;
 		}
 		pte = huge_ptep_get(mm, address, ptep);
-		if (unlikely(is_hugetlb_entry_hwpoisoned(pte))) {
+		if (huge_pte_none(pte)) {
+			if (unlikely(uffd_wp))
+				/* Safe to modify directly (none->non-present). */
+				set_huge_pte_at(mm, address, ptep,
+						make_pte_marker(PTE_MARKER_UFFD_WP),
+						psize);
+			goto next;
+		}
+
+		entry = softleaf_from_pte(pte);
+		if (unlikely(softleaf_is_hwpoison(entry))) {
 			/* Nothing to do. */
-		} else if (unlikely(is_hugetlb_entry_migration(pte))) {
-			swp_entry_t entry = pte_to_swp_entry(pte);
-			struct folio *folio = pfn_swap_entry_folio(entry);
+		} else if (unlikely(softleaf_is_migration(entry))) {
+			struct folio *folio = softleaf_to_folio(entry);
 			pte_t newpte = pte;
 
-			if (is_writable_migration_entry(entry)) {
+			if (softleaf_is_migration_write(entry)) {
 				if (folio_test_anon(folio))
 					entry = make_readable_exclusive_migration_entry(
 								swp_offset(entry));
@@ -7190,17 +6506,17 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 				newpte = pte_swp_clear_uffd_wp(newpte);
 			if (!pte_same(pte, newpte))
 				set_huge_pte_at(mm, address, ptep, newpte, psize);
-		} else if (unlikely(is_pte_marker(pte))) {
+		} else if (unlikely(pte_is_marker(pte))) {
 			/*
 			 * Do nothing on a poison marker; page is
-			 * corrupted, permissons do not apply.  Here
+			 * corrupted, permissions do not apply.  Here
 			 * pte_marker_uffd_wp()==true implies !poison
 			 * because they're mutual exclusive.
			 */
-			if (pte_marker_uffd_wp(pte) && uffd_wp_resolve)
+			if (pte_is_uffd_wp_marker(pte) && uffd_wp_resolve)
 				/* Safe to modify directly (non-present->none). */
 				huge_pte_clear(mm, address, ptep, psize);
-		} else if (!huge_pte_none(pte)) {
+		} else {
 			pte_t old_pte;
 			unsigned int shift = huge_page_shift(hstate_vma(vma));
 
@@ -7213,16 +6529,10 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
 				pte = huge_pte_clear_uffd_wp(pte);
 			huge_ptep_modify_prot_commit(vma, address, ptep, old_pte, pte);
 			pages++;
-		} else {
-			/* None pte */
-			if (unlikely(uffd_wp))
-				/* Safe to modify directly (none->non-present). */
-				set_huge_pte_at(mm, address, ptep,
-						make_pte_marker(PTE_MARKER_UFFD_WP),
-						psize);
 		}
-		spin_unlock(ptl);
 
+next:
+		spin_unlock(ptl);
 		cond_resched();
 	}
 	/*
@@ -7259,9 +6569,9 @@ long hugetlb_change_protection(struct vm_area_struct *vma,
  */
 long hugetlb_reserve_pages(struct inode *inode,
-					long from, long to,
-					struct vm_area_struct *vma,
-					vm_flags_t vm_flags)
+			   long from, long to,
+			   struct vm_area_desc *desc,
+			   vm_flags_t vm_flags)
 {
 	long chg = -1, add = -1, spool_resv, gbl_resv;
 	struct hstate *h = hstate_inode(inode);
@@ -7277,12 +6587,6 @@ long hugetlb_reserve_pages(struct inode *inode,
 	}
 
 	/*
-	 * vma specific semaphore used for pmd sharing and fault/truncation
-	 * synchronization
-	 */
-	hugetlb_vma_lock_alloc(vma);
-
-	/*
 	 * Only apply hugepage reservation if asked. At fault time, an
 	 * attempt will be made for VM_NORESERVE to allocate a page
 	 * without using reserves
@@ -7294,9 +6598,9 @@ long hugetlb_reserve_pages(struct inode *inode,
 	 * Shared mappings base their reservation on the number of pages that
 	 * are already allocated on behalf of the file. Private mappings need
 	 * to reserve the full area even if read-only as mprotect() may be
-	 * called to make the mapping read-write. Assume !vma is a shm mapping
+	 * called to make the mapping read-write. Assume !desc is a shm mapping
	 */
-	if (!vma || vma->vm_flags & VM_MAYSHARE) {
+	if (!desc || desc->vm_flags & VM_MAYSHARE) {
 		/*
 		 * resv_map can not be NULL as hugetlb_reserve_pages is only
 		 * called for inodes for which resv_maps were created (see
@@ -7313,8 +6617,8 @@ long hugetlb_reserve_pages(struct inode *inode,
 
 		chg = to - from;
 
-		set_vma_resv_map(vma, resv_map);
-		set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
+		set_vma_desc_resv_map(desc, resv_map);
+		set_vma_desc_resv_flags(desc, HPAGE_RESV_OWNER);
 	}
 
 	if (chg < 0)
		goto out_err;
@@ -7324,7 +6628,7 @@ long hugetlb_reserve_pages(struct inode *inode,
 			chg * pages_per_huge_page(h), &h_cg) < 0)
 		goto out_err;
 
-	if (vma && !(vma->vm_flags & VM_MAYSHARE) && h_cg) {
+	if (desc && !(desc->vm_flags & VM_MAYSHARE) && h_cg) {
 		/* For private mappings, the hugetlb_cgroup uncharge info hangs
 		 * of the resv_map.
 		 */
@@ -7358,7 +6662,7 @@ long hugetlb_reserve_pages(struct inode *inode,
 	 * consumed reservations are stored in the map. Hence, nothing
 	 * else has to be done for private mappings here
 	 */
-	if (!vma || vma->vm_flags & VM_MAYSHARE) {
+	if (!desc || desc->vm_flags & VM_MAYSHARE) {
 		add = region_add(resv_map, from, to, regions_needed, h, h_cg);
 
 		if (unlikely(add < 0)) {
@@ -7412,16 +6716,15 @@ out_uncharge_cgroup:
 	hugetlb_cgroup_uncharge_cgroup_rsvd(hstate_index(h),
 					    chg * pages_per_huge_page(h), h_cg);
out_err:
-	hugetlb_vma_lock_free(vma);
-	if (!vma || vma->vm_flags & VM_MAYSHARE)
+	if (!desc || desc->vm_flags & VM_MAYSHARE)
 		/* Only call region_abort if the region_chg succeeded but the
 		 * region_add failed or didn't run.
 		 */
 		if (chg >= 0 && add < 0)
 			region_abort(resv_map, from, to, regions_needed);
-	if (vma && is_vma_resv_set(vma, HPAGE_RESV_OWNER)) {
+	if (desc && is_vma_desc_resv_set(desc, HPAGE_RESV_OWNER)) {
 		kref_put(&resv_map->refs, resv_map_release);
-		set_vma_resv_map(vma, NULL);
+		set_vma_desc_resv_map(desc, NULL);
 	}
	return chg < 0 ? chg : add < 0 ? add : -EINVAL;
 }
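
A note on hstate_is_gigantic_no_runtime(), which this diff substitutes for the open-coded check at several call sites: the helper itself is defined outside this file, presumably in the new hugetlb_internal.h. Judging purely from the one-for-one replacements above, it is presumably equivalent to:

/*
 * Sketch only -- the real definition lives outside this diff.
 * A gigantic hstate whose pool cannot be changed at runtime.
 */
static inline bool hstate_is_gigantic_no_runtime(struct hstate *h)
{
	return hstate_is_gigantic(h) && !gigantic_page_runtime_supported();
}

One call site changes meaning slightly: alloc_surplus_hugetlb_folio() previously refused any gigantic hstate, and now refuses only those without runtime support.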
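The removed is_hugetlb_entry_migration() and is_hugetlb_entry_hwpoisoned() helpers are superseded by the softleaf API from <linux/leafops.h>. Assuming softleaf_from_pte() yields an entry for which the softleaf_is_*() predicates are false on none and present ptes (as the call sites above imply, since they test huge_pte_none() first), the old helpers map onto the new API roughly as:

/* Sketch of the equivalence; both helpers are deleted by this patch. */
static inline bool is_hugetlb_entry_migration(pte_t pte)
{
	return softleaf_is_migration(softleaf_from_pte(pte));
}

static inline bool is_hugetlb_entry_hwpoisoned(pte_t pte)
{
	return softleaf_is_hwpoison(softleaf_from_pte(pte));
}

Decoding the pte once into a softleaf_t and testing the cached value, as copy_hugetlb_page_range() and hugetlb_change_protection() now do, avoids converting the same pte repeatedly.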
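hugetlb_vma_lock_alloc() becomes non-static, gains a return value, and is no longer called from hugetlb_reserve_pages(); allocation presumably moves to a caller outside this file. A hypothetical caller sketch (the function name here is illustrative, not from the patch):

static int hypothetical_hugetlb_mmap_hook(struct vm_area_struct *vma)
{
	/*
	 * Returns 0 for !VM_MAYSHARE vmas too, where no lock is needed;
	 * -EINVAL means kmalloc() failed or vm_private_data was already set.
	 */
	return hugetlb_vma_lock_alloc(vma);
}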
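The new set_vma_desc_resv_map()/set_vma_desc_resv_flags()/is_vma_desc_resv_set() helpers pack a resv_map pointer and HPAGE_RESV_* flag bits into the single desc->private_data word, mirroring what set_vma_private_data() does on the vma side. A sketch of the scheme, assuming (as with the existing vma flags) that HPAGE_RESV_OWNER occupies a low pointer bit that kmalloc() alignment keeps clear:

/* Sketch: pointer and flag bits share one word, as in the helpers above. */
static void sketch_private_reservation(struct vm_area_desc *desc,
				       struct resv_map *map)
{
	set_vma_desc_resv_map(desc, map);		 /* private_data = map */
	set_vma_desc_resv_flags(desc, HPAGE_RESV_OWNER); /* OR in the flag bit */

	if (is_vma_desc_resv_set(desc, HPAGE_RESV_OWNER))
		;	/* this mapping owns its reservation */
}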
