From 2f882800f6ab57b50b7c23a376a452a808025f37 Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Tue, 7 Dec 2021 20:06:21 +0100
Subject: s390/pgalloc: add virt/phys address handling to base asce functions

The base asce functions create/free page tables open-coded, to make sure
that the returned asce and page tables do not use any enhanced DAT
features, such as large pages. This is required for some I/O functions
that use an asce, for example some service call requests.

Handling of virtual vs physical addresses is missing; therefore add
that now.

Note: this currently doesn't fix a real bug, since virtual addresses
are identical to physical ones.
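To illustrate the pattern the patch establishes: a table entry holds the
physical origin of the next-level table, while the kernel itself keeps
walking via virtual addresses. A minimal stand-alone sketch, assuming a
simple direct-map offset; every name and constant below is made up for
the illustration and is not kernel code:

	#include <stdio.h>
	#include <stdint.h>

	#define DIRECT_MAP_OFFSET	0x10000000UL	/* hypothetical direct-map offset */
	#define ENTRY_TYPE_BITS		0x20UL		/* hypothetical entry type bits */
	#define ORIGIN_MASK		(~0xfffUL)	/* hypothetical origin mask */

	static uintptr_t pa(void *virt)  { return (uintptr_t)virt - DIRECT_MAP_OFFSET; }
	static void *va(uintptr_t phys)  { return (void *)(phys + DIRECT_MAP_OFFSET); }

	int main(void)
	{
		_Alignas(4096) static char table[4096];	/* pretend page table */
		/* the entry stores the physical origin plus type bits ... */
		uintptr_t entry = pa(table) | ENTRY_TYPE_BITS;
		/* ... and is converted back to virtual before the next walk step */
		void *walk = va(entry & ORIGIN_MASK);

		printf("entry=%#lx walk=%p\n", (unsigned long)entry, walk);
		return walk == (void *)table ? 0 : 1;
	}

On s390 the two address kinds currently coincide, which is why the
missing conversions have not caused a real bug so far.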
Reviewed-by: Alexander Gordeev
Signed-off-by: Heiko Carstens
---
 arch/s390/mm/pgalloc.c | 26 +++++++++++++-------------
 1 file changed, 13 insertions(+), 13 deletions(-)

(limited to 'arch/s390/mm/pgalloc.c')

diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 781965f7210e..2e33dfb38a76 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -409,9 +409,9 @@ static int base_segment_walk(unsigned long origin, unsigned long addr,
 			table = base_pgt_alloc();
 			if (!table)
 				return -ENOMEM;
-			*ste = table | _SEGMENT_ENTRY;
+			*ste = __pa(table) | _SEGMENT_ENTRY;
 		}
-		table = *ste & _SEGMENT_ENTRY_ORIGIN;
+		table = (unsigned long)__va(*ste & _SEGMENT_ENTRY_ORIGIN);
 		rc = base_page_walk(table, addr, next, alloc);
 		if (rc)
 			return rc;
@@ -438,9 +438,9 @@ static int base_region3_walk(unsigned long origin, unsigned long addr,
 			table = base_crst_alloc(_SEGMENT_ENTRY_EMPTY);
 			if (!table)
 				return -ENOMEM;
-			*rtte = table | _REGION3_ENTRY;
+			*rtte = __pa(table) | _REGION3_ENTRY;
 		}
-		table = *rtte & _REGION_ENTRY_ORIGIN;
+		table = (unsigned long)__va(*rtte & _REGION_ENTRY_ORIGIN);
 		rc = base_segment_walk(table, addr, next, alloc);
 		if (rc)
 			return rc;
@@ -466,9 +466,9 @@ static int base_region2_walk(unsigned long origin, unsigned long addr,
 			table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
 			if (!table)
 				return -ENOMEM;
-			*rste = table | _REGION2_ENTRY;
+			*rste = __pa(table) | _REGION2_ENTRY;
 		}
-		table = *rste & _REGION_ENTRY_ORIGIN;
+		table = (unsigned long)__va(*rste & _REGION_ENTRY_ORIGIN);
 		rc = base_region3_walk(table, addr, next, alloc);
 		if (rc)
 			return rc;
@@ -494,9 +494,9 @@ static int base_region1_walk(unsigned long origin, unsigned long addr,
 			table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
 			if (!table)
 				return -ENOMEM;
-			*rfte = table | _REGION1_ENTRY;
+			*rfte = __pa(table) | _REGION1_ENTRY;
 		}
-		table = *rfte & _REGION_ENTRY_ORIGIN;
+		table = (unsigned long)__va(*rfte & _REGION_ENTRY_ORIGIN);
 		rc = base_region2_walk(table, addr, next, alloc);
 		if (rc)
 			return rc;
@@ -515,7 +515,7 @@ static int base_region1_walk(unsigned long origin, unsigned long addr,
  */
 void base_asce_free(unsigned long asce)
 {
-	unsigned long table = asce & _ASCE_ORIGIN;
+	unsigned long table = (unsigned long)__va(asce & _ASCE_ORIGIN);
 
 	if (!asce)
 		return;
@@ -578,25 +578,25 @@ unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
 		if (!table)
 			return 0;
 		rc = base_segment_walk(table, addr, end, 1);
-		asce = table | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
+		asce = __pa(table) | _ASCE_TYPE_SEGMENT | _ASCE_TABLE_LENGTH;
 	} else if (end <= _REGION2_SIZE) {
 		table = base_crst_alloc(_REGION3_ENTRY_EMPTY);
 		if (!table)
 			return 0;
 		rc = base_region3_walk(table, addr, end, 1);
-		asce = table | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
+		asce = __pa(table) | _ASCE_TYPE_REGION3 | _ASCE_TABLE_LENGTH;
 	} else if (end <= _REGION1_SIZE) {
 		table = base_crst_alloc(_REGION2_ENTRY_EMPTY);
 		if (!table)
 			return 0;
 		rc = base_region2_walk(table, addr, end, 1);
-		asce = table | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
+		asce = __pa(table) | _ASCE_TYPE_REGION2 | _ASCE_TABLE_LENGTH;
 	} else {
 		table = base_crst_alloc(_REGION1_ENTRY_EMPTY);
 		if (!table)
 			return 0;
 		rc = base_region1_walk(table, addr, end, 1);
-		asce = table | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
+		asce = __pa(table) | _ASCE_TYPE_REGION1 | _ASCE_TABLE_LENGTH;
 	}
 	if (rc) {
 		base_asce_free(asce);
--
cgit

From da001fce26bec9b5933162b32b3cc65923ad6c17 Mon Sep 17 00:00:00 2001
From: Heiko Carstens
Date: Thu, 9 Dec 2021 12:01:25 +0100
Subject: s390/pgalloc: use pointers instead of unsigned long values

After adding the missing __va()/__pa() calls to the base asce functions
there are even more casts in the code than before. Make the code more
readable by passing and using pointers to page tables, instead of
unsigned long values for the same purpose. This allows getting rid of
nearly all casts within the code.
Suggested-by: Alexander Gordeev
Reviewed-by: Alexander Gordeev
Signed-off-by: Heiko Carstens
---
 arch/s390/mm/pgalloc.c | 64 +++++++++++++++++++++++++-------------------------
 1 file changed, 32 insertions(+), 32 deletions(-)

(limited to 'arch/s390/mm/pgalloc.c')

diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 2e33dfb38a76..82a2617362d5 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -322,34 +322,34 @@ void __tlb_remove_table(void *_table)
 
 static struct kmem_cache *base_pgt_cache;
 
-static unsigned long base_pgt_alloc(void)
+static unsigned long *base_pgt_alloc(void)
 {
-	u64 *table;
+	unsigned long *table;
 
 	table = kmem_cache_alloc(base_pgt_cache, GFP_KERNEL);
 	if (table)
-		memset64(table, _PAGE_INVALID, PTRS_PER_PTE);
-	return (unsigned long) table;
+		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
+	return table;
 }
 
-static void base_pgt_free(unsigned long table)
+static void base_pgt_free(unsigned long *table)
 {
-	kmem_cache_free(base_pgt_cache, (void *) table);
+	kmem_cache_free(base_pgt_cache, table);
 }
 
-static unsigned long base_crst_alloc(unsigned long val)
+static unsigned long *base_crst_alloc(unsigned long val)
 {
-	unsigned long table;
+	unsigned long *table;
 
-	table = __get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
+	table = (unsigned long *)__get_free_pages(GFP_KERNEL, CRST_ALLOC_ORDER);
 	if (table)
-		crst_table_init((unsigned long *)table, val);
+		crst_table_init(table, val);
 	return table;
 }
 
-static void base_crst_free(unsigned long table)
+static void base_crst_free(unsigned long *table)
 {
-	free_pages(table, CRST_ALLOC_ORDER);
+	free_pages((unsigned long)table, CRST_ALLOC_ORDER);
 }
 
 #define BASE_ADDR_END_FUNC(NAME, SIZE)					\
@@ -377,14 +377,14 @@ static inline unsigned long base_lra(unsigned long address)
 	return real;
 }
 
-static int base_page_walk(unsigned long origin, unsigned long addr,
+static int base_page_walk(unsigned long *origin, unsigned long addr,
 			  unsigned long end, int alloc)
 {
 	unsigned long *pte, next;
 
 	if (!alloc)
 		return 0;
-	pte = (unsigned long *) origin;
+	pte = origin;
 	pte += (addr & _PAGE_INDEX) >> _PAGE_SHIFT;
 	do {
 		next = base_page_addr_end(addr, end);
@@ -393,13 +393,13 @@ static int base_page_walk(unsigned long origin, unsigned long addr,
 	return 0;
 }
 
-static int base_segment_walk(unsigned long origin, unsigned long addr,
+static int base_segment_walk(unsigned long *origin, unsigned long addr,
 			     unsigned long end, int alloc)
 {
-	unsigned long *ste, next, table;
+	unsigned long *ste, next, *table;
 	int rc;
 
-	ste = (unsigned long *) origin;
+	ste = origin;
 	ste += (addr & _SEGMENT_INDEX) >> _SEGMENT_SHIFT;
 	do {
 		next = base_segment_addr_end(addr, end);
@@ -411,7 +411,7 @@ static int base_segment_walk(unsigned long origin, unsigned long addr,
 				return -ENOMEM;
 			*ste = __pa(table) | _SEGMENT_ENTRY;
 		}
-		table = (unsigned long)__va(*ste & _SEGMENT_ENTRY_ORIGIN);
+		table = __va(*ste & _SEGMENT_ENTRY_ORIGIN);
 		rc = base_page_walk(table, addr, next, alloc);
 		if (rc)
 			return rc;
@@ -422,13 +422,13 @@ static int base_segment_walk(unsigned long origin, unsigned long addr,
 	return 0;
 }
 
-static int base_region3_walk(unsigned long origin, unsigned long addr,
+static int base_region3_walk(unsigned long *origin, unsigned long addr,
 			     unsigned long end, int alloc)
 {
-	unsigned long *rtte, next, table;
+	unsigned long *rtte, next, *table;
 	int rc;
 
-	rtte = (unsigned long *) origin;
+	rtte = origin;
 	rtte += (addr & _REGION3_INDEX) >> _REGION3_SHIFT;
 	do {
 		next = base_region3_addr_end(addr, end);
@@ -440,7 +440,7 @@ static int base_region3_walk(unsigned long origin, unsigned long addr,
 				return -ENOMEM;
 			*rtte = __pa(table) | _REGION3_ENTRY;
 		}
-		table = (unsigned long)__va(*rtte & _REGION_ENTRY_ORIGIN);
+		table = __va(*rtte & _REGION_ENTRY_ORIGIN);
 		rc = base_segment_walk(table, addr, next, alloc);
 		if (rc)
 			return rc;
@@ -450,13 +450,13 @@ static int base_region3_walk(unsigned long origin, unsigned long addr,
 	return 0;
 }
 
-static int base_region2_walk(unsigned long origin, unsigned long addr,
+static int base_region2_walk(unsigned long *origin, unsigned long addr,
 			     unsigned long end, int alloc)
 {
-	unsigned long *rste, next, table;
+	unsigned long *rste, next, *table;
 	int rc;
 
-	rste = (unsigned long *) origin;
+	rste = origin;
 	rste += (addr & _REGION2_INDEX) >> _REGION2_SHIFT;
 	do {
 		next = base_region2_addr_end(addr, end);
@@ -468,7 +468,7 @@ static int base_region2_walk(unsigned long origin, unsigned long addr,
 				return -ENOMEM;
 			*rste = __pa(table) | _REGION2_ENTRY;
 		}
-		table = (unsigned long)__va(*rste & _REGION_ENTRY_ORIGIN);
+		table = __va(*rste & _REGION_ENTRY_ORIGIN);
 		rc = base_region3_walk(table, addr, next, alloc);
 		if (rc)
 			return rc;
@@ -478,13 +478,13 @@ static int base_region2_walk(unsigned long origin, unsigned long addr,
 	return 0;
 }
 
-static int base_region1_walk(unsigned long origin, unsigned long addr,
+static int base_region1_walk(unsigned long *origin, unsigned long addr,
 			     unsigned long end, int alloc)
 {
-	unsigned long *rfte, next, table;
+	unsigned long *rfte, next, *table;
 	int rc;
 
-	rfte = (unsigned long *) origin;
+	rfte = origin;
 	rfte += (addr & _REGION1_INDEX) >> _REGION1_SHIFT;
 	do {
 		next = base_region1_addr_end(addr, end);
@@ -496,7 +496,7 @@ static int base_region1_walk(unsigned long origin, unsigned long addr,
 				return -ENOMEM;
 			*rfte = __pa(table) | _REGION1_ENTRY;
 		}
-		table = (unsigned long)__va(*rfte & _REGION_ENTRY_ORIGIN);
+		table = __va(*rfte & _REGION_ENTRY_ORIGIN);
 		rc = base_region2_walk(table, addr, next, alloc);
 		if (rc)
 			return rc;
@@ -515,7 +515,7 @@ static int base_region1_walk(unsigned long origin, unsigned long addr,
  */
 void base_asce_free(unsigned long asce)
 {
-	unsigned long table = (unsigned long)__va(asce & _ASCE_ORIGIN);
+	unsigned long *table = __va(asce & _ASCE_ORIGIN);
 
 	if (!asce)
 		return;
@@ -567,7 +567,7 @@ static int base_pgt_cache_init(void)
  */
 unsigned long base_asce_alloc(unsigned long addr, unsigned long num_pages)
 {
-	unsigned long asce, table, end;
+	unsigned long asce, *table, end;
 	int rc;
 
 	if (base_pgt_cache_init())
--
cgit

From c2c224932fd0ee6854d6ebfc8d059c2bcad86606 Mon Sep 17 00:00:00 2001
From: Alexander Gordeev
Date: Thu, 4 Nov 2021 07:14:44 +0100
Subject: s390/mm: fix 2KB pgtable release race

There is a race on the concurrent 2KB-pgtable release paths when both
the upper and the lower halves of the containing parent page are freed,
one via page_table_free_rcu() + __tlb_remove_table(), and the other via
page_table_free(). The race might lead to a corruption when the list
item is removed in page_table_free() concurrently with __free_page()
in __tlb_remove_table().

Let's assume first the lower and next the upper 2KB-pgtables are freed
from a page. Since both halves of the page are allocated, the tracking
byte (bits 24-31 of the page _refcount) initially has the value 0x03:

CPU0                                     CPU1
----                                     ----
page_table_free_rcu() // lower half
{
	// _refcount[31..24] == 0x03
	...
	atomic_xor_bits(&page->_refcount,
			0x11U << (0 + 24));
	// _refcount[31..24] <= 0x12
	...
	table = table | (1U << 0);
	tlb_remove_table(tlb, table);
}
...
__tlb_remove_table()
{
	// _refcount[31..24] == 0x12
	mask = _table & 3;
	// mask <= 0x01
	...
                                         page_table_free() // upper half
                                         {
                                                 // _refcount[31..24] == 0x12
                                                 ...
                                                 atomic_xor_bits(&page->_refcount,
                                                                 1U << (1 + 24));
                                                 // _refcount[31..24] <= 0x10
	// mask <= 0x10
	...
	atomic_xor_bits(&page->_refcount,
			mask << (4 + 24));
	// _refcount[31..24] <= 0x00
	// mask <= 0x00
	...
	if (mask != 0)	// == false
		break;
	fallthrough;
	...
	if (mask & 3)	// == false
		...
	else
		__free_page(page);       list_del(&page->lru);
                                         ^^^^^^^^^^^^^^^^^^^^^
                                                 RACE!
                                         ^^^^^^^^^^^^^^^^^^^^^
                                         }
	...
}

The problem is that page_table_free() releases the page as a result of
the lower nibble being unset, while __tlb_remove_table() observes zero
too early. With this update page_table_free() uses the same logic as
page_table_free_rcu() + __tlb_remove_table(), and marks the fragment as
pending for removal in the upper nibble until after the list_del().

In other words, the parent page is considered unreferenced and safe to
release only when the lower nibble has already been cleared and
unsetting a bit in the upper nibble turns that nibble to zero.
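To make the fixed ordering concrete, a stand-alone model of the release
sequence for one half (illustrative only; xor_bits() below is a plain
stand-in for atomic_xor_bits() on bits 24-31 of page->_refcount):

	#include <stdio.h>

	static unsigned int refbyte = 0x03;	/* both 2KB halves allocated */

	static unsigned int xor_bits(unsigned int bits)
	{
		return refbyte ^= bits;	/* models the atomic_xor_bits() return value */
	}

	int main(void)
	{
		unsigned int bit = 0;	/* release the lower half */
		unsigned int mask;

		mask = xor_bits(0x11U << bit);	/* AA off, PP on: 0x03 -> 0x12 */
		printf("after first xor:  0x%02x\n", mask);
		/* list_add()/list_del() happen here, under mm_context_t::lock */
		mask = xor_bits(0x10U << bit);	/* PP off after list handling: 0x12 -> 0x02 */
		printf("after second xor: 0x%02x\n", mask);
		/* 0x02 != 0: the upper half is still in use, so no __free_page() */
		return 0;
	}

The page is released only by whoever observes the whole byte turn zero
on the second step, which can no longer happen before the list_del().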
Cc: stable@vger.kernel.org
Suggested-by: Vlastimil Babka
Reviewed-by: Gerald Schaefer
Signed-off-by: Alexander Gordeev
Signed-off-by: Heiko Carstens
---
 arch/s390/mm/pgalloc.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

(limited to 'arch/s390/mm/pgalloc.c')

diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 82a2617362d5..457c80924542 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -244,13 +244,15 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
 		/* Free 2K page table fragment of a 4K page */
 		bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
 		spin_lock_bh(&mm->context.lock);
-		mask = atomic_xor_bits(&page->_refcount, 1U << (bit + 24));
+		mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
 		mask >>= 24;
 		if (mask & 3)
 			list_add(&page->lru, &mm->context.pgtable_list);
 		else
 			list_del(&page->lru);
 		spin_unlock_bh(&mm->context.lock);
+		mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
+		mask >>= 24;
 		if (mask != 0)
 			return;
 	} else {
--
cgit

From 1194372db6f3c917c9c6f6907e8378cf1076c557 Mon Sep 17 00:00:00 2001
From: Alexander Gordeev
Date: Thu, 4 Nov 2021 07:14:45 +0100
Subject: s390/mm: better annotate 2KB pagetable fragments handling

Explicitly encode the immediate values of the pending-removal nibble
(bits 31-28) and the tracking nibble (bits 27-24) of the page refcount
whenever these nibbles are tested or changed, for better readability.

Also add some comments describing how the fragments are handled.
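A stand-alone check of the mutual-exclusivity invariant spelled out in
the comments added below (illustrative only; refbyte_valid() is a
made-up helper, not kernel code):

	#include <assert.h>
	#include <stdio.h>

	/* for each fragment, the pending (PP) and allocated (AA) bits exclude each other */
	static int refbyte_valid(unsigned int byte)
	{
		unsigned int aa = byte & 0x03U;		/* allocation nibble */
		unsigned int pp = (byte >> 4) & 0x03U;	/* pending-removal nibble */

		return (aa & pp) == 0x00U;
	}

	int main(void)
	{
		assert(refbyte_valid(0x03U));	/* both fragments allocated */
		assert(refbyte_valid(0x12U));	/* one pending, the other allocated */
		assert(!refbyte_valid(0x13U));	/* impossible combination */
		assert(!refbyte_valid(0x23U));	/* impossible combination */
		printf("invariant holds for the sampled values\n");
		return 0;
	}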
Reviewed-by: Gerald Schaefer
Signed-off-by: Alexander Gordeev
Signed-off-by: Heiko Carstens
---
 arch/s390/mm/pgalloc.c | 127 +++++++++++++++++++++++++++++++++++++++++--------
 1 file changed, 107 insertions(+), 20 deletions(-)

(limited to 'arch/s390/mm/pgalloc.c')

diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 457c80924542..5d5549843c5c 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -176,7 +176,75 @@ void page_table_free_pgste(struct page *page)
 #endif /* CONFIG_PGSTE */
 
 /*
- * page table entry allocation/free routines.
+ * A 2KB-pgtable is either the upper or the lower half of a normal page.
+ * The second half of the page may be unused or used as another
+ * 2KB-pgtable.
+ *
+ * Whenever possible the parent page for a new 2KB-pgtable is picked
+ * from the list of partially allocated pages mm_context_t::pgtable_list.
+ * In case the list is empty a new parent page is allocated and added to
+ * the list.
+ *
+ * When a parent page gets fully allocated it contains 2KB-pgtables in both
+ * upper and lower halves and is removed from mm_context_t::pgtable_list.
+ *
+ * When a 2KB-pgtable is freed from the fully allocated parent page that
+ * page turns partially allocated and is added to mm_context_t::pgtable_list.
+ *
+ * If a 2KB-pgtable is freed from the partially allocated parent page that
+ * page turns unused and gets removed from mm_context_t::pgtable_list.
+ * Furthermore, the unused parent page is released.
+ *
+ * As follows from the above, no unallocated or fully allocated parent
+ * pages are contained in mm_context_t::pgtable_list.
+ *
+ * The upper byte (bits 24-31) of the parent page _refcount is used
+ * for tracking contained 2KB-pgtables and has the following format:
+ *
+ *   PP  AA
+ * 01234567    upper byte (bits 24-31) of struct page::_refcount
+ *   ||  ||
+ *   ||  |+--- upper 2KB-pgtable is allocated
+ *   ||  +---- lower 2KB-pgtable is allocated
+ *   |+------- upper 2KB-pgtable is pending for removal
+ *   +-------- lower 2KB-pgtable is pending for removal
+ *
+ * (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
+ * using _refcount is possible).
+ *
+ * When a 2KB-pgtable is allocated the corresponding AA bit is set to 1.
+ * The parent page is either:
+ *   - added to mm_context_t::pgtable_list in case the second half of the
+ *     parent page is still unallocated;
+ *   - removed from mm_context_t::pgtable_list in case both halves of the
+ *     parent page are allocated;
+ * These operations are protected with mm_context_t::lock.
+ *
+ * When a 2KB-pgtable is deallocated the corresponding AA bit is set to 0
+ * and the corresponding PP bit is set to 1 in a single atomic operation.
+ * Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually
+ * exclusive and may never be both set to 1!
+ * The parent page is either:
+ *   - added to mm_context_t::pgtable_list in case the second half of the
+ *     parent page is still allocated;
+ *   - removed from mm_context_t::pgtable_list in case the second half of
+ *     the parent page is unallocated;
+ * These operations are protected with mm_context_t::lock.
+ *
+ * It is important to understand that mm_context_t::lock only protects
+ * mm_context_t::pgtable_list and AA bits, but not the parent page itself
+ * and PP bits.
+ *
+ * Releasing the parent page happens whenever the PP bit turns from 1 to 0,
+ * while both AA bits and the second PP bit are already unset. Then the
+ * parent page does not contain any 2KB-pgtable fragment anymore, and it
+ * has also been removed from mm_context_t::pgtable_list.
+ * It is safe to release the page, therefore.
+ *
+ * PGSTE memory spaces use full 4KB-pgtables and do not need most of the
+ * logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable
+ * while the PP bits are never used, nor is such a page added to or removed
+ * from mm_context_t::pgtable_list.
  */
 unsigned long *page_table_alloc(struct mm_struct *mm)
 {
@@ -192,14 +260,23 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 		page = list_first_entry(&mm->context.pgtable_list,
 					struct page, lru);
 		mask = atomic_read(&page->_refcount) >> 24;
-		mask = (mask | (mask >> 4)) & 3;
-		if (mask != 3) {
+		/*
+		 * The pending removal bits must also be checked.
+		 * Failure to do so might lead to an impossible
+		 * value (e.g. 0x13 or 0x23) written to _refcount.
+		 * Such values violate the assumption that pending and
+		 * allocation bits are mutually exclusive, and the rest
+		 * of the code derails as a result. That could lead to
+		 * a whole bunch of races and corruptions.
+		 */
+		mask = (mask | (mask >> 4)) & 0x03U;
+		if (mask != 0x03U) {
 			table = (unsigned long *) page_to_virt(page);
 			bit = mask & 1;		/* =1 -> second 2K */
 			if (bit)
 				table += PTRS_PER_PTE;
 			atomic_xor_bits(&page->_refcount,
-					1U << (bit + 24));
+					0x01U << (bit + 24));
 			list_del(&page->lru);
 		}
 	}
@@ -220,12 +297,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 	table = (unsigned long *) page_to_virt(page);
 	if (mm_alloc_pgste(mm)) {
 		/* Return 4K page table with PGSTEs */
-		atomic_xor_bits(&page->_refcount, 3 << 24);
+		atomic_xor_bits(&page->_refcount, 0x03U << 24);
 		memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
 		memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
 	} else {
 		/* Return the first 2K fragment of the page */
-		atomic_xor_bits(&page->_refcount, 1 << 24);
+		atomic_xor_bits(&page->_refcount, 0x01U << 24);
 		memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
 		spin_lock_bh(&mm->context.lock);
 		list_add(&page->lru, &mm->context.pgtable_list);
@@ -244,19 +321,24 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
 		/* Free 2K page table fragment of a 4K page */
 		bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
 		spin_lock_bh(&mm->context.lock);
+		/*
+		 * Mark the page for delayed release. The actual release
+		 * will happen outside of the critical section from this
+		 * function or from __tlb_remove_table().
+		 */
 		mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
 		mask >>= 24;
-		if (mask & 3)
+		if (mask & 0x03U)
 			list_add(&page->lru, &mm->context.pgtable_list);
 		else
 			list_del(&page->lru);
 		spin_unlock_bh(&mm->context.lock);
 		mask = atomic_xor_bits(&page->_refcount, 0x10U << (bit + 24));
 		mask >>= 24;
-		if (mask != 0)
+		if (mask != 0x00U)
 			return;
 	} else {
-		atomic_xor_bits(&page->_refcount, 3U << 24);
+		atomic_xor_bits(&page->_refcount, 0x03U << 24);
 	}
 
 	pgtable_pte_page_dtor(page);
@@ -274,43 +356,48 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
 	page = virt_to_page(table);
 	if (mm_alloc_pgste(mm)) {
 		gmap_unlink(mm, table, vmaddr);
-		table = (unsigned long *) ((unsigned long)table | 3);
+		table = (unsigned long *) ((unsigned long)table | 0x03U);
 		tlb_remove_table(tlb, table);
 		return;
 	}
 	bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
 	spin_lock_bh(&mm->context.lock);
+	/*
+	 * Mark the page for delayed release.
+	 * The actual release will happen outside of the critical
+	 * section, from __tlb_remove_table() or from page_table_free().
+	 */
 	mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24));
 	mask >>= 24;
-	if (mask & 3)
+	if (mask & 0x03U)
 		list_add_tail(&page->lru, &mm->context.pgtable_list);
 	else
 		list_del(&page->lru);
 	spin_unlock_bh(&mm->context.lock);
-	table = (unsigned long *) ((unsigned long) table | (1U << bit));
+	table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
 	tlb_remove_table(tlb, table);
 }
 
 void __tlb_remove_table(void *_table)
 {
-	unsigned int mask = (unsigned long) _table & 3;
+	unsigned int mask = (unsigned long) _table & 0x03U;
 	void *table = (void *)((unsigned long) _table ^ mask);
 	struct page *page = virt_to_page(table);
 
 	switch (mask) {
-	case 0:		/* pmd, pud, or p4d */
+	case 0x00U:	/* pmd, pud, or p4d */
 		free_pages((unsigned long) table, 2);
 		break;
-	case 1:		/* lower 2K of a 4K page table */
-	case 2:		/* higher 2K of a 4K page table */
+	case 0x01U:	/* lower 2K of a 4K page table */
+	case 0x02U:	/* higher 2K of a 4K page table */
 		mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
 		mask >>= 24;
-		if (mask != 0)
+		if (mask != 0x00U)
 			break;
 		fallthrough;
-	case 3:		/* 4K page table with pgstes */
-		if (mask & 3)
-			atomic_xor_bits(&page->_refcount, 3 << 24);
+	case 0x03U:	/* 4K page table with pgstes */
+		if (mask & 0x03U)
+			atomic_xor_bits(&page->_refcount, 0x03U << 24);
 		pgtable_pte_page_dtor(page);
 		__free_page(page);
 		break;
--
cgit

From 4c88bb96e40b757f4796f70a4a7507df554467c4 Mon Sep 17 00:00:00 2001
From: Alexander Gordeev
Date: Thu, 4 Nov 2021 07:14:46 +0100
Subject: s390/mm: check 2KB-fragment page on release

When CONFIG_DEBUG_VM is defined, check that the pending-removal and
tracking nibbles (bits 31-24 of the page refcount) are cleared on
release. Should the earlier stages of the page's lifespan have a race
or logical error, such a check could help in exposing the issue.
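For illustration, the values the new check works with (a stand-alone
model, not kernel code; the kernel variant below builds a message and
calls dump_page() only when CONFIG_DEBUG_VM is enabled):

	#include <stdio.h>

	/* "half" encodes which fragment(s) are being released; "mask" is the
	 * tracking byte after the final atomic_xor_bits() and must be zero */
	static void release_check(unsigned int half, unsigned int mask)
	{
		if (!mask)
			return;	/* clean release: all AA and PP bits are gone */
		printf("invalid pgtable release half 0x%02x mask 0x%02x\n", half, mask);
	}

	int main(void)
	{
		release_check(0x01U, 0x00U);	/* lower 2K fragment, clean */
		release_check(0x03U, 0x00U);	/* full 4K PGSTE table, clean */
		release_check(0x02U, 0x12U);	/* buggy: stale bits survive the release */
		return 0;
	}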
Signed-off-by: Alexander Gordeev
Reviewed-by: Gerald Schaefer
Signed-off-by: Heiko Carstens
---
 arch/s390/mm/pgalloc.c | 41 ++++++++++++++++++++++++++++++-----------
 1 file changed, 30 insertions(+), 11 deletions(-)

(limited to 'arch/s390/mm/pgalloc.c')

diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c
index 5d5549843c5c..fd35c1a0213b 100644
--- a/arch/s390/mm/pgalloc.c
+++ b/arch/s390/mm/pgalloc.c
@@ -311,10 +311,23 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 	return table;
 }
 
+static void page_table_release_check(struct page *page, void *table,
+				     unsigned int half, unsigned int mask)
+{
+	char msg[128];
+
+	if (!IS_ENABLED(CONFIG_DEBUG_VM) || !mask)
+		return;
+	snprintf(msg, sizeof(msg),
+		 "Invalid pgtable %p release half 0x%02x mask 0x%02x",
+		 table, half, mask);
+	dump_page(page, msg);
+}
+
 void page_table_free(struct mm_struct *mm, unsigned long *table)
 {
+	unsigned int mask, bit, half;
 	struct page *page;
-	unsigned int bit, mask;
 
 	page = virt_to_page(table);
 	if (!mm_alloc_pgste(mm)) {
@@ -337,10 +350,14 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
 		mask >>= 24;
 		if (mask != 0x00U)
 			return;
+		half = 0x01U << bit;
 	} else {
-		atomic_xor_bits(&page->_refcount, 0x03U << 24);
+		half = 0x03U;
+		mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
+		mask >>= 24;
 	}
 
+	page_table_release_check(page, table, half, mask);
 	pgtable_pte_page_dtor(page);
 	__free_page(page);
 }
@@ -380,28 +397,30 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
 
 void __tlb_remove_table(void *_table)
 {
-	unsigned int mask = (unsigned long) _table & 0x03U;
+	unsigned int mask = (unsigned long) _table & 0x03U, half = mask;
 	void *table = (void *)((unsigned long) _table ^ mask);
 	struct page *page = virt_to_page(table);
 
-	switch (mask) {
+	switch (half) {
 	case 0x00U:	/* pmd, pud, or p4d */
 		free_pages((unsigned long) table, 2);
-		break;
+		return;
 	case 0x01U:	/* lower 2K of a 4K page table */
 	case 0x02U:	/* higher 2K of a 4K page table */
 		mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24));
 		mask >>= 24;
 		if (mask != 0x00U)
-			break;
-		fallthrough;
+			return;
+		break;
 	case 0x03U:	/* 4K page table with pgstes */
-		if (mask & 0x03U)
-			atomic_xor_bits(&page->_refcount, 0x03U << 24);
-		pgtable_pte_page_dtor(page);
-		__free_page(page);
+		mask = atomic_xor_bits(&page->_refcount, 0x03U << 24);
+		mask >>= 24;
 		break;
 	}
+
+	page_table_release_check(page, table, half, mask);
+	pgtable_pte_page_dtor(page);
+	__free_page(page);
 }
 
 /*
--
cgit