From 1aaf18ff9de1f37bf674236fc0779c3aaa65b998 Mon Sep 17 00:00:00 2001
From: Andrew Morton
Date: Wed, 27 Jul 2005 11:43:54 -0700
Subject: [PATCH] check_user_page_readable() deadlock fix

Fix bug identified by Richard Purdie.

oprofile calls check_user_page_readable() from interrupt context, so we
deadlock over various VFS locks.

But check_user_page_readable() doesn't imply either a read or a write of
the page's contents.  Change __follow_page() so that
check_user_page_readable() can tell __follow_page() that we're not
accessing the page's contents, and use that info to avoid the
troublesome lock-takings.

Also, make follow_page() inline for the single callsite in memory.c to
save a bit of stack space.

Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 25 +++++++++++++++----------
 1 file changed, 15 insertions(+), 10 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index beabdefa6254..6fe77acbc1cd 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -776,8 +776,8 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
  * Do a quick page-table lookup for a single page.
  * mm->page_table_lock must be held.
  */
-static struct page *
-__follow_page(struct mm_struct *mm, unsigned long address, int read, int write)
+static struct page *__follow_page(struct mm_struct *mm, unsigned long address,
+			int read, int write, int accessed)
 {
 	pgd_t *pgd;
 	pud_t *pud;
@@ -818,9 +818,11 @@ __follow_page(struct mm_struct *mm, unsigned long address, int read, int write)
 		pfn = pte_pfn(pte);
 		if (pfn_valid(pfn)) {
 			page = pfn_to_page(pfn);
-			if (write && !pte_dirty(pte) && !PageDirty(page))
-				set_page_dirty(page);
-			mark_page_accessed(page);
+			if (accessed) {
+				if (write && !pte_dirty(pte) && !PageDirty(page))
+					set_page_dirty(page);
+				mark_page_accessed(page);
+			}
 			return page;
 		}
 	}
@@ -829,16 +831,19 @@ out:
 	return NULL;
 }
 
-struct page *
+inline struct page *
 follow_page(struct mm_struct *mm, unsigned long address, int write)
 {
-	return __follow_page(mm, address, /*read*/0, write);
+	return __follow_page(mm, address, 0, write, 1);
 }
 
-int
-check_user_page_readable(struct mm_struct *mm, unsigned long address)
+/*
+ * check_user_page_readable() can be called from interrupt context by oprofile,
+ * so we need to avoid taking any non-irq-safe locks
+ */
+int check_user_page_readable(struct mm_struct *mm, unsigned long address)
 {
-	return __follow_page(mm, address, /*read*/1, /*write*/0) != NULL;
+	return __follow_page(mm, address, 1, 0, 0) != NULL;
 }
 EXPORT_SYMBOL(check_user_page_readable);
 
--
cgit
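For context on the interrupt-context constraint the patch above relies on: oprofile's user-stack walker runs from the profiling interrupt and probes saved frame pointers before dereferencing them. The sketch below is a hypothetical caller written in that spirit; the struct and function names are illustrative, not taken from the patch or from the oprofile sources. With the patch applied, check_user_page_readable() passes accessed=0 down to __follow_page(), so this path never reaches set_page_dirty()/mark_page_accessed() or the non-irq-safe locks behind them.

/*
 * Hypothetical sketch of an interrupt-context caller (names are
 * illustrative).  This runs inside the profiling interrupt handler,
 * so everything it calls must avoid non-irq-safe locks.
 */
#include <linux/mm.h>
#include <linux/sched.h>

struct frame_head {
	struct frame_head *ebp;		/* saved frame pointer */
	unsigned long ret;		/* return address */
};

static int frame_readable(struct frame_head *head)
{
	struct mm_struct *mm = current->mm;

	/* Probe both pages the frame may straddle before touching it. */
	if (!check_user_page_readable(mm, (unsigned long)head))
		return 0;
	return check_user_page_readable(mm, (unsigned long)(head + 1));
}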
From 4ceb5db9757aaeadcf8fbbf97d76bd42aa4df0d6 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Mon, 1 Aug 2005 11:14:49 -0700
Subject: Fix get_user_pages() race for write access

There's no real guarantee that handle_mm_fault() will always be able to
break a COW situation - if an update from another thread ends up
modifying the page table some way, handle_mm_fault() may end up
requiring us to re-try the operation.

That's normally fine, but get_user_pages() ended up re-trying it as a
read, and thus a write access could in theory end up losing the dirty
bit or be done on a page that had not been properly COW'ed.

This makes get_user_pages() always retry write accesses as write
accesses by making "follow_page()" require that a writable follow has
the dirty bit set.  That simplifies the code and solves the race: if the
COW break fails for some reason, we'll just loop around and try again.

Signed-off-by: Linus Torvalds
---
 mm/memory.c | 21 ++++-----------------
 1 file changed, 4 insertions(+), 17 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 6fe77acbc1cd..4e1c673784db 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -811,18 +811,15 @@ static struct page *__follow_page(struct mm_struct *mm, unsigned long address,
 	pte = *ptep;
 	pte_unmap(ptep);
 	if (pte_present(pte)) {
-		if (write && !pte_write(pte))
+		if (write && !pte_dirty(pte))
 			goto out;
 		if (read && !pte_read(pte))
 			goto out;
 		pfn = pte_pfn(pte);
 		if (pfn_valid(pfn)) {
 			page = pfn_to_page(pfn);
-			if (accessed) {
-				if (write && !pte_dirty(pte) && !PageDirty(page))
-					set_page_dirty(page);
+			if (accessed)
 				mark_page_accessed(page);
-			}
 			return page;
 		}
 	}
@@ -941,10 +938,9 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		spin_lock(&mm->page_table_lock);
 		do {
 			struct page *page;
-			int lookup_write = write;
 
 			cond_resched_lock(&mm->page_table_lock);
-			while (!(page = follow_page(mm, start, lookup_write))) {
+			while (!(page = follow_page(mm, start, write))) {
 				/*
 				 * Shortcut for anonymous pages. We don't want
 				 * to force the creation of pages tables for
@@ -952,8 +948,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				 * nobody touched so far. This is important
 				 * for doing a core dump for these mappings.
 				 */
-				if (!lookup_write &&
-				    untouched_anonymous_page(mm,vma,start)) {
+				if (!write && untouched_anonymous_page(mm,vma,start)) {
 					page = ZERO_PAGE(start);
 					break;
 				}
@@ -972,14 +967,6 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			default:
 				BUG();
 			}
-			/*
-			 * Now that we have performed a write fault
-			 * and surely no longer have a shared page we
-			 * shouldn't write, we shouldn't ignore an
-			 * unwritable page in the page table if
-			 * we are forcing write access.
-			 */
-			lookup_write = write && !force;
 			spin_lock(&mm->page_table_lock);
 		}
 		if (pages) {
--
cgit

From 690dbe1ced143876d8fa56b72310738dbe079d0a Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Mon, 1 Aug 2005 21:11:42 -0700
Subject: [PATCH] x86_64: access of some bad address

x86_64 has a large sparse gate area between VSYSCALL_START and
VSYSCALL_END, not all of it presently backed by pmds.  Alexander Nyberg
has found that in some circumstances gdb may try to ptrace here, and
hit get_user_pages BUG_ON.  It seems odd that gdb should be accessing
here, but it certainly shouldn't crash in this way: relax BUG_ON to
-EFAULT.  Fixes kernel bugzilla #4801.

Signed-off-by: Hugh Dickins
Cc: Andi Kleen
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 4e1c673784db..2405289dfdf8 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -910,9 +910,13 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			pud = pud_offset(pgd, pg);
 			BUG_ON(pud_none(*pud));
 			pmd = pmd_offset(pud, pg);
-			BUG_ON(pmd_none(*pmd));
+			if (pmd_none(*pmd))
+				return i ? : -EFAULT;
 			pte = pte_offset_map(pmd, pg);
-			BUG_ON(pte_none(*pte));
+			if (pte_none(*pte)) {
+				pte_unmap(pte);
+				return i ? : -EFAULT;
+			}
 			if (pages) {
 				pages[i] = pte_page(*pte);
 				get_page(pages[i]);
--
cgit
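A note on the "return i ? : -EFAULT;" lines in the last patch: "x ? : y" is GNU C's conditional expression with the middle operand omitted, equivalent to "x ? x : y". Here it preserves get_user_pages()'s partial-success convention on the gate-area walk, as the spelled-out form shows (same logic, nothing new):

	/* Spelled-out equivalent of the GNU C "i ? : -EFAULT" shorthand */
	if (pmd_none(*pmd))
		return i ? i : -EFAULT;	/* i pages already pinned: report
					   that count; none pinned: -EFAULT */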
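And stepping back to the second patch: the inner loop of get_user_pages() that it simplified now has roughly the following shape. This is a condensed sketch, not the verbatim function: it assumes the surrounding get_user_pages() scope for tsk, mm, vma, start and i, and omits the ZERO_PAGE shortcut for untouched anonymous mappings. Because follow_page() now refuses a write lookup until the pte is dirty, and a successful write fault dirties the pte, a raced or failed COW break simply sends the loop around again with the same write intent:

	/*
	 * Condensed sketch of the retry loop after the race fix; the
	 * untouched-anonymous-page shortcut is omitted for brevity.
	 * Every retry passes the caller's original write intent, so a
	 * write access is never downgraded to a read lookup.
	 */
	while (!(page = follow_page(mm, start, write))) {
		spin_unlock(&mm->page_table_lock);
		switch (handle_mm_fault(mm, vma, start, write)) {
		case VM_FAULT_MINOR:
			tsk->min_flt++;
			break;
		case VM_FAULT_MAJOR:
			tsk->maj_flt++;
			break;
		case VM_FAULT_SIGBUS:
			return i ? i : -EFAULT;
		case VM_FAULT_OOM:
			return i ? i : -ENOMEM;
		default:
			BUG();
		}
		spin_lock(&mm->page_table_lock);
	}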