From ed5297a94090d9a9f27b0ce1f9601ebe73561cff Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Mon, 21 Nov 2005 21:32:11 -0800
Subject: [PATCH] unpaged: get_user_pages VM_RESERVED

The PageReserved removal in 2.6.15-rc1 prohibited get_user_pages on the
areas flagged VM_RESERVED in place of PageReserved.  That is correct in
theory - we ought not to interfere with struct pages in such a reserved
area; but in practice it broke BTTV for one.

So revert to prohibiting only on VM_IO: if someone gets into trouble with
get_user_pages on VM_RESERVED, it'll just be a "don't do that".

You can argue that videobuf_mmap_mapper shouldn't set VM_RESERVED in the
first place, but now's not the time for breaking drivers without notice.

Signed-off-by: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 2998cfc12f5b..cfce5f1f30f2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -968,7 +968,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			continue;
 		}

-		if (!vma || (vma->vm_flags & (VM_IO | VM_RESERVED))
+		if (!vma || (vma->vm_flags & VM_IO)
 				|| !(vm_flags & vma->vm_flags))
			return i ? : -EFAULT;
-- cgit
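For context on the call being re-enabled, here is a minimal sketch - not
part of the patch, with pin_user_buffer a made-up name - of how a capture
driver like bttv pins a user buffer with the 2.6.15-era get_user_pages:

	static int pin_user_buffer(unsigned long uaddr, int nr_pages,
				   struct page **pages)
	{
		int err;

		down_read(&current->mm->mmap_sem);
		/* After this fix, a VM_RESERVED vma is acceptable again;
		 * only VM_IO (or a protection mismatch) refuses the pin. */
		err = get_user_pages(current, current->mm, uaddr & PAGE_MASK,
				     nr_pages, 1 /* write */, 0 /* force */,
				     pages, NULL);
		up_read(&current->mm->mmap_sem);
		return err;
	}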
From 0b14c179a483e71ea41df2aa4a661760063115bd Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Mon, 21 Nov 2005 21:32:15 -0800
Subject: [PATCH] unpaged: VM_UNPAGED

Although we tend to associate VM_RESERVED with remap_pfn_range, quite a
few drivers set VM_RESERVED on areas which are then populated by nopage.
The PageReserved removal in 2.6.15-rc1 changed VM_RESERVED not to free
pages in zap_pte_range, without changing those drivers not to set it: so
their pages just leak away.

Let's not change miscellaneous drivers now: introduce VM_UNPAGED at the
core, to flag the special areas where the ptes may have no struct page,
or if they have then it's not to be touched.  Replace most instances of
VM_RESERVED in core mm by VM_UNPAGED.  Force it on in remap_pfn_range,
and the sparc and sparc64 io_remap_pfn_range.

Revert addition of VM_RESERVED to powerpc vdso, it's not needed there.
Is it needed anywhere?  It still governs the mm->reserved_vm statistic,
and special vmas not to be merged, and areas not to be core dumped; but
could probably be eliminated later (the drivers are probably specifying
it because in 2.4 it kept swapout off the vma, but in 2.6 we work from
the LRU, which these pages don't get on).

Use the VM_SHM slot for VM_UNPAGED, and define VM_SHM to 0: it serves no
purpose whatsoever, and should be removed from drivers when we clean up.

Signed-off-by: Hugh Dickins
Acked-by: William Irwin
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 30 ++++++++++++++++++------------
 1 file changed, 18 insertions(+), 12 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index cfce5f1f30f2..ece04963158e 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -334,7 +334,7 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)

 /*
  * This function is called to print an error when a pte in a
- * !VM_RESERVED region is found pointing to an invalid pfn (which
+ * !VM_UNPAGED region is found pointing to an invalid pfn (which
  * is an error.
  *
  * The calling function must still handle the error.
@@ -381,15 +381,15 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		goto out_set_pte;
 	}

-	/* If the region is VM_RESERVED, the mapping is not
+	/* If the region is VM_UNPAGED, the mapping is not
 	 * mapped via rmap - duplicate the pte as is.
 	 */
-	if (vm_flags & VM_RESERVED)
+	if (vm_flags & VM_UNPAGED)
 		goto out_set_pte;

 	pfn = pte_pfn(pte);
 	/* If the pte points outside of valid memory but
-	 * the region is not VM_RESERVED, we have a problem.
+	 * the region is not VM_UNPAGED, we have a problem.
 	 */
 	if (unlikely(!pfn_valid(pfn))) {
 		print_bad_pte(vma, pte, addr);
@@ -528,7 +528,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	 * readonly mappings. The tradeoff is that copy_page_range is more
 	 * efficient than faulting.
 	 */
-	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_RESERVED))) {
+	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_UNPAGED))) {
 		if (!vma->anon_vma)
 			return 0;
 	}
@@ -572,7 +572,7 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,

 			(*zap_work) -= PAGE_SIZE;

-			if (!(vma->vm_flags & VM_RESERVED)) {
+			if (!(vma->vm_flags & VM_UNPAGED)) {
 				unsigned long pfn = pte_pfn(ptent);
 				if (unlikely(!pfn_valid(pfn)))
 					print_bad_pte(vma, ptent, addr);
@@ -1191,10 +1191,16 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	 * rest of the world about it:
 	 *   VM_IO tells people not to look at these pages
 	 *	(accesses can have side effects).
-	 *   VM_RESERVED tells the core MM not to "manage" these pages
-	 *	(e.g. refcount, mapcount, try to swap them out).
+	 *   VM_RESERVED is specified all over the place, because
+	 *	in 2.4 it kept swapout's vma scan off this vma; but
+	 *	in 2.6 the LRU scan won't even find its pages, so this
+	 *	flag means no more than count its pages in reserved_vm,
+	 *	and omit it from core dump, even when VM_IO turned off.
+	 *   VM_UNPAGED tells the core MM not to "manage" these pages
+	 *	(e.g. refcount, mapcount, try to swap them out): in
+	 *	particular, zap_pte_range does not try to free them.
 	 */
-	vma->vm_flags |= VM_IO | VM_RESERVED;
+	vma->vm_flags |= VM_IO | VM_RESERVED | VM_UNPAGED;

 	BUG_ON(addr >= end);
 	pfn -= addr >> PAGE_SHIFT;
@@ -1276,7 +1282,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	pte_t entry;
 	int ret = VM_FAULT_MINOR;

-	BUG_ON(vma->vm_flags & VM_RESERVED);
+	BUG_ON(vma->vm_flags & VM_UNPAGED);

 	if (unlikely(!pfn_valid(pfn))) {
 		/*
@@ -1924,7 +1930,7 @@ retry:
 		inc_mm_counter(mm, anon_rss);
 		lru_cache_add_active(new_page);
 		page_add_anon_rmap(new_page, vma, address);
-	} else if (!(vma->vm_flags & VM_RESERVED)) {
+	} else if (!(vma->vm_flags & VM_UNPAGED)) {
 		inc_mm_counter(mm, file_rss);
 		page_add_file_rmap(new_page);
 	}
@@ -2203,7 +2209,7 @@ static int __init gate_vma_init(void)
 	gate_vma.vm_start = FIXADDR_USER_START;
 	gate_vma.vm_end = FIXADDR_USER_END;
 	gate_vma.vm_page_prot = PAGE_READONLY;
-	gate_vma.vm_flags = VM_RESERVED;
+	gate_vma.vm_flags = 0;
 	return 0;
 }
 __initcall(gate_vma_init);
-- cgit
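To make the driver-side effect concrete, here is a minimal sketch - not
from any patch, with mydrv_mmap and mydrv_phys as hypothetical names - of
the typical remap_pfn_range user this change is protecting:

	static int mydrv_mmap(struct file *file, struct vm_area_struct *vma)
	{
		unsigned long size = vma->vm_end - vma->vm_start;

		/* remap_pfn_range() now forces VM_IO | VM_RESERVED |
		 * VM_UNPAGED on the vma itself, so zap_pte_range leaves
		 * these ptes alone and the handler needs no flag
		 * twiddling of its own. */
		return remap_pfn_range(vma, vma->vm_start,
				       mydrv_phys >> PAGE_SHIFT, size,
				       vma->vm_page_prot);
	}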
From 920fc356f58d0e455bdfa53451f1c58eb211a846 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Mon, 21 Nov 2005 21:32:17 -0800
Subject: [PATCH] unpaged: COW on VM_UNPAGED

Remove the BUG_ON(vma->vm_flags & VM_UNPAGED) from do_wp_page, and let it
do Copy-On-Write without touching the VM_UNPAGED's page counts - but this
is incomplete, because the anonymous page it inserts will itself need to
be handled, here and in other functions - next patch.

We still don't copy the page if the pfn is invalid, because the
copy_user_highpage interface does not allow it.  But that's not been a
problem in the past: can be added in later if the need arises.

Signed-off-by: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 37 +++++++++++++++++++++++++------------
 1 file changed, 25 insertions(+), 12 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index ece04963158e..107b619cfb16 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1277,22 +1277,28 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		unsigned long address, pte_t *page_table, pmd_t *pmd,
 		spinlock_t *ptl, pte_t orig_pte)
 {
-	struct page *old_page, *new_page;
+	struct page *old_page, *src_page, *new_page;
 	unsigned long pfn = pte_pfn(orig_pte);
 	pte_t entry;
 	int ret = VM_FAULT_MINOR;

-	BUG_ON(vma->vm_flags & VM_UNPAGED);
-
 	if (unlikely(!pfn_valid(pfn))) {
 		/*
 		 * Page table corrupted: show pte and kill process.
+		 * Or it's an attempt to COW an out-of-map VM_UNPAGED
+		 * entry, which copy_user_highpage does not support.
 		 */
 		print_bad_pte(vma, orig_pte, address);
 		ret = VM_FAULT_OOM;
 		goto unlock;
 	}
 	old_page = pfn_to_page(pfn);
+	src_page = old_page;
+
+	if (unlikely(vma->vm_flags & VM_UNPAGED)) {
+		old_page = NULL;
+		goto gotten;
+	}

 	if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
 		int reuse = can_share_swap_page(old_page);
@@ -1313,11 +1319,12 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 * Ok, we need to copy. Oh, well..
 	 */
 	page_cache_get(old_page);
+gotten:
 	pte_unmap_unlock(page_table, ptl);

 	if (unlikely(anon_vma_prepare(vma)))
 		goto oom;
-	if (old_page == ZERO_PAGE(address)) {
+	if (src_page == ZERO_PAGE(address)) {
 		new_page = alloc_zeroed_user_highpage(vma, address);
 		if (!new_page)
 			goto oom;
@@ -1325,7 +1332,7 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 		if (!new_page)
 			goto oom;
-		copy_user_highpage(new_page, old_page, address);
+		copy_user_highpage(new_page, src_page, address);
 	}

 	/*
@@ -1333,11 +1340,14 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	 */
 	page_table = pte_offset_map_lock(mm, pmd, address, &ptl);
 	if (likely(pte_same(*page_table, orig_pte))) {
-		page_remove_rmap(old_page);
-		if (!PageAnon(old_page)) {
+		if (old_page) {
+			page_remove_rmap(old_page);
+			if (!PageAnon(old_page)) {
+				dec_mm_counter(mm, file_rss);
+				inc_mm_counter(mm, anon_rss);
+			}
+		} else
 			inc_mm_counter(mm, anon_rss);
-			dec_mm_counter(mm, file_rss);
-		}
 		flush_cache_page(vma, address, pfn);
 		entry = mk_pte(new_page, vma->vm_page_prot);
 		entry = maybe_mkwrite(pte_mkdirty(entry), vma);
@@ -1351,13 +1361,16 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		new_page = old_page;
 		ret |= VM_FAULT_WRITE;
 	}
-	page_cache_release(new_page);
-	page_cache_release(old_page);
+	if (new_page)
+		page_cache_release(new_page);
+	if (old_page)
+		page_cache_release(old_page);
 unlock:
 	pte_unmap_unlock(page_table, ptl);
 	return ret;
 oom:
-	page_cache_release(old_page);
+	if (old_page)
+		page_cache_release(old_page);
 	return VM_FAULT_OOM;
 }
-- cgit
From ee498ed730283e9cdfc8913f12b90a2246f1a8cc Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Mon, 21 Nov 2005 21:32:18 -0800
Subject: [PATCH] unpaged: anon in VM_UNPAGED

copy_one_pte needs to copy the anonymous COWed pages in a VM_UNPAGED area,
zap_pte_range needs to free them, do_wp_page needs to COW them: just like
ordinary pages, not like the unpaged.

But recognizing them is a little subtle: because PageReserved is no longer
a condition for remap_pfn_range, we can now mmap all of /dev/mem (whether
the distro permits, and whether it's advisable on this or that
architecture, is another matter).  So if we can see a PageAnon, it may not
be ours to mess with (or may be ours from elsewhere in the address space).
I suspect there's an entertaining insoluble self-referential problem here,
but the page_is_anon function does a good practical job, and MAP_PRIVATE
PROT_WRITE VM_UNPAGED will always be an odd choice.

In updating the comment on page_address_in_vma, noticed a potential NULL
dereference, in a path we don't actually take, but fixed it.

Signed-off-by: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 63 ++++++++++++++++++++++++++++++++++++++++---------------------
 1 file changed, 41 insertions(+), 22 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 107b619cfb16..3666a4c6dd22 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -349,6 +349,22 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
 	dump_stack();
 }

+/*
+ * page_is_anon applies strict checks for an anonymous page belonging to
+ * this vma at this address.  It is used on VM_UNPAGED vmas, which are
+ * usually populated with shared originals (which must not be counted),
+ * but occasionally contain private COWed copies (when !VM_SHARED, or
+ * perhaps via ptrace when VM_SHARED).  An mmap of /dev/mem might window
+ * free pages, pages from other processes, or from other parts of this:
+ * it's tricky, but try not to be deceived by foreign anonymous pages.
+ */
+static inline int page_is_anon(struct page *page,
+			struct vm_area_struct *vma, unsigned long addr)
+{
+	return page && PageAnon(page) && page_mapped(page) &&
+		page_address_in_vma(page, vma) == addr;
+}
+
 /*
  * copy one vm_area from one task to the other. Assumes the page tables
  * already present in the new task to be cleared in the whole range
@@ -381,23 +397,22 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		goto out_set_pte;
 	}

-	/* If the region is VM_UNPAGED, the mapping is not
-	 * mapped via rmap - duplicate the pte as is.
-	 */
-	if (vm_flags & VM_UNPAGED)
-		goto out_set_pte;
-
 	pfn = pte_pfn(pte);
-	/* If the pte points outside of valid memory but
+	page = pfn_valid(pfn)? pfn_to_page(pfn): NULL;
+
+	if (unlikely(vm_flags & VM_UNPAGED))
+		if (!page_is_anon(page, vma, addr))
+			goto out_set_pte;
+
+	/*
+	 * If the pte points outside of valid memory but
 	 * the region is not VM_UNPAGED, we have a problem.
 	 */
-	if (unlikely(!pfn_valid(pfn))) {
+	if (unlikely(!page)) {
 		print_bad_pte(vma, pte, addr);
 		goto out_set_pte; /* try to do something sane */
 	}

-	page = pfn_to_page(pfn);
-
 	/*
 	 * If it's a COW mapping, write protect it both
 	 * in the parent and the child
@@ -568,17 +583,20 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 			continue;
 		}
 		if (pte_present(ptent)) {
-			struct page *page = NULL;
+			struct page *page;
+			unsigned long pfn;

 			(*zap_work) -= PAGE_SIZE;

-			if (!(vma->vm_flags & VM_UNPAGED)) {
-				unsigned long pfn = pte_pfn(ptent);
-				if (unlikely(!pfn_valid(pfn)))
-					print_bad_pte(vma, ptent, addr);
-				else
-					page = pfn_to_page(pfn);
-			}
+			pfn = pte_pfn(ptent);
+			page = pfn_valid(pfn)? pfn_to_page(pfn): NULL;
+
+			if (unlikely(vma->vm_flags & VM_UNPAGED)) {
+				if (!page_is_anon(page, vma, addr))
+					page = NULL;
+			} else if (unlikely(!page))
+				print_bad_pte(vma, ptent, addr);
+
 			if (unlikely(details) && page) {
 				/*
 				 * unmap_shared_mapping_pages() wants to
@@ -1295,10 +1313,11 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	old_page = pfn_to_page(pfn);
 	src_page = old_page;

-	if (unlikely(vma->vm_flags & VM_UNPAGED)) {
-		old_page = NULL;
-		goto gotten;
-	}
+	if (unlikely(vma->vm_flags & VM_UNPAGED))
+		if (!page_is_anon(old_page, vma, address)) {
+			old_page = NULL;
+			goto gotten;
+		}

 	if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
 		int reuse = can_share_swap_page(old_page);
-- cgit
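The subtle case page_is_anon must catch can be driven entirely from
userspace; a minimal sketch, assuming the distro permits /dev/mem access
at all:

	#include <fcntl.h>
	#include <sys/mman.h>

	int main(void)
	{
		int fd = open("/dev/mem", O_RDWR);
		unsigned char *p = mmap(0, 4096, PROT_READ | PROT_WRITE,
					MAP_PRIVATE, fd, 0);

		/* The write faults and COWs an ordinary anonymous page
		 * into the VM_UNPAGED vma; copy_one_pte, zap_pte_range
		 * and do_wp_page must treat that one pte normally. */
		if (p != MAP_FAILED)
			p[0] = 1;
		return 0;
	}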
From f57e88a8d83de8d844b57e16b84d2f762fe9f092 Mon Sep 17 00:00:00 2001
From: Hugh Dickins
Date: Mon, 21 Nov 2005 21:32:19 -0800
Subject: [PATCH] unpaged: ZERO_PAGE in VM_UNPAGED

It's strange enough to be looking out for anonymous pages in VM_UNPAGED
areas, let's not insert the ZERO_PAGE there - though whether it would
matter will depend on what we decide about ZERO_PAGE refcounting.

But whereas do_anonymous_page may (exceptionally) be called on a
VM_UNPAGED area, do_no_page should never be: just BUG_ON.

Signed-off-by: Hugh Dickins
Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 14 ++++++++++++--
 1 file changed, 12 insertions(+), 2 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index 3666a4c6dd22..d1f46f4e4c8a 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -1812,7 +1812,16 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spinlock_t *ptl;
 	pte_t entry;

-	if (write_access) {
+	/*
+	 * A VM_UNPAGED vma will normally be filled with present ptes
+	 * by remap_pfn_range, and never arrive here; but it might have
+	 * holes, or if !VM_DONTEXPAND, mremap might have expanded it.
+	 * It's weird enough handling anon pages in unpaged vmas, we do
+	 * not want to worry about ZERO_PAGEs too (it may or may not
+	 * matter if their counts wrap): just give them anon pages.
+	 */
+
+	if (write_access || (vma->vm_flags & VM_UNPAGED)) {
 		/* Allocate our own private page. */
 		pte_unmap(page_table);
@@ -1887,6 +1896,7 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	int anon = 0;

 	pte_unmap(page_table);
+	BUG_ON(vma->vm_flags & VM_UNPAGED);

 	if (vma->vm_file) {
 		mapping = vma->vm_file->f_mapping;
@@ -1962,7 +1972,7 @@ retry:
 		inc_mm_counter(mm, anon_rss);
 		lru_cache_add_active(new_page);
 		page_add_anon_rmap(new_page, vma, address);
-	} else if (!(vma->vm_flags & VM_UNPAGED)) {
+	} else {
 		inc_mm_counter(mm, file_rss);
 		page_add_file_rmap(new_page);
 	}
-- cgit
From 6aab341e0a28aff100a09831c5300a2994b8b986 Mon Sep 17 00:00:00 2001
From: Linus Torvalds
Date: Mon, 28 Nov 2005 14:34:23 -0800
Subject: mm: re-architect the VM_UNPAGED logic

This replaces the (in my opinion horrible) VM_UNMAPPED logic with very
explicit support for a "remapped page range" aka VM_PFNMAP.  It allows a
VM area to contain an arbitrary range of page table entries that the VM
never touches, and never considers to be normal pages.

Any user of "remap_pfn_range()" automatically gets this new
functionality, and doesn't even have to mark the pages reserved or
indeed mark them any other way.  It just works.  As a side effect, doing
mmap() on /dev/mem works for arbitrary ranges.

Sparc update from David in the next commit.

Signed-off-by: Linus Torvalds
---
 mm/memory.c | 189 ++++++++++++++++++++++++++++++++----------------------------
 1 file changed, 100 insertions(+), 89 deletions(-)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index d1f46f4e4c8a..b57fbc636058 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -333,9 +333,9 @@ static inline void add_mm_rss(struct mm_struct *mm, int file_rss, int anon_rss)
 }

 /*
- * This function is called to print an error when a pte in a
- * !VM_UNPAGED region is found pointing to an invalid pfn (which
- * is an error.
+ * This function is called to print an error when a bad pte
+ * is found. For example, we might have a PFN-mapped pte in
+ * a region that doesn't allow it.
  *
  * The calling function must still handle the error.
  */
@@ -350,19 +350,56 @@ void print_bad_pte(struct vm_area_struct *vma, pte_t pte, unsigned long vaddr)
 }

 /*
- * page_is_anon applies strict checks for an anonymous page belonging to
- * this vma at this address.  It is used on VM_UNPAGED vmas, which are
- * usually populated with shared originals (which must not be counted),
- * but occasionally contain private COWed copies (when !VM_SHARED, or
- * perhaps via ptrace when VM_SHARED).  An mmap of /dev/mem might window
- * free pages, pages from other processes, or from other parts of this:
- * it's tricky, but try not to be deceived by foreign anonymous pages.
+ * This function gets the "struct page" associated with a pte.
+ *
+ * NOTE! Some mappings do not have "struct pages". A raw PFN mapping
+ * will have each page table entry just pointing to a raw page frame
+ * number, and as far as the VM layer is concerned, those do not have
+ * pages associated with them - even if the PFN might point to memory
+ * that otherwise is perfectly fine and has a "struct page".
+ *
+ * The way we recognize those mappings is through the rules set up
+ * by "remap_pfn_range()": the vma will have the VM_PFNMAP bit set,
+ * and the vm_pgoff will point to the first PFN mapped: thus every
+ * page that is a raw mapping will always honor the rule
+ *
+ *	pfn_of_page == vma->vm_pgoff + ((addr - vma->vm_start) >> PAGE_SHIFT)
+ *
+ * and if that isn't true, the page has been COW'ed (in which case it
+ * _does_ have a "struct page" associated with it even if it is in a
+ * VM_PFNMAP range).
  */
-static inline int page_is_anon(struct page *page,
-			struct vm_area_struct *vma, unsigned long addr)
+struct page *vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte)
 {
-	return page && PageAnon(page) && page_mapped(page) &&
-		page_address_in_vma(page, vma) == addr;
+	unsigned long pfn = pte_pfn(pte);
+
+	if (vma->vm_flags & VM_PFNMAP) {
+		unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;
+		if (pfn == vma->vm_pgoff + off)
+			return NULL;
+	}
+
+	/*
+	 * Add some anal sanity checks for now. Eventually,
+	 * we should just do "return pfn_to_page(pfn)", but
+	 * in the meantime we check that we get a valid pfn,
+	 * and that the resulting page looks ok.
+	 *
+	 * Remove this test eventually!
+	 */
+	if (unlikely(!pfn_valid(pfn))) {
+		print_bad_pte(vma, pte, addr);
+		return NULL;
+	}
+
+	/*
+	 * NOTE! We still have PageReserved() pages in the page
+	 * tables.
+	 *
+	 * The PAGE_ZERO() pages and various VDSO mappings can
+	 * cause them to exist.
+	 */
+	return pfn_to_page(pfn);
 }

 /*
@@ -379,7 +416,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	unsigned long vm_flags = vma->vm_flags;
 	pte_t pte = *src_pte;
 	struct page *page;
-	unsigned long pfn;

 	/* pte contains position in swap or file, so copy. */
 	if (unlikely(!pte_present(pte))) {
@@ -397,22 +433,6 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 		goto out_set_pte;
 	}

-	pfn = pte_pfn(pte);
-	page = pfn_valid(pfn)? pfn_to_page(pfn): NULL;
-
-	if (unlikely(vm_flags & VM_UNPAGED))
-		if (!page_is_anon(page, vma, addr))
-			goto out_set_pte;
-
-	/*
-	 * If the pte points outside of valid memory but
-	 * the region is not VM_UNPAGED, we have a problem.
-	 */
-	if (unlikely(!page)) {
-		print_bad_pte(vma, pte, addr);
-		goto out_set_pte; /* try to do something sane */
-	}
-
 	/*
 	 * If it's a COW mapping, write protect it both
 	 * in the parent and the child
@@ -429,9 +449,13 @@ copy_one_pte(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	if (vm_flags & VM_SHARED)
 		pte = pte_mkclean(pte);
 	pte = pte_mkold(pte);
-	get_page(page);
-	page_dup_rmap(page);
-	rss[!!PageAnon(page)]++;
+
+	page = vm_normal_page(vma, addr, pte);
+	if (page) {
+		get_page(page);
+		page_dup_rmap(page);
+		rss[!!PageAnon(page)]++;
+	}

 out_set_pte:
 	set_pte_at(dst_mm, addr, dst_pte, pte);
@@ -543,7 +567,7 @@ int copy_page_range(struct mm_struct *dst_mm, struct mm_struct *src_mm,
 	 * readonly mappings. The tradeoff is that copy_page_range is more
 	 * efficient than faulting.
 	 */
-	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_UNPAGED))) {
+	if (!(vma->vm_flags & (VM_HUGETLB|VM_NONLINEAR|VM_PFNMAP))) {
 		if (!vma->anon_vma)
 			return 0;
 	}
@@ -584,19 +608,10 @@ static unsigned long zap_pte_range(struct mmu_gather *tlb,
 		}
 		if (pte_present(ptent)) {
 			struct page *page;
-			unsigned long pfn;

 			(*zap_work) -= PAGE_SIZE;

-			pfn = pte_pfn(ptent);
-			page = pfn_valid(pfn)? pfn_to_page(pfn): NULL;
-
-			if (unlikely(vma->vm_flags & VM_UNPAGED)) {
-				if (!page_is_anon(page, vma, addr))
-					page = NULL;
-			} else if (unlikely(!page))
-				print_bad_pte(vma, ptent, addr);
-
+			page = vm_normal_page(vma, addr, ptent);
 			if (unlikely(details) && page) {
 				/*
 				 * unmap_shared_mapping_pages() wants to
@@ -852,7 +867,7 @@ unsigned long zap_page_range(struct vm_area_struct *vma, unsigned long address,
 /*
  * Do a quick page-table lookup for a single page.
  */
-struct page *follow_page(struct mm_struct *mm, unsigned long address,
+struct page *follow_page(struct vm_area_struct *vma, unsigned long address,
 			unsigned int flags)
 {
 	pgd_t *pgd;
@@ -860,8 +875,8 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address,
 	pmd_t *pmd;
 	pte_t *ptep, pte;
 	spinlock_t *ptl;
-	unsigned long pfn;
 	struct page *page;
+	struct mm_struct *mm = vma->vm_mm;

 	page = follow_huge_addr(mm, address, flags & FOLL_WRITE);
 	if (!IS_ERR(page)) {
@@ -897,11 +912,10 @@ struct page *follow_page(struct mm_struct *mm, unsigned long address,
 		goto unlock;
 	if ((flags & FOLL_WRITE) && !pte_write(pte))
 		goto unlock;
-	pfn = pte_pfn(pte);
-	if (!pfn_valid(pfn))
+	page = vm_normal_page(vma, address, pte);
+	if (unlikely(!page))
 		goto unlock;

-	page = pfn_to_page(pfn);
 	if (flags & FOLL_GET)
 		get_page(page);
 	if (flags & FOLL_TOUCH) {
@@ -974,8 +988,10 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 				return i ? : -EFAULT;
 			}
 			if (pages) {
-				pages[i] = pte_page(*pte);
-				get_page(pages[i]);
+				struct page *page = vm_normal_page(vma, start, *pte);
+				pages[i] = page;
+				if (page)
+					get_page(page);
 			}
 			pte_unmap(pte);
 			if (vmas)
@@ -1010,7 +1026,7 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 			foll_flags |= FOLL_WRITE;

 		cond_resched();
-		while (!(page = follow_page(mm, start, foll_flags))) {
+		while (!(page = follow_page(vma, start, foll_flags))) {
 			int ret;
 			ret = __handle_mm_fault(mm, vma, start,
 					foll_flags & FOLL_WRITE);
@@ -1214,11 +1230,12 @@ int remap_pfn_range(struct vm_area_struct *vma, unsigned long addr,
 	 *	in 2.6 the LRU scan won't even find its pages, so this
 	 *	flag means no more than count its pages in reserved_vm,
 	 *	and omit it from core dump, even when VM_IO turned off.
-	 *   VM_UNPAGED tells the core MM not to "manage" these pages
-	 *	(e.g. refcount, mapcount, try to swap them out): in
-	 *	particular, zap_pte_range does not try to free them.
+	 *   VM_PFNMAP tells the core MM that the base pages are just
+	 *	raw PFN mappings, and do not have a "struct page" associated
+	 *	with them.
 	 */
-	vma->vm_flags |= VM_IO | VM_RESERVED | VM_UNPAGED;
+	vma->vm_flags |= VM_IO | VM_RESERVED | VM_PFNMAP;
+	vma->vm_pgoff = pfn;

 	BUG_ON(addr >= end);
 	pfn -= addr >> PAGE_SHIFT;
@@ -1273,6 +1290,26 @@ static inline pte_t maybe_mkwrite(pte_t pte, struct vm_area_struct *vma)
 	return pte;
 }

+static inline void cow_user_page(struct page *dst, struct page *src, unsigned long va)
+{
+	/*
+	 * If the source page was a PFN mapping, we don't have
+	 * a "struct page" for it. We do a best-effort copy by
+	 * just copying from the original user address. If that
+	 * fails, we just zero-fill it. Live with it.
+	 */
+	if (unlikely(!src)) {
+		void *kaddr = kmap_atomic(dst, KM_USER0);
+		unsigned long left = __copy_from_user_inatomic(kaddr, (void __user *)va, PAGE_SIZE);
+		if (left)
+			memset(kaddr, 0, PAGE_SIZE);
+		kunmap_atomic(kaddr, KM_USER0);
+		return;
+
+	}
+	copy_user_highpage(dst, src, va);
+}
+
 /*
  * This routine handles present pages, when users try to write
  * to a shared page. It is done by copying the page to a new address
@@ -1296,28 +1333,13 @@ static int do_wp_page(struct mm_struct *mm, struct vm_area_struct *vma,
 		spinlock_t *ptl, pte_t orig_pte)
 {
 	struct page *old_page, *src_page, *new_page;
-	unsigned long pfn = pte_pfn(orig_pte);
 	pte_t entry;
 	int ret = VM_FAULT_MINOR;

-	if (unlikely(!pfn_valid(pfn))) {
-		/*
-		 * Page table corrupted: show pte and kill process.
-		 * Or it's an attempt to COW an out-of-map VM_UNPAGED
-		 * entry, which copy_user_highpage does not support.
-		 */
-		print_bad_pte(vma, orig_pte, address);
-		ret = VM_FAULT_OOM;
-		goto unlock;
-	}
-	old_page = pfn_to_page(pfn);
+	old_page = vm_normal_page(vma, address, orig_pte);
 	src_page = old_page;
-
-	if (unlikely(vma->vm_flags & VM_UNPAGED))
-		if (!page_is_anon(old_page, vma, address)) {
-			old_page = NULL;
-			goto gotten;
-		}
+	if (!old_page)
+		goto gotten;

 	if (PageAnon(old_page) && !TestSetPageLocked(old_page)) {
 		int reuse = can_share_swap_page(old_page);
@@ -1351,7 +1373,7 @@ gotten:
 		new_page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 		if (!new_page)
 			goto oom;
-		copy_user_highpage(new_page, src_page, address);
+		cow_user_page(new_page, src_page, address);
 	}

 	/*
@@ -1812,16 +1834,7 @@ static int do_anonymous_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	spinlock_t *ptl;
 	pte_t entry;

-	/*
-	 * A VM_UNPAGED vma will normally be filled with present ptes
-	 * by remap_pfn_range, and never arrive here; but it might have
-	 * holes, or if !VM_DONTEXPAND, mremap might have expanded it.
-	 * It's weird enough handling anon pages in unpaged vmas, we do
-	 * not want to worry about ZERO_PAGEs too (it may or may not
-	 * matter if their counts wrap): just give them anon pages.
-	 */
-
-	if (write_access || (vma->vm_flags & VM_UNPAGED)) {
+	if (write_access) {
 		/* Allocate our own private page. */
 		pte_unmap(page_table);
@@ -1896,8 +1909,6 @@ static int do_no_page(struct mm_struct *mm, struct vm_area_struct *vma,
 	int anon = 0;

 	pte_unmap(page_table);
-	BUG_ON(vma->vm_flags & VM_UNPAGED);
-
 	if (vma->vm_file) {
 		mapping = vma->vm_file->f_mapping;
 		sequence = mapping->truncate_count;
@@ -1930,7 +1941,7 @@ retry:
 		page = alloc_page_vma(GFP_HIGHUSER, vma, address);
 		if (!page)
 			goto oom;
-		copy_user_highpage(page, new_page, address);
+		cow_user_page(page, new_page, address);
 		page_cache_release(new_page);
 		new_page = page;
 		anon = 1;
-- cgit
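The vm_pgoff rule deserves a worked example: if a driver remaps base pfn
0x800 at vma->vm_start, remap_pfn_range records vm_pgoff = 0x800, so the
pte at vma->vm_start + 3*PAGE_SIZE is a raw mapping exactly when its pfn
is 0x803; any other pfn there can only belong to a COW'ed anonymous page.
A hedged restatement of the test (pfnmap_pte_is_raw is a made-up helper,
not in the patch):

	static inline int pfnmap_pte_is_raw(struct vm_area_struct *vma,
					    unsigned long addr, unsigned long pfn)
	{
		unsigned long off = (addr - vma->vm_start) >> PAGE_SHIFT;

		/* Honors the linear rule: a raw PFN mapping with no
		 * struct page, which vm_normal_page() reports as NULL
		 * so the core mm never touches it. */
		return (vma->vm_flags & VM_PFNMAP) &&
			pfn == vma->vm_pgoff + off;
	}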
From e0f39591cc178026607fcbbe9a53be435fe8285d Mon Sep 17 00:00:00 2001
From: Alan Stern
Date: Mon, 28 Nov 2005 13:43:44 -0800
Subject: [PATCH] Workaround for gcc 2.96 (undefined references)

  LD      .tmp_vmlinux1
mm/built-in.o(.text+0x100d6): In function `copy_page_range':
: undefined reference to `__pud_alloc'
mm/built-in.o(.text+0x1010b): In function `copy_page_range':
: undefined reference to `__pmd_alloc'
mm/built-in.o(.text+0x11ef4): In function `__handle_mm_fault':
: undefined reference to `__pud_alloc'
fs/built-in.o(.text+0xc930): In function `install_arg_page':
: undefined reference to `__pud_alloc'
make: *** [.tmp_vmlinux1] Error 1

Those missing references in mm/memory.c arise from this code in
include/linux/mm.h, combined with the fact that __PGTABLE_PMD_FOLDED and
__PGTABLE_PUD_FOLDED are both set and __ARCH_HAS_4LEVEL_HACK is not:

/*
 * The following ifdef needed to get the 4level-fixup.h header to work.
 * Remove it when 4level-fixup.h has been removed.
 */
#if defined(CONFIG_MMU) && !defined(__ARCH_HAS_4LEVEL_HACK)
static inline pud_t *pud_alloc(struct mm_struct *mm, pgd_t *pgd,
						unsigned long address)
{
	return (unlikely(pgd_none(*pgd)) && __pud_alloc(mm, pgd, address))?
		NULL: pud_offset(pgd, address);
}

static inline pmd_t *pmd_alloc(struct mm_struct *mm, pud_t *pud,
						unsigned long address)
{
	return (unlikely(pud_none(*pud)) && __pmd_alloc(mm, pud, address))?
		NULL: pmd_offset(pud, address);
}
#endif /* CONFIG_MMU && !__ARCH_HAS_4LEVEL_HACK */

With my configuration the pgd_none and pud_none routines are inlines
returning a constant 0.  Apparently the old compiler avoids generating
calls to __pud_alloc and __pmd_alloc but still lists them as undefined
references in the module's symbol table.

I don't know which change caused this problem.  I think it was added
somewhere between 2.6.14 and 2.6.15-rc1, because I remember building
several 2.6.14-rc kernels without difficulty.  However I can't point to
an individual culprit.

Signed-off-by: Andrew Morton
Signed-off-by: Linus Torvalds
---
 mm/memory.c | 12 ++++++++++++
 1 file changed, 12 insertions(+)

(limited to 'mm/memory.c')

diff --git a/mm/memory.c b/mm/memory.c
index b57fbc636058..9ab206b829a2 100644
--- a/mm/memory.c
+++ b/mm/memory.c
@@ -2160,6 +2160,12 @@ int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
 	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
+#else
+/* Workaround for gcc 2.96 */
+int __pud_alloc(struct mm_struct *mm, pgd_t *pgd, unsigned long address)
+{
+	return 0;
+}
 #endif /* __PAGETABLE_PUD_FOLDED */

 #ifndef __PAGETABLE_PMD_FOLDED
@@ -2188,6 +2194,12 @@ int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
 	spin_unlock(&mm->page_table_lock);
 	return 0;
 }
+#else
+/* Workaround for gcc 2.96 */
+int __pmd_alloc(struct mm_struct *mm, pud_t *pud, unsigned long address)
+{
+	return 0;
+}
 #endif /* __PAGETABLE_PMD_FOLDED */

 int make_pages_present(unsigned long addr, unsigned long end)
-- cgit
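The gcc 2.96 behaviour is easy to model outside the kernel; a standalone
sketch, with folded_none() and helper() standing in for pgd_none() and
__pud_alloc():

	static inline int folded_none(void) { return 0; }
	extern int helper(void);	/* never defined, like __pud_alloc */

	int *alloc_like(void)
	{
		/* Any modern gcc folds the && to 0 and emits no reference;
		 * gcc 2.96 also drops the call, but still records an
		 * undefined reference to helper(), so the link fails. */
		return (folded_none() && helper()) ? (int *)0 : (int *)4096;
	}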