From a70e65df8812c52252fa07a2eb92a46451a4427f Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Fri, 15 Feb 2008 14:37:28 -0800 Subject: [PATCH] merge open_namei() and do_filp_open() open_namei() will, in the future, need to take mount write counts over its creation and truncation (via may_open()) operations. It needs to keep these write counts until any potential filp that is created gets __fput()'d. This gets complicated in the error handling and becomes very murky as to how far open_namei() actually got, and whether or not that mount write count was taken. That makes it a bad interface. All that the current do_filp_open() really does is allocate the nameidata on the stack, then call open_namei(). So, this merges those two functions and moves filp_open() over to namei.c so it can be close to its buddy: do_filp_open(). It also gets a kerneldoc comment in the process. Acked-by: Al Viro Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Dave Hansen Signed-off-by: Al Viro --- include/linux/fs.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index b84b848431f2..013b9c2b88e6 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1735,7 +1735,8 @@ extern struct file *create_read_pipe(struct file *f); extern struct file *create_write_pipe(void); extern void free_write_pipe(struct file *); -extern int open_namei(int dfd, const char *, int, int, struct nameidata *); +extern struct file *do_filp_open(int dfd, const char *pathname, + int open_flag, int mode); extern int may_open(struct nameidata *, int, int); extern int kernel_read(struct file *, unsigned long, char *, unsigned long); -- cgit From 8366025eb80dfa0d8d94b286d53027081c280ef1 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 15 Feb 2008 14:37:30 -0800 Subject: [PATCH] r/o bind mounts: stub functions This patch adds two function mnt_want_write() and mnt_drop_write(). These are used like a lock pair around and fs operations that might cause a write to the filesystem. Before these can become useful, we must first cover each place in the VFS where writes are performed with a want/drop pair. When that is complete, we can actually introduce code that will safely check the counts before allowing r/w<->r/o transitions to occur. Acked-by: Serge Hallyn Acked-by: Al Viro Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Dave Hansen Signed-off-by: Al Viro --- include/linux/mount.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mount.h b/include/linux/mount.h index 5ee2df217cdf..2eecd2c8c760 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -71,9 +71,12 @@ static inline struct vfsmount *mntget(struct vfsmount *mnt) return mnt; } +extern int mnt_want_write(struct vfsmount *mnt); +extern void mnt_drop_write(struct vfsmount *mnt); extern void mntput_no_expire(struct vfsmount *mnt); extern void mnt_pin(struct vfsmount *mnt); extern void mnt_unpin(struct vfsmount *mnt); +extern int __mnt_is_readonly(struct vfsmount *mnt); static inline void mntput(struct vfsmount *mnt) { -- cgit From aceaf78da92a53f5e1b105649a1b8c0afdb2135c Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 15 Feb 2008 14:37:31 -0800 Subject: [PATCH] r/o bind mounts: create helper to drop file write access If someone decides to demote a file from r/w to just r/o, they can use this same code as __fput(). NFS does just that, and will use this in the next patch. AV: drop write access in __fput() only after we evict from file list. Signed-off-by: Dave Hansen Cc: Erez Zadok Cc: Trond Myklebust Cc: "J Bruce Fields" Acked-by: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- include/linux/file.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/file.h b/include/linux/file.h index 7239baac81a9..653477021e4c 100644 --- a/include/linux/file.h +++ b/include/linux/file.h @@ -61,6 +61,7 @@ extern struct kmem_cache *filp_cachep; extern void __fput(struct file *); extern void fput(struct file *); +extern void drop_file_write_access(struct file *file); struct file_operations; struct vfsmount; -- cgit From 3d733633a633065729c9e4e254b2e5442c00ef7e Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 15 Feb 2008 14:37:59 -0800 Subject: [PATCH] r/o bind mounts: track numbers of writers to mounts This is the real meat of the entire series. It actually implements the tracking of the number of writers to a mount. However, it causes scalability problems because there can be hundreds of cpus doing open()/close() on files on the same mnt at the same time. Even an atomic_t in the mnt has massive scalaing problems because the cacheline gets so terribly contended. This uses a statically-allocated percpu variable. All want/drop operations are local to a cpu as long that cpu operates on the same mount, and there are no writer count imbalances. Writer count imbalances happen when a write is taken on one cpu, and released on another, like when an open/close pair is performed on two Upon a remount,ro request, all of the data from the percpu variables is collected (expensive, but very rare) and we determine if there are any outstanding writers to the mount. I've written a little benchmark to sit in a loop for a couple of seconds in several cpus in parallel doing open/write/close loops. http://sr71.net/~dave/linux/openbench.c The code in here is a a worst-possible case for this patch. It does opens on a _pair_ of files in two different mounts in parallel. This should cause my code to lose its "operate on the same mount" optimization completely. This worst-case scenario causes a 3% degredation in the benchmark. I could probably get rid of even this 3%, but it would be more complex than what I have here, and I think this is getting into acceptable territory. In practice, I expect writing more than 3 bytes to a file, as well as disk I/O to mask any effects that this has. (To get rid of that 3%, we could have an #defined number of mounts in the percpu variable. So, instead of a CPU getting operate only on percpu data when it accesses only one mount, it could stay on percpu data when it only accesses N or fewer mounts.) [AV] merged fix for __clear_mnt_mount() stepping on freed vfsmount Acked-by: Al Viro Signed-off-by: Christoph Hellwig Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- include/linux/mount.h | 7 +++++++ 1 file changed, 7 insertions(+) (limited to 'include/linux') diff --git a/include/linux/mount.h b/include/linux/mount.h index 2eecd2c8c760..8c8e94369ac8 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -14,6 +14,7 @@ #include #include +#include #include #include @@ -30,6 +31,7 @@ struct mnt_namespace; #define MNT_RELATIME 0x20 #define MNT_SHRINKABLE 0x100 +#define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */ #define MNT_SHARED 0x1000 /* if the vfsmount is a shared mount */ #define MNT_UNBINDABLE 0x2000 /* if the vfsmount is a unbindable mount */ @@ -62,6 +64,11 @@ struct vfsmount { int mnt_expiry_mark; /* true if marked for expiry */ int mnt_pinned; int mnt_ghosts; + /* + * This value is not stable unless all of the mnt_writers[] spinlocks + * are held, and all mnt_writer[]s on this mount have 0 as their ->count + */ + atomic_t __mnt_writers; }; static inline struct vfsmount *mntget(struct vfsmount *mnt) -- cgit From 2e4b7fcd926006531935a4c79a5e9349fe51125b Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 15 Feb 2008 14:38:00 -0800 Subject: [PATCH] r/o bind mounts: honor mount writer counts at remount Originally from: Herbert Poetzl This is the core of the read-only bind mount patch set. Note that this does _not_ add a "ro" option directly to the bind mount operation. If you require such a mount, you must first do the bind, then follow it up with a 'mount -o remount,ro' operation: If you wish to have a r/o bind mount of /foo on bar: mount --bind /foo /bar mount -o remount,ro /bar Acked-by: Al Viro Signed-off-by: Christoph Hellwig Signed-off-by: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- include/linux/mount.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/mount.h b/include/linux/mount.h index 8c8e94369ac8..d6600e3f7e45 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -29,6 +29,7 @@ struct mnt_namespace; #define MNT_NOATIME 0x08 #define MNT_NODIRATIME 0x10 #define MNT_RELATIME 0x20 +#define MNT_READONLY 0x40 /* does the user want this to be r/o? */ #define MNT_SHRINKABLE 0x100 #define MNT_IMBALANCED_WRITE_COUNT 0x200 /* just for debugging */ -- cgit From ad775f5a8faa5845377f093ca11caf577404add9 Mon Sep 17 00:00:00 2001 From: Dave Hansen Date: Fri, 15 Feb 2008 14:38:01 -0800 Subject: [PATCH] r/o bind mounts: debugging for missed calls There have been a few oopses caused by 'struct file's with NULL f_vfsmnts. There was also a set of potentially missed mnt_want_write()s from dentry_open() calls. This patch provides a very simple debugging framework to catch these kinds of bugs. It will WARN_ON() them, but should stop us from having any oopses or mnt_writer count imbalances. I'm quite convinced that this is a good thing because it found bugs in the stuff I was working on as soon as I wrote it. [hch: made it conditional on a debug option. But it's still a little bit too ugly] [hch: merged forced remount r/o fix from Dave and akpm's fix for the fix] Signed-off-by: Dave Hansen Acked-by: Al Viro Signed-off-by: Christoph Hellwig Signed-off-by: Andrew Morton Signed-off-by: Al Viro --- include/linux/fs.h | 49 +++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 49 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 013b9c2b88e6..d1eeea669d2c 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -776,6 +776,9 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index) index < ra->start + ra->size); } +#define FILE_MNT_WRITE_TAKEN 1 +#define FILE_MNT_WRITE_RELEASED 2 + struct file { /* * fu_list becomes invalid after file_free is called and queued via @@ -810,6 +813,9 @@ struct file { spinlock_t f_ep_lock; #endif /* #ifdef CONFIG_EPOLL */ struct address_space *f_mapping; +#ifdef CONFIG_DEBUG_WRITECOUNT + unsigned long f_mnt_write_state; +#endif }; extern spinlock_t files_lock; #define file_list_lock() spin_lock(&files_lock); @@ -818,6 +824,49 @@ extern spinlock_t files_lock; #define get_file(x) atomic_inc(&(x)->f_count) #define file_count(x) atomic_read(&(x)->f_count) +#ifdef CONFIG_DEBUG_WRITECOUNT +static inline void file_take_write(struct file *f) +{ + WARN_ON(f->f_mnt_write_state != 0); + f->f_mnt_write_state = FILE_MNT_WRITE_TAKEN; +} +static inline void file_release_write(struct file *f) +{ + f->f_mnt_write_state |= FILE_MNT_WRITE_RELEASED; +} +static inline void file_reset_write(struct file *f) +{ + f->f_mnt_write_state = 0; +} +static inline void file_check_state(struct file *f) +{ + /* + * At this point, either both or neither of these bits + * should be set. + */ + WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN); + WARN_ON(f->f_mnt_write_state == FILE_MNT_WRITE_RELEASED); +} +static inline int file_check_writeable(struct file *f) +{ + if (f->f_mnt_write_state == FILE_MNT_WRITE_TAKEN) + return 0; + printk(KERN_WARNING "writeable file with no " + "mnt_want_write()\n"); + WARN_ON(1); + return -EINVAL; +} +#else /* !CONFIG_DEBUG_WRITECOUNT */ +static inline void file_take_write(struct file *filp) {} +static inline void file_release_write(struct file *filp) {} +static inline void file_reset_write(struct file *filp) {} +static inline void file_check_state(struct file *filp) {} +static inline int file_check_writeable(struct file *filp) +{ + return 0; +} +#endif /* CONFIG_DEBUG_WRITECOUNT */ + #define MAX_NON_LFS ((1UL<<31) - 1) /* Page cache limit. The filesystems should put that into their s_maxbytes -- cgit