From ccd35fb9f4da856b105ea0f1e0cab3702e8ae6ba Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:17 +1100 Subject: kernel: kmem_ptr_validate considered harmful This is a nasty and error prone API. It is no longer used, remove it. Signed-off-by: Nick Piggin --- include/linux/slab.h | 2 -- 1 file changed, 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/slab.h b/include/linux/slab.h index 59260e21bdf5..fa9086647eb7 100644 --- a/include/linux/slab.h +++ b/include/linux/slab.h @@ -106,8 +106,6 @@ int kmem_cache_shrink(struct kmem_cache *); void kmem_cache_free(struct kmem_cache *, void *); unsigned int kmem_cache_size(struct kmem_cache *); const char *kmem_cache_name(struct kmem_cache *); -int kern_ptr_validate(const void *ptr, unsigned long size); -int kmem_ptr_validate(struct kmem_cache *cachep, const void *ptr); /* * Please use this macro to create slab caches. Simply specify the -- cgit From 5eef7fa905c814826f518aca2d414ca77508ce30 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:22 +1100 Subject: fs: dcache documentation cleanup Remove redundant (and incorrect, since dcache RCU lookup) dentry locking documentation and point to the canonical document. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 18 ++++++------------ 1 file changed, 6 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 6a4aea30aa09..fff975576b5b 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -141,22 +141,16 @@ struct dentry_operations { char *(*d_dname)(struct dentry *, char *, int); }; -/* the dentry parameter passed to d_hash and d_compare is the parent +/* + * Locking rules for dentry_operations callbacks are to be found in + * Documentation/filesystems/Locking. Keep it updated! + * + * the dentry parameter passed to d_hash and d_compare is the parent * directory of the entries to be compared. It is used in case these * functions need any directory specific information for determining * equivalency classes. Using the dentry itself might not work, as it * might be a negative dentry which has no information associated with - * it */ - -/* -locking rules: - big lock dcache_lock d_lock may block -d_revalidate: no no no yes -d_hash no no no yes -d_compare: no yes yes no -d_delete: no yes no no -d_release: no no no yes -d_iput: no no no yes + * it. */ /* d_flags entries */ -- cgit From fe15ce446beb3a33583af81ffe6c9d01a75314ed Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:23 +1100 Subject: fs: change d_delete semantics Change d_delete from a dentry deletion notification to a dentry caching advise, more like ->drop_inode. Require it to be constant and idempotent, and not take d_lock. This is how all existing filesystems use the callback anyway. This makes fine grained dentry locking of dput and dentry lru scanning much simpler. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index fff975576b5b..cbfc9567e4e9 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -133,9 +133,9 @@ enum dentry_d_lock_class struct dentry_operations { int (*d_revalidate)(struct dentry *, struct nameidata *); - int (*d_hash) (struct dentry *, struct qstr *); - int (*d_compare) (struct dentry *, struct qstr *, struct qstr *); - int (*d_delete)(struct dentry *); + int (*d_hash)(struct dentry *, struct qstr *); + int (*d_compare)(struct dentry *, struct qstr *, struct qstr *); + int (*d_delete)(const struct dentry *); void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); -- cgit From fb2d5b86aff355a27ebfc132d3c99f4a940cc3fe Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:26 +1100 Subject: fs: name case update method smpfs and ncpfs want to update a live dentry name in-place. Rather than have them open code the locking, provide a documented dcache API. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 2 ++ 1 file changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index cbfc9567e4e9..6cdf4995c90a 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -290,6 +290,8 @@ static inline struct dentry *d_add_unique(struct dentry *entry, struct inode *in return res; } +extern void dentry_update_name_case(struct dentry *, struct qstr *); + /* used for rename() and baskets */ extern void d_move(struct dentry *, struct dentry *); extern struct dentry *d_ancestor(struct dentry *, struct dentry *); -- cgit From 621e155a3591962420eacdd39f6f0aa29ceb221e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:27 +1100 Subject: fs: change d_compare for rcu-walk Change d_compare so it may be called from lock-free RCU lookups. This does put significant restrictions on what may be done from the callback, however there don't seem to have been any problems with in-tree fses. If some strange use case pops up that _really_ cannot cope with the rcu-walk rules, we can just add new rcu-unaware callbacks, which would cause name lookup to drop out of rcu-walk mode. For in-tree filesystems, this is just a mechanical change. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 12 +++++------- include/linux/ncp_fs.h | 4 ++-- 2 files changed, 7 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 6cdf4995c90a..75a072bf2a34 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -134,7 +134,9 @@ enum dentry_d_lock_class struct dentry_operations { int (*d_revalidate)(struct dentry *, struct nameidata *); int (*d_hash)(struct dentry *, struct qstr *); - int (*d_compare)(struct dentry *, struct qstr *, struct qstr *); + int (*d_compare)(const struct dentry *, const struct inode *, + const struct dentry *, const struct inode *, + unsigned int, const char *, const struct qstr *); int (*d_delete)(const struct dentry *); void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); @@ -145,12 +147,8 @@ struct dentry_operations { * Locking rules for dentry_operations callbacks are to be found in * Documentation/filesystems/Locking. Keep it updated! * - * the dentry parameter passed to d_hash and d_compare is the parent - * directory of the entries to be compared. It is used in case these - * functions need any directory specific information for determining - * equivalency classes. Using the dentry itself might not work, as it - * might be a negative dentry which has no information associated with - * it. + * FUrther descriptions are found in Documentation/filesystems/vfs.txt. + * Keep it updated too! */ /* d_flags entries */ diff --git a/include/linux/ncp_fs.h b/include/linux/ncp_fs.h index ef663061d5ac..1c27f201c856 100644 --- a/include/linux/ncp_fs.h +++ b/include/linux/ncp_fs.h @@ -184,13 +184,13 @@ struct ncp_entry_info { __u8 file_handle[6]; }; -static inline struct ncp_server *NCP_SBP(struct super_block *sb) +static inline struct ncp_server *NCP_SBP(const struct super_block *sb) { return sb->s_fs_info; } #define NCP_SERVER(inode) NCP_SBP((inode)->i_sb) -static inline struct ncp_inode_info *NCP_FINFO(struct inode *inode) +static inline struct ncp_inode_info *NCP_FINFO(const struct inode *inode) { return container_of(inode, struct ncp_inode_info, vfs_inode); } -- cgit From b1e6a015a580ad145689ad1d6b4aa0e03e6c868b Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:28 +1100 Subject: fs: change d_hash for rcu-walk Change d_hash so it may be called from lock-free RCU lookups. See similar patch for d_compare for details. For in-tree filesystems, this is just a mechanical change. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 75a072bf2a34..1149e706f04d 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -133,7 +133,8 @@ enum dentry_d_lock_class struct dentry_operations { int (*d_revalidate)(struct dentry *, struct nameidata *); - int (*d_hash)(struct dentry *, struct qstr *); + int (*d_hash)(const struct dentry *, const struct inode *, + struct qstr *); int (*d_compare)(const struct dentry *, const struct inode *, const struct dentry *, const struct inode *, unsigned int, const char *, const struct qstr *); -- cgit From ec2447c278ee973d35f38e53ca16ba7f965ae33d Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:29 +1100 Subject: hostfs: simplify locking Remove dcache_lock locking from hostfs filesystem, and move it into dcache helpers. All that is required is a coherent path name. Protection from concurrent modification of the namespace after path name generation is not provided in current code, because dcache_lock is dropped before the path is used. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 1149e706f04d..cea27dfca532 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -311,7 +311,7 @@ extern char *dynamic_dname(struct dentry *, char *, int, const char *, ...); extern char *__d_path(const struct path *path, struct path *root, char *, int); extern char *d_path(const struct path *, char *, int); extern char *d_path_with_unreachable(const struct path *, char *, int); -extern char *__dentry_path(struct dentry *, char *, int); +extern char *dentry_path_raw(struct dentry *, char *, int); extern char *dentry_path(struct dentry *, char *, int); /* Allocation counts.. */ -- cgit From 789680d1ee9311cdf095241dc02bd9784d799cd1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:30 +1100 Subject: fs: dcache scale hash Add a new lock, dcache_hash_lock, to protect the dcache hash table from concurrent modification. d_hash is also protected by d_lock. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 35 ++--------------------------------- 1 file changed, 2 insertions(+), 33 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index cea27dfca532..2feb624b67f1 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -184,39 +184,6 @@ struct dentry_operations { extern spinlock_t dcache_lock; extern seqlock_t rename_lock; -/** - * d_drop - drop a dentry - * @dentry: dentry to drop - * - * d_drop() unhashes the entry from the parent dentry hashes, so that it won't - * be found through a VFS lookup any more. Note that this is different from - * deleting the dentry - d_delete will try to mark the dentry negative if - * possible, giving a successful _negative_ lookup, while d_drop will - * just make the cache lookup fail. - * - * d_drop() is used mainly for stuff that wants to invalidate a dentry for some - * reason (NFS timeouts or autofs deletes). - * - * __d_drop requires dentry->d_lock. - */ - -static inline void __d_drop(struct dentry *dentry) -{ - if (!(dentry->d_flags & DCACHE_UNHASHED)) { - dentry->d_flags |= DCACHE_UNHASHED; - hlist_del_rcu(&dentry->d_hash); - } -} - -static inline void d_drop(struct dentry *dentry) -{ - spin_lock(&dcache_lock); - spin_lock(&dentry->d_lock); - __d_drop(dentry); - spin_unlock(&dentry->d_lock); - spin_unlock(&dcache_lock); -} - static inline int dname_external(struct dentry *dentry) { return dentry->d_name.name != dentry->d_iname; @@ -228,6 +195,8 @@ static inline int dname_external(struct dentry *dentry) extern void d_instantiate(struct dentry *, struct inode *); extern struct dentry * d_instantiate_unique(struct dentry *, struct inode *); extern struct dentry * d_materialise_unique(struct dentry *, struct inode *); +extern void __d_drop(struct dentry *dentry); +extern void d_drop(struct dentry *dentry); extern void d_delete(struct dentry *); /* allocate/de-allocate */ -- cgit From b7ab39f631f505edc2bbdb86620d5493f995c9da Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:32 +1100 Subject: fs: dcache scale dentry refcount Make d_count non-atomic and protect it with d_lock. This allows us to ensure a 0 refcount dentry remains 0 without dcache_lock. It is also fairly natural when we start protecting many other dentry members with d_lock. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 2feb624b67f1..b0ade2d46805 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -87,7 +87,7 @@ full_name_hash(const unsigned char *name, unsigned int len) #endif struct dentry { - atomic_t d_count; + unsigned int d_count; /* protected by d_lock */ unsigned int d_flags; /* protected by d_lock */ spinlock_t d_lock; /* per dentry lock */ int d_mounted; @@ -297,17 +297,28 @@ extern char *dentry_path(struct dentry *, char *, int); * needs and they take necessary precautions) you should hold dcache_lock * and call dget_locked() instead of dget(). */ - +static inline struct dentry *dget_dlock(struct dentry *dentry) +{ + if (dentry) { + BUG_ON(!dentry->d_count); + dentry->d_count++; + } + return dentry; +} static inline struct dentry *dget(struct dentry *dentry) { if (dentry) { - BUG_ON(!atomic_read(&dentry->d_count)); - atomic_inc(&dentry->d_count); + spin_lock(&dentry->d_lock); + dget_dlock(dentry); + spin_unlock(&dentry->d_lock); } return dentry; } extern struct dentry * dget_locked(struct dentry *); +extern struct dentry * dget_locked_dlock(struct dentry *); + +extern struct dentry *dget_parent(struct dentry *dentry); /** * d_unhashed - is dentry hashed @@ -338,16 +349,6 @@ static inline void dont_mount(struct dentry *dentry) spin_unlock(&dentry->d_lock); } -static inline struct dentry *dget_parent(struct dentry *dentry) -{ - struct dentry *ret; - - spin_lock(&dentry->d_lock); - ret = dget(dentry->d_parent); - spin_unlock(&dentry->d_lock); - return ret; -} - extern void dput(struct dentry *); static inline int d_mountpoint(struct dentry *dentry) -- cgit From 2fd6b7f50797f2e993eea59e0a0b8c6399c811dc Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:34 +1100 Subject: fs: dcache scale subdirs Protect d_subdirs and d_child with d_lock, except in filesystems that aren't using dcache_lock for these anyway (eg. using i_mutex). Note: if we change the locking rule in future so that ->d_child protection is provided only with ->d_parent->d_lock, it may allow us to reduce some locking. But it would be an exception to an otherwise regular locking scheme, so we'd have to see some good results. Probably not worthwhile. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index b0ade2d46805..ddf4f55624f7 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -305,6 +305,7 @@ static inline struct dentry *dget_dlock(struct dentry *dentry) } return dentry; } + static inline struct dentry *dget(struct dentry *dentry) { if (dentry) { -- cgit From b23fb0a60379a95e10c671f646b259ea2558421e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:35 +1100 Subject: fs: scale inode alias list Add a new lock, dcache_inode_lock, to protect the inode's i_dentry list from concurrent modification. d_alias is also protected by d_lock. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index ddf4f55624f7..bda5ec0b077d 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -181,6 +181,7 @@ struct dentry_operations { #define DCACHE_CANT_MOUNT 0x0100 +extern spinlock_t dcache_inode_lock; extern spinlock_t dcache_lock; extern seqlock_t rename_lock; -- cgit From 949854d02455080d20cd3e1db28a3a18daf7599d Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:37 +1100 Subject: fs: Use rename lock and RCU for multi-step operations The remaining usages for dcache_lock is to allow atomic, multi-step read-side operations over the directory tree by excluding modifications to the tree. Also, to walk in the leaf->root direction in the tree where we don't have a natural d_lock ordering. This could be accomplished by taking every d_lock, but this would mean a huge number of locks and actually gets very tricky. Solve this instead by using the rename seqlock for multi-step read-side operations, retry in case of a rename so we don't walk up the wrong parent. Concurrent dentry insertions are not serialised against. Concurrent deletes are tricky when walking up the directory: our parent might have been deleted when dropping locks so also need to check and retry for that. We can also use the rename lock in cases where livelock is a worry (and it is introduced in subsequent patch). Signed-off-by: Nick Piggin --- include/linux/dcache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index bda5ec0b077d..c963ebada922 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -180,6 +180,7 @@ struct dentry_operations { #define DCACHE_FSNOTIFY_PARENT_WATCHED 0x0080 /* Parent inode is watched by some fsnotify listener */ #define DCACHE_CANT_MOUNT 0x0100 +#define DCACHE_GENOCIDE 0x0200 extern spinlock_t dcache_inode_lock; extern spinlock_t dcache_lock; -- cgit From b5c84bf6f6fa3a7dfdcb556023a62953574b60ee Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:38 +1100 Subject: fs: dcache remove dcache_lock dcache_lock no longer protects anything. remove it. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 5 ++--- include/linux/fs.h | 6 +++++- include/linux/fsnotify.h | 2 -- include/linux/fsnotify_backend.h | 11 +++++++---- include/linux/namei.h | 1 - 5 files changed, 14 insertions(+), 11 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index c963ebada922..a2ceb94b0e38 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -183,7 +183,6 @@ struct dentry_operations { #define DCACHE_GENOCIDE 0x0200 extern spinlock_t dcache_inode_lock; -extern spinlock_t dcache_lock; extern seqlock_t rename_lock; static inline int dname_external(struct dentry *dentry) @@ -296,8 +295,8 @@ extern char *dentry_path(struct dentry *, char *, int); * destroyed when it has references. dget() should never be * called for dentries with zero reference counter. For these cases * (preferably none, functions in dcache.c are sufficient for normal - * needs and they take necessary precautions) you should hold dcache_lock - * and call dget_locked() instead of dget(). + * needs and they take necessary precautions) you should hold d_lock + * and call dget_dlock() instead of dget(). */ static inline struct dentry *dget_dlock(struct dentry *dentry) { diff --git a/include/linux/fs.h b/include/linux/fs.h index 090f0eacde29..296cf2fde945 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1378,7 +1378,7 @@ struct super_block { #else struct list_head s_files; #endif - /* s_dentry_lru and s_nr_dentry_unused are protected by dcache_lock */ + /* s_dentry_lru, s_nr_dentry_unused protected by dcache.c lru locks */ struct list_head s_dentry_lru; /* unused dentry lru */ int s_nr_dentry_unused; /* # of dentry on lru */ @@ -2446,6 +2446,10 @@ static inline ino_t parent_ino(struct dentry *dentry) { ino_t res; + /* + * Don't strictly need d_lock here? If the parent ino could change + * then surely we'd have a deeper race in the caller? + */ spin_lock(&dentry->d_lock); res = dentry->d_parent->d_inode->i_ino; spin_unlock(&dentry->d_lock); diff --git a/include/linux/fsnotify.h b/include/linux/fsnotify.h index b10bcdeaef76..2a53f10712b3 100644 --- a/include/linux/fsnotify.h +++ b/include/linux/fsnotify.h @@ -17,7 +17,6 @@ /* * fsnotify_d_instantiate - instantiate a dentry for inode - * Called with dcache_lock held. */ static inline void fsnotify_d_instantiate(struct dentry *dentry, struct inode *inode) @@ -62,7 +61,6 @@ static inline int fsnotify_perm(struct file *file, int mask) /* * fsnotify_d_move - dentry has been moved - * Called with dcache_lock and dentry->d_lock held. */ static inline void fsnotify_d_move(struct dentry *dentry) { diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 7380763595d3..69ad89b50489 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -329,9 +329,15 @@ static inline void __fsnotify_update_dcache_flags(struct dentry *dentry) { struct dentry *parent; - assert_spin_locked(&dcache_lock); assert_spin_locked(&dentry->d_lock); + /* + * Serialisation of setting PARENT_WATCHED on the dentries is provided + * by d_lock. If inotify_inode_watched changes after we have taken + * d_lock, the following __fsnotify_update_child_dentry_flags call will + * find our entry, so it will spin until we complete here, and update + * us with the new state. + */ parent = dentry->d_parent; if (parent->d_inode && fsnotify_inode_watches_children(parent->d_inode)) dentry->d_flags |= DCACHE_FSNOTIFY_PARENT_WATCHED; @@ -341,15 +347,12 @@ static inline void __fsnotify_update_dcache_flags(struct dentry *dentry) /* * fsnotify_d_instantiate - instantiate a dentry for inode - * Called with dcache_lock held. */ static inline void __fsnotify_d_instantiate(struct dentry *dentry, struct inode *inode) { if (!inode) return; - assert_spin_locked(&dcache_lock); - spin_lock(&dentry->d_lock); __fsnotify_update_dcache_flags(dentry); spin_unlock(&dentry->d_lock); diff --git a/include/linux/namei.h b/include/linux/namei.h index 05b441d93642..aec730b53935 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -41,7 +41,6 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; * - require a directory * - ending slashes ok even for nonexistent files * - internal "there are more path components" flag - * - locked when lookup done with dcache_lock held * - dentry cache is untrusted; force a real lookup */ #define LOOKUP_FOLLOW 1 -- cgit From dc0474be3e27463d4d4a2793f82366eed906f223 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:43 +1100 Subject: fs: dcache rationalise dget variants dget_locked was a shortcut to avoid the lazy lru manipulation when we already held dcache_lock (lru manipulation was relatively cheap at that point). However, how that the lru lock is an innermost one, we never hold it at any caller, so the lock cost can now be avoided. We already have well working lazy dcache LRU, so it should be fine to defer LRU manipulations to scan time. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 15 +++------------ 1 file changed, 3 insertions(+), 12 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index a2ceb94b0e38..ca648685f0cc 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -287,23 +287,17 @@ extern char *dentry_path(struct dentry *, char *, int); /* Allocation counts.. */ /** - * dget, dget_locked - get a reference to a dentry + * dget, dget_dlock - get a reference to a dentry * @dentry: dentry to get a reference to * * Given a dentry or %NULL pointer increment the reference count * if appropriate and return the dentry. A dentry will not be - * destroyed when it has references. dget() should never be - * called for dentries with zero reference counter. For these cases - * (preferably none, functions in dcache.c are sufficient for normal - * needs and they take necessary precautions) you should hold d_lock - * and call dget_dlock() instead of dget(). + * destroyed when it has references. */ static inline struct dentry *dget_dlock(struct dentry *dentry) { - if (dentry) { - BUG_ON(!dentry->d_count); + if (dentry) dentry->d_count++; - } return dentry; } @@ -317,9 +311,6 @@ static inline struct dentry *dget(struct dentry *dentry) return dentry; } -extern struct dentry * dget_locked(struct dentry *); -extern struct dentry * dget_locked_dlock(struct dentry *); - extern struct dentry *dget_parent(struct dentry *dentry); /** -- cgit From fa0d7e3de6d6fc5004ad9dea0dd6b286af8f03e9 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:49 +1100 Subject: fs: icache RCU free inodes RCU free the struct inode. This will allow: - Subsequent store-free path walking patch. The inode must be consulted for permissions when walking, so an RCU inode reference is a must. - sb_inode_list_lock to be moved inside i_lock because sb list walkers who want to take i_lock no longer need to take sb_inode_list_lock to walk the list in the first place. This will simplify and optimize locking. - Could remove some nested trylock loops in dcache code - Could potentially simplify things a bit in VM land. Do not need to take the page lock to follow page->mapping. The downsides of this is the performance cost of using RCU. In a simple creat/unlink microbenchmark, performance drops by about 10% due to inability to reuse cache-hot slab objects. As iterations increase and RCU freeing starts kicking over, this increases to about 20%. In cases where inode lifetimes are longer (ie. many inodes may be allocated during the average life span of a single inode), a lot of this cache reuse is not applicable, so the regression caused by this patch is smaller. The cache-hot regression could largely be avoided by using SLAB_DESTROY_BY_RCU, however this adds some complexity to list walking and store-free path walking, so I prefer to implement this at a later date, if it is shown to be a win in real situations. I haven't found a regression in any non-micro benchmark so I doubt it will be a problem. Signed-off-by: Nick Piggin --- include/linux/fs.h | 5 ++++- include/linux/net.h | 1 - 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 296cf2fde945..1ff4d0a33b25 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -737,7 +737,10 @@ struct inode { struct list_head i_wb_list; /* backing dev IO list */ struct list_head i_lru; /* inode LRU list */ struct list_head i_sb_list; - struct list_head i_dentry; + union { + struct list_head i_dentry; + struct rcu_head i_rcu; + }; unsigned long i_ino; atomic_t i_count; unsigned int i_nlink; diff --git a/include/linux/net.h b/include/linux/net.h index 16faa130088c..06bde4908473 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -120,7 +120,6 @@ enum sock_shutdown_cmd { struct socket_wq { wait_queue_head_t wait; struct fasync_struct *fasync_list; - struct rcu_head rcu; } ____cacheline_aligned_in_smp; /** -- cgit From ff0c7d15f9787b7e8c601533c015295cc68329f8 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:50 +1100 Subject: fs: avoid inode RCU freeing for pseudo fs Pseudo filesystems that don't put inode on RCU list or reachable by rcu-walk dentries do not need to RCU free their inodes. Signed-off-by: Nick Piggin --- include/linux/fs.h | 1 + include/linux/net.h | 1 + 2 files changed, 2 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs.h b/include/linux/fs.h index 1ff4d0a33b25..ea202fff44f8 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2233,6 +2233,7 @@ extern void iget_failed(struct inode *); extern void end_writeback(struct inode *); extern void __destroy_inode(struct inode *); extern struct inode *new_inode(struct super_block *); +extern void free_inode_nonrcu(struct inode *inode); extern int should_remove_suid(struct dentry *); extern int file_remove_suid(struct file *); diff --git a/include/linux/net.h b/include/linux/net.h index 06bde4908473..16faa130088c 100644 --- a/include/linux/net.h +++ b/include/linux/net.h @@ -120,6 +120,7 @@ enum sock_shutdown_cmd { struct socket_wq { wait_queue_head_t wait; struct fasync_struct *fasync_list; + struct rcu_head rcu; } ____cacheline_aligned_in_smp; /** -- cgit From 3c22cd5709e8143444a6d08682a87f4c57902df3 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:51 +1100 Subject: kernel: optimise seqlock Add branch annotations for seqlock read fastpath, and introduce __read_seqcount_begin and __read_seqcount_end functions, that can avoid the smp_rmb() if used carefully. These will be used by store-free path walking algorithm performance is critical and seqlocks are in use. Signed-off-by: Nick Piggin --- include/linux/seqlock.h | 80 ++++++++++++++++++++++++++++++++++++++++++++----- 1 file changed, 73 insertions(+), 7 deletions(-) (limited to 'include/linux') diff --git a/include/linux/seqlock.h b/include/linux/seqlock.h index 632205ccc25d..e98cd2e57194 100644 --- a/include/linux/seqlock.h +++ b/include/linux/seqlock.h @@ -107,7 +107,7 @@ static __always_inline int read_seqretry(const seqlock_t *sl, unsigned start) { smp_rmb(); - return (sl->sequence != start); + return unlikely(sl->sequence != start); } @@ -125,14 +125,25 @@ typedef struct seqcount { #define SEQCNT_ZERO { 0 } #define seqcount_init(x) do { *(x) = (seqcount_t) SEQCNT_ZERO; } while (0) -/* Start of read using pointer to a sequence counter only. */ -static inline unsigned read_seqcount_begin(const seqcount_t *s) +/** + * __read_seqcount_begin - begin a seq-read critical section (without barrier) + * @s: pointer to seqcount_t + * Returns: count to be passed to read_seqcount_retry + * + * __read_seqcount_begin is like read_seqcount_begin, but has no smp_rmb() + * barrier. Callers should ensure that smp_rmb() or equivalent ordering is + * provided before actually loading any of the variables that are to be + * protected in this critical section. + * + * Use carefully, only in critical code, and comment how the barrier is + * provided. + */ +static inline unsigned __read_seqcount_begin(const seqcount_t *s) { unsigned ret; repeat: ret = s->sequence; - smp_rmb(); if (unlikely(ret & 1)) { cpu_relax(); goto repeat; @@ -140,14 +151,56 @@ repeat: return ret; } -/* - * Test if reader processed invalid data because sequence number has changed. +/** + * read_seqcount_begin - begin a seq-read critical section + * @s: pointer to seqcount_t + * Returns: count to be passed to read_seqcount_retry + * + * read_seqcount_begin opens a read critical section of the given seqcount. + * Validity of the critical section is tested by checking read_seqcount_retry + * function. + */ +static inline unsigned read_seqcount_begin(const seqcount_t *s) +{ + unsigned ret = __read_seqcount_begin(s); + smp_rmb(); + return ret; +} + +/** + * __read_seqcount_retry - end a seq-read critical section (without barrier) + * @s: pointer to seqcount_t + * @start: count, from read_seqcount_begin + * Returns: 1 if retry is required, else 0 + * + * __read_seqcount_retry is like read_seqcount_retry, but has no smp_rmb() + * barrier. Callers should ensure that smp_rmb() or equivalent ordering is + * provided before actually loading any of the variables that are to be + * protected in this critical section. + * + * Use carefully, only in critical code, and comment how the barrier is + * provided. + */ +static inline int __read_seqcount_retry(const seqcount_t *s, unsigned start) +{ + return unlikely(s->sequence != start); +} + +/** + * read_seqcount_retry - end a seq-read critical section + * @s: pointer to seqcount_t + * @start: count, from read_seqcount_begin + * Returns: 1 if retry is required, else 0 + * + * read_seqcount_retry closes a read critical section of the given seqcount. + * If the critical section was invalid, it must be ignored (and typically + * retried). */ static inline int read_seqcount_retry(const seqcount_t *s, unsigned start) { smp_rmb(); - return s->sequence != start; + return __read_seqcount_retry(s, start); } @@ -167,6 +220,19 @@ static inline void write_seqcount_end(seqcount_t *s) s->sequence++; } +/** + * write_seqcount_barrier - invalidate in-progress read-side seq operations + * @s: pointer to seqcount_t + * + * After write_seqcount_barrier, no read-side seq operations will complete + * successfully and see data older than this. + */ +static inline void write_seqcount_barrier(seqcount_t *s) +{ + smp_wmb(); + s->sequence+=2; +} + /* * Possible sw/hw IRQ protected versions of the interfaces. */ -- cgit From 31e6b01f4183ff419a6d1f86177cbf4662347cec Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:52 +1100 Subject: fs: rcu-walk for path lookup Perform common cases of path lookups without any stores or locking in the ancestor dentry elements. This is called rcu-walk, as opposed to the current algorithm which is a refcount based walk, or ref-walk. This results in far fewer atomic operations on every path element, significantly improving path lookup performance. It also avoids cacheline bouncing on common dentries, significantly improving scalability. The overall design is like this: * LOOKUP_RCU is set in nd->flags, which distinguishes rcu-walk from ref-walk. * Take the RCU lock for the entire path walk, starting with the acquiring of the starting path (eg. root/cwd/fd-path). So now dentry refcounts are not required for dentry persistence. * synchronize_rcu is called when unregistering a filesystem, so we can access d_ops and i_ops during rcu-walk. * Similarly take the vfsmount lock for the entire path walk. So now mnt refcounts are not required for persistence. Also we are free to perform mount lookups, and to assume dentry mount points and mount roots are stable up and down the path. * Have a per-dentry seqlock to protect the dentry name, parent, and inode, so we can load this tuple atomically, and also check whether any of its members have changed. * Dentry lookups (based on parent, candidate string tuple) recheck the parent sequence after the child is found in case anything changed in the parent during the path walk. * inode is also RCU protected so we can load d_inode and use the inode for limited things. * i_mode, i_uid, i_gid can be tested for exec permissions during path walk. * i_op can be loaded. When we reach the destination dentry, we lock it, recheck lookup sequence, and increment its refcount and mountpoint refcount. RCU and vfsmount locks are dropped. This is termed "dropping rcu-walk". If the dentry refcount does not match, we can not drop rcu-walk gracefully at the current point in the lokup, so instead return -ECHILD (for want of a better errno). This signals the path walking code to re-do the entire lookup with a ref-walk. Aside from the final dentry, there are other situations that may be encounted where we cannot continue rcu-walk. In that case, we drop rcu-walk (ie. take a reference on the last good dentry) and continue with a ref-walk. Again, if we can drop rcu-walk gracefully, we return -ECHILD and do the whole lookup using ref-walk. But it is very important that we can continue with ref-walk for most cases, particularly to avoid the overhead of double lookups, and to gain the scalability advantages on common path elements (like cwd and root). The cases where rcu-walk cannot continue are: * NULL dentry (ie. any uncached path element) * parent with d_inode->i_op->permission or ACLs * dentries with d_revalidate * Following links In future patches, permission checks and d_revalidate become rcu-walk aware. It may be possible eventually to make following links rcu-walk aware. Uncached path elements will always require dropping to ref-walk mode, at the very least because i_mutex needs to be grabbed, and objects allocated. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 32 +++++++++++++++++++++++++++++--- include/linux/namei.h | 15 ++++++++++----- include/linux/security.h | 8 +++++++- 3 files changed, 46 insertions(+), 9 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index ca648685f0cc..c2e7390289cc 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -5,6 +5,7 @@ #include #include #include +#include #include #include @@ -90,6 +91,7 @@ struct dentry { unsigned int d_count; /* protected by d_lock */ unsigned int d_flags; /* protected by d_lock */ spinlock_t d_lock; /* per dentry lock */ + seqcount_t d_seq; /* per dentry seqlock */ int d_mounted; struct inode *d_inode; /* Where the name belongs to - NULL is * negative */ @@ -266,9 +268,33 @@ extern void d_move(struct dentry *, struct dentry *); extern struct dentry *d_ancestor(struct dentry *, struct dentry *); /* appendix may either be NULL or be used for transname suffixes */ -extern struct dentry * d_lookup(struct dentry *, struct qstr *); -extern struct dentry * __d_lookup(struct dentry *, struct qstr *); -extern struct dentry * d_hash_and_lookup(struct dentry *, struct qstr *); +extern struct dentry *d_lookup(struct dentry *, struct qstr *); +extern struct dentry *d_hash_and_lookup(struct dentry *, struct qstr *); +extern struct dentry *__d_lookup(struct dentry *, struct qstr *); +extern struct dentry *__d_lookup_rcu(struct dentry *parent, struct qstr *name, + unsigned *seq, struct inode **inode); + +/** + * __d_rcu_to_refcount - take a refcount on dentry if sequence check is ok + * @dentry: dentry to take a ref on + * @seq: seqcount to verify against + * @Returns: 0 on failure, else 1. + * + * __d_rcu_to_refcount operates on a dentry,seq pair that was returned + * by __d_lookup_rcu, to get a reference on an rcu-walk dentry. + */ +static inline int __d_rcu_to_refcount(struct dentry *dentry, unsigned seq) +{ + int ret = 0; + + assert_spin_locked(&dentry->d_lock); + if (!read_seqcount_retry(&dentry->d_seq, seq)) { + ret = 1; + dentry->d_count++; + } + + return ret; +} /* validate "insecure" dentry pointer */ extern int d_validate(struct dentry *, struct dentry *); diff --git a/include/linux/namei.h b/include/linux/namei.h index aec730b53935..18d06add0a40 100644 --- a/include/linux/namei.h +++ b/include/linux/namei.h @@ -19,7 +19,10 @@ struct nameidata { struct path path; struct qstr last; struct path root; + struct file *file; + struct inode *inode; /* path.dentry.d_inode */ unsigned int flags; + unsigned seq; int last_type; unsigned depth; char *saved_names[MAX_NESTED_LINKS + 1]; @@ -43,11 +46,13 @@ enum {LAST_NORM, LAST_ROOT, LAST_DOT, LAST_DOTDOT, LAST_BIND}; * - internal "there are more path components" flag * - dentry cache is untrusted; force a real lookup */ -#define LOOKUP_FOLLOW 1 -#define LOOKUP_DIRECTORY 2 -#define LOOKUP_CONTINUE 4 -#define LOOKUP_PARENT 16 -#define LOOKUP_REVAL 64 +#define LOOKUP_FOLLOW 0x0001 +#define LOOKUP_DIRECTORY 0x0002 +#define LOOKUP_CONTINUE 0x0004 + +#define LOOKUP_PARENT 0x0010 +#define LOOKUP_REVAL 0x0020 +#define LOOKUP_RCU 0x0040 /* * Intent data */ diff --git a/include/linux/security.h b/include/linux/security.h index fd4d55fb8845..ed95401970c7 100644 --- a/include/linux/security.h +++ b/include/linux/security.h @@ -457,7 +457,6 @@ static inline void security_free_mnt_opts(struct security_mnt_opts *opts) * called when the actual read/write operations are performed. * @inode contains the inode structure to check. * @mask contains the permission mask. - * @nd contains the nameidata (may be NULL). * Return 0 if permission is granted. * @inode_setattr: * Check permission before setting file attributes. Note that the kernel @@ -1713,6 +1712,7 @@ int security_inode_rename(struct inode *old_dir, struct dentry *old_dentry, int security_inode_readlink(struct dentry *dentry); int security_inode_follow_link(struct dentry *dentry, struct nameidata *nd); int security_inode_permission(struct inode *inode, int mask); +int security_inode_exec_permission(struct inode *inode, unsigned int flags); int security_inode_setattr(struct dentry *dentry, struct iattr *attr); int security_inode_getattr(struct vfsmount *mnt, struct dentry *dentry); int security_inode_setxattr(struct dentry *dentry, const char *name, @@ -2102,6 +2102,12 @@ static inline int security_inode_permission(struct inode *inode, int mask) return 0; } +static inline int security_inode_exec_permission(struct inode *inode, + unsigned int flags) +{ + return 0; +} + static inline int security_inode_setattr(struct dentry *dentry, struct iattr *attr) { -- cgit From c28cc36469554dc55540f059fbdc7fa22a2c31fc Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:53 +1100 Subject: fs: fs_struct use seqlock Use a seqlock in the fs_struct to enable us to take an atomic copy of the complete cwd and root paths. Use this in the RCU lookup path to avoid a thread-shared spinlock in RCU lookup operations. Multi-threaded apps may now perform path lookups with scalability matching multi-process apps. Operations such as stat(2) become very scalable for multi-threaded workload. Signed-off-by: Nick Piggin --- include/linux/fs_struct.h | 3 +++ 1 file changed, 3 insertions(+) (limited to 'include/linux') diff --git a/include/linux/fs_struct.h b/include/linux/fs_struct.h index a42b5bf02f8b..003dc0fd7347 100644 --- a/include/linux/fs_struct.h +++ b/include/linux/fs_struct.h @@ -2,10 +2,13 @@ #define _LINUX_FS_STRUCT_H #include +#include +#include struct fs_struct { int users; spinlock_t lock; + seqcount_t seq; int umask; int in_exec; struct path root, pwd; -- cgit From 5f57cbcc02cf18f6b22ef4066bb10afeb8f930ff Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:54 +1100 Subject: fs: dcache remove d_mounted Rather than keep a d_mounted count in the dentry, set a dentry flag instead. The flag can be cleared by checking the hash table to see if there are any mounts left, which is not time critical because it is performed at detach time. The mounted state of a dentry is only used to speculatively take a look in the mount hash table if it is set -- before following the mount, vfsmount lock is taken and mount re-checked without races. This saves 4 bytes on 32-bit, nothing on 64-bit but it does provide a hole I might use later (and some configs have larger than 32-bit spinlocks which might make use of the hole). Autofs4 conversion and changelog by Ian Kent : In autofs4, when expring direct (or offset) mounts we need to ensure that we block user path walks into the autofs mount, which is covered by another mount. To do this we clear the mounted status so that follows stop before walking into the mount and are essentially blocked until the expire is completed. The automount daemon still finds the correct dentry for the umount due to the follow mount logic in fs/autofs4/root.c:autofs4_follow_link(), which is set as an inode operation for direct and offset mounts only and is called following the lookup that stopped at the covered mount. At the end of the expire the covering mount probably has gone away so the mounted status need not be restored. But we need to check this and only restore the mounted status if the expire failed. XXX: autofs may not work right if we have other mounts go over the top of it? Signed-off-by: Nick Piggin --- include/linux/dcache.h | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index c2e7390289cc..e4414693065e 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -92,7 +92,6 @@ struct dentry { unsigned int d_flags; /* protected by d_lock */ spinlock_t d_lock; /* per dentry lock */ seqcount_t d_seq; /* per dentry seqlock */ - int d_mounted; struct inode *d_inode; /* Where the name belongs to - NULL is * negative */ /* @@ -156,33 +155,34 @@ struct dentry_operations { /* d_flags entries */ #define DCACHE_AUTOFS_PENDING 0x0001 /* autofs: "under construction" */ -#define DCACHE_NFSFS_RENAMED 0x0002 /* this dentry has been "silly - * renamed" and has to be - * deleted on the last dput() - */ -#define DCACHE_DISCONNECTED 0x0004 - /* This dentry is possibly not currently connected to the dcache tree, - * in which case its parent will either be itself, or will have this - * flag as well. nfsd will not use a dentry with this bit set, but will - * first endeavour to clear the bit either by discovering that it is - * connected, or by performing lookup operations. Any filesystem which - * supports nfsd_operations MUST have a lookup function which, if it finds - * a directory inode with a DCACHE_DISCONNECTED dentry, will d_move - * that dentry into place and return that dentry rather than the passed one, - * typically using d_splice_alias. - */ +#define DCACHE_NFSFS_RENAMED 0x0002 + /* this dentry has been "silly renamed" and has to be deleted on the last + * dput() */ + +#define DCACHE_DISCONNECTED 0x0004 + /* This dentry is possibly not currently connected to the dcache tree, in + * which case its parent will either be itself, or will have this flag as + * well. nfsd will not use a dentry with this bit set, but will first + * endeavour to clear the bit either by discovering that it is connected, + * or by performing lookup operations. Any filesystem which supports + * nfsd_operations MUST have a lookup function which, if it finds a + * directory inode with a DCACHE_DISCONNECTED dentry, will d_move that + * dentry into place and return that dentry rather than the passed one, + * typically using d_splice_alias. */ #define DCACHE_REFERENCED 0x0008 /* Recently used, don't discard. */ #define DCACHE_UNHASHED 0x0010 - -#define DCACHE_INOTIFY_PARENT_WATCHED 0x0020 /* Parent inode is watched by inotify */ +#define DCACHE_INOTIFY_PARENT_WATCHED 0x0020 + /* Parent inode is watched by inotify */ #define DCACHE_COOKIE 0x0040 /* For use by dcookie subsystem */ - -#define DCACHE_FSNOTIFY_PARENT_WATCHED 0x0080 /* Parent inode is watched by some fsnotify listener */ +#define DCACHE_FSNOTIFY_PARENT_WATCHED 0x0080 + /* Parent inode is watched by some fsnotify listener */ #define DCACHE_CANT_MOUNT 0x0100 #define DCACHE_GENOCIDE 0x0200 +#define DCACHE_MOUNTED 0x0400 /* is a mountpoint */ + extern spinlock_t dcache_inode_lock; extern seqlock_t rename_lock; @@ -372,7 +372,7 @@ extern void dput(struct dentry *); static inline int d_mountpoint(struct dentry *dentry) { - return dentry->d_mounted; + return dentry->d_flags & DCACHE_MOUNTED; } extern struct vfsmount *lookup_mnt(struct path *); -- cgit From fb045adb99d9b7c562dc7fef834857f78249daa1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:55 +1100 Subject: fs: dcache reduce branches in lookup path Reduce some branches and memory accesses in dcache lookup by adding dentry flags to indicate common d_ops are set, rather than having to check them. This saves a pointer memory access (dentry->d_op) in common path lookup situations, and saves another pointer load and branch in cases where we have d_op but not the particular operation. Patched with: git grep -E '[.>]([[:space:]])*d_op([[:space:]])*=' | xargs sed -e 's/\([^\t ]*\)->d_op = \(.*\);/d_set_d_op(\1, \2);/' -e 's/\([^\t ]*\)\.d_op = \(.*\);/d_set_d_op(\&\1, \2);/' -i Signed-off-by: Nick Piggin --- include/linux/dcache.h | 6 ++++++ 1 file changed, 6 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index e4414693065e..f4b40a751f09 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -183,6 +183,11 @@ struct dentry_operations { #define DCACHE_GENOCIDE 0x0200 #define DCACHE_MOUNTED 0x0400 /* is a mountpoint */ +#define DCACHE_OP_HASH 0x1000 +#define DCACHE_OP_COMPARE 0x2000 +#define DCACHE_OP_REVALIDATE 0x4000 +#define DCACHE_OP_DELETE 0x8000 + extern spinlock_t dcache_inode_lock; extern seqlock_t rename_lock; @@ -201,6 +206,7 @@ extern struct dentry * d_materialise_unique(struct dentry *, struct inode *); extern void __d_drop(struct dentry *dentry); extern void d_drop(struct dentry *dentry); extern void d_delete(struct dentry *); +extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op); /* allocate/de-allocate */ extern struct dentry * d_alloc(struct dentry *, const struct qstr *); -- cgit From 44a7d7a878c9cbb74f236ea755b25b6b2e26a9a9 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:56 +1100 Subject: fs: cache optimise dentry and inode for rcu-walk Put dentry and inode fields into top of data structure. This allows RCU path traversal to perform an RCU dentry lookup in a path walk by touching only the first 56 bytes of the dentry. We also fit in 8 bytes of inline name in the first 64 bytes, so for short names, only 64 bytes needs to be touched to perform the lookup. We should get rid of the hash->prev pointer from the first 64 bytes, and fit 16 bytes of name in there, which will take care of 81% rather than 32% of the kernel tree. inode is also rearranged so that RCU lookup will only touch a single cacheline in the inode, plus one in the i_ops structure. This is important for directory component lookups in RCU path walking. In the kernel source, directory names average is around 6 chars, so this works. When we reach the last element of the lookup, we need to lock it and take its refcount which requires another cacheline access. Align dentry and inode operations structs, so members will be at predictable offsets and we can group common operations into head of structure. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 36 +++++++++++++++++++----------------- include/linux/fs.h | 42 +++++++++++++++++++++++------------------- 2 files changed, 42 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index f4b40a751f09..b1aeda077258 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -82,25 +82,33 @@ full_name_hash(const unsigned char *name, unsigned int len) * large memory footprint increase). */ #ifdef CONFIG_64BIT -#define DNAME_INLINE_LEN_MIN 32 /* 192 bytes */ +# define DNAME_INLINE_LEN 32 /* 192 bytes */ #else -#define DNAME_INLINE_LEN_MIN 40 /* 128 bytes */ +# ifdef CONFIG_SMP +# define DNAME_INLINE_LEN 36 /* 128 bytes */ +# else +# define DNAME_INLINE_LEN 40 /* 128 bytes */ +# endif #endif struct dentry { - unsigned int d_count; /* protected by d_lock */ + /* RCU lookup touched fields */ unsigned int d_flags; /* protected by d_lock */ - spinlock_t d_lock; /* per dentry lock */ seqcount_t d_seq; /* per dentry seqlock */ - struct inode *d_inode; /* Where the name belongs to - NULL is - * negative */ - /* - * The next three fields are touched by __d_lookup. Place them here - * so they all fit in a cache line. - */ struct hlist_node d_hash; /* lookup hash list */ struct dentry *d_parent; /* parent directory */ struct qstr d_name; + struct inode *d_inode; /* Where the name belongs to - NULL is + * negative */ + unsigned char d_iname[DNAME_INLINE_LEN]; /* small names */ + + /* Ref lookup also touches following */ + unsigned int d_count; /* protected by d_lock */ + spinlock_t d_lock; /* per dentry lock */ + const struct dentry_operations *d_op; + struct super_block *d_sb; /* The root of the dentry tree */ + unsigned long d_time; /* used by d_revalidate */ + void *d_fsdata; /* fs-specific data */ struct list_head d_lru; /* LRU list */ /* @@ -112,12 +120,6 @@ struct dentry { } d_u; struct list_head d_subdirs; /* our children */ struct list_head d_alias; /* inode alias list */ - unsigned long d_time; /* used by d_revalidate */ - const struct dentry_operations *d_op; - struct super_block *d_sb; /* The root of the dentry tree */ - void *d_fsdata; /* fs-specific data */ - - unsigned char d_iname[DNAME_INLINE_LEN_MIN]; /* small names */ }; /* @@ -143,7 +145,7 @@ struct dentry_operations { void (*d_release)(struct dentry *); void (*d_iput)(struct dentry *, struct inode *); char *(*d_dname)(struct dentry *, char *, int); -}; +} ____cacheline_aligned; /* * Locking rules for dentry_operations callbacks are to be found in diff --git a/include/linux/fs.h b/include/linux/fs.h index ea202fff44f8..a04aa96acb71 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -733,6 +733,20 @@ struct posix_acl; #define ACL_NOT_CACHED ((void *)(-1)) struct inode { + /* RCU path lookup touches following: */ + umode_t i_mode; + uid_t i_uid; + gid_t i_gid; + const struct inode_operations *i_op; + struct super_block *i_sb; + + spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ + unsigned int i_flags; + struct mutex i_mutex; + + unsigned long i_state; + unsigned long dirtied_when; /* jiffies of first dirtying */ + struct hlist_node i_hash; struct list_head i_wb_list; /* backing dev IO list */ struct list_head i_lru; /* inode LRU list */ @@ -744,8 +758,6 @@ struct inode { unsigned long i_ino; atomic_t i_count; unsigned int i_nlink; - uid_t i_uid; - gid_t i_gid; dev_t i_rdev; unsigned int i_blkbits; u64 i_version; @@ -758,13 +770,8 @@ struct inode { struct timespec i_ctime; blkcnt_t i_blocks; unsigned short i_bytes; - umode_t i_mode; - spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */ - struct mutex i_mutex; struct rw_semaphore i_alloc_sem; - const struct inode_operations *i_op; const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ - struct super_block *i_sb; struct file_lock *i_flock; struct address_space *i_mapping; struct address_space i_data; @@ -785,11 +792,6 @@ struct inode { struct hlist_head i_fsnotify_marks; #endif - unsigned long i_state; - unsigned long dirtied_when; /* jiffies of first dirtying */ - - unsigned int i_flags; - #ifdef CONFIG_IMA /* protected by i_lock */ unsigned int i_readcount; /* struct files open RO */ @@ -1549,8 +1551,15 @@ struct file_operations { }; struct inode_operations { - int (*create) (struct inode *,struct dentry *,int, struct nameidata *); struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); + void * (*follow_link) (struct dentry *, struct nameidata *); + int (*permission) (struct inode *, int); + int (*check_acl)(struct inode *, int); + + int (*readlink) (struct dentry *, char __user *,int); + void (*put_link) (struct dentry *, struct nameidata *, void *); + + int (*create) (struct inode *,struct dentry *,int, struct nameidata *); int (*link) (struct dentry *,struct inode *,struct dentry *); int (*unlink) (struct inode *,struct dentry *); int (*symlink) (struct inode *,struct dentry *,const char *); @@ -1559,12 +1568,7 @@ struct inode_operations { int (*mknod) (struct inode *,struct dentry *,int,dev_t); int (*rename) (struct inode *, struct dentry *, struct inode *, struct dentry *); - int (*readlink) (struct dentry *, char __user *,int); - void * (*follow_link) (struct dentry *, struct nameidata *); - void (*put_link) (struct dentry *, struct nameidata *, void *); void (*truncate) (struct inode *); - int (*permission) (struct inode *, int); - int (*check_acl)(struct inode *, int); int (*setattr) (struct dentry *, struct iattr *); int (*getattr) (struct vfsmount *mnt, struct dentry *, struct kstat *); int (*setxattr) (struct dentry *, const char *,const void *,size_t,int); @@ -1576,7 +1580,7 @@ struct inode_operations { loff_t len); int (*fiemap)(struct inode *, struct fiemap_extent_info *, u64 start, u64 len); -}; +} ____cacheline_aligned; struct seq_file; -- cgit From 34286d6662308d82aed891852d04c7c3a2649b16 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:57 +1100 Subject: fs: rcu-walk aware d_revalidate method Require filesystems be aware of .d_revalidate being called in rcu-walk mode (nd->flags & LOOKUP_RCU). For now do a simple push down, returning -ECHILD from all implementations. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index b1aeda077258..8b2064d02928 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -190,7 +190,6 @@ struct dentry_operations { #define DCACHE_OP_REVALIDATE 0x4000 #define DCACHE_OP_DELETE 0x8000 - extern spinlock_t dcache_inode_lock; extern seqlock_t rename_lock; -- cgit From b74c79e99389cd79b31fcc08f82c24e492e63c7e Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:58 +1100 Subject: fs: provide rcu-walk aware permission i_ops Signed-off-by: Nick Piggin --- include/linux/coda_linux.h | 2 +- include/linux/fs.h | 10 ++++++---- include/linux/generic_acl.h | 2 +- include/linux/nfs_fs.h | 2 +- include/linux/reiserfs_xattr.h | 2 +- 5 files changed, 10 insertions(+), 8 deletions(-) (limited to 'include/linux') diff --git a/include/linux/coda_linux.h b/include/linux/coda_linux.h index 2e914d0771b9..4ccc59c1ea82 100644 --- a/include/linux/coda_linux.h +++ b/include/linux/coda_linux.h @@ -37,7 +37,7 @@ extern const struct file_operations coda_ioctl_operations; /* operations shared over more than one file */ int coda_open(struct inode *i, struct file *f); int coda_release(struct inode *i, struct file *f); -int coda_permission(struct inode *inode, int mask); +int coda_permission(struct inode *inode, int mask, unsigned int flags); int coda_revalidate_inode(struct dentry *); int coda_getattr(struct vfsmount *, struct dentry *, struct kstat *); int coda_setattr(struct dentry *, struct iattr *); diff --git a/include/linux/fs.h b/include/linux/fs.h index a04aa96acb71..d5a4d42f655b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1550,11 +1550,13 @@ struct file_operations { int (*setlease)(struct file *, long, struct file_lock **); }; +#define IPERM_FLAG_RCU 0x0001 + struct inode_operations { struct dentry * (*lookup) (struct inode *,struct dentry *, struct nameidata *); void * (*follow_link) (struct dentry *, struct nameidata *); - int (*permission) (struct inode *, int); - int (*check_acl)(struct inode *, int); + int (*permission) (struct inode *, int, unsigned int); + int (*check_acl)(struct inode *, int, unsigned int); int (*readlink) (struct dentry *, char __user *,int); void (*put_link) (struct dentry *, struct nameidata *, void *); @@ -2165,8 +2167,8 @@ extern sector_t bmap(struct inode *, sector_t); #endif extern int notify_change(struct dentry *, struct iattr *); extern int inode_permission(struct inode *, int); -extern int generic_permission(struct inode *, int, - int (*check_acl)(struct inode *, int)); +extern int generic_permission(struct inode *, int, unsigned int, + int (*check_acl)(struct inode *, int, unsigned int)); static inline bool execute_ok(struct inode *inode) { diff --git a/include/linux/generic_acl.h b/include/linux/generic_acl.h index 574bea4013b6..0437e377b555 100644 --- a/include/linux/generic_acl.h +++ b/include/linux/generic_acl.h @@ -10,6 +10,6 @@ extern const struct xattr_handler generic_acl_default_handler; int generic_acl_init(struct inode *, struct inode *); int generic_acl_chmod(struct inode *); -int generic_check_acl(struct inode *inode, int mask); +int generic_check_acl(struct inode *inode, int mask, unsigned int flags); #endif /* LINUX_GENERIC_ACL_H */ diff --git a/include/linux/nfs_fs.h b/include/linux/nfs_fs.h index 29d504d5d1c3..0779bb8f95be 100644 --- a/include/linux/nfs_fs.h +++ b/include/linux/nfs_fs.h @@ -351,7 +351,7 @@ extern int nfs_refresh_inode(struct inode *, struct nfs_fattr *); extern int nfs_post_op_update_inode(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_post_op_update_inode_force_wcc(struct inode *inode, struct nfs_fattr *fattr); extern int nfs_getattr(struct vfsmount *, struct dentry *, struct kstat *); -extern int nfs_permission(struct inode *, int); +extern int nfs_permission(struct inode *, int, unsigned int); extern int nfs_open(struct inode *, struct file *); extern int nfs_release(struct inode *, struct file *); extern int nfs_attribute_timeout(struct inode *inode); diff --git a/include/linux/reiserfs_xattr.h b/include/linux/reiserfs_xattr.h index b2cf2089769b..3b94c91f20a6 100644 --- a/include/linux/reiserfs_xattr.h +++ b/include/linux/reiserfs_xattr.h @@ -41,7 +41,7 @@ int reiserfs_xattr_init(struct super_block *sb, int mount_flags); int reiserfs_lookup_privroot(struct super_block *sb); int reiserfs_delete_xattrs(struct inode *inode); int reiserfs_chown_xattrs(struct inode *inode, struct iattr *attrs); -int reiserfs_permission(struct inode *inode, int mask); +int reiserfs_permission(struct inode *inode, int mask, unsigned int flags); #ifdef CONFIG_REISERFS_FS_XATTR #define has_xattr_dir(inode) (REISERFS_I(inode)->i_flags & i_has_xattr_dir) -- cgit From 1e1743ebe35ec7e3c1fa732408358fbc614cbbe5 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:49:59 +1100 Subject: fs: provide simple rcu-walk generic_check_acl implementation This simple implementation just checks for no ACLs on the inode, and if so, then the rcu-walk may proceed, otherwise fail it. This could easily be extended to put acls under RCU and check them under seqlock, if need be. But this implementation is enough to show the rcu-walk aware permissions code for path lookups is working, and will handle cases where there are no ACLs or ACLs in just the final element. This patch implicity converts tmpfs to rcu-aware permission check. Subsequent patches onvert ext*, xfs, and, btrfs. Each of these uses acl/permission code in a different way, so convert them all to provide templates and proof of concept. Signed-off-by: Nick Piggin --- include/linux/posix_acl.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) (limited to 'include/linux') diff --git a/include/linux/posix_acl.h b/include/linux/posix_acl.h index 67608161df6b..d68283a898bb 100644 --- a/include/linux/posix_acl.h +++ b/include/linux/posix_acl.h @@ -108,6 +108,25 @@ static inline struct posix_acl *get_cached_acl(struct inode *inode, int type) return acl; } +static inline int negative_cached_acl(struct inode *inode, int type) +{ + struct posix_acl **p, *acl; + switch (type) { + case ACL_TYPE_ACCESS: + p = &inode->i_acl; + break; + case ACL_TYPE_DEFAULT: + p = &inode->i_default_acl; + break; + default: + BUG(); + } + acl = ACCESS_ONCE(*p); + if (acl) + return 0; + return 1; +} + static inline void set_cached_acl(struct inode *inode, int type, struct posix_acl *acl) -- cgit From 4e35e6070b1ceed89c3bba2af4216c286fb1dafd Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:50:03 +1100 Subject: kernel: add bl_list Introduce a type of hlist that can support the use of the lowest bit in the hlist_head. This will be subsequently used to implement per-bucket bit spinlock for inode and dentry hashes, and may be useful in other cases such as network hashes. Reviewed-by: Paul E. McKenney Signed-off-by: Nick Piggin --- include/linux/list_bl.h | 144 +++++++++++++++++++++++++++++++++++++++++++++ include/linux/rculist_bl.h | 127 +++++++++++++++++++++++++++++++++++++++ 2 files changed, 271 insertions(+) create mode 100644 include/linux/list_bl.h create mode 100644 include/linux/rculist_bl.h (limited to 'include/linux') diff --git a/include/linux/list_bl.h b/include/linux/list_bl.h new file mode 100644 index 000000000000..9ee97e7f2be4 --- /dev/null +++ b/include/linux/list_bl.h @@ -0,0 +1,144 @@ +#ifndef _LINUX_LIST_BL_H +#define _LINUX_LIST_BL_H + +#include + +/* + * Special version of lists, where head of the list has a lock in the lowest + * bit. This is useful for scalable hash tables without increasing memory + * footprint overhead. + * + * For modification operations, the 0 bit of hlist_bl_head->first + * pointer must be set. + * + * With some small modifications, this can easily be adapted to store several + * arbitrary bits (not just a single lock bit), if the need arises to store + * some fast and compact auxiliary data. + */ + +#if defined(CONFIG_SMP) || defined(CONFIG_DEBUG_SPINLOCK) +#define LIST_BL_LOCKMASK 1UL +#else +#define LIST_BL_LOCKMASK 0UL +#endif + +#ifdef CONFIG_DEBUG_LIST +#define LIST_BL_BUG_ON(x) BUG_ON(x) +#else +#define LIST_BL_BUG_ON(x) +#endif + + +struct hlist_bl_head { + struct hlist_bl_node *first; +}; + +struct hlist_bl_node { + struct hlist_bl_node *next, **pprev; +}; +#define INIT_HLIST_BL_HEAD(ptr) \ + ((ptr)->first = NULL) + +static inline void INIT_HLIST_BL_NODE(struct hlist_bl_node *h) +{ + h->next = NULL; + h->pprev = NULL; +} + +#define hlist_bl_entry(ptr, type, member) container_of(ptr,type,member) + +static inline int hlist_bl_unhashed(const struct hlist_bl_node *h) +{ + return !h->pprev; +} + +static inline struct hlist_bl_node *hlist_bl_first(struct hlist_bl_head *h) +{ + return (struct hlist_bl_node *) + ((unsigned long)h->first & ~LIST_BL_LOCKMASK); +} + +static inline void hlist_bl_set_first(struct hlist_bl_head *h, + struct hlist_bl_node *n) +{ + LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); + LIST_BL_BUG_ON(!((unsigned long)h->first & LIST_BL_LOCKMASK)); + h->first = (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK); +} + +static inline int hlist_bl_empty(const struct hlist_bl_head *h) +{ + return !((unsigned long)h->first & ~LIST_BL_LOCKMASK); +} + +static inline void hlist_bl_add_head(struct hlist_bl_node *n, + struct hlist_bl_head *h) +{ + struct hlist_bl_node *first = hlist_bl_first(h); + + n->next = first; + if (first) + first->pprev = &n->next; + n->pprev = &h->first; + hlist_bl_set_first(h, n); +} + +static inline void __hlist_bl_del(struct hlist_bl_node *n) +{ + struct hlist_bl_node *next = n->next; + struct hlist_bl_node **pprev = n->pprev; + + LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); + + /* pprev may be `first`, so be careful not to lose the lock bit */ + *pprev = (struct hlist_bl_node *) + ((unsigned long)next | + ((unsigned long)*pprev & LIST_BL_LOCKMASK)); + if (next) + next->pprev = pprev; +} + +static inline void hlist_bl_del(struct hlist_bl_node *n) +{ + __hlist_bl_del(n); + n->next = LIST_POISON1; + n->pprev = LIST_POISON2; +} + +static inline void hlist_bl_del_init(struct hlist_bl_node *n) +{ + if (!hlist_bl_unhashed(n)) { + __hlist_bl_del(n); + INIT_HLIST_BL_NODE(n); + } +} + +/** + * hlist_bl_for_each_entry - iterate over list of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_node to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + * + */ +#define hlist_bl_for_each_entry(tpos, pos, head, member) \ + for (pos = hlist_bl_first(head); \ + pos && \ + ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \ + pos = pos->next) + +/** + * hlist_bl_for_each_entry_safe - iterate over list of given type safe against removal of list entry + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_node to use as a loop cursor. + * @n: another &struct hlist_node to use as temporary storage + * @head: the head for your list. + * @member: the name of the hlist_node within the struct. + */ +#define hlist_bl_for_each_entry_safe(tpos, pos, n, head, member) \ + for (pos = hlist_bl_first(head); \ + pos && ({ n = pos->next; 1; }) && \ + ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1;}); \ + pos = n) + +#endif diff --git a/include/linux/rculist_bl.h b/include/linux/rculist_bl.h new file mode 100644 index 000000000000..b872b493724d --- /dev/null +++ b/include/linux/rculist_bl.h @@ -0,0 +1,127 @@ +#ifndef _LINUX_RCULIST_BL_H +#define _LINUX_RCULIST_BL_H + +/* + * RCU-protected bl list version. See include/linux/list_bl.h. + */ +#include +#include + +static inline void hlist_bl_set_first_rcu(struct hlist_bl_head *h, + struct hlist_bl_node *n) +{ + LIST_BL_BUG_ON((unsigned long)n & LIST_BL_LOCKMASK); + LIST_BL_BUG_ON(!((unsigned long)h->first & LIST_BL_LOCKMASK)); + rcu_assign_pointer(h->first, + (struct hlist_bl_node *)((unsigned long)n | LIST_BL_LOCKMASK)); +} + +static inline struct hlist_bl_node *hlist_bl_first_rcu(struct hlist_bl_head *h) +{ + return (struct hlist_bl_node *) + ((unsigned long)rcu_dereference(h->first) & ~LIST_BL_LOCKMASK); +} + +/** + * hlist_bl_del_init_rcu - deletes entry from hash list with re-initialization + * @n: the element to delete from the hash list. + * + * Note: hlist_bl_unhashed() on the node returns true after this. It is + * useful for RCU based read lockfree traversal if the writer side + * must know if the list entry is still hashed or already unhashed. + * + * In particular, it means that we can not poison the forward pointers + * that may still be used for walking the hash list and we can only + * zero the pprev pointer so list_unhashed() will return true after + * this. + * + * The caller must take whatever precautions are necessary (such as + * holding appropriate locks) to avoid racing with another + * list-mutation primitive, such as hlist_bl_add_head_rcu() or + * hlist_bl_del_rcu(), running on this same list. However, it is + * perfectly legal to run concurrently with the _rcu list-traversal + * primitives, such as hlist_bl_for_each_entry_rcu(). + */ +static inline void hlist_bl_del_init_rcu(struct hlist_bl_node *n) +{ + if (!hlist_bl_unhashed(n)) { + __hlist_bl_del(n); + n->pprev = NULL; + } +} + +/** + * hlist_bl_del_rcu - deletes entry from hash list without re-initialization + * @n: the element to delete from the hash list. + * + * Note: hlist_bl_unhashed() on entry does not return true after this, + * the entry is in an undefined state. It is useful for RCU based + * lockfree traversal. + * + * In particular, it means that we can not poison the forward + * pointers that may still be used for walking the hash list. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as hlist_bl_add_head_rcu() + * or hlist_bl_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * hlist_bl_for_each_entry(). + */ +static inline void hlist_bl_del_rcu(struct hlist_bl_node *n) +{ + __hlist_bl_del(n); + n->pprev = LIST_POISON2; +} + +/** + * hlist_bl_add_head_rcu + * @n: the element to add to the hash list. + * @h: the list to add to. + * + * Description: + * Adds the specified element to the specified hlist_bl, + * while permitting racing traversals. + * + * The caller must take whatever precautions are necessary + * (such as holding appropriate locks) to avoid racing + * with another list-mutation primitive, such as hlist_bl_add_head_rcu() + * or hlist_bl_del_rcu(), running on this same list. + * However, it is perfectly legal to run concurrently with + * the _rcu list-traversal primitives, such as + * hlist_bl_for_each_entry_rcu(), used to prevent memory-consistency + * problems on Alpha CPUs. Regardless of the type of CPU, the + * list-traversal primitive must be guarded by rcu_read_lock(). + */ +static inline void hlist_bl_add_head_rcu(struct hlist_bl_node *n, + struct hlist_bl_head *h) +{ + struct hlist_bl_node *first; + + /* don't need hlist_bl_first_rcu because we're under lock */ + first = hlist_bl_first(h); + + n->next = first; + if (first) + first->pprev = &n->next; + n->pprev = &h->first; + + /* need _rcu because we can have concurrent lock free readers */ + hlist_bl_set_first_rcu(h, n); +} +/** + * hlist_bl_for_each_entry_rcu - iterate over rcu list of given type + * @tpos: the type * to use as a loop cursor. + * @pos: the &struct hlist_bl_node to use as a loop cursor. + * @head: the head for your list. + * @member: the name of the hlist_bl_node within the struct. + * + */ +#define hlist_bl_for_each_entry_rcu(tpos, pos, head, member) \ + for (pos = hlist_bl_first_rcu(head); \ + pos && \ + ({ tpos = hlist_bl_entry(pos, typeof(*tpos), member); 1; }); \ + pos = rcu_dereference_raw(pos->next)) + +#endif -- cgit From 626d607435617cc0f033522083e2bb195b81813c Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:50:04 +1100 Subject: bit_spinlock: add required includes Signed-off-by: Nick Piggin --- include/linux/bit_spinlock.h | 4 ++++ 1 file changed, 4 insertions(+) (limited to 'include/linux') diff --git a/include/linux/bit_spinlock.h b/include/linux/bit_spinlock.h index 7113a32a86ea..e612575a2596 100644 --- a/include/linux/bit_spinlock.h +++ b/include/linux/bit_spinlock.h @@ -1,6 +1,10 @@ #ifndef __LINUX_BIT_SPINLOCK_H #define __LINUX_BIT_SPINLOCK_H +#include +#include +#include + /* * bit-based spin_lock() * -- cgit From ceb5bdc2d246f6d81cf61ed70f325308a11821d2 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:50:05 +1100 Subject: fs: dcache per-bucket dcache hash locking We can turn the dcache hash locking from a global dcache_hash_lock into per-bucket locking. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 3 ++- include/linux/fs.h | 3 ++- 2 files changed, 4 insertions(+), 2 deletions(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 8b2064d02928..5f0392ef759b 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -4,6 +4,7 @@ #include #include #include +#include #include #include #include @@ -95,7 +96,7 @@ struct dentry { /* RCU lookup touched fields */ unsigned int d_flags; /* protected by d_lock */ seqcount_t d_seq; /* per dentry seqlock */ - struct hlist_node d_hash; /* lookup hash list */ + struct hlist_bl_node d_hash; /* lookup hash list */ struct dentry *d_parent; /* parent directory */ struct qstr d_name; struct inode *d_inode; /* Where the name belongs to - NULL is diff --git a/include/linux/fs.h b/include/linux/fs.h index d5a4d42f655b..baf3e556ff0e 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -392,6 +392,7 @@ struct inodes_stat_t { #include #include #include +#include #include #include @@ -1377,7 +1378,7 @@ struct super_block { const struct xattr_handler **s_xattr; struct list_head s_inodes; /* all inodes */ - struct hlist_head s_anon; /* anonymous dentries for (nfs) exporting */ + struct hlist_bl_head s_anon; /* anonymous dentries for (nfs) exporting */ #ifdef CONFIG_SMP struct list_head __percpu *s_files; #else -- cgit From 873feea09ebc980cbd3631b767356ce1eee65ec1 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:50:06 +1100 Subject: fs: dcache per-inode inode alias locking dcache_inode_lock can be replaced with per-inode locking. Use existing inode->i_lock for this. This is slightly non-trivial because we sometimes need to find the inode from the dentry, which requires d_inode to be stabilised (either with refcount or d_lock). Signed-off-by: Nick Piggin --- include/linux/dcache.h | 1 - 1 file changed, 1 deletion(-) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index 5f0392ef759b..d719e4de8046 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -191,7 +191,6 @@ struct dentry_operations { #define DCACHE_OP_REVALIDATE 0x4000 #define DCACHE_OP_DELETE 0x8000 -extern spinlock_t dcache_inode_lock; extern seqlock_t rename_lock; static inline int dname_external(struct dentry *dentry) -- cgit From 4b936885ab04dc6e0bb0ef35e0e23c1a7364d9e5 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:50:07 +1100 Subject: fs: improve scalability of pseudo filesystems Regardless of how much we possibly try to scale dcache, there is likely always going to be some fundamental contention when adding or removing children under the same parent. Pseudo filesystems do not seem need to have connected dentries because by definition they are disconnected. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 1 + 1 file changed, 1 insertion(+) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index d719e4de8046..c0a2ca97c72f 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -211,6 +211,7 @@ extern void d_set_d_op(struct dentry *dentry, const struct dentry_operations *op /* allocate/de-allocate */ extern struct dentry * d_alloc(struct dentry *, const struct qstr *); +extern struct dentry * d_alloc_pseudo(struct super_block *, const struct qstr *); extern struct dentry * d_splice_alias(struct inode *, struct dentry *); extern struct dentry * d_add_ci(struct dentry *, struct inode *, struct qstr *); extern struct dentry * d_obtain_alias(struct inode *); -- cgit From 9d55c369bb5e695e629bc35cba2ef607755b3bee Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:50:09 +1100 Subject: fs: implement faster dentry memcmp The standard memcmp function on a Westmere system shows up hot in profiles in the `git diff` workload (both parallel and single threaded), and it is likely due to the costs associated with trapping into microcode, and little opportunity to improve memory access (dentry name is not likely to take up more than a cacheline). So replace it with an open-coded byte comparison. This increases code size by 8 bytes in the critical __d_lookup_rcu function, but the speedup is huge, averaging 10 runs of each: git diff st user sys elapsed CPU before 1.15 2.57 3.82 97.1 after 1.14 2.35 3.61 96.8 git diff mt user sys elapsed CPU before 1.27 3.85 1.46 349 after 1.26 3.54 1.43 333 Elapsed time for single threaded git diff at 95.0% confidence: -0.21 +/- 0.01 -5.45% +/- 0.24% It's -0.66% +/- 0.06% elapsed time on my Opteron, so rep cmp costs on the fam10h seem to be relatively smaller, but there is still a win. Signed-off-by: Nick Piggin --- include/linux/dcache.h | 21 +++++++++++++++++++++ 1 file changed, 21 insertions(+) (limited to 'include/linux') diff --git a/include/linux/dcache.h b/include/linux/dcache.h index c0a2ca97c72f..bd07758943e0 100644 --- a/include/linux/dcache.h +++ b/include/linux/dcache.h @@ -47,6 +47,27 @@ struct dentry_stat_t { }; extern struct dentry_stat_t dentry_stat; +/* + * Compare 2 name strings, return 0 if they match, otherwise non-zero. + * The strings are both count bytes long, and count is non-zero. + */ +static inline int dentry_cmp(const unsigned char *cs, size_t scount, + const unsigned char *ct, size_t tcount) +{ + int ret; + if (scount != tcount) + return 1; + do { + ret = (*cs != *ct); + if (ret) + break; + cs++; + ct++; + tcount--; + } while (tcount); + return ret; +} + /* Name hashing routines. Initial hash value */ /* Hash courtesy of the R5 hash in reiserfs modulo sign bits */ #define init_name_hash() 0 -- cgit From b3e19d924b6eaf2ca7d22cba99a517c5171007b6 Mon Sep 17 00:00:00 2001 From: Nick Piggin Date: Fri, 7 Jan 2011 17:50:11 +1100 Subject: fs: scale mntget/mntput The problem that this patch aims to fix is vfsmount refcounting scalability. We need to take a reference on the vfsmount for every successful path lookup, which often go to the same mount point. The fundamental difficulty is that a "simple" reference count can never be made scalable, because any time a reference is dropped, we must check whether that was the last reference. To do that requires communication with all other CPUs that may have taken a reference count. We can make refcounts more scalable in a couple of ways, involving keeping distributed counters, and checking for the global-zero condition less frequently. - check the global sum once every interval (this will delay zero detection for some interval, so it's probably a showstopper for vfsmounts). - keep a local count and only taking the global sum when local reaches 0 (this is difficult for vfsmounts, because we can't hold preempt off for the life of a reference, so a counter would need to be per-thread or tied strongly to a particular CPU which requires more locking). - keep a local difference of increments and decrements, which allows us to sum the total difference and hence find the refcount when summing all CPUs. Then, keep a single integer "long" refcount for slow and long lasting references, and only take the global sum of local counters when the long refcount is 0. This last scheme is what I implemented here. Attached mounts and process root and working directory references are "long" references, and everything else is a short reference. This allows scalable vfsmount references during path walking over mounted subtrees and unattached (lazy umounted) mounts with processes still running in them. This results in one fewer atomic op in the fastpath: mntget is now just a per-CPU inc, rather than an atomic inc; and mntput just requires a spinlock and non-atomic decrement in the common case. However code is otherwise bigger and heavier, so single threaded performance is basically a wash. Signed-off-by: Nick Piggin --- include/linux/mount.h | 53 +++++++++++++++++---------------------------------- include/linux/path.h | 2 ++ 2 files changed, 19 insertions(+), 36 deletions(-) (limited to 'include/linux') diff --git a/include/linux/mount.h b/include/linux/mount.h index 5e7a59408dd4..1869ea24a739 100644 --- a/include/linux/mount.h +++ b/include/linux/mount.h @@ -13,6 +13,7 @@ #include #include #include +#include #include struct super_block; @@ -46,12 +47,24 @@ struct mnt_namespace; #define MNT_INTERNAL 0x4000 +struct mnt_pcp { + int mnt_count; + int mnt_writers; +}; + struct vfsmount { struct list_head mnt_hash; struct vfsmount *mnt_parent; /* fs we are mounted on */ struct dentry *mnt_mountpoint; /* dentry of mountpoint */ struct dentry *mnt_root; /* root of the mounted tree */ struct super_block *mnt_sb; /* pointer to superblock */ +#ifdef CONFIG_SMP + struct mnt_pcp __percpu *mnt_pcp; + atomic_t mnt_longrefs; +#else + int mnt_count; + int mnt_writers; +#endif struct list_head mnt_mounts; /* list of children, anchored here */ struct list_head mnt_child; /* and going through their mnt_child */ int mnt_flags; @@ -70,57 +83,25 @@ struct vfsmount { struct mnt_namespace *mnt_ns; /* containing namespace */ int mnt_id; /* mount identifier */ int mnt_group_id; /* peer group identifier */ - /* - * We put mnt_count & mnt_expiry_mark at the end of struct vfsmount - * to let these frequently modified fields in a separate cache line - * (so that reads of mnt_flags wont ping-pong on SMP machines) - */ - atomic_t mnt_count; int mnt_expiry_mark; /* true if marked for expiry */ int mnt_pinned; int mnt_ghosts; -#ifdef CONFIG_SMP - int __percpu *mnt_writers; -#else - int mnt_writers; -#endif }; -static inline int *get_mnt_writers_ptr(struct vfsmount *mnt) -{ -#ifdef CONFIG_SMP - return mnt->mnt_writers; -#else - return &mnt->mnt_writers; -#endif -} - -static inline struct vfsmount *mntget(struct vfsmount *mnt) -{ - if (mnt) - atomic_inc(&mnt->mnt_count); - return mnt; -} - struct file; /* forward dec */ extern int mnt_want_write(struct vfsmount *mnt); extern int mnt_want_write_file(struct file *file); extern int mnt_clone_write(struct vfsmount *mnt); extern void mnt_drop_write(struct vfsmount *mnt); -extern void mntput_no_expire(struct vfsmount *mnt); +extern void mntput(struct vfsmount *mnt); +extern struct vfsmount *mntget(struct vfsmount *mnt); +extern void mntput_long(struct vfsmount *mnt); +extern struct vfsmount *mntget_long(struct vfsmount *mnt); extern void mnt_pin(struct vfsmount *mnt); extern void mnt_unpin(struct vfsmount *mnt); extern int __mnt_is_readonly(struct vfsmount *mnt); -static inline void mntput(struct vfsmount *mnt) -{ - if (mnt) { - mnt->mnt_expiry_mark = 0; - mntput_no_expire(mnt); - } -} - extern struct vfsmount *do_kern_mount(const char *fstype, int flags, const char *name, void *data); diff --git a/include/linux/path.h b/include/linux/path.h index edc98dec6266..a581e8c06533 100644 --- a/include/linux/path.h +++ b/include/linux/path.h @@ -10,7 +10,9 @@ struct path { }; extern void path_get(struct path *); +extern void path_get_long(struct path *); extern void path_put(struct path *); +extern void path_put_long(struct path *); static inline int path_equal(const struct path *path1, const struct path *path2) { -- cgit