Commit 3ddcd056 authored by Linus Torvalds

vfs: optimize inode cache access patterns

The inode structure layout is largely random, and some of the vfs paths
really do care.  The path lookup in particular is already quite D$
intensive, and profiles show that accessing the 'inode->i_op->xyz'
fields is quite costly.

We already optimized the dcache to not unnecessarily load the d_op
structure for members that are often NULL using the DCACHE_OP_xyz bits
in dentry->d_flags, and this does something very similar for the inode
ops that are used during pathname lookup.

It also re-orders the fields so that the fields accessed by 'stat' are
together at the beginning of the inode structure, and roughly in the
order accessed.

The effect of this seems to be in the 1-2% range for an empty kernel
"make -j" run (which is fairly kernel-intensive, mostly in filename
lookup), so it's visible.  The numbers are fairly noisy, though, and
likely depend a lot on exact microarchitecture.  So there's more tuning
to be done.
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 830c0f0e
...@@ -143,6 +143,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode) ...@@ -143,6 +143,7 @@ int inode_init_always(struct super_block *sb, struct inode *inode)
inode->i_op = &empty_iops; inode->i_op = &empty_iops;
inode->i_fop = &empty_fops; inode->i_fop = &empty_fops;
inode->i_nlink = 1; inode->i_nlink = 1;
inode->i_opflags = 0;
inode->i_uid = 0; inode->i_uid = 0;
inode->i_gid = 0; inode->i_gid = 0;
atomic_set(&inode->i_writecount, 0); atomic_set(&inode->i_writecount, 0);
......
...@@ -308,6 +308,26 @@ int generic_permission(struct inode *inode, int mask) ...@@ -308,6 +308,26 @@ int generic_permission(struct inode *inode, int mask)
return -EACCES; return -EACCES;
} }
/*
* We _really_ want to just do "generic_permission()" without
* even looking at the inode->i_op values. So we keep a cache
* flag in inode->i_opflags, that says "this has no special
* permission function, use the fast case".
*
* Returns whatever inode->i_op->permission() returns when the inode
* provides that op, otherwise whatever generic_permission() returns.
*
* NOTE(review): once IOP_FASTPERM is set, i_op->permission is never
* consulted again for this inode, so this presumably relies on i_op
* not gaining a ->permission op later -- confirm.
*/
static inline int do_inode_permission(struct inode *inode, int mask)
{
/* Slow path: IOP_FASTPERM is not cached, so i_op has to be inspected */
if (unlikely(!(inode->i_opflags & IOP_FASTPERM))) {
if (likely(inode->i_op->permission))
return inode->i_op->permission(inode, mask);
/* This gets set once for the inode lifetime */
/* No ->permission op: record that under i_lock, then fall through */
spin_lock(&inode->i_lock);
inode->i_opflags |= IOP_FASTPERM;
spin_unlock(&inode->i_lock);
}
return generic_permission(inode, mask);
}
/** /**
* inode_permission - check for access rights to a given inode * inode_permission - check for access rights to a given inode
* @inode: inode to check permission on * @inode: inode to check permission on
...@@ -322,7 +342,7 @@ int inode_permission(struct inode *inode, int mask) ...@@ -322,7 +342,7 @@ int inode_permission(struct inode *inode, int mask)
{ {
int retval; int retval;
if (mask & MAY_WRITE) { if (unlikely(mask & MAY_WRITE)) {
umode_t mode = inode->i_mode; umode_t mode = inode->i_mode;
/* /*
...@@ -339,11 +359,7 @@ int inode_permission(struct inode *inode, int mask) ...@@ -339,11 +359,7 @@ int inode_permission(struct inode *inode, int mask)
return -EACCES; return -EACCES;
} }
if (inode->i_op->permission) retval = do_inode_permission(inode, mask);
retval = inode->i_op->permission(inode, mask);
else
retval = generic_permission(inode, mask);
if (retval) if (retval)
return retval; return retval;
...@@ -1245,6 +1261,26 @@ static void terminate_walk(struct nameidata *nd) ...@@ -1245,6 +1261,26 @@ static void terminate_walk(struct nameidata *nd)
} }
} }
/*
* Do we need to follow links? We _really_ want to be able
* to do this check without having to look at inode->i_op,
* so we keep a cache of "no, this doesn't need follow_link"
* for the common case.
*
* Returns the caller's 'follow' value when the inode has a
* ->follow_link op, and 0 when it does not. The "does not" answer
* is cached in inode->i_opflags as IOP_NOFOLLOW.
*/
static inline int do_follow_link(struct inode *inode, int follow)
{
/* IOP_NOFOLLOW not cached yet: i_op has to be inspected */
if (unlikely(!(inode->i_opflags & IOP_NOFOLLOW))) {
/* The inode has ->follow_link; the caller's 'follow' decides */
if (likely(inode->i_op->follow_link))
return follow;
/* This gets set once for the inode lifetime */
/* No ->follow_link op: record that under i_lock */
spin_lock(&inode->i_lock);
inode->i_opflags |= IOP_NOFOLLOW;
spin_unlock(&inode->i_lock);
}
return 0;
}
static inline int walk_component(struct nameidata *nd, struct path *path, static inline int walk_component(struct nameidata *nd, struct path *path,
struct qstr *name, int type, int follow) struct qstr *name, int type, int follow)
{ {
...@@ -1267,7 +1303,7 @@ static inline int walk_component(struct nameidata *nd, struct path *path, ...@@ -1267,7 +1303,7 @@ static inline int walk_component(struct nameidata *nd, struct path *path,
terminate_walk(nd); terminate_walk(nd);
return -ENOENT; return -ENOENT;
} }
if (unlikely(inode->i_op->follow_link) && follow) { if (do_follow_link(inode, follow)) {
if (nd->flags & LOOKUP_RCU) { if (nd->flags & LOOKUP_RCU) {
if (unlikely(unlazy_walk(nd, path->dentry))) { if (unlikely(unlazy_walk(nd, path->dentry))) {
terminate_walk(nd); terminate_walk(nd);
...@@ -1319,6 +1355,26 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd) ...@@ -1319,6 +1355,26 @@ static inline int nested_symlink(struct path *path, struct nameidata *nd)
return res; return res;
} }
/*
* We really don't want to look at inode->i_op->lookup
* when we don't have to. So we keep a cache bit in
* the inode ->i_opflags field that says "yes, we can
* do lookup on this inode".
*
* Returns 1 when the inode has a ->lookup op and 0 when it does not.
* Only the positive answer is cached (IOP_LOOKUP); an inode without
* ->lookup has its i_op re-checked on every call.
*/
static inline int can_lookup(struct inode *inode)
{
/* Fast path: a previous call already saw a ->lookup op */
if (likely(inode->i_opflags & IOP_LOOKUP))
return 1;
/* No ->lookup op: nothing is cached for this case */
if (likely(!inode->i_op->lookup))
return 0;
/* We do this once for the lifetime of the inode */
/* i_opflags is updated under i_lock */
spin_lock(&inode->i_lock);
inode->i_opflags |= IOP_LOOKUP;
spin_unlock(&inode->i_lock);
return 1;
}
/* /*
* Name resolution. * Name resolution.
* This is the basic name resolution function, turning a pathname into * This is the basic name resolution function, turning a pathname into
...@@ -1398,10 +1454,10 @@ static int link_path_walk(const char *name, struct nameidata *nd) ...@@ -1398,10 +1454,10 @@ static int link_path_walk(const char *name, struct nameidata *nd)
if (err) if (err)
return err; return err;
} }
if (can_lookup(nd->inode))
continue;
err = -ENOTDIR; err = -ENOTDIR;
if (!nd->inode->i_op->lookup) break;
break;
continue;
/* here ends the main loop */ /* here ends the main loop */
last_component: last_component:
......
...@@ -27,12 +27,12 @@ void generic_fillattr(struct inode *inode, struct kstat *stat) ...@@ -27,12 +27,12 @@ void generic_fillattr(struct inode *inode, struct kstat *stat)
stat->uid = inode->i_uid; stat->uid = inode->i_uid;
stat->gid = inode->i_gid; stat->gid = inode->i_gid;
stat->rdev = inode->i_rdev; stat->rdev = inode->i_rdev;
stat->size = i_size_read(inode);
stat->atime = inode->i_atime; stat->atime = inode->i_atime;
stat->mtime = inode->i_mtime; stat->mtime = inode->i_mtime;
stat->ctime = inode->i_ctime; stat->ctime = inode->i_ctime;
stat->size = i_size_read(inode);
stat->blocks = inode->i_blocks;
stat->blksize = (1 << inode->i_blkbits); stat->blksize = (1 << inode->i_blkbits);
stat->blocks = inode->i_blocks;
} }
EXPORT_SYMBOL(generic_fillattr); EXPORT_SYMBOL(generic_fillattr);
......
...@@ -738,22 +738,54 @@ static inline int mapping_writably_mapped(struct address_space *mapping) ...@@ -738,22 +738,54 @@ static inline int mapping_writably_mapped(struct address_space *mapping)
struct posix_acl; struct posix_acl;
#define ACL_NOT_CACHED ((void *)(-1)) #define ACL_NOT_CACHED ((void *)(-1))
#define IOP_FASTPERM 0x0001
#define IOP_LOOKUP 0x0002
#define IOP_NOFOLLOW 0x0004
/*
* Keep mostly read-only and often accessed (especially for
* the RCU path lookup and 'stat' data) fields at the beginning
* of the 'struct inode'
*/
struct inode { struct inode {
/* RCU path lookup touches following: */
umode_t i_mode; umode_t i_mode;
unsigned short i_opflags;
uid_t i_uid; uid_t i_uid;
gid_t i_gid; gid_t i_gid;
unsigned int i_flags;
#ifdef CONFIG_FS_POSIX_ACL
struct posix_acl *i_acl;
struct posix_acl *i_default_acl;
#endif
const struct inode_operations *i_op; const struct inode_operations *i_op;
struct super_block *i_sb; struct super_block *i_sb;
struct address_space *i_mapping;
spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
unsigned int i_flags;
unsigned long i_state;
#ifdef CONFIG_SECURITY #ifdef CONFIG_SECURITY
void *i_security; void *i_security;
#endif #endif
struct mutex i_mutex;
/* Stat data, not accessed from path walking */
unsigned long i_ino;
unsigned int i_nlink;
dev_t i_rdev;
loff_t i_size;
struct timespec i_atime;
struct timespec i_mtime;
struct timespec i_ctime;
unsigned int i_blkbits;
blkcnt_t i_blocks;
#ifdef __NEED_I_SIZE_ORDERED
seqcount_t i_size_seqcount;
#endif
/* Misc */
unsigned long i_state;
spinlock_t i_lock; /* i_blocks, i_bytes, maybe i_size */
struct mutex i_mutex;
unsigned long dirtied_when; /* jiffies of first dirtying */ unsigned long dirtied_when; /* jiffies of first dirtying */
...@@ -765,25 +797,12 @@ struct inode { ...@@ -765,25 +797,12 @@ struct inode {
struct list_head i_dentry; struct list_head i_dentry;
struct rcu_head i_rcu; struct rcu_head i_rcu;
}; };
unsigned long i_ino;
atomic_t i_count; atomic_t i_count;
unsigned int i_nlink;
dev_t i_rdev;
unsigned int i_blkbits;
u64 i_version; u64 i_version;
loff_t i_size;
#ifdef __NEED_I_SIZE_ORDERED
seqcount_t i_size_seqcount;
#endif
struct timespec i_atime;
struct timespec i_mtime;
struct timespec i_ctime;
blkcnt_t i_blocks;
unsigned short i_bytes; unsigned short i_bytes;
atomic_t i_dio_count; atomic_t i_dio_count;
const struct file_operations *i_fop; /* former ->i_op->default_file_ops */ const struct file_operations *i_fop; /* former ->i_op->default_file_ops */
struct file_lock *i_flock; struct file_lock *i_flock;
struct address_space *i_mapping;
struct address_space i_data; struct address_space i_data;
#ifdef CONFIG_QUOTA #ifdef CONFIG_QUOTA
struct dquot *i_dquot[MAXQUOTAS]; struct dquot *i_dquot[MAXQUOTAS];
...@@ -806,10 +825,6 @@ struct inode { ...@@ -806,10 +825,6 @@ struct inode {
atomic_t i_readcount; /* struct files open RO */ atomic_t i_readcount; /* struct files open RO */
#endif #endif
atomic_t i_writecount; atomic_t i_writecount;
#ifdef CONFIG_FS_POSIX_ACL
struct posix_acl *i_acl;
struct posix_acl *i_default_acl;
#endif
void *i_private; /* fs or device private pointer */ void *i_private; /* fs or device private pointer */
}; };
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment