Commit fdd4e158 authored by Yan, Zheng; committed by Ilya Dryomov

ceph: rework dcache readdir

Previously our dcache readdir code relied on the child dentries in the
directory dentry's d_subdirs list being sorted by the dentries' offsets
in descending order. When adding dentries to the dcache, if a dentry
already existed, our readdir code moved it to the head of the directory
dentry's d_subdirs list. This design relies on dcache internals.
Al Viro suggests using ncpfs's approach: keeping an array of pointers
to dentries in the page cache of the directory inode. The validity of those
pointers is represented by the directory inode's complete and ordered
flags. When a dentry gets pruned, we clear the directory inode's complete
flag in the d_prune() callback. Before moving a dentry to another
directory, we clear the ordered flag for both the old and the new directory.
Signed-off-by: Yan, Zheng <zyan@redhat.com>
parent b459be73
...@@ -833,7 +833,9 @@ int __ceph_caps_used(struct ceph_inode_info *ci) ...@@ -833,7 +833,9 @@ int __ceph_caps_used(struct ceph_inode_info *ci)
used |= CEPH_CAP_PIN; used |= CEPH_CAP_PIN;
if (ci->i_rd_ref) if (ci->i_rd_ref)
used |= CEPH_CAP_FILE_RD; used |= CEPH_CAP_FILE_RD;
if (ci->i_rdcache_ref || ci->vfs_inode.i_data.nrpages) if (ci->i_rdcache_ref ||
(!S_ISDIR(ci->vfs_inode.i_mode) && /* ignore readdir cache */
ci->vfs_inode.i_data.nrpages))
used |= CEPH_CAP_FILE_CACHE; used |= CEPH_CAP_FILE_CACHE;
if (ci->i_wr_ref) if (ci->i_wr_ref)
used |= CEPH_CAP_FILE_WR; used |= CEPH_CAP_FILE_WR;
...@@ -1651,6 +1653,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, ...@@ -1651,6 +1653,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
* If we fail, it's because pages are locked.... try again later. * If we fail, it's because pages are locked.... try again later.
*/ */
if ((!is_delayed || mdsc->stopping) && if ((!is_delayed || mdsc->stopping) &&
!S_ISDIR(inode->i_mode) && /* ignore readdir cache */
ci->i_wrbuffer_ref == 0 && /* no dirty pages... */ ci->i_wrbuffer_ref == 0 && /* no dirty pages... */
inode->i_data.nrpages && /* have cached pages */ inode->i_data.nrpages && /* have cached pages */
(file_wanted == 0 || /* no open files */ (file_wanted == 0 || /* no open files */
...@@ -2805,7 +2808,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, ...@@ -2805,7 +2808,8 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
* try to invalidate (once). (If there are dirty buffers, we * try to invalidate (once). (If there are dirty buffers, we
* will invalidate _after_ writeback.) * will invalidate _after_ writeback.)
*/ */
if (((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) && if (!S_ISDIR(inode->i_mode) && /* don't invalidate readdir cache */
((cap->issued & ~newcaps) & CEPH_CAP_FILE_CACHE) &&
(newcaps & CEPH_CAP_FILE_LAZYIO) == 0 && (newcaps & CEPH_CAP_FILE_LAZYIO) == 0 &&
!ci->i_wrbuffer_ref) { !ci->i_wrbuffer_ref) {
if (try_nonblocking_invalidate(inode)) { if (try_nonblocking_invalidate(inode)) {
......
...@@ -106,6 +106,27 @@ static int fpos_cmp(loff_t l, loff_t r) ...@@ -106,6 +106,27 @@ static int fpos_cmp(loff_t l, loff_t r)
return (int)(fpos_off(l) - fpos_off(r)); return (int)(fpos_off(l) - fpos_off(r));
} }
/*
 * Remember the name of the last dentry we returned, along with the
 * offset at which the next chunk starts, so that a later readdir can
 * resume from the same lexicographical point no matter how the
 * directory changes on the server in the meantime.
 *
 * @fi:          per-open-file readdir state to update
 * @name:        name of the last dentry (need not be NUL-terminated)
 * @len:         number of bytes of @name to keep
 * @next_offset: value to store in fi->next_offset
 *
 * Returns 0 on success.  Returns -ENOMEM if the copy of the name cannot
 * be allocated; in that case @fi is left unmodified.
 */
static int note_last_dentry(struct ceph_file_info *fi, const char *name,
			    int len, unsigned next_offset)
{
	char *copy;

	copy = kmalloc(len + 1, GFP_KERNEL);
	if (copy == NULL)
		return -ENOMEM;

	/* build the NUL-terminated copy before swapping it into fi */
	memcpy(copy, name, len);
	copy[len] = 0;

	kfree(fi->last_name);
	fi->last_name = copy;
	fi->next_offset = next_offset;

	dout("note_last_dentry '%s'\n", fi->last_name);
	return 0;
}
/* /*
* When possible, we try to satisfy a readdir by peeking at the * When possible, we try to satisfy a readdir by peeking at the
* dcache. We make this work by carefully ordering dentries on * dcache. We make this work by carefully ordering dentries on
...@@ -123,123 +144,113 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx, ...@@ -123,123 +144,113 @@ static int __dcache_readdir(struct file *file, struct dir_context *ctx,
struct ceph_file_info *fi = file->private_data; struct ceph_file_info *fi = file->private_data;
struct dentry *parent = file->f_path.dentry; struct dentry *parent = file->f_path.dentry;
struct inode *dir = d_inode(parent); struct inode *dir = d_inode(parent);
struct list_head *p; struct dentry *dentry, *last = NULL;
struct dentry *dentry, *last;
struct ceph_dentry_info *di; struct ceph_dentry_info *di;
unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry *);
int err = 0; int err = 0;
loff_t ptr_pos = 0;
struct ceph_readdir_cache_control cache_ctl = {};
/* claim ref on last dentry we returned */ dout("__dcache_readdir %p v%u at %llu\n", dir, shared_gen, ctx->pos);
last = fi->dentry;
fi->dentry = NULL;
dout("__dcache_readdir %p v%u at %llu (last %p)\n",
dir, shared_gen, ctx->pos, last);
spin_lock(&parent->d_lock); /* we can calculate cache index for the first dirfrag */
if (ceph_frag_is_leftmost(fpos_frag(ctx->pos))) {
/* start at beginning? */ cache_ctl.index = fpos_off(ctx->pos) - 2;
if (ctx->pos == 2 || last == NULL || BUG_ON(cache_ctl.index < 0);
fpos_cmp(ctx->pos, ceph_dentry(last)->offset) < 0) { ptr_pos = cache_ctl.index * sizeof(struct dentry *);
if (list_empty(&parent->d_subdirs))
goto out_unlock;
p = parent->d_subdirs.prev;
dout(" initial p %p/%p\n", p->prev, p->next);
} else {
p = last->d_child.prev;
} }
more: while (true) {
dentry = list_entry(p, struct dentry, d_child); pgoff_t pgoff;
di = ceph_dentry(dentry); bool emit_dentry;
while (1) {
dout(" p %p/%p %s d_subdirs %p/%p\n", p->prev, p->next, if (ptr_pos >= i_size_read(dir)) {
d_unhashed(dentry) ? "!hashed" : "hashed",
parent->d_subdirs.prev, parent->d_subdirs.next);
if (p == &parent->d_subdirs) {
fi->flags |= CEPH_F_ATEND; fi->flags |= CEPH_F_ATEND;
goto out_unlock; err = 0;
break;
} }
spin_lock_nested(&dentry->d_lock, DENTRY_D_LOCK_NESTED);
if (di->lease_shared_gen == shared_gen && err = -EAGAIN;
!d_unhashed(dentry) && d_really_is_positive(dentry) && pgoff = ptr_pos >> PAGE_CACHE_SHIFT;
ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR && if (!cache_ctl.page || pgoff != page_index(cache_ctl.page)) {
ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH && ceph_readdir_cache_release(&cache_ctl);
fpos_cmp(ctx->pos, di->offset) <= 0) cache_ctl.page = find_lock_page(&dir->i_data, pgoff);
if (!cache_ctl.page) {
dout(" page %lu not found\n", pgoff);
break; break;
dout(" skipping %p %pd at %llu (%llu)%s%s\n", dentry, }
dentry, di->offset, /* reading/filling the cache are serialized by
ctx->pos, d_unhashed(dentry) ? " unhashed" : "", * i_mutex, no need to use page lock */
!d_inode(dentry) ? " null" : ""); unlock_page(cache_ctl.page);
spin_unlock(&dentry->d_lock); cache_ctl.dentries = kmap(cache_ctl.page);
p = p->prev;
dentry = list_entry(p, struct dentry, d_child);
di = ceph_dentry(dentry);
} }
dget_dlock(dentry); rcu_read_lock();
spin_unlock(&dentry->d_lock); spin_lock(&parent->d_lock);
/* check i_size again here, because empty directory can be
* marked as complete while not holding the i_mutex. */
if (ceph_dir_is_complete_ordered(dir) &&
ptr_pos < i_size_read(dir))
dentry = cache_ctl.dentries[cache_ctl.index % nsize];
else
dentry = NULL;
spin_unlock(&parent->d_lock); spin_unlock(&parent->d_lock);
if (dentry && !lockref_get_not_dead(&dentry->d_lockref))
dentry = NULL;
rcu_read_unlock();
if (!dentry)
break;
/* make sure a dentry wasn't dropped while we didn't have parent lock */ emit_dentry = false;
if (!ceph_dir_is_complete_ordered(dir)) { di = ceph_dentry(dentry);
dout(" lost dir complete on %p; falling back to mds\n", dir); spin_lock(&dentry->d_lock);
dput(dentry); if (di->lease_shared_gen == shared_gen &&
err = -EAGAIN; d_really_is_positive(dentry) &&
goto out; ceph_snap(d_inode(dentry)) != CEPH_SNAPDIR &&
ceph_ino(d_inode(dentry)) != CEPH_INO_CEPH &&
fpos_cmp(ctx->pos, di->offset) <= 0) {
emit_dentry = true;
} }
spin_unlock(&dentry->d_lock);
if (emit_dentry) {
dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos, dout(" %llu (%llu) dentry %p %pd %p\n", di->offset, ctx->pos,
dentry, dentry, d_inode(dentry)); dentry, dentry, d_inode(dentry));
ctx->pos = di->offset;
if (!dir_emit(ctx, dentry->d_name.name, if (!dir_emit(ctx, dentry->d_name.name,
dentry->d_name.len, dentry->d_name.len,
ceph_translate_ino(dentry->d_sb, d_inode(dentry)->i_ino), ceph_translate_ino(dentry->d_sb,
d_inode(dentry)->i_ino),
d_inode(dentry)->i_mode >> 12)) { d_inode(dentry)->i_mode >> 12)) {
if (last) {
/* remember our position */
fi->dentry = last;
fi->next_offset = fpos_off(di->offset);
}
dput(dentry); dput(dentry);
return 0; err = 0;
break;
} }
ctx->pos++;
ctx->pos = di->offset + 1;
if (last) if (last)
dput(last); dput(last);
last = dentry; last = dentry;
} else {
dput(dentry);
}
spin_lock(&parent->d_lock); cache_ctl.index++;
p = p->prev; /* advance to next dentry */ ptr_pos += sizeof(struct dentry *);
goto more; }
ceph_readdir_cache_release(&cache_ctl);
out_unlock: if (last) {
spin_unlock(&parent->d_lock); int ret;
out: di = ceph_dentry(last);
if (last) ret = note_last_dentry(fi, last->d_name.name, last->d_name.len,
fpos_off(di->offset) + 1);
if (ret < 0)
err = ret;
dput(last); dput(last);
}
return err; return err;
} }
/*
* make note of the last dentry we read, so we can
* continue at the same lexicographical point,
* regardless of what dir changes take place on the
* server.
*/
static int note_last_dentry(struct ceph_file_info *fi, const char *name,
int len)
{
kfree(fi->last_name);
fi->last_name = kmalloc(len+1, GFP_KERNEL);
if (!fi->last_name)
return -ENOMEM;
memcpy(fi->last_name, name, len);
fi->last_name[len] = 0;
dout("note_last_dentry '%s'\n", fi->last_name);
return 0;
}
static int ceph_readdir(struct file *file, struct dir_context *ctx) static int ceph_readdir(struct file *file, struct dir_context *ctx)
{ {
struct ceph_file_info *fi = file->private_data; struct ceph_file_info *fi = file->private_data;
...@@ -280,8 +291,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -280,8 +291,7 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
/* can we use the dcache? */ /* can we use the dcache? */
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
if ((ctx->pos == 2 || fi->dentry) && if (ceph_test_mount_opt(fsc, DCACHE) &&
ceph_test_mount_opt(fsc, DCACHE) &&
!ceph_test_mount_opt(fsc, NOASYNCREADDIR) && !ceph_test_mount_opt(fsc, NOASYNCREADDIR) &&
ceph_snap(inode) != CEPH_SNAPDIR && ceph_snap(inode) != CEPH_SNAPDIR &&
__ceph_dir_is_complete_ordered(ci) && __ceph_dir_is_complete_ordered(ci) &&
...@@ -296,24 +306,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -296,24 +306,8 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
} else { } else {
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
} }
if (fi->dentry) {
err = note_last_dentry(fi, fi->dentry->d_name.name,
fi->dentry->d_name.len);
if (err)
return err;
dput(fi->dentry);
fi->dentry = NULL;
}
/* proceed with a normal readdir */ /* proceed with a normal readdir */
if (ctx->pos == 2) {
/* note dir version at start of readdir so we can tell
* if any dentries get dropped */
fi->dir_release_count = atomic_read(&ci->i_release_count);
fi->dir_ordered_count = ci->i_ordered_count;
}
more: more:
/* do we have the correct frag content buffered? */ /* do we have the correct frag content buffered? */
if (fi->frag != frag || fi->last_readdir == NULL) { if (fi->frag != frag || fi->last_readdir == NULL) {
...@@ -348,6 +342,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -348,6 +342,9 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
return -ENOMEM; return -ENOMEM;
} }
} }
req->r_dir_release_cnt = fi->dir_release_count;
req->r_dir_ordered_cnt = fi->dir_ordered_count;
req->r_readdir_cache_idx = fi->readdir_cache_idx;
req->r_readdir_offset = fi->next_offset; req->r_readdir_offset = fi->next_offset;
req->r_args.readdir.frag = cpu_to_le32(frag); req->r_args.readdir.frag = cpu_to_le32(frag);
...@@ -364,26 +361,38 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -364,26 +361,38 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
(int)req->r_reply_info.dir_end, (int)req->r_reply_info.dir_end,
(int)req->r_reply_info.dir_complete); (int)req->r_reply_info.dir_complete);
if (!req->r_did_prepopulate) {
dout("readdir !did_prepopulate");
/* preclude from marking dir complete */
fi->dir_release_count--;
}
/* note next offset and last dentry name */ /* note next offset and last dentry name */
rinfo = &req->r_reply_info; rinfo = &req->r_reply_info;
if (le32_to_cpu(rinfo->dir_dir->frag) != frag) { if (le32_to_cpu(rinfo->dir_dir->frag) != frag) {
frag = le32_to_cpu(rinfo->dir_dir->frag); frag = le32_to_cpu(rinfo->dir_dir->frag);
if (ceph_frag_is_leftmost(frag)) off = req->r_readdir_offset;
fi->next_offset = 2; fi->next_offset = off;
else
fi->next_offset = 0;
off = fi->next_offset;
} }
fi->frag = frag; fi->frag = frag;
fi->offset = fi->next_offset; fi->offset = fi->next_offset;
fi->last_readdir = req; fi->last_readdir = req;
if (req->r_did_prepopulate) {
fi->readdir_cache_idx = req->r_readdir_cache_idx;
if (fi->readdir_cache_idx < 0) {
/* preclude from marking dir ordered */
fi->dir_ordered_count = 0;
} else if (ceph_frag_is_leftmost(frag) && off == 2) {
/* note dir version at start of readdir so
* we can tell if any dentries get dropped */
fi->dir_release_count = req->r_dir_release_cnt;
fi->dir_ordered_count = req->r_dir_ordered_cnt;
}
} else {
dout("readdir !did_prepopulate");
/* disable readdir cache */
fi->readdir_cache_idx = -1;
/* preclude from marking dir complete */
fi->dir_release_count = 0;
}
if (req->r_reply_info.dir_end) { if (req->r_reply_info.dir_end) {
kfree(fi->last_name); kfree(fi->last_name);
fi->last_name = NULL; fi->last_name = NULL;
...@@ -394,10 +403,10 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -394,10 +403,10 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
} else { } else {
err = note_last_dentry(fi, err = note_last_dentry(fi,
rinfo->dir_dname[rinfo->dir_nr-1], rinfo->dir_dname[rinfo->dir_nr-1],
rinfo->dir_dname_len[rinfo->dir_nr-1]); rinfo->dir_dname_len[rinfo->dir_nr-1],
fi->next_offset + rinfo->dir_nr);
if (err) if (err)
return err; return err;
fi->next_offset += rinfo->dir_nr;
} }
} }
...@@ -453,16 +462,22 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx) ...@@ -453,16 +462,22 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
* were released during the whole readdir, and we should have * were released during the whole readdir, and we should have
* the complete dir contents in our cache. * the complete dir contents in our cache.
*/ */
if (atomic64_read(&ci->i_release_count) == fi->dir_release_count) {
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
if (atomic_read(&ci->i_release_count) == fi->dir_release_count) { if (fi->dir_ordered_count == atomic64_read(&ci->i_ordered_count)) {
if (ci->i_ordered_count == fi->dir_ordered_count)
dout(" marking %p complete and ordered\n", inode); dout(" marking %p complete and ordered\n", inode);
else /* use i_size to track number of entries in
* readdir cache */
BUG_ON(fi->readdir_cache_idx < 0);
i_size_write(inode, fi->readdir_cache_idx *
sizeof(struct dentry*));
} else {
dout(" marking %p complete\n", inode); dout(" marking %p complete\n", inode);
}
__ceph_dir_set_complete(ci, fi->dir_release_count, __ceph_dir_set_complete(ci, fi->dir_release_count,
fi->dir_ordered_count); fi->dir_ordered_count);
}
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
}
dout("readdir %p file %p done.\n", inode, file); dout("readdir %p file %p done.\n", inode, file);
return 0; return 0;
...@@ -476,14 +491,12 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag) ...@@ -476,14 +491,12 @@ static void reset_readdir(struct ceph_file_info *fi, unsigned frag)
} }
kfree(fi->last_name); kfree(fi->last_name);
fi->last_name = NULL; fi->last_name = NULL;
fi->dir_release_count = 0;
fi->readdir_cache_idx = -1;
if (ceph_frag_is_leftmost(frag)) if (ceph_frag_is_leftmost(frag))
fi->next_offset = 2; /* compensate for . and .. */ fi->next_offset = 2; /* compensate for . and .. */
else else
fi->next_offset = 0; fi->next_offset = 0;
if (fi->dentry) {
dput(fi->dentry);
fi->dentry = NULL;
}
fi->flags &= ~CEPH_F_ATEND; fi->flags &= ~CEPH_F_ATEND;
} }
...@@ -497,13 +510,12 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) ...@@ -497,13 +510,12 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
mutex_lock(&inode->i_mutex); mutex_lock(&inode->i_mutex);
retval = -EINVAL; retval = -EINVAL;
switch (whence) { switch (whence) {
case SEEK_END:
offset += inode->i_size + 2; /* FIXME */
break;
case SEEK_CUR: case SEEK_CUR:
offset += file->f_pos; offset += file->f_pos;
case SEEK_SET: case SEEK_SET:
break; break;
case SEEK_END:
retval = -EOPNOTSUPP;
default: default:
goto out; goto out;
} }
...@@ -516,20 +528,18 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence) ...@@ -516,20 +528,18 @@ static loff_t ceph_dir_llseek(struct file *file, loff_t offset, int whence)
} }
retval = offset; retval = offset;
/*
* discard buffered readdir content on seekdir(0), or
* seek to new frag, or seek prior to current chunk.
*/
if (offset == 0 || if (offset == 0 ||
fpos_frag(offset) != fi->frag || fpos_frag(offset) != fi->frag ||
fpos_off(offset) < fi->offset) { fpos_off(offset) < fi->offset) {
/* discard buffered readdir content on seekdir(0), or
* seek to new frag, or seek prior to current chunk */
dout("dir_llseek dropping %p content\n", file); dout("dir_llseek dropping %p content\n", file);
reset_readdir(fi, fpos_frag(offset)); reset_readdir(fi, fpos_frag(offset));
} else if (fpos_cmp(offset, old_offset) > 0) {
/* reset dir_release_count if we did a forward seek */
fi->dir_release_count = 0;
fi->readdir_cache_idx = -1;
} }
/* bump dir_release_count if we did a forward seek */
if (fpos_cmp(offset, old_offset) > 0)
fi->dir_release_count--;
} }
out: out:
mutex_unlock(&inode->i_mutex); mutex_unlock(&inode->i_mutex);
...@@ -985,16 +995,15 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry, ...@@ -985,16 +995,15 @@ static int ceph_rename(struct inode *old_dir, struct dentry *old_dentry,
* to do it here. * to do it here.
*/ */
/* d_move screws up sibling dentries' offsets */
ceph_dir_clear_complete(old_dir);
ceph_dir_clear_complete(new_dir);
d_move(old_dentry, new_dentry); d_move(old_dentry, new_dentry);
/* ensure target dentry is invalidated, despite /* ensure target dentry is invalidated, despite
rehashing bug in vfs_rename_dir */ rehashing bug in vfs_rename_dir */
ceph_invalidate_dentry_lease(new_dentry); ceph_invalidate_dentry_lease(new_dentry);
/* d_move screws up sibling dentries' offsets */
ceph_dir_clear_complete(old_dir);
ceph_dir_clear_complete(new_dir);
} }
ceph_mdsc_put_request(req); ceph_mdsc_put_request(req);
return err; return err;
......
...@@ -96,6 +96,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) ...@@ -96,6 +96,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
} }
cf->fmode = fmode; cf->fmode = fmode;
cf->next_offset = 2; cf->next_offset = 2;
cf->readdir_cache_idx = -1;
file->private_data = cf; file->private_data = cf;
BUG_ON(inode->i_fop->release != ceph_release); BUG_ON(inode->i_fop->release != ceph_release);
break; break;
...@@ -324,7 +325,6 @@ int ceph_release(struct inode *inode, struct file *file) ...@@ -324,7 +325,6 @@ int ceph_release(struct inode *inode, struct file *file)
ceph_mdsc_put_request(cf->last_readdir); ceph_mdsc_put_request(cf->last_readdir);
kfree(cf->last_name); kfree(cf->last_name);
kfree(cf->dir_info); kfree(cf->dir_info);
dput(cf->dentry);
kmem_cache_free(ceph_file_cachep, cf); kmem_cache_free(ceph_file_cachep, cf);
/* wake up anyone waiting for caps on this inode */ /* wake up anyone waiting for caps on this inode */
......
...@@ -390,9 +390,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ...@@ -390,9 +390,10 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_inline_version = 0; ci->i_inline_version = 0;
ci->i_time_warp_seq = 0; ci->i_time_warp_seq = 0;
ci->i_ceph_flags = 0; ci->i_ceph_flags = 0;
ci->i_ordered_count = 0; atomic64_set(&ci->i_ordered_count, 1);
atomic_set(&ci->i_release_count, 1); atomic64_set(&ci->i_release_count, 1);
atomic_set(&ci->i_complete_count, 0); atomic64_set(&ci->i_complete_seq[0], 0);
atomic64_set(&ci->i_complete_seq[1], 0);
ci->i_symlink = NULL; ci->i_symlink = NULL;
memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout)); memset(&ci->i_dir_layout, 0, sizeof(ci->i_dir_layout));
...@@ -860,9 +861,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page, ...@@ -860,9 +861,10 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
(issued & CEPH_CAP_FILE_EXCL) == 0 && (issued & CEPH_CAP_FILE_EXCL) == 0 &&
!__ceph_dir_is_complete(ci)) { !__ceph_dir_is_complete(ci)) {
dout(" marking %p complete (empty)\n", inode); dout(" marking %p complete (empty)\n", inode);
i_size_write(inode, 0);
__ceph_dir_set_complete(ci, __ceph_dir_set_complete(ci,
atomic_read(&ci->i_release_count), atomic64_read(&ci->i_release_count),
ci->i_ordered_count); atomic64_read(&ci->i_ordered_count));
} }
wake = true; wake = true;
...@@ -1214,6 +1216,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ...@@ -1214,6 +1216,10 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
dout("fill_trace doing d_move %p -> %p\n", dout("fill_trace doing d_move %p -> %p\n",
req->r_old_dentry, dn); req->r_old_dentry, dn);
/* d_move screws up sibling dentries' offsets */
ceph_dir_clear_ordered(dir);
ceph_dir_clear_ordered(olddir);
d_move(req->r_old_dentry, dn); d_move(req->r_old_dentry, dn);
dout(" src %p '%pd' dst %p '%pd'\n", dout(" src %p '%pd' dst %p '%pd'\n",
req->r_old_dentry, req->r_old_dentry,
...@@ -1224,10 +1230,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ...@@ -1224,10 +1230,6 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
rehashing bug in vfs_rename_dir */ rehashing bug in vfs_rename_dir */
ceph_invalidate_dentry_lease(dn); ceph_invalidate_dentry_lease(dn);
/* d_move screws up sibling dentries' offsets */
ceph_dir_clear_ordered(dir);
ceph_dir_clear_ordered(olddir);
dout("dn %p gets new offset %lld\n", req->r_old_dentry, dout("dn %p gets new offset %lld\n", req->r_old_dentry,
ceph_dentry(req->r_old_dentry)->offset); ceph_dentry(req->r_old_dentry)->offset);
...@@ -1335,6 +1337,49 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req, ...@@ -1335,6 +1337,49 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
return err; return err;
} }
/*
 * Let go of the readdir cache page held by @ctl, if there is one:
 * undo the kmap, drop the page reference and clear ctl->page.
 * ctl->dentries and ctl->index are not modified here.
 */
void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl)
{
	struct page *held = ctl->page;

	/* nothing mapped, nothing to release */
	if (!held)
		return;

	kunmap(held);
	page_cache_release(held);
	ctl->page = NULL;
}
/*
 * Record @dn in slot ctl->index of @dir's readdir cache.
 *
 * The cache is an array of dentry pointers stored in the pages of
 * dir->i_data.  ctl->page / ctl->dentries describe the page currently
 * mapped; it is swapped for the page containing the target slot when
 * needed.
 *
 * Returns 0 if the dentry was stored (ctl->index is then advanced), or
 * if the cache was disabled (ctl->index set to -1) because the
 * directory's release/ordered counters no longer match the values
 * carried in @req.  Returns -ENOMEM, also with ctl->index set to -1,
 * if the cache page could not be obtained.
 *
 * The dentry pointer is stored as-is; no reference is taken here.
 */
static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
struct ceph_readdir_cache_control *ctl,
struct ceph_mds_request *req)
{
struct ceph_inode_info *ci = ceph_inode(dir);
/* number of dentry pointers that fit in one page */
unsigned nsize = PAGE_CACHE_SIZE / sizeof(struct dentry*);
/* slot within the page, and index of the page holding that slot */
unsigned idx = ctl->index % nsize;
pgoff_t pgoff = ctl->index / nsize;
/* switch pages if the one we hold is not the one containing the slot */
if (!ctl->page || pgoff != page_index(ctl->page)) {
ceph_readdir_cache_release(ctl);
ctl->page = grab_cache_page(&dir->i_data, pgoff);
if (!ctl->page) {
/* give up on caching for the rest of this request */
ctl->index = -1;
return -ENOMEM;
}
/* reading/filling the cache are serialized by
* i_mutex, no need to use page lock */
unlock_page(ctl->page);
ctl->dentries = kmap(ctl->page);
}
/* only cache the dentry while the directory's counters still equal
* the ones recorded in the request; otherwise disable the cache */
if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
req->r_dir_ordered_cnt == atomic64_read(&ci->i_ordered_count)) {
dout("readdir cache dn %p idx %d\n", dn, ctl->index);
ctl->dentries[idx] = dn;
ctl->index++;
} else {
dout("disable readdir cache\n");
ctl->index = -1;
}
return 0;
}
int ceph_readdir_prepopulate(struct ceph_mds_request *req, int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct ceph_mds_session *session) struct ceph_mds_session *session)
{ {
...@@ -1347,8 +1392,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1347,8 +1392,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct inode *snapdir = NULL; struct inode *snapdir = NULL;
struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
struct ceph_dentry_info *di; struct ceph_dentry_info *di;
u64 r_readdir_offset = req->r_readdir_offset;
u32 frag = le32_to_cpu(rhead->args.readdir.frag); u32 frag = le32_to_cpu(rhead->args.readdir.frag);
struct ceph_readdir_cache_control cache_ctl = {};
if (req->r_aborted)
return readdir_prepopulate_inodes_only(req, session);
if (rinfo->dir_dir && if (rinfo->dir_dir &&
le32_to_cpu(rinfo->dir_dir->frag) != frag) { le32_to_cpu(rinfo->dir_dir->frag) != frag) {
...@@ -1356,14 +1404,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1356,14 +1404,11 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
frag, le32_to_cpu(rinfo->dir_dir->frag)); frag, le32_to_cpu(rinfo->dir_dir->frag));
frag = le32_to_cpu(rinfo->dir_dir->frag); frag = le32_to_cpu(rinfo->dir_dir->frag);
if (ceph_frag_is_leftmost(frag)) if (ceph_frag_is_leftmost(frag))
r_readdir_offset = 2; req->r_readdir_offset = 2;
else else
r_readdir_offset = 0; req->r_readdir_offset = 0;
} }
if (req->r_aborted)
return readdir_prepopulate_inodes_only(req, session);
if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) { if (le32_to_cpu(rinfo->head->op) == CEPH_MDS_OP_LSSNAP) {
snapdir = ceph_get_snapdir(d_inode(parent)); snapdir = ceph_get_snapdir(d_inode(parent));
parent = d_find_alias(snapdir); parent = d_find_alias(snapdir);
...@@ -1376,6 +1421,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1376,6 +1421,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir); ceph_fill_dirfrag(d_inode(parent), rinfo->dir_dir);
} }
if (ceph_frag_is_leftmost(frag) && req->r_readdir_offset == 2) {
/* note dir version at start of readdir so we can tell
* if any dentries get dropped */
struct ceph_inode_info *ci = ceph_inode(d_inode(parent));
req->r_dir_release_cnt = atomic64_read(&ci->i_release_count);
req->r_dir_ordered_cnt = atomic64_read(&ci->i_ordered_count);
req->r_readdir_cache_idx = 0;
}
cache_ctl.index = req->r_readdir_cache_idx;
/* FIXME: release caps/leases if error occurs */ /* FIXME: release caps/leases if error occurs */
for (i = 0; i < rinfo->dir_nr; i++) { for (i = 0; i < rinfo->dir_nr; i++) {
struct ceph_vino vino; struct ceph_vino vino;
...@@ -1415,13 +1471,6 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1415,13 +1471,6 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
d_delete(dn); d_delete(dn);
dput(dn); dput(dn);
goto retry_lookup; goto retry_lookup;
} else {
/* reorder parent's d_subdirs */
spin_lock(&parent->d_lock);
spin_lock_nested(&dn->d_lock, DENTRY_D_LOCK_NESTED);
list_move(&dn->d_child, &parent->d_subdirs);
spin_unlock(&dn->d_lock);
spin_unlock(&parent->d_lock);
} }
/* inode */ /* inode */
...@@ -1438,13 +1487,15 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1438,13 +1487,15 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
} }
} }
if (fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session, ret = fill_inode(in, NULL, &rinfo->dir_in[i], NULL, session,
req->r_request_started, -1, req->r_request_started, -1,
&req->r_caps_reservation) < 0) { &req->r_caps_reservation);
if (ret < 0) {
pr_err("fill_inode badness on %p\n", in); pr_err("fill_inode badness on %p\n", in);
if (d_really_is_negative(dn)) if (d_really_is_negative(dn))
iput(in); iput(in);
d_drop(dn); d_drop(dn);
err = ret;
goto next_item; goto next_item;
} }
...@@ -1460,19 +1511,28 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1460,19 +1511,28 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
} }
di = dn->d_fsdata; di = dn->d_fsdata;
di->offset = ceph_make_fpos(frag, i + r_readdir_offset); di->offset = ceph_make_fpos(frag, i + req->r_readdir_offset);
update_dentry_lease(dn, rinfo->dir_dlease[i], update_dentry_lease(dn, rinfo->dir_dlease[i],
req->r_session, req->r_session,
req->r_request_started); req->r_request_started);
if (err == 0 && cache_ctl.index >= 0) {
ret = fill_readdir_cache(d_inode(parent), dn,
&cache_ctl, req);
if (ret < 0)
err = ret;
}
next_item: next_item:
if (dn) if (dn)
dput(dn); dput(dn);
} }
if (err == 0)
req->r_did_prepopulate = true;
out: out:
if (err == 0) {
req->r_did_prepopulate = true;
req->r_readdir_cache_idx = cache_ctl.index;
}
ceph_readdir_cache_release(&cache_ctl);
if (snapdir) { if (snapdir) {
iput(snapdir); iput(snapdir);
dput(parent); dput(parent);
......
...@@ -253,6 +253,9 @@ struct ceph_mds_request { ...@@ -253,6 +253,9 @@ struct ceph_mds_request {
bool r_got_unsafe, r_got_safe, r_got_result; bool r_got_unsafe, r_got_safe, r_got_result;
bool r_did_prepopulate; bool r_did_prepopulate;
long long r_dir_release_cnt;
long long r_dir_ordered_cnt;
int r_readdir_cache_idx;
u32 r_readdir_offset; u32 r_readdir_offset;
struct ceph_cap_reservation r_caps_reservation; struct ceph_cap_reservation r_caps_reservation;
......
...@@ -282,9 +282,9 @@ struct ceph_inode_info { ...@@ -282,9 +282,9 @@ struct ceph_inode_info {
u32 i_time_warp_seq; u32 i_time_warp_seq;
unsigned i_ceph_flags; unsigned i_ceph_flags;
int i_ordered_count; atomic64_t i_release_count;
atomic_t i_release_count; atomic64_t i_ordered_count;
atomic_t i_complete_count; atomic64_t i_complete_seq[2];
struct ceph_dir_layout i_dir_layout; struct ceph_dir_layout i_dir_layout;
struct ceph_file_layout i_layout; struct ceph_file_layout i_layout;
...@@ -471,30 +471,36 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, ...@@ -471,30 +471,36 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
int release_count, int ordered_count) long long release_count,
long long ordered_count)
{ {
atomic_set(&ci->i_complete_count, release_count); smp_mb__before_atomic();
if (ci->i_ordered_count == ordered_count) atomic64_set(&ci->i_complete_seq[0], release_count);
ci->i_ceph_flags |= CEPH_I_DIR_ORDERED; atomic64_set(&ci->i_complete_seq[1], ordered_count);
else
ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED;
} }
static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci) static inline void __ceph_dir_clear_complete(struct ceph_inode_info *ci)
{ {
atomic_inc(&ci->i_release_count); atomic64_inc(&ci->i_release_count);
}
/*
 * Bump the directory's ordered counter.  __ceph_dir_is_complete_ordered()
 * compares this counter with i_complete_seq[1], so a value recorded
 * before the bump no longer matches it.
 */
static inline void __ceph_dir_clear_ordered(struct ceph_inode_info *ci)
{
atomic64_inc(&ci->i_ordered_count);
}
static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci) static inline bool __ceph_dir_is_complete(struct ceph_inode_info *ci)
{ {
return atomic_read(&ci->i_complete_count) == return atomic64_read(&ci->i_complete_seq[0]) ==
atomic_read(&ci->i_release_count); atomic64_read(&ci->i_release_count);
} }
static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci) static inline bool __ceph_dir_is_complete_ordered(struct ceph_inode_info *ci)
{ {
return __ceph_dir_is_complete(ci) && return atomic64_read(&ci->i_complete_seq[0]) ==
(ci->i_ceph_flags & CEPH_I_DIR_ORDERED); atomic64_read(&ci->i_release_count) &&
atomic64_read(&ci->i_complete_seq[1]) ==
atomic64_read(&ci->i_ordered_count);
} }
static inline void ceph_dir_clear_complete(struct inode *inode) static inline void ceph_dir_clear_complete(struct inode *inode)
...@@ -504,20 +510,13 @@ static inline void ceph_dir_clear_complete(struct inode *inode) ...@@ -504,20 +510,13 @@ static inline void ceph_dir_clear_complete(struct inode *inode)
static inline void ceph_dir_clear_ordered(struct inode *inode) static inline void ceph_dir_clear_ordered(struct inode *inode)
{ {
struct ceph_inode_info *ci = ceph_inode(inode); __ceph_dir_clear_ordered(ceph_inode(inode));
spin_lock(&ci->i_ceph_lock);
ci->i_ordered_count++;
ci->i_ceph_flags &= ~CEPH_I_DIR_ORDERED;
spin_unlock(&ci->i_ceph_lock);
} }
static inline bool ceph_dir_is_complete_ordered(struct inode *inode) static inline bool ceph_dir_is_complete_ordered(struct inode *inode)
{ {
struct ceph_inode_info *ci = ceph_inode(inode); bool ret = __ceph_dir_is_complete_ordered(ceph_inode(inode));
bool ret; smp_rmb();
spin_lock(&ci->i_ceph_lock);
ret = __ceph_dir_is_complete_ordered(ci);
spin_unlock(&ci->i_ceph_lock);
return ret; return ret;
} }
...@@ -636,16 +635,20 @@ struct ceph_file_info { ...@@ -636,16 +635,20 @@ struct ceph_file_info {
unsigned offset; /* offset of last chunk, adjusted for . and .. */ unsigned offset; /* offset of last chunk, adjusted for . and .. */
unsigned next_offset; /* offset of next chunk (last_name's + 1) */ unsigned next_offset; /* offset of next chunk (last_name's + 1) */
char *last_name; /* last entry in previous chunk */ char *last_name; /* last entry in previous chunk */
struct dentry *dentry; /* next dentry (for dcache readdir) */ long long dir_release_count;
int dir_release_count; long long dir_ordered_count;
int dir_ordered_count; int readdir_cache_idx;
/* used for -o dirstat read() on directory thing */ /* used for -o dirstat read() on directory thing */
char *dir_info; char *dir_info;
int dir_info_len; int dir_info_len;
}; };
/*
 * Cursor into a directory's readdir cache, an array of dentry pointers
 * kept in the pages of the directory inode's page cache.
 */
struct ceph_readdir_cache_control {
/* cache page currently held and kmap()ed, or NULL if none */
struct page *page;
/* kmap() address of @page, used as an array of dentry pointers */
struct dentry **dentries;
/* cache slot to read/fill next; a negative value means caching is disabled */
int index;
};
/* /*
* A "snap realm" describes a subset of the file hierarchy sharing * A "snap realm" describes a subset of the file hierarchy sharing
...@@ -944,6 +947,7 @@ extern void ceph_dentry_lru_del(struct dentry *dn); ...@@ -944,6 +947,7 @@ extern void ceph_dentry_lru_del(struct dentry *dn);
extern void ceph_invalidate_dentry_lease(struct dentry *dentry); extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry); extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry);
extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl);
/* /*
* our d_ops vary depending on whether the inode is live, * our d_ops vary depending on whether the inode is live,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment