Commit 85c7000f authored by Linus Torvalds

Merge tag 'ceph-for-5.18-rc1' of https://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
 "The highlights are:

   - several changes to how snap context and snap realms are tracked
     (Xiubo Li). In particular, this should resolve a long-standing
     issue of high kworker CPU usage and various stalls caused by
     needless iteration over all inodes in the snap realm.

   - async create fixes to address hangs in some edge cases (Jeff
     Layton)

   - support for getvxattr MDS op for querying server-side xattrs, such
     as file/directory layouts and ephemeral pins (Milind Changire)

   - average latency is now maintained for all metrics (Venky Shankar)

   - some tweaks around handling inline data to make it fit better with
     the netfs helper library (David Howells)

  Also a couple of memory leaks got plugged along with a few assorted
  fixups. Last but not least, Xiubo has stepped up to serve as a CephFS
  co-maintainer"

* tag 'ceph-for-5.18-rc1' of https://github.com/ceph/ceph-client: (27 commits)
  ceph: fix memory leak in ceph_readdir when note_last_dentry returns error
  ceph: uninitialized variable in debug output
  ceph: use tracked average r/w/m latencies to display metrics in debugfs
  ceph: include average/stdev r/w/m latency in mds metrics
  ceph: track average r/w/m latency
  ceph: use ktime_to_timespec64() rather than jiffies_to_timespec64()
  ceph: assign the ci only when the inode isn't NULL
  ceph: fix inode reference leakage in ceph_get_snapdir()
  ceph: misc fix for code style and logs
  ceph: allocate capsnap memory outside of ceph_queue_cap_snap()
  ceph: do not release the global snaprealm until unmounting
  ceph: remove incorrect and unused CEPH_INO_DOTDOT macro
  MAINTAINERS: add Xiubo Li as cephfs co-maintainer
  ceph: eliminate the recursion when rebuilding the snap context
  ceph: do not update snapshot context when there is no new snapshot
  ceph: zero the dir_entries memory when allocating it
  ceph: move to a dedicated slabcache for ceph_cap_snap
  ceph: add getvxattr op
  libceph: drop else branches in prepare_read_data{,_cont}
  ceph: fix comments mentioning i_mutex
  ...
parents b1b07ba3 f639d986
......@@ -4456,6 +4456,7 @@ F: drivers/power/supply/cw2015_battery.c
CEPH COMMON CODE (LIBCEPH)
M: Ilya Dryomov <idryomov@gmail.com>
M: Jeff Layton <jlayton@kernel.org>
M: Xiubo Li <xiubli@redhat.com>
L: ceph-devel@vger.kernel.org
S: Supported
W: http://ceph.com/
......@@ -4466,6 +4467,7 @@ F: net/ceph/
CEPH DISTRIBUTED FILE SYSTEM CLIENT (CEPH)
M: Jeff Layton <jlayton@kernel.org>
M: Xiubo Li <xiubli@redhat.com>
M: Ilya Dryomov <idryomov@gmail.com>
L: ceph-devel@vger.kernel.org
S: Supported
......
This diff is collapsed.
......@@ -1915,6 +1915,13 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
ceph_get_mds_session(session);
spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
/* Don't send messages until we get async create reply */
spin_unlock(&ci->i_ceph_lock);
ceph_put_mds_session(session);
return;
}
if (ci->i_ceph_flags & CEPH_I_FLUSH)
flags |= CHECK_CAPS_FLUSH;
retry:
......@@ -2409,6 +2416,9 @@ int ceph_write_inode(struct inode *inode, struct writeback_control *wbc)
dout("write_inode %p wait=%d\n", inode, wait);
ceph_fscache_unpin_writeback(inode, wbc);
if (wait) {
err = ceph_wait_on_async_create(inode);
if (err)
return err;
dirty = try_flush_caps(inode, &flush_tid);
if (dirty)
err = wait_event_interruptible(ci->i_cap_wq,
......@@ -2439,6 +2449,10 @@ static void __kick_flushing_caps(struct ceph_mds_client *mdsc,
u64 first_tid = 0;
u64 last_snap_flush = 0;
/* Don't do anything until create reply comes in */
if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE)
return;
ci->i_ceph_flags &= ~CEPH_I_KICK_FLUSH;
list_for_each_entry_reverse(cf, &ci->i_cap_flush_list, i_list) {
......@@ -4152,7 +4166,6 @@ void ceph_handle_caps(struct ceph_mds_session *session,
/* lookup ino */
inode = ceph_find_inode(mdsc->fsc->sb, vino);
ci = ceph_inode(inode);
dout(" op %s ino %llx.%llx inode %p\n", ceph_cap_op_name(op), vino.ino,
vino.snap, inode);
......@@ -4178,6 +4191,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
}
goto flush_cap_releases;
}
ci = ceph_inode(inode);
/* these will work even if we don't have a cap yet */
switch (op) {
......
......@@ -175,7 +175,7 @@ static int metrics_latency_show(struct seq_file *s, void *p)
struct ceph_fs_client *fsc = s->private;
struct ceph_client_metric *cm = &fsc->mdsc->metric;
struct ceph_metric *m;
s64 total, sum, avg, min, max, sq;
s64 total, avg, min, max, sq;
int i;
seq_printf(s, "item total avg_lat(us) min_lat(us) max_lat(us) stdev(us)\n");
......@@ -185,8 +185,7 @@ static int metrics_latency_show(struct seq_file *s, void *p)
m = &cm->metric[i];
spin_lock(&m->lock);
total = m->total;
sum = m->latency_sum;
avg = total > 0 ? DIV64_U64_ROUND_CLOSEST(sum, total) : 0;
avg = m->latency_avg;
min = m->latency_min;
max = m->latency_max;
sq = m->latency_sq_sum;
......
......@@ -145,7 +145,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
return ERR_PTR(-EAGAIN);
}
/* reading/filling the cache are serialized by
i_mutex, no need to use page lock */
i_rwsem, no need to use page lock */
unlock_page(cache_ctl->page);
cache_ctl->dentries = kmap(cache_ctl->page);
}
......@@ -155,7 +155,7 @@ __dcache_find_get_entry(struct dentry *parent, u64 idx,
rcu_read_lock();
spin_lock(&parent->d_lock);
/* check i_size again here, because empty directory can be
* marked as complete while not holding the i_mutex. */
* marked as complete while not holding the i_rwsem. */
if (ceph_dir_is_complete_ordered(dir) && ptr_pos < i_size_read(dir))
dentry = cache_ctl->dentries[cache_ctl->index];
else
......@@ -478,8 +478,11 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
2 : (fpos_off(rde->offset) + 1);
err = note_last_dentry(dfi, rde->name, rde->name_len,
next_offset);
if (err)
if (err) {
ceph_mdsc_put_request(dfi->last_readdir);
dfi->last_readdir = NULL;
return err;
}
} else if (req->r_reply_info.dir_end) {
dfi->next_offset = 2;
/* keep last name */
......@@ -520,6 +523,12 @@ static int ceph_readdir(struct file *file, struct dir_context *ctx)
if (!dir_emit(ctx, rde->name, rde->name_len,
ceph_present_ino(inode->i_sb, le64_to_cpu(rde->inode.in->ino)),
le32_to_cpu(rde->inode.in->mode) >> 12)) {
/*
* NOTE: Here no need to put the 'dfi->last_readdir',
* because when dir_emit stops us it's most likely
* doesn't have enough memory, etc. So for next readdir
* it will continue.
*/
dout("filldir stopping us...\n");
return 0;
}
......@@ -671,7 +680,7 @@ struct dentry *ceph_handle_snapdir(struct ceph_mds_request *req,
struct dentry *dentry)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(dentry->d_sb);
struct inode *parent = d_inode(dentry->d_parent); /* we hold i_mutex */
struct inode *parent = d_inode(dentry->d_parent); /* we hold i_rwsem */
/* .snap dir? */
if (ceph_snap(parent) == CEPH_NOSNAP &&
......
......@@ -207,6 +207,7 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
struct ceph_mount_options *opt =
ceph_inode_to_client(&ci->vfs_inode)->mount_options;
struct ceph_file_info *fi;
int ret;
dout("%s %p %p 0%o (%s)\n", __func__, inode, file,
inode->i_mode, isdir ? "dir" : "regular");
......@@ -240,7 +241,22 @@ static int ceph_init_file_info(struct inode *inode, struct file *file,
INIT_LIST_HEAD(&fi->rw_contexts);
fi->filp_gen = READ_ONCE(ceph_inode_to_client(inode)->filp_gen);
if ((file->f_mode & FMODE_WRITE) &&
ci->i_inline_version != CEPH_INLINE_NONE) {
ret = ceph_uninline_data(file);
if (ret < 0)
goto error;
}
return 0;
error:
ceph_fscache_unuse_cookie(inode, file->f_mode & FMODE_WRITE);
ceph_put_fmode(ci, fi->fmode, 1);
kmem_cache_free(ceph_file_cachep, fi);
/* wake up anyone waiting for caps on this inode */
wake_up_all(&ci->i_cap_wq);
return ret;
}
/*
......@@ -516,52 +532,67 @@ static void restore_deleg_ino(struct inode *dir, u64 ino)
}
}
static void wake_async_create_waiters(struct inode *inode,
struct ceph_mds_session *session)
{
struct ceph_inode_info *ci = ceph_inode(inode);
spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);
}
ceph_kick_flushing_inode_caps(session, ci);
spin_unlock(&ci->i_ceph_lock);
}
static void ceph_async_create_cb(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
{
struct dentry *dentry = req->r_dentry;
struct inode *dinode = d_inode(dentry);
struct inode *tinode = req->r_target_inode;
int result = req->r_err ? req->r_err :
le32_to_cpu(req->r_reply_info.head->result);
WARN_ON_ONCE(dinode && tinode && dinode != tinode);
/* MDS changed -- caller must resubmit */
if (result == -EJUKEBOX)
goto out;
mapping_set_error(req->r_parent->i_mapping, result);
if (result) {
struct dentry *dentry = req->r_dentry;
struct inode *inode = d_inode(dentry);
int pathlen = 0;
u64 base = 0;
char *path = ceph_mdsc_build_path(req->r_dentry, &pathlen,
&base, 0);
pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n",
base, IS_ERR(path) ? "<<bad>>" : path, result);
ceph_mdsc_free_path(path, pathlen);
ceph_dir_clear_complete(req->r_parent);
if (!d_unhashed(dentry))
d_drop(dentry);
ceph_inode_shutdown(inode);
pr_warn("ceph: async create failure path=(%llx)%s result=%d!\n",
base, IS_ERR(path) ? "<<bad>>" : path, result);
ceph_mdsc_free_path(path, pathlen);
if (dinode) {
mapping_set_error(dinode->i_mapping, result);
ceph_inode_shutdown(dinode);
wake_async_create_waiters(dinode, req->r_session);
}
}
if (req->r_target_inode) {
struct ceph_inode_info *ci = ceph_inode(req->r_target_inode);
u64 ino = ceph_vino(req->r_target_inode).ino;
if (tinode) {
u64 ino = ceph_vino(tinode).ino;
if (req->r_deleg_ino != ino)
pr_warn("%s: inode number mismatch! err=%d deleg_ino=0x%llx target=0x%llx\n",
__func__, req->r_err, req->r_deleg_ino, ino);
mapping_set_error(req->r_target_inode->i_mapping, result);
spin_lock(&ci->i_ceph_lock);
if (ci->i_ceph_flags & CEPH_I_ASYNC_CREATE) {
ci->i_ceph_flags &= ~CEPH_I_ASYNC_CREATE;
wake_up_bit(&ci->i_ceph_flags, CEPH_ASYNC_CREATE_BIT);
}
ceph_kick_flushing_inode_caps(req->r_session, ci);
spin_unlock(&ci->i_ceph_lock);
mapping_set_error(tinode->i_mapping, result);
wake_async_create_waiters(tinode, req->r_session);
} else if (!result) {
pr_warn("%s: no req->r_target_inode for 0x%llx\n", __func__,
req->r_deleg_ino);
......@@ -1041,7 +1072,6 @@ static void ceph_aio_complete(struct inode *inode,
}
spin_lock(&ci->i_ceph_lock);
ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&aio_req->prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
......@@ -1778,12 +1808,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
if (err)
goto out;
if (ci->i_inline_version != CEPH_INLINE_NONE) {
err = ceph_uninline_data(file, NULL);
if (err < 0)
goto out;
}
dout("aio_write %p %llx.%llx %llu~%zd getting caps. i_size %llu\n",
inode, ceph_vinop(inode), pos, count, i_size_read(inode));
if (!(fi->flags & CEPH_F_SYNC) && !direct_lock)
......@@ -1855,7 +1879,6 @@ static ssize_t ceph_write_iter(struct kiocb *iocb, struct iov_iter *from)
int dirty;
spin_lock(&ci->i_ceph_lock);
ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
......@@ -2109,12 +2132,6 @@ static long ceph_fallocate(struct file *file, int mode,
goto unlock;
}
if (ci->i_inline_version != CEPH_INLINE_NONE) {
ret = ceph_uninline_data(file, NULL);
if (ret < 0)
goto unlock;
}
size = i_size_read(inode);
/* Are we punching a hole beyond EOF? */
......@@ -2139,7 +2156,6 @@ static long ceph_fallocate(struct file *file, int mode,
if (!ret) {
spin_lock(&ci->i_ceph_lock);
ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_FILE_WR,
&prealloc_cf);
spin_unlock(&ci->i_ceph_lock);
......@@ -2532,7 +2548,6 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
}
/* Mark Fw dirty */
spin_lock(&dst_ci->i_ceph_lock);
dst_ci->i_inline_version = CEPH_INLINE_NONE;
dirty = __ceph_mark_dirty_caps(dst_ci, CEPH_CAP_FILE_WR, &prealloc_cf);
spin_unlock(&dst_ci->i_ceph_lock);
if (dirty)
......
......@@ -87,13 +87,13 @@ struct inode *ceph_get_snapdir(struct inode *parent)
if (!S_ISDIR(parent->i_mode)) {
pr_warn_once("bad snapdir parent type (mode=0%o)\n",
parent->i_mode);
return ERR_PTR(-ENOTDIR);
goto err;
}
if (!(inode->i_state & I_NEW) && !S_ISDIR(inode->i_mode)) {
pr_warn_once("bad snapdir inode type (mode=0%o)\n",
inode->i_mode);
return ERR_PTR(-ENOTDIR);
goto err;
}
inode->i_mode = parent->i_mode;
......@@ -113,6 +113,12 @@ struct inode *ceph_get_snapdir(struct inode *parent)
}
return inode;
err:
if ((inode->i_state & I_NEW))
discard_new_inode(inode);
else
iput(inode);
return ERR_PTR(-ENOTDIR);
}
const struct inode_operations ceph_file_iops = {
......@@ -1201,7 +1207,7 @@ static void update_dentry_lease_careful(struct dentry *dentry,
/*
* splice a dentry to an inode.
* caller must hold directory i_mutex for this to be safe.
* caller must hold directory i_rwsem for this to be safe.
*/
static int splice_dentry(struct dentry **pdn, struct inode *in)
{
......@@ -1598,7 +1604,7 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
return idx == 0 ? -ENOMEM : 0;
}
/* reading/filling the cache are serialized by
* i_mutex, no need to use page lock */
* i_rwsem, no need to use page lock */
unlock_page(ctl->page);
ctl->dentries = kmap(ctl->page);
if (idx == 0)
......@@ -2301,6 +2307,57 @@ int __ceph_do_getattr(struct inode *inode, struct page *locked_page,
return err;
}
int ceph_do_getvxattr(struct inode *inode, const char *name, void *value,
size_t size)
{
struct ceph_fs_client *fsc = ceph_sb_to_client(inode->i_sb);
struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req;
int mode = USE_AUTH_MDS;
int err;
char *xattr_value;
size_t xattr_value_len;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_GETVXATTR, mode);
if (IS_ERR(req)) {
err = -ENOMEM;
goto out;
}
req->r_path2 = kstrdup(name, GFP_NOFS);
if (!req->r_path2) {
err = -ENOMEM;
goto put;
}
ihold(inode);
req->r_inode = inode;
err = ceph_mdsc_do_request(mdsc, NULL, req);
if (err < 0)
goto put;
xattr_value = req->r_reply_info.xattr_info.xattr_value;
xattr_value_len = req->r_reply_info.xattr_info.xattr_value_len;
dout("do_getvxattr xattr_value_len:%zu, size:%zu\n", xattr_value_len, size);
err = (int)xattr_value_len;
if (size == 0)
goto put;
if (xattr_value_len > size) {
err = -ERANGE;
goto put;
}
memcpy(value, xattr_value, xattr_value_len);
put:
ceph_mdsc_put_request(req);
out:
dout("do_getvxattr result=%d\n", err);
return err;
}
/*
* Check inode permissions. We verify we have a valid value for
......
......@@ -111,10 +111,10 @@ static int ceph_lock_message(u8 lock_type, u16 operation, struct inode *inode,
req->r_args.filelock_change.length = cpu_to_le64(length);
req->r_args.filelock_change.wait = wait;
if (wait)
req->r_wait_for_completion = ceph_lock_wait_for_completion;
err = ceph_mdsc_do_request(mdsc, inode, req);
err = ceph_mdsc_submit_request(mdsc, inode, req);
if (!err)
err = ceph_mdsc_wait_request(mdsc, req, wait ?
ceph_lock_wait_for_completion : NULL);
if (!err && operation == CEPH_MDS_OP_GETFILELOCK) {
fl->fl_pid = -le64_to_cpu(req->r_reply_info.filelock_reply->pid);
if (CEPH_LOCK_SHARED == req->r_reply_info.filelock_reply->type)
......
......@@ -555,6 +555,28 @@ static int parse_reply_info_create(void **p, void *end,
return -EIO;
}
static int parse_reply_info_getvxattr(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
u64 features)
{
u32 value_len;
ceph_decode_skip_8(p, end, bad); /* skip current version: 1 */
ceph_decode_skip_8(p, end, bad); /* skip first version: 1 */
ceph_decode_skip_32(p, end, bad); /* skip payload length */
ceph_decode_32_safe(p, end, value_len, bad);
if (value_len == end - *p) {
info->xattr_info.xattr_value = *p;
info->xattr_info.xattr_value_len = value_len;
*p = end;
return value_len;
}
bad:
return -EIO;
}
/*
* parse extra results
*/
......@@ -570,6 +592,8 @@ static int parse_reply_info_extra(void **p, void *end,
return parse_reply_info_readdir(p, end, info, features);
else if (op == CEPH_MDS_OP_CREATE)
return parse_reply_info_create(p, end, info, features, s);
else if (op == CEPH_MDS_OP_GETVXATTR)
return parse_reply_info_getvxattr(p, end, info, features);
else
return -EIO;
}
......@@ -2178,7 +2202,8 @@ int ceph_alloc_readdir_reply_buffer(struct ceph_mds_request *req,
order = get_order(size * num_entries);
while (order >= 0) {
rinfo->dir_entries = (void*)__get_free_pages(GFP_KERNEL |
__GFP_NOWARN,
__GFP_NOWARN |
__GFP_ZERO,
order);
if (rinfo->dir_entries)
break;
......@@ -2946,15 +2971,16 @@ int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc, struct inode *dir,
return err;
}
static int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req)
int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req,
ceph_mds_request_wait_callback_t wait_func)
{
int err;
/* wait */
dout("do_request waiting\n");
if (!req->r_timeout && req->r_wait_for_completion) {
err = req->r_wait_for_completion(mdsc, req);
if (wait_func) {
err = wait_func(mdsc, req);
} else {
long timeleft = wait_for_completion_killable_timeout(
&req->r_completion,
......@@ -3011,7 +3037,7 @@ int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
/* issue */
err = ceph_mdsc_submit_request(mdsc, dir, req);
if (!err)
err = ceph_mdsc_wait_request(mdsc, req);
err = ceph_mdsc_wait_request(mdsc, req, NULL);
dout("do_request %p done, result %d\n", req, err);
return err;
}
......@@ -3097,35 +3123,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
result = le32_to_cpu(head->result);
/*
* Handle an ESTALE
* if we're not talking to the authority, send to them
* if the authority has changed while we weren't looking,
* send to new authority
* Otherwise we just have to return an ESTALE
*/
if (result == -ESTALE) {
dout("got ESTALE on request %llu\n", req->r_tid);
req->r_resend_mds = -1;
if (req->r_direct_mode != USE_AUTH_MDS) {
dout("not using auth, setting for that now\n");
req->r_direct_mode = USE_AUTH_MDS;
__do_request(mdsc, req);
mutex_unlock(&mdsc->mutex);
goto out;
} else {
int mds = __choose_mds(mdsc, req, NULL);
if (mds >= 0 && mds != req->r_session->s_mds) {
dout("but auth changed, so resending\n");
__do_request(mdsc, req);
mutex_unlock(&mdsc->mutex);
goto out;
}
}
dout("have to return ESTALE on request %llu\n", req->r_tid);
}
if (head->safe) {
set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags);
__unregister_request(mdsc, req);
......@@ -4841,7 +4838,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
mutex_unlock(&mdsc->mutex);
ceph_cleanup_snapid_map(mdsc);
ceph_cleanup_empty_realms(mdsc);
ceph_cleanup_global_and_empty_realms(mdsc);
cancel_work_sync(&mdsc->cap_reclaim_work);
cancel_delayed_work_sync(&mdsc->delayed_work); /* cancel timer */
......
......@@ -100,6 +100,11 @@ struct ceph_mds_reply_dir_entry {
loff_t offset;
};
struct ceph_mds_reply_xattr {
char *xattr_value;
size_t xattr_value_len;
};
/*
* parsed info about an mds reply, including information about
* either: 1) the target inode and/or its parent directory and dentry,
......@@ -115,6 +120,7 @@ struct ceph_mds_reply_info_parsed {
char *dname;
u32 dname_len;
struct ceph_mds_reply_lease *dlease;
struct ceph_mds_reply_xattr xattr_info;
/* extra */
union {
......@@ -274,8 +280,8 @@ struct ceph_mds_request {
union ceph_mds_request_args r_args;
int r_fmode; /* file mode, if expecting cap */
const struct cred *r_cred;
int r_request_release_offset;
const struct cred *r_cred;
struct timespec64 r_stamp;
/* for choosing which mds to send this request to */
......@@ -296,12 +302,11 @@ struct ceph_mds_request {
struct ceph_msg *r_reply;
struct ceph_mds_reply_info_parsed r_reply_info;
int r_err;
u32 r_readdir_offset;
struct page *r_locked_page;
int r_dir_caps;
int r_num_caps;
u32 r_readdir_offset;
unsigned long r_timeout; /* optional. jiffies, 0 is "wait forever" */
unsigned long r_started; /* start time to measure timeout against */
......@@ -329,7 +334,6 @@ struct ceph_mds_request {
struct completion r_completion;
struct completion r_safe_completion;
ceph_mds_request_callback_t r_callback;
ceph_mds_request_wait_callback_t r_wait_for_completion;
struct list_head r_unsafe_item; /* per-session unsafe list item */
long long r_dir_release_cnt;
......@@ -507,6 +511,9 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode);
extern int ceph_mdsc_submit_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req);
int ceph_mdsc_wait_request(struct ceph_mds_client *mdsc,
struct ceph_mds_request *req,
ceph_mds_request_wait_callback_t wait_func);
extern int ceph_mdsc_do_request(struct ceph_mds_client *mdsc,
struct inode *dir,
struct ceph_mds_request *req);
......
......@@ -8,6 +8,12 @@
#include "metric.h"
#include "mds_client.h"
static void ktime_to_ceph_timespec(struct ceph_timespec *ts, ktime_t val)
{
struct timespec64 t = ktime_to_timespec64(val);
ceph_encode_timespec64(ts, &t);
}
static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
struct ceph_mds_session *s)
{
......@@ -26,7 +32,6 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
u64 nr_caps = atomic64_read(&m->total_caps);
u32 header_len = sizeof(struct ceph_metric_header);
struct ceph_msg *msg;
struct timespec64 ts;
s64 sum;
s32 items = 0;
s32 len;
......@@ -59,37 +64,40 @@ static bool ceph_mdsc_send_metrics(struct ceph_mds_client *mdsc,
/* encode the read latency metric */
read = (struct ceph_metric_read_latency *)(cap + 1);
read->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_READ_LATENCY);
read->header.ver = 1;
read->header.ver = 2;
read->header.compat = 1;
read->header.data_len = cpu_to_le32(sizeof(*read) - header_len);
sum = m->metric[METRIC_READ].latency_sum;
jiffies_to_timespec64(sum, &ts);
read->sec = cpu_to_le32(ts.tv_sec);
read->nsec = cpu_to_le32(ts.tv_nsec);
ktime_to_ceph_timespec(&read->lat, sum);
ktime_to_ceph_timespec(&read->avg, m->metric[METRIC_READ].latency_avg);
read->sq_sum = cpu_to_le64(m->metric[METRIC_READ].latency_sq_sum);
read->count = cpu_to_le64(m->metric[METRIC_READ].total);
items++;
/* encode the write latency metric */
write = (struct ceph_metric_write_latency *)(read + 1);
write->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_WRITE_LATENCY);
write->header.ver = 1;
write->header.ver = 2;
write->header.compat = 1;
write->header.data_len = cpu_to_le32(sizeof(*write) - header_len);
sum = m->metric[METRIC_WRITE].latency_sum;
jiffies_to_timespec64(sum, &ts);
write->sec = cpu_to_le32(ts.tv_sec);
write->nsec = cpu_to_le32(ts.tv_nsec);
ktime_to_ceph_timespec(&write->lat, sum);
ktime_to_ceph_timespec(&write->avg, m->metric[METRIC_WRITE].latency_avg);
write->sq_sum = cpu_to_le64(m->metric[METRIC_WRITE].latency_sq_sum);
write->count = cpu_to_le64(m->metric[METRIC_WRITE].total);
items++;
/* encode the metadata latency metric */
meta = (struct ceph_metric_metadata_latency *)(write + 1);
meta->header.type = cpu_to_le32(CLIENT_METRIC_TYPE_METADATA_LATENCY);
meta->header.ver = 1;
meta->header.ver = 2;
meta->header.compat = 1;
meta->header.data_len = cpu_to_le32(sizeof(*meta) - header_len);
sum = m->metric[METRIC_METADATA].latency_sum;
jiffies_to_timespec64(sum, &ts);
meta->sec = cpu_to_le32(ts.tv_sec);
meta->nsec = cpu_to_le32(ts.tv_nsec);
ktime_to_ceph_timespec(&meta->lat, sum);
ktime_to_ceph_timespec(&meta->avg, m->metric[METRIC_METADATA].latency_avg);
meta->sq_sum = cpu_to_le64(m->metric[METRIC_METADATA].latency_sq_sum);
meta->count = cpu_to_le64(m->metric[METRIC_METADATA].total);
items++;
/* encode the dentry lease metric */
......@@ -250,6 +258,7 @@ int ceph_metric_init(struct ceph_client_metric *m)
metric->size_max = 0;
metric->total = 0;
metric->latency_sum = 0;
metric->latency_avg = 0;
metric->latency_sq_sum = 0;
metric->latency_min = KTIME_MAX;
metric->latency_max = 0;
......@@ -307,20 +316,19 @@ void ceph_metric_destroy(struct ceph_client_metric *m)
max = new; \
}
static inline void __update_stdev(ktime_t total, ktime_t lsum,
ktime_t *sq_sump, ktime_t lat)
static inline void __update_mean_and_stdev(ktime_t total, ktime_t *lavg,
ktime_t *sq_sump, ktime_t lat)
{
ktime_t avg, sq;
if (unlikely(total == 1))
return;
/* the sq is (lat - old_avg) * (lat - new_avg) */
avg = DIV64_U64_ROUND_CLOSEST((lsum - lat), (total - 1));
sq = lat - avg;
avg = DIV64_U64_ROUND_CLOSEST(lsum, total);
sq = sq * (lat - avg);
*sq_sump += sq;
ktime_t avg;
if (unlikely(total == 1)) {
*lavg = lat;
} else {
/* the sq is (lat - old_avg) * (lat - new_avg) */
avg = *lavg + div64_s64(lat - *lavg, total);
*sq_sump += (lat - *lavg)*(lat - avg);
*lavg = avg;
}
}
void ceph_update_metrics(struct ceph_metric *m,
......@@ -339,6 +347,7 @@ void ceph_update_metrics(struct ceph_metric *m,
METRIC_UPDATE_MIN_MAX(m->size_min, m->size_max, size);
m->latency_sum += lat;
METRIC_UPDATE_MIN_MAX(m->latency_min, m->latency_max, lat);
__update_stdev(total, m->latency_sum, &m->latency_sq_sum, lat);
__update_mean_and_stdev(total, &m->latency_avg, &m->latency_sq_sum,
lat);
spin_unlock(&m->lock);
}
......@@ -2,7 +2,7 @@
#ifndef _FS_CEPH_MDS_METRIC_H
#define _FS_CEPH_MDS_METRIC_H
#include <linux/types.h>
#include <linux/ceph/types.h>
#include <linux/percpu_counter.h>
#include <linux/ktime.h>
......@@ -19,27 +19,39 @@ enum ceph_metric_type {
CLIENT_METRIC_TYPE_OPENED_INODES,
CLIENT_METRIC_TYPE_READ_IO_SIZES,
CLIENT_METRIC_TYPE_WRITE_IO_SIZES,
CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_WRITE_IO_SIZES,
CLIENT_METRIC_TYPE_AVG_READ_LATENCY,
CLIENT_METRIC_TYPE_STDEV_READ_LATENCY,
CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY,
CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY,
CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY,
CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY,
CLIENT_METRIC_TYPE_MAX = CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY,
};
/*
* This will always have the highest metric bit value
* as the last element of the array.
*/
#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \
CLIENT_METRIC_TYPE_CAP_INFO, \
CLIENT_METRIC_TYPE_READ_LATENCY, \
CLIENT_METRIC_TYPE_WRITE_LATENCY, \
CLIENT_METRIC_TYPE_METADATA_LATENCY, \
CLIENT_METRIC_TYPE_DENTRY_LEASE, \
CLIENT_METRIC_TYPE_OPENED_FILES, \
CLIENT_METRIC_TYPE_PINNED_ICAPS, \
CLIENT_METRIC_TYPE_OPENED_INODES, \
CLIENT_METRIC_TYPE_READ_IO_SIZES, \
CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \
\
CLIENT_METRIC_TYPE_MAX, \
#define CEPHFS_METRIC_SPEC_CLIENT_SUPPORTED { \
CLIENT_METRIC_TYPE_CAP_INFO, \
CLIENT_METRIC_TYPE_READ_LATENCY, \
CLIENT_METRIC_TYPE_WRITE_LATENCY, \
CLIENT_METRIC_TYPE_METADATA_LATENCY, \
CLIENT_METRIC_TYPE_DENTRY_LEASE, \
CLIENT_METRIC_TYPE_OPENED_FILES, \
CLIENT_METRIC_TYPE_PINNED_ICAPS, \
CLIENT_METRIC_TYPE_OPENED_INODES, \
CLIENT_METRIC_TYPE_READ_IO_SIZES, \
CLIENT_METRIC_TYPE_WRITE_IO_SIZES, \
CLIENT_METRIC_TYPE_AVG_READ_LATENCY, \
CLIENT_METRIC_TYPE_STDEV_READ_LATENCY, \
CLIENT_METRIC_TYPE_AVG_WRITE_LATENCY, \
CLIENT_METRIC_TYPE_STDEV_WRITE_LATENCY, \
CLIENT_METRIC_TYPE_AVG_METADATA_LATENCY, \
CLIENT_METRIC_TYPE_STDEV_METADATA_LATENCY, \
\
CLIENT_METRIC_TYPE_MAX, \
}
struct ceph_metric_header {
......@@ -60,22 +72,28 @@ struct ceph_metric_cap {
/* metric read latency header */
struct ceph_metric_read_latency {
struct ceph_metric_header header;
__le32 sec;
__le32 nsec;
struct ceph_timespec lat;
struct ceph_timespec avg;
__le64 sq_sum;
__le64 count;
} __packed;
/* metric write latency header */
struct ceph_metric_write_latency {
struct ceph_metric_header header;
__le32 sec;
__le32 nsec;
struct ceph_timespec lat;
struct ceph_timespec avg;
__le64 sq_sum;
__le64 count;
} __packed;
/* metric metadata latency header */
struct ceph_metric_metadata_latency {
struct ceph_metric_header header;
__le32 sec;
__le32 nsec;
struct ceph_timespec lat;
struct ceph_timespec avg;
__le64 sq_sum;
__le64 count;
} __packed;
/* metric dentry lease header */
......@@ -140,6 +158,7 @@ struct ceph_metric {
u64 size_min;
u64 size_max;
ktime_t latency_sum;
ktime_t latency_avg;
ktime_t latency_sq_sum;
ktime_t latency_min;
ktime_t latency_max;
......
This diff is collapsed.
......@@ -60,6 +60,7 @@ const char *ceph_mds_op_name(int op)
case CEPH_MDS_OP_LOOKUPINO: return "lookupino";
case CEPH_MDS_OP_LOOKUPNAME: return "lookupname";
case CEPH_MDS_OP_GETATTR: return "getattr";
case CEPH_MDS_OP_GETVXATTR: return "getvxattr";
case CEPH_MDS_OP_SETXATTR: return "setxattr";
case CEPH_MDS_OP_SETATTR: return "setattr";
case CEPH_MDS_OP_RMXATTR: return "rmxattr";
......
......@@ -865,6 +865,7 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
*/
struct kmem_cache *ceph_inode_cachep;
struct kmem_cache *ceph_cap_cachep;
struct kmem_cache *ceph_cap_snap_cachep;
struct kmem_cache *ceph_cap_flush_cachep;
struct kmem_cache *ceph_dentry_cachep;
struct kmem_cache *ceph_file_cachep;
......@@ -893,6 +894,9 @@ static int __init init_caches(void)
ceph_cap_cachep = KMEM_CACHE(ceph_cap, SLAB_MEM_SPREAD);
if (!ceph_cap_cachep)
goto bad_cap;
ceph_cap_snap_cachep = KMEM_CACHE(ceph_cap_snap, SLAB_MEM_SPREAD);
if (!ceph_cap_snap_cachep)
goto bad_cap_snap;
ceph_cap_flush_cachep = KMEM_CACHE(ceph_cap_flush,
SLAB_RECLAIM_ACCOUNT|SLAB_MEM_SPREAD);
if (!ceph_cap_flush_cachep)
......@@ -932,6 +936,8 @@ static int __init init_caches(void)
bad_dentry:
kmem_cache_destroy(ceph_cap_flush_cachep);
bad_cap_flush:
kmem_cache_destroy(ceph_cap_snap_cachep);
bad_cap_snap:
kmem_cache_destroy(ceph_cap_cachep);
bad_cap:
kmem_cache_destroy(ceph_inode_cachep);
......@@ -948,6 +954,7 @@ static void destroy_caches(void)
kmem_cache_destroy(ceph_inode_cachep);
kmem_cache_destroy(ceph_cap_cachep);
kmem_cache_destroy(ceph_cap_snap_cachep);
kmem_cache_destroy(ceph_cap_flush_cachep);
kmem_cache_destroy(ceph_dentry_cachep);
kmem_cache_destroy(ceph_file_cachep);
......
......@@ -231,7 +231,7 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
if (refcount_dec_and_test(&capsnap->nref)) {
if (capsnap->xattr_blob)
ceph_buffer_put(capsnap->xattr_blob);
kfree(capsnap);
kmem_cache_free(ceph_cap_snap_cachep, capsnap);
}
}
......@@ -884,6 +884,8 @@ struct ceph_snap_realm {
struct list_head dirty_item; /* if realm needs new context */
struct list_head rebuild_item; /* rebuild snap realms _downward_ in hierarchy */
/* the current set of snaps for this realm */
struct ceph_snap_context *cached_context;
......@@ -939,7 +941,7 @@ extern void ceph_handle_snap(struct ceph_mds_client *mdsc,
struct ceph_msg *msg);
extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap);
extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
extern void ceph_cleanup_global_and_empty_realms(struct ceph_mds_client *mdsc);
extern struct ceph_snapid_map *ceph_get_snapid_map(struct ceph_mds_client *mdsc,
u64 snap);
......@@ -1049,6 +1051,7 @@ static inline bool ceph_inode_is_shutdown(struct inode *inode)
/* xattr.c */
int __ceph_setxattr(struct inode *, const char *, const void *, size_t, int);
int ceph_do_getvxattr(struct inode *inode, const char *name, void *value, size_t size);
ssize_t __ceph_getxattr(struct inode *, const char *, void *, size_t);
extern ssize_t ceph_listxattr(struct dentry *, char *, size_t);
extern struct ceph_buffer *__ceph_build_xattrs_blob(struct ceph_inode_info *ci);
......@@ -1214,7 +1217,7 @@ extern void __ceph_touch_fmode(struct ceph_inode_info *ci,
/* addr.c */
extern const struct address_space_operations ceph_aops;
extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
extern int ceph_uninline_data(struct file *filp, struct page *locked_page);
extern int ceph_uninline_data(struct file *file);
extern int ceph_pool_perm_check(struct inode *inode, int need);
extern void ceph_pool_perm_destroy(struct ceph_mds_client* mdsc);
int ceph_purge_inode_cap(struct inode *inode, struct ceph_cap *cap, bool *invalidate);
......
......@@ -923,10 +923,13 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
{
struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_inode_xattr *xattr;
struct ceph_vxattr *vxattr = NULL;
struct ceph_vxattr *vxattr;
int req_mask;
ssize_t err;
if (strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
goto handle_non_vxattrs;
/* let's see if a virtual xattr was requested */
vxattr = ceph_match_vxattr(inode, name);
if (vxattr) {
......@@ -945,8 +948,14 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
err = -ERANGE;
}
return err;
} else {
err = ceph_do_getvxattr(inode, name, value, size);
/* this would happen with a new client and old server combo */
if (err == -EOPNOTSUPP)
err = -ENODATA;
return err;
}
handle_non_vxattrs:
req_mask = __get_request_mask(inode);
spin_lock(&ci->i_ceph_lock);
......
......@@ -28,8 +28,8 @@
#define CEPH_INO_ROOT 1
#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
#define CEPH_INO_DOTDOT 3 /* used by ceph fuse for parent (..) */
#define CEPH_INO_CEPH 2 /* hidden .ceph dir */
#define CEPH_INO_GLOBAL_SNAPREALM 3 /* global dummy snaprealm */
/* arbitrary limit on max # of monitors (cluster of 3 is typical) */
#define CEPH_MAX_MON 31
......@@ -328,6 +328,7 @@ enum {
CEPH_MDS_OP_LOOKUPPARENT = 0x00103,
CEPH_MDS_OP_LOOKUPINO = 0x00104,
CEPH_MDS_OP_LOOKUPNAME = 0x00105,
CEPH_MDS_OP_GETVXATTR = 0x00106,
CEPH_MDS_OP_SETXATTR = 0x01105,
CEPH_MDS_OP_RMXATTR = 0x01106,
......
......@@ -284,6 +284,7 @@ DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)
extern struct kmem_cache *ceph_inode_cachep;
extern struct kmem_cache *ceph_cap_cachep;
extern struct kmem_cache *ceph_cap_snap_cachep;
extern struct kmem_cache *ceph_cap_flush_cachep;
extern struct kmem_cache *ceph_dentry_cachep;
extern struct kmem_cache *ceph_file_cachep;
......
......@@ -1773,10 +1773,8 @@ static int prepare_read_data(struct ceph_connection *con)
bv.bv_page = con->bounce_page;
bv.bv_offset = 0;
set_in_bvec(con, &bv);
} else {
set_in_bvec(con, &bv);
}
set_in_bvec(con, &bv);
con->v2.in_state = IN_S_PREPARE_READ_DATA_CONT;
return 0;
}
......@@ -1807,10 +1805,8 @@ static void prepare_read_data_cont(struct ceph_connection *con)
if (ceph_test_opt(from_msgr(con->msgr), RXBOUNCE)) {
bv.bv_page = con->bounce_page;
bv.bv_offset = 0;
set_in_bvec(con, &bv);
} else {
set_in_bvec(con, &bv);
}
set_in_bvec(con, &bv);
WARN_ON(con->v2.in_state != IN_S_PREPARE_READ_DATA_CONT);
return;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment