Commit 4c46bef2 authored by Linus Torvalds

Merge tag 'ceph-for-5.6-rc1' of https://github.com/ceph/ceph-client

Pull ceph fixes from Ilya Dryomov:

 - a set of patches that fixes various corner cases in mount and umount
   code (Xiubo Li). This has to do with choosing an MDS, distinguishing
   between laggy and down MDSes, and parsing the server path.

 - inode initialization fixes (Jeff Layton). The one included here
   mostly concerns things like open_by_handle(), and there is another one
   that will come through Al.

 - copy_file_range() now uses the new copy-from2 op (Luis Henriques).
   The existing copy-from op turned out to be infeasible for generic
   filesystem use; we disable the copy offload if OSDs don't support
   copy-from2.

 - a patch to link "rbd" and "block" devices together in sysfs (Hannes
   Reinecke)

... and a smattering of cleanups from Xiubo, Jeff and Chengguang.

* tag 'ceph-for-5.6-rc1' of https://github.com/ceph/ceph-client: (25 commits)
  rbd: set the 'device' link in sysfs
  ceph: move net/ceph/ceph_fs.c to fs/ceph/util.c
  ceph: print name of xattr in __ceph_{get,set}xattr() douts
  ceph: print r_direct_hash in hex in __choose_mds() dout
  ceph: use copy-from2 op in copy_file_range
  ceph: close holes in structs ceph_mds_session and ceph_mds_request
  rbd: work around -Wuninitialized warning
  ceph: allocate the correct amount of extra bytes for the session features
  ceph: rename get_session and switch to use ceph_get_mds_session
  ceph: remove the extra slashes in the server path
  ceph: add possible_max_rank and make the code more readable
  ceph: print dentry offset in hex and fix xattr_version type
  ceph: only touch the caps which have the subset mask requested
  ceph: don't clear I_NEW until inode metadata is fully populated
  ceph: retry the same mds later after the new session is opened
  ceph: check availability of mds cluster on mount after wait timeout
  ceph: keep the session state until it is released
  ceph: add __send_request helper
  ceph: ensure we have a new cap before continuing in fill_inode
  ceph: drop unused ttl_from parameter from fill_inode
  ...
parents 5b211154 3325322f
@@ -2662,7 +2662,7 @@ static int rbd_img_fill_nodata(struct rbd_img_request *img_req,
                               u64 off, u64 len)
 {
     struct ceph_file_extent ex = { off, len };
-    union rbd_img_fill_iter dummy;
+    union rbd_img_fill_iter dummy = {};
     struct rbd_img_fill_ctx fctx = {
         .pos_type = OBJ_REQUEST_NODATA,
         .pos = &dummy,
@@ -7143,7 +7143,7 @@ static ssize_t do_rbd_add(struct bus_type *bus,
     if (rc)
         goto err_out_image_lock;
-    add_disk(rbd_dev->disk);
+    device_add_disk(&rbd_dev->dev, rbd_dev->disk, NULL);
     /* see rbd_init_disk() */
     blk_put_queue(rbd_dev->disk->queue);
...
@@ -8,7 +8,7 @@ obj-$(CONFIG_CEPH_FS) += ceph.o
 ceph-y := super.o inode.o dir.o file.o locks.o addr.o ioctl.o \
     export.o caps.o snap.o xattr.o quota.o io.o \
     mds_client.o mdsmap.o strings.o ceph_frag.o \
-    debugfs.o
+    debugfs.o util.o
 ceph-$(CONFIG_CEPH_FSCACHE) += cache.o
 ceph-$(CONFIG_CEPH_FS_POSIX_ACL) += acl.o
@@ -222,7 +222,7 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
         err = ceph_pagelist_reserve(pagelist, len + val_size2 + 8);
         if (err)
             goto out_err;
-        err = ceph_pagelist_encode_string(pagelist,
+        ceph_pagelist_encode_string(pagelist,
             XATTR_NAME_POSIX_ACL_DEFAULT, len);
         err = posix_acl_to_xattr(&init_user_ns, default_acl,
                                  tmp_buf, val_size2);
...
@@ -908,6 +908,7 @@ int __ceph_caps_issued_mask(struct ceph_inode_info *ci, int mask, int touch)
                         ci_node);
             if (!__cap_is_valid(cap))
                 continue;
+            if (cap->issued & mask)
                 __touch_cap(cap);
         }
     }
...
@@ -33,7 +33,7 @@ static int mdsmap_show(struct seq_file *s, void *p)
     seq_printf(s, "max_mds %d\n", mdsmap->m_max_mds);
     seq_printf(s, "session_timeout %d\n", mdsmap->m_session_timeout);
     seq_printf(s, "session_autoclose %d\n", mdsmap->m_session_autoclose);
-    for (i = 0; i < mdsmap->m_num_mds; i++) {
+    for (i = 0; i < mdsmap->possible_max_rank; i++) {
         struct ceph_entity_addr *addr = &mdsmap->m_info[i].addr;
         int state = mdsmap->m_info[i].state;
         seq_printf(s, "\tmds%d\t%s\t(%s)\n", i,
...
@@ -1186,7 +1186,7 @@ void __ceph_dentry_dir_lease_touch(struct ceph_dentry_info *di)
     struct dentry *dn = di->dentry;
     struct ceph_mds_client *mdsc;
-    dout("dentry_dir_lease_touch %p %p '%pd' (offset %lld)\n",
+    dout("dentry_dir_lease_touch %p %p '%pd' (offset 0x%llx)\n",
         di, dn, dn, di->offset);
     if (!list_empty(&di->lease_list)) {
@@ -1567,7 +1567,7 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
         inode = d_inode(dentry);
     }
-    dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
+    dout("d_revalidate %p '%pd' inode %p offset 0x%llx\n", dentry,
         dentry, inode, ceph_dentry(dentry)->offset);
     /* always trust cached snapped dentries, snapdir dentry */
...
@@ -1974,6 +1974,9 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
     if (ceph_test_mount_opt(src_fsc, NOCOPYFROM))
         return -EOPNOTSUPP;
+    if (!src_fsc->have_copy_from2)
+        return -EOPNOTSUPP;
     /*
      * Striped file layouts require that we copy partial objects, but the
      * OSD copy-from operation only supports full-object copies.  Limit
@@ -2101,8 +2104,14 @@ static ssize_t __ceph_copy_file_range(struct file *src_file, loff_t src_off,
                     CEPH_OSD_OP_FLAG_FADVISE_NOCACHE,
                     &dst_oid, &dst_oloc,
                     CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL |
-                    CEPH_OSD_OP_FLAG_FADVISE_DONTNEED, 0);
+                    CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
+                    dst_ci->i_truncate_seq, dst_ci->i_truncate_size,
+                    CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
         if (err) {
+            if (err == -EOPNOTSUPP) {
+                src_fsc->have_copy_from2 = false;
+                pr_notice("OSDs don't support copy-from2; disabling copy offload\n");
+            }
             dout("ceph_osdc_copy_from returned %d\n", err);
             if (!ret)
                 ret = err;
...
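For context on the copy_file_range() change above: the fallback is a "try once, remember the answer" pattern. The first time the OSDs reject the copy-from2 operation with -EOPNOTSUPP, the client clears have_copy_from2 on the ceph_fs_client, so later calls skip the offload attempt entirely and fall back to the generic VFS copy path. A minimal userspace sketch of that pattern (illustrative only; fake_client and fake_osd_copy_from2 are made-up stand-ins, not kernel APIs):

/* Sketch of the "disable offload after first EOPNOTSUPP" pattern (illustrative). */
#include <errno.h>
#include <stdbool.h>
#include <stdio.h>

struct fake_client { bool have_copy_from2; };

/* Stand-in for the OSD round trip; pretend the cluster is too old. */
static int fake_osd_copy_from2(void) { return -EOPNOTSUPP; }

static long do_copy_range(struct fake_client *fsc)
{
    int err;

    if (!fsc->have_copy_from2)
        return -EOPNOTSUPP;            /* caller falls back to generic copy */

    err = fake_osd_copy_from2();
    if (err == -EOPNOTSUPP) {
        fsc->have_copy_from2 = false;  /* remember: no copy-from2 on this cluster */
        fprintf(stderr, "disabling copy offload\n");
    }
    return err;
}

int main(void)
{
    struct fake_client fsc = { .have_copy_from2 = true };

    do_copy_range(&fsc);   /* first call probes the OSDs and disables offload */
    do_copy_range(&fsc);   /* later calls bail out immediately */
    return 0;
}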
@@ -55,11 +55,9 @@ struct inode *ceph_get_inode(struct super_block *sb, struct ceph_vino vino)
     inode = iget5_locked(sb, t, ceph_ino_compare, ceph_set_ino_cb, &vino);
     if (!inode)
         return ERR_PTR(-ENOMEM);
-    if (inode->i_state & I_NEW) {
+    if (inode->i_state & I_NEW)
         dout("get_inode created new inode %p %llx.%llx ino %llx\n",
              inode, ceph_vinop(inode), (u64)inode->i_ino);
-        unlock_new_inode(inode);
-    }
     dout("get_inode on %lu=%llx.%llx got %p\n", inode->i_ino, vino.ino,
          vino.snap, inode);
@@ -88,6 +86,10 @@ struct inode *ceph_get_snapdir(struct inode *parent)
     inode->i_fop = &ceph_snapdir_fops;
     ci->i_snap_caps = CEPH_CAP_PIN; /* so we can open */
     ci->i_rbytes = 0;
+    if (inode->i_state & I_NEW)
+        unlock_new_inode(inode);
     return inode;
 }
@@ -728,8 +730,7 @@ void ceph_fill_file_time(struct inode *inode, int issued,
 static int fill_inode(struct inode *inode, struct page *locked_page,
                       struct ceph_mds_reply_info_in *iinfo,
                       struct ceph_mds_reply_dirfrag *dirinfo,
-                      struct ceph_mds_session *session,
-                      unsigned long ttl_from, int cap_fmode,
+                      struct ceph_mds_session *session, int cap_fmode,
                       struct ceph_cap_reservation *caps_reservation)
 {
     struct ceph_mds_client *mdsc = ceph_inode_to_client(inode)->mdsc;
@@ -754,8 +755,11 @@ static int fill_inode(struct inode *inode, struct page *locked_page,
     info_caps = le32_to_cpu(info->cap.caps);
     /* prealloc new cap struct */
-    if (info_caps && ceph_snap(inode) == CEPH_NOSNAP)
+    if (info_caps && ceph_snap(inode) == CEPH_NOSNAP) {
         new_cap = ceph_get_cap(mdsc, caps_reservation);
+        if (!new_cap)
+            return -ENOMEM;
+    }
     /*
      * prealloc xattr data, if it looks like we'll need it.  only
@@ -1237,7 +1241,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
     if (dir) {
         err = fill_inode(dir, NULL,
                          &rinfo->diri, rinfo->dirfrag,
-                         session, req->r_request_started, -1,
+                         session, -1,
                          &req->r_caps_reservation);
         if (err < 0)
             goto done;
@@ -1302,18 +1306,22 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
             err = PTR_ERR(in);
             goto done;
         }
-        req->r_target_inode = in;
         err = fill_inode(in, req->r_locked_page, &rinfo->targeti, NULL,
-                session, req->r_request_started,
+                session,
                 (!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags) &&
                  rinfo->head->result == 0) ? req->r_fmode : -1,
                 &req->r_caps_reservation);
         if (err < 0) {
             pr_err("fill_inode badness %p %llx.%llx\n",
                 in, ceph_vinop(in));
+            if (in->i_state & I_NEW)
+                discard_new_inode(in);
             goto done;
         }
+        req->r_target_inode = in;
+        if (in->i_state & I_NEW)
+            unlock_new_inode(in);
     }
@@ -1493,12 +1501,18 @@ static int readdir_prepopulate_inodes_only(struct ceph_mds_request *req,
             continue;
         }
         rc = fill_inode(in, NULL, &rde->inode, NULL, session,
-                req->r_request_started, -1,
-                &req->r_caps_reservation);
+                -1, &req->r_caps_reservation);
         if (rc < 0) {
             pr_err("fill_inode badness on %p got %d\n", in, rc);
             err = rc;
+            if (in->i_state & I_NEW) {
+                ihold(in);
+                discard_new_inode(in);
+            }
+        } else if (in->i_state & I_NEW) {
+            unlock_new_inode(in);
         }
         /* avoid calling iput_final() in mds dispatch threads */
         ceph_async_iput(in);
     }
@@ -1694,19 +1708,24 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
         }
         ret = fill_inode(in, NULL, &rde->inode, NULL, session,
-                 req->r_request_started, -1,
-                 &req->r_caps_reservation);
+                 -1, &req->r_caps_reservation);
         if (ret < 0) {
             pr_err("fill_inode badness on %p\n", in);
             if (d_really_is_negative(dn)) {
                 /* avoid calling iput_final() in mds
                  * dispatch threads */
+                if (in->i_state & I_NEW) {
+                    ihold(in);
+                    discard_new_inode(in);
+                }
                 ceph_async_iput(in);
             }
             d_drop(dn);
             err = ret;
             goto next_item;
         }
+        if (in->i_state & I_NEW)
+            unlock_new_inode(in);
         if (d_really_is_negative(dn)) {
             if (ceph_security_xattr_deadlock(in)) {
...
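The inode.c hunks above all revolve around the VFS I_NEW protocol: an inode returned by iget5_locked() with I_NEW set stays invisible to other lookups until unlock_new_inode() is called, so the fix keeps it locked until fill_inode() has populated the metadata and uses discard_new_inode() when filling fails. A condensed kernel-context sketch of that call pattern (fill_inode_metadata() is a hypothetical placeholder; this is not the actual ceph code):

/* Sketch of the I_NEW handling pattern used above (not the actual ceph code). */
#include <linux/fs.h>
#include <linux/err.h>

static struct inode *lookup_and_fill(struct super_block *sb, unsigned long hash,
                                     int (*test)(struct inode *, void *),
                                     int (*set)(struct inode *, void *),
                                     void *key)
{
    struct inode *inode;
    int err;

    inode = iget5_locked(sb, hash, test, set, key);
    if (!inode)
        return ERR_PTR(-ENOMEM);

    if (!(inode->i_state & I_NEW))
        return inode;                 /* already initialized by someone else */

    err = fill_inode_metadata(inode); /* hypothetical: populate from MDS reply */
    if (err) {
        discard_new_inode(inode);     /* drops I_NEW and frees the half-built inode */
        return ERR_PTR(err);
    }

    unlock_new_inode(inode);          /* publish only fully populated inodes */
    return inode;
}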
@@ -9,6 +9,7 @@
 #include <linux/debugfs.h>
 #include <linux/seq_file.h>
 #include <linux/ratelimit.h>
+#include <linux/bits.h>
 #include "super.h"
 #include "mds_client.h"
@@ -530,6 +531,7 @@ const char *ceph_session_state_name(int s)
     case CEPH_MDS_SESSION_OPEN: return "open";
     case CEPH_MDS_SESSION_HUNG: return "hung";
     case CEPH_MDS_SESSION_CLOSING: return "closing";
+    case CEPH_MDS_SESSION_CLOSED: return "closed";
     case CEPH_MDS_SESSION_RESTARTING: return "restarting";
     case CEPH_MDS_SESSION_RECONNECTING: return "reconnecting";
     case CEPH_MDS_SESSION_REJECTED: return "rejected";
@@ -537,7 +539,7 @@ const char *ceph_session_state_name(int s)
     }
 }
-static struct ceph_mds_session *get_session(struct ceph_mds_session *s)
+struct ceph_mds_session *ceph_get_mds_session(struct ceph_mds_session *s)
 {
     if (refcount_inc_not_zero(&s->s_ref)) {
         dout("mdsc get_session %p %d -> %d\n", s,
@@ -568,7 +570,7 @@ struct ceph_mds_session *__ceph_lookup_mds_session(struct ceph_mds_client *mdsc,
 {
     if (mds >= mdsc->max_sessions || !mdsc->sessions[mds])
         return NULL;
-    return get_session(mdsc->sessions[mds]);
+    return ceph_get_mds_session(mdsc->sessions[mds]);
 }
 static bool __have_session(struct ceph_mds_client *mdsc, int mds)
@@ -597,7 +599,7 @@ static struct ceph_mds_session *register_session(struct ceph_mds_client *mdsc,
 {
     struct ceph_mds_session *s;
-    if (mds >= mdsc->mdsmap->m_num_mds)
+    if (mds >= mdsc->mdsmap->possible_max_rank)
         return ERR_PTR(-EINVAL);
     s = kzalloc(sizeof(*s), GFP_NOFS);
@@ -674,7 +676,6 @@ static void __unregister_session(struct ceph_mds_client *mdsc,
     dout("__unregister_session mds%d %p\n", s->s_mds, s);
     BUG_ON(mdsc->sessions[s->s_mds] != s);
     mdsc->sessions[s->s_mds] = NULL;
-    s->s_state = 0;
     ceph_con_close(&s->s_con);
     ceph_put_mds_session(s);
     atomic_dec(&mdsc->num_sessions);
@@ -878,7 +879,8 @@ static struct inode *get_nonsnap_parent(struct dentry *dentry)
  * Called under mdsc->mutex.
  */
 static int __choose_mds(struct ceph_mds_client *mdsc,
-                        struct ceph_mds_request *req)
+                        struct ceph_mds_request *req,
+                        bool *random)
 {
     struct inode *inode;
     struct ceph_inode_info *ci;
@@ -888,6 +890,9 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
     u32 hash = req->r_direct_hash;
     bool is_hash = test_bit(CEPH_MDS_R_DIRECT_IS_HASH, &req->r_req_flags);
+    if (random)
+        *random = false;
     /*
      * is there a specific mds we should try?  ignore hint if we have
      * no session and the mds is not up (active or recovering).
@@ -895,7 +900,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
     if (req->r_resend_mds >= 0 &&
         (__have_session(mdsc, req->r_resend_mds) ||
          ceph_mdsmap_get_state(mdsc->mdsmap, req->r_resend_mds) > 0)) {
-        dout("choose_mds using resend_mds mds%d\n",
+        dout("%s using resend_mds mds%d\n", __func__,
              req->r_resend_mds);
         return req->r_resend_mds;
     }
@@ -913,7 +918,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
             rcu_read_lock();
             inode = get_nonsnap_parent(req->r_dentry);
             rcu_read_unlock();
-            dout("__choose_mds using snapdir's parent %p\n", inode);
+            dout("%s using snapdir's parent %p\n", __func__, inode);
         }
     } else if (req->r_dentry) {
         /* ignore race with rename; old or new d_parent is okay */
@@ -933,7 +938,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
             /* direct snapped/virtual snapdir requests
              * based on parent dir inode */
             inode = get_nonsnap_parent(parent);
-            dout("__choose_mds using nonsnap parent %p\n", inode);
+            dout("%s using nonsnap parent %p\n", __func__, inode);
         } else {
             /* dentry target */
             inode = d_inode(req->r_dentry);
@@ -949,8 +954,8 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
         rcu_read_unlock();
     }
-    dout("__choose_mds %p is_hash=%d (%d) mode %d\n", inode, (int)is_hash,
-         (int)hash, mode);
+    dout("%s %p is_hash=%d (0x%x) mode %d\n", __func__, inode, (int)is_hash,
+         hash, mode);
     if (!inode)
         goto random;
     ci = ceph_inode(inode);
@@ -968,31 +973,34 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
                 get_random_bytes(&r, 1);
                 r %= frag.ndist;
                 mds = frag.dist[r];
-                dout("choose_mds %p %llx.%llx "
-                     "frag %u mds%d (%d/%d)\n",
-                     inode, ceph_vinop(inode),
-                     frag.frag, mds,
-                     (int)r, frag.ndist);
+                dout("%s %p %llx.%llx frag %u mds%d (%d/%d)\n",
+                     __func__, inode, ceph_vinop(inode),
+                     frag.frag, mds, (int)r, frag.ndist);
                 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
-                    CEPH_MDS_STATE_ACTIVE)
+                    CEPH_MDS_STATE_ACTIVE &&
+                    !ceph_mdsmap_is_laggy(mdsc->mdsmap, mds))
                     goto out;
             }
             /* since this file/dir wasn't known to be
              * replicated, then we want to look for the
              * authoritative mds. */
-            mode = USE_AUTH_MDS;
             if (frag.mds >= 0) {
                 /* choose auth mds */
                 mds = frag.mds;
-                dout("choose_mds %p %llx.%llx "
-                     "frag %u mds%d (auth)\n",
-                     inode, ceph_vinop(inode), frag.frag, mds);
+                dout("%s %p %llx.%llx frag %u mds%d (auth)\n",
+                     __func__, inode, ceph_vinop(inode),
+                     frag.frag, mds);
                 if (ceph_mdsmap_get_state(mdsc->mdsmap, mds) >=
-                    CEPH_MDS_STATE_ACTIVE)
+                    CEPH_MDS_STATE_ACTIVE) {
+                    if (mode == USE_ANY_MDS &&
+                        !ceph_mdsmap_is_laggy(mdsc->mdsmap,
+                                              mds))
                         goto out;
+                }
             }
+            mode = USE_AUTH_MDS;
         }
     }
     spin_lock(&ci->i_ceph_lock);
@@ -1007,7 +1015,7 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
         goto random;
     }
     mds = cap->session->s_mds;
-    dout("choose_mds %p %llx.%llx mds%d (%scap %p)\n",
+    dout("%s %p %llx.%llx mds%d (%scap %p)\n", __func__,
          inode, ceph_vinop(inode), mds,
          cap == ci->i_auth_cap ? "auth " : "", cap);
     spin_unlock(&ci->i_ceph_lock);
@@ -1018,8 +1026,11 @@ static int __choose_mds(struct ceph_mds_client *mdsc,
     return mds;
 random:
+    if (random)
+        *random = true;
     mds = ceph_mdsmap_get_random_mds(mdsc->mdsmap);
-    dout("choose_mds chose random mds%d\n", mds);
+    dout("%s chose random mds%d\n", __func__, mds);
     return mds;
 }
@@ -1045,20 +1056,21 @@ static struct ceph_msg *create_session_msg(u32 op, u64 seq)
     return msg;
 }
+static const unsigned char feature_bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED;
+#define FEATURE_BYTES(c) (DIV_ROUND_UP((size_t)feature_bits[c - 1] + 1, 64) * 8)
 static void encode_supported_features(void **p, void *end)
 {
-    static const unsigned char bits[] = CEPHFS_FEATURES_CLIENT_SUPPORTED;
-    static const size_t count = ARRAY_SIZE(bits);
+    static const size_t count = ARRAY_SIZE(feature_bits);
     if (count > 0) {
         size_t i;
-        size_t size = ((size_t)bits[count - 1] + 64) / 64 * 8;
+        size_t size = FEATURE_BYTES(count);
         BUG_ON(*p + 4 + size > end);
         ceph_encode_32(p, size);
         memset(*p, 0, size);
         for (i = 0; i < count; i++)
-            ((unsigned char*)(*p))[i / 8] |= 1 << (bits[i] % 8);
+            ((unsigned char*)(*p))[i / 8] |= BIT(feature_bits[i] % 8);
         *p += size;
     } else {
         BUG_ON(*p + 4 > end);
@@ -1079,6 +1091,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
     int metadata_key_count = 0;
     struct ceph_options *opt = mdsc->fsc->client->options;
     struct ceph_mount_options *fsopt = mdsc->fsc->mount_options;
+    size_t size, count;
     void *p, *end;
     const char* metadata[][2] = {
@@ -1096,8 +1109,13 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
             strlen(metadata[i][1]);
         metadata_key_count++;
     }
     /* supported feature */
-    extra_bytes += 4 + 8;
+    size = 0;
+    count = ARRAY_SIZE(feature_bits);
+    if (count > 0)
+        size = FEATURE_BYTES(count);
+    extra_bytes += 4 + size;
     /* Allocate the message */
     msg = ceph_msg_new(CEPH_MSG_CLIENT_SESSION, sizeof(*h) + extra_bytes,
@@ -1117,7 +1135,7 @@ static struct ceph_msg *create_session_open_msg(struct ceph_mds_client *mdsc, u6
      * Serialize client metadata into waiting buffer space, using
      * the format that userspace expects for map<string, string>
      *
-     * ClientSession messages with metadata are v2
+     * ClientSession messages with metadata are v3
      */
     msg->hdr.version = cpu_to_le16(3);
     msg->hdr.compat_version = cpu_to_le16(1);
@@ -1219,7 +1237,7 @@ static void __open_export_target_sessions(struct ceph_mds_client *mdsc,
     struct ceph_mds_session *ts;
     int i, mds = session->s_mds;
-    if (mds >= mdsc->mdsmap->m_num_mds)
+    if (mds >= mdsc->mdsmap->possible_max_rank)
         return;
     mi = &mdsc->mdsmap->m_info[mds];
@@ -1967,7 +1985,7 @@ void ceph_flush_cap_releases(struct ceph_mds_client *mdsc,
     if (mdsc->stopping)
         return;
-    get_session(session);
+    ceph_get_mds_session(session);
     if (queue_work(mdsc->fsc->cap_wq,
                    &session->s_cap_release_work)) {
         dout("cap release work queued\n");
@@ -2515,6 +2533,26 @@ static int __prepare_send_request(struct ceph_mds_client *mdsc,
     return 0;
 }
+/*
+ * called under mdsc->mutex
+ */
+static int __send_request(struct ceph_mds_client *mdsc,
+                          struct ceph_mds_session *session,
+                          struct ceph_mds_request *req,
+                          bool drop_cap_releases)
+{
+    int err;
+
+    err = __prepare_send_request(mdsc, req, session->s_mds,
+                                 drop_cap_releases);
+    if (!err) {
+        ceph_msg_get(req->r_request);
+        ceph_con_send(&session->s_con, req->r_request);
+    }
+
+    return err;
+}
+
 /*
  * send request, or put it on the appropriate wait list.
  */
@@ -2524,6 +2562,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
     struct ceph_mds_session *session = NULL;
     int mds = -1;
     int err = 0;
+    bool random;
     if (req->r_err || test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
         if (test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags))
@@ -2556,15 +2595,14 @@ static void __do_request(struct ceph_mds_client *mdsc,
         if (!(mdsc->fsc->mount_options->flags &
                 CEPH_MOUNT_OPT_MOUNTWAIT) &&
             !ceph_mdsmap_is_cluster_available(mdsc->mdsmap)) {
-            err = -ENOENT;
-            pr_info("probably no mds server is up\n");
+            err = -EHOSTUNREACH;
             goto finish;
         }
     }
     put_request_session(req);
-    mds = __choose_mds(mdsc, req);
+    mds = __choose_mds(mdsc, req, &random);
     if (mds < 0 ||
         ceph_mdsmap_get_state(mdsc->mdsmap, mds) < CEPH_MDS_STATE_ACTIVE) {
         dout("do_request no mds or not active, waiting for map\n");
@@ -2581,7 +2619,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
             goto finish;
         }
     }
-    req->r_session = get_session(session);
+    req->r_session = ceph_get_mds_session(session);
     dout("do_request mds%d session %p state %s\n", mds, session,
          ceph_session_state_name(session->s_state));
@@ -2592,8 +2630,12 @@ static void __do_request(struct ceph_mds_client *mdsc,
         goto out_session;
     }
     if (session->s_state == CEPH_MDS_SESSION_NEW ||
-        session->s_state == CEPH_MDS_SESSION_CLOSING)
+        session->s_state == CEPH_MDS_SESSION_CLOSING) {
         __open_session(mdsc, session);
+        /* retry the same mds later */
+        if (random)
+            req->r_resend_mds = mds;
+    }
     list_add(&req->r_wait, &session->s_waiting);
     goto out_session;
 }
@@ -2604,11 +2646,7 @@ static void __do_request(struct ceph_mds_client *mdsc,
     if (req->r_request_started == 0)   /* note request start time */
         req->r_request_started = jiffies;
-    err = __prepare_send_request(mdsc, req, mds, false);
-    if (!err) {
-        ceph_msg_get(req->r_request);
-        ceph_con_send(&session->s_con, req->r_request);
-    }
+    err = __send_request(mdsc, session, req, false);
 out_session:
     ceph_put_mds_session(session);
@@ -2861,7 +2899,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
             mutex_unlock(&mdsc->mutex);
             goto out;
         } else {
-            int mds = __choose_mds(mdsc, req);
+            int mds = __choose_mds(mdsc, req, NULL);
             if (mds >= 0 && mds != req->r_session->s_mds) {
                 dout("but auth changed, so resending\n");
                 __do_request(mdsc, req);
@@ -2877,6 +2915,10 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
     set_bit(CEPH_MDS_R_GOT_SAFE, &req->r_req_flags);
     __unregister_request(mdsc, req);
+    /* last request during umount? */
+    if (mdsc->stopping && !__get_oldest_req(mdsc))
+        complete_all(&mdsc->safe_umount_waiters);
     if (test_bit(CEPH_MDS_R_GOT_UNSAFE, &req->r_req_flags)) {
         /*
          * We already handled the unsafe response, now do the
@@ -2887,9 +2929,6 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
          */
         dout("got safe reply %llu, mds%d\n", tid, mds);
-        /* last unsafe request during umount? */
-        if (mdsc->stopping && !__get_oldest_req(mdsc))
-            complete_all(&mdsc->safe_umount_waiters);
         mutex_unlock(&mdsc->mutex);
         goto out;
     }
@@ -3104,7 +3143,7 @@ static void handle_session(struct ceph_mds_session *session,
     mutex_lock(&mdsc->mutex);
     if (op == CEPH_SESSION_CLOSE) {
-        get_session(session);
+        ceph_get_mds_session(session);
         __unregister_session(mdsc, session);
     }
     /* FIXME: this ttl calculation is generous */
@@ -3142,6 +3181,7 @@ static void handle_session(struct ceph_mds_session *session,
     case CEPH_SESSION_CLOSE:
         if (session->s_state == CEPH_MDS_SESSION_RECONNECTING)
             pr_info("mds%d reconnect denied\n", session->s_mds);
+        session->s_state = CEPH_MDS_SESSION_CLOSED;
         cleanup_session_requests(mdsc, session);
         remove_session_caps(session);
         wake = 2; /* for good measure */
@@ -3209,7 +3249,6 @@ static void handle_session(struct ceph_mds_session *session,
     return;
 }
-
 /*
  * called under session->mutex.
  */
@@ -3218,18 +3257,12 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
 {
     struct ceph_mds_request *req, *nreq;
     struct rb_node *p;
-    int err;
     dout("replay_unsafe_requests mds%d\n", session->s_mds);
     mutex_lock(&mdsc->mutex);
-    list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item) {
-        err = __prepare_send_request(mdsc, req, session->s_mds, true);
-        if (!err) {
-            ceph_msg_get(req->r_request);
-            ceph_con_send(&session->s_con, req->r_request);
-        }
-    }
+    list_for_each_entry_safe(req, nreq, &session->s_unsafe, r_unsafe_item)
+        __send_request(mdsc, session, req, true);
     /*
      * also re-send old requests when MDS enters reconnect stage. So that MDS
@@ -3244,14 +3277,8 @@ static void replay_unsafe_requests(struct ceph_mds_client *mdsc,
         if (req->r_attempts == 0)
             continue; /* only old requests */
         if (req->r_session &&
-            req->r_session->s_mds == session->s_mds) {
-            err = __prepare_send_request(mdsc, req,
-                                         session->s_mds, true);
-            if (!err) {
-                ceph_msg_get(req->r_request);
-                ceph_con_send(&session->s_con, req->r_request);
-            }
-        }
+            req->r_session->s_mds == session->s_mds)
+            __send_request(mdsc, session, req, true);
     }
     mutex_unlock(&mdsc->mutex);
 }
@@ -3762,7 +3789,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
     dout("check_new_map new %u old %u\n",
          newmap->m_epoch, oldmap->m_epoch);
-    for (i = 0; i < oldmap->m_num_mds && i < mdsc->max_sessions; i++) {
+    for (i = 0; i < oldmap->possible_max_rank && i < mdsc->max_sessions; i++) {
         if (!mdsc->sessions[i])
             continue;
         s = mdsc->sessions[i];
@@ -3776,9 +3803,9 @@ static void check_new_map(struct ceph_mds_client *mdsc,
            ceph_mdsmap_is_laggy(newmap, i) ? " (laggy)" : "",
            ceph_session_state_name(s->s_state));
-        if (i >= newmap->m_num_mds) {
+        if (i >= newmap->possible_max_rank) {
             /* force close session for stopped mds */
-            get_session(s);
+            ceph_get_mds_session(s);
             __unregister_session(mdsc, s);
             __wake_requests(mdsc, &s->s_waiting);
             mutex_unlock(&mdsc->mutex);
@@ -3833,7 +3860,7 @@ static void check_new_map(struct ceph_mds_client *mdsc,
         }
     }
-    for (i = 0; i < newmap->m_num_mds && i < mdsc->max_sessions; i++) {
+    for (i = 0; i < newmap->possible_max_rank && i < mdsc->max_sessions; i++) {
         s = mdsc->sessions[i];
         if (!s)
             continue;
@@ -4379,7 +4406,7 @@ void ceph_mdsc_close_sessions(struct ceph_mds_client *mdsc)
     mutex_lock(&mdsc->mutex);
     for (i = 0; i < mdsc->max_sessions; i++) {
        if (mdsc->sessions[i]) {
-            session = get_session(mdsc->sessions[i]);
+            session = ceph_get_mds_session(mdsc->sessions[i]);
            __unregister_session(mdsc, session);
            mutex_unlock(&mdsc->mutex);
            mutex_lock(&session->s_mutex);
@@ -4607,11 +4634,8 @@ static struct ceph_connection *con_get(struct ceph_connection *con)
 {
     struct ceph_mds_session *s = con->private;
-    if (get_session(s)) {
-        dout("mdsc con_get %p ok (%d)\n", s, refcount_read(&s->s_ref));
+    if (ceph_get_mds_session(s))
         return con;
-    }
-    dout("mdsc con_get %p FAIL\n", s);
     return NULL;
 }
@@ -4619,7 +4643,6 @@ static void con_put(struct ceph_connection *con)
 {
     struct ceph_mds_session *s = con->private;
-    dout("mdsc con_put %p (%d)\n", s, refcount_read(&s->s_ref) - 1);
     ceph_put_mds_session(s);
 }
...
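One detail from the mds_client.c hunks worth spelling out: the session-open message now reserves FEATURE_BYTES(count) bytes for the feature bitmap instead of a hard-coded 8, rounding the highest advertised feature bit up to a whole 64-bit word. With the current table the highest bit is CEPHFS_FEATURE_MAX = 12, so DIV_ROUND_UP(12 + 1, 64) * 8 still yields 8 bytes; the size only grows once a feature bit passes 63. A tiny standalone check of that arithmetic (a userspace mirror that takes the highest bit value directly, unlike the kernel macro which indexes feature_bits[c - 1]):

/* Standalone check of the FEATURE_BYTES sizing used above (userspace mirror). */
#include <stdio.h>
#include <stddef.h>

#define DIV_ROUND_UP(n, d)   (((n) + (d) - 1) / (d))
#define FEATURE_BYTES(last)  (DIV_ROUND_UP((size_t)(last) + 1, 64) * 8)

int main(void)
{
    /* highest bit currently advertised (CEPHFS_FEATURE_MAX == 12) -> 8 bytes */
    printf("max bit 12 -> %zu bytes\n", FEATURE_BYTES(12));
    /* a hypothetical future bit 64 would bump the bitmap to 16 bytes */
    printf("max bit 64 -> %zu bytes\n", FEATURE_BYTES(64));
    return 0;
}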
@@ -17,22 +17,31 @@
 #include <linux/ceph/auth.h>
 /* The first 8 bits are reserved for old ceph releases */
-#define CEPHFS_FEATURE_MIMIC		8
-#define CEPHFS_FEATURE_REPLY_ENCODING	9
-#define CEPHFS_FEATURE_RECLAIM_CLIENT	10
-#define CEPHFS_FEATURE_LAZY_CAP_WANTED	11
-#define CEPHFS_FEATURE_MULTI_RECONNECT	12
+enum ceph_feature_type {
+    CEPHFS_FEATURE_MIMIC = 8,
+    CEPHFS_FEATURE_REPLY_ENCODING,
+    CEPHFS_FEATURE_RECLAIM_CLIENT,
+    CEPHFS_FEATURE_LAZY_CAP_WANTED,
+    CEPHFS_FEATURE_MULTI_RECONNECT,
+
+    CEPHFS_FEATURE_MAX = CEPHFS_FEATURE_MULTI_RECONNECT,
+};
+/*
+ * This will always have the highest feature bit value
+ * as the last element of the array.
+ */
 #define CEPHFS_FEATURES_CLIENT_SUPPORTED {	\
     0, 1, 2, 3, 4, 5, 6, 7,			\
     CEPHFS_FEATURE_MIMIC,			\
     CEPHFS_FEATURE_REPLY_ENCODING,		\
     CEPHFS_FEATURE_LAZY_CAP_WANTED,		\
     CEPHFS_FEATURE_MULTI_RECONNECT,		\
+						\
+    CEPHFS_FEATURE_MAX,				\
 }
 #define CEPHFS_FEATURES_CLIENT_REQUIRED {}
 /*
  * Some lock dependencies:
  *
@@ -151,7 +160,8 @@ enum {
     CEPH_MDS_SESSION_RESTARTING = 5,
     CEPH_MDS_SESSION_RECONNECTING = 6,
     CEPH_MDS_SESSION_CLOSING = 7,
-    CEPH_MDS_SESSION_REJECTED = 8,
+    CEPH_MDS_SESSION_CLOSED = 8,
+    CEPH_MDS_SESSION_REJECTED = 9,
 };
 struct ceph_mds_session {
@@ -174,6 +184,7 @@ struct ceph_mds_session {
     /* protected by s_cap_lock */
     spinlock_t        s_cap_lock;
+    refcount_t        s_ref;
     struct list_head  s_caps;     /* all caps issued by this session */
     struct ceph_cap  *s_cap_iterator;
     int               s_nr_caps;
@@ -188,7 +199,6 @@ struct ceph_mds_session {
     unsigned long     s_renew_requested; /* last time we sent a renew req */
     u64               s_renew_seq;
-    refcount_t        s_ref;
     struct list_head  s_waiting;  /* waiting requests */
     struct list_head  s_unsafe;   /* unsafe requests */
 };
@@ -224,6 +234,7 @@ struct ceph_mds_request {
     struct rb_node r_node;
     struct ceph_mds_client *r_mdsc;
+    struct kref       r_kref;
     int r_op;          /* mds op code */
     /* operation on what? */
@@ -294,7 +305,6 @@ struct ceph_mds_request {
     int               r_resend_mds; /* mds to resend to next, if any*/
     u32               r_sent_on_mseq; /* cap mseq request was sent at*/
-    struct kref       r_kref;
     struct list_head  r_wait;
     struct completion r_completion;
     struct completion r_safe_completion;
@@ -451,15 +461,10 @@ extern const char *ceph_mds_op_name(int op);
 extern struct ceph_mds_session *
 __ceph_lookup_mds_session(struct ceph_mds_client *, int mds);
-static inline struct ceph_mds_session *
-ceph_get_mds_session(struct ceph_mds_session *s)
-{
-    refcount_inc(&s->s_ref);
-    return s;
-}
 extern const char *ceph_session_state_name(int s);
+extern struct ceph_mds_session *
+ceph_get_mds_session(struct ceph_mds_session *s);
 extern void ceph_put_mds_session(struct ceph_mds_session *s);
 extern int ceph_send_msg_mds(struct ceph_mds_client *mdsc,
...
@@ -13,30 +13,25 @@
 #include "super.h"
+#define CEPH_MDS_IS_READY(i, ignore_laggy) \
+    (m->m_info[i].state > 0 && ignore_laggy ? true : !m->m_info[i].laggy)
-/*
- * choose a random mds that is "up" (i.e. has a state > 0), or -1.
- */
-int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
+static int __mdsmap_get_random_mds(struct ceph_mdsmap *m, bool ignore_laggy)
 {
     int n = 0;
     int i, j;
-    /* special case for one mds */
-    if (1 == m->m_num_mds && m->m_info[0].state > 0)
-        return 0;
     /* count */
-    for (i = 0; i < m->m_num_mds; i++)
-        if (m->m_info[i].state > 0)
+    for (i = 0; i < m->possible_max_rank; i++)
+        if (CEPH_MDS_IS_READY(i, ignore_laggy))
             n++;
     if (n == 0)
         return -1;
     /* pick */
     n = prandom_u32() % n;
-    for (j = 0, i = 0; i < m->m_num_mds; i++) {
-        if (m->m_info[i].state > 0)
+    for (j = 0, i = 0; i < m->possible_max_rank; i++) {
+        if (CEPH_MDS_IS_READY(i, ignore_laggy))
             j++;
         if (j > n)
             break;
@@ -45,6 +40,20 @@ int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
     return i;
 }
+/*
+ * choose a random mds that is "up" (i.e. has a state > 0), or -1.
+ */
+int ceph_mdsmap_get_random_mds(struct ceph_mdsmap *m)
+{
+    int mds;
+
+    mds = __mdsmap_get_random_mds(m, false);
+    if (mds == m->possible_max_rank || mds == -1)
+        mds = __mdsmap_get_random_mds(m, true);
+
+    return mds == m->possible_max_rank ? -1 : mds;
+}
+
 #define __decode_and_drop_type(p, end, type, bad)	\
     do {						\
         if (*p + sizeof(type) > end)			\
@@ -138,14 +147,29 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
     m->m_session_autoclose = ceph_decode_32(p);
     m->m_max_file_size = ceph_decode_64(p);
     m->m_max_mds = ceph_decode_32(p);
-    m->m_num_mds = m->m_max_mds;
-    m->m_info = kcalloc(m->m_num_mds, sizeof(*m->m_info), GFP_NOFS);
+    /*
+     * pick out the active nodes as the m_num_active_mds, the
+     * m_num_active_mds maybe larger than m_max_mds when decreasing
+     * the max_mds in cluster side, in other case it should less
+     * than or equal to m_max_mds.
+     */
+    m->m_num_active_mds = n = ceph_decode_32(p);
+
+    /*
+     * the possible max rank, it maybe larger than the m_num_active_mds,
+     * for example if the mds_max == 2 in the cluster, when the MDS(0)
+     * was laggy and being replaced by a new MDS, we will temporarily
+     * receive a new mds map with n_num_mds == 1 and the active MDS(1),
+     * and the mds rank >= m_num_active_mds.
+     */
+    m->possible_max_rank = max(m->m_num_active_mds, m->m_max_mds);
+
+    m->m_info = kcalloc(m->possible_max_rank, sizeof(*m->m_info), GFP_NOFS);
     if (!m->m_info)
         goto nomem;
     /* pick out active nodes from mds_info (state > 0) */
-    n = ceph_decode_32(p);
     for (i = 0; i < n; i++) {
         u64 global_id;
         u32 namelen;
@@ -215,18 +239,15 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
             ceph_mds_state_name(state),
             laggy ? "(laggy)" : "");
-        if (mds < 0 || state <= 0)
+        if (mds < 0 || mds >= m->possible_max_rank) {
+            pr_warn("mdsmap_decode got incorrect mds(%d)\n", mds);
             continue;
+        }
-        if (mds >= m->m_num_mds) {
-            int new_num = max(mds + 1, m->m_num_mds * 2);
-            void *new_m_info = krealloc(m->m_info,
-                                        new_num * sizeof(*m->m_info),
-                                        GFP_NOFS | __GFP_ZERO);
-            if (!new_m_info)
-                goto nomem;
-            m->m_info = new_m_info;
-            m->m_num_mds = new_num;
+        if (state <= 0) {
+            pr_warn("mdsmap_decode got incorrect state(%s)\n",
+                    ceph_mds_state_name(state));
+            continue;
         }
         info = &m->m_info[mds];
@@ -247,14 +268,6 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
             info->export_targets = NULL;
         }
     }
-    if (m->m_num_mds > m->m_max_mds) {
-        /* find max up mds */
-        for (i = m->m_num_mds; i >= m->m_max_mds; i--) {
-            if (i == 0 || m->m_info[i-1].state > 0)
-                break;
-        }
-        m->m_num_mds = i;
-    }
     /* pg_pools */
     ceph_decode_32_safe(p, end, n, bad);
@@ -296,14 +309,14 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
         for (i = 0; i < n; i++) {
             s32 mds = ceph_decode_32(p);
-            if (mds >= 0 && mds < m->m_num_mds) {
+            if (mds >= 0 && mds < m->possible_max_rank) {
                 if (m->m_info[mds].laggy)
                     num_laggy++;
             }
         }
         m->m_num_laggy = num_laggy;
-        if (n > m->m_num_mds) {
+        if (n > m->possible_max_rank) {
             void *new_m_info = krealloc(m->m_info,
                                         n * sizeof(*m->m_info),
                                         GFP_NOFS | __GFP_ZERO);
@@ -311,7 +324,7 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
                 goto nomem;
             m->m_info = new_m_info;
         }
-        m->m_num_mds = n;
+        m->possible_max_rank = n;
     }
     /* inc */
@@ -382,7 +395,7 @@ void ceph_mdsmap_destroy(struct ceph_mdsmap *m)
 {
     int i;
-    for (i = 0; i < m->m_num_mds; i++)
+    for (i = 0; i < m->possible_max_rank; i++)
         kfree(m->m_info[i].export_targets);
     kfree(m->m_info);
     kfree(m->m_data_pg_pools);
@@ -396,9 +409,9 @@ bool ceph_mdsmap_is_cluster_available(struct ceph_mdsmap *m)
         return false;
     if (m->m_damaged)
         return false;
-    if (m->m_num_laggy > 0)
+    if (m->m_num_laggy == m->m_num_active_mds)
         return false;
-    for (i = 0; i < m->m_num_mds; i++) {
+    for (i = 0; i < m->possible_max_rank; i++) {
         if (m->m_info[i].state == CEPH_MDS_STATE_ACTIVE)
             nr_active++;
     }
...
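The reworked ceph_mdsmap_get_random_mds() above amounts to a two-pass pick: first choose uniformly among ranks that are up and not laggy, and only if that set is empty retry while tolerating laggy ranks, returning -1 when nothing is up at all. A compact userspace illustration of that selection policy (illustrative only; this is not the kernel code or the CEPH_MDS_IS_READY macro):

/* Illustration of the "prefer non-laggy, fall back to laggy" pick (not kernel code). */
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>

struct mds { int state; bool laggy; };      /* state > 0 means "up" */

static int pick(const struct mds *m, int n, bool ignore_laggy)
{
    int candidates = 0, i, target;

    for (i = 0; i < n; i++)
        if (m[i].state > 0 && (ignore_laggy || !m[i].laggy))
            candidates++;
    if (!candidates)
        return -1;

    target = rand() % candidates;
    for (i = 0; i < n; i++) {
        if (!(m[i].state > 0 && (ignore_laggy || !m[i].laggy)))
            continue;
        if (target-- == 0)
            return i;
    }
    return -1;
}

int main(void)
{
    struct mds map[] = { { 1, true }, { 1, false }, { 0, false } };
    int mds = pick(map, 3, false);        /* first pass: non-laggy ranks only */

    if (mds < 0)
        mds = pick(map, 3, true);         /* second pass: accept laggy ranks */
    printf("chose mds%d\n", mds);         /* here always mds1 (up, not laggy) */
    return 0;
}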
@@ -107,7 +107,6 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
     return 0;
 }
-
 static int ceph_sync_fs(struct super_block *sb, int wait)
 {
     struct ceph_fs_client *fsc = ceph_sb_to_client(sb);
@@ -211,7 +210,6 @@ struct ceph_parse_opts_ctx {
 /*
  * Parse the source parameter.  Distinguish the server list from the path.
- * Internally we do not include the leading '/' in the path.
  *
  * The source will look like:
  *     <server_spec>[,<server_spec>...]:[<path>]
@@ -232,12 +230,15 @@ static int ceph_parse_source(struct fs_parameter *param, struct fs_context *fc)
     dev_name_end = strchr(dev_name, '/');
     if (dev_name_end) {
-        if (strlen(dev_name_end) > 1) {
         kfree(fsopt->server_path);
+
+        /*
+         * The server_path will include the whole chars from userland
+         * including the leading '/'.
+         */
         fsopt->server_path = kstrdup(dev_name_end, GFP_KERNEL);
         if (!fsopt->server_path)
             return -ENOMEM;
-        }
     } else {
         dev_name_end = dev_name + strlen(dev_name);
     }
@@ -461,6 +462,73 @@ static int strcmp_null(const char *s1, const char *s2)
     return strcmp(s1, s2);
 }
+/**
+ * path_remove_extra_slash - Remove the extra slashes in the server path
+ * @server_path: the server path and could be NULL
+ *
+ * Return NULL if the path is NULL or only consists of "/", or a string
+ * without any extra slashes including the leading slash(es) and the
+ * slash(es) at the end of the server path, such as:
+ * "//dir1////dir2///" --> "dir1/dir2"
+ */
+static char *path_remove_extra_slash(const char *server_path)
+{
+    const char *path = server_path;
+    const char *cur, *end;
+    char *buf, *p;
+    int len;
+
+    /* if the server path is omitted */
+    if (!path)
+        return NULL;
+
+    /* remove all the leading slashes */
+    while (*path == '/')
+        path++;
+
+    /* if the server path only consists of slashes */
+    if (*path == '\0')
+        return NULL;
+
+    len = strlen(path);
+    buf = kmalloc(len + 1, GFP_KERNEL);
+    if (!buf)
+        return ERR_PTR(-ENOMEM);
+
+    end = path + len;
+    p = buf;
+    do {
+        cur = strchr(path, '/');
+        if (!cur)
+            cur = end;
+
+        len = cur - path;
+
+        /* including one '/' */
+        if (cur != end)
+            len += 1;
+
+        memcpy(p, path, len);
+        p += len;
+
+        while (cur <= end && *cur == '/')
+            cur++;
+        path = cur;
+    } while (path < end);
+
+    *p = '\0';
+
+    /*
+     * remove the last slash if there has and just to make sure that
+     * we will get something like "dir1/dir2"
+     */
+    if (*(--p) == '/')
+        *p = '\0';
+
+    return buf;
+}
+
 static int compare_mount_options(struct ceph_mount_options *new_fsopt,
                                  struct ceph_options *new_opt,
                                  struct ceph_fs_client *fsc)
@@ -468,6 +536,7 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
     struct ceph_mount_options *fsopt1 = new_fsopt;
     struct ceph_mount_options *fsopt2 = fsc->mount_options;
     int ofs = offsetof(struct ceph_mount_options, snapdir_name);
+    char *p1, *p2;
     int ret;
     ret = memcmp(fsopt1, fsopt2, ofs);
@@ -480,9 +549,21 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
     ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace);
     if (ret)
         return ret;
-    ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
+
+    p1 = path_remove_extra_slash(fsopt1->server_path);
+    if (IS_ERR(p1))
+        return PTR_ERR(p1);
+    p2 = path_remove_extra_slash(fsopt2->server_path);
+    if (IS_ERR(p2)) {
+        kfree(p1);
+        return PTR_ERR(p2);
+    }
+    ret = strcmp_null(p1, p2);
+    kfree(p1);
+    kfree(p2);
     if (ret)
         return ret;
+
     ret = strcmp_null(fsopt1->fscache_uniq, fsopt2->fscache_uniq);
     if (ret)
         return ret;
@@ -637,6 +718,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
     fsc->sb = NULL;
     fsc->mount_state = CEPH_MOUNT_MOUNTING;
     fsc->filp_gen = 1;
+    fsc->have_copy_from2 = true;
     atomic_long_set(&fsc->writeback_count, 0);
@@ -788,7 +870,6 @@ static void destroy_caches(void)
     ceph_fscache_unregister();
 }
-
 /*
  * ceph_umount_begin - initiate forced umount.  Tear down down the
  * mount, skipping steps that may hang while waiting for server(s).
@@ -868,9 +949,6 @@ static struct dentry *open_root_dentry(struct ceph_fs_client *fsc,
     return root;
 }
-
-
-
 /*
  * mount: join the ceph cluster, and open root directory.
  */
@@ -885,7 +963,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
     mutex_lock(&fsc->client->mount_mutex);
     if (!fsc->sb->s_root) {
-        const char *path;
+        const char *path, *p;
         err = __ceph_open_session(fsc->client, started);
         if (err < 0)
             goto out;
@@ -897,17 +975,22 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
             goto out;
         }
-        if (!fsc->mount_options->server_path) {
-            path = "";
-            dout("mount opening path \\t\n");
-        } else {
-            path = fsc->mount_options->server_path + 1;
-            dout("mount opening path %s\n", path);
+        p = path_remove_extra_slash(fsc->mount_options->server_path);
+        if (IS_ERR(p)) {
+            err = PTR_ERR(p);
+            goto out;
         }
+
+        /* if the server path is omitted or just consists of '/' */
+        if (!p)
+            path = "";
+        else
+            path = p;
+        dout("mount opening path '%s'\n", path);
+
         ceph_fs_debugfs_init(fsc);
         root = open_root_dentry(fsc, path, started);
+        kfree(p);
         if (IS_ERR(root)) {
             err = PTR_ERR(root);
             goto out;
@@ -1070,6 +1153,11 @@ static int ceph_get_tree(struct fs_context *fc)
     return 0;
 out_splat:
+    if (!ceph_mdsmap_is_cluster_available(fsc->mdsc->mdsmap)) {
+        pr_info("No mds server is up or the cluster is laggy\n");
+        err = -EHOSTUNREACH;
+    }
+
     ceph_mdsc_close_sessions(fsc->mdsc);
     deactivate_locked_super(sb);
     goto out_final;
...
...@@ -106,6 +106,8 @@ struct ceph_fs_client { ...@@ -106,6 +106,8 @@ struct ceph_fs_client {
unsigned long last_auto_reconnect; unsigned long last_auto_reconnect;
bool blacklisted; bool blacklisted;
bool have_copy_from2;
u32 filp_gen; u32 filp_gen;
loff_t max_file_size; loff_t max_file_size;
......
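The new have_copy_from2 flag starts out true in create_fs_client() (see the hunk above) and records whether the OSDs accept the copy-from2 op. A minimal sketch of the caller-side pattern, assuming a hypothetical wrapper try_object_copy() and 0 for both fadvise flag arguments; this is a kernel-context fragment, and the in-tree copy_file_range code is more involved:

/*
 * Hypothetical helper, not part of the patch: issue one object copy via
 * copy-from2 and remember when the OSDs reject the new op.
 */
static int try_object_copy(struct ceph_fs_client *fsc,
			   u64 src_snapid, u64 src_version,
			   struct ceph_object_id *src_oid,
			   struct ceph_object_locator *src_oloc,
			   struct ceph_object_id *dst_oid,
			   struct ceph_object_locator *dst_oloc,
			   u32 truncate_seq, u64 truncate_size)
{
	int ret;

	if (!fsc->have_copy_from2)
		return -EOPNOTSUPP;	/* let the generic copy path run */

	ret = ceph_osdc_copy_from(&fsc->client->osdc,
				  src_snapid, src_version,
				  src_oid, src_oloc, 0,
				  dst_oid, dst_oloc, 0,
				  truncate_seq, truncate_size,
				  CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ);
	if (ret == -EOPNOTSUPP)
		fsc->have_copy_from2 = false;	/* old OSDs: disable the offload */
	return ret;
}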
...@@ -39,7 +39,6 @@ void ceph_file_layout_from_legacy(struct ceph_file_layout *fl, ...@@ -39,7 +39,6 @@ void ceph_file_layout_from_legacy(struct ceph_file_layout *fl,
fl->stripe_count == 0 && fl->object_size == 0) fl->stripe_count == 0 && fl->object_size == 0)
fl->pool_id = -1; fl->pool_id = -1;
} }
EXPORT_SYMBOL(ceph_file_layout_from_legacy);
void ceph_file_layout_to_legacy(struct ceph_file_layout *fl, void ceph_file_layout_to_legacy(struct ceph_file_layout *fl,
struct ceph_file_layout_legacy *legacy) struct ceph_file_layout_legacy *legacy)
...@@ -52,7 +51,6 @@ void ceph_file_layout_to_legacy(struct ceph_file_layout *fl, ...@@ -52,7 +51,6 @@ void ceph_file_layout_to_legacy(struct ceph_file_layout *fl,
else else
legacy->fl_pg_pool = 0; legacy->fl_pg_pool = 0;
} }
EXPORT_SYMBOL(ceph_file_layout_to_legacy);
int ceph_flags_to_mode(int flags) int ceph_flags_to_mode(int flags)
{ {
...@@ -82,7 +80,6 @@ int ceph_flags_to_mode(int flags) ...@@ -82,7 +80,6 @@ int ceph_flags_to_mode(int flags)
return mode; return mode;
} }
EXPORT_SYMBOL(ceph_flags_to_mode);
int ceph_caps_for_mode(int mode) int ceph_caps_for_mode(int mode)
{ {
...@@ -101,4 +98,3 @@ int ceph_caps_for_mode(int mode) ...@@ -101,4 +98,3 @@ int ceph_caps_for_mode(int mode)
return caps; return caps;
} }
EXPORT_SYMBOL(ceph_caps_for_mode);
...@@ -655,7 +655,7 @@ static int __build_xattrs(struct inode *inode) ...@@ -655,7 +655,7 @@ static int __build_xattrs(struct inode *inode)
u32 len; u32 len;
const char *name, *val; const char *name, *val;
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
int xattr_version; u64 xattr_version;
struct ceph_inode_xattr **xattrs = NULL; struct ceph_inode_xattr **xattrs = NULL;
int err = 0; int err = 0;
int i; int i;
...@@ -851,7 +851,7 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, ...@@ -851,7 +851,7 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
req_mask = __get_request_mask(inode); req_mask = __get_request_mask(inode);
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
dout("getxattr %p ver=%lld index_ver=%lld\n", inode, dout("getxattr %p name '%s' ver=%lld index_ver=%lld\n", inode, name,
ci->i_xattrs.version, ci->i_xattrs.index_version); ci->i_xattrs.version, ci->i_xattrs.index_version);
if (ci->i_xattrs.version == 0 || if (ci->i_xattrs.version == 0 ||
...@@ -1078,7 +1078,8 @@ int __ceph_setxattr(struct inode *inode, const char *name, ...@@ -1078,7 +1078,8 @@ int __ceph_setxattr(struct inode *inode, const char *name,
} }
} }
dout("setxattr %p issued %s\n", inode, ceph_cap_string(issued)); dout("setxattr %p name '%s' issued %s\n", inode, name,
ceph_cap_string(issued));
__build_xattrs(inode); __build_xattrs(inode);
required_blob_size = __get_required_blob_size(ci, name_len, val_len); required_blob_size = __get_required_blob_size(ci, name_len, val_len);
......
...@@ -25,8 +25,9 @@ struct ceph_mdsmap { ...@@ -25,8 +25,9 @@ struct ceph_mdsmap {
u32 m_session_timeout; /* seconds */ u32 m_session_timeout; /* seconds */
u32 m_session_autoclose; /* seconds */ u32 m_session_autoclose; /* seconds */
u64 m_max_file_size; u64 m_max_file_size;
u32 m_max_mds; /* size of m_addr, m_state arrays */ u32 m_max_mds; /* expected up:active mds number */
int m_num_mds; u32 m_num_active_mds; /* actual up:active mds number */
u32 possible_max_rank; /* possible max rank index */
struct ceph_mds_info *m_info; struct ceph_mds_info *m_info;
/* which object pools file data can be stored in */ /* which object pools file data can be stored in */
...@@ -42,7 +43,7 @@ struct ceph_mdsmap { ...@@ -42,7 +43,7 @@ struct ceph_mdsmap {
static inline struct ceph_entity_addr * static inline struct ceph_entity_addr *
ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
{ {
if (w >= m->m_num_mds) if (w >= m->possible_max_rank)
return NULL; return NULL;
return &m->m_info[w].addr; return &m->m_info[w].addr;
} }
...@@ -50,14 +51,14 @@ ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w) ...@@ -50,14 +51,14 @@ ceph_mdsmap_get_addr(struct ceph_mdsmap *m, int w)
static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w) static inline int ceph_mdsmap_get_state(struct ceph_mdsmap *m, int w)
{ {
BUG_ON(w < 0); BUG_ON(w < 0);
if (w >= m->m_num_mds) if (w >= m->possible_max_rank)
return CEPH_MDS_STATE_DNE; return CEPH_MDS_STATE_DNE;
return m->m_info[w].state; return m->m_info[w].state;
} }
static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w) static inline bool ceph_mdsmap_is_laggy(struct ceph_mdsmap *m, int w)
{ {
if (w >= 0 && w < m->m_num_mds) if (w >= 0 && w < m->possible_max_rank)
return m->m_info[w].laggy; return m->m_info[w].laggy;
return false; return false;
} }
......
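After the rename, possible_max_rank (not the old m_num_mds) bounds indexing into m_info, while m_max_mds and m_num_active_mds only count the expected and actual up:active daemons. A toy illustration of a bounded rank walk using the accessors above; pick_any_active() is a made-up name, and the real MDS selection logic in the MDS client is more involved:

/* Hypothetical: return the first rank that is up:active and not laggy. */
static int pick_any_active(struct ceph_mdsmap *m)
{
	int i;

	for (i = 0; i < m->possible_max_rank; i++) {
		if (ceph_mdsmap_get_state(m, i) == CEPH_MDS_STATE_ACTIVE &&
		    !ceph_mdsmap_is_laggy(m, i))
			return i;
	}
	return -1;
}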
...@@ -534,6 +534,7 @@ int ceph_osdc_copy_from(struct ceph_osd_client *osdc, ...@@ -534,6 +534,7 @@ int ceph_osdc_copy_from(struct ceph_osd_client *osdc,
struct ceph_object_id *dst_oid, struct ceph_object_id *dst_oid,
struct ceph_object_locator *dst_oloc, struct ceph_object_locator *dst_oloc,
u32 dst_fadvise_flags, u32 dst_fadvise_flags,
u32 truncate_seq, u64 truncate_size,
u8 copy_from_flags); u8 copy_from_flags);
/* watch/notify */ /* watch/notify */
......
...@@ -256,6 +256,7 @@ extern const char *ceph_osd_state_name(int s); ...@@ -256,6 +256,7 @@ extern const char *ceph_osd_state_name(int s);
\ \
/* tiering */ \ /* tiering */ \
f(COPY_FROM, __CEPH_OSD_OP(WR, DATA, 26), "copy-from") \ f(COPY_FROM, __CEPH_OSD_OP(WR, DATA, 26), "copy-from") \
f(COPY_FROM2, __CEPH_OSD_OP(WR, DATA, 45), "copy-from2") \
f(COPY_GET_CLASSIC, __CEPH_OSD_OP(RD, DATA, 27), "copy-get-classic") \ f(COPY_GET_CLASSIC, __CEPH_OSD_OP(RD, DATA, 27), "copy-get-classic") \
f(UNDIRTY, __CEPH_OSD_OP(WR, DATA, 28), "undirty") \ f(UNDIRTY, __CEPH_OSD_OP(WR, DATA, 28), "undirty") \
f(ISDIRTY, __CEPH_OSD_OP(RD, DATA, 29), "isdirty") \ f(ISDIRTY, __CEPH_OSD_OP(RD, DATA, 29), "isdirty") \
...@@ -446,6 +447,7 @@ enum { ...@@ -446,6 +447,7 @@ enum {
CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE = 8, /* map snap direct to CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE = 8, /* map snap direct to
* cloneid */ * cloneid */
CEPH_OSD_COPY_FROM_FLAG_RWORDERED = 16, /* order with write */ CEPH_OSD_COPY_FROM_FLAG_RWORDERED = 16, /* order with write */
CEPH_OSD_COPY_FROM_FLAG_TRUNCATE_SEQ = 32, /* send truncate_{seq,size} */
}; };
enum { enum {
......
...@@ -13,5 +13,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \ ...@@ -13,5 +13,5 @@ libceph-y := ceph_common.o messenger.o msgpool.o buffer.o pagelist.o \
auth.o auth_none.o \ auth.o auth_none.o \
crypto.o armor.o \ crypto.o armor.o \
auth_x.o \ auth_x.o \
ceph_fs.o ceph_strings.o ceph_hash.o \ ceph_strings.o ceph_hash.o \
pagevec.o snapshot.o string_table.o pagevec.o snapshot.o string_table.o
...@@ -402,7 +402,7 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req, ...@@ -402,7 +402,7 @@ static void osd_req_op_data_release(struct ceph_osd_request *osd_req,
case CEPH_OSD_OP_LIST_WATCHERS: case CEPH_OSD_OP_LIST_WATCHERS:
ceph_osd_data_release(&op->list_watchers.response_data); ceph_osd_data_release(&op->list_watchers.response_data);
break; break;
case CEPH_OSD_OP_COPY_FROM: case CEPH_OSD_OP_COPY_FROM2:
ceph_osd_data_release(&op->copy_from.osd_data); ceph_osd_data_release(&op->copy_from.osd_data);
break; break;
default: default:
...@@ -697,7 +697,7 @@ static void get_num_data_items(struct ceph_osd_request *req, ...@@ -697,7 +697,7 @@ static void get_num_data_items(struct ceph_osd_request *req,
case CEPH_OSD_OP_SETXATTR: case CEPH_OSD_OP_SETXATTR:
case CEPH_OSD_OP_CMPXATTR: case CEPH_OSD_OP_CMPXATTR:
case CEPH_OSD_OP_NOTIFY_ACK: case CEPH_OSD_OP_NOTIFY_ACK:
case CEPH_OSD_OP_COPY_FROM: case CEPH_OSD_OP_COPY_FROM2:
*num_request_data_items += 1; *num_request_data_items += 1;
break; break;
...@@ -1029,7 +1029,7 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst, ...@@ -1029,7 +1029,7 @@ static u32 osd_req_encode_op(struct ceph_osd_op *dst,
case CEPH_OSD_OP_CREATE: case CEPH_OSD_OP_CREATE:
case CEPH_OSD_OP_DELETE: case CEPH_OSD_OP_DELETE:
break; break;
case CEPH_OSD_OP_COPY_FROM: case CEPH_OSD_OP_COPY_FROM2:
dst->copy_from.snapid = cpu_to_le64(src->copy_from.snapid); dst->copy_from.snapid = cpu_to_le64(src->copy_from.snapid);
dst->copy_from.src_version = dst->copy_from.src_version =
cpu_to_le64(src->copy_from.src_version); cpu_to_le64(src->copy_from.src_version);
...@@ -1966,7 +1966,7 @@ static void setup_request_data(struct ceph_osd_request *req) ...@@ -1966,7 +1966,7 @@ static void setup_request_data(struct ceph_osd_request *req)
ceph_osdc_msg_data_add(request_msg, ceph_osdc_msg_data_add(request_msg,
&op->notify_ack.request_data); &op->notify_ack.request_data);
break; break;
case CEPH_OSD_OP_COPY_FROM: case CEPH_OSD_OP_COPY_FROM2:
ceph_osdc_msg_data_add(request_msg, ceph_osdc_msg_data_add(request_msg,
&op->copy_from.osd_data); &op->copy_from.osd_data);
break; break;
...@@ -5315,6 +5315,7 @@ static int osd_req_op_copy_from_init(struct ceph_osd_request *req, ...@@ -5315,6 +5315,7 @@ static int osd_req_op_copy_from_init(struct ceph_osd_request *req,
struct ceph_object_locator *src_oloc, struct ceph_object_locator *src_oloc,
u32 src_fadvise_flags, u32 src_fadvise_flags,
u32 dst_fadvise_flags, u32 dst_fadvise_flags,
u32 truncate_seq, u64 truncate_size,
u8 copy_from_flags) u8 copy_from_flags)
{ {
struct ceph_osd_req_op *op; struct ceph_osd_req_op *op;
...@@ -5325,7 +5326,8 @@ static int osd_req_op_copy_from_init(struct ceph_osd_request *req, ...@@ -5325,7 +5326,8 @@ static int osd_req_op_copy_from_init(struct ceph_osd_request *req,
if (IS_ERR(pages)) if (IS_ERR(pages))
return PTR_ERR(pages); return PTR_ERR(pages);
op = _osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM, dst_fadvise_flags); op = _osd_req_op_init(req, 0, CEPH_OSD_OP_COPY_FROM2,
dst_fadvise_flags);
op->copy_from.snapid = src_snapid; op->copy_from.snapid = src_snapid;
op->copy_from.src_version = src_version; op->copy_from.src_version = src_version;
op->copy_from.flags = copy_from_flags; op->copy_from.flags = copy_from_flags;
...@@ -5335,6 +5337,8 @@ static int osd_req_op_copy_from_init(struct ceph_osd_request *req, ...@@ -5335,6 +5337,8 @@ static int osd_req_op_copy_from_init(struct ceph_osd_request *req,
end = p + PAGE_SIZE; end = p + PAGE_SIZE;
ceph_encode_string(&p, end, src_oid->name, src_oid->name_len); ceph_encode_string(&p, end, src_oid->name, src_oid->name_len);
encode_oloc(&p, end, src_oloc); encode_oloc(&p, end, src_oloc);
ceph_encode_32(&p, truncate_seq);
ceph_encode_64(&p, truncate_size);
op->indata_len = PAGE_SIZE - (end - p); op->indata_len = PAGE_SIZE - (end - p);
ceph_osd_data_pages_init(&op->copy_from.osd_data, pages, ceph_osd_data_pages_init(&op->copy_from.osd_data, pages,
...@@ -5350,6 +5354,7 @@ int ceph_osdc_copy_from(struct ceph_osd_client *osdc, ...@@ -5350,6 +5354,7 @@ int ceph_osdc_copy_from(struct ceph_osd_client *osdc,
struct ceph_object_id *dst_oid, struct ceph_object_id *dst_oid,
struct ceph_object_locator *dst_oloc, struct ceph_object_locator *dst_oloc,
u32 dst_fadvise_flags, u32 dst_fadvise_flags,
u32 truncate_seq, u64 truncate_size,
u8 copy_from_flags) u8 copy_from_flags)
{ {
struct ceph_osd_request *req; struct ceph_osd_request *req;
...@@ -5366,7 +5371,8 @@ int ceph_osdc_copy_from(struct ceph_osd_client *osdc, ...@@ -5366,7 +5371,8 @@ int ceph_osdc_copy_from(struct ceph_osd_client *osdc,
ret = osd_req_op_copy_from_init(req, src_snapid, src_version, src_oid, ret = osd_req_op_copy_from_init(req, src_snapid, src_version, src_oid,
src_oloc, src_fadvise_flags, src_oloc, src_fadvise_flags,
dst_fadvise_flags, copy_from_flags); dst_fadvise_flags, truncate_seq,
truncate_size, copy_from_flags);
if (ret) if (ret)
goto out; goto out;
......
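Taken together, the osd_client changes above route the copy offload through the copy-from2 op and append the truncate_seq/truncate_size pair to the op payload right after the encoded source oid and oloc (the two new ceph_encode_* calls in osd_req_op_copy_from_init()), i.e. 12 extra bytes per request. A small userspace sketch of the little-endian, cursor-advancing encoding those helpers perform, with made-up values:

#include <stdint.h>
#include <stdio.h>

/* Stand-ins for ceph_encode_32()/ceph_encode_64(): little-endian, and the
 * cursor advances past what was written. */
static void put_le32(unsigned char **p, uint32_t v)
{
	for (int i = 0; i < 4; i++)
		*(*p)++ = (v >> (8 * i)) & 0xff;
}

static void put_le64(unsigned char **p, uint64_t v)
{
	for (int i = 0; i < 8; i++)
		*(*p)++ = (v >> (8 * i)) & 0xff;
}

int main(void)
{
	unsigned char buf[12], *p = buf;
	uint32_t truncate_seq = 3;		/* made-up values */
	uint64_t truncate_size = 4 << 20;

	put_le32(&p, truncate_seq);
	put_le64(&p, truncate_size);
	printf("copy-from2 payload tail: %zu bytes\n", (size_t)(p - buf));
	return 0;
}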