Commit 1cf0209c authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull Ceph updates from Sage Weil:
 "A few groups of patches here.  Alex has been hard at work improving
  the RBD code, laying groundwork for understanding the new formats and
  doing layering.  Most of the infrastructure is now in place for the
  final bits that will come with the next window.

  There are a few changes to the data layout.  Jim Schutt's patch fixes
  some non-ideal CRUSH behavior, and a set of patches from me updates
  the client to speak a newer version of the protocol and implement an
  improved hashing strategy across storage nodes (when the server side
  supports it too).

  A pair of patches from Sam Lang fix the atomicity of open+create
  operations.  Several patches from Yan, Zheng fix various mds/client
  issues that turned up during multi-mds torture tests.

  A final set of patches exposes file layouts via virtual xattrs, and
  allows the policies to be set on directories via xattrs as well
  (avoiding the awkward ioctl interface and providing a consistent
  interface for both kernel mount and ceph-fuse users)."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (143 commits)
  libceph: add support for HASHPSPOOL pool flag
  libceph: update osd request/reply encoding
  libceph: calculate placement based on the internal data types
  ceph: update support for PGID64, PGPOOL3, OSDENC protocol features
  ceph: update "ceph_features.h"
  libceph: decode into cpu-native ceph_pg type
  libceph: rename ceph_pg -> ceph_pg_v1
  rbd: pass length, not op for osd completions
  rbd: move rbd_osd_trivial_callback()
  libceph: use a do..while loop in con_work()
  libceph: use a flag to indicate a fault has occurred
  libceph: separate non-locked fault handling
  libceph: encapsulate connection backoff
  libceph: eliminate sparse warnings
  ceph: eliminate sparse warnings in fs code
  rbd: eliminate sparse warnings
  libceph: define connection flag helpers
  rbd: normalize dout() calls
  rbd: barriers are hard
  rbd: ignore zero-length requests
  ...
parents de1a2262 83ca14fd
This diff is collapsed.
...@@ -236,16 +236,10 @@ static int ceph_readpage(struct file *filp, struct page *page) ...@@ -236,16 +236,10 @@ static int ceph_readpage(struct file *filp, struct page *page)
static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
{ {
struct inode *inode = req->r_inode; struct inode *inode = req->r_inode;
struct ceph_osd_reply_head *replyhead; int rc = req->r_result;
int rc, bytes; int bytes = le32_to_cpu(msg->hdr.data_len);
int i; int i;
/* parse reply */
replyhead = msg->front.iov_base;
WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
rc = le32_to_cpu(replyhead->result);
bytes = le32_to_cpu(msg->hdr.data_len);
dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes); dout("finish_read %p req %p rc %d bytes %d\n", inode, req, rc, bytes);
/* unlock all pages, zeroing any data we didn't read */ /* unlock all pages, zeroing any data we didn't read */
...@@ -315,7 +309,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max) ...@@ -315,7 +309,7 @@ static int start_read(struct inode *inode, struct list_head *page_list, int max)
CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ, CEPH_OSD_OP_READ, CEPH_OSD_FLAG_READ,
NULL, 0, NULL, 0,
ci->i_truncate_seq, ci->i_truncate_size, ci->i_truncate_seq, ci->i_truncate_size,
NULL, false, 1, 0); NULL, false, 0);
if (IS_ERR(req)) if (IS_ERR(req))
return PTR_ERR(req); return PTR_ERR(req);
...@@ -492,8 +486,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) ...@@ -492,8 +486,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
&ci->i_layout, snapc, &ci->i_layout, snapc,
page_off, len, page_off, len,
ci->i_truncate_seq, ci->i_truncate_size, ci->i_truncate_seq, ci->i_truncate_size,
&inode->i_mtime, &inode->i_mtime, &page, 1);
&page, 1, 0, 0, true);
if (err < 0) { if (err < 0) {
dout("writepage setting page/mapping error %d %p\n", err, page); dout("writepage setting page/mapping error %d %p\n", err, page);
SetPageError(page); SetPageError(page);
...@@ -554,27 +547,18 @@ static void writepages_finish(struct ceph_osd_request *req, ...@@ -554,27 +547,18 @@ static void writepages_finish(struct ceph_osd_request *req,
struct ceph_msg *msg) struct ceph_msg *msg)
{ {
struct inode *inode = req->r_inode; struct inode *inode = req->r_inode;
struct ceph_osd_reply_head *replyhead;
struct ceph_osd_op *op;
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
unsigned wrote; unsigned wrote;
struct page *page; struct page *page;
int i; int i;
struct ceph_snap_context *snapc = req->r_snapc; struct ceph_snap_context *snapc = req->r_snapc;
struct address_space *mapping = inode->i_mapping; struct address_space *mapping = inode->i_mapping;
__s32 rc = -EIO; int rc = req->r_result;
u64 bytes = 0; u64 bytes = le64_to_cpu(req->r_request_ops[0].extent.length);
struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
long writeback_stat; long writeback_stat;
unsigned issued = ceph_caps_issued(ci); unsigned issued = ceph_caps_issued(ci);
/* parse reply */
replyhead = msg->front.iov_base;
WARN_ON(le32_to_cpu(replyhead->num_ops) == 0);
op = (void *)(replyhead + 1);
rc = le32_to_cpu(replyhead->result);
bytes = le64_to_cpu(op->extent.length);
if (rc >= 0) { if (rc >= 0) {
/* /*
* Assume we wrote the pages we originally sent. The * Assume we wrote the pages we originally sent. The
...@@ -741,8 +725,6 @@ static int ceph_writepages_start(struct address_space *mapping, ...@@ -741,8 +725,6 @@ static int ceph_writepages_start(struct address_space *mapping,
struct page *page; struct page *page;
int want; int want;
u64 offset, len; u64 offset, len;
struct ceph_osd_request_head *reqhead;
struct ceph_osd_op *op;
long writeback_stat; long writeback_stat;
next = 0; next = 0;
...@@ -838,7 +820,7 @@ static int ceph_writepages_start(struct address_space *mapping, ...@@ -838,7 +820,7 @@ static int ceph_writepages_start(struct address_space *mapping,
snapc, do_sync, snapc, do_sync,
ci->i_truncate_seq, ci->i_truncate_seq,
ci->i_truncate_size, ci->i_truncate_size,
&inode->i_mtime, true, 1, 0); &inode->i_mtime, true, 0);
if (IS_ERR(req)) { if (IS_ERR(req)) {
rc = PTR_ERR(req); rc = PTR_ERR(req);
...@@ -906,10 +888,8 @@ static int ceph_writepages_start(struct address_space *mapping, ...@@ -906,10 +888,8 @@ static int ceph_writepages_start(struct address_space *mapping,
/* revise final length, page count */ /* revise final length, page count */
req->r_num_pages = locked_pages; req->r_num_pages = locked_pages;
reqhead = req->r_request->front.iov_base; req->r_request_ops[0].extent.length = cpu_to_le64(len);
op = (void *)(reqhead + 1); req->r_request_ops[0].payload_len = cpu_to_le32(len);
op->extent.length = cpu_to_le64(len);
op->payload_len = cpu_to_le32(len);
req->r_request->hdr.data_len = cpu_to_le32(len); req->r_request->hdr.data_len = cpu_to_le32(len);
rc = ceph_osdc_start_request(&fsc->client->osdc, req, true); rc = ceph_osdc_start_request(&fsc->client->osdc, req, true);
......
...@@ -611,8 +611,16 @@ int ceph_add_cap(struct inode *inode, ...@@ -611,8 +611,16 @@ int ceph_add_cap(struct inode *inode,
if (flags & CEPH_CAP_FLAG_AUTH) if (flags & CEPH_CAP_FLAG_AUTH)
ci->i_auth_cap = cap; ci->i_auth_cap = cap;
else if (ci->i_auth_cap == cap) else if (ci->i_auth_cap == cap) {
ci->i_auth_cap = NULL; ci->i_auth_cap = NULL;
spin_lock(&mdsc->cap_dirty_lock);
if (!list_empty(&ci->i_dirty_item)) {
dout(" moving %p to cap_dirty_migrating\n", inode);
list_move(&ci->i_dirty_item,
&mdsc->cap_dirty_migrating);
}
spin_unlock(&mdsc->cap_dirty_lock);
}
dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n", dout("add_cap inode %p (%llx.%llx) cap %p %s now %s seq %d mds%d\n",
inode, ceph_vinop(inode), cap, ceph_cap_string(issued), inode, ceph_vinop(inode), cap, ceph_cap_string(issued),
...@@ -1460,7 +1468,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, ...@@ -1460,7 +1468,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_client *mdsc = fsc->mdsc;
struct inode *inode = &ci->vfs_inode; struct inode *inode = &ci->vfs_inode;
struct ceph_cap *cap; struct ceph_cap *cap;
int file_wanted, used; int file_wanted, used, cap_used;
int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */ int took_snap_rwsem = 0; /* true if mdsc->snap_rwsem held */
int issued, implemented, want, retain, revoking, flushing = 0; int issued, implemented, want, retain, revoking, flushing = 0;
int mds = -1; /* keep track of how far we've gone through i_caps list int mds = -1; /* keep track of how far we've gone through i_caps list
...@@ -1563,9 +1571,14 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, ...@@ -1563,9 +1571,14 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
/* NOTE: no side-effects allowed, until we take s_mutex */ /* NOTE: no side-effects allowed, until we take s_mutex */
cap_used = used;
if (ci->i_auth_cap && cap != ci->i_auth_cap)
cap_used &= ~ci->i_auth_cap->issued;
revoking = cap->implemented & ~cap->issued; revoking = cap->implemented & ~cap->issued;
dout(" mds%d cap %p issued %s implemented %s revoking %s\n", dout(" mds%d cap %p used %s issued %s implemented %s revoking %s\n",
cap->mds, cap, ceph_cap_string(cap->issued), cap->mds, cap, ceph_cap_string(cap->issued),
ceph_cap_string(cap_used),
ceph_cap_string(cap->implemented), ceph_cap_string(cap->implemented),
ceph_cap_string(revoking)); ceph_cap_string(revoking));
...@@ -1593,7 +1606,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, ...@@ -1593,7 +1606,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
} }
/* completed revocation? going down and there are no caps? */ /* completed revocation? going down and there are no caps? */
if (revoking && (revoking & used) == 0) { if (revoking && (revoking & cap_used) == 0) {
dout("completed revocation of %s\n", dout("completed revocation of %s\n",
ceph_cap_string(cap->implemented & ~cap->issued)); ceph_cap_string(cap->implemented & ~cap->issued));
goto ack; goto ack;
...@@ -1670,8 +1683,8 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, ...@@ -1670,8 +1683,8 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
sent++; sent++;
/* __send_cap drops i_ceph_lock */ /* __send_cap drops i_ceph_lock */
delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, used, want, delayed += __send_cap(mdsc, cap, CEPH_CAP_OP_UPDATE, cap_used,
retain, flushing, NULL); want, retain, flushing, NULL);
goto retry; /* retake i_ceph_lock and restart our cap scan. */ goto retry; /* retake i_ceph_lock and restart our cap scan. */
} }
...@@ -2417,7 +2430,9 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant, ...@@ -2417,7 +2430,9 @@ static void handle_cap_grant(struct inode *inode, struct ceph_mds_caps *grant,
dout("mds wanted %s -> %s\n", dout("mds wanted %s -> %s\n",
ceph_cap_string(le32_to_cpu(grant->wanted)), ceph_cap_string(le32_to_cpu(grant->wanted)),
ceph_cap_string(wanted)); ceph_cap_string(wanted));
grant->wanted = cpu_to_le32(wanted); /* imported cap may not have correct mds_wanted */
if (le32_to_cpu(grant->op) == CEPH_CAP_OP_IMPORT)
check_caps = 1;
} }
cap->seq = seq; cap->seq = seq;
...@@ -2821,6 +2836,9 @@ void ceph_handle_caps(struct ceph_mds_session *session, ...@@ -2821,6 +2836,9 @@ void ceph_handle_caps(struct ceph_mds_session *session,
dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq, dout(" mds%d seq %lld cap seq %u\n", session->s_mds, session->s_seq,
(unsigned)seq); (unsigned)seq);
if (op == CEPH_CAP_OP_IMPORT)
ceph_add_cap_releases(mdsc, session);
/* lookup ino */ /* lookup ino */
inode = ceph_find_inode(sb, vino); inode = ceph_find_inode(sb, vino);
ci = ceph_inode(inode); ci = ceph_inode(inode);
......
...@@ -243,6 +243,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, ...@@ -243,6 +243,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
err = ceph_mdsc_do_request(mdsc, err = ceph_mdsc_do_request(mdsc,
(flags & (O_CREAT|O_TRUNC)) ? dir : NULL, (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
req); req);
if (err)
goto out_err;
err = ceph_handle_snapdir(req, dentry, err); err = ceph_handle_snapdir(req, dentry, err);
if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry) if (err == 0 && (flags & O_CREAT) && !req->r_reply_info.head->is_dentry)
err = ceph_handle_notrace_create(dir, dentry); err = ceph_handle_notrace_create(dir, dentry);
...@@ -263,6 +266,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, ...@@ -263,6 +266,9 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
err = finish_no_open(file, dn); err = finish_no_open(file, dn);
} else { } else {
dout("atomic_open finish_open on dn %p\n", dn); dout("atomic_open finish_open on dn %p\n", dn);
if (req->r_op == CEPH_MDS_OP_CREATE && req->r_reply_info.has_create_ino) {
*opened |= FILE_CREATED;
}
err = finish_open(file, dentry, ceph_open, opened); err = finish_open(file, dentry, ceph_open, opened);
} }
...@@ -535,7 +541,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data, ...@@ -535,7 +541,7 @@ static ssize_t ceph_sync_write(struct file *file, const char __user *data,
ci->i_snap_realm->cached_context, ci->i_snap_realm->cached_context,
do_sync, do_sync,
ci->i_truncate_seq, ci->i_truncate_size, ci->i_truncate_seq, ci->i_truncate_size,
&mtime, false, 2, page_align); &mtime, false, page_align);
if (IS_ERR(req)) if (IS_ERR(req))
return PTR_ERR(req); return PTR_ERR(req);
......
...@@ -185,7 +185,6 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) ...@@ -185,7 +185,6 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
&ceph_sb_to_client(inode->i_sb)->client->osdc; &ceph_sb_to_client(inode->i_sb)->client->osdc;
u64 len = 1, olen; u64 len = 1, olen;
u64 tmp; u64 tmp;
struct ceph_object_layout ol;
struct ceph_pg pgid; struct ceph_pg pgid;
int r; int r;
...@@ -194,7 +193,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) ...@@ -194,7 +193,7 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
return -EFAULT; return -EFAULT;
down_read(&osdc->map_sem); down_read(&osdc->map_sem);
r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, &len, r = ceph_calc_file_object_mapping(&ci->i_layout, dl.file_offset, len,
&dl.object_no, &dl.object_offset, &dl.object_no, &dl.object_offset,
&olen); &olen);
if (r < 0) if (r < 0)
...@@ -209,10 +208,9 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg) ...@@ -209,10 +208,9 @@ static long ceph_ioctl_get_dataloc(struct file *file, void __user *arg)
snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx", snprintf(dl.object_name, sizeof(dl.object_name), "%llx.%08llx",
ceph_ino(inode), dl.object_no); ceph_ino(inode), dl.object_no);
ceph_calc_object_layout(&ol, dl.object_name, &ci->i_layout, ceph_calc_object_layout(&pgid, dl.object_name, &ci->i_layout,
osdc->osdmap); osdc->osdmap);
pgid = ol.ol_pgid;
dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid); dl.osd = ceph_calc_pg_primary(osdc->osdmap, pgid);
if (dl.osd >= 0) { if (dl.osd >= 0) {
struct ceph_entity_addr *a = struct ceph_entity_addr *a =
......
...@@ -232,6 +232,30 @@ static int parse_reply_info_filelock(void **p, void *end, ...@@ -232,6 +232,30 @@ static int parse_reply_info_filelock(void **p, void *end,
return -EIO; return -EIO;
} }
/*
* parse create results
*/
static int parse_reply_info_create(void **p, void *end,
struct ceph_mds_reply_info_parsed *info,
int features)
{
if (features & CEPH_FEATURE_REPLY_CREATE_INODE) {
if (*p == end) {
info->has_create_ino = false;
} else {
info->has_create_ino = true;
info->ino = ceph_decode_64(p);
}
}
if (unlikely(*p != end))
goto bad;
return 0;
bad:
return -EIO;
}
/* /*
* parse extra results * parse extra results
*/ */
...@@ -241,8 +265,12 @@ static int parse_reply_info_extra(void **p, void *end, ...@@ -241,8 +265,12 @@ static int parse_reply_info_extra(void **p, void *end,
{ {
if (info->head->op == CEPH_MDS_OP_GETFILELOCK) if (info->head->op == CEPH_MDS_OP_GETFILELOCK)
return parse_reply_info_filelock(p, end, info, features); return parse_reply_info_filelock(p, end, info, features);
else else if (info->head->op == CEPH_MDS_OP_READDIR)
return parse_reply_info_dir(p, end, info, features); return parse_reply_info_dir(p, end, info, features);
else if (info->head->op == CEPH_MDS_OP_CREATE)
return parse_reply_info_create(p, end, info, features);
else
return -EIO;
} }
/* /*
...@@ -2170,7 +2198,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) ...@@ -2170,7 +2198,8 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
mutex_lock(&req->r_fill_mutex); mutex_lock(&req->r_fill_mutex);
err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
if (err == 0) { if (err == 0) {
if (result == 0 && req->r_op != CEPH_MDS_OP_GETFILELOCK && if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
req->r_op == CEPH_MDS_OP_LSSNAP) &&
rinfo->dir_nr) rinfo->dir_nr)
ceph_readdir_prepopulate(req, req->r_session); ceph_readdir_prepopulate(req, req->r_session);
ceph_unreserve_caps(mdsc, &req->r_caps_reservation); ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
......
...@@ -74,6 +74,12 @@ struct ceph_mds_reply_info_parsed { ...@@ -74,6 +74,12 @@ struct ceph_mds_reply_info_parsed {
struct ceph_mds_reply_info_in *dir_in; struct ceph_mds_reply_info_in *dir_in;
u8 dir_complete, dir_end; u8 dir_complete, dir_end;
}; };
/* for create results */
struct {
bool has_create_ino;
u64 ino;
};
}; };
/* encoded blob describing snapshot contexts for certain /* encoded blob describing snapshot contexts for certain
......
...@@ -59,6 +59,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) ...@@ -59,6 +59,10 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
ceph_decode_16_safe(p, end, version, bad); ceph_decode_16_safe(p, end, version, bad);
if (version > 3) {
pr_warning("got mdsmap version %d > 3, failing", version);
goto bad;
}
ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad); ceph_decode_need(p, end, 8*sizeof(u32) + sizeof(u64), bad);
m->m_epoch = ceph_decode_32(p); m->m_epoch = ceph_decode_32(p);
...@@ -144,13 +148,13 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end) ...@@ -144,13 +148,13 @@ struct ceph_mdsmap *ceph_mdsmap_decode(void **p, void *end)
/* pg_pools */ /* pg_pools */
ceph_decode_32_safe(p, end, n, bad); ceph_decode_32_safe(p, end, n, bad);
m->m_num_data_pg_pools = n; m->m_num_data_pg_pools = n;
m->m_data_pg_pools = kcalloc(n, sizeof(u32), GFP_NOFS); m->m_data_pg_pools = kcalloc(n, sizeof(u64), GFP_NOFS);
if (!m->m_data_pg_pools) if (!m->m_data_pg_pools)
goto badmem; goto badmem;
ceph_decode_need(p, end, sizeof(u32)*(n+1), bad); ceph_decode_need(p, end, sizeof(u64)*(n+1), bad);
for (i = 0; i < n; i++) for (i = 0; i < n; i++)
m->m_data_pg_pools[i] = ceph_decode_32(p); m->m_data_pg_pools[i] = ceph_decode_64(p);
m->m_cas_pg_pool = ceph_decode_32(p); m->m_cas_pg_pool = ceph_decode_64(p);
/* ok, we don't care about the rest. */ /* ok, we don't care about the rest. */
dout("mdsmap_decode success epoch %u\n", m->m_epoch); dout("mdsmap_decode success epoch %u\n", m->m_epoch);
......
...@@ -15,6 +15,7 @@ const char *ceph_mds_state_name(int s) ...@@ -15,6 +15,7 @@ const char *ceph_mds_state_name(int s)
case CEPH_MDS_STATE_BOOT: return "up:boot"; case CEPH_MDS_STATE_BOOT: return "up:boot";
case CEPH_MDS_STATE_STANDBY: return "up:standby"; case CEPH_MDS_STATE_STANDBY: return "up:standby";
case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay"; case CEPH_MDS_STATE_STANDBY_REPLAY: return "up:standby-replay";
case CEPH_MDS_STATE_REPLAYONCE: return "up:oneshot-replay";
case CEPH_MDS_STATE_CREATING: return "up:creating"; case CEPH_MDS_STATE_CREATING: return "up:creating";
case CEPH_MDS_STATE_STARTING: return "up:starting"; case CEPH_MDS_STATE_STARTING: return "up:starting";
/* up and in */ /* up and in */
...@@ -50,10 +51,13 @@ const char *ceph_mds_op_name(int op) ...@@ -50,10 +51,13 @@ const char *ceph_mds_op_name(int op)
case CEPH_MDS_OP_LOOKUP: return "lookup"; case CEPH_MDS_OP_LOOKUP: return "lookup";
case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash"; case CEPH_MDS_OP_LOOKUPHASH: return "lookuphash";
case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent"; case CEPH_MDS_OP_LOOKUPPARENT: return "lookupparent";
case CEPH_MDS_OP_LOOKUPINO: return "lookupino";
case CEPH_MDS_OP_GETATTR: return "getattr"; case CEPH_MDS_OP_GETATTR: return "getattr";
case CEPH_MDS_OP_SETXATTR: return "setxattr"; case CEPH_MDS_OP_SETXATTR: return "setxattr";
case CEPH_MDS_OP_SETATTR: return "setattr"; case CEPH_MDS_OP_SETATTR: return "setattr";
case CEPH_MDS_OP_RMXATTR: return "rmxattr"; case CEPH_MDS_OP_RMXATTR: return "rmxattr";
case CEPH_MDS_OP_SETLAYOUT: return "setlayou";
case CEPH_MDS_OP_SETDIRLAYOUT: return "setdirlayout";
case CEPH_MDS_OP_READDIR: return "readdir"; case CEPH_MDS_OP_READDIR: return "readdir";
case CEPH_MDS_OP_MKNOD: return "mknod"; case CEPH_MDS_OP_MKNOD: return "mknod";
case CEPH_MDS_OP_LINK: return "link"; case CEPH_MDS_OP_LINK: return "link";
......
...@@ -71,8 +71,14 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) ...@@ -71,8 +71,14 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
/* /*
* express utilization in terms of large blocks to avoid * express utilization in terms of large blocks to avoid
* overflow on 32-bit machines. * overflow on 32-bit machines.
*
* NOTE: for the time being, we make bsize == frsize to humor
* not-yet-ancient versions of glibc that are broken.
* Someday, we will probably want to report a real block
* size... whatever that may mean for a network file system!
*/ */
buf->f_bsize = 1 << CEPH_BLOCK_SHIFT; buf->f_bsize = 1 << CEPH_BLOCK_SHIFT;
buf->f_frsize = 1 << CEPH_BLOCK_SHIFT;
buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10); buf->f_blocks = le64_to_cpu(st.kb) >> (CEPH_BLOCK_SHIFT-10);
buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); buf->f_bfree = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10); buf->f_bavail = le64_to_cpu(st.kb_avail) >> (CEPH_BLOCK_SHIFT-10);
...@@ -80,7 +86,6 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf) ...@@ -80,7 +86,6 @@ static int ceph_statfs(struct dentry *dentry, struct kstatfs *buf)
buf->f_files = le64_to_cpu(st.num_objects); buf->f_files = le64_to_cpu(st.num_objects);
buf->f_ffree = -1; buf->f_ffree = -1;
buf->f_namelen = NAME_MAX; buf->f_namelen = NAME_MAX;
buf->f_frsize = PAGE_CACHE_SIZE;
/* leave fsid little-endian, regardless of host endianness */ /* leave fsid little-endian, regardless of host endianness */
fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1); fsid = *(u64 *)(&monmap->fsid) ^ *((u64 *)&monmap->fsid + 1);
......
...@@ -21,7 +21,7 @@ ...@@ -21,7 +21,7 @@
/* large granularity for statfs utilization stats to facilitate /* large granularity for statfs utilization stats to facilitate
* large volume sizes on 32-bit machines. */ * large volume sizes on 32-bit machines. */
#define CEPH_BLOCK_SHIFT 20 /* 1 MB */ #define CEPH_BLOCK_SHIFT 22 /* 4 MB */
#define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT) #define CEPH_BLOCK (1 << CEPH_BLOCK_SHIFT)
#define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */ #define CEPH_MOUNT_OPT_DIRSTAT (1<<4) /* `cat dirname` for stats */
...@@ -798,13 +798,7 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma); ...@@ -798,13 +798,7 @@ extern int ceph_mmap(struct file *file, struct vm_area_struct *vma);
/* file.c */ /* file.c */
extern const struct file_operations ceph_file_fops; extern const struct file_operations ceph_file_fops;
extern const struct address_space_operations ceph_aops; extern const struct address_space_operations ceph_aops;
extern int ceph_copy_to_page_vector(struct page **pages,
const char *data,
loff_t off, size_t len);
extern int ceph_copy_from_page_vector(struct page **pages,
char *data,
loff_t off, size_t len);
extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
extern int ceph_open(struct inode *inode, struct file *file); extern int ceph_open(struct inode *inode, struct file *file);
extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry, extern int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct file *file, unsigned flags, umode_t mode, struct file *file, unsigned flags, umode_t mode,
......
...@@ -29,9 +29,94 @@ struct ceph_vxattr { ...@@ -29,9 +29,94 @@ struct ceph_vxattr {
size_t name_size; /* strlen(name) + 1 (for '\0') */ size_t name_size; /* strlen(name) + 1 (for '\0') */
size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val, size_t (*getxattr_cb)(struct ceph_inode_info *ci, char *val,
size_t size); size_t size);
bool readonly; bool readonly, hidden;
bool (*exists_cb)(struct ceph_inode_info *ci);
}; };
/* layouts */
/*
 * Tell whether the inode has a layout to report: true as soon as any
 * byte of ci->i_layout is non-zero, false if the whole structure is
 * zero-filled.
 */
static bool ceph_vxattrcb_layout_exists(struct ceph_inode_info *ci)
{
	const char *cur = (const char *)&ci->i_layout;
	const char *stop = cur + sizeof(ci->i_layout);

	while (cur < stop) {
		if (*cur != 0)
			return true;
		cur++;
	}
	return false;
}
/*
 * Format the whole layout of @ci into @val as
 *   "stripe_unit=<n> stripe_count=<n> object_size=<n> pool=<name-or-id>"
 *
 * The pool is printed by name when ceph_pg_pool_name_by_id() finds one
 * in the current osdmap, otherwise by its numeric id.  The lookup and
 * the formatting both happen with osdc->map_sem held for reading.
 *
 * @ci:   inode whose i_layout is reported
 * @val:  destination buffer
 * @size: size of @val in bytes
 *
 * Returns the snprintf() result (the length the full string needs,
 * which may exceed @size) converted to size_t.
 */
static size_t ceph_vxattrcb_layout(struct ceph_inode_info *ci, char *val,
				   size_t size)
{
	int ret;
	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
	struct ceph_osd_client *osdc = &fsc->client->osdc;
	s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
	const char *pool_name;

	dout("ceph_vxattrcb_layout %p\n", &ci->vfs_inode);
	down_read(&osdc->map_sem);
	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
	/*
	 * The numeric fields are cast to unsigned long long, so print them
	 * with %llu; the pool id is signed (s64), so it goes out as
	 * long long with %lld.
	 */
	if (pool_name)
		ret = snprintf(val, size,
		"stripe_unit=%llu stripe_count=%llu object_size=%llu pool=%s",
		(unsigned long long)ceph_file_layout_su(ci->i_layout),
		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
		(unsigned long long)ceph_file_layout_object_size(ci->i_layout),
		pool_name);
	else
		ret = snprintf(val, size,
		"stripe_unit=%llu stripe_count=%llu object_size=%llu pool=%lld",
		(unsigned long long)ceph_file_layout_su(ci->i_layout),
		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
		(unsigned long long)ceph_file_layout_object_size(ci->i_layout),
		(long long)pool);
	up_read(&osdc->map_sem);
	return ret;
}
/*
 * Format the stripe unit of @ci's layout as a decimal number into @val.
 * The value is cast to unsigned long long and therefore printed with
 * %llu.  Returns the snprintf() result converted to size_t.
 */
static size_t ceph_vxattrcb_layout_stripe_unit(struct ceph_inode_info *ci,
					       char *val, size_t size)
{
	return snprintf(val, size, "%llu",
		(unsigned long long)ceph_file_layout_su(ci->i_layout));
}
/*
 * Format the stripe count of @ci's layout as a decimal number into @val.
 * The value is cast to unsigned long long and therefore printed with
 * %llu.  Returns the snprintf() result converted to size_t.
 */
static size_t ceph_vxattrcb_layout_stripe_count(struct ceph_inode_info *ci,
						char *val, size_t size)
{
	return snprintf(val, size, "%llu",
		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout));
}
/*
 * Format the object size of @ci's layout as a decimal number into @val.
 * The value is cast to unsigned long long and therefore printed with
 * %llu.  Returns the snprintf() result converted to size_t.
 */
static size_t ceph_vxattrcb_layout_object_size(struct ceph_inode_info *ci,
					       char *val, size_t size)
{
	return snprintf(val, size, "%llu",
		(unsigned long long)ceph_file_layout_object_size(ci->i_layout));
}
/*
 * Format the data pool of @ci's layout into @val: the pool name when
 * ceph_pg_pool_name_by_id() finds one in the current osdmap, otherwise
 * the numeric pool id.  The lookup and the formatting both happen with
 * osdc->map_sem held for reading.
 *
 * Returns the snprintf() result converted to size_t.
 */
static size_t ceph_vxattrcb_layout_pool(struct ceph_inode_info *ci,
					char *val, size_t size)
{
	int ret;
	struct ceph_fs_client *fsc = ceph_sb_to_client(ci->vfs_inode.i_sb);
	struct ceph_osd_client *osdc = &fsc->client->osdc;
	s64 pool = ceph_file_layout_pg_pool(ci->i_layout);
	const char *pool_name;

	down_read(&osdc->map_sem);
	pool_name = ceph_pg_pool_name_by_id(osdc->osdmap, pool);
	if (pool_name)
		ret = snprintf(val, size, "%s", pool_name);
	else
		/* pool is a signed s64: pass it as long long to match %lld */
		ret = snprintf(val, size, "%lld", (long long)pool);
	up_read(&osdc->map_sem);
	return ret;
}
/* directories */ /* directories */
static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val, static size_t ceph_vxattrcb_dir_entries(struct ceph_inode_info *ci, char *val,
...@@ -83,7 +168,10 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, ...@@ -83,7 +168,10 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
(long)ci->i_rctime.tv_nsec); (long)ci->i_rctime.tv_nsec);
} }
#define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name #define CEPH_XATTR_NAME(_type, _name) XATTR_CEPH_PREFIX #_type "." #_name
/*
 * Build a three-component virtual xattr name as one string literal:
 * XATTR_CEPH_PREFIX followed by the stringified "_type._name._name2".
 */
#define CEPH_XATTR_NAME2(_type, _name, _name2) \
XATTR_CEPH_PREFIX #_type "." #_name "." #_name2
#define XATTR_NAME_CEPH(_type, _name) \ #define XATTR_NAME_CEPH(_type, _name) \
{ \ { \
...@@ -91,9 +179,32 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val, ...@@ -91,9 +179,32 @@ static size_t ceph_vxattrcb_dir_rctime(struct ceph_inode_info *ci, char *val,
.name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \ .name_size = sizeof (CEPH_XATTR_NAME(_type, _name)), \
.getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \ .getxattr_cb = ceph_vxattrcb_ ## _type ## _ ## _name, \
.readonly = true, \ .readonly = true, \
.hidden = false, \
.exists_cb = NULL, \
}
#define XATTR_LAYOUT_FIELD(_type, _name, _field) \
{ \
.name = CEPH_XATTR_NAME2(_type, _name, _field), \
.name_size = sizeof (CEPH_XATTR_NAME2(_type, _name, _field)), \
.getxattr_cb = ceph_vxattrcb_ ## _name ## _ ## _field, \
.readonly = false, \
.hidden = true, \
.exists_cb = ceph_vxattrcb_layout_exists, \
} }
static struct ceph_vxattr ceph_dir_vxattrs[] = { static struct ceph_vxattr ceph_dir_vxattrs[] = {
{
.name = "ceph.dir.layout",
.name_size = sizeof("ceph.dir.layout"),
.getxattr_cb = ceph_vxattrcb_layout,
.readonly = false,
.hidden = false,
.exists_cb = ceph_vxattrcb_layout_exists,
},
XATTR_LAYOUT_FIELD(dir, layout, stripe_unit),
XATTR_LAYOUT_FIELD(dir, layout, stripe_count),
XATTR_LAYOUT_FIELD(dir, layout, object_size),
XATTR_LAYOUT_FIELD(dir, layout, pool),
XATTR_NAME_CEPH(dir, entries), XATTR_NAME_CEPH(dir, entries),
XATTR_NAME_CEPH(dir, files), XATTR_NAME_CEPH(dir, files),
XATTR_NAME_CEPH(dir, subdirs), XATTR_NAME_CEPH(dir, subdirs),
...@@ -102,35 +213,26 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = { ...@@ -102,35 +213,26 @@ static struct ceph_vxattr ceph_dir_vxattrs[] = {
XATTR_NAME_CEPH(dir, rsubdirs), XATTR_NAME_CEPH(dir, rsubdirs),
XATTR_NAME_CEPH(dir, rbytes), XATTR_NAME_CEPH(dir, rbytes),
XATTR_NAME_CEPH(dir, rctime), XATTR_NAME_CEPH(dir, rctime),
{ 0 } /* Required table terminator */ { .name = NULL, 0 } /* Required table terminator */
}; };
static size_t ceph_dir_vxattrs_name_size; /* total size of all names */ static size_t ceph_dir_vxattrs_name_size; /* total size of all names */
/* files */ /* files */
/*
 * Format the file layout of @ci into @val as three newline-terminated
 * "key=value" lines: chunk_bytes, stripe_count and object_size.
 * The values are cast to unsigned long long and therefore printed with
 * %llu.  Returns the snprintf() result converted to size_t.
 */
static size_t ceph_vxattrcb_file_layout(struct ceph_inode_info *ci, char *val,
					size_t size)
{
	int ret;

	ret = snprintf(val, size,
		"chunk_bytes=%llu\nstripe_count=%llu\nobject_size=%llu\n",
		(unsigned long long)ceph_file_layout_su(ci->i_layout),
		(unsigned long long)ceph_file_layout_stripe_count(ci->i_layout),
		(unsigned long long)ceph_file_layout_object_size(ci->i_layout));
	return ret;
}
static struct ceph_vxattr ceph_file_vxattrs[] = { static struct ceph_vxattr ceph_file_vxattrs[] = {
XATTR_NAME_CEPH(file, layout),
/* The following extended attribute name is deprecated */
{ {
.name = XATTR_CEPH_PREFIX "layout", .name = "ceph.file.layout",
.name_size = sizeof (XATTR_CEPH_PREFIX "layout"), .name_size = sizeof("ceph.file.layout"),
.getxattr_cb = ceph_vxattrcb_file_layout, .getxattr_cb = ceph_vxattrcb_layout,
.readonly = true, .readonly = false,
.hidden = false,
.exists_cb = ceph_vxattrcb_layout_exists,
}, },
{ 0 } /* Required table terminator */ XATTR_LAYOUT_FIELD(file, layout, stripe_unit),
XATTR_LAYOUT_FIELD(file, layout, stripe_count),
XATTR_LAYOUT_FIELD(file, layout, object_size),
XATTR_LAYOUT_FIELD(file, layout, pool),
{ .name = NULL, 0 } /* Required table terminator */
}; };
static size_t ceph_file_vxattrs_name_size; /* total size of all names */ static size_t ceph_file_vxattrs_name_size; /* total size of all names */
...@@ -164,6 +266,7 @@ static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs) ...@@ -164,6 +266,7 @@ static size_t __init vxattrs_name_size(struct ceph_vxattr *vxattrs)
size_t size = 0; size_t size = 0;
for (vxattr = vxattrs; vxattr->name; vxattr++) for (vxattr = vxattrs; vxattr->name; vxattr++)
if (!vxattr->hidden)
size += vxattr->name_size; size += vxattr->name_size;
return size; return size;
...@@ -572,13 +675,17 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, ...@@ -572,13 +675,17 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
if (!ceph_is_valid_xattr(name)) if (!ceph_is_valid_xattr(name))
return -ENODATA; return -ENODATA;
/* let's see if a virtual xattr was requested */
vxattr = ceph_match_vxattr(inode, name);
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
dout("getxattr %p ver=%lld index_ver=%lld\n", inode, dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
ci->i_xattrs.version, ci->i_xattrs.index_version); ci->i_xattrs.version, ci->i_xattrs.index_version);
/* let's see if a virtual xattr was requested */
vxattr = ceph_match_vxattr(inode, name);
if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) {
err = vxattr->getxattr_cb(ci, value, size);
goto out;
}
if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) && if (__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1) &&
(ci->i_xattrs.index_version >= ci->i_xattrs.version)) { (ci->i_xattrs.index_version >= ci->i_xattrs.version)) {
goto get_xattr; goto get_xattr;
...@@ -592,11 +699,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, ...@@ -592,11 +699,6 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
if (vxattr && vxattr->readonly) {
err = vxattr->getxattr_cb(ci, value, size);
goto out;
}
err = __build_xattrs(inode); err = __build_xattrs(inode);
if (err < 0) if (err < 0)
goto out; goto out;
...@@ -604,11 +706,8 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value, ...@@ -604,11 +706,8 @@ ssize_t ceph_getxattr(struct dentry *dentry, const char *name, void *value,
get_xattr: get_xattr:
err = -ENODATA; /* == ENOATTR */ err = -ENODATA; /* == ENOATTR */
xattr = __get_xattr(ci, name); xattr = __get_xattr(ci, name);
if (!xattr) { if (!xattr)
if (vxattr)
err = vxattr->getxattr_cb(ci, value, size);
goto out; goto out;
}
err = -ERANGE; err = -ERANGE;
if (size && size < xattr->val_len) if (size && size < xattr->val_len)
...@@ -664,22 +763,29 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size) ...@@ -664,22 +763,29 @@ ssize_t ceph_listxattr(struct dentry *dentry, char *names, size_t size)
vir_namelen = ceph_vxattrs_name_size(vxattrs); vir_namelen = ceph_vxattrs_name_size(vxattrs);
/* adding 1 byte per each variable due to the null termination */ /* adding 1 byte per each variable due to the null termination */
namelen = vir_namelen + ci->i_xattrs.names_size + ci->i_xattrs.count; namelen = ci->i_xattrs.names_size + ci->i_xattrs.count;
err = -ERANGE; err = -ERANGE;
if (size && namelen > size) if (size && vir_namelen + namelen > size)
goto out; goto out;
err = namelen; err = namelen + vir_namelen;
if (size == 0) if (size == 0)
goto out; goto out;
names = __copy_xattr_names(ci, names); names = __copy_xattr_names(ci, names);
/* virtual xattr names, too */ /* virtual xattr names, too */
if (vxattrs) err = namelen;
if (vxattrs) {
for (i = 0; vxattrs[i].name; i++) { for (i = 0; vxattrs[i].name; i++) {
if (!vxattrs[i].hidden &&
!(vxattrs[i].exists_cb &&
!vxattrs[i].exists_cb(ci))) {
len = sprintf(names, "%s", vxattrs[i].name); len = sprintf(names, "%s", vxattrs[i].name);
names += len + 1; names += len + 1;
err += len + 1;
}
}
} }
out: out:
...@@ -782,6 +888,10 @@ int ceph_setxattr(struct dentry *dentry, const char *name, ...@@ -782,6 +888,10 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
if (vxattr && vxattr->readonly) if (vxattr && vxattr->readonly)
return -EOPNOTSUPP; return -EOPNOTSUPP;
/* pass any unhandled ceph.* xattrs through to the MDS */
if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
goto do_sync_unlocked;
/* preallocate memory for xattr name, value, index node */ /* preallocate memory for xattr name, value, index node */
err = -ENOMEM; err = -ENOMEM;
newname = kmemdup(name, name_len + 1, GFP_NOFS); newname = kmemdup(name, name_len + 1, GFP_NOFS);
...@@ -838,6 +948,7 @@ int ceph_setxattr(struct dentry *dentry, const char *name, ...@@ -838,6 +948,7 @@ int ceph_setxattr(struct dentry *dentry, const char *name,
do_sync: do_sync:
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
do_sync_unlocked:
err = ceph_sync_setxattr(dentry, name, value, size, flags); err = ceph_sync_setxattr(dentry, name, value, size, flags);
out: out:
kfree(newname); kfree(newname);
...@@ -892,6 +1003,10 @@ int ceph_removexattr(struct dentry *dentry, const char *name) ...@@ -892,6 +1003,10 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
if (vxattr && vxattr->readonly) if (vxattr && vxattr->readonly)
return -EOPNOTSUPP; return -EOPNOTSUPP;
/* pass any unhandled ceph.* xattrs through to the MDS */
if (!strncmp(name, XATTR_CEPH_PREFIX, XATTR_CEPH_PREFIX_LEN))
goto do_sync_unlocked;
err = -ENOMEM; err = -ENOMEM;
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
retry: retry:
...@@ -931,6 +1046,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name) ...@@ -931,6 +1046,7 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
return err; return err;
do_sync: do_sync:
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
do_sync_unlocked:
err = ceph_send_removexattr(dentry, name); err = ceph_send_removexattr(dentry, name);
out: out:
return err; return err;
......
...@@ -12,16 +12,46 @@ ...@@ -12,16 +12,46 @@
#define CEPH_FEATURE_MONNAMES (1<<5) #define CEPH_FEATURE_MONNAMES (1<<5)
#define CEPH_FEATURE_RECONNECT_SEQ (1<<6) #define CEPH_FEATURE_RECONNECT_SEQ (1<<6)
#define CEPH_FEATURE_DIRLAYOUTHASH (1<<7) #define CEPH_FEATURE_DIRLAYOUTHASH (1<<7)
/* bits 8-17 defined by user-space; not supported yet here */ #define CEPH_FEATURE_OBJECTLOCATOR (1<<8)
#define CEPH_FEATURE_PGID64 (1<<9)
#define CEPH_FEATURE_INCSUBOSDMAP (1<<10)
#define CEPH_FEATURE_PGPOOL3 (1<<11)
#define CEPH_FEATURE_OSDREPLYMUX (1<<12)
#define CEPH_FEATURE_OSDENC (1<<13)
#define CEPH_FEATURE_OMAP (1<<14)
#define CEPH_FEATURE_MONENC (1<<15)
#define CEPH_FEATURE_QUERY_T (1<<16)
#define CEPH_FEATURE_INDEP_PG_MAP (1<<17)
#define CEPH_FEATURE_CRUSH_TUNABLES (1<<18) #define CEPH_FEATURE_CRUSH_TUNABLES (1<<18)
#define CEPH_FEATURE_CHUNKY_SCRUB (1<<19)
#define CEPH_FEATURE_MON_NULLROUTE (1<<20)
#define CEPH_FEATURE_MON_GV (1<<21)
#define CEPH_FEATURE_BACKFILL_RESERVATION (1<<22)
#define CEPH_FEATURE_MSG_AUTH (1<<23)
#define CEPH_FEATURE_RECOVERY_RESERVATION (1<<24)
#define CEPH_FEATURE_CRUSH_TUNABLES2 (1<<25)
#define CEPH_FEATURE_CREATEPOOLID (1<<26)
#define CEPH_FEATURE_REPLY_CREATE_INODE (1<<27)
#define CEPH_FEATURE_OSD_HBMSGS (1<<28)
#define CEPH_FEATURE_MDSENC (1<<29)
#define CEPH_FEATURE_OSDHASHPSPOOL (1<<30)
/* /*
* Features supported. * Features supported.
*/ */
#define CEPH_FEATURES_SUPPORTED_DEFAULT \ #define CEPH_FEATURES_SUPPORTED_DEFAULT \
(CEPH_FEATURE_NOSRCADDR | \ (CEPH_FEATURE_NOSRCADDR | \
CEPH_FEATURE_CRUSH_TUNABLES) CEPH_FEATURE_PGID64 | \
CEPH_FEATURE_PGPOOL3 | \
CEPH_FEATURE_OSDENC | \
CEPH_FEATURE_CRUSH_TUNABLES | \
CEPH_FEATURE_CRUSH_TUNABLES2 | \
CEPH_FEATURE_REPLY_CREATE_INODE | \
CEPH_FEATURE_OSDHASHPSPOOL)
#define CEPH_FEATURES_REQUIRED_DEFAULT \ #define CEPH_FEATURES_REQUIRED_DEFAULT \
(CEPH_FEATURE_NOSRCADDR) (CEPH_FEATURE_NOSRCADDR | \
CEPH_FEATURE_PGID64 | \
CEPH_FEATURE_PGPOOL3 | \
CEPH_FEATURE_OSDENC)
#endif #endif
...@@ -21,9 +21,6 @@ ...@@ -21,9 +21,6 @@
* internal cluster protocols separately from the public, * internal cluster protocols separately from the public,
* client-facing protocol. * client-facing protocol.
*/ */
#define CEPH_OSD_PROTOCOL 8 /* cluster internal */
#define CEPH_MDS_PROTOCOL 12 /* cluster internal */
#define CEPH_MON_PROTOCOL 5 /* cluster internal */
#define CEPH_OSDC_PROTOCOL 24 /* server/client */ #define CEPH_OSDC_PROTOCOL 24 /* server/client */
#define CEPH_MDSC_PROTOCOL 32 /* server/client */ #define CEPH_MDSC_PROTOCOL 32 /* server/client */
#define CEPH_MONC_PROTOCOL 15 /* server/client */ #define CEPH_MONC_PROTOCOL 15 /* server/client */
...@@ -31,6 +28,7 @@ ...@@ -31,6 +28,7 @@
#define CEPH_INO_ROOT 1 #define CEPH_INO_ROOT 1
#define CEPH_INO_CEPH 2 /* hidden .ceph dir */ #define CEPH_INO_CEPH 2 /* hidden .ceph dir */
#define CEPH_INO_DOTDOT 3 /* used by ceph fuse for parent (..) */
/* arbitrary limit on max # of monitors (cluster of 3 is typical) */ /* arbitrary limit on max # of monitors (cluster of 3 is typical) */
#define CEPH_MAX_MON 31 #define CEPH_MAX_MON 31
...@@ -51,7 +49,7 @@ struct ceph_file_layout { ...@@ -51,7 +49,7 @@ struct ceph_file_layout {
__le32 fl_object_stripe_unit; /* UNUSED. for per-object parity, if any */ __le32 fl_object_stripe_unit; /* UNUSED. for per-object parity, if any */
/* object -> pg layout */ /* object -> pg layout */
__le32 fl_unused; /* unused; used to be preferred primary (-1) */ __le32 fl_unused; /* unused; used to be preferred primary for pg (-1 for none) */
__le32 fl_pg_pool; /* namespace, crush ruleset, rep level */ __le32 fl_pg_pool; /* namespace, crush ruleset, rep level */
} __attribute__ ((packed)); } __attribute__ ((packed));
...@@ -101,6 +99,8 @@ struct ceph_dir_layout { ...@@ -101,6 +99,8 @@ struct ceph_dir_layout {
#define CEPH_MSG_MON_SUBSCRIBE_ACK 16 #define CEPH_MSG_MON_SUBSCRIBE_ACK 16
#define CEPH_MSG_AUTH 17 #define CEPH_MSG_AUTH 17
#define CEPH_MSG_AUTH_REPLY 18 #define CEPH_MSG_AUTH_REPLY 18
#define CEPH_MSG_MON_GET_VERSION 19
#define CEPH_MSG_MON_GET_VERSION_REPLY 20
/* client <-> mds */ /* client <-> mds */
#define CEPH_MSG_MDS_MAP 21 #define CEPH_MSG_MDS_MAP 21
...@@ -220,6 +220,11 @@ struct ceph_mon_subscribe_ack { ...@@ -220,6 +220,11 @@ struct ceph_mon_subscribe_ack {
struct ceph_fsid fsid; struct ceph_fsid fsid;
} __attribute__ ((packed)); } __attribute__ ((packed));
/*
* mdsmap flags
*/
#define CEPH_MDSMAP_DOWN (1<<0) /* cluster deliberately down */
/* /*
* mds states * mds states
* > 0 -> in * > 0 -> in
...@@ -233,6 +238,7 @@ struct ceph_mon_subscribe_ack { ...@@ -233,6 +238,7 @@ struct ceph_mon_subscribe_ack {
#define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */ #define CEPH_MDS_STATE_CREATING -6 /* up, creating MDS instance. */
#define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */ #define CEPH_MDS_STATE_STARTING -7 /* up, starting previously stopped mds */
#define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */ #define CEPH_MDS_STATE_STANDBY_REPLAY -8 /* up, tailing active node's journal */
#define CEPH_MDS_STATE_REPLAYONCE -9 /* up, replaying an active node's journal */
#define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */ #define CEPH_MDS_STATE_REPLAY 8 /* up, replaying journal. */
#define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed #define CEPH_MDS_STATE_RESOLVE 9 /* up, disambiguating distributed
...@@ -264,6 +270,7 @@ extern const char *ceph_mds_state_name(int s); ...@@ -264,6 +270,7 @@ extern const char *ceph_mds_state_name(int s);
#define CEPH_LOCK_IXATTR 2048 #define CEPH_LOCK_IXATTR 2048
#define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */ #define CEPH_LOCK_IFLOCK 4096 /* advisory file locks */
#define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */ #define CEPH_LOCK_INO 8192 /* immutable inode bits; not a lock */
#define CEPH_LOCK_IPOLICY 16384 /* policy lock on dirs. MDS internal */
/* client_session ops */ /* client_session ops */
enum { enum {
...@@ -338,6 +345,12 @@ extern const char *ceph_mds_op_name(int op); ...@@ -338,6 +345,12 @@ extern const char *ceph_mds_op_name(int op);
#define CEPH_SETATTR_SIZE 32 #define CEPH_SETATTR_SIZE 32
#define CEPH_SETATTR_CTIME 64 #define CEPH_SETATTR_CTIME 64
/*
* Ceph setxattr request flags.
*/
#define CEPH_XATTR_CREATE 1
#define CEPH_XATTR_REPLACE 2
union ceph_mds_request_args { union ceph_mds_request_args {
struct { struct {
__le32 mask; /* CEPH_CAP_* */ __le32 mask; /* CEPH_CAP_* */
...@@ -522,6 +535,9 @@ int ceph_flags_to_mode(int flags); ...@@ -522,6 +535,9 @@ int ceph_flags_to_mode(int flags);
#define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */ #define CEPH_CAP_GWREXTEND 64 /* (file) client can extend EOF */
#define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */ #define CEPH_CAP_GLAZYIO 128 /* (file) client can perform lazy io */
#define CEPH_CAP_SIMPLE_BITS 2
#define CEPH_CAP_FILE_BITS 8
/* per-lock shift */ /* per-lock shift */
#define CEPH_CAP_SAUTH 2 #define CEPH_CAP_SAUTH 2
#define CEPH_CAP_SLINK 4 #define CEPH_CAP_SLINK 4
......
...@@ -99,8 +99,8 @@ static inline int ceph_has_room(void **p, void *end, size_t n) ...@@ -99,8 +99,8 @@ static inline int ceph_has_room(void **p, void *end, size_t n)
* *
* There are two possible failures: * There are two possible failures:
* - converting the string would require accessing memory at or * - converting the string would require accessing memory at or
* beyond the "end" pointer provided (-E * beyond the "end" pointer provided (-ERANGE)
* - memory could not be allocated for the result * - memory could not be allocated for the result (-ENOMEM)
*/ */
static inline char *ceph_extract_encoded_string(void **p, void *end, static inline char *ceph_extract_encoded_string(void **p, void *end,
size_t *lenp, gfp_t gfp) size_t *lenp, gfp_t gfp)
...@@ -238,6 +238,11 @@ static inline void ceph_encode_string(void **p, void *end, ...@@ -238,6 +238,11 @@ static inline void ceph_encode_string(void **p, void *end,
ceph_encode_need(p, end, sizeof(u16), bad); \ ceph_encode_need(p, end, sizeof(u16), bad); \
ceph_encode_16(p, v); \ ceph_encode_16(p, v); \
} while (0) } while (0)
/*
 * Checked one-byte encode: first hands (p, end, sizeof(u8), bad) to
 * ceph_encode_need(), then stores v with ceph_encode_8().
 * NOTE(review): presumably ceph_encode_need() jumps to the @bad label
 * when fewer than sizeof(u8) bytes remain before @end -- confirm against
 * its definition.
 */
#define ceph_encode_8_safe(p, end, v, bad) \
do { \
ceph_encode_need(p, end, sizeof(u8), bad); \
ceph_encode_8(p, v); \
} while (0)
#define ceph_encode_copy_safe(p, end, pv, n, bad) \ #define ceph_encode_copy_safe(p, end, pv, n, bad) \
do { \ do { \
......
...@@ -193,6 +193,8 @@ static inline int calc_pages_for(u64 off, u64 len) ...@@ -193,6 +193,8 @@ static inline int calc_pages_for(u64 off, u64 len)
} }
/* ceph_common.c */ /* ceph_common.c */
extern bool libceph_compatible(void *data);
extern const char *ceph_msg_type_name(int type); extern const char *ceph_msg_type_name(int type);
extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid); extern int ceph_check_fsid(struct ceph_client *client, struct ceph_fsid *fsid);
extern struct kmem_cache *ceph_inode_cachep; extern struct kmem_cache *ceph_inode_cachep;
...@@ -220,7 +222,7 @@ extern int ceph_open_session(struct ceph_client *client); ...@@ -220,7 +222,7 @@ extern int ceph_open_session(struct ceph_client *client);
/* pagevec.c */ /* pagevec.c */
extern void ceph_release_page_vector(struct page **pages, int num_pages); extern void ceph_release_page_vector(struct page **pages, int num_pages);
extern struct page **ceph_get_direct_page_vector(const char __user *data, extern struct page **ceph_get_direct_page_vector(const void __user *data,
int num_pages, int num_pages,
bool write_page); bool write_page);
extern void ceph_put_page_vector(struct page **pages, int num_pages, extern void ceph_put_page_vector(struct page **pages, int num_pages,
...@@ -228,15 +230,15 @@ extern void ceph_put_page_vector(struct page **pages, int num_pages, ...@@ -228,15 +230,15 @@ extern void ceph_put_page_vector(struct page **pages, int num_pages,
extern void ceph_release_page_vector(struct page **pages, int num_pages); extern void ceph_release_page_vector(struct page **pages, int num_pages);
extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags); extern struct page **ceph_alloc_page_vector(int num_pages, gfp_t flags);
extern int ceph_copy_user_to_page_vector(struct page **pages, extern int ceph_copy_user_to_page_vector(struct page **pages,
const char __user *data, const void __user *data,
loff_t off, size_t len); loff_t off, size_t len);
extern int ceph_copy_to_page_vector(struct page **pages, extern void ceph_copy_to_page_vector(struct page **pages,
const char *data, const void *data,
loff_t off, size_t len); loff_t off, size_t len);
extern int ceph_copy_from_page_vector(struct page **pages, extern void ceph_copy_from_page_vector(struct page **pages,
char *data, void *data,
loff_t off, size_t len); loff_t off, size_t len);
extern int ceph_copy_page_vector_to_user(struct page **pages, char __user *data, extern int ceph_copy_page_vector_to_user(struct page **pages, void __user *data,
loff_t off, size_t len); loff_t off, size_t len);
extern void ceph_zero_page_vector_range(int off, int len, struct page **pages); extern void ceph_zero_page_vector_range(int off, int len, struct page **pages);
......
...@@ -29,8 +29,8 @@ struct ceph_mdsmap { ...@@ -29,8 +29,8 @@ struct ceph_mdsmap {
/* which object pools file data can be stored in */ /* which object pools file data can be stored in */
int m_num_data_pg_pools; int m_num_data_pg_pools;
u32 *m_data_pg_pools; u64 *m_data_pg_pools;
u32 m_cas_pg_pool; u64 m_cas_pg_pool;
}; };
static inline struct ceph_entity_addr * static inline struct ceph_entity_addr *
......
...@@ -83,9 +83,11 @@ struct ceph_msg { ...@@ -83,9 +83,11 @@ struct ceph_msg {
struct list_head list_head; struct list_head list_head;
struct kref kref; struct kref kref;
#ifdef CONFIG_BLOCK
struct bio *bio; /* instead of pages/pagelist */ struct bio *bio; /* instead of pages/pagelist */
struct bio *bio_iter; /* bio iterator */ struct bio *bio_iter; /* bio iterator */
int bio_seg; /* current bio segment */ int bio_seg; /* current bio segment */
#endif /* CONFIG_BLOCK */
struct ceph_pagelist *trail; /* the trailing part of the data */ struct ceph_pagelist *trail; /* the trailing part of the data */
bool front_is_vmalloc; bool front_is_vmalloc;
bool more_to_follow; bool more_to_follow;
......
...@@ -10,6 +10,7 @@ ...@@ -10,6 +10,7 @@
#include <linux/ceph/osdmap.h> #include <linux/ceph/osdmap.h>
#include <linux/ceph/messenger.h> #include <linux/ceph/messenger.h>
#include <linux/ceph/auth.h> #include <linux/ceph/auth.h>
#include <linux/ceph/pagelist.h>
/* /*
* Maximum object name size * Maximum object name size
...@@ -22,7 +23,6 @@ struct ceph_snap_context; ...@@ -22,7 +23,6 @@ struct ceph_snap_context;
struct ceph_osd_request; struct ceph_osd_request;
struct ceph_osd_client; struct ceph_osd_client;
struct ceph_authorizer; struct ceph_authorizer;
struct ceph_pagelist;
/* /*
* completion callback for async writepages * completion callback for async writepages
...@@ -47,6 +47,9 @@ struct ceph_osd { ...@@ -47,6 +47,9 @@ struct ceph_osd {
struct list_head o_keepalive_item; struct list_head o_keepalive_item;
}; };
#define CEPH_OSD_MAX_OP 10
/* an in-flight request */ /* an in-flight request */
struct ceph_osd_request { struct ceph_osd_request {
u64 r_tid; /* unique for this client */ u64 r_tid; /* unique for this client */
...@@ -63,9 +66,23 @@ struct ceph_osd_request { ...@@ -63,9 +66,23 @@ struct ceph_osd_request {
struct ceph_connection *r_con_filling_msg; struct ceph_connection *r_con_filling_msg;
struct ceph_msg *r_request, *r_reply; struct ceph_msg *r_request, *r_reply;
int r_result;
int r_flags; /* any additional flags for the osd */ int r_flags; /* any additional flags for the osd */
u32 r_sent; /* >0 if r_request is sending/sent */ u32 r_sent; /* >0 if r_request is sending/sent */
int r_num_ops;
/* encoded message content */
struct ceph_osd_op *r_request_ops;
/* these are updated on each send */
__le32 *r_request_osdmap_epoch;
__le32 *r_request_flags;
__le64 *r_request_pool;
void *r_request_pgid;
__le32 *r_request_attempts;
struct ceph_eversion *r_request_reassert_version;
int r_result;
int r_reply_op_len[CEPH_OSD_MAX_OP];
s32 r_reply_op_result[CEPH_OSD_MAX_OP];
int r_got_reply; int r_got_reply;
int r_linger; int r_linger;
...@@ -82,6 +99,7 @@ struct ceph_osd_request { ...@@ -82,6 +99,7 @@ struct ceph_osd_request {
char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */ char r_oid[MAX_OBJ_NAME_SIZE]; /* object name */
int r_oid_len; int r_oid_len;
u64 r_snapid;
unsigned long r_stamp; /* send OR check time */ unsigned long r_stamp; /* send OR check time */
struct ceph_file_layout r_file_layout; struct ceph_file_layout r_file_layout;
...@@ -95,7 +113,7 @@ struct ceph_osd_request { ...@@ -95,7 +113,7 @@ struct ceph_osd_request {
struct bio *r_bio; /* instead of pages */ struct bio *r_bio; /* instead of pages */
#endif #endif
struct ceph_pagelist *r_trail; /* trailing part of the data */ struct ceph_pagelist r_trail; /* trailing part of the data */
}; };
struct ceph_osd_event { struct ceph_osd_event {
...@@ -107,7 +125,6 @@ struct ceph_osd_event { ...@@ -107,7 +125,6 @@ struct ceph_osd_event {
struct rb_node node; struct rb_node node;
struct list_head osd_node; struct list_head osd_node;
struct kref kref; struct kref kref;
struct completion completion;
}; };
struct ceph_osd_event_work { struct ceph_osd_event_work {
...@@ -157,7 +174,7 @@ struct ceph_osd_client { ...@@ -157,7 +174,7 @@ struct ceph_osd_client {
struct ceph_osd_req_op { struct ceph_osd_req_op {
u16 op; /* CEPH_OSD_OP_* */ u16 op; /* CEPH_OSD_OP_* */
u32 flags; /* CEPH_OSD_FLAG_* */ u32 payload_len;
union { union {
struct { struct {
u64 offset, length; u64 offset, length;
...@@ -166,23 +183,24 @@ struct ceph_osd_req_op { ...@@ -166,23 +183,24 @@ struct ceph_osd_req_op {
} extent; } extent;
struct { struct {
const char *name; const char *name;
u32 name_len;
const char *val; const char *val;
u32 name_len;
u32 value_len; u32 value_len;
__u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */ __u8 cmp_op; /* CEPH_OSD_CMPXATTR_OP_* */
__u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */ __u8 cmp_mode; /* CEPH_OSD_CMPXATTR_MODE_* */
} xattr; } xattr;
struct { struct {
const char *class_name; const char *class_name;
__u8 class_len;
const char *method_name; const char *method_name;
__u8 method_len;
__u8 argc;
const char *indata; const char *indata;
u32 indata_len; u32 indata_len;
__u8 class_len;
__u8 method_len;
__u8 argc;
} cls; } cls;
struct { struct {
u64 cookie, count; u64 cookie;
u64 count;
} pgls; } pgls;
struct { struct {
u64 snapid; u64 snapid;
...@@ -190,12 +208,11 @@ struct ceph_osd_req_op { ...@@ -190,12 +208,11 @@ struct ceph_osd_req_op {
struct { struct {
u64 cookie; u64 cookie;
u64 ver; u64 ver;
__u8 flag;
u32 prot_ver; u32 prot_ver;
u32 timeout; u32 timeout;
__u8 flag;
} watch; } watch;
}; };
u32 payload_len;
}; };
extern int ceph_osdc_init(struct ceph_osd_client *osdc, extern int ceph_osdc_init(struct ceph_osd_client *osdc,
...@@ -207,29 +224,19 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc, ...@@ -207,29 +224,19 @@ extern void ceph_osdc_handle_reply(struct ceph_osd_client *osdc,
extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc, extern void ceph_osdc_handle_map(struct ceph_osd_client *osdc,
struct ceph_msg *msg); struct ceph_msg *msg);
extern int ceph_calc_raw_layout(struct ceph_osd_client *osdc,
struct ceph_file_layout *layout,
u64 snapid,
u64 off, u64 *plen, u64 *bno,
struct ceph_osd_request *req,
struct ceph_osd_req_op *op);
extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, extern struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
int flags,
struct ceph_snap_context *snapc, struct ceph_snap_context *snapc,
struct ceph_osd_req_op *ops, unsigned int num_op,
bool use_mempool, bool use_mempool,
gfp_t gfp_flags, gfp_t gfp_flags);
struct page **pages,
struct bio *bio);
extern void ceph_osdc_build_request(struct ceph_osd_request *req, extern void ceph_osdc_build_request(struct ceph_osd_request *req,
u64 off, u64 *plen, u64 off, u64 len,
unsigned int num_op,
struct ceph_osd_req_op *src_ops, struct ceph_osd_req_op *src_ops,
struct ceph_snap_context *snapc, struct ceph_snap_context *snapc,
struct timespec *mtime, u64 snap_id,
const char *oid, struct timespec *mtime);
int oid_len);
extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
struct ceph_file_layout *layout, struct ceph_file_layout *layout,
...@@ -239,8 +246,7 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *, ...@@ -239,8 +246,7 @@ extern struct ceph_osd_request *ceph_osdc_new_request(struct ceph_osd_client *,
int do_sync, u32 truncate_seq, int do_sync, u32 truncate_seq,
u64 truncate_size, u64 truncate_size,
struct timespec *mtime, struct timespec *mtime,
bool use_mempool, int num_reply, bool use_mempool, int page_align);
int page_align);
extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc, extern void ceph_osdc_set_request_linger(struct ceph_osd_client *osdc,
struct ceph_osd_request *req); struct ceph_osd_request *req);
...@@ -279,17 +285,13 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc, ...@@ -279,17 +285,13 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
u64 off, u64 len, u64 off, u64 len,
u32 truncate_seq, u64 truncate_size, u32 truncate_seq, u64 truncate_size,
struct timespec *mtime, struct timespec *mtime,
struct page **pages, int nr_pages, struct page **pages, int nr_pages);
int flags, int do_sync, bool nofail);
/* watch/notify events */ /* watch/notify events */
extern int ceph_osdc_create_event(struct ceph_osd_client *osdc, extern int ceph_osdc_create_event(struct ceph_osd_client *osdc,
void (*event_cb)(u64, u64, u8, void *), void (*event_cb)(u64, u64, u8, void *),
int one_shot, void *data, void *data, struct ceph_osd_event **pevent);
struct ceph_osd_event **pevent);
extern void ceph_osdc_cancel_event(struct ceph_osd_event *event); extern void ceph_osdc_cancel_event(struct ceph_osd_event *event);
extern int ceph_osdc_wait_event(struct ceph_osd_event *event,
unsigned long timeout);
extern void ceph_osdc_put_event(struct ceph_osd_event *event); extern void ceph_osdc_put_event(struct ceph_osd_event *event);
#endif #endif
...@@ -18,14 +18,31 @@ ...@@ -18,14 +18,31 @@
* The map can be updated either via an incremental map (diff) describing * The map can be updated either via an incremental map (diff) describing
* the change between two successive epochs, or as a fully encoded map. * the change between two successive epochs, or as a fully encoded map.
*/ */
/*
 * Placement group id held as a (pool, seed) pair.  In the surrounding
 * declarations it is the pgid member of struct ceph_pg_mapping, the
 * output of ceph_calc_object_layout(), and is passed by value to
 * ceph_calc_pg_acting() and ceph_calc_pg_primary().
 */
struct ceph_pg {
uint64_t pool;
uint32_t seed;
};
#define CEPH_POOL_FLAG_HASHPSPOOL 1
struct ceph_pg_pool_info { struct ceph_pg_pool_info {
struct rb_node node; struct rb_node node;
int id; s64 id;
struct ceph_pg_pool v; u8 type;
int pg_num_mask, pgp_num_mask, lpg_num_mask, lpgp_num_mask; u8 size;
u8 crush_ruleset;
u8 object_hash;
u32 pg_num, pgp_num;
int pg_num_mask, pgp_num_mask;
u64 flags;
char *name; char *name;
}; };
/*
 * Object locator: a pool number plus a string key.
 * NOTE(review): nothing visible here shows who allocates or frees key,
 * or whether it may be NULL -- confirm against the users of this struct.
 */
struct ceph_object_locator {
uint64_t pool;
char *key;
};
struct ceph_pg_mapping { struct ceph_pg_mapping {
struct rb_node node; struct rb_node node;
struct ceph_pg pgid; struct ceph_pg pgid;
...@@ -110,15 +127,16 @@ extern void ceph_osdmap_destroy(struct ceph_osdmap *map); ...@@ -110,15 +127,16 @@ extern void ceph_osdmap_destroy(struct ceph_osdmap *map);
/* calculate mapping of a file extent to an object */ /* calculate mapping of a file extent to an object */
extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
u64 off, u64 *plen, u64 off, u64 len,
u64 *bno, u64 *oxoff, u64 *oxlen); u64 *bno, u64 *oxoff, u64 *oxlen);
/* calculate mapping of object to a placement group */ /* calculate mapping of object to a placement group */
extern int ceph_calc_object_layout(struct ceph_object_layout *ol, extern int ceph_calc_object_layout(struct ceph_pg *pg,
const char *oid, const char *oid,
struct ceph_file_layout *fl, struct ceph_file_layout *fl,
struct ceph_osdmap *osdmap); struct ceph_osdmap *osdmap);
extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap, struct ceph_pg pgid, extern int ceph_calc_pg_acting(struct ceph_osdmap *osdmap,
struct ceph_pg pgid,
int *acting); int *acting);
extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap, extern int ceph_calc_pg_primary(struct ceph_osdmap *osdmap,
struct ceph_pg pgid); struct ceph_pg pgid);
......
This diff is collapsed.
...@@ -162,6 +162,8 @@ struct crush_map { ...@@ -162,6 +162,8 @@ struct crush_map {
__u32 choose_local_fallback_tries; __u32 choose_local_fallback_tries;
/* choose attempts before giving up */ /* choose attempts before giving up */
__u32 choose_total_tries; __u32 choose_total_tries;
/* attempt chooseleaf inner descent once; on failure retry outer descent */
__u32 chooseleaf_descend_once;
}; };
......
...@@ -28,6 +28,22 @@ ...@@ -28,6 +28,22 @@
#include "crypto.h" #include "crypto.h"
/*
 * libceph_compatible - module compatibility hook
 *
 * @data: buffer for passing information to and from libceph; may be
 *        NULL.
 *
 * Currently a placeholder: @data is not examined and the answer is
 * always "compatible".  The existence of this symbol is itself the
 * signal to other kernel modules that libceph provides a certain level
 * of functionality.
 *
 * Returns true when libceph determines it is compatible with the
 * caller given @data.
 */
bool libceph_compatible(void *data)
{
	return true;
}
EXPORT_SYMBOL(libceph_compatible);
/* /*
* find filename portion of a path (/foo/bar/baz -> baz) * find filename portion of a path (/foo/bar/baz -> baz)
...@@ -590,10 +606,8 @@ static int __init init_ceph_lib(void) ...@@ -590,10 +606,8 @@ static int __init init_ceph_lib(void)
if (ret < 0) if (ret < 0)
goto out_crypto; goto out_crypto;
pr_info("loaded (mon/osd proto %d/%d, osdmap %d/%d %d/%d)\n", pr_info("loaded (mon/osd proto %d/%d)\n",
CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL, CEPH_MONC_PROTOCOL, CEPH_OSDC_PROTOCOL);
CEPH_OSDMAP_VERSION, CEPH_OSDMAP_VERSION_EXT,
CEPH_OSDMAP_INC_VERSION, CEPH_OSDMAP_INC_VERSION_EXT);
return 0; return 0;
......
...@@ -21,9 +21,15 @@ const char *ceph_osd_op_name(int op) ...@@ -21,9 +21,15 @@ const char *ceph_osd_op_name(int op)
switch (op) { switch (op) {
case CEPH_OSD_OP_READ: return "read"; case CEPH_OSD_OP_READ: return "read";
case CEPH_OSD_OP_STAT: return "stat"; case CEPH_OSD_OP_STAT: return "stat";
case CEPH_OSD_OP_MAPEXT: return "mapext";
case CEPH_OSD_OP_SPARSE_READ: return "sparse-read";
case CEPH_OSD_OP_NOTIFY: return "notify";
case CEPH_OSD_OP_NOTIFY_ACK: return "notify-ack";
case CEPH_OSD_OP_ASSERT_VER: return "assert-version";
case CEPH_OSD_OP_MASKTRUNC: return "masktrunc"; case CEPH_OSD_OP_MASKTRUNC: return "masktrunc";
case CEPH_OSD_OP_CREATE: return "create";
case CEPH_OSD_OP_WRITE: return "write"; case CEPH_OSD_OP_WRITE: return "write";
case CEPH_OSD_OP_DELETE: return "delete"; case CEPH_OSD_OP_DELETE: return "delete";
case CEPH_OSD_OP_TRUNCATE: return "truncate"; case CEPH_OSD_OP_TRUNCATE: return "truncate";
...@@ -39,6 +45,11 @@ const char *ceph_osd_op_name(int op) ...@@ -39,6 +45,11 @@ const char *ceph_osd_op_name(int op)
case CEPH_OSD_OP_TMAPUP: return "tmapup"; case CEPH_OSD_OP_TMAPUP: return "tmapup";
case CEPH_OSD_OP_TMAPGET: return "tmapget"; case CEPH_OSD_OP_TMAPGET: return "tmapget";
case CEPH_OSD_OP_TMAPPUT: return "tmapput"; case CEPH_OSD_OP_TMAPPUT: return "tmapput";
case CEPH_OSD_OP_WATCH: return "watch";
case CEPH_OSD_OP_CLONERANGE: return "clonerange";
case CEPH_OSD_OP_ASSERT_SRC_VERSION: return "assert-src-version";
case CEPH_OSD_OP_SRC_CMPXATTR: return "src-cmpxattr";
case CEPH_OSD_OP_GETXATTR: return "getxattr"; case CEPH_OSD_OP_GETXATTR: return "getxattr";
case CEPH_OSD_OP_GETXATTRS: return "getxattrs"; case CEPH_OSD_OP_GETXATTRS: return "getxattrs";
...@@ -53,6 +64,10 @@ const char *ceph_osd_op_name(int op) ...@@ -53,6 +64,10 @@ const char *ceph_osd_op_name(int op)
case CEPH_OSD_OP_BALANCEREADS: return "balance-reads"; case CEPH_OSD_OP_BALANCEREADS: return "balance-reads";
case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads"; case CEPH_OSD_OP_UNBALANCEREADS: return "unbalance-reads";
case CEPH_OSD_OP_SCRUB: return "scrub"; case CEPH_OSD_OP_SCRUB: return "scrub";
case CEPH_OSD_OP_SCRUB_RESERVE: return "scrub-reserve";
case CEPH_OSD_OP_SCRUB_UNRESERVE: return "scrub-unreserve";
case CEPH_OSD_OP_SCRUB_STOP: return "scrub-stop";
case CEPH_OSD_OP_SCRUB_MAP: return "scrub-map";
case CEPH_OSD_OP_WRLOCK: return "wrlock"; case CEPH_OSD_OP_WRLOCK: return "wrlock";
case CEPH_OSD_OP_WRUNLOCK: return "wrunlock"; case CEPH_OSD_OP_WRUNLOCK: return "wrunlock";
...@@ -64,10 +79,34 @@ const char *ceph_osd_op_name(int op) ...@@ -64,10 +79,34 @@ const char *ceph_osd_op_name(int op)
case CEPH_OSD_OP_CALL: return "call"; case CEPH_OSD_OP_CALL: return "call";
case CEPH_OSD_OP_PGLS: return "pgls"; case CEPH_OSD_OP_PGLS: return "pgls";
case CEPH_OSD_OP_PGLS_FILTER: return "pgls-filter";
case CEPH_OSD_OP_OMAPGETKEYS: return "omap-get-keys";
case CEPH_OSD_OP_OMAPGETVALS: return "omap-get-vals";
case CEPH_OSD_OP_OMAPGETHEADER: return "omap-get-header";
case CEPH_OSD_OP_OMAPGETVALSBYKEYS: return "omap-get-vals-by-keys";
case CEPH_OSD_OP_OMAPSETVALS: return "omap-set-vals";
case CEPH_OSD_OP_OMAPSETHEADER: return "omap-set-header";
case CEPH_OSD_OP_OMAPCLEAR: return "omap-clear";
case CEPH_OSD_OP_OMAPRMKEYS: return "omap-rm-keys";
} }
return "???"; return "???";
} }
/*
 * Translate an OSD state value into a short printable name.
 *
 * @s: state value to look up; it is compared for equality against each
 *     known CEPH_OSD_* constant, so a value combining several states is
 *     not decoded.
 *
 * Returns a pointer to a string literal naming the state, or "???" when
 * @s matches none of the states handled here.
 */
const char *ceph_osd_state_name(int s)
{
	if (s == CEPH_OSD_EXISTS)
		return "exists";
	if (s == CEPH_OSD_UP)
		return "up";
	if (s == CEPH_OSD_AUTOOUT)
		return "autoout";
	if (s == CEPH_OSD_NEW)
		return "new";

	/* unrecognized state value */
	return "???";
}
const char *ceph_pool_op_name(int op) const char *ceph_pool_op_name(int op)
{ {
......
...@@ -287,6 +287,7 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in ...@@ -287,6 +287,7 @@ static int is_out(const struct crush_map *map, const __u32 *weight, int item, in
* @outpos: our position in that vector * @outpos: our position in that vector
* @firstn: true if choosing "first n" items, false if choosing "indep" * @firstn: true if choosing "first n" items, false if choosing "indep"
* @recurse_to_leaf: true if we want one device under each item of given type * @recurse_to_leaf: true if we want one device under each item of given type
* @descend_once: true if we should only try one descent before giving up
* @out2: second output vector for leaf items (if @recurse_to_leaf) * @out2: second output vector for leaf items (if @recurse_to_leaf)
*/ */
static int crush_choose(const struct crush_map *map, static int crush_choose(const struct crush_map *map,
...@@ -295,7 +296,7 @@ static int crush_choose(const struct crush_map *map, ...@@ -295,7 +296,7 @@ static int crush_choose(const struct crush_map *map,
int x, int numrep, int type, int x, int numrep, int type,
int *out, int outpos, int *out, int outpos,
int firstn, int recurse_to_leaf, int firstn, int recurse_to_leaf,
int *out2) int descend_once, int *out2)
{ {
int rep; int rep;
unsigned int ftotal, flocal; unsigned int ftotal, flocal;
...@@ -391,7 +392,7 @@ static int crush_choose(const struct crush_map *map, ...@@ -391,7 +392,7 @@ static int crush_choose(const struct crush_map *map,
} }
reject = 0; reject = 0;
if (recurse_to_leaf) { if (!collide && recurse_to_leaf) {
if (item < 0) { if (item < 0) {
if (crush_choose(map, if (crush_choose(map,
map->buckets[-1-item], map->buckets[-1-item],
...@@ -399,6 +400,7 @@ static int crush_choose(const struct crush_map *map, ...@@ -399,6 +400,7 @@ static int crush_choose(const struct crush_map *map,
x, outpos+1, 0, x, outpos+1, 0,
out2, outpos, out2, outpos,
firstn, 0, firstn, 0,
map->chooseleaf_descend_once,
NULL) <= outpos) NULL) <= outpos)
/* didn't get leaf */ /* didn't get leaf */
reject = 1; reject = 1;
...@@ -422,7 +424,10 @@ static int crush_choose(const struct crush_map *map, ...@@ -422,7 +424,10 @@ static int crush_choose(const struct crush_map *map,
ftotal++; ftotal++;
flocal++; flocal++;
if (collide && flocal <= map->choose_local_tries) if (reject && descend_once)
/* let outer call try again */
skip_rep = 1;
else if (collide && flocal <= map->choose_local_tries)
/* retry locally a few times */ /* retry locally a few times */
retry_bucket = 1; retry_bucket = 1;
else if (map->choose_local_fallback_tries > 0 && else if (map->choose_local_fallback_tries > 0 &&
...@@ -485,6 +490,7 @@ int crush_do_rule(const struct crush_map *map, ...@@ -485,6 +490,7 @@ int crush_do_rule(const struct crush_map *map,
int i, j; int i, j;
int numrep; int numrep;
int firstn; int firstn;
const int descend_once = 0;
if ((__u32)ruleno >= map->max_rules) { if ((__u32)ruleno >= map->max_rules) {
dprintk(" bad ruleno %d\n", ruleno); dprintk(" bad ruleno %d\n", ruleno);
...@@ -544,7 +550,8 @@ int crush_do_rule(const struct crush_map *map, ...@@ -544,7 +550,8 @@ int crush_do_rule(const struct crush_map *map,
curstep->arg2, curstep->arg2,
o+osize, j, o+osize, j,
firstn, firstn,
recurse_to_leaf, c+osize); recurse_to_leaf,
descend_once, c+osize);
} }
if (recurse_to_leaf) if (recurse_to_leaf)
......
...@@ -423,7 +423,8 @@ int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len, ...@@ -423,7 +423,8 @@ int ceph_encrypt2(struct ceph_crypto_key *secret, void *dst, size_t *dst_len,
} }
} }
int ceph_key_instantiate(struct key *key, struct key_preparsed_payload *prep) static int ceph_key_instantiate(struct key *key,
struct key_preparsed_payload *prep)
{ {
struct ceph_crypto_key *ckey; struct ceph_crypto_key *ckey;
size_t datalen = prep->datalen; size_t datalen = prep->datalen;
...@@ -458,12 +459,12 @@ int ceph_key_instantiate(struct key *key, struct key_preparsed_payload *prep) ...@@ -458,12 +459,12 @@ int ceph_key_instantiate(struct key *key, struct key_preparsed_payload *prep)
return ret; return ret;
} }
int ceph_key_match(const struct key *key, const void *description) static int ceph_key_match(const struct key *key, const void *description)
{ {
return strcmp(key->description, description) == 0; return strcmp(key->description, description) == 0;
} }
void ceph_key_destroy(struct key *key) { static void ceph_key_destroy(struct key *key) {
struct ceph_crypto_key *ckey = key->payload.data; struct ceph_crypto_key *ckey = key->payload.data;
ceph_crypto_key_destroy(ckey); ceph_crypto_key_destroy(ckey);
......
...@@ -66,9 +66,9 @@ static int osdmap_show(struct seq_file *s, void *p) ...@@ -66,9 +66,9 @@ static int osdmap_show(struct seq_file *s, void *p)
for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) { for (n = rb_first(&client->osdc.osdmap->pg_pools); n; n = rb_next(n)) {
struct ceph_pg_pool_info *pool = struct ceph_pg_pool_info *pool =
rb_entry(n, struct ceph_pg_pool_info, node); rb_entry(n, struct ceph_pg_pool_info, node);
seq_printf(s, "pg_pool %d pg_num %d / %d, lpg_num %d / %d\n", seq_printf(s, "pg_pool %llu pg_num %d / %d\n",
pool->id, pool->v.pg_num, pool->pg_num_mask, (unsigned long long)pool->id, pool->pg_num,
pool->v.lpg_num, pool->lpg_num_mask); pool->pg_num_mask);
} }
for (i = 0; i < client->osdc.osdmap->max_osd; i++) { for (i = 0; i < client->osdc.osdmap->max_osd; i++) {
struct ceph_entity_addr *addr = struct ceph_entity_addr *addr =
...@@ -123,26 +123,16 @@ static int osdc_show(struct seq_file *s, void *pp) ...@@ -123,26 +123,16 @@ static int osdc_show(struct seq_file *s, void *pp)
mutex_lock(&osdc->request_mutex); mutex_lock(&osdc->request_mutex);
for (p = rb_first(&osdc->requests); p; p = rb_next(p)) { for (p = rb_first(&osdc->requests); p; p = rb_next(p)) {
struct ceph_osd_request *req; struct ceph_osd_request *req;
struct ceph_osd_request_head *head; int opcode;
struct ceph_osd_op *op;
int num_ops;
int opcode, olen;
int i; int i;
req = rb_entry(p, struct ceph_osd_request, r_node); req = rb_entry(p, struct ceph_osd_request, r_node);
seq_printf(s, "%lld\tosd%d\t%d.%x\t", req->r_tid, seq_printf(s, "%lld\tosd%d\t%lld.%x\t", req->r_tid,
req->r_osd ? req->r_osd->o_osd : -1, req->r_osd ? req->r_osd->o_osd : -1,
le32_to_cpu(req->r_pgid.pool), req->r_pgid.pool, req->r_pgid.seed);
le16_to_cpu(req->r_pgid.ps));
head = req->r_request->front.iov_base; seq_printf(s, "%.*s", req->r_oid_len, req->r_oid);
op = (void *)(head + 1);
num_ops = le16_to_cpu(head->num_ops);
olen = le32_to_cpu(head->object_len);
seq_printf(s, "%.*s", olen,
(const char *)(head->ops + num_ops));
if (req->r_reassert_version.epoch) if (req->r_reassert_version.epoch)
seq_printf(s, "\t%u'%llu", seq_printf(s, "\t%u'%llu",
...@@ -151,10 +141,9 @@ static int osdc_show(struct seq_file *s, void *pp) ...@@ -151,10 +141,9 @@ static int osdc_show(struct seq_file *s, void *pp)
else else
seq_printf(s, "\t"); seq_printf(s, "\t");
for (i = 0; i < num_ops; i++) { for (i = 0; i < req->r_num_ops; i++) {
opcode = le16_to_cpu(op->op); opcode = le16_to_cpu(req->r_request_ops[i].op);
seq_printf(s, "\t%s", ceph_osd_op_name(opcode)); seq_printf(s, "\t%s", ceph_osd_op_name(opcode));
op++;
} }
seq_printf(s, "\n"); seq_printf(s, "\n");
......
This diff is collapsed.
...@@ -697,7 +697,7 @@ int ceph_monc_delete_snapid(struct ceph_mon_client *monc, ...@@ -697,7 +697,7 @@ int ceph_monc_delete_snapid(struct ceph_mon_client *monc,
u32 pool, u64 snapid) u32 pool, u64 snapid)
{ {
return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP, return do_poolop(monc, POOL_OP_CREATE_UNMANAGED_SNAP,
pool, snapid, 0, 0); pool, snapid, NULL, 0);
} }
......
This diff is collapsed.
This diff is collapsed.
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
/* /*
* build a vector of user pages * build a vector of user pages
*/ */
struct page **ceph_get_direct_page_vector(const char __user *data, struct page **ceph_get_direct_page_vector(const void __user *data,
int num_pages, bool write_page) int num_pages, bool write_page)
{ {
struct page **pages; struct page **pages;
...@@ -93,7 +93,7 @@ EXPORT_SYMBOL(ceph_alloc_page_vector); ...@@ -93,7 +93,7 @@ EXPORT_SYMBOL(ceph_alloc_page_vector);
* copy user data into a page vector * copy user data into a page vector
*/ */
int ceph_copy_user_to_page_vector(struct page **pages, int ceph_copy_user_to_page_vector(struct page **pages,
const char __user *data, const void __user *data,
loff_t off, size_t len) loff_t off, size_t len)
{ {
int i = 0; int i = 0;
...@@ -118,17 +118,17 @@ int ceph_copy_user_to_page_vector(struct page **pages, ...@@ -118,17 +118,17 @@ int ceph_copy_user_to_page_vector(struct page **pages,
} }
EXPORT_SYMBOL(ceph_copy_user_to_page_vector); EXPORT_SYMBOL(ceph_copy_user_to_page_vector);
int ceph_copy_to_page_vector(struct page **pages, void ceph_copy_to_page_vector(struct page **pages,
const char *data, const void *data,
loff_t off, size_t len) loff_t off, size_t len)
{ {
int i = 0; int i = 0;
size_t po = off & ~PAGE_CACHE_MASK; size_t po = off & ~PAGE_CACHE_MASK;
size_t left = len; size_t left = len;
size_t l;
while (left > 0) { while (left > 0) {
l = min_t(size_t, PAGE_CACHE_SIZE-po, left); size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
memcpy(page_address(pages[i]) + po, data, l); memcpy(page_address(pages[i]) + po, data, l);
data += l; data += l;
left -= l; left -= l;
...@@ -138,21 +138,20 @@ int ceph_copy_to_page_vector(struct page **pages, ...@@ -138,21 +138,20 @@ int ceph_copy_to_page_vector(struct page **pages,
i++; i++;
} }
} }
return len;
} }
EXPORT_SYMBOL(ceph_copy_to_page_vector); EXPORT_SYMBOL(ceph_copy_to_page_vector);
int ceph_copy_from_page_vector(struct page **pages, void ceph_copy_from_page_vector(struct page **pages,
char *data, void *data,
loff_t off, size_t len) loff_t off, size_t len)
{ {
int i = 0; int i = 0;
size_t po = off & ~PAGE_CACHE_MASK; size_t po = off & ~PAGE_CACHE_MASK;
size_t left = len; size_t left = len;
size_t l;
while (left > 0) { while (left > 0) {
l = min_t(size_t, PAGE_CACHE_SIZE-po, left); size_t l = min_t(size_t, PAGE_CACHE_SIZE-po, left);
memcpy(data, page_address(pages[i]) + po, l); memcpy(data, page_address(pages[i]) + po, l);
data += l; data += l;
left -= l; left -= l;
...@@ -162,7 +161,6 @@ int ceph_copy_from_page_vector(struct page **pages, ...@@ -162,7 +161,6 @@ int ceph_copy_from_page_vector(struct page **pages,
i++; i++;
} }
} }
return len;
} }
EXPORT_SYMBOL(ceph_copy_from_page_vector); EXPORT_SYMBOL(ceph_copy_from_page_vector);
...@@ -170,7 +168,7 @@ EXPORT_SYMBOL(ceph_copy_from_page_vector); ...@@ -170,7 +168,7 @@ EXPORT_SYMBOL(ceph_copy_from_page_vector);
* copy user data from a page vector into a user pointer * copy user data from a page vector into a user pointer
*/ */
int ceph_copy_page_vector_to_user(struct page **pages, int ceph_copy_page_vector_to_user(struct page **pages,
char __user *data, void __user *data,
loff_t off, size_t len) loff_t off, size_t len)
{ {
int i = 0; int i = 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment