Commit 3bf7878f authored by Linus Torvalds

Merge tag 'ceph-for-4.13-rc1' of git://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
 "The main item here is support for v12.y.z ("Luminous") clusters:
  RESEND_ON_SPLIT, RADOS_BACKOFF, OSDMAP_PG_UPMAP and CRUSH_CHOOSE_ARGS
  feature bits, and various other changes in the RADOS client protocol.

  On top of that we have a new fsc mount option to allow supplying
  fscache uniquifier (similar to NFS) and the usual pile of filesystem
  fixes from Zheng"

* tag 'ceph-for-4.13-rc1' of git://github.com/ceph/ceph-client: (44 commits)
  libceph: advertise support for NEW_OSDOP_ENCODING and SERVER_LUMINOUS
  libceph: osd_state is 32 bits wide in luminous
  crush: remove an obsolete comment
  crush: crush_init_workspace starts with struct crush_work
  libceph, crush: per-pool crush_choose_arg_map for crush_do_rule()
  crush: implement weight and id overrides for straw2
  libceph: apply_upmap()
  libceph: compute actual pgid in ceph_pg_to_up_acting_osds()
  libceph: pg_upmap[_items] infrastructure
  libceph: ceph_decode_skip_* helpers
  libceph: kill __{insert,lookup,remove}_pg_mapping()
  libceph: introduce and switch to decode_pg_mapping()
  libceph: don't pass pgid by value
  libceph: respect RADOS_BACKOFF backoffs
  libceph: make DEFINE_RB_* helpers more general
  libceph: avoid unnecessary pi lookups in calc_target()
  libceph: use target pi for calc_target() calculations
  libceph: always populate t->target_{oid,oloc} in calc_target()
  libceph: make sure need_resend targets reflect latest map
  libceph: delete from need_resend_linger before check_linger_pool_dne()
  ...
parents 07d306c8 33e9c8db
...@@ -530,14 +530,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) ...@@ -530,14 +530,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
long writeback_stat; long writeback_stat;
u64 truncate_size; u64 truncate_size;
u32 truncate_seq; u32 truncate_seq;
int err = 0, len = PAGE_SIZE; int err, len = PAGE_SIZE;
dout("writepage %p idx %lu\n", page, page->index); dout("writepage %p idx %lu\n", page, page->index);
if (!page->mapping || !page->mapping->host) {
dout("writepage %p - no mapping\n", page);
return -EFAULT;
}
inode = page->mapping->host; inode = page->mapping->host;
ci = ceph_inode(inode); ci = ceph_inode(inode);
fsc = ceph_inode_to_client(inode); fsc = ceph_inode_to_client(inode);
...@@ -547,7 +543,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) ...@@ -547,7 +543,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
snapc = page_snap_context(page); snapc = page_snap_context(page);
if (snapc == NULL) { if (snapc == NULL) {
dout("writepage %p page %p not dirty?\n", inode, page); dout("writepage %p page %p not dirty?\n", inode, page);
goto out; return 0;
} }
oldest = get_oldest_context(inode, &snap_size, oldest = get_oldest_context(inode, &snap_size,
&truncate_size, &truncate_seq); &truncate_size, &truncate_seq);
...@@ -555,9 +551,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) ...@@ -555,9 +551,10 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
dout("writepage %p page %p snapc %p not writeable - noop\n", dout("writepage %p page %p snapc %p not writeable - noop\n",
inode, page, snapc); inode, page, snapc);
/* we should only noop if called by kswapd */ /* we should only noop if called by kswapd */
WARN_ON((current->flags & PF_MEMALLOC) == 0); WARN_ON(!(current->flags & PF_MEMALLOC));
ceph_put_snap_context(oldest); ceph_put_snap_context(oldest);
goto out; redirty_page_for_writepage(wbc, page);
return 0;
} }
ceph_put_snap_context(oldest); ceph_put_snap_context(oldest);
...@@ -567,8 +564,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) ...@@ -567,8 +564,9 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
/* is this a partial page at end of file? */ /* is this a partial page at end of file? */
if (page_off >= snap_size) { if (page_off >= snap_size) {
dout("%p page eof %llu\n", page, snap_size); dout("%p page eof %llu\n", page, snap_size);
goto out; return 0;
} }
if (snap_size < page_off + len) if (snap_size < page_off + len)
len = snap_size - page_off; len = snap_size - page_off;
...@@ -595,7 +593,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) ...@@ -595,7 +593,7 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
dout("writepage interrupted page %p\n", page); dout("writepage interrupted page %p\n", page);
redirty_page_for_writepage(wbc, page); redirty_page_for_writepage(wbc, page);
end_page_writeback(page); end_page_writeback(page);
goto out; return err;
} }
dout("writepage setting page/mapping error %d %p\n", dout("writepage setting page/mapping error %d %p\n",
err, page); err, page);
...@@ -611,7 +609,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc) ...@@ -611,7 +609,6 @@ static int writepage_nounlock(struct page *page, struct writeback_control *wbc)
end_page_writeback(page); end_page_writeback(page);
ceph_put_wrbuffer_cap_refs(ci, 1, snapc); ceph_put_wrbuffer_cap_refs(ci, 1, snapc);
ceph_put_snap_context(snapc); /* page's reference */ ceph_put_snap_context(snapc); /* page's reference */
out:
return err; return err;
} }
...@@ -1318,7 +1315,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping, ...@@ -1318,7 +1315,7 @@ static int ceph_write_end(struct file *file, struct address_space *mapping,
struct page *page, void *fsdata) struct page *page, void *fsdata)
{ {
struct inode *inode = file_inode(file); struct inode *inode = file_inode(file);
int check_cap = 0; bool check_cap = false;
dout("write_end file %p inode %p page %p %d~%d (%d)\n", file, dout("write_end file %p inode %p page %p %d~%d (%d)\n", file,
inode, page, (int)pos, (int)copied, (int)len); inode, page, (int)pos, (int)copied, (int)len);
......
...@@ -35,18 +35,34 @@ struct fscache_netfs ceph_cache_netfs = { ...@@ -35,18 +35,34 @@ struct fscache_netfs ceph_cache_netfs = {
.version = 0, .version = 0,
}; };
static DEFINE_MUTEX(ceph_fscache_lock);
static LIST_HEAD(ceph_fscache_list);
/*
 * One registered fscache session cookie, identified by the pair
 * (cluster fsid, optional uniquifier).  Entries are allocated with
 * sizeof(*ent) + uniq_len bytes so that the uniquifier is stored inline.
 */
struct ceph_fscache_entry {
	struct list_head list;		/* link in ceph_fscache_list */
	struct fscache_cookie *fscache;	/* cookie acquired for this entry */
	struct ceph_fsid fsid;		/* copy of the client's cluster fsid */
	size_t uniq_len;		/* bytes in uniquifier[]; 0 if none */
	char uniquifier[];		/* raw bytes, not NUL-terminated */
};
static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data, static uint16_t ceph_fscache_session_get_key(const void *cookie_netfs_data,
void *buffer, uint16_t maxbuf) void *buffer, uint16_t maxbuf)
{ {
const struct ceph_fs_client* fsc = cookie_netfs_data; const struct ceph_fs_client* fsc = cookie_netfs_data;
uint16_t klen; const char *fscache_uniq = fsc->mount_options->fscache_uniq;
uint16_t fsid_len, uniq_len;
klen = sizeof(fsc->client->fsid); fsid_len = sizeof(fsc->client->fsid);
if (klen > maxbuf) uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0;
if (fsid_len + uniq_len > maxbuf)
return 0; return 0;
memcpy(buffer, &fsc->client->fsid, klen); memcpy(buffer, &fsc->client->fsid, fsid_len);
return klen; if (uniq_len)
memcpy(buffer + fsid_len, fscache_uniq, uniq_len);
return fsid_len + uniq_len;
} }
static const struct fscache_cookie_def ceph_fscache_fsid_object_def = { static const struct fscache_cookie_def ceph_fscache_fsid_object_def = {
...@@ -67,13 +83,54 @@ void ceph_fscache_unregister(void) ...@@ -67,13 +83,54 @@ void ceph_fscache_unregister(void)
int ceph_fscache_register_fs(struct ceph_fs_client* fsc) int ceph_fscache_register_fs(struct ceph_fs_client* fsc)
{ {
const struct ceph_fsid *fsid = &fsc->client->fsid;
const char *fscache_uniq = fsc->mount_options->fscache_uniq;
size_t uniq_len = fscache_uniq ? strlen(fscache_uniq) : 0;
struct ceph_fscache_entry *ent;
int err = 0;
mutex_lock(&ceph_fscache_lock);
list_for_each_entry(ent, &ceph_fscache_list, list) {
if (memcmp(&ent->fsid, fsid, sizeof(*fsid)))
continue;
if (ent->uniq_len != uniq_len)
continue;
if (uniq_len && memcmp(ent->uniquifier, fscache_uniq, uniq_len))
continue;
pr_err("fscache cookie already registered for fsid %pU\n", fsid);
pr_err(" use fsc=%%s mount option to specify a uniquifier\n");
err = -EBUSY;
goto out_unlock;
}
ent = kzalloc(sizeof(*ent) + uniq_len, GFP_KERNEL);
if (!ent) {
err = -ENOMEM;
goto out_unlock;
}
fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index, fsc->fscache = fscache_acquire_cookie(ceph_cache_netfs.primary_index,
&ceph_fscache_fsid_object_def, &ceph_fscache_fsid_object_def,
fsc, true); fsc, true);
if (!fsc->fscache)
pr_err("Unable to register fsid: %p fscache cookie\n", fsc);
return 0; if (fsc->fscache) {
memcpy(&ent->fsid, fsid, sizeof(*fsid));
if (uniq_len > 0) {
memcpy(&ent->uniquifier, fscache_uniq, uniq_len);
ent->uniq_len = uniq_len;
}
ent->fscache = fsc->fscache;
list_add_tail(&ent->list, &ceph_fscache_list);
} else {
kfree(ent);
pr_err("unable to register fscache cookie for fsid %pU\n",
fsid);
/* all other fs ignore this error */
}
out_unlock:
mutex_unlock(&ceph_fscache_lock);
return err;
} }
static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data, static uint16_t ceph_fscache_inode_get_key(const void *cookie_netfs_data,
...@@ -349,7 +406,24 @@ void ceph_invalidate_fscache_page(struct inode* inode, struct page *page) ...@@ -349,7 +406,24 @@ void ceph_invalidate_fscache_page(struct inode* inode, struct page *page)
void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc) void ceph_fscache_unregister_fs(struct ceph_fs_client* fsc)
{ {
fscache_relinquish_cookie(fsc->fscache, 0); if (fscache_cookie_valid(fsc->fscache)) {
struct ceph_fscache_entry *ent;
bool found = false;
mutex_lock(&ceph_fscache_lock);
list_for_each_entry(ent, &ceph_fscache_list, list) {
if (ent->fscache == fsc->fscache) {
list_del(&ent->list);
kfree(ent);
found = true;
break;
}
}
WARN_ON_ONCE(!found);
mutex_unlock(&ceph_fscache_lock);
__fscache_relinquish_cookie(fsc->fscache, 0);
}
fsc->fscache = NULL; fsc->fscache = NULL;
} }
......
...@@ -1653,6 +1653,21 @@ static int try_nonblocking_invalidate(struct inode *inode) ...@@ -1653,6 +1653,21 @@ static int try_nonblocking_invalidate(struct inode *inode)
return -1; return -1;
} }
/*
 * Decide whether the inode's current i_size should be reported to the
 * MDS so that it can adjust max_size.
 *
 * Returns false while CEPH_CAP_FILE_WR is set in i_flushing_caps.
 * Otherwise returns true when i_size has reached i_max_size, or when
 * i_size has passed the midpoint between i_reported_size and
 * i_max_size; false in all other cases.
 *
 * NOTE(review): ceph_inode_set_size() calls this with ci->i_ceph_lock
 * held; presumably every caller must do the same -- confirm.
 */
bool __ceph_should_report_size(struct ceph_inode_info *ci)
{
	loff_t cur_size = ci->vfs_inode.i_size;

	/* mds will adjust max size according to the reported size */
	if (ci->i_flushing_caps & CEPH_CAP_FILE_WR)
		return false;

	/* already at (or beyond) the current limit */
	if (cur_size >= ci->i_max_size)
		return true;

	/* half of previous max_size increment has been used */
	return ci->i_max_size > ci->i_reported_size &&
	       (cur_size << 1) >= ci->i_max_size + ci->i_reported_size;
}
/* /*
* Swiss army knife function to examine currently used and wanted * Swiss army knife function to examine currently used and wanted
* versus held caps. Release, flush, ack revoked caps to mds as * versus held caps. Release, flush, ack revoked caps to mds as
...@@ -1806,8 +1821,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags, ...@@ -1806,8 +1821,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
} }
/* approaching file_max? */ /* approaching file_max? */
if ((inode->i_size << 1) >= ci->i_max_size && if (__ceph_should_report_size(ci)) {
(ci->i_reported_size << 1) < ci->i_max_size) {
dout("i_size approaching max_size\n"); dout("i_size approaching max_size\n");
goto ack; goto ack;
} }
...@@ -3027,8 +3041,10 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, ...@@ -3027,8 +3041,10 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
le32_to_cpu(grant->truncate_seq), le32_to_cpu(grant->truncate_seq),
le64_to_cpu(grant->truncate_size), le64_to_cpu(grant->truncate_size),
size); size);
/* max size increase? */ }
if (ci->i_auth_cap == cap && max_size != ci->i_max_size) {
if (ci->i_auth_cap == cap && (newcaps & CEPH_CAP_ANY_FILE_WR)) {
if (max_size != ci->i_max_size) {
dout("max_size %lld -> %llu\n", dout("max_size %lld -> %llu\n",
ci->i_max_size, max_size); ci->i_max_size, max_size);
ci->i_max_size = max_size; ci->i_max_size = max_size;
...@@ -3037,6 +3053,10 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc, ...@@ -3037,6 +3053,10 @@ static void handle_cap_grant(struct ceph_mds_client *mdsc,
ci->i_requested_max_size = 0; ci->i_requested_max_size = 0;
} }
wake = true; wake = true;
} else if (ci->i_wanted_max_size > ci->i_max_size &&
ci->i_wanted_max_size > ci->i_requested_max_size) {
/* CEPH_CAP_OP_IMPORT */
wake = true;
} }
} }
...@@ -3554,7 +3574,6 @@ static void handle_cap_import(struct ceph_mds_client *mdsc, ...@@ -3554,7 +3574,6 @@ static void handle_cap_import(struct ceph_mds_client *mdsc,
} }
/* make sure we re-request max_size, if necessary */ /* make sure we re-request max_size, if necessary */
ci->i_wanted_max_size = 0;
ci->i_requested_max_size = 0; ci->i_requested_max_size = 0;
*old_issued = issued; *old_issued = issued;
...@@ -3790,6 +3809,7 @@ void ceph_handle_caps(struct ceph_mds_session *session, ...@@ -3790,6 +3809,7 @@ void ceph_handle_caps(struct ceph_mds_session *session,
*/ */
void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
{ {
struct inode *inode;
struct ceph_inode_info *ci; struct ceph_inode_info *ci;
int flags = CHECK_CAPS_NODELAY; int flags = CHECK_CAPS_NODELAY;
...@@ -3805,9 +3825,15 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc) ...@@ -3805,9 +3825,15 @@ void ceph_check_delayed_caps(struct ceph_mds_client *mdsc)
time_before(jiffies, ci->i_hold_caps_max)) time_before(jiffies, ci->i_hold_caps_max))
break; break;
list_del_init(&ci->i_cap_delay_list); list_del_init(&ci->i_cap_delay_list);
inode = igrab(&ci->vfs_inode);
spin_unlock(&mdsc->cap_delay_lock); spin_unlock(&mdsc->cap_delay_lock);
dout("check_delayed_caps on %p\n", &ci->vfs_inode);
if (inode) {
dout("check_delayed_caps on %p\n", inode);
ceph_check_caps(ci, flags, NULL); ceph_check_caps(ci, flags, NULL);
iput(inode);
}
} }
spin_unlock(&mdsc->cap_delay_lock); spin_unlock(&mdsc->cap_delay_lock);
} }
......
...@@ -1040,8 +1040,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, ...@@ -1040,8 +1040,8 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
int num_pages; int num_pages;
int written = 0; int written = 0;
int flags; int flags;
int check_caps = 0;
int ret; int ret;
bool check_caps = false;
struct timespec mtime = current_time(inode); struct timespec mtime = current_time(inode);
size_t count = iov_iter_count(from); size_t count = iov_iter_count(from);
......
...@@ -1016,6 +1016,7 @@ static void update_dentry_lease(struct dentry *dentry, ...@@ -1016,6 +1016,7 @@ static void update_dentry_lease(struct dentry *dentry,
long unsigned ttl = from_time + (duration * HZ) / 1000; long unsigned ttl = from_time + (duration * HZ) / 1000;
long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000; long unsigned half_ttl = from_time + (duration * HZ / 2) / 1000;
struct inode *dir; struct inode *dir;
struct ceph_mds_session *old_lease_session = NULL;
/* /*
* Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that * Make sure dentry's inode matches tgt_vino. NULL tgt_vino means that
...@@ -1051,8 +1052,10 @@ static void update_dentry_lease(struct dentry *dentry, ...@@ -1051,8 +1052,10 @@ static void update_dentry_lease(struct dentry *dentry,
time_before(ttl, di->time)) time_before(ttl, di->time))
goto out_unlock; /* we already have a newer lease. */ goto out_unlock; /* we already have a newer lease. */
if (di->lease_session && di->lease_session != session) if (di->lease_session && di->lease_session != session) {
goto out_unlock; old_lease_session = di->lease_session;
di->lease_session = NULL;
}
ceph_dentry_lru_touch(dentry); ceph_dentry_lru_touch(dentry);
...@@ -1065,6 +1068,8 @@ static void update_dentry_lease(struct dentry *dentry, ...@@ -1065,6 +1068,8 @@ static void update_dentry_lease(struct dentry *dentry,
di->time = ttl; di->time = ttl;
out_unlock: out_unlock:
spin_unlock(&dentry->d_lock); spin_unlock(&dentry->d_lock);
if (old_lease_session)
ceph_put_mds_session(old_lease_session);
return; return;
} }
...@@ -1653,20 +1658,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1653,20 +1658,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
return err; return err;
} }
int ceph_inode_set_size(struct inode *inode, loff_t size) bool ceph_inode_set_size(struct inode *inode, loff_t size)
{ {
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
int ret = 0; bool ret;
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size); dout("set_size %p %llu -> %llu\n", inode, inode->i_size, size);
i_size_write(inode, size); i_size_write(inode, size);
inode->i_blocks = calc_inode_blocks(size); inode->i_blocks = calc_inode_blocks(size);
/* tell the MDS if we are approaching max_size */ ret = __ceph_should_report_size(ci);
if ((size << 1) >= ci->i_max_size &&
(ci->i_reported_size << 1) < ci->i_max_size)
ret = 1;
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
return ret; return ret;
......
...@@ -127,6 +127,29 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, ...@@ -127,6 +127,29 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
dout("ceph_lock_wait_for_completion: request %llu was interrupted\n", dout("ceph_lock_wait_for_completion: request %llu was interrupted\n",
req->r_tid); req->r_tid);
mutex_lock(&mdsc->mutex);
if (test_bit(CEPH_MDS_R_GOT_RESULT, &req->r_req_flags)) {
err = 0;
} else {
/*
* ensure we aren't running concurrently with
* ceph_fill_trace or ceph_readdir_prepopulate, which
* rely on locks (dir mutex) held by our caller.
*/
mutex_lock(&req->r_fill_mutex);
req->r_err = err;
set_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags);
mutex_unlock(&req->r_fill_mutex);
if (!req->r_session) {
// haven't sent the request
err = 0;
}
}
mutex_unlock(&mdsc->mutex);
if (!err)
return 0;
intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK, intr_req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_SETFILELOCK,
USE_AUTH_MDS); USE_AUTH_MDS);
if (IS_ERR(intr_req)) if (IS_ERR(intr_req))
...@@ -146,7 +169,7 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc, ...@@ -146,7 +169,7 @@ static int ceph_lock_wait_for_completion(struct ceph_mds_client *mdsc,
if (err && err != -ERESTARTSYS) if (err && err != -ERESTARTSYS)
return err; return err;
wait_for_completion(&req->r_completion); wait_for_completion_killable(&req->r_safe_completion);
return 0; return 0;
} }
......
...@@ -3769,13 +3769,13 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc) ...@@ -3769,13 +3769,13 @@ static void ceph_mdsc_stop(struct ceph_mds_client *mdsc)
void ceph_mdsc_destroy(struct ceph_fs_client *fsc) void ceph_mdsc_destroy(struct ceph_fs_client *fsc)
{ {
struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_client *mdsc = fsc->mdsc;
dout("mdsc_destroy %p\n", mdsc); dout("mdsc_destroy %p\n", mdsc);
ceph_mdsc_stop(mdsc);
/* flush out any connection work with references to us */ /* flush out any connection work with references to us */
ceph_msgr_flush(); ceph_msgr_flush();
ceph_mdsc_stop(mdsc);
fsc->mdsc = NULL; fsc->mdsc = NULL;
kfree(mdsc); kfree(mdsc);
dout("mdsc_destroy %p done\n", mdsc); dout("mdsc_destroy %p done\n", mdsc);
......
...@@ -121,6 +121,7 @@ enum { ...@@ -121,6 +121,7 @@ enum {
/* int args above */ /* int args above */
Opt_snapdirname, Opt_snapdirname,
Opt_mds_namespace, Opt_mds_namespace,
Opt_fscache_uniq,
Opt_last_string, Opt_last_string,
/* string args above */ /* string args above */
Opt_dirstat, Opt_dirstat,
...@@ -158,6 +159,7 @@ static match_table_t fsopt_tokens = { ...@@ -158,6 +159,7 @@ static match_table_t fsopt_tokens = {
/* int args above */ /* int args above */
{Opt_snapdirname, "snapdirname=%s"}, {Opt_snapdirname, "snapdirname=%s"},
{Opt_mds_namespace, "mds_namespace=%s"}, {Opt_mds_namespace, "mds_namespace=%s"},
{Opt_fscache_uniq, "fsc=%s"},
/* string args above */ /* string args above */
{Opt_dirstat, "dirstat"}, {Opt_dirstat, "dirstat"},
{Opt_nodirstat, "nodirstat"}, {Opt_nodirstat, "nodirstat"},
...@@ -223,6 +225,14 @@ static int parse_fsopt_token(char *c, void *private) ...@@ -223,6 +225,14 @@ static int parse_fsopt_token(char *c, void *private)
if (!fsopt->mds_namespace) if (!fsopt->mds_namespace)
return -ENOMEM; return -ENOMEM;
break; break;
case Opt_fscache_uniq:
fsopt->fscache_uniq = kstrndup(argstr[0].from,
argstr[0].to-argstr[0].from,
GFP_KERNEL);
if (!fsopt->fscache_uniq)
return -ENOMEM;
fsopt->flags |= CEPH_MOUNT_OPT_FSCACHE;
break;
/* misc */ /* misc */
case Opt_wsize: case Opt_wsize:
fsopt->wsize = intval; fsopt->wsize = intval;
...@@ -317,6 +327,7 @@ static void destroy_mount_options(struct ceph_mount_options *args) ...@@ -317,6 +327,7 @@ static void destroy_mount_options(struct ceph_mount_options *args)
kfree(args->snapdir_name); kfree(args->snapdir_name);
kfree(args->mds_namespace); kfree(args->mds_namespace);
kfree(args->server_path); kfree(args->server_path);
kfree(args->fscache_uniq);
kfree(args); kfree(args);
} }
...@@ -350,8 +361,10 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt, ...@@ -350,8 +361,10 @@ static int compare_mount_options(struct ceph_mount_options *new_fsopt,
ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace); ret = strcmp_null(fsopt1->mds_namespace, fsopt2->mds_namespace);
if (ret) if (ret)
return ret; return ret;
ret = strcmp_null(fsopt1->server_path, fsopt2->server_path); ret = strcmp_null(fsopt1->server_path, fsopt2->server_path);
if (ret)
return ret;
ret = strcmp_null(fsopt1->fscache_uniq, fsopt2->fscache_uniq);
if (ret) if (ret)
return ret; return ret;
...@@ -475,8 +488,12 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) ...@@ -475,8 +488,12 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",noasyncreaddir"); seq_puts(m, ",noasyncreaddir");
if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0) if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0)
seq_puts(m, ",nodcache"); seq_puts(m, ",nodcache");
if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) if (fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) {
if (fsopt->fscache_uniq)
seq_printf(m, ",fsc=%s", fsopt->fscache_uniq);
else
seq_puts(m, ",fsc"); seq_puts(m, ",fsc");
}
if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM) if (fsopt->flags & CEPH_MOUNT_OPT_NOPOOLPERM)
seq_puts(m, ",nopoolperm"); seq_puts(m, ",nopoolperm");
...@@ -597,18 +614,11 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, ...@@ -597,18 +614,11 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
if (!fsc->wb_pagevec_pool) if (!fsc->wb_pagevec_pool)
goto fail_trunc_wq; goto fail_trunc_wq;
/* setup fscache */
if ((fsopt->flags & CEPH_MOUNT_OPT_FSCACHE) &&
(ceph_fscache_register_fs(fsc) != 0))
goto fail_fscache;
/* caps */ /* caps */
fsc->min_caps = fsopt->max_readdir; fsc->min_caps = fsopt->max_readdir;
return fsc; return fsc;
fail_fscache:
ceph_fscache_unregister_fs(fsc);
fail_trunc_wq: fail_trunc_wq:
destroy_workqueue(fsc->trunc_wq); destroy_workqueue(fsc->trunc_wq);
fail_pg_inv_wq: fail_pg_inv_wq:
...@@ -626,8 +636,6 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) ...@@ -626,8 +636,6 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
{ {
dout("destroy_fs_client %p\n", fsc); dout("destroy_fs_client %p\n", fsc);
ceph_fscache_unregister_fs(fsc);
destroy_workqueue(fsc->wb_wq); destroy_workqueue(fsc->wb_wq);
destroy_workqueue(fsc->pg_inv_wq); destroy_workqueue(fsc->pg_inv_wq);
destroy_workqueue(fsc->trunc_wq); destroy_workqueue(fsc->trunc_wq);
...@@ -636,8 +644,6 @@ static void destroy_fs_client(struct ceph_fs_client *fsc) ...@@ -636,8 +644,6 @@ static void destroy_fs_client(struct ceph_fs_client *fsc)
destroy_mount_options(fsc->mount_options); destroy_mount_options(fsc->mount_options);
ceph_fs_debugfs_cleanup(fsc);
ceph_destroy_client(fsc->client); ceph_destroy_client(fsc->client);
kfree(fsc); kfree(fsc);
...@@ -822,6 +828,13 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc) ...@@ -822,6 +828,13 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc)
if (err < 0) if (err < 0)
goto out; goto out;
/* setup fscache */
if (fsc->mount_options->flags & CEPH_MOUNT_OPT_FSCACHE) {
err = ceph_fscache_register_fs(fsc);
if (err < 0)
goto out;
}
if (!fsc->mount_options->server_path) { if (!fsc->mount_options->server_path) {
path = ""; path = "";
dout("mount opening path \\t\n"); dout("mount opening path \\t\n");
...@@ -1040,6 +1053,12 @@ static void ceph_kill_sb(struct super_block *s) ...@@ -1040,6 +1053,12 @@ static void ceph_kill_sb(struct super_block *s)
ceph_mdsc_pre_umount(fsc->mdsc); ceph_mdsc_pre_umount(fsc->mdsc);
generic_shutdown_super(s); generic_shutdown_super(s);
fsc->client->extra_mon_dispatch = NULL;
ceph_fs_debugfs_cleanup(fsc);
ceph_fscache_unregister_fs(fsc);
ceph_mdsc_destroy(fsc); ceph_mdsc_destroy(fsc);
destroy_fs_client(fsc); destroy_fs_client(fsc);
......
...@@ -73,6 +73,7 @@ struct ceph_mount_options { ...@@ -73,6 +73,7 @@ struct ceph_mount_options {
char *snapdir_name; /* default ".snap" */ char *snapdir_name; /* default ".snap" */
char *mds_namespace; /* default NULL */ char *mds_namespace; /* default NULL */
char *server_path; /* default "/" */ char *server_path; /* default "/" */
char *fscache_uniq; /* default NULL */
}; };
struct ceph_fs_client { struct ceph_fs_client {
...@@ -793,7 +794,7 @@ extern int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -793,7 +794,7 @@ extern int ceph_readdir_prepopulate(struct ceph_mds_request *req,
extern int ceph_inode_holds_cap(struct inode *inode, int mask); extern int ceph_inode_holds_cap(struct inode *inode, int mask);
extern int ceph_inode_set_size(struct inode *inode, loff_t size); extern bool ceph_inode_set_size(struct inode *inode, loff_t size);
extern void __ceph_do_pending_vmtruncate(struct inode *inode); extern void __ceph_do_pending_vmtruncate(struct inode *inode);
extern void ceph_queue_vmtruncate(struct inode *inode); extern void ceph_queue_vmtruncate(struct inode *inode);
...@@ -918,6 +919,7 @@ extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr, ...@@ -918,6 +919,7 @@ extern void ceph_put_wrbuffer_cap_refs(struct ceph_inode_info *ci, int nr,
struct ceph_snap_context *snapc); struct ceph_snap_context *snapc);
extern void ceph_flush_snaps(struct ceph_inode_info *ci, extern void ceph_flush_snaps(struct ceph_inode_info *ci,
struct ceph_mds_session **psession); struct ceph_mds_session **psession);
extern bool __ceph_should_report_size(struct ceph_inode_info *ci);
extern void ceph_check_caps(struct ceph_inode_info *ci, int flags, extern void ceph_check_caps(struct ceph_inode_info *ci, int flags,
struct ceph_mds_session *session); struct ceph_mds_session *session);
extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc); extern void ceph_check_delayed_caps(struct ceph_mds_client *mdsc);
......
...@@ -756,6 +756,9 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, ...@@ -756,6 +756,9 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
/* let's see if a virtual xattr was requested */ /* let's see if a virtual xattr was requested */
vxattr = ceph_match_vxattr(inode, name); vxattr = ceph_match_vxattr(inode, name);
if (vxattr) { if (vxattr) {
err = ceph_do_getattr(inode, 0, true);
if (err)
return err;
err = -ENODATA; err = -ENODATA;
if (!(vxattr->exists_cb && !vxattr->exists_cb(ci))) if (!(vxattr->exists_cb && !vxattr->exists_cb(ci)))
err = vxattr->getxattr_cb(ci, value, size); err = vxattr->getxattr_cb(ci, value, size);
......
...@@ -2,103 +2,174 @@ ...@@ -2,103 +2,174 @@
#define __CEPH_FEATURES #define __CEPH_FEATURES
/* /*
* feature bits * Each time we reclaim bits for reuse we need to specify another bit
* that, if present, indicates we have the new incarnation of that
* feature. Base case is 1 (first use).
*/ */
#define CEPH_FEATURE_UID (1ULL<<0) #define CEPH_FEATURE_INCARNATION_1 (0ull)
#define CEPH_FEATURE_NOSRCADDR (1ULL<<1) #define CEPH_FEATURE_INCARNATION_2 (1ull<<57) // CEPH_FEATURE_SERVER_JEWEL
#define CEPH_FEATURE_MONCLOCKCHECK (1ULL<<2)
#define CEPH_FEATURE_FLOCK (1ULL<<3) #define DEFINE_CEPH_FEATURE(bit, incarnation, name) \
#define CEPH_FEATURE_SUBSCRIBE2 (1ULL<<4) const static uint64_t CEPH_FEATURE_##name = (1ULL<<bit); \
#define CEPH_FEATURE_MONNAMES (1ULL<<5) const static uint64_t CEPH_FEATUREMASK_##name = \
#define CEPH_FEATURE_RECONNECT_SEQ (1ULL<<6) (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation);
#define CEPH_FEATURE_DIRLAYOUTHASH (1ULL<<7)
#define CEPH_FEATURE_OBJECTLOCATOR (1ULL<<8) /* this bit is ignored but still advertised by release *when* */
#define CEPH_FEATURE_PGID64 (1ULL<<9) #define DEFINE_CEPH_FEATURE_DEPRECATED(bit, incarnation, name, when) \
#define CEPH_FEATURE_INCSUBOSDMAP (1ULL<<10) const static uint64_t DEPRECATED_CEPH_FEATURE_##name = (1ULL<<bit); \
#define CEPH_FEATURE_PGPOOL3 (1ULL<<11) const static uint64_t DEPRECATED_CEPH_FEATUREMASK_##name = \
#define CEPH_FEATURE_OSDREPLYMUX (1ULL<<12) (1ULL<<bit | CEPH_FEATURE_INCARNATION_##incarnation);
#define CEPH_FEATURE_OSDENC (1ULL<<13)
#define CEPH_FEATURE_OMAP (1ULL<<14)
#define CEPH_FEATURE_MONENC (1ULL<<15)
#define CEPH_FEATURE_QUERY_T (1ULL<<16)
#define CEPH_FEATURE_INDEP_PG_MAP (1ULL<<17)
#define CEPH_FEATURE_CRUSH_TUNABLES (1ULL<<18)
#define CEPH_FEATURE_CHUNKY_SCRUB (1ULL<<19)
#define CEPH_FEATURE_MON_NULLROUTE (1ULL<<20)
#define CEPH_FEATURE_MON_GV (1ULL<<21)
#define CEPH_FEATURE_BACKFILL_RESERVATION (1ULL<<22)
#define CEPH_FEATURE_MSG_AUTH (1ULL<<23)
#define CEPH_FEATURE_RECOVERY_RESERVATION (1ULL<<24)
#define CEPH_FEATURE_CRUSH_TUNABLES2 (1ULL<<25)
#define CEPH_FEATURE_CREATEPOOLID (1ULL<<26)
#define CEPH_FEATURE_REPLY_CREATE_INODE (1ULL<<27)
#define CEPH_FEATURE_OSD_HBMSGS (1ULL<<28)
#define CEPH_FEATURE_MDSENC (1ULL<<29)
#define CEPH_FEATURE_OSDHASHPSPOOL (1ULL<<30)
#define CEPH_FEATURE_MON_SINGLE_PAXOS (1ULL<<31)
#define CEPH_FEATURE_OSD_SNAPMAPPER (1ULL<<32)
#define CEPH_FEATURE_MON_SCRUB (1ULL<<33)
#define CEPH_FEATURE_OSD_PACKED_RECOVERY (1ULL<<34)
#define CEPH_FEATURE_OSD_CACHEPOOL (1ULL<<35)
#define CEPH_FEATURE_CRUSH_V2 (1ULL<<36) /* new indep; SET_* steps */
#define CEPH_FEATURE_EXPORT_PEER (1ULL<<37)
#define CEPH_FEATURE_OSD_ERASURE_CODES (1ULL<<38)
#define CEPH_FEATURE_OSD_TMAP2OMAP (1ULL<<38) /* overlap with EC */
/* The process supports new-style OSDMap encoding. Monitors also use
this bit to determine if peers support NAK messages. */
#define CEPH_FEATURE_OSDMAP_ENC (1ULL<<39)
#define CEPH_FEATURE_MDS_INLINE_DATA (1ULL<<40)
#define CEPH_FEATURE_CRUSH_TUNABLES3 (1ULL<<41)
#define CEPH_FEATURE_OSD_PRIMARY_AFFINITY (1ULL<<41) /* overlap w/ tunables3 */
#define CEPH_FEATURE_MSGR_KEEPALIVE2 (1ULL<<42)
#define CEPH_FEATURE_OSD_POOLRESEND (1ULL<<43)
#define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V2 (1ULL<<44)
#define CEPH_FEATURE_OSD_SET_ALLOC_HINT (1ULL<<45)
#define CEPH_FEATURE_OSD_FADVISE_FLAGS (1ULL<<46)
#define CEPH_FEATURE_OSD_REPOP (1ULL<<46) /* overlap with fadvise */
#define CEPH_FEATURE_OSD_OBJECT_DIGEST (1ULL<<46) /* overlap with fadvise */
#define CEPH_FEATURE_OSD_TRANSACTION_MAY_LAYOUT (1ULL<<46) /* overlap w/ fadvise */
#define CEPH_FEATURE_MDS_QUOTA (1ULL<<47)
#define CEPH_FEATURE_CRUSH_V4 (1ULL<<48) /* straw2 buckets */
#define CEPH_FEATURE_OSD_MIN_SIZE_RECOVERY (1ULL<<49)
// duplicated since it was introduced at the same time as MIN_SIZE_RECOVERY
#define CEPH_FEATURE_OSD_PROXY_FEATURES (1ULL<<49) /* overlap w/ above */
#define CEPH_FEATURE_MON_METADATA (1ULL<<50)
#define CEPH_FEATURE_OSD_BITWISE_HOBJ_SORT (1ULL<<51) /* can sort objs bitwise */
#define CEPH_FEATURE_OSD_PROXY_WRITE_FEATURES (1ULL<<52)
#define CEPH_FEATURE_ERASURE_CODE_PLUGINS_V3 (1ULL<<53)
#define CEPH_FEATURE_OSD_HITSET_GMT (1ULL<<54)
#define CEPH_FEATURE_HAMMER_0_94_4 (1ULL<<55)
#define CEPH_FEATURE_NEW_OSDOP_ENCODING (1ULL<<56) /* New, v7 encoding */
#define CEPH_FEATURE_MON_STATEFUL_SUB (1ULL<<57) /* stateful mon subscription */
#define CEPH_FEATURE_MON_ROUTE_OSDMAP (1ULL<<57) /* peon sends osdmaps */
#define CEPH_FEATURE_CRUSH_TUNABLES5 (1ULL<<58) /* chooseleaf stable mode */
// duplicated since it was introduced at the same time as CEPH_FEATURE_CRUSH_TUNABLES5
#define CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING (1ULL<<58) /* New, v7 encoding */
#define CEPH_FEATURE_FS_FILE_LAYOUT_V2 (1ULL<<58) /* file_layout_t */
/* /*
* The introduction of CEPH_FEATURE_OSD_SNAPMAPPER caused the feature * this bit is ignored by release *unused* and not advertised by
* vector to evaluate to 64 bit ~0. To cope, we designate 1ULL << 63 * release *unadvertised*
* to mean 33 bit ~0, and introduce a helper below to do the */
* translation. #define DEFINE_CEPH_FEATURE_RETIRED(bit, inc, name, unused, unadvertised)
/*
* Test for a feature.  This test is safer than a typical mask against
* the bit because it ensures that we have the bit AND the marker for the
* bit's incarnation.  This must be used in any case where the feature
* bits may include an old meaning of the bit.
*
* @x: feature bit vector to test
* @name: feature name without the CEPH_FEATUREMASK_ prefix
*
* Evaluates to nonzero only when every bit of CEPH_FEATUREMASK_<name>
* is set in @x.
*/
#define CEPH_HAVE_FEATURE(x, name) \
(((x) & (CEPH_FEATUREMASK_##name)) == (CEPH_FEATUREMASK_##name))
/*
* Notes on deprecation:
*
* A *major* release is a release through which all upgrades must pass
* (e.g., jewel). For example, no pre-jewel server will ever talk to
* a post-jewel server (mon, osd, etc).
*
* For feature bits used *only* on the server-side:
*
* - In the first phase we indicate that a feature is DEPRECATED as of
* a particular release. This is the first major release X (say,
* jewel) that does not depend on its peers advertising the feature.
* That is, it safely assumes its peers all have the feature. We
* indicate this with the DEPRECATED macro. For example,
*
* DEFINE_CEPH_FEATURE_DEPRECATED( 2, 1, MONCLOCKCHECK, JEWEL)
*
* because 10.2.z (jewel) did not care if its peers advertised this
* feature bit.
*
* - In the second phase we stop advertising the bit and call it
* RETIRED. This can normally be done in the *next* major release
* following the one in which we marked the feature DEPRECATED. In
* the above example, for 12.0.z (luminous) we can say:
*
* DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS)
* *
* This was introduced by ceph.git commit * - The bit can be reused in the first post-luminous release, 13.0.z
* 9ea02b84104045c2ffd7e7f4e7af512953855ecd v0.58-657-g9ea02b8 * (m).
* and fixed by ceph.git commit *
* 4255b5c2fb54ae40c53284b3ab700fdfc7e61748 v0.65-263-g4255b5c * This ensures that no two versions who have different meanings for
* the bit ever speak to each other.
*/ */
#define CEPH_FEATURE_RESERVED (1ULL<<63)
DEFINE_CEPH_FEATURE( 0, 1, UID)
static inline u64 ceph_sanitize_features(u64 features) DEFINE_CEPH_FEATURE( 1, 1, NOSRCADDR)
{ DEFINE_CEPH_FEATURE_RETIRED( 2, 1, MONCLOCKCHECK, JEWEL, LUMINOUS)
if (features & CEPH_FEATURE_RESERVED) {
/* everything through OSD_SNAPMAPPER */ DEFINE_CEPH_FEATURE( 3, 1, FLOCK)
return 0x1ffffffffull; DEFINE_CEPH_FEATURE( 4, 1, SUBSCRIBE2)
} else { DEFINE_CEPH_FEATURE( 5, 1, MONNAMES)
return features; DEFINE_CEPH_FEATURE( 6, 1, RECONNECT_SEQ)
} DEFINE_CEPH_FEATURE( 7, 1, DIRLAYOUTHASH)
} DEFINE_CEPH_FEATURE( 8, 1, OBJECTLOCATOR)
DEFINE_CEPH_FEATURE( 9, 1, PGID64)
DEFINE_CEPH_FEATURE(10, 1, INCSUBOSDMAP)
DEFINE_CEPH_FEATURE(11, 1, PGPOOL3)
DEFINE_CEPH_FEATURE(12, 1, OSDREPLYMUX)
DEFINE_CEPH_FEATURE(13, 1, OSDENC)
DEFINE_CEPH_FEATURE_RETIRED(14, 1, OMAP, HAMMER, JEWEL)
DEFINE_CEPH_FEATURE(14, 2, SERVER_KRAKEN)
DEFINE_CEPH_FEATURE(15, 1, MONENC)
DEFINE_CEPH_FEATURE_RETIRED(16, 1, QUERY_T, JEWEL, LUMINOUS)
DEFINE_CEPH_FEATURE_RETIRED(17, 1, INDEP_PG_MAP, JEWEL, LUMINOUS)
DEFINE_CEPH_FEATURE(18, 1, CRUSH_TUNABLES)
DEFINE_CEPH_FEATURE_RETIRED(19, 1, CHUNKY_SCRUB, JEWEL, LUMINOUS)
DEFINE_CEPH_FEATURE_RETIRED(20, 1, MON_NULLROUTE, JEWEL, LUMINOUS)
DEFINE_CEPH_FEATURE_RETIRED(21, 1, MON_GV, HAMMER, JEWEL)
DEFINE_CEPH_FEATURE(21, 2, SERVER_LUMINOUS)
DEFINE_CEPH_FEATURE(21, 2, RESEND_ON_SPLIT) // overlap
DEFINE_CEPH_FEATURE(21, 2, RADOS_BACKOFF) // overlap
DEFINE_CEPH_FEATURE(21, 2, OSDMAP_PG_UPMAP) // overlap
DEFINE_CEPH_FEATURE(21, 2, CRUSH_CHOOSE_ARGS) // overlap
DEFINE_CEPH_FEATURE_RETIRED(22, 1, BACKFILL_RESERVATION, JEWEL, LUMINOUS)
DEFINE_CEPH_FEATURE(23, 1, MSG_AUTH)
DEFINE_CEPH_FEATURE_RETIRED(24, 1, RECOVERY_RESERVATION, JEWEL, LUNINOUS)
DEFINE_CEPH_FEATURE(25, 1, CRUSH_TUNABLES2)
DEFINE_CEPH_FEATURE(26, 1, CREATEPOOLID)
DEFINE_CEPH_FEATURE(27, 1, REPLY_CREATE_INODE)
DEFINE_CEPH_FEATURE_RETIRED(28, 1, OSD_HBMSGS, HAMMER, JEWEL)
DEFINE_CEPH_FEATURE(28, 2, SERVER_M)
DEFINE_CEPH_FEATURE(29, 1, MDSENC)
DEFINE_CEPH_FEATURE(30, 1, OSDHASHPSPOOL)
DEFINE_CEPH_FEATURE(31, 1, MON_SINGLE_PAXOS) // deprecate me
DEFINE_CEPH_FEATURE_RETIRED(32, 1, OSD_SNAPMAPPER, JEWEL, LUMINOUS)
DEFINE_CEPH_FEATURE_RETIRED(33, 1, MON_SCRUB, JEWEL, LUMINOUS)
DEFINE_CEPH_FEATURE_RETIRED(34, 1, OSD_PACKED_RECOVERY, JEWEL, LUMINOUS)
DEFINE_CEPH_FEATURE(35, 1, OSD_CACHEPOOL)
DEFINE_CEPH_FEATURE(36, 1, CRUSH_V2)
DEFINE_CEPH_FEATURE(37, 1, EXPORT_PEER)
DEFINE_CEPH_FEATURE(38, 1, OSD_ERASURE_CODES)
DEFINE_CEPH_FEATURE(38, 1, OSD_OSD_TMAP2OMAP) // overlap
DEFINE_CEPH_FEATURE(39, 1, OSDMAP_ENC)
DEFINE_CEPH_FEATURE(40, 1, MDS_INLINE_DATA)
DEFINE_CEPH_FEATURE(41, 1, CRUSH_TUNABLES3)
DEFINE_CEPH_FEATURE(41, 1, OSD_PRIMARY_AFFINITY) // overlap
DEFINE_CEPH_FEATURE(42, 1, MSGR_KEEPALIVE2)
DEFINE_CEPH_FEATURE(43, 1, OSD_POOLRESEND)
DEFINE_CEPH_FEATURE(44, 1, ERASURE_CODE_PLUGINS_V2)
DEFINE_CEPH_FEATURE_RETIRED(45, 1, OSD_SET_ALLOC_HINT, JEWEL, LUMINOUS)
DEFINE_CEPH_FEATURE(46, 1, OSD_FADVISE_FLAGS)
DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_REPOP, JEWEL, LUMINOUS) // overlap
DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_OBJECT_DIGEST, JEWEL, LUMINOUS) // overlap
DEFINE_CEPH_FEATURE_RETIRED(46, 1, OSD_TRANSACTION_MAY_LAYOUT, JEWEL, LUMINOUS) // overlap
DEFINE_CEPH_FEATURE(47, 1, MDS_QUOTA)
DEFINE_CEPH_FEATURE(48, 1, CRUSH_V4)
DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_MIN_SIZE_RECOVERY, JEWEL, LUMINOUS)
DEFINE_CEPH_FEATURE_RETIRED(49, 1, OSD_PROXY_FEATURES, JEWEL, LUMINOUS) // overlap
DEFINE_CEPH_FEATURE(50, 1, MON_METADATA)
DEFINE_CEPH_FEATURE(51, 1, OSD_BITWISE_HOBJ_SORT)
DEFINE_CEPH_FEATURE(52, 1, OSD_PROXY_WRITE_FEATURES)
DEFINE_CEPH_FEATURE(53, 1, ERASURE_CODE_PLUGINS_V3)
DEFINE_CEPH_FEATURE(54, 1, OSD_HITSET_GMT)
DEFINE_CEPH_FEATURE(55, 1, HAMMER_0_94_4)
DEFINE_CEPH_FEATURE(56, 1, NEW_OSDOP_ENCODING)
DEFINE_CEPH_FEATURE(57, 1, MON_STATEFUL_SUB)
DEFINE_CEPH_FEATURE(57, 1, MON_ROUTE_OSDMAP) // overlap
DEFINE_CEPH_FEATURE(57, 1, OSDSUBOP_NO_SNAPCONTEXT) // overlap
DEFINE_CEPH_FEATURE(57, 1, SERVER_JEWEL) // overlap
DEFINE_CEPH_FEATURE(58, 1, CRUSH_TUNABLES5)
DEFINE_CEPH_FEATURE(58, 1, NEW_OSDOPREPLY_ENCODING) // overlap
DEFINE_CEPH_FEATURE(58, 1, FS_FILE_LAYOUT_V2) // overlap
DEFINE_CEPH_FEATURE(59, 1, FS_BTIME)
DEFINE_CEPH_FEATURE(59, 1, FS_CHANGE_ATTR) // overlap
DEFINE_CEPH_FEATURE(59, 1, MSG_ADDR2) // overlap
DEFINE_CEPH_FEATURE(60, 1, BLKIN_TRACING) // *do not share this bit*
DEFINE_CEPH_FEATURE(61, 1, RESERVED2) // unused, but slow down!
DEFINE_CEPH_FEATURE(62, 1, RESERVED) // do not use; used as a sentinel
DEFINE_CEPH_FEATURE_DEPRECATED(63, 1, RESERVED_BROKEN, LUMINOUS) // client-facing
/* /*
* Features supported. * Features supported.
...@@ -113,6 +184,11 @@ static inline u64 ceph_sanitize_features(u64 features) ...@@ -113,6 +184,11 @@ static inline u64 ceph_sanitize_features(u64 features)
CEPH_FEATURE_PGPOOL3 | \ CEPH_FEATURE_PGPOOL3 | \
CEPH_FEATURE_OSDENC | \ CEPH_FEATURE_OSDENC | \
CEPH_FEATURE_CRUSH_TUNABLES | \ CEPH_FEATURE_CRUSH_TUNABLES | \
CEPH_FEATURE_SERVER_LUMINOUS | \
CEPH_FEATURE_RESEND_ON_SPLIT | \
CEPH_FEATURE_RADOS_BACKOFF | \
CEPH_FEATURE_OSDMAP_PG_UPMAP | \
CEPH_FEATURE_CRUSH_CHOOSE_ARGS | \
CEPH_FEATURE_MSG_AUTH | \ CEPH_FEATURE_MSG_AUTH | \
CEPH_FEATURE_CRUSH_TUNABLES2 | \ CEPH_FEATURE_CRUSH_TUNABLES2 | \
CEPH_FEATURE_REPLY_CREATE_INODE | \ CEPH_FEATURE_REPLY_CREATE_INODE | \
...@@ -126,7 +202,11 @@ static inline u64 ceph_sanitize_features(u64 features) ...@@ -126,7 +202,11 @@ static inline u64 ceph_sanitize_features(u64 features)
CEPH_FEATURE_CRUSH_TUNABLES3 | \ CEPH_FEATURE_CRUSH_TUNABLES3 | \
CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \ CEPH_FEATURE_OSD_PRIMARY_AFFINITY | \
CEPH_FEATURE_MSGR_KEEPALIVE2 | \ CEPH_FEATURE_MSGR_KEEPALIVE2 | \
CEPH_FEATURE_OSD_POOLRESEND | \
CEPH_FEATURE_CRUSH_V4 | \ CEPH_FEATURE_CRUSH_V4 | \
CEPH_FEATURE_NEW_OSDOP_ENCODING | \
CEPH_FEATURE_SERVER_JEWEL | \
CEPH_FEATURE_MON_STATEFUL_SUB | \
CEPH_FEATURE_CRUSH_TUNABLES5 | \ CEPH_FEATURE_CRUSH_TUNABLES5 | \
CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING) CEPH_FEATURE_NEW_OSDOPREPLY_ENCODING)
......
...@@ -147,6 +147,7 @@ struct ceph_dir_layout { ...@@ -147,6 +147,7 @@ struct ceph_dir_layout {
#define CEPH_MSG_OSD_OP 42 #define CEPH_MSG_OSD_OP 42
#define CEPH_MSG_OSD_OPREPLY 43 #define CEPH_MSG_OSD_OPREPLY 43
#define CEPH_MSG_WATCH_NOTIFY 44 #define CEPH_MSG_WATCH_NOTIFY 44
#define CEPH_MSG_OSD_BACKOFF 61
/* watch-notify operations */ /* watch-notify operations */
......
...@@ -132,6 +132,66 @@ static inline char *ceph_extract_encoded_string(void **p, void *end, ...@@ -132,6 +132,66 @@ static inline char *ceph_extract_encoded_string(void **p, void *end,
return ERR_PTR(-ERANGE); return ERR_PTR(-ERANGE);
} }
/*
* skip helpers
*/
/*
 * Skip @n bytes of the buffer being decoded.
 *
 * @p:   pointer to the decode cursor; *@p is advanced by @n
 * @end: end of the buffer, forwarded to ceph_decode_need()
 * @n:   number of bytes to skip
 * @bad: forwarded unchanged to ceph_decode_need() (presumably its
 *       bail-out label - confirm against that macro)
 *
 * @p and @n are parenthesized in the cursor update so that compound
 * argument expressions (e.g. "pp + 1") expand as intended.
 */
#define ceph_decode_skip_n(p, end, n, bad)			\
	do {							\
		ceph_decode_need(p, end, n, bad);		\
		*(p) += (n);					\
	} while (0)
/*
* Fixed-width shorthands: each one skips sizeof() of the named integer
* type by expanding to ceph_decode_skip_n() with the same @p, @end and
* @bad arguments.
*/
#define ceph_decode_skip_64(p, end, bad) \
ceph_decode_skip_n(p, end, sizeof(u64), bad)
#define ceph_decode_skip_32(p, end, bad) \
ceph_decode_skip_n(p, end, sizeof(u32), bad)
#define ceph_decode_skip_16(p, end, bad) \
ceph_decode_skip_n(p, end, sizeof(u16), bad)
#define ceph_decode_skip_8(p, end, bad) \
ceph_decode_skip_n(p, end, sizeof(u8), bad)
/*
 * Skip a length-prefixed blob: a 32-bit length is read with
 * ceph_decode_32_safe() and that many bytes are then skipped with
 * ceph_decode_skip_n().
 *
 * The temporary has a macro-specific name so that an argument
 * expression mentioning the caller's own "len" is not captured by it.
 */
#define ceph_decode_skip_string(p, end, bad)				\
	do {								\
		u32 ceph_skip_len;					\
									\
		ceph_decode_32_safe(p, end, ceph_skip_len, bad);	\
		ceph_decode_skip_n(p, end, ceph_skip_len, bad);		\
	} while (0)
/*
 * Skip a counted sequence: a 32-bit element count is read with
 * ceph_decode_32_safe(), then ceph_decode_skip_<type>() is invoked once
 * per element.
 *
 * @type: suffix selecting the per-element helper (e.g. 32, 64, string)
 *
 * The counter has a macro-specific name so that an argument expression
 * mentioning the caller's own "len" is not captured by it.
 */
#define ceph_decode_skip_set(p, end, type, bad)				\
	do {								\
		u32 ceph_skip_len;					\
									\
		ceph_decode_32_safe(p, end, ceph_skip_len, bad);	\
		while (ceph_skip_len--)					\
			ceph_decode_skip_##type(p, end, bad);		\
	} while (0)
/*
 * Skip a counted sequence of key/value pairs: a 32-bit pair count is
 * read with ceph_decode_32_safe(), then for every pair the key is
 * skipped with ceph_decode_skip_<ktype>() and the value with
 * ceph_decode_skip_<vtype>().
 *
 * The counter has a macro-specific name so that an argument expression
 * mentioning the caller's own "len" is not captured by it.
 */
#define ceph_decode_skip_map(p, end, ktype, vtype, bad)			\
	do {								\
		u32 ceph_skip_len;					\
									\
		ceph_decode_32_safe(p, end, ceph_skip_len, bad);	\
		while (ceph_skip_len--) {				\
			ceph_decode_skip_##ktype(p, end, bad);		\
			ceph_decode_skip_##vtype(p, end, bad);		\
		}							\
	} while (0)
/*
 * Skip a counted sequence whose values are themselves maps: a 32-bit
 * count is read with ceph_decode_32_safe(), then for every entry the
 * key is skipped with ceph_decode_skip_<ktype1>() and the nested map
 * with ceph_decode_skip_map(ktype2, vtype2).
 *
 * The counter has a macro-specific name so that an argument expression
 * mentioning the caller's own "len" is not captured by it.  The nested
 * ceph_decode_skip_map() expansion declares its own counter in an inner
 * scope, so the two do not interfere.
 */
#define ceph_decode_skip_map_of_map(p, end, ktype1, ktype2, vtype2, bad) \
	do {								\
		u32 ceph_skip_outer_len;				\
									\
		ceph_decode_32_safe(p, end, ceph_skip_outer_len, bad);	\
		while (ceph_skip_outer_len--) {				\
			ceph_decode_skip_##ktype1(p, end, bad);		\
			ceph_decode_skip_map(p, end, ktype2, vtype2, bad); \
		}							\
	} while (0)
/* /*
* struct ceph_timespec <-> struct timespec * struct ceph_timespec <-> struct timespec
*/ */
......
...@@ -184,10 +184,11 @@ static inline int calc_pages_for(u64 off, u64 len) ...@@ -184,10 +184,11 @@ static inline int calc_pages_for(u64 off, u64 len)
(off >> PAGE_SHIFT); (off >> PAGE_SHIFT);
} }
/* #define RB_BYVAL(a) (a)
* These are not meant to be generic - an integer key is assumed. #define RB_BYPTR(a) (&(a))
*/ #define RB_CMP3WAY(a, b) ((a) < (b) ? -1 : (a) > (b))
#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
#define DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld) \
static void insert_##name(struct rb_root *root, type *t) \ static void insert_##name(struct rb_root *root, type *t) \
{ \ { \
struct rb_node **n = &root->rb_node; \ struct rb_node **n = &root->rb_node; \
...@@ -197,11 +198,13 @@ static void insert_##name(struct rb_root *root, type *t) \ ...@@ -197,11 +198,13 @@ static void insert_##name(struct rb_root *root, type *t) \
\ \
while (*n) { \ while (*n) { \
type *cur = rb_entry(*n, type, nodefld); \ type *cur = rb_entry(*n, type, nodefld); \
int cmp; \
\ \
parent = *n; \ parent = *n; \
if (t->keyfld < cur->keyfld) \ cmp = cmpexp(keyexp(t->keyfld), keyexp(cur->keyfld)); \
if (cmp < 0) \
n = &(*n)->rb_left; \ n = &(*n)->rb_left; \
else if (t->keyfld > cur->keyfld) \ else if (cmp > 0) \
n = &(*n)->rb_right; \ n = &(*n)->rb_right; \
else \ else \
BUG(); \ BUG(); \
...@@ -217,19 +220,24 @@ static void erase_##name(struct rb_root *root, type *t) \ ...@@ -217,19 +220,24 @@ static void erase_##name(struct rb_root *root, type *t) \
RB_CLEAR_NODE(&t->nodefld); \ RB_CLEAR_NODE(&t->nodefld); \
} }
#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \ /*
extern type __lookup_##name##_key; \ * @lookup_param_type is a parameter and not constructed from (@type,
static type *lookup_##name(struct rb_root *root, \ * @keyfld) with typeof() because adding const is too unwieldy.
typeof(__lookup_##name##_key.keyfld) key) \ */
#define DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, cmpexp, keyexp, \
lookup_param_type, nodefld) \
static type *lookup_##name(struct rb_root *root, lookup_param_type key) \
{ \ { \
struct rb_node *n = root->rb_node; \ struct rb_node *n = root->rb_node; \
\ \
while (n) { \ while (n) { \
type *cur = rb_entry(n, type, nodefld); \ type *cur = rb_entry(n, type, nodefld); \
int cmp; \
\ \
if (key < cur->keyfld) \ cmp = cmpexp(key, keyexp(cur->keyfld)); \
if (cmp < 0) \
n = n->rb_left; \ n = n->rb_left; \
else if (key > cur->keyfld) \ else if (cmp > 0) \
n = n->rb_right; \ n = n->rb_right; \
else \ else \
return cur; \ return cur; \
...@@ -238,6 +246,23 @@ static type *lookup_##name(struct rb_root *root, \ ...@@ -238,6 +246,23 @@ static type *lookup_##name(struct rb_root *root, \
return NULL; \ return NULL; \
} }
/*
* Convenience wrapper: expands to both DEFINE_RB_INSDEL_FUNCS2() and
* DEFINE_RB_LOOKUP_FUNC2() with the same @name, @type, @keyfld, @cmpexp,
* @keyexp and @nodefld, so a single invocation emits the insert/erase
* helpers together with the lookup helper.
*/
#define DEFINE_RB_FUNCS2(name, type, keyfld, cmpexp, keyexp, \
lookup_param_type, nodefld) \
DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, cmpexp, keyexp, nodefld) \
DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, cmpexp, keyexp, \
lookup_param_type, nodefld)
/*
* Shorthands for integer keys.
*
* Keys are passed by value (RB_BYVAL) and ordered with RB_CMP3WAY.
*/
#define DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
DEFINE_RB_INSDEL_FUNCS2(name, type, keyfld, RB_CMP3WAY, RB_BYVAL, nodefld)
/*
* Integer-key lookup shorthand.  The extern object declared here appears
* to exist only so that typeof() can name the type of @keyfld for the
* lookup_##name() key parameter; within this macro it is not used for
* anything else.
*/
#define DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) \
extern type __lookup_##name##_key; \
DEFINE_RB_LOOKUP_FUNC2(name, type, keyfld, RB_CMP3WAY, RB_BYVAL, \
typeof(__lookup_##name##_key.keyfld), nodefld)
#define DEFINE_RB_FUNCS(name, type, keyfld, nodefld) \ #define DEFINE_RB_FUNCS(name, type, keyfld, nodefld) \
DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \ DEFINE_RB_INSDEL_FUNCS(name, type, keyfld, nodefld) \
DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld) DEFINE_RB_LOOKUP_FUNC(name, type, keyfld, nodefld)
......
...@@ -44,6 +44,8 @@ struct ceph_connection_operations { ...@@ -44,6 +44,8 @@ struct ceph_connection_operations {
struct ceph_msg_header *hdr, struct ceph_msg_header *hdr,
int *skip); int *skip);
void (*reencode_message) (struct ceph_msg *msg);
int (*sign_message) (struct ceph_msg *msg); int (*sign_message) (struct ceph_msg *msg);
int (*check_message_signature) (struct ceph_msg *msg); int (*check_message_signature) (struct ceph_msg *msg);
}; };
......
#ifndef _FS_CEPH_OSD_CLIENT_H #ifndef _FS_CEPH_OSD_CLIENT_H
#define _FS_CEPH_OSD_CLIENT_H #define _FS_CEPH_OSD_CLIENT_H
#include <linux/bitrev.h>
#include <linux/completion.h> #include <linux/completion.h>
#include <linux/kref.h> #include <linux/kref.h>
#include <linux/mempool.h> #include <linux/mempool.h>
...@@ -36,6 +37,8 @@ struct ceph_osd { ...@@ -36,6 +37,8 @@ struct ceph_osd {
struct ceph_connection o_con; struct ceph_connection o_con;
struct rb_root o_requests; struct rb_root o_requests;
struct rb_root o_linger_requests; struct rb_root o_linger_requests;
struct rb_root o_backoff_mappings;
struct rb_root o_backoffs_by_id;
struct list_head o_osd_lru; struct list_head o_osd_lru;
struct ceph_auth_handshake o_auth; struct ceph_auth_handshake o_auth;
unsigned long lru_ttl; unsigned long lru_ttl;
...@@ -136,7 +139,8 @@ struct ceph_osd_request_target { ...@@ -136,7 +139,8 @@ struct ceph_osd_request_target {
struct ceph_object_id target_oid; struct ceph_object_id target_oid;
struct ceph_object_locator target_oloc; struct ceph_object_locator target_oloc;
struct ceph_pg pgid; struct ceph_pg pgid; /* last raw pg we mapped to */
struct ceph_spg spgid; /* last actual spg we mapped to */
u32 pg_num; u32 pg_num;
u32 pg_num_mask; u32 pg_num_mask;
struct ceph_osds acting; struct ceph_osds acting;
...@@ -148,6 +152,9 @@ struct ceph_osd_request_target { ...@@ -148,6 +152,9 @@ struct ceph_osd_request_target {
unsigned int flags; /* CEPH_OSD_FLAG_* */ unsigned int flags; /* CEPH_OSD_FLAG_* */
bool paused; bool paused;
u32 epoch;
u32 last_force_resend;
int osd; int osd;
}; };
...@@ -193,7 +200,6 @@ struct ceph_osd_request { ...@@ -193,7 +200,6 @@ struct ceph_osd_request {
unsigned long r_stamp; /* jiffies, send or check time */ unsigned long r_stamp; /* jiffies, send or check time */
unsigned long r_start_stamp; /* jiffies */ unsigned long r_start_stamp; /* jiffies */
int r_attempts; int r_attempts;
u32 r_last_force_resend;
u32 r_map_dne_bound; u32 r_map_dne_bound;
struct ceph_osd_req_op r_ops[]; struct ceph_osd_req_op r_ops[];
...@@ -203,6 +209,23 @@ struct ceph_request_redirect { ...@@ -203,6 +209,23 @@ struct ceph_request_redirect {
struct ceph_object_locator oloc; struct ceph_object_locator oloc;
}; };
/*
* osd request identifier
*
* caller name + incarnation# + tid to uniquely identify this request
*
* Packed, with explicitly little-endian integer fields.
*/
struct ceph_osd_reqid {
struct ceph_entity_name name; /* caller name */
__le64 tid;
__le32 inc; /* incarnation# */
} __packed;
/*
* Three little-endian 64-bit ids (trace, span, parent span), packed.
* NOTE(review): presumably the blkin tracing blob carried in OSD
* messages - confirm against the encoder/decoder that uses it.
*/
struct ceph_blkin_trace_info {
__le64 trace_id;
__le64 span_id;
__le64 parent_span_id;
} __packed;
typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie, typedef void (*rados_watchcb2_t)(void *arg, u64 notify_id, u64 cookie,
u64 notifier_id, void *data, size_t data_len); u64 notifier_id, void *data, size_t data_len);
typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err); typedef void (*rados_watcherrcb_t)(void *arg, u64 cookie, int err);
...@@ -221,7 +244,6 @@ struct ceph_osd_linger_request { ...@@ -221,7 +244,6 @@ struct ceph_osd_linger_request {
struct list_head pending_lworks; struct list_head pending_lworks;
struct ceph_osd_request_target t; struct ceph_osd_request_target t;
u32 last_force_resend;
u32 map_dne_bound; u32 map_dne_bound;
struct timespec mtime; struct timespec mtime;
...@@ -256,6 +278,48 @@ struct ceph_watch_item { ...@@ -256,6 +278,48 @@ struct ceph_watch_item {
struct ceph_entity_addr addr; struct ceph_entity_addr addr;
}; };
/*
* One rbtree entry per spgid, carrying its own rbtree of backoffs.
* NOTE(review): presumably linked into ceph_osd::o_backoff_mappings and
* keyed by @spgid - confirm against the insert/lookup helpers.
*/
struct ceph_spg_mapping {
struct rb_node node;
struct ceph_spg spgid;
struct rb_root backoffs; /* root of this spg's backoff tree */
};
/*
* Object id made of three (pointer, length) byte ranges - key, oid and
* nspace - plus snapid, hash, is_max and pool.
*
* @hash_reverse_bits is derived data: ceph_hoid_build_hash_cache() sets
* it to bitrev32(@hash).
*/
struct ceph_hobject_id {
void *key;
size_t key_len;
void *oid;
size_t oid_len;
u64 snapid;
u32 hash;
u8 is_max;
void *nspace;
size_t nspace_len;
s64 pool;
/* cache */
u32 hash_reverse_bits; /* bitrev32(hash) */
};
static inline void ceph_hoid_build_hash_cache(struct ceph_hobject_id *hoid)
{
hoid->hash_reverse_bits = bitrev32(hoid->hash);
}
/*
* PG-wide backoff: [begin, end)
* per-object backoff: begin == end
*
* A backoff carries two rbtree nodes, so it can sit in two trees at once.
* NOTE(review): presumably @spg_node links into
* ceph_spg_mapping::backoffs and @id_node into
* ceph_osd::o_backoffs_by_id (keyed by @id) - confirm against the
* insert helpers.
*/
struct ceph_osd_backoff {
struct rb_node spg_node;
struct rb_node id_node;
struct ceph_spg spgid;
u64 id;
struct ceph_hobject_id *begin;
struct ceph_hobject_id *end;
};
#define CEPH_LINGER_ID_START 0xffff000000000000ULL #define CEPH_LINGER_ID_START 0xffff000000000000ULL
struct ceph_osd_client { struct ceph_osd_client {
......
...@@ -24,7 +24,15 @@ struct ceph_pg { ...@@ -24,7 +24,15 @@ struct ceph_pg {
uint32_t seed; uint32_t seed;
}; };
/* "no shard" value for ceph_spg::shard; parenthesized so the unary minus
 * cannot bind to neighbouring tokens at the expansion site */
#define CEPH_SPG_NOSHARD	(-1)
/*
* A PG id plus a signed 8-bit shard.  CEPH_SPG_NOSHARD (-1) is the
* reserved "no shard" value.
* NOTE(review): presumably used for PGs that are not sharded - confirm.
*/
struct ceph_spg {
struct ceph_pg pgid;
s8 shard;
};
int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs); int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs);
int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs);
#define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id #define CEPH_POOL_FLAG_HASHPSPOOL (1ULL << 0) /* hash pg seed and pool id
together */ together */
...@@ -135,10 +143,14 @@ struct ceph_pg_mapping { ...@@ -135,10 +143,14 @@ struct ceph_pg_mapping {
struct { struct {
int len; int len;
int osds[]; int osds[];
} pg_temp; } pg_temp, pg_upmap;
struct { struct {
int osd; int osd;
} primary_temp; } primary_temp;
struct {
int len;
int from_to[][2];
} pg_upmap_items;
}; };
}; };
...@@ -150,13 +162,17 @@ struct ceph_osdmap { ...@@ -150,13 +162,17 @@ struct ceph_osdmap {
u32 flags; /* CEPH_OSDMAP_* */ u32 flags; /* CEPH_OSDMAP_* */
u32 max_osd; /* size of osd_state, _offload, _addr arrays */ u32 max_osd; /* size of osd_state, _offload, _addr arrays */
u8 *osd_state; /* CEPH_OSD_* */ u32 *osd_state; /* CEPH_OSD_* */
u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */ u32 *osd_weight; /* 0 = failed, 0x10000 = 100% normal */
struct ceph_entity_addr *osd_addr; struct ceph_entity_addr *osd_addr;
struct rb_root pg_temp; struct rb_root pg_temp;
struct rb_root primary_temp; struct rb_root primary_temp;
/* remap (post-CRUSH, pre-up) */
struct rb_root pg_upmap; /* PG := raw set */
struct rb_root pg_upmap_items; /* from -> to within raw set */
u32 *osd_primary_affinity; u32 *osd_primary_affinity;
struct rb_root pg_pools; struct rb_root pg_pools;
...@@ -187,7 +203,7 @@ static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd) ...@@ -187,7 +203,7 @@ static inline bool ceph_osd_is_down(struct ceph_osdmap *map, int osd)
return !ceph_osd_is_up(map, osd); return !ceph_osd_is_up(map, osd);
} }
extern char *ceph_osdmap_state_str(char *str, int len, int state); char *ceph_osdmap_state_str(char *str, int len, u32 state);
extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd); extern u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd);
static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map, static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
...@@ -198,11 +214,13 @@ static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map, ...@@ -198,11 +214,13 @@ static inline struct ceph_entity_addr *ceph_osd_addr(struct ceph_osdmap *map,
return &map->osd_addr[osd]; return &map->osd_addr[osd];
} }
#define CEPH_PGID_ENCODING_LEN (1 + 8 + 4 + 4)
static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid) static inline int ceph_decode_pgid(void **p, void *end, struct ceph_pg *pgid)
{ {
__u8 version; __u8 version;
if (!ceph_has_room(p, end, 1 + 8 + 4 + 4)) { if (!ceph_has_room(p, end, CEPH_PGID_ENCODING_LEN)) {
pr_warn("incomplete pg encoding\n"); pr_warn("incomplete pg encoding\n");
return -EINVAL; return -EINVAL;
} }
...@@ -240,6 +258,8 @@ static inline void ceph_osds_init(struct ceph_osds *set) ...@@ -240,6 +258,8 @@ static inline void ceph_osds_init(struct ceph_osds *set)
void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src); void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src);
bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num,
u32 new_pg_num);
bool ceph_is_new_interval(const struct ceph_osds *old_acting, bool ceph_is_new_interval(const struct ceph_osds *old_acting,
const struct ceph_osds *new_acting, const struct ceph_osds *new_acting,
const struct ceph_osds *old_up, const struct ceph_osds *old_up,
...@@ -262,15 +282,24 @@ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout, ...@@ -262,15 +282,24 @@ extern int ceph_calc_file_object_mapping(struct ceph_file_layout *layout,
u64 off, u64 len, u64 off, u64 len,
u64 *bno, u64 *oxoff, u64 *oxlen); u64 *bno, u64 *oxoff, u64 *oxlen);
int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi,
const struct ceph_object_id *oid,
const struct ceph_object_locator *oloc,
struct ceph_pg *raw_pgid);
int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
struct ceph_object_id *oid, const struct ceph_object_id *oid,
struct ceph_object_locator *oloc, const struct ceph_object_locator *oloc,
struct ceph_pg *raw_pgid); struct ceph_pg *raw_pgid);
void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
struct ceph_pg_pool_info *pi,
const struct ceph_pg *raw_pgid, const struct ceph_pg *raw_pgid,
struct ceph_osds *up, struct ceph_osds *up,
struct ceph_osds *acting); struct ceph_osds *acting);
bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
struct ceph_pg_pool_info *pi,
const struct ceph_pg *raw_pgid,
struct ceph_spg *spgid);
int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
const struct ceph_pg *raw_pgid); const struct ceph_pg *raw_pgid);
......
...@@ -439,6 +439,12 @@ enum { ...@@ -439,6 +439,12 @@ enum {
const char *ceph_osd_watch_op_name(int o); const char *ceph_osd_watch_op_name(int o);
/*
* Backoff operation codes.
* NOTE(review): presumably the op carried in CEPH_MSG_OSD_BACKOFF
* messages - confirm against the message encoder/decoder.
*/
enum {
CEPH_OSD_BACKOFF_OP_BLOCK = 1,
CEPH_OSD_BACKOFF_OP_ACK_BLOCK = 2,
CEPH_OSD_BACKOFF_OP_UNBLOCK = 3,
};
/* /*
* an individual object operation. each may be accompanied by some data * an individual object operation. each may be accompanied by some data
* payload * payload
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
#define CEPH_CRUSH_CRUSH_H #define CEPH_CRUSH_CRUSH_H
#ifdef __KERNEL__ #ifdef __KERNEL__
# include <linux/rbtree.h>
# include <linux/types.h> # include <linux/types.h>
#else #else
# include "crush_compat.h" # include "crush_compat.h"
...@@ -137,6 +138,68 @@ struct crush_bucket { ...@@ -137,6 +138,68 @@ struct crush_bucket {
}; };
/** @ingroup API
*
* Replacement weights for each item in a bucket. The size of the
* array must be exactly the size of the straw2 bucket, just like the
* item_weights array.
*
*/
struct crush_weight_set {
__u32 *weights; /*!< 16.16 fixed point weights
in the same order as items */
__u32 size; /*!< size of the __weights__ array */
};
/** @ingroup API
*
* Replacement weights and ids for a given straw2 bucket, for
* placement purposes.
*
* When crush_do_rule() chooses the Nth item from a straw2 bucket, the
* replacement weights found at __weight_set[N]__ are used instead of
* the weights from __item_weights__. If __N__ is greater than or equal
* to __weight_set_size__, the weights found at __weight_set_size-1__
* are used instead. For instance if __weight_set__ is:
*
* [ [ 0x10000, 0x20000 ], // position 0
* [ 0x20000, 0x40000 ] ] // position 1
*
* choosing the 0th item will use position 0 weights [ 0x10000, 0x20000 ]
* choosing the 1st item will use position 1 weights [ 0x20000, 0x40000 ]
* choosing the 2nd item will use position 1 weights [ 0x20000, 0x40000 ]
* etc.
*
*/
struct crush_choose_arg {
__s32 *ids; /*!< values to use instead of items */
__u32 ids_size; /*!< size of the __ids__ array */
struct crush_weight_set *weight_set; /*!< weight replacements for
a given position */
__u32 weight_set_size; /*!< size of the __weight_set__ array */
};
/** @ingroup API
*
* Replacement weights and ids for each bucket in the crushmap. The
* __size__ of the __args__ array must be exactly the same as
* __map->max_buckets__.
*
* The __crush_choose_arg__ at index N will be used when choosing
* an item from the __map->buckets[N]__ bucket, provided it
* is a straw2 bucket.
*
*/
struct crush_choose_arg_map {
#ifdef __KERNEL__
/* kernel only: rbtree linkage plus an index; presumably the tree is
keyed by choose_args_index - confirm against the lookup helper */
struct rb_node node;
u64 choose_args_index;
#endif
struct crush_choose_arg *args; /*!< replacement for each bucket
in the crushmap */
__u32 size; /*!< size of the __args__ array */
};
struct crush_bucket_uniform { struct crush_bucket_uniform {
struct crush_bucket h; struct crush_bucket h;
__u32 item_weight; /* 16-bit fixed point; all items equally weighted */ __u32 item_weight; /* 16-bit fixed point; all items equally weighted */
...@@ -236,6 +299,9 @@ struct crush_map { ...@@ -236,6 +299,9 @@ struct crush_map {
__u32 allowed_bucket_algs; __u32 allowed_bucket_algs;
__u32 *choose_tries; __u32 *choose_tries;
#else
/* CrushWrapper::choose_args */
struct rb_root choose_args;
#endif #endif
}; };
......
...@@ -11,11 +11,10 @@ ...@@ -11,11 +11,10 @@
#include "crush.h" #include "crush.h"
extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size); extern int crush_find_rule(const struct crush_map *map, int ruleset, int type, int size);
extern int crush_do_rule(const struct crush_map *map, int crush_do_rule(const struct crush_map *map,
int ruleno, int ruleno, int x, int *result, int result_max,
int x, int *result, int result_max, const __u32 *weight, int weight_max,
const __u32 *weights, int weight_max, void *cwin, const struct crush_choose_arg *choose_args);
void *cwin);
/* /*
* Returns the exact amount of workspace that will need to be used * Returns the exact amount of workspace that will need to be used
......
...@@ -85,6 +85,7 @@ const char *ceph_msg_type_name(int type) ...@@ -85,6 +85,7 @@ const char *ceph_msg_type_name(int type)
case CEPH_MSG_OSD_OP: return "osd_op"; case CEPH_MSG_OSD_OP: return "osd_op";
case CEPH_MSG_OSD_OPREPLY: return "osd_opreply"; case CEPH_MSG_OSD_OPREPLY: return "osd_opreply";
case CEPH_MSG_WATCH_NOTIFY: return "watch_notify"; case CEPH_MSG_WATCH_NOTIFY: return "watch_notify";
case CEPH_MSG_OSD_BACKOFF: return "osd_backoff";
default: return "unknown"; default: return "unknown";
} }
} }
......
#ifdef __KERNEL__ #ifdef __KERNEL__
# include <linux/slab.h> # include <linux/slab.h>
# include <linux/crush/crush.h> # include <linux/crush/crush.h>
void clear_choose_args(struct crush_map *c);
#else #else
# include "crush_compat.h" # include "crush_compat.h"
# include "crush.h" # include "crush.h"
...@@ -127,6 +128,8 @@ void crush_destroy(struct crush_map *map) ...@@ -127,6 +128,8 @@ void crush_destroy(struct crush_map *map)
#ifndef __KERNEL__ #ifndef __KERNEL__
kfree(map->choose_tries); kfree(map->choose_tries);
#else
clear_choose_args(map);
#endif #endif
kfree(map); kfree(map);
} }
......
...@@ -302,19 +302,42 @@ static __u64 crush_ln(unsigned int xin) ...@@ -302,19 +302,42 @@ static __u64 crush_ln(unsigned int xin)
* *
*/ */
/*
 * Pick the weight array to use for @bucket at @position.
 *
 * Without an override (@arg NULL, no weight_set, or an empty one) the
 * bucket's own item_weights are returned.  Otherwise the weight set at
 * @position is used, with positions at or beyond weight_set_size
 * clamped to the last available set.
 */
static __u32 *get_choose_arg_weights(const struct crush_bucket_straw2 *bucket,
				     const struct crush_choose_arg *arg,
				     int position)
{
	__u32 last;

	if (!arg || !arg->weight_set || arg->weight_set_size == 0)
		return bucket->item_weights;

	/* weight_set_size is known to be non-zero here */
	last = arg->weight_set_size - 1;
	if (position > last)
		position = last;

	return arg->weight_set[position].weights;
}
/*
 * Pick the id array to use for @bucket: the replacement ids from @arg
 * when @arg is non-NULL and carries an ids array, otherwise the
 * bucket's own items.
 */
static __s32 *get_choose_arg_ids(const struct crush_bucket_straw2 *bucket,
				 const struct crush_choose_arg *arg)
{
	return (arg && arg->ids) ? arg->ids : bucket->h.items;
}
static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket, static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
int x, int r) int x, int r,
const struct crush_choose_arg *arg,
int position)
{ {
unsigned int i, high = 0; unsigned int i, high = 0;
unsigned int u; unsigned int u;
unsigned int w;
__s64 ln, draw, high_draw = 0; __s64 ln, draw, high_draw = 0;
__u32 *weights = get_choose_arg_weights(bucket, arg, position);
__s32 *ids = get_choose_arg_ids(bucket, arg);
for (i = 0; i < bucket->h.size; i++) { for (i = 0; i < bucket->h.size; i++) {
w = bucket->item_weights[i]; dprintk("weight 0x%x item %d\n", weights[i], ids[i]);
if (w) { if (weights[i]) {
u = crush_hash32_3(bucket->h.hash, x, u = crush_hash32_3(bucket->h.hash, x, ids[i], r);
bucket->h.items[i], r);
u &= 0xffff; u &= 0xffff;
/* /*
...@@ -335,7 +358,7 @@ static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket, ...@@ -335,7 +358,7 @@ static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
* weight means a larger (less negative) value * weight means a larger (less negative) value
* for draw. * for draw.
*/ */
draw = div64_s64(ln, w); draw = div64_s64(ln, weights[i]);
} else { } else {
draw = S64_MIN; draw = S64_MIN;
} }
...@@ -352,7 +375,9 @@ static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket, ...@@ -352,7 +375,9 @@ static int bucket_straw2_choose(const struct crush_bucket_straw2 *bucket,
static int crush_bucket_choose(const struct crush_bucket *in, static int crush_bucket_choose(const struct crush_bucket *in,
struct crush_work_bucket *work, struct crush_work_bucket *work,
int x, int r) int x, int r,
const struct crush_choose_arg *arg,
int position)
{ {
dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r); dprintk(" crush_bucket_choose %d x=%d r=%d\n", in->id, x, r);
BUG_ON(in->size == 0); BUG_ON(in->size == 0);
...@@ -374,7 +399,7 @@ static int crush_bucket_choose(const struct crush_bucket *in, ...@@ -374,7 +399,7 @@ static int crush_bucket_choose(const struct crush_bucket *in,
case CRUSH_BUCKET_STRAW2: case CRUSH_BUCKET_STRAW2:
return bucket_straw2_choose( return bucket_straw2_choose(
(const struct crush_bucket_straw2 *)in, (const struct crush_bucket_straw2 *)in,
x, r); x, r, arg, position);
default: default:
dprintk("unknown bucket %d alg %d\n", in->id, in->alg); dprintk("unknown bucket %d alg %d\n", in->id, in->alg);
return in->items[0]; return in->items[0];
...@@ -436,7 +461,8 @@ static int crush_choose_firstn(const struct crush_map *map, ...@@ -436,7 +461,8 @@ static int crush_choose_firstn(const struct crush_map *map,
unsigned int vary_r, unsigned int vary_r,
unsigned int stable, unsigned int stable,
int *out2, int *out2,
int parent_r) int parent_r,
const struct crush_choose_arg *choose_args)
{ {
int rep; int rep;
unsigned int ftotal, flocal; unsigned int ftotal, flocal;
...@@ -486,7 +512,10 @@ static int crush_choose_firstn(const struct crush_map *map, ...@@ -486,7 +512,10 @@ static int crush_choose_firstn(const struct crush_map *map,
else else
item = crush_bucket_choose( item = crush_bucket_choose(
in, work->work[-1-in->id], in, work->work[-1-in->id],
x, r); x, r,
(choose_args ?
&choose_args[-1-in->id] : 0),
outpos);
if (item >= map->max_devices) { if (item >= map->max_devices) {
dprintk(" bad item %d\n", item); dprintk(" bad item %d\n", item);
skip_rep = 1; skip_rep = 1;
...@@ -543,7 +572,8 @@ static int crush_choose_firstn(const struct crush_map *map, ...@@ -543,7 +572,8 @@ static int crush_choose_firstn(const struct crush_map *map,
vary_r, vary_r,
stable, stable,
NULL, NULL,
sub_r) <= outpos) sub_r,
choose_args) <= outpos)
/* didn't get leaf */ /* didn't get leaf */
reject = 1; reject = 1;
} else { } else {
...@@ -620,7 +650,8 @@ static void crush_choose_indep(const struct crush_map *map, ...@@ -620,7 +650,8 @@ static void crush_choose_indep(const struct crush_map *map,
unsigned int recurse_tries, unsigned int recurse_tries,
int recurse_to_leaf, int recurse_to_leaf,
int *out2, int *out2,
int parent_r) int parent_r,
const struct crush_choose_arg *choose_args)
{ {
const struct crush_bucket *in = bucket; const struct crush_bucket *in = bucket;
int endpos = outpos + left; int endpos = outpos + left;
...@@ -692,7 +723,10 @@ static void crush_choose_indep(const struct crush_map *map, ...@@ -692,7 +723,10 @@ static void crush_choose_indep(const struct crush_map *map,
item = crush_bucket_choose( item = crush_bucket_choose(
in, work->work[-1-in->id], in, work->work[-1-in->id],
x, r); x, r,
(choose_args ?
&choose_args[-1-in->id] : 0),
outpos);
if (item >= map->max_devices) { if (item >= map->max_devices) {
dprintk(" bad item %d\n", item); dprintk(" bad item %d\n", item);
out[rep] = CRUSH_ITEM_NONE; out[rep] = CRUSH_ITEM_NONE;
...@@ -746,7 +780,8 @@ static void crush_choose_indep(const struct crush_map *map, ...@@ -746,7 +780,8 @@ static void crush_choose_indep(const struct crush_map *map,
x, 1, numrep, 0, x, 1, numrep, 0,
out2, rep, out2, rep,
recurse_tries, 0, recurse_tries, 0,
0, NULL, r); 0, NULL, r,
choose_args);
if (out2[rep] == CRUSH_ITEM_NONE) { if (out2[rep] == CRUSH_ITEM_NONE) {
/* placed nothing; no leaf */ /* placed nothing; no leaf */
break; break;
...@@ -823,7 +858,7 @@ void crush_init_workspace(const struct crush_map *map, void *v) ...@@ -823,7 +858,7 @@ void crush_init_workspace(const struct crush_map *map, void *v)
* set the pointer first and then reserve the space for it to * set the pointer first and then reserve the space for it to
* point to by incrementing the point. * point to by incrementing the point.
*/ */
v += sizeof(struct crush_work *); v += sizeof(struct crush_work);
w->work = v; w->work = v;
v += map->max_buckets * sizeof(struct crush_work_bucket *); v += map->max_buckets * sizeof(struct crush_work_bucket *);
for (b = 0; b < map->max_buckets; ++b) { for (b = 0; b < map->max_buckets; ++b) {
...@@ -854,11 +889,12 @@ void crush_init_workspace(const struct crush_map *map, void *v) ...@@ -854,11 +889,12 @@ void crush_init_workspace(const struct crush_map *map, void *v)
* @weight: weight vector (for map leaves) * @weight: weight vector (for map leaves)
* @weight_max: size of weight vector * @weight_max: size of weight vector
* @cwin: pointer to at least crush_work_size() bytes of memory * @cwin: pointer to at least crush_work_size() bytes of memory
* @choose_args: weights and ids for each known bucket
*/ */
int crush_do_rule(const struct crush_map *map, int crush_do_rule(const struct crush_map *map,
int ruleno, int x, int *result, int result_max, int ruleno, int x, int *result, int result_max,
const __u32 *weight, int weight_max, const __u32 *weight, int weight_max,
void *cwin) void *cwin, const struct crush_choose_arg *choose_args)
{ {
int result_len; int result_len;
struct crush_work *cw = cwin; struct crush_work *cw = cwin;
...@@ -968,11 +1004,6 @@ int crush_do_rule(const struct crush_map *map, ...@@ -968,11 +1004,6 @@ int crush_do_rule(const struct crush_map *map,
for (i = 0; i < wsize; i++) { for (i = 0; i < wsize; i++) {
int bno; int bno;
/*
* see CRUSH_N, CRUSH_N_MINUS macros.
* basically, numrep <= 0 means relative to
* the provided result_max
*/
numrep = curstep->arg1; numrep = curstep->arg1;
if (numrep <= 0) { if (numrep <= 0) {
numrep += result_max; numrep += result_max;
...@@ -1013,7 +1044,8 @@ int crush_do_rule(const struct crush_map *map, ...@@ -1013,7 +1044,8 @@ int crush_do_rule(const struct crush_map *map,
vary_r, vary_r,
stable, stable,
c+osize, c+osize,
0); 0,
choose_args);
} else { } else {
out_size = ((numrep < (result_max-osize)) ? out_size = ((numrep < (result_max-osize)) ?
numrep : (result_max-osize)); numrep : (result_max-osize));
...@@ -1030,7 +1062,8 @@ int crush_do_rule(const struct crush_map *map, ...@@ -1030,7 +1062,8 @@ int crush_do_rule(const struct crush_map *map,
choose_leaf_tries : 1, choose_leaf_tries : 1,
recurse_to_leaf, recurse_to_leaf,
c+osize, c+osize,
0); 0,
choose_args);
osize += out_size; osize += out_size;
} }
} }
......
...@@ -77,7 +77,7 @@ static int osdmap_show(struct seq_file *s, void *p) ...@@ -77,7 +77,7 @@ static int osdmap_show(struct seq_file *s, void *p)
} }
for (i = 0; i < map->max_osd; i++) { for (i = 0; i < map->max_osd; i++) {
struct ceph_entity_addr *addr = &map->osd_addr[i]; struct ceph_entity_addr *addr = &map->osd_addr[i];
int state = map->osd_state[i]; u32 state = map->osd_state[i];
char sb[64]; char sb[64];
seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n", seq_printf(s, "osd%d\t%s\t%3d%%\t(%s)\t%3d%%\n",
...@@ -104,6 +104,29 @@ static int osdmap_show(struct seq_file *s, void *p) ...@@ -104,6 +104,29 @@ static int osdmap_show(struct seq_file *s, void *p)
seq_printf(s, "primary_temp %llu.%x %d\n", pg->pgid.pool, seq_printf(s, "primary_temp %llu.%x %d\n", pg->pgid.pool,
pg->pgid.seed, pg->primary_temp.osd); pg->pgid.seed, pg->primary_temp.osd);
} }
for (n = rb_first(&map->pg_upmap); n; n = rb_next(n)) {
struct ceph_pg_mapping *pg =
rb_entry(n, struct ceph_pg_mapping, node);
seq_printf(s, "pg_upmap %llu.%x [", pg->pgid.pool,
pg->pgid.seed);
for (i = 0; i < pg->pg_upmap.len; i++)
seq_printf(s, "%s%d", (i == 0 ? "" : ","),
pg->pg_upmap.osds[i]);
seq_printf(s, "]\n");
}
for (n = rb_first(&map->pg_upmap_items); n; n = rb_next(n)) {
struct ceph_pg_mapping *pg =
rb_entry(n, struct ceph_pg_mapping, node);
seq_printf(s, "pg_upmap_items %llu.%x [", pg->pgid.pool,
pg->pgid.seed);
for (i = 0; i < pg->pg_upmap_items.len; i++)
seq_printf(s, "%s%d->%d", (i == 0 ? "" : ","),
pg->pg_upmap_items.from_to[i][0],
pg->pg_upmap_items.from_to[i][1]);
seq_printf(s, "]\n");
}
up_read(&osdc->lock); up_read(&osdc->lock);
return 0; return 0;
...@@ -147,17 +170,26 @@ static int monc_show(struct seq_file *s, void *p) ...@@ -147,17 +170,26 @@ static int monc_show(struct seq_file *s, void *p)
return 0; return 0;
} }
/*
 * Print @spgid as "<pool>.<seed in hex>", followed by "s<shard>" unless
 * the shard is CEPH_SPG_NOSHARD.
 */
static void dump_spgid(struct seq_file *s, const struct ceph_spg *spgid)
{
	seq_printf(s, "%llu.%x", spgid->pgid.pool, spgid->pgid.seed);

	/* no shard suffix for CEPH_SPG_NOSHARD */
	if (spgid->shard == CEPH_SPG_NOSHARD)
		return;

	seq_printf(s, "s%d", spgid->shard);
}
static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t) static void dump_target(struct seq_file *s, struct ceph_osd_request_target *t)
{ {
int i; int i;
seq_printf(s, "osd%d\t%llu.%x\t[", t->osd, t->pgid.pool, t->pgid.seed); seq_printf(s, "osd%d\t%llu.%x\t", t->osd, t->pgid.pool, t->pgid.seed);
dump_spgid(s, &t->spgid);
seq_puts(s, "\t[");
for (i = 0; i < t->up.size; i++) for (i = 0; i < t->up.size; i++)
seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]); seq_printf(s, "%s%d", (!i ? "" : ","), t->up.osds[i]);
seq_printf(s, "]/%d\t[", t->up.primary); seq_printf(s, "]/%d\t[", t->up.primary);
for (i = 0; i < t->acting.size; i++) for (i = 0; i < t->acting.size; i++)
seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]); seq_printf(s, "%s%d", (!i ? "" : ","), t->acting.osds[i]);
seq_printf(s, "]/%d\t", t->acting.primary); seq_printf(s, "]/%d\te%u\t", t->acting.primary, t->epoch);
if (t->target_oloc.pool_ns) { if (t->target_oloc.pool_ns) {
seq_printf(s, "%*pE/%*pE\t0x%x", seq_printf(s, "%*pE/%*pE\t0x%x",
(int)t->target_oloc.pool_ns->len, (int)t->target_oloc.pool_ns->len,
...@@ -234,6 +266,73 @@ static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd) ...@@ -234,6 +266,73 @@ static void dump_linger_requests(struct seq_file *s, struct ceph_osd *osd)
mutex_unlock(&osd->lock); mutex_unlock(&osd->lock);
} }
/*
 * Print @snapid: "head" for CEPH_NOSNAP, "snapdir" for CEPH_SNAPDIR,
 * and the raw value in hex for anything else.
 */
static void dump_snapid(struct seq_file *s, u64 snapid)
{
	if (snapid == CEPH_NOSNAP) {
		seq_puts(s, "head");
		return;
	}

	if (snapid == CEPH_SNAPDIR) {
		seq_puts(s, "snapdir");
		return;
	}

	seq_printf(s, "%llx", snapid);
}
/*
 * Print the @len bytes at @name.  Bytes in the range 32..126 other
 * than '%', ':' and '/' are emitted unchanged; every other byte is
 * emitted as '%' followed by two lowercase hex digits.
 */
static void dump_name_escaped(struct seq_file *s, unsigned char *name,
			      size_t len)
{
	size_t pos;

	for (pos = 0; pos < len; pos++) {
		unsigned char c = name[pos];

		if (c >= 32 && c < 127 && c != '%' && c != ':' && c != '/')
			seq_putc(s, c);
		else
			seq_printf(s, "%%%02x", c);
	}
}
/*
 * Print @hoid.  A hoid with ->is_max set is shown as "MAX"; one with
 * snapid 0, hash 0 and pool S64_MIN (and ->is_max clear) is shown as
 * "MIN".  Everything else is printed as
 *
 *   <pool>:<hash_reverse_bits, 8 hex digits>:<nspace>:<key>:<oid>:<snapid>
 *
 * with the three names escaped by dump_name_escaped().
 */
static void dump_hoid(struct seq_file *s, const struct ceph_hobject_id *hoid)
{
	if (hoid->is_max) {
		seq_puts(s, "MAX");
		return;
	}

	/* ->is_max is known to be clear from here on */
	if (hoid->pool == S64_MIN && hoid->snapid == 0 && hoid->hash == 0) {
		seq_puts(s, "MIN");
		return;
	}

	seq_printf(s, "%lld:%08x:", hoid->pool, hoid->hash_reverse_bits);

	dump_name_escaped(s, hoid->nspace, hoid->nspace_len);
	seq_putc(s, ':');

	dump_name_escaped(s, hoid->key, hoid->key_len);
	seq_putc(s, ':');

	dump_name_escaped(s, hoid->oid, hoid->oid_len);
	seq_putc(s, ':');

	dump_snapid(s, hoid->snapid);
}
/*
 * Emit one line per backoff found in @osd's ->o_backoffs_by_id tree:
 *
 *   osd<N>\t<spgid>\t<id>\t<begin hoid>\t<end hoid>\n
 *
 * The tree is walked with osd->lock held.
 */
static void dump_backoffs(struct seq_file *s, struct ceph_osd *osd)
{
	struct rb_node *node;

	mutex_lock(&osd->lock);

	node = rb_first(&osd->o_backoffs_by_id);
	while (node) {
		struct ceph_osd_backoff *bo =
		    rb_entry(node, struct ceph_osd_backoff, id_node);

		seq_printf(s, "osd%d\t", osd->o_osd);
		dump_spgid(s, &bo->spgid);
		seq_printf(s, "\t%llu\t", bo->id);
		dump_hoid(s, bo->begin);
		seq_putc(s, '\t');
		dump_hoid(s, bo->end);
		seq_putc(s, '\n');

		node = rb_next(node);
	}

	mutex_unlock(&osd->lock);
}
static int osdc_show(struct seq_file *s, void *pp) static int osdc_show(struct seq_file *s, void *pp)
{ {
struct ceph_client *client = s->private; struct ceph_client *client = s->private;
...@@ -259,6 +358,13 @@ static int osdc_show(struct seq_file *s, void *pp) ...@@ -259,6 +358,13 @@ static int osdc_show(struct seq_file *s, void *pp)
} }
dump_linger_requests(s, &osdc->homeless_osd); dump_linger_requests(s, &osdc->homeless_osd);
seq_puts(s, "BACKOFFS\n");
for (n = rb_first(&osdc->osds); n; n = rb_next(n)) {
struct ceph_osd *osd = rb_entry(n, struct ceph_osd, o_node);
dump_backoffs(s, osd);
}
up_read(&osdc->lock); up_read(&osdc->lock);
return 0; return 0;
} }
......
...@@ -1288,13 +1288,16 @@ static void prepare_write_message(struct ceph_connection *con) ...@@ -1288,13 +1288,16 @@ static void prepare_write_message(struct ceph_connection *con)
m->hdr.seq = cpu_to_le64(++con->out_seq); m->hdr.seq = cpu_to_le64(++con->out_seq);
m->needs_out_seq = false; m->needs_out_seq = false;
} }
WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));
if (con->ops->reencode_message)
con->ops->reencode_message(m);
dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n", dout("prepare_write_message %p seq %lld type %d len %d+%d+%zd\n",
m, con->out_seq, le16_to_cpu(m->hdr.type), m, con->out_seq, le16_to_cpu(m->hdr.type),
le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len), le32_to_cpu(m->hdr.front_len), le32_to_cpu(m->hdr.middle_len),
m->data_length); m->data_length);
BUG_ON(le32_to_cpu(m->hdr.front_len) != m->front.iov_len); WARN_ON(m->front.iov_len != le32_to_cpu(m->hdr.front_len));
WARN_ON(m->data_length != le32_to_cpu(m->hdr.data_len));
/* tag + hdr + front + middle */ /* tag + hdr + front + middle */
con_out_kvec_add(con, sizeof (tag_msg), &tag_msg); con_out_kvec_add(con, sizeof (tag_msg), &tag_msg);
...@@ -2033,8 +2036,7 @@ static int process_connect(struct ceph_connection *con) ...@@ -2033,8 +2036,7 @@ static int process_connect(struct ceph_connection *con)
{ {
u64 sup_feat = from_msgr(con->msgr)->supported_features; u64 sup_feat = from_msgr(con->msgr)->supported_features;
u64 req_feat = from_msgr(con->msgr)->required_features; u64 req_feat = from_msgr(con->msgr)->required_features;
u64 server_feat = ceph_sanitize_features( u64 server_feat = le64_to_cpu(con->in_reply.features);
le64_to_cpu(con->in_reply.features));
int ret; int ret;
dout("process_connect on %p tag %d\n", con, (int)con->in_tag); dout("process_connect on %p tag %d\n", con, (int)con->in_tag);
......
...@@ -6,6 +6,7 @@ ...@@ -6,6 +6,7 @@
#include <linux/random.h> #include <linux/random.h>
#include <linux/sched.h> #include <linux/sched.h>
#include <linux/ceph/ceph_features.h>
#include <linux/ceph/mon_client.h> #include <linux/ceph/mon_client.h>
#include <linux/ceph/libceph.h> #include <linux/ceph/libceph.h>
#include <linux/ceph/debugfs.h> #include <linux/ceph/debugfs.h>
...@@ -297,6 +298,10 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc, ...@@ -297,6 +298,10 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc,
mutex_lock(&monc->mutex); mutex_lock(&monc->mutex);
if (monc->sub_renew_sent) { if (monc->sub_renew_sent) {
/*
* This is only needed for legacy (infernalis or older)
* MONs -- see delayed_work().
*/
monc->sub_renew_after = monc->sub_renew_sent + monc->sub_renew_after = monc->sub_renew_sent +
(seconds >> 1) * HZ - 1; (seconds >> 1) * HZ - 1;
dout("%s sent %lu duration %d renew after %lu\n", __func__, dout("%s sent %lu duration %d renew after %lu\n", __func__,
...@@ -955,7 +960,8 @@ static void delayed_work(struct work_struct *work) ...@@ -955,7 +960,8 @@ static void delayed_work(struct work_struct *work)
__validate_auth(monc); __validate_auth(monc);
} }
if (is_auth) { if (is_auth &&
!(monc->con.peer_features & CEPH_FEATURE_MON_STATEFUL_SUB)) {
unsigned long now = jiffies; unsigned long now = jiffies;
dout("%s renew subs? now %lu renew after %lu\n", dout("%s renew subs? now %lu renew after %lu\n",
......
...@@ -12,6 +12,7 @@ ...@@ -12,6 +12,7 @@
#include <linux/bio.h> #include <linux/bio.h>
#endif #endif
#include <linux/ceph/ceph_features.h>
#include <linux/ceph/libceph.h> #include <linux/ceph/libceph.h>
#include <linux/ceph/osd_client.h> #include <linux/ceph/osd_client.h>
#include <linux/ceph/messenger.h> #include <linux/ceph/messenger.h>
...@@ -49,6 +50,7 @@ static void link_linger(struct ceph_osd *osd, ...@@ -49,6 +50,7 @@ static void link_linger(struct ceph_osd *osd,
struct ceph_osd_linger_request *lreq); struct ceph_osd_linger_request *lreq);
static void unlink_linger(struct ceph_osd *osd, static void unlink_linger(struct ceph_osd *osd,
struct ceph_osd_linger_request *lreq); struct ceph_osd_linger_request *lreq);
static void clear_backoffs(struct ceph_osd *osd);
#if 1 #if 1
static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem) static inline bool rwsem_is_wrlocked(struct rw_semaphore *sem)
...@@ -373,6 +375,7 @@ static void target_copy(struct ceph_osd_request_target *dest, ...@@ -373,6 +375,7 @@ static void target_copy(struct ceph_osd_request_target *dest,
ceph_oloc_copy(&dest->target_oloc, &src->target_oloc); ceph_oloc_copy(&dest->target_oloc, &src->target_oloc);
dest->pgid = src->pgid; /* struct */ dest->pgid = src->pgid; /* struct */
dest->spgid = src->spgid; /* struct */
dest->pg_num = src->pg_num; dest->pg_num = src->pg_num;
dest->pg_num_mask = src->pg_num_mask; dest->pg_num_mask = src->pg_num_mask;
ceph_osds_copy(&dest->acting, &src->acting); ceph_osds_copy(&dest->acting, &src->acting);
...@@ -384,6 +387,9 @@ static void target_copy(struct ceph_osd_request_target *dest, ...@@ -384,6 +387,9 @@ static void target_copy(struct ceph_osd_request_target *dest,
dest->flags = src->flags; dest->flags = src->flags;
dest->paused = src->paused; dest->paused = src->paused;
dest->epoch = src->epoch;
dest->last_force_resend = src->last_force_resend;
dest->osd = src->osd; dest->osd = src->osd;
} }
...@@ -537,7 +543,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, ...@@ -537,7 +543,7 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
} }
EXPORT_SYMBOL(ceph_osdc_alloc_request); EXPORT_SYMBOL(ceph_osdc_alloc_request);
static int ceph_oloc_encoding_size(struct ceph_object_locator *oloc) static int ceph_oloc_encoding_size(const struct ceph_object_locator *oloc)
{ {
return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0); return 8 + 4 + 4 + 4 + (oloc->pool_ns ? oloc->pool_ns->len : 0);
} }
...@@ -552,17 +558,21 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp) ...@@ -552,17 +558,21 @@ int ceph_osdc_alloc_messages(struct ceph_osd_request *req, gfp_t gfp)
WARN_ON(ceph_oloc_empty(&req->r_base_oloc)); WARN_ON(ceph_oloc_empty(&req->r_base_oloc));
/* create request message */ /* create request message */
msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */ msg_size = CEPH_ENCODING_START_BLK_LEN +
msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */ CEPH_PGID_ENCODING_LEN + 1; /* spgid */
msg_size += 4 + 4 + 4; /* hash, osdmap_epoch, flags */
msg_size += CEPH_ENCODING_START_BLK_LEN +
sizeof(struct ceph_osd_reqid); /* reqid */
msg_size += sizeof(struct ceph_blkin_trace_info); /* trace */
msg_size += 4 + sizeof(struct ceph_timespec); /* client_inc, mtime */
msg_size += CEPH_ENCODING_START_BLK_LEN + msg_size += CEPH_ENCODING_START_BLK_LEN +
ceph_oloc_encoding_size(&req->r_base_oloc); /* oloc */ ceph_oloc_encoding_size(&req->r_base_oloc); /* oloc */
msg_size += 1 + 8 + 4 + 4; /* pgid */
msg_size += 4 + req->r_base_oid.name_len; /* oid */ msg_size += 4 + req->r_base_oid.name_len; /* oid */
msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op); msg_size += 2 + req->r_num_ops * sizeof(struct ceph_osd_op);
msg_size += 8; /* snapid */ msg_size += 8; /* snapid */
msg_size += 8; /* snap_seq */ msg_size += 8; /* snap_seq */
msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0); msg_size += 4 + 8 * (req->r_snapc ? req->r_snapc->num_snaps : 0);
msg_size += 4; /* retry_attempt */ msg_size += 4 + 8; /* retry_attempt, features */
if (req->r_mempool) if (req->r_mempool)
msg = ceph_msgpool_get(&osdc->msgpool_op, 0); msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
...@@ -1010,6 +1020,8 @@ static void osd_init(struct ceph_osd *osd) ...@@ -1010,6 +1020,8 @@ static void osd_init(struct ceph_osd *osd)
RB_CLEAR_NODE(&osd->o_node); RB_CLEAR_NODE(&osd->o_node);
osd->o_requests = RB_ROOT; osd->o_requests = RB_ROOT;
osd->o_linger_requests = RB_ROOT; osd->o_linger_requests = RB_ROOT;
osd->o_backoff_mappings = RB_ROOT;
osd->o_backoffs_by_id = RB_ROOT;
INIT_LIST_HEAD(&osd->o_osd_lru); INIT_LIST_HEAD(&osd->o_osd_lru);
INIT_LIST_HEAD(&osd->o_keepalive_item); INIT_LIST_HEAD(&osd->o_keepalive_item);
osd->o_incarnation = 1; osd->o_incarnation = 1;
...@@ -1021,6 +1033,8 @@ static void osd_cleanup(struct ceph_osd *osd) ...@@ -1021,6 +1033,8 @@ static void osd_cleanup(struct ceph_osd *osd)
WARN_ON(!RB_EMPTY_NODE(&osd->o_node)); WARN_ON(!RB_EMPTY_NODE(&osd->o_node));
WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests)); WARN_ON(!RB_EMPTY_ROOT(&osd->o_requests));
WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests)); WARN_ON(!RB_EMPTY_ROOT(&osd->o_linger_requests));
WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoff_mappings));
WARN_ON(!RB_EMPTY_ROOT(&osd->o_backoffs_by_id));
WARN_ON(!list_empty(&osd->o_osd_lru)); WARN_ON(!list_empty(&osd->o_osd_lru));
WARN_ON(!list_empty(&osd->o_keepalive_item)); WARN_ON(!list_empty(&osd->o_keepalive_item));
...@@ -1141,6 +1155,7 @@ static void close_osd(struct ceph_osd *osd) ...@@ -1141,6 +1155,7 @@ static void close_osd(struct ceph_osd *osd)
unlink_linger(osd, lreq); unlink_linger(osd, lreq);
link_linger(&osdc->homeless_osd, lreq); link_linger(&osdc->homeless_osd, lreq);
} }
clear_backoffs(osd);
__remove_osd_from_lru(osd); __remove_osd_from_lru(osd);
erase_osd(&osdc->osds, osd); erase_osd(&osdc->osds, osd);
...@@ -1297,7 +1312,7 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc, ...@@ -1297,7 +1312,7 @@ static bool target_should_be_paused(struct ceph_osd_client *osdc,
ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) || ceph_osdmap_flag(osdc, CEPH_OSDMAP_FULL) ||
__pool_full(pi); __pool_full(pi);
WARN_ON(pi->id != t->base_oloc.pool); WARN_ON(pi->id != t->target_oloc.pool);
return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) || return ((t->flags & CEPH_OSD_FLAG_READ) && pauserd) ||
((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) || ((t->flags & CEPH_OSD_FLAG_WRITE) && pausewr) ||
(osdc->osdmap->epoch < osdc->epoch_barrier); (osdc->osdmap->epoch < osdc->epoch_barrier);
...@@ -1311,19 +1326,21 @@ enum calc_target_result { ...@@ -1311,19 +1326,21 @@ enum calc_target_result {
static enum calc_target_result calc_target(struct ceph_osd_client *osdc, static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
struct ceph_osd_request_target *t, struct ceph_osd_request_target *t,
u32 *last_force_resend, struct ceph_connection *con,
bool any_change) bool any_change)
{ {
struct ceph_pg_pool_info *pi; struct ceph_pg_pool_info *pi;
struct ceph_pg pgid, last_pgid; struct ceph_pg pgid, last_pgid;
struct ceph_osds up, acting; struct ceph_osds up, acting;
bool force_resend = false; bool force_resend = false;
bool need_check_tiering = false; bool unpaused = false;
bool need_resend = false; bool legacy_change;
bool split = false;
bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE); bool sort_bitwise = ceph_osdmap_flag(osdc, CEPH_OSDMAP_SORTBITWISE);
enum calc_target_result ct_res; enum calc_target_result ct_res;
int ret; int ret;
t->epoch = osdc->osdmap->epoch;
pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool); pi = ceph_pg_pool_by_id(osdc->osdmap, t->base_oloc.pool);
if (!pi) { if (!pi) {
t->osd = CEPH_HOMELESS_OSD; t->osd = CEPH_HOMELESS_OSD;
...@@ -1332,33 +1349,33 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, ...@@ -1332,33 +1349,33 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
} }
if (osdc->osdmap->epoch == pi->last_force_request_resend) { if (osdc->osdmap->epoch == pi->last_force_request_resend) {
if (last_force_resend && if (t->last_force_resend < pi->last_force_request_resend) {
*last_force_resend < pi->last_force_request_resend) { t->last_force_resend = pi->last_force_request_resend;
*last_force_resend = pi->last_force_request_resend;
force_resend = true; force_resend = true;
} else if (!last_force_resend) { } else if (t->last_force_resend == 0) {
force_resend = true; force_resend = true;
} }
} }
if (ceph_oid_empty(&t->target_oid) || force_resend) {
/* apply tiering */
ceph_oid_copy(&t->target_oid, &t->base_oid); ceph_oid_copy(&t->target_oid, &t->base_oid);
need_check_tiering = true;
}
if (ceph_oloc_empty(&t->target_oloc) || force_resend) {
ceph_oloc_copy(&t->target_oloc, &t->base_oloc); ceph_oloc_copy(&t->target_oloc, &t->base_oloc);
need_check_tiering = true; if ((t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
}
if (need_check_tiering &&
(t->flags & CEPH_OSD_FLAG_IGNORE_OVERLAY) == 0) {
if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0) if (t->flags & CEPH_OSD_FLAG_READ && pi->read_tier >= 0)
t->target_oloc.pool = pi->read_tier; t->target_oloc.pool = pi->read_tier;
if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0) if (t->flags & CEPH_OSD_FLAG_WRITE && pi->write_tier >= 0)
t->target_oloc.pool = pi->write_tier; t->target_oloc.pool = pi->write_tier;
pi = ceph_pg_pool_by_id(osdc->osdmap, t->target_oloc.pool);
if (!pi) {
t->osd = CEPH_HOMELESS_OSD;
ct_res = CALC_TARGET_POOL_DNE;
goto out;
}
} }
ret = ceph_object_locator_to_pg(osdc->osdmap, &t->target_oid, ret = __ceph_object_locator_to_pg(pi, &t->target_oid, &t->target_oloc,
&t->target_oloc, &pgid); &pgid);
if (ret) { if (ret) {
WARN_ON(ret != -ENOENT); WARN_ON(ret != -ENOENT);
t->osd = CEPH_HOMELESS_OSD; t->osd = CEPH_HOMELESS_OSD;
...@@ -1368,7 +1385,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, ...@@ -1368,7 +1385,7 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
last_pgid.pool = pgid.pool; last_pgid.pool = pgid.pool;
last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask); last_pgid.seed = ceph_stable_mod(pgid.seed, t->pg_num, t->pg_num_mask);
ceph_pg_to_up_acting_osds(osdc->osdmap, &pgid, &up, &acting); ceph_pg_to_up_acting_osds(osdc->osdmap, pi, &pgid, &up, &acting);
if (any_change && if (any_change &&
ceph_is_new_interval(&t->acting, ceph_is_new_interval(&t->acting,
&acting, &acting,
...@@ -1387,13 +1404,16 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, ...@@ -1387,13 +1404,16 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
if (t->paused && !target_should_be_paused(osdc, t, pi)) { if (t->paused && !target_should_be_paused(osdc, t, pi)) {
t->paused = false; t->paused = false;
need_resend = true; unpaused = true;
} }
legacy_change = ceph_pg_compare(&t->pgid, &pgid) ||
ceph_osds_changed(&t->acting, &acting, any_change);
if (t->pg_num)
split = ceph_pg_is_split(&last_pgid, t->pg_num, pi->pg_num);
if (ceph_pg_compare(&t->pgid, &pgid) || if (legacy_change || force_resend || split) {
ceph_osds_changed(&t->acting, &acting, any_change) ||
force_resend) {
t->pgid = pgid; /* struct */ t->pgid = pgid; /* struct */
ceph_pg_to_primary_shard(osdc->osdmap, pi, &pgid, &t->spgid);
ceph_osds_copy(&t->acting, &acting); ceph_osds_copy(&t->acting, &acting);
ceph_osds_copy(&t->up, &up); ceph_osds_copy(&t->up, &up);
t->size = pi->size; t->size = pi->size;
...@@ -1403,15 +1423,342 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc, ...@@ -1403,15 +1423,342 @@ static enum calc_target_result calc_target(struct ceph_osd_client *osdc,
t->sort_bitwise = sort_bitwise; t->sort_bitwise = sort_bitwise;
t->osd = acting.primary; t->osd = acting.primary;
need_resend = true;
} }
ct_res = need_resend ? CALC_TARGET_NEED_RESEND : CALC_TARGET_NO_ACTION; if (unpaused || legacy_change || force_resend ||
(split && con && CEPH_HAVE_FEATURE(con->peer_features,
RESEND_ON_SPLIT)))
ct_res = CALC_TARGET_NEED_RESEND;
else
ct_res = CALC_TARGET_NO_ACTION;
out: out:
dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd); dout("%s t %p -> ct_res %d osd %d\n", __func__, t, ct_res, t->osd);
return ct_res; return ct_res;
} }
/*
 * Allocate a ceph_spg_mapping with GFP_NOIO.  Only ->node (cleared)
 * and ->backoffs (empty tree) are initialized here; nothing else in
 * the structure is written.  Returns NULL if the allocation fails.
 */
static struct ceph_spg_mapping *alloc_spg_mapping(void)
{
	struct ceph_spg_mapping *mapping = kmalloc(sizeof(*mapping), GFP_NOIO);

	if (mapping) {
		RB_CLEAR_NODE(&mapping->node);
		mapping->backoffs = RB_ROOT;
	}

	return mapping;
}
/*
 * Free @spg.  Warns (but still frees) if ->node is not in the cleared
 * state or if the ->backoffs tree is not empty.
 */
static void free_spg_mapping(struct ceph_spg_mapping *spg)
{
WARN_ON(!RB_EMPTY_NODE(&spg->node));
WARN_ON(!RB_EMPTY_ROOT(&spg->backoffs));
kfree(spg);
}
/*
 * rbtree of ceph_spg_mapping for handling map<spg_t, ...>, similar to
 * ceph_pg_mapping. Used to track OSD backoffs -- a backoff [range] is
 * defined only within a specific spgid; it does not pass anything to
 * children on split, or to another primary.
 *
 * Mappings are keyed on ->spgid and linked through ->node;
 * clear_backoffs() uses the erase_spg_mapping() helper.
 * NOTE(review): the *_spg_mapping() helpers are presumably generated by
 * the macro invocation below, with keys compared by ceph_spg_compare()
 * and passed by pointer (RB_BYPTR) -- confirm against the
 * DEFINE_RB_FUNCS2 definition.
 */
DEFINE_RB_FUNCS2(spg_mapping, struct ceph_spg_mapping, spgid, ceph_spg_compare,
RB_BYPTR, const struct ceph_spg *, node)
/*
 * Sort key derived from @hoid's hash: ->hash_reverse_bits, or
 * 0x100000000 (one more than the largest value that fits in 32 bits)
 * when ->is_max is set.
 */
static u64 hoid_get_bitwise_key(const struct ceph_hobject_id *hoid)
{
	if (hoid->is_max)
		return 0x100000000ull;

	return hoid->hash_reverse_bits;
}
/*
 * Return, through @pkey and @pkey_len, the name that stands in as
 * @hoid's key: ->key when ->key_len is non-zero, ->oid otherwise.
 * The returned pointer aliases @hoid's own storage; nothing is copied.
 */
static void hoid_get_effective_key(const struct ceph_hobject_id *hoid,
				   void **pkey, size_t *pkey_len)
{
	if (!hoid->key_len) {
		*pkey = hoid->oid;
		*pkey_len = hoid->oid_len;
		return;
	}

	*pkey = hoid->key;
	*pkey_len = hoid->key_len;
}
/*
 * Compare two length-delimited names.  The common prefix is compared
 * with memcmp(); if it matches, the shorter name orders first.
 *
 * Returns the memcmp() result when the prefixes differ, otherwise -1,
 * 0 or 1 according to the lengths.
 */
static int compare_names(const void *name1, size_t name1_len,
			 const void *name2, size_t name2_len)
{
	int ret = memcmp(name1, name2, min(name1_len, name2_len));

	if (ret)
		return ret;

	if (name1_len == name2_len)
		return 0;

	return name1_len < name2_len ? -1 : 1;
}
static int hoid_compare(const struct ceph_hobject_id *lhs,
const struct ceph_hobject_id *rhs)
{
void *effective_key1, *effective_key2;
size_t effective_key1_len, effective_key2_len;
int ret;
if (lhs->is_max < rhs->is_max)
return -1;
if (lhs->is_max > rhs->is_max)
return 1;
if (lhs->pool < rhs->pool)
return -1;
if (lhs->pool > rhs->pool)
return 1;
if (hoid_get_bitwise_key(lhs) < hoid_get_bitwise_key(rhs))
return -1;
if (hoid_get_bitwise_key(lhs) > hoid_get_bitwise_key(rhs))
return 1;
ret = compare_names(lhs->nspace, lhs->nspace_len,
rhs->nspace, rhs->nspace_len);
if (ret)
return ret;
hoid_get_effective_key(lhs, &effective_key1, &effective_key1_len);
hoid_get_effective_key(rhs, &effective_key2, &effective_key2_len);
ret = compare_names(effective_key1, effective_key1_len,
effective_key2, effective_key2_len);
if (ret)
return ret;
ret = compare_names(lhs->oid, lhs->oid_len, rhs->oid, rhs->oid_len);
if (ret)
return ret;
if (lhs->snapid < rhs->snapid)
return -1;
if (lhs->snapid > rhs->snapid)
return 1;
return 0;
}
/*
 * For decoding ->begin and ->end of MOSDBackoff only -- no MIN/MAX
 * compat stuff here.
 *
 * Assumes @hoid is zero-initialized.
 *
 * Fields are read in the order key, oid, snapid, hash, is_max, nspace,
 * pool.  Returns 0 on success, the error from ceph_start_decoding() or
 * from a failed string extraction, or -EINVAL if struct_v < 4 or one of
 * the ceph_decode_*_safe() helpers bails out to e_inval.
 *
 * NOTE(review): strings extracted before a later failure stay attached
 * to @hoid and are not freed here -- presumably the caller releases
 * them (e.g. via free_hoid()); confirm against the callers.
 */
static int decode_hoid(void **p, void *end, struct ceph_hobject_id *hoid)
{
u8 struct_v;
u32 struct_len;
int ret;
ret = ceph_start_decoding(p, end, 4, "hobject_t", &struct_v,
&struct_len);
if (ret)
return ret;
/* encodings older than v4 are rejected rather than handled */
if (struct_v < 4) {
pr_err("got struct_v %d < 4 of hobject_t\n", struct_v);
goto e_inval;
}
hoid->key = ceph_extract_encoded_string(p, end, &hoid->key_len,
GFP_NOIO);
if (IS_ERR(hoid->key)) {
ret = PTR_ERR(hoid->key);
/* don't leave an ERR_PTR value behind in @hoid */
hoid->key = NULL;
return ret;
}
hoid->oid = ceph_extract_encoded_string(p, end, &hoid->oid_len,
GFP_NOIO);
if (IS_ERR(hoid->oid)) {
ret = PTR_ERR(hoid->oid);
/* don't leave an ERR_PTR value behind in @hoid */
hoid->oid = NULL;
return ret;
}
ceph_decode_64_safe(p, end, hoid->snapid, e_inval);
ceph_decode_32_safe(p, end, hoid->hash, e_inval);
ceph_decode_8_safe(p, end, hoid->is_max, e_inval);
hoid->nspace = ceph_extract_encoded_string(p, end, &hoid->nspace_len,
GFP_NOIO);
if (IS_ERR(hoid->nspace)) {
ret = PTR_ERR(hoid->nspace);
/* don't leave an ERR_PTR value behind in @hoid */
hoid->nspace = NULL;
return ret;
}
ceph_decode_64_safe(p, end, hoid->pool, e_inval);
/* NOTE(review): presumably derives ->hash_reverse_bits from ->hash -- confirm */
ceph_hoid_build_hash_cache(hoid);
return 0;
e_inval:
return -EINVAL;
}
/*
 * Number of payload bytes encode_hoid() hands to ceph_start_encoding()
 * for @hoid: the fixed-size fields plus, for each of the three names,
 * 4 bytes and the name's length.
 */
static int hoid_encoding_size(const struct ceph_hobject_id *hoid)
{
	size_t fixed = 8 + 4 + 1 + 8;	/* snapid, hash, is_max, pool */
	size_t names = 4 + hoid->key_len +
		       4 + hoid->oid_len +
		       4 + hoid->nspace_len;

	return fixed + names;
}
/*
 * Encode @hoid at *@p: a ceph_start_encoding() header (arguments 4 and
 * 3, payload length from hoid_encoding_size()) followed by key, oid,
 * snapid, hash, is_max, nspace and pool -- the same field order that
 * decode_hoid() reads.
 */
static void encode_hoid(void **p, void *end, const struct ceph_hobject_id *hoid)
{
	int payload_len = hoid_encoding_size(hoid);

	ceph_start_encoding(p, 4, 3, payload_len);

	ceph_encode_string(p, end, hoid->key, hoid->key_len);
	ceph_encode_string(p, end, hoid->oid, hoid->oid_len);

	ceph_encode_64(p, hoid->snapid);
	ceph_encode_32(p, hoid->hash);
	ceph_encode_8(p, hoid->is_max);

	ceph_encode_string(p, end, hoid->nspace, hoid->nspace_len);
	ceph_encode_64(p, hoid->pool);
}
/*
 * Free @hoid along with its ->key, ->oid and ->nspace buffers.
 * A NULL @hoid is accepted and ignored.
 */
static void free_hoid(struct ceph_hobject_id *hoid)
{
	if (!hoid)
		return;

	kfree(hoid->key);
	kfree(hoid->oid);
	kfree(hoid->nspace);
	kfree(hoid);
}
/*
 * Allocate a zeroed ceph_osd_backoff with GFP_NOIO and put both of its
 * rbtree nodes (->spg_node, ->id_node) into the cleared state.
 * Returns NULL if the allocation fails.
 */
static struct ceph_osd_backoff *alloc_backoff(void)
{
	struct ceph_osd_backoff *bo = kzalloc(sizeof(*bo), GFP_NOIO);

	if (bo) {
		RB_CLEAR_NODE(&bo->spg_node);
		RB_CLEAR_NODE(&bo->id_node);
	}

	return bo;
}
/*
 * Destroy @backoff along with the ->begin and ->end hoids it owns.
 */
static void free_backoff(struct ceph_osd_backoff *backoff)
{
	/* it must have been erased from both trees by now */
	WARN_ON(!RB_EMPTY_NODE(&backoff->spg_node));
	WARN_ON(!RB_EMPTY_NODE(&backoff->id_node));

	/* free_hoid() copes with a NULL ->begin / ->end */
	free_hoid(backoff->begin);
	free_hoid(backoff->end);

	kfree(backoff);
}
/*
* Within a specific spgid, backoffs are managed by ->begin hoid.
*
* This instantiates the insert_backoff() and erase_backoff() helpers
* that operate on a ceph_spg_mapping's ->backoffs tree: entries are
* linked through ->spg_node and ordered by hoid_compare() on ->begin.
* Judging by the INSDEL name no lookup helper is generated here;
* range lookups go through lookup_containing_backoff() instead.
*/
DEFINE_RB_INSDEL_FUNCS2(backoff, struct ceph_osd_backoff, begin, hoid_compare,
RB_BYVAL, spg_node);
/*
 * Walk the ->begin ordered tree at @root looking for a backoff whose
 * [begin, end) range covers @hoid.  Returns the first such backoff met
 * on the way down, or NULL if the descent runs out of nodes.
 */
static struct ceph_osd_backoff *lookup_containing_backoff(struct rb_root *root,
					    const struct ceph_hobject_id *hoid)
{
	struct rb_node *n;
	int cmp;

	for (n = root->rb_node; n; ) {
		struct ceph_osd_backoff *cur =
		    rb_entry(n, struct ceph_osd_backoff, spg_node);

		cmp = hoid_compare(hoid, cur->begin);
		if (cmp < 0) {
			/* @hoid sorts before this range */
			n = n->rb_left;
			continue;
		}

		/* exactly at ->begin, or past it but still below ->end */
		if (!cmp || hoid_compare(hoid, cur->end) < 0)
			return cur;

		n = n->rb_right;
	}

	return NULL;
}
/*
* Each backoff has a unique id within its OSD session.
*
* This instantiates the insert_backoff_by_id(), erase_backoff_by_id()
* and lookup_backoff_by_id() helpers used on osd->o_backoffs_by_id:
* entries are keyed by ->id and linked through ->id_node.
*/
DEFINE_RB_FUNCS(backoff_by_id, struct ceph_osd_backoff, id, id_node)
/*
 * Drop every backoff installed for @osd: each spg mapping is emptied
 * (its backoffs are unlinked from both trees and freed) and is then
 * removed and freed itself.
 */
static void clear_backoffs(struct ceph_osd *osd)
{
	struct rb_node *spg_n, *bo_n;

	/* always take the first node, erasing invalidates iteration */
	while ((spg_n = rb_first(&osd->o_backoff_mappings))) {
		struct ceph_spg_mapping *spg =
		    rb_entry(spg_n, struct ceph_spg_mapping, node);

		while ((bo_n = rb_first(&spg->backoffs))) {
			struct ceph_osd_backoff *backoff =
			    rb_entry(bo_n, struct ceph_osd_backoff, spg_node);

			erase_backoff(&spg->backoffs, backoff);
			erase_backoff_by_id(&osd->o_backoffs_by_id, backoff);
			free_backoff(backoff);
		}

		erase_spg_mapping(&osd->o_backoff_mappings, spg);
		free_spg_mapping(spg);
	}
}
/*
* Set up a temporary, non-owning view into @t.
*/
static void hoid_fill_from_target(struct ceph_hobject_id *hoid,
const struct ceph_osd_request_target *t)
{
hoid->key = NULL;
hoid->key_len = 0;
hoid->oid = t->target_oid.name;
hoid->oid_len = t->target_oid.name_len;
hoid->snapid = CEPH_NOSNAP;
hoid->hash = t->pgid.seed;
hoid->is_max = false;
if (t->target_oloc.pool_ns) {
hoid->nspace = t->target_oloc.pool_ns->str;
hoid->nspace_len = t->target_oloc.pool_ns->len;
} else {
hoid->nspace = NULL;
hoid->nspace_len = 0;
}
hoid->pool = t->target_oloc.pool;
ceph_hoid_build_hash_cache(hoid);
}
static bool should_plug_request(struct ceph_osd_request *req)
{
struct ceph_osd *osd = req->r_osd;
struct ceph_spg_mapping *spg;
struct ceph_osd_backoff *backoff;
struct ceph_hobject_id hoid;
spg = lookup_spg_mapping(&osd->o_backoff_mappings, &req->r_t.spgid);
if (!spg)
return false;
hoid_fill_from_target(&hoid, &req->r_t);
backoff = lookup_containing_backoff(&spg->backoffs, &hoid);
if (!backoff)
return false;
dout("%s req %p tid %llu backoff osd%d spgid %llu.%xs%d id %llu\n",
__func__, req, req->r_tid, osd->o_osd, backoff->spgid.pgid.pool,
backoff->spgid.pgid.seed, backoff->spgid.shard, backoff->id);
return true;
}
static void setup_request_data(struct ceph_osd_request *req, static void setup_request_data(struct ceph_osd_request *req,
struct ceph_msg *msg) struct ceph_msg *msg)
{ {
...@@ -1483,7 +1830,37 @@ static void setup_request_data(struct ceph_osd_request *req, ...@@ -1483,7 +1830,37 @@ static void setup_request_data(struct ceph_osd_request *req,
WARN_ON(data_len != msg->data_length); WARN_ON(data_len != msg->data_length);
} }
static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) static void encode_pgid(void **p, const struct ceph_pg *pgid)
{
ceph_encode_8(p, 1);
ceph_encode_64(p, pgid->pool);
ceph_encode_32(p, pgid->seed);
ceph_encode_32(p, -1); /* preferred */
}
/*
 * Write @spgid at *@p: an encoding start block, the pgid as written
 * by encode_pgid() and a single shard byte.  Advances *@p.
 */
static void encode_spgid(void **p, const struct ceph_spg *spgid)
{
	const struct ceph_pg *pgid = &spgid->pgid;

	/* payload length: the encoded pgid plus one byte of shard */
	ceph_start_encoding(p, 1, 1, CEPH_PGID_ENCODING_LEN + 1);
	encode_pgid(p, pgid);
	ceph_encode_8(p, spgid->shard);
}
/*
 * Write object locator @oloc at *@p and advance *@p.  The namespace
 * is written last: the pool namespace string when one is set, an
 * empty (zero length) string otherwise.
 */
static void encode_oloc(void **p, void *end,
			const struct ceph_object_locator *oloc)
{
	ceph_start_encoding(p, 5, 4, ceph_oloc_encoding_size(oloc));
	ceph_encode_64(p, oloc->pool);
	ceph_encode_32(p, -1); /* preferred */
	ceph_encode_32(p, 0);  /* key len */

	if (!oloc->pool_ns) {
		ceph_encode_32(p, 0);
		return;
	}

	ceph_encode_string(p, end, oloc->pool_ns->str, oloc->pool_ns->len);
}
static void encode_request_partial(struct ceph_osd_request *req,
struct ceph_msg *msg)
{ {
void *p = msg->front.iov_base; void *p = msg->front.iov_base;
void *const end = p + msg->front_alloc_len; void *const end = p + msg->front_alloc_len;
...@@ -1500,38 +1877,27 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) ...@@ -1500,38 +1877,27 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
setup_request_data(req, msg); setup_request_data(req, msg);
ceph_encode_32(&p, 1); /* client_inc, always 1 */ encode_spgid(&p, &req->r_t.spgid); /* actual spg */
ceph_encode_32(&p, req->r_t.pgid.seed); /* raw hash */
ceph_encode_32(&p, req->r_osdc->osdmap->epoch); ceph_encode_32(&p, req->r_osdc->osdmap->epoch);
ceph_encode_32(&p, req->r_flags); ceph_encode_32(&p, req->r_flags);
ceph_encode_timespec(p, &req->r_mtime);
p += sizeof(struct ceph_timespec);
/* reassert_version */ /* reqid */
memset(p, 0, sizeof(struct ceph_eversion)); ceph_start_encoding(&p, 2, 2, sizeof(struct ceph_osd_reqid));
p += sizeof(struct ceph_eversion); memset(p, 0, sizeof(struct ceph_osd_reqid));
p += sizeof(struct ceph_osd_reqid);
/* oloc */ /* trace */
ceph_start_encoding(&p, 5, 4, memset(p, 0, sizeof(struct ceph_blkin_trace_info));
ceph_oloc_encoding_size(&req->r_t.target_oloc)); p += sizeof(struct ceph_blkin_trace_info);
ceph_encode_64(&p, req->r_t.target_oloc.pool);
ceph_encode_32(&p, -1); /* preferred */
ceph_encode_32(&p, 0); /* key len */
if (req->r_t.target_oloc.pool_ns)
ceph_encode_string(&p, end, req->r_t.target_oloc.pool_ns->str,
req->r_t.target_oloc.pool_ns->len);
else
ceph_encode_32(&p, 0);
/* pgid */ ceph_encode_32(&p, 0); /* client_inc, always 0 */
ceph_encode_8(&p, 1); ceph_encode_timespec(p, &req->r_mtime);
ceph_encode_64(&p, req->r_t.pgid.pool); p += sizeof(struct ceph_timespec);
ceph_encode_32(&p, req->r_t.pgid.seed);
ceph_encode_32(&p, -1); /* preferred */
/* oid */ encode_oloc(&p, end, &req->r_t.target_oloc);
ceph_encode_32(&p, req->r_t.target_oid.name_len); ceph_encode_string(&p, end, req->r_t.target_oid.name,
memcpy(p, req->r_t.target_oid.name, req->r_t.target_oid.name_len); req->r_t.target_oid.name_len);
p += req->r_t.target_oid.name_len;
/* ops, can imply data */ /* ops, can imply data */
ceph_encode_16(&p, req->r_num_ops); ceph_encode_16(&p, req->r_num_ops);
...@@ -1552,11 +1918,10 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) ...@@ -1552,11 +1918,10 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
} }
ceph_encode_32(&p, req->r_attempts); /* retry_attempt */ ceph_encode_32(&p, req->r_attempts); /* retry_attempt */
BUG_ON(p != end - 8); /* space for features */
BUG_ON(p > end); msg->hdr.version = cpu_to_le16(8); /* MOSDOp v8 */
msg->front.iov_len = p - msg->front.iov_base; /* front_len is finalized in encode_request_finish() */
msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
msg->hdr.data_len = cpu_to_le32(data_len); msg->hdr.data_len = cpu_to_le32(data_len);
/* /*
* The header "data_off" is a hint to the receiver allowing it * The header "data_off" is a hint to the receiver allowing it
...@@ -1565,9 +1930,99 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg) ...@@ -1565,9 +1930,99 @@ static void encode_request(struct ceph_osd_request *req, struct ceph_msg *msg)
*/ */
msg->hdr.data_off = cpu_to_le16(req->r_data_offset); msg->hdr.data_off = cpu_to_le16(req->r_data_offset);
dout("%s req %p oid %s oid_len %d front %zu data %u\n", __func__, dout("%s req %p msg %p oid %s oid_len %d\n", __func__, req, msg,
req, req->r_t.target_oid.name, req->r_t.target_oid.name_len, req->r_t.target_oid.name, req->r_t.target_oid.name_len);
msg->front.iov_len, data_len); }
/*
* Finish the front of an OSD request message once the peer on
* msg->con is known (reached through osd_reencode_message()).
*
* If the peer has RESEND_ON_SPLIT, the peer's feature bits are written
* into the last 8 bytes of the allocated front and the whole allocated
* front is sent.  Otherwise the front is rewritten in place in the v4
* layout: client_inc, epoch, flags, mtime, a zeroed reassert_version,
* oloc, the raw pgid, oid and finally the tail; hdr.version is set to
* 4 and the front shrinks accordingly.
*
* In both cases front.iov_len and hdr.front_len are set here.
*/
static void encode_request_finish(struct ceph_msg *msg)
{
void *p = msg->front.iov_base;
void *const end = p + msg->front_alloc_len;
if (CEPH_HAVE_FEATURE(msg->con->peer_features, RESEND_ON_SPLIT)) {
/* luminous OSD -- encode features and be done */
p = end - 8;
ceph_encode_64(&p, msg->con->peer_features);
} else {
/* the leading fields of the front as found at msg->front.iov_base */
struct {
char spgid[CEPH_ENCODING_START_BLK_LEN +
CEPH_PGID_ENCODING_LEN + 1];
__le32 hash;
__le32 epoch;
__le32 flags;
char reqid[CEPH_ENCODING_START_BLK_LEN +
sizeof(struct ceph_osd_reqid)];
char trace[sizeof(struct ceph_blkin_trace_info)];
__le32 client_inc;
struct ceph_timespec mtime;
} __packed head;
struct ceph_pg pgid;
void *oloc, *oid, *tail;
int oloc_len, oid_len, tail_len;
int len;
/*
* Pre-luminous OSD -- reencode v8 into v4 using @head
* as a temporary buffer. Encode the raw PG; the rest
* is just a matter of moving oloc, oid and tail blobs
* around.
*/
memcpy(&head, p, sizeof(head));
p += sizeof(head);
/* locate the oloc blob and pick the pool id out of it */
oloc = p;
p += CEPH_ENCODING_START_BLK_LEN;
pgid.pool = ceph_decode_64(&p);
p += 4 + 4; /* preferred, key len */
len = ceph_decode_32(&p);
p += len; /* nspace */
oloc_len = p - oloc;
/* the oid blob is a length word followed by that many bytes */
oid = p;
len = ceph_decode_32(&p);
p += len;
oid_len = p - oid;
/* everything after the oid, minus the 8 bytes kept for features */
tail = p;
tail_len = (end - p) - 8;
/* start over at the beginning of the front and write the v4 layout */
p = msg->front.iov_base;
ceph_encode_copy(&p, &head.client_inc, sizeof(head.client_inc));
ceph_encode_copy(&p, &head.epoch, sizeof(head.epoch));
ceph_encode_copy(&p, &head.flags, sizeof(head.flags));
ceph_encode_copy(&p, &head.mtime, sizeof(head.mtime));
/* reassert_version */
memset(p, 0, sizeof(struct ceph_eversion));
p += sizeof(struct ceph_eversion);
/*
* Each blob is moved within the same front buffer, hence memmove();
* the BUG_ON()s check that the write position is still below the
* blob about to be moved.
*/
BUG_ON(p >= oloc);
memmove(p, oloc, oloc_len);
p += oloc_len;
pgid.seed = le32_to_cpu(head.hash);
encode_pgid(&p, &pgid); /* raw pg */
BUG_ON(p >= oid);
memmove(p, oid, oid_len);
p += oid_len;
/* tail -- ops, snapid, snapc, retry_attempt */
BUG_ON(p >= tail);
memmove(p, tail, tail_len);
p += tail_len;
msg->hdr.version = cpu_to_le16(4); /* MOSDOp v4 */
}
BUG_ON(p > end);
/* the front ends wherever the chosen encoding stopped writing */
msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
dout("%s msg %p tid %llu %u+%u+%u v%d\n", __func__, msg,
le64_to_cpu(msg->hdr.tid), le32_to_cpu(msg->hdr.front_len),
le32_to_cpu(msg->hdr.middle_len), le32_to_cpu(msg->hdr.data_len),
le16_to_cpu(msg->hdr.version));
} }
/* /*
...@@ -1580,6 +2035,10 @@ static void send_request(struct ceph_osd_request *req) ...@@ -1580,6 +2035,10 @@ static void send_request(struct ceph_osd_request *req)
verify_osd_locked(osd); verify_osd_locked(osd);
WARN_ON(osd->o_osd != req->r_t.osd); WARN_ON(osd->o_osd != req->r_t.osd);
/* backoff? */
if (should_plug_request(req))
return;
/* /*
* We may have a previously queued request message hanging * We may have a previously queued request message hanging
* around. Cancel it to avoid corrupting the msgr. * around. Cancel it to avoid corrupting the msgr.
...@@ -1593,11 +2052,13 @@ static void send_request(struct ceph_osd_request *req) ...@@ -1593,11 +2052,13 @@ static void send_request(struct ceph_osd_request *req)
else else
WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY); WARN_ON(req->r_flags & CEPH_OSD_FLAG_RETRY);
encode_request(req, req->r_request); encode_request_partial(req, req->r_request);
dout("%s req %p tid %llu to pg %llu.%x osd%d flags 0x%x attempt %d\n", dout("%s req %p tid %llu to pgid %llu.%x spgid %llu.%xs%d osd%d e%u flags 0x%x attempt %d\n",
__func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed, __func__, req, req->r_tid, req->r_t.pgid.pool, req->r_t.pgid.seed,
req->r_t.osd, req->r_flags, req->r_attempts); req->r_t.spgid.pgid.pool, req->r_t.spgid.pgid.seed,
req->r_t.spgid.shard, osd->o_osd, req->r_t.epoch, req->r_flags,
req->r_attempts);
req->r_t.paused = false; req->r_t.paused = false;
req->r_stamp = jiffies; req->r_stamp = jiffies;
...@@ -1645,7 +2106,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked) ...@@ -1645,7 +2106,7 @@ static void __submit_request(struct ceph_osd_request *req, bool wrlocked)
dout("%s req %p wrlocked %d\n", __func__, req, wrlocked); dout("%s req %p wrlocked %d\n", __func__, req, wrlocked);
again: again:
ct_res = calc_target(osdc, &req->r_t, &req->r_last_force_resend, false); ct_res = calc_target(osdc, &req->r_t, NULL, false);
if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked) if (ct_res == CALC_TARGET_POOL_DNE && !wrlocked)
goto promote; goto promote;
...@@ -1737,13 +2198,12 @@ static void submit_request(struct ceph_osd_request *req, bool wrlocked) ...@@ -1737,13 +2198,12 @@ static void submit_request(struct ceph_osd_request *req, bool wrlocked)
static void finish_request(struct ceph_osd_request *req) static void finish_request(struct ceph_osd_request *req)
{ {
struct ceph_osd_client *osdc = req->r_osdc; struct ceph_osd_client *osdc = req->r_osdc;
struct ceph_osd *osd = req->r_osd;
verify_osd_locked(osd); WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid));
dout("%s req %p tid %llu\n", __func__, req, req->r_tid); dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
WARN_ON(lookup_request_mc(&osdc->map_checks, req->r_tid)); if (req->r_osd)
unlink_request(osd, req); unlink_request(req->r_osd, req);
atomic_dec(&osdc->num_requests); atomic_dec(&osdc->num_requests);
/* /*
...@@ -2441,7 +2901,7 @@ static void linger_submit(struct ceph_osd_linger_request *lreq) ...@@ -2441,7 +2901,7 @@ static void linger_submit(struct ceph_osd_linger_request *lreq)
struct ceph_osd_client *osdc = lreq->osdc; struct ceph_osd_client *osdc = lreq->osdc;
struct ceph_osd *osd; struct ceph_osd *osd;
calc_target(osdc, &lreq->t, &lreq->last_force_resend, false); calc_target(osdc, &lreq->t, NULL, false);
osd = lookup_create_osd(osdc, lreq->t.osd, true); osd = lookup_create_osd(osdc, lreq->t.osd, true);
link_linger(osd, lreq); link_linger(osd, lreq);
...@@ -3059,7 +3519,7 @@ recalc_linger_target(struct ceph_osd_linger_request *lreq) ...@@ -3059,7 +3519,7 @@ recalc_linger_target(struct ceph_osd_linger_request *lreq)
struct ceph_osd_client *osdc = lreq->osdc; struct ceph_osd_client *osdc = lreq->osdc;
enum calc_target_result ct_res; enum calc_target_result ct_res;
ct_res = calc_target(osdc, &lreq->t, &lreq->last_force_resend, true); ct_res = calc_target(osdc, &lreq->t, NULL, true);
if (ct_res == CALC_TARGET_NEED_RESEND) { if (ct_res == CALC_TARGET_NEED_RESEND) {
struct ceph_osd *osd; struct ceph_osd *osd;
...@@ -3117,6 +3577,7 @@ static void scan_requests(struct ceph_osd *osd, ...@@ -3117,6 +3577,7 @@ static void scan_requests(struct ceph_osd *osd,
list_add_tail(&lreq->scan_item, need_resend_linger); list_add_tail(&lreq->scan_item, need_resend_linger);
break; break;
case CALC_TARGET_POOL_DNE: case CALC_TARGET_POOL_DNE:
list_del_init(&lreq->scan_item);
check_linger_pool_dne(lreq); check_linger_pool_dne(lreq);
break; break;
} }
...@@ -3130,8 +3591,8 @@ static void scan_requests(struct ceph_osd *osd, ...@@ -3130,8 +3591,8 @@ static void scan_requests(struct ceph_osd *osd,
n = rb_next(n); /* unlink_request(), check_pool_dne() */ n = rb_next(n); /* unlink_request(), check_pool_dne() */
dout("%s req %p tid %llu\n", __func__, req, req->r_tid); dout("%s req %p tid %llu\n", __func__, req, req->r_tid);
ct_res = calc_target(osdc, &req->r_t, ct_res = calc_target(osdc, &req->r_t, &req->r_osd->o_con,
&req->r_last_force_resend, false); false);
switch (ct_res) { switch (ct_res) {
case CALC_TARGET_NO_ACTION: case CALC_TARGET_NO_ACTION:
force_resend_writes = cleared_full || force_resend_writes = cleared_full ||
...@@ -3229,8 +3690,25 @@ static void kick_requests(struct ceph_osd_client *osdc, ...@@ -3229,8 +3690,25 @@ static void kick_requests(struct ceph_osd_client *osdc,
struct list_head *need_resend_linger) struct list_head *need_resend_linger)
{ {
struct ceph_osd_linger_request *lreq, *nlreq; struct ceph_osd_linger_request *lreq, *nlreq;
enum calc_target_result ct_res;
struct rb_node *n; struct rb_node *n;
/* make sure need_resend targets reflect latest map */
for (n = rb_first(need_resend); n; ) {
struct ceph_osd_request *req =
rb_entry(n, struct ceph_osd_request, r_node);
n = rb_next(n);
if (req->r_t.epoch < osdc->osdmap->epoch) {
ct_res = calc_target(osdc, &req->r_t, NULL, false);
if (ct_res == CALC_TARGET_POOL_DNE) {
erase_request(need_resend, req);
check_pool_dne(req);
}
}
}
for (n = rb_first(need_resend); n; ) { for (n = rb_first(need_resend); n; ) {
struct ceph_osd_request *req = struct ceph_osd_request *req =
rb_entry(n, struct ceph_osd_request, r_node); rb_entry(n, struct ceph_osd_request, r_node);
...@@ -3239,8 +3717,6 @@ static void kick_requests(struct ceph_osd_client *osdc, ...@@ -3239,8 +3717,6 @@ static void kick_requests(struct ceph_osd_client *osdc,
n = rb_next(n); n = rb_next(n);
erase_request(need_resend, req); /* before link_request() */ erase_request(need_resend, req); /* before link_request() */
WARN_ON(req->r_osd);
calc_target(osdc, &req->r_t, NULL, false);
osd = lookup_create_osd(osdc, req->r_t.osd, true); osd = lookup_create_osd(osdc, req->r_t.osd, true);
link_request(osd, req); link_request(osd, req);
if (!req->r_linger) { if (!req->r_linger) {
...@@ -3383,6 +3859,8 @@ static void kick_osd_requests(struct ceph_osd *osd) ...@@ -3383,6 +3859,8 @@ static void kick_osd_requests(struct ceph_osd *osd)
{ {
struct rb_node *n; struct rb_node *n;
clear_backoffs(osd);
for (n = rb_first(&osd->o_requests); n; ) { for (n = rb_first(&osd->o_requests); n; ) {
struct ceph_osd_request *req = struct ceph_osd_request *req =
rb_entry(n, struct ceph_osd_request, r_node); rb_entry(n, struct ceph_osd_request, r_node);
...@@ -3428,6 +3906,261 @@ static void osd_fault(struct ceph_connection *con) ...@@ -3428,6 +3906,261 @@ static void osd_fault(struct ceph_connection *con)
up_write(&osdc->lock); up_write(&osdc->lock);
} }
/*
* Decoded form of a CEPH_MSG_OSD_BACKOFF message, filled in by
* decode_MOSDBackoff().
*/
struct MOSDBackoff {
struct ceph_spg spgid;
u32 map_epoch; /* echoed back in the ack, see handle_backoff_block() */
u8 op; /* CEPH_OSD_BACKOFF_OP_BLOCK or _UNBLOCK, anything else is rejected */
u64 id; /* key for the per-session o_backoffs_by_id tree */
/*
* Heap-allocated range bounds.  handle_backoff_block() takes them
* over and NULLs these pointers; whatever is left is released by
* handle_backoff() with free_hoid().
*/
struct ceph_hobject_id *begin;
struct ceph_hobject_id *end;
};
static int decode_MOSDBackoff(const struct ceph_msg *msg, struct MOSDBackoff *m)
{
void *p = msg->front.iov_base;
void *const end = p + msg->front.iov_len;
u8 struct_v;
u32 struct_len;
int ret;
ret = ceph_start_decoding(&p, end, 1, "spg_t", &struct_v, &struct_len);
if (ret)
return ret;
ret = ceph_decode_pgid(&p, end, &m->spgid.pgid);
if (ret)
return ret;
ceph_decode_8_safe(&p, end, m->spgid.shard, e_inval);
ceph_decode_32_safe(&p, end, m->map_epoch, e_inval);
ceph_decode_8_safe(&p, end, m->op, e_inval);
ceph_decode_64_safe(&p, end, m->id, e_inval);
m->begin = kzalloc(sizeof(*m->begin), GFP_NOIO);
if (!m->begin)
return -ENOMEM;
ret = decode_hoid(&p, end, m->begin);
if (ret) {
free_hoid(m->begin);
return ret;
}
m->end = kzalloc(sizeof(*m->end), GFP_NOIO);
if (!m->end) {
free_hoid(m->begin);
return -ENOMEM;
}
ret = decode_hoid(&p, end, m->end);
if (ret) {
free_hoid(m->begin);
free_hoid(m->end);
return ret;
}
return 0;
e_inval:
return -EINVAL;
}
/*
 * Build the CEPH_OSD_BACKOFF_OP_ACK_BLOCK message for @backoff,
 * stamped with @map_epoch.  The front is sized to fit the encoding
 * exactly.  Returns NULL if the message cannot be allocated.
 */
static struct ceph_msg *create_backoff_message(
				const struct ceph_osd_backoff *backoff,
				u32 map_epoch)
{
	const int spgid_len = CEPH_ENCODING_START_BLK_LEN +
			      CEPH_PGID_ENCODING_LEN + 1;
	const int fixed_len = 4 + 1 + 8;	/* map_epoch, op, id */
	const int begin_len = CEPH_ENCODING_START_BLK_LEN +
			      hoid_encoding_size(backoff->begin);
	const int end_len = CEPH_ENCODING_START_BLK_LEN +
			    hoid_encoding_size(backoff->end);
	const int msg_size = spgid_len + fixed_len + begin_len + end_len;
	struct ceph_msg *msg;
	void *p, *end;

	msg = ceph_msg_new(CEPH_MSG_OSD_BACKOFF, msg_size, GFP_NOIO, true);
	if (!msg)
		return NULL;

	p = msg->front.iov_base;
	end = p + msg->front_alloc_len;

	encode_spgid(&p, &backoff->spgid);
	ceph_encode_32(&p, map_epoch);
	ceph_encode_8(&p, CEPH_OSD_BACKOFF_OP_ACK_BLOCK);
	ceph_encode_64(&p, backoff->id);
	encode_hoid(&p, end, backoff->begin);
	encode_hoid(&p, end, backoff->end);

	/* the size computed above has to match what was written */
	BUG_ON(p != end);

	msg->front.iov_len = p - msg->front.iov_base;
	msg->hdr.version = cpu_to_le16(1); /* MOSDBackoff v1 */
	msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);

	return msg;
}
/*
 * Install the backoff described by @m on @osd and acknowledge it.
 *
 * On success the new backoff takes over @m->begin and @m->end and
 * both pointers in @m are set to NULL.  On an allocation failure
 * nothing is installed and @m is left untouched, so the caller still
 * owns (and frees) the two hoids.
 *
 * Failing to allocate the ack message is logged but leaves the
 * backoff installed.
 */
static void handle_backoff_block(struct ceph_osd *osd, struct MOSDBackoff *m)
{
	struct ceph_spg_mapping *spg;
	struct ceph_osd_backoff *backoff;
	struct ceph_msg *msg;

	dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd,
	     m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id);

	spg = lookup_spg_mapping(&osd->o_backoff_mappings, &m->spgid);
	if (!spg) {
		spg = alloc_spg_mapping();
		if (!spg) {
			pr_err("%s failed to allocate spg\n", __func__);
			return;
		}
		spg->spgid = m->spgid; /* struct */
		insert_spg_mapping(&osd->o_backoff_mappings, spg);
	}

	backoff = alloc_backoff();
	if (!backoff) {
		pr_err("%s failed to allocate backoff\n", __func__);
		/*
		 * A mapping that was created just above is still empty.
		 * Take it out again: handle_backoff_unblock() drops a
		 * mapping as soon as its last backoff goes away, so an
		 * empty one must not be left behind here either.
		 */
		if (RB_EMPTY_ROOT(&spg->backoffs)) {
			erase_spg_mapping(&osd->o_backoff_mappings, spg);
			free_spg_mapping(spg);
		}
		return;
	}

	backoff->spgid = m->spgid; /* struct */
	backoff->id = m->id;
	backoff->begin = m->begin;
	m->begin = NULL; /* backoff now owns this */
	backoff->end = m->end;
	m->end = NULL;   /* ditto */

	insert_backoff(&spg->backoffs, backoff);
	insert_backoff_by_id(&osd->o_backoffs_by_id, backoff);

	/*
	 * Ack with original backoff's epoch so that the OSD can
	 * discard this if there was a PG split.
	 */
	msg = create_backoff_message(backoff, m->map_epoch);
	if (!msg) {
		pr_err("%s failed to allocate msg\n", __func__);
		return;
	}
	ceph_con_send(&osd->o_con, msg);
}
static bool target_contained_by(const struct ceph_osd_request_target *t,
const struct ceph_hobject_id *begin,
const struct ceph_hobject_id *end)
{
struct ceph_hobject_id hoid;
int cmp;
hoid_fill_from_target(&hoid, t);
cmp = hoid_compare(&hoid, begin);
return !cmp || (cmp > 0 && hoid_compare(&hoid, end) < 0);
}
/*
 * Remove the backoff with id @m->id from @osd, drop its spg mapping
 * if that leaves the mapping empty, and then retry the requests on
 * @osd that fall into the range given in @m.
 */
static void handle_backoff_unblock(struct ceph_osd *osd,
				   const struct MOSDBackoff *m)
{
	struct ceph_osd_backoff *backoff;
	struct ceph_spg_mapping *spg;
	struct rb_node *n;

	dout("%s osd%d spgid %llu.%xs%d id %llu\n", __func__, osd->o_osd,
	     m->spgid.pgid.pool, m->spgid.pgid.seed, m->spgid.shard, m->id);

	backoff = lookup_backoff_by_id(&osd->o_backoffs_by_id, m->id);
	if (!backoff) {
		pr_err("%s osd%d spgid %llu.%xs%d id %llu backoff dne\n",
		       __func__, osd->o_osd, m->spgid.pgid.pool,
		       m->spgid.pgid.seed, m->spgid.shard, m->id);
		return;
	}

	/* complain only if neither end of the range matches */
	if (hoid_compare(backoff->begin, m->begin) &&
	    hoid_compare(backoff->end, m->end)) {
		pr_err("%s osd%d spgid %llu.%xs%d id %llu bad range?\n",
		       __func__, osd->o_osd, m->spgid.pgid.pool,
		       m->spgid.pgid.seed, m->spgid.shard, m->id);
		/* unblock it anyway... */
	}

	/* an installed backoff always has its spg mapping */
	spg = lookup_spg_mapping(&osd->o_backoff_mappings, &backoff->spgid);
	BUG_ON(!spg);

	erase_backoff(&spg->backoffs, backoff);
	erase_backoff_by_id(&osd->o_backoffs_by_id, backoff);
	free_backoff(backoff);

	if (RB_EMPTY_ROOT(&spg->backoffs)) {
		erase_spg_mapping(&osd->o_backoff_mappings, spg);
		free_spg_mapping(spg);
	}

	/* neither @backoff nor @spg may be touched from here on */
	for (n = rb_first(&osd->o_requests); n; n = rb_next(n)) {
		struct ceph_osd_request *req =
		    rb_entry(n, struct ceph_osd_request, r_node);

		if (ceph_spg_compare(&req->r_t.spgid, &m->spgid))
			continue;

		/*
		 * Match against @m, not @backoff -- the PG may
		 * have split on the OSD.
		 */
		if (!target_contained_by(&req->r_t, m->begin, m->end))
			continue;

		/*
		 * If no other installed backoff applies,
		 * resend.
		 */
		send_request(req);
	}
}
/*
 * Entry point for CEPH_MSG_OSD_BACKOFF: decode @msg and hand it to
 * the block or unblock handler.  Runs with osdc->lock held for read
 * and osd->lock held around decoding and handling.
 */
static void handle_backoff(struct ceph_osd *osd, struct ceph_msg *msg)
{
	struct ceph_osd_client *osdc = osd->o_osdc;
	struct MOSDBackoff m;
	int err;

	down_read(&osdc->lock);
	if (!osd_registered(osd)) {
		dout("%s osd%d unknown\n", __func__, osd->o_osd);
		goto out_unlock_osdc;
	}
	WARN_ON(osd->o_osd != le64_to_cpu(msg->hdr.src.num));

	mutex_lock(&osd->lock);
	err = decode_MOSDBackoff(msg, &m);
	if (err) {
		pr_err("failed to decode MOSDBackoff: %d\n", err);
		ceph_msg_dump(msg);
		goto out_unlock_osd;
	}

	if (m.op == CEPH_OSD_BACKOFF_OP_BLOCK)
		handle_backoff_block(osd, &m);
	else if (m.op == CEPH_OSD_BACKOFF_OP_UNBLOCK)
		handle_backoff_unblock(osd, &m);
	else
		pr_err("%s osd%d unknown op %d\n", __func__, osd->o_osd, m.op);

	/* whatever the handler did not take over is released here */
	free_hoid(m.begin);
	free_hoid(m.end);

out_unlock_osd:
	mutex_unlock(&osd->lock);
out_unlock_osdc:
	up_read(&osdc->lock);
}
/* /*
* Process osd watch notifications * Process osd watch notifications
*/ */
...@@ -4365,6 +5098,9 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg) ...@@ -4365,6 +5098,9 @@ static void dispatch(struct ceph_connection *con, struct ceph_msg *msg)
case CEPH_MSG_OSD_OPREPLY: case CEPH_MSG_OSD_OPREPLY:
handle_reply(osd, msg); handle_reply(osd, msg);
break; break;
case CEPH_MSG_OSD_BACKOFF:
handle_backoff(osd, msg);
break;
case CEPH_MSG_WATCH_NOTIFY: case CEPH_MSG_WATCH_NOTIFY:
handle_watch_notify(osdc, msg); handle_watch_notify(osdc, msg);
break; break;
...@@ -4487,6 +5223,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con, ...@@ -4487,6 +5223,7 @@ static struct ceph_msg *alloc_msg(struct ceph_connection *con,
*skip = 0; *skip = 0;
switch (type) { switch (type) {
case CEPH_MSG_OSD_MAP: case CEPH_MSG_OSD_MAP:
case CEPH_MSG_OSD_BACKOFF:
case CEPH_MSG_WATCH_NOTIFY: case CEPH_MSG_WATCH_NOTIFY:
return alloc_msg_with_page_vector(hdr); return alloc_msg_with_page_vector(hdr);
case CEPH_MSG_OSD_OPREPLY: case CEPH_MSG_OSD_OPREPLY:
...@@ -4571,6 +5308,11 @@ static int invalidate_authorizer(struct ceph_connection *con) ...@@ -4571,6 +5308,11 @@ static int invalidate_authorizer(struct ceph_connection *con)
return ceph_monc_validate_auth(&osdc->client->monc); return ceph_monc_validate_auth(&osdc->client->monc);
} }
/*
* ->reencode_message hook of osd_con_ops: finishes the request front
* for the peer on msg->con by calling encode_request_finish().
*/
static void osd_reencode_message(struct ceph_msg *msg)
{
encode_request_finish(msg);
}
static int osd_sign_message(struct ceph_msg *msg) static int osd_sign_message(struct ceph_msg *msg)
{ {
struct ceph_osd *o = msg->con->private; struct ceph_osd *o = msg->con->private;
...@@ -4595,6 +5337,7 @@ static const struct ceph_connection_operations osd_con_ops = { ...@@ -4595,6 +5337,7 @@ static const struct ceph_connection_operations osd_con_ops = {
.verify_authorizer_reply = verify_authorizer_reply, .verify_authorizer_reply = verify_authorizer_reply,
.invalidate_authorizer = invalidate_authorizer, .invalidate_authorizer = invalidate_authorizer,
.alloc_msg = alloc_msg, .alloc_msg = alloc_msg,
.reencode_message = osd_reencode_message,
.sign_message = osd_sign_message, .sign_message = osd_sign_message,
.check_message_signature = osd_check_message_signature, .check_message_signature = osd_check_message_signature,
.fault = osd_fault, .fault = osd_fault,
......
...@@ -11,7 +11,7 @@ ...@@ -11,7 +11,7 @@
#include <linux/crush/hash.h> #include <linux/crush/hash.h>
#include <linux/crush/mapper.h> #include <linux/crush/mapper.h>
char *ceph_osdmap_state_str(char *str, int len, int state) char *ceph_osdmap_state_str(char *str, int len, u32 state)
{ {
if (!len) if (!len)
return str; return str;
...@@ -138,21 +138,177 @@ static int crush_decode_straw2_bucket(void **p, void *end, ...@@ -138,21 +138,177 @@ static int crush_decode_straw2_bucket(void **p, void *end,
return -EINVAL; return -EINVAL;
} }
static int skip_name_map(void **p, void *end) static struct crush_choose_arg_map *alloc_choose_arg_map(void)
{ {
int len; struct crush_choose_arg_map *arg_map;
ceph_decode_32_safe(p, end, len ,bad);
while (len--) { arg_map = kzalloc(sizeof(*arg_map), GFP_NOIO);
int strlen; if (!arg_map)
*p += sizeof(u32); return NULL;
ceph_decode_32_safe(p, end, strlen, bad);
*p += strlen; RB_CLEAR_NODE(&arg_map->node);
return arg_map;
}
/*
 * Free @arg_map together with every per-bucket weight set and id
 * array hanging off it.  NULL is accepted and ignored.
 *
 * This is also the error-path destructor for decode_choose_args(),
 * so it has to cope with a partially built map: ->size is assigned
 * before ->args is allocated and ->weight_set_size before
 * ->weight_set is, which means either array may still be NULL while
 * its element count is already non-zero.
 *
 * NOTE(review): decode_choose_arg() allocates ->weight_set with
 * kmalloc_array(), so after a failure part-way through a weight set
 * the remaining ->weights slots are uninitialized.  That cannot be
 * detected here and needs a zeroing allocation in the decoder.
 */
static void free_choose_arg_map(struct crush_choose_arg_map *arg_map)
{
	int i, j;

	if (!arg_map)
		return;

	WARN_ON(!RB_EMPTY_NODE(&arg_map->node));

	if (arg_map->args) {
		for (i = 0; i < arg_map->size; i++) {
			struct crush_choose_arg *arg = &arg_map->args[i];

			/* the count may be set without an array behind it */
			if (arg->weight_set) {
				for (j = 0; j < arg->weight_set_size; j++)
					kfree(arg->weight_set[j].weights);
				kfree(arg->weight_set);
			}
			kfree(arg->ids);
		}
		kfree(arg_map->args);
	}
	kfree(arg_map);
}
/*
* Tree helpers for crush_map->choose_args, keyed by
* ->choose_args_index and linked through ->node.  This provides the
* insert_choose_arg_map() and erase_choose_arg_map() used below.
*/
DEFINE_RB_FUNCS(choose_arg_map, struct crush_choose_arg_map, choose_args_index,
node);
/*
 * Remove and free every choose_arg map attached to @c.
 */
void clear_choose_args(struct crush_map *c)
{
	struct rb_node *n;

	/* always take the first node, erasing invalidates iteration */
	while ((n = rb_first(&c->choose_args))) {
		struct crush_choose_arg_map *arg_map =
		    rb_entry(n, struct crush_choose_arg_map, node);

		erase_choose_arg_map(&c->choose_args, arg_map);
		free_choose_arg_map(arg_map);
	}
}
/*
* Decode a 32-bit element count followed by that many 32-bit values
* into a newly allocated array.
*
* Returns the array and stores the count in *@plen.  An empty array
* is returned as NULL with *@plen set to 0.  On failure an ERR_PTR is
* returned (-EINVAL or -ENOMEM) and *@plen is left untouched.
*/
static u32 *decode_array_32_alloc(void **p, void *end, u32 *plen)
{
u32 *a = NULL;
u32 len;
int ret;
ceph_decode_32_safe(p, end, len, e_inval);
if (len) {
u32 i;
/*
* NOTE(review): the array is sized from the count read off the wire
* before ceph_decode_need() has confirmed that many values are
* actually present -- confirm this ordering is intended.
*/
a = kmalloc_array(len, sizeof(u32), GFP_NOIO);
if (!a) {
ret = -ENOMEM;
goto fail;
}
ceph_decode_need(p, end, len * sizeof(u32), e_inval);
for (i = 0; i < len; i++)
a[i] = ceph_decode_32(p);
}
*plen = len;
return a;
e_inval:
ret = -EINVAL;
fail:
/* a is still NULL if nothing was allocated */
kfree(a);
return ERR_PTR(ret);
} }
/*
* Assumes @arg is zero-initialized.
*/
static int decode_choose_arg(void **p, void *end, struct crush_choose_arg *arg)
{
int ret;
ceph_decode_32_safe(p, end, arg->weight_set_size, e_inval);
if (arg->weight_set_size) {
u32 i;
arg->weight_set = kmalloc_array(arg->weight_set_size,
sizeof(*arg->weight_set),
GFP_NOIO);
if (!arg->weight_set)
return -ENOMEM;
for (i = 0; i < arg->weight_set_size; i++) {
struct crush_weight_set *w = &arg->weight_set[i];
w->weights = decode_array_32_alloc(p, end, &w->size);
if (IS_ERR(w->weights)) {
ret = PTR_ERR(w->weights);
w->weights = NULL;
return ret;
}
}
}
arg->ids = decode_array_32_alloc(p, end, &arg->ids_size);
if (IS_ERR(arg->ids)) {
ret = PTR_ERR(arg->ids);
arg->ids = NULL;
return ret;
}
return 0; return 0;
bad:
e_inval:
return -EINVAL; return -EINVAL;
} }
static int decode_choose_args(void **p, void *end, struct crush_map *c)
{
struct crush_choose_arg_map *arg_map = NULL;
u32 num_choose_arg_maps, num_buckets;
int ret;
ceph_decode_32_safe(p, end, num_choose_arg_maps, e_inval);
while (num_choose_arg_maps--) {
arg_map = alloc_choose_arg_map();
if (!arg_map) {
ret = -ENOMEM;
goto fail;
}
ceph_decode_64_safe(p, end, arg_map->choose_args_index,
e_inval);
arg_map->size = c->max_buckets;
arg_map->args = kcalloc(arg_map->size, sizeof(*arg_map->args),
GFP_NOIO);
if (!arg_map->args) {
ret = -ENOMEM;
goto fail;
}
ceph_decode_32_safe(p, end, num_buckets, e_inval);
while (num_buckets--) {
struct crush_choose_arg *arg;
u32 bucket_index;
ceph_decode_32_safe(p, end, bucket_index, e_inval);
if (bucket_index >= arg_map->size)
goto e_inval;
arg = &arg_map->args[bucket_index];
ret = decode_choose_arg(p, end, arg);
if (ret)
goto fail;
}
insert_choose_arg_map(&c->choose_args, arg_map);
}
return 0;
e_inval:
ret = -EINVAL;
fail:
free_choose_arg_map(arg_map);
return ret;
}
static void crush_finalize(struct crush_map *c) static void crush_finalize(struct crush_map *c)
{ {
__s32 b; __s32 b;
...@@ -187,7 +343,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end) ...@@ -187,7 +343,6 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
void **p = &pbyval; void **p = &pbyval;
void *start = pbyval; void *start = pbyval;
u32 magic; u32 magic;
u32 num_name_maps;
dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p)); dout("crush_decode %p to %p len %d\n", *p, end, (int)(end - *p));
...@@ -195,6 +350,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end) ...@@ -195,6 +350,8 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
if (c == NULL) if (c == NULL)
return ERR_PTR(-ENOMEM); return ERR_PTR(-ENOMEM);
c->choose_args = RB_ROOT;
/* set tunables to default values */ /* set tunables to default values */
c->choose_local_tries = 2; c->choose_local_tries = 2;
c->choose_local_fallback_tries = 5; c->choose_local_fallback_tries = 5;
...@@ -353,12 +510,9 @@ static struct crush_map *crush_decode(void *pbyval, void *end) ...@@ -353,12 +510,9 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
} }
} }
/* ignore trailing name maps. */ ceph_decode_skip_map(p, end, 32, string, bad); /* type_map */
for (num_name_maps = 0; num_name_maps < 3; num_name_maps++) { ceph_decode_skip_map(p, end, 32, string, bad); /* name_map */
err = skip_name_map(p, end); ceph_decode_skip_map(p, end, 32, string, bad); /* rule_name_map */
if (err < 0)
goto done;
}
/* tunables */ /* tunables */
ceph_decode_need(p, end, 3*sizeof(u32), done); ceph_decode_need(p, end, 3*sizeof(u32), done);
...@@ -391,6 +545,21 @@ static struct crush_map *crush_decode(void *pbyval, void *end) ...@@ -391,6 +545,21 @@ static struct crush_map *crush_decode(void *pbyval, void *end)
dout("crush decode tunable chooseleaf_stable = %d\n", dout("crush decode tunable chooseleaf_stable = %d\n",
c->chooseleaf_stable); c->chooseleaf_stable);
if (*p != end) {
/* class_map */
ceph_decode_skip_map(p, end, 32, 32, bad);
/* class_name */
ceph_decode_skip_map(p, end, 32, string, bad);
/* class_bucket */
ceph_decode_skip_map_of_map(p, end, 32, 32, 32, bad);
}
if (*p != end) {
err = decode_choose_args(p, end, c);
if (err)
goto bad;
}
done: done:
crush_finalize(c); crush_finalize(c);
dout("crush_decode success\n"); dout("crush_decode success\n");
...@@ -418,74 +587,48 @@ int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs) ...@@ -418,74 +587,48 @@ int ceph_pg_compare(const struct ceph_pg *lhs, const struct ceph_pg *rhs)
return 0; return 0;
} }
/* int ceph_spg_compare(const struct ceph_spg *lhs, const struct ceph_spg *rhs)
* rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
* to a set of osds) and primary_temp (explicit primary setting)
*/
static int __insert_pg_mapping(struct ceph_pg_mapping *new,
struct rb_root *root)
{ {
struct rb_node **p = &root->rb_node; int ret;
struct rb_node *parent = NULL;
struct ceph_pg_mapping *pg = NULL;
int c;
dout("__insert_pg_mapping %llx %p\n", *(u64 *)&new->pgid, new); ret = ceph_pg_compare(&lhs->pgid, &rhs->pgid);
while (*p) { if (ret)
parent = *p; return ret;
pg = rb_entry(parent, struct ceph_pg_mapping, node);
c = ceph_pg_compare(&new->pgid, &pg->pgid); if (lhs->shard < rhs->shard)
if (c < 0) return -1;
p = &(*p)->rb_left; if (lhs->shard > rhs->shard)
else if (c > 0) return 1;
p = &(*p)->rb_right;
else
return -EEXIST;
}
rb_link_node(&new->node, parent, p);
rb_insert_color(&new->node, root);
return 0; return 0;
} }
static struct ceph_pg_mapping *__lookup_pg_mapping(struct rb_root *root, static struct ceph_pg_mapping *alloc_pg_mapping(size_t payload_len)
struct ceph_pg pgid)
{ {
struct rb_node *n = root->rb_node;
struct ceph_pg_mapping *pg; struct ceph_pg_mapping *pg;
int c;
while (n) { pg = kmalloc(sizeof(*pg) + payload_len, GFP_NOIO);
pg = rb_entry(n, struct ceph_pg_mapping, node); if (!pg)
c = ceph_pg_compare(&pgid, &pg->pgid);
if (c < 0) {
n = n->rb_left;
} else if (c > 0) {
n = n->rb_right;
} else {
dout("__lookup_pg_mapping %lld.%x got %p\n",
pgid.pool, pgid.seed, pg);
return pg;
}
}
return NULL; return NULL;
RB_CLEAR_NODE(&pg->node);
return pg;
} }
static int __remove_pg_mapping(struct rb_root *root, struct ceph_pg pgid) static void free_pg_mapping(struct ceph_pg_mapping *pg)
{ {
struct ceph_pg_mapping *pg = __lookup_pg_mapping(root, pgid); WARN_ON(!RB_EMPTY_NODE(&pg->node));
if (pg) {
dout("__remove_pg_mapping %lld.%x %p\n", pgid.pool, pgid.seed,
pg);
rb_erase(&pg->node, root);
kfree(pg); kfree(pg);
return 0;
}
dout("__remove_pg_mapping %lld.%x dne\n", pgid.pool, pgid.seed);
return -ENOENT;
} }
/*
* rbtree of pg_mapping for handling pg_temp (explicit mapping of pgid
* to a set of osds) and primary_temp (explicit primary setting)
*/
DEFINE_RB_FUNCS2(pg_mapping, struct ceph_pg_mapping, pgid, ceph_pg_compare,
RB_BYPTR, const struct ceph_pg *, node)
/* /*
* rbtree of pg pool info * rbtree of pg pool info
*/ */
...@@ -682,11 +825,48 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi) ...@@ -682,11 +825,48 @@ static int decode_pool(void **p, void *end, struct ceph_pg_pool_info *pi)
*p += len; *p += len;
} }
/*
* last_force_op_resend_preluminous, will be overridden if the
* map was encoded with RESEND_ON_SPLIT
*/
if (ev >= 15) if (ev >= 15)
pi->last_force_request_resend = ceph_decode_32(p); pi->last_force_request_resend = ceph_decode_32(p);
else else
pi->last_force_request_resend = 0; pi->last_force_request_resend = 0;
if (ev >= 16)
*p += 4; /* skip min_read_recency_for_promote */
if (ev >= 17)
*p += 8; /* skip expected_num_objects */
if (ev >= 19)
*p += 4; /* skip cache_target_dirty_high_ratio_micro */
if (ev >= 20)
*p += 4; /* skip min_write_recency_for_promote */
if (ev >= 21)
*p += 1; /* skip use_gmt_hitset */
if (ev >= 22)
*p += 1; /* skip fast_read */
if (ev >= 23) {
*p += 4; /* skip hit_set_grade_decay_rate */
*p += 4; /* skip hit_set_search_last_n */
}
if (ev >= 24) {
/* skip opts */
*p += 1 + 1; /* versions */
len = ceph_decode_32(p);
*p += len;
}
if (ev >= 25)
pi->last_force_request_resend = ceph_decode_32(p);
/* ignore the rest */ /* ignore the rest */
*p = pool_end; *p = pool_end;
...@@ -743,6 +923,8 @@ struct ceph_osdmap *ceph_osdmap_alloc(void) ...@@ -743,6 +923,8 @@ struct ceph_osdmap *ceph_osdmap_alloc(void)
map->pool_max = -1; map->pool_max = -1;
map->pg_temp = RB_ROOT; map->pg_temp = RB_ROOT;
map->primary_temp = RB_ROOT; map->primary_temp = RB_ROOT;
map->pg_upmap = RB_ROOT;
map->pg_upmap_items = RB_ROOT;
mutex_init(&map->crush_workspace_mutex); mutex_init(&map->crush_workspace_mutex);
return map; return map;
...@@ -757,14 +939,28 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) ...@@ -757,14 +939,28 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
struct ceph_pg_mapping *pg = struct ceph_pg_mapping *pg =
rb_entry(rb_first(&map->pg_temp), rb_entry(rb_first(&map->pg_temp),
struct ceph_pg_mapping, node); struct ceph_pg_mapping, node);
rb_erase(&pg->node, &map->pg_temp); erase_pg_mapping(&map->pg_temp, pg);
kfree(pg); free_pg_mapping(pg);
} }
while (!RB_EMPTY_ROOT(&map->primary_temp)) { while (!RB_EMPTY_ROOT(&map->primary_temp)) {
struct ceph_pg_mapping *pg = struct ceph_pg_mapping *pg =
rb_entry(rb_first(&map->primary_temp), rb_entry(rb_first(&map->primary_temp),
struct ceph_pg_mapping, node); struct ceph_pg_mapping, node);
rb_erase(&pg->node, &map->primary_temp); erase_pg_mapping(&map->primary_temp, pg);
free_pg_mapping(pg);
}
while (!RB_EMPTY_ROOT(&map->pg_upmap)) {
struct ceph_pg_mapping *pg =
rb_entry(rb_first(&map->pg_upmap),
struct ceph_pg_mapping, node);
rb_erase(&pg->node, &map->pg_upmap);
kfree(pg);
}
while (!RB_EMPTY_ROOT(&map->pg_upmap_items)) {
struct ceph_pg_mapping *pg =
rb_entry(rb_first(&map->pg_upmap_items),
struct ceph_pg_mapping, node);
rb_erase(&pg->node, &map->pg_upmap_items);
kfree(pg); kfree(pg);
} }
while (!RB_EMPTY_ROOT(&map->pg_pools)) { while (!RB_EMPTY_ROOT(&map->pg_pools)) {
...@@ -788,7 +984,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map) ...@@ -788,7 +984,7 @@ void ceph_osdmap_destroy(struct ceph_osdmap *map)
*/ */
static int osdmap_set_max_osd(struct ceph_osdmap *map, int max) static int osdmap_set_max_osd(struct ceph_osdmap *map, int max)
{ {
u8 *state; u32 *state;
u32 *weight; u32 *weight;
struct ceph_entity_addr *addr; struct ceph_entity_addr *addr;
int i; int i;
...@@ -964,120 +1160,121 @@ static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map) ...@@ -964,120 +1160,121 @@ static int decode_new_pools(void **p, void *end, struct ceph_osdmap *map)
return __decode_pools(p, end, map, true); return __decode_pools(p, end, map, true);
} }
static int __decode_pg_temp(void **p, void *end, struct ceph_osdmap *map, typedef struct ceph_pg_mapping *(*decode_mapping_fn_t)(void **, void *, bool);
bool incremental)
static int decode_pg_mapping(void **p, void *end, struct rb_root *mapping_root,
decode_mapping_fn_t fn, bool incremental)
{ {
u32 n; u32 n;
WARN_ON(!incremental && !fn);
ceph_decode_32_safe(p, end, n, e_inval); ceph_decode_32_safe(p, end, n, e_inval);
while (n--) { while (n--) {
struct ceph_pg_mapping *pg;
struct ceph_pg pgid; struct ceph_pg pgid;
u32 len, i;
int ret; int ret;
ret = ceph_decode_pgid(p, end, &pgid); ret = ceph_decode_pgid(p, end, &pgid);
if (ret) if (ret)
return ret; return ret;
ceph_decode_32_safe(p, end, len, e_inval); pg = lookup_pg_mapping(mapping_root, &pgid);
if (pg) {
WARN_ON(!incremental);
erase_pg_mapping(mapping_root, pg);
free_pg_mapping(pg);
}
ret = __remove_pg_mapping(&map->pg_temp, pgid); if (fn) {
BUG_ON(!incremental && ret != -ENOENT); pg = fn(p, end, incremental);
if (IS_ERR(pg))
return PTR_ERR(pg);
if (!incremental || len > 0) { if (pg) {
struct ceph_pg_mapping *pg; pg->pgid = pgid; /* struct */
insert_pg_mapping(mapping_root, pg);
}
}
}
ceph_decode_need(p, end, len*sizeof(u32), e_inval); return 0;
if (len > (UINT_MAX - sizeof(*pg)) / sizeof(u32)) e_inval:
return -EINVAL; return -EINVAL;
}
static struct ceph_pg_mapping *__decode_pg_temp(void **p, void *end,
bool incremental)
{
struct ceph_pg_mapping *pg;
u32 len, i;
pg = kzalloc(sizeof(*pg) + len*sizeof(u32), GFP_NOFS); ceph_decode_32_safe(p, end, len, e_inval);
if (len == 0 && incremental)
return NULL; /* new_pg_temp: [] to remove */
if (len > (SIZE_MAX - sizeof(*pg)) / sizeof(u32))
return ERR_PTR(-EINVAL);
ceph_decode_need(p, end, len * sizeof(u32), e_inval);
pg = alloc_pg_mapping(len * sizeof(u32));
if (!pg) if (!pg)
return -ENOMEM; return ERR_PTR(-ENOMEM);
pg->pgid = pgid;
pg->pg_temp.len = len; pg->pg_temp.len = len;
for (i = 0; i < len; i++) for (i = 0; i < len; i++)
pg->pg_temp.osds[i] = ceph_decode_32(p); pg->pg_temp.osds[i] = ceph_decode_32(p);
ret = __insert_pg_mapping(pg, &map->pg_temp); return pg;
if (ret) {
kfree(pg);
return ret;
}
}
}
return 0;
e_inval: e_inval:
return -EINVAL; return ERR_PTR(-EINVAL);
} }
static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map) static int decode_pg_temp(void **p, void *end, struct ceph_osdmap *map)
{ {
return __decode_pg_temp(p, end, map, false); return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp,
false);
} }
static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map) static int decode_new_pg_temp(void **p, void *end, struct ceph_osdmap *map)
{ {
return __decode_pg_temp(p, end, map, true); return decode_pg_mapping(p, end, &map->pg_temp, __decode_pg_temp,
true);
} }
static int __decode_primary_temp(void **p, void *end, struct ceph_osdmap *map, static struct ceph_pg_mapping *__decode_primary_temp(void **p, void *end,
bool incremental) bool incremental)
{ {
u32 n; struct ceph_pg_mapping *pg;
ceph_decode_32_safe(p, end, n, e_inval);
while (n--) {
struct ceph_pg pgid;
u32 osd; u32 osd;
int ret;
ret = ceph_decode_pgid(p, end, &pgid);
if (ret)
return ret;
ceph_decode_32_safe(p, end, osd, e_inval); ceph_decode_32_safe(p, end, osd, e_inval);
if (osd == (u32)-1 && incremental)
return NULL; /* new_primary_temp: -1 to remove */
ret = __remove_pg_mapping(&map->primary_temp, pgid); pg = alloc_pg_mapping(0);
BUG_ON(!incremental && ret != -ENOENT);
if (!incremental || osd != (u32)-1) {
struct ceph_pg_mapping *pg;
pg = kzalloc(sizeof(*pg), GFP_NOFS);
if (!pg) if (!pg)
return -ENOMEM; return ERR_PTR(-ENOMEM);
pg->pgid = pgid;
pg->primary_temp.osd = osd; pg->primary_temp.osd = osd;
return pg;
ret = __insert_pg_mapping(pg, &map->primary_temp);
if (ret) {
kfree(pg);
return ret;
}
}
}
return 0;
e_inval: e_inval:
return -EINVAL; return ERR_PTR(-EINVAL);
} }
static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map) static int decode_primary_temp(void **p, void *end, struct ceph_osdmap *map)
{ {
return __decode_primary_temp(p, end, map, false); return decode_pg_mapping(p, end, &map->primary_temp,
__decode_primary_temp, false);
} }
static int decode_new_primary_temp(void **p, void *end, static int decode_new_primary_temp(void **p, void *end,
struct ceph_osdmap *map) struct ceph_osdmap *map)
{ {
return __decode_primary_temp(p, end, map, true); return decode_pg_mapping(p, end, &map->primary_temp,
__decode_primary_temp, true);
} }
u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd) u32 ceph_get_primary_affinity(struct ceph_osdmap *map, int osd)
...@@ -1168,6 +1365,75 @@ static int decode_new_primary_affinity(void **p, void *end, ...@@ -1168,6 +1365,75 @@ static int decode_new_primary_affinity(void **p, void *end,
return -EINVAL; return -EINVAL;
} }
/*
 * Per-entry decoder for pg_upmap.  The payload is decoded by
 * __decode_pg_temp() with incremental forced to false, so the
 * "empty list means remove" shortcut that __decode_pg_temp() takes for
 * incremental maps is never used here.  The bool argument is ignored.
 */
static struct ceph_pg_mapping *__decode_pg_upmap(void **p, void *end,
						 bool __unused)
{
	struct ceph_pg_mapping *mapping = __decode_pg_temp(p, end, false);

	return mapping;
}
/*
 * Decode the pg_upmap section of a full map into map->pg_upmap.
 * Returns 0 or a negative errno from decode_pg_mapping().
 */
static int decode_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
{
	struct rb_root *root = &map->pg_upmap;

	return decode_pg_mapping(p, end, root, __decode_pg_upmap, false);
}
/*
 * Decode the new_pg_upmap section of an incremental map: entries are
 * decoded with __decode_pg_upmap() and replace any existing mapping for
 * the same pgid in map->pg_upmap.
 */
static int decode_new_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
{
	struct rb_root *root = &map->pg_upmap;

	return decode_pg_mapping(p, end, root, __decode_pg_upmap, true);
}
/*
 * Decode the old_pg_upmap section of an incremental map.  No per-entry
 * decoder is supplied, so decode_pg_mapping() only erases the listed
 * pgids from map->pg_upmap and inserts nothing.
 */
static int decode_old_pg_upmap(void **p, void *end, struct ceph_osdmap *map)
{
	struct rb_root *root = &map->pg_upmap;

	return decode_pg_mapping(p, end, root, NULL, true);
}
/*
 * Per-entry decoder for pg_upmap_items: a u32 count followed by that
 * many (from, to) pairs of u32 values.
 *
 * Returns a freshly allocated mapping with ->pg_upmap_items filled in
 * (->pgid is set by the caller, decode_pg_mapping()), ERR_PTR(-EINVAL)
 * if the buffer is too short or the count would overflow the allocation
 * size, or ERR_PTR(-ENOMEM).  The bool argument is ignored.
 */
static struct ceph_pg_mapping *__decode_pg_upmap_items(void **p, void *end,
						       bool __unused)
{
	struct ceph_pg_mapping *pg;
	u32 len, i;

	ceph_decode_32_safe(p, end, len, e_inval);
	/* guard sizeof(*pg) + 2 * len * sizeof(u32) against overflow */
	if (len > (SIZE_MAX - sizeof(*pg)) / (2 * sizeof(u32)))
		return ERR_PTR(-EINVAL);

	ceph_decode_need(p, end, 2 * len * sizeof(u32), e_inval);

	/*
	 * Allocate through alloc_pg_mapping() like the other decoders: it
	 * RB_CLEAR_NODE()s ->node.  A merely zeroed node is not "empty" as
	 * far as RB_EMPTY_NODE() is concerned, which free_pg_mapping()
	 * warns about and the rbtree insert helper relies on.
	 */
	pg = alloc_pg_mapping(2 * len * sizeof(u32));
	if (!pg)
		return ERR_PTR(-ENOMEM);

	pg->pg_upmap_items.len = len;
	for (i = 0; i < len; i++) {
		pg->pg_upmap_items.from_to[i][0] = ceph_decode_32(p);
		pg->pg_upmap_items.from_to[i][1] = ceph_decode_32(p);
	}

	return pg;

e_inval:
	return ERR_PTR(-EINVAL);
}
/*
 * Decode the pg_upmap_items section of a full map into
 * map->pg_upmap_items.  Returns 0 or a negative errno from
 * decode_pg_mapping().
 */
static int decode_pg_upmap_items(void **p, void *end, struct ceph_osdmap *map)
{
	struct rb_root *root = &map->pg_upmap_items;

	return decode_pg_mapping(p, end, root, __decode_pg_upmap_items, false);
}
/*
 * Decode the new_pg_upmap_items section of an incremental map: entries
 * are decoded with __decode_pg_upmap_items() and replace any existing
 * mapping for the same pgid in map->pg_upmap_items.
 */
static int decode_new_pg_upmap_items(void **p, void *end,
				     struct ceph_osdmap *map)
{
	struct rb_root *root = &map->pg_upmap_items;

	return decode_pg_mapping(p, end, root, __decode_pg_upmap_items, true);
}
/*
 * Decode the old_pg_upmap_items section of an incremental map.  No
 * per-entry decoder is supplied, so decode_pg_mapping() only erases the
 * listed pgids from map->pg_upmap_items and inserts nothing.
 */
static int decode_old_pg_upmap_items(void **p, void *end,
				     struct ceph_osdmap *map)
{
	struct rb_root *root = &map->pg_upmap_items;

	return decode_pg_mapping(p, end, root, NULL, true);
}
/* /*
* decode a full map. * decode a full map.
*/ */
...@@ -1218,13 +1484,21 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) ...@@ -1218,13 +1484,21 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
/* osd_state, osd_weight, osd_addrs->client_addr */ /* osd_state, osd_weight, osd_addrs->client_addr */
ceph_decode_need(p, end, 3*sizeof(u32) + ceph_decode_need(p, end, 3*sizeof(u32) +
map->max_osd*(1 + sizeof(*map->osd_weight) + map->max_osd*((struct_v >= 5 ? sizeof(u32) :
sizeof(u8)) +
sizeof(*map->osd_weight) +
sizeof(*map->osd_addr)), e_inval); sizeof(*map->osd_addr)), e_inval);
if (ceph_decode_32(p) != map->max_osd) if (ceph_decode_32(p) != map->max_osd)
goto e_inval; goto e_inval;
ceph_decode_copy(p, map->osd_state, map->max_osd); if (struct_v >= 5) {
for (i = 0; i < map->max_osd; i++)
map->osd_state[i] = ceph_decode_32(p);
} else {
for (i = 0; i < map->max_osd; i++)
map->osd_state[i] = ceph_decode_8(p);
}
if (ceph_decode_32(p) != map->max_osd) if (ceph_decode_32(p) != map->max_osd)
goto e_inval; goto e_inval;
...@@ -1257,9 +1531,7 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) ...@@ -1257,9 +1531,7 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
if (err) if (err)
goto bad; goto bad;
} else { } else {
/* XXX can this happen? */ WARN_ON(map->osd_primary_affinity);
kfree(map->osd_primary_affinity);
map->osd_primary_affinity = NULL;
} }
/* crush */ /* crush */
...@@ -1268,6 +1540,26 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map) ...@@ -1268,6 +1540,26 @@ static int osdmap_decode(void **p, void *end, struct ceph_osdmap *map)
if (err) if (err)
goto bad; goto bad;
*p += len;
if (struct_v >= 3) {
/* erasure_code_profiles */
ceph_decode_skip_map_of_map(p, end, string, string, string,
bad);
}
if (struct_v >= 4) {
err = decode_pg_upmap(p, end, map);
if (err)
goto bad;
err = decode_pg_upmap_items(p, end, map);
if (err)
goto bad;
} else {
WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap));
WARN_ON(!RB_EMPTY_ROOT(&map->pg_upmap_items));
}
/* ignore the rest */ /* ignore the rest */
*p = end; *p = end;
...@@ -1314,7 +1606,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end) ...@@ -1314,7 +1606,7 @@ struct ceph_osdmap *ceph_osdmap_decode(void **p, void *end)
* new_up_client: { osd=6, addr=... } # set osd_state and addr * new_up_client: { osd=6, addr=... } # set osd_state and addr
* new_state: { osd=6, xorstate=EXISTS } # clear osd_state * new_state: { osd=6, xorstate=EXISTS } # clear osd_state
*/ */
static int decode_new_up_state_weight(void **p, void *end, static int decode_new_up_state_weight(void **p, void *end, u8 struct_v,
struct ceph_osdmap *map) struct ceph_osdmap *map)
{ {
void *new_up_client; void *new_up_client;
...@@ -1330,7 +1622,7 @@ static int decode_new_up_state_weight(void **p, void *end, ...@@ -1330,7 +1622,7 @@ static int decode_new_up_state_weight(void **p, void *end,
new_state = *p; new_state = *p;
ceph_decode_32_safe(p, end, len, e_inval); ceph_decode_32_safe(p, end, len, e_inval);
len *= sizeof(u32) + sizeof(u8); len *= sizeof(u32) + (struct_v >= 5 ? sizeof(u32) : sizeof(u8));
ceph_decode_need(p, end, len, e_inval); ceph_decode_need(p, end, len, e_inval);
*p += len; *p += len;
...@@ -1366,10 +1658,13 @@ static int decode_new_up_state_weight(void **p, void *end, ...@@ -1366,10 +1658,13 @@ static int decode_new_up_state_weight(void **p, void *end,
len = ceph_decode_32(p); len = ceph_decode_32(p);
while (len--) { while (len--) {
s32 osd; s32 osd;
u8 xorstate; u32 xorstate;
int ret; int ret;
osd = ceph_decode_32(p); osd = ceph_decode_32(p);
if (struct_v >= 5)
xorstate = ceph_decode_32(p);
else
xorstate = ceph_decode_8(p); xorstate = ceph_decode_8(p);
if (xorstate == 0) if (xorstate == 0)
xorstate = CEPH_OSD_UP; xorstate = CEPH_OSD_UP;
...@@ -1504,7 +1799,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, ...@@ -1504,7 +1799,7 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
} }
/* new_up_client, new_state, new_weight */ /* new_up_client, new_state, new_weight */
err = decode_new_up_state_weight(p, end, map); err = decode_new_up_state_weight(p, end, struct_v, map);
if (err) if (err)
goto bad; goto bad;
...@@ -1527,6 +1822,32 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, ...@@ -1527,6 +1822,32 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
goto bad; goto bad;
} }
if (struct_v >= 3) {
/* new_erasure_code_profiles */
ceph_decode_skip_map_of_map(p, end, string, string, string,
bad);
/* old_erasure_code_profiles */
ceph_decode_skip_set(p, end, string, bad);
}
if (struct_v >= 4) {
err = decode_new_pg_upmap(p, end, map);
if (err)
goto bad;
err = decode_old_pg_upmap(p, end, map);
if (err)
goto bad;
err = decode_new_pg_upmap_items(p, end, map);
if (err)
goto bad;
err = decode_old_pg_upmap_items(p, end, map);
if (err)
goto bad;
}
/* ignore the rest */ /* ignore the rest */
*p = end; *p = end;
...@@ -1547,12 +1868,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end, ...@@ -1547,12 +1868,13 @@ struct ceph_osdmap *osdmap_apply_incremental(void **p, void *end,
void ceph_oloc_copy(struct ceph_object_locator *dest, void ceph_oloc_copy(struct ceph_object_locator *dest,
const struct ceph_object_locator *src) const struct ceph_object_locator *src)
{ {
WARN_ON(!ceph_oloc_empty(dest)); ceph_oloc_destroy(dest);
WARN_ON(dest->pool_ns); /* empty() only covers ->pool */
dest->pool = src->pool; dest->pool = src->pool;
if (src->pool_ns) if (src->pool_ns)
dest->pool_ns = ceph_get_string(src->pool_ns); dest->pool_ns = ceph_get_string(src->pool_ns);
else
dest->pool_ns = NULL;
} }
EXPORT_SYMBOL(ceph_oloc_copy); EXPORT_SYMBOL(ceph_oloc_copy);
...@@ -1565,14 +1887,15 @@ EXPORT_SYMBOL(ceph_oloc_destroy); ...@@ -1565,14 +1887,15 @@ EXPORT_SYMBOL(ceph_oloc_destroy);
void ceph_oid_copy(struct ceph_object_id *dest, void ceph_oid_copy(struct ceph_object_id *dest,
const struct ceph_object_id *src) const struct ceph_object_id *src)
{ {
WARN_ON(!ceph_oid_empty(dest)); ceph_oid_destroy(dest);
if (src->name != src->inline_name) { if (src->name != src->inline_name) {
/* very rare, see ceph_object_id definition */ /* very rare, see ceph_object_id definition */
dest->name = kmalloc(src->name_len + 1, dest->name = kmalloc(src->name_len + 1,
GFP_NOIO | __GFP_NOFAIL); GFP_NOIO | __GFP_NOFAIL);
} else {
dest->name = dest->inline_name;
} }
memcpy(dest->name, src->name, src->name_len + 1); memcpy(dest->name, src->name, src->name_len + 1);
dest->name_len = src->name_len; dest->name_len = src->name_len;
} }
...@@ -1714,8 +2037,7 @@ void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src) ...@@ -1714,8 +2037,7 @@ void ceph_osds_copy(struct ceph_osds *dest, const struct ceph_osds *src)
dest->primary = src->primary; dest->primary = src->primary;
} }
static bool is_split(const struct ceph_pg *pgid, bool ceph_pg_is_split(const struct ceph_pg *pgid, u32 old_pg_num,
u32 old_pg_num,
u32 new_pg_num) u32 new_pg_num)
{ {
int old_bits = calc_bits_of(old_pg_num); int old_bits = calc_bits_of(old_pg_num);
...@@ -1761,7 +2083,7 @@ bool ceph_is_new_interval(const struct ceph_osds *old_acting, ...@@ -1761,7 +2083,7 @@ bool ceph_is_new_interval(const struct ceph_osds *old_acting,
!osds_equal(old_up, new_up) || !osds_equal(old_up, new_up) ||
old_size != new_size || old_size != new_size ||
old_min_size != new_min_size || old_min_size != new_min_size ||
is_split(pgid, old_pg_num, new_pg_num) || ceph_pg_is_split(pgid, old_pg_num, new_pg_num) ||
old_sort_bitwise != new_sort_bitwise; old_sort_bitwise != new_sort_bitwise;
} }
...@@ -1885,16 +2207,12 @@ EXPORT_SYMBOL(ceph_calc_file_object_mapping); ...@@ -1885,16 +2207,12 @@ EXPORT_SYMBOL(ceph_calc_file_object_mapping);
* Should only be called with target_oid and target_oloc (as opposed to * Should only be called with target_oid and target_oloc (as opposed to
* base_oid and base_oloc), since tiering isn't taken into account. * base_oid and base_oloc), since tiering isn't taken into account.
*/ */
int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, int __ceph_object_locator_to_pg(struct ceph_pg_pool_info *pi,
struct ceph_object_id *oid, const struct ceph_object_id *oid,
struct ceph_object_locator *oloc, const struct ceph_object_locator *oloc,
struct ceph_pg *raw_pgid) struct ceph_pg *raw_pgid)
{ {
struct ceph_pg_pool_info *pi; WARN_ON(pi->id != oloc->pool);
pi = ceph_pg_pool_by_id(osdmap, oloc->pool);
if (!pi)
return -ENOENT;
if (!oloc->pool_ns) { if (!oloc->pool_ns) {
raw_pgid->pool = oloc->pool; raw_pgid->pool = oloc->pool;
...@@ -1926,6 +2244,20 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap, ...@@ -1926,6 +2244,20 @@ int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
} }
return 0; return 0;
} }
/*
 * Map an object (oid + oloc) to its raw pgid, looking the pool up in
 * @osdmap by oloc->pool first.
 *
 * Returns -ENOENT if the pool is not present in @osdmap, otherwise the
 * result of __ceph_object_locator_to_pg().
 */
int ceph_object_locator_to_pg(struct ceph_osdmap *osdmap,
			      const struct ceph_object_id *oid,
			      const struct ceph_object_locator *oloc,
			      struct ceph_pg *raw_pgid)
{
	struct ceph_pg_pool_info *pool = ceph_pg_pool_by_id(osdmap, oloc->pool);

	return pool ? __ceph_object_locator_to_pg(pool, oid, oloc, raw_pgid)
		    : -ENOENT;
}
EXPORT_SYMBOL(ceph_object_locator_to_pg); EXPORT_SYMBOL(ceph_object_locator_to_pg);
/* /*
...@@ -1970,23 +2302,57 @@ static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi, ...@@ -1970,23 +2302,57 @@ static u32 raw_pg_to_pps(struct ceph_pg_pool_info *pi,
static int do_crush(struct ceph_osdmap *map, int ruleno, int x, static int do_crush(struct ceph_osdmap *map, int ruleno, int x,
int *result, int result_max, int *result, int result_max,
const __u32 *weight, int weight_max) const __u32 *weight, int weight_max,
u64 choose_args_index)
{ {
struct crush_choose_arg_map *arg_map;
int r; int r;
BUG_ON(result_max > CEPH_PG_MAX_SIZE); BUG_ON(result_max > CEPH_PG_MAX_SIZE);
arg_map = lookup_choose_arg_map(&map->crush->choose_args,
choose_args_index);
mutex_lock(&map->crush_workspace_mutex); mutex_lock(&map->crush_workspace_mutex);
r = crush_do_rule(map->crush, ruleno, x, result, result_max, r = crush_do_rule(map->crush, ruleno, x, result, result_max,
weight, weight_max, map->crush_workspace); weight, weight_max, map->crush_workspace,
arg_map ? arg_map->args : NULL);
mutex_unlock(&map->crush_workspace_mutex); mutex_unlock(&map->crush_workspace_mutex);
return r; return r;
} }
/*
 * Drop OSDs that do not exist in @osdmap from @set.
 *
 * If the pool allows shifting (ceph_can_shift_osds()), surviving entries
 * are compacted to the front in their original order and ->size shrinks.
 * Otherwise positions are kept and each nonexistent OSD is replaced with
 * CRUSH_ITEM_NONE.
 */
static void remove_nonexistent_osds(struct ceph_osdmap *osdmap,
				    struct ceph_pg_pool_info *pi,
				    struct ceph_osds *set)
{
	int src, dst = 0;

	if (!ceph_can_shift_osds(pi)) {
		/* positional set: blank out dne devices in place */
		for (src = 0; src < set->size; src++) {
			if (!ceph_osd_exists(osdmap, set->osds[src]))
				set->osds[src] = CRUSH_ITEM_NONE;
		}
		return;
	}

	/* compact: dst trails src by the number of entries dropped so far */
	for (src = 0; src < set->size; src++) {
		if (!ceph_osd_exists(osdmap, set->osds[src]))
			continue;

		if (dst != src)
			set->osds[dst] = set->osds[src];
		dst++;
	}
	set->size = dst;
}
/* /*
* Calculate raw set (CRUSH output) for given PG. The result may * Calculate raw set (CRUSH output) for given PG and filter out
* contain nonexistent OSDs. ->primary is undefined for a raw set. * nonexistent OSDs. ->primary is undefined for a raw set.
* *
* Placement seed (CRUSH input) is returned through @ppps. * Placement seed (CRUSH input) is returned through @ppps.
*/ */
...@@ -2020,7 +2386,7 @@ static void pg_to_raw_osds(struct ceph_osdmap *osdmap, ...@@ -2020,7 +2386,7 @@ static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
} }
len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size, len = do_crush(osdmap, ruleno, pps, raw->osds, pi->size,
osdmap->osd_weight, osdmap->max_osd); osdmap->osd_weight, osdmap->max_osd, pi->id);
if (len < 0) { if (len < 0) {
pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n", pr_err("error %d from crush rule %d: pool %lld ruleset %d type %d size %d\n",
len, ruleno, pi->id, pi->crush_ruleset, pi->type, len, ruleno, pi->id, pi->crush_ruleset, pi->type,
...@@ -2029,6 +2395,70 @@ static void pg_to_raw_osds(struct ceph_osdmap *osdmap, ...@@ -2029,6 +2395,70 @@ static void pg_to_raw_osds(struct ceph_osdmap *osdmap,
} }
raw->size = len; raw->size = len;
remove_nonexistent_osds(osdmap, pi, raw);
}
/*
 * Apply pg_upmap[_items] mappings for @pgid to the raw OSD set @raw.
 *
 * An explicit pg_upmap entry, if present, replaces the whole set unless
 * one of its targets has zero weight, in which case @raw is left as is.
 * If there is no pg_upmap entry, each (from, to) pair of a
 * pg_upmap_items entry is applied in turn: the first occurrence of
 * "from" is replaced with "to", provided "to" is not already in the set
 * and is not a zero-weight OSD.
 */
static void apply_upmap(struct ceph_osdmap *osdmap,
			const struct ceph_pg *pgid,
			struct ceph_osds *raw)
{
	struct ceph_pg_mapping *pg;
	int i, j;

	pg = lookup_pg_mapping(&osdmap->pg_upmap, pgid);
	if (pg) {
		/* make sure targets aren't marked out */
		for (i = 0; i < pg->pg_upmap.len; i++) {
			int osd = pg->pg_upmap.osds[i];

			if (osd != CRUSH_ITEM_NONE &&
			    osd < osdmap->max_osd &&
			    osdmap->osd_weight[osd] == 0) {
				/* reject/ignore explicit mapping */
				return;
			}
		}
		for (i = 0; i < pg->pg_upmap.len; i++)
			raw->osds[i] = pg->pg_upmap.osds[i];
		raw->size = pg->pg_upmap.len;
		return;
	}

	pg = lookup_pg_mapping(&osdmap->pg_upmap_items, pgid);
	if (pg) {
		/*
		 * Note: this approach does not allow a bidirectional swap,
		 * e.g., [[1,2],[2,1]] applied to [0,1,2] -> [0,2,1].
		 */
		for (i = 0; i < pg->pg_upmap_items.len; i++) {
			int from = pg->pg_upmap_items.from_to[i][0];
			int to = pg->pg_upmap_items.from_to[i][1];
			int pos = -1;
			bool exists = false;

			/* make sure replacement doesn't already appear */
			for (j = 0; j < raw->size; j++) {
				int osd = raw->osds[j];

				if (osd == to) {
					exists = true;
					break;
				}
				/* ignore mapping if target is marked out */
				if (osd == from && pos < 0 &&
				    !(to != CRUSH_ITEM_NONE &&
				      to < osdmap->max_osd &&
				      osdmap->osd_weight[to] == 0)) {
					pos = j;
				}
			}
			/*
			 * Keep going after a successful replacement: an
			 * entry may carry several pairs and all of them
			 * have to be applied, not just the first match.
			 */
			if (!exists && pos >= 0)
				raw->osds[pos] = to;
		}
	}
}
/* /*
...@@ -2151,18 +2581,16 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap, ...@@ -2151,18 +2581,16 @@ static void apply_primary_affinity(struct ceph_osdmap *osdmap,
*/ */
static void get_temp_osds(struct ceph_osdmap *osdmap, static void get_temp_osds(struct ceph_osdmap *osdmap,
struct ceph_pg_pool_info *pi, struct ceph_pg_pool_info *pi,
const struct ceph_pg *raw_pgid, const struct ceph_pg *pgid,
struct ceph_osds *temp) struct ceph_osds *temp)
{ {
struct ceph_pg pgid;
struct ceph_pg_mapping *pg; struct ceph_pg_mapping *pg;
int i; int i;
raw_pg_to_pg(pi, raw_pgid, &pgid);
ceph_osds_init(temp); ceph_osds_init(temp);
/* pg_temp? */ /* pg_temp? */
pg = __lookup_pg_mapping(&osdmap->pg_temp, pgid); pg = lookup_pg_mapping(&osdmap->pg_temp, pgid);
if (pg) { if (pg) {
for (i = 0; i < pg->pg_temp.len; i++) { for (i = 0; i < pg->pg_temp.len; i++) {
if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) { if (ceph_osd_is_down(osdmap, pg->pg_temp.osds[i])) {
...@@ -2185,7 +2613,7 @@ static void get_temp_osds(struct ceph_osdmap *osdmap, ...@@ -2185,7 +2613,7 @@ static void get_temp_osds(struct ceph_osdmap *osdmap,
} }
/* primary_temp? */ /* primary_temp? */
pg = __lookup_pg_mapping(&osdmap->primary_temp, pgid); pg = lookup_pg_mapping(&osdmap->primary_temp, pgid);
if (pg) if (pg)
temp->primary = pg->primary_temp.osd; temp->primary = pg->primary_temp.osd;
} }
...@@ -2198,43 +2626,75 @@ static void get_temp_osds(struct ceph_osdmap *osdmap, ...@@ -2198,43 +2626,75 @@ static void get_temp_osds(struct ceph_osdmap *osdmap,
* resend a request. * resend a request.
*/ */
void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap, void ceph_pg_to_up_acting_osds(struct ceph_osdmap *osdmap,
struct ceph_pg_pool_info *pi,
const struct ceph_pg *raw_pgid, const struct ceph_pg *raw_pgid,
struct ceph_osds *up, struct ceph_osds *up,
struct ceph_osds *acting) struct ceph_osds *acting)
{ {
struct ceph_pg_pool_info *pi; struct ceph_pg pgid;
u32 pps; u32 pps;
pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool); WARN_ON(pi->id != raw_pgid->pool);
if (!pi) { raw_pg_to_pg(pi, raw_pgid, &pgid);
ceph_osds_init(up);
ceph_osds_init(acting);
goto out;
}
pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps); pg_to_raw_osds(osdmap, pi, raw_pgid, up, &pps);
apply_upmap(osdmap, &pgid, up);
raw_to_up_osds(osdmap, pi, up); raw_to_up_osds(osdmap, pi, up);
apply_primary_affinity(osdmap, pi, pps, up); apply_primary_affinity(osdmap, pi, pps, up);
get_temp_osds(osdmap, pi, raw_pgid, acting); get_temp_osds(osdmap, pi, &pgid, acting);
if (!acting->size) { if (!acting->size) {
memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0])); memcpy(acting->osds, up->osds, up->size * sizeof(up->osds[0]));
acting->size = up->size; acting->size = up->size;
if (acting->primary == -1) if (acting->primary == -1)
acting->primary = up->primary; acting->primary = up->primary;
} }
out:
WARN_ON(!osds_valid(up) || !osds_valid(acting)); WARN_ON(!osds_valid(up) || !osds_valid(acting));
} }
/*
 * Fill in @spgid (pgid + shard) for the PG that @raw_pgid maps to.
 *
 * For pools whose OSD sets can be shifted (ceph_can_shift_osds()) the
 * shard is CEPH_SPG_NOSHARD.  Otherwise the shard is the index of the
 * acting primary within the acting set.
 *
 * Returns true if @spgid was filled in, false (leaving @spgid untouched)
 * if the acting primary does not appear in the acting set.
 */
bool ceph_pg_to_primary_shard(struct ceph_osdmap *osdmap,
			      struct ceph_pg_pool_info *pi,
			      const struct ceph_pg *raw_pgid,
			      struct ceph_spg *spgid)
{
	struct ceph_osds up, acting;
	struct ceph_pg pgid;
	int idx;

	WARN_ON(pi->id != raw_pgid->pool);
	raw_pg_to_pg(pi, raw_pgid, &pgid);

	if (!ceph_can_shift_osds(pi)) {
		ceph_pg_to_up_acting_osds(osdmap, pi, &pgid, &up, &acting);

		/* locate the primary's position in the acting set */
		for (idx = 0; idx < acting.size; idx++) {
			if (acting.osds[idx] == acting.primary)
				break;
		}
		if (idx >= acting.size)
			return false;

		spgid->pgid = pgid; /* struct */
		spgid->shard = idx;
		return true;
	}

	spgid->pgid = pgid; /* struct */
	spgid->shard = CEPH_SPG_NOSHARD;
	return true;
}
/* /*
* Return acting primary for given PG, or -1 if none. * Return acting primary for given PG, or -1 if none.
*/ */
int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap, int ceph_pg_to_acting_primary(struct ceph_osdmap *osdmap,
const struct ceph_pg *raw_pgid) const struct ceph_pg *raw_pgid)
{ {
struct ceph_pg_pool_info *pi;
struct ceph_osds up, acting; struct ceph_osds up, acting;
ceph_pg_to_up_acting_osds(osdmap, raw_pgid, &up, &acting); pi = ceph_pg_pool_by_id(osdmap, raw_pgid->pool);
if (!pi)
return -1;
ceph_pg_to_up_acting_osds(osdmap, pi, raw_pgid, &up, &acting);
return acting.primary; return acting.primary;
} }
EXPORT_SYMBOL(ceph_pg_to_acting_primary); EXPORT_SYMBOL(ceph_pg_to_acting_primary);
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment