Commit d5a38f6e authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client

Pull Ceph updates from Sage Weil:
 "There is quite a bit here, including some overdue refactoring and
  cleanup on the mon_client and osd_client code from Ilya, scattered
  writeback support for CephFS and a pile of bug fixes from Zheng, and a
  few random cleanups and fixes from others"

[ I already decided not to pull this because of it having been rebased
  recently, but ended up changing my mind after all.  Next time I'll
  really hold people to it.  Oh well.   - Linus ]

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/sage/ceph-client: (34 commits)
  libceph: use KMEM_CACHE macro
  ceph: use kmem_cache_zalloc
  rbd: use KMEM_CACHE macro
  ceph: use lookup request to revalidate dentry
  ceph: kill ceph_get_dentry_parent_inode()
  ceph: fix security xattr deadlock
  ceph: don't request vxattrs from MDS
  ceph: fix mounting same fs multiple times
  ceph: remove unnecessary NULL check
  ceph: avoid updating directory inode's i_size accidentally
  ceph: fix race during filling readdir cache
  libceph: use sizeof_footer() more
  ceph: kill ceph_empty_snapc
  ceph: fix a wrong comparison
  ceph: replace CURRENT_TIME by current_fs_time()
  ceph: scattered page writeback
  libceph: add helper that duplicates last extent operation
  libceph: enable large, variable-sized OSD requests
  libceph: osdc->req_mempool should be backed by a slab pool
  libceph: make r_request msg_size calculation clearer
  ...
parents 698f415c 5ee61e95
...@@ -1847,14 +1847,12 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req, ...@@ -1847,14 +1847,12 @@ static void rbd_osd_req_callback(struct ceph_osd_request *osd_req,
if (osd_req->r_result < 0) if (osd_req->r_result < 0)
obj_request->result = osd_req->r_result; obj_request->result = osd_req->r_result;
rbd_assert(osd_req->r_num_ops <= CEPH_OSD_MAX_OP);
/* /*
* We support a 64-bit length, but ultimately it has to be * We support a 64-bit length, but ultimately it has to be
* passed to the block layer, which just supports a 32-bit * passed to the block layer, which just supports a 32-bit
* length field. * length field.
*/ */
obj_request->xferred = osd_req->r_reply_op_len[0]; obj_request->xferred = osd_req->r_ops[0].outdata_len;
rbd_assert(obj_request->xferred < (u64)UINT_MAX); rbd_assert(obj_request->xferred < (u64)UINT_MAX);
opcode = osd_req->r_ops[0].op; opcode = osd_req->r_ops[0].op;
...@@ -5643,18 +5641,12 @@ static void rbd_sysfs_cleanup(void) ...@@ -5643,18 +5641,12 @@ static void rbd_sysfs_cleanup(void)
static int rbd_slab_init(void) static int rbd_slab_init(void)
{ {
rbd_assert(!rbd_img_request_cache); rbd_assert(!rbd_img_request_cache);
rbd_img_request_cache = kmem_cache_create("rbd_img_request", rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
sizeof (struct rbd_img_request),
__alignof__(struct rbd_img_request),
0, NULL);
if (!rbd_img_request_cache) if (!rbd_img_request_cache)
return -ENOMEM; return -ENOMEM;
rbd_assert(!rbd_obj_request_cache); rbd_assert(!rbd_obj_request_cache);
rbd_obj_request_cache = kmem_cache_create("rbd_obj_request", rbd_obj_request_cache = KMEM_CACHE(rbd_obj_request, 0);
sizeof (struct rbd_obj_request),
__alignof__(struct rbd_obj_request),
0, NULL);
if (!rbd_obj_request_cache) if (!rbd_obj_request_cache)
goto out_err; goto out_err;
......
...@@ -175,8 +175,8 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset, ...@@ -175,8 +175,8 @@ static void ceph_invalidatepage(struct page *page, unsigned int offset,
static int ceph_releasepage(struct page *page, gfp_t g) static int ceph_releasepage(struct page *page, gfp_t g)
{ {
struct inode *inode = page->mapping ? page->mapping->host : NULL; dout("%p releasepage %p idx %lu\n", page->mapping->host,
dout("%p releasepage %p idx %lu\n", inode, page, page->index); page, page->index);
WARN_ON(PageDirty(page)); WARN_ON(PageDirty(page));
/* Can we release the page from the cache? */ /* Can we release the page from the cache? */
...@@ -276,7 +276,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg) ...@@ -276,7 +276,7 @@ static void finish_read(struct ceph_osd_request *req, struct ceph_msg *msg)
for (i = 0; i < num_pages; i++) { for (i = 0; i < num_pages; i++) {
struct page *page = osd_data->pages[i]; struct page *page = osd_data->pages[i];
if (rc < 0 && rc != ENOENT) if (rc < 0 && rc != -ENOENT)
goto unlock; goto unlock;
if (bytes < (int)PAGE_CACHE_SIZE) { if (bytes < (int)PAGE_CACHE_SIZE) {
/* zero (remainder of) page */ /* zero (remainder of) page */
...@@ -606,71 +606,71 @@ static void writepages_finish(struct ceph_osd_request *req, ...@@ -606,71 +606,71 @@ static void writepages_finish(struct ceph_osd_request *req,
struct inode *inode = req->r_inode; struct inode *inode = req->r_inode;
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
struct ceph_osd_data *osd_data; struct ceph_osd_data *osd_data;
unsigned wrote;
struct page *page; struct page *page;
int num_pages; int num_pages, total_pages = 0;
int i; int i, j;
int rc = req->r_result;
struct ceph_snap_context *snapc = req->r_snapc; struct ceph_snap_context *snapc = req->r_snapc;
struct address_space *mapping = inode->i_mapping; struct address_space *mapping = inode->i_mapping;
int rc = req->r_result;
u64 bytes = req->r_ops[0].extent.length;
struct ceph_fs_client *fsc = ceph_inode_to_client(inode); struct ceph_fs_client *fsc = ceph_inode_to_client(inode);
long writeback_stat; bool remove_page;
unsigned issued = ceph_caps_issued(ci);
dout("writepages_finish %p rc %d\n", inode, rc);
if (rc < 0)
mapping_set_error(mapping, rc);
osd_data = osd_req_op_extent_osd_data(req, 0);
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
num_pages = calc_pages_for((u64)osd_data->alignment,
(u64)osd_data->length);
if (rc >= 0) {
/* /*
* Assume we wrote the pages we originally sent. The * We lost the cache cap, need to truncate the page before
* osd might reply with fewer pages if our writeback * it is unlocked, otherwise we'd truncate it later in the
* raced with a truncation and was adjusted at the osd, * page truncation thread, possibly losing some data that
* so don't believe the reply. * raced its way in
*/ */
wrote = num_pages; remove_page = !(ceph_caps_issued(ci) &
} else { (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
wrote = 0;
mapping_set_error(mapping, rc);
}
dout("writepages_finish %p rc %d bytes %llu wrote %d (pages)\n",
inode, rc, bytes, wrote);
/* clean all pages */ /* clean all pages */
for (i = 0; i < num_pages; i++) { for (i = 0; i < req->r_num_ops; i++) {
page = osd_data->pages[i]; if (req->r_ops[i].op != CEPH_OSD_OP_WRITE)
break;
osd_data = osd_req_op_extent_osd_data(req, i);
BUG_ON(osd_data->type != CEPH_OSD_DATA_TYPE_PAGES);
num_pages = calc_pages_for((u64)osd_data->alignment,
(u64)osd_data->length);
total_pages += num_pages;
for (j = 0; j < num_pages; j++) {
page = osd_data->pages[j];
BUG_ON(!page); BUG_ON(!page);
WARN_ON(!PageUptodate(page)); WARN_ON(!PageUptodate(page));
writeback_stat = if (atomic_long_dec_return(&fsc->writeback_count) <
atomic_long_dec_return(&fsc->writeback_count); CONGESTION_OFF_THRESH(
if (writeback_stat < fsc->mount_options->congestion_kb))
CONGESTION_OFF_THRESH(fsc->mount_options->congestion_kb))
clear_bdi_congested(&fsc->backing_dev_info, clear_bdi_congested(&fsc->backing_dev_info,
BLK_RW_ASYNC); BLK_RW_ASYNC);
ceph_put_snap_context(page_snap_context(page)); ceph_put_snap_context(page_snap_context(page));
page->private = 0; page->private = 0;
ClearPagePrivate(page); ClearPagePrivate(page);
dout("unlocking %d %p\n", i, page); dout("unlocking %p\n", page);
end_page_writeback(page); end_page_writeback(page);
/* if (remove_page)
* We lost the cache cap, need to truncate the page before generic_error_remove_page(inode->i_mapping,
* it is unlocked, otherwise we'd truncate it later in the page);
* page truncation thread, possibly losing some data that
* raced its way in
*/
if ((issued & (CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO)) == 0)
generic_error_remove_page(inode->i_mapping, page);
unlock_page(page); unlock_page(page);
} }
dout("%p wrote+cleaned %d pages\n", inode, wrote); dout("writepages_finish %p wrote %llu bytes cleaned %d pages\n",
ceph_put_wrbuffer_cap_refs(ci, num_pages, snapc); inode, osd_data->length, rc >= 0 ? num_pages : 0);
ceph_release_pages(osd_data->pages, num_pages); ceph_release_pages(osd_data->pages, num_pages);
}
ceph_put_wrbuffer_cap_refs(ci, total_pages, snapc);
osd_data = osd_req_op_extent_osd_data(req, 0);
if (osd_data->pages_from_pool) if (osd_data->pages_from_pool)
mempool_free(osd_data->pages, mempool_free(osd_data->pages,
ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool); ceph_sb_to_client(inode->i_sb)->wb_pagevec_pool);
...@@ -778,17 +778,15 @@ static int ceph_writepages_start(struct address_space *mapping, ...@@ -778,17 +778,15 @@ static int ceph_writepages_start(struct address_space *mapping,
while (!done && index <= end) { while (!done && index <= end) {
unsigned i; unsigned i;
int first; int first;
pgoff_t next; pgoff_t strip_unit_end = 0;
int pvec_pages, locked_pages; int num_ops = 0, op_idx;
struct page **pages = NULL; int pvec_pages, locked_pages = 0;
struct page **pages = NULL, **data_pages;
mempool_t *pool = NULL; /* Becomes non-null if mempool used */ mempool_t *pool = NULL; /* Becomes non-null if mempool used */
struct page *page; struct page *page;
int want; int want;
u64 offset, len; u64 offset = 0, len = 0;
long writeback_stat;
next = 0;
locked_pages = 0;
max_pages = max_pages_ever; max_pages = max_pages_ever;
get_more_pages: get_more_pages:
...@@ -824,8 +822,8 @@ static int ceph_writepages_start(struct address_space *mapping, ...@@ -824,8 +822,8 @@ static int ceph_writepages_start(struct address_space *mapping,
unlock_page(page); unlock_page(page);
break; break;
} }
if (next && (page->index != next)) { if (strip_unit_end && (page->index > strip_unit_end)) {
dout("not consecutive %p\n", page); dout("end of strip unit %p\n", page);
unlock_page(page); unlock_page(page);
break; break;
} }
...@@ -867,36 +865,31 @@ static int ceph_writepages_start(struct address_space *mapping, ...@@ -867,36 +865,31 @@ static int ceph_writepages_start(struct address_space *mapping,
/* /*
* We have something to write. If this is * We have something to write. If this is
* the first locked page this time through, * the first locked page this time through,
* allocate an osd request and a page array * calculate max possible write size and
* that it will use. * allocate a page array
*/ */
if (locked_pages == 0) { if (locked_pages == 0) {
BUG_ON(pages); u64 objnum;
u64 objoff;
/* prepare async write request */ /* prepare async write request */
offset = (u64)page_offset(page); offset = (u64)page_offset(page);
len = wsize; len = wsize;
req = ceph_osdc_new_request(&fsc->client->osdc,
&ci->i_layout, vino, rc = ceph_calc_file_object_mapping(&ci->i_layout,
offset, &len, 0, offset, len,
do_sync ? 2 : 1, &objnum, &objoff,
CEPH_OSD_OP_WRITE, &len);
CEPH_OSD_FLAG_WRITE | if (rc < 0) {
CEPH_OSD_FLAG_ONDISK,
snapc, truncate_seq,
truncate_size, true);
if (IS_ERR(req)) {
rc = PTR_ERR(req);
unlock_page(page); unlock_page(page);
break; break;
} }
if (do_sync) num_ops = 1 + do_sync;
osd_req_op_init(req, 1, strip_unit_end = page->index +
CEPH_OSD_OP_STARTSYNC, 0); ((len - 1) >> PAGE_CACHE_SHIFT);
req->r_callback = writepages_finish;
req->r_inode = inode;
BUG_ON(pages);
max_pages = calc_pages_for(0, (u64)len); max_pages = calc_pages_for(0, (u64)len);
pages = kmalloc(max_pages * sizeof (*pages), pages = kmalloc(max_pages * sizeof (*pages),
GFP_NOFS); GFP_NOFS);
...@@ -905,6 +898,20 @@ static int ceph_writepages_start(struct address_space *mapping, ...@@ -905,6 +898,20 @@ static int ceph_writepages_start(struct address_space *mapping,
pages = mempool_alloc(pool, GFP_NOFS); pages = mempool_alloc(pool, GFP_NOFS);
BUG_ON(!pages); BUG_ON(!pages);
} }
len = 0;
} else if (page->index !=
(offset + len) >> PAGE_CACHE_SHIFT) {
if (num_ops >= (pool ? CEPH_OSD_SLAB_OPS :
CEPH_OSD_MAX_OPS)) {
redirty_page_for_writepage(wbc, page);
unlock_page(page);
break;
}
num_ops++;
offset = (u64)page_offset(page);
len = 0;
} }
/* note position of first page in pvec */ /* note position of first page in pvec */
...@@ -913,18 +920,16 @@ static int ceph_writepages_start(struct address_space *mapping, ...@@ -913,18 +920,16 @@ static int ceph_writepages_start(struct address_space *mapping,
dout("%p will write page %p idx %lu\n", dout("%p will write page %p idx %lu\n",
inode, page, page->index); inode, page, page->index);
writeback_stat = if (atomic_long_inc_return(&fsc->writeback_count) >
atomic_long_inc_return(&fsc->writeback_count); CONGESTION_ON_THRESH(
if (writeback_stat > CONGESTION_ON_THRESH(
fsc->mount_options->congestion_kb)) { fsc->mount_options->congestion_kb)) {
set_bdi_congested(&fsc->backing_dev_info, set_bdi_congested(&fsc->backing_dev_info,
BLK_RW_ASYNC); BLK_RW_ASYNC);
} }
set_page_writeback(page);
pages[locked_pages] = page; pages[locked_pages] = page;
locked_pages++; locked_pages++;
next = page->index + 1; len += PAGE_CACHE_SIZE;
} }
/* did we get anything? */ /* did we get anything? */
...@@ -944,38 +949,119 @@ static int ceph_writepages_start(struct address_space *mapping, ...@@ -944,38 +949,119 @@ static int ceph_writepages_start(struct address_space *mapping,
/* shift unused pages over in the pvec... we /* shift unused pages over in the pvec... we
* will need to release them below. */ * will need to release them below. */
for (j = i; j < pvec_pages; j++) { for (j = i; j < pvec_pages; j++) {
dout(" pvec leftover page %p\n", dout(" pvec leftover page %p\n", pvec.pages[j]);
pvec.pages[j]);
pvec.pages[j-i+first] = pvec.pages[j]; pvec.pages[j-i+first] = pvec.pages[j];
} }
pvec.nr -= i-first; pvec.nr -= i-first;
} }
/* Format the osd request message and submit the write */ new_request:
offset = page_offset(pages[0]); offset = page_offset(pages[0]);
len = (u64)locked_pages << PAGE_CACHE_SHIFT; len = wsize;
if (snap_size == -1) {
len = min(len, (u64)i_size_read(inode) - offset); req = ceph_osdc_new_request(&fsc->client->osdc,
&ci->i_layout, vino,
offset, &len, 0, num_ops,
CEPH_OSD_OP_WRITE,
CEPH_OSD_FLAG_WRITE |
CEPH_OSD_FLAG_ONDISK,
snapc, truncate_seq,
truncate_size, false);
if (IS_ERR(req)) {
req = ceph_osdc_new_request(&fsc->client->osdc,
&ci->i_layout, vino,
offset, &len, 0,
min(num_ops,
CEPH_OSD_SLAB_OPS),
CEPH_OSD_OP_WRITE,
CEPH_OSD_FLAG_WRITE |
CEPH_OSD_FLAG_ONDISK,
snapc, truncate_seq,
truncate_size, true);
BUG_ON(IS_ERR(req));
}
BUG_ON(len < page_offset(pages[locked_pages - 1]) +
PAGE_CACHE_SIZE - offset);
req->r_callback = writepages_finish;
req->r_inode = inode;
/* Format the osd request message and submit the write */
len = 0;
data_pages = pages;
op_idx = 0;
for (i = 0; i < locked_pages; i++) {
u64 cur_offset = page_offset(pages[i]);
if (offset + len != cur_offset) {
if (op_idx + do_sync + 1 == req->r_num_ops)
break;
osd_req_op_extent_dup_last(req, op_idx,
cur_offset - offset);
dout("writepages got pages at %llu~%llu\n",
offset, len);
osd_req_op_extent_osd_data_pages(req, op_idx,
data_pages, len, 0,
!!pool, false);
osd_req_op_extent_update(req, op_idx, len);
len = 0;
offset = cur_offset;
data_pages = pages + i;
op_idx++;
}
set_page_writeback(pages[i]);
len += PAGE_CACHE_SIZE;
}
if (snap_size != -1) {
len = min(len, snap_size - offset);
} else if (i == locked_pages) {
/* writepages_finish() clears writeback pages /* writepages_finish() clears writeback pages
* according to the data length, so make sure * according to the data length, so make sure
* data length covers all locked pages */ * data length covers all locked pages */
len = max(len, 1 + u64 min_len = len + 1 - PAGE_CACHE_SIZE;
((u64)(locked_pages - 1) << PAGE_CACHE_SHIFT)); len = min(len, (u64)i_size_read(inode) - offset);
} else { len = max(len, min_len);
len = min(len, snap_size - offset);
} }
dout("writepages got %d pages at %llu~%llu\n", dout("writepages got pages at %llu~%llu\n", offset, len);
locked_pages, offset, len);
osd_req_op_extent_osd_data_pages(req, 0, pages, len, 0,
!!pool, false);
pages = NULL; /* request message now owns the pages array */ osd_req_op_extent_osd_data_pages(req, op_idx, data_pages, len,
pool = NULL; 0, !!pool, false);
osd_req_op_extent_update(req, op_idx, len);
/* Update the write op length in case we changed it */ if (do_sync) {
op_idx++;
osd_req_op_init(req, op_idx, CEPH_OSD_OP_STARTSYNC, 0);
}
BUG_ON(op_idx + 1 != req->r_num_ops);
osd_req_op_extent_update(req, 0, len); pool = NULL;
if (i < locked_pages) {
BUG_ON(num_ops <= req->r_num_ops);
num_ops -= req->r_num_ops;
num_ops += do_sync;
locked_pages -= i;
/* allocate new pages array for next request */
data_pages = pages;
pages = kmalloc(locked_pages * sizeof (*pages),
GFP_NOFS);
if (!pages) {
pool = fsc->wb_pagevec_pool;
pages = mempool_alloc(pool, GFP_NOFS);
BUG_ON(!pages);
}
memcpy(pages, data_pages + i,
locked_pages * sizeof(*pages));
memset(data_pages + i, 0,
locked_pages * sizeof(*pages));
} else {
BUG_ON(num_ops != req->r_num_ops);
index = pages[i - 1]->index + 1;
/* request message now owns the pages array */
pages = NULL;
}
vino = ceph_vino(inode); vino = ceph_vino(inode);
ceph_osdc_build_request(req, offset, snapc, vino.snap, ceph_osdc_build_request(req, offset, snapc, vino.snap,
...@@ -985,9 +1071,10 @@ static int ceph_writepages_start(struct address_space *mapping, ...@@ -985,9 +1071,10 @@ static int ceph_writepages_start(struct address_space *mapping,
BUG_ON(rc); BUG_ON(rc);
req = NULL; req = NULL;
/* continue? */ wbc->nr_to_write -= i;
index = next; if (pages)
wbc->nr_to_write -= locked_pages; goto new_request;
if (wbc->nr_to_write <= 0) if (wbc->nr_to_write <= 0)
done = 1; done = 1;
...@@ -1522,7 +1609,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) ...@@ -1522,7 +1609,7 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
ceph_vino(inode), 0, &len, 0, 1, ceph_vino(inode), 0, &len, 0, 1,
CEPH_OSD_OP_CREATE, CEPH_OSD_OP_CREATE,
CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
ceph_empty_snapc, 0, 0, false); NULL, 0, 0, false);
if (IS_ERR(req)) { if (IS_ERR(req)) {
err = PTR_ERR(req); err = PTR_ERR(req);
goto out; goto out;
...@@ -1540,9 +1627,8 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page) ...@@ -1540,9 +1627,8 @@ int ceph_uninline_data(struct file *filp, struct page *locked_page)
ceph_vino(inode), 0, &len, 1, 3, ceph_vino(inode), 0, &len, 1, 3,
CEPH_OSD_OP_WRITE, CEPH_OSD_OP_WRITE,
CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE, CEPH_OSD_FLAG_ONDISK | CEPH_OSD_FLAG_WRITE,
ceph_empty_snapc, NULL, ci->i_truncate_seq,
ci->i_truncate_seq, ci->i_truncate_size, ci->i_truncate_size, false);
false);
if (IS_ERR(req)) { if (IS_ERR(req)) {
err = PTR_ERR(req); err = PTR_ERR(req);
goto out; goto out;
...@@ -1663,8 +1749,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) ...@@ -1663,8 +1749,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
goto out; goto out;
} }
rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, rd_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
ceph_empty_snapc,
1, false, GFP_NOFS); 1, false, GFP_NOFS);
if (!rd_req) { if (!rd_req) {
err = -ENOMEM; err = -ENOMEM;
...@@ -1678,8 +1763,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool) ...@@ -1678,8 +1763,7 @@ static int __ceph_pool_perm_get(struct ceph_inode_info *ci, u32 pool)
"%llx.00000000", ci->i_vino.ino); "%llx.00000000", ci->i_vino.ino);
rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name); rd_req->r_base_oid.name_len = strlen(rd_req->r_base_oid.name);
wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, wr_req = ceph_osdc_alloc_request(&fsc->client->osdc, NULL,
ceph_empty_snapc,
1, false, GFP_NOFS); 1, false, GFP_NOFS);
if (!wr_req) { if (!wr_req) {
err = -ENOMEM; err = -ENOMEM;
......
...@@ -991,7 +991,7 @@ static int send_cap_msg(struct ceph_mds_session *session, ...@@ -991,7 +991,7 @@ static int send_cap_msg(struct ceph_mds_session *session,
u32 seq, u64 flush_tid, u64 oldest_flush_tid, u32 seq, u64 flush_tid, u64 oldest_flush_tid,
u32 issue_seq, u32 mseq, u64 size, u64 max_size, u32 issue_seq, u32 mseq, u64 size, u64 max_size,
struct timespec *mtime, struct timespec *atime, struct timespec *mtime, struct timespec *atime,
u64 time_warp_seq, struct timespec *ctime, u64 time_warp_seq,
kuid_t uid, kgid_t gid, umode_t mode, kuid_t uid, kgid_t gid, umode_t mode,
u64 xattr_version, u64 xattr_version,
struct ceph_buffer *xattrs_buf, struct ceph_buffer *xattrs_buf,
...@@ -1042,6 +1042,8 @@ static int send_cap_msg(struct ceph_mds_session *session, ...@@ -1042,6 +1042,8 @@ static int send_cap_msg(struct ceph_mds_session *session,
ceph_encode_timespec(&fc->mtime, mtime); ceph_encode_timespec(&fc->mtime, mtime);
if (atime) if (atime)
ceph_encode_timespec(&fc->atime, atime); ceph_encode_timespec(&fc->atime, atime);
if (ctime)
ceph_encode_timespec(&fc->ctime, ctime);
fc->time_warp_seq = cpu_to_le32(time_warp_seq); fc->time_warp_seq = cpu_to_le32(time_warp_seq);
fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid)); fc->uid = cpu_to_le32(from_kuid(&init_user_ns, uid));
...@@ -1116,7 +1118,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, ...@@ -1116,7 +1118,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
int held, revoking, dropping, keep; int held, revoking, dropping, keep;
u64 seq, issue_seq, mseq, time_warp_seq, follows; u64 seq, issue_seq, mseq, time_warp_seq, follows;
u64 size, max_size; u64 size, max_size;
struct timespec mtime, atime; struct timespec mtime, atime, ctime;
int wake = 0; int wake = 0;
umode_t mode; umode_t mode;
kuid_t uid; kuid_t uid;
...@@ -1180,6 +1182,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, ...@@ -1180,6 +1182,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
ci->i_requested_max_size = max_size; ci->i_requested_max_size = max_size;
mtime = inode->i_mtime; mtime = inode->i_mtime;
atime = inode->i_atime; atime = inode->i_atime;
ctime = inode->i_ctime;
time_warp_seq = ci->i_time_warp_seq; time_warp_seq = ci->i_time_warp_seq;
uid = inode->i_uid; uid = inode->i_uid;
gid = inode->i_gid; gid = inode->i_gid;
...@@ -1198,7 +1201,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap, ...@@ -1198,7 +1201,7 @@ static int __send_cap(struct ceph_mds_client *mdsc, struct ceph_cap *cap,
ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id, ret = send_cap_msg(session, ceph_vino(inode).ino, cap_id,
op, keep, want, flushing, seq, op, keep, want, flushing, seq,
flush_tid, oldest_flush_tid, issue_seq, mseq, flush_tid, oldest_flush_tid, issue_seq, mseq,
size, max_size, &mtime, &atime, time_warp_seq, size, max_size, &mtime, &atime, &ctime, time_warp_seq,
uid, gid, mode, xattr_version, xattr_blob, uid, gid, mode, xattr_version, xattr_blob,
follows, inline_data); follows, inline_data);
if (ret < 0) { if (ret < 0) {
...@@ -1320,7 +1323,7 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci, ...@@ -1320,7 +1323,7 @@ void __ceph_flush_snaps(struct ceph_inode_info *ci,
capsnap->dirty, 0, capsnap->flush_tid, 0, capsnap->dirty, 0, capsnap->flush_tid, 0,
0, mseq, capsnap->size, 0, 0, mseq, capsnap->size, 0,
&capsnap->mtime, &capsnap->atime, &capsnap->mtime, &capsnap->atime,
capsnap->time_warp_seq, &capsnap->ctime, capsnap->time_warp_seq,
capsnap->uid, capsnap->gid, capsnap->mode, capsnap->uid, capsnap->gid, capsnap->mode,
capsnap->xattr_version, capsnap->xattr_blob, capsnap->xattr_version, capsnap->xattr_blob,
capsnap->follows, capsnap->inline_data); capsnap->follows, capsnap->inline_data);
......
...@@ -38,7 +38,7 @@ int ceph_init_dentry(struct dentry *dentry) ...@@ -38,7 +38,7 @@ int ceph_init_dentry(struct dentry *dentry)
if (dentry->d_fsdata) if (dentry->d_fsdata)
return 0; return 0;
di = kmem_cache_alloc(ceph_dentry_cachep, GFP_KERNEL | __GFP_ZERO); di = kmem_cache_zalloc(ceph_dentry_cachep, GFP_KERNEL);
if (!di) if (!di)
return -ENOMEM; /* oh well */ return -ENOMEM; /* oh well */
...@@ -68,23 +68,6 @@ int ceph_init_dentry(struct dentry *dentry) ...@@ -68,23 +68,6 @@ int ceph_init_dentry(struct dentry *dentry)
return 0; return 0;
} }
/*
 * Return the inode of @dentry's parent with an extra reference taken
 * via ihold(), or NULL if @dentry is NULL or is a root dentry
 * (IS_ROOT).  d_parent is read while holding dentry->d_lock.
 *
 * A non-NULL result carries a reference the caller is expected to
 * release; the ceph_d_revalidate() caller in this diff does so with
 * iput().
 */
struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry)
{
struct inode *inode = NULL;
if (!dentry)
return NULL;
spin_lock(&dentry->d_lock);
/* a root dentry has no distinct parent, so leave inode NULL */
if (!IS_ROOT(dentry)) {
inode = d_inode(dentry->d_parent);
ihold(inode);
}
spin_unlock(&dentry->d_lock);
return inode;
}
/* /*
* for readdir, we encode the directory frag and offset within that * for readdir, we encode the directory frag and offset within that
* frag into f_pos. * frag into f_pos.
...@@ -624,6 +607,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, ...@@ -624,6 +607,7 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
struct ceph_mds_client *mdsc = fsc->mdsc; struct ceph_mds_client *mdsc = fsc->mdsc;
struct ceph_mds_request *req; struct ceph_mds_request *req;
int op; int op;
int mask;
int err; int err;
dout("lookup %p dentry %p '%pd'\n", dout("lookup %p dentry %p '%pd'\n",
...@@ -666,8 +650,12 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry, ...@@ -666,8 +650,12 @@ static struct dentry *ceph_lookup(struct inode *dir, struct dentry *dentry,
return ERR_CAST(req); return ERR_CAST(req);
req->r_dentry = dget(dentry); req->r_dentry = dget(dentry);
req->r_num_caps = 2; req->r_num_caps = 2;
/* we only need inode linkage */
req->r_args.getattr.mask = cpu_to_le32(CEPH_STAT_CAP_INODE); mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
if (ceph_security_xattr_wanted(dir))
mask |= CEPH_CAP_XATTR_SHARED;
req->r_args.getattr.mask = cpu_to_le32(mask);
req->r_locked_dir = dir; req->r_locked_dir = dir;
err = ceph_mdsc_do_request(mdsc, NULL, req); err = ceph_mdsc_do_request(mdsc, NULL, req);
err = ceph_handle_snapdir(req, dentry, err); err = ceph_handle_snapdir(req, dentry, err);
...@@ -1095,6 +1083,7 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry) ...@@ -1095,6 +1083,7 @@ static int dir_lease_is_valid(struct inode *dir, struct dentry *dentry)
static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
{ {
int valid = 0; int valid = 0;
struct dentry *parent;
struct inode *dir; struct inode *dir;
if (flags & LOOKUP_RCU) if (flags & LOOKUP_RCU)
...@@ -1103,7 +1092,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) ...@@ -1103,7 +1092,8 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry, dout("d_revalidate %p '%pd' inode %p offset %lld\n", dentry,
dentry, d_inode(dentry), ceph_dentry(dentry)->offset); dentry, d_inode(dentry), ceph_dentry(dentry)->offset);
dir = ceph_get_dentry_parent_inode(dentry); parent = dget_parent(dentry);
dir = d_inode(parent);
/* always trust cached snapped dentries, snapdir dentry */ /* always trust cached snapped dentries, snapdir dentry */
if (ceph_snap(dir) != CEPH_NOSNAP) { if (ceph_snap(dir) != CEPH_NOSNAP) {
...@@ -1121,13 +1111,48 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags) ...@@ -1121,13 +1111,48 @@ static int ceph_d_revalidate(struct dentry *dentry, unsigned int flags)
valid = 1; valid = 1;
} }
if (!valid) {
struct ceph_mds_client *mdsc =
ceph_sb_to_client(dir->i_sb)->mdsc;
struct ceph_mds_request *req;
int op, mask, err;
op = ceph_snap(dir) == CEPH_SNAPDIR ?
CEPH_MDS_OP_LOOKUPSNAP : CEPH_MDS_OP_LOOKUP;
req = ceph_mdsc_create_request(mdsc, op, USE_ANY_MDS);
if (!IS_ERR(req)) {
req->r_dentry = dget(dentry);
req->r_num_caps = 2;
mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
if (ceph_security_xattr_wanted(dir))
mask |= CEPH_CAP_XATTR_SHARED;
req->r_args.getattr.mask = mask;
req->r_locked_dir = dir;
err = ceph_mdsc_do_request(mdsc, NULL, req);
if (err == 0 || err == -ENOENT) {
if (dentry == req->r_dentry) {
valid = !d_unhashed(dentry);
} else {
d_invalidate(req->r_dentry);
err = -EAGAIN;
}
}
ceph_mdsc_put_request(req);
dout("d_revalidate %p lookup result=%d\n",
dentry, err);
}
}
dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid"); dout("d_revalidate %p %s\n", dentry, valid ? "valid" : "invalid");
if (valid) { if (valid) {
ceph_dentry_lru_touch(dentry); ceph_dentry_lru_touch(dentry);
} else { } else {
ceph_dir_clear_complete(dir); ceph_dir_clear_complete(dir);
} }
iput(dir);
dput(parent);
return valid; return valid;
} }
......
...@@ -71,12 +71,18 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino) ...@@ -71,12 +71,18 @@ static struct dentry *__fh_to_dentry(struct super_block *sb, u64 ino)
inode = ceph_find_inode(sb, vino); inode = ceph_find_inode(sb, vino);
if (!inode) { if (!inode) {
struct ceph_mds_request *req; struct ceph_mds_request *req;
int mask;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO, req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPINO,
USE_ANY_MDS); USE_ANY_MDS);
if (IS_ERR(req)) if (IS_ERR(req))
return ERR_CAST(req); return ERR_CAST(req);
mask = CEPH_STAT_CAP_INODE;
if (ceph_security_xattr_wanted(d_inode(sb->s_root)))
mask |= CEPH_CAP_XATTR_SHARED;
req->r_args.getattr.mask = cpu_to_le32(mask);
req->r_ino1 = vino; req->r_ino1 = vino;
req->r_num_caps = 1; req->r_num_caps = 1;
err = ceph_mdsc_do_request(mdsc, NULL, req); err = ceph_mdsc_do_request(mdsc, NULL, req);
...@@ -128,6 +134,7 @@ static struct dentry *__get_parent(struct super_block *sb, ...@@ -128,6 +134,7 @@ static struct dentry *__get_parent(struct super_block *sb,
struct ceph_mds_request *req; struct ceph_mds_request *req;
struct inode *inode; struct inode *inode;
struct dentry *dentry; struct dentry *dentry;
int mask;
int err; int err;
req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT, req = ceph_mdsc_create_request(mdsc, CEPH_MDS_OP_LOOKUPPARENT,
...@@ -144,6 +151,12 @@ static struct dentry *__get_parent(struct super_block *sb, ...@@ -144,6 +151,12 @@ static struct dentry *__get_parent(struct super_block *sb,
.snap = CEPH_NOSNAP, .snap = CEPH_NOSNAP,
}; };
} }
mask = CEPH_STAT_CAP_INODE;
if (ceph_security_xattr_wanted(d_inode(sb->s_root)))
mask |= CEPH_CAP_XATTR_SHARED;
req->r_args.getattr.mask = cpu_to_le32(mask);
req->r_num_caps = 1; req->r_num_caps = 1;
err = ceph_mdsc_do_request(mdsc, NULL, req); err = ceph_mdsc_do_request(mdsc, NULL, req);
inode = req->r_target_inode; inode = req->r_target_inode;
......
...@@ -157,7 +157,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode) ...@@ -157,7 +157,7 @@ static int ceph_init_file(struct inode *inode, struct file *file, int fmode)
case S_IFDIR: case S_IFDIR:
dout("init_file %p %p 0%o (regular)\n", inode, file, dout("init_file %p %p 0%o (regular)\n", inode, file,
inode->i_mode); inode->i_mode);
cf = kmem_cache_alloc(ceph_file_cachep, GFP_KERNEL | __GFP_ZERO); cf = kmem_cache_zalloc(ceph_file_cachep, GFP_KERNEL);
if (cf == NULL) { if (cf == NULL) {
ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */ ceph_put_fmode(ceph_inode(inode), fmode); /* clean up */
return -ENOMEM; return -ENOMEM;
...@@ -300,6 +300,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, ...@@ -300,6 +300,7 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
struct ceph_mds_request *req; struct ceph_mds_request *req;
struct dentry *dn; struct dentry *dn;
struct ceph_acls_info acls = {}; struct ceph_acls_info acls = {};
int mask;
int err; int err;
dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n", dout("atomic_open %p dentry %p '%pd' %s flags %d mode 0%o\n",
...@@ -335,6 +336,12 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry, ...@@ -335,6 +336,12 @@ int ceph_atomic_open(struct inode *dir, struct dentry *dentry,
acls.pagelist = NULL; acls.pagelist = NULL;
} }
} }
mask = CEPH_STAT_CAP_INODE | CEPH_CAP_AUTH_SHARED;
if (ceph_security_xattr_wanted(dir))
mask |= CEPH_CAP_XATTR_SHARED;
req->r_args.open.mask = cpu_to_le32(mask);
req->r_locked_dir = dir; /* caller holds dir->i_mutex */ req->r_locked_dir = dir; /* caller holds dir->i_mutex */
err = ceph_mdsc_do_request(mdsc, err = ceph_mdsc_do_request(mdsc,
(flags & (O_CREAT|O_TRUNC)) ? dir : NULL, (flags & (O_CREAT|O_TRUNC)) ? dir : NULL,
...@@ -725,7 +732,6 @@ static void ceph_aio_retry_work(struct work_struct *work) ...@@ -725,7 +732,6 @@ static void ceph_aio_retry_work(struct work_struct *work)
ret = ceph_osdc_start_request(req->r_osdc, req, false); ret = ceph_osdc_start_request(req->r_osdc, req, false);
out: out:
if (ret < 0) { if (ret < 0) {
BUG_ON(ret == -EOLDSNAPC);
req->r_result = ret; req->r_result = ret;
ceph_aio_complete_req(req, NULL); ceph_aio_complete_req(req, NULL);
} }
...@@ -783,7 +789,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, ...@@ -783,7 +789,7 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
int num_pages = 0; int num_pages = 0;
int flags; int flags;
int ret; int ret;
struct timespec mtime = CURRENT_TIME; struct timespec mtime = current_fs_time(inode->i_sb);
size_t count = iov_iter_count(iter); size_t count = iov_iter_count(iter);
loff_t pos = iocb->ki_pos; loff_t pos = iocb->ki_pos;
bool write = iov_iter_rw(iter) == WRITE; bool write = iov_iter_rw(iter) == WRITE;
...@@ -949,7 +955,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter, ...@@ -949,7 +955,6 @@ ceph_direct_read_write(struct kiocb *iocb, struct iov_iter *iter,
ret = ceph_osdc_start_request(req->r_osdc, ret = ceph_osdc_start_request(req->r_osdc,
req, false); req, false);
if (ret < 0) { if (ret < 0) {
BUG_ON(ret == -EOLDSNAPC);
req->r_result = ret; req->r_result = ret;
ceph_aio_complete_req(req, NULL); ceph_aio_complete_req(req, NULL);
} }
...@@ -988,7 +993,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos, ...@@ -988,7 +993,7 @@ ceph_sync_write(struct kiocb *iocb, struct iov_iter *from, loff_t pos,
int flags; int flags;
int check_caps = 0; int check_caps = 0;
int ret; int ret;
struct timespec mtime = CURRENT_TIME; struct timespec mtime = current_fs_time(inode->i_sb);
size_t count = iov_iter_count(from); size_t count = iov_iter_count(from);
if (ceph_snap(file_inode(file)) != CEPH_NOSNAP) if (ceph_snap(file_inode(file)) != CEPH_NOSNAP)
......
...@@ -549,6 +549,10 @@ int ceph_fill_file_size(struct inode *inode, int issued, ...@@ -549,6 +549,10 @@ int ceph_fill_file_size(struct inode *inode, int issued,
if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 || if (ceph_seq_cmp(truncate_seq, ci->i_truncate_seq) > 0 ||
(truncate_seq == ci->i_truncate_seq && size > inode->i_size)) { (truncate_seq == ci->i_truncate_seq && size > inode->i_size)) {
dout("size %lld -> %llu\n", inode->i_size, size); dout("size %lld -> %llu\n", inode->i_size, size);
if (size > 0 && S_ISDIR(inode->i_mode)) {
pr_err("fill_file_size non-zero size for directory\n");
size = 0;
}
i_size_write(inode, size); i_size_write(inode, size);
inode->i_blocks = (size + (1<<9) - 1) >> 9; inode->i_blocks = (size + (1<<9) - 1) >> 9;
ci->i_reported_size = size; ci->i_reported_size = size;
...@@ -1261,6 +1265,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req, ...@@ -1261,6 +1265,7 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req,
dout(" %p links to %p %llx.%llx, not %llx.%llx\n", dout(" %p links to %p %llx.%llx, not %llx.%llx\n",
dn, d_inode(dn), ceph_vinop(d_inode(dn)), dn, d_inode(dn), ceph_vinop(d_inode(dn)),
ceph_vinop(in)); ceph_vinop(in));
d_invalidate(dn);
have_lease = false; have_lease = false;
} }
...@@ -1349,15 +1354,20 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn, ...@@ -1349,15 +1354,20 @@ static int fill_readdir_cache(struct inode *dir, struct dentry *dn,
if (!ctl->page || pgoff != page_index(ctl->page)) { if (!ctl->page || pgoff != page_index(ctl->page)) {
ceph_readdir_cache_release(ctl); ceph_readdir_cache_release(ctl);
if (idx == 0)
ctl->page = grab_cache_page(&dir->i_data, pgoff); ctl->page = grab_cache_page(&dir->i_data, pgoff);
else
ctl->page = find_lock_page(&dir->i_data, pgoff);
if (!ctl->page) { if (!ctl->page) {
ctl->index = -1; ctl->index = -1;
return -ENOMEM; return idx == 0 ? -ENOMEM : 0;
} }
/* reading/filling the cache are serialized by /* reading/filling the cache are serialized by
* i_mutex, no need to use page lock */ * i_mutex, no need to use page lock */
unlock_page(ctl->page); unlock_page(ctl->page);
ctl->dentries = kmap(ctl->page); ctl->dentries = kmap(ctl->page);
if (idx == 0)
memset(ctl->dentries, 0, PAGE_CACHE_SIZE);
} }
if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) && if (req->r_dir_release_cnt == atomic64_read(&ci->i_release_count) &&
...@@ -1380,7 +1390,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1380,7 +1390,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
struct qstr dname; struct qstr dname;
struct dentry *dn; struct dentry *dn;
struct inode *in; struct inode *in;
int err = 0, ret, i; int err = 0, skipped = 0, ret, i;
struct inode *snapdir = NULL; struct inode *snapdir = NULL;
struct ceph_mds_request_head *rhead = req->r_request->front.iov_base; struct ceph_mds_request_head *rhead = req->r_request->front.iov_base;
struct ceph_dentry_info *di; struct ceph_dentry_info *di;
...@@ -1492,7 +1502,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1492,7 +1502,17 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
} }
if (d_really_is_negative(dn)) { if (d_really_is_negative(dn)) {
struct dentry *realdn = splice_dentry(dn, in); struct dentry *realdn;
if (ceph_security_xattr_deadlock(in)) {
dout(" skip splicing dn %p to inode %p"
" (security xattr deadlock)\n", dn, in);
iput(in);
skipped++;
goto next_item;
}
realdn = splice_dentry(dn, in);
if (IS_ERR(realdn)) { if (IS_ERR(realdn)) {
err = PTR_ERR(realdn); err = PTR_ERR(realdn);
d_drop(dn); d_drop(dn);
...@@ -1509,7 +1529,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1509,7 +1529,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
req->r_session, req->r_session,
req->r_request_started); req->r_request_started);
if (err == 0 && cache_ctl.index >= 0) { if (err == 0 && skipped == 0 && cache_ctl.index >= 0) {
ret = fill_readdir_cache(d_inode(parent), dn, ret = fill_readdir_cache(d_inode(parent), dn,
&cache_ctl, req); &cache_ctl, req);
if (ret < 0) if (ret < 0)
...@@ -1520,7 +1540,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req, ...@@ -1520,7 +1540,7 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
dput(dn); dput(dn);
} }
out: out:
if (err == 0) { if (err == 0 && skipped == 0) {
req->r_did_prepopulate = true; req->r_did_prepopulate = true;
req->r_readdir_cache_idx = cache_ctl.index; req->r_readdir_cache_idx = cache_ctl.index;
} }
...@@ -1950,7 +1970,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr) ...@@ -1950,7 +1970,7 @@ int ceph_setattr(struct dentry *dentry, struct iattr *attr)
if (dirtied) { if (dirtied) {
inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied, inode_dirty_flags = __ceph_mark_dirty_caps(ci, dirtied,
&prealloc_cf); &prealloc_cf);
inode->i_ctime = CURRENT_TIME; inode->i_ctime = current_fs_time(inode->i_sb);
} }
release &= issued; release &= issued;
......
...@@ -1729,7 +1729,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode) ...@@ -1729,7 +1729,7 @@ ceph_mdsc_create_request(struct ceph_mds_client *mdsc, int op, int mode)
init_completion(&req->r_safe_completion); init_completion(&req->r_safe_completion);
INIT_LIST_HEAD(&req->r_unsafe_item); INIT_LIST_HEAD(&req->r_unsafe_item);
req->r_stamp = CURRENT_TIME; req->r_stamp = current_fs_time(mdsc->fsc->sb);
req->r_op = op; req->r_op = op;
req->r_direct_mode = mode; req->r_direct_mode = mode;
...@@ -2540,6 +2540,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) ...@@ -2540,6 +2540,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
/* insert trace into our cache */ /* insert trace into our cache */
mutex_lock(&req->r_fill_mutex); mutex_lock(&req->r_fill_mutex);
current->journal_info = req;
err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session); err = ceph_fill_trace(mdsc->fsc->sb, req, req->r_session);
if (err == 0) { if (err == 0) {
if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR || if (result == 0 && (req->r_op == CEPH_MDS_OP_READDIR ||
...@@ -2547,6 +2548,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg) ...@@ -2547,6 +2548,7 @@ static void handle_reply(struct ceph_mds_session *session, struct ceph_msg *msg)
ceph_readdir_prepopulate(req, req->r_session); ceph_readdir_prepopulate(req, req->r_session);
ceph_unreserve_caps(mdsc, &req->r_caps_reservation); ceph_unreserve_caps(mdsc, &req->r_caps_reservation);
} }
current->journal_info = NULL;
mutex_unlock(&req->r_fill_mutex); mutex_unlock(&req->r_fill_mutex);
up_read(&mdsc->snap_rwsem); up_read(&mdsc->snap_rwsem);
...@@ -3764,7 +3766,6 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) ...@@ -3764,7 +3766,6 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
dout("handle_map epoch %u len %d\n", epoch, (int)maplen); dout("handle_map epoch %u len %d\n", epoch, (int)maplen);
/* do we need it? */ /* do we need it? */
ceph_monc_got_mdsmap(&mdsc->fsc->client->monc, epoch);
mutex_lock(&mdsc->mutex); mutex_lock(&mdsc->mutex);
if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) { if (mdsc->mdsmap && epoch <= mdsc->mdsmap->m_epoch) {
dout("handle_map epoch %u <= our %u\n", dout("handle_map epoch %u <= our %u\n",
...@@ -3791,6 +3792,8 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg) ...@@ -3791,6 +3792,8 @@ void ceph_mdsc_handle_map(struct ceph_mds_client *mdsc, struct ceph_msg *msg)
mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size; mdsc->fsc->sb->s_maxbytes = mdsc->mdsmap->m_max_file_size;
__wake_requests(mdsc, &mdsc->waiting_for_map); __wake_requests(mdsc, &mdsc->waiting_for_map);
ceph_monc_got_map(&mdsc->fsc->client->monc, CEPH_SUB_MDSMAP,
mdsc->mdsmap->m_epoch);
mutex_unlock(&mdsc->mutex); mutex_unlock(&mdsc->mutex);
schedule_delayed(mdsc); schedule_delayed(mdsc);
......
...@@ -296,8 +296,6 @@ static int cmpu64_rev(const void *a, const void *b) ...@@ -296,8 +296,6 @@ static int cmpu64_rev(const void *a, const void *b)
} }
struct ceph_snap_context *ceph_empty_snapc;
/* /*
* build the snap context for a given realm. * build the snap context for a given realm.
*/ */
...@@ -987,17 +985,3 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc, ...@@ -987,17 +985,3 @@ void ceph_handle_snap(struct ceph_mds_client *mdsc,
up_write(&mdsc->snap_rwsem); up_write(&mdsc->snap_rwsem);
return; return;
} }
/*
 * Create the shared ceph_empty_snapc snap context.
 *
 * The context comes from ceph_create_snap_context(0, GFP_NOFS) and has its
 * seq set to 1.  Returns 0 on success, or -ENOMEM if the allocation fails
 * (ceph_empty_snapc is left NULL in that case).
 */
int __init ceph_snap_init(void)
{
	struct ceph_snap_context *snapc;

	snapc = ceph_create_snap_context(0, GFP_NOFS);
	ceph_empty_snapc = snapc;
	if (snapc == NULL)
		return -ENOMEM;

	snapc->seq = 1;
	return 0;
}
/*
 * Counterpart of ceph_snap_init(): hands ceph_empty_snapc to
 * ceph_put_snap_context().
 */
void ceph_snap_exit(void)
{
ceph_put_snap_context(ceph_empty_snapc);
}
...@@ -439,8 +439,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root) ...@@ -439,8 +439,8 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT) if (fsopt->flags & CEPH_MOUNT_OPT_DIRSTAT)
seq_puts(m, ",dirstat"); seq_puts(m, ",dirstat");
if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES) == 0) if ((fsopt->flags & CEPH_MOUNT_OPT_RBYTES))
seq_puts(m, ",norbytes"); seq_puts(m, ",rbytes");
if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR) if (fsopt->flags & CEPH_MOUNT_OPT_NOASYNCREADDIR)
seq_puts(m, ",noasyncreaddir"); seq_puts(m, ",noasyncreaddir");
if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0) if ((fsopt->flags & CEPH_MOUNT_OPT_DCACHE) == 0)
...@@ -530,7 +530,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt, ...@@ -530,7 +530,7 @@ static struct ceph_fs_client *create_fs_client(struct ceph_mount_options *fsopt,
goto fail; goto fail;
} }
fsc->client->extra_mon_dispatch = extra_mon_dispatch; fsc->client->extra_mon_dispatch = extra_mon_dispatch;
fsc->client->monc.want_mdsmap = 1; ceph_monc_want_map(&fsc->client->monc, CEPH_SUB_MDSMAP, 0, true);
fsc->mount_options = fsopt; fsc->mount_options = fsopt;
...@@ -793,9 +793,10 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, ...@@ -793,9 +793,10 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
struct dentry *root; struct dentry *root;
int first = 0; /* first vfsmount for this super_block */ int first = 0; /* first vfsmount for this super_block */
dout("mount start\n"); dout("mount start %p\n", fsc);
mutex_lock(&fsc->client->mount_mutex); mutex_lock(&fsc->client->mount_mutex);
if (!fsc->sb->s_root) {
err = __ceph_open_session(fsc->client, started); err = __ceph_open_session(fsc->client, started);
if (err < 0) if (err < 0)
goto out; goto out;
...@@ -806,9 +807,6 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, ...@@ -806,9 +807,6 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
err = PTR_ERR(root); err = PTR_ERR(root);
goto out; goto out;
} }
if (fsc->sb->s_root) {
dput(root);
} else {
fsc->sb->s_root = root; fsc->sb->s_root = root;
first = 1; first = 1;
...@@ -818,6 +816,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, ...@@ -818,6 +816,7 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
} }
if (path[0] == 0) { if (path[0] == 0) {
root = fsc->sb->s_root;
dget(root); dget(root);
} else { } else {
dout("mount opening base mountpoint\n"); dout("mount opening base mountpoint\n");
...@@ -833,16 +832,14 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc, ...@@ -833,16 +832,14 @@ static struct dentry *ceph_real_mount(struct ceph_fs_client *fsc,
mutex_unlock(&fsc->client->mount_mutex); mutex_unlock(&fsc->client->mount_mutex);
return root; return root;
out:
mutex_unlock(&fsc->client->mount_mutex);
return ERR_PTR(err);
fail: fail:
if (first) { if (first) {
dput(fsc->sb->s_root); dput(fsc->sb->s_root);
fsc->sb->s_root = NULL; fsc->sb->s_root = NULL;
} }
goto out; out:
mutex_unlock(&fsc->client->mount_mutex);
return ERR_PTR(err);
} }
static int ceph_set_super(struct super_block *s, void *data) static int ceph_set_super(struct super_block *s, void *data)
...@@ -1042,19 +1039,14 @@ static int __init init_ceph(void) ...@@ -1042,19 +1039,14 @@ static int __init init_ceph(void)
ceph_flock_init(); ceph_flock_init();
ceph_xattr_init(); ceph_xattr_init();
ret = ceph_snap_init();
if (ret)
goto out_xattr;
ret = register_filesystem(&ceph_fs_type); ret = register_filesystem(&ceph_fs_type);
if (ret) if (ret)
goto out_snap; goto out_xattr;
pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL); pr_info("loaded (mds proto %d)\n", CEPH_MDSC_PROTOCOL);
return 0; return 0;
out_snap:
ceph_snap_exit();
out_xattr: out_xattr:
ceph_xattr_exit(); ceph_xattr_exit();
destroy_caches(); destroy_caches();
...@@ -1066,7 +1058,6 @@ static void __exit exit_ceph(void) ...@@ -1066,7 +1058,6 @@ static void __exit exit_ceph(void)
{ {
dout("exit_ceph\n"); dout("exit_ceph\n");
unregister_filesystem(&ceph_fs_type); unregister_filesystem(&ceph_fs_type);
ceph_snap_exit();
ceph_xattr_exit(); ceph_xattr_exit();
destroy_caches(); destroy_caches();
} }
......
...@@ -37,8 +37,7 @@ ...@@ -37,8 +37,7 @@
#define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */ #define CEPH_MOUNT_OPT_FSCACHE (1<<10) /* use fscache */
#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */ #define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */
#define CEPH_MOUNT_OPT_DEFAULT (CEPH_MOUNT_OPT_RBYTES | \ #define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE
CEPH_MOUNT_OPT_DCACHE)
#define ceph_set_mount_opt(fsc, opt) \ #define ceph_set_mount_opt(fsc, opt) \
(fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt; (fsc)->mount_options->flags |= CEPH_MOUNT_OPT_##opt;
...@@ -469,7 +468,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb, ...@@ -469,7 +468,7 @@ static inline struct inode *ceph_find_inode(struct super_block *sb,
#define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */ #define CEPH_I_POOL_PERM (1 << 4) /* pool rd/wr bits are valid */
#define CEPH_I_POOL_RD (1 << 5) /* can read from pool */ #define CEPH_I_POOL_RD (1 << 5) /* can read from pool */
#define CEPH_I_POOL_WR (1 << 6) /* can write to pool */ #define CEPH_I_POOL_WR (1 << 6) /* can write to pool */
#define CEPH_I_SEC_INITED (1 << 7) /* security initialized */
static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci, static inline void __ceph_dir_set_complete(struct ceph_inode_info *ci,
long long release_count, long long release_count,
...@@ -721,7 +720,6 @@ static inline int default_congestion_kb(void) ...@@ -721,7 +720,6 @@ static inline int default_congestion_kb(void)
/* snap.c */ /* snap.c */
extern struct ceph_snap_context *ceph_empty_snapc;
struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc, struct ceph_snap_realm *ceph_lookup_snap_realm(struct ceph_mds_client *mdsc,
u64 ino); u64 ino);
extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc, extern void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
...@@ -738,8 +736,6 @@ extern void ceph_queue_cap_snap(struct ceph_inode_info *ci); ...@@ -738,8 +736,6 @@ extern void ceph_queue_cap_snap(struct ceph_inode_info *ci);
extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci, extern int __ceph_finish_cap_snap(struct ceph_inode_info *ci,
struct ceph_cap_snap *capsnap); struct ceph_cap_snap *capsnap);
extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc); extern void ceph_cleanup_empty_realms(struct ceph_mds_client *mdsc);
extern int ceph_snap_init(void);
extern void ceph_snap_exit(void);
/* /*
* a cap_snap is "pending" if it is still awaiting an in-progress * a cap_snap is "pending" if it is still awaiting an in-progress
...@@ -808,6 +804,20 @@ extern void __init ceph_xattr_init(void); ...@@ -808,6 +804,20 @@ extern void __init ceph_xattr_init(void);
extern void ceph_xattr_exit(void); extern void ceph_xattr_exit(void);
extern const struct xattr_handler *ceph_xattr_handlers[]; extern const struct xattr_handler *ceph_xattr_handlers[];
/*
 * Security xattr helpers.  With CONFIG_SECURITY the real implementations
 * are declared here and defined out of line; without it both collapse to
 * inline stubs that always return false, so callers need no #ifdefs.
 */
#ifdef CONFIG_SECURITY
extern bool ceph_security_xattr_deadlock(struct inode *in);
extern bool ceph_security_xattr_wanted(struct inode *in);
#else
/* !CONFIG_SECURITY stub: always false. */
static inline bool ceph_security_xattr_deadlock(struct inode *in)
{
return false;
}
/* !CONFIG_SECURITY stub: always false. */
static inline bool ceph_security_xattr_wanted(struct inode *in)
{
return false;
}
#endif
/* acl.c */ /* acl.c */
struct ceph_acls_info { struct ceph_acls_info {
void *default_acl; void *default_acl;
...@@ -947,7 +957,6 @@ extern void ceph_dentry_lru_touch(struct dentry *dn); ...@@ -947,7 +957,6 @@ extern void ceph_dentry_lru_touch(struct dentry *dn);
extern void ceph_dentry_lru_del(struct dentry *dn); extern void ceph_dentry_lru_del(struct dentry *dn);
extern void ceph_invalidate_dentry_lease(struct dentry *dentry); extern void ceph_invalidate_dentry_lease(struct dentry *dentry);
extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn); extern unsigned ceph_dentry_hash(struct inode *dir, struct dentry *dn);
extern struct inode *ceph_get_dentry_parent_inode(struct dentry *dentry);
extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl); extern void ceph_readdir_cache_release(struct ceph_readdir_cache_control *ctl);
/* /*
......
...@@ -714,31 +714,62 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci) ...@@ -714,31 +714,62 @@ void __ceph_build_xattrs_blob(struct ceph_inode_info *ci)
} }
} }
/*
 * Return the mask carried by the MDS request found in
 * current->journal_info, or 0 if it does not apply to @in.
 *
 * A mask is reported only when a request is present and @in is its
 * r_target_inode.  LOOKUP, LOOKUPINO, LOOKUPPARENT and GETATTR requests
 * keep it in r_args.getattr.mask; OPEN and CREATE requests keep it in
 * r_args.open.mask.  Any other operation yields 0.
 */
static inline int __get_request_mask(struct inode *in)
{
	struct ceph_mds_request *cur_req = current->journal_info;

	/* No request in flight for this task, or it targets another inode. */
	if (!cur_req || cur_req->r_target_inode != in)
		return 0;

	switch (cur_req->r_op) {
	case CEPH_MDS_OP_LOOKUP:
	case CEPH_MDS_OP_LOOKUPINO:
	case CEPH_MDS_OP_LOOKUPPARENT:
	case CEPH_MDS_OP_GETATTR:
		return le32_to_cpu(cur_req->r_args.getattr.mask);
	case CEPH_MDS_OP_OPEN:
	case CEPH_MDS_OP_CREATE:
		return le32_to_cpu(cur_req->r_args.open.mask);
	default:
		return 0;
	}
}
ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
size_t size) size_t size)
{ {
struct ceph_inode_info *ci = ceph_inode(inode); struct ceph_inode_info *ci = ceph_inode(inode);
int err;
struct ceph_inode_xattr *xattr; struct ceph_inode_xattr *xattr;
struct ceph_vxattr *vxattr = NULL; struct ceph_vxattr *vxattr = NULL;
int req_mask;
int err;
if (!ceph_is_valid_xattr(name)) if (!ceph_is_valid_xattr(name))
return -ENODATA; return -ENODATA;
/* let's see if a virtual xattr was requested */ /* let's see if a virtual xattr was requested */
vxattr = ceph_match_vxattr(inode, name); vxattr = ceph_match_vxattr(inode, name);
if (vxattr && !(vxattr->exists_cb && !vxattr->exists_cb(ci))) { if (vxattr) {
err = -ENODATA;
if (!(vxattr->exists_cb && !vxattr->exists_cb(ci)))
err = vxattr->getxattr_cb(ci, value, size); err = vxattr->getxattr_cb(ci, value, size);
return err; return err;
} }
req_mask = __get_request_mask(inode);
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
dout("getxattr %p ver=%lld index_ver=%lld\n", inode, dout("getxattr %p ver=%lld index_ver=%lld\n", inode,
ci->i_xattrs.version, ci->i_xattrs.index_version); ci->i_xattrs.version, ci->i_xattrs.index_version);
if (ci->i_xattrs.version == 0 || if (ci->i_xattrs.version == 0 ||
!__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1)) { !((req_mask & CEPH_CAP_XATTR_SHARED) ||
__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 1))) {
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
/* security module gets xattr while filling trace */
if (current->journal_info != NULL) {
pr_warn_ratelimited("sync getxattr %p "
"during filling trace\n", inode);
return -EBUSY;
}
/* get xattrs from mds (if we don't already have them) */ /* get xattrs from mds (if we don't already have them) */
err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true); err = ceph_do_getattr(inode, CEPH_STAT_CAP_XATTR, true);
if (err) if (err)
...@@ -765,6 +796,9 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value, ...@@ -765,6 +796,9 @@ ssize_t __ceph_getxattr(struct inode *inode, const char *name, void *value,
memcpy(value, xattr->val, xattr->val_len); memcpy(value, xattr->val, xattr->val_len);
if (current->journal_info != NULL &&
!strncmp(name, XATTR_SECURITY_PREFIX, XATTR_SECURITY_PREFIX_LEN))
ci->i_ceph_flags |= CEPH_I_SEC_INITED;
out: out:
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
return err; return err;
...@@ -999,7 +1033,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, ...@@ -999,7 +1033,7 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
&prealloc_cf); &prealloc_cf);
ci->i_xattrs.dirty = true; ci->i_xattrs.dirty = true;
inode->i_ctime = CURRENT_TIME; inode->i_ctime = current_fs_time(inode->i_sb);
} }
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
...@@ -1015,7 +1049,15 @@ int __ceph_setxattr(struct dentry *dentry, const char *name, ...@@ -1015,7 +1049,15 @@ int __ceph_setxattr(struct dentry *dentry, const char *name,
do_sync_unlocked: do_sync_unlocked:
if (lock_snap_rwsem) if (lock_snap_rwsem)
up_read(&mdsc->snap_rwsem); up_read(&mdsc->snap_rwsem);
/* security module set xattr while filling trace */
if (current->journal_info != NULL) {
pr_warn_ratelimited("sync setxattr %p "
"during filling trace\n", inode);
err = -EBUSY;
} else {
err = ceph_sync_setxattr(dentry, name, value, size, flags); err = ceph_sync_setxattr(dentry, name, value, size, flags);
}
out: out:
ceph_free_cap_flush(prealloc_cf); ceph_free_cap_flush(prealloc_cf);
kfree(newname); kfree(newname);
...@@ -1136,7 +1178,7 @@ int __ceph_removexattr(struct dentry *dentry, const char *name) ...@@ -1136,7 +1178,7 @@ int __ceph_removexattr(struct dentry *dentry, const char *name)
dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL, dirty = __ceph_mark_dirty_caps(ci, CEPH_CAP_XATTR_EXCL,
&prealloc_cf); &prealloc_cf);
ci->i_xattrs.dirty = true; ci->i_xattrs.dirty = true;
inode->i_ctime = CURRENT_TIME; inode->i_ctime = current_fs_time(inode->i_sb);
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
if (lock_snap_rwsem) if (lock_snap_rwsem)
up_read(&mdsc->snap_rwsem); up_read(&mdsc->snap_rwsem);
...@@ -1164,3 +1206,25 @@ int ceph_removexattr(struct dentry *dentry, const char *name) ...@@ -1164,3 +1206,25 @@ int ceph_removexattr(struct dentry *dentry, const char *name)
return __ceph_removexattr(dentry, name); return __ceph_removexattr(dentry, name);
} }
#ifdef CONFIG_SECURITY
/*
 * Return true when @in has a non-NULL i_security pointer.
 * NOTE(review): presumably this means an LSM is attached to the inode and
 * will want its security xattr -- confirm against the callers.
 */
bool ceph_security_xattr_wanted(struct inode *in)
{
return in->i_security != NULL;
}
/*
 * Return true when @in has i_security set, CEPH_I_SEC_INITED is not yet
 * flagged on it, and its xattrs are not available locally (xattr version
 * is 0 or the CEPH_CAP_XATTR_SHARED cap is not issued).
 *
 * The flag and cap state are sampled under i_ceph_lock.
 * NOTE(review): the name suggests that instantiating such an inode while
 * filling a trace would make the security module issue a synchronous
 * getxattr -- confirm against the callers.
 */
bool ceph_security_xattr_deadlock(struct inode *in)
{
	struct ceph_inode_info *ci;
	bool deadlock = false;

	if (!in->i_security)
		return false;

	ci = ceph_inode(in);
	spin_lock(&ci->i_ceph_lock);
	if (!(ci->i_ceph_flags & CEPH_I_SEC_INITED)) {
		/* Only consult the caps when an xattr blob has been received. */
		bool cached = ci->i_xattrs.version > 0 &&
			__ceph_caps_issued_mask(ci, CEPH_CAP_XATTR_SHARED, 0);

		deadlock = !cached;
	}
	spin_unlock(&ci->i_ceph_lock);

	return deadlock;
}
#endif
...@@ -105,6 +105,7 @@ static inline u64 ceph_sanitize_features(u64 features) ...@@ -105,6 +105,7 @@ static inline u64 ceph_sanitize_features(u64 features)
*/ */
#define CEPH_FEATURES_SUPPORTED_DEFAULT \ #define CEPH_FEATURES_SUPPORTED_DEFAULT \
(CEPH_FEATURE_NOSRCADDR | \ (CEPH_FEATURE_NOSRCADDR | \
CEPH_FEATURE_SUBSCRIBE2 | \
CEPH_FEATURE_RECONNECT_SEQ | \ CEPH_FEATURE_RECONNECT_SEQ | \
CEPH_FEATURE_PGID64 | \ CEPH_FEATURE_PGID64 | \
CEPH_FEATURE_PGPOOL3 | \ CEPH_FEATURE_PGPOOL3 | \
...@@ -127,6 +128,7 @@ static inline u64 ceph_sanitize_features(u64 features) ...@@ -127,6 +128,7 @@ static inline u64 ceph_sanitize_features(u64 features)
#define CEPH_FEATURES_REQUIRED_DEFAULT \ #define CEPH_FEATURES_REQUIRED_DEFAULT \
(CEPH_FEATURE_NOSRCADDR | \ (CEPH_FEATURE_NOSRCADDR | \
CEPH_FEATURE_SUBSCRIBE2 | \
CEPH_FEATURE_RECONNECT_SEQ | \ CEPH_FEATURE_RECONNECT_SEQ | \
CEPH_FEATURE_PGID64 | \ CEPH_FEATURE_PGID64 | \
CEPH_FEATURE_PGPOOL3 | \ CEPH_FEATURE_PGPOOL3 | \
......
...@@ -198,8 +198,8 @@ struct ceph_client_mount { ...@@ -198,8 +198,8 @@ struct ceph_client_mount {
#define CEPH_SUBSCRIBE_ONETIME 1 /* i want only 1 update after have */ #define CEPH_SUBSCRIBE_ONETIME 1 /* i want only 1 update after have */
struct ceph_mon_subscribe_item { struct ceph_mon_subscribe_item {
__le64 have_version; __le64 have; __le64 start;
__u8 onetime; __u8 flags;
} __attribute__ ((packed)); } __attribute__ ((packed));
struct ceph_mon_subscribe_ack { struct ceph_mon_subscribe_ack {
...@@ -376,7 +376,8 @@ union ceph_mds_request_args { ...@@ -376,7 +376,8 @@ union ceph_mds_request_args {
__le32 stripe_count; /* ... */ __le32 stripe_count; /* ... */
__le32 object_size; __le32 object_size;
__le32 file_replication; __le32 file_replication;
__le32 unused; /* used to be preferred osd */ __le32 mask; /* CEPH_CAP_* */
__le32 old_size;
} __attribute__ ((packed)) open; } __attribute__ ((packed)) open;
struct { struct {
__le32 flags; __le32 flags;
......
...@@ -47,7 +47,6 @@ struct ceph_options { ...@@ -47,7 +47,6 @@ struct ceph_options {
unsigned long mount_timeout; /* jiffies */ unsigned long mount_timeout; /* jiffies */
unsigned long osd_idle_ttl; /* jiffies */ unsigned long osd_idle_ttl; /* jiffies */
unsigned long osd_keepalive_timeout; /* jiffies */ unsigned long osd_keepalive_timeout; /* jiffies */
unsigned long monc_ping_timeout; /* jiffies */
/* /*
* any type that can't be simply compared or doesn't need * any type that can't be simply compared or doesn't need
...@@ -68,7 +67,12 @@ struct ceph_options { ...@@ -68,7 +67,12 @@ struct ceph_options {
#define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000) #define CEPH_MOUNT_TIMEOUT_DEFAULT msecs_to_jiffies(60 * 1000)
#define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000) #define CEPH_OSD_KEEPALIVE_DEFAULT msecs_to_jiffies(5 * 1000)
#define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000) #define CEPH_OSD_IDLE_TTL_DEFAULT msecs_to_jiffies(60 * 1000)
#define CEPH_MONC_PING_TIMEOUT_DEFAULT msecs_to_jiffies(30 * 1000)
/* Monitor client intervals, converted to jiffies: 3 s, 10 s and 30 s. */
#define CEPH_MONC_HUNT_INTERVAL msecs_to_jiffies(3 * 1000)
#define CEPH_MONC_PING_INTERVAL msecs_to_jiffies(10 * 1000)
#define CEPH_MONC_PING_TIMEOUT msecs_to_jiffies(30 * 1000)
/*
 * NOTE(review): presumably the factor by which the hunt multiplier grows
 * on each retry -- confirm in the mon client implementation.
 */
#define CEPH_MONC_HUNT_BACKOFF 2
/* Upper bound of ceph_mon_client::hunt_mult, which ranges over [1..MAX]. */
#define CEPH_MONC_HUNT_MAX_MULT 10
#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024) #define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
#define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024) #define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024)
......
...@@ -68,18 +68,24 @@ struct ceph_mon_client { ...@@ -68,18 +68,24 @@ struct ceph_mon_client {
bool hunting; bool hunting;
int cur_mon; /* last monitor i contacted */ int cur_mon; /* last monitor i contacted */
unsigned long sub_sent, sub_renew_after; unsigned long sub_renew_after;
unsigned long sub_renew_sent;
struct ceph_connection con; struct ceph_connection con;
bool had_a_connection;
int hunt_mult; /* [1..CEPH_MONC_HUNT_MAX_MULT] */
/* pending generic requests */ /* pending generic requests */
struct rb_root generic_request_tree; struct rb_root generic_request_tree;
int num_generic_requests; int num_generic_requests;
u64 last_tid; u64 last_tid;
/* mds/osd map */ /* subs, indexed with CEPH_SUB_* */
int want_mdsmap; struct {
int want_next_osdmap; /* 1 = want, 2 = want+asked */ struct ceph_mon_subscribe_item item;
u32 have_osdmap, have_mdsmap; bool want;
u32 have; /* epoch */
} subs[3];
#ifdef CONFIG_DEBUG_FS #ifdef CONFIG_DEBUG_FS
struct dentry *debugfs_file; struct dentry *debugfs_file;
...@@ -93,14 +99,23 @@ extern int ceph_monmap_contains(struct ceph_monmap *m, ...@@ -93,14 +99,23 @@ extern int ceph_monmap_contains(struct ceph_monmap *m,
extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl); extern int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl);
extern void ceph_monc_stop(struct ceph_mon_client *monc); extern void ceph_monc_stop(struct ceph_mon_client *monc);
/*
 * Map subscription kinds.  These are passed as the @sub argument of
 * ceph_monc_want_map() and ceph_monc_got_map() and index
 * ceph_mon_client::subs[], so the values must stay dense and start at 0.
 */
enum {
	CEPH_SUB_MDSMAP = 0,
	CEPH_SUB_MONMAP = 1,
	CEPH_SUB_OSDMAP = 2,
};
extern const char *ceph_sub_str[];
/* /*
* The model here is to indicate that we need a new map of at least * The model here is to indicate that we need a new map of at least
* epoch @want, and also call in when we receive a map. We will * epoch @epoch, and also call in when we receive a map. We will
* periodically rerequest the map from the monitor cluster until we * periodically rerequest the map from the monitor cluster until we
* get what we want. * get what we want.
*/ */
extern int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 have); bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch,
extern int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 have); bool continuous);
void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch);
extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc); extern void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc);
extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, extern int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
......
...@@ -43,7 +43,8 @@ struct ceph_osd { ...@@ -43,7 +43,8 @@ struct ceph_osd {
}; };
#define CEPH_OSD_MAX_OP 3 #define CEPH_OSD_SLAB_OPS 2
#define CEPH_OSD_MAX_OPS 16
enum ceph_osd_data_type { enum ceph_osd_data_type {
CEPH_OSD_DATA_TYPE_NONE = 0, CEPH_OSD_DATA_TYPE_NONE = 0,
...@@ -77,7 +78,10 @@ struct ceph_osd_data { ...@@ -77,7 +78,10 @@ struct ceph_osd_data {
struct ceph_osd_req_op { struct ceph_osd_req_op {
u16 op; /* CEPH_OSD_OP_* */ u16 op; /* CEPH_OSD_OP_* */
u32 flags; /* CEPH_OSD_OP_FLAG_* */ u32 flags; /* CEPH_OSD_OP_FLAG_* */
u32 payload_len; u32 indata_len; /* request */
u32 outdata_len; /* reply */
s32 rval;
union { union {
struct ceph_osd_data raw_data_in; struct ceph_osd_data raw_data_in;
struct { struct {
...@@ -136,7 +140,6 @@ struct ceph_osd_request { ...@@ -136,7 +140,6 @@ struct ceph_osd_request {
/* request osd ops array */ /* request osd ops array */
unsigned int r_num_ops; unsigned int r_num_ops;
struct ceph_osd_req_op r_ops[CEPH_OSD_MAX_OP];
/* these are updated on each send */ /* these are updated on each send */
__le32 *r_request_osdmap_epoch; __le32 *r_request_osdmap_epoch;
...@@ -148,8 +151,6 @@ struct ceph_osd_request { ...@@ -148,8 +151,6 @@ struct ceph_osd_request {
struct ceph_eversion *r_request_reassert_version; struct ceph_eversion *r_request_reassert_version;
int r_result; int r_result;
int r_reply_op_len[CEPH_OSD_MAX_OP];
s32 r_reply_op_result[CEPH_OSD_MAX_OP];
int r_got_reply; int r_got_reply;
int r_linger; int r_linger;
...@@ -174,6 +175,8 @@ struct ceph_osd_request { ...@@ -174,6 +175,8 @@ struct ceph_osd_request {
unsigned long r_stamp; /* send OR check time */ unsigned long r_stamp; /* send OR check time */
struct ceph_snap_context *r_snapc; /* snap context for writes */ struct ceph_snap_context *r_snapc; /* snap context for writes */
struct ceph_osd_req_op r_ops[];
}; };
struct ceph_request_redirect { struct ceph_request_redirect {
...@@ -263,6 +266,8 @@ extern void osd_req_op_extent_init(struct ceph_osd_request *osd_req, ...@@ -263,6 +266,8 @@ extern void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
u64 truncate_size, u32 truncate_seq); u64 truncate_size, u32 truncate_seq);
extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req, extern void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
unsigned int which, u64 length); unsigned int which, u64 length);
extern void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
unsigned int which, u64 offset_inc);
extern struct ceph_osd_data *osd_req_op_extent_osd_data( extern struct ceph_osd_data *osd_req_op_extent_osd_data(
struct ceph_osd_request *osd_req, struct ceph_osd_request *osd_req,
......
...@@ -361,7 +361,6 @@ ceph_parse_options(char *options, const char *dev_name, ...@@ -361,7 +361,6 @@ ceph_parse_options(char *options, const char *dev_name,
opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT; opt->osd_keepalive_timeout = CEPH_OSD_KEEPALIVE_DEFAULT;
opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT; opt->mount_timeout = CEPH_MOUNT_TIMEOUT_DEFAULT;
opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT; opt->osd_idle_ttl = CEPH_OSD_IDLE_TTL_DEFAULT;
opt->monc_ping_timeout = CEPH_MONC_PING_TIMEOUT_DEFAULT;
/* get mon ip(s) */ /* get mon ip(s) */
/* ip1[:port1][,ip2[:port2]...] */ /* ip1[:port1][,ip2[:port2]...] */
...@@ -686,6 +685,9 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started) ...@@ -686,6 +685,9 @@ int __ceph_open_session(struct ceph_client *client, unsigned long started)
return client->auth_err; return client->auth_err;
} }
pr_info("client%llu fsid %pU\n", ceph_client_id(client), &client->fsid);
ceph_debugfs_client_init(client);
return 0; return 0;
} }
EXPORT_SYMBOL(__ceph_open_session); EXPORT_SYMBOL(__ceph_open_session);
......
...@@ -112,15 +112,20 @@ static int monc_show(struct seq_file *s, void *p) ...@@ -112,15 +112,20 @@ static int monc_show(struct seq_file *s, void *p)
struct ceph_mon_generic_request *req; struct ceph_mon_generic_request *req;
struct ceph_mon_client *monc = &client->monc; struct ceph_mon_client *monc = &client->monc;
struct rb_node *rp; struct rb_node *rp;
int i;
mutex_lock(&monc->mutex); mutex_lock(&monc->mutex);
if (monc->have_mdsmap) for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
seq_printf(s, "have mdsmap %u\n", (unsigned int)monc->have_mdsmap); seq_printf(s, "have %s %u", ceph_sub_str[i],
if (monc->have_osdmap) monc->subs[i].have);
seq_printf(s, "have osdmap %u\n", (unsigned int)monc->have_osdmap); if (monc->subs[i].want)
if (monc->want_next_osdmap) seq_printf(s, " want %llu%s",
seq_printf(s, "want next osdmap\n"); le64_to_cpu(monc->subs[i].item.start),
(monc->subs[i].item.flags &
CEPH_SUBSCRIBE_ONETIME ? "" : "+"));
seq_putc(s, '\n');
}
for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) { for (rp = rb_first(&monc->generic_request_tree); rp; rp = rb_next(rp)) {
__u16 op; __u16 op;
......
...@@ -235,18 +235,12 @@ static struct workqueue_struct *ceph_msgr_wq; ...@@ -235,18 +235,12 @@ static struct workqueue_struct *ceph_msgr_wq;
static int ceph_msgr_slab_init(void) static int ceph_msgr_slab_init(void)
{ {
BUG_ON(ceph_msg_cache); BUG_ON(ceph_msg_cache);
ceph_msg_cache = kmem_cache_create("ceph_msg", ceph_msg_cache = KMEM_CACHE(ceph_msg, 0);
sizeof (struct ceph_msg),
__alignof__(struct ceph_msg), 0, NULL);
if (!ceph_msg_cache) if (!ceph_msg_cache)
return -ENOMEM; return -ENOMEM;
BUG_ON(ceph_msg_data_cache); BUG_ON(ceph_msg_data_cache);
ceph_msg_data_cache = kmem_cache_create("ceph_msg_data", ceph_msg_data_cache = KMEM_CACHE(ceph_msg_data, 0);
sizeof (struct ceph_msg_data),
__alignof__(struct ceph_msg_data),
0, NULL);
if (ceph_msg_data_cache) if (ceph_msg_data_cache)
return 0; return 0;
...@@ -1221,25 +1215,19 @@ static void prepare_message_data(struct ceph_msg *msg, u32 data_len) ...@@ -1221,25 +1215,19 @@ static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
static void prepare_write_message_footer(struct ceph_connection *con) static void prepare_write_message_footer(struct ceph_connection *con)
{ {
struct ceph_msg *m = con->out_msg; struct ceph_msg *m = con->out_msg;
int v = con->out_kvec_left;
m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE; m->footer.flags |= CEPH_MSG_FOOTER_COMPLETE;
dout("prepare_write_message_footer %p\n", con); dout("prepare_write_message_footer %p\n", con);
con->out_kvec[v].iov_base = &m->footer; con_out_kvec_add(con, sizeof_footer(con), &m->footer);
if (con->peer_features & CEPH_FEATURE_MSG_AUTH) { if (con->peer_features & CEPH_FEATURE_MSG_AUTH) {
if (con->ops->sign_message) if (con->ops->sign_message)
con->ops->sign_message(m); con->ops->sign_message(m);
else else
m->footer.sig = 0; m->footer.sig = 0;
con->out_kvec[v].iov_len = sizeof(m->footer);
con->out_kvec_bytes += sizeof(m->footer);
} else { } else {
m->old_footer.flags = m->footer.flags; m->old_footer.flags = m->footer.flags;
con->out_kvec[v].iov_len = sizeof(m->old_footer);
con->out_kvec_bytes += sizeof(m->old_footer);
} }
con->out_kvec_left++;
con->out_more = m->more_to_follow; con->out_more = m->more_to_follow;
con->out_msg_done = true; con->out_msg_done = true;
} }
...@@ -2409,11 +2397,7 @@ static int read_partial_message(struct ceph_connection *con) ...@@ -2409,11 +2397,7 @@ static int read_partial_message(struct ceph_connection *con)
} }
/* footer */ /* footer */
if (need_sign) size = sizeof_footer(con);
size = sizeof(m->footer);
else
size = sizeof(m->old_footer);
end += size; end += size;
ret = read_partial(con, end, size, &m->footer); ret = read_partial(con, end, size, &m->footer);
if (ret <= 0) if (ret <= 0)
...@@ -3089,10 +3073,7 @@ void ceph_msg_revoke(struct ceph_msg *msg) ...@@ -3089,10 +3073,7 @@ void ceph_msg_revoke(struct ceph_msg *msg)
con->out_skip += con_out_kvec_skip(con); con->out_skip += con_out_kvec_skip(con);
} else { } else {
BUG_ON(!msg->data_length); BUG_ON(!msg->data_length);
if (con->peer_features & CEPH_FEATURE_MSG_AUTH) con->out_skip += sizeof_footer(con);
con->out_skip += sizeof(msg->footer);
else
con->out_skip += sizeof(msg->old_footer);
} }
/* data, middle, front */ /* data, middle, front */
if (msg->data_length) if (msg->data_length)
......
...@@ -122,51 +122,91 @@ static void __close_session(struct ceph_mon_client *monc) ...@@ -122,51 +122,91 @@ static void __close_session(struct ceph_mon_client *monc)
ceph_msg_revoke(monc->m_subscribe); ceph_msg_revoke(monc->m_subscribe);
ceph_msg_revoke_incoming(monc->m_subscribe_ack); ceph_msg_revoke_incoming(monc->m_subscribe_ack);
ceph_con_close(&monc->con); ceph_con_close(&monc->con);
monc->cur_mon = -1;
monc->pending_auth = 0; monc->pending_auth = 0;
ceph_auth_reset(monc->auth); ceph_auth_reset(monc->auth);
} }
/* /*
* Open a session with a (new) monitor. * Pick a new monitor at random and set cur_mon. If we are repicking
* (i.e. cur_mon is already set), be sure to pick a different one.
*/ */
static int __open_session(struct ceph_mon_client *monc) static void pick_new_mon(struct ceph_mon_client *monc)
{
int old_mon = monc->cur_mon;
BUG_ON(monc->monmap->num_mon < 1);
if (monc->monmap->num_mon == 1) {
monc->cur_mon = 0;
} else {
int max = monc->monmap->num_mon;
int o = -1;
int n;
if (monc->cur_mon >= 0) {
if (monc->cur_mon < monc->monmap->num_mon)
o = monc->cur_mon;
if (o >= 0)
max--;
}
n = prandom_u32() % max;
if (o >= 0 && n >= o)
n++;
monc->cur_mon = n;
}
dout("%s mon%d -> mon%d out of %d mons\n", __func__, old_mon,
monc->cur_mon, monc->monmap->num_mon);
}
/*
* Open a session with a new monitor.
*/
static void __open_session(struct ceph_mon_client *monc)
{ {
char r;
int ret; int ret;
if (monc->cur_mon < 0) { pick_new_mon(monc);
get_random_bytes(&r, 1);
monc->cur_mon = r % monc->monmap->num_mon; monc->hunting = true;
dout("open_session num=%d r=%d -> mon%d\n", if (monc->had_a_connection) {
monc->monmap->num_mon, r, monc->cur_mon); monc->hunt_mult *= CEPH_MONC_HUNT_BACKOFF;
monc->sub_sent = 0; if (monc->hunt_mult > CEPH_MONC_HUNT_MAX_MULT)
monc->hunt_mult = CEPH_MONC_HUNT_MAX_MULT;
}
monc->sub_renew_after = jiffies; /* i.e., expired */ monc->sub_renew_after = jiffies; /* i.e., expired */
monc->want_next_osdmap = !!monc->want_next_osdmap; monc->sub_renew_sent = 0;
dout("open_session mon%d opening\n", monc->cur_mon); dout("%s opening mon%d\n", __func__, monc->cur_mon);
ceph_con_open(&monc->con, ceph_con_open(&monc->con, CEPH_ENTITY_TYPE_MON, monc->cur_mon,
CEPH_ENTITY_TYPE_MON, monc->cur_mon,
&monc->monmap->mon_inst[monc->cur_mon].addr); &monc->monmap->mon_inst[monc->cur_mon].addr);
/* send an initial keepalive to ensure our timestamp is /*
* valid by the time we are in an OPENED state */ * send an initial keepalive to ensure our timestamp is valid
* by the time we are in an OPENED state
*/
ceph_con_keepalive(&monc->con); ceph_con_keepalive(&monc->con);
/* initiatiate authentication handshake */ /* initiate authentication handshake */
ret = ceph_auth_build_hello(monc->auth, ret = ceph_auth_build_hello(monc->auth,
monc->m_auth->front.iov_base, monc->m_auth->front.iov_base,
monc->m_auth->front_alloc_len); monc->m_auth->front_alloc_len);
BUG_ON(ret <= 0);
__send_prepared_auth_request(monc, ret); __send_prepared_auth_request(monc, ret);
} else {
dout("open_session mon%d already open\n", monc->cur_mon);
}
return 0;
} }
static bool __sub_expired(struct ceph_mon_client *monc) static void reopen_session(struct ceph_mon_client *monc)
{ {
return time_after_eq(jiffies, monc->sub_renew_after); if (!monc->hunting)
pr_info("mon%d %s session lost, hunting for new mon\n",
monc->cur_mon, ceph_pr_addr(&monc->con.peer_addr.in_addr));
__close_session(monc);
__open_session(monc);
} }
/* /*
...@@ -174,74 +214,70 @@ static bool __sub_expired(struct ceph_mon_client *monc) ...@@ -174,74 +214,70 @@ static bool __sub_expired(struct ceph_mon_client *monc)
*/ */
static void __schedule_delayed(struct ceph_mon_client *monc) static void __schedule_delayed(struct ceph_mon_client *monc)
{ {
struct ceph_options *opt = monc->client->options;
unsigned long delay; unsigned long delay;
if (monc->cur_mon < 0 || __sub_expired(monc)) { if (monc->hunting)
delay = 10 * HZ; delay = CEPH_MONC_HUNT_INTERVAL * monc->hunt_mult;
} else { else
delay = 20 * HZ; delay = CEPH_MONC_PING_INTERVAL;
if (opt->monc_ping_timeout > 0)
delay = min(delay, opt->monc_ping_timeout / 3);
}
dout("__schedule_delayed after %lu\n", delay); dout("__schedule_delayed after %lu\n", delay);
schedule_delayed_work(&monc->delayed_work, mod_delayed_work(system_wq, &monc->delayed_work,
round_jiffies_relative(delay)); round_jiffies_relative(delay));
} }
const char *ceph_sub_str[] = {
[CEPH_SUB_MDSMAP] = "mdsmap",
[CEPH_SUB_MONMAP] = "monmap",
[CEPH_SUB_OSDMAP] = "osdmap",
};
/* /*
* Send subscribe request for mdsmap and/or osdmap. * Send subscribe request for one or more maps, according to
* monc->subs.
*/ */
static void __send_subscribe(struct ceph_mon_client *monc) static void __send_subscribe(struct ceph_mon_client *monc)
{ {
dout("__send_subscribe sub_sent=%u exp=%u want_osd=%d\n",
(unsigned int)monc->sub_sent, __sub_expired(monc),
monc->want_next_osdmap);
if ((__sub_expired(monc) && !monc->sub_sent) ||
monc->want_next_osdmap == 1) {
struct ceph_msg *msg = monc->m_subscribe; struct ceph_msg *msg = monc->m_subscribe;
struct ceph_mon_subscribe_item *i; void *p = msg->front.iov_base;
void *p, *end; void *const end = p + msg->front_alloc_len;
int num; int num = 0;
int i;
p = msg->front.iov_base; dout("%s sent %lu\n", __func__, monc->sub_renew_sent);
end = p + msg->front_alloc_len;
num = 1 + !!monc->want_next_osdmap + !!monc->want_mdsmap; BUG_ON(monc->cur_mon < 0);
ceph_encode_32(&p, num);
if (!monc->sub_renew_sent)
monc->sub_renew_sent = jiffies | 1; /* never 0 */
if (monc->want_next_osdmap) { msg->hdr.version = cpu_to_le16(2);
dout("__send_subscribe to 'osdmap' %u\n",
(unsigned int)monc->have_osdmap); for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
ceph_encode_string(&p, end, "osdmap", 6); if (monc->subs[i].want)
i = p; num++;
i->have = cpu_to_le64(monc->have_osdmap);
i->onetime = 1;
p += sizeof(*i);
monc->want_next_osdmap = 2; /* requested */
} }
if (monc->want_mdsmap) { BUG_ON(num < 1); /* monmap sub is always there */
dout("__send_subscribe to 'mdsmap' %u+\n", ceph_encode_32(&p, num);
(unsigned int)monc->have_mdsmap); for (i = 0; i < ARRAY_SIZE(monc->subs); i++) {
ceph_encode_string(&p, end, "mdsmap", 6); const char *s = ceph_sub_str[i];
i = p;
i->have = cpu_to_le64(monc->have_mdsmap); if (!monc->subs[i].want)
i->onetime = 0; continue;
p += sizeof(*i);
dout("%s %s start %llu flags 0x%x\n", __func__, s,
le64_to_cpu(monc->subs[i].item.start),
monc->subs[i].item.flags);
ceph_encode_string(&p, end, s, strlen(s));
memcpy(p, &monc->subs[i].item, sizeof(monc->subs[i].item));
p += sizeof(monc->subs[i].item);
} }
ceph_encode_string(&p, end, "monmap", 6);
i = p;
i->have = 0;
i->onetime = 0;
p += sizeof(*i);
BUG_ON(p != (end - 35 - (ARRAY_SIZE(monc->subs) - num) * 19));
msg->front.iov_len = p - msg->front.iov_base; msg->front.iov_len = p - msg->front.iov_base;
msg->hdr.front_len = cpu_to_le32(msg->front.iov_len); msg->hdr.front_len = cpu_to_le32(msg->front.iov_len);
ceph_msg_revoke(msg); ceph_msg_revoke(msg);
ceph_con_send(&monc->con, ceph_msg_get(msg)); ceph_con_send(&monc->con, ceph_msg_get(msg));
monc->sub_sent = jiffies | 1; /* never 0 */
}
} }
static void handle_subscribe_ack(struct ceph_mon_client *monc, static void handle_subscribe_ack(struct ceph_mon_client *monc,
...@@ -255,15 +291,16 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc, ...@@ -255,15 +291,16 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc,
seconds = le32_to_cpu(h->duration); seconds = le32_to_cpu(h->duration);
mutex_lock(&monc->mutex); mutex_lock(&monc->mutex);
if (monc->hunting) { if (monc->sub_renew_sent) {
pr_info("mon%d %s session established\n", monc->sub_renew_after = monc->sub_renew_sent +
monc->cur_mon, (seconds >> 1) * HZ - 1;
ceph_pr_addr(&monc->con.peer_addr.in_addr)); dout("%s sent %lu duration %d renew after %lu\n", __func__,
monc->hunting = false; monc->sub_renew_sent, seconds, monc->sub_renew_after);
monc->sub_renew_sent = 0;
} else {
dout("%s sent %lu renew after %lu, ignoring\n", __func__,
monc->sub_renew_sent, monc->sub_renew_after);
} }
dout("handle_subscribe_ack after %d seconds\n", seconds);
monc->sub_renew_after = monc->sub_sent + (seconds >> 1)*HZ - 1;
monc->sub_sent = 0;
mutex_unlock(&monc->mutex); mutex_unlock(&monc->mutex);
return; return;
bad: bad:
...@@ -272,36 +309,82 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc, ...@@ -272,36 +309,82 @@ static void handle_subscribe_ack(struct ceph_mon_client *monc,
} }
/* /*
* Keep track of which maps we have * Register interest in a map
*
* @sub: one of CEPH_SUB_*
* @epoch: X for "every map since X", or 0 for "just the latest"
*/ */
int ceph_monc_got_mdsmap(struct ceph_mon_client *monc, u32 got) static bool __ceph_monc_want_map(struct ceph_mon_client *monc, int sub,
u32 epoch, bool continuous)
{ {
__le64 start = cpu_to_le64(epoch);
u8 flags = !continuous ? CEPH_SUBSCRIBE_ONETIME : 0;
dout("%s %s epoch %u continuous %d\n", __func__, ceph_sub_str[sub],
epoch, continuous);
if (monc->subs[sub].want &&
monc->subs[sub].item.start == start &&
monc->subs[sub].item.flags == flags)
return false;
monc->subs[sub].item.start = start;
monc->subs[sub].item.flags = flags;
monc->subs[sub].want = true;
return true;
}
bool ceph_monc_want_map(struct ceph_mon_client *monc, int sub, u32 epoch,
bool continuous)
{
bool need_request;
mutex_lock(&monc->mutex); mutex_lock(&monc->mutex);
monc->have_mdsmap = got; need_request = __ceph_monc_want_map(monc, sub, epoch, continuous);
mutex_unlock(&monc->mutex); mutex_unlock(&monc->mutex);
return 0;
return need_request;
}
EXPORT_SYMBOL(ceph_monc_want_map);
/*
* Keep track of which maps we have
*
* @sub: one of CEPH_SUB_*
*/
static void __ceph_monc_got_map(struct ceph_mon_client *monc, int sub,
u32 epoch)
{
dout("%s %s epoch %u\n", __func__, ceph_sub_str[sub], epoch);
if (monc->subs[sub].want) {
if (monc->subs[sub].item.flags & CEPH_SUBSCRIBE_ONETIME)
monc->subs[sub].want = false;
else
monc->subs[sub].item.start = cpu_to_le64(epoch + 1);
}
monc->subs[sub].have = epoch;
} }
EXPORT_SYMBOL(ceph_monc_got_mdsmap);
int ceph_monc_got_osdmap(struct ceph_mon_client *monc, u32 got) void ceph_monc_got_map(struct ceph_mon_client *monc, int sub, u32 epoch)
{ {
mutex_lock(&monc->mutex); mutex_lock(&monc->mutex);
monc->have_osdmap = got; __ceph_monc_got_map(monc, sub, epoch);
monc->want_next_osdmap = 0;
mutex_unlock(&monc->mutex); mutex_unlock(&monc->mutex);
return 0;
} }
EXPORT_SYMBOL(ceph_monc_got_map);
/* /*
* Register interest in the next osdmap * Register interest in the next osdmap
*/ */
void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc) void ceph_monc_request_next_osdmap(struct ceph_mon_client *monc)
{ {
dout("request_next_osdmap have %u\n", monc->have_osdmap); dout("%s have %u\n", __func__, monc->subs[CEPH_SUB_OSDMAP].have);
mutex_lock(&monc->mutex); mutex_lock(&monc->mutex);
if (!monc->want_next_osdmap) if (__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP,
monc->want_next_osdmap = 1; monc->subs[CEPH_SUB_OSDMAP].have + 1, false))
if (monc->want_next_osdmap < 2)
__send_subscribe(monc); __send_subscribe(monc);
mutex_unlock(&monc->mutex); mutex_unlock(&monc->mutex);
} }
...@@ -320,14 +403,14 @@ int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, ...@@ -320,14 +403,14 @@ int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
long ret; long ret;
mutex_lock(&monc->mutex); mutex_lock(&monc->mutex);
while (monc->have_osdmap < epoch) { while (monc->subs[CEPH_SUB_OSDMAP].have < epoch) {
mutex_unlock(&monc->mutex); mutex_unlock(&monc->mutex);
if (timeout && time_after_eq(jiffies, started + timeout)) if (timeout && time_after_eq(jiffies, started + timeout))
return -ETIMEDOUT; return -ETIMEDOUT;
ret = wait_event_interruptible_timeout(monc->client->auth_wq, ret = wait_event_interruptible_timeout(monc->client->auth_wq,
monc->have_osdmap >= epoch, monc->subs[CEPH_SUB_OSDMAP].have >= epoch,
ceph_timeout_jiffies(timeout)); ceph_timeout_jiffies(timeout));
if (ret < 0) if (ret < 0)
return ret; return ret;
...@@ -341,11 +424,14 @@ int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch, ...@@ -341,11 +424,14 @@ int ceph_monc_wait_osdmap(struct ceph_mon_client *monc, u32 epoch,
EXPORT_SYMBOL(ceph_monc_wait_osdmap); EXPORT_SYMBOL(ceph_monc_wait_osdmap);
/* /*
* * Open a session with a random monitor. Request monmap and osdmap,
* which are waited upon in __ceph_open_session().
*/ */
int ceph_monc_open_session(struct ceph_mon_client *monc) int ceph_monc_open_session(struct ceph_mon_client *monc)
{ {
mutex_lock(&monc->mutex); mutex_lock(&monc->mutex);
__ceph_monc_want_map(monc, CEPH_SUB_MONMAP, 0, true);
__ceph_monc_want_map(monc, CEPH_SUB_OSDMAP, 0, false);
__open_session(monc); __open_session(monc);
__schedule_delayed(monc); __schedule_delayed(monc);
mutex_unlock(&monc->mutex); mutex_unlock(&monc->mutex);
...@@ -353,29 +439,15 @@ int ceph_monc_open_session(struct ceph_mon_client *monc) ...@@ -353,29 +439,15 @@ int ceph_monc_open_session(struct ceph_mon_client *monc)
} }
EXPORT_SYMBOL(ceph_monc_open_session); EXPORT_SYMBOL(ceph_monc_open_session);
/*
* We require the fsid and global_id in order to initialize our
* debugfs dir.
*/
static bool have_debugfs_info(struct ceph_mon_client *monc)
{
dout("have_debugfs_info fsid %d globalid %lld\n",
(int)monc->client->have_fsid, monc->auth->global_id);
return monc->client->have_fsid && monc->auth->global_id > 0;
}
static void ceph_monc_handle_map(struct ceph_mon_client *monc, static void ceph_monc_handle_map(struct ceph_mon_client *monc,
struct ceph_msg *msg) struct ceph_msg *msg)
{ {
struct ceph_client *client = monc->client; struct ceph_client *client = monc->client;
struct ceph_monmap *monmap = NULL, *old = monc->monmap; struct ceph_monmap *monmap = NULL, *old = monc->monmap;
void *p, *end; void *p, *end;
int had_debugfs_info, init_debugfs = 0;
mutex_lock(&monc->mutex); mutex_lock(&monc->mutex);
had_debugfs_info = have_debugfs_info(monc);
dout("handle_monmap\n"); dout("handle_monmap\n");
p = msg->front.iov_base; p = msg->front.iov_base;
end = p + msg->front.iov_len; end = p + msg->front.iov_len;
...@@ -395,29 +467,11 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc, ...@@ -395,29 +467,11 @@ static void ceph_monc_handle_map(struct ceph_mon_client *monc,
client->monc.monmap = monmap; client->monc.monmap = monmap;
kfree(old); kfree(old);
if (!client->have_fsid) { __ceph_monc_got_map(monc, CEPH_SUB_MONMAP, monc->monmap->epoch);
client->have_fsid = true; client->have_fsid = true;
if (!had_debugfs_info && have_debugfs_info(monc)) {
pr_info("client%lld fsid %pU\n",
ceph_client_id(monc->client),
&monc->client->fsid);
init_debugfs = 1;
}
mutex_unlock(&monc->mutex);
if (init_debugfs) {
/*
* do debugfs initialization without mutex to avoid
* creating a locking dependency
*/
ceph_debugfs_client_init(monc->client);
}
goto out_unlocked;
}
out: out:
mutex_unlock(&monc->mutex); mutex_unlock(&monc->mutex);
out_unlocked:
wake_up_all(&client->auth_wq); wake_up_all(&client->auth_wq);
} }
...@@ -745,18 +799,15 @@ static void delayed_work(struct work_struct *work) ...@@ -745,18 +799,15 @@ static void delayed_work(struct work_struct *work)
dout("monc delayed_work\n"); dout("monc delayed_work\n");
mutex_lock(&monc->mutex); mutex_lock(&monc->mutex);
if (monc->hunting) { if (monc->hunting) {
__close_session(monc); dout("%s continuing hunt\n", __func__);
__open_session(monc); /* continue hunting */ reopen_session(monc);
} else { } else {
struct ceph_options *opt = monc->client->options;
int is_auth = ceph_auth_is_authenticated(monc->auth); int is_auth = ceph_auth_is_authenticated(monc->auth);
if (ceph_con_keepalive_expired(&monc->con, if (ceph_con_keepalive_expired(&monc->con,
opt->monc_ping_timeout)) { CEPH_MONC_PING_TIMEOUT)) {
dout("monc keepalive timeout\n"); dout("monc keepalive timeout\n");
is_auth = 0; is_auth = 0;
__close_session(monc); reopen_session(monc);
monc->hunting = true;
__open_session(monc);
} }
if (!monc->hunting) { if (!monc->hunting) {
...@@ -764,9 +815,15 @@ static void delayed_work(struct work_struct *work) ...@@ -764,9 +815,15 @@ static void delayed_work(struct work_struct *work)
__validate_auth(monc); __validate_auth(monc);
} }
if (is_auth) if (is_auth) {
unsigned long now = jiffies;
dout("%s renew subs? now %lu renew after %lu\n",
__func__, now, monc->sub_renew_after);
if (time_after_eq(now, monc->sub_renew_after))
__send_subscribe(monc); __send_subscribe(monc);
} }
}
__schedule_delayed(monc); __schedule_delayed(monc);
mutex_unlock(&monc->mutex); mutex_unlock(&monc->mutex);
} }
...@@ -852,18 +909,14 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl) ...@@ -852,18 +909,14 @@ int ceph_monc_init(struct ceph_mon_client *monc, struct ceph_client *cl)
&monc->client->msgr); &monc->client->msgr);
monc->cur_mon = -1; monc->cur_mon = -1;
monc->hunting = true; monc->had_a_connection = false;
monc->sub_renew_after = jiffies; monc->hunt_mult = 1;
monc->sub_sent = 0;
INIT_DELAYED_WORK(&monc->delayed_work, delayed_work); INIT_DELAYED_WORK(&monc->delayed_work, delayed_work);
monc->generic_request_tree = RB_ROOT; monc->generic_request_tree = RB_ROOT;
monc->num_generic_requests = 0; monc->num_generic_requests = 0;
monc->last_tid = 0; monc->last_tid = 0;
monc->have_mdsmap = 0;
monc->have_osdmap = 0;
monc->want_next_osdmap = 1;
return 0; return 0;
out_auth_reply: out_auth_reply:
...@@ -888,7 +941,7 @@ void ceph_monc_stop(struct ceph_mon_client *monc) ...@@ -888,7 +941,7 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
mutex_lock(&monc->mutex); mutex_lock(&monc->mutex);
__close_session(monc); __close_session(monc);
monc->cur_mon = -1;
mutex_unlock(&monc->mutex); mutex_unlock(&monc->mutex);
/* /*
...@@ -910,26 +963,40 @@ void ceph_monc_stop(struct ceph_mon_client *monc) ...@@ -910,26 +963,40 @@ void ceph_monc_stop(struct ceph_mon_client *monc)
} }
EXPORT_SYMBOL(ceph_monc_stop); EXPORT_SYMBOL(ceph_monc_stop);
static void finish_hunting(struct ceph_mon_client *monc)
{
if (monc->hunting) {
dout("%s found mon%d\n", __func__, monc->cur_mon);
monc->hunting = false;
monc->had_a_connection = true;
monc->hunt_mult /= 2; /* reduce by 50% */
if (monc->hunt_mult < 1)
monc->hunt_mult = 1;
}
}
static void handle_auth_reply(struct ceph_mon_client *monc, static void handle_auth_reply(struct ceph_mon_client *monc,
struct ceph_msg *msg) struct ceph_msg *msg)
{ {
int ret; int ret;
int was_auth = 0; int was_auth = 0;
int had_debugfs_info, init_debugfs = 0;
mutex_lock(&monc->mutex); mutex_lock(&monc->mutex);
had_debugfs_info = have_debugfs_info(monc);
was_auth = ceph_auth_is_authenticated(monc->auth); was_auth = ceph_auth_is_authenticated(monc->auth);
monc->pending_auth = 0; monc->pending_auth = 0;
ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base, ret = ceph_handle_auth_reply(monc->auth, msg->front.iov_base,
msg->front.iov_len, msg->front.iov_len,
monc->m_auth->front.iov_base, monc->m_auth->front.iov_base,
monc->m_auth->front_alloc_len); monc->m_auth->front_alloc_len);
if (ret > 0) {
__send_prepared_auth_request(monc, ret);
goto out;
}
finish_hunting(monc);
if (ret < 0) { if (ret < 0) {
monc->client->auth_err = ret; monc->client->auth_err = ret;
wake_up_all(&monc->client->auth_wq);
} else if (ret > 0) {
__send_prepared_auth_request(monc, ret);
} else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) { } else if (!was_auth && ceph_auth_is_authenticated(monc->auth)) {
dout("authenticated, starting session\n"); dout("authenticated, starting session\n");
...@@ -939,23 +1006,15 @@ static void handle_auth_reply(struct ceph_mon_client *monc, ...@@ -939,23 +1006,15 @@ static void handle_auth_reply(struct ceph_mon_client *monc,
__send_subscribe(monc); __send_subscribe(monc);
__resend_generic_request(monc); __resend_generic_request(monc);
}
if (!had_debugfs_info && have_debugfs_info(monc)) { pr_info("mon%d %s session established\n", monc->cur_mon,
pr_info("client%lld fsid %pU\n", ceph_pr_addr(&monc->con.peer_addr.in_addr));
ceph_client_id(monc->client),
&monc->client->fsid);
init_debugfs = 1;
} }
mutex_unlock(&monc->mutex);
if (init_debugfs) { out:
/* mutex_unlock(&monc->mutex);
* do debugfs initialization without mutex to avoid if (monc->client->auth_err < 0)
* creating a locking dependency wake_up_all(&monc->client->auth_wq);
*/
ceph_debugfs_client_init(monc->client);
}
} }
static int __validate_auth(struct ceph_mon_client *monc) static int __validate_auth(struct ceph_mon_client *monc)
...@@ -1096,29 +1155,17 @@ static void mon_fault(struct ceph_connection *con) ...@@ -1096,29 +1155,17 @@ static void mon_fault(struct ceph_connection *con)
{ {
struct ceph_mon_client *monc = con->private; struct ceph_mon_client *monc = con->private;
if (!monc)
return;
dout("mon_fault\n");
mutex_lock(&monc->mutex); mutex_lock(&monc->mutex);
if (!con->private) dout("%s mon%d\n", __func__, monc->cur_mon);
goto out; if (monc->cur_mon >= 0) {
if (!monc->hunting)
pr_info("mon%d %s session lost, "
"hunting for new mon\n", monc->cur_mon,
ceph_pr_addr(&monc->con.peer_addr.in_addr));
__close_session(monc);
if (!monc->hunting) { if (!monc->hunting) {
/* start hunting */ dout("%s hunting for new mon\n", __func__);
monc->hunting = true; reopen_session(monc);
__open_session(monc);
} else {
/* already hunting, let's wait a bit */
__schedule_delayed(monc); __schedule_delayed(monc);
} else {
dout("%s already hunting\n", __func__);
}
} }
out:
mutex_unlock(&monc->mutex); mutex_unlock(&monc->mutex);
} }
......
...@@ -338,9 +338,10 @@ static void ceph_osdc_release_request(struct kref *kref) ...@@ -338,9 +338,10 @@ static void ceph_osdc_release_request(struct kref *kref)
ceph_put_snap_context(req->r_snapc); ceph_put_snap_context(req->r_snapc);
if (req->r_mempool) if (req->r_mempool)
mempool_free(req, req->r_osdc->req_mempool); mempool_free(req, req->r_osdc->req_mempool);
else else if (req->r_num_ops <= CEPH_OSD_SLAB_OPS)
kmem_cache_free(ceph_osd_request_cache, req); kmem_cache_free(ceph_osd_request_cache, req);
else
kfree(req);
} }
void ceph_osdc_get_request(struct ceph_osd_request *req) void ceph_osdc_get_request(struct ceph_osd_request *req)
...@@ -369,28 +370,22 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, ...@@ -369,28 +370,22 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
struct ceph_msg *msg; struct ceph_msg *msg;
size_t msg_size; size_t msg_size;
BUILD_BUG_ON(CEPH_OSD_MAX_OP > U16_MAX);
BUG_ON(num_ops > CEPH_OSD_MAX_OP);
msg_size = 4 + 4 + 8 + 8 + 4+8;
msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
msg_size += 1 + 8 + 4 + 4; /* pg_t */
msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */
msg_size += 2 + num_ops*sizeof(struct ceph_osd_op);
msg_size += 8; /* snapid */
msg_size += 8; /* snap_seq */
msg_size += 8 * (snapc ? snapc->num_snaps : 0); /* snaps */
msg_size += 4;
if (use_mempool) { if (use_mempool) {
BUG_ON(num_ops > CEPH_OSD_SLAB_OPS);
req = mempool_alloc(osdc->req_mempool, gfp_flags); req = mempool_alloc(osdc->req_mempool, gfp_flags);
memset(req, 0, sizeof(*req)); } else if (num_ops <= CEPH_OSD_SLAB_OPS) {
req = kmem_cache_alloc(ceph_osd_request_cache, gfp_flags);
} else { } else {
req = kmem_cache_zalloc(ceph_osd_request_cache, gfp_flags); BUG_ON(num_ops > CEPH_OSD_MAX_OPS);
req = kmalloc(sizeof(*req) + num_ops * sizeof(req->r_ops[0]),
gfp_flags);
} }
if (req == NULL) if (unlikely(!req))
return NULL; return NULL;
/* req only, each op is zeroed in _osd_req_op_init() */
memset(req, 0, sizeof(*req));
req->r_osdc = osdc; req->r_osdc = osdc;
req->r_mempool = use_mempool; req->r_mempool = use_mempool;
req->r_num_ops = num_ops; req->r_num_ops = num_ops;
...@@ -408,18 +403,36 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc, ...@@ -408,18 +403,36 @@ struct ceph_osd_request *ceph_osdc_alloc_request(struct ceph_osd_client *osdc,
req->r_base_oloc.pool = -1; req->r_base_oloc.pool = -1;
req->r_target_oloc.pool = -1; req->r_target_oloc.pool = -1;
msg_size = OSD_OPREPLY_FRONT_LEN;
if (num_ops > CEPH_OSD_SLAB_OPS) {
/* ceph_osd_op and rval */
msg_size += (num_ops - CEPH_OSD_SLAB_OPS) *
(sizeof(struct ceph_osd_op) + 4);
}
/* create reply message */ /* create reply message */
if (use_mempool) if (use_mempool)
msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0); msg = ceph_msgpool_get(&osdc->msgpool_op_reply, 0);
else else
msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg = ceph_msg_new(CEPH_MSG_OSD_OPREPLY, msg_size,
OSD_OPREPLY_FRONT_LEN, gfp_flags, true); gfp_flags, true);
if (!msg) { if (!msg) {
ceph_osdc_put_request(req); ceph_osdc_put_request(req);
return NULL; return NULL;
} }
req->r_reply = msg; req->r_reply = msg;
msg_size = 4 + 4 + 4; /* client_inc, osdmap_epoch, flags */
msg_size += 4 + 4 + 4 + 8; /* mtime, reassert_version */
msg_size += 2 + 4 + 8 + 4 + 4; /* oloc */
msg_size += 1 + 8 + 4 + 4; /* pgid */
msg_size += 4 + CEPH_MAX_OID_NAME_LEN; /* oid */
msg_size += 2 + num_ops * sizeof(struct ceph_osd_op);
msg_size += 8; /* snapid */
msg_size += 8; /* snap_seq */
msg_size += 4 + 8 * (snapc ? snapc->num_snaps : 0); /* snaps */
msg_size += 4; /* retry_attempt */
/* create request message; allow space for oid */ /* create request message; allow space for oid */
if (use_mempool) if (use_mempool)
msg = ceph_msgpool_get(&osdc->msgpool_op, 0); msg = ceph_msgpool_get(&osdc->msgpool_op, 0);
...@@ -498,7 +511,7 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req, ...@@ -498,7 +511,7 @@ void osd_req_op_extent_init(struct ceph_osd_request *osd_req,
if (opcode == CEPH_OSD_OP_WRITE || opcode == CEPH_OSD_OP_WRITEFULL) if (opcode == CEPH_OSD_OP_WRITE || opcode == CEPH_OSD_OP_WRITEFULL)
payload_len += length; payload_len += length;
op->payload_len = payload_len; op->indata_len = payload_len;
} }
EXPORT_SYMBOL(osd_req_op_extent_init); EXPORT_SYMBOL(osd_req_op_extent_init);
...@@ -517,10 +530,32 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req, ...@@ -517,10 +530,32 @@ void osd_req_op_extent_update(struct ceph_osd_request *osd_req,
BUG_ON(length > previous); BUG_ON(length > previous);
op->extent.length = length; op->extent.length = length;
op->payload_len -= previous - length; op->indata_len -= previous - length;
} }
EXPORT_SYMBOL(osd_req_op_extent_update); EXPORT_SYMBOL(osd_req_op_extent_update);
/*
 * Set up op slot @which + 1 as a copy of the extent op in slot @which,
 * moved forward by @offset_inc bytes.
 *
 * @osd_req:    request whose r_ops[] array holds both ops
 * @which:      index of the already-initialized op to duplicate
 * @offset_inc: number of bytes to advance the new op's extent by
 *
 * The new op is initialized with the same opcode and flags as the source
 * op, then inherits its indata_len, outdata_len and extent.  The extent
 * offset grows by @offset_inc and the extent length shrinks by the same
 * amount.  For WRITE and WRITEFULL ops indata_len is reduced by
 * @offset_inc as well.
 *
 * BUGs if slot @which + 1 is not below osd_req->r_num_ops.  @offset_inc is
 * not validated against the source extent's length here; presumably the
 * caller guarantees it does not exceed that length.
 */
void osd_req_op_extent_dup_last(struct ceph_osd_request *osd_req,
				unsigned int which, u64 offset_inc)
{
	const unsigned int next = which + 1;
	struct ceph_osd_req_op *src;
	struct ceph_osd_req_op *dup;

	BUG_ON(next >= osd_req->r_num_ops);

	src = &osd_req->r_ops[which];
	dup = _osd_req_op_init(osd_req, next, src->op, src->flags);

	/* start from an exact copy of the source op's extent and lengths */
	dup->extent = src->extent;
	dup->outdata_len = src->outdata_len;
	dup->indata_len = src->indata_len;

	/* slide the window forward: later start, correspondingly shorter */
	dup->extent.length -= offset_inc;
	dup->extent.offset += offset_inc;

	/* only write-type ops have their indata_len shrunk with the extent */
	switch (dup->op) {
	case CEPH_OSD_OP_WRITE:
	case CEPH_OSD_OP_WRITEFULL:
		dup->indata_len -= offset_inc;
		break;
	default:
		break;
	}
}
EXPORT_SYMBOL(osd_req_op_extent_dup_last);
void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
u16 opcode, const char *class, const char *method) u16 opcode, const char *class, const char *method)
{ {
...@@ -554,7 +589,7 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which, ...@@ -554,7 +589,7 @@ void osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
op->cls.argc = 0; /* currently unused */ op->cls.argc = 0; /* currently unused */
op->payload_len = payload_len; op->indata_len = payload_len;
} }
EXPORT_SYMBOL(osd_req_op_cls_init); EXPORT_SYMBOL(osd_req_op_cls_init);
...@@ -587,7 +622,7 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which, ...@@ -587,7 +622,7 @@ int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
op->xattr.cmp_mode = cmp_mode; op->xattr.cmp_mode = cmp_mode;
ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist); ceph_osd_data_pagelist_init(&op->xattr.osd_data, pagelist);
op->payload_len = payload_len; op->indata_len = payload_len;
return 0; return 0;
} }
EXPORT_SYMBOL(osd_req_op_xattr_init); EXPORT_SYMBOL(osd_req_op_xattr_init);
...@@ -707,7 +742,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, ...@@ -707,7 +742,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE); BUG_ON(osd_data->type == CEPH_OSD_DATA_TYPE_NONE);
dst->cls.indata_len = cpu_to_le32(data_length); dst->cls.indata_len = cpu_to_le32(data_length);
ceph_osdc_msg_data_add(req->r_request, osd_data); ceph_osdc_msg_data_add(req->r_request, osd_data);
src->payload_len += data_length; src->indata_len += data_length;
request_data_len += data_length; request_data_len += data_length;
} }
osd_data = &src->cls.response_data; osd_data = &src->cls.response_data;
...@@ -750,7 +785,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req, ...@@ -750,7 +785,7 @@ static u64 osd_req_encode_op(struct ceph_osd_request *req,
dst->op = cpu_to_le16(src->op); dst->op = cpu_to_le16(src->op);
dst->flags = cpu_to_le32(src->flags); dst->flags = cpu_to_le32(src->flags);
dst->payload_len = cpu_to_le32(src->payload_len); dst->payload_len = cpu_to_le32(src->indata_len);
return request_data_len; return request_data_len;
} }
...@@ -1810,7 +1845,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) ...@@ -1810,7 +1845,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
ceph_decode_need(&p, end, 4, bad_put); ceph_decode_need(&p, end, 4, bad_put);
numops = ceph_decode_32(&p); numops = ceph_decode_32(&p);
if (numops > CEPH_OSD_MAX_OP) if (numops > CEPH_OSD_MAX_OPS)
goto bad_put; goto bad_put;
if (numops != req->r_num_ops) if (numops != req->r_num_ops)
goto bad_put; goto bad_put;
...@@ -1821,7 +1856,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) ...@@ -1821,7 +1856,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
int len; int len;
len = le32_to_cpu(op->payload_len); len = le32_to_cpu(op->payload_len);
req->r_reply_op_len[i] = len; req->r_ops[i].outdata_len = len;
dout(" op %d has %d bytes\n", i, len); dout(" op %d has %d bytes\n", i, len);
payload_len += len; payload_len += len;
p += sizeof(*op); p += sizeof(*op);
...@@ -1836,7 +1871,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg) ...@@ -1836,7 +1871,7 @@ static void handle_reply(struct ceph_osd_client *osdc, struct ceph_msg *msg)
ceph_decode_need(&p, end, 4 + numops * 4, bad_put); ceph_decode_need(&p, end, 4 + numops * 4, bad_put);
retry_attempt = ceph_decode_32(&p); retry_attempt = ceph_decode_32(&p);
for (i = 0; i < numops; i++) for (i = 0; i < numops; i++)
req->r_reply_op_result[i] = ceph_decode_32(&p); req->r_ops[i].rval = ceph_decode_32(&p);
if (le16_to_cpu(msg->hdr.version) >= 6) { if (le16_to_cpu(msg->hdr.version) >= 6) {
p += 8 + 4; /* skip replay_version */ p += 8 + 4; /* skip replay_version */
...@@ -2187,7 +2222,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) ...@@ -2187,7 +2222,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
goto bad; goto bad;
done: done:
downgrade_write(&osdc->map_sem); downgrade_write(&osdc->map_sem);
ceph_monc_got_osdmap(&osdc->client->monc, osdc->osdmap->epoch); ceph_monc_got_map(&osdc->client->monc, CEPH_SUB_OSDMAP,
osdc->osdmap->epoch);
/* /*
* subscribe to subsequent osdmap updates if full to ensure * subscribe to subsequent osdmap updates if full to ensure
...@@ -2646,8 +2682,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client) ...@@ -2646,8 +2682,8 @@ int ceph_osdc_init(struct ceph_osd_client *osdc, struct ceph_client *client)
round_jiffies_relative(osdc->client->options->osd_idle_ttl)); round_jiffies_relative(osdc->client->options->osd_idle_ttl));
err = -ENOMEM; err = -ENOMEM;
osdc->req_mempool = mempool_create_kmalloc_pool(10, osdc->req_mempool = mempool_create_slab_pool(10,
sizeof(struct ceph_osd_request)); ceph_osd_request_cache);
if (!osdc->req_mempool) if (!osdc->req_mempool)
goto out; goto out;
...@@ -2782,11 +2818,12 @@ EXPORT_SYMBOL(ceph_osdc_writepages); ...@@ -2782,11 +2818,12 @@ EXPORT_SYMBOL(ceph_osdc_writepages);
int ceph_osdc_setup(void) int ceph_osdc_setup(void)
{ {
size_t size = sizeof(struct ceph_osd_request) +
CEPH_OSD_SLAB_OPS * sizeof(struct ceph_osd_req_op);
BUG_ON(ceph_osd_request_cache); BUG_ON(ceph_osd_request_cache);
ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", ceph_osd_request_cache = kmem_cache_create("ceph_osd_request", size,
sizeof (struct ceph_osd_request), 0, 0, NULL);
__alignof__(struct ceph_osd_request),
0, NULL);
return ceph_osd_request_cache ? 0 : -ENOMEM; return ceph_osd_request_cache ? 0 : -ENOMEM;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment