Commit 31990f0f authored by Linus Torvalds

Merge tag 'ceph-for-4.20-rc1' of git://github.com/ceph/ceph-client

Pull ceph updates from Ilya Dryomov:
 "The highlights are:

   - a series that fixes some old memory allocation issues in libceph
     (myself). We no longer allocate memory in places where allocation
     failures cannot be handled and BUG when the allocation fails.

   - support for the copy_file_range() syscall (Luis Henriques). If size
     and alignment conditions are met, it leverages the RADOS copy-from
     operation; otherwise, a local copy is performed (see the sketch
     after this list).

   - a patch that reduces the memory requirement of ceph_sync_read()
     from the size of the entire read to the size of one object (Zheng Yan).

   - the fallocate() syscall is now restricted to FALLOC_FL_PUNCH_HOLE
     (Luis Henriques)"

* tag 'ceph-for-4.20-rc1' of git://github.com/ceph/ceph-client: (25 commits)
  ceph: new mount option to disable usage of copy-from op
  ceph: support copy_file_range file operation
  libceph: support the RADOS copy-from operation
  ceph: add non-blocking parameter to ceph_try_get_caps()
  libceph: check reply num_data_items in setup_request_data()
  libceph: preallocate message data items
  libceph, rbd, ceph: move ceph_osdc_alloc_messages() calls
  libceph: introduce alloc_watch_request()
  libceph: assign cookies in linger_submit()
  libceph: enable fallback to ceph_msg_new() in ceph_msgpool_get()
  ceph: num_ops is off by one in ceph_aio_retry_work()
  libceph: no need to call osd_req_opcode_valid() in osd_req_encode_op()
  ceph: set timeout conditionally in __cap_delay_requeue
  libceph: don't consume a ref on pagelist in ceph_msg_data_add_pagelist()
  libceph: introduce ceph_pagelist_alloc()
  libceph: osd_req_op_cls_init() doesn't need to take opcode
  libceph: bump CEPH_MSG_MAX_DATA_LEN
  ceph: only allow punch hole mode in fallocate
  ceph: refactor ceph_sync_read()
  ceph: check if LOOKUPNAME request was aborted when filling trace
  ...
parents a9ac6cc4 ea4cdc54
......@@ -151,6 +151,11 @@ Mount Options
Report overall filesystem usage in statfs instead of using the root
directory quota.
+nocopyfrom
+  Don't use the RADOS 'copy-from' operation to perform remote object
+  copies. Currently, it's only used in copy_file_range, which will revert
+  to the default VFS implementation if this option is used.
More Information
================
......
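For context, a minimal sketch of enabling the new option from C via
mount(2); the monitor address, mount point and key below are
placeholders, not values from this series:

	/* Hedged sketch: mount CephFS with nocopyfrom. */
	#include <stdio.h>
	#include <sys/mount.h>

	int main(void)
	{
		if (mount("192.168.1.1:6789:/", "/mnt/cephfs", "ceph", 0,
			  "name=admin,secret=<base64-key>,nocopyfrom") < 0) {
			perror("mount");
			return 1;
		}
		return 0;
	}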
......@@ -1500,9 +1500,6 @@ rbd_osd_req_create(struct rbd_obj_request *obj_req, unsigned int num_ops)
rbd_dev->header.object_prefix, obj_req->ex.oe_objno))
goto err_req;
-if (ceph_osdc_alloc_messages(req, GFP_NOIO))
-goto err_req;
return req;
err_req:
......@@ -1945,6 +1942,10 @@ static int __rbd_img_fill_request(struct rbd_img_request *img_req)
}
if (ret)
return ret;
+ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO);
+if (ret)
+return ret;
}
return 0;
......@@ -2374,8 +2375,7 @@ static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
if (!obj_req->osd_req)
return -ENOMEM;
-ret = osd_req_op_cls_init(obj_req->osd_req, 0, CEPH_OSD_OP_CALL, "rbd",
-"copyup");
+ret = osd_req_op_cls_init(obj_req->osd_req, 0, "rbd", "copyup");
if (ret)
return ret;
......@@ -2405,6 +2405,10 @@ static int rbd_obj_issue_copyup(struct rbd_obj_request *obj_req, u32 bytes)
rbd_assert(0);
}
+ret = ceph_osdc_alloc_messages(obj_req->osd_req, GFP_NOIO);
+if (ret)
+return ret;
rbd_obj_request_submit(obj_req);
return 0;
}
......@@ -3784,10 +3788,6 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
ceph_oloc_copy(&req->r_base_oloc, oloc);
req->r_flags = CEPH_OSD_FLAG_READ;
-ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
-if (ret)
-goto out_req;
pages = ceph_alloc_page_vector(num_pages, GFP_KERNEL);
if (IS_ERR(pages)) {
ret = PTR_ERR(pages);
......@@ -3798,6 +3798,10 @@ static int rbd_obj_read_sync(struct rbd_device *rbd_dev,
osd_req_op_extent_osd_data_pages(req, 0, pages, buf_len, 0, false,
true);
+ret = ceph_osdc_alloc_messages(req, GFP_KERNEL);
+if (ret)
+goto out_req;
ceph_osdc_start_request(osdc, req, false);
ret = ceph_osdc_wait_request(osdc, req);
if (ret >= 0)
......@@ -6067,7 +6071,7 @@ static ssize_t rbd_remove_single_major(struct bus_type *bus,
* create control files in sysfs
* /sys/bus/rbd/...
*/
-static int rbd_sysfs_init(void)
+static int __init rbd_sysfs_init(void)
{
int ret;
......@@ -6082,13 +6086,13 @@ static int rbd_sysfs_init(void)
return ret;
}
-static void rbd_sysfs_cleanup(void)
+static void __exit rbd_sysfs_cleanup(void)
{
bus_unregister(&rbd_bus_type);
device_unregister(&rbd_root_dev);
}
-static int rbd_slab_init(void)
+static int __init rbd_slab_init(void)
{
rbd_assert(!rbd_img_request_cache);
rbd_img_request_cache = KMEM_CACHE(rbd_img_request, 0);
......
......@@ -104,6 +104,11 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
struct timespec64 old_ctime = inode->i_ctime;
umode_t new_mode = inode->i_mode, old_mode = inode->i_mode;
+if (ceph_snap(inode) != CEPH_NOSNAP) {
+ret = -EROFS;
+goto out;
+}
switch (type) {
case ACL_TYPE_ACCESS:
name = XATTR_NAME_POSIX_ACL_ACCESS;
......@@ -138,11 +143,6 @@ int ceph_set_acl(struct inode *inode, struct posix_acl *acl, int type)
goto out_free;
}
-if (ceph_snap(inode) != CEPH_NOSNAP) {
-ret = -EROFS;
-goto out_free;
-}
if (new_mode != old_mode) {
newattrs.ia_ctime = current_time(inode);
newattrs.ia_mode = new_mode;
......@@ -206,10 +206,9 @@ int ceph_pre_init_acls(struct inode *dir, umode_t *mode,
tmp_buf = kmalloc(max(val_size1, val_size2), GFP_KERNEL);
if (!tmp_buf)
goto out_err;
-pagelist = kmalloc(sizeof(struct ceph_pagelist), GFP_KERNEL);
+pagelist = ceph_pagelist_alloc(GFP_KERNEL);
if (!pagelist)
goto out_err;
-ceph_pagelist_init(pagelist);
err = ceph_pagelist_reserve(pagelist, PAGE_SIZE);
if (err)
......
......@@ -322,7 +322,7 @@ static int start_read(struct inode *inode, struct ceph_rw_context *rw_ctx,
/* caller of readpages does not hold buffer and read caps
* (fadvise, madvise and readahead cases) */
int want = CEPH_CAP_FILE_CACHE;
-ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, &got);
+ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, want, true, &got);
if (ret < 0) {
dout("start_read %p, error getting cap\n", inode);
} else if (!(got & want)) {
......
......@@ -519,9 +519,9 @@ static void __cap_set_timeouts(struct ceph_mds_client *mdsc,
* -> we take mdsc->cap_delay_lock
*/
static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
-struct ceph_inode_info *ci)
+struct ceph_inode_info *ci,
+bool set_timeout)
{
-__cap_set_timeouts(mdsc, ci);
dout("__cap_delay_requeue %p flags %d at %lu\n", &ci->vfs_inode,
ci->i_ceph_flags, ci->i_hold_caps_max);
if (!mdsc->stopping) {
......@@ -531,6 +531,8 @@ static void __cap_delay_requeue(struct ceph_mds_client *mdsc,
goto no_change;
list_del_init(&ci->i_cap_delay_list);
}
+if (set_timeout)
+__cap_set_timeouts(mdsc, ci);
list_add_tail(&ci->i_cap_delay_list, &mdsc->cap_delay_list);
no_change:
spin_unlock(&mdsc->cap_delay_lock);
......@@ -720,7 +722,7 @@ void ceph_add_cap(struct inode *inode,
dout(" issued %s, mds wanted %s, actual %s, queueing\n",
ceph_cap_string(issued), ceph_cap_string(wanted),
ceph_cap_string(actual_wanted));
-__cap_delay_requeue(mdsc, ci);
+__cap_delay_requeue(mdsc, ci, true);
}
if (flags & CEPH_CAP_FLAG_AUTH) {
......@@ -1647,7 +1649,7 @@ int __ceph_mark_dirty_caps(struct ceph_inode_info *ci, int mask,
if (((was | ci->i_flushing_caps) & CEPH_CAP_FILE_BUFFER) &&
(mask & CEPH_CAP_FILE_BUFFER))
dirty |= I_DIRTY_DATASYNC;
-__cap_delay_requeue(mdsc, ci);
+__cap_delay_requeue(mdsc, ci, true);
return dirty;
}
......@@ -2065,7 +2067,7 @@ void ceph_check_caps(struct ceph_inode_info *ci, int flags,
/* Reschedule delayed caps release if we delayed anything */
if (delayed)
-__cap_delay_requeue(mdsc, ci);
+__cap_delay_requeue(mdsc, ci, false);
spin_unlock(&ci->i_ceph_lock);
......@@ -2125,7 +2127,7 @@ static int try_flush_caps(struct inode *inode, u64 *ptid)
if (delayed) {
spin_lock(&ci->i_ceph_lock);
-__cap_delay_requeue(mdsc, ci);
+__cap_delay_requeue(mdsc, ci, true);
spin_unlock(&ci->i_ceph_lock);
}
} else {
......@@ -2671,17 +2673,18 @@ static void check_max_size(struct inode *inode, loff_t endoff)
ceph_check_caps(ci, CHECK_CAPS_AUTHONLY, NULL);
}
-int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want, int *got)
+int ceph_try_get_caps(struct ceph_inode_info *ci, int need, int want,
+bool nonblock, int *got)
{
int ret, err = 0;
BUG_ON(need & ~CEPH_CAP_FILE_RD);
-BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO));
+BUG_ON(want & ~(CEPH_CAP_FILE_CACHE|CEPH_CAP_FILE_LAZYIO|CEPH_CAP_FILE_SHARED));
ret = ceph_pool_perm_check(ci, need);
if (ret < 0)
return ret;
-ret = try_get_cap_refs(ci, need, want, 0, true, got, &err);
+ret = try_get_cap_refs(ci, need, want, 0, nonblock, got, &err);
if (ret) {
if (err == -EAGAIN) {
ret = 0;
......
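The new bool selects non-blocking cap acquisition. A hedged sketch of a
caller that must not sleep; the surrounding context is illustrative,
not taken from this series:

	/* Hedged sketch: try to take read caps without blocking. */
	int got = 0;
	int ret;

	ret = ceph_try_get_caps(ci, CEPH_CAP_FILE_RD, CEPH_CAP_FILE_CACHE,
				true /* nonblock */, &got);
	if (ret <= 0)
		return ret;	/* 0: caps not immediately available */

	/* ... do the read, then drop the references ... */
	ceph_put_cap_refs(ci, got);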
......@@ -1132,8 +1132,12 @@ static struct dentry *splice_dentry(struct dentry *dn, struct inode *in)
if (IS_ERR(realdn)) {
pr_err("splice_dentry error %ld %p inode %p ino %llx.%llx\n",
PTR_ERR(realdn), dn, in, ceph_vinop(in));
-dput(dn);
-dn = realdn; /* note realdn contains the error */
+dn = realdn;
+/*
+* Caller should release 'dn' in the case of error.
+* If 'req->r_dentry' is passed to this function,
+* caller should leave 'req->r_dentry' untouched.
+*/
goto out;
} else if (realdn) {
dout("dn %p (%d) spliced with %p (%d) "
......@@ -1196,7 +1200,9 @@ int ceph_fill_trace(struct super_block *sb, struct ceph_mds_request *req)
WARN_ON_ONCE(1);
}
-if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME) {
+if (dir && req->r_op == CEPH_MDS_OP_LOOKUPNAME &&
+test_bit(CEPH_MDS_R_PARENT_LOCKED, &req->r_req_flags) &&
+!test_bit(CEPH_MDS_R_ABORTED, &req->r_req_flags)) {
struct qstr dname;
struct dentry *dn, *parent;
......@@ -1677,7 +1683,6 @@ int ceph_readdir_prepopulate(struct ceph_mds_request *req,
if (IS_ERR(realdn)) {
err = PTR_ERR(realdn);
d_drop(dn);
-dn = NULL;
goto next_item;
}
dn = realdn;
......
......@@ -2071,7 +2071,7 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
if (req->r_old_dentry_drop)
len += req->r_old_dentry->d_name.len;
-msg = ceph_msg_new(CEPH_MSG_CLIENT_REQUEST, len, GFP_NOFS, false);
+msg = ceph_msg_new2(CEPH_MSG_CLIENT_REQUEST, len, 1, GFP_NOFS, false);
if (!msg) {
msg = ERR_PTR(-ENOMEM);
goto out_free2;
......@@ -2136,7 +2136,6 @@ static struct ceph_msg *create_request_message(struct ceph_mds_client *mdsc,
if (req->r_pagelist) {
struct ceph_pagelist *pagelist = req->r_pagelist;
-refcount_inc(&pagelist->refcnt);
ceph_msg_data_add_pagelist(msg, pagelist);
msg->hdr.data_len = cpu_to_le32(pagelist->length);
} else {
......@@ -3126,12 +3125,11 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
pr_info("mds%d reconnect start\n", mds);
-pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
+pagelist = ceph_pagelist_alloc(GFP_NOFS);
if (!pagelist)
goto fail_nopagelist;
-ceph_pagelist_init(pagelist);
-reply = ceph_msg_new(CEPH_MSG_CLIENT_RECONNECT, 0, GFP_NOFS, false);
+reply = ceph_msg_new2(CEPH_MSG_CLIENT_RECONNECT, 0, 1, GFP_NOFS, false);
if (!reply)
goto fail_nomsg;
......@@ -3241,6 +3239,7 @@ static void send_mds_reconnect(struct ceph_mds_client *mdsc,
mutex_unlock(&mdsc->mutex);
up_read(&mdsc->snap_rwsem);
+ceph_pagelist_release(pagelist);
return;
fail:
......
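The pagelist hunks above encode a refcounting rule change:
ceph_msg_data_add_pagelist() now takes its own reference instead of
consuming the caller's, so the usual pattern becomes allocate, attach,
then drop your own ref. A hedged sketch, with illustrative locals:

	/* Hedged sketch of the new pagelist ownership pattern. */
	struct ceph_pagelist *pl;

	pl = ceph_pagelist_alloc(GFP_NOFS);
	if (!pl)
		return -ENOMEM;

	/* ... fill pl via ceph_pagelist_reserve()/ceph_pagelist_append() ... */

	ceph_msg_data_add_pagelist(msg, pl);	/* takes its own reference */
	ceph_pagelist_release(pl);		/* drop the allocation ref */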
......@@ -165,6 +165,8 @@ enum {
Opt_noacl,
Opt_quotadf,
Opt_noquotadf,
+Opt_copyfrom,
+Opt_nocopyfrom,
};
static match_table_t fsopt_tokens = {
......@@ -203,6 +205,8 @@ static match_table_t fsopt_tokens = {
{Opt_noacl, "noacl"},
{Opt_quotadf, "quotadf"},
{Opt_noquotadf, "noquotadf"},
{Opt_copyfrom, "copyfrom"},
{Opt_nocopyfrom, "nocopyfrom"},
{-1, NULL}
};
......@@ -355,6 +359,12 @@ static int parse_fsopt_token(char *c, void *private)
case Opt_noquotadf:
fsopt->flags |= CEPH_MOUNT_OPT_NOQUOTADF;
break;
+case Opt_copyfrom:
+fsopt->flags &= ~CEPH_MOUNT_OPT_NOCOPYFROM;
+break;
+case Opt_nocopyfrom:
+fsopt->flags |= CEPH_MOUNT_OPT_NOCOPYFROM;
+break;
#ifdef CONFIG_CEPH_FS_POSIX_ACL
case Opt_acl:
fsopt->sb_flags |= SB_POSIXACL;
......@@ -553,6 +563,9 @@ static int ceph_show_options(struct seq_file *m, struct dentry *root)
seq_puts(m, ",noacl");
#endif
+if (fsopt->flags & CEPH_MOUNT_OPT_NOCOPYFROM)
+seq_puts(m, ",nocopyfrom");
if (fsopt->mds_namespace)
seq_show_option(m, "mds_namespace", fsopt->mds_namespace);
if (fsopt->wsize != CEPH_MAX_WRITE_SIZE)
......
......@@ -40,6 +40,7 @@
#define CEPH_MOUNT_OPT_NOPOOLPERM (1<<11) /* no pool permission check */
#define CEPH_MOUNT_OPT_MOUNTWAIT (1<<12) /* mount waits if no mds is up */
#define CEPH_MOUNT_OPT_NOQUOTADF (1<<13) /* no root dir quota in statfs */
+#define CEPH_MOUNT_OPT_NOCOPYFROM (1<<14) /* don't use RADOS 'copy-from' op */
#define CEPH_MOUNT_OPT_DEFAULT CEPH_MOUNT_OPT_DCACHE
......@@ -1008,7 +1009,7 @@ extern int ceph_encode_dentry_release(void **p, struct dentry *dn,
extern int ceph_get_caps(struct ceph_inode_info *ci, int need, int want,
loff_t endoff, int *got, struct page **pinned_page);
extern int ceph_try_get_caps(struct ceph_inode_info *ci,
-int need, int want, int *got);
+int need, int want, bool nonblock, int *got);
/* for counting open files by mode */
extern void __ceph_get_fmode(struct ceph_inode_info *ci, int mode);
......
......@@ -951,11 +951,10 @@ static int ceph_sync_setxattr(struct inode *inode, const char *name,
if (size > 0) {
/* copy value into pagelist */
-pagelist = kmalloc(sizeof(*pagelist), GFP_NOFS);
+pagelist = ceph_pagelist_alloc(GFP_NOFS);
if (!pagelist)
return -ENOMEM;
-ceph_pagelist_init(pagelist);
err = ceph_pagelist_append(pagelist, value, size);
if (err)
goto out;
......
......@@ -81,7 +81,13 @@ struct ceph_options {
#define CEPH_MSG_MAX_FRONT_LEN (16*1024*1024)
#define CEPH_MSG_MAX_MIDDLE_LEN (16*1024*1024)
-#define CEPH_MSG_MAX_DATA_LEN (16*1024*1024)
+/*
+ * Handle the largest possible rbd object in one message.
+ * There is no limit on the size of cephfs objects, but it has to obey
+ * rsize and wsize mount options anyway.
+ */
+#define CEPH_MSG_MAX_DATA_LEN (32*1024*1024)
#define CEPH_AUTH_NAME_DEFAULT "guest"
......
......@@ -82,22 +82,6 @@ enum ceph_msg_data_type {
CEPH_MSG_DATA_BVECS, /* data source/destination is a bio_vec array */
};
-static __inline__ bool ceph_msg_data_type_valid(enum ceph_msg_data_type type)
-{
-switch (type) {
-case CEPH_MSG_DATA_NONE:
-case CEPH_MSG_DATA_PAGES:
-case CEPH_MSG_DATA_PAGELIST:
-#ifdef CONFIG_BLOCK
-case CEPH_MSG_DATA_BIO:
-#endif /* CONFIG_BLOCK */
-case CEPH_MSG_DATA_BVECS:
-return true;
-default:
-return false;
-}
-}
#ifdef CONFIG_BLOCK
struct ceph_bio_iter {
......@@ -181,7 +165,6 @@ struct ceph_bvec_iter {
} while (0)
struct ceph_msg_data {
-struct list_head links; /* ceph_msg->data */
enum ceph_msg_data_type type;
union {
#ifdef CONFIG_BLOCK
......@@ -202,7 +185,6 @@ struct ceph_msg_data {
struct ceph_msg_data_cursor {
size_t total_resid; /* across all data items */
-struct list_head *data_head; /* = &ceph_msg->data */
struct ceph_msg_data *data; /* current data item */
size_t resid; /* bytes not yet consumed */
......@@ -240,7 +222,9 @@ struct ceph_msg {
struct ceph_buffer *middle;
size_t data_length;
-struct list_head data;
+struct ceph_msg_data *data;
+int num_data_items;
+int max_data_items;
struct ceph_msg_data_cursor cursor;
struct ceph_connection *con;
......@@ -381,6 +365,8 @@ void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos,
void ceph_msg_data_add_bvecs(struct ceph_msg *msg,
struct ceph_bvec_iter *bvec_pos);
+struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items,
+gfp_t flags, bool can_fail);
extern struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
bool can_fail);
......
......@@ -13,14 +13,15 @@ struct ceph_msgpool {
mempool_t *pool;
int type; /* preallocated message type */
int front_len; /* preallocated payload size */
+int max_data_items;
};
-extern int ceph_msgpool_init(struct ceph_msgpool *pool, int type,
-int front_len, int size, bool blocking,
+int ceph_msgpool_init(struct ceph_msgpool *pool, int type,
+int front_len, int max_data_items, int size,
const char *name);
extern void ceph_msgpool_destroy(struct ceph_msgpool *pool);
-extern struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *,
-int front_len);
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len,
+int max_data_items);
extern void ceph_msgpool_put(struct ceph_msgpool *, struct ceph_msg *);
#endif
......@@ -136,6 +136,13 @@ struct ceph_osd_req_op {
u64 expected_object_size;
u64 expected_write_size;
} alloc_hint;
+struct {
+u64 snapid;
+u64 src_version;
+u8 flags;
+u32 src_fadvise_flags;
+struct ceph_osd_data osd_data;
+} copy_from;
};
};
......@@ -444,8 +451,7 @@ extern void osd_req_op_cls_response_data_pages(struct ceph_osd_request *,
struct page **pages, u64 length,
u32 alignment, bool pages_from_pool,
bool own_pages);
-extern int osd_req_op_cls_init(struct ceph_osd_request *osd_req,
-unsigned int which, u16 opcode,
+int osd_req_op_cls_init(struct ceph_osd_request *osd_req, unsigned int which,
const char *class, const char *method);
extern int osd_req_op_xattr_init(struct ceph_osd_request *osd_req, unsigned int which,
u16 opcode, const char *name, const void *value,
......@@ -511,6 +517,16 @@ extern int ceph_osdc_writepages(struct ceph_osd_client *osdc,
struct timespec64 *mtime,
struct page **pages, int nr_pages);
+int ceph_osdc_copy_from(struct ceph_osd_client *osdc,
+u64 src_snapid, u64 src_version,
+struct ceph_object_id *src_oid,
+struct ceph_object_locator *src_oloc,
+u32 src_fadvise_flags,
+struct ceph_object_id *dst_oid,
+struct ceph_object_locator *dst_oloc,
+u32 dst_fadvise_flags,
+u8 copy_from_flags);
/* watch/notify */
struct ceph_osd_linger_request *
ceph_osdc_watch(struct ceph_osd_client *osdc,
......
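A hedged sketch of how a caller might drive the new helper; the
snapid/version values and the flag choices are illustrative
assumptions, not the exact cephfs call:

	/* Hedged sketch: copy one object remotely via the new helper.
	 * CEPH_NOSNAP selects the head object; src_version 0 and the
	 * fadvise/copy-from flag choices are illustrative only. */
	static int copy_one_object(struct ceph_osd_client *osdc,
				   struct ceph_object_id *src_oid,
				   struct ceph_object_locator *src_oloc,
				   struct ceph_object_id *dst_oid,
				   struct ceph_object_locator *dst_oloc)
	{
		return ceph_osdc_copy_from(osdc, CEPH_NOSNAP, 0,
					   src_oid, src_oloc,
					   CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL,
					   dst_oid, dst_oloc,
					   CEPH_OSD_OP_FLAG_FADVISE_DONTNEED,
					   0 /* no CEPH_OSD_COPY_FROM_FLAG_* */);
	}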
......@@ -23,16 +23,7 @@ struct ceph_pagelist_cursor {
size_t room; /* room remaining to reset to */
};
-static inline void ceph_pagelist_init(struct ceph_pagelist *pl)
-{
-INIT_LIST_HEAD(&pl->head);
-pl->mapped_tail = NULL;
-pl->length = 0;
-pl->room = 0;
-INIT_LIST_HEAD(&pl->free_list);
-pl->num_pages_free = 0;
-refcount_set(&pl->refcnt, 1);
-}
+struct ceph_pagelist *ceph_pagelist_alloc(gfp_t gfp_flags);
extern void ceph_pagelist_release(struct ceph_pagelist *pl);
......
......@@ -410,6 +410,14 @@ enum {
enum {
CEPH_OSD_OP_FLAG_EXCL = 1, /* EXCL object create */
CEPH_OSD_OP_FLAG_FAILOK = 2, /* continue despite failure */
+CEPH_OSD_OP_FLAG_FADVISE_RANDOM = 0x4, /* the op is random */
+CEPH_OSD_OP_FLAG_FADVISE_SEQUENTIAL = 0x8, /* the op is sequential */
+CEPH_OSD_OP_FLAG_FADVISE_WILLNEED = 0x10,/* data will be accessed in
+the near future */
+CEPH_OSD_OP_FLAG_FADVISE_DONTNEED = 0x20,/* data will not be accessed
+in the near future */
+CEPH_OSD_OP_FLAG_FADVISE_NOCACHE = 0x40,/* data will be accessed only
+once by this client */
};
#define EOLDSNAPC ERESTART /* ORDERSNAP flag set; writer has old snapc*/
......@@ -431,6 +439,15 @@ enum {
CEPH_OSD_CMPXATTR_MODE_U64 = 2
};
+enum {
+CEPH_OSD_COPY_FROM_FLAG_FLUSH = 1, /* part of a flush operation */
+CEPH_OSD_COPY_FROM_FLAG_IGNORE_OVERLAY = 2, /* ignore pool overlay */
+CEPH_OSD_COPY_FROM_FLAG_IGNORE_CACHE = 4, /* ignore osd cache logic */
+CEPH_OSD_COPY_FROM_FLAG_MAP_SNAP_CLONE = 8, /* map snap direct to
+* cloneid */
+CEPH_OSD_COPY_FROM_FLAG_RWORDERED = 16, /* order with write */
+};
enum {
CEPH_OSD_WATCH_OP_UNWATCH = 0,
CEPH_OSD_WATCH_OP_LEGACY_WATCH = 1,
......@@ -497,6 +514,17 @@ struct ceph_osd_op {
__le64 expected_object_size;
__le64 expected_write_size;
} __attribute__ ((packed)) alloc_hint;
+struct {
+__le64 snapid;
+__le64 src_version;
+__u8 flags; /* CEPH_OSD_COPY_FROM_FLAG_* */
+/*
+ * CEPH_OSD_OP_FLAG_FADVISE_*: fadvise flags
+ * for src object, flags for dest object are in
+ * ceph_osd_op::flags.
+ */
+__le32 src_fadvise_flags;
+} __attribute__ ((packed)) copy_from;
};
__le32 payload_len;
} __attribute__ ((packed));
......
......@@ -156,7 +156,6 @@ static bool con_flag_test_and_set(struct ceph_connection *con,
/* Slab caches for frequently-allocated structures */
static struct kmem_cache *ceph_msg_cache;
-static struct kmem_cache *ceph_msg_data_cache;
/* static tag bytes (protocol control messages) */
static char tag_msg = CEPH_MSGR_TAG_MSG;
......@@ -235,23 +234,11 @@ static int ceph_msgr_slab_init(void)
if (!ceph_msg_cache)
return -ENOMEM;
-BUG_ON(ceph_msg_data_cache);
-ceph_msg_data_cache = KMEM_CACHE(ceph_msg_data, 0);
-if (ceph_msg_data_cache)
-return 0;
-kmem_cache_destroy(ceph_msg_cache);
-ceph_msg_cache = NULL;
-return -ENOMEM;
+return 0;
}
static void ceph_msgr_slab_exit(void)
{
-BUG_ON(!ceph_msg_data_cache);
-kmem_cache_destroy(ceph_msg_data_cache);
-ceph_msg_data_cache = NULL;
BUG_ON(!ceph_msg_cache);
kmem_cache_destroy(ceph_msg_cache);
ceph_msg_cache = NULL;
......@@ -1141,16 +1128,13 @@ static void __ceph_msg_data_cursor_init(struct ceph_msg_data_cursor *cursor)
static void ceph_msg_data_cursor_init(struct ceph_msg *msg, size_t length)
{
struct ceph_msg_data_cursor *cursor = &msg->cursor;
-struct ceph_msg_data *data;
BUG_ON(!length);
BUG_ON(length > msg->data_length);
-BUG_ON(list_empty(&msg->data));
+BUG_ON(!msg->num_data_items);
-cursor->data_head = &msg->data;
cursor->total_resid = length;
-data = list_first_entry(&msg->data, struct ceph_msg_data, links);
-cursor->data = data;
+cursor->data = msg->data;
__ceph_msg_data_cursor_init(cursor);
}
......@@ -1231,8 +1215,7 @@ static void ceph_msg_data_advance(struct ceph_msg_data_cursor *cursor,
if (!cursor->resid && cursor->total_resid) {
WARN_ON(!cursor->last_piece);
-BUG_ON(list_is_last(&cursor->data->links, cursor->data_head));
-cursor->data = list_next_entry(cursor->data, links);
+cursor->data++;
__ceph_msg_data_cursor_init(cursor);
new_piece = true;
}
......@@ -1248,9 +1231,6 @@ static size_t sizeof_footer(struct ceph_connection *con)
static void prepare_message_data(struct ceph_msg *msg, u32 data_len)
{
-BUG_ON(!msg);
-BUG_ON(!data_len);
/* Initialize data cursor */
ceph_msg_data_cursor_init(msg, (size_t)data_len);
......@@ -1590,7 +1570,7 @@ static int write_partial_message_data(struct ceph_connection *con)
dout("%s %p msg %p\n", __func__, con, msg);
-if (list_empty(&msg->data))
+if (!msg->num_data_items)
return -EINVAL;
/*
......@@ -2347,8 +2327,7 @@ static int read_partial_msg_data(struct ceph_connection *con)
u32 crc = 0;
int ret;
-BUG_ON(!msg);
-if (list_empty(&msg->data))
+if (!msg->num_data_items)
return -EIO;
if (do_datacrc)
......@@ -3256,32 +3235,16 @@ bool ceph_con_keepalive_expired(struct ceph_connection *con,
return false;
}
-static struct ceph_msg_data *ceph_msg_data_create(enum ceph_msg_data_type type)
+static struct ceph_msg_data *ceph_msg_data_add(struct ceph_msg *msg)
{
-struct ceph_msg_data *data;
-if (WARN_ON(!ceph_msg_data_type_valid(type)))
-return NULL;
-data = kmem_cache_zalloc(ceph_msg_data_cache, GFP_NOFS);
-if (!data)
-return NULL;
-data->type = type;
-INIT_LIST_HEAD(&data->links);
-return data;
+BUG_ON(msg->num_data_items >= msg->max_data_items);
+return &msg->data[msg->num_data_items++];
}
static void ceph_msg_data_destroy(struct ceph_msg_data *data)
{
-if (!data)
-return;
-WARN_ON(!list_empty(&data->links));
if (data->type == CEPH_MSG_DATA_PAGELIST)
ceph_pagelist_release(data->pagelist);
-kmem_cache_free(ceph_msg_data_cache, data);
}
void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
......@@ -3292,13 +3255,12 @@ void ceph_msg_data_add_pages(struct ceph_msg *msg, struct page **pages,
BUG_ON(!pages);
BUG_ON(!length);
-data = ceph_msg_data_create(CEPH_MSG_DATA_PAGES);
-BUG_ON(!data);
+data = ceph_msg_data_add(msg);
+data->type = CEPH_MSG_DATA_PAGES;
data->pages = pages;
data->length = length;
data->alignment = alignment & ~PAGE_MASK;
-list_add_tail(&data->links, &msg->data);
msg->data_length += length;
}
EXPORT_SYMBOL(ceph_msg_data_add_pages);
......@@ -3311,11 +3273,11 @@ void ceph_msg_data_add_pagelist(struct ceph_msg *msg,
BUG_ON(!pagelist);
BUG_ON(!pagelist->length);
-data = ceph_msg_data_create(CEPH_MSG_DATA_PAGELIST);
-BUG_ON(!data);
+data = ceph_msg_data_add(msg);
+data->type = CEPH_MSG_DATA_PAGELIST;
+refcount_inc(&pagelist->refcnt);
data->pagelist = pagelist;
-list_add_tail(&data->links, &msg->data);
msg->data_length += pagelist->length;
}
EXPORT_SYMBOL(ceph_msg_data_add_pagelist);
......@@ -3326,12 +3288,11 @@ void ceph_msg_data_add_bio(struct ceph_msg *msg, struct ceph_bio_iter *bio_pos,
{
struct ceph_msg_data *data;
-data = ceph_msg_data_create(CEPH_MSG_DATA_BIO);
-BUG_ON(!data);
+data = ceph_msg_data_add(msg);
+data->type = CEPH_MSG_DATA_BIO;
data->bio_pos = *bio_pos;
data->bio_length = length;
-list_add_tail(&data->links, &msg->data);
msg->data_length += length;
}
EXPORT_SYMBOL(ceph_msg_data_add_bio);
......@@ -3342,11 +3303,10 @@ void ceph_msg_data_add_bvecs(struct ceph_msg *msg,
{
struct ceph_msg_data *data;
-data = ceph_msg_data_create(CEPH_MSG_DATA_BVECS);
-BUG_ON(!data);
+data = ceph_msg_data_add(msg);
+data->type = CEPH_MSG_DATA_BVECS;
data->bvec_pos = *bvec_pos;
-list_add_tail(&data->links, &msg->data);
msg->data_length += bvec_pos->iter.bi_size;
}
EXPORT_SYMBOL(ceph_msg_data_add_bvecs);
......@@ -3355,8 +3315,8 @@ EXPORT_SYMBOL(ceph_msg_data_add_bvecs);
* construct a new message with given type, size
* the new msg has a ref count of 1.
*/
-struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
-bool can_fail)
+struct ceph_msg *ceph_msg_new2(int type, int front_len, int max_data_items,
+gfp_t flags, bool can_fail)
{
struct ceph_msg *m;
......@@ -3370,7 +3330,6 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
INIT_LIST_HEAD(&m->list_head);
kref_init(&m->kref);
-INIT_LIST_HEAD(&m->data);
/* front */
if (front_len) {
......@@ -3385,6 +3344,15 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
}
m->front_alloc_len = m->front.iov_len = front_len;
+if (max_data_items) {
+m->data = kmalloc_array(max_data_items, sizeof(*m->data),
+flags);
+if (!m->data)
+goto out2;
+m->max_data_items = max_data_items;
+}
dout("ceph_msg_new %p front %d\n", m, front_len);
return m;
......@@ -3401,6 +3369,13 @@ struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
}
return NULL;
}
+EXPORT_SYMBOL(ceph_msg_new2);
+struct ceph_msg *ceph_msg_new(int type, int front_len, gfp_t flags,
+bool can_fail)
+{
+return ceph_msg_new2(type, front_len, 0, flags, can_fail);
+}
EXPORT_SYMBOL(ceph_msg_new);
/*
......@@ -3496,13 +3471,14 @@ static void ceph_msg_free(struct ceph_msg *m)
{
dout("%s %p\n", __func__, m);
kvfree(m->front.iov_base);
+kfree(m->data);
kmem_cache_free(ceph_msg_cache, m);
}
static void ceph_msg_release(struct kref *kref)
{
struct ceph_msg *m = container_of(kref, struct ceph_msg, kref);
-struct ceph_msg_data *data, *next;
+int i;
dout("%s %p\n", __func__, m);
WARN_ON(!list_empty(&m->list_head));
......@@ -3515,11 +3491,8 @@ static void ceph_msg_release(struct kref *kref)
m->middle = NULL;
}
-list_for_each_entry_safe(data, next, &m->data, links) {
-list_del_init(&data->links);
-ceph_msg_data_destroy(data);
-}
-m->data_length = 0;
+for (i = 0; i < m->num_data_items; i++)
+ceph_msg_data_destroy(&m->data[i]);
if (m->pool)
ceph_msgpool_put(m->pool, m);
......
......@@ -14,7 +14,8 @@ static void *msgpool_alloc(gfp_t gfp_mask, void *arg)
struct ceph_msgpool *pool = arg;
struct ceph_msg *msg;
-msg = ceph_msg_new(pool->type, pool->front_len, gfp_mask, true);
+msg = ceph_msg_new2(pool->type, pool->front_len, pool->max_data_items,
+gfp_mask, true);
if (!msg) {
dout("msgpool_alloc %s failed\n", pool->name);
} else {
......@@ -35,11 +36,13 @@ static void msgpool_free(void *element, void *arg)
}
int ceph_msgpool_init(struct ceph_msgpool *pool, int type,
-int front_len, int size, bool blocking, const char *name)
+int front_len, int max_data_items, int size,
+const char *name)
{
dout("msgpool %s init\n", name);
pool->type = type;
pool->front_len = front_len;
+pool->max_data_items = max_data_items;
pool->pool = mempool_create(size, msgpool_alloc, msgpool_free, pool);
if (!pool->pool)
return -ENOMEM;
......@@ -53,18 +56,21 @@ void ceph_msgpool_destroy(struct ceph_msgpool *pool)
mempool_destroy(pool->pool);
}
-struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool,
-int front_len)
+struct ceph_msg *ceph_msgpool_get(struct ceph_msgpool *pool, int front_len,
+int max_data_items)
{
struct ceph_msg *msg;
-if (front_len > pool->front_len) {
-dout("msgpool_get %s need front %d, pool size is %d\n",
-pool->name, front_len, pool->front_len);
-WARN_ON(1);
+if (front_len > pool->front_len ||
+max_data_items > pool->max_data_items) {
+pr_warn_ratelimited("%s need %d/%d, pool %s has %d/%d\n",
+__func__, front_len, max_data_items, pool->name,
+pool->front_len, pool->max_data_items);
+WARN_ON_ONCE(1);
/* try to alloc a fresh message */
-return ceph_msg_new(pool->type, front_len, GFP_NOFS, false);
+return ceph_msg_new2(pool->type, front_len, max_data_items,
+GFP_NOFS, false);
}
msg = mempool_alloc(pool->pool, GFP_NOFS);
......@@ -80,6 +86,9 @@ void ceph_msgpool_put(struct ceph_msgpool *pool, struct ceph_msg *msg)
msg->front.iov_len = pool->front_len;
msg->hdr.front_len = cpu_to_le32(pool->front_len);
+msg->data_length = 0;
+msg->num_data_items = 0;
kref_init(&msg->kref); /* retake single ref */
mempool_free(msg, pool->pool);
}
......@@ -6,6 +6,26 @@
#include <linux/highmem.h>
#include <linux/ceph/pagelist.h>
+struct ceph_pagelist *ceph_pagelist_alloc(gfp_t gfp_flags)
+{
+struct ceph_pagelist *pl;
+pl = kmalloc(sizeof(*pl), gfp_flags);
+if (!pl)
+return NULL;
+INIT_LIST_HEAD(&pl->head);
+pl->mapped_tail = NULL;
+pl->length = 0;
+pl->room = 0;
+INIT_LIST_HEAD(&pl->free_list);
+pl->num_pages_free = 0;
+refcount_set(&pl->refcnt, 1);
+return pl;
+}
+EXPORT_SYMBOL(ceph_pagelist_alloc);
static void ceph_pagelist_unmap_tail(struct ceph_pagelist *pl)
{
if (pl->mapped_tail) {
......