Commit 553adfd9 authored by Yan, Zheng, committed by Ilya Dryomov

ceph: track pending caps flushing accurately

Previously we did not track an accurate TID for flushing caps. When
the MDS fails over, we have no choice but to re-send all flushing caps
with a new TID. This can cause problems because the MDS may have
already flushed some caps and issued the same caps to another client.
The re-sent cap flush has a new TID, which makes the MDS unable to
detect whether it has already processed the cap flush.

This patch adds code to track pending cap flushes accurately. When a
cap flush needs to be re-sent, its original flush TID is used.
Signed-off-by: Yan, Zheng <zyan@redhat.com>
parent 6c13a6bb
This diff is collapsed.
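The collapsed file above is likely fs/ceph/caps.c, which would carry the flush-time side of this change. As a hedged sketch of that mechanism, assuming the field names introduced by this patch: each flush takes the next value of mdsc->last_cap_flush_tid and is inserted into the inode's i_cap_flush_tree, ordered by TID. The helper name record_cap_flush() and the locking comment are illustrative assumptions, not code from this commit.

#include <linux/rbtree.h>

/* Illustrative only: assign the next global flush TID and insert the
 * pending flush into the inode's TID-ordered tree.  Assumes the caller
 * holds whatever locks protect last_cap_flush_tid and the tree. */
static void record_cap_flush(struct ceph_mds_client *mdsc,
                             struct ceph_inode_info *ci,
                             struct ceph_cap_flush *cf, int flushing)
{
        struct rb_node **p = &ci->i_cap_flush_tree.rb_node;
        struct rb_node *parent = NULL;
        struct ceph_cap_flush *other;

        cf->caps = flushing;
        cf->tid = ++mdsc->last_cap_flush_tid;   /* monotonic, never reused */

        while (*p) {
                parent = *p;
                other = rb_entry(parent, struct ceph_cap_flush, i_node);
                if (cf->tid < other->tid)
                        p = &(*p)->rb_left;
                else
                        p = &(*p)->rb_right;    /* TIDs are unique */
        }
        rb_link_node(&cf->i_node, parent, p);
        rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree);
}

Because the TID is stored with the flush itself, a later re-send can reuse cf->tid instead of allocating a new one.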
--- a/fs/ceph/inode.c
+++ b/fs/ceph/inode.c
@@ -417,8 +417,7 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
 	INIT_LIST_HEAD(&ci->i_dirty_item);
 	INIT_LIST_HEAD(&ci->i_flushing_item);
 	ci->i_cap_flush_seq = 0;
-	ci->i_cap_flush_last_tid = 0;
-	memset(&ci->i_cap_flush_tid, 0, sizeof(ci->i_cap_flush_tid));
+	ci->i_cap_flush_tree = RB_ROOT;
 	init_waitqueue_head(&ci->i_cap_wq);
 	ci->i_hold_caps_min = 0;
 	ci->i_hold_caps_max = 0;
...
--- a/fs/ceph/mds_client.c
+++ b/fs/ceph/mds_client.c
@@ -1142,6 +1142,7 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 				  void *arg)
 {
 	struct ceph_inode_info *ci = ceph_inode(inode);
+	LIST_HEAD(to_remove);
 	int drop = 0;
 
 	dout("removing cap %p, ci is %p, inode is %p\n",
@@ -1149,9 +1150,19 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 	spin_lock(&ci->i_ceph_lock);
 	__ceph_remove_cap(cap, false);
 	if (!ci->i_auth_cap) {
+		struct ceph_cap_flush *cf;
 		struct ceph_mds_client *mdsc =
 			ceph_sb_to_client(inode->i_sb)->mdsc;
 
+		while (true) {
+			struct rb_node *n = rb_first(&ci->i_cap_flush_tree);
+			if (!n)
+				break;
+			cf = rb_entry(n, struct ceph_cap_flush, i_node);
+			rb_erase(&cf->i_node, &ci->i_cap_flush_tree);
+			list_add(&cf->list, &to_remove);
+		}
+
 		spin_lock(&mdsc->cap_dirty_lock);
 		if (!list_empty(&ci->i_dirty_item)) {
 			pr_warn_ratelimited(
@@ -1173,8 +1184,16 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
 			drop = 1;
 		}
 		spin_unlock(&mdsc->cap_dirty_lock);
 	}
 	spin_unlock(&ci->i_ceph_lock);
+
+	while (!list_empty(&to_remove)) {
+		struct ceph_cap_flush *cf;
+		cf = list_first_entry(&to_remove,
+				      struct ceph_cap_flush, list);
+		list_del(&cf->list);
+		kfree(cf);
+	}
 	while (drop--)
 		iput(inode);
 	return 0;
@@ -3408,6 +3427,7 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
 	INIT_LIST_HEAD(&mdsc->snap_flush_list);
 	spin_lock_init(&mdsc->snap_flush_lock);
 	mdsc->cap_flush_seq = 0;
+	mdsc->last_cap_flush_tid = 1;
 	INIT_LIST_HEAD(&mdsc->cap_dirty);
 	INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
 	mdsc->num_cap_flushing = 0;
...
--- a/fs/ceph/mds_client.h
+++ b/fs/ceph/mds_client.h
@@ -307,6 +307,7 @@ struct ceph_mds_client {
 	spinlock_t        snap_flush_lock;
 
 	u64               cap_flush_seq;
+	u64               last_cap_flush_tid;
 	struct list_head  cap_dirty;        /* inodes with dirty caps */
 	struct list_head  cap_dirty_migrating; /* ...that are migration... */
 	int               num_cap_flushing; /* # caps we are flushing */
...
--- a/fs/ceph/super.h
+++ b/fs/ceph/super.h
@@ -186,6 +186,15 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
 	}
 }
 
+struct ceph_cap_flush {
+	u64 tid;
+	int caps;
+	union {
+		struct rb_node i_node;
+		struct list_head list;
+	};
+};
+
 /*
  * The frag tree describes how a directory is fragmented, potentially across
  * multiple metadata servers.  It is also used to indicate points where
@@ -299,7 +308,7 @@ struct ceph_inode_info {
 	/* we need to track cap writeback on a per-cap-bit basis, to allow
 	 * overlapping, pipelined cap flushes to the mds.  we can probably
 	 * reduce the tid to 8 bits if we're concerned about inode size. */
-	u16 i_cap_flush_last_tid, i_cap_flush_tid[CEPH_CAP_BITS];
+	struct rb_root i_cap_flush_tree;
 	wait_queue_head_t i_cap_wq;      /* threads waiting on a capability */
 	unsigned long i_hold_caps_min; /* jiffies */
 	unsigned long i_hold_caps_max; /* jiffies */
...
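Two design points are worth noting in the hunks above. First, the union in struct ceph_cap_flush is safe because the two links are never live at the same time: an entry sits on i_cap_flush_tree via i_node while the flush is pending, and is only placed on a list via list after rb_erase() has removed it from the tree, which is exactly what remove_session_caps_cb does during session teardown. Second, the TID-ordered tree is what makes re-sending after an MDS failover possible with the original TID. Below is a hedged sketch of such a re-send path; __resend_cap_flushes() and send_flush_msg() are hypothetical names, not from this commit, and only the traversal pattern and the TID reuse are the point.

/* Illustrative only: walk the pending flushes in TID order and re-send
 * each with its ORIGINAL TID, so the MDS can recognize flushes it has
 * already processed.  Assumes i_ceph_lock is held by the caller. */
static void __resend_cap_flushes(struct ceph_mds_session *session,
                                 struct ceph_inode_info *ci)
{
        struct ceph_cap_flush *cf;
        struct rb_node *n;

        for (n = rb_first(&ci->i_cap_flush_tree); n; n = rb_next(n)) {
                cf = rb_entry(n, struct ceph_cap_flush, i_node);
                /* cf->tid was set when the flush was first issued;
                 * reusing it makes the re-sent flush idempotent. */
                send_flush_msg(session, ci, cf->caps, cf->tid);
        }
}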