Commit 8310b089 authored by Yan, Zheng, committed by Ilya Dryomov

ceph: track pending caps flushing globally

This lets us know the TID of the oldest pending caps flush. A later patch
will send this information to the MDS, so that the MDS can trim its list of
completed caps flushes.

Tracking pending caps flushing globally also simplifies syncfs code.
Signed-off-by: Yan, Zheng <zyan@redhat.com>
parent 553adfd9
...@@ -1415,6 +1415,29 @@ static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci, ...@@ -1415,6 +1415,29 @@ static void __add_cap_flushing_to_inode(struct ceph_inode_info *ci,
rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree); rb_insert_color(&cf->i_node, &ci->i_cap_flush_tree);
} }
/*
 * Insert a cap flush record into the mds client's global flush tree.
 *
 * The tree (mdsc->cap_flush_tree) is ordered by cf->tid and linked through
 * cf->g_node, so its leftmost node is the pending flush with the smallest
 * tid.  Two records with the same tid are treated as a fatal error.
 *
 * NOTE(review): the visible caller invokes this before dropping
 * mdsc->cap_dirty_lock, so that lock presumably protects the tree --
 * confirm against the full file.
 */
static void __add_cap_flushing_to_mdsc(struct ceph_mds_client *mdsc,
				       struct ceph_cap_flush *cf)
{
	struct rb_node **link = &mdsc->cap_flush_tree.rb_node;
	struct rb_node *parent = NULL;

	/* Walk down to the empty slot where the new node belongs. */
	while (*link) {
		struct ceph_cap_flush *cur;

		parent = *link;
		cur = rb_entry(parent, struct ceph_cap_flush, g_node);

		if (cf->tid == cur->tid)
			BUG();	/* a tid must never be inserted twice */
		else if (cf->tid < cur->tid)
			link = &parent->rb_left;
		else
			link = &parent->rb_right;
	}

	/* Attach at the slot found above, then rebalance. */
	rb_link_node(&cf->g_node, parent, link);
	rb_insert_color(&cf->g_node, &mdsc->cap_flush_tree);
}
/* /*
* Add dirty inode to the flushing list. Assigned a seq number so we * Add dirty inode to the flushing list. Assigned a seq number so we
* can wait for caps to flush without starving. * can wait for caps to flush without starving.
...@@ -1449,17 +1472,16 @@ static int __mark_caps_flushing(struct inode *inode, ...@@ -1449,17 +1472,16 @@ static int __mark_caps_flushing(struct inode *inode,
list_del_init(&ci->i_dirty_item); list_del_init(&ci->i_dirty_item);
cf->tid = ++mdsc->last_cap_flush_tid; cf->tid = ++mdsc->last_cap_flush_tid;
__add_cap_flushing_to_mdsc(mdsc, cf);
if (list_empty(&ci->i_flushing_item)) { if (list_empty(&ci->i_flushing_item)) {
ci->i_cap_flush_seq = ++mdsc->cap_flush_seq;
list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing); list_add_tail(&ci->i_flushing_item, &session->s_cap_flushing);
mdsc->num_cap_flushing++; mdsc->num_cap_flushing++;
dout(" inode %p now flushing seq %lld\n", inode, dout(" inode %p now flushing tid %llu\n", inode, cf->tid);
ci->i_cap_flush_seq);
} else { } else {
list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing); list_move_tail(&ci->i_flushing_item, &session->s_cap_flushing);
dout(" inode %p now flushing (more) seq %lld\n", inode, dout(" inode %p now flushing (more) tid %llu\n",
ci->i_cap_flush_seq); inode, cf->tid);
} }
spin_unlock(&mdsc->cap_dirty_lock); spin_unlock(&mdsc->cap_dirty_lock);
...@@ -2123,8 +2145,8 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc, ...@@ -2123,8 +2145,8 @@ static void kick_flushing_inode_caps(struct ceph_mds_client *mdsc,
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
cap = ci->i_auth_cap; cap = ci->i_auth_cap;
dout("kick_flushing_inode_caps %p flushing %s flush_seq %lld\n", inode, dout("kick_flushing_inode_caps %p flushing %s\n", inode,
ceph_cap_string(ci->i_flushing_caps), ci->i_cap_flush_seq); ceph_cap_string(ci->i_flushing_caps));
__ceph_flush_snaps(ci, &session, 1); __ceph_flush_snaps(ci, &session, 1);
...@@ -2921,12 +2943,23 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, ...@@ -2921,12 +2943,23 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps), ceph_cap_string(cleaned), ceph_cap_string(ci->i_flushing_caps),
ceph_cap_string(ci->i_flushing_caps & ~cleaned)); ceph_cap_string(ci->i_flushing_caps & ~cleaned));
if (ci->i_flushing_caps == (ci->i_flushing_caps & ~cleaned)) if (list_empty(&to_remove) && !cleaned)
goto out; goto out;
ci->i_flushing_caps &= ~cleaned; ci->i_flushing_caps &= ~cleaned;
spin_lock(&mdsc->cap_dirty_lock); spin_lock(&mdsc->cap_dirty_lock);
if (!list_empty(&to_remove)) {
list_for_each_entry(cf, &to_remove, list)
rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
n = rb_first(&mdsc->cap_flush_tree);
cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
if (!cf || cf->tid > flush_tid)
wake_up_all(&mdsc->cap_flushing_wq);
}
if (ci->i_flushing_caps == 0) { if (ci->i_flushing_caps == 0) {
list_del_init(&ci->i_flushing_item); list_del_init(&ci->i_flushing_item);
if (!list_empty(&session->s_cap_flushing)) if (!list_empty(&session->s_cap_flushing))
...@@ -2936,7 +2969,6 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid, ...@@ -2936,7 +2969,6 @@ static void handle_cap_flush_ack(struct inode *inode, u64 flush_tid,
struct ceph_inode_info, struct ceph_inode_info,
i_flushing_item)->vfs_inode); i_flushing_item)->vfs_inode);
mdsc->num_cap_flushing--; mdsc->num_cap_flushing--;
wake_up_all(&mdsc->cap_flushing_wq);
dout(" inode %p now !flushing\n", inode); dout(" inode %p now !flushing\n", inode);
if (ci->i_dirty_caps == 0) { if (ci->i_dirty_caps == 0) {
......
...@@ -416,7 +416,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb) ...@@ -416,7 +416,6 @@ struct inode *ceph_alloc_inode(struct super_block *sb)
ci->i_flushing_caps = 0; ci->i_flushing_caps = 0;
INIT_LIST_HEAD(&ci->i_dirty_item); INIT_LIST_HEAD(&ci->i_dirty_item);
INIT_LIST_HEAD(&ci->i_flushing_item); INIT_LIST_HEAD(&ci->i_flushing_item);
ci->i_cap_flush_seq = 0;
ci->i_cap_flush_tree = RB_ROOT; ci->i_cap_flush_tree = RB_ROOT;
init_waitqueue_head(&ci->i_cap_wq); init_waitqueue_head(&ci->i_cap_wq);
ci->i_hold_caps_min = 0; ci->i_hold_caps_min = 0;
......
...@@ -1164,6 +1164,10 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap, ...@@ -1164,6 +1164,10 @@ static int remove_session_caps_cb(struct inode *inode, struct ceph_cap *cap,
} }
spin_lock(&mdsc->cap_dirty_lock); spin_lock(&mdsc->cap_dirty_lock);
list_for_each_entry(cf, &to_remove, list)
rb_erase(&cf->g_node, &mdsc->cap_flush_tree);
if (!list_empty(&ci->i_dirty_item)) { if (!list_empty(&ci->i_dirty_item)) {
pr_warn_ratelimited( pr_warn_ratelimited(
" dropping dirty %s state for %p %lld\n", " dropping dirty %s state for %p %lld\n",
...@@ -1467,39 +1471,56 @@ static int trim_caps(struct ceph_mds_client *mdsc, ...@@ -1467,39 +1471,56 @@ static int trim_caps(struct ceph_mds_client *mdsc,
return 0; return 0;
} }
static int check_cap_flush(struct ceph_inode_info *ci, static int check_capsnap_flush(struct ceph_inode_info *ci,
u64 want_flush_seq, u64 want_snap_seq) u64 want_snap_seq)
{ {
int ret1 = 1, ret2 = 1; int ret = 1;
spin_lock(&ci->i_ceph_lock); spin_lock(&ci->i_ceph_lock);
if (want_flush_seq > 0 && ci->i_flushing_caps)
ret1 = ci->i_cap_flush_seq >= want_flush_seq;
if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) { if (want_snap_seq > 0 && !list_empty(&ci->i_cap_snaps)) {
struct ceph_cap_snap *capsnap = struct ceph_cap_snap *capsnap =
list_first_entry(&ci->i_cap_snaps, list_first_entry(&ci->i_cap_snaps,
struct ceph_cap_snap, ci_item); struct ceph_cap_snap, ci_item);
ret2 = capsnap->follows >= want_snap_seq; ret = capsnap->follows >= want_snap_seq;
} }
spin_unlock(&ci->i_ceph_lock); spin_unlock(&ci->i_ceph_lock);
return ret1 && ret2; return ret;
}
static int check_caps_flush(struct ceph_mds_client *mdsc,
u64 want_flush_tid)
{
struct rb_node *n;
struct ceph_cap_flush *cf;
int ret = 1;
spin_lock(&mdsc->cap_dirty_lock);
n = rb_first(&mdsc->cap_flush_tree);
cf = n ? rb_entry(n, struct ceph_cap_flush, g_node) : NULL;
if (cf && cf->tid <= want_flush_tid) {
dout("check_caps_flush still flushing tid %llu <= %llu\n",
cf->tid, want_flush_tid);
ret = 0;
}
spin_unlock(&mdsc->cap_dirty_lock);
return ret;
} }
/* /*
* flush all dirty inode data to disk. * flush all dirty inode data to disk.
* *
* returns true if we've flushed through want_flush_seq * returns true if we've flushed through want_flush_tid
*/ */
static void wait_caps_flush(struct ceph_mds_client *mdsc, static void wait_caps_flush(struct ceph_mds_client *mdsc,
u64 want_flush_seq, u64 want_snap_seq) u64 want_flush_tid, u64 want_snap_seq)
{ {
int mds; int mds;
dout("check_cap_flush want %lld\n", want_flush_seq); dout("check_caps_flush want %llu snap want %llu\n",
want_flush_tid, want_snap_seq);
mutex_lock(&mdsc->mutex); mutex_lock(&mdsc->mutex);
for (mds = 0; mds < mdsc->max_sessions; ) { for (mds = 0; mds < mdsc->max_sessions; ) {
struct ceph_mds_session *session = mdsc->sessions[mds]; struct ceph_mds_session *session = mdsc->sessions[mds];
struct inode *inode1 = NULL, *inode2 = NULL; struct inode *inode = NULL;
if (!session) { if (!session) {
mds++; mds++;
...@@ -1509,58 +1530,40 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc, ...@@ -1509,58 +1530,40 @@ static void wait_caps_flush(struct ceph_mds_client *mdsc,
mutex_unlock(&mdsc->mutex); mutex_unlock(&mdsc->mutex);
mutex_lock(&session->s_mutex); mutex_lock(&session->s_mutex);
if (!list_empty(&session->s_cap_flushing)) {
struct ceph_inode_info *ci =
list_first_entry(&session->s_cap_flushing,
struct ceph_inode_info,
i_flushing_item);
if (!check_cap_flush(ci, want_flush_seq, 0)) {
dout("check_cap_flush still flushing %p "
"seq %lld <= %lld to mds%d\n",
&ci->vfs_inode, ci->i_cap_flush_seq,
want_flush_seq, mds);
inode1 = igrab(&ci->vfs_inode);
}
}
if (!list_empty(&session->s_cap_snaps_flushing)) { if (!list_empty(&session->s_cap_snaps_flushing)) {
struct ceph_cap_snap *capsnap = struct ceph_cap_snap *capsnap =
list_first_entry(&session->s_cap_snaps_flushing, list_first_entry(&session->s_cap_snaps_flushing,
struct ceph_cap_snap, struct ceph_cap_snap,
flushing_item); flushing_item);
struct ceph_inode_info *ci = capsnap->ci; struct ceph_inode_info *ci = capsnap->ci;
if (!check_cap_flush(ci, 0, want_snap_seq)) { if (!check_capsnap_flush(ci, want_snap_seq)) {
dout("check_cap_flush still flushing snap %p " dout("check_cap_flush still flushing snap %p "
"follows %lld <= %lld to mds%d\n", "follows %lld <= %lld to mds%d\n",
&ci->vfs_inode, capsnap->follows, &ci->vfs_inode, capsnap->follows,
want_snap_seq, mds); want_snap_seq, mds);
inode2 = igrab(&ci->vfs_inode); inode = igrab(&ci->vfs_inode);
} }
} }
mutex_unlock(&session->s_mutex); mutex_unlock(&session->s_mutex);
ceph_put_mds_session(session); ceph_put_mds_session(session);
if (inode1) { if (inode) {
wait_event(mdsc->cap_flushing_wq,
check_cap_flush(ceph_inode(inode1),
want_flush_seq, 0));
iput(inode1);
}
if (inode2) {
wait_event(mdsc->cap_flushing_wq, wait_event(mdsc->cap_flushing_wq,
check_cap_flush(ceph_inode(inode2), check_capsnap_flush(ceph_inode(inode),
0, want_snap_seq)); want_snap_seq));
iput(inode2); iput(inode);
} } else {
if (!inode1 && !inode2)
mds++; mds++;
}
mutex_lock(&mdsc->mutex); mutex_lock(&mdsc->mutex);
} }
mutex_unlock(&mdsc->mutex); mutex_unlock(&mdsc->mutex);
dout("check_cap_flush ok, flushed thru %lld\n", want_flush_seq);
wait_event(mdsc->cap_flushing_wq,
check_caps_flush(mdsc, want_flush_tid));
dout("check_caps_flush ok, flushed thru %llu\n", want_flush_tid);
} }
/* /*
...@@ -3426,8 +3429,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc) ...@@ -3426,8 +3429,8 @@ int ceph_mdsc_init(struct ceph_fs_client *fsc)
spin_lock_init(&mdsc->cap_delay_lock); spin_lock_init(&mdsc->cap_delay_lock);
INIT_LIST_HEAD(&mdsc->snap_flush_list); INIT_LIST_HEAD(&mdsc->snap_flush_list);
spin_lock_init(&mdsc->snap_flush_lock); spin_lock_init(&mdsc->snap_flush_lock);
mdsc->cap_flush_seq = 0;
mdsc->last_cap_flush_tid = 1; mdsc->last_cap_flush_tid = 1;
mdsc->cap_flush_tree = RB_ROOT;
INIT_LIST_HEAD(&mdsc->cap_dirty); INIT_LIST_HEAD(&mdsc->cap_dirty);
INIT_LIST_HEAD(&mdsc->cap_dirty_migrating); INIT_LIST_HEAD(&mdsc->cap_dirty_migrating);
mdsc->num_cap_flushing = 0; mdsc->num_cap_flushing = 0;
...@@ -3554,7 +3557,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc) ...@@ -3554,7 +3557,7 @@ void ceph_mdsc_sync(struct ceph_mds_client *mdsc)
ceph_flush_dirty_caps(mdsc); ceph_flush_dirty_caps(mdsc);
spin_lock(&mdsc->cap_dirty_lock); spin_lock(&mdsc->cap_dirty_lock);
want_flush = mdsc->cap_flush_seq; want_flush = mdsc->last_cap_flush_tid;
spin_unlock(&mdsc->cap_dirty_lock); spin_unlock(&mdsc->cap_dirty_lock);
down_read(&mdsc->snap_rwsem); down_read(&mdsc->snap_rwsem);
......
...@@ -306,8 +306,8 @@ struct ceph_mds_client { ...@@ -306,8 +306,8 @@ struct ceph_mds_client {
struct list_head snap_flush_list; /* cap_snaps ready to flush */ struct list_head snap_flush_list; /* cap_snaps ready to flush */
spinlock_t snap_flush_lock; spinlock_t snap_flush_lock;
u64 cap_flush_seq;
u64 last_cap_flush_tid; u64 last_cap_flush_tid;
struct rb_root cap_flush_tree;
struct list_head cap_dirty; /* inodes with dirty caps */ struct list_head cap_dirty; /* inodes with dirty caps */
struct list_head cap_dirty_migrating; /* ...that are migration... */ struct list_head cap_dirty_migrating; /* ...that are migration... */
int num_cap_flushing; /* # caps we are flushing */ int num_cap_flushing; /* # caps we are flushing */
......
...@@ -189,6 +189,7 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap) ...@@ -189,6 +189,7 @@ static inline void ceph_put_cap_snap(struct ceph_cap_snap *capsnap)
struct ceph_cap_flush { struct ceph_cap_flush {
u64 tid; u64 tid;
int caps; int caps;
struct rb_node g_node;
union { union {
struct rb_node i_node; struct rb_node i_node;
struct list_head list; struct list_head list;
...@@ -304,7 +305,6 @@ struct ceph_inode_info { ...@@ -304,7 +305,6 @@ struct ceph_inode_info {
struct ceph_cap *i_auth_cap; /* authoritative cap, if any */ struct ceph_cap *i_auth_cap; /* authoritative cap, if any */
unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */ unsigned i_dirty_caps, i_flushing_caps; /* mask of dirtied fields */
struct list_head i_dirty_item, i_flushing_item; struct list_head i_dirty_item, i_flushing_item;
u64 i_cap_flush_seq;
/* we need to track cap writeback on a per-cap-bit basis, to allow /* we need to track cap writeback on a per-cap-bit basis, to allow
* overlapping, pipelined cap flushes to the mds. we can probably * overlapping, pipelined cap flushes to the mds. we can probably
* reduce the tid to 8 bits if we're concerned about inode size. */ * reduce the tid to 8 bits if we're concerned about inode size. */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment