Commit af85e8e8 authored by Lars Ellenberg's avatar Lars Ellenberg Committed by Philipp Reisner

drbd: fix for spurious fullsync (uuids rotated too fast)

If it was an "empty" resync, the SyncSource may have already "finished"
the resync and rotated the UUIDs, before noticing the connection loss
(and generating a new uuid, if Primary, rotating again), while the
SyncTarget did not change its uuids at all, or only got to the previous
sync-uuid.
This would then again lead to a full sync on next handshake
(see also Bug #251).

Fix:
Use explicit resync finished notification even for empty resyncs,
do not finish an empty resync implicitly on the SyncSource.
Signed-off-by: default avatarPhilipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: default avatarLars Ellenberg <lars.ellenberg@linbit.com>
parent e9ef7bb6
......@@ -1426,6 +1426,11 @@ static void after_state_ch(struct drbd_conf *mdev, union drbd_state os,
(os.user_isp && !ns.user_isp))
resume_next_sg(mdev);
/* sync target done with resync. Explicitly notify peer, even though
* it should (at least for non-empty resyncs) already know itself. */
if (os.disk < D_UP_TO_DATE && os.conn >= C_SYNC_SOURCE && ns.conn == C_CONNECTED)
drbd_send_state(mdev);
/* free tl_hash if we Got thawed and are C_STANDALONE */
if (ns.conn == C_STANDALONE && !is_susp(ns) && mdev->tl_hash)
drbd_free_tl_hash(mdev);
......
......@@ -522,6 +522,12 @@ int w_make_resync_request(struct drbd_conf *mdev,
dev_err(DEV, "%s in w_make_resync_request\n",
drbd_conn_str(mdev->state.conn));
if (mdev->rs_total == 0) {
/* empty resync? */
drbd_resync_finished(mdev);
return 1;
}
if (!get_ldev(mdev)) {
/* Since we only need to access mdev->rsync a
get_ldev_if_state(mdev,D_FAILED) would be sufficient, but
......@@ -768,6 +774,14 @@ static int w_resync_finished(struct drbd_conf *mdev, struct drbd_work *w, int ca
return 1;
}
static void ping_peer(struct drbd_conf *mdev)
{
clear_bit(GOT_PING_ACK, &mdev->flags);
request_ping(mdev);
wait_event(mdev->misc_wait,
test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
}
int drbd_resync_finished(struct drbd_conf *mdev)
{
unsigned long db, dt, dbdt;
......@@ -807,6 +821,8 @@ int drbd_resync_finished(struct drbd_conf *mdev)
if (!get_ldev(mdev))
goto out;
ping_peer(mdev);
spin_lock_irq(&mdev->req_lock);
os = mdev->state;
......@@ -1420,14 +1436,6 @@ int drbd_alter_sa(struct drbd_conf *mdev, int na)
return retcode;
}
static void ping_peer(struct drbd_conf *mdev)
{
clear_bit(GOT_PING_ACK, &mdev->flags);
request_ping(mdev);
wait_event(mdev->misc_wait,
test_bit(GOT_PING_ACK, &mdev->flags) || mdev->state.conn < C_CONNECTED);
}
/**
* drbd_start_resync() - Start the resync process
* @mdev: DRBD device.
......@@ -1527,9 +1535,21 @@ void drbd_start_resync(struct drbd_conf *mdev, enum drbd_conns side)
(unsigned long) mdev->rs_total << (BM_BLOCK_SHIFT-10),
(unsigned long) mdev->rs_total);
if (mdev->rs_total == 0) {
/* Peer still reachable? Beware of failing before-resync-target handlers! */
ping_peer(mdev);
if (mdev->agreed_pro_version < 95 && mdev->rs_total == 0) {
/* This still has a race (about when exactly the peers
* detect connection loss) that can lead to a full sync
* on next handshake. In 8.3.9 we fixed this with explicit
* resync-finished notifications, but the fix
* introduces a protocol change. Sleeping for some
* time longer than the ping interval + timeout on the
* SyncSource, to give the SyncTarget the chance to
* detect connection loss, then waiting for a ping
* response (implicit in drbd_resync_finished) reduces
* the race considerably, but does not solve it. */
if (side == C_SYNC_SOURCE)
schedule_timeout_interruptible(
mdev->net_conf->ping_int * HZ +
mdev->net_conf->ping_timeo*HZ/9);
drbd_resync_finished(mdev);
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment