Commit 7be8da07 authored by Andreas Gruenbacher's avatar Andreas Gruenbacher Committed by Philipp Reisner

drbd: Improve how conflicting writes are handled

The previous algorithm for dealing with overlapping concurrent writes
was generating unnecessary warnings for scenarios which could be
legitimate, and did not always handle partially overlapping requests
correctly.  Improve it algorithm as follows:

* While local or remote write requests are in progress, conflicting new
  local write requests will be delayed (commit 82172f7).

* When a conflict between a local and remote write request is detected,
  the node with the discard flag decides how to resolve the conflict: It
  will ask its peer to discard conflicting requests which are fully
  contained in the local request and retry requests which overlap only
  partially.  This involves a protocol change.
Signed-off-by: default avatarPhilipp Reisner <philipp.reisner@linbit.com>
Signed-off-by: default avatarLars Ellenberg <lars.ellenberg@linbit.com>
parent 71b1c1eb
......@@ -200,7 +200,7 @@ enum drbd_packet {
P_RECV_ACK = 0x15, /* Used in protocol B */
P_WRITE_ACK = 0x16, /* Used in protocol C */
P_RS_WRITE_ACK = 0x17, /* Is a P_WRITE_ACK, additionally call set_in_sync(). */
P_DISCARD_ACK = 0x18, /* Used in proto C, two-primaries conflict detection */
P_DISCARD_WRITE = 0x18, /* Used in proto C, two-primaries conflict detection */
P_NEG_ACK = 0x19, /* Sent if local disk is unusable */
P_NEG_DREPLY = 0x1a, /* Local disk is broken... */
P_NEG_RS_DREPLY = 0x1b, /* Local disk is broken... */
......@@ -223,8 +223,9 @@ enum drbd_packet {
P_RS_CANCEL = 0x29, /* meta: Used to cancel RS_DATA_REQUEST packet by SyncSource */
P_CONN_ST_CHG_REQ = 0x2a, /* data sock: Connection wide state request */
P_CONN_ST_CHG_REPLY = 0x2b, /* meta sock: Connection side state req reply */
P_RETRY_WRITE = 0x2c, /* Protocol C: retry conflicting write request */
P_MAX_CMD = 0x2c,
P_MAX_CMD = 0x2d,
P_MAY_IGNORE = 0x100, /* Flag to test if (cmd > P_MAY_IGNORE) ... */
P_MAX_OPT_CMD = 0x101,
......@@ -350,7 +351,7 @@ struct p_data {
* commands which share a struct:
* p_block_ack:
* P_RECV_ACK (proto B), P_WRITE_ACK (proto C),
* P_DISCARD_ACK (proto C, two-primaries conflict detection)
* P_DISCARD_WRITE (proto C, two-primaries conflict detection)
* p_block_req:
* P_DATA_REQUEST, P_RS_DATA_REQUEST
*/
......@@ -362,7 +363,6 @@ struct p_block_ack {
u32 seq_num;
} __packed;
struct p_block_req {
struct p_header head;
u64 sector;
......@@ -655,6 +655,8 @@ struct drbd_work {
#include "drbd_interval.h"
extern int drbd_wait_misc(struct drbd_conf *, struct drbd_interval *);
struct drbd_request {
struct drbd_work w;
......@@ -752,12 +754,16 @@ enum {
/* This ee has a pointer to a digest instead of a block id */
__EE_HAS_DIGEST,
/* Conflicting local requests need to be restarted after this request */
__EE_RESTART_REQUESTS,
};
#define EE_CALL_AL_COMPLETE_IO (1<<__EE_CALL_AL_COMPLETE_IO)
#define EE_MAY_SET_IN_SYNC (1<<__EE_MAY_SET_IN_SYNC)
#define EE_RESUBMITTED (1<<__EE_RESUBMITTED)
#define EE_WAS_ERROR (1<<__EE_WAS_ERROR)
#define EE_HAS_DIGEST (1<<__EE_HAS_DIGEST)
#define EE_RESTART_REQUESTS (1<<__EE_RESTART_REQUESTS)
/* flag bits per mdev */
enum {
......@@ -1478,6 +1484,7 @@ extern void drbd_free_tconn(struct drbd_tconn *tconn);
extern int proc_details;
/* drbd_req */
extern int __drbd_make_request(struct drbd_conf *, struct bio *, unsigned long);
extern int drbd_make_request(struct request_queue *q, struct bio *bio);
extern int drbd_read_remote(struct drbd_conf *mdev, struct drbd_request *req);
extern int drbd_merge_bvec(struct request_queue *q, struct bvec_merge_data *bvm, struct bio_vec *bvec);
......
......@@ -3003,7 +3003,7 @@ const char *cmdname(enum drbd_packet cmd)
[P_RECV_ACK] = "RecvAck",
[P_WRITE_ACK] = "WriteAck",
[P_RS_WRITE_ACK] = "RSWriteAck",
[P_DISCARD_ACK] = "DiscardAck",
[P_DISCARD_WRITE] = "DiscardWrite",
[P_NEG_ACK] = "NegAck",
[P_NEG_DREPLY] = "NegDReply",
[P_NEG_RS_DREPLY] = "NegRSDReply",
......@@ -3018,6 +3018,7 @@ const char *cmdname(enum drbd_packet cmd)
[P_COMPRESSED_BITMAP] = "CBitmap",
[P_DELAY_PROBE] = "DelayProbe",
[P_OUT_OF_SYNC] = "OutOfSync",
[P_RETRY_WRITE] = "RetryWrite",
[P_MAX_CMD] = NULL,
};
......@@ -3032,6 +3033,38 @@ const char *cmdname(enum drbd_packet cmd)
return cmdnames[cmd];
}
/**
* drbd_wait_misc - wait for a request to make progress
* @mdev: device associated with the request
* @i: the struct drbd_interval embedded in struct drbd_request or
* struct drbd_peer_request
*/
int drbd_wait_misc(struct drbd_conf *mdev, struct drbd_interval *i)
{
struct net_conf *net_conf = mdev->tconn->net_conf;
DEFINE_WAIT(wait);
long timeout;
if (!net_conf)
return -ETIMEDOUT;
timeout = MAX_SCHEDULE_TIMEOUT;
if (net_conf->ko_count)
timeout = net_conf->timeout * HZ / 10 * net_conf->ko_count;
/* Indicate to wake up mdev->misc_wait on progress. */
i->waiting = true;
prepare_to_wait(&mdev->misc_wait, &wait, TASK_INTERRUPTIBLE);
spin_unlock_irq(&mdev->tconn->req_lock);
timeout = schedule_timeout(timeout);
finish_wait(&mdev->misc_wait, &wait);
spin_lock_irq(&mdev->tconn->req_lock);
if (!timeout || mdev->state.conn < C_CONNECTED)
return -ETIMEDOUT;
if (signal_pending(current))
return -ERESTARTSYS;
return 0;
}
#ifdef CONFIG_DRBD_FAULT_INJECTION
/* Fault insertion support including random number generator shamelessly
* stolen from kernel/rcutorture.c */
......
This diff is collapsed.
......@@ -225,12 +225,16 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
* the receiver,
* the bio_endio completion callbacks.
*/
if (s & RQ_LOCAL_PENDING)
return;
if (req->i.waiting) {
/* Retry all conflicting peer requests. */
wake_up(&mdev->misc_wait);
}
if (s & RQ_NET_QUEUED)
return;
if (s & RQ_NET_PENDING)
return;
if (s & RQ_LOCAL_PENDING)
return;
if (req->master_bio) {
/* this is DATA_RECEIVED (remote read)
......@@ -267,7 +271,7 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
else
root = &mdev->read_requests;
drbd_remove_request_interval(root, req);
} else
} else if (!(s & RQ_POSTPONED))
D_ASSERT((s & (RQ_NET_MASK & ~RQ_NET_DONE)) == 0);
/* for writes we need to do some extra housekeeping */
......@@ -277,8 +281,10 @@ void _req_may_be_done(struct drbd_request *req, struct bio_and_error *m)
/* Update disk stats */
_drbd_end_io_acct(mdev, req);
if (!(s & RQ_POSTPONED)) {
m->error = ok ? 0 : (error ?: -EIO);
m->bio = req->master_bio;
}
req->master_bio = NULL;
}
......@@ -318,6 +324,8 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
{
struct drbd_conf *mdev = req->w.mdev;
int rv = 0;
if (m)
m->bio = NULL;
switch (what) {
......@@ -332,7 +340,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
*/
case TO_BE_SENT: /* via network */
/* reached via drbd_make_request_common
/* reached via __drbd_make_request
* and from w_read_retry_remote */
D_ASSERT(!(req->rq_state & RQ_NET_MASK));
req->rq_state |= RQ_NET_PENDING;
......@@ -340,7 +348,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
break;
case TO_BE_SUBMITTED: /* locally */
/* reached via drbd_make_request_common */
/* reached via __drbd_make_request */
D_ASSERT(!(req->rq_state & RQ_LOCAL_MASK));
req->rq_state |= RQ_LOCAL_PENDING;
break;
......@@ -403,7 +411,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
* no local disk,
* or target area marked as invalid,
* or just got an io-error. */
/* from drbd_make_request_common
/* from __drbd_make_request
* or from bio_endio during read io-error recovery */
/* so we can verify the handle in the answer packet
......@@ -422,7 +430,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
case QUEUE_FOR_NET_WRITE:
/* assert something? */
/* from drbd_make_request_common only */
/* from __drbd_make_request only */
/* corresponding hlist_del is in _req_may_be_done() */
drbd_insert_interval(&mdev->write_requests, &req->i);
......@@ -436,7 +444,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
*
* _req_add_to_epoch(req); this has to be after the
* _maybe_start_new_epoch(req); which happened in
* drbd_make_request_common, because we now may set the bit
* __drbd_make_request, because we now may set the bit
* again ourselves to close the current epoch.
*
* Add req to the (now) current epoch (barrier). */
......@@ -446,7 +454,7 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
* hurting performance. */
set_bit(UNPLUG_REMOTE, &mdev->flags);
/* see drbd_make_request_common,
/* see __drbd_make_request,
* just after it grabs the req_lock */
D_ASSERT(test_bit(CREATE_BARRIER, &mdev->flags) == 0);
......@@ -535,14 +543,10 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
case WRITE_ACKED_BY_PEER_AND_SIS:
req->rq_state |= RQ_NET_SIS;
case CONFLICT_DISCARDED_BY_PEER:
case DISCARD_WRITE:
/* for discarded conflicting writes of multiple primaries,
* there is no need to keep anything in the tl, potential
* node crashes are covered by the activity log. */
if (what == CONFLICT_DISCARDED_BY_PEER)
dev_alert(DEV, "Got DiscardAck packet %llus +%u!"
" DRBD is not a random data generator!\n",
(unsigned long long)req->i.sector, req->i.size);
req->rq_state |= RQ_NET_DONE;
/* fall through */
case WRITE_ACKED_BY_PEER:
......@@ -569,6 +573,17 @@ int __req_mod(struct drbd_request *req, enum drbd_req_event what,
_req_may_be_done_not_susp(req, m);
break;
case POSTPONE_WRITE:
/*
* If this node has already detected the write conflict, the
* worker will be waiting on misc_wait. Wake it up once this
* request has completed locally.
*/
D_ASSERT(req->rq_state & RQ_NET_PENDING);
req->rq_state |= RQ_POSTPONED;
_req_may_be_done_not_susp(req, m);
break;
case NEG_ACKED:
/* assert something? */
if (req->rq_state & RQ_NET_PENDING) {
......@@ -688,24 +703,19 @@ static int complete_conflicting_writes(struct drbd_conf *mdev,
sector_t sector, int size)
{
for(;;) {
DEFINE_WAIT(wait);
struct drbd_interval *i;
int err;
i = drbd_find_overlap(&mdev->write_requests, sector, size);
if (!i)
return 0;
i->waiting = true;
prepare_to_wait(&mdev->misc_wait, &wait, TASK_INTERRUPTIBLE);
spin_unlock_irq(&mdev->tconn->req_lock);
schedule();
finish_wait(&mdev->misc_wait, &wait);
spin_lock_irq(&mdev->tconn->req_lock);
if (signal_pending(current))
return -ERESTARTSYS;
err = drbd_wait_misc(mdev, i);
if (err)
return err;
}
}
static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
int __drbd_make_request(struct drbd_conf *mdev, struct bio *bio, unsigned long start_time)
{
const int rw = bio_rw(bio);
const int size = bio->bi_size;
......@@ -811,7 +821,12 @@ static int drbd_make_request_common(struct drbd_conf *mdev, struct bio *bio, uns
if (rw == WRITE) {
err = complete_conflicting_writes(mdev, sector, size);
if (err) {
if (err != -ERESTARTSYS)
_conn_request_state(mdev->tconn,
NS(conn, C_TIMEOUT),
CS_HARD);
spin_unlock_irq(&mdev->tconn->req_lock);
err = -EIO;
goto fail_free_complete;
}
}
......@@ -1031,7 +1046,7 @@ int drbd_make_request(struct request_queue *q, struct bio *bio)
if (likely(s_enr == e_enr)) {
inc_ap_bio(mdev, 1);
return drbd_make_request_common(mdev, bio, start_time);
return __drbd_make_request(mdev, bio, start_time);
}
/* can this bio be split generically?
......@@ -1069,10 +1084,10 @@ int drbd_make_request(struct request_queue *q, struct bio *bio)
D_ASSERT(e_enr == s_enr + 1);
while (drbd_make_request_common(mdev, &bp->bio1, start_time))
while (__drbd_make_request(mdev, &bp->bio1, start_time))
inc_ap_bio(mdev, 1);
while (drbd_make_request_common(mdev, &bp->bio2, start_time))
while (__drbd_make_request(mdev, &bp->bio2, start_time))
inc_ap_bio(mdev, 1);
dec_ap_bio(mdev);
......
......@@ -97,7 +97,8 @@ enum drbd_req_event {
RECV_ACKED_BY_PEER,
WRITE_ACKED_BY_PEER,
WRITE_ACKED_BY_PEER_AND_SIS, /* and set_in_sync */
CONFLICT_DISCARDED_BY_PEER,
DISCARD_WRITE,
POSTPONE_WRITE,
NEG_ACKED,
BARRIER_ACKED, /* in protocol A and B */
DATA_RECEIVED, /* (remote read) */
......@@ -194,6 +195,9 @@ enum drbd_req_state_bits {
/* Should call drbd_al_complete_io() for this request... */
__RQ_IN_ACT_LOG,
/* The peer has sent a retry ACK */
__RQ_POSTPONED,
};
#define RQ_LOCAL_PENDING (1UL << __RQ_LOCAL_PENDING)
......@@ -214,6 +218,7 @@ enum drbd_req_state_bits {
#define RQ_WRITE (1UL << __RQ_WRITE)
#define RQ_IN_ACT_LOG (1UL << __RQ_IN_ACT_LOG)
#define RQ_POSTPONED (1UL << __RQ_POSTPONED)
/* For waking up the frozen transfer log mod_req() has to return if the request
should be counted in the epoch object*/
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment