Commit 542a39ac authored by Josh Durgin, committed by Greg Kroah-Hartman

libceph: resend all writes after the osdmap loses the full flag

commit 9a1ea2db upstream.

With the current full handling, there is a race between osds and
clients getting the first map marked full. If the osd wins, it will
return -ENOSPC to any writes, but the client may already have writes
in flight. This results in the client getting the error and
propagating it up the stack. For rbd, the block layer turns this into
EIO, which can cause corruption in filesystems above it.

To avoid this race, osds are being changed to drop writes that came
from clients with an osdmap older than the last osdmap marked full.
In order for this to work, clients must resend all writes after they
encounter a full -> not full transition in the osdmap. osds will wait
for an updated map instead of processing a request from a client with
a newer map, so resent writes will not be dropped by the osd unless
there is another not full -> full transition.

This approach requires both osds and clients to be fixed to avoid the
race. Old clients talking to osds with this fix may hang instead of
returning EIO and potentially corrupting an fs. New clients talking to
old osds have the same behavior as before if they encounter this race.

Fixes: http://tracker.ceph.com/issues/6938
Reviewed-by: Sage Weil <sage@inktank.com>
Signed-off-by: Josh Durgin <josh.durgin@inktank.com>
Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
parent 40dea3bd
...@@ -1269,14 +1269,17 @@ static void reset_changed_osds(struct ceph_osd_client *osdc) ...@@ -1269,14 +1269,17 @@ static void reset_changed_osds(struct ceph_osd_client *osdc)
* *
* Caller should hold map_sem for read. * Caller should hold map_sem for read.
*/ */
static void kick_requests(struct ceph_osd_client *osdc, int force_resend) static void kick_requests(struct ceph_osd_client *osdc, bool force_resend,
bool force_resend_writes)
{ {
struct ceph_osd_request *req, *nreq; struct ceph_osd_request *req, *nreq;
struct rb_node *p; struct rb_node *p;
int needmap = 0; int needmap = 0;
int err; int err;
bool force_resend_req;
dout("kick_requests %s\n", force_resend ? " (force resend)" : ""); dout("kick_requests %s %s\n", force_resend ? " (force resend)" : "",
force_resend_writes ? " (force resend writes)" : "");
mutex_lock(&osdc->request_mutex); mutex_lock(&osdc->request_mutex);
for (p = rb_first(&osdc->requests); p; ) { for (p = rb_first(&osdc->requests); p; ) {
req = rb_entry(p, struct ceph_osd_request, r_node); req = rb_entry(p, struct ceph_osd_request, r_node);
...@@ -1299,7 +1302,10 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) ...@@ -1299,7 +1302,10 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
continue; continue;
} }
err = __map_request(osdc, req, force_resend); force_resend_req = force_resend ||
(force_resend_writes &&
req->r_flags & CEPH_OSD_FLAG_WRITE);
err = __map_request(osdc, req, force_resend_req);
if (err < 0) if (err < 0)
continue; /* error */ continue; /* error */
if (req->r_osd == NULL) { if (req->r_osd == NULL) {
...@@ -1319,7 +1325,8 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend) ...@@ -1319,7 +1325,8 @@ static void kick_requests(struct ceph_osd_client *osdc, int force_resend)
r_linger_item) { r_linger_item) {
dout("linger req=%p req->r_osd=%p\n", req, req->r_osd); dout("linger req=%p req->r_osd=%p\n", req, req->r_osd);
err = __map_request(osdc, req, force_resend); err = __map_request(osdc, req,
force_resend || force_resend_writes);
dout("__map_request returned %d\n", err); dout("__map_request returned %d\n", err);
if (err == 0) if (err == 0)
continue; /* no change and no osd was specified */ continue; /* no change and no osd was specified */
...@@ -1361,6 +1368,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) ...@@ -1361,6 +1368,7 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
struct ceph_osdmap *newmap = NULL, *oldmap; struct ceph_osdmap *newmap = NULL, *oldmap;
int err; int err;
struct ceph_fsid fsid; struct ceph_fsid fsid;
bool was_full;
dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0); dout("handle_map have %u\n", osdc->osdmap ? osdc->osdmap->epoch : 0);
p = msg->front.iov_base; p = msg->front.iov_base;
...@@ -1374,6 +1382,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) ...@@ -1374,6 +1382,8 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
down_write(&osdc->map_sem); down_write(&osdc->map_sem);
was_full = ceph_osdmap_flag(osdc->osdmap, CEPH_OSDMAP_FULL);
/* incremental maps */ /* incremental maps */
ceph_decode_32_safe(&p, end, nr_maps, bad); ceph_decode_32_safe(&p, end, nr_maps, bad);
dout(" %d inc maps\n", nr_maps); dout(" %d inc maps\n", nr_maps);
...@@ -1398,7 +1408,10 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) ...@@ -1398,7 +1408,10 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
ceph_osdmap_destroy(osdc->osdmap); ceph_osdmap_destroy(osdc->osdmap);
osdc->osdmap = newmap; osdc->osdmap = newmap;
} }
kick_requests(osdc, 0); was_full = was_full ||
ceph_osdmap_flag(osdc->osdmap,
CEPH_OSDMAP_FULL);
kick_requests(osdc, 0, was_full);
} else { } else {
dout("ignoring incremental map %u len %d\n", dout("ignoring incremental map %u len %d\n",
epoch, maplen); epoch, maplen);
...@@ -1441,7 +1454,10 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg) ...@@ -1441,7 +1454,10 @@ void ceph_osdc_handle_map(struct ceph_osd_client *osdc, struct ceph_msg *msg)
skipped_map = 1; skipped_map = 1;
ceph_osdmap_destroy(oldmap); ceph_osdmap_destroy(oldmap);
} }
kick_requests(osdc, skipped_map); was_full = was_full ||
ceph_osdmap_flag(osdc->osdmap,
CEPH_OSDMAP_FULL);
kick_requests(osdc, skipped_map, was_full);
} }
p += maplen; p += maplen;
nr_maps--; nr_maps--;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment