Commit 8434ffe7, authored by Jeff Layton, committed by Ilya Dryomov

ceph: take snap_empty_lock atomically with snaprealm refcount change

There is a race in ceph_put_snap_realm. The change to the nref and the
spinlock acquisition are not done atomically, so you could decrement
nref, and before you take the spinlock, the nref is incremented again.
At that point, you end up putting it on the empty list when it
shouldn't be there. Eventually __cleanup_empty_realms runs and frees
it when it's still in-use.

Fix this by protecting the 1->0 transition with atomic_dec_and_lock,
and just drop the spinlock if we can get the rwsem.

Because these objects can also undergo a 0->1 refcount transition, we
must protect that change as well with the spinlock. Increment locklessly
unless the value is at 0, in which case we take the spinlock, increment
and then take it off the empty list if it did the 0->1 transition.

With these changes, I'm removing the dout() messages from these
functions, as well as in __put_snap_realm. They've always been racy, and
it's better to not print values that may be misleading.

Cc: stable@vger.kernel.org
URL: https://tracker.ceph.com/issues/46419
Reported-by: Mark Nelson <mnelson@redhat.com>
Signed-off-by: Jeff Layton <jlayton@kernel.org>
Reviewed-by: Luis Henriques <lhenriques@suse.de>
Signed-off-by: Ilya Dryomov <idryomov@gmail.com>
parent bf2ba432
...@@ -67,19 +67,19 @@ void ceph_get_snap_realm(struct ceph_mds_client *mdsc, ...@@ -67,19 +67,19 @@ void ceph_get_snap_realm(struct ceph_mds_client *mdsc,
{ {
lockdep_assert_held(&mdsc->snap_rwsem); lockdep_assert_held(&mdsc->snap_rwsem);
dout("get_realm %p %d -> %d\n", realm,
atomic_read(&realm->nref), atomic_read(&realm->nref)+1);
/* /*
* since we _only_ increment realm refs or empty the empty * The 0->1 and 1->0 transitions must take the snap_empty_lock
* list with snap_rwsem held, adjusting the empty list here is * atomically with the refcount change. Go ahead and bump the
* safe. we do need to protect against concurrent empty list * nref here, unless it's 0, in which case we take the spinlock
* additions, however. * and then do the increment and remove it from the list.
*/ */
if (atomic_inc_return(&realm->nref) == 1) { if (atomic_inc_not_zero(&realm->nref))
return;
spin_lock(&mdsc->snap_empty_lock); spin_lock(&mdsc->snap_empty_lock);
if (atomic_inc_return(&realm->nref) == 1)
list_del_init(&realm->empty_item); list_del_init(&realm->empty_item);
spin_unlock(&mdsc->snap_empty_lock); spin_unlock(&mdsc->snap_empty_lock);
}
} }
static void __insert_snap_realm(struct rb_root *root, static void __insert_snap_realm(struct rb_root *root,
...@@ -208,28 +208,28 @@ static void __put_snap_realm(struct ceph_mds_client *mdsc, ...@@ -208,28 +208,28 @@ static void __put_snap_realm(struct ceph_mds_client *mdsc,
{ {
lockdep_assert_held_write(&mdsc->snap_rwsem); lockdep_assert_held_write(&mdsc->snap_rwsem);
dout("__put_snap_realm %llx %p %d -> %d\n", realm->ino, realm, /*
atomic_read(&realm->nref), atomic_read(&realm->nref)-1); * We do not require the snap_empty_lock here, as any caller that
* increments the value must hold the snap_rwsem.
*/
if (atomic_dec_and_test(&realm->nref)) if (atomic_dec_and_test(&realm->nref))
__destroy_snap_realm(mdsc, realm); __destroy_snap_realm(mdsc, realm);
} }
/* /*
* caller needn't hold any locks * See comments in ceph_get_snap_realm. Caller needn't hold any locks.
*/ */
void ceph_put_snap_realm(struct ceph_mds_client *mdsc, void ceph_put_snap_realm(struct ceph_mds_client *mdsc,
struct ceph_snap_realm *realm) struct ceph_snap_realm *realm)
{ {
dout("put_snap_realm %llx %p %d -> %d\n", realm->ino, realm, if (!atomic_dec_and_lock(&realm->nref, &mdsc->snap_empty_lock))
atomic_read(&realm->nref), atomic_read(&realm->nref)-1);
if (!atomic_dec_and_test(&realm->nref))
return; return;
if (down_write_trylock(&mdsc->snap_rwsem)) { if (down_write_trylock(&mdsc->snap_rwsem)) {
spin_unlock(&mdsc->snap_empty_lock);
__destroy_snap_realm(mdsc, realm); __destroy_snap_realm(mdsc, realm);
up_write(&mdsc->snap_rwsem); up_write(&mdsc->snap_rwsem);
} else { } else {
spin_lock(&mdsc->snap_empty_lock);
list_add(&realm->empty_item, &mdsc->snap_empty); list_add(&realm->empty_item, &mdsc->snap_empty);
spin_unlock(&mdsc->snap_empty_lock); spin_unlock(&mdsc->snap_empty_lock);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment