Commit 27d20fdd authored by Nick Piggin, committed by Linus Torvalds

radix-tree: fix RCU bug

Salman Qazi describes the following radix-tree bug:

In the following case, we can get a deadlock:

0.  The radix tree contains two items, one has the index 0.
1.  The reader (in this case find_get_pages) takes the rcu_read_lock.
2.  The reader acquires slot(s) for item(s) including the index 0 item.
3.  The non-zero index item is deleted, and as a consequence the other item is
    moved to the root of the tree. The place where it used to be is queued for
    deletion after the readers finish.
3b. The zero item is deleted, removing it from the direct slot; it remains in
    the rcu-delayed indirect node.
4.  The reader looks at the index 0 slot, and finds that the page has 0 ref
    count.
5.  The reader looks at it again, hoping that the item will either be freed or
    the ref count will increase. This never happens, as the slot it is looking
    at will never be updated. Also, this slot can never be reclaimed because
    the reader is holding rcu_read_lock and is in an infinite loop.

The fix is to generalize the existing "indirect" pointer case, which already
requires a slot lookup retry, into a general "retry the lookup" bit.
Signed-off-by: Nick Piggin <npiggin@kernel.dk>
Reported-by: Salman Qazi <sqazi@google.com>
Cc: <stable@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent eaf06b24
...@@ -34,19 +34,13 @@ ...@@ -34,19 +34,13 @@
* needed for RCU lookups (because root->height is unreliable). The only * needed for RCU lookups (because root->height is unreliable). The only
* time callers need worry about this is when doing a lookup_slot under * time callers need worry about this is when doing a lookup_slot under
* RCU. * RCU.
*
* Indirect pointer in fact is also used to tag the last pointer of a node
* when it is shrunk, before we rcu free the node. See shrink code for
* details.
*/ */
#define RADIX_TREE_INDIRECT_PTR 1 #define RADIX_TREE_INDIRECT_PTR 1
#define RADIX_TREE_RETRY ((void *)-1UL)
/* Tag @ptr as indirect by setting the RADIX_TREE_INDIRECT_PTR bit in it. */
static inline void *radix_tree_ptr_to_indirect(void *ptr)
{
	unsigned long tagged = (unsigned long)ptr;

	tagged |= RADIX_TREE_INDIRECT_PTR;
	return (void *)tagged;
}
/* Strip the RADIX_TREE_INDIRECT_PTR bit from @ptr and return the result. */
static inline void *radix_tree_indirect_to_ptr(void *ptr)
{
	unsigned long untagged = (unsigned long)ptr;

	untagged &= ~RADIX_TREE_INDIRECT_PTR;
	return (void *)untagged;
}
#define radix_tree_indirect_to_ptr(ptr) \ #define radix_tree_indirect_to_ptr(ptr) \
radix_tree_indirect_to_ptr((void __force *)(ptr)) radix_tree_indirect_to_ptr((void __force *)(ptr))
...@@ -140,16 +134,29 @@ do { \ ...@@ -140,16 +134,29 @@ do { \
* removed. * removed.
* *
* For use with radix_tree_lookup_slot(). Caller must hold tree at least read * For use with radix_tree_lookup_slot(). Caller must hold tree at least read
* locked across slot lookup and dereference. More likely, will be used with * locked across slot lookup and dereference. Not required if write lock is
* radix_tree_replace_slot(), as well, so caller will hold tree write locked. * held (ie. items cannot be concurrently inserted).
*
* radix_tree_deref_retry must be used to confirm validity of the pointer if
* only the read lock is held.
*/ */
static inline void *radix_tree_deref_slot(void **pslot) static inline void *radix_tree_deref_slot(void **pslot)
{ {
void *ret = rcu_dereference(*pslot); return rcu_dereference(*pslot);
if (unlikely(radix_tree_is_indirect_ptr(ret)))
ret = RADIX_TREE_RETRY;
return ret;
} }
/**
 * radix_tree_deref_retry - test whether a dereferenced slot value needs a retry
 * @arg:	pointer returned by radix_tree_deref_slot
 * Returns:	0 if retry is not required, otherwise retry is required
 *
 * The value is treated as "retry" when its RADIX_TREE_INDIRECT_PTR bit is
 * set. radix_tree_deref_retry must be used with radix_tree_deref_slot.
 */
static inline int radix_tree_deref_retry(void *arg)
{
	unsigned long bits = (unsigned long)arg;

	return unlikely(bits & RADIX_TREE_INDIRECT_PTR);
}
/** /**
* radix_tree_replace_slot - replace item in a slot * radix_tree_replace_slot - replace item in a slot
* @pslot: pointer to slot, returned by radix_tree_lookup_slot * @pslot: pointer to slot, returned by radix_tree_lookup_slot
......
...@@ -82,6 +82,16 @@ struct radix_tree_preload { ...@@ -82,6 +82,16 @@ struct radix_tree_preload {
}; };
static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, }; static DEFINE_PER_CPU(struct radix_tree_preload, radix_tree_preloads) = { 0, };
/* Return @ptr with the RADIX_TREE_INDIRECT_PTR tag bit set. */
static inline void *ptr_to_indirect(void *ptr)
{
	unsigned long tagged = (unsigned long)ptr;

	tagged |= RADIX_TREE_INDIRECT_PTR;
	return (void *)tagged;
}
/* Return @ptr with the RADIX_TREE_INDIRECT_PTR tag bit cleared. */
static inline void *indirect_to_ptr(void *ptr)
{
	unsigned long untagged = (unsigned long)ptr;

	untagged &= ~RADIX_TREE_INDIRECT_PTR;
	return (void *)untagged;
}
static inline gfp_t root_gfp_mask(struct radix_tree_root *root) static inline gfp_t root_gfp_mask(struct radix_tree_root *root)
{ {
return root->gfp_mask & __GFP_BITS_MASK; return root->gfp_mask & __GFP_BITS_MASK;
...@@ -265,7 +275,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) ...@@ -265,7 +275,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
return -ENOMEM; return -ENOMEM;
/* Increase the height. */ /* Increase the height. */
node->slots[0] = radix_tree_indirect_to_ptr(root->rnode); node->slots[0] = indirect_to_ptr(root->rnode);
/* Propagate the aggregated tag info into the new root */ /* Propagate the aggregated tag info into the new root */
for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) { for (tag = 0; tag < RADIX_TREE_MAX_TAGS; tag++) {
...@@ -276,7 +286,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index) ...@@ -276,7 +286,7 @@ static int radix_tree_extend(struct radix_tree_root *root, unsigned long index)
newheight = root->height+1; newheight = root->height+1;
node->height = newheight; node->height = newheight;
node->count = 1; node->count = 1;
node = radix_tree_ptr_to_indirect(node); node = ptr_to_indirect(node);
rcu_assign_pointer(root->rnode, node); rcu_assign_pointer(root->rnode, node);
root->height = newheight; root->height = newheight;
} while (height > root->height); } while (height > root->height);
...@@ -309,7 +319,7 @@ int radix_tree_insert(struct radix_tree_root *root, ...@@ -309,7 +319,7 @@ int radix_tree_insert(struct radix_tree_root *root,
return error; return error;
} }
slot = radix_tree_indirect_to_ptr(root->rnode); slot = indirect_to_ptr(root->rnode);
height = root->height; height = root->height;
shift = (height-1) * RADIX_TREE_MAP_SHIFT; shift = (height-1) * RADIX_TREE_MAP_SHIFT;
...@@ -325,8 +335,7 @@ int radix_tree_insert(struct radix_tree_root *root, ...@@ -325,8 +335,7 @@ int radix_tree_insert(struct radix_tree_root *root,
rcu_assign_pointer(node->slots[offset], slot); rcu_assign_pointer(node->slots[offset], slot);
node->count++; node->count++;
} else } else
rcu_assign_pointer(root->rnode, rcu_assign_pointer(root->rnode, ptr_to_indirect(slot));
radix_tree_ptr_to_indirect(slot));
} }
/* Go a level down */ /* Go a level down */
...@@ -374,7 +383,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root, ...@@ -374,7 +383,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root,
return NULL; return NULL;
return is_slot ? (void *)&root->rnode : node; return is_slot ? (void *)&root->rnode : node;
} }
node = radix_tree_indirect_to_ptr(node); node = indirect_to_ptr(node);
height = node->height; height = node->height;
if (index > radix_tree_maxindex(height)) if (index > radix_tree_maxindex(height))
...@@ -393,7 +402,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root, ...@@ -393,7 +402,7 @@ static void *radix_tree_lookup_element(struct radix_tree_root *root,
height--; height--;
} while (height > 0); } while (height > 0);
return is_slot ? (void *)slot:node; return is_slot ? (void *)slot : indirect_to_ptr(node);
} }
/** /**
...@@ -455,7 +464,7 @@ void *radix_tree_tag_set(struct radix_tree_root *root, ...@@ -455,7 +464,7 @@ void *radix_tree_tag_set(struct radix_tree_root *root,
height = root->height; height = root->height;
BUG_ON(index > radix_tree_maxindex(height)); BUG_ON(index > radix_tree_maxindex(height));
slot = radix_tree_indirect_to_ptr(root->rnode); slot = indirect_to_ptr(root->rnode);
shift = (height - 1) * RADIX_TREE_MAP_SHIFT; shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
while (height > 0) { while (height > 0) {
...@@ -509,7 +518,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root, ...@@ -509,7 +518,7 @@ void *radix_tree_tag_clear(struct radix_tree_root *root,
shift = (height - 1) * RADIX_TREE_MAP_SHIFT; shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
pathp->node = NULL; pathp->node = NULL;
slot = radix_tree_indirect_to_ptr(root->rnode); slot = indirect_to_ptr(root->rnode);
while (height > 0) { while (height > 0) {
int offset; int offset;
...@@ -579,7 +588,7 @@ int radix_tree_tag_get(struct radix_tree_root *root, ...@@ -579,7 +588,7 @@ int radix_tree_tag_get(struct radix_tree_root *root,
if (!radix_tree_is_indirect_ptr(node)) if (!radix_tree_is_indirect_ptr(node))
return (index == 0); return (index == 0);
node = radix_tree_indirect_to_ptr(node); node = indirect_to_ptr(node);
height = node->height; height = node->height;
if (index > radix_tree_maxindex(height)) if (index > radix_tree_maxindex(height))
...@@ -666,7 +675,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root, ...@@ -666,7 +675,7 @@ unsigned long radix_tree_range_tag_if_tagged(struct radix_tree_root *root,
} }
shift = (height - 1) * RADIX_TREE_MAP_SHIFT; shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
slot = radix_tree_indirect_to_ptr(root->rnode); slot = indirect_to_ptr(root->rnode);
/* /*
* we fill the path from (root->height - 2) to 0, leaving the index at * we fill the path from (root->height - 2) to 0, leaving the index at
...@@ -897,7 +906,7 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, ...@@ -897,7 +906,7 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
results[0] = node; results[0] = node;
return 1; return 1;
} }
node = radix_tree_indirect_to_ptr(node); node = indirect_to_ptr(node);
max_index = radix_tree_maxindex(node->height); max_index = radix_tree_maxindex(node->height);
...@@ -916,7 +925,8 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results, ...@@ -916,7 +925,8 @@ radix_tree_gang_lookup(struct radix_tree_root *root, void **results,
slot = *(((void ***)results)[ret + i]); slot = *(((void ***)results)[ret + i]);
if (!slot) if (!slot)
continue; continue;
results[ret + nr_found] = rcu_dereference_raw(slot); results[ret + nr_found] =
indirect_to_ptr(rcu_dereference_raw(slot));
nr_found++; nr_found++;
} }
ret += nr_found; ret += nr_found;
...@@ -965,7 +975,7 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results, ...@@ -965,7 +975,7 @@ radix_tree_gang_lookup_slot(struct radix_tree_root *root, void ***results,
results[0] = (void **)&root->rnode; results[0] = (void **)&root->rnode;
return 1; return 1;
} }
node = radix_tree_indirect_to_ptr(node); node = indirect_to_ptr(node);
max_index = radix_tree_maxindex(node->height); max_index = radix_tree_maxindex(node->height);
...@@ -1090,7 +1100,7 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, ...@@ -1090,7 +1100,7 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
results[0] = node; results[0] = node;
return 1; return 1;
} }
node = radix_tree_indirect_to_ptr(node); node = indirect_to_ptr(node);
max_index = radix_tree_maxindex(node->height); max_index = radix_tree_maxindex(node->height);
...@@ -1109,7 +1119,8 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results, ...@@ -1109,7 +1119,8 @@ radix_tree_gang_lookup_tag(struct radix_tree_root *root, void **results,
slot = *(((void ***)results)[ret + i]); slot = *(((void ***)results)[ret + i]);
if (!slot) if (!slot)
continue; continue;
results[ret + nr_found] = rcu_dereference_raw(slot); results[ret + nr_found] =
indirect_to_ptr(rcu_dereference_raw(slot));
nr_found++; nr_found++;
} }
ret += nr_found; ret += nr_found;
...@@ -1159,7 +1170,7 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results, ...@@ -1159,7 +1170,7 @@ radix_tree_gang_lookup_tag_slot(struct radix_tree_root *root, void ***results,
results[0] = (void **)&root->rnode; results[0] = (void **)&root->rnode;
return 1; return 1;
} }
node = radix_tree_indirect_to_ptr(node); node = indirect_to_ptr(node);
max_index = radix_tree_maxindex(node->height); max_index = radix_tree_maxindex(node->height);
...@@ -1195,7 +1206,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) ...@@ -1195,7 +1206,7 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
void *newptr; void *newptr;
BUG_ON(!radix_tree_is_indirect_ptr(to_free)); BUG_ON(!radix_tree_is_indirect_ptr(to_free));
to_free = radix_tree_indirect_to_ptr(to_free); to_free = indirect_to_ptr(to_free);
/* /*
* The candidate node has more than one child, or its child * The candidate node has more than one child, or its child
...@@ -1208,16 +1219,39 @@ static inline void radix_tree_shrink(struct radix_tree_root *root) ...@@ -1208,16 +1219,39 @@ static inline void radix_tree_shrink(struct radix_tree_root *root)
/* /*
* We don't need rcu_assign_pointer(), since we are simply * We don't need rcu_assign_pointer(), since we are simply
* moving the node from one part of the tree to another. If * moving the node from one part of the tree to another: if it
* it was safe to dereference the old pointer to it * was safe to dereference the old pointer to it
* (to_free->slots[0]), it will be safe to dereference the new * (to_free->slots[0]), it will be safe to dereference the new
* one (root->rnode). * one (root->rnode) as far as dependent read barriers go.
*/ */
newptr = to_free->slots[0]; newptr = to_free->slots[0];
if (root->height > 1) if (root->height > 1)
newptr = radix_tree_ptr_to_indirect(newptr); newptr = ptr_to_indirect(newptr);
root->rnode = newptr; root->rnode = newptr;
root->height--; root->height--;
/*
* We have a dilemma here. The node's slot[0] must not be
* NULLed in case there are concurrent lookups expecting to
* find the item. However if this was a bottom-level node,
* then it may be subject to the slot pointer being visible
* to callers dereferencing it. If item corresponding to
* slot[0] is subsequently deleted, these callers would expect
* their slot to become empty sooner or later.
*
* For example, lockless pagecache will look up a slot, deref
* the page pointer, and if the page is 0 refcount it means it
* was concurrently deleted from pagecache so try the deref
* again. Fortunately there is already a requirement for logic
* to retry the entire slot lookup -- the indirect pointer
* problem (replacing direct root node with an indirect pointer
* also results in a stale slot). So tag the slot as indirect
* to force callers to retry.
*/
if (root->height == 0)
*((unsigned long *)&to_free->slots[0]) |=
RADIX_TREE_INDIRECT_PTR;
radix_tree_node_free(to_free); radix_tree_node_free(to_free);
} }
} }
...@@ -1254,7 +1288,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) ...@@ -1254,7 +1288,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
root->rnode = NULL; root->rnode = NULL;
goto out; goto out;
} }
slot = radix_tree_indirect_to_ptr(slot); slot = indirect_to_ptr(slot);
shift = (height - 1) * RADIX_TREE_MAP_SHIFT; shift = (height - 1) * RADIX_TREE_MAP_SHIFT;
pathp->node = NULL; pathp->node = NULL;
...@@ -1296,8 +1330,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index) ...@@ -1296,8 +1330,7 @@ void *radix_tree_delete(struct radix_tree_root *root, unsigned long index)
radix_tree_node_free(to_free); radix_tree_node_free(to_free);
if (pathp->node->count) { if (pathp->node->count) {
if (pathp->node == if (pathp->node == indirect_to_ptr(root->rnode))
radix_tree_indirect_to_ptr(root->rnode))
radix_tree_shrink(root); radix_tree_shrink(root);
goto out; goto out;
} }
......
...@@ -644,7 +644,9 @@ struct page *find_get_page(struct address_space *mapping, pgoff_t offset) ...@@ -644,7 +644,9 @@ struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
pagep = radix_tree_lookup_slot(&mapping->page_tree, offset); pagep = radix_tree_lookup_slot(&mapping->page_tree, offset);
if (pagep) { if (pagep) {
page = radix_tree_deref_slot(pagep); page = radix_tree_deref_slot(pagep);
if (unlikely(!page || page == RADIX_TREE_RETRY)) if (unlikely(!page))
goto out;
if (radix_tree_deref_retry(page))
goto repeat; goto repeat;
if (!page_cache_get_speculative(page)) if (!page_cache_get_speculative(page))
...@@ -660,6 +662,7 @@ struct page *find_get_page(struct address_space *mapping, pgoff_t offset) ...@@ -660,6 +662,7 @@ struct page *find_get_page(struct address_space *mapping, pgoff_t offset)
goto repeat; goto repeat;
} }
} }
out:
rcu_read_unlock(); rcu_read_unlock();
return page; return page;
...@@ -777,12 +780,11 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start, ...@@ -777,12 +780,11 @@ unsigned find_get_pages(struct address_space *mapping, pgoff_t start,
page = radix_tree_deref_slot((void **)pages[i]); page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page)) if (unlikely(!page))
continue; continue;
/* if (radix_tree_deref_retry(page)) {
* this can only trigger if nr_found == 1, making livelock if (ret)
* a non issue. start = pages[ret-1]->index;
*/
if (unlikely(page == RADIX_TREE_RETRY))
goto restart; goto restart;
}
if (!page_cache_get_speculative(page)) if (!page_cache_get_speculative(page))
goto repeat; goto repeat;
...@@ -830,11 +832,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index, ...@@ -830,11 +832,7 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t index,
page = radix_tree_deref_slot((void **)pages[i]); page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page)) if (unlikely(!page))
continue; continue;
/* if (radix_tree_deref_retry(page))
* this can only trigger if nr_found == 1, making livelock
* a non issue.
*/
if (unlikely(page == RADIX_TREE_RETRY))
goto restart; goto restart;
if (page->mapping == NULL || page->index != index) if (page->mapping == NULL || page->index != index)
...@@ -887,11 +885,7 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, ...@@ -887,11 +885,7 @@ unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index,
page = radix_tree_deref_slot((void **)pages[i]); page = radix_tree_deref_slot((void **)pages[i]);
if (unlikely(!page)) if (unlikely(!page))
continue; continue;
/* if (radix_tree_deref_retry(page))
* this can only trigger if nr_found == 1, making livelock
* a non issue.
*/
if (unlikely(page == RADIX_TREE_RETRY))
goto restart; goto restart;
if (!page_cache_get_speculative(page)) if (!page_cache_get_speculative(page))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment