Commit df646a64 authored by Julien Jerphanion's avatar Julien Jerphanion

[WIP] Adapt querying logic

This change the logic to query the tree for nearest neighbors.
Start with a simple sequential query for each point.
parent 6c0e5a62
...@@ -124,6 +124,7 @@ cdef I_t find_node_split_dim(D_t* data, ...@@ -124,6 +124,7 @@ cdef I_t find_node_split_dim(D_t* data,
j_max = j j_max = j
return j_max return j_max
cdef cypclass Counter activable: cdef cypclass Counter activable:
""" A simple Counter. """ A simple Counter.
...@@ -283,30 +284,32 @@ cdef cypclass NeighborsHeaps: ...@@ -283,30 +284,32 @@ cdef cypclass NeighborsHeaps:
n_nbrs : int n_nbrs : int
the size of each heap. the size of each heap.
""" """
D_t *_distances
I_t *_indices I_t *_indices
D_t *_distances
I_t _n_pts I_t _n_pts
I_t _n_nbrs I_t _n_nbrs
I_t _n_pushes I_t _n_pushes
bint _sorted bint _sorted
__init__(self, I_t * indices, I_t n_pts, I_t n_nbrs): __init__(self,
cdef I_t i I_t * indices,
D_t * distances,
I_t n_pts,
I_t n_nbrs,
):
self._n_pts = n_pts self._n_pts = n_pts
self._n_nbrs = n_nbrs self._n_nbrs = n_nbrs
self._distances = <D_t *> malloc(n_pts * n_nbrs * sizeof(D_t)) self._distances = distances # <D_t *> malloc(n_pts * n_nbrs * sizeof(D_t))
self._indices = indices self._indices = indices
self._n_pushes = 0 self._n_pushes = 0
self._sorted = False self._sorted = False
cdef I_t i
# We can't use memset here # We can't use memset here
for i in range(n_pts * n_nbrs): for i in range(n_pts * n_nbrs):
self._distances[i] = INF self._distances[i] = INF
void __dealloc__(self):
free(self._distances)
void push(self, I_t row, D_t val, I_t i_val): void push(self, I_t row, D_t val, I_t i_val):
"""push (val, i_val) into the given row""" """push (val, i_val) into the given row"""
cdef: cdef:
...@@ -318,11 +321,11 @@ cdef cypclass NeighborsHeaps: ...@@ -318,11 +321,11 @@ cdef cypclass NeighborsHeaps:
self._n_pushes += 1 self._n_pushes += 1
printf("Pushing for %d, (%d, %lf)\n", row, i_val, val) # printf("Pushing for %d, (%d, %lf)\n", row, i_val, val)
# check if val should be in heap # check if val should be in heap
if val > distances[0]: if val > distances[0]:
printf("Discarding %d\n", row) # printf("Discarding %d\n", row)
return return
# insert val at position zero # insert val at position zero
...@@ -373,7 +376,8 @@ cdef cypclass NeighborsHeaps: ...@@ -373,7 +376,8 @@ cdef cypclass NeighborsHeaps:
_simultaneous_sort( _simultaneous_sort(
self._distances + row * self._n_nbrs, self._distances + row * self._n_nbrs,
self._indices + row * self._n_nbrs, self._indices + row * self._n_nbrs,
self._n_nbrs) self._n_nbrs,
)
self._sorted = True self._sorted = True
...@@ -403,8 +407,7 @@ cdef cypclass Node activable: ...@@ -403,8 +407,7 @@ cdef cypclass Node activable:
NodeData_t *_node_data_ptr NodeData_t *_node_data_ptr
D_t * _node_bounds_ptr D_t * _node_bounds_ptr
# Reference to the head of the allocated arrays # Reference to the head of the allocated arrays data gets not modified via _data_ptr
# data gets not modified via _data_ptr
I_t node_index I_t node_index
active Node _left active Node _left
...@@ -419,6 +422,8 @@ cdef cypclass Node activable: ...@@ -419,6 +422,8 @@ cdef cypclass Node activable:
self._node_data_ptr = node_data_ptr self._node_data_ptr = node_data_ptr
self._node_bounds_ptr = node_bounds_ptr self._node_bounds_ptr = node_bounds_ptr
# Offset between min and max
self._node_bounds_ptr_offset = node_bounds_ptr_offset self._node_bounds_ptr_offset = node_bounds_ptr_offset
# We use this to allow using actors for initialisation # We use this to allow using actors for initialisation
...@@ -440,10 +445,14 @@ cdef cypclass Node activable: ...@@ -440,10 +445,14 @@ cdef cypclass Node activable:
deref(node_data).idx_end = idx_end deref(node_data).idx_end = idx_end
deref(node_data).is_leaf = False deref(node_data).is_leaf = False
cdef DTYPE_t * lower_bounds = self._node_bounds_ptr + node_index * n_features cdef D_t * lower_bounds = (
cdef DTYPE_t * upper_bounds = self._node_bounds_ptr + node_index * n_features + self._node_bounds_ptr + node_index * n_features
)
cdef D_t * upper_bounds = (
lower_bounds + self._node_bounds_ptr_offset
)
cdef DTYPE_t * data_row cdef D_t * data_row
# Determine Node bounds # Determine Node bounds
for j in range(n_features): for j in range(n_features):
...@@ -460,12 +469,13 @@ cdef cypclass Node activable: ...@@ -460,12 +469,13 @@ cdef cypclass Node activable:
upper_bounds[j] = fmax(upper_bounds[j], data_row[j]) upper_bounds[j] = fmax(upper_bounds[j], data_row[j])
# Choose the dimension with maximum spread at each recursion instead. # Choose the dimension with maximum spread at each recursion instead.
cdef I_t next_dim = find_node_split_dim(data_ptr, cdef I_t next_dim = find_node_split_dim(
indices_ptr + start, data=data_ptr,
n_features, node_indices=indices_ptr + idx_start,
end - start) n_features=n_features,
cdef I_t mid = (start + end) // 2 n_points=idx_end - idx_start,
)
cdef I_t mid = (idx_start + idx_end) // 2
if idx_end - idx_start <= leaf_size: if idx_end - idx_start <= leaf_size:
deref(node_data).is_leaf = True deref(node_data).is_leaf = True
...@@ -482,14 +492,31 @@ cdef cypclass Node activable: ...@@ -482,14 +492,31 @@ cdef cypclass Node activable:
self._right = consume Node(self._node_data_ptr, self._node_bounds_ptr, self._node_bounds_ptr_offset) self._right = consume Node(self._node_data_ptr, self._node_bounds_ptr, self._node_bounds_ptr_offset)
# Recursing on both partitions. # Recursing on both partitions.
self._left.build_node(NULL, <I_t> 2 * node_index, self._left.build_node(
data_ptr, indices_ptr, sync_method=NULL,
leaf_size, n_features, next_dim, node_index=2 * node_index,
idx_start, mid, counter) data_ptr=data_ptr,
self._right.build_node(NULL, <I_t> (2 * node_index + 1), indices_ptr=indices_ptr,
data_ptr, indices_ptr, leaf_size=leaf_size,
leaf_size, n_features, next_dim, n_features=n_features,
mid, idx_end, counter) dim=next_dim,
idx_start=idx_start,
idx_end=mid,
counter=counter,
)
self._right.build_node(
sync_method=NULL,
node_index=2 * node_index + 1,
data_ptr=data_ptr,
indices_ptr=indices_ptr,
leaf_size=leaf_size,
n_features=n_features,
dim=next_dim,
idx_start=mid,
idx_end=idx_end,
counter=counter,
)
cdef cypclass KDTree: cdef cypclass KDTree:
...@@ -518,14 +545,14 @@ cdef cypclass KDTree: ...@@ -518,14 +545,14 @@ cdef cypclass KDTree:
I_t *_indices_ptr I_t *_indices_ptr
NodeData_t *_node_data_ptr NodeData_t *_node_data_ptr
D_t * _node_bounds_ptr D_t * _node_bounds_ptr
I_t _node_bounds_ptr_offset I_t _node_bounds_ptr_offset
__init__(self, __init__(
np.ndarray X, self,
I_t leaf_size, np.ndarray X,
): I_t leaf_size,
):
cdef I_t i cdef I_t i
cdef I_t n = X.shape[0] cdef I_t n = X.shape[0]
cdef I_t d = X.shape[1] cdef I_t d = X.shape[1]
...@@ -566,7 +593,7 @@ cdef cypclass KDTree: ...@@ -566,7 +593,7 @@ cdef cypclass KDTree:
# the asynchronous construction of the tree. # the asynchronous construction of the tree.
cdef active Counter counter = consume Counter() cdef active Counter counter = consume Counter()
self._root = consume Node(self._node_data_ptr, self._node_bounds_ptr) self._root = consume Node(self._node_data_ptr, self._node_bounds_ptr, self._node_bounds_ptr_offset)
if self._root is NULL: if self._root is NULL:
printf("Error consuming node\n") printf("Error consuming node\n")
...@@ -578,40 +605,38 @@ cdef cypclass KDTree: ...@@ -578,40 +605,38 @@ cdef cypclass KDTree:
# Also using this separate method allowing using actors # Also using this separate method allowing using actors
# because __init__ can't be reified. # because __init__ can't be reified.
self._root.build_node( self._root.build_node(
NULL, sync_method=NULL,
0, node_index=0,
self._data_ptr, data_ptr=self._data_ptr,
self._indices_ptr, indices_ptr=self._indices_ptr,
self._leaf_size, leaf_size=self._leaf_size,
n_features=self._d, n_features=self._d,
dim=0, dim=0,
start=0, idx_start=0,
end=self._n, idx_end=self._n,
counter=counter, counter=counter,
) )
# Waiting for the tree construction to end # Waiting for the tree construction to end
# Somewhat similar to a thread barrier # Somewhat similar to a thread barrier
while(initialised < self._n): while initialised < self._n:
initialised = counter.value(NULL).getIntResult() initialised = counter.value(NULL).getIntResult()
counter.reset(NULL) counter.reset(NULL)
void __dealloc__(self): void __dealloc__(self):
scheduler.finish() scheduler.finish()
free(self._indices_ptr) free(self._indices_ptr)
free(self._node_data_ptr) free(self._node_data_ptr)
free(self._node_bounds_ptr) free(self._node_bounds_ptr)
int _query_single_depthfirst(self, int _query_single_depthfirst(self,
I_t i_node, I_t i_node,
D_t* pt, D_t* pt,
I_t i_pt, I_t i_pt,
NeighborsHeaps heaps, NeighborsHeaps heaps,
D_t reduced_dist_LB, D_t reduced_dist_LB,
) nogil except -1: ) except -1:
"""Recursive Single-tree k-neighbors query, depth-first approach""" """Recursive Single-tree k-neighbors query, depth-first approach"""
cdef NodeData_t node_info = self._node_data_ptr[i_node] cdef NodeData_t node_info = self._node_data_ptr[i_node]
...@@ -623,16 +648,16 @@ cdef cypclass KDTree: ...@@ -623,16 +648,16 @@ cdef cypclass KDTree:
# trim it from the query # trim it from the query
cdef D_t largest = heaps.largest(i_pt) cdef D_t largest = heaps.largest(i_pt)
printf("reduced_dist_LB=%lf\n", reduced_dist_LB) # printf("reduced_dist_LB=%lf\n", reduced_dist_LB)
if reduced_dist_LB > largest: if reduced_dist_LB > largest:
printf("Discarding node %d because reduced_dist_LB=%lf > largest=%lf\n", reduced_dist_LB, largest) # printf("Discarding node %d because reduced_dist_LB=%lf > largest=%lf\n", reduced_dist_LB, largest)
pass pass
#------------------------------------------------------------ #------------------------------------------------------------
# Case 2: this is a leaf node. Update set of nearby points # Case 2: this is a leaf node. Update set of nearby points
elif node_info.is_leaf: elif node_info.is_leaf:
printf("Inspecting vector in leaf %d\n", i_node) # printf("Inspecting vector in leaf %d\n", i_node)
for i in range(node_info.idx_start, node_info.idx_end): for i in range(node_info.idx_start, node_info.idx_end):
dist_pt = sqeuclidean_dist( dist_pt = sqeuclidean_dist(
x1=pt, x1=pt,
...@@ -645,7 +670,7 @@ cdef cypclass KDTree: ...@@ -645,7 +670,7 @@ cdef cypclass KDTree:
# Case 3: Node is not a leaf. Recursively query subnodes # Case 3: Node is not a leaf. Recursively query subnodes
# starting with the closest # starting with the closest
else: else:
printf("Deleguating to children %d\n", i_node) # printf("Deleguating to children %d\n", i_node)
i1 = 2 * i_node + 1 i1 = 2 * i_node + 1
i2 = i1 + 1 i2 = i1 + 1
reduced_dist_LB_1 = self.min_rdist(i1, pt) reduced_dist_LB_1 = self.min_rdist(i1, pt)
...@@ -663,44 +688,47 @@ cdef cypclass KDTree: ...@@ -663,44 +688,47 @@ cdef cypclass KDTree:
void query(self, void query(self,
np.ndarray query_points, # IN np.ndarray query_points, # IN
np.ndarray closests, # OUT np.ndarray knn_indices, # IN/OUT
np.ndarray knn_distances, # IN/OUT
): ):
cdef: cdef:
I_t completed_queries = 0 I_t completed_queries = 0
I_t i I_t i
I_t n_query = query_points.shape[0] I_t n_query = query_points.shape[0]
I_t n_features = query_points.shape[1] I_t n_features = query_points.shape[1]
I_t n_neighbors = closests.shape[1] I_t n_neighbors = knn_indices.shape[1]
I_t total_n_pushes = n_query * self._n I_t total_n_pushes = n_query * self._n
D_t * _query_points_ptr = <D_t *> query_points.data D_t * _query_points_ptr = <D_t *> query_points.data
D_t rdist_lower_bound D_t rdist_lower_bound
NeighborsHeaps heaps = NeighborsHeaps(<I_t *> closests.data, n_query, n_neighbors) NeighborsHeaps heaps = NeighborsHeaps(
<I_t *> knn_indices.data,
<D_t *> knn_distances.data,
n_query,
n_neighbors
)
for i in range(n_query): for i in range(n_query):
printf("Querying vector %d\n", i) # printf("Querying vector %d\n", i)
rdist_lower_bound = self.min_rdist(0, _query_points_ptr + i * n_features) rdist_lower_bound = self.min_rdist(0, _query_points_ptr + i * n_features)
self._query_single_depthfirst(0, _query_points_ptr, i, heaps, rdist_lower_bound) self._query_single_depthfirst(0, _query_points_ptr, i, heaps, rdist_lower_bound)
printf("Done Querying vector %d\n\n", i) # printf("Done Querying vector %d\n\n", i)
heaps.sort() heaps.sort()
D_t min_rdist( D_t min_rdist(
KDTree self, KDTree self,
I_t i_node, I_t idx_node,
D_t* pt, D_t* pt,
) nogil except -1: ) except -1:
"""Compute the minimum reduced-distance between a point and a node""" """Compute the minimum reduced-distance between a point and a node"""
cdef I_t node_idx cdef:
cdef D_t d, d_lo, d_hi, rdist=0.0 D_t d, d_lo, d_hi, node_min_j, node_max_j, rdist=0.0
cdef I_t j I_t j
cdef D_t node_min_j, node_max_j
for j in range(self._d): for j in range(self._d):
node_min_j = deref(self._node_bounds_ptr + node_idx + j) node_min_j = deref(self._node_bounds_ptr + idx_node + j * self._n_nodes)
node_max_j = deref(self._node_bounds_ptr + node_idx + j + self._node_bounds_ptr_offset) node_max_j = deref(self._node_bounds_ptr + idx_node + j * self._n_nodes + self._node_bounds_ptr_offset)
d_lo = node_min_j - pt[j] d_lo = node_min_j - pt[j]
d_hi = pt[j] - node_max_j d_hi = pt[j] - node_max_j
......
...@@ -14,20 +14,29 @@ def test_creation_deletion(n, d, leaf_size): ...@@ -14,20 +14,29 @@ def test_creation_deletion(n, d, leaf_size):
tree = kdtree.KDTree(X, leaf_size=256) tree = kdtree.KDTree(X, leaf_size=256)
del tree del tree
@pytest.mark.skip(reason="The query is being refactored.")
@pytest.mark.parametrize("n", [10, 100, 1000, 10000]) @pytest.mark.parametrize("n", [10, 100, 1000, 10000])
@pytest.mark.parametrize("d", [10, 100]) @pytest.mark.parametrize("d", [10, 100])
@pytest.mark.parametrize("k", [1, 2, 5, 10]) @pytest.mark.parametrize("k", [1, 2, 5, 10])
@pytest.mark.parametrize("leaf_size", [256, 1024]) @pytest.mark.parametrize("leaf_size", [256, 1024])
def test_against_sklearn(n, d, k, leaf_size): def test_against_sklearn(n, d, k, leaf_size, n_query=1):
np.random.seed(1) np.random.seed(2)
X = np.random.rand(n, d) X = np.random.rand(n, d)
query_points = np.random.rand(n, d) query_points = np.random.rand(n_query, d)
tree = kdtree.KDTree(X, leaf_size=256) tree = kdtree.KDTree(X, leaf_size=256)
skl_tree = KDTree(X, leaf_size=256) skl_tree = KDTree(X, leaf_size=256)
closests = np.zeros((n, k), dtype=np.int32) knn_indices = np.zeros((n_query, k), dtype=np.int32)
tree.query(query_points, closests) knn_distances = np.zeros((n_query, k), dtype=np.float64)
skl_closests = skl_tree.query(query_points, k=k, return_distance=False).astype(np.int32) tree.query(query_points, knn_indices, knn_distances)
np.testing.assert_equal(closests, skl_closests) skl_knn_distances, skl_knn_indices = skl_tree.query(
query_points,
k=k,
return_distance=True
)
# Adapting types
skl_knn_indices = skl_knn_indices.astype(np.int32)
np.testing.assert_equal(knn_indices, skl_knn_indices)
np.testing.assert_almost_equal(knn_distances, skl_knn_distances)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment