Commit df646a64 authored by Julien Jerphanion's avatar Julien Jerphanion

[WIP] Adapt querying logic

This change the logic to query the tree for nearest neighbors.
Start with a simple sequential query for each point.
parent 6c0e5a62
......@@ -124,6 +124,7 @@ cdef I_t find_node_split_dim(D_t* data,
j_max = j
return j_max
cdef cypclass Counter activable:
""" A simple Counter.
......@@ -283,30 +284,32 @@ cdef cypclass NeighborsHeaps:
n_nbrs : int
the size of each heap.
"""
D_t *_distances
I_t *_indices
D_t *_distances
I_t _n_pts
I_t _n_nbrs
I_t _n_pushes
bint _sorted
__init__(self, I_t * indices, I_t n_pts, I_t n_nbrs):
cdef I_t i
__init__(self,
I_t * indices,
D_t * distances,
I_t n_pts,
I_t n_nbrs,
):
self._n_pts = n_pts
self._n_nbrs = n_nbrs
self._distances = <D_t *> malloc(n_pts * n_nbrs * sizeof(D_t))
self._distances = distances # <D_t *> malloc(n_pts * n_nbrs * sizeof(D_t))
self._indices = indices
self._n_pushes = 0
self._sorted = False
cdef I_t i
# We can't use memset here
for i in range(n_pts * n_nbrs):
self._distances[i] = INF
void __dealloc__(self):
free(self._distances)
void push(self, I_t row, D_t val, I_t i_val):
"""push (val, i_val) into the given row"""
cdef:
......@@ -318,11 +321,11 @@ cdef cypclass NeighborsHeaps:
self._n_pushes += 1
printf("Pushing for %d, (%d, %lf)\n", row, i_val, val)
# printf("Pushing for %d, (%d, %lf)\n", row, i_val, val)
# check if val should be in heap
if val > distances[0]:
printf("Discarding %d\n", row)
# printf("Discarding %d\n", row)
return
# insert val at position zero
......@@ -373,7 +376,8 @@ cdef cypclass NeighborsHeaps:
_simultaneous_sort(
self._distances + row * self._n_nbrs,
self._indices + row * self._n_nbrs,
self._n_nbrs)
self._n_nbrs,
)
self._sorted = True
......@@ -403,8 +407,7 @@ cdef cypclass Node activable:
NodeData_t *_node_data_ptr
D_t * _node_bounds_ptr
# Reference to the head of the allocated arrays
# data gets not modified via _data_ptr
# Reference to the head of the allocated arrays data gets not modified via _data_ptr
I_t node_index
active Node _left
......@@ -419,6 +422,8 @@ cdef cypclass Node activable:
self._node_data_ptr = node_data_ptr
self._node_bounds_ptr = node_bounds_ptr
# Offset between min and max
self._node_bounds_ptr_offset = node_bounds_ptr_offset
# We use this to allow using actors for initialisation
......@@ -440,10 +445,14 @@ cdef cypclass Node activable:
deref(node_data).idx_end = idx_end
deref(node_data).is_leaf = False
cdef DTYPE_t * lower_bounds = self._node_bounds_ptr + node_index * n_features
cdef DTYPE_t * upper_bounds = self._node_bounds_ptr + node_index * n_features +
cdef D_t * lower_bounds = (
self._node_bounds_ptr + node_index * n_features
)
cdef D_t * upper_bounds = (
lower_bounds + self._node_bounds_ptr_offset
)
cdef DTYPE_t * data_row
cdef D_t * data_row
# Determine Node bounds
for j in range(n_features):
......@@ -460,12 +469,13 @@ cdef cypclass Node activable:
upper_bounds[j] = fmax(upper_bounds[j], data_row[j])
# Choose the dimension with maximum spread at each recursion instead.
cdef I_t next_dim = find_node_split_dim(data_ptr,
indices_ptr + start,
n_features,
end - start)
cdef I_t mid = (start + end) // 2
cdef I_t next_dim = find_node_split_dim(
data=data_ptr,
node_indices=indices_ptr + idx_start,
n_features=n_features,
n_points=idx_end - idx_start,
)
cdef I_t mid = (idx_start + idx_end) // 2
if idx_end - idx_start <= leaf_size:
deref(node_data).is_leaf = True
......@@ -482,14 +492,31 @@ cdef cypclass Node activable:
self._right = consume Node(self._node_data_ptr, self._node_bounds_ptr, self._node_bounds_ptr_offset)
# Recursing on both partitions.
self._left.build_node(NULL, <I_t> 2 * node_index,
data_ptr, indices_ptr,
leaf_size, n_features, next_dim,
idx_start, mid, counter)
self._right.build_node(NULL, <I_t> (2 * node_index + 1),
data_ptr, indices_ptr,
leaf_size, n_features, next_dim,
mid, idx_end, counter)
self._left.build_node(
sync_method=NULL,
node_index=2 * node_index,
data_ptr=data_ptr,
indices_ptr=indices_ptr,
leaf_size=leaf_size,
n_features=n_features,
dim=next_dim,
idx_start=idx_start,
idx_end=mid,
counter=counter,
)
self._right.build_node(
sync_method=NULL,
node_index=2 * node_index + 1,
data_ptr=data_ptr,
indices_ptr=indices_ptr,
leaf_size=leaf_size,
n_features=n_features,
dim=next_dim,
idx_start=mid,
idx_end=idx_end,
counter=counter,
)
cdef cypclass KDTree:
......@@ -518,14 +545,14 @@ cdef cypclass KDTree:
I_t *_indices_ptr
NodeData_t *_node_data_ptr
D_t * _node_bounds_ptr
I_t _node_bounds_ptr_offset
__init__(self,
np.ndarray X,
I_t leaf_size,
):
__init__(
self,
np.ndarray X,
I_t leaf_size,
):
cdef I_t i
cdef I_t n = X.shape[0]
cdef I_t d = X.shape[1]
......@@ -566,7 +593,7 @@ cdef cypclass KDTree:
# the asynchronous construction of the tree.
cdef active Counter counter = consume Counter()
self._root = consume Node(self._node_data_ptr, self._node_bounds_ptr)
self._root = consume Node(self._node_data_ptr, self._node_bounds_ptr, self._node_bounds_ptr_offset)
if self._root is NULL:
printf("Error consuming node\n")
......@@ -578,40 +605,38 @@ cdef cypclass KDTree:
# Also using this separate method allowing using actors
# because __init__ can't be reified.
self._root.build_node(
NULL,
0,
self._data_ptr,
self._indices_ptr,
self._leaf_size,
n_features=self._d,
dim=0,
start=0,
end=self._n,
counter=counter,
sync_method=NULL,
node_index=0,
data_ptr=self._data_ptr,
indices_ptr=self._indices_ptr,
leaf_size=self._leaf_size,
n_features=self._d,
dim=0,
idx_start=0,
idx_end=self._n,
counter=counter,
)
# Waiting for the tree construction to end
# Somewhat similar to a thread barrier
while(initialised < self._n):
while initialised < self._n:
initialised = counter.value(NULL).getIntResult()
counter.reset(NULL)
void __dealloc__(self):
scheduler.finish()
free(self._indices_ptr)
free(self._node_data_ptr)
free(self._node_bounds_ptr)
int _query_single_depthfirst(self,
I_t i_node,
D_t* pt,
I_t i_pt,
NeighborsHeaps heaps,
D_t reduced_dist_LB,
) nogil except -1:
) except -1:
"""Recursive Single-tree k-neighbors query, depth-first approach"""
cdef NodeData_t node_info = self._node_data_ptr[i_node]
......@@ -623,16 +648,16 @@ cdef cypclass KDTree:
# trim it from the query
cdef D_t largest = heaps.largest(i_pt)
printf("reduced_dist_LB=%lf\n", reduced_dist_LB)
# printf("reduced_dist_LB=%lf\n", reduced_dist_LB)
if reduced_dist_LB > largest:
printf("Discarding node %d because reduced_dist_LB=%lf > largest=%lf\n", reduced_dist_LB, largest)
# printf("Discarding node %d because reduced_dist_LB=%lf > largest=%lf\n", reduced_dist_LB, largest)
pass
#------------------------------------------------------------
# Case 2: this is a leaf node. Update set of nearby points
elif node_info.is_leaf:
printf("Inspecting vector in leaf %d\n", i_node)
# printf("Inspecting vector in leaf %d\n", i_node)
for i in range(node_info.idx_start, node_info.idx_end):
dist_pt = sqeuclidean_dist(
x1=pt,
......@@ -645,7 +670,7 @@ cdef cypclass KDTree:
# Case 3: Node is not a leaf. Recursively query subnodes
# starting with the closest
else:
printf("Deleguating to children %d\n", i_node)
# printf("Deleguating to children %d\n", i_node)
i1 = 2 * i_node + 1
i2 = i1 + 1
reduced_dist_LB_1 = self.min_rdist(i1, pt)
......@@ -663,44 +688,47 @@ cdef cypclass KDTree:
void query(self,
np.ndarray query_points, # IN
np.ndarray closests, # OUT
np.ndarray knn_indices, # IN/OUT
np.ndarray knn_distances, # IN/OUT
):
cdef:
I_t completed_queries = 0
I_t i
I_t n_query = query_points.shape[0]
I_t n_features = query_points.shape[1]
I_t n_neighbors = closests.shape[1]
I_t n_neighbors = knn_indices.shape[1]
I_t total_n_pushes = n_query * self._n
D_t * _query_points_ptr = <D_t *> query_points.data
D_t rdist_lower_bound
NeighborsHeaps heaps = NeighborsHeaps(<I_t *> closests.data, n_query, n_neighbors)
NeighborsHeaps heaps = NeighborsHeaps(
<I_t *> knn_indices.data,
<D_t *> knn_distances.data,
n_query,
n_neighbors
)
for i in range(n_query):
printf("Querying vector %d\n", i)
# printf("Querying vector %d\n", i)
rdist_lower_bound = self.min_rdist(0, _query_points_ptr + i * n_features)
self._query_single_depthfirst(0, _query_points_ptr, i, heaps, rdist_lower_bound)
printf("Done Querying vector %d\n\n", i)
# printf("Done Querying vector %d\n\n", i)
heaps.sort()
D_t min_rdist(
KDTree self,
I_t i_node,
I_t idx_node,
D_t* pt,
) nogil except -1:
) except -1:
"""Compute the minimum reduced-distance between a point and a node"""
cdef I_t node_idx
cdef D_t d, d_lo, d_hi, rdist=0.0
cdef I_t j
cdef D_t node_min_j, node_max_j
cdef:
D_t d, d_lo, d_hi, node_min_j, node_max_j, rdist=0.0
I_t j
for j in range(self._d):
node_min_j = deref(self._node_bounds_ptr + node_idx + j)
node_max_j = deref(self._node_bounds_ptr + node_idx + j + self._node_bounds_ptr_offset)
node_min_j = deref(self._node_bounds_ptr + idx_node + j * self._n_nodes)
node_max_j = deref(self._node_bounds_ptr + idx_node + j * self._n_nodes + self._node_bounds_ptr_offset)
d_lo = node_min_j - pt[j]
d_hi = pt[j] - node_max_j
......
......@@ -14,20 +14,29 @@ def test_creation_deletion(n, d, leaf_size):
tree = kdtree.KDTree(X, leaf_size=256)
del tree
@pytest.mark.skip(reason="The query is being refactored.")
@pytest.mark.parametrize("n", [10, 100, 1000, 10000])
@pytest.mark.parametrize("d", [10, 100])
@pytest.mark.parametrize("k", [1, 2, 5, 10])
@pytest.mark.parametrize("leaf_size", [256, 1024])
def test_against_sklearn(n, d, k, leaf_size):
np.random.seed(1)
def test_against_sklearn(n, d, k, leaf_size, n_query=1):
np.random.seed(2)
X = np.random.rand(n, d)
query_points = np.random.rand(n, d)
query_points = np.random.rand(n_query, d)
tree = kdtree.KDTree(X, leaf_size=256)
skl_tree = KDTree(X, leaf_size=256)
closests = np.zeros((n, k), dtype=np.int32)
tree.query(query_points, closests)
skl_closests = skl_tree.query(query_points, k=k, return_distance=False).astype(np.int32)
np.testing.assert_equal(closests, skl_closests)
knn_indices = np.zeros((n_query, k), dtype=np.int32)
knn_distances = np.zeros((n_query, k), dtype=np.float64)
tree.query(query_points, knn_indices, knn_distances)
skl_knn_distances, skl_knn_indices = skl_tree.query(
query_points,
k=k,
return_distance=True
)
# Adapting types
skl_knn_indices = skl_knn_indices.astype(np.int32)
np.testing.assert_equal(knn_indices, skl_knn_indices)
np.testing.assert_almost_equal(knn_distances, skl_knn_distances)
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment