Skip to content

Commit

Permalink
Merge pull request #1 from mortacious/feature/automatic_sorting
Browse files Browse the repository at this point in the history
Feature/automatic sorting
  • Loading branch information
mortacious authored Sep 20, 2022
2 parents 1ca35f4 + 4a060ec commit 12729d2
Show file tree
Hide file tree
Showing 6 changed files with 47 additions and 40 deletions.
2 changes: 1 addition & 1 deletion cupy_knn/_version.py
Original file line number Diff line number Diff line change
@@ -1 +1 @@
__version__ = "0.2.3"
__version__ = "0.2.4"
13 changes: 8 additions & 5 deletions cupy_knn/cuda/query_knn_kernels.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,19 +7,22 @@ __global__ void query_knn_kernel(const BVHNode *nodes,
const unsigned int* __restrict__ sorted_indices,
const unsigned int root_index,
const float max_radius,
const float3* __restrict__ queries,
const float3* __restrict__ query_points,
const unsigned int* __restrict__ sorted_queries,
const unsigned int N,
// custom parameters
unsigned int* indices_out,
float* distances_out,
unsigned int* n_neighbors_out
)
{
unsigned int query_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (query_idx >= N)
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N)
return;
StaticPriorityQueue<float, K> queue(max_radius);
query_knn(nodes, points, sorted_indices, root_index, &queries[query_idx], queue);
unsigned int query_idx = sorted_queries[idx];
query_knn(nodes, points, sorted_indices, root_index, &query_points[query_idx], queue);
__syncwarp(); // synchronize the warp before the write operation
queue.write_results(&indices_out[query_idx * K], &distances_out[query_idx * K], &n_neighbors_out[query_idx]); // write back the results
// write back the results at the correct position
queue.write_results(&indices_out[query_idx * K], &distances_out[query_idx * K], &n_neighbors_out[query_idx]);
}
40 changes: 16 additions & 24 deletions cupy_knn/lbvh_index.py
Original file line number Diff line number Diff line change
Expand Up @@ -140,7 +140,8 @@ def prepare_knn(self, module: cp.RawModule, kernel_name: str, radius: Optional[f
const unsigned int* sorted_indices, // the sorted point indices
const unsigned int root_index, // the tree's root index
const float max_radius, // the maximum search radius
const float3* queries, // the query points
const float3* query_points, // the query points
const unsigned int* __restrict__ sorted_queries, // the indices of the query points sorted in morton order if sorting is enabled
const unsigned int N, // the total number of queries
<other custom parameters>
)
Expand Down Expand Up @@ -185,7 +186,8 @@ def prepare_radius(self, module: cp.RawModule, kernel_name: str, radius):
const unsigned int* sorted_indices, // the sorted point indices
const unsigned int root_index, // the tree's root index
const float max_radius, // the maximum search radius
const float3* queries, // the query points
const float3* query_points, // the query points
const unsigned int* __restrict__ sorted_queries, // the indices of the query points sorted in morton order if sorting is enabled
const unsigned int N, // the total number of queries
<other custom parameters>
)
Expand Down Expand Up @@ -330,9 +332,10 @@ def query_knn(self, queries: cp.ndarray, *args):
n_neighbors: cp.ndarray of shape (n,)
The number of nearest neighbors found
Note: This is only returned in default mode (if 'prepare_knn_default' was called)
query_order: cp.ndarray of shape (n,)
The query indices in morton order. This is returned only if 'prepare_knn' was called directly with custom
code and sort_queries is True in order to sort the returned values.
*args: tuple
The additional arguments to the function.
This is returned only if 'prepare_knn' was called directly with custom
code.
"""
if self.num_nodes < 0:
Expand All @@ -350,8 +353,9 @@ def query_knn(self, queries: cp.ndarray, *args):
block_dim, grid_dim = select_block_grid_sizes(queries.device, queries.shape[0])
self._compute_morton_points_kernel_float(grid_dim, block_dim, (queries, self.extent, morton_codes, queries.shape[0]))
sorted_indices = cp.argsort(morton_codes)
queries = queries[sorted_indices]
stream.synchronize()
else:
sorted_indices = cp.arange(queries.shape[0], dtype=cp.uint32)

# use the maximum allowed threads per block from the kernel (depends on the number of registers)
max_threads_per_block = self._query_kernel.attributes['max_threads_per_block']
Expand All @@ -371,25 +375,12 @@ def query_knn(self, queries: cp.ndarray, *args):
cp.uint32(self.root_node),
cp.float32(self.radius),
queries,
sorted_indices,
queries.shape[0],
*args))
stream.synchronize()

if self.sort_queries:
if self._custom_knn > 0:
indices_tmp = cp.empty_like(indices_out)
indices_tmp[sorted_indices] = indices_out # invert the sorting
indices_out = indices_tmp
distances_tmp = cp.empty_like(distances_out)
distances_tmp[sorted_indices] = distances_out
distances_out = distances_tmp
n_neighbors_tmp = cp.empty_like(n_neighbors_out)
n_neighbors_tmp[sorted_indices] = n_neighbors_out
n_neighbors_out = n_neighbors_tmp

return indices_out, distances_out, n_neighbors_out
else:
return sorted_indices
return args

def query_radius(self, queries: cp.ndarray, *args):
"""
Expand Down Expand Up @@ -427,8 +418,9 @@ def query_radius(self, queries: cp.ndarray, *args):
block_dim, grid_dim = select_block_grid_sizes(queries.device, queries.shape[0])
self._compute_morton_points_kernel_float(grid_dim, block_dim, (queries, self.extent, morton_codes, queries.shape[0]))
sorted_indices = cp.argsort(morton_codes)
queries = queries[sorted_indices]
stream.synchronize()
else:
sorted_indices = cp.arange(queries.shape[0], dtype=cp.uint32)

# use the maximum allowed threads per block from the kernel (depends on the number of registers)
max_threads_per_block = self._query_kernel.attributes['max_threads_per_block']
Expand All @@ -441,12 +433,12 @@ def query_radius(self, queries: cp.ndarray, *args):
cp.uint32(self.root_node),
cp.float32(self.radius),
queries,
sorted_indices,
queries.shape[0],
*args))
stream.synchronize()

if self.sort_queries:
return sorted_indices
return args

def tree_data(self, numpy: bool = True):
"""
Expand Down
9 changes: 7 additions & 2 deletions examples/benchmark_knn_query.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,14 +14,19 @@
parser.add_argument("-c", "--compact", action='store_true', help="Compact the tree to remove unused spaces in between")
parser.add_argument("-s", "--shrink_to_fit", action='store_true', help="Shrink to buffer size of the tree to match "
"it's content after compaction")
parser.add_argument("--sort", action='store_true', help="Sort the queries by their morton index before execution")

args = parser.parse_args()
pc = PlyData.read(args.input_file)

points = np.stack([pc.elements[0].data['x'], pc.elements[0].data['y'], pc.elements[0].data['z']], axis=1)

print(f"Benchmarking KNN Search for {points.shape[0]} points with leafsize {args.leafsize} and compact={args.compact} + shrink to fit={args.shrink_to_fit}.")
lbvh = LBVHIndex(leaf_size=args.leafsize, compact=args.compact, shrink_to_fit=args.shrink_to_fit)
print(f"Benchmarking KNN Search for {points.shape[0]} points with leafsize {args.leafsize} and compact={args.compact} + "
f"shrink to fit={args.shrink_to_fit} + sort={args.sort}.")
lbvh = LBVHIndex(leaf_size=args.leafsize,
compact=args.compact,
shrink_to_fit=args.shrink_to_fit,
sort_queries=args.sort)
print(f"Run times for {points.shape[0]} queries with k={args.knn} and r={args.radius}: ")

# build the index
Expand Down
12 changes: 8 additions & 4 deletions examples/cuda/custom_knn_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -7,21 +7,25 @@ __global__ void custom_knn_kernel(const BVHNode *nodes,
const unsigned int* __restrict__ sorted_indices,
unsigned int root_index,
const float max_radius,
const float3* __restrict__ queries,
const float3* __restrict__ query_points,
const unsigned int* __restrict__ sorted_queries,
unsigned int N,
// custom argments
float3* means_out)
{
unsigned int query_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (query_idx >= N)
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N)
return;
unsigned int query_idx = sorted_queries[idx];

auto queue = query_knn(nodes, points, sorted_indices, root_index, &queries[query_idx], max_radius);
auto queue = query_knn(nodes, points, sorted_indices, root_index, &query_points[query_idx], max_radius);

// compute the mean of all nearest neighbors found as an example
float3 sum = make_float3(0.0, 0.0, 0.0);
for(int i=0; i<queue.size(); ++i) {
sum += points[queue[i].id];
}

// write the result back in an unsorted order
means_out[query_idx] = sum / queue.size();
}
11 changes: 7 additions & 4 deletions examples/cuda/custom_radius_kernel.cu
Original file line number Diff line number Diff line change
Expand Up @@ -45,17 +45,20 @@ __global__ void custom_radius_kernel(const BVHNode *nodes,
const unsigned int* __restrict__ sorted_indices,
unsigned int root_index,
const float max_radius,
const float3* __restrict__ queries,
const float3* __restrict__ query_points,
const unsigned int* __restrict__ sorted_queries,
unsigned int N,
// custom argments
float3* means_out,
unsigned int* num_neighbors_out)
{
unsigned int query_idx = blockIdx.x * blockDim.x + threadIdx.x;
if (query_idx >= N)
unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
if (idx >= N)
return;
unsigned int query_idx = sorted_queries[idx];

RadiusHandler handler(max_radius);
query(nodes, points, sorted_indices, root_index, &queries[query_idx], handler);
query(nodes, points, sorted_indices, root_index, &query_points[query_idx], handler);
means_out[query_idx] = handler.mean();
num_neighbors_out[query_idx] = handler.count();
}

0 comments on commit 12729d2

Please sign in to comment.