Merge pull request #1 from mortacious/feature/automatic_sorting

Feature/automatic sorting
mortacious · Sep 20, 2022 · 12729d2 · 12729d2
2 parents 1ca35f4 + 4a060ec
commit 12729d2
Show file tree

Hide file tree

Showing 6 changed files with 47 additions and 40 deletions.
diff --git a/cupy_knn/_version.py b/cupy_knn/_version.py
@@ -1 +1 @@
-__version__ = "0.2.3"
+__version__ = "0.2.4"
diff --git a/cupy_knn/cuda/query_knn_kernels.cu b/cupy_knn/cuda/query_knn_kernels.cu
@@ -7,19 +7,22 @@ __global__ void query_knn_kernel(const BVHNode *nodes,
                                  const unsigned int* __restrict__ sorted_indices,
                                  const unsigned int root_index,
                                  const float max_radius,
-                                 const float3* __restrict__ queries,
+                                 const float3* __restrict__ query_points,
+                                 const unsigned int* __restrict__ sorted_queries,
                                  const unsigned int N,
                                  // custom parameters
                                  unsigned int* indices_out,
                                  float* distances_out,
                                  unsigned int* n_neighbors_out
                                  )
 {
-    unsigned int query_idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (query_idx >= N)
+    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= N)
         return;
     StaticPriorityQueue<float, K> queue(max_radius);
-    query_knn(nodes, points, sorted_indices, root_index, &queries[query_idx], queue);
+    unsigned int query_idx = sorted_queries[idx];
+    query_knn(nodes, points, sorted_indices, root_index, &query_points[query_idx], queue);
     __syncwarp(); // synchronize the warp before the write operation
-    queue.write_results(&indices_out[query_idx * K], &distances_out[query_idx * K], &n_neighbors_out[query_idx]); // write back the results
+    // write back the results at the correct position
+    queue.write_results(&indices_out[query_idx * K], &distances_out[query_idx * K], &n_neighbors_out[query_idx]);
 }
diff --git a/cupy_knn/lbvh_index.py b/cupy_knn/lbvh_index.py
@@ -140,7 +140,8 @@ def prepare_knn(self, module: cp.RawModule, kernel_name: str, radius: Optional[f
                                  const unsigned int* sorted_indices, // the sorted point indices
                                  const unsigned int root_index, // the tree's root index
                                  const float max_radius, // the maximum search radius
-                                 const float3* queries, // the query points
+                                 const float3* query_points, // the query points
+                                 const unsigned int* __restrict__ sorted_queries, // the indices of the query points sorted in morton order if sorting is enabled
                                  const unsigned int N, // the total number of queries
                                  <other custom parameters>
                                  )
@@ -185,7 +186,8 @@ def prepare_radius(self, module: cp.RawModule, kernel_name: str, radius):
                 const unsigned int* sorted_indices, // the sorted point indices
                 const unsigned int root_index, // the tree's root index
                 const float max_radius, // the maximum search radius
-                const float3* queries, // the query points
+                const float3* query_points, // the query points
+                const unsigned int* __restrict__ sorted_queries, // the indices of the query points sorted in morton order if sorting is enabled
                 const unsigned int N, // the total number of queries
                 <other custom parameters>
             )
@@ -330,9 +332,10 @@ def query_knn(self, queries: cp.ndarray, *args):
         n_neighbors: cp.ndarray of shape (n,)
             The number of nearest neighbors found
             Note: This is only returned in default mode (if 'prepare_knn_default' was called)
-        query_order: cp.ndarray of shape (n,)
-            The query indices in morton order. This is returned only if 'prepare_knn' was called directly with custom
-            code and sort_queries is True in order to sort the returned values.
+        *args: tuple
+            The additional arguments to the function.
+            This is returned only if 'prepare_knn' was called directly with custom
+            code.
 
         """
         if self.num_nodes < 0:
@@ -350,8 +353,9 @@ def query_knn(self, queries: cp.ndarray, *args):
             block_dim, grid_dim = select_block_grid_sizes(queries.device, queries.shape[0])
             self._compute_morton_points_kernel_float(grid_dim, block_dim, (queries, self.extent, morton_codes, queries.shape[0]))
             sorted_indices = cp.argsort(morton_codes)
-            queries = queries[sorted_indices]
             stream.synchronize()
+        else:
+            sorted_indices = cp.arange(queries.shape[0], dtype=cp.uint32)
 
         # use the maximum allowed threads per block from the kernel (depends on the number of registers)
         max_threads_per_block = self._query_kernel.attributes['max_threads_per_block']
@@ -371,25 +375,12 @@ def query_knn(self, queries: cp.ndarray, *args):
                                                  cp.uint32(self.root_node),
                                                  cp.float32(self.radius),
                                                  queries,
+                                                 sorted_indices,
                                                  queries.shape[0],
                                                  *args))
         stream.synchronize()
 
-        if self.sort_queries:
-            if self._custom_knn > 0:
-                indices_tmp = cp.empty_like(indices_out)
-                indices_tmp[sorted_indices] = indices_out  # invert the sorting
-                indices_out = indices_tmp
-                distances_tmp = cp.empty_like(distances_out)
-                distances_tmp[sorted_indices] = distances_out
-                distances_out = distances_tmp
-                n_neighbors_tmp = cp.empty_like(n_neighbors_out)
-                n_neighbors_tmp[sorted_indices] = n_neighbors_out
-                n_neighbors_out = n_neighbors_tmp
-
-                return indices_out, distances_out, n_neighbors_out
-            else:
-                return sorted_indices
+        return args
 
     def query_radius(self, queries: cp.ndarray, *args):
         """
@@ -427,8 +418,9 @@ def query_radius(self, queries: cp.ndarray, *args):
             block_dim, grid_dim = select_block_grid_sizes(queries.device, queries.shape[0])
             self._compute_morton_points_kernel_float(grid_dim, block_dim, (queries, self.extent, morton_codes, queries.shape[0]))
             sorted_indices = cp.argsort(morton_codes)
-            queries = queries[sorted_indices]
             stream.synchronize()
+        else:
+            sorted_indices = cp.arange(queries.shape[0], dtype=cp.uint32)
 
         # use the maximum allowed threads per block from the kernel (depends on the number of registers)
         max_threads_per_block = self._query_kernel.attributes['max_threads_per_block']
@@ -441,12 +433,12 @@ def query_radius(self, queries: cp.ndarray, *args):
                                                  cp.uint32(self.root_node),
                                                  cp.float32(self.radius),
                                                  queries,
+                                                 sorted_indices,
                                                  queries.shape[0],
                                                  *args))
         stream.synchronize()
 
-        if self.sort_queries:
-            return sorted_indices
+        return args
 
     def tree_data(self, numpy: bool = True):
         """

diff --git a/examples/benchmark_knn_query.py b/examples/benchmark_knn_query.py
@@ -14,14 +14,19 @@
 parser.add_argument("-c", "--compact", action='store_true', help="Compact the tree to remove unused spaces in between")
 parser.add_argument("-s", "--shrink_to_fit", action='store_true', help="Shrink to buffer size of the tree to match "
                                                                        "it's content after compaction")
+parser.add_argument("--sort", action='store_true', help="Sort the queries by their morton index before execution")
 
 args = parser.parse_args()
 pc = PlyData.read(args.input_file)
 
 points = np.stack([pc.elements[0].data['x'], pc.elements[0].data['y'], pc.elements[0].data['z']], axis=1)
 
-print(f"Benchmarking KNN Search for {points.shape[0]} points with leafsize {args.leafsize} and compact={args.compact} + shrink to fit={args.shrink_to_fit}.")
-lbvh = LBVHIndex(leaf_size=args.leafsize, compact=args.compact, shrink_to_fit=args.shrink_to_fit)
+print(f"Benchmarking KNN Search for {points.shape[0]} points with leafsize {args.leafsize} and compact={args.compact} + "
+      f"shrink to fit={args.shrink_to_fit} + sort={args.sort}.")
+lbvh = LBVHIndex(leaf_size=args.leafsize,
+                 compact=args.compact,
+                 shrink_to_fit=args.shrink_to_fit,
+                 sort_queries=args.sort)
 print(f"Run times for {points.shape[0]} queries with k={args.knn} and r={args.radius}: ")
 
 # build the index

diff --git a/examples/cuda/custom_knn_kernel.cu b/examples/cuda/custom_knn_kernel.cu
@@ -7,21 +7,25 @@ __global__ void custom_knn_kernel(const BVHNode *nodes,
                                  const unsigned int* __restrict__ sorted_indices,
                                  unsigned int root_index,
                                  const float max_radius,
-                                 const float3* __restrict__ queries,
+                                 const float3* __restrict__ query_points,
+                                 const unsigned int* __restrict__ sorted_queries,
                                  unsigned int N,
                                  // custom argments
                                  float3* means_out)
 {
-    unsigned int query_idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (query_idx >= N)
+    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= N)
         return;
+    unsigned int query_idx = sorted_queries[idx];
 
-    auto queue = query_knn(nodes, points, sorted_indices, root_index, &queries[query_idx], max_radius);
+    auto queue = query_knn(nodes, points, sorted_indices, root_index, &query_points[query_idx], max_radius);
 
     // compute the mean of all nearest neighbors found as an example
     float3 sum = make_float3(0.0, 0.0, 0.0);
     for(int i=0; i<queue.size(); ++i) {
         sum += points[queue[i].id];
     }
+
+    // write the result back in an unsorted order
     means_out[query_idx] = sum / queue.size();
 }
diff --git a/examples/cuda/custom_radius_kernel.cu b/examples/cuda/custom_radius_kernel.cu
@@ -45,17 +45,20 @@ __global__ void custom_radius_kernel(const BVHNode *nodes,
                                   const unsigned int* __restrict__ sorted_indices,
                                   unsigned int root_index,
                                   const float max_radius,
-                                  const float3* __restrict__ queries,
+                                  const float3* __restrict__ query_points,
+                                  const unsigned int* __restrict__ sorted_queries,
                                   unsigned int N,
                                   // custom argments
                                   float3* means_out,
                                   unsigned int* num_neighbors_out)
 {
-    unsigned int query_idx = blockIdx.x * blockDim.x + threadIdx.x;
-    if (query_idx >= N)
+    unsigned int idx = blockIdx.x * blockDim.x + threadIdx.x;
+    if (idx >= N)
         return;
+    unsigned int query_idx = sorted_queries[idx];
+
     RadiusHandler handler(max_radius);
-    query(nodes, points, sorted_indices, root_index, &queries[query_idx], handler);
+    query(nodes, points, sorted_indices, root_index, &query_points[query_idx], handler);
     means_out[query_idx] = handler.mean();
     num_neighbors_out[query_idx] = handler.count();
 }