ROCm · stanleytsang-amd · Mar 25, 2025 · Mar 17, 2025 · Mar 17, 2025 · Mar 17, 2025
@@ -32,6 +32,8 @@ Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projec
 
 ### Upcoming changes
 
+* The next major release may change the template parameters of warp and block algorithms.
+
 * The default scan accumulator types for device-level scan algorithms will be changed. This is a breaking change.
 
 Previously, the default accumulator type was set to the input type for inclusive scans and to the initial value type for exclusive scans. These default types could cause unexpected overflow in situations where the input or initial type is smaller than the output type when the user doesn't explicitly set an accumulator type using the `AccType` template parameter.
@@ -56,6 +58,13 @@ The following is the complete list of affected functions and how their default a
 
 * `rocprim::load_cs` and `rocprim::store_cs` are deprecated. Use `rocprim::load_nontemporal` and `rocprim::store_nontemporal` now.
 
+* Due to an upcoming compiler change the following warp size-related symbols will be removed in the next major release and are thus marked as deprecated:
+  * `rocprim::device_warp_size()`
+    * For compile-time constants, this is replaced with `rocprim::arch::wavefront::min_size()` and `rocprim::arch::wavefront::max_size()`. Use this when allocating global or shared memory.
+    * For run-time constants, this is replaced with `rocprim::arch::wavefront::size().`
+  * `rocprim::warp_size()`
+  * `ROCPRIM_WAVEFRONT_SIZE`
+
 ## rocPRIM 3.4.0 for ROCm 6.4.0
 
 ### Added

@@ -179,7 +179,7 @@ struct operation<atomics_inter_warp_collision, T, ItemsPerThread, BlockSize>
         (void)shared_storage;
         (void)shared_storage_size;
         (void)input;
-        unsigned int index = (threadIdx.x % rocprim::device_warp_size()) * ItemsPerThread
+        unsigned int index = (threadIdx.x % rocprim::arch::wavefront::min_size()) * ItemsPerThread
                              + blockIdx.x * blockDim.x * ItemsPerThread;
         ROCPRIM_UNROLL
         for(unsigned int i = 0; i < ItemsPerThread; ++i)

@@ -398,7 +398,7 @@ struct device_radix_sort_onesweep_benchmark_generator
     template<unsigned int ItemsPerThread, rocprim::block_radix_rank_algorithm RadixRankAlgorithm>
     static constexpr bool is_buildable()
     {
-        // Calculation uses `rocprim::device_warp_size()`, which is 64 on host side unless overridden.
+        // Calculation uses `rocprim::arch::wavefront::min_size()`, which is 64 on host side unless overridden.
         //   However, this does not affect the total size of shared memory for the current configuration space.
         //   Were the implementation to change, causing retuning, this needs to be re-evaluated and possibly taken into account.
         using sharedmem_storage = typename rocprim::detail::onesweep_iteration_helper<

@@ -34,6 +34,7 @@
 #include <rocprim/config.hpp>
 #include <rocprim/device/config_types.hpp>
 #include <rocprim/device/detail/device_config_helper.hpp> // partition_config_params
+#include <rocprim/intrinsics/arch.hpp>
 #include <rocprim/intrinsics/thread.hpp>
 #include <rocprim/type_traits.hpp>
 #include <rocprim/type_traits_interface.hpp>

@@ -54,10 +54,10 @@
 
 namespace common
 {
-
 template<unsigned int LogicalWarpSize>
-__device__ constexpr bool device_test_enabled_for_warp_size_v
-    = ::rocprim::device_warp_size() >= LogicalWarpSize;
+__device__
+constexpr bool device_test_enabled_for_warp_size_v
+    = ::rocprim::arch::wavefront::min_size() >= LogicalWarpSize;
 
 inline char* __get_env(const char* name)
 {

@@ -26,6 +26,7 @@
 
 #include "../functional.hpp"
 #include "../intrinsics.hpp"
+#include "../intrinsics/arch.hpp"
 #include "../types.hpp"
 
 #include "config.hpp"
@@ -88,8 +89,8 @@ class block_exchange
 {
     static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
     // Select warp size
-    static constexpr unsigned int warp_size =
-        detail::get_min_warp_size(BlockSize, ::rocprim::device_warp_size());
+    static constexpr unsigned int warp_size
+        = detail::get_min_warp_size(BlockSize, ::rocprim::arch::wavefront::min_size());
     // Number of warps in block
     static constexpr unsigned int warps_no = ::rocprim::detail::ceiling_div(BlockSize, warp_size);
     static constexpr unsigned int banks_no = ::rocprim::detail::get_lds_banks_no();
@@ -656,16 +657,18 @@ class block_exchange
     ///     ...
     /// }
     /// \endcode
-    template<unsigned int WarpSize = device_warp_size(), class U, class Offset>
+    template<unsigned int WarpSize = arch::wavefront::min_size(), class U, class Offset>
     ROCPRIM_DEVICE ROCPRIM_INLINE
     void scatter_to_warp_striped(const T (&input)[ItemsPerThread],
                                  U (&output)[ItemsPerThread],
                                  const Offset (&ranks)[ItemsPerThread],
                                  storage_type& storage)
     {
-        static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
+        static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= arch::wavefront::max_size(),
                       "WarpSize must be a power of two and equal or less"
                       "than the size of hardware warp.");
+        assert(WarpSize <= arch::wavefront::size());
+
         const unsigned int flat_id
             = ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
         const unsigned int thread_id     = detail::logical_lane_id<WarpSize>();

@@ -770,9 +770,15 @@ class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_wa
     using block_exchange_type = block_exchange<T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ>;
 
 public:
-    ROCPRIM_DETAIL_DEVICE_STATIC_ASSERT(BlockSize % ::rocprim::device_warp_size() == 0,
+    ROCPRIM_DETAIL_DEVICE_STATIC_ASSERT(BlockSize % ::rocprim::arch::wavefront::min_size() == 0,
                                         "BlockSize must be a multiple of hardware warpsize");
 
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    block_load()
+    {
+        assert(BlockSize % ::rocprim::arch::wavefront::size() == 0);
+    }
+
     using storage_type = typename block_exchange_type::storage_type;
 
     template<class InputIterator>

@@ -24,9 +24,10 @@
 #include "../config.hpp"
 #include "../detail/various.hpp"
 
-#include "../intrinsics.hpp"
 #include "../functional.hpp"
+#include "../intrinsics.hpp"
 #include "../types.hpp"
+#include "rocprim/intrinsics/arch.hpp"
 
 /// \addtogroup blockmodule
 /// @{
@@ -367,20 +368,20 @@ void block_load_direct_striped(unsigned int flat_id,
 /// \param flat_id a local flat 1D thread id in a block (tile) for the calling thread
 /// \param block_input the input iterator from the thread block to load from
 /// \param items array that data is loaded to
-template<
-    unsigned int WarpSize = device_warp_size(),
-    class InputIterator,
-    class T,
-    unsigned int ItemsPerThread
->
+template<unsigned int WarpSize = arch::wavefront::min_size(),
+         class InputIterator,
+         class T,
+         unsigned int ItemsPerThread>
 ROCPRIM_DEVICE ROCPRIM_INLINE
-void block_load_direct_warp_striped(unsigned int flat_id,
+void block_load_direct_warp_striped(unsigned int  flat_id,
                                     InputIterator block_input,
                                     T (&items)[ItemsPerThread])
 {
-    static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
-                 "WarpSize must be a power of two and equal or less"
-                 "than the size of hardware warp.");
+    static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= arch::wavefront::max_size(),
+                  "WarpSize must be a power of two and equal or less"
+                  "than the size of hardware warp.");
+    assert(WarpSize <= arch::wavefront::size());
+
     unsigned int thread_id = detail::logical_lane_id<WarpSize>();
     unsigned int warp_id = flat_id / WarpSize;
     unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread;
@@ -419,21 +420,21 @@ void block_load_direct_warp_striped(unsigned int flat_id,
 /// \param block_input the input iterator from the thread block to load from
 /// \param items array that data is loaded to
 /// \param valid maximum range of valid numbers to load
-template<
-    unsigned int WarpSize = device_warp_size(),
-    class InputIterator,
-    class T,
-    unsigned int ItemsPerThread
->
+template<unsigned int WarpSize = arch::wavefront::min_size(),
+         class InputIterator,
+         class T,
+         unsigned int ItemsPerThread>
 ROCPRIM_DEVICE ROCPRIM_INLINE
-void block_load_direct_warp_striped(unsigned int flat_id,
+void block_load_direct_warp_striped(unsigned int  flat_id,
                                     InputIterator block_input,
                                     T (&items)[ItemsPerThread],
                                     unsigned int valid)
 {
-    static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
-                 "WarpSize must be a power of two and equal or less"
-                 "than the size of hardware warp.");
+    static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= arch::wavefront::max_size(),
+                  "WarpSize must be a power of two and equal or less"
+                  "than the size of hardware warp.");
+    assert(WarpSize <= arch::wavefront::size());
+
     unsigned int thread_id = detail::logical_lane_id<WarpSize>();
     unsigned int warp_id = flat_id / WarpSize;
     unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread;
@@ -479,23 +480,23 @@ void block_load_direct_warp_striped(unsigned int flat_id,
 /// \param items array that data is loaded to
 /// \param valid maximum range of valid numbers to load
 /// \param out_of_bounds default value assigned to out-of-bound items
-template<
-    unsigned int WarpSize = device_warp_size(),
-    class InputIterator,
-    class T,
-    unsigned int ItemsPerThread,
-    class Default
->
+template<unsigned int WarpSize = arch::wavefront::min_size(),
+         class InputIterator,
+         class T,
+         unsigned int ItemsPerThread,
+         class Default>
 ROCPRIM_DEVICE ROCPRIM_INLINE
-void block_load_direct_warp_striped(unsigned int flat_id,
+void block_load_direct_warp_striped(unsigned int  flat_id,
                                     InputIterator block_input,
                                     T (&items)[ItemsPerThread],
                                     unsigned int valid,
-                                    Default out_of_bounds)
+                                    Default      out_of_bounds)
 {
-    static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
-                 "WarpSize must be a power of two and equal or less"
-                 "than the size of hardware warp.");
+    static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= arch::wavefront::max_size(),
+                  "WarpSize must be a power of two and equal or less"
+                  "than the size of hardware warp.");
+    assert(WarpSize <= arch::wavefront::size());
+
     ROCPRIM_UNROLL
     for (unsigned int item = 0; item < ItemsPerThread; item++)
     {

@@ -34,6 +34,7 @@
 #include "block_exchange.hpp"
 #include "block_radix_rank.hpp"
 #include "rocprim/block/config.hpp"
+#include "rocprim/intrinsics/arch.hpp"
 
 /// \addtogroup blockmodule
 /// @{
@@ -102,10 +103,11 @@ template<class Key,
          unsigned int BlockSizeY = 1,
          unsigned int BlockSizeZ = 1,
          unsigned int RadixBitsPerPass
-         = (BlockSizeX * BlockSizeY * BlockSizeZ) % device_warp_size() == 0 ? 8 /* match */
-                                                                            : 4 /* basic_memoize */,
+         = (BlockSizeX * BlockSizeY * BlockSizeZ) % arch::wavefront::min_size() == 0
+               ? 8 /* match */
+               : 4 /* basic_memoize */,
          block_radix_rank_algorithm RadixRankAlgorithm
-         = (BlockSizeX * BlockSizeY * BlockSizeZ) % device_warp_size() == 0
+         = (BlockSizeX * BlockSizeY * BlockSizeZ) % arch::wavefront::min_size() == 0
                ? block_radix_rank_algorithm::match
                : block_radix_rank_algorithm::basic_memoize,
          block_padding_hint PaddingHint = block_padding_hint::lds_occupancy_bound>
@@ -119,11 +121,10 @@ class block_radix_sort
     static constexpr bool         with_values = !std::is_same<Value, empty_type>::value;
     static constexpr bool warp_striped = RadixRankAlgorithm == block_radix_rank_algorithm::match;
 
-#if __HIP_DEVICE_COMPILE__
-    static_assert(!warp_striped || (BlockSize % device_warp_size()) == 0,
-                  "When using 'block_radix_rank_algorithm::match', the block size should be a "
-                  "multiple of the warp size");
-#endif
+    ROCPRIM_DETAIL_DEVICE_STATIC_ASSERT(
+        !warp_striped || (BlockSize % ::rocprim::arch::wavefront::min_size()) == 0,
+        "When using 'block_radix_rank_algorithm::match', the block size should be a "
+        "multiple of the warp size");
 
     static constexpr bool is_key_and_value_aligned
         = alignof(Key) == alignof(Value) && sizeof(Key) == sizeof(Value);
@@ -160,6 +161,12 @@ class block_radix_sort
     using storage_type = storage_type_; // only for Doxygen
 #endif
 
+        ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    block_radix_sort()
+    {
+        assert(BlockSize % ::rocprim::arch::wavefront::size() == 0);
+    }
+
     /// \brief Performs ascending radix sort over keys partitioned across threads in a block.
     ///
     /// \tparam Decomposer The type of the decomposer argument. Defaults to the identity decomposer.
@@ -1060,7 +1067,7 @@ class block_radix_sort
 
 private:
     static constexpr bool use_warp_exchange
-        = device_warp_size() % ItemsPerThread == 0 && ItemsPerThread <= 4;
+        = ::rocprim::arch::wavefront::min_size() % ItemsPerThread == 0 && ItemsPerThread <= 4;
 
     template<class SortedValue>
     ROCPRIM_DEVICE ROCPRIM_INLINE
@@ -1126,7 +1133,8 @@ class block_radix_sort
         {
             // This appears to be slower with high large items per thread.
             constexpr bool use_warp_exchange
-                = device_warp_size() % ItemsPerThread == 0 && ItemsPerThread <= 4;
+                = ::rocprim::arch::wavefront::min_size() % ItemsPerThread == 0
+                  && ItemsPerThread <= 4;
             blocked_to_warp_striped(keys,
                                     values,
                                     storage,

@@ -98,7 +98,7 @@ struct select_block_reduce_impl<block_reduce_algorithm::raking_reduce_commutativ
 ///   * \p ItemsPerThread is greater than one,
 ///   * \p T is an arithmetic type,
 ///   * reduce operation is simple addition operator, and
-///   * the number of threads in the block is a multiple of the hardware warp size (see rocprim::device_warp_size()).
+///   * the number of threads in the block is a multiple of the hardware warp size (see \p rocprim::arch::wavefront::min_size() ).
 /// * block_reduce has three alternative implementations: \p block_reduce_algorithm::using_warp_reduce,
 ///   \p block_reduce_algorithm::raking_reduce and \p block_reduce_algorithm::raking_reduce_commutative_only.
 /// * If the block sizes less than 64 only one warp reduction is used. The block reduction algorithm

@@ -29,8 +29,9 @@
 #include "../intrinsics.hpp"
 #include "../functional.hpp"
 
-#include "detail/block_scan_warp_scan.hpp"
 #include "detail/block_scan_reduce_then_scan.hpp"
+#include "detail/block_scan_warp_scan.hpp"
+#include "rocprim/intrinsics/arch.hpp"
 
 /// \addtogroup blockmodule
 /// @{
@@ -70,10 +71,9 @@ struct select_block_scan_impl<block_scan_algorithm::reduce_then_scan>
     // When BlockSize is less than hardware warp size block_scan_warp_scan performs better than
     // block_scan_reduce_then_scan by specializing for warps
     using type = typename std::conditional<
-                    (BlockSizeX * BlockSizeY * BlockSizeZ <= ::rocprim::device_warp_size()),
-                    block_scan_warp_scan<T, BlockSizeX, BlockSizeY, BlockSizeZ>,
-                    block_scan_reduce_then_scan<T, BlockSizeX, BlockSizeY, BlockSizeZ>
-                 >::type;
+        (BlockSizeX * BlockSizeY * BlockSizeZ <= ::rocprim::arch::wavefront::min_size()),
+        block_scan_warp_scan<T, BlockSizeX, BlockSizeY, BlockSizeZ>,
+        block_scan_reduce_then_scan<T, BlockSizeX, BlockSizeY, BlockSizeZ>>::type;
 };
 
 } // end namespace detail
@@ -96,7 +96,7 @@ struct select_block_scan_impl<block_scan_algorithm::reduce_then_scan>
 ///   * \p ItemsPerThread is greater than one,
 ///   * \p T is an arithmetic type,
 ///   * scan operation is simple addition operator, and
-///   * the number of threads in the block is a multiple of the hardware warp size (see rocprim::device_warp_size()).
+///   * the number of threads in the block is a multiple of the hardware warp size (see \p rocprim::arch::wavefront::min_size() ).
 /// * block_scan has two alternative implementations: \p block_scan_algorithm::using_warp_scan
 ///   and block_scan_algorithm::reduce_then_scan.
 ///

@@ -498,11 +498,17 @@ class block_store<T, BlockSizeX, ItemsPerThread, block_store_method::block_store
     using block_exchange_type = block_exchange<T, BlockSize, ItemsPerThread>;
 
 public:
-    ROCPRIM_DETAIL_DEVICE_STATIC_ASSERT(BlockSize % ::rocprim::device_warp_size() == 0,
+    ROCPRIM_DETAIL_DEVICE_STATIC_ASSERT(BlockSize % ::rocprim::arch::wavefront::min_size() == 0,
                                         "BlockSize must be a multiple of hardware warpsize");
 
     using storage_type = typename block_exchange_type::storage_type;
 
+    ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
+    block_store()
+    {
+        assert(BlockSize % ::rocprim::arch::wavefront::size() == 0);
+    }
+
     template<class OutputIterator>
     ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
     void store(OutputIterator block_output,