Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

refactor: replace old warp size api #705

Merged
merged 10 commits into from
Mar 25, 2025
9 changes: 9 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
@@ -32,6 +32,8 @@ Full documentation for rocPRIM is available at [https://rocm.docs.amd.com/projec

### Upcoming changes

* The next major release may change the template parameters of warp and block algorithms.

* The default scan accumulator types for device-level scan algorithms will be changed. This is a breaking change.

Previously, the default accumulator type was set to the input type for inclusive scans and to the initial value type for exclusive scans. These default types could cause unexpected overflow in situations where the input or initial type is smaller than the output type when the user doesn't explicitly set an accumulator type using the `AccType` template parameter.
@@ -56,6 +58,13 @@ The following is the complete list of affected functions and how their default a

* `rocprim::load_cs` and `rocprim::store_cs` are deprecated. Use `rocprim::load_nontemporal` and `rocprim::store_nontemporal` now.

* Due to an upcoming compiler change the following warp size-related symbols will be removed in the next major release and are thus marked as deprecated:
* `rocprim::device_warp_size()`
* For compile-time constants, this is replaced with `rocprim::arch::wavefront::min_size()` and `rocprim::arch::wavefront::max_size()`. Use this when allocating global or shared memory.
* For run-time constants, this is replaced with `rocprim::arch::wavefront::size().`
* `rocprim::warp_size()`
* `ROCPRIM_WAVEFRONT_SIZE`

## rocPRIM 3.4.0 for ROCm 6.4.0

### Added
2 changes: 1 addition & 1 deletion benchmark/benchmark_device_memory.cpp
Original file line number Diff line number Diff line change
@@ -179,7 +179,7 @@ struct operation<atomics_inter_warp_collision, T, ItemsPerThread, BlockSize>
(void)shared_storage;
(void)shared_storage_size;
(void)input;
unsigned int index = (threadIdx.x % rocprim::device_warp_size()) * ItemsPerThread
unsigned int index = (threadIdx.x % rocprim::arch::wavefront::min_size()) * ItemsPerThread
+ blockIdx.x * blockDim.x * ItemsPerThread;
ROCPRIM_UNROLL
for(unsigned int i = 0; i < ItemsPerThread; ++i)
Original file line number Diff line number Diff line change
@@ -398,7 +398,7 @@ struct device_radix_sort_onesweep_benchmark_generator
template<unsigned int ItemsPerThread, rocprim::block_radix_rank_algorithm RadixRankAlgorithm>
static constexpr bool is_buildable()
{
// Calculation uses `rocprim::device_warp_size()`, which is 64 on host side unless overridden.
// Calculation uses `rocprim::arch::wavefront::min_size()`, which is 64 on host side unless overridden.
// However, this does not affect the total size of shared memory for the current configuration space.
// Were the implementation to change, causing retuning, this needs to be re-evaluated and possibly taken into account.
using sharedmem_storage = typename rocprim::detail::onesweep_iteration_helper<
1 change: 1 addition & 0 deletions benchmark/benchmark_utils.hpp
Original file line number Diff line number Diff line change
@@ -34,6 +34,7 @@
#include <rocprim/config.hpp>
#include <rocprim/device/config_types.hpp>
#include <rocprim/device/detail/device_config_helper.hpp> // partition_config_params
#include <rocprim/intrinsics/arch.hpp>
#include <rocprim/intrinsics/thread.hpp>
#include <rocprim/type_traits.hpp>
#include <rocprim/type_traits_interface.hpp>
6 changes: 3 additions & 3 deletions common/utils.hpp
Original file line number Diff line number Diff line change
@@ -54,10 +54,10 @@

namespace common
{

template<unsigned int LogicalWarpSize>
__device__ constexpr bool device_test_enabled_for_warp_size_v
= ::rocprim::device_warp_size() >= LogicalWarpSize;
__device__
constexpr bool device_test_enabled_for_warp_size_v
= ::rocprim::arch::wavefront::min_size() >= LogicalWarpSize;

inline char* __get_env(const char* name)
{
11 changes: 7 additions & 4 deletions rocprim/include/rocprim/block/block_exchange.hpp
Original file line number Diff line number Diff line change
@@ -26,6 +26,7 @@

#include "../functional.hpp"
#include "../intrinsics.hpp"
#include "../intrinsics/arch.hpp"
#include "../types.hpp"

#include "config.hpp"
@@ -88,8 +89,8 @@ class block_exchange
{
static constexpr unsigned int BlockSize = BlockSizeX * BlockSizeY * BlockSizeZ;
// Select warp size
static constexpr unsigned int warp_size =
detail::get_min_warp_size(BlockSize, ::rocprim::device_warp_size());
static constexpr unsigned int warp_size
= detail::get_min_warp_size(BlockSize, ::rocprim::arch::wavefront::min_size());
// Number of warps in block
static constexpr unsigned int warps_no = ::rocprim::detail::ceiling_div(BlockSize, warp_size);
static constexpr unsigned int banks_no = ::rocprim::detail::get_lds_banks_no();
@@ -656,16 +657,18 @@ class block_exchange
/// ...
/// }
/// \endcode
template<unsigned int WarpSize = device_warp_size(), class U, class Offset>
template<unsigned int WarpSize = arch::wavefront::min_size(), class U, class Offset>
ROCPRIM_DEVICE ROCPRIM_INLINE
void scatter_to_warp_striped(const T (&input)[ItemsPerThread],
U (&output)[ItemsPerThread],
const Offset (&ranks)[ItemsPerThread],
storage_type& storage)
{
static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= arch::wavefront::max_size(),
"WarpSize must be a power of two and equal or less"
"than the size of hardware warp.");
assert(WarpSize <= arch::wavefront::size());

const unsigned int flat_id
= ::rocprim::flat_block_thread_id<BlockSizeX, BlockSizeY, BlockSizeZ>();
const unsigned int thread_id = detail::logical_lane_id<WarpSize>();
8 changes: 7 additions & 1 deletion rocprim/include/rocprim/block/block_load.hpp
Original file line number Diff line number Diff line change
@@ -770,9 +770,15 @@ class block_load<T, BlockSizeX, ItemsPerThread, block_load_method::block_load_wa
using block_exchange_type = block_exchange<T, BlockSizeX, ItemsPerThread, BlockSizeY, BlockSizeZ>;

public:
ROCPRIM_DETAIL_DEVICE_STATIC_ASSERT(BlockSize % ::rocprim::device_warp_size() == 0,
ROCPRIM_DETAIL_DEVICE_STATIC_ASSERT(BlockSize % ::rocprim::arch::wavefront::min_size() == 0,
"BlockSize must be a multiple of hardware warpsize");

ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
block_load()
{
assert(BlockSize % ::rocprim::arch::wavefront::size() == 0);
}

using storage_type = typename block_exchange_type::storage_type;

template<class InputIterator>
67 changes: 34 additions & 33 deletions rocprim/include/rocprim/block/block_load_func.hpp
Original file line number Diff line number Diff line change
@@ -24,9 +24,10 @@
#include "../config.hpp"
#include "../detail/various.hpp"

#include "../intrinsics.hpp"
#include "../functional.hpp"
#include "../intrinsics.hpp"
#include "../types.hpp"
#include "rocprim/intrinsics/arch.hpp"

/// \addtogroup blockmodule
/// @{
@@ -367,20 +368,20 @@ void block_load_direct_striped(unsigned int flat_id,
/// \param flat_id a local flat 1D thread id in a block (tile) for the calling thread
/// \param block_input the input iterator from the thread block to load from
/// \param items array that data is loaded to
template<
unsigned int WarpSize = device_warp_size(),
class InputIterator,
class T,
unsigned int ItemsPerThread
>
template<unsigned int WarpSize = arch::wavefront::min_size(),
class InputIterator,
class T,
unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_warp_striped(unsigned int flat_id,
void block_load_direct_warp_striped(unsigned int flat_id,
InputIterator block_input,
T (&items)[ItemsPerThread])
{
static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
"WarpSize must be a power of two and equal or less"
"than the size of hardware warp.");
static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= arch::wavefront::max_size(),
"WarpSize must be a power of two and equal or less"
"than the size of hardware warp.");
assert(WarpSize <= arch::wavefront::size());

unsigned int thread_id = detail::logical_lane_id<WarpSize>();
unsigned int warp_id = flat_id / WarpSize;
unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread;
@@ -419,21 +420,21 @@ void block_load_direct_warp_striped(unsigned int flat_id,
/// \param block_input the input iterator from the thread block to load from
/// \param items array that data is loaded to
/// \param valid maximum range of valid numbers to load
template<
unsigned int WarpSize = device_warp_size(),
class InputIterator,
class T,
unsigned int ItemsPerThread
>
template<unsigned int WarpSize = arch::wavefront::min_size(),
class InputIterator,
class T,
unsigned int ItemsPerThread>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_warp_striped(unsigned int flat_id,
void block_load_direct_warp_striped(unsigned int flat_id,
InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid)
{
static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
"WarpSize must be a power of two and equal or less"
"than the size of hardware warp.");
static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= arch::wavefront::max_size(),
"WarpSize must be a power of two and equal or less"
"than the size of hardware warp.");
assert(WarpSize <= arch::wavefront::size());

unsigned int thread_id = detail::logical_lane_id<WarpSize>();
unsigned int warp_id = flat_id / WarpSize;
unsigned int warp_offset = warp_id * WarpSize * ItemsPerThread;
@@ -479,23 +480,23 @@ void block_load_direct_warp_striped(unsigned int flat_id,
/// \param items array that data is loaded to
/// \param valid maximum range of valid numbers to load
/// \param out_of_bounds default value assigned to out-of-bound items
template<
unsigned int WarpSize = device_warp_size(),
class InputIterator,
class T,
unsigned int ItemsPerThread,
class Default
>
template<unsigned int WarpSize = arch::wavefront::min_size(),
class InputIterator,
class T,
unsigned int ItemsPerThread,
class Default>
ROCPRIM_DEVICE ROCPRIM_INLINE
void block_load_direct_warp_striped(unsigned int flat_id,
void block_load_direct_warp_striped(unsigned int flat_id,
InputIterator block_input,
T (&items)[ItemsPerThread],
unsigned int valid,
Default out_of_bounds)
Default out_of_bounds)
{
static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= device_warp_size(),
"WarpSize must be a power of two and equal or less"
"than the size of hardware warp.");
static_assert(detail::is_power_of_two(WarpSize) && WarpSize <= arch::wavefront::max_size(),
"WarpSize must be a power of two and equal or less"
"than the size of hardware warp.");
assert(WarpSize <= arch::wavefront::size());

ROCPRIM_UNROLL
for (unsigned int item = 0; item < ItemsPerThread; item++)
{
28 changes: 18 additions & 10 deletions rocprim/include/rocprim/block/block_radix_sort.hpp
Original file line number Diff line number Diff line change
@@ -34,6 +34,7 @@
#include "block_exchange.hpp"
#include "block_radix_rank.hpp"
#include "rocprim/block/config.hpp"
#include "rocprim/intrinsics/arch.hpp"

/// \addtogroup blockmodule
/// @{
@@ -102,10 +103,11 @@ template<class Key,
unsigned int BlockSizeY = 1,
unsigned int BlockSizeZ = 1,
unsigned int RadixBitsPerPass
= (BlockSizeX * BlockSizeY * BlockSizeZ) % device_warp_size() == 0 ? 8 /* match */
: 4 /* basic_memoize */,
= (BlockSizeX * BlockSizeY * BlockSizeZ) % arch::wavefront::min_size() == 0
? 8 /* match */
: 4 /* basic_memoize */,
block_radix_rank_algorithm RadixRankAlgorithm
= (BlockSizeX * BlockSizeY * BlockSizeZ) % device_warp_size() == 0
= (BlockSizeX * BlockSizeY * BlockSizeZ) % arch::wavefront::min_size() == 0
? block_radix_rank_algorithm::match
: block_radix_rank_algorithm::basic_memoize,
block_padding_hint PaddingHint = block_padding_hint::lds_occupancy_bound>
@@ -119,11 +121,10 @@ class block_radix_sort
static constexpr bool with_values = !std::is_same<Value, empty_type>::value;
static constexpr bool warp_striped = RadixRankAlgorithm == block_radix_rank_algorithm::match;

#if __HIP_DEVICE_COMPILE__
static_assert(!warp_striped || (BlockSize % device_warp_size()) == 0,
"When using 'block_radix_rank_algorithm::match', the block size should be a "
"multiple of the warp size");
#endif
ROCPRIM_DETAIL_DEVICE_STATIC_ASSERT(
!warp_striped || (BlockSize % ::rocprim::arch::wavefront::min_size()) == 0,
"When using 'block_radix_rank_algorithm::match', the block size should be a "
"multiple of the warp size");

static constexpr bool is_key_and_value_aligned
= alignof(Key) == alignof(Value) && sizeof(Key) == sizeof(Value);
@@ -160,6 +161,12 @@ class block_radix_sort
using storage_type = storage_type_; // only for Doxygen
#endif

ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
block_radix_sort()
{
assert(BlockSize % ::rocprim::arch::wavefront::size() == 0);
}

/// \brief Performs ascending radix sort over keys partitioned across threads in a block.
///
/// \tparam Decomposer The type of the decomposer argument. Defaults to the identity decomposer.
@@ -1060,7 +1067,7 @@ class block_radix_sort

private:
static constexpr bool use_warp_exchange
= device_warp_size() % ItemsPerThread == 0 && ItemsPerThread <= 4;
= ::rocprim::arch::wavefront::min_size() % ItemsPerThread == 0 && ItemsPerThread <= 4;

template<class SortedValue>
ROCPRIM_DEVICE ROCPRIM_INLINE
@@ -1126,7 +1133,8 @@ class block_radix_sort
{
// This appears to be slower with high large items per thread.
constexpr bool use_warp_exchange
= device_warp_size() % ItemsPerThread == 0 && ItemsPerThread <= 4;
= ::rocprim::arch::wavefront::min_size() % ItemsPerThread == 0
&& ItemsPerThread <= 4;
blocked_to_warp_striped(keys,
values,
storage,
2 changes: 1 addition & 1 deletion rocprim/include/rocprim/block/block_reduce.hpp
Original file line number Diff line number Diff line change
@@ -98,7 +98,7 @@ struct select_block_reduce_impl<block_reduce_algorithm::raking_reduce_commutativ
/// * \p ItemsPerThread is greater than one,
/// * \p T is an arithmetic type,
/// * reduce operation is simple addition operator, and
/// * the number of threads in the block is a multiple of the hardware warp size (see rocprim::device_warp_size()).
/// * the number of threads in the block is a multiple of the hardware warp size (see \p rocprim::arch::wavefront::min_size() ).
/// * block_reduce has three alternative implementations: \p block_reduce_algorithm::using_warp_reduce,
/// \p block_reduce_algorithm::raking_reduce and \p block_reduce_algorithm::raking_reduce_commutative_only.
/// * If the block sizes less than 64 only one warp reduction is used. The block reduction algorithm
12 changes: 6 additions & 6 deletions rocprim/include/rocprim/block/block_scan.hpp
Original file line number Diff line number Diff line change
@@ -29,8 +29,9 @@
#include "../intrinsics.hpp"
#include "../functional.hpp"

#include "detail/block_scan_warp_scan.hpp"
#include "detail/block_scan_reduce_then_scan.hpp"
#include "detail/block_scan_warp_scan.hpp"
#include "rocprim/intrinsics/arch.hpp"

/// \addtogroup blockmodule
/// @{
@@ -70,10 +71,9 @@ struct select_block_scan_impl<block_scan_algorithm::reduce_then_scan>
// When BlockSize is less than hardware warp size block_scan_warp_scan performs better than
// block_scan_reduce_then_scan by specializing for warps
using type = typename std::conditional<
(BlockSizeX * BlockSizeY * BlockSizeZ <= ::rocprim::device_warp_size()),
block_scan_warp_scan<T, BlockSizeX, BlockSizeY, BlockSizeZ>,
block_scan_reduce_then_scan<T, BlockSizeX, BlockSizeY, BlockSizeZ>
>::type;
(BlockSizeX * BlockSizeY * BlockSizeZ <= ::rocprim::arch::wavefront::min_size()),
block_scan_warp_scan<T, BlockSizeX, BlockSizeY, BlockSizeZ>,
block_scan_reduce_then_scan<T, BlockSizeX, BlockSizeY, BlockSizeZ>>::type;
};

} // end namespace detail
@@ -96,7 +96,7 @@ struct select_block_scan_impl<block_scan_algorithm::reduce_then_scan>
/// * \p ItemsPerThread is greater than one,
/// * \p T is an arithmetic type,
/// * scan operation is simple addition operator, and
/// * the number of threads in the block is a multiple of the hardware warp size (see rocprim::device_warp_size()).
/// * the number of threads in the block is a multiple of the hardware warp size (see \p rocprim::arch::wavefront::min_size() ).
/// * block_scan has two alternative implementations: \p block_scan_algorithm::using_warp_scan
/// and block_scan_algorithm::reduce_then_scan.
///
8 changes: 7 additions & 1 deletion rocprim/include/rocprim/block/block_store.hpp
Original file line number Diff line number Diff line change
@@ -498,11 +498,17 @@ class block_store<T, BlockSizeX, ItemsPerThread, block_store_method::block_store
using block_exchange_type = block_exchange<T, BlockSize, ItemsPerThread>;

public:
ROCPRIM_DETAIL_DEVICE_STATIC_ASSERT(BlockSize % ::rocprim::device_warp_size() == 0,
ROCPRIM_DETAIL_DEVICE_STATIC_ASSERT(BlockSize % ::rocprim::arch::wavefront::min_size() == 0,
"BlockSize must be a multiple of hardware warpsize");

using storage_type = typename block_exchange_type::storage_type;

ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
block_store()
{
assert(BlockSize % ::rocprim::arch::wavefront::size() == 0);
}

template<class OutputIterator>
ROCPRIM_DEVICE ROCPRIM_FORCE_INLINE
void store(OutputIterator block_output,
Loading