This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

deterministic option for reduce, scan, etc.

Mapping of thread blocks onto input data is now "run-to-run"
deterministic for all variants of global reduction on a given device.
(Scan was already deterministic)
dumerrill committed Aug 25, 2017
1 parent b165e1f commit f558c6d
Showing 31 changed files with 394 additions and 412 deletions.
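
The determinism described above comes from how tiles of input are assigned to thread blocks. The following is not CUB code but a minimal host-side sketch, under assumed names (BlockRange, StripMineRange), of the even-share/strip-mine idea: every block's tile schedule is a pure function of its block id, the grid size, the tile size, and the problem size, so the mapping is identical on every run. The dynamic GridQueue path removed in this commit instead handed out tiles through a global atomic counter, which made the block-to-tile assignment depend on scheduling order.

// Host-side sketch only (not the CUB implementation); names are illustrative.
#include <cstdio>

struct BlockRange            // stand-in for the fields a GridEvenShare descriptor exposes
{
    int block_offset;        // first item this block reduces
    int block_end;           // one past the last item this block may touch
    int block_stride;        // distance from one of this block's tiles to its next
};

// Pure function of (block_id, grid_size, num_items, tile_items): no atomics, no queue,
// hence the same assignment on every launch with the same parameters.
BlockRange StripMineRange(int block_id, int grid_size, int num_items, int tile_items)
{
    BlockRange r;
    r.block_offset = block_id * tile_items;     // block b starts at tile b
    r.block_end    = num_items;                 // and walks toward the end of the input
    r.block_stride = grid_size * tile_items;    // in strides of grid_size tiles
    return r;
}

int main()
{
    const int grid_size = 4, tile_items = 512, num_items = 10000;
    for (int b = 0; b < grid_size; ++b)
    {
        BlockRange r = StripMineRange(b, grid_size, num_items, tile_items);
        printf("block %d: tiles at %d, %d, ... (stride %d, end %d)\n",
               b, r.block_offset, r.block_offset + r.block_stride, r.block_stride, r.block_end);
    }
    return 0;
}

CUB's GridEvenShare plays this role in the diffs below.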
2 changes: 1 addition & 1 deletion CHANGE_LOG.TXT
@@ -171,7 +171,7 @@
 1.3.2 07/28/2014
     - Bug fixes:
         - Fix for cub::DeviceReduce where reductions of small problems
-          (small enough to only dispatch a single threadblock) would run in
+          (small enough to only dispatch a single thread block) would run in
           the default stream (stream zero) regardless of whether an alternate
           stream was specified.

136 changes: 23 additions & 113 deletions cub/agent/agent_reduce.cuh
@@ -38,7 +38,6 @@
 #include "../block/block_load.cuh"
 #include "../block/block_reduce.cuh"
 #include "../grid/grid_mapping.cuh"
-#include "../grid/grid_queue.cuh"
 #include "../grid/grid_even_share.cuh"
 #include "../util_type.cuh"
 #include "../iterator/cache_modified_input_iterator.cuh"
@@ -64,8 +63,7 @@ template <
     int _ITEMS_PER_THREAD,                      ///< Items per thread (per tile of input)
     int _VECTOR_LOAD_LENGTH,                    ///< Number of items per vectorized load
     BlockReduceAlgorithm _BLOCK_ALGORITHM,      ///< Cooperative block-wide reduction algorithm to use
-    CacheLoadModifier _LOAD_MODIFIER,           ///< Cache load modifier for reading input elements
-    GridMappingStrategy _GRID_MAPPING>          ///< How to map tiles of input onto thread blocks
+    CacheLoadModifier _LOAD_MODIFIER>           ///< Cache load modifier for reading input elements
 struct AgentReducePolicy
 {
     enum
@@ -77,7 +75,6 @@ struct AgentReducePolicy

     static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM;   ///< Cooperative block-wide reduction algorithm to use
     static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;          ///< Cache load modifier for reading input elements
-    static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING;          ///< How to map tiles of input onto thread blocks
 };

@@ -148,7 +145,6 @@ struct AgentReduce
     struct _TempStorage
     {
         typename BlockReduceT::TempStorage reduce;
-        OffsetT dequeue_offset;
     };

     /// Alias wrapper allowing storage to be unioned
@@ -314,36 +310,35 @@ struct AgentReduce
      */
     template <int CAN_VECTORIZE>
     __device__ __forceinline__ OutputT ConsumeRange(
-        OffsetT block_offset,                   ///< [in] Threadblock begin offset (inclusive)
-        OffsetT block_end,                      ///< [in] Threadblock end offset (exclusive)
+        GridEvenShare<OffsetT> &even_share,     ///< GridEvenShare descriptor
         Int2Type<CAN_VECTORIZE> can_vectorize)  ///< Whether or not we can vectorize loads
     {
         OutputT thread_aggregate;

-        if (block_offset + TILE_ITEMS > block_end)
+        if (even_share.block_offset + TILE_ITEMS > even_share.block_end)
         {
             // First tile isn't full (not all threads have valid items)
-            int valid_items = block_end - block_offset;
-            ConsumeTile<true>(thread_aggregate, block_offset, valid_items, Int2Type<false>(), can_vectorize);
+            int valid_items = even_share.block_end - even_share.block_offset;
+            ConsumeTile<true>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
             return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
         }

         // At least one full block
-        ConsumeTile<true>(thread_aggregate, block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
-        block_offset += TILE_ITEMS;
+        ConsumeTile<true>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
+        even_share.block_offset += even_share.block_stride;

         // Consume subsequent full tiles of input
-        while (block_offset + TILE_ITEMS <= block_end)
+        while (even_share.block_offset + TILE_ITEMS <= even_share.block_end)
         {
-            ConsumeTile<false>(thread_aggregate, block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
-            block_offset += TILE_ITEMS;
+            ConsumeTile<false>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
+            even_share.block_offset += even_share.block_stride;
         }

         // Consume a partially-full tile
-        if (block_offset < block_end)
+        if (even_share.block_offset < even_share.block_end)
         {
-            int valid_items = block_end - block_offset;
-            ConsumeTile<false>(thread_aggregate, block_offset, valid_items, Int2Type<false>(), can_vectorize);
+            int valid_items = even_share.block_end - even_share.block_offset;
+            ConsumeTile<false>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
         }

         // Compute block-wide reduction (all threads have valid items)
@@ -358,113 +353,28 @@ struct AgentReduce
         OffsetT block_offset,                   ///< [in] Threadblock begin offset (inclusive)
         OffsetT block_end)                      ///< [in] Threadblock end offset (exclusive)
     {
+        GridEvenShare<OffsetT> even_share;
+        even_share.template BlockInit<TILE_ITEMS>(block_offset, block_end);
+
         return (IsAligned(d_in + block_offset, Int2Type<ATTEMPT_VECTORIZATION>())) ?
-            ConsumeRange(block_offset, block_end, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
-            ConsumeRange(block_offset, block_end, Int2Type<false && ATTEMPT_VECTORIZATION>());
+            ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
+            ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
     }


     /**
      * Reduce a contiguous segment of input tiles
      */
     __device__ __forceinline__ OutputT ConsumeTiles(
-        OffsetT /*num_items*/,                                  ///< [in] Total number of global input items
-        GridEvenShare<OffsetT> &even_share,                     ///< [in] GridEvenShare descriptor
-        GridQueue<OffsetT> &/*queue*/,                          ///< [in,out] GridQueue descriptor
-        Int2Type<GRID_MAPPING_EVEN_SHARE> /*is_even_share*/)    ///< [in] Marker type indicating this is an even-share mapping
+        GridEvenShare<OffsetT> &even_share)                     ///< [in] GridEvenShare descriptor
     {
-        // Initialize even-share descriptor for this thread block
-        even_share.BlockInit();
+        // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block
+        even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_STRIP_MINE>();

         return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
-            ConsumeRange(even_share.block_offset, even_share.block_end, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
-            ConsumeRange(even_share.block_offset, even_share.block_end, Int2Type<false && ATTEMPT_VECTORIZATION>());
-
+            ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
+            ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
     }


-    //---------------------------------------------------------------------
-    // Dynamically consume tiles
-    //---------------------------------------------------------------------
-
-    /**
-     * Dequeue and reduce tiles of items as part of a inter-block reduction
-     */
-    template <int CAN_VECTORIZE>
-    __device__ __forceinline__ OutputT ConsumeTiles(
-        int num_items,                          ///< Total number of input items
-        GridQueue<OffsetT> queue,               ///< Queue descriptor for assigning tiles of work to thread blocks
-        Int2Type<CAN_VECTORIZE> can_vectorize)  ///< Whether or not we can vectorize loads
-    {
-        // We give each thread block at least one tile of input.
-        OutputT thread_aggregate;
-        OffsetT block_offset = blockIdx.x * TILE_ITEMS;
-        OffsetT even_share_base = gridDim.x * TILE_ITEMS;
-
-        if (block_offset + TILE_ITEMS > num_items)
-        {
-            // First tile isn't full (not all threads have valid items)
-            int valid_items = num_items - block_offset;
-            ConsumeTile<true>(thread_aggregate, block_offset, valid_items, Int2Type<false>(), can_vectorize);
-            return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
-        }
-
-        // Consume first full tile of input
-        ConsumeTile<true>(thread_aggregate, block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
-
-        if (num_items > even_share_base)
-        {
-            // Dequeue a tile of items
-            if (threadIdx.x == 0)
-                temp_storage.dequeue_offset = queue.Drain(TILE_ITEMS) + even_share_base;
-
-            CTA_SYNC();
-
-            // Grab tile offset and check if we're done with full tiles
-            block_offset = temp_storage.dequeue_offset;
-
-            // Consume more full tiles
-            while (block_offset + TILE_ITEMS <= num_items)
-            {
-                ConsumeTile<false>(thread_aggregate, block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
-
-                CTA_SYNC();
-
-                // Dequeue a tile of items
-                if (threadIdx.x == 0)
-                    temp_storage.dequeue_offset = queue.Drain(TILE_ITEMS) + even_share_base;
-
-                CTA_SYNC();
-
-                // Grab tile offset and check if we're done with full tiles
-                block_offset = temp_storage.dequeue_offset;
-            }
-
-            // Consume partial tile
-            if (block_offset < num_items)
-            {
-                int valid_items = num_items - block_offset;
-                ConsumeTile<false>(thread_aggregate, block_offset, valid_items, Int2Type<false>(), can_vectorize);
-            }
-        }
-
-        // Compute block-wide reduction (all threads have valid items)
-        return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op);
-
-    }
-
-    /**
-     * Dequeue and reduce tiles of items as part of a inter-block reduction
-     */
-    __device__ __forceinline__ OutputT ConsumeTiles(
-        OffsetT num_items,                                  ///< [in] Total number of global input items
-        GridEvenShare<OffsetT> &/*even_share*/,             ///< [in] GridEvenShare descriptor
-        GridQueue<OffsetT> &queue,                          ///< [in,out] GridQueue descriptor
-        Int2Type<GRID_MAPPING_DYNAMIC> /*is_dynamic*/)      ///< [in] Marker type indicating this is a dynamic mapping
-    {
-        return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
-            ConsumeTiles(num_items, queue, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
-            ConsumeTiles(num_items, queue, Int2Type<false && ATTEMPT_VECTORIZATION>());
-    }

 };
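For readers outside the AgentReduce machinery, here is a simplified standalone kernel that mirrors the rewritten ConsumeRange above: walk full tiles while advancing by the block stride, mop up a trailing partially-full tile with guarded loads, then fold the per-thread sums with a block-wide reduction. All names (StripMineReduce, ConsumeTile, d_block_partials) are illustrative, and a plain shared-memory tree stands in for BlockReduceT; it is a sketch of the pattern, not the CUB agent itself.

// Sketch only: launch with BLOCK_THREADS threads per block; assumes a float sum reduction.
#include <cuda_runtime.h>

constexpr int BLOCK_THREADS    = 128;
constexpr int ITEMS_PER_THREAD = 4;
constexpr int TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD;

// Each thread strides through one tile; the valid_items guard handles partial tiles.
__device__ float ConsumeTile(const float *d_in, int tile_offset, int valid_items)
{
    float sum = 0.0f;
    for (int i = threadIdx.x; i < valid_items; i += BLOCK_THREADS)
        sum += d_in[tile_offset + i];
    return sum;
}

__global__ void StripMineReduce(const float *d_in, float *d_block_partials, int num_items)
{
    // Even-share strip-mine descriptor, derived purely from block id and grid size.
    int block_offset = blockIdx.x * TILE_ITEMS;
    int block_end    = num_items;
    int block_stride = gridDim.x * TILE_ITEMS;

    float thread_aggregate = 0.0f;

    // Consume full tiles of input.
    while (block_offset + TILE_ITEMS <= block_end)
    {
        thread_aggregate += ConsumeTile(d_in, block_offset, TILE_ITEMS);
        block_offset += block_stride;
    }

    // Consume a trailing partially-full tile, if any.
    if (block_offset < block_end)
        thread_aggregate += ConsumeTile(d_in, block_offset, block_end - block_offset);

    // Block-wide tree reduction in shared memory (stand-in for cub::BlockReduce).
    __shared__ float smem[BLOCK_THREADS];
    smem[threadIdx.x] = thread_aggregate;
    __syncthreads();
    for (int stride = BLOCK_THREADS / 2; stride > 0; stride /= 2)
    {
        if (threadIdx.x < stride)
            smem[threadIdx.x] += smem[threadIdx.x + stride];
        __syncthreads();
    }
    if (threadIdx.x == 0)
        d_block_partials[blockIdx.x] = smem[0];
}

Because block_offset, block_end, and block_stride depend only on launch parameters, each block folds the same items in the same order every time, which is exactly the run-to-run determinism the commit message claims.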
2 changes: 1 addition & 1 deletion cub/agent/agent_reduce_by_key.cuh
@@ -223,7 +223,7 @@ struct AgentReduceByKey
     typedef KeyOutputT KeyExchangeT[TILE_ITEMS + 1];
     typedef ValueOutputT ValueExchangeT[TILE_ITEMS + 1];

-    // Shared memory type for this threadblock
+    // Shared memory type for this thread block
     union _TempStorage
     {
         struct
2 changes: 1 addition & 1 deletion cub/agent/agent_rle.cuh
@@ -208,7 +208,7 @@ struct AgentRle

     typedef LengthOffsetPair WarpAggregates[WARPS];

-    // Shared memory type for this threadblock
+    // Shared memory type for this thread block
     struct _TempStorage
     {
         // Aliasable storage layout
2 changes: 1 addition & 1 deletion cub/agent/agent_scan.cuh
@@ -163,7 +163,7 @@ struct AgentScan
         ScanOpT>
         RunningPrefixCallbackOp;

-    // Shared memory type for this threadblock
+    // Shared memory type for this thread block
     union _TempStorage
     {
         typename BlockLoadT::TempStorage load;   // Smem needed for tile loading
2 changes: 1 addition & 1 deletion cub/agent/agent_segment_fixup.cuh
@@ -162,7 +162,7 @@ struct AgentSegmentFixup
         ScanTileStateT>
         TilePrefixCallbackOpT;

-    // Shared memory type for this threadblock
+    // Shared memory type for this thread block
     union _TempStorage
     {
         struct
2 changes: 1 addition & 1 deletion cub/agent/agent_select_if.cuh
@@ -192,7 +192,7 @@ struct AgentSelectIf
     // Item exchange type
     typedef OutputT ItemExchangeT[TILE_ITEMS];

-    // Shared memory type for this threadblock
+    // Shared memory type for this thread block
     union _TempStorage
     {
         struct
6 changes: 3 additions & 3 deletions cub/block/block_radix_rank.cuh
@@ -28,7 +28,7 @@

 /**
  * \file
- * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA threadblock
+ * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block
  */

 #pragma once
@@ -51,7 +51,7 @@ CUB_NS_PREFIX
 namespace cub {

 /**
- * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA threadblock.
+ * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
  * \ingroup BlockModule
  *
  * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
@@ -398,7 +398,7 @@ public:
         // Extract the local ranks of each key
         for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
        {
-            // Add in threadblock exclusive prefix
+            // Add in thread block exclusive prefix
             ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM];
         }
     }
8 changes: 4 additions & 4 deletions cub/block/block_reduce.cuh
@@ -55,7 +55,7 @@ namespace cub {

 /**
  * BlockReduceAlgorithm enumerates alternative algorithms for parallel
- * reduction across a CUDA threadblock.
+ * reduction across a CUDA thread block.
  */
 enum BlockReduceAlgorithm
 {
@@ -76,7 +76,7 @@ enum BlockReduceAlgorithm
  *
  * \par
  * \image html block_reduce.png
- * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
+ * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
  *
  * \par Performance Considerations
  * - This variant performs less communication than BLOCK_REDUCE_RAKING_NON_COMMUTATIVE
@@ -106,7 +106,7 @@ enum BlockReduceAlgorithm
  *
  * \par
  * \image html block_reduce.png
- * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
+ * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
  *
  * \par Performance Considerations
  * - This variant performs more communication than BLOCK_REDUCE_RAKING
@@ -137,7 +137,7 @@ enum BlockReduceAlgorithm
  *
  * \par
  * \image html block_scan_warpscans.png
- * <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
+ * <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
  *
  * \par Performance Considerations
  * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING
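For context on how a BlockReduceAlgorithm from the enum above gets used: it is supplied as a template parameter when specializing cub::BlockReduce. The kernel below is a minimal usage sketch; the kernel name and the one-item-per-thread layout are illustrative, while the BlockReduce/TempStorage/Sum calls follow the library's documented interface.

#include <cub/block/block_reduce.cuh>

__global__ void BlockSumKernel(const int *d_in, int *d_out)
{
    // Specialize BlockReduce for 128 threads and an explicit reduction algorithm.
    typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_WARP_REDUCTIONS> BlockReduce;

    // Shared memory required by the collective.
    __shared__ typename BlockReduce::TempStorage temp_storage;

    // One input item per thread (illustrative layout).
    int thread_data = d_in[blockIdx.x * 128 + threadIdx.x];

    // Block-wide sum; the valid aggregate is returned to thread 0 only.
    int aggregate = BlockReduce(temp_storage).Sum(thread_data);

    if (threadIdx.x == 0)
        d_out[blockIdx.x] = aggregate;
}

Omitting the third template argument falls back to the library's default algorithm (BLOCK_REDUCE_WARP_REDUCTIONS in this era of the library), so the explicit parameter mainly matters when tuning for a specific operator or architecture.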