This repository has been archived by the owner on Mar 21, 2024. It is now read-only.

deterministic option for reduce, scan, etc.

Mapping of thread blocks onto input data is now "run-to-run"
deterministic for all variants of global reduction on a given device.
(Scan was already deterministic)
dumerrill committed Aug 25, 2017
1 parent b165e1f commit f558c6d
Showing 31 changed files with 394 additions and 412 deletions.
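
The determinism described above comes from how tiles of input are assigned to thread blocks. The following is not CUB code but a minimal host-side sketch, under assumed names (BlockRange, StripMineRange), of the even-share/strip-mine idea: every block's tile schedule is a pure function of its block id, the grid size, the tile size, and the problem size, so the mapping is identical on every run. The dynamic GridQueue path removed in this commit instead handed out tiles through a global atomic counter, which made the block-to-tile assignment depend on scheduling order.

// Host-side sketch only (not the CUB implementation); names are illustrative.
#include <cstdio>

struct BlockRange            // stand-in for the fields a GridEvenShare descriptor exposes
{
    int block_offset;        // first item this block reduces
    int block_end;           // one past the last item this block may touch
    int block_stride;        // distance from one of this block's tiles to its next
};

// Pure function of (block_id, grid_size, num_items, tile_items): no atomics, no queue,
// hence the same assignment on every launch with the same parameters.
BlockRange StripMineRange(int block_id, int grid_size, int num_items, int tile_items)
{
    BlockRange r;
    r.block_offset = block_id * tile_items;     // block b starts at tile b
    r.block_end    = num_items;                 // and walks toward the end of the input
    r.block_stride = grid_size * tile_items;    // in strides of grid_size tiles
    return r;
}

int main()
{
    const int grid_size = 4, tile_items = 512, num_items = 10000;
    for (int b = 0; b < grid_size; ++b)
    {
        BlockRange r = StripMineRange(b, grid_size, num_items, tile_items);
        printf("block %d: tiles at %d, %d, ... (stride %d, end %d)\n",
               b, r.block_offset, r.block_offset + r.block_stride, r.block_stride, r.block_end);
    }
    return 0;
}

CUB's GridEvenShare plays this role in the diffs below.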
2 changes: 1 addition & 1 deletion CHANGE_LOG.TXT
@@ -171,7 +171,7 @@
 1.3.2 07/28/2014
     - Bug fixes:
         - Fix for cub::DeviceReduce where reductions of small problems
-          (small enough to only dispatch a single threadblock) would run in
+          (small enough to only dispatch a single thread block) would run in
           the default stream (stream zero) regardless of whether an alternate
           stream was specified.

136 changes: 23 additions & 113 deletions cub/agent/agent_reduce.cuh
@@ -38,7 +38,6 @@
 #include "../block/block_load.cuh"
 #include "../block/block_reduce.cuh"
 #include "../grid/grid_mapping.cuh"
-#include "../grid/grid_queue.cuh"
 #include "../grid/grid_even_share.cuh"
 #include "../util_type.cuh"
 #include "../iterator/cache_modified_input_iterator.cuh"
@@ -64,8 +63,7 @@ template <
     int _ITEMS_PER_THREAD,                      ///< Items per thread (per tile of input)
     int _VECTOR_LOAD_LENGTH,                    ///< Number of items per vectorized load
     BlockReduceAlgorithm _BLOCK_ALGORITHM,      ///< Cooperative block-wide reduction algorithm to use
-    CacheLoadModifier _LOAD_MODIFIER,           ///< Cache load modifier for reading input elements
-    GridMappingStrategy _GRID_MAPPING>          ///< How to map tiles of input onto thread blocks
+    CacheLoadModifier _LOAD_MODIFIER>           ///< Cache load modifier for reading input elements
 struct AgentReducePolicy
 {
     enum
@@ -77,7 +75,6 @@ struct AgentReducePolicy

     static const BlockReduceAlgorithm BLOCK_ALGORITHM = _BLOCK_ALGORITHM;   ///< Cooperative block-wide reduction algorithm to use
     static const CacheLoadModifier LOAD_MODIFIER = _LOAD_MODIFIER;          ///< Cache load modifier for reading input elements
-    static const GridMappingStrategy GRID_MAPPING = _GRID_MAPPING;          ///< How to map tiles of input onto thread blocks
 };

@@ -148,7 +145,6 @@ struct AgentReduce
     struct _TempStorage
     {
         typename BlockReduceT::TempStorage reduce;
-        OffsetT dequeue_offset;
     };

     /// Alias wrapper allowing storage to be unioned
@@ -314,36 +310,35 @@ struct AgentReduce
      */
     template <int CAN_VECTORIZE>
     __device__ __forceinline__ OutputT ConsumeRange(
-        OffsetT block_offset,                   ///< [in] Threadblock begin offset (inclusive)
-        OffsetT block_end,                      ///< [in] Threadblock end offset (exclusive)
+        GridEvenShare<OffsetT> &even_share,     ///< GridEvenShare descriptor
         Int2Type<CAN_VECTORIZE> can_vectorize)  ///< Whether or not we can vectorize loads
     {
         OutputT thread_aggregate;

-        if (block_offset + TILE_ITEMS > block_end)
+        if (even_share.block_offset + TILE_ITEMS > even_share.block_end)
         {
             // First tile isn't full (not all threads have valid items)
-            int valid_items = block_end - block_offset;
-            ConsumeTile<true>(thread_aggregate, block_offset, valid_items, Int2Type<false>(), can_vectorize);
+            int valid_items = even_share.block_end - even_share.block_offset;
+            ConsumeTile<true>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
             return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
         }

         // At least one full block
-        ConsumeTile<true>(thread_aggregate, block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
-        block_offset += TILE_ITEMS;
+        ConsumeTile<true>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
+        even_share.block_offset += even_share.block_stride;

         // Consume subsequent full tiles of input
-        while (block_offset + TILE_ITEMS <= block_end)
+        while (even_share.block_offset + TILE_ITEMS <= even_share.block_end)
         {
-            ConsumeTile<false>(thread_aggregate, block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
-            block_offset += TILE_ITEMS;
+            ConsumeTile<false>(thread_aggregate, even_share.block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
+            even_share.block_offset += even_share.block_stride;
         }

         // Consume a partially-full tile
-        if (block_offset < block_end)
+        if (even_share.block_offset < even_share.block_end)
         {
-            int valid_items = block_end - block_offset;
-            ConsumeTile<false>(thread_aggregate, block_offset, valid_items, Int2Type<false>(), can_vectorize);
+            int valid_items = even_share.block_end - even_share.block_offset;
+            ConsumeTile<false>(thread_aggregate, even_share.block_offset, valid_items, Int2Type<false>(), can_vectorize);
         }

         // Compute block-wide reduction (all threads have valid items)
@@ -358,113 +353,28 @@ struct AgentReduce
         OffsetT block_offset,                   ///< [in] Threadblock begin offset (inclusive)
         OffsetT block_end)                      ///< [in] Threadblock end offset (exclusive)
     {
+        GridEvenShare<OffsetT> even_share;
+        even_share.template BlockInit<TILE_ITEMS>(block_offset, block_end);
+
         return (IsAligned(d_in + block_offset, Int2Type<ATTEMPT_VECTORIZATION>())) ?
-            ConsumeRange(block_offset, block_end, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
-            ConsumeRange(block_offset, block_end, Int2Type<false && ATTEMPT_VECTORIZATION>());
+            ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
+            ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
     }


     /**
      * Reduce a contiguous segment of input tiles
      */
     __device__ __forceinline__ OutputT ConsumeTiles(
-        OffsetT /*num_items*/,                                  ///< [in] Total number of global input items
-        GridEvenShare<OffsetT> &even_share,                     ///< [in] GridEvenShare descriptor
-        GridQueue<OffsetT> &/*queue*/,                          ///< [in,out] GridQueue descriptor
-        Int2Type<GRID_MAPPING_EVEN_SHARE> /*is_even_share*/)    ///< [in] Marker type indicating this is an even-share mapping
+        GridEvenShare<OffsetT> &even_share)                     ///< [in] GridEvenShare descriptor
     {
-        // Initialize even-share descriptor for this thread block
-        even_share.BlockInit();
+        // Initialize GRID_MAPPING_STRIP_MINE even-share descriptor for this thread block
+        even_share.template BlockInit<TILE_ITEMS, GRID_MAPPING_STRIP_MINE>();

         return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
-            ConsumeRange(even_share.block_offset, even_share.block_end, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
-            ConsumeRange(even_share.block_offset, even_share.block_end, Int2Type<false && ATTEMPT_VECTORIZATION>());
-
+            ConsumeRange(even_share, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
+            ConsumeRange(even_share, Int2Type<false && ATTEMPT_VECTORIZATION>());
     }


-    //---------------------------------------------------------------------
-    // Dynamically consume tiles
-    //---------------------------------------------------------------------
-
-    /**
-     * Dequeue and reduce tiles of items as part of a inter-block reduction
-     */
-    template <int CAN_VECTORIZE>
-    __device__ __forceinline__ OutputT ConsumeTiles(
-        int num_items,                          ///< Total number of input items
-        GridQueue<OffsetT> queue,               ///< Queue descriptor for assigning tiles of work to thread blocks
-        Int2Type<CAN_VECTORIZE> can_vectorize)  ///< Whether or not we can vectorize loads
-    {
-        // We give each thread block at least one tile of input.
-        OutputT thread_aggregate;
-        OffsetT block_offset = blockIdx.x * TILE_ITEMS;
-        OffsetT even_share_base = gridDim.x * TILE_ITEMS;
-
-        if (block_offset + TILE_ITEMS > num_items)
-        {
-            // First tile isn't full (not all threads have valid items)
-            int valid_items = num_items - block_offset;
-            ConsumeTile<true>(thread_aggregate, block_offset, valid_items, Int2Type<false>(), can_vectorize);
-            return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op, valid_items);
-        }
-
-        // Consume first full tile of input
-        ConsumeTile<true>(thread_aggregate, block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
-
-        if (num_items > even_share_base)
-        {
-            // Dequeue a tile of items
-            if (threadIdx.x == 0)
-                temp_storage.dequeue_offset = queue.Drain(TILE_ITEMS) + even_share_base;
-
-            CTA_SYNC();
-
-            // Grab tile offset and check if we're done with full tiles
-            block_offset = temp_storage.dequeue_offset;
-
-            // Consume more full tiles
-            while (block_offset + TILE_ITEMS <= num_items)
-            {
-                ConsumeTile<false>(thread_aggregate, block_offset, TILE_ITEMS, Int2Type<true>(), can_vectorize);
-
-                CTA_SYNC();
-
-                // Dequeue a tile of items
-                if (threadIdx.x == 0)
-                    temp_storage.dequeue_offset = queue.Drain(TILE_ITEMS) + even_share_base;
-
-                CTA_SYNC();
-
-                // Grab tile offset and check if we're done with full tiles
-                block_offset = temp_storage.dequeue_offset;
-            }
-
-            // Consume partial tile
-            if (block_offset < num_items)
-            {
-                int valid_items = num_items - block_offset;
-                ConsumeTile<false>(thread_aggregate, block_offset, valid_items, Int2Type<false>(), can_vectorize);
-            }
-        }
-
-        // Compute block-wide reduction (all threads have valid items)
-        return BlockReduceT(temp_storage.reduce).Reduce(thread_aggregate, reduction_op);
-
-    }
-
-    /**
-     * Dequeue and reduce tiles of items as part of a inter-block reduction
-     */
-    __device__ __forceinline__ OutputT ConsumeTiles(
-        OffsetT num_items,                                  ///< [in] Total number of global input items
-        GridEvenShare<OffsetT> &/*even_share*/,             ///< [in] GridEvenShare descriptor
-        GridQueue<OffsetT> &queue,                          ///< [in,out] GridQueue descriptor
-        Int2Type<GRID_MAPPING_DYNAMIC> /*is_dynamic*/)      ///< [in] Marker type indicating this is a dynamic mapping
-    {
-        return (IsAligned(d_in, Int2Type<ATTEMPT_VECTORIZATION>())) ?
-            ConsumeTiles(num_items, queue, Int2Type<true && ATTEMPT_VECTORIZATION>()) :
-            ConsumeTiles(num_items, queue, Int2Type<false && ATTEMPT_VECTORIZATION>());
-    }

 };
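For readers outside the AgentReduce machinery, here is a simplified standalone kernel that mirrors the rewritten ConsumeRange above: walk full tiles while advancing by the block stride, mop up a trailing partially-full tile with guarded loads, then fold the per-thread sums with a block-wide reduction. All names (StripMineReduce, ConsumeTile, d_block_partials) are illustrative, and a plain shared-memory tree stands in for BlockReduceT; it is a sketch of the pattern, not the CUB agent itself.

// Sketch only: launch with BLOCK_THREADS threads per block; assumes a float sum reduction.
#include <cuda_runtime.h>

constexpr int BLOCK_THREADS    = 128;
constexpr int ITEMS_PER_THREAD = 4;
constexpr int TILE_ITEMS       = BLOCK_THREADS * ITEMS_PER_THREAD;

// Each thread strides through one tile; the valid_items guard handles partial tiles.
__device__ float ConsumeTile(const float *d_in, int tile_offset, int valid_items)
{
    float sum = 0.0f;
    for (int i = threadIdx.x; i < valid_items; i += BLOCK_THREADS)
        sum += d_in[tile_offset + i];
    return sum;
}

__global__ void StripMineReduce(const float *d_in, float *d_block_partials, int num_items)
{
    // Even-share strip-mine descriptor, derived purely from block id and grid size.
    int block_offset = blockIdx.x * TILE_ITEMS;
    int block_end    = num_items;
    int block_stride = gridDim.x * TILE_ITEMS;

    float thread_aggregate = 0.0f;

    // Consume full tiles of input.
    while (block_offset + TILE_ITEMS <= block_end)
    {
        thread_aggregate += ConsumeTile(d_in, block_offset, TILE_ITEMS);
        block_offset += block_stride;
    }

    // Consume a trailing partially-full tile, if any.
    if (block_offset < block_end)
        thread_aggregate += ConsumeTile(d_in, block_offset, block_end - block_offset);

    // Block-wide tree reduction in shared memory (stand-in for cub::BlockReduce).
    __shared__ float smem[BLOCK_THREADS];
    smem[threadIdx.x] = thread_aggregate;
    __syncthreads();
    for (int stride = BLOCK_THREADS / 2; stride > 0; stride /= 2)
    {
        if (threadIdx.x < stride)
            smem[threadIdx.x] += smem[threadIdx.x + stride];
        __syncthreads();
    }
    if (threadIdx.x == 0)
        d_block_partials[blockIdx.x] = smem[0];
}

Because block_offset, block_end, and block_stride depend only on launch parameters, each block folds the same items in the same order every time, which is exactly the run-to-run determinism the commit message claims.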
2 changes: 1 addition & 1 deletion cub/agent/agent_reduce_by_key.cuh
@@ -223,7 +223,7 @@ struct AgentReduceByKey
     typedef KeyOutputT KeyExchangeT[TILE_ITEMS + 1];
     typedef ValueOutputT ValueExchangeT[TILE_ITEMS + 1];

-    // Shared memory type for this threadblock
+    // Shared memory type for this thread block
     union _TempStorage
     {
         struct
2 changes: 1 addition & 1 deletion cub/agent/agent_rle.cuh
@@ -208,7 +208,7 @@ struct AgentRle

     typedef LengthOffsetPair WarpAggregates[WARPS];

-    // Shared memory type for this threadblock
+    // Shared memory type for this thread block
     struct _TempStorage
     {
         // Aliasable storage layout
2 changes: 1 addition & 1 deletion cub/agent/agent_scan.cuh
@@ -163,7 +163,7 @@ struct AgentScan
         ScanOpT>
         RunningPrefixCallbackOp;

-    // Shared memory type for this threadblock
+    // Shared memory type for this thread block
     union _TempStorage
     {
         typename BlockLoadT::TempStorage load;   // Smem needed for tile loading
2 changes: 1 addition & 1 deletion cub/agent/agent_segment_fixup.cuh
@@ -162,7 +162,7 @@ struct AgentSegmentFixup
         ScanTileStateT>
         TilePrefixCallbackOpT;

-    // Shared memory type for this threadblock
+    // Shared memory type for this thread block
     union _TempStorage
     {
         struct
2 changes: 1 addition & 1 deletion cub/agent/agent_select_if.cuh
@@ -192,7 +192,7 @@ struct AgentSelectIf
     // Item exchange type
     typedef OutputT ItemExchangeT[TILE_ITEMS];

-    // Shared memory type for this threadblock
+    // Shared memory type for this thread block
     union _TempStorage
     {
         struct
6 changes: 3 additions & 3 deletions cub/block/block_radix_rank.cuh
@@ -28,7 +28,7 @@

 /**
  * \file
- * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA threadblock
+ * cub::BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block
  */

 #pragma once
@@ -51,7 +51,7 @@ CUB_NS_PREFIX
 namespace cub {

 /**
- * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA threadblock.
+ * \brief BlockRadixRank provides operations for ranking unsigned integer types within a CUDA thread block.
  * \ingroup BlockModule
  *
  * \tparam BLOCK_DIM_X The thread block length in threads along the X dimension
@@ -398,7 +398,7 @@ public:
         // Extract the local ranks of each key
         for (int ITEM = 0; ITEM < KEYS_PER_THREAD; ++ITEM)
        {
-            // Add in threadblock exclusive prefix
+            // Add in thread block exclusive prefix
             ranks[ITEM] = thread_prefixes[ITEM] + *digit_counters[ITEM];
         }
     }
8 changes: 4 additions & 4 deletions cub/block/block_reduce.cuh
@@ -55,7 +55,7 @@ namespace cub {

 /**
  * BlockReduceAlgorithm enumerates alternative algorithms for parallel
- * reduction across a CUDA threadblock.
+ * reduction across a CUDA thread block.
  */
 enum BlockReduceAlgorithm
 {
@@ -76,7 +76,7 @@ enum BlockReduceAlgorithm
  *
  * \par
  * \image html block_reduce.png
- * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
+ * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
  *
  * \par Performance Considerations
  * - This variant performs less communication than BLOCK_REDUCE_RAKING_NON_COMMUTATIVE
@@ -106,7 +106,7 @@ enum BlockReduceAlgorithm
  *
  * \par
  * \image html block_reduce.png
- * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
+ * <div class="centercaption">\p BLOCK_REDUCE_RAKING data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
  *
  * \par Performance Considerations
  * - This variant performs more communication than BLOCK_REDUCE_RAKING
@@ -137,7 +137,7 @@ enum BlockReduceAlgorithm
  *
  * \par
  * \image html block_scan_warpscans.png
- * <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread threadblock and 4-thread raking warp.</div>
+ * <div class="centercaption">\p BLOCK_REDUCE_WARP_REDUCTIONS data flow for a hypothetical 16-thread thread block and 4-thread raking warp.</div>
  *
  * \par Performance Considerations
  * - This variant applies more reduction operators than BLOCK_REDUCE_RAKING
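For context on how a BlockReduceAlgorithm from the enum above gets used: it is supplied as a template parameter when specializing cub::BlockReduce. The kernel below is a minimal usage sketch; the kernel name and the one-item-per-thread layout are illustrative, while the BlockReduce/TempStorage/Sum calls follow the library's documented interface.

#include <cub/block/block_reduce.cuh>

__global__ void BlockSumKernel(const int *d_in, int *d_out)
{
    // Specialize BlockReduce for 128 threads and an explicit reduction algorithm.
    typedef cub::BlockReduce<int, 128, cub::BLOCK_REDUCE_WARP_REDUCTIONS> BlockReduce;

    // Shared memory required by the collective.
    __shared__ typename BlockReduce::TempStorage temp_storage;

    // One input item per thread (illustrative layout).
    int thread_data = d_in[blockIdx.x * 128 + threadIdx.x];

    // Block-wide sum; the valid aggregate is returned to thread 0 only.
    int aggregate = BlockReduce(temp_storage).Sum(thread_data);

    if (threadIdx.x == 0)
        d_out[blockIdx.x] = aggregate;
}

Omitting the third template argument falls back to the library's default algorithm (BLOCK_REDUCE_WARP_REDUCTIONS in this era of the library), so the explicit parameter mainly matters when tuning for a specific operator or architecture.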