From d622848f9fb62f13e5e064e1deb43b6bcbb12bad Mon Sep 17 00:00:00 2001 From: dumerrill Date: Wed, 20 Sep 2017 05:18:45 -0400 Subject: [PATCH] Fix for Issue #115 (https://github.com/NVlabs/cub/issues/115) WarpReduce segmented reduction broken in CUDA 9 for logical warp sizes < 32 --- .cproject | 16 ++++++++++-- CHANGE_LOG.TXT | 5 ++-- README.md | 2 +- cub/warp/specializations/warp_reduce_shfl.cuh | 26 ++++++++++--------- cub/warp/specializations/warp_reduce_smem.cuh | 8 +++--- cub/warp/specializations/warp_scan_shfl.cuh | 20 +++++++++----- cub/warp/specializations/warp_scan_smem.cuh | 8 +++--- test/test_warp_reduce.cu | 5 +++- 8 files changed, 59 insertions(+), 31 deletions(-) diff --git a/.cproject b/.cproject index 18ffbc70f7..e76d1da67e 100644 --- a/.cproject +++ b/.cproject @@ -23,13 +23,19 @@ - - diff --git a/CHANGE_LOG.TXT b/CHANGE_LOG.TXT index 4afefc8f3b..43860e691e 100644 --- a/CHANGE_LOG.TXT +++ b/CHANGE_LOG.TXT @@ -1,6 +1,7 @@ -1.7.4 09/19/2017 +1.7.4 09/20/2017 - Bug fixes: - - Issue #114: Can't pair non-trivially-constructible values in radix sort + - Issue #114: Can't pair non-trivially-constructible values in radix sort + - Issue #115: WarpReduce segmented reduction broken in CUDA 9 for logical warp sizes < 32 //----------------------------------------------------------------------------- diff --git a/README.md b/README.md index a3327076a9..c107d673d5 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@

About CUB

-Current release: v1.7.4 (09/19/2017) +Current release: v1.7.4 (09/20/2017) We recommend the [CUB Project Website](http://nvlabs.github.com/cub) and the [cub-users discussion forum](http://groups.google.com/group/cub-users) for further information and examples. diff --git a/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/warp/specializations/warp_reduce_shfl.cuh index 550fa349fa..682a5bfedc 100644 --- a/cub/warp/specializations/warp_reduce_shfl.cuh +++ b/cub/warp/specializations/warp_reduce_shfl.cuh @@ -112,9 +112,10 @@ struct WarpReduceShfl // Thread fields //--------------------------------------------------------------------- - int lane_id; - int member_mask; + unsigned int lane_id; + + unsigned int member_mask; //--------------------------------------------------------------------- // Construction @@ -126,9 +127,9 @@ struct WarpReduceShfl : lane_id(LaneId()), - member_mask(IS_ARCH_WARP ? - 0xffffffff : - (0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << (LaneId() / LOGICAL_WARP_THREADS)) + member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP) ? + 0 : // arch-width subwarps need not be tiled within the arch-warp + ((lane_id / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) {} @@ -470,13 +471,13 @@ struct WarpReduceShfl int folded_items_per_warp, ///< [in] Total number of valid items folded into each logical warp ReductionOp reduction_op) ///< [in] Binary reduction operator { - // Get the last thread in the logical warp - int first_warp_thread = 0; - int last_warp_thread = LOGICAL_WARP_THREADS - 1; + // Get the lane of the first and last thread in the logical warp + int first_thread = 0; + int last_thread = LOGICAL_WARP_THREADS - 1; if (!IS_ARCH_WARP) { - first_warp_thread = lane_id & (~(LOGICAL_WARP_THREADS - 1)); - last_warp_thread |= lane_id; + first_thread = lane_id & (~(LOGICAL_WARP_THREADS - 1)); + last_thread |= lane_id; } // Common case is FOLDED_ITEMS_PER_LANE = 1 (or a multiple of 32) @@ -484,8 +485,8 @@ struct WarpReduceShfl // Get the last valid lane int last_lane = (ALL_LANES_VALID) ? - last_warp_thread : - CUB_MIN(last_warp_thread, first_warp_thread + lanes_with_valid_data); + last_thread : + CUB_MIN(last_thread, first_thread + lanes_with_valid_data); T output = input; @@ -516,6 +517,7 @@ struct WarpReduceShfl // Get the start flags for each thread in the warp. int warp_flags = WARP_BALLOT(flag, member_mask); + // Convert to tail-segmented if (HEAD_SEGMENTED) warp_flags >>= 1; diff --git a/cub/warp/specializations/warp_reduce_smem.cuh b/cub/warp/specializations/warp_reduce_smem.cuh index ab7e3dfde2..9ba8e94d12 100644 --- a/cub/warp/specializations/warp_reduce_smem.cuh +++ b/cub/warp/specializations/warp_reduce_smem.cuh @@ -113,12 +113,14 @@ struct WarpReduceSmem TempStorage &temp_storage) : temp_storage(temp_storage.Alias()), + lane_id(IS_ARCH_WARP ? LaneId() : LaneId() % LOGICAL_WARP_THREADS), - member_mask(!IS_POW_OF_TWO ? - (0xffffffff >> (32 - LOGICAL_WARP_THREADS)) : // non-power-of-two subwarps cannot be tiled - (0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << (LaneId() / LOGICAL_WARP_THREADS)) + + member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ? + 0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp + ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) {} /****************************************************************************** diff --git a/cub/warp/specializations/warp_scan_shfl.cuh b/cub/warp/specializations/warp_scan_shfl.cuh index 05faaf930f..f0deb8ddef 100644 --- a/cub/warp/specializations/warp_scan_shfl.cuh +++ b/cub/warp/specializations/warp_scan_shfl.cuh @@ -46,6 +46,8 @@ namespace cub { /** * \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp. + * + * LOGICAL_WARP_THREADS must be a power-of-two */ template < typename T, ///< Data type being scanned @@ -98,12 +100,11 @@ struct WarpScanShfl __device__ __forceinline__ WarpScanShfl( TempStorage &/*temp_storage*/) : - lane_id(IS_ARCH_WARP ? - LaneId() : - LaneId() % LOGICAL_WARP_THREADS), - member_mask(IS_ARCH_WARP ? - 0xffffffff : - (0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << (LaneId() / LOGICAL_WARP_THREADS)) + lane_id(LaneId()), + + member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP) ? + 0 : // arch-width subwarps need not be tiled within the arch-warp + ((lane_id / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) {} @@ -594,7 +595,12 @@ struct WarpScanShfl { inclusive = scan_op(initial_value, inclusive); exclusive = ShuffleUp(inclusive, 1, 0, member_mask); - if (lane_id == 0) + + unsigned int segment_id = (IS_ARCH_WARP) ? + lane_id : + lane_id % LOGICAL_WARP_THREADS; + + if (segment_id == 0) exclusive = initial_value; } diff --git a/cub/warp/specializations/warp_scan_smem.cuh b/cub/warp/specializations/warp_scan_smem.cuh index 0a7c2bdaf6..c3a7a94ba2 100644 --- a/cub/warp/specializations/warp_scan_smem.cuh +++ b/cub/warp/specializations/warp_scan_smem.cuh @@ -104,12 +104,14 @@ struct WarpScanSmem TempStorage &temp_storage) : temp_storage(temp_storage.Alias()), + lane_id(IS_ARCH_WARP ? LaneId() : LaneId() % LOGICAL_WARP_THREADS), - member_mask(!IS_POW_OF_TWO ? - (0xffffffff >> (32 - LOGICAL_WARP_THREADS)) : // non-power-of-two subwarps cannot be tiled - (0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << (LaneId() / LOGICAL_WARP_THREADS)) + + member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ? + 0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp + ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS))) {} diff --git a/test/test_warp_reduce.cu b/test/test_warp_reduce.cu index 7ff04ef8ba..130f20e3e8 100644 --- a/test/test_warp_reduce.cu +++ b/test/test_warp_reduce.cu @@ -778,7 +778,10 @@ template void Test() { Test<1, LOGICAL_WARP_THREADS>(); - Test<2, LOGICAL_WARP_THREADS>(); + + // Only power-of-two subwarps can be tiled + if ((LOGICAL_WARP_THREADS == 32) || PowerOfTwo::VALUE) + Test<2, LOGICAL_WARP_THREADS>(); }