From d622848f9fb62f13e5e064e1deb43b6bcbb12bad Mon Sep 17 00:00:00 2001
From: dumerrill <dumerrill@moochbot>
Date: Wed, 20 Sep 2017 05:18:45 -0400
Subject: [PATCH] Fix for Issue #115 (https://github.com/NVlabs/cub/issues/115)

WarpReduce segmented reduction broken in CUDA 9 for logical warp sizes <
32
---
 .cproject                                     | 16 ++++++++++--
 CHANGE_LOG.TXT                                |  5 ++--
 README.md                                     |  2 +-
 cub/warp/specializations/warp_reduce_shfl.cuh | 26 ++++++++++---------
 cub/warp/specializations/warp_reduce_smem.cuh |  8 +++---
 cub/warp/specializations/warp_scan_shfl.cuh   | 20 +++++++++-----
 cub/warp/specializations/warp_scan_smem.cuh   |  8 +++---
 test/test_warp_reduce.cu                      |  5 +++-
 8 files changed, 59 insertions(+), 31 deletions(-)
diff --git a/.cproject b/.cproject
index 18ffbc70f7..e76d1da67e 100644
--- a/.cproject
+++ b/.cproject
@@ -23,13 +23,19 @@
 							<builder buildPath="${workspace_loc:/PrivateCub}/Default" id="cdt.managedbuild.target.gnu.builder.cygwin.base.412463247" keepEnvironmentInBuildfile="false" name="Gnu Make Builder" superClass="cdt.managedbuild.target.gnu.builder.cygwin.base"/>
 							<tool id="cdt.managedbuild.tool.gnu.assembler.cygwin.base.996758685" name="GCC Assembler" superClass="cdt.managedbuild.tool.gnu.assembler.cygwin.base">
 								<option id="gnu.both.asm.option.include.paths.900454792" name="Include paths (-I)" superClass="gnu.both.asm.option.include.paths" valueType="includePath">
+									<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include/device_launch_parameters.h&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include/crt/device_functions.h&quot;"/>
 									<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include&quot;"/>
 								</option>
 								<inputType id="cdt.managedbuild.tool.gnu.assembler.input.221302756" superClass="cdt.managedbuild.tool.gnu.assembler.input"/>
 							</tool>
 							<tool id="cdt.managedbuild.tool.gnu.archiver.cygwin.base.1353653670" name="GCC Archiver" superClass="cdt.managedbuild.tool.gnu.archiver.cygwin.base"/>
 							<tool id="cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base.1401626953" name="Cygwin C++ Compiler" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.cygwin.base">
-								<option id="gnu.cpp.compiler.option.include.paths.1909687606" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" useByScannerDiscovery="false" valueType="includePath"/>
+								<option id="gnu.cpp.compiler.option.include.paths.1909687606" name="Include paths (-I)" superClass="gnu.cpp.compiler.option.include.paths" useByScannerDiscovery="false" valueType="includePath">
+									<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include/device_launch_parameters.h&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include/device_functions.h&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include&quot;"/>
+								</option>
 								<option id="gnu.cpp.compiler.option.preprocessor.def.1893619952" name="Defined symbols (-D)" superClass="gnu.cpp.compiler.option.preprocessor.def" useByScannerDiscovery="false" valueType="definedSymbols">
 									<listOptionValue builtIn="false" value="__device__"/>
 									<listOptionValue builtIn="false" value="__global__"/>
@@ -42,12 +48,17 @@
 									<listOptionValue builtIn="false" value="__launch_bounds__(...)"/>
 									<listOptionValue builtIn="false" value="__align__(...)"/>
 									<listOptionValue builtIn="false" value="__CUDA_ARCH__=350"/>
+									<listOptionValue builtIn="false" value="__CUDACC__=1"/>
 								</option>
 								<option id="gnu.cpp.compiler.option.dialect.std.49639338" name="Language standard" superClass="gnu.cpp.compiler.option.dialect.std" useByScannerDiscovery="true" value="gnu.cpp.compiler.dialect.default" valueType="enumerated"/>
 								<inputType id="cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin.1708330939" superClass="cdt.managedbuild.tool.gnu.cpp.compiler.input.cygwin"/>
 							</tool>
 							<tool id="cdt.managedbuild.tool.gnu.c.compiler.cygwin.base.1940954787" name="Cygwin C Compiler" superClass="cdt.managedbuild.tool.gnu.c.compiler.cygwin.base">
-								<option id="gnu.c.compiler.option.include.paths.1945618846" name="Include paths (-I)" superClass="gnu.c.compiler.option.include.paths" useByScannerDiscovery="false" valueType="includePath"/>
+								<option id="gnu.c.compiler.option.include.paths.1945618846" name="Include paths (-I)" superClass="gnu.c.compiler.option.include.paths" useByScannerDiscovery="false" valueType="includePath">
+									<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include/device_launch_parameters.h&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include/crt/device_functions.h&quot;"/>
+									<listOptionValue builtIn="false" value="&quot;${CUDA_PATH}/include&quot;"/>
+								</option>
 								<option id="gnu.c.compiler.option.preprocessor.def.symbols.1005509663" name="Defined symbols (-D)" superClass="gnu.c.compiler.option.preprocessor.def.symbols" useByScannerDiscovery="false" valueType="definedSymbols">
 									<listOptionValue builtIn="false" value="__device__"/>
 									<listOptionValue builtIn="false" value="__global__"/>
@@ -60,6 +71,7 @@
 									<listOptionValue builtIn="false" value="__launch_bounds__(...)"/>
 									<listOptionValue builtIn="false" value="__align__(...)"/>
 									<listOptionValue builtIn="false" value="__CUDA_ARCH__=350"/>
+									<listOptionValue builtIn="false" value="__CUDACC__=1"/>
 								</option>
 								<inputType id="cdt.managedbuild.tool.gnu.c.compiler.input.cygwin.469104331" superClass="cdt.managedbuild.tool.gnu.c.compiler.input.cygwin"/>
 							</tool>
diff --git a/CHANGE_LOG.TXT b/CHANGE_LOG.TXT
index 4afefc8f3b..43860e691e 100644
--- a/CHANGE_LOG.TXT
+++ b/CHANGE_LOG.TXT
@@ -1,6 +1,7 @@
-1.7.4    09/19/2017
+1.7.4    09/20/2017
     - Bug fixes: 
-        - Issue #114: Can't pair non-trivially-constructible values in radix sort 
+        - Issue #114: Can't pair non-trivially-constructible values in radix sort
+        - Issue #115: WarpReduce segmented reduction broken in CUDA 9 for logical warp sizes < 32 
           		  
 //-----------------------------------------------------------------------------
 
diff --git a/README.md b/README.md
index a3327076a9..c107d673d5 100644
--- a/README.md
+++ b/README.md
@@ -1,7 +1,7 @@
 <hr>
 <h3>About CUB</h3>
 
-Current release: v1.7.4 (09/19/2017)
+Current release: v1.7.4 (09/20/2017)
 
 We recommend the [CUB Project Website](http://nvlabs.github.com/cub) and the [cub-users discussion forum](http://groups.google.com/group/cub-users) for further information and examples.
 
diff --git a/cub/warp/specializations/warp_reduce_shfl.cuh b/cub/warp/specializations/warp_reduce_shfl.cuh
index 550fa349fa..682a5bfedc 100644
--- a/cub/warp/specializations/warp_reduce_shfl.cuh
+++ b/cub/warp/specializations/warp_reduce_shfl.cuh
@@ -112,9 +112,10 @@ struct WarpReduceShfl
     // Thread fields
     //---------------------------------------------------------------------
 
-    int lane_id;
 
-    int member_mask;
+    unsigned int lane_id;
+
+    unsigned int member_mask;
 
     //---------------------------------------------------------------------
     // Construction
@@ -126,9 +127,9 @@ struct WarpReduceShfl
     :
         lane_id(LaneId()),
 
-        member_mask(IS_ARCH_WARP ?
-             0xffffffff :
-             (0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << (LaneId() / LOGICAL_WARP_THREADS))
+        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP) ?
+            0 : // arch-width subwarps need not be tiled within the arch-warp
+            ((lane_id / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
     {}
 
 
@@ -470,13 +471,13 @@ struct WarpReduceShfl
         int             folded_items_per_warp,  ///< [in] Total number of valid items folded into each logical warp
         ReductionOp     reduction_op)           ///< [in] Binary reduction operator
     {
-        // Get the last thread in the logical warp
-        int first_warp_thread   = 0;
-        int last_warp_thread    = LOGICAL_WARP_THREADS - 1;
+        // Get the lane of the first and last thread in the logical warp
+        int first_thread   = 0;
+        int last_thread    = LOGICAL_WARP_THREADS - 1;
         if (!IS_ARCH_WARP)
         {
-            first_warp_thread = lane_id & (~(LOGICAL_WARP_THREADS - 1));
-            last_warp_thread |= lane_id;
+            first_thread = lane_id & (~(LOGICAL_WARP_THREADS - 1));
+            last_thread |= lane_id;
         }
 
         // Common case is FOLDED_ITEMS_PER_LANE = 1 (or a multiple of 32)
@@ -484,8 +485,8 @@ struct WarpReduceShfl
 
         // Get the last valid lane
         int last_lane = (ALL_LANES_VALID) ?
-            last_warp_thread :
-            CUB_MIN(last_warp_thread, first_warp_thread + lanes_with_valid_data);
+            last_thread :
+            CUB_MIN(last_thread, first_thread + lanes_with_valid_data);
 
         T output = input;
 
@@ -516,6 +517,7 @@ struct WarpReduceShfl
         // Get the start flags for each thread in the warp.
         int warp_flags = WARP_BALLOT(flag, member_mask);
 
+        // Convert to tail-segmented
         if (HEAD_SEGMENTED)
             warp_flags >>= 1;
 
diff --git a/cub/warp/specializations/warp_reduce_smem.cuh b/cub/warp/specializations/warp_reduce_smem.cuh
index ab7e3dfde2..9ba8e94d12 100644
--- a/cub/warp/specializations/warp_reduce_smem.cuh
+++ b/cub/warp/specializations/warp_reduce_smem.cuh
@@ -113,12 +113,14 @@ struct WarpReduceSmem
         TempStorage     &temp_storage)
     :
         temp_storage(temp_storage.Alias()),
+
         lane_id(IS_ARCH_WARP ?
             LaneId() :
             LaneId() % LOGICAL_WARP_THREADS),
-        member_mask(!IS_POW_OF_TWO ?
-            (0xffffffff >> (32 - LOGICAL_WARP_THREADS)) :                                       // non-power-of-two subwarps cannot be tiled
-            (0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << (LaneId() / LOGICAL_WARP_THREADS))
+
+        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ?
+            0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp
+            ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
     {}
 
     /******************************************************************************
diff --git a/cub/warp/specializations/warp_scan_shfl.cuh b/cub/warp/specializations/warp_scan_shfl.cuh
index 05faaf930f..f0deb8ddef 100644
--- a/cub/warp/specializations/warp_scan_shfl.cuh
+++ b/cub/warp/specializations/warp_scan_shfl.cuh
@@ -46,6 +46,8 @@ namespace cub {
 
 /**
  * \brief WarpScanShfl provides SHFL-based variants of parallel prefix scan of items partitioned across a CUDA thread warp.
+ *
+ * LOGICAL_WARP_THREADS must be a power-of-two
  */
 template <
     typename    T,                      ///< Data type being scanned
@@ -98,12 +100,11 @@ struct WarpScanShfl
     __device__ __forceinline__ WarpScanShfl(
         TempStorage &/*temp_storage*/)
     :
-        lane_id(IS_ARCH_WARP ?
-            LaneId() :
-            LaneId() % LOGICAL_WARP_THREADS),
-        member_mask(IS_ARCH_WARP ?
-             0xffffffff :
-             (0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << (LaneId() / LOGICAL_WARP_THREADS))
+        lane_id(LaneId()),
+
+        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP) ?
+            0 : // arch-width subwarps need not be tiled within the arch-warp
+            ((lane_id / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
     {}
 
 
@@ -594,7 +595,12 @@ struct WarpScanShfl
     {
         inclusive = scan_op(initial_value, inclusive);
         exclusive = ShuffleUp(inclusive, 1, 0, member_mask);
-        if (lane_id == 0)
+
+        unsigned int segment_id = (IS_ARCH_WARP) ?
+            lane_id :
+            lane_id % LOGICAL_WARP_THREADS;
+
+        if (segment_id == 0)
             exclusive = initial_value;
     }
 
diff --git a/cub/warp/specializations/warp_scan_smem.cuh b/cub/warp/specializations/warp_scan_smem.cuh
index 0a7c2bdaf6..c3a7a94ba2 100644
--- a/cub/warp/specializations/warp_scan_smem.cuh
+++ b/cub/warp/specializations/warp_scan_smem.cuh
@@ -104,12 +104,14 @@ struct WarpScanSmem
         TempStorage     &temp_storage)
     :
         temp_storage(temp_storage.Alias()),
+
         lane_id(IS_ARCH_WARP ?
             LaneId() :
             LaneId() % LOGICAL_WARP_THREADS),
-        member_mask(!IS_POW_OF_TWO ?
-            (0xffffffff >> (32 - LOGICAL_WARP_THREADS)) :                                       // non-power-of-two subwarps cannot be tiled
-            (0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << (LaneId() / LOGICAL_WARP_THREADS))
+
+        member_mask((0xffffffff >> (32 - LOGICAL_WARP_THREADS)) << ((IS_ARCH_WARP || !IS_POW_OF_TWO ) ?
+            0 : // arch-width and non-power-of-two subwarps cannot be tiled with the arch-warp
+            ((LaneId() / LOGICAL_WARP_THREADS) * LOGICAL_WARP_THREADS)))
     {}
 
 
diff --git a/test/test_warp_reduce.cu b/test/test_warp_reduce.cu
index 7ff04ef8ba..130f20e3e8 100644
--- a/test/test_warp_reduce.cu
+++ b/test/test_warp_reduce.cu
@@ -778,7 +778,10 @@ template <int LOGICAL_WARP_THREADS>
 void Test()
 {
     Test<1, LOGICAL_WARP_THREADS>();
-    Test<2, LOGICAL_WARP_THREADS>();
+
+    // Only power-of-two subwarps can be tiled
+    if ((LOGICAL_WARP_THREADS == 32) || PowerOfTwo<LOGICAL_WARP_THREADS>::VALUE)
+        Test<2, LOGICAL_WARP_THREADS>();
 }