Merge branch 'main' into ci-for-nvhpc-debug-build

NVIDIA · Sep 25, 2023 · cd1ebb4 · cd1ebb4
2 parents 507d6cd + 9939910
commit cd1ebb4
Show file tree

Hide file tree

Showing 73 changed files with 3,254 additions and 1,635 deletions.
diff --git a/.github/copy-pr-bot.yaml b/.github/copy-pr-bot.yaml
@@ -0,0 +1,4 @@
+# Configuration file for `copy-pr-bot` GitHub App
+# https://docs.gha-runners.nvidia.com/apps/copy-pr-bot/
+
+enabled: true
diff --git a/.github/ops-bot.yaml b/.github/ops-bot.yaml
diff --git a/CMakeLists.txt b/CMakeLists.txt
@@ -179,6 +179,8 @@ target_compile_options(stdexec_executable_flags INTERFACE
                        -ferror-limit=0
                        -fmacro-backtrace-limit=0
                        -ftemplate-backtrace-limit=0>
+                       $<$<AND:$<CXX_COMPILER_ID:NVHPC>,$<VERSION_GREATER:$<CXX_COMPILER_VERSION>,23.3.0>>:
+                       -ftemplate-backtrace-limit 0>
                        )
 
 # # Always enable colored output
@@ -197,6 +199,14 @@ target_compile_options(stdexec_executable_flags INTERFACE
                        -include stdexec/__detail/__force_include.hpp>
                        )
 
+# Support target for nvexec examples and tests: adds -gpu=nomanaged when building with NVHPC
+add_library(nvexec_executable_flags INTERFACE)
+
+target_compile_options(nvexec_executable_flags INTERFACE
+                       $<$<AND:$<CXX_COMPILER_ID:NVHPC>,$<COMPILE_LANGUAGE:CXX>>:-gpu=nomanaged>)
+target_link_options(nvexec_executable_flags INTERFACE
+                    $<$<AND:$<CXX_COMPILER_ID:NVHPC>,$<COMPILE_LANGUAGE:CXX>>:-gpu=nomanaged>)
+
 # Set up nvexec library
 option(STDEXEC_ENABLE_CUDA "Enable CUDA targets for non-nvc++ compilers" OFF)
 if(CMAKE_CXX_COMPILER_ID STREQUAL "NVHPC")
@@ -218,9 +228,9 @@ if(STDEXEC_ENABLE_CUDA)
     target_link_libraries(nvexec INTERFACE STDEXEC::stdexec)
 
     target_compile_options(nvexec INTERFACE
-      $<$<AND:$<CXX_COMPILER_ID:NVHPC>,$<COMPILE_LANGUAGE:CXX>>:-stdpar -gpu=nomanaged -gpu=cc${CMAKE_CUDA_ARCHITECTURES}>)
+      $<$<AND:$<CXX_COMPILER_ID:NVHPC>,$<COMPILE_LANGUAGE:CXX>>:-stdpar -gpu=cc${CMAKE_CUDA_ARCHITECTURES}>)
     target_link_options(nvexec INTERFACE
-      $<$<AND:$<CXX_COMPILER_ID:NVHPC>,$<COMPILE_LANGUAGE:CXX>>:-stdpar -gpu=nomanaged -gpu=cc${CMAKE_CUDA_ARCHITECTURES}>)
+      $<$<AND:$<CXX_COMPILER_ID:NVHPC>,$<COMPILE_LANGUAGE:CXX>>:-stdpar -gpu=cc${CMAKE_CUDA_ARCHITECTURES}>)
 
     if(NOT (CMAKE_CXX_COMPILER_ID STREQUAL "NVHPC"))
         include(rapids-cuda)

diff --git a/examples/algorithms/retry.hpp b/examples/algorithms/retry.hpp
@@ -127,7 +127,9 @@ struct _retry_sender {
       Env,
       stdexec::completion_signatures<stdexec::set_error_t(std::exception_ptr)>,
       _value,
-      _error>;
+      _error> {
+    return {};
+  }
 
   template <stdexec::receiver R>
   friend _op<S, R> tag_invoke(stdexec::connect_t, _retry_sender&& self, R r) {

diff --git a/examples/algorithms/then.hpp b/examples/algorithms/then.hpp
@@ -72,7 +72,9 @@ struct _then_sender {
 
   template <class Env>
   friend auto tag_invoke(stdexec::get_completion_signatures_t, _then_sender&&, Env)
-    -> _completions_t<Env>;
+    -> _completions_t<Env> {
+    return {};
+  }
 
   // Connect:
   template <class R>

diff --git a/examples/nvexec/CMakeLists.txt b/examples/nvexec/CMakeLists.txt
@@ -40,6 +40,9 @@ add_library(nvexec_example INTERFACE)
 target_include_directories(nvexec_example
     INTERFACE ${CMAKE_CURRENT_LIST_DIR}
 )
+target_link_libraries(nvexec_example
+    INTERFACE nvexec_executable_flags
+)
 
 add_library(stdpar_multicore INTERFACE)
 target_include_directories(stdpar_multicore
@@ -103,6 +106,7 @@ set(nvexec_gpu_examples
     "       example.nvexec.reduce : reduce.cpp"
     "        example.nvexec.split : split.cpp"
     "         example.nvexec.nvtx : nvtx.cpp"
+    "       example.nvexec.launch : launch.cpp"
     "example.nvexec.maxwell_gpu_s : maxwell_gpu_s.cpp"
     "example.nvexec.maxwell_gpu_m : maxwell_gpu_m.cpp"
 )

diff --git a/examples/nvexec/launch.cpp b/examples/nvexec/launch.cpp
@@ -0,0 +1,65 @@
+/*
+ * Copyright (c) 2022 NVIDIA Corporation
+ *
+ * Licensed under the Apache License Version 2.0 with LLVM Exceptions
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *   https://llvm.org/LICENSE.txt
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include <nvexec/stream_context.cuh>
+#include <stdexec/execution.hpp>
+
+#include <algorithm>
+#include <cassert>
+#include <cstddef>
+#include <cstdint>
+#include <iostream>
+#include <numeric>
+#include <utility>
+#include <vector>
+
+#include <cub/cub.cuh>
+#include <thrust/device_vector.h>
+
+// Problem size and launch geometry shared by the host reference and the GPU pipeline.
+constexpr std::size_t N = 2 * 1024;              // number of elements processed
+constexpr std::size_t THREAD_BLOCK_SIZE = 128u;  // block size handed to the launch
+// Ceiling division: enough blocks to cover N even when N is not a multiple of the block size.
+constexpr std::size_t NUM_BLOCKS = (N + THREAD_BLOCK_SIZE - 1) / THREAD_BLOCK_SIZE;
+
+// Factor applied to every element. A typed constant rather than a lowercase macro, so the
+// name follows normal scoping rules and cannot rewrite unrelated identifiers.
+constexpr int scaling = 2;
+
+// Host-side reference computation: fills a vector with 1..N, multiplies every element by
+// `scaling`, and returns the sum of the scaled values.
+int bench() {
+  std::vector<int> values(N, 0);
+  std::iota(values.begin(), values.end(), 1);
+
+  // Scale in place.
+  for (int& value : values) {
+    value = value * scaling;
+  }
+
+  // Sum into an int accumulator that starts at zero.
+  int total = 0;
+  for (int value : values) {
+    total = total + value;
+  }
+  return total;
+}
+
+// Scales the integers 1..N by `scaling` on the GPU, sums them in a follow-up stage, and
+// prints that sum next to the host-side value from bench().
+// Returns 0 when the two sums agree and 1 otherwise; the printed output is unchanged.
+int main() {
+  // Build the sequence on the host and upload it with a single range copy. Running
+  // std::iota over device_vector iterators from host code instead assigns through a
+  // device reference once per element.
+  std::vector<int> host_input(N, 0);
+  std::iota(host_input.begin(), host_input.end(), 1);
+  thrust::device_vector<int> input(host_input.begin(), host_input.end());
+  int* first = thrust::raw_pointer_cast(input.data());
+  int* last = first + input.size();
+
+  nvexec::stream_context stream{};
+
+  auto snd = stdexec::transfer_just(stream.get_scheduler(), first, last)
+           | nvexec::launch(
+               {NUM_BLOCKS, THREAD_BLOCK_SIZE},
+               [](cudaStream_t stm, int* first, int* last) {
+                 assert(nvexec::is_on_gpu());
+                 // One element per thread; the guard skips threads past the end of the range.
+                 int32_t idx = blockIdx.x * blockDim.x + threadIdx.x;
+                 if (idx < (last - first)) {
+                   first[idx] *= scaling;
+                 }
+               })
+           | stdexec::then([](int* first, int* last) {
+               assert(nvexec::is_on_gpu());
+               return std::accumulate(first, last, 0);
+             });
+
+  auto [result] = stdexec::sync_wait(std::move(snd)).value();
+  const int expected = bench();
+
+  std::cout << "result: " << result << std::endl;
+  std::cout << "benchmark: " << expected << std::endl;
+
+  // Report a mismatch through the exit status so automated runs can detect it.
+  return result == expected ? 0 : 1;
+}