diff --git a/.clang-format b/.clang-format
index 164e65fe7..b1ae2a76a 100644
--- a/.clang-format
+++ b/.clang-format
@@ -24,9 +24,9 @@ AlwaysBreakAfterReturnType: None
 AlwaysBreakBeforeMultilineStrings: true
 AlwaysBreakTemplateDeclarations: Yes
 AttributeMacros: [
+    'STDEXEC_ATTRIBUTE',
     'STDEXEC_NO_UNIQUE_ADDRESS',
     'STDEXEC_IMMOVABLE_NO_UNIQUE_ADDRESS',
-    'STDEXEC_DETAIL_CUDACC_HOST_DEVICE',
 ]
 BinPackArguments: false
 BinPackParameters: false
diff --git a/.git-blame-ignore-revs b/.git-blame-ignore-revs
index 6025f472d..c79a37d02 100644
--- a/.git-blame-ignore-revs
+++ b/.git-blame-ignore-revs
@@ -6,3 +6,6 @@
 
 # Fixed access modifier offset
 9150f682444f8f45c02bf858b767c1a0db81e548
+
+# Put all tests in anonymous namespaces and reformat
+5d6b265c3b7ebcf49a725ce17c399b1e9666118d
diff --git a/.github/workflows/ci.cpu.yml b/.github/workflows/ci.cpu.yml
index 167a7240f..4d64847fb 100644
--- a/.github/workflows/ci.cpu.yml
+++ b/.github/workflows/ci.cpu.yml
@@ -51,3 +51,33 @@ jobs:
       - build-cpu
     steps:
       - run: echo "CI (CPU) success"
+
+  build-cpu-windows:
+    runs-on: windows-latest
+    name: ${{ matrix.name }}
+    strategy:
+      fail-fast: false
+      matrix:
+        include:
+          - { compiler: "cl", build: "Debug", name: "CPU (Windows) (msvc, Debug)" }
+          - { compiler: "cl", build: "Release", name: "CPU (Windows) (msvc, Release)" }
+          #- { compiler: "clang++", build: "Debug", name: "CPU (Windows) (clang, Debug)" }
+          #- { compiler: "clang++", build: "Release", name: "CPU (Windows) (clang, Release)" }
+          #- { compiler: "clang-cl", build: "Debug", name: "CPU (Windows) (clang-cl, Debug)" }
+          #- { compiler: "clang-cl", build: "Release", name: "CPU (Windows) (clang-cl, Release)" }
+    steps:
+      - name: Checkout stdexec (Windows)
+        uses: actions/checkout@v3
+        with:
+          persist-credentials: false
+      - name: Build and test CPU schedulers (Windows)
+        shell: pwsh
+        run: .github/workflows/test-windows.ps1 -Compiler '${{ matrix.compiler }}' -Config '${{ matrix.build }}'
+
+  ci-cpu-windows:
+    runs-on: windows-latest
+    name: CI (CPU) (Windows)
+    needs:
+      - build-cpu-windows
+    steps:
+      - run: echo "CI (CPU) (Windows) success"
diff --git a/.github/workflows/ci.gpu.yml b/.github/workflows/ci.gpu.yml
index f3f67a793..088e4afc9 100644
--- a/.github/workflows/ci.gpu.yml
+++ b/.github/workflows/ci.gpu.yml
@@ -53,7 +53,11 @@ jobs:
           -DCMAKE_CUDA_COMPILER:FILEPATH="$cxx" \
           -DCMAKE_CUDA_ARCHITECTURES:STRING=${{ matrix.sm }};
         # Compile
-        cmake --build build;
+        cmake --build build -v;
+
+        # Print sccache stats
+        sccache -s
+
         # Tests
         ctest --test-dir build --verbose --output-on-failure --timeout 60;
         # Examples
@@ -61,6 +65,7 @@ jobs:
         ./build/examples/nvexec/maxwell_cpu_mt --iterations=1000 --N=512 --run-std --run-stdpar --run-thread-pool-scheduler
         ./build/examples/nvexec/maxwell_gpu_s --iterations=1000 --N=512 --run-cuda --run-stdpar --run-stream-scheduler
 
+
   ci-gpu:
     runs-on: ubuntu-latest
     name: CI (GPU)
diff --git a/.github/workflows/test-windows.ps1 b/.github/workflows/test-windows.ps1
new file mode 100644
index 000000000..046e693cb
--- /dev/null
+++ b/.github/workflows/test-windows.ps1
@@ -0,0 +1,39 @@
+param(
+    [string]$BuildDirectory="build",
+    [string]$Compiler="cl",
+    [string]$Config="Debug"
+)
+
+function Invoke-NativeCommand($Command) {
+    & $Command $Args
+
+    if (!$?) {
{ + throw "${Command}: $LastExitCode" + } +} + +$VSVersion = 2022 +$VSEdition = 'Enterprise' +$Architecture = 'x64' + +Push-Location "C:/Program Files/Microsoft Visual Studio/$VSVersion/$VSEdition/VC/Auxiliary/Build" +$VCVersion = Get-Content 'Microsoft.VCToolsVersion.default.txt' +cmd /c "vcvarsall.bat $Architecture -vcvars_ver=$VCVersion > nul & set" | ForEach-Object { + if ($_ -match '^(.+?)=(.*)') { + Set-Item -Force -Path "ENV:$($matches[1])" -Value $matches[2] + } +} +Pop-Location + +if ($Compiler -ne "cl") { + $ENV:CXX=$Compiler +} + +if (Test-Path -PathType Container $BuildDirectory) { + Remove-Item -Recurse $BuildDirectory | Out-Null +} +New-Item -ItemType Directory $BuildDirectory | Out-Null + +Invoke-NativeCommand cmake -B $BuildDirectory -G Ninja "-DCMAKE_BUILD_TYPE=$Config" . +Invoke-NativeCommand cmake --build $BuildDirectory +Invoke-NativeCommand ctest --test-dir $BuildDirectory diff --git a/.vscode/launch.json b/.vscode/launch.json index 52857b46c..53d36e65c 100644 --- a/.vscode/launch.json +++ b/.vscode/launch.json @@ -47,6 +47,17 @@ "initCommands": ["settings set target.disable-aslr false"], "args": "${input:CXX_PROGRAM_ARGS}", }, + { + "name": "CUDA Current Target (cuda-gdb)", + "type": "cuda-gdb", + "request": "launch", + "stopAtEntry": false, + "breakOnLaunch": false, + "internalConsoleOptions": "neverOpen", + "program": "${command:cmake.launchTargetPath}", + "cwd": "${command:cmake.launchTargetDirectory}", + "args": "${input:CXX_PROGRAM_ARGS}", + }, ], "inputs": [ { diff --git a/CMakeLists.txt b/CMakeLists.txt index e3489a1ce..9096e28ca 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -92,6 +92,18 @@ rapids_cpm_find(Catch2 ${Catch2_VERSION} URL https://github.com/catchorg/Catch2/archive/refs/tags/v${Catch2_VERSION}.zip ) +# Add ICM +set(icm_VERSION 1.5.0) +# Always download it, don't attempt to do `find_package(ICM)` first +set(CPM_DOWNLOAD_icm TRUE) +rapids_cpm_find(icm ${icm_VERSION} + CPM_ARGS + GITHUB_REPOSITORY iboB/icm + GIT_TAG v${icm_VERSION} + VERSION ${icm_VERSION} + PATCH_COMMAND git restore -- . && git apply ${CMAKE_CURRENT_LIST_DIR}/cmake/cpm/patches/icm/regex-build-error.diff +) + # Ensure that we link with the threading library set(CMAKE_THREAD_PREFER_PTHREAD TRUE) rapids_find_package(Threads REQUIRED @@ -102,6 +114,13 @@ rapids_find_package(Threads REQUIRED ############################################################################## # - Main library targets ----------------------------------------------------- +# Detect the compiler frontend (GNU, Clang, MSVC, etc.) +if(DEFINED CMAKE_CXX_COMPILER_FRONTEND_VARIANT) + set(stdexec_compiler_frontend ${CMAKE_CXX_COMPILER_FRONTEND_VARIANT}) +else() + set(stdexec_compiler_frontend ${CMAKE_CXX_COMPILER_ID}) +endif() + set(stdexec_export_targets) # Define the main library @@ -139,10 +158,22 @@ target_compile_options(stdexec INTERFACE $<$:-fcoroutines> ) +# Increase the concepts diagnostics depth for GCC +target_compile_options(stdexec INTERFACE + $<$:-fconcepts-diagnostics-depth=10> + ) + +# Do you want a preprocessor that works? Picky, picky. 
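+# (/Zc:__cplusplus makes MSVC report the real __cplusplus value; /Zc:preprocessor enables its conforming preprocessor.)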
 target_compile_options(stdexec INTERFACE
   $<$<CXX_COMPILER_ID:MSVC>:/Zc:__cplusplus /Zc:preprocessor>
   )
 
+option(STDEXEC_ENABLE_EXTRA_TYPE_CHECKING "Enable extra type checking that is costly at compile-time" OFF)
+
+if (STDEXEC_ENABLE_EXTRA_TYPE_CHECKING)
+  target_compile_definitions(stdexec INTERFACE STDEXEC_ENABLE_EXTRA_TYPE_CHECKING)
+endif()
+
 add_library(STDEXEC::stdexec ALIAS stdexec)
 
 # Don't require building everything when installing
@@ -153,35 +184,28 @@ add_library(stdexec_executable_flags INTERFACE)
 
 # Enable warnings
 target_compile_options(stdexec_executable_flags INTERFACE
-    $<$<OR:$<CXX_COMPILER_ID:Clang>,$<CXX_COMPILER_ID:AppleClang>,$<CXX_COMPILER_ID:GNU>>:
-    -Wall>
-    $<$<CXX_COMPILER_ID:MSVC>:
-    /W4>)
+    $<$<STREQUAL:${stdexec_compiler_frontend},GNU>:-Wall>
+    $<$<STREQUAL:${stdexec_compiler_frontend},AppleClang>:-Wall>
+    $<$<STREQUAL:${stdexec_compiler_frontend},MSVC>:/W4>)
 
 # Increase the error limit with NVC++
 target_compile_options(stdexec_executable_flags INTERFACE
-    $<$<CXX_COMPILER_ID:NVHPC>:-e1000>
-    )
+    $<$<CXX_COMPILER_ID:NVHPC>:-e1000>)
 
-# Silence warnings with GCC
+# Silence warnings
 target_compile_options(stdexec_executable_flags INTERFACE
     $<$<CXX_COMPILER_ID:GNU>:-Wno-non-template-friend>
-    )
-
-# Silence warnings with NVHPC
-target_compile_options(stdexec_executable_flags INTERFACE
     $<$<CXX_COMPILER_ID:NVHPC>:--diag_suppress177,550,111,497,554>
-    )
+    $<$<STREQUAL:${stdexec_compiler_frontend},MSVC>:/wd4100 /wd4101 /wd4127 /wd4324 /wd4456 /wd4459>)
 
 # Template backtrace limit
 target_compile_options(stdexec_executable_flags INTERFACE
     $<$<AND:$<CXX_COMPILER_ID:Clang>,$<COMPILE_LANGUAGE:CXX>>:
-    -ferror-limit=0
-    -fmacro-backtrace-limit=0
-    -ftemplate-backtrace-limit=0>
+    $<$<STREQUAL:${stdexec_compiler_frontend},MSVC>:/clang:>-ferror-limit=0
+    $<$<STREQUAL:${stdexec_compiler_frontend},MSVC>:/clang:>-fmacro-backtrace-limit=0
+    $<$<STREQUAL:${stdexec_compiler_frontend},MSVC>:/clang:>-ftemplate-backtrace-limit=0>
     $<$<AND:$<CXX_COMPILER_ID:NVHPC>,$<VERSION_GREATER_EQUAL:$<CXX_COMPILER_VERSION>,23.3.0>>:
-    -ftemplate-backtrace-limit 0>
-    )
+    -ftemplate-backtrace-limit 0>)
 
 # # Always enable colored output
 # target_compile_options(stdexec_executable_flags INTERFACE
@@ -199,6 +223,10 @@ target_compile_options(stdexec_executable_flags INTERFACE
     -include stdexec/__detail/__force_include.hpp>
     )
 
+target_compile_definitions(
+    stdexec_executable_flags INTERFACE
+    $<$<AND:$<CONFIG:Debug>,$<NOT:$<BOOL:${STDEXEC_ENABLE_CUDA}>>>:STDEXEC_ENABLE_EXTRA_TYPE_CHECKING>)
+
 # Support target for examples and tests
 add_library(nvexec_executable_flags INTERFACE)
 
@@ -290,7 +318,16 @@ if (STDEXEC_ENABLE_TBB)
     )
 endif ()
 
-option (STDEXEC_ENABLE_IO_URING_TESTS "Enable io_uring tests" ON)
+option (STDEXEC_ENABLE_NUMA "Enable NUMA affinity for static_thread_pool" OFF)
+if (STDEXEC_ENABLE_NUMA)
+  list(APPEND CMAKE_MODULE_PATH "${CMAKE_CURRENT_SOURCE_DIR}/cmake/Modules")
+  find_package(numa REQUIRED)
+  target_link_libraries(stdexec INTERFACE numa::numa)
+  target_compile_definitions(stdexec INTERFACE STDEXEC_ENABLE_NUMA)
+endif()
+include(CheckIncludeFileCXX)
+CHECK_INCLUDE_FILE_CXX("linux/io_uring.h" STDEXEC_FOUND_IO_URING)
+option (STDEXEC_ENABLE_IO_URING_TESTS "Enable io_uring tests" ${STDEXEC_FOUND_IO_URING})
 
 option(STDEXEC_BUILD_EXAMPLES "Build stdexec examples" ON)
 option(STDEXEC_BUILD_TESTS "Build stdexec tests" ON)
diff --git a/cmake/Modules/Findnuma.cmake b/cmake/Modules/Findnuma.cmake
new file mode 100644
index 000000000..eb840a8de
--- /dev/null
+++ b/cmake/Modules/Findnuma.cmake
@@ -0,0 +1,95 @@
+#
+# Copyright (c) 2023 Maikel Nadolski
+# Copyright (c) 2023 NVIDIA Corporation
+#
+# Licensed under the Apache License Version 2.0 with LLVM Exceptions
+# (the "License"); you may not use this file except in compliance with
+# the License. You may obtain a copy of the License at
+#
+#   https://llvm.org/LICENSE.txt
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+
+#[=======================================================================[.rst:
+Findnuma
+--------
+
+Finds the numa library.
+
+Imported Targets
+^^^^^^^^^^^^^^^^
+
+This module provides the following imported targets, if found:
+
+``numa::numa``
+  The numa library
+
+Result Variables
+^^^^^^^^^^^^^^^^
+
+This will define the following variables:
+
+``numa_FOUND``
+  True if the system has the numa library.
+``numa_VERSION``
+  The version of the numa library which was found.
+``numa_INCLUDE_DIRS``
+  Include directories needed to use numa.
+``numa_LIBRARIES``
+  Libraries needed to link to numa.
+
+Cache Variables
+^^^^^^^^^^^^^^^
+
+The following cache variables may also be set:
+
+``numa_INCLUDE_DIR``
+  The directory containing ``numa.h``.
+``numa_LIBRARY``
+  The path to the numa library.
+
+#]=======================================================================]
+
+find_path(numa_INCLUDE_DIR
+  NAMES numa.h
+  PATHS ${PC_numa_INCLUDE_DIRS}
+  PATH_SUFFIXES numa
+)
+find_library(numa_LIBRARY
+  NAMES numa
+  PATHS ${PC_numa_LIBRARY_DIRS}
+)
+
+include(FindPackageHandleStandardArgs)
+find_package_handle_standard_args(numa
+  FOUND_VAR numa_FOUND
+  REQUIRED_VARS
+    numa_LIBRARY
+    numa_INCLUDE_DIR
+  VERSION_VAR numa_VERSION
+)
+
+if(numa_FOUND)
+  set(numa_LIBRARIES ${numa_LIBRARY})
+  set(numa_INCLUDE_DIRS ${numa_INCLUDE_DIR})
+  set(numa_DEFINITIONS ${PC_numa_CFLAGS_OTHER})
+endif()
+
+if(numa_FOUND AND NOT TARGET numa::numa)
+  add_library(numa::numa UNKNOWN IMPORTED)
+  set_target_properties(numa::numa PROPERTIES
+    IMPORTED_LOCATION "${numa_LIBRARY}"
+    INTERFACE_COMPILE_OPTIONS "${PC_numa_CFLAGS_OTHER}"
+    INTERFACE_INCLUDE_DIRECTORIES "${numa_INCLUDE_DIR}"
+  )
+endif()
+
+mark_as_advanced(
+  numa_INCLUDE_DIR
+  numa_LIBRARY
+)
diff --git a/cmake/cpm/patches/icm/regex-build-error.diff b/cmake/cpm/patches/icm/regex-build-error.diff
new file mode 100644
index 000000000..8dd2bfdd9
--- /dev/null
+++ b/cmake/cpm/patches/icm/regex-build-error.diff
@@ -0,0 +1,28 @@
+diff --git a/icm_build_failure_parse_and_run.cmake b/icm_build_failure_parse_and_run.cmake
+index 0e62f6c..2ea5f0a 100644
+--- a/icm_build_failure_parse_and_run.cmake
++++ b/icm_build_failure_parse_and_run.cmake
+@@ -29,14 +29,15 @@ endif()
+ 
+ # collect possible errors from source
+ file(READ "@parsedSourcePath@" sourceText)
++
+ string(REGEX MATCHALL "//[ ]*build error:[^\n]+" matchErrors ${sourceText})
+ 
+ # look for collected errors in output
+ foreach(possibleError ${matchErrors})
+   string(REGEX MATCH "//[ ]*build error:[ \t]*(.+)$" _ "${possibleError}")
+   set(possibleError "${CMAKE_MATCH_1}")
+-  string(FIND "${out}" "${possibleError}" pos)
+-  if(NOT pos EQUAL -1)
++  string(REGEX MATCH "${possibleError}" actualError "${out}")
++  if(NOT "${actualError}" STREQUAL "")
+     message("Success: output when building '@ARG_TARGET@' contains '${possibleError}'")
+     return()
+   endif()
+@@ -48,4 +49,4 @@ endforeach()
+ # print execute_process output for debugging purposes
+ message("${out}")
+ # print error
+-message(FATAL_ERROR "Error: Building '@ARG_TARGET@' failed, but output doesn't contain any of the expected errors:${outErrors}")
++message(FATAL_ERROR "Error: Building '@ARG_TARGET@' failed, but output doesn't match the expected errors:${outErrors}")
diff --git a/examples/CMakeLists.txt b/examples/CMakeLists.txt
index 4076cf8a8..57c2964ce 100644
--- a/examples/CMakeLists.txt
+++ b/examples/CMakeLists.txt
@@ -28,6 +28,10 @@ endfunction()
 function(def_example example)
   split(${example} target source)
   add_executable(${target} ${source})
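+  # Build each example as strict ISO C++20, with no compiler extensions.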
+  set_target_properties(${target} PROPERTIES
+    CXX_STANDARD 20
+    CXX_STANDARD_REQUIRED ON
+    CXX_EXTENSIONS OFF)
   target_link_libraries(${target}
     PRIVATE STDEXEC::stdexec
             stdexec_executable_flags)
@@ -44,10 +48,16 @@ set(stdexec_examples
 "example.server_theme.on_transfer : server_theme/on_transfer.cpp"
 "example.server_theme.then_upon : server_theme/then_upon.cpp"
 "example.server_theme.split_bulk : server_theme/split_bulk.cpp"
+"example.benchmark.static_thread_pool : benchmark/static_thread_pool.cpp"
+"example.benchmark.static_thread_pool_old : benchmark/static_thread_pool_old.cpp"
+"example.benchmark.static_thread_pool_nested : benchmark/static_thread_pool_nested.cpp"
+"example.benchmark.static_thread_pool_nested_old : benchmark/static_thread_pool_nested_old.cpp"
+"example.benchmark.static_thread_pool_bulk_enqueue : benchmark/static_thread_pool_bulk_enqueue.cpp"
+"example.benchmark.static_thread_pool_bulk_enqueue_nested : benchmark/static_thread_pool_bulk_enqueue_nested.cpp"
 )
 
 if (LINUX)
-    set(stdexect_examples ${stdexec_examples}
+    set(stdexec_examples ${stdexec_examples}
     "example.io_uring : io_uring.cpp"
     )
 endif (LINUX)
@@ -59,3 +69,11 @@ endforeach()
 if(STDEXEC_ENABLE_CUDA)
   add_subdirectory(nvexec)
 endif()
+
+if (STDEXEC_ENABLE_TBB)
+  add_executable(example.benchmark.tbb_thread_pool benchmark/tbb_thread_pool.cpp)
+  target_link_libraries(example.benchmark.tbb_thread_pool PRIVATE STDEXEC::tbbexec)
+
+  add_executable(example.benchmark.tbb_thread_pool_nested benchmark/tbb_thread_pool_nested.cpp)
+  target_link_libraries(example.benchmark.tbb_thread_pool_nested PRIVATE STDEXEC::tbbexec)
+endif()
\ No newline at end of file
diff --git a/examples/algorithms/retry.hpp b/examples/algorithms/retry.hpp
index 0a4d9c451..2d1475c7a 100644
--- a/examples/algorithms/retry.hpp
+++ b/examples/algorithms/retry.hpp
@@ -108,7 +108,7 @@ struct _op {
 template <class S>
 struct _retry_sender {
-  using is_sender = void;
+  using sender_concept = stdexec::sender_t;
   S s_;
 
   explicit _retry_sender(S s)
diff --git a/examples/algorithms/then.hpp b/examples/algorithms/then.hpp
index 775c5c2fa..0bf1daf71 100644
--- a/examples/algorithms/then.hpp
+++ b/examples/algorithms/then.hpp
@@ -52,7 +52,7 @@ class _then_receiver : stdexec::receiver_adaptor<_then_receiver, R> {
 template <class S, class F>
 struct _then_sender {
-  using is_sender = void;
+  using sender_concept = stdexec::sender_t;
   S s_;
   F f_;
 
diff --git a/examples/benchmark/common.hpp b/examples/benchmark/common.hpp
new file mode 100644
index 000000000..d39d20e6c
--- /dev/null
+++ b/examples/benchmark/common.hpp
@@ -0,0 +1,167 @@
+/*
+ * Copyright (c) 2023 Maikel Nadolski
+ * Copyright (c) 2023 NVIDIA Corporation
+ *
+ * Licensed under the Apache License Version 2.0 with LLVM Exceptions
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *   https://llvm.org/LICENSE.txt
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include <stdexec/execution.hpp>
+#include <exec/env.hpp>
+#include <exec/static_thread_pool.hpp>
+
+#include <atomic>
+#include <barrier>
+#include <chrono>
+#include <cmath>
+#include <cstdlib>
+#include <functional>
+#include <iomanip>
+#include <iostream>
+#include <limits>
+#include <memory>
+#include <optional>
+#include <span>
+#include <thread>
+#include <vector>
+
+#if __has_include(<memory_resource>)
+#include <memory_resource>
+namespace pmr = std::pmr;
+#else
+#define STDEXEC_NO_MONOTONIC_BUFFER_RESOURCE 1
+#endif
+
+struct statistics {
+  std::chrono::milliseconds total_time_ms;
+  double ops_per_sec;
+};
+
+statistics compute_perf(
+  std::chrono::steady_clock::time_point start,
+  std::chrono::steady_clock::time_point end,
+  std::size_t total_scheds) {
+  auto dur = std::chrono::duration_cast<std::chrono::nanoseconds>(end - start);
+  auto dur_ms = std::chrono::duration_cast<std::chrono::milliseconds>(dur);
+  auto dur_dbl = std::chrono::duration_cast<std::chrono::duration<double>>(dur);
+  double ops_per_sec = total_scheds / dur_dbl.count();
+  return {dur_ms, ops_per_sec};
+}
+
+struct statistics_all {
+  std::chrono::milliseconds total_time_ms;
+  double ops_per_sec;
+  double average;
+  double max;
+  double min;
+  double stddev;
+};
+
+statistics_all compute_perf(
+  std::span<std::chrono::steady_clock::time_point> start,
+  std::span<std::chrono::steady_clock::time_point> end,
+  std::size_t i0,
+  std::size_t i,
+  std::size_t total_scheds) {
+  double average = 0.0;
+  double max = 0.0;
+  double min = std::numeric_limits<double>::max();
+  for (std::size_t j = i0; j <= i; ++j) {
+    auto stats = compute_perf(start[j], end[j], total_scheds);
+    average += stats.ops_per_sec / (i + 1 - i0);
+    max = std::max(max, stats.ops_per_sec);
+    min = std::min(min, stats.ops_per_sec);
+  }
+  // compute variance
+  double variance = 0.0;
+  for (std::size_t j = i0; j <= i; ++j) {
+    auto stats = compute_perf(start[j], end[j], total_scheds);
+    variance += (stats.ops_per_sec - average) * (stats.ops_per_sec - average);
+  }
+  variance /= (i + 1 - i0);
+  double stddev = std::sqrt(variance);
+  auto stats = compute_perf(start[i], end[i], total_scheds);
+  statistics_all all{stats.total_time_ms, stats.ops_per_sec, average, max, min, stddev};
+  return all;
+}
+
+struct numa_deleter {
+  std::size_t size_;
+  exec::numa_allocator<char> allocator_;
+
+  void operator()(char* ptr) noexcept {
+    allocator_.deallocate(ptr, size_);
+  }
+};
+
+template <class PoolT, class RunThread>
+void my_main(int argc, char** argv, exec::numa_policy* policy = exec::get_numa_policy()) {
+  int nthreads = std::thread::hardware_concurrency();
+  if (argc > 1) {
+    nthreads = std::atoi(argv[1]);
+  }
+  std::size_t total_scheds = 10'000'000;
+#ifndef STDEXEC_NO_MONOTONIC_BUFFER_RESOURCE
+  std::vector<std::unique_ptr<char, numa_deleter>> buffers;
+#endif
+  std::optional<PoolT> pool{};
+  if constexpr (std::same_as<PoolT, exec::static_thread_pool>) {
+    pool.emplace(nthreads, exec::bwos_params{}, policy);
+  } else {
+    pool.emplace(nthreads);
+  }
+  std::barrier<> barrier(nthreads + 1);
+  std::vector<std::thread> threads;
+  std::atomic<bool> stop{false};
+#ifndef STDEXEC_NO_MONOTONIC_BUFFER_RESOURCE
+  std::size_t buffer_size = 2000 << 20;
+  for (std::size_t i = 0; i < static_cast<std::size_t>(nthreads); ++i) {
+    exec::numa_allocator<char> alloc(policy->thread_index_to_node(i));
+    buffers.push_back(std::unique_ptr<char, numa_deleter>{
+      alloc.allocate(buffer_size), numa_deleter{buffer_size, alloc}});
+  }
+#endif
+  for (std::size_t i = 0; i < static_cast<std::size_t>(nthreads); ++i) {
+    threads.emplace_back(
+      RunThread{},
+      std::ref(*pool),
+      total_scheds,
+      i,
+      std::ref(barrier),
+#ifndef STDEXEC_NO_MONOTONIC_BUFFER_RESOURCE
+      std::span{buffers[i].get(), buffer_size},
+#endif
+      std::ref(stop),
+      policy);
+  }
+  std::size_t nRuns = 100;
+  std::size_t warmup = 1;
+  std::vector<std::chrono::steady_clock::time_point> starts(nRuns);
+  std::vector<std::chrono::steady_clock::time_point> ends(nRuns);
+  for (std::size_t i = 0; i < nRuns; ++i) {
+    barrier.arrive_and_wait();
+    starts[i] = std::chrono::steady_clock::now();
+    barrier.arrive_and_wait();
+    ends[i] = std::chrono::steady_clock::now();
+    if (i < warmup) {
+      std::cout << "warmup: skip results\n";
+    } else {
+      auto [dur_ms, ops_per_sec, avg, max, min, stddev] = compute_perf(
+        starts, ends, warmup, i, total_scheds);
+      auto percent = stddev / ops_per_sec * 100;
+      std::cout << i + 1 << " " << dur_ms.count() << "ms, throughput: " << std::setprecision(3)
+                << ops_per_sec << ", average: " << avg << ", max: " << max << ", min: " << min
+                << ", stddev: " << stddev << " (" << percent << "%)\n";
+    }
+  }
+  stop = true;
+  barrier.arrive_and_wait();
+  for (auto& thread: threads) {
+    thread.join();
+  }
+  auto [dur_ms, ops_per_sec, avg, max, min, stddev] = compute_perf(
+    starts, ends, warmup, nRuns - 1, total_scheds);
+  std::cout << avg << " | " << max << " | " << min << " | " << stddev << "\n";
+}
\ No newline at end of file
diff --git a/examples/benchmark/static_thread_pool.cpp b/examples/benchmark/static_thread_pool.cpp
new file mode 100644
index 000000000..6de513496
--- /dev/null
+++ b/examples/benchmark/static_thread_pool.cpp
@@ -0,0 +1,98 @@
+/*
+ * Copyright (c) 2023 Maikel Nadolski
+ * Copyright (c) 2023 NVIDIA Corporation
+ *
+ * Licensed under the Apache License Version 2.0 with LLVM Exceptions
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *   https://llvm.org/LICENSE.txt
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "./common.hpp"
+#include <exec/static_thread_pool.hpp>
+
+struct RunThread {
+  void operator()(
+    exec::static_thread_pool& pool,
+    std::size_t total_scheds,
+    std::size_t tid,
+    std::barrier<>& barrier,
+#ifndef STDEXEC_NO_MONOTONIC_BUFFER_RESOURCE
+    std::span<char> buffer,
+#endif
+    std::atomic<bool>& stop,
+    exec::numa_policy* numa) {
+    std::size_t numa_node = numa->thread_index_to_node(tid);
+    numa->bind_to_node(numa_node);
+    exec::nodemask mask{};
+    mask.set(numa_node);
+    auto scheduler = pool.get_constrained_scheduler(mask);
+    std::mutex mut;
+    std::condition_variable cv;
+    while (true) {
+      barrier.arrive_and_wait();
+      if (stop.load()) {
+        break;
+      }
+#ifndef STDEXEC_NO_MONOTONIC_BUFFER_RESOURCE
+      pmr::monotonic_buffer_resource resource{
+        buffer.data(), buffer.size(), pmr::null_memory_resource()};
+      pmr::polymorphic_allocator alloc(&resource);
+      auto [start, end] = exec::even_share(total_scheds, tid, pool.available_parallelism());
+      std::size_t scheds = end - start;
+      std::atomic counter{scheds};
+      auto env = exec::make_env(exec::with(stdexec::get_allocator, alloc));
+      while (scheds) {
+        stdexec::start_detached(        //
+          stdexec::schedule(scheduler)  //
+            | stdexec::then([&] {
+                auto prev = counter.fetch_sub(1);
+                if (prev == 1) {
+                  std::lock_guard lock{mut};
+                  cv.notify_one();
+                }
+              }),
+          env);
+        --scheds;
+      }
+#else
+      auto [start, end] = exec::even_share(total_scheds, tid, pool.available_parallelism());
+      std::size_t scheds = end - start;
+      std::atomic counter{scheds};
+      while (scheds) {
+        stdexec::start_detached(        //
+          stdexec::schedule(scheduler)  //
+            | stdexec::then([&] {
+                auto prev = counter.fetch_sub(1);
+                if (prev == 1) {
+                  std::lock_guard lock{mut};
+                  cv.notify_one();
+                }
+              }));
+        --scheds;
+      }
+#endif
+      std::unique_lock lock{mut};
+      cv.wait(lock, [&] { return counter.load() == 0; });
+      lock.unlock();
+      barrier.arrive_and_wait();
+    }
+  }
+};
+
+struct my_numa_distribution : public exec::default_numa_policy {
+  std::size_t thread_index_to_node(std::size_t index) override {
+    return exec::default_numa_policy::thread_index_to_node(2 * index);
+  }
+};
+
+int main(int argc, char** argv) {
+  my_numa_distribution numa{};
+  my_main<exec::static_thread_pool, RunThread>(argc, argv, &numa);
+}
\ No newline at end of file
diff --git a/examples/benchmark/static_thread_pool_bulk_enqueue.cpp b/examples/benchmark/static_thread_pool_bulk_enqueue.cpp
new file mode 100644
index 000000000..dcffc415f
--- /dev/null
+++ b/examples/benchmark/static_thread_pool_bulk_enqueue.cpp
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2023 Maikel Nadolski
+ * Copyright (c) 2023 NVIDIA Corporation
+ *
+ * Licensed under the Apache License Version 2.0 with LLVM Exceptions
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *   https://llvm.org/LICENSE.txt
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "./common.hpp"
+#include <exec/static_thread_pool.hpp>
+
+#if STDEXEC_HAS_STD_RANGES()
+#include <exec/sequence/ignore_all_values.hpp>
+#include <ranges>
+
+struct RunThread {
+  void operator()(
+    exec::static_thread_pool& pool,
+    std::size_t total_scheds,
+    std::size_t tid,
+    std::barrier<>& barrier,
+#ifndef STDEXEC_NO_MONOTONIC_BUFFER_RESOURCE
+    std::span<char> buffer,
+#endif
+    std::atomic<bool>& stop,
+    exec::numa_policy* numa) {
+    std::size_t numa_node = numa->thread_index_to_node(tid);
+    numa->bind_to_node(numa_node);
+    while (true) {
+      barrier.arrive_and_wait();
+      if (stop.load()) {
+        break;
+      }
+#ifndef STDEXEC_NO_MONOTONIC_BUFFER_RESOURCE
+      pmr::monotonic_buffer_resource rsrc{buffer.data(), buffer.size()};
+      pmr::polymorphic_allocator alloc{&rsrc};
+      auto env = exec::make_env(exec::with(stdexec::get_allocator, alloc));
+      auto [start, end] = exec::even_share(total_scheds, tid, pool.available_parallelism());
+      auto iterate = exec::schedule_all(pool, std::views::iota(start, end))
+                   | exec::ignore_all_values() | exec::write(env);
+#else
+      auto [start, end] = exec::even_share(total_scheds, tid, pool.available_parallelism());
+      auto iterate = exec::schedule_all(pool, std::views::iota(start, end))
+                   | exec::ignore_all_values();
+#endif
+      stdexec::sync_wait(iterate);
+      barrier.arrive_and_wait();
+    }
+  }
+};
+
+struct my_numa_distribution : public exec::default_numa_policy {
+  std::size_t thread_index_to_node(std::size_t index) override {
+    return exec::default_numa_policy::thread_index_to_node(2 * index);
+  }
+};
+
+int main(int argc, char** argv) {
+  my_numa_distribution numa{};
+  my_main<exec::static_thread_pool, RunThread>(argc, argv, &numa);
+}
+#else
+int main() {
+}
+#endif
\ No newline at end of file
diff --git a/examples/benchmark/static_thread_pool_bulk_enqueue_nested.cpp b/examples/benchmark/static_thread_pool_bulk_enqueue_nested.cpp
new file mode 100644
index 000000000..fac8f2350
--- /dev/null
+++ b/examples/benchmark/static_thread_pool_bulk_enqueue_nested.cpp
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2023 Maikel Nadolski
+ * Copyright (c) 2023 NVIDIA Corporation
+ *
+ * Licensed under the Apache License Version 2.0 with LLVM Exceptions
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *   https://llvm.org/LICENSE.txt
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "./common.hpp"
+#include <exec/static_thread_pool.hpp>
+
+#if STDEXEC_HAS_STD_RANGES()
+#include <exec/sequence/ignore_all_values.hpp>
+#include <exec/sequence/iterate.hpp>
+
+struct RunThread {
+  void operator()(
+    exec::static_thread_pool& pool,
+    std::size_t total_scheds,
+    std::size_t tid,
+    std::barrier<>& barrier,
+#ifndef STDEXEC_NO_MONOTONIC_BUFFER_RESOURCE
+    std::span<char> buffer,
+#endif
+    std::atomic<bool>& stop,
+    exec::numa_policy* numa) {
+    std::size_t numa_node = numa->thread_index_to_node(tid);
+    numa->bind_to_node(numa_node);
+    auto scheduler = pool.get_scheduler_on_thread(tid);
+    while (true) {
+      barrier.arrive_and_wait();
+      if (stop.load()) {
+        break;
+      }
+#ifndef STDEXEC_NO_MONOTONIC_BUFFER_RESOURCE
+      pmr::monotonic_buffer_resource rsrc{buffer.data(), buffer.size()};
+      pmr::polymorphic_allocator alloc{&rsrc};
+      auto env = exec::make_env(exec::with(stdexec::get_allocator, alloc));
+      auto [start, end] = exec::even_share(total_scheds, tid, pool.available_parallelism());
+      auto iterate = exec::iterate(std::views::iota(start, end)) | exec::ignore_all_values()
+                   | exec::write(env);
+#else
+      auto [start, end] = exec::even_share(total_scheds, tid, pool.available_parallelism());
+      auto iterate = exec::iterate(std::views::iota(start, end)) | exec::ignore_all_values();
+#endif
+      stdexec::sync_wait(stdexec::on(scheduler, iterate));
+      barrier.arrive_and_wait();
+    }
+  }
+};
+
+int main(int argc, char** argv) {
+  my_main<exec::static_thread_pool, RunThread>(argc, argv);
+}
+#else
+int main() {
+}
+#endif
\ No newline at end of file
diff --git a/examples/benchmark/static_thread_pool_nested.cpp b/examples/benchmark/static_thread_pool_nested.cpp
new file mode 100644
index 000000000..39eed6ec6
--- /dev/null
+++ b/examples/benchmark/static_thread_pool_nested.cpp
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2023 Maikel Nadolski
+ * Copyright (c) 2023 NVIDIA Corporation
+ *
+ * Licensed under the Apache License Version 2.0 with LLVM Exceptions
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *   https://llvm.org/LICENSE.txt
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#include "./common.hpp" +#include + +struct RunThread { + void operator()( + exec::static_thread_pool& pool, + std::size_t total_scheds, + std::size_t tid, + std::barrier<>& barrier, +#ifndef STDEXEC_NO_MONOTONIC_BUFFER_RESOURCE + std::span buffer, +#endif + std::atomic& stop, + exec::numa_policy* numa) { + std::size_t numa_node = numa->thread_index_to_node(tid); + numa->bind_to_node(numa_node); + auto scheduler = pool.get_scheduler(); + std::mutex mut; + std::condition_variable cv; + while (true) { + barrier.arrive_and_wait(); + if (stop.load()) { + break; + } +#ifndef STDEXEC_NO_MONOTONIC_BUFFER_RESOURCE + pmr::monotonic_buffer_resource resource{ + buffer.data(), buffer.size(), pmr::null_memory_resource()}; + pmr::polymorphic_allocator alloc(&resource); + auto [start, end] = exec::even_share(total_scheds, tid, pool.available_parallelism()); + std::size_t scheds = end - start; + std::atomic counter{scheds}; + auto env = exec::make_env(exec::with(stdexec::get_allocator, alloc)); + stdexec::sync_wait( + stdexec::schedule(scheduler) // + | stdexec::then([&] { + auto nested_scheduler = pool.get_scheduler(); + while (scheds) { + stdexec::start_detached( // + stdexec::schedule(nested_scheduler) // + | stdexec::then([&] { + auto prev = counter.fetch_sub(1); + if (prev == 1) { + std::lock_guard lock{mut}; + cv.notify_one(); + } + }), + env); + --scheds; + } + })); +#else + auto [start, end] = exec::even_share(total_scheds, tid, pool.available_parallelism()); + std::size_t scheds = end - start; + std::atomic counter{scheds}; + stdexec::sync_wait( + stdexec::schedule(scheduler) // + | stdexec::then([&] { + auto nested_scheduler = pool.get_scheduler(); + while (scheds) { + stdexec::start_detached( // + stdexec::schedule(nested_scheduler) // + | stdexec::then([&] { + auto prev = counter.fetch_sub(1); + if (prev == 1) { + std::lock_guard lock{mut}; + cv.notify_one(); + } + })); + --scheds; + } + })); +#endif + std::unique_lock lock{mut}; + cv.wait(lock, [&] { return counter.load() == 0; }); + lock.unlock(); + barrier.arrive_and_wait(); + } + } +}; + +int main(int argc, char** argv) { + my_main(argc, argv); +} \ No newline at end of file diff --git a/examples/benchmark/static_thread_pool_nested_old.cpp b/examples/benchmark/static_thread_pool_nested_old.cpp new file mode 100644 index 000000000..e09a153fa --- /dev/null +++ b/examples/benchmark/static_thread_pool_nested_old.cpp @@ -0,0 +1,97 @@ +/* + * Copyright (c) 2023 Maikel Nadolski + * Copyright (c) 2023 NVIDIA Corporation + * + * Licensed under the Apache License Version 2.0 with LLVM Exceptions + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * https://llvm.org/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "./common.hpp" +#include "./static_thread_pool_old.hpp" + +struct RunThread { + void operator()( + exec_old::static_thread_pool& pool, + std::size_t total_scheds, + std::size_t tid, + std::barrier<>& barrier, +#ifndef STDEXEC_NO_MONOTONIC_BUFFER_RESOURCE + std::span buffer, +#endif + std::atomic& stop, + exec::numa_policy* numa) { + std::size_t numa_node = numa->thread_index_to_node(tid); + numa->bind_to_node(numa_node); + auto scheduler = pool.get_scheduler(); + std::mutex mut; + std::condition_variable cv; + while (true) { + barrier.arrive_and_wait(); + if (stop.load()) { + break; + } +#ifndef STDEXEC_NO_MONOTONIC_BUFFER_RESOURCE + pmr::monotonic_buffer_resource resource{ + buffer.data(), buffer.size(), pmr::null_memory_resource()}; + pmr::polymorphic_allocator alloc(&resource); + auto [start, end] = exec_old::even_share(total_scheds, tid, pool.available_parallelism()); + std::size_t scheds = end - start; + std::atomic counter{scheds}; + auto env = exec::make_env(exec::with(stdexec::get_allocator, alloc)); + stdexec::sync_wait( + stdexec::schedule(scheduler) // + | stdexec::then([&] { + while (scheds) { + stdexec::start_detached( // + stdexec::schedule(scheduler) // + | stdexec::then([&] { + auto prev = counter.fetch_sub(1); + if (prev == 1) { + std::lock_guard lock{mut}; + cv.notify_one(); + } + }), + env); + --scheds; + } + })); +#else + auto [start, end] = exec_old::even_share(total_scheds, tid, pool.available_parallelism()); + std::size_t scheds = end - start; + std::atomic counter{scheds}; + stdexec::sync_wait( + stdexec::schedule(scheduler) // + | stdexec::then([&] { + while (scheds) { + stdexec::start_detached( // + stdexec::schedule(scheduler) // + | stdexec::then([&] { + auto prev = counter.fetch_sub(1); + if (prev == 1) { + std::lock_guard lock{mut}; + cv.notify_one(); + } + })); + --scheds; + } + })); +#endif + std::unique_lock lock{mut}; + cv.wait(lock, [&] { return counter.load() == 0; }); + lock.unlock(); + barrier.arrive_and_wait(); + } + } +}; + +int main(int argc, char** argv) { + my_main(argc, argv); +} \ No newline at end of file diff --git a/examples/benchmark/static_thread_pool_old.cpp b/examples/benchmark/static_thread_pool_old.cpp new file mode 100644 index 000000000..c9591dbec --- /dev/null +++ b/examples/benchmark/static_thread_pool_old.cpp @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2023 Maikel Nadolski + * Copyright (c) 2023 NVIDIA Corporation + * + * Licensed under the Apache License Version 2.0 with LLVM Exceptions + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * https://llvm.org/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */ +#include "./common.hpp" +#include "./static_thread_pool_old.hpp" + +struct RunThread { + void operator()( + exec_old::static_thread_pool& pool, + std::size_t total_scheds, + std::size_t tid, + std::barrier<>& barrier, +#ifndef STDEXEC_NO_MONOTONIC_BUFFER_RESOURCE + std::span buffer, +#endif + std::atomic& stop, + exec::numa_policy* numa) { + std::size_t numa_node = numa->thread_index_to_node(tid); + numa->bind_to_node(numa_node); + auto scheduler = pool.get_scheduler(); + std::mutex mut; + std::condition_variable cv; + while (true) { + barrier.arrive_and_wait(); + if (stop.load()) { + break; + } +#ifndef STDEXEC_NO_MONOTONIC_BUFFER_RESOURCE + pmr::monotonic_buffer_resource resource{ + buffer.data(), buffer.size(), pmr::null_memory_resource()}; + pmr::polymorphic_allocator alloc(&resource); + auto [start, end] = exec_old::even_share(total_scheds, tid, pool.available_parallelism()); + std::size_t scheds = end - start; + std::atomic counter{scheds}; + auto env = exec::make_env(exec::with(stdexec::get_allocator, alloc)); + while (scheds) { + stdexec::start_detached( // + stdexec::schedule(scheduler) // + | stdexec::then([&] { + auto prev = counter.fetch_sub(1); + if (prev == 1) { + std::lock_guard lock{mut}; + cv.notify_one(); + } + }), + env); + --scheds; + } +#else + auto [start, end] = exec_old::even_share(total_scheds, tid, pool.available_parallelism()); + std::size_t scheds = end - start; + std::atomic counter{scheds}; + while (scheds) { + stdexec::start_detached( // + stdexec::schedule(scheduler) // + | stdexec::then([&] { + auto prev = counter.fetch_sub(1); + if (prev == 1) { + std::lock_guard lock{mut}; + cv.notify_one(); + } + })); + --scheds; + } +#endif + std::unique_lock lock{mut}; + cv.wait(lock, [&] { return counter.load() == 0; }); + lock.unlock(); + barrier.arrive_and_wait(); + } + } +}; + +int main(int argc, char** argv) { + my_main(argc, argv); +} \ No newline at end of file diff --git a/examples/benchmark/static_thread_pool_old.hpp b/examples/benchmark/static_thread_pool_old.hpp new file mode 100644 index 000000000..96d35c873 --- /dev/null +++ b/examples/benchmark/static_thread_pool_old.hpp @@ -0,0 +1,695 @@ +/* + * Copyright (c) 2021-2022 Facebook, Inc. and its affiliates. + * Copyright (c) 2021-2022 NVIDIA Corporation + * + * Licensed under the Apache License Version 2.0 with LLVM Exceptions + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * https://llvm.org/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "stdexec/execution.hpp" +#include "stdexec/__detail/__config.hpp" +#include "stdexec/__detail/__intrusive_queue.hpp" +#include "stdexec/__detail/__meta.hpp" + +#include +#include +#include +#include +#include +#include +#include + +namespace exec_old { + using stdexec::__intrusive_queue; + + // Splits `n` into `size` chunks distributing `n % size` evenly between ranks. + // Returns `[begin, end)` range in `n` for a given `rank`. 
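+  // The first `n % size` ranks get `n / size + 1` items; the remaining ranks get `n / size`.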
+  // Example:
+  // ```cpp
+  //            // n_items  thread  n_threads
+  // even_share(        11,      0,         3); // -> [0,  4) -> 4 items
+  // even_share(        11,      1,         3); // -> [4,  8) -> 4 items
+  // even_share(        11,      2,         3); // -> [8, 11) -> 3 items
+  // ```
+  template <class Shape>
+  static std::pair<Shape, Shape>
+    even_share(Shape n, std::uint32_t rank, std::uint32_t size) noexcept {
+    const auto avg_per_thread = n / size;
+    const auto n_big_share = avg_per_thread + 1;
+    const auto big_shares = n % size;
+    const auto is_big_share = rank < big_shares;
+    const auto begin = is_big_share
+                       ? n_big_share * rank
+                       : n_big_share * big_shares + (rank - big_shares) * avg_per_thread;
+    const auto end = begin + (is_big_share ? n_big_share : avg_per_thread);
+
+    return std::make_pair(begin, end);
+  }
+
+  struct task_base {
+    task_base* next;
+    void (*__execute)(task_base*, std::uint32_t tid) noexcept;
+  };
+
+  class static_thread_pool {
+    template <class ReceiverId>
+    class operation;
+
+    struct schedule_tag {
+      // TODO: code to reconstitute a static_thread_pool schedule sender
+    };
+
+    template <class SenderId, std::integral Shape, class FunId>
+    struct bulk_sender;
+
+    template <stdexec::sender Sender, std::integral Shape, class Fun>
+    using bulk_sender_t = //
+      bulk_sender<
+        stdexec::__x<stdexec::__decay_t<Sender>>,
+        Shape,
+        stdexec::__x<stdexec::__decay_t<Fun>>>;
+
+#if STDEXEC_MSVC()
+    // MSVCBUG https://developercommunity.visualstudio.com/t/Alias-template-with-pack-expansion-in-no/10437850
+
+    template <class Fun, class Shape, class... Args>
+    struct __bulk_non_throwing {
+      using __t = stdexec::__decayed_tuple<Args...>;
+      static constexpr bool __v = noexcept(__t(std::declval<Args>()...));
+    };
+#endif
+
+    template <class Fun, class Shape, class... Args>
+      requires stdexec::__callable<Fun, Shape, Args&...>
+    using bulk_non_throwing = //
+      stdexec::__mbool<
+        // If function invocation doesn't throw
+        stdexec::__nothrow_callable<Fun, Shape, Args&...> &&
+        // and emplacing a tuple doesn't throw
+#if STDEXEC_MSVC()
+        __bulk_non_throwing<Fun, Shape, Args...>::__v
+#else
+        noexcept(stdexec::__decayed_tuple<Args...>(std::declval<Args>()...))
+#endif
+        // there's no need to advertise completion with `exception_ptr`
+        >;
+
+    template <class SenderId, class ReceiverId, class Shape, class Fun, bool MayThrow>
+    struct bulk_shared_state;
+
+    template <class SenderId, class ReceiverId, class Shape, class Fun, bool MayThrow>
+    struct bulk_receiver;
+
+    template <class SenderId, class ReceiverId, class Shape, class Fun>
+    struct bulk_op_state;
+
+    struct transform_bulk {
+      template <class Data, class Sender>
+      auto operator()(stdexec::bulk_t, Data&& data, Sender&& sndr) {
+        auto [shape, fun] = (Data&&) data;
+        return bulk_sender_t<Sender, decltype(shape), decltype(fun)>{
+          pool_, (Sender&&) sndr, shape, std::move(fun)};
+      }
+
+      static_thread_pool& pool_;
+    };
+
+    struct domain {
+      // For eager customization
+      template <stdexec::sender_expr_for<stdexec::bulk_t> Sender>
+      auto transform_sender(Sender&& sndr) const noexcept {
+        auto sched = stdexec::get_completion_scheduler<stdexec::set_value_t>(
+          stdexec::get_env(sndr));
+        return stdexec::apply_sender((Sender&&) sndr, transform_bulk{*sched.pool_});
+      }
+
+      // transform the generic bulk sender into a parallel thread-pool bulk sender
+      template <stdexec::sender_expr_for<stdexec::bulk_t> Sender, class Env>
+        requires stdexec::__callable<stdexec::get_scheduler_t, const Env&>
+      auto transform_sender(Sender&& sndr, const Env& env) const noexcept {
+        auto sched = stdexec::get_scheduler(env);
+        return stdexec::apply_sender((Sender&&) sndr, transform_bulk{*sched.pool_});
+      }
+    };
+
+   public:
+    static_thread_pool();
+    static_thread_pool(std::uint32_t threadCount);
+    ~static_thread_pool();
+
+    struct scheduler {
+      using __t = scheduler;
+      using __id = scheduler;
+      bool operator==(const scheduler&) const = default;
+
+     private:
+      template <class ReceiverId>
+      friend class operation;
+
+      class sender {
+       public:
+        using __t = sender;
+        using __id = sender;
+        using sender_concept = stdexec::sender_t;
+        using completion_signatures =
+          stdexec::completion_signatures< stdexec::set_value_t(), stdexec::set_stopped_t()>;
+       private:
+        template <class Receiver>
+        auto make_operation_(Receiver r) const -> operation<stdexec::__x<Receiver>> {
+          return operation<stdexec::__x<Receiver>>{pool_, (Receiver&&) r};
+        }
+
+        template <class Receiver>
+        friend auto tag_invoke(stdexec::connect_t, sender s, Receiver r)
+          -> operation<stdexec::__x<Receiver>> {
+          return s.make_operation_((Receiver&&) r);
+        }
+
+        struct env {
+          static_thread_pool& pool_;
+
+          template <class CPO>
+          friend static_thread_pool::scheduler
+            tag_invoke(stdexec::get_completion_scheduler_t<CPO>, const env& self) noexcept {
+            return self.make_scheduler_();
+          }
+
+          static_thread_pool::scheduler make_scheduler_() const {
+            return static_thread_pool::scheduler{pool_};
+          }
+        };
+
+        friend env tag_invoke(stdexec::get_env_t, const sender& self) noexcept {
+          return env{self.pool_};
+        }
+
+        friend struct static_thread_pool::scheduler;
+
+        explicit sender(static_thread_pool& pool) noexcept
+          : pool_(pool) {
+        }
+
+        static_thread_pool& pool_;
+      };
+
+      sender make_sender_() const {
+        return sender{*pool_};
+      }
+
+      friend sender tag_invoke(stdexec::schedule_t, const scheduler& s) noexcept {
+        return s.make_sender_();
+      }
+
+      friend stdexec::forward_progress_guarantee
+        tag_invoke(stdexec::get_forward_progress_guarantee_t, const static_thread_pool&) noexcept {
+        return stdexec::forward_progress_guarantee::parallel;
+      }
+
+      friend domain tag_invoke(stdexec::get_domain_t, scheduler) noexcept {
+        return {};
+      }
+
+      friend class static_thread_pool;
+
+      explicit scheduler(static_thread_pool& pool) noexcept
+        : pool_(&pool) {
+      }
+
+      static_thread_pool* pool_;
+    };
+
+    scheduler get_scheduler() noexcept {
+      return scheduler{*this};
+    }
+
+    void request_stop() noexcept;
+
+    std::uint32_t available_parallelism() const {
+      return threadCount_;
+    }
+
+   private:
+    class thread_state {
+     public:
+      task_base* try_pop();
+      task_base* pop();
+      bool try_push(task_base* task);
+      void push(task_base* task);
+      void request_stop();
+
+     private:
+      std::mutex mut_;
+      std::condition_variable cv_;
+      __intrusive_queue<&task_base::next> queue_;
+      bool stopRequested_ = false;
+    };
+
+    void run(std::uint32_t index) noexcept;
+    void join() noexcept;
+
+    void enqueue(task_base* task) noexcept;
+
+    template <std::derived_from<task_base> TaskT>
+    void bulk_enqueue(TaskT* task, std::uint32_t n_threads) noexcept;
+
+    std::uint32_t threadCount_;
+    std::vector<std::thread> threads_;
+    std::vector<thread_state> threadStates_;
+    std::atomic<std::uint32_t> nextThread_;
+  };
+
+  inline static_thread_pool::static_thread_pool()
+    : static_thread_pool(std::thread::hardware_concurrency()) {
+  }
+
+  inline static_thread_pool::static_thread_pool(std::uint32_t threadCount)
+    : threadCount_(threadCount)
+    , threadStates_(threadCount)
+    , nextThread_(0) {
+    STDEXEC_ASSERT(threadCount > 0);
+
+    threads_.reserve(threadCount);
+
+    try {
+      for (std::uint32_t i = 0; i < threadCount; ++i) {
+        threads_.emplace_back([this, i] { run(i); });
+      }
+    } catch (...) {
+      request_stop();
+      join();
+      throw;
+    }
+  }
+
+  inline static_thread_pool::~static_thread_pool() {
+    request_stop();
+    join();
+  }
+
+  inline void static_thread_pool::request_stop() noexcept {
+    for (auto& state: threadStates_) {
+      state.request_stop();
+    }
+  }
+
+  inline void static_thread_pool::run(const std::uint32_t threadIndex) noexcept {
+    STDEXEC_ASSERT(threadIndex < threadCount_);
+    while (true) {
+      task_base* task = nullptr;
+      std::uint32_t queueIndex = threadIndex;
+
+      // Starting with this thread's queue, try to de-queue a task
+      // from each thread's queue. try_pop() is non-blocking.
+      do {
+        task = threadStates_[queueIndex].try_pop();
+      } while (!task && (++queueIndex %= threadCount_) != threadIndex);
+
+      STDEXEC_ASSERT(task || queueIndex == threadIndex);
+      // Make a blocking call to de-queue a task if we don't already have one.
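+      // (When every try_pop() failed, queueIndex has wrapped back around to threadIndex,
+      // so the blocking pop below waits on this thread's own queue.)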
+      if (!task && !(task = threadStates_[queueIndex].pop()))
+        return; // pop() only returns null when request_stop() was called.
+
+      task->__execute(task, queueIndex);
+    }
+  }
+
+  inline void static_thread_pool::join() noexcept {
+    for (auto& t: threads_) {
+      t.join();
+    }
+    threads_.clear();
+  }
+
+  inline void static_thread_pool::enqueue(task_base* task) noexcept {
+    const std::uint32_t threadCount = static_cast<std::uint32_t>(threads_.size());
+    const std::uint32_t startIndex =
+      nextThread_.fetch_add(1, std::memory_order_relaxed) % threadCount;
+
+    // First try to enqueue to one of the threads without blocking.
+    for (std::uint32_t i = 0; i < threadCount; ++i) {
+      const auto index =
+        (startIndex + i) < threadCount ? (startIndex + i) : (startIndex + i - threadCount);
+      if (threadStates_[index].try_push(task)) {
+        return;
+      }
+    }
+
+    // Otherwise, do a blocking enqueue on the selected thread.
+    threadStates_[startIndex].push(task);
+  }
+
+  template <std::derived_from<task_base> TaskT>
+  inline void static_thread_pool::bulk_enqueue(TaskT* task, std::uint32_t n_threads) noexcept {
+    for (std::size_t i = 0; i < n_threads; ++i) {
+      threadStates_[i % available_parallelism()].push(task + i);
+    }
+  }
+
+  inline task_base* static_thread_pool::thread_state::try_pop() {
+    std::unique_lock lk{mut_, std::try_to_lock};
+    if (!lk || queue_.empty()) {
+      return nullptr;
+    }
+    return queue_.pop_front();
+  }
+
+  inline task_base* static_thread_pool::thread_state::pop() {
+    std::unique_lock lk{mut_};
+    while (queue_.empty()) {
+      if (stopRequested_) {
+        return nullptr;
+      }
+      cv_.wait(lk);
+    }
+    return queue_.pop_front();
+  }
+
+  inline bool static_thread_pool::thread_state::try_push(task_base* task) {
+    std::unique_lock lk{mut_, std::try_to_lock};
+    if (!lk) {
+      return false;
+    }
+    const bool wasEmpty = queue_.empty();
+    queue_.push_back(task);
+    if (wasEmpty) {
+      cv_.notify_one();
+    }
+    return true;
+  }
+
+  inline void static_thread_pool::thread_state::push(task_base* task) {
+    std::lock_guard lk{mut_};
+    const bool wasEmpty = queue_.empty();
+    queue_.push_back(task);
+    if (wasEmpty) {
+      cv_.notify_one();
+    }
+  }
+
+  inline void static_thread_pool::thread_state::request_stop() {
+    std::lock_guard lk{mut_};
+    stopRequested_ = true;
+    cv_.notify_one();
+  }
+
+  template <class ReceiverId>
+  class static_thread_pool::operation : task_base {
+    using Receiver = stdexec::__t<ReceiverId>;
+    friend static_thread_pool::scheduler::sender;
+
+    static_thread_pool& pool_;
+    Receiver receiver_;
+
+    explicit operation(static_thread_pool& pool, Receiver&& r)
+      : pool_(pool)
+      , receiver_((Receiver&&) r) {
+      this->__execute = [](task_base* t, const std::uint32_t /* tid */) noexcept {
+        auto& op = *static_cast<operation*>(t);
+        auto stoken = stdexec::get_stop_token(stdexec::get_env(op.receiver_));
+        if constexpr (std::unstoppable_token<decltype(stoken)>) {
+          stdexec::set_value((Receiver&&) op.receiver_);
+        } else if (stoken.stop_requested()) {
+          stdexec::set_stopped((Receiver&&) op.receiver_);
+        } else {
+          stdexec::set_value((Receiver&&) op.receiver_);
+        }
+      };
+    }
+
+    void enqueue_(task_base* op) const {
+      pool_.enqueue(op);
+    }
+
+    friend void tag_invoke(stdexec::start_t, operation& op) noexcept {
+      op.enqueue_(&op);
+    }
+  };
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////
+  // What follows is the implementation for parallel bulk execution on static_thread_pool.
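+  //
+  // A bulk_sender connects to a bulk_op_state holding a bulk_shared_state. The shared
+  // state owns one bulk_task per required agent; when the predecessor completes, the
+  // bulk_receiver stores the values and bulk-enqueues the tasks, and each agent runs the
+  // function over its even_share() slice of the shape before the last agent signals
+  // completion.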
+  template <class SenderId, std::integral Shape, class FunId>
+  struct static_thread_pool::bulk_sender {
+    using Sender = stdexec::__t<SenderId>;
+    using Fun = stdexec::__t<FunId>;
+    using sender_concept = stdexec::sender_t;
+
+    static_thread_pool& pool_;
+    Sender sndr_;
+    Shape shape_;
+    Fun fun_;
+
+    template <class CvrefSender, class Env>
+    using with_error_invoke_t = //
+      stdexec::__if_c<
+        stdexec::__v<stdexec::__value_types_of_t<
+          CvrefSender,
+          Env,
+          stdexec::__mbind_front_q<bulk_non_throwing, Fun, Shape>,
+          stdexec::__q<stdexec::__mand>>>,
+        stdexec::completion_signatures<>,
+        stdexec::__with_exception_ptr>;
+
+    template <class... Tys>
+    using set_value_t =
+      stdexec::completion_signatures< stdexec::set_value_t(stdexec::__decay_t<Tys>...)>;
+
+    template <class Self, class Env>
+    using completion_signatures = //
+      stdexec::__try_make_completion_signatures<
+        stdexec::__copy_cvref_t<Self, Sender>,
+        Env,
+        with_error_invoke_t<stdexec::__copy_cvref_t<Self, Sender>, Env>,
+        stdexec::__q<set_value_t>>;
+
+    template <class Receiver>
+    using bulk_op_state_t = //
+      bulk_op_state<
+        stdexec::__x<stdexec::__decay_t<Sender>>,
+        stdexec::__x<stdexec::__decay_t<Receiver>>,
+        Shape,
+        Fun>;
+
+    template <stdexec::__decays_to<bulk_sender> Self, stdexec::receiver Receiver>
+      requires stdexec::
+        receiver_of<Receiver, completion_signatures<Self, stdexec::env_of_t<Receiver>>>
+    friend bulk_op_state_t<Receiver> //
+      tag_invoke(stdexec::connect_t, Self&& self, Receiver&& rcvr) //
+      noexcept(stdexec::__nothrow_constructible_from<
+               bulk_op_state_t<Receiver>,
+               static_thread_pool&,
+               Shape,
+               Fun,
+               Sender,
+               Receiver>) {
+      return bulk_op_state_t<Receiver>{
+        self.pool_, self.shape_, self.fun_, ((Self&&) self).sndr_, (Receiver&&) rcvr};
+    }
+
+    template <stdexec::__decays_to<bulk_sender> Self, class Env>
+    friend auto tag_invoke(stdexec::get_completion_signatures_t, Self&&, Env&&)
+      -> completion_signatures<Self, Env> {
+      return {};
+    }
+
+    friend auto tag_invoke(stdexec::get_env_t, const bulk_sender& self) noexcept
+      -> stdexec::env_of_t<Sender> {
+      return stdexec::get_env(self.sndr_);
+    }
+  };
+
+  template <class SenderId, class ReceiverId, class Shape, class Fun, bool MayThrow>
+  struct static_thread_pool::bulk_shared_state {
+    using Sender = stdexec::__t<SenderId>;
+    using Receiver = stdexec::__t<ReceiverId>;
+
+    struct bulk_task : task_base {
+      bulk_shared_state* sh_state_;
+
+      bulk_task(bulk_shared_state* sh_state)
+        : sh_state_(sh_state) {
+        this->__execute = [](task_base* t, const std::uint32_t tid) noexcept {
+          auto& sh_state = *static_cast<bulk_task*>(t)->sh_state_;
+          auto total_threads = sh_state.num_agents_required();
+
+          auto computation = [&](auto&... args) {
+            auto [begin, end] = even_share(sh_state.shape_, tid, total_threads);
+            for (Shape i = begin; i < end; ++i) {
+              sh_state.fn_(i, args...);
+            }
+          };
+
+          auto completion = [&](auto&... args) {
+            stdexec::set_value((Receiver&&) sh_state.receiver_, std::move(args)...);
+          };
+
+          if constexpr (MayThrow) {
+            try {
+              sh_state.apply(computation);
+            } catch (...) {
+              std::uint32_t expected = total_threads;
+
+              if (sh_state.thread_with_exception_.compare_exchange_strong(
+                    expected, tid, std::memory_order_relaxed, std::memory_order_relaxed)) {
+                sh_state.exception_ = std::current_exception();
+              }
+            }
+
+            const bool is_last_thread = sh_state.finished_threads_.fetch_add(1)
+                                     == (total_threads - 1);
+
+            if (is_last_thread) {
+              if (sh_state.exception_) {
+                stdexec::set_error((Receiver&&) sh_state.receiver_, std::move(sh_state.exception_));
+              } else {
+                sh_state.apply(completion);
+              }
+            }
+          } else {
+            sh_state.apply(computation);
+
+            const bool is_last_thread = sh_state.finished_threads_.fetch_add(1)
+                                     == (total_threads - 1);
+
+            if (is_last_thread) {
+              sh_state.apply(completion);
+            }
+          }
+        };
+      }
+    };
+
+    using variant_t = //
+      stdexec::__value_types_of_t<
+        Sender,
+        stdexec::env_of_t<Receiver>,
+        stdexec::__q<stdexec::__decayed_tuple>,
+        stdexec::__q<stdexec::__variant>>;
+
+    variant_t data_;
+    static_thread_pool& pool_;
+    Receiver receiver_;
+    Shape shape_;
+    Fun fn_;
+
+    std::atomic<std::uint32_t> finished_threads_{0};
+    std::atomic<std::uint32_t> thread_with_exception_{0};
+    std::exception_ptr exception_;
+    std::vector<bulk_task> tasks_;
+
+    std::uint32_t num_agents_required() const {
+      return std::min(shape_, static_cast<Shape>(pool_.available_parallelism()));
+    }
+
+    template <class F>
+    void apply(F f) {
+      std::visit(
+        [&](auto& tupl) -> void { std::apply([&](auto&... args) -> void { f(args...); }, tupl); },
+        data_);
+    }
+
+    bulk_shared_state(static_thread_pool& pool, Receiver receiver, Shape shape, Fun fn)
+      : pool_{pool}
+      , receiver_{(Receiver&&) receiver}
+      , shape_{shape}
+      , fn_{fn}
+      , thread_with_exception_{num_agents_required()}
+      , tasks_{num_agents_required(), {this}} {
+    }
+  };
+
+  template <class SenderId, class ReceiverId, class Shape, class Fun, bool MayThrow>
+  struct static_thread_pool::bulk_receiver {
+    using receiver_concept = stdexec::receiver_t;
+    using Sender = stdexec::__t<SenderId>;
+    using Receiver = stdexec::__t<ReceiverId>;
+
+    using shared_state = bulk_shared_state<SenderId, ReceiverId, Shape, Fun, MayThrow>;
+
+    shared_state& shared_state_;
+
+    void enqueue() noexcept {
+      shared_state_.pool_.bulk_enqueue(
+        shared_state_.tasks_.data(), shared_state_.num_agents_required());
+    }
+
+    template <class... As>
+    friend void tag_invoke(
+      stdexec::same_as<stdexec::set_value_t> auto,
+      bulk_receiver&& self,
+      As&&... as) noexcept {
+      using tuple_t = stdexec::__decayed_tuple<As...>;
+
+      shared_state& state = self.shared_state_;
+
+      if constexpr (MayThrow) {
+        try {
+          state.data_.template emplace<tuple_t>((As&&) as...);
+        } catch (...) {
+          stdexec::set_error(std::move(state.receiver_), std::current_exception());
+        }
+      } else {
+        state.data_.template emplace<tuple_t>((As&&) as...);
+      }
+
+      if (state.shape_) {
+        self.enqueue();
+      } else {
+        state.apply([&](auto&... args) {
+          stdexec::set_value(std::move(state.receiver_), std::move(args)...);
+        });
+      }
+    }
+
+    template <stdexec::__one_of<stdexec::set_error_t, stdexec::set_stopped_t> Tag, class... As>
+    friend void tag_invoke(Tag tag, bulk_receiver&& self, As&&... as) noexcept {
+      shared_state& state = self.shared_state_;
+      tag((Receiver&&) state.receiver_, (As&&) as...);
+    }
+
+    friend auto tag_invoke(stdexec::get_env_t, const bulk_receiver& self) noexcept
+      -> stdexec::env_of_t<Receiver> {
+      return stdexec::get_env(self.shared_state_.receiver_);
+    }
+  };
+
+  template <class SenderId, class ReceiverId, class Shape, class Fun>
+  struct static_thread_pool::bulk_op_state {
+    using Sender = stdexec::__t<SenderId>;
+    using Receiver = stdexec::__t<ReceiverId>;
+
+    static constexpr bool may_throw = //
+      !stdexec::__v<stdexec::__value_types_of_t<
+        Sender,
+        stdexec::env_of_t<Receiver>,
+        stdexec::__mbind_front_q<bulk_non_throwing, Fun, Shape>,
+        stdexec::__q<stdexec::__mand>>>;
+
+    using bulk_rcvr = bulk_receiver<SenderId, ReceiverId, Shape, Fun, may_throw>;
+    using shared_state = bulk_shared_state<SenderId, ReceiverId, Shape, Fun, may_throw>;
+    using inner_op_state = stdexec::connect_result_t<Sender, bulk_rcvr>;
+
+    shared_state shared_state_;
+
+    inner_op_state inner_op_;
+
+    friend void tag_invoke(stdexec::start_t, bulk_op_state& op) noexcept {
+      stdexec::start(op.inner_op_);
+    }
+
+    bulk_op_state(static_thread_pool& pool, Shape shape, Fun fn, Sender&& sender, Receiver receiver)
+      : shared_state_(pool, (Receiver&&) receiver, shape, fn)
+      , inner_op_{stdexec::connect((Sender&&) sender, bulk_rcvr{shared_state_})} {
+    }
+  };
+
+} // namespace exec_old
diff --git a/examples/benchmark/tbb_thread_pool.cpp b/examples/benchmark/tbb_thread_pool.cpp
new file mode 100644
index 000000000..d313069ba
--- /dev/null
+++ b/examples/benchmark/tbb_thread_pool.cpp
@@ -0,0 +1,89 @@
+/*
+ * Copyright (c) 2023 Maikel Nadolski
+ * Copyright (c) 2023 NVIDIA Corporation
+ *
+ * Licensed under the Apache License Version 2.0 with LLVM Exceptions
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *   https://llvm.org/LICENSE.txt
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */ +#include "./common.hpp" +#include + +struct RunThread { + void operator()( + tbbexec::tbb_thread_pool& pool, + std::size_t total_scheds, + std::size_t tid, + std::barrier<>& barrier, +#ifndef STDEXEC_NO_MONOTONIC_BUFFER_RESOURCE + std::span buffer, +#endif + std::atomic& stop, + exec::numa_policy* numa) { + std::size_t numa_node = numa->thread_index_to_node(tid); + numa->bind_to_node(numa_node); + auto scheduler = pool.get_scheduler(); + std::mutex mut; + std::condition_variable cv; + while (true) { + barrier.arrive_and_wait(); + if (stop.load()) { + break; + } +#ifndef STDEXEC_NO_MONOTONIC_BUFFER_RESOURCE + pmr::monotonic_buffer_resource resource{ + buffer.data(), buffer.size(), pmr::null_memory_resource()}; + pmr::polymorphic_allocator alloc(&resource); + auto [start, end] = exec::even_share(total_scheds, tid, pool.available_parallelism()); + std::size_t scheds = end - start; + std::atomic counter{scheds}; + auto env = exec::make_env(exec::with(stdexec::get_allocator, alloc)); + while (scheds) { + stdexec::start_detached( // + stdexec::schedule(scheduler) // + | stdexec::then([&] { + auto prev = counter.fetch_sub(1); + if (prev == 1) { + std::lock_guard lock{mut}; + cv.notify_one(); + } + }), + env); + --scheds; + } +#else + auto [start, end] = exec::even_share(total_scheds, tid, pool.available_parallelism()); + std::size_t scheds = end - start; + std::atomic counter{scheds}; + while (scheds) { + stdexec::start_detached( // + stdexec::schedule(scheduler) // + | stdexec::then([&] { + auto prev = counter.fetch_sub(1); + if (prev == 1) { + std::lock_guard lock{mut}; + cv.notify_one(); + } + })); + --scheds; + } +#endif + std::unique_lock lock{mut}; + cv.wait(lock, [&] { return counter.load() == 0; }); + lock.unlock(); + barrier.arrive_and_wait(); + } + } +}; + +int main(int argc, char** argv) { + my_main(argc, argv); +} \ No newline at end of file diff --git a/examples/benchmark/tbb_thread_pool_nested.cpp b/examples/benchmark/tbb_thread_pool_nested.cpp new file mode 100644 index 000000000..7802f319a --- /dev/null +++ b/examples/benchmark/tbb_thread_pool_nested.cpp @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2023 Maikel Nadolski + * Copyright (c) 2023 NVIDIA Corporation + * + * Licensed under the Apache License Version 2.0 with LLVM Exceptions + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * https://llvm.org/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. 
+ */
+#include "./common.hpp"
+
+#include <tbbexec/tbb_thread_pool.hpp>
+#include <tbb/task_group.h>
+
+struct RunThread {
+  void operator()(
+    tbbexec::tbb_thread_pool& pool,
+    std::size_t total_scheds,
+    std::size_t tid,
+    std::barrier<>& barrier,
+#ifndef STDEXEC_NO_MONOTONIC_BUFFER_RESOURCE
+    [[maybe_unused]] std::span<char> buffer,
+#endif
+    std::atomic<bool>& stop,
+    exec::numa_policy* numa) {
+    std::size_t numa_node = numa->thread_index_to_node(tid);
+    numa->bind_to_node(numa_node);
+    auto scheduler = pool.get_scheduler();
+    std::mutex mut;
+    std::condition_variable cv;
+    while (true) {
+      barrier.arrive_and_wait();
+      if (stop.load()) {
+        break;
+      }
+      auto [start, end] = exec::even_share(total_scheds, tid, pool.available_parallelism());
+      std::size_t scheds = end - start;
+      tbb::task_group tg{};
+      stdexec::sync_wait(             //
+        stdexec::schedule(scheduler)  //
+        | stdexec::then([&] {
+            for (std::size_t i = 0; i < scheds; ++i) {
+              tg.run([&] {
+                // empty
+              });
+            }
+          }));
+      tg.wait();
+      barrier.arrive_and_wait();
+    }
+  }
+};
+
+int main(int argc, char** argv) {
+  my_main(argc, argv);
+}
\ No newline at end of file
diff --git a/examples/io_uring.cpp b/examples/io_uring.cpp
index eb837b6cb..37f8c5a84 100644
--- a/examples/io_uring.cpp
+++ b/examples/io_uring.cpp
@@ -31,10 +31,10 @@ int main() {
   exec::io_uring_context context;
   exec::io_uring_context context2;
   std::thread io_thread{[&] {
-    context.run();
+    context.run_until_stopped();
   }};
   std::thread io_thread2{[&] {
-    context2.run();
+    context2.run_until_stopped();
   }};
   auto scheduler = context.get_scheduler();
   auto scheduler2 = context2.get_scheduler();
@@ -56,7 +56,9 @@ int main() {
     exec::schedule_after(scheduler, 1s) | stdexec::then([] { std::cout << "Hello, 1!\n"; }),
     exec::schedule_after(scheduler2, 2s) | stdexec::then([] { std::cout << "Hello, 2!\n"; }),
     exec::schedule_after(scheduler, 3s) | stdexec::then([] { std::cout << "Stop it!\n"; }),
-    exec::schedule_after(scheduler2, 4s) | stdexec::then([&] { context.request_stop(); }),
+    exec::finally(
+      exec::schedule_after(scheduler2, 4s),
+      stdexec::just() | stdexec::then([&] { context.request_stop(); })),
     exec::finally(
       exec::schedule_after(scheduler, 4s),
       stdexec::just() | stdexec::then([&] { context2.request_stop(); })),
@@ -87,8 +89,9 @@ int main() {
     | stdexec::then([] { std::cout << "This should not print, because the context is stopped.\n"; })
     | stdexec::upon_stopped([] { std::cout << "The context is stopped!\n"; }));
 
+  context.reset();
   io_thread = std::thread{[&] {
-    context.run();
+    context.run_until_stopped();
   }};
 
   while (!context.is_running())
diff --git a/examples/nvexec/maxwell/common.cuh b/examples/nvexec/maxwell/common.cuh
index f2737e5ee..8e3c8edb8 100644
--- a/examples/nvexec/maxwell/common.cuh
+++ b/examples/nvexec/maxwell/common.cuh
@@ -51,8 +51,8 @@ struct deleter_t {
 };
 
 template <class T>
-STDEXEC_DETAIL_CUDACC_HOST_DEVICE inline std::unique_ptr<T, deleter_t>
-  allocate_on(bool gpu, std::size_t elements = 1) {
+STDEXEC_ATTRIBUTE((host, device))
+inline std::unique_ptr<T, deleter_t> allocate_on(bool gpu, std::size_t elements = 1) {
   T *ptr{};
 
 #if defined(_NVHPC_CUDA) || defined(__CUDACC__)
@@ -90,9 +90,7 @@ struct fields_accessor {
 
   float *base_ptr;
 
-  [[nodiscard]] STDEXEC_DETAIL_CUDACC_HOST_DEVICE //
-    float *
-    get(field_id id) const {
+  STDEXEC_ATTRIBUTE((nodiscard, host, device)) float *get(field_id id) const {
     return base_ptr + static_cast<std::size_t>(id) * cells;
   }
 };
@@ -124,9 +122,8 @@ struct grid_t {
 
 constexpr float C0 = 299792458.0f; // Speed of light [metres per second]
 
-STDEXEC_DETAIL_CUDACC_HOST_DEVICE //
-  inline bool
-  is_circle_part(float x, float y, float object_x, float 
object_y, float object_size) { +STDEXEC_ATTRIBUTE((host, device)) +inline bool is_circle_part(float x, float y, float object_x, float object_y, float object_size) { const float os2 = object_size * object_size; return ((x - object_x) * (x - object_x) + (y - object_y) * (y - object_y) <= os2); } @@ -140,9 +137,7 @@ struct grid_initializer_t { float dt; fields_accessor accessor; - STDEXEC_DETAIL_CUDACC_HOST_DEVICE // - void - operator()(std::size_t cell_id) const { + STDEXEC_ATTRIBUTE((host, device)) void operator()(std::size_t cell_id) const { const std::size_t row = cell_id / accessor.n; const std::size_t column = cell_id % accessor.n; @@ -185,36 +180,30 @@ inline grid_initializer_t grid_initializer(float dt, fields_accessor accessor) { return {dt, accessor}; } -STDEXEC_DETAIL_CUDACC_HOST_DEVICE // - inline std::size_t - right_nid(std::size_t cell_id, std::size_t col, std::size_t N) { +STDEXEC_ATTRIBUTE((host, device)) +inline std::size_t right_nid(std::size_t cell_id, std::size_t col, std::size_t N) { return col == N - 1 ? cell_id - (N - 1) : cell_id + 1; } -STDEXEC_DETAIL_CUDACC_HOST_DEVICE // - inline std::size_t - left_nid(std::size_t cell_id, std::size_t col, std::size_t N) { +STDEXEC_ATTRIBUTE((host, device)) +inline std::size_t left_nid(std::size_t cell_id, std::size_t col, std::size_t N) { return col == 0 ? cell_id + N - 1 : cell_id - 1; } -STDEXEC_DETAIL_CUDACC_HOST_DEVICE // - inline std::size_t - bottom_nid(std::size_t cell_id, std::size_t row, std::size_t N) { +STDEXEC_ATTRIBUTE((host, device)) +inline std::size_t bottom_nid(std::size_t cell_id, std::size_t row, std::size_t N) { return row == 0 ? cell_id + N * (N - 1) : cell_id - N; } -STDEXEC_DETAIL_CUDACC_HOST_DEVICE // - inline std::size_t - top_nid(std::size_t cell_id, std::size_t row, std::size_t N) { +STDEXEC_ATTRIBUTE((host, device)) +inline std::size_t top_nid(std::size_t cell_id, std::size_t row, std::size_t N) { return row == N - 1 ? 
cell_id - N * (N - 1) : cell_id + N; } struct h_field_calculator_t { fields_accessor accessor; - STDEXEC_DETAIL_CUDACC_HOST_DEVICE // - void - operator()(std::size_t cell_id) const __attribute__((always_inline)) { + STDEXEC_ATTRIBUTE((always_inline, host, device)) void operator()(std::size_t cell_id) const { const std::size_t N = accessor.n; const std::size_t column = cell_id % N; const std::size_t row = cell_id / N; @@ -240,23 +229,19 @@ struct e_field_calculator_t { fields_accessor accessor; std::size_t source_position; - [[nodiscard]] STDEXEC_DETAIL_CUDACC_HOST_DEVICE // - float - gaussian_pulse(float t, float t_0, float tau) const { + STDEXEC_ATTRIBUTE((nodiscard, host, device)) + float gaussian_pulse(float t, float t_0, float tau) const { return exp(-(((t - t_0) / tau) * (t - t_0) / tau)); } - [[nodiscard]] STDEXEC_DETAIL_CUDACC_HOST_DEVICE // - float - calculate_source(float t, float frequency) const { + STDEXEC_ATTRIBUTE((nodiscard, host, device)) + float calculate_source(float t, float frequency) const { const float tau = 0.5f / frequency; const float t_0 = 6.0f * tau; return gaussian_pulse(t, t_0, tau); } - STDEXEC_DETAIL_CUDACC_HOST_DEVICE // - void - operator()(std::size_t cell_id) const __attribute__((always_inline)) { + STDEXEC_ATTRIBUTE((always_inline, host, device)) void operator()(std::size_t cell_id) const { const std::size_t N = accessor.n; const std::size_t column = cell_id % N; const std::size_t row = cell_id / N; diff --git a/examples/nvexec/maxwell/snr.cuh b/examples/nvexec/maxwell/snr.cuh index 6ccdec3cc..6555faeb7 100644 --- a/examples/nvexec/maxwell/snr.cuh +++ b/examples/nvexec/maxwell/snr.cuh @@ -26,9 +26,13 @@ #include #else namespace nvexec { - struct stream_receiver_base { }; + struct stream_receiver_base { + using receiver_concept = stdexec::receiver_t; + }; - struct stream_sender_base { }; + struct stream_sender_base { + using sender_concept = stdexec::sender_t; + }; namespace detail { struct stream_op_state_base { }; @@ -185,65 +189,133 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS { // #endif namespace repeat_n_detail { + template - class receiver_t { + class receiver_2_t { + using Sender = typename OpT::PredSender; using Receiver = typename OpT::Receiver; OpT& op_state_; public: - using __t = receiver_t; - using __id = receiver_t; - using is_receiver = void; + using receiver_concept = stdexec::receiver_t; template _Tag, class... _Args> - STDEXEC_DETAIL_CUDACC_HOST_DEVICE // - friend void - tag_invoke(_Tag __tag, receiver_t&& __self, _Args&&... __args) noexcept { - __tag(std::move(__self.op_state_.rcvr_), (_Args&&) __args...); + friend void tag_invoke(_Tag __tag, receiver_2_t&& __self, _Args&&... 
__args) noexcept { + OpT& op_state = __self.op_state_; + __tag(std::move(op_state.rcvr_), (_Args&&) __args...); } - friend void tag_invoke(ex::set_value_t, receiver_t&& __self) noexcept { + friend void tag_invoke(ex::set_value_t, receiver_2_t&& __self) noexcept { + using inner_op_state_t = typename OpT::inner_op_state_t; + OpT& op_state = __self.op_state_; + op_state.i_++; - for (std::size_t i = 0; i < op_state.n_; i++) { - stdexec::sync_wait(ex::schedule(exec::inline_scheduler{}) | op_state.closure_); + if (op_state.i_ == op_state.n_) { + stdexec::set_value(std::move(op_state.rcvr_)); + return; } - stdexec::set_value(std::move(op_state.rcvr_)); + auto sch = stdexec::get_scheduler(stdexec::get_env(op_state.rcvr_)); + inner_op_state_t& inner_op_state = op_state.inner_op_state_.emplace( + stdexec::__conv{[&]() noexcept { + return ex::connect(ex::schedule(sch) | op_state.closure_, receiver_2_t{op_state}); + }}); + + ex::start(inner_op_state); } - friend auto tag_invoke(ex::get_env_t, const receiver_t& self) noexcept + friend auto tag_invoke(ex::get_env_t, const receiver_2_t& self) noexcept -> stdexec::env_of_t { return stdexec::get_env(self.op_state_.rcvr_); } - explicit receiver_t(OpT& op_state) + explicit receiver_2_t(OpT& op_state) : op_state_(op_state) { } }; - template + template + class receiver_1_t { + using Receiver = typename OpT::Receiver; + + OpT& op_state_; + + public: + using receiver_concept = stdexec::receiver_t; + + template _Tag, class... _Args> + friend void tag_invoke(_Tag __tag, receiver_1_t&& __self, _Args&&... __args) noexcept { + OpT& op_state = __self.op_state_; + __tag(std::move(op_state.rcvr_), (_Args&&) __args...); + } + + friend void tag_invoke(ex::set_value_t, receiver_1_t&& __self) noexcept { + using inner_op_state_t = typename OpT::inner_op_state_t; + + OpT& op_state = __self.op_state_; + + if (op_state.n_) { + auto sch = stdexec::get_scheduler(stdexec::get_env(op_state.rcvr_)); + inner_op_state_t& inner_op_state = op_state.inner_op_state_.emplace( + stdexec::__conv{[&]() noexcept { + return ex::connect(ex::schedule(sch) | op_state.closure_, receiver_2_t{op_state}); + }}); + + ex::start(inner_op_state); + } else { + stdexec::set_value(std::move(op_state.rcvr_)); + } + } + + friend auto tag_invoke(ex::get_env_t, const receiver_1_t& self) noexcept + -> stdexec::env_of_t { + return stdexec::get_env(self.op_state_.rcvr_); + } + + explicit receiver_1_t(OpT& op_state) + : op_state_(op_state) { + } + }; + + template struct operation_state_t { - using Sender = stdexec::__t; + using PredSender = stdexec::__t; using Receiver = stdexec::__t; + using Scheduler = + stdexec::tag_invoke_result_t>; + using InnerSender = + std::invoke_result_t>; - using inner_op_state_t = stdexec::connect_result_t>; + using predecessor_op_state_t = + ex::connect_result_t>; + using inner_op_state_t = ex::connect_result_t>; - inner_op_state_t op_state_; + PredSender pred_sender_; Closure closure_; Receiver rcvr_; + std::optional pred_op_state_; + std::optional inner_op_state_; std::size_t n_{}; + std::size_t i_{}; - friend void tag_invoke(stdexec::start_t, operation_state_t& self) noexcept { - stdexec::start(self.op_state_); + friend void tag_invoke(stdexec::start_t, operation_state_t& op) noexcept { + if (op.n_) { + stdexec::start(*op.pred_op_state_); + } else { + stdexec::set_value(std::move(op.rcvr_)); + } } - operation_state_t(Sender&& sender, Closure closure, Receiver&& rcvr, std::size_t n) - : op_state_{stdexec::connect((Sender&&) sender, receiver_t{*this})} - , closure_{closure} - , 
rcvr_{(Receiver&&) rcvr} + operation_state_t(PredSender&& pred_sender, Closure closure, Receiver&& rcvr, std::size_t n) + : pred_sender_{(PredSender&&) pred_sender} + , closure_(closure) + , rcvr_(rcvr) , n_(n) { + pred_op_state_.emplace(stdexec::__conv{[&]() noexcept { + return ex::connect((PredSender&&) pred_sender_, receiver_1_t{*this}); + }}); } }; @@ -252,7 +324,7 @@ namespace repeat_n_detail { using __t = repeat_n_sender_t; using __id = repeat_n_sender_t; using Sender = stdexec::__t; - using is_sender = void; + using sender_concept = stdexec::sender_t; using completion_signatures = // stdexec::completion_signatures< @@ -330,8 +402,7 @@ inline constexpr repeat_n_t repeat_n{}; template [[nodiscard]] bool is_gpu_scheduler(SchedulerT&& scheduler) { - auto snd = ex::just() - | exec::on(scheduler, ex::then([] { return nvexec::is_on_gpu(); })); + auto snd = ex::just() | exec::on(scheduler, ex::then([] { return nvexec::is_on_gpu(); })); auto [on_gpu] = stdexec::sync_wait(std::move(snd)).value(); return on_gpu; } @@ -363,9 +434,7 @@ void run_snr( time_storage_t time{is_gpu_scheduler(computer)}; fields_accessor accessor = grid.accessor(); - auto init = - ex::just() - | exec::on(computer, ex::bulk(grid.cells, grid_initializer(dt, accessor))); + auto init = ex::just() | exec::on(computer, ex::bulk(grid.cells, grid_initializer(dt, accessor))); stdexec::sync_wait(init); auto snd = maxwell_eqs_snr(dt, time.get(), write_vtk, n_iterations, accessor, computer); diff --git a/examples/retry.cpp b/examples/retry.cpp index c6fee2bec..413858236 100644 --- a/examples/retry.cpp +++ b/examples/retry.cpp @@ -23,7 +23,7 @@ /////////////////////////////////////////////////////////////////////////////// // Example code: struct fail_some { - using is_sender = void; + using sender_concept = stdexec::sender_t; using completion_signatures = stdexec:: completion_signatures< stdexec::set_value_t(int), stdexec::set_error_t(std::exception_ptr)>; diff --git a/examples/server_theme/on_transfer.cpp b/examples/server_theme/on_transfer.cpp index 9cb5fbc66..cf0ea3fe2 100644 --- a/examples/server_theme/on_transfer.cpp +++ b/examples/server_theme/on_transfer.cpp @@ -107,7 +107,7 @@ int main() { // do the processing on the worker threads pool | ex::transfer(work_sched) // process the incoming data (on worker threads) - | ex::then([buf](int read_len) { process_read_data(buf, read_len); }) + | ex::then([buf](size_t read_len) { process_read_data(buf, read_len); }) // done ; diff --git a/include/exec/__detail/__atomic_intrusive_queue.hpp b/include/exec/__detail/__atomic_intrusive_queue.hpp index fbd509ce3..8b8beee19 100644 --- a/include/exec/__detail/__atomic_intrusive_queue.hpp +++ b/include/exec/__detail/__atomic_intrusive_queue.hpp @@ -22,29 +22,65 @@ namespace exec { template class __atomic_intrusive_queue; - template - class __atomic_intrusive_queue<_NextPtr> { + template + class alignas(64) __atomic_intrusive_queue<_NextPtr> { public: - using __node_pointer = _Tp*; - using __atomic_node_pointer = std::atomic<_Tp*>; + using __node_pointer = _Tp *; + using __atomic_node_pointer = std::atomic<_Tp *>; [[nodiscard]] bool empty() const noexcept { return __head_.load(std::memory_order_relaxed) == nullptr; } - void push_front(__node_pointer t) noexcept { + struct try_push_result { + bool __success; + bool __was_empty; + }; + + try_push_result try_push_front(__node_pointer t) noexcept { + __node_pointer __old_head = __head_.load(std::memory_order_relaxed); + t->*_NextPtr = __old_head; + return { + 
__head_.compare_exchange_strong(__old_head, t, std::memory_order_acq_rel), + __old_head == nullptr}; + } + + bool push_front(__node_pointer t) noexcept { __node_pointer __old_head = __head_.load(std::memory_order_relaxed); do { t->*_NextPtr = __old_head; } while (!__head_.compare_exchange_weak(__old_head, t, std::memory_order_acq_rel)); + return __old_head == nullptr; + } + + void prepend(stdexec::__intrusive_queue<_NextPtr> queue) noexcept { + __node_pointer __new_head = queue.front(); + __node_pointer __tail = queue.back(); + __node_pointer __old_head = __head_.load(std::memory_order_relaxed); + __tail->*_NextPtr = __old_head; + while (!__head_.compare_exchange_weak(__old_head, __new_head, std::memory_order_acq_rel)) { + __tail->*_NextPtr = __old_head; + } + queue.clear(); } stdexec::__intrusive_queue<_NextPtr> pop_all() noexcept { - return stdexec::__intrusive_queue<_NextPtr>::make_reversed( - __head_.exchange(nullptr, std::memory_order_acq_rel)); + return stdexec::__intrusive_queue<_NextPtr>::make(reset_head()); + } + + stdexec::__intrusive_queue<_NextPtr> pop_all_reversed() noexcept { + return stdexec::__intrusive_queue<_NextPtr>::make_reversed(reset_head()); } private: + __node_pointer reset_head() noexcept { + __node_pointer __old_head = __head_.load(std::memory_order_relaxed); + while (!__head_.compare_exchange_weak(__old_head, nullptr, std::memory_order_acq_rel)) { + ; + } + return __old_head; + } + __atomic_node_pointer __head_{nullptr}; }; } \ No newline at end of file diff --git a/include/exec/__detail/__basic_sequence.hpp b/include/exec/__detail/__basic_sequence.hpp index bff1ca28a..d4c5605b1 100644 --- a/include/exec/__detail/__basic_sequence.hpp +++ b/include/exec/__detail/__basic_sequence.hpp @@ -31,7 +31,7 @@ namespace exec { template struct __seqexpr<_ImplFn> { - using is_sender = sequence_tag; + using sender_concept = sequence_sender_t; using __t = __seqexpr; using __id = __seqexpr; using __tag_t = stdexec::__call_result_t<_ImplFn, stdexec::__cp, stdexec::__detail::__get_tag>; @@ -42,8 +42,8 @@ namespace exec { mutable _ImplFn __impl_; - STDEXEC_DETAIL_CUDACC_HOST_DEVICE // - explicit __seqexpr(_ImplFn __impl) + STDEXEC_ATTRIBUTE((host, device)) + explicit __seqexpr(_ImplFn __impl) : __impl_((_ImplFn&&) __impl) { } @@ -106,12 +106,12 @@ namespace exec { }; template - STDEXEC_DETAIL_CUDACC_HOST_DEVICE // - __seqexpr(_ImplFn) -> __seqexpr<_ImplFn>; + STDEXEC_ATTRIBUTE((host, device)) + __seqexpr(_ImplFn) -> __seqexpr<_ImplFn>; #if STDEXEC_NVHPC() || (STDEXEC_GCC() && __GNUC__ < 13) namespace __detail { - template > + template struct make_sequence_expr_t { template constexpr auto operator()(_Data __data = {}, _Children... __children) const { @@ -122,7 +122,7 @@ namespace exec { } #else namespace __detail { - template > + template struct make_sequence_expr_t { template constexpr auto operator()(_Data __data = {}, _Children... 
__children) const {
@@ -133,7 +133,7 @@
   }
 #endif
 
-  template >
+  template
   inline constexpr __detail::make_sequence_expr_t<_Tag, _Domain> make_sequence_expr{};
 
   template
diff --git a/include/exec/__detail/__bwos_lifo_queue.hpp b/include/exec/__detail/__bwos_lifo_queue.hpp
new file mode 100644
index 000000000..f04bfc950
--- /dev/null
+++ b/include/exec/__detail/__bwos_lifo_queue.hpp
@@ -0,0 +1,494 @@
+/*
+ * Copyright (c) 2023 Maikel Nadolski
+ * Copyright (c) 2023 NVIDIA Corporation
+ *
+ * Licensed under the Apache License Version 2.0 with LLVM Exceptions
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *   https://llvm.org/LICENSE.txt
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "../../stdexec/__detail/__config.hpp"
+
+#include <algorithm>
+#include <atomic>
+#include <bit>
+#include <cstdint>
+#include <memory>
+#include <utility>
+#include <vector>
+
+// The below code for spin_loop_pause is taken from https://github.com/max0x7ba/atomic_queue/blob/master/include/atomic_queue/defs.h
+// Copyright (c) 2019 Maxim Egorushkin. MIT License.
+
+#if defined(__x86_64__) || defined(_M_X64) || defined(__i386__) || defined(_M_IX86)
+#if STDEXEC_MSVC()
+#include <intrin.h>
+#endif
+namespace exec::bwos {
+  static inline void spin_loop_pause() noexcept {
+#if STDEXEC_MSVC()
+    _mm_pause();
+#else
+    __builtin_ia32_pause();
+#endif
+  }
+}
+#elif defined(__arm__) || defined(__aarch64__) || defined(_M_ARM64)
+namespace exec::bwos {
+  static inline void spin_loop_pause() noexcept {
+#if ( \
+  defined(__ARM_ARCH_6K__) || defined(__ARM_ARCH_6Z__) || defined(__ARM_ARCH_6ZK__) \
+  || defined(__ARM_ARCH_6T2__) || defined(__ARM_ARCH_7__) || defined(__ARM_ARCH_7A__) \
+  || defined(__ARM_ARCH_7R__) || defined(__ARM_ARCH_7M__) || defined(__ARM_ARCH_7S__) \
+  || defined(__ARM_ARCH_8A__) || defined(__aarch64__))
+    asm volatile("yield" ::: "memory");
+#elif defined(_M_ARM64)
+    __yield();
+#else
+    asm volatile("nop" ::: "memory");
+#endif
+  }
+}
+#else
+namespace exec::bwos {
+  static inline void spin_loop_pause() noexcept {
+  }
+}
+#endif
+
+/**
+ * This is an implementation of the BWOS queue as described in
+ * BWoS: Formally Verified Block-based Work Stealing for Parallel Processing (Wang et al. 
2023)
+ */
+
+namespace exec::bwos {
+  inline constexpr std::size_t hardware_destructive_interference_size = 64;
+  inline constexpr std::size_t hardware_constructive_interference_size = 64;
+
+  enum class lifo_queue_error_code {
+    success,
+    done,
+    empty,
+    full,
+    conflict,
+  };
+
+  template <class Tp>
+  struct fetch_result {
+    lifo_queue_error_code status;
+    Tp value;
+  };
+
+  struct takeover_result {
+    std::size_t front;
+    std::size_t back;
+  };
+
+  template <class Tp, class Allocator = std::allocator<Tp>>
+  class lifo_queue {
+  public:
+    explicit lifo_queue(
+      std::size_t num_blocks,
+      std::size_t block_size,
+      Allocator allocator = Allocator());
+
+    Tp pop_back() noexcept;
+
+    Tp steal_front() noexcept;
+
+    bool push_back(Tp value) noexcept;
+
+    template <class Iterator, class Sentinel>
+    Iterator push_back(Iterator first, Sentinel last) noexcept;
+
+    std::size_t get_available_capacity() const noexcept;
+    std::size_t get_free_capacity() const noexcept;
+
+    std::size_t block_size() const noexcept;
+    std::size_t num_blocks() const noexcept;
+
+  private:
+    template <class T>
+    using allocator_of_t = typename std::allocator_traits<Allocator>::template rebind_alloc<T>;
+
+    struct block_type {
+      explicit block_type(std::size_t block_size, Allocator allocator = Allocator());
+
+      block_type(const block_type &);
+      block_type &operator=(const block_type &);
+
+      block_type(block_type &&) noexcept;
+      block_type &operator=(block_type &&) noexcept;
+
+      lifo_queue_error_code put(Tp value) noexcept;
+
+      template <class Iterator, class Sentinel>
+      Iterator bulk_put(Iterator first, Sentinel last) noexcept;
+
+      fetch_result<Tp> get() noexcept;
+
+      fetch_result<Tp> steal() noexcept;
+
+      takeover_result takeover() noexcept;
+      bool is_writable() const noexcept;
+
+      std::size_t free_capacity() const noexcept;
+
+      void grant() noexcept;
+
+      bool reclaim() noexcept;
+
+      bool is_stealable() const noexcept;
+
+      std::size_t block_size() const noexcept;
+
+      alignas(hardware_destructive_interference_size) std::atomic<std::uint64_t> head_{};
+      alignas(hardware_destructive_interference_size) std::atomic<std::uint64_t> tail_{};
+      alignas(hardware_destructive_interference_size) std::atomic<std::uint64_t> steal_head_{};
+      alignas(hardware_destructive_interference_size) std::atomic<std::uint64_t> steal_tail_{};
+      std::vector<Tp, Allocator> ring_buffer_;
+    };
+
+    bool advance_get_index() noexcept;
+    bool advance_steal_index(std::size_t expected_thief_counter) noexcept;
+    bool advance_put_index() noexcept;
+
+    alignas(hardware_destructive_interference_size) std::atomic<std::size_t> owner_block_{1};
+    alignas(hardware_destructive_interference_size) std::atomic<std::size_t> thief_block_{0};
+    std::vector<block_type, allocator_of_t<block_type>> blocks_{};
+    std::size_t mask_{};
+  };
+
+  /////////////////////////////////////////////////////////////////////////////
+  // Implementation of lifo_queue member methods
+
+  template <class Tp, class Allocator>
+  lifo_queue<Tp, Allocator>::lifo_queue(
+    std::size_t num_blocks,
+    std::size_t block_size,
+    Allocator allocator)
+    : blocks_(
+        std::max(static_cast<std::size_t>(2), std::bit_ceil(num_blocks)),
+        block_type(block_size, allocator),
+        allocator_of_t<block_type>(allocator))
+    , mask_(blocks_.size() - 1) {
+    blocks_[owner_block_].reclaim();
+  }
+
+  template <class Tp, class Allocator>
+  Tp lifo_queue<Tp, Allocator>::pop_back() noexcept {
+    do {
+      std::size_t owner_index = owner_block_.load(std::memory_order_relaxed) & mask_;
+      block_type &current_block = blocks_[owner_index];
+      auto [ec, value] = current_block.get();
+      if (ec == lifo_queue_error_code::success) {
+        return value;
+      }
+      if (ec == lifo_queue_error_code::done) {
+        return Tp{};
+      }
+    } while (advance_get_index());
+    return Tp{};
+  }
+
+  template <class Tp, class Allocator>
+  Tp lifo_queue<Tp, Allocator>::steal_front() noexcept {
+    std::size_t thief = 0;
+    do {
+      thief = thief_block_.load(std::memory_order_relaxed);
+      std::size_t thief_index = thief & mask_;
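+      // Keep fetching from this block until it is fully drained: `conflict`
+      // means another thief won the race on steal_tail_ and we simply retry,
+      // `success` returns the stolen value, and `empty` means the owner has
+      // not published more work in this block yet.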
block_type &block = blocks_[thief_index];
+      fetch_result<Tp> result = block.steal();
+      while (result.status != lifo_queue_error_code::done) {
+        if (result.status == lifo_queue_error_code::success) {
+          return result.value;
+        }
+        if (result.status == lifo_queue_error_code::empty) {
+          return Tp{};
+        }
+        result = block.steal();
+      }
+    } while (advance_steal_index(thief));
+    return Tp{};
+  }
+
+  template <class Tp, class Allocator>
+  bool lifo_queue<Tp, Allocator>::push_back(Tp value) noexcept {
+    do {
+      std::size_t owner_index = owner_block_.load(std::memory_order_relaxed) & mask_;
+      block_type &current_block = blocks_[owner_index];
+      auto ec = current_block.put(value);
+      if (ec == lifo_queue_error_code::success) {
+        return true;
+      }
+    } while (advance_put_index());
+    return false;
+  }
+
+  template <class Tp, class Allocator>
+  template <class Iterator, class Sentinel>
+  Iterator lifo_queue<Tp, Allocator>::push_back(Iterator first, Sentinel last) noexcept {
+    do {
+      std::size_t owner_index = owner_block_.load(std::memory_order_relaxed) & mask_;
+      block_type &current_block = blocks_[owner_index];
+      first = current_block.bulk_put(first, last);
+    } while (first != last && advance_put_index());
+    return first;
+  }
+
+  template <class Tp, class Allocator>
+  std::size_t lifo_queue<Tp, Allocator>::get_free_capacity() const noexcept {
+    std::size_t owner_counter = owner_block_.load(std::memory_order_relaxed);
+    std::size_t owner_index = owner_counter & mask_;
+    std::size_t local_capacity = blocks_[owner_index].free_capacity();
+    std::size_t thief_counter = thief_block_.load(std::memory_order_relaxed);
+    std::size_t diff = owner_counter - thief_counter;
+    std::size_t rest = blocks_.size() - diff - 1;
+    return local_capacity + rest * block_size();
+  }
+
+  template <class Tp, class Allocator>
+  std::size_t lifo_queue<Tp, Allocator>::get_available_capacity() const noexcept {
+    return num_blocks() * block_size();
+  }
+
+  template <class Tp, class Allocator>
+  std::size_t lifo_queue<Tp, Allocator>::block_size() const noexcept {
+    return blocks_[0].block_size();
+  }
+
+  template <class Tp, class Allocator>
+  std::size_t lifo_queue<Tp, Allocator>::num_blocks() const noexcept {
+    return blocks_.size();
+  }
+
+  template <class Tp, class Allocator>
+  bool lifo_queue<Tp, Allocator>::advance_get_index() noexcept {
+    std::size_t owner_counter = owner_block_.load(std::memory_order_relaxed);
+    std::size_t predecessor = owner_counter - 1ul;
+    std::size_t predecessor_index = predecessor & mask_;
+    block_type &previous_block = blocks_[predecessor_index];
+    takeover_result result = previous_block.takeover();
+    if (result.front != result.back) {
+      std::size_t thief_counter = thief_block_.load(std::memory_order_relaxed);
+      if (thief_counter == predecessor) {
+        predecessor += blocks_.size();
+        thief_counter += blocks_.size() - 1ul;
+        thief_block_.store(thief_counter, std::memory_order_relaxed);
+      }
+      owner_block_.store(predecessor, std::memory_order_relaxed);
+      return true;
+    }
+    return false;
+  }
+
+  template <class Tp, class Allocator>
+  bool lifo_queue<Tp, Allocator>::advance_put_index() noexcept {
+    std::size_t owner_counter = owner_block_.load(std::memory_order_relaxed);
+    std::size_t next_counter = owner_counter + 1ul;
+    std::size_t thief_counter = thief_block_.load(std::memory_order_relaxed);
+    STDEXEC_ASSERT(thief_counter < next_counter);
+    if (next_counter - thief_counter >= blocks_.size()) {
+      return false;
+    }
+    std::size_t next_index = next_counter & mask_;
+    block_type &next_block = blocks_[next_index];
+    if (!next_block.is_writable()) [[unlikely]] {
+      return false;
+    }
+    std::size_t owner_index = owner_counter & mask_;
+    block_type &current_block = blocks_[owner_index];
+    current_block.grant();
+    owner_block_.store(next_counter, std::memory_order_relaxed);
+    next_block.reclaim();
+    return true;
+  }
+
+  template <class Tp, class Allocator>
+  bool lifo_queue<Tp, Allocator>::advance_steal_index(std::size_t expected_thief_counter) noexcept {
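+    // Move the shared thief index forward to the next block if that block is
+    // ready to be stolen from. A failed compare_exchange means another thief
+    // already advanced the index; the reload below then reports true so the
+    // caller retries with the fresh index instead of giving up.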
std::size_t thief_counter = expected_thief_counter;
+    std::size_t next_counter = thief_counter + 1;
+    std::size_t next_index = next_counter & mask_;
+    block_type &next_block = blocks_[next_index];
+    if (next_block.is_stealable()) {
+      thief_block_.compare_exchange_strong(thief_counter, next_counter, std::memory_order_relaxed);
+      return true;
+    }
+    return thief_block_.load(std::memory_order_relaxed) != thief_counter;
+  }
+
+  /////////////////////////////////////////////////////////////////////////////
+  // Implementation of lifo_queue::block_type member methods
+
+  template <class Tp, class Allocator>
+  lifo_queue<Tp, Allocator>::block_type::block_type(std::size_t block_size, Allocator allocator)
+    : head_{0}
+    , tail_{0}
+    , steal_head_{0}
+    , steal_tail_{block_size}
+    , ring_buffer_(block_size, allocator) {
+  }
+
+  template <class Tp, class Allocator>
+  lifo_queue<Tp, Allocator>::block_type::block_type(const block_type &other)
+    : ring_buffer_(other.ring_buffer_) {
+    head_.store(other.head_.load(std::memory_order_relaxed), std::memory_order_relaxed);
+    tail_.store(other.tail_.load(std::memory_order_relaxed), std::memory_order_relaxed);
+    steal_tail_.store(other.steal_tail_.load(std::memory_order_relaxed), std::memory_order_relaxed);
+    steal_head_.store(other.steal_head_.load(std::memory_order_relaxed), std::memory_order_relaxed);
+  }
+
+  template <class Tp, class Allocator>
+  typename lifo_queue<Tp, Allocator>::block_type &
+    lifo_queue<Tp, Allocator>::block_type::operator=(const block_type &other) {
+    head_.store(other.head_.load(std::memory_order_relaxed), std::memory_order_relaxed);
+    tail_.store(other.tail_.load(std::memory_order_relaxed), std::memory_order_relaxed);
+    steal_tail_.store(other.steal_tail_.load(std::memory_order_relaxed), std::memory_order_relaxed);
+    steal_head_.store(other.steal_head_.load(std::memory_order_relaxed), std::memory_order_relaxed);
+    ring_buffer_ = other.ring_buffer_;
+    return *this;
+  }
+
+  template <class Tp, class Allocator>
+  lifo_queue<Tp, Allocator>::block_type::block_type(block_type &&other) noexcept {
+    head_.store(other.head_.load(std::memory_order_relaxed), std::memory_order_relaxed);
+    tail_.store(other.tail_.load(std::memory_order_relaxed), std::memory_order_relaxed);
+    steal_tail_.store(other.steal_tail_.load(std::memory_order_relaxed), std::memory_order_relaxed);
+    steal_head_.store(other.steal_head_.load(std::memory_order_relaxed), std::memory_order_relaxed);
+    ring_buffer_ = std::exchange(other.ring_buffer_, {});
+  }
+
+  template <class Tp, class Allocator>
+  typename lifo_queue<Tp, Allocator>::block_type &
+    lifo_queue<Tp, Allocator>::block_type::operator=(block_type &&other) noexcept {
+    head_.store(other.head_.load(std::memory_order_relaxed), std::memory_order_relaxed);
+    tail_.store(other.tail_.load(std::memory_order_relaxed), std::memory_order_relaxed);
+    steal_tail_.store(other.steal_tail_.load(std::memory_order_relaxed), std::memory_order_relaxed);
+    steal_head_.store(other.steal_head_.load(std::memory_order_relaxed), std::memory_order_relaxed);
+    ring_buffer_ = std::exchange(other.ring_buffer_, {});
+    return *this;
+  }
+
+  template <class Tp, class Allocator>
+  lifo_queue_error_code lifo_queue<Tp, Allocator>::block_type::put(Tp value) noexcept {
+    std::uint64_t back = tail_.load(std::memory_order_relaxed);
+    if (back < block_size()) [[likely]] {
+      ring_buffer_[back] = static_cast<Tp>(value);
+      tail_.store(back + 1, std::memory_order_release);
+      return lifo_queue_error_code::success;
+    }
+    return lifo_queue_error_code::full;
+  }
+
+  template <class Tp, class Allocator>
+  template <class Iterator, class Sentinel>
+  Iterator lifo_queue<Tp, Allocator>::block_type::bulk_put(Iterator first, Sentinel last) noexcept {
+    std::uint64_t back = tail_.load(std::memory_order_relaxed);
+    while (first != last && back < block_size()) {
+      ring_buffer_[back] = static_cast<Tp>(*first);
+      ++back;
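+      // The write cursor and the input iterator advance together; tail_ is
+      // published once, with release ordering, after the loop.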
+      ++first;
+    }
+    tail_.store(back, std::memory_order_release);
+    return first;
+  }
+
+  template <class Tp, class Allocator>
+  fetch_result<Tp> lifo_queue<Tp, Allocator>::block_type::get() noexcept {
+    std::uint64_t front = head_.load(std::memory_order_relaxed);
+    if (front == block_size()) [[unlikely]] {
+      return {lifo_queue_error_code::done, nullptr};
+    }
+    std::uint64_t back = tail_.load(std::memory_order_relaxed);
+    if (front == back) [[unlikely]] {
+      return {lifo_queue_error_code::empty, nullptr};
+    }
+    Tp value = static_cast<Tp>(ring_buffer_[back - 1]);
+    tail_.store(back - 1, std::memory_order_release);
+    return {lifo_queue_error_code::success, value};
+  }
+
+  template <class Tp, class Allocator>
+  fetch_result<Tp> lifo_queue<Tp, Allocator>::block_type::steal() noexcept {
+    std::uint64_t spos = steal_tail_.load(std::memory_order_relaxed);
+    fetch_result<Tp> result{};
+    if (spos == block_size()) [[unlikely]] {
+      result.status = lifo_queue_error_code::done;
+      return result;
+    }
+    std::uint64_t back = tail_.load(std::memory_order_acquire);
+    if (spos == back) [[unlikely]] {
+      result.status = lifo_queue_error_code::empty;
+      return result;
+    }
+    if (!steal_tail_.compare_exchange_strong(spos, spos + 1, std::memory_order_relaxed)) {
+      result.status = lifo_queue_error_code::conflict;
+      return result;
+    }
+    result.value = static_cast<Tp>(ring_buffer_[spos]);
+    steal_head_.fetch_add(1, std::memory_order_release);
+    result.status = lifo_queue_error_code::success;
+    return result;
+  }
+
+  template <class Tp, class Allocator>
+  takeover_result lifo_queue<Tp, Allocator>::block_type::takeover() noexcept {
+    std::uint64_t spos = steal_tail_.exchange(block_size(), std::memory_order_relaxed);
+    if (spos == block_size()) [[unlikely]] {
+      return {head_.load(std::memory_order_relaxed), tail_.load(std::memory_order_relaxed)};
+    }
+    head_.store(spos, std::memory_order_relaxed);
+    return {spos, tail_.load(std::memory_order_relaxed)};
+  }
+
+  template <class Tp, class Allocator>
+  bool lifo_queue<Tp, Allocator>::block_type::is_writable() const noexcept {
+    std::uint64_t expected_steal = block_size();
+    std::uint64_t spos = steal_tail_.load(std::memory_order_relaxed);
+    return spos == expected_steal;
+  }
+
+  template <class Tp, class Allocator>
+  std::size_t lifo_queue<Tp, Allocator>::block_type::free_capacity() const noexcept {
+    std::uint64_t back = tail_.load(std::memory_order_relaxed);
+    return block_size() - back;
+  }
+
+  template <class Tp, class Allocator>
+  bool lifo_queue<Tp, Allocator>::block_type::reclaim() noexcept {
+    std::uint64_t expected_steal_head_ = tail_.load(std::memory_order_relaxed);
+    while (steal_head_.load(std::memory_order_acquire) != expected_steal_head_) {
+      spin_loop_pause();
+    }
+    head_.store(0, std::memory_order_relaxed);
+    tail_.store(0, std::memory_order_relaxed);
+    steal_tail_.store(block_size(), std::memory_order_relaxed);
+    steal_head_.store(0, std::memory_order_relaxed);
+    return false;
+  }
+
+  template <class Tp, class Allocator>
+  std::size_t lifo_queue<Tp, Allocator>::block_type::block_size() const noexcept {
+    return ring_buffer_.size();
+  }
+
+  template <class Tp, class Allocator>
+  void lifo_queue<Tp, Allocator>::block_type::grant() noexcept {
+    std::uint64_t old_head = head_.exchange(block_size(), std::memory_order_relaxed);
+    steal_tail_.store(old_head, std::memory_order_release);
+  }
+
+  template <class Tp, class Allocator>
+  bool lifo_queue<Tp, Allocator>::block_type::is_stealable() const noexcept {
+    return steal_tail_.load(std::memory_order_acquire) != block_size();
+  }
+}
\ No newline at end of file
diff --git a/include/exec/__detail/__manual_lifetime.hpp b/include/exec/__detail/__manual_lifetime.hpp
index 69723e3a8..509cf9784 100644
--- a/include/exec/__detail/__manual_lifetime.hpp
+++ b/include/exec/__detail/__manual_lifetime.hpp
@@ -46,7 +46,7 @@ namespace exec {
       return *::new (static_cast<void*>(std::addressof(__value_))) _Ty(((_Func&&) func)());
     }
 
-    void 
__destruct() noexcept { + void __destroy() noexcept { __value_.~_Ty(); } diff --git a/include/exec/__detail/__numa.hpp b/include/exec/__detail/__numa.hpp new file mode 100644 index 000000000..7c88bc1f2 --- /dev/null +++ b/include/exec/__detail/__numa.hpp @@ -0,0 +1,254 @@ +/* + * Copyright (c) 2021-2022 NVIDIA Corporation + * Copyright (c) 2023 Maikel Nadolski + * + * Licensed under the Apache License Version 2.0 with LLVM Exceptions + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * https://llvm.org/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "../../stdexec/__detail/__config.hpp" +#include "../scope.hpp" + +#include +#include +#include + +namespace exec { + struct numa_policy { + virtual std::size_t num_nodes() = 0; + virtual std::size_t num_cpus(int node) = 0; + virtual int bind_to_node(int node) = 0; + virtual std::size_t thread_index_to_node(std::size_t index) = 0; + }; + + class no_numa_policy : public numa_policy { + public: + no_numa_policy() noexcept = default; + std::size_t num_nodes() override { return 1; } + std::size_t num_cpus(int node) override { return std::thread::hardware_concurrency(); } + int bind_to_node(int node) override { return 0; } + std::size_t thread_index_to_node(std::size_t index) override { return 0; } + }; +} + +#if STDEXEC_ENABLE_NUMA +#include +namespace exec { + struct default_numa_policy : numa_policy { + default_numa_policy() : node_to_thread_index_(::numa_num_task_nodes()) { + std::size_t total_cpus = 0; + std::size_t n_nodes = num_nodes(); + for (std::size_t node = 0; node < n_nodes; ++node) { + total_cpus += this->num_cpus(node); + node_to_thread_index_[node] = total_cpus; + } + } + + std::size_t num_nodes() override { return node_to_thread_index_.size(); } + + std::size_t num_cpus(int node) override { + struct ::bitmask* cpus = ::numa_allocate_cpumask(); + if (!cpus) { + return 0; + } + scope_guard sg{[&]() noexcept { ::numa_free_cpumask(cpus); }}; + int rc = ::numa_node_to_cpus(node, cpus); + if (rc < 0) { + return 0; + } + std::size_t num_cpus = ::numa_bitmask_weight(cpus); + return num_cpus; + } + + int bind_to_node(int node) override { + struct ::bitmask* nodes = ::numa_allocate_nodemask(); + if (!nodes) { + return -1; + } + scope_guard sg{[&]() noexcept { ::numa_free_nodemask(nodes); }}; + ::numa_bitmask_setbit(nodes, node); + ::numa_bind(nodes); + return 0; + } + + std::size_t thread_index_to_node(std::size_t index) override { + index %= node_to_thread_index_.back(); + auto it = std::upper_bound(node_to_thread_index_.begin(), node_to_thread_index_.end(), index); + STDEXEC_ASSERT(it != node_to_thread_index_.end()); + return std::distance(node_to_thread_index_.begin(), it); + } + + std::vector node_to_thread_index_{}; + }; + + inline numa_policy* get_numa_policy() noexcept { + thread_local default_numa_policy g_default_numa_policy{}; + thread_local no_numa_policy g_no_numa_policy{}; + if (::numa_available() < 0) { + return &g_no_numa_policy; + } + return &g_default_numa_policy; + } + + template + struct numa_allocator { + using pointer = T*; + using const_pointer = const T*; + using value_type = T; + + explicit numa_allocator(int node) 
noexcept : node_(node) {} + + template + explicit numa_allocator(const numa_allocator& other) noexcept : node_(other.node_) {} + + int node_; + + void* do_allocate(std::size_t n) { + return ::numa_alloc_onnode(n, node_); + } + + void do_deallocate(void* p, std::size_t n) { + ::numa_free(p, n); + } + + T* allocate(std::size_t n) { + return static_cast(do_allocate(n * sizeof(T))); + } + + void deallocate(T* p, std::size_t n) { + do_deallocate(p, n * sizeof(T)); + } + + friend bool operator==(const numa_allocator&, const numa_allocator&) noexcept = default; + }; + + class nodemask { + static nodemask make_any() noexcept { + nodemask mask; + ::copy_bitmask_to_nodemask(::numa_all_nodes_ptr, &mask.mask_); + return mask; + } + + + public: + nodemask() noexcept + : mask_{} + { + ::copy_bitmask_to_nodemask(::numa_no_nodes_ptr, &mask_); + } + + static const nodemask& any() noexcept { + static nodemask mask = make_any(); + return mask; + } + + bool operator[](std::size_t nodemask) const noexcept { + ::bitmask mask; + mask.maskp = const_cast(mask_.n); + mask.size = sizeof(nodemask_t); + return ::numa_bitmask_isbitset(&mask, nodemask); + } + + void set(std::size_t nodemask) noexcept { + ::bitmask mask; + mask.maskp = const_cast(mask_.n); + mask.size = sizeof(nodemask_t); + ::numa_bitmask_setbit(&mask, nodemask); + } + + bool get(std::size_t nodemask) const noexcept { + ::bitmask mask; + mask.maskp = const_cast(mask_.n); + mask.size = sizeof(nodemask_t); + return ::numa_bitmask_isbitset(&mask, nodemask); + } + + friend bool operator==(const nodemask& lhs, const nodemask& rhs) noexcept { + ::bitmask lhs_mask; + ::bitmask rhs_mask; + lhs_mask.maskp = const_cast(lhs.mask_.n); + lhs_mask.size = sizeof(nodemask_t); + rhs_mask.maskp = const_cast(rhs.mask_.n); + rhs_mask.size = sizeof(nodemask_t); + return ::numa_bitmask_equal(&lhs_mask, &rhs_mask); + } + + private: + ::nodemask_t mask_; + }; +} +#else +namespace exec { + using default_numa_policy = no_numa_policy; + + inline numa_policy* get_numa_policy() noexcept { + thread_local default_numa_policy g_default_numa_policy{}; + return &g_default_numa_policy; + } + + template + struct numa_allocator { + using pointer = T*; + using const_pointer = const T*; + using value_type = T; + + explicit numa_allocator(int) noexcept {} + + template + explicit numa_allocator(const numa_allocator&) noexcept {} + + T* allocate(std::size_t n) { + std::allocator alloc{}; + return alloc.allocate(n); + } + + void deallocate(T* p, std::size_t n) { + std::allocator alloc{}; + alloc.deallocate(p, n); + } + + friend bool operator==(const numa_allocator&, const numa_allocator&) noexcept = default; + }; + + class nodemask { + static nodemask make_any() noexcept { + nodemask mask; + mask.mask_ = true; + return mask; + } + + public: + nodemask() noexcept = default; + + static const nodemask& any() noexcept { + static nodemask mask = make_any(); + return mask; + } + + bool operator[](std::size_t nodemask) const noexcept { + return mask_ && nodemask == 0; + } + + void set(std::size_t nodemask) noexcept { + mask_ |= nodemask == 0; + } + + friend bool operator==(const nodemask& lhs, const nodemask& rhs) noexcept { + return lhs.mask_ == rhs.mask_; + } + + private: + bool mask_{false}; + }; +} +#endif \ No newline at end of file diff --git a/include/exec/__detail/__sender_facade.hpp b/include/exec/__detail/__sender_facade.hpp index 88edbf8fd..dbacdd39c 100644 --- a/include/exec/__detail/__sender_facade.hpp +++ b/include/exec/__detail/__sender_facade.hpp @@ -17,11 +17,9 @@ #include 
"../../stdexec/execution.hpp" -#ifdef __EDG__ -#pragma diagnostic push -#pragma diag_suppress 1302 -#pragma diag_suppress 497 -#endif +STDEXEC_PRAGMA_PUSH() +STDEXEC_PRAGMA_IGNORE_EDG(1302) +STDEXEC_PRAGMA_IGNORE_EDG(497) namespace exec { struct _FAILURE_TO_CONNECT_ { @@ -29,7 +27,7 @@ namespace exec { struct _WHAT_ { struct __t { using __id = _WHAT_; - using is_sender = void; + using sender_concept = stdexec::sender_t; using completion_signatures = _WHAT_; }; }; @@ -93,13 +91,6 @@ namespace exec { __declval<__data_placeholder&>(), __declval<__receiver_placeholder<_Env>&>())); - struct __dependent_sender { - using is_sender = void; - using __t = __dependent_sender; - friend auto tag_invoke(get_completion_signatures_t, __dependent_sender, no_env) - -> dependent_completion_signatures; - }; - struct __sender_transform_failed { using __t = __sender_transform_failed; }; @@ -111,8 +102,6 @@ namespace exec { } else { if constexpr (__mvalid<__tfx_sender_, _Kernel, _Sender, _Env>) { return __mtype<__tfx_sender_<_Kernel, _Sender, _Env>>{}; - } else if constexpr (same_as<_Env, no_env>) { - return __dependent_sender{}; } else { return __sender_transform_failed{}; } @@ -122,7 +111,6 @@ namespace exec { template auto __transform_sender(_Kernel& __kernel, _Sender&& __sndr, _Data& __data, _Receiver& __rcvr) { - static_assert(!same_as, no_env>); if constexpr (__lacks_transform_sender<_Kernel>) { return (_Sender&&) __sndr; } else { @@ -143,11 +131,7 @@ namespace exec { __declval<_As>()...)); template - using __get_env_ = decltype(__declval<_Kernel&>().get_env(__declval<_Env>())); - - template - using __env_t = - __minvoke< __if_c, __mconst, __q<__get_env_>>, _Kernel, _Env>; + using __env_t = decltype(__declval<_Kernel&>().get_env(__declval<_Env>())); template auto __completions_from_sig(_Tag (*)(_As...)) @@ -163,7 +147,7 @@ namespace exec { template auto __compute_completions_(completion_signatures<_Sigs...>*) -> decltype(__stl::__all_completions( - (__completions_from_sig_t<_Kernel, _Env, _Sigs>) nullptr...)); + static_cast<__completions_from_sig_t<_Kernel, _Env, _Sigs>>(nullptr)...)); template auto __compute_completions_(_NoCompletions*) -> _NoCompletions; @@ -185,12 +169,12 @@ namespace exec { } _Receiver __rcvr_; - STDEXEC_NO_UNIQUE_ADDRESS _Kernel __kernel_; - STDEXEC_NO_UNIQUE_ADDRESS _Data __data_; + STDEXEC_ATTRIBUTE((no_unique_address)) _Kernel __kernel_; + STDEXEC_ATTRIBUTE((no_unique_address)) _Data __data_; }; struct __t { - using is_receiver = void; + using receiver_concept = stdexec::receiver_t; using __id = __receiver; __state* __state_; @@ -268,7 +252,7 @@ namespace exec { struct __t { using __id = _DerivedId; - using is_sender = void; + using sender_concept = stdexec::sender_t; _Sender __sndr_; _Kernel __kernel_; @@ -314,19 +298,13 @@ namespace exec { __completions_t<_NewEnv, __pre_completions_t<_NewSender, _NewEnv>>; if constexpr (__valid_completion_signatures<_Completions, _Env>) { return (_Completions(*)()) nullptr; - } else if constexpr (same_as) { - return (dependent_completion_signatures(*)()) nullptr; } else { // assume this is an error message and return it directly return (_Completions(*)()) nullptr; } - } else if constexpr (same_as) { - return (dependent_completion_signatures(*)()) nullptr; } else { return (__diagnostic_t<_Env>(*)()) nullptr; } - } else if constexpr (same_as) { - return (dependent_completion_signatures(*)()) nullptr; } else if constexpr (same_as<_NewSender, __sender_transform_failed>) { return (__diagnostic_t<_Env>(*)()) nullptr; } else { @@ -399,6 +377,4 @@ 
namespace exec { }; } // namespace exec -#ifdef __EDG__ -#pragma diagnostic pop -#endif +STDEXEC_PRAGMA_POP() diff --git a/include/exec/__detail/__xorshift.hpp b/include/exec/__detail/__xorshift.hpp new file mode 100644 index 000000000..0c3b5acb8 --- /dev/null +++ b/include/exec/__detail/__xorshift.hpp @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2023 Maikel Nadolski + * Copyright (c) 2023 NVIDIA Corporation + * + * Licensed under the Apache License Version 2.0 with LLVM Exceptions + * (the "License"); you may not use this file except in compliance with + * the License. You may obtain a copy of the License at + * + * https://llvm.org/LICENSE.txt + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +/* I have taken and modified this code from https://gist.github.com/Leandros/6dc334c22db135b033b57e9ee0311553 */ +/* Copyright (c) 2018 Arvid Gerstmann. */ +/* This code is licensed under MIT license. */ + +#include +#include + +namespace exec { + + class xorshift { + public: + using result_type = std::uint32_t; + + static constexpr result_type(min)() { + return 0; + } + + static constexpr result_type(max)() { + return UINT32_MAX; + } + + friend bool operator==(xorshift const &, xorshift const &) = default; + + xorshift() + : m_seed(0xc1f651c67c62c6e0ull) { + } + + explicit xorshift(std::random_device &rd) { + seed(rd); + } + + explicit xorshift(std::uint64_t seed) + : m_seed(seed) { + } + + void seed(std::random_device &rd) { + m_seed = std::uint64_t(rd()) << 31 | std::uint64_t(rd()); + } + + result_type operator()() { + std::uint64_t result = m_seed * 0xd989bcacc137dcd5ull; + m_seed ^= m_seed >> 11; + m_seed ^= m_seed << 31; + m_seed ^= m_seed >> 18; + return std::uint32_t(result >> 32ull); + } + + void discard(unsigned long long n) { + for (unsigned long long i = 0; i < n; ++i) + operator()(); + } + + private: + std::uint64_t m_seed; + }; + +} // namespace exec \ No newline at end of file diff --git a/include/exec/any_sender_of.hpp b/include/exec/any_sender_of.hpp index 221ec42e7..0e7c82cf8 100644 --- a/include/exec/any_sender_of.hpp +++ b/include/exec/any_sender_of.hpp @@ -350,7 +350,7 @@ namespace exec { const __vtable_t* __vtable_{__default_storage_vtable((__vtable_t*) nullptr)}; void* __object_pointer_{nullptr}; alignas(__alignment) std::byte __buffer_[__buffer_size]{}; - STDEXEC_NO_UNIQUE_ADDRESS _Allocator __allocator_{}; + STDEXEC_ATTRIBUTE((no_unique_address)) _Allocator __allocator_{}; }; }; @@ -541,7 +541,7 @@ namespace exec { const __vtable_t* __vtable_{__default_storage_vtable((__vtable_t*) nullptr)}; void* __object_pointer_{nullptr}; alignas(__alignment) std::byte __buffer_[__buffer_size]{}; - STDEXEC_NO_UNIQUE_ADDRESS _Allocator __allocator_{}; + STDEXEC_ATTRIBUTE((no_unique_address)) _Allocator __allocator_{}; }; struct __empty_vtable { @@ -631,7 +631,11 @@ namespace exec { template requires(__is_not_stop_token_query<_Queries> && ...) 
struct __ref, _Queries...> { +#if !STDEXEC_MSVC() + // MSVCBUG https://developercommunity.visualstudio.com/t/Private-member-inaccessible-when-used-in/10448363 + private: +#endif using __vtable_t = stdexec::__t<__vtable, _Queries...>>; struct __env_t { @@ -652,7 +656,7 @@ namespace exec { } } __env_; public: - using is_receiver = void; + using receiver_concept = stdexec::receiver_t; using __id = __ref; using __t = __ref; @@ -694,7 +698,11 @@ namespace exec { template requires(__is_stop_token_query<_Queries> || ...) struct __ref, _Queries...> { +#if !STDEXEC_MSVC() + // MSVCBUG https://developercommunity.visualstudio.com/t/Private-member-inaccessible-when-used-in/10448363 + private: +#endif using _FilteredQueries = __minvoke<__remove_if<__q<__is_never_stop_token_query>>, _Queries...>; using __vtable_t = stdexec::__t< @@ -713,7 +721,7 @@ namespace exec { } } __env_; public: - using is_receiver = void; + using receiver_concept = stdexec::receiver_t; using __id = __ref; using __t = __ref; @@ -771,7 +779,7 @@ namespace exec { template struct __operation_base { - STDEXEC_NO_UNIQUE_ADDRESS _Receiver __rcvr_; + STDEXEC_ATTRIBUTE((no_unique_address)) _Receiver __rcvr_; stdexec::in_place_stop_source __stop_source_{}; using __stop_callback = typename stdexec::stop_token_of_t< stdexec::env_of_t<_Receiver>>::template callback_type<__on_stop_t>; @@ -786,6 +794,7 @@ namespace exec { using _Receiver = stdexec::__t<_ReceiverId>; struct __t { + using receiver_concept = stdexec::receiver_t; __operation_base<_Receiver>* __op_; template _SetNext, same_as<__t> _Self, class _Item> @@ -821,7 +830,7 @@ namespace exec { friend __env_t> tag_invoke(_GetEnv, const _Self& __self) noexcept { return __make_env( get_env(__self.__op_->__rcvr_), - __with_(get_stop_token, __self.__op_->__stop_source_.get_token())); + __mkprop(__self.__op_->__stop_source_.get_token(), get_stop_token)); } }; }; @@ -873,7 +882,7 @@ namespace exec { } private: - STDEXEC_NO_UNIQUE_ADDRESS _Receiver __rec_; + STDEXEC_ATTRIBUTE((no_unique_address)) _Receiver __rec_; __immovable_operation_storage __storage_{}; friend void tag_invoke(start_t, __t& __self) noexcept { @@ -958,7 +967,7 @@ namespace exec { public: using __id = __sender; using completion_signatures = _Sigs; - using is_sender = void; + using sender_concept = stdexec::sender_t; __t(const __t&) = delete; __t& operator=(const __t&) = delete; @@ -1090,7 +1099,7 @@ namespace exec { } public: - using is_receiver = void; + using receiver_concept = stdexec::receiver_t; using __t = any_receiver_ref; using __id = any_receiver_ref; @@ -1115,7 +1124,7 @@ namespace exec { return stdexec::tag_invoke(_Tag{}, ((Self&&) __self).__sender_, (_As&&) __as...); } public: - using is_sender = void; + using sender_concept = stdexec::sender_t; using completion_signatures = typename __sender_base::completion_signatures; template _Sender> @@ -1146,9 +1155,19 @@ namespace exec { __ret_equals_to>>, decltype(_SenderQueries)...>; +#if STDEXEC_MSVC() + // MSVCBUG https://developercommunity.visualstudio.com/t/ICE-and-non-ICE-bug-in-NTTP-argument-w/10361081 + + static constexpr auto __any_scheduler_noexcept_signature = + stdexec::get_completion_scheduler.signature; + template + using __schedule_sender_fn = + typename __schedule_receiver::template any_sender< __any_scheduler_noexcept_signature>; +#else template using __schedule_sender_fn = typename __schedule_receiver::template any_sender< stdexec::get_completion_scheduler.template signature>; +#endif using __schedule_sender = stdexec::__mapply, schedule_sender_queries>; diff 
--git a/include/exec/async_scope.hpp b/include/exec/async_scope.hpp index 3ca4474fe..c846737be 100644 --- a/include/exec/async_scope.hpp +++ b/include/exec/async_scope.hpp @@ -55,7 +55,7 @@ namespace exec { template struct __when_empty_op_base : __task { using _Receiver = __t<_ReceiverId>; - STDEXEC_NO_UNIQUE_ADDRESS _Receiver __rcvr_; + STDEXEC_ATTRIBUTE((no_unique_address)) _Receiver __rcvr_; }; template @@ -95,7 +95,7 @@ namespace exec { template struct __when_empty_sender { using _Constrained = __t<_ConstrainedId>; - using is_sender = void; + using sender_concept = stdexec::sender_t; template using __when_empty_op_t = @@ -120,7 +120,7 @@ namespace exec { } const __impl* __scope_; - STDEXEC_NO_UNIQUE_ADDRESS _Constrained __c_; + STDEXEC_ATTRIBUTE((no_unique_address)) _Constrained __c_; }; template @@ -132,12 +132,12 @@ namespace exec { struct __nest_op_base : __immovable { using _Receiver = __t<_ReceiverId>; const __impl* __scope_; - STDEXEC_NO_UNIQUE_ADDRESS _Receiver __rcvr_; + STDEXEC_ATTRIBUTE((no_unique_address)) _Receiver __rcvr_; }; template struct __nest_rcvr { - using is_receiver = void; + using receiver_concept = stdexec::receiver_t; using _Receiver = __t<_ReceiverId>; __nest_op_base<_ReceiverId>* __op_; @@ -205,10 +205,10 @@ namespace exec { template struct __nest_sender { using _Constrained = __t<_ConstrainedId>; - using is_sender = void; + using sender_concept = stdexec::sender_t; const __impl* __scope_; - STDEXEC_NO_UNIQUE_ADDRESS _Constrained __c_; + STDEXEC_ATTRIBUTE((no_unique_address)) _Constrained __c_; template using __nest_operation_t = __nest_op<_ConstrainedId, __x<_Receiver>>; @@ -333,9 +333,9 @@ namespace exec { } } - STDEXEC_NO_UNIQUE_ADDRESS _Receiver __rcvr_; + STDEXEC_ATTRIBUTE((no_unique_address)) _Receiver __rcvr_; std::unique_ptr<__future_state<_Sender, _Env>> __state_; - STDEXEC_NO_UNIQUE_ADDRESS __forward_consumer __forward_consumer_; + STDEXEC_ATTRIBUTE((no_unique_address)) __forward_consumer __forward_consumer_; public: ~__future_op() noexcept { @@ -473,7 +473,7 @@ namespace exec { template struct __future_rcvr { - using is_receiver = void; + using receiver_concept = stdexec::receiver_t; using _Completions = __t<_CompletionsId>; using _Env = __t<_EnvId>; __future_state_base<_Completions, _Env>* __state_; @@ -543,7 +543,7 @@ namespace exec { using _Env = __t<_EnvId>; friend struct async_scope; public: - using is_sender = void; + using sender_concept = stdexec::sender_t; __future(__future&&) = default; __future& operator=(__future&&) = default; @@ -602,8 +602,8 @@ namespace exec { using __spawn_env_t = __result_of< __join_env, _Env, - __env::__prop, - __env::__prop>; + __env::__prop, + __env::__prop<__inln::__scheduler(get_scheduler_t)>>; template struct __spawn_op_base { @@ -614,7 +614,7 @@ namespace exec { template struct __spawn_rcvr { - using is_receiver = void; + using receiver_concept = stdexec::receiver_t; using _Env = __t<_EnvId>; __spawn_op_base<_EnvId>* __op_; const __impl* __scope_; @@ -647,8 +647,8 @@ namespace exec { template <__decays_to<_Sender> _Sndr> __spawn_op(_Sndr&& __sndr, _Env __env, const __impl* __scope) : __spawn_op_base<_EnvId>{__join_env((_Env&&) __env, - __mkprop(get_stop_token, __scope->__stop_source_.get_token()), - __mkprop(get_scheduler, __inln::__scheduler{})), + __mkprop(__scope->__stop_source_.get_token(), get_stop_token), + __mkprop(__inln::__scheduler{}, get_scheduler)), [](__spawn_op_base<_EnvId>* __op) { delete static_cast<__spawn_op*>(__op); }} diff --git a/include/exec/at_coroutine_exit.hpp 
b/include/exec/at_coroutine_exit.hpp index d37c2e47a..37c68143c 100644 --- a/include/exec/at_coroutine_exit.hpp +++ b/include/exec/at_coroutine_exit.hpp @@ -38,7 +38,7 @@ namespace exec { template struct __receiver_id { struct __t { - using is_receiver = void; + using receiver_concept = stdexec::receiver_t; using __id = __receiver_id; _Receiver __receiver_; @@ -71,7 +71,7 @@ namespace exec { struct __t { using __id = __sender_id; - using is_sender = void; + using sender_concept = stdexec::sender_t; _Sender __sender_; @@ -160,8 +160,7 @@ namespace exec { auto __coro = __p.__is_unhandled_stopped_ ? __p.continuation().unhandled_stopped() : __p.continuation().handle(); - __h.destroy(); - return __coro; + return STDEXEC_DESTROY_AND_CONTINUE(__h, __coro); } void await_resume() const noexcept { diff --git a/include/exec/create.hpp b/include/exec/create.hpp index 8a4d5d2c4..4bd0c71da 100644 --- a/include/exec/create.hpp +++ b/include/exec/create.hpp @@ -32,8 +32,8 @@ namespace exec { template struct __context { - STDEXEC_NO_UNIQUE_ADDRESS _Receiver receiver; - STDEXEC_NO_UNIQUE_ADDRESS _Args args; + STDEXEC_ATTRIBUTE((no_unique_address)) _Receiver receiver; + STDEXEC_ATTRIBUTE((no_unique_address)) _Args args; }; template @@ -42,9 +42,9 @@ namespace exec { using _Result = __call_result_t<_Fun, _Context&>; using _State = __if_c, __void, std::optional<_Result>>; - STDEXEC_NO_UNIQUE_ADDRESS _Context __ctx_; - STDEXEC_NO_UNIQUE_ADDRESS _Fun __fun_; - STDEXEC_NO_UNIQUE_ADDRESS _State __state_{}; + STDEXEC_ATTRIBUTE((no_unique_address)) _Context __ctx_; + STDEXEC_ATTRIBUTE((no_unique_address)) _Fun __fun_; + STDEXEC_ATTRIBUTE((no_unique_address)) _State __state_{}; friend void tag_invoke(start_t, __operation& __self) noexcept { __self.__state_.emplace(__conv{[&]() noexcept { @@ -56,7 +56,7 @@ namespace exec { template struct __sender { using _Args = __t<_ArgsId>; - using is_sender = void; + using sender_concept = stdexec::sender_t; using completion_signatures = stdexec::completion_signatures<_Sigs...>; _Fun __fun_; diff --git a/include/exec/env.hpp b/include/exec/env.hpp index 802126a90..fae218bfe 100644 --- a/include/exec/env.hpp +++ b/include/exec/env.hpp @@ -17,25 +17,23 @@ #include "../stdexec/execution.hpp" -#ifdef __EDG__ -#pragma diagnostic push -#pragma diag_suppress 1302 -#endif +STDEXEC_PRAGMA_PUSH() +STDEXEC_PRAGMA_IGNORE_EDG(1302) namespace exec { - template + template using with_t = stdexec::__with<_Tag, _Value>; namespace __detail { struct __with_t { template - with_t<_Tag, _Value> operator()(_Tag, _Value&& __val) const { - return stdexec::__with_(_Tag(), (_Value&&) __val); + with_t<_Tag, stdexec::__decay_t<_Value>> operator()(_Tag, _Value&& __val) const { + return stdexec::__mkprop((_Value&&) __val, _Tag()); } template with_t<_Tag> operator()(_Tag) const { - return stdexec::__with_(_Tag()); + return stdexec::__mkprop(_Tag()); } }; } // namespace __detail @@ -57,7 +55,7 @@ namespace exec { using _Default = __t<_DefaultId>; using _Receiver = __t<_ReceiverId>; - STDEXEC_NO_UNIQUE_ADDRESS _Default __default_; + STDEXEC_ATTRIBUTE((no_unique_address)) _Default __default_; _Receiver __rcvr_; friend void tag_invoke(start_t, __operation& __self) noexcept { @@ -77,8 +75,8 @@ namespace exec { template struct __sender { using _Default = __t<_DefaultId>; - using is_sender = void; - STDEXEC_NO_UNIQUE_ADDRESS _Default __default_; + using sender_concept = stdexec::sender_t; + STDEXEC_ATTRIBUTE((no_unique_address)) _Default __default_; template using __value_t = @@ -97,7 +95,7 @@ namespace exec { return 
{{}, ((_Self&&) __self).__default_, (_Receiver&&) __rcvr}; } - template <__none_of _Env> + template friend auto tag_invoke(get_completion_signatures_t, __sender, _Env&&) -> __completions_t<_Env> { return {}; @@ -118,6 +116,4 @@ namespace exec { inline constexpr stdexec::__write_::__write_t write{}; } // namespace exec -#ifdef __EDG__ -#pragma diagnostic pop -#endif +STDEXEC_PRAGMA_POP() diff --git a/include/exec/finally.hpp b/include/exec/finally.hpp index b925daa9a..858407eef 100644 --- a/include/exec/finally.hpp +++ b/include/exec/finally.hpp @@ -100,7 +100,7 @@ namespace exec { class __t { public: - using is_receiver = void; + using receiver_concept = stdexec::receiver_t; explicit __t(__final_operation_base<_ResultType, _ReceiverId>* __op) noexcept : __op_{__op} { @@ -113,14 +113,14 @@ namespace exec { friend void tag_invoke(_Tag, _Self&& __self) noexcept { if constexpr (std::is_nothrow_move_constructible_v<_ResultType>) { _ResultType __result = (_ResultType&&) __self.__op_->__result_; - __self.__op_->__result_.__destruct(); + __self.__op_->__result_.__destroy(); std::visit( __visitor<_Receiver>{(_Receiver&&) __self.__op_->__receiver_}, (_ResultType&&) __result); } else { try { _ResultType __result = (_ResultType&&) __self.__op_->__result_; - __self.__op_->__result_.__destruct(); + __self.__op_->__result_.__destroy(); std::visit( __visitor<_Receiver>{(_Receiver&&) __self.__op_->__receiver_}, (_ResultType&&) __result); @@ -133,7 +133,7 @@ namespace exec { template <__one_of _Tag, __decays_to<__t> _Self, class... _Error> requires __callable<_Tag, _Receiver&&, _Error...> friend void tag_invoke(_Tag __tag, _Self&& __self, _Error&&... __error) noexcept { - __self.__op_->__result_.__destruct(); + __self.__op_->__result_.__destroy(); __tag((_Receiver&&) __self.__op_->__receiver_, (_Error&&) __error...); } @@ -167,7 +167,7 @@ namespace exec { class __t { public: - using is_receiver = void; + using receiver_concept = stdexec::receiver_t; explicit __t(__base_op_t* __op) noexcept : __op_(__op) { @@ -263,7 +263,7 @@ namespace exec { (_Rec&&) __receiver}; } - template <__decays_to<__t> _Self, __none_of _Env> + template <__decays_to<__t> _Self, class _Env> friend auto tag_invoke(get_completion_signatures_t, _Self&&, _Env&&) noexcept -> __completion_signatures_t< __copy_cvref_t<_Self, _InitialSender>, @@ -273,7 +273,7 @@ namespace exec { } public: - using is_sender = void; + using sender_concept = stdexec::sender_t; template <__decays_to<_InitialSender> _Is, __decays_to<_FinalSender> _Fs> __t(_Is&& __initial_sender, _Fs&& __final_sender) noexcept( diff --git a/include/exec/inline_scheduler.hpp b/include/exec/inline_scheduler.hpp index f587d150e..7a8ff9015 100644 --- a/include/exec/inline_scheduler.hpp +++ b/include/exec/inline_scheduler.hpp @@ -22,52 +22,5 @@ namespace exec { // A simple scheduler that executes its continuation inline, on the // thread of the caller of start(). 
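The hunk below deletes the handwritten scheduler this comment describes and aliases the public name to stdexec's built-in inline scheduler, so observable behavior is unchanged: the continuation still runs on the thread that calls start(). A minimal usage sketch against the stdexec sync_wait/then API (illustrative only, not part of the patch):

```cpp
#include <exec/inline_scheduler.hpp>
#include <stdexec/execution.hpp>
#include <cassert>

int main() {
  exec::inline_scheduler sched;
  // The continuation runs inline, on this thread, inside sync_wait().
  auto [v] = stdexec::sync_wait(
               stdexec::schedule(sched) | stdexec::then([] { return 42; }))
               .value();
  assert(v == 42);
}
```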
- struct inline_scheduler { - template - struct __op { - using R = stdexec::__t; - STDEXEC_NO_UNIQUE_ADDRESS R rec_; - - friend void tag_invoke(stdexec::start_t, __op& op) noexcept { - stdexec::set_value((R&&) op.rec_); - } - }; - - struct __sender { - using is_sender = void; - using completion_signatures = stdexec::completion_signatures; - - template - friend auto tag_invoke(stdexec::connect_t, __sender, R&& rec) // - noexcept(stdexec::__nothrow_constructible_from, R>) - -> __op>> { - return {(R&&) rec}; - } - - struct __env { - friend inline_scheduler - tag_invoke(stdexec::get_completion_scheduler_t, const __env&) // - noexcept { - return {}; - } - }; - - friend __env tag_invoke(stdexec::get_env_t, const __sender&) noexcept { - return {}; - } - }; - - STDEXEC_DETAIL_CUDACC_HOST_DEVICE // - friend __sender - tag_invoke(stdexec::schedule_t, const inline_scheduler&) noexcept { - return {}; - } - - friend stdexec::forward_progress_guarantee - tag_invoke(stdexec::get_forward_progress_guarantee_t, const inline_scheduler&) noexcept { - return stdexec::forward_progress_guarantee::weakly_parallel; - } - - bool operator==(const inline_scheduler&) const noexcept = default; - }; + using inline_scheduler = stdexec::__inln::__scheduler; } diff --git a/include/exec/linux/io_uring_context.hpp b/include/exec/linux/io_uring_context.hpp index 5b7928fd5..5eb351403 100644 --- a/include/exec/linux/io_uring_context.hpp +++ b/include/exec/linux/io_uring_context.hpp @@ -423,7 +423,7 @@ namespace exec { 0 <= __n_total_submitted_ && __n_total_submitted_ <= static_cast(__params_.cq_entries)); __u32 __max_submissions = __params_.cq_entries - static_cast<__u32>(__n_total_submitted_); - __pending_.append(__requests_.pop_all()); + __pending_.append(__requests_.pop_all_reversed()); __submission_result __result = __submission_queue_.submit( (__task_queue&&) __pending_, __max_submissions, __stop_source_->stop_requested()); __n_total_submitted_ += __result.__n_submitted; @@ -433,7 +433,7 @@ namespace exec { while (!__result.__ready.empty()) { __n_total_submitted_ -= __completion_queue_.complete((__task_queue&&) __result.__ready); STDEXEC_ASSERT(0 <= __n_total_submitted_); - __pending_.append(__requests_.pop_all()); + __pending_.append(__requests_.pop_all_reversed()); __max_submissions = __params_.cq_entries - static_cast<__u32>(__n_total_submitted_); __result = __submission_queue_.submit( (__task_queue&&) __pending_, __max_submissions, __stop_source_->stop_requested()); @@ -466,7 +466,7 @@ namespace exec { scope_guard __not_running{[&]() noexcept { __is_running_.store(false, std::memory_order_relaxed); }}; - __pending_.append(__requests_.pop_all()); + __pending_.append(__requests_.pop_all_reversed()); while (__n_total_submitted_ > 0 || !__pending_.empty()) { run_some(); if ( @@ -481,12 +481,14 @@ namespace exec { && __n_total_submitted_ <= static_cast(__params_.cq_entries)); int rc = __io_uring_enter( __ring_fd_, __n_newly_submitted_, __min_complete, IORING_ENTER_GETEVENTS); - __throw_error_code_if(rc < 0, -rc); - STDEXEC_ASSERT(rc <= __n_newly_submitted_); - __n_newly_submitted_ -= rc; + __throw_error_code_if(rc < 0 && rc != -EINTR, -rc); + if (rc != -EINTR) { + STDEXEC_ASSERT(rc <= __n_newly_submitted_); + __n_newly_submitted_ -= rc; + } __n_total_submitted_ -= __completion_queue_.complete(); STDEXEC_ASSERT(0 <= __n_total_submitted_); - __pending_.append(__requests_.pop_all()); + __pending_.append(__requests_.pop_all_reversed()); } STDEXEC_ASSERT(__n_total_submitted_ <= 1); if (__stop_source_->stop_requested() && 
__pending_.empty()) { @@ -504,7 +506,7 @@ namespace exec { __n_submissions_in_flight_.load(std::memory_order_relaxed) == __no_new_submissions); // There could have been requests in flight. Complete all of them // and then stop it, finally. - __pending_.append(__requests_.pop_all()); + __pending_.append(__requests_.pop_all_reversed()); __submission_result __result = __submission_queue_.submit( (__task_queue&&) __pending_, __params_.cq_entries, true); STDEXEC_ASSERT(__result.__n_submitted == 0); @@ -558,7 +560,7 @@ namespace exec { class __run_sender { public: - using is_sender = void; + using sender_concept = stdexec::sender_t; using completion_signatures = stdexec::completion_signatures< stdexec::set_value_t(), stdexec::set_error_t(std::exception_ptr), @@ -698,7 +700,7 @@ namespace exec { struct __impl { __context& __context_; - STDEXEC_NO_UNIQUE_ADDRESS _Receiver __receiver_; + STDEXEC_ATTRIBUTE((no_unique_address)) _Receiver __receiver_; __impl(__context& __context, _Receiver&& __receiver) : __context_{__context} @@ -1055,7 +1057,7 @@ namespace exec { class __schedule_sender { __schedule_env __env_; public: - using is_sender = void; + using sender_concept = stdexec::sender_t; using __id = __schedule_sender; using __t = __schedule_sender; @@ -1092,7 +1094,7 @@ namespace exec { class __schedule_after_sender { public: - using is_sender = void; + using sender_concept = stdexec::sender_t; using __id = __schedule_after_sender; using __t = __schedule_after_sender; diff --git a/include/exec/materialize.hpp b/include/exec/materialize.hpp index b5a51be95..11ebe9ef0 100644 --- a/include/exec/materialize.hpp +++ b/include/exec/materialize.hpp @@ -28,7 +28,7 @@ namespace exec { class __t { public: - using is_receiver = void; + using receiver_concept = stdexec::receiver_t; __t(_Receiver&& __upstream) : __upstream_{(_Receiver&&) __upstream} { @@ -59,7 +59,7 @@ namespace exec { class __t { public: - using is_sender = void; + using sender_concept = stdexec::sender_t; template <__decays_to<_Sender> _Sndr> __t(_Sndr&& __sender) @@ -125,7 +125,7 @@ namespace exec { class __t { public: - using is_receiver = void; + using receiver_concept = stdexec::receiver_t; __t(_Receiver&& __upstream) : __upstream_{(_Receiver&&) __upstream} { @@ -166,7 +166,7 @@ namespace exec { class __t { public: - using is_sender = void; + using sender_concept = stdexec::sender_t; template <__decays_to<_Sender> _Sndr> __t(_Sndr&& __sndr) noexcept(__nothrow_decay_copyable<_Sndr>) diff --git a/include/exec/on_coro_disposition.hpp b/include/exec/on_coro_disposition.hpp index 44ea44126..6ccd8c3b7 100644 --- a/include/exec/on_coro_disposition.hpp +++ b/include/exec/on_coro_disposition.hpp @@ -101,8 +101,8 @@ namespace exec { private: struct __final_awaitable { - static std::false_type await_ready() noexcept { - return {}; + static constexpr bool await_ready() noexcept { + return false; } static __coro::coroutine_handle<> @@ -111,8 +111,7 @@ namespace exec { auto __coro = __p.__is_unhandled_stopped_ ? 
__p.continuation().unhandled_stopped() : __p.continuation().handle(); - __h.destroy(); - return __coro; + return STDEXEC_DESTROY_AND_CONTINUE(__h, __coro); } void await_resume() const noexcept { diff --git a/include/exec/repeat_effect_until.hpp b/include/exec/repeat_effect_until.hpp index 57eff5e67..0b4c4417f 100644 --- a/include/exec/repeat_effect_until.hpp +++ b/include/exec/repeat_effect_until.hpp @@ -47,8 +47,8 @@ namespace exec { __call_result_t; using __source_op_t = stdexec::connect_result_t<__source_on_scheduler_sender, __receiver_t>; - STDEXEC_NO_UNIQUE_ADDRESS _Source __source_; - STDEXEC_NO_UNIQUE_ADDRESS _Receiver __rcvr_; + STDEXEC_ATTRIBUTE((no_unique_address)) _Source __source_; + STDEXEC_ATTRIBUTE((no_unique_address)) _Receiver __rcvr_; __manual_lifetime<__source_op_t> __source_op_; trampoline_scheduler __sched_; @@ -72,7 +72,7 @@ namespace exec { template struct __receiver<_SourceId, _ReceiverId>::__t { - using is_receiver = void; + using receiver_concept = stdexec::receiver_t; using __id = __receiver; using _Source = stdexec::__t<_SourceId>; using _Receiver = stdexec::__t<_ReceiverId>; @@ -98,7 +98,7 @@ namespace exec { auto *__op = __self.__op_; // The following line causes the invalidation of __self. - __op->__source_op_.__destruct(); + __op->__source_op_.__destroy(); // If the sender completed with true, we're done if (__done) { @@ -119,7 +119,7 @@ namespace exec { requires __callable<_Tag, _Receiver> friend void tag_invoke(_Tag, _Self &&__self) noexcept { auto *__op = __self.__op_; - __op->__source_op_.__destruct(); + __op->__source_op_.__destroy(); stdexec::set_stopped((_Receiver &&) __op->__rcvr_); } @@ -127,7 +127,7 @@ namespace exec { requires __callable<_Tag, _Receiver, _Error> friend void tag_invoke(_Tag, _Self &&__self, _Error __error) noexcept { auto *__op = __self.__op_; - __op->__source_op_.__destruct(); + __op->__source_op_.__destroy(); stdexec::set_error((_Receiver &&) __op->__rcvr_, (_Error &&) __error); } @@ -148,9 +148,9 @@ namespace exec { using __receiver_t = stdexec::__t< __receiver<_SourceId, stdexec::__id<_Receiver>>>; struct __t { - using is_sender = void; + using sender_concept = stdexec::sender_t; using __id = __sender; - STDEXEC_NO_UNIQUE_ADDRESS _Source __source_; + STDEXEC_ATTRIBUTE((no_unique_address)) _Source __source_; template using __value_t = stdexec::completion_signatures<>; diff --git a/include/exec/sequence/any_sequence_of.hpp b/include/exec/sequence/any_sequence_of.hpp index 708812ef2..87997a940 100644 --- a/include/exec/sequence/any_sequence_of.hpp +++ b/include/exec/sequence/any_sequence_of.hpp @@ -21,7 +21,7 @@ namespace exec { namespace __any { namespace __next { - template <__is_completion_signatures _Sigs> + template <__valid_completion_signatures _Sigs> struct __rcvr_next_vfun { using __return_sigs = completion_signatures; using __void_sender = typename any_receiver_ref<__return_sigs>::template any_sender<>; @@ -37,7 +37,7 @@ namespace exec { template using __item_sender = typename any_receiver_ref<_Sigs>::template any_sender<>; - template <__is_completion_signatures _Sigs> + template <__valid_completion_signatures _Sigs> constexpr __void_sender (*operator()(_Sigs*) const)(void*, __item_sender<_Sigs>&&) { return +[](void* __r, __item_sender<_Sigs>&& __sndr) noexcept -> __void_sender { return __void_sender{ @@ -116,7 +116,7 @@ namespace exec { using __env_t = stdexec::__t<__env<__next_sigs, _Queries...>>; __env_t __env_; - using is_receiver = void; + using receiver_concept = stdexec::receiver_t; template <__none_of<__t, const 
__t, __env_t, const __env_t> _Rcvr> requires sequence_receiver_of<_Rcvr, __item_types> @@ -238,7 +238,7 @@ namespace exec { using __id = __sequence_sender; using completion_signatures = __compl_sigs; using item_types = exec::item_types<__item_sender>; - using is_sender = sequence_tag; + using sender_concept = sequence_sender_t; __t(const __t&) = delete; __t& operator=(const __t&) = delete; @@ -284,7 +284,7 @@ namespace exec { public: using __id = any_sequence_receiver_ref; using __t = any_sequence_receiver_ref; - using is_receiver = void; + using receiver_concept = stdexec::receiver_t; template _GetEnv, std::same_as<__t> _Self> requires stdexec::__callable @@ -295,7 +295,7 @@ namespace exec { template < std::same_as _SetNext, std::same_as<__t> _Self, - stdexec::__sender _Sender> + stdexec::sender _Sender> requires stdexec::__callable friend auto tag_invoke(_SetNext, _Self& __self, _Sender&& __sender) { return exec::set_next(__self.__receiver_, static_cast<_Sender&&>(__sender)); @@ -337,7 +337,7 @@ namespace exec { public: using __id = any_sender; using __t = any_sender; - using is_sender = sequence_tag; + using sender_concept = sequence_sender_t; using completion_signatures = typename __sender_base::completion_signatures; using item_types = typename __sender_base::item_types; diff --git a/include/exec/sequence/empty_sequence.hpp b/include/exec/sequence/empty_sequence.hpp index 0be237b62..cc8d6b8c2 100644 --- a/include/exec/sequence/empty_sequence.hpp +++ b/include/exec/sequence/empty_sequence.hpp @@ -29,7 +29,7 @@ namespace exec { struct __t { using __id = __operation; - STDEXEC_NO_UNIQUE_ADDRESS _Receiver __rcvr_; + STDEXEC_ATTRIBUTE((no_unique_address)) _Receiver __rcvr_; friend void tag_invoke(start_t, __t& __self) noexcept { stdexec::set_value(static_cast<_Receiver&&>(__self.__rcvr_)); @@ -40,7 +40,7 @@ namespace exec { struct __sender { struct __t { using __id = __sender; - using is_sender = sequence_tag; + using sender_concept = sequence_sender_t; using completion_signatures = stdexec::completion_signatures; using item_types = exec::item_types<>; diff --git a/include/exec/sequence/ignore_all_values.hpp b/include/exec/sequence/ignore_all_values.hpp index c5729fddc..24e861098 100644 --- a/include/exec/sequence/ignore_all_values.hpp +++ b/include/exec/sequence/ignore_all_values.hpp @@ -70,7 +70,7 @@ namespace exec { template struct __item_operation_base { - STDEXEC_NO_UNIQUE_ADDRESS _ItemReceiver __receiver_; + STDEXEC_ATTRIBUTE((no_unique_address)) _ItemReceiver __receiver_; __result_type<_ResultVariant>* __result_; }; @@ -78,7 +78,7 @@ namespace exec { struct __item_receiver { struct __t { using __id = __item_receiver; - using is_receiver = void; + using receiver_concept = stdexec::receiver_t; __item_operation_base<_ItemReceiver, _ResultVariant>* __op_; template _Tag, same_as<__t> _Self, class... 
_Args> @@ -123,10 +123,9 @@ namespace exec { __t( __result_type<_ResultVariant>* __parent, _Sender&& __sndr, - _ItemReceiver __rcvr) // - noexcept( - __nothrow_decay_copyable<_ItemReceiver> // - && __nothrow_connectable<_Sender, __item_receiver_t>) + _ItemReceiver __rcvr) // + noexcept(__nothrow_decay_copyable<_ItemReceiver> // + && __nothrow_connectable<_Sender, __item_receiver_t>) : __base_type{static_cast<_ItemReceiver&&>(__rcvr), __parent} , __op_{stdexec::connect(static_cast<_Sender&&>(__sndr), __item_receiver_t{this})} { } @@ -140,7 +139,7 @@ namespace exec { template struct __item_sender { struct __t { - using is_sender = void; + using sender_concept = stdexec::sender_t; using completion_signatures = stdexec::completion_signatures; @@ -168,7 +167,7 @@ namespace exec { template struct __operation_base : __result_type<_ResultVariant> { - STDEXEC_NO_UNIQUE_ADDRESS _Receiver __receiver_; + STDEXEC_ATTRIBUTE((no_unique_address)) _Receiver __receiver_; }; template @@ -177,7 +176,7 @@ namespace exec { struct __t { using __id = __receiver; - using is_receiver = void; + using receiver_concept = stdexec::receiver_t; __operation_base<_Receiver, _ResultVariant>* __op_; template _SetNext, same_as<__t> _Self, sender _Item> @@ -278,23 +277,26 @@ namespace exec { struct ignore_all_values_t { template auto operator()(_Sender&& __sndr) const { - auto __domain = __get_sender_domain((_Sender&&) __sndr); + auto __domain = __get_early_domain((_Sender&&) __sndr); return transform_sender( - __domain, make_sender_expr(__(), (_Sender&&) __sndr)); + __domain, __make_sexpr(__(), (_Sender&&) __sndr)); } constexpr __binder_back operator()() const noexcept { return {{}, {}, {}}; } + }; + struct __ignore_all_values_impl : __sexpr_defaults { template using __completion_sigs = __sequence_completion_signatures_of_t<_Sequence, _Env>; - template _Sender, class _Env> - static auto get_completion_signatures(_Sender&& __sndr, _Env&&) - -> __completion_sigs<__child_of<_Sender>, _Env> { - return {}; - } + static constexpr auto get_completion_signatures = // + [](_Sender&& __sndr, _Env&&) + -> __completion_sigs<__child_of<_Sender>, _Env> { + static_assert(sender_expr_for<_Sender, ignore_all_values_t>); + return {}; + }; template using _ResultVariant = __result_variant_t<_Child, env_of_t<_Receiver>>; @@ -302,19 +304,25 @@ namespace exec { template using __receiver_t = __t<__receiver<__id<_Receiver>, _ResultVariant<_Child, _Receiver>>>; - template _Sender, receiver _Receiver> + static constexpr auto connect = // + [](_Sender&& __sndr, _Receiver __rcvr) noexcept( + __nothrow_callable<__sexpr_apply_t, _Sender, __connect_fn<_Receiver>>) + -> __call_result_t<__sexpr_apply_t, _Sender, __connect_fn<_Receiver>> requires receiver_of<_Receiver, __completion_sigs<__child_of<_Sender>, env_of_t<_Receiver>>> && sequence_sender_to< __child_of<_Sender>, - __receiver_t<__child_of<_Sender>, _Receiver>> - static auto connect(_Sender&& __sndr, _Receiver __rcvr) noexcept( - __nothrow_callable>) - -> __call_result_t> { - return apply_sender((_Sender&&) __sndr, __connect_fn<_Receiver>{__rcvr}); - } + __receiver_t<__child_of<_Sender>, _Receiver>> { + static_assert(sender_expr_for<_Sender, ignore_all_values_t>); + return __sexpr_apply((_Sender&&) __sndr, __connect_fn<_Receiver>{__rcvr}); + }; }; } using __ignore_all_values::ignore_all_values_t; inline constexpr ignore_all_values_t ignore_all_values{}; +} + +namespace stdexec { + template <> + struct __sexpr_impl : exec::__ignore_all_values::__ignore_all_values_impl {}; } \ No newline at end of file 
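Throughout this patch, the old `using is_sender = void;` / `using is_receiver = void;` opt-in tags give way to the type-tag spelling `using sender_concept = stdexec::sender_t;` / `using receiver_concept = stdexec::receiver_t;` adopted by later revisions of P2300. For reference, a minimal hand-rolled sender against the new opt-in, written in the tag_invoke style this snapshot of stdexec still uses (hypothetical sketch, not taken from the patch):

```cpp
#include <stdexec/execution.hpp>

struct just_42 {
  using sender_concept = stdexec::sender_t;  // new-style opt-in
  using completion_signatures =
    stdexec::completion_signatures<stdexec::set_value_t(int)>;

  template <class Receiver>
  struct op {
    Receiver rcvr_;
    friend void tag_invoke(stdexec::start_t, op& self) noexcept {
      stdexec::set_value(static_cast<Receiver&&>(self.rcvr_), 42);
    }
  };

  template <stdexec::receiver Receiver>
  friend op<Receiver> tag_invoke(stdexec::connect_t, just_42, Receiver rcvr) {
    return {static_cast<Receiver&&>(rcvr)};
  }
};

// auto [i] = stdexec::sync_wait(just_42{}).value();  // i == 42
```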
diff --git a/include/exec/sequence/iterate.hpp b/include/exec/sequence/iterate.hpp index 96ef4550f..8532af0e7 100644 --- a/include/exec/sequence/iterate.hpp +++ b/include/exec/sequence/iterate.hpp @@ -34,8 +34,8 @@ namespace exec { template struct __operation_base { - STDEXEC_NO_UNIQUE_ADDRESS _Iterator __iterator_; - STDEXEC_NO_UNIQUE_ADDRESS _Sentinel __sentinel_; + STDEXEC_ATTRIBUTE((no_unique_address)) _Iterator __iterator_; + STDEXEC_ATTRIBUTE((no_unique_address)) _Sentinel __sentinel_; }; template @@ -46,7 +46,7 @@ namespace exec { struct __item_operation { struct __t { using __id = __item_operation; - STDEXEC_NO_UNIQUE_ADDRESS _ItemRcvr __rcvr_; + STDEXEC_ATTRIBUTE((no_unique_address)) _ItemRcvr __rcvr_; __operation_base<_Iterator, _Sentinel>* __parent_; friend void tag_invoke(start_t, __t& __self) noexcept { @@ -60,7 +60,7 @@ namespace exec { struct __sender { struct __t { using __id = __sender; - using is_sender = void; + using sender_concept = stdexec::sender_t; using completion_signatures = stdexec::completion_signatures)>; __operation_base<_Iterator, _Sentinel>* __parent_; @@ -88,7 +88,7 @@ namespace exec { struct __t { using _Receiver = stdexec::__t<_ReceiverId>; using __id = __next_receiver; - using is_receiver = void; + using receiver_concept = stdexec::receiver_t; stdexec::__t<__operation<_Range, _ReceiverId>>* __op_; template _SetValue, same_as<__t> _Self> @@ -189,9 +189,9 @@ namespace exec { sequence_receiver_of>> _Receiver> requires sender_to<_NextSender<_SeqExpr, _Receiver>, _NextReceiver<_SeqExpr, _Receiver>> static auto subscribe(_SeqExpr&& __seq, _Receiver __rcvr) noexcept( - __nothrow_callable>) - -> __call_result_t> { - return apply_sender(static_cast<_SeqExpr&&>(__seq), __subscribe_fn<_Receiver>{__rcvr}); + __nothrow_callable<__sexpr_apply_t, _SeqExpr, __subscribe_fn<_Receiver>>) + -> __call_result_t<__sexpr_apply_t, _SeqExpr, __subscribe_fn<_Receiver>> { + return __sexpr_apply(static_cast<_SeqExpr&&>(__seq), __subscribe_fn<_Receiver>{__rcvr}); } static auto get_completion_signatures(__ignore, __ignore) noexcept diff --git a/include/exec/sequence/transform_each.hpp b/include/exec/sequence/transform_each.hpp index b09582598..57473cc67 100644 --- a/include/exec/sequence/transform_each.hpp +++ b/include/exec/sequence/transform_each.hpp @@ -21,7 +21,7 @@ #include "../__detail/__basic_sequence.hpp" namespace exec { - namespace __transform { + namespace __transform_each { using namespace stdexec; template @@ -35,7 +35,7 @@ namespace exec { using _Receiver = stdexec::__t<_ReceiverId>; struct __t { - using is_receiver = void; + using receiver_concept = stdexec::receiver_t; using __id = __receiver; __operation_base<_Receiver, _Adaptor>* __op_; @@ -44,7 +44,7 @@ namespace exec { && __callable> friend auto tag_invoke(_SetNext, _Self& __self, _Item&& __item) noexcept( __nothrow_callable<_SetNext, _Receiver&, __call_result_t<_Adaptor&, _Item>> // - && __nothrow_callable<_Adaptor&, _Item>) + && __nothrow_callable<_Adaptor&, _Item>) -> next_sender_of_t<_Receiver, __call_result_t<_Adaptor&, _Item>> { return exec::set_next( __self.__op_->__receiver_, __self.__op_->__adaptor_(static_cast<_Item&&>(__item))); @@ -104,8 +104,8 @@ namespace exec { template auto operator()(__ignore, _Adaptor __adaptor, _Sequence&& __sequence) noexcept( - __nothrow_decay_copyable<_Adaptor> && __nothrow_decay_copyable<_Sequence> - && __nothrow_decay_copyable<_Receiver>) + __nothrow_decay_copyable<_Adaptor>&& __nothrow_decay_copyable<_Sequence>&& + __nothrow_decay_copyable<_Receiver>) -> __t< 
__operation<_Sequence, __id<_Receiver>, _Adaptor>> { return { static_cast<_Sequence&&>(__sequence), @@ -121,8 +121,9 @@ namespace exec { struct _WITH_ITEM_SENDER_ { }; template - auto __try_call(_Item*) - -> stdexec::__mexception<_NOT_CALLABLE_ADAPTOR_<_Adaptor&>, _WITH_ITEM_SENDER_>>; + auto __try_call(_Item*) -> stdexec::__mexception< + _NOT_CALLABLE_ADAPTOR_<_Adaptor&>, + _WITH_ITEM_SENDER_>>; template requires stdexec::__callable<_Adaptor&, _Item> @@ -139,9 +140,9 @@ namespace exec { struct transform_each_t { template - auto operator()(_Sequence&& __sndr, _Adaptor&& __adaptor) const noexcept( - __nothrow_decay_copyable<_Sequence> // - && __nothrow_decay_copyable<_Adaptor>) { + auto operator()(_Sequence&& __sndr, _Adaptor&& __adaptor) const + noexcept(__nothrow_decay_copyable<_Sequence> // + && __nothrow_decay_copyable<_Adaptor>) { return make_sequence_expr( static_cast<_Adaptor&&>(__adaptor), static_cast<_Sequence&&>(__sndr)); } @@ -185,20 +186,20 @@ namespace exec { && sequence_receiver_of<_Receiver, __item_types_t<_Self, env_of_t<_Receiver>>> && sequence_sender_to<__child_of<_Self>, __receiver_t<_Self, _Receiver>> static auto subscribe(_Self&& __self, _Receiver __rcvr) noexcept( - __nothrow_callable>) - -> __call_result_t> { - return apply_sender(static_cast<_Self&&>(__self), __subscribe_fn<_Receiver>{__rcvr}); + __nothrow_callable<__sexpr_apply_t, _Self, __subscribe_fn<_Receiver>>) + -> __call_result_t<__sexpr_apply_t, _Self, __subscribe_fn<_Receiver>> { + return __sexpr_apply(static_cast<_Self&&>(__self), __subscribe_fn<_Receiver>{__rcvr}); } template _Sexpr> static env_of_t<__child_of<_Sexpr>> get_env(const _Sexpr& __sexpr) noexcept { - return apply_sender(__sexpr, [](__ignore, __ignore, const _Child& __child) { + return __sexpr_apply(__sexpr, [](__ignore, __ignore, const _Child& __child) { return stdexec::get_env(__child); }); } }; } - using __transform::transform_each_t; + using __transform_each::transform_each_t; inline constexpr transform_each_t transform_each{}; } \ No newline at end of file diff --git a/include/exec/sequence_senders.hpp b/include/exec/sequence_senders.hpp index 5dbc92cc8..3b25ff8e4 100644 --- a/include/exec/sequence_senders.hpp +++ b/include/exec/sequence_senders.hpp @@ -19,7 +19,9 @@ #include "../stdexec/execution.hpp" namespace exec { - struct sequence_tag { }; + struct sequence_sender_t : stdexec::sender_t { }; + + using sequence_tag [[deprecated("Renamed to exec::sequence_sender_t")]] = exec::sequence_sender_t; namespace __sequence_sndr { using namespace stdexec; @@ -77,11 +79,11 @@ namespace exec { template struct __stopped_means_break { struct __t { - using is_receiver = void; + using receiver_concept = stdexec::receiver_t; using __id = __stopped_means_break; using _Receiver = stdexec::__t<_ReceiverId>; using _Token = stop_token_of_t>; - STDEXEC_NO_UNIQUE_ADDRESS _Receiver __rcvr_; + STDEXEC_ATTRIBUTE((no_unique_address)) _Receiver __rcvr_; template _GetEnv, same_as<__t> _Self> friend env_of_t<_Receiver> tag_invoke(_GetEnv, const _Self& __self) noexcept { @@ -117,9 +119,9 @@ namespace exec { } // namespace __sequence_sndr template - concept __enable_sequence_sender = // - requires { typename _Sender::is_sender; } && // - stdexec::same_as; + concept __enable_sequence_sender = // + requires { typename _Sender::sender_concept; } && // + stdexec::same_as; template inline constexpr bool enable_sequence_sender = __enable_sequence_sender<_Sender>; @@ -135,7 +137,8 @@ namespace exec { namespace __sequence_sndr { struct get_item_types_t; template - using 
__tfx_sender = transform_sender_result_t<__env_domain_of_t<_Env>, _Sender, _Env>; + using __tfx_sender = + transform_sender_result_t<__late_domain_of_t<_Sender, _Env>, _Sender, _Env>; template concept __with_tag_invoke = // @@ -160,7 +163,7 @@ namespace exec { using _Result = __member_alias_t<_TfxSender, _Env>; return (_Result(*)()) nullptr; } else if constexpr ( - sender<_TfxSender, _Env> && !enable_sequence_sender>) { + sender_in<_TfxSender, _Env> && !enable_sequence_sender>) { using _Result = item_types>; return (_Result(*)()) nullptr; } else if constexpr (__is_debug_env<_Env>) { @@ -194,8 +197,8 @@ namespace exec { decltype(get_item_types(stdexec::__declval<_Sender>(), stdexec::__declval<_Env>())); template - concept sequence_sender = // - stdexec::sender<_Sender, _Env> && // + concept sequence_sender = // + stdexec::sender_in<_Sender, _Env> && // enable_sequence_sender>; template @@ -267,7 +270,11 @@ namespace exec { template using __sequence_completion_signatures_of_t = stdexec::__concat_completion_signatures_t< - stdexec::completion_signatures, + stdexec::__try_make_completion_signatures< + _Sequence, + _Env, + stdexec::completion_signatures, + stdexec::__mconst>>, stdexec::__mapply< stdexec::__q, stdexec::__mapply< @@ -324,20 +331,11 @@ namespace exec { template static constexpr auto __select_impl() noexcept { - using _Domain = __env_domain_of_t>; + using _Domain = __late_domain_of_t<_Sender, env_of_t<_Receiver&>>; constexpr bool _NothrowTfxSender = __nothrow_callable && __nothrow_callable>; using _TfxSender = __tfx_sndr<_Sender, _Receiver>; - if constexpr (!enable_sender<__decay_t<_Sender>>) - __connect::_PLEASE_UPDATE_YOUR_SENDER_TYPE<__decay_t<_Sender>>(); - - if constexpr (!enable_sender<__decay_t<_TfxSender>>) - __connect::_PLEASE_UPDATE_YOUR_SENDER_TYPE<__decay_t<_TfxSender>>(); - - if constexpr (!enable_receiver<__decay_t<_Receiver>>) - __connect::_PLEASE_UPDATE_YOUR_RECEIVER_TYPE<__decay_t<_Receiver>>(); - if constexpr (__next_connectable_with_tag_invoke<_TfxSender, _Receiver>) { using _Result = tag_invoke_result_t< connect_t, @@ -370,7 +368,7 @@ namespace exec { -> __call_result_t<__select_impl_t<_Sender, _Receiver>> { using _TfxSender = __tfx_sndr<_Sender, _Receiver>; auto&& __env = get_env(__rcvr); - auto __domain = __get_env_domain(__env); + auto __domain = __get_late_domain(__sndr, __env); if constexpr (__next_connectable_with_tag_invoke<_TfxSender, _Receiver>) { static_assert( operation_state #include #include #include #include +#include #include #include #include @@ -32,11 +43,121 @@ namespace exec { using stdexec::__intrusive_queue; + // Splits `n` into `size` chunks distributing `n % size` evenly between ranks. + // Returns `[begin, end)` range in `n` for a given `rank`. + // Example: + // ```cpp + // // n_items thread n_threads + // even_share( 11, 0, 3); // -> [0, 4) -> 4 items + // even_share( 11, 1, 3); // -> [4, 8) -> 4 items + // even_share( 11, 2, 3); // -> [8, 11) -> 3 items + // ``` + template + std::pair even_share(Shape n, std::uint32_t rank, std::uint32_t size) noexcept { + const auto avg_per_thread = n / size; + const auto n_big_share = avg_per_thread + 1; + const auto big_shares = n % size; + const auto is_big_share = rank < big_shares; + const auto begin = is_big_share + ? n_big_share * rank + : n_big_share * big_shares + (rank - big_shares) * avg_per_thread; + const auto end = begin + (is_big_share ? 
n_big_share : avg_per_thread); + + return std::make_pair(begin, end); + } + +#if STDEXEC_HAS_STD_RANGES() + namespace schedule_all_ { + template + struct sequence { + class __t; + }; + } +#endif + + template + struct not_a_sender { + using sender_concept = stdexec::sender_t; + }; + struct task_base { task_base* next; void (*__execute)(task_base*, std::uint32_t tid) noexcept; }; + struct bwos_params { + std::size_t numBlocks{8}; + std::size_t blockSize{1024}; + }; + + struct remote_queue { + explicit remote_queue(std::size_t nthreads) noexcept + : queues_(nthreads) { + } + + explicit remote_queue(remote_queue* next, std::size_t nthreads) noexcept + : next_(next) + , queues_(nthreads) { + } + + remote_queue* next_{}; + std::vector<__atomic_intrusive_queue<&task_base::next>> queues_{}; + std::thread::id id_{std::this_thread::get_id()}; + // This marks whether the submitter is a thread in the pool or not. + std::size_t index_{std::numeric_limits::max()}; + }; + + struct remote_queue_list { + private: + std::atomic head_; + remote_queue* tail_; + std::size_t nthreads_; + remote_queue this_remotes_; + + public: + explicit remote_queue_list(std::size_t nthreads) noexcept + : head_{&this_remotes_} + , tail_{&this_remotes_} + , nthreads_(nthreads) + , this_remotes_(nthreads) { + } + + ~remote_queue_list() noexcept { + remote_queue* head = head_.load(std::memory_order_acquire); + while (head != tail_) { + remote_queue* tmp = std::exchange(head, head->next_); + delete tmp; + } + } + + __intrusive_queue<&task_base::next> pop_all_reversed(std::size_t tid) noexcept { + remote_queue* head = head_.load(std::memory_order_acquire); + __intrusive_queue<&task_base::next> tasks{}; + while (head != nullptr) { + tasks.append(head->queues_[tid].pop_all_reversed()); + head = head->next_; + } + return tasks; + } + + remote_queue* get() { + thread_local std::thread::id this_id = std::this_thread::get_id(); + remote_queue* head = head_.load(std::memory_order_acquire); + remote_queue* queue = head; + while (queue != tail_) { + if (queue->id_ == this_id) { + return queue; + } + queue = queue->next_; + } + remote_queue* new_head = new remote_queue{head, nthreads_}; + while (!head_.compare_exchange_weak(head, new_head, std::memory_order_acq_rel)) { + new_head->next_ = head; + } + return new_head; + } + }; + class static_thread_pool { template class operation; @@ -55,14 +176,28 @@ namespace exec { Shape, stdexec::__x>>; +#if STDEXEC_MSVC() + // MSVCBUG https://developercommunity.visualstudio.com/t/Alias-template-with-pack-expansion-in-no/10437850 + + template + struct __bulk_non_throwing { + using __t = stdexec::__decayed_tuple; + static constexpr bool __v = noexcept(__t(std::declval()...)); + }; +#endif + template requires stdexec::__callable using bulk_non_throwing = // stdexec::__mbool< // If function invocation doesn't throw stdexec::__nothrow_callable && - // and emplacing a tuple doesn't throw + // and emplacing a tuple doesn't throw +#if STDEXEC_MSVC() + __bulk_non_throwing::__v +#else noexcept(stdexec::__decayed_tuple(std::declval()...)) +#endif // there's no need to advertise completion with `exception_ptr` >; @@ -86,27 +221,80 @@ namespace exec { static_thread_pool& pool_; }; +#if STDEXEC_HAS_STD_RANGES() + struct transform_iterate { + template + stdexec::__t> operator()(exec::iterate_t, Range&& range) { + return {static_cast(range), pool_}; + } + + static_thread_pool& pool_; + }; +#endif + + public: struct domain { // For eager customization template Sender> auto transform_sender(Sender&& sndr) const noexcept { 
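// [editorial note, not in the source] This eager customization hook is how
// the pool takes over stdexec::bulk. When the bulk sender's completion
// scheduler is this pool's scheduler, the hunk below rewrites the generic
// expression into the pool's parallel bulk sender; otherwise it returns
// not_a_sender<...>, whose template argument names a diagnostic type, and
// the static_assert below fails with the readable message quoted there.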
- auto sched = stdexec::get_completion_scheduler( - stdexec::get_env(sndr)); - return stdexec::apply_sender((Sender&&) sndr, transform_bulk{*sched.pool_}); + if constexpr (stdexec::__completes_on) { + auto sched = stdexec::get_completion_scheduler( + stdexec::get_env(sndr)); + return stdexec::__sexpr_apply((Sender&&) sndr, transform_bulk{*sched.pool_}); + } else { + static_assert( + stdexec::__completes_on, + "No static_thread_pool instance can be found in the sender's environment " + "on which to schedule bulk work."); + return not_a_sender>(); + } + STDEXEC_UNREACHABLE(); } // transform the generic bulk sender into a parallel thread-pool bulk sender template Sender, class Env> + auto transform_sender(Sender&& sndr, const Env& env) const noexcept { + if constexpr (stdexec::__completes_on) { + auto sched = stdexec::get_completion_scheduler( + stdexec::get_env(sndr)); + return stdexec::__sexpr_apply((Sender&&) sndr, transform_bulk{*sched.pool_}); + } else if constexpr (stdexec::__starts_on) { + auto sched = stdexec::get_scheduler(env); + return stdexec::__sexpr_apply((Sender&&) sndr, transform_bulk{*sched.pool_}); + } else { + static_assert( // + stdexec::__starts_on + || stdexec::__completes_on, + "No static_thread_pool instance can be found in the sender's or receiver's " + "environment on which to schedule bulk work."); + return not_a_sender>(); + } + STDEXEC_UNREACHABLE(); + } + +#if STDEXEC_HAS_STD_RANGES() + template Sender> + auto transform_sender(Sender&& sndr) const noexcept { + auto sched = stdexec::get_completion_scheduler( + stdexec::get_env(sndr)); + return stdexec::__sexpr_apply((Sender&&) sndr, transform_iterate{*sched.pool_}); + } + + template Sender, class Env> requires stdexec::__callable auto transform_sender(Sender&& sndr, const Env& env) const noexcept { auto sched = stdexec::get_scheduler(env); - return stdexec::apply_sender((Sender&&) sndr, transform_bulk{*sched.pool_}); + return stdexec::__sexpr_apply((Sender&&) sndr, transform_iterate{*sched.pool_}); } +#endif }; public: static_thread_pool(); - static_thread_pool(std::uint32_t threadCount); + static_thread_pool( + std::uint32_t threadCount, + bwos_params params = {}, + numa_policy* numa = get_numa_policy()); ~static_thread_pool(); struct scheduler { @@ -122,13 +310,14 @@ namespace exec { public: using __t = sender; using __id = sender; - using is_sender = void; + using sender_concept = stdexec::sender_t; using completion_signatures = stdexec::completion_signatures< stdexec::set_value_t(), stdexec::set_stopped_t()>; private: template auto make_operation_(Receiver r) const -> operation> { - return operation>{pool_, (Receiver&&) r}; + return operation>{ + pool_, queue_, (Receiver&&) r, threadIndex_, constraints_}; } template @@ -139,6 +328,7 @@ namespace exec { struct env { static_thread_pool& pool_; + remote_queue* queue_; template friend static_thread_pool::scheduler @@ -147,25 +337,35 @@ namespace exec { } static_thread_pool::scheduler make_scheduler_() const { - return static_thread_pool::scheduler{pool_}; + return static_thread_pool::scheduler{pool_, *queue_}; } }; friend env tag_invoke(stdexec::get_env_t, const sender& self) noexcept { - return env{self.pool_}; + return env{self.pool_, self.queue_}; } friend struct static_thread_pool::scheduler; - explicit sender(static_thread_pool& pool) noexcept - : pool_(pool) { + explicit sender( + static_thread_pool& pool, + remote_queue* queue, + std::size_t threadIndex, + const nodemask& constraints) noexcept + : pool_(pool) + , queue_(queue) + , threadIndex_(threadIndex) 
+ , constraints_(constraints) { } static_thread_pool& pool_; + remote_queue* queue_; + std::size_t threadIndex_{std::numeric_limits::max()}; + nodemask constraints_{}; }; sender make_sender_() const { - return sender{*pool_}; + return sender{*pool_, queue_, thread_idx_, nodemask_}; } friend sender tag_invoke(stdexec::schedule_t, const scheduler& s) noexcept { @@ -183,68 +383,275 @@ namespace exec { friend class static_thread_pool; - explicit scheduler(static_thread_pool& pool) noexcept - : pool_(&pool) { + explicit scheduler(static_thread_pool& pool, const nodemask& mask = nodemask::any()) noexcept + : pool_(&pool) + , queue_{pool.get_remote_queue()} + , nodemask_{mask} { + } + + explicit scheduler( + static_thread_pool& pool, + remote_queue& queue, + const nodemask& mask = nodemask::any()) noexcept + : pool_(&pool) + , queue_{&queue} + , nodemask_{mask} { + } + + explicit scheduler( + static_thread_pool& pool, + remote_queue& queue, + std::size_t threadIndex) noexcept + : pool_(&pool) + , queue_{&queue} + , thread_idx_{threadIndex} { } static_thread_pool* pool_; + remote_queue* queue_; + nodemask nodemask_; + std::size_t thread_idx_{std::numeric_limits::max()}; }; scheduler get_scheduler() noexcept { return scheduler{*this}; } + scheduler get_scheduler_on_thread(std::size_t threadIndex) noexcept { + return scheduler{*this, *get_remote_queue(), threadIndex}; + } + + scheduler get_constrained_scheduler(const nodemask& constraints) noexcept { + return scheduler{*this, *get_remote_queue(), constraints}; + } + + remote_queue* get_remote_queue() noexcept { + remote_queue* queue = remotes_.get(); + std::size_t index = 0; + for (std::thread& t: threads_) { + if (t.get_id() == queue->id_) { + queue->index_ = index; + break; + } + ++index; + } + return queue; + } + void request_stop() noexcept; std::uint32_t available_parallelism() const { return threadCount_; } + bwos_params params() const { + return params_; + } + + void enqueue(task_base* task, const nodemask& contraints = nodemask::any()) noexcept; + void enqueue( + remote_queue& queue, + task_base* task, + const nodemask& contraints = nodemask::any()) noexcept; + void enqueue(remote_queue& queue, task_base* task, std::size_t threadIndex) noexcept; + + template TaskT> + void bulk_enqueue(TaskT* task, std::uint32_t n_threads) noexcept; + void bulk_enqueue( + remote_queue& queue, + __intrusive_queue<&task_base::next> tasks, + std::size_t tasks_size, + const nodemask& constraints = nodemask::any()) noexcept; + private: - class thread_state { + class workstealing_victim { public: - task_base* try_pop(); - task_base* pop(); - bool try_push(task_base* task); - void push(task_base* task); - void request_stop(); + explicit workstealing_victim( + bwos::lifo_queue>* queue, + std::uint32_t index, + int numa_node) noexcept + : queue_(queue) + , index_(index) + , numa_node_(numa_node) { + } + + task_base* try_steal() noexcept { + return queue_->steal_front(); + } + + std::uint32_t index() const noexcept { + return index_; + } + + int numa_node() const noexcept { + return numa_node_; + } private: - std::mutex mut_; - std::condition_variable cv_; - __intrusive_queue<&task_base::next> queue_; - bool stopRequested_ = false; + bwos::lifo_queue>* queue_; + std::uint32_t index_; + int numa_node_; }; - void run(std::uint32_t index) noexcept; - void join() noexcept; + struct thread_state_base { + explicit thread_state_base(std::uint32_t index, numa_policy* numa) noexcept + : index_(index) + , numa_node_(numa->thread_index_to_node(index)) { + } - void 
enqueue(task_base* task) noexcept;
+      std::uint32_t index_;
+      int numa_node_;
+    };

-    template <std::derived_from<task_base> TaskT>
-    void bulk_enqueue(TaskT* task, std::uint32_t n_threads) noexcept;
+    class thread_state : private thread_state_base {
+     public:
+      struct pop_result {
+        task_base* task;
+        std::uint32_t queueIndex;
+      };
+
+      explicit thread_state(
+        static_thread_pool* pool,
+        std::uint32_t index,
+        bwos_params params,
+        numa_policy* numa) noexcept
+        : thread_state_base(index, numa)
+        , local_queue_(
+            params.numBlocks,
+            params.blockSize,
+            numa_allocator<task_base*>(this->numa_node_))
+        , state_(state::running)
+        , pool_(pool) {
+        std::random_device rd;
+        rng_.seed(rd);
+      }
+
+      pop_result pop();
+      void push_local(task_base* task);
+      void push_local(__intrusive_queue<&task_base::next>&& tasks);
+
+      bool notify();
+      void request_stop();
+
+      void victims(const std::vector<workstealing_victim>& victims) {
+        for (workstealing_victim v: victims) {
+          if (v.index() == index_) {
+            // skip self
+            continue;
+          }
+          if (v.numa_node() == numa_node_) {
+            near_victims_.push_back(v);
+          }
+          all_victims_.push_back(v);
+        }
+      }
+
+      std::uint32_t index() const noexcept {
+        return index_;
+      }
+
+      int numa_node() const noexcept {
+        return numa_node_;
+      }
+
+      workstealing_victim as_victim() noexcept {
+        return workstealing_victim{&local_queue_, index_, numa_node_};
+      }
+     private:
+      enum state {
+        running,
+        stealing,
+        sleeping,
+        notified
+      };
+
+      pop_result try_pop();
+      pop_result try_remote();
+      pop_result try_steal(std::span<workstealing_victim> victims);
+      pop_result try_steal_near();
+      pop_result try_steal_any();
+
+      void notify_one_sleeping();
+      void set_stealing();
+      void clear_stealing();
+
+      bwos::lifo_queue<task_base*, numa_allocator<task_base*>> local_queue_;
+      __intrusive_queue<&task_base::next> pending_queue_{};
+      std::mutex mut_{};
+      std::condition_variable cv_{};
+      bool stopRequested_{false};
+      std::vector<workstealing_victim> near_victims_{};
+      std::vector<workstealing_victim> all_victims_{};
+      std::atomic<state> state_;
+      static_thread_pool* pool_;
+      xorshift rng_{};
+    };
+
+    void run(std::uint32_t index, numa_policy* numa) noexcept;
+    void join() noexcept;
+
+    alignas(64) std::atomic<std::uint64_t> numThiefs_{};
+    alignas(64) remote_queue_list remotes_;
     std::uint32_t threadCount_;
+    std::uint32_t maxSteals_{threadCount_ + 1};
+    bwos_params params_;
     std::vector<std::thread> threads_;
-    std::vector<thread_state> threadStates_;
-    std::atomic<std::uint32_t> nextThread_;
+    std::vector<std::optional<thread_state>> threadStates_;
+    numa_policy* numa_;
+
+    struct thread_index_by_numa_node {
+      int numa_node;
+      int thread_index;
+
+      friend bool operator<(
+        const thread_index_by_numa_node& lhs,
+        const thread_index_by_numa_node& rhs) noexcept {
+        return lhs.numa_node < rhs.numa_node;
+      }
+    };
+
+    std::vector<thread_index_by_numa_node> threadIndexByNumaNode_;
+
+    std::size_t num_threads(int numa) const noexcept;
+    std::size_t num_threads(nodemask constraints) const noexcept;
+    std::size_t get_thread_index(int numa, std::size_t index) const noexcept;
+    std::size_t random_thread_index_with_constraints(const nodemask& constraints) noexcept;
  };

  inline static_thread_pool::static_thread_pool()
    : static_thread_pool(std::thread::hardware_concurrency()) {
  }

-  inline static_thread_pool::static_thread_pool(std::uint32_t threadCount)
-    : threadCount_(threadCount)
+  inline static_thread_pool::static_thread_pool(
+    std::uint32_t threadCount,
+    bwos_params params,
+    numa_policy* numa)
+    : remotes_(threadCount)
+    , threadCount_(threadCount)
+    , params_(params)
     , threadStates_(threadCount)
-    , nextThread_(0) {
+    , numa_{numa} {
     STDEXEC_ASSERT(threadCount > 0);
+    for (std::uint32_t index = 0; index < threadCount; ++index) {
+      threadStates_[index].emplace(this, index, params, numa);
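// [editorial note, not in the source] Each worker slot is constructed in
// place with emplace() because thread_state owns a mutex and a condition
// variable and therefore cannot be moved into the vector. The
// (numa_node, thread_index) records pushed just below are sorted once so
// that num_threads() and get_thread_index() can binary-search them when
// mapping a nodemask constraint onto a concrete worker thread.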
+      threadIndexByNumaNode_.push_back(
+        thread_index_by_numa_node{threadStates_[index]->numa_node(), static_cast<int>(index)});
+    }
+    std::sort(threadIndexByNumaNode_.begin(), threadIndexByNumaNode_.end());
+    std::vector<workstealing_victim> victims{};
+    for (auto& state: threadStates_) {
+      victims.emplace_back(state->as_victim());
+    }
+    for (auto& state: threadStates_) {
+      state->victims(victims);
+    }
     threads_.reserve(threadCount);
     try {
       for (std::uint32_t i = 0; i < threadCount; ++i) {
-        threads_.emplace_back([this, i] { run(i); });
+        threads_.emplace_back([this, i, numa] { run(i, numa); });
       }
     } catch (...) {
       request_stop();
@@ -260,27 +667,19 @@
  inline void static_thread_pool::request_stop() noexcept {
    for (auto& state: threadStates_) {
-      state.request_stop();
+      state->request_stop();
    }
  }

-  inline void static_thread_pool::run(const std::uint32_t threadIndex) noexcept {
+  inline void static_thread_pool::run(std::uint32_t threadIndex, numa_policy* numa) noexcept {
+    numa->bind_to_node(threadStates_[threadIndex]->numa_node());
    STDEXEC_ASSERT(threadIndex < threadCount_);
    while (true) {
-      task_base* task = nullptr;
-      std::uint32_t queueIndex = threadIndex;
-
-      // Starting with this thread's queue, try to de-queue a task
-      // from each thread's queue. try_pop() is non-blocking.
-      do {
-        task = threadStates_[queueIndex].try_pop();
-      } while (!task && (++queueIndex %= threadCount_) != threadIndex);
-
-      STDEXEC_ASSERT(task || queueIndex == threadIndex);
      // Make a blocking call to de-queue a task if we don't already have one.
-      if (!task && !(task = threadStates_[queueIndex].pop()))
+      auto [task, queueIndex] = threadStates_[threadIndex]->pop();
+      if (!task) {
        return; // pop() only returns null when request_stop() was called.
-
+      }
      task->__execute(task, queueIndex);
    }
  }
@@ -292,89 +691,297 @@
    threads_.clear();
  }

-  inline void static_thread_pool::enqueue(task_base* task) noexcept {
-    const std::uint32_t threadCount = static_cast<std::uint32_t>(threads_.size());
-    const std::uint32_t startIndex =
-      nextThread_.fetch_add(1, std::memory_order_relaxed) % threadCount;
+  inline void static_thread_pool::enqueue(task_base* task, const nodemask& constraints) noexcept {
+    this->enqueue(*get_remote_queue(), task, constraints);
+  }
+
+  inline std::size_t static_thread_pool::num_threads(int numa) const noexcept {
+    thread_index_by_numa_node key{numa, 0};
+    auto it = std::lower_bound(threadIndexByNumaNode_.begin(), threadIndexByNumaNode_.end(), key);
+    if (it == threadIndexByNumaNode_.end()) {
+      return 0;
+    }
+    auto itEnd = std::upper_bound(it, threadIndexByNumaNode_.end(), key);
+    return std::distance(it, itEnd);
+  }
+
+  inline std::size_t static_thread_pool::num_threads(nodemask constraints) const noexcept {
+    const std::size_t nNodes = threadIndexByNumaNode_.back().numa_node + 1;
+    std::size_t nThreads = 0;
+    for (std::size_t nodeIndex = 0; nodeIndex < nNodes; ++nodeIndex) {
+      if (!constraints[nodeIndex]) {
+        continue;
+      }
+      nThreads += num_threads(nodeIndex);
+    }
+    return nThreads;
+  }
+
+  inline std::size_t
+    static_thread_pool::get_thread_index(int nodeIndex, std::size_t threadIndex) const noexcept {
+    thread_index_by_numa_node key{nodeIndex, 0};
+    auto it = std::lower_bound(threadIndexByNumaNode_.begin(), threadIndexByNumaNode_.end(), key);
+    STDEXEC_ASSERT(it != threadIndexByNumaNode_.end());
+    std::advance(it, threadIndex);
+    return it->thread_index;
+  }
+
+  inline std::size_t
+    static_thread_pool::random_thread_index_with_constraints(const nodemask& constraints) noexcept {
+    thread_local std::uint64_t
startIndex{std::uint64_t(std::random_device{}())}; + startIndex += 1; + std::size_t targetIndex = startIndex % threadCount_; + std::size_t nThreads = num_threads(constraints); + if (nThreads != 0) { + for (std::size_t nodeIndex = 0; nodeIndex < numa_->num_nodes(); ++nodeIndex) { + if (!constraints[nodeIndex]) { + continue; + } + std::size_t nThreads = num_threads(nodeIndex); + if (targetIndex < nThreads) { + return get_thread_index(nodeIndex, targetIndex); + } + targetIndex -= nThreads; + } + } + return targetIndex; + } - // First try to enqueue to one of the threads without blocking. - for (std::uint32_t i = 0; i < threadCount; ++i) { - const auto index = - (startIndex + i) < threadCount ? (startIndex + i) : (startIndex + i - threadCount); - if (threadStates_[index].try_push(task)) { + inline void static_thread_pool::enqueue( + remote_queue& queue, + task_base* task, + const nodemask& constraints) noexcept { + static thread_local std::thread::id this_id = std::this_thread::get_id(); + remote_queue* correct_queue = this_id == queue.id_ ? &queue : get_remote_queue(); + std::size_t idx = correct_queue->index_; + if (idx < threadStates_.size()) { + std::size_t this_node = threadStates_[idx]->numa_node(); + if (constraints[this_node]) { + threadStates_[idx]->push_local(task); return; } } + const std::size_t threadIndex = random_thread_index_with_constraints(constraints); + queue.queues_[threadIndex].push_front(task); + threadStates_[threadIndex]->notify(); + } - // Otherwise, do a blocking enqueue on the selected thread. - threadStates_[startIndex].push(task); + inline void static_thread_pool::enqueue( + remote_queue& queue, + task_base* task, + std::size_t threadIndex) noexcept { + threadIndex %= threadCount_; + queue.queues_[threadIndex].push_front(task); + threadStates_[threadIndex]->notify(); } template TaskT> - inline void static_thread_pool::bulk_enqueue(TaskT* task, std::uint32_t n_threads) noexcept { + void static_thread_pool::bulk_enqueue(TaskT* task, std::uint32_t n_threads) noexcept { + auto& queue = *get_remote_queue(); for (std::size_t i = 0; i < n_threads; ++i) { - threadStates_[i % available_parallelism()].push(task + i); + std::uint32_t index = i % available_parallelism(); + queue.queues_[index].push_front(task + i); + threadStates_[index]->notify(); } } - inline task_base* static_thread_pool::thread_state::try_pop() { - std::unique_lock lk{mut_, std::try_to_lock}; - if (!lk || queue_.empty()) { - return nullptr; + inline void static_thread_pool::bulk_enqueue( + remote_queue& queue, + __intrusive_queue<&task_base::next> tasks, + std::size_t tasks_size, + const nodemask& constraints) noexcept { + static thread_local std::thread::id this_id = std::this_thread::get_id(); + remote_queue* correct_queue = this_id == queue.id_ ? 
&queue : get_remote_queue(); + std::size_t idx = correct_queue->index_; + if (idx < threadStates_.size()) { + std::size_t this_node = threadStates_[idx]->numa_node(); + if (constraints[this_node]) { + threadStates_[idx]->push_local(std::move(tasks)); + return; + } + } + std::size_t nThreads = available_parallelism(); + for (std::size_t i = 0; i < nThreads; ++i) { + auto [i0, iEnd] = even_share(tasks_size, i, available_parallelism()); + if (i0 == iEnd) { + continue; + } + __intrusive_queue<&task_base::next> tmp{}; + for (std::size_t j = i0; j < iEnd; ++j) { + tmp.push_back(tasks.pop_front()); + } + correct_queue->queues_[i].prepend(std::move(tmp)); + threadStates_[i]->notify(); } - return queue_.pop_front(); } - inline task_base* static_thread_pool::thread_state::pop() { - std::unique_lock lk{mut_}; - while (queue_.empty()) { - if (stopRequested_) { - return nullptr; - } - cv_.wait(lk); + inline void move_pending_to_local( + __intrusive_queue<&task_base::next>& pending_queue, + bwos::lifo_queue>& local_queue) { + auto last = local_queue.push_back(pending_queue.begin(), pending_queue.end()); + __intrusive_queue<&task_base::next> tmp{}; + tmp.splice(tmp.begin(), pending_queue, pending_queue.begin(), last); + tmp.clear(); + } + + inline static_thread_pool::thread_state::pop_result + static_thread_pool::thread_state::try_remote() { + pop_result result{nullptr, index_}; + __intrusive_queue<& task_base::next> remotes = pool_->remotes_.pop_all_reversed(index_); + pending_queue_.append(std::move(remotes)); + if (!pending_queue_.empty()) { + move_pending_to_local(pending_queue_, local_queue_); + result.task = local_queue_.pop_back(); } - return queue_.pop_front(); + return result; } - inline bool static_thread_pool::thread_state::try_push(task_base* task) { - std::unique_lock lk{mut_, std::try_to_lock}; - if (!lk) { - return false; + inline static_thread_pool::thread_state::pop_result static_thread_pool::thread_state::try_pop() { + pop_result result{nullptr, index_}; + result.task = local_queue_.pop_back(); + if (result.task) [[likely]] { + return result; } - const bool wasEmpty = queue_.empty(); - queue_.push_back(task); - if (wasEmpty) { - cv_.notify_one(); + return try_remote(); + } + + inline static_thread_pool::thread_state::pop_result + static_thread_pool::thread_state::try_steal(std::span victims) { + if (victims.empty()) { + return {nullptr, index_}; + } + std::uniform_int_distribution dist(0, victims.size() - 1); + std::uint32_t victimIndex = dist(rng_); + auto& v = victims[victimIndex]; + return {v.try_steal(), v.index()}; + } + + inline static_thread_pool::thread_state::pop_result + static_thread_pool::thread_state::try_steal_near() { + return try_steal(near_victims_); + } + + inline static_thread_pool::thread_state::pop_result + static_thread_pool::thread_state::try_steal_any() { + return try_steal(all_victims_); + } + + inline void static_thread_pool::thread_state::push_local(task_base* task) { + if (!local_queue_.push_back(task)) { + pending_queue_.push_back(task); + } + } + + inline void + static_thread_pool::thread_state::push_local(__intrusive_queue<&task_base::next>&& tasks) { + pending_queue_.prepend(std::move(tasks)); + } + + inline void static_thread_pool::thread_state::set_stealing() { + pool_->numThiefs_.fetch_add(1, std::memory_order_relaxed); + } + + inline void static_thread_pool::thread_state::clear_stealing() { + if (pool_->numThiefs_.fetch_sub(1, std::memory_order_relaxed) == 1) { + notify_one_sleeping(); + } + } + + inline void 
static_thread_pool::thread_state::notify_one_sleeping() { + std::uniform_int_distribution dist(0, pool_->threadCount_ - 1); + std::uint32_t startIndex = dist(rng_); + for (std::uint32_t i = 0; i < pool_->threadCount_; ++i) { + std::uint32_t index = (startIndex + i) % pool_->threadCount_; + if (index == index_) { + continue; + } + if (pool_->threadStates_[index]->notify()) { + return; + } + } + } + + inline static_thread_pool::thread_state::pop_result static_thread_pool::thread_state::pop() { + pop_result result = try_pop(); + while (!result.task) { + set_stealing(); + for (std::size_t i = 0; i < pool_->maxSteals_; ++i) { + result = try_steal_near(); + if (result.task) { + clear_stealing(); + return result; + } + } + for (std::size_t i = 0; i < pool_->maxSteals_; ++i) { + result = try_steal_any(); + if (result.task) { + clear_stealing(); + return result; + } + } + std::this_thread::yield(); + clear_stealing(); + + std::unique_lock lock{mut_}; + if (stopRequested_) { + return result; + } + state expected = state::running; + if (state_.compare_exchange_weak(expected, state::sleeping, std::memory_order_relaxed)) { + result = try_remote(); + if (result.task) { + return result; + } + cv_.wait(lock); + } + lock.unlock(); + state_.store(state::running, std::memory_order_relaxed); + result = try_pop(); } - return true; + return result; } - inline void static_thread_pool::thread_state::push(task_base* task) { - std::lock_guard lk{mut_}; - const bool wasEmpty = queue_.empty(); - queue_.push_back(task); - if (wasEmpty) { + inline bool static_thread_pool::thread_state::notify() { + if (state_.exchange(state::notified, std::memory_order_relaxed) == state::sleeping) { + { + std::lock_guard lock{mut_}; + } cv_.notify_one(); + return true; } + return false; } inline void static_thread_pool::thread_state::request_stop() { - std::lock_guard lk{mut_}; - stopRequested_ = true; + { + std::lock_guard lock{mut_}; + stopRequested_ = true; + } cv_.notify_one(); } template - class static_thread_pool::operation : task_base { + class static_thread_pool::operation : public task_base { using Receiver = stdexec::__t; friend static_thread_pool::scheduler::sender; static_thread_pool& pool_; + remote_queue* queue_; Receiver receiver_; - - explicit operation(static_thread_pool& pool, Receiver&& r) + std::size_t threadIndex_{}; + nodemask constraints_{}; + + explicit operation( + static_thread_pool& pool, + remote_queue* queue, + Receiver&& r, + std::size_t tid, + const nodemask& constraints) : pool_(pool) - , receiver_((Receiver&&) r) { + , queue_(queue) + , receiver_((Receiver&&) r) + , threadIndex_{tid} + , constraints_{constraints} { this->__execute = [](task_base* t, const std::uint32_t /* tid */) noexcept { auto& op = *static_cast(t); auto stoken = stdexec::get_stop_token(stdexec::get_env(op.receiver_)); @@ -389,7 +996,11 @@ namespace exec { } void enqueue_(task_base* op) const { - pool_.enqueue(op); + if (threadIndex_ < pool_.available_parallelism()) { + pool_.enqueue(*queue_, op, threadIndex_); + } else { + pool_.enqueue(*queue_, op, constraints_); + } } friend void tag_invoke(stdexec::start_t, operation& op) noexcept { @@ -403,7 +1014,7 @@ namespace exec { struct static_thread_pool::bulk_sender { using Sender = stdexec::__t; using Fun = stdexec::__t; - using is_sender = void; + using sender_concept = stdexec::sender_t; static_thread_pool& pool_; Sender sndr_; @@ -548,31 +1159,9 @@ namespace exec { std::exception_ptr exception_; std::vector tasks_; - // Splits `n` into `size` chunks distributing `n % size` evenly 
between ranks. - // Returns `[begin, end)` range in `n` for a given `rank`. - // Example: - // ```cpp - // // n_items thread n_threads - // even_share( 11, 0, 3); // -> [0, 4) -> 4 items - // even_share( 11, 1, 3); // -> [4, 8) -> 4 items - // even_share( 11, 2, 3); // -> [8, 11) -> 3 items - // ``` - static std::pair - even_share(Shape n, std::uint32_t rank, std::uint32_t size) noexcept { - const auto avg_per_thread = n / size; - const auto n_big_share = avg_per_thread + 1; - const auto big_shares = n % size; - const auto is_big_share = rank < big_shares; - const auto begin = is_big_share - ? n_big_share * rank - : n_big_share * big_shares + (rank - big_shares) * avg_per_thread; - const auto end = begin + (is_big_share ? n_big_share : avg_per_thread); - - return std::make_pair(begin, end); - } - std::uint32_t num_agents_required() const { - return std::min(shape_, static_cast(pool_.available_parallelism())); + return static_cast( + std::min(shape_, static_cast(pool_.available_parallelism()))); } template @@ -594,7 +1183,7 @@ namespace exec { template struct static_thread_pool::bulk_receiver { - using is_receiver = void; + using receiver_concept = stdexec::receiver_t; using Sender = stdexec::__t; using Receiver = stdexec::__t; @@ -677,4 +1266,256 @@ namespace exec { } }; +#if STDEXEC_HAS_STD_RANGES() + namespace schedule_all_ { + template + auto get_allocator(const Rcvr& rcvr) { + if constexpr (stdexec::__callable>) { + return stdexec::get_allocator(stdexec::get_env(rcvr)); + } else { + return std::allocator{}; + } + } + + template + using allocator_of_t = decltype(get_allocator(stdexec::__declval())); + + template + struct operation_base { + Range range_; + static_thread_pool& pool_; + std::mutex start_mutex_{}; + bool has_started_{false}; + __intrusive_queue<&task_base::next> tasks_{}; + std::size_t tasks_size_{}; + std::atomic countdown_{std::ranges::size(range_)}; + }; + + template + struct item_operation { + class __t : private task_base { + using ItemReceiver = stdexec::__t; + + static void execute_(task_base* base, std::uint32_t /* tid */) noexcept { + auto op = static_cast<__t*>(base); + stdexec::set_value(static_cast(op->item_receiver_), *op->it_); + } + + ItemReceiver item_receiver_; + std::ranges::iterator_t it_; + operation_base* parent_; + + friend void tag_invoke(stdexec::start_t, __t& op) noexcept { + std::unique_lock lock{op.parent_->start_mutex_}; + if (!op.parent_->has_started_) { + op.parent_->tasks_.push_back(static_cast(&op)); + op.parent_->tasks_size_ += 1; + } else { + lock.unlock(); + op.parent_->pool_.enqueue(static_cast(&op)); + } + } + + public: + using __id = item_operation; + + __t( + ItemReceiver&& item_receiver, + std::ranges::iterator_t it, + operation_base* parent) + : task_base{.__execute = execute_} + , item_receiver_(static_cast(item_receiver)) + , it_(it) + , parent_(parent) { + } + }; + }; + + template + struct item_sender { + struct __t { + using __id = item_sender; + using sender_concept = stdexec::sender_t; + using completion_signatures = stdexec::completion_signatures)>; + + operation_base* op_; + std::ranges::iterator_t it_; + + struct env { + static_thread_pool* pool_; + + template < + stdexec::same_as> Query> + friend auto tag_invoke(Query, const env& e) noexcept -> static_thread_pool::scheduler { + return e.pool_->get_scheduler(); + } + }; + + template GetEnv, stdexec::__decays_to<__t> Self> + friend auto tag_invoke(GetEnv, Self&& self) noexcept -> env { + return {self.op_->pool_}; + } + + template Self, stdexec::receiver ItemReceiver> + 
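[Illustration, not part of the patch] `even_share()` is deleted here, but the distribution policy survives in `bulk_enqueue()` above. A standalone copy, specialized to `std::uint32_t`, reproducing the documented example:

```cpp
#include <cstdint>
#include <cstdio>
#include <utility>

// even_share(n, rank, size): split n items into size chunks, spreading the
// n % size remainder over the first ranks.
std::pair<std::uint32_t, std::uint32_t>
  even_share(std::uint32_t n, std::uint32_t rank, std::uint32_t size) noexcept {
  const auto avg_per_thread = n / size;
  const auto n_big_share = avg_per_thread + 1;
  const auto big_shares = n % size;
  const auto is_big_share = rank < big_shares;
  const auto begin = is_big_share
                     ? n_big_share * rank
                     : n_big_share * big_shares + (rank - big_shares) * avg_per_thread;
  const auto end = begin + (is_big_share ? n_big_share : avg_per_thread);
  return {begin, end};
}

int main() {
  for (std::uint32_t rank = 0; rank < 3; ++rank) {
    auto [b, e] = even_share(11, rank, 3);
    std::printf("rank %u -> [%u, %u)\n", rank, b, e);  // [0,4) [4,8) [8,11)
  }
}
```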
requires stdexec::receiver_of + friend auto tag_invoke(stdexec::connect_t, Self&& self, ItemReceiver rcvr) noexcept + -> stdexec::__t>> { + return {static_cast(rcvr), self.it_, self.op_}; + } + }; + }; + + template + struct operation_base_with_receiver : operation_base { + Receiver receiver_; + + operation_base_with_receiver(Range range, static_thread_pool& pool, Receiver&& receiver) + : operation_base{range, pool} + , receiver_(static_cast(receiver)) { + } + }; + + template + struct next_receiver { + struct __t { + using receiver_concept = stdexec::receiver_t; + operation_base_with_receiver* op_; + + template SetValue, stdexec::same_as<__t> Self> + friend void tag_invoke(SetValue, Self&& self) noexcept { + std::size_t countdown = self.op_->countdown_.fetch_sub(1, std::memory_order_relaxed); + if (countdown == 1) { + stdexec::set_value((Receiver&&) self.op_->receiver_); + } + } + + template SetStopped, stdexec::same_as<__t> Self> + friend void tag_invoke(SetStopped, Self&& self) noexcept { + std::size_t countdown = self.op_->countdown_.fetch_sub(1, std::memory_order_relaxed); + if (countdown == 1) { + stdexec::set_value((Receiver&&) self.op_->receiver_); + } + } + + template GetEnv, stdexec::__decays_to<__t> Self> + friend auto tag_invoke(GetEnv, Self&& self) noexcept -> stdexec::env_of_t { + return stdexec::get_env(self.op_->receiver_); + } + }; + }; + + template + struct operation { + class __t : operation_base_with_receiver { + using Allocator = allocator_of_t; + using ItemSender = stdexec::__t>; + using NextSender = next_sender_of_t; + using NextReceiver = stdexec::__t>; + using ItemOperation = stdexec::connect_result_t; + + using ItemAllocator = + std::allocator_traits::template rebind_alloc<__manual_lifetime>; + + std::vector<__manual_lifetime, ItemAllocator> items_; + + template Self> + friend void tag_invoke(stdexec::start_t, Self& op) noexcept { + std::size_t size = op.items_.size(); + std::size_t nthreads = op.pool_.available_parallelism(); + bwos_params params = op.pool_.params(); + std::size_t localSize = params.blockSize * params.numBlocks; + std::size_t chunkSize = std::min(size / nthreads, localSize * nthreads); + auto& remote_queue = *op.pool_.get_remote_queue(); + std::ranges::iterator_t it = std::ranges::begin(op.range_); + std::size_t i0 = 0; + while (i0 + chunkSize < size) { + for (std::size_t i = i0; i < i0 + chunkSize; ++i) { + op.items_[i].__construct_with([&] { + return stdexec::connect( + set_next(op.receiver_, ItemSender{&op, it + i}), NextReceiver{&op}); + }); + stdexec::start(op.items_[i].__get()); + } + std::unique_lock lock{op.start_mutex_}; + op.pool_.bulk_enqueue(remote_queue, std::move(op.tasks_), op.tasks_size_); + lock.unlock(); + i0 += chunkSize; + } + for (std::size_t i = i0; i < size; ++i) { + op.items_[i].__construct_with([&] { + return stdexec::connect( + set_next(op.receiver_, ItemSender{&op, it + i}), NextReceiver{&op}); + }); + stdexec::start(op.items_[i].__get()); + } + std::unique_lock lock{op.start_mutex_}; + op.has_started_ = true; + op.pool_.bulk_enqueue(remote_queue, std::move(op.tasks_), op.tasks_size_); + } + + public: + using __id = operation; + + __t(Range range, static_thread_pool& pool, Receiver&& receiver) + : operation_base_with_receiver< + Range, + Receiver>{std::move(range), pool, static_cast(receiver)} + , items_(std::ranges::size(this->range_), ItemAllocator(get_allocator(this->receiver_))) { + } + + ~__t() { + if (this->has_started_) { + for (auto& item: items_) { + item.__destroy(); + } + } + } + }; + }; + + template + class 
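[Illustration, not part of the patch] `next_receiver` above treats `set_value` and `set_stopped` identically: each item completion decrements `countdown_`, and whichever completion brings it to zero finishes the whole operation. The core of that idiom in isolation (the pool can use relaxed ordering because its enqueue/complete machinery already synchronizes; this standalone sketch uses acq_rel):

```cpp
#include <atomic>
#include <cstddef>
#include <cstdio>
#include <thread>
#include <vector>

struct countdown {
  std::atomic<std::size_t> remaining_;

  explicit countdown(std::size_t n) : remaining_(n) {}

  void item_done() {
    // fetch_sub returns the previous value: whoever observes 1 performed the
    // final decrement and owns the overall completion.
    if (remaining_.fetch_sub(1, std::memory_order_acq_rel) == 1) {
      std::puts("all items done");
    }
  }
};

int main() {
  countdown c{4};
  std::vector<std::thread> workers;
  for (int i = 0; i < 4; ++i) {
    workers.emplace_back([&] { c.item_done(); });
  }
  for (auto& w : workers) {
    w.join();
  }
}
```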
sequence::__t { + using item_sender_t = stdexec::__t>; + + Range range_; + static_thread_pool* pool_; + + public: + using __id = sequence; + + using sender_concept = sequence_sender_t; + + using completion_signatures = stdexec::completion_signatures< + stdexec::set_value_t(), + stdexec::set_error_t(std::exception_ptr), + stdexec::set_stopped_t()>; + + using item_types = exec::item_types>>; + + __t(Range range, static_thread_pool& pool) + : range_(static_cast(range)) + , pool_(&pool) { + } + + private: + template Self, exec::sequence_receiver_of Receiver> + friend auto tag_invoke(exec::subscribe_t, Self&& self, Receiver rcvr) noexcept + -> stdexec::__t> { + return {static_cast(self.range_), *self.pool_, static_cast(rcvr)}; + } + }; + } + + struct schedule_all_t { + template + stdexec::__t>> + operator()(static_thread_pool& pool, Range&& range) const { + return {static_cast(range), pool}; + } + }; + + inline constexpr schedule_all_t schedule_all{}; +#endif + } // namespace exec diff --git a/include/exec/task.hpp b/include/exec/task.hpp index e28ed296e..2ad8ef637 100644 --- a/include/exec/task.hpp +++ b/include/exec/task.hpp @@ -96,7 +96,8 @@ namespace exec { static constexpr bool __with_scheduler = _SchedulerAffinity == __scheduler_affinity::__sticky; - STDEXEC_NO_UNIQUE_ADDRESS __if_c<__with_scheduler, __any_scheduler, __ignore> // + STDEXEC_ATTRIBUTE((no_unique_address)) + __if_c<__with_scheduler, __any_scheduler, __ignore> // __scheduler_{exec::inline_scheduler{}}; in_place_stop_token __stop_token_; @@ -334,8 +335,8 @@ namespace exec { private: struct __final_awaitable { - static std::false_type await_ready() noexcept { - return {}; + static constexpr bool await_ready() noexcept { + return false; } static __coro::coroutine_handle<> @@ -422,8 +423,8 @@ namespace exec { __coro_.destroy(); } - static std::false_type await_ready() noexcept { - return {}; + static constexpr bool await_ready() noexcept { + return false; } template diff --git a/include/exec/trampoline_scheduler.hpp b/include/exec/trampoline_scheduler.hpp index 3381420c6..87070a02e 100644 --- a/include/exec/trampoline_scheduler.hpp +++ b/include/exec/trampoline_scheduler.hpp @@ -96,7 +96,7 @@ namespace exec { struct __t : __operation_base { using __id = __operation; - STDEXEC_NO_UNIQUE_ADDRESS _Receiver __receiver_; + STDEXEC_ATTRIBUTE((no_unique_address)) _Receiver __receiver_; explicit __t(_Receiver __rcvr, std::size_t __max_depth) noexcept( __nothrow_decay_copyable<_Receiver>) @@ -126,7 +126,7 @@ namespace exec { using __operation_t = stdexec::__t<__operation<__id<__decay_t<_Receiver>>>>; struct __schedule_sender { - using is_sender = void; + using sender_concept = stdexec::sender_t; using completion_signatures = stdexec::completion_signatures; diff --git a/include/exec/variant_sender.hpp b/include/exec/variant_sender.hpp index d5c1289be..145387275 100644 --- a/include/exec/variant_sender.hpp +++ b/include/exec/variant_sender.hpp @@ -25,10 +25,10 @@ namespace exec { namespace __variant { using namespace stdexec; - template + template struct __operation_state { class __t { - std::variant, stdexec::__t<_ReceiverId>>...> + std::variant, stdexec::__t<_ReceiverId>>...> __variant_; friend void tag_invoke(start_t, __t& __self) noexcept { @@ -50,7 +50,7 @@ namespace exec { struct __sender { template using __completion_signatures_t = __concat_completion_signatures_t< - completion_signatures_of_t<__copy_cvref_t<_Self, stdexec::__t<_SenderIds>>, _Env>...>; + __completion_signatures_of_t<__copy_cvref_t<_Self, stdexec::__t<_SenderIds>>, 
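[Illustration, not part of the patch] The task.hpp hunk above replaces `static std::false_type await_ready()` with a plain `static constexpr bool`. Both satisfy the awaitable requirements; the `bool` spelling is the conventional one and avoids instantiating `std::false_type`. A minimal awaitable and coroutine using the new spelling (`fire_and_forget` is a hypothetical task type for illustration):

```cpp
#include <coroutine>

struct always_suspend {
  static constexpr bool await_ready() noexcept { return false; }

  // Symmetric transfer: returning a handle resumes it; returning the awaiting
  // coroutine's own handle resumes it immediately after suspension.
  static std::coroutine_handle<> await_suspend(std::coroutine_handle<> h) noexcept {
    return h;
  }

  static void await_resume() noexcept {}
};

struct fire_and_forget {  // hypothetical minimal task type
  struct promise_type {
    fire_and_forget get_return_object() { return {}; }
    std::suspend_never initial_suspend() noexcept { return {}; }
    std::suspend_never final_suspend() noexcept { return {}; }
    void return_void() {}
    void unhandled_exception() {}
  };
};

fire_and_forget demo() {
  co_await always_suspend{};  // suspends, then immediately resumes itself
}

int main() { demo(); }
```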
_Env>...>; template struct __visitor { @@ -78,11 +78,12 @@ namespace exec { return *this; } - template <__decays_to<__t> _Self, class _Receiver> + template <__decays_to<__t> _Self, receiver _Receiver> requires(sender_to<__copy_cvref_t<_Self, stdexec::__t<_SenderIds>>, _Receiver> && ...) - friend stdexec::__t< - __operation_state, __copy_cvref_t<_Self, _SenderIds>...>> - tag_invoke(connect_t, _Self&& __self, _Receiver&& __r) noexcept( + friend stdexec::__t< __operation_state< + stdexec::__id<_Receiver>, + __cvref_id<_Self, stdexec::__t<_SenderIds>>...>> + tag_invoke(connect_t, _Self&& __self, _Receiver __r) noexcept( (__nothrow_connectable<__copy_cvref_t<_Self, stdexec::__t<_SenderIds>>, _Receiver> && ...)) { return std::visit( @@ -96,7 +97,7 @@ namespace exec { } public: - using is_sender = void; + using sender_concept = stdexec::sender_t; using __id = __sender; __t() = default; diff --git a/include/exec/when_any.hpp b/include/exec/when_any.hpp index e5af82756..47e953b88 100644 --- a/include/exec/when_any.hpp +++ b/include/exec/when_any.hpp @@ -122,7 +122,7 @@ namespace exec { try { __result_.emplace(std::tuple{_CPO{}, (_Args&&) __args...}); } catch (...) { - __result_.emplace(set_error_t{}, std::current_exception()); + __result_.emplace(std::tuple{set_error_t{}, std::current_exception()}); } } // stop pending operations @@ -155,7 +155,7 @@ namespace exec { struct __receiver { class __t { public: - using is_receiver = void; + using receiver_concept = stdexec::receiver_t; using __id = __receiver; explicit __t(__op_base<_Receiver, _ResultVariant>* __op) noexcept @@ -172,9 +172,8 @@ namespace exec { } friend __env_t> tag_invoke(get_env_t, const __t& __self) noexcept { - using __with_token = __with; - auto __token = __with_(get_stop_token, __self.__op_->__stop_source_.get_token()); - return __make_env(get_env(__self.__op_->__receiver_), (__with_token&&) __token); + auto __token = __mkprop(__self.__op_->__stop_source_.get_token(), get_stop_token); + return __make_env(get_env(__self.__op_->__receiver_), std::move(__token)); } }; }; @@ -242,7 +241,7 @@ namespace exec { class __t { public: using __id = __sender; - using is_sender = void; + using sender_concept = stdexec::sender_t; template explicit(sizeof...(_Senders) == 1) diff --git a/include/nvexec/detail/queue.cuh b/include/nvexec/detail/queue.cuh index d42523ffa..635285af6 100644 --- a/include/nvexec/detail/queue.cuh +++ b/include/nvexec/detail/queue.cuh @@ -41,9 +41,7 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS { namespace queue { struct producer_t { task_base_t** tail_; - STDEXEC_DETAIL_CUDACC_HOST_DEVICE // - void - operator()(task_base_t* task) { + STDEXEC_ATTRIBUTE((host, device)) void operator()(task_base_t* task) { atom_task_ref tail_ref(*tail_); task_base_t* old_tail = tail_ref.load(::cuda::memory_order_acquire); diff --git a/include/nvexec/detail/variant.cuh b/include/nvexec/detail/variant.cuh index b072873df..38ba236d7 100644 --- a/include/nvexec/detail/variant.cuh +++ b/include/nvexec/detail/variant.cuh @@ -106,7 +106,8 @@ namespace nvexec { }; template - STDEXEC_DETAIL_CUDACC_HOST_DEVICE void visit_impl( + STDEXEC_ATTRIBUTE((host, device)) + void visit_impl( std::integral_constant, VisitorT&& visitor, V&& v, @@ -117,7 +118,8 @@ namespace nvexec { } template - STDEXEC_DETAIL_CUDACC_HOST_DEVICE void visit_impl( + STDEXEC_ATTRIBUTE((host, device)) + void visit_impl( std::integral_constant, VisitorT&& visitor, V&& v, @@ -133,7 +135,8 @@ namespace nvexec { } template - STDEXEC_DETAIL_CUDACC_HOST_DEVICE void visit(VisitorT&& visitor, 
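[Illustration, not part of the patch] The when_any fix above wraps the error in `std::tuple` because `__result_` is an optional over a variant of tuples: `optional::emplace` forwards its arguments to the *variant* constructor, which accepts only a single alternative value. Reduced reproduction:

```cpp
#include <exception>
#include <optional>
#include <tuple>
#include <variant>

struct set_error_t {};

using result_variant = std::variant<std::tuple<set_error_t, std::exception_ptr>>;

int main() {
  std::optional<result_variant> result;

  // result.emplace(set_error_t{}, std::current_exception());
  // ^ ill-formed: std::variant has no (set_error_t, exception_ptr) constructor.

  // Wrapping the pieces in a tuple selects the alternative unambiguously:
  result.emplace(std::tuple{set_error_t{}, std::current_exception()});
}
```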
V&& v) { + STDEXEC_ATTRIBUTE((host, device)) + void visit(VisitorT&& visitor, V&& v) { detail::visit_impl( std::integral_constant::size - 1>{}, (VisitorT&&) visitor, @@ -142,7 +145,8 @@ namespace nvexec { } template - STDEXEC_DETAIL_CUDACC_HOST_DEVICE void visit(VisitorT&& visitor, V&& v, std::size_t index) { + STDEXEC_ATTRIBUTE((host, device)) + void visit(VisitorT&& visitor, V&& v, std::size_t index) { detail::visit_impl( std::integral_constant::size - 1>{}, (VisitorT&&) visitor, @@ -165,43 +169,48 @@ namespace nvexec { using index_of = std::integral_constant< index_t, detail::find_index()>; template T> - STDEXEC_DETAIL_CUDACC_HOST_DEVICE T& get() noexcept { + STDEXEC_ATTRIBUTE((host, device)) + T& get() noexcept { void* data = storage_.data_; return *static_cast(data); } template - STDEXEC_DETAIL_CUDACC_HOST_DEVICE detail::nth_type& get() noexcept { + STDEXEC_ATTRIBUTE((host, device)) + detail::nth_type& get() noexcept { return get>(); } - STDEXEC_DETAIL_CUDACC_HOST_DEVICE variant_t() + STDEXEC_ATTRIBUTE((host, device)) + variant_t() requires std::default_initializable { emplace(); } - STDEXEC_DETAIL_CUDACC_HOST_DEVICE ~variant_t() { + STDEXEC_ATTRIBUTE((host, device)) ~variant_t() { destroy(); } - STDEXEC_DETAIL_CUDACC_HOST_DEVICE bool holds_alternative() const { + STDEXEC_ATTRIBUTE((host, device)) bool holds_alternative() const { return index_ != detail::npos(); } template T, class... As> - STDEXEC_DETAIL_CUDACC_HOST_DEVICE void emplace(As&&... as) { + STDEXEC_ATTRIBUTE((host, device)) + void emplace(As&&... as) { destroy(); construct((As&&) as...); } template T, class... As> - STDEXEC_DETAIL_CUDACC_HOST_DEVICE void construct(As&&... as) { + STDEXEC_ATTRIBUTE((host, device)) + void construct(As&&... as) { ::new (storage_.data_) T((As&&) as...); index_ = index_of(); } - STDEXEC_DETAIL_CUDACC_HOST_DEVICE void destroy() { + STDEXEC_ATTRIBUTE((host, device)) void destroy() { if (holds_alternative()) { visit( [](auto& val) noexcept { diff --git a/include/nvexec/stream/algorithm_base.cuh b/include/nvexec/stream/algorithm_base.cuh index 5ea172e2e..d08ca312e 100644 --- a/include/nvexec/stream/algorithm_base.cuh +++ b/include/nvexec/stream/algorithm_base.cuh @@ -61,8 +61,8 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS::__algo_range_init_fun { }; operation_state_base_t& op_state_; - STDEXEC_NO_UNIQUE_ADDRESS InitT init_; - STDEXEC_NO_UNIQUE_ADDRESS Fun fun_; + STDEXEC_ATTRIBUTE((no_unique_address)) InitT init_; + STDEXEC_ATTRIBUTE((no_unique_address)) Fun fun_; public: using __id = receiver_t; @@ -104,8 +104,8 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS::__algo_range_init_fun { using _set_value_t = typename DerivedSender::template _set_value_t; Sender sndr_; - STDEXEC_NO_UNIQUE_ADDRESS InitT init_; - STDEXEC_NO_UNIQUE_ADDRESS Fun fun_; + STDEXEC_ATTRIBUTE((no_unique_address)) InitT init_; + STDEXEC_ATTRIBUTE((no_unique_address)) Fun fun_; template using completion_signatures = // diff --git a/include/nvexec/stream/bulk.cuh b/include/nvexec/stream/bulk.cuh index 5ba807e05..3ba6f3719 100644 --- a/include/nvexec/stream/bulk.cuh +++ b/include/nvexec/stream/bulk.cuh @@ -320,7 +320,7 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS { template struct multi_gpu_bulk_sender_t { - using is_sender = void; + using sender_concept = stdexec::sender_t; using Sender = stdexec::__t; struct __t : stream_sender_base { @@ -371,3 +371,15 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS { }; }; } + +namespace stdexec::__detail { + template + inline constexpr __mconst< + 
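[Illustration, not part of the patch] `visit()`/`visit_impl()` above dispatch on a runtime index by recursing on a compile-time `integral_constant` until the two meet. The same technique host-only, over a `std::tuple` instead of nvexec's `variant_t`:

```cpp
#include <cstddef>
#include <iostream>
#include <tuple>
#include <type_traits>

// Recurse from index N-1 down to 0; invoke the visitor when the runtime
// index matches the compile-time one.
template <std::size_t I, class Visitor, class Tuple>
void visit_impl(std::integral_constant<std::size_t, I>, Visitor&& vis,
                Tuple&& tup, std::size_t index) {
  if (index == I) {
    vis(std::get<I>(tup));
  } else if constexpr (I > 0) {
    visit_impl(std::integral_constant<std::size_t, I - 1>{},
               static_cast<Visitor&&>(vis), static_cast<Tuple&&>(tup), index);
  }
}

template <class Visitor, class Tuple>
void visit(Visitor&& vis, Tuple&& tup, std::size_t index) {
  constexpr std::size_t size = std::tuple_size_v<std::remove_reference_t<Tuple>>;
  visit_impl(std::integral_constant<std::size_t, size - 1>{},
             static_cast<Visitor&&>(vis), static_cast<Tuple&&>(tup), index);
}

int main() {
  std::tuple<int, double, const char*> alternatives{42, 3.14, "hi"};
  for (std::size_t i = 0; i < 3; ++i) {
    visit([](const auto& v) { std::cout << v << '\n'; }, alternatives, i);
  }
}
```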
nvexec::STDEXEC_STREAM_DETAIL_NS::bulk_sender_t<__name_of<__t>, Shape, Fun>> + __name_of_v>{}; + + template + inline constexpr __mconst< + nvexec::STDEXEC_STREAM_DETAIL_NS::multi_gpu_bulk_sender_t<__name_of<__t>, Shape, Fun>> + __name_of_v>{}; +} diff --git a/include/nvexec/stream/common.cuh b/include/nvexec/stream/common.cuh index 5e5ab5c8f..4bec0037c 100644 --- a/include/nvexec/stream/common.cuh +++ b/include/nvexec/stream/common.cuh @@ -59,7 +59,7 @@ namespace nvexec { } #endif - inline STDEXEC_DETAIL_CUDACC_HOST_DEVICE bool is_on_gpu() noexcept { + inline STDEXEC_ATTRIBUTE((host, device)) bool is_on_gpu() noexcept { return get_device_type() == device_type::device; } } @@ -197,10 +197,11 @@ namespace nvexec { struct stream_scheduler; struct stream_sender_base { - using is_sender = void; + using sender_concept = stdexec::sender_t; }; - struct stream_receiver_base : __receiver_base { + struct stream_receiver_base { + using receiver_concept = stdexec::receiver_t; constexpr static std::size_t memory_allocation_size = 0; }; @@ -293,9 +294,8 @@ namespace nvexec { struct set_noop { template - STDEXEC_DETAIL_CUDACC_HOST_DEVICE // - void - operator()(Ts&&...) const noexcept { + STDEXEC_ATTRIBUTE((host, device)) + void operator()(Ts&&...) const noexcept { // TODO TRAP std::printf("ERROR: use of empty variant."); } @@ -320,14 +320,14 @@ namespace nvexec { return get_stream_provider(env)->own_stream_.value(); } - STDEXEC_DETAIL_CUDACC_HOST_DEVICE auto operator()() const noexcept { + STDEXEC_ATTRIBUTE((host, device)) auto operator()() const noexcept { return stdexec::read(*this); } }; template auto make_stream_env(BaseEnv&& base_env, stream_provider_t* stream_provider) noexcept { - return __join_env(__mkprop(get_stream_provider, stream_provider), (BaseEnv&&) base_env); + return __join_env(__mkprop(stream_provider, get_stream_provider), (BaseEnv&&) base_env); } template @@ -365,7 +365,7 @@ namespace nvexec { sender_in && // STDEXEC_IS_BASE_OF( stream_sender_base, - __decay_t, S, E>>); + __decay_t, S, E>>); template concept stream_receiver = // @@ -385,21 +385,19 @@ namespace nvexec { queue::producer_t producer_; public: - using is_receiver = void; + using receiver_concept = stdexec::receiver_t; using __id = stream_enqueue_receiver; template <__one_of Tag, class... As> - STDEXEC_DETAIL_CUDACC_HOST_DEVICE // - friend void - tag_invoke(Tag, __t&& self, As&&... as) noexcept { + STDEXEC_ATTRIBUTE((host, device)) + friend void tag_invoke(Tag, __t&& self, As&&... 
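[Illustration, not part of the patch] The `__name_of_v` specializations added in this and the following nvexec hunks only affect diagnostics: they remap implementation types to friendlier spellings. The mechanism is ordinary variable-template specialization; a toy string-valued version (stdexec's real one maps types to metafunctions, not strings):

```cpp
#include <iostream>

template <class T>
inline constexpr const char* name_of_v = "unknown type";

struct stream_then_sender {};  // hypothetical implementation type

template <>
inline constexpr const char* name_of_v<stream_then_sender> =
  "then(...) on the stream scheduler";

int main() {
  std::cout << name_of_v<int> << '\n';                 // unknown type
  std::cout << name_of_v<stream_then_sender> << '\n';  // friendly spelling
}
```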
as) noexcept { self.variant_->template emplace>(Tag(), std::move(as)...); self.producer_(self.task_); } template _Tag, class Error> - STDEXEC_DETAIL_CUDACC_HOST_DEVICE // - friend void - tag_invoke(_Tag, __t&& self, Error&& e) noexcept { + STDEXEC_ATTRIBUTE((host, device)) + friend void tag_invoke(_Tag, __t&& self, Error&& e) noexcept { if constexpr (__decays_to) { // What is `exception_ptr` but death pending self.variant_->template emplace>( diff --git a/include/nvexec/stream/ensure_started.cuh b/include/nvexec/stream/ensure_started.cuh index 9f9618907..ff6a532c8 100644 --- a/include/nvexec/stream/ensure_started.cuh +++ b/include/nvexec/stream/ensure_started.cuh @@ -46,7 +46,7 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS { template struct receiver_t { - class __t : stream_receiver_base { + class __t : public stream_receiver_base { using Sender = stdexec::__t; __intrusive_ptr shared_state_; @@ -300,7 +300,7 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS { template struct ensure_started_sender_t { - using is_sender = void; + using sender_concept = stdexec::sender_t; using Sender = stdexec::__t; struct __t : stream_sender_base { @@ -359,3 +359,10 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS { }; }; } + +namespace stdexec::__detail { + template + inline constexpr __mconst< + nvexec::STDEXEC_STREAM_DETAIL_NS::ensure_started_sender_t<__name_of<__t>>> + __name_of_v>{}; +} diff --git a/include/nvexec/stream/launch.cuh b/include/nvexec/stream/launch.cuh index e7bdbefd5..87535c3a5 100644 --- a/include/nvexec/stream/launch.cuh +++ b/include/nvexec/stream/launch.cuh @@ -179,3 +179,10 @@ namespace nvexec { inline constexpr STDEXEC_STREAM_DETAIL_NS::launch_t launch{}; } // namespace nvexec + +namespace stdexec::__detail { + template + inline constexpr __mconst< + nvexec::STDEXEC_STREAM_DETAIL_NS::launch_sender_t<__name_of<__t>, Fun>> + __name_of_v>{}; +} diff --git a/include/nvexec/stream/let_xxx.cuh b/include/nvexec/stream/let_xxx.cuh index 01eb7afa7..2582576e5 100644 --- a/include/nvexec/stream/let_xxx.cuh +++ b/include/nvexec/stream/let_xxx.cuh @@ -251,3 +251,10 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS { }; }; } + +namespace stdexec::__detail { + template + inline constexpr __mconst< + nvexec::STDEXEC_STREAM_DETAIL_NS::let_sender_t<__name_of<__t>, Fun, Set>> + __name_of_v>{}; +} diff --git a/include/nvexec/stream/reduce.cuh b/include/nvexec/stream/reduce.cuh index 41f08b8d0..49fb51df4 100644 --- a/include/nvexec/stream/reduce.cuh +++ b/include/nvexec/stream/reduce.cuh @@ -150,3 +150,10 @@ namespace nvexec { inline constexpr STDEXEC_STREAM_DETAIL_NS::reduce_t reduce{}; } + +namespace stdexec::__detail { + template + extern __mconst< + nvexec::STDEXEC_STREAM_DETAIL_NS::reduce_::sender_t<__name_of<__t>, Init, Fun>> + __name_of_v>; +} diff --git a/include/nvexec/stream/split.cuh b/include/nvexec/stream/split.cuh index 99c53b37c..f051518e3 100644 --- a/include/nvexec/stream/split.cuh +++ b/include/nvexec/stream/split.cuh @@ -293,7 +293,7 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS { template struct split_sender_t { - using is_sender = void; + using sender_concept = stdexec::sender_t; using Sender = stdexec::__t; using sh_state_ = _split::sh_state_t; @@ -342,3 +342,9 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS { }; }; } + +namespace stdexec::__detail { + template + extern __mconst< nvexec::STDEXEC_STREAM_DETAIL_NS::split_sender_t<__name_of<__t>>> + __name_of_v>; +} diff --git a/include/nvexec/stream/sync_wait.cuh b/include/nvexec/stream/sync_wait.cuh index 726d37f83..e76bc796c 100644 --- 
a/include/nvexec/stream/sync_wait.cuh +++ b/include/nvexec/stream/sync_wait.cuh @@ -146,7 +146,7 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS { namespace _sync_wait { using receiver_t = stdexec::__t>>; template <__single_value_variant_sender<__env> Sender> - requires sender && __receiver_from, Sender> + requires sender_in && __receiver_from, Sender> auto operator()(context_state_t context_state, Sender&& __sndr) const -> std::optional> { using state_t = state_t>; diff --git a/include/nvexec/stream/then.cuh b/include/nvexec/stream/then.cuh index 2d80372c9..a6c679a2f 100644 --- a/include/nvexec/stream/then.cuh +++ b/include/nvexec/stream/then.cuh @@ -39,7 +39,7 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS { struct receiver_t { using Receiver = stdexec::__t; - class __t : stream_receiver_base { + class __t : public stream_receiver_base { Fun f_; operation_state_base_t& op_state_; @@ -195,3 +195,10 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS { }; }; } + +namespace stdexec::__detail { + template + inline constexpr __mconst< + nvexec::STDEXEC_STREAM_DETAIL_NS::then_sender_t<__name_of<__t>, Fun>> + __name_of_v>{}; +} diff --git a/include/nvexec/stream/transfer.cuh b/include/nvexec/stream/transfer.cuh index 5a20f47fe..069310fd4 100644 --- a/include/nvexec/stream/transfer.cuh +++ b/include/nvexec/stream/transfer.cuh @@ -120,10 +120,10 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS { Sender sndr_; template - using _set_value_t = completion_signatures&&...)>; + using _set_value_t = completion_signatures&&...)>; template - using _set_error_t = completion_signatures&&)>; + using _set_error_t = completion_signatures&&)>; template using _completion_signatures_t = // diff --git a/include/nvexec/stream/upon_error.cuh b/include/nvexec/stream/upon_error.cuh index 39ed2b2e2..d0089a5cd 100644 --- a/include/nvexec/stream/upon_error.cuh +++ b/include/nvexec/stream/upon_error.cuh @@ -178,3 +178,10 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS { }; }; } + +namespace stdexec::__detail { + template + inline constexpr __mconst< + nvexec::STDEXEC_STREAM_DETAIL_NS::upon_error_sender_t<__name_of<__t>, Fun>> + __name_of_v>{}; +} diff --git a/include/nvexec/stream/upon_stopped.cuh b/include/nvexec/stream/upon_stopped.cuh index a80663cdb..005c093f9 100644 --- a/include/nvexec/stream/upon_stopped.cuh +++ b/include/nvexec/stream/upon_stopped.cuh @@ -153,3 +153,10 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS { }; }; } + +namespace stdexec::__detail { + template + inline constexpr __mconst< + nvexec::STDEXEC_STREAM_DETAIL_NS::upon_stopped_sender_t<__name_of<__t>, Fun>> + __name_of_v>{}; +} diff --git a/include/nvexec/stream/when_all.cuh b/include/nvexec/stream/when_all.cuh index 26c3396f1..f7f2bbb72 100644 --- a/include/nvexec/stream/when_all.cuh +++ b/include/nvexec/stream/when_all.cuh @@ -47,9 +47,15 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS { template using swallow_values = completion_signatures<>; + template + using too_many_completions = __mbool<(1 < __v<__count_of>)>; + template struct completions { - using __t = dependent_completion_signatures; + using InvalidArg = // + __minvoke< __mfind_if<__mbind_back_q, __q<__mfront>>, Senders...>; + + using __t = stdexec::__when_all::__too_many_value_completions_error; }; template @@ -59,11 +65,10 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS { } template - requires((__v<__count_of> <= 1) && ...) + requires(!__v> && ...) 
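[Illustration, not part of the patch] The transfer.cuh change decays every value and error type before advertising it downstream: results are copied into stable storage when they cross streams, so references and cv-qualifiers from the predecessor must not leak into the completion signatures. What decaying does to typical argument types:

```cpp
#include <type_traits>

// Decay strips references and cv-qualifiers and applies array-to-pointer
// decay — exactly what a by-value copy of a completion argument ends up as.
static_assert(std::is_same_v<std::decay_t<const int&>, int>);
static_assert(std::is_same_v<std::decay_t<int&&>, int>);
static_assert(std::is_same_v<std::decay_t<const char (&)[6]>, const char*>);

int main() {}
```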
struct completions { using non_values = // __concat_completion_signatures_t< - completion_signatures< set_error_t(cudaError_t), set_stopped_t()>, __try_make_completion_signatures< Senders, @@ -200,7 +205,7 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS { auto env = make_terminal_stream_env( exec::make_env( stdexec::get_env(base()), - __with_(get_stop_token, op_state_->stop_source_.get_token())), + __mkprop(op_state_->stop_source_.get_token(), get_stop_token)), &const_cast(op_state_->stream_providers_[Index])); return env; @@ -415,3 +420,12 @@ namespace nvexec::STDEXEC_STREAM_DETAIL_NS { }; }; } + +namespace stdexec::__detail { + template + inline constexpr __mconst< + nvexec::STDEXEC_STREAM_DETAIL_NS:: + when_all_sender_t>...>> + __name_of_v>{}; +} diff --git a/include/nvexec/stream_context.cuh b/include/nvexec/stream_context.cuh index ac2effb48..56c6d871f 100644 --- a/include/nvexec/stream_context.cuh +++ b/include/nvexec/stream_context.cuh @@ -139,8 +139,8 @@ namespace nvexec { return self.env_; }; - STDEXEC_DETAIL_CUDACC_HOST_DEVICE // - inline __t(context_state_t context_state) noexcept + STDEXEC_ATTRIBUTE((host, device)) + inline __t(context_state_t context_state) noexcept : env_{context_state} { } @@ -236,15 +236,13 @@ namespace nvexec { return split_sender_th(sch.context_state_, (S&&) sndr); } - STDEXEC_DETAIL_CUDACC_HOST_DEVICE // - friend inline sender_t - tag_invoke(schedule_t, const stream_scheduler& self) noexcept { + STDEXEC_ATTRIBUTE((host, device)) + friend inline sender_t tag_invoke(schedule_t, const stream_scheduler& self) noexcept { return {self.context_state_}; } - friend std::true_type tag_invoke( // - __has_algorithm_customizations_t, // - const stream_scheduler& self) noexcept { + friend std::true_type + tag_invoke(__has_algorithm_customizations_t, const stream_scheduler& self) noexcept { return {}; } diff --git a/include/stdexec/__detail/__basic_sender.hpp b/include/stdexec/__detail/__basic_sender.hpp index 3cfe5409e..331464b3f 100644 --- a/include/stdexec/__detail/__basic_sender.hpp +++ b/include/stdexec/__detail/__basic_sender.hpp @@ -17,11 +17,15 @@ #include "__execution_fwd.hpp" +#include "__env.hpp" #include "__meta.hpp" +#include "__tuple.hpp" #include "__type_traits.hpp" #include "../concepts.hpp" +#include // for tuple_size/tuple_element + namespace stdexec { ///////////////////////////////////////////////////////////////////////////// // Generic __sender type @@ -31,6 +35,7 @@ namespace stdexec { struct __get_tag { template + STDEXEC_ATTRIBUTE((always_inline)) _Tag operator()(_Tag, _Rest&&...) const noexcept { return {}; } @@ -38,6 +43,7 @@ namespace stdexec { struct __get_data { template + STDEXEC_ATTRIBUTE((always_inline)) _Data&& operator()(__ignore, _Data&& __data, _Rest&&...) const noexcept { return (_Data&&) __data; } @@ -45,14 +51,372 @@ namespace stdexec { template struct __get_children { - template - auto operator()(__ignore, __ignore, _Children&&...) const noexcept - -> __mtype<__minvoke<_Continuation, _Children...>> (*)() { + template + STDEXEC_ATTRIBUTE((always_inline)) + auto operator()(__ignore, __ignore, _Child&&...) const noexcept + -> __mtype<__minvoke<_Continuation, _Child...>> (*)() { return nullptr; } }; + + template + struct __desc { + using __tag = _Tag; + using __data = _Data; + using __children = __types<_Child...>; + }; + + template + struct __sexpr_uncurry_fn { + template + requires __minvocable<_Fn, _Tag, _Data, _Child...> + constexpr auto operator()(_Tag, _Data&&, _Child&&...) 
const noexcept + -> __minvoke<_Fn, _Tag, _Data, _Child...>; + }; + + template + using __sexpr_uncurry = + __call_result_t<__impl_of<_Sender>, __copy_cvref_fn<_Sender>, __sexpr_uncurry_fn<_Fn>>; + + template + using __desc_of = __sexpr_uncurry<_Sender, __q<__desc>>; + + using __get_desc = __sexpr_uncurry_fn<__q<__desc>>; + + template + extern __q<__midentity> __name_of_v; + + template + using __name_of_fn = decltype(__name_of_v<_Sender>); + + template + using __name_of = __minvoke<__name_of_fn<_Sender>, _Sender>; + } // namespace __detail + + template + using tag_of_t = typename __detail::__desc_of<_Sender>::__tag; + + template + using __data_of = typename __detail::__desc_of<_Sender>::__data; + + template > + using __children_of = // + __mapply< _Continuation, typename __detail::__desc_of<_Sender>::__children>; + + template + using __nth_child_of = __children_of<_Sender, __mbind_front_q<__m_at, _Ny>>; + + template + using __nth_child_of_c = __children_of<_Sender, __mbind_front_q<__m_at, __msize_t<_Ny>>>; + + template + using __child_of = __children_of<_Sender, __q<__mfront>>; + + template + inline constexpr std::size_t __nbr_children_of = __v<__children_of<_Sender, __msize>>; + + template + requires __mvalid && __mvalid<__detail::__sexpr_uncurry, _Tp, _Fn> + struct __uncurry_<_Fn, _Tp> { + using __t = __detail::__sexpr_uncurry<_Tp, _Fn>; + }; + + template + struct __sexpr_impl; + + template + using __name_of = __detail::__name_of<_Sender>; + + namespace __detail { + template + struct __op_state; + + template + struct __connect_fn; + + template + using __state_type_t = __decay_t<__result_of< + __sexpr_impl<_Tag>::get_state, _Sexpr, _Receiver&>>; + + template + using __env_type_t = __result_of< + __sexpr_impl<_Tag>::get_env, _Index, __state_type_t<_Tag, _Sexpr, _Receiver>&, _Receiver&>; + + template + concept __connectable = + __callable<__impl_of<_Sexpr>, __copy_cvref_fn<_Sexpr>, __connect_fn<_Sexpr, _Receiver>> + && __mvalid<__state_type_t, tag_of_t<_Sexpr>, _Sexpr, _Receiver>; + + // Note: This is UB. UBSAN allows it for now. + template + _Parent* __parent_from_child(_Child* __child, _Child _Parent::*__mbr_ptr) noexcept { + alignas(_Parent) char __buf[sizeof(_Parent)]; + _Parent* __parent = (_Parent*) &__buf; + const std::ptrdiff_t __offset = (char*) &(__parent->*__mbr_ptr) - __buf; + return (_Parent*) ((char*) __child - __offset); + } + + inline constexpr auto __get_attrs = // + [](__ignore, const auto&... __child) noexcept -> decltype(auto) { + if constexpr (sizeof...(__child) == 1) { + return stdexec::get_env(__child...); // BUGBUG: should be only the forwarding queries + } else { + return empty_env(); + } + STDEXEC_UNREACHABLE(); + }; + + inline constexpr auto __get_env = // + [](__ignore, __ignore, const _Receiver& __rcvr) noexcept + -> env_of_t { + return stdexec::get_env(__rcvr); + }; + + inline constexpr auto __get_state = // + [](_Sender&& __sndr, __ignore) noexcept -> decltype(auto) { + return STDEXEC_CALL_EXPLICIT_THIS_MEMFN((_Sender&&) __sndr, apply)(__get_data()); + }; + + inline constexpr auto __connect = // + [](_Sender&& __sndr, _Receiver __rcvr) + -> __op_state<_Sender, _Receiver> + requires __connectable<_Sender, _Receiver> { + return __op_state<_Sender, _Receiver>{(_Sender&&) __sndr, (_Receiver&&) __rcvr}; + }; + + inline constexpr auto __start = // + [](__ignore, __ignore, _ChildOps&... __ops) noexcept { + (_StartTag()(__ops), ...); + }; + + inline constexpr auto __complete = // + []( + _Index, __ignore, _Receiver& __rcvr, _SetTag, _Args&&... 
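[Illustration, not part of the patch] `__desc`/`__sexpr_uncurry` above recover the `(tag, data, children...)` types of a sender expression by "calling" its stored closure with a visitor whose return type encodes them. A stripped-down model of the trick (`expr`, `get_desc`, and `then_t` are illustrative stand-ins):

```cpp
#include <type_traits>
#include <utility>

template <class Tag, class Data, class... Child>
struct desc {};

// A visitor whose return type records the types it was called with.
struct get_desc {
  template <class Tag, class Data, class... Child>
  auto operator()(Tag, Data&&, Child&&...) const -> desc<Tag, Data, Child...> {
    return {};
  }
};

template <class Impl>
struct expr {
  Impl impl_;
  // "Uncurry" the stored closure: call it with the visitor and read the
  // types back out of the result.
  using desc_t = decltype(std::declval<Impl&>()(get_desc{}));
};

struct then_t {};  // hypothetical algorithm tag

int main() {
  auto impl = [](auto visitor) { return visitor(then_t{}, 42, 3.14); };
  expr<decltype(impl)> s{impl};
  (void) s;
  static_assert(
    std::is_same_v<expr<decltype(impl)>::desc_t, desc<then_t, int, double>>);
}
```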
__args) noexcept { + static_assert(__v<_Index> == 0, "I don't know how to complete this operation."); + _SetTag()(std::move(__rcvr), (_Args&&) __args...); + }; + + inline constexpr auto __get_completion_signagures = // + [](__ignore, __ignore) noexcept { + return void(); + }; + + template + struct __receiver { + struct __t { + using receiver_concept = receiver_t; + using _Receiver = stdexec::__t<_ReceiverId>; + using __sexpr = _Sexpr; + using __index = _Idx; + using __id = __receiver; + using __parent_op_t = __op_state<_Sexpr, _Receiver>; + using __tag_t = tag_of_t<_Sexpr>; + + // A pointer to the parent operation state, which contains the one created with + // this receiver. + __parent_op_t* __op_; + + template + static __t __from_op_state(__op_state<_ChildSexpr, _ChildReceiver>* __child) noexcept { + using __parent_op_t = __op_state<_Sexpr, _Receiver>; + std::ptrdiff_t __offset = __parent_op_t::template __get_child_op_offset<__v<_Idx>>(); + __parent_op_t* __parent = (__parent_op_t*) ((char*) __child - __offset); + return __t{__parent}; + } + + template <__completion_tag _Tag, class... _Args> + STDEXEC_ATTRIBUTE((always_inline)) + friend void tag_invoke(_Tag, __t&& __self, _Args&&... __args) noexcept { + __self.__op_->__complete(_Idx(), _Tag(), (_Args&&) __args...); + } + + template _Tag, class _SexprTag = __tag_t> + STDEXEC_ATTRIBUTE((always_inline)) + friend auto tag_invoke(_Tag, const __t& __self) noexcept + -> __env_type_t<_SexprTag, _Idx, _Sexpr, _Receiver> { + return __self.__op_->__get_env(_Idx()); + } + }; + }; + + template + using __sexpr_connected_with = __mapply< + __mbind_front_q<__m_at, typename _Receiver::__index>, + typename __call_result_t<__impl_of, __cp, __get_desc>::__children>; + + template + struct __op_base : __immovable { + using __tag_t = typename __decay_t<_Sexpr>::__tag_t; + using __state_t = __state_type_t<__tag_t, _Sexpr, _Receiver>; + + STDEXEC_IMMOVABLE_NO_UNIQUE_ADDRESS _Receiver __rcvr_; + STDEXEC_IMMOVABLE_NO_UNIQUE_ADDRESS __state_t __state_; + + __op_base(_Sexpr&& __sndr, _Receiver&& __rcvr) + : __rcvr_((_Receiver&&) __rcvr) + , __state_(__sexpr_impl<__tag_t>::get_state((_Sexpr&&) __sndr, __rcvr_)) { + } + + _Receiver& __rcvr() & noexcept { + return __rcvr_; + } + }; + + // template + // requires __is_instance_of<__id<_Receiver>, __receiver> + // && __decays_to<_Sexpr, __sexpr_connected_with<_Receiver>> + // struct __op_base<_Sexpr, _Receiver> : __immovable { + // using __tag_t = typename __decay_t<_Sexpr>::__tag_t; + // using __state_t = __state_type_t<__tag_t, _Sexpr, _Receiver>; + + // STDEXEC_IMMOVABLE_NO_UNIQUE_ADDRESS __state_t __state_; + + // __op_base(_Sexpr&& __sndr, _Receiver&& __rcvr) + // : __state_(__sexpr_impl<__tag_t>::get_state((_Sexpr&&) __sndr, __rcvr)) { + // STDEXEC_ASSERT(this->__rcvr().__op_ == __rcvr.__op_); + // } + + // _Receiver __rcvr() const noexcept { + // return _Receiver::__from_op_state( // + // static_cast<__op_state<_Sexpr, _Receiver>*>( // + // const_cast<__op_base*>(this))); + // } + // }; + + STDEXEC_PRAGMA_PUSH() + STDEXEC_PRAGMA_IGNORE_GNU("-Winvalid-offsetof") + STDEXEC_PRAGMA_IGNORE_EDG(offset_in_non_POD_nonstandard) + + template + struct __enable_receiver_from_this { + using __op_base_t = __op_base<_Sexpr, _Receiver>; + + decltype(auto) __receiver() noexcept { + using __derived_t = decltype(__op_base_t::__state_); + __derived_t* __derived = static_cast<__derived_t*>(this); + constexpr std::size_t __offset = offsetof(__op_base_t, __state_); + __op_base_t* __base = (__op_base_t*) ((char*) __derived - __offset); + 
return __base->__rcvr(); + } + }; + + STDEXEC_PRAGMA_POP() + + STDEXEC_PRAGMA_PUSH() + STDEXEC_PRAGMA_IGNORE_GNU("-Wmissing-braces") + + template + struct __connect_fn { + template + using __receiver_t = __t<__receiver<__id<_Receiver>, _Sexpr, __mconstant<_Idx>>>; + + __op_state<_Sexpr, _Receiver>* __op_; + + struct __impl { + __op_state<_Sexpr, _Receiver>* __op_; + + template + auto operator()(__indices<_Is...>, _Tag, _Data&&, _Child&&... __child) const + -> __tup::__tuple<__indices<_Is...>, connect_result_t<_Child, __receiver_t<_Is>>...> { + return __tuple{connect((_Child&&) __child, __receiver_t<_Is>{__op_})...}; + } + }; + + template + auto operator()(_Tag, _Data&& __data, _Child&&... __child) const + -> __call_result_t<__impl, __indices_for<_Child...>, _Tag, _Data, _Child...> { + return __impl{ + __op_}(__indices_for<_Child...>(), _Tag(), (_Data&&) __data, (_Child&&) __child...); + } + }; + STDEXEC_PRAGMA_POP() + + template + struct __op_state : __op_base<_Sexpr, _Receiver> { + using __desc_t = typename __decay_t<_Sexpr>::__desc_t; + using __tag_t = typename __desc_t::__tag; + using __data_t = typename __desc_t::__data; + using __children_t = typename __desc_t::__children; + using __state_t = typename __op_state::__state_t; + using __connect_t = __connect_fn<_Sexpr, _Receiver>; + + static auto __connect(__op_state* __self, _Sexpr&& __sexpr) + -> __result_of<__sexpr_apply, _Sexpr, __connect_t> { + return __sexpr_apply((_Sexpr&&) __sexpr, __connect_t{__self}); + } + + using __inner_ops_t = decltype(__op_state::__connect(nullptr, __declval<_Sexpr>())); + __inner_ops_t __inner_ops_; + + template + static std::ptrdiff_t __get_child_op_offset() noexcept { + __op_state* __self = (__op_state*) &__self; + return (std::ptrdiff_t)((char*) &__tup::__get<_Idx>(__self->__inner_ops_) - (char*) __self); + } + + __op_state(_Sexpr&& __sexpr, _Receiver __rcvr) + : __op_state::__op_base{(_Sexpr&&) __sexpr, (_Receiver&&) __rcvr} + , __inner_ops_(__op_state::__connect(this, (_Sexpr&&) __sexpr)) { + } + + template _Tag2> + STDEXEC_ATTRIBUTE((always_inline)) + friend void tag_invoke(_Tag2, __op_state& __self) noexcept { + using __tag_t = typename __op_state::__tag_t; + auto&& __rcvr = __self.__rcvr(); + __tup::__apply( + [&](auto&... __ops) noexcept { + __sexpr_impl<__tag_t>::start(__self.__state_, __rcvr, __ops...); + }, + __self.__inner_ops_); + } + + template + STDEXEC_ATTRIBUTE((always_inline)) + void __complete(_Index, _Tag2, _Args&&... __args) noexcept { + using __tag_t = typename __op_state::__tag_t; + auto&& __rcvr = this->__rcvr(); + __sexpr_impl<__tag_t>::complete( + _Index(), this->__state_, __rcvr, _Tag2(), (_Args&&) __args...); + } + + template + STDEXEC_ATTRIBUTE((always_inline)) // + auto __get_env(_Index) noexcept -> __env_type_t<__tag_t, _Index, _Sexpr, _Receiver> { + const auto& __rcvr = this->__rcvr(); + return __sexpr_impl<__tag_t>::get_env(_Index(), this->__state_, __rcvr); + } + }; + + inline constexpr auto __drop_front = // + [](_Fn __fn) noexcept { + return [__fn = std::move(__fn)](auto&&, _Rest&&... 
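[Illustration, not part of the patch] `__get_child_op_offset`/`__parent_from_child` above recover an operation-state pointer from a pointer to one of its members by subtracting the member's byte offset — the classic container_of arithmetic. It is well defined in the sketch below because the types are standard-layout; the library's use on non-standard-layout types is the acknowledged UB noted in its own comment:

```cpp
#include <cassert>
#include <cstddef>

struct child {
  int payload;
};

struct parent {
  int header;
  child inner;  // the member we hand out a pointer to

  // container_of: subtract the member's offset to get back to the parent.
  static parent* from_inner(child* c) noexcept {
    return reinterpret_cast<parent*>(
      reinterpret_cast<char*>(c) - offsetof(parent, inner));
  }
};

int main() {
  parent p{1, {2}};
  assert(parent::from_inner(&p.inner) == &p);
}
```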
__rest) + noexcept(__nothrow_callable) + -> __call_result_t { + return __fn((_Rest&&) __rest...); + }; + }; } // namespace __detail + struct __sexpr_defaults { + static constexpr auto get_attrs = __detail::__get_attrs; + static constexpr auto get_env = __detail::__get_env; + static constexpr auto get_state = __detail::__get_state; + static constexpr auto connect = __detail::__connect; + static constexpr auto start = __detail::__start; + static constexpr auto complete = __detail::__complete; + static constexpr auto get_completion_signagures = __detail::__get_completion_signagures; + }; + + template + struct __sexpr_impl : __sexpr_defaults {}; + + using __detail::__enable_receiver_from_this; + + template + using __get_attrs_fn = + __result_of<__detail::__drop_front, __mtypeof<__sexpr_impl<_Tag>::get_attrs>>; + ////////////////////////////////////////////////////////////////////////////////////////////////// // __sexpr template @@ -63,36 +427,45 @@ namespace stdexec { template struct __sexpr<_ImplFn> { - using is_sender = void; + using sender_concept = sender_t; using __t = __sexpr; using __id = __sexpr; - using __tag_t = __call_result_t<_ImplFn, __cp, __detail::__get_tag>; + using __desc_t = __call_result_t<_ImplFn, __cp, __detail::__get_desc>; + using __tag_t = typename __desc_t::__tag; + using __data_t = typename __desc_t::__data; + using __children_t = typename __desc_t::__children; + using __arity_t = __mapply<__msize, __children_t>; + + template + using __impl = __sexpr_impl<__meval<__msecond, _Tag, __tag_t>>; + STDEXEC_ATTRIBUTE((always_inline)) // static __tag_t __tag() noexcept { return {}; } mutable _ImplFn __impl_; - STDEXEC_DETAIL_CUDACC_HOST_DEVICE // - explicit __sexpr(_ImplFn __impl) + STDEXEC_ATTRIBUTE((host, device, always_inline)) + explicit __sexpr(_ImplFn __impl) : __impl_((_ImplFn&&) __impl) { } template _Tag, same_as<__sexpr> _Self> + STDEXEC_ATTRIBUTE((always_inline)) // friend auto tag_invoke(_Tag, const _Self& __self) noexcept // -> __msecond< - __if_c>, // - decltype(__self.__tag().get_env(__self))> { - static_assert(noexcept(__self.__tag().get_env(__self))); - return __tag_t::get_env(__self); + __if_c && same_as<_Self, __sexpr>>, // + __result_of<__sexpr_apply, const _Self&, __get_attrs_fn<__tag_t>>> { + return __sexpr_apply(__self, __detail::__drop_front(__impl<_Tag>::get_attrs)); } template < same_as _Tag, __decays_to<__sexpr> _Self, class _Env> - friend auto tag_invoke(_Tag, _Self&& __self, _Env&& __env) // + STDEXEC_ATTRIBUTE((always_inline)) // + friend auto tag_invoke(_Tag, _Self&& __self, _Env&& __env) noexcept // -> __msecond< - __if_c>, - decltype(__self.__tag().get_completion_signatures((_Self&&) __self, (_Env&&) __env))> { + __if_c && __decays_to<_Self, __sexpr>>, + __result_of<__impl<_Tag>::get_completion_signatures, _Self, _Env>> { return {}; } @@ -101,34 +474,50 @@ namespace stdexec { same_as _Tag, __decays_to<__sexpr> _Self, /*receiver*/ class _Receiver> - friend auto tag_invoke(_Tag, _Self&& __self, _Receiver&& __rcvr) // - noexcept(noexcept(__self.__tag().connect((_Self&&) __self, (_Receiver&&) __rcvr))) // + STDEXEC_ATTRIBUTE((always_inline)) // + friend auto tag_invoke(_Tag, _Self&& __self, _Receiver&& __rcvr) // + noexcept(noexcept( + __impl<_Tag>::connect((_Self&&) __self, (_Receiver&&) __rcvr))) // -> __msecond< - __if_c>, - decltype(__self.__tag().connect((_Self&&) __self, (_Receiver&&) __rcvr))> { - return __tag_t::connect((_Self&&) __self, (_Receiver&&) __rcvr); + __if_c && __decays_to<_Self, __sexpr>>, + __result_of<__impl<_Tag>::connect, 
_Self, _Receiver>> { + return __impl<_Tag>::connect((_Self&&) __self, (_Receiver&&) __rcvr); } template + STDEXEC_ATTRIBUTE((always_inline)) // STDEXEC_DEFINE_EXPLICIT_THIS_MEMFN(auto apply)(this _Sender&& __sndr, _ApplyFn&& __fun) // noexcept( __nothrow_callable<__detail::__impl_of<_Sender>, __copy_cvref_fn<_Sender>, _ApplyFn>) // -> __call_result_t<__detail::__impl_of<_Sender>, __copy_cvref_fn<_Sender>, _ApplyFn> { // return ((_Sender&&) __sndr).__impl_(__copy_cvref_fn<_Sender>(), (_ApplyFn&&) __fun); // } + + template _Self> + STDEXEC_ATTRIBUTE((always_inline)) + friend decltype(auto) get(_Self&& __self) noexcept + requires(_Idx < (__v<__arity_t> + 2)) + { + if constexpr (_Idx == 0) { + return __tag_t(); + } else { + return __self.__impl_(__copy_cvref_fn<_Self>(), __nth_pack_element<_Idx>); + } + STDEXEC_UNREACHABLE(); + } }; template - STDEXEC_DETAIL_CUDACC_HOST_DEVICE // - __sexpr(_ImplFn) -> __sexpr<_ImplFn>; + STDEXEC_ATTRIBUTE((host, device)) + __sexpr(_ImplFn) -> __sexpr<_ImplFn>; ////////////////////////////////////////////////////////////////////////////////////////////////// - // make_sender_expr + // __make_sexpr namespace __detail { - template > - struct make_sender_expr_t { - template - constexpr auto operator()(_Data __data = {}, _Children... __children) const; + template + struct __make_sexpr_t { + template + constexpr auto operator()(_Data __data = {}, _Child... __child) const; }; #if STDEXEC_NVHPC() || (STDEXEC_GCC() && __GNUC__ < 13) @@ -146,12 +535,14 @@ namespace stdexec { _Ty __value; + STDEXEC_ATTRIBUTE((always_inline)) explicit __mbc(_Ty& __v) noexcept(std::is_nothrow_move_constructible_v<_Ty>) : __value((_Ty&&) __v) { } // This is a template so as to not be considered a copy/move constructor. Therefore, // it doesn't suppress the generation of the default copy/move constructors. + STDEXEC_ATTRIBUTE((always_inline)) __mbc(same_as<__mbc> auto& __that) noexcept(std::is_nothrow_move_constructible_v<_Ty>) : __value(static_cast<_Ty&&>(__that.__value)) { } @@ -179,12 +570,10 @@ namespace stdexec { }; } // anonymous namespace - template - template - constexpr auto - make_sender_expr_t<_Tag, _Domain>::operator()(_Data __data, _Children... __children) const { - return __sexpr{ - __detail::__make_tuple(_Tag(), __detail::__mbc(__data), __detail::__mbc(__children)...)}; + template + template + constexpr auto __make_sexpr_t<_Tag>::operator()(_Data __data, _Child... __child) const { + return __sexpr{__make_tuple(_Tag(), __detail::__mbc(__data), __detail::__mbc(__child)...)}; } #else // Anonymous namespace here is to avoid symbol name collisions with the @@ -204,24 +593,26 @@ namespace stdexec { }; } // anonymous namespace - template - template - constexpr auto - make_sender_expr_t<_Tag, _Domain>::operator()(_Data __data, _Children... __children) const { - return __sexpr{__detail::__make_tuple(_Tag(), (_Data&&) __data, (_Children&&) __children...)}; + template + template + constexpr auto __make_sexpr_t<_Tag>::operator()(_Data __data, _Child... 
__child) const { + return __sexpr{__make_tuple(_Tag(), (_Data&&) __data, (_Child&&) __child...)}; }; #endif + + template + inline constexpr __make_sexpr_t<_Tag> __make_sexpr{}; } // namespace __detail - template > - inline constexpr __detail::make_sender_expr_t<_Tag, _Domain> make_sender_expr{}; + using __detail::__make_sexpr; - template - using __sexpr_t = __result_of, _Data, _Children...>; + template + using __sexpr_t = __result_of<__make_sexpr<_Tag>, _Data, _Child...>; namespace __detail { - struct apply_sender_t { + struct __sexpr_apply_t { template + STDEXEC_ATTRIBUTE((always_inline)) // auto operator()(_Sender&& __sndr, _ApplyFn&& __fun) const // noexcept(noexcept( STDEXEC_CALL_EXPLICIT_THIS_MEMFN(((_Sender&&) __sndr), apply)((_ApplyFn&&) __fun))) // @@ -232,62 +623,31 @@ namespace stdexec { }; } // namespace __detail - using __detail::apply_sender_t; - inline constexpr apply_sender_t apply_sender{}; + using __detail::__sexpr_apply_t; + inline constexpr __sexpr_apply_t __sexpr_apply{}; template - using apply_sender_result_t = __call_result_t; - - template - using __tag_of = __call_result_t; - - template - using __data_of = __call_result_t; - - template > - using __children_of = __t<__call_result_t< - __call_result_t>>>; - - template - using __nth_child_of = __children_of<_Sender, __mbind_front_q<__m_at, _Ny>>; - - template - using __nth_child_of_c = __children_of<_Sender, __mbind_front_q<__m_at, __msize_t<_Ny>>>; - - template - using __child_of = __children_of<_Sender, __q<__mfront>>; - - template - inline constexpr std::size_t __nbr_children_of = __v<__children_of<_Sender, __msize>>; + using __sexpr_apply_result_t = __call_result_t<__sexpr_apply_t, _Sender, _ApplyFn>; template concept sender_expr = // - __mvalid<__tag_of, _Sender>; + __mvalid; template concept sender_expr_for = // - sender_expr<_Sender> && same_as<__tag_of<_Sender>, _Tag>; + sender_expr<_Sender> && same_as, _Tag>; // The __name_of utility defined below is used to pretty-print the type names of // senders in compiler diagnostics. namespace __detail { - template - extern __q<__midentity> __name_of_v; - - template - using __name_of_fn = decltype(__name_of_v<_Sender>); - - template - using __name_of = __minvoke<__name_of_fn<_Sender>, _Sender>; - struct __basic_sender_name { template using __f = // - __call_result_t>; + __call_result_t<__sexpr_apply_result_t<_Sender, __basic_sender_name>>; - template - auto operator()(_Tag, _Data&&, _Children&&...) const // - -> __sexpr<_Tag, _Data, __name_of<_Children>...> (*)(); + template + auto operator()(_Tag, _Data&&, _Child&&...) 
const // + -> __sexpr<_Tag, _Data, __name_of<_Child>...> (*)(); }; struct __id_name { @@ -310,8 +670,24 @@ namespace stdexec { template <__has_id _Sender> requires(!same_as<__id<_Sender>, _Sender>) extern __id_name __name_of_v<_Sender>; - } // namespace __detail - template - using __name_of = __detail::__name_of<_Sender>; + template + _Ty __remove_rvalue_reference_fn(_Ty&&); + + template + using __remove_rvalue_reference_t = + decltype(__detail::__remove_rvalue_reference_fn(__declval<_Ty>())); + } // namespace __detail } // namespace stdexec + +namespace std { + template + struct tuple_size> + : integral_constant< size_t, stdexec::__v::__arity_t> + 2> { }; + + template + struct tuple_element<_Idx, stdexec::__sexpr<_Impl>> { + using type = stdexec::__detail::__remove_rvalue_reference_t< + stdexec::__call_result_t<_Impl, stdexec::__cp, stdexec::__nth_pack_element_t<_Idx>>>; + }; +} diff --git a/include/stdexec/__detail/__config.hpp b/include/stdexec/__detail/__config.hpp index 5264d9ec3..6d7eb04ff 100644 --- a/include/stdexec/__detail/__config.hpp +++ b/include/stdexec/__detail/__config.hpp @@ -28,11 +28,14 @@ #include #include +#define STDEXEC_STRINGIZE(_ARG) #_ARG + #define STDEXEC_CAT_(_XP, ...) _XP##__VA_ARGS__ #define STDEXEC_CAT(_XP, ...) STDEXEC_CAT_(_XP, __VA_ARGS__) #define STDEXEC_EXPAND(...) __VA_ARGS__ #define STDEXEC_EVAL(_MACRO, ...) _MACRO(__VA_ARGS__) +#define STDEXEC_EAT(...) #define STDEXEC_NOT(_XP) STDEXEC_CAT(STDEXEC_NOT_, _XP) #define STDEXEC_NOT_0 1 @@ -46,45 +49,151 @@ STDEXEC_EXPAND(STDEXEC_COUNT_(__VA_ARGS__, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1)) #define STDEXEC_COUNT_(_1, _2, _3, _4, _5, _6, _7, _8, _9, _10, _NP, ...) _NP -#define STDEXEC_CHECK(...) STDEXEC_EXPAND(STDEXEC_CHECK_N(__VA_ARGS__, 0, )) -#define STDEXEC_CHECK_N(_XP, _NP, ...) _NP -#define STDEXEC_PROBE(_XP) _XP, 1, - +#define STDEXEC_CHECK(...) STDEXEC_EXPAND(STDEXEC_CHECK_(__VA_ARGS__, 0, )) +#define STDEXEC_CHECK_(_XP, _NP, ...) _NP +#define STDEXEC_PROBE(...) STDEXEC_PROBE_(__VA_ARGS__, 1) +#define STDEXEC_PROBE_(_XP, _NP, ...) _XP, _NP, + +//////////////////////////////////////////////////////////////////////////////// +// STDEXEC_FOR_EACH +// Inspired by "Recursive macros with C++20 __VA_OPT__", by David Mazières +// https://www.scs.stanford.edu/~dm/blog/va-opt.html +#define STDEXEC_EXPAND_R(...) \ + STDEXEC_EXPAND_R1(STDEXEC_EXPAND_R1(STDEXEC_EXPAND_R1(STDEXEC_EXPAND_R1(__VA_ARGS__)))) \ + /**/ +#define STDEXEC_EXPAND_R1(...) \ + STDEXEC_EXPAND_R2(STDEXEC_EXPAND_R2(STDEXEC_EXPAND_R2(STDEXEC_EXPAND_R2(__VA_ARGS__)))) \ + /**/ +#define STDEXEC_EXPAND_R2(...) \ + STDEXEC_EXPAND_R3(STDEXEC_EXPAND_R3(STDEXEC_EXPAND_R3(STDEXEC_EXPAND_R3(__VA_ARGS__)))) \ + /**/ +#define STDEXEC_EXPAND_R3(...) \ + STDEXEC_EXPAND(STDEXEC_EXPAND(STDEXEC_EXPAND(STDEXEC_EXPAND(__VA_ARGS__)))) \ + /**/ + +#define STDEXEC_PARENS () +#define STDEXEC_FOR_EACH(_MACRO, ...) \ + __VA_OPT__(STDEXEC_EXPAND_R(STDEXEC_FOR_EACH_HELPER(_MACRO, __VA_ARGS__))) \ + /**/ +#define STDEXEC_FOR_EACH_HELPER(_MACRO, _A1, ...) \ + _MACRO(_A1) __VA_OPT__(STDEXEC_FOR_EACH_AGAIN STDEXEC_PARENS(_MACRO, __VA_ARGS__)) /**/ +#define STDEXEC_FOR_EACH_AGAIN() STDEXEC_FOR_EACH_HELPER +//////////////////////////////////////////////////////////////////////////////////////////////////// + +// If tail is non-empty, expand to the tail. Otherwise, expand to the head +#define STDEXEC_HEAD_OR_TAIL(_XP, ...) STDEXEC_EXPAND __VA_OPT__((__VA_ARGS__) STDEXEC_EAT)(_XP) + +// If tail is non-empty, expand to nothing. 
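[Illustration, not part of the patch] The `std::tuple_size`/`std::tuple_element` specializations just above, together with `__sexpr`'s friend `get<I>`, opt sender expressions into structured bindings (e.g. `auto&& [tag, data, child] = sndr;` for a known arity). The same opt-in for a toy type:

```cpp
#include <cstddef>
#include <tuple>
#include <type_traits>
#include <utility>

struct tagged_pair {
  int tag;
  double data;

  // Hidden friend found by ADL; C++20 allows get<I>(x) to find it.
  template <std::size_t I>
  friend auto get(const tagged_pair& self) noexcept {
    if constexpr (I == 0) {
      return self.tag;
    } else {
      return self.data;
    }
  }
};

template <>
struct std::tuple_size<tagged_pair> : std::integral_constant<std::size_t, 2> {};

template <std::size_t I>
struct std::tuple_element<I, tagged_pair> {
  using type = decltype(get<I>(std::declval<const tagged_pair&>()));
};

int main() {
  tagged_pair p{42, 3.14};
  auto [tag, data] = p;  // decomposes via the tuple protocol above
  return tag == 42 && data == 3.14 ? 0 : 1;
}
```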
Otherwise, expand to the head +#define STDEXEC_HEAD_OR_NULL(_XP, ...) STDEXEC_EXPAND __VA_OPT__(() STDEXEC_EAT)(_XP) + +// When used with no arguments, these macros expand to 1 if the current +// compiler corresponds to the macro name; 0, otherwise. When used with arguments, +// they expand to the arguments if if the current compiler corresponds to the +// macro name; nothing, otherwise. #if defined(__NVCC__) -#define STDEXEC_NVCC() 1 +#define STDEXEC_NVCC(...) STDEXEC_HEAD_OR_TAIL(1, __VA_ARGS__) #elif defined(__NVCOMPILER) -#define STDEXEC_NVHPC() 1 +#define STDEXEC_NVHPC(...) STDEXEC_HEAD_OR_TAIL(1, __VA_ARGS__) #elif defined(__EDG__) -#define LEGATE_EDG() 1 +#define STDEXEC_EDG(...) STDEXEC_HEAD_OR_TAIL(1, __VA_ARGS__) #elif defined(__clang__) -#define STDEXEC_CLANG() 1 +#define STDEXEC_CLANG(...) STDEXEC_HEAD_OR_TAIL(1, __VA_ARGS__) +#if defined(_MSC_VER) +#define STDEXEC_CLANG_CL(...) STDEXEC_HEAD_OR_TAIL(1, __VA_ARGS__) +#endif #elif defined(__GNUC__) -#define STDEXEC_GCC() 1 +#define STDEXEC_GCC(...) STDEXEC_HEAD_OR_TAIL(1, __VA_ARGS__) #elif defined(_MSC_VER) -#define STDEXEC_MSVC() 1 +#define STDEXEC_MSVC(...) STDEXEC_HEAD_OR_TAIL(1, __VA_ARGS__) #endif #ifndef STDEXEC_NVCC -#define STDEXEC_NVCC() 0 +#define STDEXEC_NVCC(...) STDEXEC_HEAD_OR_NULL(0, __VA_ARGS__) #endif #ifndef STDEXEC_NVHPC -#define STDEXEC_NVHPC() 0 +#define STDEXEC_NVHPC(...) STDEXEC_HEAD_OR_NULL(0, __VA_ARGS__) #endif #ifndef STDEXEC_EDG -#define STDEXEC_EDG() 0 +#define STDEXEC_EDG(...) STDEXEC_HEAD_OR_NULL(0, __VA_ARGS__) #endif #ifndef STDEXEC_CLANG -#define STDEXEC_CLANG() 0 +#define STDEXEC_CLANG(...) STDEXEC_HEAD_OR_NULL(0, __VA_ARGS__) +#endif +#ifndef STDEXEC_CLANG_CL +#define STDEXEC_CLANG_CL(...) STDEXEC_HEAD_OR_NULL(0, __VA_ARGS__) #endif #ifndef STDEXEC_GCC -#define STDEXEC_GCC() 0 +#define STDEXEC_GCC(...) STDEXEC_HEAD_OR_NULL(0, __VA_ARGS__) #endif #ifndef STDEXEC_MSVC -#define STDEXEC_MSVC() 0 +#define STDEXEC_MSVC(...) STDEXEC_HEAD_OR_NULL(0, __VA_ARGS__) #endif -#define STDEXEC_STRINGIZE(_ARG) #_ARG +//////////////////////////////////////////////////////////////////////////////////////////////////// +#ifdef __CUDACC__ +#define STDEXEC_CUDA(...) STDEXEC_HEAD_OR_TAIL(1, __VA_ARGS__) +#else +#define STDEXEC_CUDA(...) STDEXEC_HEAD_OR_NULL(0, __VA_ARGS__) +#endif + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// For portably declaring attributes on functions and types +// Usage: +// +// STDEXEC_ATTRIBUTE((attr1, attr2, ...)) +// void foo() { ... 
} +#define STDEXEC_ATTRIBUTE(_XP) STDEXEC_FOR_EACH(STDEXEC_ATTR, STDEXEC_EXPAND _XP) +#define STDEXEC_ATTR(_ATTR) \ + STDEXEC_CAT(STDEXEC_ATTR_WHICH_, STDEXEC_CHECK(STDEXEC_CAT(STDEXEC_ATTR_, _ATTR)))(_ATTR) + +// unknown attributes are treated like C++-style attributes +#define STDEXEC_ATTR_WHICH_0(_ATTR) [[_ATTR]] + +// custom handling for specific attribute types +#ifdef __CUDACC__ +#define STDEXEC_ATTR_WHICH_1(_ATTR) __host__ +#else +#define STDEXEC_ATTR_WHICH_1(_ATTR) +#endif +#define STDEXEC_ATTR_host STDEXEC_PROBE(~, 1) +#define STDEXEC_ATTR___host__ STDEXEC_PROBE(~, 1) +#ifdef __CUDACC__ +#define STDEXEC_ATTR_WHICH_2(_ATTR) __device__ +#else +#define STDEXEC_ATTR_WHICH_2(_ATTR) +#endif +#define STDEXEC_ATTR_device STDEXEC_PROBE(~, 2) +#define STDEXEC_ATTR___device__ STDEXEC_PROBE(~, 2) + +#if STDEXEC_NVHPC() +// NVBUG #4067067: NVHPC does not fully support [[no_unique_address]] +#define STDEXEC_ATTR_WHICH_3(_ATTR) /*nothing*/ +#elif STDEXEC_MSVC() +// MSVCBUG https://developercommunity.visualstudio.com/t/Incorrect-codegen-when-using-msvc::no_/10452874 +#define STDEXEC_ATTR_WHICH_3(_ATTR) // [[msvc::no_unique_address]] +#elif STDEXEC_CLANG_CL() +// clang-cl does not support: https://reviews.llvm.org/D110485 +#define STDEXEC_ATTR_WHICH_3(_ATTR) // [[msvc::no_unique_address]] +#else +#define STDEXEC_ATTR_WHICH_3(_ATTR) [[no_unique_address]] +#endif +#define STDEXEC_ATTR_no_unique_address STDEXEC_PROBE(~, 3) + +#if STDEXEC_MSVC() +#define STDEXEC_ATTR_WHICH_4(_ATTR) __forceinline +#elif STDEXEC_CLANG() +#define STDEXEC_ATTR_WHICH_4(_ATTR) \ + __attribute__((__always_inline__, __artificial__, __nodebug__)) inline +#elif defined(__GNUC__) +#define STDEXEC_ATTR_WHICH_4(_ATTR) __attribute__((__always_inline__, __artificial__)) inline +#else +#define STDEXEC_ATTR_WHICH_4(_ATTR) /*nothing*/ +#endif +#define STDEXEC_ATTR_always_inline STDEXEC_PROBE(~, 4) + +//////////////////////////////////////////////////////////////////////////////////////////////////// +// warning push/pop portability macros #if STDEXEC_NVCC() #define STDEXEC_PRAGMA_PUSH() _Pragma("nv_diagnostic push") #define STDEXEC_PRAGMA_POP() _Pragma("nv_diagnostic pop") @@ -95,19 +204,23 @@ #define STDEXEC_PRAGMA_POP() _Pragma("diagnostic pop") #define STDEXEC_PRAGMA_IGNORE_EDG(...) _Pragma(STDEXEC_STRINGIZE(diag_suppress __VA_ARGS__)) #elif STDEXEC_CLANG() || STDEXEC_GCC() -#define STDEXEC_PRAGMA_PUSH() _Pragma("GCC diagnostic push") +#define STDEXEC_PRAGMA_PUSH() \ + _Pragma("GCC diagnostic push") STDEXEC_PRAGMA_IGNORE_GNU("-Wpragmas") STDEXEC_PRAGMA_IGNORE_GNU( \ + "-Wunknown-pragmas") STDEXEC_PRAGMA_IGNORE_GNU("-Wunknown-warning-option") \ + STDEXEC_PRAGMA_IGNORE_GNU("-Wunknown-attributes") STDEXEC_PRAGMA_IGNORE_GNU("-Wattributes") #define STDEXEC_PRAGMA_POP() _Pragma("GCC diagnostic pop") -#define STDEXEC_PRAGMA_IGNORE_GNU(_ARG) _Pragma(STDEXEC_STRINGIZE(GCC diagnostic ignored _ARG)) +#define STDEXEC_PRAGMA_IGNORE_GNU(...) \ + _Pragma(STDEXEC_STRINGIZE(GCC diagnostic ignored __VA_ARGS__)) #else #define STDEXEC_PRAGMA_PUSH() #define STDEXEC_PRAGMA_POP() #endif #ifndef STDEXEC_PRAGMA_IGNORE_GNU -#define STDEXEC_PRAGMA_IGNORE_GNU(_ARG) +#define STDEXEC_PRAGMA_IGNORE_GNU(...) #endif #ifndef STDEXEC_PRAGMA_IGNORE_EDG -#define STDEXEC_PRAGMA_IGNORE_EDG(_ARG) +#define STDEXEC_PRAGMA_IGNORE_EDG(...) #endif #if !STDEXEC_MSVC() && defined(__has_builtin) @@ -116,6 +229,12 @@ #define STDEXEC_HAS_BUILTIN(...) 
 
 #if !STDEXEC_MSVC() && defined(__has_builtin)
@@ -116,6 +229,12 @@
 #define STDEXEC_HAS_BUILTIN(...) 0
 #endif
 
+#if !STDEXEC_MSVC() && defined(__has_feature)
+#define STDEXEC_HAS_FEATURE __has_feature
+#else
+#define STDEXEC_HAS_FEATURE(...) 0
+#endif
+
 #if STDEXEC_HAS_BUILTIN(__is_trivially_copyable) || STDEXEC_MSVC()
 #define STDEXEC_IS_TRIVIALLY_COPYABLE(...) __is_trivially_copyable(__VA_ARGS__)
 #else
@@ -136,6 +255,12 @@
 #define STDEXEC_IS_CONVERTIBLE_TO(...) std::is_convertible_v<__VA_ARGS__>
 #endif
 
+#if STDEXEC_HAS_BUILTIN(__is_const)
+#define STDEXEC_IS_CONST(...) __is_const(__VA_ARGS__)
+#else
+#define STDEXEC_IS_CONST(...) stdexec::__is_const<__VA_ARGS__>
+#endif
+
 #if defined(__cpp_lib_unreachable) && __cpp_lib_unreachable >= 202202L
 #define STDEXEC_UNREACHABLE() std::unreachable()
 #elif STDEXEC_HAS_BUILTIN(__builtin_unreachable)
@@ -153,32 +278,19 @@
 #define STDEXEC_IMMOVABLE(_XP) _XP(_XP&&) = delete
 #endif
 
-// NVBUG #4067067
-#if STDEXEC_NVHPC()
-#define STDEXEC_NO_UNIQUE_ADDRESS
-#else
-#define STDEXEC_NO_UNIQUE_ADDRESS [[no_unique_address]]
-#endif
-
 // BUG (gcc PR93711): copy elision fails when initializing a
 // [[no_unique_address]] field from a function returning an object
 // of class type by value
 #if STDEXEC_GCC()
 #define STDEXEC_IMMOVABLE_NO_UNIQUE_ADDRESS
 #else
-#define STDEXEC_IMMOVABLE_NO_UNIQUE_ADDRESS [[no_unique_address]]
-#endif
-
-#if STDEXEC_CLANG() && defined(__CUDACC__)
-#define STDEXEC_DETAIL_CUDACC_HOST_DEVICE __host__ __device__
-#else
-#define STDEXEC_DETAIL_CUDACC_HOST_DEVICE
+#define STDEXEC_IMMOVABLE_NO_UNIQUE_ADDRESS STDEXEC_ATTRIBUTE((no_unique_address))
 #endif
 
 #if STDEXEC_NVHPC()
 #include <nv/target>
 #define STDEXEC_TERMINATE() NV_IF_TARGET(NV_IS_HOST, (std::terminate();), (__trap();)) void()
-#elif STDEXEC_CLANG() && defined(__CUDACC__) && defined(__CUDA_ARCH__)
+#elif STDEXEC_CLANG() && STDEXEC_CUDA() && defined(__CUDA_ARCH__)
 #define STDEXEC_TERMINATE()                                                                        \
   __trap();                                                                                        \
   __builtin_unreachable()
@@ -186,6 +298,12 @@
 #define STDEXEC_TERMINATE() std::terminate()
 #endif
 
+#if STDEXEC_HAS_FEATURE(thread_sanitizer) || defined(__SANITIZE_THREAD__)
+#define STDEXEC_TSAN(...) STDEXEC_HEAD_OR_TAIL(1, __VA_ARGS__)
+#else
+#define STDEXEC_TSAN(...) STDEXEC_HEAD_OR_NULL(0, __VA_ARGS__)
+#endif
+
 // Before clang-16, clang did not like libstdc++'s ranges implementation
 #if __has_include(<ranges>) && \
   (defined(__cpp_lib_ranges) && __cpp_lib_ranges >= 201911L) && \
@@ -195,6 +313,13 @@
 #define STDEXEC_HAS_STD_RANGES() 0
 #endif
 
+#if __has_include(<memory_resource>) && \
+  (defined(__cpp_lib_memory_resource) && __cpp_lib_memory_resource >= 201603L)
+#define STDEXEC_HAS_STD_MEMORY_RESOURCE() 1
+#else
+#define STDEXEC_HAS_STD_MEMORY_RESOURCE() 0
+#endif
+
 #ifdef STDEXEC_ASSERT
 #error "Redefinition of STDEXEC_ASSERT is not permitted. Define STDEXEC_ASSERT_FN instead."
 #endif
@@ -222,12 +347,12 @@
 #endif
 
 #if defined(__cpp_explicit_this_parameter) && (__cpp_explicit_this_parameter >= 202110)
-#define STDEXEC_HAS_EXPLICIT_THIS() 1
+#define STDEXEC_EXPLICIT_THIS(...) STDEXEC_HEAD_OR_TAIL(1, __VA_ARGS__)
 #else
-#define STDEXEC_HAS_EXPLICIT_THIS() 0
+#define STDEXEC_EXPLICIT_THIS(...) STDEXEC_HEAD_OR_NULL(0, __VA_ARGS__)
 #endif
 
-#if STDEXEC_HAS_EXPLICIT_THIS()
+#if STDEXEC_EXPLICIT_THIS()
 #define STDEXEC_DEFINE_EXPLICIT_THIS_MEMFN(...) __VA_ARGS__
 #define STDEXEC_CALL_EXPLICIT_THIS_MEMFN(_OBJ, _NAME) (_OBJ)._NAME( STDEXEC_CALL_EXPLICIT_THIS_MEMFN_DETAIL
 #define STDEXEC_CALL_EXPLICIT_THIS_MEMFN_DETAIL(...) __VA_ARGS__ )
@@ -239,14 +364,31 @@
 #define STDEXEC_FUN_ARGS(...) STDEXEC_CAT(STDEXEC_EAT_THIS_DETAIL_, __VA_ARGS__))
 #endif
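The new `STDEXEC_TSAN()` switch follows the same 1-or-args convention as the compiler macros; this sketch mirrors the `__intrusive_ptr` hunk further down (the free function here is hypothetical):

```cpp
#include <atomic>
#if STDEXEC_TSAN()
#include <sanitizer/tsan_interface.h>
#endif

// In a TSan build the annotation is emitted; in all other builds the
// STDEXEC_TSAN(...) call vanishes, leaving just the fence.
inline void acquire_after_release(std::atomic<int>& refcount) {
  std::atomic_thread_fence(std::memory_order_acquire);
  STDEXEC_TSAN(__tsan_acquire(&refcount);)
}
```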
 
+// Configure extra type checking
+#define STDEXEC_TYPE_CHECKING_ZERO() 0
+#define STDEXEC_TYPE_CHECKING_ONE() 1
+#define STDEXEC_TYPE_CHECKING_TWO() 2
+
+#define STDEXEC_PROBE_TYPE_CHECKING_ STDEXEC_TYPE_CHECKING_ONE
+#define STDEXEC_PROBE_TYPE_CHECKING_0 STDEXEC_TYPE_CHECKING_ZERO
+#define STDEXEC_PROBE_TYPE_CHECKING_1 STDEXEC_TYPE_CHECKING_ONE
+#define STDEXEC_PROBE_TYPE_CHECKING_STDEXEC_ENABLE_EXTRA_TYPE_CHECKING STDEXEC_TYPE_CHECKING_TWO
+
+#define STDEXEC_TYPE_CHECKING_WHICH3(...) STDEXEC_PROBE_TYPE_CHECKING_##__VA_ARGS__
+#define STDEXEC_TYPE_CHECKING_WHICH2(...) STDEXEC_TYPE_CHECKING_WHICH3(__VA_ARGS__)
+#define STDEXEC_TYPE_CHECKING_WHICH STDEXEC_TYPE_CHECKING_WHICH2(STDEXEC_ENABLE_EXTRA_TYPE_CHECKING)
+
 #ifndef STDEXEC_ENABLE_EXTRA_TYPE_CHECKING
-// Compile times are bad enough on nvhpc. Disable extra type checking by default.
-#if STDEXEC_NVHPC()
+#define STDEXEC_ENABLE_EXTRA_TYPE_CHECKING() 0
+#elif STDEXEC_TYPE_CHECKING_WHICH() == 2
+// do nothing
+#elif STDEXEC_TYPE_CHECKING_WHICH() == 0
+#undef STDEXEC_ENABLE_EXTRA_TYPE_CHECKING
 #define STDEXEC_ENABLE_EXTRA_TYPE_CHECKING() 0
 #else
+#undef STDEXEC_ENABLE_EXTRA_TYPE_CHECKING
 #define STDEXEC_ENABLE_EXTRA_TYPE_CHECKING() 1
 #endif
-#endif
 
 namespace stdexec {
 }
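Worth spelling out how the probe above reacts to the common build-system spellings; all four cases follow directly from the hunk:

```cpp
// How the probe machinery resolves the build-time definition:
//   -DSTDEXEC_ENABLE_EXTRA_TYPE_CHECKING     -> rewritten to a macro returning 1
//   -DSTDEXEC_ENABLE_EXTRA_TYPE_CHECKING=1   -> rewritten to a macro returning 1
//   -DSTDEXEC_ENABLE_EXTRA_TYPE_CHECKING=0   -> rewritten to a macro returning 0
//   (left undefined)                         -> defined as a macro returning 0
//
// Either way, library code can gate its costly diagnostics uniformly:
#if STDEXEC_ENABLE_EXTRA_TYPE_CHECKING()
// extra compile-time checks go here
#endif
```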
diff --git a/include/stdexec/__detail/__domain.hpp b/include/stdexec/__detail/__domain.hpp
new file mode 100644
index 000000000..51c6ba85f
--- /dev/null
+++ b/include/stdexec/__detail/__domain.hpp
@@ -0,0 +1,255 @@
+/*
+ * Copyright (c) 2021-2022 NVIDIA Corporation
+ *
+ * Licensed under the Apache License Version 2.0 with LLVM Exceptions
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *   https://llvm.org/LICENSE.txt
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include "__execution_fwd.hpp"
+
+#include "__basic_sender.hpp"
+#include "__env.hpp"
+#include "__meta.hpp"
+
+#include "../functional.hpp"
+
+namespace stdexec {
+
+  struct default_domain;
+  struct dependent_domain;
+
+  namespace __domain {
+    template <class _Tag>
+    using __legacy_c11n_for = typename _Tag::__legacy_customizations_t;
+
+    template <class _Tag, class... _Args>
+    using __legacy_c11n_fn = //
+      __make_dispatcher<__legacy_c11n_for<_Tag>, __none_such, _Args...>;
+
+    template <class _Tag, class... _Args>
+    concept __has_legacy_c11n = //
+      __callable<__legacy_c11n_fn<_Tag, _Args...>, _Args...>;
+
+    struct __legacy_customization {
+      template <class _Tag, class _Data, class... _Children>
+        requires __has_legacy_c11n<_Tag, _Data, _Children...>
+      decltype(auto) operator()(_Tag, _Data&& __data, _Children&&... __children) const {
+        return __legacy_c11n_fn<_Tag, _Data, _Children...>()(
+          static_cast<_Data&&>(__data), static_cast<_Children&&>(__children)...);
+      }
+    };
+
+    template <class _DomainOrTag, class _Sender, class... _Env>
+    concept __has_transform_sender =
+      requires(_DomainOrTag __tag, _Sender&& __sender, const _Env&... __env) {
+        __tag.transform_sender((_Sender&&) __sender, __env...);
+      };
+
+    template <class _Sender, class... _Env>
+    concept __has_default_transform_sender = //
+      sender_expr<_Sender>                   //
+      && __has_transform_sender<tag_of_t<_Sender>, _Sender, _Env...>;
+
+    template <class _Type, class _Sender, class _Env>
+    concept __has_transform_env = requires(_Type __obj, _Sender&& __sender, _Env&& __env) {
+      __obj.transform_env((_Sender&&) __sender, (_Env&&) __env);
+    };
+
+    template <class _Sender, class _Env>
+    concept __has_default_transform_env = //
+      sender_expr<_Sender>                //
+      && __has_transform_env<tag_of_t<_Sender>, _Sender, _Env>;
+
+    template <class _DomainOrTag, class... _Args>
+    concept __has_apply_sender = requires(_DomainOrTag __tag, _Args&&... __args) {
+      __tag.apply_sender((_Args&&) __args...);
+    };
+  } // namespace __domain
+
+  struct default_domain {
+    default_domain() = default;
+
+    // Called without the environment during eager customization
+    template <class _Sender>
+    STDEXEC_ATTRIBUTE((always_inline))
+    decltype(auto) transform_sender(_Sender&& __sndr) const {
+      // Look for a legacy customization for the given tag, and if found, apply it.
+      if constexpr (__callable<__sexpr_apply_t, _Sender, __domain::__legacy_customization>) {
+        return stdexec::__sexpr_apply((_Sender&&) __sndr, __domain::__legacy_customization());
+      } else if constexpr (__domain::__has_default_transform_sender<_Sender>) {
+        return tag_of_t<_Sender>().transform_sender((_Sender&&) __sndr);
+      } else {
+        return static_cast<_Sender>((_Sender&&) __sndr);
+      }
+      STDEXEC_UNREACHABLE();
+    }
+
+    // Called with an environment during lazy customization
+    template <class _Sender, class _Env>
+    STDEXEC_ATTRIBUTE((always_inline))
+    decltype(auto) transform_sender(_Sender&& __sndr, const _Env& __env) const {
+      if constexpr (__domain::__has_default_transform_sender<_Sender, _Env>) {
+        return tag_of_t<_Sender>().transform_sender((_Sender&&) __sndr, __env);
+      } else {
+        return static_cast<_Sender>((_Sender&&) __sndr);
+      }
+      STDEXEC_UNREACHABLE();
+    }
+
+    template <class _Tag, class _Sender, class... _Args>
+      requires __domain::__has_legacy_c11n<_Tag, _Sender, _Args...>
+            || __domain::__has_apply_sender<_Tag, _Sender, _Args...>
+    STDEXEC_ATTRIBUTE((always_inline)) decltype(auto)
+      apply_sender(_Tag, _Sender&& __sndr, _Args&&... __args) const {
+      // Look for a legacy customization for the given tag, and if found, apply it.
+      if constexpr (__domain::__has_legacy_c11n<_Tag, _Sender, _Args...>) {
+        return __domain::__legacy_c11n_fn<_Tag, _Sender, _Args...>()(
+          static_cast<_Sender&&>(__sndr), static_cast<_Args&&>(__args)...);
+      } else {
+        return _Tag().apply_sender((_Sender&&) __sndr, (_Args&&) __args...);
+      }
+      STDEXEC_UNREACHABLE();
+    }
+
+    template <class _Sender, class _Env>
+    decltype(auto) transform_env(_Sender&& __sndr, _Env&& __env) const noexcept {
+      if constexpr (__domain::__has_default_transform_env<_Sender, _Env>) {
+        return tag_of_t<_Sender>().transform_env((_Sender&&) __sndr, (_Env&&) __env);
+      } else {
+        return static_cast<_Env>((_Env&&) __env);
+      }
+    }
+  };
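A hedged sketch of what `default_domain::transform_sender` dispatches to; `my_tag` is invented, and its body is a stand-in for a real lowering:

```cpp
// default_domain::transform_sender(sndr [, env]) tries, in order:
//   1. a legacy __legacy_customizations_t customization on the sender's tag,
//   2. a transform_sender member on the tag, as sketched here,
//   3. the identity transformation.
struct my_tag {
  template <class _Sender>
  auto transform_sender(_Sender&& __sndr) const {
    // a real tag would rewrite __sndr into a simpler sender expression here
    return (_Sender&&) __sndr;
  }
};
```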
+
+  /////////////////////////////////////////////////////////////////////////////
+  namespace __detail {
+    template <class _Env, class _Tag>
+    using __completion_scheduler_for =
+      __meval_or<__call_result_t, __none_such, get_completion_scheduler_t<_Tag>, _Env>;
+
+    template <class _Env, class _Tag>
+    using __completion_domain_for =
+      __meval_or<__call_result_t, __none_such, get_domain_t, __completion_scheduler_for<_Env, _Tag>>;
+
+    // Check the value, error, and stopped channels for completion schedulers.
+    // Of the completion schedulers that are known, they must all have compatible
+    // domains. This computes that domain, or else returns __none_such if there
+    // are no completion schedulers or if they don't specify a domain.
+    template <class _Env>
+    struct __completion_domain_or_none_
+      : __mdefer_<
+          __transform<
+            __mbind_front_q<__completion_domain_for, _Env>,
+            __remove<__none_such, __munique<__msingle_or<__none_such>>>>,
+          set_value_t,
+          set_error_t,
+          set_stopped_t> { };
+
+    template <class _Sender>
+    using __completion_domain_or_none = __t<__completion_domain_or_none_<env_of_t<_Sender>>>;
+
+    template <class _Sender>
+    concept __consistent_completion_domains = __mvalid<__completion_domain_or_none, _Sender>;
+
+    template <class _Sender>
+    concept __has_completion_domain = (!same_as<__completion_domain_or_none<_Sender>, __none_such>);
+
+    template <__has_completion_domain _Sender>
+    using __completion_domain_of = __completion_domain_or_none<_Sender>;
+  } // namespace __detail
+
+  /////////////////////////////////////////////////////////////////////////////
+  inline constexpr struct __get_early_domain_t {
+    template <class _Sender, class _Default = default_domain>
+    auto operator()(const _Sender&, _Default __def = {}) const noexcept {
+      if constexpr (__callable<get_domain_t, env_of_t<_Sender>>) {
+        return __call_result_t<get_domain_t, env_of_t<_Sender>>();
+      } else if constexpr (__detail::__has_completion_domain<_Sender>) {
+        return __detail::__completion_domain_of<_Sender>();
+      } else {
+        return __def;
+      }
+      STDEXEC_UNREACHABLE();
+    }
+  } __get_early_domain{};
+
+  template <class _Sender, class _Default = default_domain>
+  using __early_domain_of_t = __call_result_t<__get_early_domain_t, _Sender, _Default>;
+
+  /////////////////////////////////////////////////////////////////////////////
+  inline constexpr struct __get_late_domain_t {
+    // When connect is looking for a customization, it first checks the sender's
+    // domain. If the sender knows the domain in which it completes, then that is
+    // where the subsequent task will execute. Otherwise, look to the receiver for
+    // late-bound information about the current execution context.
+    template <class _Sender, class _Env>
+    auto operator()(const _Sender& __sndr, const _Env& __env) const noexcept {
+      if constexpr (!same_as<dependent_domain, __early_domain_of_t<_Sender, dependent_domain>>) {
+        return __get_early_domain(__sndr);
+      } else if constexpr (__callable<get_domain_t, const _Env&>) {
+        return get_domain(__env);
+      } else if constexpr (__callable<__composed<get_domain_t, get_scheduler_t>, const _Env&>) {
+        return get_domain(get_scheduler(__env));
+      } else {
+        return default_domain();
+      }
+      STDEXEC_UNREACHABLE();
+    }
+
+    // The transfer algorithm is the exception to the rule. It ignores the domain
+    // of the predecessor, and dispatches based on the domain of the scheduler
+    // to which execution is being transferred.
+    template <sender_expr_for<transfer_t> _Sender, class _Env>
+    auto operator()(const _Sender& __sndr, const _Env&) const noexcept {
+      return __sexpr_apply(__sndr, [](__ignore, auto& __data, __ignore) noexcept {
+        auto __sched = get_completion_scheduler<set_value_t>(__data);
+        return query_or(get_domain, __sched, default_domain());
+      });
+    }
+  } __get_late_domain{};
+
+  template <class _Sender, class _Env>
+  using __late_domain_of_t = __call_result_t<__get_late_domain_t, _Sender, _Env>;
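And the flip side: how a scheduler advertises its domain so that the early/late queries above can find it. Everything here is illustrative, not part of the change:

```cpp
struct gpu_domain { };

struct gpu_scheduler {
  // Found by get_domain(get_scheduler(env)) in __get_late_domain, and by the
  // completion-scheduler probe in __detail for __get_early_domain.
  friend gpu_domain tag_invoke(stdexec::get_domain_t, const gpu_scheduler&) noexcept {
    return {};
  }
  // ... the rest of the scheduler machinery elided ...
};
```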
+
+  namespace __domain {
+    struct __common_domain_fn {
+      static default_domain __common_domain() noexcept {
+        return {};
+      }
+
+      template <class _Domain, class... _OtherDomains>
+        requires __all_of<_Domain, _OtherDomains...>
+      static _Domain __common_domain(_Domain __domain, _OtherDomains...) noexcept {
+        return (_Domain&&) __domain;
+      }
+
+      template <class... _Domains>
+      static auto __common_domain(_Domains...) noexcept //
+        -> __if_c<__one_of<dependent_domain, _Domains...>, dependent_domain, __none_such> {
+        return {};
+      }
+
+      auto operator()(__ignore, __ignore, const auto&... __sndrs) const noexcept {
+        return __common_domain(__get_early_domain(__sndrs)...);
+      }
+    };
+
+    template <class... _Senders>
+    using __common_domain_t = //
+      __call_result_t<__common_domain_fn, int, int, _Senders...>;
+
+    template <class... _Senders>
+    concept __has_common_domain = //
+      __none_of<__none_such, __common_domain_t<_Senders...>>;
+  } // namespace __domain
+} // namespace stdexec
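`__has_common_domain` is the constraint that multi-child algorithms place on their children: children that all report the same domain unify to it; if they differ, the result is `dependent_domain` when any child's domain is dependent, and `__none_such` (a constraint failure) otherwise. A hedged restatement of the intent, as a declaration only (the adaptor name is hypothetical):

```cpp
template <stdexec::sender... _Senders>
  requires stdexec::__domain::__has_common_domain<_Senders...>
auto my_when_all(_Senders&&... __sndrs);
```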
diff --git a/include/stdexec/__detail/__env.hpp b/include/stdexec/__detail/__env.hpp
new file mode 100644
index 000000000..ed4bca4f3
--- /dev/null
+++ b/include/stdexec/__detail/__env.hpp
@@ -0,0 +1,487 @@
+/*
+ * Copyright (c) 2021-2023 NVIDIA Corporation
+ *
+ * Licensed under the Apache License Version 2.0 with LLVM Exceptions
+ * (the "License"); you may not use this file except in compliance with
+ * the License. You may obtain a copy of the License at
+ *
+ *   https://llvm.org/LICENSE.txt
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#pragma once
+
+#include <type_traits>
+
+#include "__execution_fwd.hpp"
+
+#include "__concepts.hpp"
+
+#include "../functional.hpp"
+#include "../stop_token.hpp"
+
+STDEXEC_PRAGMA_PUSH()
+STDEXEC_PRAGMA_IGNORE_EDG(probable_guiding_friend)
+STDEXEC_PRAGMA_IGNORE_EDG(type_qualifiers_ignored_on_reference)
+
+namespace stdexec {
+  // [exec.queries.queryable]
+  template <class T>
+  concept queryable = destructible<T>;
+
+  template <class Tag>
+  struct __query {
+    template <class Sig>
+    static inline constexpr Tag (*signature)(Sig) = nullptr;
+  };
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////
+  // [exec.queries]
+  namespace __queries {
+    struct forwarding_query_t {
+      template <class _Query>
+      constexpr bool operator()(_Query __query) const noexcept {
+        if constexpr (tag_invocable<forwarding_query_t, _Query>) {
+          return tag_invoke(*this, (_Query&&) __query);
+        } else if constexpr (std::derived_from<_Query, forwarding_query_t>) {
+          return true;
+        } else {
+          return false;
+        }
+      }
+    };
+
+    struct query_or_t {
+      template <class _Query, class _Queryable, class _Default>
+      constexpr auto operator()(_Query, _Queryable&&, _Default&& __default) const
+        noexcept(__nothrow_constructible_from<_Default, _Default&&>) -> _Default {
+        return (_Default&&) __default;
+      }
+
+      template <class _Query, class _Queryable, class _Default>
+        requires __callable<_Query, _Queryable>
+      constexpr auto operator()(_Query __query, _Queryable&& __queryable, _Default&&) const
+        noexcept(__nothrow_callable<_Query, _Queryable>) -> __call_result_t<_Query, _Queryable> {
+        return ((_Query&&) __query)((_Queryable&&) __queryable);
+      }
+    };
+
+    struct execute_may_block_caller_t : __query<execute_may_block_caller_t> {
+      template <class _Tp>
+        requires tag_invocable<execute_may_block_caller_t, __cref_t<_Tp>>
+      constexpr bool operator()(_Tp&& __t) const noexcept {
+        static_assert(
+          same_as<bool, tag_invoke_result_t<execute_may_block_caller_t, __cref_t<_Tp>>>);
+        static_assert(nothrow_tag_invocable<execute_may_block_caller_t, __cref_t<_Tp>>);
+        return tag_invoke(execute_may_block_caller_t{}, std::as_const(__t));
+      }
+
+      constexpr bool operator()(auto&&) const noexcept {
+        return true;
+      }
+    };
+
+    struct get_forward_progress_guarantee_t : __query<get_forward_progress_guarantee_t> {
+      template <class _Tp>
+        requires tag_invocable<get_forward_progress_guarantee_t, __cref_t<_Tp>>
+      constexpr auto operator()(_Tp&& __t) const
+        noexcept(nothrow_tag_invocable<get_forward_progress_guarantee_t, __cref_t<_Tp>>)
+          -> tag_invoke_result_t<get_forward_progress_guarantee_t, __cref_t<_Tp>> {
+        return tag_invoke(get_forward_progress_guarantee_t{}, std::as_const(__t));
+      }
+
+      constexpr stdexec::forward_progress_guarantee operator()(auto&&) const noexcept {
+        return stdexec::forward_progress_guarantee::weakly_parallel;
+      }
+    };
+
+    struct __has_algorithm_customizations_t : __query<__has_algorithm_customizations_t> {
+      template <class _Tp>
+      using __result_t = tag_invoke_result_t<__has_algorithm_customizations_t, __cref_t<_Tp>>;
+
+      template <class _Tp>
+        requires tag_invocable<__has_algorithm_customizations_t, __cref_t<_Tp>>
+      constexpr __result_t<_Tp> operator()(_Tp&&) const noexcept(noexcept(__result_t<_Tp>{})) {
+        using _Boolean = tag_invoke_result_t<__has_algorithm_customizations_t, __cref_t<_Tp>>;
+        static_assert(_Boolean{} ? true : true); // must be contextually convertible to bool
+        return _Boolean{};
+      }
+
+      constexpr std::false_type operator()(auto&&) const noexcept {
+        return {};
+      }
+    };
+
+    // TODO: implement allocator concept
+    template <class _Tp>
+    concept __allocator_c = true;
+
+    struct get_scheduler_t : __query<get_scheduler_t> {
+      friend constexpr bool tag_invoke(forwarding_query_t, const get_scheduler_t&) noexcept {
+        return true;
+      }
+
+      template <class _Env>
+        requires tag_invocable<get_scheduler_t, const _Env&>
+      auto operator()(const _Env& __env) const noexcept
+        -> tag_invoke_result_t<get_scheduler_t, const _Env&>;
+
+      auto operator()() const noexcept;
+    };
+
+    struct get_delegatee_scheduler_t : __query<get_delegatee_scheduler_t> {
+      friend constexpr bool
+        tag_invoke(forwarding_query_t, const get_delegatee_scheduler_t&) noexcept {
+        return true;
+      }
+
+      template <class _Env>
+        requires tag_invocable<get_delegatee_scheduler_t, const _Env&>
+      auto operator()(const _Env& __t) const noexcept
+        -> tag_invoke_result_t<get_delegatee_scheduler_t, const _Env&>;
+
+      auto operator()() const noexcept;
+    };
+
+    struct get_allocator_t : __query<get_allocator_t> {
+      friend constexpr bool tag_invoke(forwarding_query_t, const get_allocator_t&) noexcept {
+        return true;
+      }
+
+      template <class _Env>
+        requires tag_invocable<get_allocator_t, const _Env&>
+      auto operator()(const _Env& __env) const noexcept
+        -> tag_invoke_result_t<get_allocator_t, const _Env&> {
+        static_assert(nothrow_tag_invocable<get_allocator_t, const _Env&>);
+        static_assert(__allocator_c<tag_invoke_result_t<get_allocator_t, const _Env&>>);
+        return tag_invoke(get_allocator_t{}, __env);
+      }
+
+      auto operator()() const noexcept;
+    };
+
+    struct get_stop_token_t : __query<get_stop_token_t> {
+      friend constexpr bool tag_invoke(forwarding_query_t, const get_stop_token_t&) noexcept {
+        return true;
+      }
+
+      template <class _Env>
+      never_stop_token operator()(const _Env&) const noexcept {
+        return {};
+      }
+
+      template <class _Env>
+        requires tag_invocable<get_stop_token_t, const _Env&>
+      auto operator()(const _Env& __env) const noexcept
+        -> tag_invoke_result_t<get_stop_token_t, const _Env&> {
+        static_assert(nothrow_tag_invocable<get_stop_token_t, const _Env&>);
+        static_assert(stoppable_token<tag_invoke_result_t<get_stop_token_t, const _Env&>>);
+        return tag_invoke(get_stop_token_t{}, __env);
+      }
+
+      auto operator()() const noexcept;
+    };
+
+    template <class _Queryable, class _CPO>
+    concept __has_completion_scheduler_for =
+      queryable<_Queryable> && //
+      tag_invocable<get_completion_scheduler_t<_CPO>, const _Queryable&>;
+
+    template <__completion_tag _CPO>
+    struct get_completion_scheduler_t : __query<get_completion_scheduler_t<_CPO>> {
+      friend constexpr bool
+        tag_invoke(forwarding_query_t, const get_completion_scheduler_t<_CPO>&) noexcept {
+        return true;
+      }
+
+      template <__has_completion_scheduler_for<_CPO> _Queryable>
+      auto operator()(const _Queryable& __queryable) const noexcept
+        -> tag_invoke_result_t<get_completion_scheduler_t<_CPO>, const _Queryable&>;
+    };
+
+    struct get_domain_t {
+      template <class _Ty>
+        requires tag_invocable<get_domain_t, const _Ty&>
+      constexpr auto operator()(const _Ty& __ty) const noexcept
+        -> tag_invoke_result_t<get_domain_t, const _Ty&> {
+        static_assert(
+          nothrow_tag_invocable<get_domain_t, const _Ty&>,
+          "Customizations of get_domain must be noexcept.");
+        static_assert(
+          __class<tag_invoke_result_t<get_domain_t, const _Ty&>>,
+          "Customizations of get_domain must return a class type.");
+        return tag_invoke(get_domain_t{}, __ty);
+      }
+
+      friend constexpr bool tag_invoke(forwarding_query_t, get_domain_t) noexcept {
+        return true;
+      }
+    };
+  } // namespace __queries
+
+  using __queries::forwarding_query_t;
+  using __queries::query_or_t;
+  using __queries::execute_may_block_caller_t;
+  using __queries::__has_algorithm_customizations_t;
+  using __queries::get_forward_progress_guarantee_t;
+  using __queries::get_allocator_t;
+  using __queries::get_scheduler_t;
+  using __queries::get_delegatee_scheduler_t;
+  using __queries::get_stop_token_t;
+  using __queries::get_completion_scheduler_t;
+  using __queries::get_domain_t;
+
+  inline constexpr forwarding_query_t forwarding_query{};
+  inline constexpr query_or_t query_or{}; // NOT TO SPEC
+  inline constexpr execute_may_block_caller_t execute_may_block_caller{};
+  inline constexpr __has_algorithm_customizations_t __has_algorithm_customizations{};
+  inline constexpr get_forward_progress_guarantee_t get_forward_progress_guarantee{};
+  inline constexpr get_scheduler_t get_scheduler{};
+  inline constexpr get_delegatee_scheduler_t get_delegatee_scheduler{};
+  inline constexpr get_allocator_t get_allocator{};
+  inline constexpr get_stop_token_t get_stop_token{};
+#if !STDEXEC_GCC() || defined(__OPTIMIZE_SIZE__)
+  template <__completion_tag _CPO>
+  inline constexpr get_completion_scheduler_t<_CPO> get_completion_scheduler{};
+#else
+  template <>
+  inline constexpr get_completion_scheduler_t<set_value_t> get_completion_scheduler<set_value_t>{};
+  template <>
+  inline constexpr get_completion_scheduler_t<set_error_t> get_completion_scheduler<set_error_t>{};
+  template <>
+  inline constexpr get_completion_scheduler_t<set_stopped_t>
+    get_completion_scheduler<set_stopped_t>{};
+#endif
+
+  template <class _Tag>
+  concept __forwarding_query = forwarding_query(_Tag{});
+
+  inline constexpr get_domain_t get_domain{};
+
+  template <class _Tag, class _Queryable, class _Default>
+  using __query_result_or_t = __call_result_t<query_or_t, _Tag, _Queryable, _Default>;
+
+  /////////////////////////////////////////////////////////////////////////////
+  // env_of
+  namespace __env {
+    template <class _Descriptor>
+    struct __prop;
+
+    template <class _Value, class... _Tags>
+    struct __prop<_Value(_Tags...)> {
+      using __t = __prop;
+      using __id = __prop;
+      _Value __value_;
+
+      template <__one_of<_Tags...> _Key>
+      friend auto tag_invoke(_Key, const __prop& __self) //
+        noexcept(__nothrow_decay_copyable<_Value>) -> _Value {
+        return __self.__value_;
+      }
+    };
+
+    template <class... _Tags>
+    struct __prop<void(_Tags...)> {
+      using __t = __prop;
+      using __id = __prop;
+
+      template <__one_of<_Tags...> _Key, class _Self>
+        requires(std::is_base_of_v<__prop, __decay_t<_Self>>)
+      friend auto tag_invoke(_Key, _Self&&) noexcept = delete;
+    };
+
+    struct __mkprop_t {
+      template <class _Value, class _Tag, class... _Tags>
+      auto operator()(_Value&& __value, _Tag, _Tags...) const
+        noexcept(__nothrow_decay_copyable<_Value>) -> __prop<__decay_t<_Value>(_Tag, _Tags...)> {
+        return {(_Value&&) __value};
+      }
+
+      template <class _Tag>
+      auto operator()(_Tag) const -> __prop<void(_Tag)> {
+        return {};
+      }
+    };
+
+    template <__nothrow_move_constructible _Fun>
+    struct __env_fn {
+      using __t = __env_fn;
+      using __id = __env_fn;
+      STDEXEC_ATTRIBUTE((no_unique_address)) _Fun __fun_;
+
+      template <class _Tag>
+        requires __callable<const _Fun&, _Tag>
+      friend auto tag_invoke(_Tag, const __env_fn& __self) //
+        noexcept(__nothrow_callable<const _Fun&, _Tag>) -> __call_result_t<const _Fun&, _Tag> {
+        return __self.__fun_(_Tag());
+      }
+    };
+
+    template <class _Fun>
+    __env_fn(_Fun) -> __env_fn<_Fun>;
+
+    template <class _Env>
+    struct __env_fwd {
+      static_assert(__nothrow_move_constructible<_Env>);
+      using __t = __env_fwd;
+      using __id = __env_fwd;
+      STDEXEC_ATTRIBUTE((no_unique_address)) _Env __env_;
+
+      template <__forwarding_query _Tag>
+        requires tag_invocable<_Tag, const _Env&>
+      friend auto tag_invoke(_Tag, const __env_fwd& __self) //
+        noexcept(nothrow_tag_invocable<_Tag, const _Env&>)
+          -> tag_invoke_result_t<_Tag, const _Env&> {
+        return _Tag()(__self.__env_);
+      }
+    };
+
+    template <class _Env>
+    __env_fwd(_Env&&) -> __env_fwd<_Env>;
+
+    template <class _Env, class _Base>
+    struct __joined_env : __env_fwd<_Base> {
+      static_assert(__nothrow_move_constructible<_Env>);
+      using __t = __joined_env;
+      using __id = __joined_env;
+      STDEXEC_ATTRIBUTE((no_unique_address)) _Env __env_;
+
+      const _Base& base() const noexcept {
+        return this->__env_fwd<_Base>::__env_;
+      }
+
+      template <class _Tag>
+        requires tag_invocable<_Tag, const _Env&>
+      friend auto tag_invoke(_Tag, const __joined_env& __self) //
+        noexcept(nothrow_tag_invocable<_Tag, const _Env&>)
+          -> tag_invoke_result_t<_Tag, const _Env&> {
+        return _Tag()(__self.__env_);
+      }
+    };
+
+    template <class _Tag, class _Base>
+    struct __joined_env<__prop<void(_Tag)>, _Base> : __env_fwd<_Base> {
+      using __t = __joined_env;
+      using __id = __joined_env;
+      STDEXEC_ATTRIBUTE((no_unique_address)) __prop<void(_Tag)> __env_;
+
+      friend void tag_invoke(_Tag, const __joined_env&) noexcept = delete;
+    };
+
+    struct __join_env_t {
+      template <class _Env>
+      _Env operator()(_Env&& __env) const noexcept {
+        return (_Env&&) __env;
+      }
+
+      template <class _Env, class _Base>
+      decltype(auto) operator()(_Env&& __env, _Base&& __base) const noexcept {
+        using __env_t = __decay_t<_Env>;
+        using __base_t = __decay_t<_Base>;
+        if constexpr (__same_as<__env_t, empty_env>) {
+          return _Base((_Base&&) __base);
+        } else if constexpr (__same_as<__base_t, empty_env>) {
+          return _Env((_Env&&) __env);
+        } else {
+          return __joined_env<_Env, _Base>{{(_Base&&) __base}, (_Env&&) __env};
+        }
+      }
+
+      template <class _Env0, class _Env1, class _Env2, class... _Envs>
+      decltype(auto) operator()(_Env0&& __env0, _Env1&& __env1, _Env2&& __env2, _Envs&&... __envs)
+        const noexcept {
+        const auto& __join_env = *this;
+        return __join_env(
+          (_Env0&&) __env0,
+          __join_env((_Env1&&) __env1, __join_env((_Env2&&) __env2, (_Envs&&) __envs...)));
+      }
+    };
+
+    template <class... _Envs>
+    using __env_join_t = __call_result_t<__join_env_t, _Envs...>;
+
+    // To be kept in sync with the promise type used in __connect_awaitable
+    template <class _Env>
+    struct __env_promise {
+      template <class _Ty>
+      _Ty&& await_transform(_Ty&& __value) noexcept {
+        return (_Ty&&) __value;
+      }
+
+      template <class _Ty>
+        requires tag_invocable<as_awaitable_t, _Ty, __env_promise&>
+      auto await_transform(_Ty&& __value) //
+        noexcept(nothrow_tag_invocable<as_awaitable_t, _Ty, __env_promise&>)
+          -> tag_invoke_result_t<as_awaitable_t, _Ty, __env_promise&> {
+        return tag_invoke(as_awaitable, (_Ty&&) __value, *this);
+      }
+
+      friend auto tag_invoke(get_env_t, const __env_promise&) noexcept -> const _Env&;
+    };
+
+    // For making an environment from key/value pairs and optionally
+    // another environment.
+    struct __make_env_t {
+      template <__nothrow_move_constructible _Base, __nothrow_move_constructible _Env>
+      auto operator()(_Base&& __base, _Env&& __env) const noexcept -> __env_join_t<_Env, _Base> {
+        return __join_env_t()((_Env&&) __env, (_Base&&) __base);
+      }
+
+      template <__nothrow_move_constructible _Env>
+      _Env operator()(_Env&& __env) const noexcept {
+        return (_Env&&) __env;
+      }
+    };
+
+    // For getting an evaluation environment from a receiver
+    struct get_env_t {
+      template <class _EnvProvider>
+        requires tag_invocable<get_env_t, const _EnvProvider&>
+      STDEXEC_ATTRIBUTE((always_inline)) //
+      constexpr auto
+        operator()(const _EnvProvider& __with_env) const noexcept
+        -> tag_invoke_result_t<get_env_t, const _EnvProvider&> {
+        static_assert(queryable<tag_invoke_result_t<get_env_t, const _EnvProvider&>>);
+        static_assert(nothrow_tag_invocable<get_env_t, const _EnvProvider&>);
+        return tag_invoke(*this, __with_env);
+      }
+
+      template <class _EnvProvider>
+      constexpr empty_env operator()(const _EnvProvider&) const noexcept {
+        return {};
+      }
+    };
+  } // namespace __env
+
+  using __env::empty_env;
+  using __empty_env [[deprecated("Please use stdexec::empty_env now.")]] = empty_env;
+
+  using __env::__env_promise;
+
+  inline constexpr __env::__make_env_t __make_env{};
+  inline constexpr __env::__join_env_t __join_env{};
+  inline constexpr __env::get_env_t get_env{};
+
+  // for making an environment from a single key/value pair
+  inline constexpr __env::__mkprop_t __mkprop{};
+
+  template <class _Tag, class _Value>
+  using __with = __env::__prop<_Value(_Tag)>;
+
+  template <class... _Ts>
+  using __make_env_t = __call_result_t<__env::__make_env_t, _Ts...>;
+
+  using __default_env = empty_env;
+
+  template <class _EnvProvider>
+  concept environment_provider = //
+    requires(_EnvProvider& __ep) {
+      { get_env(std::as_const(__ep)) } -> queryable;
+    };
+} // namespace stdexec
+
+STDEXEC_PRAGMA_POP()
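A hedged usage sketch of the new environment helpers. These are internal (double-underscore) utilities, so the exact spellings may shift, but the shapes follow directly from the hunks above (note `__mkprop` takes the value first, then the tag):

```cpp
#include <stdexec/execution.hpp>
#include <memory>

int main() {
  using namespace stdexec;

  std::allocator<std::byte> alloc;

  // A one-entry environment answering the get_allocator query:
  auto prop = __mkprop(alloc, get_allocator);

  // Layer it over a base environment; the overlay wins on conflicting queries.
  auto env = __make_env(empty_env{}, prop);

  auto a = get_allocator(env);                           // the allocator we stored
  auto d = query_or(get_domain, env, default_domain());  // falls back: no domain set
  (void) a, (void) d;
}
```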
+  //////////////////////////////////////////////////////////////////////////////////////////////////
+  namespace __split {
+    struct split_t;
+    struct __split_t;
+  }
+
+  using __split::split_t;
+  extern const split_t split;
+
+  //////////////////////////////////////////////////////////////////////////////////////////////////
+  namespace __ensure_started {
+    struct ensure_started_t;
+    struct __ensure_started_t;
+  }
+
+  using __ensure_started::ensure_started_t;
+  extern const ensure_started_t ensure_started;
 
   //////////////////////////////////////////////////////////////////////////////////////////////////
   namespace __on_v2 {
@@ -179,4 +217,17 @@
   namespace v2 {
     using __on_v2::on_t;
   }
+
+  namespace __detail {
+    struct __sexpr_apply_t;
+  }
+
+  using __detail::__sexpr_apply_t;
+  extern const __sexpr_apply_t __sexpr_apply;
 }
+
+template <class... _Ts>
+[[deprecated]] void print() {}
+
+template <class _Ty>
+struct undef;
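The two file-scope helpers appended above are compile-time debugging aids rather than library API; a sketch of how they get used during development (the `mystery` value is illustrative):

```cpp
#include <stdexec/execution.hpp>

auto mystery = stdexec::just(42);

void debug() {
  // Non-fatal: the -Wdeprecated diagnostic prints the template arguments.
  print<decltype(mystery)>();

  // Fatal on purpose: instantiating the incomplete `undef` spells the full
  // type out in the error message. Uncomment to use:
  // undef<decltype(mystery)> force_error;
}
```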
diff --git a/include/stdexec/__detail/__intrusive_ptr.hpp b/include/stdexec/__detail/__intrusive_ptr.hpp
index 552d49a6a..b92480679 100644
--- a/include/stdexec/__detail/__intrusive_ptr.hpp
+++ b/include/stdexec/__detail/__intrusive_ptr.hpp
@@ -22,13 +22,27 @@
 #include <atomic>
 #include <memory>
 
+#if STDEXEC_TSAN()
+#include <sanitizer/tsan_interface.h>
+#endif
+
 namespace stdexec {
   namespace __ptr {
     template <class _Ty>
     struct __make_intrusive_t;
 
     template <class _Ty>
-    struct __enable_intrusive_from_this;
+    class __intrusive_ptr;
+
+    template <class _Ty>
+    struct __enable_intrusive_from_this {
+      __intrusive_ptr<_Ty> __intrusive_from_this() noexcept;
+      __intrusive_ptr<const _Ty> __intrusive_from_this() const noexcept;
+
+     private:
+      friend _Ty;
+      void __inc_ref() noexcept;
+      void __dec_ref() noexcept;
+    };
 
     template <class _Ty>
     struct __control_block {
@@ -51,6 +65,24 @@
       _Ty& __value() const noexcept {
         return *(_Ty*) __value_;
       }
+
+      void __inc_ref_() noexcept {
+        __refcount_.fetch_add(1, std::memory_order_relaxed);
+      }
+
+      STDEXEC_PRAGMA_PUSH()
+      STDEXEC_PRAGMA_IGNORE_GNU("-Wtsan")
+
+      void __dec_ref_() noexcept {
+        if (1u == __refcount_.fetch_sub(1, std::memory_order_release)) {
+          std::atomic_thread_fence(std::memory_order_acquire);
+          // TSan does not support std::atomic_thread_fence, so we
+          // need to use the TSan-specific __tsan_acquire instead:
+          STDEXEC_TSAN(__tsan_acquire(&__refcount_));
+          delete this;
+        }
+      }
+      STDEXEC_PRAGMA_POP()
     };
 
     template <class _Ty>
@@ -65,20 +97,21 @@
         : __data_(__data) {
       }
 
-      void __addref_() noexcept {
+      void __inc_ref_() noexcept {
         if (__data_) {
-          __data_->__refcount_.fetch_add(1, std::memory_order_relaxed);
+          __data_->__inc_ref_();
         }
       }
 
-      void __release_() noexcept {
-        if (__data_ && 1u == __data_->__refcount_.fetch_sub(1, std::memory_order_release)) {
-          std::atomic_thread_fence(std::memory_order_acquire);
-          delete __data_;
+      void __dec_ref_() noexcept {
+        if (__data_) {
+          __data_->__dec_ref_();
         }
       }
 
     public:
+      using element_type = _Ty;
+
       __intrusive_ptr() = default;
 
       __intrusive_ptr(__intrusive_ptr&& __that) noexcept
@@ -87,7 +120,11 @@
 
       __intrusive_ptr(const __intrusive_ptr& __that) noexcept
         : __data_(__that.__data_) {
-        __addref_();
+        __inc_ref_();
+      }
+
+      __intrusive_ptr(__enable_intrusive_from_this<_Ty>* __that) noexcept
+        : __intrusive_ptr(__that ? __that->__intrusive_from_this() : __intrusive_ptr()) {
       }
 
       __intrusive_ptr& operator=(__intrusive_ptr&& __that) noexcept {
@@ -100,8 +137,12 @@
         return operator=(__intrusive_ptr(__that));
       }
 
+      __intrusive_ptr& operator=(__enable_intrusive_from_this<_Ty>* __that) noexcept {
+        return operator=(__that ? __that->__intrusive_from_this() : __intrusive_ptr());
+      }
+
       ~__intrusive_ptr() {
-        __release_();
+        __dec_ref_();
       }
 
       void reset() noexcept {
@@ -140,23 +181,31 @@
     };
 
     template <class _Ty>
-    struct __enable_intrusive_from_this {
-      __intrusive_ptr<_Ty> __intrusive_from_this() noexcept {
-        static_assert(0 == offsetof(__control_block<_Ty>, __value_));
-        _Ty* __this = static_cast<_Ty*>(this);
-        __intrusive_ptr<_Ty> __p{(__control_block<_Ty>*) __this};
-        __p.__addref_();
-        return __p;
-      }
+    __intrusive_ptr<_Ty> __enable_intrusive_from_this<_Ty>::__intrusive_from_this() noexcept {
+      auto* __data = (__control_block<_Ty>*) static_cast<_Ty*>(this);
+      __data->__inc_ref_();
+      return __intrusive_ptr<_Ty>{__data};
+    }
 
-      __intrusive_ptr<const _Ty> __intrusive_from_this() const noexcept {
-        static_assert(0 == offsetof(__control_block<_Ty>, __value_));
-        const _Ty* __this = static_cast<const _Ty*>(this);
-        __intrusive_ptr<const _Ty> __p{(__control_block<_Ty>*) __this};
-        __p.__addref_();
-        return __p;
-      }
-    };
+    template <class _Ty>
+    __intrusive_ptr<const _Ty>
+      __enable_intrusive_from_this<_Ty>::__intrusive_from_this() const noexcept {
+      auto* __data = (__control_block<_Ty>*) static_cast<const _Ty*>(this);
+      __data->__inc_ref_();
+      return __intrusive_ptr<const _Ty>{__data};
+    }
+
+    template <class _Ty>
+    void __enable_intrusive_from_this<_Ty>::__inc_ref() noexcept {
+      auto* __data = (__control_block<_Ty>*) static_cast<_Ty*>(this);
+      __data->__inc_ref_();
+    }
+
+    template <class _Ty>
+    void __enable_intrusive_from_this<_Ty>::__dec_ref() noexcept {
+      auto* __data = (__control_block<_Ty>*) static_cast<_Ty*>(this);
+      __data->__dec_ref_();
+    }
 
     template <class _Ty>
     struct __make_intrusive_t {
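A hedged sketch of the pattern the reworked `__enable_intrusive_from_this` supports; `my_state` is invented, and the `__make_intrusive` factory is assumed from the (elided) remainder of this file:

```cpp
struct my_state : stdexec::__ptr::__enable_intrusive_from_this<my_state> {
  void start() {
    // Mint an extra owning reference straight from `this`; the object lives
    // in a __control_block, so the cast inside __intrusive_from_this is valid.
    stdexec::__ptr::__intrusive_ptr<my_state> __self = __intrusive_from_this();
    // ... hand __self to another execution context ...
  }
};

// Assumed factory from the rest of this header:
// auto __p = stdexec::__make_intrusive<my_state>();
// __p->start();
```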
diff --git a/include/stdexec/__detail/__intrusive_queue.hpp b/include/stdexec/__detail/__intrusive_queue.hpp
index 698e8417b..44cbcf8c7 100644
--- a/include/stdexec/__detail/__intrusive_queue.hpp
+++ b/include/stdexec/__detail/__intrusive_queue.hpp
@@ -37,6 +37,11 @@
         , __tail_(std::exchange(__other.__tail_, nullptr)) {
       }
 
+      __intrusive_queue(_Item* __head, _Item* __tail) noexcept
+        : __head_(__head)
+        , __tail_(__tail) {
+      }
+
       __intrusive_queue& operator=(__intrusive_queue __other) noexcept {
         std::swap(__head_, __other.__head_);
         std::swap(__tail_, __other.__tail_);
@@ -63,10 +68,28 @@
         return __result;
       }
 
+      static __intrusive_queue make(_Item* __list) noexcept {
+        __intrusive_queue __result{};
+        __result.__head_ = __list;
+        __result.__tail_ = __list;
+        if (__list == nullptr) {
+          return __result;
+        }
+        while (__result.__tail_->*_Next != nullptr) {
+          __result.__tail_ = __result.__tail_->*_Next;
+        }
+        return __result;
+      }
+
       [[nodiscard]] bool empty() const noexcept {
         return __head_ == nullptr;
       }
 
+      void clear() noexcept {
+        __head_ = nullptr;
+        __tail_ = nullptr;
+      }
+
       [[nodiscard]] _Item* pop_front() noexcept {
         STDEXEC_ASSERT(!empty());
         _Item* __item = std::exchange(__head_, __head_->*_Next);
@@ -125,6 +148,90 @@
         __other.__head_ = nullptr;
       }
 
+      struct iterator {
+        using difference_type = std::ptrdiff_t;
+        using value_type = _Item*;
+
+        _Item* __predecessor_ = nullptr;
+        _Item* __item_ = nullptr;
+
+        iterator() noexcept = default;
+
+        explicit iterator(_Item* __pred, _Item* __item) noexcept
+          : __predecessor_(__pred)
+          , __item_(__item) {
+        }
+
+        [[nodiscard]] _Item* operator*() const noexcept {
+          STDEXEC_ASSERT(__item_ != nullptr);
+          return __item_;
+        }
+
+        [[nodiscard]] _Item** operator->() const noexcept {
+          STDEXEC_ASSERT(__item_ != nullptr);
+          return &__item_;
+        }
+
+        iterator& operator++() noexcept {
+          __predecessor_ = __item_;
+          if (__item_) {
+            __item_ = __item_->*_Next;
+          }
+          return *this;
+        }
+
+        iterator operator++(int) noexcept {
+          iterator __result = *this;
+          ++*this;
+          return __result;
+        }
+
+        friend bool operator==(const iterator&, const iterator&) noexcept = default;
+      };
+
+      [[nodiscard]] iterator begin() const noexcept {
+        return iterator(nullptr, __head_);
+      }
+
+      [[nodiscard]] iterator end() const noexcept {
+        return iterator(__tail_, nullptr);
+      }
+
+      void splice(iterator pos, __intrusive_queue& other, iterator first, iterator last) noexcept {
+        if (first == last) {
+          return;
+        }
+        STDEXEC_ASSERT(first.__item_ != nullptr);
+        STDEXEC_ASSERT(last.__predecessor_ != nullptr);
+        if (other.__head_ == first.__item_) {
+          other.__head_ = last.__item_;
+          if (other.__head_ == nullptr) {
+            other.__tail_ = nullptr;
+          }
+        } else {
+          STDEXEC_ASSERT(first.__predecessor_ != nullptr);
+          first.__predecessor_->*_Next = last.__item_;
+          last.__predecessor_->*_Next = pos.__item_;
+        }
+        if (empty()) {
+          __head_ = first.__item_;
+          __tail_ = last.__predecessor_;
+        } else {
+          pos.__predecessor_->*_Next = first.__item_;
+          if (pos.__item_ == nullptr) {
+            __tail_ = last.__predecessor_;
+          }
+        }
+      }
+
+      _Item* front() const noexcept {
+        return __head_;
+      }
+
+      _Item* back() const noexcept {
+        return __tail_;
+      }
+
     private:
       _Item* __head_ = nullptr;
       _Item* __tail_ = nullptr;
diff --git a/include/stdexec/__detail/__meta.hpp b/include/stdexec/__detail/__meta.hpp
index 55b4c95a6..b757f91a5 100644
--- a/include/stdexec/__detail/__meta.hpp
+++ b/include/stdexec/__detail/__meta.hpp
@@ -33,6 +33,7 @@
 namespace stdexec {
   struct __ignore {
     __ignore() = default;
+    STDEXEC_ATTRIBUTE((always_inline)) //
     constexpr __ignore(auto&&...) noexcept {
     }
   };
@@ -78,6 +79,12 @@
   template <std::size_t _Np>
   using __msize_t = char[_Np + 1];
 
+  template <auto _Np>
+  struct __mconstant_;
+
+  template <auto _Np>
+  using __mconstant = __mconstant_<_Np>*;
+
   template <class _Tp, class _Up>
   using __mfirst = _Tp;
 
@@ -100,6 +107,9 @@
   template <class _Tp, _Tp _Ip>
   inline constexpr _Tp __v<std::integral_constant<_Tp, _Ip>> = _Ip;
 
+  template <auto _Np>
+  inline constexpr __mtypeof<_Np> __v<__mconstant<_Np>> = _Np;
+
   template <std::size_t _Ip>
   inline constexpr std::size_t __v<__msize_t<_Ip>> = _Ip - 1;
 
@@ -326,6 +336,12 @@
   template <class _Fn, class... _Args>
   struct __mdefer : __mdefer_<_Fn, _Args...> { };
 
+  template <class _Fn, class... _Args>
+  using __mmemoize = __t<__mdefer<_Fn, _Args...>>;
+
+  template