From a52be1b9a2508c07465b081031df9fed49a9162c Mon Sep 17 00:00:00 2001
From: Matthew Michel <106704043+mmichel11@users.noreply.github.com>
Date: Thu, 6 Feb 2025 08:12:15 -0600
Subject: [PATCH] Implement alternative `-O0` workaround for reduce-then-scan
 with compiler optimization detection (#2046)

Reverts https://github.com/uxlfoundation/oneDPL/pull/2040 and instead uses the
`__OPTIMIZE__` macro defined by GCC and clang-based compilers to detect -O0
compilation and compile reduce-then-scan paths with a sub-group size of 16 to
work around hardware bugs on older integrated graphics architectures. This
avoids the performance impact of the kernel bundle approach.

---------

Signed-off-by: Matthew Michel
---
 .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 202 +++++++-----------
 .../parallel_backend_sycl_reduce_then_scan.h  |  94 ++++----
 .../dpcpp/parallel_backend_sycl_utils.h       |  21 --
 .../oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h  |   6 -
 include/oneapi/dpl/pstl/onedpl_config.h       |   9 +
 5 files changed, 138 insertions(+), 194 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index be38682049e..ef9b6dcbd19 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -1057,12 +1057,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen
     // work-group implementation requires a fundamental type which must also be trivially copyable.
     if constexpr (std::is_trivially_copyable_v<_Type>)
     {
-        bool __use_reduce_then_scan =
-#if _ONEDPL_COMPILE_KERNEL
-            oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec);
-#else
-            false;
-#endif
+        bool __use_reduce_then_scan = oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec);

         // TODO: Consider re-implementing single group scan to support types without known identities. This could also
         // allow us to use single wg scan for the last block of reduce-then-scan if it is sufficiently small.
@@ -1081,7 +1076,6 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen
                                                   std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{});
             }
         }
-#if _ONEDPL_COMPILE_KERNEL
         if (__use_reduce_then_scan)
         {
             using _GenInput = oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation>;
@@ -1089,19 +1083,12 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen
             using _WriteOp = oneapi::dpl::__par_backend_hetero::__simple_write_to_id;

             _GenInput __gen_transform{__unary_op};
-            try
-            {
-                return __parallel_transform_reduce_then_scan(__backend_tag, __exec, __in_rng, __out_rng,
-                                                             __gen_transform, __binary_op, __gen_transform,
-                                                             _ScanInputTransform{}, _WriteOp{}, __init, _Inclusive{},
-                                                             /*_IsUniquePattern=*/std::false_type{});
-            }
-            catch (const sycl::exception& __e)
-            {
-                __bypass_sycl_kernel_not_supported(__e);
-            }
+
+            return __parallel_transform_reduce_then_scan(
+                __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
+                std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{},
+                _WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{});
         }
-#endif
     }
     //else use multi pass scan implementation
@@ -1176,7 +1163,6 @@ struct __invoke_single_group_copy_if
     }
 };

-#if _ONEDPL_COMPILE_KERNEL
 template
 auto
@@ -1195,7 +1181,6 @@ __parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag _
                                              _ScanInputTransform{}, __write_op,
                                              oneapi::dpl::unseq_backend::__no_init_value<_Size>{},
                                              /*_Inclusive=*/std::true_type{}, __is_unique_pattern);
 }
-#endif

 template
@@ -1248,36 +1233,32 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_t
     // can simply copy the input range to the output.
     assert(__n > 1);
-#if _ONEDPL_COMPILE_KERNEL
-    if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
+    if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec))
     {
         using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>;
         using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<1, _Assign>;
-        try
-        {
-            return __parallel_reduce_then_scan_copy(__backend_tag, __exec, __rng, __result, __n, _GenMask{__pred},
-                                                    _WriteOp{_Assign{}},
-                                                    /*_IsUniquePattern=*/std::true_type{});
-        }
-        catch (const sycl::exception& __e)
-        {
-            __bypass_sycl_kernel_not_supported(__e);
-        }
+
+        return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
+                                                std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n,
+                                                _GenMask{__pred}, _WriteOp{_Assign{}},
+                                                /*_IsUniquePattern=*/std::true_type{});
+    }
+    else
+    {
+        using _ReduceOp = std::plus<decltype(__n)>;
+        using _CreateOp =
+            oneapi::dpl::__internal::__create_mask_unique_copy<oneapi::dpl::__internal::__not_pred<_BinaryPredicate>,
+                                                               decltype(__n)>;
+        using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>;
+
+        return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
+                                    std::forward<_Range2>(__result), __n,
+                                    _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}},
+                                    _CopyOp{_ReduceOp{}, _Assign{}});
     }
-#endif
-    using _ReduceOp = std::plus<decltype(__n)>;
-    using _CreateOp =
-        oneapi::dpl::__internal::__create_mask_unique_copy<oneapi::dpl::__internal::__not_pred<_BinaryPredicate>,
-                                                           decltype(__n)>;
-    using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>;
-
-    return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
-                                std::forward<_Range2>(__result), __n,
-                                _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}},
-                                _CopyOp{_ReduceOp{}, _Assign{}});
 }

-#if _ONEDPL_COMPILE_KERNEL
 template
 auto
@@ -1302,14 +1283,13 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_
     assert(__n > 1);
     return __parallel_transform_reduce_then_scan(
         __backend_tag, std::forward<_ExecutionPolicy>(__exec),
-        oneapi::dpl::__ranges::zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)),
-        oneapi::dpl::__ranges::zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)),
+        oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)),
+        oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)),
         _GenReduceInput{__binary_pred}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred, __n},
         _ScanInputTransform{}, _WriteOp{__binary_pred, __n},
         oneapi::dpl::unseq_backend::__no_init_value>{},
         /*Inclusive*/ std::true_type{}, /*_IsUniquePattern=*/std::false_type{});
 }
-#endif

 template
 auto
@@ -1317,30 +1297,25 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backen
                           _Range1&& __rng, _Range2&& __result, _UnaryPredicate __pred)
 {
     oneapi::dpl::__internal::__difference_t<_Range1> __n = __rng.size();
-#if _ONEDPL_COMPILE_KERNEL
-    if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
+    if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec))
    {
         using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>;
         using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if_else;
-        try
-        {
-            return __parallel_reduce_then_scan_copy(__backend_tag, __exec, __rng, __result, __n, _GenMask{__pred},
-                                                    _WriteOp{},
-                                                    /*_IsUniquePattern=*/std::false_type{});
-        }
-        catch (const sycl::exception& __e)
-        {
-            __bypass_sycl_kernel_not_supported(__e);
-        }
+
+        return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
+                                                std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n,
+                                                _GenMask{__pred}, _WriteOp{}, /*_IsUniquePattern=*/std::false_type{});
     }
-#endif
-    using _ReduceOp = std::plus<decltype(__n)>;
-    using _CreateOp = unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)>;
-    using _CopyOp = unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type>;
+    else
+    {
+        using _ReduceOp = std::plus<decltype(__n)>;
+        using _CreateOp = unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)>;
+        using _CopyOp = unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type>;

-    return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
-                                std::forward<_Range2>(__result), __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}});
+        return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
+                                    std::forward<_Range2>(__result), __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}});
+    }
 }

 template
 ;

         return __par_backend_hetero::__static_monotonic_dispatcher<_SizeBreakpoints>::__dispatch(
-            _SingleGroupInvoker{}, __n, __exec, __n, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng),
-            __pred, __assign);
+            _SingleGroupInvoker{}, __n, std::forward<_ExecutionPolicy>(__exec), __n, std::forward<_InRng>(__in_rng),
+            std::forward<_OutRng>(__out_rng), __pred, __assign);
     }
-#if _ONEDPL_COMPILE_KERNEL
-    else if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
+    else if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec))
     {
         using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_Pred>;
         using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<0, _Assign>;
-        try
-        {
-            return __parallel_reduce_then_scan_copy(__backend_tag, __exec, __in_rng, __out_rng, __n, _GenMask{__pred},
-                                                    _WriteOp{__assign},
-                                                    /*_IsUniquePattern=*/std::false_type{});
-        }
-        catch (const sycl::exception& __e)
-        {
-            __bypass_sycl_kernel_not_supported(__e);
-        }
-    }
-#endif
-    using _ReduceOp = std::plus<_Size>;
-    using _CreateOp = unseq_backend::__create_mask<_Pred, _Size>;
-    using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign,
-                                                  /*inclusive*/ std::true_type, 1>;
-    return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng),
-                                std::forward<_OutRng>(__out_rng), __n, _CreateOp{__pred},
-                                _CopyOp{_ReduceOp{}, __assign});
+        return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
+                                                std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n,
+                                                _GenMask{__pred}, _WriteOp{__assign},
+                                                /*_IsUniquePattern=*/std::false_type{});
+    }
+    else
+    {
+        using _ReduceOp = std::plus<_Size>;
+        using _CreateOp = unseq_backend::__create_mask<_Pred, _Size>;
+        using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign,
+                                                      /*inclusive*/ std::true_type, 1>;
+
+        return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
+                                    std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n,
+                                    _CreateOp{__pred}, _CopyOp{_ReduceOp{}, __assign});
+    }
 }

-#if _ONEDPL_COMPILE_KERNEL
 template
 auto
@@ -1427,7 +1397,7 @@ __parallel_set_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __
     return __parallel_transform_reduce_then_scan(
         __backend_tag, std::forward<_ExecutionPolicy>(__exec),
-        oneapi::dpl::__ranges::zip_view(
+        oneapi::dpl::__ranges::make_zip_view(
             std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
             oneapi::dpl::__ranges::all_view(
                 __mask_buf.get_buffer())),
@@ -1436,7 +1406,6 @@ __parallel_set_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __
         _ScanInputTransform{}, _WriteOp{}, oneapi::dpl::unseq_backend::__no_init_value<_Size>{},
         /*_Inclusive=*/std::true_type{}, /*__is_unique_pattern=*/std::false_type{});
 }
-#endif

 template
@@ -1495,23 +1464,18 @@ __parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _
                   _Range1&& __rng1, _Range2&& __rng2, _Range3&& __result, _Compare __comp,
                   _IsOpDifference __is_op_difference)
 {
-#if _ONEDPL_COMPILE_KERNEL
-    if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
+    if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec))
     {
-        try
-        {
-            return __parallel_set_reduce_then_scan(__backend_tag, __exec, __rng1, __rng2, __result, __comp,
-                                                   __is_op_difference);
-        }
-        catch (const sycl::exception& __e)
-        {
-            __bypass_sycl_kernel_not_supported(__e);
-        }
+        return __parallel_set_reduce_then_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
+                                               std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
+                                               std::forward<_Range3>(__result), __comp, __is_op_difference);
+    }
+    else
+    {
+        return __parallel_set_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1),
+                                   std::forward<_Range2>(__rng2), std::forward<_Range3>(__result), __comp,
+                                   __is_op_difference);
     }
-#endif
-    return __parallel_set_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1),
-                               std::forward<_Range2>(__rng2), std::forward<_Range3>(__result), __comp,
-                               __is_op_difference);
 }

 //------------------------------------------------------------------------
@@ -2443,24 +2407,18 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _Exe
     using __val_type = oneapi::dpl::__internal::__value_t<_Range2>;
     // Prior to icpx 2025.0, the reduce-then-scan path performs poorly and should be avoided.
-#if (!defined(__INTEL_LLVM_COMPILER) || __INTEL_LLVM_COMPILER >= 20250000) && _ONEDPL_COMPILE_KERNEL
+#if !defined(__INTEL_LLVM_COMPILER) || __INTEL_LLVM_COMPILER >= 20250000
     if constexpr (std::is_trivially_copyable_v<__val_type>)
     {
-        if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
+        if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec))
         {
-            try
-            {
-                auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan(
-                    oneapi::dpl::__internal::__device_backend_tag{}, __exec, __keys, __values, __out_keys, __out_values,
-                    __binary_pred, __binary_op);
-                // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the
-                // past-the-end iterator pair of segmented reduction.
-                return std::get<0>(__res.get()) + 1;
-            }
-            catch (const sycl::exception& __e)
-            {
-                __bypass_sycl_kernel_not_supported(__e);
-            }
+            auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan(
+                oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec),
+                std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys),
+                std::forward<_Range4>(__out_values), __binary_pred, __binary_op);
+            // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the
+            // past-the-end iterator pair of segmented reduction.
+            return std::get<0>(__res.get()) + 1;
         }
     }
 #endif
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h
index a3c1054a27a..d3b54620fb4 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h
@@ -16,9 +16,6 @@
 #ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_REDUCE_THEN_SCAN_H
 #define _ONEDPL_PARALLEL_BACKEND_SYCL_REDUCE_THEN_SCAN_H

-// Kernel compilation must be supported to properly work around hardware bug on certain iGPUs
-#if _ONEDPL_COMPILE_KERNEL
-
 #include
 #include
 #include
@@ -278,7 +275,14 @@ class __reduce_then_scan_scan_kernel;

 template
-struct __parallel_reduce_then_scan_reduce_submitter
+struct __parallel_reduce_then_scan_reduce_submitter;
+
+template
+struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __is_inclusive,
+                                                    __is_unique_pattern_v, _GenReduceInput, _ReduceOp, _InitType,
+                                                    __internal::__optional_kernel_name<_KernelName...>>
 {
     // Step 1 - SubGroupReduce is expected to perform sub-group reductions to global memory
     // input buffer
@@ -287,7 +291,7 @@ struct __parallel_reduce_then_scan_reduce_submitter
     operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng,
                _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event,
                const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item,
-               const std::size_t __block_num, const sycl::kernel& __reduce_kernel) const
+               const std::size_t __block_num) const
     {
         using _InitValueType = typename _InitType::__value_type;
         return __exec.queue().submit([&, this](sycl::handler& __cgh) {
@@ -296,13 +300,7 @@ struct __parallel_reduce_then_scan_reduce_submitter
             oneapi::dpl::__ranges::__require_access(__cgh, __in_rng);
             auto __temp_acc = __scratch_container.template __get_scratch_acc(
                 __cgh, __dpl_sycl::__no_init{});
-#if _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
-            __cgh.use_kernel_bundle(__reduce_kernel.get_kernel_bundle());
-#endif
-            __cgh.parallel_for<_KernelName>(
-#if !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
-                __reduce_kernel,
-#endif
+            __cgh.parallel_for<_KernelName...>(
                 __nd_range, [=, *this](sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size(__sub_group_size)]] {
                     _InitValueType* __temp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc);
                     std::size_t __group_id = __ndi.get_group(0);
@@ -414,7 +412,14 @@ struct __parallel_reduce_then_scan_reduce_submitter

 template
-struct __parallel_reduce_then_scan_scan_submitter
+struct __parallel_reduce_then_scan_scan_submitter;
+
+template
+struct __parallel_reduce_then_scan_scan_submitter<
+    __sub_group_size, __max_inputs_per_item, __is_inclusive, __is_unique_pattern_v, _ReduceOp, _GenScanInput,
+    _ScanInputTransform, _WriteOp, _InitType, __internal::__optional_kernel_name<_KernelName...>>
 {
     using _InitValueType = typename _InitType::__value_type;
@@ -437,7 +442,7 @@ struct __parallel_reduce_then_scan_scan_submitter
     operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _OutRng&& __out_rng,
                _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event,
                const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item,
-               const std::size_t __block_num, const sycl::kernel& __scan_kernel) const
+               const std::size_t __block_num) const
     {
         std::uint32_t __inputs_in_block = std::min(__n - __block_num * __max_block_size, std::size_t{__max_block_size});
         std::uint32_t __active_groups = oneapi::dpl::__internal::__dpl_ceiling_div(
@@ -452,13 +457,8 @@
             auto __temp_acc = __scratch_container.template __get_scratch_acc(__cgh);
             auto __res_acc = __scratch_container.template __get_result_acc(__cgh, __dpl_sycl::__no_init{});
-#if _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
-            __cgh.use_kernel_bundle(__scan_kernel.get_kernel_bundle());
-#endif
-            __cgh.parallel_for<_KernelName>(
-#if !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
-                __scan_kernel,
-#endif
+
+            __cgh.parallel_for<_KernelName...>(
                 __nd_range, [=, *this] (sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size(__sub_group_size)]] {
                     _InitValueType* __tmp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc);
                     _InitValueType* __res_ptr =
@@ -726,14 +726,28 @@ struct __parallel_reduce_then_scan_scan_submitter
     _InitType __init;
 };

-// reduce_then_scan requires subgroup size of 32, and performs well only on devices with fast coordinated subgroup
-// operations. We do not want to run this scan on CPU targets, as they are not performant with this algorithm.
+// With optimization enabled, reduce-then-scan requires a sub-group size of 32. Without optimization, we must compile
+// to a sub-group size of 16 to work around a hardware bug on certain Intel integrated graphics architectures.
+constexpr inline std::uint8_t
+__get_reduce_then_scan_sg_sz()
+{
+#if _ONEDPL_DETECT_COMPILER_OPTIMIZATIONS_ENABLED
+    return 32;
+#else
+    return 16;
+#endif
+}
+
+// Enable reduce-then-scan if the device supports the required sub-group size and is a GPU-class device
+// with fast coordinated subgroup operations. We do not want to run this scan on CPU targets, as they are not
+// performant with this algorithm.
 template
 bool
-__is_gpu_with_sg_32(const _ExecutionPolicy& __exec)
+__is_gpu_with_reduce_then_scan_sg_sz(const _ExecutionPolicy& __exec)
 {
-    const bool __dev_has_sg32 = oneapi::dpl::__internal::__supports_sub_group_size(__exec, 32);
-    return (__exec.queue().get_device().is_gpu() && __dev_has_sg32);
+    const bool __dev_supports_sg_sz =
+        oneapi::dpl::__internal::__supports_sub_group_size(__exec, __get_reduce_then_scan_sg_sz());
+    return (__exec.queue().get_device().is_gpu() && __dev_supports_sg_sz);
 }

 // General scan-like algorithm helpers
@@ -756,23 +770,14 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_
                                       _ScanInputTransform __scan_input_transform, _WriteOp __write_op,
                                       _InitType __init, _Inclusive, _IsUniquePattern)
 {
-    using _ValueType = typename _InitType::__value_type;
     using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>;
-    // Note that __sub_group_size and __max_inputs_per_item are not included in kernel names. __sub_group_size
-    // is always constant (32) and __max_inputs_per_item is directly tied to the input type so these are not
-    // necessary to obtain a unique kernel name. However, if these compile time variables are adjusted in the
-    // future, then we need to be careful here to ensure unique kernel naming.
-    using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator<
-        __reduce_then_scan_reduce_kernel, _CustomName, _ExecutionPolicy, _InRng, _OutRng, _GenReduceInput, _ReduceOp,
-        _InitType, _Inclusive, _IsUniquePattern>;
-    using _ScanKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator<
-        __reduce_then_scan_scan_kernel, _CustomName, _ExecutionPolicy, _InRng, _OutRng, _GenScanInput, _ReduceOp,
-        _ScanInputTransform, _WriteOp, _InitType, _Inclusive, _IsUniquePattern>;
-    static auto __kernels = __internal::__kernel_compiler<_ReduceKernel, _ScanKernel>::__compile(__exec);
-    const sycl::kernel& __reduce_kernel = __kernels[0];
-    const sycl::kernel& __scan_kernel = __kernels[1];
-
-    constexpr std::uint8_t __sub_group_size = 32;
+    using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
+        __reduce_then_scan_reduce_kernel<_CustomName>>;
+    using _ScanKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
+        __reduce_then_scan_scan_kernel<_CustomName>>;
+    using _ValueType = typename _InitType::__value_type;
+
+    constexpr std::uint8_t __sub_group_size = __get_reduce_then_scan_sg_sz();
     constexpr std::uint8_t __block_size_scale = std::max(std::size_t{1}, sizeof(double) / sizeof(_ValueType));
     // Empirically determined maximum. May be less for non-full blocks.
     constexpr std::uint16_t __max_inputs_per_item = 64 * __block_size_scale;
@@ -858,10 +863,10 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_
         auto __kernel_nd_range = sycl::nd_range<1>(__global_range, __local_range);
         // 1. Reduce step - Reduce assigned input per sub-group, compute and apply intra-wg carries, and write to global memory.
         __event = __reduce_submitter(__exec, __kernel_nd_range, __in_rng, __result_and_scratch, __event,
-                                     __inputs_per_sub_group, __inputs_per_item, __b, __reduce_kernel);
+                                     __inputs_per_sub_group, __inputs_per_item, __b);
         // 2. Scan step - Compute intra-wg carries, determine sub-group carry-ins, and perform full input block scan.
         __event = __scan_submitter(__exec, __kernel_nd_range, __in_rng, __out_rng, __result_and_scratch, __event,
-                                   __inputs_per_sub_group, __inputs_per_item, __b, __scan_kernel);
+                                   __inputs_per_sub_group, __inputs_per_item, __b);
         __inputs_remaining -= std::min(__inputs_remaining, __block_size);
         // We only need to resize these parameters prior to the last block as it is the only non-full case.
         if (__b + 2 == __num_blocks)
@@ -881,5 +886,4 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_
 } // namespace dpl
 } // namespace oneapi

-#endif // _ONEDPL_COMPILE_KERNEL
 #endif // _ONEDPL_PARALLEL_BACKEND_SYCL_REDUCE_THEN_SCAN_H
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
index 25b50bd20ea..a16fcbd594c 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
@@ -21,7 +21,6 @@
 #include
 #include
 #include
-#include

 #include "../../iterator_impl.h"

@@ -887,26 +886,6 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X,
     }
 };

-// This exception handler is intended to handle a software workaround by IGC for a hardware bug that
-// causes IGC to throw an exception for certain integrated graphics devices with -O0 compilation and
-// a required sub-group size of 32.
-inline void
-__bypass_sycl_kernel_not_supported(const sycl::exception& __e)
-{
-    // The SYCL spec compliant solution would be to compare __e.code() and sycl::errc::kernel_not_supported
-    // and rethrow the encountered exception if the two do not compare equal. However, the icpx compiler currently
-    // returns a sycl::errc::build in violation of the SYCL spec. If we are using the Intel compiler, then compare
-    // to this error code. Otherwise, assume the implementation is spec compliant.
-    const std::error_code __kernel_not_supported_ec =
-#if _ONEDPL_SYCL_KERNEL_NOT_SUPPORTED_EXCEPTION_BROKEN
-        sycl::errc::build;
-#else // Generic SYCL compiler. Assume it is spec compliant.
-        sycl::errc::kernel_not_supported;
-#endif
-    if (__e.code() != __kernel_not_supported_ec)
-        throw;
-}
-
 struct __scalar_load_op
 {
     oneapi::dpl::__internal::__pstl_assign __assigner;
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h
index 09706dba195..abce0902be1 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h
@@ -108,12 +108,6 @@
 #define _ONEDPL_SYCL_DEVICE_COPYABLE_SPECIALIZATION_BROKEN (_ONEDPL_LIBSYCL_VERSION_LESS_THAN(70100))

-// Macro to check if the exception thrown when a kernel cannot be ran on a device does not align with
-// sycl::errc::kernel_not_supported as required by the SYCL spec. Detects the Intel DPC++ and open-source intel/llvm
-// compilers. No fix has been provided yet, but when the LIBSYCL major version is updated we can re-evaluate if we need
-// to extend it to future versions.
-#define _ONEDPL_SYCL_KERNEL_NOT_SUPPORTED_EXCEPTION_BROKEN (_ONEDPL_LIBSYCL_VERSION_LESS_THAN(90000))
-
 // Macro to check if we are compiling for SPIR-V devices. This macro must only be used within
 // SYCL kernels for determining SPIR-V compilation. Using this macro on the host may lead to incorrect behavior.
 #ifndef _ONEDPL_DETECT_SPIRV_COMPILATION // Check if overridden for testing
diff --git a/include/oneapi/dpl/pstl/onedpl_config.h b/include/oneapi/dpl/pstl/onedpl_config.h
index 71ee884f39e..4ed4e743dfb 100644
--- a/include/oneapi/dpl/pstl/onedpl_config.h
+++ b/include/oneapi/dpl/pstl/onedpl_config.h
@@ -157,6 +157,15 @@
 #define _ONEDPL_CPP17_EXECUTION_POLICIES_PRESENT                                                                       \
     (_ONEDPL___cplusplus >= 201703L && (_MSC_VER >= 1912 || (_GLIBCXX_RELEASE >= 9 && __GLIBCXX__ >= 20190503)))

+// In the SYCL backend reduce-then-scan path, we need to be able to differentiate between when a compiler enables
+// optimizations and when it does not. With GCC and clang-based compilers, we can detect this with the __OPTIMIZE__
+// macro.
+#if _ONEDPL_GCC_VERSION > 0 || defined(_ONEDPL_CLANG_VERSION)
+#    define _ONEDPL_DETECT_COMPILER_OPTIMIZATIONS_ENABLED __OPTIMIZE__
+#else
+#    define _ONEDPL_DETECT_COMPILER_OPTIMIZATIONS_ENABLED 0
+#endif
+
 #define _ONEDPL_EARLYEXIT_PRESENT (__INTEL_COMPILER >= 1800)
 #if (defined(_PSTL_PRAGMA_SIMD_EARLYEXIT) && _PSTL_EARLYEXIT_PRESENT)
 #    define _ONEDPL_PRAGMA_SIMD_EARLYEXIT _PSTL_PRAGMA_SIMD_EARLYEXIT
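
Reviewer sketch (not part of the patch): the detection scheme above can be exercised in isolation. The names below are illustrative stand-ins for `_ONEDPL_DETECT_COMPILER_OPTIMIZATIONS_ENABLED` and `__get_reduce_then_scan_sg_sz()`; compiling the same file with -O0 vs. -O2 under GCC or clang shows the selected sub-group size flip between 16 and 32.

// illustrative_optimize_detection.cpp -- standalone sketch, not oneDPL code.
#include <cstdint>
#include <iostream>

// GCC and clang-based compilers predefine __OPTIMIZE__ only when optimizations
// are enabled; at -O0 it is undefined and evaluates to 0 in #if expressions.
#if defined(__GNUC__) || defined(__clang__)
#    define DETECT_COMPILER_OPTIMIZATIONS_ENABLED __OPTIMIZE__
#else
#    define DETECT_COMPILER_OPTIMIZATIONS_ENABLED 0
#endif

// Mirrors the idea of __get_reduce_then_scan_sg_sz(): optimized builds use the
// fast sub-group size of 32; -O0 builds fall back to 16 to sidestep the iGPU bug.
constexpr std::uint8_t
get_reduce_then_scan_sg_sz()
{
#if DETECT_COMPILER_OPTIMIZATIONS_ENABLED
    return 32;
#else
    return 16;
#endif
}

int
main()
{
    // Unary + promotes the uint8_t so it prints as a number, not a character.
    std::cout << "reduce-then-scan sub-group size: " << +get_reduce_then_scan_sg_sz() << '\n';
}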