From a52be1b9a2508c07465b081031df9fed49a9162c Mon Sep 17 00:00:00 2001
From: Matthew Michel <106704043+mmichel11@users.noreply.github.com>
Date: Thu, 6 Feb 2025 08:12:15 -0600
Subject: [PATCH] Implement alternative `-O0` workaround for reduce-then-scan
 with compiler optimization detection (#2046)

Reverts https://github.com/uxlfoundation/oneDPL/pull/2040 and instead uses the
`__OPTIMIZE__` macro defined by GCC and clang-based compilers to detect -O0
compilation and compile reduce-then-scan paths with a sub-group size of 16 to
work around hardware bugs on older integrated graphics architectures. This
avoids the performance impact of the kernel bundle approach.

---------

Signed-off-by: Matthew Michel
---
 .../pstl/hetero/dpcpp/parallel_backend_sycl.h | 202 +++++++-----------
 .../parallel_backend_sycl_reduce_then_scan.h  |  94 ++++----
 .../dpcpp/parallel_backend_sycl_utils.h       |  21 --
 .../oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h  |   6 -
 include/oneapi/dpl/pstl/onedpl_config.h       |   9 +
 5 files changed, 138 insertions(+), 194 deletions(-)

diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
index be38682049e..ef9b6dcbd19 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl.h
@@ -1057,12 +1057,7 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen
     // work-group implementation requires a fundamental type which must also be trivially copyable.
     if constexpr (std::is_trivially_copyable_v<_Type>)
     {
-        bool __use_reduce_then_scan =
-#if _ONEDPL_COMPILE_KERNEL
-            oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec);
-#else
-            false;
-#endif
+        bool __use_reduce_then_scan = oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec);

         // TODO: Consider re-implementing single group scan to support types without known identities. This could also
         // allow us to use single wg scan for the last block of reduce-then-scan if it is sufficiently small.
@@ -1081,7 +1076,6 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen
                                                   std::forward<_Range2>(__out_rng), __n, __unary_op, __init, __binary_op, _Inclusive{});
             }
         }
-#if _ONEDPL_COMPILE_KERNEL
         if (__use_reduce_then_scan)
         {
             using _GenInput = oneapi::dpl::__par_backend_hetero::__gen_transform_input<_UnaryOperation>;
@@ -1089,19 +1083,12 @@ __parallel_transform_scan(oneapi::dpl::__internal::__device_backend_tag __backen
             using _WriteOp = oneapi::dpl::__par_backend_hetero::__simple_write_to_id;

             _GenInput __gen_transform{__unary_op};
-            try
-            {
-                return __parallel_transform_reduce_then_scan(__backend_tag, __exec, __in_rng, __out_rng,
-                                                             __gen_transform, __binary_op, __gen_transform,
-                                                             _ScanInputTransform{}, _WriteOp{}, __init, _Inclusive{},
-                                                             /*_IsUniquePattern=*/std::false_type{});
-            }
-            catch (const sycl::exception& __e)
-            {
-                __bypass_sycl_kernel_not_supported(__e);
-            }
+
+            return __parallel_transform_reduce_then_scan(
+                __backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__in_rng),
+                std::forward<_Range2>(__out_rng), __gen_transform, __binary_op, __gen_transform, _ScanInputTransform{},
+                _WriteOp{}, __init, _Inclusive{}, /*_IsUniquePattern=*/std::false_type{});
         }
-#endif
     }
     //else use multi pass scan implementation
@@ -1176,7 +1163,6 @@ struct __invoke_single_group_copy_if
     }
 };

-#if _ONEDPL_COMPILE_KERNEL
 template
 auto
@@ -1195,7 +1181,6 @@ __parallel_reduce_then_scan_copy(oneapi::dpl::__internal::__device_backend_tag _
                                              _ScanInputTransform{}, __write_op,
                                              oneapi::dpl::unseq_backend::__no_init_value<_Size>{},
                                              /*_Inclusive=*/std::true_type{}, __is_unique_pattern);
 }
-#endif

 template
@@ -1248,36 +1233,32 @@ __parallel_unique_copy(oneapi::dpl::__internal::__device_backend_t
     // can simply copy the input range to the output.
     assert(__n > 1);
-#if _ONEDPL_COMPILE_KERNEL
-    if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
+    if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec))
     {
         using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_unique_mask<_BinaryPredicate>;
         using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<1, _Assign>;
-        try
-        {
-            return __parallel_reduce_then_scan_copy(__backend_tag, __exec, __rng, __result, __n, _GenMask{__pred},
-                                                    _WriteOp{_Assign{}},
-                                                    /*_IsUniquePattern=*/std::true_type{});
-        }
-        catch (const sycl::exception& __e)
-        {
-            __bypass_sycl_kernel_not_supported(__e);
-        }
+
+        return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
+                                                std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n,
+                                                _GenMask{__pred}, _WriteOp{_Assign{}},
+                                                /*_IsUniquePattern=*/std::true_type{});
+    }
+    else
+    {
+        using _ReduceOp = std::plus<decltype(__n)>;
+        using _CreateOp =
+            oneapi::dpl::__internal::__create_mask_unique_copy<oneapi::dpl::__internal::__not_pred<_BinaryPredicate>,
+                                                               decltype(__n)>;
+        using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>;
+
+        return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
+                                    std::forward<_Range2>(__result), __n,
+                                    _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}},
+                                    _CopyOp{_ReduceOp{}, _Assign{}});
     }
-#endif
-    using _ReduceOp = std::plus<decltype(__n)>;
-    using _CreateOp =
-        oneapi::dpl::__internal::__create_mask_unique_copy<oneapi::dpl::__internal::__not_pred<_BinaryPredicate>,
-                                                           decltype(__n)>;
-    using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign, /*inclusive*/ std::true_type, 1>;
-
-    return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
-                                std::forward<_Range2>(__result), __n,
-                                _CreateOp{oneapi::dpl::__internal::__not_pred<_BinaryPredicate>{__pred}},
-                                _CopyOp{_ReduceOp{}, _Assign{}});
 }

-#if _ONEDPL_COMPILE_KERNEL
 template
 auto
@@ -1302,14 +1283,13 @@ __parallel_reduce_by_segment_reduce_then_scan(oneapi::dpl::__internal::__device_
     assert(__n > 1);
     return __parallel_transform_reduce_then_scan(
         __backend_tag, std::forward<_ExecutionPolicy>(__exec),
-        oneapi::dpl::__ranges::zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)),
-        oneapi::dpl::__ranges::zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)),
+        oneapi::dpl::__ranges::make_zip_view(std::forward<_Range1>(__keys), std::forward<_Range2>(__values)),
+        oneapi::dpl::__ranges::make_zip_view(std::forward<_Range3>(__out_keys), std::forward<_Range4>(__out_values)),
         _GenReduceInput{__binary_pred}, _ReduceOp{__binary_op}, _GenScanInput{__binary_pred, __n},
         _ScanInputTransform{}, _WriteOp{__binary_pred, __n},
         oneapi::dpl::unseq_backend::__no_init_value>{},
         /*Inclusive*/ std::true_type{}, /*_IsUniquePattern=*/std::false_type{});
 }
-#endif

 template
 auto
@@ -1317,30 +1297,25 @@ __parallel_partition_copy(oneapi::dpl::__internal::__device_backend_tag __backen
                           _Range1&& __rng, _Range2&& __result, _UnaryPredicate __pred)
 {
     oneapi::dpl::__internal::__difference_t<_Range1> __n = __rng.size();
-#if _ONEDPL_COMPILE_KERNEL
-    if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
+    if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec))
    {
         using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_UnaryPredicate>;
         using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if_else;
-        try
-        {
-            return __parallel_reduce_then_scan_copy(__backend_tag, __exec, __rng, __result, __n, _GenMask{__pred},
-                                                    _WriteOp{},
-                                                    /*_IsUniquePattern=*/std::false_type{});
-        }
-        catch (const sycl::exception& __e)
-        {
-            __bypass_sycl_kernel_not_supported(__e);
-        }
+
+        return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
+                                                std::forward<_Range1>(__rng), std::forward<_Range2>(__result), __n,
+                                                _GenMask{__pred}, _WriteOp{}, /*_IsUniquePattern=*/std::false_type{});
     }
-#endif
-    using _ReduceOp = std::plus<decltype(__n)>;
-    using _CreateOp = unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)>;
-    using _CopyOp = unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type>;
+    else
+    {
+        using _ReduceOp = std::plus<decltype(__n)>;
+        using _CreateOp = unseq_backend::__create_mask<_UnaryPredicate, decltype(__n)>;
+        using _CopyOp = unseq_backend::__partition_by_mask<_ReduceOp, /*inclusive*/ std::true_type>;

-    return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
-                                std::forward<_Range2>(__result), __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}});
+        return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng),
+                                    std::forward<_Range2>(__result), __n, _CreateOp{__pred}, _CopyOp{_ReduceOp{}});
+    }
 }

 template
 ;

         return __par_backend_hetero::__static_monotonic_dispatcher<_SizeBreakpoints>::__dispatch(
-            _SingleGroupInvoker{}, __n, __exec, __n, std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng),
-            __pred, __assign);
+            _SingleGroupInvoker{}, __n, std::forward<_ExecutionPolicy>(__exec), __n, std::forward<_InRng>(__in_rng),
+            std::forward<_OutRng>(__out_rng), __pred, __assign);
     }
-#if _ONEDPL_COMPILE_KERNEL
-    else if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
+    else if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec))
     {
         using _GenMask = oneapi::dpl::__par_backend_hetero::__gen_mask<_Pred>;
         using _WriteOp = oneapi::dpl::__par_backend_hetero::__write_to_id_if<0, _Assign>;
-        try
-        {
-            return __parallel_reduce_then_scan_copy(__backend_tag, __exec, __in_rng, __out_rng, __n, _GenMask{__pred},
-                                                    _WriteOp{__assign},
-                                                    /*_IsUniquePattern=*/std::false_type{});
-        }
-        catch (const sycl::exception& __e)
-        {
-            __bypass_sycl_kernel_not_supported(__e);
-        }
-    }
-#endif
-    using _ReduceOp = std::plus<_Size>;
-    using _CreateOp = unseq_backend::__create_mask<_Pred, _Size>;
-    using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign,
-                                                  /*inclusive*/ std::true_type, 1>;
-    return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_InRng>(__in_rng),
-                                std::forward<_OutRng>(__out_rng), __n, _CreateOp{__pred},
-                                _CopyOp{_ReduceOp{}, __assign});
+        return __parallel_reduce_then_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
+                                                std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n,
+                                                _GenMask{__pred}, _WriteOp{__assign},
+                                                /*_IsUniquePattern=*/std::false_type{});
+    }
+    else
+    {
+        using _ReduceOp = std::plus<_Size>;
+        using _CreateOp = unseq_backend::__create_mask<_Pred, _Size>;
+        using _CopyOp = unseq_backend::__copy_by_mask<_ReduceOp, _Assign,
+                                                      /*inclusive*/ std::true_type, 1>;
+
+        return __parallel_scan_copy(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
+                                    std::forward<_InRng>(__in_rng), std::forward<_OutRng>(__out_rng), __n,
+                                    _CreateOp{__pred}, _CopyOp{_ReduceOp{}, __assign});
+    }
 }

-#if _ONEDPL_COMPILE_KERNEL
 template
 auto
@@ -1427,7 +1397,7 @@ __parallel_set_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __
     return __parallel_transform_reduce_then_scan(
         __backend_tag, std::forward<_ExecutionPolicy>(__exec),
-        oneapi::dpl::__ranges::zip_view(
+        oneapi::dpl::__ranges::make_zip_view(
             std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
             oneapi::dpl::__ranges::all_view(
                 __mask_buf.get_buffer())),
@@ -1436,7 +1406,6 @@ __parallel_set_reduce_then_scan(oneapi::dpl::__internal::__device_backend_tag __
         _ScanInputTransform{}, _WriteOp{}, oneapi::dpl::unseq_backend::__no_init_value<_Size>{},
         /*_Inclusive=*/std::true_type{}, /*__is_unique_pattern=*/std::false_type{});
 }
-#endif

 template
@@ -1495,23 +1464,18 @@ __parallel_set_op(oneapi::dpl::__internal::__device_backend_tag __backend_tag, _
                   _Range1&& __rng1, _Range2&& __rng2, _Range3&& __result, _Compare __comp,
                   _IsOpDifference __is_op_difference)
 {
-#if _ONEDPL_COMPILE_KERNEL
-    if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
+    if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec))
     {
-        try
-        {
-            return __parallel_set_reduce_then_scan(__backend_tag, __exec, __rng1, __rng2, __result, __comp,
-                                                   __is_op_difference);
-        }
-        catch (const sycl::exception& __e)
-        {
-            __bypass_sycl_kernel_not_supported(__e);
-        }
+        return __parallel_set_reduce_then_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec),
+                                               std::forward<_Range1>(__rng1), std::forward<_Range2>(__rng2),
+                                               std::forward<_Range3>(__result), __comp, __is_op_difference);
+    }
+    else
+    {
+        return __parallel_set_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1),
+                                   std::forward<_Range2>(__rng2), std::forward<_Range3>(__result), __comp,
+                                   __is_op_difference);
     }
-#endif
-    return __parallel_set_scan(__backend_tag, std::forward<_ExecutionPolicy>(__exec), std::forward<_Range1>(__rng1),
-                               std::forward<_Range2>(__rng2), std::forward<_Range3>(__result), __comp,
-                               __is_op_difference);
 }

 //------------------------------------------------------------------------
@@ -2443,24 +2407,18 @@ __parallel_reduce_by_segment(oneapi::dpl::__internal::__device_backend_tag, _Exe
     using __val_type = oneapi::dpl::__internal::__value_t<_Range2>;
     // Prior to icpx 2025.0, the reduce-then-scan path performs poorly and should be avoided.
-#if (!defined(__INTEL_LLVM_COMPILER) || __INTEL_LLVM_COMPILER >= 20250000) && _ONEDPL_COMPILE_KERNEL
+#if !defined(__INTEL_LLVM_COMPILER) || __INTEL_LLVM_COMPILER >= 20250000
     if constexpr (std::is_trivially_copyable_v<__val_type>)
     {
-        if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_sg_32(__exec))
+        if (oneapi::dpl::__par_backend_hetero::__is_gpu_with_reduce_then_scan_sg_sz(__exec))
         {
-            try
-            {
-                auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan(
-                    oneapi::dpl::__internal::__device_backend_tag{}, __exec, __keys, __values, __out_keys, __out_values,
-                    __binary_pred, __binary_op);
-                // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the
-                // past-the-end iterator pair of segmented reduction.
-                return std::get<0>(__res.get()) + 1;
-            }
-            catch (const sycl::exception& __e)
-            {
-                __bypass_sycl_kernel_not_supported(__e);
-            }
+            auto __res = oneapi::dpl::__par_backend_hetero::__parallel_reduce_by_segment_reduce_then_scan(
+                oneapi::dpl::__internal::__device_backend_tag{}, std::forward<_ExecutionPolicy>(__exec),
+                std::forward<_Range1>(__keys), std::forward<_Range2>(__values), std::forward<_Range3>(__out_keys),
+                std::forward<_Range4>(__out_values), __binary_pred, __binary_op);
+            // Because our init type ends up being tuple, return the first component which is the write index. Add 1 to return the
+            // past-the-end iterator pair of segmented reduction.
+            return std::get<0>(__res.get()) + 1;
         }
     }
 #endif
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h
index a3c1054a27a..d3b54620fb4 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_reduce_then_scan.h
@@ -16,9 +16,6 @@
 #ifndef _ONEDPL_PARALLEL_BACKEND_SYCL_REDUCE_THEN_SCAN_H
 #define _ONEDPL_PARALLEL_BACKEND_SYCL_REDUCE_THEN_SCAN_H

-// Kernel compilation must be supported to properly work around hardware bug on certain iGPUs
-#if _ONEDPL_COMPILE_KERNEL
-
 #include
 #include
 #include
@@ -278,7 +275,14 @@ class __reduce_then_scan_scan_kernel;

 template
-struct __parallel_reduce_then_scan_reduce_submitter
+struct __parallel_reduce_then_scan_reduce_submitter;
+
+template
+struct __parallel_reduce_then_scan_reduce_submitter<__sub_group_size, __max_inputs_per_item, __is_inclusive,
+                                                    __is_unique_pattern_v, _GenReduceInput, _ReduceOp, _InitType,
+                                                    __internal::__optional_kernel_name<_KernelName...>>
 {
     // Step 1 - SubGroupReduce is expected to perform sub-group reductions to global memory
     // input buffer
@@ -287,7 +291,7 @@ struct __parallel_reduce_then_scan_reduce_submitter
     operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng,
                _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event,
                const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item,
-               const std::size_t __block_num, const sycl::kernel& __reduce_kernel) const
+               const std::size_t __block_num) const
     {
         using _InitValueType = typename _InitType::__value_type;
         return __exec.queue().submit([&, this](sycl::handler& __cgh) {
@@ -296,13 +300,7 @@ struct __parallel_reduce_then_scan_reduce_submitter
             oneapi::dpl::__ranges::__require_access(__cgh, __in_rng);
             auto __temp_acc = __scratch_container.template __get_scratch_acc(
                 __cgh, __dpl_sycl::__no_init{});
-#if _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
-            __cgh.use_kernel_bundle(__reduce_kernel.get_kernel_bundle());
-#endif
-            __cgh.parallel_for<_KernelName>(
-#if !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
-                __reduce_kernel,
-#endif
+            __cgh.parallel_for<_KernelName...>(
                 __nd_range, [=, *this](sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size(__sub_group_size)]] {
                     _InitValueType* __temp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc);
                     std::size_t __group_id = __ndi.get_group(0);
@@ -414,7 +412,14 @@ struct __parallel_reduce_then_scan_reduce_submitter

 template
-struct __parallel_reduce_then_scan_scan_submitter
+struct __parallel_reduce_then_scan_scan_submitter;
+
+template
+struct __parallel_reduce_then_scan_scan_submitter<
+    __sub_group_size, __max_inputs_per_item, __is_inclusive, __is_unique_pattern_v, _ReduceOp, _GenScanInput,
+    _ScanInputTransform, _WriteOp, _InitType, __internal::__optional_kernel_name<_KernelName...>>
 {
     using _InitValueType = typename _InitType::__value_type;
@@ -437,7 +442,7 @@ struct __parallel_reduce_then_scan_scan_submitter
     operator()(_ExecutionPolicy&& __exec, const sycl::nd_range<1> __nd_range, _InRng&& __in_rng, _OutRng&& __out_rng,
                _TmpStorageAcc& __scratch_container, const sycl::event& __prior_event,
                const std::uint32_t __inputs_per_sub_group, const std::uint32_t __inputs_per_item,
-               const std::size_t __block_num, const sycl::kernel& __scan_kernel) const
+               const std::size_t __block_num) const
     {
         std::uint32_t __inputs_in_block = std::min(__n - __block_num * __max_block_size, std::size_t{__max_block_size});
         std::uint32_t __active_groups = oneapi::dpl::__internal::__dpl_ceiling_div(
@@ -452,13 +457,8 @@
             auto __temp_acc = __scratch_container.template __get_scratch_acc(__cgh);
             auto __res_acc = __scratch_container.template __get_result_acc(__cgh, __dpl_sycl::__no_init{});
-#if _ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT
-            __cgh.use_kernel_bundle(__scan_kernel.get_kernel_bundle());
-#endif
-            __cgh.parallel_for<_KernelName>(
-#if !_ONEDPL_SYCL2020_KERNEL_BUNDLE_PRESENT && _ONEDPL_LIBSYCL_PROGRAM_PRESENT
-                __scan_kernel,
-#endif
+
+            __cgh.parallel_for<_KernelName...>(
                 __nd_range, [=, *this] (sycl::nd_item<1> __ndi) [[sycl::reqd_sub_group_size(__sub_group_size)]] {
                     _InitValueType* __tmp_ptr = _TmpStorageAcc::__get_usm_or_buffer_accessor_ptr(__temp_acc);
                     _InitValueType* __res_ptr =
@@ -726,14 +726,28 @@ struct __parallel_reduce_then_scan_scan_submitter
     _InitType __init;
 };

-// reduce_then_scan requires subgroup size of 32, and performs well only on devices with fast coordinated subgroup
-// operations. We do not want to run this scan on CPU targets, as they are not performant with this algorithm.
+// With optimization enabled, reduce-then-scan requires a sub-group size of 32. Without optimization, we must compile
+// to a sub-group size of 16 to work around a hardware bug on certain Intel integrated graphics architectures.
+constexpr inline std::uint8_t
+__get_reduce_then_scan_sg_sz()
+{
+#if _ONEDPL_DETECT_COMPILER_OPTIMIZATIONS_ENABLED
+    return 32;
+#else
+    return 16;
+#endif
+}
+
+// Enable reduce-then-scan if the device supports the required sub-group size and is a GPU-class device
+// with fast coordinated subgroup operations. We do not want to run this scan on CPU targets, as they are not
+// performant with this algorithm.
 template
 bool
-__is_gpu_with_sg_32(const _ExecutionPolicy& __exec)
+__is_gpu_with_reduce_then_scan_sg_sz(const _ExecutionPolicy& __exec)
 {
-    const bool __dev_has_sg32 = oneapi::dpl::__internal::__supports_sub_group_size(__exec, 32);
-    return (__exec.queue().get_device().is_gpu() && __dev_has_sg32);
+    const bool __dev_supports_sg_sz =
+        oneapi::dpl::__internal::__supports_sub_group_size(__exec, __get_reduce_then_scan_sg_sz());
+    return (__exec.queue().get_device().is_gpu() && __dev_supports_sg_sz);
 }

 // General scan-like algorithm helpers
@@ -756,23 +770,14 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_
                                       _ScanInputTransform __scan_input_transform, _WriteOp __write_op,
                                       _InitType __init, _Inclusive, _IsUniquePattern)
 {
-    using _ValueType = typename _InitType::__value_type;
     using _CustomName = oneapi::dpl::__internal::__policy_kernel_name<_ExecutionPolicy>;
-    // Note that __sub_group_size and __max_inputs_per_item are not included in kernel names. __sub_group_size
-    // is always constant (32) and __max_inputs_per_item is directly tied to the input type so these are not
-    // necessary to obtain a unique kernel name. However, if these compile time variables are adjusted in the
-    // future, then we need to be careful here to ensure unique kernel naming.
-    using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator<
-        __reduce_then_scan_reduce_kernel, _CustomName, _ExecutionPolicy, _InRng, _OutRng, _GenReduceInput, _ReduceOp,
-        _InitType, _Inclusive, _IsUniquePattern>;
-    using _ScanKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_generator<
-        __reduce_then_scan_scan_kernel, _CustomName, _ExecutionPolicy, _InRng, _OutRng, _GenScanInput, _ReduceOp,
-        _ScanInputTransform, _WriteOp, _InitType, _Inclusive, _IsUniquePattern>;
-    static auto __kernels = __internal::__kernel_compiler<_ReduceKernel, _ScanKernel>::__compile(__exec);
-    const sycl::kernel& __reduce_kernel = __kernels[0];
-    const sycl::kernel& __scan_kernel = __kernels[1];
-
-    constexpr std::uint8_t __sub_group_size = 32;
+    using _ReduceKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
+        __reduce_then_scan_reduce_kernel<_CustomName>>;
+    using _ScanKernel = oneapi::dpl::__par_backend_hetero::__internal::__kernel_name_provider<
+        __reduce_then_scan_scan_kernel<_CustomName>>;
+    using _ValueType = typename _InitType::__value_type;
+
+    constexpr std::uint8_t __sub_group_size = __get_reduce_then_scan_sg_sz();
     constexpr std::uint8_t __block_size_scale = std::max(std::size_t{1}, sizeof(double) / sizeof(_ValueType));
     // Empirically determined maximum. May be less for non-full blocks.
     constexpr std::uint16_t __max_inputs_per_item = 64 * __block_size_scale;
@@ -858,10 +863,10 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_
         auto __kernel_nd_range = sycl::nd_range<1>(__global_range, __local_range);
         // 1. Reduce step - Reduce assigned input per sub-group, compute and apply intra-wg carries, and write to global memory.
         __event = __reduce_submitter(__exec, __kernel_nd_range, __in_rng, __result_and_scratch, __event,
-                                     __inputs_per_sub_group, __inputs_per_item, __b, __reduce_kernel);
+                                     __inputs_per_sub_group, __inputs_per_item, __b);
         // 2. Scan step - Compute intra-wg carries, determine sub-group carry-ins, and perform full input block scan.
         __event = __scan_submitter(__exec, __kernel_nd_range, __in_rng, __out_rng, __result_and_scratch, __event,
-                                   __inputs_per_sub_group, __inputs_per_item, __b, __scan_kernel);
+                                   __inputs_per_sub_group, __inputs_per_item, __b);
         __inputs_remaining -= std::min(__inputs_remaining, __block_size);
         // We only need to resize these parameters prior to the last block as it is the only non-full case.
         if (__b + 2 == __num_blocks)
@@ -881,5 +886,4 @@ __parallel_transform_reduce_then_scan(oneapi::dpl::__internal::__device_backend_
 } // namespace dpl
 } // namespace oneapi

-#endif // _ONEDPL_COMPILE_KERNEL
 #endif // _ONEDPL_PARALLEL_BACKEND_SYCL_REDUCE_THEN_SCAN_H
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
index 25b50bd20ea..a16fcbd594c 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
@@ -21,7 +21,6 @@
 #include
 #include
 #include
-#include

 #include "../../iterator_impl.h"

@@ -887,26 +886,6 @@ class __static_monotonic_dispatcher<::std::integer_sequence<::std::uint16_t, _X,
     }
 };

-// This exception handler is intended to handle a software workaround by IGC for a hardware bug that
-// causes IGC to throw an exception for certain integrated graphics devices with -O0 compilation and
-// a required sub-group size of 32.
-inline void
-__bypass_sycl_kernel_not_supported(const sycl::exception& __e)
-{
-    // The SYCL spec compliant solution would be to compare __e.code() and sycl::errc::kernel_not_supported
-    // and rethrow the encountered exception if the two do not compare equal. However, the icpx compiler currently
-    // returns a sycl::errc::build in violation of the SYCL spec. If we are using the Intel compiler, then compare
-    // to this error code. Otherwise, assume the implementation is spec compliant.
-    const std::error_code __kernel_not_supported_ec =
-#if _ONEDPL_SYCL_KERNEL_NOT_SUPPORTED_EXCEPTION_BROKEN
-        sycl::errc::build;
-#else // Generic SYCL compiler. Assume it is spec compliant.
-        sycl::errc::kernel_not_supported;
-#endif
-    if (__e.code() != __kernel_not_supported_ec)
-        throw;
-}
-
 struct __scalar_load_op
 {
     oneapi::dpl::__internal::__pstl_assign __assigner;
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h
index 09706dba195..abce0902be1 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/sycl_defs.h
@@ -108,12 +108,6 @@
 #define _ONEDPL_SYCL_DEVICE_COPYABLE_SPECIALIZATION_BROKEN (_ONEDPL_LIBSYCL_VERSION_LESS_THAN(70100))

-// Macro to check if the exception thrown when a kernel cannot be ran on a device does not align with
-// sycl::errc::kernel_not_supported as required by the SYCL spec. Detects the Intel DPC++ and open-source intel/llvm
-// compilers. No fix has been provided yet, but when the LIBSYCL major version is updated we can re-evaluate if we need
-// to extend it to future versions.
-#define _ONEDPL_SYCL_KERNEL_NOT_SUPPORTED_EXCEPTION_BROKEN (_ONEDPL_LIBSYCL_VERSION_LESS_THAN(90000))
-
 // Macro to check if we are compiling for SPIR-V devices. This macro must only be used within
 // SYCL kernels for determining SPIR-V compilation. Using this macro on the host may lead to incorrect behavior.
 #ifndef _ONEDPL_DETECT_SPIRV_COMPILATION // Check if overridden for testing
diff --git a/include/oneapi/dpl/pstl/onedpl_config.h b/include/oneapi/dpl/pstl/onedpl_config.h
index 71ee884f39e..4ed4e743dfb 100644
--- a/include/oneapi/dpl/pstl/onedpl_config.h
+++ b/include/oneapi/dpl/pstl/onedpl_config.h
@@ -157,6 +157,15 @@
 #define _ONEDPL_CPP17_EXECUTION_POLICIES_PRESENT                                                                       \
     (_ONEDPL___cplusplus >= 201703L && (_MSC_VER >= 1912 || (_GLIBCXX_RELEASE >= 9 && __GLIBCXX__ >= 20190503)))

+// In the SYCL backend reduce-then-scan path, we need to be able to differentiate between when a compiler enables
+// optimizations and when it does not. With GCC and clang-based compilers, we can detect this with the __OPTIMIZE__
+// macro.
+#if _ONEDPL_GCC_VERSION > 0 || defined(_ONEDPL_CLANG_VERSION)
+#    define _ONEDPL_DETECT_COMPILER_OPTIMIZATIONS_ENABLED __OPTIMIZE__
+#else
+#    define _ONEDPL_DETECT_COMPILER_OPTIMIZATIONS_ENABLED 0
+#endif
+
 #define _ONEDPL_EARLYEXIT_PRESENT (__INTEL_COMPILER >= 1800)
 #if (defined(_PSTL_PRAGMA_SIMD_EARLYEXIT) && _PSTL_EARLYEXIT_PRESENT)
 #    define _ONEDPL_PRAGMA_SIMD_EARLYEXIT _PSTL_PRAGMA_SIMD_EARLYEXIT
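
Reviewer sketch (not part of the patch): the detection scheme above can be exercised in isolation. The names below are illustrative stand-ins for `_ONEDPL_DETECT_COMPILER_OPTIMIZATIONS_ENABLED` and `__get_reduce_then_scan_sg_sz()`; compiling the same file with -O0 vs. -O2 under GCC or clang shows the selected sub-group size flip between 16 and 32.

// illustrative_optimize_detection.cpp -- standalone sketch, not oneDPL code.
#include <cstdint>
#include <iostream>

// GCC and clang-based compilers predefine __OPTIMIZE__ only when optimizations
// are enabled; at -O0 it is undefined and evaluates to 0 in #if expressions.
#if defined(__GNUC__) || defined(__clang__)
#    define DETECT_COMPILER_OPTIMIZATIONS_ENABLED __OPTIMIZE__
#else
#    define DETECT_COMPILER_OPTIMIZATIONS_ENABLED 0
#endif

// Mirrors the idea of __get_reduce_then_scan_sg_sz(): optimized builds use the
// fast sub-group size of 32; -O0 builds fall back to 16 to sidestep the iGPU bug.
constexpr std::uint8_t
get_reduce_then_scan_sg_sz()
{
#if DETECT_COMPILER_OPTIMIZATIONS_ENABLED
    return 32;
#else
    return 16;
#endif
}

int
main()
{
    // Unary + promotes the uint8_t so it prints as a number, not a character.
    std::cout << "reduce-then-scan sub-group size: " << +get_reduce_then_scan_sg_sz() << '\n';
}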