aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorPablo Reble <pablo@reble.org>2021-02-24 17:58:55 -0600
committerMikhail Dvorskiy <mikhail.dvorskiy@intel.com>2021-02-25 09:59:30 +0300
commit3cfce2571f01315c5651a70558023ef57f85ca19 (patch)
treec89fd2ac9faee2b2b72e2a928967c1c0cff1bf59
parentTurn off strict aliasing optimization if used (#137) (diff)
downloadllvm-project-3cfce2571f01315c5651a70558023ef57f85ca19.tar.gz
llvm-project-3cfce2571f01315c5651a70558023ef57f85ca19.tar.bz2
llvm-project-3cfce2571f01315c5651a70558023ef57f85ca19.zip
Async api extensions (#78)
* adding async API as an experimental feature * initial support for DPCPP backend only * implementation for subset of algorithm/numeric (copy,fill,for_each,sort,reduce,transform,transform_reduce) with suffix async; accepting an arbitrary number of sycl::event's as last argument to express input dependencies * returning a future-like object of undefined type that is convertible into a sycl::event. * lifetime of temporary storage is bound to lifetime of returned object
-rw-r--r--include/oneapi/dpl/async45
-rw-r--r--include/oneapi/dpl/internal/async_extension_defs.h120
-rw-r--r--include/oneapi/dpl/internal/async_impl/async_backend_sycl.h137
-rw-r--r--include/oneapi/dpl/internal/async_impl/async_impl.h93
-rw-r--r--include/oneapi/dpl/internal/async_impl/async_impl_hetero.h201
-rw-r--r--include/oneapi/dpl/internal/async_impl/async_utils.h128
-rw-r--r--include/oneapi/dpl/internal/async_impl/glue_async_impl.h165
-rw-r--r--include/oneapi/dpl/pstl/hetero/dpcpp/execution_sycl_defs.h23
-rw-r--r--include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h1
-rw-r--r--test/xpu_api/asynch.pass.cpp153
10 files changed, 1064 insertions, 2 deletions
diff --git a/include/oneapi/dpl/async b/include/oneapi/dpl/async
new file mode 100644
index 000000000000..d76d2b6185db
--- /dev/null
+++ b/include/oneapi/dpl/async
@@ -0,0 +1,45 @@
+// -*- C++ -*-
+//===-- async -------------------------------------------------------------===//
+//
+// Copyright (C) Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// This file incorporates work covered by the following copyright and permission
+// notice:
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _ONEDPL_ASYNC
+#define _ONEDPL_ASYNC
+
+// Workarounds for libstdc++9, libstdc++10 when new TBB version is found in the environment
+#if __cplusplus >= 201703L
+# if __has_include(<tbb/version.h>)
+# ifndef PSTL_USE_PARALLEL_POLICIES
+# define PSTL_USE_PARALLEL_POLICIES (_GLIBCXX_RELEASE != 9)
+# endif
+# ifndef _GLIBCXX_USE_TBB_PAR_BACKEND
+# define _GLIBCXX_USE_TBB_PAR_BACKEND (_GLIBCXX_RELEASE > 10)
+# endif
+# endif // __has_include(<tbb/version.h>)
+#endif // __cplusplus >= 201703L
+
+#include "oneapi/dpl/pstl/onedpl_config.h"
+
+#if !_ONEDPL_ASYNC_FORWARD_DECLARED
+// If not declared, pull in forward declarations
+# include "oneapi/dpl/internal/async_extension_defs.h"
+# define _ONEDPL_ASYNC_FORWARD_DECLARED 1
+#endif
+
+#if _ONEDPL_EXECUTION_POLICIES_DEFINED
+// If <execution> has already been included, pull in implementations
+# include "oneapi/dpl/internal/async_impl/async_impl.h"
+# include "oneapi/dpl/internal/async_impl/glue_async_impl.h"
+#endif
+
+#endif /* _ONEDPL_ASYNC */
diff --git a/include/oneapi/dpl/internal/async_extension_defs.h b/include/oneapi/dpl/internal/async_extension_defs.h
new file mode 100644
index 000000000000..abf10896d4af
--- /dev/null
+++ b/include/oneapi/dpl/internal/async_extension_defs.h
@@ -0,0 +1,120 @@
+/*
+ * Copyright (c) Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ONEDPL_ASYNC_EXTENSION_DEFS_H
+#define _ONEDPL_ASYNC_EXTENSION_DEFS_H
+
+#include "async_impl/async_utils.h"
+
+namespace oneapi
+{
+namespace dpl
+{
+
+// Public API for asynch algorithms:
+namespace experimental
+{
+
+template <typename... _Ts>
+oneapi::dpl::__internal::__enable_if_convertible_to_events<void, _Ts...>
+wait_for_all(_Ts&&... __events);
+
+template <class _ExecutionPolicy, class _ForwardIterator1, class _ForwardIterator2, class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy<
+ _ExecutionPolicy, oneapi::dpl::__internal::__future<_ForwardIterator2>, _Events...>
+copy_async(_ExecutionPolicy&& __exec, _ForwardIterator1 __first, _ForwardIterator1 __last, _ForwardIterator2 __result,
+ _Events&&... __dependencies);
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Function, class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy<
+ _ExecutionPolicy, oneapi::dpl::__par_backend_hetero::__future<void>, _Events...>
+for_each_async(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, _Function __f,
+ _Events&&... __dependencies);
+
+template <class _ExecutionPolicy, class _ForwardIt, class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy<
+ _ExecutionPolicy, oneapi::dpl::__internal::__future<typename std::iterator_traits<_ForwardIt>::value_type>,
+ _Events...>
+reduce_async(_ExecutionPolicy&& __exec, _ForwardIt __first, _ForwardIt __last, _Events&&... __dependencies);
+
+template <class _ExecutionPolicy, class _ForwardIt, class _T, class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy_single_no_default<
+ _ExecutionPolicy, oneapi::dpl::__internal::__future<_T>, _T, _Events...>
+reduce_async(_ExecutionPolicy&& __exec, _ForwardIt __first, _ForwardIt __last, _T init, _Events&&... __dependencies);
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class _BinaryOperation, class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy_double_no_default<
+ _ExecutionPolicy, oneapi::dpl::__internal::__future<_Tp>, _Tp, _BinaryOperation, _Events...>
+reduce_async(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, _Tp __init,
+ _BinaryOperation __binary_op, _Events&&... __dependencies);
+
+template <class _ExecutionPolicy, class _ForwardIt1, class _ForwardIt2, class _UnaryOperation, class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy,
+ oneapi::dpl::__internal::__future<_ForwardIt2>, _Events...>
+transform_async(_ExecutionPolicy&& __exec, _ForwardIt1 first1, _ForwardIt1 last1, _ForwardIt2 d_first,
+ _UnaryOperation unary_op, _Events&&... __dependencies);
+
+template <class _ExecutionPolicy, class _ForwardIt1, class _ForwardIt2, class _ForwardIt3, class _BinaryOperation,
+ class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy_single_no_default<
+ _ExecutionPolicy, oneapi::dpl::__internal::__future<_ForwardIt3>, _BinaryOperation, _Events...>
+transform_async(_ExecutionPolicy&& __exec, _ForwardIt1 first1, _ForwardIt1 last1, _ForwardIt2 first2,
+ _ForwardIt3 d_first, _BinaryOperation binary_op, _Events&&... __dependencies);
+
+template <class _ExecutionPolicy, class _ForwardIt1, class _ForwardIt2, class _T, class _BinaryOp1, class _BinaryOp2,
+ class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy_double_no_default<
+ _ExecutionPolicy, oneapi::dpl::__internal::__future<_T>, _BinaryOp1, _BinaryOp2, _Events...>
+transform_reduce_async(_ExecutionPolicy&& __exec, _ForwardIt1 __first1, _ForwardIt1 __last1, _ForwardIt2 __first2,
+ _T __init, _BinaryOp1 __binary_op1, _BinaryOp2 __binary_op2, _Events&&... __dependencies);
+
+template <class _ExecutionPolicy, class _ForwardIt1, class _ForwardIt2, class _T, class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, oneapi::dpl::__internal::__future<_T>,
+ _Events...>
+transform_reduce_async(_ExecutionPolicy&& __exec, _ForwardIt1 __first1, _ForwardIt1 __last1, _ForwardIt2 __first2,
+ _T __init, _Events&&... __dependencies);
+
+template <class _ExecutionPolicy, class _ForwardIt, class _T, class _BinaryOp, class _UnaryOp, class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy_single_no_default<
+ _ExecutionPolicy, oneapi::dpl::__internal::__future<_T>, _UnaryOp, _Events...>
+transform_reduce_async(_ExecutionPolicy&& __exec, _ForwardIt __first, _ForwardIt __last, _T __init,
+ _BinaryOp __binary_op, _UnaryOp __unary_op, _Events&&... __dependencies);
+
+template <class _ExecutionPolicy, class _RandomAccessIterator, class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy<
+ _ExecutionPolicy, oneapi::dpl::__par_backend_hetero::__future<void>, _Events...>
+sort_async(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __last,
+ _Events&&... __dependencies);
+
+template <class _ExecutionPolicy, class _RandomAccessIterator, class _Compare, class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy_single_no_default<
+ _ExecutionPolicy, oneapi::dpl::__par_backend_hetero::__future<void>, _Compare, _Events...>
+sort_async(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __last, _Compare __comp,
+ _Events&&... __dependencies);
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy<
+ _ExecutionPolicy, oneapi::dpl::__par_backend_hetero::__future<void>, _Events...>
+fill_async(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value,
+ _Events&&... __dependencies);
+
+} // namespace experimental
+
+} // namespace dpl
+
+} // namespace oneapi
+
+#endif /* _ONEDPL_ASYNC_EXTENSION_DEFS_H */
diff --git a/include/oneapi/dpl/internal/async_impl/async_backend_sycl.h b/include/oneapi/dpl/internal/async_impl/async_backend_sycl.h
new file mode 100644
index 000000000000..7d048e6c8ffc
--- /dev/null
+++ b/include/oneapi/dpl/internal/async_impl/async_backend_sycl.h
@@ -0,0 +1,137 @@
+// -*- C++ -*-
+//===-- async_backend_sycl.h ----------------------------------------------===//
+//
+// Copyright (C) Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// This file incorporates work covered by the following copyright and permission
+// notice:
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+//!!! NOTE: This file should be included under the macro _ONEDPL_BACKEND_SYCL
+#ifndef _ONEDPL_async_backend_sycl_H
+#define _ONEDPL_async_backend_sycl_H
+
+namespace oneapi
+{
+namespace dpl
+{
+namespace __par_backend_hetero
+{
+
+// TODO: Merge experimental async pattern into dpcpp backend
+//------------------------------------------------------------------------
+// parallel_transform_reduce - async pattern
+//------------------------------------------------------------------------
+
+template <typename _Tp, ::std::size_t __grainsize = 4, typename _ExecutionPolicy, typename _Up, typename _Cp,
+ typename _Rp, typename... _Ranges>
+oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, oneapi::dpl::__internal::__future<_Tp>>
+__parallel_transform_reduce_async(_ExecutionPolicy&& __exec, _Up __u, _Cp __combine, _Rp __brick_reduce,
+ _Ranges&&... __rngs)
+{
+ auto __n = __get_first_range(__rngs...).size();
+ assert(__n > 0);
+
+ using _Size = decltype(__n);
+ using _Policy = typename ::std::decay<_ExecutionPolicy>::type;
+ using __kernel_name = typename _Policy::kernel_name;
+#if __SYCL_UNNAMED_LAMBDA__
+ using __kernel_name_t = __parallel_reduce_kernel<_Up, _Cp, _Rp, __kernel_name, _Ranges...>;
+#else
+ using __kernel_name_t = __parallel_reduce_kernel<__kernel_name>;
+#endif
+
+ sycl::cl_uint __max_compute_units = oneapi::dpl::__internal::__max_compute_units(__exec);
+ ::std::size_t __work_group_size = oneapi::dpl::__internal::__max_work_group_size(__exec);
+ // change __work_group_size according to local memory limit
+ __work_group_size = oneapi::dpl::__internal::__max_local_allocation_size<_ExecutionPolicy, _Tp>(
+ ::std::forward<_ExecutionPolicy>(__exec), __work_group_size);
+#if _ONEDPL_COMPILE_KERNEL
+ sycl::kernel __kernel = __kernel_name_t::__compile_kernel(::std::forward<_ExecutionPolicy>(__exec));
+ __work_group_size = ::std::min(__work_group_size, oneapi::dpl::__internal::__kernel_work_group_size(
+ ::std::forward<_ExecutionPolicy>(__exec), __kernel));
+#endif
+ ::std::size_t __iters_per_work_item = __grainsize;
+ // distribution is ~1 work groups per compute init
+ if (__exec.queue().get_device().is_cpu())
+ __iters_per_work_item = (__n - 1) / (__max_compute_units * __work_group_size) + 1;
+ ::std::size_t __size_per_work_group =
+ __iters_per_work_item * __work_group_size; // number of buffer elements processed within workgroup
+ _Size __n_groups = (__n - 1) / __size_per_work_group + 1; // number of work groups
+ _Size __n_items = (__n - 1) / __iters_per_work_item + 1; // number of work items
+
+ _PRINT_INFO_IN_DEBUG_MODE(__exec, __work_group_size, __max_compute_units);
+
+ // Create temporary global buffers to store temporary values
+ sycl::buffer<_Tp> __temp(sycl::range<1>(2 * __n_groups));
+ // __is_first == true. Reduce over each work_group
+ // __is_first == false. Reduce between work groups
+ bool __is_first = true;
+
+ // For memory utilization it's better to use one big buffer instead of two small because size of the buffer is close to a few MB
+ _Size __offset_1 = 0;
+ _Size __offset_2 = __n_groups;
+
+ sycl::event __reduce_event;
+ do
+ {
+ __reduce_event = __exec.queue().submit([&](sycl::handler& __cgh) {
+ __cgh.depends_on(__reduce_event);
+
+ oneapi::dpl::__ranges::__require_access(__cgh, __rngs...); //get an access to data under SYCL buffer
+ auto __temp_acc = __temp.template get_access<access_mode::read_write>(__cgh);
+ sycl::accessor<_Tp, 1, access_mode::read_write, sycl::access::target::local> __temp_local(
+ sycl::range<1>(__work_group_size), __cgh);
+ __cgh.parallel_for<__kernel_name_t>(
+#if _ONEDPL_COMPILE_KERNEL
+ __kernel,
+#endif
+ sycl::nd_range<1>(sycl::range<1>(__n_groups * __work_group_size), sycl::range<1>(__work_group_size)),
+ [=](sycl::nd_item<1> __item_id) {
+ ::std::size_t __global_idx = __item_id.get_global_id(0);
+ ::std::size_t __local_idx = __item_id.get_local_id(0);
+ // 1. Initialization (transform part). Fill local memory
+ if (__is_first)
+ {
+ __u(__item_id, __n, __iters_per_work_item, __global_idx, __temp_local, __rngs...);
+ }
+ else
+ {
+ // TODO: check the approach when we use grainsize here too
+ if (__global_idx < __n_items)
+ __temp_local[__local_idx] = __temp_acc[__offset_2 + __global_idx];
+ __item_id.barrier(sycl::access::fence_space::local_space);
+ }
+ // 2. Reduce within work group using local memory
+ _Tp __result = __brick_reduce(__item_id, __global_idx, __n_items, __temp_local);
+ if (__local_idx == 0)
+ {
+ __temp_acc[__offset_1 + __item_id.get_group(0)] = __result;
+ }
+ });
+ });
+ if (__is_first)
+ {
+ __is_first = false;
+ }
+ ::std::swap(__offset_1, __offset_2);
+ __n_items = __n_groups;
+ __n_groups = (__n_items - 1) / __work_group_size + 1;
+ } while (__n_items > 1);
+ //return future to postpone implicit synchronization point accessing return value
+ return ::oneapi::dpl::__internal::__future<_Tp>(__reduce_event, __temp, __combine, __offset_2);
+}
+
+} // namespace __par_backend_hetero
+
+} // namespace dpl
+
+} // namespace oneapi
+
+#endif /* _ONEDPL_async_backend_sycl_H */
diff --git a/include/oneapi/dpl/internal/async_impl/async_impl.h b/include/oneapi/dpl/internal/async_impl/async_impl.h
new file mode 100644
index 000000000000..ac92bc9a7a32
--- /dev/null
+++ b/include/oneapi/dpl/internal/async_impl/async_impl.h
@@ -0,0 +1,93 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Copyright (C) Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// This file incorporates work covered by the following copyright and permission
+// notice:
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _ONEDPL_ASYNC_IMPL_H
+#define _ONEDPL_ASYNC_IMPL_H
+
+#if _ONEDPL_HETERO_BACKEND
+# include "async_impl_hetero.h"
+#endif
+
+#include "glue_async_impl.h"
+
+namespace oneapi
+{
+namespace dpl
+{
+namespace experimental
+{
+
+// [wait_for_all]
+template <typename... _Ts>
+oneapi::dpl::__internal::__enable_if_convertible_to_events<void, _Ts...>
+wait_for_all(_Ts&&... __events)
+{
+ ::std::initializer_list<int> i = {0, (__events.wait(), 0)...};
+ (void)i;
+}
+
+// [async.reduce]
+template <class _ExecutionPolicy, class _ForwardIt, class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy<
+ _ExecutionPolicy, oneapi::dpl::__internal::__future<typename std::iterator_traits<_ForwardIt>::value_type>,
+ _Events...>
+reduce_async(_ExecutionPolicy&& __exec, _ForwardIt __first, _ForwardIt __last, _Events&&... __dependencies)
+{
+ using _Tp = typename std::iterator_traits<_ForwardIt>::value_type;
+ return reduce_async(::std::forward<_ExecutionPolicy>(__exec), __first, __last, _Tp(0), ::std::plus<_Tp>(),
+ ::std::forward<_Events>(__dependencies)...);
+}
+
+template <class _ExecutionPolicy, class _ForwardIt, class _T, class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy_single_no_default<
+ _ExecutionPolicy, oneapi::dpl::__internal::__future<_T>, _T, _Events...>
+reduce_async(_ExecutionPolicy&& __exec, _ForwardIt __first, _ForwardIt __last, _T __init, _Events&&... __dependencies)
+{
+ return reduce_async(::std::forward<_ExecutionPolicy>(__exec), __first, __last, __init, ::std::plus<_T>(),
+ ::std::forward<_Events>(__dependencies)...);
+}
+
+// [async.transform_reduce]
+template <class _ExecutionPolicy, class _ForwardIt1, class _ForwardIt2, class _T, class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, oneapi::dpl::__internal::__future<_T>,
+ _Events...>
+transform_reduce_async(_ExecutionPolicy&& __exec, _ForwardIt1 __first1, _ForwardIt1 __last1, _ForwardIt2 __first2,
+ _T __init, _Events&&... __dependencies)
+{
+ using __T1 = typename ::std::iterator_traits<_ForwardIt1>::value_type;
+ return transform_reduce_async(::std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __init,
+ ::std::plus<_T>(), ::std::multiplies<__T1>(),
+ ::std::forward<_Events>(__dependencies)...);
+}
+
+// [async.sort]
+template <class _ExecutionPolicy, class _RandomAccessIterator, class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy<
+ _ExecutionPolicy, oneapi::dpl::__par_backend_hetero::__future<void>, _Events...>
+sort_async(_ExecutionPolicy&& __exec, _RandomAccessIterator __first, _RandomAccessIterator __last,
+ _Events&&... __dependencies)
+{
+ using __T = typename ::std::iterator_traits<_RandomAccessIterator>::value_type;
+ return sort_async(::std::forward<_ExecutionPolicy>(__exec), __first, __last, ::std::less<__T>(),
+ ::std::forward<_Events>(__dependencies)...);
+}
+
+} // namespace experimental
+
+} // namespace dpl
+
+} // namespace oneapi
+
+#endif /* _ONEDPL_GLUE_ASYNC_IMPL_H */
diff --git a/include/oneapi/dpl/internal/async_impl/async_impl_hetero.h b/include/oneapi/dpl/internal/async_impl/async_impl_hetero.h
new file mode 100644
index 000000000000..3ffe694bdb4d
--- /dev/null
+++ b/include/oneapi/dpl/internal/async_impl/async_impl_hetero.h
@@ -0,0 +1,201 @@
+/*
+ * Copyright (c) Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ONEDPL_ASYNC_IMPL_HETERO_H
+#define _ONEDPL_ASYNC_IMPL_HETERO_H
+
+#if _ONEDPL_BACKEND_SYCL
+# include "async_backend_sycl.h"
+#endif
+
+namespace oneapi
+{
+namespace dpl
+{
+namespace __internal
+{
+
+template <typename _ExecutionPolicy, typename _ForwardIterator, typename _Function>
+oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy,
+ oneapi::dpl::__par_backend_hetero::__future<void>>
+__pattern_walk1_async(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, _Function __f)
+{
+ auto __n = __last - __first;
+ if (__n <= 0)
+ return oneapi::dpl::__par_backend_hetero::__future<void>(sycl::event{});
+
+ auto __keep =
+ oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _ForwardIterator>();
+ auto __buf = __keep(__first, __last);
+
+ auto __future_obj = oneapi::dpl::__par_backend_hetero::__parallel_for(
+ ::std::forward<_ExecutionPolicy>(__exec), unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n,
+ __buf.all_view());
+ return __future_obj;
+}
+
+template <typename _IsSync = ::std::false_type,
+ __par_backend_hetero::access_mode __acc_mode1 = __par_backend_hetero::access_mode::read,
+ __par_backend_hetero::access_mode __acc_mode2 = __par_backend_hetero::access_mode::write,
+ typename _ExecutionPolicy, typename _ForwardIterator1, typename _ForwardIterator2, typename _Function>
+oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy,
+ oneapi::dpl::__internal::__future<_ForwardIterator2>>
+__pattern_walk2_async(_ExecutionPolicy&& __exec, _ForwardIterator1 __first1, _ForwardIterator1 __last1,
+ _ForwardIterator2 __first2, _Function __f)
+{
+ auto __n = __last1 - __first1;
+ if (__n <= 0)
+ return oneapi::dpl::__internal::__future<_ForwardIterator2>(sycl::event{}, __first2);
+
+ auto __keep1 = oneapi::dpl::__ranges::__get_sycl_range<__acc_mode1, _ForwardIterator1>();
+ auto __buf1 = __keep1(__first1, __last1);
+
+ auto __keep2 = oneapi::dpl::__ranges::__get_sycl_range<__acc_mode2, _ForwardIterator2>();
+ auto __buf2 = __keep2(__first2, __first2 + __n);
+
+ auto __future_obj = oneapi::dpl::__par_backend_hetero::__parallel_for(
+ ::std::forward<_ExecutionPolicy>(__exec), unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n,
+ __buf1.all_view(), __buf2.all_view());
+ oneapi::dpl::__internal::__invoke_if(_IsSync(), [&__future_obj]() { __future_obj.wait(); });
+
+ return oneapi::dpl::__internal::__future<_ForwardIterator2>(__future_obj, __first2 + __n);
+}
+
+template <typename _ExecutionPolicy, typename _ForwardIterator1, typename _ForwardIterator2, typename _ForwardIterator3,
+ typename _Function>
+oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy,
+ oneapi::dpl::__internal::__future<_ForwardIterator3>>
+__pattern_walk3_async(_ExecutionPolicy&& __exec, _ForwardIterator1 __first1, _ForwardIterator1 __last1,
+ _ForwardIterator2 __first2, _ForwardIterator3 __first3, _Function __f)
+{
+ auto __n = __last1 - __first1;
+ if (__n <= 0)
+ return oneapi::dpl::__internal::__future<_ForwardIterator3>(sycl::event{}, __first3);
+
+ auto __keep1 =
+ oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _ForwardIterator1>();
+ auto __buf1 = __keep1(__first1, __last1);
+ auto __keep2 =
+ oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _ForwardIterator2>();
+ auto __buf2 = __keep2(__first2, __first2 + __n);
+ auto __keep3 =
+ oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::write, _ForwardIterator3>();
+ auto __buf3 = __keep3(__first3, __first3 + __n);
+
+ auto __future_obj = oneapi::dpl::__par_backend_hetero::__parallel_for(
+ ::std::forward<_ExecutionPolicy>(__exec), unseq_backend::walk_n<_ExecutionPolicy, _Function>{__f}, __n,
+ __buf1.all_view(), __buf2.all_view(), __buf3.all_view());
+
+ return oneapi::dpl::__internal::__future<_ForwardIterator3>(__future_obj, __first3 + __n);
+}
+
+template <typename _ExecutionPolicy, typename _ForwardIterator1, typename _ForwardIterator2, typename _Brick>
+oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy,
+ oneapi::dpl::__internal::__future<_ForwardIterator2>>
+__pattern_walk2_brick_async(_ExecutionPolicy&& __exec, _ForwardIterator1 __first1, _ForwardIterator1 __last1,
+ _ForwardIterator2 __first2, _Brick __brick)
+{
+ return __pattern_walk2_async(
+ __par_backend_hetero::make_wrapped_policy<__walk2_brick_wrapper>(::std::forward<_ExecutionPolicy>(__exec)),
+ __first1, __last1, __first2, __brick);
+}
+
+//------------------------------------------------------------------------
+// transform_reduce (version with two binary functions)
+//------------------------------------------------------------------------
+
+template <typename _ExecutionPolicy, typename _RandomAccessIterator1, typename _RandomAccessIterator2, typename _Tp,
+ typename _BinaryOperation1, typename _BinaryOperation2>
+oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, oneapi::dpl::__internal::__future<_Tp>>
+__pattern_transform_reduce_async(_ExecutionPolicy&& __exec, _RandomAccessIterator1 __first1,
+ _RandomAccessIterator1 __last1, _RandomAccessIterator2 __first2, _Tp __init,
+ _BinaryOperation1 __binary_op1, _BinaryOperation2 __binary_op2)
+{
+ if (__first1 == __last1)
+ return oneapi::dpl::__internal::__future<_Tp>(__init);
+
+ using _Policy = _ExecutionPolicy;
+ using _Functor = unseq_backend::walk_n<_Policy, _BinaryOperation2>;
+ using _RepackedTp = __par_backend_hetero::__repacked_tuple_t<_Tp>;
+
+ auto __n = __last1 - __first1;
+ auto __keep1 =
+ oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _RandomAccessIterator1>();
+ auto __buf1 = __keep1(__first1, __last1);
+ auto __keep2 =
+ oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _RandomAccessIterator2>();
+ auto __buf2 = __keep2(__first2, __first2 + __n);
+
+ auto __res = oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce_async<_RepackedTp>(
+ ::std::forward<_ExecutionPolicy>(__exec),
+ unseq_backend::transform_init<_Policy, _BinaryOperation1, _Functor>{__binary_op1,
+ _Functor{__binary_op2}}, // transform
+ __binary_op1, // combine
+ unseq_backend::reduce<_Policy, _BinaryOperation1, _RepackedTp>{__binary_op1}, // reduce
+ __buf1.all_view(), __buf2.all_view());
+ __res.set(__init);
+ return __res;
+}
+
+//------------------------------------------------------------------------
+// transform_reduce (with unary and binary functions)
+//------------------------------------------------------------------------
+
+template <typename _ExecutionPolicy, typename _ForwardIterator, typename _Tp, typename _BinaryOperation,
+ typename _UnaryOperation>
+oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy, oneapi::dpl::__internal::__future<_Tp>>
+__pattern_transform_reduce_async(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last,
+ _Tp __init, _BinaryOperation __binary_op, _UnaryOperation __unary_op)
+{
+ if (__first == __last)
+ return oneapi::dpl::__internal::__future<_Tp>(__init);
+
+ using _Policy = _ExecutionPolicy;
+ using _Functor = unseq_backend::walk_n<_Policy, _UnaryOperation>;
+ using _RepackedTp = __par_backend_hetero::__repacked_tuple_t<_Tp>;
+
+ auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read, _ForwardIterator>();
+ auto __buf = __keep(__first, __last);
+
+ auto res = oneapi::dpl::__par_backend_hetero::__parallel_transform_reduce_async<_RepackedTp>(
+ ::std::forward<_ExecutionPolicy>(__exec),
+ unseq_backend::transform_init<_Policy, _BinaryOperation, _Functor>{__binary_op,
+ _Functor{__unary_op}}, // transform
+ __binary_op, // combine
+ unseq_backend::reduce<_Policy, _BinaryOperation, _RepackedTp>{__binary_op}, // reduce
+ __buf.all_view());
+ res.set(__init);
+ return res;
+}
+
+template <typename _ExecutionPolicy, typename _ForwardIterator, typename _T>
+oneapi::dpl::__internal::__enable_if_device_execution_policy<_ExecutionPolicy,
+ oneapi::dpl::__par_backend_hetero::__future<void>>
+__pattern_fill_async(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, const _T& __value)
+{
+ auto ret_val =
+ __pattern_walk1_async(::std::forward<_ExecutionPolicy>(__exec),
+ __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__first),
+ __par_backend_hetero::make_iter_mode<__par_backend_hetero::access_mode::write>(__last),
+ fill_functor<_T>{__value});
+ return ret_val;
+}
+
+} // namespace __internal
+} // namespace dpl
+} // namespace oneapi
+
+#endif /* _ONEDPL_ASYNC_IMPL_HETERO_H */
diff --git a/include/oneapi/dpl/internal/async_impl/async_utils.h b/include/oneapi/dpl/internal/async_impl/async_utils.h
new file mode 100644
index 000000000000..d4198e74b1c6
--- /dev/null
+++ b/include/oneapi/dpl/internal/async_impl/async_utils.h
@@ -0,0 +1,128 @@
+/*
+ * Copyright (c) Intel Corporation
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef _ONEDPL_ASYNC_UTILS_H
+#define _ONEDPL_ASYNC_UTILS_H
+
+#if _ONEDPL_BACKEND_SYCL
+# include <CL/sycl.hpp>
+#endif
+
+namespace oneapi
+{
+namespace dpl
+{
+namespace __internal
+{
+
+template <typename _T>
+struct async_value_base
+{
+ virtual ~async_value_base() = default;
+ virtual _T data(_T) = 0;
+};
+
+template <typename _T, typename _Buf, typename _Op>
+class async_value : public async_value_base<_T>
+{
+ _Buf __my_buffer;
+ _Op __my_op;
+ size_t __my_offset;
+
+ public:
+ async_value(_Buf __b, _Op __o, size_t __i) : __my_buffer(__b), __my_op(__o), __my_offset(__i) {}
+ _T
+ data(_T __init)
+ {
+ return __my_op(__my_buffer.template get_access<access_mode::read>()[__my_offset], __init);
+ }
+};
+
+template <typename _T>
+class __future : public __par_backend_hetero::__future_base
+{
+ ::std::unique_ptr<async_value_base<_T>> __ret_val;
+ _T __init;
+
+ public:
+ __future(_T __i) : __par_backend_hetero::__future_base(), __init(__i) {}
+
+ template <typename _Event, typename _Op, typename _Buf>
+ __future(_Event __e, _Buf __b, _Op __o, size_t __offset) : __par_backend_hetero::__future_base(__e)
+ {
+ __ret_val = ::std::unique_ptr<async_value<_T, _Buf, _Op>>(new async_value<_T, _Buf, _Op>(__b, __o, __offset));
+ }
+ void
+ set(_T __i)
+ {
+ __init = __i;
+ }
+ _T
+ get()
+ {
+ this->wait();
+ return __ret_val->data(__init);
+ }
+};
+
+#if _ONEDPL_BACKEND_SYCL
+// Specialization for sycl_iterator
+template <typename _T>
+class __future<sycl_iterator<sycl::access::mode::read_write, _T, sycl::buffer_allocator>>
+ : public __par_backend_hetero::__future_base
+{
+ using _Tp = sycl_iterator<sycl::access::mode::read_write, _T, sycl::buffer_allocator>;
+ _Tp __data;
+ ::std::unique_ptr<__par_backend_hetero::__lifetime_keeper_base> __tmp;
+
+ public:
+ template <typename... _Ts>
+ __future(sycl::event __e, _Tp __d, _Ts... __t) : __par_backend_hetero::__future_base(__e), __data(__d)
+ {
+ if (sizeof...(_Ts) != 0)
+ __tmp = ::std::unique_ptr<__par_backend_hetero::__lifetime_keeper<_Ts...>>(
+ new __par_backend_hetero::__lifetime_keeper<_Ts...>(__t...));
+ }
+ _Tp
+ get()
+ {
+ this->wait();
+ return __data;
+ }
+};
+#endif
+
+template <typename _ExecPolicy, typename _T, typename _Op1, typename... _Events>
+using __enable_if_device_execution_policy_single_no_default = typename ::std::enable_if<
+ oneapi::dpl::__internal::__is_device_execution_policy<typename ::std::decay<_ExecPolicy>::type>::value &&
+ !::std::is_convertible<_Op1, sycl::event>::value &&
+ oneapi::dpl::__internal::__is_convertible_to_event<_Events...>::value,
+ _T>::type;
+
+template <typename _ExecPolicy, typename _T, typename _Op1, typename _Op2, typename... _Events>
+using __enable_if_device_execution_policy_double_no_default = typename ::std::enable_if<
+ oneapi::dpl::__internal::__is_device_execution_policy<typename ::std::decay<_ExecPolicy>::type>::value &&
+ !::std::is_convertible<_Op1, sycl::event>::value && !::std::is_convertible<_Op2, sycl::event>::value &&
+ oneapi::dpl::__internal::__is_convertible_to_event<_Events...>::value,
+ _T>::type;
+
+} // namespace __internal
+
+} // namespace dpl
+
+} // namespace oneapi
+
+#endif /* _ONEDPL_ASYNC_UTILS_H */
diff --git a/include/oneapi/dpl/internal/async_impl/glue_async_impl.h b/include/oneapi/dpl/internal/async_impl/glue_async_impl.h
new file mode 100644
index 000000000000..e6b25e0e7c90
--- /dev/null
+++ b/include/oneapi/dpl/internal/async_impl/glue_async_impl.h
@@ -0,0 +1,165 @@
+// -*- C++ -*-
+//===----------------------------------------------------------------------===//
+//
+// Copyright (C) Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// This file incorporates work covered by the following copyright and permission
+// notice:
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef _ONEDPL_GLUE_ASYNC_IMPL_H
+#define _ONEDPL_GLUE_ASYNC_IMPL_H
+
+#if _ONEDPL_HETERO_BACKEND
+# include "async_impl_hetero.h"
+#endif
+
+namespace oneapi
+{
+namespace dpl
+{
+namespace experimental
+{
+
+// [async.transform]
+template <class _ExecutionPolicy, class _ForwardIterator1, class _ForwardIterator2, class _UnaryOperation,
+ class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy<
+ _ExecutionPolicy, oneapi::dpl::__internal::__future<_ForwardIterator2>, _Events...>
+transform_async(_ExecutionPolicy&& __exec, _ForwardIterator1 __first, _ForwardIterator1 __last,
+ _ForwardIterator2 __result, _UnaryOperation __op, _Events&&... __dependencies)
+{
+ wait_for_all(::std::forward<_Events>(__dependencies)...);
+ auto ret_val = oneapi::dpl::__internal::__pattern_walk2_async(
+ ::std::forward<_ExecutionPolicy>(__exec), __first, __last, __result,
+ oneapi::dpl::__internal::__invoke_unary_op<_UnaryOperation>{::std::move(__op)});
+ return ret_val;
+}
+
+template <class _ExecutionPolicy, class _ForwardIterator1, class _ForwardIterator2, class _ForwardIterator,
+ class _BinaryOperation, class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy_single_no_default<
+ _ExecutionPolicy, oneapi::dpl::__internal::__future<_ForwardIterator>, _BinaryOperation, _Events...>
+transform_async(_ExecutionPolicy&& __exec, _ForwardIterator1 __first1, _ForwardIterator1 __last1,
+ _ForwardIterator2 __first2, _ForwardIterator __result, _BinaryOperation __op,
+ _Events&&... __dependencies)
+{
+ wait_for_all(::std::forward<_Events>(__dependencies)...);
+ auto ret_val = oneapi::dpl::__internal::__pattern_walk3_async(
+ ::std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __result,
+ oneapi::dpl::__internal::__transform_functor<
+ oneapi::dpl::__internal::__ref_or_copy<_ExecutionPolicy, _BinaryOperation>>(__op));
+ return ret_val;
+}
+
+// [async.copy]
+template <class _ExecutionPolicy, class _ForwardIterator1, class _ForwardIterator2, class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy<
+ _ExecutionPolicy, oneapi::dpl::__internal::__future<_ForwardIterator2>, _Events...>
+copy_async(_ExecutionPolicy&& __exec, _ForwardIterator1 __first, _ForwardIterator1 __last, _ForwardIterator2 __result,
+ _Events&&... __dependencies)
+{
+ wait_for_all(::std::forward<_Events>(__dependencies)...);
+ auto ret_val = oneapi::dpl::__internal::__pattern_walk2_brick_async(
+ ::std::forward<_ExecutionPolicy>(__exec), __first, __last, __result,
+ oneapi::dpl::__internal::__brick_copy<_ExecutionPolicy>{});
+ return ret_val;
+}
+
+// [async.sort]
+template <class _ExecutionPolicy, class _Iterator, class _Compare, class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy_single_no_default<
+ _ExecutionPolicy, oneapi::dpl::__par_backend_hetero::__future<void>, _Compare, _Events...>
+sort_async(_ExecutionPolicy&& __exec, _Iterator __first, _Iterator __last, _Compare __comp, _Events&&... __dependencies)
+{
+ wait_for_all(::std::forward<_Events>(__dependencies)...);
+ if (__last - __first < 2)
+ return oneapi::dpl::__par_backend_hetero::__future<void>(sycl::event{});
+
+ auto __keep = oneapi::dpl::__ranges::__get_sycl_range<__par_backend_hetero::access_mode::read_write, _Iterator>();
+ auto __buf = __keep(__first, __last);
+
+ return __par_backend_hetero::__parallel_stable_sort(::std::forward<_ExecutionPolicy>(__exec), __buf.all_view(),
+ __comp);
+}
+
+// [async.for_each]
+template <class _ExecutionPolicy, class _ForwardIterator, class _Function, class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy<
+ _ExecutionPolicy, oneapi::dpl::__par_backend_hetero::__future<void>, _Events...>
+for_each_async(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, _Function __f,
+ _Events&&... __dependencies)
+{
+ wait_for_all(::std::forward<_Events>(__dependencies)...);
+ auto ret_val =
+ oneapi::dpl::__internal::__pattern_walk1_async(::std::forward<_ExecutionPolicy>(__exec), __first, __last, __f);
+ return ret_val;
+}
+
+// [async.reduce]
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class _BinaryOperation, class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy_double_no_default<
+ _ExecutionPolicy, oneapi::dpl::__internal::__future<_Tp>, _Tp, _BinaryOperation, _Events...>
+reduce_async(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, _Tp __init,
+ _BinaryOperation __binary_op, _Events&&... __dependencies)
+{
+ wait_for_all(::std::forward<_Events>(__dependencies)...);
+ typedef typename ::std::iterator_traits<_ForwardIterator>::value_type _InputType;
+ auto ret_val = oneapi::dpl::__internal::__pattern_transform_reduce_async(
+ ::std::forward<_ExecutionPolicy>(__exec), __first, __last, __init, ::std::plus<_InputType>(),
+ oneapi::dpl::__internal::__no_op());
+ return ret_val;
+}
+
+// [async.fill]
+
+template <class _ExecutionPolicy, class _ForwardIterator, class _Tp, class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy<
+ _ExecutionPolicy, oneapi::dpl::__par_backend_hetero::__future<void>, _Events...>
+fill_async(_ExecutionPolicy&& __exec, _ForwardIterator __first, _ForwardIterator __last, const _Tp& __value,
+ _Events&&... __dependencies)
+{
+ wait_for_all(::std::forward<_Events>(__dependencies)...);
+ return oneapi::dpl::__internal::__pattern_fill_async(::std::forward<_ExecutionPolicy>(__exec), __first, __last,
+ __value);
+}
+
+// [async.transform_reduce]
+
+template <class _ExecutionPolicy, class _ForwardIt1, class _ForwardIt2, class _T, class _BinaryOp1, class _BinaryOp2,
+ class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy_double_no_default<
+ _ExecutionPolicy, oneapi::dpl::__internal::__future<_T>, _BinaryOp1, _BinaryOp2, _Events...>
+transform_reduce_async(_ExecutionPolicy&& __exec, _ForwardIt1 __first1, _ForwardIt1 __last1, _ForwardIt2 __first2,
+ _T __init, _BinaryOp1 __binary_op1, _BinaryOp2 __binary_op2, _Events&&... __dependencies)
+{
+ wait_for_all(::std::forward<_Events>(__dependencies)...);
+ return oneapi::dpl::__internal::__pattern_transform_reduce_async(
+ ::std::forward<_ExecutionPolicy>(__exec), __first1, __last1, __first2, __init, __binary_op1, __binary_op2);
+}
+
+template <class _ExecutionPolicy, class _ForwardIt, class _T, class _BinaryOp, class _UnaryOp, class... _Events>
+oneapi::dpl::__internal::__enable_if_device_execution_policy_single_no_default<
+ _ExecutionPolicy, oneapi::dpl::__internal::__future<_T>, _UnaryOp, _Events...>
+transform_reduce_async(_ExecutionPolicy&& __exec, _ForwardIt __first, _ForwardIt __last, _T __init,
+ _BinaryOp __binary_op, _UnaryOp __unary_op, _Events&&... __dependencies)
+{
+ wait_for_all(::std::forward<_Events>(__dependencies)...);
+ return oneapi::dpl::__internal::__pattern_transform_reduce_async(::std::forward<_ExecutionPolicy>(__exec), __first,
+ __last, __init, __binary_op, __unary_op);
+}
+
+} // namespace experimental
+
+} // namespace dpl
+
+} // namespace oneapi
+
+#endif /* _ONEDPL_GLUE_ASYNC_IMPL_H */
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/execution_sycl_defs.h b/include/oneapi/dpl/pstl/hetero/dpcpp/execution_sycl_defs.h
index b871719d4d28..c989ab40c405 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/execution_sycl_defs.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/execution_sycl_defs.h
@@ -280,10 +280,29 @@ struct __ref_or_copy_impl<execution::device_policy<PolicyParams...>, _T>
using type = _T;
};
+// Extension: check if parameter pack is convertible to events
+template <bool...>
+struct __is_true_helper
+{
+};
+
+template <bool... _Ts>
+using __is_all_true = ::std::is_same<__is_true_helper<_Ts..., true>, __is_true_helper<true, _Ts...>>;
+
+template <class... _Ts>
+using __is_convertible_to_event =
+ __is_all_true<::std::is_convertible<typename ::std::decay<_Ts>::type, sycl::event>::value...>;
+
+template <typename _T, typename... _Events>
+using __enable_if_convertible_to_events =
+ typename ::std::enable_if<oneapi::dpl::__internal::__is_convertible_to_event<_Events...>::value, _T>::type;
+
// Extension: execution policies type traits
-template <typename _ExecPolicy, typename _T>
+template <typename _ExecPolicy, typename _T, typename... _Events>
using __enable_if_device_execution_policy = typename ::std::enable_if<
- oneapi::dpl::__internal::__is_device_execution_policy<typename ::std::decay<_ExecPolicy>::type>::value, _T>::type;
+ oneapi::dpl::__internal::__is_device_execution_policy<typename ::std::decay<_ExecPolicy>::type>::value &&
+ oneapi::dpl::__internal::__is_convertible_to_event<_Events...>::value,
+ _T>::type;
template <typename _ExecPolicy, typename _T>
using __enable_if_hetero_execution_policy = typename ::std::enable_if<
diff --git a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
index 81595360d37c..c3dfc4564917 100644
--- a/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
+++ b/include/oneapi/dpl/pstl/hetero/dpcpp/parallel_backend_sycl_utils.h
@@ -386,6 +386,7 @@ class __future_base
sycl::event __my_event;
public:
+ __future_base() : __my_event(sycl::event{}) {}
__future_base(sycl::event __e) : __my_event(__e) {}
void
wait()
diff --git a/test/xpu_api/asynch.pass.cpp b/test/xpu_api/asynch.pass.cpp
new file mode 100644
index 000000000000..0ba6d5cc19e3
--- /dev/null
+++ b/test/xpu_api/asynch.pass.cpp
@@ -0,0 +1,153 @@
+// -*- C++ -*-
+//===-- async.pass.cpp ----------------------------------------------------===//
+//
+// Copyright (C) Intel Corporation
+//
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+// This file incorporates work covered by the following copyright and permission
+// notice:
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+//
+//===----------------------------------------------------------------------===//
+
+#include "oneapi/dpl/execution"
+#include "oneapi/dpl/async"
+#include "oneapi/dpl/iterator"
+
+#include "support/pstl_test_config.h"
+
+#include <iostream>
+#include <iomanip>
+#include <numeric>
+
+#if TEST_SYCL_PRESENT
+# include <CL/sycl.hpp>
+#endif
+
+template <typename _T1, typename _T2>
+void
+ASSERT_EQUAL(_T1&& X, _T2&& Y)
+{
+ if (X != Y)
+ std::cout << "CHECK CORRECTNESS (ASYNC): fail (" << X << "," << Y << ")" << std::endl;
+}
+
+#if TEST_DPCPP_BACKEND_PRESENT
+void
+test_with_buffers()
+{
+ const int n = 100;
+ {
+ //sycl::queue q;
+ sycl::buffer<int> x{n};
+ sycl::buffer<int> y{n};
+
+ //auto my_policy = oneapi::dpl::execution::make_device_policy(q);
+ auto my_policy = oneapi::dpl::execution::make_device_policy<class Copy1>(oneapi::dpl::execution::dpcpp_default);
+ auto res_1a = oneapi::dpl::experimental::copy_async(my_policy, oneapi::dpl::counting_iterator<int>(0),
+ oneapi::dpl::counting_iterator<int>(n),
+ oneapi::dpl::begin(x)); // x = [0..n]
+ auto my_policy1 = oneapi::dpl::execution::make_device_policy<class Fill1>(my_policy);
+ auto res_1b = oneapi::dpl::experimental::fill_async(my_policy1, oneapi::dpl::begin(y), oneapi::dpl::end(y),
+ 7); // y = [7..7]
+ auto my_policy2 = oneapi::dpl::execution::make_device_policy<class ForEach1>(my_policy);
+ auto res_2a = oneapi::dpl::experimental::for_each_async(
+ my_policy2, oneapi::dpl::begin(x), oneapi::dpl::end(x), [](int& e) { ++e; }, res_1a); // x = [1..n]
+ auto my_policy3 = oneapi::dpl::execution::make_device_policy<class Transform1>(my_policy);
+ auto res_2b = oneapi::dpl::experimental::transform_async(
+ my_policy3, oneapi::dpl::begin(y), oneapi::dpl::end(y), oneapi::dpl::begin(y),
+ [](const int& e) { return e / 2; },
+ res_1b); // y = [3..3]
+
+ sycl::buffer<int> z{n}; //std::vector<int> z(n);
+ auto my_policy4 = oneapi::dpl::execution::make_device_policy<class Transform2>(my_policy);
+ auto res_3 = oneapi::dpl::experimental::transform_async(my_policy4, oneapi::dpl::begin(x), oneapi::dpl::end(x),
+ oneapi::dpl::begin(y), oneapi::dpl::begin(z),
+ std::plus<int>(), res_2a, res_2b); // z = [4..n+3]
+ auto my_policy5 = oneapi::dpl::execution::make_device_policy<class Reduce1>(my_policy);
+ auto alpha = oneapi::dpl::experimental::reduce_async(my_policy5, oneapi::dpl::begin(x), oneapi::dpl::end(x), 0,
+ std::plus<int>(),
+ res_2a)
+ .get(); // alpha = n*(n+1)/2
+ auto my_policy6 = oneapi::dpl::execution::make_device_policy<class Reduce2>(my_policy);
+ auto beta =
+ oneapi::dpl::experimental::transform_reduce_async(my_policy6, oneapi::dpl::begin(z), oneapi::dpl::end(z), 0,
+ std::plus<int>(), [=](int e) { return alpha * e; })
+ .get();
+
+ ASSERT_EQUAL(beta, (n * (n + 1) / 2) * ((n + 3) * (n + 4) / 2 - 6));
+ }
+}
+
+void
+test_with_usm()
+{
+ cl::sycl::queue q;
+ const int n = 1024;
+ const int n_small = 13;
+
+ // ASYNC TEST USING USM //
+ // TODO: Extend tests by checking true async behavior in more detail
+ {
+ // Allocate space for data using USM.
+ uint64_t* data1 =
+ static_cast<uint64_t*>(cl::sycl::malloc_shared(n * sizeof(uint64_t), q.get_device(), q.get_context()));
+ uint64_t* data2 =
+ static_cast<uint64_t*>(cl::sycl::malloc_shared(n * sizeof(uint64_t), q.get_device(), q.get_context()));
+
+ // Initialize data
+ for (int i = 0; i != n - 1; ++i)
+ {
+ data1[i] = i % 4 + 1;
+ data2[i] = data1[i] + 1;
+ if (i > 3 && i != n - 2)
+ {
+ ++i;
+ data1[i] = data1[i - 1];
+ data2[i] = data2[i - 1];
+ }
+ }
+ data1[n - 1] = 0;
+ data2[n - 1] = 0;
+
+ // compute reference values
+ const uint64_t ref1 = std::inner_product(data2, data2 + n, data1, 0);
+ const uint64_t ref2 = std::accumulate(data1, data1 + n_small, 0);
+
+ // call first algorithm
+ auto new_policy1 = oneapi::dpl::execution::make_device_policy<class async1>(q);
+ auto fut1 = oneapi::dpl::experimental::transform_reduce_async(
+ new_policy1, data2, data2 + n, data1, 0, std::plus<uint64_t>(), std::multiplies<uint64_t>());
+
+ // call second algorithm and wait for result
+ auto new_policy2 = oneapi::dpl::execution::make_device_policy<class async2>(q);
+ auto res2 = oneapi::dpl::experimental::reduce_async(new_policy2, data1, data1 + n_small).get();
+
+ // call third algorithm that has to wait for first to complete
+ auto new_policy3 = oneapi::dpl::execution::make_device_policy<class async3>(q);
+ oneapi::dpl::experimental::sort_async(new_policy3, data2, data2 + n, fut1);
+
+ // check values
+ auto res1 = fut1.get();
+ ASSERT_EQUAL(res1, ref1);
+ ASSERT_EQUAL(res2, ref2);
+
+ sycl::free(data1, q);
+ sycl::free(data2, q);
+ }
+}
+#endif
+
+int
+main()
+{
+#if TEST_DPCPP_BACKEND_PRESENT
+ test_with_buffers();
+ test_with_usm();
+#endif
+ std::cout << "done" << std::endl;
+ return 0;
+}