// mesytec-mnode/external/taskflow-3.8.0/taskflow/sycl/algorithm/reduce.hpp

#pragma once
#include "../syclflow.hpp"
namespace tf::detail {
// ----------------------------------------------------------------------------
// reduction helper functions
// ----------------------------------------------------------------------------
/** @private */
template<unsigned nt, typename T>
struct syclBlockReduce {

  static const unsigned group_size = std::min(nt, SYCL_WARP_SIZE);
  static const unsigned shm_size   = std::max(nt, 2 * group_size);
  static const unsigned num_passes = log2(group_size);
  static const unsigned num_items  = nt / group_size;

  static_assert(
    nt && (0 == nt % SYCL_WARP_SIZE),
    "syclBlockReduce requires num threads to be a multiple of warp_size (32)"
  );

  using shm_t = sycl::accessor<
    T, 1, sycl::access::mode::read_write, sycl::access::target::local
  >;

  template<typename op_t>
  T operator()(
    sycl::nd_item<1>&, T, const shm_t&, unsigned, op_t, bool = true
  ) const;
};
// function: reduce to be called from a block
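//
// Phase 1: the first group_size work-items each combine their own value with
// the shared-memory slots tid, tid + group_size, ..., i.e. num_items values
// per work-item. Phase 2: a log2(group_size)-pass tree reduction over those
// partials, double-buffered in shared memory via the `first` offset, leaves
// the final value in shm[0].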
template<unsigned nt, typename T>
template<typename op_t>
T syclBlockReduce<nt, T>::operator()(
  sycl::nd_item<1>& item,
  T x,
  const shm_t& shm,
  unsigned count,
  op_t op,
  bool ret
) const {

  auto tid = item.get_local_id(0);

  // store this work-item's value into shared memory
  shm[tid] = x;
  item.barrier(sycl::access::fence_space::local_space);

  if(tid < group_size) {
    // each of the first group_size work-items reduces its strided slots
    sycl_strided_iterate<group_size, num_items>([&](auto i, auto j) {
      if(i > 0) {
        x = op(x, shm[j]);
      }
    }, tid, count);
    shm[tid] = x;
  }
  item.barrier(sycl::access::fence_space::local_space);

  auto count2 = count < group_size ? count : group_size;
  auto first = (1 & num_passes) ? group_size : 0;

  if(tid < group_size) {
    shm[first + tid] = x;
  }
  item.barrier(sycl::access::fence_space::local_space);

  // tree reduction over the partials, double-buffered in shared memory
  sycl_iterate<num_passes>([&](auto pass) {
    if(tid < group_size) {
      if(auto offset = 1 << pass; tid + offset < count2) {
        x = op(x, shm[first + offset + tid]);
      }
      first = group_size - first;
      shm[first + tid] = x;
    }
    item.barrier(sycl::access::fence_space::local_space);
  });

  if(ret) {
    x = shm[0];
    item.barrier(sycl::access::fence_space::local_space);
  }

  return x;
}
/** @private */
template <typename P, typename I, typename T, typename O>
sycl::event sycl_reduce_loop(
  P&& p,
  I input,
  unsigned count,
  T* res,
  O op,
  bool incl,
  void* ptr,
  std::vector<sycl::event> evs
) {

  using E = std::decay_t<P>;
  using R = syclBlockReduce<E::nt, T>;

  auto buf = static_cast<T*>(ptr);
  auto B = (count + E::nv - 1) / E::nv;

  auto e = p.queue().submit([=, evs=std::move(evs)](sycl::handler& h) {

    h.depends_on(evs);

    // allocate shared (local) memory for the block-wide reduction
    typename R::shm_t shm(sycl::range<1>(R::shm_size), h);

    h.parallel_for(
      sycl::nd_range<1>{sycl::range<1>(B*E::nt), sycl::range<1>(E::nt)},
      [=](sycl::nd_item<1> item) {
        auto tid = item.get_local_id(0);
        auto bid = item.get_group(0);
        // get the tile of this group
        auto tile = sycl_get_tile(bid, E::nv, count);
        // load data from input to register
        auto x = sycl_mem_to_reg_strided<E::nt, E::vt>(
          input + tile.begin, tid, tile.count()
        );
        // reduce multiple values per thread into a scalar
        T s;
        sycl_strided_iterate<E::nt, E::vt>(
          [&] (auto i, auto) { s = i ? op(s, x[i]) : x[0]; }, tid, tile.count()
        );
        // reduce to a scalar per block
        s = R()(
          item, s, shm, (tile.count() < E::nt ? tile.count() : E::nt), op, false
        );
        // the first work-item writes out the result: the last level (B == 1)
        // accumulates into res, earlier levels store per-group partials in buf
        if(!tid) {
          (1 == B) ? *res = (incl ? op(*res, s) : s) : buf[bid] = s;
        }
      }
    );
  });

  // more than one group: recursively reduce the B partial results in buf,
  // using the space right after them as scratch for the next level
  if(B > 1) {
    return sycl_reduce_loop(p, buf, B, res, op, incl, buf+B, {e});
  }
  else {
    return e;
  }
}
} // end of namespace detail -------------------------------------------------
namespace tf {
/**
@brief queries the buffer size in bytes needed to call reduce kernels
@tparam P execution policy type
@tparam T value type
@param count number of elements to reduce
The function is used to determine the size of the temporary buffer to allocate
before calling asynchronous reduce.
Please refer to @ref SYCLSTDReduce for details.
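A minimal sizing sketch (assuming a SYCL queue @c queue and a reduction over
@c N floats; the names are illustrative only) might look like:

@code{.cpp}
// number of scratch bytes the asynchronous reduce needs for N elements
auto bytes = tf::sycl_reduce_buffer_size<tf::syclDefaultExecutionPolicy, float>(N);

// the buffer is passed as the `buf` argument of the asynchronous reduce calls
void* buf = sycl::malloc_device(bytes, queue);
@endcode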
*/
template <typename P, typename T>
unsigned sycl_reduce_buffer_size(unsigned count) {
  using E = std::decay_t<P>;
  unsigned B = (count + E::nv - 1) / E::nv;
  unsigned n = 0;
  // every level of the reduction loop that launches more than one group
  // stores one partial result per group in the buffer
  for(auto b=B; b>1; b=(b+E::nv-1)/E::nv) {
    n += b;
  }
  return n*sizeof(T);
}
//// sycl reduction
//template <typename I, typename T, typename C, bool uninitialized>
//auto syclFlow::_reduce_cgh(I first, I last, T* res, C&& op) {
//
// // TODO: special case N == 0?
// size_t N = std::distance(first, last);
// size_t B = _default_group_size(N);
//
// return [=, op=std::forward<C>(op)](sycl::handler& handler) mutable {
//
// // create a shared memory
// sycl::accessor<
// T, 1, sycl::access::mode::read_write, sycl::access::target::local
// > shm(sycl::range<1>(B), handler);
//
// // perform parallel reduction
// handler.parallel_for(
// sycl::nd_range<1>{sycl::range<1>(B), sycl::range<1>(B)},
// [=] (sycl::nd_item<1> item) {
//
// size_t tid = item.get_global_id(0);
//
// if(tid >= N) {
// return;
// }
//
// shm[tid] = *(first+tid);
//
// for(size_t i=tid+B; i<N; i+=B) {
// shm[tid] = op(shm[tid], *(first+i));
// }
//
// item.barrier(sycl::access::fence_space::local_space);
//
// for(size_t s = B / 2; s > 0; s >>= 1) {
// if(tid < s && tid + s < N) {
// shm[tid] = op(shm[tid], shm[tid+s]);
// }
// item.barrier(sycl::access::fence_space::local_space);
// }
//
// if(tid == 0) {
// if constexpr (uninitialized) {
// *res = shm[0];
// }
// else {
// *res = op(*res, shm[0]);
// }
// }
// });
// };
//}
// ----------------------------------------------------------------------------
// SYCL standard reduce algorithms
// ----------------------------------------------------------------------------
/**
@brief performs parallel reduction over a range of items
@tparam P execution policy type
@tparam I input iterator type
@tparam T value type
@tparam O binary operator type
@param p execution policy
@param first iterator to the beginning of the range
@param last iterator to the end of the range
@param res pointer to the result
@param op binary operator to apply to reduce elements
This method is equivalent to the parallel execution of the following loop
on a SYCL device:
@code{.cpp}
while (first != last) {
*result = op(*result, *first++);
}
@endcode
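A minimal usage sketch (assuming a SYCL queue @c queue, a USM device array
@c data of @c N integers, and a device-accessible @c res that already holds
the initial value; the names are illustrative only) might look like:

@code{.cpp}
tf::syclDefaultExecutionPolicy p(queue);
// *res participates, i.e. *res = *res + data[0] + ... + data[N-1]
tf::sycl_reduce(p, data, data + N, res, [](int a, int b){ return a + b; });
@endcode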
*/
template<typename P, typename I, typename T, typename O>
void sycl_reduce(P&& p, I first, I last, T* res, O op) {

  unsigned count = std::distance(first, last);

  if(count == 0) {
    return;
  }

  // allocate temporary buffer
  auto tmp = sycl::malloc_device(
    sycl_reduce_buffer_size<P, T>(count), p.queue()
  );

  // reduction loop
  detail::sycl_reduce_loop(p, first, count, res, op, true, tmp, {}).wait();

  // deallocate the temporary buffer
  sycl::free(tmp, p.queue());
}
/**
@brief performs asynchronous parallel reduction over a range of items
@tparam P execution policy type
@tparam I input iterator type
@tparam T value type
@tparam O binary operator type
@param p execution policy
@param first iterator to the beginning of the range
@param last iterator to the end of the range
@param res pointer to the result
@param op binary operator to apply to reduce elements
@param buf pointer to a temporary device buffer of at least
       sycl_reduce_buffer_size<P, T>(std::distance(first, last)) bytes
@return a SYCL event
Please refer to @ref SYCLSTDReduce for details.
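A minimal usage sketch (assuming @c queue, a USM device array @c data of
@c N integers, and a device-accessible @c res holding the initial value;
the names are illustrative only) might look like:

@code{.cpp}
tf::syclDefaultExecutionPolicy p(queue);
auto bytes = tf::sycl_reduce_buffer_size<tf::syclDefaultExecutionPolicy, int>(N);
void* buf  = sycl::malloc_device(bytes, queue);
auto event = tf::sycl_reduce_async(
  p, data, data + N, res, [](int a, int b){ return a + b; }, buf, {}
);
event.wait();            // buf must stay valid until the event completes
sycl::free(buf, queue);
@endcode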
*/
template<typename P, typename I, typename T, typename O>
sycl::event sycl_reduce_async(
  P&& p, I first, I last, T* res, O op, void* buf, std::vector<sycl::event> dep
) {

  unsigned count = std::distance(first, last);

  if(count == 0) {
    return {};
  }

  // reduction loop
  return detail::sycl_reduce_loop(
    p, first, count, res, op, true, buf, std::move(dep)
  );
}
/**
@brief performs parallel reduction over a range of items
without an initial value
@tparam P execution policy type
@tparam I input iterator type
@tparam T value type
@tparam O binary operator type
@param p execution policy
@param first iterator to the beginning of the range
@param last iterator to the end of the range
@param res pointer to the result
@param op binary operator to apply to reduce elements
This method is equivalent to the parallel execution of the following loop
on a SYCL device:
@code{.cpp}
*result = *first++; // no initial value participates in the loop
while (first != last) {
*result = op(*result, *first++);
}
@endcode
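A minimal usage sketch (assuming @c queue and a USM device array @c data of
@c N floats; the names are illustrative only) might look like:

@code{.cpp}
tf::syclDefaultExecutionPolicy p(queue);
float* res = sycl::malloc_device<float>(1, queue);
// *res is overwritten with data[0] + ... + data[N-1]; its prior value is ignored
tf::sycl_uninitialized_reduce(p, data, data + N, res, [](float a, float b){ return a + b; });
@endcode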
*/
template<typename P, typename I, typename T, typename O>
void sycl_uninitialized_reduce(P&& p, I first, I last, T* res, O op) {

  unsigned count = std::distance(first, last);

  if(count == 0) {
    return;
  }

  // allocate temporary buffer
  auto tmp = sycl::malloc_device(
    sycl_reduce_buffer_size<P, T>(count), p.queue()
  );

  // reduction loop
  detail::sycl_reduce_loop(p, first, count, res, op, false, tmp, {}).wait();

  // deallocate the temporary buffer
  sycl::free(tmp, p.queue());
}
/**
@brief performs asynchronous parallel reduction over a range of items
without an initial value
@tparam P execution policy type
@tparam I input iterator type
@tparam T value type
@tparam O binary operator type
@param p execution policy
@param first iterator to the beginning of the range
@param last iterator to the end of the range
@param res pointer to the result
@param op binary operator to apply to reduce elements
@param buf pointer to a temporary device buffer of at least
       sycl_reduce_buffer_size<P, T>(std::distance(first, last)) bytes
@return a SYCL event
Please refer to @ref SYCLSTDReduce for details.
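A minimal usage sketch (assuming @c queue, device data @c data of size @c N,
a device-accessible @c res, a pre-allocated scratch buffer @c buf, and a
prior event @c fill_event that produced the data; the names are illustrative
only) might look like:

@code{.cpp}
tf::syclDefaultExecutionPolicy p(queue);
auto event = tf::sycl_uninitialized_reduce_async(
  p, data, data + N, res, [](float a, float b){ return a + b; }, buf, {fill_event}
);
// the returned event can be waited on or used as a dependency of later work
event.wait();
@endcode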
*/
template<typename P, typename I, typename T, typename O>
sycl::event sycl_uninitialized_reduce_async(
  P&& p, I first, I last, T* res, O op, void* buf, std::vector<sycl::event> dep
) {

  unsigned count = std::distance(first, last);

  if(count == 0) {
    return {};
  }

  // reduction loop
  return detail::sycl_reduce_loop(
    p, first, count, res, op, false, buf, std::move(dep)
  );
}
// ----------------------------------------------------------------------------
// syclFlow reduce
// ----------------------------------------------------------------------------
// Function: reduce
template <typename I, typename T, typename C>
syclTask syclFlow::reduce(I first, I last, T* res, C&& op) {

  //return on(_reduce_cgh<I, T, C, false>(first, last, res, std::forward<C>(op)));

  auto bufsz = sycl_reduce_buffer_size<syclDefaultExecutionPolicy, T>(
    std::distance(first, last)
  );

  return on([=, buf=MoC{syclScopedDeviceMemory<std::byte>(bufsz, _queue)}]
    (sycl::queue& queue, std::vector<sycl::event> events) mutable {
      syclDefaultExecutionPolicy p(queue);
      return sycl_reduce_async(
        p, first, last, res, op, buf.get().data(), std::move(events)
      );
  });
}
// Function: uninitialized_reduce
template <typename I, typename T, typename C>
syclTask syclFlow::uninitialized_reduce(I first, I last, T* res, C&& op) {

  //return on(_reduce_cgh<I, T, C, true>(first, last, res, std::forward<C>(op)));

  auto bufsz = sycl_reduce_buffer_size<syclDefaultExecutionPolicy, T>(
    std::distance(first, last)
  );

  return on([=, buf=MoC{syclScopedDeviceMemory<std::byte>(bufsz, _queue)}]
    (sycl::queue& queue, std::vector<sycl::event> events) mutable {
      syclDefaultExecutionPolicy p(queue);
      return sycl_uninitialized_reduce_async(
        p, first, last, res, op, buf.get().data(), std::move(events)
      );
  });
}
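
// Example (a sketch, not part of the library): given an existing tf::syclFlow
// `sf` whose queue can access USM device pointers `data` and `res`, a task
// that overwrites *res with the sum of data[0..N) could be created as
//
//   tf::syclTask task = sf.uninitialized_reduce(
//     data, data + N, res, [](int a, int b){ return a + b; }
//   );
//
// reduce() behaves identically except that the existing value of *res also
// participates in the reduction.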
// ----------------------------------------------------------------------------
// rebind methods
// ----------------------------------------------------------------------------
//// Function: reduce
//template <typename I, typename T, typename C>
//void syclFlow::reduce(syclTask task, I first, I last, T* res, C&& op) {
// //on(task, _reduce_cgh<I, T, C, false>(
// // first, last, res, std::forward<C>(op)
// //));
//
// auto bufsz = sycl_reduce_buffer_size<syclDefaultExecutionPolicy, T>(
// std::distance(first, last)
// );
//
// on(task, [=, buf=MoC{syclScopedDeviceMemory<std::byte>(bufsz, _queue)}]
// (sycl::queue& queue, std::vector<sycl::event> events) mutable {
// syclDefaultExecutionPolicy p(queue);
// return sycl_reduce_async(
// p, first, last, res, op, buf.get().data(), std::move(events)
// );
// });
//}
//
//// Function: uninitialized_reduce
//template <typename I, typename T, typename C>
//void syclFlow::uninitialized_reduce(
// syclTask task, I first, I last, T* res, C&& op
//) {
// //on(task, _reduce_cgh<I, T, C, true>(
// // first, last, res, std::forward<C>(op)
// //));
// auto bufsz = sycl_reduce_buffer_size<syclDefaultExecutionPolicy, T>(
// std::distance(first, last)
// );
//
// on(task, [=, buf=MoC{syclScopedDeviceMemory<std::byte>(bufsz, _queue)}]
// (sycl::queue& queue, std::vector<sycl::event> events) mutable {
// syclDefaultExecutionPolicy p(queue);
// return sycl_uninitialized_reduce_async(
// p, first, last, res, op, buf.get().data(), std::move(events)
// );
// });
//}
} // end of namespace tf -----------------------------------------------------