mesytec-mnode/external/taskflow-3.8.0/taskflow/cuda/algorithm/find.hpp

295 lines
7.4 KiB
C++
Raw Normal View History

2025-01-04 01:25:05 +01:00
#pragma once
#include "for_each.hpp"
#include "reduce.hpp"
/**
@file taskflow/cuda/algorithm/find.hpp
@brief cuda find algorithms include file
*/
namespace tf::detail {
/**
@private
@brief pairs an element value with the position it was read from

The min/max element loops in this file build one pair per input element
(<tt>T{*(input+i), i}</tt>) and compare pairs through their @c key.
The member order matters because the pairs are brace-initialized as
<tt>{key, index}</tt>.
*/
template <typename T>
struct cudaFindPair {
// value of the element; this is what the comparator is applied to
T key;
// position of that element within the searched range
unsigned index;
// device-only implicit conversion that drops the key and yields the index;
// presumably this is what lets a reduced pair be stored through an
// `unsigned*` output - confirm against the reduce implementation
__device__ operator unsigned () const { return index; }
};
/**
@private
@brief device-side worker behind tf::cuda_find_if

Stores in @c *idx the smallest position in <tt>[0, count)</tt> whose element
satisfies @c pred. When no element qualifies, @c count is stored, which is
@c 0 for an empty range. The seeding task and the search kernel are issued
through the execution policy @c p; the kernel runs on <tt>p.stream()</tt>.

@param p execution policy (its decayed type provides @c nt, @c vt and @c nv)
@param input iterator to the first element
@param count number of elements to inspect
@param idx output location, written from device code
@param pred unary predicate evaluated on device
*/
template <typename P, typename I, typename U>
void cuda_find_if_loop(P&& p, I input, unsigned count, unsigned* idx, U pred) {

  // Seed the answer with the "not found" value. For an empty range that
  // value is 0 and no search kernel is needed.
  cuda_single_task(p, [=] __device__ () { *idx = count; });

  if(count == 0) {
    return;
  }

  using policy_t = std::decay_t<P>;

  // one block per tile of policy_t::nv elements, rounded up
  const auto num_blocks = (count + policy_t::nv - 1) / policy_t::nv;

  cuda_kernel<<<num_blocks, policy_t::nt, 0, p.stream()>>>(
    [=] __device__ (auto tid, auto bid) {

      // block-wide running minimum; thread 0 seeds it with "not found"
      __shared__ unsigned block_min;

      if(tid == 0) {
        block_min = count;
      }
      __syncthreads();

      // fetch this thread's share of the block's tile
      auto tile = cuda_get_tile(bid, policy_t::nv, count);

      auto regs = cuda_mem_to_reg_strided<policy_t::nt, policy_t::vt>(
        input + tile.begin, tid, tile.count()
      );

      // regs[k] corresponds to tile offset nt*k + tid, so offsets grow with k
      // and the first match a thread sees is also its smallest one
      auto found = count;

      for(unsigned k = 0; k < policy_t::vt; ++k) {
        auto offset = policy_t::nt*k + tid;
        if(offset < tile.count() && pred(regs[k])) {
          found = offset + tile.begin;
          break;
        }
      }

      // Original author's note: replacing the shared-memory atomic below with
      // a cudaBlockReduce<nt, unsigned> using cuda_minimum<unsigned> was not
      // faster, so the atomic version is kept.

      // every thread folds its candidate into the block minimum ...
      atomicMin(&block_min, found);
      __syncthreads();

      // ... and one thread per block publishes it to the global result
      if(tid == 0) {
        atomicMin(idx, block_min);
      }
    }
  );
}
/**
@private
@brief device-side worker behind tf::cuda_min_element

Reduces the range to the <tt>{key, index}</tt> pair of its minimum under
@c op and stores the index of that pair in @c *idx. For an empty range
@c 0 is stored.

When several elements are equivalent under @c op (neither compares less than
the other), the pair with the lowest index wins. This matches the sequential
loop documented for tf::cuda_min_element and makes the result independent of
the order in which the reduction combines its operands.

@param p execution policy
@param input iterator to the first element
@param count number of elements
@param idx output location, written from device code
@param op binary comparator returning @c true when its first argument
          is less than its second
@param ptr scratch buffer handed to the reduction
*/
template <typename P, typename I, typename O>
void cuda_min_element_loop(
  P&& p, I input, unsigned count, unsigned* idx, O op, void* ptr
) {

  // an empty range has no minimum; index 0 is reported instead
  if(count == 0) {
    cuda_single_task(p, [=] __device__ () { *idx = 0; });
    return;
  }

  using T = cudaFindPair<typename std::iterator_traits<I>::value_type>;

  cuda_uninitialized_reduce_loop(p,
    // view the input as {value, position} pairs
    cuda_make_load_iterator<T>([=]__device__(auto i){
      return T{*(input+i), i};
    }),
    count,
    idx,
    [=] __device__ (const auto& a, const auto& b) {
      // strictly smaller key wins
      if(op(a.key, b.key)) {
        return a;
      }
      if(op(b.key, a.key)) {
        return b;
      }
      // equivalent keys: keep the earlier element so ties resolve to the
      // first minimum regardless of operand order
      return a.index < b.index ? a : b;
    },
    ptr
  );
}
/**
@private
@brief device-side worker behind tf::cuda_max_element

Reduces the range to the <tt>{key, index}</tt> pair of its maximum under
@c op and stores the index of that pair in @c *idx. For an empty range
@c 0 is stored.

When several elements are equivalent under @c op (neither compares less than
the other), the pair with the lowest index wins. This matches the sequential
loop documented for tf::cuda_max_element and makes the result independent of
the order in which the reduction combines its operands.

@param p execution policy
@param input iterator to the first element
@param count number of elements
@param idx output location, written from device code
@param op binary comparator returning @c true when its first argument
          is less than its second
@param ptr scratch buffer handed to the reduction
*/
template <typename P, typename I, typename O>
void cuda_max_element_loop(
  P&& p, I input, unsigned count, unsigned* idx, O op, void* ptr
) {

  // an empty range has no maximum; index 0 is reported instead
  if(count == 0) {
    cuda_single_task(p, [=] __device__ () { *idx = 0; });
    return;
  }

  using T = cudaFindPair<typename std::iterator_traits<I>::value_type>;

  cuda_uninitialized_reduce_loop(p,
    // view the input as {value, position} pairs
    cuda_make_load_iterator<T>([=]__device__(auto i){
      return T{*(input+i), i};
    }),
    count,
    idx,
    [=] __device__ (const auto& a, const auto& b) {
      // strictly larger key wins
      if(op(a.key, b.key)) {
        return b;
      }
      if(op(b.key, a.key)) {
        return a;
      }
      // equivalent keys: keep the earlier element so ties resolve to the
      // first maximum regardless of operand order
      return a.index < b.index ? a : b;
    },
    ptr
  );
}
} // end of namespace tf::detail ---------------------------------------------
namespace tf {
// ----------------------------------------------------------------------------
// cuda_find_if
// ----------------------------------------------------------------------------
/**
@brief finds the index of the first element that satisfies the given criteria

@tparam P execution policy type
@tparam I input iterator type
@tparam U unary operator type

@param p execution policy
@param first iterator to the beginning of the range
@param last iterator to the end of the range
@param idx pointer to the location that receives the index of the found
           element; it is written from device code
@param op unary operator which returns @c true for the required element

The function launches kernels asynchronously to find the index @c idx of the
first element in the range <tt>[first, last)</tt>
such that <tt>op(*(first+idx))</tt> is true.
If no element satisfies @c op, the length of the range is stored instead.

This is equivalent to the parallel execution of the following loop:

@code{.cpp}
unsigned idx = 0;
for(; first != last; ++first, ++idx) {
  if (op(*first)) {
    return idx;
  }
}
return idx;
@endcode
*/
template <typename P, typename I, typename U>
void cuda_find_if(
  P&& p, I first, I last, unsigned* idx, U op
) {
  // length of the range; converted to `unsigned` by the callee's parameter
  const auto length = std::distance(first, last);
  detail::cuda_find_if_loop(p, first, length, idx, op);
}
// ----------------------------------------------------------------------------
// cuda_min_element
// ----------------------------------------------------------------------------
// Function: min_element_bufsz
/**
@private
@brief buffer size query for tf::cuda_min_element

Forwards to @c reduce_bufsz for @c count items of type
<tt>detail::cudaFindPair<T></tt>, the pair type the min-element reduction
operates on.
*/
template <unsigned NT, unsigned VT>
template <typename T>
unsigned cudaExecutionPolicy<NT, VT>::min_element_bufsz(unsigned count) {
  using pair_type = detail::cudaFindPair<T>;
  return reduce_bufsz<pair_type>(count);
}
/**
@brief finds the index of the minimum element in a range

@tparam P execution policy type
@tparam I input iterator type
@tparam O comparator type

@param p execution policy object
@param first iterator to the beginning of the range
@param last iterator to the end of the range
@param idx pointer to the location that receives the index of the minimum
           element; it is written from device code
@param op comparison function object
@param buf pointer to the buffer

The function launches kernels asynchronously to find
the smallest element in the range <tt>[first, last)</tt>
using the given comparator @c op.
You need to provide a buffer that holds at least
<tt>cudaExecutionPolicy::min_element_bufsz<T>(count)</tt> bytes for internal
use, where @c T is the element type and @c count the length of the range.
For an empty range the index @c 0 is stored.

The function is equivalent to a parallel execution of the following loop;
which index is reported among several equally small elements is decided by
<tt>detail::cuda_min_element_loop</tt>:

@code{.cpp}
if(first == last) {
  return 0;
}
auto smallest = first;
for (auto it = std::next(first); it != last; ++it) {
  if (op(*it, *smallest)) {
    smallest = it;
  }
}
return std::distance(first, smallest);
@endcode
*/
template <typename P, typename I, typename O>
void cuda_min_element(P&& p, I first, I last, unsigned* idx, O op, void* buf) {
  // length of the range; converted to `unsigned` by the callee's parameter
  const auto length = std::distance(first, last);
  detail::cuda_min_element_loop(p, first, length, idx, op, buf);
}
// ----------------------------------------------------------------------------
// cuda_max_element
// ----------------------------------------------------------------------------
// Function: max_element_bufsz
/**
@private
@brief buffer size query for tf::cuda_max_element

Forwards to @c reduce_bufsz for @c count items of type
<tt>detail::cudaFindPair<T></tt>, the pair type the max-element reduction
operates on.
*/
template <unsigned NT, unsigned VT>
template <typename T>
unsigned cudaExecutionPolicy<NT, VT>::max_element_bufsz(unsigned count) {
  using pair_type = detail::cudaFindPair<T>;
  return reduce_bufsz<pair_type>(count);
}
/**
@brief finds the index of the maximum element in a range

@tparam P execution policy type
@tparam I input iterator type
@tparam O comparator type

@param p execution policy object
@param first iterator to the beginning of the range
@param last iterator to the end of the range
@param idx pointer to the location that receives the index of the maximum
           element; it is written from device code
@param op comparison function object
@param buf pointer to the buffer

The function launches kernels asynchronously to find
the largest element in the range <tt>[first, last)</tt>
using the given comparator @c op.
You need to provide a buffer that holds at least
<tt>cudaExecutionPolicy::max_element_bufsz<T>(count)</tt> bytes for internal
use, where @c T is the element type and @c count the length of the range.
For an empty range the index @c 0 is stored.

The function is equivalent to a parallel execution of the following loop;
which index is reported among several equally large elements is decided by
<tt>detail::cuda_max_element_loop</tt>:

@code{.cpp}
if(first == last) {
  return 0;
}
auto largest = first;
for (auto it = std::next(first); it != last; ++it) {
  if (op(*largest, *it)) {
    largest = it;
  }
}
return std::distance(first, largest);
@endcode
*/
template <typename P, typename I, typename O>
void cuda_max_element(P&& p, I first, I last, unsigned* idx, O op, void* buf) {
  // length of the range; converted to `unsigned` by the callee's parameter
  const auto length = std::distance(first, last);
  detail::cuda_max_element_loop(p, first, length, idx, op, buf);
}
} // end of namespace tf -----------------------------------------------------