#pragma once

#include "for_each.hpp"
#include "reduce.hpp"

/**
@file taskflow/cuda/algorithm/find.hpp
@brief cuda find algorithms include file
*/

namespace tf::detail {

/**
@private

@brief key/index pair used by the min/max-element reductions

The pair converts implicitly to its @c index on the device, so a reduction
over pairs can be stored straight into an <tt>unsigned*</tt> result.
*/
template <typename T>
struct cudaFindPair {

  T key;
  unsigned index;

  __device__ operator unsigned () const { return index; }
};

/**
@private

@brief writes to @c idx the smallest index whose element satisfies @c pred

@param p execution policy
@param input iterator to the beginning of the range
@param count number of elements in the range
@param idx pointer to the result index
@param pred unary predicate

If @c count is zero, <tt>*idx</tt> is set to 0. Otherwise <tt>*idx</tt> is
first set to @c count and then lowered with @c atomicMin to the smallest
matching index; it therefore stays at @c count when nothing matches.
*/
template <typename P, typename I, typename U>
void cuda_find_if_loop(P&& p, I input, unsigned count, unsigned* idx, U pred) {

  if(count == 0) {
    cuda_single_task(p, [=] __device__ () { *idx = 0; });
    return;
  }

  using E = std::decay_t<P>;

  // number of tiles (one block per tile of E::nv elements)
  auto B = (count + E::nv - 1) / E::nv;

  // set the index to the maximum
  cuda_single_task(p, [=] __device__ () { *idx = count; });

  // launch the kernel to atomic-find the minimum
  // NOTE(review): launch configuration reconstructed as B blocks of E::nt
  // threads on the policy's stream - confirm against the other algorithms.
  cuda_kernel<<<B, E::nt, 0, p.stream()>>>([=] __device__ (auto tid, auto bid) {

    // per-block minimum of the matching indices
    __shared__ unsigned shm_id;

    if(!tid) {
      shm_id = count;
    }

    __syncthreads();

    auto tile = cuda_get_tile(bid, E::nv, count);

    // NOTE(review): template arguments reconstructed - confirm.
    auto x = cuda_mem_to_reg_strided<E::nt, E::vt>(
      input + tile.begin, tid, tile.count()
    );

    auto id = count;

    // NOTE(review): loop body reconstructed; assumes register i of thread tid
    // holds tile element E::nt*i + tid - confirm against
    // cuda_mem_to_reg_strided.
    for(unsigned i=0; i<E::vt; i++) {
      auto j = E::nt*i + tid;
      if(j < tile.count() && pred(x[i])) {
        // j grows with i, so the first hit is this thread's smallest index
        id = j + tile.begin;
        break;
      }
    }

    // A cudaBlockReduce/cuda_minimum based per-block reduction was left
    // disabled here in favor of the shared-memory atomic below.

    // only need the minimum id; a thread without a match still holds count,
    // which can never lower shm_id, so it skips the atomic
    if(id < count) {
      atomicMin(&shm_id, id);
    }
    __syncthreads();

    // reduce all to the global memory; a block without a match has nothing
    // to contribute because *idx already starts at count
    if(!tid && shm_id < count) {
      atomicMin(idx, shm_id);
    }
  });
}

/**
@private

@brief reduces the range to the index of its minimum element

@param p execution policy
@param input iterator to the beginning of the range
@param count number of elements in the range
@param idx pointer to the result index
@param op comparison function object
@param ptr pointer to the reduction buffer

If @c count is zero, <tt>*idx</tt> is set to 0. Otherwise every element is
paired with its index and the pairs are reduced with a combiner that keeps
@c a when <tt>op(a.key, b.key)</tt> holds and @c b otherwise.
*/
template <typename P, typename I, typename O>
void cuda_min_element_loop(
  P&& p, I input, unsigned count, unsigned* idx, O op, void* ptr
) {

  if(count == 0) {
    cuda_single_task(p, [=] __device__ () { *idx = 0; });
    return;
  }

  using T = cudaFindPair<typename std::iterator_traits<I>::value_type>;

  cuda_uninitialized_reduce_loop(p,
    cuda_make_load_iterator<T>([=]__device__(auto i){
      return T{*(input+i), i};
    }),
    count,
    idx,
    [=] __device__ (const auto& a, const auto& b) {
      return op(a.key, b.key) ? a : b;
    },
    ptr
  );
}

/**
@private

@brief reduces the range to the index of its maximum element

@param p execution policy
@param input iterator to the beginning of the range
@param count number of elements in the range
@param idx pointer to the result index
@param op comparison function object
@param ptr pointer to the reduction buffer

If @c count is zero, <tt>*idx</tt> is set to 0. Otherwise every element is
paired with its index and the pairs are reduced with a combiner that keeps
@c b when <tt>op(a.key, b.key)</tt> holds and @c a otherwise.
*/
template <typename P, typename I, typename O>
void cuda_max_element_loop(
  P&& p, I input, unsigned count, unsigned* idx, O op, void* ptr
) {

  if(count == 0) {
    cuda_single_task(p, [=] __device__ () { *idx = 0; });
    return;
  }

  using T = cudaFindPair<typename std::iterator_traits<I>::value_type>;

  cuda_uninitialized_reduce_loop(p,
    cuda_make_load_iterator<T>([=]__device__(auto i){
      return T{*(input+i), i};
    }),
    count,
    idx,
    [=] __device__ (const auto& a, const auto& b) {
      return op(a.key, b.key) ? b : a;
    },
    ptr
  );
}

}  // end of namespace tf::detail ---------------------------------------------

namespace tf {

// ----------------------------------------------------------------------------
// cuda_find_if
// ----------------------------------------------------------------------------

/**
@brief finds the index of the first element that satisfies the given criteria

@tparam P execution policy type
@tparam I input iterator type
@tparam U unary operator type

@param p execution policy
@param first iterator to the beginning of the range
@param last iterator to the end of the range
@param idx pointer to the index of the found element
@param op unary operator which returns @c true for the required element

The function launches kernels asynchronously to find the index @c idx of the
first element in the range <tt>[first, last)</tt>
such that <tt>op(*(first+idx))</tt> is true.
If no element satisfies @c op, @c idx is set to the size of the range.
This is equivalent to the parallel execution of the following loop:

@code{.cpp}
unsigned idx = 0;
for(; first != last; ++first, ++idx) {
  if (op(*first)) {
    return idx;
  }
}
return idx;
@endcode
*/
template <typename P, typename I, typename U>
void cuda_find_if(
  P&& p, I first, I last, unsigned* idx, U op
) {
  detail::cuda_find_if_loop(
    p, first, static_cast<unsigned>(std::distance(first, last)), idx, op
  );
}

// ----------------------------------------------------------------------------
// cuda_min_element
// ----------------------------------------------------------------------------

// Function: min_element_bufsz
// NOTE(review): the policy's template parameter list was reconstructed as
// <unsigned NT, unsigned VT> - confirm against the cudaExecutionPolicy
// declaration.
template <unsigned NT, unsigned VT>
template <typename T>
unsigned cudaExecutionPolicy<NT, VT>::min_element_bufsz(unsigned count) {
  return reduce_bufsz<detail::cudaFindPair<T>>(count);
}

/**
@brief finds the index of the minimum element in a range

@tparam P execution policy type
@tparam I input iterator type
@tparam O comparator type

@param p execution policy object
@param first iterator to the beginning of the range
@param last iterator to the end of the range
@param idx solution index of the minimum element
@param op comparison function object
@param buf pointer to the buffer

The function launches kernels asynchronously to find the smallest element
in the range <tt>[first, last)</tt> using the given comparator @c op.
You need to provide a buffer that holds at least
tf::cudaExecutionPolicy::min_element_bufsz bytes for internal use.
The function is equivalent to a parallel execution of the following loop:

@code{.cpp}
if(first == last) {
  return 0;
}
auto smallest = first;
for (auto itr = std::next(first); itr != last; ++itr) {
  if (op(*itr, *smallest)) {
    smallest = itr;
  }
}
return std::distance(first, smallest);
@endcode
*/
template <typename P, typename I, typename O>
void cuda_min_element(P&& p, I first, I last, unsigned* idx, O op, void* buf) {
  detail::cuda_min_element_loop(
    p, first, static_cast<unsigned>(std::distance(first, last)), idx, op, buf
  );
}

// ----------------------------------------------------------------------------
// cuda_max_element
// ----------------------------------------------------------------------------

// Function: max_element_bufsz
// NOTE(review): the policy's template parameter list was reconstructed as
// <unsigned NT, unsigned VT> - confirm against the cudaExecutionPolicy
// declaration.
template <unsigned NT, unsigned VT>
template <typename T>
unsigned cudaExecutionPolicy<NT, VT>::max_element_bufsz(unsigned count) {
  return reduce_bufsz<detail::cudaFindPair<T>>(count);
}

/**
@brief finds the index of the maximum element in a range

@tparam P execution policy type
@tparam I input iterator type
@tparam O comparator type

@param p execution policy object
@param first iterator to the beginning of the range
@param last iterator to the end of the range
@param idx solution index of the maximum element
@param op comparison function object
@param buf pointer to the buffer

The function launches kernels asynchronously to find the largest element
in the range <tt>[first, last)</tt> using the given comparator @c op.
You need to provide a buffer that holds at least
tf::cudaExecutionPolicy::max_element_bufsz bytes for internal use.
The function is equivalent to a parallel execution of the following loop:

@code{.cpp}
if(first == last) {
  return 0;
}
auto largest = first;
for (auto itr = std::next(first); itr != last; ++itr) {
  if (op(*largest, *itr)) {
    largest = itr;
  }
}
return std::distance(first, largest);
@endcode
*/
template <typename P, typename I, typename O>
void cuda_max_element(P&& p, I first, I last, unsigned* idx, O op, void* buf) {
  detail::cuda_max_element_loop(
    p, first, static_cast<unsigned>(std::distance(first, last)), idx, op, buf
  );
}

}  // end of namespace tf -----------------------------------------------------