#pragma once

#include "../cudaflow.hpp"

/**
@file taskflow/cuda/algorithm/reduce.hpp
@brief cuda reduce algorithms include file
*/

namespace tf::detail {

// ----------------------------------------------------------------------------
// reduction helper functions
// ----------------------------------------------------------------------------

/** @private */
template <unsigned nt, typename T>
struct cudaBlockReduce {

  static const unsigned group_size = std::min(nt, CUDA_WARP_SIZE);
  static const unsigned num_passes = log2(group_size);
  static const unsigned num_items  = nt / group_size;

  static_assert(
    nt && (0 == nt % CUDA_WARP_SIZE),
    "cudaBlockReduce requires num threads to be a multiple of warp_size (32)"
  );

  /** @private */
  struct Storage {
    T data[std::max(nt, 2 * group_size)];
  };

  template <typename op_t>
  __device__ T operator()(unsigned, T, Storage&, unsigned, op_t, bool = true) const;
};

// function: reduce to be called from a block
template <unsigned nt, typename T>
template <typename op_t>
__device__ T cudaBlockReduce<nt, T>::operator ()(
  unsigned tid, T x, Storage& storage, unsigned count, op_t op, bool ret
) const {

  // store the per-thread partial into shared memory
  storage.data[tid] = x;
  __syncthreads();

  // each thread in the first group serially folds its strided lane
  if(tid < group_size) {
    cuda_strided_iterate<group_size, num_items>([&](auto i, auto j) {
      if(i > 0) {
        x = op(x, storage.data[j]);
      }
    }, tid, count);
    storage.data[tid] = x;
  }
  __syncthreads();

  auto count2 = count < group_size ? count : group_size;
  auto first  = (1 & num_passes) ? group_size : 0;
  if(tid < group_size) {
    storage.data[first + tid] = x;
  }
  __syncthreads();

  // combine the remaining partials pairwise over log2(group_size) passes,
  // ping-ponging between the two halves of shared memory
  cuda_iterate<num_passes>([&](auto pass) {
    if(tid < group_size) {
      if(auto offset = 1 << pass; tid + offset < count2) {
        x = op(x, storage.data[first + offset + tid]);
      }
      first = group_size - first;
      storage.data[first + tid] = x;
    }
    __syncthreads();
  });

  // optionally broadcast the reduced value to all threads
  if(ret) {
    x = storage.data[0];
    __syncthreads();
  }
  return x;
}
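// A worked example of the compile-time shape above (illustrative numbers, not
// a library constraint): with nt = 128 and CUDA_WARP_SIZE = 32, we get
// group_size = 32, num_items = 128/32 = 4, and num_passes = log2(32) = 5.
// All 128 per-thread partials land in shared memory; each of the 32 low-id
// threads serially folds its 4 strided entries (tid, tid+32, tid+64, tid+96),
// and the surviving 32 partials are combined pairwise over 5 ping-pong
// passes, leaving the block total in storage.data[0].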

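// Tile decomposition sketch (illustrative numbers, not part of the library):
// a policy with nt = 128 threads and vt = 4 values per thread gives each
// block a tile of nt*vt = 512 items. For count = 1300, the kernel below runs
// with three blocks over the tiles [0, 512), [512, 1024), and the partial
// tile [1024, 1300). Within its tile, thread t loads items t, t+128, t+256,
// t+384 (strided and coalesced), folds them in registers, and contributes a
// single scalar to cudaBlockReduce.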
// ----------------------------------------------------------------------------
// cuda_reduce
// ----------------------------------------------------------------------------

/** @private */
template <unsigned nt, unsigned vt, typename I, typename T, typename O>
__global__ void cuda_reduce_kernel(
  I input, unsigned count, T* res, O op, void* ptr
) {

  using U = typename std::iterator_traits<I>::value_type;

  __shared__ typename cudaBlockReduce<nt, U>::Storage shm;

  auto tid  = threadIdx.x;
  auto bid  = blockIdx.x;
  auto tile = cuda_get_tile(bid, nt*vt, count);
  auto x    = cuda_mem_to_reg_strided<nt, vt>(
    input + tile.begin, tid, tile.count()
  );

  // reduce multiple values per thread into a scalar
  U s;
  cuda_strided_iterate<nt, vt>(
    [&] (auto i, auto) { s = i ? op(s, x[i]) : x[0]; }, tid, tile.count()
  );

  // reduce to a scalar per block
  s = cudaBlockReduce<nt, U>()(
    tid, s, shm, (tile.count() < nt ? tile.count() : nt), op, false
  );

  // the last level accumulates into *res; earlier levels spill one partial
  // per block into the scratch buffer
  if(!tid) {
    auto buf = static_cast<U*>(ptr);
    (count <= nt*vt) ? *res = op(*res, s) : buf[bid] = s;
  }
}

/** @private */
template <typename P, typename I, typename T, typename O>
void cuda_reduce_loop(
  P&& p, I input, unsigned count, T* res, O op, void* ptr
) {

  using U = typename std::iterator_traits<I>::value_type;
  using E = std::decay_t<P>;

  auto buf = static_cast<U*>(ptr);
  auto B   = E::num_blocks(count);

  cuda_reduce_kernel<E::nt, E::vt><<<B, E::nt, 0, p.stream()>>>(
    input, count, res, op, ptr
  );

  // recurse on the per-block partials until a single block remains
  if(B > 1) {
    cuda_reduce_loop(p, buf, B, res, op, buf+B);
  }
}

// ----------------------------------------------------------------------------
// cuda_uninitialized_reduce
// ----------------------------------------------------------------------------

/** @private */
template <unsigned nt, unsigned vt, typename I, typename T, typename O>
__global__ void cuda_uninitialized_reduce_kernel(
  I input, unsigned count, T* res, O op, void* ptr
) {

  using U = typename std::iterator_traits<I>::value_type;

  __shared__ typename cudaBlockReduce<nt, U>::Storage shm;

  auto tid  = threadIdx.x;
  auto bid  = blockIdx.x;
  auto tile = cuda_get_tile(bid, nt*vt, count);
  auto x    = cuda_mem_to_reg_strided<nt, vt>(
    input + tile.begin, tid, tile.count()
  );

  // reduce multiple values per thread into a scalar
  U s;
  cuda_strided_iterate<nt, vt>(
    [&] (auto i, auto) { s = i ? op(s, x[i]) : x[0]; }, tid, tile.count()
  );

  // reduce to a scalar per block
  s = cudaBlockReduce<nt, U>()(
    tid, s, shm, (tile.count() < nt ? tile.count() : nt), op, false
  );

  // here, the last level assigns (rather than accumulates) the result
  if(!tid) {
    auto buf = static_cast<U*>(ptr);
    (count <= nt*vt) ? *res = s : buf[bid] = s;
  }
}

/** @private */
template <typename P, typename I, typename T, typename O>
void cuda_uninitialized_reduce_loop(
  P&& p, I input, unsigned count, T* res, O op, void* ptr
) {

  using U = typename std::iterator_traits<I>::value_type;
  using E = std::decay_t<P>;

  auto buf = static_cast<U*>(ptr);
  auto B   = (count + E::nv - 1) / E::nv;

  cuda_uninitialized_reduce_kernel<E::nt, E::vt><<<B, E::nt, 0, p.stream()>>>(
    input, count, res, op, buf
  );

  // recurse on the per-block partials until a single block remains
  if(B > 1) {
    cuda_uninitialized_reduce_loop(p, buf, B, res, op, buf+B);
  }
}

}  // namespace tf::detail ----------------------------------------------------
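// Launch-cascade walkthrough (illustrative numbers): with nv = nt*vt = 512
// and count = 1'000'000, the loops above first launch ceil(1'000'000/512) =
// 1954 blocks, spilling 1954 partials into the scratch buffer; the second
// launch reduces those into 4 partials written past the first level (buf+B);
// the third launch runs a single block (count <= nv) and folds the last 4
// partials into *res.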

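// Scratch-buffer sizing for the walkthrough above: reduce_bufsz<T> below sums
// the block counts of every level that still needs another pass, i.e.,
// n = 1954 + 4 = 1958 elements, and returns 1958*sizeof(T) bytes. The final
// single-block level writes directly to *res and needs no scratch space.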
namespace tf {

// Function: reduce_bufsz
template <unsigned NT, unsigned VT>
template <typename T>
unsigned cudaExecutionPolicy<NT, VT>::reduce_bufsz(unsigned count) {
  unsigned B = num_blocks(count);
  unsigned n = 0;
  while(B > 1) {
    n += B;
    B = num_blocks(B);
  }
  return n*sizeof(T);
}

// ----------------------------------------------------------------------------
// cuda_reduce
// ----------------------------------------------------------------------------

/**
@brief performs asynchronous parallel reduction over a range of items

@tparam P execution policy type
@tparam I input iterator type
@tparam T value type
@tparam O binary operator type

@param p execution policy
@param first iterator to the beginning of the range
@param last iterator to the end of the range
@param res pointer to the result
@param op binary operator to apply to reduce elements
@param buf pointer to the temporary buffer

This method is equivalent to the parallel execution of the following loop
on a GPU:

@code{.cpp}
while (first != last) {
  *result = op(*result, *first++);
}
@endcode
*/
template <typename P, typename I, typename T, typename O>
void cuda_reduce(
  P&& p, I first, I last, T* res, O op, void* buf
) {
  unsigned count = std::distance(first, last);
  if(count == 0) {
    return;
  }
  detail::cuda_reduce_loop(p, first, count, res, op, buf);
}

// ----------------------------------------------------------------------------
// cuda_uninitialized_reduce
// ----------------------------------------------------------------------------

/**
@brief performs asynchronous parallel reduction over a range of items
       without an initial value

@tparam P execution policy type
@tparam I input iterator type
@tparam T value type
@tparam O binary operator type

@param p execution policy
@param first iterator to the beginning of the range
@param last iterator to the end of the range
@param res pointer to the result
@param op binary operator to apply to reduce elements
@param buf pointer to the temporary buffer

This method is equivalent to the parallel execution of the following loop
on a GPU:

@code{.cpp}
*result = *first++;  // no initial value participates in the loop
while (first != last) {
  *result = op(*result, *first++);
}
@endcode
*/
template <typename P, typename I, typename T, typename O>
void cuda_uninitialized_reduce(
  P&& p, I first, I last, T* res, O op, void* buf
) {
  unsigned count = std::distance(first, last);
  if(count == 0) {
    return;
  }
  detail::cuda_uninitialized_reduce_loop(p, first, count, res, op, buf);
}
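// Usage sketch (illustrative only; the stream handling and the names `data`,
// `res`, and `N` are assumptions for this example, not library symbols):
// sum N device floats into *res with cuda_reduce.
//
//   cudaStream_t stream;
//   cudaStreamCreate(&stream);
//   tf::cudaDefaultExecutionPolicy policy(stream);
//
//   float* data;   // N floats on the device, filled elsewhere
//   float* res;    // one device float; cuda_reduce accumulates into it,
//                  // so initialize it (e.g., to 0) beforehand
//
//   // query and allocate the scratch buffer required by the cascade
//   void* buf;
//   cudaMalloc(&buf, policy.reduce_bufsz<float>(N));
//
//   tf::cuda_reduce(
//     policy, data, data + N, res,
//     [] __device__ (float a, float b) { return a + b; },
//     buf
//   );
//
//   cudaStreamSynchronize(stream);
//   cudaFree(buf);
//
// The transform variants below follow the same pattern, with a unary operator
// fused into each load.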
// ----------------------------------------------------------------------------
// transform_reduce
// ----------------------------------------------------------------------------

/**
@brief performs asynchronous parallel reduction over a range of transformed
       items

@tparam P execution policy type
@tparam I input iterator type
@tparam T value type
@tparam O binary operator type
@tparam U unary operator type

@param p execution policy
@param first iterator to the beginning of the range
@param last iterator to the end of the range
@param res pointer to the result
@param bop binary operator to apply to reduce elements
@param uop unary operator to apply to transform elements
@param buf pointer to the temporary buffer

This method is equivalent to the parallel execution of the following loop
on a GPU:

@code{.cpp}
while (first != last) {
  *result = bop(*result, uop(*first++));
}
@endcode
*/
template <typename P, typename I, typename T, typename O, typename U>
void cuda_transform_reduce(
  P&& p, I first, I last, T* res, O bop, U uop, void* buf
) {

  unsigned count = std::distance(first, last);

  if(count == 0) {
    return;
  }

  // reduction loop
  detail::cuda_reduce_loop(p,
    cuda_make_load_iterator<T>([=]__device__(auto i){ return uop(*(first+i)); }),
    count, res, bop, buf
  );
}

// ----------------------------------------------------------------------------
// uninitialized_transform_reduce
// ----------------------------------------------------------------------------

/**
@brief performs asynchronous parallel reduction over a range of transformed
       items without an initial value

@tparam P execution policy type
@tparam I input iterator type
@tparam T value type
@tparam O binary operator type
@tparam U unary operator type

@param p execution policy
@param first iterator to the beginning of the range
@param last iterator to the end of the range
@param res pointer to the result
@param bop binary operator to apply to reduce elements
@param uop unary operator to apply to transform elements
@param buf pointer to the temporary buffer

This method is equivalent to the parallel execution of the following loop
on a GPU:

@code{.cpp}
*result = uop(*first++);  // no initial value participates in the loop
while (first != last) {
  *result = bop(*result, uop(*first++));
}
@endcode
*/
template <typename P, typename I, typename T, typename O, typename U>
void cuda_uninitialized_transform_reduce(
  P&& p, I first, I last, T* res, O bop, U uop, void* buf
) {

  unsigned count = std::distance(first, last);

  if(count == 0) {
    return;
  }

  detail::cuda_uninitialized_reduce_loop(p,
    cuda_make_load_iterator<T>([=]__device__(auto i){ return uop(*(first+i)); }),
    count, res, bop, buf
  );
}

// ----------------------------------------------------------------------------
// previous single-kernel implementation, kept for reference
// ----------------------------------------------------------------------------

//template <typename T, typename C>
//__device__ void cuda_warp_reduce(
//  volatile T* shm, size_t N, size_t tid, C op
//) {
//  if(tid + 32 < N) shm[tid] = op(shm[tid], shm[tid+32]);
//  if(tid + 16 < N) shm[tid] = op(shm[tid], shm[tid+16]);
//  if(tid +  8 < N) shm[tid] = op(shm[tid], shm[tid+8]);
//  if(tid +  4 < N) shm[tid] = op(shm[tid], shm[tid+4]);
//  if(tid +  2 < N) shm[tid] = op(shm[tid], shm[tid+2]);
//  if(tid +  1 < N) shm[tid] = op(shm[tid], shm[tid+1]);
//}
//
//template <typename I, typename T, typename C, bool uninitialized>
//__global__ void cuda_reduce(I first, size_t N, T* res, C op) {
//
//  size_t tid = threadIdx.x;
//
//  if(tid >= N) {
//    return;
//  }
//
//  cudaSharedMemory<T> shared_memory;
//  T* shm = shared_memory.get();
//
//  shm[tid] = *(first+tid);
//
//  for(size_t i=tid+blockDim.x; i<N; i+=blockDim.x) {
//    shm[tid] = op(shm[tid], *(first+i));
//  }
//  __syncthreads();
//
//  for(size_t s = blockDim.x / 2; s > 32; s >>= 1) {
//    if(tid < s && tid + s < N) {
//      shm[tid] = op(shm[tid], shm[tid+s]);
//    }
//    __syncthreads();
//  }
//
//  if(tid < 32) {
//    cuda_warp_reduce(shm, N, tid, op);
//  }
//
//  if(tid == 0) {
//    if constexpr (uninitialized) {
//      *res = shm[0];
//    }
//    else {
//      *res = op(*res, shm[0]);
//    }
//  }
//}

}  // end of namespace tf -----------------------------------------------------