#pragma once

#include "reduce.hpp"

/**
@file taskflow/cuda/algorithm/scan.hpp
@brief CUDA scan algorithm include file
*/

namespace tf::detail {

// ----------------------------------------------------------------------------
// scan
// ----------------------------------------------------------------------------

/** @private */
inline constexpr unsigned cudaScanRecursionThreshold = 8;

/** @private */
enum class cudaScanType : int {
  EXCLUSIVE = 1,
  INCLUSIVE
};

/** @private */
template <typename T, unsigned vt = 0, bool is_array = (vt > 0)>
struct cudaScanResult {
  T scan;
  T reduction;
};

/** @private */
template <typename T, unsigned vt>
struct cudaScanResult<T, vt, true> {
  cudaArray<T, vt> scan;
  T reduction;
};

//-----------------------------------------------------------------------------

/** @private */
template <unsigned nt, typename T>
struct cudaBlockScan {

  const static unsigned num_warps  = nt / CUDA_WARP_SIZE;
  const static unsigned num_passes = log2(nt);
  const static unsigned capacity   = nt + num_warps;

  /** @private */
  union storage_t {
    T data[2 * nt];
    struct { T threads[nt], warps[num_warps]; };
  };

  // standard scan
  template <typename op_t>
  __device__ cudaScanResult<T> operator ()(
    unsigned tid,
    T x,
    storage_t& storage,
    unsigned count = nt,
    op_t op = op_t(),
    T init = T(),
    cudaScanType type = cudaScanType::EXCLUSIVE
  ) const;

  // vectorized scan. accepts multiple values per thread and adds in
  // optional global carry-in
  template <unsigned vt, typename op_t>
  __device__ cudaScanResult<T, vt> operator ()(
    unsigned tid,
    cudaArray<T, vt> x,
    storage_t& storage,
    T carry_in = T(),
    bool use_carry_in = false,
    unsigned count = nt,
    op_t op = op_t(),
    T init = T(),
    cudaScanType type = cudaScanType::EXCLUSIVE
  ) const;
};

// standard scan
template <unsigned nt, typename T>
template <typename op_t>
__device__ cudaScanResult<T> cudaBlockScan<nt, T>::operator () (
  unsigned tid, T x, storage_t& storage, unsigned count, op_t op,
  T init, cudaScanType type
) const {

  unsigned first = 0;
  storage.data[first + tid] = x;
  __syncthreads();

  cuda_iterate<num_passes>([&](auto pass) {
    if(auto offset = 1 << pass; tid >= offset) {
      x = op(storage.data[first + tid - offset], x);
    }
    first = nt - first;
    storage.data[first + tid] = x;
    __syncthreads();
  });

  cudaScanResult<T> result;
  result.reduction = storage.data[first + count - 1];
  result.scan = (tid < count) ?
    (cudaScanType::INCLUSIVE == type ? x :
      (tid ? storage.data[first + tid - 1] : init)) :
    result.reduction;
  __syncthreads();

  return result;
}

// vectorized scan block
template <unsigned nt, typename T>
template <unsigned vt, typename op_t>
__device__ cudaScanResult<T, vt> cudaBlockScan<nt, T>::operator ()(
  unsigned tid,
  cudaArray<T, vt> x,
  storage_t& storage,
  T carry_in,
  bool use_carry_in,
  unsigned count,
  op_t op,
  T init,
  cudaScanType type
) const {

  // Start with an inclusive scan of the in-range elements.
  if(count >= nt * vt) {
    cuda_iterate<vt>([&](auto i) {
      x[i] = i ? op(x[i], x[i - 1]) : x[i];
    });
  } else {
    cuda_iterate<vt>([&](auto i) {
      auto index = vt * tid + i;
      x[i] = i ?
        ((index < count) ? op(x[i], x[i - 1]) : x[i - 1]) :
        (x[i] = (index < count) ? x[i] : init);
    });
  }

  // Scan the thread-local reductions for a carry-in for each thread.
  auto result = operator()(
    tid, x[vt - 1], storage,
    (count + vt - 1) / vt, op, init, cudaScanType::EXCLUSIVE
  );

  // Perform the scan downsweep and add both the global carry-in and the
  // thread carry-in to the values.
  if(use_carry_in) {
    result.reduction = op(carry_in, result.reduction);
    result.scan = tid ? op(carry_in, result.scan) : carry_in;
  } else {
    use_carry_in = tid > 0;
  }

  cudaArray<T, vt> y;
  cuda_iterate<vt>([&](auto i) {
    if(cudaScanType::EXCLUSIVE == type) {
      y[i] = i ? x[i - 1] : result.scan;
      if(use_carry_in && i > 0) y[i] = op(result.scan, y[i]);
    } else {
      y[i] = use_carry_in ? op(x[i], result.scan) : x[i];
    }
  });

  return cudaScanResult<T, vt> { y, result.reduction };
}
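// A minimal host-side reference of the semantics the block scan above
// implements (an illustrative sketch only; the vector names and the choice
// of op = plus with init = 0 are assumptions, not part of this header):
//
//   std::vector<int> in {3, 1, 4, 1, 5};
//   std::vector<int> ex(in.size()), inc(in.size());
//   int init = 0, acc = init;
//   for(size_t i = 0; i < in.size(); ++i) {
//     ex[i]  = acc;           // EXCLUSIVE: excludes in[i]
//     acc    = acc + in[i];   // op = plus
//     inc[i] = acc;           // INCLUSIVE: includes in[i]
//   }
//   // ex = {0, 3, 4, 8, 9}, inc = {3, 4, 8, 9, 14}, reduction = 14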
/**
@private
@brief single-pass scan for small input
*/
template <typename P, typename I, typename O, typename C>
void cuda_single_pass_scan(
  P&& p,
  cudaScanType scan_type,
  I input,
  unsigned count,
  O output,
  C op
  //reduction_it reduction,
) {

  using T = typename std::iterator_traits<O>::value_type;
  using E = std::decay_t<P>;

  // Small input specialization. This is the non-recursive branch.
  cuda_kernel<<<1, E::nt, 0, p.stream()>>>(
  [=] __device__ (auto tid, auto bid) {

    using scan_t = cudaBlockScan<E::nt, T>;

    __shared__ union {
      typename scan_t::storage_t scan;
      T values[E::nv];
    } shared;

    auto carry_in = T();
    for(unsigned cur = 0; cur < count; cur += E::nv) {
      // Cooperatively load values into register.
      auto count2 = min(count - cur, E::nv);

      auto x = cuda_mem_to_reg_thread<E::nt, E::vt>(
        input + cur, tid, count2, shared.values
      );

      auto result = scan_t()(
        tid, x, shared.scan, carry_in, cur > 0, count2, op, T(), scan_type
      );

      // Store the scanned values back to global memory.
      cuda_reg_to_mem_thread<E::nt, E::vt>(
        result.scan, tid, count2, output + cur, shared.values
      );

      // Roll the reduction into carry_in.
      carry_in = result.reduction;
    }

    // Store the carry-out to the reduction pointer. This may be a
    // discard_iterator_t if no reduction is wanted.
    //if(!tid) *reduction = carry_in;
  });
}
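// Serial sketch of the tile-chaining loop above (illustration only; NV
// stands for E::nv and scan_tile is a hypothetical helper, not part of this
// header): each tile is scanned with the running carry-in folded in, and the
// tile's reduction becomes the carry-in of the next tile.
//
//   T carry_in = T();
//   for(unsigned cur = 0; cur < count; cur += NV) {
//     unsigned n = min(count - cur, NV);
//     carry_in = scan_tile(input + cur, output + cur, n, carry_in);
//   }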

/**
@private
@brief main scan loop
*/
template <typename P, typename I, typename O, typename C>
void cuda_scan_loop(
  P&& p,
  cudaScanType scan_type,
  I input,
  unsigned count,
  O output,
  C op,
  //reduction_it reduction,
  void* ptr
) {

  using E = std::decay_t<P>;
  using T = typename std::iterator_traits<O>::value_type;

  T* buffer = static_cast<T*>(ptr);

  //launch_t::cta_dim(context).B(count);
  unsigned B = (count + E::nv - 1) / E::nv;

  if(B > cudaScanRecursionThreshold) {

    //cudaDeviceVector<T> partials(B);
    //auto buffer = partials.data();

    // upsweep phase
    cuda_kernel<<<B, E::nt, 0, p.stream()>>>(
    [=] __device__ (auto tid, auto bid) {

      __shared__ typename cudaBlockReduce<E::nt, T>::Storage shm;

      // Load the tile's data into register.
      auto tile = cuda_get_tile(bid, E::nv, count);
      auto x = cuda_mem_to_reg_strided<E::nt, E::vt>(
        input + tile.begin, tid, tile.count()
      );

      // Reduce the thread's values into a scalar.
      T scalar;
      cuda_strided_iterate<E::nt, E::vt>(
        [&] (auto i, auto j) { scalar = i ? op(scalar, x[i]) : x[0]; },
        tid, tile.count()
      );

      // Reduce across all threads.
      auto all_reduce = cudaBlockReduce<E::nt, T>()(
        tid, scalar, shm, tile.count(), op
      );

      // Store the final reduction to the partials.
      if(!tid) {
        buffer[bid] = all_reduce;
      }
    });

    // recursively call scan
    //cuda_scan_loop(p, cudaScanType::EXCLUSIVE, buffer, B, buffer, op, S);
    cuda_scan_loop(
      p, cudaScanType::EXCLUSIVE, buffer, B, buffer, op, buffer + B
    );

    // downsweep: perform an intra-tile scan and add the scan of the partials
    // as carry-in
    cuda_kernel<<<B, E::nt, 0, p.stream()>>>(
    [=] __device__ (auto tid, auto bid) {

      using scan_t = cudaBlockScan<E::nt, T>;

      __shared__ union {
        typename scan_t::storage_t scan;
        T values[E::nv];
      } shared;

      // Load a tile to register in thread order.
      auto tile = cuda_get_tile(bid, E::nv, count);
      auto x = cuda_mem_to_reg_thread<E::nt, E::vt>(
        input + tile.begin, tid, tile.count(), shared.values
      );

      // Scan the array with carry-in from the partials.
      auto y = scan_t()(
        tid, x, shared.scan, buffer[bid], bid > 0, tile.count(),
        op, T(), scan_type
      ).scan;

      // Store the scanned values to the output.
      cuda_reg_to_mem_thread<E::nt, E::vt>(
        y, tid, tile.count(), output + tile.begin, shared.values
      );
    });
  }
  // Small input specialization. This is the non-recursive branch.
  else {
    cuda_single_pass_scan(p, scan_type, input, count, output, op);
  }
}
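// Serial model of the recursion above (a sketch; reduce_tile and scan_tile
// are hypothetical helpers and NV stands for E::nv). The temporary buffer is
// consumed level by level: B partials at this level, then the next level
// starts at buffer + B, which is exactly what scan_bufsz below sums up.
//
//   void scan_model(const T* in, T* out, unsigned n, T* buf) {
//     unsigned B = (n + NV - 1) / NV;
//     if(B <= cudaScanRecursionThreshold) { /* single-pass scan */ return; }
//     for(unsigned b = 0; b < B; ++b) {      // upsweep: one partial per tile
//       buf[b] = reduce_tile(in, b);
//     }
//     scan_model(buf, buf, B, buf + B);      // exclusive-scan the partials
//     for(unsigned b = 0; b < B; ++b) {      // downsweep: carry-in per tile
//       scan_tile(in, out, b, /*carry_in=*/buf[b]);
//     }
//   }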

}  // namespace tf::detail ----------------------------------------------------

namespace tf {

// Function: scan_bufsz
template <unsigned NT, unsigned VT>
template <typename T>
unsigned cudaExecutionPolicy<NT, VT>::scan_bufsz(unsigned count) {
  unsigned B = num_blocks(count);
  unsigned n = 0;
  for(auto b = B; b > detail::cudaScanRecursionThreshold; b = num_blocks(b)) {
    n += b;
  }
  return n * sizeof(T);
}
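// Example of sizing and allocating the temporary buffer before a scan (an
// illustrative sketch; it assumes device pointers din/dout of N ints, a
// tf::cudaStream, and a tf::cudaDefaultExecutionPolicy constructed over it):
//
//   tf::cudaStream stream;
//   tf::cudaDefaultExecutionPolicy policy(stream);
//   auto bytes = policy.scan_bufsz<int>(N);
//   void* buf = nullptr;
//   cudaMalloc(&buf, bytes);
//   tf::cuda_inclusive_scan(
//     policy, din, din + N, dout,
//     [] __device__ (int a, int b) { return a + b; }, buf
//   );
//   stream.synchronize();  // the scan runs asynchronously on the stream
//   cudaFree(buf);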
/**
@brief performs asynchronous inclusive scan over a range of items

@tparam P execution policy type
@tparam I input iterator
@tparam O output iterator
@tparam C binary operator type

@param p execution policy
@param first iterator to the beginning of the input range
@param last iterator to the end of the input range
@param output iterator to the beginning of the output range
@param op binary operator to apply to scan
@param buf pointer to the temporary buffer
*/
template <typename P, typename I, typename O, typename C>
void cuda_inclusive_scan(
  P&& p, I first, I last, O output, C op, void* buf
) {

  unsigned count = std::distance(first, last);

  if(count == 0) {
    return;
  }

  // launch the scan loop
  detail::cuda_scan_loop(
    p, detail::cudaScanType::INCLUSIVE, first, count, output, op, buf
  );
}

/**
@brief performs asynchronous inclusive scan over a range of transformed items

@tparam P execution policy type
@tparam I input iterator
@tparam O output iterator
@tparam C binary operator type
@tparam U unary operator type

@param p execution policy
@param first iterator to the beginning of the input range
@param last iterator to the end of the input range
@param output iterator to the beginning of the output range
@param bop binary operator to apply to scan
@param uop unary operator to apply to transform each item before scan
@param buf pointer to the temporary buffer
*/
template <typename P, typename I, typename O, typename C, typename U>
void cuda_transform_inclusive_scan(
  P&& p, I first, I last, O output, C bop, U uop, void* buf
) {

  using T = typename std::iterator_traits<O>::value_type;

  unsigned count = std::distance(first, last);

  if(count == 0) {
    return;
  }

  // launch the scan loop
  detail::cuda_scan_loop(
    p, detail::cudaScanType::INCLUSIVE,
    cuda_make_load_iterator<T>([=] __device__ (auto i) { return uop(*(first + i)); }),
    count, output, bop, buf
  );
}
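// Example of a transform-inclusive scan (an illustrative sketch; policy,
// din, dout, N, and buf are assumed to be set up as in the sizing example
// above): each element is squared by the unary operator before being summed
// into the running scan.
//
//   tf::cuda_transform_inclusive_scan(
//     policy, din, din + N, dout,
//     [] __device__ (int a, int b) { return a + b; },  // bop: scan operator
//     [] __device__ (int x)        { return x * x; },  // uop: pre-transform
//     buf
//   );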
/**
@brief performs asynchronous exclusive scan over a range of items

@tparam P execution policy type
@tparam I input iterator
@tparam O output iterator
@tparam C binary operator type

@param p execution policy
@param first iterator to the beginning of the input range
@param last iterator to the end of the input range
@param output iterator to the beginning of the output range
@param op binary operator to apply to scan
@param buf pointer to the temporary buffer
*/
template <typename P, typename I, typename O, typename C>
void cuda_exclusive_scan(
  P&& p, I first, I last, O output, C op, void* buf
) {

  unsigned count = std::distance(first, last);

  if(count == 0) {
    return;
  }

  // launch the scan loop
  detail::cuda_scan_loop(
    p, detail::cudaScanType::EXCLUSIVE, first, count, output, op, buf
  );
}

/**
@brief performs asynchronous exclusive scan over a range of transformed items

@tparam P execution policy type
@tparam I input iterator
@tparam O output iterator
@tparam C binary operator type
@tparam U unary operator type

@param p execution policy
@param first iterator to the beginning of the input range
@param last iterator to the end of the input range
@param output iterator to the beginning of the output range
@param bop binary operator to apply to scan
@param uop unary operator to apply to transform each item before scan
@param buf pointer to the temporary buffer
*/
template <typename P, typename I, typename O, typename C, typename U>
void cuda_transform_exclusive_scan(
  P&& p, I first, I last, O output, C bop, U uop, void* buf
) {

  using T = typename std::iterator_traits<O>::value_type;

  unsigned count = std::distance(first, last);

  if(count == 0) {
    return;
  }

  // launch the scan loop
  detail::cuda_scan_loop(
    p, detail::cudaScanType::EXCLUSIVE,
    cuda_make_load_iterator<T>([=] __device__ (auto i) { return uop(*(first + i)); }),
    count, output, bop, buf
  );
}

}  // end of namespace tf ------------------------------------------------------
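// Example of an exclusive scan (an illustrative sketch; setup as in the
// examples above). With input {3, 1, 4, 1, 5} and op = plus, the output is
// {0, 3, 4, 8, 9}: each position receives the sum of all elements strictly
// before it.
//
//   tf::cuda_exclusive_scan(
//     policy, din, din + N, dout,
//     [] __device__ (int a, int b) { return a + b; }, buf
//   );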